xref: /linux/mm/page_alloc.c (revision 3d4f1a54160046d5059ec6c5f2152e054e7b12d7)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *
4  *  Manages the free list, the system allocates free pages here.
5  *  Note that kmalloc() lives in slab.c
6  *
7  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
8  *  Swap reorganised 29.12.95, Stephen Tweedie
9  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10  *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11  *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12  *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13  *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14  *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15  */
16 
17 #include <linux/stddef.h>
18 #include <linux/mm.h>
19 #include <linux/highmem.h>
20 #include <linux/interrupt.h>
21 #include <linux/jiffies.h>
22 #include <linux/compiler.h>
23 #include <linux/kernel.h>
24 #include <linux/kasan.h>
25 #include <linux/kmsan.h>
26 #include <linux/module.h>
27 #include <linux/suspend.h>
28 #include <linux/ratelimit.h>
29 #include <linux/oom.h>
30 #include <linux/topology.h>
31 #include <linux/sysctl.h>
32 #include <linux/cpu.h>
33 #include <linux/cpuset.h>
34 #include <linux/folio_batch.h>
35 #include <linux/memory_hotplug.h>
36 #include <linux/nodemask.h>
37 #include <linux/vmstat.h>
38 #include <linux/fault-inject.h>
39 #include <linux/compaction.h>
40 #include <trace/events/kmem.h>
41 #include <trace/events/oom.h>
42 #include <linux/prefetch.h>
43 #include <linux/mm_inline.h>
44 #include <linux/mmu_notifier.h>
45 #include <linux/migrate.h>
46 #include <linux/sched/mm.h>
47 #include <linux/page_owner.h>
48 #include <linux/page_table_check.h>
49 #include <linux/memcontrol.h>
50 #include <linux/ftrace.h>
51 #include <linux/lockdep.h>
52 #include <linux/psi.h>
53 #include <linux/khugepaged.h>
54 #include <linux/delayacct.h>
55 #include <linux/cacheinfo.h>
56 #include <linux/pgalloc_tag.h>
57 #include <asm/div64.h>
58 #include "internal.h"
59 #include "shuffle.h"
60 #include "page_reporting.h"
61 
/*
 * Free Page Internal flags: for internal, non-pcp variants of free_pages().
 *
 * FPI_NONE is the empty value; each of the other flags below occupies its
 * own bit.
 */
typedef int __bitwise fpi_t;

/* No special request */
#define FPI_NONE		((__force fpi_t)0)

/*
 * Skip free page reporting notification for the (possibly merged) page.
 * This does not hinder free page reporting from grabbing the page,
 * reporting it and marking it "reported" -  it only skips notifying
 * the free page reporting infrastructure about a newly freed page. For
 * example, used when temporarily pulling a page from a freelist and
 * putting it back unmodified.
 */
#define FPI_SKIP_REPORT_NOTIFY	((__force fpi_t)BIT(0))

/*
 * Place the (possibly merged) page to the tail of the freelist. Will ignore
 * page shuffling (relevant code - e.g., memory onlining - is expected to
 * shuffle the whole zone).
 *
 * Note: No code should rely on this flag for correctness - it's purely
 *       to allow for optimizations when handing back either fresh pages
 *       (memory onlining) or untouched pages (page isolation, free page
 *       reporting).
 */
#define FPI_TO_TAIL		((__force fpi_t)BIT(1))

/* Free the page without taking locks. Rely on trylock only. */
#define FPI_TRYLOCK		((__force fpi_t)BIT(2))

/* free_pages_prepare() has already been called for page(s) being freed. */
#define FPI_PREPARED		((__force fpi_t)BIT(3))

/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
/*
 * NOTE(review): presumably the smallest accepted non-zero value of the
 * percpu pagelist high fraction tunable - confirm against its users.
 */
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
99 
/*
 * Taking a pcp lock is a two-step operation: look up this CPU's pcp, then
 * take its spinlock. A migration between the two steps would lock the wrong
 * pcp and could lead to remote memory being allocated, so the task is pinned
 * to its CPU for the whole lookup+lock sequence.
 *
 * PREEMPT_RT pins with migrate_disable(): anything stronger would interfere
 * with RT spinlock usage and keep a high priority task from preempting the
 * allocator. All other configurations use preempt_disable(), which is the
 * faster of the two.
 */
#ifdef CONFIG_PREEMPT_RT
#define pcpu_task_pin()		migrate_disable()
#define pcpu_task_unpin()	migrate_enable()
#else
#define pcpu_task_pin()		preempt_disable()
#define pcpu_task_unpin()	preempt_enable()
#endif
115 
116 /*
117  * A helper to lookup and trylock pcp with embedded spinlock.
118  * The return value should be used with the unlock helper.
119  * NULL return value means the trylock failed.
120  */
121 #ifdef CONFIG_SMP
#define pcp_spin_trylock(ptr)						\
({									\
	struct per_cpu_pages *__pcp_cur;				\
	struct per_cpu_pages *__pcp_locked = NULL;			\
									\
	/* Stay on this CPU from the lookup until the unlock. */	\
	pcpu_task_pin();						\
	__pcp_cur = this_cpu_ptr(ptr);					\
	if (spin_trylock(&__pcp_cur->lock))				\
		__pcp_locked = __pcp_cur;				\
	else								\
		pcpu_task_unpin();	/* lock is busy: give up */	\
	__pcp_locked;							\
})
133 
/*
 * Release a pcp lock obtained from pcp_spin_trylock() and drop the CPU
 * pinning that came with it. @ptr is parenthesized so that any pointer
 * expression, not only a plain identifier, expands correctly.
 */
#define pcp_spin_unlock(ptr)						\
({									\
	spin_unlock(&(ptr)->lock);					\
	pcpu_task_unpin();						\
})
139 
140 /*
141  * On CONFIG_SMP=n the UP implementation of spin_trylock() never fails and thus
142  * is not compatible with our locking scheme. However we do not need pcp for
143  * scalability in the first place, so just make all the trylocks fail and take
144  * the slow path unconditionally.
145  */
146 #else
/* The trylock always "fails" ... */
#define pcp_spin_trylock(ptr)	NULL

/* ... so an unlock can never legitimately be reached. */
#define pcp_spin_unlock(ptr)	BUG_ON(1)
152 #endif
153 
/*
 * Lock helpers for callers that were handed the pcp pointer of one specific
 * CPU. No per-cpu lookup happens here, so the task does not need to be
 * pinned to a CPU.
 */
#define pcp_spin_lock_nopin(ptr)	spin_lock(&(ptr)->lock)
#define pcp_spin_unlock_nopin(ptr)	spin_unlock(&(ptr)->lock)
162 
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
/* Per-cpu 'numa_node' variable; only defined with this config option. */
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif

/* Static key that starts out enabled. */
DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
 * defined in <linux/topology.h>.
 */
DEFINE_PER_CPU(int, _numa_mem_);		/* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
#endif

/* NOTE(review): presumably serializes per-cpu pagelist draining - confirm */
static DEFINE_MUTEX(pcpu_drain_mutex);

#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
EXPORT_SYMBOL(latent_entropy);
#endif

/*
 * Array of node states.
 *
 * Initially every node is marked possible and only node 0 is marked online.
 * Without CONFIG_NUMA, node 0 is additionally preset in the memory states
 * and in N_CPU.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
	[N_POSSIBLE] = NODE_MASK_ALL,
	[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
	[N_MEMORY] = { { [0] = 1UL } },
	[N_CPU] = { { [0] = 1UL } },
#endif	/* NUMA */
};
EXPORT_SYMBOL(node_states);

/* Starts out as GFP_BOOT_MASK. */
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;

#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
/* pageblock_order is a run-time variable only in this configuration. */
unsigned int pageblock_order __read_mostly;
#endif

/* Forward declarations for functions defined later in this file. */
static void __free_pages_ok(struct page *page, unsigned int order,
			    fpi_t fpi_flags);
static void reserve_highatomic_pageblock(struct page *page, int order,
					 struct zone *zone);
215 
/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 *	1G machine -> (16M dma, 784M normal, 224M high)
 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 *
 * TBD: should special case ZONE_DMA32 machines here - in those we normally
 * don't need any ZONE_NORMAL reservation
 */
static int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	[ZONE_DMA] = 256,
#endif
#ifdef CONFIG_ZONE_DMA32
	[ZONE_DMA32] = 256,
#endif
	[ZONE_NORMAL] = 32,
#ifdef CONFIG_HIGHMEM
	[ZONE_HIGHMEM] = 0,
#endif
	[ZONE_MOVABLE] = 0,
};

/*
 * Printable zone names. Only zones enabled in the current configuration
 * get an entry, so the entries are positional.
 * NOTE(review): the order presumably has to match enum zone_type - confirm.
 */
char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	 "DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
	 "DMA32",
#endif
	 "Normal",
#ifdef CONFIG_HIGHMEM
	 "HighMem",
#endif
	 "Movable",
#ifdef CONFIG_ZONE_DEVICE
	 "Device",
#endif
};

/*
 * Printable migratetype names; the CMA and Isolate entries exist only when
 * the corresponding option is configured.
 * NOTE(review): the order presumably has to match enum migratetype - confirm.
 */
const char * const migratetype_names[MIGRATE_TYPES] = {
	"Unmovable",
	"Movable",
	"Reclaimable",
	"HighAtomic",
#ifdef CONFIG_CMA
	"CMA",
#endif
#ifdef CONFIG_MEMORY_ISOLATION
	"Isolate",
#endif
};

/* Tunables and their built-in defaults. */
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
static int watermark_boost_factor __read_mostly = 15000;
static int watermark_scale_factor = 10;
int defrag_mode;

/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);

#if MAX_NUMNODES > 1
/* Defaults: room for MAX_NUMNODES node ids, one node online. */
unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
unsigned int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif

/*
 * When page allocations stall for longer than a threshold,
 * ALLOC_STALL_WARN_MSECS, leave a warning in the kernel log.  Only one warning
 * will be printed during this duration for the entire system.
 */
#define ALLOC_STALL_WARN_MSECS (10 * 1000UL)
static unsigned long alloc_stall_warn_jiffies = INITIAL_JIFFIES;

/* Forward declarations for functions defined later in this file. */
static bool page_contains_unaccepted(struct page *page, unsigned int order);
static bool cond_accept_memory(struct zone *zone, unsigned int order,
			       int alloc_flags);
static bool __free_unaccepted(struct page *page);

/* Non-zero when grouping pages by mobility is disabled. */
int page_group_by_mobility_disabled __read_mostly;
301 int page_group_by_mobility_disabled __read_mostly;
302 
303 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
304 /*
305  * During boot we initialize deferred pages on-demand, as needed, but once
306  * page_alloc_init_late() has finished, the deferred pages are all initialized,
307  * and we can permanently disable that path.
308  */
309 DEFINE_STATIC_KEY_TRUE(deferred_pages);
310 
311 /*
312  * deferred_grow_zone() is __init, but it is called from
313  * get_page_from_freelist() during early boot until deferred_pages permanently
314  * disables this call. This is why we have refdata wrapper to avoid warning,
315  * and to ensure that the function body gets unloaded.
316  */
317 static bool __ref
318 _deferred_grow_zone(struct zone *zone, unsigned int order)
319 {
320 	return deferred_grow_zone(zone, order);
321 }
322 #else
323 static inline bool _deferred_grow_zone(struct zone *zone, unsigned int order)
324 {
325 	return false;
326 }
327 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
328 
329 /* Return a pointer to the bitmap storing bits affecting a block of pages */
330 static inline unsigned long *get_pageblock_bitmap(const struct page *page,
331 							unsigned long pfn)
332 {
333 #ifdef CONFIG_SPARSEMEM
334 	return section_to_usemap(__pfn_to_section(pfn));
335 #else
336 	return page_zone(page)->pageblock_flags;
337 #endif /* CONFIG_SPARSEMEM */
338 }
339 
340 static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
341 {
342 #ifdef CONFIG_SPARSEMEM
343 	pfn &= (PAGES_PER_SECTION-1);
344 #else
345 	pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn);
346 #endif /* CONFIG_SPARSEMEM */
347 	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
348 }
349 
350 static __always_inline bool is_standalone_pb_bit(enum pageblock_bits pb_bit)
351 {
352 	return pb_bit >= PB_compact_skip && pb_bit < __NR_PAGEBLOCK_BITS;
353 }
354 
355 static __always_inline void
356 get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn,
357 			   unsigned long **bitmap_word, unsigned long *bitidx)
358 {
359 	unsigned long *bitmap;
360 	unsigned long word_bitidx;
361 
362 #ifdef CONFIG_MEMORY_ISOLATION
363 	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 8);
364 #else
365 	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
366 #endif
367 	BUILD_BUG_ON(__MIGRATE_TYPE_END > PAGEBLOCK_MIGRATETYPE_MASK);
368 	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
369 
370 	bitmap = get_pageblock_bitmap(page, pfn);
371 	*bitidx = pfn_to_bitidx(page, pfn);
372 	word_bitidx = *bitidx / BITS_PER_LONG;
373 	*bitidx &= (BITS_PER_LONG - 1);
374 	*bitmap_word = &bitmap[word_bitidx];
375 }
376 
377 
/**
 * __get_pfnblock_flags_mask - Read a group of flags of a pageblock
 * @page: A page inside the pageblock of interest
 * @pfn: The page frame number of @page
 * @mask: The bits the caller wants, relative to the pageblock's first bit
 *
 * Return: the pageblock_bits flags selected by @mask
 */
static unsigned long __get_pfnblock_flags_mask(const struct page *page,
					       unsigned long pfn,
					       unsigned long mask)
{
	unsigned long *word_ptr;
	unsigned long shift;

	get_pfnblock_bitmap_bitidx(page, pfn, &word_ptr, &shift);

	/*
	 * No lock is held, so this can race with a concurrent update of the
	 * same word. READ_ONCE() makes sure the word is read consistently:
	 * the result may be stale, but it is never corrupted.
	 */
	return (READ_ONCE(*word_ptr) >> shift) & mask;
}
404 
405 /**
406  * get_pfnblock_bit - Check if a standalone bit of a pageblock is set
407  * @page: The page within the block of interest
408  * @pfn: The target page frame number
409  * @pb_bit: pageblock bit to check
410  *
411  * Return: true if the bit is set, otherwise false
412  */
413 bool get_pfnblock_bit(const struct page *page, unsigned long pfn,
414 		      enum pageblock_bits pb_bit)
415 {
416 	unsigned long *bitmap_word;
417 	unsigned long bitidx;
418 
419 	if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
420 		return false;
421 
422 	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
423 
424 	return test_bit(bitidx + pb_bit, bitmap_word);
425 }
426 
427 /**
428  * get_pfnblock_migratetype - Return the migratetype of a pageblock
429  * @page: The page within the block of interest
430  * @pfn: The target page frame number
431  *
432  * Return: The migratetype of the pageblock
433  *
434  * Use get_pfnblock_migratetype() if caller already has both @page and @pfn
435  * to save a call to page_to_pfn().
436  */
437 enum migratetype
438 get_pfnblock_migratetype(const struct page *page, unsigned long pfn)
439 {
440 	unsigned long mask = PAGEBLOCK_MIGRATETYPE_MASK | PAGEBLOCK_ISO_MASK;
441 	unsigned long flags;
442 
443 	flags = __get_pfnblock_flags_mask(page, pfn, mask);
444 
445 #ifdef CONFIG_MEMORY_ISOLATION
446 	if (flags & BIT(PB_migrate_isolate))
447 		return MIGRATE_ISOLATE;
448 #endif
449 	return flags & PAGEBLOCK_MIGRATETYPE_MASK;
450 }
451 
/**
 * __set_pfnblock_flags_mask - Update a group of flags of a pageblock
 * @page: A page inside the pageblock of interest
 * @pfn: The page frame number of @page
 * @flags: The new value for the bits selected by @mask
 * @mask: The bits to replace, relative to the pageblock's first bit
 */
static void __set_pfnblock_flags_mask(struct page *page, unsigned long pfn,
				      unsigned long flags, unsigned long mask)
{
	unsigned long *word_ptr;
	unsigned long shift;
	unsigned long old_word, new_word;

	get_pfnblock_bitmap_bitidx(page, pfn, &word_ptr, &shift);

	flags <<= shift;
	mask <<= shift;

	/*
	 * Replace only the masked bits. try_cmpxchg() refreshes old_word
	 * whenever the word changed underneath us, so just retry.
	 */
	old_word = READ_ONCE(*word_ptr);
	do {
		new_word = (old_word & ~mask) | flags;
	} while (!try_cmpxchg(word_ptr, &old_word, new_word));
}
476 
477 /**
478  * set_pfnblock_bit - Set a standalone bit of a pageblock
479  * @page: The page within the block of interest
480  * @pfn: The target page frame number
481  * @pb_bit: pageblock bit to set
482  */
483 void set_pfnblock_bit(const struct page *page, unsigned long pfn,
484 		      enum pageblock_bits pb_bit)
485 {
486 	unsigned long *bitmap_word;
487 	unsigned long bitidx;
488 
489 	if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
490 		return;
491 
492 	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
493 
494 	set_bit(bitidx + pb_bit, bitmap_word);
495 }
496 
497 /**
498  * clear_pfnblock_bit - Clear a standalone bit of a pageblock
499  * @page: The page within the block of interest
500  * @pfn: The target page frame number
501  * @pb_bit: pageblock bit to clear
502  */
503 void clear_pfnblock_bit(const struct page *page, unsigned long pfn,
504 			enum pageblock_bits pb_bit)
505 {
506 	unsigned long *bitmap_word;
507 	unsigned long bitidx;
508 
509 	if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
510 		return;
511 
512 	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
513 
514 	clear_bit(bitidx + pb_bit, bitmap_word);
515 }
516 
517 /**
518  * set_pageblock_migratetype - Set the migratetype of a pageblock
519  * @page: The page within the block of interest
520  * @migratetype: migratetype to set
521  */
522 static void set_pageblock_migratetype(struct page *page,
523 				      enum migratetype migratetype)
524 {
525 	if (unlikely(page_group_by_mobility_disabled &&
526 		     migratetype < MIGRATE_PCPTYPES))
527 		migratetype = MIGRATE_UNMOVABLE;
528 
529 #ifdef CONFIG_MEMORY_ISOLATION
530 	if (migratetype == MIGRATE_ISOLATE) {
531 		VM_WARN_ONCE(1,
532 			"Use set_pageblock_isolate() for pageblock isolation");
533 		return;
534 	}
535 	VM_WARN_ONCE(get_pageblock_isolate(page),
536 		     "Use clear_pageblock_isolate() to unisolate pageblock");
537 	/* PAGEBLOCK_ISO_MASK clears PB_migrate_isolate if it is set */
538 #endif
539 	__set_pfnblock_flags_mask(page, page_to_pfn(page),
540 				  (unsigned long)migratetype,
541 				  PAGEBLOCK_MIGRATETYPE_MASK | PAGEBLOCK_ISO_MASK);
542 }
543 
544 void __meminit init_pageblock_migratetype(struct page *page,
545 					  enum migratetype migratetype,
546 					  bool isolate)
547 {
548 	unsigned long flags;
549 
550 	if (unlikely(page_group_by_mobility_disabled &&
551 		     migratetype < MIGRATE_PCPTYPES))
552 		migratetype = MIGRATE_UNMOVABLE;
553 
554 	flags = migratetype;
555 
556 #ifdef CONFIG_MEMORY_ISOLATION
557 	if (migratetype == MIGRATE_ISOLATE) {
558 		VM_WARN_ONCE(
559 			1,
560 			"Set isolate=true to isolate pageblock with a migratetype");
561 		return;
562 	}
563 	if (isolate)
564 		flags |= BIT(PB_migrate_isolate);
565 #endif
566 	__set_pfnblock_flags_mask(page, page_to_pfn(page), flags,
567 				  PAGEBLOCK_MIGRATETYPE_MASK | PAGEBLOCK_ISO_MASK);
568 }
569 
570 #ifdef CONFIG_DEBUG_VM
571 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
572 {
573 	int ret;
574 	unsigned seq;
575 	unsigned long pfn = page_to_pfn(page);
576 	unsigned long sp, start_pfn;
577 
578 	do {
579 		seq = zone_span_seqbegin(zone);
580 		start_pfn = zone->zone_start_pfn;
581 		sp = zone->spanned_pages;
582 		ret = !zone_spans_pfn(zone, pfn);
583 	} while (zone_span_seqretry(zone, seq));
584 
585 	if (ret)
586 		pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
587 			pfn, zone_to_nid(zone), zone->name,
588 			start_pfn, start_pfn + sp);
589 
590 	return ret;
591 }
592 
593 /*
594  * Temporary debugging check for pages not lying within a given zone.
595  */
596 static bool __maybe_unused bad_range(struct zone *zone, struct page *page)
597 {
598 	if (page_outside_zone_boundaries(zone, page))
599 		return true;
600 	if (zone != page_zone(page))
601 		return true;
602 
603 	return false;
604 }
605 #else
/* Without CONFIG_DEBUG_VM no range check is done: never report a bad range. */
static inline bool __maybe_unused bad_range(struct zone *zone, struct page *page)
{
	return false;
}
610 #endif
611 
612 static void bad_page(struct page *page, const char *reason)
613 {
614 	static unsigned long resume;
615 	static unsigned long nr_shown;
616 	static unsigned long nr_unshown;
617 
618 	/*
619 	 * Allow a burst of 60 reports, then keep quiet for that minute;
620 	 * or allow a steady drip of one report per second.
621 	 */
622 	if (nr_shown == 60) {
623 		if (time_before(jiffies, resume)) {
624 			nr_unshown++;
625 			goto out;
626 		}
627 		if (nr_unshown) {
628 			pr_alert(
629 			      "BUG: Bad page state: %lu messages suppressed\n",
630 				nr_unshown);
631 			nr_unshown = 0;
632 		}
633 		nr_shown = 0;
634 	}
635 	if (nr_shown++ == 0)
636 		resume = jiffies + 60 * HZ;
637 
638 	pr_alert("BUG: Bad page state in process %s  pfn:%05lx\n",
639 		current->comm, page_to_pfn(page));
640 	dump_page(page, reason);
641 
642 	print_modules();
643 	dump_stack();
644 out:
645 	/* Leave bad fields for debug, except PageBuddy could make trouble */
646 	if (PageBuddy(page))
647 		__ClearPageBuddy(page);
648 	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
649 }
650 
651 static inline unsigned int order_to_pindex(int migratetype, int order)
652 {
653 	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
654 		bool movable = migratetype == MIGRATE_MOVABLE;
655 
656 		if (order > PAGE_ALLOC_COSTLY_ORDER)
657 			return NR_LOWORDER_PCP_LISTS + movable;
658 	}
659 
660 	return (MIGRATE_PCPTYPES * order) + migratetype;
661 }
662 
663 static inline int pindex_to_order(unsigned int pindex)
664 {
665 	int order = pindex / MIGRATE_PCPTYPES;
666 
667 	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
668 		if (pindex >= NR_LOWORDER_PCP_LISTS)
669 			order = HPAGE_PMD_ORDER;
670 	}
671 
672 	return order;
673 }
674 
675 static inline bool pcp_allowed_order(unsigned int order)
676 {
677 	if (order <= PAGE_ALLOC_COSTLY_ORDER)
678 		return true;
679 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
680 	if (is_pmd_order(order))
681 		return true;
682 #endif
683 	return false;
684 }
685 
686 /*
687  * Higher-order pages are called "compound pages".  They are structured thusly:
688  *
 * The first PAGE_SIZE page is called the "head page" and has PG_head set.
 *
 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
 * in bit 0 of page->compound_info. The remaining bits hold a pointer to the
 * head page.
693  *
694  * The first tail page's ->compound_order holds the order of allocation.
695  * This usage means that zero-order pages may not be compound.
696  */
697 
698 void prep_compound_page(struct page *page, unsigned int order)
699 {
700 	int i;
701 	int nr_pages = 1 << order;
702 
703 	__SetPageHead(page);
704 	for (i = 1; i < nr_pages; i++)
705 		prep_compound_tail(page + i, page, order);
706 
707 	prep_compound_head(page, order);
708 }
709 
/*
 * Mark @page as a buddy page of the given @order: the order is stored in
 * page_private(page) and the PageBuddy flag is set.
 */
static inline void set_buddy_order(struct page *page, unsigned int order)
{
	set_page_private(page, order);
	__SetPageBuddy(page);
}
715 
716 #ifdef CONFIG_COMPACTION
717 static inline struct capture_control *task_capc(struct zone *zone)
718 {
719 	struct capture_control *capc = current->capture_control;
720 
721 	return unlikely(capc) &&
722 		!(current->flags & PF_KTHREAD) &&
723 		!capc->page &&
724 		capc->cc->zone == zone ? capc : NULL;
725 }
726 
727 static inline bool
728 compaction_capture(struct capture_control *capc, struct page *page,
729 		   int order, int migratetype)
730 {
731 	if (!capc || order != capc->cc->order)
732 		return false;
733 
734 	/* Do not accidentally pollute CMA or isolated regions*/
735 	if (is_migrate_cma(migratetype) ||
736 	    is_migrate_isolate(migratetype))
737 		return false;
738 
739 	/*
740 	 * Do not let lower order allocations pollute a movable pageblock
741 	 * unless compaction is also requesting movable pages.
742 	 * This might let an unmovable request use a reclaimable pageblock
743 	 * and vice-versa but no more than normal fallback logic which can
744 	 * have trouble finding a high-order free page.
745 	 */
746 	if (order < pageblock_order && migratetype == MIGRATE_MOVABLE &&
747 	    capc->cc->migratetype != MIGRATE_MOVABLE)
748 		return false;
749 
750 	if (migratetype != capc->cc->migratetype)
751 		trace_mm_page_alloc_extfrag(page, capc->cc->order, order,
752 					    capc->cc->migratetype, migratetype);
753 
754 	capc->page = page;
755 	return true;
756 }
757 
758 #else
/* Without CONFIG_COMPACTION there is never a capture control. */
static inline struct capture_control *task_capc(struct zone *zone)
{
	return NULL;
}
763 
/* Without CONFIG_COMPACTION no page is ever captured. */
static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
		   int order, int migratetype)
{
	return false;
}
770 #endif /* CONFIG_COMPACTION */
771 
772 static inline void account_freepages(struct zone *zone, int nr_pages,
773 				     int migratetype)
774 {
775 	lockdep_assert_held(&zone->lock);
776 
777 	if (is_migrate_isolate(migratetype))
778 		return;
779 
780 	__mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
781 
782 	if (is_migrate_cma(migratetype))
783 		__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
784 	else if (migratetype == MIGRATE_HIGHATOMIC)
785 		WRITE_ONCE(zone->nr_free_highatomic,
786 			   zone->nr_free_highatomic + nr_pages);
787 }
788 
789 /* Used for pages not on another list */
790 static inline void __add_to_free_list(struct page *page, struct zone *zone,
791 				      unsigned int order, int migratetype,
792 				      bool tail)
793 {
794 	struct free_area *area = &zone->free_area[order];
795 	int nr_pages = 1 << order;
796 
797 	VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
798 		     "page type is %d, passed migratetype is %d (nr=%d)\n",
799 		     get_pageblock_migratetype(page), migratetype, nr_pages);
800 
801 	if (tail)
802 		list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
803 	else
804 		list_add(&page->buddy_list, &area->free_list[migratetype]);
805 	area->nr_free++;
806 
807 	if (order >= pageblock_order && !is_migrate_isolate(migratetype))
808 		__mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages);
809 }
810 
811 /*
812  * Used for pages which are on another list. Move the pages to the tail
813  * of the list - so the moved pages won't immediately be considered for
814  * allocation again (e.g., optimization for memory onlining).
815  */
816 static inline void move_to_free_list(struct page *page, struct zone *zone,
817 				     unsigned int order, int old_mt, int new_mt)
818 {
819 	struct free_area *area = &zone->free_area[order];
820 	int nr_pages = 1 << order;
821 
822 	/* Free page moving can fail, so it happens before the type update */
823 	VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt,
824 		     "page type is %d, passed migratetype is %d (nr=%d)\n",
825 		     get_pageblock_migratetype(page), old_mt, nr_pages);
826 
827 	list_move_tail(&page->buddy_list, &area->free_list[new_mt]);
828 
829 	account_freepages(zone, -nr_pages, old_mt);
830 	account_freepages(zone, nr_pages, new_mt);
831 
832 	if (order >= pageblock_order &&
833 	    is_migrate_isolate(old_mt) != is_migrate_isolate(new_mt)) {
834 		if (!is_migrate_isolate(old_mt))
835 			nr_pages = -nr_pages;
836 		__mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages);
837 	}
838 }
839 
840 static inline void __del_page_from_free_list(struct page *page, struct zone *zone,
841 					     unsigned int order, int migratetype)
842 {
843 	int nr_pages = 1 << order;
844 
845         VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
846 		     "page type is %d, passed migratetype is %d (nr=%d)\n",
847 		     get_pageblock_migratetype(page), migratetype, nr_pages);
848 
849 	/* clear reported state and update reported page count */
850 	if (page_reported(page))
851 		__ClearPageReported(page);
852 
853 	list_del(&page->buddy_list);
854 	__ClearPageBuddy(page);
855 	set_page_private(page, 0);
856 	zone->free_area[order].nr_free--;
857 
858 	if (order >= pageblock_order && !is_migrate_isolate(migratetype))
859 		__mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, -nr_pages);
860 }
861 
/*
 * Take @page off its free list and subtract its 1 << @order pages from the
 * free page counters of @migratetype.
 */
static inline void del_page_from_free_list(struct page *page, struct zone *zone,
					   unsigned int order, int migratetype)
{
	const int nr_pages = 1 << order;

	__del_page_from_free_list(page, zone, order, migratetype);
	account_freepages(zone, -nr_pages, migratetype);
}
868 
869 static inline struct page *get_page_from_free_area(struct free_area *area,
870 					    int migratetype)
871 {
872 	return list_first_entry_or_null(&area->free_list[migratetype],
873 					struct page, buddy_list);
874 }
875 
876 /*
877  * If this is less than the 2nd largest possible page, check if the buddy
878  * of the next-higher order is free. If it is, it's possible
879  * that pages are being freed that will coalesce soon. In case,
880  * that is happening, add the free page to the tail of the list
881  * so it's less likely to be used soon and more likely to be merged
882  * as a 2-level higher order page
883  */
884 static inline bool
885 buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
886 		   struct page *page, unsigned int order)
887 {
888 	unsigned long higher_page_pfn;
889 	struct page *higher_page;
890 
891 	if (order >= MAX_PAGE_ORDER - 1)
892 		return false;
893 
894 	higher_page_pfn = buddy_pfn & pfn;
895 	higher_page = page + (higher_page_pfn - pfn);
896 
897 	return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1,
898 			NULL) != NULL;
899 }
900 
901 static void change_pageblock_range(struct page *pageblock_page,
902 				   int start_order, int migratetype)
903 {
904 	int nr_pageblocks = 1 << (start_order - pageblock_order);
905 
906 	while (nr_pageblocks--) {
907 		set_pageblock_migratetype(pageblock_page, migratetype);
908 		pageblock_page += pageblock_nr_pages;
909 	}
910 }
911 
912 /*
913  * Freeing function for a buddy system allocator.
914  *
915  * The concept of a buddy system is to maintain direct-mapped table
916  * (containing bit values) for memory blocks of various "orders".
917  * The bottom level table contains the map for the smallest allocatable
918  * units of memory (here, pages), and each level above it describes
919  * pairs of units from the levels below, hence, "buddies".
920  * At a high level, all that happens here is marking the table entry
921  * at the bottom level available, and propagating the changes upward
922  * as necessary, plus some accounting needed to play nicely with other
923  * parts of the VM system.
924  * At each level, we keep a list of pages, which are heads of continuous
925  * free pages of length of (1 << order) and marked with PageBuddy.
926  * Page's order is recorded in page_private(page) field.
927  * So when we are allocating or freeing one, we can derive the state of the
928  * other.  That is, if we allocate a small block, and both were
929  * free, the remainder of the region must be split into blocks.
930  * If a block is freed, and its buddy is also free, then this
931  * triggers coalescing into a block of larger size.
932  *
933  * -- nyc
934  */
935 
/*
 * Free one block of 1 << @order pages starting at @page/@pfn into @zone,
 * merging it with free buddies as far as possible.
 *
 * @migratetype selects the free list and the counters to use. @fpi_flags
 * can force placement at the tail of the free list (FPI_TO_TAIL) and
 * suppress the page reporting notification (FPI_SKIP_REPORT_NOTIFY).
 *
 * If compaction capture applies, the block is handed to the capturing task
 * instead of being put on a free list.
 */
static inline void __free_one_page(struct page *page,
		unsigned long pfn,
		struct zone *zone, unsigned int order,
		int migratetype, fpi_t fpi_flags)
{
	struct capture_control *capc = task_capc(zone);
	unsigned long buddy_pfn = 0;
	unsigned long combined_pfn;
	struct page *buddy;
	bool to_tail;

	VM_BUG_ON(!zone_is_initialized(zone));
	VM_BUG_ON_PAGE(page->flags.f & PAGE_FLAGS_CHECK_AT_PREP, page);

	VM_BUG_ON(migratetype == -1);
	/* The block must be naturally aligned to its order. */
	VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
	VM_BUG_ON_PAGE(bad_range(zone, page), page);

	/* Count the pages as free up front; merging does not change the total. */
	account_freepages(zone, 1 << order, migratetype);

	while (order < MAX_PAGE_ORDER) {
		int buddy_mt = migratetype;

		/* Captured: the page never reaches a free list, undo the accounting. */
		if (compaction_capture(capc, page, order, migratetype)) {
			account_freepages(zone, -(1 << order), migratetype);
			return;
		}

		buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
		if (!buddy)
			goto done_merging;

		if (unlikely(order >= pageblock_order)) {
			/*
			 * We want to prevent merge between freepages on pageblock
			 * without fallbacks and normal pageblock. Without this,
			 * pageblock isolation could cause incorrect freepage or CMA
			 * accounting or HIGHATOMIC accounting.
			 */
			buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn);

			if (migratetype != buddy_mt &&
			    (!migratetype_is_mergeable(migratetype) ||
			     !migratetype_is_mergeable(buddy_mt)))
				goto done_merging;
		}

		/*
		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
		 * merge with it and move up one order.
		 */
		if (page_is_guard(buddy))
			clear_page_guard(zone, buddy, order);
		else
			__del_page_from_free_list(buddy, zone, order, buddy_mt);

		if (unlikely(buddy_mt != migratetype)) {
			/*
			 * Match buddy type. This ensures that an
			 * expand() down the line puts the sub-blocks
			 * on the right freelists.
			 */
			change_pageblock_range(buddy, order, migratetype);
		}

		/* Continue with the merged block, one order up. */
		combined_pfn = buddy_pfn & pfn;
		page = page + (combined_pfn - pfn);
		pfn = combined_pfn;
		order++;
	}

done_merging:
	set_buddy_order(page, order);

	/* Pick head or tail of the free list for the final block. */
	if (fpi_flags & FPI_TO_TAIL)
		to_tail = true;
	else if (is_shuffle_order(order))
		to_tail = shuffle_pick_tail();
	else
		to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);

	__add_to_free_list(page, zone, order, migratetype, to_tail);

	/* Notify page reporting subsystem of freed page */
	if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
		page_reporting_notify_free(order);
}
1023 
1024 /*
1025  * A bad page could be due to a number of fields. Instead of multiple branches,
1026  * try and check multiple fields with one check. The caller must do a detailed
1027  * check if necessary.
1028  */
1029 static inline bool page_expected_state(struct page *page,
1030 					unsigned long check_flags)
1031 {
1032 	if (unlikely(atomic_read(&page->_mapcount) != -1))
1033 		return false;
1034 
1035 	if (unlikely((unsigned long)page->mapping |
1036 			page_ref_count(page) |
1037 #ifdef CONFIG_MEMCG
1038 			page->memcg_data |
1039 #endif
1040 			page_pool_page_is_pp(page) |
1041 			(page->flags.f & check_flags)))
1042 		return false;
1043 
1044 	return true;
1045 }
1046 
1047 static const char *page_bad_reason(struct page *page, unsigned long flags)
1048 {
1049 	const char *bad_reason = NULL;
1050 
1051 	if (unlikely(atomic_read(&page->_mapcount) != -1))
1052 		bad_reason = "nonzero mapcount";
1053 	if (unlikely(page->mapping != NULL))
1054 		bad_reason = "non-NULL mapping";
1055 	if (unlikely(page_ref_count(page) != 0))
1056 		bad_reason = "nonzero _refcount";
1057 	if (unlikely(page->flags.f & flags)) {
1058 		if (flags == PAGE_FLAGS_CHECK_AT_PREP)
1059 			bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
1060 		else
1061 			bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
1062 	}
1063 #ifdef CONFIG_MEMCG
1064 	if (unlikely(page->memcg_data))
1065 		bad_reason = "page still charged to cgroup";
1066 #endif
1067 	if (unlikely(page_pool_page_is_pp(page)))
1068 		bad_reason = "page_pool leak";
1069 	return bad_reason;
1070 }
1071 
1072 static inline bool free_page_is_bad(struct page *page)
1073 {
1074 	if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
1075 		return false;
1076 
1077 	/* Something has gone sideways, find it */
1078 	bad_page(page, page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
1079 	return true;
1080 }
1081 
/* Tell whether the check_pages_enabled static key is switched on. */
static inline bool is_check_pages_enabled(void)
{
	return static_branch_unlikely(&check_pages_enabled);
}
1086 
1087 static int free_tail_page_prepare(struct page *head_page, struct page *page)
1088 {
1089 	struct folio *folio = (struct folio *)head_page;
1090 	int ret = 1;
1091 
1092 	/*
1093 	 * We rely page->lru.next never has bit 0 set, unless the page
1094 	 * is PageTail(). Let's make sure that's true even for poisoned ->lru.
1095 	 */
1096 	BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
1097 
1098 	if (!is_check_pages_enabled()) {
1099 		ret = 0;
1100 		goto out;
1101 	}
1102 	switch (page - head_page) {
1103 	case 1:
1104 		/* the first tail page: these may be in place of ->mapping */
1105 		if (unlikely(folio_large_mapcount(folio))) {
1106 			bad_page(page, "nonzero large_mapcount");
1107 			goto out;
1108 		}
1109 		if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT) &&
1110 		    unlikely(atomic_read(&folio->_nr_pages_mapped))) {
1111 			bad_page(page, "nonzero nr_pages_mapped");
1112 			goto out;
1113 		}
1114 		if (IS_ENABLED(CONFIG_MM_ID)) {
1115 			if (unlikely(folio->_mm_id_mapcount[0] != -1)) {
1116 				bad_page(page, "nonzero mm mapcount 0");
1117 				goto out;
1118 			}
1119 			if (unlikely(folio->_mm_id_mapcount[1] != -1)) {
1120 				bad_page(page, "nonzero mm mapcount 1");
1121 				goto out;
1122 			}
1123 		}
1124 		if (IS_ENABLED(CONFIG_64BIT)) {
1125 			if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) {
1126 				bad_page(page, "nonzero entire_mapcount");
1127 				goto out;
1128 			}
1129 			if (unlikely(atomic_read(&folio->_pincount))) {
1130 				bad_page(page, "nonzero pincount");
1131 				goto out;
1132 			}
1133 		}
1134 		break;
1135 	case 2:
1136 		/* the second tail page: deferred_list overlaps ->mapping */
1137 		if (unlikely(!list_empty(&folio->_deferred_list))) {
1138 			bad_page(page, "on deferred list");
1139 			goto out;
1140 		}
1141 		if (!IS_ENABLED(CONFIG_64BIT)) {
1142 			if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) {
1143 				bad_page(page, "nonzero entire_mapcount");
1144 				goto out;
1145 			}
1146 			if (unlikely(atomic_read(&folio->_pincount))) {
1147 				bad_page(page, "nonzero pincount");
1148 				goto out;
1149 			}
1150 		}
1151 		break;
1152 	case 3:
1153 		/* the third tail page: hugetlb specifics overlap ->mappings */
1154 		if (IS_ENABLED(CONFIG_HUGETLB_PAGE))
1155 			break;
1156 		fallthrough;
1157 	default:
1158 		if (page->mapping != TAIL_MAPPING) {
1159 			bad_page(page, "corrupted mapping in tail page");
1160 			goto out;
1161 		}
1162 		break;
1163 	}
1164 	if (unlikely(!PageTail(page))) {
1165 		bad_page(page, "PageTail not set");
1166 		goto out;
1167 	}
1168 	if (unlikely(compound_head(page) != head_page)) {
1169 		bad_page(page, "compound_head not consistent");
1170 		goto out;
1171 	}
1172 	ret = 0;
1173 out:
1174 	page->mapping = NULL;
1175 	clear_compound_head(page);
1176 	return ret;
1177 }
1178 
1179 /*
1180  * Skip KASAN memory poisoning when either:
1181  *
1182  * 1. For generic KASAN: deferred memory initialization has not yet completed.
1183  *    Tag-based KASAN modes skip pages freed via deferred memory initialization
1184  *    using page tags instead (see below).
1185  * 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating
1186  *    that error detection is disabled for accesses via the page address.
1187  *
1188  * Pages will have match-all tags in the following circumstances:
1189  *
1190  * 1. Pages are being initialized for the first time, including during deferred
1191  *    memory init; see the call to page_kasan_tag_reset in __init_single_page.
1192  * 2. The allocation was not unpoisoned due to __GFP_SKIP_KASAN, with the
1193  *    exception of pages unpoisoned by kasan_unpoison_vmalloc.
1194  * 3. The allocation was excluded from being checked due to sampling,
1195  *    see the call to kasan_unpoison_pages.
1196  *
1197  * Poisoning pages during deferred memory init will greatly lengthen the
1198  * process and cause problem in large memory systems as the deferred pages
1199  * initialization is done with interrupt disabled.
1200  *
1201  * Assuming that there will be no reference to those newly initialized
1202  * pages before they are ever allocated, this should have no effect on
1203  * KASAN memory tracking as the poison will be properly inserted at page
1204  * allocation time. The only corner case is when pages are allocated by
1205  * on-demand allocation and then freed again before the deferred pages
1206  * initialization is done, but this is not likely to happen.
1207  */
1208 static inline bool should_skip_kasan_poison(struct page *page)
1209 {
1210 	if (IS_ENABLED(CONFIG_KASAN_GENERIC))
1211 		return deferred_pages_enabled();
1212 
1213 	return page_kasan_tag(page) == KASAN_TAG_KERNEL;
1214 }
1215 
1216 static void clear_highpages_kasan_tagged(struct page *page, int numpages)
1217 {
1218 	/* s390's use of memset() could override KASAN redzones. */
1219 	kasan_disable_current();
1220 	if (!IS_ENABLED(CONFIG_HIGHMEM)) {
1221 		clear_pages(kasan_reset_tag(page_address(page)), numpages);
1222 	} else {
1223 		int i;
1224 
1225 		for (i = 0; i < numpages; i++)
1226 			clear_highpage_kasan_tagged(page + i);
1227 	}
1228 	kasan_enable_current();
1229 }
1230 
1231 #ifdef CONFIG_MEM_ALLOC_PROFILING
1232 
1233 /* Should be called only if mem_alloc_profiling_enabled() */
1234 void __clear_page_tag_ref(struct page *page)
1235 {
1236 	union pgtag_ref_handle handle;
1237 	union codetag_ref ref;
1238 
1239 	if (get_page_tag_ref(page, &ref, &handle)) {
1240 		set_codetag_empty(&ref);
1241 		update_page_tag_ref(handle, &ref);
1242 		put_page_tag_ref(handle);
1243 	}
1244 }
1245 
1246 /* Should be called only if mem_alloc_profiling_enabled() */
1247 static noinline
1248 void __pgalloc_tag_add(struct page *page, struct task_struct *task,
1249 		       unsigned int nr)
1250 {
1251 	union pgtag_ref_handle handle;
1252 	union codetag_ref ref;
1253 
1254 	if (likely(get_page_tag_ref(page, &ref, &handle))) {
1255 		alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr);
1256 		update_page_tag_ref(handle, &ref);
1257 		put_page_tag_ref(handle);
1258 	} else {
1259 		/*
1260 		 * page_ext is not available yet, record the pfn so we can
1261 		 * clear the tag ref later when page_ext is initialized.
1262 		 */
1263 		alloc_tag_add_early_pfn(page_to_pfn(page));
1264 		if (task->alloc_tag)
1265 			alloc_tag_set_inaccurate(task->alloc_tag);
1266 	}
1267 }
1268 
/* Account @nr pages at @page to @task's tag, if profiling is enabled. */
static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
				   unsigned int nr)
{
	if (!mem_alloc_profiling_enabled())
		return;

	__pgalloc_tag_add(page, task, nr);
}
1275 
1276 /* Should be called only if mem_alloc_profiling_enabled() */
1277 static noinline
1278 void __pgalloc_tag_sub(struct page *page, unsigned int nr)
1279 {
1280 	union pgtag_ref_handle handle;
1281 	union codetag_ref ref;
1282 
1283 	if (get_page_tag_ref(page, &ref, &handle)) {
1284 		alloc_tag_sub(&ref, PAGE_SIZE * nr);
1285 		update_page_tag_ref(handle, &ref);
1286 		put_page_tag_ref(handle);
1287 	}
1288 }
1289 
/* Un-account @nr pages at @page from their tag, if profiling is enabled. */
static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
{
	if (!mem_alloc_profiling_enabled())
		return;

	__pgalloc_tag_sub(page, nr);
}
1295 
1296 /* When tag is not NULL, assuming mem_alloc_profiling_enabled */
1297 static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
1298 {
1299 	if (tag)
1300 		this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
1301 }
1302 
1303 #else /* CONFIG_MEM_ALLOC_PROFILING */
1304 
/* No-op variants used when CONFIG_MEM_ALLOC_PROFILING is not set. */
static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
				   unsigned int nr)
{
}

static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
{
}

static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
{
}
1309 
1310 #endif /* CONFIG_MEM_ALLOC_PROFILING */
1311 
1312 static __always_inline bool __free_pages_prepare(struct page *page,
1313 		unsigned int order, fpi_t fpi_flags)
1314 {
1315 	int bad = 0;
1316 	bool skip_kasan_poison = should_skip_kasan_poison(page);
1317 	bool init = want_init_on_free();
1318 	bool compound = PageCompound(page);
1319 	struct folio *folio = page_folio(page);
1320 
1321 	if (fpi_flags & FPI_PREPARED)
1322 		return true;
1323 
1324 	VM_BUG_ON_PAGE(PageTail(page), page);
1325 
1326 	trace_mm_page_free(page, order);
1327 	kmsan_free_page(page, order);
1328 
1329 	if (memcg_kmem_online() && PageMemcgKmem(page))
1330 		__memcg_kmem_uncharge_page(page, order);
1331 
1332 	/*
1333 	 * In rare cases, when truncation or holepunching raced with
1334 	 * munlock after VM_LOCKED was cleared, Mlocked may still be
1335 	 * found set here.  This does not indicate a problem, unless
1336 	 * "unevictable_pgs_cleared" appears worryingly large.
1337 	 */
1338 	if (unlikely(folio_test_mlocked(folio))) {
1339 		long nr_pages = folio_nr_pages(folio);
1340 
1341 		__folio_clear_mlocked(folio);
1342 		zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
1343 		count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
1344 	}
1345 
1346 	if (unlikely(PageHWPoison(page)) && !order) {
1347 		/* Do not let hwpoison pages hit pcplists/buddy */
1348 		reset_page_owner(page, order);
1349 		page_table_check_free(page, order);
1350 		pgalloc_tag_sub(page, 1 << order);
1351 
1352 		/*
1353 		 * The page is isolated and accounted for.
1354 		 * Mark the codetag as empty to avoid accounting error
1355 		 * when the page is freed by unpoison_memory().
1356 		 */
1357 		clear_page_tag_ref(page);
1358 		return false;
1359 	}
1360 
1361 	VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
1362 
1363 	/*
1364 	 * Check tail pages before head page information is cleared to
1365 	 * avoid checking PageCompound for order-0 pages.
1366 	 */
1367 	if (unlikely(order)) {
1368 		int i;
1369 
1370 		if (compound) {
1371 			page[1].flags.f &= ~PAGE_FLAGS_SECOND;
1372 #ifdef NR_PAGES_IN_LARGE_FOLIO
1373 			folio->_nr_pages = 0;
1374 #endif
1375 		}
1376 		for (i = 1; i < (1 << order); i++) {
1377 			if (compound)
1378 				bad += free_tail_page_prepare(page, page + i);
1379 			if (is_check_pages_enabled()) {
1380 				if (free_page_is_bad(page + i)) {
1381 					bad++;
1382 					continue;
1383 				}
1384 			}
1385 			(page + i)->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
1386 		}
1387 	}
1388 	if (folio_test_anon(folio)) {
1389 		mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
1390 		folio->mapping = NULL;
1391 	}
1392 	if (unlikely(page_has_type(page)))
1393 		/* Reset the page_type (which overlays _mapcount) */
1394 		page->page_type = UINT_MAX;
1395 
1396 	if (is_check_pages_enabled()) {
1397 		if (free_page_is_bad(page))
1398 			bad++;
1399 		if (bad)
1400 			return false;
1401 	}
1402 
1403 	page_cpupid_reset_last(page);
1404 	page->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
1405 	page->private = 0;
1406 	reset_page_owner(page, order);
1407 	page_table_check_free(page, order);
1408 	pgalloc_tag_sub(page, 1 << order);
1409 
1410 	if (!PageHighMem(page) && !(fpi_flags & FPI_TRYLOCK)) {
1411 		debug_check_no_locks_freed(page_address(page),
1412 					   PAGE_SIZE << order);
1413 		debug_check_no_obj_freed(page_address(page),
1414 					   PAGE_SIZE << order);
1415 	}
1416 
1417 	kernel_poison_pages(page, 1 << order);
1418 
1419 	/*
1420 	 * As memory initialization might be integrated into KASAN,
1421 	 * KASAN poisoning and memory initialization code must be
1422 	 * kept together to avoid discrepancies in behavior.
1423 	 *
1424 	 * With hardware tag-based KASAN, memory tags must be set before the
1425 	 * page becomes unavailable via debug_pagealloc or arch_free_page.
1426 	 */
1427 	if (!skip_kasan_poison) {
1428 		kasan_poison_pages(page, order, init);
1429 
1430 		/* Memory is already initialized if KASAN did it internally. */
1431 		if (kasan_has_integrated_init())
1432 			init = false;
1433 	}
1434 	if (init)
1435 		clear_highpages_kasan_tagged(page, 1 << order);
1436 
1437 	/*
1438 	 * arch_free_page() can make the page's contents inaccessible.  s390
1439 	 * does this.  So nothing which can access the page's contents should
1440 	 * happen after this.
1441 	 */
1442 	arch_free_page(page, order);
1443 
1444 	debug_pagealloc_unmap_pages(page, 1 << order);
1445 
1446 	return true;
1447 }
1448 
/*
 * Non-static entry point: prepare @page of the given @order for freeing with
 * no FPI flags set. Returns the result of __free_pages_prepare().
 */
bool free_pages_prepare(struct page *page, unsigned int order)
{
	return __free_pages_prepare(page, order, FPI_NONE);
}
1453 
1454 /*
1455  * Frees a number of pages from the PCP lists
1456  * Assumes all pages on list are in same zone.
1457  * count is the number of pages to free.
1458  */
1459 static void free_pcppages_bulk(struct zone *zone, int count,
1460 					struct per_cpu_pages *pcp,
1461 					int pindex)
1462 {
1463 	unsigned int order;
1464 	struct page *page;
1465 
1466 	/*
1467 	 * Ensure proper count is passed which otherwise would stuck in the
1468 	 * below while (list_empty(list)) loop.
1469 	 */
1470 	count = min(pcp->count, count);
1471 
1472 	/* Ensure requested pindex is drained first. */
1473 	pindex = pindex - 1;
1474 
1475 	guard(spinlock_irqsave)(&zone->lock);
1476 
1477 	while (count > 0) {
1478 		struct list_head *list;
1479 		int nr_pages;
1480 
1481 		/* Remove pages from lists in a round-robin fashion. */
1482 		do {
1483 			if (++pindex > NR_PCP_LISTS - 1)
1484 				pindex = 0;
1485 			list = &pcp->lists[pindex];
1486 		} while (list_empty(list));
1487 
1488 		order = pindex_to_order(pindex);
1489 		nr_pages = 1 << order;
1490 		do {
1491 			unsigned long pfn;
1492 			int mt;
1493 
1494 			page = list_last_entry(list, struct page, pcp_list);
1495 			pfn = page_to_pfn(page);
1496 			mt = get_pfnblock_migratetype(page, pfn);
1497 
1498 			/* must delete to avoid corrupting pcp list */
1499 			list_del(&page->pcp_list);
1500 			count -= nr_pages;
1501 			pcp->count -= nr_pages;
1502 
1503 			__free_one_page(page, pfn, zone, order, mt, FPI_NONE);
1504 			trace_mm_page_pcpu_drain(page, order, mt);
1505 		} while (count > 0 && !list_empty(list));
1506 	}
1507 }
1508 
1509 /* Split a multi-block free page into its individual pageblocks. */
1510 static void split_large_buddy(struct zone *zone, struct page *page,
1511 			      unsigned long pfn, int order, fpi_t fpi)
1512 {
1513 	unsigned long end = pfn + (1 << order);
1514 
1515 	VM_WARN_ON_ONCE(!IS_ALIGNED(pfn, 1 << order));
1516 	/* Caller removed page from freelist, buddy info cleared! */
1517 	VM_WARN_ON_ONCE(PageBuddy(page));
1518 
1519 	if (order > pageblock_order)
1520 		order = pageblock_order;
1521 
1522 	do {
1523 		int mt = get_pfnblock_migratetype(page, pfn);
1524 
1525 		__free_one_page(page, pfn, zone, order, mt, fpi);
1526 		pfn += 1 << order;
1527 		if (pfn == end)
1528 			break;
1529 		page = pfn_to_page(pfn);
1530 	} while (1);
1531 }
1532 
1533 static void add_page_to_zone_llist(struct zone *zone, struct page *page,
1534 				   unsigned int order)
1535 {
1536 	/* Remember the order */
1537 	page->private = order;
1538 	/* Add the page to the free list */
1539 	llist_add(&page->pcp_llist, &zone->trylock_free_pages);
1540 }
1541 
1542 static void free_one_page(struct zone *zone, struct page *page,
1543 			  unsigned long pfn, unsigned int order,
1544 			  fpi_t fpi_flags)
1545 {
1546 	struct llist_head *llhead;
1547 	unsigned long flags;
1548 
1549 	if (unlikely(fpi_flags & FPI_TRYLOCK)) {
1550 		if (!spin_trylock_irqsave(&zone->lock, flags)) {
1551 			add_page_to_zone_llist(zone, page, order);
1552 			return;
1553 		}
1554 	} else {
1555 		spin_lock_irqsave(&zone->lock, flags);
1556 	}
1557 
1558 	/* The lock succeeded. Process deferred pages. */
1559 	llhead = &zone->trylock_free_pages;
1560 	if (unlikely(!llist_empty(llhead) && !(fpi_flags & FPI_TRYLOCK))) {
1561 		struct llist_node *llnode;
1562 		struct page *p, *tmp;
1563 
1564 		llnode = llist_del_all(llhead);
1565 		llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) {
1566 			unsigned int p_order = p->private;
1567 
1568 			split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags);
1569 			__count_vm_events(PGFREE, 1 << p_order);
1570 		}
1571 	}
1572 	split_large_buddy(zone, page, pfn, order, fpi_flags);
1573 	spin_unlock_irqrestore(&zone->lock, flags);
1574 
1575 	__count_vm_events(PGFREE, 1 << order);
1576 }
1577 
1578 static void __free_pages_ok(struct page *page, unsigned int order,
1579 			    fpi_t fpi_flags)
1580 {
1581 	unsigned long pfn = page_to_pfn(page);
1582 	struct zone *zone = page_zone(page);
1583 
1584 	if (__free_pages_prepare(page, order, fpi_flags))
1585 		free_one_page(zone, page, pfn, order, fpi_flags);
1586 }
1587 
1588 void __meminit __free_pages_core(struct page *page, unsigned int order,
1589 		enum meminit_context context)
1590 {
1591 	unsigned int nr_pages = 1 << order;
1592 	struct page *p = page;
1593 	unsigned int loop;
1594 
1595 	/*
1596 	 * When initializing the memmap, __init_single_page() sets the refcount
1597 	 * of all pages to 1 ("allocated"/"not free"). We have to set the
1598 	 * refcount of all involved pages to 0.
1599 	 *
1600 	 * Note that hotplugged memory pages are initialized to PageOffline().
1601 	 * Pages freed from memblock might be marked as reserved.
1602 	 */
1603 	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) &&
1604 	    unlikely(context == MEMINIT_HOTPLUG)) {
1605 		for (loop = 0; loop < nr_pages; loop++, p++) {
1606 			VM_WARN_ON_ONCE(PageReserved(p));
1607 			__ClearPageOffline(p);
1608 			set_page_count(p, 0);
1609 		}
1610 
1611 		adjust_managed_page_count(page, nr_pages);
1612 	} else {
1613 		for (loop = 0; loop < nr_pages; loop++, p++) {
1614 			__ClearPageReserved(p);
1615 			set_page_count(p, 0);
1616 		}
1617 
1618 		/* memblock adjusts totalram_pages() manually. */
1619 		atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
1620 	}
1621 
1622 	if (page_contains_unaccepted(page, order)) {
1623 		if (order == MAX_PAGE_ORDER && __free_unaccepted(page))
1624 			return;
1625 
1626 		accept_memory(page_to_phys(page), PAGE_SIZE << order);
1627 	}
1628 
1629 	/*
1630 	 * Bypass PCP and place fresh pages right to the tail, primarily
1631 	 * relevant for memory onlining.
1632 	 */
1633 	__free_pages_ok(page, order, FPI_TO_TAIL);
1634 }
1635 
1636 /*
1637  * Check that the whole (or subset of) a pageblock given by the interval of
1638  * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
1639  * with the migration of free compaction scanner.
1640  *
1641  * Return struct page pointer of start_pfn, or NULL if checks were not passed.
1642  *
1643  * It's possible on some configurations to have a setup like node0 node1 node0
1644  * i.e. it's possible that all pages within a zones range of pages do not
1645  * belong to a single zone. We assume that a border between node0 and node1
1646  * can occur within a single pageblock, but not a node0 node1 node0
1647  * interleaving within a single pageblock. It is therefore sufficient to check
1648  * the first and last page of a pageblock and avoid checking each individual
1649  * page in a pageblock.
1650  *
1651  * Note: the function may return non-NULL struct page even for a page block
1652  * which contains a memory hole (i.e. there is no physical memory for a subset
1653  * of the pfn range). For example, if the pageblock order is MAX_PAGE_ORDER, which
1654  * will fall into 2 sub-sections, and the end pfn of the pageblock may be hole
1655  * even though the start pfn is online and valid. This should be safe most of
1656  * the time because struct pages are still initialized via init_unavailable_range()
1657  * and pfn walkers shouldn't touch any physical memory range for which they do
1658  * not recognize any specific metadata in struct pages.
1659  */
1660 struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
1661 				     unsigned long end_pfn, struct zone *zone)
1662 {
1663 	struct page *start_page;
1664 	struct page *end_page;
1665 
1666 	/* end_pfn is one past the range we are checking */
1667 	end_pfn--;
1668 
1669 	if (!pfn_valid(end_pfn))
1670 		return NULL;
1671 
1672 	start_page = pfn_to_online_page(start_pfn);
1673 	if (!start_page)
1674 		return NULL;
1675 
1676 	if (page_zone(start_page) != zone)
1677 		return NULL;
1678 
1679 	end_page = pfn_to_page(end_pfn);
1680 
1681 	/* This gives a shorter code than deriving page_zone(end_page) */
1682 	if (page_zone_id(start_page) != page_zone_id(end_page))
1683 		return NULL;
1684 
1685 	return start_page;
1686 }
1687 
1688 /*
1689  * The order of subdivision here is critical for the IO subsystem.
1690  * Please do not alter this order without good reasons and regression
1691  * testing. Specifically, as large blocks of memory are subdivided,
1692  * the order in which smaller blocks are delivered depends on the order
1693  * they're subdivided in this function. This is the primary factor
1694  * influencing the order in which pages are delivered to the IO
1695  * subsystem according to empirical testing, and this is also justified
1696  * by considering the behavior of a buddy system containing a single
1697  * large block of memory acted on by a series of small allocations.
1698  * This behavior is a critical factor in sglist merging's success.
1699  *
1700  * -- nyc
1701  */
1702 static inline unsigned int expand(struct zone *zone, struct page *page, int low,
1703 				  int high, int migratetype)
1704 {
1705 	unsigned int size = 1 << high;
1706 	unsigned int nr_added = 0;
1707 
1708 	while (high > low) {
1709 		high--;
1710 		size >>= 1;
1711 		VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
1712 
1713 		/*
1714 		 * Mark as guard pages (or page), that will allow to
1715 		 * merge back to allocator when buddy will be freed.
1716 		 * Corresponding page table entries will not be touched,
1717 		 * pages will stay not present in virtual address space
1718 		 */
1719 		if (set_page_guard(zone, &page[size], high))
1720 			continue;
1721 
1722 		__add_to_free_list(&page[size], zone, high, migratetype, false);
1723 		set_buddy_order(&page[size], high);
1724 		nr_added += size;
1725 	}
1726 
1727 	return nr_added;
1728 }
1729 
1730 static __always_inline void page_del_and_expand(struct zone *zone,
1731 						struct page *page, int low,
1732 						int high, int migratetype)
1733 {
1734 	int nr_pages = 1 << high;
1735 
1736 	__del_page_from_free_list(page, zone, high, migratetype);
1737 	nr_pages -= expand(zone, page, low, high, migratetype);
1738 	account_freepages(zone, -nr_pages, migratetype);
1739 }
1740 
1741 static void check_new_page_bad(struct page *page)
1742 {
1743 	if (unlikely(PageHWPoison(page))) {
1744 		/* Don't complain about hwpoisoned pages */
1745 		if (PageBuddy(page))
1746 			__ClearPageBuddy(page);
1747 		return;
1748 	}
1749 
1750 	bad_page(page,
1751 		 page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
1752 }
1753 
1754 /*
1755  * This page is about to be returned from the page allocator
1756  */
1757 static bool check_new_page(struct page *page)
1758 {
1759 	if (likely(page_expected_state(page,
1760 				PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
1761 		return false;
1762 
1763 	check_new_page_bad(page);
1764 	return true;
1765 }
1766 
1767 static inline bool check_new_pages(struct page *page, unsigned int order)
1768 {
1769 	if (is_check_pages_enabled()) {
1770 		for (int i = 0; i < (1 << order); i++) {
1771 			struct page *p = page + i;
1772 
1773 			if (check_new_page(p))
1774 				return true;
1775 		}
1776 	}
1777 
1778 	return false;
1779 }
1780 
1781 static inline bool should_skip_kasan_unpoison(gfp_t flags)
1782 {
1783 	/* Don't skip if a software KASAN mode is enabled. */
1784 	if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
1785 	    IS_ENABLED(CONFIG_KASAN_SW_TAGS))
1786 		return false;
1787 
1788 	/* Skip, if hardware tag-based KASAN is not enabled. */
1789 	if (!kasan_hw_tags_enabled())
1790 		return true;
1791 
1792 	/*
1793 	 * With hardware tag-based KASAN enabled, skip if this has been
1794 	 * requested via __GFP_SKIP_KASAN.
1795 	 */
1796 	return flags & __GFP_SKIP_KASAN;
1797 }
1798 
1799 static inline bool should_skip_init(gfp_t flags)
1800 {
1801 	/* Don't skip, if hardware tag-based KASAN is not enabled. */
1802 	if (!kasan_hw_tags_enabled())
1803 		return false;
1804 
1805 	/* For hardware tag-based KASAN, skip if requested. */
1806 	return (flags & __GFP_SKIP_ZERO);
1807 }
1808 
1809 inline void post_alloc_hook(struct page *page, unsigned int order,
1810 				gfp_t gfp_flags)
1811 {
1812 	const bool zero_tags = gfp_flags & __GFP_ZEROTAGS;
1813 	bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
1814 			!should_skip_init(gfp_flags);
1815 	int i;
1816 
1817 	set_page_private(page, 0);
1818 
1819 	arch_alloc_page(page, order);
1820 	debug_pagealloc_map_pages(page, 1 << order);
1821 
1822 	/*
1823 	 * Page unpoisoning must happen before memory initialization.
1824 	 * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
1825 	 * allocations and the page unpoisoning code will complain.
1826 	 */
1827 	kernel_unpoison_pages(page, 1 << order);
1828 
1829 	/*
1830 	 * As memory initialization might be integrated into KASAN,
1831 	 * KASAN unpoisoning and memory initialization code must be
1832 	 * kept together to avoid discrepancies in behavior.
1833 	 */
1834 
1835 	/*
1836 	 * Clearing tags can efficiently clear the memory for us as well, if
1837 	 * required.
1838 	 */
1839 	if (zero_tags)
1840 		init = tag_clear_highpages(page, 1 << order, /* clear_pages= */init);
1841 
1842 	if (!should_skip_kasan_unpoison(gfp_flags) &&
1843 	    kasan_unpoison_pages(page, order, init)) {
1844 		/* Take note that memory was initialized by KASAN. */
1845 		if (kasan_has_integrated_init())
1846 			init = false;
1847 	} else {
1848 		/*
1849 		 * If memory tags have not been set by KASAN, reset the page
1850 		 * tags to ensure page_address() dereferencing does not fault.
1851 		 */
1852 		for (i = 0; i != 1 << order; ++i)
1853 			page_kasan_tag_reset(page + i);
1854 	}
1855 	/* If memory is still not initialized, initialize it now. */
1856 	if (init)
1857 		clear_highpages_kasan_tagged(page, 1 << order);
1858 
1859 	set_page_owner(page, order, gfp_flags);
1860 	page_table_check_alloc(page, order);
1861 	pgalloc_tag_add(page, current, 1 << order);
1862 }
1863 
1864 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
1865 							unsigned int alloc_flags)
1866 {
1867 	post_alloc_hook(page, order, gfp_flags);
1868 
1869 	if (order && (gfp_flags & __GFP_COMP))
1870 		prep_compound_page(page, order);
1871 
1872 	/*
1873 	 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
1874 	 * allocate the page. The expectation is that the caller is taking
1875 	 * steps that will free more memory. The caller should avoid the page
1876 	 * being used for !PFMEMALLOC purposes.
1877 	 */
1878 	if (alloc_flags & ALLOC_NO_WATERMARKS)
1879 		set_page_pfmemalloc(page);
1880 	else
1881 		clear_page_pfmemalloc(page);
1882 }
1883 
1884 /*
1885  * Go through the free lists for the given migratetype and remove
1886  * the smallest available page from the freelists
1887  */
1888 static __always_inline
1889 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
1890 						int migratetype)
1891 {
1892 	unsigned int current_order;
1893 	struct free_area *area;
1894 	struct page *page;
1895 
1896 	/* Find a page of the appropriate size in the preferred list */
1897 	for (current_order = order; current_order < NR_PAGE_ORDERS; ++current_order) {
1898 		area = &(zone->free_area[current_order]);
1899 		page = get_page_from_free_area(area, migratetype);
1900 		if (!page)
1901 			continue;
1902 
1903 		page_del_and_expand(zone, page, order, current_order,
1904 				    migratetype);
1905 		trace_mm_page_alloc_zone_locked(page, order, migratetype,
1906 				pcp_allowed_order(order) &&
1907 				migratetype < MIGRATE_PCPTYPES);
1908 		return page;
1909 	}
1910 
1911 	return NULL;
1912 }
1913 
1914 
/*
 * This array describes the order in which free lists are fallen back to
 * when the free lists for the desired migrate type are depleted.
 *
 * Each row is indexed by the requested migratetype and lists the other
 * two PCP migratetypes.
 *
 * The other migratetypes do not have fallbacks.
 */
static int fallbacks[MIGRATE_PCPTYPES][MIGRATE_PCPTYPES - 1] = {
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE   },
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE   },
};
1926 
1927 #ifdef CONFIG_CMA
1928 static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
1929 					unsigned int order)
1930 {
1931 	return __rmqueue_smallest(zone, order, MIGRATE_CMA);
1932 }
1933 #else
1934 static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
1935 					unsigned int order) { return NULL; }
1936 #endif
1937 
1938 /*
1939  * Move all free pages of a block to new type's freelist. Caller needs to
1940  * change the block type.
1941  */
1942 static int __move_freepages_block(struct zone *zone, unsigned long start_pfn,
1943 				  int old_mt, int new_mt)
1944 {
1945 	struct page *page;
1946 	unsigned long pfn, end_pfn;
1947 	unsigned int order;
1948 	int pages_moved = 0;
1949 
1950 	VM_WARN_ON(start_pfn & (pageblock_nr_pages - 1));
1951 	end_pfn = pageblock_end_pfn(start_pfn);
1952 
1953 	for (pfn = start_pfn; pfn < end_pfn;) {
1954 		page = pfn_to_page(pfn);
1955 		if (!PageBuddy(page)) {
1956 			pfn++;
1957 			continue;
1958 		}
1959 
1960 		/* Make sure we are not inadvertently changing nodes */
1961 		VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
1962 		VM_BUG_ON_PAGE(page_zone(page) != zone, page);
1963 
1964 		order = buddy_order(page);
1965 
1966 		move_to_free_list(page, zone, order, old_mt, new_mt);
1967 
1968 		pfn += 1 << order;
1969 		pages_moved += 1 << order;
1970 	}
1971 
1972 	return pages_moved;
1973 }
1974 
1975 static bool prep_move_freepages_block(struct zone *zone, struct page *page,
1976 				      unsigned long *start_pfn,
1977 				      int *num_free, int *num_movable)
1978 {
1979 	unsigned long pfn, start, end;
1980 
1981 	pfn = page_to_pfn(page);
1982 	start = pageblock_start_pfn(pfn);
1983 	end = pageblock_end_pfn(pfn);
1984 
1985 	/*
1986 	 * The caller only has the lock for @zone, don't touch ranges
1987 	 * that straddle into other zones. While we could move part of
1988 	 * the range that's inside the zone, this call is usually
1989 	 * accompanied by other operations such as migratetype updates
1990 	 * which also should be locked.
1991 	 */
1992 	if (!zone_spans_pfn(zone, start))
1993 		return false;
1994 	if (!zone_spans_pfn(zone, end - 1))
1995 		return false;
1996 
1997 	*start_pfn = start;
1998 
1999 	if (num_free) {
2000 		*num_free = 0;
2001 		*num_movable = 0;
2002 		for (pfn = start; pfn < end;) {
2003 			page = pfn_to_page(pfn);
2004 			if (PageBuddy(page)) {
2005 				int nr = 1 << buddy_order(page);
2006 
2007 				*num_free += nr;
2008 				pfn += nr;
2009 				continue;
2010 			}
2011 			/*
2012 			 * We assume that pages that could be isolated for
2013 			 * migration are movable. But we don't actually try
2014 			 * isolating, as that would be expensive.
2015 			 */
2016 			if (PageLRU(page) || page_has_movable_ops(page))
2017 				(*num_movable)++;
2018 			pfn++;
2019 		}
2020 	}
2021 
2022 	return true;
2023 }
2024 
2025 static int move_freepages_block(struct zone *zone, struct page *page,
2026 				int old_mt, int new_mt)
2027 {
2028 	unsigned long start_pfn;
2029 	int res;
2030 
2031 	if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL))
2032 		return -1;
2033 
2034 	res = __move_freepages_block(zone, start_pfn, old_mt, new_mt);
2035 	set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt);
2036 
2037 	return res;
2038 
2039 }
2040 
2041 #ifdef CONFIG_MEMORY_ISOLATION
2042 /* Look for a buddy that straddles start_pfn */
2043 static unsigned long find_large_buddy(unsigned long start_pfn)
2044 {
2045 	/*
2046 	 * If start_pfn is not an order-0 PageBuddy, next PageBuddy containing
2047 	 * start_pfn has minimal order of __ffs(start_pfn) + 1. Start checking
2048 	 * the order with __ffs(start_pfn). If start_pfn is order-0 PageBuddy,
2049 	 * the starting order does not matter.
2050 	 */
2051 	int order = start_pfn ? __ffs(start_pfn) : MAX_PAGE_ORDER;
2052 	struct page *page;
2053 	unsigned long pfn = start_pfn;
2054 
2055 	while (!PageBuddy(page = pfn_to_page(pfn))) {
2056 		/* Nothing found */
2057 		if (++order > MAX_PAGE_ORDER)
2058 			return start_pfn;
2059 		pfn &= ~0UL << order;
2060 	}
2061 
2062 	/*
2063 	 * Found a preceding buddy, but does it straddle?
2064 	 */
2065 	if (pfn + (1 << buddy_order(page)) > start_pfn)
2066 		return pfn;
2067 
2068 	/* Nothing found */
2069 	return start_pfn;
2070 }
2071 
2072 static inline void toggle_pageblock_isolate(struct page *page, bool isolate)
2073 {
2074 	if (isolate)
2075 		set_pageblock_isolate(page);
2076 	else
2077 		clear_pageblock_isolate(page);
2078 }
2079 
2080 /**
2081  * __move_freepages_block_isolate - move free pages in block for page isolation
2082  * @zone: the zone
2083  * @page: the pageblock page
2084  * @isolate: to isolate the given pageblock or unisolate it
2085  *
2086  * This is similar to move_freepages_block(), but handles the special
2087  * case encountered in page isolation, where the block of interest
2088  * might be part of a larger buddy spanning multiple pageblocks.
2089  *
2090  * Unlike the regular page allocator path, which moves pages while
2091  * stealing buddies off the freelist, page isolation is interested in
2092  * arbitrary pfn ranges that may have overlapping buddies on both ends.
2093  *
2094  * This function handles that. Straddling buddies are split into
2095  * individual pageblocks. Only the block of interest is moved.
2096  *
2097  * Returns %true if pages could be moved, %false otherwise.
2098  */
2099 static bool __move_freepages_block_isolate(struct zone *zone,
2100 		struct page *page, bool isolate)
2101 {
2102 	unsigned long start_pfn, buddy_pfn;
2103 	int from_mt;
2104 	int to_mt;
2105 	struct page *buddy;
2106 
2107 	if (isolate == get_pageblock_isolate(page)) {
2108 		VM_WARN_ONCE(1, "%s a pageblock that is already in that state",
2109 			     isolate ? "Isolate" : "Unisolate");
2110 		return false;
2111 	}
2112 
2113 	if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL))
2114 		return false;
2115 
2116 	/* No splits needed if buddies can't span multiple blocks */
2117 	if (pageblock_order == MAX_PAGE_ORDER)
2118 		goto move;
2119 
2120 	buddy_pfn = find_large_buddy(start_pfn);
2121 	buddy = pfn_to_page(buddy_pfn);
2122 	/* We're a part of a larger buddy */
2123 	if (PageBuddy(buddy) && buddy_order(buddy) > pageblock_order) {
2124 		int order = buddy_order(buddy);
2125 
2126 		del_page_from_free_list(buddy, zone, order,
2127 					get_pfnblock_migratetype(buddy, buddy_pfn));
2128 		toggle_pageblock_isolate(page, isolate);
2129 		split_large_buddy(zone, buddy, buddy_pfn, order, FPI_NONE);
2130 		return true;
2131 	}
2132 
2133 move:
2134 	/* Use PAGEBLOCK_MIGRATETYPE_MASK to get non-isolate migratetype */
2135 	if (isolate) {
2136 		from_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page),
2137 						    PAGEBLOCK_MIGRATETYPE_MASK);
2138 		to_mt = MIGRATE_ISOLATE;
2139 	} else {
2140 		from_mt = MIGRATE_ISOLATE;
2141 		to_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page),
2142 						  PAGEBLOCK_MIGRATETYPE_MASK);
2143 	}
2144 
2145 	__move_freepages_block(zone, start_pfn, from_mt, to_mt);
2146 	toggle_pageblock_isolate(pfn_to_page(start_pfn), isolate);
2147 
2148 	return true;
2149 }
2150 
2151 bool pageblock_isolate_and_move_free_pages(struct zone *zone, struct page *page)
2152 {
2153 	return __move_freepages_block_isolate(zone, page, true);
2154 }
2155 
2156 bool pageblock_unisolate_and_move_free_pages(struct zone *zone, struct page *page)
2157 {
2158 	return __move_freepages_block_isolate(zone, page, false);
2159 }
2160 
2161 #endif /* CONFIG_MEMORY_ISOLATION */
2162 
2163 static inline bool boost_watermark(struct zone *zone)
2164 {
2165 	unsigned long max_boost;
2166 
2167 	if (!watermark_boost_factor)
2168 		return false;
2169 	/*
2170 	 * Don't bother in zones that are unlikely to produce results.
2171 	 * On small machines, including kdump capture kernels running
2172 	 * in a small area, boosting the watermark can cause an out of
2173 	 * memory situation immediately.
2174 	 */
2175 	if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
2176 		return false;
2177 
2178 	max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
2179 			watermark_boost_factor, 10000);
2180 
2181 	/*
2182 	 * high watermark may be uninitialised if fragmentation occurs
2183 	 * very early in boot so do not boost. We do not fall
2184 	 * through and boost by pageblock_nr_pages as failing
2185 	 * allocations that early means that reclaim is not going
2186 	 * to help and it may even be impossible to reclaim the
2187 	 * boosted watermark resulting in a hang.
2188 	 */
2189 	if (!max_boost)
2190 		return false;
2191 
2192 	max_boost = max(pageblock_nr_pages, max_boost);
2193 
2194 	zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
2195 		max_boost);
2196 
2197 	return true;
2198 }
2199 
2200 /*
2201  * When we are falling back to another migratetype during allocation, should we
2202  * try to claim an entire block to satisfy further allocations, instead of
2203  * polluting multiple pageblocks?
2204  */
2205 static bool should_try_claim_block(unsigned int order, int start_mt)
2206 {
2207 	/*
2208 	 * Leaving this order check is intended, although there is
2209 	 * relaxed order check in next check. The reason is that
2210 	 * we can actually claim the whole pageblock if this condition met,
2211 	 * but, below check doesn't guarantee it and that is just heuristic
2212 	 * so could be changed anytime.
2213 	 */
2214 	if (order >= pageblock_order)
2215 		return true;
2216 
2217 	/*
2218 	 * Above a certain threshold, always try to claim, as it's likely there
2219 	 * will be more free pages in the pageblock.
2220 	 */
2221 	if (order >= pageblock_order / 2)
2222 		return true;
2223 
2224 	/*
2225 	 * Unmovable/reclaimable allocations would cause permanent
2226 	 * fragmentations if they fell back to allocating from a movable block
2227 	 * (polluting it), so we try to claim the whole block regardless of the
2228 	 * allocation size. Later movable allocations can always steal from this
2229 	 * block, which is less problematic.
2230 	 */
2231 	if (start_mt == MIGRATE_RECLAIMABLE || start_mt == MIGRATE_UNMOVABLE)
2232 		return true;
2233 
2234 	if (page_group_by_mobility_disabled)
2235 		return true;
2236 
2237 	/*
2238 	 * Movable pages won't cause permanent fragmentation, so when you alloc
2239 	 * small pages, we just need to temporarily steal unmovable or
2240 	 * reclaimable pages that are closest to the request size. After a
2241 	 * while, memory compaction may occur to form large contiguous pages,
2242 	 * and the next movable allocation may not need to steal.
2243 	 */
2244 	return false;
2245 }
2246 
2247 /*
2248  * Check whether there is a suitable fallback freepage with requested order.
2249  * If claimable is true, this function returns fallback_mt only if
2250  * we would do this whole-block claiming. This would help to reduce
2251  * fragmentation due to mixed migratetype pages in one pageblock.
2252  */
2253 enum fallback_result
2254 find_suitable_fallback(struct free_area *area, unsigned int order,
2255 		       int migratetype, bool claimable, int *mt_out)
2256 {
2257 	int i;
2258 
2259 	if (claimable && !should_try_claim_block(order, migratetype))
2260 		return FALLBACK_NOCLAIM;
2261 
2262 	if (area->nr_free == 0)
2263 		return FALLBACK_EMPTY;
2264 
2265 	for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
2266 		int fallback_mt = fallbacks[migratetype][i];
2267 
2268 		if (!free_area_empty(area, fallback_mt)) {
2269 			if (mt_out)
2270 				*mt_out = fallback_mt;
2271 			return FALLBACK_FOUND;
2272 		}
2273 	}
2274 
2275 	return FALLBACK_EMPTY;
2276 }
2277 
2278 /*
2279  * This function implements actual block claiming behaviour. If order is large
2280  * enough, we can claim the whole pageblock for the requested migratetype. If
2281  * not, we check the pageblock for constituent pages; if at least half of the
2282  * pages are free or compatible, we can still claim the whole block, so pages
2283  * freed in the future will be put on the correct free list.
2284  */
2285 static struct page *
2286 try_to_claim_block(struct zone *zone, struct page *page,
2287 		   int current_order, int order, int start_type,
2288 		   int block_type, unsigned int alloc_flags)
2289 {
2290 	int free_pages, movable_pages, alike_pages;
2291 	unsigned long start_pfn;
2292 
2293 	/* Take ownership for orders >= pageblock_order */
2294 	if (current_order >= pageblock_order) {
2295 		unsigned int nr_added;
2296 
2297 		del_page_from_free_list(page, zone, current_order, block_type);
2298 		change_pageblock_range(page, current_order, start_type);
2299 		nr_added = expand(zone, page, order, current_order, start_type);
2300 		account_freepages(zone, nr_added, start_type);
2301 		return page;
2302 	}
2303 
2304 	/*
2305 	 * Boost watermarks to increase reclaim pressure to reduce the
2306 	 * likelihood of future fallbacks. Wake kswapd now as the node
2307 	 * may be balanced overall and kswapd will not wake naturally.
2308 	 */
2309 	if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
2310 		set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
2311 
2312 	/* moving whole block can fail due to zone boundary conditions */
2313 	if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages,
2314 				       &movable_pages))
2315 		return NULL;
2316 
2317 	/*
2318 	 * Determine how many pages are compatible with our allocation.
2319 	 * For movable allocation, it's the number of movable pages which
2320 	 * we just obtained. For other types it's a bit more tricky.
2321 	 */
2322 	if (start_type == MIGRATE_MOVABLE) {
2323 		alike_pages = movable_pages;
2324 	} else {
2325 		/*
2326 		 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
2327 		 * to MOVABLE pageblock, consider all non-movable pages as
2328 		 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
2329 		 * vice versa, be conservative since we can't distinguish the
2330 		 * exact migratetype of non-movable pages.
2331 		 */
2332 		if (block_type == MIGRATE_MOVABLE)
2333 			alike_pages = pageblock_nr_pages
2334 						- (free_pages + movable_pages);
2335 		else
2336 			alike_pages = 0;
2337 	}
2338 	/*
2339 	 * If a sufficient number of pages in the block are either free or of
2340 	 * compatible migratability as our allocation, claim the whole block.
2341 	 */
2342 	if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
2343 			page_group_by_mobility_disabled) {
2344 		__move_freepages_block(zone, start_pfn, block_type, start_type);
2345 		set_pageblock_migratetype(pfn_to_page(start_pfn), start_type);
2346 		return __rmqueue_smallest(zone, order, start_type);
2347 	}
2348 
2349 	return NULL;
2350 }
2351 
2352 /*
2353  * Try to allocate from some fallback migratetype by claiming the entire block,
2354  * i.e. converting it to the allocation's start migratetype.
2355  *
2356  * The use of signed ints for order and current_order is a deliberate
2357  * deviation from the rest of this file, to make the for loop
2358  * condition simpler.
2359  */
2360 static __always_inline struct page *
2361 __rmqueue_claim(struct zone *zone, int order, int start_migratetype,
2362 						unsigned int alloc_flags)
2363 {
2364 	struct free_area *area;
2365 	int current_order;
2366 	int min_order = order;
2367 	struct page *page;
2368 	int fallback_mt;
2369 
2370 	/*
2371 	 * Do not steal pages from freelists belonging to other pageblocks
2372 	 * i.e. orders < pageblock_order. If there are no local zones free,
2373 	 * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
2374 	 */
2375 	if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT)
2376 		min_order = pageblock_order;
2377 
2378 	/*
2379 	 * Find the largest available free page in the other list. This roughly
2380 	 * approximates finding the pageblock with the most free pages, which
2381 	 * would be too costly to do exactly.
2382 	 */
2383 	for (current_order = MAX_PAGE_ORDER; current_order >= min_order;
2384 				--current_order) {
2385 		enum fallback_result result;
2386 
2387 		area = &(zone->free_area[current_order]);
2388 		result = find_suitable_fallback(area, current_order,
2389 						start_migratetype, true, &fallback_mt);
2390 
2391 		if (result == FALLBACK_EMPTY)
2392 			continue;
2393 
2394 		if (result == FALLBACK_NOCLAIM)
2395 			break;
2396 
2397 		page = get_page_from_free_area(area, fallback_mt);
2398 		page = try_to_claim_block(zone, page, current_order, order,
2399 					  start_migratetype, fallback_mt,
2400 					  alloc_flags);
2401 		if (page) {
2402 			trace_mm_page_alloc_extfrag(page, order, current_order,
2403 						    start_migratetype, fallback_mt);
2404 			return page;
2405 		}
2406 	}
2407 
2408 	return NULL;
2409 }
2410 
2411 /*
2412  * Try to steal a single page from some fallback migratetype. Leave the rest of
2413  * the block as its current migratetype, potentially causing fragmentation.
2414  */
2415 static __always_inline struct page *
2416 __rmqueue_steal(struct zone *zone, int order, int start_migratetype)
2417 {
2418 	struct free_area *area;
2419 	int current_order;
2420 	struct page *page;
2421 	int fallback_mt;
2422 
2423 	for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
2424 		enum fallback_result result;
2425 
2426 		area = &(zone->free_area[current_order]);
2427 		result = find_suitable_fallback(area, current_order, start_migratetype,
2428 						false, &fallback_mt);
2429 		if (result == FALLBACK_EMPTY)
2430 			continue;
2431 
2432 		page = get_page_from_free_area(area, fallback_mt);
2433 		page_del_and_expand(zone, page, order, current_order, fallback_mt);
2434 		trace_mm_page_alloc_extfrag(page, order, current_order,
2435 					    start_migratetype, fallback_mt);
2436 		return page;
2437 	}
2438 
2439 	return NULL;
2440 }
2441 
/*
 * Stage at which __rmqueue() starts its search. The stages are tried in
 * the order listed; __rmqueue() updates the caller's mode so that a later
 * call can skip stages already known to be exhausted.
 */
enum rmqueue_mode {
	/* Start with the freelists of the requested migratetype */
	RMQUEUE_NORMAL,
	/* Start with the CMA fallback */
	RMQUEUE_CMA,
	/* Start with claiming a whole fallback pageblock */
	RMQUEUE_CLAIM,
	/* Start with stealing a single page from a fallback migratetype */
	RMQUEUE_STEAL,
};
2448 
2449 /*
2450  * Do the hard work of removing an element from the buddy allocator.
2451  * Call me with the zone->lock already held.
2452  */
2453 static __always_inline struct page *
2454 __rmqueue(struct zone *zone, unsigned int order, int migratetype,
2455 	  unsigned int alloc_flags, enum rmqueue_mode *mode)
2456 {
2457 	struct page *page;
2458 
2459 	if (IS_ENABLED(CONFIG_CMA)) {
2460 		/*
2461 		 * Balance movable allocations between regular and CMA areas by
2462 		 * allocating from CMA when over half of the zone's free memory
2463 		 * is in the CMA area.
2464 		 */
2465 		if (alloc_flags & ALLOC_CMA &&
2466 		    zone_page_state(zone, NR_FREE_CMA_PAGES) >
2467 		    zone_page_state(zone, NR_FREE_PAGES) / 2) {
2468 			page = __rmqueue_cma_fallback(zone, order);
2469 			if (page)
2470 				return page;
2471 		}
2472 	}
2473 
2474 	/*
2475 	 * First try the freelists of the requested migratetype, then try
2476 	 * fallbacks modes with increasing levels of fragmentation risk.
2477 	 *
2478 	 * The fallback logic is expensive and rmqueue_bulk() calls in
2479 	 * a loop with the zone->lock held, meaning the freelists are
2480 	 * not subject to any outside changes. Remember in *mode where
2481 	 * we found pay dirt, to save us the search on the next call.
2482 	 */
2483 	switch (*mode) {
2484 	case RMQUEUE_NORMAL:
2485 		page = __rmqueue_smallest(zone, order, migratetype);
2486 		if (page)
2487 			return page;
2488 		fallthrough;
2489 	case RMQUEUE_CMA:
2490 		if (alloc_flags & ALLOC_CMA) {
2491 			page = __rmqueue_cma_fallback(zone, order);
2492 			if (page) {
2493 				*mode = RMQUEUE_CMA;
2494 				return page;
2495 			}
2496 		}
2497 		fallthrough;
2498 	case RMQUEUE_CLAIM:
2499 		page = __rmqueue_claim(zone, order, migratetype, alloc_flags);
2500 		if (page) {
2501 			/* Replenished preferred freelist, back to normal mode. */
2502 			*mode = RMQUEUE_NORMAL;
2503 			return page;
2504 		}
2505 		fallthrough;
2506 	case RMQUEUE_STEAL:
2507 		if (!(alloc_flags & ALLOC_NOFRAGMENT)) {
2508 			page = __rmqueue_steal(zone, order, migratetype);
2509 			if (page) {
2510 				*mode = RMQUEUE_STEAL;
2511 				return page;
2512 			}
2513 		}
2514 	}
2515 	return NULL;
2516 }
2517 
2518 /*
2519  * Obtain a specified number of elements from the buddy allocator, all under
2520  * a single hold of the lock, for efficiency.  Add them to the supplied list.
2521  * Returns the number of new pages which were placed at *list.
2522  */
2523 static int rmqueue_bulk(struct zone *zone, unsigned int order,
2524 			unsigned long count, struct list_head *list,
2525 			int migratetype, unsigned int alloc_flags)
2526 {
2527 	enum rmqueue_mode rmqm = RMQUEUE_NORMAL;
2528 	unsigned long flags;
2529 	int i;
2530 
2531 	if (unlikely(alloc_flags & ALLOC_TRYLOCK)) {
2532 		if (!spin_trylock_irqsave(&zone->lock, flags))
2533 			return 0;
2534 	} else {
2535 		spin_lock_irqsave(&zone->lock, flags);
2536 	}
2537 	for (i = 0; i < count; ++i) {
2538 		struct page *page = __rmqueue(zone, order, migratetype,
2539 					      alloc_flags, &rmqm);
2540 		if (unlikely(page == NULL))
2541 			break;
2542 
2543 		/*
2544 		 * Split buddy pages returned by expand() are received here in
2545 		 * physical page order. The page is added to the tail of
2546 		 * caller's list. From the callers perspective, the linked list
2547 		 * is ordered by page number under some conditions. This is
2548 		 * useful for IO devices that can forward direction from the
2549 		 * head, thus also in the physical page order. This is useful
2550 		 * for IO devices that can merge IO requests if the physical
2551 		 * pages are ordered properly.
2552 		 */
2553 		list_add_tail(&page->pcp_list, list);
2554 	}
2555 	spin_unlock_irqrestore(&zone->lock, flags);
2556 
2557 	return i;
2558 }
2559 
2560 /*
2561  * Called from the vmstat counter updater to decay the PCP high.
2562  * Return whether there are addition works to do.
2563  */
2564 bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
2565 {
2566 	int high_min, to_drain, to_drain_batched, batch;
2567 	bool todo = false;
2568 
2569 	high_min = READ_ONCE(pcp->high_min);
2570 	batch = READ_ONCE(pcp->batch);
2571 	/*
2572 	 * Decrease pcp->high periodically to try to free possible
2573 	 * idle PCP pages.  And, avoid to free too many pages to
2574 	 * control latency.  This caps pcp->high decrement too.
2575 	 */
2576 	if (pcp->high > high_min) {
2577 		pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
2578 				 pcp->high - (pcp->high >> 3), high_min);
2579 		if (pcp->high > high_min)
2580 			todo = true;
2581 	}
2582 
2583 	to_drain = pcp->count - pcp->high;
2584 	while (to_drain > 0) {
2585 		to_drain_batched = min(to_drain, batch);
2586 		pcp_spin_lock_nopin(pcp);
2587 		free_pcppages_bulk(zone, to_drain_batched, pcp, 0);
2588 		pcp_spin_unlock_nopin(pcp);
2589 		todo = true;
2590 
2591 		to_drain -= to_drain_batched;
2592 	}
2593 
2594 	return todo;
2595 }
2596 
2597 #ifdef CONFIG_NUMA
2598 /*
2599  * Called from the vmstat counter updater to drain pagesets of this
2600  * currently executing processor on remote nodes after they have
2601  * expired.
2602  */
2603 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
2604 {
2605 	int to_drain, batch;
2606 
2607 	batch = READ_ONCE(pcp->batch);
2608 	to_drain = min(pcp->count, batch);
2609 	if (to_drain > 0) {
2610 		pcp_spin_lock_nopin(pcp);
2611 		free_pcppages_bulk(zone, to_drain, pcp, 0);
2612 		pcp_spin_unlock_nopin(pcp);
2613 	}
2614 }
2615 #endif
2616 
2617 /*
2618  * Drain pcplists of the indicated processor and zone.
2619  */
2620 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
2621 {
2622 	struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
2623 	int count;
2624 
2625 	do {
2626 		pcp_spin_lock_nopin(pcp);
2627 		count = pcp->count;
2628 		if (count) {
2629 			int to_drain = min(count,
2630 				pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX);
2631 
2632 			free_pcppages_bulk(zone, to_drain, pcp, 0);
2633 			count -= to_drain;
2634 		}
2635 		pcp_spin_unlock_nopin(pcp);
2636 	} while (count);
2637 }
2638 
2639 /*
2640  * Drain pcplists of all zones on the indicated processor.
2641  */
2642 static void drain_pages(unsigned int cpu)
2643 {
2644 	struct zone *zone;
2645 
2646 	for_each_populated_zone(zone) {
2647 		drain_pages_zone(cpu, zone);
2648 	}
2649 }
2650 
/*
 * Spill this CPU's per-cpu pages back into the buddy allocator: only
 * those of @zone when it is non-NULL, otherwise those of every populated
 * zone.
 */
void drain_local_pages(struct zone *zone)
{
	int this_cpu = smp_processor_id();

	if (!zone) {
		drain_pages(this_cpu);
		return;
	}

	drain_pages_zone(this_cpu, zone);
}
2663 
2664 /*
2665  * The implementation of drain_all_pages(), exposing an extra parameter to
2666  * drain on all cpus.
2667  *
2668  * drain_all_pages() is optimized to only execute on cpus where pcplists are
2669  * not empty. The check for non-emptiness can however race with a free to
2670  * pcplist that has not yet increased the pcp->count from 0 to 1. Callers
2671  * that need the guarantee that every CPU has drained can disable the
2672  * optimizing racy check.
2673  */
2674 static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
2675 {
2676 	int cpu;
2677 
2678 	/*
2679 	 * Allocate in the BSS so we won't require allocation in
2680 	 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
2681 	 */
2682 	static cpumask_t cpus_with_pcps;
2683 
2684 	/*
2685 	 * Do not drain if one is already in progress unless it's specific to
2686 	 * a zone. Such callers are primarily CMA and memory hotplug and need
2687 	 * the drain to be complete when the call returns.
2688 	 */
2689 	if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
2690 		if (!zone)
2691 			return;
2692 		mutex_lock(&pcpu_drain_mutex);
2693 	}
2694 
2695 	/*
2696 	 * We don't care about racing with CPU hotplug event
2697 	 * as offline notification will cause the notified
2698 	 * cpu to drain that CPU pcps and on_each_cpu_mask
2699 	 * disables preemption as part of its processing
2700 	 */
2701 	for_each_online_cpu(cpu) {
2702 		struct per_cpu_pages *pcp;
2703 		struct zone *z;
2704 		bool has_pcps = false;
2705 
2706 		if (force_all_cpus) {
2707 			/*
2708 			 * The pcp.count check is racy, some callers need a
2709 			 * guarantee that no cpu is missed.
2710 			 */
2711 			has_pcps = true;
2712 		} else if (zone) {
2713 			pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
2714 			if (pcp->count)
2715 				has_pcps = true;
2716 		} else {
2717 			for_each_populated_zone(z) {
2718 				pcp = per_cpu_ptr(z->per_cpu_pageset, cpu);
2719 				if (pcp->count) {
2720 					has_pcps = true;
2721 					break;
2722 				}
2723 			}
2724 		}
2725 
2726 		if (has_pcps)
2727 			cpumask_set_cpu(cpu, &cpus_with_pcps);
2728 		else
2729 			cpumask_clear_cpu(cpu, &cpus_with_pcps);
2730 	}
2731 
2732 	for_each_cpu(cpu, &cpus_with_pcps) {
2733 		if (zone)
2734 			drain_pages_zone(cpu, zone);
2735 		else
2736 			drain_pages(cpu);
2737 	}
2738 
2739 	mutex_unlock(&pcpu_drain_mutex);
2740 }
2741 
2742 /*
2743  * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
2744  *
2745  * When zone parameter is non-NULL, spill just the single zone's pages.
2746  */
2747 void drain_all_pages(struct zone *zone)
2748 {
2749 	__drain_all_pages(zone, false);
2750 }
2751 
2752 static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high)
2753 {
2754 	int min_nr_free, max_nr_free;
2755 
2756 	/* Free as much as possible if batch freeing high-order pages. */
2757 	if (unlikely(free_high))
2758 		return min(pcp->count, batch << CONFIG_PCP_BATCH_SCALE_MAX);
2759 
2760 	/* Check for PCP disabled or boot pageset */
2761 	if (unlikely(high < batch))
2762 		return 1;
2763 
2764 	/* Leave at least pcp->batch pages on the list */
2765 	min_nr_free = batch;
2766 	max_nr_free = high - batch;
2767 
2768 	/*
2769 	 * Increase the batch number to the number of the consecutive
2770 	 * freed pages to reduce zone lock contention.
2771 	 */
2772 	batch = clamp_t(int, pcp->free_count, min_nr_free, max_nr_free);
2773 
2774 	return batch;
2775 }
2776 
2777 static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
2778 		       int batch, bool free_high)
2779 {
2780 	int high, high_min, high_max;
2781 
2782 	high_min = READ_ONCE(pcp->high_min);
2783 	high_max = READ_ONCE(pcp->high_max);
2784 	high = pcp->high = clamp(pcp->high, high_min, high_max);
2785 
2786 	if (unlikely(!high))
2787 		return 0;
2788 
2789 	if (unlikely(free_high)) {
2790 		pcp->high = max(high - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
2791 				high_min);
2792 		return 0;
2793 	}
2794 
2795 	/*
2796 	 * If reclaim is active, limit the number of pages that can be
2797 	 * stored on pcp lists
2798 	 */
2799 	if (test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) {
2800 		int free_count = max_t(int, pcp->free_count, batch);
2801 
2802 		pcp->high = max(high - free_count, high_min);
2803 		return min(batch << 2, pcp->high);
2804 	}
2805 
2806 	if (high_min == high_max)
2807 		return high;
2808 
2809 	if (test_bit(ZONE_BELOW_HIGH, &zone->flags)) {
2810 		int free_count = max_t(int, pcp->free_count, batch);
2811 
2812 		pcp->high = max(high - free_count, high_min);
2813 		high = max(pcp->count, high_min);
2814 	} else if (pcp->count >= high) {
2815 		int need_high = pcp->free_count + batch;
2816 
2817 		/* pcp->high should be large enough to hold batch freed pages */
2818 		if (pcp->high < need_high)
2819 			pcp->high = clamp(need_high, high_min, high_max);
2820 	}
2821 
2822 	return high;
2823 }
2824 
2825 /*
2826  * Tune pcp alloc factor and adjust count & free_count. Free pages to bring the
2827  * pcp's watermarks below high.
2828  *
2829  * May return a freed pcp, if during page freeing the pcp spinlock cannot be
2830  * reacquired. Return true if pcp is locked, false otherwise.
2831  */
2832 static bool free_frozen_page_commit(struct zone *zone,
2833 		struct per_cpu_pages *pcp, struct page *page, int migratetype,
2834 		unsigned int order, fpi_t fpi_flags)
2835 {
2836 	int high, batch;
2837 	int to_free, to_free_batched;
2838 	int pindex;
2839 	int cpu = smp_processor_id();
2840 	int ret = true;
2841 	bool free_high = false;
2842 
2843 	/*
2844 	 * On freeing, reduce the number of pages that are batch allocated.
2845 	 * See nr_pcp_alloc() where alloc_factor is increased for subsequent
2846 	 * allocations.
2847 	 */
2848 	pcp->alloc_factor >>= 1;
2849 	__count_vm_events(PGFREE, 1 << order);
2850 	pindex = order_to_pindex(migratetype, order);
2851 	list_add(&page->pcp_list, &pcp->lists[pindex]);
2852 	pcp->count += 1 << order;
2853 
2854 	batch = READ_ONCE(pcp->batch);
2855 	/*
2856 	 * As high-order pages other than THP's stored on PCP can contribute
2857 	 * to fragmentation, limit the number stored when PCP is heavily
2858 	 * freeing without allocation. The remainder after bulk freeing
2859 	 * stops will be drained from vmstat refresh context.
2860 	 */
2861 	if (order && order <= PAGE_ALLOC_COSTLY_ORDER) {
2862 		free_high = (pcp->free_count >= (batch + pcp->high_min / 2) &&
2863 			     (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
2864 			     (!(pcp->flags & PCPF_FREE_HIGH_BATCH) ||
2865 			      pcp->count >= batch));
2866 		pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER;
2867 	} else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
2868 		pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
2869 	}
2870 	if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
2871 		pcp->free_count += (1 << order);
2872 
2873 	if (unlikely(fpi_flags & FPI_TRYLOCK)) {
2874 		/*
2875 		 * Do not attempt to take a zone lock. Let pcp->count get
2876 		 * over high mark temporarily.
2877 		 */
2878 		return true;
2879 	}
2880 
2881 	high = nr_pcp_high(pcp, zone, batch, free_high);
2882 	if (pcp->count < high)
2883 		return true;
2884 
2885 	to_free = nr_pcp_free(pcp, batch, high, free_high);
2886 	while (to_free > 0 && pcp->count > 0) {
2887 		to_free_batched = min(to_free, batch);
2888 		free_pcppages_bulk(zone, to_free_batched, pcp, pindex);
2889 		to_free -= to_free_batched;
2890 
2891 		if (to_free == 0 || pcp->count == 0)
2892 			break;
2893 
2894 		pcp_spin_unlock(pcp);
2895 
2896 		pcp = pcp_spin_trylock(zone->per_cpu_pageset);
2897 		if (!pcp) {
2898 			ret = false;
2899 			break;
2900 		}
2901 
2902 		/*
2903 		 * Check if this thread has been migrated to a different CPU.
2904 		 * If that is the case, give up and indicate that the pcp is
2905 		 * returned in an unlocked state.
2906 		 */
2907 		if (smp_processor_id() != cpu) {
2908 			pcp_spin_unlock(pcp);
2909 			ret = false;
2910 			break;
2911 		}
2912 	}
2913 
2914 	if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
2915 	    zone_watermark_ok(zone, 0, high_wmark_pages(zone),
2916 			      ZONE_MOVABLE, 0)) {
2917 		struct pglist_data *pgdat = zone->zone_pgdat;
2918 		clear_bit(ZONE_BELOW_HIGH, &zone->flags);
2919 
2920 		/*
2921 		 * Assume that memory pressure on this node is gone and may be
2922 		 * in a reclaimable state. If a memory fallback node exists,
2923 		 * direct reclaim may not have been triggered, causing a
2924 		 * 'hopeless node' to stay in that state for a while.  Let
2925 		 * kswapd work again by resetting kswapd_failures.
2926 		 */
2927 		if (kswapd_test_hopeless(pgdat) &&
2928 		    next_memory_node(pgdat->node_id) < MAX_NUMNODES)
2929 			kswapd_clear_hopeless(pgdat, KSWAPD_CLEAR_HOPELESS_PCP);
2930 	}
2931 	return ret;
2932 }
2933 
2934 /*
2935  * Free a pcp page
 *
 * Free @page of order @order, preferring the per-cpu page (pcp) lists:
 *  - orders rejected by pcp_allowed_order() are passed to
 *    __free_pages_ok();
 *  - nothing further is done when __free_pages_prepare() returns false;
 *  - pages whose pageblock is isolated go to free_one_page();
 *  - with FPI_TRYLOCK on PREEMPT_RT in NMI or hardirq context the page is
 *    handed to add_page_to_zone_llist() and no lock is taken;
 *  - otherwise the page is committed to the pcp list if the pcp trylock
 *    succeeds, or passed to free_one_page() if it does not.
2936  */
2937 static void __free_frozen_pages(struct page *page, unsigned int order,
2938 				fpi_t fpi_flags)
2939 {
2940 	struct per_cpu_pages *pcp;
2941 	struct zone *zone;
2942 	unsigned long pfn = page_to_pfn(page);
2943 	int migratetype;
2944 
2945 	if (!pcp_allowed_order(order)) {
2946 		__free_pages_ok(page, order, fpi_flags);
2947 		return;
2948 	}
2949 
2950 	if (!__free_pages_prepare(page, order, fpi_flags))
2951 		return;
2952 
2953 	/*
2954 	 * We only track unmovable, reclaimable and movable on pcp lists.
2955 	 * Place ISOLATE pages on the isolated list because they are being
2956 	 * offlined but treat HIGHATOMIC and CMA as movable pages so we can
2957 	 * get those areas back if necessary. Otherwise, we may have to free
2958 	 * excessively into the page allocator
2959 	 */
2960 	zone = page_zone(page);
2961 	migratetype = get_pfnblock_migratetype(page, pfn);
2962 	if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
2963 		if (unlikely(is_migrate_isolate(migratetype))) {
2964 			free_one_page(zone, page, pfn, order, fpi_flags);
2965 			return;
2966 		}
2967 		migratetype = MIGRATE_MOVABLE;
2968 	}
2969 
	/*
	 * Trylock callers on PREEMPT_RT in NMI/hardirq context take no lock
	 * here; the page is queued with add_page_to_zone_llist() instead.
	 */
2970 	if (unlikely((fpi_flags & FPI_TRYLOCK) && IS_ENABLED(CONFIG_PREEMPT_RT)
2971 		     && (in_nmi() || in_hardirq()))) {
2972 		add_page_to_zone_llist(zone, page, order);
2973 		return;
2974 	}
2975 	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
2976 	if (pcp) {
		/*
		 * A false return means free_frozen_page_commit() has already
		 * released the pcp lock, so it must not be unlocked again.
		 */
2977 		if (!free_frozen_page_commit(zone, pcp, page, migratetype,
2978 						order, fpi_flags))
2979 			return;
2980 		pcp_spin_unlock(pcp);
2981 	} else {
2982 		free_one_page(zone, page, pfn, order, fpi_flags);
2983 	}
2984 }
2985 
/*
 * Free @page of order @order through __free_frozen_pages() with FPI_NONE,
 * i.e. without any special free-path flags.
 */
2986 void free_frozen_pages(struct page *page, unsigned int order)
2987 {
2988 	__free_frozen_pages(page, order, FPI_NONE);
2989 }
2990 
/*
 * Free @page of order @order through __free_frozen_pages() with
 * FPI_TRYLOCK set.  In the free path that flag makes the pcp commit return
 * before any attempt to take the zone lock, and on PREEMPT_RT in NMI or
 * hardirq context it defers the page via add_page_to_zone_llist().
 */
2991 void free_frozen_pages_nolock(struct page *page, unsigned int order)
2992 {
2993 	__free_frozen_pages(page, order, FPI_TRYLOCK);
2994 }
2995 
2996 /*
2997  * Free a batch of folios
 *
 * Works in two passes over @folios:
 *  1. Each folio is run through __free_pages_prepare(); folios for which it
 *     returns false are dropped from the batch, and folios whose order is
 *     rejected by pcp_allowed_order() are freed at once with
 *     free_one_page().  The survivors are compacted to the front of the
 *     array with their order stashed in folio->private.
 *  2. The survivors are committed to the per-cpu lists, keeping the pcp
 *     lock held across consecutive folios that belong to the same zone.
 *
 * The batch is reinitialised with folio_batch_reinit() before returning.
2998  */
2999 void free_unref_folios(struct folio_batch *folios)
3000 {
3001 	struct per_cpu_pages *pcp = NULL;
3002 	struct zone *locked_zone = NULL;
3003 	int i, j;
3004 
3005 	/* Prepare folios for freeing */
3006 	for (i = 0, j = 0; i < folios->nr; i++) {
3007 		struct folio *folio = folios->folios[i];
3008 		unsigned long pfn = folio_pfn(folio);
3009 		unsigned int order = folio_order(folio);
3010 
3011 		if (!__free_pages_prepare(&folio->page, order, FPI_NONE))
3012 			continue;
3013 		/*
3014 		 * Free orders not handled on the PCP directly to the
3015 		 * allocator.
3016 		 */
3017 		if (!pcp_allowed_order(order)) {
3018 			free_one_page(folio_zone(folio), &folio->page,
3019 				      pfn, order, FPI_NONE);
3020 			continue;
3021 		}
		/* Stash the order; the second pass reads it back from ->private. */
3022 		folio->private = (void *)(unsigned long)order;
		/* Compact the folios that still need freeing to the front. */
3023 		if (j != i)
3024 			folios->folios[j] = folio;
3025 		j++;
3026 	}
3027 	folios->nr = j;
3028 
3029 	for (i = 0; i < folios->nr; i++) {
3030 		struct folio *folio = folios->folios[i];
3031 		struct zone *zone = folio_zone(folio);
3032 		unsigned long pfn = folio_pfn(folio);
3033 		unsigned int order = (unsigned long)folio->private;
3034 		int migratetype;
3035 
3036 		folio->private = NULL;
3037 		migratetype = get_pfnblock_migratetype(&folio->page, pfn);
3038 
3039 		/* Different zone requires a different pcp lock */
3040 		if (zone != locked_zone ||
3041 		    is_migrate_isolate(migratetype)) {
3042 			if (pcp) {
3043 				pcp_spin_unlock(pcp);
3044 				locked_zone = NULL;
3045 				pcp = NULL;
3046 			}
3047 
3048 			/*
3049 			 * Free isolated pages directly to the
3050 			 * allocator, see comment in free_frozen_pages.
3051 			 */
3052 			if (is_migrate_isolate(migratetype)) {
3053 				free_one_page(zone, &folio->page, pfn,
3054 					      order, FPI_NONE);
3055 				continue;
3056 			}
3057 
3058 			/*
3059 			 * trylock is necessary as folios may be getting freed
3060 			 * from IRQ or SoftIRQ context after an IO completion.
3061 			 */
3062 			pcp = pcp_spin_trylock(zone->per_cpu_pageset);
3063 			if (unlikely(!pcp)) {
3064 				free_one_page(zone, &folio->page, pfn,
3065 					      order, FPI_NONE);
3066 				continue;
3067 			}
3068 			locked_zone = zone;
3069 		}
3070 
3071 		/*
3072 		 * Non-isolated types over MIGRATE_PCPTYPES get added
3073 		 * to the MIGRATE_MOVABLE pcp list.
3074 		 */
3075 		if (unlikely(migratetype >= MIGRATE_PCPTYPES))
3076 			migratetype = MIGRATE_MOVABLE;
3077 
3078 		trace_mm_page_free_batched(&folio->page);
		/*
		 * A false return means the commit released the pcp lock
		 * itself; forget it so the next folio takes the lock afresh.
		 */
3079 		if (!free_frozen_page_commit(zone, pcp, &folio->page,
3080 				migratetype, order, FPI_NONE)) {
3081 			pcp = NULL;
3082 			locked_zone = NULL;
3083 		}
3084 	}
3085 
3086 	if (pcp)
3087 		pcp_spin_unlock(pcp);
3088 	folio_batch_reinit(folios);
3089 }
3090 
/*
 * Helper used by split_page(): warns if @page is a compound page, then
 * calls the page_owner, allocation-tag and memcg split hooks for the
 * order-@order page, in that order.
 */
3091 static void __split_page(struct page *page, unsigned int order)
3092 {
3093 	VM_WARN_ON_PAGE(PageCompound(page), page);
3094 
3095 	split_page_owner(page, order, 0);
3096 	pgalloc_tag_split(page_folio(page), order, 0);
3097 	split_page_memcg(page, order);
3098 }
3099 
3100 /*
3101  * split_page takes a non-compound higher-order page, and splits it into
3102  * n (1<<order) sub-pages: page[0..n]
3103  * Each sub-page must be freed individually.
3104  *
3105  * Note: this is probably too low level an operation for use in drivers.
3106  * Please consult with lkml before using this in your driver.
3107  */
3108 void split_page(struct page *page, unsigned int order)
3109 {
3110 	int i;
3111 
3112 	VM_WARN_ON_PAGE(!page_count(page), page);
3113 
3114 	for (i = 1; i < (1 << order); i++)
3115 		set_page_refcounted(page + i);
3116 
3117 	__split_page(page, order);
3118 }
3119 EXPORT_SYMBOL_GPL(split_page);
3120 
/*
 * Take the free page @page of order @order off its free list.
 *
 * Unless the page's pageblock is isolated, the zone must first pass an
 * order-0 watermark check against WMARK_MIN raised by (1 << order); 0 is
 * returned if that check fails.  When the page covers at least half a
 * pageblock, each mergeable pageblock it spans is handed to
 * move_freepages_block() with MIGRATE_MOVABLE as the new type.
 *
 * Returns the number of pages removed (1 << order), or 0 on failure.
 * NOTE(review): presumably the caller holds zone->lock - confirm.
 */
3121 int __isolate_free_page(struct page *page, unsigned int order)
3122 {
3123 	struct zone *zone = page_zone(page);
3124 	int mt = get_pageblock_migratetype(page);
3125 
3126 	if (!is_migrate_isolate(mt)) {
3127 		unsigned long watermark;
3128 		/*
3129 		 * Obey watermarks as if the page was being allocated. We can
3130 		 * emulate a high-order watermark check with a raised order-0
3131 		 * watermark, because we already know our high-order page
3132 		 * exists.
3133 		 */
3134 		watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
3135 		if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
3136 			return 0;
3137 	}
3138 
3139 	del_page_from_free_list(page, zone, order, mt);
3140 
3141 	/*
3142 	 * Set the pageblock if the isolated page is at least half of a
3143 	 * pageblock
3144 	 */
3145 	if (order >= pageblock_order - 1) {
3146 		struct page *endpage = page + (1 << order) - 1;
3147 		for (; page < endpage; page += pageblock_nr_pages) {
			/* Shadows the outer mt: type of the pageblock being visited. */
3148 			int mt = get_pageblock_migratetype(page);
3149 			/*
3150 			 * Only change normal pageblocks (i.e., they can merge
3151 			 * with others)
3152 			 */
3153 			if (migratetype_is_mergeable(mt))
3154 				move_freepages_block(zone, page, mt,
3155 						     MIGRATE_MOVABLE);
3156 		}
3157 	}
3158 
3159 	return 1UL << order;
3160 }
3161 
3162 /**
3163  * __putback_isolated_page - Return a now-isolated page back where we got it
3164  * @page: Page that was isolated
3165  * @order: Order of the isolated page
3166  * @mt: The page's pageblock's migratetype
3167  *
3168  * This function is meant to return a page pulled from the free lists via
3169  * __isolate_free_page back to the free lists they were pulled from.
3170  */
3171 void __putback_isolated_page(struct page *page, unsigned int order, int mt)
3172 {
3173 	struct zone *zone = page_zone(page);
3174 
3175 	/* zone lock should be held when this function is called */
3176 	lockdep_assert_held(&zone->lock);
3177 
3178 	/* Return isolated page to tail of freelist. */
3179 	__free_one_page(page, page_to_pfn(page), zone, order, mt,
3180 			FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
3181 }
3182 
/*
 * Account @nr_account allocations from zone @z in the NUMA event counters.
 *
 * NUMA_HIT is counted when @z is on the same node as @preferred_zone;
 * otherwise NUMA_MISS is counted on @z and NUMA_FOREIGN on @preferred_zone.
 * In addition, NUMA_LOCAL or NUMA_OTHER is counted on @z depending on
 * whether @z is on the node the caller is running on.
 *
 * Does nothing without CONFIG_NUMA or while vm_numa_stat_key is disabled.
 */
static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
				   long nr_account)
{
#ifdef CONFIG_NUMA
	enum numa_stat_item locality;
	int zone_nid;

	/* skip numa counters update if numa stats is disabled */
	if (!static_branch_likely(&vm_numa_stat_key))
		return;

	zone_nid = zone_to_nid(z);
	locality = (zone_nid != numa_node_id()) ? NUMA_OTHER : NUMA_LOCAL;

	if (zone_nid != zone_to_nid(preferred_zone)) {
		__count_numa_events(z, NUMA_MISS, nr_account);
		__count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account);
	} else {
		__count_numa_events(z, NUMA_HIT, nr_account);
	}

	__count_numa_events(z, locality, nr_account);
#endif
}
3208 
/*
 * Allocate an order-@order page for @migratetype straight from @zone's free
 * lists, under zone->lock.
 *
 * With ALLOC_TRYLOCK the lock is only tried and NULL is returned if it
 * cannot be taken.  ALLOC_HIGHATOMIC requests try the MIGRATE_HIGHATOMIC
 * lists first; everything else, and HIGHATOMIC misses, go through
 * __rmqueue().  The whole attempt is repeated for as long as
 * check_new_pages() returns non-zero for the page that was found.
 *
 * On success the PGALLOC and NUMA counters are updated.
 *
 * Returns the page, or NULL if nothing could be allocated.
 */
3209 static __always_inline
3210 struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
3211 			   unsigned int order, unsigned int alloc_flags,
3212 			   int migratetype)
3213 {
3214 	struct page *page;
3215 	unsigned long flags;
3216 
3217 	do {
3218 		page = NULL;
3219 		if (unlikely(alloc_flags & ALLOC_TRYLOCK)) {
3220 			if (!spin_trylock_irqsave(&zone->lock, flags))
3221 				return NULL;
3222 		} else {
3223 			spin_lock_irqsave(&zone->lock, flags);
3224 		}
3225 		if (alloc_flags & ALLOC_HIGHATOMIC)
3226 			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
3227 		if (!page) {
3228 			enum rmqueue_mode rmqm = RMQUEUE_NORMAL;
3229 
3230 			page = __rmqueue(zone, order, migratetype, alloc_flags, &rmqm);
3231 
3232 			/*
3233 			 * If the allocation fails, allow OOM handling and
3234 			 * order-0 (atomic) allocs access to HIGHATOMIC
3235 			 * reserves as failing now is worse than failing a
3236 			 * high-order atomic allocation in the future.
3237 			 */
3238 			if (!page && (alloc_flags & (ALLOC_OOM|ALLOC_NON_BLOCK)))
3239 				page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
3240 
			/* Nothing available: drop the lock before giving up. */
3241 			if (!page) {
3242 				spin_unlock_irqrestore(&zone->lock, flags);
3243 				return NULL;
3244 			}
3245 		}
3246 		spin_unlock_irqrestore(&zone->lock, flags);
3247 	} while (check_new_pages(page, order));
3248 
3249 	/*
3250 	 * If this is a high-order atomic allocation then check
3251 	 * if the pageblock should be reserved for the future
3252 	 */
3253 	if (unlikely(alloc_flags & ALLOC_HIGHATOMIC))
3254 		reserve_highatomic_pageblock(page, order, zone);
3255 
3256 	__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
3257 	zone_statistics(preferred_zone, zone, 1);
3258 
3259 	return page;
3260 }
3261 
/*
 * Work out how many order-@order pages to request when refilling a pcp list
 * (see __rmqueue_pcplist()).
 *
 * Side effects: pcp->high is clamped to [high_min, high_max] and, when
 * high_min != high_max and ZONE_BELOW_HIGH is clear, raised by one batch
 * (capped at high_max); pcp->alloc_factor may be incremented for order-0
 * requests.
 *
 * Returns 1 when high is below the base batch, otherwise the batch size
 * shifted right by @order (never below 2 once the batch exceeds 1).
 */
3262 static int nr_pcp_alloc(struct per_cpu_pages *pcp, struct zone *zone, int order)
3263 {
3264 	int high, base_batch, batch, max_nr_alloc;
3265 	int high_max, high_min;
3266 
3267 	base_batch = READ_ONCE(pcp->batch);
3268 	high_min = READ_ONCE(pcp->high_min);
3269 	high_max = READ_ONCE(pcp->high_max);
3270 	high = pcp->high = clamp(pcp->high, high_min, high_max);
3271 
3272 	/* Check for PCP disabled or boot pageset */
3273 	if (unlikely(high < base_batch))
3274 		return 1;
3275 
	/* Only order-0 requests have their batch scaled by alloc_factor. */
3276 	if (order)
3277 		batch = base_batch;
3278 	else
3279 		batch = (base_batch << pcp->alloc_factor);
3280 
3281 	/*
3282 	 * If we had larger pcp->high, we could avoid to allocate from
3283 	 * zone.
3284 	 */
3285 	if (high_min != high_max && !test_bit(ZONE_BELOW_HIGH, &zone->flags))
3286 		high = pcp->high = min(high + batch, high_max);
3287 
3288 	if (!order) {
		/* Room left below high after one base batch, at least base_batch. */
3289 		max_nr_alloc = max(high - pcp->count - base_batch, base_batch);
3290 		/*
3291 		 * Double the number of pages allocated each time there is
3292 		 * subsequent allocation of order-0 pages without any freeing.
3293 		 */
3294 		if (batch <= max_nr_alloc &&
3295 		    pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX)
3296 			pcp->alloc_factor++;
3297 		batch = min(batch, max_nr_alloc);
3298 	}
3299 
3300 	/*
3301 	 * Scale batch relative to order if batch implies free pages
3302 	 * can be stored on the PCP. Batch can be 1 for small zones or
3303 	 * for boot pagesets which should never store free pages as
3304 	 * the pages may belong to arbitrary zones.
3305 	 */
3306 	if (batch > 1)
3307 		batch = max(batch >> order, 2);
3308 
3309 	return batch;
3310 }
3311 
3312 /* Remove page from the per-cpu list, caller must protect the list */
/*
 * When @list is empty it is refilled from @zone with rmqueue_bulk(), using
 * the batch size from nr_pcp_alloc(), and pcp->count is increased by the
 * pages obtained.  The first page on @list is then unlinked and pcp->count
 * decreased accordingly.  The removal is repeated for as long as
 * check_new_pages() returns non-zero for the page taken.
 *
 * Returns the page, or NULL if @list is empty and either the request has
 * ALLOC_HIGHATOMIC set or the refill produced no pages.
 */
3313 static inline
3314 struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
3315 			int migratetype,
3316 			unsigned int alloc_flags,
3317 			struct per_cpu_pages *pcp,
3318 			struct list_head *list)
3319 {
3320 	struct page *page;
3321 
3322 	do {
3323 		if (list_empty(list)) {
3324 			int batch = nr_pcp_alloc(pcp, zone, order);
3325 			int alloced;
3326 
3327 			/*
3328 			 * Don't refill the list for a higher order atomic
3329 			 * allocation under memory pressure, as this would
3330 			 * not build up any HIGHATOMIC reserves, which
3331 			 * might be needed soon.
3332 			 *
3333 			 * Instead, direct it towards the reserves by
3334 			 * returning NULL, which will make the caller fall
3335 			 * back to rmqueue_buddy. This will try to use the
3336 			 * reserves first and grow them if needed.
3337 			 */
3338 			if (alloc_flags & ALLOC_HIGHATOMIC)
3339 				return NULL;
3340 
3341 			alloced = rmqueue_bulk(zone, order,
3342 					batch, list,
3343 					migratetype, alloc_flags);
3344 
			/* pcp->count is kept in base pages, hence the shift. */
3345 			pcp->count += alloced << order;
3346 			if (unlikely(list_empty(list)))
3347 				return NULL;
3348 		}
3349 
3350 		page = list_first_entry(list, struct page, pcp_list);
3351 		list_del(&page->pcp_list);
3352 		pcp->count -= 1 << order;
3353 	} while (check_new_pages(page, order));
3354 
3355 	return page;
3356 }
3357 
3358 /* Lock and remove page from the per-cpu list */
3359 static struct page *rmqueue_pcplist(struct zone *preferred_zone,
3360 			struct zone *zone, unsigned int order,
3361 			int migratetype, unsigned int alloc_flags)
3362 {
3363 	struct per_cpu_pages *pcp;
3364 	struct list_head *list;
3365 	struct page *page;
3366 
3367 	/* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
3368 	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
3369 	if (!pcp)
3370 		return NULL;
3371 
3372 	/*
3373 	 * On allocation, reduce the number of pages that are batch freed.
3374 	 * See nr_pcp_free() where free_factor is increased for subsequent
3375 	 * frees.
3376 	 */
3377 	pcp->free_count >>= 1;
3378 	list = &pcp->lists[order_to_pindex(migratetype, order)];
3379 	page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
3380 	pcp_spin_unlock(pcp);
3381 	if (page) {
3382 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
3383 		zone_statistics(preferred_zone, zone, 1);
3384 	}
3385 	return page;
3386 }
3387 
3388 /*
3389  * Allocate a page from the given zone.
3390  * Use pcplists for THP or "cheap" high-order allocations.
3391  */
3392 
3393 /*
3394  * Do not instrument rmqueue() with KMSAN. This function may call
3395  * __msan_poison_alloca() through a call to set_pfnblock_migratetype().
3396  * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it
3397  * may call rmqueue() again, which will result in a deadlock.
3398  */
3399 __no_sanitize_memory
3400 static inline
3401 struct page *rmqueue(struct zone *preferred_zone,
3402 			struct zone *zone, unsigned int order,
3403 			gfp_t gfp_flags, unsigned int alloc_flags,
3404 			int migratetype)
3405 {
3406 	struct page *page;
3407 
3408 	if (likely(pcp_allowed_order(order))) {
3409 		page = rmqueue_pcplist(preferred_zone, zone, order,
3410 				       migratetype, alloc_flags);
3411 		if (likely(page))
3412 			goto out;
3413 	}
3414 
3415 	page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
3416 							migratetype);
3417 
3418 out:
3419 	/* Separate test+clear to avoid unnecessary atomics */
3420 	if ((alloc_flags & ALLOC_KSWAPD) &&
3421 	    unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
3422 		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
3423 		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
3424 	}
3425 
3426 	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
3427 	return page;
3428 }
3429 
3430 /*
3431  * Reserve the pageblock(s) surrounding an allocation request for
3432  * exclusive use of high-order atomic allocations if there are no
3433  * empty page blocks that contain a page with a suitable order
 *
 * Nothing is reserved when 1% of the zone is smaller than a pageblock, when
 * zone->nr_reserved_highatomic has already reached the limit, or when the
 * pageblock's migratetype is not mergeable.  On success
 * zone->nr_reserved_highatomic grows by pageblock_nr_pages (for
 * @order < pageblock_order) or by 1 << @order.
3434  */
3435 static void reserve_highatomic_pageblock(struct page *page, int order,
3436 					 struct zone *zone)
3437 {
3438 	int mt;
3439 	unsigned long max_managed;
3440 
3441 	/*
3442 	 * The number reserved as: minimum is 1 pageblock, maximum is
3443 	 * roughly 1% of a zone. But if 1% of a zone falls below a
3444 	 * pageblock size, then don't reserve any pageblocks.
3445 	 * Check is race-prone but harmless.
3446 	 */
3447 	if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages)
3448 		return;
3449 	max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages);
3450 	if (zone->nr_reserved_highatomic >= max_managed)
3451 		return;
3452 
	/* zone->lock is held from here on; guard() drops it on every return. */
3453 	guard(spinlock_irqsave)(&zone->lock);
3454 
3455 	/* Recheck the nr_reserved_highatomic limit under the lock */
3456 	if (zone->nr_reserved_highatomic >= max_managed)
3457 		return;
3458 
3459 	/* Yoink! */
3460 	mt = get_pageblock_migratetype(page);
3461 	/* Only reserve normal pageblocks (i.e., they can merge with others) */
3462 	if (!migratetype_is_mergeable(mt))
3463 		return;
3464 
3465 	if (order < pageblock_order) {
		/* -1 means the block could not be moved; reserve nothing. */
3466 		if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1)
3467 			return;
3468 		zone->nr_reserved_highatomic += pageblock_nr_pages;
3469 	} else {
3470 		change_pageblock_range(page, order, MIGRATE_HIGHATOMIC);
3471 		zone->nr_reserved_highatomic += 1 << order;
3472 	}
3473 }
3474 
3475 /*
3476  * Used when an allocation is about to fail under memory pressure. This
3477  * potentially hurts the reliability of high-order allocations when under
3478  * intense memory pressure but failed atomic allocations should be easier
3479  * to recover from than an OOM.
3480  *
3481  * If @force is true, try to unreserve pageblocks even though highatomic
3482  * pageblock is exhausted.
 *
 * Walks the zones selected by @ac and, for the first free
 * MIGRATE_HIGHATOMIC page found, lowers zone->nr_reserved_highatomic and
 * converts the block(s) to ac->migratetype.
 *
 * Returns true once a conversion reported a positive result, false if no
 * zone yielded one.
3483  */
3484 static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
3485 						bool force)
3486 {
3487 	struct zonelist *zonelist = ac->zonelist;
3488 	struct zoneref *z;
3489 	struct zone *zone;
3490 	struct page *page;
3491 	int order;
3492 	int ret;
3493 
3494 	for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
3495 								ac->nodemask) {
3496 		/*
3497 		 * Preserve at least one pageblock unless memory pressure
3498 		 * is really high.
3499 		 */
3500 		if (!force && zone->nr_reserved_highatomic <=
3501 					pageblock_nr_pages)
3502 			continue;
3503 
		/*
		 * Scope-based lock: released when this loop iteration ends
		 * or when the function returns from inside it.
		 */
3504 		guard(spinlock_irqsave)(&zone->lock);
3505 		for (order = 0; order < NR_PAGE_ORDERS; order++) {
3506 			struct free_area *area = &(zone->free_area[order]);
3507 			unsigned long size;
3508 
3509 			page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
3510 			if (!page)
3511 				continue;
3512 
			/* The reservation is accounted in whole pageblocks at least. */
3513 			size = max(pageblock_nr_pages, 1UL << order);
3514 			/*
3515 			 * It should never happen but changes to
3516 			 * locking could inadvertently allow a per-cpu
3517 			 * drain to add pages to MIGRATE_HIGHATOMIC
3518 			 * while unreserving so be safe and watch for
3519 			 * underflows.
3520 			 */
3521 			if (WARN_ON_ONCE(size > zone->nr_reserved_highatomic))
3522 				size = zone->nr_reserved_highatomic;
3523 			zone->nr_reserved_highatomic -= size;
3524 
3525 			/*
3526 			 * Convert to ac->migratetype and avoid the normal
3527 			 * pageblock stealing heuristics. Minimally, the caller
3528 			 * is doing the work and needs the pages. More
3529 			 * importantly, if the block was always converted to
3530 			 * MIGRATE_UNMOVABLE or another type then the number
3531 			 * of pageblocks that cannot be completely freed
3532 			 * may increase.
3533 			 */
3534 			if (order < pageblock_order)
3535 				ret = move_freepages_block(zone, page,
3536 							   MIGRATE_HIGHATOMIC,
3537 							   ac->migratetype);
3538 			else {
3539 				move_to_free_list(page, zone, order,
3540 						  MIGRATE_HIGHATOMIC,
3541 						  ac->migratetype);
3542 				change_pageblock_range(page, order,
3543 						       ac->migratetype);
3544 				ret = 1;
3545 			}
3546 			/*
3547 			 * Reserving the block(s) already succeeded,
3548 			 * so this should not fail on zone boundaries.
3549 			 */
3550 			WARN_ON_ONCE(ret == -1);
			/* Any positive ret converts to true in the bool return. */
3551 			if (ret > 0)
3552 				return ret;
3553 		}
3554 	}
3555 
3556 	return false;
3557 }
3558 
3559 static inline long __zone_watermark_unusable_free(struct zone *z,
3560 				unsigned int order, unsigned int alloc_flags)
3561 {
3562 	long unusable_free = (1 << order) - 1;
3563 
3564 	/*
3565 	 * If the caller does not have rights to reserves below the min
3566 	 * watermark then subtract the free pages reserved for highatomic.
3567 	 */
3568 	if (likely(!(alloc_flags & ALLOC_RESERVES)))
3569 		unusable_free += READ_ONCE(z->nr_free_highatomic);
3570 
3571 #ifdef CONFIG_CMA
3572 	/* If allocation can't use CMA areas don't use free CMA pages */
3573 	if (!(alloc_flags & ALLOC_CMA))
3574 		unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
3575 #endif
3576 
3577 	return unusable_free;
3578 }
3579 
3580 /*
3581  * Return true if free base pages are above 'mark'. For high-order checks it
3582  * will return true if the order-0 watermark is reached and there is at least
3583  * one free page of a suitable size. Checking now avoids taking the zone lock
3584  * to check in the allocation paths if no pages are free.
 *
 * @free_pages is the caller's count of free pages in @z; the pages reported
 * by __zone_watermark_unusable_free() are subtracted from it before it is
 * compared against the (possibly reduced) mark plus
 * z->lowmem_reserve[@highest_zoneidx].
3585  */
3586 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3587 			 int highest_zoneidx, unsigned int alloc_flags,
3588 			 long free_pages)
3589 {
3590 	long min = mark;
3591 	int o;
3592 
3593 	/* free_pages may go negative - that's OK */
3594 	free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
3595 
3596 	if (unlikely(alloc_flags & ALLOC_RESERVES)) {
3597 		/*
3598 		 * __GFP_HIGH allows access to 50% of the min reserve as well
3599 		 * as OOM.
3600 		 */
3601 		if (alloc_flags & ALLOC_MIN_RESERVE) {
3602 			min -= min / 2;
3603 
3604 			/*
3605 			 * Non-blocking allocations (e.g. GFP_ATOMIC) can
3606 			 * access more reserves than just __GFP_HIGH. Other
3607 			 * non-blocking allocations requests such as GFP_NOWAIT
3608 			 * or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get
3609 			 * access to the min reserve.
3610 			 */
			/* A further quarter of the already reduced value. */
3611 			if (alloc_flags & ALLOC_NON_BLOCK)
3612 				min -= min / 4;
3613 		}
3614 
3615 		/*
3616 		 * OOM victims can try even harder than the normal reserve
3617 		 * users on the grounds that it's definitely going to be in
3618 		 * the exit path shortly and free memory. Any allocation it
3619 		 * makes during the free path will be small and short-lived.
3620 		 */
3621 		if (alloc_flags & ALLOC_OOM)
3622 			min -= min / 2;
3623 	}
3624 
3625 	/*
3626 	 * Check watermarks for an order-0 allocation request. If these
3627 	 * are not met, then a high-order request also cannot go ahead
3628 	 * even if a suitable page happened to be free.
3629 	 */
3630 	if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
3631 		return false;
3632 
3633 	/* If this is an order-0 request then the watermark is fine */
3634 	if (!order)
3635 		return true;
3636 
3637 	/* For a high-order request, check at least one suitable page is free */
3638 	for (o = order; o < NR_PAGE_ORDERS; o++) {
3639 		struct free_area *area = &z->free_area[o];
3640 		int mt;
3641 
3642 		if (!area->nr_free)
3643 			continue;
3644 
		/* Any non-empty list of a pcp migratetype is good enough. */
3645 		for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
3646 			if (!free_area_empty(area, mt))
3647 				return true;
3648 		}
3649 
3650 #ifdef CONFIG_CMA
3651 		if ((alloc_flags & ALLOC_CMA) &&
3652 		    !free_area_empty(area, MIGRATE_CMA)) {
3653 			return true;
3654 		}
3655 #endif
		/* HIGHATOMIC pages only count for HIGHATOMIC or OOM requests. */
3656 		if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) &&
3657 		    !free_area_empty(area, MIGRATE_HIGHATOMIC)) {
3658 			return true;
3659 		}
3660 	}
3661 	return false;
3662 }
3663 
3664 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3665 		      int highest_zoneidx, unsigned int alloc_flags)
3666 {
3667 	return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
3668 					zone_page_state(z, NR_FREE_PAGES));
3669 }
3670 
/*
 * Watermark check with a cheap order-0 shortcut in front of
 * __zone_watermark_ok().
 *
 * For order-0 requests the free page count, less the unusable pages, is
 * first compared directly against @mark plus the lowmem reserve.  If that
 * does not succeed (or @order is non-zero) the full check runs.  If the
 * full check fails as well, order-0 ALLOC_MIN_RESERVE requests checking
 * WMARK_MIN on a zone with a watermark boost are retried against the
 * unboosted z->_watermark[WMARK_MIN].
 *
 * NOTE(review): @gfp_mask is not used in this function body.
 */
3671 static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
3672 				unsigned long mark, int highest_zoneidx,
3673 				unsigned int alloc_flags, gfp_t gfp_mask)
3674 {
3675 	long free_pages;
3676 
3677 	free_pages = zone_page_state(z, NR_FREE_PAGES);
3678 
3679 	/*
3680 	 * Fast check for order-0 only. If this fails then the reserves
3681 	 * need to be calculated.
3682 	 */
3683 	if (!order) {
3684 		long usable_free;
3685 		long reserved;
3686 
3687 		usable_free = free_pages;
3688 		reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
3689 
3690 		/* reserved may over estimate high-atomic reserves. */
		/* min() keeps usable_free from dropping below zero. */
3691 		usable_free -= min(usable_free, reserved);
3692 		if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
3693 			return true;
3694 	}
3695 
3696 	if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
3697 					free_pages))
3698 		return true;
3699 
3700 	/*
3701 	 * Ignore watermark boosting for __GFP_HIGH order-0 allocations
3702 	 * when checking the min watermark. The min watermark is the
3703 	 * point where boosting is ignored so that kswapd is woken up
3704 	 * when below the low watermark.
3705 	 */
3706 	if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost
3707 		&& ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
3708 		mark = z->_watermark[WMARK_MIN];
3709 		return __zone_watermark_ok(z, order, mark, highest_zoneidx,
3710 					alloc_flags, free_pages);
3711 	}
3712 
3713 	return false;
3714 }
3715 
3716 #ifdef CONFIG_NUMA
3717 int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
3718 
3719 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3720 {
3721 	return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
3722 				node_reclaim_distance;
3723 }
3724 #else	/* CONFIG_NUMA */
3725 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3726 {
3727 	return true;
3728 }
3729 #endif	/* CONFIG_NUMA */
3730 
3731 /*
3732  * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
3733  * fragmentation is subtle. If the preferred zone was HIGHMEM then
3734  * premature use of a lower zone may cause lowmem pressure problems that
3735  * are worse than fragmentation. If the next zone is ZONE_DMA then it is
3736  * probably too small. It only makes sense to spread allocations to avoid
3737  * fragmentation between the Normal and DMA32 zones.
3738  */
3739 static inline unsigned int
3740 alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
3741 {
3742 	unsigned int alloc_flags;
3743 
3744 	/*
3745 	 * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
3746 	 * to save a branch.
3747 	 */
3748 	alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
3749 
3750 	if (defrag_mode) {
3751 		alloc_flags |= ALLOC_NOFRAGMENT;
3752 		return alloc_flags;
3753 	}
3754 
3755 #ifdef CONFIG_ZONE_DMA32
3756 	if (!zone)
3757 		return alloc_flags;
3758 
3759 	if (zone_idx(zone) != ZONE_NORMAL)
3760 		return alloc_flags;
3761 
3762 	/*
3763 	 * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
3764 	 * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
3765 	 * on UMA that if Normal is populated then so is DMA32.
3766 	 */
3767 	BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
3768 	if (nr_online_nodes > 1 && !populated_zone(--zone))
3769 		return alloc_flags;
3770 
3771 	alloc_flags |= ALLOC_NOFRAGMENT;
3772 #endif /* CONFIG_ZONE_DMA32 */
3773 	return alloc_flags;
3774 }
3775 
3776 /* Must be called after current_gfp_context() which can change gfp_mask */
3777 static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask,
3778 						  unsigned int alloc_flags)
3779 {
3780 #ifdef CONFIG_CMA
3781 	if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
3782 		alloc_flags |= ALLOC_CMA;
3783 #endif
3784 	return alloc_flags;
3785 }
3786 
3787 /*
3788  * get_page_from_freelist goes through the zonelist trying to allocate
3789  * a page.
3790  */
3791 static struct page *
3792 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
3793 						const struct alloc_context *ac)
3794 {
3795 	struct zoneref *z;
3796 	struct zone *zone;
3797 	struct pglist_data *last_pgdat = NULL;
3798 	bool last_pgdat_dirty_ok = false;
3799 	bool no_fallback;
3800 	bool skip_kswapd_nodes = nr_online_nodes > 1;
3801 	bool skipped_kswapd_nodes = false;
3802 
3803 retry:
3804 	/*
3805 	 * Scan zonelist, looking for a zone with enough free.
3806 	 * See also cpuset_current_node_allowed() comment in kernel/cgroup/cpuset.c.
3807 	 */
3808 	no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
3809 	z = ac->preferred_zoneref;
3810 	for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
3811 					ac->nodemask) {
3812 		struct page *page;
3813 		unsigned long mark;
3814 
3815 		if (cpusets_enabled() &&
3816 			(alloc_flags & ALLOC_CPUSET) &&
3817 			!__cpuset_zone_allowed(zone, gfp_mask))
3818 				continue;
3819 		/*
3820 		 * When allocating a page cache page for writing, we
3821 		 * want to get it from a node that is within its dirty
3822 		 * limit, such that no single node holds more than its
3823 		 * proportional share of globally allowed dirty pages.
3824 		 * The dirty limits take into account the node's
3825 		 * lowmem reserves and high watermark so that kswapd
3826 		 * should be able to balance it without having to
3827 		 * write pages from its LRU list.
3828 		 *
3829 		 * XXX: For now, allow allocations to potentially
3830 		 * exceed the per-node dirty limit in the slowpath
3831 		 * (spread_dirty_pages unset) before going into reclaim,
3832 		 * which is important when on a NUMA setup the allowed
3833 		 * nodes are together not big enough to reach the
3834 		 * global limit.  The proper fix for these situations
3835 		 * will require awareness of nodes in the
3836 		 * dirty-throttling and the flusher threads.
3837 		 */
3838 		if (ac->spread_dirty_pages) {
3839 			if (last_pgdat != zone->zone_pgdat) {
3840 				last_pgdat = zone->zone_pgdat;
3841 				last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat);
3842 			}
3843 
3844 			if (!last_pgdat_dirty_ok)
3845 				continue;
3846 		}
3847 
3848 		if (no_fallback && !defrag_mode && nr_online_nodes > 1 &&
3849 		    zone != zonelist_zone(ac->preferred_zoneref)) {
3850 			int local_nid;
3851 
3852 			/*
3853 			 * If moving to a remote node, retry but allow
3854 			 * fragmenting fallbacks. Locality is more important
3855 			 * than fragmentation avoidance.
3856 			 */
3857 			local_nid = zonelist_node_idx(ac->preferred_zoneref);
3858 			if (zone_to_nid(zone) != local_nid) {
3859 				alloc_flags &= ~ALLOC_NOFRAGMENT;
3860 				goto retry;
3861 			}
3862 		}
3863 
3864 		/*
3865 		 * If kswapd is already active on a node, keep looking
3866 		 * for other nodes that might be idle. This can happen
3867 		 * if another process has NUMA bindings and is causing
3868 		 * kswapd wakeups on only some nodes. Avoid accidental
3869 		 * "node_reclaim_mode"-like behavior in this case.
3870 		 */
3871 		if (skip_kswapd_nodes &&
3872 		    !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) {
3873 			skipped_kswapd_nodes = true;
3874 			continue;
3875 		}
3876 
3877 		cond_accept_memory(zone, order, alloc_flags);
3878 
3879 		/*
3880 		 * Detect whether the number of free pages is below high
3881 		 * watermark.  If so, we will decrease pcp->high and free
3882 		 * PCP pages in free path to reduce the possibility of
3883 		 * premature page reclaiming.  Detection is done here to
3884 		 * avoid to do that in hotter free path.
3885 		 */
3886 		if (test_bit(ZONE_BELOW_HIGH, &zone->flags))
3887 			goto check_alloc_wmark;
3888 
3889 		mark = high_wmark_pages(zone);
3890 		if (zone_watermark_fast(zone, order, mark,
3891 					ac->highest_zoneidx, alloc_flags,
3892 					gfp_mask))
3893 			goto try_this_zone;
3894 		else
3895 			set_bit(ZONE_BELOW_HIGH, &zone->flags);
3896 
3897 check_alloc_wmark:
3898 		mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
3899 		if (!zone_watermark_fast(zone, order, mark,
3900 				       ac->highest_zoneidx, alloc_flags,
3901 				       gfp_mask)) {
3902 			int ret;
3903 
3904 			if (cond_accept_memory(zone, order, alloc_flags))
3905 				goto try_this_zone;
3906 
3907 			/*
3908 			 * Watermark failed for this zone, but see if we can
3909 			 * grow this zone if it contains deferred pages.
3910 			 */
3911 			if (deferred_pages_enabled()) {
3912 				if (_deferred_grow_zone(zone, order))
3913 					goto try_this_zone;
3914 			}
3915 			/* Checked here to keep the fast path fast */
3916 			BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
3917 			if (alloc_flags & ALLOC_NO_WATERMARKS)
3918 				goto try_this_zone;
3919 
3920 			if (!node_reclaim_enabled() ||
3921 			    !zone_allows_reclaim(zonelist_zone(ac->preferred_zoneref), zone))
3922 				continue;
3923 
3924 			ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
3925 			switch (ret) {
3926 			case NODE_RECLAIM_NOSCAN:
3927 				/* did not scan */
3928 				continue;
3929 			case NODE_RECLAIM_FULL:
3930 				/* scanned but unreclaimable */
3931 				continue;
3932 			default:
3933 				/* did we reclaim enough */
3934 				if (zone_watermark_ok(zone, order, mark,
3935 					ac->highest_zoneidx, alloc_flags))
3936 					goto try_this_zone;
3937 
3938 				continue;
3939 			}
3940 		}
3941 
3942 try_this_zone:
3943 		page = rmqueue(zonelist_zone(ac->preferred_zoneref), zone, order,
3944 				gfp_mask, alloc_flags, ac->migratetype);
3945 		if (page) {
3946 			prep_new_page(page, order, gfp_mask, alloc_flags);
3947 
3948 			return page;
3949 		} else {
3950 			if (cond_accept_memory(zone, order, alloc_flags))
3951 				goto try_this_zone;
3952 
3953 			/* Try again if zone has deferred pages */
3954 			if (deferred_pages_enabled()) {
3955 				if (_deferred_grow_zone(zone, order))
3956 					goto try_this_zone;
3957 			}
3958 		}
3959 	}
3960 
3961 	/*
3962 	 * If we skipped over nodes with active kswapds and found no
3963 	 * idle nodes, retry and place anywhere the watermarks permit.
3964 	 */
3965 	if (skip_kswapd_nodes && skipped_kswapd_nodes) {
3966 		skip_kswapd_nodes = false;
3967 		goto retry;
3968 	}
3969 
3970 	/*
3971 	 * It's possible on a UMA machine to get through all zones that are
3972 	 * fragmented. If avoiding fragmentation, reset and try again.
3973 	 */
3974 	if (no_fallback && !defrag_mode) {
3975 		alloc_flags &= ~ALLOC_NOFRAGMENT;
3976 		goto retry;
3977 	}
3978 
3979 	return NULL;
3980 }
3981 
3982 static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
3983 {
3984 	unsigned int filter = SHOW_MEM_FILTER_NODES;
3985 
3986 	/*
3987 	 * This documents exceptions given to allocations in certain
3988 	 * contexts that are allowed to allocate outside current's set
3989 	 * of allowed nodes.
3990 	 */
3991 	if (!(gfp_mask & __GFP_NOMEMALLOC))
3992 		if (tsk_is_oom_victim(current) ||
3993 		    (current->flags & (PF_MEMALLOC | PF_EXITING)))
3994 			filter &= ~SHOW_MEM_FILTER_NODES;
3995 	if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
3996 		filter &= ~SHOW_MEM_FILTER_NODES;
3997 
3998 	__show_mem(filter, nodemask, gfp_zone(gfp_mask));
3999 	mem_cgroup_show_protected_memory(NULL);
4000 }
4001 
4002 void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
4003 {
4004 	struct va_format vaf;
4005 	va_list args;
4006 	static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
4007 
4008 	if ((gfp_mask & __GFP_NOWARN) ||
4009 	     !__ratelimit(&nopage_rs) ||
4010 	     ((gfp_mask & __GFP_DMA) && !has_managed_dma()))
4011 		return;
4012 
4013 	va_start(args, fmt);
4014 	vaf.fmt = fmt;
4015 	vaf.va = &args;
4016 	pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
4017 			current->comm, &vaf, gfp_mask, &gfp_mask,
4018 			nodemask_pr_args(nodemask));
4019 	va_end(args);
4020 
4021 	cpuset_print_current_mems_allowed();
4022 	pr_cont("\n");
4023 	dump_stack();
4024 	warn_alloc_show_mem(gfp_mask, nodemask);
4025 }
4026 
4027 static inline struct page *
4028 __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
4029 			      unsigned int alloc_flags,
4030 			      const struct alloc_context *ac)
4031 {
4032 	struct page *page;
4033 
4034 	page = get_page_from_freelist(gfp_mask, order,
4035 			alloc_flags|ALLOC_CPUSET, ac);
4036 	/*
4037 	 * fallback to ignore cpuset restriction if our nodes
4038 	 * are depleted
4039 	 */
4040 	if (!page)
4041 		page = get_page_from_freelist(gfp_mask, order,
4042 				alloc_flags, ac);
4043 	return page;
4044 }
4045 
/*
 * Decide whether to invoke the OOM killer for a failing allocation, and
 * make one more allocation attempt while holding oom_lock.
 *
 * @gfp_mask: gfp flags of the failing request
 * @order: order of the failing request
 * @ac: allocation context; supplies zonelist, nodemask and highest_zoneidx
 * @did_some_progress: output; set to 1 when oom_lock was contended, or when
 *	out_of_memory() or the __GFP_NOFAIL warning condition evaluated true,
 *	and left at 0 otherwise
 *
 * Returns a page if one was obtained here, NULL otherwise. oom_lock is
 * released on every path that acquired it.
 */
static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
	const struct alloc_context *ac, unsigned long *did_some_progress)
{
	struct oom_control oc = {
		.zonelist = ac->zonelist,
		.nodemask = ac->nodemask,
		.memcg = NULL,
		.gfp_mask = gfp_mask,
		.order = order,
	};
	struct page *page;

	*did_some_progress = 0;

	/*
	 * Acquire the oom lock.  If that fails, somebody else is
	 * making progress for us.
	 */
	if (!mutex_trylock(&oom_lock)) {
		*did_some_progress = 1;
		schedule_timeout_uninterruptible(1);
		return NULL;
	}

	/*
	 * Go through the zonelist yet one more time, keep very high watermark
	 * here, this is only to catch a parallel oom killing, we must fail if
	 * we're still under heavy pressure. But make sure that this reclaim
	 * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
	 * allocation which will never fail due to oom_lock already held.
	 */
	page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
				      ~__GFP_DIRECT_RECLAIM, order,
				      ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
	if (page)
		goto out;

	/* Coredumps can quickly deplete all memory reserves */
	if (current->flags & PF_DUMPCORE)
		goto out;
	/* The OOM killer will not help higher order allocs */
	if (order > PAGE_ALLOC_COSTLY_ORDER)
		goto out;
	/*
	 * We have already exhausted all our reclaim opportunities without any
	 * success so it is time to admit defeat. We will skip the OOM killer
	 * because it is very likely that the caller has a more reasonable
	 * fallback than shooting a random task.
	 *
	 * The OOM killer may not free memory on a specific node.
	 */
	if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
		goto out;
	/* The OOM killer does not needlessly kill tasks for lowmem */
	if (ac->highest_zoneidx < ZONE_NORMAL)
		goto out;
	if (pm_suspended_storage())
		goto out;
	/*
	 * XXX: GFP_NOFS allocations should rather fail than rely on
	 * other request to make a forward progress.
	 * We are in an unfortunate situation where out_of_memory cannot
	 * do much for this context but let's try it to at least get
	 * access to memory reserved if the current task is killed (see
	 * out_of_memory). Once filesystems are ready to handle allocation
	 * failures more gracefully we should just bail out here.
	 */

	/* Exhausted what can be done so it's blame time */
	if (out_of_memory(&oc) ||
	    WARN_ON_ONCE_GFP(gfp_mask & __GFP_NOFAIL, gfp_mask)) {
		*did_some_progress = 1;

		/*
		 * Help non-failing allocations by giving them access to memory
		 * reserves
		 */
		if (gfp_mask & __GFP_NOFAIL)
			page = __alloc_pages_cpuset_fallback(gfp_mask, order,
					ALLOC_NO_WATERMARKS, ac);
	}
out:
	/* Common exit for every path that took oom_lock above. */
	mutex_unlock(&oom_lock);
	return page;
}
4132 
/*
 * Maximum number of compaction retries with progress before the OOM
 * killer is considered the only way to move forward.
 */
#define MAX_COMPACT_RETRIES 16
4138 
4139 #ifdef CONFIG_COMPACTION
4140 /* Try memory compaction for high-order allocations before reclaim */
4141 static struct page *
4142 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
4143 		unsigned int alloc_flags, const struct alloc_context *ac,
4144 		enum compact_priority prio, enum compact_result *compact_result)
4145 {
4146 	struct page *page = NULL;
4147 	unsigned long pflags;
4148 	unsigned int noreclaim_flag;
4149 
4150 	if (!order)
4151 		return NULL;
4152 
4153 	psi_memstall_enter(&pflags);
4154 	delayacct_compact_start();
4155 	noreclaim_flag = memalloc_noreclaim_save();
4156 
4157 	*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
4158 								prio, &page);
4159 
4160 	memalloc_noreclaim_restore(noreclaim_flag);
4161 	psi_memstall_leave(&pflags);
4162 	delayacct_compact_end();
4163 
4164 	if (*compact_result == COMPACT_SKIPPED ||
4165 	    *compact_result == COMPACT_DEFERRED)
4166 		return NULL;
4167 	/*
4168 	 * At least in one zone compaction wasn't deferred or skipped, so let's
4169 	 * count a compaction stall
4170 	 */
4171 	count_vm_event(COMPACTSTALL);
4172 
4173 	/* Prep a captured page if available */
4174 	if (page)
4175 		prep_new_page(page, order, gfp_mask, alloc_flags);
4176 
4177 	/* Try get a page from the freelist if available */
4178 	if (!page)
4179 		page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4180 
4181 	if (page) {
4182 		struct zone *zone = page_zone(page);
4183 
4184 		zone->compact_blockskip_flush = false;
4185 		compaction_defer_reset(zone, order, true);
4186 		count_vm_event(COMPACTSUCCESS);
4187 		return page;
4188 	}
4189 
4190 	/*
4191 	 * It's bad if compaction run occurs and fails. The most likely reason
4192 	 * is that pages exist, but not enough to satisfy watermarks.
4193 	 */
4194 	count_vm_event(COMPACTFAIL);
4195 
4196 	cond_resched();
4197 
4198 	return NULL;
4199 }
4200 
4201 static inline bool
4202 should_compact_retry(gfp_t gfp_mask, struct alloc_context *ac, int order,
4203 		     int alloc_flags,
4204 		     enum compact_result compact_result,
4205 		     enum compact_priority *compact_priority,
4206 		     int *compaction_retries)
4207 {
4208 	int max_retries = MAX_COMPACT_RETRIES;
4209 	int min_priority;
4210 	bool ret = false;
4211 	int retries = *compaction_retries;
4212 	enum compact_priority priority = *compact_priority;
4213 
4214 	if (!order)
4215 		return false;
4216 
4217 	if (fatal_signal_pending(current))
4218 		return false;
4219 
4220 	/*
4221 	 * Compaction was skipped due to a lack of free order-0
4222 	 * migration targets. Continue if reclaim can help.
4223 	 */
4224 	if (compact_result == COMPACT_SKIPPED) {
4225 		ret = compaction_zonelist_suitable(ac, order, alloc_flags,
4226 						   gfp_mask);
4227 		goto out;
4228 	}
4229 
4230 	/*
4231 	 * Compaction managed to coalesce some page blocks, but the
4232 	 * allocation failed presumably due to a race. Retry some.
4233 	 */
4234 	if (compact_result == COMPACT_SUCCESS) {
4235 		/*
4236 		 * !costly requests are much more important than
4237 		 * __GFP_RETRY_MAYFAIL costly ones because they are de
4238 		 * facto nofail and invoke OOM killer to move on while
4239 		 * costly can fail and users are ready to cope with
4240 		 * that. 1/4 retries is rather arbitrary but we would
4241 		 * need much more detailed feedback from compaction to
4242 		 * make a better decision.
4243 		 */
4244 		if (order > PAGE_ALLOC_COSTLY_ORDER)
4245 			max_retries /= 4;
4246 
4247 		if (++(*compaction_retries) <= max_retries) {
4248 			ret = true;
4249 			goto out;
4250 		}
4251 	}
4252 
4253 	/*
4254 	 * Compaction failed. Retry with increasing priority.
4255 	 */
4256 	min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
4257 			MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
4258 
4259 	if (*compact_priority > min_priority) {
4260 		(*compact_priority)--;
4261 		*compaction_retries = 0;
4262 		ret = true;
4263 	}
4264 out:
4265 	trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
4266 	return ret;
4267 }
4268 #else
/*
 * !CONFIG_COMPACTION stub: never allocates. It reports COMPACT_SKIPPED
 * through @compact_result and returns NULL.
 */
static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
		unsigned int alloc_flags, const struct alloc_context *ac,
		enum compact_priority prio, enum compact_result *compact_result)
{
	*compact_result = COMPACT_SKIPPED;
	return NULL;
}
4277 
4278 static inline bool
4279 should_compact_retry(gfp_t gfp_mask, struct alloc_context *ac, int order,
4280 		     int alloc_flags,
4281 		     enum compact_result compact_result,
4282 		     enum compact_priority *compact_priority,
4283 		     int *compaction_retries)
4284 {
4285 	struct zone *zone;
4286 	struct zoneref *z;
4287 
4288 	if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
4289 		return false;
4290 
4291 	/*
4292 	 * There are setups with compaction disabled which would prefer to loop
4293 	 * inside the allocator rather than hit the oom killer prematurely.
4294 	 * Let's give them a good hope and keep retrying while the order-0
4295 	 * watermarks are OK.
4296 	 */
4297 	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4298 				ac->highest_zoneidx, ac->nodemask) {
4299 		if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
4300 					ac->highest_zoneidx, alloc_flags))
4301 			return true;
4302 	}
4303 	return false;
4304 }
4305 #endif /* CONFIG_COMPACTION */
4306 
4307 #ifdef CONFIG_LOCKDEP
/*
 * Lockdep map named "fs_reclaim". It is acquired and released by
 * __fs_reclaim_acquire() and __fs_reclaim_release() below.
 */
static struct lockdep_map __fs_reclaim_map =
	STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
4310 
4311 static bool __need_reclaim(gfp_t gfp_mask)
4312 {
4313 	/* no reclaim without waiting on it */
4314 	if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
4315 		return false;
4316 
4317 	/* this guy won't enter reclaim */
4318 	if (current->flags & PF_MEMALLOC)
4319 		return false;
4320 
4321 	if (gfp_mask & __GFP_NOLOCKDEP)
4322 		return false;
4323 
4324 	return true;
4325 }
4326 
/* Acquire the fs_reclaim lockdep map exclusively, attributed to @ip. */
void __fs_reclaim_acquire(unsigned long ip)
{
	lock_acquire_exclusive(&__fs_reclaim_map, 0, 0, NULL, ip);
}
4331 
/* Release the fs_reclaim lockdep map, attributed to @ip. */
void __fs_reclaim_release(unsigned long ip)
{
	lock_release(&__fs_reclaim_map, ip);
}
4336 
4337 void fs_reclaim_acquire(gfp_t gfp_mask)
4338 {
4339 	gfp_mask = current_gfp_context(gfp_mask);
4340 
4341 	if (__need_reclaim(gfp_mask)) {
4342 		if (gfp_mask & __GFP_FS)
4343 			__fs_reclaim_acquire(_RET_IP_);
4344 
4345 #ifdef CONFIG_MMU_NOTIFIER
4346 		lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
4347 		lock_map_release(&__mmu_notifier_invalidate_range_start_map);
4348 #endif
4349 
4350 	}
4351 }
4352 EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
4353 
4354 void fs_reclaim_release(gfp_t gfp_mask)
4355 {
4356 	gfp_mask = current_gfp_context(gfp_mask);
4357 
4358 	if (__need_reclaim(gfp_mask)) {
4359 		if (gfp_mask & __GFP_FS)
4360 			__fs_reclaim_release(_RET_IP_);
4361 	}
4362 }
4363 EXPORT_SYMBOL_GPL(fs_reclaim_release);
4364 #endif
4365 
/*
 * Zonelists may change due to hotplug during allocation. Detect when
 * zonelists have been rebuilt so that the allocation can be retried. The
 * reader side does not lock and retries the allocation if the zonelist
 * changed. The writer side is protected by the embedded spin_lock.
 */
static DEFINE_SEQLOCK(zonelist_update_seq);
4373 
4374 static unsigned int zonelist_iter_begin(void)
4375 {
4376 	if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
4377 		return read_seqbegin(&zonelist_update_seq);
4378 
4379 	return 0;
4380 }
4381 
4382 static unsigned int check_retry_zonelist(unsigned int seq)
4383 {
4384 	if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
4385 		return read_seqretry(&zonelist_update_seq, seq);
4386 
4387 	return seq;
4388 }
4389 
/*
 * Perform direct synchronous page reclaim.
 *
 * Returns the progress value reported by try_to_free_pages() for
 * @ac->zonelist and @ac->nodemask.
 */
static unsigned long
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
					const struct alloc_context *ac)
{
	unsigned int noreclaim_flag;
	unsigned long progress;

	cond_resched();

	/* We now go into synchronous reclaim */
	cpuset_memory_pressure_bump();
	fs_reclaim_acquire(gfp_mask);
	/* NOTE(review): presumably marks the task so reclaim cannot recurse - confirm */
	noreclaim_flag = memalloc_noreclaim_save();

	progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
								ac->nodemask);

	/* Undo the two steps above in reverse order. */
	memalloc_noreclaim_restore(noreclaim_flag);
	fs_reclaim_release(gfp_mask);

	cond_resched();

	return progress;
}
4415 
4416 /* The really slow allocator path where we enter direct reclaim */
4417 static inline struct page *
4418 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
4419 		unsigned int alloc_flags, const struct alloc_context *ac,
4420 		unsigned long *did_some_progress)
4421 {
4422 	struct page *page = NULL;
4423 	unsigned long pflags;
4424 	bool drained = false;
4425 
4426 	psi_memstall_enter(&pflags);
4427 	*did_some_progress = __perform_reclaim(gfp_mask, order, ac);
4428 	if (unlikely(!(*did_some_progress)))
4429 		goto out;
4430 
4431 retry:
4432 	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4433 
4434 	/*
4435 	 * If an allocation failed after direct reclaim, it could be because
4436 	 * pages are pinned on the per-cpu lists or in high alloc reserves.
4437 	 * Shrink them and try again
4438 	 */
4439 	if (!page && !drained) {
4440 		unreserve_highatomic_pageblock(ac, false);
4441 		drain_all_pages(NULL);
4442 		drained = true;
4443 		goto retry;
4444 	}
4445 out:
4446 	psi_memstall_leave(&pflags);
4447 
4448 	return page;
4449 }
4450 
4451 static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
4452 			     const struct alloc_context *ac)
4453 {
4454 	struct zoneref *z;
4455 	struct zone *zone;
4456 	pg_data_t *last_pgdat = NULL;
4457 	enum zone_type highest_zoneidx = ac->highest_zoneidx;
4458 	unsigned int reclaim_order;
4459 
4460 	if (defrag_mode)
4461 		reclaim_order = max(order, pageblock_order);
4462 	else
4463 		reclaim_order = order;
4464 
4465 	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
4466 					ac->nodemask) {
4467 		if (!managed_zone(zone))
4468 			continue;
4469 		if (last_pgdat == zone->zone_pgdat)
4470 			continue;
4471 		wakeup_kswapd(zone, gfp_mask, reclaim_order, highest_zoneidx);
4472 		last_pgdat = zone->zone_pgdat;
4473 	}
4474 }
4475 
/*
 * Compute the ALLOC_* flags for an allocation from its gfp mask and order.
 *
 * Starts from ALLOC_WMARK_MIN | ALLOC_CPUSET, copies __GFP_HIGH and
 * __GFP_KSWAPD_RECLAIM across, then adjusts for requests that cannot
 * direct-reclaim, for realtime/deadline tasks, for CMA (through
 * gfp_to_alloc_flags_cma()) and for defrag_mode.
 */
static inline unsigned int
gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
{
	unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;

	/*
	 * __GFP_HIGH is assumed to be the same as ALLOC_MIN_RESERVE
	 * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
	 * to save two branches.
	 */
	BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_MIN_RESERVE);
	BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);

	/*
	 * The caller may dip into page reserves a bit more if the caller
	 * cannot run direct reclaim, or if the caller has realtime scheduling
	 * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
	 * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH).
	 */
	alloc_flags |= (__force int)
		(gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));

	if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
		/*
		 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
		 * if it can't schedule.
		 */
		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
			alloc_flags |= ALLOC_NON_BLOCK;

			/* High-order, non-blocking __GFP_HIGH requests */
			if (order > 0 && (alloc_flags & ALLOC_MIN_RESERVE))
				alloc_flags |= ALLOC_HIGHATOMIC;
		}

		/*
		 * Ignore cpuset mems for non-blocking __GFP_HIGH (probably
		 * GFP_ATOMIC) rather than fail, see the comment for
		 * cpuset_current_node_allowed().
		 */
		if (alloc_flags & ALLOC_MIN_RESERVE)
			alloc_flags &= ~ALLOC_CPUSET;
	} else if (unlikely(rt_or_dl_task(current)) && in_task())
		alloc_flags |= ALLOC_MIN_RESERVE;

	alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);

	/* defrag_mode always starts with fragmenting fallbacks disallowed */
	if (defrag_mode)
		alloc_flags |= ALLOC_NOFRAGMENT;

	return alloc_flags;
}
4527 
4528 static bool oom_reserves_allowed(struct task_struct *tsk)
4529 {
4530 	if (!tsk_is_oom_victim(tsk))
4531 		return false;
4532 
4533 	/*
4534 	 * !MMU doesn't have oom reaper so give access to memory reserves
4535 	 * only to the thread with TIF_MEMDIE set
4536 	 */
4537 	if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
4538 		return false;
4539 
4540 	return true;
4541 }
4542 
4543 /*
4544  * Distinguish requests which really need access to full memory
4545  * reserves from oom victims which can live with a portion of it
4546  */
4547 static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
4548 {
4549 	if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
4550 		return 0;
4551 	if (gfp_mask & __GFP_MEMALLOC)
4552 		return ALLOC_NO_WATERMARKS;
4553 	if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
4554 		return ALLOC_NO_WATERMARKS;
4555 	if (!in_interrupt()) {
4556 		if (current->flags & PF_MEMALLOC)
4557 			return ALLOC_NO_WATERMARKS;
4558 		else if (oom_reserves_allowed(current))
4559 			return ALLOC_OOM;
4560 	}
4561 
4562 	return 0;
4563 }
4564 
4565 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
4566 {
4567 	return !!__gfp_pfmemalloc_flags(gfp_mask);
4568 }
4569 
/*
 * Checks whether it makes sense to retry the reclaim to make a forward progress
 * for the given allocation request.
 *
 * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
 * without success, or when we couldn't even meet the watermark if we
 * reclaimed all remaining pages on the LRU lists.
 *
 * @gfp_mask: gfp flags of the request, used for the cpuset zone check
 * @order: order of the request
 * @ac: allocation context; supplies zonelist, nodemask and highest_zoneidx
 * @alloc_flags: ALLOC_* flags used for the cpuset and watermark checks
 * @did_some_progress: whether the last reclaim attempt made progress
 * @no_progress_loops: in/out counter; reset to 0 on progress for
 *	non-costly orders, incremented otherwise
 *
 * Returns true if a retry is viable or false to enter the oom path. When
 * the checks here say "no retry", the return value is instead that of
 * unreserve_highatomic_pageblock(ac, true).
 */
static inline bool
should_reclaim_retry(gfp_t gfp_mask, unsigned order,
		     struct alloc_context *ac, int alloc_flags,
		     bool did_some_progress, int *no_progress_loops)
{
	struct zone *zone;
	struct zoneref *z;
	bool ret = false;

	/*
	 * Costly allocations might have made a progress but this doesn't mean
	 * their order will become available due to high fragmentation so
	 * always increment the no progress counter for them
	 */
	if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
		*no_progress_loops = 0;
	else
		(*no_progress_loops)++;

	if (*no_progress_loops > MAX_RECLAIM_RETRIES)
		goto out;


	/*
	 * Keep reclaiming pages while there is a chance this will lead
	 * somewhere.  If none of the target zones can satisfy our allocation
	 * request even if all reclaimable pages are considered then we are
	 * screwed and have to go OOM.
	 */
	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
				ac->highest_zoneidx, ac->nodemask) {
		unsigned long available;
		unsigned long reclaimable;
		unsigned long min_wmark = min_wmark_pages(zone);
		bool wmark;

		if (cpusets_enabled() &&
			(alloc_flags & ALLOC_CPUSET) &&
			!__cpuset_zone_allowed(zone, gfp_mask))
				continue;

		/* available = reclaimable pages + currently free pages */
		available = reclaimable = zone_reclaimable_pages(zone);
		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);

		/*
		 * Would the allocation succeed if we reclaimed all
		 * reclaimable pages?
		 */
		wmark = __zone_watermark_ok(zone, order, min_wmark,
				ac->highest_zoneidx, alloc_flags, available);
		trace_reclaim_retry_zone(z, order, reclaimable,
				available, min_wmark, *no_progress_loops, wmark);
		if (wmark) {
			ret = true;
			break;
		}
	}

	/*
	 * Memory allocation/reclaim might be called from a WQ context and the
	 * current implementation of the WQ concurrency control doesn't
	 * recognize that a particular WQ is congested if the worker thread is
	 * looping without ever sleeping. Therefore we have to do a short sleep
	 * here rather than calling cond_resched().
	 */
	if (current->flags & PF_WQ_WORKER)
		schedule_timeout_uninterruptible(1);
	else
		cond_resched();
out:
	/* Before OOM, exhaust highatomic_reserve */
	if (!ret)
		return unreserve_highatomic_pageblock(ac, true);

	return ret;
}
4656 
4657 static inline bool
4658 check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
4659 {
4660 	/*
4661 	 * It's possible that cpuset's mems_allowed and the nodemask from
4662 	 * mempolicy don't intersect. This should be normally dealt with by
4663 	 * policy_nodemask(), but it's possible to race with cpuset update in
4664 	 * such a way the check therein was true, and then it became false
4665 	 * before we got our cpuset_mems_cookie here.
4666 	 * This assumes that for all allocations, ac->nodemask can come only
4667 	 * from MPOL_BIND mempolicy (whose documented semantics is to be ignored
4668 	 * when it does not intersect with the cpuset restrictions) or the
4669 	 * caller can deal with a violated nodemask.
4670 	 */
4671 	if (cpusets_enabled() && ac->nodemask &&
4672 			!cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
4673 		ac->nodemask = NULL;
4674 		return true;
4675 	}
4676 
4677 	/*
4678 	 * When updating a task's mems_allowed or mempolicy nodemask, it is
4679 	 * possible to race with parallel threads in such a way that our
4680 	 * allocation can fail while the mask is being updated. If we are about
4681 	 * to fail, check if the cpuset changed during allocation and if so,
4682 	 * retry.
4683 	 */
4684 	if (read_mems_allowed_retry(cpuset_mems_cookie))
4685 		return true;
4686 
4687 	return false;
4688 }
4689 
4690 static void check_alloc_stall_warn(gfp_t gfp_mask, nodemask_t *nodemask,
4691 				unsigned int order, unsigned long alloc_start_time)
4692 {
4693 	static DEFINE_SPINLOCK(alloc_stall_lock);
4694 	unsigned long stall_msecs = jiffies_to_msecs(jiffies - alloc_start_time);
4695 
4696 	if (likely(stall_msecs < ALLOC_STALL_WARN_MSECS))
4697 		return;
4698 	if (time_is_after_jiffies(READ_ONCE(alloc_stall_warn_jiffies)))
4699 		return;
4700 	if (gfp_mask & __GFP_NOWARN)
4701 		return;
4702 
4703 	if (!spin_trylock(&alloc_stall_lock))
4704 		return;
4705 
4706 	/* Check again, this time under the lock */
4707 	if (time_is_after_jiffies(alloc_stall_warn_jiffies)) {
4708 		spin_unlock(&alloc_stall_lock);
4709 		return;
4710 	}
4711 
4712 	WRITE_ONCE(alloc_stall_warn_jiffies, jiffies + msecs_to_jiffies(ALLOC_STALL_WARN_MSECS));
4713 	spin_unlock(&alloc_stall_lock);
4714 
4715 	pr_warn("%s: page allocation stall for %lu secs: order:%d, mode:%#x(%pGg) nodemask=%*pbl",
4716 		current->comm, stall_msecs / MSEC_PER_SEC, order, gfp_mask, &gfp_mask,
4717 		nodemask_pr_args(nodemask));
4718 	cpuset_print_current_mems_allowed();
4719 	pr_cont("\n");
4720 	dump_stack();
4721 	warn_alloc_show_mem(gfp_mask, nodemask);
4722 }
4723 
4724 static inline struct page *
4725 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
4726 						struct alloc_context *ac)
4727 {
4728 	bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
4729 	bool can_compact = can_direct_reclaim && gfp_compaction_allowed(gfp_mask);
4730 	bool nofail = gfp_mask & __GFP_NOFAIL;
4731 	const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
4732 	struct page *page = NULL;
4733 	unsigned int alloc_flags;
4734 	unsigned long did_some_progress;
4735 	enum compact_priority compact_priority;
4736 	enum compact_result compact_result;
4737 	int compaction_retries;
4738 	int no_progress_loops;
4739 	unsigned int cpuset_mems_cookie;
4740 	unsigned int zonelist_iter_cookie;
4741 	int reserve_flags;
4742 	bool compact_first = false;
4743 	bool can_retry_reserves = true;
4744 	unsigned long alloc_start_time = jiffies;
4745 
4746 	if (unlikely(nofail)) {
4747 		/*
4748 		 * Also we don't support __GFP_NOFAIL without __GFP_DIRECT_RECLAIM,
4749 		 * otherwise, we may result in lockup.
4750 		 */
4751 		WARN_ON_ONCE(!can_direct_reclaim);
4752 		/*
4753 		 * PF_MEMALLOC request from this context is rather bizarre
4754 		 * because we cannot reclaim anything and only can loop waiting
4755 		 * for somebody to do a work for us.
4756 		 */
4757 		WARN_ON_ONCE(current->flags & PF_MEMALLOC);
4758 	}
4759 
4760 restart:
4761 	compaction_retries = 0;
4762 	no_progress_loops = 0;
4763 	compact_result = COMPACT_SKIPPED;
4764 	compact_priority = DEF_COMPACT_PRIORITY;
4765 	cpuset_mems_cookie = read_mems_allowed_begin();
4766 	zonelist_iter_cookie = zonelist_iter_begin();
4767 
4768 	/*
4769 	 * For costly allocations, try direct compaction first, as it's likely
4770 	 * that we have enough base pages and don't need to reclaim. For non-
4771 	 * movable high-order allocations, do that as well, as compaction will
4772 	 * try prevent permanent fragmentation by migrating from blocks of the
4773 	 * same migratetype.
4774 	 */
4775 	if (can_compact && (costly_order || (order > 0 &&
4776 					ac->migratetype != MIGRATE_MOVABLE))) {
4777 		compact_first = true;
4778 		compact_priority = INIT_COMPACT_PRIORITY;
4779 	}
4780 
4781 	/*
4782 	 * The fast path uses conservative alloc_flags to succeed only until
4783 	 * kswapd needs to be woken up, and to avoid the cost of setting up
4784 	 * alloc_flags precisely. So we do that now.
4785 	 */
4786 	alloc_flags = gfp_to_alloc_flags(gfp_mask, order);
4787 
4788 	/*
4789 	 * We need to recalculate the starting point for the zonelist iterator
4790 	 * because we might have used different nodemask in the fast path, or
4791 	 * there was a cpuset modification and we are retrying - otherwise we
4792 	 * could end up iterating over non-eligible zones endlessly.
4793 	 */
4794 	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4795 					ac->highest_zoneidx, ac->nodemask);
4796 	if (!zonelist_zone(ac->preferred_zoneref))
4797 		goto nopage;
4798 
4799 	/*
4800 	 * Check for insane configurations where the cpuset doesn't contain
4801 	 * any suitable zone to satisfy the request - e.g. non-movable
4802 	 * GFP_HIGHUSER allocations from MOVABLE nodes only.
4803 	 */
4804 	if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) {
4805 		struct zoneref *z = first_zones_zonelist(ac->zonelist,
4806 					ac->highest_zoneidx,
4807 					&cpuset_current_mems_allowed);
4808 		if (!zonelist_zone(z))
4809 			goto nopage;
4810 	}
4811 
4812 retry:
4813 	/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
4814 	if (alloc_flags & ALLOC_KSWAPD)
4815 		wake_all_kswapds(order, gfp_mask, ac);
4816 
4817 	/*
4818 	 * The adjusted alloc_flags might result in immediate success, so try
4819 	 * that first
4820 	 */
4821 	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4822 	if (page)
4823 		goto got_pg;
4824 
4825 	reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
4826 	if (reserve_flags)
4827 		alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) |
4828 					  (alloc_flags & ALLOC_KSWAPD);
4829 
4830 	/*
4831 	 * Reset the nodemask and zonelist iterators if memory policies can be
4832 	 * ignored. These allocations are high priority and system rather than
4833 	 * user oriented.
4834 	 */
4835 	if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
4836 		ac->nodemask = NULL;
4837 		ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4838 					ac->highest_zoneidx, ac->nodemask);
4839 
4840 		/*
4841 		 * The first time we adjust anything due to being allowed to
4842 		 * ignore memory policies or watermarks, retry immediately. This
4843 		 * allows us to keep the first allocation attempt optimistic so
4844 		 * it can succeed in a zone that is still above watermarks.
4845 		 */
4846 		if (can_retry_reserves) {
4847 			can_retry_reserves = false;
4848 			goto retry;
4849 		}
4850 	}
4851 
4852 	/* Caller is not willing to reclaim, we can't balance anything */
4853 	if (!can_direct_reclaim) {
4854 		/*
4855 		 * Reclaim/compaction cannot run, so defrag_mode's strategy
4856 		 * of enforcing ALLOC_NOFRAGMENT cannot be fulfilled. Allow
4857 		 * fallbacks rather than failing the allocation outright.
4858 		 */
4859 		if (defrag_mode && (alloc_flags & ALLOC_NOFRAGMENT) &&
4860 		    (gfp_mask & __GFP_KSWAPD_RECLAIM)) {
4861 			alloc_flags &= ~ALLOC_NOFRAGMENT;
4862 			goto retry;
4863 		}
4864 		goto nopage;
4865 	}
4866 
4867 	/* Avoid recursion of direct reclaim */
4868 	if (current->flags & PF_MEMALLOC)
4869 		goto nopage;
4870 
4871 	/* If allocation has taken excessively long, warn about it */
4872 	check_alloc_stall_warn(gfp_mask, ac->nodemask, order, alloc_start_time);
4873 
4874 	/* Try direct reclaim and then allocating */
4875 	if (!compact_first) {
4876 		page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags,
4877 							ac, &did_some_progress);
4878 		if (page)
4879 			goto got_pg;
4880 	}
4881 
4882 	/* Try direct compaction and then allocating */
4883 	page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
4884 					compact_priority, &compact_result);
4885 	if (page)
4886 		goto got_pg;
4887 
4888 	if (compact_first) {
4889 		/*
4890 		 * THP page faults may attempt local node only first, but are
4891 		 * then allowed to only compact, not reclaim, see
4892 		 * alloc_pages_mpol().
4893 		 *
4894 		 * Compaction has failed above and we don't want such THP
4895 		 * allocations to put reclaim pressure on a single node in a
4896 		 * situation where other nodes might have plenty of available
4897 		 * memory.
4898 		 */
4899 		if (gfp_has_flags(gfp_mask, __GFP_NORETRY | __GFP_THISNODE))
4900 			goto nopage;
4901 
4902 		/*
4903 		 * For the initial compaction attempt we have lowered its
4904 		 * priority. Restore it for further retries, if those are
4905 		 * allowed. With __GFP_NORETRY there will be a single round of
4906 		 * reclaim and compaction with the lowered priority.
4907 		 */
4908 		if (!(gfp_mask & __GFP_NORETRY))
4909 			compact_priority = DEF_COMPACT_PRIORITY;
4910 
4911 		compact_first = false;
4912 		goto retry;
4913 	}
4914 
4915 	/* Do not loop if specifically requested */
4916 	if (gfp_mask & __GFP_NORETRY)
4917 		goto nopage;
4918 
4919 	/*
4920 	 * Do not retry costly high order allocations unless they are
4921 	 * __GFP_RETRY_MAYFAIL and we can compact
4922 	 */
4923 	if (costly_order && (!can_compact ||
4924 			     !(gfp_mask & __GFP_RETRY_MAYFAIL)))
4925 		goto nopage;
4926 
4927 	/*
4928 	 * Deal with possible cpuset update races or zonelist updates to avoid
4929 	 * infinite retries. No "goto retry;" can be placed above this check
4930 	 * unless it can execute just once.
4931 	 */
4932 	if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
4933 	    check_retry_zonelist(zonelist_iter_cookie))
4934 		goto restart;
4935 
4936 	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
4937 				 did_some_progress > 0, &no_progress_loops))
4938 		goto retry;
4939 
4940 	/*
4941 	 * It doesn't make any sense to retry for the compaction if the order-0
4942 	 * reclaim is not able to make any progress because the current
4943 	 * implementation of the compaction depends on the sufficient amount
4944 	 * of free memory (see __compaction_suitable)
4945 	 */
4946 	if (did_some_progress > 0 && can_compact &&
4947 	    should_compact_retry(gfp_mask, ac, order, alloc_flags,
4948 				 compact_result, &compact_priority,
4949 				 &compaction_retries))
4950 		goto retry;
4951 
4952 	/* Reclaim/compaction failed to prevent the fallback */
4953 	if (defrag_mode && (alloc_flags & ALLOC_NOFRAGMENT)) {
4954 		alloc_flags &= ~ALLOC_NOFRAGMENT;
4955 		goto retry;
4956 	}
4957 
4958 	/*
4959 	 * Deal with possible cpuset update races or zonelist updates to avoid
4960 	 * a unnecessary OOM kill.
4961 	 */
4962 	if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
4963 	    check_retry_zonelist(zonelist_iter_cookie))
4964 		goto restart;
4965 
4966 	/* Reclaim has failed us, start killing things */
4967 	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
4968 	if (page)
4969 		goto got_pg;
4970 
4971 	/* Avoid allocations with no watermarks from looping endlessly */
4972 	if (tsk_is_oom_victim(current) &&
4973 	    (alloc_flags & ALLOC_OOM ||
4974 	     (gfp_mask & __GFP_NOMEMALLOC)))
4975 		goto nopage;
4976 
4977 	/* Retry as long as the OOM killer is making progress */
4978 	if (did_some_progress) {
4979 		no_progress_loops = 0;
4980 		goto retry;
4981 	}
4982 
4983 nopage:
4984 	/*
4985 	 * Deal with possible cpuset update races or zonelist updates to avoid
4986 	 * a unnecessary OOM kill.
4987 	 */
4988 	if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
4989 	    check_retry_zonelist(zonelist_iter_cookie))
4990 		goto restart;
4991 
4992 	/*
4993 	 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
4994 	 * we always retry
4995 	 */
4996 	if (unlikely(nofail)) {
4997 		/*
4998 		 * Lacking direct_reclaim we can't do anything to reclaim memory,
4999 		 * we disregard these unreasonable nofail requests and still
5000 		 * return NULL
5001 		 */
5002 		if (!can_direct_reclaim)
5003 			goto fail;
5004 
5005 		/*
5006 		 * Help non-failing allocations by giving some access to memory
5007 		 * reserves normally used for high priority non-blocking
5008 		 * allocations but do not use ALLOC_NO_WATERMARKS because this
5009 		 * could deplete whole memory reserves which would just make
5010 		 * the situation worse.
5011 		 */
5012 		page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac);
5013 		if (page)
5014 			goto got_pg;
5015 
5016 		cond_resched();
5017 		goto retry;
5018 	}
5019 fail:
5020 	warn_alloc(gfp_mask, ac->nodemask,
5021 			"page allocation failure: order:%u", order);
5022 got_pg:
5023 	return page;
5024 }
5025 
5026 static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
5027 		int preferred_nid, nodemask_t *nodemask,
5028 		struct alloc_context *ac, gfp_t *alloc_gfp,
5029 		unsigned int *alloc_flags)
5030 {
5031 	ac->highest_zoneidx = gfp_zone(gfp_mask);
5032 	ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
5033 	ac->nodemask = nodemask;
5034 	ac->migratetype = gfp_migratetype(gfp_mask);
5035 
5036 	if (cpusets_enabled()) {
5037 		*alloc_gfp |= __GFP_HARDWALL;
5038 		/*
5039 		 * When we are in the interrupt context, it is irrelevant
5040 		 * to the current task context. It means that any node ok.
5041 		 */
5042 		if (in_task() && !ac->nodemask)
5043 			ac->nodemask = &cpuset_current_mems_allowed;
5044 		else
5045 			*alloc_flags |= ALLOC_CPUSET;
5046 	}
5047 
5048 	might_alloc(gfp_mask);
5049 
5050 	/*
5051 	 * Don't invoke should_fail logic, since it may call
5052 	 * get_random_u32() and printk() which need to spin_lock.
5053 	 */
5054 	if (!(*alloc_flags & ALLOC_TRYLOCK) &&
5055 	    should_fail_alloc_page(gfp_mask, order))
5056 		return false;
5057 
5058 	*alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
5059 
5060 	/* Dirty zone balancing only done in the fast path */
5061 	ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
5062 
5063 	/*
5064 	 * The preferred zone is used for statistics but crucially it is
5065 	 * also used as the starting point for the zonelist iterator. It
5066 	 * may get reset for allocations that ignore memory policies.
5067 	 */
5068 	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
5069 					ac->highest_zoneidx, ac->nodemask);
5070 
5071 	return true;
5072 }
5073 
5074 /*
5075  * __alloc_pages_bulk - Allocate a number of order-0 pages to an array
5076  * @gfp: GFP flags for the allocation
5077  * @preferred_nid: The preferred NUMA node ID to allocate from
5078  * @nodemask: Set of nodes to allocate from, may be NULL
5079  * @nr_pages: The number of pages desired in the array
5080  * @page_array: Array to store the pages
5081  *
5082  * This is a batched version of the page allocator that attempts to allocate
5083  * @nr_pages quickly.  Pages are added to @page_array.
5084  *
5085  * Note that only the elements in @page_array that were cleared to %NULL on
5086  * entry are populated with newly allocated pages. @nr_pages is the maximum
5087  * number of pages that will be stored in the array.
5088  *
5089  * Returns the number of pages in @page_array, including ones already
5090  * allocated on entry.  This can be less than the number requested in @nr_pages,
5091  * but all empty slots are filled from the beginning.  I.e., if all slots in
5092  * @page_array were set to %NULL on entry, the slots from 0 to the return value
5093  * - 1 will be filled.
5094  */
5095 unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
5096 			nodemask_t *nodemask, int nr_pages,
5097 			struct page **page_array)
5098 {
5099 	struct page *page;
5100 	struct zone *zone;
5101 	struct zoneref *z;
5102 	struct per_cpu_pages *pcp;
5103 	struct list_head *pcp_list;
5104 	struct alloc_context ac;
5105 	unsigned int alloc_flags = ALLOC_WMARK_LOW;
5106 	int nr_populated = 0, nr_account = 0;
5107 
5108 	/*
5109 	 * Skip populated array elements to determine if any pages need
5110 	 * to be allocated before disabling IRQs.
5111 	 */
5112 	while (nr_populated < nr_pages && page_array[nr_populated])
5113 		nr_populated++;
5114 
5115 	/* No pages requested? */
5116 	if (unlikely(nr_pages <= 0))
5117 		goto out;
5118 
5119 	/* Already populated array? */
5120 	if (unlikely(nr_pages - nr_populated == 0))
5121 		goto out;
5122 
5123 	/* Bulk allocator does not support memcg accounting. */
5124 	if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT))
5125 		goto failed;
5126 
5127 	/* Use the single page allocator for one page. */
5128 	if (nr_pages - nr_populated == 1)
5129 		goto failed;
5130 
5131 #ifdef CONFIG_PAGE_OWNER
5132 	/*
5133 	 * PAGE_OWNER may recurse into the allocator to allocate space to
5134 	 * save the stack with pagesets.lock held. Releasing/reacquiring
5135 	 * removes much of the performance benefit of bulk allocation so
5136 	 * force the caller to allocate one page at a time as it'll have
5137 	 * similar performance to added complexity to the bulk allocator.
5138 	 */
5139 	if (static_branch_unlikely(&page_owner_inited))
5140 		goto failed;
5141 #endif
5142 
5143 	/* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
5144 	gfp &= gfp_allowed_mask;
5145 	if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &gfp, &alloc_flags))
5146 		goto out;
5147 
5148 	/* Find an allowed local zone that meets the low watermark. */
5149 	z = ac.preferred_zoneref;
5150 	for_next_zone_zonelist_nodemask(zone, z, ac.highest_zoneidx, ac.nodemask) {
5151 		unsigned long mark;
5152 
5153 		if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
5154 		    !__cpuset_zone_allowed(zone, gfp)) {
5155 			continue;
5156 		}
5157 
5158 		if (nr_online_nodes > 1 && zone != zonelist_zone(ac.preferred_zoneref) &&
5159 		    zone_to_nid(zone) != zonelist_node_idx(ac.preferred_zoneref)) {
5160 			goto failed;
5161 		}
5162 
5163 		cond_accept_memory(zone, 0, alloc_flags);
5164 retry_this_zone:
5165 		mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages - nr_populated;
5166 		if (zone_watermark_fast(zone, 0,  mark,
5167 				zonelist_zone_idx(ac.preferred_zoneref),
5168 				alloc_flags, gfp)) {
5169 			break;
5170 		}
5171 
5172 		if (cond_accept_memory(zone, 0, alloc_flags))
5173 			goto retry_this_zone;
5174 
5175 		/* Try again if zone has deferred pages */
5176 		if (deferred_pages_enabled()) {
5177 			if (_deferred_grow_zone(zone, 0))
5178 				goto retry_this_zone;
5179 		}
5180 	}
5181 
5182 	/*
5183 	 * If there are no allowed local zones that meets the watermarks then
5184 	 * try to allocate a single page and reclaim if necessary.
5185 	 */
5186 	if (unlikely(!zone))
5187 		goto failed;
5188 
5189 	/* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
5190 	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
5191 	if (!pcp)
5192 		goto failed;
5193 
5194 	/* Attempt the batch allocation */
5195 	pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
5196 	while (nr_populated < nr_pages) {
5197 
5198 		/* Skip existing pages */
5199 		if (page_array[nr_populated]) {
5200 			nr_populated++;
5201 			continue;
5202 		}
5203 
5204 		page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
5205 								pcp, pcp_list);
5206 		if (unlikely(!page)) {
5207 			/* Try and allocate at least one page */
5208 			if (!nr_account) {
5209 				pcp_spin_unlock(pcp);
5210 				goto failed;
5211 			}
5212 			break;
5213 		}
5214 		nr_account++;
5215 
5216 		prep_new_page(page, 0, gfp, 0);
5217 		set_page_refcounted(page);
5218 		page_array[nr_populated++] = page;
5219 	}
5220 
5221 	pcp_spin_unlock(pcp);
5222 
5223 	__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
5224 	zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account);
5225 
5226 out:
5227 	return nr_populated;
5228 
5229 failed:
5230 	page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask);
5231 	if (page)
5232 		page_array[nr_populated++] = page;
5233 	goto out;
5234 }
5235 EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof);
5236 
/*
 * free_pages_bulk - Free an array of order-0 pages
 * @page_array: Array of pages to free
 * @nr_pages: The number of pages in the array
 *
 * Frees the order-0 pages in @page_array. Adjacent entries whose PFNs form
 * a contiguous run are released with a single __free_contig_range() call.
 *
 * @page_array is assumed to be sorted in ascending PFN order. Without that,
 * all pages are still freed, but contiguous runs may go undetected and the
 * freeing pattern can degrade to one page at a time.
 *
 * Context: Sleepable process context only; calls cond_resched()
 */
void free_pages_bulk(struct page **page_array, unsigned long nr_pages)
{
	struct page **cursor = page_array;
	unsigned long remaining = nr_pages;

	while (remaining) {
		/* Length of the contiguous run starting at the cursor. */
		unsigned long run = num_pages_contiguous(cursor, remaining);

		__free_contig_range(page_to_pfn(cursor[0]), run);

		cursor += run;
		remaining -= run;
		cond_resched();
	}
}
5264 
5265 /*
5266  * This is the 'heart' of the zoned buddy allocator.
5267  */
5268 struct page *__alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order,
5269 		int preferred_nid, nodemask_t *nodemask)
5270 {
5271 	struct page *page;
5272 	unsigned int alloc_flags = ALLOC_WMARK_LOW;
5273 	gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */
5274 	struct alloc_context ac = { };
5275 
5276 	/*
5277 	 * There are several places where we assume that the order value is sane
5278 	 * so bail out early if the request is out of bound.
5279 	 */
5280 	if (WARN_ON_ONCE_GFP(order > MAX_PAGE_ORDER, gfp))
5281 		return NULL;
5282 
5283 	gfp &= gfp_allowed_mask;
5284 	/*
5285 	 * Apply scoped allocation constraints. This is mainly about GFP_NOFS
5286 	 * resp. GFP_NOIO which has to be inherited for all allocation requests
5287 	 * from a particular context which has been marked by
5288 	 * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures
5289 	 * movable zones are not used during allocation.
5290 	 */
5291 	gfp = current_gfp_context(gfp);
5292 	alloc_gfp = gfp;
5293 	if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
5294 			&alloc_gfp, &alloc_flags))
5295 		return NULL;
5296 
5297 	/*
5298 	 * Forbid the first pass from falling back to types that fragment
5299 	 * memory until all local zones are considered.
5300 	 */
5301 	alloc_flags |= alloc_flags_nofragment(zonelist_zone(ac.preferred_zoneref), gfp);
5302 
5303 	/* First allocation attempt */
5304 	page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
5305 	if (likely(page))
5306 		goto out;
5307 
5308 	alloc_gfp = gfp;
5309 	ac.spread_dirty_pages = false;
5310 
5311 	/*
5312 	 * Restore the original nodemask if it was potentially replaced with
5313 	 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
5314 	 */
5315 	ac.nodemask = nodemask;
5316 
5317 	page = __alloc_pages_slowpath(alloc_gfp, order, &ac);
5318 
5319 out:
5320 	if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page &&
5321 	    unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
5322 		free_frozen_pages(page, order);
5323 		page = NULL;
5324 	}
5325 
5326 	trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
5327 	kmsan_alloc_page(page, order, alloc_gfp);
5328 
5329 	return page;
5330 }
5331 EXPORT_SYMBOL(__alloc_frozen_pages_noprof);
5332 
5333 struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
5334 		int preferred_nid, nodemask_t *nodemask)
5335 {
5336 	struct page *page;
5337 
5338 	page = __alloc_frozen_pages_noprof(gfp, order, preferred_nid, nodemask);
5339 	if (page)
5340 		set_page_refcounted(page);
5341 	return page;
5342 }
5343 EXPORT_SYMBOL(__alloc_pages_noprof);
5344 
5345 struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid,
5346 		nodemask_t *nodemask)
5347 {
5348 	struct page *page = __alloc_pages_noprof(gfp | __GFP_COMP, order,
5349 					preferred_nid, nodemask);
5350 	return page_rmappable_folio(page);
5351 }
5352 EXPORT_SYMBOL(__folio_alloc_noprof);
5353 
5354 /*
5355  * Common helper functions. Never use with __GFP_HIGHMEM because the returned
5356  * address cannot represent highmem pages. Use alloc_pages and then kmap if
5357  * you need to access high mem.
5358  */
5359 unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order)
5360 {
5361 	struct page *page;
5362 
5363 	page = alloc_pages_noprof(gfp_mask & ~__GFP_HIGHMEM, order);
5364 	if (!page)
5365 		return 0;
5366 	return (unsigned long) page_address(page);
5367 }
5368 EXPORT_SYMBOL(get_free_pages_noprof);
5369 
5370 unsigned long get_zeroed_page_noprof(gfp_t gfp_mask)
5371 {
5372 	return get_free_pages_noprof(gfp_mask | __GFP_ZERO, 0);
5373 }
5374 EXPORT_SYMBOL(get_zeroed_page_noprof);
5375 
5376 static void ___free_pages(struct page *page, unsigned int order,
5377 			  fpi_t fpi_flags)
5378 {
5379 	/* get PageHead before we drop reference */
5380 	int head = PageHead(page);
5381 	/* get alloc tag in case the page is released by others */
5382 	struct alloc_tag *tag = pgalloc_tag_get(page);
5383 
5384 	if (put_page_testzero(page))
5385 		__free_frozen_pages(page, order, fpi_flags);
5386 	else if (!head) {
5387 		pgalloc_tag_sub_pages(tag, (1 << order) - 1);
5388 		while (order-- > 0) {
5389 			/*
5390 			 * The "tail" pages of this non-compound high-order
5391 			 * page will have no code tags, so to avoid warnings
5392 			 * mark them as empty.
5393 			 */
5394 			clear_page_tag_ref(page + (1 << order));
5395 			__free_frozen_pages(page + (1 << order), order,
5396 					    fpi_flags);
5397 		}
5398 	}
5399 }
5400 
5401 /**
5402  * __free_pages - Free pages allocated with alloc_pages().
5403  * @page: The page pointer returned from alloc_pages().
5404  * @order: The order of the allocation.
5405  *
5406  * This function can free multi-page allocations that are not compound
5407  * pages.  It does not check that the @order passed in matches that of
5408  * the allocation, so it is easy to leak memory.  Freeing more memory
5409  * than was allocated will probably emit a warning.
5410  *
5411  * If the last reference to this page is speculative, it will be released
5412  * by put_page() which only frees the first page of a non-compound
5413  * allocation.  To prevent the remaining pages from being leaked, we free
5414  * the subsequent pages here.  If you want to use the page's reference
5415  * count to decide when to free the allocation, you should allocate a
5416  * compound page, and use put_page() instead of __free_pages().
5417  *
5418  * Context: May be called in interrupt context or while holding a normal
5419  * spinlock, but not in NMI context or while holding a raw spinlock.
5420  */
5421 void __free_pages(struct page *page, unsigned int order)
5422 {
5423 	___free_pages(page, order, FPI_NONE);
5424 }
5425 EXPORT_SYMBOL(__free_pages);
5426 
5427 /*
5428  * Can be called while holding raw_spin_lock or from IRQ and NMI for any
5429  * page type (not only those that came from alloc_pages_nolock)
5430  */
5431 void free_pages_nolock(struct page *page, unsigned int order)
5432 {
5433 	___free_pages(page, order, FPI_TRYLOCK);
5434 }
5435 
/**
 * free_pages - Free pages allocated with __get_free_pages().
 * @addr: The virtual address tied to a page returned from __get_free_pages().
 * @order: The order of the allocation.
 *
 * Behaves the same as __free_pages(). Use this function to free pages
 * when you only have a valid virtual address. If you have the page, call
 * __free_pages() instead. An @addr of 0 is ignored.
 */
void free_pages(unsigned long addr, unsigned int order)
{
	void *vaddr = (void *)addr;

	if (!addr)
		return;

	VM_BUG_ON(!virt_addr_valid(vaddr));
	__free_pages(virt_to_page(vaddr), order);
}

EXPORT_SYMBOL(free_pages);
5454 
5455 static void *make_alloc_exact(unsigned long addr, unsigned int order,
5456 		size_t size)
5457 {
5458 	if (addr) {
5459 		unsigned long nr = DIV_ROUND_UP(size, PAGE_SIZE);
5460 		struct page *page = virt_to_page((void *)addr);
5461 		struct page *last = page + nr;
5462 
5463 		__split_page(page, order);
5464 		while (page < --last)
5465 			set_page_refcounted(last);
5466 
5467 		last = page + (1UL << order);
5468 		for (page += nr; page < last; page++)
5469 			__free_pages_ok(page, 0, FPI_TO_TAIL);
5470 	}
5471 	return (void *)addr;
5472 }
5473 
5474 /**
5475  * alloc_pages_exact - allocate an exact number physically-contiguous pages.
5476  * @size: the number of bytes to allocate
5477  * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
5478  *
5479  * This function is similar to alloc_pages(), except that it allocates the
5480  * minimum number of pages to satisfy the request.  alloc_pages() can only
5481  * allocate memory in power-of-two pages.
5482  *
5483  * This function is also limited by MAX_PAGE_ORDER.
5484  *
5485  * Memory allocated by this function must be released by free_pages_exact().
5486  *
5487  * Return: pointer to the allocated area or %NULL in case of error.
5488  */
5489 void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask)
5490 {
5491 	unsigned int order = get_order(size);
5492 	unsigned long addr;
5493 
5494 	if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
5495 		gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
5496 
5497 	addr = get_free_pages_noprof(gfp_mask, order);
5498 	return make_alloc_exact(addr, order, size);
5499 }
5500 EXPORT_SYMBOL(alloc_pages_exact_noprof);
5501 
5502 /**
5503  * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
5504  *			   pages on a node.
5505  * @nid: the preferred node ID where memory should be allocated
5506  * @size: the number of bytes to allocate
5507  * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
5508  *
5509  * Like alloc_pages_exact(), but try to allocate on node nid first before falling
5510  * back.
5511  *
5512  * Return: pointer to the allocated area or %NULL in case of error.
5513  */
5514 void * __meminit alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask)
5515 {
5516 	unsigned int order = get_order(size);
5517 	struct page *p;
5518 
5519 	if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
5520 		gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
5521 
5522 	p = alloc_pages_node_noprof(nid, gfp_mask, order);
5523 	if (!p)
5524 		return NULL;
5525 	return make_alloc_exact((unsigned long)page_address(p), order, size);
5526 }
5527 
5528 /**
5529  * free_pages_exact - release memory allocated via alloc_pages_exact()
5530  * @virt: the value returned by alloc_pages_exact.
5531  * @size: size of allocation, same value as passed to alloc_pages_exact().
5532  *
5533  * Release the memory allocated by a previous call to alloc_pages_exact.
5534  */
5535 void free_pages_exact(void *virt, size_t size)
5536 {
5537 	unsigned long addr = (unsigned long)virt;
5538 	unsigned long end = addr + PAGE_ALIGN(size);
5539 
5540 	while (addr < end) {
5541 		free_page(addr);
5542 		addr += PAGE_SIZE;
5543 	}
5544 }
5545 EXPORT_SYMBOL(free_pages_exact);
5546 
5547 /**
5548  * nr_free_zone_pages - count number of pages beyond high watermark
5549  * @offset: The zone index of the highest zone
5550  *
5551  * nr_free_zone_pages() counts the number of pages which are beyond the
5552  * high watermark within all zones at or below a given zone index.  For each
5553  * zone, the number of pages is calculated as:
5554  *
5555  *     nr_free_zone_pages = managed_pages - high_pages
5556  *
5557  * Return: number of pages beyond high watermark.
5558  */
5559 static unsigned long nr_free_zone_pages(int offset)
5560 {
5561 	struct zoneref *z;
5562 	struct zone *zone;
5563 
5564 	/* Just pick one node, since fallback list is circular */
5565 	unsigned long sum = 0;
5566 
5567 	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
5568 
5569 	for_each_zone_zonelist(zone, z, zonelist, offset) {
5570 		unsigned long size = zone_managed_pages(zone);
5571 		unsigned long high = high_wmark_pages(zone);
5572 		if (size > high)
5573 			sum += size - high;
5574 	}
5575 
5576 	return sum;
5577 }
5578 
5579 /**
5580  * nr_free_buffer_pages - count number of pages beyond high watermark
5581  *
5582  * nr_free_buffer_pages() counts the number of pages which are beyond the high
5583  * watermark within ZONE_DMA and ZONE_NORMAL.
5584  *
5585  * Return: number of pages beyond high watermark within ZONE_DMA and
5586  * ZONE_NORMAL.
5587  */
5588 unsigned long nr_free_buffer_pages(void)
5589 {
5590 	return nr_free_zone_pages(gfp_zone(GFP_USER));
5591 }
5592 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
5593 
5594 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
5595 {
5596 	zoneref->zone = zone;
5597 	zoneref->zone_idx = zone_idx(zone);
5598 }
5599 
5600 /*
5601  * Builds allocation fallback zone lists.
5602  *
5603  * Add all populated zones of a node to the zonelist.
5604  */
5605 static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
5606 {
5607 	struct zone *zone;
5608 	enum zone_type zone_type = MAX_NR_ZONES;
5609 	int nr_zones = 0;
5610 
5611 	do {
5612 		zone_type--;
5613 		zone = pgdat->node_zones + zone_type;
5614 		if (populated_zone(zone)) {
5615 			zoneref_set_zone(zone, &zonerefs[nr_zones++]);
5616 			check_highest_zone(zone_type);
5617 		}
5618 	} while (zone_type);
5619 
5620 	return nr_zones;
5621 }
5622 
5623 #ifdef CONFIG_NUMA
5624 
5625 static int __parse_numa_zonelist_order(char *s)
5626 {
5627 	/*
5628 	 * We used to support different zonelists modes but they turned
5629 	 * out to be just not useful. Let's keep the warning in place
5630 	 * if somebody still use the cmd line parameter so that we do
5631 	 * not fail it silently
5632 	 */
5633 	if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
5634 		pr_warn("Ignoring unsupported numa_zonelist_order value:  %s\n", s);
5635 		return -EINVAL;
5636 	}
5637 	return 0;
5638 }
5639 
5640 static char numa_zonelist_order[] = "Node";
5641 #define NUMA_ZONELIST_ORDER_LEN	16
5642 /*
5643  * sysctl handler for numa_zonelist_order
5644  */
5645 static int numa_zonelist_order_handler(const struct ctl_table *table, int write,
5646 		void *buffer, size_t *length, loff_t *ppos)
5647 {
5648 	if (write)
5649 		return __parse_numa_zonelist_order(buffer);
5650 	return proc_dostring(table, write, buffer, length, ppos);
5651 }
5652 
5653 static int node_load[MAX_NUMNODES];
5654 
5655 /**
5656  * find_next_best_node - find the next node that should appear in a given node's fallback list
5657  * @node: node whose fallback list we're appending
5658  * @used_node_mask: nodemask_t of already used nodes
5659  *
5660  * We use a number of factors to determine which is the next node that should
5661  * appear on a given node's fallback list.  The node should not have appeared
5662  * already in @node's fallback list, and it should be the next closest node
5663  * according to the distance array (which contains arbitrary distance values
5664  * from each node to each node in the system), and should also prefer nodes
5665  * with no CPUs, since presumably they'll have very little allocation pressure
5666  * on them otherwise.
5667  *
5668  * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
5669  */
5670 int find_next_best_node(int node, nodemask_t *used_node_mask)
5671 {
5672 	int n, val;
5673 	int min_val = INT_MAX;
5674 	int best_node = NUMA_NO_NODE;
5675 
5676 	/*
5677 	 * Use the local node if we haven't already, but for memoryless local
5678 	 * node, we should skip it and fall back to other nodes.
5679 	 */
5680 	if (!node_isset(node, *used_node_mask) && node_state(node, N_MEMORY)) {
5681 		node_set(node, *used_node_mask);
5682 		return node;
5683 	}
5684 
5685 	for_each_node_state(n, N_MEMORY) {
5686 
5687 		/* Don't want a node to appear more than once */
5688 		if (node_isset(n, *used_node_mask))
5689 			continue;
5690 
5691 		/* Use the distance array to find the distance */
5692 		val = node_distance(node, n);
5693 
5694 		/* Penalize nodes under us ("prefer the next node") */
5695 		val += (n < node);
5696 
5697 		/* Give preference to headless and unused nodes */
5698 		if (!cpumask_empty(cpumask_of_node(n)))
5699 			val += PENALTY_FOR_NODE_WITH_CPUS;
5700 
5701 		/* Slight preference for less loaded node */
5702 		val *= MAX_NUMNODES;
5703 		val += node_load[n];
5704 
5705 		if (val < min_val) {
5706 			min_val = val;
5707 			best_node = n;
5708 		}
5709 	}
5710 
5711 	if (best_node >= 0)
5712 		node_set(best_node, *used_node_mask);
5713 
5714 	return best_node;
5715 }
5716 
5717 
5718 /*
5719  * Build zonelists ordered by node and zones within node.
5720  * This results in maximum locality--normal zone overflows into local
5721  * DMA zone, if any--but risks exhausting DMA zone.
5722  */
5723 static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
5724 		unsigned nr_nodes)
5725 {
5726 	struct zoneref *zonerefs;
5727 	int i;
5728 
5729 	zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5730 
5731 	for (i = 0; i < nr_nodes; i++) {
5732 		int nr_zones;
5733 
5734 		pg_data_t *node = NODE_DATA(node_order[i]);
5735 
5736 		nr_zones = build_zonerefs_node(node, zonerefs);
5737 		zonerefs += nr_zones;
5738 	}
5739 	zonerefs->zone = NULL;
5740 	zonerefs->zone_idx = 0;
5741 }
5742 
5743 /*
5744  * Build __GFP_THISNODE zonelists
5745  */
5746 static void build_thisnode_zonelists(pg_data_t *pgdat)
5747 {
5748 	struct zoneref *zonerefs;
5749 	int nr_zones;
5750 
5751 	zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
5752 	nr_zones = build_zonerefs_node(pgdat, zonerefs);
5753 	zonerefs += nr_zones;
5754 	zonerefs->zone = NULL;
5755 	zonerefs->zone_idx = 0;
5756 }
5757 
5758 static void build_zonelists(pg_data_t *pgdat)
5759 {
5760 	static int node_order[MAX_NUMNODES];
5761 	int node, nr_nodes = 0;
5762 	nodemask_t used_mask = NODE_MASK_NONE;
5763 	int local_node, prev_node;
5764 
5765 	/* NUMA-aware ordering of nodes */
5766 	local_node = pgdat->node_id;
5767 	prev_node = local_node;
5768 
5769 	memset(node_order, 0, sizeof(node_order));
5770 	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
5771 		/*
5772 		 * We don't want to pressure a particular node.
5773 		 * So adding penalty to the first node in same
5774 		 * distance group to make it round-robin.
5775 		 */
5776 		if (node_distance(local_node, node) !=
5777 		    node_distance(local_node, prev_node))
5778 			node_load[node] += 1;
5779 
5780 		node_order[nr_nodes++] = node;
5781 		prev_node = node;
5782 	}
5783 
5784 	build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
5785 	build_thisnode_zonelists(pgdat);
5786 	pr_info("Fallback order for Node %d: ", local_node);
5787 	for (node = 0; node < nr_nodes; node++)
5788 		pr_cont("%d ", node_order[node]);
5789 	pr_cont("\n");
5790 }
5791 
5792 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
5793 /*
5794  * Return node id of node used for "local" allocations.
5795  * I.e., first node id of first zone in arg node's generic zonelist.
5796  * Used for initializing percpu 'numa_mem', which is used primarily
5797  * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
5798  */
5799 int local_memory_node(int node)
5800 {
5801 	struct zoneref *z;
5802 
5803 	z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
5804 				   gfp_zone(GFP_KERNEL),
5805 				   NULL);
5806 	return zonelist_node_idx(z);
5807 }
5808 #endif
5809 
5810 static void setup_min_unmapped_ratio(void);
5811 static void setup_min_slab_ratio(void);
5812 #else	/* CONFIG_NUMA */
5813 
5814 static void build_zonelists(pg_data_t *pgdat)
5815 {
5816 	struct zoneref *zonerefs;
5817 	int nr_zones;
5818 
5819 	zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5820 	nr_zones = build_zonerefs_node(pgdat, zonerefs);
5821 	zonerefs += nr_zones;
5822 
5823 	zonerefs->zone = NULL;
5824 	zonerefs->zone_idx = 0;
5825 }
5826 
5827 #endif	/* CONFIG_NUMA */
5828 
5829 /*
5830  * Boot pageset table. One per cpu which is going to be used for all
5831  * zones and all nodes. The parameters will be set in such a way
5832  * that an item put on a list will immediately be handed over to
5833  * the buddy list. This is safe since pageset manipulation is done
5834  * with interrupts disabled.
5835  *
5836  * The boot_pagesets must be kept even after bootup is complete for
5837  * unused processors and/or zones. They do play a role for bootstrapping
5838  * hotplugged processors.
5839  *
5840  * zoneinfo_show() and maybe other functions do
5841  * not check if the processor is online before following the pageset pointer.
5842  * Other parts of the kernel may not check if the zone is available.
5843  */
/*
 * Forward declaration: build_all_zonelists_init() below calls this before the
 * definition appears later in the file.
 */
static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats);
/* These effectively disable the pcplists in the boot pageset completely */
#define BOOT_PAGESET_HIGH	0
#define BOOT_PAGESET_BATCH	1
/* Static per-cpu boot instances; zone_pcp_init() points each zone at them. */
static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
5850 
5851 static void __build_all_zonelists(void *data)
5852 {
5853 	int nid;
5854 	int __maybe_unused cpu;
5855 	pg_data_t *self = data;
5856 	unsigned long flags;
5857 
5858 	/*
5859 	 * The zonelist_update_seq must be acquired with irqsave because the
5860 	 * reader can be invoked from IRQ with GFP_ATOMIC.
5861 	 */
5862 	write_seqlock_irqsave(&zonelist_update_seq, flags);
5863 	/*
5864 	 * Also disable synchronous printk() to prevent any printk() from
5865 	 * trying to hold port->lock, for
5866 	 * tty_insert_flip_string_and_push_buffer() on other CPU might be
5867 	 * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
5868 	 */
5869 	printk_deferred_enter();
5870 
5871 #ifdef CONFIG_NUMA
5872 	memset(node_load, 0, sizeof(node_load));
5873 #endif
5874 
5875 	/*
5876 	 * This node is hotadded and no memory is yet present.   So just
5877 	 * building zonelists is fine - no need to touch other nodes.
5878 	 */
5879 	if (self && !node_online(self->node_id)) {
5880 		build_zonelists(self);
5881 	} else {
5882 		/*
5883 		 * All possible nodes have pgdat preallocated
5884 		 * in free_area_init
5885 		 */
5886 		for_each_node(nid) {
5887 			pg_data_t *pgdat = NODE_DATA(nid);
5888 
5889 			build_zonelists(pgdat);
5890 		}
5891 
5892 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
5893 		/*
5894 		 * We now know the "local memory node" for each node--
5895 		 * i.e., the node of the first zone in the generic zonelist.
5896 		 * Set up numa_mem percpu variable for on-line cpus.  During
5897 		 * boot, only the boot cpu should be on-line;  we'll init the
5898 		 * secondary cpus' numa_mem as they come on-line.  During
5899 		 * node/memory hotplug, we'll fixup all on-line cpus.
5900 		 */
5901 		for_each_online_cpu(cpu)
5902 			set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
5903 #endif
5904 	}
5905 
5906 	printk_deferred_exit();
5907 	write_sequnlock_irqrestore(&zonelist_update_seq, flags);
5908 }
5909 
5910 static noinline void __init
5911 build_all_zonelists_init(void)
5912 {
5913 	int cpu;
5914 
5915 	__build_all_zonelists(NULL);
5916 
5917 	/*
5918 	 * Initialize the boot_pagesets that are going to be used
5919 	 * for bootstrapping processors. The real pagesets for
5920 	 * each zone will be allocated later when the per cpu
5921 	 * allocator is available.
5922 	 *
5923 	 * boot_pagesets are used also for bootstrapping offline
5924 	 * cpus if the system is already booted because the pagesets
5925 	 * are needed to initialize allocators on a specific cpu too.
5926 	 * F.e. the percpu allocator needs the page allocator which
5927 	 * needs the percpu allocator in order to allocate its pagesets
5928 	 * (a chicken-egg dilemma).
5929 	 */
5930 	for_each_possible_cpu(cpu)
5931 		per_cpu_pages_init(&per_cpu(boot_pageset, cpu), &per_cpu(boot_zonestats, cpu));
5932 
5933 	mminit_verify_zonelist();
5934 	cpuset_init_current_mems_allowed();
5935 }
5936 
5937 /*
5938  * unless system_state == SYSTEM_BOOTING.
5939  *
5940  * __ref due to call of __init annotated helper build_all_zonelists_init
5941  * [protected by SYSTEM_BOOTING].
5942  */
5943 void __ref build_all_zonelists(pg_data_t *pgdat)
5944 {
5945 	unsigned long vm_total_pages;
5946 
5947 	if (system_state == SYSTEM_BOOTING) {
5948 		build_all_zonelists_init();
5949 	} else {
5950 		__build_all_zonelists(pgdat);
5951 		/* cpuset refresh routine should be here */
5952 	}
5953 	/* Get the number of free pages beyond high watermark in all zones. */
5954 	vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
5955 	/*
5956 	 * Disable grouping by mobility if the number of pages in the
5957 	 * system is too low to allow the mechanism to work. It would be
5958 	 * more accurate, but expensive to check per-zone. This check is
5959 	 * made on memory-hotadd so a system can start with mobility
5960 	 * disabled and enable it later
5961 	 */
5962 	if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
5963 		page_group_by_mobility_disabled = 1;
5964 	else
5965 		page_group_by_mobility_disabled = 0;
5966 
5967 	pr_info("Built %u zonelists, mobility grouping %s.  Total pages: %ld\n",
5968 		nr_online_nodes,
5969 		str_off_on(page_group_by_mobility_disabled),
5970 		vm_total_pages);
5971 #ifdef CONFIG_NUMA
5972 	pr_info("Policy zone: %s\n", zone_names[policy_zone]);
5973 #endif
5974 }
5975 
/*
 * Compute the pcp batch size for @zone. Always 1 without CONFIG_MMU.
 */
static int zone_batchsize(struct zone *zone)
{
#ifndef CONFIG_MMU
	/* The deferral and batching of frees should be suppressed under NOMMU
	 * conditions.
	 *
	 * The problem is that NOMMU needs to be able to allocate large chunks
	 * of contiguous memory as there's no hardware page translation to
	 * assemble apparent contiguous memory from discontiguous pages.
	 *
	 * Queueing large contiguous runs of pages for batching, however,
	 * causes the pages to actually be freed in smaller chunks.  As there
	 * can be a significant delay between the individual batches being
	 * recycled, this leads to the once large chunks of space being
	 * fragmented and becoming unavailable for high-order allocations.
	 */
	return 1;
#else
	int batch;

	/*
	 * The number of pages to batch allocate is either ~0.025%
	 * of the zone or 256KB, whichever is smaller. The batch
	 * size is striking a balance between allocation latency
	 * and zone lock contention.
	 */
	batch = min(zone_managed_pages(zone) >> 12, SZ_256K / PAGE_SIZE);
	if (batch <= 1)
		return 1;

	/*
	 * Clamp the batch to a 2^n - 1 value. Having a power
	 * of 2 value was found to be more likely to have
	 * suboptimal cache aliasing properties in some cases.
	 *
	 * For example if 2 tasks are alternately allocating
	 * batches of pages, one task can end up with a lot
	 * of pages of one half of the possible page colors
	 * and the other with pages of the other colors.
	 */
	return rounddown_pow_of_two(batch + batch / 2) - 1;
#endif
}
6022 
static int percpu_pagelist_high_fraction;

/*
 * Compute a pcp high value for @zone.
 *
 * @batch:         batch size; the result is never below batch * 4
 * @cpu_online:    added to the count of cpus the value is split across
 * @high_fraction: when non-zero, base the value on managed pages divided by
 *                 this fraction instead of on the low watermark
 *
 * Always 0 without CONFIG_MMU.
 */
static int zone_highsize(struct zone *zone, int batch, int cpu_online,
			 int high_fraction)
{
#ifdef CONFIG_MMU
	unsigned long total_pages;
	int nr_split_cpus;
	int high;

	if (high_fraction) {
		/*
		 * If percpu_pagelist_high_fraction is configured, the high
		 * value is based on a fraction of the managed pages in the
		 * zone.
		 */
		total_pages = zone_managed_pages(zone) / high_fraction;
	} else {
		/*
		 * By default, the high value of the pcp is based on the zone
		 * low watermark so that if they are full then background
		 * reclaim will not be started prematurely.
		 */
		total_pages = low_wmark_pages(zone);
	}

	/*
	 * Split the high value across all online CPUs local to the zone. Note
	 * that early in boot that CPUs may not be online yet and that during
	 * CPU hotplug that the cpumask is not yet updated when a CPU is being
	 * onlined. For memory nodes that have no CPUs, split the high value
	 * across all online CPUs to mitigate the risk that reclaim is triggered
	 * prematurely due to pages stored on pcp lists.
	 */
	nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online;
	if (nr_split_cpus == 0)
		nr_split_cpus = num_online_cpus();
	high = total_pages / nr_split_cpus;

	/*
	 * Ensure high is at least batch*4. The multiple is based on the
	 * historical relationship between high and batch.
	 */
	return max(high, batch << 2);
#else
	return 0;
#endif
}
6072 
/*
 * pageset_update - store new batch and high limits into one per-cpu pageset.
 * @pcp:      pageset to update
 * @high_min: new value for pcp->high_min
 * @high_max: new value for pcp->high_max
 * @batch:    new value for pcp->batch
 *
 * pcp->high and pcp->batch values are related and generally batch is lower
 * than high. They are also related to pcp->count such that count is lower
 * than high, and as soon as it reaches high, the pcplist is flushed.
 *
 * However, guaranteeing these relations at all times would require e.g. write
 * barriers here but also careful usage of read barriers at the read side, and
 * thus be prone to error and bad for performance. Thus the update only prevents
 * store tearing. Any new users of pcp->batch, pcp->high_min and pcp->high_max
 * should ensure they can cope with those fields changing asynchronously, and
 * fully trust only the pcp->count field on the local CPU with interrupts
 * disabled.
 *
 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
 * outside of boot time (or some other assurance that no concurrent updaters
 * exist).
 */
static void pageset_update(struct per_cpu_pages *pcp, unsigned long high_min,
			   unsigned long high_max, unsigned long batch)
{
	/* Three independent WRITE_ONCE() stores; no barriers between them */
	WRITE_ONCE(pcp->batch, batch);
	WRITE_ONCE(pcp->high_min, high_min);
	WRITE_ONCE(pcp->high_max, high_max);
}
6097 
6098 static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
6099 {
6100 	int pindex;
6101 
6102 	memset(pcp, 0, sizeof(*pcp));
6103 	memset(pzstats, 0, sizeof(*pzstats));
6104 
6105 	spin_lock_init(&pcp->lock);
6106 	for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
6107 		INIT_LIST_HEAD(&pcp->lists[pindex]);
6108 
6109 	/*
6110 	 * Set batch and high values safe for a boot pageset. A true percpu
6111 	 * pageset's initialization will update them subsequently. Here we don't
6112 	 * need to be as careful as pageset_update() as nobody can access the
6113 	 * pageset yet.
6114 	 */
6115 	pcp->high_min = BOOT_PAGESET_HIGH;
6116 	pcp->high_max = BOOT_PAGESET_HIGH;
6117 	pcp->batch = BOOT_PAGESET_BATCH;
6118 }
6119 
6120 static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min,
6121 					      unsigned long high_max, unsigned long batch)
6122 {
6123 	struct per_cpu_pages *pcp;
6124 	int cpu;
6125 
6126 	for_each_possible_cpu(cpu) {
6127 		pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
6128 		pageset_update(pcp, high_min, high_max, batch);
6129 	}
6130 }
6131 
6132 /*
6133  * Calculate and set new high and batch values for all per-cpu pagesets of a
6134  * zone based on the zone's size.
6135  */
6136 static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
6137 {
6138 	int new_high_min, new_high_max, new_batch;
6139 
6140 	new_batch = zone_batchsize(zone);
6141 	if (percpu_pagelist_high_fraction) {
6142 		new_high_min = zone_highsize(zone, new_batch, cpu_online,
6143 					     percpu_pagelist_high_fraction);
6144 		/*
6145 		 * PCP high is tuned manually, disable auto-tuning via
6146 		 * setting high_min and high_max to the manual value.
6147 		 */
6148 		new_high_max = new_high_min;
6149 	} else {
6150 		new_high_min = zone_highsize(zone, new_batch, cpu_online, 0);
6151 		new_high_max = zone_highsize(zone, new_batch, cpu_online,
6152 					     MIN_PERCPU_PAGELIST_HIGH_FRACTION);
6153 	}
6154 
6155 	if (zone->pageset_high_min == new_high_min &&
6156 	    zone->pageset_high_max == new_high_max &&
6157 	    zone->pageset_batch == new_batch)
6158 		return;
6159 
6160 	zone->pageset_high_min = new_high_min;
6161 	zone->pageset_high_max = new_high_max;
6162 	zone->pageset_batch = new_batch;
6163 
6164 	__zone_set_pageset_high_and_batch(zone, new_high_min, new_high_max,
6165 					  new_batch);
6166 }
6167 
6168 void __meminit setup_zone_pageset(struct zone *zone)
6169 {
6170 	int cpu;
6171 
6172 	/* Size may be 0 on !SMP && !NUMA */
6173 	if (sizeof(struct per_cpu_zonestat) > 0)
6174 		zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat);
6175 
6176 	zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages);
6177 	for_each_possible_cpu(cpu) {
6178 		struct per_cpu_pages *pcp;
6179 		struct per_cpu_zonestat *pzstats;
6180 
6181 		pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
6182 		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
6183 		per_cpu_pages_init(pcp, pzstats);
6184 	}
6185 
6186 	zone_set_pageset_high_and_batch(zone, 0);
6187 }
6188 
/*
 * The zone indicated has a new number of managed_pages; batch sizes and percpu
 * page high values need to be recalculated.
 *
 * Runs zone_set_pageset_high_and_batch() with pcp_batch_high_lock held;
 * @cpu_online is passed through to it unchanged.
 */
static void zone_pcp_update(struct zone *zone, int cpu_online)
{
	mutex_lock(&pcp_batch_high_lock);
	zone_set_pageset_high_and_batch(zone, cpu_online);
	mutex_unlock(&pcp_batch_high_lock);
}
6199 
6200 static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu)
6201 {
6202 	struct per_cpu_pages *pcp;
6203 	struct cpu_cacheinfo *cci;
6204 
6205 	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
6206 	cci = get_cpu_cacheinfo(cpu);
6207 	/*
6208 	 * If data cache slice of CPU is large enough, "pcp->batch"
6209 	 * pages can be preserved in PCP before draining PCP for
6210 	 * consecutive high-order pages freeing without allocation.
6211 	 * This can reduce zone lock contention without hurting
6212 	 * cache-hot pages sharing.
6213 	 */
6214 	pcp_spin_lock_nopin(pcp);
6215 	if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
6216 		pcp->flags |= PCPF_FREE_HIGH_BATCH;
6217 	else
6218 		pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
6219 	pcp_spin_unlock_nopin(pcp);
6220 }
6221 
/*
 * Refresh the cache-size dependent pcp flag of @cpu in every populated zone
 * by calling zone_pcp_update_cacheinfo() for each of them.
 */
void setup_pcp_cacheinfo(unsigned int cpu)
{
	struct zone *zone;

	for_each_populated_zone(zone)
		zone_pcp_update_cacheinfo(zone, cpu);
}
6229 
6230 /*
6231  * Allocate per cpu pagesets and initialize them.
6232  * Before this call only boot pagesets were available.
6233  */
6234 void __init setup_per_cpu_pageset(void)
6235 {
6236 	struct pglist_data *pgdat;
6237 	struct zone *zone;
6238 	int __maybe_unused cpu;
6239 
6240 	for_each_populated_zone(zone)
6241 		setup_zone_pageset(zone);
6242 
6243 #ifdef CONFIG_NUMA
6244 	/*
6245 	 * Unpopulated zones continue using the boot pagesets.
6246 	 * The numa stats for these pagesets need to be reset.
6247 	 * Otherwise, they will end up skewing the stats of
6248 	 * the nodes these zones are associated with.
6249 	 */
6250 	for_each_possible_cpu(cpu) {
6251 		struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu);
6252 		memset(pzstats->vm_numa_event, 0,
6253 		       sizeof(pzstats->vm_numa_event));
6254 	}
6255 #endif
6256 
6257 	for_each_online_pgdat(pgdat)
6258 		pgdat->per_cpu_nodestats =
6259 			alloc_percpu(struct per_cpu_nodestat);
6260 }
6261 
6262 __meminit void zone_pcp_init(struct zone *zone)
6263 {
6264 	/*
6265 	 * per cpu subsystem is not up at this point. The following code
6266 	 * relies on the ability of the linker to provide the
6267 	 * offset of a (static) per cpu variable into the per cpu area.
6268 	 */
6269 	zone->per_cpu_pageset = &boot_pageset;
6270 	zone->per_cpu_zonestats = &boot_zonestats;
6271 	zone->pageset_high_min = BOOT_PAGESET_HIGH;
6272 	zone->pageset_high_max = BOOT_PAGESET_HIGH;
6273 	zone->pageset_batch = BOOT_PAGESET_BATCH;
6274 
6275 	if (populated_zone(zone))
6276 		pr_debug("  %s zone: %lu pages, LIFO batch:%u\n", zone->name,
6277 			 zone->present_pages, zone_batchsize(zone));
6278 }
6279 
6280 static void setup_per_zone_lowmem_reserve(void);
6281 
6282 void adjust_managed_page_count(struct page *page, long count)
6283 {
6284 	atomic_long_add(count, &page_zone(page)->managed_pages);
6285 	totalram_pages_add(count);
6286 	setup_per_zone_lowmem_reserve();
6287 }
6288 EXPORT_SYMBOL(adjust_managed_page_count);
6289 
/*
 * Release one reserved page: clear its tag reference and Reserved flag,
 * reinitialize its page count, free it with __free_page() and account one
 * more managed page via adjust_managed_page_count().
 */
void free_reserved_page(struct page *page)
{
	clear_page_tag_ref(page);
	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
	adjust_managed_page_count(page, 1);
}
EXPORT_SYMBOL(free_reserved_page);
6299 
/*
 * CPU hotplug callback for a dead @cpu, registered in page_alloc_init_cpuhp()
 * as the last argument of cpuhp_setup_state_nocalls(). Drains what the cpu
 * left behind, folds its counters and recomputes the pcp limits of all
 * populated zones. Always returns 0.
 */
static int page_alloc_cpu_dead(unsigned int cpu)
{
	struct zone *zone;

	lru_add_drain_cpu(cpu);
	mlock_drain_remote(cpu);
	drain_pages(cpu);

	/*
	 * Spill the event counters of the dead processor
	 * into the current processors event counters.
	 * This artificially elevates the count of the current
	 * processor.
	 */
	vm_events_fold_cpu(cpu);

	/*
	 * Zero the differential counters of the dead processor
	 * so that the vm statistics are consistent.
	 *
	 * This is only okay since the processor is dead and cannot
	 * race with what we are doing.
	 */
	cpu_vm_stats_fold(cpu);

	/* cpu_online == 0: no extra cpu is added to the split count */
	for_each_populated_zone(zone)
		zone_pcp_update(zone, 0);

	return 0;
}
6330 
/*
 * CPU hotplug callback for an onlining cpu, registered in
 * page_alloc_init_cpuhp(). Recomputes the pcp limits of all populated zones
 * with cpu_online == 1. @cpu itself is not used. Always returns 0.
 */
static int page_alloc_cpu_online(unsigned int cpu)
{
	struct zone *zone;

	for_each_populated_zone(zone)
		zone_pcp_update(zone, 1);
	return 0;
}
6339 
6340 void __init page_alloc_init_cpuhp(void)
6341 {
6342 	int ret;
6343 
6344 	ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC,
6345 					"mm/page_alloc:pcp",
6346 					page_alloc_cpu_online,
6347 					page_alloc_cpu_dead);
6348 	WARN_ON(ret < 0);
6349 }
6350 
6351 /*
6352  * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
6353  *	or min_free_kbytes changes.
6354  */
6355 static void calculate_totalreserve_pages(void)
6356 {
6357 	struct pglist_data *pgdat;
6358 	unsigned long reserve_pages = 0;
6359 	enum zone_type i, j;
6360 
6361 	for_each_online_pgdat(pgdat) {
6362 
6363 		pgdat->totalreserve_pages = 0;
6364 
6365 		for (i = 0; i < MAX_NR_ZONES; i++) {
6366 			struct zone *zone = pgdat->node_zones + i;
6367 			long max = 0;
6368 			unsigned long managed_pages = zone_managed_pages(zone);
6369 
6370 			/*
6371 			 * lowmem_reserve[j] is monotonically non-decreasing
6372 			 * in j for a given zone (see
6373 			 * setup_per_zone_lowmem_reserve()). The maximum
6374 			 * valid reserve lives at the highest index with a
6375 			 * non-zero value, so scan backwards and stop at the
6376 			 * first hit.
6377 			 */
6378 			for (j = MAX_NR_ZONES - 1; j > i; j--) {
6379 				if (!zone->lowmem_reserve[j])
6380 					continue;
6381 
6382 				max = zone->lowmem_reserve[j];
6383 				break;
6384 			}
6385 			/* we treat the high watermark as reserved pages. */
6386 			max += high_wmark_pages(zone);
6387 
6388 			max = min_t(unsigned long, max, managed_pages);
6389 
6390 			pgdat->totalreserve_pages += max;
6391 
6392 			reserve_pages += max;
6393 		}
6394 	}
6395 	totalreserve_pages = reserve_pages;
6396 	trace_mm_calculate_totalreserve_pages(totalreserve_pages);
6397 }
6398 
6399 /*
6400  * setup_per_zone_lowmem_reserve - called whenever
6401  *	sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
6402  *	has a correct pages reserved value, so an adequate number of
6403  *	pages are left in the zone after a successful __alloc_pages().
6404  */
6405 static void setup_per_zone_lowmem_reserve(void)
6406 {
6407 	struct pglist_data *pgdat;
6408 	enum zone_type i, j;
6409 	/*
6410 	 * For a given zone node_zones[i], lowmem_reserve[j] (j > i)
6411 	 * represents how many pages in zone i must effectively be kept
6412 	 * in reserve when deciding whether an allocation class that is
6413 	 * allowed to allocate from zones up to j may fall back into
6414 	 * zone i.
6415 	 *
6416 	 * As j increases, the allocation class can use a strictly larger
6417 	 * set of fallback zones and therefore must not be allowed to
6418 	 * deplete low zones more aggressively than a less flexible one.
6419 	 * As a result, lowmem_reserve[j] is required to be monotonically
6420 	 * non-decreasing in j for each zone i. Callers such as
6421 	 * calculate_totalreserve_pages() rely on this monotonicity when
6422 	 * selecting the maximum reserve entry.
6423 	 */
6424 	for_each_online_pgdat(pgdat) {
6425 		for (i = 0; i < MAX_NR_ZONES - 1; i++) {
6426 			struct zone *zone = &pgdat->node_zones[i];
6427 			int ratio = sysctl_lowmem_reserve_ratio[i];
6428 			bool clear = !ratio || !zone_managed_pages(zone);
6429 			unsigned long managed_pages = 0;
6430 
6431 			for (j = i + 1; j < MAX_NR_ZONES; j++) {
6432 				struct zone *upper_zone = &pgdat->node_zones[j];
6433 
6434 				managed_pages += zone_managed_pages(upper_zone);
6435 
6436 				if (clear)
6437 					zone->lowmem_reserve[j] = 0;
6438 				else
6439 					zone->lowmem_reserve[j] = managed_pages / ratio;
6440 				trace_mm_setup_per_zone_lowmem_reserve(zone, upper_zone,
6441 								       zone->lowmem_reserve[j]);
6442 			}
6443 		}
6444 	}
6445 
6446 	/* update totalreserve_pages */
6447 	calculate_totalreserve_pages();
6448 }
6449 
6450 static void __setup_per_zone_wmarks(void)
6451 {
6452 	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
6453 	unsigned long lowmem_pages = 0;
6454 	struct zone *zone;
6455 	unsigned long flags;
6456 
6457 	/* Calculate total number of !ZONE_HIGHMEM and !ZONE_MOVABLE pages */
6458 	for_each_zone(zone) {
6459 		if (!is_highmem(zone) && zone_idx(zone) != ZONE_MOVABLE)
6460 			lowmem_pages += zone_managed_pages(zone);
6461 	}
6462 
6463 	for_each_zone(zone) {
6464 		u64 tmp;
6465 
6466 		spin_lock_irqsave(&zone->lock, flags);
6467 		tmp = (u64)pages_min * zone_managed_pages(zone);
6468 		tmp = div64_ul(tmp, lowmem_pages);
6469 		if (is_highmem(zone) || zone_idx(zone) == ZONE_MOVABLE) {
6470 			/*
6471 			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
6472 			 * need highmem and movable zones pages, so cap pages_min
6473 			 * to a small  value here.
6474 			 *
6475 			 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
6476 			 * deltas control async page reclaim, and so should
6477 			 * not be capped for highmem and movable zones.
6478 			 */
6479 			unsigned long min_pages;
6480 
6481 			min_pages = zone_managed_pages(zone) / 1024;
6482 			min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
6483 			zone->_watermark[WMARK_MIN] = min_pages;
6484 		} else {
6485 			/*
6486 			 * If it's a lowmem zone, reserve a number of pages
6487 			 * proportionate to the zone's size.
6488 			 */
6489 			zone->_watermark[WMARK_MIN] = tmp;
6490 		}
6491 
6492 		/*
6493 		 * Set the kswapd watermarks distance according to the
6494 		 * scale factor in proportion to available memory, but
6495 		 * ensure a minimum size on small systems.
6496 		 */
6497 		tmp = max_t(u64, tmp >> 2,
6498 			    mult_frac(zone_managed_pages(zone),
6499 				      watermark_scale_factor, 10000));
6500 
6501 		zone->watermark_boost = 0;
6502 		zone->_watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
6503 		zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
6504 		zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
6505 		trace_mm_setup_per_zone_wmarks(zone);
6506 
6507 		spin_unlock_irqrestore(&zone->lock, flags);
6508 	}
6509 
6510 	/* update totalreserve_pages */
6511 	calculate_totalreserve_pages();
6512 }
6513 
/**
 * setup_per_zone_wmarks - called when min_free_kbytes changes
 * or when memory is hot-{added|removed}
 *
 * Ensures that the watermark[min,low,high] values for each zone are set
 * correctly with respect to min_free_kbytes. The recalculation itself is
 * serialized by a function-local spinlock; the pcp limits of every zone are
 * refreshed afterwards, outside that lock.
 */
void setup_per_zone_wmarks(void)
{
	struct zone *zone;
	static DEFINE_SPINLOCK(lock);

	spin_lock(&lock);
	__setup_per_zone_wmarks();
	spin_unlock(&lock);

	/*
	 * The watermark sizes have changed, so update the pcp batch
	 * and high limits or the limits may be inappropriate.
	 */
	for_each_zone(zone)
		zone_pcp_update(zone, 0);
}
6537 
6538 /*
6539  * Initialise min_free_kbytes.
6540  *
6541  * For small machines we want it small (128k min).  For large machines
6542  * we want it large (256MB max).  But it is not linear, because network
6543  * bandwidth does not increase linearly with machine size.  We use
6544  *
6545  *	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
6546  *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
6547  *
6548  * which yields
6549  *
6550  * 16MB:	512k
6551  * 32MB:	724k
6552  * 64MB:	1024k
6553  * 128MB:	1448k
6554  * 256MB:	2048k
6555  * 512MB:	2896k
6556  * 1024MB:	4096k
6557  * 2048MB:	5792k
6558  * 4096MB:	8192k
6559  * 8192MB:	11584k
6560  * 16384MB:	16384k
6561  */
6562 void calculate_min_free_kbytes(void)
6563 {
6564 	unsigned long lowmem_kbytes;
6565 	int new_min_free_kbytes;
6566 
6567 	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
6568 	new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
6569 
6570 	if (new_min_free_kbytes > user_min_free_kbytes)
6571 		min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144);
6572 	else
6573 		pr_warn_ratelimited("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
6574 				    new_min_free_kbytes, user_min_free_kbytes);
6575 
6576 }
6577 
/*
 * Compute min_free_kbytes and then everything derived from it: per-zone
 * watermarks, zone stat thresholds, lowmem reserves, (on NUMA) the
 * min_unmapped and min_slab page counts, and the khugepaged minimum.
 * Registered as a postcore initcall below. Always returns 0.
 */
int __meminit init_per_zone_wmark_min(void)
{
	calculate_min_free_kbytes();
	setup_per_zone_wmarks();
	refresh_zone_stat_thresholds();
	setup_per_zone_lowmem_reserve();

#ifdef CONFIG_NUMA
	setup_min_unmapped_ratio();
	setup_min_slab_ratio();
#endif

	khugepaged_min_free_kbytes_update();

	return 0;
}
postcore_initcall(init_per_zone_wmark_min)
6595 
6596 /*
6597  * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
6598  *	that we can call two helper functions whenever min_free_kbytes
6599  *	changes.
6600  */
6601 static int min_free_kbytes_sysctl_handler(const struct ctl_table *table, int write,
6602 		void *buffer, size_t *length, loff_t *ppos)
6603 {
6604 	int rc;
6605 
6606 	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
6607 	if (rc)
6608 		return rc;
6609 
6610 	if (write) {
6611 		user_min_free_kbytes = min_free_kbytes;
6612 		setup_per_zone_wmarks();
6613 	}
6614 	return 0;
6615 }
6616 
6617 static int watermark_scale_factor_sysctl_handler(const struct ctl_table *table, int write,
6618 		void *buffer, size_t *length, loff_t *ppos)
6619 {
6620 	int rc;
6621 
6622 	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
6623 	if (rc)
6624 		return rc;
6625 
6626 	if (write)
6627 		setup_per_zone_wmarks();
6628 
6629 	return 0;
6630 }
6631 
6632 #ifdef CONFIG_NUMA
6633 static void setup_min_unmapped_ratio(void)
6634 {
6635 	pg_data_t *pgdat;
6636 	struct zone *zone;
6637 
6638 	for_each_online_pgdat(pgdat)
6639 		pgdat->min_unmapped_pages = 0;
6640 
6641 	for_each_zone(zone)
6642 		zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
6643 						         sysctl_min_unmapped_ratio) / 100;
6644 }
6645 
6646 
6647 static int sysctl_min_unmapped_ratio_sysctl_handler(const struct ctl_table *table, int write,
6648 		void *buffer, size_t *length, loff_t *ppos)
6649 {
6650 	int rc;
6651 
6652 	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
6653 	if (rc)
6654 		return rc;
6655 
6656 	setup_min_unmapped_ratio();
6657 
6658 	return 0;
6659 }
6660 
6661 static void setup_min_slab_ratio(void)
6662 {
6663 	pg_data_t *pgdat;
6664 	struct zone *zone;
6665 
6666 	for_each_online_pgdat(pgdat)
6667 		pgdat->min_slab_pages = 0;
6668 
6669 	for_each_zone(zone)
6670 		zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
6671 						     sysctl_min_slab_ratio) / 100;
6672 }
6673 
6674 static int sysctl_min_slab_ratio_sysctl_handler(const struct ctl_table *table, int write,
6675 		void *buffer, size_t *length, loff_t *ppos)
6676 {
6677 	int rc;
6678 
6679 	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
6680 	if (rc)
6681 		return rc;
6682 
6683 	setup_min_slab_ratio();
6684 
6685 	return 0;
6686 }
6687 #endif
6688 
6689 /*
6690  * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
6691  *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
6692  *	whenever sysctl_lowmem_reserve_ratio changes.
6693  *
6694  * The reserve ratio obviously has absolutely no relation with the
6695  * minimum watermarks. The lowmem reserve ratio can only make sense
6696  * if in function of the boot time zone sizes.
6697  */
6698 static int lowmem_reserve_ratio_sysctl_handler(const struct ctl_table *table,
6699 		int write, void *buffer, size_t *length, loff_t *ppos)
6700 {
6701 	int i;
6702 
6703 	proc_dointvec_minmax(table, write, buffer, length, ppos);
6704 
6705 	for (i = 0; i < MAX_NR_ZONES; i++) {
6706 		if (sysctl_lowmem_reserve_ratio[i] < 1)
6707 			sysctl_lowmem_reserve_ratio[i] = 0;
6708 	}
6709 
6710 	setup_per_zone_lowmem_reserve();
6711 	return 0;
6712 }
6713 
6714 /*
6715  * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each
6716  * cpu. It is the fraction of total pages in each zone that a hot per cpu
6717  * pagelist can have before it gets flushed back to buddy allocator.
6718  */
6719 static int percpu_pagelist_high_fraction_sysctl_handler(const struct ctl_table *table,
6720 		int write, void *buffer, size_t *length, loff_t *ppos)
6721 {
6722 	struct zone *zone;
6723 	int old_percpu_pagelist_high_fraction;
6724 	int ret;
6725 
6726 	/*
6727 	 * Avoid using pcp_batch_high_lock for reads as the value is read
6728 	 * atomically and a race with offlining is harmless.
6729 	 */
6730 
6731 	if (!write)
6732 		return proc_dointvec_minmax(table, write, buffer, length, ppos);
6733 
6734 	mutex_lock(&pcp_batch_high_lock);
6735 	old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction;
6736 
6737 	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
6738 	if (ret < 0)
6739 		goto out;
6740 
6741 	/* Sanity checking to avoid pcp imbalance */
6742 	if (percpu_pagelist_high_fraction &&
6743 	    percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) {
6744 		percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction;
6745 		ret = -EINVAL;
6746 		goto out;
6747 	}
6748 
6749 	/* No change? */
6750 	if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction)
6751 		goto out;
6752 
6753 	for_each_populated_zone(zone)
6754 		zone_set_pageset_high_and_batch(zone, 0);
6755 out:
6756 	mutex_unlock(&pcp_batch_high_lock);
6757 	return ret;
6758 }
6759 
/*
 * Page allocator sysctl entries; page_alloc_sysctl_init() registers this
 * table under "vm". All entries are mode 0644. Where extra1/extra2 are set
 * they are the bounds handed to the minmax-style handler.
 */
static const struct ctl_table page_alloc_sysctl_table[] = {
	{
		/* Lower bound 0, no upper bound given. */
		.procname	= "min_free_kbytes",
		.data		= &min_free_kbytes,
		.maxlen		= sizeof(min_free_kbytes),
		.mode		= 0644,
		.proc_handler	= min_free_kbytes_sysctl_handler,
		.extra1		= SYSCTL_ZERO,
	},
	{
		/* Plain integer with lower bound 0; no dedicated handler. */
		.procname	= "watermark_boost_factor",
		.data		= &watermark_boost_factor,
		.maxlen		= sizeof(watermark_boost_factor),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	},
	{
		/* Bounded by SYSCTL_ONE and SYSCTL_THREE_THOUSAND. */
		.procname	= "watermark_scale_factor",
		.data		= &watermark_scale_factor,
		.maxlen		= sizeof(watermark_scale_factor),
		.mode		= 0644,
		.proc_handler	= watermark_scale_factor_sysctl_handler,
		.extra1		= SYSCTL_ONE,
		.extra2		= SYSCTL_THREE_THOUSAND,
	},
	{
		/* Boolean-like knob: only 0 and 1 are accepted. */
		.procname	= "defrag_mode",
		.data		= &defrag_mode,
		.maxlen		= sizeof(defrag_mode),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{
		/*
		 * Lower bound 0 here; the handler additionally rejects
		 * non-zero values below MIN_PERCPU_PAGELIST_HIGH_FRACTION.
		 */
		.procname	= "percpu_pagelist_high_fraction",
		.data		= &percpu_pagelist_high_fraction,
		.maxlen		= sizeof(percpu_pagelist_high_fraction),
		.mode		= 0644,
		.proc_handler	= percpu_pagelist_high_fraction_sysctl_handler,
		.extra1		= SYSCTL_ZERO,
	},
	{
		/*
		 * Whole array exposed as one entry, without extra1/extra2
		 * bounds; the handler resets entries below 1 to 0.
		 */
		.procname	= "lowmem_reserve_ratio",
		.data		= &sysctl_lowmem_reserve_ratio,
		.maxlen		= sizeof(sysctl_lowmem_reserve_ratio),
		.mode		= 0644,
		.proc_handler	= lowmem_reserve_ratio_sysctl_handler,
	},
#ifdef CONFIG_NUMA
	/* The remaining entries exist only when CONFIG_NUMA is enabled. */
	{
		.procname	= "numa_zonelist_order",
		.data		= &numa_zonelist_order,
		.maxlen		= NUMA_ZONELIST_ORDER_LEN,
		.mode		= 0644,
		.proc_handler	= numa_zonelist_order_handler,
	},
	{
		/* Percentage, bounded to 0..100. */
		.procname	= "min_unmapped_ratio",
		.data		= &sysctl_min_unmapped_ratio,
		.maxlen		= sizeof(sysctl_min_unmapped_ratio),
		.mode		= 0644,
		.proc_handler	= sysctl_min_unmapped_ratio_sysctl_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE_HUNDRED,
	},
	{
		/* Percentage, bounded to 0..100. */
		.procname	= "min_slab_ratio",
		.data		= &sysctl_min_slab_ratio,
		.maxlen		= sizeof(sysctl_min_slab_ratio),
		.mode		= 0644,
		.proc_handler	= sysctl_min_slab_ratio_sysctl_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE_HUNDRED,
	},
#endif
};
6838 
/* Register page_alloc_sysctl_table under the "vm" sysctl directory. */
void __init page_alloc_sysctl_init(void)
{
	register_sysctl_init("vm", page_alloc_sysctl_table);
}
6843 
6844 static void free_prepared_contig_range(struct page *page,
6845 		unsigned long nr_pages)
6846 {
6847 	unsigned long pfn = page_to_pfn(page);
6848 
6849 	while (nr_pages) {
6850 		unsigned int order;
6851 
6852 		/* We are limited by the largest buddy order. */
6853 		order = pfn ? __ffs(pfn) : MAX_PAGE_ORDER;
6854 		/* Don't exceed the number of pages to free. */
6855 		order = min_t(unsigned int, order, ilog2(nr_pages));
6856 		order = min_t(unsigned int, order, MAX_PAGE_ORDER);
6857 
6858 		/*
6859 		 * Free the chunk as a single block. Our caller has already
6860 		 * called free_pages_prepare() for each order-0 page.
6861 		 */
6862 		__free_frozen_pages(page, order, FPI_PREPARED);
6863 
6864 		pfn += 1UL << order;
6865 		page += 1UL << order;
6866 		nr_pages -= 1UL << order;
6867 	}
6868 }
6869 
6870 static void __free_contig_range_common(unsigned long pfn, unsigned long nr_pages,
6871 		bool is_frozen)
6872 {
6873 	struct page *page, *start = NULL;
6874 	unsigned long nr_start = 0;
6875 	unsigned long start_sec;
6876 	unsigned long i;
6877 
6878 	for (i = 0; i < nr_pages; i++) {
6879 		bool can_free = true;
6880 
6881 		/*
6882 		 * Contiguous PFNs might not have contiguous "struct pages"
6883 		 * in some kernel configs: page++ across a section boundary
6884 		 * is undefined. Use pfn_to_page() for each PFN.
6885 		 */
6886 		page = pfn_to_page(pfn + i);
6887 
6888 		VM_WARN_ON_ONCE(PageHead(page));
6889 		VM_WARN_ON_ONCE(PageTail(page));
6890 
6891 		if (!is_frozen)
6892 			can_free = put_page_testzero(page);
6893 
6894 		if (can_free)
6895 			can_free = free_pages_prepare(page, 0);
6896 
6897 		if (!can_free) {
6898 			if (start) {
6899 				free_prepared_contig_range(start, i - nr_start);
6900 				start = NULL;
6901 			}
6902 			continue;
6903 		}
6904 
6905 		if (start && memdesc_section(page->flags) != start_sec) {
6906 			free_prepared_contig_range(start, i - nr_start);
6907 			start = page;
6908 			nr_start = i;
6909 			start_sec = memdesc_section(page->flags);
6910 		} else if (!start) {
6911 			start = page;
6912 			nr_start = i;
6913 			start_sec = memdesc_section(page->flags);
6914 		}
6915 	}
6916 
6917 	if (start)
6918 		free_prepared_contig_range(start, nr_pages - nr_start);
6919 }
6920 
/**
 * __free_contig_range - Free contiguous range of order-0 pages.
 * @pfn: Page frame number of the first page in the range.
 * @nr_pages: Number of pages to free.
 *
 * For each order-0 struct page in the physically contiguous range, put a
 * reference. Free any page whose reference count falls to zero. The
 * implementation is functionally equivalent to, but significantly faster than
 * calling __free_page() for each struct page in a loop.
 *
 * Memory allocated with alloc_pages(order>=1) then subsequently split to
 * order-0 with split_page() is an example of appropriate contiguous pages that
 * can be freed with this API.
 *
 * Context: May be called in interrupt context or while holding a normal
 * spinlock, but not in NMI context or while holding a raw spinlock.
 */
void __free_contig_range(unsigned long pfn, unsigned long nr_pages)
{
	/* Non-frozen variant: a reference is dropped on every page first. */
	__free_contig_range_common(pfn, nr_pages, /* is_frozen= */ false);
}
6942 
6943 #ifdef CONFIG_CONTIG_ALLOC
6944 /* Usage: See admin-guide/dynamic-debug-howto.rst */
6945 static void alloc_contig_dump_pages(struct list_head *page_list)
6946 {
6947 	DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
6948 
6949 	if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
6950 		struct page *page;
6951 
6952 		dump_stack();
6953 		list_for_each_entry(page, page_list, lru)
6954 			dump_page(page, "migration failure");
6955 	}
6956 }
6957 
/*
 * Isolate and migrate the pages in [start, end) away, using @cc to carry
 * the migration list and settings.
 *
 * [start, end) must belong to a single zone.
 *
 * Returns 0 on success or a negative error code: -EINTR when a fatal signal
 * is pending, -EBUSY when the same batch still has pages left after repeated
 * migration attempts, -ENOMEM from migrate_pages(), or an error from
 * isolate_migratepages_range(). Non-negative results of migrate_pages() are
 * reported as 0.
 */
static int __alloc_contig_migrate_range(struct compact_control *cc,
					unsigned long start, unsigned long end)
{
	/* This function is based on compact_zone() from compaction.c. */
	unsigned int nr_reclaimed;
	unsigned long pfn = start;
	unsigned int tries = 0;
	int ret = 0;
	struct migration_target_control mtc = {
		.nid = zone_to_nid(cc->zone),
		.gfp_mask = cc->gfp_mask,
		.reason = MR_CONTIG_RANGE,
	};

	lru_cache_disable();

	/* Keep going while PFNs remain to scan or isolated pages are pending. */
	while (pfn < end || !list_empty(&cc->migratepages)) {
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		if (list_empty(&cc->migratepages)) {
			/* Previous batch is done: isolate the next one. */
			cc->nr_migratepages = 0;
			ret = isolate_migratepages_range(cc, pfn, end);
			if (ret && ret != -EAGAIN)
				break;
			pfn = cc->migrate_pfn;
			tries = 0;
		} else if (++tries == 5) {
			/* The same batch is still not empty: give up. */
			ret = -EBUSY;
			break;
		}

		/* Pages reclaimed here no longer need to be migrated. */
		nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
							&cc->migratepages);
		cc->nr_migratepages -= nr_reclaimed;

		ret = migrate_pages(&cc->migratepages, alloc_migration_target,
			NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL);

		/*
		 * On -ENOMEM, migrate_pages() bails out right away. It is pointless
		 * to retry again over this error, so do the same here.
		 */
		if (ret == -ENOMEM)
			break;
	}

	lru_cache_enable();
	if (ret < 0) {
		/* Optionally report the stuck pages, then put them back. */
		if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY)
			alloc_contig_dump_pages(&cc->migratepages);
		putback_movable_pages(&cc->migratepages);
	}

	return (ret < 0) ? ret : 0;
}
7017 
7018 static void split_free_frozen_pages(struct list_head *list, gfp_t gfp_mask)
7019 {
7020 	int order;
7021 
7022 	for (order = 0; order < NR_PAGE_ORDERS; order++) {
7023 		struct page *page, *next;
7024 		int nr_pages = 1 << order;
7025 
7026 		list_for_each_entry_safe(page, next, &list[order], lru) {
7027 			int i;
7028 
7029 			post_alloc_hook(page, order, gfp_mask);
7030 			if (!order)
7031 				continue;
7032 
7033 			__split_page(page, order);
7034 
7035 			/* Add all subpages to the order-0 head, in sequence. */
7036 			list_del(&page->lru);
7037 			for (i = 0; i < nr_pages; i++)
7038 				list_add_tail(&page[i].lru, &list[0]);
7039 		}
7040 	}
7041 }
7042 
7043 static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask)
7044 {
7045 	const gfp_t reclaim_mask = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
7046 	const gfp_t action_mask = __GFP_COMP | __GFP_RETRY_MAYFAIL | __GFP_NOWARN |
7047 				  __GFP_ZERO | __GFP_ZEROTAGS | __GFP_SKIP_ZERO |
7048 				  __GFP_SKIP_KASAN;
7049 	const gfp_t cc_action_mask = __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
7050 
7051 	/*
7052 	 * We are given the range to allocate; node, mobility and placement
7053 	 * hints are irrelevant at this point. We'll simply ignore them.
7054 	 */
7055 	gfp_mask &= ~(GFP_ZONEMASK | __GFP_RECLAIMABLE | __GFP_WRITE |
7056 		      __GFP_HARDWALL | __GFP_THISNODE | __GFP_MOVABLE);
7057 
7058 	/*
7059 	 * We only support most reclaim flags (but not NOFAIL/NORETRY), and
7060 	 * selected action flags.
7061 	 */
7062 	if (gfp_mask & ~(reclaim_mask | action_mask))
7063 		return -EINVAL;
7064 
7065 	/*
7066 	 * Flags to control page compaction/migration/reclaim, to free up our
7067 	 * page range. Migratable pages are movable, __GFP_MOVABLE is implied
7068 	 * for them.
7069 	 *
7070 	 * Traditionally we always had __GFP_RETRY_MAYFAIL set, keep doing that
7071 	 * to not degrade callers.
7072 	 */
7073 	*gfp_cc_mask = (gfp_mask & (reclaim_mask | cc_action_mask)) |
7074 			__GFP_MOVABLE | __GFP_RETRY_MAYFAIL;
7075 	return 0;
7076 }
7077 
/*
 * Free a contiguous range of frozen order-0 pages: same as
 * __free_contig_range(), except that no reference is dropped on the pages
 * before they are freed.
 */
static void __free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages)
{
	__free_contig_range_common(pfn, nr_pages, /* is_frozen= */ true);
}
7082 
/**
 * alloc_contig_frozen_range() -- tries to allocate given range of frozen pages
 * @start:	start PFN to allocate
 * @end:	one-past-the-last PFN to allocate
 * @alloc_flags:	allocation information
 * @gfp_mask:	GFP mask. Node/zone/placement hints are ignored; only some
 *		action and reclaim modifiers are supported. Reclaim modifiers
 *		control allocation behavior during compaction/migration/reclaim.
 *
 * The PFN range does not have to be pageblock aligned. The PFN range must
 * belong to a single zone.
 *
 * The first thing this routine does is attempt to MIGRATE_ISOLATE all
 * pageblocks in the range.  Once isolated, the pageblocks should not
 * be modified by others.
 *
 * All frozen pages whose PFN is in [start, end) are allocated for the
 * caller and can be freed with free_contig_frozen_range(); compound frozen
 * pages can also be freed directly with free_frozen_pages().
 *
 * Return: zero on success or negative error code.
 */
int alloc_contig_frozen_range_noprof(unsigned long start, unsigned long end,
		acr_flags_t alloc_flags, gfp_t gfp_mask)
{
	const unsigned int order = ilog2(end - start);
	unsigned long outer_start, outer_end;
	int ret = 0;

	struct compact_control cc = {
		.nr_migratepages = 0,
		.order = -1,
		.zone = page_zone(pfn_to_page(start)),
		.mode = MIGRATE_SYNC,
		.ignore_skip_hint = true,
		.no_set_skip_hint = true,
		.alloc_contig = true,
	};
	INIT_LIST_HEAD(&cc.migratepages);
	enum pb_isolate_mode mode = (alloc_flags & ACR_FLAGS_CMA) ?
					    PB_ISOLATE_MODE_CMA_ALLOC :
					    PB_ISOLATE_MODE_OTHER;

	/*
	 * In contrast to the buddy, we allow for orders here that exceed
	 * MAX_PAGE_ORDER, so we must manually make sure that we are not
	 * exceeding the maximum folio order.
	 */
	if (WARN_ON_ONCE((gfp_mask & __GFP_COMP) && order > MAX_FOLIO_ORDER))
		return -EINVAL;

	/* Validate the flags and derive the mask used for migration. */
	gfp_mask = current_gfp_context(gfp_mask);
	if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask))
		return -EINVAL;

	/*
	 * What we do here is we mark all pageblocks in range as
	 * MIGRATE_ISOLATE.  Because pageblock and max order pages may
	 * have different sizes, and due to the way the page allocator
	 * works, start_isolate_page_range() has special handling for this.
	 *
	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
	 * migrate the pages from an unaligned range (ie. pages that
	 * we are interested in). This will put all the pages in
	 * range back to page allocator as MIGRATE_ISOLATE.
	 *
	 * When this is done, we take the pages in range from page
	 * allocator removing them from the buddy system.  This way
	 * page allocator will never consider using them.
	 *
	 * This lets us mark the pageblocks back as
	 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
	 * aligned range but not in the unaligned, original range are
	 * put back to page allocator so that buddy can use them.
	 */

	ret = start_isolate_page_range(start, end, mode);
	if (ret)
		goto done;

	drain_all_pages(cc.zone);

	/*
	 * In case of -EBUSY, we'd like to know which page causes problem.
	 * So, just fall through. test_pages_isolated() has a tracepoint
	 * which will report the busy page.
	 *
	 * It is possible that busy pages could become available before
	 * the call to test_pages_isolated, and the range will actually be
	 * allocated.  So, if we fall through be sure to clear ret so that
	 * -EBUSY is not accidentally used or returned to caller.
	 */
	ret = __alloc_contig_migrate_range(&cc, start, end);
	if (ret && ret != -EBUSY)
		goto done;

	/*
	 * When in-use hugetlb pages are migrated, they may simply be released
	 * back into the free hugepage pool instead of being returned to the
	 * buddy system.  After the migration of in-use huge pages is completed,
	 * we will invoke replace_free_hugepage_folios() to ensure that these
	 * hugepages are properly released to the buddy system.
	 *
	 * Note that this assignment also overwrites a -EBUSY left in ret by
	 * the migration step above.
	 */
	ret = replace_free_hugepage_folios(start, end);
	if (ret)
		goto done;

	/*
	 * Pages from [start, end) are within a pageblock_nr_pages
	 * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
	 * more, all pages in [start, end) are free in page allocator.
	 * What we are going to do is to allocate all pages from
	 * [start, end) (that is remove them from page allocator).
	 *
	 * The only problem is that pages at the beginning and at the
	 * end of interesting range may be not aligned with pages that
	 * page allocator holds, ie. they can be part of higher order
	 * pages.  Because of this, we reserve the bigger range and
	 * once this is done free the pages we are not interested in.
	 *
	 * We don't have to hold zone->lock here because the pages are
	 * isolated thus they won't get removed from buddy.
	 */
	outer_start = find_large_buddy(start);

	/* Make sure the range is really isolated. */
	if (test_pages_isolated(outer_start, end, mode)) {
		ret = -EBUSY;
		goto done;
	}

	/* Grab isolated pages from freelists. */
	outer_end = isolate_freepages_range(&cc, outer_start, end);
	if (!outer_end) {
		ret = -EBUSY;
		goto done;
	}

	if (!(gfp_mask & __GFP_COMP)) {
		/* Non-compound request: hand out order-0 frozen pages. */
		split_free_frozen_pages(cc.freepages, gfp_mask);

		/* Free head and tail (if any) */
		if (start != outer_start)
			__free_contig_frozen_range(outer_start, start - outer_start);
		if (end != outer_end)
			__free_contig_frozen_range(end, outer_end - end);
	} else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) {
		/* Compound request that exactly matches what was grabbed. */
		struct page *head = pfn_to_page(start);

		check_new_pages(head, order);
		prep_new_page(head, order, gfp_mask, 0);
	} else {
		/* Compound request, but the grabbed range does not match. */
		ret = -EINVAL;
		WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n",
		     start, end, outer_start, outer_end);
	}
done:
	/* Reached on success and on every failure after isolation was tried. */
	undo_isolate_page_range(start, end);
	return ret;
}
EXPORT_SYMBOL(alloc_contig_frozen_range_noprof);
7245 
7246 /**
7247  * alloc_contig_range() -- tries to allocate given range of pages
7248  * @start:	start PFN to allocate
7249  * @end:	one-past-the-last PFN to allocate
7250  * @alloc_flags:	allocation information
7251  * @gfp_mask:	GFP mask.
7252  *
7253  * This routine is a wrapper around alloc_contig_frozen_range(), it can't
7254  * be used to allocate compound pages, the refcount of each allocated page
7255  * will be set to one.
7256  *
7257  * All pages which PFN is in [start, end) are allocated for the caller,
7258  * and should be freed with free_contig_range() or by manually calling
7259  * __free_page() on each allocated page.
7260  *
7261  * Return: zero on success or negative error code.
7262  */
7263 int alloc_contig_range_noprof(unsigned long start, unsigned long end,
7264 			      acr_flags_t alloc_flags, gfp_t gfp_mask)
7265 {
7266 	int ret;
7267 
7268 	if (WARN_ON(gfp_mask & __GFP_COMP))
7269 		return -EINVAL;
7270 
7271 	ret = alloc_contig_frozen_range_noprof(start, end, alloc_flags, gfp_mask);
7272 	if (!ret)
7273 		set_pages_refcounted(pfn_to_page(start), end - start);
7274 
7275 	return ret;
7276 }
7277 EXPORT_SYMBOL(alloc_contig_range_noprof);
7278 
7279 static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
7280 				   unsigned long nr_pages, bool skip_hugetlb,
7281 				   bool *skipped_hugetlb)
7282 {
7283 	unsigned long end_pfn = start_pfn + nr_pages;
7284 	struct page *page;
7285 
7286 	while (start_pfn < end_pfn) {
7287 		unsigned long step = 1;
7288 
7289 		page = pfn_to_online_page(start_pfn);
7290 		if (!page)
7291 			return false;
7292 
7293 		if (page_zone(page) != z)
7294 			return false;
7295 
7296 		if (page_is_unmovable(z, page, PB_ISOLATE_MODE_OTHER, &step))
7297 			return false;
7298 
7299 		/*
7300 		 * Only consider ranges containing hugepages if those pages are
7301 		 * smaller than the requested contiguous region.  e.g.:
7302 		 *     Move 2MB pages to free up a 1GB range.
7303 		 *     Don't move 1GB pages to free up a 2MB range.
7304 		 *
7305 		 * This makes contiguous allocation more reliable if multiple
7306 		 * hugepage sizes are used without causing needless movement.
7307 		 */
7308 		if (PageHuge(page)) {
7309 			unsigned int order;
7310 
7311 			if (skip_hugetlb) {
7312 				*skipped_hugetlb = true;
7313 				return false;
7314 			}
7315 
7316 			page = compound_head(page);
7317 			order = compound_order(page);
7318 			if ((order >= MAX_FOLIO_ORDER) ||
7319 			    (nr_pages <= (1 << order)))
7320 				return false;
7321 		}
7322 
7323 		start_pfn += step;
7324 	}
7325 	return true;
7326 }
7327 
7328 static bool zone_spans_last_pfn(const struct zone *zone,
7329 				unsigned long start_pfn, unsigned long nr_pages)
7330 {
7331 	unsigned long last_pfn = start_pfn + nr_pages - 1;
7332 
7333 	return zone_spans_pfn(zone, last_pfn);
7334 }
7335 
/**
 * alloc_contig_frozen_pages() -- tries to find and allocate contiguous range of frozen pages
 * @nr_pages:	Number of contiguous pages to allocate
 * @gfp_mask:	GFP mask. Node/zone/placement hints limit the search; only some
 *		action and reclaim modifiers are supported. Reclaim modifiers
 *		control allocation behavior during compaction/migration/reclaim.
 * @nid:	Target node
 * @nodemask:	Mask for other possible nodes
 *
 * This routine is a wrapper around alloc_contig_frozen_range(). It scans over
 * zones on an applicable zonelist to find a contiguous pfn range which can then
 * be tried for allocation with alloc_contig_frozen_range(). This routine is
 * intended for allocation requests which can not be fulfilled with the buddy
 * allocator.
 *
 * The allocated memory is always aligned to a page boundary. If nr_pages is a
 * power of two, then allocated range is also guaranteed to be aligned to same
 * nr_pages (e.g. 1GB request would be aligned to 1GB).
 *
 * Allocated frozen pages need to be freed with free_contig_frozen_range(),
 * or by manually calling free_frozen_pages() on each allocated frozen
 * non-compound page; compound frozen pages can be freed with
 * free_frozen_pages() directly.
 *
 * Return: pointer to contiguous frozen pages on success, or NULL if not successful.
 */
struct page *alloc_contig_frozen_pages_noprof(unsigned long nr_pages,
		gfp_t gfp_mask, int nid, nodemask_t *nodemask)
{
	unsigned long ret, pfn, flags;
	struct zonelist *zonelist;
	struct zone *zone;
	struct zoneref *z;
	bool skip_hugetlb = true;
	bool skipped_hugetlb = false;

retry:
	zonelist = node_zonelist(nid, gfp_mask);
	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					gfp_zone(gfp_mask), nodemask) {
		spin_lock_irqsave(&zone->lock, flags);

		/* Walk the zone in nr_pages sized, nr_pages aligned steps. */
		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
			if (pfn_range_valid_contig(zone, pfn, nr_pages,
						   skip_hugetlb,
						   &skipped_hugetlb)) {
				/*
				 * We release the zone lock here because
				 * alloc_contig_frozen_range() will also lock
				 * the zone at some point. If there's an
				 * allocation spinning on this lock, it may
				 * win the race and cause allocation to fail.
				 */
				spin_unlock_irqrestore(&zone->lock, flags);
				ret = alloc_contig_frozen_range_noprof(pfn,
							pfn + nr_pages,
							ACR_FLAGS_NONE,
							gfp_mask);
				/* Success returns with the zone lock released. */
				if (!ret)
					return pfn_to_page(pfn);
				spin_lock_irqsave(&zone->lock, flags);
			}
			pfn += nr_pages;
		}
		spin_unlock_irqrestore(&zone->lock, flags);
	}
	/*
	 * If we failed, retry the search, but treat regions with HugeTLB pages
	 * as valid targets.  This retains fast-allocations on first pass
	 * without trying to migrate HugeTLB pages (which may fail). On the
	 * second pass, we will try moving HugeTLB pages when those pages are
	 * smaller than the requested contiguous region size.
	 */
	if (skip_hugetlb && skipped_hugetlb) {
		skip_hugetlb = false;
		goto retry;
	}
	return NULL;
}
EXPORT_SYMBOL(alloc_contig_frozen_pages_noprof);
7417 
7418 /**
7419  * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
7420  * @nr_pages:	Number of contiguous pages to allocate
7421  * @gfp_mask:	GFP mask.
7422  * @nid:	Target node
7423  * @nodemask:	Mask for other possible nodes
7424  *
7425  * This routine is a wrapper around alloc_contig_frozen_pages(), it can't
7426  * be used to allocate compound pages, the refcount of each allocated page
7427  * will be set to one.
7428  *
7429  * Allocated pages can be freed with free_contig_range() or by manually
7430  * calling __free_page() on each allocated page.
7431  *
7432  * Return: pointer to contiguous pages on success, or NULL if not successful.
7433  */
7434 struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
7435 		int nid, nodemask_t *nodemask)
7436 {
7437 	struct page *page;
7438 
7439 	if (WARN_ON(gfp_mask & __GFP_COMP))
7440 		return NULL;
7441 
7442 	page = alloc_contig_frozen_pages_noprof(nr_pages, gfp_mask, nid,
7443 						nodemask);
7444 	if (page)
7445 		set_pages_refcounted(page, nr_pages);
7446 
7447 	return page;
7448 }
7449 EXPORT_SYMBOL(alloc_contig_pages_noprof);
7450 
/**
 * free_contig_frozen_range() -- free the contiguous range of frozen pages
 * @pfn:	start PFN to free
 * @nr_pages:	Number of contiguous frozen pages to free
 *
 * This can be used to free the allocated compound/non-compound frozen pages.
 * If the first page is a compound head, the range is freed as one compound
 * page; otherwise it is freed as a run of order-0 frozen pages. Nothing is
 * freed (and a warning is issued once) if @pfn is not the head of its page.
 */
void free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages)
{
	const unsigned int order = ilog2(nr_pages);
	struct page *first_page = pfn_to_page(pfn);

	if (WARN_ON_ONCE(first_page != compound_head(first_page)))
		return;

	if (!PageHead(first_page)) {
		__free_contig_frozen_range(pfn, nr_pages);
		return;
	}

	/* Compound page: the requested size should match its order. */
	WARN_ON_ONCE(order != compound_order(first_page));
	free_frozen_pages(first_page, order);
}
EXPORT_SYMBOL(free_contig_frozen_range);
7475 
/**
 * free_contig_range() -- free the contiguous range of pages
 * @pfn:	start PFN to free
 * @nr_pages:	Number of contiguous pages to free
 *
 * This can only be used to free allocated non-compound pages: if the first
 * page is a compound head, a warning is issued once and nothing is freed.
 */
void free_contig_range(unsigned long pfn, unsigned long nr_pages)
{
	struct page *first = pfn_to_page(pfn);

	if (WARN_ON_ONCE(PageHead(first)))
		return;

	__free_contig_range(pfn, nr_pages);
}
EXPORT_SYMBOL(free_contig_range);
7491 #endif /* CONFIG_CONTIG_ALLOC */
7492 
/*
 * Effectively disable pcplists for the zone by setting the high limit to 0
 * and draining all cpus. A concurrent page freeing on another CPU that's about
 * to put the page on pcplist will either finish before the drain and the page
 * will be drained, or observe the new high limit and skip the pcplist.
 *
 * Returns with pcp_batch_high_lock held; the lock is released by
 * zone_pcp_enable().
 *
 * Must be paired with a call to zone_pcp_enable().
 */
void zone_pcp_disable(struct zone *zone)
{
	mutex_lock(&pcp_batch_high_lock);
	/* Arguments in the same order as in zone_pcp_enable(): 0, 0 and 1. */
	__zone_set_pageset_high_and_batch(zone, 0, 0, 1);
	__drain_all_pages(zone, true);
}
7507 
/*
 * Counterpart of zone_pcp_disable(): reapply the zone's stored
 * pageset_high_min, pageset_high_max and pageset_batch values and release
 * pcp_batch_high_lock, which zone_pcp_disable() left held.
 */
void zone_pcp_enable(struct zone *zone)
{
	__zone_set_pageset_high_and_batch(zone, zone->pageset_high_min,
		zone->pageset_high_max, zone->pageset_batch);
	mutex_unlock(&pcp_batch_high_lock);
}
7514 
7515 void zone_pcp_reset(struct zone *zone)
7516 {
7517 	int cpu;
7518 	struct per_cpu_zonestat *pzstats;
7519 
7520 	if (zone->per_cpu_pageset != &boot_pageset) {
7521 		for_each_online_cpu(cpu) {
7522 			pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
7523 			drain_zonestat(zone, pzstats);
7524 		}
7525 		free_percpu(zone->per_cpu_pageset);
7526 		zone->per_cpu_pageset = &boot_pageset;
7527 		if (zone->per_cpu_zonestats != &boot_zonestats) {
7528 			free_percpu(zone->per_cpu_zonestats);
7529 			zone->per_cpu_zonestats = &boot_zonestats;
7530 		}
7531 	}
7532 }
7533 
7534 #ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * All pages in the range must be in a single zone, must not contain holes,
 * must span full sections, and must be isolated before calling this function.
 *
 * The memory sections of the range are marked offline first; then, under
 * zone->lock, every free buddy block in the range is removed from the
 * MIGRATE_ISOLATE free list.
 *
 * Returns the number of managed (non-PageOffline()) pages in the range: the
 * number of pages for which memory offlining code must adjust managed page
 * counters using adjust_managed_page_count().
 */
unsigned long __offline_isolated_pages(unsigned long start_pfn,
		unsigned long end_pfn)
{
	unsigned long already_offline = 0;
	unsigned long pfn = start_pfn;
	struct page *page;
	struct zone *zone;
	unsigned int order;

	offline_mem_sections(pfn, end_pfn);
	zone = page_zone(pfn_to_page(pfn));
	/* zone->lock is taken here via guard() for the rest of the function. */
	guard(spinlock_irqsave)(&zone->lock);
	while (pfn < end_pfn) {
		page = pfn_to_page(pfn);
		/*
		 * The HWPoisoned page may be not in buddy system, and
		 * page_count() is not 0.
		 */
		if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
			pfn++;
			continue;
		}
		/*
		 * At this point all remaining PageOffline() pages have a
		 * reference count of 0 and can simply be skipped.
		 */
		if (PageOffline(page)) {
			BUG_ON(page_count(page));
			BUG_ON(PageBuddy(page));
			already_offline++;
			pfn++;
			continue;
		}

		/* Anything else must be an unreferenced free buddy block. */
		BUG_ON(page_count(page));
		BUG_ON(!PageBuddy(page));
		VM_WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE);
		order = buddy_order(page);
		del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE);
		/* Skip over the whole block that was just removed. */
		pfn += (1 << order);
	}

	return end_pfn - start_pfn - already_offline;
}
7587 #endif
7588 
7589 /*
7590  * This function returns a stable result only if called under zone lock.
7591  */
7592 bool is_free_buddy_page(const struct page *page)
7593 {
7594 	unsigned long pfn = page_to_pfn(page);
7595 	unsigned int order;
7596 
7597 	for (order = 0; order < NR_PAGE_ORDERS; order++) {
7598 		const struct page *head = page - (pfn & ((1 << order) - 1));
7599 
7600 		if (PageBuddy(head) &&
7601 		    buddy_order_unsafe(head) >= order)
7602 			break;
7603 	}
7604 
7605 	return order <= MAX_PAGE_ORDER;
7606 }
7607 EXPORT_SYMBOL(is_free_buddy_page);
7608 
7609 #ifdef CONFIG_MEMORY_FAILURE
/*
 * Put @page on the free list of @zone for the given @order and
 * @migratetype via __add_to_free_list(), then account the 1 << order
 * pages as free for that migratetype. @tail is passed through unchanged.
 */
static inline void add_to_free_list(struct page *page, struct zone *zone,
				    unsigned int order, int migratetype,
				    bool tail)
{
	__add_to_free_list(page, zone, order, migratetype, tail);
	account_freepages(zone, 1 << order, migratetype);
}
7617 
7618 /*
7619  * Break down a higher-order page in sub-pages, and keep our target out of
7620  * buddy allocator.
7621  */
7622 static void break_down_buddy_pages(struct zone *zone, struct page *page,
7623 				   struct page *target, int low, int high,
7624 				   int migratetype)
7625 {
7626 	unsigned long size = 1 << high;
7627 	struct page *current_buddy;
7628 
7629 	while (high > low) {
7630 		high--;
7631 		size >>= 1;
7632 
7633 		if (target >= &page[size]) {
7634 			current_buddy = page;
7635 			page = page + size;
7636 		} else {
7637 			current_buddy = page + size;
7638 		}
7639 
7640 		if (set_page_guard(zone, current_buddy, high))
7641 			continue;
7642 
7643 		add_to_free_list(current_buddy, zone, high, migratetype, false);
7644 		set_buddy_order(current_buddy, high);
7645 	}
7646 }
7647 
7648 /*
7649  * Take a page that will be marked as poisoned off the buddy allocator.
7650  */
7651 bool take_page_off_buddy(struct page *page)
7652 {
7653 	struct zone *zone = page_zone(page);
7654 	unsigned long pfn = page_to_pfn(page);
7655 	unsigned int order;
7656 
7657 	guard(spinlock_irqsave)(&zone->lock);
7658 	for (order = 0; order < NR_PAGE_ORDERS; order++) {
7659 		struct page *page_head = page - (pfn & ((1 << order) - 1));
7660 		int page_order = buddy_order(page_head);
7661 
7662 		if (PageBuddy(page_head) && page_order >= order) {
7663 			unsigned long pfn_head = page_to_pfn(page_head);
7664 			int migratetype = get_pfnblock_migratetype(page_head,
7665 								   pfn_head);
7666 
7667 			del_page_from_free_list(page_head, zone, page_order,
7668 						migratetype);
7669 			break_down_buddy_pages(zone, page_head, page, 0,
7670 						page_order, migratetype);
7671 			SetPageHWPoisonTakenOff(page);
7672 			return true;
7673 		}
7674 		if (page_count(page_head) > 0)
7675 			break;
7676 	}
7677 	return false;
7678 }
7679 
7680 /*
7681  * Cancel takeoff done by take_page_off_buddy().
7682  */
7683 bool put_page_back_buddy(struct page *page)
7684 {
7685 	struct zone *zone = page_zone(page);
7686 
7687 	guard(spinlock_irqsave)(&zone->lock);
7688 	if (put_page_testzero(page)) {
7689 		unsigned long pfn = page_to_pfn(page);
7690 		int migratetype = get_pfnblock_migratetype(page, pfn);
7691 
7692 		ClearPageHWPoisonTakenOff(page);
7693 		__free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
7694 		if (TestClearPageHWPoison(page))
7695 			return true;
7696 	}
7697 
7698 	return false;
7699 }
7700 #endif
7701 
7702 bool has_managed_zone(enum zone_type zone)
7703 {
7704 	struct pglist_data *pgdat;
7705 
7706 	for_each_online_pgdat(pgdat) {
7707 		if (managed_zone(&pgdat->node_zones[zone]))
7708 			return true;
7709 	}
7710 	return false;
7711 }
7712 
7713 #ifdef CONFIG_UNACCEPTED_MEMORY
7714 
7715 static bool lazy_accept = true;
7716 
7717 static int __init accept_memory_parse(char *p)
7718 {
7719 	if (!strcmp(p, "lazy")) {
7720 		lazy_accept = true;
7721 		return 0;
7722 	} else if (!strcmp(p, "eager")) {
7723 		lazy_accept = false;
7724 		return 0;
7725 	} else {
7726 		return -EINVAL;
7727 	}
7728 }
7729 early_param("accept_memory", accept_memory_parse);
7730 
7731 static bool page_contains_unaccepted(struct page *page, unsigned int order)
7732 {
7733 	phys_addr_t start = page_to_phys(page);
7734 
7735 	return range_contains_unaccepted_memory(start, PAGE_SIZE << order);
7736 }
7737 
7738 static void __accept_page(struct zone *zone, unsigned long *flags,
7739 			  struct page *page)
7740 {
7741 	list_del(&page->lru);
7742 	account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
7743 	__mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
7744 	__ClearPageUnaccepted(page);
7745 	spin_unlock_irqrestore(&zone->lock, *flags);
7746 
7747 	accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER);
7748 
7749 	__free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL);
7750 }
7751 
7752 void accept_page(struct page *page)
7753 {
7754 	struct zone *zone = page_zone(page);
7755 	unsigned long flags;
7756 
7757 	spin_lock_irqsave(&zone->lock, flags);
7758 	if (!PageUnaccepted(page)) {
7759 		spin_unlock_irqrestore(&zone->lock, flags);
7760 		return;
7761 	}
7762 
7763 	/* Unlocks zone->lock */
7764 	__accept_page(zone, &flags, page);
7765 }
7766 
7767 static bool try_to_accept_memory_one(struct zone *zone)
7768 {
7769 	unsigned long flags;
7770 	struct page *page;
7771 
7772 	spin_lock_irqsave(&zone->lock, flags);
7773 	page = list_first_entry_or_null(&zone->unaccepted_pages,
7774 					struct page, lru);
7775 	if (!page) {
7776 		spin_unlock_irqrestore(&zone->lock, flags);
7777 		return false;
7778 	}
7779 
7780 	/* Unlocks zone->lock */
7781 	__accept_page(zone, &flags, page);
7782 
7783 	return true;
7784 }
7785 
7786 static bool cond_accept_memory(struct zone *zone, unsigned int order,
7787 			       int alloc_flags)
7788 {
7789 	long to_accept, wmark;
7790 	bool ret = false;
7791 
7792 	if (list_empty(&zone->unaccepted_pages))
7793 		return false;
7794 
7795 	/* Bailout, since try_to_accept_memory_one() needs to take a lock */
7796 	if (alloc_flags & ALLOC_TRYLOCK)
7797 		return false;
7798 
7799 	wmark = promo_wmark_pages(zone);
7800 
7801 	/*
7802 	 * Watermarks have not been initialized yet.
7803 	 *
7804 	 * Accepting one MAX_ORDER page to ensure progress.
7805 	 */
7806 	if (!wmark)
7807 		return try_to_accept_memory_one(zone);
7808 
7809 	/* How much to accept to get to promo watermark? */
7810 	to_accept = wmark -
7811 		    (zone_page_state(zone, NR_FREE_PAGES) -
7812 		    __zone_watermark_unusable_free(zone, order, 0) -
7813 		    zone_page_state(zone, NR_UNACCEPTED));
7814 
7815 	while (to_accept > 0) {
7816 		if (!try_to_accept_memory_one(zone))
7817 			break;
7818 		ret = true;
7819 		to_accept -= MAX_ORDER_NR_PAGES;
7820 	}
7821 
7822 	return ret;
7823 }
7824 
7825 static bool __free_unaccepted(struct page *page)
7826 {
7827 	struct zone *zone = page_zone(page);
7828 	unsigned long flags;
7829 
7830 	if (!lazy_accept)
7831 		return false;
7832 
7833 	spin_lock_irqsave(&zone->lock, flags);
7834 	list_add_tail(&page->lru, &zone->unaccepted_pages);
7835 	account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
7836 	__mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
7837 	__SetPageUnaccepted(page);
7838 	spin_unlock_irqrestore(&zone->lock, flags);
7839 
7840 	return true;
7841 }
7842 
7843 #else
7844 
/* !CONFIG_UNACCEPTED_MEMORY stub: no page ever contains unaccepted memory. */
static bool page_contains_unaccepted(struct page *page, unsigned int order)
{
	return false;
}
7849 
/* !CONFIG_UNACCEPTED_MEMORY stub: there is never anything to accept. */
static bool cond_accept_memory(struct zone *zone, unsigned int order,
			       int alloc_flags)
{
	return false;
}
7855 
7856 static bool __free_unaccepted(struct page *page)
7857 {
7858 	BUILD_BUG();
7859 	return false;
7860 }
7861 
7862 #endif /* CONFIG_UNACCEPTED_MEMORY */
7863 
7864 struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order)
7865 {
7866 	/*
7867 	 * Do not specify __GFP_DIRECT_RECLAIM, since direct claim is not allowed.
7868 	 * Do not specify __GFP_KSWAPD_RECLAIM either, since wake up of kswapd
7869 	 * is not safe in arbitrary context.
7870 	 *
7871 	 * These two are the conditions for gfpflags_allow_spinning() being true.
7872 	 *
7873 	 * Specify __GFP_NOWARN since failing alloc_pages_nolock() is not a reason
7874 	 * to warn. Also warn would trigger printk() which is unsafe from
7875 	 * various contexts. We cannot use printk_deferred_enter() to mitigate,
7876 	 * since the running context is unknown.
7877 	 *
7878 	 * Specify __GFP_ZERO to make sure that call to kmsan_alloc_page() below
7879 	 * is safe in any context. Also zeroing the page is mandatory for
7880 	 * BPF use cases.
7881 	 *
7882 	 * Though __GFP_NOMEMALLOC is not checked in the code path below,
7883 	 * specify it here to highlight that alloc_pages_nolock()
7884 	 * doesn't want to deplete reserves.
7885 	 */
7886 	gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC | __GFP_COMP
7887 			| gfp_flags;
7888 	unsigned int alloc_flags = ALLOC_TRYLOCK;
7889 	struct alloc_context ac = { };
7890 	struct page *page;
7891 
7892 	VM_WARN_ON_ONCE(gfp_flags & ~__GFP_ACCOUNT);
7893 	/*
7894 	 * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is
7895 	 * unsafe in NMI. If spin_trylock() is called from hard IRQ the current
7896 	 * task may be waiting for one rt_spin_lock, but rt_spin_trylock() will
7897 	 * mark the task as the owner of another rt_spin_lock which will
7898 	 * confuse PI logic, so return immediately if called from hard IRQ or
7899 	 * NMI.
7900 	 *
7901 	 * Note, irqs_disabled() case is ok. This function can be called
7902 	 * from raw_spin_lock_irqsave region.
7903 	 */
7904 	if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
7905 		return NULL;
7906 
7907 	/* On UP, spin_trylock() always succeeds even when it is locked */
7908 	if (!IS_ENABLED(CONFIG_SMP) && in_nmi())
7909 		return NULL;
7910 
7911 	if (!pcp_allowed_order(order))
7912 		return NULL;
7913 
7914 	/* Bailout, since _deferred_grow_zone() needs to take a lock */
7915 	if (deferred_pages_enabled())
7916 		return NULL;
7917 
7918 	if (nid == NUMA_NO_NODE)
7919 		nid = numa_node_id();
7920 
7921 	prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac,
7922 			    &alloc_gfp, &alloc_flags);
7923 
7924 	/*
7925 	 * Best effort allocation from percpu free list.
7926 	 * If it's empty attempt to spin_trylock zone->lock.
7927 	 */
7928 	page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
7929 
7930 	/* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */
7931 
7932 	if (memcg_kmem_online() && page && (gfp_flags & __GFP_ACCOUNT) &&
7933 	    unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) {
7934 		__free_frozen_pages(page, order, FPI_TRYLOCK);
7935 		page = NULL;
7936 	}
7937 	trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
7938 	kmsan_alloc_page(page, order, alloc_gfp);
7939 	return page;
7940 }
7941 /**
7942  * alloc_pages_nolock - opportunistic reentrant allocation from any context
7943  * @gfp_flags: GFP flags. Only __GFP_ACCOUNT allowed.
7944  * @nid: node to allocate from
7945  * @order: allocation order size
7946  *
7947  * Allocates pages of a given order from the given node. This is safe to
7948  * call from any context where RCU is watching (from atomic, NMI, and also
7949  * reentrant allocator -> tracepoint -> alloc_pages_nolock_noprof).
7950  * Allocation is best effort and to be expected to fail easily so nobody should
7951  * rely on the success. Failures are not reported via warn_alloc().
7952  * See always fail conditions below.
7953  *
7954  * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN.
7955  * It means ENOMEM. There is no reason to call it again and expect !NULL.
7956  */
7957 struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order)
7958 {
7959 	struct page *page;
7960 
7961 	page = alloc_frozen_pages_nolock_noprof(gfp_flags, nid, order);
7962 	if (page)
7963 		set_page_refcounted(page);
7964 	return page;
7965 }
7966 EXPORT_SYMBOL_GPL(alloc_pages_nolock_noprof);
7967