xref: /linux/mm/internal.h (revision bba2c3615bd6cfee7456d1130f2e6b01b3f4e9ba)
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /* internal.h: mm/ internal definitions
3  *
4  * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
5  * Written by David Howells (dhowells@redhat.com)
6  */
7 #ifndef __MM_INTERNAL_H
8 #define __MM_INTERNAL_H
9 
10 #include <linux/fs.h>
11 #include <linux/khugepaged.h>
12 #include <linux/mm.h>
13 #include <linux/mm_inline.h>
14 #include <linux/mmu_notifier.h>
15 #include <linux/pagemap.h>
16 #include <linux/pagewalk.h>
17 #include <linux/rmap.h>
18 #include <linux/swap.h>
19 #include <linux/leafops.h>
20 #include <linux/tracepoint-defs.h>
21 
22 /* Internal core VMA manipulation functions. */
23 #include "vma.h"
24 
25 struct folio_batch;
26 
27 /*
28  * Maintains state across a page table move. The operation assumes both source
29  * and destination VMAs already exist and are specified by the user.
30  *
31  * Partial moves are permitted, but the old and new ranges must both reside
32  * within a VMA.
33  *
34  * mmap lock must be held in write and VMA write locks must be held on any VMA
35  * that is visible.
36  *
37  * Use the PAGETABLE_MOVE() macro to initialise this struct.
38  *
39  * The old_addr and new_addr fields are updated as the page table move is
40  * executed.
41  *
42  * NOTE: The page table move is affected by reading from [old_addr, old_end),
43  * and old_addr may be updated for better page table alignment, so len_in
44  * represents the length of the range being copied as specified by the user.
45  */
46 struct pagetable_move_control {
47 	struct vm_area_struct *old; /* Source VMA. */
48 	struct vm_area_struct *new; /* Destination VMA. */
49 	unsigned long old_addr; /* Address from which the move begins. */
50 	unsigned long old_end; /* Exclusive address at which old range ends. */
51 	unsigned long new_addr; /* Address to move page tables to. */
52 	unsigned long len_in; /* Bytes to remap specified by user. */
53 
54 	bool need_rmap_locks; /* Do rmap locks need to be taken? */
55 	bool for_stack; /* Is this an early temp stack being moved? */
56 };
57 
/*
 * Declare a struct pagetable_move_control called @name and initialise it for
 * moving @len_ bytes from @old_addr_ in VMA @old_ to @new_addr_ in VMA @new_.
 *
 * Members that are not named in the initialiser are zero-initialised.
 * Note that @old_addr_ and @len_ are each expanded twice.
 */
#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_)	\
	struct pagetable_move_control name = {				\
		.old = (old_),						\
		.new = (new_),						\
		.old_addr = (old_addr_),				\
		.old_end = (old_addr_) + (len_),			\
		.new_addr = (new_addr_),				\
		.len_in = (len_),					\
	}
67 
/*
 * Flags that influence only watermark checking and reclaim behaviour.
 * The MM uses this mask to honour the caller's constraints on IO, FS and
 * watermark checking while disregarding placement hints such as HIGHMEM
 * usage.
 */
#define GFP_RECLAIM_MASK						\
	(__GFP_RECLAIM | __GFP_HIGH | __GFP_IO | __GFP_FS |		\
	 __GFP_NOWARN | __GFP_RETRY_MAYFAIL | __GFP_NOFAIL |		\
	 __GFP_NORETRY | __GFP_MEMALLOC | __GFP_NOMEMALLOC |		\
	 __GFP_NOLOCKDEP)

/* GFP flags permitted during early boot: everything but reclaim, IO and FS. */
#define GFP_BOOT_MASK \
	(__GFP_BITS_MASK & ~(__GFP_RECLAIM | __GFP_IO | __GFP_FS))

/* Flags controlling cpuset and node placement constraints of an allocation. */
#define GFP_CONSTRAINT_MASK	(__GFP_HARDWALL | __GFP_THISNODE)

/* Flags that must not be used with a slab allocator. */
#define GFP_SLAB_BUG_MASK \
	(__GFP_DMA32 | __GFP_HIGHMEM | ~__GFP_BITS_MASK)
87 
/*
 * Like WARN_ON_ONCE(), except that no warning is issued when @gfp contains
 * __GFP_NOWARN.
 *
 * @cond: Condition to test; evaluated exactly once.
 * @gfp:  GFP flags of the allocation; evaluated exactly once. The argument
 *        is parenthesised so that compound expressions such as "a | b" are
 *        tested as a whole against __GFP_NOWARN.
 *
 * The "already warned" state is kept per expansion site. A suppressed
 * warning (because of __GFP_NOWARN) does not consume the one-shot.
 *
 * Evaluates to @cond normalised to 0/1, whether or not a warning was issued.
 */
#define WARN_ON_ONCE_GFP(cond, gfp)	({				\
	static bool __section(".data..once") __warned;			\
	int __ret_warn_once = !!(cond);					\
									\
	if (unlikely(!((gfp) & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \
		__warned = true;					\
		WARN_ON(1);						\
	}								\
	unlikely(__ret_warn_once);					\
})
102 
103 void page_writeback_init(void);
104 
/*
 * A 16GB hugetlb folio mapped by PTEs for every one of its 4kB pages would
 * have an nr_pages_mapped of 0x400000. Place the ENTIRELY_MAPPED bit above
 * that range rather than at 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently leaves
 * nr_pages_mapped at 0, but this avoids surprises should it participate
 * later.
 *
 * FOLIO_PAGES_MAPPED masks all the bits below ENTIRELY_MAPPED.
 */
#define ENTIRELY_MAPPED		0x800000
#define FOLIO_PAGES_MAPPED	(ENTIRELY_MAPPED - 1)
113 
/*
 * Flags accepted by __show_mem() and show_free_areas() for suppressing
 * output in various contexts.
 */
/* Suppress output for disallowed nodes. */
#define SHOW_MEM_FILTER_NODES		(0x0001u)
119 
120 /*
121  * How many individual pages have an elevated _mapcount.  Excludes
122  * the folio's entire_mapcount.
123  *
124  * Don't use this function outside of debugging code.
125  */
126 static inline int folio_nr_pages_mapped(const struct folio *folio)
127 {
128 	if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT))
129 		return -1;
130 	return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED;
131 }
132 
133 /*
134  * Retrieve the first entry of a folio based on a provided entry within the
135  * folio. We cannot rely on folio->swap as there is no guarantee that it has
136  * been initialized. Used for calling arch_swap_restore()
137  */
138 static inline swp_entry_t folio_swap(swp_entry_t entry,
139 		const struct folio *folio)
140 {
141 	swp_entry_t swap = {
142 		.val = ALIGN_DOWN(entry.val, folio_nr_pages(folio)),
143 	};
144 
145 	return swap;
146 }
147 
148 static inline void *folio_raw_mapping(const struct folio *folio)
149 {
150 	unsigned long mapping = (unsigned long)folio->mapping;
151 
152 	return (void *)(mapping & ~FOLIO_MAPPING_FLAGS);
153 }
154 
155 /*
156  * This is a file-backed mapping, and is about to be memory mapped - invoke its
157  * mmap hook and safely handle error conditions. On error, VMA hooks will be
158  * mutated.
159  *
160  * @file: File which backs the mapping.
161  * @vma:  VMA which we are mapping.
162  *
163  * Returns: 0 if success, error otherwise.
164  */
165 static inline int mmap_file(struct file *file, struct vm_area_struct *vma)
166 {
167 	int err = vfs_mmap(file, vma);
168 
169 	if (likely(!err))
170 		return 0;
171 
172 	/*
173 	 * OK, we tried to call the file hook for mmap(), but an error
174 	 * arose. The mapping is in an inconsistent state and we must not invoke
175 	 * any further hooks on it.
176 	 */
177 	vma->vm_ops = &vma_dummy_vm_ops;
178 
179 	return err;
180 }
181 
182 /*
183  * If the VMA has a close hook then close it, and since closing it might leave
184  * it in an inconsistent state which makes the use of any hooks suspect, clear
185  * them down by installing dummy empty hooks.
186  */
187 static inline void vma_close(struct vm_area_struct *vma)
188 {
189 	if (vma->vm_ops && vma->vm_ops->close) {
190 		vma->vm_ops->close(vma);
191 
192 		/*
193 		 * The mapping is in an inconsistent state, and no further hooks
194 		 * may be invoked upon it.
195 		 */
196 		vma->vm_ops = &vma_dummy_vm_ops;
197 	}
198 }
199 
200 /* unmap_vmas is in mm/memory.c */
201 void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap);
202 
203 #ifdef CONFIG_MMU
204 
205 static inline void get_anon_vma(struct anon_vma *anon_vma)
206 {
207 	atomic_inc(&anon_vma->refcount);
208 }
209 
210 void __put_anon_vma(struct anon_vma *anon_vma);
211 
212 static inline void put_anon_vma(struct anon_vma *anon_vma)
213 {
214 	if (atomic_dec_and_test(&anon_vma->refcount))
215 		__put_anon_vma(anon_vma);
216 }
217 
218 static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
219 {
220 	down_write(&anon_vma->root->rwsem);
221 }
222 
223 static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
224 {
225 	return down_write_trylock(&anon_vma->root->rwsem);
226 }
227 
228 static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
229 {
230 	up_write(&anon_vma->root->rwsem);
231 }
232 
233 static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
234 {
235 	down_read(&anon_vma->root->rwsem);
236 }
237 
238 static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
239 {
240 	return down_read_trylock(&anon_vma->root->rwsem);
241 }
242 
243 static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
244 {
245 	up_read(&anon_vma->root->rwsem);
246 }
247 
248 struct anon_vma *folio_get_anon_vma(const struct folio *folio);
249 
/*
 * The kinds of operation which modify VMAs. anon_vma_clone() takes one of
 * these to identify the operation on whose behalf it is called.
 */
enum vma_operation {
	VMA_OP_SPLIT,
	VMA_OP_MERGE_UNFAULTED,
	VMA_OP_REMAP,
	VMA_OP_FORK,
};
257 
258 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
259 	enum vma_operation operation);
260 int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma);
261 int  __anon_vma_prepare(struct vm_area_struct *vma);
262 void unlink_anon_vmas(struct vm_area_struct *vma);
263 
264 static inline int anon_vma_prepare(struct vm_area_struct *vma)
265 {
266 	if (likely(vma->anon_vma))
267 		return 0;
268 
269 	return __anon_vma_prepare(vma);
270 }
271 
272 /* Flags for folio_pte_batch(). */
273 typedef int __bitwise fpb_t;
274 
275 /* Compare PTEs respecting the dirty bit. */
276 #define FPB_RESPECT_DIRTY		((__force fpb_t)BIT(0))
277 
278 /* Compare PTEs respecting the soft-dirty bit. */
279 #define FPB_RESPECT_SOFT_DIRTY		((__force fpb_t)BIT(1))
280 
281 /* Compare PTEs respecting the writable bit. */
282 #define FPB_RESPECT_WRITE		((__force fpb_t)BIT(2))
283 
284 /*
285  * Merge PTE write bits: if any PTE in the batch is writable, modify the
286  * PTE at @ptentp to be writable.
287  */
288 #define FPB_MERGE_WRITE			((__force fpb_t)BIT(3))
289 
290 /*
291  * Merge PTE young and dirty bits: if any PTE in the batch is young or dirty,
292  * modify the PTE at @ptentp to be young or dirty, respectively.
293  */
294 #define FPB_MERGE_YOUNG_DIRTY		((__force fpb_t)BIT(4))
295 
296 static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
297 {
298 	if (!(flags & FPB_RESPECT_DIRTY))
299 		pte = pte_mkclean(pte);
300 	if (likely(!(flags & FPB_RESPECT_SOFT_DIRTY)))
301 		pte = pte_clear_soft_dirty(pte);
302 	if (likely(!(flags & FPB_RESPECT_WRITE)))
303 		pte = pte_wrprotect(pte);
304 	return pte_mkold(pte);
305 }
306 
307 /**
308  * folio_pte_batch_flags - detect a PTE batch for a large folio
309  * @folio: The large folio to detect a PTE batch for.
310  * @vma: The VMA. Only relevant with FPB_MERGE_WRITE, otherwise can be NULL.
311  * @ptep: Page table pointer for the first entry.
312  * @ptentp: Pointer to a COPY of the first page table entry whose flags this
313  *	    function updates based on @flags if appropriate.
314  * @max_nr: The maximum number of table entries to consider.
315  * @flags: Flags to modify the PTE batch semantics.
316  *
317  * Detect a PTE batch: consecutive (present) PTEs that map consecutive
318  * pages of the same large folio in a single VMA and a single page table.
319  *
320  * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
321  * the accessed bit, writable bit, dirty bit (unless FPB_RESPECT_DIRTY is set)
322  * and soft-dirty bit (unless FPB_RESPECT_SOFT_DIRTY is set).
323  *
324  * @ptep must map any page of the folio. max_nr must be at least one and
325  * must be limited by the caller so scanning cannot exceed a single VMA and
326  * a single page table.
327  *
328  * Depending on the FPB_MERGE_* flags, the pte stored at @ptentp will
329  * be updated: it's crucial that a pointer to a COPY of the first
330  * page table entry, obtained through ptep_get(), is provided as @ptentp.
331  *
332  * This function will be inlined to optimize based on the input parameters;
333  * consider using folio_pte_batch() instead if applicable.
334  *
335  * Return: the number of table entries in the batch.
336  */
337 static inline unsigned int folio_pte_batch_flags(struct folio *folio,
338 		struct vm_area_struct *vma, pte_t *ptep, pte_t *ptentp,
339 		unsigned int max_nr, fpb_t flags)
340 {
341 	bool any_writable = false, any_young = false, any_dirty = false;
342 	pte_t expected_pte, pte = *ptentp;
343 	unsigned int nr, cur_nr;
344 
345 	VM_WARN_ON_FOLIO(!pte_present(pte), folio);
346 	VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
347 	VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
348 	/*
349 	 * Ensure this is a pointer to a copy not a pointer into a page table.
350 	 * If this is a stack value, it won't be a valid virtual address, but
351 	 * that's fine because it also cannot be pointing into the page table.
352 	 */
353 	VM_WARN_ON(virt_addr_valid(ptentp) && PageTable(virt_to_page(ptentp)));
354 
355 	/* Limit max_nr to the actual remaining PFNs in the folio we could batch. */
356 	max_nr = min_t(unsigned long, max_nr,
357 		       folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte));
358 
359 	nr = pte_batch_hint(ptep, pte);
360 	expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
361 	ptep = ptep + nr;
362 
363 	while (nr < max_nr) {
364 		pte = ptep_get(ptep);
365 
366 		if (!pte_same(__pte_batch_clear_ignored(pte, flags), expected_pte))
367 			break;
368 
369 		if (flags & FPB_MERGE_WRITE)
370 			any_writable |= pte_write(pte);
371 		if (flags & FPB_MERGE_YOUNG_DIRTY) {
372 			any_young |= pte_young(pte);
373 			any_dirty |= pte_dirty(pte);
374 		}
375 
376 		cur_nr = pte_batch_hint(ptep, pte);
377 		expected_pte = pte_advance_pfn(expected_pte, cur_nr);
378 		ptep += cur_nr;
379 		nr += cur_nr;
380 	}
381 
382 	if (any_writable)
383 		*ptentp = pte_mkwrite(*ptentp, vma);
384 	if (any_young)
385 		*ptentp = pte_mkyoung(*ptentp);
386 	if (any_dirty)
387 		*ptentp = pte_mkdirty(*ptentp);
388 
389 	return min(nr, max_nr);
390 }
391 
392 unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
393 		unsigned int max_nr);
394 
395 /**
396  * pte_move_swp_offset - Move the swap entry offset field of a swap pte
397  *	 forward or backward by delta
398  * @pte: The initial pte state; must be a swap entry
399  * @delta: The direction and the offset we are moving; forward if delta
400  *	 is positive; backward if delta is negative
401  *
402  * Moves the swap offset, while maintaining all other fields, including
403  * swap type, and any swp pte bits. The resulting pte is returned.
404  */
405 static inline pte_t pte_move_swp_offset(pte_t pte, long delta)
406 {
407 	const softleaf_t entry = softleaf_from_pte(pte);
408 	pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
409 						   (swp_offset(entry) + delta)));
410 
411 	if (pte_swp_soft_dirty(pte))
412 		new = pte_swp_mksoft_dirty(new);
413 	if (pte_swp_exclusive(pte))
414 		new = pte_swp_mkexclusive(new);
415 	if (pte_swp_uffd_wp(pte))
416 		new = pte_swp_mkuffd_wp(new);
417 
418 	return new;
419 }
420 
421 
422 /**
423  * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
424  * @pte: The initial pte state; must be a swap entry.
425  *
426  * Increments the swap offset, while maintaining all other fields, including
427  * swap type, and any swp pte bits. The resulting pte is returned.
428  */
429 static inline pte_t pte_next_swp_offset(pte_t pte)
430 {
431 	return pte_move_swp_offset(pte, 1);
432 }
433 
434 /**
435  * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries
436  * @start_ptep: Page table pointer for the first entry.
437  * @max_nr: The maximum number of table entries to consider.
438  * @pte: Page table entry for the first entry.
439  *
440  * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs
441  * containing swap entries all with consecutive offsets and targeting the same
442  * swap type, all with matching swp pte bits.
443  *
444  * max_nr must be at least one and must be limited by the caller so scanning
445  * cannot exceed a single page table.
446  *
447  * Return: the number of table entries in the batch.
448  */
449 static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
450 {
451 	pte_t expected_pte = pte_next_swp_offset(pte);
452 	const pte_t *end_ptep = start_ptep + max_nr;
453 	pte_t *ptep = start_ptep + 1;
454 
455 	VM_WARN_ON(max_nr < 1);
456 	VM_WARN_ON(!softleaf_is_swap(softleaf_from_pte(pte)));
457 
458 	while (ptep < end_ptep) {
459 		pte = ptep_get(ptep);
460 
461 		if (!pte_same(pte, expected_pte))
462 			break;
463 		expected_pte = pte_next_swp_offset(expected_pte);
464 		ptep++;
465 	}
466 
467 	return ptep - start_ptep;
468 }
469 #endif /* CONFIG_MMU */
470 
471 void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
472 						int nr_throttled);
473 static inline void acct_reclaim_writeback(struct folio *folio)
474 {
475 	pg_data_t *pgdat = folio_pgdat(folio);
476 	int nr_throttled = atomic_read(&pgdat->nr_writeback_throttled);
477 
478 	if (nr_throttled)
479 		__acct_reclaim_writeback(pgdat, folio, nr_throttled);
480 }
481 
482 static inline void wake_throttle_isolated(pg_data_t *pgdat)
483 {
484 	wait_queue_head_t *wqh;
485 
486 	wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED];
487 	if (waitqueue_active(wqh))
488 		wake_up(wqh);
489 }
490 
491 vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf);
492 static inline vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
493 {
494 	vm_fault_t ret = __vmf_anon_prepare(vmf);
495 
496 	if (unlikely(ret & VM_FAULT_RETRY))
497 		vma_end_read(vmf->vma);
498 	return ret;
499 }
500 
501 vm_fault_t do_swap_page(struct vm_fault *vmf);
502 void folio_rotate_reclaimable(struct folio *folio);
503 bool __folio_end_writeback(struct folio *folio);
504 void deactivate_file_folio(struct folio *folio);
505 void folio_activate(struct folio *folio);
506 
507 void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *desc);
508 
509 void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
510 
511 /**
512  * sync_with_folio_pmd_zap - sync with concurrent zapping of a folio PMD
513  * @mm: The mm_struct.
514  * @pmdp: Pointer to the pmd that was found to be pmd_none().
515  *
516  * When we find a pmd_none() while unmapping a folio without holding the PTL,
517  * zap_huge_pmd() may have cleared the PMD but not yet modified the folio to
518  * indicate that it's unmapped. Skipping the PMD without synchronization could
519  * make folio unmapping code assume that unmapping failed.
520  *
521  * Wait for concurrent zapping to complete by grabbing the PTL.
522  */
523 static inline void sync_with_folio_pmd_zap(struct mm_struct *mm, pmd_t *pmdp)
524 {
525 	spinlock_t *ptl = pmd_lock(mm, pmdp);
526 
527 	spin_unlock(ptl);
528 }
529 
530 struct zap_details;
531 void zap_vma_range_batched(struct mmu_gather *tlb,
532 		struct vm_area_struct *vma, unsigned long addr,
533 		unsigned long size, struct zap_details *details);
534 int zap_vma_for_reaping(struct vm_area_struct *vma);
535 int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
536 			   gfp_t gfp);
537 
538 void page_cache_ra_order(struct readahead_control *, struct file_ra_state *);
539 void force_page_cache_ra(struct readahead_control *, unsigned long nr);
540 static inline void force_page_cache_readahead(struct address_space *mapping,
541 		struct file *file, pgoff_t index, unsigned long nr_to_read)
542 {
543 	DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index);
544 	force_page_cache_ra(&ractl, nr_to_read);
545 }
546 
547 unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
548 		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
549 unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
550 		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
551 int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
552 bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
553 		loff_t end);
554 long mapping_evict_folio(struct address_space *mapping, struct folio *folio);
555 unsigned long mapping_try_invalidate(struct address_space *mapping,
556 		pgoff_t start, pgoff_t end, unsigned long *nr_failed);
557 
558 /**
559  * folio_evictable - Test whether a folio is evictable.
560  * @folio: The folio to test.
561  *
562  * Test whether @folio is evictable -- i.e., should be placed on
563  * active/inactive lists vs unevictable list.
564  *
565  * Reasons folio might not be evictable:
566  * 1. folio's mapping marked unevictable
567  * 2. One of the pages in the folio is part of an mlocked VMA
568  */
569 static inline bool folio_evictable(struct folio *folio)
570 {
571 	bool ret;
572 
573 	/* Prevent address_space of inode and swap cache from being freed */
574 	rcu_read_lock();
575 	ret = !mapping_unevictable(folio_mapping(folio)) &&
576 			!folio_test_mlocked(folio);
577 	rcu_read_unlock();
578 	return ret;
579 }
580 
/*
 * Make a non-refcounted page (->_refcount == 0) refcounted by setting its
 * count to one. With the VM debug checks enabled, a tail page or a page
 * whose count is already non-zero triggers a bug.
 */
static inline void set_page_refcounted(struct page *page)
{
	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(page_ref_count(page), page);

	set_page_count(page, 1);
}
591 
/*
 * Apply set_page_refcounted() to each of the @nr_pages pages whose pfns
 * start at the pfn of @page and increase by one.
 */
static inline void set_pages_refcounted(struct page *page, unsigned long nr_pages)
{
	const unsigned long first_pfn = page_to_pfn(page);
	unsigned long i;

	for (i = 0; i < nr_pages; i++)
		set_page_refcounted(pfn_to_page(first_pfn + i));
}
599 
600 /*
601  * Return true if a folio needs ->release_folio() calling upon it.
602  */
603 static inline bool folio_needs_release(struct folio *folio)
604 {
605 	struct address_space *mapping = folio_mapping(folio);
606 
607 	return folio_has_private(folio) ||
608 		(mapping && mapping_release_always(mapping));
609 }
610 
611 extern unsigned long highest_memmap_pfn;
612 
/*
 * Maximum number of reclaim retries without progress before the OOM
 * killer is considered the only way forward.
 */
#define MAX_RECLAIM_RETRIES	16
618 
619 /*
620  * in mm/vmscan.c:
621  */
622 bool folio_isolate_lru(struct folio *folio);
623 void folio_putback_lru(struct folio *folio);
624 extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
625 int user_proactive_reclaim(char *buf,
626 			   struct mem_cgroup *memcg, pg_data_t *pgdat);
627 
628 /*
629  * in mm/rmap.c:
630  */
631 pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
632 
633 /*
634  * in mm/khugepaged.c
635  */
636 void set_recommended_min_free_kbytes(void);
637 
638 /*
639  * in mm/page_alloc.c
640  */
/*
 * Shift @x left by (PAGE_SHIFT - 10). Assuming PAGE_SHIFT is log2 of the
 * page size in bytes, this scales a number of pages to kilobytes.
 */
#define K(x)	((x) << (PAGE_SHIFT - 10))
642 
643 extern char * const zone_names[MAX_NR_ZONES];
644 
645 /* perform sanity checks on struct pages being allocated or freed */
646 DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
647 
648 extern int min_free_kbytes;
649 extern int defrag_mode;
650 
651 void setup_per_zone_wmarks(void);
652 void calculate_min_free_kbytes(void);
653 int __meminit init_per_zone_wmark_min(void);
654 void page_alloc_sysctl_init(void);
655 
656 /*
657  * Structure for holding the mostly immutable allocation parameters passed
658  * between functions involved in allocations, including the alloc_pages*
659  * family of functions.
660  *
661  * nodemask, migratetype and highest_zoneidx are initialized only once in
662  * __alloc_pages() and then never change.
663  *
664  * zonelist, preferred_zone and highest_zoneidx are set first in
665  * __alloc_pages() for the fast path, and might be later changed
666  * in __alloc_pages_slowpath(). All other functions pass the whole structure
667  * by a const pointer.
668  */
669 struct alloc_context {
670 	struct zonelist *zonelist;
671 	nodemask_t *nodemask;
672 	struct zoneref *preferred_zoneref;
673 	int migratetype;
674 
675 	/*
676 	 * highest_zoneidx represents highest usable zone index of
677 	 * the allocation request. Due to the nature of the zone,
678 	 * memory on lower zone than the highest_zoneidx will be
679 	 * protected by lowmem_reserve[highest_zoneidx].
680 	 *
681 	 * highest_zoneidx is also used by reclaim/compaction to limit
682 	 * the target zone since higher zone than this index cannot be
683 	 * usable for this allocation request.
684 	 */
685 	enum zone_type highest_zoneidx;
686 	bool spread_dirty_pages;
687 };
688 
/*
 * Return the order of a free page in the buddy system, which is kept in
 * page_private(page).
 *
 * In general the caller must hold page_zone(page)->lock, so that the page
 * cannot be allocated in parallel and garbage be returned as the order. A
 * caller that does not hold the lock must either guarantee that the page
 * cannot be allocated or merged in parallel, or must handle invalid values
 * gracefully and use buddy_order_unsafe() instead.
 *
 * The caller is responsible for checking PageBuddy().
 */
static inline unsigned int buddy_order(struct page *page)
{
	return page_private(page);
}
702 
/*
 * Variant of buddy_order() for callers that cannot afford to hold the zone
 * lock. The caller should check PageBuddy() first to keep the race window
 * small, and must handle invalid values gracefully.
 *
 * READ_ONCE() is used for the sake of callers that assign the result to a
 * local variable and, for example, range-check it before use: without it
 * the compiler could drop the variable and read page_private(page) several
 * times, so that the check and the actual use might see different values.
 */
#define buddy_order_unsafe(page)	READ_ONCE(page_private(page))
715 
716 /*
717  * This function checks whether a page is free && is the buddy
718  * we can coalesce a page and its buddy if
719  * (a) the buddy is not in a hole (check before calling!) &&
720  * (b) the buddy is in the buddy system &&
721  * (c) a page and its buddy have the same order &&
722  * (d) a page and its buddy are in the same zone.
723  *
724  * For recording whether a page is in the buddy system, we set PageBuddy.
725  * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
726  *
727  * For recording page's order, we use page_private(page).
728  */
729 static inline bool page_is_buddy(struct page *page, struct page *buddy,
730 				 unsigned int order)
731 {
732 	if (!page_is_guard(buddy) && !PageBuddy(buddy))
733 		return false;
734 
735 	if (buddy_order(buddy) != order)
736 		return false;
737 
738 	/*
739 	 * zone check is done late to avoid uselessly calculating
740 	 * zone/node ids for pages that could never merge.
741 	 */
742 	if (page_zone_id(page) != page_zone_id(buddy))
743 		return false;
744 
745 	VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
746 
747 	return true;
748 }
749 
/*
 * Locate the struct page for both the matching buddy in our
 * pair (buddy1) and the combined O(n+1) page they form (page).
 *
 * 1) Any buddy B1 will have an order O twin B2 which satisfies
 * the following equation:
 *     B2 = B1 ^ (1 << O)
 * For example, if the starting buddy (buddy2) is #8 its order
 * 1 buddy is #10:
 *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
 *
 * 2) Any buddy B will have an order O+1 parent P which
 * satisfies the following equation:
 *     P = B & ~(1 << O)
 *
 * Assumption: *_mem_map is contiguous at least up to MAX_PAGE_ORDER
 *
 * @page_pfn: pfn of the page whose buddy is wanted.
 * @order:    order of the page.
 *
 * Returns the pfn of the order-@order buddy of @page_pfn.
 *
 * The bit is built as an unsigned long so that the shift is done in the
 * width of a pfn rather than in int, where a large @order would overflow
 * and the result would be sign-extended into the upper pfn bits.
 */
static inline unsigned long
__find_buddy_pfn(unsigned long page_pfn, unsigned int order)
{
	return page_pfn ^ (1UL << order);
}
772 
773 /*
774  * Find the buddy of @page and validate it.
775  * @page: The input page
776  * @pfn: The pfn of the page, it saves a call to page_to_pfn() when the
777  *       function is used in the performance-critical __free_one_page().
778  * @order: The order of the page
779  * @buddy_pfn: The output pointer to the buddy pfn, it also saves a call to
780  *             page_to_pfn().
781  *
782  * The found buddy can be a non PageBuddy, out of @page's zone, or its order is
783  * not the same as @page. The validation is necessary before use it.
784  *
785  * Return: the found buddy page or NULL if not found.
786  */
787 static inline struct page *find_buddy_page_pfn(struct page *page,
788 			unsigned long pfn, unsigned int order, unsigned long *buddy_pfn)
789 {
790 	unsigned long __buddy_pfn = __find_buddy_pfn(pfn, order);
791 	struct page *buddy;
792 
793 	buddy = page + (__buddy_pfn - pfn);
794 	if (buddy_pfn)
795 		*buddy_pfn = __buddy_pfn;
796 
797 	if (page_is_buddy(page, buddy, order))
798 		return buddy;
799 	return NULL;
800 }
801 
802 extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
803 				unsigned long end_pfn, struct zone *zone);
804 
805 static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
806 				unsigned long end_pfn, struct zone *zone)
807 {
808 	if (zone->contiguous)
809 		return pfn_to_page(start_pfn);
810 
811 	return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
812 }
813 
814 void set_zone_contiguous(struct zone *zone);
815 bool pfn_range_intersects_zones(int nid, unsigned long start_pfn,
816 			   unsigned long nr_pages);
817 
818 static inline void clear_zone_contiguous(struct zone *zone)
819 {
820 	zone->contiguous = false;
821 }
822 
823 extern int __isolate_free_page(struct page *page, unsigned int order);
824 extern void __putback_isolated_page(struct page *page, unsigned int order,
825 				    int mt);
826 extern void memblock_free_pages(unsigned long pfn, unsigned int order);
827 extern void __free_pages_core(struct page *page, unsigned int order,
828 		enum meminit_context context);
829 
830 /*
831  * This will have no effect, other than possibly generating a warning, if the
832  * caller passes in a non-large folio.
833  */
834 static inline void folio_set_order(struct folio *folio, unsigned int order)
835 {
836 	if (WARN_ON_ONCE(!order || !folio_test_large(folio)))
837 		return;
838 	VM_WARN_ON_ONCE(order > MAX_FOLIO_ORDER);
839 
840 	folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order;
841 #ifdef NR_PAGES_IN_LARGE_FOLIO
842 	folio->_nr_pages = 1U << order;
843 #endif
844 }
845 
846 bool __folio_unqueue_deferred_split(struct folio *folio);
847 static inline bool folio_unqueue_deferred_split(struct folio *folio)
848 {
849 	if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio))
850 		return false;
851 
852 	/*
853 	 * At this point, there is no one trying to add the folio to
854 	 * deferred_list. If folio is not in deferred_list, it's safe
855 	 * to check without acquiring the list_lru lock.
856 	 */
857 	if (data_race(list_empty(&folio->_deferred_list)))
858 		return false;
859 
860 	return __folio_unqueue_deferred_split(folio);
861 }
862 
/*
 * Cast @page to a folio and, if it is non-NULL and large, flag it as
 * large-rmappable. Returns the folio (NULL when @page is NULL).
 */
static inline struct folio *page_rmappable_folio(struct page *page)
{
	struct folio *folio = (struct folio *)page;

	if (!folio || !folio_test_large(folio))
		return folio;

	folio_set_large_rmappable(folio);
	return folio;
}
871 
872 static inline void prep_compound_head(struct page *page, unsigned int order)
873 {
874 	struct folio *folio = (struct folio *)page;
875 
876 	folio_set_order(folio, order);
877 	atomic_set(&folio->_large_mapcount, -1);
878 	if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
879 		atomic_set(&folio->_nr_pages_mapped, 0);
880 	if (IS_ENABLED(CONFIG_MM_ID)) {
881 		folio->_mm_ids = 0;
882 		folio->_mm_id_mapcount[0] = -1;
883 		folio->_mm_id_mapcount[1] = -1;
884 	}
885 	if (IS_ENABLED(CONFIG_64BIT) || order > 1) {
886 		atomic_set(&folio->_pincount, 0);
887 		atomic_set(&folio->_entire_mapcount, -1);
888 	}
889 	if (order > 1)
890 		INIT_LIST_HEAD(&folio->_deferred_list);
891 }
892 
893 static inline void prep_compound_tail(struct page *tail,
894 		const struct page *head, unsigned int order)
895 {
896 	tail->mapping = TAIL_MAPPING;
897 	set_compound_head(tail, head, order);
898 	set_page_private(tail, 0);
899 }
900 
901 static inline void init_compound_tail(struct page *tail,
902 		const struct page *head, unsigned int order, struct zone *zone)
903 {
904 	atomic_set(&tail->_mapcount, -1);
905 	set_page_node(tail, zone_to_nid(zone));
906 	set_page_zone(tail, zone_idx(zone));
907 	prep_compound_tail(tail, head, order);
908 }
909 
910 void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags);
911 extern bool free_pages_prepare(struct page *page, unsigned int order);
912 
913 extern int user_min_free_kbytes;
914 
915 struct page *__alloc_frozen_pages_noprof(gfp_t, unsigned int order, int nid,
916 		nodemask_t *);
917 #define __alloc_frozen_pages(...) \
918 	alloc_hooks(__alloc_frozen_pages_noprof(__VA_ARGS__))
919 void free_frozen_pages(struct page *page, unsigned int order);
920 void free_unref_folios(struct folio_batch *fbatch);
921 
922 #ifdef CONFIG_NUMA
923 struct page *alloc_frozen_pages_noprof(gfp_t, unsigned int order);
924 #else
/*
 * !CONFIG_NUMA variant: forward to __alloc_frozen_pages_noprof() using the
 * node reported by numa_node_id() and no nodemask.
 */
static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order)
{
	return __alloc_frozen_pages_noprof(gfp, order, numa_node_id(), NULL);
}
929 #endif
930 
931 #define alloc_frozen_pages(...) \
932 	alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__))
933 
934 struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order);
935 #define alloc_frozen_pages_nolock(...) \
936 	alloc_hooks(alloc_frozen_pages_nolock_noprof(__VA_ARGS__))
937 void free_frozen_pages_nolock(struct page *page, unsigned int order);
938 
939 extern void zone_pcp_reset(struct zone *zone);
940 extern void zone_pcp_disable(struct zone *zone);
941 extern void zone_pcp_enable(struct zone *zone);
942 extern void zone_pcp_init(struct zone *zone);
943 
944 extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
945 			  phys_addr_t min_addr,
946 			  int nid, bool exact_nid);
947 
948 void memmap_init_range(unsigned long, int, unsigned long, unsigned long,
949 		unsigned long, enum meminit_context, struct vmem_altmap *, int,
950 		bool);
951 
952 /*
953  * mm/sparse.c
954  */
955 #ifdef CONFIG_SPARSEMEM
956 void sparse_init(void);
957 int sparse_index_init(unsigned long section_nr, int nid);
958 
959 static inline void sparse_init_one_section(struct mem_section *ms,
960 		unsigned long pnum, struct page *mem_map,
961 		struct mem_section_usage *usage, unsigned long flags)
962 {
963 	unsigned long coded_mem_map;
964 
965 	BUILD_BUG_ON(SECTION_MAP_LAST_BIT > PFN_SECTION_SHIFT);
966 
967 	/*
968 	 * We encode the start PFN of the section into the mem_map such that
969 	 * page_to_pfn() on !CONFIG_SPARSEMEM_VMEMMAP can simply subtract it
970 	 * from the page pointer to obtain the PFN.
971 	 */
972 	coded_mem_map = (unsigned long)(mem_map - section_nr_to_pfn(pnum));
973 	VM_WARN_ON_ONCE(coded_mem_map & ~SECTION_MAP_MASK);
974 
975 	ms->section_mem_map &= ~SECTION_MAP_MASK;
976 	ms->section_mem_map |= coded_mem_map;
977 	ms->section_mem_map |= flags | SECTION_HAS_MEM_MAP;
978 	ms->usage = usage;
979 }
980 
981 static inline void __section_mark_present(struct mem_section *ms,
982 		unsigned long section_nr)
983 {
984 	if (section_nr > __highest_present_section_nr)
985 		__highest_present_section_nr = section_nr;
986 
987 	ms->section_mem_map |= SECTION_MARKED_PRESENT;
988 }
989 #else
990 static inline void sparse_init(void) {}
991 #endif /* CONFIG_SPARSEMEM */
992 
993 /*
994  * mm/sparse-vmemmap.c
995  */
996 #ifdef CONFIG_SPARSEMEM_VMEMMAP
997 void sparse_init_subsection_map(unsigned long pfn, unsigned long nr_pages);
998 #else
999 static inline void sparse_init_subsection_map(unsigned long pfn,
1000 		unsigned long nr_pages)
1001 {
1002 }
1003 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
1004 
1005 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
1006 
1007 /*
1008  * in mm/compaction.c
1009  */
1010 /*
1011  * compact_control is used to track pages being migrated and the free pages
1012  * they are being migrated to during memory compaction. The free_pfn starts
1013  * at the end of a zone and migrate_pfn begins at the start. Movable pages
1014  * are moved to the end of a zone during a compaction run and the run
1015  * completes when free_pfn <= migrate_pfn
1016  */
struct compact_control {
	struct list_head freepages[NR_PAGE_ORDERS];	/* List of free pages to migrate to */
	struct list_head migratepages;	/* List of pages being migrated */
	unsigned int nr_freepages;	/* Number of isolated free pages */
	unsigned int nr_migratepages;	/* Number of pages to migrate */
	unsigned long free_pfn;		/* isolate_freepages search base */
	/*
	 * Acts as an in/out parameter to page isolation for migration.
	 * isolate_migratepages uses it as a search base.
	 * isolate_migratepages_block will update the value to the next pfn
	 * after the last isolated one.
	 */
	unsigned long migrate_pfn;
	unsigned long fast_start_pfn;	/* a pfn to start linear scan from */
	/* NOTE(review): presumably the zone being compacted -- confirm */
	struct zone *zone;
	/*
	 * NOTE(review): by their names these two accumulate how much the
	 * migrate and free scanners examined -- confirm in mm/compaction.c
	 */
	unsigned long total_migrate_scanned;
	unsigned long total_free_scanned;
	unsigned short fast_search_fail;/* failures to use free list searches */
	short search_order;		/* order to start a fast search at */
	const gfp_t gfp_mask;		/* gfp mask of a direct compactor */
	int order;			/* order a direct compactor needs */
	int migratetype;		/* migratetype of direct compactor */
	const unsigned int alloc_flags;	/* alloc flags of a direct compactor */
	const int highest_zoneidx;	/* zone index of a direct compactor */
	enum migrate_mode mode;		/* Async or sync migration mode */
	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
	bool no_set_skip_hint;		/* Don't mark blocks for skipping */
	bool ignore_block_suitable;	/* Scan blocks considered unsuitable */
	bool direct_compaction;		/* False from kcompactd or /proc/... */
	bool proactive_compaction;	/* kcompactd proactive compaction */
	bool whole_zone;		/* Whole zone should/has been scanned */
	bool contended;			/* Signal lock contention */
	bool finish_pageblock;		/* Scan the remainder of a pageblock. Used
					 * when there are potentially transient
					 * isolation or migration failures to
					 * ensure forward progress.
					 */
	bool alloc_contig;		/* alloc_contig_range allocation */
};
1056 
1057 /*
1058  * Used in direct compaction when a page should be taken from the freelists
1059  * immediately when one is created during the free path.
1060  */
struct capture_control {
	struct compact_control *cc;	/* Compaction state this capture belongs to */
	struct page *page;		/* NOTE(review): presumably the captured page, if any -- confirm */
};
1065 
1066 unsigned long
1067 isolate_freepages_range(struct compact_control *cc,
1068 			unsigned long start_pfn, unsigned long end_pfn);
1069 int
1070 isolate_migratepages_range(struct compact_control *cc,
1071 			   unsigned long low_pfn, unsigned long end_pfn);
1072 
1073 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
1074 void init_cma_reserved_pageblock(struct page *page);
1075 
1076 #endif /* CONFIG_COMPACTION || CONFIG_CMA */
1077 
1078 struct cma;
1079 
1080 #ifdef CONFIG_CMA
1081 bool cma_validate_zones(struct cma *cma);
1082 void *cma_reserve_early(struct cma *cma, unsigned long size);
1083 void init_cma_pageblock(struct page *page);
1084 #else
/*
 * !CONFIG_CMA stubs: zone validation reports false, early reservation
 * yields NULL, and pageblock initialisation does nothing.
 */
static inline bool cma_validate_zones(struct cma *cma)
{
	return false;
}
static inline void *cma_reserve_early(struct cma *cma, unsigned long size)
{
	return NULL;
}
static inline void init_cma_pageblock(struct page *page)
{
}
1096 #endif
1097 
/* Outcome codes returned by find_suitable_fallback(). */
enum fallback_result {
	/* Found suitable migratetype, *mt_out is valid. */
	FALLBACK_FOUND,
	/* No fallback found in requested order. */
	FALLBACK_EMPTY,
	/* Passed @claimable, but claiming whole block is a bad idea. */
	FALLBACK_NOCLAIM,
};
1106 enum fallback_result
1107 find_suitable_fallback(struct free_area *area, unsigned int order,
1108 		       int migratetype, bool claimable, int *mt_out);
1109 
/* Return true when @area's free list for @migratetype holds no entries. */
static inline bool free_area_empty(struct free_area *area, int migratetype)
{
	return list_empty(&area->free_list[migratetype]);
}
1114 
1115 /* mm/util.c */
1116 struct anon_vma *folio_anon_vma(const struct folio *folio);
1117 
1118 #ifdef CONFIG_MMU
1119 void unmap_mapping_folio(struct folio *folio);
1120 extern long populate_vma_page_range(struct vm_area_struct *vma,
1121 		unsigned long start, unsigned long end, int *locked);
1122 extern long faultin_page_range(struct mm_struct *mm, unsigned long start,
1123 		unsigned long end, bool write, int *locked);
1124 bool mlock_future_ok(const struct mm_struct *mm, bool is_vma_locked,
1125 		unsigned long bytes);
1126 
1127 /*
1128  * NOTE: This function can't tell whether the folio is "fully mapped" in the
1129  * range.
1130  * "fully mapped" means all the pages of folio is associated with the page
1131  * table of range while this function just check whether the folio range is
1132  * within the range [start, end). Function caller needs to do page table
1133  * check if it cares about the page table association.
1134  *
1135  * Typical usage (like mlock or madvise) is:
1136  * Caller knows at least 1 page of folio is associated with page table of VMA
1137  * and the range [start, end) is intersect with the VMA range. Caller wants
1138  * to know whether the folio is fully associated with the range. It calls
1139  * this function to check whether the folio is in the range first. Then checks
1140  * the page table to know whether the folio is fully mapped to the range.
1141  */
1142 static inline bool
1143 folio_within_range(struct folio *folio, struct vm_area_struct *vma,
1144 		unsigned long start, unsigned long end)
1145 {
1146 	pgoff_t pgoff, addr;
1147 	unsigned long vma_pglen = vma_pages(vma);
1148 
1149 	VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio);
1150 	if (start > end)
1151 		return false;
1152 
1153 	if (start < vma->vm_start)
1154 		start = vma->vm_start;
1155 
1156 	if (end > vma->vm_end)
1157 		end = vma->vm_end;
1158 
1159 	pgoff = folio_pgoff(folio);
1160 
1161 	/* if folio start address is not in vma range */
1162 	if (!in_range(pgoff, vma->vm_pgoff, vma_pglen))
1163 		return false;
1164 
1165 	addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
1166 
1167 	return !(addr < start || end - addr < folio_size(folio));
1168 }
1169 
/*
 * Convenience wrapper: folio_within_range() applied to the whole of @vma,
 * i.e. [vma->vm_start, vma->vm_end).
 */
static inline bool
folio_within_vma(struct folio *folio, struct vm_area_struct *vma)
{
	return folio_within_range(folio, vma, vma->vm_start, vma->vm_end);
}
1175 
1176 /*
1177  * mlock_vma_folio() and munlock_vma_folio():
1178  * should be called with vma's mmap_lock held for read or write,
1179  * under page table lock for the pte/pmd being added or removed.
1180  *
1181  * mlock is usually called at the end of folio_add_*_rmap_*(), munlock at
1182  * the end of folio_remove_rmap_*(); but new anon folios are managed by
1183  * folio_add_lru_vma() calling mlock_new_folio().
1184  */
1185 void mlock_folio(struct folio *folio);
1186 static inline void mlock_vma_folio(struct folio *folio,
1187 				struct vm_area_struct *vma)
1188 {
1189 	/*
1190 	 * The VM_SPECIAL check here serves two purposes.
1191 	 * 1) VM_IO check prevents migration from double-counting during mlock.
1192 	 * 2) Although mmap_region() and mlock_fixup() take care that VM_LOCKED
1193 	 *    is never left set on a VM_SPECIAL vma, there is an interval while
1194 	 *    file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may
1195 	 *    still be set while VM_SPECIAL bits are added: so ignore it then.
1196 	 */
1197 	if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED))
1198 		mlock_folio(folio);
1199 }
1200 
1201 void munlock_folio(struct folio *folio);
1202 static inline void munlock_vma_folio(struct folio *folio,
1203 					struct vm_area_struct *vma)
1204 {
1205 	/*
1206 	 * munlock if the function is called. Ideally, we should only
1207 	 * do munlock if any page of folio is unmapped from VMA and
1208 	 * cause folio not fully mapped to VMA.
1209 	 *
1210 	 * But it's not easy to confirm that's the situation. So we
1211 	 * always munlock the folio and page reclaim will correct it
1212 	 * if it's wrong.
1213 	 */
1214 	if (unlikely(vma->vm_flags & VM_LOCKED))
1215 		munlock_folio(folio);
1216 }
1217 
1218 void mlock_new_folio(struct folio *folio);
1219 bool need_mlock_drain(int cpu);
1220 void mlock_drain_local(void);
1221 void mlock_drain_remote(int cpu);
1222 
1223 extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
1224 
1225 /**
1226  * vma_address - Find the virtual address a page range is mapped at
1227  * @vma: The vma which maps this object.
1228  * @pgoff: The page offset within its object.
1229  * @nr_pages: The number of pages to consider.
1230  *
1231  * If any page in this range is mapped by this VMA, return the first address
1232  * where any of these pages appear.  Otherwise, return -EFAULT.
1233  */
1234 static inline unsigned long vma_address(const struct vm_area_struct *vma,
1235 		pgoff_t pgoff, unsigned long nr_pages)
1236 {
1237 	unsigned long address;
1238 
1239 	if (pgoff >= vma->vm_pgoff) {
1240 		address = vma->vm_start +
1241 			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
1242 		/* Check for address beyond vma (or wrapped through 0?) */
1243 		if (address < vma->vm_start || address >= vma->vm_end)
1244 			address = -EFAULT;
1245 	} else if (pgoff + nr_pages - 1 >= vma->vm_pgoff) {
1246 		/* Test above avoids possibility of wrap to 0 on 32-bit */
1247 		address = vma->vm_start;
1248 	} else {
1249 		address = -EFAULT;
1250 	}
1251 	return address;
1252 }
1253 
1254 /*
1255  * Then at what user virtual address will none of the range be found in vma?
1256  * Assumes that vma_address() already returned a good starting address.
1257  */
1258 static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw)
1259 {
1260 	struct vm_area_struct *vma = pvmw->vma;
1261 	pgoff_t pgoff;
1262 	unsigned long address;
1263 
1264 	/* Common case, plus ->pgoff is invalid for KSM */
1265 	if (pvmw->nr_pages == 1)
1266 		return pvmw->address + PAGE_SIZE;
1267 
1268 	pgoff = pvmw->pgoff + pvmw->nr_pages;
1269 	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
1270 	/* Check for address beyond vma (or wrapped through 0?) */
1271 	if (address < vma->vm_start || address > vma->vm_end)
1272 		address = vma->vm_end;
1273 	return address;
1274 }
1275 
1276 static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
1277 						    struct file *fpin)
1278 {
1279 	int flags = vmf->flags;
1280 
1281 	if (fpin)
1282 		return fpin;
1283 
1284 	/*
1285 	 * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
1286 	 * anything, so we only pin the file and drop the mmap_lock if only
1287 	 * FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt.
1288 	 */
1289 	if (fault_flag_allow_retry_first(flags) &&
1290 	    !(flags & FAULT_FLAG_RETRY_NOWAIT)) {
1291 		fpin = get_file(vmf->vma->vm_file);
1292 		release_fault_lock(vmf);
1293 	}
1294 	return fpin;
1295 }
1296 
1297 static inline bool vma_supports_mlock(const struct vm_area_struct *vma)
1298 {
1299 	if (vma_test_any_mask(vma, VMA_SPECIAL_FLAGS))
1300 		return false;
1301 	if (vma_test_single_mask(vma, VMA_DROPPABLE))
1302 		return false;
1303 	if (vma_is_dax(vma) || is_vm_hugetlb_page(vma))
1304 		return false;
1305 	return vma != get_gate_vma(current->mm);
1306 }
1307 
1308 #else /* !CONFIG_MMU */
/*
 * !CONFIG_MMU stubs: all of these do nothing, and need_mlock_drain()
 * always reports false.
 */
static inline void unmap_mapping_folio(struct folio *folio) { }
static inline void mlock_new_folio(struct folio *folio) { }
static inline bool need_mlock_drain(int cpu) { return false; }
static inline void mlock_drain_local(void) { }
static inline void mlock_drain_remote(int cpu) { }
static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
{
}
1317 #endif /* !CONFIG_MMU */
1318 
1319 /* Memory initialisation debug and verification */
1320 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1321 DECLARE_STATIC_KEY_TRUE(deferred_pages);
1322 
/* Report the state of the deferred_pages static key. */
static inline bool deferred_pages_enabled(void)
{
	return static_branch_unlikely(&deferred_pages);
}
1327 
1328 bool __init deferred_grow_zone(struct zone *zone, unsigned int order);
1329 #else
/* !CONFIG_DEFERRED_STRUCT_PAGE_INIT: always reports false. */
static inline bool deferred_pages_enabled(void)
{
	return false;
}
1334 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1335 
1336 void init_deferred_page(unsigned long pfn, int nid);
1337 
/*
 * Levels passed to mminit_dprintk(). The values are ordered, so that
 * macro can compare a level against mminit_loglevel and MMINIT_WARNING.
 */
enum mminit_level {
	MMINIT_WARNING,
	MMINIT_VERIFY,
	MMINIT_TRACE
};
1343 
1344 #ifdef CONFIG_DEBUG_MEMORY_INIT
1345 
1346 extern int mminit_loglevel;
1347 
/*
 * Print an "mminit::<prefix> " message when @level is below
 * mminit_loglevel. Levels up to MMINIT_WARNING go out via pr_warn(), all
 * others via printk(KERN_DEBUG ...).
 *
 * @prefix and @fmt must be string literals because they are pasted into
 * the format string. @level is parenthesized so that an expression
 * argument binds as intended; note that it is evaluated more than once.
 */
#define mminit_dprintk(level, prefix, fmt, arg...) \
do { \
	if ((level) < mminit_loglevel) { \
		if ((level) <= MMINIT_WARNING) \
			pr_warn("mminit::" prefix " " fmt, ##arg);	\
		else \
			printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
	} \
} while (0)
1357 
1358 extern void mminit_verify_pageflags_layout(void);
1359 extern void mminit_verify_zonelist(void);
1360 #else
1361 
/* !CONFIG_DEBUG_MEMORY_INIT stubs: all three helpers are empty. */
static inline void mminit_dprintk(enum mminit_level level,
				const char *prefix, const char *fmt, ...)
{
}

static inline void mminit_verify_pageflags_layout(void)
{
}

static inline void mminit_verify_zonelist(void)
{
}
1374 #endif /* CONFIG_DEBUG_MEMORY_INIT */
1375 
1376 #define NODE_RECLAIM_NOSCAN	-2
1377 #define NODE_RECLAIM_FULL	-1
1378 #define NODE_RECLAIM_SOME	0
1379 #define NODE_RECLAIM_SUCCESS	1
1380 
1381 #ifdef CONFIG_NUMA
1382 extern int node_reclaim_mode;
1383 
1384 extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
1385 extern int find_next_best_node(int node, nodemask_t *used_node_mask);
1386 #else
1387 #define node_reclaim_mode 0
1388 
/*
 * !CONFIG_NUMA stubs: node reclaim never scans (NODE_RECLAIM_NOSCAN) and
 * there is never a next-best node (NUMA_NO_NODE).
 */
static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
				unsigned int order)
{
	return NODE_RECLAIM_NOSCAN;
}
static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
{
	return NUMA_NO_NODE;
}
1398 #endif
1399 
1400 static inline bool node_reclaim_enabled(void)
1401 {
1402 	/* Is any node_reclaim_mode bit set? */
1403 	return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP);
1404 }
1405 
1406 /*
1407  * mm/memory-failure.c
1408  */
1409 #ifdef CONFIG_MEMORY_FAILURE
1410 int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill);
1411 void shake_folio(struct folio *folio);
1412 typedef int hwpoison_filter_func_t(struct page *p);
1413 void hwpoison_filter_register(hwpoison_filter_func_t *filter);
1414 void hwpoison_filter_unregister(void);
1415 
1416 #define MAGIC_HWPOISON	0x48575053U	/* HWPS */
1417 void SetPageHWPoisonTakenOff(struct page *page);
1418 void ClearPageHWPoisonTakenOff(struct page *page);
1419 bool take_page_off_buddy(struct page *page);
1420 bool put_page_back_buddy(struct page *page);
1421 struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
1422 void add_to_kill_ksm(struct task_struct *tsk, const struct page *p,
1423 		     struct vm_area_struct *vma, struct list_head *to_kill,
1424 		     unsigned long ksm_addr);
1425 unsigned long page_mapped_in_vma(const struct page *page,
1426 		struct vm_area_struct *vma);
1427 
1428 #else
/* !CONFIG_MEMORY_FAILURE stub: always fails with -EBUSY. */
static inline int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
	return -EBUSY;
}
1433 #endif
1434 
1435 extern unsigned long  __must_check vm_mmap_pgoff(struct file *, unsigned long,
1436         unsigned long, unsigned long,
1437         unsigned long, unsigned long);
1438 
1439 extern void set_pageblock_order(void);
1440 unsigned long reclaim_pages(struct list_head *folio_list);
1441 unsigned int reclaim_clean_pages_from_list(struct zone *zone,
1442 					    struct list_head *folio_list);
1443 /* The ALLOC_WMARK bits are used as an index to zone->watermark */
1444 #define ALLOC_WMARK_MIN		WMARK_MIN
1445 #define ALLOC_WMARK_LOW		WMARK_LOW
1446 #define ALLOC_WMARK_HIGH	WMARK_HIGH
1447 #define ALLOC_NO_WATERMARKS	0x04 /* don't check watermarks at all */
1448 
1449 /* Mask to get the watermark bits */
1450 #define ALLOC_WMARK_MASK	(ALLOC_NO_WATERMARKS-1)
1451 
1452 /*
1453  * Only MMU archs have async oom victim reclaim - aka oom_reaper so we
1454  * cannot assume a reduced access to memory reserves is sufficient for
1455  * !MMU
1456  */
1457 #ifdef CONFIG_MMU
1458 #define ALLOC_OOM		0x08
1459 #else
1460 #define ALLOC_OOM		ALLOC_NO_WATERMARKS
1461 #endif
1462 
1463 #define ALLOC_NON_BLOCK		 0x10 /* Caller cannot block. Allow access
1464 				       * to 25% of the min watermark or
1465 				       * 62.5% if __GFP_HIGH is set.
1466 				       */
1467 #define ALLOC_MIN_RESERVE	 0x20 /* __GFP_HIGH set. Allow access to 50%
1468 				       * of the min watermark.
1469 				       */
1470 #define ALLOC_CPUSET		 0x40 /* check for correct cpuset */
1471 #define ALLOC_CMA		 0x80 /* allow allocations from CMA areas */
1472 #ifdef CONFIG_ZONE_DMA32
1473 #define ALLOC_NOFRAGMENT	0x100 /* avoid mixing pageblock types */
1474 #else
1475 #define ALLOC_NOFRAGMENT	  0x0
1476 #endif
1477 #define ALLOC_HIGHATOMIC	0x200 /* Allows access to MIGRATE_HIGHATOMIC */
1478 #define ALLOC_TRYLOCK		0x400 /* Only use spin_trylock in allocation path */
1479 #define ALLOC_KSWAPD		0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
1480 
1481 /* Flags that allow allocations below the min watermark. */
1482 #define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM)
1483 
1484 enum ttu_flags;
1485 struct tlbflush_unmap_batch;
1486 
1487 
1488 /*
1489  * only for MM internal work items which do not depend on
1490  * any allocations or locks which might depend on allocations
1491  */
1492 extern struct workqueue_struct *mm_percpu_wq;
1493 
1494 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
1495 void try_to_unmap_flush(void);
1496 void try_to_unmap_flush_dirty(void);
1497 void flush_tlb_batched_pending(struct mm_struct *mm);
1498 #else
/* !CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH stubs: all three are empty. */
static inline void try_to_unmap_flush(void)
{
}
static inline void try_to_unmap_flush_dirty(void)
{
}
static inline void flush_tlb_batched_pending(struct mm_struct *mm)
{
}
1508 #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
1509 
1510 extern const struct trace_print_flags pageflag_names[];
1511 extern const struct trace_print_flags vmaflag_names[];
1512 extern const struct trace_print_flags gfpflag_names[];
1513 
1514 void setup_zone_pageset(struct zone *zone);
1515 
/*
 * NOTE(review): by its name this bundles the parameters for choosing a
 * migration target -- confirm against the users in mm/migrate.c.
 */
struct migration_target_control {
	int nid;		/* preferred node id */
	nodemask_t *nmask;	/* NOTE(review): presumably the allowed target nodes -- confirm */
	gfp_t gfp_mask;		/* NOTE(review): presumably the gfp mask for the target allocation -- confirm */
	enum migrate_reason reason;	/* NOTE(review): presumably why the migration happens -- confirm */
};
1522 
1523 /*
1524  * mm/filemap.c
1525  */
1526 size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
1527 			      struct folio *folio, loff_t fpos, size_t size);
1528 
1529 /*
1530  * mm/vmalloc.c
1531  */
1532 #ifdef CONFIG_MMU
1533 void __init vmalloc_init(void);
1534 int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
1535 	pgprot_t prot, struct page **pages, unsigned int page_shift, gfp_t gfp_mask);
1536 unsigned int get_vm_area_page_order(struct vm_struct *vm);
1537 #else
/*
 * !CONFIG_MMU stubs: vmalloc_init() is empty and
 * vmap_pages_range_noflush() always fails with -EINVAL.
 */
static inline void vmalloc_init(void)
{
}

static inline
int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
	pgprot_t prot, struct page **pages, unsigned int page_shift, gfp_t gfp_mask)
{
	return -EINVAL;
}
1548 #endif
1549 
1550 void clear_vm_uninitialized_flag(struct vm_struct *vm);
1551 
1552 int __must_check __vmap_pages_range_noflush(unsigned long addr,
1553 			       unsigned long end, pgprot_t prot,
1554 			       struct page **pages, unsigned int page_shift);
1555 
1556 void vunmap_range_noflush(unsigned long start, unsigned long end);
1557 
1558 void __vunmap_range_noflush(unsigned long start, unsigned long end);
1559 
1560 static inline bool vma_is_single_threaded_private(struct vm_area_struct *vma)
1561 {
1562 	if (vma->vm_flags & VM_SHARED)
1563 		return false;
1564 
1565 	return atomic_read(&vma->vm_mm->mm_users) == 1;
1566 }
1567 
1568 #ifdef CONFIG_NUMA_BALANCING
1569 bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma,
1570 		bool is_private_single_threaded);
1571 
1572 #else
/* !CONFIG_NUMA_BALANCING stub: always reports false. */
static inline bool folio_can_map_prot_numa(struct folio *folio,
		struct vm_area_struct *vma, bool is_private_single_threaded)
{
	return false;
}
1578 #endif
1579 
1580 int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
1581 		      unsigned long addr, int *flags, bool writable,
1582 		      int *last_cpupid);
1583 
1584 void free_zone_device_folio(struct folio *folio);
1585 int migrate_device_coherent_folio(struct folio *folio);
1586 
1587 struct vm_struct *__get_vm_area_node(unsigned long size,
1588 				     unsigned long align, unsigned long shift,
1589 				     unsigned long vm_flags, unsigned long start,
1590 				     unsigned long end, int node, gfp_t gfp_mask,
1591 				     const void *caller);
1592 
1593 /*
1594  * mm/gup.c
1595  */
1596 int __must_check try_grab_folio(struct folio *folio, int refs,
1597 				unsigned int flags);
1598 
1599 /*
1600  * mm/huge_memory.c
1601  */
1602 void touch_pud(struct vm_area_struct *vma, unsigned long addr,
1603 	       pud_t *pud, bool write);
1604 bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
1605 	       pmd_t *pmd, bool write);
1606 
1607 /*
1608  * Parses a string with mem suffixes into its order. Useful to parse kernel
1609  * parameters.
1610  */
1611 static inline int get_order_from_str(const char *size_str,
1612 				     unsigned long valid_orders)
1613 {
1614 	unsigned long size;
1615 	char *endptr;
1616 	int order;
1617 
1618 	size = memparse(size_str, &endptr);
1619 
1620 	if (!is_power_of_2(size))
1621 		return -EINVAL;
1622 	order = get_order(size);
1623 	if (BIT(order) & ~valid_orders)
1624 		return -EINVAL;
1625 
1626 	return order;
1627 }
1628 
/*
 * FOLL_* flags occupying bits 16 to 22.
 * NOTE(review): presumably these are kept clear of the FOLL_* flags
 * defined outside mm/ -- confirm against include/linux/mm_types.h.
 */
enum {
	/* mark page accessed */
	FOLL_TOUCH = 1 << 16,
	/* a retry, previous pass started an IO */
	FOLL_TRIED = 1 << 17,
	/* we are working on non-current tsk/mm */
	FOLL_REMOTE = 1 << 18,
	/* pages must be released via unpin_user_page */
	FOLL_PIN = 1 << 19,
	/* gup_fast: prevent fall-back to slow gup */
	FOLL_FAST_ONLY = 1 << 20,
	/* allow unlocking the mmap lock */
	FOLL_UNLOCKABLE = 1 << 21,
	/* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */
	FOLL_MADV_POPULATE = 1 << 22,
};
1645 
1646 #define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \
1647 			    FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \
1648 			    FOLL_MADV_POPULATE)
1649 
1650 /*
1651  * Indicates for which pages that are write-protected in the page table,
1652  * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the
1653  * GUP pin will remain consistent with the pages mapped into the page tables
1654  * of the MM.
1655  *
1656  * Temporary unmapping of PageAnonExclusive() pages or clearing of
1657  * PageAnonExclusive() has to protect against concurrent GUP:
1658  * * Ordinary GUP: Using the PT lock
1659  * * GUP-fast and fork(): mm->write_protect_seq
1660  * * GUP-fast and KSM or temporary unmapping (swap, migration): see
1661  *    folio_try_share_anon_rmap_*()
1662  *
1663  * Must be called with the (sub)page that's actually referenced via the
1664  * page table entry, which might not necessarily be the head page for a
1665  * PTE-mapped THP.
1666  *
1667  * If the vma is NULL, we're coming from the GUP-fast path and might have
1668  * to fallback to the slow path just to lookup the vma.
1669  */
static inline bool gup_must_unshare(struct vm_area_struct *vma,
				    unsigned int flags, struct page *page)
{
	/*
	 * FOLL_WRITE is implicitly handled correctly as the page table entry
	 * has to be writable -- and if it references (part of) an anonymous
	 * folio, that part is required to be marked exclusive.
	 *
	 * Only a FOLL_PIN request without FOLL_WRITE gets past this check.
	 */
	if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN)
		return false;
	/*
	 * Note: PageAnon(page) is stable until the page is actually getting
	 * freed.
	 */
	if (!PageAnon(page)) {
		/*
		 * We only care about R/O long-term pinning: R/O short-term
		 * pinning does not have the semantics to observe successive
		 * changes through the process page tables.
		 */
		if (!(flags & FOLL_LONGTERM))
			return false;

		/* We really need the vma ... */
		if (!vma)
			return true;

		/*
		 * ... because we only care about writable private ("COW")
		 * mappings where we have to break COW early.
		 */
		return is_cow_mapping(vma->vm_flags);
	}

	/* Paired with a memory barrier in folio_try_share_anon_rmap_*(). */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_rmb();

	/*
	 * Note that KSM pages cannot be exclusive, and consequently,
	 * cannot get pinned.
	 */
	return !PageAnonExclusive(page);
}
1714 
1715 extern bool mirrored_kernelcore;
1716 bool memblock_has_mirror(void);
1717 void memblock_free_all(void);
1718 
/*
 * Assign @vma's address range [start, end) and page offset in one go.
 * No validation or locking is done here.
 */
static __always_inline void vma_set_range(struct vm_area_struct *vma,
					  unsigned long start, unsigned long end,
					  pgoff_t pgoff)
{
	vma->vm_start = start;
	vma->vm_end = end;
	vma->vm_pgoff = pgoff;
}
1727 
1728 static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
1729 {
1730 	/*
1731 	 * NOTE: we must check this before VM_SOFTDIRTY on soft-dirty
1732 	 * enablements, because when without soft-dirty being compiled in,
1733 	 * VM_SOFTDIRTY is defined as 0x0, then !(vm_flags & VM_SOFTDIRTY)
1734 	 * will be constantly true.
1735 	 */
1736 	if (!pgtable_supports_soft_dirty())
1737 		return false;
1738 
1739 	/*
1740 	 * Soft-dirty is kind of special: its tracking is enabled when the
1741 	 * vma flags not set.
1742 	 */
1743 	return !(vma->vm_flags & VM_SOFTDIRTY);
1744 }
1745 
1746 static inline bool pmd_needs_soft_dirty_wp(struct vm_area_struct *vma, pmd_t pmd)
1747 {
1748 	return vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd);
1749 }
1750 
1751 static inline bool pte_needs_soft_dirty_wp(struct vm_area_struct *vma, pte_t pte)
1752 {
1753 	return vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte);
1754 }
1755 
1756 void __meminit __init_single_page(struct page *page, unsigned long pfn,
1757 				unsigned long zone, int nid);
1758 void __meminit __init_page_from_nid(unsigned long pfn, int nid);
1759 
1760 /* shrinker related functions */
1761 unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
1762 			  int priority);
1763 
1764 int shmem_add_to_page_cache(struct folio *folio,
1765 			    struct address_space *mapping,
1766 			    pgoff_t index, void *expected, gfp_t gfp);
1767 int shmem_inode_acct_blocks(struct inode *inode, long pages);
1768 bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped);
1769 
1770 #ifdef CONFIG_SHRINKER_DEBUG
1771 static inline __printf(2, 0) int shrinker_debugfs_name_alloc(
1772 			struct shrinker *shrinker, const char *fmt, va_list ap)
1773 {
1774 	shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
1775 
1776 	return shrinker->name ? 0 : -ENOMEM;
1777 }
1778 
/*
 * Release @shrinker's name with kfree_const() and clear the pointer so a
 * stale name is not left behind.
 */
static inline void shrinker_debugfs_name_free(struct shrinker *shrinker)
{
	kfree_const(shrinker->name);
	shrinker->name = NULL;
}
1784 
1785 extern int shrinker_debugfs_add(struct shrinker *shrinker);
1786 extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
1787 					      int *debugfs_id);
1788 extern void shrinker_debugfs_remove(struct dentry *debugfs_entry,
1789 				    int debugfs_id);
1790 #else /* CONFIG_SHRINKER_DEBUG */
/*
 * !CONFIG_SHRINKER_DEBUG stubs: adding and naming report success (0)
 * without doing anything, detaching sets *debugfs_id to -1 and returns
 * NULL, and the remaining helpers are empty.
 */
static inline int shrinker_debugfs_add(struct shrinker *shrinker)
{
	return 0;
}
static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker,
					      const char *fmt, va_list ap)
{
	return 0;
}
static inline void shrinker_debugfs_name_free(struct shrinker *shrinker)
{
}
static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
						     int *debugfs_id)
{
	*debugfs_id = -1;
	return NULL;
}
static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
					   int debugfs_id)
{
}
1813 #endif /* CONFIG_SHRINKER_DEBUG */
1814 
/* Only track the nodes of mappings with shadow entries */
void workingset_update_node(struct xa_node *node);
extern struct list_lru shadow_nodes;

/*
 * Wire @xas up for shadow node tracking: workingset_update_node() becomes its
 * update callback and shadow_nodes its list_lru.
 *
 * Nothing is done when dax_mapping() or shmem_mapping() is true for @mapping.
 * @mapping may be evaluated twice and @xas is evaluated twice when the hooks
 * are installed, so neither argument should have side effects.
 */
#define mapping_set_update(xas, mapping) do {				\
	if (!(dax_mapping(mapping) || shmem_mapping(mapping))) {	\
		xas_set_update(xas, workingset_update_node);		\
		xas_set_lru(xas, &shadow_nodes);			\
	}								\
} while (0)
1824 
/*
 * mremap.c
 *
 * Carries out the page table move described by @pmc; see the comment on
 * struct pagetable_move_control for the locking rules and for how the
 * address fields are updated as the move progresses.
 *
 * NOTE(review): the meaning of the unsigned long return value is not visible
 * from this header - confirm against the definition.
 */
unsigned long move_page_tables(struct pagetable_move_control *pmc);
1827 
#ifdef CONFIG_UNACCEPTED_MEMORY
/* Real implementation is provided out of line. */
void accept_page(struct page *page);
#else /* CONFIG_UNACCEPTED_MEMORY */
/* Without CONFIG_UNACCEPTED_MEMORY this is a no-op for every @page. */
static inline void accept_page(struct page *page)
{
}
#endif /* CONFIG_UNACCEPTED_MEMORY */
1835 
1836 /* pagewalk.c */
1837 int walk_page_range_mm_unsafe(struct mm_struct *mm, unsigned long start,
1838 		unsigned long end, const struct mm_walk_ops *ops,
1839 		void *private);
1840 int walk_page_range_vma_unsafe(struct vm_area_struct *vma, unsigned long start,
1841 		unsigned long end, const struct mm_walk_ops *ops,
1842 		void *private);
1843 int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
1844 			  unsigned long end, const struct mm_walk_ops *ops,
1845 			  pgd_t *pgd, void *private);
1846 
/*
 * mm duplication helpers taking the new @mm and the @oldmm it is copied from.
 *
 * NOTE(review): dup_mmap() presumably returns 0 or a negative errno - confirm
 * against the definition.
 */
void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm);
int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm);
1849 
/*
 * Prepare/complete halves of PFN remapping driven by an mmap action: the
 * _prepare helpers operate on the VMA descriptor, the _complete helper on the
 * resulting VMA and its action.
 */
int remap_pfn_range_prepare(struct vm_area_desc *desc);
int remap_pfn_range_complete(struct vm_area_struct *vma,
			     struct mmap_action *action);
int simple_ioremap_prepare(struct vm_area_desc *desc);
1854 
1855 static inline int io_remap_pfn_range_prepare(struct vm_area_desc *desc)
1856 {
1857 	struct mmap_action *action = &desc->action;
1858 	const unsigned long orig_pfn = action->remap.start_pfn;
1859 	const pgprot_t orig_pgprot = action->remap.pgprot;
1860 	const unsigned long size = action->remap.size;
1861 	const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size);
1862 	int err;
1863 
1864 	action->remap.start_pfn = pfn;
1865 	action->remap.pgprot = pgprot_decrypted(orig_pgprot);
1866 	err = remap_pfn_range_prepare(desc);
1867 	if (err)
1868 		return err;
1869 
1870 	/* Remap does the actual work. */
1871 	action->type = MMAP_REMAP_PFN;
1872 	return 0;
1873 }
1874 
1875 /*
1876  * When we succeed an mmap action or just before we unmap a VMA on error, we
1877  * need to ensure any rmap lock held is released. On unmap it's required to
1878  * avoid a deadlock.
1879  */
1880 static inline void maybe_rmap_unlock_action(struct vm_area_struct *vma,
1881 		struct mmap_action *action)
1882 {
1883 	struct file *file;
1884 
1885 	if (!action->hide_from_rmap_until_complete)
1886 		return;
1887 
1888 	VM_WARN_ON_ONCE(vma_is_anonymous(vma));
1889 	file = vma->vm_file;
1890 	i_mmap_unlock_write(file->f_mapping);
1891 	action->hide_from_rmap_until_complete = false;
1892 }
1893 
1894 #ifdef CONFIG_MMU_NOTIFIER
1895 static inline bool clear_flush_young_ptes_notify(struct vm_area_struct *vma,
1896 		unsigned long addr, pte_t *ptep, unsigned int nr)
1897 {
1898 	bool young;
1899 
1900 	young = clear_flush_young_ptes(vma, addr, ptep, nr);
1901 	young |= mmu_notifier_clear_flush_young(vma->vm_mm, addr,
1902 						addr + nr * PAGE_SIZE);
1903 	return young;
1904 }
1905 
1906 static inline bool pmdp_clear_flush_young_notify(struct vm_area_struct *vma,
1907 		unsigned long addr, pmd_t *pmdp)
1908 {
1909 	bool young;
1910 
1911 	young = pmdp_clear_flush_young(vma, addr, pmdp);
1912 	young |= mmu_notifier_clear_flush_young(vma->vm_mm, addr, addr + PMD_SIZE);
1913 	return young;
1914 }
1915 
1916 static inline bool test_and_clear_young_ptes_notify(struct vm_area_struct *vma,
1917 		unsigned long addr, pte_t *ptep, unsigned int nr)
1918 {
1919 	bool young;
1920 
1921 	young = test_and_clear_young_ptes(vma, addr, ptep, nr);
1922 	young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + nr * PAGE_SIZE);
1923 	return young;
1924 }
1925 
1926 static inline bool pmdp_test_and_clear_young_notify(struct vm_area_struct *vma,
1927 		unsigned long addr, pmd_t *pmdp)
1928 {
1929 	bool young;
1930 
1931 	young = pmdp_test_and_clear_young(vma, addr, pmdp);
1932 	young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PMD_SIZE);
1933 	return young;
1934 }
1935 
1936 #else /* CONFIG_MMU_NOTIFIER */
1937 
/*
 * No MMU notifiers in this configuration: each *_notify name is a plain alias
 * of the operation without the notifier step.
 */
#define clear_flush_young_ptes_notify		clear_flush_young_ptes
#define pmdp_clear_flush_young_notify		pmdp_clear_flush_young
#define test_and_clear_young_ptes_notify	test_and_clear_young_ptes
#define pmdp_test_and_clear_young_notify	pmdp_test_and_clear_young
1942 
1943 #endif /* CONFIG_MMU_NOTIFIER */
1944 
extern int sysctl_max_map_count;

/*
 * Snapshot of sysctl_max_map_count, read with READ_ONCE() so that the value
 * is loaded exactly once per call.
 */
static inline int get_sysctl_max_map_count(void)
{
	const int limit = READ_ONCE(sysctl_max_map_count);

	return limit;
}
1950 
1951 bool may_expand_vm(struct mm_struct *mm, const vma_flags_t *vma_flags,
1952 		   unsigned long npages);
1953 
1954 #endif	/* __MM_INTERNAL_H */
1955