xref: /linux/mm/internal.h (revision f7c7c5aa556378a2c8da72c1f7f238b6648f95fb)
1  /* SPDX-License-Identifier: GPL-2.0-or-later */
2  /* internal.h: mm/ internal definitions
3   *
4   * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
5   * Written by David Howells (dhowells@redhat.com)
6   */
7  #ifndef __MM_INTERNAL_H
8  #define __MM_INTERNAL_H
9  
10  #include <linux/fs.h>
11  #include <linux/khugepaged.h>
12  #include <linux/mm.h>
13  #include <linux/mm_inline.h>
14  #include <linux/pagemap.h>
15  #include <linux/rmap.h>
16  #include <linux/swap.h>
17  #include <linux/swapops.h>
18  #include <linux/swap_cgroup.h>
19  #include <linux/tracepoint-defs.h>
20  
21  /* Internal core VMA manipulation functions. */
22  #include "vma.h"
23  
24  struct folio_batch;
25  
26  /*
27   * The set of flags that only affect watermark checking and reclaim
28   * behaviour. This is used by the MM to obey the caller constraints
29   * about IO, FS and watermark checking while ignoring placement
30   * hints such as HIGHMEM usage.
31   */
32  #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
33  			__GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
34  			__GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
35  			__GFP_NOLOCKDEP)
36  
37  /* The GFP flags allowed during early boot */
38  #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
39  
40  /* Control allocation cpuset and node placement constraints */
41  #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
42  
43  /* Do not use these with a slab allocator */
44  #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
45  
46  /*
47   * Unlike WARN_ON_ONCE(), no warning is issued when the caller
48   * specifies __GFP_NOWARN.
49   */
50  #define WARN_ON_ONCE_GFP(cond, gfp)	({				\
51  	static bool __section(".data.once") __warned;			\
52  	int __ret_warn_once = !!(cond);					\
53  									\
54  	if (unlikely(!(gfp & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \
55  		__warned = true;					\
56  		WARN_ON(1);						\
57  	}								\
58  	unlikely(__ret_warn_once);					\
59  })
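
/*
 * Rough usage sketch (hypothetical caller): because the macro evaluates to
 * the condition, just like WARN_ON_ONCE(), it can gate an early bail-out
 * while staying silent for __GFP_NOWARN allocations:
 *
 *	if (WARN_ON_ONCE_GFP(order > MAX_PAGE_ORDER, gfp))
 *		return NULL;
 */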
60  
61  void page_writeback_init(void);
62  
63  /*
64   * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages,
65   * its nr_pages_mapped would be 0x400000: choose the ENTIRELY_MAPPED bit
66   * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE).  Hugetlb currently
67   * leaves nr_pages_mapped at 0, but avoid surprise if it participates later.
68   */
69  #define ENTIRELY_MAPPED		0x800000
70  #define FOLIO_PAGES_MAPPED	(ENTIRELY_MAPPED - 1)
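
/*
 * Illustration of the arithmetic behind the value above, as a standalone
 * userspace check (plain C, no kernel headers): a 16GB folio of 4kB pages
 * has 16GB / 4kB = 0x400000 pages, so ENTIRELY_MAPPED must sit above that
 * value while FOLIO_PAGES_MAPPED still covers the full count.
 *
 *	#include <assert.h>
 *
 *	int main(void)
 *	{
 *		unsigned long long nr_pages = (16ULL << 30) / (4ULL << 10);
 *
 *		assert(nr_pages == 0x400000);
 *		assert(0x800000 > nr_pages);		// ENTIRELY_MAPPED
 *		assert(0x800000 - 1 >= nr_pages);	// FOLIO_PAGES_MAPPED
 *		return 0;
 *	}
 */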
71  
72  /*
73   * Flags passed to __show_mem() and show_free_areas() to suppress output in
74   * various contexts.
75   */
76  #define SHOW_MEM_FILTER_NODES		(0x0001u)	/* disallowed nodes */
77  
78  /*
79   * How many individual pages have an elevated _mapcount.  Excludes
80   * the folio's entire_mapcount.
81   *
82   * Don't use this function outside of debugging code.
83   */
84  static inline int folio_nr_pages_mapped(const struct folio *folio)
85  {
86  	return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED;
87  }
88  
89  /*
90   * Retrieve the first entry of a folio based on a provided entry within the
91   * folio. We cannot rely on folio->swap as there is no guarantee that it has
92   * been initialized. Used for calling arch_swap_restore()
93   */
94  static inline swp_entry_t folio_swap(swp_entry_t entry,
95  		const struct folio *folio)
96  {
97  	swp_entry_t swap = {
98  		.val = ALIGN_DOWN(entry.val, folio_nr_pages(folio)),
99  	};
100  
101  	return swap;
102  }
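
/*
 * Illustration (standalone, plain C): folio_nr_pages() is a power of two for
 * large folios, so the ALIGN_DOWN() above just clears the low bits of the
 * swap offset.  ALIGN_DOWN_POW2 below is a stand-in for the kernel macro,
 * and the 16-page folio starting at val 0x130 is made up:
 *
 *	#include <assert.h>
 *
 *	#define ALIGN_DOWN_POW2(x, a)	((x) & ~((a) - 1))
 *
 *	int main(void)
 *	{
 *		unsigned long nr = 16;	// folio_nr_pages()
 *
 *		assert(ALIGN_DOWN_POW2(0x13aUL, nr) == 0x130);
 *		assert(ALIGN_DOWN_POW2(0x130UL, nr) == 0x130);
 *		return 0;
 *	}
 */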
103  
104  static inline void *folio_raw_mapping(const struct folio *folio)
105  {
106  	unsigned long mapping = (unsigned long)folio->mapping;
107  
108  	return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
109  }
110  
111  #ifdef CONFIG_MMU
112  
113  /* Flags for folio_pte_batch(). */
114  typedef int __bitwise fpb_t;
115  
116  /* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */
117  #define FPB_IGNORE_DIRTY		((__force fpb_t)BIT(0))
118  
119  /* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */
120  #define FPB_IGNORE_SOFT_DIRTY		((__force fpb_t)BIT(1))
121  
122  static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
123  {
124  	if (flags & FPB_IGNORE_DIRTY)
125  		pte = pte_mkclean(pte);
126  	if (likely(flags & FPB_IGNORE_SOFT_DIRTY))
127  		pte = pte_clear_soft_dirty(pte);
128  	return pte_wrprotect(pte_mkold(pte));
129  }
130  
131  /**
132   * folio_pte_batch - detect a PTE batch for a large folio
133   * @folio: The large folio to detect a PTE batch for.
134   * @addr: The user virtual address the first page is mapped at.
135   * @start_ptep: Page table pointer for the first entry.
136   * @pte: Page table entry for the first page.
137   * @max_nr: The maximum number of table entries to consider.
138   * @flags: Flags to modify the PTE batch semantics.
139   * @any_writable: Optional pointer to indicate whether any entry except the
140   *		  first one is writable.
141   * @any_young: Optional pointer to indicate whether any entry except the
142   *		  first one is young.
143   * @any_dirty: Optional pointer to indicate whether any entry except the
144   *		  first one is dirty.
145   *
146   * Detect a PTE batch: consecutive (present) PTEs that map consecutive
147   * pages of the same large folio.
148   *
149   * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
150   * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and
151   * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY).
152   *
153   * start_ptep must map any page of the folio. max_nr must be at least one and
154   * must be limited by the caller so scanning cannot exceed a single page table.
155   *
156   * Return: the number of table entries in the batch.
157   */
158  static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
159  		pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
160  		bool *any_writable, bool *any_young, bool *any_dirty)
161  {
162  	unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
163  	const pte_t *end_ptep = start_ptep + max_nr;
164  	pte_t expected_pte, *ptep;
165  	bool writable, young, dirty;
166  	int nr;
167  
168  	if (any_writable)
169  		*any_writable = false;
170  	if (any_young)
171  		*any_young = false;
172  	if (any_dirty)
173  		*any_dirty = false;
174  
175  	VM_WARN_ON_FOLIO(!pte_present(pte), folio);
176  	VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
177  	VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
178  
179  	nr = pte_batch_hint(start_ptep, pte);
180  	expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
181  	ptep = start_ptep + nr;
182  
183  	while (ptep < end_ptep) {
184  		pte = ptep_get(ptep);
185  		if (any_writable)
186  			writable = !!pte_write(pte);
187  		if (any_young)
188  			young = !!pte_young(pte);
189  		if (any_dirty)
190  			dirty = !!pte_dirty(pte);
191  		pte = __pte_batch_clear_ignored(pte, flags);
192  
193  		if (!pte_same(pte, expected_pte))
194  			break;
195  
196  		/*
197  		 * Stop immediately once we have reached the end of the folio. In
198  		 * corner cases the next PFN might fall into a different
199  		 * folio.
200  		 */
201  		if (pte_pfn(pte) >= folio_end_pfn)
202  			break;
203  
204  		if (any_writable)
205  			*any_writable |= writable;
206  		if (any_young)
207  			*any_young |= young;
208  		if (any_dirty)
209  			*any_dirty |= dirty;
210  
211  		nr = pte_batch_hint(ptep, pte);
212  		expected_pte = pte_advance_pfn(expected_pte, nr);
213  		ptep += nr;
214  	}
215  
216  	return min(ptep - start_ptep, max_nr);
217  }
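
/*
 * Rough usage sketch (hypothetical caller, variable names assumed): the
 * caller typically holds the PTE lock, has ptent = ptep_get(pte) for the
 * first mapped page of the folio, and clamps max_nr so the scan stays within
 * both the VMA and the current page table:
 *
 *	max_nr = (end - addr) >> PAGE_SHIFT;
 *	nr = folio_pte_batch(folio, addr, pte, ptent, max_nr,
 *			     FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY,
 *			     NULL, NULL, NULL);
 *	// nr consecutive PTEs map consecutive pages of this large folio
 */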
218  
219  /**
220   * pte_move_swp_offset - Move the swap entry offset field of a swap pte
221   *	 forward or backward by delta
222   * @pte: The initial pte state; is_swap_pte(pte) must be true and
223   *	 non_swap_entry() must be false.
224   * @delta: The direction and the offset we are moving; forward if delta
225   *	 is positive; backward if delta is negative
226   *
227   * Moves the swap offset, while maintaining all other fields, including
228   * the swap type and any swp pte bits. The resulting pte is returned.
229   */
230  static inline pte_t pte_move_swp_offset(pte_t pte, long delta)
231  {
232  	swp_entry_t entry = pte_to_swp_entry(pte);
233  	pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
234  						   (swp_offset(entry) + delta)));
235  
236  	if (pte_swp_soft_dirty(pte))
237  		new = pte_swp_mksoft_dirty(new);
238  	if (pte_swp_exclusive(pte))
239  		new = pte_swp_mkexclusive(new);
240  	if (pte_swp_uffd_wp(pte))
241  		new = pte_swp_mkuffd_wp(new);
242  
243  	return new;
244  }
245  
246  
247  /**
248   * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
249   * @pte: The initial pte state; is_swap_pte(pte) must be true and
250   *	 non_swap_entry() must be false.
251   *
252   * Increments the swap offset, while maintaining all other fields, including
253   * the swap type and any swp pte bits. The resulting pte is returned.
254   */
255  static inline pte_t pte_next_swp_offset(pte_t pte)
256  {
257  	return pte_move_swp_offset(pte, 1);
258  }
259  
260  /**
261   * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries
262   * @start_ptep: Page table pointer for the first entry.
263   * @max_nr: The maximum number of table entries to consider.
264   * @pte: Page table entry for the first entry.
265   *
266   * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs
267   * containing swap entries all with consecutive offsets and targeting the same
268   * swap type, all with matching swp pte bits.
269   *
270   * max_nr must be at least one and must be limited by the caller so scanning
271   * cannot exceed a single page table.
272   *
273   * Return: the number of table entries in the batch.
274   */
275  static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
276  {
277  	pte_t expected_pte = pte_next_swp_offset(pte);
278  	const pte_t *end_ptep = start_ptep + max_nr;
279  	swp_entry_t entry = pte_to_swp_entry(pte);
280  	pte_t *ptep = start_ptep + 1;
281  	unsigned short cgroup_id;
282  
283  	VM_WARN_ON(max_nr < 1);
284  	VM_WARN_ON(!is_swap_pte(pte));
285  	VM_WARN_ON(non_swap_entry(entry));
286  
287  	cgroup_id = lookup_swap_cgroup_id(entry);
288  	while (ptep < end_ptep) {
289  		pte = ptep_get(ptep);
290  
291  		if (!pte_same(pte, expected_pte))
292  			break;
293  		if (lookup_swap_cgroup_id(pte_to_swp_entry(pte)) != cgroup_id)
294  			break;
295  		expected_pte = pte_next_swp_offset(expected_pte);
296  		ptep++;
297  	}
298  
299  	return ptep - start_ptep;
300  }
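
/*
 * Rough usage sketch (hypothetical caller, variable names assumed): given a
 * swap pte value ptent read through pte under the PTE lock, a caller can
 * batch the contiguous swap entries that follow, e.g. to free them in one go:
 *
 *	if (is_swap_pte(ptent)) {
 *		entry = pte_to_swp_entry(ptent);
 *		if (!non_swap_entry(entry))
 *			nr = swap_pte_batch(pte, max_nr, ptent);
 *	}
 */
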
301  #endif /* CONFIG_MMU */
302  
303  void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
304  						int nr_throttled);
305  static inline void acct_reclaim_writeback(struct folio *folio)
306  {
307  	pg_data_t *pgdat = folio_pgdat(folio);
308  	int nr_throttled = atomic_read(&pgdat->nr_writeback_throttled);
309  
310  	if (nr_throttled)
311  		__acct_reclaim_writeback(pgdat, folio, nr_throttled);
312  }
313  
314  static inline void wake_throttle_isolated(pg_data_t *pgdat)
315  {
316  	wait_queue_head_t *wqh;
317  
318  	wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED];
319  	if (waitqueue_active(wqh))
320  		wake_up(wqh);
321  }
322  
323  vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf);
324  static inline vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
325  {
326  	vm_fault_t ret = __vmf_anon_prepare(vmf);
327  
328  	if (unlikely(ret & VM_FAULT_RETRY))
329  		vma_end_read(vmf->vma);
330  	return ret;
331  }
332  
333  vm_fault_t do_swap_page(struct vm_fault *vmf);
334  void folio_rotate_reclaimable(struct folio *folio);
335  bool __folio_end_writeback(struct folio *folio);
336  void deactivate_file_folio(struct folio *folio);
337  void folio_activate(struct folio *folio);
338  
339  void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
340  		   struct vm_area_struct *start_vma, unsigned long floor,
341  		   unsigned long ceiling, bool mm_wr_locked);
342  void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
343  
344  struct zap_details;
345  void unmap_page_range(struct mmu_gather *tlb,
346  			     struct vm_area_struct *vma,
347  			     unsigned long addr, unsigned long end,
348  			     struct zap_details *details);
349  
350  void page_cache_ra_order(struct readahead_control *, struct file_ra_state *,
351  		unsigned int order);
352  void force_page_cache_ra(struct readahead_control *, unsigned long nr);
353  static inline void force_page_cache_readahead(struct address_space *mapping,
354  		struct file *file, pgoff_t index, unsigned long nr_to_read)
355  {
356  	DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index);
357  	force_page_cache_ra(&ractl, nr_to_read);
358  }
359  
360  unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
361  		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
362  unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
363  		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
364  void filemap_free_folio(struct address_space *mapping, struct folio *folio);
365  int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
366  bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
367  		loff_t end);
368  long mapping_evict_folio(struct address_space *mapping, struct folio *folio);
369  unsigned long mapping_try_invalidate(struct address_space *mapping,
370  		pgoff_t start, pgoff_t end, unsigned long *nr_failed);
371  
372  /**
373   * folio_evictable - Test whether a folio is evictable.
374   * @folio: The folio to test.
375   *
376   * Test whether @folio is evictable -- i.e., should be placed on
377   * active/inactive lists vs unevictable list.
378   *
379   * Reasons folio might not be evictable:
380   * 1. folio's mapping marked unevictable
381   * 2. One of the pages in the folio is part of an mlocked VMA
382   */
383  static inline bool folio_evictable(struct folio *folio)
384  {
385  	bool ret;
386  
387  	/* Prevent address_space of inode and swap cache from being freed */
388  	rcu_read_lock();
389  	ret = !mapping_unevictable(folio_mapping(folio)) &&
390  			!folio_test_mlocked(folio);
391  	rcu_read_unlock();
392  	return ret;
393  }
394  
395  /*
396   * Turn a non-refcounted page (->_refcount == 0) into refcounted with
397   * a count of one.
398   */
399  static inline void set_page_refcounted(struct page *page)
400  {
401  	VM_BUG_ON_PAGE(PageTail(page), page);
402  	VM_BUG_ON_PAGE(page_ref_count(page), page);
403  	set_page_count(page, 1);
404  }
405  
406  /*
407   * Return true if a folio needs ->release_folio() calling upon it.
408   */
409  static inline bool folio_needs_release(struct folio *folio)
410  {
411  	struct address_space *mapping = folio_mapping(folio);
412  
413  	return folio_has_private(folio) ||
414  		(mapping && mapping_release_always(mapping));
415  }
416  
417  extern unsigned long highest_memmap_pfn;
418  
419  /*
420   * Maximum number of reclaim retries without progress before the OOM
421   * killer is considered the only way forward.
422   */
423  #define MAX_RECLAIM_RETRIES 16
424  
425  /*
426   * in mm/vmscan.c:
427   */
428  bool folio_isolate_lru(struct folio *folio);
429  void folio_putback_lru(struct folio *folio);
430  extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
431  
432  /*
433   * in mm/rmap.c:
434   */
435  pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
436  
437  /*
438   * in mm/page_alloc.c
439   */
440  #define K(x) ((x) << (PAGE_SHIFT-10))
441  
442  extern char * const zone_names[MAX_NR_ZONES];
443  
444  /* perform sanity checks on struct pages being allocated or freed */
445  DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
446  
447  extern int min_free_kbytes;
448  
449  void setup_per_zone_wmarks(void);
450  void calculate_min_free_kbytes(void);
451  int __meminit init_per_zone_wmark_min(void);
452  void page_alloc_sysctl_init(void);
453  
454  /*
455   * Structure for holding the mostly immutable allocation parameters passed
456   * between functions involved in allocations, including the alloc_pages*
457   * family of functions.
458   *
459   * nodemask, migratetype and highest_zoneidx are initialized only once in
460   * __alloc_pages() and then never change.
461   *
462   * zonelist, preferred_zone and highest_zoneidx are set first in
463   * __alloc_pages() for the fast path, and might be later changed
464   * in __alloc_pages_slowpath(). All other functions pass the whole structure
465   * by a const pointer.
466   */
467  struct alloc_context {
468  	struct zonelist *zonelist;
469  	nodemask_t *nodemask;
470  	struct zoneref *preferred_zoneref;
471  	int migratetype;
472  
473  	/*
474  	 * highest_zoneidx represents the highest usable zone index for
475  	 * the allocation request. Due to the nature of the zones,
476  	 * memory in zones lower than highest_zoneidx will be
477  	 * protected by lowmem_reserve[highest_zoneidx].
478  	 *
479  	 * highest_zoneidx is also used by reclaim/compaction to limit
480  	 * the target zone, since zones higher than this index cannot
481  	 * be used for this allocation request.
482  	 */
483  	enum zone_type highest_zoneidx;
484  	bool spread_dirty_pages;
485  };
486  
487  /*
488   * This function returns the order of a free page in the buddy system. In
489   * general, page_zone(page)->lock must be held by the caller to prevent the
490   * page from being allocated in parallel and returning garbage as the order.
491   * If a caller does not hold page_zone(page)->lock, it must guarantee that the
492   * page cannot be allocated or merged in parallel. Alternatively, it must
493   * handle invalid values gracefully, and use buddy_order_unsafe() below.
494   */
495  static inline unsigned int buddy_order(struct page *page)
496  {
497  	/* PageBuddy() must be checked by the caller */
498  	return page_private(page);
499  }
500  
501  /*
502   * Like buddy_order(), but for callers who cannot afford to hold the zone lock.
503   * PageBuddy() should be checked first by the caller to minimize the race window,
504   * and invalid values must be handled gracefully.
505   *
506   * READ_ONCE is used so that if the caller assigns the result into a local
507   * variable and e.g. tests it for valid range before using, the compiler cannot
508   * decide to remove the variable and inline the page_private(page) multiple
509   * times, potentially observing different values in the tests and the actual
510   * use of the result.
511   */
512  #define buddy_order_unsafe(page)	READ_ONCE(page_private(page))
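
/*
 * Rough usage sketch (hypothetical caller): check PageBuddy() first, read the
 * order once into a local, then range-check it before acting on it:
 *
 *	if (PageBuddy(page)) {
 *		unsigned int order = buddy_order_unsafe(page);
 *
 *		if (order <= MAX_PAGE_ORDER) {
 *			// order may still be stale, so only use it in
 *			// ways that tolerate a racy value
 *		}
 *	}
 */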
513  
514  /*
515   * This function checks whether a page is free && is the buddy.
516   * We can coalesce a page and its buddy if
517   * (a) the buddy is not in a hole (check before calling!) &&
518   * (b) the buddy is in the buddy system &&
519   * (c) a page and its buddy have the same order &&
520   * (d) a page and its buddy are in the same zone.
521   *
522   * For recording whether a page is in the buddy system, we set PageBuddy.
523   * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
524   *
525   * For recording page's order, we use page_private(page).
526   */
527  static inline bool page_is_buddy(struct page *page, struct page *buddy,
528  				 unsigned int order)
529  {
530  	if (!page_is_guard(buddy) && !PageBuddy(buddy))
531  		return false;
532  
533  	if (buddy_order(buddy) != order)
534  		return false;
535  
536  	/*
537  	 * zone check is done late to avoid uselessly calculating
538  	 * zone/node ids for pages that could never merge.
539  	 */
540  	if (page_zone_id(page) != page_zone_id(buddy))
541  		return false;
542  
543  	VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
544  
545  	return true;
546  }
547  
548  /*
549   * Locate the struct page for both the matching buddy in our
550   * pair (buddy1) and the combined order O+1 page they form (page).
551   *
552   * 1) Any buddy B1 will have an order O twin B2 which satisfies
553   * the following equation:
554   *     B2 = B1 ^ (1 << O)
555   * For example, if the starting buddy (buddy1) is #8, its order
556   * 1 buddy is #10:
557   *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
558   *
559   * 2) Any buddy B will have an order O+1 parent P which
560   * satisfies the following equation:
561   *     P = B & ~(1 << O)
562   *
563   * Assumption: *_mem_map is contiguous at least up to MAX_PAGE_ORDER
564   */
565  static inline unsigned long
566  __find_buddy_pfn(unsigned long page_pfn, unsigned int order)
567  {
568  	return page_pfn ^ (1 << order);
569  }
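
/*
 * Standalone check (plain C) of the two identities in the comment above,
 * using the pfn #8, order 1 example:
 *
 *	#include <assert.h>
 *
 *	int main(void)
 *	{
 *		unsigned long b1 = 8, order = 1;
 *		unsigned long b2 = b1 ^ (1UL << order);		// the buddy
 *		unsigned long parent = b1 & ~(1UL << order);	// order O+1 page
 *
 *		assert(b2 == 10);
 *		assert(parent == 8);
 *		assert((b2 & ~(1UL << order)) == parent);
 *		return 0;
 *	}
 */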
570  
571  /*
572   * Find the buddy of @page and validate it.
573   * @page: The input page
574   * @pfn: The pfn of the page, it saves a call to page_to_pfn() when the
575   *       function is used in the performance-critical __free_one_page().
576   * @order: The order of the page
577   * @buddy_pfn: The output pointer to the buddy pfn, it also saves a call to
578   *             page_to_pfn().
579   *
580   * The found buddy can be a non-PageBuddy page, lie outside @page's zone, or have
581   * an order different from @page's. Validation is necessary before using it.
582   *
583   * Return: the found buddy page or NULL if not found.
584   */
585  static inline struct page *find_buddy_page_pfn(struct page *page,
586  			unsigned long pfn, unsigned int order, unsigned long *buddy_pfn)
587  {
588  	unsigned long __buddy_pfn = __find_buddy_pfn(pfn, order);
589  	struct page *buddy;
590  
591  	buddy = page + (__buddy_pfn - pfn);
592  	if (buddy_pfn)
593  		*buddy_pfn = __buddy_pfn;
594  
595  	if (page_is_buddy(page, buddy, order))
596  		return buddy;
597  	return NULL;
598  }
599  
600  extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
601  				unsigned long end_pfn, struct zone *zone);
602  
603  static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
604  				unsigned long end_pfn, struct zone *zone)
605  {
606  	if (zone->contiguous)
607  		return pfn_to_page(start_pfn);
608  
609  	return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
610  }
611  
612  void set_zone_contiguous(struct zone *zone);
613  
614  static inline void clear_zone_contiguous(struct zone *zone)
615  {
616  	zone->contiguous = false;
617  }
618  
619  extern int __isolate_free_page(struct page *page, unsigned int order);
620  extern void __putback_isolated_page(struct page *page, unsigned int order,
621  				    int mt);
622  extern void memblock_free_pages(struct page *page, unsigned long pfn,
623  					unsigned int order);
624  extern void __free_pages_core(struct page *page, unsigned int order,
625  		enum meminit_context context);
626  
627  /*
628   * This will have no effect, other than possibly generating a warning, if the
629   * caller passes in a non-large folio.
630   */
631  static inline void folio_set_order(struct folio *folio, unsigned int order)
632  {
633  	if (WARN_ON_ONCE(!order || !folio_test_large(folio)))
634  		return;
635  
636  	folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order;
637  #ifdef CONFIG_64BIT
638  	folio->_folio_nr_pages = 1U << order;
639  #endif
640  }
641  
642  void __folio_undo_large_rmappable(struct folio *folio);
643  static inline void folio_undo_large_rmappable(struct folio *folio)
644  {
645  	if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio))
646  		return;
647  
648  	/*
649  	 * At this point, there is no one trying to add the folio to
650  	 * deferred_list. If folio is not in deferred_list, it's safe
651  	 * to check without acquiring the split_queue_lock.
652  	 */
653  	if (data_race(list_empty(&folio->_deferred_list)))
654  		return;
655  
656  	__folio_undo_large_rmappable(folio);
657  }
658  
659  static inline struct folio *page_rmappable_folio(struct page *page)
660  {
661  	struct folio *folio = (struct folio *)page;
662  
663  	if (folio && folio_test_large(folio))
664  		folio_set_large_rmappable(folio);
665  	return folio;
666  }
667  
668  static inline void prep_compound_head(struct page *page, unsigned int order)
669  {
670  	struct folio *folio = (struct folio *)page;
671  
672  	folio_set_order(folio, order);
673  	atomic_set(&folio->_large_mapcount, -1);
674  	atomic_set(&folio->_entire_mapcount, -1);
675  	atomic_set(&folio->_nr_pages_mapped, 0);
676  	atomic_set(&folio->_pincount, 0);
677  	if (order > 1)
678  		INIT_LIST_HEAD(&folio->_deferred_list);
679  }
680  
681  static inline void prep_compound_tail(struct page *head, int tail_idx)
682  {
683  	struct page *p = head + tail_idx;
684  
685  	p->mapping = TAIL_MAPPING;
686  	set_compound_head(p, head);
687  	set_page_private(p, 0);
688  }
689  
690  extern void prep_compound_page(struct page *page, unsigned int order);
691  
692  extern void post_alloc_hook(struct page *page, unsigned int order,
693  					gfp_t gfp_flags);
694  extern bool free_pages_prepare(struct page *page, unsigned int order);
695  
696  extern int user_min_free_kbytes;
697  
698  void free_unref_page(struct page *page, unsigned int order);
699  void free_unref_folios(struct folio_batch *fbatch);
700  
701  extern void zone_pcp_reset(struct zone *zone);
702  extern void zone_pcp_disable(struct zone *zone);
703  extern void zone_pcp_enable(struct zone *zone);
704  extern void zone_pcp_init(struct zone *zone);
705  
706  extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
707  			  phys_addr_t min_addr,
708  			  int nid, bool exact_nid);
709  
710  void memmap_init_range(unsigned long, int, unsigned long, unsigned long,
711  		unsigned long, enum meminit_context, struct vmem_altmap *, int);
712  
713  #if defined CONFIG_COMPACTION || defined CONFIG_CMA
714  
715  /*
716   * in mm/compaction.c
717   */
718  /*
719   * compact_control is used to track pages being migrated and the free pages
720   * they are being migrated to during memory compaction. The free_pfn starts
721   * at the end of a zone and migrate_pfn begins at the start. Movable pages
722   * are moved to the end of a zone during a compaction run and the run
723   * completes when free_pfn <= migrate_pfn
724   */
725  struct compact_control {
726  	struct list_head freepages[NR_PAGE_ORDERS];	/* List of free pages to migrate to */
727  	struct list_head migratepages;	/* List of pages being migrated */
728  	unsigned int nr_freepages;	/* Number of isolated free pages */
729  	unsigned int nr_migratepages;	/* Number of pages to migrate */
730  	unsigned long free_pfn;		/* isolate_freepages search base */
731  	/*
732  	 * Acts as an in/out parameter to page isolation for migration.
733  	 * isolate_migratepages uses it as a search base.
734  	 * isolate_migratepages_block will update the value to the next pfn
735  	 * after the last isolated one.
736  	 */
737  	unsigned long migrate_pfn;
738  	unsigned long fast_start_pfn;	/* a pfn to start linear scan from */
739  	struct zone *zone;
740  	unsigned long total_migrate_scanned;
741  	unsigned long total_free_scanned;
742  	unsigned short fast_search_fail;/* failures to use free list searches */
743  	short search_order;		/* order to start a fast search at */
744  	const gfp_t gfp_mask;		/* gfp mask of a direct compactor */
745  	int order;			/* order a direct compactor needs */
746  	int migratetype;		/* migratetype of direct compactor */
747  	const unsigned int alloc_flags;	/* alloc flags of a direct compactor */
748  	const int highest_zoneidx;	/* zone index of a direct compactor */
749  	enum migrate_mode mode;		/* Async or sync migration mode */
750  	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
751  	bool no_set_skip_hint;		/* Don't mark blocks for skipping */
752  	bool ignore_block_suitable;	/* Scan blocks considered unsuitable */
753  	bool direct_compaction;		/* False from kcompactd or /proc/... */
754  	bool proactive_compaction;	/* kcompactd proactive compaction */
755  	bool whole_zone;		/* Whole zone should/has been scanned */
756  	bool contended;			/* Signal lock contention */
757  	bool finish_pageblock;		/* Scan the remainder of a pageblock. Used
758  					 * when there are potentially transient
759  					 * isolation or migration failures to
760  					 * ensure forward progress.
761  					 */
762  	bool alloc_contig;		/* alloc_contig_range allocation */
763  };
764  
765  /*
766   * Used in direct compaction when a page should be taken from the freelists
767   * immediately when one is created during the free path.
768   */
769  struct capture_control {
770  	struct compact_control *cc;
771  	struct page *page;
772  };
773  
774  unsigned long
775  isolate_freepages_range(struct compact_control *cc,
776  			unsigned long start_pfn, unsigned long end_pfn);
777  int
778  isolate_migratepages_range(struct compact_control *cc,
779  			   unsigned long low_pfn, unsigned long end_pfn);
780  
781  int __alloc_contig_migrate_range(struct compact_control *cc,
782  					unsigned long start, unsigned long end,
783  					int migratetype);
784  
785  /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
786  void init_cma_reserved_pageblock(struct page *page);
787  
788  #endif /* CONFIG_COMPACTION || CONFIG_CMA */
789  
790  int find_suitable_fallback(struct free_area *area, unsigned int order,
791  			int migratetype, bool only_stealable, bool *can_steal);
792  
793  static inline bool free_area_empty(struct free_area *area, int migratetype)
794  {
795  	return list_empty(&area->free_list[migratetype]);
796  }
797  
798  /* mm/util.c */
799  struct anon_vma *folio_anon_vma(struct folio *folio);
800  
801  #ifdef CONFIG_MMU
802  void unmap_mapping_folio(struct folio *folio);
803  extern long populate_vma_page_range(struct vm_area_struct *vma,
804  		unsigned long start, unsigned long end, int *locked);
805  extern long faultin_page_range(struct mm_struct *mm, unsigned long start,
806  		unsigned long end, bool write, int *locked);
807  extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
808  			       unsigned long bytes);
809  
810  /*
811   * NOTE: This function can't tell whether the folio is "fully mapped" in the
812   * range.
813   * "fully mapped" means all the pages of folio is associated with the page
814   * table of range while this function just check whether the folio range is
815   * within the range [start, end). Function caller needs to do page table
816   * check if it cares about the page table association.
817   *
818   * Typical usage (like mlock or madvise) is:
819   * Caller knows at least 1 page of folio is associated with page table of VMA
820   * and the range [start, end) is intersect with the VMA range. Caller wants
821   * to know whether the folio is fully associated with the range. It calls
822   * this function to check whether the folio is in the range first. Then checks
823   * the page table to know whether the folio is fully mapped to the range.
824   */
825  static inline bool
826  folio_within_range(struct folio *folio, struct vm_area_struct *vma,
827  		unsigned long start, unsigned long end)
828  {
829  	pgoff_t pgoff, addr;
830  	unsigned long vma_pglen = vma_pages(vma);
831  
832  	VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio);
833  	if (start > end)
834  		return false;
835  
836  	if (start < vma->vm_start)
837  		start = vma->vm_start;
838  
839  	if (end > vma->vm_end)
840  		end = vma->vm_end;
841  
842  	pgoff = folio_pgoff(folio);
843  
844  	/* if folio start address is not in vma range */
845  	if (!in_range(pgoff, vma->vm_pgoff, vma_pglen))
846  		return false;
847  
848  	addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
849  
850  	return !(addr < start || end - addr < folio_size(folio));
851  }
852  
853  static inline bool
854  folio_within_vma(struct folio *folio, struct vm_area_struct *vma)
855  {
856  	return folio_within_range(folio, vma, vma->vm_start, vma->vm_end);
857  }
858  
859  /*
860   * mlock_vma_folio() and munlock_vma_folio():
861   * should be called with vma's mmap_lock held for read or write,
862   * under page table lock for the pte/pmd being added or removed.
863   *
864   * mlock is usually called at the end of folio_add_*_rmap_*(), munlock at
865   * the end of folio_remove_rmap_*(); but new anon folios are managed by
866   * folio_add_lru_vma() calling mlock_new_folio().
867   */
868  void mlock_folio(struct folio *folio);
869  static inline void mlock_vma_folio(struct folio *folio,
870  				struct vm_area_struct *vma)
871  {
872  	/*
873  	 * The VM_SPECIAL check here serves two purposes.
874  	 * 1) VM_IO check prevents migration from double-counting during mlock.
875  	 * 2) Although mmap_region() and mlock_fixup() take care that VM_LOCKED
876  	 *    is never left set on a VM_SPECIAL vma, there is an interval while
877  	 *    file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may
878  	 *    still be set while VM_SPECIAL bits are added: so ignore it then.
879  	 */
880  	if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED))
881  		mlock_folio(folio);
882  }
883  
884  void munlock_folio(struct folio *folio);
885  static inline void munlock_vma_folio(struct folio *folio,
886  					struct vm_area_struct *vma)
887  {
888  	/*
889  	 * Munlock whenever this function is called. Ideally, we should
890  	 * only munlock when a page of the folio is unmapped from the VMA,
891  	 * leaving the folio not fully mapped to the VMA.
892  	 *
893  	 * But it is not easy to confirm that is the case, so we always
894  	 * munlock the folio and let page reclaim correct it if that was
895  	 * wrong.
896  	 */
897  	if (unlikely(vma->vm_flags & VM_LOCKED))
898  		munlock_folio(folio);
899  }
900  
901  void mlock_new_folio(struct folio *folio);
902  bool need_mlock_drain(int cpu);
903  void mlock_drain_local(void);
904  void mlock_drain_remote(int cpu);
905  
906  extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
907  
908  /**
909   * vma_address - Find the virtual address a page range is mapped at
910   * @vma: The vma which maps this object.
911   * @pgoff: The page offset within its object.
912   * @nr_pages: The number of pages to consider.
913   *
914   * If any page in this range is mapped by this VMA, return the first address
915   * where any of these pages appear.  Otherwise, return -EFAULT.
916   */
917  static inline unsigned long vma_address(struct vm_area_struct *vma,
918  		pgoff_t pgoff, unsigned long nr_pages)
919  {
920  	unsigned long address;
921  
922  	if (pgoff >= vma->vm_pgoff) {
923  		address = vma->vm_start +
924  			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
925  		/* Check for address beyond vma (or wrapped through 0?) */
926  		if (address < vma->vm_start || address >= vma->vm_end)
927  			address = -EFAULT;
928  	} else if (pgoff + nr_pages - 1 >= vma->vm_pgoff) {
929  		/* Test above avoids possibility of wrap to 0 on 32-bit */
930  		address = vma->vm_start;
931  	} else {
932  		address = -EFAULT;
933  	}
934  	return address;
935  }
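
/*
 * Worked example (standalone, made-up numbers): for a VMA with
 * vm_start == 0x70000000 and vm_pgoff == 0x10, a page at pgoff 0x13 maps at
 * vm_start + ((pgoff - vm_pgoff) << PAGE_SHIFT):
 *
 *	#include <assert.h>
 *
 *	int main(void)
 *	{
 *		unsigned long vm_start = 0x70000000UL, vm_pgoff = 0x10;
 *		unsigned long pgoff = 0x13, page_shift = 12;	// 4kB pages assumed
 *
 *		assert(vm_start + ((pgoff - vm_pgoff) << page_shift) == 0x70003000UL);
 *		return 0;
 *	}
 */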
936  
937  /*
938   * Then at what user virtual address will none of the range be found in vma?
939   * Assumes that vma_address() already returned a good starting address.
940   */
941  static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw)
942  {
943  	struct vm_area_struct *vma = pvmw->vma;
944  	pgoff_t pgoff;
945  	unsigned long address;
946  
947  	/* Common case, plus ->pgoff is invalid for KSM */
948  	if (pvmw->nr_pages == 1)
949  		return pvmw->address + PAGE_SIZE;
950  
951  	pgoff = pvmw->pgoff + pvmw->nr_pages;
952  	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
953  	/* Check for address beyond vma (or wrapped through 0?) */
954  	if (address < vma->vm_start || address > vma->vm_end)
955  		address = vma->vm_end;
956  	return address;
957  }
958  
959  static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
960  						    struct file *fpin)
961  {
962  	int flags = vmf->flags;
963  
964  	if (fpin)
965  		return fpin;
966  
967  	/*
968  	 * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
969  	 * anything, so we only pin the file and drop the mmap_lock if this is
970  	 * the first attempt and FAULT_FLAG_ALLOW_RETRY is set but not RETRY_NOWAIT.
971  	 */
972  	if (fault_flag_allow_retry_first(flags) &&
973  	    !(flags & FAULT_FLAG_RETRY_NOWAIT)) {
974  		fpin = get_file(vmf->vma->vm_file);
975  		release_fault_lock(vmf);
976  	}
977  	return fpin;
978  }
979  #else /* !CONFIG_MMU */
980  static inline void unmap_mapping_folio(struct folio *folio) { }
981  static inline void mlock_new_folio(struct folio *folio) { }
982  static inline bool need_mlock_drain(int cpu) { return false; }
983  static inline void mlock_drain_local(void) { }
984  static inline void mlock_drain_remote(int cpu) { }
985  static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
986  {
987  }
988  #endif /* !CONFIG_MMU */
989  
990  /* Memory initialisation debug and verification */
991  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
992  DECLARE_STATIC_KEY_TRUE(deferred_pages);
993  
994  bool __init deferred_grow_zone(struct zone *zone, unsigned int order);
995  #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
996  
997  enum mminit_level {
998  	MMINIT_WARNING,
999  	MMINIT_VERIFY,
1000  	MMINIT_TRACE
1001  };
1002  
1003  #ifdef CONFIG_DEBUG_MEMORY_INIT
1004  
1005  extern int mminit_loglevel;
1006  
1007  #define mminit_dprintk(level, prefix, fmt, arg...) \
1008  do { \
1009  	if (level < mminit_loglevel) { \
1010  		if (level <= MMINIT_WARNING) \
1011  			pr_warn("mminit::" prefix " " fmt, ##arg);	\
1012  		else \
1013  			printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
1014  	} \
1015  } while (0)
1016  
1017  extern void mminit_verify_pageflags_layout(void);
1018  extern void mminit_verify_zonelist(void);
1019  #else
1020  
1021  static inline void mminit_dprintk(enum mminit_level level,
1022  				const char *prefix, const char *fmt, ...)
1023  {
1024  }
1025  
1026  static inline void mminit_verify_pageflags_layout(void)
1027  {
1028  }
1029  
1030  static inline void mminit_verify_zonelist(void)
1031  {
1032  }
1033  #endif /* CONFIG_DEBUG_MEMORY_INIT */
1034  
1035  #define NODE_RECLAIM_NOSCAN	-2
1036  #define NODE_RECLAIM_FULL	-1
1037  #define NODE_RECLAIM_SOME	0
1038  #define NODE_RECLAIM_SUCCESS	1
1039  
1040  #ifdef CONFIG_NUMA
1041  extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
1042  extern int find_next_best_node(int node, nodemask_t *used_node_mask);
1043  #else
1044  static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
1045  				unsigned int order)
1046  {
1047  	return NODE_RECLAIM_NOSCAN;
1048  }
1049  static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
1050  {
1051  	return NUMA_NO_NODE;
1052  }
1053  #endif
1054  
1055  /*
1056   * mm/memory-failure.c
1057   */
1058  #ifdef CONFIG_MEMORY_FAILURE
1059  void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu);
1060  void shake_folio(struct folio *folio);
1061  extern int hwpoison_filter(struct page *p);
1062  
1063  extern u32 hwpoison_filter_dev_major;
1064  extern u32 hwpoison_filter_dev_minor;
1065  extern u64 hwpoison_filter_flags_mask;
1066  extern u64 hwpoison_filter_flags_value;
1067  extern u64 hwpoison_filter_memcg;
1068  extern u32 hwpoison_filter_enable;
1069  #define MAGIC_HWPOISON	0x48575053U	/* HWPS */
1070  void SetPageHWPoisonTakenOff(struct page *page);
1071  void ClearPageHWPoisonTakenOff(struct page *page);
1072  bool take_page_off_buddy(struct page *page);
1073  bool put_page_back_buddy(struct page *page);
1074  struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
1075  void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
1076  		     struct vm_area_struct *vma, struct list_head *to_kill,
1077  		     unsigned long ksm_addr);
1078  unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
1079  
1080  #else
1081  static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
1082  {
1083  }
1084  #endif
1085  
1086  extern unsigned long  __must_check vm_mmap_pgoff(struct file *, unsigned long,
1087          unsigned long, unsigned long,
1088          unsigned long, unsigned long);
1089  
1090  extern void set_pageblock_order(void);
1091  struct folio *alloc_migrate_folio(struct folio *src, unsigned long private);
1092  unsigned long reclaim_pages(struct list_head *folio_list);
1093  unsigned int reclaim_clean_pages_from_list(struct zone *zone,
1094  					    struct list_head *folio_list);
1095  /* The ALLOC_WMARK bits are used as an index to zone->watermark */
1096  #define ALLOC_WMARK_MIN		WMARK_MIN
1097  #define ALLOC_WMARK_LOW		WMARK_LOW
1098  #define ALLOC_WMARK_HIGH	WMARK_HIGH
1099  #define ALLOC_NO_WATERMARKS	0x04 /* don't check watermarks at all */
1100  
1101  /* Mask to get the watermark bits */
1102  #define ALLOC_WMARK_MASK	(ALLOC_NO_WATERMARKS-1)
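
/*
 * Illustration (values follow from the definitions above): since
 * ALLOC_NO_WATERMARKS is 0x04, the mask is 0x03 and extracts the WMARK_*
 * index from a combined alloc_flags value:
 *
 *	unsigned int alloc_flags = ALLOC_WMARK_LOW | ALLOC_CPUSET;
 *
 *	// alloc_flags & ALLOC_WMARK_MASK == WMARK_LOW, usable as an
 *	// index into the zone watermarks
 */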
1103  
1104  /*
1105   * Only MMU archs have async OOM victim reclaim (aka the oom_reaper), so we
1106   * cannot assume that reduced access to memory reserves is sufficient for
1107   * !MMU.
1108   */
1109  #ifdef CONFIG_MMU
1110  #define ALLOC_OOM		0x08
1111  #else
1112  #define ALLOC_OOM		ALLOC_NO_WATERMARKS
1113  #endif
1114  
1115  #define ALLOC_NON_BLOCK		 0x10 /* Caller cannot block. Allow access
1116  				       * to 25% of the min watermark or
1117  				       * 62.5% if __GFP_HIGH is set.
1118  				       */
1119  #define ALLOC_MIN_RESERVE	 0x20 /* __GFP_HIGH set. Allow access to 50%
1120  				       * of the min watermark.
1121  				       */
1122  #define ALLOC_CPUSET		 0x40 /* check for correct cpuset */
1123  #define ALLOC_CMA		 0x80 /* allow allocations from CMA areas */
1124  #ifdef CONFIG_ZONE_DMA32
1125  #define ALLOC_NOFRAGMENT	0x100 /* avoid mixing pageblock types */
1126  #else
1127  #define ALLOC_NOFRAGMENT	  0x0
1128  #endif
1129  #define ALLOC_HIGHATOMIC	0x200 /* Allows access to MIGRATE_HIGHATOMIC */
1130  #define ALLOC_KSWAPD		0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
1131  
1132  /* Flags that allow allocations below the min watermark. */
1133  #define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM)
1134  
1135  enum ttu_flags;
1136  struct tlbflush_unmap_batch;
1137  
1138  
1139  /*
1140   * only for MM internal work items which do not depend on
1141   * any allocations or locks which might depend on allocations
1142   */
1143  extern struct workqueue_struct *mm_percpu_wq;
1144  
1145  #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
1146  void try_to_unmap_flush(void);
1147  void try_to_unmap_flush_dirty(void);
1148  void flush_tlb_batched_pending(struct mm_struct *mm);
1149  #else
1150  static inline void try_to_unmap_flush(void)
1151  {
1152  }
1153  static inline void try_to_unmap_flush_dirty(void)
1154  {
1155  }
1156  static inline void flush_tlb_batched_pending(struct mm_struct *mm)
1157  {
1158  }
1159  #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
1160  
1161  extern const struct trace_print_flags pageflag_names[];
1162  extern const struct trace_print_flags vmaflag_names[];
1163  extern const struct trace_print_flags gfpflag_names[];
1164  
1165  static inline bool is_migrate_highatomic(enum migratetype migratetype)
1166  {
1167  	return migratetype == MIGRATE_HIGHATOMIC;
1168  }
1169  
1170  void setup_zone_pageset(struct zone *zone);
1171  
1172  struct migration_target_control {
1173  	int nid;		/* preferred node id */
1174  	nodemask_t *nmask;
1175  	gfp_t gfp_mask;
1176  	enum migrate_reason reason;
1177  };
1178  
1179  /*
1180   * mm/filemap.c
1181   */
1182  size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
1183  			      struct folio *folio, loff_t fpos, size_t size);
1184  
1185  /*
1186   * mm/vmalloc.c
1187   */
1188  #ifdef CONFIG_MMU
1189  void __init vmalloc_init(void);
1190  int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
1191                  pgprot_t prot, struct page **pages, unsigned int page_shift);
1192  #else
1193  static inline void vmalloc_init(void)
1194  {
1195  }
1196  
1197  static inline
1198  int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
1199                  pgprot_t prot, struct page **pages, unsigned int page_shift)
1200  {
1201  	return -EINVAL;
1202  }
1203  #endif
1204  
1205  int __must_check __vmap_pages_range_noflush(unsigned long addr,
1206  			       unsigned long end, pgprot_t prot,
1207  			       struct page **pages, unsigned int page_shift);
1208  
1209  void vunmap_range_noflush(unsigned long start, unsigned long end);
1210  
1211  void __vunmap_range_noflush(unsigned long start, unsigned long end);
1212  
1213  int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
1214  		      unsigned long addr, int *flags, bool writable,
1215  		      int *last_cpupid);
1216  
1217  void free_zone_device_folio(struct folio *folio);
1218  int migrate_device_coherent_folio(struct folio *folio);
1219  
1220  /*
1221   * mm/gup.c
1222   */
1223  int __must_check try_grab_folio(struct folio *folio, int refs,
1224  				unsigned int flags);
1225  
1226  /*
1227   * mm/huge_memory.c
1228   */
1229  void touch_pud(struct vm_area_struct *vma, unsigned long addr,
1230  	       pud_t *pud, bool write);
1231  void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
1232  	       pmd_t *pmd, bool write);
1233  
1234  enum {
1235  	/* mark page accessed */
1236  	FOLL_TOUCH = 1 << 16,
1237  	/* a retry, previous pass started an IO */
1238  	FOLL_TRIED = 1 << 17,
1239  	/* we are working on non-current tsk/mm */
1240  	FOLL_REMOTE = 1 << 18,
1241  	/* pages must be released via unpin_user_page */
1242  	FOLL_PIN = 1 << 19,
1243  	/* gup_fast: prevent fall-back to slow gup */
1244  	FOLL_FAST_ONLY = 1 << 20,
1245  	/* allow unlocking the mmap lock */
1246  	FOLL_UNLOCKABLE = 1 << 21,
1247  	/* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */
1248  	FOLL_MADV_POPULATE = 1 << 22,
1249  };
1250  
1251  #define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \
1252  			    FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \
1253  			    FOLL_MADV_POPULATE)
1254  
1255  /*
1256   * Indicates whether, for pages that are write-protected in the page table,
1257   * GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the
1258   * GUP pin will remain consistent with the pages mapped into the page tables
1259   * of the MM.
1260   *
1261   * Temporary unmapping of PageAnonExclusive() pages or clearing of
1262   * PageAnonExclusive() has to protect against concurrent GUP:
1263   * * Ordinary GUP: Using the PT lock
1264   * * GUP-fast and fork(): mm->write_protect_seq
1265   * * GUP-fast and KSM or temporary unmapping (swap, migration): see
1266   *    folio_try_share_anon_rmap_*()
1267   *
1268   * Must be called with the (sub)page that's actually referenced via the
1269   * page table entry, which might not necessarily be the head page for a
1270   * PTE-mapped THP.
1271   *
1272   * If the vma is NULL, we're coming from the GUP-fast path and might have
1273   * to fallback to the slow path just to lookup the vma.
1274   */
1275  static inline bool gup_must_unshare(struct vm_area_struct *vma,
1276  				    unsigned int flags, struct page *page)
1277  {
1278  	/*
1279  	 * FOLL_WRITE is implicitly handled correctly as the page table entry
1280  	 * has to be writable -- and if it references (part of) an anonymous
1281  	 * folio, that part is required to be marked exclusive.
1282  	 */
1283  	if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN)
1284  		return false;
1285  	/*
1286  	 * Note: PageAnon(page) is stable until the page is actually getting
1287  	 * freed.
1288  	 */
1289  	if (!PageAnon(page)) {
1290  		/*
1291  		 * We only care about R/O long-term pinning: R/O short-term
1292  		 * pinning does not have the semantics to observe successive
1293  		 * changes through the process page tables.
1294  		 */
1295  		if (!(flags & FOLL_LONGTERM))
1296  			return false;
1297  
1298  		/* We really need the vma ... */
1299  		if (!vma)
1300  			return true;
1301  
1302  		/*
1303  		 * ... because we only care about writable private ("COW")
1304  		 * mappings where we have to break COW early.
1305  		 */
1306  		return is_cow_mapping(vma->vm_flags);
1307  	}
1308  
1309  	/* Paired with a memory barrier in folio_try_share_anon_rmap_*(). */
1310  	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
1311  		smp_rmb();
1312  
1313  	/*
1314  	 * Note that PageKsm() pages cannot be exclusive, and consequently,
1315  	 * cannot get pinned.
1316  	 */
1317  	return !PageAnonExclusive(page);
1318  }
1319  
1320  extern bool mirrored_kernelcore;
1321  extern bool memblock_has_mirror(void);
1322  
1323  static __always_inline void vma_set_range(struct vm_area_struct *vma,
1324  					  unsigned long start, unsigned long end,
1325  					  pgoff_t pgoff)
1326  {
1327  	vma->vm_start = start;
1328  	vma->vm_end = end;
1329  	vma->vm_pgoff = pgoff;
1330  }
1331  
1332  static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
1333  {
1334  	/*
1335  	 * NOTE: we must check this before testing VM_SOFTDIRTY for soft-dirty
1336  	 * enablement, because when soft-dirty is not compiled in,
1337  	 * VM_SOFTDIRTY is defined as 0x0, so !(vm_flags & VM_SOFTDIRTY)
1338  	 * would always be true.
1339  	 */
1340  	if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
1341  		return false;
1342  
1343  	/*
1344  	 * Soft-dirty is kind of special: its tracking is enabled when the
1345  	 * VM_SOFTDIRTY vma flag is *not* set.
1346  	 */
1347  	return !(vma->vm_flags & VM_SOFTDIRTY);
1348  }
1349  
1350  static inline bool pmd_needs_soft_dirty_wp(struct vm_area_struct *vma, pmd_t pmd)
1351  {
1352  	return vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd);
1353  }
1354  
1355  static inline bool pte_needs_soft_dirty_wp(struct vm_area_struct *vma, pte_t pte)
1356  {
1357  	return vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte);
1358  }
1359  
1360  void __meminit __init_single_page(struct page *page, unsigned long pfn,
1361  				unsigned long zone, int nid);
1362  
1363  /* shrinker related functions */
1364  unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
1365  			  int priority);
1366  
1367  #ifdef CONFIG_64BIT
1368  static inline int can_do_mseal(unsigned long flags)
1369  {
1370  	if (flags)
1371  		return -EINVAL;
1372  
1373  	return 0;
1374  }
1375  
1376  #else
1377  static inline int can_do_mseal(unsigned long flags)
1378  {
1379  	return -EPERM;
1380  }
1381  #endif
1382  
1383  #ifdef CONFIG_SHRINKER_DEBUG
1384  static inline __printf(2, 0) int shrinker_debugfs_name_alloc(
1385  			struct shrinker *shrinker, const char *fmt, va_list ap)
1386  {
1387  	shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
1388  
1389  	return shrinker->name ? 0 : -ENOMEM;
1390  }
1391  
1392  static inline void shrinker_debugfs_name_free(struct shrinker *shrinker)
1393  {
1394  	kfree_const(shrinker->name);
1395  	shrinker->name = NULL;
1396  }
1397  
1398  extern int shrinker_debugfs_add(struct shrinker *shrinker);
1399  extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
1400  					      int *debugfs_id);
1401  extern void shrinker_debugfs_remove(struct dentry *debugfs_entry,
1402  				    int debugfs_id);
1403  #else /* CONFIG_SHRINKER_DEBUG */
1404  static inline int shrinker_debugfs_add(struct shrinker *shrinker)
1405  {
1406  	return 0;
1407  }
1408  static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker,
1409  					      const char *fmt, va_list ap)
1410  {
1411  	return 0;
1412  }
1413  static inline void shrinker_debugfs_name_free(struct shrinker *shrinker)
1414  {
1415  }
1416  static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
1417  						     int *debugfs_id)
1418  {
1419  	*debugfs_id = -1;
1420  	return NULL;
1421  }
1422  static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
1423  					   int debugfs_id)
1424  {
1425  }
1426  #endif /* CONFIG_SHRINKER_DEBUG */
1427  
1428  /* Only track the nodes of mappings with shadow entries */
1429  void workingset_update_node(struct xa_node *node);
1430  extern struct list_lru shadow_nodes;
1431  
1432  /* mremap.c */
1433  unsigned long move_page_tables(struct vm_area_struct *vma,
1434  	unsigned long old_addr, struct vm_area_struct *new_vma,
1435  	unsigned long new_addr, unsigned long len,
1436  	bool need_rmap_locks, bool for_stack);
1437  
1438  #ifdef CONFIG_UNACCEPTED_MEMORY
1439  void accept_page(struct page *page);
1440  #else /* CONFIG_UNACCEPTED_MEMORY */
1441  static inline void accept_page(struct page *page)
1442  {
1443  }
1444  #endif /* CONFIG_UNACCEPTED_MEMORY */
1445  
1446  #endif	/* __MM_INTERNAL_H */
1447