xref: /linux/mm/madvise.c (revision 868ddfcef31ff93ea8961b2e81ea7fe12f6f144b)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   *	linux/mm/madvise.c
4   *
5   * Copyright (C) 1999  Linus Torvalds
6   * Copyright (C) 2002  Christoph Hellwig
7   */
8  
9  #include <linux/mman.h>
10  #include <linux/pagemap.h>
11  #include <linux/syscalls.h>
12  #include <linux/mempolicy.h>
13  #include <linux/page-isolation.h>
14  #include <linux/page_idle.h>
15  #include <linux/userfaultfd_k.h>
16  #include <linux/hugetlb.h>
17  #include <linux/falloc.h>
18  #include <linux/fadvise.h>
19  #include <linux/sched.h>
20  #include <linux/sched/mm.h>
21  #include <linux/uio.h>
22  #include <linux/ksm.h>
23  #include <linux/fs.h>
24  #include <linux/file.h>
25  #include <linux/blkdev.h>
26  #include <linux/backing-dev.h>
27  #include <linux/pagewalk.h>
28  #include <linux/swap.h>
29  #include <linux/swapops.h>
30  #include <linux/shmem_fs.h>
31  #include <linux/mmu_notifier.h>
32  
33  #include <asm/tlb.h>
34  
35  #include "internal.h"
36  
37  struct madvise_walk_private {
38  	struct mmu_gather *tlb;
39  	bool pageout;
40  };
41  
42  /*
43   * Any behaviour which results in changes to the vma->vm_flags needs to
44   * take mmap_lock for writing. Others, which simply traverse vmas, need
45   * to only take it for reading.
46   */
47  static int madvise_need_mmap_write(int behavior)
48  {
49  	switch (behavior) {
50  	case MADV_REMOVE:
51  	case MADV_WILLNEED:
52  	case MADV_DONTNEED:
53  	case MADV_COLD:
54  	case MADV_PAGEOUT:
55  	case MADV_FREE:
56  	case MADV_POPULATE_READ:
57  	case MADV_POPULATE_WRITE:
58  		return 0;
59  	default:
60  		/* be safe, default to 1. list exceptions explicitly */
61  		return 1;
62  	}
63  }
64  
65  /*
66   * We can potentially split a vm area into separate
67   * areas, each area with its own behavior.
68   */
69  static long madvise_behavior(struct vm_area_struct *vma,
70  		     struct vm_area_struct **prev,
71  		     unsigned long start, unsigned long end, int behavior)
72  {
73  	struct mm_struct *mm = vma->vm_mm;
74  	int error = 0;
75  	pgoff_t pgoff;
76  	unsigned long new_flags = vma->vm_flags;
77  
78  	switch (behavior) {
79  	case MADV_NORMAL:
80  		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
81  		break;
82  	case MADV_SEQUENTIAL:
83  		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
84  		break;
85  	case MADV_RANDOM:
86  		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
87  		break;
88  	case MADV_DONTFORK:
89  		new_flags |= VM_DONTCOPY;
90  		break;
91  	case MADV_DOFORK:
92  		if (vma->vm_flags & VM_IO) {
93  			error = -EINVAL;
94  			goto out;
95  		}
96  		new_flags &= ~VM_DONTCOPY;
97  		break;
98  	case MADV_WIPEONFORK:
99  		/* MADV_WIPEONFORK is only supported on anonymous memory. */
100  		if (vma->vm_file || vma->vm_flags & VM_SHARED) {
101  			error = -EINVAL;
102  			goto out;
103  		}
104  		new_flags |= VM_WIPEONFORK;
105  		break;
106  	case MADV_KEEPONFORK:
107  		new_flags &= ~VM_WIPEONFORK;
108  		break;
109  	case MADV_DONTDUMP:
110  		new_flags |= VM_DONTDUMP;
111  		break;
112  	case MADV_DODUMP:
113  		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
114  			error = -EINVAL;
115  			goto out;
116  		}
117  		new_flags &= ~VM_DONTDUMP;
118  		break;
119  	case MADV_MERGEABLE:
120  	case MADV_UNMERGEABLE:
121  		error = ksm_madvise(vma, start, end, behavior, &new_flags);
122  		if (error)
123  			goto out_convert_errno;
124  		break;
125  	case MADV_HUGEPAGE:
126  	case MADV_NOHUGEPAGE:
127  		error = hugepage_madvise(vma, &new_flags, behavior);
128  		if (error)
129  			goto out_convert_errno;
130  		break;
131  	}
132  
133  	if (new_flags == vma->vm_flags) {
134  		*prev = vma;
135  		goto out;
136  	}
137  
138  	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
139  	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
140  			  vma->vm_file, pgoff, vma_policy(vma),
141  			  vma->vm_userfaultfd_ctx);
142  	if (*prev) {
143  		vma = *prev;
144  		goto success;
145  	}
146  
147  	*prev = vma;
148  
149  	if (start != vma->vm_start) {
150  		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
151  			error = -ENOMEM;
152  			goto out;
153  		}
154  		error = __split_vma(mm, vma, start, 1);
155  		if (error)
156  			goto out_convert_errno;
157  	}
158  
159  	if (end != vma->vm_end) {
160  		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
161  			error = -ENOMEM;
162  			goto out;
163  		}
164  		error = __split_vma(mm, vma, end, 0);
165  		if (error)
166  			goto out_convert_errno;
167  	}
168  
169  success:
170  	/*
171  	 * vm_flags is protected by the mmap_lock held in write mode.
172  	 */
173  	vma->vm_flags = new_flags;
174  
175  out_convert_errno:
176  	/*
177  	 * madvise() returns EAGAIN if kernel resources, such as
178  	 * slab, are temporarily unavailable.
179  	 */
180  	if (error == -ENOMEM)
181  		error = -EAGAIN;
182  out:
183  	return error;
184  }
185  
186  #ifdef CONFIG_SWAP
187  static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
188  	unsigned long end, struct mm_walk *walk)
189  {
190  	pte_t *orig_pte;
191  	struct vm_area_struct *vma = walk->private;
192  	unsigned long index;
193  
194  	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
195  		return 0;
196  
197  	for (index = start; index != end; index += PAGE_SIZE) {
198  		pte_t pte;
199  		swp_entry_t entry;
200  		struct page *page;
201  		spinlock_t *ptl;
202  
203  		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
204  		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
205  		pte_unmap_unlock(orig_pte, ptl);
206  
207  		if (pte_present(pte) || pte_none(pte))
208  			continue;
209  		entry = pte_to_swp_entry(pte);
210  		if (unlikely(non_swap_entry(entry)))
211  			continue;
212  
213  		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
214  							vma, index, false);
215  		if (page)
216  			put_page(page);
217  	}
218  
219  	return 0;
220  }
221  
222  static const struct mm_walk_ops swapin_walk_ops = {
223  	.pmd_entry		= swapin_walk_pmd_entry,
224  };
225  
226  static void force_shm_swapin_readahead(struct vm_area_struct *vma,
227  		unsigned long start, unsigned long end,
228  		struct address_space *mapping)
229  {
230  	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
231  	pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
232  	struct page *page;
233  
234  	rcu_read_lock();
235  	xas_for_each(&xas, page, end_index) {
236  		swp_entry_t swap;
237  
238  		if (!xa_is_value(page))
239  			continue;
240  		xas_pause(&xas);
241  		rcu_read_unlock();
242  
243  		swap = radix_to_swp_entry(page);
244  		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
245  							NULL, 0, false);
246  		if (page)
247  			put_page(page);
248  
249  		rcu_read_lock();
250  	}
251  	rcu_read_unlock();
252  
253  	lru_add_drain();	/* Push any new pages onto the LRU now */
254  }
255  #endif		/* CONFIG_SWAP */
256  
257  /*
258   * Schedule all required I/O operations.  Do not wait for completion.
259   */
260  static long madvise_willneed(struct vm_area_struct *vma,
261  			     struct vm_area_struct **prev,
262  			     unsigned long start, unsigned long end)
263  {
264  	struct mm_struct *mm = vma->vm_mm;
265  	struct file *file = vma->vm_file;
266  	loff_t offset;
267  
268  	*prev = vma;
269  #ifdef CONFIG_SWAP
270  	if (!file) {
271  		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
272  		lru_add_drain(); /* Push any new pages onto the LRU now */
273  		return 0;
274  	}
275  
276  	if (shmem_mapping(file->f_mapping)) {
277  		force_shm_swapin_readahead(vma, start, end,
278  					file->f_mapping);
279  		return 0;
280  	}
281  #else
282  	if (!file)
283  		return -EBADF;
284  #endif
285  
286  	if (IS_DAX(file_inode(file))) {
287  		/* no bad return value, but ignore advice */
288  		return 0;
289  	}
290  
291  	/*
292  	 * Filesystem's fadvise may need to take various locks.  We need to
293  	 * explicitly grab a reference because the vma (and hence the
294  	 * vma's reference to the file) can go away as soon as we drop
295  	 * mmap_lock.
296  	 */
297  	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
298  	get_file(file);
299  	offset = (loff_t)(start - vma->vm_start)
300  			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
301  	mmap_read_unlock(mm);
302  	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
303  	fput(file);
304  	mmap_read_lock(mm);
305  	return 0;
306  }
307  
308  static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
309  				unsigned long addr, unsigned long end,
310  				struct mm_walk *walk)
311  {
312  	struct madvise_walk_private *private = walk->private;
313  	struct mmu_gather *tlb = private->tlb;
314  	bool pageout = private->pageout;
315  	struct mm_struct *mm = tlb->mm;
316  	struct vm_area_struct *vma = walk->vma;
317  	pte_t *orig_pte, *pte, ptent;
318  	spinlock_t *ptl;
319  	struct page *page = NULL;
320  	LIST_HEAD(page_list);
321  
322  	if (fatal_signal_pending(current))
323  		return -EINTR;
324  
325  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
326  	if (pmd_trans_huge(*pmd)) {
327  		pmd_t orig_pmd;
328  		unsigned long next = pmd_addr_end(addr, end);
329  
330  		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
331  		ptl = pmd_trans_huge_lock(pmd, vma);
332  		if (!ptl)
333  			return 0;
334  
335  		orig_pmd = *pmd;
336  		if (is_huge_zero_pmd(orig_pmd))
337  			goto huge_unlock;
338  
339  		if (unlikely(!pmd_present(orig_pmd))) {
340  			VM_BUG_ON(thp_migration_supported() &&
341  					!is_pmd_migration_entry(orig_pmd));
342  			goto huge_unlock;
343  		}
344  
345  		page = pmd_page(orig_pmd);
346  
347  		/* Do not interfere with other mappings of this page */
348  		if (page_mapcount(page) != 1)
349  			goto huge_unlock;
350  
351  		if (next - addr != HPAGE_PMD_SIZE) {
352  			int err;
353  
354  			get_page(page);
355  			spin_unlock(ptl);
356  			lock_page(page);
357  			err = split_huge_page(page);
358  			unlock_page(page);
359  			put_page(page);
360  			if (!err)
361  				goto regular_page;
362  			return 0;
363  		}
364  
365  		if (pmd_young(orig_pmd)) {
366  			pmdp_invalidate(vma, addr, pmd);
367  			orig_pmd = pmd_mkold(orig_pmd);
368  
369  			set_pmd_at(mm, addr, pmd, orig_pmd);
370  			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
371  		}
372  
373  		ClearPageReferenced(page);
374  		test_and_clear_page_young(page);
375  		if (pageout) {
376  			if (!isolate_lru_page(page)) {
377  				if (PageUnevictable(page))
378  					putback_lru_page(page);
379  				else
380  					list_add(&page->lru, &page_list);
381  			}
382  		} else
383  			deactivate_page(page);
384  huge_unlock:
385  		spin_unlock(ptl);
386  		if (pageout)
387  			reclaim_pages(&page_list);
388  		return 0;
389  	}
390  
391  regular_page:
392  	if (pmd_trans_unstable(pmd))
393  		return 0;
394  #endif
395  	tlb_change_page_size(tlb, PAGE_SIZE);
396  	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
397  	flush_tlb_batched_pending(mm);
398  	arch_enter_lazy_mmu_mode();
399  	for (; addr < end; pte++, addr += PAGE_SIZE) {
400  		ptent = *pte;
401  
402  		if (pte_none(ptent))
403  			continue;
404  
405  		if (!pte_present(ptent))
406  			continue;
407  
408  		page = vm_normal_page(vma, addr, ptent);
409  		if (!page)
410  			continue;
411  
412  		/*
413  		 * Creating a THP page is expensive so split it only if we
414  		 * are sure it's worth. Split it if we are only owner.
415  		 */
416  		if (PageTransCompound(page)) {
417  			if (page_mapcount(page) != 1)
418  				break;
419  			get_page(page);
420  			if (!trylock_page(page)) {
421  				put_page(page);
422  				break;
423  			}
424  			pte_unmap_unlock(orig_pte, ptl);
425  			if (split_huge_page(page)) {
426  				unlock_page(page);
427  				put_page(page);
428  				pte_offset_map_lock(mm, pmd, addr, &ptl);
429  				break;
430  			}
431  			unlock_page(page);
432  			put_page(page);
433  			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
434  			pte--;
435  			addr -= PAGE_SIZE;
436  			continue;
437  		}
438  
439  		/* Do not interfere with other mappings of this page */
440  		if (page_mapcount(page) != 1)
441  			continue;
442  
443  		VM_BUG_ON_PAGE(PageTransCompound(page), page);
444  
445  		if (pte_young(ptent)) {
446  			ptent = ptep_get_and_clear_full(mm, addr, pte,
447  							tlb->fullmm);
448  			ptent = pte_mkold(ptent);
449  			set_pte_at(mm, addr, pte, ptent);
450  			tlb_remove_tlb_entry(tlb, pte, addr);
451  		}
452  
453  		/*
454  		 * We are deactivating a page for accelerating reclaiming.
455  		 * VM couldn't reclaim the page unless we clear PG_young.
456  		 * As a side effect, it makes confuse idle-page tracking
457  		 * because they will miss recent referenced history.
458  		 */
459  		ClearPageReferenced(page);
460  		test_and_clear_page_young(page);
461  		if (pageout) {
462  			if (!isolate_lru_page(page)) {
463  				if (PageUnevictable(page))
464  					putback_lru_page(page);
465  				else
466  					list_add(&page->lru, &page_list);
467  			}
468  		} else
469  			deactivate_page(page);
470  	}
471  
472  	arch_leave_lazy_mmu_mode();
473  	pte_unmap_unlock(orig_pte, ptl);
474  	if (pageout)
475  		reclaim_pages(&page_list);
476  	cond_resched();
477  
478  	return 0;
479  }
480  
481  static const struct mm_walk_ops cold_walk_ops = {
482  	.pmd_entry = madvise_cold_or_pageout_pte_range,
483  };
484  
485  static void madvise_cold_page_range(struct mmu_gather *tlb,
486  			     struct vm_area_struct *vma,
487  			     unsigned long addr, unsigned long end)
488  {
489  	struct madvise_walk_private walk_private = {
490  		.pageout = false,
491  		.tlb = tlb,
492  	};
493  
494  	tlb_start_vma(tlb, vma);
495  	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
496  	tlb_end_vma(tlb, vma);
497  }
498  
499  static long madvise_cold(struct vm_area_struct *vma,
500  			struct vm_area_struct **prev,
501  			unsigned long start_addr, unsigned long end_addr)
502  {
503  	struct mm_struct *mm = vma->vm_mm;
504  	struct mmu_gather tlb;
505  
506  	*prev = vma;
507  	if (!can_madv_lru_vma(vma))
508  		return -EINVAL;
509  
510  	lru_add_drain();
511  	tlb_gather_mmu(&tlb, mm);
512  	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
513  	tlb_finish_mmu(&tlb);
514  
515  	return 0;
516  }
517  
518  static void madvise_pageout_page_range(struct mmu_gather *tlb,
519  			     struct vm_area_struct *vma,
520  			     unsigned long addr, unsigned long end)
521  {
522  	struct madvise_walk_private walk_private = {
523  		.pageout = true,
524  		.tlb = tlb,
525  	};
526  
527  	tlb_start_vma(tlb, vma);
528  	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
529  	tlb_end_vma(tlb, vma);
530  }
531  
532  static inline bool can_do_pageout(struct vm_area_struct *vma)
533  {
534  	if (vma_is_anonymous(vma))
535  		return true;
536  	if (!vma->vm_file)
537  		return false;
538  	/*
539  	 * paging out pagecache only for non-anonymous mappings that correspond
540  	 * to the files the calling process could (if tried) open for writing;
541  	 * otherwise we'd be including shared non-exclusive mappings, which
542  	 * opens a side channel.
543  	 */
544  	return inode_owner_or_capable(&init_user_ns,
545  				      file_inode(vma->vm_file)) ||
546  	       file_permission(vma->vm_file, MAY_WRITE) == 0;
547  }
548  
549  static long madvise_pageout(struct vm_area_struct *vma,
550  			struct vm_area_struct **prev,
551  			unsigned long start_addr, unsigned long end_addr)
552  {
553  	struct mm_struct *mm = vma->vm_mm;
554  	struct mmu_gather tlb;
555  
556  	*prev = vma;
557  	if (!can_madv_lru_vma(vma))
558  		return -EINVAL;
559  
560  	if (!can_do_pageout(vma))
561  		return 0;
562  
563  	lru_add_drain();
564  	tlb_gather_mmu(&tlb, mm);
565  	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
566  	tlb_finish_mmu(&tlb);
567  
568  	return 0;
569  }
570  
571  static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
572  				unsigned long end, struct mm_walk *walk)
573  
574  {
575  	struct mmu_gather *tlb = walk->private;
576  	struct mm_struct *mm = tlb->mm;
577  	struct vm_area_struct *vma = walk->vma;
578  	spinlock_t *ptl;
579  	pte_t *orig_pte, *pte, ptent;
580  	struct page *page;
581  	int nr_swap = 0;
582  	unsigned long next;
583  
584  	next = pmd_addr_end(addr, end);
585  	if (pmd_trans_huge(*pmd))
586  		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
587  			goto next;
588  
589  	if (pmd_trans_unstable(pmd))
590  		return 0;
591  
592  	tlb_change_page_size(tlb, PAGE_SIZE);
593  	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
594  	flush_tlb_batched_pending(mm);
595  	arch_enter_lazy_mmu_mode();
596  	for (; addr != end; pte++, addr += PAGE_SIZE) {
597  		ptent = *pte;
598  
599  		if (pte_none(ptent))
600  			continue;
601  		/*
602  		 * If the pte has swp_entry, just clear page table to
603  		 * prevent swap-in which is more expensive rather than
604  		 * (page allocation + zeroing).
605  		 */
606  		if (!pte_present(ptent)) {
607  			swp_entry_t entry;
608  
609  			entry = pte_to_swp_entry(ptent);
610  			if (non_swap_entry(entry))
611  				continue;
612  			nr_swap--;
613  			free_swap_and_cache(entry);
614  			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
615  			continue;
616  		}
617  
618  		page = vm_normal_page(vma, addr, ptent);
619  		if (!page)
620  			continue;
621  
622  		/*
623  		 * If pmd isn't transhuge but the page is THP and
624  		 * is owned by only this process, split it and
625  		 * deactivate all pages.
626  		 */
627  		if (PageTransCompound(page)) {
628  			if (page_mapcount(page) != 1)
629  				goto out;
630  			get_page(page);
631  			if (!trylock_page(page)) {
632  				put_page(page);
633  				goto out;
634  			}
635  			pte_unmap_unlock(orig_pte, ptl);
636  			if (split_huge_page(page)) {
637  				unlock_page(page);
638  				put_page(page);
639  				pte_offset_map_lock(mm, pmd, addr, &ptl);
640  				goto out;
641  			}
642  			unlock_page(page);
643  			put_page(page);
644  			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
645  			pte--;
646  			addr -= PAGE_SIZE;
647  			continue;
648  		}
649  
650  		VM_BUG_ON_PAGE(PageTransCompound(page), page);
651  
652  		if (PageSwapCache(page) || PageDirty(page)) {
653  			if (!trylock_page(page))
654  				continue;
655  			/*
656  			 * If page is shared with others, we couldn't clear
657  			 * PG_dirty of the page.
658  			 */
659  			if (page_mapcount(page) != 1) {
660  				unlock_page(page);
661  				continue;
662  			}
663  
664  			if (PageSwapCache(page) && !try_to_free_swap(page)) {
665  				unlock_page(page);
666  				continue;
667  			}
668  
669  			ClearPageDirty(page);
670  			unlock_page(page);
671  		}
672  
673  		if (pte_young(ptent) || pte_dirty(ptent)) {
674  			/*
675  			 * Some of architecture(ex, PPC) don't update TLB
676  			 * with set_pte_at and tlb_remove_tlb_entry so for
677  			 * the portability, remap the pte with old|clean
678  			 * after pte clearing.
679  			 */
680  			ptent = ptep_get_and_clear_full(mm, addr, pte,
681  							tlb->fullmm);
682  
683  			ptent = pte_mkold(ptent);
684  			ptent = pte_mkclean(ptent);
685  			set_pte_at(mm, addr, pte, ptent);
686  			tlb_remove_tlb_entry(tlb, pte, addr);
687  		}
688  		mark_page_lazyfree(page);
689  	}
690  out:
691  	if (nr_swap) {
692  		if (current->mm == mm)
693  			sync_mm_rss(mm);
694  
695  		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
696  	}
697  	arch_leave_lazy_mmu_mode();
698  	pte_unmap_unlock(orig_pte, ptl);
699  	cond_resched();
700  next:
701  	return 0;
702  }
703  
704  static const struct mm_walk_ops madvise_free_walk_ops = {
705  	.pmd_entry		= madvise_free_pte_range,
706  };
707  
708  static int madvise_free_single_vma(struct vm_area_struct *vma,
709  			unsigned long start_addr, unsigned long end_addr)
710  {
711  	struct mm_struct *mm = vma->vm_mm;
712  	struct mmu_notifier_range range;
713  	struct mmu_gather tlb;
714  
715  	/* MADV_FREE works for only anon vma at the moment */
716  	if (!vma_is_anonymous(vma))
717  		return -EINVAL;
718  
719  	range.start = max(vma->vm_start, start_addr);
720  	if (range.start >= vma->vm_end)
721  		return -EINVAL;
722  	range.end = min(vma->vm_end, end_addr);
723  	if (range.end <= vma->vm_start)
724  		return -EINVAL;
725  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
726  				range.start, range.end);
727  
728  	lru_add_drain();
729  	tlb_gather_mmu(&tlb, mm);
730  	update_hiwater_rss(mm);
731  
732  	mmu_notifier_invalidate_range_start(&range);
733  	tlb_start_vma(&tlb, vma);
734  	walk_page_range(vma->vm_mm, range.start, range.end,
735  			&madvise_free_walk_ops, &tlb);
736  	tlb_end_vma(&tlb, vma);
737  	mmu_notifier_invalidate_range_end(&range);
738  	tlb_finish_mmu(&tlb);
739  
740  	return 0;
741  }
742  
743  /*
744   * Application no longer needs these pages.  If the pages are dirty,
745   * it's OK to just throw them away.  The app will be more careful about
746   * data it wants to keep.  Be sure to free swap resources too.  The
747   * zap_page_range call sets things up for shrink_active_list to actually free
748   * these pages later if no one else has touched them in the meantime,
749   * although we could add these pages to a global reuse list for
750   * shrink_active_list to pick up before reclaiming other pages.
751   *
752   * NB: This interface discards data rather than pushes it out to swap,
753   * as some implementations do.  This has performance implications for
754   * applications like large transactional databases which want to discard
755   * pages in anonymous maps after committing to backing store the data
756   * that was kept in them.  There is no reason to write this data out to
757   * the swap area if the application is discarding it.
758   *
759   * An interface that causes the system to free clean pages and flush
760   * dirty pages is already available as msync(MS_INVALIDATE).
761   */
762  static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
763  					unsigned long start, unsigned long end)
764  {
765  	zap_page_range(vma, start, end - start);
766  	return 0;
767  }
768  
769  static long madvise_dontneed_free(struct vm_area_struct *vma,
770  				  struct vm_area_struct **prev,
771  				  unsigned long start, unsigned long end,
772  				  int behavior)
773  {
774  	struct mm_struct *mm = vma->vm_mm;
775  
776  	*prev = vma;
777  	if (!can_madv_lru_vma(vma))
778  		return -EINVAL;
779  
780  	if (!userfaultfd_remove(vma, start, end)) {
781  		*prev = NULL; /* mmap_lock has been dropped, prev is stale */
782  
783  		mmap_read_lock(mm);
784  		vma = find_vma(mm, start);
785  		if (!vma)
786  			return -ENOMEM;
787  		if (start < vma->vm_start) {
788  			/*
789  			 * This "vma" under revalidation is the one
790  			 * with the lowest vma->vm_start where start
791  			 * is also < vma->vm_end. If start <
792  			 * vma->vm_start it means an hole materialized
793  			 * in the user address space within the
794  			 * virtual range passed to MADV_DONTNEED
795  			 * or MADV_FREE.
796  			 */
797  			return -ENOMEM;
798  		}
799  		if (!can_madv_lru_vma(vma))
800  			return -EINVAL;
801  		if (end > vma->vm_end) {
802  			/*
803  			 * Don't fail if end > vma->vm_end. If the old
804  			 * vma was split while the mmap_lock was
805  			 * released the effect of the concurrent
806  			 * operation may not cause madvise() to
807  			 * have an undefined result. There may be an
808  			 * adjacent next vma that we'll walk
809  			 * next. userfaultfd_remove() will generate an
810  			 * UFFD_EVENT_REMOVE repetition on the
811  			 * end-vma->vm_end range, but the manager can
812  			 * handle a repetition fine.
813  			 */
814  			end = vma->vm_end;
815  		}
816  		VM_WARN_ON(start >= end);
817  	}
818  
819  	if (behavior == MADV_DONTNEED)
820  		return madvise_dontneed_single_vma(vma, start, end);
821  	else if (behavior == MADV_FREE)
822  		return madvise_free_single_vma(vma, start, end);
823  	else
824  		return -EINVAL;
825  }
826  
827  static long madvise_populate(struct vm_area_struct *vma,
828  			     struct vm_area_struct **prev,
829  			     unsigned long start, unsigned long end,
830  			     int behavior)
831  {
832  	const bool write = behavior == MADV_POPULATE_WRITE;
833  	struct mm_struct *mm = vma->vm_mm;
834  	unsigned long tmp_end;
835  	int locked = 1;
836  	long pages;
837  
838  	*prev = vma;
839  
840  	while (start < end) {
841  		/*
842  		 * We might have temporarily dropped the lock. For example,
843  		 * our VMA might have been split.
844  		 */
845  		if (!vma || start >= vma->vm_end) {
846  			vma = find_vma(mm, start);
847  			if (!vma || start < vma->vm_start)
848  				return -ENOMEM;
849  		}
850  
851  		tmp_end = min_t(unsigned long, end, vma->vm_end);
852  		/* Populate (prefault) page tables readable/writable. */
853  		pages = faultin_vma_page_range(vma, start, tmp_end, write,
854  					       &locked);
855  		if (!locked) {
856  			mmap_read_lock(mm);
857  			locked = 1;
858  			*prev = NULL;
859  			vma = NULL;
860  		}
861  		if (pages < 0) {
862  			switch (pages) {
863  			case -EINTR:
864  				return -EINTR;
865  			case -EINVAL: /* Incompatible mappings / permissions. */
866  				return -EINVAL;
867  			case -EHWPOISON:
868  				return -EHWPOISON;
869  			case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
870  				return -EFAULT;
871  			default:
872  				pr_warn_once("%s: unhandled return value: %ld\n",
873  					     __func__, pages);
874  				fallthrough;
875  			case -ENOMEM:
876  				return -ENOMEM;
877  			}
878  		}
879  		start += pages * PAGE_SIZE;
880  	}
881  	return 0;
882  }
883  
884  /*
885   * Application wants to free up the pages and associated backing store.
886   * This is effectively punching a hole into the middle of a file.
887   */
888  static long madvise_remove(struct vm_area_struct *vma,
889  				struct vm_area_struct **prev,
890  				unsigned long start, unsigned long end)
891  {
892  	loff_t offset;
893  	int error;
894  	struct file *f;
895  	struct mm_struct *mm = vma->vm_mm;
896  
897  	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
898  
899  	if (vma->vm_flags & VM_LOCKED)
900  		return -EINVAL;
901  
902  	f = vma->vm_file;
903  
904  	if (!f || !f->f_mapping || !f->f_mapping->host) {
905  			return -EINVAL;
906  	}
907  
908  	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
909  		return -EACCES;
910  
911  	offset = (loff_t)(start - vma->vm_start)
912  			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
913  
914  	/*
915  	 * Filesystem's fallocate may need to take i_rwsem.  We need to
916  	 * explicitly grab a reference because the vma (and hence the
917  	 * vma's reference to the file) can go away as soon as we drop
918  	 * mmap_lock.
919  	 */
920  	get_file(f);
921  	if (userfaultfd_remove(vma, start, end)) {
922  		/* mmap_lock was not released by userfaultfd_remove() */
923  		mmap_read_unlock(mm);
924  	}
925  	error = vfs_fallocate(f,
926  				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
927  				offset, end - start);
928  	fput(f);
929  	mmap_read_lock(mm);
930  	return error;
931  }
932  
933  #ifdef CONFIG_MEMORY_FAILURE
934  /*
935   * Error injection support for memory error handling.
936   */
937  static int madvise_inject_error(int behavior,
938  		unsigned long start, unsigned long end)
939  {
940  	unsigned long size;
941  
942  	if (!capable(CAP_SYS_ADMIN))
943  		return -EPERM;
944  
945  
946  	for (; start < end; start += size) {
947  		unsigned long pfn;
948  		struct page *page;
949  		int ret;
950  
951  		ret = get_user_pages_fast(start, 1, 0, &page);
952  		if (ret != 1)
953  			return ret;
954  		pfn = page_to_pfn(page);
955  
956  		/*
957  		 * When soft offlining hugepages, after migrating the page
958  		 * we dissolve it, therefore in the second loop "page" will
959  		 * no longer be a compound page.
960  		 */
961  		size = page_size(compound_head(page));
962  
963  		if (behavior == MADV_SOFT_OFFLINE) {
964  			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
965  				 pfn, start);
966  			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
967  		} else {
968  			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
969  				 pfn, start);
970  			ret = memory_failure(pfn, MF_COUNT_INCREASED);
971  		}
972  
973  		if (ret)
974  			return ret;
975  	}
976  
977  	return 0;
978  }
979  #endif
980  
981  static long
982  madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
983  		unsigned long start, unsigned long end, int behavior)
984  {
985  	switch (behavior) {
986  	case MADV_REMOVE:
987  		return madvise_remove(vma, prev, start, end);
988  	case MADV_WILLNEED:
989  		return madvise_willneed(vma, prev, start, end);
990  	case MADV_COLD:
991  		return madvise_cold(vma, prev, start, end);
992  	case MADV_PAGEOUT:
993  		return madvise_pageout(vma, prev, start, end);
994  	case MADV_FREE:
995  	case MADV_DONTNEED:
996  		return madvise_dontneed_free(vma, prev, start, end, behavior);
997  	case MADV_POPULATE_READ:
998  	case MADV_POPULATE_WRITE:
999  		return madvise_populate(vma, prev, start, end, behavior);
1000  	default:
1001  		return madvise_behavior(vma, prev, start, end, behavior);
1002  	}
1003  }
1004  
1005  static bool
1006  madvise_behavior_valid(int behavior)
1007  {
1008  	switch (behavior) {
1009  	case MADV_DOFORK:
1010  	case MADV_DONTFORK:
1011  	case MADV_NORMAL:
1012  	case MADV_SEQUENTIAL:
1013  	case MADV_RANDOM:
1014  	case MADV_REMOVE:
1015  	case MADV_WILLNEED:
1016  	case MADV_DONTNEED:
1017  	case MADV_FREE:
1018  	case MADV_COLD:
1019  	case MADV_PAGEOUT:
1020  	case MADV_POPULATE_READ:
1021  	case MADV_POPULATE_WRITE:
1022  #ifdef CONFIG_KSM
1023  	case MADV_MERGEABLE:
1024  	case MADV_UNMERGEABLE:
1025  #endif
1026  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1027  	case MADV_HUGEPAGE:
1028  	case MADV_NOHUGEPAGE:
1029  #endif
1030  	case MADV_DONTDUMP:
1031  	case MADV_DODUMP:
1032  	case MADV_WIPEONFORK:
1033  	case MADV_KEEPONFORK:
1034  #ifdef CONFIG_MEMORY_FAILURE
1035  	case MADV_SOFT_OFFLINE:
1036  	case MADV_HWPOISON:
1037  #endif
1038  		return true;
1039  
1040  	default:
1041  		return false;
1042  	}
1043  }
1044  
1045  static bool
1046  process_madvise_behavior_valid(int behavior)
1047  {
1048  	switch (behavior) {
1049  	case MADV_COLD:
1050  	case MADV_PAGEOUT:
1051  	case MADV_WILLNEED:
1052  		return true;
1053  	default:
1054  		return false;
1055  	}
1056  }
1057  
1058  /*
1059   * The madvise(2) system call.
1060   *
1061   * Applications can use madvise() to advise the kernel how it should
1062   * handle paging I/O in this VM area.  The idea is to help the kernel
1063   * use appropriate read-ahead and caching techniques.  The information
1064   * provided is advisory only, and can be safely disregarded by the
1065   * kernel without affecting the correct operation of the application.
1066   *
1067   * behavior values:
1068   *  MADV_NORMAL - the default behavior is to read clusters.  This
1069   *		results in some read-ahead and read-behind.
1070   *  MADV_RANDOM - the system should read the minimum amount of data
1071   *		on any access, since it is unlikely that the appli-
1072   *		cation will need more than what it asks for.
1073   *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
1074   *		once, so they can be aggressively read ahead, and
1075   *		can be freed soon after they are accessed.
1076   *  MADV_WILLNEED - the application is notifying the system to read
1077   *		some pages ahead.
1078   *  MADV_DONTNEED - the application is finished with the given range,
1079   *		so the kernel can free resources associated with it.
1080   *  MADV_FREE - the application marks pages in the given range as lazy free,
1081   *		where actual purges are postponed until memory pressure happens.
1082   *  MADV_REMOVE - the application wants to free up the given range of
1083   *		pages and associated backing store.
1084   *  MADV_DONTFORK - omit this area from child's address space when forking:
1085   *		typically, to avoid COWing pages pinned by get_user_pages().
1086   *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
1087   *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
1088   *              range after a fork.
1089   *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
1090   *  MADV_HWPOISON - trigger memory error handler as if the given memory range
1091   *		were corrupted by unrecoverable hardware memory failure.
1092   *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
1093   *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
1094   *		this area with pages of identical content from other such areas.
1095   *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
1096   *  MADV_HUGEPAGE - the application wants to back the given range by transparent
1097   *		huge pages in the future. Existing pages might be coalesced and
1098   *		new pages might be allocated as THP.
1099   *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
1100   *		transparent huge pages so the existing pages will not be
1101   *		coalesced into THP and new pages will not be allocated as THP.
1102   *  MADV_DONTDUMP - the application wants to prevent pages in the given range
1103   *		from being included in its core dump.
1104   *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
1105   *  MADV_COLD - the application is not expected to use this memory soon,
1106   *		deactivate pages in this range so that they can be reclaimed
1107   *		easily if memory pressure happens.
1108   *  MADV_PAGEOUT - the application is not expected to use this memory soon,
1109   *		page out the pages in this range immediately.
1110   *  MADV_POPULATE_READ - populate (prefault) page tables readable by
1111   *		triggering read faults if required
1112   *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
1113   *		triggering write faults if required
1114   *
1115   * return values:
1116   *  zero    - success
1117   *  -EINVAL - start + len < 0, start is not page-aligned,
1118   *		"behavior" is not a valid value, or application
1119   *		is attempting to release locked or shared pages,
1120   *		or the specified address range includes file, Huge TLB,
1121   *		MAP_SHARED or VMPFNMAP range.
1122   *  -ENOMEM - addresses in the specified range are not currently
1123   *		mapped, or are outside the AS of the process.
1124   *  -EIO    - an I/O error occurred while paging in data.
1125   *  -EBADF  - map exists, but area maps something that isn't a file.
1126   *  -EAGAIN - a kernel resource was temporarily unavailable.
1127   */
1128  int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
1129  {
1130  	unsigned long end, tmp;
1131  	struct vm_area_struct *vma, *prev;
1132  	int unmapped_error = 0;
1133  	int error = -EINVAL;
1134  	int write;
1135  	size_t len;
1136  	struct blk_plug plug;
1137  
1138  	start = untagged_addr(start);
1139  
1140  	if (!madvise_behavior_valid(behavior))
1141  		return error;
1142  
1143  	if (!PAGE_ALIGNED(start))
1144  		return error;
1145  	len = PAGE_ALIGN(len_in);
1146  
1147  	/* Check to see whether len was rounded up from small -ve to zero */
1148  	if (len_in && !len)
1149  		return error;
1150  
1151  	end = start + len;
1152  	if (end < start)
1153  		return error;
1154  
1155  	error = 0;
1156  	if (end == start)
1157  		return error;
1158  
1159  #ifdef CONFIG_MEMORY_FAILURE
1160  	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
1161  		return madvise_inject_error(behavior, start, start + len_in);
1162  #endif
1163  
1164  	write = madvise_need_mmap_write(behavior);
1165  	if (write) {
1166  		if (mmap_write_lock_killable(mm))
1167  			return -EINTR;
1168  	} else {
1169  		mmap_read_lock(mm);
1170  	}
1171  
1172  	/*
1173  	 * If the interval [start,end) covers some unmapped address
1174  	 * ranges, just ignore them, but return -ENOMEM at the end.
1175  	 * - different from the way of handling in mlock etc.
1176  	 */
1177  	vma = find_vma_prev(mm, start, &prev);
1178  	if (vma && start > vma->vm_start)
1179  		prev = vma;
1180  
1181  	blk_start_plug(&plug);
1182  	for (;;) {
1183  		/* Still start < end. */
1184  		error = -ENOMEM;
1185  		if (!vma)
1186  			goto out;
1187  
1188  		/* Here start < (end|vma->vm_end). */
1189  		if (start < vma->vm_start) {
1190  			unmapped_error = -ENOMEM;
1191  			start = vma->vm_start;
1192  			if (start >= end)
1193  				goto out;
1194  		}
1195  
1196  		/* Here vma->vm_start <= start < (end|vma->vm_end) */
1197  		tmp = vma->vm_end;
1198  		if (end < tmp)
1199  			tmp = end;
1200  
1201  		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
1202  		error = madvise_vma(vma, &prev, start, tmp, behavior);
1203  		if (error)
1204  			goto out;
1205  		start = tmp;
1206  		if (prev && start < prev->vm_end)
1207  			start = prev->vm_end;
1208  		error = unmapped_error;
1209  		if (start >= end)
1210  			goto out;
1211  		if (prev)
1212  			vma = prev->vm_next;
1213  		else	/* madvise_remove dropped mmap_lock */
1214  			vma = find_vma(mm, start);
1215  	}
1216  out:
1217  	blk_finish_plug(&plug);
1218  	if (write)
1219  		mmap_write_unlock(mm);
1220  	else
1221  		mmap_read_unlock(mm);
1222  
1223  	return error;
1224  }
1225  
1226  SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
1227  {
1228  	return do_madvise(current->mm, start, len_in, behavior);
1229  }
1230  
1231  SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
1232  		size_t, vlen, int, behavior, unsigned int, flags)
1233  {
1234  	ssize_t ret;
1235  	struct iovec iovstack[UIO_FASTIOV], iovec;
1236  	struct iovec *iov = iovstack;
1237  	struct iov_iter iter;
1238  	struct pid *pid;
1239  	struct task_struct *task;
1240  	struct mm_struct *mm;
1241  	size_t total_len;
1242  	unsigned int f_flags;
1243  
1244  	if (flags != 0) {
1245  		ret = -EINVAL;
1246  		goto out;
1247  	}
1248  
1249  	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
1250  	if (ret < 0)
1251  		goto out;
1252  
1253  	pid = pidfd_get_pid(pidfd, &f_flags);
1254  	if (IS_ERR(pid)) {
1255  		ret = PTR_ERR(pid);
1256  		goto free_iov;
1257  	}
1258  
1259  	task = get_pid_task(pid, PIDTYPE_PID);
1260  	if (!task) {
1261  		ret = -ESRCH;
1262  		goto put_pid;
1263  	}
1264  
1265  	if (!process_madvise_behavior_valid(behavior)) {
1266  		ret = -EINVAL;
1267  		goto release_task;
1268  	}
1269  
1270  	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
1271  	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
1272  	if (IS_ERR_OR_NULL(mm)) {
1273  		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
1274  		goto release_task;
1275  	}
1276  
1277  	/*
1278  	 * Require CAP_SYS_NICE for influencing process performance. Note that
1279  	 * only non-destructive hints are currently supported.
1280  	 */
1281  	if (!capable(CAP_SYS_NICE)) {
1282  		ret = -EPERM;
1283  		goto release_mm;
1284  	}
1285  
1286  	total_len = iov_iter_count(&iter);
1287  
1288  	while (iov_iter_count(&iter)) {
1289  		iovec = iov_iter_iovec(&iter);
1290  		ret = do_madvise(mm, (unsigned long)iovec.iov_base,
1291  					iovec.iov_len, behavior);
1292  		if (ret < 0)
1293  			break;
1294  		iov_iter_advance(&iter, iovec.iov_len);
1295  	}
1296  
1297  	if (ret == 0)
1298  		ret = total_len - iov_iter_count(&iter);
1299  
1300  release_mm:
1301  	mmput(mm);
1302  release_task:
1303  	put_task_struct(task);
1304  put_pid:
1305  	put_pid(pid);
1306  free_iov:
1307  	kfree(iov);
1308  out:
1309  	return ret;
1310  }
1311