1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  mm/mprotect.c
4  *
5  *  (C) Copyright 1994 Linus Torvalds
6  *  (C) Copyright 2002 Christoph Hellwig
7  *
8  *  Address space accounting code	<alan@lxorguk.ukuu.org.uk>
9  *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
10  */
11 
12 #include <linux/pagewalk.h>
13 #include <linux/hugetlb.h>
14 #include <linux/shm.h>
15 #include <linux/mman.h>
16 #include <linux/fs.h>
17 #include <linux/highmem.h>
18 #include <linux/security.h>
19 #include <linux/mempolicy.h>
20 #include <linux/personality.h>
21 #include <linux/syscalls.h>
22 #include <linux/swap.h>
23 #include <linux/swapops.h>
24 #include <linux/mmu_notifier.h>
25 #include <linux/migrate.h>
26 #include <linux/perf_event.h>
27 #include <linux/pkeys.h>
28 #include <linux/ksm.h>
29 #include <linux/uaccess.h>
30 #include <linux/mm_inline.h>
31 #include <linux/pgtable.h>
32 #include <linux/userfaultfd_k.h>
33 #include <uapi/linux/mman.h>
34 #include <asm/cacheflush.h>
35 #include <asm/mmu_context.h>
36 #include <asm/tlbflush.h>
37 #include <asm/tlb.h>
38 
39 #include "internal.h"
40 
41 static bool maybe_change_pte_writable(struct vm_area_struct *vma, pte_t pte)
42 {
43 	if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
44 		return false;
45 
46 	/* Don't touch entries that are not even readable. */
47 	if (pte_protnone(pte))
48 		return false;
49 
50 	/* Do we need write faults for softdirty tracking? */
51 	if (pte_needs_soft_dirty_wp(vma, pte))
52 		return false;
53 
54 	/* Do we need write faults for uffd-wp tracking? */
55 	if (userfaultfd_pte_wp(vma, pte))
56 		return false;
57 
58 	return true;
59 }
60 
61 static bool can_change_private_pte_writable(struct vm_area_struct *vma,
62 					    unsigned long addr, pte_t pte)
63 {
64 	struct page *page;
65 
66 	if (!maybe_change_pte_writable(vma, pte))
67 		return false;
68 
69 	/*
70 	 * Writable MAP_PRIVATE mapping: We can only special-case on
71 	 * exclusive anonymous pages, because we know that our
72 	 * write-fault handler similarly would map them writable without
73 	 * any additional checks while holding the PT lock.
74 	 */
75 	page = vm_normal_page(vma, addr, pte);
76 	return page && PageAnon(page) && PageAnonExclusive(page);
77 }
78 
79 static bool can_change_shared_pte_writable(struct vm_area_struct *vma,
80 					   pte_t pte)
81 {
82 	if (!maybe_change_pte_writable(vma, pte))
83 		return false;
84 
85 	VM_WARN_ON_ONCE(is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte));
86 
87 	/*
88 	 * Writable MAP_SHARED mapping: "clean" might indicate that the FS still
89 	 * needs a real write-fault for writenotify
90 	 * (see vma_wants_writenotify()). If "dirty", the assumption is that the
91 	 * FS was already notified and we can simply mark the PTE writable
92 	 * just like the write-fault handler would do.
93 	 */
94 	return pte_dirty(pte);
95 }
96 
97 bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
98 			     pte_t pte)
99 {
100 	if (!(vma->vm_flags & VM_SHARED))
101 		return can_change_private_pte_writable(vma, addr, pte);
102 
103 	return can_change_shared_pte_writable(vma, pte);
104 }
105 
106 static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
107 				    pte_t pte, int max_nr_ptes, fpb_t flags)
108 {
109 	/* No underlying folio, so cannot batch */
110 	if (!folio)
111 		return 1;
112 
113 	if (!folio_test_large(folio))
114 		return 1;
115 
116 	return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr_ptes, flags);
117 }
118 
119 /* Set nr_ptes ptes, starting at index idx within the batch */
120 static __always_inline void prot_commit_flush_ptes(struct vm_area_struct *vma,
121 		unsigned long addr, pte_t *ptep, pte_t oldpte, pte_t ptent,
122 		int nr_ptes, int idx, bool set_write, struct mmu_gather *tlb)
123 {
124 	/*
125 	 * Advance the position in the batch by idx; note that if idx > 0,
126 	 * then the nr_ptes passed here is <= batch size - idx.
127 	 */
128 	addr += idx * PAGE_SIZE;
129 	ptep += idx;
130 	oldpte = pte_advance_pfn(oldpte, idx);
131 	ptent = pte_advance_pfn(ptent, idx);
132 
133 	if (set_write)
134 		ptent = pte_mkwrite(ptent, vma);
135 
136 	modify_prot_commit_ptes(vma, addr, ptep, oldpte, ptent, nr_ptes);
137 	if (pte_needs_flush(oldpte, ptent))
138 		tlb_flush_pte_range(tlb, addr, nr_ptes * PAGE_SIZE);
139 }
140 
141 /*
142  * Get max length of consecutive ptes pointing to PageAnonExclusive() pages or
143  * !PageAnonExclusive() pages, starting from start_idx. Caller must enforce
144  * that the ptes point to consecutive pages of the same anon large folio.
145  */
146 static __always_inline int page_anon_exclusive_sub_batch(int start_idx, int max_len,
147 		struct page *first_page, bool expected_anon_exclusive)
148 {
149 	int idx;
150 
151 	for (idx = start_idx + 1; idx < start_idx + max_len; ++idx) {
152 		if (expected_anon_exclusive != PageAnonExclusive(first_page + idx))
153 			break;
154 	}
155 	return idx - start_idx;
156 }
157 
158 /*
159  * This function is a result of trying our very best to retain the
160  * "avoid the write-fault handler" optimization. In can_change_pte_writable(),
161  * if the vma is a private vma, and we cannot determine whether to change
162  * the pte to writable just from the vma and the pte, we then need to look
163  * at the actual page pointed to by the pte. Unfortunately, if we have a
164  * batch of ptes pointing to consecutive pages of the same anon large folio,
165  * the anon-exclusivity (or the negation) of the first page does not guarantee
166  * the anon-exclusivity (or the negation) of the other pages corresponding to
167  * the pte batch; hence in this case it is incorrect to decide to change or
168  * not change the ptes to writable just by using information from the first
169  * pte of the batch. Therefore, we must individually check all pages and
170  * retrieve sub-batches.
171  */
172 static __always_inline void commit_anon_folio_batch(struct vm_area_struct *vma,
173 		struct folio *folio, struct page *first_page, unsigned long addr, pte_t *ptep,
174 		pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb)
175 {
176 	bool expected_anon_exclusive;
177 	int sub_batch_idx = 0;
178 	int len;
179 
180 	while (nr_ptes) {
181 		expected_anon_exclusive = PageAnonExclusive(first_page + sub_batch_idx);
182 		len = page_anon_exclusive_sub_batch(sub_batch_idx, nr_ptes,
183 					first_page, expected_anon_exclusive);
184 		prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, len,
185 				       sub_batch_idx, expected_anon_exclusive, tlb);
186 		sub_batch_idx += len;
187 		nr_ptes -= len;
188 	}
189 }
190 
191 static __always_inline void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma,
192 		struct folio *folio, struct page *page, unsigned long addr, pte_t *ptep,
193 		pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb)
194 {
195 	bool set_write;
196 
197 	if (vma->vm_flags & VM_SHARED) {
198 		set_write = can_change_shared_pte_writable(vma, ptent);
199 		prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, nr_ptes,
200 				       /* idx = */ 0, set_write, tlb);
201 		return;
202 	}
203 
204 	set_write = maybe_change_pte_writable(vma, ptent) &&
205 		    (folio && folio_test_anon(folio));
206 	if (!set_write) {
207 		prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, nr_ptes,
208 				       /* idx = */ 0, set_write, tlb);
209 		return;
210 	}
211 	commit_anon_folio_batch(vma, folio, page, addr, ptep, oldpte, ptent, nr_ptes, tlb);
212 }
213 
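/*
 * Handle a non-present "software leaf" entry (migration entry, device
 * private entry or pte marker) for a protection change.  Write-protects
 * writable migration and device-private entries and applies or clears the
 * uffd-wp bit as requested.  Returns 1 if the pte was updated, 0 otherwise.
 */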
214 static long change_softleaf_pte(struct vm_area_struct *vma,
215 	unsigned long addr, pte_t *pte, pte_t oldpte, unsigned long cp_flags)
216 {
217 	const bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
218 	const bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
219 	softleaf_t entry = softleaf_from_pte(oldpte);
220 	pte_t newpte;
221 
222 	if (softleaf_is_migration_write(entry)) {
223 		const struct folio *folio = softleaf_to_folio(entry);
224 
225 		/*
226 		 * A protection check is difficult, so
227 		 * just be safe and disable write.
228 		 */
229 		if (folio_test_anon(folio))
230 			entry = make_readable_exclusive_migration_entry(swp_offset(entry));
231 		else
232 			entry = make_readable_migration_entry(swp_offset(entry));
233 		newpte = swp_entry_to_pte(entry);
234 		if (pte_swp_soft_dirty(oldpte))
235 			newpte = pte_swp_mksoft_dirty(newpte);
236 	} else if (softleaf_is_device_private_write(entry)) {
237 		/*
238 		 * We do not preserve soft-dirtiness. See
239 		 * copy_nonpresent_pte() for explanation.
240 		 */
241 		entry = make_readable_device_private_entry(swp_offset(entry));
242 		newpte = swp_entry_to_pte(entry);
243 		if (pte_swp_uffd_wp(oldpte))
244 			newpte = pte_swp_mkuffd_wp(newpte);
245 	} else if (softleaf_is_marker(entry)) {
246 		/*
247 		 * Ignore error swap entries unconditionally,
248 		 * because any access should sigbus/sigsegv
249 		 * anyway.
250 		 */
251 		if (softleaf_is_poison_marker(entry) ||
252 		    softleaf_is_guard_marker(entry))
253 			return 0;
254 		/*
255 		 * If this is a uffd-wp pte marker and we'd like
256 		 * to unprotect it, drop it; the next page
257 		 * fault will trigger without uffd trapping.
258 		 */
259 		if (uffd_wp_resolve) {
260 			pte_clear(vma->vm_mm, addr, pte);
261 			return 1;
262 		}
263 		return 0;
264 	} else {
265 		newpte = oldpte;
266 	}
267 
268 	if (uffd_wp)
269 		newpte = pte_swp_mkuffd_wp(newpte);
270 	else if (uffd_wp_resolve)
271 		newpte = pte_swp_clear_uffd_wp(newpte);
272 
273 	if (!pte_same(oldpte, newpte)) {
274 		set_pte_at(vma->vm_mm, addr, pte, newpte);
275 		return 1;
276 	}
277 	return 0;
278 }
279 
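/*
 * Apply the new protection to a batch of nr_ptes present ptes mapping
 * consecutive pages of the same folio, optionally trying to upgrade them to
 * writable so that the write-fault handler can be avoided.
 */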
280 static __always_inline void change_present_ptes(struct mmu_gather *tlb,
281 		struct vm_area_struct *vma, unsigned long addr, pte_t *ptep,
282 		int nr_ptes, unsigned long end, pgprot_t newprot,
283 		struct folio *folio, struct page *page, unsigned long cp_flags)
284 {
285 	const bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
286 	const bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
287 	pte_t ptent, oldpte;
288 
289 	oldpte = modify_prot_start_ptes(vma, addr, ptep, nr_ptes);
290 	ptent = pte_modify(oldpte, newprot);
291 
292 	if (uffd_wp)
293 		ptent = pte_mkuffd_wp(ptent);
294 	else if (uffd_wp_resolve)
295 		ptent = pte_clear_uffd_wp(ptent);
296 
297 	/*
298 	 * In some writable, shared mappings, we might want
299 	 * to catch actual write access -- see
300 	 * vma_wants_writenotify().
301 	 *
302 	 * In all writable, private mappings, we have to
303 	 * properly handle COW.
304 	 *
305 	 * In both cases, we can sometimes still change PTEs
306 	 * writable and avoid the write-fault handler, for
307 	 * example, if a PTE is already dirty and no other
308 	 * COW or special handling is required.
309 	 */
310 	if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
311 	     !pte_write(ptent))
312 		set_write_prot_commit_flush_ptes(vma, folio, page,
313 			addr, ptep, oldpte, ptent, nr_ptes, tlb);
314 	else
315 		prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent,
316 			nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb);
317 }
318 
319 static long change_pte_range(struct mmu_gather *tlb,
320 		struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
321 		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
322 {
323 	pte_t *pte, oldpte;
324 	spinlock_t *ptl;
325 	long pages = 0;
326 	bool is_private_single_threaded;
327 	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
328 	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
329 	int nr_ptes;
330 
331 	tlb_change_page_size(tlb, PAGE_SIZE);
332 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
333 	if (!pte)
334 		return -EAGAIN;
335 
336 	if (prot_numa)
337 		is_private_single_threaded = vma_is_single_threaded_private(vma);
338 
339 	flush_tlb_batched_pending(vma->vm_mm);
340 	lazy_mmu_mode_enable();
341 	do {
342 		nr_ptes = 1;
343 		oldpte = ptep_get(pte);
344 		if (pte_present(oldpte)) {
345 			const fpb_t flags = FPB_RESPECT_SOFT_DIRTY | FPB_RESPECT_WRITE;
346 			int max_nr_ptes = (end - addr) >> PAGE_SHIFT;
347 			struct folio *folio = NULL;
348 			struct page *page;
349 
350 			/* Already in the desired state. */
351 			if (prot_numa && pte_protnone(oldpte))
352 				continue;
353 
354 			page = vm_normal_page(vma, addr, oldpte);
355 			if (page)
356 				folio = page_folio(page);
357 
358 			/*
359 			 * Avoid trapping faults against the zero or KSM
360 			 * pages. See similar comment in change_huge_pmd.
361 			 */
362 			if (prot_numa &&
363 			    !folio_can_map_prot_numa(folio, vma,
364 						is_private_single_threaded)) {
365 
366 				/* determine batch to skip */
367 				nr_ptes = mprotect_folio_pte_batch(folio,
368 					  pte, oldpte, max_nr_ptes, /* flags = */ 0);
369 				continue;
370 			}
371 
372 			nr_ptes = mprotect_folio_pte_batch(folio, pte, oldpte, max_nr_ptes, flags);
373 
374 			/*
375 			 * Optimize for the small-folio common case by
376 			 * special-casing it here. Compiler constant propagation
377 			 * plus copious amounts of __always_inline does wonders.
378 			 */
379 			if (likely(nr_ptes == 1)) {
380 				change_present_ptes(tlb, vma, addr, pte, 1,
381 					end, newprot, folio, page, cp_flags);
382 			} else {
383 				change_present_ptes(tlb, vma, addr, pte,
384 					nr_ptes, end, newprot, folio, page,
385 					cp_flags);
386 			}
387 
388 			pages += nr_ptes;
389 		} else if (pte_none(oldpte)) {
390 			/*
391 			 * Nobody plays with any none ptes besides
392 			 * userfaultfd when applying the protections.
393 			 */
394 			if (likely(!uffd_wp))
395 				continue;
396 
397 			if (userfaultfd_wp_use_markers(vma)) {
398 				/*
399 				 * For file-backed mem, we need to be able to
400 				 * wr-protect a none pte, because even if the
401 				 * pte is none, the page/swap cache could
402 				 * exist.  Do that by installing a marker.
403 				 */
404 				set_pte_at(vma->vm_mm, addr, pte,
405 					   make_pte_marker(PTE_MARKER_UFFD_WP));
406 				pages++;
407 			}
408 		} else  {
409 			pages += change_softleaf_pte(vma, addr, pte, oldpte, cp_flags);
410 		}
411 	} while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end);
412 	lazy_mmu_mode_disable();
413 	pte_unmap_unlock(pte - 1, ptl);
414 
415 	return pages;
416 }
417 
418 /*
419  * Return true if we want to split THPs into PTE mappings in the change
420  * protection procedure, false otherwise.
421  */
422 static inline bool
423 pgtable_split_needed(struct vm_area_struct *vma, unsigned long cp_flags)
424 {
425 	/*
426 	 * pte markers only reside at the pte level; if we need pte markers,
427 	 * we need to split.  For example, we cannot wr-protect a file thp
428 	 * (e.g. 2M shmem), because so far a file thp is handled differently
429 	 * when split: the pmd is simply erased.
430 	 */
431 	return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma);
432 }
433 
434 /*
435  * Return true if we want to populate pgtables in the change protection
436  * procedure, false otherwise.
437  */
438 static inline bool
439 pgtable_populate_needed(struct vm_area_struct *vma, unsigned long cp_flags)
440 {
441 	/* If not within ioctl(UFFDIO_WRITEPROTECT), then don't bother */
442 	if (!(cp_flags & MM_CP_UFFD_WP))
443 		return false;
444 
445 	/* Populate if the userfaultfd mode requires pte markers */
446 	return userfaultfd_wp_use_markers(vma);
447 }
448 
449 /*
450  * Populate the pgtable underneath for whatever reason if requested.
451  * When {pte|pmd|...}_alloc() fails, we treat it the same way as pgtable
452  * allocation failures during page faults: kick OOM and return an
453  * error.
454  */
455 #define  change_pmd_prepare(vma, pmd, cp_flags)				\
456 	({								\
457 		long err = 0;						\
458 		if (unlikely(pgtable_populate_needed(vma, cp_flags))) {	\
459 			if (pte_alloc(vma->vm_mm, pmd))			\
460 				err = -ENOMEM;				\
461 		}							\
462 		err;							\
463 	})
464 
465 /*
466  * This is the general pud/p4d/pgd version of change_pmd_prepare(). We need a
467  * separate change_pmd_prepare() because pte_alloc() returns 0 on success,
468  * while {pmd|pud|p4d}_alloc() returns a valid pointer on success.
469  */
470 #define  change_prepare(vma, high, low, addr, cp_flags)			\
471 	  ({								\
472 		long err = 0;						\
473 		if (unlikely(pgtable_populate_needed(vma, cp_flags))) {	\
474 			low##_t *p = low##_alloc(vma->vm_mm, high, addr); \
475 			if (p == NULL)					\
476 				err = -ENOMEM;				\
477 		}							\
478 		err;							\
479 	})
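
/*
 * For illustration (a sketch of the preprocessor expansion, not extra code):
 * the call change_prepare(vma, pudp, pmd, addr, cp_flags) made from
 * change_pud_range() below expands roughly to
 *
 *	({
 *		long err = 0;
 *		if (unlikely(pgtable_populate_needed(vma, cp_flags))) {
 *			pmd_t *p = pmd_alloc(vma->vm_mm, pudp, addr);
 *			if (p == NULL)
 *				err = -ENOMEM;
 *		}
 *		err;
 *	})
 *
 * i.e. the statement expression yields 0 or -ENOMEM, matching the
 * change_pmd_prepare() convention above.
 */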
480 
481 static inline long change_pmd_range(struct mmu_gather *tlb,
482 		struct vm_area_struct *vma, pud_t *pud, unsigned long addr,
483 		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
484 {
485 	pmd_t *pmd;
486 	unsigned long next;
487 	long pages = 0;
488 	unsigned long nr_huge_updates = 0;
489 
490 	pmd = pmd_offset(pud, addr);
491 	do {
492 		long ret;
493 		pmd_t _pmd;
494 again:
495 		next = pmd_addr_end(addr, end);
496 
497 		ret = change_pmd_prepare(vma, pmd, cp_flags);
498 		if (ret) {
499 			pages = ret;
500 			break;
501 		}
502 
503 		if (pmd_none(*pmd))
504 			goto next;
505 
506 		_pmd = pmdp_get_lockless(pmd);
507 		if (pmd_is_huge(_pmd)) {
508 			if ((next - addr != HPAGE_PMD_SIZE) ||
509 			    pgtable_split_needed(vma, cp_flags)) {
510 				__split_huge_pmd(vma, pmd, addr, false);
511 				/*
512 				 * For file-backed mappings, the pmd could have
513 				 * been cleared; make sure the pmd is populated
514 				 * if necessary, then fall through to the pte level.
515 				 */
516 				ret = change_pmd_prepare(vma, pmd, cp_flags);
517 				if (ret) {
518 					pages = ret;
519 					break;
520 				}
521 			} else {
522 				ret = change_huge_pmd(tlb, vma, pmd,
523 						addr, newprot, cp_flags);
524 				if (ret) {
525 					if (ret == HPAGE_PMD_NR) {
526 						pages += HPAGE_PMD_NR;
527 						nr_huge_updates++;
528 					}
529 
530 					/* huge pmd was handled */
531 					goto next;
532 				}
533 			}
534 			/* fall through, the trans huge pmd just split */
535 		}
536 
537 		ret = change_pte_range(tlb, vma, pmd, addr, next, newprot,
538 				       cp_flags);
539 		if (ret < 0)
540 			goto again;
541 		pages += ret;
542 next:
543 		cond_resched();
544 	} while (pmd++, addr = next, addr != end);
545 
546 	if (nr_huge_updates)
547 		count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
548 	return pages;
549 }
550 
551 static inline long change_pud_range(struct mmu_gather *tlb,
552 		struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr,
553 		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
554 {
555 	struct mmu_notifier_range range;
556 	pud_t *pudp, pud;
557 	unsigned long next;
558 	long pages = 0, ret;
559 
560 	range.start = 0;
561 
562 	pudp = pud_offset(p4d, addr);
563 	do {
564 again:
565 		next = pud_addr_end(addr, end);
566 		ret = change_prepare(vma, pudp, pmd, addr, cp_flags);
567 		if (ret) {
568 			pages = ret;
569 			break;
570 		}
571 
572 		pud = pudp_get(pudp);
573 		if (pud_none(pud))
574 			continue;
575 
576 		if (!range.start) {
577 			mmu_notifier_range_init(&range,
578 						MMU_NOTIFY_PROTECTION_VMA, 0,
579 						vma->vm_mm, addr, end);
580 			mmu_notifier_invalidate_range_start(&range);
581 		}
582 
583 		if (pud_leaf(pud)) {
584 			if ((next - addr != PUD_SIZE) ||
585 			    pgtable_split_needed(vma, cp_flags)) {
586 				__split_huge_pud(vma, pudp, addr);
587 				goto again;
588 			} else {
589 				ret = change_huge_pud(tlb, vma, pudp,
590 						      addr, newprot, cp_flags);
591 				if (ret == 0)
592 					goto again;
593 				/* huge pud was handled */
594 				if (ret == HPAGE_PUD_NR)
595 					pages += HPAGE_PUD_NR;
596 				continue;
597 			}
598 		}
599 
600 		pages += change_pmd_range(tlb, vma, pudp, addr, next, newprot,
601 					  cp_flags);
602 	} while (pudp++, addr = next, addr != end);
603 
604 	if (range.start)
605 		mmu_notifier_invalidate_range_end(&range);
606 
607 	return pages;
608 }
609 
610 static inline long change_p4d_range(struct mmu_gather *tlb,
611 		struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr,
612 		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
613 {
614 	p4d_t *p4d;
615 	unsigned long next;
616 	long pages = 0, ret;
617 
618 	p4d = p4d_offset(pgd, addr);
619 	do {
620 		next = p4d_addr_end(addr, end);
621 		ret = change_prepare(vma, p4d, pud, addr, cp_flags);
622 		if (ret)
623 			return ret;
624 		if (p4d_none_or_clear_bad(p4d))
625 			continue;
626 		pages += change_pud_range(tlb, vma, p4d, addr, next, newprot,
627 					  cp_flags);
628 	} while (p4d++, addr = next, addr != end);
629 
630 	return pages;
631 }
632 
633 static long change_protection_range(struct mmu_gather *tlb,
634 		struct vm_area_struct *vma, unsigned long addr,
635 		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
636 {
637 	struct mm_struct *mm = vma->vm_mm;
638 	pgd_t *pgd;
639 	unsigned long next;
640 	long pages = 0, ret;
641 
642 	BUG_ON(addr >= end);
643 	pgd = pgd_offset(mm, addr);
644 	tlb_start_vma(tlb, vma);
645 	do {
646 		next = pgd_addr_end(addr, end);
647 		ret = change_prepare(vma, pgd, p4d, addr, cp_flags);
648 		if (ret) {
649 			pages = ret;
650 			break;
651 		}
652 		if (pgd_none_or_clear_bad(pgd))
653 			continue;
654 		pages += change_p4d_range(tlb, vma, pgd, addr, next, newprot,
655 					  cp_flags);
656 	} while (pgd++, addr = next, addr != end);
657 
658 	tlb_end_vma(tlb, vma);
659 
660 	return pages;
661 }
662 
663 long change_protection(struct mmu_gather *tlb,
664 		       struct vm_area_struct *vma, unsigned long start,
665 		       unsigned long end, unsigned long cp_flags)
666 {
667 	pgprot_t newprot = vma->vm_page_prot;
668 	long pages;
669 
670 	BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);
671 
672 #ifdef CONFIG_NUMA_BALANCING
673 	/*
674 	 * Ordinary protection updates (mprotect, uffd-wp, softdirty tracking)
675 	 * are expected to reflect their requirements via VMA flags such that
676 	 * vma_set_page_prot() will adjust vma->vm_page_prot accordingly.
677 	 */
678 	if (cp_flags & MM_CP_PROT_NUMA)
679 		newprot = PAGE_NONE;
680 #else
681 	WARN_ON_ONCE(cp_flags & MM_CP_PROT_NUMA);
682 #endif
683 
684 	if (is_vm_hugetlb_page(vma))
685 		pages = hugetlb_change_protection(vma, start, end, newprot,
686 						  cp_flags);
687 	else
688 		pages = change_protection_range(tlb, vma, start, end, newprot,
689 						cp_flags);
690 
691 	return pages;
692 }
693 
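/*
 * Helpers for the PROT_NONE pfn permission check done in mprotect_fixup():
 * walk every pte (and hugetlb entry) in the range and fail with -EACCES if
 * the architecture does not allow the underlying pfn to be mapped with the
 * new protection (see pfn_modify_allowed()).  Only used for
 * VM_PFNMAP/VM_MIXEDMAP mappings on architectures with
 * arch_has_pfn_modify_check().
 */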
694 static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
695 			       unsigned long next, struct mm_walk *walk)
696 {
697 	return pfn_modify_allowed(pte_pfn(ptep_get(pte)),
698 				  *(pgprot_t *)(walk->private)) ?
699 		0 : -EACCES;
700 }
701 
702 static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
703 				   unsigned long addr, unsigned long next,
704 				   struct mm_walk *walk)
705 {
706 	return pfn_modify_allowed(pte_pfn(ptep_get(pte)),
707 				  *(pgprot_t *)(walk->private)) ?
708 		0 : -EACCES;
709 }
710 
711 static int prot_none_test(unsigned long addr, unsigned long next,
712 			  struct mm_walk *walk)
713 {
714 	return 0;
715 }
716 
717 static const struct mm_walk_ops prot_none_walk_ops = {
718 	.pte_entry		= prot_none_pte_entry,
719 	.hugetlb_entry		= prot_none_hugetlb_entry,
720 	.test_walk		= prot_none_test,
721 	.walk_lock		= PGWALK_WRLOCK,
722 };
723 
724 int
725 mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
726 	       struct vm_area_struct *vma, struct vm_area_struct **pprev,
727 	       unsigned long start, unsigned long end, vm_flags_t newflags)
728 {
729 	struct mm_struct *mm = vma->vm_mm;
730 	const vma_flags_t old_vma_flags = READ_ONCE(vma->flags);
731 	vma_flags_t new_vma_flags = legacy_to_vma_flags(newflags);
732 	long nrpages = (end - start) >> PAGE_SHIFT;
733 	unsigned int mm_cp_flags = 0;
734 	unsigned long charged = 0;
735 	int error;
736 
737 	if (vma_is_sealed(vma))
738 		return -EPERM;
739 
740 	if (vma_flags_same_pair(&old_vma_flags, &new_vma_flags)) {
741 		*pprev = vma;
742 		return 0;
743 	}
744 
745 	/*
746 	 * Do PROT_NONE PFN permission checks here when we can still
747 	 * bail out without undoing a lot of state. This is a rather
748 	 * uncommon case, so doesn't need to be very optimized.
749 	 */
750 	if (arch_has_pfn_modify_check() &&
751 	    vma_flags_test_any(&old_vma_flags, VMA_PFNMAP_BIT,
752 			       VMA_MIXEDMAP_BIT) &&
753 	    !vma_flags_test_any_mask(&new_vma_flags, VMA_ACCESS_FLAGS)) {
754 		pgprot_t new_pgprot = vm_get_page_prot(newflags);
755 
756 		error = walk_page_range(current->mm, start, end,
757 				&prot_none_walk_ops, &new_pgprot);
758 		if (error)
759 			return error;
760 	}
761 
762 	/*
763 	 * If we make a private mapping writable we increase our commit;
764 	 * but (without finer accounting) cannot reduce our commit if we
765 	 * make it unwritable again, except in the anonymous case where no
766 	 * anon_vma has yet been assigned.
767 	 *
768 	 * hugetlb mappings were accounted for even if read-only, so there is
769 	 * no need to account for them here.
770 	 */
771 	if (vma_flags_test(&new_vma_flags, VMA_WRITE_BIT)) {
772 		/* Check space limits when area turns into data. */
773 		if (!may_expand_vm(mm, &new_vma_flags, nrpages) &&
774 		    may_expand_vm(mm, &old_vma_flags, nrpages))
775 			return -ENOMEM;
776 		if (!vma_flags_test_any(&old_vma_flags,
777 				VMA_ACCOUNT_BIT, VMA_WRITE_BIT, VMA_HUGETLB_BIT,
778 				VMA_SHARED_BIT, VMA_NORESERVE_BIT)) {
779 			charged = nrpages;
780 			if (security_vm_enough_memory_mm(mm, charged))
781 				return -ENOMEM;
782 			vma_flags_set(&new_vma_flags, VMA_ACCOUNT_BIT);
783 		}
784 	} else if (vma_flags_test(&old_vma_flags, VMA_ACCOUNT_BIT) &&
785 		   vma_is_anonymous(vma) && !vma->anon_vma) {
786 		vma_flags_clear(&new_vma_flags, VMA_ACCOUNT_BIT);
787 	}
788 
789 	vma = vma_modify_flags(vmi, *pprev, vma, start, end, &new_vma_flags);
790 	if (IS_ERR(vma)) {
791 		error = PTR_ERR(vma);
792 		goto fail;
793 	}
794 
795 	*pprev = vma;
796 
797 	/*
798 	 * vm_flags and vm_page_prot are protected by the mmap_lock
799 	 * held in write mode.
800 	 */
801 	vma_start_write(vma);
802 	vma_flags_reset_once(vma, &new_vma_flags);
803 	if (vma_wants_manual_pte_write_upgrade(vma))
804 		mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
805 	vma_set_page_prot(vma);
806 
807 	change_protection(tlb, vma, start, end, mm_cp_flags);
808 
809 	if (vma_flags_test(&old_vma_flags, VMA_ACCOUNT_BIT) &&
810 	    !vma_flags_test(&new_vma_flags, VMA_ACCOUNT_BIT))
811 		vm_unacct_memory(nrpages);
812 
813 	/*
814 	 * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
815 	 * fault on access.
816 	 */
817 	if (vma_flags_test(&new_vma_flags, VMA_WRITE_BIT) &&
818 	    vma_flags_test(&old_vma_flags, VMA_LOCKED_BIT) &&
819 	    !vma_flags_test_any(&old_vma_flags, VMA_WRITE_BIT, VMA_SHARED_BIT))
820 		populate_vma_page_range(vma, start, end, NULL);
821 
822 	vm_stat_account(mm, vma_flags_to_legacy(old_vma_flags), -nrpages);
823 	newflags = vma_flags_to_legacy(new_vma_flags);
824 	vm_stat_account(mm, newflags, nrpages);
825 	perf_event_mmap(vma);
826 	return 0;
827 
828 fail:
829 	vm_unacct_memory(charged);
830 	return error;
831 }
832 
833 /*
834  * pkey==-1 when doing a legacy mprotect()
835  */
836 static int do_mprotect_pkey(unsigned long start, size_t len,
837 		unsigned long prot, int pkey)
838 {
839 	unsigned long nstart, end, tmp, reqprot;
840 	struct vm_area_struct *vma, *prev;
841 	int error;
842 	const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
843 	const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
844 				(prot & PROT_READ);
845 	struct mmu_gather tlb;
846 	struct vma_iterator vmi;
847 
848 	start = untagged_addr(start);
849 
850 	prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
851 	if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
852 		return -EINVAL;
853 
854 	if (start & ~PAGE_MASK)
855 		return -EINVAL;
856 	if (!len)
857 		return 0;
858 	len = PAGE_ALIGN(len);
859 	end = start + len;
860 	if (end <= start)
861 		return -ENOMEM;
862 	if (!arch_validate_prot(prot, start))
863 		return -EINVAL;
864 
865 	reqprot = prot;
866 
867 	if (mmap_write_lock_killable(current->mm))
868 		return -EINTR;
869 
870 	/*
871 	 * If userspace did not allocate the pkey, do not let
872 	 * them use it here.
873 	 */
874 	error = -EINVAL;
875 	if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
876 		goto out;
877 
878 	vma_iter_init(&vmi, current->mm, start);
879 	vma = vma_find(&vmi, end);
880 	error = -ENOMEM;
881 	if (!vma)
882 		goto out;
883 
884 	if (unlikely(grows & PROT_GROWSDOWN)) {
885 		if (vma->vm_start >= end)
886 			goto out;
887 		start = vma->vm_start;
888 		error = -EINVAL;
889 		if (!(vma->vm_flags & VM_GROWSDOWN))
890 			goto out;
891 	} else {
892 		if (vma->vm_start > start)
893 			goto out;
894 		if (unlikely(grows & PROT_GROWSUP)) {
895 			end = vma->vm_end;
896 			error = -EINVAL;
897 			if (!(vma->vm_flags & VM_GROWSUP))
898 				goto out;
899 		}
900 	}
901 
902 	prev = vma_prev(&vmi);
903 	if (start > vma->vm_start)
904 		prev = vma;
905 
906 	tlb_gather_mmu(&tlb, current->mm);
907 	nstart = start;
908 	tmp = vma->vm_start;
909 	for_each_vma_range(vmi, vma, end) {
910 		vm_flags_t mask_off_old_flags;
911 		vma_flags_t new_vma_flags;
912 		vm_flags_t newflags;
913 		int new_vma_pkey;
914 
915 		if (vma->vm_start != tmp) {
916 			error = -ENOMEM;
917 			break;
918 		}
919 
920 		/* Does the application expect PROT_READ to imply PROT_EXEC */
921 		if (rier && (vma->vm_flags & VM_MAYEXEC))
922 			prot |= PROT_EXEC;
923 
924 		/*
925 		 * Each mprotect() call explicitly passes r/w/x permissions.
926 		 * If a permission is not passed to mprotect(), it must be
927 		 * cleared from the VMA.
928 		 */
929 		mask_off_old_flags = VM_ACCESS_FLAGS | VM_FLAGS_CLEAR;
930 
931 		new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
932 		newflags = calc_vm_prot_bits(prot, new_vma_pkey);
933 		newflags |= (vma->vm_flags & ~mask_off_old_flags);
934 		new_vma_flags = legacy_to_vma_flags(newflags);
935 
936 		/* newflags >> 4 shifts the VM_MAY* bits into the VM_* positions */
937 		if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) {
938 			error = -EACCES;
939 			break;
940 		}
941 
942 		if (map_deny_write_exec(&vma->flags, &new_vma_flags)) {
943 			error = -EACCES;
944 			break;
945 		}
946 
947 		/* Allow architectures to sanity-check the new flags */
948 		if (!arch_validate_flags(newflags)) {
949 			error = -EINVAL;
950 			break;
951 		}
952 
953 		error = security_file_mprotect(vma, reqprot, prot);
954 		if (error)
955 			break;
956 
957 		tmp = vma->vm_end;
958 		if (tmp > end)
959 			tmp = end;
960 
961 		if (vma->vm_ops && vma->vm_ops->mprotect) {
962 			error = vma->vm_ops->mprotect(vma, nstart, tmp, newflags);
963 			if (error)
964 				break;
965 		}
966 
967 		error = mprotect_fixup(&vmi, &tlb, vma, &prev, nstart, tmp, newflags);
968 		if (error)
969 			break;
970 
971 		tmp = vma_iter_end(&vmi);
972 		nstart = tmp;
973 		prot = reqprot;
974 	}
975 	tlb_finish_mmu(&tlb);
976 
977 	if (!error && tmp < end)
978 		error = -ENOMEM;
979 
980 out:
981 	mmap_write_unlock(current->mm);
982 	return error;
983 }
984 
985 SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
986 		unsigned long, prot)
987 {
988 	return do_mprotect_pkey(start, len, prot, -1);
989 }
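
/*
 * Illustrative userspace sketch (not kernel code) of the path exercised by
 * the mprotect() syscall above.  do_mprotect_pkey() requires 'start' to be
 * page aligned and rounds 'len' up to a whole page:
 *
 *	#include <sys/mman.h>
 *
 *	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	p[0] = 1;				// populate the page
 *	if (mprotect(p, 4096, PROT_READ))	// -> do_mprotect_pkey(.., pkey=-1)
 *		perror("mprotect");
 *	// a subsequent write to p now faults with SIGSEGV
 */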
990 
991 #ifdef CONFIG_ARCH_HAS_PKEYS
992 
993 SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
994 		unsigned long, prot, int, pkey)
995 {
996 	return do_mprotect_pkey(start, len, prot, pkey);
997 }
998 
999 SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
1000 {
1001 	int pkey;
1002 	int ret;
1003 
1004 	/* No flags supported yet. */
1005 	if (flags)
1006 		return -EINVAL;
1007 	/* check for unsupported init values */
1008 	if (init_val & ~PKEY_ACCESS_MASK)
1009 		return -EINVAL;
1010 
1011 	mmap_write_lock(current->mm);
1012 	pkey = mm_pkey_alloc(current->mm);
1013 
1014 	ret = -ENOSPC;
1015 	if (pkey == -1)
1016 		goto out;
1017 
1018 	ret = arch_set_user_pkey_access(pkey, init_val);
1019 	if (ret) {
1020 		mm_pkey_free(current->mm, pkey);
1021 		goto out;
1022 	}
1023 	ret = pkey;
1024 out:
1025 	mmap_write_unlock(current->mm);
1026 	return ret;
1027 }
1028 
1029 SYSCALL_DEFINE1(pkey_free, int, pkey)
1030 {
1031 	int ret;
1032 
1033 	mmap_write_lock(current->mm);
1034 	ret = mm_pkey_free(current->mm, pkey);
1035 	mmap_write_unlock(current->mm);
1036 
1037 	/*
1038 	 * We could provide warnings or errors if any VMA still
1039 	 * has the pkey set here.
1040 	 */
1041 	return ret;
1042 }
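
/*
 * Illustrative userspace sketch (not kernel code) tying the three pkey
 * syscalls above together.  Assumes protection-key support in the kernel
 * and libc (e.g. the glibc pkey_alloc()/pkey_mprotect()/pkey_free()
 * wrappers) and an existing page-aligned mapping at 'addr' of 'len' bytes:
 *
 *	#include <sys/mman.h>
 *
 *	int pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);
 *	if (pkey < 0)				// no pkey support, or -ENOSPC
 *		return;
 *	pkey_mprotect(addr, len, PROT_READ | PROT_WRITE, pkey);
 *	// writes to [addr, addr + len) now take a pkey fault until this
 *	// thread relaxes its access rights for 'pkey' again
 *	pkey_free(pkey);
 */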
1043 
1044 #endif /* CONFIG_ARCH_HAS_PKEYS */
1045