// SPDX-License-Identifier: GPL-2.0
/*
 *  Implement mseal() syscall.
 *
 *  Copyright (c) 2023,2024 Google, Inc.
 *
 *  Author: Jeff Xu <jeffxu@chromium.org>
 */

#include <linux/mempolicy.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_context.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include "internal.h"

static inline void set_vma_sealed(struct vm_area_struct *vma)
{
	vm_flags_set(vma, VM_SEALED);
}

static bool is_madv_discard(int behavior)
{
	switch (behavior) {
	case MADV_FREE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_REMOVE:
	case MADV_DONTFORK:
	case MADV_WIPEONFORK:
		return true;
	}

	return false;
}
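
/*
 * Illustrative sketch (user space, not part of this file): why the
 * discard behaviors above matter. On a sealed, read-only anonymous
 * mapping, a discarding madvise() such as MADV_DONTNEED would zero
 * the pages even though the memory cannot be written directly, so it
 * is rejected; non-destructive hints like MADV_WILLNEED (not in the
 * list above) remain allowed. mseal() is invoked via the raw syscall
 * here, since a libc wrapper may not be available:
 *
 *	void *p = mmap(NULL, len, PROT_READ,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	syscall(__NR_mseal, p, len, 0);
 *	madvise(p, len, MADV_WILLNEED);	// allowed: not destructive
 *	madvise(p, len, MADV_DONTNEED);	// rejected (-EPERM)
 */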

static bool is_ro_anon(struct vm_area_struct *vma)
{
	/* Check for a private anonymous mapping. */
	if (vma->vm_file || vma->vm_flags & VM_SHARED)
		return false;

	/*
	 * Check for non-writable:
	 * PROT is read-only, or the protection key (PKRU) does not
	 * permit write access.
	 */
	if (!(vma->vm_flags & VM_WRITE) ||
		!arch_vma_access_permitted(vma, true, false, false))
		return true;

	return false;
}
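
/*
 * Example (illustrative): a mapping created with
 *
 *	mmap(NULL, len, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)
 *
 * satisfies is_ro_anon(): it has no backing file, is not shared, and
 * lacks VM_WRITE. The same mapping created with PROT_READ | PROT_WRITE
 * does not, since its contents can already be overwritten by an
 * ordinary store.
 */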

/*
 * Check if a vma is allowed to be modified by madvise.
 */
bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
{
	if (!is_madv_discard(behavior))
		return true;

	if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
		return false;

	/* Allow by default. */
	return true;
}
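
/*
 * Decision summary (illustrative): a discarding behavior is refused
 * only when both conditions hold, i.e. the VMA is sealed *and* it is
 * read-only anonymous memory:
 *
 *	behavior	sealed?	ro-anon?	result
 *	MADV_WILLNEED	any	any		allowed (not a discard)
 *	MADV_DONTNEED	no	yes		allowed
 *	MADV_DONTNEED	yes	no		allowed (writable anyway)
 *	MADV_DONTNEED	yes	yes		refused
 */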

static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, vm_flags_t newflags)
{
	int ret = 0;
	vm_flags_t oldflags = vma->vm_flags;

	if (newflags == oldflags)
		goto out;

	vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	set_vma_sealed(vma);
out:
	*prev = vma;
	return ret;
}
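
/*
 * Example (illustrative addresses): sealing [0x2000, 0x3000) in the
 * middle of an unsealed VMA spanning [0x1000, 0x4000) makes
 * vma_modify_flags() split it, and only the middle piece gains
 * VM_SEALED:
 *
 *	before:	[0x1000                               0x4000)
 *	after:	[0x1000 0x2000)[0x2000 0x3000)[0x3000 0x4000)
 *	                       `- VM_SEALED -'
 */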

/*
 * Check for do_mseal:
 * 1> start is part of a valid vma.
 * 2> end is part of a valid vma.
 * 3> No gap (unallocated address) between start and end.
 * 4> map is sealable.
 */
static int check_mm_seal(unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma;
	unsigned long nstart = start;

	VMA_ITERATOR(vmi, current->mm, start);

	/* Go through each VMA in the range. */
	for_each_vma_range(vmi, vma, end) {
		if (vma->vm_start > nstart)
			/* Unallocated memory found. */
			return -ENOMEM;

		if (vma->vm_end >= end)
			return 0;

		nstart = vma->vm_end;
	}

	return -ENOMEM;
}
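
/*
 * Worked example (illustrative addresses): for start = 0x1000 and
 * end = 0x6000 with VMAs [0x1000, 0x3000) and [0x4000, 0x6000), the
 * first iteration advances nstart to 0x3000; the second VMA starts
 * at 0x4000 > 0x3000, so the gap is detected and -ENOMEM returned.
 * With a third VMA covering [0x3000, 0x4000), the walk eventually
 * reaches vma->vm_end >= end and returns 0.
 */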

/*
 * Apply sealing.
 */
static int apply_mm_seal(unsigned long start, unsigned long end)
{
	unsigned long nstart;
	struct vm_area_struct *vma, *prev;

	VMA_ITERATOR(vmi, current->mm, start);

	vma = vma_iter_load(&vmi);
	/*
	 * Note: check_mm_seal() has already checked the ENOMEM case,
	 * so vma should not be NULL; the same holds for the other
	 * ENOMEM cases.
	 */
	prev = vma_prev(&vmi);
	if (start > vma->vm_start)
		prev = vma;

	nstart = start;
	for_each_vma_range(vmi, vma, end) {
		int error;
		unsigned long tmp;
		vm_flags_t newflags;

		newflags = vma->vm_flags | VM_SEALED;
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
		if (error)
			return error;
		nstart = vma_iter_end(&vmi);
	}

	return 0;
}
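
/*
 * Example (illustrative): if end = 0x5000 falls inside a VMA spanning
 * [0x4000, 0x8000), tmp is clamped from 0x8000 down to 0x5000, so
 * mseal_fixup() splits the VMA and seals only [0x4000, 0x5000).
 */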

/*
 * mseal(2) seals the VM's metadata from
 * selected syscalls.
 *
 * addr/len: VM address range.
 *
 *  The address range by addr/len must meet:
 *   start (addr) must be in a valid VMA.
 *   end (addr + len) must be in a valid VMA.
 *   no gap (unallocated memory) between start and end.
 *   start (addr) must be page aligned.
 *
 *  len: len will be page aligned implicitly.
 *
 *   Below VMA operations are blocked after sealing.
 *   1> Unmapping, moving to another location, and shrinking
 *	the size, via munmap() and mremap(): these can leave an
 *	empty space, which can then be replaced with a VMA having
 *	a new set of attributes.
 *   2> Moving or expanding a different vma into the current location,
 *	via mremap().
 *   3> Modifying a VMA via mmap(MAP_FIXED).
 *   4> Size expansion, via mremap(), does not appear to pose any
 *	specific risks to sealed VMAs. It is included anyway because
 *	the use case is unclear. In any case, users can rely on
 *	merging to expand a sealed VMA.
 *   5> mprotect() and pkey_mprotect().
 *   6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED)
 *	for anonymous memory, when users don't have write permission
 *	to the memory. Those behaviors can alter region contents by
 *	discarding pages, effectively a memset(0) for anonymous memory.
 *
 *  flags: reserved.
 *
 * return values:
 *  zero: success.
 *  -EINVAL:
 *   invalid input flags.
 *   start address is not page aligned.
 *   address range (start + len) overflows.
 *  -ENOMEM:
 *   addr is not a valid address (not allocated).
 *   end (start + len) is not a valid address.
 *   a gap (unallocated memory) between start and end.
 *  -EPERM:
 *  - On 32-bit architectures, sealing is not supported.
 * Note:
 *  user can call mseal(2) multiple times; sealing already-sealed
 *  memory is a no-op (no error).
 *
 *  unseal() is not supported.
 */
int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
{
	size_t len;
	int ret = 0;
	unsigned long end;
	struct mm_struct *mm = current->mm;

	ret = can_do_mseal(flags);
	if (ret)
		return ret;

	start = untagged_addr(start);
	if (!PAGE_ALIGNED(start))
		return -EINVAL;

	len = PAGE_ALIGN(len_in);
	/* Check whether len was rounded up from a small negative value to zero. */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	/*
	 * First pass: check the input address range up front, to
	 * avoid partially sealing the range on an input error,
	 * e.g. an ENOMEM error.
	 */
	ret = check_mm_seal(start, end);
	if (ret)
		goto out;

	/*
	 * Second pass: this should succeed unless vma_modify_flags()
	 * fails, e.g. on a merge/split error, or when the process
	 * reaches the maximum number of supported VMAs; those cases
	 * should be rare.
	 */
	ret = apply_mm_seal(start, end);

out:
	mmap_write_unlock(current->mm);
	return ret;
}

SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
		flags)
{
	return do_mseal(start, len, flags);
}
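
/*
 * Usage sketch (illustrative, user space, not part of this file's
 * build): libc may not provide a wrapper, so the raw syscall is used;
 * error handling is omitted for brevity.
 *
 *	#include <sys/mman.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	long r = syscall(__NR_mseal, p, 4096UL, 0UL);
 *	// After sealing: munmap(p, 4096) and
 *	// mprotect(p, 4096, PROT_READ) both fail with EPERM.
 */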