xref: /linux/mm/mseal.c (revision 04c319e05d0b08cc789db7abccce0fcb13dbab16)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  Implement mseal() syscall.
4  *
5  *  Copyright (c) 2023,2024 Google, Inc.
6  *
7  *  Author: Jeff Xu <jeffxu@chromium.org>
8  */
9 
10 #include <linux/mempolicy.h>
11 #include <linux/mman.h>
12 #include <linux/mm.h>
13 #include <linux/mm_inline.h>
14 #include <linux/mmu_context.h>
15 #include <linux/syscalls.h>
16 #include <linux/sched.h>
17 #include "internal.h"
18 
19 static inline void set_vma_sealed(struct vm_area_struct *vma)
20 {
21 	vm_flags_set(vma, VM_SEALED);
22 }
23 
24 static bool is_madv_discard(int behavior)
25 {
26 	switch (behavior) {
27 	case MADV_FREE:
28 	case MADV_DONTNEED:
29 	case MADV_DONTNEED_LOCKED:
30 	case MADV_REMOVE:
31 	case MADV_DONTFORK:
32 	case MADV_WIPEONFORK:
33 	case MADV_GUARD_INSTALL:
34 		return true;
35 	}
36 
37 	return false;
38 }
39 
40 static bool is_ro_anon(struct vm_area_struct *vma)
41 {
42 	/* check anonymous mapping. */
43 	if (vma->vm_file || vma->vm_flags & VM_SHARED)
44 		return false;
45 
46 	/*
47 	 * check for non-writable:
48 	 * PROT=RO or PKRU is not writeable.
49 	 */
50 	if (!(vma->vm_flags & VM_WRITE) ||
51 		!arch_vma_access_permitted(vma, true, false, false))
52 		return true;
53 
54 	return false;
55 }
56 
/*
 * Decide whether madvise() with @behavior may be applied to @vma.
 *
 * The request is refused only when all of the following hold:
 *  - @behavior is a discard operation (see is_madv_discard()),
 *  - can_modify_vma() rejects @vma,
 *  - @vma is a read-only anonymous mapping (see is_ro_anon()).
 *
 * Returns true when the operation is allowed, false otherwise.
 */
bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
{
	/* Advice that does not discard is never restricted here. */
	if (!is_madv_discard(behavior))
		return true;

	/* A VMA that may not be modified is protected only if RO anon. */
	if (unlikely(!can_modify_vma(vma)))
		return !is_ro_anon(vma);

	/* Allow by default. */
	return true;
}
71 
72 static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
73 		struct vm_area_struct **prev, unsigned long start,
74 		unsigned long end, vm_flags_t newflags)
75 {
76 	int ret = 0;
77 	vm_flags_t oldflags = vma->vm_flags;
78 
79 	if (newflags == oldflags)
80 		goto out;
81 
82 	vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
83 	if (IS_ERR(vma)) {
84 		ret = PTR_ERR(vma);
85 		goto out;
86 	}
87 
88 	set_vma_sealed(vma);
89 out:
90 	*prev = vma;
91 	return ret;
92 }
93 
94 /*
95  * Check for do_mseal:
96  * 1> start is part of a valid vma.
97  * 2> end is part of a valid vma.
98  * 3> No gap (unallocated address) between start and end.
99  * 4> map is sealable.
100  */
101 static int check_mm_seal(unsigned long start, unsigned long end)
102 {
103 	struct vm_area_struct *vma;
104 	unsigned long nstart = start;
105 
106 	VMA_ITERATOR(vmi, current->mm, start);
107 
108 	/* going through each vma to check. */
109 	for_each_vma_range(vmi, vma, end) {
110 		if (vma->vm_start > nstart)
111 			/* unallocated memory found. */
112 			return -ENOMEM;
113 
114 		if (vma->vm_end >= end)
115 			return 0;
116 
117 		nstart = vma->vm_end;
118 	}
119 
120 	return -ENOMEM;
121 }
122 
123 /*
124  * Apply sealing.
125  */
126 static int apply_mm_seal(unsigned long start, unsigned long end)
127 {
128 	unsigned long nstart;
129 	struct vm_area_struct *vma, *prev;
130 
131 	VMA_ITERATOR(vmi, current->mm, start);
132 
133 	vma = vma_iter_load(&vmi);
134 	/*
135 	 * Note: check_mm_seal should already checked ENOMEM case.
136 	 * so vma should not be null, same for the other ENOMEM cases.
137 	 */
138 	prev = vma_prev(&vmi);
139 	if (start > vma->vm_start)
140 		prev = vma;
141 
142 	nstart = start;
143 	for_each_vma_range(vmi, vma, end) {
144 		int error;
145 		unsigned long tmp;
146 		vm_flags_t newflags;
147 
148 		newflags = vma->vm_flags | VM_SEALED;
149 		tmp = vma->vm_end;
150 		if (tmp > end)
151 			tmp = end;
152 		error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
153 		if (error)
154 			return error;
155 		nstart = vma_iter_end(&vmi);
156 	}
157 
158 	return 0;
159 }
160 
161 /*
162  * mseal(2) seals the VM's meta data from
163  * selected syscalls.
164  *
165  * addr/len: VM address range.
166  *
167  *  The address range by addr/len must meet:
168  *   start (addr) must be in a valid VMA.
169  *   end (addr + len) must be in a valid VMA.
170  *   no gap (unallocated memory) between start and end.
171  *   start (addr) must be page aligned.
172  *
173  *  len: len will be page aligned implicitly.
174  *
175  *   Below VMA operations are blocked after sealing.
176  *   1> Unmapping, moving to another location, and shrinking
177  *	the size, via munmap() and mremap(), can leave an empty
178  *	space, therefore can be replaced with a VMA with a new
179  *	set of attributes.
180  *   2> Moving or expanding a different vma into the current location,
181  *	via mremap().
182  *   3> Modifying a VMA via mmap(MAP_FIXED).
183  *   4> Size expansion, via mremap(), does not appear to pose any
184  *	specific risks to sealed VMAs. It is included anyway because
185  *	the use case is unclear. In any case, users can rely on
186  *	merging to expand a sealed VMA.
187  *   5> mprotect and pkey_mprotect.
188  *   6> Some destructive madvice() behavior (e.g. MADV_DONTNEED)
189  *      for anonymous memory, when users don't have write permission to the
190  *	memory. Those behaviors can alter region contents by discarding pages,
191  *	effectively a memset(0) for anonymous memory.
192  *
193  *  flags: reserved.
194  *
195  * return values:
196  *  zero: success.
197  *  -EINVAL:
198  *   invalid input flags.
199  *   start address is not page aligned.
200  *   Address arange (start + len) overflow.
201  *  -ENOMEM:
202  *   addr is not a valid address (not allocated).
203  *   end (start + len) is not a valid address.
204  *   a gap (unallocated memory) between start and end.
205  *  -EPERM:
206  *  - In 32 bit architecture, sealing is not supported.
207  * Note:
208  *  user can call mseal(2) multiple times, adding a seal on an
209  *  already sealed memory is a no-action (no error).
210  *
211  *  unseal() is not supported.
212  */
213 int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
214 {
215 	size_t len;
216 	int ret = 0;
217 	unsigned long end;
218 	struct mm_struct *mm = current->mm;
219 
220 	ret = can_do_mseal(flags);
221 	if (ret)
222 		return ret;
223 
224 	start = untagged_addr(start);
225 	if (!PAGE_ALIGNED(start))
226 		return -EINVAL;
227 
228 	len = PAGE_ALIGN(len_in);
229 	/* Check to see whether len was rounded up from small -ve to zero. */
230 	if (len_in && !len)
231 		return -EINVAL;
232 
233 	end = start + len;
234 	if (end < start)
235 		return -EINVAL;
236 
237 	if (end == start)
238 		return 0;
239 
240 	if (mmap_write_lock_killable(mm))
241 		return -EINTR;
242 
243 	/*
244 	 * First pass, this helps to avoid
245 	 * partial sealing in case of error in input address range,
246 	 * e.g. ENOMEM error.
247 	 */
248 	ret = check_mm_seal(start, end);
249 	if (ret)
250 		goto out;
251 
252 	/*
253 	 * Second pass, this should success, unless there are errors
254 	 * from vma_modify_flags, e.g. merge/split error, or process
255 	 * reaching the max supported VMAs, however, those cases shall
256 	 * be rare.
257 	 */
258 	ret = apply_mm_seal(start, end);
259 
260 out:
261 	mmap_write_unlock(current->mm);
262 	return ret;
263 }
264 
265 SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
266 		flags)
267 {
268 	return do_mseal(start, len, flags);
269 }
270