xref: /linux/mm/mseal.c (revision 001821b0e79716c4e17c71d8e053a23599a7a508)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  Implement mseal() syscall.
4  *
5  *  Copyright (c) 2023,2024 Google, Inc.
6  *
7  *  Author: Jeff Xu <jeffxu@chromium.org>
8  */
9 
10 #include <linux/mempolicy.h>
11 #include <linux/mman.h>
12 #include <linux/mm.h>
13 #include <linux/mm_inline.h>
14 #include <linux/mmu_context.h>
15 #include <linux/syscalls.h>
16 #include <linux/sched.h>
17 #include "internal.h"
18 
19 static inline bool vma_is_sealed(struct vm_area_struct *vma)
20 {
21 	return (vma->vm_flags & VM_SEALED);
22 }
23 
24 static inline void set_vma_sealed(struct vm_area_struct *vma)
25 {
26 	vm_flags_set(vma, VM_SEALED);
27 }
28 
29 /*
30  * check if a vma is sealed for modification.
31  * return true, if modification is allowed.
32  */
33 static bool can_modify_vma(struct vm_area_struct *vma)
34 {
35 	if (unlikely(vma_is_sealed(vma)))
36 		return false;
37 
38 	return true;
39 }
40 
/*
 * Tell whether an madvise() behavior belongs to the set that mseal
 * treats as discarding.
 *
 * MADV_* behaviors are distinct enumerated values rather than bit
 * flags, so each one has to be matched exactly. Testing the value with
 * a bitwise AND against an OR-ed mask would also match unrelated
 * behaviors that merely share bits with the listed ones.
 *
 * Returns true for MADV_FREE, MADV_DONTNEED, MADV_DONTNEED_LOCKED,
 * MADV_REMOVE, MADV_DONTFORK and MADV_WIPEONFORK, false otherwise.
 */
static bool is_madv_discard(int behavior)
{
	switch (behavior) {
	case MADV_FREE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_REMOVE:
	case MADV_DONTFORK:
	case MADV_WIPEONFORK:
		return true;
	default:
		return false;
	}
}
47 
48 static bool is_ro_anon(struct vm_area_struct *vma)
49 {
50 	/* check anonymous mapping. */
51 	if (vma->vm_file || vma->vm_flags & VM_SHARED)
52 		return false;
53 
54 	/*
55 	 * check for non-writable:
56 	 * PROT=RO or PKRU is not writeable.
57 	 */
58 	if (!(vma->vm_flags & VM_WRITE) ||
59 		!arch_vma_access_permitted(vma, true, false, false))
60 		return true;
61 
62 	return false;
63 }
64 
65 /*
66  * Check if the vmas of a memory range are allowed to be modified.
67  * the memory ranger can have a gap (unallocated memory).
68  * return true, if it is allowed.
69  */
70 bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end)
71 {
72 	struct vm_area_struct *vma;
73 
74 	VMA_ITERATOR(vmi, mm, start);
75 
76 	/* going through each vma to check. */
77 	for_each_vma_range(vmi, vma, end) {
78 		if (unlikely(!can_modify_vma(vma)))
79 			return false;
80 	}
81 
82 	/* Allow by default. */
83 	return true;
84 }
85 
86 /*
87  * Check if the vmas of a memory range are allowed to be modified by madvise.
88  * the memory ranger can have a gap (unallocated memory).
89  * return true, if it is allowed.
90  */
91 bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
92 		int behavior)
93 {
94 	struct vm_area_struct *vma;
95 
96 	VMA_ITERATOR(vmi, mm, start);
97 
98 	if (!is_madv_discard(behavior))
99 		return true;
100 
101 	/* going through each vma to check. */
102 	for_each_vma_range(vmi, vma, end)
103 		if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma)))
104 			return false;
105 
106 	/* Allow by default. */
107 	return true;
108 }
109 
110 static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
111 		struct vm_area_struct **prev, unsigned long start,
112 		unsigned long end, vm_flags_t newflags)
113 {
114 	int ret = 0;
115 	vm_flags_t oldflags = vma->vm_flags;
116 
117 	if (newflags == oldflags)
118 		goto out;
119 
120 	vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
121 	if (IS_ERR(vma)) {
122 		ret = PTR_ERR(vma);
123 		goto out;
124 	}
125 
126 	set_vma_sealed(vma);
127 out:
128 	*prev = vma;
129 	return ret;
130 }
131 
132 /*
133  * Check for do_mseal:
134  * 1> start is part of a valid vma.
135  * 2> end is part of a valid vma.
136  * 3> No gap (unallocated address) between start and end.
137  * 4> map is sealable.
138  */
139 static int check_mm_seal(unsigned long start, unsigned long end)
140 {
141 	struct vm_area_struct *vma;
142 	unsigned long nstart = start;
143 
144 	VMA_ITERATOR(vmi, current->mm, start);
145 
146 	/* going through each vma to check. */
147 	for_each_vma_range(vmi, vma, end) {
148 		if (vma->vm_start > nstart)
149 			/* unallocated memory found. */
150 			return -ENOMEM;
151 
152 		if (vma->vm_end >= end)
153 			return 0;
154 
155 		nstart = vma->vm_end;
156 	}
157 
158 	return -ENOMEM;
159 }
160 
161 /*
162  * Apply sealing.
163  */
164 static int apply_mm_seal(unsigned long start, unsigned long end)
165 {
166 	unsigned long nstart;
167 	struct vm_area_struct *vma, *prev;
168 
169 	VMA_ITERATOR(vmi, current->mm, start);
170 
171 	vma = vma_iter_load(&vmi);
172 	/*
173 	 * Note: check_mm_seal should already checked ENOMEM case.
174 	 * so vma should not be null, same for the other ENOMEM cases.
175 	 */
176 	prev = vma_prev(&vmi);
177 	if (start > vma->vm_start)
178 		prev = vma;
179 
180 	nstart = start;
181 	for_each_vma_range(vmi, vma, end) {
182 		int error;
183 		unsigned long tmp;
184 		vm_flags_t newflags;
185 
186 		newflags = vma->vm_flags | VM_SEALED;
187 		tmp = vma->vm_end;
188 		if (tmp > end)
189 			tmp = end;
190 		error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
191 		if (error)
192 			return error;
193 		nstart = vma_iter_end(&vmi);
194 	}
195 
196 	return 0;
197 }
198 
199 /*
200  * mseal(2) seals the VM's meta data from
201  * selected syscalls.
202  *
203  * addr/len: VM address range.
204  *
205  *  The address range by addr/len must meet:
206  *   start (addr) must be in a valid VMA.
207  *   end (addr + len) must be in a valid VMA.
208  *   no gap (unallocated memory) between start and end.
209  *   start (addr) must be page aligned.
210  *
211  *  len: len will be page aligned implicitly.
212  *
213  *   Below VMA operations are blocked after sealing.
214  *   1> Unmapping, moving to another location, and shrinking
215  *	the size, via munmap() and mremap(), can leave an empty
216  *	space, therefore can be replaced with a VMA with a new
217  *	set of attributes.
218  *   2> Moving or expanding a different vma into the current location,
219  *	via mremap().
220  *   3> Modifying a VMA via mmap(MAP_FIXED).
221  *   4> Size expansion, via mremap(), does not appear to pose any
222  *	specific risks to sealed VMAs. It is included anyway because
223  *	the use case is unclear. In any case, users can rely on
224  *	merging to expand a sealed VMA.
225  *   5> mprotect and pkey_mprotect.
226  *   6> Some destructive madvice() behavior (e.g. MADV_DONTNEED)
227  *      for anonymous memory, when users don't have write permission to the
228  *	memory. Those behaviors can alter region contents by discarding pages,
229  *	effectively a memset(0) for anonymous memory.
230  *
231  *  flags: reserved.
232  *
233  * return values:
234  *  zero: success.
235  *  -EINVAL:
236  *   invalid input flags.
237  *   start address is not page aligned.
238  *   Address arange (start + len) overflow.
239  *  -ENOMEM:
240  *   addr is not a valid address (not allocated).
241  *   end (start + len) is not a valid address.
242  *   a gap (unallocated memory) between start and end.
243  *  -EPERM:
244  *  - In 32 bit architecture, sealing is not supported.
245  * Note:
246  *  user can call mseal(2) multiple times, adding a seal on an
247  *  already sealed memory is a no-action (no error).
248  *
249  *  unseal() is not supported.
250  */
251 static int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
252 {
253 	size_t len;
254 	int ret = 0;
255 	unsigned long end;
256 	struct mm_struct *mm = current->mm;
257 
258 	ret = can_do_mseal(flags);
259 	if (ret)
260 		return ret;
261 
262 	start = untagged_addr(start);
263 	if (!PAGE_ALIGNED(start))
264 		return -EINVAL;
265 
266 	len = PAGE_ALIGN(len_in);
267 	/* Check to see whether len was rounded up from small -ve to zero. */
268 	if (len_in && !len)
269 		return -EINVAL;
270 
271 	end = start + len;
272 	if (end < start)
273 		return -EINVAL;
274 
275 	if (end == start)
276 		return 0;
277 
278 	if (mmap_write_lock_killable(mm))
279 		return -EINTR;
280 
281 	/*
282 	 * First pass, this helps to avoid
283 	 * partial sealing in case of error in input address range,
284 	 * e.g. ENOMEM error.
285 	 */
286 	ret = check_mm_seal(start, end);
287 	if (ret)
288 		goto out;
289 
290 	/*
291 	 * Second pass, this should success, unless there are errors
292 	 * from vma_modify_flags, e.g. merge/split error, or process
293 	 * reaching the max supported VMAs, however, those cases shall
294 	 * be rare.
295 	 */
296 	ret = apply_mm_seal(start, end);
297 
298 out:
299 	mmap_write_unlock(current->mm);
300 	return ret;
301 }
302 
303 SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
304 		flags)
305 {
306 	return do_mseal(start, len, flags);
307 }
308