xref: /linux/mm/mseal.c (revision bf36793fa260cb68cc817f311f1f683788261796)
// SPDX-License-Identifier: GPL-2.0
/*
 *  Implement mseal() syscall.
 *
 *  Copyright (c) 2023,2024 Google, Inc.
 *
 *  Author: Jeff Xu <jeffxu@chromium.org>
 */

#include <linux/mempolicy.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_context.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include "internal.h"

static inline bool vma_is_sealed(struct vm_area_struct *vma)
{
	return (vma->vm_flags & VM_SEALED);
}

static inline void set_vma_sealed(struct vm_area_struct *vma)
{
	vm_flags_set(vma, VM_SEALED);
}

/*
 * Check whether a vma is sealed against modification.
 * Return true if modification is allowed.
 */
static bool can_modify_vma(struct vm_area_struct *vma)
{
	if (unlikely(vma_is_sealed(vma)))
		return false;

	return true;
}

static bool is_madv_discard(int behavior)
{
	switch (behavior) {
	case MADV_FREE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_REMOVE:
	case MADV_DONTFORK:
	case MADV_WIPEONFORK:
		return true;
	}

	return false;
}

static bool is_ro_anon(struct vm_area_struct *vma)
{
	/* check for a private anonymous mapping. */
	if (vma->vm_file || vma->vm_flags & VM_SHARED)
		return false;

	/*
	 * check for non-writable:
	 * PROT=RO, or the protection key (PKRU) does not allow writes.
	 */
	if (!(vma->vm_flags & VM_WRITE) ||
		!arch_vma_access_permitted(vma, true, false, false))
		return true;

	return false;
}

/*
 * Check if the vmas of a memory range are allowed to be modified.
 * The memory range can have a gap (unallocated memory).
 * Return true if modification is allowed.
 */
bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma;

	VMA_ITERATOR(vmi, mm, start);

	/* going through each vma to check. */
	for_each_vma_range(vmi, vma, end) {
		if (unlikely(!can_modify_vma(vma)))
			return false;
	}

	/* Allow by default. */
	return true;
}

/*
 * Check if the vmas of a memory range are allowed to be modified by madvise.
 * The memory range can have a gap (unallocated memory).
 * Return true if modification is allowed.
 */
bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
		int behavior)
{
	struct vm_area_struct *vma;

	VMA_ITERATOR(vmi, mm, start);

	if (!is_madv_discard(behavior))
		return true;

	/* going through each vma to check. */
	for_each_vma_range(vmi, vma, end)
		if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma)))
			return false;

	/* Allow by default. */
	return true;
}

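/*
 * A minimal sketch of the intended calling pattern for can_modify_mm(); the
 * real call sites live elsewhere in mm/ (the munmap/mremap/mprotect paths),
 * and can_modify_mm_madv() is used similarly on the madvise path, so treat
 * this as illustration only:
 *
 *	mmap_write_lock(mm);
 *	if (!can_modify_mm(mm, start, end)) {
 *		mmap_write_unlock(mm);
 *		return -EPERM;
 *	}
 *	... proceed to modify the VMAs in [start, end) ...
 */
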
static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, vm_flags_t newflags)
{
	int ret = 0;
	vm_flags_t oldflags = vma->vm_flags;

	if (newflags == oldflags)
		goto out;

	vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	set_vma_sealed(vma);
out:
	*prev = vma;
	return ret;
}

/*
 * Check for do_mseal:
 * 1> start is part of a valid vma.
 * 2> end is part of a valid vma.
 * 3> No gap (unallocated address) between start and end.
 * 4> map is sealable.
 */
static int check_mm_seal(unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma;
	unsigned long nstart = start;

	VMA_ITERATOR(vmi, current->mm, start);

	/* going through each vma to check. */
	for_each_vma_range(vmi, vma, end) {
		if (vma->vm_start > nstart)
			/* unallocated memory found. */
			return -ENOMEM;

		if (vma->vm_end >= end)
			return 0;

		nstart = vma->vm_end;
	}

	return -ENOMEM;
}

/*
 * Apply sealing.
 */
static int apply_mm_seal(unsigned long start, unsigned long end)
{
	unsigned long nstart;
	struct vm_area_struct *vma, *prev;

	VMA_ITERATOR(vmi, current->mm, start);

	vma = vma_iter_load(&vmi);
	/*
	 * Note: check_mm_seal() has already checked the ENOMEM case,
	 * so vma should not be NULL; the same holds for the other
	 * ENOMEM cases.
	 */
	prev = vma_prev(&vmi);
	if (start > vma->vm_start)
		prev = vma;

	nstart = start;
	for_each_vma_range(vmi, vma, end) {
		int error;
		unsigned long tmp;
		vm_flags_t newflags;

		newflags = vma->vm_flags | VM_SEALED;
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
		if (error)
			return error;
		nstart = vma_iter_end(&vmi);
	}

	return 0;
}

/*
 * mseal(2) seals the VM's metadata from
 * selected syscalls.
 *
 * addr/len: VM address range.
 *
 *  The address range given by addr/len must meet:
 *   start (addr) must be in a valid VMA.
 *   end (addr + len) must be in a valid VMA.
 *   no gap (unallocated memory) between start and end.
 *   start (addr) must be page aligned.
 *
 *  len: len will be page aligned implicitly.
 *
 *   The VMA operations below are blocked after sealing:
 *   1> Unmapping, moving to another location, and shrinking
 *	the size, via munmap() and mremap(): these can leave an empty
 *	space, which could then be replaced with a VMA carrying a new
 *	set of attributes.
 *   2> Moving or expanding a different vma into the current location,
 *	via mremap().
 *   3> Modifying a VMA via mmap(MAP_FIXED).
 *   4> Size expansion, via mremap(), does not appear to pose any
 *	specific risks to sealed VMAs. It is included anyway because
 *	the use case is unclear. In any case, users can rely on
 *	merging to expand a sealed VMA.
 *   5> mprotect() and pkey_mprotect().
 *   6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED)
 *	for anonymous memory, when users don't have write permission to the
 *	memory. Those behaviors can alter region contents by discarding pages,
 *	effectively a memset(0) for anonymous memory.
 *
 *  flags: reserved.
 *
 * return values:
 *  zero: success.
 *  -EINVAL:
 *   invalid input flags.
 *   start address is not page aligned.
 *   Address range (start + len) overflows.
 *  -ENOMEM:
 *   addr is not a valid address (not allocated).
 *   end (start + len) is not a valid address.
 *   a gap (unallocated memory) between start and end.
 *  -EPERM:
 *  - On 32-bit architectures, sealing is not supported.
 * Note:
 *  a user can call mseal(2) multiple times; sealing an
 *  already sealed memory range is a no-op (no error).
 *
 *  unseal() is not supported.
 *
 *  A user-space usage sketch is appended at the end of this file.
 */
static int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
{
	size_t len;
	int ret = 0;
	unsigned long end;
	struct mm_struct *mm = current->mm;

	ret = can_do_mseal(flags);
	if (ret)
		return ret;

	start = untagged_addr(start);
	if (!PAGE_ALIGNED(start))
		return -EINVAL;

	len = PAGE_ALIGN(len_in);
	/* Check to see whether len was rounded up from small -ve to zero. */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	/*
	 * First pass: check the address range, which helps to avoid
	 * partial sealing when the input range is invalid, e.g. on an
	 * ENOMEM error.
	 */
	ret = check_mm_seal(start, end);
	if (ret)
		goto out;

	/*
	 * Second pass: this should succeed, unless vma_modify_flags
	 * fails, e.g. on a merge/split error or when the process reaches
	 * the maximum supported number of VMAs; such cases should be rare.
	 */
	ret = apply_mm_seal(start, end);

out:
	mmap_write_unlock(current->mm);
	return ret;
}

SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
		flags)
{
	return do_mseal(start, len, flags);
}
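
/*
 * Example (user-space usage): a minimal sketch, not part of this file, shown
 * only for illustration. It assumes headers that define __NR_mseal and a
 * libc without an mseal() wrapper, hence the raw syscall(); error handling
 * is elided and a 4 KiB page size is assumed.
 *
 *	#include <sys/mman.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *	syscall(__NR_mseal, p, 4096, 0);
 *
 * The last argument (flags) is reserved and must be 0. After a successful
 * call, munmap(), mremap(), mmap(MAP_FIXED), mprotect() and pkey_mprotect()
 * on the sealed range fail with EPERM, and sealing the same range again is
 * a no-op that returns 0.
 */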