xref: /linux/mm/mseal.c (revision 42b16d3ac371a2fac9b6f08fd75f23f34ba3955a)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  Implement mseal() syscall.
4  *
5  *  Copyright (c) 2023,2024 Google, Inc.
6  *
7  *  Author: Jeff Xu <jeffxu@chromium.org>
8  */
9 
10 #include <linux/mempolicy.h>
11 #include <linux/mman.h>
12 #include <linux/mm.h>
13 #include <linux/mm_inline.h>
14 #include <linux/mmu_context.h>
15 #include <linux/syscalls.h>
16 #include <linux/sched.h>
17 #include "internal.h"
18 
set_vma_sealed(struct vm_area_struct * vma)19 static inline void set_vma_sealed(struct vm_area_struct *vma)
20 {
21 	vm_flags_set(vma, VM_SEALED);
22 }
23 
is_madv_discard(int behavior)24 static bool is_madv_discard(int behavior)
25 {
26 	switch (behavior) {
27 	case MADV_FREE:
28 	case MADV_DONTNEED:
29 	case MADV_DONTNEED_LOCKED:
30 	case MADV_REMOVE:
31 	case MADV_DONTFORK:
32 	case MADV_WIPEONFORK:
33 		return true;
34 	}
35 
36 	return false;
37 }
38 
is_ro_anon(struct vm_area_struct * vma)39 static bool is_ro_anon(struct vm_area_struct *vma)
40 {
41 	/* check anonymous mapping. */
42 	if (vma->vm_file || vma->vm_flags & VM_SHARED)
43 		return false;
44 
45 	/*
46 	 * check for non-writable:
47 	 * PROT=RO or PKRU is not writeable.
48 	 */
49 	if (!(vma->vm_flags & VM_WRITE) ||
50 		!arch_vma_access_permitted(vma, true, false, false))
51 		return true;
52 
53 	return false;
54 }
55 
/*
 * Decide whether madvise() with @behavior may be applied to @vma.
 *
 * Only behaviors accepted by is_madv_discard() are subject to the check.
 * For those, the request is refused when can_modify_vma() rejects the VMA
 * and is_ro_anon() reports it as read-only anonymous memory.
 *
 * Returns true when the operation is allowed, false when it is blocked.
 */
bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
{
	if (is_madv_discard(behavior) &&
	    unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
		return false;

	/* Everything else is permitted. */
	return true;
}
70 
/*
 * Apply @newflags to the [@start, @end) portion of @vma and mark the
 * resulting VMA as sealed.
 *
 * @vmi:      VMA iterator owned by the caller.
 * @vma:      VMA covering the range to be sealed.
 * @prev:     in/out. Passed to vma_modify_flags() as the previous VMA; on
 *            success it is updated to the VMA that was processed. On
 *            failure it is left untouched.
 * @start:    start of the range within @vma.
 * @end:      end of the range within @vma.
 * @newflags: flags the range should end up with.
 *
 * If @newflags already equals the VMA's current flags nothing is modified
 * and only *@prev is advanced.
 *
 * Returns 0 on success, or the negative errno encoded in the pointer
 * returned by vma_modify_flags().
 */
static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, vm_flags_t newflags)
{
	if (newflags != vma->vm_flags) {
		vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
		/*
		 * Bail out before touching *prev: an ERR_PTR must never be
		 * handed back to the caller as if it were a valid VMA.
		 */
		if (IS_ERR(vma))
			return PTR_ERR(vma);

		set_vma_sealed(vma);
	}

	*prev = vma;
	return 0;
}
92 
93 /*
94  * Check for do_mseal:
95  * 1> start is part of a valid vma.
96  * 2> end is part of a valid vma.
97  * 3> No gap (unallocated address) between start and end.
98  * 4> map is sealable.
99  */
check_mm_seal(unsigned long start,unsigned long end)100 static int check_mm_seal(unsigned long start, unsigned long end)
101 {
102 	struct vm_area_struct *vma;
103 	unsigned long nstart = start;
104 
105 	VMA_ITERATOR(vmi, current->mm, start);
106 
107 	/* going through each vma to check. */
108 	for_each_vma_range(vmi, vma, end) {
109 		if (vma->vm_start > nstart)
110 			/* unallocated memory found. */
111 			return -ENOMEM;
112 
113 		if (vma->vm_end >= end)
114 			return 0;
115 
116 		nstart = vma->vm_end;
117 	}
118 
119 	return -ENOMEM;
120 }
121 
122 /*
123  * Apply sealing.
124  */
apply_mm_seal(unsigned long start,unsigned long end)125 static int apply_mm_seal(unsigned long start, unsigned long end)
126 {
127 	unsigned long nstart;
128 	struct vm_area_struct *vma, *prev;
129 
130 	VMA_ITERATOR(vmi, current->mm, start);
131 
132 	vma = vma_iter_load(&vmi);
133 	/*
134 	 * Note: check_mm_seal should already checked ENOMEM case.
135 	 * so vma should not be null, same for the other ENOMEM cases.
136 	 */
137 	prev = vma_prev(&vmi);
138 	if (start > vma->vm_start)
139 		prev = vma;
140 
141 	nstart = start;
142 	for_each_vma_range(vmi, vma, end) {
143 		int error;
144 		unsigned long tmp;
145 		vm_flags_t newflags;
146 
147 		newflags = vma->vm_flags | VM_SEALED;
148 		tmp = vma->vm_end;
149 		if (tmp > end)
150 			tmp = end;
151 		error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
152 		if (error)
153 			return error;
154 		nstart = vma_iter_end(&vmi);
155 	}
156 
157 	return 0;
158 }
159 
160 /*
161  * mseal(2) seals the VM's meta data from
162  * selected syscalls.
163  *
164  * addr/len: VM address range.
165  *
166  *  The address range by addr/len must meet:
167  *   start (addr) must be in a valid VMA.
168  *   end (addr + len) must be in a valid VMA.
169  *   no gap (unallocated memory) between start and end.
170  *   start (addr) must be page aligned.
171  *
172  *  len: len will be page aligned implicitly.
173  *
174  *   Below VMA operations are blocked after sealing.
175  *   1> Unmapping, moving to another location, and shrinking
176  *	the size, via munmap() and mremap(), can leave an empty
177  *	space, therefore can be replaced with a VMA with a new
178  *	set of attributes.
179  *   2> Moving or expanding a different vma into the current location,
180  *	via mremap().
181  *   3> Modifying a VMA via mmap(MAP_FIXED).
182  *   4> Size expansion, via mremap(), does not appear to pose any
183  *	specific risks to sealed VMAs. It is included anyway because
184  *	the use case is unclear. In any case, users can rely on
185  *	merging to expand a sealed VMA.
186  *   5> mprotect and pkey_mprotect.
187  *   6> Some destructive madvice() behavior (e.g. MADV_DONTNEED)
188  *      for anonymous memory, when users don't have write permission to the
189  *	memory. Those behaviors can alter region contents by discarding pages,
190  *	effectively a memset(0) for anonymous memory.
191  *
192  *  flags: reserved.
193  *
194  * return values:
195  *  zero: success.
196  *  -EINVAL:
197  *   invalid input flags.
198  *   start address is not page aligned.
199  *   Address arange (start + len) overflow.
200  *  -ENOMEM:
201  *   addr is not a valid address (not allocated).
202  *   end (start + len) is not a valid address.
203  *   a gap (unallocated memory) between start and end.
204  *  -EPERM:
205  *  - In 32 bit architecture, sealing is not supported.
206  * Note:
207  *  user can call mseal(2) multiple times, adding a seal on an
208  *  already sealed memory is a no-action (no error).
209  *
210  *  unseal() is not supported.
211  */
do_mseal(unsigned long start,size_t len_in,unsigned long flags)212 int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
213 {
214 	size_t len;
215 	int ret = 0;
216 	unsigned long end;
217 	struct mm_struct *mm = current->mm;
218 
219 	ret = can_do_mseal(flags);
220 	if (ret)
221 		return ret;
222 
223 	start = untagged_addr(start);
224 	if (!PAGE_ALIGNED(start))
225 		return -EINVAL;
226 
227 	len = PAGE_ALIGN(len_in);
228 	/* Check to see whether len was rounded up from small -ve to zero. */
229 	if (len_in && !len)
230 		return -EINVAL;
231 
232 	end = start + len;
233 	if (end < start)
234 		return -EINVAL;
235 
236 	if (end == start)
237 		return 0;
238 
239 	if (mmap_write_lock_killable(mm))
240 		return -EINTR;
241 
242 	/*
243 	 * First pass, this helps to avoid
244 	 * partial sealing in case of error in input address range,
245 	 * e.g. ENOMEM error.
246 	 */
247 	ret = check_mm_seal(start, end);
248 	if (ret)
249 		goto out;
250 
251 	/*
252 	 * Second pass, this should success, unless there are errors
253 	 * from vma_modify_flags, e.g. merge/split error, or process
254 	 * reaching the max supported VMAs, however, those cases shall
255 	 * be rare.
256 	 */
257 	ret = apply_mm_seal(start, end);
258 
259 out:
260 	mmap_write_unlock(current->mm);
261 	return ret;
262 }
263 
/* mseal(2) syscall entry point: forwards its arguments unchanged to do_mseal(). */
SYSCALL_DEFINE3(mseal,
		unsigned long, start,
		size_t, len,
		unsigned long, flags)
{
	return do_mseal(start, len, flags);
}
269