// SPDX-License-Identifier: GPL-2.0
/*
 * Implement mseal() syscall.
 *
 * Copyright (c) 2023,2024 Google, Inc.
 *
 * Author: Jeff Xu <jeffxu@chromium.org>
 */

#include <linux/mempolicy.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_context.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include "internal.h"

static inline bool vma_is_sealed(struct vm_area_struct *vma)
{
	return (vma->vm_flags & VM_SEALED);
}

static inline void set_vma_sealed(struct vm_area_struct *vma)
{
	vm_flags_set(vma, VM_SEALED);
}

/*
 * Check if a vma is allowed to be modified.
 * Return true if modification is allowed.
 */
static bool can_modify_vma(struct vm_area_struct *vma)
{
	if (unlikely(vma_is_sealed(vma)))
		return false;

	return true;
}

/*
 * MADV_* behaviors are plain enum values, not bit flags, so they must be
 * compared individually rather than tested with a bitmask.
 */
static bool is_madv_discard(int behavior)
{
	switch (behavior) {
	case MADV_FREE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_REMOVE:
	case MADV_DONTFORK:
	case MADV_WIPEONFORK:
		return true;
	}

	return false;
}

static bool is_ro_anon(struct vm_area_struct *vma)
{
	/* check anonymous mapping. */
	if (vma->vm_file || vma->vm_flags & VM_SHARED)
		return false;

	/*
	 * check for non-writable:
	 * PROT=RO, or the protection key (e.g. PKRU) denies write access.
	 */
	if (!(vma->vm_flags & VM_WRITE) ||
	    !arch_vma_access_permitted(vma, true, false, false))
		return true;

	return false;
}

/*
 * Check if the vmas of a memory range are allowed to be modified.
 * The memory range can have a gap (unallocated memory).
 * Return true if modification is allowed.
 */
bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma;

	VMA_ITERATOR(vmi, mm, start);

	/* going through each vma to check. */
	for_each_vma_range(vmi, vma, end) {
		if (unlikely(!can_modify_vma(vma)))
			return false;
	}

	/* Allow by default. */
	return true;
}

/*
 * Check if the vmas of a memory range are allowed to be modified by madvise.
 * The memory range can have a gap (unallocated memory).
 * Return true if modification is allowed.
 */
bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
			int behavior)
{
	struct vm_area_struct *vma;

	VMA_ITERATOR(vmi, mm, start);

	if (!is_madv_discard(behavior))
		return true;

	/* going through each vma to check. */
	for_each_vma_range(vmi, vma, end)
		if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma)))
			return false;

	/* Allow by default. */
	return true;
}

static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
		       struct vm_area_struct **prev, unsigned long start,
		       unsigned long end, vm_flags_t newflags)
{
	int ret = 0;
	vm_flags_t oldflags = vma->vm_flags;

	if (newflags == oldflags)
		goto out;

	/* Split/merge so that [start, end) is exactly covered, then seal. */
	vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	set_vma_sealed(vma);
out:
	*prev = vma;
	return ret;
}
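/*
 * Sketch of how a caller might consult the checks above (illustrative;
 * the real call sites live outside this file, e.g. in the munmap() and
 * mremap() paths). A destructive operation is gated like:
 *
 *	if (unlikely(!can_modify_mm(mm, start, end)))
 *		return -EPERM;
 *
 * so an attempt to modify a sealed range is what surfaces to userspace
 * as -EPERM.
 */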
/*
 * Check for do_mseal:
 * 1> start is part of a valid vma.
 * 2> end is part of a valid vma.
 * 3> No gap (unallocated address) between start and end.
 */
static int check_mm_seal(unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma;
	unsigned long nstart = start;

	VMA_ITERATOR(vmi, current->mm, start);

	/* going through each vma to check. */
	for_each_vma_range(vmi, vma, end) {
		if (vma->vm_start > nstart)
			/* unallocated memory found. */
			return -ENOMEM;

		if (vma->vm_end >= end)
			return 0;

		nstart = vma->vm_end;
	}

	return -ENOMEM;
}

/*
 * Apply sealing.
 */
static int apply_mm_seal(unsigned long start, unsigned long end)
{
	unsigned long nstart;
	struct vm_area_struct *vma, *prev;

	VMA_ITERATOR(vmi, current->mm, start);

	vma = vma_iter_load(&vmi);
	/*
	 * Note: check_mm_seal() has already rejected the ENOMEM cases,
	 * so vma cannot be NULL here and the range contains no gaps.
	 */
	prev = vma_prev(&vmi);
	if (start > vma->vm_start)
		prev = vma;

	nstart = start;
	for_each_vma_range(vmi, vma, end) {
		int error;
		unsigned long tmp;
		vm_flags_t newflags;

		newflags = vma->vm_flags | VM_SEALED;
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
		if (error)
			return error;
		nstart = vma_iter_end(&vmi);
	}

	return 0;
}
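/*
 * Worked example (illustrative): assume two adjacent vmas
 *
 *	A: [0x1000, 0x3000)	B: [0x3000, 0x5000)
 *
 * and a call sealing [0x2000, 0x5000). The first loop iteration has
 * nstart = 0x2000 and tmp = 0x3000, so mseal_fixup() splits A at 0x2000
 * and seals only its upper half; the second iteration seals B in place
 * (and may merge it with the newly sealed piece of A if the other flags
 * match).
 */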
/*
 * mseal(2) seals the VM's metadata against selected syscalls.
 *
 * addr/len: VM address range.
 *
 * The address range given by addr/len must meet:
 *	start (addr) must be in a valid VMA.
 *	end (addr + len) must be in a valid VMA.
 *	no gap (unallocated memory) between start and end.
 *	start (addr) must be page aligned.
 *
 * len: len will be page aligned implicitly.
 *
 * Below VMA operations are blocked after sealing.
 * 1> Unmapping, moving to another location, and shrinking the size,
 *    via munmap() and mremap(). These can leave an empty space in the
 *    address range, which can then be filled by a VMA with a new set
 *    of attributes.
 * 2> Moving or expanding a different VMA into the current location,
 *    via mremap().
 * 3> Modifying a VMA via mmap(MAP_FIXED).
 * 4> Size expansion via mremap(). This does not appear to pose any
 *    specific risk to sealed VMAs, but is included anyway because the
 *    use case is unclear. In any case, users can rely on merging to
 *    expand a sealed VMA.
 * 5> mprotect() and pkey_mprotect().
 * 6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED) for
 *    anonymous memory, when users don't have write permission to the
 *    memory. These behaviors can alter region contents by discarding
 *    pages, effectively a memset(0) for anonymous memory.
 *
 * flags: reserved.
 *
 * return values:
 *  zero: success.
 *  -EINVAL:
 *	invalid input flags.
 *	start address is not page aligned.
 *	the address range (start + len) overflows.
 *  -ENOMEM:
 *	addr is not a valid address (not allocated).
 *	end (start + len) is not a valid address.
 *	a gap (unallocated memory) exists between start and end.
 *  -EPERM:
 *	sealing is not supported on 32-bit architectures.
 *
 * Note: users can call mseal(2) multiple times; sealing an already
 * sealed range is a no-op (not an error).
 *
 * unseal() is not supported.
 */
static int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
{
	size_t len;
	int ret = 0;
	unsigned long end;
	struct mm_struct *mm = current->mm;

	ret = can_do_mseal(flags);
	if (ret)
		return ret;

	start = untagged_addr(start);
	if (!PAGE_ALIGNED(start))
		return -EINVAL;

	len = PAGE_ALIGN(len_in);
	/* Check to see whether len was rounded up from small -ve to zero. */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	/*
	 * First pass: validate the input address range up front, so that
	 * an invalid range (e.g. one that would return ENOMEM) cannot
	 * leave the region partially sealed.
	 */
	ret = check_mm_seal(start, end);
	if (ret)
		goto out;

	/*
	 * Second pass: this should succeed unless vma_modify_flags()
	 * fails, e.g. on a merge/split error or when the process reaches
	 * the maximum supported VMA count; such cases should be rare.
	 */
	ret = apply_mm_seal(start, end);

out:
	mmap_write_unlock(mm);
	return ret;
}

SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
		flags)
{
	return do_mseal(start, len, flags);
}
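/*
 * Userspace usage sketch (illustrative). This assumes headers that
 * define __NR_mseal; older libcs ship no mseal() wrapper, so the raw
 * syscall is used:
 *
 *	#include <sys/mman.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	syscall(__NR_mseal, p, 4096, 0);
 *
 *	munmap(p, 4096);			// now fails with EPERM
 *	mprotect(p, 4096, PROT_READ);		// now fails with EPERM
 *
 * The seal lasts for the lifetime of the mapping; there is no way to
 * unseal.
 */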