1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Implement mseal() syscall. 4 * 5 * Copyright (c) 2023,2024 Google, Inc. 6 * 7 * Author: Jeff Xu <jeffxu@chromium.org> 8 */ 9 10 #include <linux/mempolicy.h> 11 #include <linux/mman.h> 12 #include <linux/mm.h> 13 #include <linux/mm_inline.h> 14 #include <linux/mmu_context.h> 15 #include <linux/syscalls.h> 16 #include <linux/sched.h> 17 #include "internal.h" 18 19 static inline bool vma_is_sealed(struct vm_area_struct *vma) 20 { 21 return (vma->vm_flags & VM_SEALED); 22 } 23 24 static inline void set_vma_sealed(struct vm_area_struct *vma) 25 { 26 vm_flags_set(vma, VM_SEALED); 27 } 28 29 /* 30 * check if a vma is sealed for modification. 31 * return true, if modification is allowed. 32 */ 33 static bool can_modify_vma(struct vm_area_struct *vma) 34 { 35 if (unlikely(vma_is_sealed(vma))) 36 return false; 37 38 return true; 39 } 40 41 static bool is_madv_discard(int behavior) 42 { 43 switch (behavior) { 44 case MADV_FREE: 45 case MADV_DONTNEED: 46 case MADV_DONTNEED_LOCKED: 47 case MADV_REMOVE: 48 case MADV_DONTFORK: 49 case MADV_WIPEONFORK: 50 return true; 51 } 52 53 return false; 54 } 55 56 static bool is_ro_anon(struct vm_area_struct *vma) 57 { 58 /* check anonymous mapping. */ 59 if (vma->vm_file || vma->vm_flags & VM_SHARED) 60 return false; 61 62 /* 63 * check for non-writable: 64 * PROT=RO or PKRU is not writeable. 65 */ 66 if (!(vma->vm_flags & VM_WRITE) || 67 !arch_vma_access_permitted(vma, true, false, false)) 68 return true; 69 70 return false; 71 } 72 73 /* 74 * Check if the vmas of a memory range are allowed to be modified. 75 * the memory ranger can have a gap (unallocated memory). 76 * return true, if it is allowed. 77 */ 78 bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end) 79 { 80 struct vm_area_struct *vma; 81 82 VMA_ITERATOR(vmi, mm, start); 83 84 /* going through each vma to check. */ 85 for_each_vma_range(vmi, vma, end) { 86 if (unlikely(!can_modify_vma(vma))) 87 return false; 88 } 89 90 /* Allow by default. */ 91 return true; 92 } 93 94 /* 95 * Check if the vmas of a memory range are allowed to be modified by madvise. 96 * the memory ranger can have a gap (unallocated memory). 97 * return true, if it is allowed. 98 */ 99 bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end, 100 int behavior) 101 { 102 struct vm_area_struct *vma; 103 104 VMA_ITERATOR(vmi, mm, start); 105 106 if (!is_madv_discard(behavior)) 107 return true; 108 109 /* going through each vma to check. */ 110 for_each_vma_range(vmi, vma, end) 111 if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma))) 112 return false; 113 114 /* Allow by default. */ 115 return true; 116 } 117 118 static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, 119 struct vm_area_struct **prev, unsigned long start, 120 unsigned long end, vm_flags_t newflags) 121 { 122 int ret = 0; 123 vm_flags_t oldflags = vma->vm_flags; 124 125 if (newflags == oldflags) 126 goto out; 127 128 vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags); 129 if (IS_ERR(vma)) { 130 ret = PTR_ERR(vma); 131 goto out; 132 } 133 134 set_vma_sealed(vma); 135 out: 136 *prev = vma; 137 return ret; 138 } 139 140 /* 141 * Check for do_mseal: 142 * 1> start is part of a valid vma. 143 * 2> end is part of a valid vma. 144 * 3> No gap (unallocated address) between start and end. 145 * 4> map is sealable. 146 */ 147 static int check_mm_seal(unsigned long start, unsigned long end) 148 { 149 struct vm_area_struct *vma; 150 unsigned long nstart = start; 151 152 VMA_ITERATOR(vmi, current->mm, start); 153 154 /* going through each vma to check. */ 155 for_each_vma_range(vmi, vma, end) { 156 if (vma->vm_start > nstart) 157 /* unallocated memory found. */ 158 return -ENOMEM; 159 160 if (vma->vm_end >= end) 161 return 0; 162 163 nstart = vma->vm_end; 164 } 165 166 return -ENOMEM; 167 } 168 169 /* 170 * Apply sealing. 171 */ 172 static int apply_mm_seal(unsigned long start, unsigned long end) 173 { 174 unsigned long nstart; 175 struct vm_area_struct *vma, *prev; 176 177 VMA_ITERATOR(vmi, current->mm, start); 178 179 vma = vma_iter_load(&vmi); 180 /* 181 * Note: check_mm_seal should already checked ENOMEM case. 182 * so vma should not be null, same for the other ENOMEM cases. 183 */ 184 prev = vma_prev(&vmi); 185 if (start > vma->vm_start) 186 prev = vma; 187 188 nstart = start; 189 for_each_vma_range(vmi, vma, end) { 190 int error; 191 unsigned long tmp; 192 vm_flags_t newflags; 193 194 newflags = vma->vm_flags | VM_SEALED; 195 tmp = vma->vm_end; 196 if (tmp > end) 197 tmp = end; 198 error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags); 199 if (error) 200 return error; 201 nstart = vma_iter_end(&vmi); 202 } 203 204 return 0; 205 } 206 207 /* 208 * mseal(2) seals the VM's meta data from 209 * selected syscalls. 210 * 211 * addr/len: VM address range. 212 * 213 * The address range by addr/len must meet: 214 * start (addr) must be in a valid VMA. 215 * end (addr + len) must be in a valid VMA. 216 * no gap (unallocated memory) between start and end. 217 * start (addr) must be page aligned. 218 * 219 * len: len will be page aligned implicitly. 220 * 221 * Below VMA operations are blocked after sealing. 222 * 1> Unmapping, moving to another location, and shrinking 223 * the size, via munmap() and mremap(), can leave an empty 224 * space, therefore can be replaced with a VMA with a new 225 * set of attributes. 226 * 2> Moving or expanding a different vma into the current location, 227 * via mremap(). 228 * 3> Modifying a VMA via mmap(MAP_FIXED). 229 * 4> Size expansion, via mremap(), does not appear to pose any 230 * specific risks to sealed VMAs. It is included anyway because 231 * the use case is unclear. In any case, users can rely on 232 * merging to expand a sealed VMA. 233 * 5> mprotect and pkey_mprotect. 234 * 6> Some destructive madvice() behavior (e.g. MADV_DONTNEED) 235 * for anonymous memory, when users don't have write permission to the 236 * memory. Those behaviors can alter region contents by discarding pages, 237 * effectively a memset(0) for anonymous memory. 238 * 239 * flags: reserved. 240 * 241 * return values: 242 * zero: success. 243 * -EINVAL: 244 * invalid input flags. 245 * start address is not page aligned. 246 * Address arange (start + len) overflow. 247 * -ENOMEM: 248 * addr is not a valid address (not allocated). 249 * end (start + len) is not a valid address. 250 * a gap (unallocated memory) between start and end. 251 * -EPERM: 252 * - In 32 bit architecture, sealing is not supported. 253 * Note: 254 * user can call mseal(2) multiple times, adding a seal on an 255 * already sealed memory is a no-action (no error). 256 * 257 * unseal() is not supported. 258 */ 259 int do_mseal(unsigned long start, size_t len_in, unsigned long flags) 260 { 261 size_t len; 262 int ret = 0; 263 unsigned long end; 264 struct mm_struct *mm = current->mm; 265 266 ret = can_do_mseal(flags); 267 if (ret) 268 return ret; 269 270 start = untagged_addr(start); 271 if (!PAGE_ALIGNED(start)) 272 return -EINVAL; 273 274 len = PAGE_ALIGN(len_in); 275 /* Check to see whether len was rounded up from small -ve to zero. */ 276 if (len_in && !len) 277 return -EINVAL; 278 279 end = start + len; 280 if (end < start) 281 return -EINVAL; 282 283 if (end == start) 284 return 0; 285 286 if (mmap_write_lock_killable(mm)) 287 return -EINTR; 288 289 /* 290 * First pass, this helps to avoid 291 * partial sealing in case of error in input address range, 292 * e.g. ENOMEM error. 293 */ 294 ret = check_mm_seal(start, end); 295 if (ret) 296 goto out; 297 298 /* 299 * Second pass, this should success, unless there are errors 300 * from vma_modify_flags, e.g. merge/split error, or process 301 * reaching the max supported VMAs, however, those cases shall 302 * be rare. 303 */ 304 ret = apply_mm_seal(start, end); 305 306 out: 307 mmap_write_unlock(current->mm); 308 return ret; 309 } 310 311 SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long, 312 flags) 313 { 314 return do_mseal(start, len, flags); 315 } 316