// SPDX-License-Identifier: GPL-2.0
/*
 *  Implement mseal() syscall.
 *
 *  Copyright (c) 2023,2024 Google, Inc.
 *
 *  Author: Jeff Xu <jeffxu@chromium.org>
 */

#include <linux/mempolicy.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_context.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include "internal.h"

static inline void set_vma_sealed(struct vm_area_struct *vma)
{
	vm_flags_set(vma, VM_SEALED);
}

/* Check whether an madvise() behavior can discard page contents. */
static bool is_madv_discard(int behavior)
{
	switch (behavior) {
	case MADV_FREE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_REMOVE:
	case MADV_DONTFORK:
	case MADV_WIPEONFORK:
	case MADV_GUARD_INSTALL:
		return true;
	}

	return false;
}

static bool is_ro_anon(struct vm_area_struct *vma)
{
	/* Check for a private anonymous mapping. */
	if (vma->vm_file || vma->vm_flags & VM_SHARED)
		return false;

	/*
	 * Check for a non-writable mapping: either PROT is read-only,
	 * or the protection key (PKRU) does not permit writes.
	 */
	if (!(vma->vm_flags & VM_WRITE) ||
	    !arch_vma_access_permitted(vma, true, false, false))
		return true;

	return false;
}

/*
 * Check if a vma is allowed to be modified by madvise.
 */
bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
{
	if (!is_madv_discard(behavior))
		return true;

	if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
		return false;

	/* Allow by default. */
	return true;
}
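/*
 * Illustrative sketch (userspace view, not kernel code): with the
 * checks above, a discarding madvise() on a sealed, read-only private
 * anonymous mapping is rejected, since discarding would effectively
 * memset(0) memory the caller cannot write:
 *
 *	p = mmap(NULL, len, PROT_READ,
 *		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	mseal(p, len, 0);
 *	madvise(p, len, MADV_DONTNEED);	-> rejected (EPERM)
 *
 * The same madvise() call on an unsealed or writable mapping is still
 * allowed.
 */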
static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, vm_flags_t newflags)
{
	int ret = 0;
	vm_flags_t oldflags = vma->vm_flags;

	if (newflags == oldflags)
		goto out;

	vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	set_vma_sealed(vma);
out:
	*prev = vma;
	return ret;
}

/*
 * Check for do_mseal:
 * 1> start is part of a valid vma.
 * 2> end is part of a valid vma.
 * 3> No gap (unallocated address) between start and end.
 * 4> map is sealable.
 */
static int check_mm_seal(unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma;
	unsigned long nstart = start;

	VMA_ITERATOR(vmi, current->mm, start);

	/* Go through each VMA in the range and check it. */
	for_each_vma_range(vmi, vma, end) {
		if (vma->vm_start > nstart)
			/* Unallocated memory found. */
			return -ENOMEM;

		if (vma->vm_end >= end)
			return 0;

		nstart = vma->vm_end;
	}

	return -ENOMEM;
}

/*
 * Apply sealing.
 */
static int apply_mm_seal(unsigned long start, unsigned long end)
{
	unsigned long nstart;
	struct vm_area_struct *vma, *prev;

	VMA_ITERATOR(vmi, current->mm, start);

	vma = vma_iter_load(&vmi);
	/*
	 * Note: check_mm_seal() has already checked the ENOMEM cases,
	 * so vma cannot be NULL here; the same holds for the other
	 * ENOMEM cases below.
	 */
	prev = vma_prev(&vmi);
	if (start > vma->vm_start)
		prev = vma;

	nstart = start;
	for_each_vma_range(vmi, vma, end) {
		int error;
		unsigned long tmp;
		vm_flags_t newflags;

		newflags = vma->vm_flags | VM_SEALED;
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
		if (error)
			return error;
		nstart = vma_iter_end(&vmi);
	}

	return 0;
}
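/*
 * Illustrative sketch (not executed code): sealing a sub-range of an
 * existing VMA relies on vma_modify_flags() in mseal_fixup() to split
 * the VMA at the requested boundaries, so VM_SEALED is applied only to
 * the requested range:
 *
 *	before:	[ vma: 0x1000 - 0x4000 ]
 *	mseal(0x2000, 0x1000, 0)
 *	after:	[ 0x1000 - 0x2000 ][ 0x2000 - 0x3000, VM_SEALED ][ 0x3000 - 0x4000 ]
 */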
/*
 * mseal(2) seals the VMA's metadata against
 * selected syscalls.
 *
 * addr/len: VM address range.
 *
 * The address range given by addr/len must meet:
 *   start (addr) must be in a valid VMA.
 *   end (addr + len) must be in a valid VMA.
 *   no gap (unallocated memory) between start and end.
 *   start (addr) must be page aligned.
 *
 * len: len will be page aligned implicitly.
 *
 * Below VMA operations are blocked after sealing.
 * 1> Unmapping, moving to another location, and shrinking the size,
 *    via munmap() and mremap(): these can leave an empty space in the
 *    address range, which can then be filled by a VMA with a new set
 *    of attributes.
 * 2> Moving or expanding a different VMA into the current location,
 *    via mremap().
 * 3> Modifying a VMA via mmap(MAP_FIXED).
 * 4> Size expansion, via mremap(). This does not appear to pose any
 *    specific risk to sealed VMAs, but is included anyway because
 *    the use case is unclear. In any case, users can rely on
 *    merging to expand a sealed VMA.
 * 5> mprotect() and pkey_mprotect().
 * 6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED)
 *    for anonymous memory, when users don't have write permission to
 *    the memory. Those behaviors can alter region contents by
 *    discarding pages, effectively a memset(0) for anonymous memory.
 *
 * flags: reserved.
 *
 * return values:
 *  zero: success.
 *  -EINVAL:
 *   invalid input flags.
 *   start address is not page aligned.
 *   the address range (start + len) overflows.
 *  -ENOMEM:
 *   addr is not a valid address (not allocated).
 *   end (start + len) is not a valid address.
 *   a gap (unallocated memory) exists between start and end.
 *  -EPERM:
 *   sealing is not supported on 32-bit architectures.
 *
 * Note:
 *  Users can call mseal(2) multiple times; sealing memory that is
 *  already sealed has no effect and returns no error.
 *
 *  unseal() is not supported.
 */
int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
{
	size_t len;
	int ret = 0;
	unsigned long end;
	struct mm_struct *mm = current->mm;

	ret = can_do_mseal(flags);
	if (ret)
		return ret;

	start = untagged_addr(start);
	if (!PAGE_ALIGNED(start))
		return -EINVAL;

	len = PAGE_ALIGN(len_in);
	/* Check whether len was rounded up from a small negative value to zero. */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	/*
	 * First pass: check the address range. This avoids partially
	 * sealing the range when the input is invalid, e.g. on ENOMEM.
	 */
	ret = check_mm_seal(start, end);
	if (ret)
		goto out;

	/*
	 * Second pass: apply sealing. This should succeed unless
	 * vma_modify_flags() fails, e.g. on a merge/split error, or
	 * because the process has reached the maximum number of
	 * supported VMAs; those cases should be rare.
	 */
	ret = apply_mm_seal(start, end);

out:
	mmap_write_unlock(mm);
	return ret;
}

SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
		flags)
{
	return do_mseal(start, len, flags);
}
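/*
 * Usage sketch (userspace, illustrative only; assumes the libc headers
 * provide __NR_mseal and that the kernel is 64-bit):
 *
 *	#include <sys/mman.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	if (syscall(__NR_mseal, p, 4096, 0) == 0) {
 *		// Sealed: mprotect()/munmap()/mremap()/mmap(MAP_FIXED)
 *		// on this range now fail.
 *		mprotect(p, 4096, PROT_READ);	-> fails, errno == EPERM
 *	}
 */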