// SPDX-License-Identifier: GPL-2.0
/*
 *  Implement mseal() syscall.
 *
 *  Copyright (c) 2023,2024 Google, Inc.
 *
 *  Author: Jeff Xu <jeffxu@chromium.org>
 */

#include <linux/mempolicy.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_context.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include "internal.h"

static inline void set_vma_sealed(struct vm_area_struct *vma)
{
	vm_flags_set(vma, VM_SEALED);
}

/* Return true for madvise behaviors that can discard page contents. */
static bool is_madv_discard(int behavior)
{
	switch (behavior) {
	case MADV_FREE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_REMOVE:
	case MADV_DONTFORK:
	case MADV_WIPEONFORK:
		return true;
	}

	return false;
}

static bool is_ro_anon(struct vm_area_struct *vma)
{
	/* check for a private anonymous mapping. */
	if (vma->vm_file || vma->vm_flags & VM_SHARED)
		return false;

	/*
	 * check for non-writable:
	 * PROT=RO, or PKRU does not permit write access.
	 */
	if (!(vma->vm_flags & VM_WRITE) ||
	    !arch_vma_access_permitted(vma, true, false, false))
		return true;

	return false;
}

/*
 * Check if a vma is allowed to be modified by madvise.
 */
bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
{
	if (!is_madv_discard(behavior))
		return true;

	if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
		return false;

	/* Allow by default. */
	return true;
}

static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, vm_flags_t newflags)
{
	int ret = 0;
	vm_flags_t oldflags = vma->vm_flags;

	/* Nothing to do if the VMA is already sealed. */
	if (newflags == oldflags)
		goto out;

	/* Split or merge as needed so [start, end) is covered exactly. */
	vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	set_vma_sealed(vma);
out:
	*prev = vma;
	return ret;
}

/*
 * Check for do_mseal:
 * 1> start is part of a valid vma.
 * 2> end is part of a valid vma.
 * 3> No gap (unallocated address) between start and end.
 * 4> map is sealable.
 */
static int check_mm_seal(unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma;
	unsigned long nstart = start;

	VMA_ITERATOR(vmi, current->mm, start);

	/* Walk through each VMA in the range. */
	for_each_vma_range(vmi, vma, end) {
		if (vma->vm_start > nstart)
			/* unallocated memory found. */
			return -ENOMEM;

		if (vma->vm_end >= end)
			return 0;

		nstart = vma->vm_end;
	}

	return -ENOMEM;
}
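/*
 * Worked example for check_mm_seal() (illustrative only; the addresses
 * are hypothetical): given VMAs [0x1000, 0x3000) and [0x3000, 0x5000)
 * followed by a gap, check_mm_seal(0x1000, 0x5000) returns 0 because the
 * two VMAs cover the range back to back, while check_mm_seal(0x4000,
 * 0x6000) returns -ENOMEM because the iterator runs out of VMAs at
 * 0x5000 before reaching end.
 */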
/*
 * Apply sealing to [start, end).
 */
static int apply_mm_seal(unsigned long start, unsigned long end)
{
	unsigned long nstart;
	struct vm_area_struct *vma, *prev;

	VMA_ITERATOR(vmi, current->mm, start);

	vma = vma_iter_load(&vmi);
	/*
	 * Note: check_mm_seal() has already checked the ENOMEM cases,
	 * so vma cannot be NULL here; the same holds for the other
	 * ENOMEM cases below.
	 */
	prev = vma_prev(&vmi);
	if (start > vma->vm_start)
		prev = vma;

	nstart = start;
	for_each_vma_range(vmi, vma, end) {
		int error;
		unsigned long tmp;
		vm_flags_t newflags;

		newflags = vma->vm_flags | VM_SEALED;
		/* Clamp to end; mseal_fixup() splits the VMA if needed. */
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
		if (error)
			return error;
		nstart = vma_iter_end(&vmi);
	}

	return 0;
}
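/*
 * Illustrative example (hypothetical addresses): sealing [0x2000, 0x3000)
 * inside a single VMA spanning [0x1000, 0x4000) makes vma_modify_flags()
 * (called from mseal_fixup()) split it into [0x1000, 0x2000),
 * [0x2000, 0x3000) and [0x3000, 0x4000), and only the middle VMA gets
 * VM_SEALED.
 */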
/*
 * mseal(2) seals the VMA's metadata from
 * selected syscalls.
 *
 * addr/len: VM address range.
 *
 * The address range given by addr/len must meet:
 *	start (addr) must be in a valid VMA.
 *	end (addr + len) must be in a valid VMA.
 *	no gap (unallocated memory) between start and end.
 *	start (addr) must be page aligned.
 *
 * len: len will be page aligned implicitly.
 *
 * Below VMA operations are blocked after sealing.
 * 1> Unmapping, moving to another location, and shrinking the size,
 *    via munmap() and mremap(). These can leave an empty space in the
 *    address range, which can then be filled by a VMA with a new set
 *    of attributes.
 * 2> Moving or expanding a different VMA into the current location,
 *    via mremap().
 * 3> Modifying a VMA via mmap(MAP_FIXED).
 * 4> Size expansion via mremap(). This does not appear to pose any
 *    specific risk to sealed VMAs, but is blocked anyway because the
 *    use case is unclear. In any case, users can rely on merging to
 *    expand a sealed VMA.
 * 5> mprotect() and pkey_mprotect().
 * 6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED) for
 *    anonymous memory, when users don't have write permission to the
 *    memory. Those behaviors can alter the region's contents by
 *    discarding pages, effectively a memset(0) for anonymous memory.
 *
 * flags: reserved; must be 0.
 *
 * return values:
 *  zero: success.
 *  -EINVAL:
 *	invalid input flags.
 *	start address is not page aligned.
 *	the address range (start + len) overflows.
 *  -ENOMEM:
 *	addr is not a valid address (not allocated).
 *	end (start + len) is not a valid address.
 *	a gap (unallocated memory) exists between start and end.
 *  -EPERM:
 *	sealing is not supported on 32-bit architectures.
 *
 * Note:
 *  Users can call mseal(2) multiple times; sealing already-sealed
 *  memory is a no-op (no error is returned).
 *
 *  unseal() is not supported.
 */
int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
{
	size_t len;
	int ret = 0;
	unsigned long end;
	struct mm_struct *mm = current->mm;

	ret = can_do_mseal(flags);
	if (ret)
		return ret;

	start = untagged_addr(start);
	if (!PAGE_ALIGNED(start))
		return -EINVAL;

	len = PAGE_ALIGN(len_in);
	/* Check whether len was rounded up from a small negative value to zero. */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	/*
	 * First pass: check the entire address range up front. This
	 * avoids partially sealing the range when the input is
	 * invalid, e.g. on an ENOMEM error.
	 */
	ret = check_mm_seal(start, end);
	if (ret)
		goto out;

	/*
	 * Second pass: this should succeed unless vma_modify_flags()
	 * fails, e.g. on a merge/split error, or because the process
	 * has reached the maximum number of supported VMAs. Those
	 * cases should be rare.
	 */
	ret = apply_mm_seal(start, end);

out:
	mmap_write_unlock(mm);
	return ret;
}

SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
		flags)
{
	return do_mseal(start, len, flags);
}
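/*
 * Userspace usage (a minimal sketch, not part of this file): libc may not
 * provide a wrapper, so this example invokes the raw syscall through
 * syscall(2), assuming the uapi headers define __NR_mseal.
 *
 *	#define _GNU_SOURCE
 *	#include <sys/mman.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		size_t len = 4096;
 *		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		if (p == MAP_FAILED)
 *			return 1;
 *
 *		// Seal the mapping; flags must be 0 (reserved).
 *		if (syscall(__NR_mseal, p, len, 0))
 *			return 1;
 *
 *		// mprotect() on the sealed range now fails with EPERM.
 *		if (mprotect(p, len, PROT_READ) == 0)
 *			return 1; // unexpected: seal not enforced
 *
 *		return 0;
 *	}
 */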