1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Implement mseal() syscall. 4 * 5 * Copyright (c) 2023,2024 Google, Inc. 6 * 7 * Author: Jeff Xu <jeffxu@chromium.org> 8 */ 9 10 #include <linux/mempolicy.h> 11 #include <linux/minmax.h> 12 #include <linux/mman.h> 13 #include <linux/mm.h> 14 #include <linux/mm_inline.h> 15 #include <linux/syscalls.h> 16 #include <linux/sched.h> 17 #include "internal.h" 18 19 /* 20 * mseal() disallows an input range which contain unmapped ranges (VMA holes). 21 * 22 * It disallows unmapped regions from start to end whether they exist at the 23 * start, in the middle, or at the end of the range, or any combination thereof. 24 * 25 * This is because after sealing a range, there's nothing to stop memory mapping 26 * of ranges in the remaining gaps later, meaning that the user might then 27 * wrongly consider the entirety of the mseal()'d range to be sealed when it 28 * in fact isn't. 29 */ 30 31 /* 32 * Does the [start, end) range contain any unmapped memory? 33 * 34 * We ensure that: 35 * - start is part of a valid VMA. 36 * - end is part of a valid VMA. 37 * - no gap (unallocated memory) exists between start and end. 38 */ 39 static bool range_contains_unmapped(struct mm_struct *mm, 40 unsigned long start, unsigned long end) 41 { 42 struct vm_area_struct *vma; 43 unsigned long prev_end = start; 44 VMA_ITERATOR(vmi, current->mm, start); 45 46 for_each_vma_range(vmi, vma, end) { 47 if (vma->vm_start > prev_end) 48 return true; 49 50 prev_end = vma->vm_end; 51 } 52 53 return prev_end < end; 54 } 55 56 static int mseal_apply(struct mm_struct *mm, 57 unsigned long start, unsigned long end) 58 { 59 struct vm_area_struct *vma, *prev; 60 VMA_ITERATOR(vmi, mm, start); 61 62 /* We know there are no gaps so this will be non-NULL. */ 63 vma = vma_iter_load(&vmi); 64 prev = vma_prev(&vmi); 65 if (start > vma->vm_start) 66 prev = vma; 67 68 for_each_vma_range(vmi, vma, end) { 69 const unsigned long curr_start = max(vma->vm_start, start); 70 const unsigned long curr_end = min(vma->vm_end, end); 71 72 if (!vma_test(vma, VMA_SEALED_BIT)) { 73 vma_flags_t vma_flags = vma->flags; 74 75 vma_flags_set(&vma_flags, VMA_SEALED_BIT); 76 77 vma = vma_modify_flags(&vmi, prev, vma, curr_start, 78 curr_end, &vma_flags); 79 if (IS_ERR(vma)) 80 return PTR_ERR(vma); 81 vma_start_write(vma); 82 vma_set_flags(vma, VMA_SEALED_BIT); 83 } 84 85 prev = vma; 86 } 87 88 return 0; 89 } 90 91 /* 92 * mseal(2) seals the VM's meta data from 93 * selected syscalls. 94 * 95 * addr/len: VM address range. 96 * 97 * The address range by addr/len must meet: 98 * start (addr) must be in a valid VMA. 99 * end (addr + len) must be in a valid VMA. 100 * no gap (unallocated memory) between start and end. 101 * start (addr) must be page aligned. 102 * 103 * len: len will be page aligned implicitly. 104 * 105 * Below VMA operations are blocked after sealing. 106 * 1> Unmapping, moving to another location, and shrinking 107 * the size, via munmap() and mremap(), can leave an empty 108 * space, therefore can be replaced with a VMA with a new 109 * set of attributes. 110 * 2> Moving or expanding a different vma into the current location, 111 * via mremap(). 112 * 3> Modifying a VMA via mmap(MAP_FIXED). 113 * 4> Size expansion, via mremap(), does not appear to pose any 114 * specific risks to sealed VMAs. It is included anyway because 115 * the use case is unclear. In any case, users can rely on 116 * merging to expand a sealed VMA. 117 * 5> mprotect and pkey_mprotect. 118 * 6> Some destructive madvice() behavior (e.g. MADV_DONTNEED) 119 * for anonymous memory, when users don't have write permission to the 120 * memory. Those behaviors can alter region contents by discarding pages, 121 * effectively a memset(0) for anonymous memory. 122 * 123 * flags: reserved. 124 * 125 * return values: 126 * zero: success. 127 * -EINVAL: 128 * invalid input flags. 129 * start address is not page aligned. 130 * Address range (start + len) overflow. 131 * -ENOMEM: 132 * addr is not a valid address (not allocated). 133 * end (start + len) is not a valid address. 134 * a gap (unallocated memory) between start and end. 135 * -EPERM: 136 * - In 32 bit architecture, sealing is not supported. 137 * Note: 138 * user can call mseal(2) multiple times, adding a seal on an 139 * already sealed memory is a no-action (no error). 140 * 141 * unseal() is not supported. 142 */ 143 int do_mseal(unsigned long start, size_t len_in, unsigned long flags) 144 { 145 size_t len; 146 int ret = 0; 147 unsigned long end; 148 struct mm_struct *mm = current->mm; 149 150 /* Verify flags not set. */ 151 if (flags) 152 return -EINVAL; 153 154 start = untagged_addr(start); 155 if (!PAGE_ALIGNED(start)) 156 return -EINVAL; 157 158 len = PAGE_ALIGN(len_in); 159 /* Check to see whether len was rounded up from small -ve to zero. */ 160 if (len_in && !len) 161 return -EINVAL; 162 163 end = start + len; 164 if (end < start) 165 return -EINVAL; 166 167 if (end == start) 168 return 0; 169 170 if (mmap_write_lock_killable(mm)) 171 return -EINTR; 172 173 if (range_contains_unmapped(mm, start, end)) { 174 ret = -ENOMEM; 175 goto out; 176 } 177 178 /* 179 * Second pass, this should success, unless there are errors 180 * from vma_modify_flags, e.g. merge/split error, or process 181 * reaching the max supported VMAs, however, those cases shall 182 * be rare. 183 */ 184 ret = mseal_apply(mm, start, end); 185 186 out: 187 mmap_write_unlock(mm); 188 return ret; 189 } 190 191 SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long, 192 flags) 193 { 194 return do_mseal(start, len, flags); 195 } 196