// SPDX-License-Identifier: GPL-2.0
/*
 * Helper functions for KVM guest address space mapping code
 *
 * Copyright IBM Corp. 2007, 2025
 */

#include <linux/export.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
#include <linux/leafops.h>
#include <linux/pagewalk.h>
#include <linux/ksm.h>
#include <asm/gmap_helpers.h>

/**
 * ptep_zap_softleaf_entry() - discard a software leaf entry.
 * @mm: the mm
 * @entry: the software leaf entry that needs to be zapped
 *
 * Discards the given software leaf entry. If the leaf entry was an actual
 * swap entry (and not a migration entry, for example), the actual swapped
 * page is also discarded from swap.
 */
static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry)
{
	if (softleaf_is_swap(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (softleaf_is_migration(entry))
		dec_mm_counter(mm, mm_counter(softleaf_to_folio(entry)));
	swap_put_entries_direct(entry, 1);
}

/**
 * gmap_helper_zap_one_page() - discard a page if it was swapped.
 * @mm: the mm
 * @vmaddr: the userspace virtual address that needs to be discarded
 *
 * If the given address maps to a swap entry, discard it.
 *
 * Context: needs to be called while holding the mmap lock.
 */
void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr)
{
	struct vm_area_struct *vma;
	spinlock_t *ptl;
	pte_t *ptep;

	mmap_assert_locked(mm);

	/* Find the vm address for the guest address */
	vma = vma_lookup(mm, vmaddr);
	/* Hugetlb pages are not handled by this helper */
	if (!vma || is_vm_hugetlb_page(vma))
		return;

	/* Get pointer to the page table entry */
	ptep = get_locked_pte(mm, vmaddr, &ptl);
	if (unlikely(!ptep))
		return;
	if (pte_swap(*ptep)) {
		ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep));
		pte_clear(mm, vmaddr, ptep);
	}
	pte_unmap_unlock(ptep, ptl);
}
EXPORT_SYMBOL_GPL(gmap_helper_zap_one_page);

/**
 * gmap_helper_discard() - discard user pages in the given range
 * @mm: the mm
 * @vmaddr: starting userspace address
 * @end: end address (first address outside the range)
 *
 * All userspace pages in the range [@vmaddr, @end) are discarded and unmapped.
 *
 * Context: needs to be called while holding the mmap lock.
 */
void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end)
{
	struct vm_area_struct *vma;

	mmap_assert_locked(mm);

	while (vmaddr < end) {
		vma = find_vma_intersection(mm, vmaddr, end);
		if (!vma)
			return;
		/* Hugetlb VMAs are skipped; only zap up to the VMA end */
		if (!is_vm_hugetlb_page(vma))
			zap_page_range_single(vma, vmaddr, min(end, vma->vm_end) - vmaddr, NULL);
		vmaddr = vma->vm_end;
	}
}
EXPORT_SYMBOL_GPL(gmap_helper_discard);

/**
 * gmap_helper_try_set_pte_unused() - mark a pte entry as unused
 * @mm: the mm
 * @vmaddr: the userspace address whose pte is to be marked
 *
 * Mark the pte corresponding to the given address as unused. This will cause
 * core mm code to just drop this page instead of swapping it.
 *
 * This function needs to be called with interrupts disabled (for example
 * while holding a spinlock), or while holding the mmap lock. Normally this
 * function is called as a result of an unmap operation, and thus KVM common
 * code will already hold kvm->mmu_lock in write mode.
 *
 * Context: Needs to be called while holding the mmap lock or with interrupts
 * disabled.
 */
void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr)
{
	pmd_t *pmdp, pmd, pmdval;
	pud_t *pudp, pud;
	p4d_t *p4dp, p4d;
	pgd_t *pgdp, pgd;
	spinlock_t *ptl;	/* Lock for the host (userspace) page table */
	pte_t *ptep;

	/*
	 * Walk the page table levels by hand, bailing out on holes and on
	 * large (leaf) mappings at the pud/pmd level, for which there is no
	 * pte to mark.
	 */
	pgdp = pgd_offset(mm, vmaddr);
	pgd = pgdp_get(pgdp);
	if (pgd_none(pgd) || !pgd_present(pgd))
		return;

	p4dp = p4d_offset(pgdp, vmaddr);
	p4d = p4dp_get(p4dp);
	if (p4d_none(p4d) || !p4d_present(p4d))
		return;

	pudp = pud_offset(p4dp, vmaddr);
	pud = pudp_get(pudp);
	if (pud_none(pud) || pud_leaf(pud) || !pud_present(pud))
		return;

	pmdp = pmd_offset(pudp, vmaddr);
	pmd = pmdp_get_lockless(pmdp);
	if (pmd_none(pmd) || pmd_leaf(pmd) || !pmd_present(pmd))
		return;

	ptep = pte_offset_map_rw_nolock(mm, pmdp, vmaddr, &pmdval, &ptl);
	if (!ptep)
		return;

	/*
	 * Several paths exist that take the ptl lock and then call the
	 * mmu_notifier, which takes the mmu_lock. The unmap path, instead,
	 * takes the mmu_lock in write mode first, and then potentially
	 * calls this function, which takes the ptl lock. This can lead to a
	 * deadlock.
	 * The unused page mechanism is only an optimization; if the
	 * _PAGE_UNUSED bit is not set, the unused page is swapped as normal
	 * instead of being discarded.
	 * If the lock is contended the bit is not set and the deadlock is
	 * avoided.
	 */
	if (spin_trylock(ptl)) {
		/*
		 * Make sure the pte we are touching is still the correct
		 * one. In theory this check should not be needed, but
		 * better safe than sorry.
		 * Disabling interrupts or holding the mmap lock is enough to
		 * guarantee that no concurrent updates to the page tables
		 * are possible.
		 */
		if (likely(pmd_same(pmdval, pmdp_get_lockless(pmdp))))
			__atomic64_or(_PAGE_UNUSED, (long *)ptep);
		spin_unlock(ptl);
	}

	pte_unmap(ptep);
}
EXPORT_SYMBOL_GPL(gmap_helper_try_set_pte_unused);

/*
 * Pagewalk callback used by __gmap_helper_unshare_zeropages() to locate a
 * mapped shared zeropage. Returns 1 when one is found (its address stored
 * in *walk->private), -EFAULT for an unexpected zeropage outside a COW
 * mapping, and 0 otherwise to continue the walk.
 */
static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
				   unsigned long end, struct mm_walk *walk)
{
	unsigned long *found_addr = walk->private;

	/* Return 1 if the page is a zeropage. */
	if (is_zero_pfn(pte_pfn(*pte))) {
		/*
		 * Shared zeropage in e.g., a FS DAX mapping? We cannot do the
		 * right thing and likely don't care: FAULT_FLAG_UNSHARE
		 * currently only works in COW mappings, which is also where
		 * mm_forbids_zeropage() is checked.
		 */
		if (!is_cow_mapping(walk->vma->vm_flags))
			return -EFAULT;

		*found_addr = addr;
		return 1;
	}
	return 0;
}

static const struct mm_walk_ops find_zeropage_ops = {
	.pte_entry	= find_zeropage_pte_entry,
	.walk_lock	= PGWALK_WRLOCK,
};

/**
 * __gmap_helper_unshare_zeropages() - unshare all shared zeropages
 * @mm: the mm whose zero pages are to be unshared
 *
 * Unshare all shared zeropages, replacing them by anonymous pages. Note that
 * we cannot simply zap all shared zeropages, because this could later
 * trigger unexpected userfaultfd missing events.
 *
 * This must be called after mm->context.allow_cow_sharing was
 * set to 0, to avoid future mappings of shared zeropages.
 *
 * mm contracts with s390, that even if mm were to remove a page table,
 * and racing with walk_page_range_vma() calling pte_offset_map_lock()
 * would fail, it will never insert a page table containing empty zero
 * pages once mm_forbids_zeropage(mm) i.e.
 * mm->context.allow_cow_sharing is set to 0.
 *
 * Return: 0 on success, a negative error code on failure.
 */
static int __gmap_helper_unshare_zeropages(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);
	unsigned long addr;
	vm_fault_t fault;
	int rc;

	for_each_vma(vmi, vma) {
		/*
		 * We could only look at COW mappings, but it's more future
		 * proof to catch unexpected zeropages in other mappings and
		 * fail.
		 */
		if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma))
			continue;
		addr = vma->vm_start;

retry:
		rc = walk_page_range_vma(vma, addr, vma->vm_end,
					 &find_zeropage_ops, &addr);
		if (rc < 0)
			return rc;
		else if (!rc)
			continue;

		/* addr was updated by find_zeropage_pte_entry() */
		fault = handle_mm_fault(vma, addr,
					FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
					NULL);
		if (fault & VM_FAULT_OOM)
			return -ENOMEM;
		/*
		 * See break_ksm(): even after handle_mm_fault() returned 0, we
		 * must start the lookup from the current address, because
		 * handle_mm_fault() may back out if there's any difficulty.
		 *
		 * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but
		 * maybe they could trigger in the future on concurrent
		 * truncation. In that case, the shared zeropage would be gone
		 * and we can simply retry and make progress.
		 */
		cond_resched();
		goto retry;
	}

	return 0;
}

/**
 * gmap_helper_disable_cow_sharing() - disable all COW sharing
 *
 * Disable most COW-sharing of memory pages for the whole process:
 * (1) Disable KSM and unmerge/unshare any KSM pages.
 * (2) Disallow shared zeropages and unshare any zeropages that are mapped.
 *
 * Note that we currently don't bother with COW-shared pages that are shared
 * with parent/child processes due to fork().
 *
 * Context: needs to be called while holding the mmap lock in write mode.
 *
 * Return: 0 on success, a negative error code on failure; on failure,
 * COW sharing stays enabled.
 */
int gmap_helper_disable_cow_sharing(void)
{
	struct mm_struct *mm = current->mm;
	int rc;

	mmap_assert_write_locked(mm);

	if (!mm->context.allow_cow_sharing)
		return 0;

	mm->context.allow_cow_sharing = 0;

	/* Replace all shared zeropages by anonymous pages. */
	rc = __gmap_helper_unshare_zeropages(mm);
	/*
	 * Make sure to disable KSM (if enabled for the whole process or
	 * individual VMAs). Note that nothing currently hinders user space
	 * from re-enabling it.
	 */
	if (!rc)
		rc = ksm_disable(mm);
	/* Roll back the flag so a later call can retry the whole operation */
	if (rc)
		mm->context.allow_cow_sharing = 1;
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_helper_disable_cow_sharing);