// SPDX-License-Identifier: GPL-2.0
/*
 * Helper functions for KVM guest address space mapping code
 *
 * Copyright IBM Corp. 2007, 2025
 */
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagewalk.h>
#include <linux/ksm.h>
#include <asm/gmap_helpers.h>

/**
 * ptep_zap_swap_entry() - discard a swap entry.
 * @mm: the mm
 * @entry: the swap entry that needs to be zapped
 *
 * Discards the given swap entry. If the swap entry was an actual swap
 * entry (and not a migration entry, for example), the actual swapped
 * page is also discarded from swap.
 */
static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
{
	if (!non_swap_entry(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (is_migration_entry(entry))
		dec_mm_counter(mm, mm_counter(pfn_swap_entry_folio(entry)));
	free_swap_and_cache(entry);
}

/**
 * gmap_helper_zap_one_page() - discard a page if it was swapped.
 * @mm: the mm
 * @vmaddr: the userspace virtual address that needs to be discarded
 *
 * If the given address maps to a swap entry, discard it.
 *
 * Context: needs to be called while holding the mmap lock.
 */
void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr)
{
	struct vm_area_struct *vma;
	spinlock_t *ptl;
	pte_t *ptep;

	mmap_assert_locked(mm);

	/* Find the vm address for the guest address */
	vma = vma_lookup(mm, vmaddr);
	/* Huge pages are handled elsewhere; nothing to do here for them. */
	if (!vma || is_vm_hugetlb_page(vma))
		return;

	/* Get pointer to the page table entry */
	ptep = get_locked_pte(mm, vmaddr, &ptl);
	if (unlikely(!ptep))
		return;
	if (pte_swap(*ptep))
		ptep_zap_swap_entry(mm, pte_to_swp_entry(*ptep));
	pte_unmap_unlock(ptep, ptl);
}
EXPORT_SYMBOL_GPL(gmap_helper_zap_one_page);

/**
 * gmap_helper_discard() - discard user pages in the given range
 * @mm: the mm
 * @vmaddr: starting userspace address
 * @end: end address (first address outside the range)
 *
 * All userspace pages in the range [@vmaddr, @end) are discarded and unmapped.
 *
 * Context: needs to be called while holding the mmap lock.
 */
void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end)
{
	struct vm_area_struct *vma;

	mmap_assert_locked(mm);

	while (vmaddr < end) {
		vma = find_vma_intersection(mm, vmaddr, end);
		if (!vma)
			return;
		/* Hugetlb mappings are skipped; only normal pages are zapped. */
		if (!is_vm_hugetlb_page(vma))
			zap_page_range_single(vma, vmaddr, min(end, vma->vm_end) - vmaddr, NULL);
		/* Continue past this VMA (also skips any gap to the next one). */
		vmaddr = vma->vm_end;
	}
}
EXPORT_SYMBOL_GPL(gmap_helper_discard);

/*
 * Page-walk callback used by __gmap_helper_unshare_zeropages(): stop the
 * walk when a shared zeropage is found, recording its address in
 * *walk->private.  Returns 1 on a zeropage hit, 0 to continue the walk,
 * or -EFAULT for an unexpected zeropage in a non-COW mapping.
 */
static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
				   unsigned long end, struct mm_walk *walk)
{
	unsigned long *found_addr = walk->private;

	/* Return 1 if the page is a zeropage. */
	if (is_zero_pfn(pte_pfn(*pte))) {
		/*
		 * Shared zeropage in e.g., a FS DAX mapping? We cannot do the
		 * right thing and likely don't care: FAULT_FLAG_UNSHARE
		 * currently only works in COW mappings, which is also where
		 * mm_forbids_zeropage() is checked.
		 */
		if (!is_cow_mapping(walk->vma->vm_flags))
			return -EFAULT;

		*found_addr = addr;
		return 1;
	}
	return 0;
}

static const struct mm_walk_ops find_zeropage_ops = {
	.pte_entry = find_zeropage_pte_entry,
	.walk_lock = PGWALK_WRLOCK,
};

/**
 * __gmap_helper_unshare_zeropages() - unshare all shared zeropages
 * @mm: the mm whose zero pages are to be unshared
 *
 * Unshare all shared zeropages, replacing them by anonymous pages. Note that
 * we cannot simply zap all shared zeropages, because this could later
 * trigger unexpected userfaultfd missing events.
 *
 * This must be called after mm->context.allow_cow_sharing was
 * set to 0, to avoid future mappings of shared zeropages.
 *
 * mm contracts with s390, that even if mm were to remove a page table,
 * and racing with walk_page_range_vma() calling pte_offset_map_lock()
 * would fail, it will never insert a page table containing empty zero
 * pages once mm_forbids_zeropage(mm) i.e.
 * mm->context.allow_cow_sharing is set to 0.
 *
 * Return: 0 on success, a negative error code on failure.
 */
static int __gmap_helper_unshare_zeropages(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);
	unsigned long addr;
	vm_fault_t fault;
	int rc;

	for_each_vma(vmi, vma) {
		/*
		 * We could only look at COW mappings, but it's more future
		 * proof to catch unexpected zeropages in other mappings and
		 * fail.
		 */
		if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma))
			continue;
		addr = vma->vm_start;

retry:
		rc = walk_page_range_vma(vma, addr, vma->vm_end,
					 &find_zeropage_ops, &addr);
		if (rc < 0)
			return rc;
		else if (!rc)
			continue;

		/* addr was updated by find_zeropage_pte_entry() */
		fault = handle_mm_fault(vma, addr,
					FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
					NULL);
		if (fault & VM_FAULT_OOM)
			return -ENOMEM;
		/*
		 * See break_ksm(): even after handle_mm_fault() returned 0, we
		 * must start the lookup from the current address, because
		 * handle_mm_fault() may back out if there's any difficulty.
		 *
		 * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but
		 * maybe they could trigger in the future on concurrent
		 * truncation. In that case, the shared zeropage would be gone
		 * and we can simply retry and make progress.
		 */
		cond_resched();
		goto retry;
	}

	return 0;
}

/**
 * gmap_helper_disable_cow_sharing() - disable all COW sharing
 *
 * Disable most COW-sharing of memory pages for the whole process:
 * (1) Disable KSM and unmerge/unshare any KSM pages.
 * (2) Disallow shared zeropages and unshare any zeropages that are mapped.
 *
 * Note that we currently don't bother with COW-shared pages that are shared
 * with parent/child processes due to fork().
 *
 * Context: needs to be called while holding the mmap lock in write mode.
 *
 * Return: 0 on success, a negative error code on failure.
 */
int gmap_helper_disable_cow_sharing(void)
{
	struct mm_struct *mm = current->mm;
	int rc;

	mmap_assert_write_locked(mm);

	/* Already disabled; nothing to do. */
	if (!mm->context.allow_cow_sharing)
		return 0;

	mm->context.allow_cow_sharing = 0;

	/* Replace all shared zeropages by anonymous pages. */
	rc = __gmap_helper_unshare_zeropages(mm);
	/*
	 * Make sure to disable KSM (if enabled for the whole process or
	 * individual VMAs). Note that nothing currently hinders user space
	 * from re-enabling it.
	 */
	if (!rc)
		rc = ksm_disable(mm);
	/* On failure, roll back so a later call can retry. */
	if (rc)
		mm->context.allow_cow_sharing = 1;
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_helper_disable_cow_sharing);