// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/hugetlb.h>
#include <linux/bitops.h>
#include <linux/mmu_notifier.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

/**
 * struct wp_walk - Private struct for pagetable walk callbacks
 * @range: Range for mmu notifiers
 * @tlbflush_start: Address of first modified pte
 * @tlbflush_end: Address of last modified pte + 1
 * @total: Total number of modified ptes
 */
struct wp_walk {
        struct mmu_notifier_range range;
        unsigned long tlbflush_start;
        unsigned long tlbflush_end;
        unsigned long total;
};

/**
 * wp_pte - Write-protect a pte
 * @pte: Pointer to the pte
 * @addr: The virtual page address
 * @end: End of the walked virtual address range
 * @walk: pagetable walk callback argument
 *
 * The function write-protects a pte and records the range in
 * virtual address space of touched ptes for efficient range TLB flushes.
 */
static int wp_pte(pte_t *pte, unsigned long addr, unsigned long end,
                  struct mm_walk *walk)
{
        struct wp_walk *wpwalk = walk->private;
        pte_t ptent = *pte;

        if (pte_write(ptent)) {
                pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte);

                ptent = pte_wrprotect(old_pte);
                ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent);
                wpwalk->total++;
                wpwalk->tlbflush_start = min(wpwalk->tlbflush_start, addr);
                wpwalk->tlbflush_end = max(wpwalk->tlbflush_end,
                                           addr + PAGE_SIZE);
        }

        return 0;
}
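
/*
 * The ptep_modify_prot_start()/ptep_modify_prot_commit() pair used
 * above forms a pte update transaction: start() typically clears the
 * pte, so that a concurrent hardware update of the accessed/dirty bits
 * cannot be lost between reading the old value and committing the
 * write-protected one.
 */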

/**
 * struct clean_walk - Private struct for the clean_record_pte function.
 * @base: struct wp_walk we derive from
 * @bitmap_pgoff: Address_space page offset of the first bit in @bitmap
 * @bitmap: Bitmap with one bit for each page offset in the address_space
 * range covered.
 * @start: Address_space page offset of first modified pte relative
 * to @bitmap_pgoff
 * @end: Address_space page offset of last modified pte + 1, relative
 * to @bitmap_pgoff
 */
struct clean_walk {
        struct wp_walk base;
        pgoff_t bitmap_pgoff;
        unsigned long *bitmap;
        pgoff_t start;
        pgoff_t end;
};

#define to_clean_walk(_wpwalk) container_of(_wpwalk, struct clean_walk, base)

/**
 * clean_record_pte - Clean a pte and record its address space offset in a
 * bitmap
 * @pte: Pointer to the pte
 * @addr: The virtual page address
 * @end: End of the walked virtual address range
 * @walk: pagetable walk callback argument
 *
 * The function cleans a pte and records the range in
 * virtual address space of touched ptes for efficient TLB flushes.
 * It also records dirty ptes in a bitmap representing page offsets
 * in the address_space, as well as the first and last of the bits
 * touched.
 */
static int clean_record_pte(pte_t *pte, unsigned long addr,
                            unsigned long end, struct mm_walk *walk)
{
        struct wp_walk *wpwalk = walk->private;
        struct clean_walk *cwalk = to_clean_walk(wpwalk);
        pte_t ptent = *pte;

        if (pte_dirty(ptent)) {
                pgoff_t pgoff = ((addr - walk->vma->vm_start) >> PAGE_SHIFT) +
                        walk->vma->vm_pgoff - cwalk->bitmap_pgoff;
                pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte);

                ptent = pte_mkclean(old_pte);
                ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent);

                wpwalk->total++;
                wpwalk->tlbflush_start = min(wpwalk->tlbflush_start, addr);
                wpwalk->tlbflush_end = max(wpwalk->tlbflush_end,
                                           addr + PAGE_SIZE);

                __set_bit(pgoff, cwalk->bitmap);
                cwalk->start = min(cwalk->start, pgoff);
                cwalk->end = max(cwalk->end, pgoff + 1);
        }

        return 0;
}
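
/*
 * Worked example of the pgoff computation in clean_record_pte(), with
 * illustrative numbers: for a vma with vm_pgoff == 16, a dirty pte at
 * addr == vma->vm_start + 3 * PAGE_SIZE and bitmap_pgoff == 16 gives
 * pgoff == 3 + 16 - 16 == 3, so bit n of the bitmap stands for
 * address_space page offset bitmap_pgoff + n.
 */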

/* wp_clean_pmd_entry - The pagewalk pmd callback. */
static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        /* Dirty-tracking should be handled on the pte level */
        pmd_t pmdval = pmd_read_atomic(pmd);

        if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
                WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval));

        return 0;
}

/* wp_clean_pud_entry - The pagewalk pud callback. */
static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        /* Dirty-tracking should be handled on the pte level */
        pud_t pudval = READ_ONCE(*pud);

        if (pud_trans_huge(pudval) || pud_devmap(pudval))
                WARN_ON(pud_write(pudval) || pud_dirty(pudval));

        return 0;
}

/*
 * wp_clean_pre_vma - The pagewalk pre_vma callback.
 *
 * The pre_vma callback performs the cache flush, stages the tlb flush
 * and calls the necessary mmu notifiers.
 */
static int wp_clean_pre_vma(unsigned long start, unsigned long end,
                            struct mm_walk *walk)
{
        struct wp_walk *wpwalk = walk->private;

        wpwalk->tlbflush_start = end;
        wpwalk->tlbflush_end = start;

        mmu_notifier_range_init(&wpwalk->range, MMU_NOTIFY_PROTECTION_PAGE, 0,
                                walk->vma, walk->mm, start, end);
        mmu_notifier_invalidate_range_start(&wpwalk->range);
        flush_cache_range(walk->vma, start, end);

        /*
         * We're not using tlb_gather_mmu() since typically
         * only a small subrange of PTEs are affected, whereas
         * tlb_gather_mmu() records the full range.
         */
        inc_tlb_flush_pending(walk->mm);

        return 0;
}
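
/*
 * Note that wp_clean_pre_vma() stages an inverted, empty tlbflush range
 * (tlbflush_start == end, tlbflush_end == start). The min()/max()
 * updates in the pte callbacks then shrink-wrap the range around the
 * ptes actually modified, and tlbflush_end <= tlbflush_start remains
 * true if nothing was modified, which lets wp_clean_post_vma() below
 * skip the TLB flush entirely.
 */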

/*
 * wp_clean_post_vma - The pagewalk post_vma callback.
 *
 * The post_vma callback performs the tlb flush and calls necessary mmu
 * notifiers.
 */
static void wp_clean_post_vma(struct mm_walk *walk)
{
        struct wp_walk *wpwalk = walk->private;

        if (mm_tlb_flush_nested(walk->mm))
                flush_tlb_range(walk->vma, wpwalk->range.start,
                                wpwalk->range.end);
        else if (wpwalk->tlbflush_end > wpwalk->tlbflush_start)
                flush_tlb_range(walk->vma, wpwalk->tlbflush_start,
                                wpwalk->tlbflush_end);

        mmu_notifier_invalidate_range_end(&wpwalk->range);
        dec_tlb_flush_pending(walk->mm);
}

/*
 * wp_clean_test_walk - The pagewalk test_walk callback.
 *
 * Won't perform dirty-tracking on COW, read-only or HUGETLB vmas.
 */
static int wp_clean_test_walk(unsigned long start, unsigned long end,
                              struct mm_walk *walk)
{
        unsigned long vm_flags = READ_ONCE(walk->vma->vm_flags);

        /* Skip non-applicable VMAs */
        if ((vm_flags & (VM_SHARED | VM_MAYWRITE | VM_HUGETLB)) !=
            (VM_SHARED | VM_MAYWRITE))
                return 1;

        return 0;
}
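
/*
 * The two ops tables below differ only in their pte_entry callback:
 * clean_walk_ops cleans dirty ptes and records them in a bitmap, while
 * wp_walk_ops write-protects writable ptes. The callbacks that warn on
 * huge entries, filter out non-applicable VMAs and stage the cache and
 * TLB flushes are shared.
 */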

static const struct mm_walk_ops clean_walk_ops = {
        .pte_entry = clean_record_pte,
        .pmd_entry = wp_clean_pmd_entry,
        .pud_entry = wp_clean_pud_entry,
        .test_walk = wp_clean_test_walk,
        .pre_vma = wp_clean_pre_vma,
        .post_vma = wp_clean_post_vma
};

static const struct mm_walk_ops wp_walk_ops = {
        .pte_entry = wp_pte,
        .pmd_entry = wp_clean_pmd_entry,
        .pud_entry = wp_clean_pud_entry,
        .test_walk = wp_clean_test_walk,
        .pre_vma = wp_clean_pre_vma,
        .post_vma = wp_clean_post_vma
};

/**
 * wp_shared_mapping_range - Write-protect all ptes in an address space range
 * @mapping: The address_space we want to write protect
 * @first_index: The first page offset in the range
 * @nr: Number of incremental page offsets to cover
 *
 * Note: This function currently skips transhuge page-table entries, since
 * it's intended for dirty-tracking on the PTE level. It will warn on
 * encountering transhuge write-enabled entries, though, and can easily be
 * extended to handle them as well.
 *
 * Return: The number of ptes actually write-protected. Note that
 * already write-protected ptes are not counted.
 */
unsigned long wp_shared_mapping_range(struct address_space *mapping,
                                      pgoff_t first_index, pgoff_t nr)
{
        struct wp_walk wpwalk = { .total = 0 };

        i_mmap_lock_read(mapping);
        WARN_ON(walk_page_mapping(mapping, first_index, nr, &wp_walk_ops,
                                  &wpwalk));
        i_mmap_unlock_read(mapping);

        return wpwalk.total;
}
EXPORT_SYMBOL_GPL(wp_shared_mapping_range);
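
/*
 * Usage sketch (illustrative only; my_mapping, MY_FIRST_PAGE and
 * MY_NR_PAGES are placeholder names, not part of this file):
 *
 *      unsigned long num_wp;
 *
 *      num_wp = wp_shared_mapping_range(my_mapping, MY_FIRST_PAGE,
 *                                       MY_NR_PAGES);
 *
 * After the call, num_wp holds the number of ptes that were writable
 * and are now write-protected; the next write to any of those pages
 * will fault and go through page_mkwrite() or pfn_mkwrite().
 */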

/**
 * clean_record_shared_mapping_range - Clean and record all ptes in an
 * address space range
 * @mapping: The address_space we want to clean
 * @first_index: The first page offset in the range
 * @nr: Number of incremental page offsets to cover
 * @bitmap_pgoff: The page offset of the first bit in @bitmap
 * @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to
 * cover the whole range @first_index..@first_index + @nr.
 * @start: Pointer to the page offset, relative to @bitmap_pgoff, of the
 * first set bit in @bitmap. The value is modified as new bits are set by
 * the function.
 * @end: Pointer to the page offset, relative to @bitmap_pgoff, one past the
 * last set bit in @bitmap. On entry, *start >= *end means that no bits are
 * set. The value is modified as new bits are set by the function.
 *
 * Note: When this function returns there is no guarantee that a CPU has
 * not already dirtied new ptes. However it will not clean any ptes not
 * reported in the bitmap. The guarantees are as follows:
 * a) All ptes dirty when the function starts executing will end up recorded
 *    in the bitmap.
 * b) All ptes dirtied after that will either remain dirty, be recorded in the
 *    bitmap or both.
 *
 * If a caller needs to make sure all dirty ptes are picked up and no
 * additional ones are added, it first needs to write-protect the
 * address-space range and make sure new writers are blocked in
 * page_mkwrite() or pfn_mkwrite(). And then, after a TLB flush following
 * the write-protection, pick up all dirty bits.
 *
 * Note: This function currently skips transhuge page-table entries, since
 * it's intended for dirty-tracking on the PTE level. It will warn on
 * encountering transhuge dirty entries, though, and can easily be extended
 * to handle them as well.
 *
 * Return: The number of dirty ptes actually cleaned.
 */
unsigned long clean_record_shared_mapping_range(struct address_space *mapping,
                                                pgoff_t first_index, pgoff_t nr,
                                                pgoff_t bitmap_pgoff,
                                                unsigned long *bitmap,
                                                pgoff_t *start,
                                                pgoff_t *end)
{
        bool none_set = (*start >= *end);
        struct clean_walk cwalk = {
                .base = { .total = 0 },
                .bitmap_pgoff = bitmap_pgoff,
                .bitmap = bitmap,
                .start = none_set ? nr : *start,
                .end = none_set ? 0 : *end,
        };

        i_mmap_lock_read(mapping);
        WARN_ON(walk_page_mapping(mapping, first_index, nr, &clean_walk_ops,
                                  &cwalk.base));
        i_mmap_unlock_read(mapping);

        *start = cwalk.start;
        *end = cwalk.end;

        return cwalk.base.total;
}
EXPORT_SYMBOL_GPL(clean_record_shared_mapping_range);
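
/*
 * Usage sketch (illustrative only; my_mapping, MY_FIRST_PAGE,
 * MY_NR_PAGES and flush_page_to_device() are placeholder names): one
 * dirty-tracking pass where the bitmap covers exactly the walked range,
 * i.e. bitmap_pgoff == MY_FIRST_PAGE, so bit i corresponds to
 * address_space page offset MY_FIRST_PAGE + i.
 *
 *      DECLARE_BITMAP(dirty, MY_NR_PAGES);
 *      pgoff_t start = MY_NR_PAGES, end = 0;   (start >= end: no bits set)
 *      unsigned long i;
 *
 *      bitmap_zero(dirty, MY_NR_PAGES);
 *      clean_record_shared_mapping_range(my_mapping, MY_FIRST_PAGE,
 *                                        MY_NR_PAGES, MY_FIRST_PAGE,
 *                                        dirty, &start, &end);
 *      for_each_set_bit(i, dirty, MY_NR_PAGES)
 *              flush_page_to_device(MY_FIRST_PAGE + i);
 */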