// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pte_t *pte;
	int err = 0;
	const struct mm_walk_ops *ops = walk->ops;
	spinlock_t *ptl;

	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	for (;;) {
		err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
		if (err)
			break;
		addr += PAGE_SIZE;
		if (addr == end)
			break;
		pte++;
	}

	pte_unmap_unlock(pte, ptl);
	return err;
}

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	pmd = pmd_offset(pud, addr);
	do {
again:
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd) || !walk->vma) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}
		/*
		 * This implies that each ->pmd_entry() handler
		 * needs to know about pmd_trans_huge() pmds.
		 */
		if (ops->pmd_entry)
			err = ops->pmd_entry(pmd, addr, next, walk);
		if (err)
			break;

		/*
		 * Check this here so we only break down trans_huge
		 * pages when we _need_ to.
		 */
		if (!ops->pte_entry)
			continue;

		split_huge_pmd(walk->vma, pmd, addr);
		if (pmd_trans_unstable(pmd))
			goto again;
		err = walk_pte_range(pmd, addr, next, walk);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);

	return err;
}
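/*
 * Illustrative sketch (not part of this file): what the pmd_trans_huge()
 * awareness demanded of ->pmd_entry() handlers above can look like in
 * caller code. The handler name and its body are hypothetical; the
 * pmd_trans_huge_lock() pattern is the usual way such handlers
 * distinguish a huge pmd from a table of ptes:
 *
 *	static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
 *				     unsigned long next, struct mm_walk *walk)
 *	{
 *		spinlock_t *ptl = pmd_trans_huge_lock(pmd, walk->vma);
 *
 *		if (ptl) {
 *			// pmd maps a transparent huge page: handle the
 *			// whole mapping here under the lock.
 *			spin_unlock(ptl);
 *			return 0;
 *		}
 *		// Normal pmd: returning 0 lets walk_pte_range() visit
 *		// the individual ptes (when ->pte_entry is set).
 *		return 0;
 *	}
 */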
static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pud_t *pud;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	pud = pud_offset(p4d, addr);
	do {
again:
		next = pud_addr_end(addr, end);
		if (pud_none(*pud) || !walk->vma) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}

		if (ops->pud_entry) {
			spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);

			if (ptl) {
				err = ops->pud_entry(pud, addr, next, walk);
				spin_unlock(ptl);
				if (err)
					break;
				continue;
			}
		}

		split_huge_pud(walk->vma, pud, addr);
		if (pud_none(*pud))
			goto again;

		if (ops->pmd_entry || ops->pte_entry)
			err = walk_pmd_range(pud, addr, next, walk);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);

	return err;
}

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	p4d_t *p4d;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}
		if (ops->pmd_entry || ops->pte_entry)
			err = walk_pud_range(p4d, addr, next, walk);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);

	return err;
}
static int walk_pgd_range(unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pgd_t *pgd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	pgd = pgd_offset(walk->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}
		if (ops->pmd_entry || ops->pte_entry)
			err = walk_p4d_range(pgd, addr, next, walk);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
	return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct hstate *h = hstate_vma(vma);
	unsigned long next;
	unsigned long hmask = huge_page_mask(h);
	unsigned long sz = huge_page_size(h);
	pte_t *pte;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	do {
		next = hugetlb_entry_end(h, addr, end);
		pte = huge_pte_offset(walk->mm, addr & hmask, sz);

		if (pte)
			err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
		else if (ops->pte_hole)
			err = ops->pte_hole(addr, next, walk);

		if (err)
			break;
	} while (addr = next, addr != end);

	return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */
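/*
 * Worked example for the stepping in walk_hugetlb_range() above, assuming
 * a 2MB huge page size (huge_page_mask(h) == ~0x1fffffUL): for
 * addr == 0x201000 and end == 0x800000, hugetlb_entry_end() returns
 * (0x201000 & ~0x1fffff) + 0x200000 == 0x400000, so the walk advances
 * to the next huge page boundary rather than by PAGE_SIZE.
 */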
/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. A negative value means
 * an error, in which case we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
			  struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (ops->test_walk)
		return ops->test_walk(start, end, walk);

	/*
	 * vma(VM_PFNMAP) doesn't have any valid struct pages behind the
	 * VM_PFNMAP range, so we don't walk over it as we do for normal
	 * vmas. However, some callers are interested in handling hole
	 * ranges and don't want to just ignore any single address range.
	 * Such users certainly define their ->pte_hole() callbacks, so
	 * let's delegate them to handle vma(VM_PFNMAP).
	 */
	if (vma->vm_flags & VM_PFNMAP) {
		int err = 1;

		if (ops->pte_hole)
			err = ops->pte_hole(start, end, walk);
		return err ? err : 1;
	}
	return 0;
}

static int __walk_page_range(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	int err = 0;
	struct vm_area_struct *vma = walk->vma;

	if (vma && is_vm_hugetlb_page(vma)) {
		if (walk->ops->hugetlb_entry)
			err = walk_hugetlb_range(start, end, walk);
	} else
		err = walk_pgd_range(start, end, walk);

	return err;
}

/**
 * walk_page_range - walk page table with caller specific callbacks
 * @mm:		mm_struct representing the target process of page table walk
 * @start:	start address of the virtual address range
 * @end:	end address of the virtual address range
 * @ops:	operations to call during the walk
 * @private:	private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end).
 * During the walk, we can do caller-specific work for each entry by
 * setting up pmd_entry(), pte_entry(), and/or hugetlb_entry(). If you
 * don't set up one of these callbacks, the associated entries/pages are
 * simply ignored.
 * The return values of these callbacks are commonly defined as follows:
 *
 *  - 0  : the current entry was handled successfully; if the end address
 *         has not been reached yet, the walk continues.
 *  - >0 : the current entry was handled successfully, and the walk returns
 *         to the caller with this caller-specific value.
 *  - <0 : handling the current entry failed, and the walk returns to the
 *         caller with this error code.
 *
 * Before starting to walk the page table, some callers want to check
 * whether they really want to walk over the current vma, typically by
 * checking its vm_flags. walk_page_test() and @ops->test_walk() are used
 * for this purpose.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for access from callbacks. If you want to pass
 * caller-specific data to the callbacks, @private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold
 *   @mm->mmap_sem, because these functions traverse the vma list and/or
 *   access vma fields.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
		unsigned long end, const struct mm_walk_ops *ops,
		void *private)
{
	int err = 0;
	unsigned long next;
	struct vm_area_struct *vma;
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= mm,
		.private	= private,
	};

	if (start >= end)
		return -EINVAL;

	if (!walk.mm)
		return -EINVAL;

	lockdep_assert_held(&walk.mm->mmap_sem);

	vma = find_vma(walk.mm, start);
	do {
		if (!vma) { /* after the last vma */
			walk.vma = NULL;
			next = end;
		} else if (start < vma->vm_start) { /* outside vma */
			walk.vma = NULL;
			next = min(end, vma->vm_start);
		} else { /* inside vma */
			walk.vma = vma;
			next = min(end, vma->vm_end);
			vma = vma->vm_next;

			err = walk_page_test(start, next, &walk);
			if (err > 0) {
				/*
				 * positive return values are purely for
				 * controlling the pagewalk, so should never
				 * be passed to the callers.
				 */
				err = 0;
				continue;
			}
			if (err < 0)
				break;
		}
		if (walk.vma || walk.ops->pte_hole)
			err = __walk_page_range(start, next, &walk);
		if (err)
			break;
	} while (start = next, start < end);
	return err;
}

int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
		void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= vma->vm_mm,
		.vma		= vma,
		.private	= private,
	};
	int err;

	if (!walk.mm)
		return -EINVAL;

	lockdep_assert_held(&walk.mm->mmap_sem);

	err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
	if (err > 0)
		return 0;
	if (err < 0)
		return err;
	return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}
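/*
 * Usage sketch (illustrative caller code, not part of this file): a
 * hypothetical walker that counts present ptes in a range, showing the
 * ->pte_entry() callback and @private conventions documented at
 * walk_page_range() above:
 *
 *	static int count_pte(pte_t *pte, unsigned long addr,
 *			     unsigned long next, struct mm_walk *walk)
 *	{
 *		unsigned long *nr_present = walk->private;
 *
 *		if (pte_present(*pte))
 *			(*nr_present)++;
 *		return 0;	// keep walking
 *	}
 *
 *	static const struct mm_walk_ops count_ops = {
 *		.pte_entry	= count_pte,
 *	};
 *
 *	unsigned long nr_present = 0;
 *
 *	down_read(&mm->mmap_sem);	// required, see Locking above
 *	err = walk_page_range(mm, start, end, &count_ops, &nr_present);
 *	up_read(&mm->mmap_sem);
 */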