1b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0 2a520110eSChristoph Hellwig #include <linux/pagewalk.h> 3e6473092SMatt Mackall #include <linux/highmem.h> 4e6473092SMatt Mackall #include <linux/sched.h> 5d33b9f45SNaoya Horiguchi #include <linux/hugetlb.h> 6e6473092SMatt Mackall 7b7a16c7aSSteven Price /* 8b7a16c7aSSteven Price * We want to know the real level where a entry is located ignoring any 9b7a16c7aSSteven Price * folding of levels which may be happening. For example if p4d is folded then 10b7a16c7aSSteven Price * a missing entry found at level 1 (p4d) is actually at level 0 (pgd). 11b7a16c7aSSteven Price */ 12b7a16c7aSSteven Price static int real_depth(int depth) 13b7a16c7aSSteven Price { 14b7a16c7aSSteven Price if (depth == 3 && PTRS_PER_PMD == 1) 15b7a16c7aSSteven Price depth = 2; 16b7a16c7aSSteven Price if (depth == 2 && PTRS_PER_PUD == 1) 17b7a16c7aSSteven Price depth = 1; 18b7a16c7aSSteven Price if (depth == 1 && PTRS_PER_P4D == 1) 19b7a16c7aSSteven Price depth = 0; 20b7a16c7aSSteven Price return depth; 21b7a16c7aSSteven Price } 22b7a16c7aSSteven Price 23fbf56346SSteven Price static int walk_pte_range_inner(pte_t *pte, unsigned long addr, 24fbf56346SSteven Price unsigned long end, struct mm_walk *walk) 25e6473092SMatt Mackall { 267b86ac33SChristoph Hellwig const struct mm_walk_ops *ops = walk->ops; 27fbf56346SSteven Price int err = 0; 28e6473092SMatt Mackall 29556637cdSJohannes Weiner for (;;) { 307b86ac33SChristoph Hellwig err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk); 31e6473092SMatt Mackall if (err) 32e6473092SMatt Mackall break; 33c02a9875SSteven Price if (addr >= end - PAGE_SIZE) 34556637cdSJohannes Weiner break; 35c02a9875SSteven Price addr += PAGE_SIZE; 36556637cdSJohannes Weiner pte++; 37556637cdSJohannes Weiner } 38fbf56346SSteven Price return err; 39fbf56346SSteven Price } 40e6473092SMatt Mackall 41fbf56346SSteven Price static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 42fbf56346SSteven 
Price struct mm_walk *walk) 43fbf56346SSteven Price { 44fbf56346SSteven Price pte_t *pte; 45fbf56346SSteven Price int err = 0; 46fbf56346SSteven Price spinlock_t *ptl; 47fbf56346SSteven Price 48fbf56346SSteven Price if (walk->no_vma) { 49fbf56346SSteven Price pte = pte_offset_map(pmd, addr); 50fbf56346SSteven Price err = walk_pte_range_inner(pte, addr, end, walk); 51fbf56346SSteven Price pte_unmap(pte); 52fbf56346SSteven Price } else { 53fbf56346SSteven Price pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 54fbf56346SSteven Price err = walk_pte_range_inner(pte, addr, end, walk); 55ace88f10SThomas Hellstrom pte_unmap_unlock(pte, ptl); 56fbf56346SSteven Price } 57fbf56346SSteven Price 58e6473092SMatt Mackall return err; 59e6473092SMatt Mackall } 60e6473092SMatt Mackall 61e17eae2bSChristophe Leroy #ifdef CONFIG_ARCH_HAS_HUGEPD 62e17eae2bSChristophe Leroy static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr, 63e17eae2bSChristophe Leroy unsigned long end, struct mm_walk *walk, int pdshift) 64e17eae2bSChristophe Leroy { 65e17eae2bSChristophe Leroy int err = 0; 66e17eae2bSChristophe Leroy const struct mm_walk_ops *ops = walk->ops; 67e17eae2bSChristophe Leroy int shift = hugepd_shift(*phpd); 68e17eae2bSChristophe Leroy int page_size = 1 << shift; 69e17eae2bSChristophe Leroy 70e17eae2bSChristophe Leroy if (!ops->pte_entry) 71e17eae2bSChristophe Leroy return 0; 72e17eae2bSChristophe Leroy 73e17eae2bSChristophe Leroy if (addr & (page_size - 1)) 74e17eae2bSChristophe Leroy return 0; 75e17eae2bSChristophe Leroy 76e17eae2bSChristophe Leroy for (;;) { 77e17eae2bSChristophe Leroy pte_t *pte; 78e17eae2bSChristophe Leroy 79e17eae2bSChristophe Leroy spin_lock(&walk->mm->page_table_lock); 80e17eae2bSChristophe Leroy pte = hugepte_offset(*phpd, addr, pdshift); 81e17eae2bSChristophe Leroy err = ops->pte_entry(pte, addr, addr + page_size, walk); 82e17eae2bSChristophe Leroy spin_unlock(&walk->mm->page_table_lock); 83e17eae2bSChristophe Leroy 84e17eae2bSChristophe Leroy if 
(err) 85e17eae2bSChristophe Leroy break; 86e17eae2bSChristophe Leroy if (addr >= end - page_size) 87e17eae2bSChristophe Leroy break; 88e17eae2bSChristophe Leroy addr += page_size; 89e17eae2bSChristophe Leroy } 90e17eae2bSChristophe Leroy return err; 91e17eae2bSChristophe Leroy } 92e17eae2bSChristophe Leroy #else 93e17eae2bSChristophe Leroy static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr, 94e17eae2bSChristophe Leroy unsigned long end, struct mm_walk *walk, int pdshift) 95e17eae2bSChristophe Leroy { 96e17eae2bSChristophe Leroy return 0; 97e17eae2bSChristophe Leroy } 98e17eae2bSChristophe Leroy #endif 99e17eae2bSChristophe Leroy 100e6473092SMatt Mackall static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, 1012165009bSDave Hansen struct mm_walk *walk) 102e6473092SMatt Mackall { 103e6473092SMatt Mackall pmd_t *pmd; 104e6473092SMatt Mackall unsigned long next; 1057b86ac33SChristoph Hellwig const struct mm_walk_ops *ops = walk->ops; 106e6473092SMatt Mackall int err = 0; 107b7a16c7aSSteven Price int depth = real_depth(3); 108e6473092SMatt Mackall 109e6473092SMatt Mackall pmd = pmd_offset(pud, addr); 110e6473092SMatt Mackall do { 11103319327SDave Hansen again: 112e6473092SMatt Mackall next = pmd_addr_end(addr, end); 1138782fb61SSteven Price if (pmd_none(*pmd)) { 1147b86ac33SChristoph Hellwig if (ops->pte_hole) 115b7a16c7aSSteven Price err = ops->pte_hole(addr, next, depth, walk); 116e6473092SMatt Mackall if (err) 117e6473092SMatt Mackall break; 118e6473092SMatt Mackall continue; 119e6473092SMatt Mackall } 1203afc4236SSteven Price 1213afc4236SSteven Price walk->action = ACTION_SUBTREE; 1223afc4236SSteven Price 12303319327SDave Hansen /* 12403319327SDave Hansen * This implies that each ->pmd_entry() handler 12503319327SDave Hansen * needs to know about pmd_trans_huge() pmds 12603319327SDave Hansen */ 1277b86ac33SChristoph Hellwig if (ops->pmd_entry) 1287b86ac33SChristoph Hellwig err = ops->pmd_entry(pmd, addr, next, walk); 
12903319327SDave Hansen if (err) 13003319327SDave Hansen break; 13103319327SDave Hansen 1323afc4236SSteven Price if (walk->action == ACTION_AGAIN) 1333afc4236SSteven Price goto again; 1343afc4236SSteven Price 13503319327SDave Hansen /* 13603319327SDave Hansen * Check this here so we only break down trans_huge 13703319327SDave Hansen * pages when we _need_ to 13803319327SDave Hansen */ 139488ae6a2SSteven Price if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) || 140488ae6a2SSteven Price walk->action == ACTION_CONTINUE || 1413afc4236SSteven Price !(ops->pte_entry)) 14203319327SDave Hansen continue; 14303319327SDave Hansen 144488ae6a2SSteven Price if (walk->vma) { 14578ddc534SKirill A. Shutemov split_huge_pmd(walk->vma, pmd, addr); 146fafaa426SNaoya Horiguchi if (pmd_trans_unstable(pmd)) 14703319327SDave Hansen goto again; 148488ae6a2SSteven Price } 1493afc4236SSteven Price 150e17eae2bSChristophe Leroy if (is_hugepd(__hugepd(pmd_val(*pmd)))) 151e17eae2bSChristophe Leroy err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT); 152e17eae2bSChristophe Leroy else 1532165009bSDave Hansen err = walk_pte_range(pmd, addr, next, walk); 154e6473092SMatt Mackall if (err) 155e6473092SMatt Mackall break; 156e6473092SMatt Mackall } while (pmd++, addr = next, addr != end); 157e6473092SMatt Mackall 158e6473092SMatt Mackall return err; 159e6473092SMatt Mackall } 160e6473092SMatt Mackall 161c2febafcSKirill A. Shutemov static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, 1622165009bSDave Hansen struct mm_walk *walk) 163e6473092SMatt Mackall { 164e6473092SMatt Mackall pud_t *pud; 165e6473092SMatt Mackall unsigned long next; 1667b86ac33SChristoph Hellwig const struct mm_walk_ops *ops = walk->ops; 167e6473092SMatt Mackall int err = 0; 168b7a16c7aSSteven Price int depth = real_depth(2); 169e6473092SMatt Mackall 170c2febafcSKirill A. 
Shutemov pud = pud_offset(p4d, addr); 171e6473092SMatt Mackall do { 172a00cc7d9SMatthew Wilcox again: 173e6473092SMatt Mackall next = pud_addr_end(addr, end); 1748782fb61SSteven Price if (pud_none(*pud)) { 1757b86ac33SChristoph Hellwig if (ops->pte_hole) 176b7a16c7aSSteven Price err = ops->pte_hole(addr, next, depth, walk); 177e6473092SMatt Mackall if (err) 178e6473092SMatt Mackall break; 179e6473092SMatt Mackall continue; 180e6473092SMatt Mackall } 181a00cc7d9SMatthew Wilcox 1823afc4236SSteven Price walk->action = ACTION_SUBTREE; 183a00cc7d9SMatthew Wilcox 1843afc4236SSteven Price if (ops->pud_entry) 1857b86ac33SChristoph Hellwig err = ops->pud_entry(pud, addr, next, walk); 186a00cc7d9SMatthew Wilcox if (err) 187a00cc7d9SMatthew Wilcox break; 1883afc4236SSteven Price 1893afc4236SSteven Price if (walk->action == ACTION_AGAIN) 1903afc4236SSteven Price goto again; 1913afc4236SSteven Price 192488ae6a2SSteven Price if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) || 193488ae6a2SSteven Price walk->action == ACTION_CONTINUE || 1943afc4236SSteven Price !(ops->pmd_entry || ops->pte_entry)) 195a00cc7d9SMatthew Wilcox continue; 196a00cc7d9SMatthew Wilcox 197488ae6a2SSteven Price if (walk->vma) 198a00cc7d9SMatthew Wilcox split_huge_pud(walk->vma, pud, addr); 199a00cc7d9SMatthew Wilcox if (pud_none(*pud)) 200a00cc7d9SMatthew Wilcox goto again; 201a00cc7d9SMatthew Wilcox 202e17eae2bSChristophe Leroy if (is_hugepd(__hugepd(pud_val(*pud)))) 203e17eae2bSChristophe Leroy err = walk_hugepd_range((hugepd_t *)pud, addr, next, walk, PUD_SHIFT); 204e17eae2bSChristophe Leroy else 2052165009bSDave Hansen err = walk_pmd_range(pud, addr, next, walk); 206e6473092SMatt Mackall if (err) 207e6473092SMatt Mackall break; 208e6473092SMatt Mackall } while (pud++, addr = next, addr != end); 209e6473092SMatt Mackall 210e6473092SMatt Mackall return err; 211e6473092SMatt Mackall } 212e6473092SMatt Mackall 213c2febafcSKirill A. 
static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	p4d_t *p4d;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(1);

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}
		if (ops->p4d_entry) {
			err = ops->p4d_entry(p4d, addr, next, walk);
			if (err)
				break;
		}
		if (is_hugepd(__hugepd(p4d_val(*p4d))))
			err = walk_hugepd_range((hugepd_t *)p4d, addr, next, walk, P4D_SHIFT);
		else if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
			err = walk_pud_range(p4d, addr, next, walk);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);

	return err;
}
Shutemov 248fafaa426SNaoya Horiguchi static int walk_pgd_range(unsigned long addr, unsigned long end, 249fafaa426SNaoya Horiguchi struct mm_walk *walk) 250fafaa426SNaoya Horiguchi { 251fafaa426SNaoya Horiguchi pgd_t *pgd; 252fafaa426SNaoya Horiguchi unsigned long next; 2537b86ac33SChristoph Hellwig const struct mm_walk_ops *ops = walk->ops; 254fafaa426SNaoya Horiguchi int err = 0; 255fafaa426SNaoya Horiguchi 256e47690d7SSteven Price if (walk->pgd) 257e47690d7SSteven Price pgd = walk->pgd + pgd_index(addr); 258e47690d7SSteven Price else 259fafaa426SNaoya Horiguchi pgd = pgd_offset(walk->mm, addr); 260fafaa426SNaoya Horiguchi do { 261fafaa426SNaoya Horiguchi next = pgd_addr_end(addr, end); 262fafaa426SNaoya Horiguchi if (pgd_none_or_clear_bad(pgd)) { 2637b86ac33SChristoph Hellwig if (ops->pte_hole) 264b7a16c7aSSteven Price err = ops->pte_hole(addr, next, 0, walk); 265fafaa426SNaoya Horiguchi if (err) 266fafaa426SNaoya Horiguchi break; 267fafaa426SNaoya Horiguchi continue; 268fafaa426SNaoya Horiguchi } 2693afc4236SSteven Price if (ops->pgd_entry) { 2703afc4236SSteven Price err = ops->pgd_entry(pgd, addr, next, walk); 2713afc4236SSteven Price if (err) 2723afc4236SSteven Price break; 2733afc4236SSteven Price } 274e17eae2bSChristophe Leroy if (is_hugepd(__hugepd(pgd_val(*pgd)))) 275e17eae2bSChristophe Leroy err = walk_hugepd_range((hugepd_t *)pgd, addr, next, walk, PGDIR_SHIFT); 276e17eae2bSChristophe Leroy else if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry) 277c2febafcSKirill A. 
Shutemov err = walk_p4d_range(pgd, addr, next, walk); 278fafaa426SNaoya Horiguchi if (err) 279fafaa426SNaoya Horiguchi break; 280fafaa426SNaoya Horiguchi } while (pgd++, addr = next, addr != end); 281fafaa426SNaoya Horiguchi 282fafaa426SNaoya Horiguchi return err; 283fafaa426SNaoya Horiguchi } 284fafaa426SNaoya Horiguchi 285116354d1SNaoya Horiguchi #ifdef CONFIG_HUGETLB_PAGE 286116354d1SNaoya Horiguchi static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, 287116354d1SNaoya Horiguchi unsigned long end) 288116354d1SNaoya Horiguchi { 289116354d1SNaoya Horiguchi unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h); 290116354d1SNaoya Horiguchi return boundary < end ? boundary : end; 291116354d1SNaoya Horiguchi } 292116354d1SNaoya Horiguchi 293fafaa426SNaoya Horiguchi static int walk_hugetlb_range(unsigned long addr, unsigned long end, 294116354d1SNaoya Horiguchi struct mm_walk *walk) 295116354d1SNaoya Horiguchi { 296fafaa426SNaoya Horiguchi struct vm_area_struct *vma = walk->vma; 297116354d1SNaoya Horiguchi struct hstate *h = hstate_vma(vma); 298116354d1SNaoya Horiguchi unsigned long next; 299116354d1SNaoya Horiguchi unsigned long hmask = huge_page_mask(h); 3007868a208SPunit Agrawal unsigned long sz = huge_page_size(h); 301116354d1SNaoya Horiguchi pte_t *pte; 3027b86ac33SChristoph Hellwig const struct mm_walk_ops *ops = walk->ops; 303116354d1SNaoya Horiguchi int err = 0; 304116354d1SNaoya Horiguchi 305*dd361e50SPeter Xu hugetlb_vma_lock_read(vma); 306116354d1SNaoya Horiguchi do { 307116354d1SNaoya Horiguchi next = hugetlb_entry_end(h, addr, end); 3087868a208SPunit Agrawal pte = huge_pte_offset(walk->mm, addr & hmask, sz); 309373c4557SJann Horn 310373c4557SJann Horn if (pte) 3117b86ac33SChristoph Hellwig err = ops->hugetlb_entry(pte, hmask, addr, next, walk); 3127b86ac33SChristoph Hellwig else if (ops->pte_hole) 313b7a16c7aSSteven Price err = ops->pte_hole(addr, next, -1, walk); 314373c4557SJann Horn 315116354d1SNaoya Horiguchi 
if (err) 316fafaa426SNaoya Horiguchi break; 317116354d1SNaoya Horiguchi } while (addr = next, addr != end); 318*dd361e50SPeter Xu hugetlb_vma_unlock_read(vma); 319116354d1SNaoya Horiguchi 320fafaa426SNaoya Horiguchi return err; 321116354d1SNaoya Horiguchi } 3226c6d5280SKOSAKI Motohiro 3236c6d5280SKOSAKI Motohiro #else /* CONFIG_HUGETLB_PAGE */ 324fafaa426SNaoya Horiguchi static int walk_hugetlb_range(unsigned long addr, unsigned long end, 3256c6d5280SKOSAKI Motohiro struct mm_walk *walk) 3266c6d5280SKOSAKI Motohiro { 3276c6d5280SKOSAKI Motohiro return 0; 3286c6d5280SKOSAKI Motohiro } 3296c6d5280SKOSAKI Motohiro 3306c6d5280SKOSAKI Motohiro #endif /* CONFIG_HUGETLB_PAGE */ 3316c6d5280SKOSAKI Motohiro 332fafaa426SNaoya Horiguchi /* 333fafaa426SNaoya Horiguchi * Decide whether we really walk over the current vma on [@start, @end) 334fafaa426SNaoya Horiguchi * or skip it via the returned value. Return 0 if we do walk over the 335fafaa426SNaoya Horiguchi * current vma, and return 1 if we skip the vma. Negative values means 336fafaa426SNaoya Horiguchi * error, where we abort the current walk. 337e6473092SMatt Mackall */ 338fafaa426SNaoya Horiguchi static int walk_page_test(unsigned long start, unsigned long end, 3392165009bSDave Hansen struct mm_walk *walk) 340e6473092SMatt Mackall { 341fafaa426SNaoya Horiguchi struct vm_area_struct *vma = walk->vma; 3427b86ac33SChristoph Hellwig const struct mm_walk_ops *ops = walk->ops; 343e6473092SMatt Mackall 3447b86ac33SChristoph Hellwig if (ops->test_walk) 3457b86ac33SChristoph Hellwig return ops->test_walk(start, end, walk); 346fafaa426SNaoya Horiguchi 347fafaa426SNaoya Horiguchi /* 34848684a65SNaoya Horiguchi * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP 34948684a65SNaoya Horiguchi * range, so we don't walk over it as we do for normal vmas. 
However, 35048684a65SNaoya Horiguchi * Some callers are interested in handling hole range and they don't 35148684a65SNaoya Horiguchi * want to just ignore any single address range. Such users certainly 35248684a65SNaoya Horiguchi * define their ->pte_hole() callbacks, so let's delegate them to handle 35348684a65SNaoya Horiguchi * vma(VM_PFNMAP). 354fafaa426SNaoya Horiguchi */ 35548684a65SNaoya Horiguchi if (vma->vm_flags & VM_PFNMAP) { 35648684a65SNaoya Horiguchi int err = 1; 3577b86ac33SChristoph Hellwig if (ops->pte_hole) 358b7a16c7aSSteven Price err = ops->pte_hole(start, end, -1, walk); 35948684a65SNaoya Horiguchi return err ? err : 1; 36048684a65SNaoya Horiguchi } 361fafaa426SNaoya Horiguchi return 0; 362fafaa426SNaoya Horiguchi } 363fafaa426SNaoya Horiguchi 364fafaa426SNaoya Horiguchi static int __walk_page_range(unsigned long start, unsigned long end, 365fafaa426SNaoya Horiguchi struct mm_walk *walk) 366fafaa426SNaoya Horiguchi { 367fafaa426SNaoya Horiguchi int err = 0; 368fafaa426SNaoya Horiguchi struct vm_area_struct *vma = walk->vma; 369ecaad8acSThomas Hellstrom const struct mm_walk_ops *ops = walk->ops; 370ecaad8acSThomas Hellstrom 3718782fb61SSteven Price if (ops->pre_vma) { 372ecaad8acSThomas Hellstrom err = ops->pre_vma(start, end, walk); 373ecaad8acSThomas Hellstrom if (err) 374ecaad8acSThomas Hellstrom return err; 375ecaad8acSThomas Hellstrom } 376fafaa426SNaoya Horiguchi 3778782fb61SSteven Price if (is_vm_hugetlb_page(vma)) { 378ecaad8acSThomas Hellstrom if (ops->hugetlb_entry) 379fafaa426SNaoya Horiguchi err = walk_hugetlb_range(start, end, walk); 380fafaa426SNaoya Horiguchi } else 381fafaa426SNaoya Horiguchi err = walk_pgd_range(start, end, walk); 382fafaa426SNaoya Horiguchi 3838782fb61SSteven Price if (ops->post_vma) 384ecaad8acSThomas Hellstrom ops->post_vma(walk); 385ecaad8acSThomas Hellstrom 386e6473092SMatt Mackall return err; 387fafaa426SNaoya Horiguchi } 388fafaa426SNaoya Horiguchi 389fafaa426SNaoya Horiguchi /** 390fafaa426SNaoya 
Horiguchi * walk_page_range - walk page table with caller specific callbacks 3917b86ac33SChristoph Hellwig * @mm: mm_struct representing the target process of page table walk 392e8b098fcSMike Rapoport * @start: start address of the virtual address range 393e8b098fcSMike Rapoport * @end: end address of the virtual address range 3947b86ac33SChristoph Hellwig * @ops: operation to call during the walk 3957b86ac33SChristoph Hellwig * @private: private data for callbacks' usage 396fafaa426SNaoya Horiguchi * 3977b86ac33SChristoph Hellwig * Recursively walk the page table tree of the process represented by @mm 398fafaa426SNaoya Horiguchi * within the virtual address range [@start, @end). During walking, we can do 399fafaa426SNaoya Horiguchi * some caller-specific works for each entry, by setting up pmd_entry(), 400fafaa426SNaoya Horiguchi * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these 401fafaa426SNaoya Horiguchi * callbacks, the associated entries/pages are just ignored. 402fafaa426SNaoya Horiguchi * The return values of these callbacks are commonly defined like below: 403a5d09bedSMike Rapoport * 404fafaa426SNaoya Horiguchi * - 0 : succeeded to handle the current entry, and if you don't reach the 405fafaa426SNaoya Horiguchi * end address yet, continue to walk. 406fafaa426SNaoya Horiguchi * - >0 : succeeded to handle the current entry, and return to the caller 407fafaa426SNaoya Horiguchi * with caller specific value. 408fafaa426SNaoya Horiguchi * - <0 : failed to handle the current entry, and return to the caller 409fafaa426SNaoya Horiguchi * with error code. 410fafaa426SNaoya Horiguchi * 411fafaa426SNaoya Horiguchi * Before starting to walk page table, some callers want to check whether 412fafaa426SNaoya Horiguchi * they really want to walk over the current vma, typically by checking 4137b86ac33SChristoph Hellwig * its vm_flags. walk_page_test() and @ops->test_walk() are used for this 414fafaa426SNaoya Horiguchi * purpose. 
415fafaa426SNaoya Horiguchi * 416ecaad8acSThomas Hellstrom * If operations need to be staged before and committed after a vma is walked, 417ecaad8acSThomas Hellstrom * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(), 418ecaad8acSThomas Hellstrom * since it is intended to handle commit-type operations, can't return any 419ecaad8acSThomas Hellstrom * errors. 420ecaad8acSThomas Hellstrom * 421fafaa426SNaoya Horiguchi * struct mm_walk keeps current values of some common data like vma and pmd, 422fafaa426SNaoya Horiguchi * which are useful for the access from callbacks. If you want to pass some 4237b86ac33SChristoph Hellwig * caller-specific data to callbacks, @private should be helpful. 424fafaa426SNaoya Horiguchi * 425fafaa426SNaoya Horiguchi * Locking: 426c1e8d7c6SMichel Lespinasse * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock, 4277b86ac33SChristoph Hellwig * because these function traverse vma list and/or access to vma's data. 428fafaa426SNaoya Horiguchi */ 4297b86ac33SChristoph Hellwig int walk_page_range(struct mm_struct *mm, unsigned long start, 4307b86ac33SChristoph Hellwig unsigned long end, const struct mm_walk_ops *ops, 4317b86ac33SChristoph Hellwig void *private) 432fafaa426SNaoya Horiguchi { 433fafaa426SNaoya Horiguchi int err = 0; 434fafaa426SNaoya Horiguchi unsigned long next; 435fafaa426SNaoya Horiguchi struct vm_area_struct *vma; 4367b86ac33SChristoph Hellwig struct mm_walk walk = { 4377b86ac33SChristoph Hellwig .ops = ops, 4387b86ac33SChristoph Hellwig .mm = mm, 4397b86ac33SChristoph Hellwig .private = private, 4407b86ac33SChristoph Hellwig }; 441fafaa426SNaoya Horiguchi 442fafaa426SNaoya Horiguchi if (start >= end) 443fafaa426SNaoya Horiguchi return -EINVAL; 444e6473092SMatt Mackall 4457b86ac33SChristoph Hellwig if (!walk.mm) 4462165009bSDave Hansen return -EINVAL; 4472165009bSDave Hansen 44842fc5414SMichel Lespinasse mmap_assert_locked(walk.mm); 449a9ff785eSCliff Wickman 4507b86ac33SChristoph 
Hellwig vma = find_vma(walk.mm, start); 451e6473092SMatt Mackall do { 452fafaa426SNaoya Horiguchi if (!vma) { /* after the last vma */ 4537b86ac33SChristoph Hellwig walk.vma = NULL; 454fafaa426SNaoya Horiguchi next = end; 4558782fb61SSteven Price if (ops->pte_hole) 4568782fb61SSteven Price err = ops->pte_hole(start, next, -1, &walk); 457fafaa426SNaoya Horiguchi } else if (start < vma->vm_start) { /* outside vma */ 4587b86ac33SChristoph Hellwig walk.vma = NULL; 459fafaa426SNaoya Horiguchi next = min(end, vma->vm_start); 4608782fb61SSteven Price if (ops->pte_hole) 4618782fb61SSteven Price err = ops->pte_hole(start, next, -1, &walk); 462fafaa426SNaoya Horiguchi } else { /* inside vma */ 4637b86ac33SChristoph Hellwig walk.vma = vma; 464fafaa426SNaoya Horiguchi next = min(end, vma->vm_end); 4659ec08f30SMatthew Wilcox (Oracle) vma = find_vma(mm, vma->vm_end); 4665f0af70aSDavid Sterba 4677b86ac33SChristoph Hellwig err = walk_page_test(start, next, &walk); 468f6837395SNaoya Horiguchi if (err > 0) { 469f6837395SNaoya Horiguchi /* 470f6837395SNaoya Horiguchi * positive return values are purely for 471f6837395SNaoya Horiguchi * controlling the pagewalk, so should never 472f6837395SNaoya Horiguchi * be passed to the callers. 
473f6837395SNaoya Horiguchi */ 474f6837395SNaoya Horiguchi err = 0; 475a9ff785eSCliff Wickman continue; 476f6837395SNaoya Horiguchi } 477fafaa426SNaoya Horiguchi if (err < 0) 478fafaa426SNaoya Horiguchi break; 4797b86ac33SChristoph Hellwig err = __walk_page_range(start, next, &walk); 4808782fb61SSteven Price } 4815dc37642SNaoya Horiguchi if (err) 4825dc37642SNaoya Horiguchi break; 483fafaa426SNaoya Horiguchi } while (start = next, start < end); 484e6473092SMatt Mackall return err; 485e6473092SMatt Mackall } 486900fc5f1SNaoya Horiguchi 4878bd3873dSRolf Eike Beer /** 4888bd3873dSRolf Eike Beer * walk_page_range_novma - walk a range of pagetables not backed by a vma 4898bd3873dSRolf Eike Beer * @mm: mm_struct representing the target process of page table walk 4908bd3873dSRolf Eike Beer * @start: start address of the virtual address range 4918bd3873dSRolf Eike Beer * @end: end address of the virtual address range 4928bd3873dSRolf Eike Beer * @ops: operation to call during the walk 4938bd3873dSRolf Eike Beer * @pgd: pgd to walk if different from mm->pgd 4948bd3873dSRolf Eike Beer * @private: private data for callbacks' usage 4958bd3873dSRolf Eike Beer * 496fbf56346SSteven Price * Similar to walk_page_range() but can walk any page tables even if they are 497fbf56346SSteven Price * not backed by VMAs. Because 'unusual' entries may be walked this function 498fbf56346SSteven Price * will also not lock the PTEs for the pte_entry() callback. This is useful for 499fbf56346SSteven Price * walking the kernel pages tables or page tables for firmware. 
500fbf56346SSteven Price */ 501488ae6a2SSteven Price int walk_page_range_novma(struct mm_struct *mm, unsigned long start, 502488ae6a2SSteven Price unsigned long end, const struct mm_walk_ops *ops, 503e47690d7SSteven Price pgd_t *pgd, 504488ae6a2SSteven Price void *private) 505488ae6a2SSteven Price { 506488ae6a2SSteven Price struct mm_walk walk = { 507488ae6a2SSteven Price .ops = ops, 508488ae6a2SSteven Price .mm = mm, 509e47690d7SSteven Price .pgd = pgd, 510488ae6a2SSteven Price .private = private, 511488ae6a2SSteven Price .no_vma = true 512488ae6a2SSteven Price }; 513488ae6a2SSteven Price 514488ae6a2SSteven Price if (start >= end || !walk.mm) 515488ae6a2SSteven Price return -EINVAL; 516488ae6a2SSteven Price 5178782fb61SSteven Price mmap_assert_write_locked(walk.mm); 518488ae6a2SSteven Price 5198782fb61SSteven Price return walk_pgd_range(start, end, &walk); 520488ae6a2SSteven Price } 521488ae6a2SSteven Price 522e07cda5fSDavid Hildenbrand int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, 523e07cda5fSDavid Hildenbrand unsigned long end, const struct mm_walk_ops *ops, 524e07cda5fSDavid Hildenbrand void *private) 525e07cda5fSDavid Hildenbrand { 526e07cda5fSDavid Hildenbrand struct mm_walk walk = { 527e07cda5fSDavid Hildenbrand .ops = ops, 528e07cda5fSDavid Hildenbrand .mm = vma->vm_mm, 529e07cda5fSDavid Hildenbrand .vma = vma, 530e07cda5fSDavid Hildenbrand .private = private, 531e07cda5fSDavid Hildenbrand }; 532e07cda5fSDavid Hildenbrand 533e07cda5fSDavid Hildenbrand if (start >= end || !walk.mm) 534e07cda5fSDavid Hildenbrand return -EINVAL; 535e07cda5fSDavid Hildenbrand if (start < vma->vm_start || end > vma->vm_end) 536e07cda5fSDavid Hildenbrand return -EINVAL; 537e07cda5fSDavid Hildenbrand 538e07cda5fSDavid Hildenbrand mmap_assert_locked(walk.mm); 539e07cda5fSDavid Hildenbrand return __walk_page_range(start, end, &walk); 540e07cda5fSDavid Hildenbrand } 541e07cda5fSDavid Hildenbrand 5427b86ac33SChristoph Hellwig int walk_page_vma(struct 
vm_area_struct *vma, const struct mm_walk_ops *ops, 5437b86ac33SChristoph Hellwig void *private) 544900fc5f1SNaoya Horiguchi { 5457b86ac33SChristoph Hellwig struct mm_walk walk = { 5467b86ac33SChristoph Hellwig .ops = ops, 5477b86ac33SChristoph Hellwig .mm = vma->vm_mm, 5487b86ac33SChristoph Hellwig .vma = vma, 5497b86ac33SChristoph Hellwig .private = private, 5507b86ac33SChristoph Hellwig }; 551900fc5f1SNaoya Horiguchi 5527b86ac33SChristoph Hellwig if (!walk.mm) 553900fc5f1SNaoya Horiguchi return -EINVAL; 554900fc5f1SNaoya Horiguchi 55542fc5414SMichel Lespinasse mmap_assert_locked(walk.mm); 5567b86ac33SChristoph Hellwig return __walk_page_range(vma->vm_start, vma->vm_end, &walk); 557900fc5f1SNaoya Horiguchi } 558ecaad8acSThomas Hellstrom 559ecaad8acSThomas Hellstrom /** 560ecaad8acSThomas Hellstrom * walk_page_mapping - walk all memory areas mapped into a struct address_space. 561ecaad8acSThomas Hellstrom * @mapping: Pointer to the struct address_space 562ecaad8acSThomas Hellstrom * @first_index: First page offset in the address_space 563ecaad8acSThomas Hellstrom * @nr: Number of incremental page offsets to cover 564ecaad8acSThomas Hellstrom * @ops: operation to call during the walk 565ecaad8acSThomas Hellstrom * @private: private data for callbacks' usage 566ecaad8acSThomas Hellstrom * 567ecaad8acSThomas Hellstrom * This function walks all memory areas mapped into a struct address_space. 568ecaad8acSThomas Hellstrom * The walk is limited to only the given page-size index range, but if 569ecaad8acSThomas Hellstrom * the index boundaries cross a huge page-table entry, that entry will be 570ecaad8acSThomas Hellstrom * included. 571ecaad8acSThomas Hellstrom * 572ecaad8acSThomas Hellstrom * Also see walk_page_range() for additional information. 
573ecaad8acSThomas Hellstrom * 574ecaad8acSThomas Hellstrom * Locking: 575c1e8d7c6SMichel Lespinasse * This function can't require that the struct mm_struct::mmap_lock is held, 576ecaad8acSThomas Hellstrom * since @mapping may be mapped by multiple processes. Instead 577ecaad8acSThomas Hellstrom * @mapping->i_mmap_rwsem must be held. This might have implications in the 578ecaad8acSThomas Hellstrom * callbacks, and it's up tho the caller to ensure that the 579c1e8d7c6SMichel Lespinasse * struct mm_struct::mmap_lock is not needed. 580ecaad8acSThomas Hellstrom * 581ecaad8acSThomas Hellstrom * Also this means that a caller can't rely on the struct 582ecaad8acSThomas Hellstrom * vm_area_struct::vm_flags to be constant across a call, 583ecaad8acSThomas Hellstrom * except for immutable flags. Callers requiring this shouldn't use 584ecaad8acSThomas Hellstrom * this function. 585ecaad8acSThomas Hellstrom * 586ecaad8acSThomas Hellstrom * Return: 0 on success, negative error code on failure, positive number on 587ecaad8acSThomas Hellstrom * caller defined premature termination. 
588ecaad8acSThomas Hellstrom */ 589ecaad8acSThomas Hellstrom int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, 590ecaad8acSThomas Hellstrom pgoff_t nr, const struct mm_walk_ops *ops, 591ecaad8acSThomas Hellstrom void *private) 592ecaad8acSThomas Hellstrom { 593ecaad8acSThomas Hellstrom struct mm_walk walk = { 594ecaad8acSThomas Hellstrom .ops = ops, 595ecaad8acSThomas Hellstrom .private = private, 596ecaad8acSThomas Hellstrom }; 597ecaad8acSThomas Hellstrom struct vm_area_struct *vma; 598ecaad8acSThomas Hellstrom pgoff_t vba, vea, cba, cea; 599ecaad8acSThomas Hellstrom unsigned long start_addr, end_addr; 600ecaad8acSThomas Hellstrom int err = 0; 601ecaad8acSThomas Hellstrom 602ecaad8acSThomas Hellstrom lockdep_assert_held(&mapping->i_mmap_rwsem); 603ecaad8acSThomas Hellstrom vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index, 604ecaad8acSThomas Hellstrom first_index + nr - 1) { 605ecaad8acSThomas Hellstrom /* Clip to the vma */ 606ecaad8acSThomas Hellstrom vba = vma->vm_pgoff; 607ecaad8acSThomas Hellstrom vea = vba + vma_pages(vma); 608ecaad8acSThomas Hellstrom cba = first_index; 609ecaad8acSThomas Hellstrom cba = max(cba, vba); 610ecaad8acSThomas Hellstrom cea = first_index + nr; 611ecaad8acSThomas Hellstrom cea = min(cea, vea); 612ecaad8acSThomas Hellstrom 613ecaad8acSThomas Hellstrom start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start; 614ecaad8acSThomas Hellstrom end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start; 615ecaad8acSThomas Hellstrom if (start_addr >= end_addr) 616ecaad8acSThomas Hellstrom continue; 617ecaad8acSThomas Hellstrom 618ecaad8acSThomas Hellstrom walk.vma = vma; 619ecaad8acSThomas Hellstrom walk.mm = vma->vm_mm; 620ecaad8acSThomas Hellstrom 621ecaad8acSThomas Hellstrom err = walk_page_test(vma->vm_start, vma->vm_end, &walk); 622ecaad8acSThomas Hellstrom if (err > 0) { 623ecaad8acSThomas Hellstrom err = 0; 624ecaad8acSThomas Hellstrom break; 625ecaad8acSThomas Hellstrom } else if (err < 0) 
626ecaad8acSThomas Hellstrom break; 627ecaad8acSThomas Hellstrom 628ecaad8acSThomas Hellstrom err = __walk_page_range(start_addr, end_addr, &walk); 629ecaad8acSThomas Hellstrom if (err) 630ecaad8acSThomas Hellstrom break; 631ecaad8acSThomas Hellstrom } 632ecaad8acSThomas Hellstrom 633ecaad8acSThomas Hellstrom return err; 634ecaad8acSThomas Hellstrom } 635