// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

/*
 * We want to know the real level where an entry is located, ignoring any
 * folding of levels which may be happening. For example, if p4d is folded then
 * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
 */
static int real_depth(int depth)
{
	if (depth == 3 && PTRS_PER_PMD == 1)
		depth = 2;
	if (depth == 2 && PTRS_PER_PUD == 1)
		depth = 1;
	if (depth == 1 && PTRS_PER_P4D == 1)
		depth = 0;
	return depth;
}

static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	for (;;) {
		err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
		if (err)
			break;
		if (addr >= end - PAGE_SIZE)
			break;
		addr += PAGE_SIZE;
		pte++;
	}
	return err;
}

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pte_t *pte;
	int err = 0;
	spinlock_t *ptl;

	if (walk->no_vma) {
		pte = pte_offset_map(pmd, addr);
		err = walk_pte_range_inner(pte, addr, end, walk);
		pte_unmap(pte);
	} else {
		pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
		err = walk_pte_range_inner(pte, addr, end, walk);
		pte_unmap_unlock(pte, ptl);
	}

	return err;
}

#ifdef CONFIG_ARCH_HAS_HUGEPD
static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
			     unsigned long end, struct mm_walk *walk, int pdshift)
{
	int err = 0;
	const struct mm_walk_ops *ops = walk->ops;
	int shift = hugepd_shift(*phpd);
	int page_size = 1 << shift;

	if (!ops->pte_entry)
		return 0;

	if (addr & (page_size - 1))
		return 0;

	for (;;) {
		pte_t *pte;

		spin_lock(&walk->mm->page_table_lock);
		pte = hugepte_offset(*phpd, addr, pdshift);
		err = ops->pte_entry(pte, addr, addr + page_size, walk);
		spin_unlock(&walk->mm->page_table_lock);

		if (err)
			break;
		if (addr >= end - page_size)
			break;
		addr += page_size;
	}
	return err;
}
#else
static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
			     unsigned long end, struct mm_walk *walk, int pdshift)
{
	return 0;
}
#endif

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(3);

	pmd = pmd_offset(pud, addr);
	do {
again:
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd) || (!walk->vma && !walk->no_vma)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}

		walk->action = ACTION_SUBTREE;

		/*
		 * This implies that each ->pmd_entry() handler
		 * needs to know about pmd_trans_huge() pmds
		 */
		if (ops->pmd_entry)
			err = ops->pmd_entry(pmd, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

		/*
		 * Check this here so we only break down trans_huge
		 * pages when we _need_ to
		 */
		if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
		    walk->action == ACTION_CONTINUE ||
		    !(ops->pte_entry))
			continue;

		if (walk->vma) {
			split_huge_pmd(walk->vma, pmd, addr);
			if (pmd_trans_unstable(pmd))
				goto again;
		}

		if (is_hugepd(__hugepd(pmd_val(*pmd))))
			err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
		else
			err = walk_pte_range(pmd, addr, next, walk);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);

	return err;
}

static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pud_t *pud;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(2);

	pud = pud_offset(p4d, addr);
	do {
again:
		next = pud_addr_end(addr, end);
		if (pud_none(*pud) || (!walk->vma && !walk->no_vma)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}

		walk->action = ACTION_SUBTREE;

		if (ops->pud_entry)
			err = ops->pud_entry(pud, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

		if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
		    walk->action == ACTION_CONTINUE ||
		    !(ops->pmd_entry || ops->pte_entry))
			continue;

		if (walk->vma)
			split_huge_pud(walk->vma, pud, addr);
		if (pud_none(*pud))
			goto again;

		if (is_hugepd(__hugepd(pud_val(*pud))))
			err = walk_hugepd_range((hugepd_t *)pud, addr, next, walk, PUD_SHIFT);
		else
			err = walk_pmd_range(pud, addr, next, walk);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);

	return err;
}

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	p4d_t *p4d;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(1);

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}
		if (ops->p4d_entry) {
			err = ops->p4d_entry(p4d, addr, next, walk);
			if (err)
				break;
		}
		if (is_hugepd(__hugepd(p4d_val(*p4d))))
			err = walk_hugepd_range((hugepd_t *)p4d, addr, next, walk, P4D_SHIFT);
		else if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
			err = walk_pud_range(p4d, addr, next, walk);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);

	return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pgd_t *pgd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	if (walk->pgd)
		pgd = walk->pgd + pgd_index(addr);
	else
		pgd = pgd_offset(walk->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, 0, walk);
			if (err)
				break;
			continue;
		}
		if (ops->pgd_entry) {
			err = ops->pgd_entry(pgd, addr, next, walk);
			if (err)
				break;
		}
		if (is_hugepd(__hugepd(pgd_val(*pgd))))
			err = walk_hugepd_range((hugepd_t *)pgd, addr, next, walk, PGDIR_SHIFT);
		else if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry)
			err = walk_p4d_range(pgd, addr, next, walk);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
	return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct hstate *h = hstate_vma(vma);
	unsigned long next;
	unsigned long hmask = huge_page_mask(h);
	unsigned long sz = huge_page_size(h);
	pte_t *pte;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	do {
		next = hugetlb_entry_end(h, addr, end);
		pte = huge_pte_offset(walk->mm, addr & hmask, sz);

		if (pte)
			err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
		else if (ops->pte_hole)
			err = ops->pte_hole(addr, next, -1, walk);

		if (err)
			break;
	} while (addr = next, addr != end);

	return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. Negative values mean an
 * error, in which case we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
			struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (ops->test_walk)
		return ops->test_walk(start, end, walk);

	/*
	 * A vma with VM_PFNMAP doesn't have any valid struct pages behind the
	 * mapped range, so we don't walk over it as we do for normal vmas.
	 * However, some callers are interested in handling hole ranges and
	 * don't want to just ignore any single address range. Such users
	 * certainly define their ->pte_hole() callbacks, so let's delegate
	 * vma(VM_PFNMAP) handling to them.
	 */
	if (vma->vm_flags & VM_PFNMAP) {
		int err = 1;

		if (ops->pte_hole)
			err = ops->pte_hole(start, end, -1, walk);
		return err ? err : 1;
	}
	return 0;
}

static int __walk_page_range(unsigned long start, unsigned long end,
		struct mm_walk *walk)
{
	int err = 0;
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (vma && ops->pre_vma) {
		err = ops->pre_vma(start, end, walk);
		if (err)
			return err;
	}

	if (vma && is_vm_hugetlb_page(vma)) {
		if (ops->hugetlb_entry)
			err = walk_hugetlb_range(start, end, walk);
	} else
		err = walk_pgd_range(start, end, walk);

	if (vma && ops->post_vma)
		ops->post_vma(walk);

	return err;
}

/**
 * walk_page_range - walk page table with caller specific callbacks
 * @mm:		mm_struct representing the target process of page table walk
 * @start:	start address of the virtual address range
 * @end:	end address of the virtual address range
 * @ops:	operation to call during the walk
 * @private:	private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up some of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined as follows:
 *
 *  - 0  : succeeded in handling the current entry; if the end address has
 *         not been reached yet, continue the walk.
 *  - >0 : succeeded in handling the current entry; return to the caller
 *         with a caller-specific value.
 *  - <0 : failed to handle the current entry; return to the caller
 *         with an error code.
 *
 * Before starting to walk the page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * If operations need to be staged before and committed after a vma is walked,
 * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
 * since it is intended to handle commit-type operations, can't return any
 * errors.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for access from the callbacks. If you want to pass some
 * caller-specific data to the callbacks, @private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
 *   because these functions traverse the vma list and/or access the vma's data.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
		unsigned long end, const struct mm_walk_ops *ops,
		void *private)
{
	int err = 0;
	unsigned long next;
	struct vm_area_struct *vma;
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= mm,
		.private	= private,
	};

	if (start >= end)
		return -EINVAL;

	if (!walk.mm)
		return -EINVAL;

	mmap_assert_locked(walk.mm);

	vma = find_vma(walk.mm, start);
	do {
		if (!vma) { /* after the last vma */
			walk.vma = NULL;
			next = end;
		} else if (start < vma->vm_start) { /* outside vma */
			walk.vma = NULL;
			next = min(end, vma->vm_start);
		} else { /* inside vma */
			walk.vma = vma;
			next = min(end, vma->vm_end);
			vma = find_vma(mm, vma->vm_end);

			err = walk_page_test(start, next, &walk);
			if (err > 0) {
				/*
				 * positive return values are purely for
				 * controlling the pagewalk, so should never
				 * be passed to the callers.
				 */
				err = 0;
				continue;
			}
			if (err < 0)
				break;
		}
		if (walk.vma || walk.ops->pte_hole)
			err = __walk_page_range(start, next, &walk);
		if (err)
			break;
	} while (start = next, start < end);
	return err;
}
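
/*
 * Illustrative sketch only (not part of the original pagewalk code): a
 * minimal user of the API documented above. The callback, the ops table and
 * the helper below are hypothetical names introduced purely for illustration.
 * It counts the PTEs that are currently present in [start, end) of @mm,
 * passing its result back through the ->private pointer described in the
 * kerneldoc.
 */
static int count_pte_entry(pte_t *pte, unsigned long addr,
			   unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (pte_present(*pte))
		(*count)++;
	return 0;	/* 0: entry handled, keep walking */
}

static const struct mm_walk_ops count_walk_ops = {
	.pte_entry	= count_pte_entry,
};

static unsigned long count_present_ptes(struct mm_struct *mm,
					unsigned long start, unsigned long end)
{
	unsigned long count = 0;

	/* walk_page_range() asserts that mm->mmap_lock is held */
	mmap_read_lock(mm);
	walk_page_range(mm, start, end, &count_walk_ops, &count);
	mmap_read_unlock(mm);

	return count;
}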

/**
 * walk_page_range_novma - walk a range of pagetables not backed by a vma
 * @mm:		mm_struct representing the target process of page table walk
 * @start:	start address of the virtual address range
 * @end:	end address of the virtual address range
 * @ops:	operation to call during the walk
 * @pgd:	pgd to walk if different from mm->pgd
 * @private:	private data for callbacks' usage
 *
 * Similar to walk_page_range() but can walk any page tables even if they are
 * not backed by VMAs. Because 'unusual' entries may be walked, this function
 * will also not lock the PTEs for the pte_entry() callback. This is useful
 * for walking the kernel page tables or page tables for firmware.
 */
int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
			  unsigned long end, const struct mm_walk_ops *ops,
			  pgd_t *pgd,
			  void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= mm,
		.pgd		= pgd,
		.private	= private,
		.no_vma		= true
	};

	if (start >= end || !walk.mm)
		return -EINVAL;

	mmap_assert_locked(walk.mm);

	return __walk_page_range(start, end, &walk);
}
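
/*
 * Illustrative sketch only (not part of the original pagewalk code), showing
 * one way the no-VMA walker above might be used; all names are hypothetical.
 * It counts PMD-level leaf (huge) mappings in a kernel virtual address range
 * by supplying only a pmd_entry() callback, so the walk never descends to
 * individual PTEs. This version of the walker merely asserts that
 * mm->mmap_lock is held, so a read lock on init_mm is assumed to be enough
 * for this sketch.
 */
static int kernel_pmd_entry(pmd_t *pmd, unsigned long addr,
			    unsigned long next, struct mm_walk *walk)
{
	unsigned long *nr_leaves = walk->private;

	if (pmd_leaf(*pmd))
		(*nr_leaves)++;
	return 0;
}

static const struct mm_walk_ops kernel_pmd_ops = {
	.pmd_entry	= kernel_pmd_entry,
};

static unsigned long count_kernel_pmd_leaves(unsigned long start,
					     unsigned long end)
{
	unsigned long nr_leaves = 0;

	mmap_read_lock(&init_mm);
	walk_page_range_novma(&init_mm, start, end, &kernel_pmd_ops,
			      NULL, &nr_leaves);
	mmap_read_unlock(&init_mm);

	return nr_leaves;
}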

int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
		void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= vma->vm_mm,
		.vma		= vma,
		.private	= private,
	};
	int err;

	if (!walk.mm)
		return -EINVAL;

	mmap_assert_locked(walk.mm);

	err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
	if (err > 0)
		return 0;
	if (err < 0)
		return err;
	return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}

/**
 * walk_page_mapping - walk all memory areas mapped into a struct address_space.
 * @mapping:	Pointer to the struct address_space
 * @first_index: First page offset in the address_space
 * @nr:		Number of incremental page offsets to cover
 * @ops:	operation to call during the walk
 * @private:	private data for callbacks' usage
 *
 * This function walks all memory areas mapped into a struct address_space.
 * The walk is limited to only the given page-size index range, but if
 * the index boundaries cross a huge page-table entry, that entry will be
 * included.
 *
 * Also see walk_page_range() for additional information.
 *
 * Locking:
 *   This function can't require that the struct mm_struct::mmap_lock is held,
 *   since @mapping may be mapped by multiple processes. Instead
 *   @mapping->i_mmap_rwsem must be held. This might have implications in the
 *   callbacks, and it's up to the caller to ensure that the
 *   struct mm_struct::mmap_lock is not needed.
 *
 *   Also this means that a caller can't rely on the struct
 *   vm_area_struct::vm_flags to be constant across a call,
 *   except for immutable flags. Callers requiring this shouldn't use
 *   this function.
 *
 * Return: 0 on success, negative error code on failure, positive number on
 * caller-defined premature termination.
 */
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
		      pgoff_t nr, const struct mm_walk_ops *ops,
		      void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.private	= private,
	};
	struct vm_area_struct *vma;
	pgoff_t vba, vea, cba, cea;
	unsigned long start_addr, end_addr;
	int err = 0;

	lockdep_assert_held(&mapping->i_mmap_rwsem);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
				  first_index + nr - 1) {
		/* Clip to the vma */
		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma);
		cba = first_index;
		cba = max(cba, vba);
		cea = first_index + nr;
		cea = min(cea, vea);

		start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
		end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
		if (start_addr >= end_addr)
			continue;

		walk.vma = vma;
		walk.mm = vma->vm_mm;

		err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
		if (err > 0) {
			err = 0;
			break;
		} else if (err < 0)
			break;

		err = __walk_page_range(start_addr, end_addr, &walk);
		if (err)
			break;
	}

	return err;
}
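
/*
 * Illustrative sketch only (not part of the original pagewalk code): a
 * hypothetical user of walk_page_mapping() that counts how many PTEs
 * currently map a range of a file across all processes sharing @mapping.
 * Per the locking rules above, the caller takes i_mmap_rwsem; read mode is
 * assumed to satisfy the lockdep assertion in walk_page_mapping().
 */
static int mapping_pte_entry(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	unsigned long *nr_mapped = walk->private;

	if (pte_present(*pte))
		(*nr_mapped)++;
	return 0;
}

static const struct mm_walk_ops mapping_count_ops = {
	.pte_entry	= mapping_pte_entry,
};

static unsigned long count_file_range_mappings(struct address_space *mapping,
					       pgoff_t first_index, pgoff_t nr)
{
	unsigned long nr_mapped = 0;

	i_mmap_lock_read(mapping);
	walk_page_mapping(mapping, first_index, nr, &mapping_count_ops,
			  &nr_mapped);
	i_mmap_unlock_read(mapping);

	return nr_mapped;
}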