#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pte_t *pte;
        int err = 0;

        pte = pte_offset_map(pmd, addr);
        for (;;) {
                err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
                if (err)
                        break;
                addr += PAGE_SIZE;
                if (addr == end)
                        break;
                pte++;
        }

        pte_unmap(pte);
        return err;
}

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pmd_t *pmd;
        unsigned long next;
        int err = 0;

        pmd = pmd_offset(pud, addr);
        do {
again:
                next = pmd_addr_end(addr, end);
                if (pmd_none(*pmd) || !walk->vma) {
                        if (walk->pte_hole)
                                err = walk->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }
                /*
                 * This implies that each ->pmd_entry() handler
                 * needs to know about pmd_trans_huge() pmds
                 */
                if (walk->pmd_entry)
                        err = walk->pmd_entry(pmd, addr, next, walk);
                if (err)
                        break;

                /*
                 * Check this here so we only break down trans_huge
                 * pages when we _need_ to
                 */
                if (!walk->pte_entry)
                        continue;

                split_huge_pmd(walk->vma, pmd, addr);
                if (pmd_trans_unstable(pmd))
                        goto again;
                err = walk_pte_range(pmd, addr, next, walk);
                if (err)
                        break;
        } while (pmd++, addr = next, addr != end);

        return err;
}
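/*
 * Illustrative sketch, not part of the original file: as the comment in
 * walk_pmd_range() notes, ->pmd_entry() is called before any THP split,
 * so a handler must recognize huge pmds itself. A hypothetical handler
 * (assuming CONFIG_TRANSPARENT_HUGEPAGE and a counter passed through
 * walk->private) might look like this:
 */
static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
                             unsigned long end, struct mm_walk *walk)
{
        unsigned long *nr_pages = walk->private;       /* hypothetical counter */
        spinlock_t *ptl = pmd_trans_huge_lock(pmd, walk->vma);

        if (ptl) {
                /* a stable huge pmd: account it as a whole */
                *nr_pages += HPAGE_PMD_NR;
                spin_unlock(ptl);
                return 0;
        }
        /* normal pmds fall through to ->pte_entry(), if one is set */
        return 0;
}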
static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pud_t *pud;
        unsigned long next;
        int err = 0;

        pud = pud_offset(p4d, addr);
        do {
again:
                next = pud_addr_end(addr, end);
                if (pud_none(*pud) || !walk->vma) {
                        if (walk->pte_hole)
                                err = walk->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }

                if (walk->pud_entry) {
                        spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);

                        if (ptl) {
                                err = walk->pud_entry(pud, addr, next, walk);
                                spin_unlock(ptl);
                                if (err)
                                        break;
                                continue;
                        }
                }

                split_huge_pud(walk->vma, pud, addr);
                if (pud_none(*pud))
                        goto again;

                if (walk->pmd_entry || walk->pte_entry)
                        err = walk_pmd_range(pud, addr, next, walk);
                if (err)
                        break;
        } while (pud++, addr = next, addr != end);

        return err;
}

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        p4d_t *p4d;
        unsigned long next;
        int err = 0;

        p4d = p4d_offset(pgd, addr);
        do {
                next = p4d_addr_end(addr, end);
                if (p4d_none_or_clear_bad(p4d)) {
                        if (walk->pte_hole)
                                err = walk->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }
                if (walk->pmd_entry || walk->pte_entry)
                        err = walk_pud_range(p4d, addr, next, walk);
                if (err)
                        break;
        } while (p4d++, addr = next, addr != end);

        return err;
}
static int walk_pgd_range(unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pgd_t *pgd;
        unsigned long next;
        int err = 0;

        pgd = pgd_offset(walk->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd)) {
                        if (walk->pte_hole)
                                err = walk->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }
                if (walk->pmd_entry || walk->pte_entry)
                        err = walk_p4d_range(pgd, addr, next, walk);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);

        return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
                                       unsigned long end)
{
        unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
        return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        struct hstate *h = hstate_vma(vma);
        unsigned long next;
        unsigned long hmask = huge_page_mask(h);
        pte_t *pte;
        int err = 0;

        do {
                next = hugetlb_entry_end(h, addr, end);
                pte = huge_pte_offset(walk->mm, addr & hmask);
                if (pte && walk->hugetlb_entry)
                        err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
                if (err)
                        break;
        } while (addr = next, addr != end);

        return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */
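/*
 * Illustrative sketch, not part of the original file: a minimal
 * ->hugetlb_entry() callback as invoked by walk_hugetlb_range() above,
 * once per huge page in [addr, next). Assumes CONFIG_HUGETLB_PAGE; the
 * counter passed via walk->private is hypothetical.
 */
static int example_hugetlb_entry(pte_t *pte, unsigned long hmask,
                                 unsigned long addr, unsigned long next,
                                 struct mm_walk *walk)
{
        unsigned long *nr_present = walk->private;
        pte_t entry = huge_ptep_get(pte);

        if (pte_present(entry))
                *nr_present += (next - addr) >> PAGE_SHIFT;
        return 0;
}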
/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. A negative return value
 * means an error, in which case we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
                        struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;

        if (walk->test_walk)
                return walk->test_walk(start, end, walk);

        /*
         * A vma with VM_PFNMAP has no valid struct pages behind its range,
         * so we don't walk over it as we do for normal vmas. However, some
         * callers are interested in handling hole ranges and don't want any
         * single address range to be simply ignored. Such callers define
         * their ->pte_hole() callbacks, so we delegate VM_PFNMAP vmas to
         * them.
         */
        if (vma->vm_flags & VM_PFNMAP) {
                int err = 1;
                if (walk->pte_hole)
                        err = walk->pte_hole(start, end, walk);
                return err ? err : 1;
        }
        return 0;
}
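/*
 * Illustrative sketch, not part of the original file: a ->test_walk()
 * callback following the protocol documented above -- return 1 to skip
 * the vma, 0 to walk it, or a negative value to abort the walk. This
 * hypothetical filter skips mlock()ed vmas:
 */
static int example_test_walk(unsigned long start, unsigned long end,
                             struct mm_walk *walk)
{
        if (walk->vma->vm_flags & VM_LOCKED)
                return 1;       /* skip this vma, keep walking the rest */
        return 0;               /* walk this vma */
}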
static int __walk_page_range(unsigned long start, unsigned long end,
                        struct mm_walk *walk)
{
        int err = 0;
        struct vm_area_struct *vma = walk->vma;

        if (vma && is_vm_hugetlb_page(vma)) {
                if (walk->hugetlb_entry)
                        err = walk_hugetlb_range(start, end, walk);
        } else
                err = walk_pgd_range(start, end, walk);

        return err;
}

/**
 * walk_page_range - walk page table with caller specific callbacks
 *
 * Recursively walk the page table tree of the process represented by
 * @walk->mm within the virtual address range [@start, @end). During
 * walking, we can do caller-specific work for each entry by setting up
 * pmd_entry(), pte_entry(), and/or hugetlb_entry(). If you don't set
 * some of these callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined as follows:
 *  - 0  : handled the current entry successfully; if the end address has
 *         not been reached yet, continue to walk.
 *  - >0 : handled the current entry successfully; return to the caller
 *         with a caller-specific value.
 *  - <0 : failed to handle the current entry; return to the caller
 *         with an error code.
 *
 * Before starting to walk the page tables, some callers want to check
 * whether they really want to walk over the current vma, typically by
 * checking its vm_flags. walk_page_test() and @walk->test_walk() are
 * used for this purpose.
 *
 * struct mm_walk keeps current values of some common data like vma and
 * pmd, which are useful for access from callbacks. If you want to pass
 * some caller-specific data to callbacks, @walk->private should be
 * helpful. (An illustrative usage sketch appears at the end of this
 * file.)
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold
 *   @walk->mm->mmap_sem, because these functions traverse the vma list
 *   and/or access vma data.
 */
int walk_page_range(unsigned long start, unsigned long end,
                    struct mm_walk *walk)
{
        int err = 0;
        unsigned long next;
        struct vm_area_struct *vma;

        if (start >= end)
                return -EINVAL;

        if (!walk->mm)
                return -EINVAL;

        VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);

        vma = find_vma(walk->mm, start);
        do {
                if (!vma) { /* after the last vma */
                        walk->vma = NULL;
                        next = end;
                } else if (start < vma->vm_start) { /* outside vma */
                        walk->vma = NULL;
                        next = min(end, vma->vm_start);
                } else { /* inside vma */
                        walk->vma = vma;
                        next = min(end, vma->vm_end);
                        vma = vma->vm_next;

                        err = walk_page_test(start, next, walk);
                        if (err > 0) {
                                /*
                                 * positive return values are purely for
                                 * controlling the pagewalk, so should never
                                 * be passed to the callers.
                                 */
                                err = 0;
                                continue;
                        }
                        if (err < 0)
                                break;
                }
                if (walk->vma || walk->pte_hole)
                        err = __walk_page_range(start, next, walk);
                if (err)
                        break;
        } while (start = next, start < end);
        return err;
}
int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk)
{
        int err;

        if (!walk->mm)
                return -EINVAL;

        VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
        VM_BUG_ON(!vma);
        walk->vma = vma;
        err = walk_page_test(vma->vm_start, vma->vm_end, walk);
        if (err > 0)
                return 0;
        if (err < 0)
                return err;
        return __walk_page_range(vma->vm_start, vma->vm_end, walk);
}
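/*
 * Illustrative usage sketch, not part of the original file: count the
 * present pages in a range of an address space with a ->pte_entry()
 * callback, per the walk_page_range() kernel-doc above. All names here
 * are hypothetical.
 */
static int count_pte_entry(pte_t *pte, unsigned long addr,
                           unsigned long end, struct mm_walk *walk)
{
        unsigned long *nr_present = walk->private;

        if (pte_present(*pte))
                (*nr_present)++;
        return 0;
}

static unsigned long count_present_pages(struct mm_struct *mm,
                                         unsigned long start,
                                         unsigned long end)
{
        unsigned long nr_present = 0;
        struct mm_walk count_walk = {
                .pte_entry = count_pte_entry,
                .mm        = mm,
                .private   = &nr_present,
        };

        /* walk_page_range() requires mmap_sem to be held */
        down_read(&mm->mmap_sem);
        walk_page_range(start, end, &count_walk);
        up_read(&mm->mmap_sem);
        return nr_present;
}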