#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pte_t *pte;
        int err = 0;

        pte = pte_offset_map(pmd, addr);
        for (;;) {
                err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
                if (err)
                        break;
                addr += PAGE_SIZE;
                if (addr == end)
                        break;
                pte++;
        }

        pte_unmap(pte);
        return err;
}
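
/*
 * Example: a minimal ->pte_entry() callback matching the per-pte call in
 * walk_pte_range() above. This is only an illustrative sketch, not part of
 * this file; the function name and the private counter are hypothetical.
 *
 *      static int count_present_pte(pte_t *pte, unsigned long addr,
 *                                   unsigned long end, struct mm_walk *walk)
 *      {
 *              unsigned long *nr_present = walk->private;
 *
 *              if (pte_present(*pte))
 *                      (*nr_present)++;
 *              return 0;       (0 means: keep walking)
 *      }
 */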

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pmd_t *pmd;
        unsigned long next;
        int err = 0;

        pmd = pmd_offset(pud, addr);
        do {
again:
                next = pmd_addr_end(addr, end);
                if (pmd_none(*pmd) || !walk->vma) {
                        if (walk->pte_hole)
                                err = walk->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }
                /*
                 * This implies that each ->pmd_entry() handler
                 * needs to know about pmd_trans_huge() pmds
                 */
                if (walk->pmd_entry)
                        err = walk->pmd_entry(pmd, addr, next, walk);
                if (err)
                        break;

                /*
                 * Check this here so we only break down trans_huge
                 * pages when we _need_ to
                 */
                if (!walk->pte_entry)
                        continue;

                split_huge_pmd(walk->vma, pmd, addr);
                if (pmd_trans_unstable(pmd))
                        goto again;
                err = walk_pte_range(pmd, addr, next, walk);
                if (err)
                        break;
        } while (pmd++, addr = next, addr != end);

        return err;
}

static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pud_t *pud;
        unsigned long next;
        int err = 0;

        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud)) {
                        if (walk->pte_hole)
                                err = walk->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }
                if (walk->pmd_entry || walk->pte_entry)
                        err = walk_pmd_range(pud, addr, next, walk);
                if (err)
                        break;
        } while (pud++, addr = next, addr != end);

        return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pgd_t *pgd;
        unsigned long next;
        int err = 0;

        pgd = pgd_offset(walk->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd)) {
                        if (walk->pte_hole)
                                err = walk->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }
                if (walk->pmd_entry || walk->pte_entry)
                        err = walk_pud_range(pgd, addr, next, walk);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);

        return err;
}
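
/*
 * Example: a ->pmd_entry() handler that is aware of pmd_trans_huge() pmds,
 * as the comment in walk_pmd_range() requires. A hedged sketch only; the
 * function name and the private counter are hypothetical.
 *
 *      static int count_huge_pmds(pmd_t *pmd, unsigned long addr,
 *                                 unsigned long end, struct mm_walk *walk)
 *      {
 *              unsigned long *nr_thp = walk->private;
 *
 *              if (pmd_trans_huge(*pmd))
 *                      (*nr_thp)++;
 *              return 0;
 *      }
 *
 * A real handler would also take the pmd lock before inspecting a huge pmd;
 * this sketch only illustrates the calling convention.
 */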

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
                                       unsigned long end)
{
        unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
        return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        struct hstate *h = hstate_vma(vma);
        unsigned long next;
        unsigned long hmask = huge_page_mask(h);
        pte_t *pte;
        int err = 0;

        do {
                next = hugetlb_entry_end(h, addr, end);
                pte = huge_pte_offset(walk->mm, addr & hmask);
                if (pte && walk->hugetlb_entry)
                        err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
                if (err)
                        break;
        } while (addr = next, addr != end);

        return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */
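
/*
 * Example: a minimal ->hugetlb_entry() callback matching the call in
 * walk_hugetlb_range() above (a sketch; the name and the private counter
 * are hypothetical). @hmask is the huge page mask, so each invocation
 * covers one whole huge page.
 *
 *      static int count_mapped_hugepages(pte_t *pte, unsigned long hmask,
 *                                        unsigned long addr, unsigned long end,
 *                                        struct mm_walk *walk)
 *      {
 *              unsigned long *nr_mapped = walk->private;
 *
 *              if (!huge_pte_none(huge_ptep_get(pte)))
 *                      (*nr_mapped)++;
 *              return 0;
 *      }
 */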

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we walk over the
 * current vma, and return 1 if we skip the vma. A negative return value
 * means an error, which aborts the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
                          struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;

        if (walk->test_walk)
                return walk->test_walk(start, end, walk);

        /*
         * A vma with VM_PFNMAP has no valid struct pages behind its range,
         * so we don't walk over it as we do for normal vmas. However, some
         * callers are interested in handling hole ranges and don't want to
         * just ignore any single address range. Such users certainly define
         * their ->pte_hole() callbacks, so let's delegate handling of
         * vma(VM_PFNMAP) to them.
         */
        if (vma->vm_flags & VM_PFNMAP) {
                int err = 1;
                if (walk->pte_hole)
                        err = walk->pte_hole(start, end, walk);
                return err ? err : 1;
        }
        return 0;
}

static int __walk_page_range(unsigned long start, unsigned long end,
                             struct mm_walk *walk)
{
        int err = 0;
        struct vm_area_struct *vma = walk->vma;

        if (vma && is_vm_hugetlb_page(vma)) {
                if (walk->hugetlb_entry)
                        err = walk_hugetlb_range(start, end, walk);
        } else
                err = walk_pgd_range(start, end, walk);

        return err;
}
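
/*
 * Example: typical caller setup for walk_page_range() below. A sketch with
 * hypothetical names (count_present_pte is the illustrative callback
 * sketched near walk_pte_range()); mmap_sem must be held as documented.
 *
 *      unsigned long nr_present = 0;
 *      struct mm_walk count_walk = {
 *              .pte_entry = count_present_pte,
 *              .mm        = mm,
 *              .private   = &nr_present,
 *      };
 *
 *      down_read(&mm->mmap_sem);
 *      err = walk_page_range(start, end, &count_walk);
 *      up_read(&mm->mmap_sem);
 */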

/**
 * walk_page_range - walk page table with caller specific callbacks
 *
 * Recursively walk the page table tree of the process represented by @walk->mm
 * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up one of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined like below:
 *  - 0  : the current entry was handled successfully; if the end address has
 *         not been reached yet, continue the walk.
 *  - >0 : the current entry was handled successfully; return to the caller
 *         with this caller-specific value.
 *  - <0 : handling the current entry failed; return to the caller with this
 *         error code.
 *
 * Before starting the walk, some callers want to check whether they really
 * want to walk over the current vma, typically by checking its vm_flags.
 * walk_page_test() and @walk->test_walk() serve this purpose.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for access from the callbacks. If you want to pass some
 * caller-specific data to the callbacks, @walk->private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold
 *   @walk->mm->mmap_sem, because these functions traverse the vma list
 *   and/or access vma data.
 */
int walk_page_range(unsigned long start, unsigned long end,
                    struct mm_walk *walk)
{
        int err = 0;
        unsigned long next;
        struct vm_area_struct *vma;

        if (start >= end)
                return -EINVAL;

        if (!walk->mm)
                return -EINVAL;

        VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);

        vma = find_vma(walk->mm, start);
        do {
                if (!vma) { /* after the last vma */
                        walk->vma = NULL;
                        next = end;
                } else if (start < vma->vm_start) { /* outside vma */
                        walk->vma = NULL;
                        next = min(end, vma->vm_start);
                } else { /* inside vma */
                        walk->vma = vma;
                        next = min(end, vma->vm_end);
                        vma = vma->vm_next;

                        err = walk_page_test(start, next, walk);
                        if (err > 0) {
                                /*
                                 * positive return values are purely for
                                 * controlling the pagewalk, so should never
                                 * be passed to the callers.
                                 */
                                err = 0;
                                continue;
                        }
                        if (err < 0)
                                break;
                }
                if (walk->vma || walk->pte_hole)
                        err = __walk_page_range(start, next, walk);
                if (err)
                        break;
        } while (start = next, start < end);
        return err;
}

int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk)
{
        int err;

        if (!walk->mm)
                return -EINVAL;

        VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
        VM_BUG_ON(!vma);
        walk->vma = vma;
        err = walk_page_test(vma->vm_start, vma->vm_end, walk);
        if (err > 0)
                return 0;
        if (err < 0)
                return err;
        return __walk_page_range(vma->vm_start, vma->vm_end, walk);
}
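
/*
 * Example: a ->test_walk() callback that makes walk_page_vma() or
 * walk_page_range() skip unwanted vmas, following the return convention of
 * walk_page_test(). A sketch only; the name and the VM_LOCKED policy are
 * hypothetical.
 *
 *      static int only_locked_vmas(unsigned long start, unsigned long end,
 *                                  struct mm_walk *walk)
 *      {
 *              if (walk->vma->vm_flags & VM_LOCKED)
 *                      return 0;       (walk this vma)
 *              return 1;               (skip it; not an error)
 *      }
 */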