#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pte_t *pte;
        int err = 0;

        pte = pte_offset_map(pmd, addr);
        for (;;) {
                err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
                if (err)
                        break;
                addr += PAGE_SIZE;
                if (addr == end)
                        break;
                pte++;
        }

        pte_unmap(pte);
        return err;
}

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pmd_t *pmd;
        unsigned long next;
        int err = 0;

        pmd = pmd_offset(pud, addr);
        do {
again:
                next = pmd_addr_end(addr, end);
                if (pmd_none(*pmd) || !walk->vma) {
                        if (walk->pte_hole)
                                err = walk->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }
                /*
                 * This implies that each ->pmd_entry() handler
                 * needs to know about pmd_trans_huge() pmds
                 */
                if (walk->pmd_entry)
                        err = walk->pmd_entry(pmd, addr, next, walk);
                if (err)
                        break;

                /*
                 * Check this here so we only break down trans_huge
                 * pages when we _need_ to
                 */
                if (!walk->pte_entry)
                        continue;

                split_huge_pmd(walk->vma, pmd, addr);
                if (pmd_trans_unstable(pmd))
                        goto again;
                err = walk_pte_range(pmd, addr, next, walk);
                if (err)
                        break;
        } while (pmd++, addr = next, addr != end);

        return err;
}

static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pud_t *pud;
        unsigned long next;
        int err = 0;

        pud = pud_offset(pgd, addr);
        do {
again:
                next = pud_addr_end(addr, end);
                if (pud_none(*pud) || !walk->vma) {
                        if (walk->pte_hole)
                                err = walk->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }

                if (walk->pud_entry) {
                        spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);

                        if (ptl) {
                                err = walk->pud_entry(pud, addr, next, walk);
                                spin_unlock(ptl);
                                if (err)
                                        break;
                                continue;
                        }
                }

                split_huge_pud(walk->vma, pud, addr);
                if (pud_none(*pud))
                        goto again;

                if (walk->pmd_entry || walk->pte_entry)
                        err = walk_pmd_range(pud, addr, next, walk);
                if (err)
                        break;
        } while (pud++, addr = next, addr != end);

        return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pgd_t *pgd;
        unsigned long next;
        int err = 0;

        pgd = pgd_offset(walk->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd)) {
                        if (walk->pte_hole)
                                err = walk->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }
                if (walk->pmd_entry || walk->pte_entry)
                        err = walk_pud_range(pgd, addr, next, walk);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);

        return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
                                       unsigned long end)
{
        unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
        return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        struct hstate *h = hstate_vma(vma);
        unsigned long next;
        unsigned long hmask = huge_page_mask(h);
        pte_t *pte;
        int err = 0;

        do {
                next = hugetlb_entry_end(h, addr, end);
                pte = huge_pte_offset(walk->mm, addr & hmask);
                if (pte && walk->hugetlb_entry)
                        err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
                if (err)
                        break;
        } while (addr = next, addr != end);

        return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. A negative value means
 * an error, in which case we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
                          struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;

        if (walk->test_walk)
                return walk->test_walk(start, end, walk);

        /*
         * A vma with VM_PFNMAP doesn't have any valid struct pages behind
         * its range, so we don't walk over it as we do for normal vmas.
         * However, some callers are interested in handling holes and don't
         * want any address range to be silently skipped. Such users define
         * their ->pte_hole() callbacks, so let's delegate the VM_PFNMAP
         * range to them.
         */
        if (vma->vm_flags & VM_PFNMAP) {
                int err = 1;

                if (walk->pte_hole)
                        err = walk->pte_hole(start, end, walk);
                return err ? err : 1;
        }
        return 0;
}

static int __walk_page_range(unsigned long start, unsigned long end,
                             struct mm_walk *walk)
{
        int err = 0;
        struct vm_area_struct *vma = walk->vma;

        if (vma && is_vm_hugetlb_page(vma)) {
                if (walk->hugetlb_entry)
                        err = walk_hugetlb_range(start, end, walk);
        } else
                err = walk_pgd_range(start, end, walk);

        return err;
}

/**
 * walk_page_range - walk page table with caller specific callbacks
 *
 * Recursively walk the page table tree of the process represented by @walk->mm
 * within the virtual address range [@start, @end). During the walk, we can do
 * caller-specific work for each entry by setting up pmd_entry(), pte_entry(),
 * and/or hugetlb_entry(). If you don't set up one of these callbacks, the
 * associated entries/pages are simply ignored.
 * The return values of these callbacks are commonly defined as follows:
 *  - 0  : handled the current entry successfully; if the end address has
 *         not been reached yet, continue the walk.
 *  - >0 : handled the current entry successfully, and return to the caller
 *         with a caller-specific value.
 *  - <0 : failed to handle the current entry, and return to the caller
 *         with an error code.
 *
 * Before starting to walk the page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @walk->test_walk() are used for this
 * purpose.
 *
 * struct mm_walk keeps the current values of some common data like vma and
 * pmd, which are useful for access from the callbacks. If you want to pass
 * some caller-specific data to the callbacks, @walk->private should be
 * helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold
 *   @walk->mm->mmap_sem, because these functions traverse the vma list
 *   and/or access vma fields.
 */
int walk_page_range(unsigned long start, unsigned long end,
                    struct mm_walk *walk)
{
        int err = 0;
        unsigned long next;
        struct vm_area_struct *vma;

        if (start >= end)
                return -EINVAL;

        if (!walk->mm)
                return -EINVAL;

        VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);

        vma = find_vma(walk->mm, start);
        do {
                if (!vma) { /* after the last vma */
                        walk->vma = NULL;
                        next = end;
                } else if (start < vma->vm_start) { /* outside vma */
                        walk->vma = NULL;
                        next = min(end, vma->vm_start);
                } else { /* inside vma */
                        walk->vma = vma;
                        next = min(end, vma->vm_end);
                        vma = vma->vm_next;

                        err = walk_page_test(start, next, walk);
                        if (err > 0) {
                                /*
                                 * positive return values are purely for
                                 * controlling the pagewalk, so should never
                                 * be passed to the callers.
                                 */
                                err = 0;
                                continue;
                        }
                        if (err < 0)
                                break;
                }
                if (walk->vma || walk->pte_hole)
                        err = __walk_page_range(start, next, walk);
                if (err)
                        break;
        } while (start = next, start < end);
        return err;
}

int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk)
{
        int err;

        if (!walk->mm)
                return -EINVAL;

        VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
        VM_BUG_ON(!vma);
        walk->vma = vma;
        err = walk_page_test(vma->vm_start, vma->vm_end, walk);
        if (err > 0)
                return 0;
        if (err < 0)
                return err;
        return __walk_page_range(vma->vm_start, vma->vm_end, walk);
}
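
/*
 * Illustrative sketch only, not part of this file and not compiled: a
 * minimal example of how a caller might use this API, counting the present
 * ptes mapped in a single vma via walk_page_vma(). The helper names
 * count_pte_entry() and count_present_ptes() are hypothetical; only the
 * ->pte_entry callback and ->private pointer of struct mm_walk are
 * exercised, and the caller is assumed to hold mmap_sem (at least for
 * read), as required by the locking rules documented above.
 */
#if 0 /* example only */
static int count_pte_entry(pte_t *pte, unsigned long addr,
                           unsigned long next, struct mm_walk *walk)
{
        unsigned long *nr_present = walk->private;

        if (pte_present(*pte))
                (*nr_present)++;
        return 0;       /* keep walking */
}

static unsigned long count_present_ptes(struct vm_area_struct *vma)
{
        unsigned long nr_present = 0;
        struct mm_walk count_walk = {
                .pte_entry      = count_pte_entry,
                .mm             = vma->vm_mm,
                .private        = &nr_present,
        };

        /* mmap_sem must be held by the caller */
        walk_page_vma(vma, &count_walk);
        return nr_present;
}
#endif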