// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

/*
 * We want to know the real level at which an entry is located, ignoring any
 * folding of levels which may be happening. For example, if p4d is folded
 * then a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
 */
static int real_depth(int depth)
{
	if (depth == 3 && PTRS_PER_PMD == 1)
		depth = 2;
	if (depth == 2 && PTRS_PER_PUD == 1)
		depth = 1;
	if (depth == 1 && PTRS_PER_P4D == 1)
		depth = 0;
	return depth;
}

static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	for (;;) {
		err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
		if (err)
			break;
		if (addr >= end - PAGE_SIZE)
			break;
		addr += PAGE_SIZE;
		pte++;
	}
	return err;
}

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pte_t *pte;
	int err = 0;
	spinlock_t *ptl;

	if (walk->no_vma) {
		pte = pte_offset_map(pmd, addr);
		err = walk_pte_range_inner(pte, addr, end, walk);
		pte_unmap(pte);
	} else {
		pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
		err = walk_pte_range_inner(pte, addr, end, walk);
		pte_unmap_unlock(pte, ptl);
	}

	return err;
}

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(3);

	pmd = pmd_offset(pud, addr);
	do {
again:
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd) || (!walk->vma && !walk->no_vma)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}

		walk->action = ACTION_SUBTREE;

		/*
		 * This implies that each ->pmd_entry() handler
		 * needs to know about pmd_trans_huge() pmds
		 */
		if (ops->pmd_entry)
			err = ops->pmd_entry(pmd, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

		/*
		 * Check this here so we only break down trans_huge
		 * pages when we _need_ to
		 */
		if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
		    walk->action == ACTION_CONTINUE ||
		    !(ops->pte_entry))
			continue;

		if (walk->vma) {
			split_huge_pmd(walk->vma, pmd, addr);
			if (pmd_trans_unstable(pmd))
				goto again;
		}

		err = walk_pte_range(pmd, addr, next, walk);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);

	return err;
}
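
/*
 * Illustrative sketch only, not part of this file's API: the comment in
 * walk_pmd_range() above means every ->pmd_entry() handler must be prepared
 * to see a transparent huge PMD.  A hypothetical handler (the name
 * example_pmd_entry is made up) could handle the huge entry itself and set
 * ACTION_CONTINUE so the walker does not split it and descend to PTEs.
 */
static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	if (pmd_trans_huge(*pmd)) {
		/* Handle the whole huge mapping here (count it, flag it, ...) */
		walk->action = ACTION_CONTINUE;	/* don't descend to PTEs */
		return 0;
	}
	/*
	 * Leaving walk->action at ACTION_SUBTREE lets the walker split the
	 * PMD and call ->pte_entry() on the individual PTEs, if one is set.
	 */
	return 0;
}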

static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pud_t *pud;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(2);

	pud = pud_offset(p4d, addr);
	do {
again:
		next = pud_addr_end(addr, end);
		if (pud_none(*pud) || (!walk->vma && !walk->no_vma)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}

		walk->action = ACTION_SUBTREE;

		if (ops->pud_entry)
			err = ops->pud_entry(pud, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

		if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
		    walk->action == ACTION_CONTINUE ||
		    !(ops->pmd_entry || ops->pte_entry))
			continue;

		if (walk->vma)
			split_huge_pud(walk->vma, pud, addr);
		if (pud_none(*pud))
			goto again;

		err = walk_pmd_range(pud, addr, next, walk);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);

	return err;
}

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	p4d_t *p4d;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(1);

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}
		if (ops->p4d_entry) {
			err = ops->p4d_entry(p4d, addr, next, walk);
			if (err)
				break;
		}
		if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
			err = walk_pud_range(p4d, addr, next, walk);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);

	return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pgd_t *pgd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	if (walk->pgd)
		pgd = walk->pgd + pgd_index(addr);
	else
		pgd = pgd_offset(walk->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, 0, walk);
			if (err)
				break;
			continue;
		}
		if (ops->pgd_entry) {
			err = ops->pgd_entry(pgd, addr, next, walk);
			if (err)
				break;
		}
		if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
		    ops->pte_entry)
			err = walk_p4d_range(pgd, addr, next, walk);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
	return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct hstate *h = hstate_vma(vma);
	unsigned long next;
	unsigned long hmask = huge_page_mask(h);
	unsigned long sz = huge_page_size(h);
	pte_t *pte;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	do {
		next = hugetlb_entry_end(h, addr, end);
		pte = huge_pte_offset(walk->mm, addr & hmask, sz);

		if (pte)
			err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
		else if (ops->pte_hole)
			err = ops->pte_hole(addr, next, -1, walk);

		if (err)
			break;
	} while (addr = next, addr != end);

	return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. Negative values mean an
 * error, in which case we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
			  struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (ops->test_walk)
		return ops->test_walk(start, end, walk);

	/*
	 * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
	 * range, so we don't walk over it as we do for normal vmas. However,
	 * some callers are interested in handling hole ranges and they don't
	 * want to just ignore any single address range. Such users certainly
	 * define their ->pte_hole() callbacks, so let's delegate them to handle
	 * vma(VM_PFNMAP).
	 */
	if (vma->vm_flags & VM_PFNMAP) {
		int err = 1;

		if (ops->pte_hole)
			err = ops->pte_hole(start, end, -1, walk);
		return err ? err : 1;
	}
	return 0;
}
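
/*
 * Illustrative sketch only, not part of this file's API: a hypothetical
 * ->test_walk() callback (the name example_test_walk is made up) following
 * the convention above: return 0 to walk the vma, 1 to skip it, and a
 * negative errno to abort the whole walk.  Here, PFN/mixed mappings and
 * vmas with no anonymous memory are simply skipped.
 */
static int example_test_walk(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;

	if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
		return 1;	/* skip, but keep walking the rest of the range */
	if (!vma->anon_vma)
		return 1;	/* nothing of interest in this vma */
	return 0;		/* walk this vma */
}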

static int __walk_page_range(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	int err = 0;
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (vma && ops->pre_vma) {
		err = ops->pre_vma(start, end, walk);
		if (err)
			return err;
	}

	if (vma && is_vm_hugetlb_page(vma)) {
		if (ops->hugetlb_entry)
			err = walk_hugetlb_range(start, end, walk);
	} else
		err = walk_pgd_range(start, end, walk);

	if (vma && ops->post_vma)
		ops->post_vma(walk);

	return err;
}

/**
 * walk_page_range - walk page table with caller-specific callbacks
 * @mm: mm_struct representing the target process of page table walk
 * @start: start address of the virtual address range
 * @end: end address of the virtual address range
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up some of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined like below:
 *
 *  - 0  : succeeded in handling the current entry; if the end address has
 *         not been reached yet, continue the walk.
 *  - >0 : succeeded in handling the current entry, and return to the caller
 *         with a caller-specific value.
 *  - <0 : failed to handle the current entry, and return to the caller
 *         with an error code.
 *
 * Before starting to walk the page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * If operations need to be staged before and committed after a vma is walked,
 * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
 * since it is intended to handle commit-type operations, can't return any
 * errors.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for access from the callbacks. If you want to pass some
 * caller-specific data to the callbacks, @private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold
 *   @mm->mmap_lock, because these functions traverse the vma list and/or
 *   access the vma's data.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
		unsigned long end, const struct mm_walk_ops *ops,
		void *private)
{
	int err = 0;
	unsigned long next;
	struct vm_area_struct *vma;
	struct mm_walk walk = {
		.ops = ops,
		.mm = mm,
		.private = private,
	};

	if (start >= end)
		return -EINVAL;

	if (!walk.mm)
		return -EINVAL;

	mmap_assert_locked(walk.mm);

	vma = find_vma(walk.mm, start);
	do {
		if (!vma) { /* after the last vma */
			walk.vma = NULL;
			next = end;
		} else if (start < vma->vm_start) { /* outside vma */
			walk.vma = NULL;
			next = min(end, vma->vm_start);
		} else { /* inside vma */
			walk.vma = vma;
			next = min(end, vma->vm_end);
			vma = vma->vm_next;

			err = walk_page_test(start, next, &walk);
			if (err > 0) {
				/*
				 * positive return values are purely for
				 * controlling the pagewalk, so should never
				 * be passed to the callers.
				 */
				err = 0;
				continue;
			}
			if (err < 0)
				break;
		}
		if (walk.vma || walk.ops->pte_hole)
			err = __walk_page_range(start, next, &walk);
		if (err)
			break;
	} while (start = next, start < end);
	return err;
}
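
/*
 * Illustrative sketch only, not part of this file's API: a minimal caller of
 * walk_page_range().  The names count_present_pte, count_present_ops and
 * count_present_pages are made up for the example.  It counts present PTEs
 * in [start, end) of @mm, passes the counter through walk->private, and
 * takes the mmap_lock as required by the locking rules above.
 */
static int count_present_pte(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (pte_present(*pte))
		(*count)++;
	return 0;	/* 0 means: keep walking */
}

static const struct mm_walk_ops count_present_ops = {
	.pte_entry = count_present_pte,
};

static unsigned long count_present_pages(struct mm_struct *mm,
					 unsigned long start, unsigned long end)
{
	unsigned long count = 0;

	mmap_read_lock(mm);
	walk_page_range(mm, start, end, &count_present_ops, &count);
	mmap_read_unlock(mm);

	return count;
}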

/*
 * Similar to walk_page_range() but can walk any page tables even if they are
 * not backed by VMAs. Because 'unusual' entries may be walked, this function
 * also does not lock the PTEs for the pte_entry() callback. This is useful
 * for walking the kernel page tables or page tables for firmware.
 */
int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
			  unsigned long end, const struct mm_walk_ops *ops,
			  pgd_t *pgd,
			  void *private)
{
	struct mm_walk walk = {
		.ops = ops,
		.mm = mm,
		.pgd = pgd,
		.private = private,
		.no_vma = true
	};

	if (start >= end || !walk.mm)
		return -EINVAL;

	mmap_assert_locked(walk.mm);

	return __walk_page_range(start, end, &walk);
}
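
/*
 * Illustrative sketch only, not part of this file's API: a hypothetical use
 * of walk_page_range_novma() on the kernel page tables under init_mm, in the
 * spirit of the ptdump code (the helper names are made up).  With no vma and
 * only a ->pmd_entry() callback, leaf entries at higher levels are skipped
 * and PTE tables are not visited, so this simply counts PMD-level (huge)
 * kernel mappings in [start, end).
 */
static int count_kernel_huge_pmd(pmd_t *pmd, unsigned long addr,
				 unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (pmd_leaf(*pmd))
		(*count)++;
	return 0;
}

static const struct mm_walk_ops kernel_huge_pmd_ops = {
	.pmd_entry = count_kernel_huge_pmd,
};

static unsigned long count_kernel_pmd_leaves(unsigned long start,
					     unsigned long end)
{
	unsigned long count = 0;

	mmap_read_lock(&init_mm);
	/* A NULL pgd means "use @mm's pgd"; see walk_pgd_range(). */
	walk_page_range_novma(&init_mm, start, end, &kernel_huge_pmd_ops,
			      NULL, &count);
	mmap_read_unlock(&init_mm);

	return count;
}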

int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
		void *private)
{
	struct mm_walk walk = {
		.ops = ops,
		.mm = vma->vm_mm,
		.vma = vma,
		.private = private,
	};
	int err;

	if (!walk.mm)
		return -EINVAL;

	mmap_assert_locked(walk.mm);

	err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
	if (err > 0)
		return 0;
	if (err < 0)
		return err;
	return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}

/**
 * walk_page_mapping - walk all memory areas mapped into a struct address_space.
 * @mapping: Pointer to the struct address_space
 * @first_index: First page offset in the address_space
 * @nr: Number of incremental page offsets to cover
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * This function walks all memory areas mapped into a struct address_space.
 * The walk is limited to only the given page-size index range, but if
 * the index boundaries cross a huge page-table entry, that entry will be
 * included.
 *
 * Also see walk_page_range() for additional information.
 *
 * Locking:
 *   This function can't require that the struct mm_struct::mmap_lock is held,
 *   since @mapping may be mapped by multiple processes. Instead
 *   @mapping->i_mmap_rwsem must be held. This might have implications for the
 *   callbacks, and it's up to the caller to ensure that the
 *   struct mm_struct::mmap_lock is not needed.
 *
 *   Also, this means that a caller can't rely on the struct
 *   vm_area_struct::vm_flags to be constant across a call,
 *   except for immutable flags. Callers requiring this shouldn't use
 *   this function.
 *
 * Return: 0 on success, negative error code on failure, positive number on
 * caller-defined premature termination.
 */
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
		      pgoff_t nr, const struct mm_walk_ops *ops,
		      void *private)
{
	struct mm_walk walk = {
		.ops = ops,
		.private = private,
	};
	struct vm_area_struct *vma;
	pgoff_t vba, vea, cba, cea;
	unsigned long start_addr, end_addr;
	int err = 0;

	lockdep_assert_held(&mapping->i_mmap_rwsem);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
				  first_index + nr - 1) {
		/* Clip to the vma */
		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma);
		cba = first_index;
		cba = max(cba, vba);
		cea = first_index + nr;
		cea = min(cea, vea);

		start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
		end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
		if (start_addr >= end_addr)
			continue;

		walk.vma = vma;
		walk.mm = vma->vm_mm;

		err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
		if (err > 0) {
			err = 0;
			break;
		} else if (err < 0)
			break;

		err = __walk_page_range(start_addr, end_addr, &walk);
		if (err)
			break;
	}

	return err;
}
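
/*
 * Illustrative sketch only, not part of this file's API: a hypothetical
 * helper (the names mapping_count_pte, mapping_count_ops and
 * count_mapping_ptes are made up) that uses walk_page_mapping() to count
 * present PTEs over every vma mapping a pagecache range.  Per the locking
 * rules above, i_mmap_rwsem is taken here; a read hold is assumed to be
 * sufficient for a read-only callback like this one.
 */
static int mapping_count_pte(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (pte_present(*pte))
		(*count)++;
	return 0;
}

static const struct mm_walk_ops mapping_count_ops = {
	.pte_entry = mapping_count_pte,
};

static unsigned long count_mapping_ptes(struct address_space *mapping,
					pgoff_t first_index, pgoff_t nr)
{
	unsigned long count = 0;

	i_mmap_lock_read(mapping);
	walk_page_mapping(mapping, first_index, nr, &mapping_count_ops, &count);
	i_mmap_unlock_read(mapping);

	return count;
}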