1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/pagewalk.h> 3 #include <linux/mm_inline.h> 4 #include <linux/hugetlb.h> 5 #include <linux/huge_mm.h> 6 #include <linux/mount.h> 7 #include <linux/ksm.h> 8 #include <linux/seq_file.h> 9 #include <linux/highmem.h> 10 #include <linux/ptrace.h> 11 #include <linux/slab.h> 12 #include <linux/pagemap.h> 13 #include <linux/mempolicy.h> 14 #include <linux/rmap.h> 15 #include <linux/swap.h> 16 #include <linux/sched/mm.h> 17 #include <linux/swapops.h> 18 #include <linux/mmu_notifier.h> 19 #include <linux/page_idle.h> 20 #include <linux/shmem_fs.h> 21 #include <linux/uaccess.h> 22 #include <linux/pkeys.h> 23 #include <linux/minmax.h> 24 #include <linux/overflow.h> 25 26 #include <asm/elf.h> 27 #include <asm/tlb.h> 28 #include <asm/tlbflush.h> 29 #include "internal.h" 30 31 #define SEQ_PUT_DEC(str, val) \ 32 seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8) 33 void task_mem(struct seq_file *m, struct mm_struct *mm) 34 { 35 unsigned long text, lib, swap, anon, file, shmem; 36 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; 37 38 anon = get_mm_counter(mm, MM_ANONPAGES); 39 file = get_mm_counter(mm, MM_FILEPAGES); 40 shmem = get_mm_counter(mm, MM_SHMEMPAGES); 41 42 /* 43 * Note: to minimize their overhead, mm maintains hiwater_vm and 44 * hiwater_rss only when about to *lower* total_vm or rss. Any 45 * collector of these hiwater stats must therefore get total_vm 46 * and rss too, which will usually be the higher. Barriers? not 47 * worth the effort, such snapshots can always be inconsistent. 48 */ 49 hiwater_vm = total_vm = mm->total_vm; 50 if (hiwater_vm < mm->hiwater_vm) 51 hiwater_vm = mm->hiwater_vm; 52 hiwater_rss = total_rss = anon + file + shmem; 53 if (hiwater_rss < mm->hiwater_rss) 54 hiwater_rss = mm->hiwater_rss; 55 56 /* split executable areas between text and lib */ 57 text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK); 58 text = min(text, mm->exec_vm << PAGE_SHIFT); 59 lib = (mm->exec_vm << PAGE_SHIFT) - text; 60 61 swap = get_mm_counter(mm, MM_SWAPENTS); 62 SEQ_PUT_DEC("VmPeak:\t", hiwater_vm); 63 SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm); 64 SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm); 65 SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm)); 66 SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss); 67 SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss); 68 SEQ_PUT_DEC(" kB\nRssAnon:\t", anon); 69 SEQ_PUT_DEC(" kB\nRssFile:\t", file); 70 SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem); 71 SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm); 72 SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm); 73 seq_put_decimal_ull_width(m, 74 " kB\nVmExe:\t", text >> 10, 8); 75 seq_put_decimal_ull_width(m, 76 " kB\nVmLib:\t", lib >> 10, 8); 77 seq_put_decimal_ull_width(m, 78 " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8); 79 SEQ_PUT_DEC(" kB\nVmSwap:\t", swap); 80 seq_puts(m, " kB\n"); 81 hugetlb_report_usage(m, mm); 82 } 83 #undef SEQ_PUT_DEC 84 85 unsigned long task_vsize(struct mm_struct *mm) 86 { 87 return PAGE_SIZE * mm->total_vm; 88 } 89 90 unsigned long task_statm(struct mm_struct *mm, 91 unsigned long *shared, unsigned long *text, 92 unsigned long *data, unsigned long *resident) 93 { 94 *shared = get_mm_counter(mm, MM_FILEPAGES) + 95 get_mm_counter(mm, MM_SHMEMPAGES); 96 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) 97 >> PAGE_SHIFT; 98 *data = mm->data_vm + mm->stack_vm; 99 *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); 100 return mm->total_vm; 101 } 102 103 #ifdef CONFIG_NUMA 104 /* 105 * Save get_task_policy() for show_numa_map(). 106 */ 107 static void hold_task_mempolicy(struct proc_maps_private *priv) 108 { 109 struct task_struct *task = priv->task; 110 111 task_lock(task); 112 priv->task_mempolicy = get_task_policy(task); 113 mpol_get(priv->task_mempolicy); 114 task_unlock(task); 115 } 116 static void release_task_mempolicy(struct proc_maps_private *priv) 117 { 118 mpol_put(priv->task_mempolicy); 119 } 120 #else 121 static void hold_task_mempolicy(struct proc_maps_private *priv) 122 { 123 } 124 static void release_task_mempolicy(struct proc_maps_private *priv) 125 { 126 } 127 #endif 128 129 static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, 130 loff_t *ppos) 131 { 132 struct vm_area_struct *vma = vma_next(&priv->iter); 133 134 if (vma) { 135 *ppos = vma->vm_start; 136 } else { 137 *ppos = -2UL; 138 vma = get_gate_vma(priv->mm); 139 } 140 141 return vma; 142 } 143 144 static void *m_start(struct seq_file *m, loff_t *ppos) 145 { 146 struct proc_maps_private *priv = m->private; 147 unsigned long last_addr = *ppos; 148 struct mm_struct *mm; 149 150 /* See m_next(). Zero at the start or after lseek. */ 151 if (last_addr == -1UL) 152 return NULL; 153 154 priv->task = get_proc_task(priv->inode); 155 if (!priv->task) 156 return ERR_PTR(-ESRCH); 157 158 mm = priv->mm; 159 if (!mm || !mmget_not_zero(mm)) { 160 put_task_struct(priv->task); 161 priv->task = NULL; 162 return NULL; 163 } 164 165 if (mmap_read_lock_killable(mm)) { 166 mmput(mm); 167 put_task_struct(priv->task); 168 priv->task = NULL; 169 return ERR_PTR(-EINTR); 170 } 171 172 vma_iter_init(&priv->iter, mm, last_addr); 173 hold_task_mempolicy(priv); 174 if (last_addr == -2UL) 175 return get_gate_vma(mm); 176 177 return proc_get_vma(priv, ppos); 178 } 179 180 static void *m_next(struct seq_file *m, void *v, loff_t *ppos) 181 { 182 if (*ppos == -2UL) { 183 *ppos = -1UL; 184 return NULL; 185 } 186 return proc_get_vma(m->private, ppos); 187 } 188 189 static void m_stop(struct seq_file *m, void *v) 190 { 191 struct proc_maps_private *priv = m->private; 192 struct mm_struct *mm = priv->mm; 193 194 if (!priv->task) 195 return; 196 197 release_task_mempolicy(priv); 198 mmap_read_unlock(mm); 199 mmput(mm); 200 put_task_struct(priv->task); 201 priv->task = NULL; 202 } 203 204 static int proc_maps_open(struct inode *inode, struct file *file, 205 const struct seq_operations *ops, int psize) 206 { 207 struct proc_maps_private *priv = __seq_open_private(file, ops, psize); 208 209 if (!priv) 210 return -ENOMEM; 211 212 priv->inode = inode; 213 priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); 214 if (IS_ERR(priv->mm)) { 215 int err = PTR_ERR(priv->mm); 216 217 seq_release_private(inode, file); 218 return err; 219 } 220 221 return 0; 222 } 223 224 static int proc_map_release(struct inode *inode, struct file *file) 225 { 226 struct seq_file *seq = file->private_data; 227 struct proc_maps_private *priv = seq->private; 228 229 if (priv->mm) 230 mmdrop(priv->mm); 231 232 return seq_release_private(inode, file); 233 } 234 235 static int do_maps_open(struct inode *inode, struct file *file, 236 const struct seq_operations *ops) 237 { 238 return proc_maps_open(inode, file, ops, 239 sizeof(struct proc_maps_private)); 240 } 241 242 static void show_vma_header_prefix(struct seq_file *m, 243 unsigned long start, unsigned long end, 244 vm_flags_t flags, unsigned long long pgoff, 245 dev_t dev, unsigned long ino) 246 { 247 seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); 248 seq_put_hex_ll(m, NULL, start, 8); 249 seq_put_hex_ll(m, "-", end, 8); 250 seq_putc(m, ' '); 251 seq_putc(m, flags & VM_READ ? 'r' : '-'); 252 seq_putc(m, flags & VM_WRITE ? 'w' : '-'); 253 seq_putc(m, flags & VM_EXEC ? 'x' : '-'); 254 seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p'); 255 seq_put_hex_ll(m, " ", pgoff, 8); 256 seq_put_hex_ll(m, " ", MAJOR(dev), 2); 257 seq_put_hex_ll(m, ":", MINOR(dev), 2); 258 seq_put_decimal_ull(m, " ", ino); 259 seq_putc(m, ' '); 260 } 261 262 static void 263 show_map_vma(struct seq_file *m, struct vm_area_struct *vma) 264 { 265 struct anon_vma_name *anon_name = NULL; 266 struct mm_struct *mm = vma->vm_mm; 267 struct file *file = vma->vm_file; 268 vm_flags_t flags = vma->vm_flags; 269 unsigned long ino = 0; 270 unsigned long long pgoff = 0; 271 unsigned long start, end; 272 dev_t dev = 0; 273 const char *name = NULL; 274 275 if (file) { 276 const struct inode *inode = file_user_inode(vma->vm_file); 277 278 dev = inode->i_sb->s_dev; 279 ino = inode->i_ino; 280 pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; 281 } 282 283 start = vma->vm_start; 284 end = vma->vm_end; 285 show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino); 286 if (mm) 287 anon_name = anon_vma_name(vma); 288 289 /* 290 * Print the dentry name for named mappings, and a 291 * special [heap] marker for the heap: 292 */ 293 if (file) { 294 seq_pad(m, ' '); 295 /* 296 * If user named this anon shared memory via 297 * prctl(PR_SET_VMA ..., use the provided name. 298 */ 299 if (anon_name) 300 seq_printf(m, "[anon_shmem:%s]", anon_name->name); 301 else 302 seq_path(m, file_user_path(file), "\n"); 303 goto done; 304 } 305 306 if (vma->vm_ops && vma->vm_ops->name) { 307 name = vma->vm_ops->name(vma); 308 if (name) 309 goto done; 310 } 311 312 name = arch_vma_name(vma); 313 if (!name) { 314 if (!mm) { 315 name = "[vdso]"; 316 goto done; 317 } 318 319 if (vma_is_initial_heap(vma)) { 320 name = "[heap]"; 321 goto done; 322 } 323 324 if (vma_is_initial_stack(vma)) { 325 name = "[stack]"; 326 goto done; 327 } 328 329 if (anon_name) { 330 seq_pad(m, ' '); 331 seq_printf(m, "[anon:%s]", anon_name->name); 332 } 333 } 334 335 done: 336 if (name) { 337 seq_pad(m, ' '); 338 seq_puts(m, name); 339 } 340 seq_putc(m, '\n'); 341 } 342 343 static int show_map(struct seq_file *m, void *v) 344 { 345 show_map_vma(m, v); 346 return 0; 347 } 348 349 static const struct seq_operations proc_pid_maps_op = { 350 .start = m_start, 351 .next = m_next, 352 .stop = m_stop, 353 .show = show_map 354 }; 355 356 static int pid_maps_open(struct inode *inode, struct file *file) 357 { 358 return do_maps_open(inode, file, &proc_pid_maps_op); 359 } 360 361 const struct file_operations proc_pid_maps_operations = { 362 .open = pid_maps_open, 363 .read = seq_read, 364 .llseek = seq_lseek, 365 .release = proc_map_release, 366 }; 367 368 /* 369 * Proportional Set Size(PSS): my share of RSS. 370 * 371 * PSS of a process is the count of pages it has in memory, where each 372 * page is divided by the number of processes sharing it. So if a 373 * process has 1000 pages all to itself, and 1000 shared with one other 374 * process, its PSS will be 1500. 375 * 376 * To keep (accumulated) division errors low, we adopt a 64bit 377 * fixed-point pss counter to minimize division errors. So (pss >> 378 * PSS_SHIFT) would be the real byte count. 379 * 380 * A shift of 12 before division means (assuming 4K page size): 381 * - 1M 3-user-pages add up to 8KB errors; 382 * - supports mapcount up to 2^24, or 16M; 383 * - supports PSS up to 2^52 bytes, or 4PB. 384 */ 385 #define PSS_SHIFT 12 386 387 #ifdef CONFIG_PROC_PAGE_MONITOR 388 struct mem_size_stats { 389 unsigned long resident; 390 unsigned long shared_clean; 391 unsigned long shared_dirty; 392 unsigned long private_clean; 393 unsigned long private_dirty; 394 unsigned long referenced; 395 unsigned long anonymous; 396 unsigned long lazyfree; 397 unsigned long anonymous_thp; 398 unsigned long shmem_thp; 399 unsigned long file_thp; 400 unsigned long swap; 401 unsigned long shared_hugetlb; 402 unsigned long private_hugetlb; 403 unsigned long ksm; 404 u64 pss; 405 u64 pss_anon; 406 u64 pss_file; 407 u64 pss_shmem; 408 u64 pss_dirty; 409 u64 pss_locked; 410 u64 swap_pss; 411 }; 412 413 static void smaps_page_accumulate(struct mem_size_stats *mss, 414 struct folio *folio, unsigned long size, unsigned long pss, 415 bool dirty, bool locked, bool private) 416 { 417 mss->pss += pss; 418 419 if (folio_test_anon(folio)) 420 mss->pss_anon += pss; 421 else if (folio_test_swapbacked(folio)) 422 mss->pss_shmem += pss; 423 else 424 mss->pss_file += pss; 425 426 if (locked) 427 mss->pss_locked += pss; 428 429 if (dirty || folio_test_dirty(folio)) { 430 mss->pss_dirty += pss; 431 if (private) 432 mss->private_dirty += size; 433 else 434 mss->shared_dirty += size; 435 } else { 436 if (private) 437 mss->private_clean += size; 438 else 439 mss->shared_clean += size; 440 } 441 } 442 443 static void smaps_account(struct mem_size_stats *mss, struct page *page, 444 bool compound, bool young, bool dirty, bool locked, 445 bool migration) 446 { 447 struct folio *folio = page_folio(page); 448 int i, nr = compound ? compound_nr(page) : 1; 449 unsigned long size = nr * PAGE_SIZE; 450 451 /* 452 * First accumulate quantities that depend only on |size| and the type 453 * of the compound page. 454 */ 455 if (folio_test_anon(folio)) { 456 mss->anonymous += size; 457 if (!folio_test_swapbacked(folio) && !dirty && 458 !folio_test_dirty(folio)) 459 mss->lazyfree += size; 460 } 461 462 if (folio_test_ksm(folio)) 463 mss->ksm += size; 464 465 mss->resident += size; 466 /* Accumulate the size in pages that have been accessed. */ 467 if (young || folio_test_young(folio) || folio_test_referenced(folio)) 468 mss->referenced += size; 469 470 /* 471 * Then accumulate quantities that may depend on sharing, or that may 472 * differ page-by-page. 473 * 474 * refcount == 1 guarantees the page is mapped exactly once. 475 * If any subpage of the compound page mapped with PTE it would elevate 476 * the refcount. 477 * 478 * The page_mapcount() is called to get a snapshot of the mapcount. 479 * Without holding the page lock this snapshot can be slightly wrong as 480 * we cannot always read the mapcount atomically. It is not safe to 481 * call page_mapcount() even with PTL held if the page is not mapped, 482 * especially for migration entries. Treat regular migration entries 483 * as mapcount == 1. 484 */ 485 if ((folio_ref_count(folio) == 1) || migration) { 486 smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT, 487 dirty, locked, true); 488 return; 489 } 490 for (i = 0; i < nr; i++, page++) { 491 int mapcount = page_mapcount(page); 492 unsigned long pss = PAGE_SIZE << PSS_SHIFT; 493 if (mapcount >= 2) 494 pss /= mapcount; 495 smaps_page_accumulate(mss, folio, PAGE_SIZE, pss, 496 dirty, locked, mapcount < 2); 497 } 498 } 499 500 #ifdef CONFIG_SHMEM 501 static int smaps_pte_hole(unsigned long addr, unsigned long end, 502 __always_unused int depth, struct mm_walk *walk) 503 { 504 struct mem_size_stats *mss = walk->private; 505 struct vm_area_struct *vma = walk->vma; 506 507 mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping, 508 linear_page_index(vma, addr), 509 linear_page_index(vma, end)); 510 511 return 0; 512 } 513 #else 514 #define smaps_pte_hole NULL 515 #endif /* CONFIG_SHMEM */ 516 517 static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk) 518 { 519 #ifdef CONFIG_SHMEM 520 if (walk->ops->pte_hole) { 521 /* depth is not used */ 522 smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk); 523 } 524 #endif 525 } 526 527 static void smaps_pte_entry(pte_t *pte, unsigned long addr, 528 struct mm_walk *walk) 529 { 530 struct mem_size_stats *mss = walk->private; 531 struct vm_area_struct *vma = walk->vma; 532 bool locked = !!(vma->vm_flags & VM_LOCKED); 533 struct page *page = NULL; 534 bool migration = false, young = false, dirty = false; 535 pte_t ptent = ptep_get(pte); 536 537 if (pte_present(ptent)) { 538 page = vm_normal_page(vma, addr, ptent); 539 young = pte_young(ptent); 540 dirty = pte_dirty(ptent); 541 } else if (is_swap_pte(ptent)) { 542 swp_entry_t swpent = pte_to_swp_entry(ptent); 543 544 if (!non_swap_entry(swpent)) { 545 int mapcount; 546 547 mss->swap += PAGE_SIZE; 548 mapcount = swp_swapcount(swpent); 549 if (mapcount >= 2) { 550 u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT; 551 552 do_div(pss_delta, mapcount); 553 mss->swap_pss += pss_delta; 554 } else { 555 mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; 556 } 557 } else if (is_pfn_swap_entry(swpent)) { 558 if (is_migration_entry(swpent)) 559 migration = true; 560 page = pfn_swap_entry_to_page(swpent); 561 } 562 } else { 563 smaps_pte_hole_lookup(addr, walk); 564 return; 565 } 566 567 if (!page) 568 return; 569 570 smaps_account(mss, page, false, young, dirty, locked, migration); 571 } 572 573 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 574 static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, 575 struct mm_walk *walk) 576 { 577 struct mem_size_stats *mss = walk->private; 578 struct vm_area_struct *vma = walk->vma; 579 bool locked = !!(vma->vm_flags & VM_LOCKED); 580 struct page *page = NULL; 581 struct folio *folio; 582 bool migration = false; 583 584 if (pmd_present(*pmd)) { 585 page = vm_normal_page_pmd(vma, addr, *pmd); 586 } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { 587 swp_entry_t entry = pmd_to_swp_entry(*pmd); 588 589 if (is_migration_entry(entry)) { 590 migration = true; 591 page = pfn_swap_entry_to_page(entry); 592 } 593 } 594 if (IS_ERR_OR_NULL(page)) 595 return; 596 folio = page_folio(page); 597 if (folio_test_anon(folio)) 598 mss->anonymous_thp += HPAGE_PMD_SIZE; 599 else if (folio_test_swapbacked(folio)) 600 mss->shmem_thp += HPAGE_PMD_SIZE; 601 else if (folio_is_zone_device(folio)) 602 /* pass */; 603 else 604 mss->file_thp += HPAGE_PMD_SIZE; 605 606 smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), 607 locked, migration); 608 } 609 #else 610 static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, 611 struct mm_walk *walk) 612 { 613 } 614 #endif 615 616 static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 617 struct mm_walk *walk) 618 { 619 struct vm_area_struct *vma = walk->vma; 620 pte_t *pte; 621 spinlock_t *ptl; 622 623 ptl = pmd_trans_huge_lock(pmd, vma); 624 if (ptl) { 625 smaps_pmd_entry(pmd, addr, walk); 626 spin_unlock(ptl); 627 goto out; 628 } 629 630 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 631 if (!pte) { 632 walk->action = ACTION_AGAIN; 633 return 0; 634 } 635 for (; addr != end; pte++, addr += PAGE_SIZE) 636 smaps_pte_entry(pte, addr, walk); 637 pte_unmap_unlock(pte - 1, ptl); 638 out: 639 cond_resched(); 640 return 0; 641 } 642 643 static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) 644 { 645 /* 646 * Don't forget to update Documentation/ on changes. 647 */ 648 static const char mnemonics[BITS_PER_LONG][2] = { 649 /* 650 * In case if we meet a flag we don't know about. 651 */ 652 [0 ... (BITS_PER_LONG-1)] = "??", 653 654 [ilog2(VM_READ)] = "rd", 655 [ilog2(VM_WRITE)] = "wr", 656 [ilog2(VM_EXEC)] = "ex", 657 [ilog2(VM_SHARED)] = "sh", 658 [ilog2(VM_MAYREAD)] = "mr", 659 [ilog2(VM_MAYWRITE)] = "mw", 660 [ilog2(VM_MAYEXEC)] = "me", 661 [ilog2(VM_MAYSHARE)] = "ms", 662 [ilog2(VM_GROWSDOWN)] = "gd", 663 [ilog2(VM_PFNMAP)] = "pf", 664 [ilog2(VM_LOCKED)] = "lo", 665 [ilog2(VM_IO)] = "io", 666 [ilog2(VM_SEQ_READ)] = "sr", 667 [ilog2(VM_RAND_READ)] = "rr", 668 [ilog2(VM_DONTCOPY)] = "dc", 669 [ilog2(VM_DONTEXPAND)] = "de", 670 [ilog2(VM_LOCKONFAULT)] = "lf", 671 [ilog2(VM_ACCOUNT)] = "ac", 672 [ilog2(VM_NORESERVE)] = "nr", 673 [ilog2(VM_HUGETLB)] = "ht", 674 [ilog2(VM_SYNC)] = "sf", 675 [ilog2(VM_ARCH_1)] = "ar", 676 [ilog2(VM_WIPEONFORK)] = "wf", 677 [ilog2(VM_DONTDUMP)] = "dd", 678 #ifdef CONFIG_ARM64_BTI 679 [ilog2(VM_ARM64_BTI)] = "bt", 680 #endif 681 #ifdef CONFIG_MEM_SOFT_DIRTY 682 [ilog2(VM_SOFTDIRTY)] = "sd", 683 #endif 684 [ilog2(VM_MIXEDMAP)] = "mm", 685 [ilog2(VM_HUGEPAGE)] = "hg", 686 [ilog2(VM_NOHUGEPAGE)] = "nh", 687 [ilog2(VM_MERGEABLE)] = "mg", 688 [ilog2(VM_UFFD_MISSING)]= "um", 689 [ilog2(VM_UFFD_WP)] = "uw", 690 #ifdef CONFIG_ARM64_MTE 691 [ilog2(VM_MTE)] = "mt", 692 [ilog2(VM_MTE_ALLOWED)] = "", 693 #endif 694 #ifdef CONFIG_ARCH_HAS_PKEYS 695 /* These come out via ProtectionKey: */ 696 [ilog2(VM_PKEY_BIT0)] = "", 697 [ilog2(VM_PKEY_BIT1)] = "", 698 [ilog2(VM_PKEY_BIT2)] = "", 699 [ilog2(VM_PKEY_BIT3)] = "", 700 #if VM_PKEY_BIT4 701 [ilog2(VM_PKEY_BIT4)] = "", 702 #endif 703 #endif /* CONFIG_ARCH_HAS_PKEYS */ 704 #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR 705 [ilog2(VM_UFFD_MINOR)] = "ui", 706 #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ 707 #ifdef CONFIG_X86_USER_SHADOW_STACK 708 [ilog2(VM_SHADOW_STACK)] = "ss", 709 #endif 710 }; 711 size_t i; 712 713 seq_puts(m, "VmFlags: "); 714 for (i = 0; i < BITS_PER_LONG; i++) { 715 if (!mnemonics[i][0]) 716 continue; 717 if (vma->vm_flags & (1UL << i)) { 718 seq_putc(m, mnemonics[i][0]); 719 seq_putc(m, mnemonics[i][1]); 720 seq_putc(m, ' '); 721 } 722 } 723 seq_putc(m, '\n'); 724 } 725 726 #ifdef CONFIG_HUGETLB_PAGE 727 static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, 728 unsigned long addr, unsigned long end, 729 struct mm_walk *walk) 730 { 731 struct mem_size_stats *mss = walk->private; 732 struct vm_area_struct *vma = walk->vma; 733 pte_t ptent = huge_ptep_get(pte); 734 struct folio *folio = NULL; 735 736 if (pte_present(ptent)) { 737 folio = page_folio(pte_page(ptent)); 738 } else if (is_swap_pte(ptent)) { 739 swp_entry_t swpent = pte_to_swp_entry(ptent); 740 741 if (is_pfn_swap_entry(swpent)) 742 folio = pfn_swap_entry_folio(swpent); 743 } 744 if (folio) { 745 if (folio_likely_mapped_shared(folio) || 746 hugetlb_pmd_shared(pte)) 747 mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); 748 else 749 mss->private_hugetlb += huge_page_size(hstate_vma(vma)); 750 } 751 return 0; 752 } 753 #else 754 #define smaps_hugetlb_range NULL 755 #endif /* HUGETLB_PAGE */ 756 757 static const struct mm_walk_ops smaps_walk_ops = { 758 .pmd_entry = smaps_pte_range, 759 .hugetlb_entry = smaps_hugetlb_range, 760 .walk_lock = PGWALK_RDLOCK, 761 }; 762 763 static const struct mm_walk_ops smaps_shmem_walk_ops = { 764 .pmd_entry = smaps_pte_range, 765 .hugetlb_entry = smaps_hugetlb_range, 766 .pte_hole = smaps_pte_hole, 767 .walk_lock = PGWALK_RDLOCK, 768 }; 769 770 /* 771 * Gather mem stats from @vma with the indicated beginning 772 * address @start, and keep them in @mss. 773 * 774 * Use vm_start of @vma as the beginning address if @start is 0. 775 */ 776 static void smap_gather_stats(struct vm_area_struct *vma, 777 struct mem_size_stats *mss, unsigned long start) 778 { 779 const struct mm_walk_ops *ops = &smaps_walk_ops; 780 781 /* Invalid start */ 782 if (start >= vma->vm_end) 783 return; 784 785 if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { 786 /* 787 * For shared or readonly shmem mappings we know that all 788 * swapped out pages belong to the shmem object, and we can 789 * obtain the swap value much more efficiently. For private 790 * writable mappings, we might have COW pages that are 791 * not affected by the parent swapped out pages of the shmem 792 * object, so we have to distinguish them during the page walk. 793 * Unless we know that the shmem object (or the part mapped by 794 * our VMA) has no swapped out pages at all. 795 */ 796 unsigned long shmem_swapped = shmem_swap_usage(vma); 797 798 if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) || 799 !(vma->vm_flags & VM_WRITE))) { 800 mss->swap += shmem_swapped; 801 } else { 802 ops = &smaps_shmem_walk_ops; 803 } 804 } 805 806 /* mmap_lock is held in m_start */ 807 if (!start) 808 walk_page_vma(vma, ops, mss); 809 else 810 walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss); 811 } 812 813 #define SEQ_PUT_DEC(str, val) \ 814 seq_put_decimal_ull_width(m, str, (val) >> 10, 8) 815 816 /* Show the contents common for smaps and smaps_rollup */ 817 static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, 818 bool rollup_mode) 819 { 820 SEQ_PUT_DEC("Rss: ", mss->resident); 821 SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT); 822 SEQ_PUT_DEC(" kB\nPss_Dirty: ", mss->pss_dirty >> PSS_SHIFT); 823 if (rollup_mode) { 824 /* 825 * These are meaningful only for smaps_rollup, otherwise two of 826 * them are zero, and the other one is the same as Pss. 827 */ 828 SEQ_PUT_DEC(" kB\nPss_Anon: ", 829 mss->pss_anon >> PSS_SHIFT); 830 SEQ_PUT_DEC(" kB\nPss_File: ", 831 mss->pss_file >> PSS_SHIFT); 832 SEQ_PUT_DEC(" kB\nPss_Shmem: ", 833 mss->pss_shmem >> PSS_SHIFT); 834 } 835 SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean); 836 SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty); 837 SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean); 838 SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty); 839 SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced); 840 SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous); 841 SEQ_PUT_DEC(" kB\nKSM: ", mss->ksm); 842 SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); 843 SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); 844 SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); 845 SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp); 846 SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); 847 seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", 848 mss->private_hugetlb >> 10, 7); 849 SEQ_PUT_DEC(" kB\nSwap: ", mss->swap); 850 SEQ_PUT_DEC(" kB\nSwapPss: ", 851 mss->swap_pss >> PSS_SHIFT); 852 SEQ_PUT_DEC(" kB\nLocked: ", 853 mss->pss_locked >> PSS_SHIFT); 854 seq_puts(m, " kB\n"); 855 } 856 857 static int show_smap(struct seq_file *m, void *v) 858 { 859 struct vm_area_struct *vma = v; 860 struct mem_size_stats mss = {}; 861 862 smap_gather_stats(vma, &mss, 0); 863 864 show_map_vma(m, vma); 865 866 SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start); 867 SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma)); 868 SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma)); 869 seq_puts(m, " kB\n"); 870 871 __show_smap(m, &mss, false); 872 873 seq_printf(m, "THPeligible: %8u\n", 874 !!thp_vma_allowable_orders(vma, vma->vm_flags, true, false, 875 true, THP_ORDERS_ALL)); 876 877 if (arch_pkeys_enabled()) 878 seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); 879 show_smap_vma_flags(m, vma); 880 881 return 0; 882 } 883 884 static int show_smaps_rollup(struct seq_file *m, void *v) 885 { 886 struct proc_maps_private *priv = m->private; 887 struct mem_size_stats mss = {}; 888 struct mm_struct *mm = priv->mm; 889 struct vm_area_struct *vma; 890 unsigned long vma_start = 0, last_vma_end = 0; 891 int ret = 0; 892 VMA_ITERATOR(vmi, mm, 0); 893 894 priv->task = get_proc_task(priv->inode); 895 if (!priv->task) 896 return -ESRCH; 897 898 if (!mm || !mmget_not_zero(mm)) { 899 ret = -ESRCH; 900 goto out_put_task; 901 } 902 903 ret = mmap_read_lock_killable(mm); 904 if (ret) 905 goto out_put_mm; 906 907 hold_task_mempolicy(priv); 908 vma = vma_next(&vmi); 909 910 if (unlikely(!vma)) 911 goto empty_set; 912 913 vma_start = vma->vm_start; 914 do { 915 smap_gather_stats(vma, &mss, 0); 916 last_vma_end = vma->vm_end; 917 918 /* 919 * Release mmap_lock temporarily if someone wants to 920 * access it for write request. 921 */ 922 if (mmap_lock_is_contended(mm)) { 923 vma_iter_invalidate(&vmi); 924 mmap_read_unlock(mm); 925 ret = mmap_read_lock_killable(mm); 926 if (ret) { 927 release_task_mempolicy(priv); 928 goto out_put_mm; 929 } 930 931 /* 932 * After dropping the lock, there are four cases to 933 * consider. See the following example for explanation. 934 * 935 * +------+------+-----------+ 936 * | VMA1 | VMA2 | VMA3 | 937 * +------+------+-----------+ 938 * | | | | 939 * 4k 8k 16k 400k 940 * 941 * Suppose we drop the lock after reading VMA2 due to 942 * contention, then we get: 943 * 944 * last_vma_end = 16k 945 * 946 * 1) VMA2 is freed, but VMA3 exists: 947 * 948 * vma_next(vmi) will return VMA3. 949 * In this case, just continue from VMA3. 950 * 951 * 2) VMA2 still exists: 952 * 953 * vma_next(vmi) will return VMA3. 954 * In this case, just continue from VMA3. 955 * 956 * 3) No more VMAs can be found: 957 * 958 * vma_next(vmi) will return NULL. 959 * No more things to do, just break. 960 * 961 * 4) (last_vma_end - 1) is the middle of a vma (VMA'): 962 * 963 * vma_next(vmi) will return VMA' whose range 964 * contains last_vma_end. 965 * Iterate VMA' from last_vma_end. 966 */ 967 vma = vma_next(&vmi); 968 /* Case 3 above */ 969 if (!vma) 970 break; 971 972 /* Case 1 and 2 above */ 973 if (vma->vm_start >= last_vma_end) 974 continue; 975 976 /* Case 4 above */ 977 if (vma->vm_end > last_vma_end) 978 smap_gather_stats(vma, &mss, last_vma_end); 979 } 980 } for_each_vma(vmi, vma); 981 982 empty_set: 983 show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0); 984 seq_pad(m, ' '); 985 seq_puts(m, "[rollup]\n"); 986 987 __show_smap(m, &mss, true); 988 989 release_task_mempolicy(priv); 990 mmap_read_unlock(mm); 991 992 out_put_mm: 993 mmput(mm); 994 out_put_task: 995 put_task_struct(priv->task); 996 priv->task = NULL; 997 998 return ret; 999 } 1000 #undef SEQ_PUT_DEC 1001 1002 static const struct seq_operations proc_pid_smaps_op = { 1003 .start = m_start, 1004 .next = m_next, 1005 .stop = m_stop, 1006 .show = show_smap 1007 }; 1008 1009 static int pid_smaps_open(struct inode *inode, struct file *file) 1010 { 1011 return do_maps_open(inode, file, &proc_pid_smaps_op); 1012 } 1013 1014 static int smaps_rollup_open(struct inode *inode, struct file *file) 1015 { 1016 int ret; 1017 struct proc_maps_private *priv; 1018 1019 priv = kzalloc(sizeof(*priv), GFP_KERNEL_ACCOUNT); 1020 if (!priv) 1021 return -ENOMEM; 1022 1023 ret = single_open(file, show_smaps_rollup, priv); 1024 if (ret) 1025 goto out_free; 1026 1027 priv->inode = inode; 1028 priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); 1029 if (IS_ERR(priv->mm)) { 1030 ret = PTR_ERR(priv->mm); 1031 1032 single_release(inode, file); 1033 goto out_free; 1034 } 1035 1036 return 0; 1037 1038 out_free: 1039 kfree(priv); 1040 return ret; 1041 } 1042 1043 static int smaps_rollup_release(struct inode *inode, struct file *file) 1044 { 1045 struct seq_file *seq = file->private_data; 1046 struct proc_maps_private *priv = seq->private; 1047 1048 if (priv->mm) 1049 mmdrop(priv->mm); 1050 1051 kfree(priv); 1052 return single_release(inode, file); 1053 } 1054 1055 const struct file_operations proc_pid_smaps_operations = { 1056 .open = pid_smaps_open, 1057 .read = seq_read, 1058 .llseek = seq_lseek, 1059 .release = proc_map_release, 1060 }; 1061 1062 const struct file_operations proc_pid_smaps_rollup_operations = { 1063 .open = smaps_rollup_open, 1064 .read = seq_read, 1065 .llseek = seq_lseek, 1066 .release = smaps_rollup_release, 1067 }; 1068 1069 enum clear_refs_types { 1070 CLEAR_REFS_ALL = 1, 1071 CLEAR_REFS_ANON, 1072 CLEAR_REFS_MAPPED, 1073 CLEAR_REFS_SOFT_DIRTY, 1074 CLEAR_REFS_MM_HIWATER_RSS, 1075 CLEAR_REFS_LAST, 1076 }; 1077 1078 struct clear_refs_private { 1079 enum clear_refs_types type; 1080 }; 1081 1082 #ifdef CONFIG_MEM_SOFT_DIRTY 1083 1084 static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte) 1085 { 1086 struct page *page; 1087 1088 if (!pte_write(pte)) 1089 return false; 1090 if (!is_cow_mapping(vma->vm_flags)) 1091 return false; 1092 if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))) 1093 return false; 1094 page = vm_normal_page(vma, addr, pte); 1095 if (!page) 1096 return false; 1097 return page_maybe_dma_pinned(page); 1098 } 1099 1100 static inline void clear_soft_dirty(struct vm_area_struct *vma, 1101 unsigned long addr, pte_t *pte) 1102 { 1103 /* 1104 * The soft-dirty tracker uses #PF-s to catch writes 1105 * to pages, so write-protect the pte as well. See the 1106 * Documentation/admin-guide/mm/soft-dirty.rst for full description 1107 * of how soft-dirty works. 1108 */ 1109 pte_t ptent = ptep_get(pte); 1110 1111 if (pte_present(ptent)) { 1112 pte_t old_pte; 1113 1114 if (pte_is_pinned(vma, addr, ptent)) 1115 return; 1116 old_pte = ptep_modify_prot_start(vma, addr, pte); 1117 ptent = pte_wrprotect(old_pte); 1118 ptent = pte_clear_soft_dirty(ptent); 1119 ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); 1120 } else if (is_swap_pte(ptent)) { 1121 ptent = pte_swp_clear_soft_dirty(ptent); 1122 set_pte_at(vma->vm_mm, addr, pte, ptent); 1123 } 1124 } 1125 #else 1126 static inline void clear_soft_dirty(struct vm_area_struct *vma, 1127 unsigned long addr, pte_t *pte) 1128 { 1129 } 1130 #endif 1131 1132 #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) 1133 static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, 1134 unsigned long addr, pmd_t *pmdp) 1135 { 1136 pmd_t old, pmd = *pmdp; 1137 1138 if (pmd_present(pmd)) { 1139 /* See comment in change_huge_pmd() */ 1140 old = pmdp_invalidate(vma, addr, pmdp); 1141 if (pmd_dirty(old)) 1142 pmd = pmd_mkdirty(pmd); 1143 if (pmd_young(old)) 1144 pmd = pmd_mkyoung(pmd); 1145 1146 pmd = pmd_wrprotect(pmd); 1147 pmd = pmd_clear_soft_dirty(pmd); 1148 1149 set_pmd_at(vma->vm_mm, addr, pmdp, pmd); 1150 } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { 1151 pmd = pmd_swp_clear_soft_dirty(pmd); 1152 set_pmd_at(vma->vm_mm, addr, pmdp, pmd); 1153 } 1154 } 1155 #else 1156 static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, 1157 unsigned long addr, pmd_t *pmdp) 1158 { 1159 } 1160 #endif 1161 1162 static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, 1163 unsigned long end, struct mm_walk *walk) 1164 { 1165 struct clear_refs_private *cp = walk->private; 1166 struct vm_area_struct *vma = walk->vma; 1167 pte_t *pte, ptent; 1168 spinlock_t *ptl; 1169 struct folio *folio; 1170 1171 ptl = pmd_trans_huge_lock(pmd, vma); 1172 if (ptl) { 1173 if (cp->type == CLEAR_REFS_SOFT_DIRTY) { 1174 clear_soft_dirty_pmd(vma, addr, pmd); 1175 goto out; 1176 } 1177 1178 if (!pmd_present(*pmd)) 1179 goto out; 1180 1181 folio = pmd_folio(*pmd); 1182 1183 /* Clear accessed and referenced bits. */ 1184 pmdp_test_and_clear_young(vma, addr, pmd); 1185 folio_test_clear_young(folio); 1186 folio_clear_referenced(folio); 1187 out: 1188 spin_unlock(ptl); 1189 return 0; 1190 } 1191 1192 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 1193 if (!pte) { 1194 walk->action = ACTION_AGAIN; 1195 return 0; 1196 } 1197 for (; addr != end; pte++, addr += PAGE_SIZE) { 1198 ptent = ptep_get(pte); 1199 1200 if (cp->type == CLEAR_REFS_SOFT_DIRTY) { 1201 clear_soft_dirty(vma, addr, pte); 1202 continue; 1203 } 1204 1205 if (!pte_present(ptent)) 1206 continue; 1207 1208 folio = vm_normal_folio(vma, addr, ptent); 1209 if (!folio) 1210 continue; 1211 1212 /* Clear accessed and referenced bits. */ 1213 ptep_test_and_clear_young(vma, addr, pte); 1214 folio_test_clear_young(folio); 1215 folio_clear_referenced(folio); 1216 } 1217 pte_unmap_unlock(pte - 1, ptl); 1218 cond_resched(); 1219 return 0; 1220 } 1221 1222 static int clear_refs_test_walk(unsigned long start, unsigned long end, 1223 struct mm_walk *walk) 1224 { 1225 struct clear_refs_private *cp = walk->private; 1226 struct vm_area_struct *vma = walk->vma; 1227 1228 if (vma->vm_flags & VM_PFNMAP) 1229 return 1; 1230 1231 /* 1232 * Writing 1 to /proc/pid/clear_refs affects all pages. 1233 * Writing 2 to /proc/pid/clear_refs only affects anonymous pages. 1234 * Writing 3 to /proc/pid/clear_refs only affects file mapped pages. 1235 * Writing 4 to /proc/pid/clear_refs affects all pages. 1236 */ 1237 if (cp->type == CLEAR_REFS_ANON && vma->vm_file) 1238 return 1; 1239 if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file) 1240 return 1; 1241 return 0; 1242 } 1243 1244 static const struct mm_walk_ops clear_refs_walk_ops = { 1245 .pmd_entry = clear_refs_pte_range, 1246 .test_walk = clear_refs_test_walk, 1247 .walk_lock = PGWALK_WRLOCK, 1248 }; 1249 1250 static ssize_t clear_refs_write(struct file *file, const char __user *buf, 1251 size_t count, loff_t *ppos) 1252 { 1253 struct task_struct *task; 1254 char buffer[PROC_NUMBUF] = {}; 1255 struct mm_struct *mm; 1256 struct vm_area_struct *vma; 1257 enum clear_refs_types type; 1258 int itype; 1259 int rv; 1260 1261 if (count > sizeof(buffer) - 1) 1262 count = sizeof(buffer) - 1; 1263 if (copy_from_user(buffer, buf, count)) 1264 return -EFAULT; 1265 rv = kstrtoint(strstrip(buffer), 10, &itype); 1266 if (rv < 0) 1267 return rv; 1268 type = (enum clear_refs_types)itype; 1269 if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) 1270 return -EINVAL; 1271 1272 task = get_proc_task(file_inode(file)); 1273 if (!task) 1274 return -ESRCH; 1275 mm = get_task_mm(task); 1276 if (mm) { 1277 VMA_ITERATOR(vmi, mm, 0); 1278 struct mmu_notifier_range range; 1279 struct clear_refs_private cp = { 1280 .type = type, 1281 }; 1282 1283 if (mmap_write_lock_killable(mm)) { 1284 count = -EINTR; 1285 goto out_mm; 1286 } 1287 if (type == CLEAR_REFS_MM_HIWATER_RSS) { 1288 /* 1289 * Writing 5 to /proc/pid/clear_refs resets the peak 1290 * resident set size to this mm's current rss value. 1291 */ 1292 reset_mm_hiwater_rss(mm); 1293 goto out_unlock; 1294 } 1295 1296 if (type == CLEAR_REFS_SOFT_DIRTY) { 1297 for_each_vma(vmi, vma) { 1298 if (!(vma->vm_flags & VM_SOFTDIRTY)) 1299 continue; 1300 vm_flags_clear(vma, VM_SOFTDIRTY); 1301 vma_set_page_prot(vma); 1302 } 1303 1304 inc_tlb_flush_pending(mm); 1305 mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, 1306 0, mm, 0, -1UL); 1307 mmu_notifier_invalidate_range_start(&range); 1308 } 1309 walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); 1310 if (type == CLEAR_REFS_SOFT_DIRTY) { 1311 mmu_notifier_invalidate_range_end(&range); 1312 flush_tlb_mm(mm); 1313 dec_tlb_flush_pending(mm); 1314 } 1315 out_unlock: 1316 mmap_write_unlock(mm); 1317 out_mm: 1318 mmput(mm); 1319 } 1320 put_task_struct(task); 1321 1322 return count; 1323 } 1324 1325 const struct file_operations proc_clear_refs_operations = { 1326 .write = clear_refs_write, 1327 .llseek = noop_llseek, 1328 }; 1329 1330 typedef struct { 1331 u64 pme; 1332 } pagemap_entry_t; 1333 1334 struct pagemapread { 1335 int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ 1336 pagemap_entry_t *buffer; 1337 bool show_pfn; 1338 }; 1339 1340 #define PAGEMAP_WALK_SIZE (PMD_SIZE) 1341 #define PAGEMAP_WALK_MASK (PMD_MASK) 1342 1343 #define PM_ENTRY_BYTES sizeof(pagemap_entry_t) 1344 #define PM_PFRAME_BITS 55 1345 #define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) 1346 #define PM_SOFT_DIRTY BIT_ULL(55) 1347 #define PM_MMAP_EXCLUSIVE BIT_ULL(56) 1348 #define PM_UFFD_WP BIT_ULL(57) 1349 #define PM_FILE BIT_ULL(61) 1350 #define PM_SWAP BIT_ULL(62) 1351 #define PM_PRESENT BIT_ULL(63) 1352 1353 #define PM_END_OF_BUFFER 1 1354 1355 static inline pagemap_entry_t make_pme(u64 frame, u64 flags) 1356 { 1357 return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags }; 1358 } 1359 1360 static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm) 1361 { 1362 pm->buffer[pm->pos++] = *pme; 1363 if (pm->pos >= pm->len) 1364 return PM_END_OF_BUFFER; 1365 return 0; 1366 } 1367 1368 static int pagemap_pte_hole(unsigned long start, unsigned long end, 1369 __always_unused int depth, struct mm_walk *walk) 1370 { 1371 struct pagemapread *pm = walk->private; 1372 unsigned long addr = start; 1373 int err = 0; 1374 1375 while (addr < end) { 1376 struct vm_area_struct *vma = find_vma(walk->mm, addr); 1377 pagemap_entry_t pme = make_pme(0, 0); 1378 /* End of address space hole, which we mark as non-present. */ 1379 unsigned long hole_end; 1380 1381 if (vma) 1382 hole_end = min(end, vma->vm_start); 1383 else 1384 hole_end = end; 1385 1386 for (; addr < hole_end; addr += PAGE_SIZE) { 1387 err = add_to_pagemap(&pme, pm); 1388 if (err) 1389 goto out; 1390 } 1391 1392 if (!vma) 1393 break; 1394 1395 /* Addresses in the VMA. */ 1396 if (vma->vm_flags & VM_SOFTDIRTY) 1397 pme = make_pme(0, PM_SOFT_DIRTY); 1398 for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { 1399 err = add_to_pagemap(&pme, pm); 1400 if (err) 1401 goto out; 1402 } 1403 } 1404 out: 1405 return err; 1406 } 1407 1408 static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, 1409 struct vm_area_struct *vma, unsigned long addr, pte_t pte) 1410 { 1411 u64 frame = 0, flags = 0; 1412 struct page *page = NULL; 1413 bool migration = false; 1414 1415 if (pte_present(pte)) { 1416 if (pm->show_pfn) 1417 frame = pte_pfn(pte); 1418 flags |= PM_PRESENT; 1419 page = vm_normal_page(vma, addr, pte); 1420 if (pte_soft_dirty(pte)) 1421 flags |= PM_SOFT_DIRTY; 1422 if (pte_uffd_wp(pte)) 1423 flags |= PM_UFFD_WP; 1424 } else if (is_swap_pte(pte)) { 1425 swp_entry_t entry; 1426 if (pte_swp_soft_dirty(pte)) 1427 flags |= PM_SOFT_DIRTY; 1428 if (pte_swp_uffd_wp(pte)) 1429 flags |= PM_UFFD_WP; 1430 entry = pte_to_swp_entry(pte); 1431 if (pm->show_pfn) { 1432 pgoff_t offset; 1433 /* 1434 * For PFN swap offsets, keeping the offset field 1435 * to be PFN only to be compatible with old smaps. 1436 */ 1437 if (is_pfn_swap_entry(entry)) 1438 offset = swp_offset_pfn(entry); 1439 else 1440 offset = swp_offset(entry); 1441 frame = swp_type(entry) | 1442 (offset << MAX_SWAPFILES_SHIFT); 1443 } 1444 flags |= PM_SWAP; 1445 migration = is_migration_entry(entry); 1446 if (is_pfn_swap_entry(entry)) 1447 page = pfn_swap_entry_to_page(entry); 1448 if (pte_marker_entry_uffd_wp(entry)) 1449 flags |= PM_UFFD_WP; 1450 } 1451 1452 if (page && !PageAnon(page)) 1453 flags |= PM_FILE; 1454 if (page && !migration && page_mapcount(page) == 1) 1455 flags |= PM_MMAP_EXCLUSIVE; 1456 if (vma->vm_flags & VM_SOFTDIRTY) 1457 flags |= PM_SOFT_DIRTY; 1458 1459 return make_pme(frame, flags); 1460 } 1461 1462 static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, 1463 struct mm_walk *walk) 1464 { 1465 struct vm_area_struct *vma = walk->vma; 1466 struct pagemapread *pm = walk->private; 1467 spinlock_t *ptl; 1468 pte_t *pte, *orig_pte; 1469 int err = 0; 1470 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1471 bool migration = false; 1472 1473 ptl = pmd_trans_huge_lock(pmdp, vma); 1474 if (ptl) { 1475 u64 flags = 0, frame = 0; 1476 pmd_t pmd = *pmdp; 1477 struct page *page = NULL; 1478 1479 if (vma->vm_flags & VM_SOFTDIRTY) 1480 flags |= PM_SOFT_DIRTY; 1481 1482 if (pmd_present(pmd)) { 1483 page = pmd_page(pmd); 1484 1485 flags |= PM_PRESENT; 1486 if (pmd_soft_dirty(pmd)) 1487 flags |= PM_SOFT_DIRTY; 1488 if (pmd_uffd_wp(pmd)) 1489 flags |= PM_UFFD_WP; 1490 if (pm->show_pfn) 1491 frame = pmd_pfn(pmd) + 1492 ((addr & ~PMD_MASK) >> PAGE_SHIFT); 1493 } 1494 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 1495 else if (is_swap_pmd(pmd)) { 1496 swp_entry_t entry = pmd_to_swp_entry(pmd); 1497 unsigned long offset; 1498 1499 if (pm->show_pfn) { 1500 if (is_pfn_swap_entry(entry)) 1501 offset = swp_offset_pfn(entry); 1502 else 1503 offset = swp_offset(entry); 1504 offset = offset + 1505 ((addr & ~PMD_MASK) >> PAGE_SHIFT); 1506 frame = swp_type(entry) | 1507 (offset << MAX_SWAPFILES_SHIFT); 1508 } 1509 flags |= PM_SWAP; 1510 if (pmd_swp_soft_dirty(pmd)) 1511 flags |= PM_SOFT_DIRTY; 1512 if (pmd_swp_uffd_wp(pmd)) 1513 flags |= PM_UFFD_WP; 1514 VM_BUG_ON(!is_pmd_migration_entry(pmd)); 1515 migration = is_migration_entry(entry); 1516 page = pfn_swap_entry_to_page(entry); 1517 } 1518 #endif 1519 1520 if (page && !migration && page_mapcount(page) == 1) 1521 flags |= PM_MMAP_EXCLUSIVE; 1522 1523 for (; addr != end; addr += PAGE_SIZE) { 1524 pagemap_entry_t pme = make_pme(frame, flags); 1525 1526 err = add_to_pagemap(&pme, pm); 1527 if (err) 1528 break; 1529 if (pm->show_pfn) { 1530 if (flags & PM_PRESENT) 1531 frame++; 1532 else if (flags & PM_SWAP) 1533 frame += (1 << MAX_SWAPFILES_SHIFT); 1534 } 1535 } 1536 spin_unlock(ptl); 1537 return err; 1538 } 1539 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 1540 1541 /* 1542 * We can assume that @vma always points to a valid one and @end never 1543 * goes beyond vma->vm_end. 1544 */ 1545 orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl); 1546 if (!pte) { 1547 walk->action = ACTION_AGAIN; 1548 return err; 1549 } 1550 for (; addr < end; pte++, addr += PAGE_SIZE) { 1551 pagemap_entry_t pme; 1552 1553 pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); 1554 err = add_to_pagemap(&pme, pm); 1555 if (err) 1556 break; 1557 } 1558 pte_unmap_unlock(orig_pte, ptl); 1559 1560 cond_resched(); 1561 1562 return err; 1563 } 1564 1565 #ifdef CONFIG_HUGETLB_PAGE 1566 /* This function walks within one hugetlb entry in the single call */ 1567 static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, 1568 unsigned long addr, unsigned long end, 1569 struct mm_walk *walk) 1570 { 1571 struct pagemapread *pm = walk->private; 1572 struct vm_area_struct *vma = walk->vma; 1573 u64 flags = 0, frame = 0; 1574 int err = 0; 1575 pte_t pte; 1576 1577 if (vma->vm_flags & VM_SOFTDIRTY) 1578 flags |= PM_SOFT_DIRTY; 1579 1580 pte = huge_ptep_get(ptep); 1581 if (pte_present(pte)) { 1582 struct folio *folio = page_folio(pte_page(pte)); 1583 1584 if (!folio_test_anon(folio)) 1585 flags |= PM_FILE; 1586 1587 if (!folio_likely_mapped_shared(folio) && 1588 !hugetlb_pmd_shared(ptep)) 1589 flags |= PM_MMAP_EXCLUSIVE; 1590 1591 if (huge_pte_uffd_wp(pte)) 1592 flags |= PM_UFFD_WP; 1593 1594 flags |= PM_PRESENT; 1595 if (pm->show_pfn) 1596 frame = pte_pfn(pte) + 1597 ((addr & ~hmask) >> PAGE_SHIFT); 1598 } else if (pte_swp_uffd_wp_any(pte)) { 1599 flags |= PM_UFFD_WP; 1600 } 1601 1602 for (; addr != end; addr += PAGE_SIZE) { 1603 pagemap_entry_t pme = make_pme(frame, flags); 1604 1605 err = add_to_pagemap(&pme, pm); 1606 if (err) 1607 return err; 1608 if (pm->show_pfn && (flags & PM_PRESENT)) 1609 frame++; 1610 } 1611 1612 cond_resched(); 1613 1614 return err; 1615 } 1616 #else 1617 #define pagemap_hugetlb_range NULL 1618 #endif /* HUGETLB_PAGE */ 1619 1620 static const struct mm_walk_ops pagemap_ops = { 1621 .pmd_entry = pagemap_pmd_range, 1622 .pte_hole = pagemap_pte_hole, 1623 .hugetlb_entry = pagemap_hugetlb_range, 1624 .walk_lock = PGWALK_RDLOCK, 1625 }; 1626 1627 /* 1628 * /proc/pid/pagemap - an array mapping virtual pages to pfns 1629 * 1630 * For each page in the address space, this file contains one 64-bit entry 1631 * consisting of the following: 1632 * 1633 * Bits 0-54 page frame number (PFN) if present 1634 * Bits 0-4 swap type if swapped 1635 * Bits 5-54 swap offset if swapped 1636 * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst) 1637 * Bit 56 page exclusively mapped 1638 * Bit 57 pte is uffd-wp write-protected 1639 * Bits 58-60 zero 1640 * Bit 61 page is file-page or shared-anon 1641 * Bit 62 page swapped 1642 * Bit 63 page present 1643 * 1644 * If the page is not present but in swap, then the PFN contains an 1645 * encoding of the swap file number and the page's offset into the 1646 * swap. Unmapped pages return a null PFN. This allows determining 1647 * precisely which pages are mapped (or in swap) and comparing mapped 1648 * pages between processes. 1649 * 1650 * Efficient users of this interface will use /proc/pid/maps to 1651 * determine which areas of memory are actually mapped and llseek to 1652 * skip over unmapped regions. 1653 */ 1654 static ssize_t pagemap_read(struct file *file, char __user *buf, 1655 size_t count, loff_t *ppos) 1656 { 1657 struct mm_struct *mm = file->private_data; 1658 struct pagemapread pm; 1659 unsigned long src; 1660 unsigned long svpfn; 1661 unsigned long start_vaddr; 1662 unsigned long end_vaddr; 1663 int ret = 0, copied = 0; 1664 1665 if (!mm || !mmget_not_zero(mm)) 1666 goto out; 1667 1668 ret = -EINVAL; 1669 /* file position must be aligned */ 1670 if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) 1671 goto out_mm; 1672 1673 ret = 0; 1674 if (!count) 1675 goto out_mm; 1676 1677 /* do not disclose physical addresses: attack vector */ 1678 pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); 1679 1680 pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); 1681 pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL); 1682 ret = -ENOMEM; 1683 if (!pm.buffer) 1684 goto out_mm; 1685 1686 src = *ppos; 1687 svpfn = src / PM_ENTRY_BYTES; 1688 end_vaddr = mm->task_size; 1689 1690 /* watch out for wraparound */ 1691 start_vaddr = end_vaddr; 1692 if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) { 1693 unsigned long end; 1694 1695 ret = mmap_read_lock_killable(mm); 1696 if (ret) 1697 goto out_free; 1698 start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT); 1699 mmap_read_unlock(mm); 1700 1701 end = start_vaddr + ((count / PM_ENTRY_BYTES) << PAGE_SHIFT); 1702 if (end >= start_vaddr && end < mm->task_size) 1703 end_vaddr = end; 1704 } 1705 1706 /* Ensure the address is inside the task */ 1707 if (start_vaddr > mm->task_size) 1708 start_vaddr = end_vaddr; 1709 1710 ret = 0; 1711 while (count && (start_vaddr < end_vaddr)) { 1712 int len; 1713 unsigned long end; 1714 1715 pm.pos = 0; 1716 end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; 1717 /* overflow ? */ 1718 if (end < start_vaddr || end > end_vaddr) 1719 end = end_vaddr; 1720 ret = mmap_read_lock_killable(mm); 1721 if (ret) 1722 goto out_free; 1723 ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm); 1724 mmap_read_unlock(mm); 1725 start_vaddr = end; 1726 1727 len = min(count, PM_ENTRY_BYTES * pm.pos); 1728 if (copy_to_user(buf, pm.buffer, len)) { 1729 ret = -EFAULT; 1730 goto out_free; 1731 } 1732 copied += len; 1733 buf += len; 1734 count -= len; 1735 } 1736 *ppos += copied; 1737 if (!ret || ret == PM_END_OF_BUFFER) 1738 ret = copied; 1739 1740 out_free: 1741 kfree(pm.buffer); 1742 out_mm: 1743 mmput(mm); 1744 out: 1745 return ret; 1746 } 1747 1748 static int pagemap_open(struct inode *inode, struct file *file) 1749 { 1750 struct mm_struct *mm; 1751 1752 mm = proc_mem_open(inode, PTRACE_MODE_READ); 1753 if (IS_ERR(mm)) 1754 return PTR_ERR(mm); 1755 file->private_data = mm; 1756 return 0; 1757 } 1758 1759 static int pagemap_release(struct inode *inode, struct file *file) 1760 { 1761 struct mm_struct *mm = file->private_data; 1762 1763 if (mm) 1764 mmdrop(mm); 1765 return 0; 1766 } 1767 1768 #define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ 1769 PAGE_IS_FILE | PAGE_IS_PRESENT | \ 1770 PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ 1771 PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY) 1772 #define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) 1773 1774 struct pagemap_scan_private { 1775 struct pm_scan_arg arg; 1776 unsigned long masks_of_interest, cur_vma_category; 1777 struct page_region *vec_buf; 1778 unsigned long vec_buf_len, vec_buf_index, found_pages; 1779 struct page_region __user *vec_out; 1780 }; 1781 1782 static unsigned long pagemap_page_category(struct pagemap_scan_private *p, 1783 struct vm_area_struct *vma, 1784 unsigned long addr, pte_t pte) 1785 { 1786 unsigned long categories = 0; 1787 1788 if (pte_present(pte)) { 1789 struct page *page; 1790 1791 categories |= PAGE_IS_PRESENT; 1792 if (!pte_uffd_wp(pte)) 1793 categories |= PAGE_IS_WRITTEN; 1794 1795 if (p->masks_of_interest & PAGE_IS_FILE) { 1796 page = vm_normal_page(vma, addr, pte); 1797 if (page && !PageAnon(page)) 1798 categories |= PAGE_IS_FILE; 1799 } 1800 1801 if (is_zero_pfn(pte_pfn(pte))) 1802 categories |= PAGE_IS_PFNZERO; 1803 if (pte_soft_dirty(pte)) 1804 categories |= PAGE_IS_SOFT_DIRTY; 1805 } else if (is_swap_pte(pte)) { 1806 swp_entry_t swp; 1807 1808 categories |= PAGE_IS_SWAPPED; 1809 if (!pte_swp_uffd_wp_any(pte)) 1810 categories |= PAGE_IS_WRITTEN; 1811 1812 if (p->masks_of_interest & PAGE_IS_FILE) { 1813 swp = pte_to_swp_entry(pte); 1814 if (is_pfn_swap_entry(swp) && 1815 !folio_test_anon(pfn_swap_entry_folio(swp))) 1816 categories |= PAGE_IS_FILE; 1817 } 1818 if (pte_swp_soft_dirty(pte)) 1819 categories |= PAGE_IS_SOFT_DIRTY; 1820 } 1821 1822 return categories; 1823 } 1824 1825 static void make_uffd_wp_pte(struct vm_area_struct *vma, 1826 unsigned long addr, pte_t *pte) 1827 { 1828 pte_t ptent = ptep_get(pte); 1829 1830 if (pte_present(ptent)) { 1831 pte_t old_pte; 1832 1833 old_pte = ptep_modify_prot_start(vma, addr, pte); 1834 ptent = pte_mkuffd_wp(ptent); 1835 ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); 1836 } else if (is_swap_pte(ptent)) { 1837 ptent = pte_swp_mkuffd_wp(ptent); 1838 set_pte_at(vma->vm_mm, addr, pte, ptent); 1839 } else { 1840 set_pte_at(vma->vm_mm, addr, pte, 1841 make_pte_marker(PTE_MARKER_UFFD_WP)); 1842 } 1843 } 1844 1845 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1846 static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, 1847 struct vm_area_struct *vma, 1848 unsigned long addr, pmd_t pmd) 1849 { 1850 unsigned long categories = PAGE_IS_HUGE; 1851 1852 if (pmd_present(pmd)) { 1853 struct page *page; 1854 1855 categories |= PAGE_IS_PRESENT; 1856 if (!pmd_uffd_wp(pmd)) 1857 categories |= PAGE_IS_WRITTEN; 1858 1859 if (p->masks_of_interest & PAGE_IS_FILE) { 1860 page = vm_normal_page_pmd(vma, addr, pmd); 1861 if (page && !PageAnon(page)) 1862 categories |= PAGE_IS_FILE; 1863 } 1864 1865 if (is_zero_pfn(pmd_pfn(pmd))) 1866 categories |= PAGE_IS_PFNZERO; 1867 if (pmd_soft_dirty(pmd)) 1868 categories |= PAGE_IS_SOFT_DIRTY; 1869 } else if (is_swap_pmd(pmd)) { 1870 swp_entry_t swp; 1871 1872 categories |= PAGE_IS_SWAPPED; 1873 if (!pmd_swp_uffd_wp(pmd)) 1874 categories |= PAGE_IS_WRITTEN; 1875 if (pmd_swp_soft_dirty(pmd)) 1876 categories |= PAGE_IS_SOFT_DIRTY; 1877 1878 if (p->masks_of_interest & PAGE_IS_FILE) { 1879 swp = pmd_to_swp_entry(pmd); 1880 if (is_pfn_swap_entry(swp) && 1881 !folio_test_anon(pfn_swap_entry_folio(swp))) 1882 categories |= PAGE_IS_FILE; 1883 } 1884 } 1885 1886 return categories; 1887 } 1888 1889 static void make_uffd_wp_pmd(struct vm_area_struct *vma, 1890 unsigned long addr, pmd_t *pmdp) 1891 { 1892 pmd_t old, pmd = *pmdp; 1893 1894 if (pmd_present(pmd)) { 1895 old = pmdp_invalidate_ad(vma, addr, pmdp); 1896 pmd = pmd_mkuffd_wp(old); 1897 set_pmd_at(vma->vm_mm, addr, pmdp, pmd); 1898 } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { 1899 pmd = pmd_swp_mkuffd_wp(pmd); 1900 set_pmd_at(vma->vm_mm, addr, pmdp, pmd); 1901 } 1902 } 1903 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 1904 1905 #ifdef CONFIG_HUGETLB_PAGE 1906 static unsigned long pagemap_hugetlb_category(pte_t pte) 1907 { 1908 unsigned long categories = PAGE_IS_HUGE; 1909 1910 /* 1911 * According to pagemap_hugetlb_range(), file-backed HugeTLB 1912 * page cannot be swapped. So PAGE_IS_FILE is not checked for 1913 * swapped pages. 1914 */ 1915 if (pte_present(pte)) { 1916 categories |= PAGE_IS_PRESENT; 1917 if (!huge_pte_uffd_wp(pte)) 1918 categories |= PAGE_IS_WRITTEN; 1919 if (!PageAnon(pte_page(pte))) 1920 categories |= PAGE_IS_FILE; 1921 if (is_zero_pfn(pte_pfn(pte))) 1922 categories |= PAGE_IS_PFNZERO; 1923 if (pte_soft_dirty(pte)) 1924 categories |= PAGE_IS_SOFT_DIRTY; 1925 } else if (is_swap_pte(pte)) { 1926 categories |= PAGE_IS_SWAPPED; 1927 if (!pte_swp_uffd_wp_any(pte)) 1928 categories |= PAGE_IS_WRITTEN; 1929 if (pte_swp_soft_dirty(pte)) 1930 categories |= PAGE_IS_SOFT_DIRTY; 1931 } 1932 1933 return categories; 1934 } 1935 1936 static void make_uffd_wp_huge_pte(struct vm_area_struct *vma, 1937 unsigned long addr, pte_t *ptep, 1938 pte_t ptent) 1939 { 1940 unsigned long psize; 1941 1942 if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent)) 1943 return; 1944 1945 psize = huge_page_size(hstate_vma(vma)); 1946 1947 if (is_hugetlb_entry_migration(ptent)) 1948 set_huge_pte_at(vma->vm_mm, addr, ptep, 1949 pte_swp_mkuffd_wp(ptent), psize); 1950 else if (!huge_pte_none(ptent)) 1951 huge_ptep_modify_prot_commit(vma, addr, ptep, ptent, 1952 huge_pte_mkuffd_wp(ptent)); 1953 else 1954 set_huge_pte_at(vma->vm_mm, addr, ptep, 1955 make_pte_marker(PTE_MARKER_UFFD_WP), psize); 1956 } 1957 #endif /* CONFIG_HUGETLB_PAGE */ 1958 1959 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) 1960 static void pagemap_scan_backout_range(struct pagemap_scan_private *p, 1961 unsigned long addr, unsigned long end) 1962 { 1963 struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; 1964 1965 if (cur_buf->start != addr) 1966 cur_buf->end = addr; 1967 else 1968 cur_buf->start = cur_buf->end = 0; 1969 1970 p->found_pages -= (end - addr) / PAGE_SIZE; 1971 } 1972 #endif 1973 1974 static bool pagemap_scan_is_interesting_page(unsigned long categories, 1975 const struct pagemap_scan_private *p) 1976 { 1977 categories ^= p->arg.category_inverted; 1978 if ((categories & p->arg.category_mask) != p->arg.category_mask) 1979 return false; 1980 if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask)) 1981 return false; 1982 1983 return true; 1984 } 1985 1986 static bool pagemap_scan_is_interesting_vma(unsigned long categories, 1987 const struct pagemap_scan_private *p) 1988 { 1989 unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED; 1990 1991 categories ^= p->arg.category_inverted; 1992 if ((categories & required) != required) 1993 return false; 1994 1995 return true; 1996 } 1997 1998 static int pagemap_scan_test_walk(unsigned long start, unsigned long end, 1999 struct mm_walk *walk) 2000 { 2001 struct pagemap_scan_private *p = walk->private; 2002 struct vm_area_struct *vma = walk->vma; 2003 unsigned long vma_category = 0; 2004 bool wp_allowed = userfaultfd_wp_async(vma) && 2005 userfaultfd_wp_use_markers(vma); 2006 2007 if (!wp_allowed) { 2008 /* User requested explicit failure over wp-async capability */ 2009 if (p->arg.flags & PM_SCAN_CHECK_WPASYNC) 2010 return -EPERM; 2011 /* 2012 * User requires wr-protect, and allows silently skipping 2013 * unsupported vmas. 2014 */ 2015 if (p->arg.flags & PM_SCAN_WP_MATCHING) 2016 return 1; 2017 /* 2018 * Then the request doesn't involve wr-protects at all, 2019 * fall through to the rest checks, and allow vma walk. 2020 */ 2021 } 2022 2023 if (vma->vm_flags & VM_PFNMAP) 2024 return 1; 2025 2026 if (wp_allowed) 2027 vma_category |= PAGE_IS_WPALLOWED; 2028 2029 if (vma->vm_flags & VM_SOFTDIRTY) 2030 vma_category |= PAGE_IS_SOFT_DIRTY; 2031 2032 if (!pagemap_scan_is_interesting_vma(vma_category, p)) 2033 return 1; 2034 2035 p->cur_vma_category = vma_category; 2036 2037 return 0; 2038 } 2039 2040 static bool pagemap_scan_push_range(unsigned long categories, 2041 struct pagemap_scan_private *p, 2042 unsigned long addr, unsigned long end) 2043 { 2044 struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; 2045 2046 /* 2047 * When there is no output buffer provided at all, the sentinel values 2048 * won't match here. There is no other way for `cur_buf->end` to be 2049 * non-zero other than it being non-empty. 2050 */ 2051 if (addr == cur_buf->end && categories == cur_buf->categories) { 2052 cur_buf->end = end; 2053 return true; 2054 } 2055 2056 if (cur_buf->end) { 2057 if (p->vec_buf_index >= p->vec_buf_len - 1) 2058 return false; 2059 2060 cur_buf = &p->vec_buf[++p->vec_buf_index]; 2061 } 2062 2063 cur_buf->start = addr; 2064 cur_buf->end = end; 2065 cur_buf->categories = categories; 2066 2067 return true; 2068 } 2069 2070 static int pagemap_scan_output(unsigned long categories, 2071 struct pagemap_scan_private *p, 2072 unsigned long addr, unsigned long *end) 2073 { 2074 unsigned long n_pages, total_pages; 2075 int ret = 0; 2076 2077 if (!p->vec_buf) 2078 return 0; 2079 2080 categories &= p->arg.return_mask; 2081 2082 n_pages = (*end - addr) / PAGE_SIZE; 2083 if (check_add_overflow(p->found_pages, n_pages, &total_pages) || 2084 total_pages > p->arg.max_pages) { 2085 size_t n_too_much = total_pages - p->arg.max_pages; 2086 *end -= n_too_much * PAGE_SIZE; 2087 n_pages -= n_too_much; 2088 ret = -ENOSPC; 2089 } 2090 2091 if (!pagemap_scan_push_range(categories, p, addr, *end)) { 2092 *end = addr; 2093 n_pages = 0; 2094 ret = -ENOSPC; 2095 } 2096 2097 p->found_pages += n_pages; 2098 if (ret) 2099 p->arg.walk_end = *end; 2100 2101 return ret; 2102 } 2103 2104 static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start, 2105 unsigned long end, struct mm_walk *walk) 2106 { 2107 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2108 struct pagemap_scan_private *p = walk->private; 2109 struct vm_area_struct *vma = walk->vma; 2110 unsigned long categories; 2111 spinlock_t *ptl; 2112 int ret = 0; 2113 2114 ptl = pmd_trans_huge_lock(pmd, vma); 2115 if (!ptl) 2116 return -ENOENT; 2117 2118 categories = p->cur_vma_category | 2119 pagemap_thp_category(p, vma, start, *pmd); 2120 2121 if (!pagemap_scan_is_interesting_page(categories, p)) 2122 goto out_unlock; 2123 2124 ret = pagemap_scan_output(categories, p, start, &end); 2125 if (start == end) 2126 goto out_unlock; 2127 2128 if (~p->arg.flags & PM_SCAN_WP_MATCHING) 2129 goto out_unlock; 2130 if (~categories & PAGE_IS_WRITTEN) 2131 goto out_unlock; 2132 2133 /* 2134 * Break huge page into small pages if the WP operation 2135 * needs to be performed on a portion of the huge page. 2136 */ 2137 if (end != start + HPAGE_SIZE) { 2138 spin_unlock(ptl); 2139 split_huge_pmd(vma, pmd, start); 2140 pagemap_scan_backout_range(p, start, end); 2141 /* Report as if there was no THP */ 2142 return -ENOENT; 2143 } 2144 2145 make_uffd_wp_pmd(vma, start, pmd); 2146 flush_tlb_range(vma, start, end); 2147 out_unlock: 2148 spin_unlock(ptl); 2149 return ret; 2150 #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ 2151 return -ENOENT; 2152 #endif 2153 } 2154 2155 static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, 2156 unsigned long end, struct mm_walk *walk) 2157 { 2158 struct pagemap_scan_private *p = walk->private; 2159 struct vm_area_struct *vma = walk->vma; 2160 unsigned long addr, flush_end = 0; 2161 pte_t *pte, *start_pte; 2162 spinlock_t *ptl; 2163 int ret; 2164 2165 arch_enter_lazy_mmu_mode(); 2166 2167 ret = pagemap_scan_thp_entry(pmd, start, end, walk); 2168 if (ret != -ENOENT) { 2169 arch_leave_lazy_mmu_mode(); 2170 return ret; 2171 } 2172 2173 ret = 0; 2174 start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); 2175 if (!pte) { 2176 arch_leave_lazy_mmu_mode(); 2177 walk->action = ACTION_AGAIN; 2178 return 0; 2179 } 2180 2181 if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) { 2182 /* Fast path for performing exclusive WP */ 2183 for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { 2184 if (pte_uffd_wp(ptep_get(pte))) 2185 continue; 2186 make_uffd_wp_pte(vma, addr, pte); 2187 if (!flush_end) 2188 start = addr; 2189 flush_end = addr + PAGE_SIZE; 2190 } 2191 goto flush_and_return; 2192 } 2193 2194 if (!p->arg.category_anyof_mask && !p->arg.category_inverted && 2195 p->arg.category_mask == PAGE_IS_WRITTEN && 2196 p->arg.return_mask == PAGE_IS_WRITTEN) { 2197 for (addr = start; addr < end; pte++, addr += PAGE_SIZE) { 2198 unsigned long next = addr + PAGE_SIZE; 2199 2200 if (pte_uffd_wp(ptep_get(pte))) 2201 continue; 2202 ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN, 2203 p, addr, &next); 2204 if (next == addr) 2205 break; 2206 if (~p->arg.flags & PM_SCAN_WP_MATCHING) 2207 continue; 2208 make_uffd_wp_pte(vma, addr, pte); 2209 if (!flush_end) 2210 start = addr; 2211 flush_end = next; 2212 } 2213 goto flush_and_return; 2214 } 2215 2216 for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { 2217 unsigned long categories = p->cur_vma_category | 2218 pagemap_page_category(p, vma, addr, ptep_get(pte)); 2219 unsigned long next = addr + PAGE_SIZE; 2220 2221 if (!pagemap_scan_is_interesting_page(categories, p)) 2222 continue; 2223 2224 ret = pagemap_scan_output(categories, p, addr, &next); 2225 if (next == addr) 2226 break; 2227 2228 if (~p->arg.flags & PM_SCAN_WP_MATCHING) 2229 continue; 2230 if (~categories & PAGE_IS_WRITTEN) 2231 continue; 2232 2233 make_uffd_wp_pte(vma, addr, pte); 2234 if (!flush_end) 2235 start = addr; 2236 flush_end = next; 2237 } 2238 2239 flush_and_return: 2240 if (flush_end) 2241 flush_tlb_range(vma, start, addr); 2242 2243 pte_unmap_unlock(start_pte, ptl); 2244 arch_leave_lazy_mmu_mode(); 2245 2246 cond_resched(); 2247 return ret; 2248 } 2249 2250 #ifdef CONFIG_HUGETLB_PAGE 2251 static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask, 2252 unsigned long start, unsigned long end, 2253 struct mm_walk *walk) 2254 { 2255 struct pagemap_scan_private *p = walk->private; 2256 struct vm_area_struct *vma = walk->vma; 2257 unsigned long categories; 2258 spinlock_t *ptl; 2259 int ret = 0; 2260 pte_t pte; 2261 2262 if (~p->arg.flags & PM_SCAN_WP_MATCHING) { 2263 /* Go the short route when not write-protecting pages. */ 2264 2265 pte = huge_ptep_get(ptep); 2266 categories = p->cur_vma_category | pagemap_hugetlb_category(pte); 2267 2268 if (!pagemap_scan_is_interesting_page(categories, p)) 2269 return 0; 2270 2271 return pagemap_scan_output(categories, p, start, &end); 2272 } 2273 2274 i_mmap_lock_write(vma->vm_file->f_mapping); 2275 ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep); 2276 2277 pte = huge_ptep_get(ptep); 2278 categories = p->cur_vma_category | pagemap_hugetlb_category(pte); 2279 2280 if (!pagemap_scan_is_interesting_page(categories, p)) 2281 goto out_unlock; 2282 2283 ret = pagemap_scan_output(categories, p, start, &end); 2284 if (start == end) 2285 goto out_unlock; 2286 2287 if (~categories & PAGE_IS_WRITTEN) 2288 goto out_unlock; 2289 2290 if (end != start + HPAGE_SIZE) { 2291 /* Partial HugeTLB page WP isn't possible. */ 2292 pagemap_scan_backout_range(p, start, end); 2293 p->arg.walk_end = start; 2294 ret = 0; 2295 goto out_unlock; 2296 } 2297 2298 make_uffd_wp_huge_pte(vma, start, ptep, pte); 2299 flush_hugetlb_tlb_range(vma, start, end); 2300 2301 out_unlock: 2302 spin_unlock(ptl); 2303 i_mmap_unlock_write(vma->vm_file->f_mapping); 2304 2305 return ret; 2306 } 2307 #else 2308 #define pagemap_scan_hugetlb_entry NULL 2309 #endif 2310 2311 static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, 2312 int depth, struct mm_walk *walk) 2313 { 2314 struct pagemap_scan_private *p = walk->private; 2315 struct vm_area_struct *vma = walk->vma; 2316 int ret, err; 2317 2318 if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p)) 2319 return 0; 2320 2321 ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end); 2322 if (addr == end) 2323 return ret; 2324 2325 if (~p->arg.flags & PM_SCAN_WP_MATCHING) 2326 return ret; 2327 2328 err = uffd_wp_range(vma, addr, end - addr, true); 2329 if (err < 0) 2330 ret = err; 2331 2332 return ret; 2333 } 2334 2335 static const struct mm_walk_ops pagemap_scan_ops = { 2336 .test_walk = pagemap_scan_test_walk, 2337 .pmd_entry = pagemap_scan_pmd_entry, 2338 .pte_hole = pagemap_scan_pte_hole, 2339 .hugetlb_entry = pagemap_scan_hugetlb_entry, 2340 }; 2341 2342 static int pagemap_scan_get_args(struct pm_scan_arg *arg, 2343 unsigned long uarg) 2344 { 2345 if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg))) 2346 return -EFAULT; 2347 2348 if (arg->size != sizeof(struct pm_scan_arg)) 2349 return -EINVAL; 2350 2351 /* Validate requested features */ 2352 if (arg->flags & ~PM_SCAN_FLAGS) 2353 return -EINVAL; 2354 if ((arg->category_inverted | arg->category_mask | 2355 arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES) 2356 return -EINVAL; 2357 2358 arg->start = untagged_addr((unsigned long)arg->start); 2359 arg->end = untagged_addr((unsigned long)arg->end); 2360 arg->vec = untagged_addr((unsigned long)arg->vec); 2361 2362 /* Validate memory pointers */ 2363 if (!IS_ALIGNED(arg->start, PAGE_SIZE)) 2364 return -EINVAL; 2365 if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start)) 2366 return -EFAULT; 2367 if (!arg->vec && arg->vec_len) 2368 return -EINVAL; 2369 if (arg->vec && !access_ok((void __user *)(long)arg->vec, 2370 arg->vec_len * sizeof(struct page_region))) 2371 return -EFAULT; 2372 2373 /* Fixup default values */ 2374 arg->end = ALIGN(arg->end, PAGE_SIZE); 2375 arg->walk_end = 0; 2376 if (!arg->max_pages) 2377 arg->max_pages = ULONG_MAX; 2378 2379 return 0; 2380 } 2381 2382 static int pagemap_scan_writeback_args(struct pm_scan_arg *arg, 2383 unsigned long uargl) 2384 { 2385 struct pm_scan_arg __user *uarg = (void __user *)uargl; 2386 2387 if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end))) 2388 return -EFAULT; 2389 2390 return 0; 2391 } 2392 2393 static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p) 2394 { 2395 if (!p->arg.vec_len) 2396 return 0; 2397 2398 p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT, 2399 p->arg.vec_len); 2400 p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf), 2401 GFP_KERNEL); 2402 if (!p->vec_buf) 2403 return -ENOMEM; 2404 2405 p->vec_buf->start = p->vec_buf->end = 0; 2406 p->vec_out = (struct page_region __user *)(long)p->arg.vec; 2407 2408 return 0; 2409 } 2410 2411 static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p) 2412 { 2413 const struct page_region *buf = p->vec_buf; 2414 long n = p->vec_buf_index; 2415 2416 if (!p->vec_buf) 2417 return 0; 2418 2419 if (buf[n].end != buf[n].start) 2420 n++; 2421 2422 if (!n) 2423 return 0; 2424 2425 if (copy_to_user(p->vec_out, buf, n * sizeof(*buf))) 2426 return -EFAULT; 2427 2428 p->arg.vec_len -= n; 2429 p->vec_out += n; 2430 2431 p->vec_buf_index = 0; 2432 p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len); 2433 p->vec_buf->start = p->vec_buf->end = 0; 2434 2435 return n; 2436 } 2437 2438 static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) 2439 { 2440 struct pagemap_scan_private p = {0}; 2441 unsigned long walk_start; 2442 size_t n_ranges_out = 0; 2443 int ret; 2444 2445 ret = pagemap_scan_get_args(&p.arg, uarg); 2446 if (ret) 2447 return ret; 2448 2449 p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask | 2450 p.arg.return_mask; 2451 ret = pagemap_scan_init_bounce_buffer(&p); 2452 if (ret) 2453 return ret; 2454 2455 for (walk_start = p.arg.start; walk_start < p.arg.end; 2456 walk_start = p.arg.walk_end) { 2457 struct mmu_notifier_range range; 2458 long n_out; 2459 2460 if (fatal_signal_pending(current)) { 2461 ret = -EINTR; 2462 break; 2463 } 2464 2465 ret = mmap_read_lock_killable(mm); 2466 if (ret) 2467 break; 2468 2469 /* Protection change for the range is going to happen. */ 2470 if (p.arg.flags & PM_SCAN_WP_MATCHING) { 2471 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, 2472 mm, walk_start, p.arg.end); 2473 mmu_notifier_invalidate_range_start(&range); 2474 } 2475 2476 ret = walk_page_range(mm, walk_start, p.arg.end, 2477 &pagemap_scan_ops, &p); 2478 2479 if (p.arg.flags & PM_SCAN_WP_MATCHING) 2480 mmu_notifier_invalidate_range_end(&range); 2481 2482 mmap_read_unlock(mm); 2483 2484 n_out = pagemap_scan_flush_buffer(&p); 2485 if (n_out < 0) 2486 ret = n_out; 2487 else 2488 n_ranges_out += n_out; 2489 2490 if (ret != -ENOSPC) 2491 break; 2492 2493 if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages) 2494 break; 2495 } 2496 2497 /* ENOSPC signifies early stop (buffer full) from the walk. */ 2498 if (!ret || ret == -ENOSPC) 2499 ret = n_ranges_out; 2500 2501 /* The walk_end isn't set when ret is zero */ 2502 if (!p.arg.walk_end) 2503 p.arg.walk_end = p.arg.end; 2504 if (pagemap_scan_writeback_args(&p.arg, uarg)) 2505 ret = -EFAULT; 2506 2507 kfree(p.vec_buf); 2508 return ret; 2509 } 2510 2511 static long do_pagemap_cmd(struct file *file, unsigned int cmd, 2512 unsigned long arg) 2513 { 2514 struct mm_struct *mm = file->private_data; 2515 2516 switch (cmd) { 2517 case PAGEMAP_SCAN: 2518 return do_pagemap_scan(mm, arg); 2519 2520 default: 2521 return -EINVAL; 2522 } 2523 } 2524 2525 const struct file_operations proc_pagemap_operations = { 2526 .llseek = mem_lseek, /* borrow this */ 2527 .read = pagemap_read, 2528 .open = pagemap_open, 2529 .release = pagemap_release, 2530 .unlocked_ioctl = do_pagemap_cmd, 2531 .compat_ioctl = do_pagemap_cmd, 2532 }; 2533 #endif /* CONFIG_PROC_PAGE_MONITOR */ 2534 2535 #ifdef CONFIG_NUMA 2536 2537 struct numa_maps { 2538 unsigned long pages; 2539 unsigned long anon; 2540 unsigned long active; 2541 unsigned long writeback; 2542 unsigned long mapcount_max; 2543 unsigned long dirty; 2544 unsigned long swapcache; 2545 unsigned long node[MAX_NUMNODES]; 2546 }; 2547 2548 struct numa_maps_private { 2549 struct proc_maps_private proc_maps; 2550 struct numa_maps md; 2551 }; 2552 2553 static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, 2554 unsigned long nr_pages) 2555 { 2556 struct folio *folio = page_folio(page); 2557 int count = page_mapcount(page); 2558 2559 md->pages += nr_pages; 2560 if (pte_dirty || folio_test_dirty(folio)) 2561 md->dirty += nr_pages; 2562 2563 if (folio_test_swapcache(folio)) 2564 md->swapcache += nr_pages; 2565 2566 if (folio_test_active(folio) || folio_test_unevictable(folio)) 2567 md->active += nr_pages; 2568 2569 if (folio_test_writeback(folio)) 2570 md->writeback += nr_pages; 2571 2572 if (folio_test_anon(folio)) 2573 md->anon += nr_pages; 2574 2575 if (count > md->mapcount_max) 2576 md->mapcount_max = count; 2577 2578 md->node[folio_nid(folio)] += nr_pages; 2579 } 2580 2581 static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, 2582 unsigned long addr) 2583 { 2584 struct page *page; 2585 int nid; 2586 2587 if (!pte_present(pte)) 2588 return NULL; 2589 2590 page = vm_normal_page(vma, addr, pte); 2591 if (!page || is_zone_device_page(page)) 2592 return NULL; 2593 2594 if (PageReserved(page)) 2595 return NULL; 2596 2597 nid = page_to_nid(page); 2598 if (!node_isset(nid, node_states[N_MEMORY])) 2599 return NULL; 2600 2601 return page; 2602 } 2603 2604 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2605 static struct page *can_gather_numa_stats_pmd(pmd_t pmd, 2606 struct vm_area_struct *vma, 2607 unsigned long addr) 2608 { 2609 struct page *page; 2610 int nid; 2611 2612 if (!pmd_present(pmd)) 2613 return NULL; 2614 2615 page = vm_normal_page_pmd(vma, addr, pmd); 2616 if (!page) 2617 return NULL; 2618 2619 if (PageReserved(page)) 2620 return NULL; 2621 2622 nid = page_to_nid(page); 2623 if (!node_isset(nid, node_states[N_MEMORY])) 2624 return NULL; 2625 2626 return page; 2627 } 2628 #endif 2629 2630 static int gather_pte_stats(pmd_t *pmd, unsigned long addr, 2631 unsigned long end, struct mm_walk *walk) 2632 { 2633 struct numa_maps *md = walk->private; 2634 struct vm_area_struct *vma = walk->vma; 2635 spinlock_t *ptl; 2636 pte_t *orig_pte; 2637 pte_t *pte; 2638 2639 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2640 ptl = pmd_trans_huge_lock(pmd, vma); 2641 if (ptl) { 2642 struct page *page; 2643 2644 page = can_gather_numa_stats_pmd(*pmd, vma, addr); 2645 if (page) 2646 gather_stats(page, md, pmd_dirty(*pmd), 2647 HPAGE_PMD_SIZE/PAGE_SIZE); 2648 spin_unlock(ptl); 2649 return 0; 2650 } 2651 #endif 2652 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 2653 if (!pte) { 2654 walk->action = ACTION_AGAIN; 2655 return 0; 2656 } 2657 do { 2658 pte_t ptent = ptep_get(pte); 2659 struct page *page = can_gather_numa_stats(ptent, vma, addr); 2660 if (!page) 2661 continue; 2662 gather_stats(page, md, pte_dirty(ptent), 1); 2663 2664 } while (pte++, addr += PAGE_SIZE, addr != end); 2665 pte_unmap_unlock(orig_pte, ptl); 2666 cond_resched(); 2667 return 0; 2668 } 2669 #ifdef CONFIG_HUGETLB_PAGE 2670 static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, 2671 unsigned long addr, unsigned long end, struct mm_walk *walk) 2672 { 2673 pte_t huge_pte = huge_ptep_get(pte); 2674 struct numa_maps *md; 2675 struct page *page; 2676 2677 if (!pte_present(huge_pte)) 2678 return 0; 2679 2680 page = pte_page(huge_pte); 2681 2682 md = walk->private; 2683 gather_stats(page, md, pte_dirty(huge_pte), 1); 2684 return 0; 2685 } 2686 2687 #else 2688 static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, 2689 unsigned long addr, unsigned long end, struct mm_walk *walk) 2690 { 2691 return 0; 2692 } 2693 #endif 2694 2695 static const struct mm_walk_ops show_numa_ops = { 2696 .hugetlb_entry = gather_hugetlb_stats, 2697 .pmd_entry = gather_pte_stats, 2698 .walk_lock = PGWALK_RDLOCK, 2699 }; 2700 2701 /* 2702 * Display pages allocated per node and memory policy via /proc. 2703 */ 2704 static int show_numa_map(struct seq_file *m, void *v) 2705 { 2706 struct numa_maps_private *numa_priv = m->private; 2707 struct proc_maps_private *proc_priv = &numa_priv->proc_maps; 2708 struct vm_area_struct *vma = v; 2709 struct numa_maps *md = &numa_priv->md; 2710 struct file *file = vma->vm_file; 2711 struct mm_struct *mm = vma->vm_mm; 2712 char buffer[64]; 2713 struct mempolicy *pol; 2714 pgoff_t ilx; 2715 int nid; 2716 2717 if (!mm) 2718 return 0; 2719 2720 /* Ensure we start with an empty set of numa_maps statistics. */ 2721 memset(md, 0, sizeof(*md)); 2722 2723 pol = __get_vma_policy(vma, vma->vm_start, &ilx); 2724 if (pol) { 2725 mpol_to_str(buffer, sizeof(buffer), pol); 2726 mpol_cond_put(pol); 2727 } else { 2728 mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy); 2729 } 2730 2731 seq_printf(m, "%08lx %s", vma->vm_start, buffer); 2732 2733 if (file) { 2734 seq_puts(m, " file="); 2735 seq_path(m, file_user_path(file), "\n\t= "); 2736 } else if (vma_is_initial_heap(vma)) { 2737 seq_puts(m, " heap"); 2738 } else if (vma_is_initial_stack(vma)) { 2739 seq_puts(m, " stack"); 2740 } 2741 2742 if (is_vm_hugetlb_page(vma)) 2743 seq_puts(m, " huge"); 2744 2745 /* mmap_lock is held by m_start */ 2746 walk_page_vma(vma, &show_numa_ops, md); 2747 2748 if (!md->pages) 2749 goto out; 2750 2751 if (md->anon) 2752 seq_printf(m, " anon=%lu", md->anon); 2753 2754 if (md->dirty) 2755 seq_printf(m, " dirty=%lu", md->dirty); 2756 2757 if (md->pages != md->anon && md->pages != md->dirty) 2758 seq_printf(m, " mapped=%lu", md->pages); 2759 2760 if (md->mapcount_max > 1) 2761 seq_printf(m, " mapmax=%lu", md->mapcount_max); 2762 2763 if (md->swapcache) 2764 seq_printf(m, " swapcache=%lu", md->swapcache); 2765 2766 if (md->active < md->pages && !is_vm_hugetlb_page(vma)) 2767 seq_printf(m, " active=%lu", md->active); 2768 2769 if (md->writeback) 2770 seq_printf(m, " writeback=%lu", md->writeback); 2771 2772 for_each_node_state(nid, N_MEMORY) 2773 if (md->node[nid]) 2774 seq_printf(m, " N%d=%lu", nid, md->node[nid]); 2775 2776 seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10); 2777 out: 2778 seq_putc(m, '\n'); 2779 return 0; 2780 } 2781 2782 static const struct seq_operations proc_pid_numa_maps_op = { 2783 .start = m_start, 2784 .next = m_next, 2785 .stop = m_stop, 2786 .show = show_numa_map, 2787 }; 2788 2789 static int pid_numa_maps_open(struct inode *inode, struct file *file) 2790 { 2791 return proc_maps_open(inode, file, &proc_pid_numa_maps_op, 2792 sizeof(struct numa_maps_private)); 2793 } 2794 2795 const struct file_operations proc_pid_numa_maps_operations = { 2796 .open = pid_numa_maps_open, 2797 .read = seq_read, 2798 .llseek = seq_lseek, 2799 .release = proc_map_release, 2800 }; 2801 2802 #endif /* CONFIG_NUMA */ 2803