// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/mm_inline.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/mount.h>
#include <linux/ksm.h>
#include <linux/seq_file.h>
#include <linux/highmem.h>
#include <linux/ptrace.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/sched/mm.h>
#include <linux/leafops.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/uaccess.h>
#include <linux/pkeys.h>
#include <linux/minmax.h>
#include <linux/overflow.h>
#include <linux/buildid.h>

#include <asm/elf.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include "internal.h"

#define SENTINEL_VMA_END	-1
#define SENTINEL_VMA_GATE	-2

#define SEQ_PUT_DEC(str, val) \
		seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8)
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
	unsigned long text, lib, swap, anon, file, shmem;
	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;

	anon = get_mm_counter_sum(mm, MM_ANONPAGES);
	file = get_mm_counter_sum(mm, MM_FILEPAGES);
	shmem = get_mm_counter_sum(mm, MM_SHMEMPAGES);

	/*
	 * Note: to minimize their overhead, mm maintains hiwater_vm and
	 * hiwater_rss only when about to *lower* total_vm or rss. Any
	 * collector of these hiwater stats must therefore get total_vm
	 * and rss too, which will usually be the higher. Barriers? not
	 * worth the effort, such snapshots can always be inconsistent.
	 */
	hiwater_vm = total_vm = mm->total_vm;
	if (hiwater_vm < mm->hiwater_vm)
		hiwater_vm = mm->hiwater_vm;
	hiwater_rss = total_rss = anon + file + shmem;
	if (hiwater_rss < mm->hiwater_rss)
		hiwater_rss = mm->hiwater_rss;

	/* split executable areas between text and lib */
	text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK);
	text = min(text, mm->exec_vm << PAGE_SHIFT);
	lib = (mm->exec_vm << PAGE_SHIFT) - text;

	swap = get_mm_counter_sum(mm, MM_SWAPENTS);
	SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
	SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
	SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
	SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm));
	SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss);
	SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss);
	SEQ_PUT_DEC(" kB\nRssAnon:\t", anon);
	SEQ_PUT_DEC(" kB\nRssFile:\t", file);
	SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem);
	SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm);
	SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm);
	seq_put_decimal_ull_width(m,
		    " kB\nVmExe:\t", text >> 10, 8);
	seq_put_decimal_ull_width(m,
		    " kB\nVmLib:\t", lib >> 10, 8);
	seq_put_decimal_ull_width(m,
		    " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8);
	SEQ_PUT_DEC(" kB\nVmSwap:\t", swap);
	seq_puts(m, " kB\n");
	hugetlb_report_usage(m, mm);
}
#undef SEQ_PUT_DEC

unsigned long task_vsize(struct mm_struct *mm)
{
	return PAGE_SIZE * mm->total_vm;
}

unsigned long task_statm(struct mm_struct *mm,
			 unsigned long *shared, unsigned long *text,
			 unsigned long *data, unsigned long *resident)
{
	*shared = get_mm_counter_sum(mm, MM_FILEPAGES) +
			get_mm_counter_sum(mm, MM_SHMEMPAGES);
	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
								>> PAGE_SHIFT;
	*data = mm->data_vm + mm->stack_vm;
	*resident = *shared + get_mm_counter_sum(mm, MM_ANONPAGES);
	return mm->total_vm;
}

#ifdef CONFIG_NUMA
/*
 * Save get_task_policy() for show_numa_map().
 */
static void hold_task_mempolicy(struct proc_maps_private *priv)
{
	struct task_struct *task = priv->task;

	task_lock(task);
	priv->task_mempolicy = get_task_policy(task);
	mpol_get(priv->task_mempolicy);
	task_unlock(task);
}
static void release_task_mempolicy(struct proc_maps_private *priv)
{
	mpol_put(priv->task_mempolicy);
}
#else
static void hold_task_mempolicy(struct proc_maps_private *priv)
{
}
static void release_task_mempolicy(struct proc_maps_private *priv)
{
}
#endif

#ifdef CONFIG_PER_VMA_LOCK

static void reset_lock_ctx(struct proc_maps_locking_ctx *lock_ctx)
{
	lock_ctx->locked_vma = NULL;
	lock_ctx->mmap_locked = false;
}

static void unlock_ctx_vma(struct proc_maps_locking_ctx *lock_ctx)
{
	if (lock_ctx->locked_vma) {
		vma_end_read(lock_ctx->locked_vma);
		lock_ctx->locked_vma = NULL;
	}
}

static const struct seq_operations proc_pid_maps_op;

static inline bool lock_vma_range(struct seq_file *m,
				  struct proc_maps_locking_ctx *lock_ctx)
{
	/*
	 * smaps and numa_maps perform page table walk, therefore require
	 * mmap_lock but maps can be read with locking just the vma and
	 * walking the vma tree under rcu read protection.
	 */
	if (m->op != &proc_pid_maps_op) {
		if (mmap_read_lock_killable(lock_ctx->mm))
			return false;

		lock_ctx->mmap_locked = true;
	} else {
		rcu_read_lock();
		reset_lock_ctx(lock_ctx);
	}

	return true;
}

static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
{
	if (lock_ctx->mmap_locked) {
		mmap_read_unlock(lock_ctx->mm);
	} else {
		unlock_ctx_vma(lock_ctx);
		rcu_read_unlock();
	}
}

static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
					   loff_t last_pos)
{
	struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
	struct vm_area_struct *vma;

	if (lock_ctx->mmap_locked)
		return vma_next(&priv->iter);

	unlock_ctx_vma(lock_ctx);
	vma = lock_next_vma(lock_ctx->mm, &priv->iter, last_pos);
	if (!IS_ERR_OR_NULL(vma))
		lock_ctx->locked_vma = vma;

	return vma;
}

static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
					 loff_t pos)
{
	struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;

	if (lock_ctx->mmap_locked)
		return false;

	rcu_read_unlock();
	mmap_read_lock(lock_ctx->mm);
	/* Reinitialize the iterator after taking mmap_lock */
	vma_iter_set(&priv->iter, pos);
	lock_ctx->mmap_locked = true;

	return true;
}

#else /* CONFIG_PER_VMA_LOCK */

static inline bool lock_vma_range(struct seq_file *m,
				  struct proc_maps_locking_ctx *lock_ctx)
{
	return mmap_read_lock_killable(lock_ctx->mm) == 0;
}

static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
{
	mmap_read_unlock(lock_ctx->mm);
}

static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
					   loff_t last_pos)
{
	return vma_next(&priv->iter);
}

static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
					 loff_t pos)
{
	return false;
}

#endif /* CONFIG_PER_VMA_LOCK */

static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos)
{
	struct proc_maps_private *priv = m->private;
	struct vm_area_struct *vma;

retry:
	vma = get_next_vma(priv, *ppos);
	/* EINTR or EAGAIN is possible */
	if (IS_ERR(vma)) {
		if (PTR_ERR(vma) == -EAGAIN && fallback_to_mmap_lock(priv, *ppos))
			goto retry;

		return vma;
	}

	/* Store previous position to be able to restart if needed */
	priv->last_pos = *ppos;
	if (vma) {
		/*
		 * Track the end of the reported vma to ensure position changes
		 * even if previous vma was merged with the next vma and we
		 * found the extended vma with the same vm_start.
		 */
		*ppos = vma->vm_end;
	} else {
		*ppos = SENTINEL_VMA_GATE;
		vma = get_gate_vma(priv->lock_ctx.mm);
	}

	return vma;
}

static void *m_start(struct seq_file *m, loff_t *ppos)
{
	struct proc_maps_private *priv = m->private;
	struct proc_maps_locking_ctx *lock_ctx;
	loff_t last_addr = *ppos;
	struct mm_struct *mm;

	/* See m_next(). Zero at the start or after lseek. */
	if (last_addr == SENTINEL_VMA_END)
		return NULL;

	priv->task = get_proc_task(priv->inode);
	if (!priv->task)
		return ERR_PTR(-ESRCH);

	lock_ctx = &priv->lock_ctx;
	mm = lock_ctx->mm;
	if (!mm || !mmget_not_zero(mm)) {
		put_task_struct(priv->task);
		priv->task = NULL;
		return NULL;
	}

	if (!lock_vma_range(m, lock_ctx)) {
		mmput(mm);
		put_task_struct(priv->task);
		priv->task = NULL;
		return ERR_PTR(-EINTR);
	}

	/*
	 * Reset current position if last_addr was set before
	 * and it's not a sentinel.
	 */
	if (last_addr > 0)
		*ppos = last_addr = priv->last_pos;
	vma_iter_init(&priv->iter, mm, (unsigned long)last_addr);
	hold_task_mempolicy(priv);
	if (last_addr == SENTINEL_VMA_GATE)
		return get_gate_vma(mm);

	return proc_get_vma(m, ppos);
}

static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
{
	if (*ppos == SENTINEL_VMA_GATE) {
		*ppos = SENTINEL_VMA_END;
		return NULL;
	}
	return proc_get_vma(m, ppos);
}

static void m_stop(struct seq_file *m, void *v)
{
	struct proc_maps_private *priv = m->private;
	struct mm_struct *mm = priv->lock_ctx.mm;

	if (!priv->task)
		return;

	release_task_mempolicy(priv);
	unlock_vma_range(&priv->lock_ctx);
	mmput(mm);
	put_task_struct(priv->task);
	priv->task = NULL;
}

static int proc_maps_open(struct inode *inode, struct file *file,
			const struct seq_operations *ops, int psize)
{
	struct proc_maps_private *priv = __seq_open_private(file, ops, psize);

	if (!priv)
		return -ENOMEM;

	priv->inode = inode;
	priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ);
	if (IS_ERR(priv->lock_ctx.mm)) {
		int err = PTR_ERR(priv->lock_ctx.mm);

		seq_release_private(inode, file);
		return err;
	}

	return 0;
}

static int proc_map_release(struct inode *inode, struct file *file)
{
	struct seq_file *seq = file->private_data;
	struct proc_maps_private *priv = seq->private;

	if (priv->lock_ctx.mm)
		mmdrop(priv->lock_ctx.mm);

	return seq_release_private(inode, file);
}

static int do_maps_open(struct inode *inode, struct file *file,
			const struct seq_operations *ops)
{
	return proc_maps_open(inode, file, ops,
				sizeof(struct proc_maps_private));
}

static void get_vma_name(struct vm_area_struct *vma,
			 const struct path **path,
			 const char **name,
			 const char **name_fmt)
{
	struct anon_vma_name *anon_name = vma->vm_mm ? anon_vma_name(vma) : NULL;

	*name = NULL;
	*path = NULL;
	*name_fmt = NULL;

	/*
	 * Print the dentry name for named mappings, and a
	 * special [heap] marker for the heap:
	 */
	if (vma->vm_file) {
		/*
		 * If user named this anon shared memory via
		 * prctl(PR_SET_VMA ..., use the provided name.
		 */
		if (anon_name) {
			*name_fmt = "[anon_shmem:%s]";
			*name = anon_name->name;
		} else {
			*path = file_user_path(vma->vm_file);
		}
		return;
	}

	if (vma->vm_ops && vma->vm_ops->name) {
		*name = vma->vm_ops->name(vma);
		if (*name)
			return;
	}

	*name = arch_vma_name(vma);
	if (*name)
		return;

	if (!vma->vm_mm) {
		*name = "[vdso]";
		return;
	}

	if (vma_is_initial_heap(vma)) {
		*name = "[heap]";
		return;
	}

	if (vma_is_initial_stack(vma)) {
		*name = "[stack]";
		return;
	}

	if (anon_name) {
		*name_fmt = "[anon:%s]";
		*name = anon_name->name;
		return;
	}
}

static void show_vma_header_prefix(struct seq_file *m,
				   unsigned long start, unsigned long end,
				   vm_flags_t flags, unsigned long long pgoff,
				   dev_t dev, unsigned long ino)
{
	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
	seq_put_hex_ll(m, NULL, start, 8);
	seq_put_hex_ll(m, "-", end, 8);
	seq_putc(m, ' ');
	seq_putc(m, flags & VM_READ ? 'r' : '-');
	seq_putc(m, flags & VM_WRITE ? 'w' : '-');
	seq_putc(m, flags & VM_EXEC ? 'x' : '-');
	seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p');
	seq_put_hex_ll(m, " ", pgoff, 8);
	seq_put_hex_ll(m, " ", MAJOR(dev), 2);
	seq_put_hex_ll(m, ":", MINOR(dev), 2);
	seq_put_decimal_ull(m, " ", ino);
	seq_putc(m, ' ');
}

static void
show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
{
	const struct path *path;
	const char *name_fmt, *name;
	vm_flags_t flags = vma->vm_flags;
	unsigned long ino = 0;
	unsigned long long pgoff = 0;
	unsigned long start, end;
	dev_t dev = 0;

	if (vma->vm_file) {
		const struct inode *inode = file_user_inode(vma->vm_file);

		dev = inode->i_sb->s_dev;
		ino = inode->i_ino;
		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
	}

	start = vma->vm_start;
	end = vma->vm_end;
	show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);

	get_vma_name(vma, &path, &name, &name_fmt);
	if (path) {
		seq_pad(m, ' ');
		seq_path(m, path, "\n");
	} else if (name_fmt) {
		seq_pad(m, ' ');
		seq_printf(m, name_fmt, name);
	} else if (name) {
		seq_pad(m, ' ');
		seq_puts(m, name);
	}
	seq_putc(m, '\n');
}

static int show_map(struct seq_file *m, void *v)
{
	show_map_vma(m, v);
	return 0;
}

static const struct seq_operations proc_pid_maps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_map
};

static int pid_maps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_pid_maps_op);
}

#define PROCMAP_QUERY_VMA_FLAGS (			\
		PROCMAP_QUERY_VMA_READABLE |		\
		PROCMAP_QUERY_VMA_WRITABLE |		\
		PROCMAP_QUERY_VMA_EXECUTABLE |		\
		PROCMAP_QUERY_VMA_SHARED		\
)

#define PROCMAP_QUERY_VALID_FLAGS_MASK (		\
		PROCMAP_QUERY_COVERING_OR_NEXT_VMA |	\
		PROCMAP_QUERY_FILE_BACKED_VMA |		\
		PROCMAP_QUERY_VMA_FLAGS			\
)

#ifdef CONFIG_PER_VMA_LOCK

static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
{
	reset_lock_ctx(lock_ctx);

	return 0;
}

static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx)
{
	if (lock_ctx->mmap_locked) {
		mmap_read_unlock(lock_ctx->mm);
		lock_ctx->mmap_locked = false;
	} else {
		unlock_ctx_vma(lock_ctx);
	}
}

static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
						     unsigned long addr)
{
	struct mm_struct *mm = lock_ctx->mm;
	struct vm_area_struct *vma;
	struct vma_iterator vmi;

	if (lock_ctx->mmap_locked)
		return find_vma(mm, addr);

	/* Unlock previously locked VMA and find the next one under RCU */
	unlock_ctx_vma(lock_ctx);
	rcu_read_lock();
	vma_iter_init(&vmi, mm, addr);
	vma = lock_next_vma(mm, &vmi, addr);
	rcu_read_unlock();

	if (!vma)
		return NULL;

	if (!IS_ERR(vma)) {
		lock_ctx->locked_vma = vma;
		return vma;
	}

	if (PTR_ERR(vma) == -EAGAIN) {
		/* Fallback to mmap_lock on vma->vm_refcnt overflow */
		mmap_read_lock(mm);
		vma = find_vma(mm, addr);
		lock_ctx->mmap_locked = true;
	}

	return vma;
}

#else /* CONFIG_PER_VMA_LOCK */

static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
{
	return mmap_read_lock_killable(lock_ctx->mm);
}

static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx)
{
	mmap_read_unlock(lock_ctx->mm);
}

static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
						     unsigned long addr)
{
	return find_vma(lock_ctx->mm, addr);
}

#endif /* CONFIG_PER_VMA_LOCK */

static struct vm_area_struct *query_matching_vma(struct proc_maps_locking_ctx *lock_ctx,
						 unsigned long addr, u32 flags)
{
	struct vm_area_struct *vma;

next_vma:
	vma = query_vma_find_by_addr(lock_ctx, addr);
	if (IS_ERR(vma))
		return vma;

	if (!vma)
		goto no_vma;

	/* user requested only file-backed VMA, keep iterating */
	if ((flags & PROCMAP_QUERY_FILE_BACKED_VMA) && !vma->vm_file)
		goto skip_vma;

	/* VMA permissions should satisfy query flags */
	if (flags & PROCMAP_QUERY_VMA_FLAGS) {
		u32 perm = 0;

		if (flags & PROCMAP_QUERY_VMA_READABLE)
			perm |= VM_READ;
		if (flags & PROCMAP_QUERY_VMA_WRITABLE)
			perm |= VM_WRITE;
		if (flags & PROCMAP_QUERY_VMA_EXECUTABLE)
			perm |= VM_EXEC;
		if (flags & PROCMAP_QUERY_VMA_SHARED)
			perm |= VM_MAYSHARE;

		if ((vma->vm_flags & perm) != perm)
			goto skip_vma;
	}

	/* found covering VMA or user is OK with the matching next VMA */
	if ((flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) || vma->vm_start <= addr)
		return vma;

skip_vma:
	/*
	 * If the user needs closest matching VMA, keep iterating.
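	 * Otherwise, a VMA that failed the filters above means there is
	 * no acceptable match for this query and we report -ENOENT.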
	 */
	addr = vma->vm_end;
	if (flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA)
		goto next_vma;

no_vma:
	return ERR_PTR(-ENOENT);
}

static int do_procmap_query(struct mm_struct *mm, void __user *uarg)
{
	struct proc_maps_locking_ctx lock_ctx = { .mm = mm };
	struct procmap_query karg;
	struct vm_area_struct *vma;
	struct file *vm_file = NULL;
	const char *name = NULL;
	char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL;
	__u64 usize;
	int err;

	if (copy_from_user(&usize, (void __user *)uarg, sizeof(usize)))
		return -EFAULT;
	/* argument struct can never be that large, reject abuse */
	if (usize > PAGE_SIZE)
		return -E2BIG;
	/* argument struct should have at least query_flags and query_addr fields */
	if (usize < offsetofend(struct procmap_query, query_addr))
		return -EINVAL;
	err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize);
	if (err)
		return err;

	/* reject unknown flags */
	if (karg.query_flags & ~PROCMAP_QUERY_VALID_FLAGS_MASK)
		return -EINVAL;
	/* either both buffer address and size are set, or both should be zero */
	if (!!karg.vma_name_size != !!karg.vma_name_addr)
		return -EINVAL;
	if (!!karg.build_id_size != !!karg.build_id_addr)
		return -EINVAL;

	if (!mm || !mmget_not_zero(mm))
		return -ESRCH;

	err = query_vma_setup(&lock_ctx);
	if (err) {
		mmput(mm);
		return err;
	}

	vma = query_matching_vma(&lock_ctx, karg.query_addr, karg.query_flags);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		vma = NULL;
		goto out;
	}

	karg.vma_start = vma->vm_start;
	karg.vma_end = vma->vm_end;

	karg.vma_flags = 0;
	if (vma->vm_flags & VM_READ)
		karg.vma_flags |= PROCMAP_QUERY_VMA_READABLE;
	if (vma->vm_flags & VM_WRITE)
		karg.vma_flags |= PROCMAP_QUERY_VMA_WRITABLE;
	if (vma->vm_flags & VM_EXEC)
		karg.vma_flags |= PROCMAP_QUERY_VMA_EXECUTABLE;
	if (vma->vm_flags & VM_MAYSHARE)
		karg.vma_flags |= PROCMAP_QUERY_VMA_SHARED;

	karg.vma_page_size = vma_kernel_pagesize(vma);

	if (vma->vm_file) {
		const struct inode *inode = file_user_inode(vma->vm_file);

		karg.vma_offset = ((__u64)vma->vm_pgoff) << PAGE_SHIFT;
		karg.dev_major = MAJOR(inode->i_sb->s_dev);
		karg.dev_minor = MINOR(inode->i_sb->s_dev);
		karg.inode = inode->i_ino;
	} else {
		karg.vma_offset = 0;
		karg.dev_major = 0;
		karg.dev_minor = 0;
		karg.inode = 0;
	}

	if (karg.vma_name_size) {
		size_t name_buf_sz = min_t(size_t, PATH_MAX, karg.vma_name_size);
		const struct path *path;
		const char *name_fmt;
		size_t name_sz = 0;

		get_vma_name(vma, &path, &name, &name_fmt);

		if (path || name_fmt || name) {
			name_buf = kmalloc(name_buf_sz, GFP_KERNEL);
			if (!name_buf) {
				err = -ENOMEM;
				goto out;
			}
		}
		if (path) {
			name = d_path(path, name_buf, name_buf_sz);
			if (IS_ERR(name)) {
				err = PTR_ERR(name);
				goto out;
			}
			name_sz = name_buf + name_buf_sz - name;
		} else if (name || name_fmt) {
			name_sz = 1 + snprintf(name_buf, name_buf_sz, name_fmt ?: "%s", name);
			name = name_buf;
		}
		if (name_sz > name_buf_sz) {
			err = -ENAMETOOLONG;
			goto out;
		}
		karg.vma_name_size = name_sz;
	}

	if (karg.build_id_size && vma->vm_file)
		vm_file = get_file(vma->vm_file);

	/* unlock vma or mmap_lock, and put mm_struct before copying data to user */
	query_vma_teardown(&lock_ctx);
	mmput(mm);

	if (karg.build_id_size) {
		__u32 build_id_sz;

		if (vm_file)
			err = build_id_parse_file(vm_file, build_id_buf, &build_id_sz);
		else
			err = -ENOENT;
		if (err) {
			karg.build_id_size = 0;
		} else {
			if (karg.build_id_size < build_id_sz) {
				err = -ENAMETOOLONG;
				goto out_file;
			}
			karg.build_id_size = build_id_sz;
		}
	}

	if (vm_file)
		fput(vm_file);

	if (karg.vma_name_size && copy_to_user(u64_to_user_ptr(karg.vma_name_addr),
					       name, karg.vma_name_size)) {
		kfree(name_buf);
		return -EFAULT;
	}
	kfree(name_buf);

	if (karg.build_id_size && copy_to_user(u64_to_user_ptr(karg.build_id_addr),
					       build_id_buf, karg.build_id_size))
		return -EFAULT;

	if (copy_to_user(uarg, &karg, min_t(size_t, sizeof(karg), usize)))
		return -EFAULT;

	return 0;

out:
	query_vma_teardown(&lock_ctx);
	mmput(mm);
out_file:
	if (vm_file)
		fput(vm_file);
	kfree(name_buf);
	return err;
}

static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct seq_file *seq = file->private_data;
	struct proc_maps_private *priv = seq->private;

	switch (cmd) {
	case PROCMAP_QUERY:
		/* priv->lock_ctx.mm is set during file open operation */
		return do_procmap_query(priv->lock_ctx.mm, (void __user *)arg);
	default:
		return -ENOIOCTLCMD;
	}
}

const struct file_operations proc_pid_maps_operations = {
	.open		= pid_maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_map_release,
	.unlocked_ioctl	= procfs_procmap_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
};

/*
 * Proportional Set Size(PSS): my share of RSS.
 *
 * PSS of a process is the count of pages it has in memory, where each
 * page is divided by the number of processes sharing it. So if a
 * process has 1000 pages all to itself, and 1000 shared with one other
 * process, its PSS will be 1500.
 *
 * To keep (accumulated) division errors low, we adopt a 64bit
 * fixed-point pss counter to minimize division errors. So (pss >>
 * PSS_SHIFT) would be the real byte count.
 *
 * A shift of 12 before division means (assuming 4K page size):
 * 	- 1M 3-user-pages add up to 8KB errors;
 * 	- supports mapcount up to 2^24, or 16M;
 * 	- supports PSS up to 2^52 bytes, or 4PB.
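 *
 * For example (4K pages): a page shared by 3 processes adds
 * (4096 << PSS_SHIFT) / 3 = 5592405 to each sharer's pss counter,
 * i.e. 1365 bytes after the final >> PSS_SHIFT, within one byte of
 * the exact 4096/3 share.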
 */
#define PSS_SHIFT 12

#ifdef CONFIG_PROC_PAGE_MONITOR
struct mem_size_stats {
	unsigned long resident;
	unsigned long shared_clean;
	unsigned long shared_dirty;
	unsigned long private_clean;
	unsigned long private_dirty;
	unsigned long referenced;
	unsigned long anonymous;
	unsigned long lazyfree;
	unsigned long anonymous_thp;
	unsigned long shmem_thp;
	unsigned long file_thp;
	unsigned long swap;
	unsigned long shared_hugetlb;
	unsigned long private_hugetlb;
	unsigned long ksm;
	u64 pss;
	u64 pss_anon;
	u64 pss_file;
	u64 pss_shmem;
	u64 pss_dirty;
	u64 pss_locked;
	u64 swap_pss;
};

static void smaps_page_accumulate(struct mem_size_stats *mss,
		struct folio *folio, unsigned long size, unsigned long pss,
		bool dirty, bool locked, bool private)
{
	mss->pss += pss;

	if (folio_test_anon(folio))
		mss->pss_anon += pss;
	else if (folio_test_swapbacked(folio))
		mss->pss_shmem += pss;
	else
		mss->pss_file += pss;

	if (locked)
		mss->pss_locked += pss;

	if (dirty || folio_test_dirty(folio)) {
		mss->pss_dirty += pss;
		if (private)
			mss->private_dirty += size;
		else
			mss->shared_dirty += size;
	} else {
		if (private)
			mss->private_clean += size;
		else
			mss->shared_clean += size;
	}
}

static void smaps_account(struct mem_size_stats *mss, struct page *page,
		bool compound, bool young, bool dirty, bool locked,
		bool present)
{
	struct folio *folio = page_folio(page);
	int i, nr = compound ? compound_nr(page) : 1;
	unsigned long size = nr * PAGE_SIZE;
	bool exclusive;
	int mapcount;

	/*
	 * First accumulate quantities that depend only on |size| and the type
	 * of the compound page.
	 */
	if (folio_test_anon(folio)) {
		mss->anonymous += size;
		if (!folio_test_swapbacked(folio) && !dirty &&
		    !folio_test_dirty(folio))
			mss->lazyfree += size;
	}

	if (folio_test_ksm(folio))
		mss->ksm += size;

	mss->resident += size;
	/* Accumulate the size in pages that have been accessed. */
	if (young || folio_test_young(folio) || folio_test_referenced(folio))
		mss->referenced += size;

	/*
	 * Then accumulate quantities that may depend on sharing, or that may
	 * differ page-by-page.
	 *
	 * refcount == 1 for present entries guarantees that the folio is mapped
	 * exactly once. For large folios this implies that exactly one
	 * PTE/PMD/... maps (a part of) this folio.
	 *
	 * Treat all non-present entries (where relying on the mapcount and
	 * refcount doesn't make sense) as "maybe shared, but not sure how
	 * often". We treat device private entries as being fake-present.
	 *
	 * Note that it would not be safe to read the mapcount especially for
	 * pages referenced by migration entries, even with the PTL held.
	 */
	if (folio_ref_count(folio) == 1 || !present) {
		smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT,
				      dirty, locked, present);
		return;
	}

	if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
		mapcount = folio_average_page_mapcount(folio);
		exclusive = !folio_maybe_mapped_shared(folio);
	}

	/*
	 * We obtain a snapshot of the mapcount. Without holding the folio lock
	 * this snapshot can be slightly wrong as we cannot always read the
	 * mapcount atomically.
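	 *
	 * Each mapped subpage below contributes PAGE_SIZE << PSS_SHIFT to
	 * pss, divided by its mapcount when it is mapped more than once.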
	 */
	for (i = 0; i < nr; i++, page++) {
		unsigned long pss = PAGE_SIZE << PSS_SHIFT;

		if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) {
			mapcount = folio_precise_page_mapcount(folio, page);
			exclusive = mapcount < 2;
		}

		if (mapcount >= 2)
			pss /= mapcount;
		smaps_page_accumulate(mss, folio, PAGE_SIZE, pss,
				      dirty, locked, exclusive);
	}
}

#ifdef CONFIG_SHMEM
static int smaps_pte_hole(unsigned long addr, unsigned long end,
			  __always_unused int depth, struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;
	struct vm_area_struct *vma = walk->vma;

	mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping,
					      linear_page_index(vma, addr),
					      linear_page_index(vma, end));

	return 0;
}
#else
#define smaps_pte_hole		NULL
#endif /* CONFIG_SHMEM */

static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk)
{
#ifdef CONFIG_SHMEM
	if (walk->ops->pte_hole) {
		/* depth is not used */
		smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk);
	}
#endif
}

static void smaps_pte_entry(pte_t *pte, unsigned long addr,
		struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;
	struct vm_area_struct *vma = walk->vma;
	bool locked = !!(vma->vm_flags & VM_LOCKED);
	struct page *page = NULL;
	bool present = false, young = false, dirty = false;
	pte_t ptent = ptep_get(pte);

	if (pte_present(ptent)) {
		page = vm_normal_page(vma, addr, ptent);
		young = pte_young(ptent);
		dirty = pte_dirty(ptent);
		present = true;
	} else if (pte_none(ptent)) {
		smaps_pte_hole_lookup(addr, walk);
	} else {
		const softleaf_t entry = softleaf_from_pte(ptent);

		if (softleaf_is_swap(entry)) {
			int mapcount;

			mss->swap += PAGE_SIZE;
			mapcount = swp_swapcount(entry);
			if (mapcount >= 2) {
				u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;

				do_div(pss_delta, mapcount);
				mss->swap_pss += pss_delta;
			} else {
				mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
			}
		} else if (softleaf_has_pfn(entry)) {
			if (softleaf_is_device_private(entry))
				present = true;
			page = softleaf_to_page(entry);
		}
	}

	if (!page)
		return;

	smaps_account(mss, page, false, young, dirty, locked, present);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
		struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;
	struct vm_area_struct *vma = walk->vma;
	bool locked = !!(vma->vm_flags & VM_LOCKED);
	struct page *page = NULL;
	bool present = false;
	struct folio *folio;

	if (pmd_none(*pmd))
		return;
	if (pmd_present(*pmd)) {
		page = vm_normal_page_pmd(vma, addr, *pmd);
		present = true;
	} else if (unlikely(thp_migration_supported())) {
		const softleaf_t entry = softleaf_from_pmd(*pmd);

		if (softleaf_has_pfn(entry))
			page = softleaf_to_page(entry);
	}
	if (IS_ERR_OR_NULL(page))
		return;
	folio = page_folio(page);
	if (folio_test_anon(folio))
		mss->anonymous_thp += HPAGE_PMD_SIZE;
	else if (folio_test_swapbacked(folio))
		mss->shmem_thp += HPAGE_PMD_SIZE;
	else if (folio_is_zone_device(folio))
		/* pass */;
	else
		mss->file_thp += HPAGE_PMD_SIZE;

	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
		      locked, present);
}
#else
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
		struct mm_walk *walk)
{
}
#endif

static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			   struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		smaps_pmd_entry(pmd, addr, walk);
		spin_unlock(ptl);
		goto out;
	}

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!pte) {
		walk->action = ACTION_AGAIN;
		return 0;
	}
	for (; addr != end; pte++, addr += PAGE_SIZE)
		smaps_pte_entry(pte, addr, walk);
	pte_unmap_unlock(pte - 1, ptl);
out:
	cond_resched();
	return 0;
}

static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
{
	/*
	 * Don't forget to update Documentation/ on changes.
	 *
	 * The length of the second argument of mnemonics[]
	 * needs to be 3 instead of previously set 2
	 * (i.e. from [BITS_PER_LONG][2] to [BITS_PER_LONG][3])
	 * to avoid spurious
	 * -Werror=unterminated-string-initialization warning
	 * with GCC 15
	 */
	static const char mnemonics[BITS_PER_LONG][3] = {
		/*
		 * In case if we meet a flag we don't know about.
		 */
		[0 ... (BITS_PER_LONG-1)] = "??",

		[ilog2(VM_READ)]	= "rd",
		[ilog2(VM_WRITE)]	= "wr",
		[ilog2(VM_EXEC)]	= "ex",
		[ilog2(VM_SHARED)]	= "sh",
		[ilog2(VM_MAYREAD)]	= "mr",
		[ilog2(VM_MAYWRITE)]	= "mw",
		[ilog2(VM_MAYEXEC)]	= "me",
		[ilog2(VM_MAYSHARE)]	= "ms",
		[ilog2(VM_GROWSDOWN)]	= "gd",
		[ilog2(VM_PFNMAP)]	= "pf",
		[ilog2(VM_MAYBE_GUARD)]	= "gu",
		[ilog2(VM_LOCKED)]	= "lo",
		[ilog2(VM_IO)]		= "io",
		[ilog2(VM_SEQ_READ)]	= "sr",
		[ilog2(VM_RAND_READ)]	= "rr",
		[ilog2(VM_DONTCOPY)]	= "dc",
		[ilog2(VM_DONTEXPAND)]	= "de",
		[ilog2(VM_LOCKONFAULT)]	= "lf",
		[ilog2(VM_ACCOUNT)]	= "ac",
		[ilog2(VM_NORESERVE)]	= "nr",
		[ilog2(VM_HUGETLB)]	= "ht",
		[ilog2(VM_SYNC)]	= "sf",
		[ilog2(VM_ARCH_1)]	= "ar",
		[ilog2(VM_WIPEONFORK)]	= "wf",
		[ilog2(VM_DONTDUMP)]	= "dd",
#ifdef CONFIG_ARM64_BTI
		[ilog2(VM_ARM64_BTI)]	= "bt",
#endif
#ifdef CONFIG_MEM_SOFT_DIRTY
		[ilog2(VM_SOFTDIRTY)]	= "sd",
#endif
		[ilog2(VM_MIXEDMAP)]	= "mm",
		[ilog2(VM_HUGEPAGE)]	= "hg",
		[ilog2(VM_NOHUGEPAGE)]	= "nh",
		[ilog2(VM_MERGEABLE)]	= "mg",
		[ilog2(VM_UFFD_MISSING)]= "um",
		[ilog2(VM_UFFD_WP)]	= "uw",
#ifdef CONFIG_ARM64_MTE
		[ilog2(VM_MTE)]		= "mt",
		[ilog2(VM_MTE_ALLOWED)]	= "",
#endif
#ifdef CONFIG_ARCH_HAS_PKEYS
		/* These come out via ProtectionKey: */
		[ilog2(VM_PKEY_BIT0)]	= "",
		[ilog2(VM_PKEY_BIT1)]	= "",
		[ilog2(VM_PKEY_BIT2)]	= "",
#if CONFIG_ARCH_PKEY_BITS > 3
		[ilog2(VM_PKEY_BIT3)]	= "",
#endif
#if CONFIG_ARCH_PKEY_BITS > 4
		[ilog2(VM_PKEY_BIT4)]	= "",
#endif
#endif /* CONFIG_ARCH_HAS_PKEYS */
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
		[ilog2(VM_UFFD_MINOR)]	= "ui",
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK
		[ilog2(VM_SHADOW_STACK)] = "ss",
#endif
#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
		[ilog2(VM_DROPPABLE)] = "dp",
#endif
#ifdef CONFIG_64BIT
		[ilog2(VM_SEALED)] = "sl",
#endif
	};
	size_t i;

	seq_puts(m, "VmFlags: ");
	for (i = 0; i < BITS_PER_LONG; i++) {
		if (!mnemonics[i][0])
			continue;
		if (vma->vm_flags & (1UL << i))
			seq_printf(m, "%s ", mnemonics[i]);
	}
	seq_putc(m, '\n');
}

#ifdef CONFIG_HUGETLB_PAGE
static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
				 unsigned long addr, unsigned long end,
				 struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;
	struct vm_area_struct *vma = walk->vma;
	struct folio *folio = NULL;
	bool present = false;
	spinlock_t *ptl;
	pte_t ptent;

	ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
	ptent = huge_ptep_get(walk->mm, addr, pte);
	if (pte_present(ptent)) {
		folio = page_folio(pte_page(ptent));
		present = true;
	} else {
		const softleaf_t entry = softleaf_from_pte(ptent);

		if (softleaf_has_pfn(entry))
			folio = softleaf_to_folio(entry);
	}

	if (folio) {
		/* We treat non-present entries as "maybe shared". */
		if (!present || folio_maybe_mapped_shared(folio) ||
		    hugetlb_pmd_shared(pte))
			mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
		else
			mss->private_hugetlb += huge_page_size(hstate_vma(vma));
	}
	spin_unlock(ptl);
	return 0;
}
#else
#define smaps_hugetlb_range	NULL
#endif /* HUGETLB_PAGE */

static const struct mm_walk_ops smaps_walk_ops = {
	.pmd_entry		= smaps_pte_range,
	.hugetlb_entry		= smaps_hugetlb_range,
	.walk_lock		= PGWALK_RDLOCK,
};

static const struct mm_walk_ops smaps_shmem_walk_ops = {
	.pmd_entry		= smaps_pte_range,
	.hugetlb_entry		= smaps_hugetlb_range,
	.pte_hole		= smaps_pte_hole,
	.walk_lock		= PGWALK_RDLOCK,
};

/*
 * Gather mem stats from @vma with the indicated beginning
 * address @start, and keep them in @mss.
 *
 * Use vm_start of @vma as the beginning address if @start is 0.
 */
static void smap_gather_stats(struct vm_area_struct *vma,
		struct mem_size_stats *mss, unsigned long start)
{
	const struct mm_walk_ops *ops = &smaps_walk_ops;

	/* Invalid start */
	if (start >= vma->vm_end)
		return;

	if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
		/*
		 * For shared or readonly shmem mappings we know that all
		 * swapped out pages belong to the shmem object, and we can
		 * obtain the swap value much more efficiently. For private
		 * writable mappings, we might have COW pages that are
		 * not affected by the parent swapped out pages of the shmem
		 * object, so we have to distinguish them during the page walk.
		 * Unless we know that the shmem object (or the part mapped by
		 * our VMA) has no swapped out pages at all.
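		 *
		 * In short: the cheap shmem_swap_usage() value is used
		 * directly only when walking the whole VMA of a shared or
		 * read-only mapping (or when there is no shmem swap at all);
		 * otherwise the pte_hole-aware smaps_shmem_walk_ops does the
		 * accounting during the walk.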
		 */
		unsigned long shmem_swapped = shmem_swap_usage(vma);

		if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
					!(vma->vm_flags & VM_WRITE))) {
			mss->swap += shmem_swapped;
		} else {
			ops = &smaps_shmem_walk_ops;
		}
	}

	/* mmap_lock is held in m_start */
	if (!start)
		walk_page_vma(vma, ops, mss);
	else
		walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss);
}

#define SEQ_PUT_DEC(str, val) \
		seq_put_decimal_ull_width(m, str, (val) >> 10, 8)

/* Show the contents common for smaps and smaps_rollup */
static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
	bool rollup_mode)
{
	SEQ_PUT_DEC("Rss: ", mss->resident);
	SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT);
	SEQ_PUT_DEC(" kB\nPss_Dirty: ", mss->pss_dirty >> PSS_SHIFT);
	if (rollup_mode) {
		/*
		 * These are meaningful only for smaps_rollup, otherwise two of
		 * them are zero, and the other one is the same as Pss.
		 */
		SEQ_PUT_DEC(" kB\nPss_Anon: ",
			mss->pss_anon >> PSS_SHIFT);
		SEQ_PUT_DEC(" kB\nPss_File: ",
			mss->pss_file >> PSS_SHIFT);
		SEQ_PUT_DEC(" kB\nPss_Shmem: ",
			mss->pss_shmem >> PSS_SHIFT);
	}
	SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean);
	SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty);
	SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean);
	SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty);
	SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced);
	SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous);
	SEQ_PUT_DEC(" kB\nKSM: ", mss->ksm);
	SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree);
	SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp);
	SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
	SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp);
	SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
	seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
				  mss->private_hugetlb >> 10, 7);
	SEQ_PUT_DEC(" kB\nSwap: ", mss->swap);
	SEQ_PUT_DEC(" kB\nSwapPss: ",
					mss->swap_pss >> PSS_SHIFT);
	SEQ_PUT_DEC(" kB\nLocked: ",
					mss->pss_locked >> PSS_SHIFT);
	seq_puts(m, " kB\n");
}

static int show_smap(struct seq_file *m, void *v)
{
	struct vm_area_struct *vma = v;
	struct mem_size_stats mss = {};

	smap_gather_stats(vma, &mss, 0);

	show_map_vma(m, vma);

	SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start);
	SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma));
	SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma));
	seq_puts(m, " kB\n");

	__show_smap(m, &mss, false);

	seq_printf(m, "THPeligible: %8u\n",
		   !!thp_vma_allowable_orders(vma, vma->vm_flags, TVA_SMAPS,
					      THP_ORDERS_ALL));

	if (arch_pkeys_enabled())
		seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
	show_smap_vma_flags(m, vma);

	return 0;
}

static int show_smaps_rollup(struct seq_file *m, void *v)
{
	struct proc_maps_private *priv = m->private;
	struct mem_size_stats mss = {};
	struct mm_struct *mm = priv->lock_ctx.mm;
	struct vm_area_struct *vma;
	unsigned long vma_start = 0, last_vma_end = 0;
	int ret = 0;
	VMA_ITERATOR(vmi, mm, 0);

	priv->task = get_proc_task(priv->inode);
	if (!priv->task)
		return -ESRCH;

	if (!mm || !mmget_not_zero(mm)) {
		ret = -ESRCH;
		goto out_put_task;
	}

	ret = mmap_read_lock_killable(mm);
	if (ret)
		goto out_put_mm;

	hold_task_mempolicy(priv);
	vma = vma_next(&vmi);

	if (unlikely(!vma))
		goto empty_set;

	vma_start = vma->vm_start;
	do {
		smap_gather_stats(vma, &mss, 0);
		last_vma_end = vma->vm_end;

		/*
		 * Release mmap_lock temporarily if someone wants to
		 * access it for write request.
		 */
		if (mmap_lock_is_contended(mm)) {
			vma_iter_invalidate(&vmi);
			mmap_read_unlock(mm);
			ret = mmap_read_lock_killable(mm);
			if (ret) {
				release_task_mempolicy(priv);
				goto out_put_mm;
			}

			/*
			 * After dropping the lock, there are four cases to
			 * consider. See the following example for explanation.
			 *
			 *   +------+------+-----------+
			 *   | VMA1 | VMA2 | VMA3      |
			 *   +------+------+-----------+
			 *   |      |      |           |
			 *  4k     8k     16k         400k
			 *
			 * Suppose we drop the lock after reading VMA2 due to
			 * contention, then we get:
			 *
			 *	last_vma_end = 16k
			 *
			 * 1) VMA2 is freed, but VMA3 exists:
			 *
			 *    vma_next(vmi) will return VMA3.
			 *    In this case, just continue from VMA3.
			 *
			 * 2) VMA2 still exists:
			 *
			 *    vma_next(vmi) will return VMA3.
			 *    In this case, just continue from VMA3.
			 *
			 * 3) No more VMAs can be found:
			 *
			 *    vma_next(vmi) will return NULL.
			 *    No more things to do, just break.
			 *
			 * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
			 *
			 *    vma_next(vmi) will return VMA' whose range
			 *    contains last_vma_end.
			 *    Iterate VMA' from last_vma_end.
			 */
			vma = vma_next(&vmi);
			/* Case 3 above */
			if (!vma)
				break;

			/* Case 1 and 2 above */
			if (vma->vm_start >= last_vma_end) {
				smap_gather_stats(vma, &mss, 0);
				last_vma_end = vma->vm_end;
				continue;
			}

			/* Case 4 above */
			if (vma->vm_end > last_vma_end) {
				smap_gather_stats(vma, &mss, last_vma_end);
				last_vma_end = vma->vm_end;
			}
		}
	} for_each_vma(vmi, vma);

empty_set:
	show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0);
	seq_pad(m, ' ');
	seq_puts(m, "[rollup]\n");

	__show_smap(m, &mss, true);

	release_task_mempolicy(priv);
	mmap_read_unlock(mm);

out_put_mm:
	mmput(mm);
out_put_task:
	put_task_struct(priv->task);
	priv->task = NULL;

	return ret;
}
#undef SEQ_PUT_DEC

static const struct seq_operations proc_pid_smaps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_smap
};

static int pid_smaps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_pid_smaps_op);
}

static int smaps_rollup_open(struct inode *inode, struct file *file)
{
	int ret;
	struct proc_maps_private *priv;

	priv = kzalloc(sizeof(*priv), GFP_KERNEL_ACCOUNT);
	if (!priv)
		return -ENOMEM;

	ret = single_open(file, show_smaps_rollup, priv);
	if (ret)
		goto out_free;

	priv->inode = inode;
	priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ);
	if (IS_ERR_OR_NULL(priv->lock_ctx.mm)) {
		ret = priv->lock_ctx.mm ? PTR_ERR(priv->lock_ctx.mm) : -ESRCH;

		single_release(inode, file);
		goto out_free;
	}

	return 0;

out_free:
	kfree(priv);
	return ret;
}

static int smaps_rollup_release(struct inode *inode, struct file *file)
{
	struct seq_file *seq = file->private_data;
	struct proc_maps_private *priv = seq->private;

	if (priv->lock_ctx.mm)
		mmdrop(priv->lock_ctx.mm);

	kfree(priv);
	return single_release(inode, file);
}

const struct file_operations proc_pid_smaps_operations = {
	.open		= pid_smaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_map_release,
};

const struct file_operations proc_pid_smaps_rollup_operations = {
	.open		= smaps_rollup_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= smaps_rollup_release,
};

enum clear_refs_types {
	CLEAR_REFS_ALL = 1,
	CLEAR_REFS_ANON,
	CLEAR_REFS_MAPPED,
	CLEAR_REFS_SOFT_DIRTY,
	CLEAR_REFS_MM_HIWATER_RSS,
	CLEAR_REFS_LAST,
};

struct clear_refs_private {
	enum clear_refs_types type;
};

static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
	struct folio *folio;

	if (!pte_write(pte))
		return false;
	if (!is_cow_mapping(vma->vm_flags))
		return false;
	if (likely(!mm_flags_test(MMF_HAS_PINNED, vma->vm_mm)))
		return false;
	folio = vm_normal_folio(vma, addr, pte);
	if (!folio)
		return false;
	return folio_maybe_dma_pinned(folio);
}

static inline void clear_soft_dirty(struct vm_area_struct *vma,
		unsigned long addr, pte_t *pte)
{
	if (!pgtable_supports_soft_dirty())
		return;
	/*
	 * The soft-dirty tracker uses #PF-s to catch writes
	 * to pages, so write-protect the pte as well. See the
	 * Documentation/admin-guide/mm/soft-dirty.rst for full description
	 * of how soft-dirty works.
	 */
	pte_t ptent = ptep_get(pte);

	if (pte_none(ptent))
		return;

	if (pte_present(ptent)) {
		pte_t old_pte;

		if (pte_is_pinned(vma, addr, ptent))
			return;
		old_pte = ptep_modify_prot_start(vma, addr, pte);
		ptent = pte_wrprotect(old_pte);
		ptent = pte_clear_soft_dirty(ptent);
		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
	} else {
		ptent = pte_swp_clear_soft_dirty(ptent);
		set_pte_at(vma->vm_mm, addr, pte, ptent);
	}
}

#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
		unsigned long addr, pmd_t *pmdp)
{
	pmd_t old, pmd = *pmdp;

	if (!pgtable_supports_soft_dirty())
		return;

	if (pmd_present(pmd)) {
		/* See comment in change_huge_pmd() */
		old = pmdp_invalidate(vma, addr, pmdp);
		if (pmd_dirty(old))
			pmd = pmd_mkdirty(pmd);
		if (pmd_young(old))
			pmd = pmd_mkyoung(pmd);

		pmd = pmd_wrprotect(pmd);
		pmd = pmd_clear_soft_dirty(pmd);

		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
	} else if (pmd_is_migration_entry(pmd)) {
		pmd = pmd_swp_clear_soft_dirty(pmd);
		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
	}
}
#else
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
		unsigned long addr, pmd_t *pmdp)
{
}
#endif

static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	struct clear_refs_private *cp = walk->private;
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte, ptent;
	spinlock_t *ptl;
	struct folio *folio;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
			clear_soft_dirty_pmd(vma, addr, pmd);
			goto out;
		}

		if (!pmd_present(*pmd))
			goto out;

		folio = pmd_folio(*pmd);

		/* Clear accessed and referenced bits. */
		pmdp_test_and_clear_young(vma, addr, pmd);
		folio_test_clear_young(folio);
		folio_clear_referenced(folio);
out:
		spin_unlock(ptl);
		return 0;
	}

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!pte) {
		walk->action = ACTION_AGAIN;
		return 0;
	}
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = ptep_get(pte);

		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
			clear_soft_dirty(vma, addr, pte);
			continue;
		}

		if (!pte_present(ptent))
			continue;

		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio)
			continue;

		/* Clear accessed and referenced bits. */
		ptep_test_and_clear_young(vma, addr, pte);
		folio_test_clear_young(folio);
		folio_clear_referenced(folio);
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();
	return 0;
}

static int clear_refs_test_walk(unsigned long start, unsigned long end,
				struct mm_walk *walk)
{
	struct clear_refs_private *cp = walk->private;
	struct vm_area_struct *vma = walk->vma;

	if (vma->vm_flags & VM_PFNMAP)
		return 1;

	/*
	 * Writing 1 to /proc/pid/clear_refs affects all pages.
	 * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
	 * Writing 3 to /proc/pid/clear_refs only affects file mapped pages.
	 * Writing 4 to /proc/pid/clear_refs affects all pages.
	 */
	if (cp->type == CLEAR_REFS_ANON && vma->vm_file)
		return 1;
	if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file)
		return 1;
	return 0;
}

static const struct mm_walk_ops clear_refs_walk_ops = {
	.pmd_entry		= clear_refs_pte_range,
	.test_walk		= clear_refs_test_walk,
	.walk_lock		= PGWALK_WRLOCK,
};

static ssize_t clear_refs_write(struct file *file, const char __user *buf,
				size_t count, loff_t *ppos)
{
	struct task_struct *task;
	char buffer[PROC_NUMBUF] = {};
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	enum clear_refs_types type;
	int itype;
	int rv;

	if (count > sizeof(buffer) - 1)
		count = sizeof(buffer) - 1;
	if (copy_from_user(buffer, buf, count))
		return -EFAULT;
	rv = kstrtoint(strstrip(buffer), 10, &itype);
	if (rv < 0)
		return rv;
	type = (enum clear_refs_types)itype;
	if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
		return -EINVAL;

	task = get_proc_task(file_inode(file));
	if (!task)
		return -ESRCH;
	mm = get_task_mm(task);
	if (mm) {
		VMA_ITERATOR(vmi, mm, 0);
		struct mmu_notifier_range range;
		struct clear_refs_private cp = {
			.type = type,
		};

		if (mmap_write_lock_killable(mm)) {
			count = -EINTR;
			goto out_mm;
		}
		if (type == CLEAR_REFS_MM_HIWATER_RSS) {
			/*
			 * Writing 5 to /proc/pid/clear_refs resets the peak
			 * resident set size to this mm's current rss value.
			 */
			reset_mm_hiwater_rss(mm);
			goto out_unlock;
		}

		if (type == CLEAR_REFS_SOFT_DIRTY) {
			for_each_vma(vmi, vma) {
				if (!(vma->vm_flags & VM_SOFTDIRTY))
					continue;
				vm_flags_clear(vma, VM_SOFTDIRTY);
				vma_set_page_prot(vma);
			}

			inc_tlb_flush_pending(mm);
			mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
						0, mm, 0, -1UL);
			mmu_notifier_invalidate_range_start(&range);
		}
		walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp);
		if (type == CLEAR_REFS_SOFT_DIRTY) {
			mmu_notifier_invalidate_range_end(&range);
			flush_tlb_mm(mm);
			dec_tlb_flush_pending(mm);
		}
out_unlock:
		mmap_write_unlock(mm);
out_mm:
		mmput(mm);
	}
	put_task_struct(task);

	return count;
}

const struct file_operations proc_clear_refs_operations = {
	.write		= clear_refs_write,
	.llseek		= noop_llseek,
};

typedef struct {
	u64 pme;
} pagemap_entry_t;

struct pagemapread {
	int pos, len;		/* units: PM_ENTRY_BYTES, not bytes */
	pagemap_entry_t *buffer;
	bool show_pfn;
};

#define PAGEMAP_WALK_SIZE	(PMD_SIZE)
#define PAGEMAP_WALK_MASK	(PMD_MASK)

#define PM_ENTRY_BYTES		sizeof(pagemap_entry_t)
#define PM_PFRAME_BITS		55
#define PM_PFRAME_MASK		GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
#define PM_SOFT_DIRTY		BIT_ULL(55)
#define PM_MMAP_EXCLUSIVE	BIT_ULL(56)
#define PM_UFFD_WP		BIT_ULL(57)
#define PM_GUARD_REGION		BIT_ULL(58)
#define PM_FILE			BIT_ULL(61)
#define PM_SWAP			BIT_ULL(62)
#define PM_PRESENT		BIT_ULL(63)

#define PM_END_OF_BUFFER	1

static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
{
	return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
}

static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm)
{
	pm->buffer[pm->pos++] = *pme;
	if (pm->pos >= pm->len)
		return PM_END_OF_BUFFER;
	return 0;
}

static bool __folio_page_mapped_exclusively(struct folio *folio, struct page *page)
{
	if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
		return folio_precise_page_mapcount(folio, page) == 1;
	return !folio_maybe_mapped_shared(folio);
}

static int pagemap_pte_hole(unsigned long start, unsigned long end,
			    __always_unused int depth, struct mm_walk *walk)
{
	struct pagemapread *pm = walk->private;
	unsigned long addr = start;
	int err = 0;

	while (addr < end) {
		struct vm_area_struct *vma = find_vma(walk->mm, addr);
		pagemap_entry_t pme = make_pme(0, 0);
		/* End of address space hole, which we mark as non-present. */
		unsigned long hole_end;

		if (vma)
			hole_end = min(end, vma->vm_start);
		else
			hole_end = end;

		for (; addr < hole_end; addr += PAGE_SIZE) {
			err = add_to_pagemap(&pme, pm);
			if (err)
				goto out;
		}

		if (!vma)
			break;

		/* Addresses in the VMA. */
		if (vma->vm_flags & VM_SOFTDIRTY)
			pme = make_pme(0, PM_SOFT_DIRTY);
		for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
			err = add_to_pagemap(&pme, pm);
			if (err)
				goto out;
		}
	}
out:
	return err;
}

static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
		struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
	u64 frame = 0, flags = 0;
	struct page *page = NULL;
	struct folio *folio;

	if (pte_none(pte))
		goto out;

	if (pte_present(pte)) {
		if (pm->show_pfn)
			frame = pte_pfn(pte);
		flags |= PM_PRESENT;
		page = vm_normal_page(vma, addr, pte);
		if (pte_soft_dirty(pte))
			flags |= PM_SOFT_DIRTY;
		if (pte_uffd_wp(pte))
			flags |= PM_UFFD_WP;
	} else {
		softleaf_t entry;

		if (pte_swp_soft_dirty(pte))
			flags |= PM_SOFT_DIRTY;
		if (pte_swp_uffd_wp(pte))
			flags |= PM_UFFD_WP;
		entry = softleaf_from_pte(pte);
		if (pm->show_pfn) {
			pgoff_t offset;

			/*
			 * For PFN swap offsets, keeping the offset field
			 * to be PFN only to be compatible with old smaps.
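			 *
			 * The encoding below packs the swap type into bits
			 * 0-4 and the offset (or PFN) from bit
			 * MAX_SWAPFILES_SHIFT up, e.g. type 2 at offset
			 * 0x1000 becomes frame 0x20002.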
			 */
			if (softleaf_has_pfn(entry))
				offset = softleaf_to_pfn(entry);
			else
				offset = swp_offset(entry);
			frame = swp_type(entry) |
				(offset << MAX_SWAPFILES_SHIFT);
		}
		flags |= PM_SWAP;
		if (softleaf_has_pfn(entry))
			page = softleaf_to_page(entry);
		if (softleaf_is_uffd_wp_marker(entry))
			flags |= PM_UFFD_WP;
		if (softleaf_is_guard_marker(entry))
			flags |= PM_GUARD_REGION;
	}

	if (page) {
		folio = page_folio(page);
		if (!folio_test_anon(folio))
			flags |= PM_FILE;
		if ((flags & PM_PRESENT) &&
		    __folio_page_mapped_exclusively(folio, page))
			flags |= PM_MMAP_EXCLUSIVE;
	}

out:
	if (vma->vm_flags & VM_SOFTDIRTY)
		flags |= PM_SOFT_DIRTY;

	return make_pme(frame, flags);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr,
				 unsigned long end, struct vm_area_struct *vma,
				 struct pagemapread *pm)
{
	unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT;
	u64 flags = 0, frame = 0;
	pmd_t pmd = *pmdp;
	struct page *page = NULL;
	struct folio *folio = NULL;
	int err = 0;

	if (vma->vm_flags & VM_SOFTDIRTY)
		flags |= PM_SOFT_DIRTY;

	if (pmd_none(pmd))
		goto populate_pagemap;

	if (pmd_present(pmd)) {
		page = pmd_page(pmd);

		flags |= PM_PRESENT;
		if (pmd_soft_dirty(pmd))
			flags |= PM_SOFT_DIRTY;
		if (pmd_uffd_wp(pmd))
			flags |= PM_UFFD_WP;
		if (pm->show_pfn)
			frame = pmd_pfn(pmd) + idx;
	} else if (thp_migration_supported()) {
		const softleaf_t entry = softleaf_from_pmd(pmd);
		unsigned long offset;

		if (pm->show_pfn) {
			if (softleaf_has_pfn(entry))
				offset = softleaf_to_pfn(entry) + idx;
			else
				offset = swp_offset(entry) + idx;
			frame = swp_type(entry) |
				(offset << MAX_SWAPFILES_SHIFT);
		}
		flags |= PM_SWAP;
		if (pmd_swp_soft_dirty(pmd))
			flags |= PM_SOFT_DIRTY;
		if (pmd_swp_uffd_wp(pmd))
			flags |= PM_UFFD_WP;
		VM_WARN_ON_ONCE(!pmd_is_migration_entry(pmd));
		page = softleaf_to_page(entry);
	}

	if (page) {
		folio = page_folio(page);
		if (!folio_test_anon(folio))
			flags |= PM_FILE;
	}

populate_pagemap:
	for (; addr != end; addr += PAGE_SIZE, idx++) {
		u64 cur_flags = flags;
		pagemap_entry_t pme;

		if (folio && (flags & PM_PRESENT) &&
		    __folio_page_mapped_exclusively(folio, page))
			cur_flags |= PM_MMAP_EXCLUSIVE;

		pme = make_pme(frame, cur_flags);
		err = add_to_pagemap(&pme, pm);
		if (err)
			break;
		if (pm->show_pfn) {
			if (flags & PM_PRESENT)
				frame++;
			else if (flags & PM_SWAP)
				frame += (1 << MAX_SWAPFILES_SHIFT);
		}
	}
	return err;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
			     struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct pagemapread *pm = walk->private;
	spinlock_t *ptl;
	pte_t *pte, *orig_pte;
	int err = 0;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	ptl = pmd_trans_huge_lock(pmdp, vma);
	if (ptl) {
		err = pagemap_pmd_range_thp(pmdp, addr, end, vma, pm);
		spin_unlock(ptl);
		return err;
	}
#endif

	/*
	 * We can assume that @vma always points to a valid one and @end never
	 * goes beyond vma->vm_end.
2100 */ 2101 orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl); 2102 if (!pte) { 2103 walk->action = ACTION_AGAIN; 2104 return err; 2105 } 2106 for (; addr < end; pte++, addr += PAGE_SIZE) { 2107 pagemap_entry_t pme; 2108 2109 pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); 2110 err = add_to_pagemap(&pme, pm); 2111 if (err) 2112 break; 2113 } 2114 pte_unmap_unlock(orig_pte, ptl); 2115 2116 cond_resched(); 2117 2118 return err; 2119 } 2120 2121 #ifdef CONFIG_HUGETLB_PAGE 2122 /* This function walks within one hugetlb entry in the single call */ 2123 static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, 2124 unsigned long addr, unsigned long end, 2125 struct mm_walk *walk) 2126 { 2127 struct pagemapread *pm = walk->private; 2128 struct vm_area_struct *vma = walk->vma; 2129 u64 flags = 0, frame = 0; 2130 spinlock_t *ptl; 2131 int err = 0; 2132 pte_t pte; 2133 2134 if (vma->vm_flags & VM_SOFTDIRTY) 2135 flags |= PM_SOFT_DIRTY; 2136 2137 ptl = huge_pte_lock(hstate_vma(vma), walk->mm, ptep); 2138 pte = huge_ptep_get(walk->mm, addr, ptep); 2139 if (pte_present(pte)) { 2140 struct folio *folio = page_folio(pte_page(pte)); 2141 2142 if (!folio_test_anon(folio)) 2143 flags |= PM_FILE; 2144 2145 if (!folio_maybe_mapped_shared(folio) && 2146 !hugetlb_pmd_shared(ptep)) 2147 flags |= PM_MMAP_EXCLUSIVE; 2148 2149 if (huge_pte_uffd_wp(pte)) 2150 flags |= PM_UFFD_WP; 2151 2152 flags |= PM_PRESENT; 2153 if (pm->show_pfn) 2154 frame = pte_pfn(pte) + 2155 ((addr & ~hmask) >> PAGE_SHIFT); 2156 } else if (pte_swp_uffd_wp_any(pte)) { 2157 flags |= PM_UFFD_WP; 2158 } 2159 2160 for (; addr != end; addr += PAGE_SIZE) { 2161 pagemap_entry_t pme = make_pme(frame, flags); 2162 2163 err = add_to_pagemap(&pme, pm); 2164 if (err) 2165 break; 2166 if (pm->show_pfn && (flags & PM_PRESENT)) 2167 frame++; 2168 } 2169 2170 spin_unlock(ptl); 2171 cond_resched(); 2172 2173 return err; 2174 } 2175 #else 2176 #define pagemap_hugetlb_range NULL 2177 #endif /* HUGETLB_PAGE */ 2178 2179 static const struct mm_walk_ops pagemap_ops = { 2180 .pmd_entry = pagemap_pmd_range, 2181 .pte_hole = pagemap_pte_hole, 2182 .hugetlb_entry = pagemap_hugetlb_range, 2183 .walk_lock = PGWALK_RDLOCK, 2184 }; 2185 2186 /* 2187 * /proc/pid/pagemap - an array mapping virtual pages to pfns 2188 * 2189 * For each page in the address space, this file contains one 64-bit entry 2190 * consisting of the following: 2191 * 2192 * Bits 0-54 page frame number (PFN) if present 2193 * Bits 0-4 swap type if swapped 2194 * Bits 5-54 swap offset if swapped 2195 * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst) 2196 * Bit 56 page exclusively mapped 2197 * Bit 57 pte is uffd-wp write-protected 2198 * Bit 58 pte is a guard region 2199 * Bits 59-60 zero 2200 * Bit 61 page is file-page or shared-anon 2201 * Bit 62 page swapped 2202 * Bit 63 page present 2203 * 2204 * If the page is not present but in swap, then the PFN contains an 2205 * encoding of the swap file number and the page's offset into the 2206 * swap. Unmapped pages return a null PFN. This allows determining 2207 * precisely which pages are mapped (or in swap) and comparing mapped 2208 * pages between processes. 2209 * 2210 * Efficient users of this interface will use /proc/pid/maps to 2211 * determine which areas of memory are actually mapped and llseek to 2212 * skip over unmapped regions. 
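 *
 * Purely as an illustrative, hedged sketch (not part of this file; the
 * helper name and the lack of error handling are ours), a userspace
 * reader could fetch the entry for one virtual address with pread(2):
 *
 *	#include <stdint.h>
 *	#include <unistd.h>
 *
 *	uint64_t pagemap_entry_of(int pagemap_fd, unsigned long vaddr)
 *	{
 *		uint64_t entry = 0;
 *		off_t off = (off_t)(vaddr / sysconf(_SC_PAGESIZE)) * sizeof(entry);
 *
 *		if (pread(pagemap_fd, &entry, sizeof(entry), off) != sizeof(entry))
 *			return 0;
 *		return entry;
 *	}
 *
 * Bit 63 of the result then says whether the page is present and bit 62
 * whether it is swapped; the PFN in bits 0-54 is only reported to
 * CAP_SYS_ADMIN-capable readers, otherwise that field reads back as zero.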
2213 */ 2214 static ssize_t pagemap_read(struct file *file, char __user *buf, 2215 size_t count, loff_t *ppos) 2216 { 2217 struct mm_struct *mm = file->private_data; 2218 struct pagemapread pm; 2219 unsigned long src; 2220 unsigned long svpfn; 2221 unsigned long start_vaddr; 2222 unsigned long end_vaddr; 2223 int ret = 0, copied = 0; 2224 2225 if (!mm || !mmget_not_zero(mm)) 2226 goto out; 2227 2228 ret = -EINVAL; 2229 /* file position must be aligned */ 2230 if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) 2231 goto out_mm; 2232 2233 ret = 0; 2234 if (!count) 2235 goto out_mm; 2236 2237 /* do not disclose physical addresses: attack vector */ 2238 pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); 2239 2240 pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); 2241 pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL); 2242 ret = -ENOMEM; 2243 if (!pm.buffer) 2244 goto out_mm; 2245 2246 src = *ppos; 2247 svpfn = src / PM_ENTRY_BYTES; 2248 end_vaddr = mm->task_size; 2249 2250 /* watch out for wraparound */ 2251 start_vaddr = end_vaddr; 2252 if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) { 2253 unsigned long end; 2254 2255 ret = mmap_read_lock_killable(mm); 2256 if (ret) 2257 goto out_free; 2258 start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT); 2259 mmap_read_unlock(mm); 2260 2261 end = start_vaddr + ((count / PM_ENTRY_BYTES) << PAGE_SHIFT); 2262 if (end >= start_vaddr && end < mm->task_size) 2263 end_vaddr = end; 2264 } 2265 2266 /* Ensure the address is inside the task */ 2267 if (start_vaddr > mm->task_size) 2268 start_vaddr = end_vaddr; 2269 2270 ret = 0; 2271 while (count && (start_vaddr < end_vaddr)) { 2272 int len; 2273 unsigned long end; 2274 2275 pm.pos = 0; 2276 end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; 2277 /* overflow ? */ 2278 if (end < start_vaddr || end > end_vaddr) 2279 end = end_vaddr; 2280 ret = mmap_read_lock_killable(mm); 2281 if (ret) 2282 goto out_free; 2283 ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm); 2284 mmap_read_unlock(mm); 2285 start_vaddr = end; 2286 2287 len = min(count, PM_ENTRY_BYTES * pm.pos); 2288 if (copy_to_user(buf, pm.buffer, len)) { 2289 ret = -EFAULT; 2290 goto out_free; 2291 } 2292 copied += len; 2293 buf += len; 2294 count -= len; 2295 } 2296 *ppos += copied; 2297 if (!ret || ret == PM_END_OF_BUFFER) 2298 ret = copied; 2299 2300 out_free: 2301 kfree(pm.buffer); 2302 out_mm: 2303 mmput(mm); 2304 out: 2305 return ret; 2306 } 2307 2308 static int pagemap_open(struct inode *inode, struct file *file) 2309 { 2310 struct mm_struct *mm; 2311 2312 mm = proc_mem_open(inode, PTRACE_MODE_READ); 2313 if (IS_ERR_OR_NULL(mm)) 2314 return mm ? 
PTR_ERR(mm) : -ESRCH; 2315 file->private_data = mm; 2316 return 0; 2317 } 2318 2319 static int pagemap_release(struct inode *inode, struct file *file) 2320 { 2321 struct mm_struct *mm = file->private_data; 2322 2323 if (mm) 2324 mmdrop(mm); 2325 return 0; 2326 } 2327 2328 #define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ 2329 PAGE_IS_FILE | PAGE_IS_PRESENT | \ 2330 PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ 2331 PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY | \ 2332 PAGE_IS_GUARD) 2333 #define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) 2334 2335 struct pagemap_scan_private { 2336 struct pm_scan_arg arg; 2337 unsigned long masks_of_interest, cur_vma_category; 2338 struct page_region *vec_buf; 2339 unsigned long vec_buf_len, vec_buf_index, found_pages; 2340 struct page_region __user *vec_out; 2341 }; 2342 2343 static unsigned long pagemap_page_category(struct pagemap_scan_private *p, 2344 struct vm_area_struct *vma, 2345 unsigned long addr, pte_t pte) 2346 { 2347 unsigned long categories; 2348 2349 if (pte_none(pte)) 2350 return 0; 2351 2352 if (pte_present(pte)) { 2353 struct page *page; 2354 2355 categories = PAGE_IS_PRESENT; 2356 2357 if (!pte_uffd_wp(pte)) 2358 categories |= PAGE_IS_WRITTEN; 2359 2360 if (p->masks_of_interest & PAGE_IS_FILE) { 2361 page = vm_normal_page(vma, addr, pte); 2362 if (page && !PageAnon(page)) 2363 categories |= PAGE_IS_FILE; 2364 } 2365 2366 if (is_zero_pfn(pte_pfn(pte))) 2367 categories |= PAGE_IS_PFNZERO; 2368 if (pte_soft_dirty(pte)) 2369 categories |= PAGE_IS_SOFT_DIRTY; 2370 } else { 2371 softleaf_t entry; 2372 2373 categories = PAGE_IS_SWAPPED; 2374 2375 if (!pte_swp_uffd_wp_any(pte)) 2376 categories |= PAGE_IS_WRITTEN; 2377 2378 entry = softleaf_from_pte(pte); 2379 if (softleaf_is_guard_marker(entry)) 2380 categories |= PAGE_IS_GUARD; 2381 else if ((p->masks_of_interest & PAGE_IS_FILE) && 2382 softleaf_has_pfn(entry) && 2383 !folio_test_anon(softleaf_to_folio(entry))) 2384 categories |= PAGE_IS_FILE; 2385 2386 if (pte_swp_soft_dirty(pte)) 2387 categories |= PAGE_IS_SOFT_DIRTY; 2388 } 2389 2390 return categories; 2391 } 2392 2393 static void make_uffd_wp_pte(struct vm_area_struct *vma, 2394 unsigned long addr, pte_t *pte, pte_t ptent) 2395 { 2396 if (pte_present(ptent)) { 2397 pte_t old_pte; 2398 2399 old_pte = ptep_modify_prot_start(vma, addr, pte); 2400 ptent = pte_mkuffd_wp(old_pte); 2401 ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); 2402 } else if (pte_none(ptent)) { 2403 set_pte_at(vma->vm_mm, addr, pte, 2404 make_pte_marker(PTE_MARKER_UFFD_WP)); 2405 } else { 2406 ptent = pte_swp_mkuffd_wp(ptent); 2407 set_pte_at(vma->vm_mm, addr, pte, ptent); 2408 } 2409 } 2410 2411 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2412 static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, 2413 struct vm_area_struct *vma, 2414 unsigned long addr, pmd_t pmd) 2415 { 2416 unsigned long categories = PAGE_IS_HUGE; 2417 2418 if (pmd_none(pmd)) 2419 return categories; 2420 2421 if (pmd_present(pmd)) { 2422 struct page *page; 2423 2424 categories |= PAGE_IS_PRESENT; 2425 if (!pmd_uffd_wp(pmd)) 2426 categories |= PAGE_IS_WRITTEN; 2427 2428 if (p->masks_of_interest & PAGE_IS_FILE) { 2429 page = vm_normal_page_pmd(vma, addr, pmd); 2430 if (page && !PageAnon(page)) 2431 categories |= PAGE_IS_FILE; 2432 } 2433 2434 if (is_huge_zero_pmd(pmd)) 2435 categories |= PAGE_IS_PFNZERO; 2436 if (pmd_soft_dirty(pmd)) 2437 categories |= PAGE_IS_SOFT_DIRTY; 2438 } else { 2439 categories |= PAGE_IS_SWAPPED; 2440 if (!pmd_swp_uffd_wp(pmd)) 2441 categories |= 
PAGE_IS_WRITTEN; 2442 if (pmd_swp_soft_dirty(pmd)) 2443 categories |= PAGE_IS_SOFT_DIRTY; 2444 2445 if (p->masks_of_interest & PAGE_IS_FILE) { 2446 const softleaf_t entry = softleaf_from_pmd(pmd); 2447 2448 if (softleaf_has_pfn(entry) && 2449 !folio_test_anon(softleaf_to_folio(entry))) 2450 categories |= PAGE_IS_FILE; 2451 } 2452 } 2453 2454 return categories; 2455 } 2456 2457 static void make_uffd_wp_pmd(struct vm_area_struct *vma, 2458 unsigned long addr, pmd_t *pmdp) 2459 { 2460 pmd_t old, pmd = *pmdp; 2461 2462 if (pmd_present(pmd)) { 2463 old = pmdp_invalidate_ad(vma, addr, pmdp); 2464 pmd = pmd_mkuffd_wp(old); 2465 set_pmd_at(vma->vm_mm, addr, pmdp, pmd); 2466 } else if (pmd_is_migration_entry(pmd)) { 2467 pmd = pmd_swp_mkuffd_wp(pmd); 2468 set_pmd_at(vma->vm_mm, addr, pmdp, pmd); 2469 } 2470 } 2471 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 2472 2473 #ifdef CONFIG_HUGETLB_PAGE 2474 static unsigned long pagemap_hugetlb_category(pte_t pte) 2475 { 2476 unsigned long categories = PAGE_IS_HUGE; 2477 2478 if (pte_none(pte)) 2479 return categories; 2480 2481 /* 2482 * According to pagemap_hugetlb_range(), file-backed HugeTLB 2483 * page cannot be swapped. So PAGE_IS_FILE is not checked for 2484 * swapped pages. 2485 */ 2486 if (pte_present(pte)) { 2487 categories |= PAGE_IS_PRESENT; 2488 2489 if (!huge_pte_uffd_wp(pte)) 2490 categories |= PAGE_IS_WRITTEN; 2491 if (!PageAnon(pte_page(pte))) 2492 categories |= PAGE_IS_FILE; 2493 if (is_zero_pfn(pte_pfn(pte))) 2494 categories |= PAGE_IS_PFNZERO; 2495 if (pte_soft_dirty(pte)) 2496 categories |= PAGE_IS_SOFT_DIRTY; 2497 } else { 2498 categories |= PAGE_IS_SWAPPED; 2499 2500 if (!pte_swp_uffd_wp_any(pte)) 2501 categories |= PAGE_IS_WRITTEN; 2502 if (pte_swp_soft_dirty(pte)) 2503 categories |= PAGE_IS_SOFT_DIRTY; 2504 } 2505 2506 return categories; 2507 } 2508 2509 static void make_uffd_wp_huge_pte(struct vm_area_struct *vma, 2510 unsigned long addr, pte_t *ptep, 2511 pte_t ptent) 2512 { 2513 const unsigned long psize = huge_page_size(hstate_vma(vma)); 2514 softleaf_t entry; 2515 2516 if (huge_pte_none(ptent)) { 2517 set_huge_pte_at(vma->vm_mm, addr, ptep, 2518 make_pte_marker(PTE_MARKER_UFFD_WP), psize); 2519 return; 2520 } 2521 2522 entry = softleaf_from_pte(ptent); 2523 if (softleaf_is_hwpoison(entry) || softleaf_is_marker(entry)) 2524 return; 2525 2526 if (softleaf_is_migration(entry)) 2527 set_huge_pte_at(vma->vm_mm, addr, ptep, 2528 pte_swp_mkuffd_wp(ptent), psize); 2529 else 2530 huge_ptep_modify_prot_commit(vma, addr, ptep, ptent, 2531 huge_pte_mkuffd_wp(ptent)); 2532 } 2533 #endif /* CONFIG_HUGETLB_PAGE */ 2534 2535 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) 2536 static void pagemap_scan_backout_range(struct pagemap_scan_private *p, 2537 unsigned long addr, unsigned long end) 2538 { 2539 struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; 2540 2541 if (!p->vec_buf) 2542 return; 2543 2544 if (cur_buf->start != addr) 2545 cur_buf->end = addr; 2546 else 2547 cur_buf->start = cur_buf->end = 0; 2548 2549 p->found_pages -= (end - addr) / PAGE_SIZE; 2550 } 2551 #endif 2552 2553 static bool pagemap_scan_is_interesting_page(unsigned long categories, 2554 const struct pagemap_scan_private *p) 2555 { 2556 categories ^= p->arg.category_inverted; 2557 if ((categories & p->arg.category_mask) != p->arg.category_mask) 2558 return false; 2559 if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask)) 2560 return false; 2561 2562 return true; 2563 } 2564 2565 static bool 
pagemap_scan_is_interesting_vma(unsigned long categories, 2566 const struct pagemap_scan_private *p) 2567 { 2568 unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED; 2569 2570 categories ^= p->arg.category_inverted; 2571 if ((categories & required) != required) 2572 return false; 2573 2574 return true; 2575 } 2576 2577 static int pagemap_scan_test_walk(unsigned long start, unsigned long end, 2578 struct mm_walk *walk) 2579 { 2580 struct pagemap_scan_private *p = walk->private; 2581 struct vm_area_struct *vma = walk->vma; 2582 unsigned long vma_category = 0; 2583 bool wp_allowed = userfaultfd_wp_async(vma) && 2584 userfaultfd_wp_use_markers(vma); 2585 2586 if (!wp_allowed) { 2587 /* User requested explicit failure over wp-async capability */ 2588 if (p->arg.flags & PM_SCAN_CHECK_WPASYNC) 2589 return -EPERM; 2590 /* 2591 * User requires wr-protect, and allows silently skipping 2592 * unsupported vmas. 2593 */ 2594 if (p->arg.flags & PM_SCAN_WP_MATCHING) 2595 return 1; 2596 /* 2597 * Then the request doesn't involve wr-protects at all, 2598 * fall through to the rest checks, and allow vma walk. 2599 */ 2600 } 2601 2602 if (vma->vm_flags & VM_PFNMAP) 2603 return 1; 2604 2605 if (wp_allowed) 2606 vma_category |= PAGE_IS_WPALLOWED; 2607 2608 if (vma->vm_flags & VM_SOFTDIRTY) 2609 vma_category |= PAGE_IS_SOFT_DIRTY; 2610 2611 if (!pagemap_scan_is_interesting_vma(vma_category, p)) 2612 return 1; 2613 2614 p->cur_vma_category = vma_category; 2615 2616 return 0; 2617 } 2618 2619 static bool pagemap_scan_push_range(unsigned long categories, 2620 struct pagemap_scan_private *p, 2621 unsigned long addr, unsigned long end) 2622 { 2623 struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; 2624 2625 /* 2626 * When there is no output buffer provided at all, the sentinel values 2627 * won't match here. There is no other way for `cur_buf->end` to be 2628 * non-zero other than it being non-empty. 
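	 * (the empty-slot sentinel start == end == 0 is installed by
	 * pagemap_scan_init_bounce_buffer() and again by
	 * pagemap_scan_flush_buffer() once the buffer has been drained)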
2629 */ 2630 if (addr == cur_buf->end && categories == cur_buf->categories) { 2631 cur_buf->end = end; 2632 return true; 2633 } 2634 2635 if (cur_buf->end) { 2636 if (p->vec_buf_index >= p->vec_buf_len - 1) 2637 return false; 2638 2639 cur_buf = &p->vec_buf[++p->vec_buf_index]; 2640 } 2641 2642 cur_buf->start = addr; 2643 cur_buf->end = end; 2644 cur_buf->categories = categories; 2645 2646 return true; 2647 } 2648 2649 static int pagemap_scan_output(unsigned long categories, 2650 struct pagemap_scan_private *p, 2651 unsigned long addr, unsigned long *end) 2652 { 2653 unsigned long n_pages, total_pages; 2654 int ret = 0; 2655 2656 if (!p->vec_buf) 2657 return 0; 2658 2659 categories &= p->arg.return_mask; 2660 2661 n_pages = (*end - addr) / PAGE_SIZE; 2662 if (check_add_overflow(p->found_pages, n_pages, &total_pages) || 2663 total_pages > p->arg.max_pages) { 2664 size_t n_too_much = total_pages - p->arg.max_pages; 2665 *end -= n_too_much * PAGE_SIZE; 2666 n_pages -= n_too_much; 2667 ret = -ENOSPC; 2668 } 2669 2670 if (!pagemap_scan_push_range(categories, p, addr, *end)) { 2671 *end = addr; 2672 n_pages = 0; 2673 ret = -ENOSPC; 2674 } 2675 2676 p->found_pages += n_pages; 2677 if (ret) 2678 p->arg.walk_end = *end; 2679 2680 return ret; 2681 } 2682 2683 static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start, 2684 unsigned long end, struct mm_walk *walk) 2685 { 2686 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2687 struct pagemap_scan_private *p = walk->private; 2688 struct vm_area_struct *vma = walk->vma; 2689 unsigned long categories; 2690 spinlock_t *ptl; 2691 int ret = 0; 2692 2693 ptl = pmd_trans_huge_lock(pmd, vma); 2694 if (!ptl) 2695 return -ENOENT; 2696 2697 categories = p->cur_vma_category | 2698 pagemap_thp_category(p, vma, start, *pmd); 2699 2700 if (!pagemap_scan_is_interesting_page(categories, p)) 2701 goto out_unlock; 2702 2703 ret = pagemap_scan_output(categories, p, start, &end); 2704 if (start == end) 2705 goto out_unlock; 2706 2707 if (~p->arg.flags & PM_SCAN_WP_MATCHING) 2708 goto out_unlock; 2709 if (~categories & PAGE_IS_WRITTEN) 2710 goto out_unlock; 2711 2712 /* 2713 * Break huge page into small pages if the WP operation 2714 * needs to be performed on a portion of the huge page. 
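	 * Returning -ENOENT after the split makes pagemap_scan_pmd_entry()
	 * fall through to its PTE-level loop, which then reports and
	 * write-protects the remaining small pages individually.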
2715 */ 2716 if (end != start + HPAGE_SIZE) { 2717 spin_unlock(ptl); 2718 split_huge_pmd(vma, pmd, start); 2719 pagemap_scan_backout_range(p, start, end); 2720 /* Report as if there was no THP */ 2721 return -ENOENT; 2722 } 2723 2724 make_uffd_wp_pmd(vma, start, pmd); 2725 flush_tlb_range(vma, start, end); 2726 out_unlock: 2727 spin_unlock(ptl); 2728 return ret; 2729 #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ 2730 return -ENOENT; 2731 #endif 2732 } 2733 2734 static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, 2735 unsigned long end, struct mm_walk *walk) 2736 { 2737 struct pagemap_scan_private *p = walk->private; 2738 struct vm_area_struct *vma = walk->vma; 2739 unsigned long addr, flush_end = 0; 2740 pte_t *pte, *start_pte; 2741 spinlock_t *ptl; 2742 int ret; 2743 2744 ret = pagemap_scan_thp_entry(pmd, start, end, walk); 2745 if (ret != -ENOENT) 2746 return ret; 2747 2748 ret = 0; 2749 start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); 2750 if (!pte) { 2751 walk->action = ACTION_AGAIN; 2752 return 0; 2753 } 2754 2755 lazy_mmu_mode_enable(); 2756 2757 if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) { 2758 /* Fast path for performing exclusive WP */ 2759 for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { 2760 pte_t ptent = ptep_get(pte); 2761 2762 if ((pte_present(ptent) && pte_uffd_wp(ptent)) || 2763 pte_swp_uffd_wp_any(ptent)) 2764 continue; 2765 make_uffd_wp_pte(vma, addr, pte, ptent); 2766 if (!flush_end) 2767 start = addr; 2768 flush_end = addr + PAGE_SIZE; 2769 } 2770 goto flush_and_return; 2771 } 2772 2773 if (!p->arg.category_anyof_mask && !p->arg.category_inverted && 2774 p->arg.category_mask == PAGE_IS_WRITTEN && 2775 p->arg.return_mask == PAGE_IS_WRITTEN) { 2776 for (addr = start; addr < end; pte++, addr += PAGE_SIZE) { 2777 unsigned long next = addr + PAGE_SIZE; 2778 pte_t ptent = ptep_get(pte); 2779 2780 if ((pte_present(ptent) && pte_uffd_wp(ptent)) || 2781 pte_swp_uffd_wp_any(ptent)) 2782 continue; 2783 ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN, 2784 p, addr, &next); 2785 if (next == addr) 2786 break; 2787 if (~p->arg.flags & PM_SCAN_WP_MATCHING) 2788 continue; 2789 make_uffd_wp_pte(vma, addr, pte, ptent); 2790 if (!flush_end) 2791 start = addr; 2792 flush_end = next; 2793 } 2794 goto flush_and_return; 2795 } 2796 2797 for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { 2798 pte_t ptent = ptep_get(pte); 2799 unsigned long categories = p->cur_vma_category | 2800 pagemap_page_category(p, vma, addr, ptent); 2801 unsigned long next = addr + PAGE_SIZE; 2802 2803 if (!pagemap_scan_is_interesting_page(categories, p)) 2804 continue; 2805 2806 ret = pagemap_scan_output(categories, p, addr, &next); 2807 if (next == addr) 2808 break; 2809 2810 if (~p->arg.flags & PM_SCAN_WP_MATCHING) 2811 continue; 2812 if (~categories & PAGE_IS_WRITTEN) 2813 continue; 2814 2815 make_uffd_wp_pte(vma, addr, pte, ptent); 2816 if (!flush_end) 2817 start = addr; 2818 flush_end = next; 2819 } 2820 2821 flush_and_return: 2822 if (flush_end) 2823 flush_tlb_range(vma, start, addr); 2824 2825 lazy_mmu_mode_disable(); 2826 pte_unmap_unlock(start_pte, ptl); 2827 2828 cond_resched(); 2829 return ret; 2830 } 2831 2832 #ifdef CONFIG_HUGETLB_PAGE 2833 static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask, 2834 unsigned long start, unsigned long end, 2835 struct mm_walk *walk) 2836 { 2837 struct pagemap_scan_private *p = walk->private; 2838 struct vm_area_struct *vma = walk->vma; 2839 unsigned long categories; 2840 
spinlock_t *ptl; 2841 int ret = 0; 2842 pte_t pte; 2843 2844 if (~p->arg.flags & PM_SCAN_WP_MATCHING) { 2845 /* Go the short route when not write-protecting pages. */ 2846 2847 pte = huge_ptep_get(walk->mm, start, ptep); 2848 categories = p->cur_vma_category | pagemap_hugetlb_category(pte); 2849 2850 if (!pagemap_scan_is_interesting_page(categories, p)) 2851 return 0; 2852 2853 return pagemap_scan_output(categories, p, start, &end); 2854 } 2855 2856 i_mmap_lock_write(vma->vm_file->f_mapping); 2857 ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep); 2858 2859 pte = huge_ptep_get(walk->mm, start, ptep); 2860 categories = p->cur_vma_category | pagemap_hugetlb_category(pte); 2861 2862 if (!pagemap_scan_is_interesting_page(categories, p)) 2863 goto out_unlock; 2864 2865 ret = pagemap_scan_output(categories, p, start, &end); 2866 if (start == end) 2867 goto out_unlock; 2868 2869 if (~categories & PAGE_IS_WRITTEN) 2870 goto out_unlock; 2871 2872 if (end != start + HPAGE_SIZE) { 2873 /* Partial HugeTLB page WP isn't possible. */ 2874 pagemap_scan_backout_range(p, start, end); 2875 p->arg.walk_end = start; 2876 ret = 0; 2877 goto out_unlock; 2878 } 2879 2880 make_uffd_wp_huge_pte(vma, start, ptep, pte); 2881 flush_hugetlb_tlb_range(vma, start, end); 2882 2883 out_unlock: 2884 spin_unlock(ptl); 2885 i_mmap_unlock_write(vma->vm_file->f_mapping); 2886 2887 return ret; 2888 } 2889 #else 2890 #define pagemap_scan_hugetlb_entry NULL 2891 #endif 2892 2893 static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, 2894 int depth, struct mm_walk *walk) 2895 { 2896 struct pagemap_scan_private *p = walk->private; 2897 struct vm_area_struct *vma = walk->vma; 2898 int ret, err; 2899 2900 if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p)) 2901 return 0; 2902 2903 ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end); 2904 if (addr == end) 2905 return ret; 2906 2907 if (~p->arg.flags & PM_SCAN_WP_MATCHING) 2908 return ret; 2909 2910 err = uffd_wp_range(vma, addr, end - addr, true); 2911 if (err < 0) 2912 ret = err; 2913 2914 return ret; 2915 } 2916 2917 static const struct mm_walk_ops pagemap_scan_ops = { 2918 .test_walk = pagemap_scan_test_walk, 2919 .pmd_entry = pagemap_scan_pmd_entry, 2920 .pte_hole = pagemap_scan_pte_hole, 2921 .hugetlb_entry = pagemap_scan_hugetlb_entry, 2922 }; 2923 2924 static int pagemap_scan_get_args(struct pm_scan_arg *arg, 2925 unsigned long uarg) 2926 { 2927 if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg))) 2928 return -EFAULT; 2929 2930 if (arg->size != sizeof(struct pm_scan_arg)) 2931 return -EINVAL; 2932 2933 /* Validate requested features */ 2934 if (arg->flags & ~PM_SCAN_FLAGS) 2935 return -EINVAL; 2936 if ((arg->category_inverted | arg->category_mask | 2937 arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES) 2938 return -EINVAL; 2939 2940 arg->start = untagged_addr((unsigned long)arg->start); 2941 arg->end = untagged_addr((unsigned long)arg->end); 2942 arg->vec = untagged_addr((unsigned long)arg->vec); 2943 2944 /* Validate memory pointers */ 2945 if (!IS_ALIGNED(arg->start, PAGE_SIZE)) 2946 return -EINVAL; 2947 if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start)) 2948 return -EFAULT; 2949 if (!arg->vec && arg->vec_len) 2950 return -EINVAL; 2951 if (UINT_MAX == SIZE_MAX && arg->vec_len > SIZE_MAX) 2952 return -EINVAL; 2953 if (arg->vec && !access_ok((void __user *)(long)arg->vec, 2954 size_mul(arg->vec_len, sizeof(struct page_region)))) 2955 return -EFAULT; 2956 2957 /* Fixup default values 
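	 * (round the end address up to the next page boundary, clear
	 * walk_end, and treat max_pages == 0 as "no limit")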
*/ 2958 arg->end = ALIGN(arg->end, PAGE_SIZE); 2959 arg->walk_end = 0; 2960 if (!arg->max_pages) 2961 arg->max_pages = ULONG_MAX; 2962 2963 return 0; 2964 } 2965 2966 static int pagemap_scan_writeback_args(struct pm_scan_arg *arg, 2967 unsigned long uargl) 2968 { 2969 struct pm_scan_arg __user *uarg = (void __user *)uargl; 2970 2971 if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end))) 2972 return -EFAULT; 2973 2974 return 0; 2975 } 2976 2977 static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p) 2978 { 2979 if (!p->arg.vec_len) 2980 return 0; 2981 2982 p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT, 2983 p->arg.vec_len); 2984 p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf), 2985 GFP_KERNEL); 2986 if (!p->vec_buf) 2987 return -ENOMEM; 2988 2989 p->vec_buf->start = p->vec_buf->end = 0; 2990 p->vec_out = (struct page_region __user *)(long)p->arg.vec; 2991 2992 return 0; 2993 } 2994 2995 static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p) 2996 { 2997 const struct page_region *buf = p->vec_buf; 2998 long n = p->vec_buf_index; 2999 3000 if (!p->vec_buf) 3001 return 0; 3002 3003 if (buf[n].end != buf[n].start) 3004 n++; 3005 3006 if (!n) 3007 return 0; 3008 3009 if (copy_to_user(p->vec_out, buf, n * sizeof(*buf))) 3010 return -EFAULT; 3011 3012 p->arg.vec_len -= n; 3013 p->vec_out += n; 3014 3015 p->vec_buf_index = 0; 3016 p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len); 3017 p->vec_buf->start = p->vec_buf->end = 0; 3018 3019 return n; 3020 } 3021 3022 static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) 3023 { 3024 struct pagemap_scan_private p = {0}; 3025 unsigned long walk_start; 3026 size_t n_ranges_out = 0; 3027 int ret; 3028 3029 ret = pagemap_scan_get_args(&p.arg, uarg); 3030 if (ret) 3031 return ret; 3032 3033 p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask | 3034 p.arg.return_mask; 3035 ret = pagemap_scan_init_bounce_buffer(&p); 3036 if (ret) 3037 return ret; 3038 3039 for (walk_start = p.arg.start; walk_start < p.arg.end; 3040 walk_start = p.arg.walk_end) { 3041 struct mmu_notifier_range range; 3042 long n_out; 3043 3044 if (fatal_signal_pending(current)) { 3045 ret = -EINTR; 3046 break; 3047 } 3048 3049 ret = mmap_read_lock_killable(mm); 3050 if (ret) 3051 break; 3052 3053 /* Protection change for the range is going to happen. */ 3054 if (p.arg.flags & PM_SCAN_WP_MATCHING) { 3055 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, 3056 mm, walk_start, p.arg.end); 3057 mmu_notifier_invalidate_range_start(&range); 3058 } 3059 3060 ret = walk_page_range(mm, walk_start, p.arg.end, 3061 &pagemap_scan_ops, &p); 3062 3063 if (p.arg.flags & PM_SCAN_WP_MATCHING) 3064 mmu_notifier_invalidate_range_end(&range); 3065 3066 mmap_read_unlock(mm); 3067 3068 n_out = pagemap_scan_flush_buffer(&p); 3069 if (n_out < 0) 3070 ret = n_out; 3071 else 3072 n_ranges_out += n_out; 3073 3074 if (ret != -ENOSPC) 3075 break; 3076 3077 if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages) 3078 break; 3079 } 3080 3081 /* ENOSPC signifies early stop (buffer full) from the walk. 
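	 * In that case the number of ranges flushed so far is still returned,
	 * and the walk_end value written back below tells the caller where
	 * the scan stopped so it can resume from there.  A hedged userspace
	 * sketch of such a resume loop (pagemap_fd, vec, VEC_LEN, scan_start,
	 * scan_end and handle_ranges() are placeholders of ours, not kernel
	 * API):
	 *
	 *	struct pm_scan_arg arg = {
	 *		.size = sizeof(arg),
	 *		.start = scan_start, .end = scan_end,
	 *		.vec = (uintptr_t)vec, .vec_len = VEC_LEN,
	 *		.return_mask = PAGE_IS_PRESENT,
	 *	};
	 *	long n;
	 *
	 *	do {
	 *		n = ioctl(pagemap_fd, PAGEMAP_SCAN, &arg);
	 *		if (n < 0)
	 *			break;
	 *		handle_ranges(vec, n);
	 *		arg.start = arg.walk_end;
	 *	} while (arg.start < arg.end);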
*/ 3082 if (!ret || ret == -ENOSPC) 3083 ret = n_ranges_out; 3084 3085 /* The walk_end isn't set when ret is zero */ 3086 if (!p.arg.walk_end) 3087 p.arg.walk_end = p.arg.end; 3088 if (pagemap_scan_writeback_args(&p.arg, uarg)) 3089 ret = -EFAULT; 3090 3091 kfree(p.vec_buf); 3092 return ret; 3093 } 3094 3095 static long do_pagemap_cmd(struct file *file, unsigned int cmd, 3096 unsigned long arg) 3097 { 3098 struct mm_struct *mm = file->private_data; 3099 3100 switch (cmd) { 3101 case PAGEMAP_SCAN: 3102 return do_pagemap_scan(mm, arg); 3103 3104 default: 3105 return -EINVAL; 3106 } 3107 } 3108 3109 const struct file_operations proc_pagemap_operations = { 3110 .llseek = mem_lseek, /* borrow this */ 3111 .read = pagemap_read, 3112 .open = pagemap_open, 3113 .release = pagemap_release, 3114 .unlocked_ioctl = do_pagemap_cmd, 3115 .compat_ioctl = do_pagemap_cmd, 3116 }; 3117 #endif /* CONFIG_PROC_PAGE_MONITOR */ 3118 3119 #ifdef CONFIG_NUMA 3120 3121 struct numa_maps { 3122 unsigned long pages; 3123 unsigned long anon; 3124 unsigned long active; 3125 unsigned long writeback; 3126 unsigned long mapcount_max; 3127 unsigned long dirty; 3128 unsigned long swapcache; 3129 unsigned long node[MAX_NUMNODES]; 3130 }; 3131 3132 struct numa_maps_private { 3133 struct proc_maps_private proc_maps; 3134 struct numa_maps md; 3135 }; 3136 3137 static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, 3138 unsigned long nr_pages) 3139 { 3140 struct folio *folio = page_folio(page); 3141 int count; 3142 3143 if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) 3144 count = folio_precise_page_mapcount(folio, page); 3145 else 3146 count = folio_average_page_mapcount(folio); 3147 3148 md->pages += nr_pages; 3149 if (pte_dirty || folio_test_dirty(folio)) 3150 md->dirty += nr_pages; 3151 3152 if (folio_test_swapcache(folio)) 3153 md->swapcache += nr_pages; 3154 3155 if (folio_test_active(folio) || folio_test_unevictable(folio)) 3156 md->active += nr_pages; 3157 3158 if (folio_test_writeback(folio)) 3159 md->writeback += nr_pages; 3160 3161 if (folio_test_anon(folio)) 3162 md->anon += nr_pages; 3163 3164 if (count > md->mapcount_max) 3165 md->mapcount_max = count; 3166 3167 md->node[folio_nid(folio)] += nr_pages; 3168 } 3169 3170 static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, 3171 unsigned long addr) 3172 { 3173 struct page *page; 3174 int nid; 3175 3176 if (!pte_present(pte)) 3177 return NULL; 3178 3179 page = vm_normal_page(vma, addr, pte); 3180 if (!page || is_zone_device_page(page)) 3181 return NULL; 3182 3183 if (PageReserved(page)) 3184 return NULL; 3185 3186 nid = page_to_nid(page); 3187 if (!node_isset(nid, node_states[N_MEMORY])) 3188 return NULL; 3189 3190 return page; 3191 } 3192 3193 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3194 static struct page *can_gather_numa_stats_pmd(pmd_t pmd, 3195 struct vm_area_struct *vma, 3196 unsigned long addr) 3197 { 3198 struct page *page; 3199 int nid; 3200 3201 if (!pmd_present(pmd)) 3202 return NULL; 3203 3204 page = vm_normal_page_pmd(vma, addr, pmd); 3205 if (!page) 3206 return NULL; 3207 3208 if (PageReserved(page)) 3209 return NULL; 3210 3211 nid = page_to_nid(page); 3212 if (!node_isset(nid, node_states[N_MEMORY])) 3213 return NULL; 3214 3215 return page; 3216 } 3217 #endif 3218 3219 static int gather_pte_stats(pmd_t *pmd, unsigned long addr, 3220 unsigned long end, struct mm_walk *walk) 3221 { 3222 struct numa_maps *md = walk->private; 3223 struct vm_area_struct *vma = walk->vma; 3224 spinlock_t *ptl; 3225 pte_t *orig_pte; 3226 
pte_t *pte; 3227 3228 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3229 ptl = pmd_trans_huge_lock(pmd, vma); 3230 if (ptl) { 3231 struct page *page; 3232 3233 page = can_gather_numa_stats_pmd(*pmd, vma, addr); 3234 if (page) 3235 gather_stats(page, md, pmd_dirty(*pmd), 3236 HPAGE_PMD_SIZE/PAGE_SIZE); 3237 spin_unlock(ptl); 3238 return 0; 3239 } 3240 #endif 3241 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 3242 if (!pte) { 3243 walk->action = ACTION_AGAIN; 3244 return 0; 3245 } 3246 do { 3247 pte_t ptent = ptep_get(pte); 3248 struct page *page = can_gather_numa_stats(ptent, vma, addr); 3249 if (!page) 3250 continue; 3251 gather_stats(page, md, pte_dirty(ptent), 1); 3252 3253 } while (pte++, addr += PAGE_SIZE, addr != end); 3254 pte_unmap_unlock(orig_pte, ptl); 3255 cond_resched(); 3256 return 0; 3257 } 3258 #ifdef CONFIG_HUGETLB_PAGE 3259 static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, 3260 unsigned long addr, unsigned long end, struct mm_walk *walk) 3261 { 3262 pte_t huge_pte; 3263 struct numa_maps *md; 3264 struct page *page; 3265 spinlock_t *ptl; 3266 3267 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); 3268 huge_pte = huge_ptep_get(walk->mm, addr, pte); 3269 if (!pte_present(huge_pte)) 3270 goto out; 3271 3272 page = pte_page(huge_pte); 3273 3274 md = walk->private; 3275 gather_stats(page, md, pte_dirty(huge_pte), 1); 3276 out: 3277 spin_unlock(ptl); 3278 return 0; 3279 } 3280 3281 #else 3282 static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, 3283 unsigned long addr, unsigned long end, struct mm_walk *walk) 3284 { 3285 return 0; 3286 } 3287 #endif 3288 3289 static const struct mm_walk_ops show_numa_ops = { 3290 .hugetlb_entry = gather_hugetlb_stats, 3291 .pmd_entry = gather_pte_stats, 3292 .walk_lock = PGWALK_RDLOCK, 3293 }; 3294 3295 /* 3296 * Display pages allocated per node and memory policy via /proc. 3297 */ 3298 static int show_numa_map(struct seq_file *m, void *v) 3299 { 3300 struct numa_maps_private *numa_priv = m->private; 3301 struct proc_maps_private *proc_priv = &numa_priv->proc_maps; 3302 struct vm_area_struct *vma = v; 3303 struct numa_maps *md = &numa_priv->md; 3304 struct file *file = vma->vm_file; 3305 struct mm_struct *mm = vma->vm_mm; 3306 char buffer[64]; 3307 struct mempolicy *pol; 3308 pgoff_t ilx; 3309 int nid; 3310 3311 if (!mm) 3312 return 0; 3313 3314 /* Ensure we start with an empty set of numa_maps statistics. 
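	 * The struct numa_maps embedded in numa_maps_private is reused for
	 * every VMA that is shown, so its counters are re-gathered from
	 * scratch on each call.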
*/ 3315 memset(md, 0, sizeof(*md)); 3316 3317 pol = __get_vma_policy(vma, vma->vm_start, &ilx); 3318 if (pol) { 3319 mpol_to_str(buffer, sizeof(buffer), pol); 3320 mpol_cond_put(pol); 3321 } else { 3322 mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy); 3323 } 3324 3325 seq_printf(m, "%08lx %s", vma->vm_start, buffer); 3326 3327 if (file) { 3328 seq_puts(m, " file="); 3329 seq_path(m, file_user_path(file), "\n\t= "); 3330 } else if (vma_is_initial_heap(vma)) { 3331 seq_puts(m, " heap"); 3332 } else if (vma_is_initial_stack(vma)) { 3333 seq_puts(m, " stack"); 3334 } 3335 3336 if (is_vm_hugetlb_page(vma)) 3337 seq_puts(m, " huge"); 3338 3339 /* mmap_lock is held by m_start */ 3340 walk_page_vma(vma, &show_numa_ops, md); 3341 3342 if (!md->pages) 3343 goto out; 3344 3345 if (md->anon) 3346 seq_printf(m, " anon=%lu", md->anon); 3347 3348 if (md->dirty) 3349 seq_printf(m, " dirty=%lu", md->dirty); 3350 3351 if (md->pages != md->anon && md->pages != md->dirty) 3352 seq_printf(m, " mapped=%lu", md->pages); 3353 3354 if (md->mapcount_max > 1) 3355 seq_printf(m, " mapmax=%lu", md->mapcount_max); 3356 3357 if (md->swapcache) 3358 seq_printf(m, " swapcache=%lu", md->swapcache); 3359 3360 if (md->active < md->pages && !is_vm_hugetlb_page(vma)) 3361 seq_printf(m, " active=%lu", md->active); 3362 3363 if (md->writeback) 3364 seq_printf(m, " writeback=%lu", md->writeback); 3365 3366 for_each_node_state(nid, N_MEMORY) 3367 if (md->node[nid]) 3368 seq_printf(m, " N%d=%lu", nid, md->node[nid]); 3369 3370 seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10); 3371 out: 3372 seq_putc(m, '\n'); 3373 return 0; 3374 } 3375 3376 static const struct seq_operations proc_pid_numa_maps_op = { 3377 .start = m_start, 3378 .next = m_next, 3379 .stop = m_stop, 3380 .show = show_numa_map, 3381 }; 3382 3383 static int pid_numa_maps_open(struct inode *inode, struct file *file) 3384 { 3385 return proc_maps_open(inode, file, &proc_pid_numa_maps_op, 3386 sizeof(struct numa_maps_private)); 3387 } 3388 3389 const struct file_operations proc_pid_numa_maps_operations = { 3390 .open = pid_numa_maps_open, 3391 .read = seq_read, 3392 .llseek = seq_lseek, 3393 .release = proc_map_release, 3394 }; 3395 3396 #endif /* CONFIG_NUMA */ 3397