1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/pagewalk.h> 3 #include <linux/mm_inline.h> 4 #include <linux/hugetlb.h> 5 #include <linux/huge_mm.h> 6 #include <linux/mount.h> 7 #include <linux/ksm.h> 8 #include <linux/seq_file.h> 9 #include <linux/highmem.h> 10 #include <linux/ptrace.h> 11 #include <linux/slab.h> 12 #include <linux/pagemap.h> 13 #include <linux/mempolicy.h> 14 #include <linux/rmap.h> 15 #include <linux/swap.h> 16 #include <linux/sched/mm.h> 17 #include <linux/swapops.h> 18 #include <linux/mmu_notifier.h> 19 #include <linux/page_idle.h> 20 #include <linux/shmem_fs.h> 21 #include <linux/uaccess.h> 22 #include <linux/pkeys.h> 23 #include <linux/minmax.h> 24 #include <linux/overflow.h> 25 #include <linux/buildid.h> 26 27 #include <asm/elf.h> 28 #include <asm/tlb.h> 29 #include <asm/tlbflush.h> 30 #include "internal.h" 31 32 #define SENTINEL_VMA_END -1 33 #define SENTINEL_VMA_GATE -2 34 35 #define SEQ_PUT_DEC(str, val) \ 36 seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8) 37 void task_mem(struct seq_file *m, struct mm_struct *mm) 38 { 39 unsigned long text, lib, swap, anon, file, shmem; 40 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; 41 42 anon = get_mm_counter_sum(mm, MM_ANONPAGES); 43 file = get_mm_counter_sum(mm, MM_FILEPAGES); 44 shmem = get_mm_counter_sum(mm, MM_SHMEMPAGES); 45 46 /* 47 * Note: to minimize their overhead, mm maintains hiwater_vm and 48 * hiwater_rss only when about to *lower* total_vm or rss. Any 49 * collector of these hiwater stats must therefore get total_vm 50 * and rss too, which will usually be the higher. Barriers? not 51 * worth the effort, such snapshots can always be inconsistent. 52 */ 53 hiwater_vm = total_vm = mm->total_vm; 54 if (hiwater_vm < mm->hiwater_vm) 55 hiwater_vm = mm->hiwater_vm; 56 hiwater_rss = total_rss = anon + file + shmem; 57 if (hiwater_rss < mm->hiwater_rss) 58 hiwater_rss = mm->hiwater_rss; 59 60 /* split executable areas between text and lib */ 61 text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK); 62 text = min(text, mm->exec_vm << PAGE_SHIFT); 63 lib = (mm->exec_vm << PAGE_SHIFT) - text; 64 65 swap = get_mm_counter_sum(mm, MM_SWAPENTS); 66 SEQ_PUT_DEC("VmPeak:\t", hiwater_vm); 67 SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm); 68 SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm); 69 SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm)); 70 SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss); 71 SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss); 72 SEQ_PUT_DEC(" kB\nRssAnon:\t", anon); 73 SEQ_PUT_DEC(" kB\nRssFile:\t", file); 74 SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem); 75 SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm); 76 SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm); 77 seq_put_decimal_ull_width(m, 78 " kB\nVmExe:\t", text >> 10, 8); 79 seq_put_decimal_ull_width(m, 80 " kB\nVmLib:\t", lib >> 10, 8); 81 seq_put_decimal_ull_width(m, 82 " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8); 83 SEQ_PUT_DEC(" kB\nVmSwap:\t", swap); 84 seq_puts(m, " kB\n"); 85 hugetlb_report_usage(m, mm); 86 } 87 #undef SEQ_PUT_DEC 88 89 unsigned long task_vsize(struct mm_struct *mm) 90 { 91 return PAGE_SIZE * mm->total_vm; 92 } 93 94 unsigned long task_statm(struct mm_struct *mm, 95 unsigned long *shared, unsigned long *text, 96 unsigned long *data, unsigned long *resident) 97 { 98 *shared = get_mm_counter_sum(mm, MM_FILEPAGES) + 99 get_mm_counter_sum(mm, MM_SHMEMPAGES); 100 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) 101 >> PAGE_SHIFT; 102 *data = mm->data_vm + mm->stack_vm; 103 *resident = *shared + 
get_mm_counter_sum(mm, MM_ANONPAGES); 104 return mm->total_vm; 105 } 106 107 #ifdef CONFIG_NUMA 108 /* 109 * Save get_task_policy() for show_numa_map(). 110 */ 111 static void hold_task_mempolicy(struct proc_maps_private *priv) 112 { 113 struct task_struct *task = priv->task; 114 115 task_lock(task); 116 priv->task_mempolicy = get_task_policy(task); 117 mpol_get(priv->task_mempolicy); 118 task_unlock(task); 119 } 120 static void release_task_mempolicy(struct proc_maps_private *priv) 121 { 122 mpol_put(priv->task_mempolicy); 123 } 124 #else 125 static void hold_task_mempolicy(struct proc_maps_private *priv) 126 { 127 } 128 static void release_task_mempolicy(struct proc_maps_private *priv) 129 { 130 } 131 #endif 132 133 #ifdef CONFIG_PER_VMA_LOCK 134 135 static void unlock_vma(struct proc_maps_private *priv) 136 { 137 if (priv->locked_vma) { 138 vma_end_read(priv->locked_vma); 139 priv->locked_vma = NULL; 140 } 141 } 142 143 static const struct seq_operations proc_pid_maps_op; 144 145 static inline bool lock_vma_range(struct seq_file *m, 146 struct proc_maps_private *priv) 147 { 148 /* 149 * smaps and numa_maps perform page table walk, therefore require 150 * mmap_lock but maps can be read with locking just the vma and 151 * walking the vma tree under rcu read protection. 152 */ 153 if (m->op != &proc_pid_maps_op) { 154 if (mmap_read_lock_killable(priv->mm)) 155 return false; 156 157 priv->mmap_locked = true; 158 } else { 159 rcu_read_lock(); 160 priv->locked_vma = NULL; 161 priv->mmap_locked = false; 162 } 163 164 return true; 165 } 166 167 static inline void unlock_vma_range(struct proc_maps_private *priv) 168 { 169 if (priv->mmap_locked) { 170 mmap_read_unlock(priv->mm); 171 } else { 172 unlock_vma(priv); 173 rcu_read_unlock(); 174 } 175 } 176 177 static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, 178 loff_t last_pos) 179 { 180 struct vm_area_struct *vma; 181 182 if (priv->mmap_locked) 183 return vma_next(&priv->iter); 184 185 unlock_vma(priv); 186 vma = lock_next_vma(priv->mm, &priv->iter, last_pos); 187 if (!IS_ERR_OR_NULL(vma)) 188 priv->locked_vma = vma; 189 190 return vma; 191 } 192 193 static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv, 194 loff_t pos) 195 { 196 if (priv->mmap_locked) 197 return false; 198 199 rcu_read_unlock(); 200 mmap_read_lock(priv->mm); 201 /* Reinitialize the iterator after taking mmap_lock */ 202 vma_iter_set(&priv->iter, pos); 203 priv->mmap_locked = true; 204 205 return true; 206 } 207 208 #else /* CONFIG_PER_VMA_LOCK */ 209 210 static inline bool lock_vma_range(struct seq_file *m, 211 struct proc_maps_private *priv) 212 { 213 return mmap_read_lock_killable(priv->mm) == 0; 214 } 215 216 static inline void unlock_vma_range(struct proc_maps_private *priv) 217 { 218 mmap_read_unlock(priv->mm); 219 } 220 221 static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, 222 loff_t last_pos) 223 { 224 return vma_next(&priv->iter); 225 } 226 227 static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv, 228 loff_t pos) 229 { 230 return false; 231 } 232 233 #endif /* CONFIG_PER_VMA_LOCK */ 234 235 static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos) 236 { 237 struct proc_maps_private *priv = m->private; 238 struct vm_area_struct *vma; 239 240 retry: 241 vma = get_next_vma(priv, *ppos); 242 /* EINTR of EAGAIN is possible */ 243 if (IS_ERR(vma)) { 244 if (PTR_ERR(vma) == -EAGAIN && fallback_to_mmap_lock(priv, *ppos)) 245 goto retry; 246 247 return vma; 248 } 249 250 /* 
Store previous position to be able to restart if needed */ 251 priv->last_pos = *ppos; 252 if (vma) { 253 /* 254 * Track the end of the reported vma to ensure position changes 255 * even if previous vma was merged with the next vma and we 256 * found the extended vma with the same vm_start. 257 */ 258 *ppos = vma->vm_end; 259 } else { 260 *ppos = SENTINEL_VMA_GATE; 261 vma = get_gate_vma(priv->mm); 262 } 263 264 return vma; 265 } 266 267 static void *m_start(struct seq_file *m, loff_t *ppos) 268 { 269 struct proc_maps_private *priv = m->private; 270 loff_t last_addr = *ppos; 271 struct mm_struct *mm; 272 273 /* See m_next(). Zero at the start or after lseek. */ 274 if (last_addr == SENTINEL_VMA_END) 275 return NULL; 276 277 priv->task = get_proc_task(priv->inode); 278 if (!priv->task) 279 return ERR_PTR(-ESRCH); 280 281 mm = priv->mm; 282 if (!mm || !mmget_not_zero(mm)) { 283 put_task_struct(priv->task); 284 priv->task = NULL; 285 return NULL; 286 } 287 288 if (!lock_vma_range(m, priv)) { 289 mmput(mm); 290 put_task_struct(priv->task); 291 priv->task = NULL; 292 return ERR_PTR(-EINTR); 293 } 294 295 /* 296 * Reset current position if last_addr was set before 297 * and it's not a sentinel. 298 */ 299 if (last_addr > 0) 300 *ppos = last_addr = priv->last_pos; 301 vma_iter_init(&priv->iter, mm, (unsigned long)last_addr); 302 hold_task_mempolicy(priv); 303 if (last_addr == SENTINEL_VMA_GATE) 304 return get_gate_vma(mm); 305 306 return proc_get_vma(m, ppos); 307 } 308 309 static void *m_next(struct seq_file *m, void *v, loff_t *ppos) 310 { 311 if (*ppos == SENTINEL_VMA_GATE) { 312 *ppos = SENTINEL_VMA_END; 313 return NULL; 314 } 315 return proc_get_vma(m, ppos); 316 } 317 318 static void m_stop(struct seq_file *m, void *v) 319 { 320 struct proc_maps_private *priv = m->private; 321 struct mm_struct *mm = priv->mm; 322 323 if (!priv->task) 324 return; 325 326 release_task_mempolicy(priv); 327 unlock_vma_range(priv); 328 mmput(mm); 329 put_task_struct(priv->task); 330 priv->task = NULL; 331 } 332 333 static int proc_maps_open(struct inode *inode, struct file *file, 334 const struct seq_operations *ops, int psize) 335 { 336 struct proc_maps_private *priv = __seq_open_private(file, ops, psize); 337 338 if (!priv) 339 return -ENOMEM; 340 341 priv->inode = inode; 342 priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); 343 if (IS_ERR_OR_NULL(priv->mm)) { 344 int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH; 345 346 seq_release_private(inode, file); 347 return err; 348 } 349 350 return 0; 351 } 352 353 static int proc_map_release(struct inode *inode, struct file *file) 354 { 355 struct seq_file *seq = file->private_data; 356 struct proc_maps_private *priv = seq->private; 357 358 if (priv->mm) 359 mmdrop(priv->mm); 360 361 return seq_release_private(inode, file); 362 } 363 364 static int do_maps_open(struct inode *inode, struct file *file, 365 const struct seq_operations *ops) 366 { 367 return proc_maps_open(inode, file, ops, 368 sizeof(struct proc_maps_private)); 369 } 370 371 static void get_vma_name(struct vm_area_struct *vma, 372 const struct path **path, 373 const char **name, 374 const char **name_fmt) 375 { 376 struct anon_vma_name *anon_name = vma->vm_mm ? anon_vma_name(vma) : NULL; 377 378 *name = NULL; 379 *path = NULL; 380 *name_fmt = NULL; 381 382 /* 383 * Print the dentry name for named mappings, and a 384 * special [heap] marker for the heap: 385 */ 386 if (vma->vm_file) { 387 /* 388 * If user named this anon shared memory via 389 * prctl(PR_SET_VMA ..., use the provided name. 
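	 * (Only file-backed VMAs reach this branch, so an anon_vma_name
	 * here means a named anonymous *shared* (shmem-backed) mapping,
	 * not pure anonymous memory, hence the "[anon_shmem:%s]" format
	 * used just below.)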
390 */ 391 if (anon_name) { 392 *name_fmt = "[anon_shmem:%s]"; 393 *name = anon_name->name; 394 } else { 395 *path = file_user_path(vma->vm_file); 396 } 397 return; 398 } 399 400 if (vma->vm_ops && vma->vm_ops->name) { 401 *name = vma->vm_ops->name(vma); 402 if (*name) 403 return; 404 } 405 406 *name = arch_vma_name(vma); 407 if (*name) 408 return; 409 410 if (!vma->vm_mm) { 411 *name = "[vdso]"; 412 return; 413 } 414 415 if (vma_is_initial_heap(vma)) { 416 *name = "[heap]"; 417 return; 418 } 419 420 if (vma_is_initial_stack(vma)) { 421 *name = "[stack]"; 422 return; 423 } 424 425 if (anon_name) { 426 *name_fmt = "[anon:%s]"; 427 *name = anon_name->name; 428 return; 429 } 430 } 431 432 static void show_vma_header_prefix(struct seq_file *m, 433 unsigned long start, unsigned long end, 434 vm_flags_t flags, unsigned long long pgoff, 435 dev_t dev, unsigned long ino) 436 { 437 seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); 438 seq_put_hex_ll(m, NULL, start, 8); 439 seq_put_hex_ll(m, "-", end, 8); 440 seq_putc(m, ' '); 441 seq_putc(m, flags & VM_READ ? 'r' : '-'); 442 seq_putc(m, flags & VM_WRITE ? 'w' : '-'); 443 seq_putc(m, flags & VM_EXEC ? 'x' : '-'); 444 seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p'); 445 seq_put_hex_ll(m, " ", pgoff, 8); 446 seq_put_hex_ll(m, " ", MAJOR(dev), 2); 447 seq_put_hex_ll(m, ":", MINOR(dev), 2); 448 seq_put_decimal_ull(m, " ", ino); 449 seq_putc(m, ' '); 450 } 451 452 static void 453 show_map_vma(struct seq_file *m, struct vm_area_struct *vma) 454 { 455 const struct path *path; 456 const char *name_fmt, *name; 457 vm_flags_t flags = vma->vm_flags; 458 unsigned long ino = 0; 459 unsigned long long pgoff = 0; 460 unsigned long start, end; 461 dev_t dev = 0; 462 463 if (vma->vm_file) { 464 const struct inode *inode = file_user_inode(vma->vm_file); 465 466 dev = inode->i_sb->s_dev; 467 ino = inode->i_ino; 468 pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; 469 } 470 471 start = vma->vm_start; 472 end = vma->vm_end; 473 show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino); 474 475 get_vma_name(vma, &path, &name, &name_fmt); 476 if (path) { 477 seq_pad(m, ' '); 478 seq_path(m, path, "\n"); 479 } else if (name_fmt) { 480 seq_pad(m, ' '); 481 seq_printf(m, name_fmt, name); 482 } else if (name) { 483 seq_pad(m, ' '); 484 seq_puts(m, name); 485 } 486 seq_putc(m, '\n'); 487 } 488 489 static int show_map(struct seq_file *m, void *v) 490 { 491 show_map_vma(m, v); 492 return 0; 493 } 494 495 static const struct seq_operations proc_pid_maps_op = { 496 .start = m_start, 497 .next = m_next, 498 .stop = m_stop, 499 .show = show_map 500 }; 501 502 static int pid_maps_open(struct inode *inode, struct file *file) 503 { 504 return do_maps_open(inode, file, &proc_pid_maps_op); 505 } 506 507 #define PROCMAP_QUERY_VMA_FLAGS ( \ 508 PROCMAP_QUERY_VMA_READABLE | \ 509 PROCMAP_QUERY_VMA_WRITABLE | \ 510 PROCMAP_QUERY_VMA_EXECUTABLE | \ 511 PROCMAP_QUERY_VMA_SHARED \ 512 ) 513 514 #define PROCMAP_QUERY_VALID_FLAGS_MASK ( \ 515 PROCMAP_QUERY_COVERING_OR_NEXT_VMA | \ 516 PROCMAP_QUERY_FILE_BACKED_VMA | \ 517 PROCMAP_QUERY_VMA_FLAGS \ 518 ) 519 520 static int query_vma_setup(struct mm_struct *mm) 521 { 522 return mmap_read_lock_killable(mm); 523 } 524 525 static void query_vma_teardown(struct mm_struct *mm, struct vm_area_struct *vma) 526 { 527 mmap_read_unlock(mm); 528 } 529 530 static struct vm_area_struct *query_vma_find_by_addr(struct mm_struct *mm, unsigned long addr) 531 { 532 return find_vma(mm, addr); 533 } 534 535 static struct vm_area_struct *query_matching_vma(struct mm_struct 
*mm, 536 unsigned long addr, u32 flags) 537 { 538 struct vm_area_struct *vma; 539 540 next_vma: 541 vma = query_vma_find_by_addr(mm, addr); 542 if (!vma) 543 goto no_vma; 544 545 /* user requested only file-backed VMA, keep iterating */ 546 if ((flags & PROCMAP_QUERY_FILE_BACKED_VMA) && !vma->vm_file) 547 goto skip_vma; 548 549 /* VMA permissions should satisfy query flags */ 550 if (flags & PROCMAP_QUERY_VMA_FLAGS) { 551 u32 perm = 0; 552 553 if (flags & PROCMAP_QUERY_VMA_READABLE) 554 perm |= VM_READ; 555 if (flags & PROCMAP_QUERY_VMA_WRITABLE) 556 perm |= VM_WRITE; 557 if (flags & PROCMAP_QUERY_VMA_EXECUTABLE) 558 perm |= VM_EXEC; 559 if (flags & PROCMAP_QUERY_VMA_SHARED) 560 perm |= VM_MAYSHARE; 561 562 if ((vma->vm_flags & perm) != perm) 563 goto skip_vma; 564 } 565 566 /* found covering VMA or user is OK with the matching next VMA */ 567 if ((flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) || vma->vm_start <= addr) 568 return vma; 569 570 skip_vma: 571 /* 572 * If the user needs closest matching VMA, keep iterating. 573 */ 574 addr = vma->vm_end; 575 if (flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) 576 goto next_vma; 577 578 no_vma: 579 return ERR_PTR(-ENOENT); 580 } 581 582 static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg) 583 { 584 struct procmap_query karg; 585 struct vm_area_struct *vma; 586 struct mm_struct *mm; 587 const char *name = NULL; 588 char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL; 589 __u64 usize; 590 int err; 591 592 if (copy_from_user(&usize, (void __user *)uarg, sizeof(usize))) 593 return -EFAULT; 594 /* argument struct can never be that large, reject abuse */ 595 if (usize > PAGE_SIZE) 596 return -E2BIG; 597 /* argument struct should have at least query_flags and query_addr fields */ 598 if (usize < offsetofend(struct procmap_query, query_addr)) 599 return -EINVAL; 600 err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize); 601 if (err) 602 return err; 603 604 /* reject unknown flags */ 605 if (karg.query_flags & ~PROCMAP_QUERY_VALID_FLAGS_MASK) 606 return -EINVAL; 607 /* either both buffer address and size are set, or both should be zero */ 608 if (!!karg.vma_name_size != !!karg.vma_name_addr) 609 return -EINVAL; 610 if (!!karg.build_id_size != !!karg.build_id_addr) 611 return -EINVAL; 612 613 mm = priv->mm; 614 if (!mm || !mmget_not_zero(mm)) 615 return -ESRCH; 616 617 err = query_vma_setup(mm); 618 if (err) { 619 mmput(mm); 620 return err; 621 } 622 623 vma = query_matching_vma(mm, karg.query_addr, karg.query_flags); 624 if (IS_ERR(vma)) { 625 err = PTR_ERR(vma); 626 vma = NULL; 627 goto out; 628 } 629 630 karg.vma_start = vma->vm_start; 631 karg.vma_end = vma->vm_end; 632 633 karg.vma_flags = 0; 634 if (vma->vm_flags & VM_READ) 635 karg.vma_flags |= PROCMAP_QUERY_VMA_READABLE; 636 if (vma->vm_flags & VM_WRITE) 637 karg.vma_flags |= PROCMAP_QUERY_VMA_WRITABLE; 638 if (vma->vm_flags & VM_EXEC) 639 karg.vma_flags |= PROCMAP_QUERY_VMA_EXECUTABLE; 640 if (vma->vm_flags & VM_MAYSHARE) 641 karg.vma_flags |= PROCMAP_QUERY_VMA_SHARED; 642 643 karg.vma_page_size = vma_kernel_pagesize(vma); 644 645 if (vma->vm_file) { 646 const struct inode *inode = file_user_inode(vma->vm_file); 647 648 karg.vma_offset = ((__u64)vma->vm_pgoff) << PAGE_SHIFT; 649 karg.dev_major = MAJOR(inode->i_sb->s_dev); 650 karg.dev_minor = MINOR(inode->i_sb->s_dev); 651 karg.inode = inode->i_ino; 652 } else { 653 karg.vma_offset = 0; 654 karg.dev_major = 0; 655 karg.dev_minor = 0; 656 karg.inode = 0; 657 } 658 659 if (karg.build_id_size) { 660 __u32 build_id_sz; 
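		/*
		 * build_id_parse() fills build_id_buf (at most
		 * BUILD_ID_SIZE_MAX bytes) and reports the actual length via
		 * build_id_sz; a parse failure is not fatal here, it is
		 * reported back as a zero-sized build ID instead.
		 */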
661 662 err = build_id_parse(vma, build_id_buf, &build_id_sz); 663 if (err) { 664 karg.build_id_size = 0; 665 } else { 666 if (karg.build_id_size < build_id_sz) { 667 err = -ENAMETOOLONG; 668 goto out; 669 } 670 karg.build_id_size = build_id_sz; 671 } 672 } 673 674 if (karg.vma_name_size) { 675 size_t name_buf_sz = min_t(size_t, PATH_MAX, karg.vma_name_size); 676 const struct path *path; 677 const char *name_fmt; 678 size_t name_sz = 0; 679 680 get_vma_name(vma, &path, &name, &name_fmt); 681 682 if (path || name_fmt || name) { 683 name_buf = kmalloc(name_buf_sz, GFP_KERNEL); 684 if (!name_buf) { 685 err = -ENOMEM; 686 goto out; 687 } 688 } 689 if (path) { 690 name = d_path(path, name_buf, name_buf_sz); 691 if (IS_ERR(name)) { 692 err = PTR_ERR(name); 693 goto out; 694 } 695 name_sz = name_buf + name_buf_sz - name; 696 } else if (name || name_fmt) { 697 name_sz = 1 + snprintf(name_buf, name_buf_sz, name_fmt ?: "%s", name); 698 name = name_buf; 699 } 700 if (name_sz > name_buf_sz) { 701 err = -ENAMETOOLONG; 702 goto out; 703 } 704 karg.vma_name_size = name_sz; 705 } 706 707 /* unlock vma or mmap_lock, and put mm_struct before copying data to user */ 708 query_vma_teardown(mm, vma); 709 mmput(mm); 710 711 if (karg.vma_name_size && copy_to_user(u64_to_user_ptr(karg.vma_name_addr), 712 name, karg.vma_name_size)) { 713 kfree(name_buf); 714 return -EFAULT; 715 } 716 kfree(name_buf); 717 718 if (karg.build_id_size && copy_to_user(u64_to_user_ptr(karg.build_id_addr), 719 build_id_buf, karg.build_id_size)) 720 return -EFAULT; 721 722 if (copy_to_user(uarg, &karg, min_t(size_t, sizeof(karg), usize))) 723 return -EFAULT; 724 725 return 0; 726 727 out: 728 query_vma_teardown(mm, vma); 729 mmput(mm); 730 kfree(name_buf); 731 return err; 732 } 733 734 static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 735 { 736 struct seq_file *seq = file->private_data; 737 struct proc_maps_private *priv = seq->private; 738 739 switch (cmd) { 740 case PROCMAP_QUERY: 741 return do_procmap_query(priv, (void __user *)arg); 742 default: 743 return -ENOIOCTLCMD; 744 } 745 } 746 747 const struct file_operations proc_pid_maps_operations = { 748 .open = pid_maps_open, 749 .read = seq_read, 750 .llseek = seq_lseek, 751 .release = proc_map_release, 752 .unlocked_ioctl = procfs_procmap_ioctl, 753 .compat_ioctl = compat_ptr_ioctl, 754 }; 755 756 /* 757 * Proportional Set Size(PSS): my share of RSS. 758 * 759 * PSS of a process is the count of pages it has in memory, where each 760 * page is divided by the number of processes sharing it. So if a 761 * process has 1000 pages all to itself, and 1000 shared with one other 762 * process, its PSS will be 1500. 763 * 764 * To keep (accumulated) division errors low, we adopt a 64bit 765 * fixed-point pss counter to minimize division errors. So (pss >> 766 * PSS_SHIFT) would be the real byte count. 767 * 768 * A shift of 12 before division means (assuming 4K page size): 769 * - 1M 3-user-pages add up to 8KB errors; 770 * - supports mapcount up to 2^24, or 16M; 771 * - supports PSS up to 2^52 bytes, or 4PB. 
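 *
 * Worked example (illustrative, assuming 4K pages): a page mapped by
 * three processes contributes (4096 << PSS_SHIFT) / 3 = 5592405 to the
 * fixed-point counter, and (pss >> PSS_SHIFT) recovers 1365 bytes,
 * i.e. one third of the page, give or take the rounding error bounded
 * above.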
772 */ 773 #define PSS_SHIFT 12 774 775 #ifdef CONFIG_PROC_PAGE_MONITOR 776 struct mem_size_stats { 777 unsigned long resident; 778 unsigned long shared_clean; 779 unsigned long shared_dirty; 780 unsigned long private_clean; 781 unsigned long private_dirty; 782 unsigned long referenced; 783 unsigned long anonymous; 784 unsigned long lazyfree; 785 unsigned long anonymous_thp; 786 unsigned long shmem_thp; 787 unsigned long file_thp; 788 unsigned long swap; 789 unsigned long shared_hugetlb; 790 unsigned long private_hugetlb; 791 unsigned long ksm; 792 u64 pss; 793 u64 pss_anon; 794 u64 pss_file; 795 u64 pss_shmem; 796 u64 pss_dirty; 797 u64 pss_locked; 798 u64 swap_pss; 799 }; 800 801 static void smaps_page_accumulate(struct mem_size_stats *mss, 802 struct folio *folio, unsigned long size, unsigned long pss, 803 bool dirty, bool locked, bool private) 804 { 805 mss->pss += pss; 806 807 if (folio_test_anon(folio)) 808 mss->pss_anon += pss; 809 else if (folio_test_swapbacked(folio)) 810 mss->pss_shmem += pss; 811 else 812 mss->pss_file += pss; 813 814 if (locked) 815 mss->pss_locked += pss; 816 817 if (dirty || folio_test_dirty(folio)) { 818 mss->pss_dirty += pss; 819 if (private) 820 mss->private_dirty += size; 821 else 822 mss->shared_dirty += size; 823 } else { 824 if (private) 825 mss->private_clean += size; 826 else 827 mss->shared_clean += size; 828 } 829 } 830 831 static void smaps_account(struct mem_size_stats *mss, struct page *page, 832 bool compound, bool young, bool dirty, bool locked, 833 bool present) 834 { 835 struct folio *folio = page_folio(page); 836 int i, nr = compound ? compound_nr(page) : 1; 837 unsigned long size = nr * PAGE_SIZE; 838 bool exclusive; 839 int mapcount; 840 841 /* 842 * First accumulate quantities that depend only on |size| and the type 843 * of the compound page. 844 */ 845 if (folio_test_anon(folio)) { 846 mss->anonymous += size; 847 if (!folio_test_swapbacked(folio) && !dirty && 848 !folio_test_dirty(folio)) 849 mss->lazyfree += size; 850 } 851 852 if (folio_test_ksm(folio)) 853 mss->ksm += size; 854 855 mss->resident += size; 856 /* Accumulate the size in pages that have been accessed. */ 857 if (young || folio_test_young(folio) || folio_test_referenced(folio)) 858 mss->referenced += size; 859 860 /* 861 * Then accumulate quantities that may depend on sharing, or that may 862 * differ page-by-page. 863 * 864 * refcount == 1 for present entries guarantees that the folio is mapped 865 * exactly once. For large folios this implies that exactly one 866 * PTE/PMD/... maps (a part of) this folio. 867 * 868 * Treat all non-present entries (where relying on the mapcount and 869 * refcount doesn't make sense) as "maybe shared, but not sure how 870 * often". We treat device private entries as being fake-present. 871 * 872 * Note that it would not be safe to read the mapcount especially for 873 * pages referenced by migration entries, even with the PTL held. 874 */ 875 if (folio_ref_count(folio) == 1 || !present) { 876 smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT, 877 dirty, locked, present); 878 return; 879 } 880 881 if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { 882 mapcount = folio_average_page_mapcount(folio); 883 exclusive = !folio_maybe_mapped_shared(folio); 884 } 885 886 /* 887 * We obtain a snapshot of the mapcount. Without holding the folio lock 888 * this snapshot can be slightly wrong as we cannot always read the 889 * mapcount atomically. 
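	 *
	 * A slightly stale snapshot only skews how the per-page PSS below is
	 * split between private and shared; the RSS-style totals accumulated
	 * above are not affected.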
890 */ 891 for (i = 0; i < nr; i++, page++) { 892 unsigned long pss = PAGE_SIZE << PSS_SHIFT; 893 894 if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) { 895 mapcount = folio_precise_page_mapcount(folio, page); 896 exclusive = mapcount < 2; 897 } 898 899 if (mapcount >= 2) 900 pss /= mapcount; 901 smaps_page_accumulate(mss, folio, PAGE_SIZE, pss, 902 dirty, locked, exclusive); 903 } 904 } 905 906 #ifdef CONFIG_SHMEM 907 static int smaps_pte_hole(unsigned long addr, unsigned long end, 908 __always_unused int depth, struct mm_walk *walk) 909 { 910 struct mem_size_stats *mss = walk->private; 911 struct vm_area_struct *vma = walk->vma; 912 913 mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping, 914 linear_page_index(vma, addr), 915 linear_page_index(vma, end)); 916 917 return 0; 918 } 919 #else 920 #define smaps_pte_hole NULL 921 #endif /* CONFIG_SHMEM */ 922 923 static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk) 924 { 925 #ifdef CONFIG_SHMEM 926 if (walk->ops->pte_hole) { 927 /* depth is not used */ 928 smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk); 929 } 930 #endif 931 } 932 933 static void smaps_pte_entry(pte_t *pte, unsigned long addr, 934 struct mm_walk *walk) 935 { 936 struct mem_size_stats *mss = walk->private; 937 struct vm_area_struct *vma = walk->vma; 938 bool locked = !!(vma->vm_flags & VM_LOCKED); 939 struct page *page = NULL; 940 bool present = false, young = false, dirty = false; 941 pte_t ptent = ptep_get(pte); 942 943 if (pte_present(ptent)) { 944 page = vm_normal_page(vma, addr, ptent); 945 young = pte_young(ptent); 946 dirty = pte_dirty(ptent); 947 present = true; 948 } else if (is_swap_pte(ptent)) { 949 swp_entry_t swpent = pte_to_swp_entry(ptent); 950 951 if (!non_swap_entry(swpent)) { 952 int mapcount; 953 954 mss->swap += PAGE_SIZE; 955 mapcount = swp_swapcount(swpent); 956 if (mapcount >= 2) { 957 u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT; 958 959 do_div(pss_delta, mapcount); 960 mss->swap_pss += pss_delta; 961 } else { 962 mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; 963 } 964 } else if (is_pfn_swap_entry(swpent)) { 965 if (is_device_private_entry(swpent)) 966 present = true; 967 page = pfn_swap_entry_to_page(swpent); 968 } 969 } else { 970 smaps_pte_hole_lookup(addr, walk); 971 return; 972 } 973 974 if (!page) 975 return; 976 977 smaps_account(mss, page, false, young, dirty, locked, present); 978 } 979 980 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 981 static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, 982 struct mm_walk *walk) 983 { 984 struct mem_size_stats *mss = walk->private; 985 struct vm_area_struct *vma = walk->vma; 986 bool locked = !!(vma->vm_flags & VM_LOCKED); 987 struct page *page = NULL; 988 bool present = false; 989 struct folio *folio; 990 991 if (pmd_present(*pmd)) { 992 page = vm_normal_page_pmd(vma, addr, *pmd); 993 present = true; 994 } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { 995 swp_entry_t entry = pmd_to_swp_entry(*pmd); 996 997 if (is_pfn_swap_entry(entry)) 998 page = pfn_swap_entry_to_page(entry); 999 } 1000 if (IS_ERR_OR_NULL(page)) 1001 return; 1002 folio = page_folio(page); 1003 if (folio_test_anon(folio)) 1004 mss->anonymous_thp += HPAGE_PMD_SIZE; 1005 else if (folio_test_swapbacked(folio)) 1006 mss->shmem_thp += HPAGE_PMD_SIZE; 1007 else if (folio_is_zone_device(folio)) 1008 /* pass */; 1009 else 1010 mss->file_thp += HPAGE_PMD_SIZE; 1011 1012 smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), 1013 locked, present); 1014 } 1015 #else 1016 static void 
smaps_pmd_entry(pmd_t *pmd, unsigned long addr, 1017 struct mm_walk *walk) 1018 { 1019 } 1020 #endif 1021 1022 static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 1023 struct mm_walk *walk) 1024 { 1025 struct vm_area_struct *vma = walk->vma; 1026 pte_t *pte; 1027 spinlock_t *ptl; 1028 1029 ptl = pmd_trans_huge_lock(pmd, vma); 1030 if (ptl) { 1031 smaps_pmd_entry(pmd, addr, walk); 1032 spin_unlock(ptl); 1033 goto out; 1034 } 1035 1036 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 1037 if (!pte) { 1038 walk->action = ACTION_AGAIN; 1039 return 0; 1040 } 1041 for (; addr != end; pte++, addr += PAGE_SIZE) 1042 smaps_pte_entry(pte, addr, walk); 1043 pte_unmap_unlock(pte - 1, ptl); 1044 out: 1045 cond_resched(); 1046 return 0; 1047 } 1048 1049 static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) 1050 { 1051 /* 1052 * Don't forget to update Documentation/ on changes. 1053 * 1054 * The length of the second argument of mnemonics[] 1055 * needs to be 3 instead of previously set 2 1056 * (i.e. from [BITS_PER_LONG][2] to [BITS_PER_LONG][3]) 1057 * to avoid spurious 1058 * -Werror=unterminated-string-initialization warning 1059 * with GCC 15 1060 */ 1061 static const char mnemonics[BITS_PER_LONG][3] = { 1062 /* 1063 * In case if we meet a flag we don't know about. 1064 */ 1065 [0 ... (BITS_PER_LONG-1)] = "??", 1066 1067 [ilog2(VM_READ)] = "rd", 1068 [ilog2(VM_WRITE)] = "wr", 1069 [ilog2(VM_EXEC)] = "ex", 1070 [ilog2(VM_SHARED)] = "sh", 1071 [ilog2(VM_MAYREAD)] = "mr", 1072 [ilog2(VM_MAYWRITE)] = "mw", 1073 [ilog2(VM_MAYEXEC)] = "me", 1074 [ilog2(VM_MAYSHARE)] = "ms", 1075 [ilog2(VM_GROWSDOWN)] = "gd", 1076 [ilog2(VM_PFNMAP)] = "pf", 1077 [ilog2(VM_LOCKED)] = "lo", 1078 [ilog2(VM_IO)] = "io", 1079 [ilog2(VM_SEQ_READ)] = "sr", 1080 [ilog2(VM_RAND_READ)] = "rr", 1081 [ilog2(VM_DONTCOPY)] = "dc", 1082 [ilog2(VM_DONTEXPAND)] = "de", 1083 [ilog2(VM_LOCKONFAULT)] = "lf", 1084 [ilog2(VM_ACCOUNT)] = "ac", 1085 [ilog2(VM_NORESERVE)] = "nr", 1086 [ilog2(VM_HUGETLB)] = "ht", 1087 [ilog2(VM_SYNC)] = "sf", 1088 [ilog2(VM_ARCH_1)] = "ar", 1089 [ilog2(VM_WIPEONFORK)] = "wf", 1090 [ilog2(VM_DONTDUMP)] = "dd", 1091 #ifdef CONFIG_ARM64_BTI 1092 [ilog2(VM_ARM64_BTI)] = "bt", 1093 #endif 1094 #ifdef CONFIG_MEM_SOFT_DIRTY 1095 [ilog2(VM_SOFTDIRTY)] = "sd", 1096 #endif 1097 [ilog2(VM_MIXEDMAP)] = "mm", 1098 [ilog2(VM_HUGEPAGE)] = "hg", 1099 [ilog2(VM_NOHUGEPAGE)] = "nh", 1100 [ilog2(VM_MERGEABLE)] = "mg", 1101 [ilog2(VM_UFFD_MISSING)]= "um", 1102 [ilog2(VM_UFFD_WP)] = "uw", 1103 #ifdef CONFIG_ARM64_MTE 1104 [ilog2(VM_MTE)] = "mt", 1105 [ilog2(VM_MTE_ALLOWED)] = "", 1106 #endif 1107 #ifdef CONFIG_ARCH_HAS_PKEYS 1108 /* These come out via ProtectionKey: */ 1109 [ilog2(VM_PKEY_BIT0)] = "", 1110 [ilog2(VM_PKEY_BIT1)] = "", 1111 [ilog2(VM_PKEY_BIT2)] = "", 1112 #if VM_PKEY_BIT3 1113 [ilog2(VM_PKEY_BIT3)] = "", 1114 #endif 1115 #if VM_PKEY_BIT4 1116 [ilog2(VM_PKEY_BIT4)] = "", 1117 #endif 1118 #endif /* CONFIG_ARCH_HAS_PKEYS */ 1119 #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR 1120 [ilog2(VM_UFFD_MINOR)] = "ui", 1121 #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ 1122 #ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK 1123 [ilog2(VM_SHADOW_STACK)] = "ss", 1124 #endif 1125 #if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) 1126 [ilog2(VM_DROPPABLE)] = "dp", 1127 #endif 1128 #ifdef CONFIG_64BIT 1129 [ilog2(VM_SEALED)] = "sl", 1130 #endif 1131 }; 1132 size_t i; 1133 1134 seq_puts(m, "VmFlags: "); 1135 for (i = 0; i < BITS_PER_LONG; i++) { 1136 if (!mnemonics[i][0]) 1137 continue; 1138 
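		/* Emit the two-letter code for every flag bit set on this VMA. */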
		if (vma->vm_flags & (1UL << i))
			seq_printf(m, "%s ", mnemonics[i]);
	}
	seq_putc(m, '\n');
}

#ifdef CONFIG_HUGETLB_PAGE
static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
				 unsigned long addr, unsigned long end,
				 struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;
	struct vm_area_struct *vma = walk->vma;
	pte_t ptent = huge_ptep_get(walk->mm, addr, pte);
	struct folio *folio = NULL;
	bool present = false;

	if (pte_present(ptent)) {
		folio = page_folio(pte_page(ptent));
		present = true;
	} else if (is_swap_pte(ptent)) {
		swp_entry_t swpent = pte_to_swp_entry(ptent);

		if (is_pfn_swap_entry(swpent))
			folio = pfn_swap_entry_folio(swpent);
	}

	if (folio) {
		/* We treat non-present entries as "maybe shared". */
		if (!present || folio_maybe_mapped_shared(folio) ||
		    hugetlb_pmd_shared(pte))
			mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
		else
			mss->private_hugetlb += huge_page_size(hstate_vma(vma));
	}
	return 0;
}
#else
#define smaps_hugetlb_range	NULL
#endif /* HUGETLB_PAGE */

static const struct mm_walk_ops smaps_walk_ops = {
	.pmd_entry	= smaps_pte_range,
	.hugetlb_entry	= smaps_hugetlb_range,
	.walk_lock	= PGWALK_RDLOCK,
};

static const struct mm_walk_ops smaps_shmem_walk_ops = {
	.pmd_entry	= smaps_pte_range,
	.hugetlb_entry	= smaps_hugetlb_range,
	.pte_hole	= smaps_pte_hole,
	.walk_lock	= PGWALK_RDLOCK,
};

/*
 * Gather mem stats from @vma with the indicated beginning
 * address @start, and keep them in @mss.
 *
 * Use vm_start of @vma as the beginning address if @start is 0.
 */
static void smap_gather_stats(struct vm_area_struct *vma,
		struct mem_size_stats *mss, unsigned long start)
{
	const struct mm_walk_ops *ops = &smaps_walk_ops;

	/* Invalid start */
	if (start >= vma->vm_end)
		return;

	if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
		/*
		 * For shared or readonly shmem mappings we know that all
		 * swapped out pages belong to the shmem object, and we can
		 * obtain the swap value much more efficiently. For private
		 * writable mappings, we might have COW pages that are
		 * not affected by the parent swapped out pages of the shmem
		 * object, so we have to distinguish them during the page walk.
		 * Unless we know that the shmem object (or the part mapped by
		 * our VMA) has no swapped out pages at all.
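		 *
		 * (Summarised: when the walk starts at vm_start and either
		 * nothing in the mapped range is swapped out or the mapping
		 * cannot contain private COW copies (it is shared or
		 * read-only), shmem_swapped is credited directly; otherwise
		 * the walk uses smaps_shmem_walk_ops so its pte_hole handler
		 * can count the swap entries.)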
1217 */ 1218 unsigned long shmem_swapped = shmem_swap_usage(vma); 1219 1220 if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) || 1221 !(vma->vm_flags & VM_WRITE))) { 1222 mss->swap += shmem_swapped; 1223 } else { 1224 ops = &smaps_shmem_walk_ops; 1225 } 1226 } 1227 1228 /* mmap_lock is held in m_start */ 1229 if (!start) 1230 walk_page_vma(vma, ops, mss); 1231 else 1232 walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss); 1233 } 1234 1235 #define SEQ_PUT_DEC(str, val) \ 1236 seq_put_decimal_ull_width(m, str, (val) >> 10, 8) 1237 1238 /* Show the contents common for smaps and smaps_rollup */ 1239 static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, 1240 bool rollup_mode) 1241 { 1242 SEQ_PUT_DEC("Rss: ", mss->resident); 1243 SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT); 1244 SEQ_PUT_DEC(" kB\nPss_Dirty: ", mss->pss_dirty >> PSS_SHIFT); 1245 if (rollup_mode) { 1246 /* 1247 * These are meaningful only for smaps_rollup, otherwise two of 1248 * them are zero, and the other one is the same as Pss. 1249 */ 1250 SEQ_PUT_DEC(" kB\nPss_Anon: ", 1251 mss->pss_anon >> PSS_SHIFT); 1252 SEQ_PUT_DEC(" kB\nPss_File: ", 1253 mss->pss_file >> PSS_SHIFT); 1254 SEQ_PUT_DEC(" kB\nPss_Shmem: ", 1255 mss->pss_shmem >> PSS_SHIFT); 1256 } 1257 SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean); 1258 SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty); 1259 SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean); 1260 SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty); 1261 SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced); 1262 SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous); 1263 SEQ_PUT_DEC(" kB\nKSM: ", mss->ksm); 1264 SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); 1265 SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); 1266 SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); 1267 SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp); 1268 SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); 1269 seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", 1270 mss->private_hugetlb >> 10, 7); 1271 SEQ_PUT_DEC(" kB\nSwap: ", mss->swap); 1272 SEQ_PUT_DEC(" kB\nSwapPss: ", 1273 mss->swap_pss >> PSS_SHIFT); 1274 SEQ_PUT_DEC(" kB\nLocked: ", 1275 mss->pss_locked >> PSS_SHIFT); 1276 seq_puts(m, " kB\n"); 1277 } 1278 1279 static int show_smap(struct seq_file *m, void *v) 1280 { 1281 struct vm_area_struct *vma = v; 1282 struct mem_size_stats mss = {}; 1283 1284 smap_gather_stats(vma, &mss, 0); 1285 1286 show_map_vma(m, vma); 1287 1288 SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start); 1289 SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma)); 1290 SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma)); 1291 seq_puts(m, " kB\n"); 1292 1293 __show_smap(m, &mss, false); 1294 1295 seq_printf(m, "THPeligible: %8u\n", 1296 !!thp_vma_allowable_orders(vma, vma->vm_flags, 1297 TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL)); 1298 1299 if (arch_pkeys_enabled()) 1300 seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); 1301 show_smap_vma_flags(m, vma); 1302 1303 return 0; 1304 } 1305 1306 static int show_smaps_rollup(struct seq_file *m, void *v) 1307 { 1308 struct proc_maps_private *priv = m->private; 1309 struct mem_size_stats mss = {}; 1310 struct mm_struct *mm = priv->mm; 1311 struct vm_area_struct *vma; 1312 unsigned long vma_start = 0, last_vma_end = 0; 1313 int ret = 0; 1314 VMA_ITERATOR(vmi, mm, 0); 1315 1316 priv->task = get_proc_task(priv->inode); 1317 if (!priv->task) 1318 return -ESRCH; 1319 1320 if (!mm || !mmget_not_zero(mm)) { 1321 ret = 
-ESRCH; 1322 goto out_put_task; 1323 } 1324 1325 ret = mmap_read_lock_killable(mm); 1326 if (ret) 1327 goto out_put_mm; 1328 1329 hold_task_mempolicy(priv); 1330 vma = vma_next(&vmi); 1331 1332 if (unlikely(!vma)) 1333 goto empty_set; 1334 1335 vma_start = vma->vm_start; 1336 do { 1337 smap_gather_stats(vma, &mss, 0); 1338 last_vma_end = vma->vm_end; 1339 1340 /* 1341 * Release mmap_lock temporarily if someone wants to 1342 * access it for write request. 1343 */ 1344 if (mmap_lock_is_contended(mm)) { 1345 vma_iter_invalidate(&vmi); 1346 mmap_read_unlock(mm); 1347 ret = mmap_read_lock_killable(mm); 1348 if (ret) { 1349 release_task_mempolicy(priv); 1350 goto out_put_mm; 1351 } 1352 1353 /* 1354 * After dropping the lock, there are four cases to 1355 * consider. See the following example for explanation. 1356 * 1357 * +------+------+-----------+ 1358 * | VMA1 | VMA2 | VMA3 | 1359 * +------+------+-----------+ 1360 * | | | | 1361 * 4k 8k 16k 400k 1362 * 1363 * Suppose we drop the lock after reading VMA2 due to 1364 * contention, then we get: 1365 * 1366 * last_vma_end = 16k 1367 * 1368 * 1) VMA2 is freed, but VMA3 exists: 1369 * 1370 * vma_next(vmi) will return VMA3. 1371 * In this case, just continue from VMA3. 1372 * 1373 * 2) VMA2 still exists: 1374 * 1375 * vma_next(vmi) will return VMA3. 1376 * In this case, just continue from VMA3. 1377 * 1378 * 3) No more VMAs can be found: 1379 * 1380 * vma_next(vmi) will return NULL. 1381 * No more things to do, just break. 1382 * 1383 * 4) (last_vma_end - 1) is the middle of a vma (VMA'): 1384 * 1385 * vma_next(vmi) will return VMA' whose range 1386 * contains last_vma_end. 1387 * Iterate VMA' from last_vma_end. 1388 */ 1389 vma = vma_next(&vmi); 1390 /* Case 3 above */ 1391 if (!vma) 1392 break; 1393 1394 /* Case 1 and 2 above */ 1395 if (vma->vm_start >= last_vma_end) { 1396 smap_gather_stats(vma, &mss, 0); 1397 last_vma_end = vma->vm_end; 1398 continue; 1399 } 1400 1401 /* Case 4 above */ 1402 if (vma->vm_end > last_vma_end) { 1403 smap_gather_stats(vma, &mss, last_vma_end); 1404 last_vma_end = vma->vm_end; 1405 } 1406 } 1407 } for_each_vma(vmi, vma); 1408 1409 empty_set: 1410 show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0); 1411 seq_pad(m, ' '); 1412 seq_puts(m, "[rollup]\n"); 1413 1414 __show_smap(m, &mss, true); 1415 1416 release_task_mempolicy(priv); 1417 mmap_read_unlock(mm); 1418 1419 out_put_mm: 1420 mmput(mm); 1421 out_put_task: 1422 put_task_struct(priv->task); 1423 priv->task = NULL; 1424 1425 return ret; 1426 } 1427 #undef SEQ_PUT_DEC 1428 1429 static const struct seq_operations proc_pid_smaps_op = { 1430 .start = m_start, 1431 .next = m_next, 1432 .stop = m_stop, 1433 .show = show_smap 1434 }; 1435 1436 static int pid_smaps_open(struct inode *inode, struct file *file) 1437 { 1438 return do_maps_open(inode, file, &proc_pid_smaps_op); 1439 } 1440 1441 static int smaps_rollup_open(struct inode *inode, struct file *file) 1442 { 1443 int ret; 1444 struct proc_maps_private *priv; 1445 1446 priv = kzalloc(sizeof(*priv), GFP_KERNEL_ACCOUNT); 1447 if (!priv) 1448 return -ENOMEM; 1449 1450 ret = single_open(file, show_smaps_rollup, priv); 1451 if (ret) 1452 goto out_free; 1453 1454 priv->inode = inode; 1455 priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); 1456 if (IS_ERR_OR_NULL(priv->mm)) { 1457 ret = priv->mm ? 
PTR_ERR(priv->mm) : -ESRCH;

		single_release(inode, file);
		goto out_free;
	}

	return 0;

out_free:
	kfree(priv);
	return ret;
}

static int smaps_rollup_release(struct inode *inode, struct file *file)
{
	struct seq_file *seq = file->private_data;
	struct proc_maps_private *priv = seq->private;

	if (priv->mm)
		mmdrop(priv->mm);

	kfree(priv);
	return single_release(inode, file);
}

const struct file_operations proc_pid_smaps_operations = {
	.open		= pid_smaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_map_release,
};

const struct file_operations proc_pid_smaps_rollup_operations = {
	.open		= smaps_rollup_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= smaps_rollup_release,
};

enum clear_refs_types {
	CLEAR_REFS_ALL = 1,
	CLEAR_REFS_ANON,
	CLEAR_REFS_MAPPED,
	CLEAR_REFS_SOFT_DIRTY,
	CLEAR_REFS_MM_HIWATER_RSS,
	CLEAR_REFS_LAST,
};

struct clear_refs_private {
	enum clear_refs_types type;
};

#ifdef CONFIG_MEM_SOFT_DIRTY

static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
	struct folio *folio;

	if (!pte_write(pte))
		return false;
	if (!is_cow_mapping(vma->vm_flags))
		return false;
	if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
		return false;
	folio = vm_normal_folio(vma, addr, pte);
	if (!folio)
		return false;
	return folio_maybe_dma_pinned(folio);
}

static inline void clear_soft_dirty(struct vm_area_struct *vma,
		unsigned long addr, pte_t *pte)
{
	/*
	 * The soft-dirty tracker uses #PF-s to catch writes
	 * to pages, so write-protect the pte as well. See the
	 * Documentation/admin-guide/mm/soft-dirty.rst for full description
	 * of how soft-dirty works.
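	 *
	 * (In short: with the bit cleared and the PTE write-protected, the
	 * next write triggers a fault and the fault handler marks the page
	 * soft-dirty again, which is how userspace detects pages touched
	 * since the last clear.)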
1535 */ 1536 pte_t ptent = ptep_get(pte); 1537 1538 if (pte_present(ptent)) { 1539 pte_t old_pte; 1540 1541 if (pte_is_pinned(vma, addr, ptent)) 1542 return; 1543 old_pte = ptep_modify_prot_start(vma, addr, pte); 1544 ptent = pte_wrprotect(old_pte); 1545 ptent = pte_clear_soft_dirty(ptent); 1546 ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); 1547 } else if (is_swap_pte(ptent)) { 1548 ptent = pte_swp_clear_soft_dirty(ptent); 1549 set_pte_at(vma->vm_mm, addr, pte, ptent); 1550 } 1551 } 1552 #else 1553 static inline void clear_soft_dirty(struct vm_area_struct *vma, 1554 unsigned long addr, pte_t *pte) 1555 { 1556 } 1557 #endif 1558 1559 #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) 1560 static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, 1561 unsigned long addr, pmd_t *pmdp) 1562 { 1563 pmd_t old, pmd = *pmdp; 1564 1565 if (pmd_present(pmd)) { 1566 /* See comment in change_huge_pmd() */ 1567 old = pmdp_invalidate(vma, addr, pmdp); 1568 if (pmd_dirty(old)) 1569 pmd = pmd_mkdirty(pmd); 1570 if (pmd_young(old)) 1571 pmd = pmd_mkyoung(pmd); 1572 1573 pmd = pmd_wrprotect(pmd); 1574 pmd = pmd_clear_soft_dirty(pmd); 1575 1576 set_pmd_at(vma->vm_mm, addr, pmdp, pmd); 1577 } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { 1578 pmd = pmd_swp_clear_soft_dirty(pmd); 1579 set_pmd_at(vma->vm_mm, addr, pmdp, pmd); 1580 } 1581 } 1582 #else 1583 static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, 1584 unsigned long addr, pmd_t *pmdp) 1585 { 1586 } 1587 #endif 1588 1589 static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, 1590 unsigned long end, struct mm_walk *walk) 1591 { 1592 struct clear_refs_private *cp = walk->private; 1593 struct vm_area_struct *vma = walk->vma; 1594 pte_t *pte, ptent; 1595 spinlock_t *ptl; 1596 struct folio *folio; 1597 1598 ptl = pmd_trans_huge_lock(pmd, vma); 1599 if (ptl) { 1600 if (cp->type == CLEAR_REFS_SOFT_DIRTY) { 1601 clear_soft_dirty_pmd(vma, addr, pmd); 1602 goto out; 1603 } 1604 1605 if (!pmd_present(*pmd)) 1606 goto out; 1607 1608 folio = pmd_folio(*pmd); 1609 1610 /* Clear accessed and referenced bits. */ 1611 pmdp_test_and_clear_young(vma, addr, pmd); 1612 folio_test_clear_young(folio); 1613 folio_clear_referenced(folio); 1614 out: 1615 spin_unlock(ptl); 1616 return 0; 1617 } 1618 1619 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 1620 if (!pte) { 1621 walk->action = ACTION_AGAIN; 1622 return 0; 1623 } 1624 for (; addr != end; pte++, addr += PAGE_SIZE) { 1625 ptent = ptep_get(pte); 1626 1627 if (cp->type == CLEAR_REFS_SOFT_DIRTY) { 1628 clear_soft_dirty(vma, addr, pte); 1629 continue; 1630 } 1631 1632 if (!pte_present(ptent)) 1633 continue; 1634 1635 folio = vm_normal_folio(vma, addr, ptent); 1636 if (!folio) 1637 continue; 1638 1639 /* Clear accessed and referenced bits. */ 1640 ptep_test_and_clear_young(vma, addr, pte); 1641 folio_test_clear_young(folio); 1642 folio_clear_referenced(folio); 1643 } 1644 pte_unmap_unlock(pte - 1, ptl); 1645 cond_resched(); 1646 return 0; 1647 } 1648 1649 static int clear_refs_test_walk(unsigned long start, unsigned long end, 1650 struct mm_walk *walk) 1651 { 1652 struct clear_refs_private *cp = walk->private; 1653 struct vm_area_struct *vma = walk->vma; 1654 1655 if (vma->vm_flags & VM_PFNMAP) 1656 return 1; 1657 1658 /* 1659 * Writing 1 to /proc/pid/clear_refs affects all pages. 1660 * Writing 2 to /proc/pid/clear_refs only affects anonymous pages. 1661 * Writing 3 to /proc/pid/clear_refs only affects file mapped pages. 
1662 * Writing 4 to /proc/pid/clear_refs affects all pages. 1663 */ 1664 if (cp->type == CLEAR_REFS_ANON && vma->vm_file) 1665 return 1; 1666 if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file) 1667 return 1; 1668 return 0; 1669 } 1670 1671 static const struct mm_walk_ops clear_refs_walk_ops = { 1672 .pmd_entry = clear_refs_pte_range, 1673 .test_walk = clear_refs_test_walk, 1674 .walk_lock = PGWALK_WRLOCK, 1675 }; 1676 1677 static ssize_t clear_refs_write(struct file *file, const char __user *buf, 1678 size_t count, loff_t *ppos) 1679 { 1680 struct task_struct *task; 1681 char buffer[PROC_NUMBUF] = {}; 1682 struct mm_struct *mm; 1683 struct vm_area_struct *vma; 1684 enum clear_refs_types type; 1685 int itype; 1686 int rv; 1687 1688 if (count > sizeof(buffer) - 1) 1689 count = sizeof(buffer) - 1; 1690 if (copy_from_user(buffer, buf, count)) 1691 return -EFAULT; 1692 rv = kstrtoint(strstrip(buffer), 10, &itype); 1693 if (rv < 0) 1694 return rv; 1695 type = (enum clear_refs_types)itype; 1696 if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) 1697 return -EINVAL; 1698 1699 task = get_proc_task(file_inode(file)); 1700 if (!task) 1701 return -ESRCH; 1702 mm = get_task_mm(task); 1703 if (mm) { 1704 VMA_ITERATOR(vmi, mm, 0); 1705 struct mmu_notifier_range range; 1706 struct clear_refs_private cp = { 1707 .type = type, 1708 }; 1709 1710 if (mmap_write_lock_killable(mm)) { 1711 count = -EINTR; 1712 goto out_mm; 1713 } 1714 if (type == CLEAR_REFS_MM_HIWATER_RSS) { 1715 /* 1716 * Writing 5 to /proc/pid/clear_refs resets the peak 1717 * resident set size to this mm's current rss value. 1718 */ 1719 reset_mm_hiwater_rss(mm); 1720 goto out_unlock; 1721 } 1722 1723 if (type == CLEAR_REFS_SOFT_DIRTY) { 1724 for_each_vma(vmi, vma) { 1725 if (!(vma->vm_flags & VM_SOFTDIRTY)) 1726 continue; 1727 vm_flags_clear(vma, VM_SOFTDIRTY); 1728 vma_set_page_prot(vma); 1729 } 1730 1731 inc_tlb_flush_pending(mm); 1732 mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, 1733 0, mm, 0, -1UL); 1734 mmu_notifier_invalidate_range_start(&range); 1735 } 1736 walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); 1737 if (type == CLEAR_REFS_SOFT_DIRTY) { 1738 mmu_notifier_invalidate_range_end(&range); 1739 flush_tlb_mm(mm); 1740 dec_tlb_flush_pending(mm); 1741 } 1742 out_unlock: 1743 mmap_write_unlock(mm); 1744 out_mm: 1745 mmput(mm); 1746 } 1747 put_task_struct(task); 1748 1749 return count; 1750 } 1751 1752 const struct file_operations proc_clear_refs_operations = { 1753 .write = clear_refs_write, 1754 .llseek = noop_llseek, 1755 }; 1756 1757 typedef struct { 1758 u64 pme; 1759 } pagemap_entry_t; 1760 1761 struct pagemapread { 1762 int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ 1763 pagemap_entry_t *buffer; 1764 bool show_pfn; 1765 }; 1766 1767 #define PAGEMAP_WALK_SIZE (PMD_SIZE) 1768 #define PAGEMAP_WALK_MASK (PMD_MASK) 1769 1770 #define PM_ENTRY_BYTES sizeof(pagemap_entry_t) 1771 #define PM_PFRAME_BITS 55 1772 #define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) 1773 #define PM_SOFT_DIRTY BIT_ULL(55) 1774 #define PM_MMAP_EXCLUSIVE BIT_ULL(56) 1775 #define PM_UFFD_WP BIT_ULL(57) 1776 #define PM_GUARD_REGION BIT_ULL(58) 1777 #define PM_FILE BIT_ULL(61) 1778 #define PM_SWAP BIT_ULL(62) 1779 #define PM_PRESENT BIT_ULL(63) 1780 1781 #define PM_END_OF_BUFFER 1 1782 1783 static inline pagemap_entry_t make_pme(u64 frame, u64 flags) 1784 { 1785 return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags }; 1786 } 1787 1788 static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm) 1789 { 1790 
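	/* Buffer one entry; tell the caller to flush once the chunk is full. */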
	pm->buffer[pm->pos++] = *pme;
	if (pm->pos >= pm->len)
		return PM_END_OF_BUFFER;
	return 0;
}

static bool __folio_page_mapped_exclusively(struct folio *folio, struct page *page)
{
	if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
		return folio_precise_page_mapcount(folio, page) == 1;
	return !folio_maybe_mapped_shared(folio);
}

static int pagemap_pte_hole(unsigned long start, unsigned long end,
			    __always_unused int depth, struct mm_walk *walk)
{
	struct pagemapread *pm = walk->private;
	unsigned long addr = start;
	int err = 0;

	while (addr < end) {
		struct vm_area_struct *vma = find_vma(walk->mm, addr);
		pagemap_entry_t pme = make_pme(0, 0);
		/* End of address space hole, which we mark as non-present. */
		unsigned long hole_end;

		if (vma)
			hole_end = min(end, vma->vm_start);
		else
			hole_end = end;

		for (; addr < hole_end; addr += PAGE_SIZE) {
			err = add_to_pagemap(&pme, pm);
			if (err)
				goto out;
		}

		if (!vma)
			break;

		/* Addresses in the VMA. */
		if (vma->vm_flags & VM_SOFTDIRTY)
			pme = make_pme(0, PM_SOFT_DIRTY);
		for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
			err = add_to_pagemap(&pme, pm);
			if (err)
				goto out;
		}
	}
out:
	return err;
}

static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
		struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
	u64 frame = 0, flags = 0;
	struct page *page = NULL;
	struct folio *folio;

	if (pte_present(pte)) {
		if (pm->show_pfn)
			frame = pte_pfn(pte);
		flags |= PM_PRESENT;
		page = vm_normal_page(vma, addr, pte);
		if (pte_soft_dirty(pte))
			flags |= PM_SOFT_DIRTY;
		if (pte_uffd_wp(pte))
			flags |= PM_UFFD_WP;
	} else if (is_swap_pte(pte)) {
		swp_entry_t entry;
		if (pte_swp_soft_dirty(pte))
			flags |= PM_SOFT_DIRTY;
		if (pte_swp_uffd_wp(pte))
			flags |= PM_UFFD_WP;
		entry = pte_to_swp_entry(pte);
		if (pm->show_pfn) {
			pgoff_t offset;
			/*
			 * For PFN swap offsets, keep the offset field as a
			 * PFN only, so it stays compatible with old smaps.
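			 *
			 * Either way the entry below is packed as
			 *   frame = swp_type(entry) | (offset << MAX_SWAPFILES_SHIFT)
			 * matching the "bits 0-4 swap type, bits 5-54 swap
			 * offset" layout documented above pagemap_read().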
1871 */ 1872 if (is_pfn_swap_entry(entry)) 1873 offset = swp_offset_pfn(entry); 1874 else 1875 offset = swp_offset(entry); 1876 frame = swp_type(entry) | 1877 (offset << MAX_SWAPFILES_SHIFT); 1878 } 1879 flags |= PM_SWAP; 1880 if (is_pfn_swap_entry(entry)) 1881 page = pfn_swap_entry_to_page(entry); 1882 if (pte_marker_entry_uffd_wp(entry)) 1883 flags |= PM_UFFD_WP; 1884 if (is_guard_swp_entry(entry)) 1885 flags |= PM_GUARD_REGION; 1886 } 1887 1888 if (page) { 1889 folio = page_folio(page); 1890 if (!folio_test_anon(folio)) 1891 flags |= PM_FILE; 1892 if ((flags & PM_PRESENT) && 1893 __folio_page_mapped_exclusively(folio, page)) 1894 flags |= PM_MMAP_EXCLUSIVE; 1895 } 1896 if (vma->vm_flags & VM_SOFTDIRTY) 1897 flags |= PM_SOFT_DIRTY; 1898 1899 return make_pme(frame, flags); 1900 } 1901 1902 static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, 1903 struct mm_walk *walk) 1904 { 1905 struct vm_area_struct *vma = walk->vma; 1906 struct pagemapread *pm = walk->private; 1907 spinlock_t *ptl; 1908 pte_t *pte, *orig_pte; 1909 int err = 0; 1910 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1911 1912 ptl = pmd_trans_huge_lock(pmdp, vma); 1913 if (ptl) { 1914 unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT; 1915 u64 flags = 0, frame = 0; 1916 pmd_t pmd = *pmdp; 1917 struct page *page = NULL; 1918 struct folio *folio = NULL; 1919 1920 if (vma->vm_flags & VM_SOFTDIRTY) 1921 flags |= PM_SOFT_DIRTY; 1922 1923 if (pmd_present(pmd)) { 1924 page = pmd_page(pmd); 1925 1926 flags |= PM_PRESENT; 1927 if (pmd_soft_dirty(pmd)) 1928 flags |= PM_SOFT_DIRTY; 1929 if (pmd_uffd_wp(pmd)) 1930 flags |= PM_UFFD_WP; 1931 if (pm->show_pfn) 1932 frame = pmd_pfn(pmd) + idx; 1933 } 1934 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 1935 else if (is_swap_pmd(pmd)) { 1936 swp_entry_t entry = pmd_to_swp_entry(pmd); 1937 unsigned long offset; 1938 1939 if (pm->show_pfn) { 1940 if (is_pfn_swap_entry(entry)) 1941 offset = swp_offset_pfn(entry) + idx; 1942 else 1943 offset = swp_offset(entry) + idx; 1944 frame = swp_type(entry) | 1945 (offset << MAX_SWAPFILES_SHIFT); 1946 } 1947 flags |= PM_SWAP; 1948 if (pmd_swp_soft_dirty(pmd)) 1949 flags |= PM_SOFT_DIRTY; 1950 if (pmd_swp_uffd_wp(pmd)) 1951 flags |= PM_UFFD_WP; 1952 VM_BUG_ON(!is_pmd_migration_entry(pmd)); 1953 page = pfn_swap_entry_to_page(entry); 1954 } 1955 #endif 1956 1957 if (page) { 1958 folio = page_folio(page); 1959 if (!folio_test_anon(folio)) 1960 flags |= PM_FILE; 1961 } 1962 1963 for (; addr != end; addr += PAGE_SIZE, idx++) { 1964 u64 cur_flags = flags; 1965 pagemap_entry_t pme; 1966 1967 if (folio && (flags & PM_PRESENT) && 1968 __folio_page_mapped_exclusively(folio, page)) 1969 cur_flags |= PM_MMAP_EXCLUSIVE; 1970 1971 pme = make_pme(frame, cur_flags); 1972 err = add_to_pagemap(&pme, pm); 1973 if (err) 1974 break; 1975 if (pm->show_pfn) { 1976 if (flags & PM_PRESENT) 1977 frame++; 1978 else if (flags & PM_SWAP) 1979 frame += (1 << MAX_SWAPFILES_SHIFT); 1980 } 1981 } 1982 spin_unlock(ptl); 1983 return err; 1984 } 1985 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 1986 1987 /* 1988 * We can assume that @vma always points to a valid one and @end never 1989 * goes beyond vma->vm_end. 
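	 *
	 * (That holds because the page walker only invokes ->pmd_entry for
	 * ranges clipped to the current VMA.)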
1990 */ 1991 orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl); 1992 if (!pte) { 1993 walk->action = ACTION_AGAIN; 1994 return err; 1995 } 1996 for (; addr < end; pte++, addr += PAGE_SIZE) { 1997 pagemap_entry_t pme; 1998 1999 pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); 2000 err = add_to_pagemap(&pme, pm); 2001 if (err) 2002 break; 2003 } 2004 pte_unmap_unlock(orig_pte, ptl); 2005 2006 cond_resched(); 2007 2008 return err; 2009 } 2010 2011 #ifdef CONFIG_HUGETLB_PAGE 2012 /* This function walks within one hugetlb entry in the single call */ 2013 static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, 2014 unsigned long addr, unsigned long end, 2015 struct mm_walk *walk) 2016 { 2017 struct pagemapread *pm = walk->private; 2018 struct vm_area_struct *vma = walk->vma; 2019 u64 flags = 0, frame = 0; 2020 int err = 0; 2021 pte_t pte; 2022 2023 if (vma->vm_flags & VM_SOFTDIRTY) 2024 flags |= PM_SOFT_DIRTY; 2025 2026 pte = huge_ptep_get(walk->mm, addr, ptep); 2027 if (pte_present(pte)) { 2028 struct folio *folio = page_folio(pte_page(pte)); 2029 2030 if (!folio_test_anon(folio)) 2031 flags |= PM_FILE; 2032 2033 if (!folio_maybe_mapped_shared(folio) && 2034 !hugetlb_pmd_shared(ptep)) 2035 flags |= PM_MMAP_EXCLUSIVE; 2036 2037 if (huge_pte_uffd_wp(pte)) 2038 flags |= PM_UFFD_WP; 2039 2040 flags |= PM_PRESENT; 2041 if (pm->show_pfn) 2042 frame = pte_pfn(pte) + 2043 ((addr & ~hmask) >> PAGE_SHIFT); 2044 } else if (pte_swp_uffd_wp_any(pte)) { 2045 flags |= PM_UFFD_WP; 2046 } 2047 2048 for (; addr != end; addr += PAGE_SIZE) { 2049 pagemap_entry_t pme = make_pme(frame, flags); 2050 2051 err = add_to_pagemap(&pme, pm); 2052 if (err) 2053 return err; 2054 if (pm->show_pfn && (flags & PM_PRESENT)) 2055 frame++; 2056 } 2057 2058 cond_resched(); 2059 2060 return err; 2061 } 2062 #else 2063 #define pagemap_hugetlb_range NULL 2064 #endif /* HUGETLB_PAGE */ 2065 2066 static const struct mm_walk_ops pagemap_ops = { 2067 .pmd_entry = pagemap_pmd_range, 2068 .pte_hole = pagemap_pte_hole, 2069 .hugetlb_entry = pagemap_hugetlb_range, 2070 .walk_lock = PGWALK_RDLOCK, 2071 }; 2072 2073 /* 2074 * /proc/pid/pagemap - an array mapping virtual pages to pfns 2075 * 2076 * For each page in the address space, this file contains one 64-bit entry 2077 * consisting of the following: 2078 * 2079 * Bits 0-54 page frame number (PFN) if present 2080 * Bits 0-4 swap type if swapped 2081 * Bits 5-54 swap offset if swapped 2082 * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst) 2083 * Bit 56 page exclusively mapped 2084 * Bit 57 pte is uffd-wp write-protected 2085 * Bit 58 pte is a guard region 2086 * Bits 59-60 zero 2087 * Bit 61 page is file-page or shared-anon 2088 * Bit 62 page swapped 2089 * Bit 63 page present 2090 * 2091 * If the page is not present but in swap, then the PFN contains an 2092 * encoding of the swap file number and the page's offset into the 2093 * swap. Unmapped pages return a null PFN. This allows determining 2094 * precisely which pages are mapped (or in swap) and comparing mapped 2095 * pages between processes. 2096 * 2097 * Efficient users of this interface will use /proc/pid/maps to 2098 * determine which areas of memory are actually mapped and llseek to 2099 * skip over unmapped regions. 
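 *
 * Illustrative example (not kernel code): to look up the entry for a
 * virtual address vaddr, a reader seeks to (vaddr / PAGE_SIZE) * 8 and
 * reads one 8-byte entry, e.g.
 *
 *	pread(fd, &ent, sizeof(ent), (vaddr / page_size) * 8);
 *
 * then tests bit 63 for "present", bit 62 for "swapped" and, with
 * CAP_SYS_ADMIN, bits 0-54 for the PFN.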
static ssize_t pagemap_read(struct file *file, char __user *buf,
			    size_t count, loff_t *ppos)
{
	struct mm_struct *mm = file->private_data;
	struct pagemapread pm;
	unsigned long src;
	unsigned long svpfn;
	unsigned long start_vaddr;
	unsigned long end_vaddr;
	int ret = 0, copied = 0;

	if (!mm || !mmget_not_zero(mm))
		goto out;

	ret = -EINVAL;
	/* file position must be aligned */
	if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
		goto out_mm;

	ret = 0;
	if (!count)
		goto out_mm;

	/* do not disclose physical addresses: attack vector */
	pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);

	pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
	pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL);
	ret = -ENOMEM;
	if (!pm.buffer)
		goto out_mm;

	src = *ppos;
	svpfn = src / PM_ENTRY_BYTES;
	end_vaddr = mm->task_size;

	/* watch out for wraparound */
	start_vaddr = end_vaddr;
	if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) {
		unsigned long end;

		ret = mmap_read_lock_killable(mm);
		if (ret)
			goto out_free;
		start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT);
		mmap_read_unlock(mm);

		end = start_vaddr + ((count / PM_ENTRY_BYTES) << PAGE_SHIFT);
		if (end >= start_vaddr && end < mm->task_size)
			end_vaddr = end;
	}

	/* Ensure the address is inside the task */
	if (start_vaddr > mm->task_size)
		start_vaddr = end_vaddr;

	ret = 0;
	while (count && (start_vaddr < end_vaddr)) {
		int len;
		unsigned long end;

		pm.pos = 0;
		end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
		/* overflow ? */
		if (end < start_vaddr || end > end_vaddr)
			end = end_vaddr;
		ret = mmap_read_lock_killable(mm);
		if (ret)
			goto out_free;
		ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm);
		mmap_read_unlock(mm);
		start_vaddr = end;

		len = min(count, PM_ENTRY_BYTES * pm.pos);
		if (copy_to_user(buf, pm.buffer, len)) {
			ret = -EFAULT;
			goto out_free;
		}
		copied += len;
		buf += len;
		count -= len;
	}
	*ppos += copied;
	if (!ret || ret == PM_END_OF_BUFFER)
		ret = copied;

out_free:
	kfree(pm.buffer);
out_mm:
	mmput(mm);
out:
	return ret;
}

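/*
 * Illustrative userspace sketch (not part of this file): pagemap_read()
 * above requires the file position and read size to be multiples of the
 * 8-byte entry size, so a reader seeks to (vaddr / page_size) * 8 and
 * reads whole entries. Error handling is omitted; the helper is only for
 * the example.
 *
 *	#include <stdint.h>
 *	#include <unistd.h>
 *
 *	static int read_pagemap_entry(int pagemap_fd, unsigned long vaddr,
 *				      uint64_t *ent)
 *	{
 *		long psize = sysconf(_SC_PAGESIZE);
 *		off_t off = (off_t)(vaddr / psize) * sizeof(*ent);
 *
 *		return pread(pagemap_fd, ent, sizeof(*ent), off) == sizeof(*ent)
 *			? 0 : -1;
 *	}
 *
 * where pagemap_fd was obtained by opening /proc/<pid>/pagemap read-only.
 */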
static int pagemap_open(struct inode *inode, struct file *file)
{
	struct mm_struct *mm;

	mm = proc_mem_open(inode, PTRACE_MODE_READ);
	if (IS_ERR_OR_NULL(mm))
		return mm ? PTR_ERR(mm) : -ESRCH;
	file->private_data = mm;
	return 0;
}

static int pagemap_release(struct inode *inode, struct file *file)
{
	struct mm_struct *mm = file->private_data;

	if (mm)
		mmdrop(mm);
	return 0;
}

#define PM_SCAN_CATEGORIES	(PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \
				 PAGE_IS_FILE | PAGE_IS_PRESENT | \
				 PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \
				 PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY | \
				 PAGE_IS_GUARD)
#define PM_SCAN_FLAGS	(PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)

struct pagemap_scan_private {
	struct pm_scan_arg arg;
	unsigned long masks_of_interest, cur_vma_category;
	struct page_region *vec_buf;
	unsigned long vec_buf_len, vec_buf_index, found_pages;
	struct page_region __user *vec_out;
};

static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
					   struct vm_area_struct *vma,
					   unsigned long addr, pte_t pte)
{
	unsigned long categories = 0;

	if (pte_present(pte)) {
		struct page *page;

		categories |= PAGE_IS_PRESENT;
		if (!pte_uffd_wp(pte))
			categories |= PAGE_IS_WRITTEN;

		if (p->masks_of_interest & PAGE_IS_FILE) {
			page = vm_normal_page(vma, addr, pte);
			if (page && !PageAnon(page))
				categories |= PAGE_IS_FILE;
		}

		if (is_zero_pfn(pte_pfn(pte)))
			categories |= PAGE_IS_PFNZERO;
		if (pte_soft_dirty(pte))
			categories |= PAGE_IS_SOFT_DIRTY;
	} else if (is_swap_pte(pte)) {
		swp_entry_t swp;

		categories |= PAGE_IS_SWAPPED;
		if (!pte_swp_uffd_wp_any(pte))
			categories |= PAGE_IS_WRITTEN;

		swp = pte_to_swp_entry(pte);
		if (is_guard_swp_entry(swp))
			categories |= PAGE_IS_GUARD;
		else if ((p->masks_of_interest & PAGE_IS_FILE) &&
			 is_pfn_swap_entry(swp) &&
			 !folio_test_anon(pfn_swap_entry_folio(swp)))
			categories |= PAGE_IS_FILE;

		if (pte_swp_soft_dirty(pte))
			categories |= PAGE_IS_SOFT_DIRTY;
	}

	return categories;
}

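/*
 * Write-protect one PTE for userfaultfd-wp. A present PTE gets the uffd-wp
 * bit set via the modify_prot helpers, a swap PTE gets the swap-level
 * uffd-wp bit, and an empty PTE gets a PTE_MARKER_UFFD_WP marker so that a
 * later fault on the hole is still reported.
 */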
static void make_uffd_wp_pte(struct vm_area_struct *vma,
			     unsigned long addr, pte_t *pte, pte_t ptent)
{
	if (pte_present(ptent)) {
		pte_t old_pte;

		old_pte = ptep_modify_prot_start(vma, addr, pte);
		ptent = pte_mkuffd_wp(old_pte);
		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
	} else if (is_swap_pte(ptent)) {
		ptent = pte_swp_mkuffd_wp(ptent);
		set_pte_at(vma->vm_mm, addr, pte, ptent);
	} else {
		set_pte_at(vma->vm_mm, addr, pte,
			   make_pte_marker(PTE_MARKER_UFFD_WP));
	}
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
					  struct vm_area_struct *vma,
					  unsigned long addr, pmd_t pmd)
{
	unsigned long categories = PAGE_IS_HUGE;

	if (pmd_present(pmd)) {
		struct page *page;

		categories |= PAGE_IS_PRESENT;
		if (!pmd_uffd_wp(pmd))
			categories |= PAGE_IS_WRITTEN;

		if (p->masks_of_interest & PAGE_IS_FILE) {
			page = vm_normal_page_pmd(vma, addr, pmd);
			if (page && !PageAnon(page))
				categories |= PAGE_IS_FILE;
		}

		if (is_huge_zero_pmd(pmd))
			categories |= PAGE_IS_PFNZERO;
		if (pmd_soft_dirty(pmd))
			categories |= PAGE_IS_SOFT_DIRTY;
	} else if (is_swap_pmd(pmd)) {
		swp_entry_t swp;

		categories |= PAGE_IS_SWAPPED;
		if (!pmd_swp_uffd_wp(pmd))
			categories |= PAGE_IS_WRITTEN;
		if (pmd_swp_soft_dirty(pmd))
			categories |= PAGE_IS_SOFT_DIRTY;

		if (p->masks_of_interest & PAGE_IS_FILE) {
			swp = pmd_to_swp_entry(pmd);
			if (is_pfn_swap_entry(swp) &&
			    !folio_test_anon(pfn_swap_entry_folio(swp)))
				categories |= PAGE_IS_FILE;
		}
	}

	return categories;
}

static void make_uffd_wp_pmd(struct vm_area_struct *vma,
			     unsigned long addr, pmd_t *pmdp)
{
	pmd_t old, pmd = *pmdp;

	if (pmd_present(pmd)) {
		old = pmdp_invalidate_ad(vma, addr, pmdp);
		pmd = pmd_mkuffd_wp(old);
		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
	} else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
		pmd = pmd_swp_mkuffd_wp(pmd);
		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
	}
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long pagemap_hugetlb_category(pte_t pte)
{
	unsigned long categories = PAGE_IS_HUGE;

	/*
	 * According to pagemap_hugetlb_range(), a file-backed HugeTLB
	 * page cannot be swapped, so PAGE_IS_FILE is not checked for
	 * swapped pages.
	 */
	if (pte_present(pte)) {
		categories |= PAGE_IS_PRESENT;
		if (!huge_pte_uffd_wp(pte))
			categories |= PAGE_IS_WRITTEN;
		if (!PageAnon(pte_page(pte)))
			categories |= PAGE_IS_FILE;
		if (is_zero_pfn(pte_pfn(pte)))
			categories |= PAGE_IS_PFNZERO;
		if (pte_soft_dirty(pte))
			categories |= PAGE_IS_SOFT_DIRTY;
	} else if (is_swap_pte(pte)) {
		categories |= PAGE_IS_SWAPPED;
		if (!pte_swp_uffd_wp_any(pte))
			categories |= PAGE_IS_WRITTEN;
		if (pte_swp_soft_dirty(pte))
			categories |= PAGE_IS_SOFT_DIRTY;
	}

	return categories;
}

static void make_uffd_wp_huge_pte(struct vm_area_struct *vma,
				  unsigned long addr, pte_t *ptep,
				  pte_t ptent)
{
	unsigned long psize;

	if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent))
		return;

	psize = huge_page_size(hstate_vma(vma));

	if (is_hugetlb_entry_migration(ptent))
		set_huge_pte_at(vma->vm_mm, addr, ptep,
				pte_swp_mkuffd_wp(ptent), psize);
	else if (!huge_pte_none(ptent))
		huge_ptep_modify_prot_commit(vma, addr, ptep, ptent,
					     huge_pte_mkuffd_wp(ptent));
	else
		set_huge_pte_at(vma->vm_mm, addr, ptep,
				make_pte_marker(PTE_MARKER_UFFD_WP), psize);
}
#endif /* CONFIG_HUGETLB_PAGE */

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
				       unsigned long addr, unsigned long end)
{
	struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];

	if (cur_buf->start != addr)
		cur_buf->end = addr;
	else
		cur_buf->start = cur_buf->end = 0;

	p->found_pages -= (end - addr) / PAGE_SIZE;
}
#endif

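/*
 * A page matches when, after flipping the bits in category_inverted, all
 * bits in category_mask are set and (if category_anyof_mask is non-zero)
 * at least one bit of category_anyof_mask is set. For example, to select
 * pages that have been written but are not file-backed, userspace can pass
 * category_mask = PAGE_IS_WRITTEN | PAGE_IS_FILE together with
 * category_inverted = PAGE_IS_FILE. The vma-level check below only looks
 * at the PAGE_IS_WPALLOWED part of the mask.
 */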
static bool pagemap_scan_is_interesting_page(unsigned long categories,
					     const struct pagemap_scan_private *p)
{
	categories ^= p->arg.category_inverted;
	if ((categories & p->arg.category_mask) != p->arg.category_mask)
		return false;
	if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask))
		return false;

	return true;
}

static bool pagemap_scan_is_interesting_vma(unsigned long categories,
					    const struct pagemap_scan_private *p)
{
	unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED;

	categories ^= p->arg.category_inverted;
	if ((categories & required) != required)
		return false;

	return true;
}

static int pagemap_scan_test_walk(unsigned long start, unsigned long end,
				  struct mm_walk *walk)
{
	struct pagemap_scan_private *p = walk->private;
	struct vm_area_struct *vma = walk->vma;
	unsigned long vma_category = 0;
	bool wp_allowed = userfaultfd_wp_async(vma) &&
			  userfaultfd_wp_use_markers(vma);

	if (!wp_allowed) {
		/* User requested explicit failure over wp-async capability */
		if (p->arg.flags & PM_SCAN_CHECK_WPASYNC)
			return -EPERM;
		/*
		 * User requires wr-protect, and allows silently skipping
		 * unsupported vmas.
		 */
		if (p->arg.flags & PM_SCAN_WP_MATCHING)
			return 1;
		/*
		 * Then the request doesn't involve wr-protects at all, so
		 * fall through to the remaining checks and allow the vma walk.
		 */
	}

	if (vma->vm_flags & VM_PFNMAP)
		return 1;

	if (wp_allowed)
		vma_category |= PAGE_IS_WPALLOWED;

	if (vma->vm_flags & VM_SOFTDIRTY)
		vma_category |= PAGE_IS_SOFT_DIRTY;

	if (!pagemap_scan_is_interesting_vma(vma_category, p))
		return 1;

	p->cur_vma_category = vma_category;

	return 0;
}

static bool pagemap_scan_push_range(unsigned long categories,
				    struct pagemap_scan_private *p,
				    unsigned long addr, unsigned long end)
{
	struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];

	/*
	 * When no output buffer is provided at all, the sentinel values
	 * won't match here: cur_buf->end can only be non-zero when the
	 * current buffer entry is non-empty.
	 */
	if (addr == cur_buf->end && categories == cur_buf->categories) {
		cur_buf->end = end;
		return true;
	}

	if (cur_buf->end) {
		if (p->vec_buf_index >= p->vec_buf_len - 1)
			return false;

		cur_buf = &p->vec_buf[++p->vec_buf_index];
	}

	cur_buf->start = addr;
	cur_buf->end = end;
	cur_buf->categories = categories;

	return true;
}

static int pagemap_scan_output(unsigned long categories,
			       struct pagemap_scan_private *p,
			       unsigned long addr, unsigned long *end)
{
	unsigned long n_pages, total_pages;
	int ret = 0;

	if (!p->vec_buf)
		return 0;

	categories &= p->arg.return_mask;

	n_pages = (*end - addr) / PAGE_SIZE;
	if (check_add_overflow(p->found_pages, n_pages, &total_pages) ||
	    total_pages > p->arg.max_pages) {
		size_t n_too_much = total_pages - p->arg.max_pages;
		*end -= n_too_much * PAGE_SIZE;
		n_pages -= n_too_much;
		ret = -ENOSPC;
	}

	if (!pagemap_scan_push_range(categories, p, addr, *end)) {
		*end = addr;
		n_pages = 0;
		ret = -ENOSPC;
	}

	p->found_pages += n_pages;
	if (ret)
		p->arg.walk_end = *end;

	return ret;
}

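/*
 * Returns -ENOENT when no transparent huge page is mapped here (or THP
 * support is compiled out), which tells pagemap_scan_pmd_entry() to fall
 * back to the per-PTE loop.
 */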
static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start,
				  unsigned long end, struct mm_walk *walk)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	struct pagemap_scan_private *p = walk->private;
	struct vm_area_struct *vma = walk->vma;
	unsigned long categories;
	spinlock_t *ptl;
	int ret = 0;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		return -ENOENT;

	categories = p->cur_vma_category |
		     pagemap_thp_category(p, vma, start, *pmd);

	if (!pagemap_scan_is_interesting_page(categories, p))
		goto out_unlock;

	ret = pagemap_scan_output(categories, p, start, &end);
	if (start == end)
		goto out_unlock;

	if (~p->arg.flags & PM_SCAN_WP_MATCHING)
		goto out_unlock;
	if (~categories & PAGE_IS_WRITTEN)
		goto out_unlock;

	/*
	 * Break huge page into small pages if the WP operation
	 * needs to be performed on a portion of the huge page.
	 */
	if (end != start + HPAGE_SIZE) {
		spin_unlock(ptl);
		split_huge_pmd(vma, pmd, start);
		pagemap_scan_backout_range(p, start, end);
		/* Report as if there was no THP */
		return -ENOENT;
	}

	make_uffd_wp_pmd(vma, start, pmd);
	flush_tlb_range(vma, start, end);
out_unlock:
	spin_unlock(ptl);
	return ret;
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
	return -ENOENT;
#endif
}

static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
				  unsigned long end, struct mm_walk *walk)
{
	struct pagemap_scan_private *p = walk->private;
	struct vm_area_struct *vma = walk->vma;
	unsigned long addr, flush_end = 0;
	pte_t *pte, *start_pte;
	spinlock_t *ptl;
	int ret;

	ret = pagemap_scan_thp_entry(pmd, start, end, walk);
	if (ret != -ENOENT)
		return ret;

	ret = 0;
	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
	if (!pte) {
		walk->action = ACTION_AGAIN;
		return 0;
	}

	arch_enter_lazy_mmu_mode();

	if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
		/* Fast path for performing exclusive WP */
		for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
			pte_t ptent = ptep_get(pte);

			if ((pte_present(ptent) && pte_uffd_wp(ptent)) ||
			    pte_swp_uffd_wp_any(ptent))
				continue;
			make_uffd_wp_pte(vma, addr, pte, ptent);
			if (!flush_end)
				start = addr;
			flush_end = addr + PAGE_SIZE;
		}
		goto flush_and_return;
	}

	if (!p->arg.category_anyof_mask && !p->arg.category_inverted &&
	    p->arg.category_mask == PAGE_IS_WRITTEN &&
	    p->arg.return_mask == PAGE_IS_WRITTEN) {
		for (addr = start; addr < end; pte++, addr += PAGE_SIZE) {
			unsigned long next = addr + PAGE_SIZE;
			pte_t ptent = ptep_get(pte);

			if ((pte_present(ptent) && pte_uffd_wp(ptent)) ||
			    pte_swp_uffd_wp_any(ptent))
				continue;
			ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN,
						  p, addr, &next);
			if (next == addr)
				break;
			if (~p->arg.flags & PM_SCAN_WP_MATCHING)
				continue;
			make_uffd_wp_pte(vma, addr, pte, ptent);
			if (!flush_end)
				start = addr;
			flush_end = next;
		}
		goto flush_and_return;
	}

	for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
		pte_t ptent = ptep_get(pte);
		unsigned long categories = p->cur_vma_category |
					   pagemap_page_category(p, vma, addr, ptent);
		unsigned long next = addr + PAGE_SIZE;

		if (!pagemap_scan_is_interesting_page(categories, p))
			continue;

		ret = pagemap_scan_output(categories, p, addr, &next);
		if (next == addr)
			break;

		if (~p->arg.flags & PM_SCAN_WP_MATCHING)
			continue;
		if (~categories & PAGE_IS_WRITTEN)
			continue;

		make_uffd_wp_pte(vma, addr, pte, ptent);
		if (!flush_end)
			start = addr;
		flush_end = next;
	}

flush_and_return:
	if (flush_end)
		flush_tlb_range(vma, start, addr);

	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(start_pte, ptl);

	cond_resched();
	return ret;
}

#ifdef CONFIG_HUGETLB_PAGE
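/*
 * Scan (and optionally write-protect) a single hugetlb entry. Partial
 * write-protection of a hugetlb page is not possible, so a range smaller
 * than the huge page is backed out and walk_end is set to its start.
 */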
static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
				      unsigned long start, unsigned long end,
				      struct mm_walk *walk)
{
	struct pagemap_scan_private *p = walk->private;
	struct vm_area_struct *vma = walk->vma;
	unsigned long categories;
	spinlock_t *ptl;
	int ret = 0;
	pte_t pte;

	if (~p->arg.flags & PM_SCAN_WP_MATCHING) {
		/* Go the short route when not write-protecting pages. */

		pte = huge_ptep_get(walk->mm, start, ptep);
		categories = p->cur_vma_category | pagemap_hugetlb_category(pte);

		if (!pagemap_scan_is_interesting_page(categories, p))
			return 0;

		return pagemap_scan_output(categories, p, start, &end);
	}

	i_mmap_lock_write(vma->vm_file->f_mapping);
	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);

	pte = huge_ptep_get(walk->mm, start, ptep);
	categories = p->cur_vma_category | pagemap_hugetlb_category(pte);

	if (!pagemap_scan_is_interesting_page(categories, p))
		goto out_unlock;

	ret = pagemap_scan_output(categories, p, start, &end);
	if (start == end)
		goto out_unlock;

	if (~categories & PAGE_IS_WRITTEN)
		goto out_unlock;

	if (end != start + HPAGE_SIZE) {
		/* Partial HugeTLB page WP isn't possible. */
		pagemap_scan_backout_range(p, start, end);
		p->arg.walk_end = start;
		ret = 0;
		goto out_unlock;
	}

	make_uffd_wp_huge_pte(vma, start, ptep, pte);
	flush_hugetlb_tlb_range(vma, start, end);

out_unlock:
	spin_unlock(ptl);
	i_mmap_unlock_write(vma->vm_file->f_mapping);

	return ret;
}
#else
#define pagemap_scan_hugetlb_entry NULL
#endif

static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
				 int depth, struct mm_walk *walk)
{
	struct pagemap_scan_private *p = walk->private;
	struct vm_area_struct *vma = walk->vma;
	int ret, err;

	if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p))
		return 0;

	ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end);
	if (addr == end)
		return ret;

	if (~p->arg.flags & PM_SCAN_WP_MATCHING)
		return ret;

	err = uffd_wp_range(vma, addr, end - addr, true);
	if (err < 0)
		ret = err;

	return ret;
}

static const struct mm_walk_ops pagemap_scan_ops = {
	.test_walk = pagemap_scan_test_walk,
	.pmd_entry = pagemap_scan_pmd_entry,
	.pte_hole = pagemap_scan_pte_hole,
	.hugetlb_entry = pagemap_scan_hugetlb_entry,
};

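/*
 * Illustrative userspace sketch (not part of this file): invoking the
 * PAGEMAP_SCAN ioctl to collect ranges of soft-dirty pages. Field names
 * follow struct pm_scan_arg from the kernel UAPI; error handling is
 * omitted, and pagemap_fd/scan_start/scan_end are assumed to exist.
 *
 *	struct page_region regions[64];
 *	struct pm_scan_arg arg = {
 *		.size = sizeof(arg),
 *		.start = scan_start,
 *		.end = scan_end,
 *		.vec = (uintptr_t)regions,
 *		.vec_len = 64,
 *		.category_mask = PAGE_IS_SOFT_DIRTY,
 *		.return_mask = PAGE_IS_SOFT_DIRTY,
 *	};
 *	int n = ioctl(pagemap_fd, PAGEMAP_SCAN, &arg);
 *
 * On success the return value is the number of filled regions[] entries
 * and arg.walk_end reports how far the scan progressed.
 */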
static int pagemap_scan_get_args(struct pm_scan_arg *arg,
				 unsigned long uarg)
{
	if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg)))
		return -EFAULT;

	if (arg->size != sizeof(struct pm_scan_arg))
		return -EINVAL;

	/* Validate requested features */
	if (arg->flags & ~PM_SCAN_FLAGS)
		return -EINVAL;
	if ((arg->category_inverted | arg->category_mask |
	     arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES)
		return -EINVAL;

	arg->start = untagged_addr((unsigned long)arg->start);
	arg->end = untagged_addr((unsigned long)arg->end);
	arg->vec = untagged_addr((unsigned long)arg->vec);

	/* Validate memory pointers */
	if (!IS_ALIGNED(arg->start, PAGE_SIZE))
		return -EINVAL;
	if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start))
		return -EFAULT;
	if (!arg->vec && arg->vec_len)
		return -EINVAL;
	if (UINT_MAX == SIZE_MAX && arg->vec_len > SIZE_MAX)
		return -EINVAL;
	if (arg->vec && !access_ok((void __user *)(long)arg->vec,
				   size_mul(arg->vec_len, sizeof(struct page_region))))
		return -EFAULT;

	/* Fixup default values */
	arg->end = ALIGN(arg->end, PAGE_SIZE);
	arg->walk_end = 0;
	if (!arg->max_pages)
		arg->max_pages = ULONG_MAX;

	return 0;
}

static int pagemap_scan_writeback_args(struct pm_scan_arg *arg,
				       unsigned long uargl)
{
	struct pm_scan_arg __user *uarg = (void __user *)uargl;

	if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end)))
		return -EFAULT;

	return 0;
}

static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p)
{
	if (!p->arg.vec_len)
		return 0;

	p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT,
			       p->arg.vec_len);
	p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf),
				   GFP_KERNEL);
	if (!p->vec_buf)
		return -ENOMEM;

	p->vec_buf->start = p->vec_buf->end = 0;
	p->vec_out = (struct page_region __user *)(long)p->arg.vec;

	return 0;
}

static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
{
	const struct page_region *buf = p->vec_buf;
	long n = p->vec_buf_index;

	if (!p->vec_buf)
		return 0;

	if (buf[n].end != buf[n].start)
		n++;

	if (!n)
		return 0;

	if (copy_to_user(p->vec_out, buf, n * sizeof(*buf)))
		return -EFAULT;

	p->arg.vec_len -= n;
	p->vec_out += n;

	p->vec_buf_index = 0;
	p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len);
	p->vec_buf->start = p->vec_buf->end = 0;

	return n;
}

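/*
 * Top-level PAGEMAP_SCAN handler: the address range is walked in chunks,
 * each under mmap_lock, and the bounce buffer is flushed to userspace
 * between chunks. The loop stops early when the output vector or the
 * max_pages budget is exhausted (-ENOSPC from the walk) or on a fatal
 * signal; arg.walk_end is written back so userspace can resume.
 */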
static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
{
	struct pagemap_scan_private p = {0};
	unsigned long walk_start;
	size_t n_ranges_out = 0;
	int ret;

	ret = pagemap_scan_get_args(&p.arg, uarg);
	if (ret)
		return ret;

	p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask |
			      p.arg.return_mask;
	ret = pagemap_scan_init_bounce_buffer(&p);
	if (ret)
		return ret;

	for (walk_start = p.arg.start; walk_start < p.arg.end;
	     walk_start = p.arg.walk_end) {
		struct mmu_notifier_range range;
		long n_out;

		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		ret = mmap_read_lock_killable(mm);
		if (ret)
			break;

		/* Protection change for the range is going to happen. */
		if (p.arg.flags & PM_SCAN_WP_MATCHING) {
			mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
						mm, walk_start, p.arg.end);
			mmu_notifier_invalidate_range_start(&range);
		}

		ret = walk_page_range(mm, walk_start, p.arg.end,
				      &pagemap_scan_ops, &p);

		if (p.arg.flags & PM_SCAN_WP_MATCHING)
			mmu_notifier_invalidate_range_end(&range);

		mmap_read_unlock(mm);

		n_out = pagemap_scan_flush_buffer(&p);
		if (n_out < 0)
			ret = n_out;
		else
			n_ranges_out += n_out;

		if (ret != -ENOSPC)
			break;

		if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages)
			break;
	}

	/* ENOSPC signifies early stop (buffer full) from the walk. */
	if (!ret || ret == -ENOSPC)
		ret = n_ranges_out;

	/* The walk_end isn't set when ret is zero */
	if (!p.arg.walk_end)
		p.arg.walk_end = p.arg.end;
	if (pagemap_scan_writeback_args(&p.arg, uarg))
		ret = -EFAULT;

	kfree(p.vec_buf);
	return ret;
}

static long do_pagemap_cmd(struct file *file, unsigned int cmd,
			   unsigned long arg)
{
	struct mm_struct *mm = file->private_data;

	switch (cmd) {
	case PAGEMAP_SCAN:
		return do_pagemap_scan(mm, arg);

	default:
		return -EINVAL;
	}
}

const struct file_operations proc_pagemap_operations = {
	.llseek = mem_lseek, /* borrow this */
	.read = pagemap_read,
	.open = pagemap_open,
	.release = pagemap_release,
	.unlocked_ioctl = do_pagemap_cmd,
	.compat_ioctl = do_pagemap_cmd,
};
#endif /* CONFIG_PROC_PAGE_MONITOR */

#ifdef CONFIG_NUMA

struct numa_maps {
	unsigned long pages;
	unsigned long anon;
	unsigned long active;
	unsigned long writeback;
	unsigned long mapcount_max;
	unsigned long dirty;
	unsigned long swapcache;
	unsigned long node[MAX_NUMNODES];
};

struct numa_maps_private {
	struct proc_maps_private proc_maps;
	struct numa_maps md;
};

static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
			 unsigned long nr_pages)
{
	struct folio *folio = page_folio(page);
	int count;

	if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
		count = folio_precise_page_mapcount(folio, page);
	else
		count = folio_average_page_mapcount(folio);

	md->pages += nr_pages;
	if (pte_dirty || folio_test_dirty(folio))
		md->dirty += nr_pages;

	if (folio_test_swapcache(folio))
		md->swapcache += nr_pages;

	if (folio_test_active(folio) || folio_test_unevictable(folio))
		md->active += nr_pages;

	if (folio_test_writeback(folio))
		md->writeback += nr_pages;

	if (folio_test_anon(folio))
		md->anon += nr_pages;

	if (count > md->mapcount_max)
		md->mapcount_max = count;

	md->node[folio_nid(folio)] += nr_pages;
}

static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
					  unsigned long addr)
{
	struct page *page;
	int nid;

	if (!pte_present(pte))
		return NULL;

	page = vm_normal_page(vma, addr, pte);
	if (!page || is_zone_device_page(page))
		return NULL;

	if (PageReserved(page))
		return NULL;

	nid = page_to_nid(page);
	if (!node_isset(nid, node_states[N_MEMORY]))
		return NULL;

	return page;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
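/* PMD (transparent huge page) counterpart of can_gather_numa_stats(). */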
static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
					      struct vm_area_struct *vma,
					      unsigned long addr)
{
	struct page *page;
	int nid;

	if (!pmd_present(pmd))
		return NULL;

	page = vm_normal_page_pmd(vma, addr, pmd);
	if (!page)
		return NULL;

	if (PageReserved(page))
		return NULL;

	nid = page_to_nid(page);
	if (!node_isset(nid, node_states[N_MEMORY]))
		return NULL;

	return page;
}
#endif

static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
			    unsigned long end, struct mm_walk *walk)
{
	struct numa_maps *md = walk->private;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *orig_pte;
	pte_t *pte;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		struct page *page;

		page = can_gather_numa_stats_pmd(*pmd, vma, addr);
		if (page)
			gather_stats(page, md, pmd_dirty(*pmd),
				     HPAGE_PMD_SIZE/PAGE_SIZE);
		spin_unlock(ptl);
		return 0;
	}
#endif
	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte) {
		walk->action = ACTION_AGAIN;
		return 0;
	}
	do {
		pte_t ptent = ptep_get(pte);
		struct page *page = can_gather_numa_stats(ptent, vma, addr);
		if (!page)
			continue;
		gather_stats(page, md, pte_dirty(ptent), 1);

	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	cond_resched();
	return 0;
}
#ifdef CONFIG_HUGETLB_PAGE
static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
		unsigned long addr, unsigned long end, struct mm_walk *walk)
{
	pte_t huge_pte = huge_ptep_get(walk->mm, addr, pte);
	struct numa_maps *md;
	struct page *page;

	if (!pte_present(huge_pte))
		return 0;

	page = pte_page(huge_pte);

	md = walk->private;
	gather_stats(page, md, pte_dirty(huge_pte), 1);
	return 0;
}

#else
static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
		unsigned long addr, unsigned long end, struct mm_walk *walk)
{
	return 0;
}
#endif

static const struct mm_walk_ops show_numa_ops = {
	.hugetlb_entry = gather_hugetlb_stats,
	.pmd_entry = gather_pte_stats,
	.walk_lock = PGWALK_RDLOCK,
};

/*
 * Display pages allocated per node and memory policy via /proc.
 */
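/*
 * Each output line has the form
 *   <start> <policy> [file=<path>|heap|stack] [huge] [anon=N] [dirty=N]
 *   [mapped=N] [mapmax=N] [swapcache=N] [active=N] [writeback=N]
 *   [N<nid>=N]... kernelpagesize_kB=<size>
 * where most counters are printed only when non-zero (see the seq_printf()
 * calls below for the exact conditions).
 */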
static int show_numa_map(struct seq_file *m, void *v)
{
	struct numa_maps_private *numa_priv = m->private;
	struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
	struct vm_area_struct *vma = v;
	struct numa_maps *md = &numa_priv->md;
	struct file *file = vma->vm_file;
	struct mm_struct *mm = vma->vm_mm;
	char buffer[64];
	struct mempolicy *pol;
	pgoff_t ilx;
	int nid;

	if (!mm)
		return 0;

	/* Ensure we start with an empty set of numa_maps statistics. */
	memset(md, 0, sizeof(*md));

	pol = __get_vma_policy(vma, vma->vm_start, &ilx);
	if (pol) {
		mpol_to_str(buffer, sizeof(buffer), pol);
		mpol_cond_put(pol);
	} else {
		mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy);
	}

	seq_printf(m, "%08lx %s", vma->vm_start, buffer);

	if (file) {
		seq_puts(m, " file=");
		seq_path(m, file_user_path(file), "\n\t= ");
	} else if (vma_is_initial_heap(vma)) {
		seq_puts(m, " heap");
	} else if (vma_is_initial_stack(vma)) {
		seq_puts(m, " stack");
	}

	if (is_vm_hugetlb_page(vma))
		seq_puts(m, " huge");

	/* mmap_lock is held by m_start */
	walk_page_vma(vma, &show_numa_ops, md);

	if (!md->pages)
		goto out;

	if (md->anon)
		seq_printf(m, " anon=%lu", md->anon);

	if (md->dirty)
		seq_printf(m, " dirty=%lu", md->dirty);

	if (md->pages != md->anon && md->pages != md->dirty)
		seq_printf(m, " mapped=%lu", md->pages);

	if (md->mapcount_max > 1)
		seq_printf(m, " mapmax=%lu", md->mapcount_max);

	if (md->swapcache)
		seq_printf(m, " swapcache=%lu", md->swapcache);

	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
		seq_printf(m, " active=%lu", md->active);

	if (md->writeback)
		seq_printf(m, " writeback=%lu", md->writeback);

	for_each_node_state(nid, N_MEMORY)
		if (md->node[nid])
			seq_printf(m, " N%d=%lu", nid, md->node[nid]);

	seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
out:
	seq_putc(m, '\n');
	return 0;
}

static const struct seq_operations proc_pid_numa_maps_op = {
	.start = m_start,
	.next = m_next,
	.stop = m_stop,
	.show = show_numa_map,
};

static int pid_numa_maps_open(struct inode *inode, struct file *file)
{
	return proc_maps_open(inode, file, &proc_pid_numa_maps_op,
			      sizeof(struct numa_maps_private));
}

const struct file_operations proc_pid_numa_maps_operations = {
	.open = pid_numa_maps_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = proc_map_release,
};

#endif /* CONFIG_NUMA */