1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/pagewalk.h> 3 #include <linux/mm_inline.h> 4 #include <linux/hugetlb.h> 5 #include <linux/huge_mm.h> 6 #include <linux/mount.h> 7 #include <linux/ksm.h> 8 #include <linux/seq_file.h> 9 #include <linux/highmem.h> 10 #include <linux/ptrace.h> 11 #include <linux/slab.h> 12 #include <linux/pagemap.h> 13 #include <linux/mempolicy.h> 14 #include <linux/rmap.h> 15 #include <linux/swap.h> 16 #include <linux/sched/mm.h> 17 #include <linux/swapops.h> 18 #include <linux/mmu_notifier.h> 19 #include <linux/page_idle.h> 20 #include <linux/shmem_fs.h> 21 #include <linux/uaccess.h> 22 #include <linux/pkeys.h> 23 #include <linux/minmax.h> 24 #include <linux/overflow.h> 25 #include <linux/buildid.h> 26 27 #include <asm/elf.h> 28 #include <asm/tlb.h> 29 #include <asm/tlbflush.h> 30 #include "internal.h" 31 32 #define SENTINEL_VMA_END -1 33 #define SENTINEL_VMA_GATE -2 34 35 #define SEQ_PUT_DEC(str, val) \ 36 seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8) 37 void task_mem(struct seq_file *m, struct mm_struct *mm) 38 { 39 unsigned long text, lib, swap, anon, file, shmem; 40 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; 41 42 anon = get_mm_counter_sum(mm, MM_ANONPAGES); 43 file = get_mm_counter_sum(mm, MM_FILEPAGES); 44 shmem = get_mm_counter_sum(mm, MM_SHMEMPAGES); 45 46 /* 47 * Note: to minimize their overhead, mm maintains hiwater_vm and 48 * hiwater_rss only when about to *lower* total_vm or rss. Any 49 * collector of these hiwater stats must therefore get total_vm 50 * and rss too, which will usually be the higher. Barriers? not 51 * worth the effort, such snapshots can always be inconsistent. 52 */ 53 hiwater_vm = total_vm = mm->total_vm; 54 if (hiwater_vm < mm->hiwater_vm) 55 hiwater_vm = mm->hiwater_vm; 56 hiwater_rss = total_rss = anon + file + shmem; 57 if (hiwater_rss < mm->hiwater_rss) 58 hiwater_rss = mm->hiwater_rss; 59 60 /* split executable areas between text and lib */ 61 text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK); 62 text = min(text, mm->exec_vm << PAGE_SHIFT); 63 lib = (mm->exec_vm << PAGE_SHIFT) - text; 64 65 swap = get_mm_counter_sum(mm, MM_SWAPENTS); 66 SEQ_PUT_DEC("VmPeak:\t", hiwater_vm); 67 SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm); 68 SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm); 69 SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm)); 70 SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss); 71 SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss); 72 SEQ_PUT_DEC(" kB\nRssAnon:\t", anon); 73 SEQ_PUT_DEC(" kB\nRssFile:\t", file); 74 SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem); 75 SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm); 76 SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm); 77 seq_put_decimal_ull_width(m, 78 " kB\nVmExe:\t", text >> 10, 8); 79 seq_put_decimal_ull_width(m, 80 " kB\nVmLib:\t", lib >> 10, 8); 81 seq_put_decimal_ull_width(m, 82 " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8); 83 SEQ_PUT_DEC(" kB\nVmSwap:\t", swap); 84 seq_puts(m, " kB\n"); 85 hugetlb_report_usage(m, mm); 86 } 87 #undef SEQ_PUT_DEC 88 89 unsigned long task_vsize(struct mm_struct *mm) 90 { 91 return PAGE_SIZE * mm->total_vm; 92 } 93 94 unsigned long task_statm(struct mm_struct *mm, 95 unsigned long *shared, unsigned long *text, 96 unsigned long *data, unsigned long *resident) 97 { 98 *shared = get_mm_counter_sum(mm, MM_FILEPAGES) + 99 get_mm_counter_sum(mm, MM_SHMEMPAGES); 100 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) 101 >> PAGE_SHIFT; 102 *data = mm->data_vm + mm->stack_vm; 103 *resident = *shared + 
		   get_mm_counter_sum(mm, MM_ANONPAGES);
	return mm->total_vm;
}

#ifdef CONFIG_NUMA
/*
 * Save get_task_policy() for show_numa_map().
 */
static void hold_task_mempolicy(struct proc_maps_private *priv)
{
	struct task_struct *task = priv->task;

	task_lock(task);
	priv->task_mempolicy = get_task_policy(task);
	mpol_get(priv->task_mempolicy);
	task_unlock(task);
}
static void release_task_mempolicy(struct proc_maps_private *priv)
{
	mpol_put(priv->task_mempolicy);
}
#else
static void hold_task_mempolicy(struct proc_maps_private *priv)
{
}
static void release_task_mempolicy(struct proc_maps_private *priv)
{
}
#endif

#ifdef CONFIG_PER_VMA_LOCK

static void unlock_vma(struct proc_maps_private *priv)
{
	if (priv->locked_vma) {
		vma_end_read(priv->locked_vma);
		priv->locked_vma = NULL;
	}
}

static const struct seq_operations proc_pid_maps_op;

static inline bool lock_vma_range(struct seq_file *m,
				  struct proc_maps_private *priv)
{
	/*
	 * smaps and numa_maps perform page table walk, therefore require
	 * mmap_lock but maps can be read with locking just the vma and
	 * walking the vma tree under rcu read protection.
	 */
	if (m->op != &proc_pid_maps_op) {
		if (mmap_read_lock_killable(priv->mm))
			return false;

		priv->mmap_locked = true;
	} else {
		rcu_read_lock();
		priv->locked_vma = NULL;
		priv->mmap_locked = false;
	}

	return true;
}

static inline void unlock_vma_range(struct proc_maps_private *priv)
{
	if (priv->mmap_locked) {
		mmap_read_unlock(priv->mm);
	} else {
		unlock_vma(priv);
		rcu_read_unlock();
	}
}

static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
					   loff_t last_pos)
{
	struct vm_area_struct *vma;

	if (priv->mmap_locked)
		return vma_next(&priv->iter);

	unlock_vma(priv);
	vma = lock_next_vma(priv->mm, &priv->iter, last_pos);
	if (!IS_ERR_OR_NULL(vma))
		priv->locked_vma = vma;

	return vma;
}

static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
					 loff_t pos)
{
	if (priv->mmap_locked)
		return false;

	rcu_read_unlock();
	mmap_read_lock(priv->mm);
	/* Reinitialize the iterator after taking mmap_lock */
	vma_iter_set(&priv->iter, pos);
	priv->mmap_locked = true;

	return true;
}

#else /* CONFIG_PER_VMA_LOCK */

static inline bool lock_vma_range(struct seq_file *m,
				  struct proc_maps_private *priv)
{
	return mmap_read_lock_killable(priv->mm) == 0;
}

static inline void unlock_vma_range(struct proc_maps_private *priv)
{
	mmap_read_unlock(priv->mm);
}

static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
					   loff_t last_pos)
{
	return vma_next(&priv->iter);
}

static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
					 loff_t pos)
{
	return false;
}

#endif /* CONFIG_PER_VMA_LOCK */

static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos)
{
	struct proc_maps_private *priv = m->private;
	struct vm_area_struct *vma;

retry:
	vma = get_next_vma(priv, *ppos);
	/* EINTR or EAGAIN is possible */
	if (IS_ERR(vma)) {
		if (PTR_ERR(vma) == -EAGAIN && fallback_to_mmap_lock(priv, *ppos))
			goto retry;

		return vma;
	}

	/*
Store previous position to be able to restart if needed */ 251 priv->last_pos = *ppos; 252 if (vma) { 253 /* 254 * Track the end of the reported vma to ensure position changes 255 * even if previous vma was merged with the next vma and we 256 * found the extended vma with the same vm_start. 257 */ 258 *ppos = vma->vm_end; 259 } else { 260 *ppos = SENTINEL_VMA_GATE; 261 vma = get_gate_vma(priv->mm); 262 } 263 264 return vma; 265 } 266 267 static void *m_start(struct seq_file *m, loff_t *ppos) 268 { 269 struct proc_maps_private *priv = m->private; 270 loff_t last_addr = *ppos; 271 struct mm_struct *mm; 272 273 /* See m_next(). Zero at the start or after lseek. */ 274 if (last_addr == SENTINEL_VMA_END) 275 return NULL; 276 277 priv->task = get_proc_task(priv->inode); 278 if (!priv->task) 279 return ERR_PTR(-ESRCH); 280 281 mm = priv->mm; 282 if (!mm || !mmget_not_zero(mm)) { 283 put_task_struct(priv->task); 284 priv->task = NULL; 285 return NULL; 286 } 287 288 if (!lock_vma_range(m, priv)) { 289 mmput(mm); 290 put_task_struct(priv->task); 291 priv->task = NULL; 292 return ERR_PTR(-EINTR); 293 } 294 295 /* 296 * Reset current position if last_addr was set before 297 * and it's not a sentinel. 298 */ 299 if (last_addr > 0) 300 *ppos = last_addr = priv->last_pos; 301 vma_iter_init(&priv->iter, mm, (unsigned long)last_addr); 302 hold_task_mempolicy(priv); 303 if (last_addr == SENTINEL_VMA_GATE) 304 return get_gate_vma(mm); 305 306 return proc_get_vma(m, ppos); 307 } 308 309 static void *m_next(struct seq_file *m, void *v, loff_t *ppos) 310 { 311 if (*ppos == SENTINEL_VMA_GATE) { 312 *ppos = SENTINEL_VMA_END; 313 return NULL; 314 } 315 return proc_get_vma(m, ppos); 316 } 317 318 static void m_stop(struct seq_file *m, void *v) 319 { 320 struct proc_maps_private *priv = m->private; 321 struct mm_struct *mm = priv->mm; 322 323 if (!priv->task) 324 return; 325 326 release_task_mempolicy(priv); 327 unlock_vma_range(priv); 328 mmput(mm); 329 put_task_struct(priv->task); 330 priv->task = NULL; 331 } 332 333 static int proc_maps_open(struct inode *inode, struct file *file, 334 const struct seq_operations *ops, int psize) 335 { 336 struct proc_maps_private *priv = __seq_open_private(file, ops, psize); 337 338 if (!priv) 339 return -ENOMEM; 340 341 priv->inode = inode; 342 priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); 343 if (IS_ERR(priv->mm)) { 344 int err = PTR_ERR(priv->mm); 345 346 seq_release_private(inode, file); 347 return err; 348 } 349 350 return 0; 351 } 352 353 static int proc_map_release(struct inode *inode, struct file *file) 354 { 355 struct seq_file *seq = file->private_data; 356 struct proc_maps_private *priv = seq->private; 357 358 if (priv->mm) 359 mmdrop(priv->mm); 360 361 return seq_release_private(inode, file); 362 } 363 364 static int do_maps_open(struct inode *inode, struct file *file, 365 const struct seq_operations *ops) 366 { 367 return proc_maps_open(inode, file, ops, 368 sizeof(struct proc_maps_private)); 369 } 370 371 static void get_vma_name(struct vm_area_struct *vma, 372 const struct path **path, 373 const char **name, 374 const char **name_fmt) 375 { 376 struct anon_vma_name *anon_name = vma->vm_mm ? anon_vma_name(vma) : NULL; 377 378 *name = NULL; 379 *path = NULL; 380 *name_fmt = NULL; 381 382 /* 383 * Print the dentry name for named mappings, and a 384 * special [heap] marker for the heap: 385 */ 386 if (vma->vm_file) { 387 /* 388 * If user named this anon shared memory via 389 * prctl(PR_SET_VMA ..., use the provided name. 
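		 * (Illustrative only, not part of the original comment: userspace
		 *  typically assigns such a name with a call along the lines of
		 *  prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, addr, len, name).)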
390 */ 391 if (anon_name) { 392 *name_fmt = "[anon_shmem:%s]"; 393 *name = anon_name->name; 394 } else { 395 *path = file_user_path(vma->vm_file); 396 } 397 return; 398 } 399 400 if (vma->vm_ops && vma->vm_ops->name) { 401 *name = vma->vm_ops->name(vma); 402 if (*name) 403 return; 404 } 405 406 *name = arch_vma_name(vma); 407 if (*name) 408 return; 409 410 if (!vma->vm_mm) { 411 *name = "[vdso]"; 412 return; 413 } 414 415 if (vma_is_initial_heap(vma)) { 416 *name = "[heap]"; 417 return; 418 } 419 420 if (vma_is_initial_stack(vma)) { 421 *name = "[stack]"; 422 return; 423 } 424 425 if (anon_name) { 426 *name_fmt = "[anon:%s]"; 427 *name = anon_name->name; 428 return; 429 } 430 } 431 432 static void show_vma_header_prefix(struct seq_file *m, 433 unsigned long start, unsigned long end, 434 vm_flags_t flags, unsigned long long pgoff, 435 dev_t dev, unsigned long ino) 436 { 437 seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); 438 seq_put_hex_ll(m, NULL, start, 8); 439 seq_put_hex_ll(m, "-", end, 8); 440 seq_putc(m, ' '); 441 seq_putc(m, flags & VM_READ ? 'r' : '-'); 442 seq_putc(m, flags & VM_WRITE ? 'w' : '-'); 443 seq_putc(m, flags & VM_EXEC ? 'x' : '-'); 444 seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p'); 445 seq_put_hex_ll(m, " ", pgoff, 8); 446 seq_put_hex_ll(m, " ", MAJOR(dev), 2); 447 seq_put_hex_ll(m, ":", MINOR(dev), 2); 448 seq_put_decimal_ull(m, " ", ino); 449 seq_putc(m, ' '); 450 } 451 452 static void 453 show_map_vma(struct seq_file *m, struct vm_area_struct *vma) 454 { 455 const struct path *path; 456 const char *name_fmt, *name; 457 vm_flags_t flags = vma->vm_flags; 458 unsigned long ino = 0; 459 unsigned long long pgoff = 0; 460 unsigned long start, end; 461 dev_t dev = 0; 462 463 if (vma->vm_file) { 464 const struct inode *inode = file_user_inode(vma->vm_file); 465 466 dev = inode->i_sb->s_dev; 467 ino = inode->i_ino; 468 pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; 469 } 470 471 start = vma->vm_start; 472 end = vma->vm_end; 473 show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino); 474 475 get_vma_name(vma, &path, &name, &name_fmt); 476 if (path) { 477 seq_pad(m, ' '); 478 seq_path(m, path, "\n"); 479 } else if (name_fmt) { 480 seq_pad(m, ' '); 481 seq_printf(m, name_fmt, name); 482 } else if (name) { 483 seq_pad(m, ' '); 484 seq_puts(m, name); 485 } 486 seq_putc(m, '\n'); 487 } 488 489 static int show_map(struct seq_file *m, void *v) 490 { 491 show_map_vma(m, v); 492 return 0; 493 } 494 495 static const struct seq_operations proc_pid_maps_op = { 496 .start = m_start, 497 .next = m_next, 498 .stop = m_stop, 499 .show = show_map 500 }; 501 502 static int pid_maps_open(struct inode *inode, struct file *file) 503 { 504 return do_maps_open(inode, file, &proc_pid_maps_op); 505 } 506 507 #define PROCMAP_QUERY_VMA_FLAGS ( \ 508 PROCMAP_QUERY_VMA_READABLE | \ 509 PROCMAP_QUERY_VMA_WRITABLE | \ 510 PROCMAP_QUERY_VMA_EXECUTABLE | \ 511 PROCMAP_QUERY_VMA_SHARED \ 512 ) 513 514 #define PROCMAP_QUERY_VALID_FLAGS_MASK ( \ 515 PROCMAP_QUERY_COVERING_OR_NEXT_VMA | \ 516 PROCMAP_QUERY_FILE_BACKED_VMA | \ 517 PROCMAP_QUERY_VMA_FLAGS \ 518 ) 519 520 static int query_vma_setup(struct mm_struct *mm) 521 { 522 return mmap_read_lock_killable(mm); 523 } 524 525 static void query_vma_teardown(struct mm_struct *mm, struct vm_area_struct *vma) 526 { 527 mmap_read_unlock(mm); 528 } 529 530 static struct vm_area_struct *query_vma_find_by_addr(struct mm_struct *mm, unsigned long addr) 531 { 532 return find_vma(mm, addr); 533 } 534 535 static struct vm_area_struct *query_matching_vma(struct mm_struct 
*mm, 536 unsigned long addr, u32 flags) 537 { 538 struct vm_area_struct *vma; 539 540 next_vma: 541 vma = query_vma_find_by_addr(mm, addr); 542 if (!vma) 543 goto no_vma; 544 545 /* user requested only file-backed VMA, keep iterating */ 546 if ((flags & PROCMAP_QUERY_FILE_BACKED_VMA) && !vma->vm_file) 547 goto skip_vma; 548 549 /* VMA permissions should satisfy query flags */ 550 if (flags & PROCMAP_QUERY_VMA_FLAGS) { 551 u32 perm = 0; 552 553 if (flags & PROCMAP_QUERY_VMA_READABLE) 554 perm |= VM_READ; 555 if (flags & PROCMAP_QUERY_VMA_WRITABLE) 556 perm |= VM_WRITE; 557 if (flags & PROCMAP_QUERY_VMA_EXECUTABLE) 558 perm |= VM_EXEC; 559 if (flags & PROCMAP_QUERY_VMA_SHARED) 560 perm |= VM_MAYSHARE; 561 562 if ((vma->vm_flags & perm) != perm) 563 goto skip_vma; 564 } 565 566 /* found covering VMA or user is OK with the matching next VMA */ 567 if ((flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) || vma->vm_start <= addr) 568 return vma; 569 570 skip_vma: 571 /* 572 * If the user needs closest matching VMA, keep iterating. 573 */ 574 addr = vma->vm_end; 575 if (flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) 576 goto next_vma; 577 578 no_vma: 579 return ERR_PTR(-ENOENT); 580 } 581 582 static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg) 583 { 584 struct procmap_query karg; 585 struct vm_area_struct *vma; 586 struct mm_struct *mm; 587 const char *name = NULL; 588 char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL; 589 __u64 usize; 590 int err; 591 592 if (copy_from_user(&usize, (void __user *)uarg, sizeof(usize))) 593 return -EFAULT; 594 /* argument struct can never be that large, reject abuse */ 595 if (usize > PAGE_SIZE) 596 return -E2BIG; 597 /* argument struct should have at least query_flags and query_addr fields */ 598 if (usize < offsetofend(struct procmap_query, query_addr)) 599 return -EINVAL; 600 err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize); 601 if (err) 602 return err; 603 604 /* reject unknown flags */ 605 if (karg.query_flags & ~PROCMAP_QUERY_VALID_FLAGS_MASK) 606 return -EINVAL; 607 /* either both buffer address and size are set, or both should be zero */ 608 if (!!karg.vma_name_size != !!karg.vma_name_addr) 609 return -EINVAL; 610 if (!!karg.build_id_size != !!karg.build_id_addr) 611 return -EINVAL; 612 613 mm = priv->mm; 614 if (!mm || !mmget_not_zero(mm)) 615 return -ESRCH; 616 617 err = query_vma_setup(mm); 618 if (err) { 619 mmput(mm); 620 return err; 621 } 622 623 vma = query_matching_vma(mm, karg.query_addr, karg.query_flags); 624 if (IS_ERR(vma)) { 625 err = PTR_ERR(vma); 626 vma = NULL; 627 goto out; 628 } 629 630 karg.vma_start = vma->vm_start; 631 karg.vma_end = vma->vm_end; 632 633 karg.vma_flags = 0; 634 if (vma->vm_flags & VM_READ) 635 karg.vma_flags |= PROCMAP_QUERY_VMA_READABLE; 636 if (vma->vm_flags & VM_WRITE) 637 karg.vma_flags |= PROCMAP_QUERY_VMA_WRITABLE; 638 if (vma->vm_flags & VM_EXEC) 639 karg.vma_flags |= PROCMAP_QUERY_VMA_EXECUTABLE; 640 if (vma->vm_flags & VM_MAYSHARE) 641 karg.vma_flags |= PROCMAP_QUERY_VMA_SHARED; 642 643 karg.vma_page_size = vma_kernel_pagesize(vma); 644 645 if (vma->vm_file) { 646 const struct inode *inode = file_user_inode(vma->vm_file); 647 648 karg.vma_offset = ((__u64)vma->vm_pgoff) << PAGE_SHIFT; 649 karg.dev_major = MAJOR(inode->i_sb->s_dev); 650 karg.dev_minor = MINOR(inode->i_sb->s_dev); 651 karg.inode = inode->i_ino; 652 } else { 653 karg.vma_offset = 0; 654 karg.dev_major = 0; 655 karg.dev_minor = 0; 656 karg.inode = 0; 657 } 658 659 if (karg.build_id_size) { 660 __u32 build_id_sz; 

		err = build_id_parse(vma, build_id_buf, &build_id_sz);
		if (err) {
			karg.build_id_size = 0;
		} else {
			if (karg.build_id_size < build_id_sz) {
				err = -ENAMETOOLONG;
				goto out;
			}
			karg.build_id_size = build_id_sz;
		}
	}

	if (karg.vma_name_size) {
		size_t name_buf_sz = min_t(size_t, PATH_MAX, karg.vma_name_size);
		const struct path *path;
		const char *name_fmt;
		size_t name_sz = 0;

		get_vma_name(vma, &path, &name, &name_fmt);

		if (path || name_fmt || name) {
			name_buf = kmalloc(name_buf_sz, GFP_KERNEL);
			if (!name_buf) {
				err = -ENOMEM;
				goto out;
			}
		}
		if (path) {
			name = d_path(path, name_buf, name_buf_sz);
			if (IS_ERR(name)) {
				err = PTR_ERR(name);
				goto out;
			}
			name_sz = name_buf + name_buf_sz - name;
		} else if (name || name_fmt) {
			name_sz = 1 + snprintf(name_buf, name_buf_sz, name_fmt ?: "%s", name);
			name = name_buf;
		}
		if (name_sz > name_buf_sz) {
			err = -ENAMETOOLONG;
			goto out;
		}
		karg.vma_name_size = name_sz;
	}

	/* unlock vma or mmap_lock, and put mm_struct before copying data to user */
	query_vma_teardown(mm, vma);
	mmput(mm);

	if (karg.vma_name_size && copy_to_user(u64_to_user_ptr(karg.vma_name_addr),
					       name, karg.vma_name_size)) {
		kfree(name_buf);
		return -EFAULT;
	}
	kfree(name_buf);

	if (karg.build_id_size && copy_to_user(u64_to_user_ptr(karg.build_id_addr),
					       build_id_buf, karg.build_id_size))
		return -EFAULT;

	if (copy_to_user(uarg, &karg, min_t(size_t, sizeof(karg), usize)))
		return -EFAULT;

	return 0;

out:
	query_vma_teardown(mm, vma);
	mmput(mm);
	kfree(name_buf);
	return err;
}

static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct seq_file *seq = file->private_data;
	struct proc_maps_private *priv = seq->private;

	switch (cmd) {
	case PROCMAP_QUERY:
		return do_procmap_query(priv, (void __user *)arg);
	default:
		return -ENOIOCTLCMD;
	}
}

const struct file_operations proc_pid_maps_operations = {
	.open		= pid_maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_map_release,
	.unlocked_ioctl	= procfs_procmap_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
};

/*
 * Proportional Set Size (PSS): my share of RSS.
 *
 * PSS of a process is the count of pages it has in memory, where each
 * page is divided by the number of processes sharing it. So if a
 * process has 1000 pages all to itself, and 1000 shared with one other
 * process, its PSS will be 1500.
 *
 * To keep (accumulated) division errors low, we adopt a 64-bit
 * fixed-point pss counter. So (pss >> PSS_SHIFT) would be the real
 * byte count.
 *
 * A shift of 12 before division means (assuming 4K page size):
 * - 1M 3-user-pages add up to 8KB errors;
 * - supports mapcount up to 2^24, or 16M;
 * - supports PSS up to 2^52 bytes, or 4PB.
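 *
 * Worked example (illustrative, assuming PSS_SHIFT == 12 and 4K pages):
 * a page mapped by 3 processes contributes (4096 << 12) / 3 = 5592405
 * to pss, i.e. roughly 1365 bytes once shifted right by PSS_SHIFT.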
772 */ 773 #define PSS_SHIFT 12 774 775 #ifdef CONFIG_PROC_PAGE_MONITOR 776 struct mem_size_stats { 777 unsigned long resident; 778 unsigned long shared_clean; 779 unsigned long shared_dirty; 780 unsigned long private_clean; 781 unsigned long private_dirty; 782 unsigned long referenced; 783 unsigned long anonymous; 784 unsigned long lazyfree; 785 unsigned long anonymous_thp; 786 unsigned long shmem_thp; 787 unsigned long file_thp; 788 unsigned long swap; 789 unsigned long shared_hugetlb; 790 unsigned long private_hugetlb; 791 unsigned long ksm; 792 u64 pss; 793 u64 pss_anon; 794 u64 pss_file; 795 u64 pss_shmem; 796 u64 pss_dirty; 797 u64 pss_locked; 798 u64 swap_pss; 799 }; 800 801 static void smaps_page_accumulate(struct mem_size_stats *mss, 802 struct folio *folio, unsigned long size, unsigned long pss, 803 bool dirty, bool locked, bool private) 804 { 805 mss->pss += pss; 806 807 if (folio_test_anon(folio)) 808 mss->pss_anon += pss; 809 else if (folio_test_swapbacked(folio)) 810 mss->pss_shmem += pss; 811 else 812 mss->pss_file += pss; 813 814 if (locked) 815 mss->pss_locked += pss; 816 817 if (dirty || folio_test_dirty(folio)) { 818 mss->pss_dirty += pss; 819 if (private) 820 mss->private_dirty += size; 821 else 822 mss->shared_dirty += size; 823 } else { 824 if (private) 825 mss->private_clean += size; 826 else 827 mss->shared_clean += size; 828 } 829 } 830 831 static void smaps_account(struct mem_size_stats *mss, struct page *page, 832 bool compound, bool young, bool dirty, bool locked, 833 bool present) 834 { 835 struct folio *folio = page_folio(page); 836 int i, nr = compound ? compound_nr(page) : 1; 837 unsigned long size = nr * PAGE_SIZE; 838 bool exclusive; 839 int mapcount; 840 841 /* 842 * First accumulate quantities that depend only on |size| and the type 843 * of the compound page. 844 */ 845 if (folio_test_anon(folio)) { 846 mss->anonymous += size; 847 if (!folio_test_swapbacked(folio) && !dirty && 848 !folio_test_dirty(folio)) 849 mss->lazyfree += size; 850 } 851 852 if (folio_test_ksm(folio)) 853 mss->ksm += size; 854 855 mss->resident += size; 856 /* Accumulate the size in pages that have been accessed. */ 857 if (young || folio_test_young(folio) || folio_test_referenced(folio)) 858 mss->referenced += size; 859 860 /* 861 * Then accumulate quantities that may depend on sharing, or that may 862 * differ page-by-page. 863 * 864 * refcount == 1 for present entries guarantees that the folio is mapped 865 * exactly once. For large folios this implies that exactly one 866 * PTE/PMD/... maps (a part of) this folio. 867 * 868 * Treat all non-present entries (where relying on the mapcount and 869 * refcount doesn't make sense) as "maybe shared, but not sure how 870 * often". We treat device private entries as being fake-present. 871 * 872 * Note that it would not be safe to read the mapcount especially for 873 * pages referenced by migration entries, even with the PTL held. 874 */ 875 if (folio_ref_count(folio) == 1 || !present) { 876 smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT, 877 dirty, locked, present); 878 return; 879 } 880 881 if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { 882 mapcount = folio_average_page_mapcount(folio); 883 exclusive = !folio_maybe_mapped_shared(folio); 884 } 885 886 /* 887 * We obtain a snapshot of the mapcount. Without holding the folio lock 888 * this snapshot can be slightly wrong as we cannot always read the 889 * mapcount atomically. 
890 */ 891 for (i = 0; i < nr; i++, page++) { 892 unsigned long pss = PAGE_SIZE << PSS_SHIFT; 893 894 if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) { 895 mapcount = folio_precise_page_mapcount(folio, page); 896 exclusive = mapcount < 2; 897 } 898 899 if (mapcount >= 2) 900 pss /= mapcount; 901 smaps_page_accumulate(mss, folio, PAGE_SIZE, pss, 902 dirty, locked, exclusive); 903 } 904 } 905 906 #ifdef CONFIG_SHMEM 907 static int smaps_pte_hole(unsigned long addr, unsigned long end, 908 __always_unused int depth, struct mm_walk *walk) 909 { 910 struct mem_size_stats *mss = walk->private; 911 struct vm_area_struct *vma = walk->vma; 912 913 mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping, 914 linear_page_index(vma, addr), 915 linear_page_index(vma, end)); 916 917 return 0; 918 } 919 #else 920 #define smaps_pte_hole NULL 921 #endif /* CONFIG_SHMEM */ 922 923 static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk) 924 { 925 #ifdef CONFIG_SHMEM 926 if (walk->ops->pte_hole) { 927 /* depth is not used */ 928 smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk); 929 } 930 #endif 931 } 932 933 static void smaps_pte_entry(pte_t *pte, unsigned long addr, 934 struct mm_walk *walk) 935 { 936 struct mem_size_stats *mss = walk->private; 937 struct vm_area_struct *vma = walk->vma; 938 bool locked = !!(vma->vm_flags & VM_LOCKED); 939 struct page *page = NULL; 940 bool present = false, young = false, dirty = false; 941 pte_t ptent = ptep_get(pte); 942 943 if (pte_present(ptent)) { 944 page = vm_normal_page(vma, addr, ptent); 945 young = pte_young(ptent); 946 dirty = pte_dirty(ptent); 947 present = true; 948 } else if (is_swap_pte(ptent)) { 949 swp_entry_t swpent = pte_to_swp_entry(ptent); 950 951 if (!non_swap_entry(swpent)) { 952 int mapcount; 953 954 mss->swap += PAGE_SIZE; 955 mapcount = swp_swapcount(swpent); 956 if (mapcount >= 2) { 957 u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT; 958 959 do_div(pss_delta, mapcount); 960 mss->swap_pss += pss_delta; 961 } else { 962 mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; 963 } 964 } else if (is_pfn_swap_entry(swpent)) { 965 if (is_device_private_entry(swpent)) 966 present = true; 967 page = pfn_swap_entry_to_page(swpent); 968 } 969 } else { 970 smaps_pte_hole_lookup(addr, walk); 971 return; 972 } 973 974 if (!page) 975 return; 976 977 smaps_account(mss, page, false, young, dirty, locked, present); 978 } 979 980 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 981 static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, 982 struct mm_walk *walk) 983 { 984 struct mem_size_stats *mss = walk->private; 985 struct vm_area_struct *vma = walk->vma; 986 bool locked = !!(vma->vm_flags & VM_LOCKED); 987 struct page *page = NULL; 988 bool present = false; 989 struct folio *folio; 990 991 if (pmd_present(*pmd)) { 992 page = vm_normal_page_pmd(vma, addr, *pmd); 993 present = true; 994 } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { 995 swp_entry_t entry = pmd_to_swp_entry(*pmd); 996 997 if (is_pfn_swap_entry(entry)) 998 page = pfn_swap_entry_to_page(entry); 999 } 1000 if (IS_ERR_OR_NULL(page)) 1001 return; 1002 folio = page_folio(page); 1003 if (folio_test_anon(folio)) 1004 mss->anonymous_thp += HPAGE_PMD_SIZE; 1005 else if (folio_test_swapbacked(folio)) 1006 mss->shmem_thp += HPAGE_PMD_SIZE; 1007 else if (folio_is_zone_device(folio)) 1008 /* pass */; 1009 else 1010 mss->file_thp += HPAGE_PMD_SIZE; 1011 1012 smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), 1013 locked, present); 1014 } 1015 #else 1016 static void 
smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
		struct mm_walk *walk)
{
}
#endif

static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			   struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		smaps_pmd_entry(pmd, addr, walk);
		spin_unlock(ptl);
		goto out;
	}

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!pte) {
		walk->action = ACTION_AGAIN;
		return 0;
	}
	for (; addr != end; pte++, addr += PAGE_SIZE)
		smaps_pte_entry(pte, addr, walk);
	pte_unmap_unlock(pte - 1, ptl);
out:
	cond_resched();
	return 0;
}

static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
{
	/*
	 * Don't forget to update Documentation/ on changes.
	 *
	 * The length of the second dimension of mnemonics[] needs to be 3
	 * instead of the previous 2 (i.e. [BITS_PER_LONG][2] becomes
	 * [BITS_PER_LONG][3]) to avoid a spurious
	 * -Werror=unterminated-string-initialization warning with GCC 15.
	 */
	static const char mnemonics[BITS_PER_LONG][3] = {
		/*
		 * In case we meet a flag we don't know about.
		 */
		[0 ... (BITS_PER_LONG-1)] = "??",

		[ilog2(VM_READ)] = "rd",
		[ilog2(VM_WRITE)] = "wr",
		[ilog2(VM_EXEC)] = "ex",
		[ilog2(VM_SHARED)] = "sh",
		[ilog2(VM_MAYREAD)] = "mr",
		[ilog2(VM_MAYWRITE)] = "mw",
		[ilog2(VM_MAYEXEC)] = "me",
		[ilog2(VM_MAYSHARE)] = "ms",
		[ilog2(VM_GROWSDOWN)] = "gd",
		[ilog2(VM_PFNMAP)] = "pf",
		[ilog2(VM_LOCKED)] = "lo",
		[ilog2(VM_IO)] = "io",
		[ilog2(VM_SEQ_READ)] = "sr",
		[ilog2(VM_RAND_READ)] = "rr",
		[ilog2(VM_DONTCOPY)] = "dc",
		[ilog2(VM_DONTEXPAND)] = "de",
		[ilog2(VM_LOCKONFAULT)] = "lf",
		[ilog2(VM_ACCOUNT)] = "ac",
		[ilog2(VM_NORESERVE)] = "nr",
		[ilog2(VM_HUGETLB)] = "ht",
		[ilog2(VM_SYNC)] = "sf",
		[ilog2(VM_ARCH_1)] = "ar",
		[ilog2(VM_WIPEONFORK)] = "wf",
		[ilog2(VM_DONTDUMP)] = "dd",
#ifdef CONFIG_ARM64_BTI
		[ilog2(VM_ARM64_BTI)] = "bt",
#endif
#ifdef CONFIG_MEM_SOFT_DIRTY
		[ilog2(VM_SOFTDIRTY)] = "sd",
#endif
		[ilog2(VM_MIXEDMAP)] = "mm",
		[ilog2(VM_HUGEPAGE)] = "hg",
		[ilog2(VM_NOHUGEPAGE)] = "nh",
		[ilog2(VM_MERGEABLE)] = "mg",
		[ilog2(VM_UFFD_MISSING)] = "um",
		[ilog2(VM_UFFD_WP)] = "uw",
#ifdef CONFIG_ARM64_MTE
		[ilog2(VM_MTE)] = "mt",
		[ilog2(VM_MTE_ALLOWED)] = "",
#endif
#ifdef CONFIG_ARCH_HAS_PKEYS
		/* These come out via ProtectionKey: */
		[ilog2(VM_PKEY_BIT0)] = "",
		[ilog2(VM_PKEY_BIT1)] = "",
		[ilog2(VM_PKEY_BIT2)] = "",
#if VM_PKEY_BIT3
		[ilog2(VM_PKEY_BIT3)] = "",
#endif
#if VM_PKEY_BIT4
		[ilog2(VM_PKEY_BIT4)] = "",
#endif
#endif /* CONFIG_ARCH_HAS_PKEYS */
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
		[ilog2(VM_UFFD_MINOR)] = "ui",
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK
		[ilog2(VM_SHADOW_STACK)] = "ss",
#endif
#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
		[ilog2(VM_DROPPABLE)] = "dp",
#endif
#ifdef CONFIG_64BIT
		[ilog2(VM_SEALED)] = "sl",
#endif
	};
	size_t i;

	seq_puts(m, "VmFlags: ");
	for (i = 0; i < BITS_PER_LONG; i++) {
		if (!mnemonics[i][0])
			continue;
if (vma->vm_flags & (1UL << i)) 1139 seq_printf(m, "%s ", mnemonics[i]); 1140 } 1141 seq_putc(m, '\n'); 1142 } 1143 1144 #ifdef CONFIG_HUGETLB_PAGE 1145 static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, 1146 unsigned long addr, unsigned long end, 1147 struct mm_walk *walk) 1148 { 1149 struct mem_size_stats *mss = walk->private; 1150 struct vm_area_struct *vma = walk->vma; 1151 struct folio *folio = NULL; 1152 bool present = false; 1153 spinlock_t *ptl; 1154 pte_t ptent; 1155 1156 ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); 1157 ptent = huge_ptep_get(walk->mm, addr, pte); 1158 if (pte_present(ptent)) { 1159 folio = page_folio(pte_page(ptent)); 1160 present = true; 1161 } else if (is_swap_pte(ptent)) { 1162 swp_entry_t swpent = pte_to_swp_entry(ptent); 1163 1164 if (is_pfn_swap_entry(swpent)) 1165 folio = pfn_swap_entry_folio(swpent); 1166 } 1167 1168 if (folio) { 1169 /* We treat non-present entries as "maybe shared". */ 1170 if (!present || folio_maybe_mapped_shared(folio) || 1171 hugetlb_pmd_shared(pte)) 1172 mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); 1173 else 1174 mss->private_hugetlb += huge_page_size(hstate_vma(vma)); 1175 } 1176 spin_unlock(ptl); 1177 return 0; 1178 } 1179 #else 1180 #define smaps_hugetlb_range NULL 1181 #endif /* HUGETLB_PAGE */ 1182 1183 static const struct mm_walk_ops smaps_walk_ops = { 1184 .pmd_entry = smaps_pte_range, 1185 .hugetlb_entry = smaps_hugetlb_range, 1186 .walk_lock = PGWALK_RDLOCK, 1187 }; 1188 1189 static const struct mm_walk_ops smaps_shmem_walk_ops = { 1190 .pmd_entry = smaps_pte_range, 1191 .hugetlb_entry = smaps_hugetlb_range, 1192 .pte_hole = smaps_pte_hole, 1193 .walk_lock = PGWALK_RDLOCK, 1194 }; 1195 1196 /* 1197 * Gather mem stats from @vma with the indicated beginning 1198 * address @start, and keep them in @mss. 1199 * 1200 * Use vm_start of @vma as the beginning address if @start is 0. 1201 */ 1202 static void smap_gather_stats(struct vm_area_struct *vma, 1203 struct mem_size_stats *mss, unsigned long start) 1204 { 1205 const struct mm_walk_ops *ops = &smaps_walk_ops; 1206 1207 /* Invalid start */ 1208 if (start >= vma->vm_end) 1209 return; 1210 1211 if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { 1212 /* 1213 * For shared or readonly shmem mappings we know that all 1214 * swapped out pages belong to the shmem object, and we can 1215 * obtain the swap value much more efficiently. For private 1216 * writable mappings, we might have COW pages that are 1217 * not affected by the parent swapped out pages of the shmem 1218 * object, so we have to distinguish them during the page walk. 1219 * Unless we know that the shmem object (or the part mapped by 1220 * our VMA) has no swapped out pages at all. 
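		 * For example, a shared or read-only shmem mapping can take the
		 * cheap shmem_swap_usage() path below, while a private writable
		 * mapping whose object has swapped-out pages must distinguish
		 * COW pages during the page walk.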
1221 */ 1222 unsigned long shmem_swapped = shmem_swap_usage(vma); 1223 1224 if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) || 1225 !(vma->vm_flags & VM_WRITE))) { 1226 mss->swap += shmem_swapped; 1227 } else { 1228 ops = &smaps_shmem_walk_ops; 1229 } 1230 } 1231 1232 /* mmap_lock is held in m_start */ 1233 if (!start) 1234 walk_page_vma(vma, ops, mss); 1235 else 1236 walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss); 1237 } 1238 1239 #define SEQ_PUT_DEC(str, val) \ 1240 seq_put_decimal_ull_width(m, str, (val) >> 10, 8) 1241 1242 /* Show the contents common for smaps and smaps_rollup */ 1243 static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, 1244 bool rollup_mode) 1245 { 1246 SEQ_PUT_DEC("Rss: ", mss->resident); 1247 SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT); 1248 SEQ_PUT_DEC(" kB\nPss_Dirty: ", mss->pss_dirty >> PSS_SHIFT); 1249 if (rollup_mode) { 1250 /* 1251 * These are meaningful only for smaps_rollup, otherwise two of 1252 * them are zero, and the other one is the same as Pss. 1253 */ 1254 SEQ_PUT_DEC(" kB\nPss_Anon: ", 1255 mss->pss_anon >> PSS_SHIFT); 1256 SEQ_PUT_DEC(" kB\nPss_File: ", 1257 mss->pss_file >> PSS_SHIFT); 1258 SEQ_PUT_DEC(" kB\nPss_Shmem: ", 1259 mss->pss_shmem >> PSS_SHIFT); 1260 } 1261 SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean); 1262 SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty); 1263 SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean); 1264 SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty); 1265 SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced); 1266 SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous); 1267 SEQ_PUT_DEC(" kB\nKSM: ", mss->ksm); 1268 SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); 1269 SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); 1270 SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); 1271 SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp); 1272 SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); 1273 seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", 1274 mss->private_hugetlb >> 10, 7); 1275 SEQ_PUT_DEC(" kB\nSwap: ", mss->swap); 1276 SEQ_PUT_DEC(" kB\nSwapPss: ", 1277 mss->swap_pss >> PSS_SHIFT); 1278 SEQ_PUT_DEC(" kB\nLocked: ", 1279 mss->pss_locked >> PSS_SHIFT); 1280 seq_puts(m, " kB\n"); 1281 } 1282 1283 static int show_smap(struct seq_file *m, void *v) 1284 { 1285 struct vm_area_struct *vma = v; 1286 struct mem_size_stats mss = {}; 1287 1288 smap_gather_stats(vma, &mss, 0); 1289 1290 show_map_vma(m, vma); 1291 1292 SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start); 1293 SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma)); 1294 SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma)); 1295 seq_puts(m, " kB\n"); 1296 1297 __show_smap(m, &mss, false); 1298 1299 seq_printf(m, "THPeligible: %8u\n", 1300 !!thp_vma_allowable_orders(vma, vma->vm_flags, 1301 TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL)); 1302 1303 if (arch_pkeys_enabled()) 1304 seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); 1305 show_smap_vma_flags(m, vma); 1306 1307 return 0; 1308 } 1309 1310 static int show_smaps_rollup(struct seq_file *m, void *v) 1311 { 1312 struct proc_maps_private *priv = m->private; 1313 struct mem_size_stats mss = {}; 1314 struct mm_struct *mm = priv->mm; 1315 struct vm_area_struct *vma; 1316 unsigned long vma_start = 0, last_vma_end = 0; 1317 int ret = 0; 1318 VMA_ITERATOR(vmi, mm, 0); 1319 1320 priv->task = get_proc_task(priv->inode); 1321 if (!priv->task) 1322 return -ESRCH; 1323 1324 if (!mm || !mmget_not_zero(mm)) { 1325 ret = 
-ESRCH; 1326 goto out_put_task; 1327 } 1328 1329 ret = mmap_read_lock_killable(mm); 1330 if (ret) 1331 goto out_put_mm; 1332 1333 hold_task_mempolicy(priv); 1334 vma = vma_next(&vmi); 1335 1336 if (unlikely(!vma)) 1337 goto empty_set; 1338 1339 vma_start = vma->vm_start; 1340 do { 1341 smap_gather_stats(vma, &mss, 0); 1342 last_vma_end = vma->vm_end; 1343 1344 /* 1345 * Release mmap_lock temporarily if someone wants to 1346 * access it for write request. 1347 */ 1348 if (mmap_lock_is_contended(mm)) { 1349 vma_iter_invalidate(&vmi); 1350 mmap_read_unlock(mm); 1351 ret = mmap_read_lock_killable(mm); 1352 if (ret) { 1353 release_task_mempolicy(priv); 1354 goto out_put_mm; 1355 } 1356 1357 /* 1358 * After dropping the lock, there are four cases to 1359 * consider. See the following example for explanation. 1360 * 1361 * +------+------+-----------+ 1362 * | VMA1 | VMA2 | VMA3 | 1363 * +------+------+-----------+ 1364 * | | | | 1365 * 4k 8k 16k 400k 1366 * 1367 * Suppose we drop the lock after reading VMA2 due to 1368 * contention, then we get: 1369 * 1370 * last_vma_end = 16k 1371 * 1372 * 1) VMA2 is freed, but VMA3 exists: 1373 * 1374 * vma_next(vmi) will return VMA3. 1375 * In this case, just continue from VMA3. 1376 * 1377 * 2) VMA2 still exists: 1378 * 1379 * vma_next(vmi) will return VMA3. 1380 * In this case, just continue from VMA3. 1381 * 1382 * 3) No more VMAs can be found: 1383 * 1384 * vma_next(vmi) will return NULL. 1385 * No more things to do, just break. 1386 * 1387 * 4) (last_vma_end - 1) is the middle of a vma (VMA'): 1388 * 1389 * vma_next(vmi) will return VMA' whose range 1390 * contains last_vma_end. 1391 * Iterate VMA' from last_vma_end. 1392 */ 1393 vma = vma_next(&vmi); 1394 /* Case 3 above */ 1395 if (!vma) 1396 break; 1397 1398 /* Case 1 and 2 above */ 1399 if (vma->vm_start >= last_vma_end) { 1400 smap_gather_stats(vma, &mss, 0); 1401 last_vma_end = vma->vm_end; 1402 continue; 1403 } 1404 1405 /* Case 4 above */ 1406 if (vma->vm_end > last_vma_end) { 1407 smap_gather_stats(vma, &mss, last_vma_end); 1408 last_vma_end = vma->vm_end; 1409 } 1410 } 1411 } for_each_vma(vmi, vma); 1412 1413 empty_set: 1414 show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0); 1415 seq_pad(m, ' '); 1416 seq_puts(m, "[rollup]\n"); 1417 1418 __show_smap(m, &mss, true); 1419 1420 release_task_mempolicy(priv); 1421 mmap_read_unlock(mm); 1422 1423 out_put_mm: 1424 mmput(mm); 1425 out_put_task: 1426 put_task_struct(priv->task); 1427 priv->task = NULL; 1428 1429 return ret; 1430 } 1431 #undef SEQ_PUT_DEC 1432 1433 static const struct seq_operations proc_pid_smaps_op = { 1434 .start = m_start, 1435 .next = m_next, 1436 .stop = m_stop, 1437 .show = show_smap 1438 }; 1439 1440 static int pid_smaps_open(struct inode *inode, struct file *file) 1441 { 1442 return do_maps_open(inode, file, &proc_pid_smaps_op); 1443 } 1444 1445 static int smaps_rollup_open(struct inode *inode, struct file *file) 1446 { 1447 int ret; 1448 struct proc_maps_private *priv; 1449 1450 priv = kzalloc(sizeof(*priv), GFP_KERNEL_ACCOUNT); 1451 if (!priv) 1452 return -ENOMEM; 1453 1454 ret = single_open(file, show_smaps_rollup, priv); 1455 if (ret) 1456 goto out_free; 1457 1458 priv->inode = inode; 1459 priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); 1460 if (IS_ERR_OR_NULL(priv->mm)) { 1461 ret = priv->mm ? 
PTR_ERR(priv->mm) : -ESRCH; 1462 1463 single_release(inode, file); 1464 goto out_free; 1465 } 1466 1467 return 0; 1468 1469 out_free: 1470 kfree(priv); 1471 return ret; 1472 } 1473 1474 static int smaps_rollup_release(struct inode *inode, struct file *file) 1475 { 1476 struct seq_file *seq = file->private_data; 1477 struct proc_maps_private *priv = seq->private; 1478 1479 if (priv->mm) 1480 mmdrop(priv->mm); 1481 1482 kfree(priv); 1483 return single_release(inode, file); 1484 } 1485 1486 const struct file_operations proc_pid_smaps_operations = { 1487 .open = pid_smaps_open, 1488 .read = seq_read, 1489 .llseek = seq_lseek, 1490 .release = proc_map_release, 1491 }; 1492 1493 const struct file_operations proc_pid_smaps_rollup_operations = { 1494 .open = smaps_rollup_open, 1495 .read = seq_read, 1496 .llseek = seq_lseek, 1497 .release = smaps_rollup_release, 1498 }; 1499 1500 enum clear_refs_types { 1501 CLEAR_REFS_ALL = 1, 1502 CLEAR_REFS_ANON, 1503 CLEAR_REFS_MAPPED, 1504 CLEAR_REFS_SOFT_DIRTY, 1505 CLEAR_REFS_MM_HIWATER_RSS, 1506 CLEAR_REFS_LAST, 1507 }; 1508 1509 struct clear_refs_private { 1510 enum clear_refs_types type; 1511 }; 1512 1513 #ifdef CONFIG_MEM_SOFT_DIRTY 1514 1515 static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte) 1516 { 1517 struct folio *folio; 1518 1519 if (!pte_write(pte)) 1520 return false; 1521 if (!is_cow_mapping(vma->vm_flags)) 1522 return false; 1523 if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))) 1524 return false; 1525 folio = vm_normal_folio(vma, addr, pte); 1526 if (!folio) 1527 return false; 1528 return folio_maybe_dma_pinned(folio); 1529 } 1530 1531 static inline void clear_soft_dirty(struct vm_area_struct *vma, 1532 unsigned long addr, pte_t *pte) 1533 { 1534 /* 1535 * The soft-dirty tracker uses #PF-s to catch writes 1536 * to pages, so write-protect the pte as well. See the 1537 * Documentation/admin-guide/mm/soft-dirty.rst for full description 1538 * of how soft-dirty works. 
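	 * (Typical usage, for illustration: write "4" to
	 *  /proc/<pid>/clear_refs to clear the bits, let the task run, then
	 *  read bit 55 of each /proc/<pid>/pagemap entry to see which pages
	 *  were written since.)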
1539 */ 1540 pte_t ptent = ptep_get(pte); 1541 1542 if (pte_present(ptent)) { 1543 pte_t old_pte; 1544 1545 if (pte_is_pinned(vma, addr, ptent)) 1546 return; 1547 old_pte = ptep_modify_prot_start(vma, addr, pte); 1548 ptent = pte_wrprotect(old_pte); 1549 ptent = pte_clear_soft_dirty(ptent); 1550 ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); 1551 } else if (is_swap_pte(ptent)) { 1552 ptent = pte_swp_clear_soft_dirty(ptent); 1553 set_pte_at(vma->vm_mm, addr, pte, ptent); 1554 } 1555 } 1556 #else 1557 static inline void clear_soft_dirty(struct vm_area_struct *vma, 1558 unsigned long addr, pte_t *pte) 1559 { 1560 } 1561 #endif 1562 1563 #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) 1564 static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, 1565 unsigned long addr, pmd_t *pmdp) 1566 { 1567 pmd_t old, pmd = *pmdp; 1568 1569 if (pmd_present(pmd)) { 1570 /* See comment in change_huge_pmd() */ 1571 old = pmdp_invalidate(vma, addr, pmdp); 1572 if (pmd_dirty(old)) 1573 pmd = pmd_mkdirty(pmd); 1574 if (pmd_young(old)) 1575 pmd = pmd_mkyoung(pmd); 1576 1577 pmd = pmd_wrprotect(pmd); 1578 pmd = pmd_clear_soft_dirty(pmd); 1579 1580 set_pmd_at(vma->vm_mm, addr, pmdp, pmd); 1581 } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { 1582 pmd = pmd_swp_clear_soft_dirty(pmd); 1583 set_pmd_at(vma->vm_mm, addr, pmdp, pmd); 1584 } 1585 } 1586 #else 1587 static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, 1588 unsigned long addr, pmd_t *pmdp) 1589 { 1590 } 1591 #endif 1592 1593 static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, 1594 unsigned long end, struct mm_walk *walk) 1595 { 1596 struct clear_refs_private *cp = walk->private; 1597 struct vm_area_struct *vma = walk->vma; 1598 pte_t *pte, ptent; 1599 spinlock_t *ptl; 1600 struct folio *folio; 1601 1602 ptl = pmd_trans_huge_lock(pmd, vma); 1603 if (ptl) { 1604 if (cp->type == CLEAR_REFS_SOFT_DIRTY) { 1605 clear_soft_dirty_pmd(vma, addr, pmd); 1606 goto out; 1607 } 1608 1609 if (!pmd_present(*pmd)) 1610 goto out; 1611 1612 folio = pmd_folio(*pmd); 1613 1614 /* Clear accessed and referenced bits. */ 1615 pmdp_test_and_clear_young(vma, addr, pmd); 1616 folio_test_clear_young(folio); 1617 folio_clear_referenced(folio); 1618 out: 1619 spin_unlock(ptl); 1620 return 0; 1621 } 1622 1623 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 1624 if (!pte) { 1625 walk->action = ACTION_AGAIN; 1626 return 0; 1627 } 1628 for (; addr != end; pte++, addr += PAGE_SIZE) { 1629 ptent = ptep_get(pte); 1630 1631 if (cp->type == CLEAR_REFS_SOFT_DIRTY) { 1632 clear_soft_dirty(vma, addr, pte); 1633 continue; 1634 } 1635 1636 if (!pte_present(ptent)) 1637 continue; 1638 1639 folio = vm_normal_folio(vma, addr, ptent); 1640 if (!folio) 1641 continue; 1642 1643 /* Clear accessed and referenced bits. */ 1644 ptep_test_and_clear_young(vma, addr, pte); 1645 folio_test_clear_young(folio); 1646 folio_clear_referenced(folio); 1647 } 1648 pte_unmap_unlock(pte - 1, ptl); 1649 cond_resched(); 1650 return 0; 1651 } 1652 1653 static int clear_refs_test_walk(unsigned long start, unsigned long end, 1654 struct mm_walk *walk) 1655 { 1656 struct clear_refs_private *cp = walk->private; 1657 struct vm_area_struct *vma = walk->vma; 1658 1659 if (vma->vm_flags & VM_PFNMAP) 1660 return 1; 1661 1662 /* 1663 * Writing 1 to /proc/pid/clear_refs affects all pages. 1664 * Writing 2 to /proc/pid/clear_refs only affects anonymous pages. 1665 * Writing 3 to /proc/pid/clear_refs only affects file mapped pages. 
1666 * Writing 4 to /proc/pid/clear_refs affects all pages. 1667 */ 1668 if (cp->type == CLEAR_REFS_ANON && vma->vm_file) 1669 return 1; 1670 if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file) 1671 return 1; 1672 return 0; 1673 } 1674 1675 static const struct mm_walk_ops clear_refs_walk_ops = { 1676 .pmd_entry = clear_refs_pte_range, 1677 .test_walk = clear_refs_test_walk, 1678 .walk_lock = PGWALK_WRLOCK, 1679 }; 1680 1681 static ssize_t clear_refs_write(struct file *file, const char __user *buf, 1682 size_t count, loff_t *ppos) 1683 { 1684 struct task_struct *task; 1685 char buffer[PROC_NUMBUF] = {}; 1686 struct mm_struct *mm; 1687 struct vm_area_struct *vma; 1688 enum clear_refs_types type; 1689 int itype; 1690 int rv; 1691 1692 if (count > sizeof(buffer) - 1) 1693 count = sizeof(buffer) - 1; 1694 if (copy_from_user(buffer, buf, count)) 1695 return -EFAULT; 1696 rv = kstrtoint(strstrip(buffer), 10, &itype); 1697 if (rv < 0) 1698 return rv; 1699 type = (enum clear_refs_types)itype; 1700 if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) 1701 return -EINVAL; 1702 1703 task = get_proc_task(file_inode(file)); 1704 if (!task) 1705 return -ESRCH; 1706 mm = get_task_mm(task); 1707 if (mm) { 1708 VMA_ITERATOR(vmi, mm, 0); 1709 struct mmu_notifier_range range; 1710 struct clear_refs_private cp = { 1711 .type = type, 1712 }; 1713 1714 if (mmap_write_lock_killable(mm)) { 1715 count = -EINTR; 1716 goto out_mm; 1717 } 1718 if (type == CLEAR_REFS_MM_HIWATER_RSS) { 1719 /* 1720 * Writing 5 to /proc/pid/clear_refs resets the peak 1721 * resident set size to this mm's current rss value. 1722 */ 1723 reset_mm_hiwater_rss(mm); 1724 goto out_unlock; 1725 } 1726 1727 if (type == CLEAR_REFS_SOFT_DIRTY) { 1728 for_each_vma(vmi, vma) { 1729 if (!(vma->vm_flags & VM_SOFTDIRTY)) 1730 continue; 1731 vm_flags_clear(vma, VM_SOFTDIRTY); 1732 vma_set_page_prot(vma); 1733 } 1734 1735 inc_tlb_flush_pending(mm); 1736 mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, 1737 0, mm, 0, -1UL); 1738 mmu_notifier_invalidate_range_start(&range); 1739 } 1740 walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); 1741 if (type == CLEAR_REFS_SOFT_DIRTY) { 1742 mmu_notifier_invalidate_range_end(&range); 1743 flush_tlb_mm(mm); 1744 dec_tlb_flush_pending(mm); 1745 } 1746 out_unlock: 1747 mmap_write_unlock(mm); 1748 out_mm: 1749 mmput(mm); 1750 } 1751 put_task_struct(task); 1752 1753 return count; 1754 } 1755 1756 const struct file_operations proc_clear_refs_operations = { 1757 .write = clear_refs_write, 1758 .llseek = noop_llseek, 1759 }; 1760 1761 typedef struct { 1762 u64 pme; 1763 } pagemap_entry_t; 1764 1765 struct pagemapread { 1766 int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ 1767 pagemap_entry_t *buffer; 1768 bool show_pfn; 1769 }; 1770 1771 #define PAGEMAP_WALK_SIZE (PMD_SIZE) 1772 #define PAGEMAP_WALK_MASK (PMD_MASK) 1773 1774 #define PM_ENTRY_BYTES sizeof(pagemap_entry_t) 1775 #define PM_PFRAME_BITS 55 1776 #define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) 1777 #define PM_SOFT_DIRTY BIT_ULL(55) 1778 #define PM_MMAP_EXCLUSIVE BIT_ULL(56) 1779 #define PM_UFFD_WP BIT_ULL(57) 1780 #define PM_GUARD_REGION BIT_ULL(58) 1781 #define PM_FILE BIT_ULL(61) 1782 #define PM_SWAP BIT_ULL(62) 1783 #define PM_PRESENT BIT_ULL(63) 1784 1785 #define PM_END_OF_BUFFER 1 1786 1787 static inline pagemap_entry_t make_pme(u64 frame, u64 flags) 1788 { 1789 return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags }; 1790 } 1791 1792 static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm) 1793 { 1794 
pm->buffer[pm->pos++] = *pme; 1795 if (pm->pos >= pm->len) 1796 return PM_END_OF_BUFFER; 1797 return 0; 1798 } 1799 1800 static bool __folio_page_mapped_exclusively(struct folio *folio, struct page *page) 1801 { 1802 if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) 1803 return folio_precise_page_mapcount(folio, page) == 1; 1804 return !folio_maybe_mapped_shared(folio); 1805 } 1806 1807 static int pagemap_pte_hole(unsigned long start, unsigned long end, 1808 __always_unused int depth, struct mm_walk *walk) 1809 { 1810 struct pagemapread *pm = walk->private; 1811 unsigned long addr = start; 1812 int err = 0; 1813 1814 while (addr < end) { 1815 struct vm_area_struct *vma = find_vma(walk->mm, addr); 1816 pagemap_entry_t pme = make_pme(0, 0); 1817 /* End of address space hole, which we mark as non-present. */ 1818 unsigned long hole_end; 1819 1820 if (vma) 1821 hole_end = min(end, vma->vm_start); 1822 else 1823 hole_end = end; 1824 1825 for (; addr < hole_end; addr += PAGE_SIZE) { 1826 err = add_to_pagemap(&pme, pm); 1827 if (err) 1828 goto out; 1829 } 1830 1831 if (!vma) 1832 break; 1833 1834 /* Addresses in the VMA. */ 1835 if (vma->vm_flags & VM_SOFTDIRTY) 1836 pme = make_pme(0, PM_SOFT_DIRTY); 1837 for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { 1838 err = add_to_pagemap(&pme, pm); 1839 if (err) 1840 goto out; 1841 } 1842 } 1843 out: 1844 return err; 1845 } 1846 1847 static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, 1848 struct vm_area_struct *vma, unsigned long addr, pte_t pte) 1849 { 1850 u64 frame = 0, flags = 0; 1851 struct page *page = NULL; 1852 struct folio *folio; 1853 1854 if (pte_present(pte)) { 1855 if (pm->show_pfn) 1856 frame = pte_pfn(pte); 1857 flags |= PM_PRESENT; 1858 page = vm_normal_page(vma, addr, pte); 1859 if (pte_soft_dirty(pte)) 1860 flags |= PM_SOFT_DIRTY; 1861 if (pte_uffd_wp(pte)) 1862 flags |= PM_UFFD_WP; 1863 } else if (is_swap_pte(pte)) { 1864 swp_entry_t entry; 1865 if (pte_swp_soft_dirty(pte)) 1866 flags |= PM_SOFT_DIRTY; 1867 if (pte_swp_uffd_wp(pte)) 1868 flags |= PM_UFFD_WP; 1869 entry = pte_to_swp_entry(pte); 1870 if (pm->show_pfn) { 1871 pgoff_t offset; 1872 /* 1873 * For PFN swap offsets, keeping the offset field 1874 * to be PFN only to be compatible with old smaps. 
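			 * (For illustration, assuming MAX_SWAPFILES_SHIFT == 5:
			 *  swap type 2 at offset 0x100 would be reported as
			 *  frame = 2 | (0x100 << 5) = 0x2002.)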
1875 */ 1876 if (is_pfn_swap_entry(entry)) 1877 offset = swp_offset_pfn(entry); 1878 else 1879 offset = swp_offset(entry); 1880 frame = swp_type(entry) | 1881 (offset << MAX_SWAPFILES_SHIFT); 1882 } 1883 flags |= PM_SWAP; 1884 if (is_pfn_swap_entry(entry)) 1885 page = pfn_swap_entry_to_page(entry); 1886 if (pte_marker_entry_uffd_wp(entry)) 1887 flags |= PM_UFFD_WP; 1888 if (is_guard_swp_entry(entry)) 1889 flags |= PM_GUARD_REGION; 1890 } 1891 1892 if (page) { 1893 folio = page_folio(page); 1894 if (!folio_test_anon(folio)) 1895 flags |= PM_FILE; 1896 if ((flags & PM_PRESENT) && 1897 __folio_page_mapped_exclusively(folio, page)) 1898 flags |= PM_MMAP_EXCLUSIVE; 1899 } 1900 if (vma->vm_flags & VM_SOFTDIRTY) 1901 flags |= PM_SOFT_DIRTY; 1902 1903 return make_pme(frame, flags); 1904 } 1905 1906 static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, 1907 struct mm_walk *walk) 1908 { 1909 struct vm_area_struct *vma = walk->vma; 1910 struct pagemapread *pm = walk->private; 1911 spinlock_t *ptl; 1912 pte_t *pte, *orig_pte; 1913 int err = 0; 1914 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1915 1916 ptl = pmd_trans_huge_lock(pmdp, vma); 1917 if (ptl) { 1918 unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT; 1919 u64 flags = 0, frame = 0; 1920 pmd_t pmd = *pmdp; 1921 struct page *page = NULL; 1922 struct folio *folio = NULL; 1923 1924 if (vma->vm_flags & VM_SOFTDIRTY) 1925 flags |= PM_SOFT_DIRTY; 1926 1927 if (pmd_present(pmd)) { 1928 page = pmd_page(pmd); 1929 1930 flags |= PM_PRESENT; 1931 if (pmd_soft_dirty(pmd)) 1932 flags |= PM_SOFT_DIRTY; 1933 if (pmd_uffd_wp(pmd)) 1934 flags |= PM_UFFD_WP; 1935 if (pm->show_pfn) 1936 frame = pmd_pfn(pmd) + idx; 1937 } 1938 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 1939 else if (is_swap_pmd(pmd)) { 1940 swp_entry_t entry = pmd_to_swp_entry(pmd); 1941 unsigned long offset; 1942 1943 if (pm->show_pfn) { 1944 if (is_pfn_swap_entry(entry)) 1945 offset = swp_offset_pfn(entry) + idx; 1946 else 1947 offset = swp_offset(entry) + idx; 1948 frame = swp_type(entry) | 1949 (offset << MAX_SWAPFILES_SHIFT); 1950 } 1951 flags |= PM_SWAP; 1952 if (pmd_swp_soft_dirty(pmd)) 1953 flags |= PM_SOFT_DIRTY; 1954 if (pmd_swp_uffd_wp(pmd)) 1955 flags |= PM_UFFD_WP; 1956 VM_BUG_ON(!is_pmd_migration_entry(pmd)); 1957 page = pfn_swap_entry_to_page(entry); 1958 } 1959 #endif 1960 1961 if (page) { 1962 folio = page_folio(page); 1963 if (!folio_test_anon(folio)) 1964 flags |= PM_FILE; 1965 } 1966 1967 for (; addr != end; addr += PAGE_SIZE, idx++) { 1968 u64 cur_flags = flags; 1969 pagemap_entry_t pme; 1970 1971 if (folio && (flags & PM_PRESENT) && 1972 __folio_page_mapped_exclusively(folio, page)) 1973 cur_flags |= PM_MMAP_EXCLUSIVE; 1974 1975 pme = make_pme(frame, cur_flags); 1976 err = add_to_pagemap(&pme, pm); 1977 if (err) 1978 break; 1979 if (pm->show_pfn) { 1980 if (flags & PM_PRESENT) 1981 frame++; 1982 else if (flags & PM_SWAP) 1983 frame += (1 << MAX_SWAPFILES_SHIFT); 1984 } 1985 } 1986 spin_unlock(ptl); 1987 return err; 1988 } 1989 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 1990 1991 /* 1992 * We can assume that @vma always points to a valid one and @end never 1993 * goes beyond vma->vm_end. 
1994 */ 1995 orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl); 1996 if (!pte) { 1997 walk->action = ACTION_AGAIN; 1998 return err; 1999 } 2000 for (; addr < end; pte++, addr += PAGE_SIZE) { 2001 pagemap_entry_t pme; 2002 2003 pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); 2004 err = add_to_pagemap(&pme, pm); 2005 if (err) 2006 break; 2007 } 2008 pte_unmap_unlock(orig_pte, ptl); 2009 2010 cond_resched(); 2011 2012 return err; 2013 } 2014 2015 #ifdef CONFIG_HUGETLB_PAGE 2016 /* This function walks within one hugetlb entry in the single call */ 2017 static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, 2018 unsigned long addr, unsigned long end, 2019 struct mm_walk *walk) 2020 { 2021 struct pagemapread *pm = walk->private; 2022 struct vm_area_struct *vma = walk->vma; 2023 u64 flags = 0, frame = 0; 2024 spinlock_t *ptl; 2025 int err = 0; 2026 pte_t pte; 2027 2028 if (vma->vm_flags & VM_SOFTDIRTY) 2029 flags |= PM_SOFT_DIRTY; 2030 2031 ptl = huge_pte_lock(hstate_vma(vma), walk->mm, ptep); 2032 pte = huge_ptep_get(walk->mm, addr, ptep); 2033 if (pte_present(pte)) { 2034 struct folio *folio = page_folio(pte_page(pte)); 2035 2036 if (!folio_test_anon(folio)) 2037 flags |= PM_FILE; 2038 2039 if (!folio_maybe_mapped_shared(folio) && 2040 !hugetlb_pmd_shared(ptep)) 2041 flags |= PM_MMAP_EXCLUSIVE; 2042 2043 if (huge_pte_uffd_wp(pte)) 2044 flags |= PM_UFFD_WP; 2045 2046 flags |= PM_PRESENT; 2047 if (pm->show_pfn) 2048 frame = pte_pfn(pte) + 2049 ((addr & ~hmask) >> PAGE_SHIFT); 2050 } else if (pte_swp_uffd_wp_any(pte)) { 2051 flags |= PM_UFFD_WP; 2052 } 2053 2054 for (; addr != end; addr += PAGE_SIZE) { 2055 pagemap_entry_t pme = make_pme(frame, flags); 2056 2057 err = add_to_pagemap(&pme, pm); 2058 if (err) 2059 break; 2060 if (pm->show_pfn && (flags & PM_PRESENT)) 2061 frame++; 2062 } 2063 2064 spin_unlock(ptl); 2065 cond_resched(); 2066 2067 return err; 2068 } 2069 #else 2070 #define pagemap_hugetlb_range NULL 2071 #endif /* HUGETLB_PAGE */ 2072 2073 static const struct mm_walk_ops pagemap_ops = { 2074 .pmd_entry = pagemap_pmd_range, 2075 .pte_hole = pagemap_pte_hole, 2076 .hugetlb_entry = pagemap_hugetlb_range, 2077 .walk_lock = PGWALK_RDLOCK, 2078 }; 2079 2080 /* 2081 * /proc/pid/pagemap - an array mapping virtual pages to pfns 2082 * 2083 * For each page in the address space, this file contains one 64-bit entry 2084 * consisting of the following: 2085 * 2086 * Bits 0-54 page frame number (PFN) if present 2087 * Bits 0-4 swap type if swapped 2088 * Bits 5-54 swap offset if swapped 2089 * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst) 2090 * Bit 56 page exclusively mapped 2091 * Bit 57 pte is uffd-wp write-protected 2092 * Bit 58 pte is a guard region 2093 * Bits 59-60 zero 2094 * Bit 61 page is file-page or shared-anon 2095 * Bit 62 page swapped 2096 * Bit 63 page present 2097 * 2098 * If the page is not present but in swap, then the PFN contains an 2099 * encoding of the swap file number and the page's offset into the 2100 * swap. Unmapped pages return a null PFN. This allows determining 2101 * precisely which pages are mapped (or in swap) and comparing mapped 2102 * pages between processes. 2103 * 2104 * Efficient users of this interface will use /proc/pid/maps to 2105 * determine which areas of memory are actually mapped and llseek to 2106 * skip over unmapped regions. 
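 * For example (illustrative value): an entry of 0xa100000000001234 has
 * bit 63 set (present), bit 61 set (file-page or shared-anon), bit 56
 * set (exclusively mapped), and PFN 0x1234 in bits 0-54.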
2107 */ 2108 static ssize_t pagemap_read(struct file *file, char __user *buf, 2109 size_t count, loff_t *ppos) 2110 { 2111 struct mm_struct *mm = file->private_data; 2112 struct pagemapread pm; 2113 unsigned long src; 2114 unsigned long svpfn; 2115 unsigned long start_vaddr; 2116 unsigned long end_vaddr; 2117 int ret = 0, copied = 0; 2118 2119 if (!mm || !mmget_not_zero(mm)) 2120 goto out; 2121 2122 ret = -EINVAL; 2123 /* file position must be aligned */ 2124 if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) 2125 goto out_mm; 2126 2127 ret = 0; 2128 if (!count) 2129 goto out_mm; 2130 2131 /* do not disclose physical addresses: attack vector */ 2132 pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); 2133 2134 pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); 2135 pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL); 2136 ret = -ENOMEM; 2137 if (!pm.buffer) 2138 goto out_mm; 2139 2140 src = *ppos; 2141 svpfn = src / PM_ENTRY_BYTES; 2142 end_vaddr = mm->task_size; 2143 2144 /* watch out for wraparound */ 2145 start_vaddr = end_vaddr; 2146 if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) { 2147 unsigned long end; 2148 2149 ret = mmap_read_lock_killable(mm); 2150 if (ret) 2151 goto out_free; 2152 start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT); 2153 mmap_read_unlock(mm); 2154 2155 end = start_vaddr + ((count / PM_ENTRY_BYTES) << PAGE_SHIFT); 2156 if (end >= start_vaddr && end < mm->task_size) 2157 end_vaddr = end; 2158 } 2159 2160 /* Ensure the address is inside the task */ 2161 if (start_vaddr > mm->task_size) 2162 start_vaddr = end_vaddr; 2163 2164 ret = 0; 2165 while (count && (start_vaddr < end_vaddr)) { 2166 int len; 2167 unsigned long end; 2168 2169 pm.pos = 0; 2170 end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; 2171 /* overflow ? */ 2172 if (end < start_vaddr || end > end_vaddr) 2173 end = end_vaddr; 2174 ret = mmap_read_lock_killable(mm); 2175 if (ret) 2176 goto out_free; 2177 ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm); 2178 mmap_read_unlock(mm); 2179 start_vaddr = end; 2180 2181 len = min(count, PM_ENTRY_BYTES * pm.pos); 2182 if (copy_to_user(buf, pm.buffer, len)) { 2183 ret = -EFAULT; 2184 goto out_free; 2185 } 2186 copied += len; 2187 buf += len; 2188 count -= len; 2189 } 2190 *ppos += copied; 2191 if (!ret || ret == PM_END_OF_BUFFER) 2192 ret = copied; 2193 2194 out_free: 2195 kfree(pm.buffer); 2196 out_mm: 2197 mmput(mm); 2198 out: 2199 return ret; 2200 } 2201 2202 static int pagemap_open(struct inode *inode, struct file *file) 2203 { 2204 struct mm_struct *mm; 2205 2206 mm = proc_mem_open(inode, PTRACE_MODE_READ); 2207 if (IS_ERR_OR_NULL(mm)) 2208 return mm ? 
PTR_ERR(mm) : -ESRCH; 2209 file->private_data = mm; 2210 return 0; 2211 } 2212 2213 static int pagemap_release(struct inode *inode, struct file *file) 2214 { 2215 struct mm_struct *mm = file->private_data; 2216 2217 if (mm) 2218 mmdrop(mm); 2219 return 0; 2220 } 2221 2222 #define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ 2223 PAGE_IS_FILE | PAGE_IS_PRESENT | \ 2224 PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ 2225 PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY | \ 2226 PAGE_IS_GUARD) 2227 #define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) 2228 2229 struct pagemap_scan_private { 2230 struct pm_scan_arg arg; 2231 unsigned long masks_of_interest, cur_vma_category; 2232 struct page_region *vec_buf; 2233 unsigned long vec_buf_len, vec_buf_index, found_pages; 2234 struct page_region __user *vec_out; 2235 }; 2236 2237 static unsigned long pagemap_page_category(struct pagemap_scan_private *p, 2238 struct vm_area_struct *vma, 2239 unsigned long addr, pte_t pte) 2240 { 2241 unsigned long categories = 0; 2242 2243 if (pte_present(pte)) { 2244 struct page *page; 2245 2246 categories |= PAGE_IS_PRESENT; 2247 if (!pte_uffd_wp(pte)) 2248 categories |= PAGE_IS_WRITTEN; 2249 2250 if (p->masks_of_interest & PAGE_IS_FILE) { 2251 page = vm_normal_page(vma, addr, pte); 2252 if (page && !PageAnon(page)) 2253 categories |= PAGE_IS_FILE; 2254 } 2255 2256 if (is_zero_pfn(pte_pfn(pte))) 2257 categories |= PAGE_IS_PFNZERO; 2258 if (pte_soft_dirty(pte)) 2259 categories |= PAGE_IS_SOFT_DIRTY; 2260 } else if (is_swap_pte(pte)) { 2261 swp_entry_t swp; 2262 2263 categories |= PAGE_IS_SWAPPED; 2264 if (!pte_swp_uffd_wp_any(pte)) 2265 categories |= PAGE_IS_WRITTEN; 2266 2267 swp = pte_to_swp_entry(pte); 2268 if (is_guard_swp_entry(swp)) 2269 categories |= PAGE_IS_GUARD; 2270 else if ((p->masks_of_interest & PAGE_IS_FILE) && 2271 is_pfn_swap_entry(swp) && 2272 !folio_test_anon(pfn_swap_entry_folio(swp))) 2273 categories |= PAGE_IS_FILE; 2274 2275 if (pte_swp_soft_dirty(pte)) 2276 categories |= PAGE_IS_SOFT_DIRTY; 2277 } 2278 2279 return categories; 2280 } 2281 2282 static void make_uffd_wp_pte(struct vm_area_struct *vma, 2283 unsigned long addr, pte_t *pte, pte_t ptent) 2284 { 2285 if (pte_present(ptent)) { 2286 pte_t old_pte; 2287 2288 old_pte = ptep_modify_prot_start(vma, addr, pte); 2289 ptent = pte_mkuffd_wp(old_pte); 2290 ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); 2291 } else if (is_swap_pte(ptent)) { 2292 ptent = pte_swp_mkuffd_wp(ptent); 2293 set_pte_at(vma->vm_mm, addr, pte, ptent); 2294 } else { 2295 set_pte_at(vma->vm_mm, addr, pte, 2296 make_pte_marker(PTE_MARKER_UFFD_WP)); 2297 } 2298 } 2299 2300 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2301 static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, 2302 struct vm_area_struct *vma, 2303 unsigned long addr, pmd_t pmd) 2304 { 2305 unsigned long categories = PAGE_IS_HUGE; 2306 2307 if (pmd_present(pmd)) { 2308 struct page *page; 2309 2310 categories |= PAGE_IS_PRESENT; 2311 if (!pmd_uffd_wp(pmd)) 2312 categories |= PAGE_IS_WRITTEN; 2313 2314 if (p->masks_of_interest & PAGE_IS_FILE) { 2315 page = vm_normal_page_pmd(vma, addr, pmd); 2316 if (page && !PageAnon(page)) 2317 categories |= PAGE_IS_FILE; 2318 } 2319 2320 if (is_huge_zero_pmd(pmd)) 2321 categories |= PAGE_IS_PFNZERO; 2322 if (pmd_soft_dirty(pmd)) 2323 categories |= PAGE_IS_SOFT_DIRTY; 2324 } else if (is_swap_pmd(pmd)) { 2325 swp_entry_t swp; 2326 2327 categories |= PAGE_IS_SWAPPED; 2328 if (!pmd_swp_uffd_wp(pmd)) 2329 categories |= PAGE_IS_WRITTEN; 2330 if 
(pmd_swp_soft_dirty(pmd)) 2331 categories |= PAGE_IS_SOFT_DIRTY; 2332 2333 if (p->masks_of_interest & PAGE_IS_FILE) { 2334 swp = pmd_to_swp_entry(pmd); 2335 if (is_pfn_swap_entry(swp) && 2336 !folio_test_anon(pfn_swap_entry_folio(swp))) 2337 categories |= PAGE_IS_FILE; 2338 } 2339 } 2340 2341 return categories; 2342 } 2343 2344 static void make_uffd_wp_pmd(struct vm_area_struct *vma, 2345 unsigned long addr, pmd_t *pmdp) 2346 { 2347 pmd_t old, pmd = *pmdp; 2348 2349 if (pmd_present(pmd)) { 2350 old = pmdp_invalidate_ad(vma, addr, pmdp); 2351 pmd = pmd_mkuffd_wp(old); 2352 set_pmd_at(vma->vm_mm, addr, pmdp, pmd); 2353 } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { 2354 pmd = pmd_swp_mkuffd_wp(pmd); 2355 set_pmd_at(vma->vm_mm, addr, pmdp, pmd); 2356 } 2357 } 2358 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 2359 2360 #ifdef CONFIG_HUGETLB_PAGE 2361 static unsigned long pagemap_hugetlb_category(pte_t pte) 2362 { 2363 unsigned long categories = PAGE_IS_HUGE; 2364 2365 /* 2366 * According to pagemap_hugetlb_range(), file-backed HugeTLB 2367 * page cannot be swapped. So PAGE_IS_FILE is not checked for 2368 * swapped pages. 2369 */ 2370 if (pte_present(pte)) { 2371 categories |= PAGE_IS_PRESENT; 2372 if (!huge_pte_uffd_wp(pte)) 2373 categories |= PAGE_IS_WRITTEN; 2374 if (!PageAnon(pte_page(pte))) 2375 categories |= PAGE_IS_FILE; 2376 if (is_zero_pfn(pte_pfn(pte))) 2377 categories |= PAGE_IS_PFNZERO; 2378 if (pte_soft_dirty(pte)) 2379 categories |= PAGE_IS_SOFT_DIRTY; 2380 } else if (is_swap_pte(pte)) { 2381 categories |= PAGE_IS_SWAPPED; 2382 if (!pte_swp_uffd_wp_any(pte)) 2383 categories |= PAGE_IS_WRITTEN; 2384 if (pte_swp_soft_dirty(pte)) 2385 categories |= PAGE_IS_SOFT_DIRTY; 2386 } 2387 2388 return categories; 2389 } 2390 2391 static void make_uffd_wp_huge_pte(struct vm_area_struct *vma, 2392 unsigned long addr, pte_t *ptep, 2393 pte_t ptent) 2394 { 2395 unsigned long psize; 2396 2397 if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent)) 2398 return; 2399 2400 psize = huge_page_size(hstate_vma(vma)); 2401 2402 if (is_hugetlb_entry_migration(ptent)) 2403 set_huge_pte_at(vma->vm_mm, addr, ptep, 2404 pte_swp_mkuffd_wp(ptent), psize); 2405 else if (!huge_pte_none(ptent)) 2406 huge_ptep_modify_prot_commit(vma, addr, ptep, ptent, 2407 huge_pte_mkuffd_wp(ptent)); 2408 else 2409 set_huge_pte_at(vma->vm_mm, addr, ptep, 2410 make_pte_marker(PTE_MARKER_UFFD_WP), psize); 2411 } 2412 #endif /* CONFIG_HUGETLB_PAGE */ 2413 2414 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) 2415 static void pagemap_scan_backout_range(struct pagemap_scan_private *p, 2416 unsigned long addr, unsigned long end) 2417 { 2418 struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; 2419 2420 if (cur_buf->start != addr) 2421 cur_buf->end = addr; 2422 else 2423 cur_buf->start = cur_buf->end = 0; 2424 2425 p->found_pages -= (end - addr) / PAGE_SIZE; 2426 } 2427 #endif 2428 2429 static bool pagemap_scan_is_interesting_page(unsigned long categories, 2430 const struct pagemap_scan_private *p) 2431 { 2432 categories ^= p->arg.category_inverted; 2433 if ((categories & p->arg.category_mask) != p->arg.category_mask) 2434 return false; 2435 if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask)) 2436 return false; 2437 2438 return true; 2439 } 2440 2441 static bool pagemap_scan_is_interesting_vma(unsigned long categories, 2442 const struct pagemap_scan_private *p) 2443 { 2444 unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED; 2445 2446 categories ^= 
p->arg.category_inverted; 2447 if ((categories & required) != required) 2448 return false; 2449 2450 return true; 2451 } 2452 2453 static int pagemap_scan_test_walk(unsigned long start, unsigned long end, 2454 struct mm_walk *walk) 2455 { 2456 struct pagemap_scan_private *p = walk->private; 2457 struct vm_area_struct *vma = walk->vma; 2458 unsigned long vma_category = 0; 2459 bool wp_allowed = userfaultfd_wp_async(vma) && 2460 userfaultfd_wp_use_markers(vma); 2461 2462 if (!wp_allowed) { 2463 /* User requested explicit failure over wp-async capability */ 2464 if (p->arg.flags & PM_SCAN_CHECK_WPASYNC) 2465 return -EPERM; 2466 /* 2467 * User requires wr-protect, and allows silently skipping 2468 * unsupported vmas. 2469 */ 2470 if (p->arg.flags & PM_SCAN_WP_MATCHING) 2471 return 1; 2472 /* 2473 * Then the request doesn't involve wr-protects at all, 2474 * fall through to the rest checks, and allow vma walk. 2475 */ 2476 } 2477 2478 if (vma->vm_flags & VM_PFNMAP) 2479 return 1; 2480 2481 if (wp_allowed) 2482 vma_category |= PAGE_IS_WPALLOWED; 2483 2484 if (vma->vm_flags & VM_SOFTDIRTY) 2485 vma_category |= PAGE_IS_SOFT_DIRTY; 2486 2487 if (!pagemap_scan_is_interesting_vma(vma_category, p)) 2488 return 1; 2489 2490 p->cur_vma_category = vma_category; 2491 2492 return 0; 2493 } 2494 2495 static bool pagemap_scan_push_range(unsigned long categories, 2496 struct pagemap_scan_private *p, 2497 unsigned long addr, unsigned long end) 2498 { 2499 struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; 2500 2501 /* 2502 * When there is no output buffer provided at all, the sentinel values 2503 * won't match here. There is no other way for `cur_buf->end` to be 2504 * non-zero other than it being non-empty. 2505 */ 2506 if (addr == cur_buf->end && categories == cur_buf->categories) { 2507 cur_buf->end = end; 2508 return true; 2509 } 2510 2511 if (cur_buf->end) { 2512 if (p->vec_buf_index >= p->vec_buf_len - 1) 2513 return false; 2514 2515 cur_buf = &p->vec_buf[++p->vec_buf_index]; 2516 } 2517 2518 cur_buf->start = addr; 2519 cur_buf->end = end; 2520 cur_buf->categories = categories; 2521 2522 return true; 2523 } 2524 2525 static int pagemap_scan_output(unsigned long categories, 2526 struct pagemap_scan_private *p, 2527 unsigned long addr, unsigned long *end) 2528 { 2529 unsigned long n_pages, total_pages; 2530 int ret = 0; 2531 2532 if (!p->vec_buf) 2533 return 0; 2534 2535 categories &= p->arg.return_mask; 2536 2537 n_pages = (*end - addr) / PAGE_SIZE; 2538 if (check_add_overflow(p->found_pages, n_pages, &total_pages) || 2539 total_pages > p->arg.max_pages) { 2540 size_t n_too_much = total_pages - p->arg.max_pages; 2541 *end -= n_too_much * PAGE_SIZE; 2542 n_pages -= n_too_much; 2543 ret = -ENOSPC; 2544 } 2545 2546 if (!pagemap_scan_push_range(categories, p, addr, *end)) { 2547 *end = addr; 2548 n_pages = 0; 2549 ret = -ENOSPC; 2550 } 2551 2552 p->found_pages += n_pages; 2553 if (ret) 2554 p->arg.walk_end = *end; 2555 2556 return ret; 2557 } 2558 2559 static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start, 2560 unsigned long end, struct mm_walk *walk) 2561 { 2562 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2563 struct pagemap_scan_private *p = walk->private; 2564 struct vm_area_struct *vma = walk->vma; 2565 unsigned long categories; 2566 spinlock_t *ptl; 2567 int ret = 0; 2568 2569 ptl = pmd_trans_huge_lock(pmd, vma); 2570 if (!ptl) 2571 return -ENOENT; 2572 2573 categories = p->cur_vma_category | 2574 pagemap_thp_category(p, vma, start, *pmd); 2575 2576 if 
(!pagemap_scan_is_interesting_page(categories, p)) 2577 goto out_unlock; 2578 2579 ret = pagemap_scan_output(categories, p, start, &end); 2580 if (start == end) 2581 goto out_unlock; 2582 2583 if (~p->arg.flags & PM_SCAN_WP_MATCHING) 2584 goto out_unlock; 2585 if (~categories & PAGE_IS_WRITTEN) 2586 goto out_unlock; 2587 2588 /* 2589 * Break huge page into small pages if the WP operation 2590 * needs to be performed on a portion of the huge page. 2591 */ 2592 if (end != start + HPAGE_SIZE) { 2593 spin_unlock(ptl); 2594 split_huge_pmd(vma, pmd, start); 2595 pagemap_scan_backout_range(p, start, end); 2596 /* Report as if there was no THP */ 2597 return -ENOENT; 2598 } 2599 2600 make_uffd_wp_pmd(vma, start, pmd); 2601 flush_tlb_range(vma, start, end); 2602 out_unlock: 2603 spin_unlock(ptl); 2604 return ret; 2605 #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ 2606 return -ENOENT; 2607 #endif 2608 } 2609 2610 static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, 2611 unsigned long end, struct mm_walk *walk) 2612 { 2613 struct pagemap_scan_private *p = walk->private; 2614 struct vm_area_struct *vma = walk->vma; 2615 unsigned long addr, flush_end = 0; 2616 pte_t *pte, *start_pte; 2617 spinlock_t *ptl; 2618 int ret; 2619 2620 ret = pagemap_scan_thp_entry(pmd, start, end, walk); 2621 if (ret != -ENOENT) 2622 return ret; 2623 2624 ret = 0; 2625 start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); 2626 if (!pte) { 2627 walk->action = ACTION_AGAIN; 2628 return 0; 2629 } 2630 2631 arch_enter_lazy_mmu_mode(); 2632 2633 if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) { 2634 /* Fast path for performing exclusive WP */ 2635 for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { 2636 pte_t ptent = ptep_get(pte); 2637 2638 if ((pte_present(ptent) && pte_uffd_wp(ptent)) || 2639 pte_swp_uffd_wp_any(ptent)) 2640 continue; 2641 make_uffd_wp_pte(vma, addr, pte, ptent); 2642 if (!flush_end) 2643 start = addr; 2644 flush_end = addr + PAGE_SIZE; 2645 } 2646 goto flush_and_return; 2647 } 2648 2649 if (!p->arg.category_anyof_mask && !p->arg.category_inverted && 2650 p->arg.category_mask == PAGE_IS_WRITTEN && 2651 p->arg.return_mask == PAGE_IS_WRITTEN) { 2652 for (addr = start; addr < end; pte++, addr += PAGE_SIZE) { 2653 unsigned long next = addr + PAGE_SIZE; 2654 pte_t ptent = ptep_get(pte); 2655 2656 if ((pte_present(ptent) && pte_uffd_wp(ptent)) || 2657 pte_swp_uffd_wp_any(ptent)) 2658 continue; 2659 ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN, 2660 p, addr, &next); 2661 if (next == addr) 2662 break; 2663 if (~p->arg.flags & PM_SCAN_WP_MATCHING) 2664 continue; 2665 make_uffd_wp_pte(vma, addr, pte, ptent); 2666 if (!flush_end) 2667 start = addr; 2668 flush_end = next; 2669 } 2670 goto flush_and_return; 2671 } 2672 2673 for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { 2674 pte_t ptent = ptep_get(pte); 2675 unsigned long categories = p->cur_vma_category | 2676 pagemap_page_category(p, vma, addr, ptent); 2677 unsigned long next = addr + PAGE_SIZE; 2678 2679 if (!pagemap_scan_is_interesting_page(categories, p)) 2680 continue; 2681 2682 ret = pagemap_scan_output(categories, p, addr, &next); 2683 if (next == addr) 2684 break; 2685 2686 if (~p->arg.flags & PM_SCAN_WP_MATCHING) 2687 continue; 2688 if (~categories & PAGE_IS_WRITTEN) 2689 continue; 2690 2691 make_uffd_wp_pte(vma, addr, pte, ptent); 2692 if (!flush_end) 2693 start = addr; 2694 flush_end = next; 2695 } 2696 2697 flush_and_return: 2698 if (flush_end) 2699 flush_tlb_range(vma, start, addr); 2700 
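	/*
	 * The flush above is batched: start is advanced to the first PTE
	 * that was actually write-protected and flush_end is only set once
	 * at least one PTE was modified, so a single flush_tlb_range()
	 * covers the whole modified span instead of flushing per PTE.
	 */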
2701 arch_leave_lazy_mmu_mode(); 2702 pte_unmap_unlock(start_pte, ptl); 2703 2704 cond_resched(); 2705 return ret; 2706 } 2707 2708 #ifdef CONFIG_HUGETLB_PAGE 2709 static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask, 2710 unsigned long start, unsigned long end, 2711 struct mm_walk *walk) 2712 { 2713 struct pagemap_scan_private *p = walk->private; 2714 struct vm_area_struct *vma = walk->vma; 2715 unsigned long categories; 2716 spinlock_t *ptl; 2717 int ret = 0; 2718 pte_t pte; 2719 2720 if (~p->arg.flags & PM_SCAN_WP_MATCHING) { 2721 /* Go the short route when not write-protecting pages. */ 2722 2723 pte = huge_ptep_get(walk->mm, start, ptep); 2724 categories = p->cur_vma_category | pagemap_hugetlb_category(pte); 2725 2726 if (!pagemap_scan_is_interesting_page(categories, p)) 2727 return 0; 2728 2729 return pagemap_scan_output(categories, p, start, &end); 2730 } 2731 2732 i_mmap_lock_write(vma->vm_file->f_mapping); 2733 ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep); 2734 2735 pte = huge_ptep_get(walk->mm, start, ptep); 2736 categories = p->cur_vma_category | pagemap_hugetlb_category(pte); 2737 2738 if (!pagemap_scan_is_interesting_page(categories, p)) 2739 goto out_unlock; 2740 2741 ret = pagemap_scan_output(categories, p, start, &end); 2742 if (start == end) 2743 goto out_unlock; 2744 2745 if (~categories & PAGE_IS_WRITTEN) 2746 goto out_unlock; 2747 2748 if (end != start + HPAGE_SIZE) { 2749 /* Partial HugeTLB page WP isn't possible. */ 2750 pagemap_scan_backout_range(p, start, end); 2751 p->arg.walk_end = start; 2752 ret = 0; 2753 goto out_unlock; 2754 } 2755 2756 make_uffd_wp_huge_pte(vma, start, ptep, pte); 2757 flush_hugetlb_tlb_range(vma, start, end); 2758 2759 out_unlock: 2760 spin_unlock(ptl); 2761 i_mmap_unlock_write(vma->vm_file->f_mapping); 2762 2763 return ret; 2764 } 2765 #else 2766 #define pagemap_scan_hugetlb_entry NULL 2767 #endif 2768 2769 static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, 2770 int depth, struct mm_walk *walk) 2771 { 2772 struct pagemap_scan_private *p = walk->private; 2773 struct vm_area_struct *vma = walk->vma; 2774 int ret, err; 2775 2776 if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p)) 2777 return 0; 2778 2779 ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end); 2780 if (addr == end) 2781 return ret; 2782 2783 if (~p->arg.flags & PM_SCAN_WP_MATCHING) 2784 return ret; 2785 2786 err = uffd_wp_range(vma, addr, end - addr, true); 2787 if (err < 0) 2788 ret = err; 2789 2790 return ret; 2791 } 2792 2793 static const struct mm_walk_ops pagemap_scan_ops = { 2794 .test_walk = pagemap_scan_test_walk, 2795 .pmd_entry = pagemap_scan_pmd_entry, 2796 .pte_hole = pagemap_scan_pte_hole, 2797 .hugetlb_entry = pagemap_scan_hugetlb_entry, 2798 }; 2799 2800 static int pagemap_scan_get_args(struct pm_scan_arg *arg, 2801 unsigned long uarg) 2802 { 2803 if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg))) 2804 return -EFAULT; 2805 2806 if (arg->size != sizeof(struct pm_scan_arg)) 2807 return -EINVAL; 2808 2809 /* Validate requested features */ 2810 if (arg->flags & ~PM_SCAN_FLAGS) 2811 return -EINVAL; 2812 if ((arg->category_inverted | arg->category_mask | 2813 arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES) 2814 return -EINVAL; 2815 2816 arg->start = untagged_addr((unsigned long)arg->start); 2817 arg->end = untagged_addr((unsigned long)arg->end); 2818 arg->vec = untagged_addr((unsigned long)arg->vec); 2819 2820 /* Validate memory pointers */ 2821 if 
(!IS_ALIGNED(arg->start, PAGE_SIZE)) 2822 return -EINVAL; 2823 if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start)) 2824 return -EFAULT; 2825 if (!arg->vec && arg->vec_len) 2826 return -EINVAL; 2827 if (UINT_MAX == SIZE_MAX && arg->vec_len > SIZE_MAX) 2828 return -EINVAL; 2829 if (arg->vec && !access_ok((void __user *)(long)arg->vec, 2830 size_mul(arg->vec_len, sizeof(struct page_region)))) 2831 return -EFAULT; 2832 2833 /* Fixup default values */ 2834 arg->end = ALIGN(arg->end, PAGE_SIZE); 2835 arg->walk_end = 0; 2836 if (!arg->max_pages) 2837 arg->max_pages = ULONG_MAX; 2838 2839 return 0; 2840 } 2841 2842 static int pagemap_scan_writeback_args(struct pm_scan_arg *arg, 2843 unsigned long uargl) 2844 { 2845 struct pm_scan_arg __user *uarg = (void __user *)uargl; 2846 2847 if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end))) 2848 return -EFAULT; 2849 2850 return 0; 2851 } 2852 2853 static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p) 2854 { 2855 if (!p->arg.vec_len) 2856 return 0; 2857 2858 p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT, 2859 p->arg.vec_len); 2860 p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf), 2861 GFP_KERNEL); 2862 if (!p->vec_buf) 2863 return -ENOMEM; 2864 2865 p->vec_buf->start = p->vec_buf->end = 0; 2866 p->vec_out = (struct page_region __user *)(long)p->arg.vec; 2867 2868 return 0; 2869 } 2870 2871 static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p) 2872 { 2873 const struct page_region *buf = p->vec_buf; 2874 long n = p->vec_buf_index; 2875 2876 if (!p->vec_buf) 2877 return 0; 2878 2879 if (buf[n].end != buf[n].start) 2880 n++; 2881 2882 if (!n) 2883 return 0; 2884 2885 if (copy_to_user(p->vec_out, buf, n * sizeof(*buf))) 2886 return -EFAULT; 2887 2888 p->arg.vec_len -= n; 2889 p->vec_out += n; 2890 2891 p->vec_buf_index = 0; 2892 p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len); 2893 p->vec_buf->start = p->vec_buf->end = 0; 2894 2895 return n; 2896 } 2897 2898 static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) 2899 { 2900 struct pagemap_scan_private p = {0}; 2901 unsigned long walk_start; 2902 size_t n_ranges_out = 0; 2903 int ret; 2904 2905 ret = pagemap_scan_get_args(&p.arg, uarg); 2906 if (ret) 2907 return ret; 2908 2909 p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask | 2910 p.arg.return_mask; 2911 ret = pagemap_scan_init_bounce_buffer(&p); 2912 if (ret) 2913 return ret; 2914 2915 for (walk_start = p.arg.start; walk_start < p.arg.end; 2916 walk_start = p.arg.walk_end) { 2917 struct mmu_notifier_range range; 2918 long n_out; 2919 2920 if (fatal_signal_pending(current)) { 2921 ret = -EINTR; 2922 break; 2923 } 2924 2925 ret = mmap_read_lock_killable(mm); 2926 if (ret) 2927 break; 2928 2929 /* Protection change for the range is going to happen. 
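		 * The MMU notifier round trip below is only needed when
		 * PM_SCAN_WP_MATCHING is set; a read-only scan changes no
		 * protections and skips it.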
*/ 2930 if (p.arg.flags & PM_SCAN_WP_MATCHING) { 2931 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, 2932 mm, walk_start, p.arg.end); 2933 mmu_notifier_invalidate_range_start(&range); 2934 } 2935 2936 ret = walk_page_range(mm, walk_start, p.arg.end, 2937 &pagemap_scan_ops, &p); 2938 2939 if (p.arg.flags & PM_SCAN_WP_MATCHING) 2940 mmu_notifier_invalidate_range_end(&range); 2941 2942 mmap_read_unlock(mm); 2943 2944 n_out = pagemap_scan_flush_buffer(&p); 2945 if (n_out < 0) 2946 ret = n_out; 2947 else 2948 n_ranges_out += n_out; 2949 2950 if (ret != -ENOSPC) 2951 break; 2952 2953 if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages) 2954 break; 2955 } 2956 2957 /* ENOSPC signifies early stop (buffer full) from the walk. */ 2958 if (!ret || ret == -ENOSPC) 2959 ret = n_ranges_out; 2960 2961 /* The walk_end isn't set when ret is zero */ 2962 if (!p.arg.walk_end) 2963 p.arg.walk_end = p.arg.end; 2964 if (pagemap_scan_writeback_args(&p.arg, uarg)) 2965 ret = -EFAULT; 2966 2967 kfree(p.vec_buf); 2968 return ret; 2969 } 2970 2971 static long do_pagemap_cmd(struct file *file, unsigned int cmd, 2972 unsigned long arg) 2973 { 2974 struct mm_struct *mm = file->private_data; 2975 2976 switch (cmd) { 2977 case PAGEMAP_SCAN: 2978 return do_pagemap_scan(mm, arg); 2979 2980 default: 2981 return -EINVAL; 2982 } 2983 } 2984 2985 const struct file_operations proc_pagemap_operations = { 2986 .llseek = mem_lseek, /* borrow this */ 2987 .read = pagemap_read, 2988 .open = pagemap_open, 2989 .release = pagemap_release, 2990 .unlocked_ioctl = do_pagemap_cmd, 2991 .compat_ioctl = do_pagemap_cmd, 2992 }; 2993 #endif /* CONFIG_PROC_PAGE_MONITOR */ 2994 2995 #ifdef CONFIG_NUMA 2996 2997 struct numa_maps { 2998 unsigned long pages; 2999 unsigned long anon; 3000 unsigned long active; 3001 unsigned long writeback; 3002 unsigned long mapcount_max; 3003 unsigned long dirty; 3004 unsigned long swapcache; 3005 unsigned long node[MAX_NUMNODES]; 3006 }; 3007 3008 struct numa_maps_private { 3009 struct proc_maps_private proc_maps; 3010 struct numa_maps md; 3011 }; 3012 3013 static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, 3014 unsigned long nr_pages) 3015 { 3016 struct folio *folio = page_folio(page); 3017 int count; 3018 3019 if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) 3020 count = folio_precise_page_mapcount(folio, page); 3021 else 3022 count = folio_average_page_mapcount(folio); 3023 3024 md->pages += nr_pages; 3025 if (pte_dirty || folio_test_dirty(folio)) 3026 md->dirty += nr_pages; 3027 3028 if (folio_test_swapcache(folio)) 3029 md->swapcache += nr_pages; 3030 3031 if (folio_test_active(folio) || folio_test_unevictable(folio)) 3032 md->active += nr_pages; 3033 3034 if (folio_test_writeback(folio)) 3035 md->writeback += nr_pages; 3036 3037 if (folio_test_anon(folio)) 3038 md->anon += nr_pages; 3039 3040 if (count > md->mapcount_max) 3041 md->mapcount_max = count; 3042 3043 md->node[folio_nid(folio)] += nr_pages; 3044 } 3045 3046 static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, 3047 unsigned long addr) 3048 { 3049 struct page *page; 3050 int nid; 3051 3052 if (!pte_present(pte)) 3053 return NULL; 3054 3055 page = vm_normal_page(vma, addr, pte); 3056 if (!page || is_zone_device_page(page)) 3057 return NULL; 3058 3059 if (PageReserved(page)) 3060 return NULL; 3061 3062 nid = page_to_nid(page); 3063 if (!node_isset(nid, node_states[N_MEMORY])) 3064 return NULL; 3065 3066 return page; 3067 } 3068 3069 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3070 static 
struct page *can_gather_numa_stats_pmd(pmd_t pmd, 3071 struct vm_area_struct *vma, 3072 unsigned long addr) 3073 { 3074 struct page *page; 3075 int nid; 3076 3077 if (!pmd_present(pmd)) 3078 return NULL; 3079 3080 page = vm_normal_page_pmd(vma, addr, pmd); 3081 if (!page) 3082 return NULL; 3083 3084 if (PageReserved(page)) 3085 return NULL; 3086 3087 nid = page_to_nid(page); 3088 if (!node_isset(nid, node_states[N_MEMORY])) 3089 return NULL; 3090 3091 return page; 3092 } 3093 #endif 3094 3095 static int gather_pte_stats(pmd_t *pmd, unsigned long addr, 3096 unsigned long end, struct mm_walk *walk) 3097 { 3098 struct numa_maps *md = walk->private; 3099 struct vm_area_struct *vma = walk->vma; 3100 spinlock_t *ptl; 3101 pte_t *orig_pte; 3102 pte_t *pte; 3103 3104 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3105 ptl = pmd_trans_huge_lock(pmd, vma); 3106 if (ptl) { 3107 struct page *page; 3108 3109 page = can_gather_numa_stats_pmd(*pmd, vma, addr); 3110 if (page) 3111 gather_stats(page, md, pmd_dirty(*pmd), 3112 HPAGE_PMD_SIZE/PAGE_SIZE); 3113 spin_unlock(ptl); 3114 return 0; 3115 } 3116 #endif 3117 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 3118 if (!pte) { 3119 walk->action = ACTION_AGAIN; 3120 return 0; 3121 } 3122 do { 3123 pte_t ptent = ptep_get(pte); 3124 struct page *page = can_gather_numa_stats(ptent, vma, addr); 3125 if (!page) 3126 continue; 3127 gather_stats(page, md, pte_dirty(ptent), 1); 3128 3129 } while (pte++, addr += PAGE_SIZE, addr != end); 3130 pte_unmap_unlock(orig_pte, ptl); 3131 cond_resched(); 3132 return 0; 3133 } 3134 #ifdef CONFIG_HUGETLB_PAGE 3135 static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, 3136 unsigned long addr, unsigned long end, struct mm_walk *walk) 3137 { 3138 pte_t huge_pte; 3139 struct numa_maps *md; 3140 struct page *page; 3141 spinlock_t *ptl; 3142 3143 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); 3144 huge_pte = huge_ptep_get(walk->mm, addr, pte); 3145 if (!pte_present(huge_pte)) 3146 goto out; 3147 3148 page = pte_page(huge_pte); 3149 3150 md = walk->private; 3151 gather_stats(page, md, pte_dirty(huge_pte), 1); 3152 out: 3153 spin_unlock(ptl); 3154 return 0; 3155 } 3156 3157 #else 3158 static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, 3159 unsigned long addr, unsigned long end, struct mm_walk *walk) 3160 { 3161 return 0; 3162 } 3163 #endif 3164 3165 static const struct mm_walk_ops show_numa_ops = { 3166 .hugetlb_entry = gather_hugetlb_stats, 3167 .pmd_entry = gather_pte_stats, 3168 .walk_lock = PGWALK_RDLOCK, 3169 }; 3170 3171 /* 3172 * Display pages allocated per node and memory policy via /proc. 3173 */ 3174 static int show_numa_map(struct seq_file *m, void *v) 3175 { 3176 struct numa_maps_private *numa_priv = m->private; 3177 struct proc_maps_private *proc_priv = &numa_priv->proc_maps; 3178 struct vm_area_struct *vma = v; 3179 struct numa_maps *md = &numa_priv->md; 3180 struct file *file = vma->vm_file; 3181 struct mm_struct *mm = vma->vm_mm; 3182 char buffer[64]; 3183 struct mempolicy *pol; 3184 pgoff_t ilx; 3185 int nid; 3186 3187 if (!mm) 3188 return 0; 3189 3190 /* Ensure we start with an empty set of numa_maps statistics. 
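	 * The counters gathered below are then emitted as one
	 * /proc/<pid>/numa_maps line, roughly of the form (values are
	 * illustrative only):
	 *
	 *   7f1c2a000000 default file=/usr/lib/libc.so.6 mapped=42 mapmax=3 N0=40 N1=2 kernelpagesize_kB=4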
*/ 3191 memset(md, 0, sizeof(*md)); 3192 3193 pol = __get_vma_policy(vma, vma->vm_start, &ilx); 3194 if (pol) { 3195 mpol_to_str(buffer, sizeof(buffer), pol); 3196 mpol_cond_put(pol); 3197 } else { 3198 mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy); 3199 } 3200 3201 seq_printf(m, "%08lx %s", vma->vm_start, buffer); 3202 3203 if (file) { 3204 seq_puts(m, " file="); 3205 seq_path(m, file_user_path(file), "\n\t= "); 3206 } else if (vma_is_initial_heap(vma)) { 3207 seq_puts(m, " heap"); 3208 } else if (vma_is_initial_stack(vma)) { 3209 seq_puts(m, " stack"); 3210 } 3211 3212 if (is_vm_hugetlb_page(vma)) 3213 seq_puts(m, " huge"); 3214 3215 /* mmap_lock is held by m_start */ 3216 walk_page_vma(vma, &show_numa_ops, md); 3217 3218 if (!md->pages) 3219 goto out; 3220 3221 if (md->anon) 3222 seq_printf(m, " anon=%lu", md->anon); 3223 3224 if (md->dirty) 3225 seq_printf(m, " dirty=%lu", md->dirty); 3226 3227 if (md->pages != md->anon && md->pages != md->dirty) 3228 seq_printf(m, " mapped=%lu", md->pages); 3229 3230 if (md->mapcount_max > 1) 3231 seq_printf(m, " mapmax=%lu", md->mapcount_max); 3232 3233 if (md->swapcache) 3234 seq_printf(m, " swapcache=%lu", md->swapcache); 3235 3236 if (md->active < md->pages && !is_vm_hugetlb_page(vma)) 3237 seq_printf(m, " active=%lu", md->active); 3238 3239 if (md->writeback) 3240 seq_printf(m, " writeback=%lu", md->writeback); 3241 3242 for_each_node_state(nid, N_MEMORY) 3243 if (md->node[nid]) 3244 seq_printf(m, " N%d=%lu", nid, md->node[nid]); 3245 3246 seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10); 3247 out: 3248 seq_putc(m, '\n'); 3249 return 0; 3250 } 3251 3252 static const struct seq_operations proc_pid_numa_maps_op = { 3253 .start = m_start, 3254 .next = m_next, 3255 .stop = m_stop, 3256 .show = show_numa_map, 3257 }; 3258 3259 static int pid_numa_maps_open(struct inode *inode, struct file *file) 3260 { 3261 return proc_maps_open(inode, file, &proc_pid_numa_maps_op, 3262 sizeof(struct numa_maps_private)); 3263 } 3264 3265 const struct file_operations proc_pid_numa_maps_operations = { 3266 .open = pid_numa_maps_open, 3267 .read = seq_read, 3268 .llseek = seq_lseek, 3269 .release = proc_map_release, 3270 }; 3271 3272 #endif /* CONFIG_NUMA */ 3273
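
/*
 * Illustrative userspace sketch (an example under stated assumptions, not
 * part of this file): invoking the PAGEMAP_SCAN ioctl handled by
 * do_pagemap_cmd() above to report ranges of pages that are not uffd-wp
 * write-protected (PAGE_IS_WRITTEN), typically in combination with
 * userfaultfd async write-protection. Structure, flag and ioctl names are
 * the uapi ones from include/uapi/linux/fs.h; error handling is omitted
 * and the start address must be page aligned (see pagemap_scan_get_args()).
 *
 *	struct page_region regions[32];
 *	struct pm_scan_arg arg = {
 *		.size		= sizeof(arg),
 *		.start		= (__u64)(uintptr_t)start,
 *		.end		= (__u64)(uintptr_t)end,
 *		.vec		= (__u64)(uintptr_t)regions,
 *		.vec_len	= 32,
 *		.category_mask	= PAGE_IS_WRITTEN,
 *		.return_mask	= PAGE_IS_WRITTEN,
 *	};
 *	int fd = open("/proc/self/pagemap", O_RDONLY);
 *	long n = ioctl(fd, PAGEMAP_SCAN, &arg);
 *
 * On success n is the number of entries written to regions[], and
 * arg.walk_end reports how far the scan progressed before the output
 * buffer filled up or the end of the range was reached.
 */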