/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead reverse mapping scheme.
 * Please try to keep this thing as modular as possible.
 *
 * Provides methods for unmapping each kind of mapped page:
 * the anon methods track anonymous pages, and
 * the file methods track pages belonging to an inode.
 *
 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
 * Contributions by Hugh Dickins 2003, 2004
 */

/*
 * Lock ordering in mm:
 *
 * inode->i_rwsem	(while writing or truncating, not reading or faulting)
 *   mm->mmap_lock
 *     mapping->invalidate_lock (in filemap_fault)
 *       folio_lock
 *         hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
 *           vma_start_write
 *             mapping->i_mmap_rwsem
 *               anon_vma->rwsem
 *                 mm->page_table_lock or pte_lock
 *                   swap_lock (in swap_duplicate, swap_info_get)
 *                     mmlist_lock (in mmput, drain_mmlist and others)
 *                     mapping->private_lock (in block_dirty_folio)
 *                         i_pages lock (widely used)
 *                           lruvec->lru_lock (in folio_lruvec_lock_irq)
 *                     inode->i_lock (in set_page_dirty's __mark_inode_dirty)
 *                     bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
 *                       sb_lock (within inode_lock in fs/fs-writeback.c)
 *                       i_pages lock (widely used, in set_page_dirty,
 *                                 in arch-dependent flush_dcache_mmap_lock,
 *                                 within bdi.wb->list_lock in __sync_single_inode)
 *
 * anon_vma->rwsem,mapping->i_mmap_rwsem   (memory_failure, collect_procs_anon)
 *   ->tasklist_lock
 *     pte map lock
 *
 * hugetlbfs PageHuge() take locks in this order:
 *   hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
 *     vma_lock (hugetlb specific lock for pmd_sharing)
 *       mapping->i_mmap_rwsem (also used for hugetlb
pmd sharing) 52 * folio_lock 53 */ 54 55 #include <linux/mm.h> 56 #include <linux/sched/mm.h> 57 #include <linux/sched/task.h> 58 #include <linux/pagemap.h> 59 #include <linux/swap.h> 60 #include <linux/swapops.h> 61 #include <linux/slab.h> 62 #include <linux/init.h> 63 #include <linux/ksm.h> 64 #include <linux/rmap.h> 65 #include <linux/rcupdate.h> 66 #include <linux/export.h> 67 #include <linux/memcontrol.h> 68 #include <linux/mmu_notifier.h> 69 #include <linux/migrate.h> 70 #include <linux/hugetlb.h> 71 #include <linux/huge_mm.h> 72 #include <linux/backing-dev.h> 73 #include <linux/page_idle.h> 74 #include <linux/memremap.h> 75 #include <linux/userfaultfd_k.h> 76 #include <linux/mm_inline.h> 77 #include <linux/oom.h> 78 79 #include <asm/tlbflush.h> 80 81 #define CREATE_TRACE_POINTS 82 #include <trace/events/migrate.h> 83 84 #include "internal.h" 85 86 static struct kmem_cache *anon_vma_cachep; 87 static struct kmem_cache *anon_vma_chain_cachep; 88 89 static inline struct anon_vma *anon_vma_alloc(void) 90 { 91 struct anon_vma *anon_vma; 92 93 anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); 94 if (anon_vma) { 95 atomic_set(&anon_vma->refcount, 1); 96 anon_vma->num_children = 0; 97 anon_vma->num_active_vmas = 0; 98 anon_vma->parent = anon_vma; 99 /* 100 * Initialise the anon_vma root to point to itself. If called 101 * from fork, the root will be reset to the parents anon_vma. 102 */ 103 anon_vma->root = anon_vma; 104 } 105 106 return anon_vma; 107 } 108 109 static inline void anon_vma_free(struct anon_vma *anon_vma) 110 { 111 VM_BUG_ON(atomic_read(&anon_vma->refcount)); 112 113 /* 114 * Synchronize against folio_lock_anon_vma_read() such that 115 * we can safely hold the lock without the anon_vma getting 116 * freed. 117 * 118 * Relies on the full mb implied by the atomic_dec_and_test() from 119 * put_anon_vma() against the acquire barrier implied by 120 * down_read_trylock() from folio_lock_anon_vma_read(). 
This orders: 121 * 122 * folio_lock_anon_vma_read() VS put_anon_vma() 123 * down_read_trylock() atomic_dec_and_test() 124 * LOCK MB 125 * atomic_read() rwsem_is_locked() 126 * 127 * LOCK should suffice since the actual taking of the lock must 128 * happen _before_ what follows. 129 */ 130 might_sleep(); 131 if (rwsem_is_locked(&anon_vma->root->rwsem)) { 132 anon_vma_lock_write(anon_vma); 133 anon_vma_unlock_write(anon_vma); 134 } 135 136 kmem_cache_free(anon_vma_cachep, anon_vma); 137 } 138 139 static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp) 140 { 141 return kmem_cache_alloc(anon_vma_chain_cachep, gfp); 142 } 143 144 static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) 145 { 146 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); 147 } 148 149 static void anon_vma_chain_link(struct vm_area_struct *vma, 150 struct anon_vma_chain *avc, 151 struct anon_vma *anon_vma) 152 { 153 avc->vma = vma; 154 avc->anon_vma = anon_vma; 155 list_add(&avc->same_vma, &vma->anon_vma_chain); 156 anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); 157 } 158 159 /** 160 * __anon_vma_prepare - attach an anon_vma to a memory region 161 * @vma: the memory region in question 162 * 163 * This makes sure the memory mapping described by 'vma' has 164 * an 'anon_vma' attached to it, so that we can associate the 165 * anonymous pages mapped into it with that anon_vma. 166 * 167 * The common case will be that we already have one, which 168 * is handled inline by anon_vma_prepare(). But if 169 * not we either need to find an adjacent mapping that we 170 * can re-use the anon_vma from (very common when the only 171 * reason for splitting a vma has been mprotect()), or we 172 * allocate a new one. 
173 * 174 * Anon-vma allocations are very subtle, because we may have 175 * optimistically looked up an anon_vma in folio_lock_anon_vma_read() 176 * and that may actually touch the rwsem even in the newly 177 * allocated vma (it depends on RCU to make sure that the 178 * anon_vma isn't actually destroyed). 179 * 180 * As a result, we need to do proper anon_vma locking even 181 * for the new allocation. At the same time, we do not want 182 * to do any locking for the common case of already having 183 * an anon_vma. 184 */ 185 int __anon_vma_prepare(struct vm_area_struct *vma) 186 { 187 struct mm_struct *mm = vma->vm_mm; 188 struct anon_vma *anon_vma, *allocated; 189 struct anon_vma_chain *avc; 190 191 mmap_assert_locked(mm); 192 might_sleep(); 193 194 avc = anon_vma_chain_alloc(GFP_KERNEL); 195 if (!avc) 196 goto out_enomem; 197 198 anon_vma = find_mergeable_anon_vma(vma); 199 allocated = NULL; 200 if (!anon_vma) { 201 anon_vma = anon_vma_alloc(); 202 if (unlikely(!anon_vma)) 203 goto out_enomem_free_avc; 204 anon_vma->num_children++; /* self-parent link for new root */ 205 allocated = anon_vma; 206 } 207 208 anon_vma_lock_write(anon_vma); 209 /* page_table_lock to protect against threads */ 210 spin_lock(&mm->page_table_lock); 211 if (likely(!vma->anon_vma)) { 212 vma->anon_vma = anon_vma; 213 anon_vma_chain_link(vma, avc, anon_vma); 214 anon_vma->num_active_vmas++; 215 allocated = NULL; 216 avc = NULL; 217 } 218 spin_unlock(&mm->page_table_lock); 219 anon_vma_unlock_write(anon_vma); 220 221 if (unlikely(allocated)) 222 put_anon_vma(allocated); 223 if (unlikely(avc)) 224 anon_vma_chain_free(avc); 225 226 return 0; 227 228 out_enomem_free_avc: 229 anon_vma_chain_free(avc); 230 out_enomem: 231 return -ENOMEM; 232 } 233 234 /* 235 * This is a useful helper function for locking the anon_vma root as 236 * we traverse the vma->anon_vma_chain, looping over anon_vma's that 237 * have the same vma. 
238 * 239 * Such anon_vma's should have the same root, so you'd expect to see 240 * just a single mutex_lock for the whole traversal. 241 */ 242 static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma) 243 { 244 struct anon_vma *new_root = anon_vma->root; 245 if (new_root != root) { 246 if (WARN_ON_ONCE(root)) 247 up_write(&root->rwsem); 248 root = new_root; 249 down_write(&root->rwsem); 250 } 251 return root; 252 } 253 254 static inline void unlock_anon_vma_root(struct anon_vma *root) 255 { 256 if (root) 257 up_write(&root->rwsem); 258 } 259 260 /* 261 * Attach the anon_vmas from src to dst. 262 * Returns 0 on success, -ENOMEM on failure. 263 * 264 * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(), 265 * copy_vma() and anon_vma_fork(). The first four want an exact copy of src, 266 * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to 267 * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before 268 * call, we can identify this case by checking (!dst->anon_vma && 269 * src->anon_vma). 270 * 271 * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find 272 * and reuse existing anon_vma which has no vmas and only one child anon_vma. 273 * This prevents degradation of anon_vma hierarchy to endless linear chain in 274 * case of constantly forking task. On the other hand, an anon_vma with more 275 * than one child isn't reused even if there was no alive vma, thus rmap 276 * walker has a good chance of avoiding scanning the whole hierarchy when it 277 * searches where page is mapped. 
278 */ 279 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) 280 { 281 struct anon_vma_chain *avc, *pavc; 282 struct anon_vma *root = NULL; 283 284 list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { 285 struct anon_vma *anon_vma; 286 287 avc = anon_vma_chain_alloc(GFP_NOWAIT); 288 if (unlikely(!avc)) { 289 unlock_anon_vma_root(root); 290 root = NULL; 291 avc = anon_vma_chain_alloc(GFP_KERNEL); 292 if (!avc) 293 goto enomem_failure; 294 } 295 anon_vma = pavc->anon_vma; 296 root = lock_anon_vma_root(root, anon_vma); 297 anon_vma_chain_link(dst, avc, anon_vma); 298 299 /* 300 * Reuse existing anon_vma if it has no vma and only one 301 * anon_vma child. 302 * 303 * Root anon_vma is never reused: 304 * it has self-parent reference and at least one child. 305 */ 306 if (!dst->anon_vma && src->anon_vma && 307 anon_vma->num_children < 2 && 308 anon_vma->num_active_vmas == 0) 309 dst->anon_vma = anon_vma; 310 } 311 if (dst->anon_vma) 312 dst->anon_vma->num_active_vmas++; 313 unlock_anon_vma_root(root); 314 return 0; 315 316 enomem_failure: 317 /* 318 * dst->anon_vma is dropped here otherwise its num_active_vmas can 319 * be incorrectly decremented in unlink_anon_vmas(). 320 * We can safely do this because callers of anon_vma_clone() don't care 321 * about dst->anon_vma if anon_vma_clone() failed. 322 */ 323 dst->anon_vma = NULL; 324 unlink_anon_vmas(dst); 325 return -ENOMEM; 326 } 327 328 /* 329 * Attach vma to its own anon_vma, as well as to the anon_vmas that 330 * the corresponding VMA in the parent process is attached to. 331 * Returns 0 on success, non-zero on failure. 332 */ 333 int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) 334 { 335 struct anon_vma_chain *avc; 336 struct anon_vma *anon_vma; 337 int error; 338 339 /* Don't bother if the parent process has no anon_vma here. */ 340 if (!pvma->anon_vma) 341 return 0; 342 343 /* Drop inherited anon_vma, we'll reuse existing or allocate new. 
*/ 344 vma->anon_vma = NULL; 345 346 /* 347 * First, attach the new VMA to the parent VMA's anon_vmas, 348 * so rmap can find non-COWed pages in child processes. 349 */ 350 error = anon_vma_clone(vma, pvma); 351 if (error) 352 return error; 353 354 /* An existing anon_vma has been reused, all done then. */ 355 if (vma->anon_vma) 356 return 0; 357 358 /* Then add our own anon_vma. */ 359 anon_vma = anon_vma_alloc(); 360 if (!anon_vma) 361 goto out_error; 362 anon_vma->num_active_vmas++; 363 avc = anon_vma_chain_alloc(GFP_KERNEL); 364 if (!avc) 365 goto out_error_free_anon_vma; 366 367 /* 368 * The root anon_vma's rwsem is the lock actually used when we 369 * lock any of the anon_vmas in this anon_vma tree. 370 */ 371 anon_vma->root = pvma->anon_vma->root; 372 anon_vma->parent = pvma->anon_vma; 373 /* 374 * With refcounts, an anon_vma can stay around longer than the 375 * process it belongs to. The root anon_vma needs to be pinned until 376 * this anon_vma is freed, because the lock lives in the root. 377 */ 378 get_anon_vma(anon_vma->root); 379 /* Mark this anon_vma as the one where our new (COWed) pages go. */ 380 vma->anon_vma = anon_vma; 381 anon_vma_lock_write(anon_vma); 382 anon_vma_chain_link(vma, avc, anon_vma); 383 anon_vma->parent->num_children++; 384 anon_vma_unlock_write(anon_vma); 385 386 return 0; 387 388 out_error_free_anon_vma: 389 put_anon_vma(anon_vma); 390 out_error: 391 unlink_anon_vmas(vma); 392 return -ENOMEM; 393 } 394 395 void unlink_anon_vmas(struct vm_area_struct *vma) 396 { 397 struct anon_vma_chain *avc, *next; 398 struct anon_vma *root = NULL; 399 400 /* 401 * Unlink each anon_vma chained to the VMA. This list is ordered 402 * from newest to oldest, ensuring the root anon_vma gets freed last. 
403 */ 404 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 405 struct anon_vma *anon_vma = avc->anon_vma; 406 407 root = lock_anon_vma_root(root, anon_vma); 408 anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); 409 410 /* 411 * Leave empty anon_vmas on the list - we'll need 412 * to free them outside the lock. 413 */ 414 if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) { 415 anon_vma->parent->num_children--; 416 continue; 417 } 418 419 list_del(&avc->same_vma); 420 anon_vma_chain_free(avc); 421 } 422 if (vma->anon_vma) { 423 vma->anon_vma->num_active_vmas--; 424 425 /* 426 * vma would still be needed after unlink, and anon_vma will be prepared 427 * when handle fault. 428 */ 429 vma->anon_vma = NULL; 430 } 431 unlock_anon_vma_root(root); 432 433 /* 434 * Iterate the list once more, it now only contains empty and unlinked 435 * anon_vmas, destroy them. Could not do before due to __put_anon_vma() 436 * needing to write-acquire the anon_vma->root->rwsem. 437 */ 438 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 439 struct anon_vma *anon_vma = avc->anon_vma; 440 441 VM_WARN_ON(anon_vma->num_children); 442 VM_WARN_ON(anon_vma->num_active_vmas); 443 put_anon_vma(anon_vma); 444 445 list_del(&avc->same_vma); 446 anon_vma_chain_free(avc); 447 } 448 } 449 450 static void anon_vma_ctor(void *data) 451 { 452 struct anon_vma *anon_vma = data; 453 454 init_rwsem(&anon_vma->rwsem); 455 atomic_set(&anon_vma->refcount, 0); 456 anon_vma->rb_root = RB_ROOT_CACHED; 457 } 458 459 void __init anon_vma_init(void) 460 { 461 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 462 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, 463 anon_vma_ctor); 464 anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, 465 SLAB_PANIC|SLAB_ACCOUNT); 466 } 467 468 /* 469 * Getting a lock on a stable anon_vma from a page off the LRU is tricky! 
470 * 471 * Since there is no serialization what so ever against folio_remove_rmap_*() 472 * the best this function can do is return a refcount increased anon_vma 473 * that might have been relevant to this page. 474 * 475 * The page might have been remapped to a different anon_vma or the anon_vma 476 * returned may already be freed (and even reused). 477 * 478 * In case it was remapped to a different anon_vma, the new anon_vma will be a 479 * child of the old anon_vma, and the anon_vma lifetime rules will therefore 480 * ensure that any anon_vma obtained from the page will still be valid for as 481 * long as we observe page_mapped() [ hence all those page_mapped() tests ]. 482 * 483 * All users of this function must be very careful when walking the anon_vma 484 * chain and verify that the page in question is indeed mapped in it 485 * [ something equivalent to page_mapped_in_vma() ]. 486 * 487 * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from 488 * folio_remove_rmap_*() that the anon_vma pointer from page->mapping is valid 489 * if there is a mapcount, we can dereference the anon_vma after observing 490 * those. 491 * 492 * NOTE: the caller should hold folio lock when calling this. 493 */ 494 struct anon_vma *folio_get_anon_vma(const struct folio *folio) 495 { 496 struct anon_vma *anon_vma = NULL; 497 unsigned long anon_mapping; 498 499 VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); 500 501 rcu_read_lock(); 502 anon_mapping = (unsigned long)READ_ONCE(folio->mapping); 503 if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) 504 goto out; 505 if (!folio_mapped(folio)) 506 goto out; 507 508 anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON); 509 if (!atomic_inc_not_zero(&anon_vma->refcount)) { 510 anon_vma = NULL; 511 goto out; 512 } 513 514 /* 515 * If this folio is still mapped, then its anon_vma cannot have been 516 * freed. 
But if it has been unmapped, we have no security against the 517 * anon_vma structure being freed and reused (for another anon_vma: 518 * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero() 519 * above cannot corrupt). 520 */ 521 if (!folio_mapped(folio)) { 522 rcu_read_unlock(); 523 put_anon_vma(anon_vma); 524 return NULL; 525 } 526 out: 527 rcu_read_unlock(); 528 529 return anon_vma; 530 } 531 532 /* 533 * Similar to folio_get_anon_vma() except it locks the anon_vma. 534 * 535 * Its a little more complex as it tries to keep the fast path to a single 536 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a 537 * reference like with folio_get_anon_vma() and then block on the mutex 538 * on !rwc->try_lock case. 539 */ 540 struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio, 541 struct rmap_walk_control *rwc) 542 { 543 struct anon_vma *anon_vma = NULL; 544 struct anon_vma *root_anon_vma; 545 unsigned long anon_mapping; 546 547 VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); 548 549 rcu_read_lock(); 550 anon_mapping = (unsigned long)READ_ONCE(folio->mapping); 551 if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) 552 goto out; 553 if (!folio_mapped(folio)) 554 goto out; 555 556 anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON); 557 root_anon_vma = READ_ONCE(anon_vma->root); 558 if (down_read_trylock(&root_anon_vma->rwsem)) { 559 /* 560 * If the folio is still mapped, then this anon_vma is still 561 * its anon_vma, and holding the mutex ensures that it will 562 * not go away, see anon_vma_free(). 
563 */ 564 if (!folio_mapped(folio)) { 565 up_read(&root_anon_vma->rwsem); 566 anon_vma = NULL; 567 } 568 goto out; 569 } 570 571 if (rwc && rwc->try_lock) { 572 anon_vma = NULL; 573 rwc->contended = true; 574 goto out; 575 } 576 577 /* trylock failed, we got to sleep */ 578 if (!atomic_inc_not_zero(&anon_vma->refcount)) { 579 anon_vma = NULL; 580 goto out; 581 } 582 583 if (!folio_mapped(folio)) { 584 rcu_read_unlock(); 585 put_anon_vma(anon_vma); 586 return NULL; 587 } 588 589 /* we pinned the anon_vma, its safe to sleep */ 590 rcu_read_unlock(); 591 anon_vma_lock_read(anon_vma); 592 593 if (atomic_dec_and_test(&anon_vma->refcount)) { 594 /* 595 * Oops, we held the last refcount, release the lock 596 * and bail -- can't simply use put_anon_vma() because 597 * we'll deadlock on the anon_vma_lock_write() recursion. 598 */ 599 anon_vma_unlock_read(anon_vma); 600 __put_anon_vma(anon_vma); 601 anon_vma = NULL; 602 } 603 604 return anon_vma; 605 606 out: 607 rcu_read_unlock(); 608 return anon_vma; 609 } 610 611 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH 612 /* 613 * Flush TLB entries for recently unmapped pages from remote CPUs. It is 614 * important if a PTE was dirty when it was unmapped that it's flushed 615 * before any IO is initiated on the page to prevent lost writes. Similarly, 616 * it must be flushed before freeing to prevent data leakage. 617 */ 618 void try_to_unmap_flush(void) 619 { 620 struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; 621 622 if (!tlb_ubc->flush_required) 623 return; 624 625 arch_tlbbatch_flush(&tlb_ubc->arch); 626 tlb_ubc->flush_required = false; 627 tlb_ubc->writable = false; 628 } 629 630 /* Flush iff there are potentially writable TLB entries that can race with IO */ 631 void try_to_unmap_flush_dirty(void) 632 { 633 struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; 634 635 if (tlb_ubc->writable) 636 try_to_unmap_flush(); 637 } 638 639 /* 640 * Bits 0-14 of mm->tlb_flush_batched record pending generations. 
641 * Bits 16-30 of mm->tlb_flush_batched bit record flushed generations. 642 */ 643 #define TLB_FLUSH_BATCH_FLUSHED_SHIFT 16 644 #define TLB_FLUSH_BATCH_PENDING_MASK \ 645 ((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1) 646 #define TLB_FLUSH_BATCH_PENDING_LARGE \ 647 (TLB_FLUSH_BATCH_PENDING_MASK / 2) 648 649 static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, 650 unsigned long start, unsigned long end) 651 { 652 struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; 653 int batch; 654 bool writable = pte_dirty(pteval); 655 656 if (!pte_accessible(mm, pteval)) 657 return; 658 659 arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, start, end); 660 tlb_ubc->flush_required = true; 661 662 /* 663 * Ensure compiler does not re-order the setting of tlb_flush_batched 664 * before the PTE is cleared. 665 */ 666 barrier(); 667 batch = atomic_read(&mm->tlb_flush_batched); 668 retry: 669 if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) { 670 /* 671 * Prevent `pending' from catching up with `flushed' because of 672 * overflow. Reset `pending' and `flushed' to be 1 and 0 if 673 * `pending' becomes large. 674 */ 675 if (!atomic_try_cmpxchg(&mm->tlb_flush_batched, &batch, 1)) 676 goto retry; 677 } else { 678 atomic_inc(&mm->tlb_flush_batched); 679 } 680 681 /* 682 * If the PTE was dirty then it's best to assume it's writable. The 683 * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush() 684 * before the page is queued for IO. 685 */ 686 if (writable) 687 tlb_ubc->writable = true; 688 } 689 690 /* 691 * Returns true if the TLB flush should be deferred to the end of a batch of 692 * unmap operations to reduce IPIs. 
693 */ 694 static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) 695 { 696 if (!(flags & TTU_BATCH_FLUSH)) 697 return false; 698 699 return arch_tlbbatch_should_defer(mm); 700 } 701 702 /* 703 * Reclaim unmaps pages under the PTL but do not flush the TLB prior to 704 * releasing the PTL if TLB flushes are batched. It's possible for a parallel 705 * operation such as mprotect or munmap to race between reclaim unmapping 706 * the page and flushing the page. If this race occurs, it potentially allows 707 * access to data via a stale TLB entry. Tracking all mm's that have TLB 708 * batching in flight would be expensive during reclaim so instead track 709 * whether TLB batching occurred in the past and if so then do a flush here 710 * if required. This will cost one additional flush per reclaim cycle paid 711 * by the first operation at risk such as mprotect and mumap. 712 * 713 * This must be called under the PTL so that an access to tlb_flush_batched 714 * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise 715 * via the PTL. 716 */ 717 void flush_tlb_batched_pending(struct mm_struct *mm) 718 { 719 int batch = atomic_read(&mm->tlb_flush_batched); 720 int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK; 721 int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT; 722 723 if (pending != flushed) { 724 flush_tlb_mm(mm); 725 /* 726 * If the new TLB flushing is pending during flushing, leave 727 * mm->tlb_flush_batched as is, to avoid losing flushing. 
728 */ 729 atomic_cmpxchg(&mm->tlb_flush_batched, batch, 730 pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT)); 731 } 732 } 733 #else 734 static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, 735 unsigned long start, unsigned long end) 736 { 737 } 738 739 static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) 740 { 741 return false; 742 } 743 #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ 744 745 /** 746 * page_address_in_vma - The virtual address of a page in this VMA. 747 * @folio: The folio containing the page. 748 * @page: The page within the folio. 749 * @vma: The VMA we need to know the address in. 750 * 751 * Calculates the user virtual address of this page in the specified VMA. 752 * It is the caller's responsibility to check the page is actually 753 * within the VMA. There may not currently be a PTE pointing at this 754 * page, but if a page fault occurs at this address, this is the page 755 * which will be accessed. 756 * 757 * Context: Caller should hold a reference to the folio. Caller should 758 * hold a lock (eg the i_mmap_lock or the mmap_lock) which keeps the 759 * VMA from being altered. 760 * 761 * Return: The virtual address corresponding to this page in the VMA. 762 */ 763 unsigned long page_address_in_vma(const struct folio *folio, 764 const struct page *page, const struct vm_area_struct *vma) 765 { 766 if (folio_test_anon(folio)) { 767 struct anon_vma *anon_vma = folio_anon_vma(folio); 768 /* 769 * Note: swapoff's unuse_vma() is more efficient with this 770 * check, and needs it to match anon_vma when KSM is active. 
771 */ 772 if (!vma->anon_vma || !anon_vma || 773 vma->anon_vma->root != anon_vma->root) 774 return -EFAULT; 775 } else if (!vma->vm_file) { 776 return -EFAULT; 777 } else if (vma->vm_file->f_mapping != folio->mapping) { 778 return -EFAULT; 779 } 780 781 /* KSM folios don't reach here because of the !anon_vma check */ 782 return vma_address(vma, page_pgoff(folio, page), 1); 783 } 784 785 /* 786 * Returns the actual pmd_t* where we expect 'address' to be mapped from, or 787 * NULL if it doesn't exist. No guarantees / checks on what the pmd_t* 788 * represents. 789 */ 790 pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) 791 { 792 pgd_t *pgd; 793 p4d_t *p4d; 794 pud_t *pud; 795 pmd_t *pmd = NULL; 796 797 pgd = pgd_offset(mm, address); 798 if (!pgd_present(*pgd)) 799 goto out; 800 801 p4d = p4d_offset(pgd, address); 802 if (!p4d_present(*p4d)) 803 goto out; 804 805 pud = pud_offset(p4d, address); 806 if (!pud_present(*pud)) 807 goto out; 808 809 pmd = pmd_offset(pud, address); 810 out: 811 return pmd; 812 } 813 814 struct folio_referenced_arg { 815 int mapcount; 816 int referenced; 817 vm_flags_t vm_flags; 818 struct mem_cgroup *memcg; 819 }; 820 821 /* 822 * arg: folio_referenced_arg will be passed 823 */ 824 static bool folio_referenced_one(struct folio *folio, 825 struct vm_area_struct *vma, unsigned long address, void *arg) 826 { 827 struct folio_referenced_arg *pra = arg; 828 DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); 829 int ptes = 0, referenced = 0; 830 831 while (page_vma_mapped_walk(&pvmw)) { 832 address = pvmw.address; 833 834 if (vma->vm_flags & VM_LOCKED) { 835 ptes++; 836 pra->mapcount--; 837 838 /* Only mlock fully mapped pages */ 839 if (pvmw.pte && ptes != pvmw.nr_pages) 840 continue; 841 842 /* 843 * All PTEs must be protected by page table lock in 844 * order to mlock the page. 845 * 846 * If page table boundary has been cross, current ptl 847 * only protect part of ptes. 
848 */ 849 if (pvmw.flags & PVMW_PGTABLE_CROSSED) 850 continue; 851 852 /* Restore the mlock which got missed */ 853 mlock_vma_folio(folio, vma); 854 page_vma_mapped_walk_done(&pvmw); 855 pra->vm_flags |= VM_LOCKED; 856 return false; /* To break the loop */ 857 } 858 859 /* 860 * Skip the non-shared swapbacked folio mapped solely by 861 * the exiting or OOM-reaped process. This avoids redundant 862 * swap-out followed by an immediate unmap. 863 */ 864 if ((!atomic_read(&vma->vm_mm->mm_users) || 865 check_stable_address_space(vma->vm_mm)) && 866 folio_test_anon(folio) && folio_test_swapbacked(folio) && 867 !folio_maybe_mapped_shared(folio)) { 868 pra->referenced = -1; 869 page_vma_mapped_walk_done(&pvmw); 870 return false; 871 } 872 873 if (lru_gen_enabled() && pvmw.pte) { 874 if (lru_gen_look_around(&pvmw)) 875 referenced++; 876 } else if (pvmw.pte) { 877 if (ptep_clear_flush_young_notify(vma, address, 878 pvmw.pte)) 879 referenced++; 880 } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { 881 if (pmdp_clear_flush_young_notify(vma, address, 882 pvmw.pmd)) 883 referenced++; 884 } else { 885 /* unexpected pmd-mapped folio? */ 886 WARN_ON_ONCE(1); 887 } 888 889 pra->mapcount--; 890 } 891 892 if (referenced) 893 folio_clear_idle(folio); 894 if (folio_test_clear_young(folio)) 895 referenced++; 896 897 if (referenced) { 898 pra->referenced++; 899 pra->vm_flags |= vma->vm_flags & ~VM_LOCKED; 900 } 901 902 if (!pra->mapcount) 903 return false; /* To break the loop */ 904 905 return true; 906 } 907 908 static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg) 909 { 910 struct folio_referenced_arg *pra = arg; 911 struct mem_cgroup *memcg = pra->memcg; 912 913 /* 914 * Ignore references from this mapping if it has no recency. If the 915 * folio has been used in another mapping, we will catch it; if this 916 * other mapping is already gone, the unmap path will have set the 917 * referenced flag or activated the folio in zap_pte_range(). 
 */
	if (!vma_has_recency(vma))
		return true;

	/*
	 * If we are reclaiming on behalf of a cgroup, skip counting on behalf
	 * of references from different cgroups.
	 */
	if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
		return true;

	return false;
}

/**
 * folio_referenced() - Test if the folio was referenced.
 * @folio: The folio to test.
 * @is_locked: Caller holds lock on the folio.
 * @memcg: target memory cgroup
 * @vm_flags: A combination of all the vma->vm_flags which referenced the folio.
 *	      Always written: cleared on entry and, if the rmap walk runs,
 *	      replaced by the flags collected during the walk.
 *
 * Quick test_and_clear_referenced for all mappings of a folio.
 *
 * If the caller does not hold the folio lock (@is_locked == 0), the lock is
 * taken with folio_trylock() around the walk. When that trylock fails the
 * walk is skipped and the folio is reported as referenced once.
 *
 * Return: The number of mappings which referenced the folio. Return 0 if the
 * folio has no mapcount or no raw mapping, 1 if the folio lock could not be
 * taken, and -1 if the function bailed out due to rmap lock contention.
 */
int folio_referenced(struct folio *folio, int is_locked,
		     struct mem_cgroup *memcg, vm_flags_t *vm_flags)
{
	bool we_locked = false;
	/* Accumulator handed to folio_referenced_one() through rwc.arg. */
	struct folio_referenced_arg pra = {
		.mapcount = folio_mapcount(folio),
		.memcg = memcg,
	};
	struct rmap_walk_control rwc = {
		.rmap_one = folio_referenced_one,
		.arg = (void *)&pra,
		.anon_lock = folio_lock_anon_vma_read,
		.try_lock = true,
		.invalid_vma = invalid_folio_referenced_vma,
	};

	/* Make sure the caller never sees stale flags on the early returns. */
	*vm_flags = 0;
	if (!pra.mapcount)
		return 0;

	if (!folio_raw_mapping(folio))
		return 0;

	if (!is_locked) {
		we_locked = folio_trylock(folio);
		/* Could not lock: report one reference instead of walking. */
		if (!we_locked)
			return 1;
	}

	rmap_walk(folio, &rwc);
	*vm_flags = pra.vm_flags;

	/* Only drop the folio lock if it was taken here. */
	if (we_locked)
		folio_unlock(folio);

	/* Contention reported by the walk overrides the reference count. */
	return rwc.contended ? -1 : pra.referenced;
}

/*
 * page_vma_mkclean_one - clean and write-protect the mappings found by @pvmw.
 * @pvmw: walk state naming the VMA, start address and page range to visit.
 *
 * Iterates over page_vma_mapped_walk(). Each present PTE that is dirty or
 * writable is cleared, write-protected, marked clean and written back; with
 * CONFIG_TRANSPARENT_HUGEPAGE the same is done for a PMD mapping. The whole
 * walk is bracketed by an MMU notifier invalidation of the range.
 *
 * Return: the number of PTEs/PMDs that were modified.
 */
static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
{
	int cleaned = 0;
	struct vm_area_struct *vma = pvmw->vma;
	struct mmu_notifier_range range;
	unsigned long address = pvmw->address;

	/*
	 * We have to assume the worst case, i.e. pmd, for invalidation. Note
	 * that the folio can not be freed from this function.
	 */
	mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0,
				vma->vm_mm, address, vma_address_end(pvmw));
	mmu_notifier_invalidate_range_start(&range);

	while (page_vma_mapped_walk(pvmw)) {
		/* Set to 1 when this iteration changed an entry. */
		int ret = 0;

		address = pvmw->address;
		if (pvmw->pte) {
			pte_t *pte = pvmw->pte;
			pte_t entry = ptep_get(pte);

			/*
			 * PFN swap PTEs, such as device-exclusive ones, that
			 * actually map pages are clean and not writable from a
			 * CPU perspective. The MMU notifier takes care of any
			 * device aspects.
			 */
			if (!pte_present(entry))
				continue;
			/* Already clean and read-only: nothing to do. */
			if (!pte_dirty(entry) && !pte_write(entry))
				continue;

			flush_cache_page(vma, address, pte_pfn(entry));
			/* Rebuild the entry from the value that was cleared. */
			entry = ptep_clear_flush(vma, address, pte);
			entry = pte_wrprotect(entry);
			entry = pte_mkclean(entry);
			set_pte_at(vma->vm_mm, address, pte, entry);
			ret = 1;
		} else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
			pmd_t *pmd = pvmw->pmd;
			pmd_t entry;

			/* Already clean and read-only: nothing to do. */
			if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
				continue;

			flush_cache_range(vma, address,
					  address + HPAGE_PMD_SIZE);
			/* Rebuild the entry from the value that was invalidated. */
			entry = pmdp_invalidate(vma, address, pmd);
			entry = pmd_wrprotect(entry);
			entry = pmd_mkclean(entry);
			set_pmd_at(vma->vm_mm, address, pmd, entry);
			ret = 1;
#else
			/* unexpected pmd-mapped folio? */
			WARN_ON_ONCE(1);
#endif
		}

		if (ret)
			cleaned++;
	}

	mmu_notifier_invalidate_range_end(&range);

	return cleaned;
}

/*
 * rmap_one callback used by folio_mkclean(): cleans the mappings of @folio in
 * @vma starting at @address and adds the number of cleaned entries to the
 * int that @arg points to.
 *
 * Always returns true (presumably "continue the walk" - confirm against
 * rmap_walk()).
 */
static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
			     unsigned long address, void *arg)
{
	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
	int *cleaned = arg;

	*cleaned += page_vma_mkclean_one(&pvmw);

	return true;
}

/*
 * invalid_vma filter for the mkclean walkers: a VMA is considered invalid
 * (true) unless it has VM_SHARED set. @arg is unused.
 */
static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
{
	if (vma->vm_flags & VM_SHARED)
		return false;

	return true;
}

/**
 * folio_mkclean() - Clean and write-protect the shared mappings of a folio.
 * @folio: The folio to clean. Must be locked by the caller (enforced with
 *	   BUG_ON()).
 *
 * Walks the reverse mapping of @folio, skipping VMAs rejected by
 * invalid_mkclean_vma(), and cleans every mapping found.
 *
 * Return: the number of page table entries that were cleaned; 0 if the folio
 * is not mapped or has no mapping.
 */
int folio_mkclean(struct folio *folio)
{
	int cleaned = 0;
	struct address_space *mapping;
	struct rmap_walk_control rwc = {
		.arg = (void *)&cleaned,
		.rmap_one = page_mkclean_one,
		.invalid_vma = invalid_mkclean_vma,
	};

	BUG_ON(!folio_test_locked(folio));

	if (!folio_mapped(folio))
		return 0;

	mapping = folio_mapping(folio);
	if (!mapping)
		return 0;

	rmap_walk(folio, &rwc);

	return cleaned;
}
EXPORT_SYMBOL_GPL(folio_mkclean);

/*
 * Walk state for mapping_wrprotect_range(): the range to look for and the
 * running result.
 */
struct wrprotect_file_state {
	int cleaned;		/* entries cleaned so far */
	pgoff_t pgoff;		/* page offset of @pfn within the mapping */
	unsigned long pfn;	/* first PFN of the range */
	unsigned long nr_pages;	/* number of pages in the range */
};

/*
 * rmap_one callback used by mapping_wrprotect_range(): builds a
 * page_vma_mapped_walk from the PFN range stored in @arg (a struct
 * wrprotect_file_state) and cleans what it maps in @vma at @address.
 * @folio is not used. Always returns true.
 */
static bool mapping_wrprotect_range_one(struct folio *folio,
		struct vm_area_struct *vma, unsigned long address, void *arg)
{
	struct wrprotect_file_state *state = (struct wrprotect_file_state *)arg;
	struct page_vma_mapped_walk pvmw = {
		.pfn = state->pfn,
		.nr_pages = state->nr_pages,
		.pgoff = state->pgoff,
		.vma = vma,
		.address = address,
		.flags = PVMW_SYNC,
	};

	state->cleaned += page_vma_mkclean_one(&pvmw);

	return true;
}

/* Forward declaration; the definition is not part of this section. */
static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
			     pgoff_t pgoff_start, unsigned long nr_pages,
			     struct rmap_walk_control *rwc, bool locked);

/**
 * mapping_wrprotect_range() - Write-protect all mappings in a specified range.
 *
 * @mapping: The mapping whose reverse mapping should be traversed.
 * @pgoff: The page offset at which @pfn is mapped within @mapping.
 * @pfn: The PFN of the page mapped in @mapping at @pgoff.
 * @nr_pages: The number of physically contiguous base pages spanned.
 *
 * Traverses the reverse mapping, finding all VMAs which contain a shared
 * mapping of the pages in the specified range in @mapping, and write-protects
 * them (that is, updates the page tables to mark the mappings read-only such
 * that a write protection fault arises when the mappings are written to).
 *
 * The @pfn value need not refer to a folio, but rather can reference a kernel
 * allocation which is mapped into userland. We therefore do not require that
 * the page maps to a folio with a valid mapping or index field, rather the
 * caller specifies these in @mapping and @pgoff.
 *
 * Return: the number of write-protected page table entries (0 if @mapping is
 * NULL).
 */
int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
		unsigned long pfn, unsigned long nr_pages)
{
	struct wrprotect_file_state state = {
		.cleaned = 0,
		.pgoff = pgoff,
		.pfn = pfn,
		.nr_pages = nr_pages,
	};
	struct rmap_walk_control rwc = {
		.arg = (void *)&state,
		.rmap_one = mapping_wrprotect_range_one,
		.invalid_vma = invalid_mkclean_vma,
	};

	/* Nothing to walk without a mapping. */
	if (!mapping)
		return 0;

	/* There is no folio to hand to the walk; the range is in @state. */
	__rmap_walk_file(/* folio = */NULL, mapping, pgoff, nr_pages, &rwc,
			 /* locked = */false);

	return state.cleaned;
}
EXPORT_SYMBOL_GPL(mapping_wrprotect_range);

/**
 * pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of
 *                     [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff)
 *                     within the @vma of shared mappings. And since clean PTEs
 *                     should also be readonly, write protects them too.
 * @pfn: start pfn.
 * @nr_pages: number of physically contiguous pages starting with @pfn.
 * @pgoff: page offset that the @pfn mapped with.
 * @vma: vma that @pfn mapped within.
 *
 * Returns the number of cleaned PTEs (including PMDs). Returns 0 without
 * touching any page table if @vma is not a VM_SHARED mapping.
 */
int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
		      struct vm_area_struct *vma)
{
	struct page_vma_mapped_walk pvmw = {
		.pfn = pfn,
		.nr_pages = nr_pages,
		.pgoff = pgoff,
		.vma = vma,
		.flags = PVMW_SYNC,
	};

	if (invalid_mkclean_vma(vma, NULL))
		return 0;

	/* The caller must pass a range that is actually inside @vma. */
	pvmw.address = vma_address(vma, pgoff, nr_pages);
	VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma);

	return page_vma_mkclean_one(&pvmw);
}

/*
 * __folio_mod_stat - apply mapping statistics deltas for @folio.
 * @folio: The folio whose mapping state changed.
 * @nr: Delta for NR_ANON_MAPPED (anon folio) or NR_FILE_MAPPED (otherwise);
 *	skipped when 0.
 * @nr_pmdmapped: Delta for the PMD-mapped counter; skipped when 0. Anon folios
 *	use NR_ANON_THPS, others use NR_SHMEM_PMDMAPPED or NR_FILE_PMDMAPPED
 *	depending on folio_test_swapbacked().
 *
 * Callers in this file pass positive values when adding mappings and negated
 * values when removing them.
 */
static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped)
{
	int idx;

	if (nr) {
		idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
		__lruvec_stat_mod_folio(folio, idx, nr);
	}
	if (nr_pmdmapped) {
		if (folio_test_anon(folio)) {
			idx = NR_ANON_THPS;
			__lruvec_stat_mod_folio(folio, idx, nr_pmdmapped);
		} else {
			/* NR_*_PMDMAPPED are not maintained per-memcg */
			idx = folio_test_swapbacked(folio) ?
				NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED;
			__mod_node_page_state(folio_pgdat(folio), idx,
					      nr_pmdmapped);
		}
	}
}

/*
 * __folio_add_rmap - account for new page table mappings of @folio.
 * @folio: The folio being mapped.
 * @page: The first page of the range being mapped.
 * @nr_pages: The number of pages in the range.
 * @vma: The VMA the mappings are added to.
 * @level: The page table level of the new mapping(s).
 *
 * Updates the mapcount fields that apply to @level and to the kernel
 * configuration (CONFIG_NO_PAGE_MAPCOUNT selects folio-level counting only),
 * then hands the resulting deltas to __folio_mod_stat(): "nr" for the mapped
 * counter and "nr_pmdmapped" for the PMD-mapped counter.
 */
static __always_inline void __folio_add_rmap(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *vma,
		enum pgtable_level level)
{
	atomic_t *mapped = &folio->_nr_pages_mapped;
	/* nr_pages is consumed by the per-page loop below; keep the original. */
	const int orig_nr_pages = nr_pages;
	int first = 0, nr = 0, nr_pmdmapped = 0;

	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	switch (level) {
	case PGTABLE_LEVEL_PTE:
		if (!folio_test_large(folio)) {
			/* Small folio: counts as 1 only for its first mapping. */
			nr = atomic_inc_and_test(&folio->_mapcount);
			break;
		}

		if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
			nr = folio_add_return_large_mapcount(folio, orig_nr_pages, vma);
			if (nr == orig_nr_pages)
				/* Was completely unmapped. */
				nr = folio_large_nr_pages(folio);
			else
				nr = 0;
			break;
		}

		/* Count the pages in the range that just got their first mapping. */
		do {
			first += atomic_inc_and_test(&page->_mapcount);
		} while (page++, --nr_pages > 0);

		/*
		 * Only report them if the folio is not also entirely mapped
		 * (ENTIRELY_MAPPED not set in _nr_pages_mapped).
		 */
		if (first &&
		    atomic_add_return_relaxed(first, mapped) < ENTIRELY_MAPPED)
			nr = first;

		folio_add_large_mapcount(folio, orig_nr_pages, vma);
		break;
	case PGTABLE_LEVEL_PMD:
	case PGTABLE_LEVEL_PUD:
		first = atomic_inc_and_test(&folio->_entire_mapcount);
		if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
			if (level == PGTABLE_LEVEL_PMD && first)
				nr_pmdmapped = folio_large_nr_pages(folio);
			nr = folio_inc_return_large_mapcount(folio, vma);
			if (nr == 1)
				/* Was completely unmapped. */
				nr = folio_large_nr_pages(folio);
			else
				nr = 0;
			break;
		}

		if (first) {
			nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped);
			if (likely(nr < ENTIRELY_MAPPED + ENTIRELY_MAPPED)) {
				nr_pages = folio_large_nr_pages(folio);
				/*
				 * We only track PMD mappings of PMD-sized
				 * folios separately.
				 */
				if (level == PGTABLE_LEVEL_PMD)
					nr_pmdmapped = nr_pages;
				/* Pages already PTE-mapped were counted before. */
				nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
				/* Raced ahead of a remove and another add? */
				if (unlikely(nr < 0))
					nr = 0;
			} else {
				/* Raced ahead of a remove of ENTIRELY_MAPPED */
				nr = 0;
			}
		}
		folio_inc_large_mapcount(folio, vma);
		break;
	default:
		BUILD_BUG();
	}
	__folio_mod_stat(folio, nr, nr_pmdmapped);
}

/**
 * folio_move_anon_rmap - move a folio to our anon_vma
 * @folio: The folio to move to our anon_vma. Must be locked.
 * @vma: The vma the folio belongs to. Must have an anon_vma.
 *
 * When a folio belongs exclusively to one process after a COW event,
 * that folio can be moved into the anon_vma that belongs to just that
 * process, so the rmap code will not search the parent or sibling processes.
 */
void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma)
{
	void *anon_vma = vma->anon_vma;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_VMA(!anon_vma, vma);

	/* Tag the pointer so folio->mapping is recognised as anonymous. */
	anon_vma += FOLIO_MAPPING_ANON;
	/*
	 * Ensure that anon_vma and the FOLIO_MAPPING_ANON bit are written
	 * simultaneously, so a concurrent reader (eg folio_referenced()'s
	 * folio_test_anon()) will not see one without the other.
	 */
	WRITE_ONCE(folio->mapping, anon_vma);
}

/**
 * __folio_set_anon - set up a new anonymous rmap for a folio
 * @folio: The folio to set up the new anonymous rmap for.
 * @vma: VM area to add the folio to.
 * @address: User virtual address of the mapping
 * @exclusive: Whether the folio is exclusive to the process.
 *
 * Stores the tagged anon_vma pointer in folio->mapping and sets folio->index
 * to the linear page index of @address within @vma. @vma must already have an
 * anon_vma (enforced with BUG_ON()).
 */
static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma,
			     unsigned long address, bool exclusive)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	BUG_ON(!anon_vma);

	/*
	 * If the folio isn't exclusive to this vma, we must use the _oldest_
	 * possible anon_vma for the folio mapping!
	 */
	if (!exclusive)
		anon_vma = anon_vma->root;

	/*
	 * page_idle does a lockless/optimistic rmap scan on folio->mapping.
	 * Make sure the compiler doesn't split the stores of anon_vma and
	 * the FOLIO_MAPPING_ANON type identifier, otherwise the rmap code
	 * could mistake the mapping for a struct address_space and crash.
	 */
	anon_vma = (void *) anon_vma + FOLIO_MAPPING_ANON;
	WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma);
	folio->index = linear_page_index(vma, address);
}

/**
 * __page_check_anon_rmap - sanity check anonymous rmap addition
 * @folio: The folio containing @page.
 * @page: the page to check the mapping of
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 *
 * Debug-only checks (VM_BUG_ON_*): the folio's anon_vma must share its root
 * with @vma's anon_vma, and the page offset of @page must equal the linear
 * page index of @address in @vma.
 */
static void __page_check_anon_rmap(const struct folio *folio,
		const struct page *page, struct vm_area_struct *vma,
		unsigned long address)
{
	/*
	 * The page's anon-rmap details (mapping and index) are guaranteed to
	 * be set up correctly at this point.
	 *
	 * We have exclusion against folio_add_anon_rmap_*() because the caller
	 * always holds the page locked.
	 *
	 * We have exclusion against folio_add_new_anon_rmap because those pages
	 * are initially only visible via the pagetables, and the pte is locked
	 * over the call to folio_add_new_anon_rmap.
	 */
	VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root,
			folio);
	VM_BUG_ON_PAGE(page_pgoff(folio, page) != linear_page_index(vma, address),
		       page);
}

/*
 * __folio_add_anon_rmap - common implementation of folio_add_anon_rmap_*().
 * @folio: The anon folio to add the mapping(s) to.
 * @page: The first page of the range being mapped.
 * @nr_pages: The number of pages in the range.
 * @vma: The VMA the mappings are added to.
 * @address: The user virtual address of the first page.
 * @flags: The rmap flags; RMAP_EXCLUSIVE marks the mapped page(s)
 *	   PageAnonExclusive.
 * @level: The page table level of the mapping.
 *
 * Updates the mapcounts through __folio_add_rmap(), runs the anon sanity
 * checks (skipped for KSM folios), applies RMAP_EXCLUSIVE and finally mlocks
 * the folio if the whole folio was mapped by this call.
 */
static __always_inline void __folio_add_anon_rmap(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *vma,
		unsigned long address, rmap_t flags, enum pgtable_level level)
{
	int i;

	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

	__folio_add_rmap(folio, page, nr_pages, vma, level);

	if (likely(!folio_test_ksm(folio)))
		__page_check_anon_rmap(folio, page, vma, address);

	if (flags & RMAP_EXCLUSIVE) {
		switch (level) {
		case PGTABLE_LEVEL_PTE:
			/* PTE mappings: every page in the range is flagged. */
			for (i = 0; i < nr_pages; i++)
				SetPageAnonExclusive(page + i);
			break;
		case PGTABLE_LEVEL_PMD:
			/* PMD mapping: only the first page carries the flag. */
			SetPageAnonExclusive(page);
			break;
		case PGTABLE_LEVEL_PUD:
			/*
			 * Keep the compiler happy, we don't support anonymous
			 * PUD mappings.
			 */
			WARN_ON_ONCE(1);
			break;
		default:
			BUILD_BUG();
		}
	}

	/* An exclusive page must not end up mapped more than once. */
	VM_WARN_ON_FOLIO(!folio_test_large(folio) && PageAnonExclusive(page) &&
			 atomic_read(&folio->_mapcount) > 0, folio);
	for (i = 0; i < nr_pages; i++) {
		struct page *cur_page = page + i;

		VM_WARN_ON_FOLIO(folio_test_large(folio) &&
				 folio_entire_mapcount(folio) > 1 &&
				 PageAnonExclusive(cur_page), folio);
		/* The per-page check below only applies with per-page mapcounts. */
		if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT))
			continue;

		/*
		 * While PTE-mapping a THP we have a PMD and a PTE
		 * mapping.
		 */
		VM_WARN_ON_FOLIO(atomic_read(&cur_page->_mapcount) > 0 &&
				 PageAnonExclusive(cur_page), folio);
	}

	/*
	 * Only mlock it if the folio is fully mapped to the VMA.
	 *
	 * Partially mapped folios can be split on reclaim and part outside
	 * of mlocked VMA can be evicted or freed.
	 */
	if (folio_nr_pages(folio) == nr_pages)
		mlock_vma_folio(folio, vma);
}

/**
 * folio_add_anon_rmap_ptes - add PTE mappings to a page range of an anon folio
 * @folio: The folio to add the mappings to
 * @page: The first page to add
 * @nr_pages: The number of pages which will be mapped
 * @vma: The vm area in which the mappings are added
 * @address: The user virtual address of the first page to map
 * @flags: The rmap flags
 *
 * The page range of folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock, and the page must be locked in
 * the anon_vma case: to serialize mapping,index checking after setting,
 * and to ensure that an anon folio is not being upgraded racily to a KSM folio
 * (but KSM folios are never downgraded).
 */
void folio_add_anon_rmap_ptes(struct folio *folio, struct page *page,
		int nr_pages, struct vm_area_struct *vma, unsigned long address,
		rmap_t flags)
{
	__folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags,
			      PGTABLE_LEVEL_PTE);
}

/**
 * folio_add_anon_rmap_pmd - add a PMD mapping to a page range of an anon folio
 * @folio: The folio to add the mapping to
 * @page: The first page to add
 * @vma: The vm area in which the mapping is added
 * @address: The user virtual address of the first page to map
 * @flags: The rmap flags
 *
 * The page range of folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock, and the page must be locked in
 * the anon_vma case: to serialize mapping,index checking after setting.
 */
void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page,
		struct vm_area_struct *vma, unsigned long address, rmap_t flags)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	__folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address, flags,
			      PGTABLE_LEVEL_PMD);
#else
	/* Without THP support nobody should be adding PMD mappings. */
	WARN_ON_ONCE(true);
#endif
}

/**
 * folio_add_new_anon_rmap - Add mapping to a new anonymous folio.
 * @folio:	The folio to add the mapping to.
 * @vma:	the vm area in which the mapping is added
 * @address:	the user virtual address mapped
 * @flags:	The rmap flags
 *
 * Like folio_add_anon_rmap_*() but must only be called on *new* folios.
 * This means the inc-and-test can be bypassed.
 * The folio doesn't necessarily need to be locked while it's exclusive
 * unless two threads map it concurrently. However, the folio must be
 * locked if it's shared.
 *
 * If the folio is pmd-mappable, it is accounted as a THP.
 */
void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
		unsigned long address, rmap_t flags)
{
	const bool exclusive = flags & RMAP_EXCLUSIVE;
	/* Deltas for __folio_mod_stat(); nr defaults to a single small page. */
	int nr = 1, nr_pmdmapped = 0;

	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio);

	/*
	 * VM_DROPPABLE mappings don't swap; instead they're just dropped when
	 * under memory pressure.
	 */
	if (!folio_test_swapbacked(folio) && !(vma->vm_flags & VM_DROPPABLE))
		__folio_set_swapbacked(folio);
	__folio_set_anon(folio, vma, address, exclusive);

	if (likely(!folio_test_large(folio))) {
		/* increment count (starts at -1) */
		atomic_set(&folio->_mapcount, 0);
		if (exclusive)
			SetPageAnonExclusive(&folio->page);
	} else if (!folio_test_pmd_mappable(folio)) {
		/* Large folio that is not PMD-mappable: every page is mapped. */
		int i;

		nr = folio_large_nr_pages(folio);
		for (i = 0; i < nr; i++) {
			struct page *page = folio_page(folio, i);

			if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
				/* increment count (starts at -1) */
				atomic_set(&page->_mapcount, 0);
			if (exclusive)
				SetPageAnonExclusive(page);
		}

		folio_set_large_mapcount(folio, nr, vma);
		if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
			atomic_set(&folio->_nr_pages_mapped, nr);
	} else {
		/* PMD-mappable folio: recorded as one entire mapping. */
		nr = folio_large_nr_pages(folio);
		/* increment count (starts at -1) */
		atomic_set(&folio->_entire_mapcount, 0);
		folio_set_large_mapcount(folio, 1, vma);
		if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
			atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED);
		if (exclusive)
			SetPageAnonExclusive(&folio->page);
		nr_pmdmapped = nr;
	}

	/* The whole folio must lie within @vma. */
	VM_WARN_ON_ONCE(address < vma->vm_start ||
			address + (nr << PAGE_SHIFT) > vma->vm_end);

	__folio_mod_stat(folio, nr, nr_pmdmapped);
	mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
}

/*
 * __folio_add_file_rmap - common implementation of folio_add_file_rmap_*().
 * @folio: The (non-anon) folio to add the mapping(s) to.
 * @page: The first page of the range being mapped.
 * @nr_pages: The number of pages in the range.
 * @vma: The VMA the mappings are added to.
 * @level: The page table level of the mapping.
 *
 * Updates the mapcounts through __folio_add_rmap() and mlocks the folio if
 * the whole folio was mapped by this call.
 */
static __always_inline void __folio_add_file_rmap(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *vma,
		enum pgtable_level level)
{
	VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);

	__folio_add_rmap(folio, page, nr_pages, vma, level);

	/*
	 * Only mlock it if the folio is fully mapped to the VMA.
	 *
	 * Partially mapped folios can be split on reclaim and part outside
	 * of mlocked VMA can be evicted or freed.
	 */
	if (folio_nr_pages(folio) == nr_pages)
		mlock_vma_folio(folio, vma);
}

/**
 * folio_add_file_rmap_ptes - add PTE mappings to a page range of a folio
 * @folio:	The folio to add the mappings to
 * @page:	The first page to add
 * @nr_pages:	The number of pages that will be mapped using PTEs
 * @vma:	The vm area in which the mappings are added
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock.
 */
void folio_add_file_rmap_ptes(struct folio *folio, struct page *page,
		int nr_pages, struct vm_area_struct *vma)
{
	__folio_add_file_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE);
}

/**
 * folio_add_file_rmap_pmd - add a PMD mapping to a page range of a folio
 * @folio:	The folio to add the mapping to
 * @page:	The first page to add
 * @vma:	The vm area in which the mapping is added
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock.
 */
void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
		struct vm_area_struct *vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	__folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD);
#else
	/* Without THP support nobody should be adding PMD mappings. */
	WARN_ON_ONCE(true);
#endif
}

/**
 * folio_add_file_rmap_pud - add a PUD mapping to a page range of a folio
 * @folio:	The folio to add the mapping to
 * @page:	The first page to add
 * @vma:	The vm area in which the mapping is added
 *
 * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
 *
 * The caller needs to hold the page table lock.
 */
void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
		struct vm_area_struct *vma)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
	__folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD);
#else
	/* Without PUD-level THP support nobody should be adding PUD mappings. */
	WARN_ON_ONCE(true);
#endif
}

/*
 * __folio_remove_rmap - common implementation of folio_remove_rmap_*().
 * @folio: The folio to remove the mapping(s) from.
 * @page: The first page of the range being unmapped.
 * @nr_pages: The number of pages in the range.
 * @vma: The VMA the mappings are removed from.
 * @level: The page table level of the mapping.
 *
 * Mirror image of __folio_add_rmap(): drops the mapcounts that apply to
 * @level and the kernel configuration, queues an anon large folio for
 * deferred split when it became partially mapped, subtracts the resulting
 * deltas through __folio_mod_stat() and finally calls munlock_vma_folio().
 */
static __always_inline void __folio_remove_rmap(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *vma,
		enum pgtable_level level)
{
	atomic_t *mapped = &folio->_nr_pages_mapped;
	int last = 0, nr = 0, nr_pmdmapped = 0;
	bool partially_mapped = false;

	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	switch (level) {
	case PGTABLE_LEVEL_PTE:
		if (!folio_test_large(folio)) {
			/* Small folio: counts as 1 only when its last mapping goes. */
			nr = atomic_add_negative(-1, &folio->_mapcount);
			break;
		}

		if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
			nr = folio_sub_return_large_mapcount(folio, nr_pages, vma);
			if (!nr) {
				/* Now completely unmapped. */
				nr = folio_large_nr_pages(folio);
			} else {
				partially_mapped = nr < folio_large_nr_pages(folio) &&
						   !folio_entire_mapcount(folio);
				nr = 0;
			}
			break;
		}

		folio_sub_large_mapcount(folio, nr_pages, vma);
		/* Count the pages in the range that just lost their last mapping. */
		do {
			last += atomic_add_negative(-1, &page->_mapcount);
		} while (page++, --nr_pages > 0);

		/*
		 * Only report them if the folio is not still entirely mapped
		 * (ENTIRELY_MAPPED not set in _nr_pages_mapped).
		 */
		if (last &&
		    atomic_sub_return_relaxed(last, mapped) < ENTIRELY_MAPPED)
			nr = last;

		/* Some pages went away while others are still mapped. */
		partially_mapped = nr && atomic_read(mapped);
		break;
	case PGTABLE_LEVEL_PMD:
	case PGTABLE_LEVEL_PUD:
		if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
			last = atomic_add_negative(-1, &folio->_entire_mapcount);
			if (level == PGTABLE_LEVEL_PMD && last)
				nr_pmdmapped = folio_large_nr_pages(folio);
			nr = folio_dec_return_large_mapcount(folio, vma);
			if (!nr) {
				/* Now completely unmapped. */
				nr = folio_large_nr_pages(folio);
			} else {
				partially_mapped = last &&
						   nr < folio_large_nr_pages(folio);
				nr = 0;
			}
			break;
		}

		folio_dec_large_mapcount(folio, vma);
		last = atomic_add_negative(-1, &folio->_entire_mapcount);
		if (last) {
			nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped);
			if (likely(nr < ENTIRELY_MAPPED)) {
				nr_pages = folio_large_nr_pages(folio);
				if (level == PGTABLE_LEVEL_PMD)
					nr_pmdmapped = nr_pages;
				/* Pages that remain PTE-mapped stay accounted. */
				nr = nr_pages - nr;
				/* Raced ahead of another remove and an add? */
				if (unlikely(nr < 0))
					nr = 0;
			} else {
				/* An add of ENTIRELY_MAPPED raced ahead */
				nr = 0;
			}
		}

		/* Fewer pages became unmapped than the entire mapping covered. */
		partially_mapped = nr && nr < nr_pmdmapped;
		break;
	default:
		BUILD_BUG();
	}

	/*
	 * Queue anon large folio for deferred split if at least one page of
	 * the folio is unmapped and at least one page is still mapped.
	 *
	 * Check partially_mapped first to ensure it is a large folio.
	 */
	if (partially_mapped && folio_test_anon(folio) &&
	    !folio_test_partially_mapped(folio))
		deferred_split_folio(folio, true);

	__folio_mod_stat(folio, -nr, -nr_pmdmapped);

	/*
	 * It would be tidy to reset folio_test_anon mapping when fully
	 * unmapped, but that might overwrite a racing folio_add_anon_rmap_*()
	 * which increments mapcount after us but sets mapping before us:
	 * so leave the reset to free_pages_prepare, and remember that
	 * it's only reliable while mapped.
	 */

	munlock_vma_folio(folio, vma);
}

/**
 * folio_remove_rmap_ptes - remove PTE mappings from a page range of a folio
 * @folio:	The folio to remove the mappings from
 * @page:	The first page to remove
 * @nr_pages:	The number of pages that will be removed from the mapping
 * @vma:	The vm area from which the mappings are removed
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock.
 */
void folio_remove_rmap_ptes(struct folio *folio, struct page *page,
		int nr_pages, struct vm_area_struct *vma)
{
	__folio_remove_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE);
}

/**
 * folio_remove_rmap_pmd - remove a PMD mapping from a page range of a folio
 * @folio:	The folio to remove the mapping from
 * @page:	The first page to remove
 * @vma:	The vm area from which the mapping is removed
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock.
 */
void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
		struct vm_area_struct *vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	__folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD);
#else
	/* Without THP support there can be no PMD mapping to remove. */
	WARN_ON_ONCE(true);
#endif
}

/**
 * folio_remove_rmap_pud - remove a PUD mapping from a page range of a folio
 * @folio:	The folio to remove the mapping from
 * @page:	The first page to remove
 * @vma:	The vm area from which the mapping is removed
 *
 * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
 *
 * The caller needs to hold the page table lock.
 */
void folio_remove_rmap_pud(struct folio *folio, struct page *page,
		struct vm_area_struct *vma)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
	__folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD);
#else
	/* Without PUD-level THP support there can be no PUD mapping to remove. */
	WARN_ON_ONCE(true);
#endif
}

/*
 * folio_unmap_pte_batch - decide how many PTEs try_to_unmap_one() may unmap
 * in one go.
 * @folio: The folio being unmapped.
 * @pvmw: The walk state; supplies the current address, VMA and PTE pointer.
 * @flags: The ttu flags of the unmap operation.
 * @pte: The value of the PTE at pvmw->pte.
 *
 * Batching is declined (returns 1) for TTU_HWPOISON, for small folios, for
 * anything other than anon folios that are not swap-backed, and for PTEs that
 * pte_unused() reports as unused. Otherwise the decision is left to
 * folio_pte_batch(), limited to the entries between the current address and
 * the end of the page table or VMA, whichever comes first.
 *
 * Return: 1 when batching is not applicable, otherwise the value returned by
 * folio_pte_batch().
 */
static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
			struct page_vma_mapped_walk *pvmw,
			enum ttu_flags flags, pte_t pte)
{
	unsigned long end_addr, addr = pvmw->address;
	struct vm_area_struct *vma = pvmw->vma;
	unsigned int max_nr;

	if (flags & TTU_HWPOISON)
		return 1;
	if (!folio_test_large(folio))
		return 1;

	/* We may only batch within a single VMA and a single page table. */
	end_addr = pmd_addr_end(addr, vma->vm_end);
	max_nr = (end_addr - addr) >> PAGE_SHIFT;

	/* We only support lazyfree batching for now ... */
	if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
		return 1;
	if (pte_unused(pte))
		return 1;

	return folio_pte_batch(folio, pvmw->pte, pte, max_nr);
}

/*
 * @arg: enum ttu_flags will be passed to this argument
 */
static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
		     unsigned long address, void *arg)
{
	struct mm_struct *mm = vma->vm_mm;
	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
	bool anon_exclusive, ret = true;
	pte_t pteval;
	struct page *subpage;
	struct mmu_notifier_range range;
	enum ttu_flags flags = (enum ttu_flags)(long)arg;
	unsigned long nr_pages = 1, end_addr;
	unsigned long pfn;
	unsigned long hsz = 0;
	int ptes = 0;

	/*
	 * When racing against e.g.
zap_pte_range() on another cpu, 1858 * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(), 1859 * try_to_unmap() may return before page_mapped() has become false, 1860 * if page table locking is skipped: use TTU_SYNC to wait for that. 1861 */ 1862 if (flags & TTU_SYNC) 1863 pvmw.flags = PVMW_SYNC; 1864 1865 /* 1866 * For THP, we have to assume the worse case ie pmd for invalidation. 1867 * For hugetlb, it could be much worse if we need to do pud 1868 * invalidation in the case of pmd sharing. 1869 * 1870 * Note that the folio can not be freed in this function as call of 1871 * try_to_unmap() must hold a reference on the folio. 1872 */ 1873 range.end = vma_address_end(&pvmw); 1874 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, 1875 address, range.end); 1876 if (folio_test_hugetlb(folio)) { 1877 /* 1878 * If sharing is possible, start and end will be adjusted 1879 * accordingly. 1880 */ 1881 adjust_range_if_pmd_sharing_possible(vma, &range.start, 1882 &range.end); 1883 1884 /* We need the huge page size for set_huge_pte_at() */ 1885 hsz = huge_page_size(hstate_vma(vma)); 1886 } 1887 mmu_notifier_invalidate_range_start(&range); 1888 1889 while (page_vma_mapped_walk(&pvmw)) { 1890 /* 1891 * If the folio is in an mlock()d vma, we must not swap it out. 1892 */ 1893 if (!(flags & TTU_IGNORE_MLOCK) && 1894 (vma->vm_flags & VM_LOCKED)) { 1895 ptes++; 1896 1897 /* 1898 * Set 'ret' to indicate the page cannot be unmapped. 1899 * 1900 * Do not jump to walk_abort immediately as additional 1901 * iteration might be required to detect fully mapped 1902 * folio an mlock it. 1903 */ 1904 ret = false; 1905 1906 /* Only mlock fully mapped pages */ 1907 if (pvmw.pte && ptes != pvmw.nr_pages) 1908 continue; 1909 1910 /* 1911 * All PTEs must be protected by page table lock in 1912 * order to mlock the page. 1913 * 1914 * If page table boundary has been cross, current ptl 1915 * only protect part of ptes. 
1916 */ 1917 if (pvmw.flags & PVMW_PGTABLE_CROSSED) 1918 goto walk_done; 1919 1920 /* Restore the mlock which got missed */ 1921 mlock_vma_folio(folio, vma); 1922 goto walk_done; 1923 } 1924 1925 if (!pvmw.pte) { 1926 if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { 1927 if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio)) 1928 goto walk_done; 1929 /* 1930 * unmap_huge_pmd_locked has either already marked 1931 * the folio as swap-backed or decided to retain it 1932 * due to GUP or speculative references. 1933 */ 1934 goto walk_abort; 1935 } 1936 1937 if (flags & TTU_SPLIT_HUGE_PMD) { 1938 /* 1939 * We temporarily have to drop the PTL and 1940 * restart so we can process the PTE-mapped THP. 1941 */ 1942 split_huge_pmd_locked(vma, pvmw.address, 1943 pvmw.pmd, false); 1944 flags &= ~TTU_SPLIT_HUGE_PMD; 1945 page_vma_mapped_walk_restart(&pvmw); 1946 continue; 1947 } 1948 } 1949 1950 /* Unexpected PMD-mapped THP? */ 1951 VM_BUG_ON_FOLIO(!pvmw.pte, folio); 1952 1953 /* 1954 * Handle PFN swap PTEs, such as device-exclusive ones, that 1955 * actually map pages. 1956 */ 1957 pteval = ptep_get(pvmw.pte); 1958 if (likely(pte_present(pteval))) { 1959 pfn = pte_pfn(pteval); 1960 } else { 1961 pfn = swp_offset_pfn(pte_to_swp_entry(pteval)); 1962 VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); 1963 } 1964 1965 subpage = folio_page(folio, pfn - folio_pfn(folio)); 1966 address = pvmw.address; 1967 anon_exclusive = folio_test_anon(folio) && 1968 PageAnonExclusive(subpage); 1969 1970 if (folio_test_hugetlb(folio)) { 1971 bool anon = folio_test_anon(folio); 1972 1973 /* 1974 * The try_to_unmap() is only passed a hugetlb page 1975 * in the case where the hugetlb page is poisoned. 1976 */ 1977 VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage); 1978 /* 1979 * huge_pmd_unshare may unmap an entire PMD page. 1980 * There is no way of knowing exactly which PMDs may 1981 * be cached for this mm, so we must flush them all. 
1982 * start/end were already adjusted above to cover this 1983 * range. 1984 */ 1985 flush_cache_range(vma, range.start, range.end); 1986 1987 /* 1988 * To call huge_pmd_unshare, i_mmap_rwsem must be 1989 * held in write mode. Caller needs to explicitly 1990 * do this outside rmap routines. 1991 * 1992 * We also must hold hugetlb vma_lock in write mode. 1993 * Lock order dictates acquiring vma_lock BEFORE 1994 * i_mmap_rwsem. We can only try lock here and fail 1995 * if unsuccessful. 1996 */ 1997 if (!anon) { 1998 VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); 1999 if (!hugetlb_vma_trylock_write(vma)) 2000 goto walk_abort; 2001 if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { 2002 hugetlb_vma_unlock_write(vma); 2003 flush_tlb_range(vma, 2004 range.start, range.end); 2005 /* 2006 * The ref count of the PMD page was 2007 * dropped which is part of the way map 2008 * counting is done for shared PMDs. 2009 * Return 'true' here. When there is 2010 * no other sharing, huge_pmd_unshare 2011 * returns false and we will unmap the 2012 * actual page and drop map count 2013 * to zero. 2014 */ 2015 goto walk_done; 2016 } 2017 hugetlb_vma_unlock_write(vma); 2018 } 2019 pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); 2020 if (pte_dirty(pteval)) 2021 folio_mark_dirty(folio); 2022 } else if (likely(pte_present(pteval))) { 2023 nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval); 2024 end_addr = address + nr_pages * PAGE_SIZE; 2025 flush_cache_range(vma, address, end_addr); 2026 2027 /* Nuke the page table entry. */ 2028 pteval = get_and_clear_ptes(mm, address, pvmw.pte, nr_pages); 2029 /* 2030 * We clear the PTE but do not flush so potentially 2031 * a remote CPU could still be writing to the folio. 2032 * If the entry was previously clean then the 2033 * architecture must guarantee that a clear->dirty 2034 * transition on a cached TLB entry is written through 2035 * and traps if the PTE is unmapped. 
2036 */ 2037 if (should_defer_flush(mm, flags)) 2038 set_tlb_ubc_flush_pending(mm, pteval, address, end_addr); 2039 else 2040 flush_tlb_range(vma, address, end_addr); 2041 if (pte_dirty(pteval)) 2042 folio_mark_dirty(folio); 2043 } else { 2044 pte_clear(mm, address, pvmw.pte); 2045 } 2046 2047 /* 2048 * Now the pte is cleared. If this pte was uffd-wp armed, 2049 * we may want to replace a none pte with a marker pte if 2050 * it's file-backed, so we don't lose the tracking info. 2051 */ 2052 pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval); 2053 2054 /* Update high watermark before we lower rss */ 2055 update_hiwater_rss(mm); 2056 2057 if (PageHWPoison(subpage) && (flags & TTU_HWPOISON)) { 2058 pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); 2059 if (folio_test_hugetlb(folio)) { 2060 hugetlb_count_sub(folio_nr_pages(folio), mm); 2061 set_huge_pte_at(mm, address, pvmw.pte, pteval, 2062 hsz); 2063 } else { 2064 dec_mm_counter(mm, mm_counter(folio)); 2065 set_pte_at(mm, address, pvmw.pte, pteval); 2066 } 2067 } else if (likely(pte_present(pteval)) && pte_unused(pteval) && 2068 !userfaultfd_armed(vma)) { 2069 /* 2070 * The guest indicated that the page content is of no 2071 * interest anymore. Simply discard the pte, vmscan 2072 * will take care of the rest. 2073 * A future reference will then fault in a new zero 2074 * page. When userfaultfd is active, we must not drop 2075 * this page though, as its main user (postcopy 2076 * migration) will not expect userfaults on already 2077 * copied pages. 2078 */ 2079 dec_mm_counter(mm, mm_counter(folio)); 2080 } else if (folio_test_anon(folio)) { 2081 swp_entry_t entry = page_swap_entry(subpage); 2082 pte_t swp_pte; 2083 /* 2084 * Store the swap location in the pte. 2085 * See handle_pte_fault() ... 
2086 */ 2087 if (unlikely(folio_test_swapbacked(folio) != 2088 folio_test_swapcache(folio))) { 2089 WARN_ON_ONCE(1); 2090 goto walk_abort; 2091 } 2092 2093 /* MADV_FREE page check */ 2094 if (!folio_test_swapbacked(folio)) { 2095 int ref_count, map_count; 2096 2097 /* 2098 * Synchronize with gup_pte_range(): 2099 * - clear PTE; barrier; read refcount 2100 * - inc refcount; barrier; read PTE 2101 */ 2102 smp_mb(); 2103 2104 ref_count = folio_ref_count(folio); 2105 map_count = folio_mapcount(folio); 2106 2107 /* 2108 * Order reads for page refcount and dirty flag 2109 * (see comments in __remove_mapping()). 2110 */ 2111 smp_rmb(); 2112 2113 if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) { 2114 /* 2115 * redirtied either using the page table or a previously 2116 * obtained GUP reference. 2117 */ 2118 set_ptes(mm, address, pvmw.pte, pteval, nr_pages); 2119 folio_set_swapbacked(folio); 2120 goto walk_abort; 2121 } else if (ref_count != 1 + map_count) { 2122 /* 2123 * Additional reference. Could be a GUP reference or any 2124 * speculative reference. GUP users must mark the folio 2125 * dirty if there was a modification. This folio cannot be 2126 * reclaimed right now either way, so act just like nothing 2127 * happened. 2128 * We'll come back here later and detect if the folio was 2129 * dirtied when the additional reference is gone. 2130 */ 2131 set_ptes(mm, address, pvmw.pte, pteval, nr_pages); 2132 goto walk_abort; 2133 } 2134 add_mm_counter(mm, MM_ANONPAGES, -nr_pages); 2135 goto discard; 2136 } 2137 2138 if (swap_duplicate(entry) < 0) { 2139 set_pte_at(mm, address, pvmw.pte, pteval); 2140 goto walk_abort; 2141 } 2142 2143 /* 2144 * arch_unmap_one() is expected to be a NOP on 2145 * architectures where we could have PFN swap PTEs, 2146 * so we'll not check/care. 
2147 */ 2148 if (arch_unmap_one(mm, vma, address, pteval) < 0) { 2149 swap_free(entry); 2150 set_pte_at(mm, address, pvmw.pte, pteval); 2151 goto walk_abort; 2152 } 2153 2154 /* See folio_try_share_anon_rmap(): clear PTE first. */ 2155 if (anon_exclusive && 2156 folio_try_share_anon_rmap_pte(folio, subpage)) { 2157 swap_free(entry); 2158 set_pte_at(mm, address, pvmw.pte, pteval); 2159 goto walk_abort; 2160 } 2161 if (list_empty(&mm->mmlist)) { 2162 spin_lock(&mmlist_lock); 2163 if (list_empty(&mm->mmlist)) 2164 list_add(&mm->mmlist, &init_mm.mmlist); 2165 spin_unlock(&mmlist_lock); 2166 } 2167 dec_mm_counter(mm, MM_ANONPAGES); 2168 inc_mm_counter(mm, MM_SWAPENTS); 2169 swp_pte = swp_entry_to_pte(entry); 2170 if (anon_exclusive) 2171 swp_pte = pte_swp_mkexclusive(swp_pte); 2172 if (likely(pte_present(pteval))) { 2173 if (pte_soft_dirty(pteval)) 2174 swp_pte = pte_swp_mksoft_dirty(swp_pte); 2175 if (pte_uffd_wp(pteval)) 2176 swp_pte = pte_swp_mkuffd_wp(swp_pte); 2177 } else { 2178 if (pte_swp_soft_dirty(pteval)) 2179 swp_pte = pte_swp_mksoft_dirty(swp_pte); 2180 if (pte_swp_uffd_wp(pteval)) 2181 swp_pte = pte_swp_mkuffd_wp(swp_pte); 2182 } 2183 set_pte_at(mm, address, pvmw.pte, swp_pte); 2184 } else { 2185 /* 2186 * This is a locked file-backed folio, 2187 * so it cannot be removed from the page 2188 * cache and replaced by a new folio before 2189 * mmu_notifier_invalidate_range_end, so no 2190 * concurrent thread might update its page table 2191 * to point at a new folio while a device is 2192 * still using this folio. 
			 *
			 * See Documentation/mm/mmu_notifier.rst
			 */
			dec_mm_counter(mm, mm_counter_file(folio));
		}
discard:
		if (unlikely(folio_test_hugetlb(folio))) {
			hugetlb_remove_rmap(folio);
		} else {
			folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
		}
		if (vma->vm_flags & VM_LOCKED)
			mlock_drain_local();
		folio_put_refs(folio, nr_pages);

		/*
		 * If we are sure that we batched the entire folio and cleared
		 * all PTEs, we can just optimize and stop right here.
		 */
		if (nr_pages == folio_nr_pages(folio))
			goto walk_done;
		continue;
walk_abort:
		ret = false;
walk_done:
		page_vma_mapped_walk_done(&pvmw);
		break;
	}

	mmu_notifier_invalidate_range_end(&range);

	return ret;
}

/*
 * rmap_walk_control::invalid_vma callback installed by try_to_migrate():
 * reports temporary stack VMAs (see vma_is_temporary_stack()) as invalid so
 * that the rmap walk skips them. @arg is unused.
 */
static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
{
	return vma_is_temporary_stack(vma);
}

/*
 * rmap_walk_control::done callback: returns non-zero once folio_mapped()
 * reports that @folio has no mappings left, which lets the rmap walk stop
 * early.
 */
static int folio_not_mapped(struct folio *folio)
{
	return !folio_mapped(folio);
}

/**
 * try_to_unmap - Try to remove all page table mappings to a folio.
 * @folio: The folio to unmap.
 * @flags: action and flags
 *
 * Tries to remove all the page table entries which are mapping this
 * folio.  It is the caller's responsibility to check if the folio is
 * still mapped if needed (use TTU_SYNC to prevent accounting races).
 *
 * If @flags contains TTU_RMAP_LOCKED, the walk is done with
 * rmap_walk_locked(), i.e. the caller already holds the relevant rmap lock;
 * otherwise rmap_walk() is used.
 *
 * Context: Caller must hold the folio lock.
 */
void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
	struct rmap_walk_control rwc = {
		.rmap_one = try_to_unmap_one,
		/* @flags travel to the rmap_one callback via the opaque arg */
		.arg = (void *)flags,
		.done = folio_not_mapped,
		.anon_lock = folio_lock_anon_vma_read,
	};

	if (flags & TTU_RMAP_LOCKED)
		rmap_walk_locked(folio, &rwc);
	else
		rmap_walk(folio, &rwc);
}

/*
 * @arg: enum ttu_flags will be passed to this argument.
 *
 * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs
 * containing migration entries.
 *
 * rmap_one callback used by try_to_migrate(): for every place in @vma where
 * @folio is found mapped, the mapping is cleared and, unless the page is
 * hwpoisoned or the PTE is marked unused, replaced by a migration entry.
 * The rmap of that mapping is then removed and a folio reference dropped.
 *
 * Returns false when the walk had to be aborted (the rmap walk then stops),
 * true otherwise.
 */
static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
		     unsigned long address, void *arg)
{
	struct mm_struct *mm = vma->vm_mm;
	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
	bool anon_exclusive, writable, ret = true;
	pte_t pteval;
	struct page *subpage;
	struct mmu_notifier_range range;
	/* try_to_migrate() packs its ttu_flags into the opaque @arg pointer */
	enum ttu_flags flags = (enum ttu_flags)(long)arg;
	unsigned long pfn;
	/* Only set (and only used) for hugetlb folios */
	unsigned long hsz = 0;

	/*
	 * When racing against e.g. zap_pte_range() on another cpu,
	 * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(),
	 * try_to_migrate() may return before page_mapped() has become false,
	 * if page table locking is skipped: use TTU_SYNC to wait for that.
	 */
	if (flags & TTU_SYNC)
		pvmw.flags = PVMW_SYNC;

	/*
	 * For THP, we have to assume the worst case, i.e. pmd, for
	 * invalidation. For hugetlb, it could be much worse if we need to do
	 * pud invalidation in the case of pmd sharing.
	 *
	 * Note that the folio cannot be freed in this function, as the
	 * caller must hold a reference on it.
	 */
	range.end = vma_address_end(&pvmw);
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
				address, range.end);
	if (folio_test_hugetlb(folio)) {
		/*
		 * If sharing is possible, start and end will be adjusted
		 * accordingly.
		 */
		adjust_range_if_pmd_sharing_possible(vma, &range.start,
						     &range.end);

		/* We need the huge page size for set_huge_pte_at() */
		hsz = huge_page_size(hstate_vma(vma));
	}
	mmu_notifier_invalidate_range_start(&range);

	while (page_vma_mapped_walk(&pvmw)) {
		/* PMD-mapped THP migration entry */
		if (!pvmw.pte) {
			if (flags & TTU_SPLIT_HUGE_PMD) {
				/* Split the PMD, then abort this walk. */
				split_huge_pmd_locked(vma, pvmw.address,
						      pvmw.pmd, true);
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
			subpage = folio_page(folio,
				pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
			VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
					!folio_test_pmd_mappable(folio), folio);

			if (set_pmd_migration_entry(&pvmw, subpage)) {
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}
			continue;
#endif
			/*
			 * Without CONFIG_ARCH_ENABLE_THP_MIGRATION we fall
			 * through to the VM_BUG_ON_FOLIO() below.
			 */
		}

		/* Unexpected PMD-mapped THP? */
		VM_BUG_ON_FOLIO(!pvmw.pte, folio);

		/*
		 * Handle PFN swap PTEs, such as device-exclusive ones, that
		 * actually map pages.
		 */
		pteval = ptep_get(pvmw.pte);
		if (likely(pte_present(pteval))) {
			pfn = pte_pfn(pteval);
		} else {
			pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
			VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
		}

		subpage = folio_page(folio, pfn - folio_pfn(folio));
		address = pvmw.address;
		anon_exclusive = folio_test_anon(folio) &&
				 PageAnonExclusive(subpage);

		if (folio_test_hugetlb(folio)) {
			bool anon = folio_test_anon(folio);

			/*
			 * huge_pmd_unshare may unmap an entire PMD page.
			 * There is no way of knowing exactly which PMDs may
			 * be cached for this mm, so we must flush them all.
			 * start/end were already adjusted above to cover this
			 * range.
			 */
			flush_cache_range(vma, range.start, range.end);

			/*
			 * To call huge_pmd_unshare, i_mmap_rwsem must be
			 * held in write mode.  Caller needs to explicitly
			 * do this outside rmap routines.
			 *
			 * We also must hold hugetlb vma_lock in write mode.
			 * Lock order dictates acquiring vma_lock BEFORE
			 * i_mmap_rwsem.  We can only try lock here and
			 * fail if unsuccessful.
			 */
			if (!anon) {
				VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
				if (!hugetlb_vma_trylock_write(vma)) {
					page_vma_mapped_walk_done(&pvmw);
					ret = false;
					break;
				}
				if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
					hugetlb_vma_unlock_write(vma);
					flush_tlb_range(vma,
						range.start, range.end);

					/*
					 * The ref count of the PMD page was
					 * dropped which is part of the way map
					 * counting is done for shared PMDs.
					 * ret stays 'true' here.  When there is
					 * no other sharing, huge_pmd_unshare
					 * returns false and we will unmap the
					 * actual page and drop map count
					 * to zero.
					 */
					page_vma_mapped_walk_done(&pvmw);
					break;
				}
				hugetlb_vma_unlock_write(vma);
			}
			/* Nuke the hugetlb page table entry */
			pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
			if (pte_dirty(pteval))
				folio_mark_dirty(folio);
			writable = pte_write(pteval);
		} else if (likely(pte_present(pteval))) {
			flush_cache_page(vma, address, pfn);
			/* Nuke the page table entry. */
			if (should_defer_flush(mm, flags)) {
				/*
				 * We clear the PTE but do not flush so potentially
				 * a remote CPU could still be writing to the folio.
				 * If the entry was previously clean then the
				 * architecture must guarantee that a clear->dirty
				 * transition on a cached TLB entry is written through
				 * and traps if the PTE is unmapped.
				 */
				pteval = ptep_get_and_clear(mm, address, pvmw.pte);

				set_tlb_ubc_flush_pending(mm, pteval, address, address + PAGE_SIZE);
			} else {
				pteval = ptep_clear_flush(vma, address, pvmw.pte);
			}
			if (pte_dirty(pteval))
				folio_mark_dirty(folio);
			writable = pte_write(pteval);
		} else {
			/* Non-present PFN swap PTE (see the comment above) */
			pte_clear(mm, address, pvmw.pte);
			writable = is_writable_device_private_entry(pte_to_swp_entry(pteval));
		}

		VM_WARN_ON_FOLIO(writable && folio_test_anon(folio) &&
				!anon_exclusive, folio);

		/* Update high watermark before we lower rss */
		update_hiwater_rss(mm);

		if (PageHWPoison(subpage)) {
			VM_WARN_ON_FOLIO(folio_is_device_private(folio), folio);

			/* Install a hwpoison entry instead of a migration entry */
			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
			if (folio_test_hugetlb(folio)) {
				hugetlb_count_sub(folio_nr_pages(folio), mm);
				set_huge_pte_at(mm, address, pvmw.pte, pteval,
						hsz);
			} else {
				dec_mm_counter(mm, mm_counter(folio));
				set_pte_at(mm, address, pvmw.pte, pteval);
			}
		} else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
			   !userfaultfd_armed(vma)) {
			/*
			 * The guest indicated that the page content is of no
			 * interest anymore. Simply discard the pte, vmscan
			 * will take care of the rest.
			 * A future reference will then fault in a new zero
			 * page. When userfaultfd is active, we must not drop
			 * this page though, as its main user (postcopy
			 * migration) will not expect userfaults on already
			 * copied pages.
			 */
			dec_mm_counter(mm, mm_counter(folio));
		} else {
			swp_entry_t entry;
			pte_t swp_pte;

			/*
			 * arch_unmap_one() is expected to be a NOP on
			 * architectures where we could have PFN swap PTEs,
			 * so we'll not check/care.
			 */
			if (arch_unmap_one(mm, vma, address, pteval) < 0) {
				/* Put the original PTE back and give up. */
				if (folio_test_hugetlb(folio))
					set_huge_pte_at(mm, address, pvmw.pte,
							pteval, hsz);
				else
					set_pte_at(mm, address, pvmw.pte, pteval);
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}

			/* See folio_try_share_anon_rmap_pte(): clear PTE first. */
			if (folio_test_hugetlb(folio)) {
				if (anon_exclusive &&
				    hugetlb_try_share_anon_rmap(folio)) {
					set_huge_pte_at(mm, address, pvmw.pte,
							pteval, hsz);
					ret = false;
					page_vma_mapped_walk_done(&pvmw);
					break;
				}
			} else if (anon_exclusive &&
				   folio_try_share_anon_rmap_pte(folio, subpage)) {
				set_pte_at(mm, address, pvmw.pte, pteval);
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}

			/*
			 * Store the pfn of the page in a special migration
			 * pte. do_swap_page() will wait until the migration
			 * pte is removed and then restart fault handling.
			 */
			if (writable)
				entry = make_writable_migration_entry(
							page_to_pfn(subpage));
			else if (anon_exclusive)
				entry = make_readable_exclusive_migration_entry(
							page_to_pfn(subpage));
			else
				entry = make_readable_migration_entry(
							page_to_pfn(subpage));
			/* Carry the state bits of the old PTE over. */
			if (likely(pte_present(pteval))) {
				if (pte_young(pteval))
					entry = make_migration_entry_young(entry);
				if (pte_dirty(pteval))
					entry = make_migration_entry_dirty(entry);
				swp_pte = swp_entry_to_pte(entry);
				if (pte_soft_dirty(pteval))
					swp_pte = pte_swp_mksoft_dirty(swp_pte);
				if (pte_uffd_wp(pteval))
					swp_pte = pte_swp_mkuffd_wp(swp_pte);
			} else {
				swp_pte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(pteval))
					swp_pte = pte_swp_mksoft_dirty(swp_pte);
				if (pte_swp_uffd_wp(pteval))
					swp_pte = pte_swp_mkuffd_wp(swp_pte);
			}
			if (folio_test_hugetlb(folio))
				set_huge_pte_at(mm, address, pvmw.pte, swp_pte,
						hsz);
			else
				set_pte_at(mm, address, pvmw.pte, swp_pte);
			trace_set_migration_pte(address, pte_val(swp_pte),
						folio_order(folio));
			/*
			 * No need to invalidate here: it will synchronize
			 * against the special swap migration pte.
			 */
		}

		/*
		 * The mapping is gone: remove its rmap and drop a folio
		 * reference (presumably the one the mapping held).
		 */
		if (unlikely(folio_test_hugetlb(folio)))
			hugetlb_remove_rmap(folio);
		else
			folio_remove_rmap_pte(folio, subpage, vma);
		if (vma->vm_flags & VM_LOCKED)
			mlock_drain_local();
		folio_put(folio);
	}

	mmu_notifier_invalidate_range_end(&range);

	return ret;
}

/**
 * try_to_migrate - try to replace all page table mappings with swap entries
 * @folio: the folio to replace page table entries for
 * @flags: action and flags
 *
 * Tries to remove all the page table entries which are mapping this folio and
 * replace them with special swap entries. Caller must hold the folio lock.
2574 */ 2575 void try_to_migrate(struct folio *folio, enum ttu_flags flags) 2576 { 2577 struct rmap_walk_control rwc = { 2578 .rmap_one = try_to_migrate_one, 2579 .arg = (void *)flags, 2580 .done = folio_not_mapped, 2581 .anon_lock = folio_lock_anon_vma_read, 2582 }; 2583 2584 /* 2585 * Migration always ignores mlock and only supports TTU_RMAP_LOCKED and 2586 * TTU_SPLIT_HUGE_PMD, TTU_SYNC, and TTU_BATCH_FLUSH flags. 2587 */ 2588 if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | 2589 TTU_SYNC | TTU_BATCH_FLUSH))) 2590 return; 2591 2592 if (folio_is_zone_device(folio) && 2593 (!folio_is_device_private(folio) && !folio_is_device_coherent(folio))) 2594 return; 2595 2596 /* 2597 * During exec, a temporary VMA is setup and later moved. 2598 * The VMA is moved under the anon_vma lock but not the 2599 * page tables leading to a race where migration cannot 2600 * find the migration ptes. Rather than increasing the 2601 * locking requirements of exec(), migration skips 2602 * temporary VMAs until after exec() completes. 2603 */ 2604 if (!folio_test_ksm(folio) && folio_test_anon(folio)) 2605 rwc.invalid_vma = invalid_migration_vma; 2606 2607 if (flags & TTU_RMAP_LOCKED) 2608 rmap_walk_locked(folio, &rwc); 2609 else 2610 rmap_walk(folio, &rwc); 2611 } 2612 2613 #ifdef CONFIG_DEVICE_PRIVATE 2614 /** 2615 * make_device_exclusive() - Mark a page for exclusive use by a device 2616 * @mm: mm_struct of associated target process 2617 * @addr: the virtual address to mark for exclusive device access 2618 * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering 2619 * @foliop: folio pointer will be stored here on success. 2620 * 2621 * This function looks up the page mapped at the given address, grabs a 2622 * folio reference, locks the folio and replaces the PTE with special 2623 * device-exclusive PFN swap entry, preventing access through the process 2624 * page tables. The function will return with the folio locked and referenced. 
 *
 * On fault, the device-exclusive entries are replaced with the original PTE
 * under folio lock, after calling MMU notifiers.
 *
 * Only anonymous non-hugetlb folios are supported and the VMA must have
 * write permissions such that we can fault in the anonymous page writable
 * in order to mark it exclusive. The caller must hold the mmap_lock in read
 * mode.
 *
 * A driver using this to program access from a device must use a mmu notifier
 * critical section to hold a device specific lock during programming. Once
 * programming is complete it should drop the folio lock and reference after
 * which point CPU access to the page will revoke the exclusive access.
 *
 * Notes:
 *   #. This function always operates on individual PTEs mapping individual
 *      pages. PMD-sized THPs are first remapped to be mapped by PTEs before
 *      the conversion happens on a single PTE corresponding to @addr.
 *   #. While concurrent access through the process page tables is prevented,
 *      concurrent access through other page references (e.g., earlier GUP
 *      invocation) is not handled and not supported.
 *   #. device-exclusive entries are considered "clean" and "old" by core-mm.
 *      Device drivers must update the folio state when informed by MMU
 *      notifiers.
 *
 * Returns: pointer to mapped page on success, otherwise an ERR_PTR()-encoded
 * negative error (-EOPNOTSUPP for non-anonymous or hugetlb folios, or the
 * error reported by the GUP call or by folio_lock_killable()).
 */
struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
		void *owner, struct folio **foliop)
{
	struct mmu_notifier_range range;
	struct folio *folio, *fw_folio;
	struct vm_area_struct *vma;
	struct folio_walk fw;
	struct page *page;
	swp_entry_t entry;
	pte_t swp_pte;
	int ret;

	mmap_assert_locked(mm);
	addr = PAGE_ALIGN_DOWN(addr);

	/*
	 * Fault in the page writable and try to lock it; note that if the
	 * address would already be marked for exclusive use by a device,
	 * the GUP call would undo that first by triggering a fault.
	 *
	 * If any other device would already map this page exclusively, the
	 * fault will trigger a conversion to an ordinary
	 * (non-device-exclusive) PTE and issue a MMU_NOTIFY_EXCLUSIVE.
	 */
retry:
	page = get_user_page_vma_remote(mm, addr,
					FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
					&vma);
	if (IS_ERR(page))
		return page;
	folio = page_folio(page);

	if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) {
		/* Unsupported folio type: drop the FOLL_GET reference. */
		folio_put(folio);
		return ERR_PTR(-EOPNOTSUPP);
	}

	ret = folio_lock_killable(folio);
	if (ret) {
		folio_put(folio);
		return ERR_PTR(ret);
	}

	/*
	 * Inform secondary MMUs that we are going to convert this PTE to
	 * device-exclusive, such that they unmap it now. Note that the
	 * caller must filter this event out to prevent livelocks.
	 */
	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
				      mm, addr, addr + PAGE_SIZE, owner);
	mmu_notifier_invalidate_range_start(&range);

	/*
	 * Let's do a second walk and make sure we still find the same page
	 * mapped writable. Note that any page of an anonymous folio can
	 * only be mapped writable using exactly one PTE ("exclusive"), so
	 * there cannot be other mappings.
	 */
	fw_folio = folio_walk_start(&fw, vma, addr, 0);
	if (fw_folio != folio || fw.page != page ||
	    fw.level != FW_LEVEL_PTE || !pte_write(fw.pte)) {
		/* Something changed under us: undo everything and start over. */
		if (fw_folio)
			folio_walk_end(&fw, vma);
		mmu_notifier_invalidate_range_end(&range);
		folio_unlock(folio);
		folio_put(folio);
		goto retry;
	}

	/* Nuke the page table entry so we get the uptodate dirty bit. */
	flush_cache_page(vma, addr, page_to_pfn(page));
	fw.pte = ptep_clear_flush(vma, addr, fw.ptep);

	/* Set the dirty flag on the folio now the PTE is gone. */
	if (pte_dirty(fw.pte))
		folio_mark_dirty(folio);

	/*
	 * Store the pfn of the page in a special device-exclusive PFN swap PTE.
	 * do_swap_page() will trigger the conversion back while holding the
	 * folio lock.
	 */
	entry = make_device_exclusive_entry(page_to_pfn(page));
	swp_pte = swp_entry_to_pte(entry);
	if (pte_soft_dirty(fw.pte))
		swp_pte = pte_swp_mksoft_dirty(swp_pte);
	/* The pte is writable, uffd-wp does not apply. */
	set_pte_at(mm, addr, fw.ptep, swp_pte);

	folio_walk_end(&fw, vma);
	mmu_notifier_invalidate_range_end(&range);
	/* Success: the folio is handed back still locked and referenced. */
	*foliop = folio;
	return page;
}
EXPORT_SYMBOL_GPL(make_device_exclusive);
#endif

/*
 * Free @anon_vma. Unless @anon_vma is its own root, also drop one reference
 * on the root anon_vma and free the root when that was its last reference.
 * NOTE(review): presumably only called once @anon_vma's own refcount has
 * dropped to zero - confirm against the callers.
 */
void __put_anon_vma(struct anon_vma *anon_vma)
{
	/* Read the root first: @anon_vma is freed right below. */
	struct anon_vma *root = anon_vma->root;

	anon_vma_free(anon_vma);
	if (root != anon_vma && atomic_dec_and_test(&root->refcount))
		anon_vma_free(root);
}

/*
 * Look up and read-lock the anon_vma of @folio on behalf of an rmap walk.
 *
 * If @rwc provides an anon_lock hook, its result is returned directly.
 * Otherwise the lock is tried first; on contention the function either gives
 * up (rwc->try_lock set: rwc->contended is set and NULL is returned) or
 * blocks until the lock is taken.
 *
 * Returns the anon_vma, read-locked on the paths handled here, or NULL if
 * there is none or the walk should not proceed.
 */
static struct anon_vma *rmap_walk_anon_lock(const struct folio *folio,
					    struct rmap_walk_control *rwc)
{
	struct anon_vma *anon_vma;

	if (rwc->anon_lock)
		return rwc->anon_lock(folio, rwc);

	/*
	 * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read()
	 * because that depends on page_mapped(); but not all its usages
	 * are holding mmap_lock. Users without mmap_lock are required to
	 * take a reference count to prevent the anon_vma disappearing.
	 */
	anon_vma = folio_anon_vma(folio);
	if (!anon_vma)
		return NULL;

	if (anon_vma_trylock_read(anon_vma))
		goto out;

	if (rwc->try_lock) {
		/* Caller asked not to block: report the contention instead. */
		anon_vma = NULL;
		rwc->contended = true;
		goto out;
	}

	anon_vma_lock_read(anon_vma);
out:
	return anon_vma;
}

/*
 * rmap_walk_anon - do something to anonymous page using the object-based
 * rmap method
 * @folio: the folio to be handled
 * @rwc: control variable according to each walk type
 * @locked: caller holds relevant rmap lock
 *
 * Find all the mappings of a folio using the mapping pointer and the vma
 * chains contained in the anon_vma struct it points to.
 *
 * VMAs rejected by rwc->invalid_vma() are skipped. The walk ends early as
 * soon as rwc->rmap_one() returns false or rwc->done() returns true.
 */
static void rmap_walk_anon(struct folio *folio,
		struct rmap_walk_control *rwc, bool locked)
{
	struct anon_vma *anon_vma;
	pgoff_t pgoff_start, pgoff_end;
	struct anon_vma_chain *avc;

	/*
	 * The folio lock ensures that folio->mapping can't be changed under us
	 * to an anon_vma with different root.
	 */
	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);

	if (locked) {
		anon_vma = folio_anon_vma(folio);
		/* Did the anon_vma disappear under us? */
		VM_BUG_ON_FOLIO(!anon_vma, folio);
	} else {
		anon_vma = rmap_walk_anon_lock(folio, rwc);
	}
	if (!anon_vma)
		return;

	/* Inclusive page offset range covered by the folio */
	pgoff_start = folio_pgoff(folio);
	pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
			pgoff_start, pgoff_end) {
		struct vm_area_struct *vma = avc->vma;
		unsigned long address = vma_address(vma, pgoff_start,
				folio_nr_pages(folio));

		VM_BUG_ON_VMA(address == -EFAULT, vma);
		cond_resched();

		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
			continue;

		if (!rwc->rmap_one(folio, vma, address, rwc->arg))
			break;
		if (rwc->done && rwc->done(folio))
			break;
	}

	/* Only drop the lock if it was taken here. */
	if (!locked)
		anon_vma_unlock_read(anon_vma);
}

/**
 * __rmap_walk_file() - Traverse the reverse mapping for a file-backed mapping
 * of a page mapped within a specified page cache object at a specified offset.
 *
 * @folio:       Either the folio whose mappings to traverse, or if NULL,
 *               the callbacks specified in @rwc will be configured such
 *               as to be able to look up mappings correctly.
 * @mapping:     The page cache object whose mapping VMAs we intend to
 *               traverse. If @folio is non-NULL, this should be equal to
 *               folio_mapping(folio).
 * @pgoff_start: The offset within @mapping of the page which we are
 *               looking up. If @folio is non-NULL, this should be equal
 *               to folio_pgoff(folio).
 * @nr_pages:    The number of pages mapped by the mapping. If @folio is
 *               non-NULL, this should be equal to folio_nr_pages(folio).
 * @rwc:         The reverse mapping walk control object describing how
 *               the traversal should proceed.
 * @locked:      Is the @mapping already locked? If not, we acquire the
 *               lock.
2866 */ 2867 static void __rmap_walk_file(struct folio *folio, struct address_space *mapping, 2868 pgoff_t pgoff_start, unsigned long nr_pages, 2869 struct rmap_walk_control *rwc, bool locked) 2870 { 2871 pgoff_t pgoff_end = pgoff_start + nr_pages - 1; 2872 struct vm_area_struct *vma; 2873 2874 VM_WARN_ON_FOLIO(folio && mapping != folio_mapping(folio), folio); 2875 VM_WARN_ON_FOLIO(folio && pgoff_start != folio_pgoff(folio), folio); 2876 VM_WARN_ON_FOLIO(folio && nr_pages != folio_nr_pages(folio), folio); 2877 2878 if (!locked) { 2879 if (i_mmap_trylock_read(mapping)) 2880 goto lookup; 2881 2882 if (rwc->try_lock) { 2883 rwc->contended = true; 2884 return; 2885 } 2886 2887 i_mmap_lock_read(mapping); 2888 } 2889 lookup: 2890 vma_interval_tree_foreach(vma, &mapping->i_mmap, 2891 pgoff_start, pgoff_end) { 2892 unsigned long address = vma_address(vma, pgoff_start, nr_pages); 2893 2894 VM_BUG_ON_VMA(address == -EFAULT, vma); 2895 cond_resched(); 2896 2897 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) 2898 continue; 2899 2900 if (!rwc->rmap_one(folio, vma, address, rwc->arg)) 2901 goto done; 2902 if (rwc->done && rwc->done(folio)) 2903 goto done; 2904 } 2905 done: 2906 if (!locked) 2907 i_mmap_unlock_read(mapping); 2908 } 2909 2910 /* 2911 * rmap_walk_file - do something to file page using the object-based rmap method 2912 * @folio: the folio to be handled 2913 * @rwc: control variable according to each walk type 2914 * @locked: caller holds relevant rmap lock 2915 * 2916 * Find all the mappings of a folio using the mapping pointer and the vma chains 2917 * contained in the address_space struct it points to. 
2918 */ 2919 static void rmap_walk_file(struct folio *folio, 2920 struct rmap_walk_control *rwc, bool locked) 2921 { 2922 /* 2923 * The folio lock not only makes sure that folio->mapping cannot 2924 * suddenly be NULLified by truncation, it makes sure that the structure 2925 * at mapping cannot be freed and reused yet, so we can safely take 2926 * mapping->i_mmap_rwsem. 2927 */ 2928 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 2929 2930 if (!folio->mapping) 2931 return; 2932 2933 __rmap_walk_file(folio, folio->mapping, folio->index, 2934 folio_nr_pages(folio), rwc, locked); 2935 } 2936 2937 void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc) 2938 { 2939 if (unlikely(folio_test_ksm(folio))) 2940 rmap_walk_ksm(folio, rwc); 2941 else if (folio_test_anon(folio)) 2942 rmap_walk_anon(folio, rwc, false); 2943 else 2944 rmap_walk_file(folio, rwc, false); 2945 } 2946 2947 /* Like rmap_walk, but caller holds relevant rmap lock */ 2948 void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc) 2949 { 2950 /* no ksm support for now */ 2951 VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio); 2952 if (folio_test_anon(folio)) 2953 rmap_walk_anon(folio, rwc, true); 2954 else 2955 rmap_walk_file(folio, rwc, true); 2956 } 2957 2958 #ifdef CONFIG_HUGETLB_PAGE 2959 /* 2960 * The following two functions are for anonymous (private mapped) hugepages. 2961 * Unlike common anonymous pages, anonymous hugepages have no accounting code 2962 * and no lru code, because we handle hugepages differently from common pages. 
2963 */ 2964 void hugetlb_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma, 2965 unsigned long address, rmap_t flags) 2966 { 2967 VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); 2968 VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); 2969 2970 atomic_inc(&folio->_entire_mapcount); 2971 atomic_inc(&folio->_large_mapcount); 2972 if (flags & RMAP_EXCLUSIVE) 2973 SetPageAnonExclusive(&folio->page); 2974 VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 && 2975 PageAnonExclusive(&folio->page), folio); 2976 } 2977 2978 void hugetlb_add_new_anon_rmap(struct folio *folio, 2979 struct vm_area_struct *vma, unsigned long address) 2980 { 2981 VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); 2982 2983 BUG_ON(address < vma->vm_start || address >= vma->vm_end); 2984 /* increment count (starts at -1) */ 2985 atomic_set(&folio->_entire_mapcount, 0); 2986 atomic_set(&folio->_large_mapcount, 0); 2987 folio_clear_hugetlb_restore_reserve(folio); 2988 __folio_set_anon(folio, vma, address, true); 2989 SetPageAnonExclusive(&folio->page); 2990 } 2991 #endif /* CONFIG_HUGETLB_PAGE */ 2992