/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead reverse mapping scheme.
 * Please try to keep this thing as modular as possible.
 *
 * Provides methods for unmapping each kind of mapped page:
 * the anon methods track anonymous pages, and
 * the file methods track pages belonging to an inode.
 *
 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
 * Contributions by Hugh Dickins 2003, 2004
 */

/*
 * Lock ordering in mm:
 *
 * inode->i_mutex	(while writing or truncating, not reading or faulting)
 *   mm->mmap_sem
 *     page->flags PG_locked (lock_page)
 *       hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
 *         mapping->i_mmap_rwsem
 *           anon_vma->rwsem
 *             mm->page_table_lock or pte_lock
 *               zone_lru_lock (in mark_page_accessed, isolate_lru_page)
 *               swap_lock (in swap_duplicate, swap_info_get)
 *                 mmlist_lock (in mmput, drain_mmlist and others)
 *                 mapping->private_lock (in __set_page_dirty_buffers)
 *                   mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
 *                     i_pages lock (widely used)
 *                 inode->i_lock (in set_page_dirty's __mark_inode_dirty)
 *                 bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
 *                   sb_lock (within inode_lock in fs/fs-writeback.c)
 *                   i_pages lock (widely used, in set_page_dirty,
 *                             in arch-dependent flush_dcache_mmap_lock,
 *                             within bdi.wb->list_lock in __sync_single_inode)
 *
 * anon_vma->rwsem,mapping->i_mutex	(memory_failure, collect_procs_anon)
 *   ->tasklist_lock
 *     pte map lock
 */

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/backing-dev.h>
#include <linux/page_idle.h>
#include <linux/memremap.h>

#include <asm/tlbflush.h>

#include <trace/events/tlb.h>

#include "internal.h"

static struct kmem_cache *anon_vma_cachep;
static struct kmem_cache *anon_vma_chain_cachep;

static inline struct anon_vma *anon_vma_alloc(void)
{
	struct anon_vma *anon_vma;

	anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
	if (anon_vma) {
		atomic_set(&anon_vma->refcount, 1);
		anon_vma->degree = 1;	/* Reference for first vma */
		anon_vma->parent = anon_vma;
		/*
		 * Initialise the anon_vma root to point to itself. If called
		 * from fork, the root will be reset to the parent's anon_vma.
		 */
		anon_vma->root = anon_vma;
	}

	return anon_vma;
}
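
/*
 * Note on the two counters initialised above (a summary of how they are
 * used in this file; the authoritative description lives with struct
 * anon_vma in <linux/rmap.h>):
 *
 *  - refcount pins the anon_vma structure itself; it is dropped with
 *    put_anon_vma() and the anon_vma is only freed once it reaches zero.
 *  - degree counts the child anon_vmas and the VMAs which point to this
 *    anon_vma; anon_vma_clone() uses it to decide whether an anon_vma
 *    with no VMAs and a single child can be reused for a new VMA.
 */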

static inline void anon_vma_free(struct anon_vma *anon_vma)
{
	VM_BUG_ON(atomic_read(&anon_vma->refcount));

	/*
	 * Synchronize against page_lock_anon_vma_read() such that
	 * we can safely hold the lock without the anon_vma getting
	 * freed.
	 *
	 * Relies on the full mb implied by the atomic_dec_and_test() from
	 * put_anon_vma() against the acquire barrier implied by
	 * down_read_trylock() from page_lock_anon_vma_read(). This orders:
	 *
	 * page_lock_anon_vma_read()	VS	put_anon_vma()
	 *   down_read_trylock()		  atomic_dec_and_test()
	 *   LOCK				  MB
	 *   atomic_read()			  rwsem_is_locked()
	 *
	 * LOCK should suffice since the actual taking of the lock must
	 * happen _before_ what follows.
	 */
	might_sleep();
	if (rwsem_is_locked(&anon_vma->root->rwsem)) {
		anon_vma_lock_write(anon_vma);
		anon_vma_unlock_write(anon_vma);
	}

	kmem_cache_free(anon_vma_cachep, anon_vma);
}

static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
{
	return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
}

static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
{
	kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
}

static void anon_vma_chain_link(struct vm_area_struct *vma,
				struct anon_vma_chain *avc,
				struct anon_vma *anon_vma)
{
	avc->vma = vma;
	avc->anon_vma = anon_vma;
	list_add(&avc->same_vma, &vma->anon_vma_chain);
	anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
}
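
/*
 * Each anon_vma_chain (avc) ties exactly one vma to one anon_vma: it sits
 * on the vma's ->anon_vma_chain list (so a vma can reach every anon_vma it
 * is connected to) and in the anon_vma's ->rb_root interval tree (so rmap
 * can go from an anon_vma to every vma that may map its pages).
 */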

/**
 * __anon_vma_prepare - attach an anon_vma to a memory region
 * @vma: the memory region in question
 *
 * This makes sure the memory mapping described by 'vma' has
 * an 'anon_vma' attached to it, so that we can associate the
 * anonymous pages mapped into it with that anon_vma.
 *
 * The common case will be that we already have one, which
 * is handled inline by anon_vma_prepare(). But if
 * not we either need to find an adjacent mapping that we
 * can re-use the anon_vma from (very common when the only
 * reason for splitting a vma has been mprotect()), or we
 * allocate a new one.
 *
 * Anon-vma allocations are very subtle, because we may have
 * optimistically looked up an anon_vma in page_lock_anon_vma_read()
 * and that may actually touch the spinlock even in the newly
 * allocated vma (it depends on RCU to make sure that the
 * anon_vma isn't actually destroyed).
 *
 * As a result, we need to do proper anon_vma locking even
 * for the new allocation. At the same time, we do not want
 * to do any locking for the common case of already having
 * an anon_vma.
 *
 * This must be called with the mmap_sem held for reading.
 */
int __anon_vma_prepare(struct vm_area_struct *vma)
{
	struct mm_struct *mm = vma->vm_mm;
	struct anon_vma *anon_vma, *allocated;
	struct anon_vma_chain *avc;

	might_sleep();

	avc = anon_vma_chain_alloc(GFP_KERNEL);
	if (!avc)
		goto out_enomem;

	anon_vma = find_mergeable_anon_vma(vma);
	allocated = NULL;
	if (!anon_vma) {
		anon_vma = anon_vma_alloc();
		if (unlikely(!anon_vma))
			goto out_enomem_free_avc;
		allocated = anon_vma;
	}

	anon_vma_lock_write(anon_vma);
	/* page_table_lock to protect against threads */
	spin_lock(&mm->page_table_lock);
	if (likely(!vma->anon_vma)) {
		vma->anon_vma = anon_vma;
		anon_vma_chain_link(vma, avc, anon_vma);
		/* vma reference or self-parent link for new root */
		anon_vma->degree++;
		allocated = NULL;
		avc = NULL;
	}
	spin_unlock(&mm->page_table_lock);
	anon_vma_unlock_write(anon_vma);

	if (unlikely(allocated))
		put_anon_vma(allocated);
	if (unlikely(avc))
		anon_vma_chain_free(avc);

	return 0;

 out_enomem_free_avc:
	anon_vma_chain_free(avc);
 out_enomem:
	return -ENOMEM;
}
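
/*
 * For reference: the fast path mentioned above is the inline wrapper in
 * <linux/rmap.h>, which (roughly, per that header at the time of writing)
 * only drops into __anon_vma_prepare() when the vma has no anon_vma yet:
 *
 *	static inline int anon_vma_prepare(struct vm_area_struct *vma)
 *	{
 *		if (likely(vma->anon_vma))
 *			return 0;
 *
 *		return __anon_vma_prepare(vma);
 *	}
 */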

/*
 * This is a useful helper function for locking the anon_vma root as
 * we traverse the vma->anon_vma_chain, looping over anon_vma's that
 * have the same vma.
 *
 * Such anon_vma's should have the same root, so you'd expect to see
 * just a single lock acquisition for the whole traversal.
 */
static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
{
	struct anon_vma *new_root = anon_vma->root;
	if (new_root != root) {
		if (WARN_ON_ONCE(root))
			up_write(&root->rwsem);
		root = new_root;
		down_write(&root->rwsem);
	}
	return root;
}

static inline void unlock_anon_vma_root(struct anon_vma *root)
{
	if (root)
		up_write(&root->rwsem);
}

/*
 * Attach the anon_vmas from src to dst.
 * Returns 0 on success, -ENOMEM on failure.
 *
 * If dst->anon_vma is NULL this function tries to find and reuse an existing
 * anon_vma which has no vmas and only one child anon_vma. This prevents
 * degradation of the anon_vma hierarchy to an endless linear chain in case
 * of a constantly forking task. On the other hand, an anon_vma with more
 * than one child isn't reused even if there are no live vmas, so the rmap
 * walker has a good chance of avoiding scanning the whole hierarchy when it
 * searches where the page is mapped.
 */
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
	struct anon_vma_chain *avc, *pavc;
	struct anon_vma *root = NULL;

	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma;

		avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
		if (unlikely(!avc)) {
			unlock_anon_vma_root(root);
			root = NULL;
			avc = anon_vma_chain_alloc(GFP_KERNEL);
			if (!avc)
				goto enomem_failure;
		}
		anon_vma = pavc->anon_vma;
		root = lock_anon_vma_root(root, anon_vma);
		anon_vma_chain_link(dst, avc, anon_vma);

		/*
		 * Reuse an existing anon_vma if its degree is lower than two,
		 * which means it has no vma and only one anon_vma child.
		 *
		 * Do not choose the parent anon_vma, otherwise the first
		 * child will always reuse it. The root anon_vma is never
		 * reused: it has a self-parent reference and at least one
		 * child.
		 */
		if (!dst->anon_vma && anon_vma != src->anon_vma &&
				anon_vma->degree < 2)
			dst->anon_vma = anon_vma;
	}
	if (dst->anon_vma)
		dst->anon_vma->degree++;
	unlock_anon_vma_root(root);
	return 0;

 enomem_failure:
	/*
	 * dst->anon_vma is dropped here otherwise its degree can be incorrectly
	 * decremented in unlink_anon_vmas().
	 * We can safely do this because callers of anon_vma_clone() don't care
	 * about dst->anon_vma if anon_vma_clone() failed.
	 */
	dst->anon_vma = NULL;
	unlink_anon_vmas(dst);
	return -ENOMEM;
}

/*
 * Attach vma to its own anon_vma, as well as to the anon_vmas that
 * the corresponding VMA in the parent process is attached to.
 * Returns 0 on success, non-zero on failure.
 */
int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
{
	struct anon_vma_chain *avc;
	struct anon_vma *anon_vma;
	int error;

	/* Don't bother if the parent process has no anon_vma here. */
	if (!pvma->anon_vma)
		return 0;

	/* Drop inherited anon_vma, we'll reuse existing or allocate new. */
	vma->anon_vma = NULL;

	/*
	 * First, attach the new VMA to the parent VMA's anon_vmas,
	 * so rmap can find non-COWed pages in child processes.
	 */
	error = anon_vma_clone(vma, pvma);
	if (error)
		return error;

	/* An existing anon_vma has been reused, all done then. */
	if (vma->anon_vma)
		return 0;

	/* Then add our own anon_vma. */
	anon_vma = anon_vma_alloc();
	if (!anon_vma)
		goto out_error;
	avc = anon_vma_chain_alloc(GFP_KERNEL);
	if (!avc)
		goto out_error_free_anon_vma;

	/*
	 * The root anon_vma's spinlock is the lock actually used when we
	 * lock any of the anon_vmas in this anon_vma tree.
	 */
	anon_vma->root = pvma->anon_vma->root;
	anon_vma->parent = pvma->anon_vma;
	/*
	 * With refcounts, an anon_vma can stay around longer than the
	 * process it belongs to. The root anon_vma needs to be pinned until
	 * this anon_vma is freed, because the lock lives in the root.
	 */
	get_anon_vma(anon_vma->root);
	/* Mark this anon_vma as the one where our new (COWed) pages go. */
	vma->anon_vma = anon_vma;
	anon_vma_lock_write(anon_vma);
	anon_vma_chain_link(vma, avc, anon_vma);
	anon_vma->parent->degree++;
	anon_vma_unlock_write(anon_vma);

	return 0;

 out_error_free_anon_vma:
	put_anon_vma(anon_vma);
 out_error:
	unlink_anon_vmas(vma);
	return -ENOMEM;
}
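
/*
 * Illustration (informal, derived from the code above): after fork, the
 * child's copy V' of a parent vma V ends up chained both to the parent's
 * anon_vma A (so rmap can still find not-yet-COWed pages) and to its own
 * new anon_vma A', where freshly COWed pages will go:
 *
 *	V  -- avc --> A		(A is the root of the tree)
 *	V' -- avc --> A
 *	V' -- avc --> A'	(A'->root == A, A'->parent == A)
 */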

void unlink_anon_vmas(struct vm_area_struct *vma)
{
	struct anon_vma_chain *avc, *next;
	struct anon_vma *root = NULL;

	/*
	 * Unlink each anon_vma chained to the VMA.  This list is ordered
	 * from newest to oldest, ensuring the root anon_vma gets freed last.
	 */
	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma = avc->anon_vma;

		root = lock_anon_vma_root(root, anon_vma);
		anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);

		/*
		 * Leave empty anon_vmas on the list - we'll need
		 * to free them outside the lock.
		 */
		if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
			anon_vma->parent->degree--;
			continue;
		}

		list_del(&avc->same_vma);
		anon_vma_chain_free(avc);
	}
	if (vma->anon_vma)
		vma->anon_vma->degree--;
	unlock_anon_vma_root(root);

	/*
	 * Iterate the list once more, it now only contains empty and unlinked
	 * anon_vmas, destroy them. We could not do that earlier because
	 * __put_anon_vma() may need to write-acquire anon_vma->root->rwsem.
	 */
	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma = avc->anon_vma;

		VM_WARN_ON(anon_vma->degree);
		put_anon_vma(anon_vma);

		list_del(&avc->same_vma);
		anon_vma_chain_free(avc);
	}
}

static void anon_vma_ctor(void *data)
{
	struct anon_vma *anon_vma = data;

	init_rwsem(&anon_vma->rwsem);
	atomic_set(&anon_vma->refcount, 0);
	anon_vma->rb_root = RB_ROOT_CACHED;
}

void __init anon_vma_init(void)
{
	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
			0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
			anon_vma_ctor);
	anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
			SLAB_PANIC|SLAB_ACCOUNT);
}

/*
 * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
 *
 * Since there is no serialization whatsoever against page_remove_rmap()
 * the best this function can do is return a locked anon_vma that might
 * have been relevant to this page.
 *
 * The page might have been remapped to a different anon_vma or the anon_vma
 * returned may already be freed (and even reused).
 *
 * In case it was remapped to a different anon_vma, the new anon_vma will be a
 * child of the old anon_vma, and the anon_vma lifetime rules will therefore
 * ensure that any anon_vma obtained from the page will still be valid for as
 * long as we observe page_mapped() [ hence all those page_mapped() tests ].
 *
 * All users of this function must be very careful when walking the anon_vma
 * chain and verify that the page in question is indeed mapped in it
 * [ something equivalent to page_mapped_in_vma() ].
 *
 * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
 * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
 * if there is a mapcount, we can dereference the anon_vma after observing
 * those.
 */
struct anon_vma *page_get_anon_vma(struct page *page)
{
	struct anon_vma *anon_vma = NULL;
	unsigned long anon_mapping;

	rcu_read_lock();
	anon_mapping = (unsigned long)READ_ONCE(page->mapping);
	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		goto out;
	if (!page_mapped(page))
		goto out;

	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
		anon_vma = NULL;
		goto out;
	}

	/*
	 * If this page is still mapped, then its anon_vma cannot have been
	 * freed.  But if it has been unmapped, we have no security against the
	 * anon_vma structure being freed and reused (for another anon_vma:
	 * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
	 * above cannot corrupt).
	 */
	if (!page_mapped(page)) {
		rcu_read_unlock();
		put_anon_vma(anon_vma);
		return NULL;
	}
out:
	rcu_read_unlock();

	return anon_vma;
}

/*
 * Similar to page_get_anon_vma() except it locks the anon_vma.
 *
 * It's a little more complex as it tries to keep the fast path to a single
 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
 * reference like with page_get_anon_vma() and then block on the mutex.
 */
struct anon_vma *page_lock_anon_vma_read(struct page *page)
{
	struct anon_vma *anon_vma = NULL;
	struct anon_vma *root_anon_vma;
	unsigned long anon_mapping;

	rcu_read_lock();
	anon_mapping = (unsigned long)READ_ONCE(page->mapping);
	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		goto out;
	if (!page_mapped(page))
		goto out;

	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
	root_anon_vma = READ_ONCE(anon_vma->root);
	if (down_read_trylock(&root_anon_vma->rwsem)) {
		/*
		 * If the page is still mapped, then this anon_vma is still
		 * its anon_vma, and holding the mutex ensures that it will
		 * not go away, see anon_vma_free().
		 */
		if (!page_mapped(page)) {
			up_read(&root_anon_vma->rwsem);
			anon_vma = NULL;
		}
		goto out;
	}

	/* trylock failed, we got to sleep */
	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
		anon_vma = NULL;
		goto out;
	}

	if (!page_mapped(page)) {
		rcu_read_unlock();
		put_anon_vma(anon_vma);
		return NULL;
	}

	/* we pinned the anon_vma, it's safe to sleep */
	rcu_read_unlock();
	anon_vma_lock_read(anon_vma);

	if (atomic_dec_and_test(&anon_vma->refcount)) {
		/*
		 * Oops, we held the last refcount, release the lock
		 * and bail -- can't simply use put_anon_vma() because
		 * we'll deadlock on the anon_vma_lock_write() recursion.
		 */
		anon_vma_unlock_read(anon_vma);
		__put_anon_vma(anon_vma);
		anon_vma = NULL;
	}

	return anon_vma;

out:
	rcu_read_unlock();
	return anon_vma;
}

void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
{
	anon_vma_unlock_read(anon_vma);
}

#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/*
 * Flush TLB entries for recently unmapped pages from remote CPUs. It is
 * important if a PTE was dirty when it was unmapped that it's flushed
 * before any IO is initiated on the page to prevent lost writes. Similarly,
 * it must be flushed before freeing to prevent data leakage.
 */
void try_to_unmap_flush(void)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

	if (!tlb_ubc->flush_required)
		return;

	arch_tlbbatch_flush(&tlb_ubc->arch);
	tlb_ubc->flush_required = false;
	tlb_ubc->writable = false;
}

/* Flush iff there are potentially writable TLB entries that can race with IO */
void try_to_unmap_flush_dirty(void)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

	if (tlb_ubc->writable)
		try_to_unmap_flush();
}
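
/*
 * Batched-flush protocol as used by reclaim (a summary of the code in this
 * block): try_to_unmap_one() calls set_tlb_ubc_flush_pending() for every
 * PTE it clears without an immediate flush when should_defer_flush()
 * allowed it; the accumulated batch is then drained with
 * try_to_unmap_flush_dirty() before I/O is started on a possibly-dirty
 * page, or with try_to_unmap_flush() before the page is freed.
 */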

static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

	arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
	tlb_ubc->flush_required = true;

	/*
	 * Ensure compiler does not re-order the setting of tlb_flush_batched
	 * before the PTE is cleared.
	 */
	barrier();
	mm->tlb_flush_batched = true;

	/*
	 * If the PTE was dirty then it's best to assume it's writable. The
	 * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
	 * before the page is queued for IO.
	 */
	if (writable)
		tlb_ubc->writable = true;
}

/*
 * Returns true if the TLB flush should be deferred to the end of a batch of
 * unmap operations to reduce IPIs.
 */
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
	bool should_defer = false;

	if (!(flags & TTU_BATCH_FLUSH))
		return false;

	/* If remote CPUs need to be flushed then defer the flush */
	if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
		should_defer = true;
	put_cpu();

	return should_defer;
}

/*
 * Reclaim unmaps pages under the PTL but does not flush the TLB prior to
 * releasing the PTL if TLB flushes are batched. It's possible for a parallel
 * operation such as mprotect or munmap to race between reclaim unmapping
 * the page and flushing the page. If this race occurs, it potentially allows
 * access to data via a stale TLB entry. Tracking all mm's that have TLB
 * batching in flight would be expensive during reclaim so instead track
 * whether TLB batching occurred in the past and if so then do a flush here
 * if required. This will cost one additional flush per reclaim cycle paid
 * by the first operation at risk such as mprotect and munmap.
 *
 * This must be called under the PTL so that an access to tlb_flush_batched
 * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
 * via the PTL.
 */
void flush_tlb_batched_pending(struct mm_struct *mm)
{
	if (mm->tlb_flush_batched) {
		flush_tlb_mm(mm);

		/*
		 * Do not allow the compiler to re-order the clearing of
		 * tlb_flush_batched before the tlb is flushed.
		 */
		barrier();
		mm->tlb_flush_batched = false;
	}
}
#else
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
}

static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
	return false;
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */

/*
 * At what user virtual address is page expected in vma?
 * Caller should check the page is actually part of the vma.
 */
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
	unsigned long address;
	if (PageAnon(page)) {
		struct anon_vma *page__anon_vma = page_anon_vma(page);
		/*
		 * Note: swapoff's unuse_vma() is more efficient with this
		 * check, and needs it to match anon_vma when KSM is active.
		 */
		if (!vma->anon_vma || !page__anon_vma ||
		    vma->anon_vma->root != page__anon_vma->root)
			return -EFAULT;
	} else if (page->mapping) {
		if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
			return -EFAULT;
	} else
		return -EFAULT;
	address = __vma_address(page, vma);
	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
		return -EFAULT;
	return address;
}
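
/*
 * For reference: the translation used above is the helper from mm/internal.h,
 * which (roughly, as defined there at the time of writing) is just the linear
 * mapping arithmetic:
 *
 *	static inline unsigned long
 *	__vma_address(struct page *page, struct vm_area_struct *vma)
 *	{
 *		pgoff_t pgoff = page_to_pgoff(page);
 *
 *		return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 *	}
 */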

pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd = NULL;
	pmd_t pmde;

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		goto out;

	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		goto out;

	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, address);
	/*
	 * Some THP functions use the sequence pmdp_huge_clear_flush(),
	 * set_pmd_at() without holding anon_vma lock for write.  So when
	 * looking for a genuine pmde (in which to find pte), test present
	 * and !THP together.
	 */
	pmde = *pmd;
	barrier();
	if (!pmd_present(pmde) || pmd_trans_huge(pmde))
		pmd = NULL;
out:
	return pmd;
}

struct page_referenced_arg {
	int mapcount;
	int referenced;
	unsigned long vm_flags;
	struct mem_cgroup *memcg;
};
/*
 * @arg: a page_referenced_arg will be passed
 */
static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
			unsigned long address, void *arg)
{
	struct page_referenced_arg *pra = arg;
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
		.address = address,
	};
	int referenced = 0;

	while (page_vma_mapped_walk(&pvmw)) {
		address = pvmw.address;

		if (vma->vm_flags & VM_LOCKED) {
			page_vma_mapped_walk_done(&pvmw);
			pra->vm_flags |= VM_LOCKED;
			return false; /* To break the loop */
		}

		if (pvmw.pte) {
			if (ptep_clear_flush_young_notify(vma, address,
						pvmw.pte)) {
				/*
				 * Don't treat a reference through
				 * a sequentially read mapping as such.
				 * If the page has been used in another mapping,
				 * we will catch it; if this other mapping is
				 * already gone, the unmap path will have set
				 * PG_referenced or activated the page.
				 */
				if (likely(!(vma->vm_flags & VM_SEQ_READ)))
					referenced++;
			}
		} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
			if (pmdp_clear_flush_young_notify(vma, address,
						pvmw.pmd))
				referenced++;
		} else {
			/* unexpected pmd-mapped page? */
			WARN_ON_ONCE(1);
		}

		pra->mapcount--;
	}

	if (referenced)
		clear_page_idle(page);
	if (test_and_clear_page_young(page))
		referenced++;

	if (referenced) {
		pra->referenced++;
		pra->vm_flags |= vma->vm_flags;
	}

	if (!pra->mapcount)
		return false; /* To break the loop */

	return true;
}

static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
{
	struct page_referenced_arg *pra = arg;
	struct mem_cgroup *memcg = pra->memcg;

	if (!mm_match_cgroup(vma->vm_mm, memcg))
		return true;

	return false;
}

/**
 * page_referenced - test if the page was referenced
 * @page: the page to test
 * @is_locked: caller holds lock on the page
 * @memcg: target memory cgroup
 * @vm_flags: collect the vma->vm_flags of the mappings which referenced the page
 *
 * Quick test_and_clear_referenced for all mappings to a page,
 * returns the number of ptes which referenced the page.
 */
int page_referenced(struct page *page,
		    int is_locked,
		    struct mem_cgroup *memcg,
		    unsigned long *vm_flags)
{
	int we_locked = 0;
	struct page_referenced_arg pra = {
		.mapcount = total_mapcount(page),
		.memcg = memcg,
	};
	struct rmap_walk_control rwc = {
		.rmap_one = page_referenced_one,
		.arg = (void *)&pra,
		.anon_lock = page_lock_anon_vma_read,
	};

	*vm_flags = 0;
	if (!page_mapped(page))
		return 0;

	if (!page_rmapping(page))
		return 0;

	if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
		we_locked = trylock_page(page);
		if (!we_locked)
			return 1;
	}

	/*
	 * If we are reclaiming on behalf of a cgroup, skip
	 * counting on behalf of references from different
	 * cgroups.
	 */
	if (memcg) {
		rwc.invalid_vma = invalid_page_referenced_vma;
	}

	rmap_walk(page, &rwc);
	*vm_flags = pra.vm_flags;

	if (we_locked)
		unlock_page(page);

	return pra.referenced;
}
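
/*
 * Write-protect and clean every pte (or pmd, for a pmd-mapped THP) that maps
 * @page in @vma, so that the next write faults and re-dirties the page; the
 * number of entries cleaned is accumulated in the int pointed to by @arg.
 */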

static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
			    unsigned long address, void *arg)
{
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
		.address = address,
		.flags = PVMW_SYNC,
	};
	unsigned long start = address, end;
	int *cleaned = arg;

	/*
	 * We have to assume the worst case, i.e. pmd, for invalidation. Note
	 * that the page cannot be freed from this function.
	 */
	end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
	mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);

	while (page_vma_mapped_walk(&pvmw)) {
		unsigned long cstart;
		int ret = 0;

		cstart = address = pvmw.address;
		if (pvmw.pte) {
			pte_t entry;
			pte_t *pte = pvmw.pte;

			if (!pte_dirty(*pte) && !pte_write(*pte))
				continue;

			flush_cache_page(vma, address, pte_pfn(*pte));
			entry = ptep_clear_flush(vma, address, pte);
			entry = pte_wrprotect(entry);
			entry = pte_mkclean(entry);
			set_pte_at(vma->vm_mm, address, pte, entry);
			ret = 1;
		} else {
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
			pmd_t *pmd = pvmw.pmd;
			pmd_t entry;

			if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
				continue;

			flush_cache_page(vma, address, page_to_pfn(page));
			entry = pmdp_huge_clear_flush(vma, address, pmd);
			entry = pmd_wrprotect(entry);
			entry = pmd_mkclean(entry);
			set_pmd_at(vma->vm_mm, address, pmd, entry);
			cstart &= PMD_MASK;
			ret = 1;
#else
			/* unexpected pmd-mapped page? */
			WARN_ON_ONCE(1);
#endif
		}

		/*
		 * No need to call mmu_notifier_invalidate_range() as we are
		 * downgrading page table protection not changing it to point
		 * to a new page.
		 *
		 * See Documentation/vm/mmu_notifier.rst
		 */
		if (ret)
			(*cleaned)++;
	}

	mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);

	return true;
}

static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
{
	if (vma->vm_flags & VM_SHARED)
		return false;

	return true;
}

int page_mkclean(struct page *page)
{
	int cleaned = 0;
	struct address_space *mapping;
	struct rmap_walk_control rwc = {
		.arg = (void *)&cleaned,
		.rmap_one = page_mkclean_one,
		.invalid_vma = invalid_mkclean_vma,
	};

	BUG_ON(!PageLocked(page));

	if (!page_mapped(page))
		return 0;

	mapping = page_mapping(page);
	if (!mapping)
		return 0;

	rmap_walk(page, &rwc);

	return cleaned;
}
EXPORT_SYMBOL_GPL(page_mkclean);

/**
 * page_move_anon_rmap - move a page to our anon_vma
 * @page: the page to move to our anon_vma
 * @vma: the vma the page belongs to
 *
 * When a page belongs exclusively to one process after a COW event,
 * that page can be moved into the anon_vma that belongs to just that
 * process, so the rmap code will not search the parent or sibling
 * processes.
 */
void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	page = compound_head(page);

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_VMA(!anon_vma, vma);

	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	/*
	 * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
	 * simultaneously, so a concurrent reader (eg page_referenced()'s
	 * PageAnon()) will not see one without the other.
	 */
	WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
}
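
/*
 * As the code above and in page_get_anon_vma() shows, an anonymous page
 * stores its anon_vma in page->mapping with the low PAGE_MAPPING_ANON bit
 * set as a tag:
 *
 *	page->mapping = (struct address_space *)
 *			((void *)anon_vma + PAGE_MAPPING_ANON);
 *
 * and readers recover the pointer by subtracting the tag again, after
 * checking (page->mapping & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_ANON.
 */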

/**
 * __page_set_anon_rmap - set up new anonymous rmap
 * @page: Page to add to rmap
 * @vma: VM area to add page to.
 * @address: User virtual address of the mapping
 * @exclusive: the page is exclusively owned by the current process
 */
static void __page_set_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, int exclusive)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	BUG_ON(!anon_vma);

	if (PageAnon(page))
		return;

	/*
	 * If the page isn't exclusively mapped into this vma,
	 * we must use the _oldest_ possible anon_vma for the
	 * page mapping!
	 */
	if (!exclusive)
		anon_vma = anon_vma->root;

	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	page->mapping = (struct address_space *) anon_vma;
	page->index = linear_page_index(vma, address);
}

/**
 * __page_check_anon_rmap - sanity check anonymous rmap addition
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 */
static void __page_check_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
#ifdef CONFIG_DEBUG_VM
	/*
	 * The page's anon-rmap details (mapping and index) are guaranteed to
	 * be set up correctly at this point.
	 *
	 * We have exclusion against page_add_anon_rmap because the caller
	 * always holds the page locked, except if called from page_dup_rmap,
	 * in which case the page is already known to be setup.
	 *
	 * We have exclusion against page_add_new_anon_rmap because those pages
	 * are initially only visible via the pagetables, and the pte is locked
	 * over the call to page_add_new_anon_rmap.
	 */
	BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
	BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address));
#endif
}

/**
 * page_add_anon_rmap - add pte mapping to an anonymous page
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 * @compound: charge the page as compound or small page
 *
 * The caller needs to hold the pte lock, and the page must be locked in
 * the anon_vma case: to serialize mapping,index checking after setting,
 * and to ensure that PageAnon is not being upgraded racily to PageKsm
 * (but PageKsm is never downgraded to PageAnon).
 */
void page_add_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, bool compound)
{
	do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
}

/*
 * Special version of the above for do_swap_page, which often runs
 * into pages that are exclusively owned by the current process.
 * Everybody else should continue to use page_add_anon_rmap above.
 */
void do_page_add_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, int flags)
{
	bool compound = flags & RMAP_COMPOUND;
	bool first;

	if (compound) {
		atomic_t *mapcount;
		VM_BUG_ON_PAGE(!PageLocked(page), page);
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
		mapcount = compound_mapcount_ptr(page);
		first = atomic_inc_and_test(mapcount);
	} else {
		first = atomic_inc_and_test(&page->_mapcount);
	}

	if (first) {
		int nr = compound ? hpage_nr_pages(page) : 1;
		/*
		 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
		 * these counters are not modified in interrupt context, and
		 * pte lock (a spinlock) is held, which implies preemption
		 * disabled.
		 */
		if (compound)
			__inc_node_page_state(page, NR_ANON_THPS);
		__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
	}
	if (unlikely(PageKsm(page)))
		return;

	VM_BUG_ON_PAGE(!PageLocked(page), page);

	/* address might be in next vma when migration races vma_adjust */
	if (first)
		__page_set_anon_rmap(page, vma, address,
				flags & RMAP_EXCLUSIVE);
	else
		__page_check_anon_rmap(page, vma, address);
}

/**
 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 * @compound: charge the page as compound or small page
 *
 * Same as page_add_anon_rmap but must only be called on *new* pages.
 * This means the inc-and-test can be bypassed.
 * Page does not have to be locked.
 */
void page_add_new_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, bool compound)
{
	int nr = compound ? hpage_nr_pages(page) : 1;

	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
	__SetPageSwapBacked(page);
	if (compound) {
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
		/* increment count (starts at -1) */
		atomic_set(compound_mapcount_ptr(page), 0);
		__inc_node_page_state(page, NR_ANON_THPS);
	} else {
		/* Anon THP always mapped first with PMD */
		VM_BUG_ON_PAGE(PageTransCompound(page), page);
		/* increment count (starts at -1) */
		atomic_set(&page->_mapcount, 0);
	}
	__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
	__page_set_anon_rmap(page, vma, address, 1);
}

/**
 * page_add_file_rmap - add pte mapping to a file page
 * @page: the page to add the mapping to
 * @compound: charge the page as compound or small page
 *
 * The caller needs to hold the pte lock.
 */
void page_add_file_rmap(struct page *page, bool compound)
{
	int i, nr = 1;

	VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
	lock_page_memcg(page);
	if (compound && PageTransHuge(page)) {
		for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
			if (atomic_inc_and_test(&page[i]._mapcount))
				nr++;
		}
		if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
			goto out;
		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
		__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
	} else {
		if (PageTransCompound(page) && page_mapping(page)) {
			VM_WARN_ON_ONCE(!PageLocked(page));

			SetPageDoubleMap(compound_head(page));
			if (PageMlocked(page))
				clear_page_mlock(compound_head(page));
		}
		if (!atomic_inc_and_test(&page->_mapcount))
			goto out;
	}
	__mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
out:
	unlock_page_memcg(page);
}
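
/*
 * page_remove_file_rmap - take down a pte (or pmd) mapping from a file page;
 * the counterpart of page_add_file_rmap() above. The caller needs to hold
 * the pte lock.
 */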

static void page_remove_file_rmap(struct page *page, bool compound)
{
	int i, nr = 1;

	VM_BUG_ON_PAGE(compound && !PageHead(page), page);
	lock_page_memcg(page);

	/* Hugepages are not counted in NR_FILE_MAPPED for now. */
	if (unlikely(PageHuge(page))) {
		/* hugetlb pages are always mapped with pmds */
		atomic_dec(compound_mapcount_ptr(page));
		goto out;
	}

	/* page still mapped by someone else? */
	if (compound && PageTransHuge(page)) {
		for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
			if (atomic_add_negative(-1, &page[i]._mapcount))
				nr++;
		}
		if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
			goto out;
		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
		__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
	} else {
		if (!atomic_add_negative(-1, &page->_mapcount))
			goto out;
	}

	/*
	 * We use the irq-unsafe __{inc|mod}_lruvec_page_state because
	 * these counters are not modified in interrupt context, and
	 * pte lock (a spinlock) is held, which implies preemption disabled.
	 */
	__mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);

	if (unlikely(PageMlocked(page)))
		clear_page_mlock(page);
out:
	unlock_page_memcg(page);
}

static void page_remove_anon_compound_rmap(struct page *page)
{
	int i, nr;

	if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
		return;

	/* Hugepages are not counted in NR_ANON_PAGES for now. */
	if (unlikely(PageHuge(page)))
		return;

	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
		return;

	__dec_node_page_state(page, NR_ANON_THPS);

	if (TestClearPageDoubleMap(page)) {
		/*
		 * Subpages can be mapped with PTEs too. Check how many of
		 * them are still mapped.
		 */
		for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
			if (atomic_add_negative(-1, &page[i]._mapcount))
				nr++;
		}
	} else {
		nr = HPAGE_PMD_NR;
	}

	if (unlikely(PageMlocked(page)))
		clear_page_mlock(page);

	if (nr) {
		__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr);
		deferred_split_huge_page(page);
	}
}

/**
 * page_remove_rmap - take down pte mapping from a page
 * @page: page to remove mapping from
 * @compound: uncharge the page as compound or small page
 *
 * The caller needs to hold the pte lock.
 */
void page_remove_rmap(struct page *page, bool compound)
{
	if (!PageAnon(page))
		return page_remove_file_rmap(page, compound);

	if (compound)
		return page_remove_anon_compound_rmap(page);

	/* page still mapped by someone else? */
	if (!atomic_add_negative(-1, &page->_mapcount))
		return;

	/*
	 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
	 * these counters are not modified in interrupt context, and
	 * pte lock (a spinlock) is held, which implies preemption disabled.
	 */
	__dec_node_page_state(page, NR_ANON_MAPPED);

	if (unlikely(PageMlocked(page)))
		clear_page_mlock(page);

	if (PageTransCompound(page))
		deferred_split_huge_page(compound_head(page));

	/*
	 * It would be tidy to reset the PageAnon mapping here,
	 * but that might overwrite a racing page_add_anon_rmap
	 * which increments mapcount after us but sets mapping
	 * before us: so leave the reset to free_unref_page,
	 * and remember that it's only reliable while mapped.
	 * Leaving it set also helps swapoff to reinstate ptes
	 * faster for those pages still in swapcache.
	 */
}
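
/*
 * A typical caller (an assumption based on common usage, not something this
 * file enforces): the unmap path in mm/memory.c, e.g. zap_pte_range(), calls
 * page_remove_rmap(page, false) under the pte lock right after clearing the
 * pte and before dropping the reference the page table held on the page.
 */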

/*
 * @arg: enum ttu_flags will be passed to this argument
 */
static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
		     unsigned long address, void *arg)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
		.address = address,
	};
	pte_t pteval;
	struct page *subpage;
	bool ret = true;
	unsigned long start = address, end;
	enum ttu_flags flags = (enum ttu_flags)arg;

	/* munlock has nothing to gain from examining un-locked vmas */
	if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
		return true;

	if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
	    is_zone_device_page(page) && !is_device_private_page(page))
		return true;

	if (flags & TTU_SPLIT_HUGE_PMD) {
		split_huge_pmd_address(vma, address,
				flags & TTU_SPLIT_FREEZE, page);
	}

	/*
	 * We have to assume the worst case, i.e. pmd, for invalidation. Note
	 * that the page cannot be freed in this function as the caller of
	 * try_to_unmap() must hold a reference on the page.
	 */
	end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
	mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);

	while (page_vma_mapped_walk(&pvmw)) {
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
		/* PMD-mapped THP migration entry */
		if (!pvmw.pte && (flags & TTU_MIGRATION)) {
			VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);

			set_pmd_migration_entry(&pvmw, page);
			continue;
		}
#endif

		/*
		 * If the page is mlock()d, we cannot swap it out.
		 * If it's recently referenced (perhaps page_referenced
		 * skipped over this mm) then we should reactivate it.
		 */
		if (!(flags & TTU_IGNORE_MLOCK)) {
			if (vma->vm_flags & VM_LOCKED) {
				/* PTE-mapped THP are never mlocked */
				if (!PageTransCompound(page)) {
					/*
					 * Holding pte lock, we do *not* need
					 * mmap_sem here
					 */
					mlock_vma_page(page);
				}
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}
			if (flags & TTU_MUNLOCK)
				continue;
		}

		/* Unexpected PMD-mapped THP? */
		VM_BUG_ON_PAGE(!pvmw.pte, page);

		subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
		address = pvmw.address;

		if (IS_ENABLED(CONFIG_MIGRATION) &&
		    (flags & TTU_MIGRATION) &&
		    is_zone_device_page(page)) {
			swp_entry_t entry;
			pte_t swp_pte;

			pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);

			/*
			 * Store the pfn of the page in a special migration
			 * pte. do_swap_page() will wait until the migration
			 * pte is removed and then restart fault handling.
			 */
			entry = make_migration_entry(page, 0);
			swp_pte = swp_entry_to_pte(entry);
			if (pte_soft_dirty(pteval))
				swp_pte = pte_swp_mksoft_dirty(swp_pte);
			set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
			/*
			 * No need to invalidate here; it will synchronize
			 * against the special swap migration pte.
			 */
			goto discard;
		}

		if (!(flags & TTU_IGNORE_ACCESS)) {
			if (ptep_clear_flush_young_notify(vma, address,
						pvmw.pte)) {
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}
		}

		/* Nuke the page table entry. */
		flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
		if (should_defer_flush(mm, flags)) {
			/*
			 * We clear the PTE but do not flush so potentially
			 * a remote CPU could still be writing to the page.
			 * If the entry was previously clean then the
			 * architecture must guarantee that a clear->dirty
			 * transition on a cached TLB entry is written through
			 * and traps if the PTE is unmapped.
			 */
			pteval = ptep_get_and_clear(mm, address, pvmw.pte);

			set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
		} else {
			pteval = ptep_clear_flush(vma, address, pvmw.pte);
		}

		/* Move the dirty bit to the page. Now the pte is gone. */
		if (pte_dirty(pteval))
			set_page_dirty(page);

		/* Update high watermark before we lower rss */
		update_hiwater_rss(mm);

		if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
			if (PageHuge(page)) {
				int nr = 1 << compound_order(page);
				hugetlb_count_sub(nr, mm);
				set_huge_swap_pte_at(mm, address,
						     pvmw.pte, pteval,
						     vma_mmu_pagesize(vma));
			} else {
				dec_mm_counter(mm, mm_counter(page));
				set_pte_at(mm, address, pvmw.pte, pteval);
			}

		} else if (pte_unused(pteval)) {
			/*
			 * The guest indicated that the page content is of no
			 * interest anymore. Simply discard the pte, vmscan
			 * will take care of the rest.
			 */
			dec_mm_counter(mm, mm_counter(page));
			/* We have to invalidate as we cleared the pte */
			mmu_notifier_invalidate_range(mm, address,
						      address + PAGE_SIZE);
		} else if (IS_ENABLED(CONFIG_MIGRATION) &&
				(flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
			swp_entry_t entry;
			pte_t swp_pte;

			if (arch_unmap_one(mm, vma, address, pteval) < 0) {
				set_pte_at(mm, address, pvmw.pte, pteval);
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}

			/*
			 * Store the pfn of the page in a special migration
			 * pte. do_swap_page() will wait until the migration
			 * pte is removed and then restart fault handling.
			 */
			entry = make_migration_entry(subpage,
					pte_write(pteval));
			swp_pte = swp_entry_to_pte(entry);
			if (pte_soft_dirty(pteval))
				swp_pte = pte_swp_mksoft_dirty(swp_pte);
			set_pte_at(mm, address, pvmw.pte, swp_pte);
			/*
			 * No need to invalidate here; it will synchronize
			 * against the special swap migration pte.
			 */
		} else if (PageAnon(page)) {
			swp_entry_t entry = { .val = page_private(subpage) };
			pte_t swp_pte;
			/*
			 * Store the swap location in the pte.
			 * See handle_pte_fault() ...
			 */
			if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
				WARN_ON_ONCE(1);
				ret = false;
				/* We have to invalidate as we cleared the pte */
				mmu_notifier_invalidate_range(mm, address,
							address + PAGE_SIZE);
				page_vma_mapped_walk_done(&pvmw);
				break;
			}

			/* MADV_FREE page check */
			if (!PageSwapBacked(page)) {
				if (!PageDirty(page)) {
					/* Invalidate as we cleared the pte */
					mmu_notifier_invalidate_range(mm,
						address, address + PAGE_SIZE);
					dec_mm_counter(mm, MM_ANONPAGES);
					goto discard;
				}

				/*
				 * If the page was redirtied, it cannot be
				 * discarded. Remap the page to the page table.
				 */
				set_pte_at(mm, address, pvmw.pte, pteval);
				SetPageSwapBacked(page);
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}

			if (swap_duplicate(entry) < 0) {
				set_pte_at(mm, address, pvmw.pte, pteval);
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}
			if (arch_unmap_one(mm, vma, address, pteval) < 0) {
				set_pte_at(mm, address, pvmw.pte, pteval);
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}
			if (list_empty(&mm->mmlist)) {
				spin_lock(&mmlist_lock);
				if (list_empty(&mm->mmlist))
					list_add(&mm->mmlist, &init_mm.mmlist);
				spin_unlock(&mmlist_lock);
			}
			dec_mm_counter(mm, MM_ANONPAGES);
			inc_mm_counter(mm, MM_SWAPENTS);
			swp_pte = swp_entry_to_pte(entry);
			if (pte_soft_dirty(pteval))
				swp_pte = pte_swp_mksoft_dirty(swp_pte);
			set_pte_at(mm, address, pvmw.pte, swp_pte);
			/* Invalidate as we cleared the pte */
			mmu_notifier_invalidate_range(mm, address,
						      address + PAGE_SIZE);
		} else {
			/*
			 * We should not need to notify here as we reach this
			 * case only from freeze_page(), which itself is only
			 * called from split_huge_page_to_list(), so everything
			 * below must be true:
			 * - page is not anonymous
			 * - page is locked
			 *
			 * So, as it is a locked file-backed page, it cannot
			 * be removed from the page cache and replaced by a
			 * new page before mmu_notifier_invalidate_range_end(),
			 * and no concurrent thread can update its page table
			 * to point at a new page while a device is still
			 * using this page.
			 *
			 * See Documentation/vm/mmu_notifier.rst
			 */
			dec_mm_counter(mm, mm_counter_file(page));
		}
discard:
		/*
		 * No need to call mmu_notifier_invalidate_range() here, as it
		 * has been done above for all cases requiring it to happen
		 * under the page table lock before
		 * mmu_notifier_invalidate_range_end().
		 *
		 * See Documentation/vm/mmu_notifier.rst
		 */
		page_remove_rmap(subpage, PageHuge(page));
		put_page(page);
	}

	mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);

	return ret;
}
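
/*
 * Note on the return value above: returning false tells the rmap walk to
 * stop early (see rmap_walk_anon()/rmap_walk_file()); it is used when the
 * page turned out to be mlocked, was recently referenced, or a swap/arch
 * entry could not be set up, so try_to_unmap() will then report the page
 * as still mapped.
 */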

bool is_vma_temporary_stack(struct vm_area_struct *vma)
{
	int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);

	if (!maybe_stack)
		return false;

	if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
						VM_STACK_INCOMPLETE_SETUP)
		return true;

	return false;
}

static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
{
	return is_vma_temporary_stack(vma);
}

static int page_mapcount_is_zero(struct page *page)
{
	return !total_mapcount(page);
}

/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 * @flags: action and flags
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path.  Caller must hold the page lock.
 *
 * If unmap is successful, return true. Otherwise, false.
 */
bool try_to_unmap(struct page *page, enum ttu_flags flags)
{
	struct rmap_walk_control rwc = {
		.rmap_one = try_to_unmap_one,
		.arg = (void *)flags,
		.done = page_mapcount_is_zero,
		.anon_lock = page_lock_anon_vma_read,
	};

	/*
	 * During exec, a temporary VMA is set up and later moved.
	 * The VMA is moved under the anon_vma lock but not the
	 * page tables leading to a race where migration cannot
	 * find the migration ptes. Rather than increasing the
	 * locking requirements of exec(), migration skips
	 * temporary VMAs until after exec() completes.
	 */
	if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))
	    && !PageKsm(page) && PageAnon(page))
		rwc.invalid_vma = invalid_migration_vma;

	if (flags & TTU_RMAP_LOCKED)
		rmap_walk_locked(page, &rwc);
	else
		rmap_walk(page, &rwc);

	return !page_mapcount(page) ? true : false;
}

static int page_not_mapped(struct page *page)
{
	return !page_mapped(page);
};

/**
 * try_to_munlock - try to munlock a page
 * @page: the page to be munlocked
 *
 * Called from munlock code.  Checks all of the VMAs mapping the page
 * to make sure nobody else has this page mlocked. The page will be
 * returned with PG_mlocked cleared if no other vmas have it mlocked.
 */
void try_to_munlock(struct page *page)
{
	struct rmap_walk_control rwc = {
		.rmap_one = try_to_unmap_one,
		.arg = (void *)TTU_MUNLOCK,
		.done = page_not_mapped,
		.anon_lock = page_lock_anon_vma_read,
	};

	VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
	VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);

	rmap_walk(page, &rwc);
}

void __put_anon_vma(struct anon_vma *anon_vma)
{
	struct anon_vma *root = anon_vma->root;

	anon_vma_free(anon_vma);
	if (root != anon_vma && atomic_dec_and_test(&root->refcount))
		anon_vma_free(root);
}

static struct anon_vma *rmap_walk_anon_lock(struct page *page,
					struct rmap_walk_control *rwc)
{
	struct anon_vma *anon_vma;

	if (rwc->anon_lock)
		return rwc->anon_lock(page);

	/*
	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
	 * because that depends on page_mapped(); but not all its usages
	 * are holding mmap_sem. Users without mmap_sem are required to
	 * take a reference count to prevent the anon_vma from disappearing.
	 */
	anon_vma = page_anon_vma(page);
	if (!anon_vma)
		return NULL;

	anon_vma_lock_read(anon_vma);
	return anon_vma;
}

/*
 * rmap_walk_anon - do something to anonymous page using the object-based
 * rmap method
 * @page: the page to be handled
 * @rwc: control variable according to each walk type
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the anon_vma struct it points to.
 *
 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
 * where the page was found will be held for write.  So, we won't recheck
 * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
 * LOCKED.
 */
static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
		bool locked)
{
	struct anon_vma *anon_vma;
	pgoff_t pgoff_start, pgoff_end;
	struct anon_vma_chain *avc;

	if (locked) {
		anon_vma = page_anon_vma(page);
		/* anon_vma disappear under us? */
		VM_BUG_ON_PAGE(!anon_vma, page);
	} else {
		anon_vma = rmap_walk_anon_lock(page, rwc);
	}
	if (!anon_vma)
		return;

	pgoff_start = page_to_pgoff(page);
	pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
			pgoff_start, pgoff_end) {
		struct vm_area_struct *vma = avc->vma;
		unsigned long address = vma_address(page, vma);

		cond_resched();

		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
			continue;

		if (!rwc->rmap_one(page, vma, address, rwc->arg))
			break;
		if (rwc->done && rwc->done(page))
			break;
	}

	if (!locked)
		anon_vma_unlock_read(anon_vma);
}

/*
 * rmap_walk_file - do something to file page using the object-based rmap method
 * @page: the page to be handled
 * @rwc: control variable according to each walk type
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the address_space struct it points to.
 *
 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
 * where the page was found will be held for write.  So, we won't recheck
 * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
 * LOCKED.
 */
static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
		bool locked)
{
	struct address_space *mapping = page_mapping(page);
	pgoff_t pgoff_start, pgoff_end;
	struct vm_area_struct *vma;

	/*
	 * The page lock not only makes sure that page->mapping cannot
	 * suddenly be NULLified by truncation, it makes sure that the
	 * structure at mapping cannot be freed and reused yet,
	 * so we can safely take mapping->i_mmap_rwsem.
	 */
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	if (!mapping)
		return;

	pgoff_start = page_to_pgoff(page);
	pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
	if (!locked)
		i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap,
			pgoff_start, pgoff_end) {
		unsigned long address = vma_address(page, vma);

		cond_resched();

		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
			continue;

		if (!rwc->rmap_one(page, vma, address, rwc->arg))
			goto done;
		if (rwc->done && rwc->done(page))
			goto done;
	}

done:
	if (!locked)
		i_mmap_unlock_read(mapping);
}

void rmap_walk(struct page *page, struct rmap_walk_control *rwc)
{
	if (unlikely(PageKsm(page)))
		rmap_walk_ksm(page, rwc);
	else if (PageAnon(page))
		rmap_walk_anon(page, rwc, false);
	else
		rmap_walk_file(page, rwc, false);
}

/* Like rmap_walk, but caller holds relevant rmap lock */
void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
{
	/* no ksm support for now */
	VM_BUG_ON_PAGE(PageKsm(page), page);
	if (PageAnon(page))
		rmap_walk_anon(page, rwc, true);
	else
		rmap_walk_file(page, rwc, true);
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * The following three functions are for anonymous (private mapped) hugepages.
 * Unlike common anonymous pages, anonymous hugepages have no accounting code
 * and no lru code, because we handle hugepages differently from common pages.
 */
static void __hugepage_set_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, int exclusive)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	BUG_ON(!anon_vma);

	if (PageAnon(page))
		return;
	if (!exclusive)
		anon_vma = anon_vma->root;

	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	page->mapping = (struct address_space *) anon_vma;
	page->index = linear_page_index(vma, address);
}

void hugepage_add_anon_rmap(struct page *page,
			    struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = vma->anon_vma;
	int first;

	BUG_ON(!PageLocked(page));
	BUG_ON(!anon_vma);
	/* address might be in next vma when migration races vma_adjust */
	first = atomic_inc_and_test(compound_mapcount_ptr(page));
	if (first)
		__hugepage_set_anon_rmap(page, vma, address, 0);
}

void hugepage_add_new_anon_rmap(struct page *page,
			struct vm_area_struct *vma, unsigned long address)
{
	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
	atomic_set(compound_mapcount_ptr(page), 0);
	__hugepage_set_anon_rmap(page, vma, address, 1);
}
#endif /* CONFIG_HUGETLB_PAGE */