1 /* 2 * mm/rmap.c - physical to virtual reverse mappings 3 * 4 * Copyright 2001, Rik van Riel <riel@conectiva.com.br> 5 * Released under the General Public License (GPL). 6 * 7 * Simple, low overhead reverse mapping scheme. 8 * Please try to keep this thing as modular as possible. 9 * 10 * Provides methods for unmapping each kind of mapped page: 11 * the anon methods track anonymous pages, and 12 * the file methods track pages belonging to an inode. 13 * 14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001 15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 17 * Contributions by Hugh Dickins 2003, 2004 18 */ 19 20 /* 21 * Lock ordering in mm: 22 * 23 * inode->i_mutex (while writing or truncating, not reading or faulting) 24 * mm->mmap_sem 25 * page->flags PG_locked (lock_page) 26 * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) 27 * mapping->i_mmap_rwsem 28 * anon_vma->rwsem 29 * mm->page_table_lock or pte_lock 30 * zone_lru_lock (in mark_page_accessed, isolate_lru_page) 31 * swap_lock (in swap_duplicate, swap_info_get) 32 * mmlist_lock (in mmput, drain_mmlist and others) 33 * mapping->private_lock (in __set_page_dirty_buffers) 34 * mem_cgroup_{begin,end}_page_stat (memcg->move_lock) 35 * mapping->tree_lock (widely used) 36 * inode->i_lock (in set_page_dirty's __mark_inode_dirty) 37 * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) 38 * sb_lock (within inode_lock in fs/fs-writeback.c) 39 * mapping->tree_lock (widely used, in set_page_dirty, 40 * in arch-dependent flush_dcache_mmap_lock, 41 * within bdi.wb->list_lock in __sync_single_inode) 42 * 43 * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) 44 * ->tasklist_lock 45 * pte map lock 46 */ 47 48 #include <linux/mm.h> 49 #include <linux/sched/mm.h> 50 #include <linux/sched/task.h> 51 #include <linux/pagemap.h> 52 #include <linux/swap.h> 53 #include <linux/swapops.h> 54 #include <linux/slab.h> 55 #include <linux/init.h> 56 #include <linux/ksm.h> 57 #include <linux/rmap.h> 58 #include <linux/rcupdate.h> 59 #include <linux/export.h> 60 #include <linux/memcontrol.h> 61 #include <linux/mmu_notifier.h> 62 #include <linux/migrate.h> 63 #include <linux/hugetlb.h> 64 #include <linux/backing-dev.h> 65 #include <linux/page_idle.h> 66 #include <linux/memremap.h> 67 68 #include <asm/tlbflush.h> 69 70 #include <trace/events/tlb.h> 71 72 #include "internal.h" 73 74 static struct kmem_cache *anon_vma_cachep; 75 static struct kmem_cache *anon_vma_chain_cachep; 76 77 static inline struct anon_vma *anon_vma_alloc(void) 78 { 79 struct anon_vma *anon_vma; 80 81 anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); 82 if (anon_vma) { 83 atomic_set(&anon_vma->refcount, 1); 84 anon_vma->degree = 1; /* Reference for first vma */ 85 anon_vma->parent = anon_vma; 86 /* 87 * Initialise the anon_vma root to point to itself. If called 88 * from fork, the root will be reset to the parents anon_vma. 89 */ 90 anon_vma->root = anon_vma; 91 } 92 93 return anon_vma; 94 } 95 96 static inline void anon_vma_free(struct anon_vma *anon_vma) 97 { 98 VM_BUG_ON(atomic_read(&anon_vma->refcount)); 99 100 /* 101 * Synchronize against page_lock_anon_vma_read() such that 102 * we can safely hold the lock without the anon_vma getting 103 * freed. 104 * 105 * Relies on the full mb implied by the atomic_dec_and_test() from 106 * put_anon_vma() against the acquire barrier implied by 107 * down_read_trylock() from page_lock_anon_vma_read(). 
	 * This orders:
	 *
	 * page_lock_anon_vma_read()	VS	put_anon_vma()
	 *   down_read_trylock()		  atomic_dec_and_test()
	 *   LOCK				  MB
	 *   atomic_read()			  rwsem_is_locked()
	 *
	 * LOCK should suffice since the actual taking of the lock must
	 * happen _before_ what follows.
	 */
	might_sleep();
	if (rwsem_is_locked(&anon_vma->root->rwsem)) {
		anon_vma_lock_write(anon_vma);
		anon_vma_unlock_write(anon_vma);
	}

	kmem_cache_free(anon_vma_cachep, anon_vma);
}

static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
{
	return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
}

static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
{
	kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
}

static void anon_vma_chain_link(struct vm_area_struct *vma,
				struct anon_vma_chain *avc,
				struct anon_vma *anon_vma)
{
	avc->vma = vma;
	avc->anon_vma = anon_vma;
	list_add(&avc->same_vma, &vma->anon_vma_chain);
	anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
}

/**
 * __anon_vma_prepare - attach an anon_vma to a memory region
 * @vma: the memory region in question
 *
 * This makes sure the memory mapping described by 'vma' has
 * an 'anon_vma' attached to it, so that we can associate the
 * anonymous pages mapped into it with that anon_vma.
 *
 * The common case will be that we already have one, which
 * is handled inline by anon_vma_prepare(). But if
 * not we either need to find an adjacent mapping that we
 * can re-use the anon_vma from (very common when the only
 * reason for splitting a vma has been mprotect()), or we
 * allocate a new one.
 *
 * Anon-vma allocations are very subtle, because we may have
 * optimistically looked up an anon_vma in page_lock_anon_vma_read()
 * and that may actually touch the spinlock even in the newly
 * allocated vma (it depends on RCU to make sure that the
 * anon_vma isn't actually destroyed).
 *
 * As a result, we need to do proper anon_vma locking even
 * for the new allocation. At the same time, we do not want
 * to do any locking for the common case of already having
 * an anon_vma.
 *
 * This must be called with the mmap_sem held for reading.
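 *
 * For reference, the common-case fast path mentioned above is the inline
 * wrapper in include/linux/rmap.h, which looks roughly like this (a sketch,
 * not a verbatim copy):
 *
 *	static inline int anon_vma_prepare(struct vm_area_struct *vma)
 *	{
 *		if (likely(vma->anon_vma))
 *			return 0;
 *		return __anon_vma_prepare(vma);
 *	}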
 */
int __anon_vma_prepare(struct vm_area_struct *vma)
{
	struct mm_struct *mm = vma->vm_mm;
	struct anon_vma *anon_vma, *allocated;
	struct anon_vma_chain *avc;

	might_sleep();

	avc = anon_vma_chain_alloc(GFP_KERNEL);
	if (!avc)
		goto out_enomem;

	anon_vma = find_mergeable_anon_vma(vma);
	allocated = NULL;
	if (!anon_vma) {
		anon_vma = anon_vma_alloc();
		if (unlikely(!anon_vma))
			goto out_enomem_free_avc;
		allocated = anon_vma;
	}

	anon_vma_lock_write(anon_vma);
	/* page_table_lock to protect against threads */
	spin_lock(&mm->page_table_lock);
	if (likely(!vma->anon_vma)) {
		vma->anon_vma = anon_vma;
		anon_vma_chain_link(vma, avc, anon_vma);
		/* vma reference or self-parent link for new root */
		anon_vma->degree++;
		allocated = NULL;
		avc = NULL;
	}
	spin_unlock(&mm->page_table_lock);
	anon_vma_unlock_write(anon_vma);

	if (unlikely(allocated))
		put_anon_vma(allocated);
	if (unlikely(avc))
		anon_vma_chain_free(avc);

	return 0;

 out_enomem_free_avc:
	anon_vma_chain_free(avc);
 out_enomem:
	return -ENOMEM;
}

/*
 * This is a useful helper function for locking the anon_vma root as
 * we traverse the vma->anon_vma_chain, looping over anon_vma's that
 * have the same vma.
 *
 * Such anon_vma's should all have the same root, so you'd expect to see
 * just a single lock acquisition for the whole traversal.
 */
static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
{
	struct anon_vma *new_root = anon_vma->root;
	if (new_root != root) {
		if (WARN_ON_ONCE(root))
			up_write(&root->rwsem);
		root = new_root;
		down_write(&root->rwsem);
	}
	return root;
}

static inline void unlock_anon_vma_root(struct anon_vma *root)
{
	if (root)
		up_write(&root->rwsem);
}

/*
 * Attach the anon_vmas from src to dst.
 * Returns 0 on success, -ENOMEM on failure.
 *
 * If dst->anon_vma is NULL this function tries to find and reuse an existing
 * anon_vma which has no vmas and only one child anon_vma. This prevents
 * degradation of the anon_vma hierarchy into an endless linear chain in the
 * case of a constantly forking task. On the other hand, an anon_vma with
 * more than one child isn't reused even if there is no live vma, so the rmap
 * walker has a good chance of avoiding a scan of the whole hierarchy when it
 * searches where the page is mapped.
 */
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
	struct anon_vma_chain *avc, *pavc;
	struct anon_vma *root = NULL;

	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma;

		avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
		if (unlikely(!avc)) {
			unlock_anon_vma_root(root);
			root = NULL;
			avc = anon_vma_chain_alloc(GFP_KERNEL);
			if (!avc)
				goto enomem_failure;
		}
		anon_vma = pavc->anon_vma;
		root = lock_anon_vma_root(root, anon_vma);
		anon_vma_chain_link(dst, avc, anon_vma);

		/*
		 * Reuse the existing anon_vma if its degree is lower than
		 * two, which means it has no vma and only one anon_vma child.
		 *
		 * Do not choose the parent anon_vma, otherwise the first
		 * child will always reuse it. The root anon_vma is never
		 * reused: it has a self-parent reference and at least one
		 * child.
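		 *
		 * (Here 'degree' counts the VMAs pointing at this anon_vma
		 * plus its child anon_vmas, as documented in
		 * include/linux/rmap.h.)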
287 */ 288 if (!dst->anon_vma && anon_vma != src->anon_vma && 289 anon_vma->degree < 2) 290 dst->anon_vma = anon_vma; 291 } 292 if (dst->anon_vma) 293 dst->anon_vma->degree++; 294 unlock_anon_vma_root(root); 295 return 0; 296 297 enomem_failure: 298 /* 299 * dst->anon_vma is dropped here otherwise its degree can be incorrectly 300 * decremented in unlink_anon_vmas(). 301 * We can safely do this because callers of anon_vma_clone() don't care 302 * about dst->anon_vma if anon_vma_clone() failed. 303 */ 304 dst->anon_vma = NULL; 305 unlink_anon_vmas(dst); 306 return -ENOMEM; 307 } 308 309 /* 310 * Attach vma to its own anon_vma, as well as to the anon_vmas that 311 * the corresponding VMA in the parent process is attached to. 312 * Returns 0 on success, non-zero on failure. 313 */ 314 int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) 315 { 316 struct anon_vma_chain *avc; 317 struct anon_vma *anon_vma; 318 int error; 319 320 /* Don't bother if the parent process has no anon_vma here. */ 321 if (!pvma->anon_vma) 322 return 0; 323 324 /* Drop inherited anon_vma, we'll reuse existing or allocate new. */ 325 vma->anon_vma = NULL; 326 327 /* 328 * First, attach the new VMA to the parent VMA's anon_vmas, 329 * so rmap can find non-COWed pages in child processes. 330 */ 331 error = anon_vma_clone(vma, pvma); 332 if (error) 333 return error; 334 335 /* An existing anon_vma has been reused, all done then. */ 336 if (vma->anon_vma) 337 return 0; 338 339 /* Then add our own anon_vma. */ 340 anon_vma = anon_vma_alloc(); 341 if (!anon_vma) 342 goto out_error; 343 avc = anon_vma_chain_alloc(GFP_KERNEL); 344 if (!avc) 345 goto out_error_free_anon_vma; 346 347 /* 348 * The root anon_vma's spinlock is the lock actually used when we 349 * lock any of the anon_vmas in this anon_vma tree. 350 */ 351 anon_vma->root = pvma->anon_vma->root; 352 anon_vma->parent = pvma->anon_vma; 353 /* 354 * With refcounts, an anon_vma can stay around longer than the 355 * process it belongs to. The root anon_vma needs to be pinned until 356 * this anon_vma is freed, because the lock lives in the root. 357 */ 358 get_anon_vma(anon_vma->root); 359 /* Mark this anon_vma as the one where our new (COWed) pages go. */ 360 vma->anon_vma = anon_vma; 361 anon_vma_lock_write(anon_vma); 362 anon_vma_chain_link(vma, avc, anon_vma); 363 anon_vma->parent->degree++; 364 anon_vma_unlock_write(anon_vma); 365 366 return 0; 367 368 out_error_free_anon_vma: 369 put_anon_vma(anon_vma); 370 out_error: 371 unlink_anon_vmas(vma); 372 return -ENOMEM; 373 } 374 375 void unlink_anon_vmas(struct vm_area_struct *vma) 376 { 377 struct anon_vma_chain *avc, *next; 378 struct anon_vma *root = NULL; 379 380 /* 381 * Unlink each anon_vma chained to the VMA. This list is ordered 382 * from newest to oldest, ensuring the root anon_vma gets freed last. 383 */ 384 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 385 struct anon_vma *anon_vma = avc->anon_vma; 386 387 root = lock_anon_vma_root(root, anon_vma); 388 anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); 389 390 /* 391 * Leave empty anon_vmas on the list - we'll need 392 * to free them outside the lock. 
393 */ 394 if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) { 395 anon_vma->parent->degree--; 396 continue; 397 } 398 399 list_del(&avc->same_vma); 400 anon_vma_chain_free(avc); 401 } 402 if (vma->anon_vma) 403 vma->anon_vma->degree--; 404 unlock_anon_vma_root(root); 405 406 /* 407 * Iterate the list once more, it now only contains empty and unlinked 408 * anon_vmas, destroy them. Could not do before due to __put_anon_vma() 409 * needing to write-acquire the anon_vma->root->rwsem. 410 */ 411 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 412 struct anon_vma *anon_vma = avc->anon_vma; 413 414 VM_WARN_ON(anon_vma->degree); 415 put_anon_vma(anon_vma); 416 417 list_del(&avc->same_vma); 418 anon_vma_chain_free(avc); 419 } 420 } 421 422 static void anon_vma_ctor(void *data) 423 { 424 struct anon_vma *anon_vma = data; 425 426 init_rwsem(&anon_vma->rwsem); 427 atomic_set(&anon_vma->refcount, 0); 428 anon_vma->rb_root = RB_ROOT_CACHED; 429 } 430 431 void __init anon_vma_init(void) 432 { 433 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 434 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, 435 anon_vma_ctor); 436 anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, 437 SLAB_PANIC|SLAB_ACCOUNT); 438 } 439 440 /* 441 * Getting a lock on a stable anon_vma from a page off the LRU is tricky! 442 * 443 * Since there is no serialization what so ever against page_remove_rmap() 444 * the best this function can do is return a locked anon_vma that might 445 * have been relevant to this page. 446 * 447 * The page might have been remapped to a different anon_vma or the anon_vma 448 * returned may already be freed (and even reused). 449 * 450 * In case it was remapped to a different anon_vma, the new anon_vma will be a 451 * child of the old anon_vma, and the anon_vma lifetime rules will therefore 452 * ensure that any anon_vma obtained from the page will still be valid for as 453 * long as we observe page_mapped() [ hence all those page_mapped() tests ]. 454 * 455 * All users of this function must be very careful when walking the anon_vma 456 * chain and verify that the page in question is indeed mapped in it 457 * [ something equivalent to page_mapped_in_vma() ]. 458 * 459 * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap() 460 * that the anon_vma pointer from page->mapping is valid if there is a 461 * mapcount, we can dereference the anon_vma after observing those. 462 */ 463 struct anon_vma *page_get_anon_vma(struct page *page) 464 { 465 struct anon_vma *anon_vma = NULL; 466 unsigned long anon_mapping; 467 468 rcu_read_lock(); 469 anon_mapping = (unsigned long)READ_ONCE(page->mapping); 470 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) 471 goto out; 472 if (!page_mapped(page)) 473 goto out; 474 475 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 476 if (!atomic_inc_not_zero(&anon_vma->refcount)) { 477 anon_vma = NULL; 478 goto out; 479 } 480 481 /* 482 * If this page is still mapped, then its anon_vma cannot have been 483 * freed. But if it has been unmapped, we have no security against the 484 * anon_vma structure being freed and reused (for another anon_vma: 485 * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero() 486 * above cannot corrupt). 
	 */
	if (!page_mapped(page)) {
		rcu_read_unlock();
		put_anon_vma(anon_vma);
		return NULL;
	}
out:
	rcu_read_unlock();

	return anon_vma;
}

/*
 * Similar to page_get_anon_vma() except it locks the anon_vma.
 *
 * It's a little more complex as it tries to keep the fast path to a single
 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
 * reference like with page_get_anon_vma() and then block on the rwsem.
 */
struct anon_vma *page_lock_anon_vma_read(struct page *page)
{
	struct anon_vma *anon_vma = NULL;
	struct anon_vma *root_anon_vma;
	unsigned long anon_mapping;

	rcu_read_lock();
	anon_mapping = (unsigned long)READ_ONCE(page->mapping);
	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		goto out;
	if (!page_mapped(page))
		goto out;

	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
	root_anon_vma = READ_ONCE(anon_vma->root);
	if (down_read_trylock(&root_anon_vma->rwsem)) {
		/*
		 * If the page is still mapped, then this anon_vma is still
		 * its anon_vma, and holding the lock ensures that it will
		 * not go away, see anon_vma_free().
		 */
		if (!page_mapped(page)) {
			up_read(&root_anon_vma->rwsem);
			anon_vma = NULL;
		}
		goto out;
	}

	/* trylock failed, we've got to sleep */
	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
		anon_vma = NULL;
		goto out;
	}

	if (!page_mapped(page)) {
		rcu_read_unlock();
		put_anon_vma(anon_vma);
		return NULL;
	}

	/* we pinned the anon_vma, it's safe to sleep */
	rcu_read_unlock();
	anon_vma_lock_read(anon_vma);

	if (atomic_dec_and_test(&anon_vma->refcount)) {
		/*
		 * Oops, we held the last refcount, release the lock
		 * and bail -- can't simply use put_anon_vma() because
		 * we'll deadlock on the anon_vma_lock_write() recursion.
		 */
		anon_vma_unlock_read(anon_vma);
		__put_anon_vma(anon_vma);
		anon_vma = NULL;
	}

	return anon_vma;

out:
	rcu_read_unlock();
	return anon_vma;
}

void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
{
	anon_vma_unlock_read(anon_vma);
}

#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/*
 * Flush TLB entries for recently unmapped pages from remote CPUs. It is
 * important if a PTE was dirty when it was unmapped that it's flushed
 * before any IO is initiated on the page to prevent lost writes. Similarly,
 * it must be flushed before freeing to prevent data leakage.
 */
void try_to_unmap_flush(void)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

	if (!tlb_ubc->flush_required)
		return;

	arch_tlbbatch_flush(&tlb_ubc->arch);
	tlb_ubc->flush_required = false;
	tlb_ubc->writable = false;
}

/* Flush iff there are potentially writable TLB entries that can race with IO */
void try_to_unmap_flush_dirty(void)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

	if (tlb_ubc->writable)
		try_to_unmap_flush();
}

static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

	arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
	tlb_ubc->flush_required = true;

	/*
	 * Ensure compiler does not re-order the setting of tlb_flush_batched
	 * before the PTE is cleared.
	 */
	barrier();
	mm->tlb_flush_batched = true;

	/*
	 * If the PTE was dirty then it's best to assume it's writable. The
	 * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
	 * before the page is queued for IO.
	 */
	if (writable)
		tlb_ubc->writable = true;
}

/*
 * Returns true if the TLB flush should be deferred to the end of a batch of
 * unmap operations to reduce IPIs.
 */
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
	bool should_defer = false;

	if (!(flags & TTU_BATCH_FLUSH))
		return false;

	/* If remote CPUs need to be flushed then defer the flush so it can be batched */
	if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
		should_defer = true;
	put_cpu();

	return should_defer;
}

/*
 * Reclaim unmaps pages under the PTL but does not flush the TLB prior to
 * releasing the PTL if TLB flushes are batched. It's possible for a parallel
 * operation such as mprotect or munmap to race between reclaim unmapping
 * the page and flushing the page. If this race occurs, it potentially allows
 * access to data via a stale TLB entry. Tracking all mm's that have TLB
 * batching in flight would be expensive during reclaim so instead track
 * whether TLB batching occurred in the past and if so then do a flush here
 * if required. This will cost one additional flush per reclaim cycle paid
 * by the first operation at risk such as mprotect and munmap.
 *
 * This must be called under the PTL so that an access to tlb_flush_batched
 * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
 * via the PTL.
 */
void flush_tlb_batched_pending(struct mm_struct *mm)
{
	if (mm->tlb_flush_batched) {
		flush_tlb_mm(mm);

		/*
		 * Do not allow the compiler to re-order the clearing of
		 * tlb_flush_batched before the TLB is flushed.
		 */
		barrier();
		mm->tlb_flush_batched = false;
	}
}
#else
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
}

static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
	return false;
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */

/*
 * At what user virtual address is page expected in vma?
 * Caller should check the page is actually part of the vma.
 */
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
	unsigned long address;
	if (PageAnon(page)) {
		struct anon_vma *page__anon_vma = page_anon_vma(page);
		/*
		 * Note: swapoff's unuse_vma() is more efficient with this
		 * check, and needs it to match anon_vma when KSM is active.
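		 *
		 * For reference, the address computed below comes from the
		 * linear-mapping arithmetic in __vma_address() (mm/internal.h),
		 * roughly:
		 *
		 *	pgoff_t pgoff = page_to_pgoff(page);
		 *	address = vma->vm_start +
		 *		  ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);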
694 */ 695 if (!vma->anon_vma || !page__anon_vma || 696 vma->anon_vma->root != page__anon_vma->root) 697 return -EFAULT; 698 } else if (page->mapping) { 699 if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping) 700 return -EFAULT; 701 } else 702 return -EFAULT; 703 address = __vma_address(page, vma); 704 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) 705 return -EFAULT; 706 return address; 707 } 708 709 pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) 710 { 711 pgd_t *pgd; 712 p4d_t *p4d; 713 pud_t *pud; 714 pmd_t *pmd = NULL; 715 pmd_t pmde; 716 717 pgd = pgd_offset(mm, address); 718 if (!pgd_present(*pgd)) 719 goto out; 720 721 p4d = p4d_offset(pgd, address); 722 if (!p4d_present(*p4d)) 723 goto out; 724 725 pud = pud_offset(p4d, address); 726 if (!pud_present(*pud)) 727 goto out; 728 729 pmd = pmd_offset(pud, address); 730 /* 731 * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() 732 * without holding anon_vma lock for write. So when looking for a 733 * genuine pmde (in which to find pte), test present and !THP together. 734 */ 735 pmde = *pmd; 736 barrier(); 737 if (!pmd_present(pmde) || pmd_trans_huge(pmde)) 738 pmd = NULL; 739 out: 740 return pmd; 741 } 742 743 struct page_referenced_arg { 744 int mapcount; 745 int referenced; 746 unsigned long vm_flags; 747 struct mem_cgroup *memcg; 748 }; 749 /* 750 * arg: page_referenced_arg will be passed 751 */ 752 static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, 753 unsigned long address, void *arg) 754 { 755 struct page_referenced_arg *pra = arg; 756 struct page_vma_mapped_walk pvmw = { 757 .page = page, 758 .vma = vma, 759 .address = address, 760 }; 761 int referenced = 0; 762 763 while (page_vma_mapped_walk(&pvmw)) { 764 address = pvmw.address; 765 766 if (vma->vm_flags & VM_LOCKED) { 767 page_vma_mapped_walk_done(&pvmw); 768 pra->vm_flags |= VM_LOCKED; 769 return false; /* To break the loop */ 770 } 771 772 if (pvmw.pte) { 773 if (ptep_clear_flush_young_notify(vma, address, 774 pvmw.pte)) { 775 /* 776 * Don't treat a reference through 777 * a sequentially read mapping as such. 778 * If the page has been used in another mapping, 779 * we will catch it; if this other mapping is 780 * already gone, the unmap path will have set 781 * PG_referenced or activated the page. 782 */ 783 if (likely(!(vma->vm_flags & VM_SEQ_READ))) 784 referenced++; 785 } 786 } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { 787 if (pmdp_clear_flush_young_notify(vma, address, 788 pvmw.pmd)) 789 referenced++; 790 } else { 791 /* unexpected pmd-mapped page? 
*/ 792 WARN_ON_ONCE(1); 793 } 794 795 pra->mapcount--; 796 } 797 798 if (referenced) 799 clear_page_idle(page); 800 if (test_and_clear_page_young(page)) 801 referenced++; 802 803 if (referenced) { 804 pra->referenced++; 805 pra->vm_flags |= vma->vm_flags; 806 } 807 808 if (!pra->mapcount) 809 return false; /* To break the loop */ 810 811 return true; 812 } 813 814 static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) 815 { 816 struct page_referenced_arg *pra = arg; 817 struct mem_cgroup *memcg = pra->memcg; 818 819 if (!mm_match_cgroup(vma->vm_mm, memcg)) 820 return true; 821 822 return false; 823 } 824 825 /** 826 * page_referenced - test if the page was referenced 827 * @page: the page to test 828 * @is_locked: caller holds lock on the page 829 * @memcg: target memory cgroup 830 * @vm_flags: collect encountered vma->vm_flags who actually referenced the page 831 * 832 * Quick test_and_clear_referenced for all mappings to a page, 833 * returns the number of ptes which referenced the page. 834 */ 835 int page_referenced(struct page *page, 836 int is_locked, 837 struct mem_cgroup *memcg, 838 unsigned long *vm_flags) 839 { 840 int we_locked = 0; 841 struct page_referenced_arg pra = { 842 .mapcount = total_mapcount(page), 843 .memcg = memcg, 844 }; 845 struct rmap_walk_control rwc = { 846 .rmap_one = page_referenced_one, 847 .arg = (void *)&pra, 848 .anon_lock = page_lock_anon_vma_read, 849 }; 850 851 *vm_flags = 0; 852 if (!page_mapped(page)) 853 return 0; 854 855 if (!page_rmapping(page)) 856 return 0; 857 858 if (!is_locked && (!PageAnon(page) || PageKsm(page))) { 859 we_locked = trylock_page(page); 860 if (!we_locked) 861 return 1; 862 } 863 864 /* 865 * If we are reclaiming on behalf of a cgroup, skip 866 * counting on behalf of references from different 867 * cgroups 868 */ 869 if (memcg) { 870 rwc.invalid_vma = invalid_page_referenced_vma; 871 } 872 873 rmap_walk(page, &rwc); 874 *vm_flags = pra.vm_flags; 875 876 if (we_locked) 877 unlock_page(page); 878 879 return pra.referenced; 880 } 881 882 static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, 883 unsigned long address, void *arg) 884 { 885 struct page_vma_mapped_walk pvmw = { 886 .page = page, 887 .vma = vma, 888 .address = address, 889 .flags = PVMW_SYNC, 890 }; 891 unsigned long start = address, end; 892 int *cleaned = arg; 893 894 /* 895 * We have to assume the worse case ie pmd for invalidation. Note that 896 * the page can not be free from this function. 
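	 * (A THP can be mapped with a single pmd, so the range computed
	 * below covers the whole compound page, clamped to the end of the
	 * vma.)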
897 */ 898 end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); 899 mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); 900 901 while (page_vma_mapped_walk(&pvmw)) { 902 unsigned long cstart; 903 int ret = 0; 904 905 cstart = address = pvmw.address; 906 if (pvmw.pte) { 907 pte_t entry; 908 pte_t *pte = pvmw.pte; 909 910 if (!pte_dirty(*pte) && !pte_write(*pte)) 911 continue; 912 913 flush_cache_page(vma, address, pte_pfn(*pte)); 914 entry = ptep_clear_flush(vma, address, pte); 915 entry = pte_wrprotect(entry); 916 entry = pte_mkclean(entry); 917 set_pte_at(vma->vm_mm, address, pte, entry); 918 ret = 1; 919 } else { 920 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE 921 pmd_t *pmd = pvmw.pmd; 922 pmd_t entry; 923 924 if (!pmd_dirty(*pmd) && !pmd_write(*pmd)) 925 continue; 926 927 flush_cache_page(vma, address, page_to_pfn(page)); 928 entry = pmdp_huge_clear_flush(vma, address, pmd); 929 entry = pmd_wrprotect(entry); 930 entry = pmd_mkclean(entry); 931 set_pmd_at(vma->vm_mm, address, pmd, entry); 932 cstart &= PMD_MASK; 933 ret = 1; 934 #else 935 /* unexpected pmd-mapped page? */ 936 WARN_ON_ONCE(1); 937 #endif 938 } 939 940 /* 941 * No need to call mmu_notifier_invalidate_range() as we are 942 * downgrading page table protection not changing it to point 943 * to a new page. 944 * 945 * See Documentation/vm/mmu_notifier.txt 946 */ 947 if (ret) 948 (*cleaned)++; 949 } 950 951 mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); 952 953 return true; 954 } 955 956 static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) 957 { 958 if (vma->vm_flags & VM_SHARED) 959 return false; 960 961 return true; 962 } 963 964 int page_mkclean(struct page *page) 965 { 966 int cleaned = 0; 967 struct address_space *mapping; 968 struct rmap_walk_control rwc = { 969 .arg = (void *)&cleaned, 970 .rmap_one = page_mkclean_one, 971 .invalid_vma = invalid_mkclean_vma, 972 }; 973 974 BUG_ON(!PageLocked(page)); 975 976 if (!page_mapped(page)) 977 return 0; 978 979 mapping = page_mapping(page); 980 if (!mapping) 981 return 0; 982 983 rmap_walk(page, &rwc); 984 985 return cleaned; 986 } 987 EXPORT_SYMBOL_GPL(page_mkclean); 988 989 /** 990 * page_move_anon_rmap - move a page to our anon_vma 991 * @page: the page to move to our anon_vma 992 * @vma: the vma the page belongs to 993 * 994 * When a page belongs exclusively to one process after a COW event, 995 * that page can be moved into the anon_vma that belongs to just that 996 * process, so the rmap code will not search the parent or sibling 997 * processes. 998 */ 999 void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) 1000 { 1001 struct anon_vma *anon_vma = vma->anon_vma; 1002 1003 page = compound_head(page); 1004 1005 VM_BUG_ON_PAGE(!PageLocked(page), page); 1006 VM_BUG_ON_VMA(!anon_vma, vma); 1007 1008 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 1009 /* 1010 * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written 1011 * simultaneously, so a concurrent reader (eg page_referenced()'s 1012 * PageAnon()) will not see one without the other. 1013 */ 1014 WRITE_ONCE(page->mapping, (struct address_space *) anon_vma); 1015 } 1016 1017 /** 1018 * __page_set_anon_rmap - set up new anonymous rmap 1019 * @page: Page to add to rmap 1020 * @vma: VM area to add page to. 
1021 * @address: User virtual address of the mapping 1022 * @exclusive: the page is exclusively owned by the current process 1023 */ 1024 static void __page_set_anon_rmap(struct page *page, 1025 struct vm_area_struct *vma, unsigned long address, int exclusive) 1026 { 1027 struct anon_vma *anon_vma = vma->anon_vma; 1028 1029 BUG_ON(!anon_vma); 1030 1031 if (PageAnon(page)) 1032 return; 1033 1034 /* 1035 * If the page isn't exclusively mapped into this vma, 1036 * we must use the _oldest_ possible anon_vma for the 1037 * page mapping! 1038 */ 1039 if (!exclusive) 1040 anon_vma = anon_vma->root; 1041 1042 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 1043 page->mapping = (struct address_space *) anon_vma; 1044 page->index = linear_page_index(vma, address); 1045 } 1046 1047 /** 1048 * __page_check_anon_rmap - sanity check anonymous rmap addition 1049 * @page: the page to add the mapping to 1050 * @vma: the vm area in which the mapping is added 1051 * @address: the user virtual address mapped 1052 */ 1053 static void __page_check_anon_rmap(struct page *page, 1054 struct vm_area_struct *vma, unsigned long address) 1055 { 1056 #ifdef CONFIG_DEBUG_VM 1057 /* 1058 * The page's anon-rmap details (mapping and index) are guaranteed to 1059 * be set up correctly at this point. 1060 * 1061 * We have exclusion against page_add_anon_rmap because the caller 1062 * always holds the page locked, except if called from page_dup_rmap, 1063 * in which case the page is already known to be setup. 1064 * 1065 * We have exclusion against page_add_new_anon_rmap because those pages 1066 * are initially only visible via the pagetables, and the pte is locked 1067 * over the call to page_add_new_anon_rmap. 1068 */ 1069 BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root); 1070 BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address)); 1071 #endif 1072 } 1073 1074 /** 1075 * page_add_anon_rmap - add pte mapping to an anonymous page 1076 * @page: the page to add the mapping to 1077 * @vma: the vm area in which the mapping is added 1078 * @address: the user virtual address mapped 1079 * @compound: charge the page as compound or small page 1080 * 1081 * The caller needs to hold the pte lock, and the page must be locked in 1082 * the anon_vma case: to serialize mapping,index checking after setting, 1083 * and to ensure that PageAnon is not being upgraded racily to PageKsm 1084 * (but PageKsm is never downgraded to PageAnon). 1085 */ 1086 void page_add_anon_rmap(struct page *page, 1087 struct vm_area_struct *vma, unsigned long address, bool compound) 1088 { 1089 do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0); 1090 } 1091 1092 /* 1093 * Special version of the above for do_swap_page, which often runs 1094 * into pages that are exclusively owned by the current process. 1095 * Everybody else should continue to use page_add_anon_rmap above. 1096 */ 1097 void do_page_add_anon_rmap(struct page *page, 1098 struct vm_area_struct *vma, unsigned long address, int flags) 1099 { 1100 bool compound = flags & RMAP_COMPOUND; 1101 bool first; 1102 1103 if (compound) { 1104 atomic_t *mapcount; 1105 VM_BUG_ON_PAGE(!PageLocked(page), page); 1106 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 1107 mapcount = compound_mapcount_ptr(page); 1108 first = atomic_inc_and_test(mapcount); 1109 } else { 1110 first = atomic_inc_and_test(&page->_mapcount); 1111 } 1112 1113 if (first) { 1114 int nr = compound ? 
hpage_nr_pages(page) : 1; 1115 /* 1116 * We use the irq-unsafe __{inc|mod}_zone_page_stat because 1117 * these counters are not modified in interrupt context, and 1118 * pte lock(a spinlock) is held, which implies preemption 1119 * disabled. 1120 */ 1121 if (compound) 1122 __inc_node_page_state(page, NR_ANON_THPS); 1123 __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr); 1124 } 1125 if (unlikely(PageKsm(page))) 1126 return; 1127 1128 VM_BUG_ON_PAGE(!PageLocked(page), page); 1129 1130 /* address might be in next vma when migration races vma_adjust */ 1131 if (first) 1132 __page_set_anon_rmap(page, vma, address, 1133 flags & RMAP_EXCLUSIVE); 1134 else 1135 __page_check_anon_rmap(page, vma, address); 1136 } 1137 1138 /** 1139 * page_add_new_anon_rmap - add pte mapping to a new anonymous page 1140 * @page: the page to add the mapping to 1141 * @vma: the vm area in which the mapping is added 1142 * @address: the user virtual address mapped 1143 * @compound: charge the page as compound or small page 1144 * 1145 * Same as page_add_anon_rmap but must only be called on *new* pages. 1146 * This means the inc-and-test can be bypassed. 1147 * Page does not have to be locked. 1148 */ 1149 void page_add_new_anon_rmap(struct page *page, 1150 struct vm_area_struct *vma, unsigned long address, bool compound) 1151 { 1152 int nr = compound ? hpage_nr_pages(page) : 1; 1153 1154 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); 1155 __SetPageSwapBacked(page); 1156 if (compound) { 1157 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 1158 /* increment count (starts at -1) */ 1159 atomic_set(compound_mapcount_ptr(page), 0); 1160 __inc_node_page_state(page, NR_ANON_THPS); 1161 } else { 1162 /* Anon THP always mapped first with PMD */ 1163 VM_BUG_ON_PAGE(PageTransCompound(page), page); 1164 /* increment count (starts at -1) */ 1165 atomic_set(&page->_mapcount, 0); 1166 } 1167 __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr); 1168 __page_set_anon_rmap(page, vma, address, 1); 1169 } 1170 1171 /** 1172 * page_add_file_rmap - add pte mapping to a file page 1173 * @page: the page to add the mapping to 1174 * 1175 * The caller needs to hold the pte lock. 1176 */ 1177 void page_add_file_rmap(struct page *page, bool compound) 1178 { 1179 int i, nr = 1; 1180 1181 VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); 1182 lock_page_memcg(page); 1183 if (compound && PageTransHuge(page)) { 1184 for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { 1185 if (atomic_inc_and_test(&page[i]._mapcount)) 1186 nr++; 1187 } 1188 if (!atomic_inc_and_test(compound_mapcount_ptr(page))) 1189 goto out; 1190 VM_BUG_ON_PAGE(!PageSwapBacked(page), page); 1191 __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); 1192 } else { 1193 if (PageTransCompound(page) && page_mapping(page)) { 1194 VM_WARN_ON_ONCE(!PageLocked(page)); 1195 1196 SetPageDoubleMap(compound_head(page)); 1197 if (PageMlocked(page)) 1198 clear_page_mlock(compound_head(page)); 1199 } 1200 if (!atomic_inc_and_test(&page->_mapcount)) 1201 goto out; 1202 } 1203 __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); 1204 out: 1205 unlock_page_memcg(page); 1206 } 1207 1208 static void page_remove_file_rmap(struct page *page, bool compound) 1209 { 1210 int i, nr = 1; 1211 1212 VM_BUG_ON_PAGE(compound && !PageHead(page), page); 1213 lock_page_memcg(page); 1214 1215 /* Hugepages are not counted in NR_FILE_MAPPED for now. 
*/ 1216 if (unlikely(PageHuge(page))) { 1217 /* hugetlb pages are always mapped with pmds */ 1218 atomic_dec(compound_mapcount_ptr(page)); 1219 goto out; 1220 } 1221 1222 /* page still mapped by someone else? */ 1223 if (compound && PageTransHuge(page)) { 1224 for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { 1225 if (atomic_add_negative(-1, &page[i]._mapcount)) 1226 nr++; 1227 } 1228 if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) 1229 goto out; 1230 VM_BUG_ON_PAGE(!PageSwapBacked(page), page); 1231 __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); 1232 } else { 1233 if (!atomic_add_negative(-1, &page->_mapcount)) 1234 goto out; 1235 } 1236 1237 /* 1238 * We use the irq-unsafe __{inc|mod}_lruvec_page_state because 1239 * these counters are not modified in interrupt context, and 1240 * pte lock(a spinlock) is held, which implies preemption disabled. 1241 */ 1242 __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); 1243 1244 if (unlikely(PageMlocked(page))) 1245 clear_page_mlock(page); 1246 out: 1247 unlock_page_memcg(page); 1248 } 1249 1250 static void page_remove_anon_compound_rmap(struct page *page) 1251 { 1252 int i, nr; 1253 1254 if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) 1255 return; 1256 1257 /* Hugepages are not counted in NR_ANON_PAGES for now. */ 1258 if (unlikely(PageHuge(page))) 1259 return; 1260 1261 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) 1262 return; 1263 1264 __dec_node_page_state(page, NR_ANON_THPS); 1265 1266 if (TestClearPageDoubleMap(page)) { 1267 /* 1268 * Subpages can be mapped with PTEs too. Check how many of 1269 * themi are still mapped. 1270 */ 1271 for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { 1272 if (atomic_add_negative(-1, &page[i]._mapcount)) 1273 nr++; 1274 } 1275 } else { 1276 nr = HPAGE_PMD_NR; 1277 } 1278 1279 if (unlikely(PageMlocked(page))) 1280 clear_page_mlock(page); 1281 1282 if (nr) { 1283 __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr); 1284 deferred_split_huge_page(page); 1285 } 1286 } 1287 1288 /** 1289 * page_remove_rmap - take down pte mapping from a page 1290 * @page: page to remove mapping from 1291 * @compound: uncharge the page as compound or small page 1292 * 1293 * The caller needs to hold the pte lock. 1294 */ 1295 void page_remove_rmap(struct page *page, bool compound) 1296 { 1297 if (!PageAnon(page)) 1298 return page_remove_file_rmap(page, compound); 1299 1300 if (compound) 1301 return page_remove_anon_compound_rmap(page); 1302 1303 /* page still mapped by someone else? */ 1304 if (!atomic_add_negative(-1, &page->_mapcount)) 1305 return; 1306 1307 /* 1308 * We use the irq-unsafe __{inc|mod}_zone_page_stat because 1309 * these counters are not modified in interrupt context, and 1310 * pte lock(a spinlock) is held, which implies preemption disabled. 1311 */ 1312 __dec_node_page_state(page, NR_ANON_MAPPED); 1313 1314 if (unlikely(PageMlocked(page))) 1315 clear_page_mlock(page); 1316 1317 if (PageTransCompound(page)) 1318 deferred_split_huge_page(compound_head(page)); 1319 1320 /* 1321 * It would be tidy to reset the PageAnon mapping here, 1322 * but that might overwrite a racing page_add_anon_rmap 1323 * which increments mapcount after us but sets mapping 1324 * before us: so leave the reset to free_unref_page, 1325 * and remember that it's only reliable while mapped. 1326 * Leaving it set also helps swapoff to reinstate ptes 1327 * faster for those pages still in swapcache. 
1328 */ 1329 } 1330 1331 /* 1332 * @arg: enum ttu_flags will be passed to this argument 1333 */ 1334 static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 1335 unsigned long address, void *arg) 1336 { 1337 struct mm_struct *mm = vma->vm_mm; 1338 struct page_vma_mapped_walk pvmw = { 1339 .page = page, 1340 .vma = vma, 1341 .address = address, 1342 }; 1343 pte_t pteval; 1344 struct page *subpage; 1345 bool ret = true; 1346 unsigned long start = address, end; 1347 enum ttu_flags flags = (enum ttu_flags)arg; 1348 1349 /* munlock has nothing to gain from examining un-locked vmas */ 1350 if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) 1351 return true; 1352 1353 if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && 1354 is_zone_device_page(page) && !is_device_private_page(page)) 1355 return true; 1356 1357 if (flags & TTU_SPLIT_HUGE_PMD) { 1358 split_huge_pmd_address(vma, address, 1359 flags & TTU_SPLIT_FREEZE, page); 1360 } 1361 1362 /* 1363 * We have to assume the worse case ie pmd for invalidation. Note that 1364 * the page can not be free in this function as call of try_to_unmap() 1365 * must hold a reference on the page. 1366 */ 1367 end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); 1368 mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); 1369 1370 while (page_vma_mapped_walk(&pvmw)) { 1371 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 1372 /* PMD-mapped THP migration entry */ 1373 if (!pvmw.pte && (flags & TTU_MIGRATION)) { 1374 VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page); 1375 1376 if (!PageAnon(page)) 1377 continue; 1378 1379 set_pmd_migration_entry(&pvmw, page); 1380 continue; 1381 } 1382 #endif 1383 1384 /* 1385 * If the page is mlock()d, we cannot swap it out. 1386 * If it's recently referenced (perhaps page_referenced 1387 * skipped over this mm) then we should reactivate it. 1388 */ 1389 if (!(flags & TTU_IGNORE_MLOCK)) { 1390 if (vma->vm_flags & VM_LOCKED) { 1391 /* PTE-mapped THP are never mlocked */ 1392 if (!PageTransCompound(page)) { 1393 /* 1394 * Holding pte lock, we do *not* need 1395 * mmap_sem here 1396 */ 1397 mlock_vma_page(page); 1398 } 1399 ret = false; 1400 page_vma_mapped_walk_done(&pvmw); 1401 break; 1402 } 1403 if (flags & TTU_MUNLOCK) 1404 continue; 1405 } 1406 1407 /* Unexpected PMD-mapped THP? */ 1408 VM_BUG_ON_PAGE(!pvmw.pte, page); 1409 1410 subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); 1411 address = pvmw.address; 1412 1413 1414 if (IS_ENABLED(CONFIG_MIGRATION) && 1415 (flags & TTU_MIGRATION) && 1416 is_zone_device_page(page)) { 1417 swp_entry_t entry; 1418 pte_t swp_pte; 1419 1420 pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte); 1421 1422 /* 1423 * Store the pfn of the page in a special migration 1424 * pte. do_swap_page() will wait until the migration 1425 * pte is removed and then restart fault handling. 1426 */ 1427 entry = make_migration_entry(page, 0); 1428 swp_pte = swp_entry_to_pte(entry); 1429 if (pte_soft_dirty(pteval)) 1430 swp_pte = pte_swp_mksoft_dirty(swp_pte); 1431 set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); 1432 /* 1433 * No need to invalidate here it will synchronize on 1434 * against the special swap migration pte. 1435 */ 1436 goto discard; 1437 } 1438 1439 if (!(flags & TTU_IGNORE_ACCESS)) { 1440 if (ptep_clear_flush_young_notify(vma, address, 1441 pvmw.pte)) { 1442 ret = false; 1443 page_vma_mapped_walk_done(&pvmw); 1444 break; 1445 } 1446 } 1447 1448 /* Nuke the page table entry. 
*/ 1449 flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); 1450 if (should_defer_flush(mm, flags)) { 1451 /* 1452 * We clear the PTE but do not flush so potentially 1453 * a remote CPU could still be writing to the page. 1454 * If the entry was previously clean then the 1455 * architecture must guarantee that a clear->dirty 1456 * transition on a cached TLB entry is written through 1457 * and traps if the PTE is unmapped. 1458 */ 1459 pteval = ptep_get_and_clear(mm, address, pvmw.pte); 1460 1461 set_tlb_ubc_flush_pending(mm, pte_dirty(pteval)); 1462 } else { 1463 pteval = ptep_clear_flush(vma, address, pvmw.pte); 1464 } 1465 1466 /* Move the dirty bit to the page. Now the pte is gone. */ 1467 if (pte_dirty(pteval)) 1468 set_page_dirty(page); 1469 1470 /* Update high watermark before we lower rss */ 1471 update_hiwater_rss(mm); 1472 1473 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { 1474 pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); 1475 if (PageHuge(page)) { 1476 int nr = 1 << compound_order(page); 1477 hugetlb_count_sub(nr, mm); 1478 set_huge_swap_pte_at(mm, address, 1479 pvmw.pte, pteval, 1480 vma_mmu_pagesize(vma)); 1481 } else { 1482 dec_mm_counter(mm, mm_counter(page)); 1483 set_pte_at(mm, address, pvmw.pte, pteval); 1484 } 1485 1486 } else if (pte_unused(pteval)) { 1487 /* 1488 * The guest indicated that the page content is of no 1489 * interest anymore. Simply discard the pte, vmscan 1490 * will take care of the rest. 1491 */ 1492 dec_mm_counter(mm, mm_counter(page)); 1493 /* We have to invalidate as we cleared the pte */ 1494 mmu_notifier_invalidate_range(mm, address, 1495 address + PAGE_SIZE); 1496 } else if (IS_ENABLED(CONFIG_MIGRATION) && 1497 (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) { 1498 swp_entry_t entry; 1499 pte_t swp_pte; 1500 /* 1501 * Store the pfn of the page in a special migration 1502 * pte. do_swap_page() will wait until the migration 1503 * pte is removed and then restart fault handling. 1504 */ 1505 entry = make_migration_entry(subpage, 1506 pte_write(pteval)); 1507 swp_pte = swp_entry_to_pte(entry); 1508 if (pte_soft_dirty(pteval)) 1509 swp_pte = pte_swp_mksoft_dirty(swp_pte); 1510 set_pte_at(mm, address, pvmw.pte, swp_pte); 1511 /* 1512 * No need to invalidate here it will synchronize on 1513 * against the special swap migration pte. 1514 */ 1515 } else if (PageAnon(page)) { 1516 swp_entry_t entry = { .val = page_private(subpage) }; 1517 pte_t swp_pte; 1518 /* 1519 * Store the swap location in the pte. 1520 * See handle_pte_fault() ... 1521 */ 1522 if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) { 1523 WARN_ON_ONCE(1); 1524 ret = false; 1525 /* We have to invalidate as we cleared the pte */ 1526 mmu_notifier_invalidate_range(mm, address, 1527 address + PAGE_SIZE); 1528 page_vma_mapped_walk_done(&pvmw); 1529 break; 1530 } 1531 1532 /* MADV_FREE page check */ 1533 if (!PageSwapBacked(page)) { 1534 if (!PageDirty(page)) { 1535 /* Invalidate as we cleared the pte */ 1536 mmu_notifier_invalidate_range(mm, 1537 address, address + PAGE_SIZE); 1538 dec_mm_counter(mm, MM_ANONPAGES); 1539 goto discard; 1540 } 1541 1542 /* 1543 * If the page was redirtied, it cannot be 1544 * discarded. Remap the page to page table. 
				 */
				set_pte_at(mm, address, pvmw.pte, pteval);
				SetPageSwapBacked(page);
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}

			if (swap_duplicate(entry) < 0) {
				set_pte_at(mm, address, pvmw.pte, pteval);
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}
			if (list_empty(&mm->mmlist)) {
				spin_lock(&mmlist_lock);
				if (list_empty(&mm->mmlist))
					list_add(&mm->mmlist, &init_mm.mmlist);
				spin_unlock(&mmlist_lock);
			}
			dec_mm_counter(mm, MM_ANONPAGES);
			inc_mm_counter(mm, MM_SWAPENTS);
			swp_pte = swp_entry_to_pte(entry);
			if (pte_soft_dirty(pteval))
				swp_pte = pte_swp_mksoft_dirty(swp_pte);
			set_pte_at(mm, address, pvmw.pte, swp_pte);
			/* Invalidate as we cleared the pte */
			mmu_notifier_invalidate_range(mm, address,
						      address + PAGE_SIZE);
		} else {
			/*
			 * We should not need to notify here as we reach this
			 * case only from freeze_page(), which itself is only
			 * called from split_huge_page_to_list(), so everything
			 * below must be true:
			 * - page is not anonymous
			 * - page is locked
			 *
			 * Because it is a locked file-backed page, it cannot
			 * be removed from the page cache and replaced by a new
			 * page before mmu_notifier_invalidate_range_end(), so
			 * no concurrent thread can update its page table to
			 * point at a new page while a device is still using
			 * this page.
			 *
			 * See Documentation/vm/mmu_notifier.txt
			 */
			dec_mm_counter(mm, mm_counter_file(page));
		}
discard:
		/*
		 * No need to call mmu_notifier_invalidate_range(); it has been
		 * done above for all cases requiring it to happen under the
		 * page table lock, before mmu_notifier_invalidate_range_end().
		 *
		 * See Documentation/vm/mmu_notifier.txt
		 */
		page_remove_rmap(subpage, PageHuge(page));
		put_page(page);
	}

	mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);

	return ret;
}

bool is_vma_temporary_stack(struct vm_area_struct *vma)
{
	int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);

	if (!maybe_stack)
		return false;

	if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
						VM_STACK_INCOMPLETE_SETUP)
		return true;

	return false;
}

static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
{
	return is_vma_temporary_stack(vma);
}

static int page_mapcount_is_zero(struct page *page)
{
	return !total_mapcount(page);
}

/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 * @flags: action and flags
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path. Caller must hold the page lock.
 *
 * If unmap is successful, return true. Otherwise, false.
 */
bool try_to_unmap(struct page *page, enum ttu_flags flags)
{
	struct rmap_walk_control rwc = {
		.rmap_one = try_to_unmap_one,
		.arg = (void *)flags,
		.done = page_mapcount_is_zero,
		.anon_lock = page_lock_anon_vma_read,
	};

	/*
	 * During exec, a temporary VMA is set up and later moved.
	 * The VMA is moved under the anon_vma lock but not the
	 * page tables leading to a race where migration cannot
	 * find the migration ptes.
Rather than increasing the 1659 * locking requirements of exec(), migration skips 1660 * temporary VMAs until after exec() completes. 1661 */ 1662 if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE)) 1663 && !PageKsm(page) && PageAnon(page)) 1664 rwc.invalid_vma = invalid_migration_vma; 1665 1666 if (flags & TTU_RMAP_LOCKED) 1667 rmap_walk_locked(page, &rwc); 1668 else 1669 rmap_walk(page, &rwc); 1670 1671 return !page_mapcount(page) ? true : false; 1672 } 1673 1674 static int page_not_mapped(struct page *page) 1675 { 1676 return !page_mapped(page); 1677 }; 1678 1679 /** 1680 * try_to_munlock - try to munlock a page 1681 * @page: the page to be munlocked 1682 * 1683 * Called from munlock code. Checks all of the VMAs mapping the page 1684 * to make sure nobody else has this page mlocked. The page will be 1685 * returned with PG_mlocked cleared if no other vmas have it mlocked. 1686 */ 1687 1688 void try_to_munlock(struct page *page) 1689 { 1690 struct rmap_walk_control rwc = { 1691 .rmap_one = try_to_unmap_one, 1692 .arg = (void *)TTU_MUNLOCK, 1693 .done = page_not_mapped, 1694 .anon_lock = page_lock_anon_vma_read, 1695 1696 }; 1697 1698 VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page); 1699 VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); 1700 1701 rmap_walk(page, &rwc); 1702 } 1703 1704 void __put_anon_vma(struct anon_vma *anon_vma) 1705 { 1706 struct anon_vma *root = anon_vma->root; 1707 1708 anon_vma_free(anon_vma); 1709 if (root != anon_vma && atomic_dec_and_test(&root->refcount)) 1710 anon_vma_free(root); 1711 } 1712 1713 static struct anon_vma *rmap_walk_anon_lock(struct page *page, 1714 struct rmap_walk_control *rwc) 1715 { 1716 struct anon_vma *anon_vma; 1717 1718 if (rwc->anon_lock) 1719 return rwc->anon_lock(page); 1720 1721 /* 1722 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() 1723 * because that depends on page_mapped(); but not all its usages 1724 * are holding mmap_sem. Users without mmap_sem are required to 1725 * take a reference count to prevent the anon_vma disappearing 1726 */ 1727 anon_vma = page_anon_vma(page); 1728 if (!anon_vma) 1729 return NULL; 1730 1731 anon_vma_lock_read(anon_vma); 1732 return anon_vma; 1733 } 1734 1735 /* 1736 * rmap_walk_anon - do something to anonymous page using the object-based 1737 * rmap method 1738 * @page: the page to be handled 1739 * @rwc: control variable according to each walk type 1740 * 1741 * Find all the mappings of a page using the mapping pointer and the vma chains 1742 * contained in the anon_vma struct it points to. 1743 * 1744 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma 1745 * where the page was found will be held for write. So, we won't recheck 1746 * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1747 * LOCKED. 1748 */ 1749 static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, 1750 bool locked) 1751 { 1752 struct anon_vma *anon_vma; 1753 pgoff_t pgoff_start, pgoff_end; 1754 struct anon_vma_chain *avc; 1755 1756 if (locked) { 1757 anon_vma = page_anon_vma(page); 1758 /* anon_vma disappear under us? 
*/ 1759 VM_BUG_ON_PAGE(!anon_vma, page); 1760 } else { 1761 anon_vma = rmap_walk_anon_lock(page, rwc); 1762 } 1763 if (!anon_vma) 1764 return; 1765 1766 pgoff_start = page_to_pgoff(page); 1767 pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; 1768 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, 1769 pgoff_start, pgoff_end) { 1770 struct vm_area_struct *vma = avc->vma; 1771 unsigned long address = vma_address(page, vma); 1772 1773 cond_resched(); 1774 1775 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) 1776 continue; 1777 1778 if (!rwc->rmap_one(page, vma, address, rwc->arg)) 1779 break; 1780 if (rwc->done && rwc->done(page)) 1781 break; 1782 } 1783 1784 if (!locked) 1785 anon_vma_unlock_read(anon_vma); 1786 } 1787 1788 /* 1789 * rmap_walk_file - do something to file page using the object-based rmap method 1790 * @page: the page to be handled 1791 * @rwc: control variable according to each walk type 1792 * 1793 * Find all the mappings of a page using the mapping pointer and the vma chains 1794 * contained in the address_space struct it points to. 1795 * 1796 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma 1797 * where the page was found will be held for write. So, we won't recheck 1798 * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1799 * LOCKED. 1800 */ 1801 static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, 1802 bool locked) 1803 { 1804 struct address_space *mapping = page_mapping(page); 1805 pgoff_t pgoff_start, pgoff_end; 1806 struct vm_area_struct *vma; 1807 1808 /* 1809 * The page lock not only makes sure that page->mapping cannot 1810 * suddenly be NULLified by truncation, it makes sure that the 1811 * structure at mapping cannot be freed and reused yet, 1812 * so we can safely take mapping->i_mmap_rwsem. 1813 */ 1814 VM_BUG_ON_PAGE(!PageLocked(page), page); 1815 1816 if (!mapping) 1817 return; 1818 1819 pgoff_start = page_to_pgoff(page); 1820 pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; 1821 if (!locked) 1822 i_mmap_lock_read(mapping); 1823 vma_interval_tree_foreach(vma, &mapping->i_mmap, 1824 pgoff_start, pgoff_end) { 1825 unsigned long address = vma_address(page, vma); 1826 1827 cond_resched(); 1828 1829 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) 1830 continue; 1831 1832 if (!rwc->rmap_one(page, vma, address, rwc->arg)) 1833 goto done; 1834 if (rwc->done && rwc->done(page)) 1835 goto done; 1836 } 1837 1838 done: 1839 if (!locked) 1840 i_mmap_unlock_read(mapping); 1841 } 1842 1843 void rmap_walk(struct page *page, struct rmap_walk_control *rwc) 1844 { 1845 if (unlikely(PageKsm(page))) 1846 rmap_walk_ksm(page, rwc); 1847 else if (PageAnon(page)) 1848 rmap_walk_anon(page, rwc, false); 1849 else 1850 rmap_walk_file(page, rwc, false); 1851 } 1852 1853 /* Like rmap_walk, but caller holds relevant rmap lock */ 1854 void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) 1855 { 1856 /* no ksm support for now */ 1857 VM_BUG_ON_PAGE(PageKsm(page), page); 1858 if (PageAnon(page)) 1859 rmap_walk_anon(page, rwc, true); 1860 else 1861 rmap_walk_file(page, rwc, true); 1862 } 1863 1864 #ifdef CONFIG_HUGETLB_PAGE 1865 /* 1866 * The following three functions are for anonymous (private mapped) hugepages. 1867 * Unlike common anonymous pages, anonymous hugepages have no accounting code 1868 * and no lru code, because we handle hugepages differently from common pages. 
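 * Their mapcount is tracked only in compound_mapcount of the head page;
 * the per-subpage _mapcount is not used for hugetlb pages (see also the
 * PageHuge() handling in page_remove_file_rmap() above).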
1869 */ 1870 static void __hugepage_set_anon_rmap(struct page *page, 1871 struct vm_area_struct *vma, unsigned long address, int exclusive) 1872 { 1873 struct anon_vma *anon_vma = vma->anon_vma; 1874 1875 BUG_ON(!anon_vma); 1876 1877 if (PageAnon(page)) 1878 return; 1879 if (!exclusive) 1880 anon_vma = anon_vma->root; 1881 1882 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 1883 page->mapping = (struct address_space *) anon_vma; 1884 page->index = linear_page_index(vma, address); 1885 } 1886 1887 void hugepage_add_anon_rmap(struct page *page, 1888 struct vm_area_struct *vma, unsigned long address) 1889 { 1890 struct anon_vma *anon_vma = vma->anon_vma; 1891 int first; 1892 1893 BUG_ON(!PageLocked(page)); 1894 BUG_ON(!anon_vma); 1895 /* address might be in next vma when migration races vma_adjust */ 1896 first = atomic_inc_and_test(compound_mapcount_ptr(page)); 1897 if (first) 1898 __hugepage_set_anon_rmap(page, vma, address, 0); 1899 } 1900 1901 void hugepage_add_new_anon_rmap(struct page *page, 1902 struct vm_area_struct *vma, unsigned long address) 1903 { 1904 BUG_ON(address < vma->vm_start || address >= vma->vm_end); 1905 atomic_set(compound_mapcount_ptr(page), 0); 1906 __hugepage_set_anon_rmap(page, vma, address, 1); 1907 } 1908 #endif /* CONFIG_HUGETLB_PAGE */ 1909
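
/*
 * Example rmap_walk_control usage (an illustrative sketch only; count_one()
 * and count_mappings() are hypothetical helpers, not part of the kernel):
 * count the VMAs that currently map a page, in the same style as
 * page_referenced() and page_mkclean() above. Returning true from the
 * rmap_one callback keeps the walk going.
 *
 *	static bool count_one(struct page *page, struct vm_area_struct *vma,
 *			      unsigned long address, void *arg)
 *	{
 *		(*(int *)arg)++;
 *		return true;
 *	}
 *
 *	static int count_mappings(struct page *page)
 *	{
 *		int count = 0;
 *		struct rmap_walk_control rwc = {
 *			.rmap_one = count_one,
 *			.arg = &count,
 *			.anon_lock = page_lock_anon_vma_read,
 *		};
 *
 *		rmap_walk(page, &rwc);
 *		return count;
 *	}
 */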