/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead reverse mapping scheme.
 * Please try to keep this thing as modular as possible.
 *
 * Provides methods for unmapping each kind of mapped page:
 * the anon methods track anonymous pages, and
 * the file methods track pages belonging to an inode.
 *
 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
 * Contributions by Hugh Dickins 2003, 2004
 */

/*
 * Lock ordering in mm:
 *
 * inode->i_mutex	(while writing or truncating, not reading or faulting)
 *   inode->i_alloc_sem (vmtruncate_range)
 *   mm->mmap_sem
 *     page->flags PG_locked (lock_page)
 *       mapping->i_mmap_lock
 *         anon_vma->lock
 *           mm->page_table_lock or pte_lock
 *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
 *             swap_lock (in swap_duplicate, swap_info_get)
 *               mmlist_lock (in mmput, drain_mmlist and others)
 *               mapping->private_lock (in __set_page_dirty_buffers)
 *               inode_lock (in set_page_dirty's __mark_inode_dirty)
 *                 sb_lock (within inode_lock in fs/fs-writeback.c)
 *                 mapping->tree_lock (widely used, in set_page_dirty,
 *                           in arch-dependent flush_dcache_mmap_lock,
 *                           within inode_lock in __sync_single_inode)
 *
 * (code doesn't rely on that order so it could be switched around)
 * ->tasklist_lock
 *   anon_vma->lock (memory_failure, collect_procs_anon)
 *     pte map lock
 */
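
/*
 * One concrete illustration of that ordering, taken from the unmap path in
 * this file (a sketch only, not an exhaustive list of lock sites): the
 * pageout code locks the page before calling try_to_unmap(); for a file
 * page, try_to_unmap_file() then takes mapping->i_mmap_lock (for an anon
 * page, try_to_unmap_anon() takes anon_vma->lock instead); finally
 * page_check_address() takes the pte lock.  Each lock nests strictly inside
 * the one acquired before it:
 *
 *	lock_page(page);
 *	spin_lock(&mapping->i_mmap_lock);	(or anon_vma->lock)
 *	pte = page_check_address(page, mm, address, &ptl, 0);
 */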

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/module.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>

#include <asm/tlbflush.h>

#include "internal.h"

static struct kmem_cache *anon_vma_cachep;

static inline struct anon_vma *anon_vma_alloc(void)
{
	return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
}

void anon_vma_free(struct anon_vma *anon_vma)
{
	kmem_cache_free(anon_vma_cachep, anon_vma);
}

/**
 * anon_vma_prepare - attach an anon_vma to a memory region
 * @vma: the memory region in question
 *
 * This makes sure the memory mapping described by 'vma' has
 * an 'anon_vma' attached to it, so that we can associate the
 * anonymous pages mapped into it with that anon_vma.
 *
 * The common case will be that we already have one, but if
 * not we either need to find an adjacent mapping that we
 * can re-use the anon_vma from (very common when the only
 * reason for splitting a vma has been mprotect()), or we
 * allocate a new one.
 *
 * Anon-vma allocations are very subtle, because we may have
 * optimistically looked up an anon_vma in page_lock_anon_vma()
 * and that may actually touch the spinlock even in the newly
 * allocated vma (it depends on RCU to make sure that the
 * anon_vma isn't actually destroyed).
 *
 * As a result, we need to do proper anon_vma locking even
 * for the new allocation. At the same time, we do not want
 * to do any locking for the common case of already having
 * an anon_vma.
 *
 * This must be called with the mmap_sem held for reading.
 */
int anon_vma_prepare(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	might_sleep();
	if (unlikely(!anon_vma)) {
		struct mm_struct *mm = vma->vm_mm;
		struct anon_vma *allocated;

		anon_vma = find_mergeable_anon_vma(vma);
		allocated = NULL;
		if (!anon_vma) {
			anon_vma = anon_vma_alloc();
			if (unlikely(!anon_vma))
				return -ENOMEM;
			allocated = anon_vma;
		}
		spin_lock(&anon_vma->lock);

		/* page_table_lock to protect against threads */
		spin_lock(&mm->page_table_lock);
		if (likely(!vma->anon_vma)) {
			vma->anon_vma = anon_vma;
			list_add_tail(&vma->anon_vma_node, &anon_vma->head);
			allocated = NULL;
		}
		spin_unlock(&mm->page_table_lock);

		spin_unlock(&anon_vma->lock);
		if (unlikely(allocated))
			anon_vma_free(allocated);
	}
	return 0;
}
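
/*
 * Sketch of the usual caller, for context only: the anonymous fault path in
 * mm/memory.c prepares the anon_vma before it installs the new pte, roughly
 * as below (details and exact helper names may differ from this sketch):
 *
 *	if (unlikely(anon_vma_prepare(vma)))
 *		return VM_FAULT_OOM;
 *	page = alloc_zeroed_user_highpage_movable(vma, address);
 *	...
 *	page_add_new_anon_rmap(page, vma, address);
 *	set_pte_at(mm, address, page_table, entry);
 */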

void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
{
	BUG_ON(vma->anon_vma != next->anon_vma);
	list_del(&next->anon_vma_node);
}

void __anon_vma_link(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	if (anon_vma)
		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
}

void anon_vma_link(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	if (anon_vma) {
		spin_lock(&anon_vma->lock);
		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
		spin_unlock(&anon_vma->lock);
	}
}

void anon_vma_unlink(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;
	int empty;

	if (!anon_vma)
		return;

	spin_lock(&anon_vma->lock);
	list_del(&vma->anon_vma_node);

	/* We must garbage collect the anon_vma if it's empty */
	empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma);
	spin_unlock(&anon_vma->lock);

	if (empty)
		anon_vma_free(anon_vma);
}

static void anon_vma_ctor(void *data)
{
	struct anon_vma *anon_vma = data;

	spin_lock_init(&anon_vma->lock);
	ksm_refcount_init(anon_vma);
	INIT_LIST_HEAD(&anon_vma->head);
}

void __init anon_vma_init(void)
{
	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
}

/*
 * Getting a lock on a stable anon_vma from a page off the LRU is
 * tricky: page_lock_anon_vma relies on RCU to guard against the races.
 */
struct anon_vma *page_lock_anon_vma(struct page *page)
{
	struct anon_vma *anon_vma;
	unsigned long anon_mapping;

	rcu_read_lock();
	anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		goto out;
	if (!page_mapped(page))
		goto out;

	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
	spin_lock(&anon_vma->lock);
	return anon_vma;
out:
	rcu_read_unlock();
	return NULL;
}

void page_unlock_anon_vma(struct anon_vma *anon_vma)
{
	spin_unlock(&anon_vma->lock);
	rcu_read_unlock();
}

/*
 * At what user virtual address is page expected in @vma?
 * Returns virtual address or -EFAULT if page's index/offset is not
 * within the range mapped by the @vma.
 */
static inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
{
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	unsigned long address;

	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
		/* page should be within @vma mapping range */
		return -EFAULT;
	}
	return address;
}
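
/*
 * Worked example of the arithmetic above (numbers chosen for illustration,
 * assuming 4K pages): a page with page->index == 3, mapped by a vma with
 * vm_pgoff == 1 and vm_start == 0x08048000, is expected at
 *
 *	0x08048000 + ((3 - 1) << 12) == 0x0804a000
 *
 * If that address falls outside [vm_start, vm_end), the vma does not map
 * this page and -EFAULT is returned instead.
 */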

/*
 * At what user virtual address is page expected in vma?
 * checking that the page matches the vma.
 */
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
	if (PageAnon(page)) {
		if (vma->anon_vma != page_anon_vma(page))
			return -EFAULT;
	} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
		if (!vma->vm_file ||
		    vma->vm_file->f_mapping != page->mapping)
			return -EFAULT;
	} else
		return -EFAULT;
	return vma_address(page, vma);
}

/*
 * Check that @page is mapped at @address into @mm.
 *
 * If @sync is false, page_check_address may perform a racy check to avoid
 * the page table lock when the pte is not present (helpful when reclaiming
 * highly shared pages).
 *
 * On success returns with pte mapped and locked.
 */
pte_t *page_check_address(struct page *page, struct mm_struct *mm,
			  unsigned long address, spinlock_t **ptlp, int sync)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		return NULL;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return NULL;

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return NULL;

	pte = pte_offset_map(pmd, address);
	/* Make a quick check before getting the lock */
	if (!sync && !pte_present(*pte)) {
		pte_unmap(pte);
		return NULL;
	}

	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
		*ptlp = ptl;
		return pte;
	}
	pte_unmap_unlock(pte, ptl);
	return NULL;
}

/**
 * page_mapped_in_vma - check whether a page is really mapped in a VMA
 * @page: the page to test
 * @vma: the VMA to test
 *
 * Returns 1 if the page is mapped into the page tables of the VMA, 0
 * if the page is not mapped into the page tables of this VMA.  Only
 * valid for normal file or anonymous VMAs.
 */
int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
{
	unsigned long address;
	pte_t *pte;
	spinlock_t *ptl;

	address = vma_address(page, vma);
	if (address == -EFAULT)		/* out of vma range */
		return 0;
	pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
	if (!pte)			/* the page is not in this mm */
		return 0;
	pte_unmap_unlock(pte, ptl);

	return 1;
}

/*
 * Subfunctions of page_referenced: page_referenced_one called
 * repeatedly from either page_referenced_anon or page_referenced_file.
 */
int page_referenced_one(struct page *page, struct vm_area_struct *vma,
			unsigned long address, unsigned int *mapcount,
			unsigned long *vm_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *pte;
	spinlock_t *ptl;
	int referenced = 0;

	pte = page_check_address(page, mm, address, &ptl, 0);
	if (!pte)
		goto out;

	/*
	 * Don't want to elevate referenced for mlocked page that gets this far,
	 * in order that it progresses to try_to_unmap and is moved to the
	 * unevictable list.
	 */
	if (vma->vm_flags & VM_LOCKED) {
		*mapcount = 1;	/* break early from loop */
		*vm_flags |= VM_LOCKED;
		goto out_unmap;
	}

	if (ptep_clear_flush_young_notify(vma, address, pte)) {
		/*
		 * Don't treat a reference through a sequentially read
		 * mapping as such.  If the page has been used in
		 * another mapping, we will catch it; if this other
		 * mapping is already gone, the unmap path will have
		 * set PG_referenced or activated the page.
		 */
		if (likely(!VM_SequentialReadHint(vma)))
			referenced++;
	}

	/* Pretend the page is referenced if the task has the
	   swap token and is in the middle of a page fault. */
	if (mm != current->mm && has_swap_token(mm) &&
			rwsem_is_locked(&mm->mmap_sem))
		referenced++;

out_unmap:
	(*mapcount)--;
	pte_unmap_unlock(pte, ptl);

	if (referenced)
		*vm_flags |= vma->vm_flags;
out:
	return referenced;
}

static int page_referenced_anon(struct page *page,
				struct mem_cgroup *mem_cont,
				unsigned long *vm_flags)
{
	unsigned int mapcount;
	struct anon_vma *anon_vma;
	struct vm_area_struct *vma;
	int referenced = 0;

	anon_vma = page_lock_anon_vma(page);
	if (!anon_vma)
		return referenced;

	mapcount = page_mapcount(page);
	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
		unsigned long address = vma_address(page, vma);
		if (address == -EFAULT)
			continue;
		/*
		 * If we are reclaiming on behalf of a cgroup, skip
		 * counting on behalf of references from different
		 * cgroups
		 */
		if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
			continue;
		referenced += page_referenced_one(page, vma, address,
						  &mapcount, vm_flags);
		if (!mapcount)
			break;
	}

	page_unlock_anon_vma(anon_vma);
	return referenced;
}

/**
 * page_referenced_file - referenced check for object-based rmap
 * @page: the page we're checking references on.
 * @mem_cont: target memory controller
 * @vm_flags: collect encountered vma->vm_flags that actually referenced the page
 *
 * For an object-based mapped page, find all the places it is mapped and
 * check/clear the referenced flag.  This is done by following the page->mapping
 * pointer, then walking the chain of vmas it holds.  It returns the number
 * of references it found.
 *
 * This function is only called from page_referenced for object-based pages.
 */
static int page_referenced_file(struct page *page,
				struct mem_cgroup *mem_cont,
				unsigned long *vm_flags)
{
	unsigned int mapcount;
	struct address_space *mapping = page->mapping;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int referenced = 0;

	/*
	 * The caller's checks on page->mapping and !PageAnon have made
	 * sure that this is a file page: the check for page->mapping
	 * excludes the case just before it gets set on an anon page.
	 */
	BUG_ON(PageAnon(page));

	/*
	 * The page lock not only makes sure that page->mapping cannot
	 * suddenly be NULLified by truncation, it makes sure that the
	 * structure at mapping cannot be freed and reused yet,
	 * so we can safely take mapping->i_mmap_lock.
	 */
	BUG_ON(!PageLocked(page));

	spin_lock(&mapping->i_mmap_lock);

	/*
	 * i_mmap_lock does not stabilize mapcount at all, but mapcount
	 * is more likely to be accurate if we note it after spinning.
	 */
	mapcount = page_mapcount(page);

	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		unsigned long address = vma_address(page, vma);
		if (address == -EFAULT)
			continue;
		/*
		 * If we are reclaiming on behalf of a cgroup, skip
		 * counting on behalf of references from different
		 * cgroups
		 */
		if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
			continue;
		referenced += page_referenced_one(page, vma, address,
						  &mapcount, vm_flags);
		if (!mapcount)
			break;
	}

	spin_unlock(&mapping->i_mmap_lock);
	return referenced;
}

/**
 * page_referenced - test if the page was referenced
 * @page: the page to test
 * @is_locked: caller holds lock on the page
 * @mem_cont: target memory controller
 * @vm_flags: collect encountered vma->vm_flags that actually referenced the page
 *
 * Quick test_and_clear_referenced for all mappings to a page,
 * returns the number of ptes which referenced the page.
 */
int page_referenced(struct page *page,
		    int is_locked,
		    struct mem_cgroup *mem_cont,
		    unsigned long *vm_flags)
{
	int referenced = 0;
	int we_locked = 0;

	if (TestClearPageReferenced(page))
		referenced++;

	*vm_flags = 0;
	if (page_mapped(page) && page_rmapping(page)) {
		if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
			we_locked = trylock_page(page);
			if (!we_locked) {
				referenced++;
				goto out;
			}
		}
		if (unlikely(PageKsm(page)))
			referenced += page_referenced_ksm(page, mem_cont,
								vm_flags);
		else if (PageAnon(page))
			referenced += page_referenced_anon(page, mem_cont,
								vm_flags);
		else if (page->mapping)
			referenced += page_referenced_file(page, mem_cont,
								vm_flags);
		if (we_locked)
			unlock_page(page);
	}
out:
	if (page_test_and_clear_young(page))
		referenced++;

	return referenced;
}
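
/*
 * Rough sketch of how the reclaim side consumes this (the real logic lives
 * in mm/vmscan.c; this is illustrative and simplified):
 *
 *	unsigned long vm_flags;
 *	int referenced = page_referenced(page, 1, NULL, &vm_flags);
 *
 * A nonzero count typically earns the page another trip around the LRU,
 * while VM_LOCKED in the returned vm_flags marks it as a candidate for the
 * unevictable list rather than for pageout.
 */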

static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
			    unsigned long address)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *pte;
	spinlock_t *ptl;
	int ret = 0;

	pte = page_check_address(page, mm, address, &ptl, 1);
	if (!pte)
		goto out;

	if (pte_dirty(*pte) || pte_write(*pte)) {
		pte_t entry;

		flush_cache_page(vma, address, pte_pfn(*pte));
		entry = ptep_clear_flush_notify(vma, address, pte);
		entry = pte_wrprotect(entry);
		entry = pte_mkclean(entry);
		set_pte_at(mm, address, pte, entry);
		ret = 1;
	}

	pte_unmap_unlock(pte, ptl);
out:
	return ret;
}

static int page_mkclean_file(struct address_space *mapping, struct page *page)
{
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int ret = 0;

	BUG_ON(PageAnon(page));

	spin_lock(&mapping->i_mmap_lock);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		if (vma->vm_flags & VM_SHARED) {
			unsigned long address = vma_address(page, vma);
			if (address == -EFAULT)
				continue;
			ret += page_mkclean_one(page, vma, address);
		}
	}
	spin_unlock(&mapping->i_mmap_lock);
	return ret;
}

int page_mkclean(struct page *page)
{
	int ret = 0;

	BUG_ON(!PageLocked(page));

	if (page_mapped(page)) {
		struct address_space *mapping = page_mapping(page);
		if (mapping) {
			ret = page_mkclean_file(mapping, page);
			if (page_test_dirty(page)) {
				page_clear_dirty(page);
				ret = 1;
			}
		}
	}

	return ret;
}
EXPORT_SYMBOL_GPL(page_mkclean);

/**
 * __page_set_anon_rmap - setup new anonymous rmap
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 */
static void __page_set_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	BUG_ON(!anon_vma);
	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	page->mapping = (struct address_space *) anon_vma;
	page->index = linear_page_index(vma, address);
}
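
/*
 * PAGE_MAPPING_ANON is the low bit of page->mapping: setting it above tags
 * the pointer as an anon_vma rather than an address_space.  A reader undoes
 * the tagging the way page_lock_anon_vma() does earlier in this file (shown
 * again here only as a reminder):
 *
 *	unsigned long anon_mapping = (unsigned long) page->mapping;
 *
 *	if ((anon_mapping & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_ANON)
 *		anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
 */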

/**
 * __page_check_anon_rmap - sanity check anonymous rmap addition
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 */
static void __page_check_anon_rmap(struct page *page,
	struct vm_area_struct *vma,
	unsigned long address)
{
#ifdef CONFIG_DEBUG_VM
	/*
	 * The page's anon-rmap details (mapping and index) are guaranteed to
	 * be set up correctly at this point.
	 *
	 * We have exclusion against page_add_anon_rmap because the caller
	 * always holds the page locked, except if called from page_dup_rmap,
	 * in which case the page is already known to be setup.
	 *
	 * We have exclusion against page_add_new_anon_rmap because those pages
	 * are initially only visible via the pagetables, and the pte is locked
	 * over the call to page_add_new_anon_rmap.
	 */
	struct anon_vma *anon_vma = vma->anon_vma;
	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	BUG_ON(page->mapping != (struct address_space *)anon_vma);
	BUG_ON(page->index != linear_page_index(vma, address));
#endif
}

/**
 * page_add_anon_rmap - add pte mapping to an anonymous page
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 *
 * The caller needs to hold the pte lock, and the page must be locked in
 * the anon_vma case: to serialize mapping,index checking after setting,
 * and to ensure that PageAnon is not being upgraded racily to PageKsm
 * (but PageKsm is never downgraded to PageAnon).
 */
void page_add_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
	int first = atomic_inc_and_test(&page->_mapcount);
	if (first)
		__inc_zone_page_state(page, NR_ANON_PAGES);
	if (unlikely(PageKsm(page)))
		return;

	VM_BUG_ON(!PageLocked(page));
	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
	if (first)
		__page_set_anon_rmap(page, vma, address);
	else
		__page_check_anon_rmap(page, vma, address);
}

/**
 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 *
 * Same as page_add_anon_rmap but must only be called on *new* pages.
 * This means the inc-and-test can be bypassed.
 * Page does not have to be locked.
 */
void page_add_new_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
	SetPageSwapBacked(page);
	atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
	__inc_zone_page_state(page, NR_ANON_PAGES);
	__page_set_anon_rmap(page, vma, address);
	if (page_evictable(page, vma))
		lru_cache_add_lru(page, LRU_ACTIVE_ANON);
	else
		add_page_to_unevictable_list(page);
}

/**
 * page_add_file_rmap - add pte mapping to a file page
 * @page: the page to add the mapping to
 *
 * The caller needs to hold the pte lock.
 */
void page_add_file_rmap(struct page *page)
{
	if (atomic_inc_and_test(&page->_mapcount)) {
		__inc_zone_page_state(page, NR_FILE_MAPPED);
		mem_cgroup_update_file_mapped(page, 1);
	}
}

/**
 * page_remove_rmap - take down pte mapping from a page
 * @page: page to remove mapping from
 *
 * The caller needs to hold the pte lock.
 */
void page_remove_rmap(struct page *page)
{
	/* page still mapped by someone else? */
	if (!atomic_add_negative(-1, &page->_mapcount))
		return;

	/*
	 * Now that the last pte has gone, s390 must transfer dirty
	 * flag from storage key to struct page.  We can usually skip
	 * this if the page is anon, so about to be freed; but perhaps
	 * not if it's in swapcache - there might be another pte slot
	 * containing the swap entry, but page not yet written to swap.
	 */
	if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) {
		page_clear_dirty(page);
		set_page_dirty(page);
	}
	if (PageAnon(page)) {
		mem_cgroup_uncharge_page(page);
		__dec_zone_page_state(page, NR_ANON_PAGES);
	} else {
		__dec_zone_page_state(page, NR_FILE_MAPPED);
		mem_cgroup_update_file_mapped(page, -1);
	}
	/*
	 * It would be tidy to reset the PageAnon mapping here,
	 * but that might overwrite a racing page_add_anon_rmap
	 * which increments mapcount after us but sets mapping
	 * before us: so leave the reset to free_hot_cold_page,
	 * and remember that it's only reliable while mapped.
	 * Leaving it set also helps swapoff to reinstate ptes
	 * faster for those pages still in swapcache.
	 */
}
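
/*
 * Worked example of the _mapcount accounting used by the helpers above:
 * _mapcount starts at -1 for an unmapped page, so
 *
 *	atomic_inc_and_test()		-1 -> 0		first mapping: stats up
 *	atomic_inc_and_test()		 0 -> 1		extra mapping: no-op
 *	atomic_add_negative(-1)		 1 -> 0		still mapped: no-op
 *	atomic_add_negative(-1)		 0 -> -1	last unmap: stats down
 *
 * and the zone/memcg counters are therefore touched exactly once per page
 * in each direction, however many ptes map it.
 */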

/*
 * Subfunctions of try_to_unmap: try_to_unmap_one called
 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
 */
int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
		     unsigned long address, enum ttu_flags flags)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *pte;
	pte_t pteval;
	spinlock_t *ptl;
	int ret = SWAP_AGAIN;

	pte = page_check_address(page, mm, address, &ptl, 0);
	if (!pte)
		goto out;

	/*
	 * If the page is mlock()d, we cannot swap it out.
	 * If it's recently referenced (perhaps page_referenced
	 * skipped over this mm) then we should reactivate it.
	 */
	if (!(flags & TTU_IGNORE_MLOCK)) {
		if (vma->vm_flags & VM_LOCKED)
			goto out_mlock;

		if (TTU_ACTION(flags) == TTU_MUNLOCK)
			goto out_unmap;
	}
	if (!(flags & TTU_IGNORE_ACCESS)) {
		if (ptep_clear_flush_young_notify(vma, address, pte)) {
			ret = SWAP_FAIL;
			goto out_unmap;
		}
	}

	/* Nuke the page table entry. */
	flush_cache_page(vma, address, page_to_pfn(page));
	pteval = ptep_clear_flush_notify(vma, address, pte);

	/* Move the dirty bit to the physical page now the pte is gone. */
	if (pte_dirty(pteval))
		set_page_dirty(page);

	/* Update high watermark before we lower rss */
	update_hiwater_rss(mm);

	if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
		if (PageAnon(page))
			dec_mm_counter(mm, anon_rss);
		else
			dec_mm_counter(mm, file_rss);
		set_pte_at(mm, address, pte,
			   swp_entry_to_pte(make_hwpoison_entry(page)));
	} else if (PageAnon(page)) {
		swp_entry_t entry = { .val = page_private(page) };

		if (PageSwapCache(page)) {
			/*
			 * Store the swap location in the pte.
			 * See handle_pte_fault() ...
			 */
			if (swap_duplicate(entry) < 0) {
				set_pte_at(mm, address, pte, pteval);
				ret = SWAP_FAIL;
				goto out_unmap;
			}
			if (list_empty(&mm->mmlist)) {
				spin_lock(&mmlist_lock);
				if (list_empty(&mm->mmlist))
					list_add(&mm->mmlist, &init_mm.mmlist);
				spin_unlock(&mmlist_lock);
			}
			dec_mm_counter(mm, anon_rss);
		} else if (PAGE_MIGRATION) {
			/*
			 * Store the pfn of the page in a special migration
			 * pte. do_swap_page() will wait until the migration
			 * pte is removed and then restart fault handling.
			 */
			BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
			entry = make_migration_entry(page, pte_write(pteval));
		}
		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
		BUG_ON(pte_file(*pte));
	} else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) {
		/* Establish migration entry for a file page */
		swp_entry_t entry;
		entry = make_migration_entry(page, pte_write(pteval));
		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
	} else
		dec_mm_counter(mm, file_rss);

	page_remove_rmap(page);
	page_cache_release(page);

out_unmap:
	pte_unmap_unlock(pte, ptl);
out:
	return ret;

out_mlock:
	pte_unmap_unlock(pte, ptl);

	/*
	 * We need mmap_sem locking here: without it the VM_LOCKED check is
	 * racy and its result unstable.  And we cannot block here, because
	 * we already hold anon_vma->lock or mapping->i_mmap_lock.  If the
	 * trylock fails, the page stays on the evictable LRU; vmscan may
	 * retry later and move it to the unevictable LRU if it really is
	 * mlocked.
	 */
	if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
		if (vma->vm_flags & VM_LOCKED) {
			mlock_vma_page(page);
			ret = SWAP_MLOCK;
		}
		up_read(&vma->vm_mm->mmap_sem);
	}
	return ret;
}

/*
 * objrmap doesn't work for nonlinear VMAs because the assumption that
 * offset-into-file correlates with offset-into-virtual-addresses does not hold.
 * Consequently, given a particular page and its ->index, we cannot locate the
 * ptes which are mapping that page without an exhaustive linear search.
 *
 * So what this code does is a mini "virtual scan" of each nonlinear VMA which
 * maps the file to which the target page belongs.  The ->vm_private_data field
 * holds the current cursor into that scan.  Successive searches will circulate
 * around the vma's virtual address space.
 *
 * So as more replacement pressure is applied to the pages in a nonlinear VMA,
 * more scanning pressure is placed against them as well.  Eventually pages
 * will become fully unmapped and are eligible for eviction.
 *
 * For very sparsely populated VMAs this is a little inefficient - chances are
 * there won't be many ptes located within the scan cluster.  In this case
 * maybe we could scan further - to the end of the pte page, perhaps.
 *
 * Mlocked pages:  check VM_LOCKED under mmap_sem held for read, if we can
 * acquire it without blocking.  If vma locked, mlock the pages in the cluster,
 * rather than unmapping them.  If we encounter the "check_page" that vmscan is
 * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
 */
#define CLUSTER_SIZE	min(32*PAGE_SIZE, PMD_SIZE)
#define CLUSTER_MASK	(~(CLUSTER_SIZE - 1))
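
/*
 * For illustration, assuming the common 4K PAGE_SIZE and a PMD_SIZE of at
 * least 128K: CLUSTER_SIZE is min(32 * 4K, PMD_SIZE) = 128K, and CLUSTER_MASK
 * clears the low 17 bits.  try_to_unmap_cluster() below rounds its start
 * address down accordingly,
 *
 *	address = (vma->vm_start + cursor) & CLUSTER_MASK;
 *	end = address + CLUSTER_SIZE;
 *
 * so one call scans at most 32 ptes, all within a single pte page.
 */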

static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
		struct vm_area_struct *vma, struct page *check_page)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	pte_t pteval;
	spinlock_t *ptl;
	struct page *page;
	unsigned long address;
	unsigned long end;
	int ret = SWAP_AGAIN;
	int locked_vma = 0;

	address = (vma->vm_start + cursor) & CLUSTER_MASK;
	end = address + CLUSTER_SIZE;
	if (address < vma->vm_start)
		address = vma->vm_start;
	if (end > vma->vm_end)
		end = vma->vm_end;

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		return ret;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return ret;

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return ret;

	/*
	 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
	 * keep the sem while scanning the cluster for mlocking pages.
	 */
	if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
		locked_vma = (vma->vm_flags & VM_LOCKED);
		if (!locked_vma)
			up_read(&vma->vm_mm->mmap_sem); /* don't need it */
	}

	pte = pte_offset_map_lock(mm, pmd, address, &ptl);

	/* Update high watermark before we lower rss */
	update_hiwater_rss(mm);

	for (; address < end; pte++, address += PAGE_SIZE) {
		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, address, *pte);
		BUG_ON(!page || PageAnon(page));

		if (locked_vma) {
			mlock_vma_page(page);	/* no-op if already mlocked */
			if (page == check_page)
				ret = SWAP_MLOCK;
			continue;	/* don't unmap */
		}

		if (ptep_clear_flush_young_notify(vma, address, pte))
			continue;

		/* Nuke the page table entry. */
		flush_cache_page(vma, address, pte_pfn(*pte));
		pteval = ptep_clear_flush_notify(vma, address, pte);

		/* If nonlinear, store the file page offset in the pte. */
		if (page->index != linear_page_index(vma, address))
			set_pte_at(mm, address, pte, pgoff_to_pte(page->index));

		/* Move the dirty bit to the physical page now the pte is gone. */
		if (pte_dirty(pteval))
			set_page_dirty(page);

		page_remove_rmap(page);
		page_cache_release(page);
		dec_mm_counter(mm, file_rss);
		(*mapcount)--;
	}
	pte_unmap_unlock(pte - 1, ptl);
	if (locked_vma)
		up_read(&vma->vm_mm->mmap_sem);
	return ret;
}
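
/*
 * Note on the pgoff_to_pte() step above: in a nonlinear vma the file offset
 * cannot be recomputed from the faulting address, so it is encoded into the
 * now non-present pte.  The fault path recognises such a pte with pte_file()
 * and recovers the offset with pte_to_pgoff() to bring the right page back;
 * keeping these encodings distinct is also why try_to_unmap_one() asserts
 * BUG_ON(pte_file(*pte)) after writing a swap or migration entry.
 */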

/**
 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
 * rmap method
 * @page: the page to unmap/unlock
 * @flags: action and flags
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the anon_vma struct it points to.
 *
 * This function is only called from try_to_unmap/try_to_munlock for
 * anonymous pages.
 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
 * where the page was found will be held for write.  So, we won't recheck
 * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
 * VM_LOCKED.
 */
static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
{
	struct anon_vma *anon_vma;
	struct vm_area_struct *vma;
	int ret = SWAP_AGAIN;

	anon_vma = page_lock_anon_vma(page);
	if (!anon_vma)
		return ret;

	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
		unsigned long address = vma_address(page, vma);
		if (address == -EFAULT)
			continue;
		ret = try_to_unmap_one(page, vma, address, flags);
		if (ret != SWAP_AGAIN || !page_mapped(page))
			break;
	}

	page_unlock_anon_vma(anon_vma);
	return ret;
}

/**
 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
 * @page: the page to unmap/unlock
 * @flags: action and flags
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the address_space struct it points to.
 *
 * This function is only called from try_to_unmap/try_to_munlock for
 * object-based pages.
 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
 * where the page was found will be held for write.  So, we won't recheck
 * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
 * VM_LOCKED.
 */
static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
{
	struct address_space *mapping = page->mapping;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int ret = SWAP_AGAIN;
	unsigned long cursor;
	unsigned long max_nl_cursor = 0;
	unsigned long max_nl_size = 0;
	unsigned int mapcount;

	spin_lock(&mapping->i_mmap_lock);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		unsigned long address = vma_address(page, vma);
		if (address == -EFAULT)
			continue;
		ret = try_to_unmap_one(page, vma, address, flags);
		if (ret != SWAP_AGAIN || !page_mapped(page))
			goto out;
	}

	if (list_empty(&mapping->i_mmap_nonlinear))
		goto out;

	/*
	 * We don't bother to try to find the munlocked page in nonlinears.
	 * It's costly. Instead, later, page reclaim logic may call
	 * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily.
	 */
	if (TTU_ACTION(flags) == TTU_MUNLOCK)
		goto out;

	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
						shared.vm_set.list) {
		cursor = (unsigned long) vma->vm_private_data;
		if (cursor > max_nl_cursor)
			max_nl_cursor = cursor;
		cursor = vma->vm_end - vma->vm_start;
		if (cursor > max_nl_size)
			max_nl_size = cursor;
	}

	if (max_nl_size == 0) {	/* all nonlinears locked or reserved ? */
		ret = SWAP_FAIL;
		goto out;
	}

	/*
	 * We don't try to search for this page in the nonlinear vmas,
	 * and page_referenced wouldn't have found it anyway.  Instead
	 * just walk the nonlinear vmas trying to age and unmap some.
	 * The mapcount of the page we came in with is irrelevant,
	 * but even so use it as a guide to how hard we should try?
	 */
	mapcount = page_mapcount(page);
	if (!mapcount)
		goto out;
	cond_resched_lock(&mapping->i_mmap_lock);

	max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
	if (max_nl_cursor == 0)
		max_nl_cursor = CLUSTER_SIZE;

	do {
		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
						shared.vm_set.list) {
			cursor = (unsigned long) vma->vm_private_data;
			while ( cursor < max_nl_cursor &&
				cursor < vma->vm_end - vma->vm_start) {
				if (try_to_unmap_cluster(cursor, &mapcount,
						vma, page) == SWAP_MLOCK)
					ret = SWAP_MLOCK;
				cursor += CLUSTER_SIZE;
				vma->vm_private_data = (void *) cursor;
				if ((int)mapcount <= 0)
					goto out;
			}
			vma->vm_private_data = (void *) max_nl_cursor;
		}
		cond_resched_lock(&mapping->i_mmap_lock);
		max_nl_cursor += CLUSTER_SIZE;
	} while (max_nl_cursor <= max_nl_size);

	/*
	 * Don't loop forever (perhaps all the remaining pages are
	 * in locked vmas).  Reset cursor on all unreserved nonlinear
	 * vmas, now forgetting on which ones it had fallen behind.
	 */
	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
		vma->vm_private_data = NULL;
out:
	spin_unlock(&mapping->i_mmap_lock);
	return ret;
}

/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 * @flags: action and flags
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path.  Caller must hold the page lock.
 * Return values are:
 *
 * SWAP_SUCCESS	- we succeeded in removing all mappings
 * SWAP_AGAIN	- we missed a mapping, try again later
 * SWAP_FAIL	- the page is unswappable
 * SWAP_MLOCK	- page is mlocked.
 */
int try_to_unmap(struct page *page, enum ttu_flags flags)
{
	int ret;

	BUG_ON(!PageLocked(page));

	if (unlikely(PageKsm(page)))
		ret = try_to_unmap_ksm(page, flags);
	else if (PageAnon(page))
		ret = try_to_unmap_anon(page, flags);
	else
		ret = try_to_unmap_file(page, flags);
	if (ret != SWAP_MLOCK && !page_mapped(page))
		ret = SWAP_SUCCESS;
	return ret;
}
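
/*
 * Sketch of the intended calling pattern (the real version lives in
 * mm/vmscan.c's shrink_page_list(); this is illustrative and simplified):
 * the caller holds the page lock and only proceeds towards pageout when all
 * mappings were removed, e.g.
 *
 *	switch (try_to_unmap(page, TTU_UNMAP)) {
 *	case SWAP_FAIL:
 *	case SWAP_AGAIN:
 *	case SWAP_MLOCK:
 *		goto keep_locked;
 *	case SWAP_SUCCESS:
 *		break;
 *	}
 *
 * with SWAP_MLOCK additionally routing the page to the unevictable list.
 */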

/**
 * try_to_munlock - try to munlock a page
 * @page: the page to be munlocked
 *
 * Called from munlock code.  Checks all of the VMAs mapping the page
 * to make sure nobody else has this page mlocked. The page will be
 * returned with PG_mlocked cleared if no other vmas have it mlocked.
 *
 * Return values are:
 *
 * SWAP_AGAIN	- no vma is holding page mlocked, or,
 * SWAP_AGAIN	- page mapped in mlocked vma -- couldn't acquire mmap sem
 * SWAP_FAIL	- page cannot be located at present
 * SWAP_MLOCK	- page is now mlocked.
 */
int try_to_munlock(struct page *page)
{
	VM_BUG_ON(!PageLocked(page) || PageLRU(page));

	if (unlikely(PageKsm(page)))
		return try_to_unmap_ksm(page, TTU_MUNLOCK);
	else if (PageAnon(page))
		return try_to_unmap_anon(page, TTU_MUNLOCK);
	else
		return try_to_unmap_file(page, TTU_MUNLOCK);
}

#ifdef CONFIG_MIGRATION
/*
 * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
 * Called by migrate.c to remove migration ptes, but might be used more later.
 */
static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
		struct vm_area_struct *, unsigned long, void *), void *arg)
{
	struct anon_vma *anon_vma;
	struct vm_area_struct *vma;
	int ret = SWAP_AGAIN;

	/*
	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma()
	 * because that depends on page_mapped(); but not all its usages
	 * are holding mmap_sem, which also gave the necessary guarantee
	 * (that this anon_vma's slab has not already been destroyed).
	 * This needs to be reviewed later: avoiding page_lock_anon_vma()
	 * is risky, and currently limits the usefulness of rmap_walk().
	 */
	anon_vma = page_anon_vma(page);
	if (!anon_vma)
		return ret;
	spin_lock(&anon_vma->lock);
	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
		unsigned long address = vma_address(page, vma);
		if (address == -EFAULT)
			continue;
		ret = rmap_one(page, vma, address, arg);
		if (ret != SWAP_AGAIN)
			break;
	}
	spin_unlock(&anon_vma->lock);
	return ret;
}

static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
		struct vm_area_struct *, unsigned long, void *), void *arg)
{
	struct address_space *mapping = page->mapping;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int ret = SWAP_AGAIN;

	if (!mapping)
		return ret;
	spin_lock(&mapping->i_mmap_lock);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		unsigned long address = vma_address(page, vma);
		if (address == -EFAULT)
			continue;
		ret = rmap_one(page, vma, address, arg);
		if (ret != SWAP_AGAIN)
			break;
	}
	/*
	 * No nonlinear handling: being always shared, nonlinear vmas
	 * never contain migration ptes.  Decide what to do about this
	 * limitation to linear when we need rmap_walk() on nonlinear.
	 */
	spin_unlock(&mapping->i_mmap_lock);
	return ret;
}

int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
		struct vm_area_struct *, unsigned long, void *), void *arg)
{
	VM_BUG_ON(!PageLocked(page));

	if (unlikely(PageKsm(page)))
		return rmap_walk_ksm(page, rmap_one, arg);
	else if (PageAnon(page))
		return rmap_walk_anon(page, rmap_one, arg);
	else
		return rmap_walk_file(page, rmap_one, arg);
}
#endif /* CONFIG_MIGRATION */
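
/*
 * Usage note (illustrative, based on how mm/migrate.c drives this interface):
 * migration passes its pte-fixing callback and the old page as the opaque
 * argument, roughly
 *
 *	rmap_walk(new_page, remove_migration_pte, old_page);
 *
 * so every vma that may still hold a migration pte for the old page has the
 * callback invoked at the corresponding virtual address.
 */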