/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead reverse mapping scheme.
 * Please try to keep this thing as modular as possible.
 *
 * Provides methods for unmapping each kind of mapped page:
 * the anon methods track anonymous pages, and
 * the file methods track pages belonging to an inode.
 *
 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
 * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
 */

/*
 * Lock ordering in mm:
 *
 * inode->i_mutex       (while writing or truncating, not reading or faulting)
 *   inode->i_alloc_sem (vmtruncate_range)
 *   mm->mmap_sem
 *     page->flags PG_locked (lock_page)
 *       mapping->i_mmap_lock
 *         anon_vma->lock
 *           mm->page_table_lock or pte_lock
 *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
 *             swap_lock (in swap_duplicate, swap_info_get)
 *               mmlist_lock (in mmput, drain_mmlist and others)
 *               mapping->private_lock (in __set_page_dirty_buffers)
 *               inode_lock (in set_page_dirty's __mark_inode_dirty)
 *                 sb_lock (within inode_lock in fs/fs-writeback.c)
 *                 mapping->tree_lock (widely used, in set_page_dirty,
 *                           in arch-dependent flush_dcache_mmap_lock,
 *                           within inode_lock in __sync_single_inode)
 */

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>

#include <asm/tlbflush.h>

struct kmem_cache *anon_vma_cachep;

/* This must be called under the mmap_sem. */
int anon_vma_prepare(struct vm_area_struct *vma)
{
        struct anon_vma *anon_vma = vma->anon_vma;

        might_sleep();
        if (unlikely(!anon_vma)) {
                struct mm_struct *mm = vma->vm_mm;
                struct anon_vma *allocated, *locked;

                anon_vma = find_mergeable_anon_vma(vma);
                if (anon_vma) {
                        allocated = NULL;
                        locked = anon_vma;
                        spin_lock(&locked->lock);
                } else {
                        anon_vma = anon_vma_alloc();
                        if (unlikely(!anon_vma))
                                return -ENOMEM;
                        allocated = anon_vma;
                        locked = NULL;
                }

                /* page_table_lock to protect against threads */
                spin_lock(&mm->page_table_lock);
                if (likely(!vma->anon_vma)) {
                        vma->anon_vma = anon_vma;
                        list_add_tail(&vma->anon_vma_node, &anon_vma->head);
                        allocated = NULL;
                }
                spin_unlock(&mm->page_table_lock);

                if (locked)
                        spin_unlock(&locked->lock);
                if (unlikely(allocated))
                        anon_vma_free(allocated);
        }
        return 0;
}

void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
{
        BUG_ON(vma->anon_vma != next->anon_vma);
        list_del(&next->anon_vma_node);
}

void __anon_vma_link(struct vm_area_struct *vma)
{
        struct anon_vma *anon_vma = vma->anon_vma;

        if (anon_vma)
                list_add_tail(&vma->anon_vma_node, &anon_vma->head);
}

void anon_vma_link(struct vm_area_struct *vma)
{
        struct anon_vma *anon_vma = vma->anon_vma;

        if (anon_vma) {
                spin_lock(&anon_vma->lock);
                list_add_tail(&vma->anon_vma_node, &anon_vma->head);
                spin_unlock(&anon_vma->lock);
        }
}

void anon_vma_unlink(struct vm_area_struct *vma)
{
        struct anon_vma *anon_vma = vma->anon_vma;
        int empty;

        if (!anon_vma)
                return;

        spin_lock(&anon_vma->lock);
        list_del(&vma->anon_vma_node);

        /* We must garbage collect the anon_vma if it's empty */
        empty = list_empty(&anon_vma->head);
        spin_unlock(&anon_vma->lock);

        if (empty)
                anon_vma_free(anon_vma);
}

static void anon_vma_ctor(void *data)
{
        struct anon_vma *anon_vma = data;

        spin_lock_init(&anon_vma->lock);
        INIT_LIST_HEAD(&anon_vma->head);
}

void __init anon_vma_init(void)
{
        anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
                        0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
}

/*
 * Getting a lock on a stable anon_vma from a page off the LRU is
 * tricky: page_lock_anon_vma relies on RCU to guard against the races.
 */
static struct anon_vma *page_lock_anon_vma(struct page *page)
{
        struct anon_vma *anon_vma;
        unsigned long anon_mapping;

        rcu_read_lock();
        anon_mapping = (unsigned long) page->mapping;
        if (!(anon_mapping & PAGE_MAPPING_ANON))
                goto out;
        if (!page_mapped(page))
                goto out;

        anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
        spin_lock(&anon_vma->lock);
        return anon_vma;
out:
        rcu_read_unlock();
        return NULL;
}

static void page_unlock_anon_vma(struct anon_vma *anon_vma)
{
        spin_unlock(&anon_vma->lock);
        rcu_read_unlock();
}

/*
 * At what user virtual address is page expected in @vma?
 * Returns virtual address or -EFAULT if page's index/offset is not
 * within the range mapped by the @vma.
 */
static inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
{
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        unsigned long address;

        address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
        if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
                /* page should be within @vma mapping range */
                return -EFAULT;
        }
        return address;
}

/*
 * At what user virtual address is page expected in vma?  Also checks that
 * the page actually matches the vma: currently only used on anon pages,
 * by unuse_vma.
 */
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
        if (PageAnon(page)) {
                if ((void *)vma->anon_vma !=
                    (void *)page->mapping - PAGE_MAPPING_ANON)
                        return -EFAULT;
        } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
                if (!vma->vm_file ||
                    vma->vm_file->f_mapping != page->mapping)
                        return -EFAULT;
        } else
                return -EFAULT;
        return vma_address(page, vma);
}

/*
 * Check that @page is mapped at @address into @mm.
 *
 * If @sync is false, page_check_address may perform a racy check to avoid
 * the page table lock when the pte is not present (helpful when reclaiming
 * highly shared pages).
 *
 * On success returns with pte mapped and locked.
 */
pte_t *page_check_address(struct page *page, struct mm_struct *mm,
                          unsigned long address, spinlock_t **ptlp, int sync)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
        spinlock_t *ptl;

        pgd = pgd_offset(mm, address);
        if (!pgd_present(*pgd))
                return NULL;

        pud = pud_offset(pgd, address);
        if (!pud_present(*pud))
                return NULL;

        pmd = pmd_offset(pud, address);
        if (!pmd_present(*pmd))
                return NULL;

        pte = pte_offset_map(pmd, address);
        /* Make a quick check before getting the lock */
        if (!sync && !pte_present(*pte)) {
                pte_unmap(pte);
                return NULL;
        }

        ptl = pte_lockptr(mm, pmd);
        spin_lock(ptl);
        if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
                *ptlp = ptl;
                return pte;
        }
        pte_unmap_unlock(pte, ptl);
        return NULL;
}

/*
 * Subfunctions of page_referenced: page_referenced_one called
 * repeatedly from either page_referenced_anon or page_referenced_file.
 */
static int page_referenced_one(struct page *page,
                struct vm_area_struct *vma, unsigned int *mapcount)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
        pte_t *pte;
        spinlock_t *ptl;
        int referenced = 0;

        address = vma_address(page, vma);
        if (address == -EFAULT)
                goto out;

        pte = page_check_address(page, mm, address, &ptl, 0);
        if (!pte)
                goto out;

        if (vma->vm_flags & VM_LOCKED) {
                referenced++;
                *mapcount = 1;  /* break early from loop */
        } else if (ptep_clear_flush_young_notify(vma, address, pte))
                referenced++;

        /* Pretend the page is referenced if the task has the
           swap token and is in the middle of a page fault. */
        if (mm != current->mm && has_swap_token(mm) &&
                        rwsem_is_locked(&mm->mmap_sem))
                referenced++;

        (*mapcount)--;
        pte_unmap_unlock(pte, ptl);
out:
        return referenced;
}

static int page_referenced_anon(struct page *page,
                                struct mem_cgroup *mem_cont)
{
        unsigned int mapcount;
        struct anon_vma *anon_vma;
        struct vm_area_struct *vma;
        int referenced = 0;

        anon_vma = page_lock_anon_vma(page);
        if (!anon_vma)
                return referenced;

        mapcount = page_mapcount(page);
        list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
                /*
                 * If we are reclaiming on behalf of a cgroup, skip
                 * counting on behalf of references from different
                 * cgroups.
                 */
                if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
                        continue;
                referenced += page_referenced_one(page, vma, &mapcount);
                if (!mapcount)
                        break;
        }

        page_unlock_anon_vma(anon_vma);
        return referenced;
}

/**
 * page_referenced_file - referenced check for object-based rmap
 * @page: the page we're checking references on.
 * @mem_cont: target memory controller
 *
 * For an object-based mapped page, find all the places it is mapped and
 * check/clear the referenced flag.  This is done by following the page->mapping
 * pointer, then walking the chain of vmas it holds.  It returns the number
 * of references it found.
 *
 * This function is only called from page_referenced for object-based pages.
 */
static int page_referenced_file(struct page *page,
                                struct mem_cgroup *mem_cont)
{
        unsigned int mapcount;
        struct address_space *mapping = page->mapping;
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct vm_area_struct *vma;
        struct prio_tree_iter iter;
        int referenced = 0;

        /*
         * The caller's checks on page->mapping and !PageAnon have made
         * sure that this is a file page: the check for page->mapping
         * excludes the case just before it gets set on an anon page.
         */
        BUG_ON(PageAnon(page));

        /*
         * The page lock not only makes sure that page->mapping cannot
         * suddenly be NULLified by truncation, it makes sure that the
         * structure at mapping cannot be freed and reused yet,
         * so we can safely take mapping->i_mmap_lock.
         */
        BUG_ON(!PageLocked(page));

        spin_lock(&mapping->i_mmap_lock);

        /*
         * i_mmap_lock does not stabilize mapcount at all, but mapcount
         * is more likely to be accurate if we note it after spinning.
         */
        mapcount = page_mapcount(page);

        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                /*
                 * If we are reclaiming on behalf of a cgroup, skip
                 * counting on behalf of references from different
                 * cgroups.
                 */
                if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
                        continue;
                if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
                                  == (VM_LOCKED|VM_MAYSHARE)) {
                        referenced++;
                        break;
                }
                referenced += page_referenced_one(page, vma, &mapcount);
                if (!mapcount)
                        break;
        }

        spin_unlock(&mapping->i_mmap_lock);
        return referenced;
}

/**
 * page_referenced - test if the page was referenced
 * @page: the page to test
 * @is_locked: caller holds lock on the page
 * @mem_cont: target memory controller
 *
 * Quick test_and_clear_referenced for all mappings to a page,
 * returns the number of ptes which referenced the page.
 */
int page_referenced(struct page *page, int is_locked,
                    struct mem_cgroup *mem_cont)
{
        int referenced = 0;

        if (TestClearPageReferenced(page))
                referenced++;

        if (page_mapped(page) && page->mapping) {
                if (PageAnon(page))
                        referenced += page_referenced_anon(page, mem_cont);
                else if (is_locked)
                        referenced += page_referenced_file(page, mem_cont);
                else if (!trylock_page(page))
                        referenced++;
                else {
                        if (page->mapping)
                                referenced +=
                                        page_referenced_file(page, mem_cont);
                        unlock_page(page);
                }
        }

        if (page_test_and_clear_young(page))
                referenced++;

        return referenced;
}

static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
        pte_t *pte;
        spinlock_t *ptl;
        int ret = 0;

        address = vma_address(page, vma);
        if (address == -EFAULT)
                goto out;

        pte = page_check_address(page, mm, address, &ptl, 1);
        if (!pte)
                goto out;

        if (pte_dirty(*pte) || pte_write(*pte)) {
                pte_t entry;

                flush_cache_page(vma, address, pte_pfn(*pte));
                entry = ptep_clear_flush_notify(vma, address, pte);
                entry = pte_wrprotect(entry);
                entry = pte_mkclean(entry);
                set_pte_at(mm, address, pte, entry);
                ret = 1;
        }

        pte_unmap_unlock(pte, ptl);
out:
        return ret;
}

static int page_mkclean_file(struct address_space *mapping, struct page *page)
{
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct vm_area_struct *vma;
        struct prio_tree_iter iter;
        int ret = 0;

        BUG_ON(PageAnon(page));

        spin_lock(&mapping->i_mmap_lock);
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                if (vma->vm_flags & VM_SHARED)
                        ret += page_mkclean_one(page, vma);
        }
        spin_unlock(&mapping->i_mmap_lock);
        return ret;
}

int page_mkclean(struct page *page)
{
        int ret = 0;

        BUG_ON(!PageLocked(page));

        if (page_mapped(page)) {
                struct address_space *mapping = page_mapping(page);
                if (mapping) {
                        ret = page_mkclean_file(mapping, page);
                        if (page_test_dirty(page)) {
                                page_clear_dirty(page);
                                ret = 1;
                        }
                }
        }

        return ret;
}
EXPORT_SYMBOL_GPL(page_mkclean);

/**
 * __page_set_anon_rmap - setup new anonymous rmap
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 */
static void __page_set_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address)
{
        struct anon_vma *anon_vma = vma->anon_vma;

        BUG_ON(!anon_vma);
        anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
        page->mapping = (struct address_space *) anon_vma;

        page->index = linear_page_index(vma, address);

        /*
         * nr_mapped state can be updated without turning off
         * interrupts because it is not modified via interrupt.
         */
        __inc_zone_page_state(page, NR_ANON_PAGES);
}

/**
 * __page_check_anon_rmap - sanity check anonymous rmap addition
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 */
static void __page_check_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address)
{
#ifdef CONFIG_DEBUG_VM
        /*
         * The page's anon-rmap details (mapping and index) are guaranteed to
         * be set up correctly at this point.
         *
         * We have exclusion against page_add_anon_rmap because the caller
         * always holds the page locked, except if called from page_dup_rmap,
         * in which case the page is already known to be setup.
         *
         * We have exclusion against page_add_new_anon_rmap because those pages
         * are initially only visible via the pagetables, and the pte is locked
         * over the call to page_add_new_anon_rmap.
         */
        struct anon_vma *anon_vma = vma->anon_vma;
        anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
        BUG_ON(page->mapping != (struct address_space *)anon_vma);
        BUG_ON(page->index != linear_page_index(vma, address));
#endif
}

/**
 * page_add_anon_rmap - add pte mapping to an anonymous page
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 *
 * The caller needs to hold the pte lock and the page must be locked.
 */
void page_add_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address)
{
        VM_BUG_ON(!PageLocked(page));
        VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
        if (atomic_inc_and_test(&page->_mapcount))
                __page_set_anon_rmap(page, vma, address);
        else
                __page_check_anon_rmap(page, vma, address);
}

/**
 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 *
 * Same as page_add_anon_rmap but must only be called on *new* pages.
 * This means the inc-and-test can be bypassed.
 * Page does not have to be locked.
 */
void page_add_new_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address)
{
        BUG_ON(address < vma->vm_start || address >= vma->vm_end);
        atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
        __page_set_anon_rmap(page, vma, address);
}

/**
 * page_add_file_rmap - add pte mapping to a file page
 * @page: the page to add the mapping to
 *
 * The caller needs to hold the pte lock.
 */
void page_add_file_rmap(struct page *page)
{
        if (atomic_inc_and_test(&page->_mapcount))
                __inc_zone_page_state(page, NR_FILE_MAPPED);
}

#ifdef CONFIG_DEBUG_VM
/**
 * page_dup_rmap - duplicate pte mapping to a page
 * @page: the page to add the mapping to
 * @vma: the vm area being duplicated
 * @address: the user virtual address mapped
 *
 * For copy_page_range only: minimal extract from page_add_file_rmap /
 * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's
 * quicker.
 *
 * The caller needs to hold the pte lock.
 */
void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
{
        BUG_ON(page_mapcount(page) == 0);
        if (PageAnon(page))
                __page_check_anon_rmap(page, vma, address);
        atomic_inc(&page->_mapcount);
}
#endif

/**
 * page_remove_rmap - take down pte mapping from a page
 * @page: page to remove mapping from
 * @vma: the vm area in which the mapping is removed
 *
 * The caller needs to hold the pte lock.
 */
void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
{
        if (atomic_add_negative(-1, &page->_mapcount)) {
                if (unlikely(page_mapcount(page) < 0)) {
                        printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
                        printk (KERN_EMERG "  page pfn = %lx\n", page_to_pfn(page));
                        printk (KERN_EMERG "  page->flags = %lx\n", page->flags);
                        printk (KERN_EMERG "  page->count = %x\n", page_count(page));
                        printk (KERN_EMERG "  page->mapping = %p\n", page->mapping);
                        print_symbol (KERN_EMERG "  vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
                        if (vma->vm_ops) {
                                print_symbol (KERN_EMERG "  vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault);
                        }
                        if (vma->vm_file && vma->vm_file->f_op)
                                print_symbol (KERN_EMERG "  vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
                        BUG();
                }

                /*
                 * Now that the last pte has gone, s390 must transfer dirty
                 * flag from storage key to struct page.  We can usually skip
                 * this if the page is anon, so about to be freed; but perhaps
                 * not if it's in swapcache - there might be another pte slot
                 * containing the swap entry, but page not yet written to swap.
                 */
                if ((!PageAnon(page) || PageSwapCache(page)) &&
                    page_test_dirty(page)) {
                        page_clear_dirty(page);
                        set_page_dirty(page);
                }

                mem_cgroup_uncharge_page(page);
                __dec_zone_page_state(page,
                        PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
                /*
                 * It would be tidy to reset the PageAnon mapping here,
                 * but that might overwrite a racing page_add_anon_rmap
                 * which increments mapcount after us but sets mapping
                 * before us: so leave the reset to free_hot_cold_page,
                 * and remember that it's only reliable while mapped.
                 * Leaving it set also helps swapoff to reinstate ptes
                 * faster for those pages still in swapcache.
                 */
        }
}

/*
 * Subfunctions of try_to_unmap: try_to_unmap_one called
 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
 */
static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                                int migration)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
        pte_t *pte;
        pte_t pteval;
        spinlock_t *ptl;
        int ret = SWAP_AGAIN;

        address = vma_address(page, vma);
        if (address == -EFAULT)
                goto out;

        pte = page_check_address(page, mm, address, &ptl, 0);
        if (!pte)
                goto out;

        /*
         * If the page is mlock()d, we cannot swap it out.
         * If it's recently referenced (perhaps page_referenced
         * skipped over this mm) then we should reactivate it.
         */
        if (!migration && ((vma->vm_flags & VM_LOCKED) ||
                        (ptep_clear_flush_young_notify(vma, address, pte)))) {
                ret = SWAP_FAIL;
                goto out_unmap;
        }

        /* Nuke the page table entry. */
        flush_cache_page(vma, address, page_to_pfn(page));
        pteval = ptep_clear_flush_notify(vma, address, pte);

        /* Move the dirty bit to the physical page now the pte is gone. */
        if (pte_dirty(pteval))
                set_page_dirty(page);

        /* Update high watermark before we lower rss */
        update_hiwater_rss(mm);

        if (PageAnon(page)) {
                swp_entry_t entry = { .val = page_private(page) };

                if (PageSwapCache(page)) {
                        /*
                         * Store the swap location in the pte.
                         * See handle_pte_fault() ...
                         */
                        swap_duplicate(entry);
                        if (list_empty(&mm->mmlist)) {
                                spin_lock(&mmlist_lock);
                                if (list_empty(&mm->mmlist))
                                        list_add(&mm->mmlist, &init_mm.mmlist);
                                spin_unlock(&mmlist_lock);
                        }
                        dec_mm_counter(mm, anon_rss);
#ifdef CONFIG_MIGRATION
                } else {
                        /*
                         * Store the pfn of the page in a special migration
                         * pte. do_swap_page() will wait until the migration
                         * pte is removed and then restart fault handling.
                         */
                        BUG_ON(!migration);
                        entry = make_migration_entry(page, pte_write(pteval));
#endif
                }
                set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
                BUG_ON(pte_file(*pte));
        } else
#ifdef CONFIG_MIGRATION
        if (migration) {
                /* Establish migration entry for a file page */
                swp_entry_t entry;
                entry = make_migration_entry(page, pte_write(pteval));
                set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
        } else
#endif
                dec_mm_counter(mm, file_rss);

        page_remove_rmap(page, vma);
        page_cache_release(page);

out_unmap:
        pte_unmap_unlock(pte, ptl);
out:
        return ret;
}

/*
 * objrmap doesn't work for nonlinear VMAs because the assumption that
 * offset-into-file correlates with offset-into-virtual-addresses does not hold.
 * Consequently, given a particular page and its ->index, we cannot locate the
 * ptes which are mapping that page without an exhaustive linear search.
 *
 * So what this code does is a mini "virtual scan" of each nonlinear VMA which
 * maps the file to which the target page belongs.  The ->vm_private_data field
 * holds the current cursor into that scan.  Successive searches will circulate
 * around the vma's virtual address space.
 *
 * So as more replacement pressure is applied to the pages in a nonlinear VMA,
 * more scanning pressure is placed against them as well.  Eventually pages
 * will become fully unmapped and become eligible for eviction.
 *
 * For very sparsely populated VMAs this is a little inefficient - chances are
 * there won't be many ptes located within the scan cluster.  In this case
 * maybe we could scan further - to the end of the pte page, perhaps.
 */
#define CLUSTER_SIZE    min(32*PAGE_SIZE, PMD_SIZE)
#define CLUSTER_MASK    (~(CLUSTER_SIZE - 1))

static void try_to_unmap_cluster(unsigned long cursor,
        unsigned int *mapcount, struct vm_area_struct *vma)
{
        struct mm_struct *mm = vma->vm_mm;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
        pte_t pteval;
        spinlock_t *ptl;
        struct page *page;
        unsigned long address;
        unsigned long end;

        address = (vma->vm_start + cursor) & CLUSTER_MASK;
        end = address + CLUSTER_SIZE;
        if (address < vma->vm_start)
                address = vma->vm_start;
        if (end > vma->vm_end)
                end = vma->vm_end;

        pgd = pgd_offset(mm, address);
        if (!pgd_present(*pgd))
                return;

        pud = pud_offset(pgd, address);
        if (!pud_present(*pud))
                return;

        pmd = pmd_offset(pud, address);
        if (!pmd_present(*pmd))
                return;

        pte = pte_offset_map_lock(mm, pmd, address, &ptl);

        /* Update high watermark before we lower rss */
        update_hiwater_rss(mm);

        for (; address < end; pte++, address += PAGE_SIZE) {
                if (!pte_present(*pte))
                        continue;
                page = vm_normal_page(vma, address, *pte);
                BUG_ON(!page || PageAnon(page));

                if (ptep_clear_flush_young_notify(vma, address, pte))
                        continue;

                /* Nuke the page table entry. */
                flush_cache_page(vma, address, pte_pfn(*pte));
                pteval = ptep_clear_flush_notify(vma, address, pte);

                /* If nonlinear, store the file page offset in the pte. */
                if (page->index != linear_page_index(vma, address))
                        set_pte_at(mm, address, pte, pgoff_to_pte(page->index));

                /* Move the dirty bit to the physical page now the pte is gone. */
                if (pte_dirty(pteval))
                        set_page_dirty(page);

                page_remove_rmap(page, vma);
                page_cache_release(page);
                dec_mm_counter(mm, file_rss);
                (*mapcount)--;
        }
        pte_unmap_unlock(pte - 1, ptl);
}

/*
 * Unmap this anonymous page from every vma on its anon_vma list.
 * Called only from try_to_unmap.
 */
static int try_to_unmap_anon(struct page *page, int migration)
{
        struct anon_vma *anon_vma;
        struct vm_area_struct *vma;
        int ret = SWAP_AGAIN;

        anon_vma = page_lock_anon_vma(page);
        if (!anon_vma)
                return ret;

        list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
                ret = try_to_unmap_one(page, vma, migration);
                if (ret == SWAP_FAIL || !page_mapped(page))
                        break;
        }

        page_unlock_anon_vma(anon_vma);
        return ret;
}

/**
 * try_to_unmap_file - unmap file page using the object-based rmap method
 * @page: the page to unmap
 * @migration: migration flag
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the address_space struct it points to.
 *
 * This function is only called from try_to_unmap for object-based pages.
 */
static int try_to_unmap_file(struct page *page, int migration)
{
        struct address_space *mapping = page->mapping;
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct vm_area_struct *vma;
        struct prio_tree_iter iter;
        int ret = SWAP_AGAIN;
        unsigned long cursor;
        unsigned long max_nl_cursor = 0;
        unsigned long max_nl_size = 0;
        unsigned int mapcount;

        spin_lock(&mapping->i_mmap_lock);
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                ret = try_to_unmap_one(page, vma, migration);
                if (ret == SWAP_FAIL || !page_mapped(page))
                        goto out;
        }

        if (list_empty(&mapping->i_mmap_nonlinear))
                goto out;

        list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
                                                shared.vm_set.list) {
                if ((vma->vm_flags & VM_LOCKED) && !migration)
                        continue;
                cursor = (unsigned long) vma->vm_private_data;
                if (cursor > max_nl_cursor)
                        max_nl_cursor = cursor;
                cursor = vma->vm_end - vma->vm_start;
                if (cursor > max_nl_size)
                        max_nl_size = cursor;
        }

        if (max_nl_size == 0) { /* all nonlinear vmas were locked or reserved */
                ret = SWAP_FAIL;
                goto out;
        }

        /*
         * We don't try to search for this page in the nonlinear vmas,
         * and page_referenced wouldn't have found it anyway.  Instead
         * just walk the nonlinear vmas trying to age and unmap some.
         * The mapcount of the page we came in with is irrelevant,
         * but even so use it as a guide to how hard we should try?
         */
        mapcount = page_mapcount(page);
        if (!mapcount)
                goto out;
        cond_resched_lock(&mapping->i_mmap_lock);

        max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
        if (max_nl_cursor == 0)
                max_nl_cursor = CLUSTER_SIZE;

        do {
                list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
                                                shared.vm_set.list) {
                        if ((vma->vm_flags & VM_LOCKED) && !migration)
                                continue;
                        cursor = (unsigned long) vma->vm_private_data;
                        while (cursor < max_nl_cursor &&
                                cursor < vma->vm_end - vma->vm_start) {
                                try_to_unmap_cluster(cursor, &mapcount, vma);
                                cursor += CLUSTER_SIZE;
                                vma->vm_private_data = (void *) cursor;
                                if ((int)mapcount <= 0)
                                        goto out;
                        }
                        vma->vm_private_data = (void *) max_nl_cursor;
                }
                cond_resched_lock(&mapping->i_mmap_lock);
                max_nl_cursor += CLUSTER_SIZE;
        } while (max_nl_cursor <= max_nl_size);

        /*
         * Don't loop forever (perhaps all the remaining pages are
         * in locked vmas).  Reset cursor on all unreserved nonlinear
         * vmas, now forgetting on which ones it had fallen behind.
         */
        list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
                vma->vm_private_data = NULL;
out:
        spin_unlock(&mapping->i_mmap_lock);
        return ret;
}

/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 * @migration: migration flag
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path.  Caller must hold the page lock.
 * Return values are:
 *
 * SWAP_SUCCESS - we succeeded in removing all mappings
 * SWAP_AGAIN   - we missed a mapping, try again later
 * SWAP_FAIL    - the page is unswappable
 */
int try_to_unmap(struct page *page, int migration)
{
        int ret;

        BUG_ON(!PageLocked(page));

        if (PageAnon(page))
                ret = try_to_unmap_anon(page, migration);
        else
                ret = try_to_unmap_file(page, migration);

        if (!page_mapped(page))
                ret = SWAP_SUCCESS;
        return ret;
}
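
/*
 * Usage sketch: the page reclaim path is the main consumer of the rmap
 * calls above.  Roughly, a caller such as mm/vmscan.c's shrink_page_list()
 * is expected to do something like the following with a locked page (this
 * is an illustration of the return-value contract, not a verbatim copy of
 * that code; the goto labels are placeholders):
 *
 *	if (page_mapped(page) && page->mapping) {
 *		switch (try_to_unmap(page, 0)) {
 *		case SWAP_FAIL:
 *			goto activate_locked;
 *		case SWAP_AGAIN:
 *			goto keep_locked;
 *		case SWAP_SUCCESS:
 *			break;
 *		}
 *	}
 *
 * That is, SWAP_FAIL sends the page back to the active list (it is mlocked
 * or was referenced again), SWAP_AGAIN keeps it for a later pass, and only
 * SWAP_SUCCESS lets reclaim go on to write back and free the page.  See
 * mm/vmscan.c for the authoritative sequence.
 */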