/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead reverse mapping scheme.
 * Please try to keep this thing as modular as possible.
 *
 * Provides methods for unmapping each kind of mapped page:
 * the anon methods track anonymous pages, and
 * the file methods track pages belonging to an inode.
 *
 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
 * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
 */

/*
 * Lock ordering in mm:
 *
 * inode->i_mutex       (while writing or truncating, not reading or faulting)
 *   inode->i_alloc_sem (vmtruncate_range)
 *   mm->mmap_sem
 *     page->flags PG_locked (lock_page)
 *       mapping->i_mmap_lock
 *         anon_vma->lock
 *           mm->page_table_lock or pte_lock
 *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
 *             swap_lock (in swap_duplicate, swap_info_get)
 *               mmlist_lock (in mmput, drain_mmlist and others)
 *               mapping->private_lock (in __set_page_dirty_buffers)
 *               inode_lock (in set_page_dirty's __mark_inode_dirty)
 *                 sb_lock (within inode_lock in fs/fs-writeback.c)
 *                 mapping->tree_lock (widely used, in set_page_dirty,
 *                           in arch-dependent flush_dcache_mmap_lock,
 *                           within inode_lock in __sync_single_inode)
 */

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/module.h>

#include <asm/tlbflush.h>

struct kmem_cache *anon_vma_cachep;

static inline void validate_anon_vma(struct vm_area_struct *find_vma)
{
#ifdef CONFIG_DEBUG_VM
        struct anon_vma *anon_vma = find_vma->anon_vma;
        struct vm_area_struct *vma;
        unsigned int mapcount = 0;
        int found = 0;

        list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
                mapcount++;
                BUG_ON(mapcount > 100000);
                if (vma == find_vma)
                        found = 1;
        }
        BUG_ON(!found);
#endif
}
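/*
 * Illustrative sketch (not part of the original file): the anon side of the
 * lock ordering documented above.  The helper name is hypothetical and the
 * body only shows the nesting, not any useful work.
 */
#if 0   /* example only, never compiled */
static void example_lock_nesting(struct vm_area_struct *vma)
{
        struct mm_struct *mm = vma->vm_mm;
        struct anon_vma *anon_vma = vma->anon_vma;

        down_read(&mm->mmap_sem);               /* mm->mmap_sem */
        spin_lock(&anon_vma->lock);             /*   anon_vma->lock */
        spin_lock(&mm->page_table_lock);        /*     page_table_lock / pte lock */

        /* ... touch rmap state here ... */

        spin_unlock(&mm->page_table_lock);
        spin_unlock(&anon_vma->lock);
        up_read(&mm->mmap_sem);
}
#endif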
/* This must be called under the mmap_sem. */
int anon_vma_prepare(struct vm_area_struct *vma)
{
        struct anon_vma *anon_vma = vma->anon_vma;

        might_sleep();
        if (unlikely(!anon_vma)) {
                struct mm_struct *mm = vma->vm_mm;
                struct anon_vma *allocated, *locked;

                anon_vma = find_mergeable_anon_vma(vma);
                if (anon_vma) {
                        allocated = NULL;
                        locked = anon_vma;
                        spin_lock(&locked->lock);
                } else {
                        anon_vma = anon_vma_alloc();
                        if (unlikely(!anon_vma))
                                return -ENOMEM;
                        allocated = anon_vma;
                        locked = NULL;
                }

                /* page_table_lock to protect against threads */
                spin_lock(&mm->page_table_lock);
                if (likely(!vma->anon_vma)) {
                        vma->anon_vma = anon_vma;
                        list_add_tail(&vma->anon_vma_node, &anon_vma->head);
                        allocated = NULL;
                }
                spin_unlock(&mm->page_table_lock);

                if (locked)
                        spin_unlock(&locked->lock);
                if (unlikely(allocated))
                        anon_vma_free(allocated);
        }
        return 0;
}

void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
{
        BUG_ON(vma->anon_vma != next->anon_vma);
        list_del(&next->anon_vma_node);
}

void __anon_vma_link(struct vm_area_struct *vma)
{
        struct anon_vma *anon_vma = vma->anon_vma;

        if (anon_vma) {
                list_add_tail(&vma->anon_vma_node, &anon_vma->head);
                validate_anon_vma(vma);
        }
}

void anon_vma_link(struct vm_area_struct *vma)
{
        struct anon_vma *anon_vma = vma->anon_vma;

        if (anon_vma) {
                spin_lock(&anon_vma->lock);
                list_add_tail(&vma->anon_vma_node, &anon_vma->head);
                validate_anon_vma(vma);
                spin_unlock(&anon_vma->lock);
        }
}

void anon_vma_unlink(struct vm_area_struct *vma)
{
        struct anon_vma *anon_vma = vma->anon_vma;
        int empty;

        if (!anon_vma)
                return;

        spin_lock(&anon_vma->lock);
        validate_anon_vma(vma);
        list_del(&vma->anon_vma_node);

        /* We must garbage collect the anon_vma if it's empty */
        empty = list_empty(&anon_vma->head);
        spin_unlock(&anon_vma->lock);

        if (empty)
                anon_vma_free(anon_vma);
}

static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
                          unsigned long flags)
{
        if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
            SLAB_CTOR_CONSTRUCTOR) {
                struct anon_vma *anon_vma = data;

                spin_lock_init(&anon_vma->lock);
                INIT_LIST_HEAD(&anon_vma->head);
        }
}

void __init anon_vma_init(void)
{
        anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
                        0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL);
}

/*
 * Getting a lock on a stable anon_vma from a page off the LRU is
 * tricky: page_lock_anon_vma relies on RCU to guard against the races.
 */
static struct anon_vma *page_lock_anon_vma(struct page *page)
{
        struct anon_vma *anon_vma = NULL;
        unsigned long anon_mapping;

        rcu_read_lock();
        anon_mapping = (unsigned long) page->mapping;
        if (!(anon_mapping & PAGE_MAPPING_ANON))
                goto out;
        if (!page_mapped(page))
                goto out;

        anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
        spin_lock(&anon_vma->lock);
out:
        rcu_read_unlock();
        return anon_vma;
}
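/*
 * Illustrative sketch (not part of the original file): how a fault-path
 * caller is expected to pair anon_vma_prepare() with the anon rmap add
 * defined later in this file.  Loosely modelled on do_anonymous_page();
 * the helper name and the simplified error handling are assumptions.
 */
#if 0   /* example only, never compiled */
static int example_anon_fault(struct vm_area_struct *vma, unsigned long address)
{
        struct page *page;

        /* Under mmap_sem: make sure the vma has an anon_vma to hang off. */
        if (unlikely(anon_vma_prepare(vma)))
                return VM_FAULT_OOM;

        page = alloc_zeroed_user_highpage(vma, address);
        if (!page)
                return VM_FAULT_OOM;

        /*
         * After installing the pte under the pte lock (not shown), the new
         * page is added to the anon rmap so that page_referenced() and
         * try_to_unmap() can find this mapping later.
         */
        page_add_new_anon_rmap(page, vma, address);
        return VM_FAULT_MINOR;
}
#endif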
/*
 * At what user virtual address is page expected in vma?
 */
static inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
{
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        unsigned long address;

        address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
        if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
                /* page should be within any vma from prio_tree_next */
                BUG_ON(!PageAnon(page));
                return -EFAULT;
        }
        return address;
}

/*
 * At what user virtual address is page expected in vma? Checks that the
 * page matches the vma: currently only used on anon pages, by unuse_vma.
 */
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
        if (PageAnon(page)) {
                if ((void *)vma->anon_vma !=
                    (void *)page->mapping - PAGE_MAPPING_ANON)
                        return -EFAULT;
        } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
                if (!vma->vm_file ||
                    vma->vm_file->f_mapping != page->mapping)
                        return -EFAULT;
        } else
                return -EFAULT;
        return vma_address(page, vma);
}

/*
 * Check that @page is mapped at @address into @mm.
 *
 * On success returns with pte mapped and locked.
 */
pte_t *page_check_address(struct page *page, struct mm_struct *mm,
                          unsigned long address, spinlock_t **ptlp)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
        spinlock_t *ptl;

        pgd = pgd_offset(mm, address);
        if (!pgd_present(*pgd))
                return NULL;

        pud = pud_offset(pgd, address);
        if (!pud_present(*pud))
                return NULL;

        pmd = pmd_offset(pud, address);
        if (!pmd_present(*pmd))
                return NULL;

        pte = pte_offset_map(pmd, address);
        /* Make a quick check before getting the lock */
        if (!pte_present(*pte)) {
                pte_unmap(pte);
                return NULL;
        }

        ptl = pte_lockptr(mm, pmd);
        spin_lock(ptl);
        if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
                *ptlp = ptl;
                return pte;
        }
        pte_unmap_unlock(pte, ptl);
        return NULL;
}
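/*
 * Illustrative sketch (not part of the original file): the calling
 * convention for page_check_address().  On success the pte comes back
 * mapped and locked, so the caller owns the matching pte_unmap_unlock().
 * The helper name is hypothetical.
 */
#if 0   /* example only, never compiled */
static int example_page_is_mapped_in(struct page *page,
                                     struct vm_area_struct *vma)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
        spinlock_t *ptl;
        pte_t *pte;

        address = vma_address(page, vma);
        if (address == -EFAULT)
                return 0;

        pte = page_check_address(page, mm, address, &ptl);
        if (!pte)
                return 0;

        /* ... inspect or modify *pte while the pte lock is held ... */

        pte_unmap_unlock(pte, ptl);
        return 1;
}
#endif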
/*
 * Subfunctions of page_referenced: page_referenced_one called
 * repeatedly from either page_referenced_anon or page_referenced_file.
 */
static int page_referenced_one(struct page *page,
        struct vm_area_struct *vma, unsigned int *mapcount)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
        pte_t *pte;
        spinlock_t *ptl;
        int referenced = 0;

        address = vma_address(page, vma);
        if (address == -EFAULT)
                goto out;

        pte = page_check_address(page, mm, address, &ptl);
        if (!pte)
                goto out;

        if (ptep_clear_flush_young(vma, address, pte))
                referenced++;

        /* Pretend the page is referenced if the task has the
           swap token and is in the middle of a page fault. */
        if (mm != current->mm && has_swap_token(mm) &&
                        rwsem_is_locked(&mm->mmap_sem))
                referenced++;

        (*mapcount)--;
        pte_unmap_unlock(pte, ptl);
out:
        return referenced;
}

static int page_referenced_anon(struct page *page)
{
        unsigned int mapcount;
        struct anon_vma *anon_vma;
        struct vm_area_struct *vma;
        int referenced = 0;

        anon_vma = page_lock_anon_vma(page);
        if (!anon_vma)
                return referenced;

        mapcount = page_mapcount(page);
        list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
                referenced += page_referenced_one(page, vma, &mapcount);
                if (!mapcount)
                        break;
        }
        spin_unlock(&anon_vma->lock);
        return referenced;
}

/**
 * page_referenced_file - referenced check for object-based rmap
 * @page: the page we're checking references on.
 *
 * For an object-based mapped page, find all the places it is mapped and
 * check/clear the referenced flag. This is done by following the page->mapping
 * pointer, then walking the chain of vmas it holds. It returns the number
 * of references it found.
 *
 * This function is only called from page_referenced for object-based pages.
 */
static int page_referenced_file(struct page *page)
{
        unsigned int mapcount;
        struct address_space *mapping = page->mapping;
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct vm_area_struct *vma;
        struct prio_tree_iter iter;
        int referenced = 0;

        /*
         * The caller's checks on page->mapping and !PageAnon have made
         * sure that this is a file page: the check for page->mapping
         * excludes the case just before it gets set on an anon page.
         */
        BUG_ON(PageAnon(page));

        /*
         * The page lock not only makes sure that page->mapping cannot
         * suddenly be NULLified by truncation, it makes sure that the
         * structure at mapping cannot be freed and reused yet,
         * so we can safely take mapping->i_mmap_lock.
         */
        BUG_ON(!PageLocked(page));

        spin_lock(&mapping->i_mmap_lock);

        /*
         * i_mmap_lock does not stabilize mapcount at all, but mapcount
         * is more likely to be accurate if we note it after spinning.
         */
        mapcount = page_mapcount(page);

        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
                                  == (VM_LOCKED|VM_MAYSHARE)) {
                        referenced++;
                        break;
                }
                referenced += page_referenced_one(page, vma, &mapcount);
                if (!mapcount)
                        break;
        }

        spin_unlock(&mapping->i_mmap_lock);
        return referenced;
}
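/*
 * Illustrative sketch (not part of the original file): how a reclaim-style
 * caller might consult page_referenced(), which is defined just below.
 * The helper name and the activation policy shown are assumptions, loosely
 * modelled on the page reclaim path.
 */
#if 0   /* example only, never compiled */
static int example_should_keep_active(struct page *page, int page_is_locked)
{
        int referenced;

        /* Clears the pte/page referenced bits as a side effect. */
        referenced = page_referenced(page, page_is_locked);

        /* Mapped pages with recent references are worth keeping around. */
        return referenced && page_mapped(page);
}
#endif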
/**
 * page_referenced - test if the page was referenced
 * @page: the page to test
 * @is_locked: caller holds lock on the page
 *
 * Quick test_and_clear_referenced for all mappings to a page,
 * returns the number of ptes which referenced the page.
 */
int page_referenced(struct page *page, int is_locked)
{
        int referenced = 0;

        if (page_test_and_clear_young(page))
                referenced++;

        if (TestClearPageReferenced(page))
                referenced++;

        if (page_mapped(page) && page->mapping) {
                if (PageAnon(page))
                        referenced += page_referenced_anon(page);
                else if (is_locked)
                        referenced += page_referenced_file(page);
                else if (TestSetPageLocked(page))
                        referenced++;
                else {
                        if (page->mapping)
                                referenced += page_referenced_file(page);
                        unlock_page(page);
                }
        }
        return referenced;
}

static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
        pte_t *pte, entry;
        spinlock_t *ptl;
        int ret = 0;

        address = vma_address(page, vma);
        if (address == -EFAULT)
                goto out;

        pte = page_check_address(page, mm, address, &ptl);
        if (!pte)
                goto out;

        if (!pte_dirty(*pte) && !pte_write(*pte))
                goto unlock;

        entry = ptep_get_and_clear(mm, address, pte);
        entry = pte_mkclean(entry);
        entry = pte_wrprotect(entry);
        ptep_establish(vma, address, pte, entry);
        lazy_mmu_prot_update(entry);
        ret = 1;

unlock:
        pte_unmap_unlock(pte, ptl);
out:
        return ret;
}

static int page_mkclean_file(struct address_space *mapping, struct page *page)
{
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct vm_area_struct *vma;
        struct prio_tree_iter iter;
        int ret = 0;

        BUG_ON(PageAnon(page));

        spin_lock(&mapping->i_mmap_lock);
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                if (vma->vm_flags & VM_SHARED)
                        ret += page_mkclean_one(page, vma);
        }
        spin_unlock(&mapping->i_mmap_lock);
        return ret;
}

int page_mkclean(struct page *page)
{
        int ret = 0;

        BUG_ON(!PageLocked(page));

        if (page_mapped(page)) {
                struct address_space *mapping = page_mapping(page);
                if (mapping)
                        ret = page_mkclean_file(mapping, page);
        }

        return ret;
}

/**
 * __page_set_anon_rmap - set up new anonymous rmap
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 */
static void __page_set_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address)
{
        struct anon_vma *anon_vma = vma->anon_vma;

        BUG_ON(!anon_vma);
        anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
        page->mapping = (struct address_space *) anon_vma;

        page->index = linear_page_index(vma, address);

        /*
         * nr_mapped state can be updated without turning off
         * interrupts because it is not modified via interrupt.
         */
        __inc_zone_page_state(page, NR_ANON_PAGES);
}

/**
 * page_add_anon_rmap - add pte mapping to an anonymous page
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 *
 * The caller needs to hold the pte lock.
 */
void page_add_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address)
{
        if (atomic_inc_and_test(&page->_mapcount))
                __page_set_anon_rmap(page, vma, address);
        /* else checking page index and mapping is racy */
}

/**
 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 *
 * Same as page_add_anon_rmap but must only be called on *new* pages.
 * This means the inc-and-test can be bypassed.
 */
void page_add_new_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address)
{
        atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
        __page_set_anon_rmap(page, vma, address);
}

/**
 * page_add_file_rmap - add pte mapping to a file page
 * @page: the page to add the mapping to
 *
 * The caller needs to hold the pte lock.
 */
void page_add_file_rmap(struct page *page)
{
        if (atomic_inc_and_test(&page->_mapcount))
                __inc_zone_page_state(page, NR_FILE_MAPPED);
}

/**
 * page_remove_rmap - take down pte mapping from a page
 * @page: page to remove mapping from
 *
 * The caller needs to hold the pte lock.
 */
void page_remove_rmap(struct page *page)
{
        if (atomic_add_negative(-1, &page->_mapcount)) {
                if (unlikely(page_mapcount(page) < 0)) {
                        printk(KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
                        printk(KERN_EMERG "  page->flags = %lx\n", page->flags);
                        printk(KERN_EMERG "  page->count = %x\n", page_count(page));
                        printk(KERN_EMERG "  page->mapping = %p\n", page->mapping);
                        BUG();
                }

                /*
                 * It would be tidy to reset the PageAnon mapping here,
                 * but that might overwrite a racing page_add_anon_rmap
                 * which increments mapcount after us but sets mapping
                 * before us: so leave the reset to free_hot_cold_page,
                 * and remember that it's only reliable while mapped.
                 * Leaving it set also helps swapoff to reinstate ptes
                 * faster for those pages still in swapcache.
                 */
                if (page_test_and_clear_dirty(page))
                        set_page_dirty(page);
                __dec_zone_page_state(page,
                        PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
        }
}
/*
 * Subfunctions of try_to_unmap: try_to_unmap_one called
 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
 */
static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                                int migration)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
        pte_t *pte;
        pte_t pteval;
        spinlock_t *ptl;
        int ret = SWAP_AGAIN;

        address = vma_address(page, vma);
        if (address == -EFAULT)
                goto out;

        pte = page_check_address(page, mm, address, &ptl);
        if (!pte)
                goto out;

        /*
         * If the page is mlock()d, we cannot swap it out.
         * If it's recently referenced (perhaps page_referenced
         * skipped over this mm) then we should reactivate it.
         */
        if (!migration && ((vma->vm_flags & VM_LOCKED) ||
                        (ptep_clear_flush_young(vma, address, pte)))) {
                ret = SWAP_FAIL;
                goto out_unmap;
        }

        /* Nuke the page table entry. */
        flush_cache_page(vma, address, page_to_pfn(page));
        pteval = ptep_clear_flush(vma, address, pte);

        /* Move the dirty bit to the physical page now the pte is gone. */
        if (pte_dirty(pteval))
                set_page_dirty(page);

        /* Update high watermark before we lower rss */
        update_hiwater_rss(mm);

        if (PageAnon(page)) {
                swp_entry_t entry = { .val = page_private(page) };

                if (PageSwapCache(page)) {
                        /*
                         * Store the swap location in the pte.
                         * See handle_pte_fault() ...
                         */
                        swap_duplicate(entry);
                        if (list_empty(&mm->mmlist)) {
                                spin_lock(&mmlist_lock);
                                if (list_empty(&mm->mmlist))
                                        list_add(&mm->mmlist, &init_mm.mmlist);
                                spin_unlock(&mmlist_lock);
                        }
                        dec_mm_counter(mm, anon_rss);
#ifdef CONFIG_MIGRATION
                } else {
                        /*
                         * Store the pfn of the page in a special migration
                         * pte. do_swap_page() will wait until the migration
                         * pte is removed and then restart fault handling.
                         */
                        BUG_ON(!migration);
                        entry = make_migration_entry(page, pte_write(pteval));
#endif
                }
                set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
                BUG_ON(pte_file(*pte));
        } else
#ifdef CONFIG_MIGRATION
        if (migration) {
                /* Establish migration entry for a file page */
                swp_entry_t entry;
                entry = make_migration_entry(page, pte_write(pteval));
                set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
        } else
#endif
                dec_mm_counter(mm, file_rss);


        page_remove_rmap(page);
        page_cache_release(page);

out_unmap:
        pte_unmap_unlock(pte, ptl);
out:
        return ret;
}
709 */ 710 #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) 711 #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) 712 713 static void try_to_unmap_cluster(unsigned long cursor, 714 unsigned int *mapcount, struct vm_area_struct *vma) 715 { 716 struct mm_struct *mm = vma->vm_mm; 717 pgd_t *pgd; 718 pud_t *pud; 719 pmd_t *pmd; 720 pte_t *pte; 721 pte_t pteval; 722 spinlock_t *ptl; 723 struct page *page; 724 unsigned long address; 725 unsigned long end; 726 727 address = (vma->vm_start + cursor) & CLUSTER_MASK; 728 end = address + CLUSTER_SIZE; 729 if (address < vma->vm_start) 730 address = vma->vm_start; 731 if (end > vma->vm_end) 732 end = vma->vm_end; 733 734 pgd = pgd_offset(mm, address); 735 if (!pgd_present(*pgd)) 736 return; 737 738 pud = pud_offset(pgd, address); 739 if (!pud_present(*pud)) 740 return; 741 742 pmd = pmd_offset(pud, address); 743 if (!pmd_present(*pmd)) 744 return; 745 746 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 747 748 /* Update high watermark before we lower rss */ 749 update_hiwater_rss(mm); 750 751 for (; address < end; pte++, address += PAGE_SIZE) { 752 if (!pte_present(*pte)) 753 continue; 754 page = vm_normal_page(vma, address, *pte); 755 BUG_ON(!page || PageAnon(page)); 756 757 if (ptep_clear_flush_young(vma, address, pte)) 758 continue; 759 760 /* Nuke the page table entry. */ 761 flush_cache_page(vma, address, pte_pfn(*pte)); 762 pteval = ptep_clear_flush(vma, address, pte); 763 764 /* If nonlinear, store the file page offset in the pte. */ 765 if (page->index != linear_page_index(vma, address)) 766 set_pte_at(mm, address, pte, pgoff_to_pte(page->index)); 767 768 /* Move the dirty bit to the physical page now the pte is gone. */ 769 if (pte_dirty(pteval)) 770 set_page_dirty(page); 771 772 page_remove_rmap(page); 773 page_cache_release(page); 774 dec_mm_counter(mm, file_rss); 775 (*mapcount)--; 776 } 777 pte_unmap_unlock(pte - 1, ptl); 778 } 779 780 static int try_to_unmap_anon(struct page *page, int migration) 781 { 782 struct anon_vma *anon_vma; 783 struct vm_area_struct *vma; 784 int ret = SWAP_AGAIN; 785 786 anon_vma = page_lock_anon_vma(page); 787 if (!anon_vma) 788 return ret; 789 790 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 791 ret = try_to_unmap_one(page, vma, migration); 792 if (ret == SWAP_FAIL || !page_mapped(page)) 793 break; 794 } 795 spin_unlock(&anon_vma->lock); 796 return ret; 797 } 798 799 /** 800 * try_to_unmap_file - unmap file page using the object-based rmap method 801 * @page: the page to unmap 802 * 803 * Find all the mappings of a page using the mapping pointer and the vma chains 804 * contained in the address_space struct it points to. 805 * 806 * This function is only called from try_to_unmap for object-based pages. 
807 */ 808 static int try_to_unmap_file(struct page *page, int migration) 809 { 810 struct address_space *mapping = page->mapping; 811 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 812 struct vm_area_struct *vma; 813 struct prio_tree_iter iter; 814 int ret = SWAP_AGAIN; 815 unsigned long cursor; 816 unsigned long max_nl_cursor = 0; 817 unsigned long max_nl_size = 0; 818 unsigned int mapcount; 819 820 spin_lock(&mapping->i_mmap_lock); 821 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 822 ret = try_to_unmap_one(page, vma, migration); 823 if (ret == SWAP_FAIL || !page_mapped(page)) 824 goto out; 825 } 826 827 if (list_empty(&mapping->i_mmap_nonlinear)) 828 goto out; 829 830 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 831 shared.vm_set.list) { 832 if ((vma->vm_flags & VM_LOCKED) && !migration) 833 continue; 834 cursor = (unsigned long) vma->vm_private_data; 835 if (cursor > max_nl_cursor) 836 max_nl_cursor = cursor; 837 cursor = vma->vm_end - vma->vm_start; 838 if (cursor > max_nl_size) 839 max_nl_size = cursor; 840 } 841 842 if (max_nl_size == 0) { /* any nonlinears locked or reserved */ 843 ret = SWAP_FAIL; 844 goto out; 845 } 846 847 /* 848 * We don't try to search for this page in the nonlinear vmas, 849 * and page_referenced wouldn't have found it anyway. Instead 850 * just walk the nonlinear vmas trying to age and unmap some. 851 * The mapcount of the page we came in with is irrelevant, 852 * but even so use it as a guide to how hard we should try? 853 */ 854 mapcount = page_mapcount(page); 855 if (!mapcount) 856 goto out; 857 cond_resched_lock(&mapping->i_mmap_lock); 858 859 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; 860 if (max_nl_cursor == 0) 861 max_nl_cursor = CLUSTER_SIZE; 862 863 do { 864 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 865 shared.vm_set.list) { 866 if ((vma->vm_flags & VM_LOCKED) && !migration) 867 continue; 868 cursor = (unsigned long) vma->vm_private_data; 869 while ( cursor < max_nl_cursor && 870 cursor < vma->vm_end - vma->vm_start) { 871 try_to_unmap_cluster(cursor, &mapcount, vma); 872 cursor += CLUSTER_SIZE; 873 vma->vm_private_data = (void *) cursor; 874 if ((int)mapcount <= 0) 875 goto out; 876 } 877 vma->vm_private_data = (void *) max_nl_cursor; 878 } 879 cond_resched_lock(&mapping->i_mmap_lock); 880 max_nl_cursor += CLUSTER_SIZE; 881 } while (max_nl_cursor <= max_nl_size); 882 883 /* 884 * Don't loop forever (perhaps all the remaining pages are 885 * in locked vmas). Reset cursor on all unreserved nonlinear 886 * vmas, now forgetting on which ones it had fallen behind. 887 */ 888 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) 889 vma->vm_private_data = NULL; 890 out: 891 spin_unlock(&mapping->i_mmap_lock); 892 return ret; 893 } 894 895 /** 896 * try_to_unmap - try to remove all page table mappings to a page 897 * @page: the page to get unmapped 898 * 899 * Tries to remove all the page table entries which are mapping this 900 * page, used in the pageout path. Caller must hold the page lock. 
901 * Return values are: 902 * 903 * SWAP_SUCCESS - we succeeded in removing all mappings 904 * SWAP_AGAIN - we missed a mapping, try again later 905 * SWAP_FAIL - the page is unswappable 906 */ 907 int try_to_unmap(struct page *page, int migration) 908 { 909 int ret; 910 911 BUG_ON(!PageLocked(page)); 912 913 if (PageAnon(page)) 914 ret = try_to_unmap_anon(page, migration); 915 else 916 ret = try_to_unmap_file(page, migration); 917 918 if (!page_mapped(page)) 919 ret = SWAP_SUCCESS; 920 return ret; 921 } 922 923