/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm_inline.h>
#include <linux/buffer_head.h>	/* for try_to_release_page() */
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>

#include "internal.h"

/* How many pages do we try to swap or page in/out together? */
int page_cluster;

static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);

/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 */
static void __page_cache_release(struct page *page)
{
        if (PageLRU(page)) {
                unsigned long flags;
                struct zone *zone = page_zone(page);

                spin_lock_irqsave(&zone->lru_lock, flags);
                VM_BUG_ON(!PageLRU(page));
                __ClearPageLRU(page);
                del_page_from_lru(zone, page);
                spin_unlock_irqrestore(&zone->lru_lock, flags);
        }
        free_hot_page(page);
}

static void put_compound_page(struct page *page)
{
        page = compound_head(page);
        if (put_page_testzero(page)) {
                compound_page_dtor *dtor;

                dtor = get_compound_page_dtor(page);
                (*dtor)(page);
        }
}

void put_page(struct page *page)
{
        if (unlikely(PageCompound(page)))
                put_compound_page(page);
        else if (put_page_testzero(page))
                __page_cache_release(page);
}
EXPORT_SYMBOL(put_page);

/**
 * put_pages_list() - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Release a list of pages which are strung together on page->lru.  Currently
 * used by read_cache_pages() and related error recovery code.
 */
void put_pages_list(struct list_head *pages)
{
        while (!list_empty(pages)) {
                struct page *victim;

                victim = list_entry(pages->prev, struct page, lru);
                list_del(&victim->lru);
                page_cache_release(victim);
        }
}
EXPORT_SYMBOL(put_pages_list);
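/*
 * Illustrative only (not part of the original file): the pattern
 * put_pages_list() exists for.  A caller - read_cache_pages() and the
 * readahead error paths are the in-tree examples - builds a private list
 * threaded on page->lru and hands the leftovers back in a single call
 * instead of one page_cache_release() per page:
 *
 *        LIST_HEAD(pages);
 *
 *        ...allocate pages and list_add(&page->lru, &pages)...
 *        if (error)
 *                put_pages_list(&pages);  // one reference dropped per page
 */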
/*
 * pagevec_move_tail() must be called with IRQ disabled.
 * Otherwise this may cause nasty races.
 */
static void pagevec_move_tail(struct pagevec *pvec)
{
        int i;
        int pgmoved = 0;
        struct zone *zone = NULL;

        for (i = 0; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];
                struct zone *pagezone = page_zone(page);

                if (pagezone != zone) {
                        if (zone)
                                spin_unlock(&zone->lru_lock);
                        zone = pagezone;
                        spin_lock(&zone->lru_lock);
                }
                if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
                        int lru = page_is_file_cache(page);
                        list_move_tail(&page->lru, &zone->lru[lru].list);
                        pgmoved++;
                }
        }
        if (zone)
                spin_unlock(&zone->lru_lock);
        __count_vm_events(PGROTATED, pgmoved);
        release_pages(pvec->pages, pvec->nr, pvec->cold);
        pagevec_reinit(pvec);
}

/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 * inactive list.
 */
void rotate_reclaimable_page(struct page *page)
{
        if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
            !PageUnevictable(page) && PageLRU(page)) {
                struct pagevec *pvec;
                unsigned long flags;

                page_cache_get(page);
                local_irq_save(flags);
                pvec = &__get_cpu_var(lru_rotate_pvecs);
                if (!pagevec_add(pvec, page))
                        pagevec_move_tail(pvec);
                local_irq_restore(flags);
        }
}

/*
 * FIXME: speed this up?
 */
void activate_page(struct page *page)
{
        struct zone *zone = page_zone(page);

        spin_lock_irq(&zone->lru_lock);
        if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
                int file = page_is_file_cache(page);
                int lru = LRU_BASE + file;
                del_page_from_lru_list(zone, page, lru);

                SetPageActive(page);
                lru += LRU_ACTIVE;
                add_page_to_lru_list(zone, page, lru);
                __count_vm_event(PGACTIVATE);
                mem_cgroup_move_lists(page, lru);

                zone->recent_rotated[!!file]++;
                zone->recent_scanned[!!file]++;
        }
        spin_unlock_irq(&zone->lru_lock);
}

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced  ->  inactive,referenced
 * inactive,referenced    ->  active,unreferenced
 * active,unreferenced    ->  active,referenced
 */
void mark_page_accessed(struct page *page)
{
        if (!PageActive(page) && !PageUnevictable(page) &&
                        PageReferenced(page) && PageLRU(page)) {
                activate_page(page);
                ClearPageReferenced(page);
        } else if (!PageReferenced(page)) {
                SetPageReferenced(page);
        }
}

EXPORT_SYMBOL(mark_page_accessed);

void __lru_cache_add(struct page *page, enum lru_list lru)
{
        struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];

        page_cache_get(page);
        if (!pagevec_add(pvec, page))
                ____pagevec_lru_add(pvec, lru);
        put_cpu_var(lru_add_pvecs);
}
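/*
 * Illustrative only: feeding a freshly allocated, not-yet-on-LRU page into
 * the deferred-add machinery above.  In practice this is normally reached
 * through the thin inline wrappers in include/linux/swap.h (e.g.
 * lru_cache_add_anon()/lru_cache_add_file()) rather than called directly:
 *
 *        __lru_cache_add(page, LRU_INACTIVE_FILE);  // file-backed page
 *        __lru_cache_add(page, LRU_INACTIVE_ANON);  // anonymous page
 *
 * __lru_cache_add() takes its own reference with page_cache_get(), so the
 * caller's reference on the page is unaffected.
 */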
/**
 * lru_cache_add_lru - add a page to a page list
 * @page: the page to be added to the LRU.
 * @lru: the LRU list to which the page is added.
 */
void lru_cache_add_lru(struct page *page, enum lru_list lru)
{
        if (PageActive(page)) {
                VM_BUG_ON(PageUnevictable(page));
                ClearPageActive(page);
        } else if (PageUnevictable(page)) {
                VM_BUG_ON(PageActive(page));
                ClearPageUnevictable(page);
        }

        VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page));
        __lru_cache_add(page, lru);
}

/**
 * add_page_to_unevictable_list - add a page to the unevictable list
 * @page: the page to be added to the unevictable list
 *
 * Add page directly to its zone's unevictable list.  To avoid races with
 * tasks that might be making the page evictable, e.g. through munlock,
 * munmap or exit, while it's not on the lru, we want to add the page
 * while it's locked or otherwise "invisible" to other tasks.  This is
 * difficult to do when using the pagevec cache, so bypass that.
 */
void add_page_to_unevictable_list(struct page *page)
{
        struct zone *zone = page_zone(page);

        spin_lock_irq(&zone->lru_lock);
        SetPageUnevictable(page);
        SetPageLRU(page);
        add_page_to_lru_list(zone, page, LRU_UNEVICTABLE);
        spin_unlock_irq(&zone->lru_lock);
}

/**
 * lru_cache_add_active_or_unevictable
 * @page: the page to be added to LRU
 * @vma: vma in which page is mapped for determining reclaimability
 *
 * Place @page on the active or unevictable LRU list, depending on
 * page_evictable().  Note that if the page is not evictable, it goes
 * directly back onto its zone's unevictable list.  It does NOT use a
 * per-cpu pagevec.
 */
void lru_cache_add_active_or_unevictable(struct page *page,
                                        struct vm_area_struct *vma)
{
        if (page_evictable(page, vma))
                lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page));
        else
                add_page_to_unevictable_list(page);
}

/*
 * Drain pages out of the cpu's pagevecs.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 */
static void drain_cpu_pagevecs(int cpu)
{
        struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
        struct pagevec *pvec;
        int lru;

        for_each_lru(lru) {
                pvec = &pvecs[lru - LRU_BASE];
                if (pagevec_count(pvec))
                        ____pagevec_lru_add(pvec, lru);
        }

        pvec = &per_cpu(lru_rotate_pvecs, cpu);
        if (pagevec_count(pvec)) {
                unsigned long flags;

                /* No harm done if a racing interrupt already did this */
                local_irq_save(flags);
                pagevec_move_tail(pvec);
                local_irq_restore(flags);
        }
}

void lru_add_drain(void)
{
        drain_cpu_pagevecs(get_cpu());
        put_cpu();
}

#if defined(CONFIG_NUMA) || defined(CONFIG_UNEVICTABLE_LRU)
static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
        lru_add_drain();
}

/*
 * Returns 0 for success
 */
int lru_add_drain_all(void)
{
        return schedule_on_each_cpu(lru_add_drain_per_cpu);
}

#else

/*
 * Returns 0 for success
 */
int lru_add_drain_all(void)
{
        lru_add_drain();
        return 0;
}
#endif
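/*
 * Illustrative only: a hypothetical caller that is about to walk the
 * per-zone LRU lists and first wants any pages still parked in the
 * per-cpu lru_add_pvecs to become visible on those lists:
 *
 *        ret = lru_add_drain_all();      // flush every CPU's pagevecs
 *        if (ret)
 *                return ret;             // could not run on all CPUs
 *        spin_lock_irq(&zone->lru_lock);
 *        ...walk zone->lru[lru].list...
 *        spin_unlock_irq(&zone->lru_lock);
 */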
/*
 * Batched page_cache_release().  Decrement the reference count on all the
 * passed pages.  If it fell to zero then remove the page from the LRU and
 * free it.
 *
 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
 * for the remainder of the operation.
 *
 * The locking in this function is against shrink_inactive_list(): we recheck
 * the page count inside the lock to see whether shrink_inactive_list()
 * grabbed the page via the LRU.  If it did, give up: shrink_inactive_list()
 * will free it.
 */
void release_pages(struct page **pages, int nr, int cold)
{
        int i;
        struct pagevec pages_to_free;
        struct zone *zone = NULL;
        unsigned long uninitialized_var(flags);

        pagevec_init(&pages_to_free, cold);
        for (i = 0; i < nr; i++) {
                struct page *page = pages[i];

                if (unlikely(PageCompound(page))) {
                        if (zone) {
                                spin_unlock_irqrestore(&zone->lru_lock, flags);
                                zone = NULL;
                        }
                        put_compound_page(page);
                        continue;
                }

                if (!put_page_testzero(page))
                        continue;

                if (PageLRU(page)) {
                        struct zone *pagezone = page_zone(page);

                        if (pagezone != zone) {
                                if (zone)
                                        spin_unlock_irqrestore(&zone->lru_lock,
                                                                        flags);
                                zone = pagezone;
                                spin_lock_irqsave(&zone->lru_lock, flags);
                        }
                        VM_BUG_ON(!PageLRU(page));
                        __ClearPageLRU(page);
                        del_page_from_lru(zone, page);
                }

                if (!pagevec_add(&pages_to_free, page)) {
                        if (zone) {
                                spin_unlock_irqrestore(&zone->lru_lock, flags);
                                zone = NULL;
                        }
                        __pagevec_free(&pages_to_free);
                        pagevec_reinit(&pages_to_free);
                }
        }
        if (zone)
                spin_unlock_irqrestore(&zone->lru_lock, flags);

        pagevec_free(&pages_to_free);
}

/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
        lru_add_drain();
        release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
        pagevec_reinit(pvec);
}

EXPORT_SYMBOL(__pagevec_release);

/*
 * pagevec_release() for pages which are known to not be on the LRU
 *
 * This function reinitialises the caller's pagevec.
 */
void __pagevec_release_nonlru(struct pagevec *pvec)
{
        int i;
        struct pagevec pages_to_free;

        pagevec_init(&pages_to_free, pvec->cold);
        for (i = 0; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];

                VM_BUG_ON(PageLRU(page));
                if (put_page_testzero(page))
                        pagevec_add(&pages_to_free, page);
        }
        pagevec_free(&pages_to_free);
        pagevec_reinit(pvec);
}

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 */
void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
{
        int i;
        struct zone *zone = NULL;
        VM_BUG_ON(is_unevictable_lru(lru));

        for (i = 0; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];
                struct zone *pagezone = page_zone(page);

                if (pagezone != zone) {
                        if (zone)
                                spin_unlock_irq(&zone->lru_lock);
                        zone = pagezone;
                        spin_lock_irq(&zone->lru_lock);
                }
                VM_BUG_ON(PageActive(page));
                VM_BUG_ON(PageUnevictable(page));
                VM_BUG_ON(PageLRU(page));
                SetPageLRU(page);
                if (is_active_lru(lru))
                        SetPageActive(page);
                add_page_to_lru_list(zone, page, lru);
        }
        if (zone)
                spin_unlock_irq(&zone->lru_lock);
        release_pages(pvec->pages, pvec->nr, pvec->cold);
        pagevec_reinit(pvec);
}

EXPORT_SYMBOL(____pagevec_lru_add);

/*
 * Try to drop buffers from the pages in a pagevec
 */
void pagevec_strip(struct pagevec *pvec)
{
        int i;

        for (i = 0; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];

                if (PagePrivate(page) && trylock_page(page)) {
                        if (PagePrivate(page))
                                try_to_release_page(page, 0);
                        unlock_page(page);
                }
        }
}

/**
 * pagevec_swap_free - try to free swap space from the pages in a pagevec
 * @pvec: pagevec with swapcache pages to free the swap space of
 *
 * The caller needs to hold an extra reference to each page and
 * not hold the page lock on the pages.  This function uses a
 * trylock on the page lock so it may not always free the swap
 * space associated with a page.
 */
void pagevec_swap_free(struct pagevec *pvec)
{
        int i;

        for (i = 0; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];

                if (PageSwapCache(page) && trylock_page(page)) {
                        if (PageSwapCache(page))
                                remove_exclusive_swap_page_ref(page);
                        unlock_page(page);
                }
        }
}

/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec: Where the resulting pages are placed
 * @mapping: The address_space to search
 * @start: The starting page index
 * @nr_pages: The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
                pgoff_t start, unsigned nr_pages)
{
        pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
        return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup);
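/*
 * Illustrative only: the canonical loop that drives pagevec_lookup().
 * "process_page" stands in for whatever per-page work the caller does;
 * "mapping" is the address_space being walked.
 *
 *        struct pagevec pvec;
 *        pgoff_t index = 0;
 *        int i;
 *
 *        pagevec_init(&pvec, 0);
 *        while (pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE)) {
 *                for (i = 0; i < pagevec_count(&pvec); i++) {
 *                        struct page *page = pvec.pages[i];
 *
 *                        index = page->index + 1;  // resume after this page
 *                        process_page(page);
 *                }
 *                pagevec_release(&pvec);  // drop the references taken above
 *                cond_resched();
 *        }
 */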
unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
                pgoff_t *index, int tag, unsigned nr_pages)
{
        pvec->nr = find_get_pages_tag(mapping, index, tag,
                                        nr_pages, pvec->pages);
        return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup_tag);

#ifdef CONFIG_SMP
/*
 * We tolerate a little inaccuracy to avoid ping-ponging the counter between
 * CPUs
 */
#define ACCT_THRESHOLD	max(16, NR_CPUS * 2)

static DEFINE_PER_CPU(long, committed_space);

void vm_acct_memory(long pages)
{
        long *local;

        preempt_disable();
        local = &__get_cpu_var(committed_space);
        *local += pages;
        if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
                atomic_long_add(*local, &vm_committed_space);
                *local = 0;
        }
        preempt_enable();
}

#ifdef CONFIG_HOTPLUG_CPU

/* Drop the CPU's cached committed space back into the central pool. */
static int cpu_swap_callback(struct notifier_block *nfb,
                             unsigned long action,
                             void *hcpu)
{
        long *committed;

        committed = &per_cpu(committed_space, (long)hcpu);
        if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
                atomic_long_add(*committed, &vm_committed_space);
                *committed = 0;
                drain_cpu_pagevecs((long)hcpu);
        }
        return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
#endif /* CONFIG_SMP */

/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
        unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);

#ifdef CONFIG_SWAP
        bdi_init(swapper_space.backing_dev_info);
#endif

        /* Use a smaller cluster for small-memory machines */
        if (megs < 16)
                page_cluster = 2;
        else
                page_cluster = 3;
        /*
         * Right now other parts of the system mean that we
         * _really_ don't want to cluster much more.
         */
#ifdef CONFIG_HOTPLUG_CPU
        hotcpu_notifier(cpu_swap_callback, 0);
#endif
}
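/*
 * Note on page_cluster: it is treated as a log2 value by the swapin
 * readahead code, which brings in up to 1 << page_cluster pages around a
 * faulting swap entry.  The defaults chosen above therefore mean 4 pages
 * on machines with less than 16MB of memory and 8 pages otherwise.  The
 * value is tunable at runtime via /proc/sys/vm/page-cluster (see
 * Documentation/sysctl/vm.txt, referenced at the top of this file).
 */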