/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm_inline.h>
#include <linux/buffer_head.h>	/* for try_to_release_page() */
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>

/* How many pages do we try to swap or page in/out together? */
int page_cluster;

static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, };

/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 */
static void fastcall __page_cache_release(struct page *page)
{
	if (PageLRU(page)) {
		unsigned long flags;
		struct zone *zone = page_zone(page);

		spin_lock_irqsave(&zone->lru_lock, flags);
		VM_BUG_ON(!PageLRU(page));
		__ClearPageLRU(page);
		del_page_from_lru(zone, page);
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	}
	free_hot_page(page);
}

static void put_compound_page(struct page *page)
{
	page = compound_head(page);
	if (put_page_testzero(page)) {
		compound_page_dtor *dtor;

		dtor = get_compound_page_dtor(page);
		(*dtor)(page);
	}
}

void put_page(struct page *page)
{
	if (unlikely(PageCompound(page)))
		put_compound_page(page);
	else if (put_page_testzero(page))
		__page_cache_release(page);
}
EXPORT_SYMBOL(put_page);

/**
 * put_pages_list(): release a list of pages
 *
 * Release a list of pages which are strung together on page.lru.  Currently
 * used by read_cache_pages() and related error recovery code.
 *
 * @pages: list of pages threaded on page->lru
 */
void put_pages_list(struct list_head *pages)
{
	while (!list_empty(pages)) {
		struct page *victim;

		victim = list_entry(pages->prev, struct page, lru);
		list_del(&victim->lru);
		page_cache_release(victim);
	}
}
EXPORT_SYMBOL(put_pages_list);
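/*
 * Illustrative sketch, not part of the original file: the pattern behind
 * read_cache_pages()-style callers.  Freshly allocated pages are not yet
 * on the LRU, so page->lru is free to string them onto a private list;
 * if the caller has to bail out, put_pages_list() drops the allocation
 * reference on everything still queued.  The function name and the
 * early-abort condition here are hypothetical.
 */
static void example_alloc_then_abort(struct address_space *mapping, unsigned nr)
{
	LIST_HEAD(pages);
	unsigned i;

	for (i = 0; i < nr; i++) {
		struct page *page = page_cache_alloc_cold(mapping);

		if (!page)
			break;
		list_add(&page->lru, &pages);
	}

	/* Pretend I/O submission failed before the pages were used. */
	put_pages_list(&pages);
}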
/*
 * pagevec_move_tail() must be called with IRQ disabled.
 * Otherwise this may cause nasty races.
 */
static void pagevec_move_tail(struct pagevec *pvec)
{
	int i;
	int pgmoved = 0;
	struct zone *zone = NULL;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock(&zone->lru_lock);
			zone = pagezone;
			spin_lock(&zone->lru_lock);
		}
		if (PageLRU(page) && !PageActive(page)) {
			list_move_tail(&page->lru, &zone->inactive_list);
			pgmoved++;
		}
	}
	if (zone)
		spin_unlock(&zone->lru_lock);
	__count_vm_events(PGROTATED, pgmoved);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 * inactive list.
 *
 * Returns zero if it cleared PG_writeback.
 */
int rotate_reclaimable_page(struct page *page)
{
	struct pagevec *pvec;
	unsigned long flags;

	if (PageLocked(page))
		return 1;
	if (PageDirty(page))
		return 1;
	if (PageActive(page))
		return 1;
	if (!PageLRU(page))
		return 1;

	page_cache_get(page);
	local_irq_save(flags);
	pvec = &__get_cpu_var(lru_rotate_pvecs);
	if (!pagevec_add(pvec, page))
		pagevec_move_tail(pvec);
	local_irq_restore(flags);

	if (!test_clear_page_writeback(page))
		BUG();

	return 0;
}

/*
 * FIXME: speed this up?
 */
void fastcall activate_page(struct page *page)
{
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	if (PageLRU(page) && !PageActive(page)) {
		del_page_from_inactive_list(zone, page);
		SetPageActive(page);
		add_page_to_active_list(zone, page);
		__count_vm_event(PGACTIVATE);
	}
	spin_unlock_irq(&zone->lru_lock);
}

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced	->	inactive,referenced
 * inactive,referenced		->	active,unreferenced
 * active,unreferenced		->	active,referenced
 */
void fastcall mark_page_accessed(struct page *page)
{
	if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
		activate_page(page);
		ClearPageReferenced(page);
	} else if (!PageReferenced(page)) {
		SetPageReferenced(page);
	}
}

EXPORT_SYMBOL(mark_page_accessed);

/**
 * lru_cache_add: add a page to the page lists
 * @page: the page to add
 */
void fastcall lru_cache_add(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);

	page_cache_get(page);
	if (!pagevec_add(pvec, page))
		__pagevec_lru_add(pvec);
	put_cpu_var(lru_add_pvecs);
}

void fastcall lru_cache_add_active(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);

	page_cache_get(page);
	if (!pagevec_add(pvec, page))
		__pagevec_lru_add_active(pvec);
	put_cpu_var(lru_add_active_pvecs);
}
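/*
 * Illustrative sketch, not part of the original file: the life of a page
 * that enters the LRU through lru_cache_add() and is then aged by
 * mark_page_accessed().  Ignoring the per-cpu pagevec batching above, the
 * transitions follow the table in the mark_page_accessed() comment.  The
 * function name is hypothetical; the real callers are the page-cache
 * insertion helpers and the fault path.
 */
static void example_add_and_reference(struct page *page)
{
	/* New page: queued for the inactive list, unreferenced. */
	lru_cache_add(page);

	/* First use: inactive,unreferenced -> inactive,referenced. */
	mark_page_accessed(page);

	/* Second use: inactive,referenced -> active,unreferenced. */
	mark_page_accessed(page);
}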
/*
 * Drain pages out of the cpu's pagevecs.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 */
static void drain_cpu_pagevecs(int cpu)
{
	struct pagevec *pvec;

	pvec = &per_cpu(lru_add_pvecs, cpu);
	if (pagevec_count(pvec))
		__pagevec_lru_add(pvec);

	pvec = &per_cpu(lru_add_active_pvecs, cpu);
	if (pagevec_count(pvec))
		__pagevec_lru_add_active(pvec);

	pvec = &per_cpu(lru_rotate_pvecs, cpu);
	if (pagevec_count(pvec)) {
		unsigned long flags;

		/* No harm done if a racing interrupt already did this */
		local_irq_save(flags);
		pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}
}

void lru_add_drain(void)
{
	drain_cpu_pagevecs(get_cpu());
	put_cpu();
}

#ifdef CONFIG_NUMA
static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
	lru_add_drain();
}

/*
 * Returns 0 for success
 */
int lru_add_drain_all(void)
{
	return schedule_on_each_cpu(lru_add_drain_per_cpu);
}

#else

/*
 * Returns 0 for success
 */
int lru_add_drain_all(void)
{
	lru_add_drain();
	return 0;
}
#endif

/*
 * Batched page_cache_release().  Decrement the reference count on all the
 * passed pages.  If it fell to zero then remove the page from the LRU and
 * free it.
 *
 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
 * for the remainder of the operation.
 *
 * The locking in this function is against shrink_cache(): we recheck the
 * page count inside the lock to see whether shrink_cache grabbed the page
 * via the LRU.  If it did, give up: shrink_cache will free it.
 */
void release_pages(struct page **pages, int nr, int cold)
{
	int i;
	struct pagevec pages_to_free;
	struct zone *zone = NULL;
	unsigned long uninitialized_var(flags);

	pagevec_init(&pages_to_free, cold);
	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];

		if (unlikely(PageCompound(page))) {
			if (zone) {
				spin_unlock_irqrestore(&zone->lru_lock, flags);
				zone = NULL;
			}
			put_compound_page(page);
			continue;
		}

		if (!put_page_testzero(page))
			continue;

		if (PageLRU(page)) {
			struct zone *pagezone = page_zone(page);
			if (pagezone != zone) {
				if (zone)
					spin_unlock_irqrestore(&zone->lru_lock,
									flags);
				zone = pagezone;
				spin_lock_irqsave(&zone->lru_lock, flags);
			}
			VM_BUG_ON(!PageLRU(page));
			__ClearPageLRU(page);
			del_page_from_lru(zone, page);
		}

		if (!pagevec_add(&pages_to_free, page)) {
			if (zone) {
				spin_unlock_irqrestore(&zone->lru_lock, flags);
				zone = NULL;
			}
			__pagevec_free(&pages_to_free);
			pagevec_reinit(&pages_to_free);
		}
	}
	if (zone)
		spin_unlock_irqrestore(&zone->lru_lock, flags);

	pagevec_free(&pages_to_free);
}
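/*
 * Illustrative sketch, not part of the original file: a caller holding an
 * array of pinned pages (for instance the result of get_user_pages()) can
 * drop all of its references with one batched call rather than looping
 * over page_cache_release().  The function name is hypothetical.
 */
static void example_unpin_pages(struct page **pages, int nr)
{
	/* Last argument: these are not known to be cache-cold pages. */
	release_pages(pages, nr, 0);
}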
/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
	lru_add_drain();
	release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
	pagevec_reinit(pvec);
}

EXPORT_SYMBOL(__pagevec_release);

/*
 * pagevec_release() for pages which are known to not be on the LRU
 *
 * This function reinitialises the caller's pagevec.
 */
void __pagevec_release_nonlru(struct pagevec *pvec)
{
	int i;
	struct pagevec pages_to_free;

	pagevec_init(&pages_to_free, pvec->cold);
	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];

		VM_BUG_ON(PageLRU(page));
		if (put_page_testzero(page))
			pagevec_add(&pages_to_free, page);
	}
	pagevec_free(&pages_to_free);
	pagevec_reinit(pvec);
}

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 */
void __pagevec_lru_add(struct pagevec *pvec)
{
	int i;
	struct zone *zone = NULL;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irq(&zone->lru_lock);
			zone = pagezone;
			spin_lock_irq(&zone->lru_lock);
		}
		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);
		add_page_to_inactive_list(zone, page);
	}
	if (zone)
		spin_unlock_irq(&zone->lru_lock);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

EXPORT_SYMBOL(__pagevec_lru_add);

void __pagevec_lru_add_active(struct pagevec *pvec)
{
	int i;
	struct zone *zone = NULL;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irq(&zone->lru_lock);
			zone = pagezone;
			spin_lock_irq(&zone->lru_lock);
		}
		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);
		VM_BUG_ON(PageActive(page));
		SetPageActive(page);
		add_page_to_active_list(zone, page);
	}
	if (zone)
		spin_unlock_irq(&zone->lru_lock);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

/*
 * Try to drop buffers from the pages in a pagevec
 */
void pagevec_strip(struct pagevec *pvec)
{
	int i;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];

		if (PagePrivate(page) && !TestSetPageLocked(page)) {
			if (PagePrivate(page))
				try_to_release_page(page, 0);
			unlock_page(page);
		}
	}
}

/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec:	Where the resulting pages are placed
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @nr_pages:	The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t start, unsigned nr_pages)
{
	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
	return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup);
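/*
 * Illustrative sketch, not part of the original file: the usual pattern
 * for scanning a mapping with pagevec_lookup().  Each batch is handed back
 * with pagevec_release(), which drops the references taken by the lookup
 * and reinitialises the pagevec.  The function name and the per-page loop
 * body are hypothetical.
 */
static void example_scan_mapping(struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t next = 0;

	pagevec_init(&pvec, 0);
	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
		int i;

		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* Continue the scan after the last page seen. */
			next = page->index + 1;
		}
		pagevec_release(&pvec);
	}
}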
unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t *index, int tag, unsigned nr_pages)
{
	pvec->nr = find_get_pages_tag(mapping, index, tag,
					nr_pages, pvec->pages);
	return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup_tag);

#ifdef CONFIG_SMP
/*
 * We tolerate a little inaccuracy to avoid ping-ponging the counter between
 * CPUs
 */
#define ACCT_THRESHOLD	max(16, NR_CPUS * 2)

static DEFINE_PER_CPU(long, committed_space) = 0;

void vm_acct_memory(long pages)
{
	long *local;

	preempt_disable();
	local = &__get_cpu_var(committed_space);
	*local += pages;
	if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
		atomic_add(*local, &vm_committed_space);
		*local = 0;
	}
	preempt_enable();
}

#ifdef CONFIG_HOTPLUG_CPU

/* Drop the CPU's cached committed space back into the central pool. */
static int cpu_swap_callback(struct notifier_block *nfb,
			     unsigned long action,
			     void *hcpu)
{
	long *committed;

	committed = &per_cpu(committed_space, (long)hcpu);
	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
		atomic_add(*committed, &vm_committed_space);
		*committed = 0;
		drain_cpu_pagevecs((long)hcpu);
	}
	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
#endif /* CONFIG_SMP */

/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
	unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);

	/* Use a smaller cluster for small-memory machines */
	if (megs < 16)
		page_cluster = 2;
	else
		page_cluster = 3;
	/*
	 * Right now other parts of the system mean that we
	 * _really_ don't want to cluster much more.
	 */
#ifdef CONFIG_HOTPLUG_CPU
	hotcpu_notifier(cpu_swap_callback, 0);
#endif
}
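/*
 * Illustrative sketch, not part of the original file: page_cluster, set up
 * above, is used as a power of two - swap readahead works on windows of up
 * to 1 << page_cluster pages (see the vm.page-cluster sysctl described in
 * Documentation/sysctl/vm.txt).  The helper below is hypothetical.
 */
static inline unsigned long example_swap_readahead_window(void)
{
	return 1UL << page_cluster;
}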