xref: /linux/mm/swap.c (revision f24e9f586b377749dff37554696cf3a105540c94)
/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm_inline.h>
#include <linux/buffer_head.h>	/* for try_to_release_page() */
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>

/* How many pages do we try to swap or page in/out together? */
int page_cluster;

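/*
 * put_compound_page() is handed one constituent page of a compound
 * (higher-order) page.  All refcounting is done against the head page:
 * page_private() of every constituent points back at the head, and the
 * compound destructor is kept in page[1].lru.next, so once the head's
 * count drops to zero the destructor tears down the whole compound page.
 */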
static void put_compound_page(struct page *page)
{
	page = (struct page *)page_private(page);
	if (put_page_testzero(page)) {
		void (*dtor)(struct page *page);

		dtor = (void (*)(struct page *))page[1].lru.next;
		(*dtor)(page);
	}
}

void put_page(struct page *page)
{
	if (unlikely(PageCompound(page)))
		put_compound_page(page);
	else if (put_page_testzero(page))
		__page_cache_release(page);
}
EXPORT_SYMBOL(put_page);

/**
 * put_pages_list - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Release a list of pages which are strung together on page.lru.  Currently
 * used by read_cache_pages() and related error recovery code.
 */
void put_pages_list(struct list_head *pages)
{
	while (!list_empty(pages)) {
		struct page *victim;

		victim = list_entry(pages->prev, struct page, lru);
		list_del(&victim->lru);
		page_cache_release(victim);
	}
}
EXPORT_SYMBOL(put_pages_list);
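
/*
 * Example (illustrative, not from this file): a caller typically threads
 * its pages onto a private list via page->lru and releases the whole
 * batch in one go.  "my_pages" and "nr" are hypothetical names:
 *
 *	LIST_HEAD(pages);
 *	int i;
 *
 *	for (i = 0; i < nr; i++)
 *		list_add(&my_pages[i]->lru, &pages);
 *	put_pages_list(&pages);		drops one reference per page
 */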

/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 * inactive list.  The page still has PageWriteback set, which will pin it.
 *
 * We don't expect many pages to come through here, so don't bother batching
 * things up.
 *
 * To avoid placing the page at the tail of the LRU while PG_writeback is still
 * set, this function will clear PG_writeback before performing the page
 * motion.  Do that inside the lru lock because once PG_writeback is cleared
 * we may not touch the page.
 *
 * Returns zero if it cleared PG_writeback.
 */
int rotate_reclaimable_page(struct page *page)
{
	struct zone *zone;
	unsigned long flags;

	if (PageLocked(page))
		return 1;
	if (PageDirty(page))
		return 1;
	if (PageActive(page))
		return 1;
	if (!PageLRU(page))
		return 1;

	zone = page_zone(page);
	spin_lock_irqsave(&zone->lru_lock, flags);
	if (PageLRU(page) && !PageActive(page)) {
		list_move_tail(&page->lru, &zone->inactive_list);
		__count_vm_event(PGROTATED);
	}
	if (!test_clear_page_writeback(page))
		BUG();
	spin_unlock_irqrestore(&zone->lru_lock, flags);
	return 0;
}
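
/*
 * Note: the expected caller is end_page_writeback(), which (in this era)
 * tries rotate_reclaimable_page() for pages that had PG_reclaim set and
 * clears PG_writeback itself when this function returns nonzero.  A
 * caller sketch, under that assumption:
 *
 *	if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page))
 *		if (!test_clear_page_writeback(page))
 *			BUG();
 */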

/*
 * FIXME: speed this up?
 */
void fastcall activate_page(struct page *page)
{
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	if (PageLRU(page) && !PageActive(page)) {
		del_page_from_inactive_list(zone, page);
		SetPageActive(page);
		add_page_to_active_list(zone, page);
		__count_vm_event(PGACTIVATE);
	}
	spin_unlock_irq(&zone->lru_lock);
}

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced	->	inactive,referenced
 * inactive,referenced		->	active,unreferenced
 * active,unreferenced		->	active,referenced
 */
void fastcall mark_page_accessed(struct page *page)
{
	if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
		activate_page(page);
		ClearPageReferenced(page);
	} else if (!PageReferenced(page)) {
		SetPageReferenced(page);
	}
}

EXPORT_SYMBOL(mark_page_accessed);
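
/*
 * Example of the state machine above: a page has to be referenced twice
 * while sitting on the inactive list before it gets promoted.
 *
 *	mark_page_accessed(page);	first touch: sets PG_referenced
 *	mark_page_accessed(page);	second touch: activate_page() moves
 *					the page to the active list and
 *					clears PG_referenced again
 *
 * so a page touched only once by a streaming read stays cheap to reclaim.
 */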

static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };

/**
 * lru_cache_add - add a page to the page lists
 * @page: the page to add
 */
void fastcall lru_cache_add(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);

	page_cache_get(page);
	if (!pagevec_add(pvec, page))
		__pagevec_lru_add(pvec);
	put_cpu_var(lru_add_pvecs);
}

void fastcall lru_cache_add_active(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);

	page_cache_get(page);
	if (!pagevec_add(pvec, page))
		__pagevec_lru_add_active(pvec);
	put_cpu_var(lru_add_active_pvecs);
}
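
/*
 * Both helpers above batch pages in a per-cpu pagevec and only take
 * zone->lru_lock once the vec fills up (PAGEVEC_SIZE entries, 14 at the
 * time of writing): one lock round-trip is amortised over a whole batch
 * of pages instead of being paid once per page.
 */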

static void __lru_add_drain(int cpu)
{
	struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);

	/*
	 * Either we are running on @cpu itself with preemption disabled
	 * (lru_add_drain()), or @cpu is already dead (CPU hotplug callback),
	 * so nobody else can touch these pagevecs and no locking is needed.
	 */
	if (pagevec_count(pvec))
		__pagevec_lru_add(pvec);
	pvec = &per_cpu(lru_add_active_pvecs, cpu);
	if (pagevec_count(pvec))
		__pagevec_lru_add_active(pvec);
}

void lru_add_drain(void)
{
	__lru_add_drain(get_cpu());
	put_cpu();
}

#ifdef CONFIG_NUMA
static void lru_add_drain_per_cpu(void *dummy)
{
	lru_add_drain();
}

/*
 * Returns 0 for success
 */
int lru_add_drain_all(void)
{
	return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
}

#else

/*
 * Returns 0 for success
 */
int lru_add_drain_all(void)
{
	lru_add_drain();
	return 0;
}
#endif
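
/*
 * lru_add_drain_all() is for callers that need every page to be visible
 * on its real LRU list before scanning it (page migration is a typical
 * example); pages still parked in another CPU's pagevec cannot be found
 * by the LRU walkers until they are drained.
 */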

/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 */
void fastcall __page_cache_release(struct page *page)
{
	if (PageLRU(page)) {
		unsigned long flags;
		struct zone *zone = page_zone(page);

		spin_lock_irqsave(&zone->lru_lock, flags);
		BUG_ON(!PageLRU(page));
		__ClearPageLRU(page);
		del_page_from_lru(zone, page);
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	}
	free_hot_page(page);
}
EXPORT_SYMBOL(__page_cache_release);

/*
 * Batched page_cache_release().  Decrement the reference count on all the
 * passed pages.  If it fell to zero then remove the page from the LRU and
 * free it.
 *
 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
 * for the remainder of the operation.
 *
 * The locking in this function is against shrink_cache(): we recheck the
 * page count inside the lock to see whether shrink_cache grabbed the page
 * via the LRU.  If it did, give up: shrink_cache will free it.
 */
void release_pages(struct page **pages, int nr, int cold)
{
	int i;
	struct pagevec pages_to_free;
	struct zone *zone = NULL;

	pagevec_init(&pages_to_free, cold);
	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];

		if (unlikely(PageCompound(page))) {
			if (zone) {
				spin_unlock_irq(&zone->lru_lock);
				zone = NULL;
			}
			put_compound_page(page);
			continue;
		}

		if (!put_page_testzero(page))
			continue;

		if (PageLRU(page)) {
			struct zone *pagezone = page_zone(page);
			if (pagezone != zone) {
				if (zone)
					spin_unlock_irq(&zone->lru_lock);
				zone = pagezone;
				spin_lock_irq(&zone->lru_lock);
			}
			BUG_ON(!PageLRU(page));
			__ClearPageLRU(page);
			del_page_from_lru(zone, page);
		}

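		/*
		 * pagevec_add() returns the number of free slots left after
		 * adding, so zero means the vec is now full: drop the
		 * lru_lock and free this whole batch in one go.
		 */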
		if (!pagevec_add(&pages_to_free, page)) {
			if (zone) {
				spin_unlock_irq(&zone->lru_lock);
				zone = NULL;
			}
			__pagevec_free(&pages_to_free);
			pagevec_reinit(&pages_to_free);
		}
	}
	if (zone)
		spin_unlock_irq(&zone->lru_lock);

	pagevec_free(&pages_to_free);
}

/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
	lru_add_drain();
	release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
	pagevec_reinit(pvec);
}

EXPORT_SYMBOL(__pagevec_release);
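
/*
 * Most callers are expected to use the pagevec_release() wrapper from
 * <linux/pagevec.h>, which boils down to:
 *
 *	if (pagevec_count(&pvec))
 *		__pagevec_release(&pvec);
 */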

/*
 * pagevec_release() for pages which are known to not be on the LRU
 *
 * This function reinitialises the caller's pagevec.
 */
void __pagevec_release_nonlru(struct pagevec *pvec)
{
	int i;
	struct pagevec pages_to_free;

	pagevec_init(&pages_to_free, pvec->cold);
	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];

		BUG_ON(PageLRU(page));
		if (put_page_testzero(page))
			pagevec_add(&pages_to_free, page);
	}
	pagevec_free(&pages_to_free);
	pagevec_reinit(pvec);
}

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 */
void __pagevec_lru_add(struct pagevec *pvec)
{
	int i;
	struct zone *zone = NULL;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irq(&zone->lru_lock);
			zone = pagezone;
			spin_lock_irq(&zone->lru_lock);
		}
		BUG_ON(PageLRU(page));
		SetPageLRU(page);
		add_page_to_inactive_list(zone, page);
	}
	if (zone)
		spin_unlock_irq(&zone->lru_lock);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

EXPORT_SYMBOL(__pagevec_lru_add);

void __pagevec_lru_add_active(struct pagevec *pvec)
{
	int i;
	struct zone *zone = NULL;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irq(&zone->lru_lock);
			zone = pagezone;
			spin_lock_irq(&zone->lru_lock);
		}
		BUG_ON(PageLRU(page));
		SetPageLRU(page);
		BUG_ON(PageActive(page));
		SetPageActive(page);
		add_page_to_active_list(zone, page);
	}
	if (zone)
		spin_unlock_irq(&zone->lru_lock);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

/*
 * Try to drop buffers from the pages in a pagevec
 */
void pagevec_strip(struct pagevec *pvec)
{
	int i;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];

		if (PagePrivate(page) && !TestSetPageLocked(page)) {
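			/*
			 * TestSetPageLocked() is a trylock: skip any page we
			 * cannot lock without sleeping.  PagePrivate is then
			 * rechecked under the page lock because the buffers
			 * may already have been stripped by someone else.
			 */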
			if (PagePrivate(page))
				try_to_release_page(page, 0);
			unlock_page(page);
		}
	}
}

/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec:	Where the resulting pages are placed
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @nr_pages:	The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t start, unsigned nr_pages)
{
	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
	return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup);

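/*
 * Example (illustrative, not from this file): pagevec_lookup() is usually
 * consumed in a scan loop in the style of the truncate/writeback code;
 * "mapping" and the per-page work are placeholders:
 *
 *	struct pagevec pvec;
 *	pgoff_t next = 0;
 *	int i;
 *
 *	pagevec_init(&pvec, 0);
 *	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
 *		for (i = 0; i < pagevec_count(&pvec); i++) {
 *			struct page *page = pvec.pages[i];
 *
 *			next = page->index + 1;
 *			... inspect or lock the page here ...
 *		}
 *		pagevec_release(&pvec);
 *	}
 *
 * pagevec_lookup_tag() below is the tagged variant: it returns only pages
 * carrying the given radix-tree tag (e.g. PAGECACHE_TAG_DIRTY) and, when
 * it finds any, advances *index past the last page returned.
 */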
unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t *index, int tag, unsigned nr_pages)
{
	pvec->nr = find_get_pages_tag(mapping, index, tag,
					nr_pages, pvec->pages);
	return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup_tag);

#ifdef CONFIG_SMP
/*
 * We tolerate a little inaccuracy to avoid ping-ponging the counter between
 * CPUs
 */
#define ACCT_THRESHOLD	max(16, NR_CPUS * 2)

static DEFINE_PER_CPU(long, committed_space) = 0;

void vm_acct_memory(long pages)
{
	long *local;

	preempt_disable();
	local = &__get_cpu_var(committed_space);
	*local += pages;
	if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
		atomic_add(*local, &vm_committed_space);
		*local = 0;
	}
	preempt_enable();
}
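
/*
 * With NR_CPUS = 4, for example, ACCT_THRESHOLD is max(16, 8) = 16, so up
 * to 16 pages of commit charge may sit in each CPU's local counter before
 * being folded into the global vm_committed_space.  Uncharging is assumed
 * to go through vm_unacct_memory(pages) in <linux/swap.h>, a thin wrapper
 * around vm_acct_memory(-pages).
 */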

#ifdef CONFIG_HOTPLUG_CPU

/* Drop the CPU's cached committed space back into the central pool. */
static int cpu_swap_callback(struct notifier_block *nfb,
			     unsigned long action,
			     void *hcpu)
{
	long *committed;

	committed = &per_cpu(committed_space, (long)hcpu);
	if (action == CPU_DEAD) {
		atomic_add(*committed, &vm_committed_space);
		*committed = 0;
		__lru_add_drain((long)hcpu);
	}
	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
#endif /* CONFIG_SMP */

/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
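	/*
	 * num_physpages is in pages, so shifting right by (20 - PAGE_SHIFT)
	 * converts it to megabytes: with 4K pages that is a shift by 8,
	 * i.e. 256 pages per megabyte.  page_cluster is likewise a shift
	 * count; swap readahead brings in up to 1 << page_cluster pages, so
	 * the values below mean 4-page clusters on small machines and
	 * 8-page clusters otherwise.
	 */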
	unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);

	/* Use a smaller cluster for small-memory machines */
	if (megs < 16)
		page_cluster = 2;
	else
		page_cluster = 3;
	/*
	 * Right now other parts of the system mean that we
	 * _really_ don't want to cluster much more
	 */
	hotcpu_notifier(cpu_swap_callback, 0);
}