xref: /linux/mm/vmstat.c (revision f24e9f586b377749dff37554696cf3a105540c94)
/*
 *  linux/mm/vmstat.c
 *
 *  Manages VM statistics
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  zoned VM statistics
 *  Copyright (C) 2006 Silicon Graphics, Inc.,
 *		Christoph Lameter <christoph@lameter.com>
 */

#include <linux/config.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/cpu.h>

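/*
 * Sum up the page counts of all zones in one node: the caller gets the
 * total number of active, inactive and free pages in @pgdat.
 */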
void __get_zone_counts(unsigned long *active, unsigned long *inactive,
			unsigned long *free, struct pglist_data *pgdat)
{
	struct zone *zones = pgdat->node_zones;
	int i;

	*active = 0;
	*inactive = 0;
	*free = 0;
	for (i = 0; i < MAX_NR_ZONES; i++) {
		*active += zones[i].nr_active;
		*inactive += zones[i].nr_inactive;
		*free += zones[i].free_pages;
	}
}

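/*
 * System-wide totals: add up the per-node counts of every online node.
 */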
void get_zone_counts(unsigned long *active,
		unsigned long *inactive, unsigned long *free)
{
	struct pglist_data *pgdat;

	*active = 0;
	*inactive = 0;
	*free = 0;
	for_each_online_pgdat(pgdat) {
		unsigned long l, m, n;
		__get_zone_counts(&l, &m, &n, pgdat);
		*active += l;
		*inactive += m;
		*free += n;
	}
}

#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);

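/*
 * Sum the per-cpu vm event counters of every cpu in @cpumask into @ret.
 * While one cpu's counters are being added, the next cpu's data is
 * prefetched to hide part of the cache miss latency.
 */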
static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
{
	int cpu = 0;
	int i;

	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));

	cpu = first_cpu(*cpumask);
	while (cpu < NR_CPUS) {
		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);

		cpu = next_cpu(cpu, *cpumask);

		if (cpu < NR_CPUS)
			prefetch(&per_cpu(vm_event_states, cpu));

		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
			ret[i] += this->event[i];
	}
}

/*
 * Accumulate the vm event counters across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
void all_vm_events(unsigned long *ret)
{
	sum_vm_events(ret, &cpu_online_map);
}
EXPORT_SYMBOL_GPL(all_vm_events);

#ifdef CONFIG_HOTPLUG
/*
 * Fold the foreign cpu events into our own.
 *
 * This is adding to the events on one processor
 * but keeps the global counts constant.
 */
void vm_events_fold_cpu(int cpu)
{
	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
	int i;

	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
		count_vm_events(i, fold_state->event[i]);
		fold_state->event[i] = 0;
	}
}
#endif /* CONFIG_HOTPLUG */

#endif /* CONFIG_VM_EVENT_COUNTERS */

/*
 * Manage combined zone-based / global counters
 *
 * vm_stat contains the global counters
 */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
EXPORT_SYMBOL(vm_stat);

#ifdef CONFIG_SMP

static int calculate_threshold(struct zone *zone)
{
	int threshold;
	int mem;	/* memory in 128 MB units */

	/*
	 * The threshold scales with the number of processors and the amount
	 * of memory per zone. More memory means that we can defer updates for
	 * longer; more processors could lead to more contention.
	 * fls() is used as a cheap way of getting logarithmic scaling.
	 *
	 * Some sample thresholds:
	 *
	 * Threshold	Processors	(fls)	Zonesize	fls(mem+1)
	 * ------------------------------------------------------------------
	 * 8		1		1	0.9-1 GB	4
	 * 16		2		2	0.9-1 GB	4
	 * 20		2		2	1-2 GB		5
	 * 24		2		2	2-4 GB		6
	 * 28		2		2	4-8 GB		7
	 * 32		2		2	8-16 GB		8
	 * 4		2		2	<128M		1
	 * 30		4		3	2-4 GB		5
	 * 48		4		3	8-16 GB		8
	 * 32		8		4	1-2 GB		4
	 * 32		8		4	0.9-1GB		4
	 * 10		16		5	<128M		1
	 * 40		16		5	900M		4
	 * 70		64		7	2-4 GB		5
	 * 84		64		7	4-8 GB		6
	 * 108		512		9	4-8 GB		6
	 * 125		1024		10	8-16 GB		8
	 * 125		1024		10	16-32 GB	9
	 */

	mem = zone->present_pages >> (27 - PAGE_SHIFT);

	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}
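
/*
 * Worked example for the formula above (assuming 4 KB pages): a 1.5 GB
 * zone has roughly 393216 pages, so mem = 393216 >> 15 = 12 and
 * fls(12) = 4. With two online cpus, fls(2) = 2, so the threshold is
 * 2 * 2 * (1 + 4) = 20, matching the 1-2 GB / 2 processor row in the
 * sample table.
 */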

/*
 * Refresh the thresholds for each zone.
 */
static void refresh_zone_stat_thresholds(void)
{
	struct zone *zone;
	int cpu;
	int threshold;

	for_each_zone(zone) {

		if (!zone->present_pages)
			continue;

		threshold = calculate_threshold(zone);

		for_each_online_cpu(cpu)
			zone_pcp(zone, cpu)->stat_threshold = threshold;
	}
}

/*
 * For use when we know that interrupts are disabled.
 */
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
				int delta)
{
	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
	s8 *p = pcp->vm_stat_diff + item;
	long x;

	x = delta + *p;

	if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
		zone_page_state_add(x, zone, item);
		x = 0;
	}
	*p = x;
}
EXPORT_SYMBOL(__mod_zone_page_state);
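
/*
 * Illustrative call only (not a user in this file): code that already runs
 * with interrupts disabled could account a newly dirtied page with
 * something like
 *
 *	__mod_zone_page_state(page_zone(page), NR_FILE_DIRTY, 1);
 *
 * where NR_FILE_DIRTY stands in for any enum zone_stat_item value.
 */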

/*
 * For use when the interrupt state is unknown.
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
					int delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_zone_page_state(zone, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);

/*
 * Optimized increment and decrement functions.
 *
 * These are only for a single page and therefore can take a struct page *
 * argument instead of struct zone *. This allows the inclusion of the code
 * generated for page_zone(page) into the optimized functions.
 *
 * No overflow check is necessary and therefore the differential can be
 * incremented or decremented in place, which may allow the compiler to
 * generate better code.
 * The increment or decrement is known and therefore one boundary check can
 * be omitted.
 *
 * NOTE: These functions are very performance sensitive. Change only
 * with care.
 *
 * Some processors have inc/dec instructions that are atomic with respect to
 * an interrupt. However, the code must first determine the differential
 * location in a zone based on the processor number and then inc/dec the
 * counter. There is no guarantee without disabling preemption that the
 * processor will not change in between, so that atomicity cannot be
 * exploited in a useful way here.
 */
static void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
	s8 *p = pcp->vm_stat_diff + item;

	(*p)++;

	if (unlikely(*p > pcp->stat_threshold)) {
		/*
		 * Fold the local differential plus half a threshold into the
		 * global counter and bias the local counter the other way,
		 * so that it takes a while before the threshold is hit again.
		 * The sum of global and local counts is preserved.
		 */
		int overstep = pcp->stat_threshold / 2;

		zone_page_state_add(*p + overstep, zone, item);
		*p = -overstep;
	}
}

void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);

void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	struct zone *zone = page_zone(page);
	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
	s8 *p = pcp->vm_stat_diff + item;

	(*p)--;

	if (unlikely(*p < -pcp->stat_threshold)) {
		int overstep = pcp->stat_threshold / 2;

		zone_page_state_add(*p - overstep, zone, item);
		*p = overstep;
	}
}
EXPORT_SYMBOL(__dec_zone_page_state);
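
/*
 * Illustrative use of the optimized variants: mapped-file accounting, for
 * example, is a paired __inc_zone_page_state(page, NR_FILE_MAPPED) when a
 * page is mapped and __dec_zone_page_state(page, NR_FILE_MAPPED) when it
 * is unmapped. Callers must ensure the per-cpu access cannot race, e.g. by
 * having interrupts disabled or by only updating the item from
 * non-interrupt context (see the comment block above).
 */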

void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;
	struct zone *zone;

	zone = page_zone(page);
	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__dec_zone_page_state(page, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);

/*
 * Update the zone counters for one cpu.
 */
void refresh_cpu_vm_stats(int cpu)
{
	struct zone *zone;
	int i;
	unsigned long flags;

	for_each_zone(zone) {
		struct per_cpu_pageset *pcp;

		pcp = zone_pcp(zone, cpu);

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
			if (pcp->vm_stat_diff[i]) {
				local_irq_save(flags);
				zone_page_state_add(pcp->vm_stat_diff[i],
					zone, i);
				pcp->vm_stat_diff[i] = 0;
				local_irq_restore(flags);
			}
	}
}

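/*
 * Callback for on_each_cpu(): fold the local cpu's differentials into the
 * global counters.
 */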
static void __refresh_cpu_vm_stats(void *dummy)
{
	refresh_cpu_vm_stats(smp_processor_id());
}

/*
 * Consolidate all counters.
 *
 * Note that folding the per-cpu differentials makes the global counters
 * more accurate, but the result is still approximate if other processes
 * are allowed to run concurrently.
 */
void refresh_vm_stats(void)
{
	on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1);
}
EXPORT_SYMBOL(refresh_vm_stats);

#endif /* CONFIG_SMP */

#ifdef CONFIG_NUMA
/*
 * zonelist = the list of zones passed to the allocator
 * z	    = the zone from which the allocation occurred.
 *
 * Must be called with interrupts disabled.
 */
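/*
 * Counter semantics: NUMA_HIT is counted when the allocation was satisfied
 * from the preferred (first) zone's node. Otherwise NUMA_MISS is charged to
 * the zone that satisfied the allocation and NUMA_FOREIGN to the preferred
 * zone. NUMA_LOCAL/NUMA_OTHER record whether the page ended up on the node
 * of the cpu that is currently allocating.
 */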
void zone_statistics(struct zonelist *zonelist, struct zone *z)
{
	if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) {
		__inc_zone_state(z, NUMA_HIT);
	} else {
		__inc_zone_state(z, NUMA_MISS);
		__inc_zone_state(zonelist->zones[0], NUMA_FOREIGN);
	}
	if (z->zone_pgdat == NODE_DATA(numa_node_id()))
		__inc_zone_state(z, NUMA_LOCAL);
	else
		__inc_zone_state(z, NUMA_OTHER);
}
#endif

#ifdef CONFIG_PROC_FS

#include <linux/seq_file.h>

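/*
 * seq_file iterators shared by the fragmentation and zoneinfo files:
 * frag_start() positions the walk at the *pos-th online node and
 * frag_next() advances to the following one.
 */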
static void *frag_start(struct seq_file *m, loff_t *pos)
{
	pg_data_t *pgdat;
	loff_t node = *pos;
	for (pgdat = first_online_pgdat();
	     pgdat && node;
	     pgdat = next_online_pgdat(pgdat))
		--node;

	return pgdat;
}

static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	(*pos)++;
	return next_online_pgdat(pgdat);
}

static void frag_stop(struct seq_file *m, void *arg)
{
}

/*
 * This walks the free areas for each zone.
 */
static int frag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;
	int order;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
		for (order = 0; order < MAX_ORDER; ++order)
			seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
		spin_unlock_irqrestore(&zone->lock, flags);
		seq_putc(m, '\n');
	}
	return 0;
}

struct seq_operations fragmentation_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= frag_show,
};

static char *vmstat_text[] = {
	/* Zoned VM counters */
	"nr_anon_pages",
	"nr_mapped",
	"nr_file_pages",
	"nr_slab",
	"nr_page_table_pages",
	"nr_dirty",
	"nr_writeback",
	"nr_unstable",
	"nr_bounce",

#ifdef CONFIG_NUMA
	"numa_hit",
	"numa_miss",
	"numa_foreign",
	"numa_interleave",
	"numa_local",
	"numa_other",
#endif

#ifdef CONFIG_VM_EVENT_COUNTERS
	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",

	"pgalloc_dma",
	"pgalloc_dma32",
	"pgalloc_normal",
	"pgalloc_high",

	"pgfree",
	"pgactivate",
	"pgdeactivate",

	"pgfault",
	"pgmajfault",

	"pgrefill_dma",
	"pgrefill_dma32",
	"pgrefill_normal",
	"pgrefill_high",

	"pgsteal_dma",
	"pgsteal_dma32",
	"pgsteal_normal",
	"pgsteal_high",

	"pgscan_kswapd_dma",
	"pgscan_kswapd_dma32",
	"pgscan_kswapd_normal",
	"pgscan_kswapd_high",

	"pgscan_direct_dma",
	"pgscan_direct_dma32",
	"pgscan_direct_normal",
	"pgscan_direct_high",

	"pginodesteal",
	"slabs_scanned",
	"kswapd_steal",
	"kswapd_inodesteal",
	"pageoutrun",
	"allocstall",

	"pgrotated",
#endif
};

/*
 * Output information about zones in @pgdat.
 */
static int zoneinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = arg;
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
		int i;

		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
		seq_printf(m,
			   "\n  pages free     %lu"
			   "\n        min      %lu"
			   "\n        low      %lu"
			   "\n        high     %lu"
			   "\n        active   %lu"
			   "\n        inactive %lu"
			   "\n        scanned  %lu (a: %lu i: %lu)"
			   "\n        spanned  %lu"
			   "\n        present  %lu",
			   zone->free_pages,
			   zone->pages_min,
			   zone->pages_low,
			   zone->pages_high,
			   zone->nr_active,
			   zone->nr_inactive,
			   zone->pages_scanned,
			   zone->nr_scan_active, zone->nr_scan_inactive,
			   zone->spanned_pages,
			   zone->present_pages);

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
			seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
					zone_page_state(zone, i));

		seq_printf(m,
			   "\n        protection: (%lu",
			   zone->lowmem_reserve[0]);
		for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
			seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
		seq_printf(m,
			   ")"
			   "\n  pagesets");
		for_each_online_cpu(i) {
			struct per_cpu_pageset *pageset;
			int j;

			pageset = zone_pcp(zone, i);
			for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
				if (pageset->pcp[j].count)
					break;
			}
			if (j == ARRAY_SIZE(pageset->pcp))
				continue;
			for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
				seq_printf(m,
					   "\n    cpu: %i pcp: %i"
					   "\n              count: %i"
					   "\n              high:  %i"
					   "\n              batch: %i",
					   i, j,
					   pageset->pcp[j].count,
					   pageset->pcp[j].high,
					   pageset->pcp[j].batch);
			}
#ifdef CONFIG_SMP
			seq_printf(m, "\n  vm stats threshold: %d",
					pageset->stat_threshold);
#endif
		}
		seq_printf(m,
			   "\n  all_unreclaimable: %u"
			   "\n  prev_priority:     %i"
			   "\n  temp_priority:     %i"
			   "\n  start_pfn:         %lu",
			   zone->all_unreclaimable,
			   zone->prev_priority,
			   zone->temp_priority,
			   zone->zone_start_pfn);
		spin_unlock_irqrestore(&zone->lock, flags);
		seq_putc(m, '\n');
	}
	return 0;
}

struct seq_operations zoneinfo_op = {
	.start	= frag_start, /* walk all nodes, using the same iterators
			       * as fragmentation_op */
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= zoneinfo_show,
};

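/*
 * seq_file start for the vmstat output: snapshot all global zone counters
 * (and, if configured, the vm event counters) into a single array so that
 * every line of one read comes from the same snapshot.
 */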
static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
	unsigned long *v;
#ifdef CONFIG_VM_EVENT_COUNTERS
	unsigned long *e;
#endif
	int i;

	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;

#ifdef CONFIG_VM_EVENT_COUNTERS
	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
			+ sizeof(struct vm_event_state), GFP_KERNEL);
#else
	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
			GFP_KERNEL);
#endif
	m->private = v;
	if (!v)
		return ERR_PTR(-ENOMEM);
	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		v[i] = global_page_state(i);
#ifdef CONFIG_VM_EVENT_COUNTERS
	e = v + NR_VM_ZONE_STAT_ITEMS;
	all_vm_events(e);
	e[PGPGIN] /= 2;		/* sectors -> kbytes */
	e[PGPGOUT] /= 2;
#endif
	return v + *pos;
}

static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
	(*pos)++;
	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;
	return (unsigned long *)m->private + *pos;
}

static int vmstat_show(struct seq_file *m, void *arg)
{
	unsigned long *l = arg;
	unsigned long off = l - (unsigned long *)m->private;

	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
	return 0;
}

static void vmstat_stop(struct seq_file *m, void *arg)
{
	kfree(m->private);
	m->private = NULL;
}

struct seq_operations vmstat_op = {
	.start	= vmstat_start,
	.next	= vmstat_next,
	.stop	= vmstat_stop,
	.show	= vmstat_show,
};

#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_SMP
/*
 * Use the cpu notifier to ensure that the thresholds are recalculated
 * when necessary.
 */
static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
		unsigned long action,
		void *hcpu)
{
	switch (action) {
		case CPU_UP_PREPARE:
		case CPU_UP_CANCELED:
		case CPU_DEAD:
			refresh_zone_stat_thresholds();
			break;
		default:
			break;
	}
	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata vmstat_notifier =
	{ &vmstat_cpuup_callback, NULL, 0 };

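/*
 * Compute the initial per-cpu thresholds at boot and register the notifier
 * so that they are kept up to date as cpus come and go.
 */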
int __init setup_vmstat(void)
{
	refresh_zone_stat_thresholds();
	register_cpu_notifier(&vmstat_notifier);
	return 0;
}
module_init(setup_vmstat)
#endif /* CONFIG_SMP */
705