xref: /linux/mm/vmstat.c (revision 9924003807a9738b3f5295174b6c623f5a85eb97)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   *  linux/mm/vmstat.c
4   *
5   *  Manages VM statistics
6   *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
7   *
8   *  zoned VM statistics
9   *  Copyright (C) 2006 Silicon Graphics, Inc.,
10   *		Christoph Lameter <christoph@lameter.com>
11   *  Copyright (C) 2008-2014 Christoph Lameter
12   */
13  #include <linux/fs.h>
14  #include <linux/mm.h>
15  #include <linux/err.h>
16  #include <linux/module.h>
17  #include <linux/slab.h>
18  #include <linux/cpu.h>
19  #include <linux/cpumask.h>
20  #include <linux/vmstat.h>
21  #include <linux/proc_fs.h>
22  #include <linux/seq_file.h>
23  #include <linux/debugfs.h>
24  #include <linux/sched.h>
25  #include <linux/math64.h>
26  #include <linux/writeback.h>
27  #include <linux/compaction.h>
28  #include <linux/mm_inline.h>
29  #include <linux/page_owner.h>
30  #include <linux/sched/isolation.h>
31  
32  #include "internal.h"
33  
34  #ifdef CONFIG_NUMA
35  int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
36  
37  /* zero numa counters within a zone */
38  static void zero_zone_numa_counters(struct zone *zone)
39  {
40  	int item, cpu;
41  
42  	for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) {
43  		atomic_long_set(&zone->vm_numa_event[item], 0);
44  		for_each_online_cpu(cpu) {
45  			per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item]
46  						= 0;
47  		}
48  	}
49  }
50  
51  /* zero numa counters of all the populated zones */
52  static void zero_zones_numa_counters(void)
53  {
54  	struct zone *zone;
55  
56  	for_each_populated_zone(zone)
57  		zero_zone_numa_counters(zone);
58  }
59  
60  /* zero global numa counters */
61  static void zero_global_numa_counters(void)
62  {
63  	int item;
64  
65  	for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
66  		atomic_long_set(&vm_numa_event[item], 0);
67  }
68  
69  static void invalid_numa_statistics(void)
70  {
71  	zero_zones_numa_counters();
72  	zero_global_numa_counters();
73  }
74  
75  static DEFINE_MUTEX(vm_numa_stat_lock);
76  
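/*
 * Handler for the vm.numa_stat sysctl: writing 0 disables collection of the
 * per-zone NUMA event counters and clears them, writing 1 re-enables
 * collection. The static branch keeps the disabled case cheap.
 */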
77  int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
78  		void *buffer, size_t *length, loff_t *ppos)
79  {
80  	int ret, oldval;
81  
82  	mutex_lock(&vm_numa_stat_lock);
83  	if (write)
84  		oldval = sysctl_vm_numa_stat;
85  	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
86  	if (ret || !write)
87  		goto out;
88  
89  	if (oldval == sysctl_vm_numa_stat)
90  		goto out;
91  	else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
92  		static_branch_enable(&vm_numa_stat_key);
93  		pr_info("enable numa statistics\n");
94  	} else {
95  		static_branch_disable(&vm_numa_stat_key);
96  		invalid_numa_statistics();
97  		pr_info("disable numa statistics, and clear numa counters\n");
98  	}
99  
100  out:
101  	mutex_unlock(&vm_numa_stat_lock);
102  	return ret;
103  }
104  #endif
105  
106  #ifdef CONFIG_VM_EVENT_COUNTERS
107  DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
108  EXPORT_PER_CPU_SYMBOL(vm_event_states);
109  
110  static void sum_vm_events(unsigned long *ret)
111  {
112  	int cpu;
113  	int i;
114  
115  	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
116  
117  	for_each_online_cpu(cpu) {
118  		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
119  
120  		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
121  			ret[i] += this->event[i];
122  	}
123  }
124  
125  /*
126   * Accumulate the vm event counters across all CPUs.
127   * The result is unavoidably approximate - it can change
128   * during and after execution of this function.
129   */
130  void all_vm_events(unsigned long *ret)
131  {
132  	cpus_read_lock();
133  	sum_vm_events(ret);
134  	cpus_read_unlock();
135  }
136  EXPORT_SYMBOL_GPL(all_vm_events);
137  
138  /*
139   * Fold the foreign cpu events into our own.
140   *
141   * This is adding to the events on one processor
142   * but keeps the global counts constant.
143   */
144  void vm_events_fold_cpu(int cpu)
145  {
146  	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
147  	int i;
148  
149  	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
150  		count_vm_events(i, fold_state->event[i]);
151  		fold_state->event[i] = 0;
152  	}
153  }
154  
155  #endif /* CONFIG_VM_EVENT_COUNTERS */
156  
157  /*
158   * Manage combined zone based / global counters
159   *
160   * vm_stat contains the global counters
161   */
162  atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
163  atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
164  atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
165  EXPORT_SYMBOL(vm_zone_stat);
166  EXPORT_SYMBOL(vm_node_stat);
167  
168  #ifdef CONFIG_NUMA
169  static void fold_vm_zone_numa_events(struct zone *zone)
170  {
171  	unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
172  	int cpu;
173  	enum numa_stat_item item;
174  
175  	for_each_online_cpu(cpu) {
176  		struct per_cpu_zonestat *pzstats;
177  
178  		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
179  		for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
180  			zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
181  	}
182  
183  	for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
184  		zone_numa_event_add(zone_numa_events[item], zone, item);
185  }
186  
187  void fold_vm_numa_events(void)
188  {
189  	struct zone *zone;
190  
191  	for_each_populated_zone(zone)
192  		fold_vm_zone_numa_events(zone);
193  }
194  #endif
195  
196  #ifdef CONFIG_SMP
197  
198  int calculate_pressure_threshold(struct zone *zone)
199  {
200  	int threshold;
201  	int watermark_distance;
202  
203  	/*
204  	 * As vmstats are not up to date, there is drift between the estimated
205  	 * and real values. For high thresholds and a high number of CPUs, it
206  	 * is possible for the min watermark to be breached while the estimated
207  	 * value looks fine. The pressure threshold is a reduced value such
208  	 * that even the maximum amount of drift will not accidentally breach
209  	 * the min watermark
210  	 */
211  	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
212  	threshold = max(1, (int)(watermark_distance / num_online_cpus()));
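	/*
	 * For example, a 512-page gap between the low and min watermarks on
	 * an 8-CPU system gives max(1, 512 / 8) = 64 pages of allowed
	 * per-cpu drift (before the 125 cap below).
	 */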
213  
214  	/*
215  	 * Maximum threshold is 125
216  	 */
217  	threshold = min(125, threshold);
218  
219  	return threshold;
220  }
221  
222  int calculate_normal_threshold(struct zone *zone)
223  {
224  	int threshold;
225  	int mem;	/* memory in 128 MB units */
226  
227  	/*
228  	 * The threshold scales with the number of processors and the amount
229  	 * of memory per zone. More memory means that we can defer updates for
230  	 * longer, more processors could lead to more contention.
231  	 * fls() is used to have a cheap way of logarithmic scaling.
232  	 *
233  	 * Some sample thresholds:
234  	 *
235  	 * Threshold	Processors	(fls)	Zonesize	fls(mem)+1
236  	 * ------------------------------------------------------------------
237  	 * 8		1		1	0.9-1 GB	4
238  	 * 16		2		2	0.9-1 GB	4
239  	 * 20 		2		2	1-2 GB		5
240  	 * 24		2		2	2-4 GB		6
241  	 * 28		2		2	4-8 GB		7
242  	 * 32		2		2	8-16 GB		8
243  	 * 4		2		2	<128M		1
244  	 * 30		4		3	2-4 GB		5
245  	 * 48		4		3	8-16 GB		8
246  	 * 32		8		4	1-2 GB		4
247  	 * 32		8		4	0.9-1GB		4
248  	 * 10		16		5	<128M		1
249  	 * 40		16		5	900M		4
250  	 * 70		64		7	2-4 GB		5
251  	 * 84		64		7	4-8 GB		6
252  	 * 108		512		9	4-8 GB		6
253  	 * 125		1024		10	8-16 GB		8
254  	 * 125		1024		10	16-32 GB	9
255  	 */
256  
257  	mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);
258  
259  	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
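	/*
	 * Worked example matching the table above: 2 online CPUs (fls = 2)
	 * and a ~0.9 GB zone (mem = 7 units of 128 MB, fls(mem) + 1 = 4)
	 * give 2 * 2 * 4 = 16.
	 */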
260  
261  	/*
262  	 * Maximum threshold is 125
263  	 */
264  	threshold = min(125, threshold);
265  
266  	return threshold;
267  }
268  
269  /*
270   * Refresh the thresholds for each zone.
271   */
272  void refresh_zone_stat_thresholds(void)
273  {
274  	struct pglist_data *pgdat;
275  	struct zone *zone;
276  	int cpu;
277  	int threshold;
278  
279  	/* Zero current pgdat thresholds */
280  	for_each_online_pgdat(pgdat) {
281  		for_each_online_cpu(cpu) {
282  			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
283  		}
284  	}
285  
286  	for_each_populated_zone(zone) {
287  		struct pglist_data *pgdat = zone->zone_pgdat;
288  		unsigned long max_drift, tolerate_drift;
289  
290  		threshold = calculate_normal_threshold(zone);
291  
292  		for_each_online_cpu(cpu) {
293  			int pgdat_threshold;
294  
295  			per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
296  							= threshold;
297  
298  			/* Base nodestat threshold on the largest populated zone. */
299  			pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
300  			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
301  				= max(threshold, pgdat_threshold);
302  		}
303  
304  		/*
305  		 * Only set percpu_drift_mark if there is a danger that
306  		 * NR_FREE_PAGES reports the low watermark is ok when in fact
307  		 * the min watermark could be breached by an allocation
308  		 */
309  		tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
310  		max_drift = num_online_cpus() * threshold;
311  		if (max_drift > tolerate_drift)
312  			zone->percpu_drift_mark = high_wmark_pages(zone) +
313  					max_drift;
314  	}
315  }
316  
317  void set_pgdat_percpu_threshold(pg_data_t *pgdat,
318  				int (*calculate_pressure)(struct zone *))
319  {
320  	struct zone *zone;
321  	int cpu;
322  	int threshold;
323  	int i;
324  
325  	for (i = 0; i < pgdat->nr_zones; i++) {
326  		zone = &pgdat->node_zones[i];
327  		if (!zone->percpu_drift_mark)
328  			continue;
329  
330  		threshold = (*calculate_pressure)(zone);
331  		for_each_online_cpu(cpu)
332  			per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
333  							= threshold;
334  	}
335  }
336  
337  /*
338   * For use when we know that interrupts are disabled,
339   * or when we know that preemption is disabled and that
340   * particular counter cannot be updated from interrupt context.
341   */
342  void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
343  			   long delta)
344  {
345  	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
346  	s8 __percpu *p = pcp->vm_stat_diff + item;
347  	long x;
348  	long t;
349  
350  	/*
351  	 * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels,
352  	 * atomicity is provided by IRQs being disabled -- either explicitly
353  	 * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables
354  	 * CPU migrations and preemption potentially corrupts a counter so
355  	 * disable preemption.
356  	 */
357  	preempt_disable_nested();
358  
359  	x = delta + __this_cpu_read(*p);
360  
361  	t = __this_cpu_read(pcp->stat_threshold);
362  
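	/*
	 * Once the accumulated per-cpu delta exceeds the stat threshold,
	 * fold it into the zone-wide counter and reset the per-cpu diff;
	 * smaller updates stay purely per-cpu and cheap.
	 */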
363  	if (unlikely(abs(x) > t)) {
364  		zone_page_state_add(x, zone, item);
365  		x = 0;
366  	}
367  	__this_cpu_write(*p, x);
368  
369  	preempt_enable_nested();
370  }
371  EXPORT_SYMBOL(__mod_zone_page_state);
372  
373  void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
374  				long delta)
375  {
376  	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
377  	s8 __percpu *p = pcp->vm_node_stat_diff + item;
378  	long x;
379  	long t;
380  
381  	if (vmstat_item_in_bytes(item)) {
382  		/*
383  		 * Only cgroups use subpage accounting right now; at
384  		 * the global level, these items still change in
385  		 * multiples of whole pages. Store them as pages
386  		 * internally to keep the per-cpu counters compact.
387  		 */
388  		VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
389  		delta >>= PAGE_SHIFT;
390  	}
391  
392  	/* See __mod_zone_page_state() */
393  	preempt_disable_nested();
394  
395  	x = delta + __this_cpu_read(*p);
396  
397  	t = __this_cpu_read(pcp->stat_threshold);
398  
399  	if (unlikely(abs(x) > t)) {
400  		node_page_state_add(x, pgdat, item);
401  		x = 0;
402  	}
403  	__this_cpu_write(*p, x);
404  
405  	preempt_enable_nested();
406  }
407  EXPORT_SYMBOL(__mod_node_page_state);
408  
409  /*
410   * Optimized increment and decrement functions.
411   *
412   * These are only for a single page and therefore can take a struct page *
413   * argument instead of struct zone *. This allows the inclusion of the code
414   * generated for page_zone(page) into the optimized functions.
415   *
416   * No overflow check is necessary and therefore the differential can be
417   * incremented or decremented in place which may allow the compilers to
418   * generate better code.
419   * The increment or decrement is known and therefore one boundary check can
420   * be omitted.
421   *
422   * NOTE: These functions are very performance sensitive. Change only
423   * with care.
424   *
425   * Some processors have inc/dec instructions that are atomic vs an interrupt.
426   * However, the code must first determine the differential location in a zone
427   * based on the processor number and then inc/dec the counter. There is no
428   * guarantee without disabling preemption that the processor will not change
429   * in between and therefore the atomicity vs. interrupt cannot be exploited
430   * in a useful way here.
431   */
432  void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
433  {
434  	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
435  	s8 __percpu *p = pcp->vm_stat_diff + item;
436  	s8 v, t;
437  
438  	/* See __mod_node_page_state */
439  	preempt_disable_nested();
440  
441  	v = __this_cpu_inc_return(*p);
442  	t = __this_cpu_read(pcp->stat_threshold);
443  	if (unlikely(v > t)) {
444  		s8 overstep = t >> 1;
445  
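		/*
		 * Fold the whole count plus half a threshold into the zone
		 * counter and bias the per-cpu counter to -overstep, which
		 * spaces out subsequent folds for counters that keep moving
		 * in the same direction.
		 */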
446  		zone_page_state_add(v + overstep, zone, item);
447  		__this_cpu_write(*p, -overstep);
448  	}
449  
450  	preempt_enable_nested();
451  }
452  
453  void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
454  {
455  	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
456  	s8 __percpu *p = pcp->vm_node_stat_diff + item;
457  	s8 v, t;
458  
459  	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
460  
461  	/* See __mod_node_page_state */
462  	preempt_disable_nested();
463  
464  	v = __this_cpu_inc_return(*p);
465  	t = __this_cpu_read(pcp->stat_threshold);
466  	if (unlikely(v > t)) {
467  		s8 overstep = t >> 1;
468  
469  		node_page_state_add(v + overstep, pgdat, item);
470  		__this_cpu_write(*p, -overstep);
471  	}
472  
473  	preempt_enable_nested();
474  }
475  
476  void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
477  {
478  	__inc_zone_state(page_zone(page), item);
479  }
480  EXPORT_SYMBOL(__inc_zone_page_state);
481  
482  void __inc_node_page_state(struct page *page, enum node_stat_item item)
483  {
484  	__inc_node_state(page_pgdat(page), item);
485  }
486  EXPORT_SYMBOL(__inc_node_page_state);
487  
488  void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
489  {
490  	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
491  	s8 __percpu *p = pcp->vm_stat_diff + item;
492  	s8 v, t;
493  
494  	/* See __mod_node_page_state */
495  	preempt_disable_nested();
496  
497  	v = __this_cpu_dec_return(*p);
498  	t = __this_cpu_read(pcp->stat_threshold);
499  	if (unlikely(v < -t)) {
500  		s8 overstep = t >> 1;
501  
502  		zone_page_state_add(v - overstep, zone, item);
503  		__this_cpu_write(*p, overstep);
504  	}
505  
506  	preempt_enable_nested();
507  }
508  
509  void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
510  {
511  	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
512  	s8 __percpu *p = pcp->vm_node_stat_diff + item;
513  	s8 v, t;
514  
515  	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
516  
517  	/* See __mod_node_page_state */
518  	preempt_disable_nested();
519  
520  	v = __this_cpu_dec_return(*p);
521  	t = __this_cpu_read(pcp->stat_threshold);
522  	if (unlikely(v < -t)) {
523  		s8 overstep = t >> 1;
524  
525  		node_page_state_add(v - overstep, pgdat, item);
526  		__this_cpu_write(*p, overstep);
527  	}
528  
529  	preempt_enable_nested();
530  }
531  
532  void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
533  {
534  	__dec_zone_state(page_zone(page), item);
535  }
536  EXPORT_SYMBOL(__dec_zone_page_state);
537  
538  void __dec_node_page_state(struct page *page, enum node_stat_item item)
539  {
540  	__dec_node_state(page_pgdat(page), item);
541  }
542  EXPORT_SYMBOL(__dec_node_page_state);
543  
544  #ifdef CONFIG_HAVE_CMPXCHG_LOCAL
545  /*
546   * If we have cmpxchg_local support then we do not need to incur the overhead
547   * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
548   *
549   * mod_zone_state() and mod_node_state() modify the counter state through
550   * atomic per-cpu operations.
551   *
552   * Overstep mode specifies how overstep should be handled:
553   *     0       No overstepping
554   *     1       Overstepping half of threshold
555   *     -1      Overstepping minus half of threshold
556   */
557  static inline void mod_zone_state(struct zone *zone,
558         enum zone_stat_item item, long delta, int overstep_mode)
559  {
560  	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
561  	s8 __percpu *p = pcp->vm_stat_diff + item;
562  	long o, n, t, z;
563  
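	/*
	 * Lockless retry loop: this_cpu_cmpxchg() below only commits the new
	 * per-cpu diff if it still holds the value read here; otherwise a
	 * concurrent update slipped in and we recompute with the fresh value.
	 */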
564  	do {
565  		z = 0;  /* overflow to zone counters */
566  
567  		/*
568  		 * The fetching of the stat_threshold is racy. We may apply
569  		 * a counter threshold to the wrong cpu if we get
570  		 * rescheduled while executing here. However, the next
571  		 * counter update will apply the threshold again and
572  		 * therefore bring the counter under the threshold again.
573  		 *
574  		 * Most of the time the thresholds are the same anyway
575  		 * for all cpus in a zone.
576  		 */
577  		t = this_cpu_read(pcp->stat_threshold);
578  
579  		o = this_cpu_read(*p);
580  		n = delta + o;
581  
582  		if (abs(n) > t) {
583  			int os = overstep_mode * (t >> 1);
584  
585  			/* Overflow must be added to zone counters */
586  			z = n + os;
587  			n = -os;
588  		}
589  	} while (this_cpu_cmpxchg(*p, o, n) != o);
590  
591  	if (z)
592  		zone_page_state_add(z, zone, item);
593  }
594  
595  void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
596  			 long delta)
597  {
598  	mod_zone_state(zone, item, delta, 0);
599  }
600  EXPORT_SYMBOL(mod_zone_page_state);
601  
602  void inc_zone_page_state(struct page *page, enum zone_stat_item item)
603  {
604  	mod_zone_state(page_zone(page), item, 1, 1);
605  }
606  EXPORT_SYMBOL(inc_zone_page_state);
607  
608  void dec_zone_page_state(struct page *page, enum zone_stat_item item)
609  {
610  	mod_zone_state(page_zone(page), item, -1, -1);
611  }
612  EXPORT_SYMBOL(dec_zone_page_state);
613  
614  static inline void mod_node_state(struct pglist_data *pgdat,
615         enum node_stat_item item, int delta, int overstep_mode)
616  {
617  	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
618  	s8 __percpu *p = pcp->vm_node_stat_diff + item;
619  	long o, n, t, z;
620  
621  	if (vmstat_item_in_bytes(item)) {
622  		/*
623  		 * Only cgroups use subpage accounting right now; at
624  		 * the global level, these items still change in
625  		 * multiples of whole pages. Store them as pages
626  		 * internally to keep the per-cpu counters compact.
627  		 */
628  		VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
629  		delta >>= PAGE_SHIFT;
630  	}
631  
632  	do {
633  		z = 0;  /* overflow to node counters */
634  
635  		/*
636  		 * The fetching of the stat_threshold is racy. We may apply
637  		 * a counter threshold to the wrong cpu if we get
638  		 * rescheduled while executing here. However, the next
639  		 * counter update will apply the threshold again and
640  		 * therefore bring the counter under the threshold again.
641  		 *
642  		 * Most of the time the thresholds are the same anyway
643  		 * for all cpus in a node.
644  		 */
645  		t = this_cpu_read(pcp->stat_threshold);
646  
647  		o = this_cpu_read(*p);
648  		n = delta + o;
649  
650  		if (abs(n) > t) {
651  			int os = overstep_mode * (t >> 1);
652  
653  			/* Overflow must be added to node counters */
654  			z = n + os;
655  			n = -os;
656  		}
657  	} while (this_cpu_cmpxchg(*p, o, n) != o);
658  
659  	if (z)
660  		node_page_state_add(z, pgdat, item);
661  }
662  
663  void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
664  					long delta)
665  {
666  	mod_node_state(pgdat, item, delta, 0);
667  }
668  EXPORT_SYMBOL(mod_node_page_state);
669  
670  void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
671  {
672  	mod_node_state(pgdat, item, 1, 1);
673  }
674  
675  void inc_node_page_state(struct page *page, enum node_stat_item item)
676  {
677  	mod_node_state(page_pgdat(page), item, 1, 1);
678  }
679  EXPORT_SYMBOL(inc_node_page_state);
680  
681  void dec_node_page_state(struct page *page, enum node_stat_item item)
682  {
683  	mod_node_state(page_pgdat(page), item, -1, -1);
684  }
685  EXPORT_SYMBOL(dec_node_page_state);
686  #else
687  /*
688   * Use interrupt disable to serialize counter updates
689   */
690  void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
691  			 long delta)
692  {
693  	unsigned long flags;
694  
695  	local_irq_save(flags);
696  	__mod_zone_page_state(zone, item, delta);
697  	local_irq_restore(flags);
698  }
699  EXPORT_SYMBOL(mod_zone_page_state);
700  
701  void inc_zone_page_state(struct page *page, enum zone_stat_item item)
702  {
703  	unsigned long flags;
704  	struct zone *zone;
705  
706  	zone = page_zone(page);
707  	local_irq_save(flags);
708  	__inc_zone_state(zone, item);
709  	local_irq_restore(flags);
710  }
711  EXPORT_SYMBOL(inc_zone_page_state);
712  
713  void dec_zone_page_state(struct page *page, enum zone_stat_item item)
714  {
715  	unsigned long flags;
716  
717  	local_irq_save(flags);
718  	__dec_zone_page_state(page, item);
719  	local_irq_restore(flags);
720  }
721  EXPORT_SYMBOL(dec_zone_page_state);
722  
723  void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
724  {
725  	unsigned long flags;
726  
727  	local_irq_save(flags);
728  	__inc_node_state(pgdat, item);
729  	local_irq_restore(flags);
730  }
731  EXPORT_SYMBOL(inc_node_state);
732  
733  void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
734  					long delta)
735  {
736  	unsigned long flags;
737  
738  	local_irq_save(flags);
739  	__mod_node_page_state(pgdat, item, delta);
740  	local_irq_restore(flags);
741  }
742  EXPORT_SYMBOL(mod_node_page_state);
743  
744  void inc_node_page_state(struct page *page, enum node_stat_item item)
745  {
746  	unsigned long flags;
747  	struct pglist_data *pgdat;
748  
749  	pgdat = page_pgdat(page);
750  	local_irq_save(flags);
751  	__inc_node_state(pgdat, item);
752  	local_irq_restore(flags);
753  }
754  EXPORT_SYMBOL(inc_node_page_state);
755  
756  void dec_node_page_state(struct page *page, enum node_stat_item item)
757  {
758  	unsigned long flags;
759  
760  	local_irq_save(flags);
761  	__dec_node_page_state(page, item);
762  	local_irq_restore(flags);
763  }
764  EXPORT_SYMBOL(dec_node_page_state);
765  #endif
766  
767  /*
768   * Fold a differential into the global counters.
769   * Returns the number of counters updated.
770   */
771  static int fold_diff(int *zone_diff, int *node_diff)
772  {
773  	int i;
774  	int changes = 0;
775  
776  	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
777  		if (zone_diff[i]) {
778  			atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
779  			changes++;
780  		}
781  
782  	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
783  		if (node_diff[i]) {
784  			atomic_long_add(node_diff[i], &vm_node_stat[i]);
785  			changes++;
786  		}
787  	return changes;
788  }
789  
790  /*
791   * Update the zone counters for the current cpu.
792   *
793   * Note that refresh_cpu_vm_stats strives to only access
794   * node local memory. The per cpu pagesets on remote zones are placed
795   * in the memory local to the processor using that pageset. So the
796   * loop over all zones will access a series of cachelines local to
797   * the processor.
798   *
799   * The call to zone_page_state_add updates the cachelines with the
800   * statistics in the remote zone struct as well as the global cachelines
801   * with the global counters. These could cause remote node cache line
802   * bouncing and will have to be only done when necessary.
803   *
804   * The function returns the number of global counters updated.
805   */
806  static int refresh_cpu_vm_stats(bool do_pagesets)
807  {
808  	struct pglist_data *pgdat;
809  	struct zone *zone;
810  	int i;
811  	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
812  	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
813  	int changes = 0;
814  
815  	for_each_populated_zone(zone) {
816  		struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
817  #ifdef CONFIG_NUMA
818  		struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
819  #endif
820  
821  		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
822  			int v;
823  
824  			v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
825  			if (v) {
826  
827  				atomic_long_add(v, &zone->vm_stat[i]);
828  				global_zone_diff[i] += v;
829  #ifdef CONFIG_NUMA
830  				/* 3 seconds idle till flush */
831  				__this_cpu_write(pcp->expire, 3);
832  #endif
833  			}
834  		}
835  #ifdef CONFIG_NUMA
836  
837  		if (do_pagesets) {
838  			cond_resched();
839  			/*
840  			 * Deal with draining the remote pageset of this
841  			 * processor
842  			 *
843  			 * Check if there are pages remaining in this pageset;
844  			 * if not, there is nothing to expire.
845  			 */
846  			if (!__this_cpu_read(pcp->expire) ||
847  			       !__this_cpu_read(pcp->count))
848  				continue;
849  
850  			/*
851  			 * We never drain zones local to this processor.
852  			 */
853  			if (zone_to_nid(zone) == numa_node_id()) {
854  				__this_cpu_write(pcp->expire, 0);
855  				continue;
856  			}
857  
858  			if (__this_cpu_dec_return(pcp->expire))
859  				continue;
860  
861  			if (__this_cpu_read(pcp->count)) {
862  				drain_zone_pages(zone, this_cpu_ptr(pcp));
863  				changes++;
864  			}
865  		}
866  #endif
867  	}
868  
869  	for_each_online_pgdat(pgdat) {
870  		struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
871  
872  		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
873  			int v;
874  
875  			v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
876  			if (v) {
877  				atomic_long_add(v, &pgdat->vm_stat[i]);
878  				global_node_diff[i] += v;
879  			}
880  		}
881  	}
882  
883  	changes += fold_diff(global_zone_diff, global_node_diff);
884  	return changes;
885  }
886  
887  /*
888   * Fold the data for an offline cpu into the global array.
889   * There cannot be any access by the offline cpu and therefore
890   * synchronization is simplified.
891   */
892  void cpu_vm_stats_fold(int cpu)
893  {
894  	struct pglist_data *pgdat;
895  	struct zone *zone;
896  	int i;
897  	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
898  	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
899  
900  	for_each_populated_zone(zone) {
901  		struct per_cpu_zonestat *pzstats;
902  
903  		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
904  
905  		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
906  			if (pzstats->vm_stat_diff[i]) {
907  				int v;
908  
909  				v = pzstats->vm_stat_diff[i];
910  				pzstats->vm_stat_diff[i] = 0;
911  				atomic_long_add(v, &zone->vm_stat[i]);
912  				global_zone_diff[i] += v;
913  			}
914  		}
915  #ifdef CONFIG_NUMA
916  		for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
917  			if (pzstats->vm_numa_event[i]) {
918  				unsigned long v;
919  
920  				v = pzstats->vm_numa_event[i];
921  				pzstats->vm_numa_event[i] = 0;
922  				zone_numa_event_add(v, zone, i);
923  			}
924  		}
925  #endif
926  	}
927  
928  	for_each_online_pgdat(pgdat) {
929  		struct per_cpu_nodestat *p;
930  
931  		p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
932  
933  		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
934  			if (p->vm_node_stat_diff[i]) {
935  				int v;
936  
937  				v = p->vm_node_stat_diff[i];
938  				p->vm_node_stat_diff[i] = 0;
939  				atomic_long_add(v, &pgdat->vm_stat[i]);
940  				global_node_diff[i] += v;
941  			}
942  	}
943  
944  	fold_diff(global_zone_diff, global_node_diff);
945  }
946  
947  /*
948   * This is only called if !populated_zone(zone), which implies no other users of
949   * pzstats->vm_stat_diff[] exist.
950   */
951  void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
952  {
953  	unsigned long v;
954  	int i;
955  
956  	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
957  		if (pzstats->vm_stat_diff[i]) {
958  			v = pzstats->vm_stat_diff[i];
959  			pzstats->vm_stat_diff[i] = 0;
960  			zone_page_state_add(v, zone, i);
961  		}
962  	}
963  
964  #ifdef CONFIG_NUMA
965  	for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
966  		if (pzstats->vm_numa_event[i]) {
967  			v = pzstats->vm_numa_event[i];
968  			pzstats->vm_numa_event[i] = 0;
969  			zone_numa_event_add(v, zone, i);
970  		}
971  	}
972  #endif
973  }
974  #endif
975  
976  #ifdef CONFIG_NUMA
977  /*
978   * Determine the per node value of a stat item. This function
979   * is called frequently in a NUMA machine, so try to be as
980   * frugal as possible.
981   */
982  unsigned long sum_zone_node_page_state(int node,
983  				 enum zone_stat_item item)
984  {
985  	struct zone *zones = NODE_DATA(node)->node_zones;
986  	int i;
987  	unsigned long count = 0;
988  
989  	for (i = 0; i < MAX_NR_ZONES; i++)
990  		count += zone_page_state(zones + i, item);
991  
992  	return count;
993  }
994  
995  /* Determine the per node value of a numa stat item. */
996  unsigned long sum_zone_numa_event_state(int node,
997  				 enum numa_stat_item item)
998  {
999  	struct zone *zones = NODE_DATA(node)->node_zones;
1000  	unsigned long count = 0;
1001  	int i;
1002  
1003  	for (i = 0; i < MAX_NR_ZONES; i++)
1004  		count += zone_numa_event_state(zones + i, item);
1005  
1006  	return count;
1007  }
1008  
1009  /*
1010   * Determine the per node value of a stat item.
1011   */
1012  unsigned long node_page_state_pages(struct pglist_data *pgdat,
1013  				    enum node_stat_item item)
1014  {
1015  	long x = atomic_long_read(&pgdat->vm_stat[item]);
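	/*
	 * Unfolded per-cpu diffs can make the summed counter transiently
	 * negative; hide that from readers.
	 */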
1016  #ifdef CONFIG_SMP
1017  	if (x < 0)
1018  		x = 0;
1019  #endif
1020  	return x;
1021  }
1022  
1023  unsigned long node_page_state(struct pglist_data *pgdat,
1024  			      enum node_stat_item item)
1025  {
1026  	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
1027  
1028  	return node_page_state_pages(pgdat, item);
1029  }
1030  #endif
1031  
1032  #ifdef CONFIG_COMPACTION
1033  
1034  struct contig_page_info {
1035  	unsigned long free_pages;
1036  	unsigned long free_blocks_total;
1037  	unsigned long free_blocks_suitable;
1038  };
1039  
1040  /*
1041   * Calculate the number of free pages in a zone, how many contiguous
1042   * free blocks there are and how many are large enough to satisfy an allocation of
1043   * the target size. Note that this function makes no attempt to estimate
1044   * how many suitable free blocks there *might* be if MOVABLE pages were
1045   * migrated. Calculating that is possible, but expensive and can be
1046   * figured out from userspace
1047   */
1048  static void fill_contig_page_info(struct zone *zone,
1049  				unsigned int suitable_order,
1050  				struct contig_page_info *info)
1051  {
1052  	unsigned int order;
1053  
1054  	info->free_pages = 0;
1055  	info->free_blocks_total = 0;
1056  	info->free_blocks_suitable = 0;
1057  
1058  	for (order = 0; order <= MAX_ORDER; order++) {
1059  		unsigned long blocks;
1060  
1061  		/*
1062  		 * Count number of free blocks.
1063  		 *
1064  		 * Access to nr_free is lockless as nr_free is used only for
1065  		 * diagnostic purposes. Use data_race to avoid KCSAN warning.
1066  		 */
1067  		blocks = data_race(zone->free_area[order].nr_free);
1068  		info->free_blocks_total += blocks;
1069  
1070  		/* Count free base pages */
1071  		info->free_pages += blocks << order;
1072  
1073  		/* Count the suitable free blocks */
1074  		if (order >= suitable_order)
1075  			info->free_blocks_suitable += blocks <<
1076  						(order - suitable_order);
1077  	}
1078  }
1079  
1080  /*
1081   * A fragmentation index only makes sense if an allocation of a requested
1082   * size would fail. If that is true, the fragmentation index indicates
1083   * whether external fragmentation or a lack of memory was the problem.
1084   * The value can be used to determine if page reclaim or compaction
1085   * should be used
1086   */
1087  static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
1088  {
1089  	unsigned long requested = 1UL << order;
1090  
1091  	if (WARN_ON_ONCE(order > MAX_ORDER))
1092  		return 0;
1093  
1094  	if (!info->free_blocks_total)
1095  		return 0;
1096  
1097  	/* Fragmentation index only makes sense when a request would fail */
1098  	if (info->free_blocks_suitable)
1099  		return -1000;
1100  
1101  	/*
1102  	 * Index is between 0 and 1, so return it scaled by 1000 (3 decimal places)
1103  	 *
1104  	 * 0 => allocation would fail due to lack of memory
1105  	 * 1 => allocation would fail due to fragmentation
1106  	 */
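	/*
	 * Example: 1000 free base pages spread over 500 small free blocks,
	 * none large enough for an order-2 request (requested = 4):
	 * 1000 - (1000 + 1000 * 1000 / 4) / 500 = 498, i.e. roughly half way
	 * between "out of memory" and "badly fragmented".
	 */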
1107  	return 1000 - div_u64((1000 + div_u64(info->free_pages * 1000ULL, requested)), info->free_blocks_total);
1108  }
1109  
1110  /*
1111   * Calculates external fragmentation within a zone wrt the given order.
1112   * It is defined as the percentage of pages found in blocks of size
1113   * less than 1 << order. It returns values in range [0, 100].
1114   */
1115  unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
1116  {
1117  	struct contig_page_info info;
1118  
1119  	fill_contig_page_info(zone, order, &info);
1120  	if (info.free_pages == 0)
1121  		return 0;
1122  
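	/*
	 * Example: with 1000 free pages of which 256 sit in blocks of at
	 * least the requested order, extfrag is (1000 - 256) * 100 / 1000,
	 * i.e. 74%.
	 */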
1123  	return div_u64((info.free_pages -
1124  			(info.free_blocks_suitable << order)) * 100,
1125  			info.free_pages);
1126  }
1127  
1128  /* Same as __fragmentation index but allocs contig_page_info on stack */
1129  int fragmentation_index(struct zone *zone, unsigned int order)
1130  {
1131  	struct contig_page_info info;
1132  
1133  	fill_contig_page_info(zone, order, &info);
1134  	return __fragmentation_index(order, &info);
1135  }
1136  #endif
1137  
1138  #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
1139      defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
1140  #ifdef CONFIG_ZONE_DMA
1141  #define TEXT_FOR_DMA(xx) xx "_dma",
1142  #else
1143  #define TEXT_FOR_DMA(xx)
1144  #endif
1145  
1146  #ifdef CONFIG_ZONE_DMA32
1147  #define TEXT_FOR_DMA32(xx) xx "_dma32",
1148  #else
1149  #define TEXT_FOR_DMA32(xx)
1150  #endif
1151  
1152  #ifdef CONFIG_HIGHMEM
1153  #define TEXT_FOR_HIGHMEM(xx) xx "_high",
1154  #else
1155  #define TEXT_FOR_HIGHMEM(xx)
1156  #endif
1157  
1158  #ifdef CONFIG_ZONE_DEVICE
1159  #define TEXT_FOR_DEVICE(xx) xx "_device",
1160  #else
1161  #define TEXT_FOR_DEVICE(xx)
1162  #endif
1163  
1164  #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
1165  					TEXT_FOR_HIGHMEM(xx) xx "_movable", \
1166  					TEXT_FOR_DEVICE(xx)
1167  
1168  const char * const vmstat_text[] = {
1169  	/* enum zone_stat_item counters */
1170  	"nr_free_pages",
1171  	"nr_zone_inactive_anon",
1172  	"nr_zone_active_anon",
1173  	"nr_zone_inactive_file",
1174  	"nr_zone_active_file",
1175  	"nr_zone_unevictable",
1176  	"nr_zone_write_pending",
1177  	"nr_mlock",
1178  	"nr_bounce",
1179  #if IS_ENABLED(CONFIG_ZSMALLOC)
1180  	"nr_zspages",
1181  #endif
1182  	"nr_free_cma",
1183  #ifdef CONFIG_UNACCEPTED_MEMORY
1184  	"nr_unaccepted",
1185  #endif
1186  
1187  	/* enum numa_stat_item counters */
1188  #ifdef CONFIG_NUMA
1189  	"numa_hit",
1190  	"numa_miss",
1191  	"numa_foreign",
1192  	"numa_interleave",
1193  	"numa_local",
1194  	"numa_other",
1195  #endif
1196  
1197  	/* enum node_stat_item counters */
1198  	"nr_inactive_anon",
1199  	"nr_active_anon",
1200  	"nr_inactive_file",
1201  	"nr_active_file",
1202  	"nr_unevictable",
1203  	"nr_slab_reclaimable",
1204  	"nr_slab_unreclaimable",
1205  	"nr_isolated_anon",
1206  	"nr_isolated_file",
1207  	"workingset_nodes",
1208  	"workingset_refault_anon",
1209  	"workingset_refault_file",
1210  	"workingset_activate_anon",
1211  	"workingset_activate_file",
1212  	"workingset_restore_anon",
1213  	"workingset_restore_file",
1214  	"workingset_nodereclaim",
1215  	"nr_anon_pages",
1216  	"nr_mapped",
1217  	"nr_file_pages",
1218  	"nr_dirty",
1219  	"nr_writeback",
1220  	"nr_writeback_temp",
1221  	"nr_shmem",
1222  	"nr_shmem_hugepages",
1223  	"nr_shmem_pmdmapped",
1224  	"nr_file_hugepages",
1225  	"nr_file_pmdmapped",
1226  	"nr_anon_transparent_hugepages",
1227  	"nr_vmscan_write",
1228  	"nr_vmscan_immediate_reclaim",
1229  	"nr_dirtied",
1230  	"nr_written",
1231  	"nr_throttled_written",
1232  	"nr_kernel_misc_reclaimable",
1233  	"nr_foll_pin_acquired",
1234  	"nr_foll_pin_released",
1235  	"nr_kernel_stack",
1236  #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
1237  	"nr_shadow_call_stack",
1238  #endif
1239  	"nr_page_table_pages",
1240  	"nr_sec_page_table_pages",
1241  #ifdef CONFIG_SWAP
1242  	"nr_swapcached",
1243  #endif
1244  #ifdef CONFIG_NUMA_BALANCING
1245  	"pgpromote_success",
1246  	"pgpromote_candidate",
1247  #endif
1248  
1249  	/* enum writeback_stat_item counters */
1250  	"nr_dirty_threshold",
1251  	"nr_dirty_background_threshold",
1252  
1253  #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
1254  	/* enum vm_event_item counters */
1255  	"pgpgin",
1256  	"pgpgout",
1257  	"pswpin",
1258  	"pswpout",
1259  
1260  	TEXTS_FOR_ZONES("pgalloc")
1261  	TEXTS_FOR_ZONES("allocstall")
1262  	TEXTS_FOR_ZONES("pgskip")
1263  
1264  	"pgfree",
1265  	"pgactivate",
1266  	"pgdeactivate",
1267  	"pglazyfree",
1268  
1269  	"pgfault",
1270  	"pgmajfault",
1271  	"pglazyfreed",
1272  
1273  	"pgrefill",
1274  	"pgreuse",
1275  	"pgsteal_kswapd",
1276  	"pgsteal_direct",
1277  	"pgsteal_khugepaged",
1278  	"pgdemote_kswapd",
1279  	"pgdemote_direct",
1280  	"pgdemote_khugepaged",
1281  	"pgscan_kswapd",
1282  	"pgscan_direct",
1283  	"pgscan_khugepaged",
1284  	"pgscan_direct_throttle",
1285  	"pgscan_anon",
1286  	"pgscan_file",
1287  	"pgsteal_anon",
1288  	"pgsteal_file",
1289  
1290  #ifdef CONFIG_NUMA
1291  	"zone_reclaim_failed",
1292  #endif
1293  	"pginodesteal",
1294  	"slabs_scanned",
1295  	"kswapd_inodesteal",
1296  	"kswapd_low_wmark_hit_quickly",
1297  	"kswapd_high_wmark_hit_quickly",
1298  	"pageoutrun",
1299  
1300  	"pgrotated",
1301  
1302  	"drop_pagecache",
1303  	"drop_slab",
1304  	"oom_kill",
1305  
1306  #ifdef CONFIG_NUMA_BALANCING
1307  	"numa_pte_updates",
1308  	"numa_huge_pte_updates",
1309  	"numa_hint_faults",
1310  	"numa_hint_faults_local",
1311  	"numa_pages_migrated",
1312  #endif
1313  #ifdef CONFIG_MIGRATION
1314  	"pgmigrate_success",
1315  	"pgmigrate_fail",
1316  	"thp_migration_success",
1317  	"thp_migration_fail",
1318  	"thp_migration_split",
1319  #endif
1320  #ifdef CONFIG_COMPACTION
1321  	"compact_migrate_scanned",
1322  	"compact_free_scanned",
1323  	"compact_isolated",
1324  	"compact_stall",
1325  	"compact_fail",
1326  	"compact_success",
1327  	"compact_daemon_wake",
1328  	"compact_daemon_migrate_scanned",
1329  	"compact_daemon_free_scanned",
1330  #endif
1331  
1332  #ifdef CONFIG_HUGETLB_PAGE
1333  	"htlb_buddy_alloc_success",
1334  	"htlb_buddy_alloc_fail",
1335  #endif
1336  #ifdef CONFIG_CMA
1337  	"cma_alloc_success",
1338  	"cma_alloc_fail",
1339  #endif
1340  	"unevictable_pgs_culled",
1341  	"unevictable_pgs_scanned",
1342  	"unevictable_pgs_rescued",
1343  	"unevictable_pgs_mlocked",
1344  	"unevictable_pgs_munlocked",
1345  	"unevictable_pgs_cleared",
1346  	"unevictable_pgs_stranded",
1347  
1348  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1349  	"thp_fault_alloc",
1350  	"thp_fault_fallback",
1351  	"thp_fault_fallback_charge",
1352  	"thp_collapse_alloc",
1353  	"thp_collapse_alloc_failed",
1354  	"thp_file_alloc",
1355  	"thp_file_fallback",
1356  	"thp_file_fallback_charge",
1357  	"thp_file_mapped",
1358  	"thp_split_page",
1359  	"thp_split_page_failed",
1360  	"thp_deferred_split_page",
1361  	"thp_split_pmd",
1362  	"thp_scan_exceed_none_pte",
1363  	"thp_scan_exceed_swap_pte",
1364  	"thp_scan_exceed_share_pte",
1365  #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1366  	"thp_split_pud",
1367  #endif
1368  	"thp_zero_page_alloc",
1369  	"thp_zero_page_alloc_failed",
1370  	"thp_swpout",
1371  	"thp_swpout_fallback",
1372  #endif
1373  #ifdef CONFIG_MEMORY_BALLOON
1374  	"balloon_inflate",
1375  	"balloon_deflate",
1376  #ifdef CONFIG_BALLOON_COMPACTION
1377  	"balloon_migrate",
1378  #endif
1379  #endif /* CONFIG_MEMORY_BALLOON */
1380  #ifdef CONFIG_DEBUG_TLBFLUSH
1381  	"nr_tlb_remote_flush",
1382  	"nr_tlb_remote_flush_received",
1383  	"nr_tlb_local_flush_all",
1384  	"nr_tlb_local_flush_one",
1385  #endif /* CONFIG_DEBUG_TLBFLUSH */
1386  
1387  #ifdef CONFIG_SWAP
1388  	"swap_ra",
1389  	"swap_ra_hit",
1390  #ifdef CONFIG_KSM
1391  	"ksm_swpin_copy",
1392  #endif
1393  #endif
1394  #ifdef CONFIG_KSM
1395  	"cow_ksm",
1396  #endif
1397  #ifdef CONFIG_ZSWAP
1398  	"zswpin",
1399  	"zswpout",
1400  #endif
1401  #ifdef CONFIG_X86
1402  	"direct_map_level2_splits",
1403  	"direct_map_level3_splits",
1404  #endif
1405  #ifdef CONFIG_PER_VMA_LOCK_STATS
1406  	"vma_lock_success",
1407  	"vma_lock_abort",
1408  	"vma_lock_retry",
1409  	"vma_lock_miss",
1410  #endif
1411  #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
1412  };
1413  #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
1414  
1415  #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
1416       defined(CONFIG_PROC_FS)
1417  static void *frag_start(struct seq_file *m, loff_t *pos)
1418  {
1419  	pg_data_t *pgdat;
1420  	loff_t node = *pos;
1421  
1422  	for (pgdat = first_online_pgdat();
1423  	     pgdat && node;
1424  	     pgdat = next_online_pgdat(pgdat))
1425  		--node;
1426  
1427  	return pgdat;
1428  }
1429  
1430  static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
1431  {
1432  	pg_data_t *pgdat = (pg_data_t *)arg;
1433  
1434  	(*pos)++;
1435  	return next_online_pgdat(pgdat);
1436  }
1437  
1438  static void frag_stop(struct seq_file *m, void *arg)
1439  {
1440  }
1441  
1442  /*
1443   * Walk zones in a node and print using a callback.
1444   * If @assert_populated is true, only use callback for zones that are populated.
1445   */
1446  static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
1447  		bool assert_populated, bool nolock,
1448  		void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
1449  {
1450  	struct zone *zone;
1451  	struct zone *node_zones = pgdat->node_zones;
1452  	unsigned long flags;
1453  
1454  	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
1455  		if (assert_populated && !populated_zone(zone))
1456  			continue;
1457  
1458  		if (!nolock)
1459  			spin_lock_irqsave(&zone->lock, flags);
1460  		print(m, pgdat, zone);
1461  		if (!nolock)
1462  			spin_unlock_irqrestore(&zone->lock, flags);
1463  	}
1464  }
1465  #endif
1466  
1467  #ifdef CONFIG_PROC_FS
1468  static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
1469  						struct zone *zone)
1470  {
1471  	int order;
1472  
1473  	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1474  	for (order = 0; order <= MAX_ORDER; ++order)
1475  		/*
1476  		 * Access to nr_free is lockless as nr_free is used only for
1477  		 * printing purposes. Use data_race to avoid KCSAN warning.
1478  		 */
1479  		seq_printf(m, "%6lu ", data_race(zone->free_area[order].nr_free));
1480  	seq_putc(m, '\n');
1481  }
1482  
1483  /*
1484   * This walks the free areas for each zone.
1485   */
1486  static int frag_show(struct seq_file *m, void *arg)
1487  {
1488  	pg_data_t *pgdat = (pg_data_t *)arg;
1489  	walk_zones_in_node(m, pgdat, true, false, frag_show_print);
1490  	return 0;
1491  }
1492  
1493  static void pagetypeinfo_showfree_print(struct seq_file *m,
1494  					pg_data_t *pgdat, struct zone *zone)
1495  {
1496  	int order, mtype;
1497  
1498  	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
1499  		seq_printf(m, "Node %4d, zone %8s, type %12s ",
1500  					pgdat->node_id,
1501  					zone->name,
1502  					migratetype_names[mtype]);
1503  		for (order = 0; order <= MAX_ORDER; ++order) {
1504  			unsigned long freecount = 0;
1505  			struct free_area *area;
1506  			struct list_head *curr;
1507  			bool overflow = false;
1508  
1509  			area = &(zone->free_area[order]);
1510  
1511  			list_for_each(curr, &area->free_list[mtype]) {
1512  				/*
1513  				 * Cap the free_list iteration because it might
1514  				 * be really large and we are under a spinlock
1515  				 * so a long time spent here could trigger a
1516  				 * hard lockup detector. Anyway this is a
1517  				 * debugging tool so knowing there is a handful
1518  				 * of pages of this order should be more than
1519  				 * sufficient.
1520  				 */
1521  				if (++freecount >= 100000) {
1522  					overflow = true;
1523  					break;
1524  				}
1525  			}
1526  			seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
1527  			spin_unlock_irq(&zone->lock);
1528  			cond_resched();
1529  			spin_lock_irq(&zone->lock);
1530  		}
1531  		seq_putc(m, '\n');
1532  	}
1533  }
1534  
1535  /* Print out the free pages at each order for each migratetype */
1536  static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
1537  {
1538  	int order;
1539  	pg_data_t *pgdat = (pg_data_t *)arg;
1540  
1541  	/* Print header */
1542  	seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
1543  	for (order = 0; order <= MAX_ORDER; ++order)
1544  		seq_printf(m, "%6d ", order);
1545  	seq_putc(m, '\n');
1546  
1547  	walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
1548  }
1549  
1550  static void pagetypeinfo_showblockcount_print(struct seq_file *m,
1551  					pg_data_t *pgdat, struct zone *zone)
1552  {
1553  	int mtype;
1554  	unsigned long pfn;
1555  	unsigned long start_pfn = zone->zone_start_pfn;
1556  	unsigned long end_pfn = zone_end_pfn(zone);
1557  	unsigned long count[MIGRATE_TYPES] = { 0, };
1558  
1559  	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
1560  		struct page *page;
1561  
1562  		page = pfn_to_online_page(pfn);
1563  		if (!page)
1564  			continue;
1565  
1566  		if (page_zone(page) != zone)
1567  			continue;
1568  
1569  		mtype = get_pageblock_migratetype(page);
1570  
1571  		if (mtype < MIGRATE_TYPES)
1572  			count[mtype]++;
1573  	}
1574  
1575  	/* Print counts */
1576  	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1577  	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1578  		seq_printf(m, "%12lu ", count[mtype]);
1579  	seq_putc(m, '\n');
1580  }
1581  
1582  /* Print out the number of pageblocks for each migratetype */
1583  static void pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
1584  {
1585  	int mtype;
1586  	pg_data_t *pgdat = (pg_data_t *)arg;
1587  
1588  	seq_printf(m, "\n%-23s", "Number of blocks type ");
1589  	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1590  		seq_printf(m, "%12s ", migratetype_names[mtype]);
1591  	seq_putc(m, '\n');
1592  	walk_zones_in_node(m, pgdat, true, false,
1593  		pagetypeinfo_showblockcount_print);
1594  }
1595  
1596  /*
1597   * Print out the number of pageblocks for each migratetype that contain pages
1598   * of other types. This gives an indication of how well fallbacks are being
1599   * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
1600   * to determine what is going on
1601   */
1602  static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
1603  {
1604  #ifdef CONFIG_PAGE_OWNER
1605  	int mtype;
1606  
1607  	if (!static_branch_unlikely(&page_owner_inited))
1608  		return;
1609  
1610  	drain_all_pages(NULL);
1611  
1612  	seq_printf(m, "\n%-23s", "Number of mixed blocks ");
1613  	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1614  		seq_printf(m, "%12s ", migratetype_names[mtype]);
1615  	seq_putc(m, '\n');
1616  
1617  	walk_zones_in_node(m, pgdat, true, true,
1618  		pagetypeinfo_showmixedcount_print);
1619  #endif /* CONFIG_PAGE_OWNER */
1620  }
1621  
1622  /*
1623   * This prints out statistics in relation to grouping pages by mobility.
1624   * It is expensive to collect so do not constantly read the file.
1625   */
1626  static int pagetypeinfo_show(struct seq_file *m, void *arg)
1627  {
1628  	pg_data_t *pgdat = (pg_data_t *)arg;
1629  
1630  	/* check memoryless node */
1631  	if (!node_state(pgdat->node_id, N_MEMORY))
1632  		return 0;
1633  
1634  	seq_printf(m, "Page block order: %d\n", pageblock_order);
1635  	seq_printf(m, "Pages per block:  %lu\n", pageblock_nr_pages);
1636  	seq_putc(m, '\n');
1637  	pagetypeinfo_showfree(m, pgdat);
1638  	pagetypeinfo_showblockcount(m, pgdat);
1639  	pagetypeinfo_showmixedcount(m, pgdat);
1640  
1641  	return 0;
1642  }
1643  
1644  static const struct seq_operations fragmentation_op = {
1645  	.start	= frag_start,
1646  	.next	= frag_next,
1647  	.stop	= frag_stop,
1648  	.show	= frag_show,
1649  };
1650  
1651  static const struct seq_operations pagetypeinfo_op = {
1652  	.start	= frag_start,
1653  	.next	= frag_next,
1654  	.stop	= frag_stop,
1655  	.show	= pagetypeinfo_show,
1656  };
1657  
1658  static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
1659  {
1660  	int zid;
1661  
1662  	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1663  		struct zone *compare = &pgdat->node_zones[zid];
1664  
1665  		if (populated_zone(compare))
1666  			return zone == compare;
1667  	}
1668  
1669  	return false;
1670  }
1671  
1672  static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1673  							struct zone *zone)
1674  {
1675  	int i;
1676  	seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
1677  	if (is_zone_first_populated(pgdat, zone)) {
1678  		seq_printf(m, "\n  per-node stats");
1679  		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
1680  			unsigned long pages = node_page_state_pages(pgdat, i);
1681  
1682  			if (vmstat_item_print_in_thp(i))
1683  				pages /= HPAGE_PMD_NR;
1684  			seq_printf(m, "\n      %-12s %lu", node_stat_name(i),
1685  				   pages);
1686  		}
1687  	}
1688  	seq_printf(m,
1689  		   "\n  pages free     %lu"
1690  		   "\n        boost    %lu"
1691  		   "\n        min      %lu"
1692  		   "\n        low      %lu"
1693  		   "\n        high     %lu"
1694  		   "\n        spanned  %lu"
1695  		   "\n        present  %lu"
1696  		   "\n        managed  %lu"
1697  		   "\n        cma      %lu",
1698  		   zone_page_state(zone, NR_FREE_PAGES),
1699  		   zone->watermark_boost,
1700  		   min_wmark_pages(zone),
1701  		   low_wmark_pages(zone),
1702  		   high_wmark_pages(zone),
1703  		   zone->spanned_pages,
1704  		   zone->present_pages,
1705  		   zone_managed_pages(zone),
1706  		   zone_cma_pages(zone));
1707  
1708  	seq_printf(m,
1709  		   "\n        protection: (%ld",
1710  		   zone->lowmem_reserve[0]);
1711  	for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
1712  		seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
1713  	seq_putc(m, ')');
1714  
1715  	/* If unpopulated, no other information is useful */
1716  	if (!populated_zone(zone)) {
1717  		seq_putc(m, '\n');
1718  		return;
1719  	}
1720  
1721  	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1722  		seq_printf(m, "\n      %-12s %lu", zone_stat_name(i),
1723  			   zone_page_state(zone, i));
1724  
1725  #ifdef CONFIG_NUMA
1726  	for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
1727  		seq_printf(m, "\n      %-12s %lu", numa_stat_name(i),
1728  			   zone_numa_event_state(zone, i));
1729  #endif
1730  
1731  	seq_printf(m, "\n  pagesets");
1732  	for_each_online_cpu(i) {
1733  		struct per_cpu_pages *pcp;
1734  		struct per_cpu_zonestat __maybe_unused *pzstats;
1735  
1736  		pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
1737  		seq_printf(m,
1738  			   "\n    cpu: %i"
1739  			   "\n              count: %i"
1740  			   "\n              high:  %i"
1741  			   "\n              batch: %i",
1742  			   i,
1743  			   pcp->count,
1744  			   pcp->high,
1745  			   pcp->batch);
1746  #ifdef CONFIG_SMP
1747  		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
1748  		seq_printf(m, "\n  vm stats threshold: %d",
1749  				pzstats->stat_threshold);
1750  #endif
1751  	}
1752  	seq_printf(m,
1753  		   "\n  node_unreclaimable:  %u"
1754  		   "\n  start_pfn:           %lu",
1755  		   pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
1756  		   zone->zone_start_pfn);
1757  	seq_putc(m, '\n');
1758  }
1759  
1760  /*
1761   * Output information about zones in @pgdat.  All zones are printed regardless
1762   * of whether they are populated or not: lowmem_reserve_ratio operates on the
1763   * set of all zones and userspace would not be aware of such zones if they are
1764   * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
1765   */
1766  static int zoneinfo_show(struct seq_file *m, void *arg)
1767  {
1768  	pg_data_t *pgdat = (pg_data_t *)arg;
1769  	walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
1770  	return 0;
1771  }
1772  
1773  static const struct seq_operations zoneinfo_op = {
1774  	.start	= frag_start, /* iterate over all zones. The same as in
1775  			       * fragmentation. */
1776  	.next	= frag_next,
1777  	.stop	= frag_stop,
1778  	.show	= zoneinfo_show,
1779  };
1780  
1781  #define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
1782  			 NR_VM_NUMA_EVENT_ITEMS + \
1783  			 NR_VM_NODE_STAT_ITEMS + \
1784  			 NR_VM_WRITEBACK_STAT_ITEMS + \
1785  			 (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
1786  			  NR_VM_EVENT_ITEMS : 0))
1787  
1788  static void *vmstat_start(struct seq_file *m, loff_t *pos)
1789  {
1790  	unsigned long *v;
1791  	int i;
1792  
1793  	if (*pos >= NR_VMSTAT_ITEMS)
1794  		return NULL;
1795  
1796  	BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
1797  	fold_vm_numa_events();
1798  	v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
1799  	m->private = v;
1800  	if (!v)
1801  		return ERR_PTR(-ENOMEM);
1802  	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1803  		v[i] = global_zone_page_state(i);
1804  	v += NR_VM_ZONE_STAT_ITEMS;
1805  
1806  #ifdef CONFIG_NUMA
1807  	for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
1808  		v[i] = global_numa_event_state(i);
1809  	v += NR_VM_NUMA_EVENT_ITEMS;
1810  #endif
1811  
1812  	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
1813  		v[i] = global_node_page_state_pages(i);
1814  		if (vmstat_item_print_in_thp(i))
1815  			v[i] /= HPAGE_PMD_NR;
1816  	}
1817  	v += NR_VM_NODE_STAT_ITEMS;
1818  
1819  	global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
1820  			    v + NR_DIRTY_THRESHOLD);
1821  	v += NR_VM_WRITEBACK_STAT_ITEMS;
1822  
1823  #ifdef CONFIG_VM_EVENT_COUNTERS
1824  	all_vm_events(v);
1825  	v[PGPGIN] /= 2;		/* sectors -> kbytes */
1826  	v[PGPGOUT] /= 2;
1827  #endif
1828  	return (unsigned long *)m->private + *pos;
1829  }
1830  
1831  static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
1832  {
1833  	(*pos)++;
1834  	if (*pos >= NR_VMSTAT_ITEMS)
1835  		return NULL;
1836  	return (unsigned long *)m->private + *pos;
1837  }
1838  
1839  static int vmstat_show(struct seq_file *m, void *arg)
1840  {
1841  	unsigned long *l = arg;
1842  	unsigned long off = l - (unsigned long *)m->private;
1843  
1844  	seq_puts(m, vmstat_text[off]);
1845  	seq_put_decimal_ull(m, " ", *l);
1846  	seq_putc(m, '\n');
1847  
1848  	if (off == NR_VMSTAT_ITEMS - 1) {
1849  		/*
1850  		 * We've come to the end - add any deprecated counters to avoid
1851  		 * breaking userspace which might depend on them being present.
1852  		 */
1853  		seq_puts(m, "nr_unstable 0\n");
1854  	}
1855  	return 0;
1856  }
1857  
1858  static void vmstat_stop(struct seq_file *m, void *arg)
1859  {
1860  	kfree(m->private);
1861  	m->private = NULL;
1862  }
1863  
1864  static const struct seq_operations vmstat_op = {
1865  	.start	= vmstat_start,
1866  	.next	= vmstat_next,
1867  	.stop	= vmstat_stop,
1868  	.show	= vmstat_show,
1869  };
1870  #endif /* CONFIG_PROC_FS */
1871  
1872  #ifdef CONFIG_SMP
1873  static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
1874  int sysctl_stat_interval __read_mostly = HZ;
1875  
1876  #ifdef CONFIG_PROC_FS
1877  static void refresh_vm_stats(struct work_struct *work)
1878  {
1879  	refresh_cpu_vm_stats(true);
1880  }
1881  
1882  int vmstat_refresh(struct ctl_table *table, int write,
1883  		   void *buffer, size_t *lenp, loff_t *ppos)
1884  {
1885  	long val;
1886  	int err;
1887  	int i;
1888  
1889  	/*
1890  	 * The regular update, every sysctl_stat_interval, may come later
1891  	 * than expected, leaving a significant amount in the per-cpu buckets.
1892  	 * This is particularly misleading when checking a quantity of HUGE
1893  	 * pages, immediately after running a test.  /proc/sys/vm/stat_refresh,
1894  	 * which can equally be echoed to or catted from (by root),
1895  	 * can be used to update the stats just before reading them.
1896  	 *
1897  	 * And since global_zone_page_state() etc. are so careful to hide
1898  	 * transiently negative values, warn here if any of the stats is
1899  	 * negative, so we know to go looking for imbalance.
1900  	 */
1901  	err = schedule_on_each_cpu(refresh_vm_stats);
1902  	if (err)
1903  		return err;
1904  	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
1905  		/*
1906  		 * Skip checking stats known to go negative occasionally.
1907  		 */
1908  		switch (i) {
1909  		case NR_ZONE_WRITE_PENDING:
1910  		case NR_FREE_CMA_PAGES:
1911  			continue;
1912  		}
1913  		val = atomic_long_read(&vm_zone_stat[i]);
1914  		if (val < 0) {
1915  			pr_warn("%s: %s %ld\n",
1916  				__func__, zone_stat_name(i), val);
1917  		}
1918  	}
1919  	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
1920  		/*
1921  		 * Skip checking stats known to go negative occasionally.
1922  		 */
1923  		switch (i) {
1924  		case NR_WRITEBACK:
1925  			continue;
1926  		}
1927  		val = atomic_long_read(&vm_node_stat[i]);
1928  		if (val < 0) {
1929  			pr_warn("%s: %s %ld\n",
1930  				__func__, node_stat_name(i), val);
1931  		}
1932  	}
1933  	if (write)
1934  		*ppos += *lenp;
1935  	else
1936  		*lenp = 0;
1937  	return 0;
1938  }
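/*
 * Editorial illustration (not part of this file): as the comment above notes,
 * root can write to (or read from) /proc/sys/vm/stat_refresh to flush the
 * per-cpu deltas before sampling /proc/vmstat.  A userspace sketch of that
 * sequence:
 */
#if 0	/* userspace example, not kernel code */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/sys/vm/stat_refresh", "w");

	if (!f) {
		perror("stat_refresh (root required)");
		return 1;
	}
	fputs("1\n", f);	/* the written value is ignored; any write flushes */
	fclose(f);

	f = fopen("/proc/vmstat", "r");
	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
#endif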
1939  #endif /* CONFIG_PROC_FS */
1940  
1941  static void vmstat_update(struct work_struct *w)
1942  {
1943  	if (refresh_cpu_vm_stats(true)) {
1944  		/*
1945  		 * Counters were updated so we expect more updates
1946  		 * to occur in the future. Keep on running the
1947  		 * update worker thread.
1948  		 */
1949  		queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
1950  				this_cpu_ptr(&vmstat_work),
1951  				round_jiffies_relative(sysctl_stat_interval));
1952  	}
1953  }
1954  
1955  /*
1956   * Check if the diffs for a certain cpu indicate that
1957   * an update is needed.
1958   */
1959  static bool need_update(int cpu)
1960  {
1961  	pg_data_t *last_pgdat = NULL;
1962  	struct zone *zone;
1963  
1964  	for_each_populated_zone(zone) {
1965  		struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
1966  		struct per_cpu_nodestat *n;
1967  
1968  		/*
1969  		 * The fast way of checking if there are any vmstat diffs.
1970  		 */
1971  		if (memchr_inv(pzstats->vm_stat_diff, 0, sizeof(pzstats->vm_stat_diff)))
1972  			return true;
1973  
1974  		if (last_pgdat == zone->zone_pgdat)
1975  			continue;
1976  		last_pgdat = zone->zone_pgdat;
1977  		n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
1978  		if (memchr_inv(n->vm_node_stat_diff, 0, sizeof(n->vm_node_stat_diff)))
1979  			return true;
1980  	}
1981  	return false;
1982  }
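/*
 * Editorial illustration (not part of this file): need_update() relies on
 * memchr_inv(buf, 0, len), which returns NULL only when every byte in the
 * range equals the given value, i.e. when the whole per-cpu diff array is
 * zero.  A plain userspace equivalent of that all-zero test:
 */
#if 0	/* userspace example, not kernel code */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Return true when every element of a per-cpu diff array is zero. */
static bool diffs_all_zero(const signed char *diff, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (diff[i] != 0)
			return false;
	return true;
}

int main(void)
{
	signed char diff[4] = { 0, 0, 1, 0 };

	printf("%s\n", diffs_all_zero(diff, 4) ? "clean" : "needs update");
	return 0;
}
#endif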
1983  
1984  /*
1985   * Fold this CPU's outstanding vmstat differentials into the global
1986   * counters. Used on the NOHZ path and may only be invoked when tick
1987   * processing is not active.
1988   */
1989  void quiet_vmstat(void)
1990  {
1991  	if (system_state != SYSTEM_RUNNING)
1992  		return;
1993  
1994  	if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
1995  		return;
1996  
1997  	if (!need_update(smp_processor_id()))
1998  		return;
1999  
2000  	/*
2001  	 * Just refresh counters and do not care about the pending delayed
2002  	 * vmstat_update. It doesn't fire that often to matter and canceling
2003  	 * it would be too expensive from this path.
2004  	 * vmstat_shepherd will take care about that for us.
2005  	 */
2006  	refresh_cpu_vm_stats(false);
2007  }
2008  
2009  /*
2010   * Shepherd worker that checks the per-cpu differentials of CPUs whose
2011   * vmstat update workers have gone idle because of inactivity, and
2012   * requeues the update work on any CPU that has accumulated new
2013   * diffs in the meantime.
2014   */
2015  static void vmstat_shepherd(struct work_struct *w);
2016  
2017  static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
2018  
2019  static void vmstat_shepherd(struct work_struct *w)
2020  {
2021  	int cpu;
2022  
2023  	cpus_read_lock();
2024  	/* Check processors whose vmstat worker threads have been disabled */
2025  	for_each_online_cpu(cpu) {
2026  		struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
2027  
2028  		/*
2029  		 * In-kernel users of vmstat counters either require the precise value, in
2030  		 * which case they use the zone_page_state_snapshot() interface, or they can
2031  		 * live with some imprecision, since the regular flushing happens at arbitrary
2032  		 * times and the cumulative error can grow (see calculate_normal_threshold).
2033  		 *
2034  		 * From that point of view the regular flushing can be postponed for CPUs that
2035  		 * have been isolated from kernel interference without critical infrastructure
2036  		 * ever noticing. Skip the regular flushing from vmstat_shepherd for all
2037  		 * isolated CPUs to avoid interfering with the isolated workload.
2038  		 */
2039  		if (cpu_is_isolated(cpu))
2040  			continue;
2041  
2042  		if (!delayed_work_pending(dw) && need_update(cpu))
2043  			queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
2044  
2045  		cond_resched();
2046  	}
2047  	cpus_read_unlock();
2048  
2049  	schedule_delayed_work(&shepherd,
2050  		round_jiffies_relative(sysctl_stat_interval));
2051  }
2052  
2053  static void __init start_shepherd_timer(void)
2054  {
2055  	int cpu;
2056  
2057  	for_each_possible_cpu(cpu)
2058  		INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
2059  			vmstat_update);
2060  
2061  	schedule_delayed_work(&shepherd,
2062  		round_jiffies_relative(sysctl_stat_interval));
2063  }
2064  
2065  static void __init init_cpu_node_state(void)
2066  {
2067  	int node;
2068  
2069  	for_each_online_node(node) {
2070  		if (!cpumask_empty(cpumask_of_node(node)))
2071  			node_set_state(node, N_CPU);
2072  	}
2073  }
2074  
2075  static int vmstat_cpu_online(unsigned int cpu)
2076  {
2077  	refresh_zone_stat_thresholds();
2078  
2079  	if (!node_state(cpu_to_node(cpu), N_CPU))
2080  		node_set_state(cpu_to_node(cpu), N_CPU);
2082  
2083  	return 0;
2084  }
2085  
2086  static int vmstat_cpu_down_prep(unsigned int cpu)
2087  {
2088  	cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
2089  	return 0;
2090  }
2091  
2092  static int vmstat_cpu_dead(unsigned int cpu)
2093  {
2094  	const struct cpumask *node_cpus;
2095  	int node;
2096  
2097  	node = cpu_to_node(cpu);
2098  
2099  	refresh_zone_stat_thresholds();
2100  	node_cpus = cpumask_of_node(node);
2101  	if (!cpumask_empty(node_cpus))
2102  		return 0;
2103  
2104  	node_clear_state(node, N_CPU);
2105  
2106  	return 0;
2107  }
2108  
2109  #endif
2110  
2111  struct workqueue_struct *mm_percpu_wq;
2112  
2113  void __init init_mm_internals(void)
2114  {
2115  	int ret __maybe_unused;
2116  
2117  	mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
2118  
2119  #ifdef CONFIG_SMP
2120  	ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
2121  					NULL, vmstat_cpu_dead);
2122  	if (ret < 0)
2123  		pr_err("vmstat: failed to register 'dead' hotplug state\n");
2124  
2125  	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
2126  					vmstat_cpu_online,
2127  					vmstat_cpu_down_prep);
2128  	if (ret < 0)
2129  		pr_err("vmstat: failed to register 'online' hotplug state\n");
2130  
2131  	cpus_read_lock();
2132  	init_cpu_node_state();
2133  	cpus_read_unlock();
2134  
2135  	start_shepherd_timer();
2136  #endif
2137  #ifdef CONFIG_PROC_FS
2138  	proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
2139  	proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
2140  	proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
2141  	proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
2142  #endif
2143  }
2144  
2145  #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
2146  
2147  /*
2148   * Return an index indicating how much of the available free memory is
2149   * unusable for an allocation of the requested size.
2150   */
2151  static int unusable_free_index(unsigned int order,
2152  				struct contig_page_info *info)
2153  {
2154  	/* No free memory is interpreted as all free memory is unusable */
2155  	if (info->free_pages == 0)
2156  		return 1000;
2157  
2158  	/*
2159  	 * The index is conceptually a value between 0 and 1; return it scaled
2160  	 * by 1000, giving three decimal places of precision.
2161  	 *
2162  	 * 0 => no fragmentation
2163  	 * 1 => high fragmentation
2164  	 */
2165  	return div_u64((info->free_pages - (info->free_blocks_suitable << order)) *
2166  			1000ULL, info->free_pages);
2167  }
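/*
 * Editorial illustration (not part of this file): a worked example of the
 * index computed above.  With 1000 free pages of which only 20 blocks are big
 * enough for an order-4 (16-page) request, 320 pages are usable, so the index
 * is (1000 - (20 << 4)) * 1000 / 1000 = 680, printed by unusable_show_print()
 * as "0.680".  The numbers are made up for the example.
 */
#if 0	/* userspace example, not kernel code */
#include <stdio.h>

int main(void)
{
	unsigned long free_pages = 1000;		/* hypothetical zone state */
	unsigned long free_blocks_suitable = 20;	/* free blocks of >= 2^order pages */
	unsigned int order = 4;
	unsigned long index;

	index = (free_pages - (free_blocks_suitable << order)) * 1000 /
		free_pages;
	printf("%lu.%03lu\n", index / 1000, index % 1000);	/* prints 0.680 */
	return 0;
}
#endif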
2168  
2169  static void unusable_show_print(struct seq_file *m,
2170  					pg_data_t *pgdat, struct zone *zone)
2171  {
2172  	unsigned int order;
2173  	int index;
2174  	struct contig_page_info info;
2175  
2176  	seq_printf(m, "Node %d, zone %8s ",
2177  				pgdat->node_id,
2178  				zone->name);
2179  	for (order = 0; order <= MAX_ORDER; ++order) {
2180  		fill_contig_page_info(zone, order, &info);
2181  		index = unusable_free_index(order, &info);
2182  		seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2183  	}
2184  
2185  	seq_putc(m, '\n');
2186  }
2187  
2188  /*
2189   * Display unusable free space index
2190   *
2191   * The unusable free space index measures how much of the available free
2192   * memory cannot be used to satisfy an allocation of a given size; it is a
2193   * value between 0 and 1. The higher the value, the more of the free memory
2194   * is unusable and, by implication, the worse the external fragmentation.
2195   * The index can be expressed as a percentage by multiplying by 100.
2196   */
2197  static int unusable_show(struct seq_file *m, void *arg)
2198  {
2199  	pg_data_t *pgdat = (pg_data_t *)arg;
2200  
2201  	/* skip memoryless nodes */
2202  	if (!node_state(pgdat->node_id, N_MEMORY))
2203  		return 0;
2204  
2205  	walk_zones_in_node(m, pgdat, true, false, unusable_show_print);
2206  
2207  	return 0;
2208  }
2209  
2210  static const struct seq_operations unusable_sops = {
2211  	.start	= frag_start,
2212  	.next	= frag_next,
2213  	.stop	= frag_stop,
2214  	.show	= unusable_show,
2215  };
2216  
2217  DEFINE_SEQ_ATTRIBUTE(unusable);
2218  
2219  static void extfrag_show_print(struct seq_file *m,
2220  					pg_data_t *pgdat, struct zone *zone)
2221  {
2222  	unsigned int order;
2223  	int index;
2224  
2225  	/* Allocate on the stack as interrupts are disabled during the zone walk */
2226  	struct contig_page_info info;
2227  
2228  	seq_printf(m, "Node %d, zone %8s ",
2229  				pgdat->node_id,
2230  				zone->name);
2231  	for (order = 0; order <= MAX_ORDER; ++order) {
2232  		fill_contig_page_info(zone, order, &info);
2233  		index = __fragmentation_index(order, &info);
2234  		seq_printf(m, "%2d.%03d ", index / 1000, index % 1000);
2235  	}
2236  
2237  	seq_putc(m, '\n');
2238  }
2239  
2240  /*
2241   * Display the fragmentation index for orders at which allocations would fail
2242   */
2243  static int extfrag_show(struct seq_file *m, void *arg)
2244  {
2245  	pg_data_t *pgdat = (pg_data_t *)arg;
2246  
2247  	walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);
2248  
2249  	return 0;
2250  }
2251  
2252  static const struct seq_operations extfrag_sops = {
2253  	.start	= frag_start,
2254  	.next	= frag_next,
2255  	.stop	= frag_stop,
2256  	.show	= extfrag_show,
2257  };
2258  
2259  DEFINE_SEQ_ATTRIBUTE(extfrag);
2260  
2261  static int __init extfrag_debug_init(void)
2262  {
2263  	struct dentry *extfrag_debug_root;
2264  
2265  	extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
2266  
2267  	debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
2268  			    &unusable_fops);
2269  
2270  	debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
2271  			    &extfrag_fops);
2272  
2273  	return 0;
2274  }
2275  
2276  module_init(extfrag_debug_init);
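/*
 * Editorial illustration (not part of this file): with debugfs mounted at
 * /sys/kernel/debug (an assumption about the mount point), the two files
 * created above can be read directly.  Each line has the format produced by
 * unusable_show_print() / extfrag_show_print(), e.g.
 * "Node 0, zone   Normal 0.000 0.013 ...", one value per order.  A trivial
 * userspace dumper:
 */
#if 0	/* userspace example, not kernel code */
#include <stdio.h>

static void dump(const char *path)
{
	char line[512];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);	/* not root, or debugfs not mounted? */
		return;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
}

int main(void)
{
	dump("/sys/kernel/debug/extfrag/unusable_index");
	dump("/sys/kernel/debug/extfrag/extfrag_index");
	return 0;
}
#endif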
2277  #endif
2278