xref: /linux/tools/mm/page-types.c (revision 954ea91fb68b771dba6d87cfa61b68e09cc2497f)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   * page-types: Tool for querying page flags
4   *
5   * Copyright (C) 2009 Intel corporation
6   *
7   * Authors: Wu Fengguang <fengguang.wu@intel.com>
8   */
9  
10  #define _FILE_OFFSET_BITS 64
11  #define _GNU_SOURCE
12  #include <stdio.h>
13  #include <stdlib.h>
14  #include <unistd.h>
15  #include <stdint.h>
16  #include <stdarg.h>
17  #include <string.h>
18  #include <getopt.h>
19  #include <limits.h>
20  #include <assert.h>
21  #include <ftw.h>
22  #include <time.h>
23  #include <setjmp.h>
24  #include <signal.h>
25  #include <sys/types.h>
26  #include <sys/errno.h>
27  #include <sys/fcntl.h>
28  #include <sys/mount.h>
29  #include <sys/statfs.h>
30  #include <sys/mman.h>
31  #include "../../include/uapi/linux/magic.h"
32  #include "../../include/uapi/linux/kernel-page-flags.h"
33  #include <api/fs/fs.h>
34  
/* Fallback length used for path buffers when no header provides MAX_PATH. */
#ifndef MAX_PATH
# define MAX_PATH 256
#endif

/*
 * STR(x) macro-expands x and then turns the result into a string literal;
 * _STR(x) stringifies its argument exactly as written.
 */
#ifndef STR
# define _STR(x) #x
# define STR(x) _STR(x)
#endif
43  
/*
 * pagemap kernel ABI bits
 *
 * Constants used by this tool to decode one 64-bit pagemap entry.
 */

#define PM_ENTRY_BYTES		8	/* bytes per pagemap entry */
#define PM_PFRAME_BITS		55	/* width of the low frame/swap field */
#define PM_PFRAME_MASK		((1LL << PM_PFRAME_BITS) - 1)
#define PM_PFRAME(x)		((x) & PM_PFRAME_MASK)	/* low 55 bits of x */
#define MAX_SWAPFILES_SHIFT	5	/* low bits dropped by PM_SWAP_OFFSET() */
#define PM_SWAP_OFFSET(x)	(((x) & PM_PFRAME_MASK) >> MAX_SWAPFILES_SHIFT)
#define PM_SOFT_DIRTY		(1ULL << 55)	/* reported as KPF_SOFTDIRTY */
#define PM_MMAP_EXCLUSIVE	(1ULL << 56)	/* reported as KPF_MMAP_EXCLUSIVE */
#define PM_FILE			(1ULL << 61)	/* reported as KPF_FILE */
#define PM_SWAP			(1ULL << 62)	/* entry describes a swap slot */
#define PM_PRESENT		(1ULL << 63)	/* low field holds a page frame number */
59  
/*
 * kernel page flags
 *
 * Bit numbers below are indices into a 64-bit flags word; BIT(name) turns
 * a KPF_<name> index into its mask.
 */

#define KPF_BYTES		8	/* bytes per kpageflags entry */
#define PROC_KPAGEFLAGS		"/proc/kpageflags"
#define PROC_KPAGECOUNT		"/proc/kpagecount"
#define PROC_KPAGECGROUP	"/proc/kpagecgroup"

#define SYS_KERNEL_MM_PAGE_IDLE "/sys/kernel/mm/page_idle/bitmap"

/* [32-] flags meant for kernel hackers; masked out unless raw mode is on */
#define KPF_RESERVED		32
#define KPF_MLOCKED		33
#define KPF_MAPPEDTODISK	34
#define KPF_PRIVATE		35
#define KPF_PRIVATE_2		36
#define KPF_OWNER_PRIVATE	37
#define KPF_ARCH		38
#define KPF_UNCACHED		39
#define KPF_SOFTDIRTY		40
#define KPF_ARCH_2		41

/* [47-] take some arbitrary free slots for expanding overloaded flags;
 * these bit numbers are private to this tool and not part of the kernel API
 */
#define KPF_ANON_EXCLUSIVE	47
#define KPF_READAHEAD		48
#define KPF_SLOB_FREE		49
#define KPF_SLUB_FROZEN		50
#define KPF_SLUB_DEBUG		51
#define KPF_FILE		61
#define KPF_SWAP		62
#define KPF_MMAP_EXCLUSIVE	63

#define KPF_ALL_BITS		((uint64_t)~0ULL)	/* every bit set */
#define KPF_HACKERS_BITS	(0xffffULL << 32)	/* bits 32..47 */
#define KPF_OVERLOADED_BITS	(0xffffULL << 48)	/* bits 48..63 */
#define BIT(name)		(1ULL << KPF_##name)	/* mask for KPF_<name> */
#define BITS_COMPOUND		(BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL))
100  
/*
 * Display names, indexed by flag bit number.
 *
 * Each entry has the form "<c>:<name>": page_flag_name() prints the single
 * character at index 0, while page_flag_longname(), usage() and
 * parse_flag_name() use the text starting at index 2.  Bit numbers with no
 * entry are left NULL by the designated initializers.
 */
static const char * const page_flag_names[] = {
	[KPF_LOCKED]		= "L:locked",
	[KPF_ERROR]		= "E:error",
	[KPF_REFERENCED]	= "R:referenced",
	[KPF_UPTODATE]		= "U:uptodate",
	[KPF_DIRTY]		= "D:dirty",
	[KPF_LRU]		= "l:lru",
	[KPF_ACTIVE]		= "A:active",
	[KPF_SLAB]		= "S:slab",
	[KPF_WRITEBACK]		= "W:writeback",
	[KPF_RECLAIM]		= "I:reclaim",
	[KPF_BUDDY]		= "B:buddy",

	[KPF_MMAP]		= "M:mmap",
	[KPF_ANON]		= "a:anonymous",
	[KPF_SWAPCACHE]		= "s:swapcache",
	[KPF_SWAPBACKED]	= "b:swapbacked",
	[KPF_COMPOUND_HEAD]	= "H:compound_head",
	[KPF_COMPOUND_TAIL]	= "T:compound_tail",
	[KPF_HUGE]		= "G:huge",
	[KPF_UNEVICTABLE]	= "u:unevictable",
	[KPF_HWPOISON]		= "X:hwpoison",
	[KPF_NOPAGE]		= "n:nopage",
	[KPF_KSM]		= "x:ksm",
	[KPF_THP]		= "t:thp",
	[KPF_OFFLINE]		= "o:offline",
	[KPF_PGTABLE]		= "g:pgtable",
	[KPF_ZERO_PAGE]		= "z:zero_page",
	[KPF_IDLE]              = "i:idle_page",

	/* kernel-hacker flags (bits 32 and up) */
	[KPF_RESERVED]		= "r:reserved",
	[KPF_MLOCKED]		= "m:mlocked",
	[KPF_MAPPEDTODISK]	= "d:mappedtodisk",
	[KPF_PRIVATE]		= "P:private",
	[KPF_PRIVATE_2]		= "p:private_2",
	[KPF_OWNER_PRIVATE]	= "O:owner_private",
	[KPF_ARCH]		= "h:arch",
	[KPF_UNCACHED]		= "c:uncached",
	[KPF_SOFTDIRTY]		= "f:softdirty",
	[KPF_ARCH_2]		= "H:arch_2",

	/*
	 * Expanded overloaded flags; each one carries the same letter as the
	 * flag it is split out of in expand_overloaded_flags().
	 */
	[KPF_ANON_EXCLUSIVE]	= "d:anon_exclusive",
	[KPF_READAHEAD]		= "I:readahead",
	[KPF_SLOB_FREE]		= "P:slob_free",
	[KPF_SLUB_FROZEN]	= "A:slub_frozen",
	[KPF_SLUB_DEBUG]	= "E:slub_debug",

	/* flags derived from the pagemap entry rather than kpageflags */
	[KPF_FILE]		= "F:file",
	[KPF_SWAP]		= "w:swap",
	[KPF_MMAP_EXCLUSIVE]	= "1:mmap_exclusive",
};
152  
153  
154  /*
155   * data structures
156   */
157  
/* Command-line selections; zero/NULL means "not requested". */
static int		opt_raw;	/* for kernel developers */
static int		opt_list;	/* list pages (in ranges) */
static int		opt_mark_idle;	/* set accessed bit */
static int		opt_no_summary;	/* don't show summary */
static pid_t		opt_pid;	/* process to walk */
const char		*opt_file;	/* file or directory path */
static uint64_t		opt_cgroup;	/* cgroup inode */
static int		opt_list_cgroup;/* list page cgroup */
static int		opt_list_mapcnt;/* list page map count */
static const char	*opt_kpageflags;/* kpageflags file to parse */

/* Page ranges to walk: opt_offset[i] is the first page, opt_size[i] the count. */
#define MAX_ADDR_RANGES	1024
static int		nr_addr_ranges;
static unsigned long	opt_offset[MAX_ADDR_RANGES];
static unsigned long	opt_size[MAX_ADDR_RANGES];

/* VMAs of the walked process in page units, filled in by parse_pid(). */
#define MAX_VMAS	10240
static int		nr_vmas;
static unsigned long	pg_start[MAX_VMAS];
static unsigned long	pg_end[MAX_VMAS];

/* Flag filters checked by bit_mask_ok(); a page must pass every one. */
#define MAX_BIT_FILTERS	64
static int		nr_bit_filters;
static uint64_t		opt_mask[MAX_BIT_FILTERS];
static uint64_t		opt_bits[MAX_BIT_FILTERS];

static int		page_size;

/* File descriptors; the ones initialised to -1 are optional data sources. */
static int		pagemap_fd;
static int		kpageflags_fd;
static int		kpagecount_fd = -1;
static int		kpagecgroup_fd = -1;
static int		page_idle_fd = -1;

static int		opt_hwpoison;
static int		opt_unpoison;

static const char	*hwpoison_debug_fs;
static int		hwpoison_inject_fd;
static int		hwpoison_forget_fd;

/*
 * Per-flag-combination page counters: hash_slot() maps a flags word to an
 * index shared by nr_pages[] (count) and page_flags[] (the flags stored in
 * that slot).
 */
#define HASH_SHIFT	13
#define HASH_SIZE	(1 << HASH_SHIFT)
#define HASH_MASK	(HASH_SIZE - 1)
#define HASH_KEY(flags)	(flags & HASH_MASK)

static unsigned long	total_pages;
static unsigned long	nr_pages[HASH_SIZE];
static uint64_t		page_flags[HASH_SIZE];
207  
208  
209  /*
210   * helper functions
211   */
212  
/* Element count of a real array (meaningless when applied to a pointer). */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

/*
 * Typed minimum: both operands are converted to 'type' and evaluated
 * exactly once before the comparison.
 */
#define min_t(type, x, y) ({			\
	type min_lhs_ = (x);			\
	type min_rhs_ = (y);			\
	min_rhs_ > min_lhs_ ? min_lhs_ : min_rhs_; })

/* Typed maximum, with the same single-evaluation guarantee as min_t(). */
#define max_t(type, x, y) ({			\
	type max_lhs_ = (x);			\
	type max_rhs_ = (y);			\
	max_rhs_ < max_lhs_ ? max_lhs_ : max_rhs_; })
224  
225  static unsigned long pages2mb(unsigned long pages)
226  {
227  	return (pages * page_size) >> 20;
228  }
229  
/*
 * Print a printf-style message to stderr and terminate the process with
 * EXIT_FAILURE.  Never returns.
 */
static void fatal(const char *x, ...)
{
	va_list args;

	va_start(args, x);
	(void)vfprintf(stderr, x, args);
	va_end(args);

	exit(EXIT_FAILURE);
}
239  
/*
 * open(2) wrapper that never returns an error: on failure it reports
 * pathname through perror() and exits with EXIT_FAILURE.
 */
static int checked_open(const char *pathname, int flags)
{
	const int fd = open(pathname, flags);

	if (fd >= 0)
		return fd;

	perror(pathname);
	exit(EXIT_FAILURE);
}
251  
252  /*
253   * pagemap/kpageflags routines
254   */
255  
256  static unsigned long do_u64_read(int fd, const char *name,
257  				 uint64_t *buf,
258  				 unsigned long index,
259  				 unsigned long count)
260  {
261  	long bytes;
262  
263  	if (index > ULONG_MAX / 8)
264  		fatal("index overflow: %lu\n", index);
265  
266  	bytes = pread(fd, buf, count * 8, (off_t)index * 8);
267  	if (bytes < 0) {
268  		perror(name);
269  		exit(EXIT_FAILURE);
270  	}
271  	if (bytes % 8)
272  		fatal("partial read: %lu bytes\n", bytes);
273  
274  	return bytes / 8;
275  }
276  
277  static unsigned long kpageflags_read(uint64_t *buf,
278  				     unsigned long index,
279  				     unsigned long pages)
280  {
281  	return do_u64_read(kpageflags_fd, opt_kpageflags, buf, index, pages);
282  }
283  
284  static unsigned long kpagecgroup_read(uint64_t *buf,
285  				      unsigned long index,
286  				      unsigned long pages)
287  {
288  	if (kpagecgroup_fd < 0)
289  		return pages;
290  
291  	return do_u64_read(kpagecgroup_fd, opt_kpageflags, buf, index, pages);
292  }
293  
294  static unsigned long kpagecount_read(uint64_t *buf,
295  				     unsigned long index,
296  				     unsigned long pages)
297  {
298  	return kpagecount_fd < 0 ? pages :
299  		do_u64_read(kpagecount_fd, PROC_KPAGECOUNT,
300  			    buf, index, pages);
301  }
302  
303  static unsigned long pagemap_read(uint64_t *buf,
304  				  unsigned long index,
305  				  unsigned long pages)
306  {
307  	return do_u64_read(pagemap_fd, "/proc/pid/pagemap", buf, index, pages);
308  }
309  
310  static unsigned long pagemap_pfn(uint64_t val)
311  {
312  	unsigned long pfn;
313  
314  	if (val & PM_PRESENT)
315  		pfn = PM_PFRAME(val);
316  	else
317  		pfn = 0;
318  
319  	return pfn;
320  }
321  
322  static unsigned long pagemap_swap_offset(uint64_t val)
323  {
324  	return val & PM_SWAP ? PM_SWAP_OFFSET(val) : 0;
325  }
326  
327  /*
328   * page flag names
329   */
330  
331  static char *page_flag_name(uint64_t flags)
332  {
333  	static char buf[65];
334  	int present;
335  	size_t i, j;
336  
337  	for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
338  		present = (flags >> i) & 1;
339  		if (!page_flag_names[i]) {
340  			if (present)
341  				fatal("unknown flag bit %d\n", i);
342  			continue;
343  		}
344  		buf[j++] = present ? page_flag_names[i][0] : '_';
345  	}
346  
347  	return buf;
348  }
349  
350  static char *page_flag_longname(uint64_t flags)
351  {
352  	static char buf[1024];
353  	size_t i, n;
354  
355  	for (i = 0, n = 0; i < ARRAY_SIZE(page_flag_names); i++) {
356  		if (!page_flag_names[i])
357  			continue;
358  		if ((flags >> i) & 1)
359  			n += snprintf(buf + n, sizeof(buf) - n, "%s,",
360  					page_flag_names[i] + 2);
361  	}
362  	if (n)
363  		n--;
364  	buf[n] = '\0';
365  
366  	return buf;
367  }
368  
369  
370  /*
371   * page list and summary
372   */
373  
374  static void show_page_range(unsigned long voffset, unsigned long offset,
375  			    unsigned long size, uint64_t flags,
376  			    uint64_t cgroup, uint64_t mapcnt)
377  {
378  	static uint64_t      flags0;
379  	static uint64_t	     cgroup0;
380  	static uint64_t      mapcnt0;
381  	static unsigned long voff;
382  	static unsigned long index;
383  	static unsigned long count;
384  
385  	if (flags == flags0 && cgroup == cgroup0 && mapcnt == mapcnt0 &&
386  	    offset == index + count && size && voffset == voff + count) {
387  		count += size;
388  		return;
389  	}
390  
391  	if (count) {
392  		if (opt_pid)
393  			printf("%lx\t", voff);
394  		if (opt_file)
395  			printf("%lx\t", voff);
396  		if (opt_list_cgroup)
397  			printf("@%llu\t", (unsigned long long)cgroup0);
398  		if (opt_list_mapcnt)
399  			printf("%lu\t", mapcnt0);
400  		printf("%lx\t%lx\t%s\n",
401  				index, count, page_flag_name(flags0));
402  	}
403  
404  	flags0 = flags;
405  	cgroup0 = cgroup;
406  	mapcnt0 = mapcnt;
407  	index  = offset;
408  	voff   = voffset;
409  	count  = size;
410  }
411  
/*
 * Print the range still pending in show_page_range(), if any, by feeding it
 * an empty (size 0) range that can never be merged.
 */
static void flush_page_range(void)
{
	const unsigned long none = 0;

	show_page_range(none, none, none, 0, 0, 0);
}
416  
417  static void show_page(unsigned long voffset, unsigned long offset,
418  		      uint64_t flags, uint64_t cgroup, uint64_t mapcnt)
419  {
420  	if (opt_pid)
421  		printf("%lx\t", voffset);
422  	if (opt_file)
423  		printf("%lx\t", voffset);
424  	if (opt_list_cgroup)
425  		printf("@%llu\t", (unsigned long long)cgroup);
426  	if (opt_list_mapcnt)
427  		printf("%lu\t", mapcnt);
428  
429  	printf("%lx\t%s\n", offset, page_flag_name(flags));
430  }
431  
432  static void show_summary(void)
433  {
434  	size_t i;
435  
436  	printf("             flags\tpage-count       MB"
437  		"  symbolic-flags\t\t\tlong-symbolic-flags\n");
438  
439  	for (i = 0; i < ARRAY_SIZE(nr_pages); i++) {
440  		if (nr_pages[i])
441  			printf("0x%016llx\t%10lu %8lu  %s\t%s\n",
442  				(unsigned long long)page_flags[i],
443  				nr_pages[i],
444  				pages2mb(nr_pages[i]),
445  				page_flag_name(page_flags[i]),
446  				page_flag_longname(page_flags[i]));
447  	}
448  
449  	printf("             total\t%10lu %8lu\n",
450  			total_pages, pages2mb(total_pages));
451  }
452  
453  
454  /*
455   * page flag filters
456   */
457  
458  static int bit_mask_ok(uint64_t flags)
459  {
460  	int i;
461  
462  	for (i = 0; i < nr_bit_filters; i++) {
463  		if (opt_bits[i] == KPF_ALL_BITS) {
464  			if ((flags & opt_mask[i]) == 0)
465  				return 0;
466  		} else {
467  			if ((flags & opt_mask[i]) != opt_bits[i])
468  				return 0;
469  		}
470  	}
471  
472  	return 1;
473  }
474  
475  static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme)
476  {
477  	/* Anonymous pages overload PG_mappedtodisk */
478  	if ((flags & BIT(ANON)) && (flags & BIT(MAPPEDTODISK)))
479  		flags ^= BIT(MAPPEDTODISK) | BIT(ANON_EXCLUSIVE);
480  
481  	/* SLOB/SLUB overload several page flags */
482  	if (flags & BIT(SLAB)) {
483  		if (flags & BIT(PRIVATE))
484  			flags ^= BIT(PRIVATE) | BIT(SLOB_FREE);
485  		if (flags & BIT(ACTIVE))
486  			flags ^= BIT(ACTIVE) | BIT(SLUB_FROZEN);
487  		if (flags & BIT(ERROR))
488  			flags ^= BIT(ERROR) | BIT(SLUB_DEBUG);
489  	}
490  
491  	/* PG_reclaim is overloaded as PG_readahead in the read path */
492  	if ((flags & (BIT(RECLAIM) | BIT(WRITEBACK))) == BIT(RECLAIM))
493  		flags ^= BIT(RECLAIM) | BIT(READAHEAD);
494  
495  	if (pme & PM_SOFT_DIRTY)
496  		flags |= BIT(SOFTDIRTY);
497  	if (pme & PM_FILE)
498  		flags |= BIT(FILE);
499  	if (pme & PM_SWAP)
500  		flags |= BIT(SWAP);
501  	if (pme & PM_MMAP_EXCLUSIVE)
502  		flags |= BIT(MMAP_EXCLUSIVE);
503  
504  	return flags;
505  }
506  
507  static uint64_t well_known_flags(uint64_t flags)
508  {
509  	/* hide flags intended only for kernel hacker */
510  	flags &= ~KPF_HACKERS_BITS;
511  
512  	/* hide non-hugeTLB compound pages */
513  	if ((flags & BITS_COMPOUND) && !(flags & BIT(HUGE)))
514  		flags &= ~BITS_COMPOUND;
515  
516  	return flags;
517  }
518  
519  static uint64_t kpageflags_flags(uint64_t flags, uint64_t pme)
520  {
521  	if (opt_raw)
522  		flags = expand_overloaded_flags(flags, pme);
523  	else
524  		flags = well_known_flags(flags);
525  
526  	return flags;
527  }
528  
529  /*
530   * page actions
531   */
532  
533  static void prepare_hwpoison_fd(void)
534  {
535  	char buf[MAX_PATH + 1];
536  
537  	hwpoison_debug_fs = debugfs__mount();
538  	if (!hwpoison_debug_fs) {
539  		perror("mount debugfs");
540  		exit(EXIT_FAILURE);
541  	}
542  
543  	if (opt_hwpoison && !hwpoison_inject_fd) {
544  		snprintf(buf, MAX_PATH, "%s/hwpoison/corrupt-pfn",
545  			hwpoison_debug_fs);
546  		hwpoison_inject_fd = checked_open(buf, O_WRONLY);
547  	}
548  
549  	if (opt_unpoison && !hwpoison_forget_fd) {
550  		snprintf(buf, MAX_PATH, "%s/hwpoison/unpoison-pfn",
551  			hwpoison_debug_fs);
552  		hwpoison_forget_fd = checked_open(buf, O_WRONLY);
553  	}
554  }
555  
556  static int hwpoison_page(unsigned long offset)
557  {
558  	char buf[100];
559  	int len;
560  
561  	len = sprintf(buf, "0x%lx\n", offset);
562  	len = write(hwpoison_inject_fd, buf, len);
563  	if (len < 0) {
564  		perror("hwpoison inject");
565  		return len;
566  	}
567  	return 0;
568  }
569  
570  static int unpoison_page(unsigned long offset)
571  {
572  	char buf[100];
573  	int len;
574  
575  	len = sprintf(buf, "0x%lx\n", offset);
576  	len = write(hwpoison_forget_fd, buf, len);
577  	if (len < 0) {
578  		perror("hwpoison forget");
579  		return len;
580  	}
581  	return 0;
582  }
583  
584  static int mark_page_idle(unsigned long offset)
585  {
586  	static unsigned long off;
587  	static uint64_t buf;
588  	int len;
589  
590  	if ((offset / 64 == off / 64) || buf == 0) {
591  		buf |= 1UL << (offset % 64);
592  		off = offset;
593  		return 0;
594  	}
595  
596  	len = pwrite(page_idle_fd, &buf, 8, 8 * (off / 64));
597  	if (len < 0) {
598  		perror("mark page idle");
599  		return len;
600  	}
601  
602  	buf = 1UL << (offset % 64);
603  	off = offset;
604  
605  	return 0;
606  }
607  
608  /*
609   * page frame walker
610   */
611  
612  static size_t hash_slot(uint64_t flags)
613  {
614  	size_t k = HASH_KEY(flags);
615  	size_t i;
616  
617  	/* Explicitly reserve slot 0 for flags 0: the following logic
618  	 * cannot distinguish an unoccupied slot from slot (flags==0).
619  	 */
620  	if (flags == 0)
621  		return 0;
622  
623  	/* search through the remaining (HASH_SIZE-1) slots */
624  	for (i = 1; i < ARRAY_SIZE(page_flags); i++, k++) {
625  		if (!k || k >= ARRAY_SIZE(page_flags))
626  			k = 1;
627  		if (page_flags[k] == 0) {
628  			page_flags[k] = flags;
629  			return k;
630  		}
631  		if (page_flags[k] == flags)
632  			return k;
633  	}
634  
635  	fatal("hash table full: bump up HASH_SHIFT?\n");
636  	exit(EXIT_FAILURE);
637  }
638  
639  static void add_page(unsigned long voffset, unsigned long offset,
640  		     uint64_t flags, uint64_t cgroup, uint64_t mapcnt,
641  		     uint64_t pme)
642  {
643  	flags = kpageflags_flags(flags, pme);
644  
645  	if (!bit_mask_ok(flags))
646  		return;
647  
648  	if (opt_cgroup && cgroup != (uint64_t)opt_cgroup)
649  		return;
650  
651  	if (opt_hwpoison)
652  		hwpoison_page(offset);
653  	if (opt_unpoison)
654  		unpoison_page(offset);
655  
656  	if (opt_mark_idle)
657  		mark_page_idle(offset);
658  
659  	if (opt_list == 1)
660  		show_page_range(voffset, offset, 1, flags, cgroup, mapcnt);
661  	else if (opt_list == 2)
662  		show_page(voffset, offset, flags, cgroup, mapcnt);
663  
664  	nr_pages[hash_slot(flags)]++;
665  	total_pages++;
666  }
667  
668  #define KPAGEFLAGS_BATCH	(64 << 10)	/* 64k pages */
669  static void walk_pfn(unsigned long voffset,
670  		     unsigned long index,
671  		     unsigned long count,
672  		     uint64_t pme)
673  {
674  	uint64_t buf[KPAGEFLAGS_BATCH];
675  	uint64_t cgi[KPAGEFLAGS_BATCH];
676  	uint64_t cnt[KPAGEFLAGS_BATCH];
677  	unsigned long batch;
678  	unsigned long pages;
679  	unsigned long i;
680  
681  	/*
682  	 * kpagecgroup_read() reads only if kpagecgroup were opened, but
683  	 * /proc/kpagecgroup might even not exist, so it's better to fill
684  	 * them with zeros here.
685  	 */
686  	if (count == 1)
687  		cgi[0] = 0;
688  	else
689  		memset(cgi, 0, sizeof cgi);
690  
691  	while (count) {
692  		batch = min_t(unsigned long, count, KPAGEFLAGS_BATCH);
693  		pages = kpageflags_read(buf, index, batch);
694  		if (pages == 0)
695  			break;
696  
697  		if (kpagecgroup_read(cgi, index, pages) != pages)
698  			fatal("kpagecgroup returned fewer pages than expected");
699  
700  		if (kpagecount_read(cnt, index, pages) != pages)
701  			fatal("kpagecount returned fewer pages than expected");
702  
703  		for (i = 0; i < pages; i++)
704  			add_page(voffset + i, index + i,
705  				 buf[i], cgi[i], cnt[i], pme);
706  
707  		index += pages;
708  		count -= pages;
709  	}
710  }
711  
712  static void walk_swap(unsigned long voffset, uint64_t pme)
713  {
714  	uint64_t flags = kpageflags_flags(0, pme);
715  
716  	if (!bit_mask_ok(flags))
717  		return;
718  
719  	if (opt_cgroup)
720  		return;
721  
722  	if (opt_list == 1)
723  		show_page_range(voffset, pagemap_swap_offset(pme),
724  				1, flags, 0, 0);
725  	else if (opt_list == 2)
726  		show_page(voffset, pagemap_swap_offset(pme), flags, 0, 0);
727  
728  	nr_pages[hash_slot(flags)]++;
729  	total_pages++;
730  }
731  
732  #define PAGEMAP_BATCH	(64 << 10)
733  static void walk_vma(unsigned long index, unsigned long count)
734  {
735  	uint64_t buf[PAGEMAP_BATCH];
736  	unsigned long batch;
737  	unsigned long pages;
738  	unsigned long pfn;
739  	unsigned long i;
740  
741  	while (count) {
742  		batch = min_t(unsigned long, count, PAGEMAP_BATCH);
743  		pages = pagemap_read(buf, index, batch);
744  		if (pages == 0)
745  			break;
746  
747  		for (i = 0; i < pages; i++) {
748  			pfn = pagemap_pfn(buf[i]);
749  			if (pfn)
750  				walk_pfn(index + i, pfn, 1, buf[i]);
751  			if (buf[i] & PM_SWAP)
752  				walk_swap(index + i, buf[i]);
753  		}
754  
755  		index += pages;
756  		count -= pages;
757  	}
758  }
759  
760  static void walk_task(unsigned long index, unsigned long count)
761  {
762  	const unsigned long end = index + count;
763  	unsigned long start;
764  	int i = 0;
765  
766  	while (index < end) {
767  
768  		while (pg_end[i] <= index)
769  			if (++i >= nr_vmas)
770  				return;
771  		if (pg_start[i] >= end)
772  			return;
773  
774  		start = max_t(unsigned long, pg_start[i], index);
775  		index = min_t(unsigned long, pg_end[i], end);
776  
777  		assert(start < index);
778  		walk_vma(start, index - start);
779  	}
780  }
781  
782  static void add_addr_range(unsigned long offset, unsigned long size)
783  {
784  	if (nr_addr_ranges >= MAX_ADDR_RANGES)
785  		fatal("too many addr ranges\n");
786  
787  	opt_offset[nr_addr_ranges] = offset;
788  	opt_size[nr_addr_ranges] = min_t(unsigned long, size, ULONG_MAX-offset);
789  	nr_addr_ranges++;
790  }
791  
792  static void walk_addr_ranges(void)
793  {
794  	int i;
795  
796  	kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY);
797  
798  	if (!nr_addr_ranges)
799  		add_addr_range(0, ULONG_MAX);
800  
801  	for (i = 0; i < nr_addr_ranges; i++)
802  		if (!opt_pid)
803  			walk_pfn(opt_offset[i], opt_offset[i], opt_size[i], 0);
804  		else
805  			walk_task(opt_offset[i], opt_size[i]);
806  
807  	if (opt_mark_idle)
808  		mark_page_idle(0);
809  
810  	close(kpageflags_fd);
811  }
812  
813  
814  /*
815   * user interface
816   */
817  
818  static const char *page_flag_type(uint64_t flag)
819  {
820  	if (flag & KPF_HACKERS_BITS)
821  		return "(r)";
822  	if (flag & KPF_OVERLOADED_BITS)
823  		return "(o)";
824  	return "   ";
825  }
826  
/*
 * Print the help text to stdout: the option summary and argument syntax,
 * followed by a table of all known flag names, four per line, each tagged
 * by page_flag_type().
 */
static void usage(void)
{
	size_t i, j;

	printf(
"page-types [options]\n"
"            -r|--raw                   Raw mode, for kernel developers\n"
"            -d|--describe flags        Describe flags\n"
"            -a|--addr    addr-spec     Walk a range of pages\n"
"            -b|--bits    bits-spec     Walk pages with specified bits\n"
"            -c|--cgroup  path|@inode   Walk pages within memory cgroup\n"
"            -p|--pid     pid           Walk process address space\n"
"            -f|--file    filename      Walk file address space\n"
"            -i|--mark-idle             Mark pages idle\n"
"            -l|--list                  Show page details in ranges\n"
"            -L|--list-each             Show page details one by one\n"
"            -C|--list-cgroup           Show cgroup inode for pages\n"
"            -M|--list-mapcnt           Show page map count\n"
"            -N|--no-summary            Don't show summary info\n"
"            -X|--hwpoison              hwpoison pages\n"
"            -x|--unpoison              unpoison pages\n"
"            -F|--kpageflags filename   kpageflags file to parse\n"
"            -h|--help                  Show this usage message\n"
"flags:\n"
"            0x10                       bitfield format, e.g.\n"
"            anon                       bit-name, e.g.\n"
"            0x10,anon                  comma-separated list, e.g.\n"
"addr-spec:\n"
"            N                          one page at offset N (unit: pages)\n"
"            N+M                        pages range from N to N+M-1\n"
"            N,M                        pages range from N to M-1\n"
"            N,                         pages range from N to end\n"
"            ,M                         pages range from 0 to M-1\n"
"bits-spec:\n"
"            bit1,bit2                  (flags & (bit1|bit2)) != 0\n"
"            bit1,bit2=bit1             (flags & (bit1|bit2)) == bit1\n"
"            bit1,~bit2                 (flags & (bit1|bit2)) == bit1\n"
"            =bit1,bit2                 flags == (bit1|bit2)\n"
"bit-names:\n"
	);

	/* i walks the name table, j counts the entries on the current line */
	for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
		if (!page_flag_names[i])
			continue;
		/* + 2 skips the "<c>:" prefix of the table entry */
		printf("%16s%s", page_flag_names[i] + 2,
				 page_flag_type(1ULL << i));
		/* break the line after every fourth name */
		if (++j > 3) {
			j = 0;
			putchar('\n');
		}
	}
	printf("\n                                   "
		"(r) raw mode bits  (o) overloaded bits\n");
}
881  
/*
 * Parse a number in C notation (decimal, 0x-prefixed hex or 0-prefixed
 * octal) from the start of 'str'; trailing characters are ignored.
 *
 * The unsigned conversion is used so that values with bit 63 set, such as
 * a mask for the top flag bit, are not clamped to LLONG_MAX.
 *
 * Exits via fatal() when the result is 0 and the text does not start with
 * the digit '0', i.e. when no number could be read.
 */
static unsigned long long parse_number(const char *str)
{
	unsigned long long n;

	n = strtoull(str, NULL, 0);

	if (n == 0 && str[0] != '0')
		fatal("invalid name or number: %s\n", str);

	return n;
}
893  
894  static void parse_pid(const char *str)
895  {
896  	FILE *file;
897  	char buf[5000];
898  
899  	opt_pid = parse_number(str);
900  
901  	sprintf(buf, "/proc/%d/pagemap", opt_pid);
902  	pagemap_fd = checked_open(buf, O_RDONLY);
903  
904  	sprintf(buf, "/proc/%d/maps", opt_pid);
905  	file = fopen(buf, "r");
906  	if (!file) {
907  		perror(buf);
908  		exit(EXIT_FAILURE);
909  	}
910  
911  	while (fgets(buf, sizeof(buf), file) != NULL) {
912  		unsigned long vm_start;
913  		unsigned long vm_end;
914  		unsigned long long pgoff;
915  		int major, minor;
916  		char r, w, x, s;
917  		unsigned long ino;
918  		int n;
919  
920  		n = sscanf(buf, "%lx-%lx %c%c%c%c %llx %x:%x %lu",
921  			   &vm_start,
922  			   &vm_end,
923  			   &r, &w, &x, &s,
924  			   &pgoff,
925  			   &major, &minor,
926  			   &ino);
927  		if (n < 10) {
928  			fprintf(stderr, "unexpected line: %s\n", buf);
929  			continue;
930  		}
931  		pg_start[nr_vmas] = vm_start / page_size;
932  		pg_end[nr_vmas] = vm_end / page_size;
933  		if (++nr_vmas >= MAX_VMAS) {
934  			fprintf(stderr, "too many VMAs\n");
935  			break;
936  		}
937  	}
938  	fclose(file);
939  }
940  
941  static void show_file(const char *name, const struct stat *st)
942  {
943  	unsigned long long size = st->st_size;
944  	char atime[64], mtime[64];
945  	long now = time(NULL);
946  
947  	printf("%s\tInode: %u\tSize: %llu (%llu pages)\n",
948  			name, (unsigned)st->st_ino,
949  			size, (size + page_size - 1) / page_size);
950  
951  	strftime(atime, sizeof(atime), "%c", localtime(&st->st_atime));
952  	strftime(mtime, sizeof(mtime), "%c", localtime(&st->st_mtime));
953  
954  	printf("Modify: %s (%ld seconds ago)\nAccess: %s (%ld seconds ago)\n",
955  			mtime, now - st->st_mtime,
956  			atime, now - st->st_atime);
957  }
958  
959  static sigjmp_buf sigbus_jmp;
960  
961  static void * volatile sigbus_addr;
962  
963  static void sigbus_handler(int sig, siginfo_t *info, void *ucontex)
964  {
965  	(void)sig;
966  	(void)ucontex;
967  	sigbus_addr = info ? info->si_addr : NULL;
968  	siglongjmp(sigbus_jmp, 1);
969  }
970  
971  static struct sigaction sigbus_action = {
972  	.sa_sigaction = sigbus_handler,
973  	.sa_flags = SA_SIGINFO,
974  };
975  
976  static void walk_file_range(const char *name, int fd,
977  			    unsigned long off, unsigned long end)
978  {
979  	uint8_t vec[PAGEMAP_BATCH];
980  	uint64_t buf[PAGEMAP_BATCH], flags;
981  	uint64_t cgroup = 0;
982  	uint64_t mapcnt = 0;
983  	unsigned long nr_pages, pfn, i;
984  	ssize_t len;
985  	void *ptr;
986  	int first = 1;
987  
988  	for (; off < end; off += len) {
989  		nr_pages = (end - off + page_size - 1) / page_size;
990  		if (nr_pages > PAGEMAP_BATCH)
991  			nr_pages = PAGEMAP_BATCH;
992  		len = nr_pages * page_size;
993  
994  		ptr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, off);
995  		if (ptr == MAP_FAILED)
996  			fatal("mmap failed: %s", name);
997  
998  		/* determine cached pages */
999  		if (mincore(ptr, len, vec))
1000  			fatal("mincore failed: %s", name);
1001  
1002  		/* turn off readahead */
1003  		if (madvise(ptr, len, MADV_RANDOM))
1004  			fatal("madvice failed: %s", name);
1005  
1006  		if (sigsetjmp(sigbus_jmp, 1)) {
1007  			end = off + sigbus_addr ? sigbus_addr - ptr : 0;
1008  			fprintf(stderr, "got sigbus at offset %lld: %s\n",
1009  					(long long)end, name);
1010  			goto got_sigbus;
1011  		}
1012  
1013  		/* populate ptes */
1014  		for (i = 0; i < nr_pages ; i++) {
1015  			if (vec[i] & 1)
1016  				(void)*(volatile int *)(ptr + i * page_size);
1017  		}
1018  got_sigbus:
1019  
1020  		/* turn off harvesting reference bits */
1021  		if (madvise(ptr, len, MADV_SEQUENTIAL))
1022  			fatal("madvice failed: %s", name);
1023  
1024  		if (pagemap_read(buf, (unsigned long)ptr / page_size,
1025  					nr_pages) != nr_pages)
1026  			fatal("cannot read pagemap");
1027  
1028  		munmap(ptr, len);
1029  
1030  		for (i = 0; i < nr_pages; i++) {
1031  			pfn = pagemap_pfn(buf[i]);
1032  			if (!pfn)
1033  				continue;
1034  			if (!kpageflags_read(&flags, pfn, 1))
1035  				continue;
1036  			if (!kpagecgroup_read(&cgroup, pfn, 1))
1037  				fatal("kpagecgroup_read failed");
1038  			if (!kpagecount_read(&mapcnt, pfn, 1))
1039  				fatal("kpagecount_read failed");
1040  			if (first && opt_list) {
1041  				first = 0;
1042  				flush_page_range();
1043  			}
1044  			add_page(off / page_size + i, pfn,
1045  				 flags, cgroup, mapcnt, buf[i]);
1046  		}
1047  	}
1048  }
1049  
1050  static void walk_file(const char *name, const struct stat *st)
1051  {
1052  	int i;
1053  	int fd;
1054  
1055  	fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW);
1056  
1057  	if (!nr_addr_ranges)
1058  		add_addr_range(0, st->st_size / page_size);
1059  
1060  	for (i = 0; i < nr_addr_ranges; i++)
1061  		walk_file_range(name, fd, opt_offset[i] * page_size,
1062  				(opt_offset[i] + opt_size[i]) * page_size);
1063  
1064  	close(fd);
1065  }
1066  
/*
 * nftw() callback: walk regular files, report unreadable directories on
 * stderr and ignore every other entry type.  Always returns 0 so that the
 * tree walk continues.
 */
int walk_tree(const char *name, const struct stat *st, int type, struct FTW *f)
{
	(void)f;

	if (type == FTW_F) {
		if (S_ISREG(st->st_mode))
			walk_file(name, st);
	} else if (type == FTW_DNR) {
		fprintf(stderr, "cannot read dir: %s\n", name);
	}

	return 0;
}
1081  
1082  struct stat st;
1083  
1084  static void walk_page_cache(void)
1085  {
1086  	kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY);
1087  	pagemap_fd = checked_open("/proc/self/pagemap", O_RDONLY);
1088  	sigaction(SIGBUS, &sigbus_action, NULL);
1089  
1090  	if (stat(opt_file, &st))
1091  		fatal("stat failed: %s\n", opt_file);
1092  
1093  	if (S_ISREG(st.st_mode)) {
1094  		walk_file(opt_file, &st);
1095  	} else if (S_ISDIR(st.st_mode)) {
1096  		/* do not follow symlinks and mountpoints */
1097  		if (nftw(opt_file, walk_tree, 64, FTW_MOUNT | FTW_PHYS) < 0)
1098  			fatal("nftw failed: %s\n", opt_file);
1099  	} else
1100  		fatal("unhandled file type: %s\n", opt_file);
1101  
1102  	close(kpageflags_fd);
1103  	close(pagemap_fd);
1104  	signal(SIGBUS, SIG_DFL);
1105  }
1106  
1107  static void parse_file(const char *name)
1108  {
1109  	opt_file = name;
1110  }
1111  
1112  static void parse_cgroup(const char *path)
1113  {
1114  	if (path[0] == '@') {
1115  		opt_cgroup = parse_number(path + 1);
1116  		return;
1117  	}
1118  
1119  	struct stat st;
1120  
1121  	if (stat(path, &st))
1122  		fatal("stat failed: %s: %m\n", path);
1123  
1124  	if (!S_ISDIR(st.st_mode))
1125  		fatal("cgroup supposed to be a directory: %s\n", path);
1126  
1127  	opt_cgroup = st.st_ino;
1128  }
1129  
/*
 * Parse one addr-spec and register it with add_addr_range():
 *
 *   N     one page at N
 *   N+M   M pages starting at N
 *   N,M   pages N up to, but excluding, M (exits when M < N)
 *   N, or N+   from N to the end
 *   ,M or +M   M pages starting at 0
 *
 * A ',' is looked for first; '+' is only considered when there is none.
 */
static void parse_addr_range(const char *optarg)
{
	unsigned long offset;
	unsigned long size;
	const char *sep;

	sep = strchr(optarg, ',');
	if (sep == NULL)
		sep = strchr(optarg, '+');

	if (sep == NULL) {
		/* bare number: a single page */
		offset = parse_number(optarg);
		size   = 1;
	} else if (sep == optarg) {
		/* nothing before the separator: start at page 0 */
		offset = 0;
		size   = parse_number(sep + 1);
	} else {
		offset = parse_number(optarg);
		if (sep[1] == '\0') {
			/* nothing after the separator: open-ended */
			size = ULONG_MAX;
		} else {
			size = parse_number(sep + 1);
			if (*sep == ',') {
				/* second number is an end, not a length */
				if (size < offset)
					fatal("invalid range: %lu,%lu\n",
							offset, size);
				size -= offset;
			}
		}
	}

	add_addr_range(offset, size);
}
1163  
1164  static void add_bits_filter(uint64_t mask, uint64_t bits)
1165  {
1166  	if (nr_bit_filters >= MAX_BIT_FILTERS)
1167  		fatal("too much bit filters\n");
1168  
1169  	opt_mask[nr_bit_filters] = mask;
1170  	opt_bits[nr_bit_filters] = bits;
1171  	nr_bit_filters++;
1172  }
1173  
1174  static uint64_t parse_flag_name(const char *str, int len)
1175  {
1176  	size_t i;
1177  
1178  	if (!*str || !len)
1179  		return 0;
1180  
1181  	if (len <= 8 && !strncmp(str, "compound", len))
1182  		return BITS_COMPOUND;
1183  
1184  	for (i = 0; i < ARRAY_SIZE(page_flag_names); i++) {
1185  		if (!page_flag_names[i])
1186  			continue;
1187  		if (!strncmp(str, page_flag_names[i] + 2, len))
1188  			return 1ULL << i;
1189  	}
1190  
1191  	return parse_number(str);
1192  }
1193  
/*
 * Parse a comma separated list of flag tokens and OR their bits together.
 *
 * @str: the list; parsing stops at the first '=' or at the end of string
 * @all: how tokens prefixed with '~' are treated:
 *         0 - such tokens are skipped entirely,
 *         non-zero - the '~' is stripped and the rest of the token is
 *                    parsed like any other.
 *
 * Each token is converted with parse_flag_name().
 */
static uint64_t parse_flag_names(const char *str, int all)
{
	uint64_t   result = 0;
	const char *cur;

	for (cur = str; ; cur++) {
		/* keep scanning until a token delimiter is reached */
		if (*cur != ',' && *cur != '=' && *cur != '\0')
			continue;

		/* the current token spans [str, cur) */
		if (*str != '~') {
			result |= parse_flag_name(str, cur - str);
		} else if (all) {
			str++;		/* step over the '~' */
			if (*str)
				result |= parse_flag_name(str, cur - str);
		}

		if (*cur != ',')
			break;
		str = cur + 1;
	}

	return result;
}
1212  
1213  static void parse_bits_mask(const char *optarg)
1214  {
1215  	uint64_t mask;
1216  	uint64_t bits;
1217  	const char *p;
1218  
1219  	p = strchr(optarg, '=');
1220  	if (p == optarg) {
1221  		mask = KPF_ALL_BITS;
1222  		bits = parse_flag_names(p + 1, 0);
1223  	} else if (p) {
1224  		mask = parse_flag_names(optarg, 0);
1225  		bits = parse_flag_names(p + 1, 0);
1226  	} else if (strchr(optarg, '~')) {
1227  		mask = parse_flag_names(optarg, 1);
1228  		bits = parse_flag_names(optarg, 0);
1229  	} else {
1230  		mask = parse_flag_names(optarg, 0);
1231  		bits = KPF_ALL_BITS;
1232  	}
1233  
1234  	add_bits_filter(mask, bits);
1235  }
1236  
1237  static void parse_kpageflags(const char *name)
1238  {
1239  	opt_kpageflags = name;
1240  }
1241  
/*
 * -d|--describe handler: parse @optarg as a flag list and print one
 * tab separated line: the value in hex, then the results of
 * page_flag_name() and page_flag_longname() for it.
 */
static void describe_flags(const char *optarg)
{
	const uint64_t value = parse_flag_names(optarg, 0);

	printf("0x%016llx\t%s\t%s\n", (unsigned long long)value,
	       page_flag_name(value), page_flag_longname(value));
}
1251  
/*
 * Long option table for getopt_long().
 *
 * Every entry maps to the short option of the same meaning, so the
 * argument requirement here must agree with the short option string
 * used in main(): an option followed by ':' there takes
 * required_argument here.
 */
static const struct option opts[] = {
	{ "raw"        , no_argument,       NULL, 'r' },
	{ "pid"        , required_argument, NULL, 'p' },
	{ "file"       , required_argument, NULL, 'f' },
	{ "addr"       , required_argument, NULL, 'a' },
	{ "bits"       , required_argument, NULL, 'b' },
	{ "cgroup"     , required_argument, NULL, 'c' },
	{ "describe"   , required_argument, NULL, 'd' },
	{ "mark-idle"  , no_argument,       NULL, 'i' },
	{ "list"       , no_argument,       NULL, 'l' },
	{ "list-each"  , no_argument,       NULL, 'L' },
	{ "list-cgroup", no_argument,       NULL, 'C' },
	{ "list-mapcnt", no_argument,       NULL, 'M' },
	{ "no-summary" , no_argument,       NULL, 'N' },
	{ "hwpoison"   , no_argument,       NULL, 'X' },
	{ "unpoison"   , no_argument,       NULL, 'x' },
	/* takes a file name, exactly like the short form -F */
	{ "kpageflags" , required_argument, NULL, 'F' },
	{ "help"       , no_argument,       NULL, 'h' },
	{ NULL         , 0,                 NULL, 0 }
};
1272  
1273  int main(int argc, char *argv[])
1274  {
1275  	int c;
1276  
1277  	page_size = getpagesize();
1278  
1279  	while ((c = getopt_long(argc, argv,
1280  				"rp:f:a:b:d:c:CilLMNXxF:h",
1281  				opts, NULL)) != -1) {
1282  		switch (c) {
1283  		case 'r':
1284  			opt_raw = 1;
1285  			break;
1286  		case 'p':
1287  			parse_pid(optarg);
1288  			break;
1289  		case 'f':
1290  			parse_file(optarg);
1291  			break;
1292  		case 'a':
1293  			parse_addr_range(optarg);
1294  			break;
1295  		case 'b':
1296  			parse_bits_mask(optarg);
1297  			break;
1298  		case 'c':
1299  			parse_cgroup(optarg);
1300  			break;
1301  		case 'C':
1302  			opt_list_cgroup = 1;
1303  			break;
1304  		case 'd':
1305  			describe_flags(optarg);
1306  			exit(0);
1307  		case 'i':
1308  			opt_mark_idle = 1;
1309  			break;
1310  		case 'l':
1311  			opt_list = 1;
1312  			break;
1313  		case 'L':
1314  			opt_list = 2;
1315  			break;
1316  		case 'M':
1317  			opt_list_mapcnt = 1;
1318  			break;
1319  		case 'N':
1320  			opt_no_summary = 1;
1321  			break;
1322  		case 'X':
1323  			opt_hwpoison = 1;
1324  			prepare_hwpoison_fd();
1325  			break;
1326  		case 'x':
1327  			opt_unpoison = 1;
1328  			prepare_hwpoison_fd();
1329  			break;
1330  		case 'F':
1331  			parse_kpageflags(optarg);
1332  			break;
1333  		case 'h':
1334  			usage();
1335  			exit(0);
1336  		default:
1337  			usage();
1338  			exit(1);
1339  		}
1340  	}
1341  
1342  	if (!opt_kpageflags)
1343  		opt_kpageflags = PROC_KPAGEFLAGS;
1344  
1345  	if (opt_cgroup || opt_list_cgroup)
1346  		kpagecgroup_fd = checked_open(PROC_KPAGECGROUP, O_RDONLY);
1347  
1348  	if (opt_list && opt_list_mapcnt)
1349  		kpagecount_fd = checked_open(PROC_KPAGECOUNT, O_RDONLY);
1350  
1351  	if (opt_mark_idle)
1352  		page_idle_fd = checked_open(SYS_KERNEL_MM_PAGE_IDLE, O_RDWR);
1353  
1354  	if (opt_list && opt_pid)
1355  		printf("voffset\t");
1356  	if (opt_list && opt_file)
1357  		printf("foffset\t");
1358  	if (opt_list && opt_list_cgroup)
1359  		printf("cgroup\t");
1360  	if (opt_list && opt_list_mapcnt)
1361  		printf("map-cnt\t");
1362  
1363  	if (opt_list == 1)
1364  		printf("offset\tlen\tflags\n");
1365  	if (opt_list == 2)
1366  		printf("offset\tflags\n");
1367  
1368  	if (opt_file)
1369  		walk_page_cache();
1370  	else
1371  		walk_addr_ranges();
1372  
1373  	if (opt_list == 1)
1374  		flush_page_range();
1375  
1376  	if (opt_no_summary)
1377  		return 0;
1378  
1379  	if (opt_list)
1380  		printf("\n\n");
1381  
1382  	if (opt_file) {
1383  		show_file(opt_file, &st);
1384  		printf("\n");
1385  	}
1386  
1387  	show_summary();
1388  
1389  	if (opt_list_mapcnt)
1390  		close(kpagecount_fd);
1391  
1392  	if (page_idle_fd >= 0)
1393  		close(page_idle_fd);
1394  
1395  	return 0;
1396  }
1397