xref: /linux/tools/perf/builtin-kmem.c (revision 865224bfde78b31be8883302db496ecb3b7919ab)
1 // SPDX-License-Identifier: GPL-2.0
2 #include "builtin.h"
3 
4 #include "util/dso.h"
5 #include "util/evlist.h"
6 #include "util/evsel.h"
7 #include "util/config.h"
8 #include "util/map.h"
9 #include "util/symbol.h"
10 #include "util/thread.h"
11 #include "util/header.h"
12 #include "util/session.h"
13 #include "util/tool.h"
14 #include "util/callchain.h"
15 #include "util/time-utils.h"
16 #include <linux/err.h>
17 
18 #include <subcmd/pager.h>
19 #include <subcmd/parse-options.h>
20 #include "util/trace-event.h"
21 #include "util/data.h"
22 #include "util/cpumap.h"
23 
24 #include "util/debug.h"
25 #include "util/event.h"
26 #include "util/string2.h"
27 #include "util/util.h"
28 
29 #include <linux/kernel.h>
30 #include <linux/numa.h>
31 #include <linux/rbtree.h>
32 #include <linux/string.h>
33 #include <linux/zalloc.h>
34 #include <errno.h>
35 #include <inttypes.h>
36 #include <locale.h>
37 #include <regex.h>
38 
39 #include <linux/ctype.h>
40 #include <event-parse.h>
41 
/*
 * Analysis mode selectors.
 * NOTE(review): these are not written in this part of the file; presumably
 * they are set by option parsing to enable slab and/or page analysis.
 */
static int	kmem_slab;
static int	kmem_page;

/*
 * Page size in bytes; the page event handlers shift it left by the
 * allocation order to obtain the size of one page allocation/free.
 */
static long	kmem_page_size;
static enum {
	KMEM_SLAB,
	KMEM_PAGE,
} kmem_default = KMEM_SLAB;  /* for backward compatibility */

struct alloc_stat;
/*
 * Comparison callback: negative, zero or positive depending on how the
 * first argument orders relative to the second (see search_alloc_stat()).
 */
typedef int (*sort_fn_t)(void *, void *);

/* NOTE(review): not referenced in this part of the file - confirm usage. */
static int			alloc_flag;
static int			caller_flag;

/* NOTE(review): presumably output line limits, -1 meaning unlimited. */
static int			alloc_lines = -1;
static int			caller_lines = -1;

/* When set, slab call sites are printed as raw addresses (no symbol lookup). */
static bool			raw_ip;
61 
/*
 * Per-key slab statistics. The same structure is used for nodes of the
 * per-pointer tree (keyed by ptr) and of the per-callsite tree (keyed by
 * call_site).
 */
struct alloc_stat {
	u64	call_site;	/* address of the allocating call site */
	u64	ptr;		/* allocated object address */
	u64	bytes_req;	/* sum of requested bytes over all hits */
	u64	bytes_alloc;	/* sum of actually allocated bytes over all hits */
	u64	last_alloc;	/* bytes_alloc of the most recent allocation */
	u32	hit;		/* number of allocations recorded */
	u32	pingpong;	/* frees seen on a CPU other than the allocating one */

	short	alloc_cpu;	/* CPU of the last allocation; -1 once freed */

	struct rb_node node;	/* linkage into one of the rb-trees below */
};
75 
/* Slab statistics keyed by object pointer (see insert_alloc_stat()). */
static struct rb_root root_alloc_stat;
/* NOTE(review): presumably the sorted copy used for output - filled elsewhere. */
static struct rb_root root_alloc_sorted;
/* Slab statistics keyed by call site (see insert_caller_stat()). */
static struct rb_root root_caller_stat;
/* NOTE(review): presumably the sorted copy used for output - filled elsewhere. */
static struct rb_root root_caller_sorted;

/* Running byte totals over all processed slab alloc/free events. */
static unsigned long total_requested, total_allocated, total_freed;
/*
 * nr_allocs counts slab allocations; nr_cross_allocs counts those whose
 * "node" field differs from the node of the allocating CPU.
 */
static unsigned long nr_allocs, nr_cross_allocs;

/* filters for controlling start and stop of time of analysis */
static struct perf_time_interval ptime;
static const char *time_str;
87 
88 static int insert_alloc_stat(unsigned long call_site, unsigned long ptr,
89 			     int bytes_req, int bytes_alloc, int cpu)
90 {
91 	struct rb_node **node = &root_alloc_stat.rb_node;
92 	struct rb_node *parent = NULL;
93 	struct alloc_stat *data = NULL;
94 
95 	while (*node) {
96 		parent = *node;
97 		data = rb_entry(*node, struct alloc_stat, node);
98 
99 		if (ptr > data->ptr)
100 			node = &(*node)->rb_right;
101 		else if (ptr < data->ptr)
102 			node = &(*node)->rb_left;
103 		else
104 			break;
105 	}
106 
107 	if (data && data->ptr == ptr) {
108 		data->hit++;
109 		data->bytes_req += bytes_req;
110 		data->bytes_alloc += bytes_alloc;
111 	} else {
112 		data = malloc(sizeof(*data));
113 		if (!data) {
114 			pr_err("%s: malloc failed\n", __func__);
115 			return -1;
116 		}
117 		data->ptr = ptr;
118 		data->pingpong = 0;
119 		data->hit = 1;
120 		data->bytes_req = bytes_req;
121 		data->bytes_alloc = bytes_alloc;
122 
123 		rb_link_node(&data->node, parent, node);
124 		rb_insert_color(&data->node, &root_alloc_stat);
125 	}
126 	data->call_site = call_site;
127 	data->alloc_cpu = cpu;
128 	data->last_alloc = bytes_alloc;
129 
130 	return 0;
131 }
132 
133 static int insert_caller_stat(unsigned long call_site,
134 			      int bytes_req, int bytes_alloc)
135 {
136 	struct rb_node **node = &root_caller_stat.rb_node;
137 	struct rb_node *parent = NULL;
138 	struct alloc_stat *data = NULL;
139 
140 	while (*node) {
141 		parent = *node;
142 		data = rb_entry(*node, struct alloc_stat, node);
143 
144 		if (call_site > data->call_site)
145 			node = &(*node)->rb_right;
146 		else if (call_site < data->call_site)
147 			node = &(*node)->rb_left;
148 		else
149 			break;
150 	}
151 
152 	if (data && data->call_site == call_site) {
153 		data->hit++;
154 		data->bytes_req += bytes_req;
155 		data->bytes_alloc += bytes_alloc;
156 	} else {
157 		data = malloc(sizeof(*data));
158 		if (!data) {
159 			pr_err("%s: malloc failed\n", __func__);
160 			return -1;
161 		}
162 		data->call_site = call_site;
163 		data->pingpong = 0;
164 		data->hit = 1;
165 		data->bytes_req = bytes_req;
166 		data->bytes_alloc = bytes_alloc;
167 
168 		rb_link_node(&data->node, parent, node);
169 		rb_insert_color(&data->node, &root_caller_stat);
170 	}
171 
172 	return 0;
173 }
174 
175 static int evsel__process_alloc_event(struct perf_sample *sample)
176 {
177 	unsigned long ptr = perf_sample__intval(sample, "ptr"),
178 		      call_site = perf_sample__intval(sample, "call_site");
179 	int bytes_req = perf_sample__intval(sample, "bytes_req"),
180 	    bytes_alloc = perf_sample__intval(sample, "bytes_alloc");
181 
182 	if (insert_alloc_stat(call_site, ptr, bytes_req, bytes_alloc, sample->cpu) ||
183 	    insert_caller_stat(call_site, bytes_req, bytes_alloc))
184 		return -1;
185 
186 	total_requested += bytes_req;
187 	total_allocated += bytes_alloc;
188 
189 	nr_allocs++;
190 
191 	/*
192 	 * Commit 11e9734bcb6a ("mm/slab_common: unify NUMA and UMA
193 	 * version of tracepoints") adds the field "node" into the
194 	 * tracepoints 'kmalloc' and 'kmem_cache_alloc'.
195 	 *
196 	 * The legacy tracepoints 'kmalloc_node' and 'kmem_cache_alloc_node'
197 	 * also contain the field "node".
198 	 *
199 	 * If the tracepoint contains the field "node" the tool stats the
200 	 * cross allocation.
201 	 */
202 	if (evsel__field(sample->evsel, "node")) {
203 		int node1, node2;
204 
205 		node1 = cpu__get_node((struct perf_cpu){.cpu = sample->cpu});
206 		node2 = perf_sample__intval(sample, "node");
207 
208 		/*
209 		 * If the field "node" is NUMA_NO_NODE (-1), we don't take it
210 		 * as a cross allocation.
211 		 */
212 		if ((node2 != NUMA_NO_NODE) && (node1 != node2))
213 			nr_cross_allocs++;
214 	}
215 
216 	return 0;
217 }
218 
219 static int ptr_cmp(void *, void *);
220 static int slab_callsite_cmp(void *, void *);
221 
222 static struct alloc_stat *search_alloc_stat(unsigned long ptr,
223 					    unsigned long call_site,
224 					    struct rb_root *root,
225 					    sort_fn_t sort_fn)
226 {
227 	struct rb_node *node = root->rb_node;
228 	struct alloc_stat key = { .ptr = ptr, .call_site = call_site };
229 
230 	while (node) {
231 		struct alloc_stat *data;
232 		int cmp;
233 
234 		data = rb_entry(node, struct alloc_stat, node);
235 
236 		cmp = sort_fn(&key, data);
237 		if (cmp < 0)
238 			node = node->rb_left;
239 		else if (cmp > 0)
240 			node = node->rb_right;
241 		else
242 			return data;
243 	}
244 	return NULL;
245 }
246 
247 static int evsel__process_free_event(struct perf_sample *sample)
248 {
249 	unsigned long ptr = perf_sample__intval(sample, "ptr");
250 	struct alloc_stat *s_alloc, *s_caller;
251 
252 	s_alloc = search_alloc_stat(ptr, 0, &root_alloc_stat, ptr_cmp);
253 	if (!s_alloc)
254 		return 0;
255 
256 	total_freed += s_alloc->last_alloc;
257 
258 	if ((short)sample->cpu != s_alloc->alloc_cpu) {
259 		s_alloc->pingpong++;
260 
261 		s_caller = search_alloc_stat(0, s_alloc->call_site,
262 					     &root_caller_stat,
263 					     slab_callsite_cmp);
264 		if (!s_caller)
265 			return -1;
266 		s_caller->pingpong++;
267 	}
268 	s_alloc->alloc_cpu = -1;
269 
270 	return 0;
271 }
272 
/* Byte totals over page events; "nomatch" = frees without a tracked page,
 * "fail" = allocation events whose page/pfn was not valid. */
static u64 total_page_alloc_bytes;
static u64 total_page_free_bytes;
static u64 total_page_nomatch_bytes;
static u64 total_page_fail_bytes;
/* Event counters matching the byte totals above. */
static unsigned long nr_page_allocs;
static unsigned long nr_page_frees;
static unsigned long nr_page_fails;
static unsigned long nr_page_nomatch;

/* Read the "pfn" field of page events instead of the "page" field. */
static bool use_pfn;
/* Track only live pages: frees undo the matching allocation's statistics. */
static bool live_page;
/* Session whose host machine is used for kernel map / symbol lookups. */
static struct perf_session *kmem_session;

/* Dimensions of order_stats; events outside these bounds are rejected. */
#define MAX_MIGRATE_TYPES  6
#define MAX_PAGE_ORDER     11

/* Number of page allocations per [order][migrate type]. */
static int order_stats[MAX_PAGE_ORDER][MAX_MIGRATE_TYPES];
290 
/*
 * Statistics for page allocations. Used as node type of the live-page,
 * per-page and per-callsite trees, and as lookup key for all of them.
 */
struct page_stat {
	struct rb_node 	node;
	u64 		page;		/* "page" or "pfn" field of the event */
	u64 		callsite;	/* resolved allocation call site */
	int 		order;		/* allocation order */
	unsigned 	gfp_flags;	/* raw GFP flags of the allocation */
	unsigned 	migrate_type;	/* migrate type of the allocation */
	u64		alloc_bytes;	/* accumulated allocated bytes */
	u64 		free_bytes;	/* accumulated freed bytes */
	int 		nr_alloc;	/* number of allocations */
	int 		nr_free;	/* number of frees */
};
303 
/* Currently allocated pages, keyed by page; entries are removed on free. */
static struct rb_root page_live_tree;
/* Page statistics ordered by the comparators in page_alloc_sort_input. */
static struct rb_root page_alloc_tree;
/* Tree walked when printing the per-page result table. */
static struct rb_root page_alloc_sorted;
/* Page statistics ordered by the comparators in page_caller_sort_input. */
static struct rb_root page_caller_tree;
/* Tree walked when printing the per-callsite result table. */
static struct rb_root page_caller_sorted;

/* Address range and name of one kernel page-allocation function. */
struct alloc_func {
	u64 start;
	u64 end;
	char *name;
};

/* Array of page-allocation functions, sorted by start address once built. */
static int nr_alloc_funcs;
static struct alloc_func *alloc_func_list;
318 
319 static int funcmp(const void *a, const void *b)
320 {
321 	const struct alloc_func *fa = a;
322 	const struct alloc_func *fb = b;
323 
324 	if (fa->start > fb->start)
325 		return 1;
326 	else
327 		return -1;
328 }
329 
330 static int callcmp(const void *a, const void *b)
331 {
332 	const struct alloc_func *fa = a;
333 	const struct alloc_func *fb = b;
334 
335 	if (fb->start <= fa->start && fa->end < fb->end)
336 		return 0;
337 
338 	if (fa->start > fb->start)
339 		return 1;
340 	else
341 		return -1;
342 }
343 
344 static int build_alloc_func_list(void)
345 {
346 	int ret;
347 	struct map *kernel_map;
348 	struct symbol *sym;
349 	struct rb_node *node;
350 	struct alloc_func *func;
351 	struct machine *machine = &kmem_session->machines.host;
352 	regex_t alloc_func_regex;
353 	static const char pattern[] = "^_?_?(alloc|get_free|get_zeroed)_pages?";
354 
355 	ret = regcomp(&alloc_func_regex, pattern, REG_EXTENDED);
356 	if (ret) {
357 		char err[BUFSIZ];
358 
359 		regerror(ret, &alloc_func_regex, err, sizeof(err));
360 		pr_err("Invalid regex: %s\n%s", pattern, err);
361 		return -EINVAL;
362 	}
363 
364 	kernel_map = machine__kernel_map(machine);
365 	if (map__load(kernel_map) < 0) {
366 		pr_err("cannot load kernel map\n");
367 		return -ENOENT;
368 	}
369 
370 	map__for_each_symbol(kernel_map, sym, node) {
371 		if (regexec(&alloc_func_regex, sym->name, 0, NULL, 0))
372 			continue;
373 
374 		func = realloc(alloc_func_list,
375 			       (nr_alloc_funcs + 1) * sizeof(*func));
376 		if (func == NULL)
377 			return -ENOMEM;
378 
379 		pr_debug("alloc func: %s\n", sym->name);
380 		func[nr_alloc_funcs].start = sym->start;
381 		func[nr_alloc_funcs].end   = sym->end;
382 		func[nr_alloc_funcs].name  = sym->name;
383 
384 		alloc_func_list = func;
385 		nr_alloc_funcs++;
386 	}
387 
388 	qsort(alloc_func_list, nr_alloc_funcs, sizeof(*func), funcmp);
389 
390 	regfree(&alloc_func_regex);
391 	return 0;
392 }
393 
394 /*
395  * Find first non-memory allocation function from callchain.
396  * The allocation functions are in the 'alloc_func_list'.
397  */
398 static u64 find_callsite(struct perf_sample *sample)
399 {
400 	struct addr_location al;
401 	struct machine *machine = &kmem_session->machines.host;
402 	struct callchain_cursor_node *node;
403 	struct callchain_cursor *cursor;
404 	u64 result = sample->ip;
405 
406 	addr_location__init(&al);
407 	if (alloc_func_list == NULL) {
408 		if (build_alloc_func_list() < 0)
409 			goto out;
410 	}
411 
412 	al.thread = machine__findnew_thread(machine, sample->pid, sample->tid);
413 
414 	cursor = get_tls_callchain_cursor();
415 	if (cursor == NULL)
416 		goto out;
417 
418 	sample__resolve_callchain(sample, cursor, /*parent=*/NULL, &al, 16);
419 
420 	callchain_cursor_commit(cursor);
421 	while (true) {
422 		struct alloc_func key, *caller;
423 		u64 addr;
424 
425 		node = callchain_cursor_current(cursor);
426 		if (node == NULL)
427 			break;
428 
429 		key.start = key.end = node->ip;
430 		caller = bsearch(&key, alloc_func_list, nr_alloc_funcs,
431 				 sizeof(key), callcmp);
432 		if (!caller) {
433 			/* found */
434 			if (node->ms.map)
435 				addr = map__dso_unmap_ip(node->ms.map, node->ip);
436 			else
437 				addr = node->ip;
438 
439 			result = addr;
440 			goto out;
441 		} else
442 			pr_debug3("skipping alloc function: %s\n", caller->name);
443 
444 		callchain_cursor_advance(cursor);
445 	}
446 
447 	pr_debug2("unknown callsite: %"PRIx64 "\n", sample->ip);
448 out:
449 	addr_location__exit(&al);
450 	return result;
451 }
452 
/* One named sort key: a comparison function that can be chained in a list. */
struct sort_dimension {
	const char		name[20];
	sort_fn_t		cmp;
	struct list_head	list;
};

/*
 * Comparators applied in list order when inserting into / searching the
 * page_alloc_tree and page_caller_tree respectively.
 */
static LIST_HEAD(page_alloc_sort_input);
static LIST_HEAD(page_caller_sort_input);
461 
462 static struct page_stat *
463 __page_stat__findnew_page(struct page_stat *pstat, bool create)
464 {
465 	struct rb_node **node = &page_live_tree.rb_node;
466 	struct rb_node *parent = NULL;
467 	struct page_stat *data;
468 
469 	while (*node) {
470 		s64 cmp;
471 
472 		parent = *node;
473 		data = rb_entry(*node, struct page_stat, node);
474 
475 		cmp = data->page - pstat->page;
476 		if (cmp < 0)
477 			node = &parent->rb_left;
478 		else if (cmp > 0)
479 			node = &parent->rb_right;
480 		else
481 			return data;
482 	}
483 
484 	if (!create)
485 		return NULL;
486 
487 	data = zalloc(sizeof(*data));
488 	if (data != NULL) {
489 		data->page = pstat->page;
490 		data->order = pstat->order;
491 		data->gfp_flags = pstat->gfp_flags;
492 		data->migrate_type = pstat->migrate_type;
493 
494 		rb_link_node(&data->node, parent, node);
495 		rb_insert_color(&data->node, &page_live_tree);
496 	}
497 
498 	return data;
499 }
500 
/* Look up @pstat->page in the live-page tree; NULL when it is not tracked. */
static struct page_stat *page_stat__find_page(struct page_stat *pstat)
{
	return __page_stat__findnew_page(pstat, false);
}
505 
/*
 * Look up @pstat->page in the live-page tree, creating the entry when it
 * is missing; NULL only if that allocation fails.
 */
static struct page_stat *page_stat__findnew_page(struct page_stat *pstat)
{
	return __page_stat__findnew_page(pstat, true);
}
510 
511 static struct page_stat *
512 __page_stat__findnew_alloc(struct page_stat *pstat, bool create)
513 {
514 	struct rb_node **node = &page_alloc_tree.rb_node;
515 	struct rb_node *parent = NULL;
516 	struct page_stat *data;
517 	struct sort_dimension *sort;
518 
519 	while (*node) {
520 		int cmp = 0;
521 
522 		parent = *node;
523 		data = rb_entry(*node, struct page_stat, node);
524 
525 		list_for_each_entry(sort, &page_alloc_sort_input, list) {
526 			cmp = sort->cmp(pstat, data);
527 			if (cmp)
528 				break;
529 		}
530 
531 		if (cmp < 0)
532 			node = &parent->rb_left;
533 		else if (cmp > 0)
534 			node = &parent->rb_right;
535 		else
536 			return data;
537 	}
538 
539 	if (!create)
540 		return NULL;
541 
542 	data = zalloc(sizeof(*data));
543 	if (data != NULL) {
544 		data->page = pstat->page;
545 		data->order = pstat->order;
546 		data->gfp_flags = pstat->gfp_flags;
547 		data->migrate_type = pstat->migrate_type;
548 
549 		rb_link_node(&data->node, parent, node);
550 		rb_insert_color(&data->node, &page_alloc_tree);
551 	}
552 
553 	return data;
554 }
555 
/* Look up @pstat in the per-page allocation tree; NULL when not present. */
static struct page_stat *page_stat__find_alloc(struct page_stat *pstat)
{
	return __page_stat__findnew_alloc(pstat, false);
}
560 
/*
 * Look up @pstat in the per-page allocation tree, creating the entry when
 * it is missing; NULL only if that allocation fails.
 */
static struct page_stat *page_stat__findnew_alloc(struct page_stat *pstat)
{
	return __page_stat__findnew_alloc(pstat, true);
}
565 
566 static struct page_stat *
567 __page_stat__findnew_caller(struct page_stat *pstat, bool create)
568 {
569 	struct rb_node **node = &page_caller_tree.rb_node;
570 	struct rb_node *parent = NULL;
571 	struct page_stat *data;
572 	struct sort_dimension *sort;
573 
574 	while (*node) {
575 		int cmp = 0;
576 
577 		parent = *node;
578 		data = rb_entry(*node, struct page_stat, node);
579 
580 		list_for_each_entry(sort, &page_caller_sort_input, list) {
581 			cmp = sort->cmp(pstat, data);
582 			if (cmp)
583 				break;
584 		}
585 
586 		if (cmp < 0)
587 			node = &parent->rb_left;
588 		else if (cmp > 0)
589 			node = &parent->rb_right;
590 		else
591 			return data;
592 	}
593 
594 	if (!create)
595 		return NULL;
596 
597 	data = zalloc(sizeof(*data));
598 	if (data != NULL) {
599 		data->callsite = pstat->callsite;
600 		data->order = pstat->order;
601 		data->gfp_flags = pstat->gfp_flags;
602 		data->migrate_type = pstat->migrate_type;
603 
604 		rb_link_node(&data->node, parent, node);
605 		rb_insert_color(&data->node, &page_caller_tree);
606 	}
607 
608 	return data;
609 }
610 
/* Look up @pstat in the per-callsite tree; NULL when not present. */
static struct page_stat *page_stat__find_caller(struct page_stat *pstat)
{
	return __page_stat__findnew_caller(pstat, false);
}
615 
/*
 * Look up @pstat in the per-callsite tree, creating the entry when it is
 * missing; NULL only if that allocation fails.
 */
static struct page_stat *page_stat__findnew_caller(struct page_stat *pstat)
{
	return __page_stat__findnew_caller(pstat, true);
}
620 
621 static bool valid_page(u64 pfn_or_page)
622 {
623 	if (use_pfn && pfn_or_page == -1UL)
624 		return false;
625 	if (!use_pfn && pfn_or_page == 0)
626 		return false;
627 	return true;
628 }
629 
/* One distinct GFP flag value together with its two string renderings. */
struct gfp_flag {
	unsigned int flags;
	char *compact_str;
	char *human_readable;
};

/* Cache of every GFP flag combination seen so far, sorted by flags. */
static struct gfp_flag *gfps;
static int nr_gfps;

/*
 * qsort()/bsearch() comparator ordering struct gfp_flag by ascending
 * flags value.
 *
 * The values are compared rather than subtracted: an unsigned difference
 * squeezed into an int yields the wrong sign once the two values differ
 * by more than INT_MAX.
 */
static int gfpcmp(const void *a, const void *b)
{
	const struct gfp_flag *fa = a;
	const struct gfp_flag *fb = b;

	if (fa->flags > fb->flags)
		return 1;
	if (fa->flags < fb->flags)
		return -1;
	return 0;
}
646 
/* see include/trace/events/mmflags.h */
/*
 * Maps the flag names printed by the tracepoint to the short codes used
 * in the result tables.
 * NOTE(review): the codes are not unique - "NF" is used for both GFP_NOFS
 * and __GFP_NOFAIL, and "R" for both __GFP_RETRY_MAYFAIL and __GFP_RECLAIM.
 */
static const struct {
	const char *original;
	const char *compact;
} gfp_compact_table[] = {
	{ "GFP_TRANSHUGE",		"THP" },
	{ "GFP_TRANSHUGE_LIGHT",	"THL" },
	{ "GFP_HIGHUSER_MOVABLE",	"HUM" },
	{ "GFP_HIGHUSER",		"HU" },
	{ "GFP_USER",			"U" },
	{ "GFP_KERNEL_ACCOUNT",		"KAC" },
	{ "GFP_KERNEL",			"K" },
	{ "GFP_NOFS",			"NF" },
	{ "GFP_ATOMIC",			"A" },
	{ "GFP_NOIO",			"NI" },
	{ "GFP_NOWAIT",			"NW" },
	{ "GFP_DMA",			"D" },
	{ "__GFP_HIGHMEM",		"HM" },
	{ "GFP_DMA32",			"D32" },
	{ "__GFP_HIGH",			"H" },
	{ "__GFP_IO",			"I" },
	{ "__GFP_FS",			"F" },
	{ "__GFP_NOWARN",		"NWR" },
	{ "__GFP_RETRY_MAYFAIL",	"R" },
	{ "__GFP_NOFAIL",		"NF" },
	{ "__GFP_NORETRY",		"NR" },
	{ "__GFP_COMP",			"C" },
	{ "__GFP_ZERO",			"Z" },
	{ "__GFP_NOMEMALLOC",		"NMA" },
	{ "__GFP_MEMALLOC",		"MA" },
	{ "__GFP_HARDWALL",		"HW" },
	{ "__GFP_THISNODE",		"TN" },
	{ "__GFP_RECLAIMABLE",		"RC" },
	{ "__GFP_MOVABLE",		"M" },
	{ "__GFP_ACCOUNT",		"AC" },
	{ "__GFP_WRITE",		"WR" },
	{ "__GFP_RECLAIM",		"R" },
	{ "__GFP_DIRECT_RECLAIM",	"DR" },
	{ "__GFP_KSWAPD_RECLAIM",	"KR" },
};

/* Length of the longest compact flag string built so far (column width). */
static size_t max_gfp_len;
689 
690 static char *compact_gfp_flags(char *gfp_flags)
691 {
692 	char *orig_flags = strdup(gfp_flags);
693 	char *new_flags = NULL;
694 	char *str, *pos = NULL;
695 	size_t len = 0;
696 
697 	if (orig_flags == NULL)
698 		return NULL;
699 
700 	str = strtok_r(orig_flags, "|", &pos);
701 	while (str) {
702 		size_t i;
703 		char *new;
704 		const char *cpt;
705 
706 		for (i = 0; i < ARRAY_SIZE(gfp_compact_table); i++) {
707 			if (strcmp(gfp_compact_table[i].original, str))
708 				continue;
709 
710 			cpt = gfp_compact_table[i].compact;
711 			new = realloc(new_flags, len + strlen(cpt) + 2);
712 			if (new == NULL) {
713 				free(new_flags);
714 				free(orig_flags);
715 				return NULL;
716 			}
717 
718 			new_flags = new;
719 
720 			if (!len) {
721 				strcpy(new_flags, cpt);
722 			} else {
723 				strcat(new_flags, "|");
724 				strcat(new_flags, cpt);
725 				len++;
726 			}
727 
728 			len += strlen(cpt);
729 		}
730 
731 		str = strtok_r(NULL, "|", &pos);
732 	}
733 
734 	if (max_gfp_len < len)
735 		max_gfp_len = len;
736 
737 	free(orig_flags);
738 	return new_flags;
739 }
740 
741 static char *compact_gfp_string(unsigned long gfp_flags)
742 {
743 	struct gfp_flag key = {
744 		.flags = gfp_flags,
745 	};
746 	struct gfp_flag *gfp;
747 
748 	gfp = bsearch(&key, gfps, nr_gfps, sizeof(*gfps), gfpcmp);
749 	if (gfp)
750 		return gfp->compact_str;
751 
752 	return NULL;
753 }
754 
755 static int parse_gfp_flags(struct perf_sample *sample, unsigned int gfp_flags)
756 {
757 	struct tep_record record = {
758 		.cpu = sample->cpu,
759 		.data = sample->raw_data,
760 		.size = sample->raw_size,
761 	};
762 	struct trace_seq seq;
763 	char *str, *pos = NULL;
764 	const struct tep_event *tp_format;
765 
766 	if (nr_gfps) {
767 		struct gfp_flag key = {
768 			.flags = gfp_flags,
769 		};
770 
771 		if (bsearch(&key, gfps, nr_gfps, sizeof(*gfps), gfpcmp))
772 			return 0;
773 	}
774 
775 	trace_seq_init(&seq);
776 	tp_format = evsel__tp_format(sample->evsel);
777 	if (tp_format)
778 		tep_print_event(tp_format->tep, &seq, &record, "%s", TEP_PRINT_INFO);
779 
780 	str = strtok_r(seq.buffer, " ", &pos);
781 	while (str) {
782 		if (!strncmp(str, "gfp_flags=", 10)) {
783 			struct gfp_flag *new;
784 
785 			new = realloc(gfps, (nr_gfps + 1) * sizeof(*gfps));
786 			if (new == NULL)
787 				goto err_out;
788 
789 			gfps = new;
790 			new += nr_gfps;
791 
792 			new->flags = gfp_flags;
793 			new->human_readable = strdup(str + 10);
794 			if (!new->human_readable)
795 				goto err_out;
796 			new->compact_str = compact_gfp_flags(str + 10);
797 			if (!new->compact_str) {
798 				free(new->human_readable);
799 				goto err_out;
800 			}
801 			nr_gfps++;
802 			qsort(gfps, nr_gfps, sizeof(*gfps), gfpcmp);
803 		}
804 
805 		str = strtok_r(NULL, " ", &pos);
806 	}
807 
808 	trace_seq_destroy(&seq);
809 	return 0;
810 err_out:
811 	trace_seq_destroy(&seq);
812 	return -ENOMEM;
813 }
814 
815 static int evsel__process_page_alloc_event(struct perf_sample *sample)
816 {
817 	u64 page;
818 	unsigned int order = perf_sample__intval(sample, "order");
819 	unsigned int gfp_flags = perf_sample__intval(sample, "gfp_flags");
820 	unsigned int migrate_type = perf_sample__intval(sample, "migratetype");
821 	u64 bytes = kmem_page_size << order;
822 	u64 callsite;
823 	struct page_stat *pstat;
824 	struct page_stat this = {
825 		.order = order,
826 		.gfp_flags = gfp_flags,
827 		.migrate_type = migrate_type,
828 	};
829 
830 	if (order >= MAX_PAGE_ORDER) {
831 		pr_debug("Out-of-bounds order %u\n", order);
832 		return -1;
833 	}
834 
835 	if (migrate_type >= MAX_MIGRATE_TYPES) {
836 		pr_debug("Out-of-bounds migratetype %u\n", migrate_type);
837 		return -1;
838 	}
839 
840 	if (use_pfn)
841 		page = perf_sample__intval(sample, "pfn");
842 	else
843 		page = perf_sample__intval(sample, "page");
844 
845 	nr_page_allocs++;
846 	total_page_alloc_bytes += bytes;
847 
848 	if (!valid_page(page)) {
849 		nr_page_fails++;
850 		total_page_fail_bytes += bytes;
851 
852 		return 0;
853 	}
854 
855 	if (parse_gfp_flags(sample, gfp_flags) < 0)
856 		return -1;
857 
858 	callsite = find_callsite(sample);
859 
860 	/*
861 	 * This is to find the current page (with correct gfp flags and
862 	 * migrate type) at free event.
863 	 */
864 	this.page = page;
865 	pstat = page_stat__findnew_page(&this);
866 	if (pstat == NULL)
867 		return -ENOMEM;
868 
869 	pstat->nr_alloc++;
870 	pstat->alloc_bytes += bytes;
871 	pstat->callsite = callsite;
872 
873 	if (!live_page) {
874 		pstat = page_stat__findnew_alloc(&this);
875 		if (pstat == NULL)
876 			return -ENOMEM;
877 
878 		pstat->nr_alloc++;
879 		pstat->alloc_bytes += bytes;
880 		pstat->callsite = callsite;
881 	}
882 
883 	this.callsite = callsite;
884 	pstat = page_stat__findnew_caller(&this);
885 	if (pstat == NULL)
886 		return -ENOMEM;
887 
888 	pstat->nr_alloc++;
889 	pstat->alloc_bytes += bytes;
890 
891 	order_stats[order][migrate_type]++;
892 
893 	return 0;
894 }
895 
896 static int evsel__process_page_free_event(struct perf_sample *sample)
897 {
898 	u64 page;
899 	unsigned int order = perf_sample__intval(sample, "order");
900 	u64 bytes = kmem_page_size << order;
901 	struct page_stat *pstat;
902 	struct page_stat this = {
903 		.order = order,
904 	};
905 
906 	if (order >= MAX_PAGE_ORDER) {
907 		pr_debug("Out-of-bounds order %u\n", order);
908 		return -1;
909 	}
910 
911 	if (use_pfn)
912 		page = perf_sample__intval(sample, "pfn");
913 	else
914 		page = perf_sample__intval(sample, "page");
915 
916 	nr_page_frees++;
917 	total_page_free_bytes += bytes;
918 
919 	this.page = page;
920 	pstat = page_stat__find_page(&this);
921 	if (pstat == NULL) {
922 		pr_debug2("missing free at page %"PRIx64" (order: %d)\n",
923 			  page, order);
924 
925 		nr_page_nomatch++;
926 		total_page_nomatch_bytes += bytes;
927 
928 		return 0;
929 	}
930 
931 	this.gfp_flags = pstat->gfp_flags;
932 	this.migrate_type = pstat->migrate_type;
933 	this.callsite = pstat->callsite;
934 
935 	rb_erase(&pstat->node, &page_live_tree);
936 	free(pstat);
937 
938 	if (live_page) {
939 		order_stats[this.order][this.migrate_type]--;
940 	} else {
941 		pstat = page_stat__find_alloc(&this);
942 		if (pstat == NULL)
943 			return -ENOMEM;
944 
945 		pstat->nr_free++;
946 		pstat->free_bytes += bytes;
947 	}
948 
949 	pstat = page_stat__find_caller(&this);
950 	if (pstat == NULL)
951 		return -ENOENT;
952 
953 	pstat->nr_free++;
954 	pstat->free_bytes += bytes;
955 
956 	if (live_page) {
957 		pstat->nr_alloc--;
958 		pstat->alloc_bytes -= bytes;
959 
960 		if (pstat->nr_alloc == 0) {
961 			rb_erase(&pstat->node, &page_caller_tree);
962 			free(pstat);
963 		}
964 	}
965 
966 	return 0;
967 }
968 
969 static bool perf_kmem__skip_sample(struct perf_sample *sample)
970 {
971 	/* skip sample based on time? */
972 	if (perf_time__skip_sample(&ptime, sample->time))
973 		return true;
974 
975 	return false;
976 }
977 
978 typedef int (*tracepoint_handler)(struct perf_sample *sample);
979 
980 static int process_sample_event(const struct perf_tool *tool __maybe_unused,
981 				union perf_event *event,
982 				struct perf_sample *sample,
983 				struct machine *machine)
984 {
985 	struct evsel *evsel = sample->evsel;
986 	int err = 0;
987 	struct thread *thread = machine__findnew_thread(machine, sample->pid,
988 							sample->tid);
989 
990 	if (thread == NULL) {
991 		pr_debug("problem processing %s (%u) event at offset %#" PRIx64 ", skipping it.\n",
992 			 perf_event__name(event->header.type), event->header.type,
993 			 sample->file_offset);
994 		return -1;
995 	}
996 
997 	if (perf_kmem__skip_sample(sample)) {
998 		thread__put(thread);
999 		return 0;
1000 	}
1001 
1002 	dump_printf(" ... thread: %s:%d\n", thread__comm_str(thread), thread__tid(thread));
1003 
1004 	if (evsel->handler != NULL) {
1005 		tracepoint_handler f = evsel->handler;
1006 		err = f(sample);
1007 	}
1008 
1009 	thread__put(thread);
1010 
1011 	return err;
1012 }
1013 
/*
 * Percentage of allocated bytes that were not requested:
 * 100 - 100 * n_req / n_alloc, or 0 when nothing was allocated.
 */
static double fragmentation(unsigned long n_req, unsigned long n_alloc)
{
	return n_alloc ? 100.0 - (100.0 * n_req / n_alloc) : 0.0;
}
1021 
1022 static void __print_slab_result(struct rb_root *root,
1023 				struct perf_session *session,
1024 				int n_lines, int is_caller)
1025 {
1026 	struct rb_node *next;
1027 	struct machine *machine = &session->machines.host;
1028 
1029 	printf("%.105s\n", graph_dotted_line);
1030 	printf(" %-34s |",  is_caller ? "Callsite": "Alloc Ptr");
1031 	printf(" Total_alloc/Per | Total_req/Per   | Hit      | Ping-pong | Frag\n");
1032 	printf("%.105s\n", graph_dotted_line);
1033 
1034 	next = rb_first(root);
1035 
1036 	while (next && n_lines--) {
1037 		struct alloc_stat *data = rb_entry(next, struct alloc_stat,
1038 						   node);
1039 		struct symbol *sym = NULL;
1040 		struct map *map;
1041 		char buf[BUFSIZ];
1042 		u64 addr;
1043 
1044 		if (is_caller) {
1045 			addr = data->call_site;
1046 			if (!raw_ip)
1047 				sym = machine__find_kernel_symbol(machine, addr, &map);
1048 		} else
1049 			addr = data->ptr;
1050 
1051 		if (sym != NULL)
1052 			snprintf(buf, sizeof(buf), "%s+%" PRIx64 "", sym->name,
1053 				 addr - map__unmap_ip(map, sym->start));
1054 		else
1055 			snprintf(buf, sizeof(buf), "%#" PRIx64 "", addr);
1056 		printf(" %-34s |", buf);
1057 
1058 		printf(" %9llu/%-5lu | %9llu/%-5lu | %8lu | %9lu | %6.3f%%\n",
1059 		       (unsigned long long)data->bytes_alloc,
1060 		       (unsigned long)data->bytes_alloc / data->hit,
1061 		       (unsigned long long)data->bytes_req,
1062 		       (unsigned long)data->bytes_req / data->hit,
1063 		       (unsigned long)data->hit,
1064 		       (unsigned long)data->pingpong,
1065 		       fragmentation(data->bytes_req, data->bytes_alloc));
1066 
1067 		next = rb_next(next);
1068 	}
1069 
1070 	if (n_lines == -1)
1071 		printf(" ...                                | ...             | ...             | ...      | ...       | ...   \n");
1072 
1073 	printf("%.105s\n", graph_dotted_line);
1074 }
1075 
/*
 * Column labels for the migrate type of a page allocation, indexed by
 * page_stat.migrate_type when printing the page result tables.
 */
static const char * const migrate_type_str[] = {
	"UNMOVABL",
	"RECLAIM",
	"MOVABLE",
	"RESERVED",
	"CMA/ISLT",
	"UNKNOWN",
};
1084 
1085 static void __print_page_alloc_result(struct perf_session *session, int n_lines)
1086 {
1087 	struct rb_node *next = rb_first(&page_alloc_sorted);
1088 	struct machine *machine = &session->machines.host;
1089 	const char *format;
1090 	int gfp_len = max(strlen("GFP flags"), max_gfp_len);
1091 
1092 	printf("\n%.105s\n", graph_dotted_line);
1093 	printf(" %-16s | %5s alloc (KB) | Hits      | Order | Mig.type | %-*s | Callsite\n",
1094 	       use_pfn ? "PFN" : "Page", live_page ? "Live" : "Total",
1095 	       gfp_len, "GFP flags");
1096 	printf("%.105s\n", graph_dotted_line);
1097 
1098 	if (use_pfn)
1099 		format = " %16llu | %'16llu | %'9d | %5d | %8s | %-*s | %s\n";
1100 	else
1101 		format = " %016llx | %'16llu | %'9d | %5d | %8s | %-*s | %s\n";
1102 
1103 	while (next && n_lines--) {
1104 		struct page_stat *data;
1105 		struct symbol *sym;
1106 		struct map *map;
1107 		char buf[32];
1108 		char *caller = buf;
1109 
1110 		data = rb_entry(next, struct page_stat, node);
1111 		sym = machine__find_kernel_symbol(machine, data->callsite, &map);
1112 		if (sym)
1113 			caller = sym->name;
1114 		else
1115 			scnprintf(buf, sizeof(buf), "%"PRIx64, data->callsite);
1116 
1117 		printf(format, (unsigned long long)data->page,
1118 		       (unsigned long long)data->alloc_bytes / 1024,
1119 		       data->nr_alloc, data->order,
1120 		       migrate_type_str[data->migrate_type],
1121 		       gfp_len, compact_gfp_string(data->gfp_flags), caller);
1122 
1123 		next = rb_next(next);
1124 	}
1125 
1126 	if (n_lines == -1) {
1127 		printf(" ...              | ...              | ...       | ...   | ...      | %-*s | ...\n",
1128 		       gfp_len, "...");
1129 	}
1130 
1131 	printf("%.105s\n", graph_dotted_line);
1132 }
1133 
1134 static void __print_page_caller_result(struct perf_session *session, int n_lines)
1135 {
1136 	struct rb_node *next = rb_first(&page_caller_sorted);
1137 	struct machine *machine = &session->machines.host;
1138 	int gfp_len = max(strlen("GFP flags"), max_gfp_len);
1139 
1140 	printf("\n%.105s\n", graph_dotted_line);
1141 	printf(" %5s alloc (KB) | Hits      | Order | Mig.type | %-*s | Callsite\n",
1142 	       live_page ? "Live" : "Total", gfp_len, "GFP flags");
1143 	printf("%.105s\n", graph_dotted_line);
1144 
1145 	while (next && n_lines--) {
1146 		struct page_stat *data;
1147 		struct symbol *sym;
1148 		struct map *map;
1149 		char buf[32];
1150 		char *caller = buf;
1151 
1152 		data = rb_entry(next, struct page_stat, node);
1153 		sym = machine__find_kernel_symbol(machine, data->callsite, &map);
1154 		if (sym)
1155 			caller = sym->name;
1156 		else
1157 			scnprintf(buf, sizeof(buf), "%"PRIx64, data->callsite);
1158 
1159 		printf(" %'16llu | %'9d | %5d | %8s | %-*s | %s\n",
1160 		       (unsigned long long)data->alloc_bytes / 1024,
1161 		       data->nr_alloc, data->order,
1162 		       migrate_type_str[data->migrate_type],
1163 		       gfp_len, compact_gfp_string(data->gfp_flags), caller);
1164 
1165 		next = rb_next(next);
1166 	}
1167 
1168 	if (n_lines == -1) {
1169 		printf(" ...              | ...       | ...   | ...      | %-*s | ...\n",
1170 		       gfp_len, "...");
1171 	}
1172 
1173 	printf("%.105s\n", graph_dotted_line);
1174 }
1175 
1176 static void print_gfp_flags(void)
1177 {
1178 	int i;
1179 
1180 	printf("#\n");
1181 	printf("# GFP flags\n");
1182 	printf("# ---------\n");
1183 	for (i = 0; i < nr_gfps; i++) {
1184 		printf("# %08x: %*s: %s\n", gfps[i].flags,
1185 		       (int) max_gfp_len, gfps[i].compact_str,
1186 		       gfps[i].human_readable);
1187 	}
1188 }
1189 
1190 static void print_slab_summary(void)
1191 {
1192 	printf("\nSUMMARY (SLAB allocator)");
1193 	printf("\n========================\n");
1194 	printf("Total bytes requested: %'lu\n", total_requested);
1195 	printf("Total bytes allocated: %'lu\n", total_allocated);
1196 	printf("Total bytes freed:     %'lu\n", total_freed);
1197 	if (total_allocated > total_freed) {
1198 		printf("Net total bytes allocated: %'lu\n",
1199 		total_allocated - total_freed);
1200 	}
1201 	printf("Total bytes wasted on internal fragmentation: %'lu\n",
1202 	       total_allocated - total_requested);
1203 	printf("Internal fragmentation: %f%%\n",
1204 	       fragmentation(total_requested, total_allocated));
1205 	printf("Cross CPU allocations: %'lu/%'lu\n", nr_cross_allocs, nr_allocs);
1206 }
1207 
1208 static void print_page_summary(void)
1209 {
1210 	int o, m;
1211 	u64 nr_alloc_freed = nr_page_frees - nr_page_nomatch;
1212 	u64 total_alloc_freed_bytes = total_page_free_bytes - total_page_nomatch_bytes;
1213 
1214 	printf("\nSUMMARY (page allocator)");
1215 	printf("\n========================\n");
1216 	printf("%-30s: %'16lu   [ %'16"PRIu64" KB ]\n", "Total allocation requests",
1217 	       nr_page_allocs, total_page_alloc_bytes / 1024);
1218 	printf("%-30s: %'16lu   [ %'16"PRIu64" KB ]\n", "Total free requests",
1219 	       nr_page_frees, total_page_free_bytes / 1024);
1220 	printf("\n");
1221 
1222 	printf("%-30s: %'16"PRIu64"   [ %'16"PRIu64" KB ]\n", "Total alloc+freed requests",
1223 	       nr_alloc_freed, (total_alloc_freed_bytes) / 1024);
1224 	printf("%-30s: %'16"PRIu64"   [ %'16"PRIu64" KB ]\n", "Total alloc-only requests",
1225 	       nr_page_allocs - nr_alloc_freed,
1226 	       (total_page_alloc_bytes - total_alloc_freed_bytes) / 1024);
1227 	printf("%-30s: %'16lu   [ %'16"PRIu64" KB ]\n", "Total free-only requests",
1228 	       nr_page_nomatch, total_page_nomatch_bytes / 1024);
1229 	printf("\n");
1230 
1231 	printf("%-30s: %'16lu   [ %'16"PRIu64" KB ]\n", "Total allocation failures",
1232 	       nr_page_fails, total_page_fail_bytes / 1024);
1233 	printf("\n");
1234 
1235 	printf("%5s  %12s  %12s  %12s  %12s  %12s\n", "Order",  "Unmovable",
1236 	       "Reclaimable", "Movable", "Reserved", "CMA/Isolated");
1237 	printf("%.5s  %.12s  %.12s  %.12s  %.12s  %.12s\n", graph_dotted_line,
1238 	       graph_dotted_line, graph_dotted_line, graph_dotted_line,
1239 	       graph_dotted_line, graph_dotted_line);
1240 
1241 	for (o = 0; o < MAX_PAGE_ORDER; o++) {
1242 		printf("%5d", o);
1243 		for (m = 0; m < MAX_MIGRATE_TYPES - 1; m++) {
1244 			if (order_stats[o][m])
1245 				printf("  %'12d", order_stats[o][m]);
1246 			else
1247 				printf("  %12c", '.');
1248 		}
1249 		printf("\n");
1250 	}
1251 }
1252 
1253 static void print_slab_result(struct perf_session *session)
1254 {
1255 	if (caller_flag)
1256 		__print_slab_result(&root_caller_sorted, session, caller_lines, 1);
1257 	if (alloc_flag)
1258 		__print_slab_result(&root_alloc_sorted, session, alloc_lines, 0);
1259 	print_slab_summary();
1260 }
1261 
1262 static void print_page_result(struct perf_session *session)
1263 {
1264 	if (caller_flag || alloc_flag)
1265 		print_gfp_flags();
1266 	if (caller_flag)
1267 		__print_page_caller_result(session, caller_lines);
1268 	if (alloc_flag)
1269 		__print_page_alloc_result(session, alloc_lines);
1270 	print_page_summary();
1271 }
1272 
1273 static void print_result(struct perf_session *session)
1274 {
1275 	if (kmem_slab)
1276 		print_slab_result(session);
1277 	if (kmem_page)
1278 		print_page_result(session);
1279 }
1280 
/*
 * Lists of sort keys used when ordering the results for output.  They are
 * filled by setup_slab_sorting()/setup_page_sorting() and handed to
 * __sort_slab_result()/__sort_page_result() from sort_result(): one list
 * per (allocator, table) pair.
 */
static LIST_HEAD(slab_caller_sort);
static LIST_HEAD(slab_alloc_sort);
static LIST_HEAD(page_caller_sort);
static LIST_HEAD(page_alloc_sort);
1285 
1286 static void sort_slab_insert(struct rb_root *root, struct alloc_stat *data,
1287 			     struct list_head *sort_list)
1288 {
1289 	struct rb_node **new = &(root->rb_node);
1290 	struct rb_node *parent = NULL;
1291 	struct sort_dimension *sort;
1292 
1293 	while (*new) {
1294 		struct alloc_stat *this;
1295 		int cmp = 0;
1296 
1297 		this = rb_entry(*new, struct alloc_stat, node);
1298 		parent = *new;
1299 
1300 		list_for_each_entry(sort, sort_list, list) {
1301 			cmp = sort->cmp(data, this);
1302 			if (cmp)
1303 				break;
1304 		}
1305 
1306 		if (cmp > 0)
1307 			new = &((*new)->rb_left);
1308 		else
1309 			new = &((*new)->rb_right);
1310 	}
1311 
1312 	rb_link_node(&data->node, parent, new);
1313 	rb_insert_color(&data->node, root);
1314 }
1315 
1316 static void __sort_slab_result(struct rb_root *root, struct rb_root *root_sorted,
1317 			       struct list_head *sort_list)
1318 {
1319 	struct rb_node *node;
1320 	struct alloc_stat *data;
1321 
1322 	for (;;) {
1323 		node = rb_first(root);
1324 		if (!node)
1325 			break;
1326 
1327 		rb_erase(node, root);
1328 		data = rb_entry(node, struct alloc_stat, node);
1329 		sort_slab_insert(root_sorted, data, sort_list);
1330 	}
1331 }
1332 
1333 static void sort_page_insert(struct rb_root *root, struct page_stat *data,
1334 			     struct list_head *sort_list)
1335 {
1336 	struct rb_node **new = &root->rb_node;
1337 	struct rb_node *parent = NULL;
1338 	struct sort_dimension *sort;
1339 
1340 	while (*new) {
1341 		struct page_stat *this;
1342 		int cmp = 0;
1343 
1344 		this = rb_entry(*new, struct page_stat, node);
1345 		parent = *new;
1346 
1347 		list_for_each_entry(sort, sort_list, list) {
1348 			cmp = sort->cmp(data, this);
1349 			if (cmp)
1350 				break;
1351 		}
1352 
1353 		if (cmp > 0)
1354 			new = &parent->rb_left;
1355 		else
1356 			new = &parent->rb_right;
1357 	}
1358 
1359 	rb_link_node(&data->node, parent, new);
1360 	rb_insert_color(&data->node, root);
1361 }
1362 
1363 static void __sort_page_result(struct rb_root *root, struct rb_root *root_sorted,
1364 			       struct list_head *sort_list)
1365 {
1366 	struct rb_node *node;
1367 	struct page_stat *data;
1368 
1369 	for (;;) {
1370 		node = rb_first(root);
1371 		if (!node)
1372 			break;
1373 
1374 		rb_erase(node, root);
1375 		data = rb_entry(node, struct page_stat, node);
1376 		sort_page_insert(root_sorted, data, sort_list);
1377 	}
1378 }
1379 
1380 static void sort_result(void)
1381 {
1382 	if (kmem_slab) {
1383 		__sort_slab_result(&root_alloc_stat, &root_alloc_sorted,
1384 				   &slab_alloc_sort);
1385 		__sort_slab_result(&root_caller_stat, &root_caller_sorted,
1386 				   &slab_caller_sort);
1387 	}
1388 	if (kmem_page) {
1389 		if (live_page)
1390 			__sort_page_result(&page_live_tree, &page_alloc_sorted,
1391 					   &page_alloc_sort);
1392 		else
1393 			__sort_page_result(&page_alloc_tree, &page_alloc_sorted,
1394 					   &page_alloc_sort);
1395 
1396 		__sort_page_result(&page_caller_tree, &page_caller_sorted,
1397 				   &page_caller_sort);
1398 	}
1399 }
1400 
1401 static int __cmd_kmem(struct perf_session *session)
1402 {
1403 	int err = -EINVAL;
1404 	struct evsel *evsel;
1405 	const struct evsel_str_handler kmem_tracepoints[] = {
1406 		/* slab allocator */
1407 		{ "kmem:kmalloc",		evsel__process_alloc_event, },
1408 		{ "kmem:kmem_cache_alloc",	evsel__process_alloc_event, },
1409 		{ "kmem:kmalloc_node",		evsel__process_alloc_event, },
1410 		{ "kmem:kmem_cache_alloc_node", evsel__process_alloc_event, },
1411 		{ "kmem:kfree",			evsel__process_free_event, },
1412 		{ "kmem:kmem_cache_free",	evsel__process_free_event, },
1413 		/* page allocator */
1414 		{ "kmem:mm_page_alloc",		evsel__process_page_alloc_event, },
1415 		{ "kmem:mm_page_free",		evsel__process_page_free_event, },
1416 	};
1417 
1418 	if (!perf_session__has_traces(session, "kmem record"))
1419 		goto out;
1420 
1421 	if (perf_session__set_tracepoints_handlers(session, kmem_tracepoints)) {
1422 		pr_err("Initializing perf session tracepoint handlers failed\n");
1423 		goto out;
1424 	}
1425 
1426 	evlist__for_each_entry(session->evlist, evsel) {
1427 		if (evsel__name_is(evsel, "kmem:mm_page_alloc") &&
1428 		    evsel__field(evsel, "pfn")) {
1429 			use_pfn = true;
1430 			break;
1431 		}
1432 	}
1433 
1434 	setup_pager();
1435 	err = perf_session__process_events(session);
1436 	if (err != 0) {
1437 		pr_err("error during process events: %d\n", err);
1438 		goto out;
1439 	}
1440 	sort_result();
1441 	print_result(session);
1442 out:
1443 	return err;
1444 }
1445 
1446 /* slab sort keys */
1447 static int ptr_cmp(void *a, void *b)
1448 {
1449 	struct alloc_stat *l = a;
1450 	struct alloc_stat *r = b;
1451 
1452 	if (l->ptr < r->ptr)
1453 		return -1;
1454 	else if (l->ptr > r->ptr)
1455 		return 1;
1456 	return 0;
1457 }
1458 
1459 static struct sort_dimension ptr_sort_dimension = {
1460 	.name	= "ptr",
1461 	.cmp	= ptr_cmp,
1462 };
1463 
1464 static int slab_callsite_cmp(void *a, void *b)
1465 {
1466 	struct alloc_stat *l = a;
1467 	struct alloc_stat *r = b;
1468 
1469 	if (l->call_site < r->call_site)
1470 		return -1;
1471 	else if (l->call_site > r->call_site)
1472 		return 1;
1473 	return 0;
1474 }
1475 
1476 static struct sort_dimension callsite_sort_dimension = {
1477 	.name	= "callsite",
1478 	.cmp	= slab_callsite_cmp,
1479 };
1480 
1481 static int hit_cmp(void *a, void *b)
1482 {
1483 	struct alloc_stat *l = a;
1484 	struct alloc_stat *r = b;
1485 
1486 	if (l->hit < r->hit)
1487 		return -1;
1488 	else if (l->hit > r->hit)
1489 		return 1;
1490 	return 0;
1491 }
1492 
1493 static struct sort_dimension hit_sort_dimension = {
1494 	.name	= "hit",
1495 	.cmp	= hit_cmp,
1496 };
1497 
1498 static int bytes_cmp(void *a, void *b)
1499 {
1500 	struct alloc_stat *l = a;
1501 	struct alloc_stat *r = b;
1502 
1503 	if (l->bytes_alloc < r->bytes_alloc)
1504 		return -1;
1505 	else if (l->bytes_alloc > r->bytes_alloc)
1506 		return 1;
1507 	return 0;
1508 }
1509 
1510 static struct sort_dimension bytes_sort_dimension = {
1511 	.name	= "bytes",
1512 	.cmp	= bytes_cmp,
1513 };
1514 
1515 static int frag_cmp(void *a, void *b)
1516 {
1517 	double x, y;
1518 	struct alloc_stat *l = a;
1519 	struct alloc_stat *r = b;
1520 
1521 	x = fragmentation(l->bytes_req, l->bytes_alloc);
1522 	y = fragmentation(r->bytes_req, r->bytes_alloc);
1523 
1524 	if (x < y)
1525 		return -1;
1526 	else if (x > y)
1527 		return 1;
1528 	return 0;
1529 }
1530 
1531 static struct sort_dimension frag_sort_dimension = {
1532 	.name	= "frag",
1533 	.cmp	= frag_cmp,
1534 };
1535 
1536 static int pingpong_cmp(void *a, void *b)
1537 {
1538 	struct alloc_stat *l = a;
1539 	struct alloc_stat *r = b;
1540 
1541 	if (l->pingpong < r->pingpong)
1542 		return -1;
1543 	else if (l->pingpong > r->pingpong)
1544 		return 1;
1545 	return 0;
1546 }
1547 
1548 static struct sort_dimension pingpong_sort_dimension = {
1549 	.name	= "pingpong",
1550 	.cmp	= pingpong_cmp,
1551 };
1552 
1553 /* page sort keys */
1554 static int page_cmp(void *a, void *b)
1555 {
1556 	struct page_stat *l = a;
1557 	struct page_stat *r = b;
1558 
1559 	if (l->page < r->page)
1560 		return -1;
1561 	else if (l->page > r->page)
1562 		return 1;
1563 	return 0;
1564 }
1565 
1566 static struct sort_dimension page_sort_dimension = {
1567 	.name	= "page",
1568 	.cmp	= page_cmp,
1569 };
1570 
1571 static int page_callsite_cmp(void *a, void *b)
1572 {
1573 	struct page_stat *l = a;
1574 	struct page_stat *r = b;
1575 
1576 	if (l->callsite < r->callsite)
1577 		return -1;
1578 	else if (l->callsite > r->callsite)
1579 		return 1;
1580 	return 0;
1581 }
1582 
1583 static struct sort_dimension page_callsite_sort_dimension = {
1584 	.name	= "callsite",
1585 	.cmp	= page_callsite_cmp,
1586 };
1587 
1588 static int page_hit_cmp(void *a, void *b)
1589 {
1590 	struct page_stat *l = a;
1591 	struct page_stat *r = b;
1592 
1593 	if (l->nr_alloc < r->nr_alloc)
1594 		return -1;
1595 	else if (l->nr_alloc > r->nr_alloc)
1596 		return 1;
1597 	return 0;
1598 }
1599 
1600 static struct sort_dimension page_hit_sort_dimension = {
1601 	.name	= "hit",
1602 	.cmp	= page_hit_cmp,
1603 };
1604 
1605 static int page_bytes_cmp(void *a, void *b)
1606 {
1607 	struct page_stat *l = a;
1608 	struct page_stat *r = b;
1609 
1610 	if (l->alloc_bytes < r->alloc_bytes)
1611 		return -1;
1612 	else if (l->alloc_bytes > r->alloc_bytes)
1613 		return 1;
1614 	return 0;
1615 }
1616 
1617 static struct sort_dimension page_bytes_sort_dimension = {
1618 	.name	= "bytes",
1619 	.cmp	= page_bytes_cmp,
1620 };
1621 
1622 static int page_order_cmp(void *a, void *b)
1623 {
1624 	struct page_stat *l = a;
1625 	struct page_stat *r = b;
1626 
1627 	if (l->order < r->order)
1628 		return -1;
1629 	else if (l->order > r->order)
1630 		return 1;
1631 	return 0;
1632 }
1633 
1634 static struct sort_dimension page_order_sort_dimension = {
1635 	.name	= "order",
1636 	.cmp	= page_order_cmp,
1637 };
1638 
1639 static int migrate_type_cmp(void *a, void *b)
1640 {
1641 	struct page_stat *l = a;
1642 	struct page_stat *r = b;
1643 
1644 	/* for internal use to find free'd page */
1645 	if (l->migrate_type == -1U)
1646 		return 0;
1647 
1648 	if (l->migrate_type < r->migrate_type)
1649 		return -1;
1650 	else if (l->migrate_type > r->migrate_type)
1651 		return 1;
1652 	return 0;
1653 }
1654 
1655 static struct sort_dimension migrate_type_sort_dimension = {
1656 	.name	= "migtype",
1657 	.cmp	= migrate_type_cmp,
1658 };
1659 
1660 static int gfp_flags_cmp(void *a, void *b)
1661 {
1662 	struct page_stat *l = a;
1663 	struct page_stat *r = b;
1664 
1665 	/* for internal use to find free'd page */
1666 	if (l->gfp_flags == -1U)
1667 		return 0;
1668 
1669 	if (l->gfp_flags < r->gfp_flags)
1670 		return -1;
1671 	else if (l->gfp_flags > r->gfp_flags)
1672 		return 1;
1673 	return 0;
1674 }
1675 
1676 static struct sort_dimension gfp_flags_sort_dimension = {
1677 	.name	= "gfp",
1678 	.cmp	= gfp_flags_cmp,
1679 };
1680 
/*
 * Sort keys available for slab analysis.  slab_sort_dimension__add()
 * searches this table by name and duplicates the matching entry.
 */
static struct sort_dimension *slab_sorts[] = {
	&ptr_sort_dimension,
	&callsite_sort_dimension,
	&hit_sort_dimension,
	&bytes_sort_dimension,
	&frag_sort_dimension,
	&pingpong_sort_dimension,
};
1689 
/*
 * Sort keys available for page analysis.  page_sort_dimension__add()
 * searches this table by name and duplicates the matching entry.
 */
static struct sort_dimension *page_sorts[] = {
	&page_sort_dimension,
	&page_callsite_sort_dimension,
	&page_hit_sort_dimension,
	&page_bytes_sort_dimension,
	&page_order_sort_dimension,
	&migrate_type_sort_dimension,
	&gfp_flags_sort_dimension,
};
1699 
1700 static int slab_sort_dimension__add(const char *tok, struct list_head *list)
1701 {
1702 	struct sort_dimension *sort;
1703 	int i;
1704 
1705 	for (i = 0; i < (int)ARRAY_SIZE(slab_sorts); i++) {
1706 		if (!strcmp(slab_sorts[i]->name, tok)) {
1707 			sort = memdup(slab_sorts[i], sizeof(*slab_sorts[i]));
1708 			if (!sort) {
1709 				pr_err("%s: memdup failed\n", __func__);
1710 				return -1;
1711 			}
1712 			list_add_tail(&sort->list, list);
1713 			return 0;
1714 		}
1715 	}
1716 
1717 	return -1;
1718 }
1719 
1720 static int page_sort_dimension__add(const char *tok, struct list_head *list)
1721 {
1722 	struct sort_dimension *sort;
1723 	int i;
1724 
1725 	for (i = 0; i < (int)ARRAY_SIZE(page_sorts); i++) {
1726 		if (!strcmp(page_sorts[i]->name, tok)) {
1727 			sort = memdup(page_sorts[i], sizeof(*page_sorts[i]));
1728 			if (!sort) {
1729 				pr_err("%s: memdup failed\n", __func__);
1730 				return -1;
1731 			}
1732 			list_add_tail(&sort->list, list);
1733 			return 0;
1734 		}
1735 	}
1736 
1737 	return -1;
1738 }
1739 
/*
 * Parse the comma-separated slab sort keys in @arg and append each one to
 * @sort_list in order.  @arg itself is not modified; a private copy is
 * tokenized.
 *
 * Returns 0 on success, -1 on allocation failure or at the first key that
 * cannot be added (keys before it stay on the list).
 */
static int setup_slab_sorting(struct list_head *sort_list, const char *arg)
{
	char *copy = strdup(arg);
	char *cursor = copy;
	char *key;
	int ret = 0;

	if (copy == NULL) {
		pr_err("%s: strdup failed\n", __func__);
		return -1;
	}

	while ((key = strsep(&cursor, ",")) != NULL) {
		if (slab_sort_dimension__add(key, sort_list) < 0) {
			pr_err("Unknown slab --sort key: '%s'", key);
			ret = -1;
			break;
		}
	}

	free(copy);
	return ret;
}
1765 
/*
 * Parse the comma-separated page sort keys in @arg and append each one to
 * @sort_list in order.  @arg itself is not modified; a private copy is
 * tokenized.
 *
 * Returns 0 on success, -1 on allocation failure or at the first key that
 * cannot be added (keys before it stay on the list).
 */
static int setup_page_sorting(struct list_head *sort_list, const char *arg)
{
	char *copy = strdup(arg);
	char *cursor = copy;
	char *key;
	int ret = 0;

	if (copy == NULL) {
		pr_err("%s: strdup failed\n", __func__);
		return -1;
	}

	while ((key = strsep(&cursor, ",")) != NULL) {
		if (page_sort_dimension__add(key, sort_list) < 0) {
			pr_err("Unknown page --sort key: '%s'", key);
			ret = -1;
			break;
		}
	}

	free(copy);
	return ret;
}
1791 
1792 static int parse_sort_opt(const struct option *opt __maybe_unused,
1793 			  const char *arg, int unset __maybe_unused)
1794 {
1795 	if (!arg)
1796 		return -1;
1797 
1798 	if (kmem_page > kmem_slab ||
1799 	    (kmem_page == 0 && kmem_slab == 0 && kmem_default == KMEM_PAGE)) {
1800 		if (caller_flag > alloc_flag)
1801 			return setup_page_sorting(&page_caller_sort, arg);
1802 		else
1803 			return setup_page_sorting(&page_alloc_sort, arg);
1804 	} else {
1805 		if (caller_flag > alloc_flag)
1806 			return setup_slab_sorting(&slab_caller_sort, arg);
1807 		else
1808 			return setup_slab_sorting(&slab_alloc_sort, arg);
1809 	}
1810 
1811 	return 0;
1812 }
1813 
1814 static int parse_caller_opt(const struct option *opt __maybe_unused,
1815 			    const char *arg __maybe_unused,
1816 			    int unset __maybe_unused)
1817 {
1818 	caller_flag = (alloc_flag + 1);
1819 	return 0;
1820 }
1821 
1822 static int parse_alloc_opt(const struct option *opt __maybe_unused,
1823 			   const char *arg __maybe_unused,
1824 			   int unset __maybe_unused)
1825 {
1826 	alloc_flag = (caller_flag + 1);
1827 	return 0;
1828 }
1829 
1830 static int parse_slab_opt(const struct option *opt __maybe_unused,
1831 			  const char *arg __maybe_unused,
1832 			  int unset __maybe_unused)
1833 {
1834 	kmem_slab = (kmem_page + 1);
1835 	return 0;
1836 }
1837 
1838 static int parse_page_opt(const struct option *opt __maybe_unused,
1839 			  const char *arg __maybe_unused,
1840 			  int unset __maybe_unused)
1841 {
1842 	kmem_page = (kmem_slab + 1);
1843 	return 0;
1844 }
1845 
1846 static int parse_line_opt(const struct option *opt __maybe_unused,
1847 			  const char *arg, int unset __maybe_unused)
1848 {
1849 	int lines;
1850 
1851 	if (!arg)
1852 		return -1;
1853 
1854 	lines = strtoul(arg, NULL, 10);
1855 
1856 	if (caller_flag > alloc_flag)
1857 		caller_lines = lines;
1858 	else
1859 		alloc_lines = lines;
1860 
1861 	return 0;
1862 }
1863 
/*
 * Report whether the running kernel still exposes the legacy slab
 * tracepoints.
 */
static bool slab_legacy_tp_is_exposed(void)
{
	/*
	 * The tracepoints "kmem:kmalloc_node" and
	 * "kmem:kmem_cache_alloc_node" have been removed from recent
	 * kernels.  If "kmem:kmalloc_node" can still be looked up, the
	 * tool is running on an old kernel and has to fall back to
	 * supporting these legacy tracepoints.
	 */
	return !IS_ERR(trace_event__tp_format("kmem", "kmalloc_node"));
}
1876 
1877 static int __cmd_record(int argc, const char **argv)
1878 {
1879 	const char * const record_args[] = {
1880 	"record", "-a", "-R", "-c", "1",
1881 	};
1882 	const char * const slab_events[] = {
1883 	"-e", "kmem:kmalloc",
1884 	"-e", "kmem:kfree",
1885 	"-e", "kmem:kmem_cache_alloc",
1886 	"-e", "kmem:kmem_cache_free",
1887 	};
1888 	const char * const slab_legacy_events[] = {
1889 	"-e", "kmem:kmalloc_node",
1890 	"-e", "kmem:kmem_cache_alloc_node",
1891 	};
1892 	const char * const page_events[] = {
1893 	"-e", "kmem:mm_page_alloc",
1894 	"-e", "kmem:mm_page_free",
1895 	};
1896 	unsigned int rec_argc, i, j;
1897 	const char **rec_argv;
1898 	unsigned int slab_legacy_tp_exposed = slab_legacy_tp_is_exposed();
1899 
1900 	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
1901 	if (kmem_slab) {
1902 		rec_argc += ARRAY_SIZE(slab_events);
1903 		if (slab_legacy_tp_exposed)
1904 			rec_argc += ARRAY_SIZE(slab_legacy_events);
1905 	}
1906 	if (kmem_page)
1907 		rec_argc += ARRAY_SIZE(page_events) + 1; /* for -g */
1908 
1909 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
1910 
1911 	if (rec_argv == NULL)
1912 		return -ENOMEM;
1913 
1914 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
1915 		rec_argv[i] = strdup(record_args[i]);
1916 
1917 	if (kmem_slab) {
1918 		for (j = 0; j < ARRAY_SIZE(slab_events); j++, i++)
1919 			rec_argv[i] = strdup(slab_events[j]);
1920 		if (slab_legacy_tp_exposed) {
1921 			for (j = 0; j < ARRAY_SIZE(slab_legacy_events); j++, i++)
1922 				rec_argv[i] = strdup(slab_legacy_events[j]);
1923 		}
1924 	}
1925 	if (kmem_page) {
1926 		rec_argv[i++] = strdup("-g");
1927 
1928 		for (j = 0; j < ARRAY_SIZE(page_events); j++, i++)
1929 			rec_argv[i] = strdup(page_events[j]);
1930 	}
1931 
1932 	for (j = 1; j < (unsigned int)argc; j++, i++)
1933 		rec_argv[i] = argv[j];
1934 
1935 	return cmd_record(i, rec_argv);
1936 }
1937 
1938 static int kmem_config(const char *var, const char *value, void *cb __maybe_unused)
1939 {
1940 	if (!strcmp(var, "kmem.default")) {
1941 		if (!strcmp(value, "slab"))
1942 			kmem_default = KMEM_SLAB;
1943 		else if (!strcmp(value, "page"))
1944 			kmem_default = KMEM_PAGE;
1945 		else
1946 			pr_err("invalid default value ('slab' or 'page' required): %s\n",
1947 			       value);
1948 		return 0;
1949 	}
1950 
1951 	return 0;
1952 }
1953 
1954 int cmd_kmem(int argc, const char **argv)
1955 {
1956 	const char * const default_slab_sort = "frag,hit,bytes";
1957 	const char * const default_page_sort = "bytes,hit";
1958 	struct perf_data data = {
1959 		.mode = PERF_DATA_MODE_READ,
1960 	};
1961 	const struct option kmem_options[] = {
1962 	OPT_STRING('i', "input", &input_name, "file", "input file name"),
1963 	OPT_INCR('v', "verbose", &verbose,
1964 		    "be more verbose (show symbol address, etc)"),
1965 	OPT_CALLBACK_NOOPT(0, "caller", NULL, NULL,
1966 			   "show per-callsite statistics", parse_caller_opt),
1967 	OPT_CALLBACK_NOOPT(0, "alloc", NULL, NULL,
1968 			   "show per-allocation statistics", parse_alloc_opt),
1969 	OPT_CALLBACK('s', "sort", NULL, "key[,key2...]",
1970 		     "sort by keys: ptr, callsite, bytes, hit, pingpong, frag, "
1971 		     "page, order, migtype, gfp", parse_sort_opt),
1972 	OPT_CALLBACK('l', "line", NULL, "num", "show n lines", parse_line_opt),
1973 	OPT_BOOLEAN(0, "raw-ip", &raw_ip, "show raw ip instead of symbol"),
1974 	OPT_BOOLEAN('f', "force", &data.force, "don't complain, do it"),
1975 	OPT_CALLBACK_NOOPT(0, "slab", NULL, NULL, "Analyze slab allocator",
1976 			   parse_slab_opt),
1977 	OPT_CALLBACK_NOOPT(0, "page", NULL, NULL, "Analyze page allocator",
1978 			   parse_page_opt),
1979 	OPT_BOOLEAN(0, "live", &live_page, "Show live page stat"),
1980 	OPT_STRING(0, "time", &time_str, "str",
1981 		   "Time span of interest (start,stop)"),
1982 	OPT_END()
1983 	};
1984 	const char *const kmem_subcommands[] = { "record", "stat", NULL };
1985 	const char *kmem_usage[] = {
1986 		NULL,
1987 		NULL
1988 	};
1989 	struct perf_session *session;
1990 	struct perf_tool perf_kmem;
1991 	static const char errmsg[] = "No %s allocation events found.  Have you run 'perf kmem record --%s'?\n";
1992 	int ret = perf_config(kmem_config, NULL);
1993 
1994 	if (ret)
1995 		return ret;
1996 
1997 	argc = parse_options_subcommand(argc, argv, kmem_options,
1998 					kmem_subcommands, kmem_usage,
1999 					PARSE_OPT_STOP_AT_NON_OPTION);
2000 
2001 	if (!argc)
2002 		usage_with_options(kmem_usage, kmem_options);
2003 
2004 	if (kmem_slab == 0 && kmem_page == 0) {
2005 		if (kmem_default == KMEM_SLAB)
2006 			kmem_slab = 1;
2007 		else
2008 			kmem_page = 1;
2009 	}
2010 
2011 	if (strlen(argv[0]) > 2 && strstarts("record", argv[0])) {
2012 		symbol__init(NULL);
2013 		return __cmd_record(argc, argv);
2014 	}
2015 
2016 	data.path = input_name;
2017 
2018 	perf_tool__init(&perf_kmem, /*ordered_events=*/true);
2019 	perf_kmem.sample	= process_sample_event;
2020 	perf_kmem.comm		= perf_event__process_comm;
2021 	perf_kmem.mmap		= perf_event__process_mmap;
2022 	perf_kmem.mmap2		= perf_event__process_mmap2;
2023 	perf_kmem.namespaces	= perf_event__process_namespaces;
2024 
2025 	kmem_session = session = perf_session__new(&data, &perf_kmem);
2026 	if (IS_ERR(session))
2027 		return PTR_ERR(session);
2028 
2029 	ret = -1;
2030 
2031 	if (kmem_slab) {
2032 		if (!evlist__find_tracepoint_by_name(session->evlist, "kmem:kmalloc")) {
2033 			pr_err(errmsg, "slab", "slab");
2034 			goto out_delete;
2035 		}
2036 	}
2037 
2038 	if (kmem_page) {
2039 		struct evsel *evsel = evlist__find_tracepoint_by_name(session->evlist, "kmem:mm_page_alloc");
2040 		const struct tep_event *tp_format = evsel ? evsel__tp_format(evsel) : NULL;
2041 
2042 		if (tp_format == NULL) {
2043 			pr_err(errmsg, "page", "page");
2044 			goto out_delete;
2045 		}
2046 		kmem_page_size = tep_get_page_size(tp_format->tep);
2047 		symbol_conf.use_callchain = true;
2048 	}
2049 
2050 	symbol__init(perf_session__env(session));
2051 
2052 	if (perf_time__parse_str(&ptime, time_str) != 0) {
2053 		pr_err("Invalid time string\n");
2054 		ret = -EINVAL;
2055 		goto out_delete;
2056 	}
2057 
2058 	if (!strcmp(argv[0], "stat")) {
2059 		setlocale(LC_ALL, "");
2060 
2061 		if (cpu__setup_cpunode_map())
2062 			goto out_delete;
2063 
2064 		if (list_empty(&slab_caller_sort))
2065 			setup_slab_sorting(&slab_caller_sort, default_slab_sort);
2066 		if (list_empty(&slab_alloc_sort))
2067 			setup_slab_sorting(&slab_alloc_sort, default_slab_sort);
2068 		if (list_empty(&page_caller_sort))
2069 			setup_page_sorting(&page_caller_sort, default_page_sort);
2070 		if (list_empty(&page_alloc_sort))
2071 			setup_page_sorting(&page_alloc_sort, default_page_sort);
2072 
2073 		if (kmem_page) {
2074 			setup_page_sorting(&page_alloc_sort_input,
2075 					   "page,order,migtype,gfp");
2076 			setup_page_sorting(&page_caller_sort_input,
2077 					   "callsite,order,migtype,gfp");
2078 		}
2079 		ret = __cmd_kmem(session);
2080 	} else
2081 		usage_with_options(kmem_usage, kmem_options);
2082 
2083 out_delete:
2084 	perf_session__delete(session);
2085 	/* free usage string allocated by parse_options_subcommand */
2086 	free((void *)kmem_usage[0]);
2087 
2088 	return ret;
2089 }
2090