/*
 * thread-stack.c: Synthesize a thread's stack using call / return events
 * Copyright (c) 2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 */

#include <linux/rbtree.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <errno.h>
#include "thread.h"
#include "event.h"
#include "machine.h"
#include "env.h"
#include "util.h"
#include "debug.h"
#include "symbol.h"
#include "comm.h"
#include "call-path.h"
#include "thread-stack.h"

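/*
 * Number of entries by which the stack array grows each time it fills up
 * (see thread_stack__grow()).
 */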
#define STACK_GROWTH 2048

/*
 * State of retpoline detection.
 *
 * RETPOLINE_NONE: no retpoline detection
 * X86_RETPOLINE_POSSIBLE: x86 retpoline possible
 * X86_RETPOLINE_DETECTED: x86 retpoline detected
 */
enum retpoline_state_t {
	RETPOLINE_NONE,
	X86_RETPOLINE_POSSIBLE,
	X86_RETPOLINE_DETECTED,
};

/**
 * struct thread_stack_entry - thread stack entry.
 * @ret_addr: return address
 * @timestamp: timestamp (if known)
 * @ref: external reference (e.g. db_id of sample)
 * @branch_count: the branch count when the entry was created
 * @cp: call path
 * @no_call: a 'call' was not seen
 * @trace_end: a 'call' but trace ended
 * @non_call: a branch but not a 'call' to the start of a different symbol
 */
struct thread_stack_entry {
	u64 ret_addr;
	u64 timestamp;
	u64 ref;
	u64 branch_count;
	struct call_path *cp;
	bool no_call;
	bool trace_end;
	bool non_call;
};

/**
 * struct thread_stack - thread stack constructed from 'call' and 'return'
 *                       branch samples.
 * @stack: array that holds the stack
 * @cnt: number of entries in the stack
 * @sz: current maximum stack size
 * @trace_nr: current trace number
 * @branch_count: running branch count
 * @kernel_start: kernel start address
 * @last_time: last timestamp
 * @crp: call/return processor
 * @comm: current comm
 * @arr_sz: size of array if this is the first element of an array
 * @rstate: used to detect retpolines
 */
struct thread_stack {
	struct thread_stack_entry *stack;
	size_t cnt;
	size_t sz;
	u64 trace_nr;
	u64 branch_count;
	u64 kernel_start;
	u64 last_time;
	struct call_return_processor *crp;
	struct comm *comm;
	unsigned int arr_sz;
	enum retpoline_state_t rstate;
};

/*
 * Assume pid == tid == 0 identifies the idle task as defined by
 * perf_session__register_idle_thread(). The idle task is really 1 task per cpu,
 * and therefore requires a stack for each cpu.
 */
static inline bool thread_stack__per_cpu(struct thread *thread)
{
	return !(thread->tid || thread->pid_);
}

static int thread_stack__grow(struct thread_stack *ts)
{
	struct thread_stack_entry *new_stack;
	size_t sz, new_sz;

	new_sz = ts->sz + STACK_GROWTH;
	sz = new_sz * sizeof(struct thread_stack_entry);

	new_stack = realloc(ts->stack, sz);
	if (!new_stack)
		return -ENOMEM;

	ts->stack = new_stack;
	ts->sz = new_sz;

	return 0;
}

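/*
 * Initialize one thread stack: allocate the initial entry array, record the
 * machine's kernel start address (or assume the top half of a 64-bit address
 * space if the machine is unknown), and mark x86 threads as candidates for
 * retpoline detection.
 */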
static int thread_stack__init(struct thread_stack *ts, struct thread *thread,
			      struct call_return_processor *crp)
{
	int err;

	err = thread_stack__grow(ts);
	if (err)
		return err;

	if (thread->mg && thread->mg->machine) {
		struct machine *machine = thread->mg->machine;
		const char *arch = perf_env__arch(machine->env);

		ts->kernel_start = machine__kernel_start(machine);
		if (!strcmp(arch, "x86"))
			ts->rstate = X86_RETPOLINE_POSSIBLE;
	} else {
		ts->kernel_start = 1ULL << 63;
	}
	ts->crp = crp;

	return 0;
}

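/*
 * Allocate (or grow) the thread's array of stacks and return the slot to use
 * for 'cpu'.  For the idle task the array is indexed by cpu (sized to the next
 * power of 2); otherwise a single element is used.  Returns NULL on failure.
 */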
static struct thread_stack *thread_stack__new(struct thread *thread, int cpu,
					      struct call_return_processor *crp)
{
	struct thread_stack *ts = thread->ts, *new_ts;
	unsigned int old_sz = ts ? ts->arr_sz : 0;
	unsigned int new_sz = 1;

	if (thread_stack__per_cpu(thread) && cpu > 0)
		new_sz = roundup_pow_of_two(cpu + 1);

	if (!ts || new_sz > old_sz) {
		new_ts = calloc(new_sz, sizeof(*ts));
		if (!new_ts)
			return NULL;
		if (ts)
			memcpy(new_ts, ts, old_sz * sizeof(*ts));
		new_ts->arr_sz = new_sz;
		zfree(&thread->ts);
		thread->ts = new_ts;
		ts = new_ts;
	}

	if (thread_stack__per_cpu(thread) && cpu > 0 &&
	    (unsigned int)cpu < ts->arr_sz)
		ts += cpu;

	if (!ts->stack &&
	    thread_stack__init(ts, thread, crp))
		return NULL;

	return ts;
}

static struct thread_stack *thread__cpu_stack(struct thread *thread, int cpu)
{
	struct thread_stack *ts = thread->ts;

	if (cpu < 0)
		cpu = 0;

	if (!ts || (unsigned int)cpu >= ts->arr_sz)
		return NULL;

	ts += cpu;

	if (!ts->stack)
		return NULL;

	return ts;
}

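/*
 * Return the stack to use for 'cpu': the per-cpu stack for the idle task,
 * otherwise the thread's single stack.  May return NULL.
 */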
static inline struct thread_stack *thread__stack(struct thread *thread,
						    int cpu)
{
	if (!thread)
		return NULL;

	if (thread_stack__per_cpu(thread))
		return thread__cpu_stack(thread, cpu);

	return thread->ts;
}

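/*
 * Push a return address.  If growing the stack fails, the existing entries
 * are discarded, the new entry is still recorded, and -ENOMEM is returned.
 */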
static int thread_stack__push(struct thread_stack *ts, u64 ret_addr,
			      bool trace_end)
{
	int err = 0;

	if (ts->cnt == ts->sz) {
		err = thread_stack__grow(ts);
		if (err) {
			pr_warning("Out of memory: discarding thread stack\n");
			ts->cnt = 0;
		}
	}

	ts->stack[ts->cnt].trace_end = trace_end;
	ts->stack[ts->cnt++].ret_addr = ret_addr;

	return err;
}

static void thread_stack__pop(struct thread_stack *ts, u64 ret_addr)
{
	size_t i;

	/*
	 * In some cases there may be functions which are not seen to return.
	 * For example when setjmp / longjmp has been used.  Or the perf context
	 * switch in the kernel which doesn't stop and start tracing in exactly
	 * the same code path.  When that happens the return address will be
	 * further down the stack.  If the return address is not found at all,
	 * we assume the opposite (i.e. this is a return for a call that wasn't
	 * seen for some reason) and leave the stack alone.
	 */
	for (i = ts->cnt; i; ) {
		if (ts->stack[--i].ret_addr == ret_addr) {
			ts->cnt = i;
			return;
		}
	}
}

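/* Pop consecutive 'trace end' entries from the top of the stack */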
static void thread_stack__pop_trace_end(struct thread_stack *ts)
{
	size_t i;

	for (i = ts->cnt; i; ) {
		if (ts->stack[--i].trace_end)
			ts->cnt = i;
		else
			return;
	}
}

static bool thread_stack__in_kernel(struct thread_stack *ts)
{
	if (!ts->cnt)
		return false;

	return ts->stack[ts->cnt - 1].cp->in_kernel;
}

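/*
 * Report the call/return pair at stack index 'idx' to the registered
 * call/return processor (e.g. for export to a database, using the entry's
 * 'ref' as the call reference).
 */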
static int thread_stack__call_return(struct thread *thread,
				     struct thread_stack *ts, size_t idx,
				     u64 timestamp, u64 ref, bool no_return)
{
	struct call_return_processor *crp = ts->crp;
	struct thread_stack_entry *tse;
	struct call_return cr = {
		.thread = thread,
		.comm = ts->comm,
		.db_id = 0,
	};

	tse = &ts->stack[idx];
	cr.cp = tse->cp;
	cr.call_time = tse->timestamp;
	cr.return_time = timestamp;
	cr.branch_count = ts->branch_count - tse->branch_count;
	cr.call_ref = tse->ref;
	cr.return_ref = ref;
	if (tse->no_call)
		cr.flags |= CALL_RETURN_NO_CALL;
	if (no_return)
		cr.flags |= CALL_RETURN_NO_RETURN;
	if (tse->non_call)
		cr.flags |= CALL_RETURN_NON_CALL;

	return crp->process(&cr, crp->data);
}

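/*
 * Empty one stack, reporting each remaining entry as a 'no return' call when
 * a call/return processor is in use.
 */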
static int __thread_stack__flush(struct thread *thread, struct thread_stack *ts)
{
	struct call_return_processor *crp = ts->crp;
	int err;

	if (!crp) {
		ts->cnt = 0;
		return 0;
	}

	while (ts->cnt) {
		err = thread_stack__call_return(thread, ts, --ts->cnt,
						ts->last_time, 0, true);
		if (err) {
			pr_err("Error flushing thread stack!\n");
			ts->cnt = 0;
			return err;
		}
	}

	return 0;
}

int thread_stack__flush(struct thread *thread)
{
	struct thread_stack *ts = thread->ts;
	unsigned int pos;
	int err = 0;

	if (ts) {
		for (pos = 0; pos < ts->arr_sz; pos++) {
			int ret = __thread_stack__flush(thread, ts + pos);

			if (ret)
				err = ret;
		}
	}

	return err;
}

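/*
 * Update the stack from one branch event.  Usage sketch (illustrative, not
 * taken from a real caller): a branch-trace decoder would typically call
 *
 *	err = thread_stack__event(thread, sample->cpu, sample->flags,
 *				  sample->ip, sample->addr, sample->insn_len,
 *				  trace_nr);
 *
 * for each branch, and later use thread_stack__sample() to synthesize a
 * callchain.  Apart from trace number handling, this path is bypassed when
 * thread_stack__process() is in use.
 */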
int thread_stack__event(struct thread *thread, int cpu, u32 flags, u64 from_ip,
			u64 to_ip, u16 insn_len, u64 trace_nr)
{
	struct thread_stack *ts = thread__stack(thread, cpu);

	if (!thread)
		return -EINVAL;

	if (!ts) {
		ts = thread_stack__new(thread, cpu, NULL);
		if (!ts) {
			pr_warning("Out of memory: no thread stack\n");
			return -ENOMEM;
		}
		ts->trace_nr = trace_nr;
	}

	/*
	 * When the trace is discontinuous, the trace_nr changes.  In that case
	 * the stack might be completely invalid.  Better to report nothing than
	 * to report something misleading, so flush the stack.
	 */
	if (trace_nr != ts->trace_nr) {
		if (ts->trace_nr)
			__thread_stack__flush(thread, ts);
		ts->trace_nr = trace_nr;
	}

	/* Stop here if thread_stack__process() is in use */
	if (ts->crp)
		return 0;

	if (flags & PERF_IP_FLAG_CALL) {
		u64 ret_addr;

		if (!to_ip)
			return 0;
		ret_addr = from_ip + insn_len;
		if (ret_addr == to_ip)
			return 0; /* Zero-length calls are excluded */
		return thread_stack__push(ts, ret_addr,
					  flags & PERF_IP_FLAG_TRACE_END);
	} else if (flags & PERF_IP_FLAG_TRACE_BEGIN) {
		/*
		 * If the caller did not change the trace number (which would
		 * have flushed the stack) then try to make sense of the stack.
		 * Possibly, tracing began after returning to the current
		 * address, so try to pop that. Also, a call made just before
		 * the trace ended is not expected to return, so pop that too.
		 */
		thread_stack__pop(ts, to_ip);
		thread_stack__pop_trace_end(ts);
	} else if ((flags & PERF_IP_FLAG_RETURN) && from_ip) {
		thread_stack__pop(ts, to_ip);
	}

	return 0;
}

void thread_stack__set_trace_nr(struct thread *thread, int cpu, u64 trace_nr)
{
	struct thread_stack *ts = thread__stack(thread, cpu);

	if (!ts)
		return;

	if (trace_nr != ts->trace_nr) {
		if (ts->trace_nr)
			__thread_stack__flush(thread, ts);
		ts->trace_nr = trace_nr;
	}
}

static void __thread_stack__free(struct thread *thread, struct thread_stack *ts)
{
	__thread_stack__flush(thread, ts);
	zfree(&ts->stack);
}

static void thread_stack__reset(struct thread *thread, struct thread_stack *ts)
{
	unsigned int arr_sz = ts->arr_sz;

	__thread_stack__free(thread, ts);
	memset(ts, 0, sizeof(*ts));
	ts->arr_sz = arr_sz;
}

void thread_stack__free(struct thread *thread)
{
	struct thread_stack *ts = thread->ts;
	unsigned int pos;

	if (ts) {
		for (pos = 0; pos < ts->arr_sz; pos++)
			__thread_stack__free(thread, ts + pos);
		zfree(&thread->ts);
	}
}

static inline u64 callchain_context(u64 ip, u64 kernel_start)
{
	return ip < kernel_start ? PERF_CONTEXT_USER : PERF_CONTEXT_KERNEL;
}

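/*
 * Synthesize a callchain of at most 'sz' entries into 'chain'.  The layout is
 * a context marker (PERF_CONTEXT_USER or PERF_CONTEXT_KERNEL), the sampled ip,
 * then the stacked return addresses from innermost to outermost, with a new
 * context marker inserted whenever the user/kernel context changes.
 */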
void thread_stack__sample(struct thread *thread, int cpu,
			  struct ip_callchain *chain,
			  size_t sz, u64 ip, u64 kernel_start)
{
	struct thread_stack *ts = thread__stack(thread, cpu);
	u64 context = callchain_context(ip, kernel_start);
	u64 last_context;
	size_t i, j;

	if (sz < 2) {
		chain->nr = 0;
		return;
	}

	chain->ips[0] = context;
	chain->ips[1] = ip;

	if (!ts) {
		chain->nr = 2;
		return;
	}

	last_context = context;

	for (i = 2, j = 1; i < sz && j <= ts->cnt; i++, j++) {
		ip = ts->stack[ts->cnt - j].ret_addr;
		context = callchain_context(ip, kernel_start);
		if (context != last_context) {
			if (i >= sz - 1)
				break;
			chain->ips[i++] = context;
			last_context = context;
		}
		chain->ips[i] = ip;
	}

	chain->nr = i;
}

struct call_return_processor *
call_return_processor__new(int (*process)(struct call_return *cr, void *data),
			   void *data)
{
	struct call_return_processor *crp;

	crp = zalloc(sizeof(struct call_return_processor));
	if (!crp)
		return NULL;
	crp->cpr = call_path_root__new();
	if (!crp->cpr)
		goto out_free;
	crp->process = process;
	crp->data = data;
	return crp;

out_free:
	free(crp);
	return NULL;
}

void call_return_processor__free(struct call_return_processor *crp)
{
	if (crp) {
		call_path_root__free(crp->cpr);
		free(crp);
	}
}

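/*
 * Push an entry together with its call path.  Returns -ENOMEM if the call
 * path is missing or the stack cannot be grown.
 */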
static int thread_stack__push_cp(struct thread_stack *ts, u64 ret_addr,
				 u64 timestamp, u64 ref, struct call_path *cp,
				 bool no_call, bool trace_end)
{
	struct thread_stack_entry *tse;
	int err;

	if (!cp)
		return -ENOMEM;

	if (ts->cnt == ts->sz) {
		err = thread_stack__grow(ts);
		if (err)
			return err;
	}

	tse = &ts->stack[ts->cnt++];
	tse->ret_addr = ret_addr;
	tse->timestamp = timestamp;
	tse->ref = ref;
	tse->branch_count = ts->branch_count;
	tse->cp = cp;
	tse->no_call = no_call;
	tse->trace_end = trace_end;
	tse->non_call = false;

	return 0;
}

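/*
 * Pop the entry matching a 'return'.  The match may be the top of the stack,
 * the lone bottom entry with the same symbol, or an entry further down, in
 * which case the entries above it are reported as 'no return' calls.  Returns
 * 1 if no matching entry is found.
 */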
static int thread_stack__pop_cp(struct thread *thread, struct thread_stack *ts,
				u64 ret_addr, u64 timestamp, u64 ref,
				struct symbol *sym)
{
	int err;

	if (!ts->cnt)
		return 1;

	if (ts->cnt == 1) {
		struct thread_stack_entry *tse = &ts->stack[0];

		if (tse->cp->sym == sym)
			return thread_stack__call_return(thread, ts, --ts->cnt,
							 timestamp, ref, false);
	}

	if (ts->stack[ts->cnt - 1].ret_addr == ret_addr &&
	    !ts->stack[ts->cnt - 1].non_call) {
		return thread_stack__call_return(thread, ts, --ts->cnt,
						 timestamp, ref, false);
	} else {
		size_t i = ts->cnt - 1;

		while (i--) {
			if (ts->stack[i].ret_addr != ret_addr ||
			    ts->stack[i].non_call)
				continue;
			i += 1;
			while (ts->cnt > i) {
				err = thread_stack__call_return(thread, ts,
								--ts->cnt,
								timestamp, ref,
								true);
				if (err)
					return err;
			}
			return thread_stack__call_return(thread, ts, --ts->cnt,
							 timestamp, ref, false);
		}
	}

	return 1;
}

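/*
 * Seed an empty stack with the current location, flagged as a call that was
 * not seen (it presumably happened before the trace started).
 */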
static int thread_stack__bottom(struct thread_stack *ts,
				struct perf_sample *sample,
				struct addr_location *from_al,
				struct addr_location *to_al, u64 ref)
{
	struct call_path_root *cpr = ts->crp->cpr;
	struct call_path *cp;
	struct symbol *sym;
	u64 ip;

	if (sample->ip) {
		ip = sample->ip;
		sym = from_al->sym;
	} else if (sample->addr) {
		ip = sample->addr;
		sym = to_al->sym;
	} else {
		return 0;
	}

	cp = call_path__findnew(cpr, &cpr->call_path, sym, ip,
				ts->kernel_start);

	return thread_stack__push_cp(ts, ip, sample->time, ref, cp,
				     true, false);
}

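/*
 * Handle a 'return' whose matching 'call' was not found on the stack: pop
 * kernel entries when returning to userspace, then fix up the stack according
 * to whether the parent symbol matches the 'from' symbol.
 */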
static int thread_stack__no_call_return(struct thread *thread,
					struct thread_stack *ts,
					struct perf_sample *sample,
					struct addr_location *from_al,
					struct addr_location *to_al, u64 ref)
{
	struct call_path_root *cpr = ts->crp->cpr;
	struct call_path *root = &cpr->call_path;
	struct symbol *fsym = from_al->sym;
	struct symbol *tsym = to_al->sym;
	struct call_path *cp, *parent;
	u64 ks = ts->kernel_start;
	u64 addr = sample->addr;
	u64 tm = sample->time;
	u64 ip = sample->ip;
	int err;

	if (ip >= ks && addr < ks) {
		/* Return to userspace, so pop all kernel addresses */
		while (thread_stack__in_kernel(ts)) {
			err = thread_stack__call_return(thread, ts, --ts->cnt,
							tm, ref, true);
			if (err)
				return err;
		}

		/* If the stack is empty, push the userspace address */
		if (!ts->cnt) {
			cp = call_path__findnew(cpr, root, tsym, addr, ks);
			return thread_stack__push_cp(ts, 0, tm, ref, cp, true,
						     false);
		}
	} else if (thread_stack__in_kernel(ts) && ip < ks) {
		/* Return to userspace, so pop all kernel addresses */
		while (thread_stack__in_kernel(ts)) {
			err = thread_stack__call_return(thread, ts, --ts->cnt,
							tm, ref, true);
			if (err)
				return err;
		}
	}

	if (ts->cnt)
		parent = ts->stack[ts->cnt - 1].cp;
	else
		parent = root;

	if (parent->sym == from_al->sym) {
		/*
		 * At the bottom of the stack, assume the missing 'call' was
		 * before the trace started. So, pop the current symbol and push
		 * the 'to' symbol.
		 */
		if (ts->cnt == 1) {
			err = thread_stack__call_return(thread, ts, --ts->cnt,
							tm, ref, false);
			if (err)
				return err;
		}

		if (!ts->cnt) {
			cp = call_path__findnew(cpr, root, tsym, addr, ks);

			return thread_stack__push_cp(ts, addr, tm, ref, cp,
						     true, false);
		}

		/*
		 * Otherwise assume the 'return' is being used as a jump (e.g.
		 * retpoline) and just push the 'to' symbol.
		 */
		cp = call_path__findnew(cpr, parent, tsym, addr, ks);

		err = thread_stack__push_cp(ts, 0, tm, ref, cp, true, false);
		if (!err)
			ts->stack[ts->cnt - 1].non_call = true;

		return err;
	}

	/*
	 * Assume 'parent' has not yet returned, so push 'to', and then push and
	 * pop 'from'.
	 */

	cp = call_path__findnew(cpr, parent, tsym, addr, ks);

	err = thread_stack__push_cp(ts, addr, tm, ref, cp, true, false);
	if (err)
		return err;

	cp = call_path__findnew(cpr, cp, fsym, ip, ks);

	err = thread_stack__push_cp(ts, ip, tm, ref, cp, true, false);
	if (err)
		return err;

	return thread_stack__call_return(thread, ts, --ts->cnt, tm, ref, false);
}

static int thread_stack__trace_begin(struct thread *thread,
				     struct thread_stack *ts, u64 timestamp,
				     u64 ref)
{
	struct thread_stack_entry *tse;
	int err;

	if (!ts->cnt)
		return 0;

	/* Pop trace end */
	tse = &ts->stack[ts->cnt - 1];
	if (tse->trace_end) {
		err = thread_stack__call_return(thread, ts, --ts->cnt,
						timestamp, ref, false);
		if (err)
			return err;
	}

	return 0;
}

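/*
 * On 'trace end', push an entry for the expected return address
 * (ip + insn_len) so that the interrupted call can still be matched when
 * tracing resumes.
 */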
static int thread_stack__trace_end(struct thread_stack *ts,
				   struct perf_sample *sample, u64 ref)
{
	struct call_path_root *cpr = ts->crp->cpr;
	struct call_path *cp;
	u64 ret_addr;

	/* No point having 'trace end' on the bottom of the stack */
	if (!ts->cnt || (ts->cnt == 1 && ts->stack[0].ref == ref))
		return 0;

	cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp, NULL, 0,
				ts->kernel_start);

	ret_addr = sample->ip + sample->insn_len;

	return thread_stack__push_cp(ts, ret_addr, sample->time, ref, cp,
				     false, true);
}

static bool is_x86_retpoline(const char *name)
{
	const char *p = strstr(name, "__x86_indirect_thunk_");

	return p == name || !strcmp(name, "__indirect_thunk_start");
}

/*
 * x86 retpoline functions pollute the call graph. This function removes them.
 * This does not handle function return thunks, nor does it improve the
 * handling of inline thunks or extern thunks.
 */
static int thread_stack__x86_retpoline(struct thread_stack *ts,
				       struct perf_sample *sample,
				       struct addr_location *to_al)
{
	struct thread_stack_entry *tse = &ts->stack[ts->cnt - 1];
	struct call_path_root *cpr = ts->crp->cpr;
	struct symbol *sym = tse->cp->sym;
	struct symbol *tsym = to_al->sym;
	struct call_path *cp;

	if (sym && is_x86_retpoline(sym->name)) {
		/*
		 * This is an x86 retpoline fn. It pollutes the call graph by
		 * showing up everywhere there is an indirect branch, but does
		 * not itself mean anything. Here the top-of-stack is removed,
		 * by decrementing the stack count, and then further down, the
		 * resulting top-of-stack is replaced with the actual target.
		 * The result is that the retpoline functions will no longer
		 * appear in the call graph. Note this only affects the call
		 * graph, since all the original branches are left unchanged.
		 */
		ts->cnt -= 1;
		sym = ts->stack[ts->cnt - 2].cp->sym;
		if (sym && sym == tsym && to_al->addr != tsym->start) {
			/*
			 * Target is back to the middle of the symbol we came
			 * from so assume it is an indirect jmp and forget it
			 * altogether.
			 */
			ts->cnt -= 1;
			return 0;
		}
	} else if (sym && sym == tsym) {
		/*
		 * Target is back to the symbol we came from so assume it is an
		 * indirect jmp and forget it altogether.
		 */
		ts->cnt -= 1;
		return 0;
	}

	cp = call_path__findnew(cpr, ts->stack[ts->cnt - 2].cp, tsym,
				sample->addr, ts->kernel_start);
	if (!cp)
		return -ENOMEM;

	/* Replace the top-of-stack with the actual target */
	ts->stack[ts->cnt - 1].cp = cp;

	return 0;
}

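/*
 * Process one branch sample, reporting completed call/return pairs via 'crp'.
 * Usage sketch (illustrative, not taken from a real caller):
 *
 *	crp = call_return_processor__new(my_callback, my_data);
 *	...
 *	err = thread_stack__process(thread, comm, sample, from_al, to_al,
 *				    db_id, crp);
 *
 * where my_callback() receives a struct call_return for each completed pair,
 * and db_id stands for the caller's external reference.  Using this function
 * supersedes thread_stack__event() for the same thread.
 */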
int thread_stack__process(struct thread *thread, struct comm *comm,
			  struct perf_sample *sample,
			  struct addr_location *from_al,
			  struct addr_location *to_al, u64 ref,
			  struct call_return_processor *crp)
{
	struct thread_stack *ts = thread__stack(thread, sample->cpu);
	enum retpoline_state_t rstate;
	int err = 0;

	if (ts && !ts->crp) {
		/* Supersede thread_stack__event() */
		thread_stack__reset(thread, ts);
		ts = NULL;
	}

	if (!ts) {
		ts = thread_stack__new(thread, sample->cpu, crp);
		if (!ts)
			return -ENOMEM;
		ts->comm = comm;
	}

	rstate = ts->rstate;
	if (rstate == X86_RETPOLINE_DETECTED)
		ts->rstate = X86_RETPOLINE_POSSIBLE;

	/* Flush stack on exec */
	if (ts->comm != comm && thread->pid_ == thread->tid) {
		err = __thread_stack__flush(thread, ts);
		if (err)
			return err;
		ts->comm = comm;
	}

	/* If the stack is empty, put the current symbol on the stack */
	if (!ts->cnt) {
		err = thread_stack__bottom(ts, sample, from_al, to_al, ref);
		if (err)
			return err;
	}

	ts->branch_count += 1;
	ts->last_time = sample->time;

	if (sample->flags & PERF_IP_FLAG_CALL) {
		bool trace_end = sample->flags & PERF_IP_FLAG_TRACE_END;
		struct call_path_root *cpr = ts->crp->cpr;
		struct call_path *cp;
		u64 ret_addr;

		if (!sample->ip || !sample->addr)
			return 0;

		ret_addr = sample->ip + sample->insn_len;
		if (ret_addr == sample->addr)
			return 0; /* Zero-length calls are excluded */

		cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp,
					to_al->sym, sample->addr,
					ts->kernel_start);
		err = thread_stack__push_cp(ts, ret_addr, sample->time, ref,
					    cp, false, trace_end);

		/*
		 * A call to the same symbol, but not to the start of the
		 * symbol, may be the start of an x86 retpoline.
		 */
		if (!err && rstate == X86_RETPOLINE_POSSIBLE && to_al->sym &&
		    from_al->sym == to_al->sym &&
		    to_al->addr != to_al->sym->start)
			ts->rstate = X86_RETPOLINE_DETECTED;

	} else if (sample->flags & PERF_IP_FLAG_RETURN) {
		if (!sample->ip || !sample->addr)
			return 0;

		/* x86 retpoline 'return' doesn't match the stack */
		if (rstate == X86_RETPOLINE_DETECTED && ts->cnt > 2 &&
		    ts->stack[ts->cnt - 1].ret_addr != sample->addr)
			return thread_stack__x86_retpoline(ts, sample, to_al);

		err = thread_stack__pop_cp(thread, ts, sample->addr,
					   sample->time, ref, from_al->sym);
		if (err) {
			if (err < 0)
				return err;
			err = thread_stack__no_call_return(thread, ts, sample,
							   from_al, to_al, ref);
		}
	} else if (sample->flags & PERF_IP_FLAG_TRACE_BEGIN) {
		err = thread_stack__trace_begin(thread, ts, sample->time, ref);
	} else if (sample->flags & PERF_IP_FLAG_TRACE_END) {
		err = thread_stack__trace_end(ts, sample, ref);
	} else if (sample->flags & PERF_IP_FLAG_BRANCH &&
		   from_al->sym != to_al->sym && to_al->sym &&
		   to_al->addr == to_al->sym->start) {
		struct call_path_root *cpr = ts->crp->cpr;
		struct call_path *cp;

		/*
		 * The compiler might optimize a call/ret combination by making
		 * it a jmp. Make that visible by recording on the stack a
		 * branch to the start of a different symbol. Note, that means
		 * when a ret pops the stack, all jmps must be popped off first.
		 */
		cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp,
					to_al->sym, sample->addr,
					ts->kernel_start);
		err = thread_stack__push_cp(ts, 0, sample->time, ref, cp, false,
					    false);
		if (!err)
			ts->stack[ts->cnt - 1].non_call = true;
	}

	return err;
}

size_t thread_stack__depth(struct thread *thread, int cpu)
{
	struct thread_stack *ts = thread__stack(thread, cpu);

	if (!ts)
		return 0;
	return ts->cnt;
}