xref: /linux/tools/perf/builtin-sched.c (revision 500f5dd0a8b6f7bd174102587c7dff5a7d2fecbf)
1 // SPDX-License-Identifier: GPL-2.0
2 #include "builtin.h"
3 #include "perf.h"
4 #include "perf-sys.h"
5 
6 #include "util/cpumap.h"
7 #include "util/evlist.h"
8 #include "util/evsel.h"
9 #include "util/evsel_fprintf.h"
10 #include "util/mutex.h"
11 #include "util/symbol.h"
12 #include "util/thread.h"
13 #include "util/header.h"
14 #include "util/session.h"
15 #include "util/tool.h"
16 #include "util/cloexec.h"
17 #include "util/thread_map.h"
18 #include "util/color.h"
19 #include "util/stat.h"
20 #include "util/string2.h"
21 #include "util/callchain.h"
22 #include "util/time-utils.h"
23 
24 #include <subcmd/pager.h>
25 #include <subcmd/parse-options.h>
26 #include "util/trace-event.h"
27 
28 #include "util/debug.h"
29 #include "util/event.h"
30 #include "util/util.h"
31 #include "util/synthetic-events.h"
32 #include "util/target.h"
33 
34 #include <linux/kernel.h>
35 #include <linux/log2.h>
36 #include <linux/zalloc.h>
37 #include <sys/prctl.h>
38 #include <sys/resource.h>
39 #include <sys/wait.h>
40 #include <inttypes.h>
41 
42 #include <errno.h>
43 #include <semaphore.h>
44 #include <pthread.h>
45 #include <math.h>
46 #include <api/fs/fs.h>
47 #include <perf/cpumap.h>
48 #include <linux/time64.h>
49 #include <linux/err.h>
50 
51 #include <linux/ctype.h>
52 
53 #define PR_SET_NAME		15               /* Set process name */
54 #define MAX_CPUS		4096
55 #define COMM_LEN		20
56 #define SYM_LEN			129
57 #define MAX_PID			1024000
58 #define PID_MAX_LIMIT		4194304 /* kernel limit on 64-bit */
59 #define MAX_PRIO		140
60 #define SEP_LEN			100
61 
62 static const char *cpu_list;
63 static struct perf_cpu_map *user_requested_cpus;
64 static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
65 
66 struct sched_atom;
67 
68 struct task_desc {
69 	unsigned long		nr;
70 	unsigned long		pid;
71 	char			comm[COMM_LEN];
72 
73 	unsigned long		nr_events;
74 	unsigned long		curr_event;
75 	struct sched_atom	**atoms;
76 
77 	pthread_t		thread;
78 
79 	sem_t			ready_for_work;
80 	sem_t			work_done_sem;
81 
82 	u64			cpu_usage;
83 };
84 
85 enum sched_event_type {
86 	SCHED_EVENT_RUN,
87 	SCHED_EVENT_SLEEP,
88 	SCHED_EVENT_WAKEUP,
89 };
90 
91 struct sched_atom {
92 	enum sched_event_type	type;
93 	u64			timestamp;
94 	u64			duration;
95 	unsigned long		nr;
96 	sem_t			*wait_sem;
97 	struct task_desc	*wakee;
98 };
99 
100 enum thread_state {
101 	THREAD_SLEEPING = 0,
102 	THREAD_WAIT_CPU,
103 	THREAD_SCHED_IN,
104 	THREAD_IGNORE
105 };
106 
107 struct work_atom {
108 	struct list_head	list;
109 	enum thread_state	state;
110 	u64			sched_out_time;
111 	u64			wake_up_time;
112 	u64			sched_in_time;
113 	u64			runtime;
114 };
115 
116 struct work_atoms {
117 	struct list_head	work_list;
118 	struct thread		*thread;
119 	struct rb_node		node;
120 	u64			max_lat;
121 	u64			max_lat_start;
122 	u64			max_lat_end;
123 	u64			total_lat;
124 	u64			nb_atoms;
125 	u64			total_runtime;
126 	int			num_merged;
127 };
128 
129 typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
130 
131 struct perf_sched;
132 
133 struct trace_sched_handler {
134 	int (*switch_event)(struct perf_sched *sched, struct perf_sample *sample,
135 			    struct machine *machine);
136 
137 	int (*runtime_event)(struct perf_sched *sched, struct perf_sample *sample,
138 			     struct machine *machine);
139 
140 	int (*wakeup_event)(struct perf_sched *sched, struct perf_sample *sample,
141 			    struct machine *machine);
142 
143 	/* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
144 	int (*fork_event)(struct perf_sched *sched, union perf_event *event,
145 			  struct machine *machine);
146 
147 	int (*migrate_task_event)(struct perf_sched *sched,
148 				  struct perf_sample *sample,
149 				  struct machine *machine);
150 };
151 
152 #define COLOR_PIDS PERF_COLOR_BLUE
153 #define COLOR_CPUS PERF_COLOR_BG_RED
154 
155 struct perf_sched_map {
156 	DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS);
157 	struct perf_cpu		*comp_cpus;
158 	bool			 comp;
159 	struct perf_thread_map *color_pids;
160 	const char		*color_pids_str;
161 	struct perf_cpu_map	*color_cpus;
162 	const char		*color_cpus_str;
163 	const char		*task_name;
164 	struct strlist		*task_names;
165 	bool			fuzzy;
166 	struct perf_cpu_map	*cpus;
167 	const char		*cpus_str;
168 };
169 
170 struct perf_sched {
171 	struct perf_tool tool;
172 	const char	 *sort_order;
173 	unsigned long	 nr_tasks;
174 	struct task_desc **pid_to_task;
175 	struct task_desc **tasks;
176 	const struct trace_sched_handler *tp_handler;
177 	struct mutex	 start_work_mutex;
178 	struct mutex	 work_done_wait_mutex;
179 	int		 profile_cpu;
180 /*
181  * Track the current task - that way we can know whether there's any
182  * weird events, such as a task being switched away that is not current.
183  */
184 	struct perf_cpu	 max_cpu;
185 	u32		 *curr_pid;
186 	struct thread	 **curr_thread;
187 	struct thread	 **curr_out_thread;
188 	char		 next_shortname1;
189 	char		 next_shortname2;
190 	unsigned int	 replay_repeat;
191 	unsigned long	 nr_run_events;
192 	unsigned long	 nr_sleep_events;
193 	unsigned long	 nr_wakeup_events;
194 	unsigned long	 nr_sleep_corrections;
195 	unsigned long	 nr_run_events_optimized;
196 	unsigned long	 targetless_wakeups;
197 	unsigned long	 multitarget_wakeups;
198 	unsigned long	 nr_runs;
199 	unsigned long	 nr_timestamps;
200 	unsigned long	 nr_unordered_timestamps;
201 	unsigned long	 nr_context_switch_bugs;
202 	unsigned long	 nr_events;
203 	unsigned long	 nr_lost_chunks;
204 	unsigned long	 nr_lost_events;
205 	u64		 run_measurement_overhead;
206 	u64		 sleep_measurement_overhead;
207 	u64		 start_time;
208 	u64		 cpu_usage;
209 	u64		 runavg_cpu_usage;
210 	u64		 parent_cpu_usage;
211 	u64		 runavg_parent_cpu_usage;
212 	u64		 sum_runtime;
213 	u64		 sum_fluct;
214 	u64		 run_avg;
215 	u64		 all_runtime;
216 	u64		 all_count;
217 	u64		 *cpu_last_switched;
218 	struct rb_root_cached atom_root, sorted_atom_root, merged_atom_root;
219 	struct list_head sort_list, cmp_pid;
220 	bool force;
221 	bool skip_merge;
222 	struct perf_sched_map map;
223 
224 	/* options for timehist command */
225 	bool		summary;
226 	bool		summary_only;
227 	bool		idle_hist;
228 	bool		show_callchain;
229 	unsigned int	max_stack;
230 	bool		show_cpu_visual;
231 	bool		show_wakeups;
232 	bool		show_next;
233 	bool		show_migrations;
234 	bool		pre_migrations;
235 	bool		show_state;
236 	bool		show_prio;
237 	u64		skipped_samples;
238 	const char	*time_str;
239 	struct perf_time_interval ptime;
240 	struct perf_time_interval hist_time;
241 	volatile bool   thread_funcs_exit;
242 	const char	*prio_str;
243 	DECLARE_BITMAP(prio_bitmap, MAX_PRIO);
244 
245 	struct perf_session *session;
246 	struct perf_data *data;
247 };
248 
249 /* per thread run time data */
250 struct thread_runtime {
251 	u64 last_time;      /* time of previous sched in/out event */
252 	u64 dt_run;         /* run time */
253 	u64 dt_sleep;       /* time between CPU access by sleep (off cpu) */
254 	u64 dt_iowait;      /* time between CPU access by iowait (off cpu) */
255 	u64 dt_preempt;     /* time between CPU access by preempt (off cpu) */
256 	u64 dt_delay;       /* time between wakeup and sched-in */
257 	u64 dt_pre_mig;     /* time between migration and wakeup */
258 	u64 ready_to_run;   /* time of wakeup */
259 	u64 migrated;	    /* time when a thread is migrated */
260 
261 	struct stats run_stats;
262 	u64 total_run_time;
263 	u64 total_sleep_time;
264 	u64 total_iowait_time;
265 	u64 total_preempt_time;
266 	u64 total_delay_time;
267 	u64 total_pre_mig_time;
268 
269 	char last_state;
270 
271 	char shortname[3];
272 	bool comm_changed;
273 
274 	u64 migrations;
275 
276 	int prio;
277 	bool color;
278 };
279 
280 /* per event run time data */
281 struct evsel_runtime {
282 	u64 *last_time; /* time this event was last seen per cpu */
283 	u32 ncpu;       /* highest cpu slot allocated */
284 };
285 
286 /* per cpu idle time data */
287 struct idle_thread_runtime {
288 	struct thread_runtime	tr;
289 	struct thread		*last_thread;
290 	struct rb_root_cached	sorted_root;
291 	struct callchain_root	callchain;
292 	struct callchain_cursor	cursor;
293 };
294 
295 /* track idle times per cpu */
296 static struct thread **idle_threads;
297 static int idle_max_cpu;
298 static char idle_comm[] = "<idle>";
299 
300 static u64 get_nsecs(void)
301 {
302 	struct timespec ts;
303 
304 	clock_gettime(CLOCK_MONOTONIC, &ts);
305 
306 	return ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
307 }
308 
309 static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
310 {
311 	u64 T0 = get_nsecs(), T1;
312 
313 	do {
314 		T1 = get_nsecs();
315 	} while (T1 + sched->run_measurement_overhead < T0 + nsecs);
316 }
317 
318 static void sleep_nsecs(u64 nsecs)
319 {
320 	struct timespec ts;
321 
322 	ts.tv_nsec = nsecs % 999999999;
323 	ts.tv_sec = nsecs / 999999999;
324 
325 	nanosleep(&ts, NULL);
326 }
327 
328 static void calibrate_run_measurement_overhead(struct perf_sched *sched)
329 {
330 	u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
331 	int i;
332 
333 	for (i = 0; i < 10; i++) {
334 		T0 = get_nsecs();
335 		burn_nsecs(sched, 0);
336 		T1 = get_nsecs();
337 		delta = T1-T0;
338 		min_delta = min(min_delta, delta);
339 	}
340 	sched->run_measurement_overhead = min_delta;
341 
342 	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
343 }
344 
345 static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
346 {
347 	u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
348 	int i;
349 
350 	for (i = 0; i < 10; i++) {
351 		T0 = get_nsecs();
352 		sleep_nsecs(10000);
353 		T1 = get_nsecs();
354 		delta = T1-T0;
355 		min_delta = min(min_delta, delta);
356 	}
357 	min_delta -= 10000;
358 	sched->sleep_measurement_overhead = min_delta;
359 
360 	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
361 }
362 
363 static struct sched_atom *
364 get_new_event(struct task_desc *task, u64 timestamp)
365 {
366 	struct sched_atom *event = zalloc(sizeof(*event));
367 	unsigned long idx = task->nr_events;
368 	size_t size;
369 	struct sched_atom **atoms_p;
370 
371 	if (event == NULL) {
372 		pr_err("ERROR: sched: failed to allocate event\n");
373 		return NULL;
374 	}
375 
376 	event->timestamp = timestamp;
377 	event->nr = idx;
378 
379 	size = sizeof(struct sched_atom *) * (task->nr_events + 1);
380 	atoms_p = realloc(task->atoms, size);
381 	if (!atoms_p) {
382 		pr_err("ERROR: sched: failed to grow atoms array\n");
383 		free(event);
384 		return NULL;
385 	}
386 	task->atoms = atoms_p;
387 	task->nr_events++;
388 
389 	task->atoms[idx] = event;
390 
391 	return event;
392 }
393 
394 static struct sched_atom *last_event(struct task_desc *task)
395 {
396 	if (!task->nr_events)
397 		return NULL;
398 
399 	return task->atoms[task->nr_events - 1];
400 }
401 
402 static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
403 				u64 timestamp, u64 duration)
404 {
405 	struct sched_atom *event, *curr_event = last_event(task);
406 
407 	/*
408 	 * optimize an existing RUN event by merging this one
409 	 * to it:
410 	 */
411 	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
412 		sched->nr_run_events_optimized++;
413 		curr_event->duration += duration;
414 		return;
415 	}
416 
417 	event = get_new_event(task, timestamp);
418 	if (event == NULL)
419 		return;
420 
421 	event->type = SCHED_EVENT_RUN;
422 	event->duration = duration;
423 
424 	sched->nr_run_events++;
425 }
426 
427 static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
428 				   u64 timestamp, struct task_desc *wakee)
429 {
430 	struct sched_atom *event, *wakee_event;
431 
432 	event = get_new_event(task, timestamp);
433 	if (event == NULL)
434 		return;
435 	event->type = SCHED_EVENT_WAKEUP;
436 	event->wakee = wakee;
437 
438 	wakee_event = last_event(wakee);
439 	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
440 		sched->targetless_wakeups++;
441 		return;
442 	}
443 	if (wakee_event->wait_sem) {
444 		sched->multitarget_wakeups++;
445 		return;
446 	}
447 
448 	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
449 	if (!wakee_event->wait_sem) {
450 		pr_err("ERROR: sched: failed to allocate semaphore\n");
451 		return;
452 	}
453 	sem_init(wakee_event->wait_sem, 0, 0);
454 	event->wait_sem = wakee_event->wait_sem;
455 
456 	sched->nr_wakeup_events++;
457 }
458 
459 static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
460 				  u64 timestamp)
461 {
462 	struct sched_atom *event = get_new_event(task, timestamp);
463 
464 	if (event == NULL)
465 		return;
466 
467 	event->type = SCHED_EVENT_SLEEP;
468 
469 	sched->nr_sleep_events++;
470 }
471 
472 static struct task_desc *register_pid(struct perf_sched *sched,
473 				      unsigned long pid, const char *comm)
474 {
475 	struct task_desc *task, **tasks_p;
476 	static int pid_max;
477 
478 	/* perf.data is untrusted — cap pid to prevent overflow in size calculations */
479 	if (pid >= PID_MAX_LIMIT) {
480 		pr_err("pid %lu exceeds limit %d, skipping\n", pid, PID_MAX_LIMIT);
481 		return NULL;
482 	}
483 
484 	if (sched->pid_to_task == NULL) {
485 		if (sysctl__read_int("kernel/pid_max", &pid_max) < 0)
486 			pid_max = MAX_PID;
487 		sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *));
488 		if (sched->pid_to_task == NULL)
489 			return NULL;
490 	}
491 	if (pid >= (unsigned long)pid_max) {
492 		void *p = realloc(sched->pid_to_task, (pid + 1) * sizeof(struct task_desc *));
493 
494 		if (p == NULL)
495 			return NULL;
496 		sched->pid_to_task = p;
497 		while (pid >= (unsigned long)pid_max)
498 			sched->pid_to_task[pid_max++] = NULL;
499 	}
500 
501 	task = sched->pid_to_task[pid];
502 
503 	if (task)
504 		return task;
505 
506 	task = zalloc(sizeof(*task));
507 	if (task == NULL)
508 		return NULL;
509 	task->pid = pid;
510 	if (comm)
511 		strlcpy(task->comm, comm, sizeof(task->comm));
512 	/*
513 	 * every task starts in sleeping state - this gets ignored
514 	 * if there's no wakeup pointing to this sleep state:
515 	 */
516 	add_sched_event_sleep(sched, task, 0);
517 
518 	sched->pid_to_task[pid] = task;
519 	tasks_p = realloc(sched->tasks, (sched->nr_tasks + 1) * sizeof(struct task_desc *));
520 	if (!tasks_p)
521 		return NULL;
522 	sched->tasks = tasks_p;
523 	sched->tasks[sched->nr_tasks] = task;
524 	task->nr = sched->nr_tasks++;
525 
526 	if (verbose > 0)
527 		printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);
528 
529 	return task;
530 }
531 
532 
533 static void print_task_traces(struct perf_sched *sched)
534 {
535 	struct task_desc *task;
536 	unsigned long i;
537 
538 	for (i = 0; i < sched->nr_tasks; i++) {
539 		task = sched->tasks[i];
540 		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
541 			task->nr, task->comm, task->pid, task->nr_events);
542 	}
543 }
544 
545 static void add_cross_task_wakeups(struct perf_sched *sched)
546 {
547 	struct task_desc *task1, *task2;
548 	unsigned long i, j;
549 
550 	for (i = 0; i < sched->nr_tasks; i++) {
551 		task1 = sched->tasks[i];
552 		j = i + 1;
553 		if (j == sched->nr_tasks)
554 			j = 0;
555 		task2 = sched->tasks[j];
556 		add_sched_event_wakeup(sched, task1, 0, task2);
557 	}
558 }
559 
560 static void perf_sched__process_event(struct perf_sched *sched,
561 				      struct sched_atom *atom)
562 {
563 	int ret = 0;
564 
565 	switch (atom->type) {
566 		case SCHED_EVENT_RUN:
567 			burn_nsecs(sched, atom->duration);
568 			break;
569 		case SCHED_EVENT_SLEEP:
570 			if (atom->wait_sem)
571 				ret = sem_wait(atom->wait_sem);
572 			BUG_ON(ret);
573 			break;
574 		case SCHED_EVENT_WAKEUP:
575 			if (atom->wait_sem)
576 				ret = sem_post(atom->wait_sem);
577 			BUG_ON(ret);
578 			break;
579 		default:
580 			BUG_ON(1);
581 	}
582 }
583 
584 static u64 get_cpu_usage_nsec_parent(void)
585 {
586 	struct rusage ru;
587 	u64 sum;
588 	int err;
589 
590 	err = getrusage(RUSAGE_SELF, &ru);
591 	BUG_ON(err);
592 
593 	sum =  ru.ru_utime.tv_sec * NSEC_PER_SEC + ru.ru_utime.tv_usec * NSEC_PER_USEC;
594 	sum += ru.ru_stime.tv_sec * NSEC_PER_SEC + ru.ru_stime.tv_usec * NSEC_PER_USEC;
595 
596 	return sum;
597 }
598 
599 static int self_open_counters(struct perf_sched *sched, unsigned long cur_task)
600 {
601 	struct perf_event_attr attr;
602 	char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE];
603 	int fd;
604 	struct rlimit limit;
605 	bool need_privilege = false;
606 
607 	memset(&attr, 0, sizeof(attr));
608 
609 	attr.type = PERF_TYPE_SOFTWARE;
610 	attr.config = PERF_COUNT_SW_TASK_CLOCK;
611 
612 force_again:
613 	fd = sys_perf_event_open(&attr, 0, -1, -1,
614 				 perf_event_open_cloexec_flag());
615 
616 	if (fd < 0) {
617 		if (errno == EMFILE) {
618 			if (sched->force) {
619 				BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1);
620 				limit.rlim_cur += sched->nr_tasks - cur_task;
621 				if (limit.rlim_cur > limit.rlim_max) {
622 					limit.rlim_max = limit.rlim_cur;
623 					need_privilege = true;
624 				}
625 				if (setrlimit(RLIMIT_NOFILE, &limit) == -1) {
626 					if (need_privilege && errno == EPERM)
627 						strcpy(info, "Need privilege\n");
628 				} else
629 					goto force_again;
630 			} else
631 				strcpy(info, "Have a try with -f option\n");
632 		}
633 		pr_err("Error: sys_perf_event_open() syscall returned "
634 		       "with %d (%s)\n%s", fd,
635 		       str_error_r(errno, sbuf, sizeof(sbuf)), info);
636 		exit(EXIT_FAILURE);
637 	}
638 	return fd;
639 }
640 
641 static u64 get_cpu_usage_nsec_self(int fd)
642 {
643 	u64 runtime;
644 	int ret;
645 
646 	ret = read(fd, &runtime, sizeof(runtime));
647 	BUG_ON(ret != sizeof(runtime));
648 
649 	return runtime;
650 }
651 
652 struct sched_thread_parms {
653 	struct task_desc  *task;
654 	struct perf_sched *sched;
655 	int fd;
656 };
657 
658 static void *thread_func(void *ctx)
659 {
660 	struct sched_thread_parms *parms = ctx;
661 	struct task_desc *this_task = parms->task;
662 	struct perf_sched *sched = parms->sched;
663 	u64 cpu_usage_0, cpu_usage_1;
664 	unsigned long i, ret;
665 	char comm2[22];
666 	int fd = parms->fd;
667 
668 	zfree(&parms);
669 
670 	sprintf(comm2, ":%s", this_task->comm);
671 	prctl(PR_SET_NAME, comm2);
672 	if (fd < 0)
673 		return NULL;
674 
675 	while (!sched->thread_funcs_exit) {
676 		ret = sem_post(&this_task->ready_for_work);
677 		BUG_ON(ret);
678 		mutex_lock(&sched->start_work_mutex);
679 		mutex_unlock(&sched->start_work_mutex);
680 
681 		cpu_usage_0 = get_cpu_usage_nsec_self(fd);
682 
683 		for (i = 0; i < this_task->nr_events; i++) {
684 			this_task->curr_event = i;
685 			perf_sched__process_event(sched, this_task->atoms[i]);
686 		}
687 
688 		cpu_usage_1 = get_cpu_usage_nsec_self(fd);
689 		this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
690 		ret = sem_post(&this_task->work_done_sem);
691 		BUG_ON(ret);
692 
693 		mutex_lock(&sched->work_done_wait_mutex);
694 		mutex_unlock(&sched->work_done_wait_mutex);
695 	}
696 	return NULL;
697 }
698 
699 static void create_tasks(struct perf_sched *sched)
700 	EXCLUSIVE_LOCK_FUNCTION(sched->start_work_mutex)
701 	EXCLUSIVE_LOCK_FUNCTION(sched->work_done_wait_mutex)
702 {
703 	struct task_desc *task;
704 	pthread_attr_t attr;
705 	unsigned long i;
706 	int err;
707 
708 	err = pthread_attr_init(&attr);
709 	BUG_ON(err);
710 	err = pthread_attr_setstacksize(&attr,
711 			(size_t) max(16 * 1024, (int)PTHREAD_STACK_MIN));
712 	BUG_ON(err);
713 	mutex_lock(&sched->start_work_mutex);
714 	mutex_lock(&sched->work_done_wait_mutex);
715 	for (i = 0; i < sched->nr_tasks; i++) {
716 		struct sched_thread_parms *parms = malloc(sizeof(*parms));
717 		BUG_ON(parms == NULL);
718 		parms->task = task = sched->tasks[i];
719 		parms->sched = sched;
720 		parms->fd = self_open_counters(sched, i);
721 		sem_init(&task->ready_for_work, 0, 0);
722 		sem_init(&task->work_done_sem, 0, 0);
723 		task->curr_event = 0;
724 		err = pthread_create(&task->thread, &attr, thread_func, parms);
725 		BUG_ON(err);
726 	}
727 }
728 
729 static void destroy_tasks(struct perf_sched *sched)
730 	UNLOCK_FUNCTION(sched->start_work_mutex)
731 	UNLOCK_FUNCTION(sched->work_done_wait_mutex)
732 {
733 	struct task_desc *task;
734 	unsigned long i;
735 	int err;
736 
737 	mutex_unlock(&sched->start_work_mutex);
738 	mutex_unlock(&sched->work_done_wait_mutex);
739 	/* Get rid of threads so they won't be upset by mutex destrunction */
740 	for (i = 0; i < sched->nr_tasks; i++) {
741 		task = sched->tasks[i];
742 		err = pthread_join(task->thread, NULL);
743 		BUG_ON(err);
744 		sem_destroy(&task->ready_for_work);
745 		sem_destroy(&task->work_done_sem);
746 	}
747 }
748 
749 static void wait_for_tasks(struct perf_sched *sched)
750 	EXCLUSIVE_LOCKS_REQUIRED(sched->work_done_wait_mutex)
751 	EXCLUSIVE_LOCKS_REQUIRED(sched->start_work_mutex)
752 {
753 	u64 cpu_usage_0, cpu_usage_1;
754 	struct task_desc *task;
755 	unsigned long i, ret;
756 
757 	sched->start_time = get_nsecs();
758 	sched->cpu_usage = 0;
759 	mutex_unlock(&sched->work_done_wait_mutex);
760 
761 	for (i = 0; i < sched->nr_tasks; i++) {
762 		task = sched->tasks[i];
763 		ret = sem_wait(&task->ready_for_work);
764 		BUG_ON(ret);
765 		sem_init(&task->ready_for_work, 0, 0);
766 	}
767 	mutex_lock(&sched->work_done_wait_mutex);
768 
769 	cpu_usage_0 = get_cpu_usage_nsec_parent();
770 
771 	mutex_unlock(&sched->start_work_mutex);
772 
773 	for (i = 0; i < sched->nr_tasks; i++) {
774 		task = sched->tasks[i];
775 		ret = sem_wait(&task->work_done_sem);
776 		BUG_ON(ret);
777 		sem_init(&task->work_done_sem, 0, 0);
778 		sched->cpu_usage += task->cpu_usage;
779 		task->cpu_usage = 0;
780 	}
781 
782 	cpu_usage_1 = get_cpu_usage_nsec_parent();
783 	if (!sched->runavg_cpu_usage)
784 		sched->runavg_cpu_usage = sched->cpu_usage;
785 	sched->runavg_cpu_usage = (sched->runavg_cpu_usage * (sched->replay_repeat - 1) + sched->cpu_usage) / sched->replay_repeat;
786 
787 	sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
788 	if (!sched->runavg_parent_cpu_usage)
789 		sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
790 	sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) +
791 					 sched->parent_cpu_usage)/sched->replay_repeat;
792 
793 	mutex_lock(&sched->start_work_mutex);
794 
795 	for (i = 0; i < sched->nr_tasks; i++) {
796 		task = sched->tasks[i];
797 		task->curr_event = 0;
798 	}
799 }
800 
801 static void run_one_test(struct perf_sched *sched)
802 	EXCLUSIVE_LOCKS_REQUIRED(sched->work_done_wait_mutex)
803 	EXCLUSIVE_LOCKS_REQUIRED(sched->start_work_mutex)
804 {
805 	u64 T0, T1, delta, avg_delta, fluct;
806 
807 	T0 = get_nsecs();
808 	wait_for_tasks(sched);
809 	T1 = get_nsecs();
810 
811 	delta = T1 - T0;
812 	sched->sum_runtime += delta;
813 	sched->nr_runs++;
814 
815 	avg_delta = sched->sum_runtime / sched->nr_runs;
816 	if (delta < avg_delta)
817 		fluct = avg_delta - delta;
818 	else
819 		fluct = delta - avg_delta;
820 	sched->sum_fluct += fluct;
821 	if (!sched->run_avg)
822 		sched->run_avg = delta;
823 	sched->run_avg = (sched->run_avg * (sched->replay_repeat - 1) + delta) / sched->replay_repeat;
824 
825 	printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / NSEC_PER_MSEC);
826 
827 	printf("ravg: %0.2f, ", (double)sched->run_avg / NSEC_PER_MSEC);
828 
829 	printf("cpu: %0.2f / %0.2f",
830 		(double)sched->cpu_usage / NSEC_PER_MSEC, (double)sched->runavg_cpu_usage / NSEC_PER_MSEC);
831 
832 #if 0
833 	/*
834 	 * rusage statistics done by the parent, these are less
835 	 * accurate than the sched->sum_exec_runtime based statistics:
836 	 */
837 	printf(" [%0.2f / %0.2f]",
838 		(double)sched->parent_cpu_usage / NSEC_PER_MSEC,
839 		(double)sched->runavg_parent_cpu_usage / NSEC_PER_MSEC);
840 #endif
841 
842 	printf("\n");
843 
844 	if (sched->nr_sleep_corrections)
845 		printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
846 	sched->nr_sleep_corrections = 0;
847 }
848 
849 static void test_calibrations(struct perf_sched *sched)
850 {
851 	u64 T0, T1;
852 
853 	T0 = get_nsecs();
854 	burn_nsecs(sched, NSEC_PER_MSEC);
855 	T1 = get_nsecs();
856 
857 	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);
858 
859 	T0 = get_nsecs();
860 	sleep_nsecs(NSEC_PER_MSEC);
861 	T1 = get_nsecs();
862 
863 	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
864 }
865 
866 static int
867 replay_wakeup_event(struct perf_sched *sched,
868 		    struct perf_sample *sample,
869 		    struct machine *machine __maybe_unused)
870 {
871 	const char *comm = perf_sample__strval(sample, "comm");
872 	const u32 pid	 = perf_sample__intval(sample, "pid");
873 	struct task_desc *waker, *wakee;
874 
875 	if (verbose > 0) {
876 		printf("sched_wakeup event %p\n", sample->evsel);
877 
878 		printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
879 	}
880 
881 	waker = register_pid(sched, sample->tid, "<unknown>");
882 	wakee = register_pid(sched, pid, comm);
883 	if (waker == NULL || wakee == NULL)
884 		return -1;
885 
886 	add_sched_event_wakeup(sched, waker, sample->time, wakee);
887 	return 0;
888 }
889 
890 static int replay_switch_event(struct perf_sched *sched,
891 			       struct perf_sample *sample,
892 			       struct machine *machine __maybe_unused)
893 {
894 	const char *prev_comm  = perf_sample__strval(sample, "prev_comm"),
895 		   *next_comm  = perf_sample__strval(sample, "next_comm");
896 	const u32 prev_pid = perf_sample__intval(sample, "prev_pid"),
897 		  next_pid = perf_sample__intval(sample, "next_pid");
898 	struct task_desc *prev, __maybe_unused *next;
899 	u64 timestamp0, timestamp = sample->time;
900 	int cpu = sample->cpu;
901 	s64 delta;
902 
903 	if (verbose > 0)
904 		printf("sched_switch event %p\n", sample->evsel);
905 
906 	if (cpu >= MAX_CPUS || cpu < 0)
907 		return 0;
908 
909 	timestamp0 = sched->cpu_last_switched[cpu];
910 	if (timestamp0)
911 		delta = timestamp - timestamp0;
912 	else
913 		delta = 0;
914 
915 	if (delta < 0) {
916 		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
917 		return -1;
918 	}
919 
920 	pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
921 		 prev_comm, prev_pid, next_comm, next_pid, delta);
922 
923 	prev = register_pid(sched, prev_pid, prev_comm);
924 	next = register_pid(sched, next_pid, next_comm);
925 	if (prev == NULL || next == NULL)
926 		return -1;
927 
928 	sched->cpu_last_switched[cpu] = timestamp;
929 
930 	add_sched_event_run(sched, prev, timestamp, delta);
931 	add_sched_event_sleep(sched, prev, timestamp);
932 
933 	return 0;
934 }
935 
936 static int replay_fork_event(struct perf_sched *sched,
937 			     union perf_event *event,
938 			     struct machine *machine)
939 {
940 	struct thread *child, *parent;
941 
942 	child = machine__findnew_thread(machine, event->fork.pid,
943 					event->fork.tid);
944 	parent = machine__findnew_thread(machine, event->fork.ppid,
945 					 event->fork.ptid);
946 
947 	if (child == NULL || parent == NULL) {
948 		pr_debug("thread does not exist on fork event: child %p, parent %p\n",
949 				 child, parent);
950 		goto out_put;
951 	}
952 
953 	if (verbose > 0) {
954 		printf("fork event\n");
955 		printf("... parent: %s/%d\n", thread__comm_str(parent), thread__tid(parent));
956 		printf("...  child: %s/%d\n", thread__comm_str(child), thread__tid(child));
957 	}
958 
959 	register_pid(sched, thread__tid(parent), thread__comm_str(parent));
960 	register_pid(sched, thread__tid(child), thread__comm_str(child));
961 out_put:
962 	thread__put(child);
963 	thread__put(parent);
964 	return 0;
965 }
966 
967 struct sort_dimension {
968 	const char		*name;
969 	sort_fn_t		cmp;
970 	struct list_head	list;
971 };
972 
973 static inline void init_prio(struct thread_runtime *r)
974 {
975 	r->prio = -1;
976 }
977 
978 /*
979  * handle runtime stats saved per thread
980  */
981 static struct thread_runtime *thread__init_runtime(struct thread *thread)
982 {
983 	struct thread_runtime *r;
984 
985 	r = zalloc(sizeof(struct thread_runtime));
986 	if (!r)
987 		return NULL;
988 
989 	init_stats(&r->run_stats);
990 	init_prio(r);
991 	thread__set_priv(thread, r);
992 
993 	return r;
994 }
995 
996 static struct thread_runtime *thread__get_runtime(struct thread *thread)
997 {
998 	struct thread_runtime *tr;
999 
1000 	tr = thread__priv(thread);
1001 	if (tr == NULL) {
1002 		tr = thread__init_runtime(thread);
1003 		if (tr == NULL)
1004 			pr_debug("Failed to malloc memory for runtime data.\n");
1005 	}
1006 
1007 	return tr;
1008 }
1009 
1010 static int
1011 thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
1012 {
1013 	struct sort_dimension *sort;
1014 	int ret = 0;
1015 
1016 	BUG_ON(list_empty(list));
1017 
1018 	list_for_each_entry(sort, list, list) {
1019 		ret = sort->cmp(l, r);
1020 		if (ret)
1021 			return ret;
1022 	}
1023 
1024 	return ret;
1025 }
1026 
1027 static struct work_atoms *
1028 thread_atoms_search(struct rb_root_cached *root, struct thread *thread,
1029 			 struct list_head *sort_list)
1030 {
1031 	struct rb_node *node = root->rb_root.rb_node;
1032 	struct work_atoms key = { .thread = thread };
1033 
1034 	while (node) {
1035 		struct work_atoms *atoms;
1036 		int cmp;
1037 
1038 		atoms = container_of(node, struct work_atoms, node);
1039 
1040 		cmp = thread_lat_cmp(sort_list, &key, atoms);
1041 		if (cmp > 0)
1042 			node = node->rb_left;
1043 		else if (cmp < 0)
1044 			node = node->rb_right;
1045 		else {
1046 			BUG_ON(!RC_CHK_EQUAL(thread, atoms->thread));
1047 			return atoms;
1048 		}
1049 	}
1050 	return NULL;
1051 }
1052 
1053 static void
1054 __thread_latency_insert(struct rb_root_cached *root, struct work_atoms *data,
1055 			 struct list_head *sort_list)
1056 {
1057 	struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL;
1058 	bool leftmost = true;
1059 
1060 	while (*new) {
1061 		struct work_atoms *this;
1062 		int cmp;
1063 
1064 		this = container_of(*new, struct work_atoms, node);
1065 		parent = *new;
1066 
1067 		cmp = thread_lat_cmp(sort_list, data, this);
1068 
1069 		if (cmp > 0)
1070 			new = &((*new)->rb_left);
1071 		else {
1072 			new = &((*new)->rb_right);
1073 			leftmost = false;
1074 		}
1075 	}
1076 
1077 	rb_link_node(&data->node, parent, new);
1078 	rb_insert_color_cached(&data->node, root, leftmost);
1079 }
1080 
1081 static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
1082 {
1083 	struct work_atoms *atoms = zalloc(sizeof(*atoms));
1084 	if (!atoms) {
1085 		pr_err("No memory at %s\n", __func__);
1086 		return -1;
1087 	}
1088 
1089 	atoms->thread = thread__get(thread);
1090 	INIT_LIST_HEAD(&atoms->work_list);
1091 	__thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
1092 	return 0;
1093 }
1094 
1095 static int
1096 add_sched_out_event(struct work_atoms *atoms,
1097 		    char run_state,
1098 		    u64 timestamp)
1099 {
1100 	struct work_atom *atom = zalloc(sizeof(*atom));
1101 	if (!atom) {
1102 		pr_err("Non memory at %s", __func__);
1103 		return -1;
1104 	}
1105 
1106 	atom->sched_out_time = timestamp;
1107 
1108 	if (run_state == 'R') {
1109 		atom->state = THREAD_WAIT_CPU;
1110 		atom->wake_up_time = atom->sched_out_time;
1111 	}
1112 
1113 	list_add_tail(&atom->list, &atoms->work_list);
1114 	return 0;
1115 }
1116 
1117 static void
1118 add_runtime_event(struct work_atoms *atoms, u64 delta,
1119 		  u64 timestamp __maybe_unused)
1120 {
1121 	struct work_atom *atom;
1122 
1123 	BUG_ON(list_empty(&atoms->work_list));
1124 
1125 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1126 
1127 	atom->runtime += delta;
1128 	atoms->total_runtime += delta;
1129 }
1130 
1131 static void
1132 add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
1133 {
1134 	struct work_atom *atom;
1135 	u64 delta;
1136 
1137 	if (list_empty(&atoms->work_list))
1138 		return;
1139 
1140 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1141 
1142 	if (atom->state != THREAD_WAIT_CPU)
1143 		return;
1144 
1145 	if (timestamp < atom->wake_up_time) {
1146 		atom->state = THREAD_IGNORE;
1147 		return;
1148 	}
1149 
1150 	atom->state = THREAD_SCHED_IN;
1151 	atom->sched_in_time = timestamp;
1152 
1153 	delta = atom->sched_in_time - atom->wake_up_time;
1154 	atoms->total_lat += delta;
1155 	if (delta > atoms->max_lat) {
1156 		atoms->max_lat = delta;
1157 		atoms->max_lat_start = atom->wake_up_time;
1158 		atoms->max_lat_end = timestamp;
1159 	}
1160 	atoms->nb_atoms++;
1161 }
1162 
1163 static void free_work_atoms(struct work_atoms *atoms)
1164 {
1165 	struct work_atom *atom, *tmp;
1166 
1167 	if (atoms == NULL)
1168 		return;
1169 
1170 	list_for_each_entry_safe(atom, tmp, &atoms->work_list, list) {
1171 		list_del(&atom->list);
1172 		free(atom);
1173 	}
1174 	thread__zput(atoms->thread);
1175 	free(atoms);
1176 }
1177 
1178 static int latency_switch_event(struct perf_sched *sched,
1179 				struct perf_sample *sample,
1180 				struct machine *machine)
1181 {
1182 	const u32 prev_pid = perf_sample__intval(sample, "prev_pid"),
1183 		  next_pid = perf_sample__intval(sample, "next_pid");
1184 	const char prev_state = perf_sample__taskstate(sample, "prev_state");
1185 	struct work_atoms *out_events, *in_events;
1186 	struct thread *sched_out, *sched_in;
1187 	u64 timestamp0, timestamp = sample->time;
1188 	int cpu = sample->cpu, err = -1;
1189 	s64 delta;
1190 
1191 	/* perf.data is untrusted input — CPU may be absent or corrupted */
1192 	if (cpu >= MAX_CPUS || cpu < 0) {
1193 		pr_warning("WARNING: at offset %#" PRIx64 ": out-of-bound sample CPU %d, skipping sample\n",
1194 			   sample->file_offset, cpu);
1195 		return 0;
1196 	}
1197 
1198 	timestamp0 = sched->cpu_last_switched[cpu];
1199 	sched->cpu_last_switched[cpu] = timestamp;
1200 	if (timestamp0)
1201 		delta = timestamp - timestamp0;
1202 	else
1203 		delta = 0;
1204 
1205 	if (delta < 0) {
1206 		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
1207 		return -1;
1208 	}
1209 
1210 	sched_out = machine__findnew_thread(machine, -1, prev_pid);
1211 	sched_in = machine__findnew_thread(machine, -1, next_pid);
1212 	if (sched_out == NULL || sched_in == NULL)
1213 		goto out_put;
1214 
1215 	out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
1216 	if (!out_events) {
1217 		if (thread_atoms_insert(sched, sched_out))
1218 			goto out_put;
1219 		out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
1220 		if (!out_events) {
1221 			pr_err("out-event: Internal tree error");
1222 			goto out_put;
1223 		}
1224 	}
1225 	if (add_sched_out_event(out_events, prev_state, timestamp))
1226 		goto out_put;
1227 
1228 	in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
1229 	if (!in_events) {
1230 		if (thread_atoms_insert(sched, sched_in))
1231 			goto out_put;
1232 		in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
1233 		if (!in_events) {
1234 			pr_err("in-event: Internal tree error");
1235 			goto out_put;
1236 		}
1237 		/*
1238 		 * Take came in we have not heard about yet,
1239 		 * add in an initial atom in runnable state:
1240 		 */
1241 		if (add_sched_out_event(in_events, 'R', timestamp))
1242 			goto out_put;
1243 	}
1244 	add_sched_in_event(in_events, timestamp);
1245 	err = 0;
1246 out_put:
1247 	thread__put(sched_out);
1248 	thread__put(sched_in);
1249 	return err;
1250 }
1251 
1252 static int latency_runtime_event(struct perf_sched *sched,
1253 				 struct perf_sample *sample,
1254 				 struct machine *machine)
1255 {
1256 	const u32 pid	   = perf_sample__intval(sample, "pid");
1257 	const u64 runtime  = perf_sample__intval(sample, "runtime");
1258 	struct thread *thread = machine__findnew_thread(machine, -1, pid);
1259 	struct work_atoms *atoms;
1260 	u64 timestamp = sample->time;
1261 	int cpu = sample->cpu, err = -1;
1262 
1263 	if (thread == NULL)
1264 		return -1;
1265 
1266 	atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
1267 
1268 	/* perf.data is untrusted input — CPU may be absent or corrupted */
1269 	if (cpu >= MAX_CPUS || cpu < 0) {
1270 		pr_warning("WARNING: at offset %#" PRIx64 ": out-of-bound sample CPU %d, skipping sample\n",
1271 			   sample->file_offset, cpu);
1272 		err = 0;
1273 		goto out_put;
1274 	}
1275 	if (!atoms) {
1276 		if (thread_atoms_insert(sched, thread))
1277 			goto out_put;
1278 		atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
1279 		if (!atoms) {
1280 			pr_err("in-event: Internal tree error");
1281 			goto out_put;
1282 		}
1283 		if (add_sched_out_event(atoms, 'R', timestamp))
1284 			goto out_put;
1285 	}
1286 
1287 	add_runtime_event(atoms, runtime, timestamp);
1288 	err = 0;
1289 out_put:
1290 	thread__put(thread);
1291 	return err;
1292 }
1293 
1294 static int latency_wakeup_event(struct perf_sched *sched,
1295 				struct perf_sample *sample,
1296 				struct machine *machine)
1297 {
1298 	const u32 pid	  = perf_sample__intval(sample, "pid");
1299 	struct work_atoms *atoms;
1300 	struct work_atom *atom;
1301 	struct thread *wakee;
1302 	u64 timestamp = sample->time;
1303 	int err = -1;
1304 
1305 	wakee = machine__findnew_thread(machine, -1, pid);
1306 	if (wakee == NULL)
1307 		return -1;
1308 	atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1309 	if (!atoms) {
1310 		if (thread_atoms_insert(sched, wakee))
1311 			goto out_put;
1312 		atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1313 		if (!atoms) {
1314 			pr_err("wakeup-event: Internal tree error");
1315 			goto out_put;
1316 		}
1317 		if (add_sched_out_event(atoms, 'S', timestamp))
1318 			goto out_put;
1319 	}
1320 
1321 	BUG_ON(list_empty(&atoms->work_list));
1322 
1323 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1324 
1325 	/*
1326 	 * As we do not guarantee the wakeup event happens when
1327 	 * task is out of run queue, also may happen when task is
1328 	 * on run queue and wakeup only change ->state to TASK_RUNNING,
1329 	 * then we should not set the ->wake_up_time when wake up a
1330 	 * task which is on run queue.
1331 	 *
1332 	 * You WILL be missing events if you've recorded only
1333 	 * one CPU, or are only looking at only one, so don't
1334 	 * skip in this case.
1335 	 */
1336 	if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
1337 		goto out_ok;
1338 
1339 	sched->nr_timestamps++;
1340 	if (atom->sched_out_time > timestamp) {
1341 		sched->nr_unordered_timestamps++;
1342 		goto out_ok;
1343 	}
1344 
1345 	atom->state = THREAD_WAIT_CPU;
1346 	atom->wake_up_time = timestamp;
1347 out_ok:
1348 	err = 0;
1349 out_put:
1350 	thread__put(wakee);
1351 	return err;
1352 }
1353 
1354 static int latency_migrate_task_event(struct perf_sched *sched,
1355 				      struct perf_sample *sample,
1356 				      struct machine *machine)
1357 {
1358 	const u32 pid = perf_sample__intval(sample, "pid");
1359 	u64 timestamp = sample->time;
1360 	struct work_atoms *atoms;
1361 	struct work_atom *atom;
1362 	struct thread *migrant;
1363 	int err = -1;
1364 
1365 	/*
1366 	 * Only need to worry about migration when profiling one CPU.
1367 	 */
1368 	if (sched->profile_cpu == -1)
1369 		return 0;
1370 
1371 	migrant = machine__findnew_thread(machine, -1, pid);
1372 	if (migrant == NULL)
1373 		return -1;
1374 	atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1375 	if (!atoms) {
1376 		if (thread_atoms_insert(sched, migrant))
1377 			goto out_put;
1378 		register_pid(sched, thread__tid(migrant), thread__comm_str(migrant));
1379 		atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1380 		if (!atoms) {
1381 			pr_err("migration-event: Internal tree error");
1382 			goto out_put;
1383 		}
1384 		if (add_sched_out_event(atoms, 'R', timestamp))
1385 			goto out_put;
1386 	}
1387 
1388 	BUG_ON(list_empty(&atoms->work_list));
1389 
1390 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1391 	atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;
1392 
1393 	sched->nr_timestamps++;
1394 
1395 	if (atom->sched_out_time > timestamp)
1396 		sched->nr_unordered_timestamps++;
1397 	err = 0;
1398 out_put:
1399 	thread__put(migrant);
1400 	return err;
1401 }
1402 
1403 static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
1404 {
1405 	int i;
1406 	int ret;
1407 	u64 avg;
1408 	char max_lat_start[32], max_lat_end[32];
1409 
1410 	if (!work_list->nb_atoms)
1411 		return;
1412 	/*
1413 	 * Ignore idle threads:
1414 	 */
1415 	if (!strcmp(thread__comm_str(work_list->thread), "swapper"))
1416 		return;
1417 
1418 	sched->all_runtime += work_list->total_runtime;
1419 	sched->all_count   += work_list->nb_atoms;
1420 
1421 	if (work_list->num_merged > 1) {
1422 		ret = printf("  %s:(%d) ", thread__comm_str(work_list->thread),
1423 			     work_list->num_merged);
1424 	} else {
1425 		ret = printf("  %s:%d ", thread__comm_str(work_list->thread),
1426 			     thread__tid(work_list->thread));
1427 	}
1428 
1429 	for (i = 0; i < 24 - ret; i++)
1430 		printf(" ");
1431 
1432 	avg = work_list->total_lat / work_list->nb_atoms;
1433 	timestamp__scnprintf_usec(work_list->max_lat_start, max_lat_start, sizeof(max_lat_start));
1434 	timestamp__scnprintf_usec(work_list->max_lat_end, max_lat_end, sizeof(max_lat_end));
1435 
1436 	printf("|%11.3f ms |%9" PRIu64 " | avg:%8.3f ms | max:%8.3f ms | max start: %12s s | max end: %12s s\n",
1437 	      (double)work_list->total_runtime / NSEC_PER_MSEC,
1438 		 work_list->nb_atoms, (double)avg / NSEC_PER_MSEC,
1439 		 (double)work_list->max_lat / NSEC_PER_MSEC,
1440 		 max_lat_start, max_lat_end);
1441 }
1442 
1443 static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
1444 {
1445 	pid_t l_tid, r_tid;
1446 
1447 	if (RC_CHK_EQUAL(l->thread, r->thread))
1448 		return 0;
1449 	l_tid = thread__tid(l->thread);
1450 	r_tid = thread__tid(r->thread);
1451 	if (l_tid < r_tid)
1452 		return -1;
1453 	if (l_tid > r_tid)
1454 		return 1;
1455 	return (int)(RC_CHK_ACCESS(l->thread) - RC_CHK_ACCESS(r->thread));
1456 }
1457 
1458 static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
1459 {
1460 	u64 avgl, avgr;
1461 
1462 	if (!l->nb_atoms)
1463 		return -1;
1464 
1465 	if (!r->nb_atoms)
1466 		return 1;
1467 
1468 	avgl = l->total_lat / l->nb_atoms;
1469 	avgr = r->total_lat / r->nb_atoms;
1470 
1471 	if (avgl < avgr)
1472 		return -1;
1473 	if (avgl > avgr)
1474 		return 1;
1475 
1476 	return 0;
1477 }
1478 
1479 static int max_cmp(struct work_atoms *l, struct work_atoms *r)
1480 {
1481 	if (l->max_lat < r->max_lat)
1482 		return -1;
1483 	if (l->max_lat > r->max_lat)
1484 		return 1;
1485 
1486 	return 0;
1487 }
1488 
1489 static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
1490 {
1491 	if (l->nb_atoms < r->nb_atoms)
1492 		return -1;
1493 	if (l->nb_atoms > r->nb_atoms)
1494 		return 1;
1495 
1496 	return 0;
1497 }
1498 
1499 static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
1500 {
1501 	if (l->total_runtime < r->total_runtime)
1502 		return -1;
1503 	if (l->total_runtime > r->total_runtime)
1504 		return 1;
1505 
1506 	return 0;
1507 }
1508 
1509 static int sort_dimension__add(const char *tok, struct list_head *list)
1510 {
1511 	size_t i;
1512 	static struct sort_dimension avg_sort_dimension = {
1513 		.name = "avg",
1514 		.cmp  = avg_cmp,
1515 	};
1516 	static struct sort_dimension max_sort_dimension = {
1517 		.name = "max",
1518 		.cmp  = max_cmp,
1519 	};
1520 	static struct sort_dimension pid_sort_dimension = {
1521 		.name = "pid",
1522 		.cmp  = pid_cmp,
1523 	};
1524 	static struct sort_dimension runtime_sort_dimension = {
1525 		.name = "runtime",
1526 		.cmp  = runtime_cmp,
1527 	};
1528 	static struct sort_dimension switch_sort_dimension = {
1529 		.name = "switch",
1530 		.cmp  = switch_cmp,
1531 	};
1532 	struct sort_dimension *available_sorts[] = {
1533 		&pid_sort_dimension,
1534 		&avg_sort_dimension,
1535 		&max_sort_dimension,
1536 		&switch_sort_dimension,
1537 		&runtime_sort_dimension,
1538 	};
1539 
1540 	for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
1541 		if (!strcmp(available_sorts[i]->name, tok)) {
1542 			list_add_tail(&available_sorts[i]->list, list);
1543 
1544 			return 0;
1545 		}
1546 	}
1547 
1548 	return -1;
1549 }
1550 
1551 static void perf_sched__sort_lat(struct perf_sched *sched)
1552 {
1553 	struct rb_node *node;
1554 	struct rb_root_cached *root = &sched->atom_root;
1555 again:
1556 	for (;;) {
1557 		struct work_atoms *data;
1558 		node = rb_first_cached(root);
1559 		if (!node)
1560 			break;
1561 
1562 		rb_erase_cached(node, root);
1563 		data = rb_entry(node, struct work_atoms, node);
1564 		__thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
1565 	}
1566 	if (root == &sched->atom_root) {
1567 		root = &sched->merged_atom_root;
1568 		goto again;
1569 	}
1570 }
1571 
1572 static int process_sched_wakeup_event(const struct perf_tool *tool,
1573 				      struct perf_sample *sample,
1574 				      struct machine *machine)
1575 {
1576 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1577 
1578 	if (sched->tp_handler->wakeup_event)
1579 		return sched->tp_handler->wakeup_event(sched, sample, machine);
1580 
1581 	return 0;
1582 }
1583 
1584 static int process_sched_wakeup_ignore(const struct perf_tool *tool __maybe_unused,
1585 				      struct perf_sample *sample __maybe_unused,
1586 				      struct machine *machine __maybe_unused)
1587 {
1588 	return 0;
1589 }
1590 
1591 static bool thread__has_color(struct thread *thread)
1592 {
1593 	struct thread_runtime *tr = thread__priv(thread);
1594 
1595 	return tr != NULL && tr->color;
1596 }
1597 
1598 static struct thread*
1599 map__findnew_thread(struct perf_sched *sched, struct machine *machine, pid_t pid, pid_t tid)
1600 {
1601 	struct thread *thread = machine__findnew_thread(machine, pid, tid);
1602 
1603 	if (!sched->map.color_pids || !thread)
1604 		return thread;
1605 
1606 	/*
1607 	 * Always check the color-pids map, even if thread__priv() is
1608 	 * already set.  COMM events processed before the first sched_switch
1609 	 * allocate a thread_runtime via thread__get_runtime(), so priv is
1610 	 * non-NULL before we ever get here.  Skipping the check on non-NULL
1611 	 * priv would prevent those threads from being colored.
1612 	 */
1613 	if (thread_map__has(sched->map.color_pids, tid)) {
1614 		struct thread_runtime *tr = thread__get_runtime(thread);
1615 
1616 		if (tr)
1617 			tr->color = true;
1618 	}
1619 	return thread;
1620 }
1621 
1622 static bool sched_match_task(struct perf_sched *sched, const char *comm_str)
1623 {
1624 	bool fuzzy_match = sched->map.fuzzy;
1625 	struct strlist *task_names = sched->map.task_names;
1626 	struct str_node *node;
1627 
1628 	strlist__for_each_entry(node, task_names) {
1629 		bool match_found = fuzzy_match ? !!strstr(comm_str, node->s) :
1630 							!strcmp(comm_str, node->s);
1631 		if (match_found)
1632 			return true;
1633 	}
1634 
1635 	return false;
1636 }
1637 
1638 static void print_sched_map(struct perf_sched *sched, struct perf_cpu this_cpu, int cpus_nr,
1639 								const char *color, bool sched_out)
1640 {
1641 	for (int i = 0; i < cpus_nr; i++) {
1642 		struct perf_cpu cpu = {
1643 			.cpu = sched->map.comp ? sched->map.comp_cpus[i].cpu : i,
1644 		};
1645 		struct thread *curr_thread = sched->curr_thread[cpu.cpu];
1646 		struct thread *curr_out_thread = sched->curr_out_thread[cpu.cpu];
1647 		struct thread_runtime *curr_tr;
1648 		const char *pid_color = color;
1649 		const char *cpu_color = color;
1650 		char symbol = ' ';
1651 		struct thread *thread_to_check = sched_out ? curr_out_thread : curr_thread;
1652 
1653 		if (thread_to_check && thread__has_color(thread_to_check))
1654 			pid_color = COLOR_PIDS;
1655 
1656 		if (sched->map.color_cpus && perf_cpu_map__has(sched->map.color_cpus, cpu))
1657 			cpu_color = COLOR_CPUS;
1658 
1659 		if (cpu.cpu == this_cpu.cpu)
1660 			symbol = '*';
1661 
1662 		color_fprintf(stdout, cpu.cpu != this_cpu.cpu ? color : cpu_color, "%c", symbol);
1663 
1664 		thread_to_check = sched_out ? sched->curr_out_thread[cpu.cpu] :
1665 								sched->curr_thread[cpu.cpu];
1666 
1667 		if (thread_to_check) {
1668 			curr_tr = thread__get_runtime(thread_to_check);
1669 			if (curr_tr == NULL)
1670 				return;
1671 
1672 			if (sched_out) {
1673 				if (cpu.cpu == this_cpu.cpu)
1674 					color_fprintf(stdout, color, "-  ");
1675 				else {
1676 					curr_tr = thread__get_runtime(sched->curr_thread[cpu.cpu]);
1677 					if (curr_tr != NULL)
1678 						color_fprintf(stdout, pid_color, "%2s ",
1679 										curr_tr->shortname);
1680 				}
1681 			} else
1682 				color_fprintf(stdout, pid_color, "%2s ", curr_tr->shortname);
1683 		} else
1684 			color_fprintf(stdout, color, "   ");
1685 	}
1686 }
1687 
1688 static int map_switch_event(struct perf_sched *sched,  struct perf_sample *sample,
1689 			    struct machine *machine)
1690 {
1691 	const u32 next_pid = perf_sample__intval(sample, "next_pid");
1692 	const u32 prev_pid = perf_sample__intval(sample, "prev_pid");
1693 	struct thread *sched_in, *sched_out;
1694 	struct thread_runtime *tr;
1695 	int new_shortname;
1696 	u64 timestamp0, timestamp = sample->time;
1697 	s64 delta;
1698 	struct perf_cpu this_cpu = {
1699 		.cpu = sample->cpu,
1700 	};
1701 	int cpus_nr;
1702 	int proceed;
1703 	bool new_cpu = false;
1704 	const char *color = PERF_COLOR_NORMAL;
1705 	char stimestamp[32];
1706 	const char *str;
1707 	int ret = -1;
1708 
1709 	/* perf.data is untrusted input — CPU may be absent or corrupted */
1710 	if (this_cpu.cpu >= MAX_CPUS || this_cpu.cpu < 0) {
1711 		pr_warning("WARNING: at offset %#" PRIx64 ": out-of-bound sample CPU %d, skipping sample\n",
1712 			   sample->file_offset, this_cpu.cpu);
1713 		return 0;
1714 	}
1715 
1716 	if (this_cpu.cpu > sched->max_cpu.cpu)
1717 		sched->max_cpu = this_cpu;
1718 
1719 	if (sched->map.comp) {
1720 		cpus_nr = bitmap_weight(sched->map.comp_cpus_mask, MAX_CPUS);
1721 		if (!__test_and_set_bit(this_cpu.cpu, sched->map.comp_cpus_mask)) {
1722 			sched->map.comp_cpus[cpus_nr++] = this_cpu;
1723 			new_cpu = true;
1724 		}
1725 	} else
1726 		cpus_nr = sched->max_cpu.cpu + 1;
1727 
1728 	timestamp0 = sched->cpu_last_switched[this_cpu.cpu];
1729 	sched->cpu_last_switched[this_cpu.cpu] = timestamp;
1730 	if (timestamp0)
1731 		delta = timestamp - timestamp0;
1732 	else
1733 		delta = 0;
1734 
1735 	if (delta < 0) {
1736 		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
1737 		return -1;
1738 	}
1739 
1740 	sched_in = map__findnew_thread(sched, machine, -1, next_pid);
1741 	sched_out = map__findnew_thread(sched, machine, -1, prev_pid);
1742 	if (sched_in == NULL || sched_out == NULL)
1743 		goto out;
1744 
1745 	tr = thread__get_runtime(sched_in);
1746 	if (tr == NULL)
1747 		goto out;
1748 
1749 	thread__put(sched->curr_thread[this_cpu.cpu]);
1750 	thread__put(sched->curr_out_thread[this_cpu.cpu]);
1751 
1752 	sched->curr_thread[this_cpu.cpu] = thread__get(sched_in);
1753 	sched->curr_out_thread[this_cpu.cpu] = thread__get(sched_out);
1754 
1755 	ret = 0;
1756 
1757 	str = thread__comm_str(sched_in);
1758 	new_shortname = 0;
1759 	if (!tr->shortname[0]) {
1760 		if (!strcmp(thread__comm_str(sched_in), "swapper")) {
1761 			/*
1762 			 * Don't allocate a letter-number for swapper:0
1763 			 * as a shortname. Instead, we use '.' for it.
1764 			 */
1765 			tr->shortname[0] = '.';
1766 			tr->shortname[1] = ' ';
1767 		} else if (!sched->map.task_name || sched_match_task(sched, str)) {
1768 			tr->shortname[0] = sched->next_shortname1;
1769 			tr->shortname[1] = sched->next_shortname2;
1770 
1771 			if (sched->next_shortname1 < 'Z') {
1772 				sched->next_shortname1++;
1773 			} else {
1774 				sched->next_shortname1 = 'A';
1775 				if (sched->next_shortname2 < '9')
1776 					sched->next_shortname2++;
1777 				else
1778 					sched->next_shortname2 = '0';
1779 			}
1780 		} else {
1781 			tr->shortname[0] = '-';
1782 			tr->shortname[1] = ' ';
1783 		}
1784 		new_shortname = 1;
1785 	}
1786 
1787 	if (sched->map.cpus && !perf_cpu_map__has(sched->map.cpus, this_cpu))
1788 		goto out;
1789 
1790 	proceed = 0;
1791 	str = thread__comm_str(sched_in);
1792 	/*
1793 	 * Check which of sched_in and sched_out matches the passed --task-name
1794 	 * arguments and call the corresponding print_sched_map.
1795 	 */
1796 	if (sched->map.task_name && !sched_match_task(sched, str)) {
1797 		if (!sched_match_task(sched, thread__comm_str(sched_out)))
1798 			goto out;
1799 		else
1800 			goto sched_out;
1801 
1802 	} else {
1803 		str = thread__comm_str(sched_out);
1804 		if (!(sched->map.task_name && !sched_match_task(sched, str)))
1805 			proceed = 1;
1806 	}
1807 
1808 	printf("  ");
1809 
1810 	print_sched_map(sched, this_cpu, cpus_nr, color, false);
1811 
1812 	timestamp__scnprintf_usec(timestamp, stimestamp, sizeof(stimestamp));
1813 	color_fprintf(stdout, color, "  %12s secs ", stimestamp);
1814 	if (new_shortname || tr->comm_changed || (verbose > 0 && thread__tid(sched_in))) {
1815 		const char *pid_color = color;
1816 
1817 		if (thread__has_color(sched_in))
1818 			pid_color = COLOR_PIDS;
1819 
1820 		color_fprintf(stdout, pid_color, "%s => %s:%d",
1821 			tr->shortname, thread__comm_str(sched_in), thread__tid(sched_in));
1822 		tr->comm_changed = false;
1823 	}
1824 
1825 	if (sched->map.comp && new_cpu)
1826 		color_fprintf(stdout, color, " (CPU %d)", this_cpu.cpu);
1827 
1828 	if (proceed != 1) {
1829 		color_fprintf(stdout, color, "\n");
1830 		goto out;
1831 	}
1832 
1833 sched_out:
1834 	if (sched->map.task_name) {
1835 		tr = thread__get_runtime(sched->curr_out_thread[this_cpu.cpu]);
1836 		if (strcmp(tr->shortname, "") == 0)
1837 			goto out;
1838 
1839 		if (proceed == 1)
1840 			color_fprintf(stdout, color, "\n");
1841 
1842 		printf("  ");
1843 		print_sched_map(sched, this_cpu, cpus_nr, color, true);
1844 		timestamp__scnprintf_usec(timestamp, stimestamp, sizeof(stimestamp));
1845 		color_fprintf(stdout, color, "  %12s secs ", stimestamp);
1846 	}
1847 
1848 	color_fprintf(stdout, color, "\n");
1849 
1850 out:
1851 	thread__put(sched_out);
1852 	thread__put(sched_in);
1853 
1854 	return ret;
1855 }
1856 
1857 static int process_sched_switch_event(const struct perf_tool *tool,
1858 				      struct perf_sample *sample,
1859 				      struct machine *machine)
1860 {
1861 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1862 	int this_cpu = sample->cpu, err = 0;
1863 	u32 prev_pid = perf_sample__intval(sample, "prev_pid"),
1864 	    next_pid = perf_sample__intval(sample, "next_pid");
1865 
1866 	/* perf.data is untrusted input — CPU may be absent or corrupted */
1867 	if (this_cpu < 0 || this_cpu >= MAX_CPUS) {
1868 		pr_warning("WARNING: at offset %#" PRIx64 ": out-of-bound sample CPU %d, skipping sample\n",
1869 			   sample->file_offset, this_cpu);
1870 		return 0;
1871 	}
1872 
1873 	if (sched->curr_pid[this_cpu] != (u32)-1) {
1874 		/*
1875 		 * Are we trying to switch away a PID that is
1876 		 * not current?
1877 		 */
1878 		if (sched->curr_pid[this_cpu] != prev_pid)
1879 			sched->nr_context_switch_bugs++;
1880 	}
1881 
1882 	if (sched->tp_handler->switch_event)
1883 		err = sched->tp_handler->switch_event(sched, sample, machine);
1884 
1885 	sched->curr_pid[this_cpu] = next_pid;
1886 	return err;
1887 }
1888 
1889 static int process_sched_runtime_event(const struct perf_tool *tool,
1890 				       struct perf_sample *sample,
1891 				       struct machine *machine)
1892 {
1893 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1894 
1895 	/* perf.data is untrusted input — CPU may be absent or corrupted */
1896 	if (sample->cpu >= MAX_CPUS) {
1897 		pr_warning("WARNING: at offset %#" PRIx64 ": out-of-bound sample CPU %u, skipping sample\n",
1898 			   sample->file_offset, sample->cpu);
1899 		return 0;
1900 	}
1901 
1902 	if (sched->tp_handler->runtime_event)
1903 		return sched->tp_handler->runtime_event(sched, sample, machine);
1904 
1905 	return 0;
1906 }
1907 
1908 static int perf_sched__process_fork_event(const struct perf_tool *tool,
1909 					  union perf_event *event,
1910 					  struct perf_sample *sample,
1911 					  struct machine *machine)
1912 {
1913 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1914 
1915 	/* run the fork event through the perf machinery */
1916 	perf_event__process_fork(tool, event, sample, machine);
1917 
1918 	/* and then run additional processing needed for this command */
1919 	if (sched->tp_handler->fork_event)
1920 		return sched->tp_handler->fork_event(sched, event, machine);
1921 
1922 	return 0;
1923 }
1924 
1925 static int process_sched_migrate_task_event(const struct perf_tool *tool,
1926 					    struct perf_sample *sample,
1927 					    struct machine *machine)
1928 {
1929 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1930 
1931 	if (sched->tp_handler->migrate_task_event)
1932 		return sched->tp_handler->migrate_task_event(sched, sample, machine);
1933 
1934 	return 0;
1935 }
1936 
1937 typedef int (*tracepoint_handler)(const struct perf_tool *tool,
1938 				  struct perf_sample *sample,
1939 				  struct machine *machine);
1940 
1941 static int perf_sched__process_tracepoint_sample(const struct perf_tool *tool __maybe_unused,
1942 						 union perf_event *event __maybe_unused,
1943 						 struct perf_sample *sample,
1944 						 struct machine *machine)
1945 {
1946 	struct evsel *evsel = sample->evsel;
1947 	int err = 0;
1948 
1949 	if (evsel->handler != NULL) {
1950 		tracepoint_handler f = evsel->handler;
1951 		err = f(tool, sample, machine);
1952 	}
1953 
1954 	return err;
1955 }
1956 
1957 static int perf_sched__process_comm(const struct perf_tool *tool __maybe_unused,
1958 				    union perf_event *event,
1959 				    struct perf_sample *sample,
1960 				    struct machine *machine)
1961 {
1962 	struct thread *thread;
1963 	struct thread_runtime *tr;
1964 	int err;
1965 
1966 	err = perf_event__process_comm(tool, event, sample, machine);
1967 	if (err)
1968 		return err;
1969 
1970 	thread = machine__find_thread(machine, sample->pid, sample->tid);
1971 	if (!thread) {
1972 		pr_err("Internal error: can't find thread\n");
1973 		return -1;
1974 	}
1975 
1976 	tr = thread__get_runtime(thread);
1977 	if (tr == NULL) {
1978 		thread__put(thread);
1979 		return -1;
1980 	}
1981 
1982 	tr->comm_changed = true;
1983 	thread__put(thread);
1984 
1985 	return 0;
1986 }
1987 
1988 static int perf_sched__read_events(struct perf_sched *sched)
1989 {
1990 	struct evsel_str_handler handlers[] = {
1991 		{ "sched:sched_switch",	      process_sched_switch_event, },
1992 		{ "sched:sched_stat_runtime", process_sched_runtime_event, },
1993 		{ "sched:sched_wakeup",	      process_sched_wakeup_event, },
1994 		{ "sched:sched_waking",	      process_sched_wakeup_event, },
1995 		{ "sched:sched_wakeup_new",   process_sched_wakeup_event, },
1996 		{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
1997 	};
1998 	struct perf_session *session;
1999 	struct perf_data data = {
2000 		.path  = input_name,
2001 		.mode  = PERF_DATA_MODE_READ,
2002 		.force = sched->force,
2003 	};
2004 	int rc = -1;
2005 
2006 	session = perf_session__new(&data, &sched->tool);
2007 	if (IS_ERR(session)) {
2008 		pr_debug("Error creating perf session");
2009 		return PTR_ERR(session);
2010 	}
2011 
2012 	symbol__init(perf_session__env(session));
2013 
2014 	/* prefer sched_waking if it is captured */
2015 	if (evlist__find_tracepoint_by_name(session->evlist, "sched:sched_waking"))
2016 		handlers[2].handler = process_sched_wakeup_ignore;
2017 
2018 	if (perf_session__set_tracepoints_handlers(session, handlers))
2019 		goto out_delete;
2020 
2021 	if (perf_session__has_traces(session, "record -R")) {
2022 		int err = perf_session__process_events(session);
2023 		if (err) {
2024 			pr_err("Failed to process events, error %d", err);
2025 			goto out_delete;
2026 		}
2027 
2028 		sched->nr_events      = session->evlist->stats.nr_events[0];
2029 		sched->nr_lost_events = session->evlist->stats.total_lost;
2030 		sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST];
2031 	}
2032 
2033 	rc = 0;
2034 out_delete:
2035 	perf_session__delete(session);
2036 	return rc;
2037 }
2038 
2039 /*
2040  * scheduling times are printed as msec.usec
2041  */
2042 static inline void print_sched_time(unsigned long long nsecs, int width)
2043 {
2044 	unsigned long msecs;
2045 	unsigned long usecs;
2046 
2047 	msecs  = nsecs / NSEC_PER_MSEC;
2048 	nsecs -= msecs * NSEC_PER_MSEC;
2049 	usecs  = nsecs / NSEC_PER_USEC;
2050 	printf("%*lu.%03lu ", width, msecs, usecs);
2051 }
2052 
2053 /*
2054  * returns runtime data for event, allocating memory for it the
2055  * first time it is used.
2056  */
2057 static struct evsel_runtime *evsel__get_runtime(struct evsel *evsel)
2058 {
2059 	struct evsel_runtime *r = evsel->priv;
2060 
2061 	if (r == NULL) {
2062 		r = zalloc(sizeof(struct evsel_runtime));
2063 		evsel->priv = r;
2064 	}
2065 
2066 	return r;
2067 }
2068 
2069 /*
2070  * save last time event was seen per cpu
2071  */
2072 static void evsel__save_time(struct evsel *evsel, u64 timestamp, u32 cpu)
2073 {
2074 	struct evsel_runtime *r = evsel__get_runtime(evsel);
2075 
2076 	if (r == NULL)
2077 		return;
2078 
2079 	if ((cpu >= r->ncpu) || (r->last_time == NULL)) {
2080 		int i, n = __roundup_pow_of_two(cpu+1);
2081 		void *p = r->last_time;
2082 
2083 		p = realloc(r->last_time, n * sizeof(u64));
2084 		if (!p)
2085 			return;
2086 
2087 		r->last_time = p;
2088 		for (i = r->ncpu; i < n; ++i)
2089 			r->last_time[i] = (u64) 0;
2090 
2091 		r->ncpu = n;
2092 	}
2093 
2094 	r->last_time[cpu] = timestamp;
2095 }
2096 
2097 /* returns last time this event was seen on the given cpu */
2098 static u64 evsel__get_time(struct evsel *evsel, u32 cpu)
2099 {
2100 	struct evsel_runtime *r = evsel__get_runtime(evsel);
2101 
2102 	if ((r == NULL) || (r->last_time == NULL) || (cpu >= r->ncpu))
2103 		return 0;
2104 
2105 	return r->last_time[cpu];
2106 }
2107 
2108 static void timehist__evsel_priv_destructor(void *priv)
2109 {
2110 	struct evsel_runtime *r = priv;
2111 
2112 	if (r) {
2113 		free(r->last_time);
2114 		free(r);
2115 	}
2116 }
2117 
2118 static int comm_width = 30;
2119 
2120 static char *timehist_get_commstr(struct thread *thread)
2121 {
2122 	static char str[32];
2123 	const char *comm = thread__comm_str(thread);
2124 	pid_t tid = thread__tid(thread);
2125 	pid_t pid = thread__pid(thread);
2126 	int n;
2127 
2128 	if (pid == 0)
2129 		n = scnprintf(str, sizeof(str), "%s", comm);
2130 
2131 	else if (tid != pid)
2132 		n = scnprintf(str, sizeof(str), "%s[%d/%d]", comm, tid, pid);
2133 
2134 	else
2135 		n = scnprintf(str, sizeof(str), "%s[%d]", comm, tid);
2136 
2137 	if (n > comm_width)
2138 		comm_width = n;
2139 
2140 	return str;
2141 }
2142 
2143 /* prio field format: xxx or xxx->yyy */
2144 #define MAX_PRIO_STR_LEN 8
2145 static char *timehist_get_priostr(struct thread *thread,
2146 				  struct perf_sample *sample)
2147 {
2148 	static char prio_str[16];
2149 	int prev_prio = (int)perf_sample__intval(sample, "prev_prio");
2150 	struct thread_runtime *tr = thread__priv(thread);
2151 
2152 	if (tr->prio != prev_prio && tr->prio != -1)
2153 		scnprintf(prio_str, sizeof(prio_str), "%d->%d", tr->prio, prev_prio);
2154 	else
2155 		scnprintf(prio_str, sizeof(prio_str), "%d", prev_prio);
2156 
2157 	return prio_str;
2158 }
2159 
2160 static void timehist_header(struct perf_sched *sched)
2161 {
2162 	u32 ncpus = sched->max_cpu.cpu + 1;
2163 	u32 i, j;
2164 
2165 	printf("%15s %6s ", "time", "cpu");
2166 
2167 	if (sched->show_cpu_visual) {
2168 		printf(" ");
2169 		for (i = 0, j = 0; i < ncpus; ++i) {
2170 			printf("%x", j++);
2171 			if (j > 15)
2172 				j = 0;
2173 		}
2174 		printf(" ");
2175 	}
2176 
2177 	printf(" %-*s", comm_width, "task name");
2178 
2179 	if (sched->show_prio)
2180 		printf("  %-*s", MAX_PRIO_STR_LEN, "prio");
2181 
2182 	printf("  %9s  %9s  %9s", "wait time", "sch delay", "run time");
2183 
2184 	if (sched->pre_migrations)
2185 		printf("  %9s", "pre-mig time");
2186 
2187 	if (sched->show_state)
2188 		printf("  %s", "state");
2189 
2190 	printf("\n");
2191 
2192 	/*
2193 	 * units row
2194 	 */
2195 	printf("%15s %-6s ", "", "");
2196 
2197 	if (sched->show_cpu_visual)
2198 		printf(" %*s ", ncpus, "");
2199 
2200 	printf(" %-*s", comm_width, "[tid/pid]");
2201 
2202 	if (sched->show_prio)
2203 		printf("  %-*s", MAX_PRIO_STR_LEN, "");
2204 
2205 	printf("  %9s  %9s  %9s", "(msec)", "(msec)", "(msec)");
2206 
2207 	if (sched->pre_migrations)
2208 		printf("  %9s", "(msec)");
2209 
2210 	printf("\n");
2211 
2212 	/*
2213 	 * separator
2214 	 */
2215 	printf("%.15s %.6s ", graph_dotted_line, graph_dotted_line);
2216 
2217 	if (sched->show_cpu_visual)
2218 		printf(" %.*s ", ncpus, graph_dotted_line);
2219 
2220 	printf(" %.*s", comm_width, graph_dotted_line);
2221 
2222 	if (sched->show_prio)
2223 		printf("  %.*s", MAX_PRIO_STR_LEN, graph_dotted_line);
2224 
2225 	printf("  %.9s  %.9s  %.9s", graph_dotted_line, graph_dotted_line, graph_dotted_line);
2226 
2227 	if (sched->pre_migrations)
2228 		printf("  %.9s", graph_dotted_line);
2229 
2230 	if (sched->show_state)
2231 		printf("  %.5s", graph_dotted_line);
2232 
2233 	printf("\n");
2234 }
2235 
2236 static void timehist_print_sample(struct perf_sched *sched,
2237 				  struct perf_sample *sample,
2238 				  struct addr_location *al,
2239 				  struct thread *thread,
2240 				  u64 t, const char state)
2241 {
2242 	struct thread_runtime *tr = thread__priv(thread);
2243 	const char *next_comm = perf_sample__strval(sample, "next_comm");
2244 	const u32 next_pid = perf_sample__intval(sample, "next_pid");
2245 	u32 max_cpus = sched->max_cpu.cpu + 1;
2246 	char tstr[64];
2247 	char nstr[30];
2248 	u64 wait_time;
2249 
2250 	if (cpu_list && (sample->cpu >= MAX_NR_CPUS ||
2251 			!test_bit(sample->cpu, cpu_bitmap)))
2252 		return;
2253 
2254 	timestamp__scnprintf_usec(t, tstr, sizeof(tstr));
2255 	printf("%15s [%04d] ", tstr, sample->cpu);
2256 
2257 	if (sched->show_cpu_visual) {
2258 		u32 i;
2259 		char c;
2260 
2261 		printf(" ");
2262 		for (i = 0; i < max_cpus; ++i) {
2263 			/* flag idle times with 'i'; others are sched events */
2264 			if (i == sample->cpu)
2265 				c = (thread__tid(thread) == 0) ? 'i' : 's';
2266 			else
2267 				c = ' ';
2268 			printf("%c", c);
2269 		}
2270 		printf(" ");
2271 	}
2272 
2273 	if (!thread__comm_set(thread)) {
2274 		const char *prev_comm = perf_sample__strval(sample, "prev_comm");
2275 
2276 		thread__set_comm(thread, prev_comm, sample->time);
2277         }
2278 
2279 	printf(" %-*s ", comm_width, timehist_get_commstr(thread));
2280 
2281 	if (sched->show_prio)
2282 		printf(" %-*s ", MAX_PRIO_STR_LEN, timehist_get_priostr(thread, sample));
2283 
2284 	wait_time = tr->dt_sleep + tr->dt_iowait + tr->dt_preempt;
2285 	print_sched_time(wait_time, 6);
2286 
2287 	print_sched_time(tr->dt_delay, 6);
2288 	print_sched_time(tr->dt_run, 6);
2289 	if (sched->pre_migrations)
2290 		print_sched_time(tr->dt_pre_mig, 6);
2291 
2292 	if (sched->show_state)
2293 		printf(" %5c ", thread__tid(thread) == 0 ? 'I' : state);
2294 
2295 	if (sched->show_next) {
2296 		snprintf(nstr, sizeof(nstr), "next: %s[%d]", next_comm, next_pid);
2297 		printf(" %-*s", comm_width, nstr);
2298 	}
2299 
2300 	if (sched->show_wakeups && !sched->show_next)
2301 		printf("  %-*s", comm_width, "");
2302 
2303 	if (thread__tid(thread) == 0)
2304 		goto out;
2305 
2306 	if (sched->show_callchain)
2307 		printf("  ");
2308 
2309 	sample__fprintf_sym(sample, al, 0,
2310 			    EVSEL__PRINT_SYM | EVSEL__PRINT_ONELINE |
2311 			    EVSEL__PRINT_CALLCHAIN_ARROW |
2312 			    EVSEL__PRINT_SKIP_IGNORED,
2313 			    get_tls_callchain_cursor(), symbol_conf.bt_stop_list,  stdout);
2314 
2315 out:
2316 	printf("\n");
2317 }
2318 
2319 /*
2320  * Explanation of delta-time stats:
2321  *
2322  *            t = time of current schedule out event
2323  *        tprev = time of previous sched out event
2324  *                also time of schedule-in event for current task
2325  *    last_time = time of last sched change event for current task
2326  *                (i.e, time process was last scheduled out)
2327  * ready_to_run = time of wakeup for current task
2328  *     migrated = time of task migration to another CPU
2329  *
2330  * -----|-------------|-------------|-------------|-------------|-----
2331  *    last         ready         migrated       tprev           t
2332  *    time         to run
2333  *
2334  *      |---------------- dt_wait ----------------|
2335  *                   |--------- dt_delay ---------|-- dt_run --|
2336  *                   |- dt_pre_mig -|
2337  *
2338  *     dt_run = run time of current task
2339  *    dt_wait = time between last schedule out event for task and tprev
2340  *              represents time spent off the cpu
2341  *   dt_delay = time between wakeup and schedule-in of task
2342  * dt_pre_mig = time between wakeup and migration to another CPU
2343  */
2344 
2345 static void timehist_update_runtime_stats(struct thread_runtime *r,
2346 					 u64 t, u64 tprev)
2347 {
2348 	r->dt_delay   = 0;
2349 	r->dt_sleep   = 0;
2350 	r->dt_iowait  = 0;
2351 	r->dt_preempt = 0;
2352 	r->dt_run     = 0;
2353 	r->dt_pre_mig = 0;
2354 
2355 	if (tprev) {
2356 		r->dt_run = t - tprev;
2357 		if (r->ready_to_run) {
2358 			if (r->ready_to_run > tprev)
2359 				pr_debug("time travel: wakeup time for task > previous sched_switch event\n");
2360 			else
2361 				r->dt_delay = tprev - r->ready_to_run;
2362 
2363 			if ((r->migrated > r->ready_to_run) && (r->migrated < tprev))
2364 				r->dt_pre_mig = r->migrated - r->ready_to_run;
2365 		}
2366 
2367 		if (r->last_time > tprev)
2368 			pr_debug("time travel: last sched out time for task > previous sched_switch event\n");
2369 		else if (r->last_time) {
2370 			u64 dt_wait = tprev - r->last_time;
2371 
2372 			if (r->last_state == 'R')
2373 				r->dt_preempt = dt_wait;
2374 			else if (r->last_state == 'D')
2375 				r->dt_iowait = dt_wait;
2376 			else
2377 				r->dt_sleep = dt_wait;
2378 		}
2379 	}
2380 
2381 	update_stats(&r->run_stats, r->dt_run);
2382 
2383 	r->total_run_time     += r->dt_run;
2384 	r->total_delay_time   += r->dt_delay;
2385 	r->total_sleep_time   += r->dt_sleep;
2386 	r->total_iowait_time  += r->dt_iowait;
2387 	r->total_preempt_time += r->dt_preempt;
2388 	r->total_pre_mig_time += r->dt_pre_mig;
2389 }
2390 
2391 static bool is_idle_sample(struct perf_sample *sample)
2392 {
2393 	/* pid 0 == swapper == idle task */
2394 	if (evsel__name_is(sample->evsel, "sched:sched_switch"))
2395 		return perf_sample__intval(sample, "prev_pid") == 0;
2396 
2397 	return sample->pid == 0;
2398 }
2399 
2400 static void save_task_callchain(struct perf_sched *sched,
2401 				struct perf_sample *sample,
2402 				struct machine *machine)
2403 {
2404 	struct callchain_cursor *cursor;
2405 	struct thread *thread;
2406 
2407 	/* want main thread for process - has maps */
2408 	thread = machine__findnew_thread(machine, sample->pid, sample->pid);
2409 	if (thread == NULL) {
2410 		pr_debug("Failed to get thread for pid %d.\n", sample->pid);
2411 		return;
2412 	}
2413 
2414 	if (!sched->show_callchain || sample->callchain == NULL) {
2415 		thread__put(thread);
2416 		return;
2417 	}
2418 
2419 	cursor = get_tls_callchain_cursor();
2420 
2421 	if (thread__resolve_callchain(thread, cursor, sample,
2422 				      NULL, NULL, sched->max_stack + 2) != 0) {
2423 		if (verbose > 0)
2424 			pr_err("Failed to resolve callchain. Skipping\n");
2425 
2426 		thread__put(thread);
2427 		return;
2428 	}
2429 
2430 	callchain_cursor_commit(cursor);
2431 	thread__put(thread);
2432 
2433 	while (true) {
2434 		struct callchain_cursor_node *node;
2435 		struct symbol *sym;
2436 
2437 		node = callchain_cursor_current(cursor);
2438 		if (node == NULL)
2439 			break;
2440 
2441 		sym = node->ms.sym;
2442 		if (sym) {
2443 			if (!strcmp(sym->name, "schedule") ||
2444 			    !strcmp(sym->name, "__schedule") ||
2445 			    !strcmp(sym->name, "preempt_schedule"))
2446 				symbol__set_ignore(sym, true);
2447 		}
2448 
2449 		callchain_cursor_advance(cursor);
2450 	}
2451 }
2452 
2453 static int init_idle_thread(struct thread *thread)
2454 {
2455 	struct idle_thread_runtime *itr;
2456 
2457 	thread__set_comm(thread, idle_comm, 0);
2458 
2459 	itr = zalloc(sizeof(*itr));
2460 	if (itr == NULL)
2461 		return -ENOMEM;
2462 
2463 	init_prio(&itr->tr);
2464 	init_stats(&itr->tr.run_stats);
2465 	callchain_init(&itr->callchain);
2466 	callchain_cursor_reset(&itr->cursor);
2467 	thread__set_priv(thread, itr);
2468 
2469 	return 0;
2470 }
2471 
2472 /*
2473  * Track idle stats per cpu by maintaining a local thread
2474  * struct for the idle task on each cpu.
2475  */
2476 static int init_idle_threads(int ncpu)
2477 {
2478 	int i, ret;
2479 
2480 	idle_threads = calloc(ncpu, sizeof(struct thread *));
2481 	if (!idle_threads)
2482 		return -ENOMEM;
2483 
2484 	idle_max_cpu = ncpu;
2485 
2486 	/* allocate the actual thread struct if needed */
2487 	for (i = 0; i < ncpu; ++i) {
2488 		idle_threads[i] = thread__new(0, 0);
2489 		if (idle_threads[i] == NULL)
2490 			return -ENOMEM;
2491 
2492 		ret = init_idle_thread(idle_threads[i]);
2493 		if (ret < 0)
2494 			return ret;
2495 	}
2496 
2497 	return 0;
2498 }
2499 
2500 static void free_idle_threads(void)
2501 {
2502 	int i;
2503 
2504 	if (idle_threads == NULL)
2505 		return;
2506 
2507 	for (i = 0; i < idle_max_cpu; ++i) {
2508 		struct thread *idle = idle_threads[i];
2509 
2510 		if (idle) {
2511 			struct idle_thread_runtime *itr;
2512 
2513 			itr = thread__priv(idle);
2514 			if (itr) {
2515 				thread__put(itr->last_thread);
2516 				free_callchain(&itr->callchain);
2517 				callchain_cursor_cleanup(&itr->cursor);
2518 			}
2519 
2520 			thread__put(idle);
2521 		}
2522 	}
2523 
2524 	free(idle_threads);
2525 }
2526 
2527 static struct thread *get_idle_thread(int cpu)
2528 {
2529 	/*
2530 	 * expand/allocate array of pointers to local thread
2531 	 * structs if needed
2532 	 */
2533 	if ((cpu >= idle_max_cpu) || (idle_threads == NULL)) {
2534 		int i, j = __roundup_pow_of_two(cpu+1);
2535 		void *p;
2536 
2537 		p = realloc(idle_threads, j * sizeof(struct thread *));
2538 		if (!p)
2539 			return NULL;
2540 
2541 		idle_threads = (struct thread **) p;
2542 		for (i = idle_max_cpu; i < j; ++i)
2543 			idle_threads[i] = NULL;
2544 
2545 		idle_max_cpu = j;
2546 	}
2547 
2548 	/* allocate a new thread struct if needed */
2549 	if (idle_threads[cpu] == NULL) {
2550 		idle_threads[cpu] = thread__new(0, 0);
2551 
2552 		if (idle_threads[cpu]) {
2553 			if (init_idle_thread(idle_threads[cpu]) < 0) {
2554 				/* clean up so next call doesn't find a half-initialized thread */
2555 				thread__zput(idle_threads[cpu]);
2556 				return NULL;
2557 			}
2558 		}
2559 	}
2560 
2561 	return thread__get(idle_threads[cpu]);
2562 }
2563 
2564 static void save_idle_callchain(struct perf_sched *sched,
2565 				struct idle_thread_runtime *itr,
2566 				struct perf_sample *sample)
2567 {
2568 	struct callchain_cursor *cursor;
2569 
2570 	if (!sched->show_callchain || sample->callchain == NULL)
2571 		return;
2572 
2573 	cursor = get_tls_callchain_cursor();
2574 	if (cursor == NULL)
2575 		return;
2576 
2577 	callchain_cursor__copy(&itr->cursor, cursor);
2578 }
2579 
2580 static struct thread *timehist_get_thread(struct perf_sched *sched,
2581 					  struct perf_sample *sample,
2582 					  struct machine *machine)
2583 {
2584 	struct thread *thread;
2585 
2586 	if (is_idle_sample(sample)) {
2587 		thread = get_idle_thread(sample->cpu);
2588 		if (thread == NULL)
2589 			pr_err("Failed to get idle thread for cpu %d.\n", sample->cpu);
2590 
2591 	} else {
2592 		/* there were samples with tid 0 but non-zero pid */
2593 		thread = machine__findnew_thread(machine, sample->pid,
2594 						 sample->tid ?: sample->pid);
2595 		if (thread == NULL) {
2596 			pr_debug("Failed to get thread for tid %d. skipping sample.\n",
2597 				 sample->tid);
2598 		}
2599 
2600 		save_task_callchain(sched, sample, machine);
2601 		if (sched->idle_hist) {
2602 			struct thread *idle;
2603 			struct idle_thread_runtime *itr;
2604 
2605 			idle = get_idle_thread(sample->cpu);
2606 			if (idle == NULL) {
2607 				pr_err("Failed to get idle thread for cpu %d.\n", sample->cpu);
2608 				thread__put(thread);
2609 				return NULL;
2610 			}
2611 
2612 			itr = thread__priv(idle);
2613 			if (itr == NULL) {
2614 				thread__put(idle);
2615 				thread__put(thread);
2616 				return NULL;
2617 			}
2618 
2619 			thread__put(itr->last_thread);
2620 			itr->last_thread = thread__get(thread);
2621 
2622 			/* copy task callchain when entering to idle */
2623 			if (perf_sample__intval(sample, "next_pid") == 0)
2624 				save_idle_callchain(sched, itr, sample);
2625 
2626 			thread__put(idle);
2627 		}
2628 	}
2629 
2630 	return thread;
2631 }
2632 
2633 static bool timehist_skip_sample(struct perf_sched *sched,
2634 				 struct thread *thread,
2635 				 struct perf_sample *sample)
2636 {
2637 	bool rc = false;
2638 	int prio = -1;
2639 	struct thread_runtime *tr = NULL;
2640 
2641 	if (thread__is_filtered(thread)) {
2642 		rc = true;
2643 		sched->skipped_samples++;
2644 	}
2645 
2646 	if (sched->prio_str) {
2647 		/*
2648 		 * Because priority may be changed during task execution,
2649 		 * first read priority from prev sched_in event for current task.
2650 		 * If prev sched_in event is not saved, then read priority from
2651 		 * current task sched_out event.
2652 		 */
2653 		tr = thread__get_runtime(thread);
2654 		if (tr && tr->prio != -1)
2655 			prio = tr->prio;
2656 		else if (evsel__name_is(sample->evsel, "sched:sched_switch"))
2657 			prio = perf_sample__intval(sample, "prev_prio");
2658 
2659 		/* negative prio means no info; out-of-range prio can't match the filter */
2660 		if (prio >= 0 &&
2661 		    (prio >= MAX_PRIO || !test_bit(prio, sched->prio_bitmap))) {
2662 			rc = true;
2663 			sched->skipped_samples++;
2664 		}
2665 	}
2666 
2667 	if (sched->idle_hist) {
2668 		if (!evsel__name_is(sample->evsel, "sched:sched_switch"))
2669 			rc = true;
2670 		else if (perf_sample__intval(sample, "prev_pid") != 0 &&
2671 			 perf_sample__intval(sample, "next_pid") != 0)
2672 			rc = true;
2673 	}
2674 
2675 	return rc;
2676 }
2677 
2678 static void timehist_print_wakeup_event(struct perf_sched *sched,
2679 					struct perf_sample *sample,
2680 					struct machine *machine,
2681 					struct thread *awakened)
2682 {
2683 	struct thread *thread;
2684 	char tstr[64];
2685 
2686 	thread = machine__findnew_thread(machine, sample->pid, sample->tid);
2687 	if (thread == NULL)
2688 		return;
2689 
2690 	/* show wakeup unless both awakee and awaker are filtered */
2691 	if (timehist_skip_sample(sched, thread, sample) &&
2692 	    timehist_skip_sample(sched, awakened, sample)) {
2693 		thread__put(thread);
2694 		return;
2695 	}
2696 
2697 	timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
2698 	printf("%15s [%04d] ", tstr, sample->cpu);
2699 	if (sched->show_cpu_visual)
2700 		printf(" %*s ", sched->max_cpu.cpu + 1, "");
2701 
2702 	printf(" %-*s ", comm_width, timehist_get_commstr(thread));
2703 
2704 	/* dt spacer */
2705 	printf("  %9s  %9s  %9s ", "", "", "");
2706 
2707 	printf("awakened: %s", timehist_get_commstr(awakened));
2708 
2709 	printf("\n");
2710 
2711 	thread__put(thread);
2712 }
2713 
2714 static int timehist_sched_wakeup_ignore(const struct perf_tool *tool __maybe_unused,
2715 					union perf_event *event __maybe_unused,
2716 					struct perf_sample *sample __maybe_unused,
2717 					struct machine *machine __maybe_unused)
2718 {
2719 	return 0;
2720 }
2721 
2722 static int timehist_sched_wakeup_event(const struct perf_tool *tool,
2723 				       union perf_event *event __maybe_unused,
2724 				       struct perf_sample *sample,
2725 				       struct machine *machine)
2726 {
2727 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
2728 	struct thread *thread;
2729 	struct thread_runtime *tr = NULL;
2730 	/* want pid of awakened task not pid in sample */
2731 	const u32 pid = perf_sample__intval(sample, "pid");
2732 
2733 	thread = machine__findnew_thread(machine, 0, pid);
2734 	if (thread == NULL)
2735 		return -1;
2736 
2737 	tr = thread__get_runtime(thread);
2738 	if (tr == NULL) {
2739 		thread__put(thread);
2740 		return -1;
2741 	}
2742 
2743 	if (tr->ready_to_run == 0)
2744 		tr->ready_to_run = sample->time;
2745 
2746 	/* show wakeups if requested */
2747 	if (sched->show_wakeups &&
2748 	    !perf_time__skip_sample(&sched->ptime, sample->time))
2749 		timehist_print_wakeup_event(sched, sample, machine, thread);
2750 
2751 	thread__put(thread);
2752 	return 0;
2753 }
2754 
2755 static void timehist_print_migration_event(struct perf_sched *sched,
2756 					struct perf_sample *sample,
2757 					struct machine *machine,
2758 					struct thread *migrated)
2759 {
2760 	struct thread *thread;
2761 	char tstr[64];
2762 	u32 max_cpus;
2763 	u32 ocpu, dcpu;
2764 
2765 	if (sched->summary_only)
2766 		return;
2767 
2768 	max_cpus = sched->max_cpu.cpu + 1;
2769 	ocpu = perf_sample__intval(sample, "orig_cpu");
2770 	dcpu = perf_sample__intval(sample, "dest_cpu");
2771 
2772 	thread = machine__findnew_thread(machine, sample->pid, sample->tid);
2773 	if (thread == NULL)
2774 		return;
2775 
2776 	if (timehist_skip_sample(sched, thread, sample) &&
2777 	    timehist_skip_sample(sched, migrated, sample)) {
2778 		thread__put(thread);
2779 		return;
2780 	}
2781 
2782 	timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
2783 	printf("%15s [%04d] ", tstr, sample->cpu);
2784 
2785 	if (sched->show_cpu_visual) {
2786 		u32 i;
2787 		char c;
2788 
2789 		printf("  ");
2790 		for (i = 0; i < max_cpus; ++i) {
2791 			c = (i == sample->cpu) ? 'm' : ' ';
2792 			printf("%c", c);
2793 		}
2794 		printf("  ");
2795 	}
2796 
2797 	printf(" %-*s ", comm_width, timehist_get_commstr(thread));
2798 
2799 	/* dt spacer */
2800 	printf("  %9s  %9s  %9s ", "", "", "");
2801 
2802 	printf("migrated: %s", timehist_get_commstr(migrated));
2803 	printf(" cpu %d => %d", ocpu, dcpu);
2804 
2805 	printf("\n");
2806 	thread__put(thread);
2807 }
2808 
2809 static int timehist_migrate_task_event(const struct perf_tool *tool,
2810 				       union perf_event *event __maybe_unused,
2811 				       struct perf_sample *sample,
2812 				       struct machine *machine)
2813 {
2814 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
2815 	struct thread *thread;
2816 	struct thread_runtime *tr = NULL;
2817 	/* want pid of migrated task not pid in sample */
2818 	const u32 pid = perf_sample__intval(sample, "pid");
2819 
2820 	thread = machine__findnew_thread(machine, 0, pid);
2821 	if (thread == NULL)
2822 		return -1;
2823 
2824 	tr = thread__get_runtime(thread);
2825 	if (tr == NULL) {
2826 		thread__put(thread);
2827 		return -1;
2828 	}
2829 
2830 	tr->migrations++;
2831 	tr->migrated = sample->time;
2832 
2833 	/* show migrations if requested */
2834 	if (sched->show_migrations) {
2835 		timehist_print_migration_event(sched, sample, machine, thread);
2836 	}
2837 	thread__put(thread);
2838 
2839 	return 0;
2840 }
2841 
2842 static void timehist_update_task_prio(struct perf_sample *sample,
2843 				      struct machine *machine)
2844 {
2845 	struct thread *thread;
2846 	struct thread_runtime *tr = NULL;
2847 	const u32 next_pid = perf_sample__intval(sample, "next_pid");
2848 	const u32 next_prio = perf_sample__intval(sample, "next_prio");
2849 
2850 	if (next_pid == 0)
2851 		thread = get_idle_thread(sample->cpu);
2852 	else
2853 		thread = machine__findnew_thread(machine, -1, next_pid);
2854 
2855 	if (thread == NULL)
2856 		return;
2857 
2858 	tr = thread__get_runtime(thread);
2859 	if (tr != NULL)
2860 		tr->prio = next_prio;
2861 
2862 	thread__put(thread);
2863 }
2864 
2865 static int timehist_sched_change_event(const struct perf_tool *tool,
2866 				       union perf_event *event,
2867 				       struct perf_sample *sample,
2868 				       struct machine *machine)
2869 {
2870 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
2871 	struct perf_time_interval *ptime = &sched->ptime;
2872 	struct addr_location al;
2873 	struct thread *thread = NULL;
2874 	struct thread_runtime *tr = NULL;
2875 	u64 tprev, t = sample->time;
2876 	int rc = 0;
2877 	const char state = perf_sample__taskstate(sample, "prev_state");
2878 
2879 	/* perf.data is untrusted input — CPU may be absent or corrupted */
2880 	if (sample->cpu >= MAX_CPUS) {
2881 		pr_warning("WARNING: at offset %#" PRIx64 ": out-of-bound sample CPU %d, skipping sample\n",
2882 			   sample->file_offset, sample->cpu);
2883 		return 0;
2884 	}
2885 
2886 	addr_location__init(&al);
2887 	if (machine__resolve(machine, &al, sample) < 0) {
2888 		pr_err("problem processing %s (%u) event at offset %#" PRIx64 ", skipping it\n",
2889 		       perf_event__name(event->header.type), event->header.type,
2890 		       sample->file_offset);
2891 		rc = -1;
2892 		goto out;
2893 	}
2894 
2895 	if (sched->show_prio || sched->prio_str)
2896 		timehist_update_task_prio(sample, machine);
2897 
2898 	thread = timehist_get_thread(sched, sample, machine);
2899 	if (thread == NULL) {
2900 		rc = -1;
2901 		goto out;
2902 	}
2903 
2904 	if (timehist_skip_sample(sched, thread, sample))
2905 		goto out;
2906 
2907 	tr = thread__get_runtime(thread);
2908 	if (tr == NULL) {
2909 		rc = -1;
2910 		goto out;
2911 	}
2912 
2913 	tprev = evsel__get_time(sample->evsel, sample->cpu);
2914 
2915 	/*
2916 	 * If start time given:
2917 	 * - sample time is under window user cares about - skip sample
2918 	 * - tprev is under window user cares about  - reset to start of window
2919 	 */
2920 	if (ptime->start && ptime->start > t)
2921 		goto out;
2922 
2923 	if (tprev && ptime->start > tprev)
2924 		tprev = ptime->start;
2925 
2926 	/*
2927 	 * If end time given:
2928 	 * - previous sched event is out of window - we are done
2929 	 * - sample time is beyond window user cares about - reset it
2930 	 *   to close out stats for time window interest
2931 	 * - If tprev is 0, that is, sched_in event for current task is
2932 	 *   not recorded, cannot determine whether sched_in event is
2933 	 *   within time window interest - ignore it
2934 	 */
2935 	if (ptime->end) {
2936 		if (!tprev || tprev > ptime->end)
2937 			goto out;
2938 
2939 		if (t > ptime->end)
2940 			t = ptime->end;
2941 	}
2942 
2943 	/*
2944 	 * Use is_idle_sample() not thread__tid() == 0: a crafted perf.data
2945 	 * can set common_pid=0 with prev_pid!=0, giving us a machine thread
2946 	 * whose priv is thread_runtime, not idle_thread_runtime — the cast
2947 	 * below would read past the allocation.
2948 	 */
2949 	if (!sched->idle_hist || is_idle_sample(sample)) {
2950 		if (!cpu_list || (sample->cpu < MAX_NR_CPUS &&
2951 				 test_bit(sample->cpu, cpu_bitmap)))
2952 			timehist_update_runtime_stats(tr, t, tprev);
2953 
2954 		if (sched->idle_hist) {
2955 			struct idle_thread_runtime *itr = (void *)tr;
2956 			struct thread_runtime *last_tr;
2957 
2958 			if (itr->last_thread == NULL)
2959 				goto out;
2960 
2961 			/* add current idle time as last thread's runtime */
2962 			last_tr = thread__get_runtime(itr->last_thread);
2963 			if (last_tr == NULL)
2964 				goto out;
2965 
2966 			timehist_update_runtime_stats(last_tr, t, tprev);
2967 			/*
2968 			 * remove delta time of last thread as it's not updated
2969 			 * and otherwise it will show an invalid value next
2970 			 * time.  we only care total run time and run stat.
2971 			 */
2972 			last_tr->dt_run = 0;
2973 			last_tr->dt_delay = 0;
2974 			last_tr->dt_sleep = 0;
2975 			last_tr->dt_iowait = 0;
2976 			last_tr->dt_preempt = 0;
2977 
2978 			if (itr->cursor.nr)
2979 				callchain_append(&itr->callchain, &itr->cursor, t - tprev);
2980 
2981 			thread__zput(itr->last_thread);
2982 		}
2983 
2984 		if (!sched->summary_only)
2985 			timehist_print_sample(sched, sample, &al, thread, t, state);
2986 	}
2987 
2988 out:
2989 	if (sched->hist_time.start == 0 && t >= ptime->start)
2990 		sched->hist_time.start = t;
2991 	if (ptime->end == 0 || t <= ptime->end)
2992 		sched->hist_time.end = t;
2993 
2994 	if (tr) {
2995 		/* time of this sched_switch event becomes last time task seen */
2996 		tr->last_time = sample->time;
2997 
2998 		/* last state is used to determine where to account wait time */
2999 		tr->last_state = state;
3000 
3001 		/* sched out event for task so reset ready to run time and migrated time */
3002 		if (state == 'R')
3003 			tr->ready_to_run = t;
3004 		else
3005 			tr->ready_to_run = 0;
3006 
3007 		tr->migrated = 0;
3008 	}
3009 
3010 	evsel__save_time(sample->evsel, sample->time, sample->cpu);
3011 
3012 	thread__put(thread);
3013 	addr_location__exit(&al);
3014 	return rc;
3015 }
3016 
3017 static int timehist_sched_switch_event(const struct perf_tool *tool,
3018 			     union perf_event *event,
3019 			     struct perf_sample *sample,
3020 			     struct machine *machine __maybe_unused)
3021 {
3022 	return timehist_sched_change_event(tool, event, sample, machine);
3023 }
3024 
3025 static int process_lost(const struct perf_tool *tool __maybe_unused,
3026 			union perf_event *event,
3027 			struct perf_sample *sample,
3028 			struct machine *machine __maybe_unused)
3029 {
3030 	char tstr[64];
3031 
3032 	timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
3033 	printf("%15s ", tstr);
3034 	printf("lost %" PRI_lu64 " events on cpu %d\n", event->lost.lost, sample->cpu);
3035 
3036 	return 0;
3037 }
3038 
3039 
3040 static void print_thread_runtime(struct thread *t,
3041 				 struct thread_runtime *r)
3042 {
3043 	double mean = avg_stats(&r->run_stats);
3044 	float stddev;
3045 
3046 	printf("%*s   %5d  %9" PRIu64 " ",
3047 	       comm_width, timehist_get_commstr(t), thread__ppid(t),
3048 	       (u64) r->run_stats.n);
3049 
3050 	print_sched_time(r->total_run_time, 8);
3051 	stddev = rel_stddev_stats(stddev_stats(&r->run_stats), mean);
3052 	print_sched_time(r->run_stats.min, 6);
3053 	printf(" ");
3054 	print_sched_time((u64) mean, 6);
3055 	printf(" ");
3056 	print_sched_time(r->run_stats.max, 6);
3057 	printf("  ");
3058 	printf("%5.2f", stddev);
3059 	printf("   %5" PRIu64, r->migrations);
3060 	printf("\n");
3061 }
3062 
3063 static void print_thread_waittime(struct thread *t,
3064 				  struct thread_runtime *r)
3065 {
3066 	printf("%*s   %5d  %9" PRIu64 " ",
3067 	       comm_width, timehist_get_commstr(t), thread__ppid(t),
3068 	       (u64) r->run_stats.n);
3069 
3070 	print_sched_time(r->total_run_time, 8);
3071 	print_sched_time(r->total_sleep_time, 6);
3072 	printf(" ");
3073 	print_sched_time(r->total_iowait_time, 6);
3074 	printf(" ");
3075 	print_sched_time(r->total_preempt_time, 6);
3076 	printf(" ");
3077 	print_sched_time(r->total_delay_time, 6);
3078 	printf("\n");
3079 }
3080 
3081 struct total_run_stats {
3082 	struct perf_sched *sched;
3083 	u64  sched_count;
3084 	u64  task_count;
3085 	u64  total_run_time;
3086 };
3087 
3088 static int show_thread_runtime(struct thread *t, void *priv)
3089 {
3090 	struct total_run_stats *stats = priv;
3091 	struct thread_runtime *r;
3092 
3093 	if (thread__is_filtered(t))
3094 		return 0;
3095 
3096 	r = thread__priv(t);
3097 	if (r && r->run_stats.n) {
3098 		stats->task_count++;
3099 		stats->sched_count += r->run_stats.n;
3100 		stats->total_run_time += r->total_run_time;
3101 
3102 		if (stats->sched->show_state)
3103 			print_thread_waittime(t, r);
3104 		else
3105 			print_thread_runtime(t, r);
3106 	}
3107 
3108 	return 0;
3109 }
3110 
3111 static size_t callchain__fprintf_folded(FILE *fp, struct callchain_node *node)
3112 {
3113 	const char *sep = " <- ";
3114 	struct callchain_list *chain;
3115 	size_t ret = 0;
3116 	char bf[1024];
3117 	bool first;
3118 
3119 	if (node == NULL)
3120 		return 0;
3121 
3122 	ret = callchain__fprintf_folded(fp, node->parent);
3123 	first = (ret == 0);
3124 
3125 	list_for_each_entry(chain, &node->val, list) {
3126 		if (chain->ip >= PERF_CONTEXT_MAX)
3127 			continue;
3128 		if (chain->ms.sym && symbol__ignore(chain->ms.sym))
3129 			continue;
3130 		ret += fprintf(fp, "%s%s", first ? "" : sep,
3131 			       callchain_list__sym_name(chain, bf, sizeof(bf),
3132 							false));
3133 		first = false;
3134 	}
3135 
3136 	return ret;
3137 }
3138 
3139 static size_t timehist_print_idlehist_callchain(struct rb_root_cached *root)
3140 {
3141 	size_t ret = 0;
3142 	FILE *fp = stdout;
3143 	struct callchain_node *chain;
3144 	/* sort() uses rb_insert_color() on rb_root, not rb_root_cached */
3145 	struct rb_node *rb_node = rb_first(&root->rb_root);
3146 
3147 	printf("  %16s  %8s  %s\n", "Idle time (msec)", "Count", "Callchains");
3148 	printf("  %.16s  %.8s  %.50s\n", graph_dotted_line, graph_dotted_line,
3149 	       graph_dotted_line);
3150 
3151 	while (rb_node) {
3152 		chain = rb_entry(rb_node, struct callchain_node, rb_node);
3153 		rb_node = rb_next(rb_node);
3154 
3155 		ret += fprintf(fp, "  ");
3156 		print_sched_time(chain->hit, 12);
3157 		ret += 16;  /* print_sched_time returns 2nd arg + 4 */
3158 		ret += fprintf(fp, " %8d  ", chain->count);
3159 		ret += callchain__fprintf_folded(fp, chain);
3160 		ret += fprintf(fp, "\n");
3161 	}
3162 
3163 	return ret;
3164 }
3165 
3166 static void timehist_print_summary(struct perf_sched *sched,
3167 				   struct perf_session *session)
3168 {
3169 	struct machine *m = &session->machines.host;
3170 	struct total_run_stats totals;
3171 	u64 task_count;
3172 	struct thread *t;
3173 	struct thread_runtime *r;
3174 	int i;
3175 	u64 hist_time = sched->hist_time.end - sched->hist_time.start;
3176 
3177 	memset(&totals, 0, sizeof(totals));
3178 	totals.sched = sched;
3179 
3180 	if (sched->idle_hist) {
3181 		printf("\nIdle-time summary\n");
3182 		printf("%*s  parent  sched-out  ", comm_width, "comm");
3183 		printf("  idle-time   min-idle    avg-idle    max-idle  stddev  migrations\n");
3184 	} else if (sched->show_state) {
3185 		printf("\nWait-time summary\n");
3186 		printf("%*s  parent   sched-in  ", comm_width, "comm");
3187 		printf("   run-time      sleep      iowait     preempt       delay\n");
3188 	} else {
3189 		printf("\nRuntime summary\n");
3190 		printf("%*s  parent   sched-in  ", comm_width, "comm");
3191 		printf("   run-time    min-run     avg-run     max-run  stddev  migrations\n");
3192 	}
3193 	printf("%*s            (count)  ", comm_width, "");
3194 	printf("     (msec)     (msec)      (msec)      (msec)       %s\n",
3195 	       sched->show_state ? "(msec)" : "%");
3196 	printf("%.117s\n", graph_dotted_line);
3197 
3198 	machine__for_each_thread(m, show_thread_runtime, &totals);
3199 	task_count = totals.task_count;
3200 	if (!task_count)
3201 		printf("<no still running tasks>\n");
3202 
3203 	/* CPU idle stats not tracked when samples were skipped */
3204 	if (sched->skipped_samples && !sched->idle_hist)
3205 		return;
3206 
3207 	printf("\nIdle stats:\n");
3208 	for (i = 0; i < idle_max_cpu; ++i) {
3209 		if (cpu_list && !test_bit(i, cpu_bitmap))
3210 			continue;
3211 
3212 		t = idle_threads[i];
3213 		if (!t)
3214 			continue;
3215 
3216 		r = thread__priv(t);
3217 		if (r && r->run_stats.n) {
3218 			totals.sched_count += r->run_stats.n;
3219 			printf("    CPU %2d idle for ", i);
3220 			print_sched_time(r->total_run_time, 6);
3221 			printf(" msec  (%6.2f%%)\n", 100.0 * r->total_run_time / hist_time);
3222 		} else
3223 			printf("    CPU %2d idle entire time window\n", i);
3224 	}
3225 
3226 	if (sched->idle_hist && sched->show_callchain) {
3227 		callchain_param.mode  = CHAIN_FOLDED;
3228 		callchain_param.value = CCVAL_PERIOD;
3229 
3230 		callchain_register_param(&callchain_param);
3231 
3232 		printf("\nIdle stats by callchain:\n");
3233 		for (i = 0; i < idle_max_cpu; ++i) {
3234 			struct idle_thread_runtime *itr;
3235 
3236 			t = idle_threads[i];
3237 			if (!t)
3238 				continue;
3239 
3240 			itr = thread__priv(t);
3241 			if (itr == NULL)
3242 				continue;
3243 
3244 			callchain_param.sort(&itr->sorted_root.rb_root, &itr->callchain,
3245 					     0, &callchain_param);
3246 
3247 			printf("  CPU %2d:", i);
3248 			print_sched_time(itr->tr.total_run_time, 6);
3249 			printf(" msec\n");
3250 			timehist_print_idlehist_callchain(&itr->sorted_root);
3251 			printf("\n");
3252 		}
3253 	}
3254 
3255 	printf("\n"
3256 	       "    Total number of unique tasks: %" PRIu64 "\n"
3257 	       "Total number of context switches: %" PRIu64 "\n",
3258 	       totals.task_count, totals.sched_count);
3259 
3260 	printf("           Total run time (msec): ");
3261 	print_sched_time(totals.total_run_time, 2);
3262 	printf("\n");
3263 
3264 	printf("    Total scheduling time (msec): ");
3265 	print_sched_time(hist_time, 2);
3266 	printf(" (x %d)\n", sched->max_cpu.cpu);
3267 }
3268 
3269 typedef int (*sched_handler)(const struct perf_tool *tool,
3270 			  union perf_event *event,
3271 			  struct perf_sample *sample,
3272 			  struct machine *machine);
3273 
3274 static int perf_timehist__process_sample(const struct perf_tool *tool,
3275 					 union perf_event *event,
3276 					 struct perf_sample *sample,
3277 					 struct machine *machine)
3278 {
3279 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
3280 	struct evsel *evsel = sample->evsel;
3281 	int err = 0;
3282 	struct perf_cpu this_cpu = {
3283 		.cpu = sample->cpu,
3284 	};
3285 
3286 	/* max_cpu indexes arrays allocated with MAX_CPUS entries */
3287 	if (this_cpu.cpu >= 0 && this_cpu.cpu < MAX_CPUS &&
3288 	    this_cpu.cpu > sched->max_cpu.cpu)
3289 		sched->max_cpu = this_cpu;
3290 
3291 	if (evsel->handler != NULL) {
3292 		sched_handler f = evsel->handler;
3293 
3294 		err = f(tool, event, sample, machine);
3295 	}
3296 
3297 	return err;
3298 }
3299 
3300 static int timehist_check_attr(struct perf_sched *sched,
3301 			       struct evlist *evlist)
3302 {
3303 	struct evsel *evsel;
3304 	struct evsel_runtime *er;
3305 
3306 	list_for_each_entry(evsel, &evlist->core.entries, core.node) {
3307 		er = evsel__get_runtime(evsel);
3308 		if (er == NULL) {
3309 			pr_err("Failed to allocate memory for evsel runtime data\n");
3310 			return -1;
3311 		}
3312 
3313 		/* only need to save callchain related to sched_switch event */
3314 		if (sched->show_callchain &&
3315 		    evsel__name_is(evsel, "sched:sched_switch") &&
3316 		    !evsel__has_callchain(evsel)) {
3317 			pr_info("Samples of sched_switch event do not have callchains.\n");
3318 			sched->show_callchain = 0;
3319 			symbol_conf.use_callchain = 0;
3320 		}
3321 	}
3322 
3323 	return 0;
3324 }
3325 
3326 static int timehist_parse_prio_str(struct perf_sched *sched)
3327 {
3328 	char *p;
3329 	unsigned long start_prio, end_prio;
3330 	const char *str = sched->prio_str;
3331 
3332 	if (!str)
3333 		return 0;
3334 
3335 	while (isdigit(*str)) {
3336 		p = NULL;
3337 		start_prio = strtoul(str, &p, 0);
3338 		if (start_prio >= MAX_PRIO || (*p != '\0' && *p != ',' && *p != '-'))
3339 			return -1;
3340 
3341 		if (*p == '-') {
3342 			str = ++p;
3343 			p = NULL;
3344 			end_prio = strtoul(str, &p, 0);
3345 
3346 			if (end_prio >= MAX_PRIO || (*p != '\0' && *p != ','))
3347 				return -1;
3348 
3349 			if (end_prio < start_prio)
3350 				return -1;
3351 		} else {
3352 			end_prio = start_prio;
3353 		}
3354 
3355 		for (; start_prio <= end_prio; start_prio++)
3356 			__set_bit(start_prio, sched->prio_bitmap);
3357 
3358 		if (*p)
3359 			++p;
3360 
3361 		str = p;
3362 	}
3363 
3364 	return 0;
3365 }
3366 
3367 static int perf_sched__timehist(struct perf_sched *sched)
3368 {
3369 	struct evsel_str_handler handlers[] = {
3370 		{ "sched:sched_switch",       timehist_sched_switch_event, },
3371 		{ "sched:sched_wakeup",	      timehist_sched_wakeup_event, },
3372 		{ "sched:sched_waking",       timehist_sched_wakeup_event, },
3373 		{ "sched:sched_wakeup_new",   timehist_sched_wakeup_event, },
3374 	};
3375 	const struct evsel_str_handler migrate_handlers[] = {
3376 		{ "sched:sched_migrate_task", timehist_migrate_task_event, },
3377 	};
3378 	struct perf_data data = {
3379 		.path  = input_name,
3380 		.mode  = PERF_DATA_MODE_READ,
3381 		.force = sched->force,
3382 	};
3383 
3384 	struct perf_session *session;
3385 	struct perf_env *env;
3386 	struct evlist *evlist;
3387 	int err = -1;
3388 
3389 	/*
3390 	 * event handlers for timehist option
3391 	 */
3392 	sched->tool.sample	 = perf_timehist__process_sample;
3393 	sched->tool.mmap	 = perf_event__process_mmap;
3394 	sched->tool.mmap2	 = perf_event__process_mmap2;
3395 	sched->tool.comm	 = perf_event__process_comm;
3396 	sched->tool.exit	 = perf_event__process_exit;
3397 	sched->tool.fork	 = perf_event__process_fork;
3398 	sched->tool.lost	 = process_lost;
3399 	sched->tool.attr	 = perf_event__process_attr;
3400 	sched->tool.tracing_data = perf_event__process_tracing_data;
3401 	sched->tool.build_id	 = perf_event__process_build_id;
3402 
3403 	sched->tool.ordering_requires_timestamps = true;
3404 
3405 	symbol_conf.use_callchain = sched->show_callchain;
3406 
3407 	session = perf_session__new(&data, &sched->tool);
3408 	if (IS_ERR(session))
3409 		return PTR_ERR(session);
3410 
3411 	env = perf_session__env(session);
3412 	if (cpu_list) {
3413 		err = perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap);
3414 		if (err < 0)
3415 			goto out;
3416 	}
3417 
3418 	evlist = session->evlist;
3419 
3420 	symbol__init(env);
3421 
3422 	if (perf_time__parse_str(&sched->ptime, sched->time_str) != 0) {
3423 		pr_err("Invalid time string\n");
3424 		err = -EINVAL;
3425 		goto out;
3426 	}
3427 
3428 	if (timehist_check_attr(sched, evlist) != 0)
3429 		goto out;
3430 
3431 	if (timehist_parse_prio_str(sched) != 0) {
3432 		pr_err("Invalid prio string\n");
3433 		goto out;
3434 	}
3435 
3436 	setup_pager();
3437 
3438 	evsel__set_priv_destructor(timehist__evsel_priv_destructor);
3439 
3440 	/* prefer sched_waking if it is captured */
3441 	if (evlist__find_tracepoint_by_name(session->evlist, "sched:sched_waking"))
3442 		handlers[1].handler = timehist_sched_wakeup_ignore;
3443 
3444 	/* setup per-evsel handlers */
3445 	if (perf_session__set_tracepoints_handlers(session, handlers))
3446 		goto out;
3447 
3448 	/* sched_switch event at a minimum needs to exist */
3449 	if (!evlist__find_tracepoint_by_name(session->evlist, "sched:sched_switch")) {
3450 		pr_err("No sched_switch events found. Have you run 'perf sched record'?\n");
3451 		goto out;
3452 	}
3453 
3454 	if ((sched->show_migrations || sched->pre_migrations) &&
3455 		perf_session__set_tracepoints_handlers(session, migrate_handlers))
3456 		goto out;
3457 
3458 	/* pre-allocate struct for per-CPU idle stats; cap to array bounds */
3459 	sched->max_cpu.cpu = min(env->nr_cpus_online, MAX_CPUS);
3460 	if (sched->max_cpu.cpu == 0)
3461 		sched->max_cpu.cpu = 4;
3462 	if (init_idle_threads(sched->max_cpu.cpu))
3463 		goto out;
3464 
3465 	/* summary_only implies summary option, but don't overwrite summary if set */
3466 	if (sched->summary_only)
3467 		sched->summary = sched->summary_only;
3468 
3469 	if (!sched->summary_only)
3470 		timehist_header(sched);
3471 
3472 	err = perf_session__process_events(session);
3473 	if (err) {
3474 		pr_err("Failed to process events, error %d", err);
3475 		goto out;
3476 	}
3477 
3478 	sched->nr_events      = evlist->stats.nr_events[0];
3479 	sched->nr_lost_events = evlist->stats.total_lost;
3480 	sched->nr_lost_chunks = evlist->stats.nr_events[PERF_RECORD_LOST];
3481 
3482 	if (sched->summary)
3483 		timehist_print_summary(sched, session);
3484 
3485 out:
3486 	free_idle_threads();
3487 	perf_session__delete(session);
3488 
3489 	return err;
3490 }
3491 
3492 
3493 static void print_bad_events(struct perf_sched *sched)
3494 {
3495 	if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
3496 		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
3497 			(double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
3498 			sched->nr_unordered_timestamps, sched->nr_timestamps);
3499 	}
3500 	if (sched->nr_lost_events && sched->nr_events) {
3501 		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
3502 			(double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
3503 			sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
3504 	}
3505 	if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
3506 		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
3507 			(double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
3508 			sched->nr_context_switch_bugs, sched->nr_timestamps);
3509 		if (sched->nr_lost_events)
3510 			printf(" (due to lost events?)");
3511 		printf("\n");
3512 	}
3513 }
3514 
3515 static void __merge_work_atoms(struct rb_root_cached *root, struct work_atoms *data)
3516 {
3517 	struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL;
3518 	struct work_atoms *this;
3519 	const char *comm = thread__comm_str(data->thread), *this_comm;
3520 	bool leftmost = true;
3521 
3522 	while (*new) {
3523 		int cmp;
3524 
3525 		this = container_of(*new, struct work_atoms, node);
3526 		parent = *new;
3527 
3528 		this_comm = thread__comm_str(this->thread);
3529 		cmp = strcmp(comm, this_comm);
3530 		if (cmp > 0) {
3531 			new = &((*new)->rb_left);
3532 		} else if (cmp < 0) {
3533 			new = &((*new)->rb_right);
3534 			leftmost = false;
3535 		} else {
3536 			this->num_merged++;
3537 			this->total_runtime += data->total_runtime;
3538 			this->nb_atoms += data->nb_atoms;
3539 			this->total_lat += data->total_lat;
3540 			list_splice_init(&data->work_list, &this->work_list);
3541 			if (this->max_lat < data->max_lat) {
3542 				this->max_lat = data->max_lat;
3543 				this->max_lat_start = data->max_lat_start;
3544 				this->max_lat_end = data->max_lat_end;
3545 			}
3546 			free_work_atoms(data);
3547 			return;
3548 		}
3549 	}
3550 
3551 	data->num_merged++;
3552 	rb_link_node(&data->node, parent, new);
3553 	rb_insert_color_cached(&data->node, root, leftmost);
3554 }
3555 
3556 static void perf_sched__merge_lat(struct perf_sched *sched)
3557 {
3558 	struct work_atoms *data;
3559 	struct rb_node *node;
3560 
3561 	if (sched->skip_merge)
3562 		return;
3563 
3564 	while ((node = rb_first_cached(&sched->atom_root))) {
3565 		rb_erase_cached(node, &sched->atom_root);
3566 		data = rb_entry(node, struct work_atoms, node);
3567 		__merge_work_atoms(&sched->merged_atom_root, data);
3568 	}
3569 }
3570 
3571 static int setup_cpus_switch_event(struct perf_sched *sched)
3572 {
3573 	unsigned int i;
3574 
3575 	sched->cpu_last_switched = calloc(MAX_CPUS, sizeof(*(sched->cpu_last_switched)));
3576 	if (!sched->cpu_last_switched)
3577 		return -1;
3578 
3579 	sched->curr_pid = calloc(MAX_CPUS, sizeof(*(sched->curr_pid)));
3580 	if (!sched->curr_pid) {
3581 		zfree(&sched->cpu_last_switched);
3582 		return -1;
3583 	}
3584 
3585 	for (i = 0; i < MAX_CPUS; i++)
3586 		sched->curr_pid[i] = -1;
3587 
3588 	return 0;
3589 }
3590 
3591 static void free_cpus_switch_event(struct perf_sched *sched)
3592 {
3593 	zfree(&sched->curr_pid);
3594 	zfree(&sched->cpu_last_switched);
3595 }
3596 
3597 static int perf_sched__lat(struct perf_sched *sched)
3598 {
3599 	int rc = -1;
3600 	struct rb_node *next;
3601 
3602 	setup_pager();
3603 
3604 	if (setup_cpus_switch_event(sched))
3605 		return rc;
3606 
3607 	if (perf_sched__read_events(sched))
3608 		goto out_free_cpus_switch_event;
3609 
3610 	perf_sched__merge_lat(sched);
3611 	perf_sched__sort_lat(sched);
3612 
3613 	printf("\n -------------------------------------------------------------------------------------------------------------------------------------------\n");
3614 	printf("  Task                  |   Runtime ms  |  Count   | Avg delay ms    | Max delay ms    | Max delay start           | Max delay end          |\n");
3615 	printf(" -------------------------------------------------------------------------------------------------------------------------------------------\n");
3616 
3617 	next = rb_first_cached(&sched->sorted_atom_root);
3618 
3619 	while (next) {
3620 		struct work_atoms *work_list;
3621 
3622 		work_list = rb_entry(next, struct work_atoms, node);
3623 		output_lat_thread(sched, work_list);
3624 		next = rb_next(next);
3625 	}
3626 
3627 	printf(" -----------------------------------------------------------------------------------------------------------------\n");
3628 	printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
3629 		(double)sched->all_runtime / NSEC_PER_MSEC, sched->all_count);
3630 
3631 	printf(" ---------------------------------------------------\n");
3632 
3633 	print_bad_events(sched);
3634 	printf("\n");
3635 
3636 	rc = 0;
3637 
3638 	while ((next = rb_first_cached(&sched->sorted_atom_root))) {
3639 		struct work_atoms *data;
3640 
3641 		data = rb_entry(next, struct work_atoms, node);
3642 		rb_erase_cached(next, &sched->sorted_atom_root);
3643 		free_work_atoms(data);
3644 	}
3645 out_free_cpus_switch_event:
3646 	free_cpus_switch_event(sched);
3647 	return rc;
3648 }
3649 
3650 static int setup_map_cpus(struct perf_sched *sched)
3651 {
3652 	if (sched->map.comp) {
3653 		sched->map.comp_cpus = calloc(MAX_CPUS, sizeof(*sched->map.comp_cpus));
3654 		if (!sched->map.comp_cpus)
3655 			return -1;
3656 	}
3657 
3658 	if (sched->map.cpus_str) {
3659 		sched->map.cpus = perf_cpu_map__new(sched->map.cpus_str);
3660 		if (!sched->map.cpus) {
3661 			pr_err("failed to get cpus map from %s\n", sched->map.cpus_str);
3662 			zfree(&sched->map.comp_cpus);
3663 			return -1;
3664 		}
3665 	}
3666 
3667 	return 0;
3668 }
3669 
3670 static int setup_color_pids(struct perf_sched *sched)
3671 {
3672 	struct perf_thread_map *map;
3673 
3674 	if (!sched->map.color_pids_str)
3675 		return 0;
3676 
3677 	map = thread_map__new_by_tid_str(sched->map.color_pids_str);
3678 	if (!map) {
3679 		pr_err("failed to get thread map from %s\n", sched->map.color_pids_str);
3680 		return -1;
3681 	}
3682 
3683 	sched->map.color_pids = map;
3684 	return 0;
3685 }
3686 
3687 static int setup_color_cpus(struct perf_sched *sched)
3688 {
3689 	struct perf_cpu_map *map;
3690 
3691 	if (!sched->map.color_cpus_str)
3692 		return 0;
3693 
3694 	map = perf_cpu_map__new(sched->map.color_cpus_str);
3695 	if (!map) {
3696 		pr_err("failed to get thread map from %s\n", sched->map.color_cpus_str);
3697 		return -1;
3698 	}
3699 
3700 	sched->map.color_cpus = map;
3701 	return 0;
3702 }
3703 
3704 static int perf_sched__map(struct perf_sched *sched)
3705 {
3706 	int rc = -1;
3707 
3708 	sched->curr_thread = calloc(MAX_CPUS, sizeof(*(sched->curr_thread)));
3709 	if (!sched->curr_thread)
3710 		return rc;
3711 
3712 	sched->curr_out_thread = calloc(MAX_CPUS, sizeof(*(sched->curr_out_thread)));
3713 	if (!sched->curr_out_thread)
3714 		goto out_free_curr_thread;
3715 
3716 	if (setup_cpus_switch_event(sched))
3717 		goto out_free_curr_out_thread;
3718 
3719 	if (setup_map_cpus(sched))
3720 		goto out_free_cpus_switch_event;
3721 
3722 	if (setup_color_pids(sched))
3723 		goto out_put_map_cpus;
3724 
3725 	if (setup_color_cpus(sched))
3726 		goto out_put_color_pids;
3727 
3728 	setup_pager();
3729 	if (perf_sched__read_events(sched))
3730 		goto out_put_color_cpus;
3731 
3732 	rc = 0;
3733 	print_bad_events(sched);
3734 
3735 out_put_color_cpus:
3736 	perf_cpu_map__put(sched->map.color_cpus);
3737 
3738 out_put_color_pids:
3739 	perf_thread_map__put(sched->map.color_pids);
3740 
3741 out_put_map_cpus:
3742 	zfree(&sched->map.comp_cpus);
3743 	perf_cpu_map__put(sched->map.cpus);
3744 
3745 out_free_cpus_switch_event:
3746 	free_cpus_switch_event(sched);
3747 
3748 out_free_curr_out_thread:
3749 	for (int i = 0; i < MAX_CPUS; i++)
3750 		thread__put(sched->curr_out_thread[i]);
3751 	zfree(&sched->curr_out_thread);
3752 
3753 out_free_curr_thread:
3754 	for (int i = 0; i < MAX_CPUS; i++)
3755 		thread__put(sched->curr_thread[i]);
3756 	zfree(&sched->curr_thread);
3757 	return rc;
3758 }
3759 
3760 static int perf_sched__replay(struct perf_sched *sched)
3761 {
3762 	int ret;
3763 	unsigned long i;
3764 
3765 	mutex_init(&sched->start_work_mutex);
3766 	mutex_init(&sched->work_done_wait_mutex);
3767 
3768 	ret = setup_cpus_switch_event(sched);
3769 	if (ret)
3770 		goto out_mutex_destroy;
3771 
3772 	calibrate_run_measurement_overhead(sched);
3773 	calibrate_sleep_measurement_overhead(sched);
3774 
3775 	test_calibrations(sched);
3776 
3777 	ret = perf_sched__read_events(sched);
3778 	if (ret)
3779 		goto out_free_cpus_switch_event;
3780 
3781 	printf("nr_run_events:        %ld\n", sched->nr_run_events);
3782 	printf("nr_sleep_events:      %ld\n", sched->nr_sleep_events);
3783 	printf("nr_wakeup_events:     %ld\n", sched->nr_wakeup_events);
3784 
3785 	if (sched->targetless_wakeups)
3786 		printf("target-less wakeups:  %ld\n", sched->targetless_wakeups);
3787 	if (sched->multitarget_wakeups)
3788 		printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
3789 	if (sched->nr_run_events_optimized)
3790 		printf("run atoms optimized: %ld\n",
3791 			sched->nr_run_events_optimized);
3792 
3793 	print_task_traces(sched);
3794 	add_cross_task_wakeups(sched);
3795 
3796 	sched->thread_funcs_exit = false;
3797 	create_tasks(sched);
3798 	printf("------------------------------------------------------------\n");
3799 	if (sched->replay_repeat == 0)
3800 		sched->replay_repeat = UINT_MAX;
3801 
3802 	for (i = 0; i < sched->replay_repeat; i++)
3803 		run_one_test(sched);
3804 
3805 	sched->thread_funcs_exit = true;
3806 	destroy_tasks(sched);
3807 
3808 out_free_cpus_switch_event:
3809 	free_cpus_switch_event(sched);
3810 
3811 out_mutex_destroy:
3812 	mutex_destroy(&sched->start_work_mutex);
3813 	mutex_destroy(&sched->work_done_wait_mutex);
3814 	return ret;
3815 }
3816 
3817 static void setup_sorting(struct perf_sched *sched, const struct option *options,
3818 			  const char * const usage_msg[])
3819 {
3820 	char *tmp, *tok, *str = strdup(sched->sort_order);
3821 
3822 	for (tok = strtok_r(str, ", ", &tmp);
3823 			tok; tok = strtok_r(NULL, ", ", &tmp)) {
3824 		if (sort_dimension__add(tok, &sched->sort_list) < 0) {
3825 			usage_with_options_msg(usage_msg, options,
3826 					"Unknown --sort key: `%s'", tok);
3827 		}
3828 	}
3829 
3830 	free(str);
3831 
3832 	sort_dimension__add("pid", &sched->cmp_pid);
3833 }
3834 
3835 static int process_synthesized_schedstat_event(const struct perf_tool *tool,
3836 					       union perf_event *event,
3837 					       struct perf_sample *sample __maybe_unused,
3838 					       struct machine *machine __maybe_unused)
3839 {
3840 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
3841 
3842 	if (perf_data__write(sched->data, event, event->header.size) <= 0) {
3843 		pr_err("failed to write perf data, error: %m\n");
3844 		return -1;
3845 	}
3846 
3847 	sched->session->header.data_size += event->header.size;
3848 	return 0;
3849 }
3850 
3851 static volatile sig_atomic_t done;
3852 
3853 static void sighandler(int sig __maybe_unused)
3854 {
3855 	done = 1;
3856 }
3857 
3858 static int enable_sched_schedstats(int *reset)
3859 {
3860 	char path[PATH_MAX];
3861 	FILE *fp;
3862 	char ch;
3863 
3864 	snprintf(path, PATH_MAX, "%s/sys/kernel/sched_schedstats", procfs__mountpoint());
3865 	fp = fopen(path, "w+");
3866 	if (!fp) {
3867 		pr_err("Failed to open %s\n", path);
3868 		return -1;
3869 	}
3870 
3871 	ch = getc(fp);
3872 	if (ch == '0') {
3873 		*reset = 1;
3874 		rewind(fp);
3875 		putc('1', fp);
3876 		fclose(fp);
3877 	}
3878 	return 0;
3879 }
3880 
3881 static int disable_sched_schedstat(void)
3882 {
3883 	char path[PATH_MAX];
3884 	FILE *fp;
3885 
3886 	snprintf(path, PATH_MAX, "%s/sys/kernel/sched_schedstats", procfs__mountpoint());
3887 	fp = fopen(path, "w");
3888 	if (!fp) {
3889 		pr_err("Failed to open %s\n", path);
3890 		return -1;
3891 	}
3892 
3893 	putc('0', fp);
3894 	fclose(fp);
3895 	return 0;
3896 }
3897 
3898 /* perf.data or any other output file name used by stats subcommand (only). */
3899 const char *output_name;
3900 
3901 static int perf_sched__schedstat_record(struct perf_sched *sched,
3902 					int argc, const char **argv)
3903 {
3904 	struct perf_session *session;
3905 	struct target target = {};
3906 	struct evlist *evlist;
3907 	int reset = 0;
3908 	int err = 0;
3909 	int fd;
3910 	struct perf_data data = {
3911 		.path  = output_name,
3912 		.mode  = PERF_DATA_MODE_WRITE,
3913 	};
3914 
3915 	done = 0;
3916 	signal(SIGINT, sighandler);
3917 	signal(SIGCHLD, sighandler);
3918 	signal(SIGTERM, sighandler);
3919 
3920 	evlist = evlist__new();
3921 	if (!evlist)
3922 		return -ENOMEM;
3923 
3924 	session = perf_session__new(&data, &sched->tool);
3925 	if (IS_ERR(session)) {
3926 		pr_err("Perf session creation failed.\n");
3927 		evlist__delete(evlist);
3928 		return PTR_ERR(session);
3929 	}
3930 
3931 	session->evlist = evlist;
3932 
3933 	sched->session = session;
3934 	sched->data = &data;
3935 
3936 	fd = perf_data__fd(&data);
3937 
3938 	/*
3939 	 * Capture all important metadata about the system. Although they are
3940 	 * not used by `perf sched stats` tool directly, they provide useful
3941 	 * information about profiled environment.
3942 	 */
3943 	perf_header__set_feat(&session->header, HEADER_HOSTNAME);
3944 	perf_header__set_feat(&session->header, HEADER_OSRELEASE);
3945 	perf_header__set_feat(&session->header, HEADER_VERSION);
3946 	perf_header__set_feat(&session->header, HEADER_ARCH);
3947 	perf_header__set_feat(&session->header, HEADER_NRCPUS);
3948 	perf_header__set_feat(&session->header, HEADER_CPUDESC);
3949 	perf_header__set_feat(&session->header, HEADER_CPUID);
3950 	perf_header__set_feat(&session->header, HEADER_TOTAL_MEM);
3951 	perf_header__set_feat(&session->header, HEADER_CMDLINE);
3952 	perf_header__set_feat(&session->header, HEADER_CPU_TOPOLOGY);
3953 	perf_header__set_feat(&session->header, HEADER_NUMA_TOPOLOGY);
3954 	perf_header__set_feat(&session->header, HEADER_CACHE);
3955 	perf_header__set_feat(&session->header, HEADER_MEM_TOPOLOGY);
3956 	perf_header__set_feat(&session->header, HEADER_HYBRID_TOPOLOGY);
3957 	perf_header__set_feat(&session->header, HEADER_CPU_DOMAIN_INFO);
3958 
3959 	err = perf_session__write_header(session, evlist, fd, false);
3960 	if (err < 0)
3961 		goto out;
3962 
3963 	/*
3964 	 * `perf sched stats` does not support workload profiling (-p pid)
3965 	 * since /proc/schedstat file contains cpu specific data only. Hence, a
3966 	 * profile target is either set of cpus or systemwide, never a process.
3967 	 * Note that, although `-- <workload>` is supported, profile data are
3968 	 * still cpu/systemwide.
3969 	 */
3970 	if (cpu_list)
3971 		target.cpu_list = cpu_list;
3972 	else
3973 		target.system_wide = true;
3974 
3975 	if (argc) {
3976 		err = evlist__prepare_workload(evlist, &target, argv, false, NULL);
3977 		if (err)
3978 			goto out;
3979 	}
3980 
3981 	err = evlist__create_maps(evlist, &target);
3982 	if (err < 0)
3983 		goto out;
3984 
3985 	user_requested_cpus = evlist->core.user_requested_cpus;
3986 
3987 	err = perf_event__synthesize_schedstat(&(sched->tool),
3988 					       process_synthesized_schedstat_event,
3989 					       user_requested_cpus);
3990 	if (err < 0)
3991 		goto out;
3992 
3993 	err = enable_sched_schedstats(&reset);
3994 	if (err < 0)
3995 		goto out;
3996 
3997 	if (argc)
3998 		evlist__start_workload(evlist);
3999 
4000 	while (!done) {
4001 		if (argc && waitpid(evlist->workload.pid, NULL, WNOHANG) > 0)
4002 			break;
4003 		sleep(1);
4004 	}
4005 
4006 	if (reset) {
4007 		err = disable_sched_schedstat();
4008 		if (err < 0)
4009 			goto out;
4010 	}
4011 
4012 	err = perf_event__synthesize_schedstat(&(sched->tool),
4013 					       process_synthesized_schedstat_event,
4014 					       user_requested_cpus);
4015 	if (err < 0)
4016 		goto out;
4017 
4018 	err = perf_session__write_header(session, evlist, fd, true);
4019 
4020 out:
4021 	if (!err)
4022 		fprintf(stderr, "[ perf sched stats: Wrote samples to %s ]\n", data.path);
4023 	else
4024 		fprintf(stderr, "[ perf sched stats: Failed !! ]\n");
4025 
4026 	evlist__delete(evlist);
4027 	close(fd);
4028 	return err;
4029 }
4030 
4031 struct schedstat_domain {
4032 	struct list_head domain_list;
4033 	struct perf_record_schedstat_domain *domain_data;
4034 };
4035 
4036 struct schedstat_cpu {
4037 	struct list_head cpu_list;
4038 	struct list_head domain_head;
4039 	struct perf_record_schedstat_cpu *cpu_data;
4040 };
4041 
4042 static struct list_head cpu_head = LIST_HEAD_INIT(cpu_head);
4043 static struct schedstat_cpu *cpu_second_pass;
4044 static struct schedstat_domain *domain_second_pass;
4045 static bool after_workload_flag;
4046 static bool verbose_field;
4047 
4048 static void free_schedstat(struct list_head *head);
4049 
4050 static void store_schedstat_cpu_diff(struct schedstat_cpu *after_workload)
4051 {
4052 	struct perf_record_schedstat_cpu *before = cpu_second_pass->cpu_data;
4053 	struct perf_record_schedstat_cpu *after = after_workload->cpu_data;
4054 	__u16 version = after_workload->cpu_data->version;
4055 
4056 #define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)	\
4057 	(before->_ver._name = after->_ver._name - before->_ver._name)
4058 
4059 	if (version == 15) {
4060 #include <perf/schedstat-v15.h>
4061 	} else if (version == 16) {
4062 #include <perf/schedstat-v16.h>
4063 	} else if (version == 17) {
4064 #include <perf/schedstat-v17.h>
4065 	}
4066 
4067 #undef CPU_FIELD
4068 }
4069 
4070 static void store_schedstat_domain_diff(struct schedstat_domain *after_workload)
4071 {
4072 	struct perf_record_schedstat_domain *before = domain_second_pass->domain_data;
4073 	struct perf_record_schedstat_domain *after = after_workload->domain_data;
4074 	__u16 version = after_workload->domain_data->version;
4075 
4076 #define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)	\
4077 	(before->_ver._name = after->_ver._name - before->_ver._name)
4078 
4079 	if (version == 15) {
4080 #include <perf/schedstat-v15.h>
4081 	} else if (version == 16) {
4082 #include <perf/schedstat-v16.h>
4083 	} else if (version == 17) {
4084 #include <perf/schedstat-v17.h>
4085 	}
4086 #undef DOMAIN_FIELD
4087 }
4088 
4089 #define PCT_CHNG(_x, _y)        ((_x) ? ((double)((double)(_y) - (_x)) / (_x)) * 100 : 0.0)
4090 static inline void print_cpu_stats(struct perf_record_schedstat_cpu *cs1,
4091 				   struct perf_record_schedstat_cpu *cs2)
4092 {
4093 	printf("%-65s ", "DESC");
4094 	if (!cs2)
4095 		printf("%12s %12s", "COUNT", "PCT_CHANGE");
4096 	else
4097 		printf("%12s %11s %12s %14s %10s", "COUNT1", "COUNT2", "PCT_CHANGE",
4098 		       "PCT_CHANGE1", "PCT_CHANGE2");
4099 
4100 	printf("\n");
4101 	print_separator2(SEP_LEN, "", 0);
4102 
4103 #define CALC_PCT(_x, _y)	((_y) ? ((double)(_x) / (_y)) * 100 : 0.0)
4104 
4105 #define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)			\
4106 	do {										\
4107 		printf("%-65s: " _format, verbose_field ? _desc : #_name,		\
4108 		       cs1->_ver._name);						\
4109 		if (!cs2) {								\
4110 			if (_is_pct)							\
4111 				printf("  ( %8.2lf%% )",				\
4112 				       CALC_PCT(cs1->_ver._name, cs1->_ver._pct_of));	\
4113 		} else {								\
4114 			printf("," _format "  | %8.2lf%% |", cs2->_ver._name,		\
4115 			       PCT_CHNG(cs1->_ver._name, cs2->_ver._name));		\
4116 			if (_is_pct)							\
4117 				printf("  ( %8.2lf%%,  %8.2lf%% )",			\
4118 				       CALC_PCT(cs1->_ver._name, cs1->_ver._pct_of),	\
4119 				       CALC_PCT(cs2->_ver._name, cs2->_ver._pct_of));	\
4120 		}									\
4121 		printf("\n");								\
4122 	} while (0)
4123 
4124 	if (cs1->version == 15) {
4125 #include <perf/schedstat-v15.h>
4126 	} else if (cs1->version == 16) {
4127 #include <perf/schedstat-v16.h>
4128 	} else if (cs1->version == 17) {
4129 #include <perf/schedstat-v17.h>
4130 	}
4131 
4132 #undef CPU_FIELD
4133 #undef CALC_PCT
4134 }
4135 
4136 static inline void print_domain_stats(struct perf_record_schedstat_domain *ds1,
4137 				      struct perf_record_schedstat_domain *ds2,
4138 				      __u64 jiffies1, __u64 jiffies2)
4139 {
4140 	printf("%-65s ", "DESC");
4141 	if (!ds2)
4142 		printf("%12s %14s", "COUNT", "AVG_JIFFIES");
4143 	else
4144 		printf("%12s %11s %12s %16s %12s", "COUNT1", "COUNT2", "PCT_CHANGE",
4145 		       "AVG_JIFFIES1", "AVG_JIFFIES2");
4146 	printf("\n");
4147 
4148 #define DOMAIN_CATEGORY(_desc)							\
4149 	do {									\
4150 		size_t _len = strlen(_desc);					\
4151 		size_t _pre_dash_cnt = (SEP_LEN - _len) / 2;			\
4152 		size_t _post_dash_cnt = SEP_LEN - _len - _pre_dash_cnt;		\
4153 		print_separator2((int)_pre_dash_cnt, _desc, (int)_post_dash_cnt);\
4154 	} while (0)
4155 
4156 #define CALC_AVG(_x, _y)	((_y) ? (long double)(_x) / (_y) : 0.0)
4157 
4158 #define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)		\
4159 	do {									\
4160 		printf("%-65s: " _format, verbose_field ? _desc : #_name,	\
4161 		       ds1->_ver._name);					\
4162 		if (!ds2) {							\
4163 			if (_is_jiffies)					\
4164 				printf("  $ %11.2Lf $",				\
4165 				       CALC_AVG(jiffies1, ds1->_ver._name));	\
4166 		} else {							\
4167 			printf("," _format "  | %8.2lf%% |", ds2->_ver._name,	\
4168 			       PCT_CHNG(ds1->_ver._name, ds2->_ver._name));	\
4169 			if (_is_jiffies)					\
4170 				printf("  $ %11.2Lf, %11.2Lf $",		\
4171 				       CALC_AVG(jiffies1, ds1->_ver._name),	\
4172 				       CALC_AVG(jiffies2, ds2->_ver._name));	\
4173 		}								\
4174 		printf("\n");							\
4175 	} while (0)
4176 
4177 #define DERIVED_CNT_FIELD(_name, _desc, _format, _x, _y, _z, _ver)		\
4178 	do {									\
4179 		__u32 t1 = ds1->_ver._x - ds1->_ver._y - ds1->_ver._z;		\
4180 		printf("*%-64s: " _format, verbose_field ? _desc : #_name, t1);	\
4181 		if (ds2) {							\
4182 			__u32 t2 = ds2->_ver._x - ds2->_ver._y - ds2->_ver._z;	\
4183 			printf("," _format "  | %8.2lf%% |", t2,		\
4184 			       PCT_CHNG(t1, t2));				\
4185 		}								\
4186 		printf("\n");							\
4187 	} while (0)
4188 
4189 #define DERIVED_AVG_FIELD(_name, _desc, _format, _x, _y, _z, _w, _ver)		\
4190 	do {									\
4191 		__u32 t1 = ds1->_ver._x - ds1->_ver._y - ds1->_ver._z;		\
4192 		printf("*%-64s: " _format, verbose_field ? _desc : #_name,	\
4193 		       CALC_AVG(ds1->_ver._w, t1));				\
4194 		if (ds2) {							\
4195 			__u32 t2 = ds2->_ver._x - ds2->_ver._y - ds2->_ver._z;	\
4196 			printf("," _format "  | %8.2Lf%% |",			\
4197 			       CALC_AVG(ds2->_ver._w, t2),			\
4198 			       PCT_CHNG(CALC_AVG(ds1->_ver._w, t1),		\
4199 					CALC_AVG(ds2->_ver._w, t2)));		\
4200 		}								\
4201 		printf("\n");							\
4202 	} while (0)
4203 
4204 	if (ds1->version == 15) {
4205 #include <perf/schedstat-v15.h>
4206 	} else if (ds1->version == 16) {
4207 #include <perf/schedstat-v16.h>
4208 	} else if (ds1->version == 17) {
4209 #include <perf/schedstat-v17.h>
4210 	}
4211 
4212 #undef DERIVED_AVG_FIELD
4213 #undef DERIVED_CNT_FIELD
4214 #undef DOMAIN_FIELD
4215 #undef CALC_AVG
4216 #undef DOMAIN_CATEGORY
4217 }
4218 #undef PCT_CHNG
4219 
4220 static void summarize_schedstat_cpu(struct schedstat_cpu *summary_cpu,
4221 				    struct schedstat_cpu *cptr,
4222 				    int cnt, bool is_last)
4223 {
4224 	struct perf_record_schedstat_cpu *summary_cs = summary_cpu->cpu_data,
4225 					 *temp_cs = cptr->cpu_data;
4226 
4227 #define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)		\
4228 	do {									\
4229 		summary_cs->_ver._name += temp_cs->_ver._name;			\
4230 		if (is_last)							\
4231 			summary_cs->_ver._name /= cnt;				\
4232 	} while (0)
4233 
4234 	if (cptr->cpu_data->version == 15) {
4235 #include <perf/schedstat-v15.h>
4236 	} else if (cptr->cpu_data->version == 16) {
4237 #include <perf/schedstat-v16.h>
4238 	} else if (cptr->cpu_data->version == 17) {
4239 #include <perf/schedstat-v17.h>
4240 	}
4241 #undef CPU_FIELD
4242 }
4243 
4244 static void summarize_schedstat_domain(struct schedstat_domain *summary_domain,
4245 				       struct schedstat_domain *dptr,
4246 				       int cnt, bool is_last)
4247 {
4248 	struct perf_record_schedstat_domain *summary_ds = summary_domain->domain_data,
4249 					    *temp_ds = dptr->domain_data;
4250 
4251 #define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)		\
4252 	do {									\
4253 		summary_ds->_ver._name += temp_ds->_ver._name;			\
4254 		if (is_last)							\
4255 			summary_ds->_ver._name /= cnt;				\
4256 	} while (0)
4257 
4258 	if (dptr->domain_data->version == 15) {
4259 #include <perf/schedstat-v15.h>
4260 	} else if (dptr->domain_data->version == 16) {
4261 #include <perf/schedstat-v16.h>
4262 	} else if (dptr->domain_data->version == 17) {
4263 #include <perf/schedstat-v17.h>
4264 	}
4265 #undef DOMAIN_FIELD
4266 }
4267 
4268 /*
4269  * get_all_cpu_stats() appends the summary to the head of the list.
4270  */
4271 static int get_all_cpu_stats(struct list_head *head)
4272 {
4273 	struct schedstat_cpu *cptr, *summary_head = NULL;
4274 	struct schedstat_domain *dptr, *tdptr;
4275 	bool is_last = false;
4276 	int cnt = 1;
4277 	int ret = 0;
4278 	struct list_head tmp_cleanup_list;
4279 
4280 	assert(!list_empty(head));
4281 	cptr = list_first_entry(head, struct schedstat_cpu, cpu_list);
4282 
4283 	INIT_LIST_HEAD(&tmp_cleanup_list);
4284 
4285 	summary_head = zalloc(sizeof(*summary_head));
4286 	if (!summary_head)
4287 		return -ENOMEM;
4288 
4289 	INIT_LIST_HEAD(&summary_head->domain_head);
4290 	INIT_LIST_HEAD(&summary_head->cpu_list);
4291 	list_add(&summary_head->cpu_list, &tmp_cleanup_list);
4292 
4293 	summary_head->cpu_data = zalloc(sizeof(*summary_head->cpu_data));
4294 	if (!summary_head->cpu_data) {
4295 		ret = -ENOMEM;
4296 		goto out_cleanup;
4297 	}
4298 	memcpy(summary_head->cpu_data, cptr->cpu_data, sizeof(*summary_head->cpu_data));
4299 
4300 	list_for_each_entry(dptr, &cptr->domain_head, domain_list) {
4301 		tdptr = zalloc(sizeof(*tdptr));
4302 		if (!tdptr) {
4303 			ret = -ENOMEM;
4304 			goto out_cleanup;
4305 		}
4306 		INIT_LIST_HEAD(&tdptr->domain_list);
4307 
4308 		tdptr->domain_data = zalloc(sizeof(*tdptr->domain_data));
4309 		if (!tdptr->domain_data) {
4310 			free(tdptr);
4311 			ret = -ENOMEM;
4312 			goto out_cleanup;
4313 		}
4314 
4315 		memcpy(tdptr->domain_data, dptr->domain_data, sizeof(*tdptr->domain_data));
4316 		list_add_tail(&tdptr->domain_list, &summary_head->domain_head);
4317 	}
4318 
4319 	list_for_each_entry(cptr, head, cpu_list) {
4320 		if (list_is_first(&cptr->cpu_list, head))
4321 			continue;
4322 
4323 		if (list_is_last(&cptr->cpu_list, head))
4324 			is_last = true;
4325 
4326 		cnt++;
4327 		summarize_schedstat_cpu(summary_head, cptr, cnt, is_last);
4328 		if (list_empty(&summary_head->domain_head))
4329 			continue;
4330 
4331 		tdptr = list_first_entry(&summary_head->domain_head, struct schedstat_domain,
4332 					 domain_list);
4333 
4334 		list_for_each_entry(dptr, &cptr->domain_head, domain_list) {
4335 			summarize_schedstat_domain(tdptr, dptr, cnt, is_last);
4336 			if (list_is_last(&tdptr->domain_list, &summary_head->domain_head)) {
4337 				tdptr = NULL;
4338 				break;
4339 			}
4340 			tdptr = list_next_entry(tdptr, domain_list);
4341 		}
4342 	}
4343 
4344 	list_del_init(&summary_head->cpu_list);
4345 	list_add(&summary_head->cpu_list, head);
4346 	return 0;
4347 
4348 out_cleanup:
4349 	free_schedstat(&tmp_cleanup_list);
4350 	return ret;
4351 }
4352 
4353 static int show_schedstat_data(struct list_head *head1, struct cpu_domain_map **cd_map1, int nr1,
4354 			       struct list_head *head2, struct cpu_domain_map **cd_map2, int nr2,
4355 			       bool summary_only)
4356 {
4357 	struct schedstat_cpu *cptr1 = list_first_entry(head1, struct schedstat_cpu, cpu_list);
4358 	struct perf_record_schedstat_domain *ds1 = NULL, *ds2 = NULL;
4359 	struct schedstat_domain *dptr1 = NULL, *dptr2 = NULL;
4360 	struct schedstat_cpu *cptr2 = NULL;
4361 	__u64 jiffies1 = 0, jiffies2 = 0;
4362 	bool is_summary = true;
4363 	int ret = 0;
4364 
4365 	if (!cd_map1) {
4366 		pr_err("Error: CPU domain map 1 is missing.\n");
4367 		return -1;
4368 	}
4369 	if (head2 && !cd_map2) {
4370 		pr_err("Error: CPU domain map 2 is missing.\n");
4371 		return -1;
4372 	}
4373 
4374 	printf("Description\n");
4375 	print_separator2(SEP_LEN, "", 0);
4376 	printf("%-30s-> %s\n", "DESC", "Description of the field");
4377 	printf("%-30s-> %s\n", "COUNT", "Value of the field");
4378 	printf("%-30s-> %s\n", "PCT_CHANGE", "Percent change with corresponding base value");
4379 	printf("%-30s-> %s\n", "AVG_JIFFIES",
4380 	       "Avg time in jiffies between two consecutive occurrence of event");
4381 
4382 	print_separator2(SEP_LEN, "", 0);
4383 	printf("\n");
4384 
4385 	printf("%-65s: ", "Time elapsed (in jiffies)");
4386 	jiffies1 = cptr1->cpu_data->timestamp;
4387 	printf("%11llu", jiffies1);
4388 	if (head2) {
4389 		cptr2 = list_first_entry(head2, struct schedstat_cpu, cpu_list);
4390 		jiffies2 = cptr2->cpu_data->timestamp;
4391 		printf(",%11llu", jiffies2);
4392 	}
4393 	printf("\n");
4394 
4395 	ret = get_all_cpu_stats(head1);
4396 	if (ret)
4397 		return ret;
4398 	if (cptr2) {
4399 		ret = get_all_cpu_stats(head2);
4400 		if (ret)
4401 			return ret;
4402 		cptr2 = list_first_entry(head2, struct schedstat_cpu, cpu_list);
4403 	}
4404 
4405 	list_for_each_entry(cptr1, head1, cpu_list) {
4406 		struct cpu_domain_map *cd_info1 = NULL, *cd_info2 = NULL;
4407 		struct perf_record_schedstat_cpu *cs1 = cptr1->cpu_data;
4408 		struct perf_record_schedstat_cpu *cs2 = NULL;
4409 
4410 		dptr2 = NULL;
4411 		if (cs1->cpu >= (u32)nr1) {
4412 			pr_err("Error: CPU %d exceeds domain map size %d\n", cs1->cpu, nr1);
4413 			return -1;
4414 		}
4415 		cd_info1 = cd_map1[cs1->cpu];
4416 		if (!cd_info1) {
4417 			pr_err("Error: CPU %d domain info is missing in map 1.\n",
4418 			       cs1->cpu);
4419 			return -1;
4420 		}
4421 		if (cptr2) {
4422 			cs2 = cptr2->cpu_data;
4423 			if (cs2->cpu >= (u32)nr2) {
4424 				pr_err("Error: CPU %d exceeds domain map size %d\n", cs2->cpu, nr2);
4425 				return -1;
4426 			}
4427 			cd_info2 = cd_map2[cs2->cpu];
4428 			if (!cd_info2) {
4429 				pr_err("Error: CPU %d domain info is missing in map 2.\n",
4430 				       cs2->cpu);
4431 				return -1;
4432 			}
4433 			if (!list_empty(&cptr2->domain_head))
4434 				dptr2 = list_first_entry(&cptr2->domain_head,
4435 							 struct schedstat_domain,
4436 							 domain_list);
4437 		}
4438 
4439 		if (cs2 && cs1->cpu != cs2->cpu) {
4440 			pr_err("Failed because matching cpus not found for diff\n");
4441 			return -1;
4442 		}
4443 
4444 		if (cd_info2 && cd_info1->nr_domains != cd_info2->nr_domains) {
4445 			pr_err("Failed because nr_domains is not same for cpus\n");
4446 			return -1;
4447 		}
4448 
4449 		print_separator2(SEP_LEN, "", 0);
4450 
4451 		if (is_summary)
4452 			printf("CPU: <ALL CPUS SUMMARY>\n");
4453 		else
4454 			printf("CPU: %d\n", cs1->cpu);
4455 
4456 		print_separator2(SEP_LEN, "", 0);
4457 		print_cpu_stats(cs1, cs2);
4458 		print_separator2(SEP_LEN, "", 0);
4459 
4460 		list_for_each_entry(dptr1, &cptr1->domain_head, domain_list) {
4461 			struct domain_info *dinfo1 = NULL, *dinfo2 = NULL;
4462 
4463 			ds1 = dptr1->domain_data;
4464 			ds2 = NULL;
4465 			if (ds1->domain >= cd_info1->nr_domains) {
4466 				pr_err("Error: Domain %d exceeds max domains %d for CPU %d in map 1.\n",
4467 				       ds1->domain, cd_info1->nr_domains, cs1->cpu);
4468 				return -1;
4469 			}
4470 			dinfo1 = cd_info1->domains[ds1->domain];
4471 			if (!dinfo1) {
4472 				pr_err("Error: Domain %d info is missing for CPU %d in map 1.\n",
4473 				       ds1->domain, cs1->cpu);
4474 				return -1;
4475 			}
4476 			if (dptr2) {
4477 				ds2 = dptr2->domain_data;
4478 				if (ds2->domain >= cd_info2->nr_domains) {
4479 					pr_err("Error: Domain %d exceeds max domains %d for CPU %d in map 2.\n",
4480 					       ds2->domain, cd_info2->nr_domains, cs2->cpu);
4481 					return -1;
4482 				}
4483 				dinfo2 = cd_info2->domains[ds2->domain];
4484 				if (!dinfo2) {
4485 					pr_err("Error: Domain %d info is missing for CPU %d in map 2.\n",
4486 					       ds2->domain, cs2->cpu);
4487 					return -1;
4488 				}
4489 			}
4490 
4491 			if (dinfo2 && dinfo1->domain != dinfo2->domain) {
4492 				pr_err("Failed because matching domain not found for diff\n");
4493 				return -1;
4494 			}
4495 
4496 			if (is_summary) {
4497 				if (dinfo1->dname)
4498 					printf("CPU: <ALL CPUS SUMMARY> | DOMAIN: %s\n",
4499 					       dinfo1->dname);
4500 				else
4501 					printf("CPU: <ALL CPUS SUMMARY> | DOMAIN: %d\n",
4502 					       dinfo1->domain);
4503 			} else {
4504 				if (dinfo1->dname)
4505 					printf("CPU: %d | DOMAIN: %s | DOMAIN_CPUS: ",
4506 					       cs1->cpu, dinfo1->dname);
4507 				else
4508 					printf("CPU: %d | DOMAIN: %d | DOMAIN_CPUS: ",
4509 					       cs1->cpu, dinfo1->domain);
4510 
4511 				printf("%s\n", dinfo1->cpulist);
4512 			}
4513 			print_separator2(SEP_LEN, "", 0);
4514 			print_domain_stats(ds1, ds2, jiffies1, jiffies2);
4515 			print_separator2(SEP_LEN, "", 0);
4516 
4517 			if (dptr2) {
4518 				if (list_is_last(&dptr2->domain_list, &cptr2->domain_head))
4519 					dptr2 = NULL;
4520 				else
4521 					dptr2 = list_next_entry(dptr2, domain_list);
4522 			}
4523 		}
4524 		if (summary_only)
4525 			break;
4526 
4527 		if (cptr2) {
4528 			if (list_is_last(&cptr2->cpu_list, head2))
4529 				cptr2 = NULL;
4530 			else
4531 				cptr2 = list_next_entry(cptr2, cpu_list);
4532 		}
4533 
4534 		is_summary = false;
4535 	}
4536 	return ret;
4537 }
4538 
4539 /*
4540  * Creates a linked list of cpu_data and domain_data. Below represents the structure of the linked
4541  * list where CPU0,CPU1,CPU2, ..., CPU(N-1) stores the cpu_data. Here N is the total number of cpus.
4542  * Each of the CPU points to the list of domain_data. Here DOMAIN0, DOMAIN1, DOMAIN2, ... represents
4543  * the domain_data. Here D0, D1, D2, ..., Dm are the number of domains in the respective cpus.
4544  *
4545  *	+----------+
4546  *	| CPU_HEAD |
4547  *	+----------+
4548  *	      |
4549  *	      v
4550  *	+----------+    +---------+    +---------+    +---------+	    +--------------+
4551  *	|   CPU0   | -> | DOMAIN0 | -> | DOMAIN1 | -> | DOMAIN2 | -> ... -> | DOMAIN(D0-1) |
4552  *	+----------+    +---------+    +---------+    +---------+           +--------------+
4553  *	      |
4554  *	      v
4555  *	+----------+    +---------+    +---------+    +---------+           +--------------+
4556  *	|   CPU1   | -> | DOMAIN0 | -> | DOMAIN1 | -> | DOMAIN2 | -> ... -> | DOMAIN(D1-1) |
4557  *	+----------+    +---------+    +---------+    +---------+           +--------------+
4558  *	      |
4559  *	      v
4560  *	+----------+    +---------+    +---------+    +---------+           +--------------+
4561  *	|   CPU2   | -> | DOMAIN0 | -> | DOMAIN1 | -> | DOMAIN2 | -> ... -> | DOMAIN(D2-1) |
4562  *	+----------+    +---------+    +---------+    +---------+           +--------------+
4563  *	      |
4564  *	      v
4565  *	     ...
4566  *	      |
4567  *	      v
4568  *	+----------+    +---------+    +---------+    +---------+           +--------------+
4569  *	| CPU(N-1) | -> | DOMAIN0 | -> | DOMAIN1 | -> | DOMAIN2 | -> ... -> | DOMAIN(Dm-1) |
4570  *	+----------+    +---------+    +---------+    +---------+           +--------------+
4571  *
4572  * Each cpu as well as domain has 2 enties in the event list one before the workload starts and
4573  * other after completion of the workload. The above linked list stores the diff of the cpu and
4574  * domain statistics.
4575  */
4576 static int perf_sched__process_schedstat(const struct perf_tool *tool __maybe_unused,
4577 					 struct perf_session *session __maybe_unused,
4578 					 union perf_event *event)
4579 {
4580 	struct perf_cpu this_cpu;
4581 	static __u32 initial_cpu;
4582 
4583 	switch (event->header.type) {
4584 	case PERF_RECORD_SCHEDSTAT_CPU:
4585 		this_cpu.cpu = event->schedstat_cpu.cpu;
4586 		break;
4587 	case PERF_RECORD_SCHEDSTAT_DOMAIN:
4588 		this_cpu.cpu = event->schedstat_domain.cpu;
4589 		break;
4590 	default:
4591 		return 0;
4592 	}
4593 
4594 	if (user_requested_cpus && !perf_cpu_map__has(user_requested_cpus, this_cpu))
4595 		return 0;
4596 
4597 	if (event->header.type == PERF_RECORD_SCHEDSTAT_CPU) {
4598 		struct schedstat_cpu *temp = zalloc(sizeof(*temp));
4599 
4600 		if (!temp)
4601 			return -ENOMEM;
4602 
4603 		temp->cpu_data = zalloc(sizeof(*temp->cpu_data));
4604 		if (!temp->cpu_data)
4605 			return -ENOMEM;
4606 
4607 		memcpy(temp->cpu_data, &event->schedstat_cpu, sizeof(*temp->cpu_data));
4608 
4609 		if (!list_empty(&cpu_head) && temp->cpu_data->cpu == initial_cpu)
4610 			after_workload_flag = true;
4611 
4612 		if (!after_workload_flag) {
4613 			if (list_empty(&cpu_head))
4614 				initial_cpu = temp->cpu_data->cpu;
4615 
4616 			list_add_tail(&temp->cpu_list, &cpu_head);
4617 			INIT_LIST_HEAD(&temp->domain_head);
4618 		} else {
4619 			if (temp->cpu_data->cpu == initial_cpu) {
4620 				cpu_second_pass = list_first_entry(&cpu_head, struct schedstat_cpu,
4621 								   cpu_list);
4622 				cpu_second_pass->cpu_data->timestamp =
4623 					temp->cpu_data->timestamp - cpu_second_pass->cpu_data->timestamp;
4624 			} else {
4625 				cpu_second_pass = list_next_entry(cpu_second_pass, cpu_list);
4626 			}
4627 			domain_second_pass = list_first_entry(&cpu_second_pass->domain_head,
4628 							      struct schedstat_domain, domain_list);
4629 			store_schedstat_cpu_diff(temp);
4630 		}
4631 	} else if (event->header.type == PERF_RECORD_SCHEDSTAT_DOMAIN) {
4632 		struct schedstat_cpu *cpu_tail;
4633 		struct schedstat_domain *temp = zalloc(sizeof(*temp));
4634 
4635 		if (!temp)
4636 			return -ENOMEM;
4637 
4638 		temp->domain_data = zalloc(sizeof(*temp->domain_data));
4639 		if (!temp->domain_data)
4640 			return -ENOMEM;
4641 
4642 		memcpy(temp->domain_data, &event->schedstat_domain, sizeof(*temp->domain_data));
4643 
4644 		if (!after_workload_flag) {
4645 			cpu_tail = list_last_entry(&cpu_head, struct schedstat_cpu, cpu_list);
4646 			list_add_tail(&temp->domain_list, &cpu_tail->domain_head);
4647 		} else {
4648 			store_schedstat_domain_diff(temp);
4649 			domain_second_pass = list_next_entry(domain_second_pass, domain_list);
4650 		}
4651 	}
4652 
4653 	return 0;
4654 }
4655 
4656 static void free_schedstat(struct list_head *head)
4657 {
4658 	struct schedstat_domain *dptr, *n1;
4659 	struct schedstat_cpu *cptr, *n2;
4660 
4661 	list_for_each_entry_safe(cptr, n2, head, cpu_list) {
4662 		list_for_each_entry_safe(dptr, n1, &cptr->domain_head, domain_list) {
4663 			list_del_init(&dptr->domain_list);
4664 			free(dptr->domain_data);
4665 			free(dptr);
4666 		}
4667 		list_del_init(&cptr->cpu_list);
4668 		free(cptr->cpu_data);
4669 		free(cptr);
4670 	}
4671 }
4672 
4673 static int perf_sched__schedstat_report(struct perf_sched *sched)
4674 {
4675 	struct cpu_domain_map **cd_map;
4676 	struct perf_session *session;
4677 	struct target target = {};
4678 	struct perf_data data = {
4679 		.path  = input_name,
4680 		.mode  = PERF_DATA_MODE_READ,
4681 	};
4682 	int err = 0;
4683 
4684 	sched->tool.schedstat_cpu = perf_sched__process_schedstat;
4685 	sched->tool.schedstat_domain = perf_sched__process_schedstat;
4686 
4687 	session = perf_session__new(&data, &sched->tool);
4688 	if (IS_ERR(session)) {
4689 		pr_err("Perf session creation failed.\n");
4690 		return PTR_ERR(session);
4691 	}
4692 
4693 	if (cpu_list)
4694 		target.cpu_list = cpu_list;
4695 	else
4696 		target.system_wide = true;
4697 
4698 	err = evlist__create_maps(session->evlist, &target);
4699 	if (err < 0)
4700 		goto out;
4701 
4702 	user_requested_cpus = session->evlist->core.user_requested_cpus;
4703 
4704 	err = perf_session__process_events(session);
4705 
4706 	if (!err) {
4707 		setup_pager();
4708 
4709 		if (list_empty(&cpu_head)) {
4710 			pr_err("Data is not available\n");
4711 			err = -1;
4712 			goto out;
4713 		}
4714 
4715 		cd_map = session->header.env.cpu_domain;
4716 		err = show_schedstat_data(&cpu_head, cd_map,
4717 					  session->header.env.nr_cpus_avail,
4718 					  NULL, NULL, 0, false);
4719 	}
4720 
4721 out:
4722 	free_schedstat(&cpu_head);
4723 	perf_session__delete(session);
4724 	return err;
4725 }
4726 
4727 static int perf_sched__schedstat_diff(struct perf_sched *sched,
4728 				      int argc, const char **argv)
4729 {
4730 	struct cpu_domain_map **cd_map0 = NULL, **cd_map1 = NULL;
4731 	struct list_head cpu_head_ses0, cpu_head_ses1;
4732 	struct perf_session *session[2];
4733 	struct perf_data data[2] = {0};
4734 	int ret = 0, err = 0;
4735 	static const char *defaults[] = {
4736 		"perf.data.old",
4737 		"perf.data",
4738 	};
4739 
4740 	if (argc) {
4741 		if (argc == 1)
4742 			defaults[1] = argv[0];
4743 		else if (argc == 2) {
4744 			defaults[0] = argv[0];
4745 			defaults[1] = argv[1];
4746 		} else {
4747 			pr_err("perf sched stats diff is not supported with more than 2 files.\n");
4748 			goto out_ret;
4749 		}
4750 	}
4751 
4752 	INIT_LIST_HEAD(&cpu_head_ses0);
4753 	INIT_LIST_HEAD(&cpu_head_ses1);
4754 
4755 	sched->tool.schedstat_cpu = perf_sched__process_schedstat;
4756 	sched->tool.schedstat_domain = perf_sched__process_schedstat;
4757 
4758 	data[0].path = defaults[0];
4759 	data[0].mode  = PERF_DATA_MODE_READ;
4760 	session[0] = perf_session__new(&data[0], &sched->tool);
4761 	if (IS_ERR(session[0])) {
4762 		ret = PTR_ERR(session[0]);
4763 		pr_err("Failed to open %s\n", data[0].path);
4764 		goto out_delete_ses0;
4765 	}
4766 
4767 	err = perf_session__process_events(session[0]);
4768 	if (err) {
4769 		free_schedstat(&cpu_head);
4770 		goto out_delete_ses0;
4771 	}
4772 
4773 	cd_map0 = session[0]->header.env.cpu_domain;
4774 	list_replace_init(&cpu_head, &cpu_head_ses0);
4775 	after_workload_flag = false;
4776 
4777 	data[1].path = defaults[1];
4778 	data[1].mode  = PERF_DATA_MODE_READ;
4779 	session[1] = perf_session__new(&data[1], &sched->tool);
4780 	if (IS_ERR(session[1])) {
4781 		ret = PTR_ERR(session[1]);
4782 		pr_err("Failed to open %s\n", data[1].path);
4783 		goto out_delete_ses1;
4784 	}
4785 
4786 	err = perf_session__process_events(session[1]);
4787 	if (err) {
4788 		free_schedstat(&cpu_head);
4789 		goto out_delete_ses1;
4790 	}
4791 
4792 	cd_map1 = session[1]->header.env.cpu_domain;
4793 	list_replace_init(&cpu_head, &cpu_head_ses1);
4794 	after_workload_flag = false;
4795 	setup_pager();
4796 
4797 	if (list_empty(&cpu_head_ses1)) {
4798 		pr_err("Data is not available\n");
4799 		ret = -1;
4800 		goto out_delete_ses1;
4801 	}
4802 
4803 	if (list_empty(&cpu_head_ses0)) {
4804 		pr_err("Data is not available\n");
4805 		ret = -1;
4806 		goto out_delete_ses1;
4807 	}
4808 
4809 	ret = show_schedstat_data(&cpu_head_ses0, cd_map0, session[0]->header.env.nr_cpus_avail,
4810 				  &cpu_head_ses1, cd_map1, session[1]->header.env.nr_cpus_avail, true);
4811 	if (ret)
4812 		goto out_delete_ses1;
4813 
4814 out_delete_ses1:
4815 	free_schedstat(&cpu_head_ses1);
4816 	if (!IS_ERR(session[1]))
4817 		perf_session__delete(session[1]);
4818 
4819 out_delete_ses0:
4820 	free_schedstat(&cpu_head_ses0);
4821 	if (!IS_ERR(session[0]))
4822 		perf_session__delete(session[0]);
4823 
4824 out_ret:
4825 	return ret;
4826 }
4827 
4828 static int process_synthesized_event_live(const struct perf_tool *tool __maybe_unused,
4829 					  union perf_event *event,
4830 					  struct perf_sample *sample __maybe_unused,
4831 					  struct machine *machine __maybe_unused)
4832 {
4833 	return perf_sched__process_schedstat(tool, NULL, event);
4834 }
4835 
4836 static int perf_sched__schedstat_live(struct perf_sched *sched,
4837 				      int argc, const char **argv)
4838 {
4839 	struct cpu_domain_map **cd_map = NULL;
4840 	struct target target = {};
4841 	u32 __maybe_unused md;
4842 	struct evlist *evlist;
4843 	u32 nr = 0, sv;
4844 	int reset = 0;
4845 	int err = 0;
4846 
4847 	done = 0;
4848 	signal(SIGINT, sighandler);
4849 	signal(SIGCHLD, sighandler);
4850 	signal(SIGTERM, sighandler);
4851 
4852 	evlist = evlist__new();
4853 	if (!evlist)
4854 		return -ENOMEM;
4855 
4856 	/*
4857 	 * `perf sched schedstat` does not support workload profiling (-p pid)
4858 	 * since /proc/schedstat file contains cpu specific data only. Hence, a
4859 	 * profile target is either set of cpus or systemwide, never a process.
4860 	 * Note that, although `-- <workload>` is supported, profile data are
4861 	 * still cpu/systemwide.
4862 	 */
4863 	if (cpu_list)
4864 		target.cpu_list = cpu_list;
4865 	else
4866 		target.system_wide = true;
4867 
4868 	if (argc) {
4869 		err = evlist__prepare_workload(evlist, &target, argv, false, NULL);
4870 		if (err)
4871 			goto out;
4872 	}
4873 
4874 	err = evlist__create_maps(evlist, &target);
4875 	if (err < 0)
4876 		goto out;
4877 
4878 	user_requested_cpus = evlist->core.user_requested_cpus;
4879 
4880 	err = perf_event__synthesize_schedstat(&(sched->tool),
4881 					       process_synthesized_event_live,
4882 					       user_requested_cpus);
4883 	if (err < 0)
4884 		goto out;
4885 
4886 	err = enable_sched_schedstats(&reset);
4887 	if (err < 0)
4888 		goto out;
4889 
4890 	if (argc)
4891 		evlist__start_workload(evlist);
4892 
4893 	while (!done) {
4894 		if (argc && waitpid(evlist->workload.pid, NULL, WNOHANG) > 0)
4895 			break;
4896 		sleep(1);
4897 	}
4898 
4899 	if (reset) {
4900 		err = disable_sched_schedstat();
4901 		if (err < 0)
4902 			goto out;
4903 	}
4904 
4905 	err = perf_event__synthesize_schedstat(&(sched->tool),
4906 					       process_synthesized_event_live,
4907 					       user_requested_cpus);
4908 	if (err)
4909 		goto out;
4910 
4911 	setup_pager();
4912 
4913 	if (list_empty(&cpu_head)) {
4914 		pr_err("Data is not available\n");
4915 		err = -1;
4916 		goto out;
4917 	}
4918 
4919 	nr = cpu__max_present_cpu().cpu;
4920 	cd_map = build_cpu_domain_map(&sv, &md, nr);
4921 	if (!cd_map) {
4922 		pr_err("Unable to generate cpu-domain relation info");
4923 		goto out;
4924 	}
4925 
4926 	err = show_schedstat_data(&cpu_head, cd_map, nr, NULL, NULL, 0, false);
4927 	free_cpu_domain_info(cd_map, sv, nr);
4928 out:
4929 	free_schedstat(&cpu_head);
4930 	evlist__delete(evlist);
4931 	return err;
4932 }
4933 
4934 static bool schedstat_events_exposed(void)
4935 {
4936 	/*
4937 	 * Select "sched:sched_stat_wait" event to check
4938 	 * whether schedstat tracepoints are exposed.
4939 	 */
4940 	return IS_ERR(trace_event__tp_format("sched", "sched_stat_wait")) ?
4941 		false : true;
4942 }
4943 
4944 static int __cmd_record(int argc, const char **argv)
4945 {
4946 	unsigned int rec_argc, i, j;
4947 	char **rec_argv;
4948 	const char **rec_argv_copy;
4949 	const char * const record_args[] = {
4950 		"record",
4951 		"-a",
4952 		"-R",
4953 		"-m", "1024",
4954 		"-c", "1",
4955 		"-e", "sched:sched_switch",
4956 		"-e", "sched:sched_stat_runtime",
4957 		"-e", "sched:sched_process_fork",
4958 		"-e", "sched:sched_wakeup_new",
4959 		"-e", "sched:sched_migrate_task",
4960 	};
4961 
4962 	/*
4963 	 * The tracepoints trace_sched_stat_{wait, sleep, iowait}
4964 	 * are not exposed to user if CONFIG_SCHEDSTATS is not set,
4965 	 * to prevent "perf sched record" execution failure, determine
4966 	 * whether to record schedstat events according to actual situation.
4967 	 */
4968 	const char * const schedstat_args[] = {
4969 		"-e", "sched:sched_stat_wait",
4970 		"-e", "sched:sched_stat_sleep",
4971 		"-e", "sched:sched_stat_iowait",
4972 	};
4973 	unsigned int schedstat_argc = schedstat_events_exposed() ?
4974 		ARRAY_SIZE(schedstat_args) : 0;
4975 
4976 	struct tep_event *waking_event;
4977 	int ret;
4978 
4979 	/*
4980 	 * +2 for either "-e", "sched:sched_wakeup" or
4981 	 * "-e", "sched:sched_waking"
4982 	 */
4983 	rec_argc = ARRAY_SIZE(record_args) + 2 + schedstat_argc + argc - 1;
4984 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
4985 	if (rec_argv == NULL)
4986 		return -ENOMEM;
4987 	rec_argv_copy = calloc(rec_argc + 1, sizeof(char *));
4988 	if (rec_argv_copy == NULL) {
4989 		free(rec_argv);
4990 		return -ENOMEM;
4991 	}
4992 
4993 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
4994 		rec_argv[i] = strdup(record_args[i]);
4995 
4996 	rec_argv[i++] = strdup("-e");
4997 	waking_event = trace_event__tp_format("sched", "sched_waking");
4998 	if (!IS_ERR(waking_event))
4999 		rec_argv[i++] = strdup("sched:sched_waking");
5000 	else
5001 		rec_argv[i++] = strdup("sched:sched_wakeup");
5002 
5003 	for (j = 0; j < schedstat_argc; j++)
5004 		rec_argv[i++] = strdup(schedstat_args[j]);
5005 
5006 	for (j = 1; j < (unsigned int)argc; j++, i++)
5007 		rec_argv[i] = strdup(argv[j]);
5008 
5009 	BUG_ON(i != rec_argc);
5010 
5011 	memcpy(rec_argv_copy, rec_argv, sizeof(char *) * rec_argc);
5012 	ret = cmd_record(rec_argc, rec_argv_copy);
5013 
5014 	for (i = 0; i < rec_argc; i++)
5015 		free(rec_argv[i]);
5016 	free(rec_argv);
5017 	free(rec_argv_copy);
5018 
5019 	return ret;
5020 }
5021 
5022 int cmd_sched(int argc, const char **argv)
5023 {
5024 	static const char default_sort_order[] = "avg, max, switch, runtime";
5025 	struct perf_sched sched = {
5026 		.cmp_pid	      = LIST_HEAD_INIT(sched.cmp_pid),
5027 		.sort_list	      = LIST_HEAD_INIT(sched.sort_list),
5028 		.sort_order	      = default_sort_order,
5029 		.replay_repeat	      = 10,
5030 		.profile_cpu	      = -1,
5031 		.next_shortname1      = 'A',
5032 		.next_shortname2      = '0',
5033 		.skip_merge           = 0,
5034 		.show_callchain	      = 1,
5035 		.max_stack            = 5,
5036 	};
5037 	const struct option sched_options[] = {
5038 	OPT_STRING('i', "input", &input_name, "file",
5039 		    "input file name"),
5040 	OPT_INCR('v', "verbose", &verbose,
5041 		    "be more verbose (show symbol address, etc)"),
5042 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
5043 		    "dump raw trace in ASCII"),
5044 	OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"),
5045 	OPT_END()
5046 	};
5047 	const struct option latency_options[] = {
5048 	OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
5049 		   "sort by key(s): runtime, switch, avg, max"),
5050 	OPT_INTEGER('C', "CPU", &sched.profile_cpu,
5051 		    "CPU to profile on"),
5052 	OPT_BOOLEAN('p', "pids", &sched.skip_merge,
5053 		    "latency stats per pid instead of per comm"),
5054 	OPT_PARENT(sched_options)
5055 	};
5056 	const struct option replay_options[] = {
5057 	OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
5058 		     "repeat the workload replay N times (0: infinite)"),
5059 	OPT_PARENT(sched_options)
5060 	};
5061 	const struct option map_options[] = {
5062 	OPT_BOOLEAN(0, "compact", &sched.map.comp,
5063 		    "map output in compact mode"),
5064 	OPT_STRING(0, "color-pids", &sched.map.color_pids_str, "pids",
5065 		   "highlight given pids in map"),
5066 	OPT_STRING(0, "color-cpus", &sched.map.color_cpus_str, "cpus",
5067                     "highlight given CPUs in map"),
5068 	OPT_STRING(0, "cpus", &sched.map.cpus_str, "cpus",
5069                     "display given CPUs in map"),
5070 	OPT_STRING(0, "task-name", &sched.map.task_name, "task",
5071 		"map output only for the given task name(s)."),
5072 	OPT_BOOLEAN(0, "fuzzy-name", &sched.map.fuzzy,
5073 		"given command name can be partially matched (fuzzy matching)"),
5074 	OPT_PARENT(sched_options)
5075 	};
5076 	const struct option timehist_options[] = {
5077 	OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
5078 		   "file", "vmlinux pathname"),
5079 	OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name,
5080 		   "file", "kallsyms pathname"),
5081 	OPT_BOOLEAN('g', "call-graph", &sched.show_callchain,
5082 		    "Display call chains if present (default on)"),
5083 	OPT_UINTEGER(0, "max-stack", &sched.max_stack,
5084 		   "Maximum number of functions to display backtrace."),
5085 	OPT_CALLBACK(0, "symfs", NULL, "directory[,layout]", SYMFS_HELP,
5086 		     symbol__config_symfs),
5087 	OPT_BOOLEAN('s', "summary", &sched.summary_only,
5088 		    "Show only syscall summary with statistics"),
5089 	OPT_BOOLEAN('S', "with-summary", &sched.summary,
5090 		    "Show all syscalls and summary with statistics"),
5091 	OPT_BOOLEAN('w', "wakeups", &sched.show_wakeups, "Show wakeup events"),
5092 	OPT_BOOLEAN('n', "next", &sched.show_next, "Show next task"),
5093 	OPT_BOOLEAN('M', "migrations", &sched.show_migrations, "Show migration events"),
5094 	OPT_BOOLEAN('V', "cpu-visual", &sched.show_cpu_visual, "Add CPU visual"),
5095 	OPT_BOOLEAN('I', "idle-hist", &sched.idle_hist, "Show idle events only"),
5096 	OPT_STRING(0, "time", &sched.time_str, "str",
5097 		   "Time span for analysis (start,stop)"),
5098 	OPT_BOOLEAN(0, "state", &sched.show_state, "Show task state when sched-out"),
5099 	OPT_STRING('p', "pid", &symbol_conf.pid_list_str, "pid[,pid...]",
5100 		   "analyze events only for given process id(s)"),
5101 	OPT_STRING('t', "tid", &symbol_conf.tid_list_str, "tid[,tid...]",
5102 		   "analyze events only for given thread id(s)"),
5103 	OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"),
5104 	OPT_BOOLEAN(0, "show-prio", &sched.show_prio, "Show task priority"),
5105 	OPT_STRING(0, "prio", &sched.prio_str, "prio",
5106 		   "analyze events only for given task priority(ies)"),
5107 	OPT_BOOLEAN('P', "pre-migrations", &sched.pre_migrations, "Show pre-migration wait time"),
5108 	OPT_PARENT(sched_options)
5109 	};
5110 	const struct option stats_options[] = {
5111 	OPT_STRING('i', "input", &input_name, "file",
5112 		   "`stats report` with input filename"),
5113 	OPT_STRING('o', "output", &output_name, "file",
5114 		   "`stats record` with output filename"),
5115 	OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"),
5116 	OPT_BOOLEAN('v', "verbose", &verbose_field, "Show explanation for fields in the report"),
5117 	OPT_END()
5118 	};
5119 
5120 	const char * const latency_usage[] = {
5121 		"perf sched latency [<options>]",
5122 		NULL
5123 	};
5124 	const char * const replay_usage[] = {
5125 		"perf sched replay [<options>]",
5126 		NULL
5127 	};
5128 	const char * const map_usage[] = {
5129 		"perf sched map [<options>]",
5130 		NULL
5131 	};
5132 	const char * const timehist_usage[] = {
5133 		"perf sched timehist [<options>]",
5134 		NULL
5135 	};
5136 	const char *stats_usage[] = {
5137 		"perf sched stats {record|report} [<options>]",
5138 		NULL
5139 	};
5140 	const char *const sched_subcommands[] = { "record", "latency", "map",
5141 						  "replay", "script",
5142 						  "timehist", "stats", NULL };
5143 	const char *sched_usage[] = {
5144 		NULL,
5145 		NULL
5146 	};
5147 	struct trace_sched_handler lat_ops  = {
5148 		.wakeup_event	    = latency_wakeup_event,
5149 		.switch_event	    = latency_switch_event,
5150 		.runtime_event	    = latency_runtime_event,
5151 		.migrate_task_event = latency_migrate_task_event,
5152 	};
5153 	struct trace_sched_handler map_ops  = {
5154 		.switch_event	    = map_switch_event,
5155 	};
5156 	struct trace_sched_handler replay_ops  = {
5157 		.wakeup_event	    = replay_wakeup_event,
5158 		.switch_event	    = replay_switch_event,
5159 		.fork_event	    = replay_fork_event,
5160 	};
5161 	struct trace_sched_handler stats_ops  = {};
5162 	int ret;
5163 
5164 	perf_tool__init(&sched.tool, /*ordered_events=*/true);
5165 	sched.tool.sample	 = perf_sched__process_tracepoint_sample;
5166 	sched.tool.comm		 = perf_sched__process_comm;
5167 	sched.tool.namespaces	 = perf_event__process_namespaces;
5168 	sched.tool.lost		 = perf_event__process_lost;
5169 	sched.tool.fork		 = perf_sched__process_fork_event;
5170 
5171 	argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands,
5172 					sched_usage, PARSE_OPT_STOP_AT_NON_OPTION);
5173 	if (!argc)
5174 		usage_with_options(sched_usage, sched_options);
5175 
5176 	thread__set_priv_destructor(free);
5177 
5178 	/*
5179 	 * Aliased to 'perf script' for now:
5180 	 */
5181 	if (!strcmp(argv[0], "script")) {
5182 		ret = cmd_script(argc, argv);
5183 	} else if (strlen(argv[0]) > 2 && strstarts("record", argv[0])) {
5184 		ret = __cmd_record(argc, argv);
5185 	} else if (strlen(argv[0]) > 2 && strstarts("latency", argv[0])) {
5186 		sched.tp_handler = &lat_ops;
5187 		if (argc > 1) {
5188 			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
5189 			if (argc)
5190 				usage_with_options(latency_usage, latency_options);
5191 		}
5192 		setup_sorting(&sched, latency_options, latency_usage);
5193 		ret = perf_sched__lat(&sched);
5194 	} else if (!strcmp(argv[0], "map")) {
5195 		if (argc) {
5196 			argc = parse_options(argc, argv, map_options, map_usage, 0);
5197 			if (argc)
5198 				usage_with_options(map_usage, map_options);
5199 
5200 			if (sched.map.task_name) {
5201 				sched.map.task_names = strlist__new(sched.map.task_name, NULL);
5202 				if (sched.map.task_names == NULL) {
5203 					fprintf(stderr, "Failed to parse task names\n");
5204 					ret = -1;
5205 					goto out;
5206 				}
5207 			}
5208 		}
5209 		sched.tp_handler = &map_ops;
5210 		setup_sorting(&sched, latency_options, latency_usage);
5211 		ret = perf_sched__map(&sched);
5212 	} else if (strlen(argv[0]) > 2 && strstarts("replay", argv[0])) {
5213 		sched.tp_handler = &replay_ops;
5214 		if (argc) {
5215 			argc = parse_options(argc, argv, replay_options, replay_usage, 0);
5216 			if (argc)
5217 				usage_with_options(replay_usage, replay_options);
5218 		}
5219 		ret = perf_sched__replay(&sched);
5220 	} else if (!strcmp(argv[0], "timehist")) {
5221 		if (argc) {
5222 			argc = parse_options(argc, argv, timehist_options,
5223 					     timehist_usage, 0);
5224 			if (argc)
5225 				usage_with_options(timehist_usage, timehist_options);
5226 		}
5227 		if ((sched.show_wakeups || sched.show_next) &&
5228 		    sched.summary_only) {
5229 			pr_err(" Error: -s and -[n|w] are mutually exclusive.\n");
5230 			parse_options_usage(timehist_usage, timehist_options, "s", true);
5231 			if (sched.show_wakeups)
5232 				parse_options_usage(NULL, timehist_options, "w", true);
5233 			if (sched.show_next)
5234 				parse_options_usage(NULL, timehist_options, "n", true);
5235 			ret = -EINVAL;
5236 			goto out;
5237 		}
5238 		ret = symbol__validate_sym_arguments();
5239 		if (!ret)
5240 			ret = perf_sched__timehist(&sched);
5241 	} else if (!strcmp(argv[0], "stats")) {
5242 		const char *const stats_subcommands[] = {"record", "report", NULL};
5243 
5244 		sched.tp_handler = &stats_ops;
5245 		argc = parse_options_subcommand(argc, argv, stats_options,
5246 						stats_subcommands,
5247 						stats_usage,
5248 						PARSE_OPT_STOP_AT_NON_OPTION);
5249 
5250 		if (argv[0] && !strcmp(argv[0], "record")) {
5251 			if (argc)
5252 				argc = parse_options(argc, argv, stats_options,
5253 						     stats_usage, 0);
5254 			return perf_sched__schedstat_record(&sched, argc, argv);
5255 		} else if (argv[0] && !strcmp(argv[0], "report")) {
5256 			if (argc)
5257 				argc = parse_options(argc, argv, stats_options,
5258 						     stats_usage, 0);
5259 			return perf_sched__schedstat_report(&sched);
5260 		} else if (argv[0] && !strcmp(argv[0], "diff")) {
5261 			if (argc)
5262 				argc = parse_options(argc, argv, stats_options,
5263 						     stats_usage, 0);
5264 			return perf_sched__schedstat_diff(&sched, argc, argv);
5265 		}
5266 		return perf_sched__schedstat_live(&sched, argc, argv);
5267 	} else {
5268 		usage_with_options(sched_usage, sched_options);
5269 	}
5270 
5271 out:
5272 	/* free usage string allocated by parse_options_subcommand */
5273 	free((void *)sched_usage[0]);
5274 
5275 	return ret;
5276 }
5277