xref: /linux/tools/perf/builtin-sched.c (revision 80c281fca252827facd05875b8d9d36d7aad0f8d)
1 // SPDX-License-Identifier: GPL-2.0
2 #include "builtin.h"
3 #include "perf-sys.h"
4 
5 #include "util/cpumap.h"
6 #include "util/evlist.h"
7 #include "util/evsel.h"
8 #include "util/evsel_fprintf.h"
9 #include "util/mutex.h"
10 #include "util/symbol.h"
11 #include "util/thread.h"
12 #include "util/header.h"
13 #include "util/session.h"
14 #include "util/tool.h"
15 #include "util/cloexec.h"
16 #include "util/thread_map.h"
17 #include "util/color.h"
18 #include "util/stat.h"
19 #include "util/string2.h"
20 #include "util/callchain.h"
21 #include "util/time-utils.h"
22 
23 #include <subcmd/pager.h>
24 #include <subcmd/parse-options.h>
25 #include "util/trace-event.h"
26 
27 #include "util/debug.h"
28 #include "util/event.h"
29 #include "util/util.h"
30 
31 #include <linux/kernel.h>
32 #include <linux/log2.h>
33 #include <linux/zalloc.h>
34 #include <sys/prctl.h>
35 #include <sys/resource.h>
36 #include <inttypes.h>
37 
38 #include <errno.h>
39 #include <semaphore.h>
40 #include <pthread.h>
41 #include <math.h>
42 #include <api/fs/fs.h>
43 #include <perf/cpumap.h>
44 #include <linux/time64.h>
45 #include <linux/err.h>
46 
47 #include <linux/ctype.h>
48 
49 #define PR_SET_NAME		15               /* Set process name */
50 #define MAX_CPUS		4096
51 #define COMM_LEN		20
52 #define SYM_LEN			129
53 #define MAX_PID			1024000
54 #define MAX_PRIO		140
55 
56 static const char *cpu_list;
57 static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
58 
59 struct sched_atom;
60 
61 struct task_desc {
62 	unsigned long		nr;
63 	unsigned long		pid;
64 	char			comm[COMM_LEN];
65 
66 	unsigned long		nr_events;
67 	unsigned long		curr_event;
68 	struct sched_atom	**atoms;
69 
70 	pthread_t		thread;
71 
72 	sem_t			ready_for_work;
73 	sem_t			work_done_sem;
74 
75 	u64			cpu_usage;
76 };
77 
78 enum sched_event_type {
79 	SCHED_EVENT_RUN,
80 	SCHED_EVENT_SLEEP,
81 	SCHED_EVENT_WAKEUP,
82 };
83 
84 struct sched_atom {
85 	enum sched_event_type	type;
86 	u64			timestamp;
87 	u64			duration;
88 	unsigned long		nr;
89 	sem_t			*wait_sem;
90 	struct task_desc	*wakee;
91 };
92 
93 enum thread_state {
94 	THREAD_SLEEPING = 0,
95 	THREAD_WAIT_CPU,
96 	THREAD_SCHED_IN,
97 	THREAD_IGNORE
98 };
99 
100 struct work_atom {
101 	struct list_head	list;
102 	enum thread_state	state;
103 	u64			sched_out_time;
104 	u64			wake_up_time;
105 	u64			sched_in_time;
106 	u64			runtime;
107 };
108 
109 struct work_atoms {
110 	struct list_head	work_list;
111 	struct thread		*thread;
112 	struct rb_node		node;
113 	u64			max_lat;
114 	u64			max_lat_start;
115 	u64			max_lat_end;
116 	u64			total_lat;
117 	u64			nb_atoms;
118 	u64			total_runtime;
119 	int			num_merged;
120 };
121 
122 typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
123 
124 struct perf_sched;
125 
126 struct trace_sched_handler {
127 	int (*switch_event)(struct perf_sched *sched, struct evsel *evsel,
128 			    struct perf_sample *sample, struct machine *machine);
129 
130 	int (*runtime_event)(struct perf_sched *sched, struct evsel *evsel,
131 			     struct perf_sample *sample, struct machine *machine);
132 
133 	int (*wakeup_event)(struct perf_sched *sched, struct evsel *evsel,
134 			    struct perf_sample *sample, struct machine *machine);
135 
136 	/* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
137 	int (*fork_event)(struct perf_sched *sched, union perf_event *event,
138 			  struct machine *machine);
139 
140 	int (*migrate_task_event)(struct perf_sched *sched,
141 				  struct evsel *evsel,
142 				  struct perf_sample *sample,
143 				  struct machine *machine);
144 };
145 
146 #define COLOR_PIDS PERF_COLOR_BLUE
147 #define COLOR_CPUS PERF_COLOR_BG_RED
148 
149 struct perf_sched_map {
150 	DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS);
151 	struct perf_cpu		*comp_cpus;
152 	bool			 comp;
153 	struct perf_thread_map *color_pids;
154 	const char		*color_pids_str;
155 	struct perf_cpu_map	*color_cpus;
156 	const char		*color_cpus_str;
157 	const char		*task_name;
158 	struct strlist		*task_names;
159 	bool			fuzzy;
160 	struct perf_cpu_map	*cpus;
161 	const char		*cpus_str;
162 };
163 
164 struct perf_sched {
165 	struct perf_tool tool;
166 	const char	 *sort_order;
167 	unsigned long	 nr_tasks;
168 	struct task_desc **pid_to_task;
169 	struct task_desc **tasks;
170 	const struct trace_sched_handler *tp_handler;
171 	struct mutex	 start_work_mutex;
172 	struct mutex	 work_done_wait_mutex;
173 	int		 profile_cpu;
174 /*
 175  * Track the current task - that way we can know whether there are any
176  * weird events, such as a task being switched away that is not current.
177  */
178 	struct perf_cpu	 max_cpu;
179 	u32		 *curr_pid;
180 	struct thread	 **curr_thread;
181 	struct thread	 **curr_out_thread;
182 	char		 next_shortname1;
183 	char		 next_shortname2;
184 	unsigned int	 replay_repeat;
185 	unsigned long	 nr_run_events;
186 	unsigned long	 nr_sleep_events;
187 	unsigned long	 nr_wakeup_events;
188 	unsigned long	 nr_sleep_corrections;
189 	unsigned long	 nr_run_events_optimized;
190 	unsigned long	 targetless_wakeups;
191 	unsigned long	 multitarget_wakeups;
192 	unsigned long	 nr_runs;
193 	unsigned long	 nr_timestamps;
194 	unsigned long	 nr_unordered_timestamps;
195 	unsigned long	 nr_context_switch_bugs;
196 	unsigned long	 nr_events;
197 	unsigned long	 nr_lost_chunks;
198 	unsigned long	 nr_lost_events;
199 	u64		 run_measurement_overhead;
200 	u64		 sleep_measurement_overhead;
201 	u64		 start_time;
202 	u64		 cpu_usage;
203 	u64		 runavg_cpu_usage;
204 	u64		 parent_cpu_usage;
205 	u64		 runavg_parent_cpu_usage;
206 	u64		 sum_runtime;
207 	u64		 sum_fluct;
208 	u64		 run_avg;
209 	u64		 all_runtime;
210 	u64		 all_count;
211 	u64		 *cpu_last_switched;
212 	struct rb_root_cached atom_root, sorted_atom_root, merged_atom_root;
213 	struct list_head sort_list, cmp_pid;
214 	bool force;
215 	bool skip_merge;
216 	struct perf_sched_map map;
217 
218 	/* options for timehist command */
219 	bool		summary;
220 	bool		summary_only;
221 	bool		idle_hist;
222 	bool		show_callchain;
223 	unsigned int	max_stack;
224 	bool		show_cpu_visual;
225 	bool		show_wakeups;
226 	bool		show_next;
227 	bool		show_migrations;
228 	bool		show_state;
229 	bool		show_prio;
230 	u64		skipped_samples;
231 	const char	*time_str;
232 	struct perf_time_interval ptime;
233 	struct perf_time_interval hist_time;
234 	volatile bool   thread_funcs_exit;
235 	const char	*prio_str;
236 	DECLARE_BITMAP(prio_bitmap, MAX_PRIO);
237 };
238 
239 /* per thread run time data */
240 struct thread_runtime {
241 	u64 last_time;      /* time of previous sched in/out event */
242 	u64 dt_run;         /* run time */
243 	u64 dt_sleep;       /* time between CPU access by sleep (off cpu) */
244 	u64 dt_iowait;      /* time between CPU access by iowait (off cpu) */
245 	u64 dt_preempt;     /* time between CPU access by preempt (off cpu) */
246 	u64 dt_delay;       /* time between wakeup and sched-in */
247 	u64 ready_to_run;   /* time of wakeup */
248 
249 	struct stats run_stats;
250 	u64 total_run_time;
251 	u64 total_sleep_time;
252 	u64 total_iowait_time;
253 	u64 total_preempt_time;
254 	u64 total_delay_time;
255 
256 	char last_state;
257 
258 	char shortname[3];
259 	bool comm_changed;
260 
261 	u64 migrations;
262 
263 	int prio;
264 };
265 
266 /* per event run time data */
267 struct evsel_runtime {
268 	u64 *last_time; /* time this event was last seen per cpu */
269 	u32 ncpu;       /* highest cpu slot allocated */
270 };
271 
272 /* per cpu idle time data */
273 struct idle_thread_runtime {
274 	struct thread_runtime	tr;
275 	struct thread		*last_thread;
276 	struct rb_root_cached	sorted_root;
277 	struct callchain_root	callchain;
278 	struct callchain_cursor	cursor;
279 };
280 
281 /* track idle times per cpu */
282 static struct thread **idle_threads;
283 static int idle_max_cpu;
284 static char idle_comm[] = "<idle>";
285 
286 static u64 get_nsecs(void)
287 {
288 	struct timespec ts;
289 
290 	clock_gettime(CLOCK_MONOTONIC, &ts);
291 
292 	return ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
293 }
294 
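/*
 * Busy-loop until roughly @nsecs have elapsed, stopping
 * run_measurement_overhead early to compensate for the calibrated
 * cost of the timestamp reads themselves.
 */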
295 static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
296 {
297 	u64 T0 = get_nsecs(), T1;
298 
299 	do {
300 		T1 = get_nsecs();
301 	} while (T1 + sched->run_measurement_overhead < T0 + nsecs);
302 }
303 
304 static void sleep_nsecs(u64 nsecs)
305 {
306 	struct timespec ts;
307 
308 	ts.tv_nsec = nsecs % 999999999;
309 	ts.tv_sec = nsecs / 999999999;
310 
311 	nanosleep(&ts, NULL);
312 }
313 
314 static void calibrate_run_measurement_overhead(struct perf_sched *sched)
315 {
316 	u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
317 	int i;
318 
319 	for (i = 0; i < 10; i++) {
320 		T0 = get_nsecs();
321 		burn_nsecs(sched, 0);
322 		T1 = get_nsecs();
323 		delta = T1-T0;
324 		min_delta = min(min_delta, delta);
325 	}
326 	sched->run_measurement_overhead = min_delta;
327 
328 	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
329 }
330 
331 static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
332 {
333 	u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
334 	int i;
335 
336 	for (i = 0; i < 10; i++) {
337 		T0 = get_nsecs();
338 		sleep_nsecs(10000);
339 		T1 = get_nsecs();
340 		delta = T1-T0;
341 		min_delta = min(min_delta, delta);
342 	}
343 	min_delta -= 10000;
344 	sched->sleep_measurement_overhead = min_delta;
345 
346 	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
347 }
348 
349 static struct sched_atom *
350 get_new_event(struct task_desc *task, u64 timestamp)
351 {
352 	struct sched_atom *event = zalloc(sizeof(*event));
353 	unsigned long idx = task->nr_events;
354 	size_t size;
355 
356 	event->timestamp = timestamp;
357 	event->nr = idx;
358 
359 	task->nr_events++;
360 	size = sizeof(struct sched_atom *) * task->nr_events;
361 	task->atoms = realloc(task->atoms, size);
362 	BUG_ON(!task->atoms);
363 
364 	task->atoms[idx] = event;
365 
366 	return event;
367 }
368 
369 static struct sched_atom *last_event(struct task_desc *task)
370 {
371 	if (!task->nr_events)
372 		return NULL;
373 
374 	return task->atoms[task->nr_events - 1];
375 }
376 
377 static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
378 				u64 timestamp, u64 duration)
379 {
380 	struct sched_atom *event, *curr_event = last_event(task);
381 
382 	/*
383 	 * optimize an existing RUN event by merging this one
384 	 * to it:
385 	 */
386 	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
387 		sched->nr_run_events_optimized++;
388 		curr_event->duration += duration;
389 		return;
390 	}
391 
392 	event = get_new_event(task, timestamp);
393 
394 	event->type = SCHED_EVENT_RUN;
395 	event->duration = duration;
396 
397 	sched->nr_run_events++;
398 }
399 
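/*
 * Record a wakeup of @wakee by @task.  If the wakee's most recent
 * event is a sleep with no semaphore attached yet, allocate one and
 * share it between both atoms so the replayed wakeup can release the
 * replayed sleep; otherwise count it as a targetless or multi-target
 * wakeup.
 */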
400 static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
401 				   u64 timestamp, struct task_desc *wakee)
402 {
403 	struct sched_atom *event, *wakee_event;
404 
405 	event = get_new_event(task, timestamp);
406 	event->type = SCHED_EVENT_WAKEUP;
407 	event->wakee = wakee;
408 
409 	wakee_event = last_event(wakee);
410 	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
411 		sched->targetless_wakeups++;
412 		return;
413 	}
414 	if (wakee_event->wait_sem) {
415 		sched->multitarget_wakeups++;
416 		return;
417 	}
418 
419 	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
420 	sem_init(wakee_event->wait_sem, 0, 0);
421 	event->wait_sem = wakee_event->wait_sem;
422 
423 	sched->nr_wakeup_events++;
424 }
425 
426 static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
427 				  u64 timestamp)
428 {
429 	struct sched_atom *event = get_new_event(task, timestamp);
430 
431 	event->type = SCHED_EVENT_SLEEP;
432 
433 	sched->nr_sleep_events++;
434 }
435 
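/*
 * Return the replay task descriptor for @pid, creating it on first
 * use.  The pid_to_task lookup table is sized from kernel/pid_max
 * (falling back to MAX_PID) and grown when a larger pid shows up;
 * every new task starts with an implicit sleep event at timestamp 0.
 */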
436 static struct task_desc *register_pid(struct perf_sched *sched,
437 				      unsigned long pid, const char *comm)
438 {
439 	struct task_desc *task;
440 	static int pid_max;
441 
442 	if (sched->pid_to_task == NULL) {
443 		if (sysctl__read_int("kernel/pid_max", &pid_max) < 0)
444 			pid_max = MAX_PID;
445 		BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL);
446 	}
447 	if (pid >= (unsigned long)pid_max) {
448 		BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + 1) *
449 			sizeof(struct task_desc *))) == NULL);
450 		while (pid >= (unsigned long)pid_max)
451 			sched->pid_to_task[pid_max++] = NULL;
452 	}
453 
454 	task = sched->pid_to_task[pid];
455 
456 	if (task)
457 		return task;
458 
459 	task = zalloc(sizeof(*task));
460 	task->pid = pid;
461 	task->nr = sched->nr_tasks;
462 	strcpy(task->comm, comm);
463 	/*
464 	 * every task starts in sleeping state - this gets ignored
465 	 * if there's no wakeup pointing to this sleep state:
466 	 */
467 	add_sched_event_sleep(sched, task, 0);
468 
469 	sched->pid_to_task[pid] = task;
470 	sched->nr_tasks++;
471 	sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
472 	BUG_ON(!sched->tasks);
473 	sched->tasks[task->nr] = task;
474 
475 	if (verbose > 0)
476 		printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);
477 
478 	return task;
479 }
480 
481 
482 static void print_task_traces(struct perf_sched *sched)
483 {
484 	struct task_desc *task;
485 	unsigned long i;
486 
487 	for (i = 0; i < sched->nr_tasks; i++) {
488 		task = sched->tasks[i];
489 		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
490 			task->nr, task->comm, task->pid, task->nr_events);
491 	}
492 }
493 
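/*
 * Connect the registered tasks into a ring of wakeups: each task gets
 * a wakeup event at timestamp 0 targeting the next task, with the
 * last task wrapping around to the first.
 */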
494 static void add_cross_task_wakeups(struct perf_sched *sched)
495 {
496 	struct task_desc *task1, *task2;
497 	unsigned long i, j;
498 
499 	for (i = 0; i < sched->nr_tasks; i++) {
500 		task1 = sched->tasks[i];
501 		j = i + 1;
502 		if (j == sched->nr_tasks)
503 			j = 0;
504 		task2 = sched->tasks[j];
505 		add_sched_event_wakeup(sched, task1, 0, task2);
506 	}
507 }
508 
509 static void perf_sched__process_event(struct perf_sched *sched,
510 				      struct sched_atom *atom)
511 {
512 	int ret = 0;
513 
514 	switch (atom->type) {
515 		case SCHED_EVENT_RUN:
516 			burn_nsecs(sched, atom->duration);
517 			break;
518 		case SCHED_EVENT_SLEEP:
519 			if (atom->wait_sem)
520 				ret = sem_wait(atom->wait_sem);
521 			BUG_ON(ret);
522 			break;
523 		case SCHED_EVENT_WAKEUP:
524 			if (atom->wait_sem)
525 				ret = sem_post(atom->wait_sem);
526 			BUG_ON(ret);
527 			break;
528 		default:
529 			BUG_ON(1);
530 	}
531 }
532 
533 static u64 get_cpu_usage_nsec_parent(void)
534 {
535 	struct rusage ru;
536 	u64 sum;
537 	int err;
538 
539 	err = getrusage(RUSAGE_SELF, &ru);
540 	BUG_ON(err);
541 
542 	sum =  ru.ru_utime.tv_sec * NSEC_PER_SEC + ru.ru_utime.tv_usec * NSEC_PER_USEC;
543 	sum += ru.ru_stime.tv_sec * NSEC_PER_SEC + ru.ru_stime.tv_usec * NSEC_PER_USEC;
544 
545 	return sum;
546 }
547 
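/*
 * Open a PERF_COUNT_SW_TASK_CLOCK counter whose fd is later read by a
 * replay thread to measure its CPU usage.  On EMFILE with -f given,
 * try to raise RLIMIT_NOFILE and retry; otherwise report the error
 * (hinting at -f) and exit.
 */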
548 static int self_open_counters(struct perf_sched *sched, unsigned long cur_task)
549 {
550 	struct perf_event_attr attr;
551 	char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE];
552 	int fd;
553 	struct rlimit limit;
554 	bool need_privilege = false;
555 
556 	memset(&attr, 0, sizeof(attr));
557 
558 	attr.type = PERF_TYPE_SOFTWARE;
559 	attr.config = PERF_COUNT_SW_TASK_CLOCK;
560 
561 force_again:
562 	fd = sys_perf_event_open(&attr, 0, -1, -1,
563 				 perf_event_open_cloexec_flag());
564 
565 	if (fd < 0) {
566 		if (errno == EMFILE) {
567 			if (sched->force) {
568 				BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1);
569 				limit.rlim_cur += sched->nr_tasks - cur_task;
570 				if (limit.rlim_cur > limit.rlim_max) {
571 					limit.rlim_max = limit.rlim_cur;
572 					need_privilege = true;
573 				}
574 				if (setrlimit(RLIMIT_NOFILE, &limit) == -1) {
575 					if (need_privilege && errno == EPERM)
576 						strcpy(info, "Need privilege\n");
577 				} else
578 					goto force_again;
579 			} else
580 				strcpy(info, "Have a try with -f option\n");
581 		}
582 		pr_err("Error: sys_perf_event_open() syscall returned "
583 		       "with %d (%s)\n%s", fd,
584 		       str_error_r(errno, sbuf, sizeof(sbuf)), info);
585 		exit(EXIT_FAILURE);
586 	}
587 	return fd;
588 }
589 
590 static u64 get_cpu_usage_nsec_self(int fd)
591 {
592 	u64 runtime;
593 	int ret;
594 
595 	ret = read(fd, &runtime, sizeof(runtime));
596 	BUG_ON(ret != sizeof(runtime));
597 
598 	return runtime;
599 }
600 
601 struct sched_thread_parms {
602 	struct task_desc  *task;
603 	struct perf_sched *sched;
604 	int fd;
605 };
606 
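/*
 * Per-task replay thread: post ready_for_work, wait for the parent to
 * release start_work_mutex, replay this task's recorded atoms while
 * measuring its task-clock usage through the counter fd, then post
 * work_done_sem and block on work_done_wait_mutex until the next
 * round.
 */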
607 static void *thread_func(void *ctx)
608 {
609 	struct sched_thread_parms *parms = ctx;
610 	struct task_desc *this_task = parms->task;
611 	struct perf_sched *sched = parms->sched;
612 	u64 cpu_usage_0, cpu_usage_1;
613 	unsigned long i, ret;
614 	char comm2[22];
615 	int fd = parms->fd;
616 
617 	zfree(&parms);
618 
619 	sprintf(comm2, ":%s", this_task->comm);
620 	prctl(PR_SET_NAME, comm2);
621 	if (fd < 0)
622 		return NULL;
623 
624 	while (!sched->thread_funcs_exit) {
625 		ret = sem_post(&this_task->ready_for_work);
626 		BUG_ON(ret);
627 		mutex_lock(&sched->start_work_mutex);
628 		mutex_unlock(&sched->start_work_mutex);
629 
630 		cpu_usage_0 = get_cpu_usage_nsec_self(fd);
631 
632 		for (i = 0; i < this_task->nr_events; i++) {
633 			this_task->curr_event = i;
634 			perf_sched__process_event(sched, this_task->atoms[i]);
635 		}
636 
637 		cpu_usage_1 = get_cpu_usage_nsec_self(fd);
638 		this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
639 		ret = sem_post(&this_task->work_done_sem);
640 		BUG_ON(ret);
641 
642 		mutex_lock(&sched->work_done_wait_mutex);
643 		mutex_unlock(&sched->work_done_wait_mutex);
644 	}
645 	return NULL;
646 }
647 
648 static void create_tasks(struct perf_sched *sched)
649 	EXCLUSIVE_LOCK_FUNCTION(sched->start_work_mutex)
650 	EXCLUSIVE_LOCK_FUNCTION(sched->work_done_wait_mutex)
651 {
652 	struct task_desc *task;
653 	pthread_attr_t attr;
654 	unsigned long i;
655 	int err;
656 
657 	err = pthread_attr_init(&attr);
658 	BUG_ON(err);
659 	err = pthread_attr_setstacksize(&attr,
660 			(size_t) max(16 * 1024, (int)PTHREAD_STACK_MIN));
661 	BUG_ON(err);
662 	mutex_lock(&sched->start_work_mutex);
663 	mutex_lock(&sched->work_done_wait_mutex);
664 	for (i = 0; i < sched->nr_tasks; i++) {
665 		struct sched_thread_parms *parms = malloc(sizeof(*parms));
666 		BUG_ON(parms == NULL);
667 		parms->task = task = sched->tasks[i];
668 		parms->sched = sched;
669 		parms->fd = self_open_counters(sched, i);
670 		sem_init(&task->ready_for_work, 0, 0);
671 		sem_init(&task->work_done_sem, 0, 0);
672 		task->curr_event = 0;
673 		err = pthread_create(&task->thread, &attr, thread_func, parms);
674 		BUG_ON(err);
675 	}
676 }
677 
678 static void destroy_tasks(struct perf_sched *sched)
679 	UNLOCK_FUNCTION(sched->start_work_mutex)
680 	UNLOCK_FUNCTION(sched->work_done_wait_mutex)
681 {
682 	struct task_desc *task;
683 	unsigned long i;
684 	int err;
685 
686 	mutex_unlock(&sched->start_work_mutex);
687 	mutex_unlock(&sched->work_done_wait_mutex);
688 	/* Get rid of threads so they won't be upset by mutex destruction */
689 	for (i = 0; i < sched->nr_tasks; i++) {
690 		task = sched->tasks[i];
691 		err = pthread_join(task->thread, NULL);
692 		BUG_ON(err);
693 		sem_destroy(&task->ready_for_work);
694 		sem_destroy(&task->work_done_sem);
695 	}
696 }
697 
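/*
 * Drive one replay iteration: wait until every worker has posted
 * ready_for_work, release the workers by unlocking start_work_mutex,
 * collect their work_done_sem posts and per-task CPU usage, and fold
 * the totals into the running averages.
 */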
698 static void wait_for_tasks(struct perf_sched *sched)
699 	EXCLUSIVE_LOCKS_REQUIRED(sched->work_done_wait_mutex)
700 	EXCLUSIVE_LOCKS_REQUIRED(sched->start_work_mutex)
701 {
702 	u64 cpu_usage_0, cpu_usage_1;
703 	struct task_desc *task;
704 	unsigned long i, ret;
705 
706 	sched->start_time = get_nsecs();
707 	sched->cpu_usage = 0;
708 	mutex_unlock(&sched->work_done_wait_mutex);
709 
710 	for (i = 0; i < sched->nr_tasks; i++) {
711 		task = sched->tasks[i];
712 		ret = sem_wait(&task->ready_for_work);
713 		BUG_ON(ret);
714 		sem_init(&task->ready_for_work, 0, 0);
715 	}
716 	mutex_lock(&sched->work_done_wait_mutex);
717 
718 	cpu_usage_0 = get_cpu_usage_nsec_parent();
719 
720 	mutex_unlock(&sched->start_work_mutex);
721 
722 	for (i = 0; i < sched->nr_tasks; i++) {
723 		task = sched->tasks[i];
724 		ret = sem_wait(&task->work_done_sem);
725 		BUG_ON(ret);
726 		sem_init(&task->work_done_sem, 0, 0);
727 		sched->cpu_usage += task->cpu_usage;
728 		task->cpu_usage = 0;
729 	}
730 
731 	cpu_usage_1 = get_cpu_usage_nsec_parent();
732 	if (!sched->runavg_cpu_usage)
733 		sched->runavg_cpu_usage = sched->cpu_usage;
734 	sched->runavg_cpu_usage = (sched->runavg_cpu_usage * (sched->replay_repeat - 1) + sched->cpu_usage) / sched->replay_repeat;
735 
736 	sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
737 	if (!sched->runavg_parent_cpu_usage)
738 		sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
739 	sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) +
740 					 sched->parent_cpu_usage)/sched->replay_repeat;
741 
742 	mutex_lock(&sched->start_work_mutex);
743 
744 	for (i = 0; i < sched->nr_tasks; i++) {
745 		task = sched->tasks[i];
746 		task->curr_event = 0;
747 	}
748 }
749 
750 static void run_one_test(struct perf_sched *sched)
751 	EXCLUSIVE_LOCKS_REQUIRED(sched->work_done_wait_mutex)
752 	EXCLUSIVE_LOCKS_REQUIRED(sched->start_work_mutex)
753 {
754 	u64 T0, T1, delta, avg_delta, fluct;
755 
756 	T0 = get_nsecs();
757 	wait_for_tasks(sched);
758 	T1 = get_nsecs();
759 
760 	delta = T1 - T0;
761 	sched->sum_runtime += delta;
762 	sched->nr_runs++;
763 
764 	avg_delta = sched->sum_runtime / sched->nr_runs;
765 	if (delta < avg_delta)
766 		fluct = avg_delta - delta;
767 	else
768 		fluct = delta - avg_delta;
769 	sched->sum_fluct += fluct;
770 	if (!sched->run_avg)
771 		sched->run_avg = delta;
772 	sched->run_avg = (sched->run_avg * (sched->replay_repeat - 1) + delta) / sched->replay_repeat;
773 
774 	printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / NSEC_PER_MSEC);
775 
776 	printf("ravg: %0.2f, ", (double)sched->run_avg / NSEC_PER_MSEC);
777 
778 	printf("cpu: %0.2f / %0.2f",
779 		(double)sched->cpu_usage / NSEC_PER_MSEC, (double)sched->runavg_cpu_usage / NSEC_PER_MSEC);
780 
781 #if 0
782 	/*
783 	 * rusage statistics done by the parent; these are less
784 	 * accurate than the sched->sum_exec_runtime based statistics:
785 	 */
786 	printf(" [%0.2f / %0.2f]",
787 		(double)sched->parent_cpu_usage / NSEC_PER_MSEC,
788 		(double)sched->runavg_parent_cpu_usage / NSEC_PER_MSEC);
789 #endif
790 
791 	printf("\n");
792 
793 	if (sched->nr_sleep_corrections)
794 		printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
795 	sched->nr_sleep_corrections = 0;
796 }
797 
798 static void test_calibrations(struct perf_sched *sched)
799 {
800 	u64 T0, T1;
801 
802 	T0 = get_nsecs();
803 	burn_nsecs(sched, NSEC_PER_MSEC);
804 	T1 = get_nsecs();
805 
806 	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);
807 
808 	T0 = get_nsecs();
809 	sleep_nsecs(NSEC_PER_MSEC);
810 	T1 = get_nsecs();
811 
812 	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
813 }
814 
815 static int
816 replay_wakeup_event(struct perf_sched *sched,
817 		    struct evsel *evsel, struct perf_sample *sample,
818 		    struct machine *machine __maybe_unused)
819 {
820 	const char *comm = evsel__strval(evsel, sample, "comm");
821 	const u32 pid	 = evsel__intval(evsel, sample, "pid");
822 	struct task_desc *waker, *wakee;
823 
824 	if (verbose > 0) {
825 		printf("sched_wakeup event %p\n", evsel);
826 
827 		printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
828 	}
829 
830 	waker = register_pid(sched, sample->tid, "<unknown>");
831 	wakee = register_pid(sched, pid, comm);
832 
833 	add_sched_event_wakeup(sched, waker, sample->time, wakee);
834 	return 0;
835 }
836 
837 static int replay_switch_event(struct perf_sched *sched,
838 			       struct evsel *evsel,
839 			       struct perf_sample *sample,
840 			       struct machine *machine __maybe_unused)
841 {
842 	const char *prev_comm  = evsel__strval(evsel, sample, "prev_comm"),
843 		   *next_comm  = evsel__strval(evsel, sample, "next_comm");
844 	const u32 prev_pid = evsel__intval(evsel, sample, "prev_pid"),
845 		  next_pid = evsel__intval(evsel, sample, "next_pid");
846 	struct task_desc *prev, __maybe_unused *next;
847 	u64 timestamp0, timestamp = sample->time;
848 	int cpu = sample->cpu;
849 	s64 delta;
850 
851 	if (verbose > 0)
852 		printf("sched_switch event %p\n", evsel);
853 
854 	if (cpu >= MAX_CPUS || cpu < 0)
855 		return 0;
856 
857 	timestamp0 = sched->cpu_last_switched[cpu];
858 	if (timestamp0)
859 		delta = timestamp - timestamp0;
860 	else
861 		delta = 0;
862 
863 	if (delta < 0) {
864 		pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
865 		return -1;
866 	}
867 
868 	pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
869 		 prev_comm, prev_pid, next_comm, next_pid, delta);
870 
871 	prev = register_pid(sched, prev_pid, prev_comm);
872 	next = register_pid(sched, next_pid, next_comm);
873 
874 	sched->cpu_last_switched[cpu] = timestamp;
875 
876 	add_sched_event_run(sched, prev, timestamp, delta);
877 	add_sched_event_sleep(sched, prev, timestamp);
878 
879 	return 0;
880 }
881 
882 static int replay_fork_event(struct perf_sched *sched,
883 			     union perf_event *event,
884 			     struct machine *machine)
885 {
886 	struct thread *child, *parent;
887 
888 	child = machine__findnew_thread(machine, event->fork.pid,
889 					event->fork.tid);
890 	parent = machine__findnew_thread(machine, event->fork.ppid,
891 					 event->fork.ptid);
892 
893 	if (child == NULL || parent == NULL) {
894 		pr_debug("thread does not exist on fork event: child %p, parent %p\n",
895 				 child, parent);
896 		goto out_put;
897 	}
898 
899 	if (verbose > 0) {
900 		printf("fork event\n");
901 		printf("... parent: %s/%d\n", thread__comm_str(parent), thread__tid(parent));
902 		printf("...  child: %s/%d\n", thread__comm_str(child), thread__tid(child));
903 	}
904 
905 	register_pid(sched, thread__tid(parent), thread__comm_str(parent));
906 	register_pid(sched, thread__tid(child), thread__comm_str(child));
907 out_put:
908 	thread__put(child);
909 	thread__put(parent);
910 	return 0;
911 }
912 
913 struct sort_dimension {
914 	const char		*name;
915 	sort_fn_t		cmp;
916 	struct list_head	list;
917 };
918 
919 static inline void init_prio(struct thread_runtime *r)
920 {
921 	r->prio = -1;
922 }
923 
924 /*
925  * handle runtime stats saved per thread
926  */
927 static struct thread_runtime *thread__init_runtime(struct thread *thread)
928 {
929 	struct thread_runtime *r;
930 
931 	r = zalloc(sizeof(struct thread_runtime));
932 	if (!r)
933 		return NULL;
934 
935 	init_stats(&r->run_stats);
936 	init_prio(r);
937 	thread__set_priv(thread, r);
938 
939 	return r;
940 }
941 
942 static struct thread_runtime *thread__get_runtime(struct thread *thread)
943 {
944 	struct thread_runtime *tr;
945 
946 	tr = thread__priv(thread);
947 	if (tr == NULL) {
948 		tr = thread__init_runtime(thread);
949 		if (tr == NULL)
950 			pr_debug("Failed to malloc memory for runtime data.\n");
951 	}
952 
953 	return tr;
954 }
955 
956 static int
957 thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
958 {
959 	struct sort_dimension *sort;
960 	int ret = 0;
961 
962 	BUG_ON(list_empty(list));
963 
964 	list_for_each_entry(sort, list, list) {
965 		ret = sort->cmp(l, r);
966 		if (ret)
967 			return ret;
968 	}
969 
970 	return ret;
971 }
972 
973 static struct work_atoms *
974 thread_atoms_search(struct rb_root_cached *root, struct thread *thread,
975 			 struct list_head *sort_list)
976 {
977 	struct rb_node *node = root->rb_root.rb_node;
978 	struct work_atoms key = { .thread = thread };
979 
980 	while (node) {
981 		struct work_atoms *atoms;
982 		int cmp;
983 
984 		atoms = container_of(node, struct work_atoms, node);
985 
986 		cmp = thread_lat_cmp(sort_list, &key, atoms);
987 		if (cmp > 0)
988 			node = node->rb_left;
989 		else if (cmp < 0)
990 			node = node->rb_right;
991 		else {
992 			BUG_ON(thread != atoms->thread);
993 			return atoms;
994 		}
995 	}
996 	return NULL;
997 }
998 
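/*
 * Insert @data into the rbtree at @root, ordered by the comparators
 * listed in @sort_list (see thread_lat_cmp()).
 */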
999 static void
1000 __thread_latency_insert(struct rb_root_cached *root, struct work_atoms *data,
1001 			 struct list_head *sort_list)
1002 {
1003 	struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL;
1004 	bool leftmost = true;
1005 
1006 	while (*new) {
1007 		struct work_atoms *this;
1008 		int cmp;
1009 
1010 		this = container_of(*new, struct work_atoms, node);
1011 		parent = *new;
1012 
1013 		cmp = thread_lat_cmp(sort_list, data, this);
1014 
1015 		if (cmp > 0)
1016 			new = &((*new)->rb_left);
1017 		else {
1018 			new = &((*new)->rb_right);
1019 			leftmost = false;
1020 		}
1021 	}
1022 
1023 	rb_link_node(&data->node, parent, new);
1024 	rb_insert_color_cached(&data->node, root, leftmost);
1025 }
1026 
1027 static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
1028 {
1029 	struct work_atoms *atoms = zalloc(sizeof(*atoms));
1030 	if (!atoms) {
1031 		pr_err("No memory at %s\n", __func__);
1032 		return -1;
1033 	}
1034 
1035 	atoms->thread = thread__get(thread);
1036 	INIT_LIST_HEAD(&atoms->work_list);
1037 	__thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
1038 	return 0;
1039 }
1040 
1041 static int
1042 add_sched_out_event(struct work_atoms *atoms,
1043 		    char run_state,
1044 		    u64 timestamp)
1045 {
1046 	struct work_atom *atom = zalloc(sizeof(*atom));
1047 	if (!atom) {
1048 		pr_err("No memory at %s", __func__);
1049 		return -1;
1050 	}
1051 
1052 	atom->sched_out_time = timestamp;
1053 
1054 	if (run_state == 'R') {
1055 		atom->state = THREAD_WAIT_CPU;
1056 		atom->wake_up_time = atom->sched_out_time;
1057 	}
1058 
1059 	list_add_tail(&atom->list, &atoms->work_list);
1060 	return 0;
1061 }
1062 
1063 static void
1064 add_runtime_event(struct work_atoms *atoms, u64 delta,
1065 		  u64 timestamp __maybe_unused)
1066 {
1067 	struct work_atom *atom;
1068 
1069 	BUG_ON(list_empty(&atoms->work_list));
1070 
1071 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1072 
1073 	atom->runtime += delta;
1074 	atoms->total_runtime += delta;
1075 }
1076 
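/*
 * Complete the thread's most recent atom: if it was waiting for a
 * CPU, record the sched-in time and account the wakeup-to-sched-in
 * delta as latency, updating the per-thread total and maximum.
 */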
1077 static void
1078 add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
1079 {
1080 	struct work_atom *atom;
1081 	u64 delta;
1082 
1083 	if (list_empty(&atoms->work_list))
1084 		return;
1085 
1086 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1087 
1088 	if (atom->state != THREAD_WAIT_CPU)
1089 		return;
1090 
1091 	if (timestamp < atom->wake_up_time) {
1092 		atom->state = THREAD_IGNORE;
1093 		return;
1094 	}
1095 
1096 	atom->state = THREAD_SCHED_IN;
1097 	atom->sched_in_time = timestamp;
1098 
1099 	delta = atom->sched_in_time - atom->wake_up_time;
1100 	atoms->total_lat += delta;
1101 	if (delta > atoms->max_lat) {
1102 		atoms->max_lat = delta;
1103 		atoms->max_lat_start = atom->wake_up_time;
1104 		atoms->max_lat_end = timestamp;
1105 	}
1106 	atoms->nb_atoms++;
1107 }
1108 
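/*
 * sched_switch handler for 'perf sched latency': record a sched-out
 * atom for the outgoing task (with its run state) and complete the
 * pending atom of the incoming task, accounting its wakeup-to-sched-in
 * latency.
 */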
1109 static int latency_switch_event(struct perf_sched *sched,
1110 				struct evsel *evsel,
1111 				struct perf_sample *sample,
1112 				struct machine *machine)
1113 {
1114 	const u32 prev_pid = evsel__intval(evsel, sample, "prev_pid"),
1115 		  next_pid = evsel__intval(evsel, sample, "next_pid");
1116 	const char prev_state = evsel__taskstate(evsel, sample, "prev_state");
1117 	struct work_atoms *out_events, *in_events;
1118 	struct thread *sched_out, *sched_in;
1119 	u64 timestamp0, timestamp = sample->time;
1120 	int cpu = sample->cpu, err = -1;
1121 	s64 delta;
1122 
1123 	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
1124 
1125 	timestamp0 = sched->cpu_last_switched[cpu];
1126 	sched->cpu_last_switched[cpu] = timestamp;
1127 	if (timestamp0)
1128 		delta = timestamp - timestamp0;
1129 	else
1130 		delta = 0;
1131 
1132 	if (delta < 0) {
1133 		pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
1134 		return -1;
1135 	}
1136 
1137 	sched_out = machine__findnew_thread(machine, -1, prev_pid);
1138 	sched_in = machine__findnew_thread(machine, -1, next_pid);
1139 	if (sched_out == NULL || sched_in == NULL)
1140 		goto out_put;
1141 
1142 	out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
1143 	if (!out_events) {
1144 		if (thread_atoms_insert(sched, sched_out))
1145 			goto out_put;
1146 		out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
1147 		if (!out_events) {
1148 			pr_err("out-event: Internal tree error");
1149 			goto out_put;
1150 		}
1151 	}
1152 	if (add_sched_out_event(out_events, prev_state, timestamp))
1153 		return -1;
1154 
1155 	in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
1156 	if (!in_events) {
1157 		if (thread_atoms_insert(sched, sched_in))
1158 			goto out_put;
1159 		in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
1160 		if (!in_events) {
1161 			pr_err("in-event: Internal tree error");
1162 			goto out_put;
1163 		}
1164 		/*
1165 		 * Task came in that we have not heard about yet,
1166 		 * add an initial atom in runnable state:
1167 		 */
1168 		if (add_sched_out_event(in_events, 'R', timestamp))
1169 			goto out_put;
1170 	}
1171 	add_sched_in_event(in_events, timestamp);
1172 	err = 0;
1173 out_put:
1174 	thread__put(sched_out);
1175 	thread__put(sched_in);
1176 	return err;
1177 }
1178 
1179 static int latency_runtime_event(struct perf_sched *sched,
1180 				 struct evsel *evsel,
1181 				 struct perf_sample *sample,
1182 				 struct machine *machine)
1183 {
1184 	const u32 pid	   = evsel__intval(evsel, sample, "pid");
1185 	const u64 runtime  = evsel__intval(evsel, sample, "runtime");
1186 	struct thread *thread = machine__findnew_thread(machine, -1, pid);
1187 	struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
1188 	u64 timestamp = sample->time;
1189 	int cpu = sample->cpu, err = -1;
1190 
1191 	if (thread == NULL)
1192 		return -1;
1193 
1194 	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
1195 	if (!atoms) {
1196 		if (thread_atoms_insert(sched, thread))
1197 			goto out_put;
1198 		atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
1199 		if (!atoms) {
1200 			pr_err("in-event: Internal tree error");
1201 			goto out_put;
1202 		}
1203 		if (add_sched_out_event(atoms, 'R', timestamp))
1204 			goto out_put;
1205 	}
1206 
1207 	add_runtime_event(atoms, runtime, timestamp);
1208 	err = 0;
1209 out_put:
1210 	thread__put(thread);
1211 	return err;
1212 }
1213 
1214 static int latency_wakeup_event(struct perf_sched *sched,
1215 				struct evsel *evsel,
1216 				struct perf_sample *sample,
1217 				struct machine *machine)
1218 {
1219 	const u32 pid	  = evsel__intval(evsel, sample, "pid");
1220 	struct work_atoms *atoms;
1221 	struct work_atom *atom;
1222 	struct thread *wakee;
1223 	u64 timestamp = sample->time;
1224 	int err = -1;
1225 
1226 	wakee = machine__findnew_thread(machine, -1, pid);
1227 	if (wakee == NULL)
1228 		return -1;
1229 	atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1230 	if (!atoms) {
1231 		if (thread_atoms_insert(sched, wakee))
1232 			goto out_put;
1233 		atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1234 		if (!atoms) {
1235 			pr_err("wakeup-event: Internal tree error");
1236 			goto out_put;
1237 		}
1238 		if (add_sched_out_event(atoms, 'S', timestamp))
1239 			goto out_put;
1240 	}
1241 
1242 	BUG_ON(list_empty(&atoms->work_list));
1243 
1244 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1245 
1246 	/*
1247 	 * A wakeup event is not guaranteed to happen while the task is
1248 	 * off the run queue; it may also fire while the task is already
1249 	 * on the run queue and merely changes ->state to TASK_RUNNING,
1250 	 * in which case we should not set ->wake_up_time for a task
1251 	 * that is still on the run queue.
1252 	 *
1253 	 * You WILL be missing events if you've recorded only
1254 	 * one CPU, or are looking at only one, so don't
1255 	 * skip in this case.
1256 	 */
1257 	if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
1258 		goto out_ok;
1259 
1260 	sched->nr_timestamps++;
1261 	if (atom->sched_out_time > timestamp) {
1262 		sched->nr_unordered_timestamps++;
1263 		goto out_ok;
1264 	}
1265 
1266 	atom->state = THREAD_WAIT_CPU;
1267 	atom->wake_up_time = timestamp;
1268 out_ok:
1269 	err = 0;
1270 out_put:
1271 	thread__put(wakee);
1272 	return err;
1273 }
1274 
1275 static int latency_migrate_task_event(struct perf_sched *sched,
1276 				      struct evsel *evsel,
1277 				      struct perf_sample *sample,
1278 				      struct machine *machine)
1279 {
1280 	const u32 pid = evsel__intval(evsel, sample, "pid");
1281 	u64 timestamp = sample->time;
1282 	struct work_atoms *atoms;
1283 	struct work_atom *atom;
1284 	struct thread *migrant;
1285 	int err = -1;
1286 
1287 	/*
1288 	 * Only need to worry about migration when profiling one CPU.
1289 	 */
1290 	if (sched->profile_cpu == -1)
1291 		return 0;
1292 
1293 	migrant = machine__findnew_thread(machine, -1, pid);
1294 	if (migrant == NULL)
1295 		return -1;
1296 	atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1297 	if (!atoms) {
1298 		if (thread_atoms_insert(sched, migrant))
1299 			goto out_put;
1300 		register_pid(sched, thread__tid(migrant), thread__comm_str(migrant));
1301 		atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1302 		if (!atoms) {
1303 			pr_err("migration-event: Internal tree error");
1304 			goto out_put;
1305 		}
1306 		if (add_sched_out_event(atoms, 'R', timestamp))
1307 			goto out_put;
1308 	}
1309 
1310 	BUG_ON(list_empty(&atoms->work_list));
1311 
1312 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1313 	atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;
1314 
1315 	sched->nr_timestamps++;
1316 
1317 	if (atom->sched_out_time > timestamp)
1318 		sched->nr_unordered_timestamps++;
1319 	err = 0;
1320 out_put:
1321 	thread__put(migrant);
1322 	return err;
1323 }
1324 
1325 static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
1326 {
1327 	int i;
1328 	int ret;
1329 	u64 avg;
1330 	char max_lat_start[32], max_lat_end[32];
1331 
1332 	if (!work_list->nb_atoms)
1333 		return;
1334 	/*
1335 	 * Ignore idle threads:
1336 	 */
1337 	if (!strcmp(thread__comm_str(work_list->thread), "swapper"))
1338 		return;
1339 
1340 	sched->all_runtime += work_list->total_runtime;
1341 	sched->all_count   += work_list->nb_atoms;
1342 
1343 	if (work_list->num_merged > 1) {
1344 		ret = printf("  %s:(%d) ", thread__comm_str(work_list->thread),
1345 			     work_list->num_merged);
1346 	} else {
1347 		ret = printf("  %s:%d ", thread__comm_str(work_list->thread),
1348 			     thread__tid(work_list->thread));
1349 	}
1350 
1351 	for (i = 0; i < 24 - ret; i++)
1352 		printf(" ");
1353 
1354 	avg = work_list->total_lat / work_list->nb_atoms;
1355 	timestamp__scnprintf_usec(work_list->max_lat_start, max_lat_start, sizeof(max_lat_start));
1356 	timestamp__scnprintf_usec(work_list->max_lat_end, max_lat_end, sizeof(max_lat_end));
1357 
1358 	printf("|%11.3f ms |%9" PRIu64 " | avg:%8.3f ms | max:%8.3f ms | max start: %12s s | max end: %12s s\n",
1359 	      (double)work_list->total_runtime / NSEC_PER_MSEC,
1360 		 work_list->nb_atoms, (double)avg / NSEC_PER_MSEC,
1361 		 (double)work_list->max_lat / NSEC_PER_MSEC,
1362 		 max_lat_start, max_lat_end);
1363 }
1364 
1365 static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
1366 {
1367 	pid_t l_tid, r_tid;
1368 
1369 	if (RC_CHK_EQUAL(l->thread, r->thread))
1370 		return 0;
1371 	l_tid = thread__tid(l->thread);
1372 	r_tid = thread__tid(r->thread);
1373 	if (l_tid < r_tid)
1374 		return -1;
1375 	if (l_tid > r_tid)
1376 		return 1;
1377 	return (int)(RC_CHK_ACCESS(l->thread) - RC_CHK_ACCESS(r->thread));
1378 }
1379 
1380 static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
1381 {
1382 	u64 avgl, avgr;
1383 
1384 	if (!l->nb_atoms)
1385 		return -1;
1386 
1387 	if (!r->nb_atoms)
1388 		return 1;
1389 
1390 	avgl = l->total_lat / l->nb_atoms;
1391 	avgr = r->total_lat / r->nb_atoms;
1392 
1393 	if (avgl < avgr)
1394 		return -1;
1395 	if (avgl > avgr)
1396 		return 1;
1397 
1398 	return 0;
1399 }
1400 
1401 static int max_cmp(struct work_atoms *l, struct work_atoms *r)
1402 {
1403 	if (l->max_lat < r->max_lat)
1404 		return -1;
1405 	if (l->max_lat > r->max_lat)
1406 		return 1;
1407 
1408 	return 0;
1409 }
1410 
1411 static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
1412 {
1413 	if (l->nb_atoms < r->nb_atoms)
1414 		return -1;
1415 	if (l->nb_atoms > r->nb_atoms)
1416 		return 1;
1417 
1418 	return 0;
1419 }
1420 
1421 static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
1422 {
1423 	if (l->total_runtime < r->total_runtime)
1424 		return -1;
1425 	if (l->total_runtime > r->total_runtime)
1426 		return 1;
1427 
1428 	return 0;
1429 }
1430 
1431 static int sort_dimension__add(const char *tok, struct list_head *list)
1432 {
1433 	size_t i;
1434 	static struct sort_dimension avg_sort_dimension = {
1435 		.name = "avg",
1436 		.cmp  = avg_cmp,
1437 	};
1438 	static struct sort_dimension max_sort_dimension = {
1439 		.name = "max",
1440 		.cmp  = max_cmp,
1441 	};
1442 	static struct sort_dimension pid_sort_dimension = {
1443 		.name = "pid",
1444 		.cmp  = pid_cmp,
1445 	};
1446 	static struct sort_dimension runtime_sort_dimension = {
1447 		.name = "runtime",
1448 		.cmp  = runtime_cmp,
1449 	};
1450 	static struct sort_dimension switch_sort_dimension = {
1451 		.name = "switch",
1452 		.cmp  = switch_cmp,
1453 	};
1454 	struct sort_dimension *available_sorts[] = {
1455 		&pid_sort_dimension,
1456 		&avg_sort_dimension,
1457 		&max_sort_dimension,
1458 		&switch_sort_dimension,
1459 		&runtime_sort_dimension,
1460 	};
1461 
1462 	for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
1463 		if (!strcmp(available_sorts[i]->name, tok)) {
1464 			list_add_tail(&available_sorts[i]->list, list);
1465 
1466 			return 0;
1467 		}
1468 	}
1469 
1470 	return -1;
1471 }
1472 
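/*
 * Re-sort the collected work atoms: drain atom_root (and then
 * merged_atom_root) into sorted_atom_root using the user-selected
 * sort keys in sort_list.
 */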
1473 static void perf_sched__sort_lat(struct perf_sched *sched)
1474 {
1475 	struct rb_node *node;
1476 	struct rb_root_cached *root = &sched->atom_root;
1477 again:
1478 	for (;;) {
1479 		struct work_atoms *data;
1480 		node = rb_first_cached(root);
1481 		if (!node)
1482 			break;
1483 
1484 		rb_erase_cached(node, root);
1485 		data = rb_entry(node, struct work_atoms, node);
1486 		__thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
1487 	}
1488 	if (root == &sched->atom_root) {
1489 		root = &sched->merged_atom_root;
1490 		goto again;
1491 	}
1492 }
1493 
1494 static int process_sched_wakeup_event(const struct perf_tool *tool,
1495 				      struct evsel *evsel,
1496 				      struct perf_sample *sample,
1497 				      struct machine *machine)
1498 {
1499 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1500 
1501 	if (sched->tp_handler->wakeup_event)
1502 		return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);
1503 
1504 	return 0;
1505 }
1506 
1507 static int process_sched_wakeup_ignore(const struct perf_tool *tool __maybe_unused,
1508 				      struct evsel *evsel __maybe_unused,
1509 				      struct perf_sample *sample __maybe_unused,
1510 				      struct machine *machine __maybe_unused)
1511 {
1512 	return 0;
1513 }
1514 
1515 union map_priv {
1516 	void	*ptr;
1517 	bool	 color;
1518 };
1519 
1520 static bool thread__has_color(struct thread *thread)
1521 {
1522 	union map_priv priv = {
1523 		.ptr = thread__priv(thread),
1524 	};
1525 
1526 	return priv.color;
1527 }
1528 
1529 static struct thread*
1530 map__findnew_thread(struct perf_sched *sched, struct machine *machine, pid_t pid, pid_t tid)
1531 {
1532 	struct thread *thread = machine__findnew_thread(machine, pid, tid);
1533 	union map_priv priv = {
1534 		.color = false,
1535 	};
1536 
1537 	if (!sched->map.color_pids || !thread || thread__priv(thread))
1538 		return thread;
1539 
1540 	if (thread_map__has(sched->map.color_pids, tid))
1541 		priv.color = true;
1542 
1543 	thread__set_priv(thread, priv.ptr);
1544 	return thread;
1545 }
1546 
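/*
 * Return true if @comm_str matches any of the names passed via
 * --task-name, using substring matching when fuzzy matching is
 * enabled.
 */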
1547 static bool sched_match_task(struct perf_sched *sched, const char *comm_str)
1548 {
1549 	bool fuzzy_match = sched->map.fuzzy;
1550 	struct strlist *task_names = sched->map.task_names;
1551 	struct str_node *node;
1552 
1553 	strlist__for_each_entry(node, task_names) {
1554 		bool match_found = fuzzy_match ? !!strstr(comm_str, node->s) :
1555 							!strcmp(comm_str, node->s);
1556 		if (match_found)
1557 			return true;
1558 	}
1559 
1560 	return false;
1561 }
1562 
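/*
 * Print one row of the CPU map: a two-character shortname slot per
 * (possibly compacted) CPU, with '*' marking the CPU the current
 * sample occurred on.
 */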
1563 static void print_sched_map(struct perf_sched *sched, struct perf_cpu this_cpu, int cpus_nr,
1564 								const char *color, bool sched_out)
1565 {
1566 	for (int i = 0; i < cpus_nr; i++) {
1567 		struct perf_cpu cpu = {
1568 			.cpu = sched->map.comp ? sched->map.comp_cpus[i].cpu : i,
1569 		};
1570 		struct thread *curr_thread = sched->curr_thread[cpu.cpu];
1571 		struct thread *curr_out_thread = sched->curr_out_thread[cpu.cpu];
1572 		struct thread_runtime *curr_tr;
1573 		const char *pid_color = color;
1574 		const char *cpu_color = color;
1575 		char symbol = ' ';
1576 		struct thread *thread_to_check = sched_out ? curr_out_thread : curr_thread;
1577 
1578 		if (thread_to_check && thread__has_color(thread_to_check))
1579 			pid_color = COLOR_PIDS;
1580 
1581 		if (sched->map.color_cpus && perf_cpu_map__has(sched->map.color_cpus, cpu))
1582 			cpu_color = COLOR_CPUS;
1583 
1584 		if (cpu.cpu == this_cpu.cpu)
1585 			symbol = '*';
1586 
1587 		color_fprintf(stdout, cpu.cpu != this_cpu.cpu ? color : cpu_color, "%c", symbol);
1588 
1589 		thread_to_check = sched_out ? sched->curr_out_thread[cpu.cpu] :
1590 								sched->curr_thread[cpu.cpu];
1591 
1592 		if (thread_to_check) {
1593 			curr_tr = thread__get_runtime(thread_to_check);
1594 			if (curr_tr == NULL)
1595 				return;
1596 
1597 			if (sched_out) {
1598 				if (cpu.cpu == this_cpu.cpu)
1599 					color_fprintf(stdout, color, "-  ");
1600 				else {
1601 					curr_tr = thread__get_runtime(sched->curr_thread[cpu.cpu]);
1602 					if (curr_tr != NULL)
1603 						color_fprintf(stdout, pid_color, "%2s ",
1604 										curr_tr->shortname);
1605 				}
1606 			} else
1607 				color_fprintf(stdout, pid_color, "%2s ", curr_tr->shortname);
1608 		} else
1609 			color_fprintf(stdout, color, "   ");
1610 	}
1611 }
1612 
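/*
 * sched_switch handler for 'perf sched map': track the task now
 * running on this CPU, assign it a two-letter shortname on first
 * sight and print the updated per-CPU map line, subject to the map's
 * cpu and task-name filters.
 */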
1613 static int map_switch_event(struct perf_sched *sched, struct evsel *evsel,
1614 			    struct perf_sample *sample, struct machine *machine)
1615 {
1616 	const u32 next_pid = evsel__intval(evsel, sample, "next_pid");
1617 	const u32 prev_pid = evsel__intval(evsel, sample, "prev_pid");
1618 	struct thread *sched_in, *sched_out;
1619 	struct thread_runtime *tr;
1620 	int new_shortname;
1621 	u64 timestamp0, timestamp = sample->time;
1622 	s64 delta;
1623 	struct perf_cpu this_cpu = {
1624 		.cpu = sample->cpu,
1625 	};
1626 	int cpus_nr;
1627 	int proceed;
1628 	bool new_cpu = false;
1629 	const char *color = PERF_COLOR_NORMAL;
1630 	char stimestamp[32];
1631 	const char *str;
1632 
1633 	BUG_ON(this_cpu.cpu >= MAX_CPUS || this_cpu.cpu < 0);
1634 
1635 	if (this_cpu.cpu > sched->max_cpu.cpu)
1636 		sched->max_cpu = this_cpu;
1637 
1638 	if (sched->map.comp) {
1639 		cpus_nr = bitmap_weight(sched->map.comp_cpus_mask, MAX_CPUS);
1640 		if (!__test_and_set_bit(this_cpu.cpu, sched->map.comp_cpus_mask)) {
1641 			sched->map.comp_cpus[cpus_nr++] = this_cpu;
1642 			new_cpu = true;
1643 		}
1644 	} else
1645 		cpus_nr = sched->max_cpu.cpu;
1646 
1647 	timestamp0 = sched->cpu_last_switched[this_cpu.cpu];
1648 	sched->cpu_last_switched[this_cpu.cpu] = timestamp;
1649 	if (timestamp0)
1650 		delta = timestamp - timestamp0;
1651 	else
1652 		delta = 0;
1653 
1654 	if (delta < 0) {
1655 		pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
1656 		return -1;
1657 	}
1658 
1659 	sched_in = map__findnew_thread(sched, machine, -1, next_pid);
1660 	sched_out = map__findnew_thread(sched, machine, -1, prev_pid);
1661 	if (sched_in == NULL || sched_out == NULL)
1662 		return -1;
1663 
1664 	tr = thread__get_runtime(sched_in);
1665 	if (tr == NULL) {
1666 		thread__put(sched_in);
1667 		return -1;
1668 	}
1669 
1670 	sched->curr_thread[this_cpu.cpu] = thread__get(sched_in);
1671 	sched->curr_out_thread[this_cpu.cpu] = thread__get(sched_out);
1672 
1673 	str = thread__comm_str(sched_in);
1674 	new_shortname = 0;
1675 	if (!tr->shortname[0]) {
1676 		if (!strcmp(thread__comm_str(sched_in), "swapper")) {
1677 			/*
1678 			 * Don't allocate a letter-number for swapper:0
1679 			 * as a shortname. Instead, we use '.' for it.
1680 			 */
1681 			tr->shortname[0] = '.';
1682 			tr->shortname[1] = ' ';
1683 		} else if (!sched->map.task_name || sched_match_task(sched, str)) {
1684 			tr->shortname[0] = sched->next_shortname1;
1685 			tr->shortname[1] = sched->next_shortname2;
1686 
1687 			if (sched->next_shortname1 < 'Z') {
1688 				sched->next_shortname1++;
1689 			} else {
1690 				sched->next_shortname1 = 'A';
1691 				if (sched->next_shortname2 < '9')
1692 					sched->next_shortname2++;
1693 				else
1694 					sched->next_shortname2 = '0';
1695 			}
1696 		} else {
1697 			tr->shortname[0] = '-';
1698 			tr->shortname[1] = ' ';
1699 		}
1700 		new_shortname = 1;
1701 	}
1702 
1703 	if (sched->map.cpus && !perf_cpu_map__has(sched->map.cpus, this_cpu))
1704 		goto out;
1705 
1706 	proceed = 0;
1707 	str = thread__comm_str(sched_in);
1708 	/*
1709 	 * Check which of sched_in and sched_out matches the passed --task-name
1710 	 * arguments and call the corresponding print_sched_map.
1711 	 */
1712 	if (sched->map.task_name && !sched_match_task(sched, str)) {
1713 		if (!sched_match_task(sched, thread__comm_str(sched_out)))
1714 			goto out;
1715 		else
1716 			goto sched_out;
1717 
1718 	} else {
1719 		str = thread__comm_str(sched_out);
1720 		if (!(sched->map.task_name && !sched_match_task(sched, str)))
1721 			proceed = 1;
1722 	}
1723 
1724 	printf("  ");
1725 
1726 	print_sched_map(sched, this_cpu, cpus_nr, color, false);
1727 
1728 	timestamp__scnprintf_usec(timestamp, stimestamp, sizeof(stimestamp));
1729 	color_fprintf(stdout, color, "  %12s secs ", stimestamp);
1730 	if (new_shortname || tr->comm_changed || (verbose > 0 && thread__tid(sched_in))) {
1731 		const char *pid_color = color;
1732 
1733 		if (thread__has_color(sched_in))
1734 			pid_color = COLOR_PIDS;
1735 
1736 		color_fprintf(stdout, pid_color, "%s => %s:%d",
1737 			tr->shortname, thread__comm_str(sched_in), thread__tid(sched_in));
1738 		tr->comm_changed = false;
1739 	}
1740 
1741 	if (sched->map.comp && new_cpu)
1742 		color_fprintf(stdout, color, " (CPU %d)", this_cpu.cpu);
1743 
1744 	if (proceed != 1) {
1745 		color_fprintf(stdout, color, "\n");
1746 		goto out;
1747 	}
1748 
1749 sched_out:
1750 	if (sched->map.task_name) {
1751 		tr = thread__get_runtime(sched->curr_out_thread[this_cpu.cpu]);
1752 		if (strcmp(tr->shortname, "") == 0)
1753 			goto out;
1754 
1755 		if (proceed == 1)
1756 			color_fprintf(stdout, color, "\n");
1757 
1758 		printf("  ");
1759 		print_sched_map(sched, this_cpu, cpus_nr, color, true);
1760 		timestamp__scnprintf_usec(timestamp, stimestamp, sizeof(stimestamp));
1761 		color_fprintf(stdout, color, "  %12s secs ", stimestamp);
1762 	}
1763 
1764 	color_fprintf(stdout, color, "\n");
1765 
1766 out:
1767 	if (sched->map.task_name)
1768 		thread__put(sched_out);
1769 
1770 	thread__put(sched_in);
1771 
1772 	return 0;
1773 }
1774 
1775 static int process_sched_switch_event(const struct perf_tool *tool,
1776 				      struct evsel *evsel,
1777 				      struct perf_sample *sample,
1778 				      struct machine *machine)
1779 {
1780 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1781 	int this_cpu = sample->cpu, err = 0;
1782 	u32 prev_pid = evsel__intval(evsel, sample, "prev_pid"),
1783 	    next_pid = evsel__intval(evsel, sample, "next_pid");
1784 
1785 	if (sched->curr_pid[this_cpu] != (u32)-1) {
1786 		/*
1787 		 * Are we trying to switch away a PID that is
1788 		 * not current?
1789 		 */
1790 		if (sched->curr_pid[this_cpu] != prev_pid)
1791 			sched->nr_context_switch_bugs++;
1792 	}
1793 
1794 	if (sched->tp_handler->switch_event)
1795 		err = sched->tp_handler->switch_event(sched, evsel, sample, machine);
1796 
1797 	sched->curr_pid[this_cpu] = next_pid;
1798 	return err;
1799 }
1800 
1801 static int process_sched_runtime_event(const struct perf_tool *tool,
1802 				       struct evsel *evsel,
1803 				       struct perf_sample *sample,
1804 				       struct machine *machine)
1805 {
1806 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1807 
1808 	if (sched->tp_handler->runtime_event)
1809 		return sched->tp_handler->runtime_event(sched, evsel, sample, machine);
1810 
1811 	return 0;
1812 }
1813 
1814 static int perf_sched__process_fork_event(const struct perf_tool *tool,
1815 					  union perf_event *event,
1816 					  struct perf_sample *sample,
1817 					  struct machine *machine)
1818 {
1819 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1820 
1821 	/* run the fork event through the perf machinery */
1822 	perf_event__process_fork(tool, event, sample, machine);
1823 
1824 	/* and then run additional processing needed for this command */
1825 	if (sched->tp_handler->fork_event)
1826 		return sched->tp_handler->fork_event(sched, event, machine);
1827 
1828 	return 0;
1829 }
1830 
1831 static int process_sched_migrate_task_event(const struct perf_tool *tool,
1832 					    struct evsel *evsel,
1833 					    struct perf_sample *sample,
1834 					    struct machine *machine)
1835 {
1836 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1837 
1838 	if (sched->tp_handler->migrate_task_event)
1839 		return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);
1840 
1841 	return 0;
1842 }
1843 
1844 typedef int (*tracepoint_handler)(const struct perf_tool *tool,
1845 				  struct evsel *evsel,
1846 				  struct perf_sample *sample,
1847 				  struct machine *machine);
1848 
1849 static int perf_sched__process_tracepoint_sample(const struct perf_tool *tool __maybe_unused,
1850 						 union perf_event *event __maybe_unused,
1851 						 struct perf_sample *sample,
1852 						 struct evsel *evsel,
1853 						 struct machine *machine)
1854 {
1855 	int err = 0;
1856 
1857 	if (evsel->handler != NULL) {
1858 		tracepoint_handler f = evsel->handler;
1859 		err = f(tool, evsel, sample, machine);
1860 	}
1861 
1862 	return err;
1863 }
1864 
1865 static int perf_sched__process_comm(const struct perf_tool *tool __maybe_unused,
1866 				    union perf_event *event,
1867 				    struct perf_sample *sample,
1868 				    struct machine *machine)
1869 {
1870 	struct thread *thread;
1871 	struct thread_runtime *tr;
1872 	int err;
1873 
1874 	err = perf_event__process_comm(tool, event, sample, machine);
1875 	if (err)
1876 		return err;
1877 
1878 	thread = machine__find_thread(machine, sample->pid, sample->tid);
1879 	if (!thread) {
1880 		pr_err("Internal error: can't find thread\n");
1881 		return -1;
1882 	}
1883 
1884 	tr = thread__get_runtime(thread);
1885 	if (tr == NULL) {
1886 		thread__put(thread);
1887 		return -1;
1888 	}
1889 
1890 	tr->comm_changed = true;
1891 	thread__put(thread);
1892 
1893 	return 0;
1894 }
1895 
1896 static int perf_sched__read_events(struct perf_sched *sched)
1897 {
1898 	struct evsel_str_handler handlers[] = {
1899 		{ "sched:sched_switch",	      process_sched_switch_event, },
1900 		{ "sched:sched_stat_runtime", process_sched_runtime_event, },
1901 		{ "sched:sched_wakeup",	      process_sched_wakeup_event, },
1902 		{ "sched:sched_waking",	      process_sched_wakeup_event, },
1903 		{ "sched:sched_wakeup_new",   process_sched_wakeup_event, },
1904 		{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
1905 	};
1906 	struct perf_session *session;
1907 	struct perf_data data = {
1908 		.path  = input_name,
1909 		.mode  = PERF_DATA_MODE_READ,
1910 		.force = sched->force,
1911 	};
1912 	int rc = -1;
1913 
1914 	session = perf_session__new(&data, &sched->tool);
1915 	if (IS_ERR(session)) {
1916 		pr_debug("Error creating perf session");
1917 		return PTR_ERR(session);
1918 	}
1919 
1920 	symbol__init(&session->header.env);
1921 
1922 	/* prefer sched_waking if it is captured */
1923 	if (evlist__find_tracepoint_by_name(session->evlist, "sched:sched_waking"))
1924 		handlers[2].handler = process_sched_wakeup_ignore;
1925 
1926 	if (perf_session__set_tracepoints_handlers(session, handlers))
1927 		goto out_delete;
1928 
1929 	if (perf_session__has_traces(session, "record -R")) {
1930 		int err = perf_session__process_events(session);
1931 		if (err) {
1932 			pr_err("Failed to process events, error %d", err);
1933 			goto out_delete;
1934 		}
1935 
1936 		sched->nr_events      = session->evlist->stats.nr_events[0];
1937 		sched->nr_lost_events = session->evlist->stats.total_lost;
1938 		sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST];
1939 	}
1940 
1941 	rc = 0;
1942 out_delete:
1943 	perf_session__delete(session);
1944 	return rc;
1945 }
1946 
1947 /*
1948  * scheduling times are printed as msec.usec
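 * e.g. 12345678 nsecs is printed as 12.345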
1949  */
1950 static inline void print_sched_time(unsigned long long nsecs, int width)
1951 {
1952 	unsigned long msecs;
1953 	unsigned long usecs;
1954 
1955 	msecs  = nsecs / NSEC_PER_MSEC;
1956 	nsecs -= msecs * NSEC_PER_MSEC;
1957 	usecs  = nsecs / NSEC_PER_USEC;
1958 	printf("%*lu.%03lu ", width, msecs, usecs);
1959 }
1960 
1961 /*
1962  * returns runtime data for event, allocating memory for it the
1963  * first time it is used.
1964  */
1965 static struct evsel_runtime *evsel__get_runtime(struct evsel *evsel)
1966 {
1967 	struct evsel_runtime *r = evsel->priv;
1968 
1969 	if (r == NULL) {
1970 		r = zalloc(sizeof(struct evsel_runtime));
1971 		evsel->priv = r;
1972 	}
1973 
1974 	return r;
1975 }
1976 
1977 /*
1978  * save last time event was seen per cpu
1979  */
1980 static void evsel__save_time(struct evsel *evsel, u64 timestamp, u32 cpu)
1981 {
1982 	struct evsel_runtime *r = evsel__get_runtime(evsel);
1983 
1984 	if (r == NULL)
1985 		return;
1986 
1987 	if ((cpu >= r->ncpu) || (r->last_time == NULL)) {
1988 		int i, n = __roundup_pow_of_two(cpu+1);
1989 		void *p = r->last_time;
1990 
1991 		p = realloc(r->last_time, n * sizeof(u64));
1992 		if (!p)
1993 			return;
1994 
1995 		r->last_time = p;
1996 		for (i = r->ncpu; i < n; ++i)
1997 			r->last_time[i] = (u64) 0;
1998 
1999 		r->ncpu = n;
2000 	}
2001 
2002 	r->last_time[cpu] = timestamp;
2003 }
2004 
2005 /* returns last time this event was seen on the given cpu */
2006 static u64 evsel__get_time(struct evsel *evsel, u32 cpu)
2007 {
2008 	struct evsel_runtime *r = evsel__get_runtime(evsel);
2009 
2010 	if ((r == NULL) || (r->last_time == NULL) || (cpu >= r->ncpu))
2011 		return 0;
2012 
2013 	return r->last_time[cpu];
2014 }
2015 
2016 static int comm_width = 30;
2017 
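/*
 * Format a task as "comm[tid]" ("comm[tid/pid]" for threads, plain
 * "comm" for pid 0) in a static buffer, widening comm_width to fit
 * the longest string seen so far.
 */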
2018 static char *timehist_get_commstr(struct thread *thread)
2019 {
2020 	static char str[32];
2021 	const char *comm = thread__comm_str(thread);
2022 	pid_t tid = thread__tid(thread);
2023 	pid_t pid = thread__pid(thread);
2024 	int n;
2025 
2026 	if (pid == 0)
2027 		n = scnprintf(str, sizeof(str), "%s", comm);
2028 
2029 	else if (tid != pid)
2030 		n = scnprintf(str, sizeof(str), "%s[%d/%d]", comm, tid, pid);
2031 
2032 	else
2033 		n = scnprintf(str, sizeof(str), "%s[%d]", comm, tid);
2034 
2035 	if (n > comm_width)
2036 		comm_width = n;
2037 
2038 	return str;
2039 }
2040 
2041 /* prio field format: xxx or xxx->yyy */
2042 #define MAX_PRIO_STR_LEN 8
2043 static char *timehist_get_priostr(struct evsel *evsel,
2044 				  struct thread *thread,
2045 				  struct perf_sample *sample)
2046 {
2047 	static char prio_str[16];
2048 	int prev_prio = (int)evsel__intval(evsel, sample, "prev_prio");
2049 	struct thread_runtime *tr = thread__priv(thread);
2050 
2051 	if (tr->prio != prev_prio && tr->prio != -1)
2052 		scnprintf(prio_str, sizeof(prio_str), "%d->%d", tr->prio, prev_prio);
2053 	else
2054 		scnprintf(prio_str, sizeof(prio_str), "%d", prev_prio);
2055 
2056 	return prio_str;
2057 }
2058 
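/*
 * Print the three timehist header rows - column names, units and a
 * dotted separator - honoring the optional cpu-visual, prio and state
 * columns.
 */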
2059 static void timehist_header(struct perf_sched *sched)
2060 {
2061 	u32 ncpus = sched->max_cpu.cpu + 1;
2062 	u32 i, j;
2063 
2064 	printf("%15s %6s ", "time", "cpu");
2065 
2066 	if (sched->show_cpu_visual) {
2067 		printf(" ");
2068 		for (i = 0, j = 0; i < ncpus; ++i) {
2069 			printf("%x", j++);
2070 			if (j > 15)
2071 				j = 0;
2072 		}
2073 		printf(" ");
2074 	}
2075 
2076 	if (sched->show_prio) {
2077 		printf(" %-*s  %-*s  %9s  %9s  %9s",
2078 		       comm_width, "task name", MAX_PRIO_STR_LEN, "prio",
2079 		       "wait time", "sch delay", "run time");
2080 	} else {
2081 		printf(" %-*s  %9s  %9s  %9s", comm_width,
2082 		       "task name", "wait time", "sch delay", "run time");
2083 	}
2084 
2085 	if (sched->show_state)
2086 		printf("  %s", "state");
2087 
2088 	printf("\n");
2089 
2090 	/*
2091 	 * units row
2092 	 */
2093 	printf("%15s %-6s ", "", "");
2094 
2095 	if (sched->show_cpu_visual)
2096 		printf(" %*s ", ncpus, "");
2097 
2098 	if (sched->show_prio) {
2099 		printf(" %-*s  %-*s  %9s  %9s  %9s",
2100 		       comm_width, "[tid/pid]", MAX_PRIO_STR_LEN, "",
2101 		       "(msec)", "(msec)", "(msec)");
2102 	} else {
2103 		printf(" %-*s  %9s  %9s  %9s", comm_width,
2104 		       "[tid/pid]", "(msec)", "(msec)", "(msec)");
2105 	}
2106 
2107 	if (sched->show_state)
2108 		printf("  %5s", "");
2109 
2110 	printf("\n");
2111 
2112 	/*
2113 	 * separator
2114 	 */
2115 	printf("%.15s %.6s ", graph_dotted_line, graph_dotted_line);
2116 
2117 	if (sched->show_cpu_visual)
2118 		printf(" %.*s ", ncpus, graph_dotted_line);
2119 
2120 	if (sched->show_prio) {
2121 		printf(" %.*s  %.*s  %.9s  %.9s  %.9s",
2122 		       comm_width, graph_dotted_line, MAX_PRIO_STR_LEN, graph_dotted_line,
2123 		       graph_dotted_line, graph_dotted_line, graph_dotted_line);
2124 	} else {
2125 		printf(" %.*s  %.9s  %.9s  %.9s", comm_width,
2126 		       graph_dotted_line, graph_dotted_line, graph_dotted_line,
2127 		       graph_dotted_line);
2128 	}
2129 
2130 	if (sched->show_state)
2131 		printf("  %.5s", graph_dotted_line);
2132 
2133 	printf("\n");
2134 }
2135 
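/*
 * Print one timehist line for a sched_switch sample: timestamp, cpu,
 * optional cpu visual, task name, wait/sch-delay/run times and, when
 * enabled, the state, next task and callchain.
 */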
2136 static void timehist_print_sample(struct perf_sched *sched,
2137 				  struct evsel *evsel,
2138 				  struct perf_sample *sample,
2139 				  struct addr_location *al,
2140 				  struct thread *thread,
2141 				  u64 t, const char state)
2142 {
2143 	struct thread_runtime *tr = thread__priv(thread);
2144 	const char *next_comm = evsel__strval(evsel, sample, "next_comm");
2145 	const u32 next_pid = evsel__intval(evsel, sample, "next_pid");
2146 	u32 max_cpus = sched->max_cpu.cpu + 1;
2147 	char tstr[64];
2148 	char nstr[30];
2149 	u64 wait_time;
2150 
2151 	if (cpu_list && !test_bit(sample->cpu, cpu_bitmap))
2152 		return;
2153 
2154 	timestamp__scnprintf_usec(t, tstr, sizeof(tstr));
2155 	printf("%15s [%04d] ", tstr, sample->cpu);
2156 
2157 	if (sched->show_cpu_visual) {
2158 		u32 i;
2159 		char c;
2160 
2161 		printf(" ");
2162 		for (i = 0; i < max_cpus; ++i) {
2163 			/* flag idle times with 'i'; others are sched events */
2164 			if (i == sample->cpu)
2165 				c = (thread__tid(thread) == 0) ? 'i' : 's';
2166 			else
2167 				c = ' ';
2168 			printf("%c", c);
2169 		}
2170 		printf(" ");
2171 	}
2172 
2173 	printf(" %-*s ", comm_width, timehist_get_commstr(thread));
2174 
2175 	if (sched->show_prio)
2176 		printf(" %-*s ", MAX_PRIO_STR_LEN, timehist_get_priostr(evsel, thread, sample));
2177 
2178 	wait_time = tr->dt_sleep + tr->dt_iowait + tr->dt_preempt;
2179 	print_sched_time(wait_time, 6);
2180 
2181 	print_sched_time(tr->dt_delay, 6);
2182 	print_sched_time(tr->dt_run, 6);
2183 
2184 	if (sched->show_state)
2185 		printf(" %5c ", thread__tid(thread) == 0 ? 'I' : state);
2186 
2187 	if (sched->show_next) {
2188 		snprintf(nstr, sizeof(nstr), "next: %s[%d]", next_comm, next_pid);
2189 		printf(" %-*s", comm_width, nstr);
2190 	}
2191 
2192 	if (sched->show_wakeups && !sched->show_next)
2193 		printf("  %-*s", comm_width, "");
2194 
2195 	if (thread__tid(thread) == 0)
2196 		goto out;
2197 
2198 	if (sched->show_callchain)
2199 		printf("  ");
2200 
2201 	sample__fprintf_sym(sample, al, 0,
2202 			    EVSEL__PRINT_SYM | EVSEL__PRINT_ONELINE |
2203 			    EVSEL__PRINT_CALLCHAIN_ARROW |
2204 			    EVSEL__PRINT_SKIP_IGNORED,
2205 			    get_tls_callchain_cursor(), symbol_conf.bt_stop_list,  stdout);
2206 
2207 out:
2208 	printf("\n");
2209 }
2210 
2211 /*
2212  * Explanation of delta-time stats:
2213  *
2214  *            t = time of current schedule out event
2215  *        tprev = time of previous sched out event
2216  *                also time of schedule-in event for current task
2217  *    last_time = time of last sched change event for current task
2218  *                (i.e., time the process was last scheduled out)
2219  * ready_to_run = time of wakeup for current task
2220  *
2221  * -----|------------|------------|------------|------
2222  *    last         ready        tprev          t
2223  *    time         to run
2224  *
2225  *      |-------- dt_wait --------|
2226  *                   |- dt_delay -|-- dt_run --|
2227  *
2228  *   dt_run = run time of current task
2229  *  dt_wait = time between last schedule out event for task and tprev
2230  *            represents time spent off the cpu
2231  * dt_delay = time between wakeup and schedule-in of task
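 *
 * Example (all times in usec): last_time = 100, ready_to_run = 150,
 * tprev = 180, t = 230 gives dt_wait = 80 (charged to sleep, iowait or
 * preempt depending on last_state), dt_delay = 30 and dt_run = 50.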
2232  */
2233 
2234 static void timehist_update_runtime_stats(struct thread_runtime *r,
2235 					 u64 t, u64 tprev)
2236 {
2237 	r->dt_delay   = 0;
2238 	r->dt_sleep   = 0;
2239 	r->dt_iowait  = 0;
2240 	r->dt_preempt = 0;
2241 	r->dt_run     = 0;
2242 
2243 	if (tprev) {
2244 		r->dt_run = t - tprev;
2245 		if (r->ready_to_run) {
2246 			if (r->ready_to_run > tprev)
2247 				pr_debug("time travel: wakeup time for task > previous sched_switch event\n");
2248 			else
2249 				r->dt_delay = tprev - r->ready_to_run;
2250 		}
2251 
2252 		if (r->last_time > tprev)
2253 			pr_debug("time travel: last sched out time for task > previous sched_switch event\n");
2254 		else if (r->last_time) {
2255 			u64 dt_wait = tprev - r->last_time;
2256 
2257 			if (r->last_state == 'R')
2258 				r->dt_preempt = dt_wait;
2259 			else if (r->last_state == 'D')
2260 				r->dt_iowait = dt_wait;
2261 			else
2262 				r->dt_sleep = dt_wait;
2263 		}
2264 	}
2265 
2266 	update_stats(&r->run_stats, r->dt_run);
2267 
2268 	r->total_run_time     += r->dt_run;
2269 	r->total_delay_time   += r->dt_delay;
2270 	r->total_sleep_time   += r->dt_sleep;
2271 	r->total_iowait_time  += r->dt_iowait;
2272 	r->total_preempt_time += r->dt_preempt;
2273 }
2274 
2275 static bool is_idle_sample(struct perf_sample *sample,
2276 			   struct evsel *evsel)
2277 {
2278 	/* pid 0 == swapper == idle task */
2279 	if (evsel__name_is(evsel, "sched:sched_switch"))
2280 		return evsel__intval(evsel, sample, "prev_pid") == 0;
2281 
2282 	return sample->pid == 0;
2283 }
2284 
2285 static void save_task_callchain(struct perf_sched *sched,
2286 				struct perf_sample *sample,
2287 				struct evsel *evsel,
2288 				struct machine *machine)
2289 {
2290 	struct callchain_cursor *cursor;
2291 	struct thread *thread;
2292 
2293 	/* want main thread for process - has maps */
2294 	thread = machine__findnew_thread(machine, sample->pid, sample->pid);
2295 	if (thread == NULL) {
2296 		pr_debug("Failed to get thread for pid %d.\n", sample->pid);
2297 		return;
2298 	}
2299 
2300 	if (!sched->show_callchain || sample->callchain == NULL)
2301 		return;
2302 
2303 	cursor = get_tls_callchain_cursor();
2304 
2305 	if (thread__resolve_callchain(thread, cursor, evsel, sample,
2306 				      NULL, NULL, sched->max_stack + 2) != 0) {
2307 		if (verbose > 0)
2308 			pr_err("Failed to resolve callchain. Skipping\n");
2309 
2310 		return;
2311 	}
2312 
2313 	callchain_cursor_commit(cursor);
2314 
2315 	while (true) {
2316 		struct callchain_cursor_node *node;
2317 		struct symbol *sym;
2318 
2319 		node = callchain_cursor_current(cursor);
2320 		if (node == NULL)
2321 			break;
2322 
2323 		sym = node->ms.sym;
2324 		if (sym) {
2325 			if (!strcmp(sym->name, "schedule") ||
2326 			    !strcmp(sym->name, "__schedule") ||
2327 			    !strcmp(sym->name, "preempt_schedule"))
2328 				sym->ignore = 1;
2329 		}
2330 
2331 		callchain_cursor_advance(cursor);
2332 	}
2333 }
2334 
2335 static int init_idle_thread(struct thread *thread)
2336 {
2337 	struct idle_thread_runtime *itr;
2338 
2339 	thread__set_comm(thread, idle_comm, 0);
2340 
2341 	itr = zalloc(sizeof(*itr));
2342 	if (itr == NULL)
2343 		return -ENOMEM;
2344 
2345 	init_prio(&itr->tr);
2346 	init_stats(&itr->tr.run_stats);
2347 	callchain_init(&itr->callchain);
2348 	callchain_cursor_reset(&itr->cursor);
2349 	thread__set_priv(thread, itr);
2350 
2351 	return 0;
2352 }
2353 
2354 /*
2355  * Track idle stats per cpu by maintaining a local thread
2356  * struct for the idle task on each cpu.
2357  */
2358 static int init_idle_threads(int ncpu)
2359 {
2360 	int i, ret;
2361 
2362 	idle_threads = zalloc(ncpu * sizeof(struct thread *));
2363 	if (!idle_threads)
2364 		return -ENOMEM;
2365 
2366 	idle_max_cpu = ncpu;
2367 
2368 	/* allocate and initialize a thread struct for each cpu */
2369 	for (i = 0; i < ncpu; ++i) {
2370 		idle_threads[i] = thread__new(0, 0);
2371 		if (idle_threads[i] == NULL)
2372 			return -ENOMEM;
2373 
2374 		ret = init_idle_thread(idle_threads[i]);
2375 		if (ret < 0)
2376 			return ret;
2377 	}
2378 
2379 	return 0;
2380 }
2381 
2382 static void free_idle_threads(void)
2383 {
2384 	int i;
2385 
2386 	if (idle_threads == NULL)
2387 		return;
2388 
2389 	for (i = 0; i < idle_max_cpu; ++i) {
2390 		if ((idle_threads[i]))
2391 			thread__delete(idle_threads[i]);
2392 	}
2393 
2394 	free(idle_threads);
2395 }
2396 
2397 static struct thread *get_idle_thread(int cpu)
2398 {
2399 	/*
2400 	 * expand/allocate array of pointers to local thread
2401 	 * structs if needed
2402 	 */
2403 	if ((cpu >= idle_max_cpu) || (idle_threads == NULL)) {
2404 		int i, j = __roundup_pow_of_two(cpu+1);
2405 		void *p;
2406 
2407 		p = realloc(idle_threads, j * sizeof(struct thread *));
2408 		if (!p)
2409 			return NULL;
2410 
2411 		idle_threads = (struct thread **) p;
2412 		for (i = idle_max_cpu; i < j; ++i)
2413 			idle_threads[i] = NULL;
2414 
2415 		idle_max_cpu = j;
2416 	}
2417 
2418 	/* allocate a new thread struct if needed */
2419 	if (idle_threads[cpu] == NULL) {
2420 		idle_threads[cpu] = thread__new(0, 0);
2421 
2422 		if (idle_threads[cpu]) {
2423 			if (init_idle_thread(idle_threads[cpu]) < 0)
2424 				return NULL;
2425 		}
2426 	}
2427 
2428 	return idle_threads[cpu];
2429 }
2430 
2431 static void save_idle_callchain(struct perf_sched *sched,
2432 				struct idle_thread_runtime *itr,
2433 				struct perf_sample *sample)
2434 {
2435 	struct callchain_cursor *cursor;
2436 
2437 	if (!sched->show_callchain || sample->callchain == NULL)
2438 		return;
2439 
2440 	cursor = get_tls_callchain_cursor();
2441 	if (cursor == NULL)
2442 		return;
2443 
2444 	callchain_cursor__copy(&itr->cursor, cursor);
2445 }
2446 
2447 static struct thread *timehist_get_thread(struct perf_sched *sched,
2448 					  struct perf_sample *sample,
2449 					  struct machine *machine,
2450 					  struct evsel *evsel)
2451 {
2452 	struct thread *thread;
2453 
2454 	if (is_idle_sample(sample, evsel)) {
2455 		thread = get_idle_thread(sample->cpu);
2456 		if (thread == NULL)
2457 			pr_err("Failed to get idle thread for cpu %d.\n", sample->cpu);
2458 
2459 	} else {
2460 		/* there were samples with tid 0 but non-zero pid */
2461 		thread = machine__findnew_thread(machine, sample->pid,
2462 						 sample->tid ?: sample->pid);
2463 		if (thread == NULL) {
2464 			pr_debug("Failed to get thread for tid %d. skipping sample.\n",
2465 				 sample->tid);
2466 		}
2467 
2468 		save_task_callchain(sched, sample, evsel, machine);
2469 		if (sched->idle_hist) {
2470 			struct thread *idle;
2471 			struct idle_thread_runtime *itr;
2472 
2473 			idle = get_idle_thread(sample->cpu);
2474 			if (idle == NULL) {
2475 				pr_err("Failed to get idle thread for cpu %d.\n", sample->cpu);
2476 				return NULL;
2477 			}
2478 
2479 			itr = thread__priv(idle);
2480 			if (itr == NULL)
2481 				return NULL;
2482 
2483 			itr->last_thread = thread;
2484 
2485 			/* copy the task callchain when entering idle */
2486 			if (evsel__intval(evsel, sample, "next_pid") == 0)
2487 				save_idle_callchain(sched, itr, sample);
2488 		}
2489 	}
2490 
2491 	return thread;
2492 }
2493 
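/*
 * Decide whether a sample should be skipped: the thread is filtered
 * out, its priority falls outside the requested --prio range, or idle
 * histogram mode only wants sched_switch events to/from the idle task.
 */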
2494 static bool timehist_skip_sample(struct perf_sched *sched,
2495 				 struct thread *thread,
2496 				 struct evsel *evsel,
2497 				 struct perf_sample *sample)
2498 {
2499 	bool rc = false;
2500 	int prio = -1;
2501 	struct thread_runtime *tr = NULL;
2502 
2503 	if (thread__is_filtered(thread)) {
2504 		rc = true;
2505 		sched->skipped_samples++;
2506 	}
2507 
2508 	if (sched->prio_str) {
2509 		/*
2510 		 * Because the priority may change while the task runs, first
2511 		 * read the priority saved from the previous sched_in event for
2512 		 * the current task.  If none was saved, fall back to the
2513 		 * priority in the current task's sched_out event.
2514 		 */
2515 		tr = thread__get_runtime(thread);
2516 		if (tr && tr->prio != -1)
2517 			prio = tr->prio;
2518 		else if (evsel__name_is(evsel, "sched:sched_switch"))
2519 			prio = evsel__intval(evsel, sample, "prev_prio");
2520 
2521 		if (prio != -1 && !test_bit(prio, sched->prio_bitmap)) {
2522 			rc = true;
2523 			sched->skipped_samples++;
2524 		}
2525 	}
2526 
2527 	if (sched->idle_hist) {
2528 		if (!evsel__name_is(evsel, "sched:sched_switch"))
2529 			rc = true;
2530 		else if (evsel__intval(evsel, sample, "prev_pid") != 0 &&
2531 			 evsel__intval(evsel, sample, "next_pid") != 0)
2532 			rc = true;
2533 	}
2534 
2535 	return rc;
2536 }
2537 
2538 static void timehist_print_wakeup_event(struct perf_sched *sched,
2539 					struct evsel *evsel,
2540 					struct perf_sample *sample,
2541 					struct machine *machine,
2542 					struct thread *awakened)
2543 {
2544 	struct thread *thread;
2545 	char tstr[64];
2546 
2547 	thread = machine__findnew_thread(machine, sample->pid, sample->tid);
2548 	if (thread == NULL)
2549 		return;
2550 
2551 	/* show the wakeup unless both the wakee and the waker are filtered out */
2552 	if (timehist_skip_sample(sched, thread, evsel, sample) &&
2553 	    timehist_skip_sample(sched, awakened, evsel, sample)) {
2554 		return;
2555 	}
2556 
2557 	timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
2558 	printf("%15s [%04d] ", tstr, sample->cpu);
2559 	if (sched->show_cpu_visual)
2560 		printf(" %*s ", sched->max_cpu.cpu + 1, "");
2561 
2562 	printf(" %-*s ", comm_width, timehist_get_commstr(thread));
2563 
2564 	/* dt spacer */
2565 	printf("  %9s  %9s  %9s ", "", "", "");
2566 
2567 	printf("awakened: %s", timehist_get_commstr(awakened));
2568 
2569 	printf("\n");
2570 }
2571 
2572 static int timehist_sched_wakeup_ignore(const struct perf_tool *tool __maybe_unused,
2573 					union perf_event *event __maybe_unused,
2574 					struct evsel *evsel __maybe_unused,
2575 					struct perf_sample *sample __maybe_unused,
2576 					struct machine *machine __maybe_unused)
2577 {
2578 	return 0;
2579 }
2580 
2581 static int timehist_sched_wakeup_event(const struct perf_tool *tool,
2582 				       union perf_event *event __maybe_unused,
2583 				       struct evsel *evsel,
2584 				       struct perf_sample *sample,
2585 				       struct machine *machine)
2586 {
2587 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
2588 	struct thread *thread;
2589 	struct thread_runtime *tr = NULL;
2590 	/* want pid of awakened task not pid in sample */
2591 	const u32 pid = evsel__intval(evsel, sample, "pid");
2592 
2593 	thread = machine__findnew_thread(machine, 0, pid);
2594 	if (thread == NULL)
2595 		return -1;
2596 
2597 	tr = thread__get_runtime(thread);
2598 	if (tr == NULL)
2599 		return -1;
2600 
2601 	if (tr->ready_to_run == 0)
2602 		tr->ready_to_run = sample->time;
2603 
2604 	/* show wakeups if requested */
2605 	if (sched->show_wakeups &&
2606 	    !perf_time__skip_sample(&sched->ptime, sample->time))
2607 		timehist_print_wakeup_event(sched, evsel, sample, machine, thread);
2608 
2609 	return 0;
2610 }
2611 
2612 static void timehist_print_migration_event(struct perf_sched *sched,
2613 					struct evsel *evsel,
2614 					struct perf_sample *sample,
2615 					struct machine *machine,
2616 					struct thread *migrated)
2617 {
2618 	struct thread *thread;
2619 	char tstr[64];
2620 	u32 max_cpus;
2621 	u32 ocpu, dcpu;
2622 
2623 	if (sched->summary_only)
2624 		return;
2625 
2626 	max_cpus = sched->max_cpu.cpu + 1;
2627 	ocpu = evsel__intval(evsel, sample, "orig_cpu");
2628 	dcpu = evsel__intval(evsel, sample, "dest_cpu");
2629 
2630 	thread = machine__findnew_thread(machine, sample->pid, sample->tid);
2631 	if (thread == NULL)
2632 		return;
2633 
2634 	if (timehist_skip_sample(sched, thread, evsel, sample) &&
2635 	    timehist_skip_sample(sched, migrated, evsel, sample)) {
2636 		return;
2637 	}
2638 
2639 	timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
2640 	printf("%15s [%04d] ", tstr, sample->cpu);
2641 
2642 	if (sched->show_cpu_visual) {
2643 		u32 i;
2644 		char c;
2645 
2646 		printf("  ");
2647 		for (i = 0; i < max_cpus; ++i) {
2648 			c = (i == sample->cpu) ? 'm' : ' ';
2649 			printf("%c", c);
2650 		}
2651 		printf("  ");
2652 	}
2653 
2654 	printf(" %-*s ", comm_width, timehist_get_commstr(thread));
2655 
2656 	/* dt spacer */
2657 	printf("  %9s  %9s  %9s ", "", "", "");
2658 
2659 	printf("migrated: %s", timehist_get_commstr(migrated));
2660 	printf(" cpu %d => %d", ocpu, dcpu);
2661 
2662 	printf("\n");
2663 }
2664 
2665 static int timehist_migrate_task_event(const struct perf_tool *tool,
2666 				       union perf_event *event __maybe_unused,
2667 				       struct evsel *evsel,
2668 				       struct perf_sample *sample,
2669 				       struct machine *machine)
2670 {
2671 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
2672 	struct thread *thread;
2673 	struct thread_runtime *tr = NULL;
2674 	/* want pid of migrated task not pid in sample */
2675 	const u32 pid = evsel__intval(evsel, sample, "pid");
2676 
2677 	thread = machine__findnew_thread(machine, 0, pid);
2678 	if (thread == NULL)
2679 		return -1;
2680 
2681 	tr = thread__get_runtime(thread);
2682 	if (tr == NULL)
2683 		return -1;
2684 
2685 	tr->migrations++;
2686 
2687 	/* show migrations if requested */
2688 	timehist_print_migration_event(sched, evsel, sample, machine, thread);
2689 
2690 	return 0;
2691 }
2692 
2693 static void timehist_update_task_prio(struct evsel *evsel,
2694 				      struct perf_sample *sample,
2695 				      struct machine *machine)
2696 {
2697 	struct thread *thread;
2698 	struct thread_runtime *tr = NULL;
2699 	const u32 next_pid = evsel__intval(evsel, sample, "next_pid");
2700 	const u32 next_prio = evsel__intval(evsel, sample, "next_prio");
2701 
2702 	if (next_pid == 0)
2703 		thread = get_idle_thread(sample->cpu);
2704 	else
2705 		thread = machine__findnew_thread(machine, -1, next_pid);
2706 
2707 	if (thread == NULL)
2708 		return;
2709 
2710 	tr = thread__get_runtime(thread);
2711 	if (tr == NULL)
2712 		return;
2713 
2714 	tr->prio = next_prio;
2715 }
2716 
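/*
 * Core timehist handler for sched_switch: resolve the outgoing task,
 * clamp the sample to the requested time window, update the task's
 * runtime stats and print the per-event line unless only a summary
 * was requested.
 */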
2717 static int timehist_sched_change_event(const struct perf_tool *tool,
2718 				       union perf_event *event,
2719 				       struct evsel *evsel,
2720 				       struct perf_sample *sample,
2721 				       struct machine *machine)
2722 {
2723 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
2724 	struct perf_time_interval *ptime = &sched->ptime;
2725 	struct addr_location al;
2726 	struct thread *thread;
2727 	struct thread_runtime *tr = NULL;
2728 	u64 tprev, t = sample->time;
2729 	int rc = 0;
2730 	const char state = evsel__taskstate(evsel, sample, "prev_state");
2731 
2732 	addr_location__init(&al);
2733 	if (machine__resolve(machine, &al, sample) < 0) {
2734 		pr_err("problem processing %d event. skipping it\n",
2735 		       event->header.type);
2736 		rc = -1;
2737 		goto out;
2738 	}
2739 
2740 	if (sched->show_prio || sched->prio_str)
2741 		timehist_update_task_prio(evsel, sample, machine);
2742 
2743 	thread = timehist_get_thread(sched, sample, machine, evsel);
2744 	if (thread == NULL) {
2745 		rc = -1;
2746 		goto out;
2747 	}
2748 
2749 	if (timehist_skip_sample(sched, thread, evsel, sample))
2750 		goto out;
2751 
2752 	tr = thread__get_runtime(thread);
2753 	if (tr == NULL) {
2754 		rc = -1;
2755 		goto out;
2756 	}
2757 
2758 	tprev = evsel__get_time(evsel, sample->cpu);
2759 
2760 	/*
2761 	 * If a start time is given:
2762 	 * - the sample time is before the window of interest - skip the sample
2763 	 * - tprev is before the window of interest - reset it to the window start
2764 	 */
2765 	if (ptime->start && ptime->start > t)
2766 		goto out;
2767 
2768 	if (tprev && ptime->start > tprev)
2769 		tprev = ptime->start;
2770 
2771 	/*
2772 	 * If an end time is given:
2773 	 * - the previous sched event is past the window - we are done
2774 	 * - the sample time is beyond the window of interest - clamp it to
2775 	 *   the window end so stats are closed out at the boundary
2776 	 * - if tprev is 0, i.e. the sched_in event for the current task was
2777 	 *   not recorded, we cannot tell whether the sched_in falls within
2778 	 *   the window of interest - ignore the sample
2779 	 */
2780 	if (ptime->end) {
2781 		if (!tprev || tprev > ptime->end)
2782 			goto out;
2783 
2784 		if (t > ptime->end)
2785 			t = ptime->end;
2786 	}
2787 
2788 	if (!sched->idle_hist || thread__tid(thread) == 0) {
2789 		if (!cpu_list || test_bit(sample->cpu, cpu_bitmap))
2790 			timehist_update_runtime_stats(tr, t, tprev);
2791 
2792 		if (sched->idle_hist) {
2793 			struct idle_thread_runtime *itr = (void *)tr;
2794 			struct thread_runtime *last_tr;
2795 
2796 			if (itr->last_thread == NULL)
2797 				goto out;
2798 
2799 			/* add current idle time as last thread's runtime */
2800 			last_tr = thread__get_runtime(itr->last_thread);
2801 			if (last_tr == NULL)
2802 				goto out;
2803 
2804 			timehist_update_runtime_stats(last_tr, t, tprev);
2805 			/*
2806 			 * Zero out the delta times of the last thread: they are
2807 			 * not updated here and would otherwise show stale values
2808 			 * next time.  Only the total run time and run stats matter.
2809 			 */
2810 			last_tr->dt_run = 0;
2811 			last_tr->dt_delay = 0;
2812 			last_tr->dt_sleep = 0;
2813 			last_tr->dt_iowait = 0;
2814 			last_tr->dt_preempt = 0;
2815 
2816 			if (itr->cursor.nr)
2817 				callchain_append(&itr->callchain, &itr->cursor, t - tprev);
2818 
2819 			itr->last_thread = NULL;
2820 		}
2821 
2822 		if (!sched->summary_only)
2823 			timehist_print_sample(sched, evsel, sample, &al, thread, t, state);
2824 	}
2825 
2826 out:
2827 	if (sched->hist_time.start == 0 && t >= ptime->start)
2828 		sched->hist_time.start = t;
2829 	if (ptime->end == 0 || t <= ptime->end)
2830 		sched->hist_time.end = t;
2831 
2832 	if (tr) {
2833 		/* time of this sched_switch event becomes last time task seen */
2834 		tr->last_time = sample->time;
2835 
2836 		/* last state is used to determine where to account wait time */
2837 		tr->last_state = state;
2838 
2839 		/* sched out event for task so reset ready to run time */
2840 		if (state == 'R')
2841 			tr->ready_to_run = t;
2842 		else
2843 			tr->ready_to_run = 0;
2844 	}
2845 
2846 	evsel__save_time(evsel, sample->time, sample->cpu);
2847 
2848 	addr_location__exit(&al);
2849 	return rc;
2850 }
2851 
2852 static int timehist_sched_switch_event(const struct perf_tool *tool,
2853 			     union perf_event *event,
2854 			     struct evsel *evsel,
2855 			     struct perf_sample *sample,
2856 			     struct machine *machine __maybe_unused)
2857 {
2858 	return timehist_sched_change_event(tool, event, evsel, sample, machine);
2859 }
2860 
2861 static int process_lost(const struct perf_tool *tool __maybe_unused,
2862 			union perf_event *event,
2863 			struct perf_sample *sample,
2864 			struct machine *machine __maybe_unused)
2865 {
2866 	char tstr[64];
2867 
2868 	timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
2869 	printf("%15s ", tstr);
2870 	printf("lost %" PRI_lu64 " events on cpu %d\n", event->lost.lost, sample->cpu);
2871 
2872 	return 0;
2873 }
2874 
2875 
2876 static void print_thread_runtime(struct thread *t,
2877 				 struct thread_runtime *r)
2878 {
2879 	double mean = avg_stats(&r->run_stats);
2880 	float stddev;
2881 
2882 	printf("%*s   %5d  %9" PRIu64 " ",
2883 	       comm_width, timehist_get_commstr(t), thread__ppid(t),
2884 	       (u64) r->run_stats.n);
2885 
2886 	print_sched_time(r->total_run_time, 8);
2887 	stddev = rel_stddev_stats(stddev_stats(&r->run_stats), mean);
2888 	print_sched_time(r->run_stats.min, 6);
2889 	printf(" ");
2890 	print_sched_time((u64) mean, 6);
2891 	printf(" ");
2892 	print_sched_time(r->run_stats.max, 6);
2893 	printf("  ");
2894 	printf("%5.2f", stddev);
2895 	printf("   %5" PRIu64, r->migrations);
2896 	printf("\n");
2897 }
2898 
2899 static void print_thread_waittime(struct thread *t,
2900 				  struct thread_runtime *r)
2901 {
2902 	printf("%*s   %5d  %9" PRIu64 " ",
2903 	       comm_width, timehist_get_commstr(t), thread__ppid(t),
2904 	       (u64) r->run_stats.n);
2905 
2906 	print_sched_time(r->total_run_time, 8);
2907 	print_sched_time(r->total_sleep_time, 6);
2908 	printf(" ");
2909 	print_sched_time(r->total_iowait_time, 6);
2910 	printf(" ");
2911 	print_sched_time(r->total_preempt_time, 6);
2912 	printf(" ");
2913 	print_sched_time(r->total_delay_time, 6);
2914 	printf("\n");
2915 }
2916 
2917 struct total_run_stats {
2918 	struct perf_sched *sched;
2919 	u64  sched_count;
2920 	u64  task_count;
2921 	u64  total_run_time;
2922 };
2923 
2924 static int show_thread_runtime(struct thread *t, void *priv)
2925 {
2926 	struct total_run_stats *stats = priv;
2927 	struct thread_runtime *r;
2928 
2929 	if (thread__is_filtered(t))
2930 		return 0;
2931 
2932 	r = thread__priv(t);
2933 	if (r && r->run_stats.n) {
2934 		stats->task_count++;
2935 		stats->sched_count += r->run_stats.n;
2936 		stats->total_run_time += r->total_run_time;
2937 
2938 		if (stats->sched->show_state)
2939 			print_thread_waittime(t, r);
2940 		else
2941 			print_thread_runtime(t, r);
2942 	}
2943 
2944 	return 0;
2945 }
2946 
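/*
 * Print a callchain in folded form, walking from the root of the chain
 * down to this node with entries separated by " <- ".
 */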
2947 static size_t callchain__fprintf_folded(FILE *fp, struct callchain_node *node)
2948 {
2949 	const char *sep = " <- ";
2950 	struct callchain_list *chain;
2951 	size_t ret = 0;
2952 	char bf[1024];
2953 	bool first;
2954 
2955 	if (node == NULL)
2956 		return 0;
2957 
2958 	ret = callchain__fprintf_folded(fp, node->parent);
2959 	first = (ret == 0);
2960 
2961 	list_for_each_entry(chain, &node->val, list) {
2962 		if (chain->ip >= PERF_CONTEXT_MAX)
2963 			continue;
2964 		if (chain->ms.sym && chain->ms.sym->ignore)
2965 			continue;
2966 		ret += fprintf(fp, "%s%s", first ? "" : sep,
2967 			       callchain_list__sym_name(chain, bf, sizeof(bf),
2968 							false));
2969 		first = false;
2970 	}
2971 
2972 	return ret;
2973 }
2974 
2975 static size_t timehist_print_idlehist_callchain(struct rb_root_cached *root)
2976 {
2977 	size_t ret = 0;
2978 	FILE *fp = stdout;
2979 	struct callchain_node *chain;
2980 	struct rb_node *rb_node = rb_first_cached(root);
2981 
2982 	printf("  %16s  %8s  %s\n", "Idle time (msec)", "Count", "Callchains");
2983 	printf("  %.16s  %.8s  %.50s\n", graph_dotted_line, graph_dotted_line,
2984 	       graph_dotted_line);
2985 
2986 	while (rb_node) {
2987 		chain = rb_entry(rb_node, struct callchain_node, rb_node);
2988 		rb_node = rb_next(rb_node);
2989 
2990 		ret += fprintf(fp, "  ");
2991 		print_sched_time(chain->hit, 12);
2992 		ret += 16;  /* print_sched_time returns 2nd arg + 4 */
2993 		ret += fprintf(fp, " %8d  ", chain->count);
2994 		ret += callchain__fprintf_folded(fp, chain);
2995 		ret += fprintf(fp, "\n");
2996 	}
2997 
2998 	return ret;
2999 }
3000 
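/*
 * Print the end-of-run summary: per-task runtime (or wait/idle) table,
 * per-cpu idle statistics and the overall task and context-switch
 * totals.
 */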
3001 static void timehist_print_summary(struct perf_sched *sched,
3002 				   struct perf_session *session)
3003 {
3004 	struct machine *m = &session->machines.host;
3005 	struct total_run_stats totals;
3006 	u64 task_count;
3007 	struct thread *t;
3008 	struct thread_runtime *r;
3009 	int i;
3010 	u64 hist_time = sched->hist_time.end - sched->hist_time.start;
3011 
3012 	memset(&totals, 0, sizeof(totals));
3013 	totals.sched = sched;
3014 
3015 	if (sched->idle_hist) {
3016 		printf("\nIdle-time summary\n");
3017 		printf("%*s  parent  sched-out  ", comm_width, "comm");
3018 		printf("  idle-time   min-idle    avg-idle    max-idle  stddev  migrations\n");
3019 	} else if (sched->show_state) {
3020 		printf("\nWait-time summary\n");
3021 		printf("%*s  parent   sched-in  ", comm_width, "comm");
3022 		printf("   run-time      sleep      iowait     preempt       delay\n");
3023 	} else {
3024 		printf("\nRuntime summary\n");
3025 		printf("%*s  parent   sched-in  ", comm_width, "comm");
3026 		printf("   run-time    min-run     avg-run     max-run  stddev  migrations\n");
3027 	}
3028 	printf("%*s            (count)  ", comm_width, "");
3029 	printf("     (msec)     (msec)      (msec)      (msec)       %s\n",
3030 	       sched->show_state ? "(msec)" : "%");
3031 	printf("%.117s\n", graph_dotted_line);
3032 
3033 	machine__for_each_thread(m, show_thread_runtime, &totals);
3034 	task_count = totals.task_count;
3035 	if (!task_count)
3036 		printf("<no still running tasks>\n");
3037 
3038 	/* CPU idle stats not tracked when samples were skipped */
3039 	if (sched->skipped_samples && !sched->idle_hist)
3040 		return;
3041 
3042 	printf("\nIdle stats:\n");
3043 	for (i = 0; i < idle_max_cpu; ++i) {
3044 		if (cpu_list && !test_bit(i, cpu_bitmap))
3045 			continue;
3046 
3047 		t = idle_threads[i];
3048 		if (!t)
3049 			continue;
3050 
3051 		r = thread__priv(t);
3052 		if (r && r->run_stats.n) {
3053 			totals.sched_count += r->run_stats.n;
3054 			printf("    CPU %2d idle for ", i);
3055 			print_sched_time(r->total_run_time, 6);
3056 			printf(" msec  (%6.2f%%)\n", 100.0 * r->total_run_time / hist_time);
3057 		} else
3058 			printf("    CPU %2d idle entire time window\n", i);
3059 	}
3060 
3061 	if (sched->idle_hist && sched->show_callchain) {
3062 		callchain_param.mode  = CHAIN_FOLDED;
3063 		callchain_param.value = CCVAL_PERIOD;
3064 
3065 		callchain_register_param(&callchain_param);
3066 
3067 		printf("\nIdle stats by callchain:\n");
3068 		for (i = 0; i < idle_max_cpu; ++i) {
3069 			struct idle_thread_runtime *itr;
3070 
3071 			t = idle_threads[i];
3072 			if (!t)
3073 				continue;
3074 
3075 			itr = thread__priv(t);
3076 			if (itr == NULL)
3077 				continue;
3078 
3079 			callchain_param.sort(&itr->sorted_root.rb_root, &itr->callchain,
3080 					     0, &callchain_param);
3081 
3082 			printf("  CPU %2d:", i);
3083 			print_sched_time(itr->tr.total_run_time, 6);
3084 			printf(" msec\n");
3085 			timehist_print_idlehist_callchain(&itr->sorted_root);
3086 			printf("\n");
3087 		}
3088 	}
3089 
3090 	printf("\n"
3091 	       "    Total number of unique tasks: %" PRIu64 "\n"
3092 	       "Total number of context switches: %" PRIu64 "\n",
3093 	       totals.task_count, totals.sched_count);
3094 
3095 	printf("           Total run time (msec): ");
3096 	print_sched_time(totals.total_run_time, 2);
3097 	printf("\n");
3098 
3099 	printf("    Total scheduling time (msec): ");
3100 	print_sched_time(hist_time, 2);
3101 	printf(" (x %d)\n", sched->max_cpu.cpu);
3102 }
3103 
3104 typedef int (*sched_handler)(const struct perf_tool *tool,
3105 			  union perf_event *event,
3106 			  struct evsel *evsel,
3107 			  struct perf_sample *sample,
3108 			  struct machine *machine);
3109 
3110 static int perf_timehist__process_sample(const struct perf_tool *tool,
3111 					 union perf_event *event,
3112 					 struct perf_sample *sample,
3113 					 struct evsel *evsel,
3114 					 struct machine *machine)
3115 {
3116 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
3117 	int err = 0;
3118 	struct perf_cpu this_cpu = {
3119 		.cpu = sample->cpu,
3120 	};
3121 
3122 	if (this_cpu.cpu > sched->max_cpu.cpu)
3123 		sched->max_cpu = this_cpu;
3124 
3125 	if (evsel->handler != NULL) {
3126 		sched_handler f = evsel->handler;
3127 
3128 		err = f(tool, event, evsel, sample, machine);
3129 	}
3130 
3131 	return err;
3132 }
3133 
3134 static int timehist_check_attr(struct perf_sched *sched,
3135 			       struct evlist *evlist)
3136 {
3137 	struct evsel *evsel;
3138 	struct evsel_runtime *er;
3139 
3140 	list_for_each_entry(evsel, &evlist->core.entries, core.node) {
3141 		er = evsel__get_runtime(evsel);
3142 		if (er == NULL) {
3143 			pr_err("Failed to allocate memory for evsel runtime data\n");
3144 			return -1;
3145 		}
3146 
3147 		/* only need to save callchain related to sched_switch event */
3148 		if (sched->show_callchain &&
3149 		    evsel__name_is(evsel, "sched:sched_switch") &&
3150 		    !evsel__has_callchain(evsel)) {
3151 			pr_info("Samples of sched_switch event do not have callchains.\n");
3152 			sched->show_callchain = 0;
3153 			symbol_conf.use_callchain = 0;
3154 		}
3155 	}
3156 
3157 	return 0;
3158 }
3159 
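/*
 * Parse the --prio argument into sched->prio_bitmap.  Accepts a comma
 * separated list of priorities and ranges, e.g. "0-10,120" sets bits
 * 0..10 and 120.
 */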
3160 static int timehist_parse_prio_str(struct perf_sched *sched)
3161 {
3162 	char *p;
3163 	unsigned long start_prio, end_prio;
3164 	const char *str = sched->prio_str;
3165 
3166 	if (!str)
3167 		return 0;
3168 
3169 	while (isdigit(*str)) {
3170 		p = NULL;
3171 		start_prio = strtoul(str, &p, 0);
3172 		if (start_prio >= MAX_PRIO || (*p != '\0' && *p != ',' && *p != '-'))
3173 			return -1;
3174 
3175 		if (*p == '-') {
3176 			str = ++p;
3177 			p = NULL;
3178 			end_prio = strtoul(str, &p, 0);
3179 
3180 			if (end_prio >= MAX_PRIO || (*p != '\0' && *p != ','))
3181 				return -1;
3182 
3183 			if (end_prio < start_prio)
3184 				return -1;
3185 		} else {
3186 			end_prio = start_prio;
3187 		}
3188 
3189 		for (; start_prio <= end_prio; start_prio++)
3190 			__set_bit(start_prio, sched->prio_bitmap);
3191 
3192 		if (*p)
3193 			++p;
3194 
3195 		str = p;
3196 	}
3197 
3198 	return 0;
3199 }
3200 
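/*
 * Entry point for "perf sched timehist": set up the tool callbacks and
 * tracepoint handlers, process the recorded events and optionally print
 * the summary.
 */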
3201 static int perf_sched__timehist(struct perf_sched *sched)
3202 {
3203 	struct evsel_str_handler handlers[] = {
3204 		{ "sched:sched_switch",       timehist_sched_switch_event, },
3205 		{ "sched:sched_wakeup",	      timehist_sched_wakeup_event, },
3206 		{ "sched:sched_waking",       timehist_sched_wakeup_event, },
3207 		{ "sched:sched_wakeup_new",   timehist_sched_wakeup_event, },
3208 	};
3209 	const struct evsel_str_handler migrate_handlers[] = {
3210 		{ "sched:sched_migrate_task", timehist_migrate_task_event, },
3211 	};
3212 	struct perf_data data = {
3213 		.path  = input_name,
3214 		.mode  = PERF_DATA_MODE_READ,
3215 		.force = sched->force,
3216 	};
3217 
3218 	struct perf_session *session;
3219 	struct evlist *evlist;
3220 	int err = -1;
3221 
3222 	/*
3223 	 * event handlers for timehist option
3224 	 */
3225 	sched->tool.sample	 = perf_timehist__process_sample;
3226 	sched->tool.mmap	 = perf_event__process_mmap;
3227 	sched->tool.comm	 = perf_event__process_comm;
3228 	sched->tool.exit	 = perf_event__process_exit;
3229 	sched->tool.fork	 = perf_event__process_fork;
3230 	sched->tool.lost	 = process_lost;
3231 	sched->tool.attr	 = perf_event__process_attr;
3232 	sched->tool.tracing_data = perf_event__process_tracing_data;
3233 	sched->tool.build_id	 = perf_event__process_build_id;
3234 
3235 	sched->tool.ordering_requires_timestamps = true;
3236 
3237 	symbol_conf.use_callchain = sched->show_callchain;
3238 
3239 	session = perf_session__new(&data, &sched->tool);
3240 	if (IS_ERR(session))
3241 		return PTR_ERR(session);
3242 
3243 	if (cpu_list) {
3244 		err = perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap);
3245 		if (err < 0)
3246 			goto out;
3247 	}
3248 
3249 	evlist = session->evlist;
3250 
3251 	symbol__init(&session->header.env);
3252 
3253 	if (perf_time__parse_str(&sched->ptime, sched->time_str) != 0) {
3254 		pr_err("Invalid time string\n");
3255 		err = -EINVAL;
3256 		goto out;
3257 	}
3258 
3259 	if (timehist_check_attr(sched, evlist) != 0)
3260 		goto out;
3261 
3262 	if (timehist_parse_prio_str(sched) != 0) {
3263 		pr_err("Invalid prio string\n");
3264 		goto out;
3265 	}
3266 
3267 	setup_pager();
3268 
3269 	/* prefer sched_waking if it is captured */
3270 	if (evlist__find_tracepoint_by_name(session->evlist, "sched:sched_waking"))
3271 		handlers[1].handler = timehist_sched_wakeup_ignore;
3272 
3273 	/* setup per-evsel handlers */
3274 	if (perf_session__set_tracepoints_handlers(session, handlers))
3275 		goto out;
3276 
3277 	/* sched_switch event at a minimum needs to exist */
3278 	if (!evlist__find_tracepoint_by_name(session->evlist, "sched:sched_switch")) {
3279 		pr_err("No sched_switch events found. Have you run 'perf sched record'?\n");
3280 		goto out;
3281 	}
3282 
3283 	if (sched->show_migrations &&
3284 	    perf_session__set_tracepoints_handlers(session, migrate_handlers))
3285 		goto out;
3286 
3287 	/* pre-allocate struct for per-CPU idle stats */
3288 	sched->max_cpu.cpu = session->header.env.nr_cpus_online;
3289 	if (sched->max_cpu.cpu == 0)
3290 		sched->max_cpu.cpu = 4;
3291 	if (init_idle_threads(sched->max_cpu.cpu))
3292 		goto out;
3293 
3294 	/* summary_only implies summary option, but don't overwrite summary if set */
3295 	if (sched->summary_only)
3296 		sched->summary = sched->summary_only;
3297 
3298 	if (!sched->summary_only)
3299 		timehist_header(sched);
3300 
3301 	err = perf_session__process_events(session);
3302 	if (err) {
3303 		pr_err("Failed to process events, error %d", err);
3304 		goto out;
3305 	}
3306 
3307 	sched->nr_events      = evlist->stats.nr_events[0];
3308 	sched->nr_lost_events = evlist->stats.total_lost;
3309 	sched->nr_lost_chunks = evlist->stats.nr_events[PERF_RECORD_LOST];
3310 
3311 	if (sched->summary)
3312 		timehist_print_summary(sched, session);
3313 
3314 out:
3315 	free_idle_threads();
3316 	perf_session__delete(session);
3317 
3318 	return err;
3319 }
3320 
3321 
3322 static void print_bad_events(struct perf_sched *sched)
3323 {
3324 	if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
3325 		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
3326 			(double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
3327 			sched->nr_unordered_timestamps, sched->nr_timestamps);
3328 	}
3329 	if (sched->nr_lost_events && sched->nr_events) {
3330 		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
3331 			(double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
3332 			sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
3333 	}
3334 	if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
3335 		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
3336 			(double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
3337 			sched->nr_context_switch_bugs, sched->nr_timestamps);
3338 		if (sched->nr_lost_events)
3339 			printf(" (due to lost events?)");
3340 		printf("\n");
3341 	}
3342 }
3343 
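/*
 * Insert @data into the rbtree keyed by comm, merging its latency stats
 * into an existing node if a work_atoms entry with the same comm already
 * exists.
 */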
3344 static void __merge_work_atoms(struct rb_root_cached *root, struct work_atoms *data)
3345 {
3346 	struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL;
3347 	struct work_atoms *this;
3348 	const char *comm = thread__comm_str(data->thread), *this_comm;
3349 	bool leftmost = true;
3350 
3351 	while (*new) {
3352 		int cmp;
3353 
3354 		this = container_of(*new, struct work_atoms, node);
3355 		parent = *new;
3356 
3357 		this_comm = thread__comm_str(this->thread);
3358 		cmp = strcmp(comm, this_comm);
3359 		if (cmp > 0) {
3360 			new = &((*new)->rb_left);
3361 		} else if (cmp < 0) {
3362 			new = &((*new)->rb_right);
3363 			leftmost = false;
3364 		} else {
3365 			this->num_merged++;
3366 			this->total_runtime += data->total_runtime;
3367 			this->nb_atoms += data->nb_atoms;
3368 			this->total_lat += data->total_lat;
3369 			list_splice(&data->work_list, &this->work_list);
3370 			if (this->max_lat < data->max_lat) {
3371 				this->max_lat = data->max_lat;
3372 				this->max_lat_start = data->max_lat_start;
3373 				this->max_lat_end = data->max_lat_end;
3374 			}
3375 			zfree(&data);
3376 			return;
3377 		}
3378 	}
3379 
3380 	data->num_merged++;
3381 	rb_link_node(&data->node, parent, new);
3382 	rb_insert_color_cached(&data->node, root, leftmost);
3383 }
3384 
3385 static void perf_sched__merge_lat(struct perf_sched *sched)
3386 {
3387 	struct work_atoms *data;
3388 	struct rb_node *node;
3389 
3390 	if (sched->skip_merge)
3391 		return;
3392 
3393 	while ((node = rb_first_cached(&sched->atom_root))) {
3394 		rb_erase_cached(node, &sched->atom_root);
3395 		data = rb_entry(node, struct work_atoms, node);
3396 		__merge_work_atoms(&sched->merged_atom_root, data);
3397 	}
3398 }
3399 
3400 static int setup_cpus_switch_event(struct perf_sched *sched)
3401 {
3402 	unsigned int i;
3403 
3404 	sched->cpu_last_switched = calloc(MAX_CPUS, sizeof(*(sched->cpu_last_switched)));
3405 	if (!sched->cpu_last_switched)
3406 		return -1;
3407 
3408 	sched->curr_pid = malloc(MAX_CPUS * sizeof(*(sched->curr_pid)));
3409 	if (!sched->curr_pid) {
3410 		zfree(&sched->cpu_last_switched);
3411 		return -1;
3412 	}
3413 
3414 	for (i = 0; i < MAX_CPUS; i++)
3415 		sched->curr_pid[i] = -1;
3416 
3417 	return 0;
3418 }
3419 
3420 static void free_cpus_switch_event(struct perf_sched *sched)
3421 {
3422 	zfree(&sched->curr_pid);
3423 	zfree(&sched->cpu_last_switched);
3424 }
3425 
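/*
 * "perf sched latency": read the recorded events, merge and sort the
 * per-task latency data and print the latency table plus any anomalies.
 */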
3426 static int perf_sched__lat(struct perf_sched *sched)
3427 {
3428 	int rc = -1;
3429 	struct rb_node *next;
3430 
3431 	setup_pager();
3432 
3433 	if (setup_cpus_switch_event(sched))
3434 		return rc;
3435 
3436 	if (perf_sched__read_events(sched))
3437 		goto out_free_cpus_switch_event;
3438 
3439 	perf_sched__merge_lat(sched);
3440 	perf_sched__sort_lat(sched);
3441 
3442 	printf("\n -------------------------------------------------------------------------------------------------------------------------------------------\n");
3443 	printf("  Task                  |   Runtime ms  |  Count   | Avg delay ms    | Max delay ms    | Max delay start           | Max delay end          |\n");
3444 	printf(" -------------------------------------------------------------------------------------------------------------------------------------------\n");
3445 
3446 	next = rb_first_cached(&sched->sorted_atom_root);
3447 
3448 	while (next) {
3449 		struct work_atoms *work_list;
3450 
3451 		work_list = rb_entry(next, struct work_atoms, node);
3452 		output_lat_thread(sched, work_list);
3453 		next = rb_next(next);
3454 		thread__zput(work_list->thread);
3455 	}
3456 
3457 	printf(" -----------------------------------------------------------------------------------------------------------------\n");
3458 	printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
3459 		(double)sched->all_runtime / NSEC_PER_MSEC, sched->all_count);
3460 
3461 	printf(" ---------------------------------------------------\n");
3462 
3463 	print_bad_events(sched);
3464 	printf("\n");
3465 
3466 	rc = 0;
3467 
3468 out_free_cpus_switch_event:
3469 	free_cpus_switch_event(sched);
3470 	return rc;
3471 }
3472 
3473 static int setup_map_cpus(struct perf_sched *sched)
3474 {
3475 	sched->max_cpu.cpu  = sysconf(_SC_NPROCESSORS_CONF);
3476 
3477 	if (sched->map.comp) {
3478 		sched->map.comp_cpus = zalloc(sched->max_cpu.cpu * sizeof(int));
3479 		if (!sched->map.comp_cpus)
3480 			return -1;
3481 	}
3482 
3483 	if (sched->map.cpus_str) {
3484 		sched->map.cpus = perf_cpu_map__new(sched->map.cpus_str);
3485 		if (!sched->map.cpus) {
3486 			pr_err("failed to get cpus map from %s\n", sched->map.cpus_str);
3487 			zfree(&sched->map.comp_cpus);
3488 			return -1;
3489 		}
3490 	}
3491 
3492 	return 0;
3493 }
3494 
3495 static int setup_color_pids(struct perf_sched *sched)
3496 {
3497 	struct perf_thread_map *map;
3498 
3499 	if (!sched->map.color_pids_str)
3500 		return 0;
3501 
3502 	map = thread_map__new_by_tid_str(sched->map.color_pids_str);
3503 	if (!map) {
3504 		pr_err("failed to get thread map from %s\n", sched->map.color_pids_str);
3505 		return -1;
3506 	}
3507 
3508 	sched->map.color_pids = map;
3509 	return 0;
3510 }
3511 
3512 static int setup_color_cpus(struct perf_sched *sched)
3513 {
3514 	struct perf_cpu_map *map;
3515 
3516 	if (!sched->map.color_cpus_str)
3517 		return 0;
3518 
3519 	map = perf_cpu_map__new(sched->map.color_cpus_str);
3520 	if (!map) {
3521 		pr_err("failed to get cpus map from %s\n", sched->map.color_cpus_str);
3522 		return -1;
3523 	}
3524 
3525 	sched->map.color_cpus = map;
3526 	return 0;
3527 }
3528 
3529 static int perf_sched__map(struct perf_sched *sched)
3530 {
3531 	int rc = -1;
3532 
3533 	sched->curr_thread = calloc(MAX_CPUS, sizeof(*(sched->curr_thread)));
3534 	if (!sched->curr_thread)
3535 		return rc;
3536 
3537 	sched->curr_out_thread = calloc(MAX_CPUS, sizeof(*(sched->curr_out_thread)));
3538 	if (!sched->curr_out_thread)
3539 		return rc;
3540 
3541 	if (setup_cpus_switch_event(sched))
3542 		goto out_free_curr_thread;
3543 
3544 	if (setup_map_cpus(sched))
3545 		goto out_free_cpus_switch_event;
3546 
3547 	if (setup_color_pids(sched))
3548 		goto out_put_map_cpus;
3549 
3550 	if (setup_color_cpus(sched))
3551 		goto out_put_color_pids;
3552 
3553 	setup_pager();
3554 	if (perf_sched__read_events(sched))
3555 		goto out_put_color_cpus;
3556 
3557 	rc = 0;
3558 	print_bad_events(sched);
3559 
3560 out_put_color_cpus:
3561 	perf_cpu_map__put(sched->map.color_cpus);
3562 
3563 out_put_color_pids:
3564 	perf_thread_map__put(sched->map.color_pids);
3565 
3566 out_put_map_cpus:
3567 	zfree(&sched->map.comp_cpus);
3568 	perf_cpu_map__put(sched->map.cpus);
3569 
3570 out_free_cpus_switch_event:
3571 	free_cpus_switch_event(sched);
3572 
3573 out_free_curr_thread:
3574 	zfree(&sched->curr_thread);
3575 	return rc;
3576 }
3577 
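/*
 * "perf sched replay": rebuild the recorded tasks and their sched atoms,
 * then spawn threads that re-execute the recorded run/sleep/wakeup
 * pattern for --repeat iterations (0 means repeat indefinitely).
 */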
3578 static int perf_sched__replay(struct perf_sched *sched)
3579 {
3580 	int ret;
3581 	unsigned long i;
3582 
3583 	mutex_init(&sched->start_work_mutex);
3584 	mutex_init(&sched->work_done_wait_mutex);
3585 
3586 	ret = setup_cpus_switch_event(sched);
3587 	if (ret)
3588 		goto out_mutex_destroy;
3589 
3590 	calibrate_run_measurement_overhead(sched);
3591 	calibrate_sleep_measurement_overhead(sched);
3592 
3593 	test_calibrations(sched);
3594 
3595 	ret = perf_sched__read_events(sched);
3596 	if (ret)
3597 		goto out_free_cpus_switch_event;
3598 
3599 	printf("nr_run_events:        %ld\n", sched->nr_run_events);
3600 	printf("nr_sleep_events:      %ld\n", sched->nr_sleep_events);
3601 	printf("nr_wakeup_events:     %ld\n", sched->nr_wakeup_events);
3602 
3603 	if (sched->targetless_wakeups)
3604 		printf("target-less wakeups:  %ld\n", sched->targetless_wakeups);
3605 	if (sched->multitarget_wakeups)
3606 		printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
3607 	if (sched->nr_run_events_optimized)
3608 		printf("run atoms optimized: %ld\n",
3609 			sched->nr_run_events_optimized);
3610 
3611 	print_task_traces(sched);
3612 	add_cross_task_wakeups(sched);
3613 
3614 	sched->thread_funcs_exit = false;
3615 	create_tasks(sched);
3616 	printf("------------------------------------------------------------\n");
3617 	if (sched->replay_repeat == 0)
3618 		sched->replay_repeat = UINT_MAX;
3619 
3620 	for (i = 0; i < sched->replay_repeat; i++)
3621 		run_one_test(sched);
3622 
3623 	sched->thread_funcs_exit = true;
3624 	destroy_tasks(sched);
3625 
3626 out_free_cpus_switch_event:
3627 	free_cpus_switch_event(sched);
3628 
3629 out_mutex_destroy:
3630 	mutex_destroy(&sched->start_work_mutex);
3631 	mutex_destroy(&sched->work_done_wait_mutex);
3632 	return ret;
3633 }
3634 
3635 static void setup_sorting(struct perf_sched *sched, const struct option *options,
3636 			  const char * const usage_msg[])
3637 {
3638 	char *tmp, *tok, *str = strdup(sched->sort_order);
3639 
3640 	for (tok = strtok_r(str, ", ", &tmp);
3641 			tok; tok = strtok_r(NULL, ", ", &tmp)) {
3642 		if (sort_dimension__add(tok, &sched->sort_list) < 0) {
3643 			usage_with_options_msg(usage_msg, options,
3644 					"Unknown --sort key: `%s'", tok);
3645 		}
3646 	}
3647 
3648 	free(str);
3649 
3650 	sort_dimension__add("pid", &sched->cmp_pid);
3651 }
3652 
3653 static bool schedstat_events_exposed(void)
3654 {
3655 	/*
3656 	 * Select "sched:sched_stat_wait" event to check
3657 	 * whether schedstat tracepoints are exposed.
3658 	 */
3659 	return IS_ERR(trace_event__tp_format("sched", "sched_stat_wait")) ?
3660 		false : true;
3661 }
3662 
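/*
 * Build the argument vector for "perf record" with the sched tracepoints
 * (preferring sched_waking over sched_wakeup when available, and adding
 * the schedstat events only if they are exposed), then run it.
 */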
3663 static int __cmd_record(int argc, const char **argv)
3664 {
3665 	unsigned int rec_argc, i, j;
3666 	char **rec_argv;
3667 	const char **rec_argv_copy;
3668 	const char * const record_args[] = {
3669 		"record",
3670 		"-a",
3671 		"-R",
3672 		"-m", "1024",
3673 		"-c", "1",
3674 		"-e", "sched:sched_switch",
3675 		"-e", "sched:sched_stat_runtime",
3676 		"-e", "sched:sched_process_fork",
3677 		"-e", "sched:sched_wakeup_new",
3678 		"-e", "sched:sched_migrate_task",
3679 	};
3680 
3681 	/*
3682 	 * The tracepoints trace_sched_stat_{wait, sleep, iowait}
3683 	 * are not exposed to userspace if CONFIG_SCHEDSTATS is not set.
3684 	 * To keep "perf sched record" from failing in that case, decide
3685 	 * whether to record the schedstat events based on their availability.
3686 	 */
3687 	const char * const schedstat_args[] = {
3688 		"-e", "sched:sched_stat_wait",
3689 		"-e", "sched:sched_stat_sleep",
3690 		"-e", "sched:sched_stat_iowait",
3691 	};
3692 	unsigned int schedstat_argc = schedstat_events_exposed() ?
3693 		ARRAY_SIZE(schedstat_args) : 0;
3694 
3695 	struct tep_event *waking_event;
3696 	int ret;
3697 
3698 	/*
3699 	 * +2 for either "-e", "sched:sched_wakeup" or
3700 	 * "-e", "sched:sched_waking"
3701 	 */
3702 	rec_argc = ARRAY_SIZE(record_args) + 2 + schedstat_argc + argc - 1;
3703 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
3704 	if (rec_argv == NULL)
3705 		return -ENOMEM;
3706 	rec_argv_copy = calloc(rec_argc + 1, sizeof(char *));
3707 	if (rec_argv_copy == NULL) {
3708 		free(rec_argv);
3709 		return -ENOMEM;
3710 	}
3711 
3712 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
3713 		rec_argv[i] = strdup(record_args[i]);
3714 
3715 	rec_argv[i++] = strdup("-e");
3716 	waking_event = trace_event__tp_format("sched", "sched_waking");
3717 	if (!IS_ERR(waking_event))
3718 		rec_argv[i++] = strdup("sched:sched_waking");
3719 	else
3720 		rec_argv[i++] = strdup("sched:sched_wakeup");
3721 
3722 	for (j = 0; j < schedstat_argc; j++)
3723 		rec_argv[i++] = strdup(schedstat_args[j]);
3724 
3725 	for (j = 1; j < (unsigned int)argc; j++, i++)
3726 		rec_argv[i] = strdup(argv[j]);
3727 
3728 	BUG_ON(i != rec_argc);
3729 
3730 	memcpy(rec_argv_copy, rec_argv, sizeof(char *) * rec_argc);
3731 	ret = cmd_record(rec_argc, rec_argv_copy);
3732 
3733 	for (i = 0; i < rec_argc; i++)
3734 		free(rec_argv[i]);
3735 	free(rec_argv);
3736 	free(rec_argv_copy);
3737 
3738 	return ret;
3739 }
3740 
3741 int cmd_sched(int argc, const char **argv)
3742 {
3743 	static const char default_sort_order[] = "avg, max, switch, runtime";
3744 	struct perf_sched sched = {
3745 		.cmp_pid	      = LIST_HEAD_INIT(sched.cmp_pid),
3746 		.sort_list	      = LIST_HEAD_INIT(sched.sort_list),
3747 		.sort_order	      = default_sort_order,
3748 		.replay_repeat	      = 10,
3749 		.profile_cpu	      = -1,
3750 		.next_shortname1      = 'A',
3751 		.next_shortname2      = '0',
3752 		.skip_merge           = 0,
3753 		.show_callchain	      = 1,
3754 		.max_stack            = 5,
3755 	};
3756 	const struct option sched_options[] = {
3757 	OPT_STRING('i', "input", &input_name, "file",
3758 		    "input file name"),
3759 	OPT_INCR('v', "verbose", &verbose,
3760 		    "be more verbose (show symbol address, etc)"),
3761 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
3762 		    "dump raw trace in ASCII"),
3763 	OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"),
3764 	OPT_END()
3765 	};
3766 	const struct option latency_options[] = {
3767 	OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
3768 		   "sort by key(s): runtime, switch, avg, max"),
3769 	OPT_INTEGER('C', "CPU", &sched.profile_cpu,
3770 		    "CPU to profile on"),
3771 	OPT_BOOLEAN('p', "pids", &sched.skip_merge,
3772 		    "latency stats per pid instead of per comm"),
3773 	OPT_PARENT(sched_options)
3774 	};
3775 	const struct option replay_options[] = {
3776 	OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
3777 		     "repeat the workload replay N times (0: infinite)"),
3778 	OPT_PARENT(sched_options)
3779 	};
3780 	const struct option map_options[] = {
3781 	OPT_BOOLEAN(0, "compact", &sched.map.comp,
3782 		    "map output in compact mode"),
3783 	OPT_STRING(0, "color-pids", &sched.map.color_pids_str, "pids",
3784 		   "highlight given pids in map"),
3785 	OPT_STRING(0, "color-cpus", &sched.map.color_cpus_str, "cpus",
3786                     "highlight given CPUs in map"),
3787 	OPT_STRING(0, "cpus", &sched.map.cpus_str, "cpus",
3788                     "display given CPUs in map"),
3789 	OPT_STRING(0, "task-name", &sched.map.task_name, "task",
3790 		"map output only for the given task name(s)."),
3791 	OPT_BOOLEAN(0, "fuzzy-name", &sched.map.fuzzy,
3792 		"given command name can be partially matched (fuzzy matching)"),
3793 	OPT_PARENT(sched_options)
3794 	};
3795 	const struct option timehist_options[] = {
3796 	OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
3797 		   "file", "vmlinux pathname"),
3798 	OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name,
3799 		   "file", "kallsyms pathname"),
3800 	OPT_BOOLEAN('g', "call-graph", &sched.show_callchain,
3801 		    "Display call chains if present (default on)"),
3802 	OPT_UINTEGER(0, "max-stack", &sched.max_stack,
3803 		   "Maximum number of functions to display in a backtrace."),
3804 	OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
3805 		    "Look for files with symbols relative to this directory"),
3806 	OPT_BOOLEAN('s', "summary", &sched.summary_only,
3807 		    "Show only a summary of scheduling with statistics"),
3808 	OPT_BOOLEAN('S', "with-summary", &sched.summary,
3809 		    "Show all scheduling events and a summary with statistics"),
3810 	OPT_BOOLEAN('w', "wakeups", &sched.show_wakeups, "Show wakeup events"),
3811 	OPT_BOOLEAN('n', "next", &sched.show_next, "Show next task"),
3812 	OPT_BOOLEAN('M', "migrations", &sched.show_migrations, "Show migration events"),
3813 	OPT_BOOLEAN('V', "cpu-visual", &sched.show_cpu_visual, "Add CPU visual"),
3814 	OPT_BOOLEAN('I', "idle-hist", &sched.idle_hist, "Show idle events only"),
3815 	OPT_STRING(0, "time", &sched.time_str, "str",
3816 		   "Time span for analysis (start,stop)"),
3817 	OPT_BOOLEAN(0, "state", &sched.show_state, "Show task state when sched-out"),
3818 	OPT_STRING('p', "pid", &symbol_conf.pid_list_str, "pid[,pid...]",
3819 		   "analyze events only for given process id(s)"),
3820 	OPT_STRING('t', "tid", &symbol_conf.tid_list_str, "tid[,tid...]",
3821 		   "analyze events only for given thread id(s)"),
3822 	OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"),
3823 	OPT_BOOLEAN(0, "show-prio", &sched.show_prio, "Show task priority"),
3824 	OPT_STRING(0, "prio", &sched.prio_str, "prio",
3825 		   "analyze events only for given task priority(ies)"),
3826 	OPT_PARENT(sched_options)
3827 	};
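	/*
	 * Illustrative timehist invocations using the options above (note
	 * that -s is mutually exclusive with -w/-n, see the check below):
	 *
	 *	perf sched timehist
	 *	perf sched timehist -s
	 *	perf sched timehist -w -n --max-stack 10
	 */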
3828 
3829 	const char * const latency_usage[] = {
3830 		"perf sched latency [<options>]",
3831 		NULL
3832 	};
3833 	const char * const replay_usage[] = {
3834 		"perf sched replay [<options>]",
3835 		NULL
3836 	};
3837 	const char * const map_usage[] = {
3838 		"perf sched map [<options>]",
3839 		NULL
3840 	};
3841 	const char * const timehist_usage[] = {
3842 		"perf sched timehist [<options>]",
3843 		NULL
3844 	};
3845 	const char *const sched_subcommands[] = { "record", "latency", "map",
3846 						  "replay", "script",
3847 						  "timehist", NULL };
3848 	const char *sched_usage[] = {
3849 		NULL,
3850 		NULL
3851 	};
3852 	struct trace_sched_handler lat_ops  = {
3853 		.wakeup_event	    = latency_wakeup_event,
3854 		.switch_event	    = latency_switch_event,
3855 		.runtime_event	    = latency_runtime_event,
3856 		.migrate_task_event = latency_migrate_task_event,
3857 	};
3858 	struct trace_sched_handler map_ops  = {
3859 		.switch_event	    = map_switch_event,
3860 	};
3861 	struct trace_sched_handler replay_ops  = {
3862 		.wakeup_event	    = replay_wakeup_event,
3863 		.switch_event	    = replay_switch_event,
3864 		.fork_event	    = replay_fork_event,
3865 	};
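	/*
	 * Each sub-command below installs one of the handler tables above as
	 * sched.tp_handler; the shared tracepoint sample callback then routes
	 * sched_switch/sched_wakeup/etc. events to the matching handler.
	 */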
3866 	int ret;
3867 
3868 	perf_tool__init(&sched.tool, /*ordered_events=*/true);
3869 	sched.tool.sample	 = perf_sched__process_tracepoint_sample;
3870 	sched.tool.comm		 = perf_sched__process_comm;
3871 	sched.tool.namespaces	 = perf_event__process_namespaces;
3872 	sched.tool.lost		 = perf_event__process_lost;
3873 	sched.tool.fork		 = perf_sched__process_fork_event;
3874 
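	/*
	 * PARSE_OPT_STOP_AT_NON_OPTION leaves the sub-command name and its
	 * arguments in argv so the per-command option tables below can parse
	 * them.
	 */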
3875 	argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands,
3876 					sched_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3877 	if (!argc)
3878 		usage_with_options(sched_usage, sched_options);
3879 
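	/*
	 * "record", "latency" and "replay" may be abbreviated to any prefix
	 * of at least three characters, e.g. "perf sched lat" or
	 * "perf sched rec".
	 */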
3880 	/*
3881 	 * Aliased to 'perf script' for now:
3882 	 */
3883 	if (!strcmp(argv[0], "script")) {
3884 		return cmd_script(argc, argv);
3885 	} else if (strlen(argv[0]) > 2 && strstarts("record", argv[0])) {
3886 		return __cmd_record(argc, argv);
3887 	} else if (strlen(argv[0]) > 2 && strstarts("latency", argv[0])) {
3888 		sched.tp_handler = &lat_ops;
3889 		if (argc > 1) {
3890 			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
3891 			if (argc)
3892 				usage_with_options(latency_usage, latency_options);
3893 		}
3894 		setup_sorting(&sched, latency_options, latency_usage);
3895 		return perf_sched__lat(&sched);
3896 	} else if (!strcmp(argv[0], "map")) {
3897 		if (argc) {
3898 			argc = parse_options(argc, argv, map_options, map_usage, 0);
3899 			if (argc)
3900 				usage_with_options(map_usage, map_options);
3901 
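			/*
			 * --task-name takes a comma-separated list; it is
			 * parsed into a strlist and matched (exactly, or
			 * partially with --fuzzy-name) when drawing the map.
			 */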
3902 			if (sched.map.task_name) {
3903 				sched.map.task_names = strlist__new(sched.map.task_name, NULL);
3904 				if (sched.map.task_names == NULL) {
3905 					fprintf(stderr, "Failed to parse task names\n");
3906 					return -1;
3907 				}
3908 			}
3909 		}
3910 		sched.tp_handler = &map_ops;
3911 		setup_sorting(&sched, latency_options, latency_usage);
3912 		return perf_sched__map(&sched);
3913 	} else if (strlen(argv[0]) > 2 && strstarts("replay", argv[0])) {
3914 		sched.tp_handler = &replay_ops;
3915 		if (argc) {
3916 			argc = parse_options(argc, argv, replay_options, replay_usage, 0);
3917 			if (argc)
3918 				usage_with_options(replay_usage, replay_options);
3919 		}
3920 		return perf_sched__replay(&sched);
3921 	} else if (!strcmp(argv[0], "timehist")) {
3922 		if (argc) {
3923 			argc = parse_options(argc, argv, timehist_options,
3924 					     timehist_usage, 0);
3925 			if (argc)
3926 				usage_with_options(timehist_usage, timehist_options);
3927 		}
3928 		if ((sched.show_wakeups || sched.show_next) &&
3929 		    sched.summary_only) {
3930 			pr_err(" Error: -s and -[n|w] are mutually exclusive.\n");
3931 			parse_options_usage(timehist_usage, timehist_options, "s", true);
3932 			if (sched.show_wakeups)
3933 				parse_options_usage(NULL, timehist_options, "w", true);
3934 			if (sched.show_next)
3935 				parse_options_usage(NULL, timehist_options, "n", true);
3936 			return -EINVAL;
3937 		}
3938 		ret = symbol__validate_sym_arguments();
3939 		if (ret)
3940 			return ret;
3941 
3942 		return perf_sched__timehist(&sched);
3943 	} else {
3944 		usage_with_options(sched_usage, sched_options);
3945 	}
3946 
3947 	/* free usage string allocated by parse_options_subcommand */
3948 	free((void *)sched_usage[0]);
3949 
3950 	return 0;
3951 }
3952