/* xref: /linux/tools/perf/builtin-sched.c (revision 26b0d14106954ae46d2f4f7eec3481828a210f7d) */
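/*
 * builtin-sched.c - 'perf sched' tool: analysis and replay of scheduler
 * behavior recorded via the sched:* tracepoints.
 *
 * Subcommands (see sched_usage below):
 *   perf sched record   - record sched:* tracepoints system-wide
 *   perf sched latency  - per-task wakeup/scheduling-latency report
 *   perf sched map      - CPU-by-CPU map of context switches over time
 *   perf sched replay   - re-execute the recorded workload with threads
 *   perf sched script   - alias for 'perf script'
 */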
#include "builtin.h"
#include "perf.h"

#include "util/util.h"
#include "util/evlist.h"
#include "util/cache.h"
#include "util/evsel.h"
#include "util/symbol.h"
#include "util/thread.h"
#include "util/header.h"
#include "util/session.h"
#include "util/tool.h"

#include "util/parse-options.h"
#include "util/trace-event.h"

#include "util/debug.h"

#include <sys/prctl.h>
#include <sys/resource.h>

#include <semaphore.h>
#include <pthread.h>
#include <math.h>

static const char		*input_name;

static char			default_sort_order[] = "avg, max, switch, runtime";
static const char		*sort_order = default_sort_order;

static int			profile_cpu = -1;

#define PR_SET_NAME		15               /* Set process name */
#define MAX_CPUS		4096

static u64			run_measurement_overhead;
static u64			sleep_measurement_overhead;

#define COMM_LEN		20
#define SYM_LEN			129

#define MAX_PID			65536

static unsigned long		nr_tasks;

struct sched_atom;

struct task_desc {
	unsigned long		nr;
	unsigned long		pid;
	char			comm[COMM_LEN];

	unsigned long		nr_events;
	unsigned long		curr_event;
	struct sched_atom	**atoms;

	pthread_t		thread;
	sem_t			sleep_sem;

	sem_t			ready_for_work;
	sem_t			work_done_sem;

	u64			cpu_usage;
};

enum sched_event_type {
	SCHED_EVENT_RUN,
	SCHED_EVENT_SLEEP,
	SCHED_EVENT_WAKEUP,
	SCHED_EVENT_MIGRATION,
};

struct sched_atom {
	enum sched_event_type	type;
	int			specific_wait;
	u64			timestamp;
	u64			duration;
	unsigned long		nr;
	sem_t			*wait_sem;
	struct task_desc	*wakee;
};

static struct task_desc		*pid_to_task[MAX_PID];

static struct task_desc		**tasks;

static pthread_mutex_t		start_work_mutex = PTHREAD_MUTEX_INITIALIZER;
static u64			start_time;

static pthread_mutex_t		work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER;

static unsigned long		nr_run_events;
static unsigned long		nr_sleep_events;
static unsigned long		nr_wakeup_events;

static unsigned long		nr_sleep_corrections;
static unsigned long		nr_run_events_optimized;

static unsigned long		targetless_wakeups;
static unsigned long		multitarget_wakeups;

static u64			cpu_usage;
static u64			runavg_cpu_usage;
static u64			parent_cpu_usage;
static u64			runavg_parent_cpu_usage;

static unsigned long		nr_runs;
static u64			sum_runtime;
static u64			sum_fluct;
static u64			run_avg;

static unsigned int		replay_repeat = 10;
static unsigned long		nr_timestamps;
static unsigned long		nr_unordered_timestamps;
static unsigned long		nr_state_machine_bugs;
static unsigned long		nr_context_switch_bugs;
static unsigned long		nr_events;
static unsigned long		nr_lost_chunks;
static unsigned long		nr_lost_events;

#define TASK_STATE_TO_CHAR_STR "RSDTtZX"

enum thread_state {
	THREAD_SLEEPING = 0,
	THREAD_WAIT_CPU,
	THREAD_SCHED_IN,
	THREAD_IGNORE
};

struct work_atom {
	struct list_head	list;
	enum thread_state	state;
	u64			sched_out_time;
	u64			wake_up_time;
	u64			sched_in_time;
	u64			runtime;
};

struct work_atoms {
	struct list_head	work_list;
	struct thread		*thread;
	struct rb_node		node;
	u64			max_lat;
	u64			max_lat_at;
	u64			total_lat;
	u64			nb_atoms;
	u64			total_runtime;
};

typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);

static struct rb_root		atom_root, sorted_atom_root;

static u64			all_runtime;
static u64			all_count;

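/*
 * Timing helpers for 'perf sched replay': get_nsecs() reads the
 * monotonic clock, burn_nsecs() simulates runtime by spinning (minus
 * the calibrated loop overhead) and sleep_nsecs() simulates sleep
 * time via nanosleep().
 */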
static u64 get_nsecs(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);

	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

static void burn_nsecs(u64 nsecs)
{
	u64 T0 = get_nsecs(), T1;

	do {
		T1 = get_nsecs();
	} while (T1 + run_measurement_overhead < T0 + nsecs);
}

static void sleep_nsecs(u64 nsecs)
{
	struct timespec ts;

	/* tv_nsec must stay below one full second (1e9 nsecs): */
	ts.tv_nsec = nsecs % 1000000000ULL;
	ts.tv_sec  = nsecs / 1000000000ULL;

	nanosleep(&ts, NULL);
}

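/*
 * Both the spin loop and nanosleep() carry a fixed measurement cost;
 * estimate each as the minimum over ten trials. burn_nsecs() uses the
 * run overhead to shorten its spin target accordingly.
 */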
static void calibrate_run_measurement_overhead(void)
{
	u64 T0, T1, delta, min_delta = 1000000000ULL;
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
		burn_nsecs(0);
		T1 = get_nsecs();
		delta = T1 - T0;
		min_delta = min(min_delta, delta);
	}
	run_measurement_overhead = min_delta;

	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}

static void calibrate_sleep_measurement_overhead(void)
{
	u64 T0, T1, delta, min_delta = 1000000000ULL;
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
		sleep_nsecs(10000);
		T1 = get_nsecs();
		delta = T1 - T0;
		min_delta = min(min_delta, delta);
	}
	min_delta -= 10000;
	sleep_measurement_overhead = min_delta;

	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}

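/*
 * Trace parsing below builds, for every recorded task, a linear
 * sequence of sched_atom events (run/sleep/wakeup) that the replay
 * threads later execute verbatim.
 */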
static struct sched_atom *
get_new_event(struct task_desc *task, u64 timestamp)
{
	struct sched_atom *event = zalloc(sizeof(*event));
	unsigned long idx = task->nr_events;
	size_t size;

	event->timestamp = timestamp;
	event->nr = idx;

	task->nr_events++;
	size = sizeof(struct sched_atom *) * task->nr_events;
	task->atoms = realloc(task->atoms, size);
	BUG_ON(!task->atoms);

	task->atoms[idx] = event;

	return event;
}

static struct sched_atom *last_event(struct task_desc *task)
{
	if (!task->nr_events)
		return NULL;

	return task->atoms[task->nr_events - 1];
}

static void
add_sched_event_run(struct task_desc *task, u64 timestamp, u64 duration)
{
	struct sched_atom *event, *curr_event = last_event(task);

	/*
	 * optimize an existing RUN event by merging this one
	 * into it:
	 */
	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
		nr_run_events_optimized++;
		curr_event->duration += duration;
		return;
	}

	event = get_new_event(task, timestamp);

	event->type = SCHED_EVENT_RUN;
	event->duration = duration;

	nr_run_events++;
}

static void
add_sched_event_wakeup(struct task_desc *task, u64 timestamp,
		       struct task_desc *wakee)
{
	struct sched_atom *event, *wakee_event;

	event = get_new_event(task, timestamp);
	event->type = SCHED_EVENT_WAKEUP;
	event->wakee = wakee;

	wakee_event = last_event(wakee);
	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
		targetless_wakeups++;
		return;
	}
	if (wakee_event->wait_sem) {
		multitarget_wakeups++;
		return;
	}

	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
	sem_init(wakee_event->wait_sem, 0, 0);
	wakee_event->specific_wait = 1;
	event->wait_sem = wakee_event->wait_sem;

	nr_wakeup_events++;
}

static void
add_sched_event_sleep(struct task_desc *task, u64 timestamp,
		      u64 task_state __used)
{
	struct sched_atom *event = get_new_event(task, timestamp);

	event->type = SCHED_EVENT_SLEEP;

	nr_sleep_events++;
}

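/*
 * Task registry: each PID seen in the trace gets a task_desc,
 * reachable both through pid_to_task[] and through the flat tasks[]
 * array that replay iterates over.
 */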
static struct task_desc *register_pid(unsigned long pid, const char *comm)
{
	struct task_desc *task;

	BUG_ON(pid >= MAX_PID);

	task = pid_to_task[pid];

	if (task)
		return task;

	task = zalloc(sizeof(*task));
	task->pid = pid;
	task->nr = nr_tasks;
	strcpy(task->comm, comm);
	/*
	 * every task starts in sleeping state - this gets ignored
	 * if there's no wakeup pointing to this sleep state:
	 */
	add_sched_event_sleep(task, 0, 0);

	pid_to_task[pid] = task;
	nr_tasks++;
	tasks = realloc(tasks, nr_tasks * sizeof(struct task_desc *));
	BUG_ON(!tasks);
	tasks[task->nr] = task;

	if (verbose)
		printf("registered task #%ld, PID %ld (%s)\n", nr_tasks, pid, comm);

	return task;
}


static void print_task_traces(void)
{
	struct task_desc *task;
	unsigned long i;

	for (i = 0; i < nr_tasks; i++) {
		task = tasks[i];
		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
			task->nr, task->comm, task->pid, task->nr_events);
	}
}

static void add_cross_task_wakeups(void)
{
	struct task_desc *task1, *task2;
	unsigned long i, j;

	for (i = 0; i < nr_tasks; i++) {
		task1 = tasks[i];
		j = i + 1;
		if (j == nr_tasks)
			j = 0;
		task2 = tasks[j];
		add_sched_event_wakeup(task1, 0, task2);
	}
}

static void
process_sched_event(struct task_desc *this_task __used, struct sched_atom *atom)
{
	int ret = 0;

	switch (atom->type) {
	case SCHED_EVENT_RUN:
		burn_nsecs(atom->duration);
		break;
	case SCHED_EVENT_SLEEP:
		if (atom->wait_sem)
			ret = sem_wait(atom->wait_sem);
		BUG_ON(ret);
		break;
	case SCHED_EVENT_WAKEUP:
		if (atom->wait_sem)
			ret = sem_post(atom->wait_sem);
		BUG_ON(ret);
		break;
	case SCHED_EVENT_MIGRATION:
		break;
	default:
		BUG_ON(1);
	}
}

static u64 get_cpu_usage_nsec_parent(void)
{
	struct rusage ru;
	u64 sum;
	int err;

	err = getrusage(RUSAGE_SELF, &ru);
	BUG_ON(err);

	sum  = ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
	sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;

	return sum;
}

static int self_open_counters(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));

	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;

	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);

	if (fd < 0)
		die("Error: sys_perf_event_open() syscall returned "
		    "with %d (%s)\n", fd, strerror(errno));
	return fd;
}

static u64 get_cpu_usage_nsec_self(int fd)
{
	u64 runtime;
	int ret;

	ret = read(fd, &runtime, sizeof(runtime));
	BUG_ON(ret != sizeof(runtime));

	return runtime;
}

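/*
 * Replay worker: announce readiness, wait for the parent to release
 * start_work_mutex, replay this task's atoms while measuring its own
 * CPU time with a task-clock counter, report completion, then wait on
 * work_done_wait_mutex for the next round.
 */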
static void *thread_func(void *ctx)
{
	struct task_desc *this_task = ctx;
	u64 cpu_usage_0, cpu_usage_1;
	unsigned long i, ret;
	char comm2[22];
	int fd;

	sprintf(comm2, ":%s", this_task->comm);
	prctl(PR_SET_NAME, comm2);
	fd = self_open_counters();

again:
	ret = sem_post(&this_task->ready_for_work);
	BUG_ON(ret);
	ret = pthread_mutex_lock(&start_work_mutex);
	BUG_ON(ret);
	ret = pthread_mutex_unlock(&start_work_mutex);
	BUG_ON(ret);

	cpu_usage_0 = get_cpu_usage_nsec_self(fd);

	for (i = 0; i < this_task->nr_events; i++) {
		this_task->curr_event = i;
		process_sched_event(this_task, this_task->atoms[i]);
	}

	cpu_usage_1 = get_cpu_usage_nsec_self(fd);
	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
	ret = sem_post(&this_task->work_done_sem);
	BUG_ON(ret);

	ret = pthread_mutex_lock(&work_done_wait_mutex);
	BUG_ON(ret);
	ret = pthread_mutex_unlock(&work_done_wait_mutex);
	BUG_ON(ret);

	goto again;
}

static void create_tasks(void)
{
	struct task_desc *task;
	pthread_attr_t attr;
	unsigned long i;
	int err;

	err = pthread_attr_init(&attr);
	BUG_ON(err);
	err = pthread_attr_setstacksize(&attr,
			(size_t) max(16 * 1024, PTHREAD_STACK_MIN));
	BUG_ON(err);
	err = pthread_mutex_lock(&start_work_mutex);
	BUG_ON(err);
	err = pthread_mutex_lock(&work_done_wait_mutex);
	BUG_ON(err);
	for (i = 0; i < nr_tasks; i++) {
		task = tasks[i];
		sem_init(&task->sleep_sem, 0, 0);
		sem_init(&task->ready_for_work, 0, 0);
		sem_init(&task->work_done_sem, 0, 0);
		task->curr_event = 0;
		err = pthread_create(&task->thread, &attr, thread_func, task);
		BUG_ON(err);
	}
}

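/*
 * One replay round, driven by the parent: the two mutexes act as
 * start/stop barriers. Release the workers, wait until all of them
 * have run their event streams, and accumulate the per-task and
 * parent CPU usage (smoothed into running averages).
 */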
static void wait_for_tasks(void)
{
	u64 cpu_usage_0, cpu_usage_1;
	struct task_desc *task;
	unsigned long i, ret;

	start_time = get_nsecs();
	cpu_usage = 0;
	pthread_mutex_unlock(&work_done_wait_mutex);

	for (i = 0; i < nr_tasks; i++) {
		task = tasks[i];
		ret = sem_wait(&task->ready_for_work);
		BUG_ON(ret);
		sem_init(&task->ready_for_work, 0, 0);
	}
	ret = pthread_mutex_lock(&work_done_wait_mutex);
	BUG_ON(ret);

	cpu_usage_0 = get_cpu_usage_nsec_parent();

	pthread_mutex_unlock(&start_work_mutex);

	for (i = 0; i < nr_tasks; i++) {
		task = tasks[i];
		ret = sem_wait(&task->work_done_sem);
		BUG_ON(ret);
		sem_init(&task->work_done_sem, 0, 0);
		cpu_usage += task->cpu_usage;
		task->cpu_usage = 0;
	}

	cpu_usage_1 = get_cpu_usage_nsec_parent();
	if (!runavg_cpu_usage)
		runavg_cpu_usage = cpu_usage;
	runavg_cpu_usage = (runavg_cpu_usage*9 + cpu_usage)/10;

	parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
	if (!runavg_parent_cpu_usage)
		runavg_parent_cpu_usage = parent_cpu_usage;
	runavg_parent_cpu_usage = (runavg_parent_cpu_usage*9 +
				   parent_cpu_usage)/10;

	ret = pthread_mutex_lock(&start_work_mutex);
	BUG_ON(ret);

	for (i = 0; i < nr_tasks; i++) {
		task = tasks[i];
		sem_init(&task->sleep_sem, 0, 0);
		task->curr_event = 0;
	}
}

static void run_one_test(void)
{
	u64 T0, T1, delta, avg_delta, fluct;

	T0 = get_nsecs();
	wait_for_tasks();
	T1 = get_nsecs();

	delta = T1 - T0;
	sum_runtime += delta;
	nr_runs++;

	avg_delta = sum_runtime / nr_runs;
	if (delta < avg_delta)
		fluct = avg_delta - delta;
	else
		fluct = delta - avg_delta;
	sum_fluct += fluct;
	if (!run_avg)
		run_avg = delta;
	run_avg = (run_avg*9 + delta)/10;

	printf("#%-3ld: %0.3f, ",
		nr_runs, (double)delta/1e6);

	printf("ravg: %0.2f, ",
		(double)run_avg/1e6);

	printf("cpu: %0.2f / %0.2f",
		(double)cpu_usage/1e6, (double)runavg_cpu_usage/1e6);

#if 0
	/*
	 * rusage statistics done by the parent, these are less
	 * accurate than the sum_exec_runtime based statistics:
	 */
	printf(" [%0.2f / %0.2f]",
		(double)parent_cpu_usage/1e6,
		(double)runavg_parent_cpu_usage/1e6);
#endif

	printf("\n");

	if (nr_sleep_corrections)
		printf(" (%ld sleep corrections)\n", nr_sleep_corrections);
	nr_sleep_corrections = 0;
}

static void test_calibrations(void)
{
	u64 T0, T1;

	T0 = get_nsecs();
	burn_nsecs(1e6);
	T1 = get_nsecs();

	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);

	T0 = get_nsecs();
	sleep_nsecs(1e6);
	T1 = get_nsecs();

	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
}

#define FILL_FIELD(ptr, field, event, data)	\
	ptr.field = (typeof(ptr.field)) raw_field_value(event, #field, data)

#define FILL_ARRAY(ptr, array, event, data)			\
do {								\
	void *__array = raw_field_ptr(event, #array, data);	\
	memcpy(ptr.array, __array, sizeof(ptr.array));		\
} while (0)

#define FILL_COMMON_FIELDS(ptr, event, data)			\
do {								\
	FILL_FIELD(ptr, common_type, event, data);		\
	FILL_FIELD(ptr, common_flags, event, data);		\
	FILL_FIELD(ptr, common_preempt_count, event, data);	\
	FILL_FIELD(ptr, common_pid, event, data);		\
	FILL_FIELD(ptr, common_tgid, event, data);		\
} while (0)

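/*
 * Decoded tracepoint records: the process_sched_* handlers below fill
 * these structs field by field from sample->raw_data using the FILL_*
 * macros, then hand them to the active trace_sched_handler.
 */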
struct trace_switch_event {
	u32 size;

	u16 common_type;
	u8 common_flags;
	u8 common_preempt_count;
	u32 common_pid;
	u32 common_tgid;

	char prev_comm[16];
	u32 prev_pid;
	u32 prev_prio;
	u64 prev_state;
	char next_comm[16];
	u32 next_pid;
	u32 next_prio;
};

struct trace_runtime_event {
	u32 size;

	u16 common_type;
	u8 common_flags;
	u8 common_preempt_count;
	u32 common_pid;
	u32 common_tgid;

	char comm[16];
	u32 pid;
	u64 runtime;
	u64 vruntime;
};

struct trace_wakeup_event {
	u32 size;

	u16 common_type;
	u8 common_flags;
	u8 common_preempt_count;
	u32 common_pid;
	u32 common_tgid;

	char comm[16];
	u32 pid;

	u32 prio;
	u32 success;
	u32 cpu;
};

struct trace_fork_event {
	u32 size;

	u16 common_type;
	u8 common_flags;
	u8 common_preempt_count;
	u32 common_pid;
	u32 common_tgid;

	char parent_comm[16];
	u32 parent_pid;
	char child_comm[16];
	u32 child_pid;
};

struct trace_migrate_task_event {
	u32 size;

	u16 common_type;
	u8 common_flags;
	u8 common_preempt_count;
	u32 common_pid;
	u32 common_tgid;

	char comm[16];
	u32 pid;

	u32 prio;
	u32 cpu;
};

struct trace_sched_handler {
	void (*switch_event)(struct trace_switch_event *,
			     struct machine *,
			     struct event_format *,
			     int cpu,
			     u64 timestamp,
			     struct thread *thread);

	void (*runtime_event)(struct trace_runtime_event *,
			      struct machine *,
			      struct event_format *,
			      int cpu,
			      u64 timestamp,
			      struct thread *thread);

	void (*wakeup_event)(struct trace_wakeup_event *,
			     struct machine *,
			     struct event_format *,
			     int cpu,
			     u64 timestamp,
			     struct thread *thread);

	void (*fork_event)(struct trace_fork_event *,
			   struct event_format *,
			   int cpu,
			   u64 timestamp,
			   struct thread *thread);

	void (*migrate_task_event)(struct trace_migrate_task_event *,
				   struct machine *machine,
				   struct event_format *,
				   int cpu,
				   u64 timestamp,
				   struct thread *thread);
};

static void
replay_wakeup_event(struct trace_wakeup_event *wakeup_event,
		    struct machine *machine __used,
		    struct event_format *event,
		    int cpu __used,
		    u64 timestamp __used,
		    struct thread *thread __used)
{
	struct task_desc *waker, *wakee;

	if (verbose) {
		printf("sched_wakeup event %p\n", event);

		printf(" ... pid %d woke up %s/%d\n",
			wakeup_event->common_pid,
			wakeup_event->comm,
			wakeup_event->pid);
	}

	waker = register_pid(wakeup_event->common_pid, "<unknown>");
	wakee = register_pid(wakeup_event->pid, wakeup_event->comm);

	add_sched_event_wakeup(waker, timestamp, wakee);
}

static u64 cpu_last_switched[MAX_CPUS];

static void
replay_switch_event(struct trace_switch_event *switch_event,
		    struct machine *machine __used,
		    struct event_format *event,
		    int cpu,
		    u64 timestamp,
		    struct thread *thread __used)
{
	struct task_desc *prev, __used *next;
	u64 timestamp0;
	s64 delta;

	if (verbose)
		printf("sched_switch event %p\n", event);

	if (cpu >= MAX_CPUS || cpu < 0)
		return;

	timestamp0 = cpu_last_switched[cpu];
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0)
		die("hm, delta: %" PRId64 " < 0 ?\n", delta);

	if (verbose) {
		printf(" ... switch from %s/%d to %s/%d [ran %" PRId64 " nsecs]\n",
			switch_event->prev_comm, switch_event->prev_pid,
			switch_event->next_comm, switch_event->next_pid,
			delta);
	}

	prev = register_pid(switch_event->prev_pid, switch_event->prev_comm);
	next = register_pid(switch_event->next_pid, switch_event->next_comm);

	cpu_last_switched[cpu] = timestamp;

	add_sched_event_run(prev, timestamp, delta);
	add_sched_event_sleep(prev, timestamp, switch_event->prev_state);
}


static void
replay_fork_event(struct trace_fork_event *fork_event,
		  struct event_format *event,
		  int cpu __used,
		  u64 timestamp __used,
		  struct thread *thread __used)
{
	if (verbose) {
		printf("sched_fork event %p\n", event);
		printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid);
		printf("...  child: %s/%d\n", fork_event->child_comm, fork_event->child_pid);
	}
	register_pid(fork_event->parent_pid, fork_event->parent_comm);
	register_pid(fork_event->child_pid, fork_event->child_comm);
}

static struct trace_sched_handler replay_ops = {
	.wakeup_event		= replay_wakeup_event,
	.switch_event		= replay_switch_event,
	.fork_event		= replay_fork_event,
};

struct sort_dimension {
	const char		*name;
	sort_fn_t		cmp;
	struct list_head	list;
};

static LIST_HEAD(cmp_pid);

static int
thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
{
	struct sort_dimension *sort;
	int ret = 0;

	BUG_ON(list_empty(list));

	list_for_each_entry(sort, list, list) {
		ret = sort->cmp(l, r);
		if (ret)
			return ret;
	}

	return ret;
}

static struct work_atoms *
thread_atoms_search(struct rb_root *root, struct thread *thread,
		    struct list_head *sort_list)
{
	struct rb_node *node = root->rb_node;
	struct work_atoms key = { .thread = thread };

	while (node) {
		struct work_atoms *atoms;
		int cmp;

		atoms = container_of(node, struct work_atoms, node);

		cmp = thread_lat_cmp(sort_list, &key, atoms);
		if (cmp > 0)
			node = node->rb_left;
		else if (cmp < 0)
			node = node->rb_right;
		else {
			BUG_ON(thread != atoms->thread);
			return atoms;
		}
	}
	return NULL;
}

static void
__thread_latency_insert(struct rb_root *root, struct work_atoms *data,
			struct list_head *sort_list)
{
	struct rb_node **new = &(root->rb_node), *parent = NULL;

	while (*new) {
		struct work_atoms *this;
		int cmp;

		this = container_of(*new, struct work_atoms, node);
		parent = *new;

		cmp = thread_lat_cmp(sort_list, data, this);

		if (cmp > 0)
			new = &((*new)->rb_left);
		else
			new = &((*new)->rb_right);
	}

	rb_link_node(&data->node, parent, new);
	rb_insert_color(&data->node, root);
}

static void thread_atoms_insert(struct thread *thread)
{
	struct work_atoms *atoms = zalloc(sizeof(*atoms));
	if (!atoms)
		die("No memory");

	atoms->thread = thread;
	INIT_LIST_HEAD(&atoms->work_list);
	__thread_latency_insert(&atom_root, atoms, &cmp_pid);
}

static void
latency_fork_event(struct trace_fork_event *fork_event __used,
		   struct event_format *event __used,
		   int cpu __used,
		   u64 timestamp __used,
		   struct thread *thread __used)
{
	/* should insert the newcomer */
}

__used
static char sched_out_state(struct trace_switch_event *switch_event)
{
	const char *str = TASK_STATE_TO_CHAR_STR;

	return str[switch_event->prev_state];
}

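/*
 * Latency accounting state machine: a work_atom is opened when a task
 * schedules out, moves to THREAD_WAIT_CPU on wakeup (or right away if
 * it went out runnable), and is closed on schedule-in. The wakeup to
 * schedule-in delta is the scheduling latency accumulated per thread
 * in struct work_atoms.
 */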
static void
add_sched_out_event(struct work_atoms *atoms,
		    char run_state,
		    u64 timestamp)
{
	struct work_atom *atom = zalloc(sizeof(*atom));
	if (!atom)
		die("No memory");

	atom->sched_out_time = timestamp;

	if (run_state == 'R') {
		atom->state = THREAD_WAIT_CPU;
		atom->wake_up_time = atom->sched_out_time;
	}

	list_add_tail(&atom->list, &atoms->work_list);
}

static void
add_runtime_event(struct work_atoms *atoms, u64 delta, u64 timestamp __used)
{
	struct work_atom *atom;

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	atom->runtime += delta;
	atoms->total_runtime += delta;
}

static void
add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
{
	struct work_atom *atom;
	u64 delta;

	if (list_empty(&atoms->work_list))
		return;

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	if (atom->state != THREAD_WAIT_CPU)
		return;

	if (timestamp < atom->wake_up_time) {
		atom->state = THREAD_IGNORE;
		return;
	}

	atom->state = THREAD_SCHED_IN;
	atom->sched_in_time = timestamp;

	delta = atom->sched_in_time - atom->wake_up_time;
	atoms->total_lat += delta;
	if (delta > atoms->max_lat) {
		atoms->max_lat = delta;
		atoms->max_lat_at = timestamp;
	}
	atoms->nb_atoms++;
}

static void
latency_switch_event(struct trace_switch_event *switch_event,
		     struct machine *machine,
		     struct event_format *event __used,
		     int cpu,
		     u64 timestamp,
		     struct thread *thread __used)
{
	struct work_atoms *out_events, *in_events;
	struct thread *sched_out, *sched_in;
	u64 timestamp0;
	s64 delta;

	BUG_ON(cpu >= MAX_CPUS || cpu < 0);

	timestamp0 = cpu_last_switched[cpu];
	cpu_last_switched[cpu] = timestamp;
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0)
		die("hm, delta: %" PRId64 " < 0 ?\n", delta);

	sched_out = machine__findnew_thread(machine, switch_event->prev_pid);
	sched_in = machine__findnew_thread(machine, switch_event->next_pid);

	out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
	if (!out_events) {
		thread_atoms_insert(sched_out);
		out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
		if (!out_events)
			die("out-event: Internal tree error");
	}
	add_sched_out_event(out_events, sched_out_state(switch_event), timestamp);

	in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
	if (!in_events) {
		thread_atoms_insert(sched_in);
		in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
		if (!in_events)
			die("in-event: Internal tree error");
		/*
		 * A task came in that we have not heard about yet;
		 * add an initial atom in runnable state:
		 */
		add_sched_out_event(in_events, 'R', timestamp);
	}
	add_sched_in_event(in_events, timestamp);
}

static void
latency_runtime_event(struct trace_runtime_event *runtime_event,
		      struct machine *machine,
		      struct event_format *event __used,
		      int cpu,
		      u64 timestamp,
		      struct thread *this_thread __used)
{
	struct thread *thread = machine__findnew_thread(machine, runtime_event->pid);
	struct work_atoms *atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);

	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
	if (!atoms) {
		thread_atoms_insert(thread);
		atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);
		if (!atoms)
			die("in-event: Internal tree error");
		add_sched_out_event(atoms, 'R', timestamp);
	}

	add_runtime_event(atoms, runtime_event->runtime, timestamp);
}

static void
latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
		     struct machine *machine,
		     struct event_format *__event __used,
		     int cpu __used,
		     u64 timestamp,
		     struct thread *thread __used)
{
	struct work_atoms *atoms;
	struct work_atom *atom;
	struct thread *wakee;

	/* Note for later, it may be interesting to observe the failing cases */
	if (!wakeup_event->success)
		return;

	wakee = machine__findnew_thread(machine, wakeup_event->pid);
	atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
	if (!atoms) {
		thread_atoms_insert(wakee);
		atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
		if (!atoms)
			die("wakeup-event: Internal tree error");
		add_sched_out_event(atoms, 'S', timestamp);
	}

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	/*
	 * You WILL be missing events if you've recorded only
	 * one CPU, or are only looking at one, so don't
	 * make useless noise.
	 */
	if (profile_cpu == -1 && atom->state != THREAD_SLEEPING)
		nr_state_machine_bugs++;

	nr_timestamps++;
	if (atom->sched_out_time > timestamp) {
		nr_unordered_timestamps++;
		return;
	}

	atom->state = THREAD_WAIT_CPU;
	atom->wake_up_time = timestamp;
}

static void
latency_migrate_task_event(struct trace_migrate_task_event *migrate_task_event,
			   struct machine *machine,
			   struct event_format *__event __used,
			   int cpu __used,
			   u64 timestamp,
			   struct thread *thread __used)
{
	struct work_atoms *atoms;
	struct work_atom *atom;
	struct thread *migrant;

	/*
	 * Only need to worry about migration when profiling one CPU.
	 */
	if (profile_cpu == -1)
		return;

	migrant = machine__findnew_thread(machine, migrate_task_event->pid);
	atoms = thread_atoms_search(&atom_root, migrant, &cmp_pid);
	if (!atoms) {
		thread_atoms_insert(migrant);
		register_pid(migrant->pid, migrant->comm);
		atoms = thread_atoms_search(&atom_root, migrant, &cmp_pid);
		if (!atoms)
			die("migration-event: Internal tree error");
		add_sched_out_event(atoms, 'R', timestamp);
	}

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
	atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;

	nr_timestamps++;

	if (atom->sched_out_time > timestamp)
		nr_unordered_timestamps++;
}

static struct trace_sched_handler lat_ops = {
	.wakeup_event		= latency_wakeup_event,
	.switch_event		= latency_switch_event,
	.runtime_event		= latency_runtime_event,
	.fork_event		= latency_fork_event,
	.migrate_task_event	= latency_migrate_task_event,
};

static void output_lat_thread(struct work_atoms *work_list)
{
	int i;
	int ret;
	u64 avg;

	if (!work_list->nb_atoms)
		return;
	/*
	 * Ignore idle threads:
	 */
	if (!strcmp(work_list->thread->comm, "swapper"))
		return;

	all_runtime += work_list->total_runtime;
	all_count += work_list->nb_atoms;

	ret = printf("  %s:%d ", work_list->thread->comm, work_list->thread->pid);

	for (i = 0; i < 24 - ret; i++)
		printf(" ");

	avg = work_list->total_lat / work_list->nb_atoms;

	printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %9.6f s\n",
		(double)work_list->total_runtime / 1e6,
		work_list->nb_atoms, (double)avg / 1e6,
		(double)work_list->max_lat / 1e6,
		(double)work_list->max_lat_at / 1e9);
}

static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->thread->pid < r->thread->pid)
		return -1;
	if (l->thread->pid > r->thread->pid)
		return 1;

	return 0;
}

static struct sort_dimension pid_sort_dimension = {
	.name			= "pid",
	.cmp			= pid_cmp,
};

static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
{
	u64 avgl, avgr;

	if (!l->nb_atoms)
		return -1;

	if (!r->nb_atoms)
		return 1;

	avgl = l->total_lat / l->nb_atoms;
	avgr = r->total_lat / r->nb_atoms;

	if (avgl < avgr)
		return -1;
	if (avgl > avgr)
		return 1;

	return 0;
}

static struct sort_dimension avg_sort_dimension = {
	.name			= "avg",
	.cmp			= avg_cmp,
};

static int max_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->max_lat < r->max_lat)
		return -1;
	if (l->max_lat > r->max_lat)
		return 1;

	return 0;
}

static struct sort_dimension max_sort_dimension = {
	.name			= "max",
	.cmp			= max_cmp,
};

static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->nb_atoms < r->nb_atoms)
		return -1;
	if (l->nb_atoms > r->nb_atoms)
		return 1;

	return 0;
}

static struct sort_dimension switch_sort_dimension = {
	.name			= "switch",
	.cmp			= switch_cmp,
};

static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->total_runtime < r->total_runtime)
		return -1;
	if (l->total_runtime > r->total_runtime)
		return 1;

	return 0;
}

static struct sort_dimension runtime_sort_dimension = {
	.name			= "runtime",
	.cmp			= runtime_cmp,
};

static struct sort_dimension *available_sorts[] = {
	&pid_sort_dimension,
	&avg_sort_dimension,
	&max_sort_dimension,
	&switch_sort_dimension,
	&runtime_sort_dimension,
};

#define NB_AVAILABLE_SORTS	(int)(sizeof(available_sorts) / sizeof(struct sort_dimension *))

static LIST_HEAD(sort_list);

static int sort_dimension__add(const char *tok, struct list_head *list)
{
	int i;

	for (i = 0; i < NB_AVAILABLE_SORTS; i++) {
		if (!strcmp(available_sorts[i]->name, tok)) {
			list_add_tail(&available_sorts[i]->list, list);

			return 0;
		}
	}

	return -1;
}

static void setup_sorting(void);

static void sort_lat(void)
{
	struct rb_node *node;

	for (;;) {
		struct work_atoms *data;
		node = rb_first(&atom_root);
		if (!node)
			break;

		rb_erase(node, &atom_root);
		data = rb_entry(node, struct work_atoms, node);
		__thread_latency_insert(&sorted_atom_root, data, &sort_list);
	}
}

static struct trace_sched_handler *trace_handler;

static void
process_sched_wakeup_event(struct perf_tool *tool __used,
			   struct event_format *event,
			   struct perf_sample *sample,
			   struct machine *machine,
			   struct thread *thread)
{
	void *data = sample->raw_data;
	struct trace_wakeup_event wakeup_event;

	FILL_COMMON_FIELDS(wakeup_event, event, data);

	FILL_ARRAY(wakeup_event, comm, event, data);
	FILL_FIELD(wakeup_event, pid, event, data);
	FILL_FIELD(wakeup_event, prio, event, data);
	FILL_FIELD(wakeup_event, success, event, data);
	FILL_FIELD(wakeup_event, cpu, event, data);

	if (trace_handler->wakeup_event)
		trace_handler->wakeup_event(&wakeup_event, machine, event,
					    sample->cpu, sample->time, thread);
}

/*
 * Track the current task - that way we can know whether there's any
 * weird events, such as a task being switched away that is not current.
 */
static int max_cpu;

static u32 curr_pid[MAX_CPUS] = { [0 ... MAX_CPUS-1] = -1 };

static struct thread *curr_thread[MAX_CPUS];

static char next_shortname1 = 'A';
static char next_shortname2 = '0';

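/*
 * 'perf sched map' output: one column per CPU, a '*' marking the CPU
 * where the current switch happened, and a two-character shortname
 * (A0..Z0, A1..Z1, ...) assigned to each thread when it is first
 * scheduled in.
 */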
static void
map_switch_event(struct trace_switch_event *switch_event,
		 struct machine *machine,
		 struct event_format *event __used,
		 int this_cpu,
		 u64 timestamp,
		 struct thread *thread __used)
{
	struct thread *sched_out __used, *sched_in;
	int new_shortname;
	u64 timestamp0;
	s64 delta;
	int cpu;

	BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);

	if (this_cpu > max_cpu)
		max_cpu = this_cpu;

	timestamp0 = cpu_last_switched[this_cpu];
	cpu_last_switched[this_cpu] = timestamp;
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0)
		die("hm, delta: %" PRId64 " < 0 ?\n", delta);

	sched_out = machine__findnew_thread(machine, switch_event->prev_pid);
	sched_in = machine__findnew_thread(machine, switch_event->next_pid);

	curr_thread[this_cpu] = sched_in;

	printf("  ");

	new_shortname = 0;
	if (!sched_in->shortname[0]) {
		sched_in->shortname[0] = next_shortname1;
		sched_in->shortname[1] = next_shortname2;

		if (next_shortname1 < 'Z') {
			next_shortname1++;
		} else {
			next_shortname1 = 'A';
			if (next_shortname2 < '9')
				next_shortname2++;
			else
				next_shortname2 = '0';
		}
		new_shortname = 1;
	}

	for (cpu = 0; cpu <= max_cpu; cpu++) {
		if (cpu != this_cpu)
			printf(" ");
		else
			printf("*");

		if (curr_thread[cpu]) {
			if (curr_thread[cpu]->pid)
				printf("%2s ", curr_thread[cpu]->shortname);
			else
				printf(".  ");
		} else
			printf("   ");
	}

	printf("  %12.6f secs ", (double)timestamp/1e9);
	if (new_shortname) {
		printf("%s => %s:%d\n",
			sched_in->shortname, sched_in->comm, sched_in->pid);
	} else {
		printf("\n");
	}
}

static void
process_sched_switch_event(struct perf_tool *tool __used,
			   struct event_format *event,
			   struct perf_sample *sample,
			   struct machine *machine,
			   struct thread *thread)
{
	int this_cpu = sample->cpu;
	void *data = sample->raw_data;
	struct trace_switch_event switch_event;

	FILL_COMMON_FIELDS(switch_event, event, data);

	FILL_ARRAY(switch_event, prev_comm, event, data);
	FILL_FIELD(switch_event, prev_pid, event, data);
	FILL_FIELD(switch_event, prev_prio, event, data);
	FILL_FIELD(switch_event, prev_state, event, data);
	FILL_ARRAY(switch_event, next_comm, event, data);
	FILL_FIELD(switch_event, next_pid, event, data);
	FILL_FIELD(switch_event, next_prio, event, data);

	if (curr_pid[this_cpu] != (u32)-1) {
		/*
		 * Are we trying to switch away a PID that is
		 * not current?
		 */
		if (curr_pid[this_cpu] != switch_event.prev_pid)
			nr_context_switch_bugs++;
	}
	if (trace_handler->switch_event)
		trace_handler->switch_event(&switch_event, machine, event,
					    this_cpu, sample->time, thread);

	curr_pid[this_cpu] = switch_event.next_pid;
}

static void
process_sched_runtime_event(struct perf_tool *tool __used,
			    struct event_format *event,
			    struct perf_sample *sample,
			    struct machine *machine,
			    struct thread *thread)
{
	void *data = sample->raw_data;
	struct trace_runtime_event runtime_event;

	FILL_ARRAY(runtime_event, comm, event, data);
	FILL_FIELD(runtime_event, pid, event, data);
	FILL_FIELD(runtime_event, runtime, event, data);
	FILL_FIELD(runtime_event, vruntime, event, data);

	if (trace_handler->runtime_event)
		trace_handler->runtime_event(&runtime_event, machine, event,
					     sample->cpu, sample->time, thread);
}

static void
process_sched_fork_event(struct perf_tool *tool __used,
			 struct event_format *event,
			 struct perf_sample *sample,
			 struct machine *machine __used,
			 struct thread *thread)
{
	void *data = sample->raw_data;
	struct trace_fork_event fork_event;

	FILL_COMMON_FIELDS(fork_event, event, data);

	FILL_ARRAY(fork_event, parent_comm, event, data);
	FILL_FIELD(fork_event, parent_pid, event, data);
	FILL_ARRAY(fork_event, child_comm, event, data);
	FILL_FIELD(fork_event, child_pid, event, data);

	if (trace_handler->fork_event)
		trace_handler->fork_event(&fork_event, event,
					  sample->cpu, sample->time, thread);
}

static void
process_sched_exit_event(struct perf_tool *tool __used,
			 struct event_format *event,
			 struct perf_sample *sample __used,
			 struct machine *machine __used,
			 struct thread *thread __used)
{
	if (verbose)
		printf("sched_exit event %p\n", event);
}

static void
process_sched_migrate_task_event(struct perf_tool *tool __used,
				 struct event_format *event,
				 struct perf_sample *sample,
				 struct machine *machine,
				 struct thread *thread)
{
	void *data = sample->raw_data;
	struct trace_migrate_task_event migrate_task_event;

	FILL_COMMON_FIELDS(migrate_task_event, event, data);

	FILL_ARRAY(migrate_task_event, comm, event, data);
	FILL_FIELD(migrate_task_event, pid, event, data);
	FILL_FIELD(migrate_task_event, prio, event, data);
	FILL_FIELD(migrate_task_event, cpu, event, data);

	if (trace_handler->migrate_task_event)
		trace_handler->migrate_task_event(&migrate_task_event, machine,
						  event, sample->cpu,
						  sample->time, thread);
}

typedef void (*tracepoint_handler)(struct perf_tool *tool, struct event_format *event,
				   struct perf_sample *sample,
				   struct machine *machine,
				   struct thread *thread);

static int perf_sched__process_tracepoint_sample(struct perf_tool *tool,
						 union perf_event *event __used,
						 struct perf_sample *sample,
						 struct perf_evsel *evsel,
						 struct machine *machine)
{
	struct thread *thread = machine__findnew_thread(machine, sample->pid);

	if (thread == NULL) {
		pr_debug("problem processing %s event, skipping it.\n",
			 evsel->name);
		return -1;
	}

	evsel->hists.stats.total_period += sample->period;
	hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE);

	if (evsel->handler.func != NULL) {
		tracepoint_handler f = evsel->handler.func;

		if (evsel->handler.data == NULL)
			evsel->handler.data = trace_find_event(evsel->attr.config);

		f(tool, evsel->handler.data, sample, machine, thread);
	}

	return 0;
}

static struct perf_tool perf_sched = {
	.sample			= perf_sched__process_tracepoint_sample,
	.comm			= perf_event__process_comm,
	.lost			= perf_event__process_lost,
	.fork			= perf_event__process_task,
	.ordered_samples	= true,
};

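/*
 * Open the perf.data file and route each sched tracepoint sample to
 * its process_sched_* handler; those forward the decoded event to the
 * active trace_sched_handler (replay_ops, lat_ops or map_ops).
 */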
static void read_events(bool destroy, struct perf_session **psession)
{
	int err = -EINVAL;
	const struct perf_evsel_str_handler handlers[] = {
		{ "sched:sched_switch",	      process_sched_switch_event, },
		{ "sched:sched_stat_runtime", process_sched_runtime_event, },
		{ "sched:sched_wakeup",	      process_sched_wakeup_event, },
		{ "sched:sched_wakeup_new",   process_sched_wakeup_event, },
		{ "sched:sched_process_fork", process_sched_fork_event, },
		{ "sched:sched_process_exit", process_sched_exit_event, },
		{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
	};
	struct perf_session *session = perf_session__new(input_name, O_RDONLY,
							 0, false, &perf_sched);
	if (session == NULL)
		die("No Memory");

	err = perf_evlist__set_tracepoints_handlers_array(session->evlist, handlers);
	assert(err == 0);

	if (perf_session__has_traces(session, "record -R")) {
		err = perf_session__process_events(session, &perf_sched);
		if (err)
			die("Failed to process events, error %d", err);

		nr_events      = session->hists.stats.nr_events[0];
		nr_lost_events = session->hists.stats.total_lost;
		nr_lost_chunks = session->hists.stats.nr_events[PERF_RECORD_LOST];
	}

	if (destroy)
		perf_session__delete(session);

	if (psession)
		*psession = session;
}

static void print_bad_events(void)
{
	if (nr_unordered_timestamps && nr_timestamps) {
		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
			(double)nr_unordered_timestamps/(double)nr_timestamps*100.0,
			nr_unordered_timestamps, nr_timestamps);
	}
	if (nr_lost_events && nr_events) {
		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
			(double)nr_lost_events/(double)nr_events*100.0,
			nr_lost_events, nr_events, nr_lost_chunks);
	}
	if (nr_state_machine_bugs && nr_timestamps) {
		printf("  INFO: %.3f%% state machine bugs (%ld out of %ld)",
			(double)nr_state_machine_bugs/(double)nr_timestamps*100.0,
			nr_state_machine_bugs, nr_timestamps);
		if (nr_lost_events)
			printf(" (due to lost events?)");
		printf("\n");
	}
	if (nr_context_switch_bugs && nr_timestamps) {
		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
			(double)nr_context_switch_bugs/(double)nr_timestamps*100.0,
			nr_context_switch_bugs, nr_timestamps);
		if (nr_lost_events)
			printf(" (due to lost events?)");
		printf("\n");
	}
}

static void __cmd_lat(void)
{
	struct rb_node *next;
	struct perf_session *session;

	setup_pager();
	read_events(false, &session);
	sort_lat();

	printf("\n ---------------------------------------------------------------------------------------------------------------\n");
	printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at     |\n");
	printf(" ---------------------------------------------------------------------------------------------------------------\n");

	next = rb_first(&sorted_atom_root);

	while (next) {
		struct work_atoms *work_list;

		work_list = rb_entry(next, struct work_atoms, node);
		output_lat_thread(work_list);
		next = rb_next(next);
	}

	printf(" -----------------------------------------------------------------------------------------\n");
	printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
		(double)all_runtime/1e6, all_count);

	printf(" ---------------------------------------------------\n");

	print_bad_events();
	printf("\n");

	perf_session__delete(session);
}

static struct trace_sched_handler map_ops = {
	.wakeup_event		= NULL,
	.switch_event		= map_switch_event,
	.runtime_event		= NULL,
	.fork_event		= NULL,
};

static void __cmd_map(void)
{
	max_cpu = sysconf(_SC_NPROCESSORS_CONF);

	setup_pager();
	read_events(true, NULL);
	print_bad_events();
}

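/*
 * Replay pipeline: calibrate the timing overheads, rebuild the
 * per-task event streams from the trace, print statistics, spawn one
 * thread per recorded task and re-run the workload replay_repeat
 * times.
 */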
static void __cmd_replay(void)
{
	unsigned long i;

	calibrate_run_measurement_overhead();
	calibrate_sleep_measurement_overhead();

	test_calibrations();

	read_events(true, NULL);

	printf("nr_run_events:        %ld\n", nr_run_events);
	printf("nr_sleep_events:      %ld\n", nr_sleep_events);
	printf("nr_wakeup_events:     %ld\n", nr_wakeup_events);

	if (targetless_wakeups)
		printf("target-less wakeups:  %ld\n", targetless_wakeups);
	if (multitarget_wakeups)
		printf("multi-target wakeups: %ld\n", multitarget_wakeups);
	if (nr_run_events_optimized)
		printf("run atoms optimized:  %ld\n",
			nr_run_events_optimized);

	print_task_traces();
	add_cross_task_wakeups();

	create_tasks();
	printf("------------------------------------------------------------\n");
	for (i = 0; i < replay_repeat; i++)
		run_one_test();
}


static const char * const sched_usage[] = {
	"perf sched [<options>] {record|latency|map|replay|script}",
	NULL
};

static const struct option sched_options[] = {
	OPT_STRING('i', "input", &input_name, "file",
		    "input file name"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
};

static const char * const latency_usage[] = {
	"perf sched latency [<options>]",
	NULL
};

static const struct option latency_options[] = {
	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
		   "sort by key(s): runtime, switch, avg, max"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_INTEGER('C', "CPU", &profile_cpu,
		    "CPU to profile on"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
};

static const char * const replay_usage[] = {
	"perf sched replay [<options>]",
	NULL
};

static const struct option replay_options[] = {
	OPT_UINTEGER('r', "repeat", &replay_repeat,
		     "repeat the workload replay N times (-1: infinite)"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
};

static void setup_sorting(void)
{
	char *tmp, *tok, *str = strdup(sort_order);

	for (tok = strtok_r(str, ", ", &tmp);
			tok; tok = strtok_r(NULL, ", ", &tmp)) {
		if (sort_dimension__add(tok, &sort_list) < 0) {
			error("Unknown --sort key: `%s'", tok);
			usage_with_options(latency_usage, latency_options);
		}
	}

	free(str);

	sort_dimension__add("pid", &cmp_pid);
}

static const char *record_args[] = {
	"record",
	"-a",
	"-R",
	"-f",
	"-m", "1024",
	"-c", "1",
	"-e", "sched:sched_switch",
	"-e", "sched:sched_stat_wait",
	"-e", "sched:sched_stat_sleep",
	"-e", "sched:sched_stat_iowait",
	"-e", "sched:sched_stat_runtime",
	"-e", "sched:sched_process_exit",
	"-e", "sched:sched_process_fork",
	"-e", "sched:sched_wakeup",
	"-e", "sched:sched_migrate_task",
};

static int __cmd_record(int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;

	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL)
		return -ENOMEM;

	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[i] = strdup(record_args[i]);

	for (j = 1; j < (unsigned int)argc; j++, i++)
		rec_argv[i] = argv[j];

	BUG_ON(i != rec_argc);

	return cmd_record(i, rec_argv, NULL);
}

int cmd_sched(int argc, const char **argv, const char *prefix __used)
{
	argc = parse_options(argc, argv, sched_options, sched_usage,
			     PARSE_OPT_STOP_AT_NON_OPTION);
	if (!argc)
		usage_with_options(sched_usage, sched_options);

	/*
	 * Aliased to 'perf script' for now:
	 */
	if (!strcmp(argv[0], "script"))
		return cmd_script(argc, argv, prefix);

	symbol__init();
	if (!strncmp(argv[0], "rec", 3)) {
		return __cmd_record(argc, argv);
	} else if (!strncmp(argv[0], "lat", 3)) {
		trace_handler = &lat_ops;
		if (argc > 1) {
			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
			if (argc)
				usage_with_options(latency_usage, latency_options);
		}
		setup_sorting();
		__cmd_lat();
	} else if (!strcmp(argv[0], "map")) {
		trace_handler = &map_ops;
		setup_sorting();
		__cmd_map();
	} else if (!strncmp(argv[0], "rep", 3)) {
		trace_handler = &replay_ops;
		if (argc) {
			argc = parse_options(argc, argv, replay_options, replay_usage, 0);
			if (argc)
				usage_with_options(replay_usage, replay_options);
		}
		__cmd_replay();
	} else {
		usage_with_options(sched_usage, sched_options);
	}

	return 0;
}