1 // SPDX-License-Identifier: GPL-2.0 2 #include "builtin.h" 3 #include "perf.h" 4 #include "perf-sys.h" 5 6 #include "util/cpumap.h" 7 #include "util/evlist.h" 8 #include "util/evsel.h" 9 #include "util/evsel_fprintf.h" 10 #include "util/mutex.h" 11 #include "util/symbol.h" 12 #include "util/thread.h" 13 #include "util/header.h" 14 #include "util/session.h" 15 #include "util/tool.h" 16 #include "util/cloexec.h" 17 #include "util/thread_map.h" 18 #include "util/color.h" 19 #include "util/stat.h" 20 #include "util/string2.h" 21 #include "util/callchain.h" 22 #include "util/time-utils.h" 23 24 #include <subcmd/pager.h> 25 #include <subcmd/parse-options.h> 26 #include "util/trace-event.h" 27 28 #include "util/debug.h" 29 #include "util/event.h" 30 #include "util/util.h" 31 #include "util/synthetic-events.h" 32 #include "util/target.h" 33 34 #include <linux/kernel.h> 35 #include <linux/log2.h> 36 #include <linux/zalloc.h> 37 #include <sys/prctl.h> 38 #include <sys/resource.h> 39 #include <sys/wait.h> 40 #include <inttypes.h> 41 42 #include <errno.h> 43 #include <semaphore.h> 44 #include <pthread.h> 45 #include <math.h> 46 #include <api/fs/fs.h> 47 #include <perf/cpumap.h> 48 #include <linux/time64.h> 49 #include <linux/err.h> 50 51 #include <linux/ctype.h> 52 53 #define PR_SET_NAME 15 /* Set process name */ 54 #define MAX_CPUS 4096 55 #define COMM_LEN 20 56 #define SYM_LEN 129 57 #define MAX_PID 1024000 58 #define PID_MAX_LIMIT 4194304 /* kernel limit on 64-bit */ 59 #define MAX_PRIO 140 60 #define SEP_LEN 100 61 62 static const char *cpu_list; 63 static struct perf_cpu_map *user_requested_cpus; 64 static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); 65 66 struct sched_atom; 67 68 struct task_desc { 69 unsigned long nr; 70 unsigned long pid; 71 char comm[COMM_LEN]; 72 73 unsigned long nr_events; 74 unsigned long curr_event; 75 struct sched_atom **atoms; 76 77 pthread_t thread; 78 79 sem_t ready_for_work; 80 sem_t work_done_sem; 81 82 u64 cpu_usage; 83 }; 84 85 enum sched_event_type { 86 SCHED_EVENT_RUN, 87 SCHED_EVENT_SLEEP, 88 SCHED_EVENT_WAKEUP, 89 }; 90 91 struct sched_atom { 92 enum sched_event_type type; 93 u64 timestamp; 94 u64 duration; 95 unsigned long nr; 96 sem_t *wait_sem; 97 struct task_desc *wakee; 98 }; 99 100 enum thread_state { 101 THREAD_SLEEPING = 0, 102 THREAD_WAIT_CPU, 103 THREAD_SCHED_IN, 104 THREAD_IGNORE 105 }; 106 107 struct work_atom { 108 struct list_head list; 109 enum thread_state state; 110 u64 sched_out_time; 111 u64 wake_up_time; 112 u64 sched_in_time; 113 u64 runtime; 114 }; 115 116 struct work_atoms { 117 struct list_head work_list; 118 struct thread *thread; 119 struct rb_node node; 120 u64 max_lat; 121 u64 max_lat_start; 122 u64 max_lat_end; 123 u64 total_lat; 124 u64 nb_atoms; 125 u64 total_runtime; 126 int num_merged; 127 }; 128 129 typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *); 130 131 struct perf_sched; 132 133 struct trace_sched_handler { 134 int (*switch_event)(struct perf_sched *sched, struct perf_sample *sample, 135 struct machine *machine); 136 137 int (*runtime_event)(struct perf_sched *sched, struct perf_sample *sample, 138 struct machine *machine); 139 140 int (*wakeup_event)(struct perf_sched *sched, struct perf_sample *sample, 141 struct machine *machine); 142 143 /* PERF_RECORD_FORK event, not sched_process_fork tracepoint */ 144 int (*fork_event)(struct perf_sched *sched, union perf_event *event, 145 struct machine *machine); 146 147 int (*migrate_task_event)(struct perf_sched *sched, 148 struct perf_sample *sample, 149 struct machine *machine); 150 }; 151 152 #define COLOR_PIDS PERF_COLOR_BLUE 153 #define COLOR_CPUS PERF_COLOR_BG_RED 154 155 struct perf_sched_map { 156 DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS); 157 struct perf_cpu *comp_cpus; 158 bool comp; 159 struct perf_thread_map *color_pids; 160 const char *color_pids_str; 161 struct perf_cpu_map *color_cpus; 162 const char *color_cpus_str; 163 const char *task_name; 164 struct strlist *task_names; 165 bool fuzzy; 166 struct perf_cpu_map *cpus; 167 const char *cpus_str; 168 }; 169 170 struct perf_sched { 171 struct perf_tool tool; 172 const char *sort_order; 173 unsigned long nr_tasks; 174 struct task_desc **pid_to_task; 175 struct task_desc **tasks; 176 const struct trace_sched_handler *tp_handler; 177 struct mutex start_work_mutex; 178 struct mutex work_done_wait_mutex; 179 int profile_cpu; 180 /* 181 * Track the current task - that way we can know whether there's any 182 * weird events, such as a task being switched away that is not current. 183 */ 184 struct perf_cpu max_cpu; 185 u32 *curr_pid; 186 struct thread **curr_thread; 187 struct thread **curr_out_thread; 188 char next_shortname1; 189 char next_shortname2; 190 unsigned int replay_repeat; 191 unsigned long nr_run_events; 192 unsigned long nr_sleep_events; 193 unsigned long nr_wakeup_events; 194 unsigned long nr_sleep_corrections; 195 unsigned long nr_run_events_optimized; 196 unsigned long targetless_wakeups; 197 unsigned long multitarget_wakeups; 198 unsigned long nr_runs; 199 unsigned long nr_timestamps; 200 unsigned long nr_unordered_timestamps; 201 unsigned long nr_context_switch_bugs; 202 unsigned long nr_events; 203 unsigned long nr_lost_chunks; 204 unsigned long nr_lost_events; 205 u64 run_measurement_overhead; 206 u64 sleep_measurement_overhead; 207 u64 start_time; 208 u64 cpu_usage; 209 u64 runavg_cpu_usage; 210 u64 parent_cpu_usage; 211 u64 runavg_parent_cpu_usage; 212 u64 sum_runtime; 213 u64 sum_fluct; 214 u64 run_avg; 215 u64 all_runtime; 216 u64 all_count; 217 u64 *cpu_last_switched; 218 struct rb_root_cached atom_root, sorted_atom_root, merged_atom_root; 219 struct list_head sort_list, cmp_pid; 220 bool force; 221 bool skip_merge; 222 struct perf_sched_map map; 223 224 /* options for timehist command */ 225 bool summary; 226 bool summary_only; 227 bool idle_hist; 228 bool show_callchain; 229 unsigned int max_stack; 230 bool show_cpu_visual; 231 bool show_wakeups; 232 bool show_next; 233 bool show_migrations; 234 bool pre_migrations; 235 bool show_state; 236 bool show_prio; 237 u64 skipped_samples; 238 const char *time_str; 239 struct perf_time_interval ptime; 240 struct perf_time_interval hist_time; 241 volatile bool thread_funcs_exit; 242 const char *prio_str; 243 DECLARE_BITMAP(prio_bitmap, MAX_PRIO); 244 245 struct perf_session *session; 246 struct perf_data *data; 247 }; 248 249 /* per thread run time data */ 250 struct thread_runtime { 251 u64 last_time; /* time of previous sched in/out event */ 252 u64 dt_run; /* run time */ 253 u64 dt_sleep; /* time between CPU access by sleep (off cpu) */ 254 u64 dt_iowait; /* time between CPU access by iowait (off cpu) */ 255 u64 dt_preempt; /* time between CPU access by preempt (off cpu) */ 256 u64 dt_delay; /* time between wakeup and sched-in */ 257 u64 dt_pre_mig; /* time between migration and wakeup */ 258 u64 ready_to_run; /* time of wakeup */ 259 u64 migrated; /* time when a thread is migrated */ 260 261 struct stats run_stats; 262 u64 total_run_time; 263 u64 total_sleep_time; 264 u64 total_iowait_time; 265 u64 total_preempt_time; 266 u64 total_delay_time; 267 u64 total_pre_mig_time; 268 269 char last_state; 270 271 char shortname[3]; 272 bool comm_changed; 273 274 u64 migrations; 275 276 int prio; 277 bool color; 278 }; 279 280 /* per event run time data */ 281 struct evsel_runtime { 282 u64 *last_time; /* time this event was last seen per cpu */ 283 u32 ncpu; /* highest cpu slot allocated */ 284 }; 285 286 /* per cpu idle time data */ 287 struct idle_thread_runtime { 288 struct thread_runtime tr; 289 struct thread *last_thread; 290 struct rb_root_cached sorted_root; 291 struct callchain_root callchain; 292 struct callchain_cursor cursor; 293 }; 294 295 /* track idle times per cpu */ 296 static struct thread **idle_threads; 297 static int idle_max_cpu; 298 static char idle_comm[] = "<idle>"; 299 300 static u64 get_nsecs(void) 301 { 302 struct timespec ts; 303 304 clock_gettime(CLOCK_MONOTONIC, &ts); 305 306 return ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec; 307 } 308 309 static void burn_nsecs(struct perf_sched *sched, u64 nsecs) 310 { 311 u64 T0 = get_nsecs(), T1; 312 313 do { 314 T1 = get_nsecs(); 315 } while (T1 + sched->run_measurement_overhead < T0 + nsecs); 316 } 317 318 static void sleep_nsecs(u64 nsecs) 319 { 320 struct timespec ts; 321 322 ts.tv_nsec = nsecs % 999999999; 323 ts.tv_sec = nsecs / 999999999; 324 325 nanosleep(&ts, NULL); 326 } 327 328 static void calibrate_run_measurement_overhead(struct perf_sched *sched) 329 { 330 u64 T0, T1, delta, min_delta = NSEC_PER_SEC; 331 int i; 332 333 for (i = 0; i < 10; i++) { 334 T0 = get_nsecs(); 335 burn_nsecs(sched, 0); 336 T1 = get_nsecs(); 337 delta = T1-T0; 338 min_delta = min(min_delta, delta); 339 } 340 sched->run_measurement_overhead = min_delta; 341 342 printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta); 343 } 344 345 static void calibrate_sleep_measurement_overhead(struct perf_sched *sched) 346 { 347 u64 T0, T1, delta, min_delta = NSEC_PER_SEC; 348 int i; 349 350 for (i = 0; i < 10; i++) { 351 T0 = get_nsecs(); 352 sleep_nsecs(10000); 353 T1 = get_nsecs(); 354 delta = T1-T0; 355 min_delta = min(min_delta, delta); 356 } 357 min_delta -= 10000; 358 sched->sleep_measurement_overhead = min_delta; 359 360 printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta); 361 } 362 363 static struct sched_atom * 364 get_new_event(struct task_desc *task, u64 timestamp) 365 { 366 struct sched_atom *event = zalloc(sizeof(*event)); 367 unsigned long idx = task->nr_events; 368 size_t size; 369 struct sched_atom **atoms_p; 370 371 if (event == NULL) { 372 pr_err("ERROR: sched: failed to allocate event\n"); 373 return NULL; 374 } 375 376 event->timestamp = timestamp; 377 event->nr = idx; 378 379 size = sizeof(struct sched_atom *) * (task->nr_events + 1); 380 atoms_p = realloc(task->atoms, size); 381 if (!atoms_p) { 382 pr_err("ERROR: sched: failed to grow atoms array\n"); 383 free(event); 384 return NULL; 385 } 386 task->atoms = atoms_p; 387 task->nr_events++; 388 389 task->atoms[idx] = event; 390 391 return event; 392 } 393 394 static struct sched_atom *last_event(struct task_desc *task) 395 { 396 if (!task->nr_events) 397 return NULL; 398 399 return task->atoms[task->nr_events - 1]; 400 } 401 402 static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task, 403 u64 timestamp, u64 duration) 404 { 405 struct sched_atom *event, *curr_event = last_event(task); 406 407 /* 408 * optimize an existing RUN event by merging this one 409 * to it: 410 */ 411 if (curr_event && curr_event->type == SCHED_EVENT_RUN) { 412 sched->nr_run_events_optimized++; 413 curr_event->duration += duration; 414 return; 415 } 416 417 event = get_new_event(task, timestamp); 418 if (event == NULL) 419 return; 420 421 event->type = SCHED_EVENT_RUN; 422 event->duration = duration; 423 424 sched->nr_run_events++; 425 } 426 427 static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task, 428 u64 timestamp, struct task_desc *wakee) 429 { 430 struct sched_atom *event, *wakee_event; 431 432 event = get_new_event(task, timestamp); 433 if (event == NULL) 434 return; 435 event->type = SCHED_EVENT_WAKEUP; 436 event->wakee = wakee; 437 438 wakee_event = last_event(wakee); 439 if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) { 440 sched->targetless_wakeups++; 441 return; 442 } 443 if (wakee_event->wait_sem) { 444 sched->multitarget_wakeups++; 445 return; 446 } 447 448 wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem)); 449 if (!wakee_event->wait_sem) { 450 pr_err("ERROR: sched: failed to allocate semaphore\n"); 451 return; 452 } 453 sem_init(wakee_event->wait_sem, 0, 0); 454 event->wait_sem = wakee_event->wait_sem; 455 456 sched->nr_wakeup_events++; 457 } 458 459 static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task, 460 u64 timestamp) 461 { 462 struct sched_atom *event = get_new_event(task, timestamp); 463 464 if (event == NULL) 465 return; 466 467 event->type = SCHED_EVENT_SLEEP; 468 469 sched->nr_sleep_events++; 470 } 471 472 static struct task_desc *register_pid(struct perf_sched *sched, 473 unsigned long pid, const char *comm) 474 { 475 struct task_desc *task, **tasks_p; 476 static int pid_max; 477 478 /* perf.data is untrusted — cap pid to prevent overflow in size calculations */ 479 if (pid >= PID_MAX_LIMIT) { 480 pr_err("pid %lu exceeds limit %d, skipping\n", pid, PID_MAX_LIMIT); 481 return NULL; 482 } 483 484 if (sched->pid_to_task == NULL) { 485 if (sysctl__read_int("kernel/pid_max", &pid_max) < 0) 486 pid_max = MAX_PID; 487 sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *)); 488 if (sched->pid_to_task == NULL) 489 return NULL; 490 } 491 if (pid >= (unsigned long)pid_max) { 492 void *p = realloc(sched->pid_to_task, (pid + 1) * sizeof(struct task_desc *)); 493 494 if (p == NULL) 495 return NULL; 496 sched->pid_to_task = p; 497 while (pid >= (unsigned long)pid_max) 498 sched->pid_to_task[pid_max++] = NULL; 499 } 500 501 task = sched->pid_to_task[pid]; 502 503 if (task) 504 return task; 505 506 task = zalloc(sizeof(*task)); 507 if (task == NULL) 508 return NULL; 509 task->pid = pid; 510 if (comm) 511 strlcpy(task->comm, comm, sizeof(task->comm)); 512 /* 513 * every task starts in sleeping state - this gets ignored 514 * if there's no wakeup pointing to this sleep state: 515 */ 516 add_sched_event_sleep(sched, task, 0); 517 518 sched->pid_to_task[pid] = task; 519 tasks_p = realloc(sched->tasks, (sched->nr_tasks + 1) * sizeof(struct task_desc *)); 520 if (!tasks_p) 521 return NULL; 522 sched->tasks = tasks_p; 523 sched->tasks[sched->nr_tasks] = task; 524 task->nr = sched->nr_tasks++; 525 526 if (verbose > 0) 527 printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm); 528 529 return task; 530 } 531 532 533 static void print_task_traces(struct perf_sched *sched) 534 { 535 struct task_desc *task; 536 unsigned long i; 537 538 for (i = 0; i < sched->nr_tasks; i++) { 539 task = sched->tasks[i]; 540 printf("task %6ld (%20s:%10ld), nr_events: %ld\n", 541 task->nr, task->comm, task->pid, task->nr_events); 542 } 543 } 544 545 static void add_cross_task_wakeups(struct perf_sched *sched) 546 { 547 struct task_desc *task1, *task2; 548 unsigned long i, j; 549 550 for (i = 0; i < sched->nr_tasks; i++) { 551 task1 = sched->tasks[i]; 552 j = i + 1; 553 if (j == sched->nr_tasks) 554 j = 0; 555 task2 = sched->tasks[j]; 556 add_sched_event_wakeup(sched, task1, 0, task2); 557 } 558 } 559 560 static void perf_sched__process_event(struct perf_sched *sched, 561 struct sched_atom *atom) 562 { 563 int ret = 0; 564 565 switch (atom->type) { 566 case SCHED_EVENT_RUN: 567 burn_nsecs(sched, atom->duration); 568 break; 569 case SCHED_EVENT_SLEEP: 570 if (atom->wait_sem) 571 ret = sem_wait(atom->wait_sem); 572 BUG_ON(ret); 573 break; 574 case SCHED_EVENT_WAKEUP: 575 if (atom->wait_sem) 576 ret = sem_post(atom->wait_sem); 577 BUG_ON(ret); 578 break; 579 default: 580 BUG_ON(1); 581 } 582 } 583 584 static u64 get_cpu_usage_nsec_parent(void) 585 { 586 struct rusage ru; 587 u64 sum; 588 int err; 589 590 err = getrusage(RUSAGE_SELF, &ru); 591 BUG_ON(err); 592 593 sum = ru.ru_utime.tv_sec * NSEC_PER_SEC + ru.ru_utime.tv_usec * NSEC_PER_USEC; 594 sum += ru.ru_stime.tv_sec * NSEC_PER_SEC + ru.ru_stime.tv_usec * NSEC_PER_USEC; 595 596 return sum; 597 } 598 599 static int self_open_counters(struct perf_sched *sched, unsigned long cur_task) 600 { 601 struct perf_event_attr attr; 602 char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE]; 603 int fd; 604 struct rlimit limit; 605 bool need_privilege = false; 606 607 memset(&attr, 0, sizeof(attr)); 608 609 attr.type = PERF_TYPE_SOFTWARE; 610 attr.config = PERF_COUNT_SW_TASK_CLOCK; 611 612 force_again: 613 fd = sys_perf_event_open(&attr, 0, -1, -1, 614 perf_event_open_cloexec_flag()); 615 616 if (fd < 0) { 617 if (errno == EMFILE) { 618 if (sched->force) { 619 BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1); 620 limit.rlim_cur += sched->nr_tasks - cur_task; 621 if (limit.rlim_cur > limit.rlim_max) { 622 limit.rlim_max = limit.rlim_cur; 623 need_privilege = true; 624 } 625 if (setrlimit(RLIMIT_NOFILE, &limit) == -1) { 626 if (need_privilege && errno == EPERM) 627 strcpy(info, "Need privilege\n"); 628 } else 629 goto force_again; 630 } else 631 strcpy(info, "Have a try with -f option\n"); 632 } 633 pr_err("Error: sys_perf_event_open() syscall returned " 634 "with %d (%s)\n%s", fd, 635 str_error_r(errno, sbuf, sizeof(sbuf)), info); 636 exit(EXIT_FAILURE); 637 } 638 return fd; 639 } 640 641 static u64 get_cpu_usage_nsec_self(int fd) 642 { 643 u64 runtime; 644 int ret; 645 646 ret = read(fd, &runtime, sizeof(runtime)); 647 BUG_ON(ret != sizeof(runtime)); 648 649 return runtime; 650 } 651 652 struct sched_thread_parms { 653 struct task_desc *task; 654 struct perf_sched *sched; 655 int fd; 656 }; 657 658 static void *thread_func(void *ctx) 659 { 660 struct sched_thread_parms *parms = ctx; 661 struct task_desc *this_task = parms->task; 662 struct perf_sched *sched = parms->sched; 663 u64 cpu_usage_0, cpu_usage_1; 664 unsigned long i, ret; 665 char comm2[22]; 666 int fd = parms->fd; 667 668 zfree(&parms); 669 670 sprintf(comm2, ":%s", this_task->comm); 671 prctl(PR_SET_NAME, comm2); 672 if (fd < 0) 673 return NULL; 674 675 while (!sched->thread_funcs_exit) { 676 ret = sem_post(&this_task->ready_for_work); 677 BUG_ON(ret); 678 mutex_lock(&sched->start_work_mutex); 679 mutex_unlock(&sched->start_work_mutex); 680 681 cpu_usage_0 = get_cpu_usage_nsec_self(fd); 682 683 for (i = 0; i < this_task->nr_events; i++) { 684 this_task->curr_event = i; 685 perf_sched__process_event(sched, this_task->atoms[i]); 686 } 687 688 cpu_usage_1 = get_cpu_usage_nsec_self(fd); 689 this_task->cpu_usage = cpu_usage_1 - cpu_usage_0; 690 ret = sem_post(&this_task->work_done_sem); 691 BUG_ON(ret); 692 693 mutex_lock(&sched->work_done_wait_mutex); 694 mutex_unlock(&sched->work_done_wait_mutex); 695 } 696 return NULL; 697 } 698 699 static void create_tasks(struct perf_sched *sched) 700 EXCLUSIVE_LOCK_FUNCTION(sched->start_work_mutex) 701 EXCLUSIVE_LOCK_FUNCTION(sched->work_done_wait_mutex) 702 { 703 struct task_desc *task; 704 pthread_attr_t attr; 705 unsigned long i; 706 int err; 707 708 err = pthread_attr_init(&attr); 709 BUG_ON(err); 710 err = pthread_attr_setstacksize(&attr, 711 (size_t) max(16 * 1024, (int)PTHREAD_STACK_MIN)); 712 BUG_ON(err); 713 mutex_lock(&sched->start_work_mutex); 714 mutex_lock(&sched->work_done_wait_mutex); 715 for (i = 0; i < sched->nr_tasks; i++) { 716 struct sched_thread_parms *parms = malloc(sizeof(*parms)); 717 BUG_ON(parms == NULL); 718 parms->task = task = sched->tasks[i]; 719 parms->sched = sched; 720 parms->fd = self_open_counters(sched, i); 721 sem_init(&task->ready_for_work, 0, 0); 722 sem_init(&task->work_done_sem, 0, 0); 723 task->curr_event = 0; 724 err = pthread_create(&task->thread, &attr, thread_func, parms); 725 BUG_ON(err); 726 } 727 } 728 729 static void destroy_tasks(struct perf_sched *sched) 730 UNLOCK_FUNCTION(sched->start_work_mutex) 731 UNLOCK_FUNCTION(sched->work_done_wait_mutex) 732 { 733 struct task_desc *task; 734 unsigned long i; 735 int err; 736 737 mutex_unlock(&sched->start_work_mutex); 738 mutex_unlock(&sched->work_done_wait_mutex); 739 /* Get rid of threads so they won't be upset by mutex destrunction */ 740 for (i = 0; i < sched->nr_tasks; i++) { 741 task = sched->tasks[i]; 742 err = pthread_join(task->thread, NULL); 743 BUG_ON(err); 744 sem_destroy(&task->ready_for_work); 745 sem_destroy(&task->work_done_sem); 746 } 747 } 748 749 static void wait_for_tasks(struct perf_sched *sched) 750 EXCLUSIVE_LOCKS_REQUIRED(sched->work_done_wait_mutex) 751 EXCLUSIVE_LOCKS_REQUIRED(sched->start_work_mutex) 752 { 753 u64 cpu_usage_0, cpu_usage_1; 754 struct task_desc *task; 755 unsigned long i, ret; 756 757 sched->start_time = get_nsecs(); 758 sched->cpu_usage = 0; 759 mutex_unlock(&sched->work_done_wait_mutex); 760 761 for (i = 0; i < sched->nr_tasks; i++) { 762 task = sched->tasks[i]; 763 ret = sem_wait(&task->ready_for_work); 764 BUG_ON(ret); 765 sem_init(&task->ready_for_work, 0, 0); 766 } 767 mutex_lock(&sched->work_done_wait_mutex); 768 769 cpu_usage_0 = get_cpu_usage_nsec_parent(); 770 771 mutex_unlock(&sched->start_work_mutex); 772 773 for (i = 0; i < sched->nr_tasks; i++) { 774 task = sched->tasks[i]; 775 ret = sem_wait(&task->work_done_sem); 776 BUG_ON(ret); 777 sem_init(&task->work_done_sem, 0, 0); 778 sched->cpu_usage += task->cpu_usage; 779 task->cpu_usage = 0; 780 } 781 782 cpu_usage_1 = get_cpu_usage_nsec_parent(); 783 if (!sched->runavg_cpu_usage) 784 sched->runavg_cpu_usage = sched->cpu_usage; 785 sched->runavg_cpu_usage = (sched->runavg_cpu_usage * (sched->replay_repeat - 1) + sched->cpu_usage) / sched->replay_repeat; 786 787 sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0; 788 if (!sched->runavg_parent_cpu_usage) 789 sched->runavg_parent_cpu_usage = sched->parent_cpu_usage; 790 sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) + 791 sched->parent_cpu_usage)/sched->replay_repeat; 792 793 mutex_lock(&sched->start_work_mutex); 794 795 for (i = 0; i < sched->nr_tasks; i++) { 796 task = sched->tasks[i]; 797 task->curr_event = 0; 798 } 799 } 800 801 static void run_one_test(struct perf_sched *sched) 802 EXCLUSIVE_LOCKS_REQUIRED(sched->work_done_wait_mutex) 803 EXCLUSIVE_LOCKS_REQUIRED(sched->start_work_mutex) 804 { 805 u64 T0, T1, delta, avg_delta, fluct; 806 807 T0 = get_nsecs(); 808 wait_for_tasks(sched); 809 T1 = get_nsecs(); 810 811 delta = T1 - T0; 812 sched->sum_runtime += delta; 813 sched->nr_runs++; 814 815 avg_delta = sched->sum_runtime / sched->nr_runs; 816 if (delta < avg_delta) 817 fluct = avg_delta - delta; 818 else 819 fluct = delta - avg_delta; 820 sched->sum_fluct += fluct; 821 if (!sched->run_avg) 822 sched->run_avg = delta; 823 sched->run_avg = (sched->run_avg * (sched->replay_repeat - 1) + delta) / sched->replay_repeat; 824 825 printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / NSEC_PER_MSEC); 826 827 printf("ravg: %0.2f, ", (double)sched->run_avg / NSEC_PER_MSEC); 828 829 printf("cpu: %0.2f / %0.2f", 830 (double)sched->cpu_usage / NSEC_PER_MSEC, (double)sched->runavg_cpu_usage / NSEC_PER_MSEC); 831 832 #if 0 833 /* 834 * rusage statistics done by the parent, these are less 835 * accurate than the sched->sum_exec_runtime based statistics: 836 */ 837 printf(" [%0.2f / %0.2f]", 838 (double)sched->parent_cpu_usage / NSEC_PER_MSEC, 839 (double)sched->runavg_parent_cpu_usage / NSEC_PER_MSEC); 840 #endif 841 842 printf("\n"); 843 844 if (sched->nr_sleep_corrections) 845 printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections); 846 sched->nr_sleep_corrections = 0; 847 } 848 849 static void test_calibrations(struct perf_sched *sched) 850 { 851 u64 T0, T1; 852 853 T0 = get_nsecs(); 854 burn_nsecs(sched, NSEC_PER_MSEC); 855 T1 = get_nsecs(); 856 857 printf("the run test took %" PRIu64 " nsecs\n", T1 - T0); 858 859 T0 = get_nsecs(); 860 sleep_nsecs(NSEC_PER_MSEC); 861 T1 = get_nsecs(); 862 863 printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0); 864 } 865 866 static int 867 replay_wakeup_event(struct perf_sched *sched, 868 struct perf_sample *sample, 869 struct machine *machine __maybe_unused) 870 { 871 const char *comm = perf_sample__strval(sample, "comm"); 872 const u32 pid = perf_sample__intval(sample, "pid"); 873 struct task_desc *waker, *wakee; 874 875 if (verbose > 0) { 876 printf("sched_wakeup event %p\n", sample->evsel); 877 878 printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid); 879 } 880 881 waker = register_pid(sched, sample->tid, "<unknown>"); 882 wakee = register_pid(sched, pid, comm); 883 if (waker == NULL || wakee == NULL) 884 return -1; 885 886 add_sched_event_wakeup(sched, waker, sample->time, wakee); 887 return 0; 888 } 889 890 static int replay_switch_event(struct perf_sched *sched, 891 struct perf_sample *sample, 892 struct machine *machine __maybe_unused) 893 { 894 const char *prev_comm = perf_sample__strval(sample, "prev_comm"), 895 *next_comm = perf_sample__strval(sample, "next_comm"); 896 const u32 prev_pid = perf_sample__intval(sample, "prev_pid"), 897 next_pid = perf_sample__intval(sample, "next_pid"); 898 struct task_desc *prev, __maybe_unused *next; 899 u64 timestamp0, timestamp = sample->time; 900 int cpu = sample->cpu; 901 s64 delta; 902 903 if (verbose > 0) 904 printf("sched_switch event %p\n", sample->evsel); 905 906 if (cpu >= MAX_CPUS || cpu < 0) 907 return 0; 908 909 timestamp0 = sched->cpu_last_switched[cpu]; 910 if (timestamp0) 911 delta = timestamp - timestamp0; 912 else 913 delta = 0; 914 915 if (delta < 0) { 916 pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta); 917 return -1; 918 } 919 920 pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n", 921 prev_comm, prev_pid, next_comm, next_pid, delta); 922 923 prev = register_pid(sched, prev_pid, prev_comm); 924 next = register_pid(sched, next_pid, next_comm); 925 if (prev == NULL || next == NULL) 926 return -1; 927 928 sched->cpu_last_switched[cpu] = timestamp; 929 930 add_sched_event_run(sched, prev, timestamp, delta); 931 add_sched_event_sleep(sched, prev, timestamp); 932 933 return 0; 934 } 935 936 static int replay_fork_event(struct perf_sched *sched, 937 union perf_event *event, 938 struct machine *machine) 939 { 940 struct thread *child, *parent; 941 942 child = machine__findnew_thread(machine, event->fork.pid, 943 event->fork.tid); 944 parent = machine__findnew_thread(machine, event->fork.ppid, 945 event->fork.ptid); 946 947 if (child == NULL || parent == NULL) { 948 pr_debug("thread does not exist on fork event: child %p, parent %p\n", 949 child, parent); 950 goto out_put; 951 } 952 953 if (verbose > 0) { 954 printf("fork event\n"); 955 printf("... parent: %s/%d\n", thread__comm_str(parent), thread__tid(parent)); 956 printf("... child: %s/%d\n", thread__comm_str(child), thread__tid(child)); 957 } 958 959 register_pid(sched, thread__tid(parent), thread__comm_str(parent)); 960 register_pid(sched, thread__tid(child), thread__comm_str(child)); 961 out_put: 962 thread__put(child); 963 thread__put(parent); 964 return 0; 965 } 966 967 struct sort_dimension { 968 const char *name; 969 sort_fn_t cmp; 970 struct list_head list; 971 }; 972 973 static inline void init_prio(struct thread_runtime *r) 974 { 975 r->prio = -1; 976 } 977 978 /* 979 * handle runtime stats saved per thread 980 */ 981 static struct thread_runtime *thread__init_runtime(struct thread *thread) 982 { 983 struct thread_runtime *r; 984 985 r = zalloc(sizeof(struct thread_runtime)); 986 if (!r) 987 return NULL; 988 989 init_stats(&r->run_stats); 990 init_prio(r); 991 thread__set_priv(thread, r); 992 993 return r; 994 } 995 996 static struct thread_runtime *thread__get_runtime(struct thread *thread) 997 { 998 struct thread_runtime *tr; 999 1000 tr = thread__priv(thread); 1001 if (tr == NULL) { 1002 tr = thread__init_runtime(thread); 1003 if (tr == NULL) 1004 pr_debug("Failed to malloc memory for runtime data.\n"); 1005 } 1006 1007 return tr; 1008 } 1009 1010 static int 1011 thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r) 1012 { 1013 struct sort_dimension *sort; 1014 int ret = 0; 1015 1016 BUG_ON(list_empty(list)); 1017 1018 list_for_each_entry(sort, list, list) { 1019 ret = sort->cmp(l, r); 1020 if (ret) 1021 return ret; 1022 } 1023 1024 return ret; 1025 } 1026 1027 static struct work_atoms * 1028 thread_atoms_search(struct rb_root_cached *root, struct thread *thread, 1029 struct list_head *sort_list) 1030 { 1031 struct rb_node *node = root->rb_root.rb_node; 1032 struct work_atoms key = { .thread = thread }; 1033 1034 while (node) { 1035 struct work_atoms *atoms; 1036 int cmp; 1037 1038 atoms = container_of(node, struct work_atoms, node); 1039 1040 cmp = thread_lat_cmp(sort_list, &key, atoms); 1041 if (cmp > 0) 1042 node = node->rb_left; 1043 else if (cmp < 0) 1044 node = node->rb_right; 1045 else { 1046 BUG_ON(!RC_CHK_EQUAL(thread, atoms->thread)); 1047 return atoms; 1048 } 1049 } 1050 return NULL; 1051 } 1052 1053 static void 1054 __thread_latency_insert(struct rb_root_cached *root, struct work_atoms *data, 1055 struct list_head *sort_list) 1056 { 1057 struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL; 1058 bool leftmost = true; 1059 1060 while (*new) { 1061 struct work_atoms *this; 1062 int cmp; 1063 1064 this = container_of(*new, struct work_atoms, node); 1065 parent = *new; 1066 1067 cmp = thread_lat_cmp(sort_list, data, this); 1068 1069 if (cmp > 0) 1070 new = &((*new)->rb_left); 1071 else { 1072 new = &((*new)->rb_right); 1073 leftmost = false; 1074 } 1075 } 1076 1077 rb_link_node(&data->node, parent, new); 1078 rb_insert_color_cached(&data->node, root, leftmost); 1079 } 1080 1081 static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread) 1082 { 1083 struct work_atoms *atoms = zalloc(sizeof(*atoms)); 1084 if (!atoms) { 1085 pr_err("No memory at %s\n", __func__); 1086 return -1; 1087 } 1088 1089 atoms->thread = thread__get(thread); 1090 INIT_LIST_HEAD(&atoms->work_list); 1091 __thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid); 1092 return 0; 1093 } 1094 1095 static int 1096 add_sched_out_event(struct work_atoms *atoms, 1097 char run_state, 1098 u64 timestamp) 1099 { 1100 struct work_atom *atom = zalloc(sizeof(*atom)); 1101 if (!atom) { 1102 pr_err("Non memory at %s", __func__); 1103 return -1; 1104 } 1105 1106 atom->sched_out_time = timestamp; 1107 1108 if (run_state == 'R') { 1109 atom->state = THREAD_WAIT_CPU; 1110 atom->wake_up_time = atom->sched_out_time; 1111 } 1112 1113 list_add_tail(&atom->list, &atoms->work_list); 1114 return 0; 1115 } 1116 1117 static void 1118 add_runtime_event(struct work_atoms *atoms, u64 delta, 1119 u64 timestamp __maybe_unused) 1120 { 1121 struct work_atom *atom; 1122 1123 BUG_ON(list_empty(&atoms->work_list)); 1124 1125 atom = list_entry(atoms->work_list.prev, struct work_atom, list); 1126 1127 atom->runtime += delta; 1128 atoms->total_runtime += delta; 1129 } 1130 1131 static void 1132 add_sched_in_event(struct work_atoms *atoms, u64 timestamp) 1133 { 1134 struct work_atom *atom; 1135 u64 delta; 1136 1137 if (list_empty(&atoms->work_list)) 1138 return; 1139 1140 atom = list_entry(atoms->work_list.prev, struct work_atom, list); 1141 1142 if (atom->state != THREAD_WAIT_CPU) 1143 return; 1144 1145 if (timestamp < atom->wake_up_time) { 1146 atom->state = THREAD_IGNORE; 1147 return; 1148 } 1149 1150 atom->state = THREAD_SCHED_IN; 1151 atom->sched_in_time = timestamp; 1152 1153 delta = atom->sched_in_time - atom->wake_up_time; 1154 atoms->total_lat += delta; 1155 if (delta > atoms->max_lat) { 1156 atoms->max_lat = delta; 1157 atoms->max_lat_start = atom->wake_up_time; 1158 atoms->max_lat_end = timestamp; 1159 } 1160 atoms->nb_atoms++; 1161 } 1162 1163 static void free_work_atoms(struct work_atoms *atoms) 1164 { 1165 struct work_atom *atom, *tmp; 1166 1167 if (atoms == NULL) 1168 return; 1169 1170 list_for_each_entry_safe(atom, tmp, &atoms->work_list, list) { 1171 list_del(&atom->list); 1172 free(atom); 1173 } 1174 thread__zput(atoms->thread); 1175 free(atoms); 1176 } 1177 1178 static int latency_switch_event(struct perf_sched *sched, 1179 struct perf_sample *sample, 1180 struct machine *machine) 1181 { 1182 const u32 prev_pid = perf_sample__intval(sample, "prev_pid"), 1183 next_pid = perf_sample__intval(sample, "next_pid"); 1184 const char prev_state = perf_sample__taskstate(sample, "prev_state"); 1185 struct work_atoms *out_events, *in_events; 1186 struct thread *sched_out, *sched_in; 1187 u64 timestamp0, timestamp = sample->time; 1188 int cpu = sample->cpu, err = -1; 1189 s64 delta; 1190 1191 /* perf.data is untrusted input — CPU may be absent or corrupted */ 1192 if (cpu >= MAX_CPUS || cpu < 0) { 1193 pr_warning("WARNING: at offset %#" PRIx64 ": out-of-bound sample CPU %d, skipping sample\n", 1194 sample->file_offset, cpu); 1195 return 0; 1196 } 1197 1198 timestamp0 = sched->cpu_last_switched[cpu]; 1199 sched->cpu_last_switched[cpu] = timestamp; 1200 if (timestamp0) 1201 delta = timestamp - timestamp0; 1202 else 1203 delta = 0; 1204 1205 if (delta < 0) { 1206 pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta); 1207 return -1; 1208 } 1209 1210 sched_out = machine__findnew_thread(machine, -1, prev_pid); 1211 sched_in = machine__findnew_thread(machine, -1, next_pid); 1212 if (sched_out == NULL || sched_in == NULL) 1213 goto out_put; 1214 1215 out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid); 1216 if (!out_events) { 1217 if (thread_atoms_insert(sched, sched_out)) 1218 goto out_put; 1219 out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid); 1220 if (!out_events) { 1221 pr_err("out-event: Internal tree error"); 1222 goto out_put; 1223 } 1224 } 1225 if (add_sched_out_event(out_events, prev_state, timestamp)) 1226 goto out_put; 1227 1228 in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid); 1229 if (!in_events) { 1230 if (thread_atoms_insert(sched, sched_in)) 1231 goto out_put; 1232 in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid); 1233 if (!in_events) { 1234 pr_err("in-event: Internal tree error"); 1235 goto out_put; 1236 } 1237 /* 1238 * Take came in we have not heard about yet, 1239 * add in an initial atom in runnable state: 1240 */ 1241 if (add_sched_out_event(in_events, 'R', timestamp)) 1242 goto out_put; 1243 } 1244 add_sched_in_event(in_events, timestamp); 1245 err = 0; 1246 out_put: 1247 thread__put(sched_out); 1248 thread__put(sched_in); 1249 return err; 1250 } 1251 1252 static int latency_runtime_event(struct perf_sched *sched, 1253 struct perf_sample *sample, 1254 struct machine *machine) 1255 { 1256 const u32 pid = perf_sample__intval(sample, "pid"); 1257 const u64 runtime = perf_sample__intval(sample, "runtime"); 1258 struct thread *thread = machine__findnew_thread(machine, -1, pid); 1259 struct work_atoms *atoms; 1260 u64 timestamp = sample->time; 1261 int cpu = sample->cpu, err = -1; 1262 1263 if (thread == NULL) 1264 return -1; 1265 1266 atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid); 1267 1268 /* perf.data is untrusted input — CPU may be absent or corrupted */ 1269 if (cpu >= MAX_CPUS || cpu < 0) { 1270 pr_warning("WARNING: at offset %#" PRIx64 ": out-of-bound sample CPU %d, skipping sample\n", 1271 sample->file_offset, cpu); 1272 err = 0; 1273 goto out_put; 1274 } 1275 if (!atoms) { 1276 if (thread_atoms_insert(sched, thread)) 1277 goto out_put; 1278 atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid); 1279 if (!atoms) { 1280 pr_err("in-event: Internal tree error"); 1281 goto out_put; 1282 } 1283 if (add_sched_out_event(atoms, 'R', timestamp)) 1284 goto out_put; 1285 } 1286 1287 add_runtime_event(atoms, runtime, timestamp); 1288 err = 0; 1289 out_put: 1290 thread__put(thread); 1291 return err; 1292 } 1293 1294 static int latency_wakeup_event(struct perf_sched *sched, 1295 struct perf_sample *sample, 1296 struct machine *machine) 1297 { 1298 const u32 pid = perf_sample__intval(sample, "pid"); 1299 struct work_atoms *atoms; 1300 struct work_atom *atom; 1301 struct thread *wakee; 1302 u64 timestamp = sample->time; 1303 int err = -1; 1304 1305 wakee = machine__findnew_thread(machine, -1, pid); 1306 if (wakee == NULL) 1307 return -1; 1308 atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid); 1309 if (!atoms) { 1310 if (thread_atoms_insert(sched, wakee)) 1311 goto out_put; 1312 atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid); 1313 if (!atoms) { 1314 pr_err("wakeup-event: Internal tree error"); 1315 goto out_put; 1316 } 1317 if (add_sched_out_event(atoms, 'S', timestamp)) 1318 goto out_put; 1319 } 1320 1321 BUG_ON(list_empty(&atoms->work_list)); 1322 1323 atom = list_entry(atoms->work_list.prev, struct work_atom, list); 1324 1325 /* 1326 * As we do not guarantee the wakeup event happens when 1327 * task is out of run queue, also may happen when task is 1328 * on run queue and wakeup only change ->state to TASK_RUNNING, 1329 * then we should not set the ->wake_up_time when wake up a 1330 * task which is on run queue. 1331 * 1332 * You WILL be missing events if you've recorded only 1333 * one CPU, or are only looking at only one, so don't 1334 * skip in this case. 1335 */ 1336 if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING) 1337 goto out_ok; 1338 1339 sched->nr_timestamps++; 1340 if (atom->sched_out_time > timestamp) { 1341 sched->nr_unordered_timestamps++; 1342 goto out_ok; 1343 } 1344 1345 atom->state = THREAD_WAIT_CPU; 1346 atom->wake_up_time = timestamp; 1347 out_ok: 1348 err = 0; 1349 out_put: 1350 thread__put(wakee); 1351 return err; 1352 } 1353 1354 static int latency_migrate_task_event(struct perf_sched *sched, 1355 struct perf_sample *sample, 1356 struct machine *machine) 1357 { 1358 const u32 pid = perf_sample__intval(sample, "pid"); 1359 u64 timestamp = sample->time; 1360 struct work_atoms *atoms; 1361 struct work_atom *atom; 1362 struct thread *migrant; 1363 int err = -1; 1364 1365 /* 1366 * Only need to worry about migration when profiling one CPU. 1367 */ 1368 if (sched->profile_cpu == -1) 1369 return 0; 1370 1371 migrant = machine__findnew_thread(machine, -1, pid); 1372 if (migrant == NULL) 1373 return -1; 1374 atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid); 1375 if (!atoms) { 1376 if (thread_atoms_insert(sched, migrant)) 1377 goto out_put; 1378 register_pid(sched, thread__tid(migrant), thread__comm_str(migrant)); 1379 atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid); 1380 if (!atoms) { 1381 pr_err("migration-event: Internal tree error"); 1382 goto out_put; 1383 } 1384 if (add_sched_out_event(atoms, 'R', timestamp)) 1385 goto out_put; 1386 } 1387 1388 BUG_ON(list_empty(&atoms->work_list)); 1389 1390 atom = list_entry(atoms->work_list.prev, struct work_atom, list); 1391 atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp; 1392 1393 sched->nr_timestamps++; 1394 1395 if (atom->sched_out_time > timestamp) 1396 sched->nr_unordered_timestamps++; 1397 err = 0; 1398 out_put: 1399 thread__put(migrant); 1400 return err; 1401 } 1402 1403 static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list) 1404 { 1405 int i; 1406 int ret; 1407 u64 avg; 1408 char max_lat_start[32], max_lat_end[32]; 1409 1410 if (!work_list->nb_atoms) 1411 return; 1412 /* 1413 * Ignore idle threads: 1414 */ 1415 if (!strcmp(thread__comm_str(work_list->thread), "swapper")) 1416 return; 1417 1418 sched->all_runtime += work_list->total_runtime; 1419 sched->all_count += work_list->nb_atoms; 1420 1421 if (work_list->num_merged > 1) { 1422 ret = printf(" %s:(%d) ", thread__comm_str(work_list->thread), 1423 work_list->num_merged); 1424 } else { 1425 ret = printf(" %s:%d ", thread__comm_str(work_list->thread), 1426 thread__tid(work_list->thread)); 1427 } 1428 1429 for (i = 0; i < 24 - ret; i++) 1430 printf(" "); 1431 1432 avg = work_list->total_lat / work_list->nb_atoms; 1433 timestamp__scnprintf_usec(work_list->max_lat_start, max_lat_start, sizeof(max_lat_start)); 1434 timestamp__scnprintf_usec(work_list->max_lat_end, max_lat_end, sizeof(max_lat_end)); 1435 1436 printf("|%11.3f ms |%9" PRIu64 " | avg:%8.3f ms | max:%8.3f ms | max start: %12s s | max end: %12s s\n", 1437 (double)work_list->total_runtime / NSEC_PER_MSEC, 1438 work_list->nb_atoms, (double)avg / NSEC_PER_MSEC, 1439 (double)work_list->max_lat / NSEC_PER_MSEC, 1440 max_lat_start, max_lat_end); 1441 } 1442 1443 static int pid_cmp(struct work_atoms *l, struct work_atoms *r) 1444 { 1445 pid_t l_tid, r_tid; 1446 1447 if (RC_CHK_EQUAL(l->thread, r->thread)) 1448 return 0; 1449 l_tid = thread__tid(l->thread); 1450 r_tid = thread__tid(r->thread); 1451 if (l_tid < r_tid) 1452 return -1; 1453 if (l_tid > r_tid) 1454 return 1; 1455 return (int)(RC_CHK_ACCESS(l->thread) - RC_CHK_ACCESS(r->thread)); 1456 } 1457 1458 static int avg_cmp(struct work_atoms *l, struct work_atoms *r) 1459 { 1460 u64 avgl, avgr; 1461 1462 if (!l->nb_atoms) 1463 return -1; 1464 1465 if (!r->nb_atoms) 1466 return 1; 1467 1468 avgl = l->total_lat / l->nb_atoms; 1469 avgr = r->total_lat / r->nb_atoms; 1470 1471 if (avgl < avgr) 1472 return -1; 1473 if (avgl > avgr) 1474 return 1; 1475 1476 return 0; 1477 } 1478 1479 static int max_cmp(struct work_atoms *l, struct work_atoms *r) 1480 { 1481 if (l->max_lat < r->max_lat) 1482 return -1; 1483 if (l->max_lat > r->max_lat) 1484 return 1; 1485 1486 return 0; 1487 } 1488 1489 static int switch_cmp(struct work_atoms *l, struct work_atoms *r) 1490 { 1491 if (l->nb_atoms < r->nb_atoms) 1492 return -1; 1493 if (l->nb_atoms > r->nb_atoms) 1494 return 1; 1495 1496 return 0; 1497 } 1498 1499 static int runtime_cmp(struct work_atoms *l, struct work_atoms *r) 1500 { 1501 if (l->total_runtime < r->total_runtime) 1502 return -1; 1503 if (l->total_runtime > r->total_runtime) 1504 return 1; 1505 1506 return 0; 1507 } 1508 1509 static int sort_dimension__add(const char *tok, struct list_head *list) 1510 { 1511 size_t i; 1512 static struct sort_dimension avg_sort_dimension = { 1513 .name = "avg", 1514 .cmp = avg_cmp, 1515 }; 1516 static struct sort_dimension max_sort_dimension = { 1517 .name = "max", 1518 .cmp = max_cmp, 1519 }; 1520 static struct sort_dimension pid_sort_dimension = { 1521 .name = "pid", 1522 .cmp = pid_cmp, 1523 }; 1524 static struct sort_dimension runtime_sort_dimension = { 1525 .name = "runtime", 1526 .cmp = runtime_cmp, 1527 }; 1528 static struct sort_dimension switch_sort_dimension = { 1529 .name = "switch", 1530 .cmp = switch_cmp, 1531 }; 1532 struct sort_dimension *available_sorts[] = { 1533 &pid_sort_dimension, 1534 &avg_sort_dimension, 1535 &max_sort_dimension, 1536 &switch_sort_dimension, 1537 &runtime_sort_dimension, 1538 }; 1539 1540 for (i = 0; i < ARRAY_SIZE(available_sorts); i++) { 1541 if (!strcmp(available_sorts[i]->name, tok)) { 1542 list_add_tail(&available_sorts[i]->list, list); 1543 1544 return 0; 1545 } 1546 } 1547 1548 return -1; 1549 } 1550 1551 static void perf_sched__sort_lat(struct perf_sched *sched) 1552 { 1553 struct rb_node *node; 1554 struct rb_root_cached *root = &sched->atom_root; 1555 again: 1556 for (;;) { 1557 struct work_atoms *data; 1558 node = rb_first_cached(root); 1559 if (!node) 1560 break; 1561 1562 rb_erase_cached(node, root); 1563 data = rb_entry(node, struct work_atoms, node); 1564 __thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list); 1565 } 1566 if (root == &sched->atom_root) { 1567 root = &sched->merged_atom_root; 1568 goto again; 1569 } 1570 } 1571 1572 static int process_sched_wakeup_event(const struct perf_tool *tool, 1573 struct perf_sample *sample, 1574 struct machine *machine) 1575 { 1576 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 1577 1578 if (sched->tp_handler->wakeup_event) 1579 return sched->tp_handler->wakeup_event(sched, sample, machine); 1580 1581 return 0; 1582 } 1583 1584 static int process_sched_wakeup_ignore(const struct perf_tool *tool __maybe_unused, 1585 struct perf_sample *sample __maybe_unused, 1586 struct machine *machine __maybe_unused) 1587 { 1588 return 0; 1589 } 1590 1591 static bool thread__has_color(struct thread *thread) 1592 { 1593 struct thread_runtime *tr = thread__priv(thread); 1594 1595 return tr != NULL && tr->color; 1596 } 1597 1598 static struct thread* 1599 map__findnew_thread(struct perf_sched *sched, struct machine *machine, pid_t pid, pid_t tid) 1600 { 1601 struct thread *thread = machine__findnew_thread(machine, pid, tid); 1602 1603 if (!sched->map.color_pids || !thread) 1604 return thread; 1605 1606 /* 1607 * Always check the color-pids map, even if thread__priv() is 1608 * already set. COMM events processed before the first sched_switch 1609 * allocate a thread_runtime via thread__get_runtime(), so priv is 1610 * non-NULL before we ever get here. Skipping the check on non-NULL 1611 * priv would prevent those threads from being colored. 1612 */ 1613 if (thread_map__has(sched->map.color_pids, tid)) { 1614 struct thread_runtime *tr = thread__get_runtime(thread); 1615 1616 if (tr) 1617 tr->color = true; 1618 } 1619 return thread; 1620 } 1621 1622 static bool sched_match_task(struct perf_sched *sched, const char *comm_str) 1623 { 1624 bool fuzzy_match = sched->map.fuzzy; 1625 struct strlist *task_names = sched->map.task_names; 1626 struct str_node *node; 1627 1628 strlist__for_each_entry(node, task_names) { 1629 bool match_found = fuzzy_match ? !!strstr(comm_str, node->s) : 1630 !strcmp(comm_str, node->s); 1631 if (match_found) 1632 return true; 1633 } 1634 1635 return false; 1636 } 1637 1638 static void print_sched_map(struct perf_sched *sched, struct perf_cpu this_cpu, int cpus_nr, 1639 const char *color, bool sched_out) 1640 { 1641 for (int i = 0; i < cpus_nr; i++) { 1642 struct perf_cpu cpu = { 1643 .cpu = sched->map.comp ? sched->map.comp_cpus[i].cpu : i, 1644 }; 1645 struct thread *curr_thread = sched->curr_thread[cpu.cpu]; 1646 struct thread *curr_out_thread = sched->curr_out_thread[cpu.cpu]; 1647 struct thread_runtime *curr_tr; 1648 const char *pid_color = color; 1649 const char *cpu_color = color; 1650 char symbol = ' '; 1651 struct thread *thread_to_check = sched_out ? curr_out_thread : curr_thread; 1652 1653 if (thread_to_check && thread__has_color(thread_to_check)) 1654 pid_color = COLOR_PIDS; 1655 1656 if (sched->map.color_cpus && perf_cpu_map__has(sched->map.color_cpus, cpu)) 1657 cpu_color = COLOR_CPUS; 1658 1659 if (cpu.cpu == this_cpu.cpu) 1660 symbol = '*'; 1661 1662 color_fprintf(stdout, cpu.cpu != this_cpu.cpu ? color : cpu_color, "%c", symbol); 1663 1664 thread_to_check = sched_out ? sched->curr_out_thread[cpu.cpu] : 1665 sched->curr_thread[cpu.cpu]; 1666 1667 if (thread_to_check) { 1668 curr_tr = thread__get_runtime(thread_to_check); 1669 if (curr_tr == NULL) 1670 return; 1671 1672 if (sched_out) { 1673 if (cpu.cpu == this_cpu.cpu) 1674 color_fprintf(stdout, color, "- "); 1675 else { 1676 curr_tr = thread__get_runtime(sched->curr_thread[cpu.cpu]); 1677 if (curr_tr != NULL) 1678 color_fprintf(stdout, pid_color, "%2s ", 1679 curr_tr->shortname); 1680 } 1681 } else 1682 color_fprintf(stdout, pid_color, "%2s ", curr_tr->shortname); 1683 } else 1684 color_fprintf(stdout, color, " "); 1685 } 1686 } 1687 1688 static int map_switch_event(struct perf_sched *sched, struct perf_sample *sample, 1689 struct machine *machine) 1690 { 1691 const u32 next_pid = perf_sample__intval(sample, "next_pid"); 1692 const u32 prev_pid = perf_sample__intval(sample, "prev_pid"); 1693 struct thread *sched_in, *sched_out; 1694 struct thread_runtime *tr; 1695 int new_shortname; 1696 u64 timestamp0, timestamp = sample->time; 1697 s64 delta; 1698 struct perf_cpu this_cpu = { 1699 .cpu = sample->cpu, 1700 }; 1701 int cpus_nr; 1702 int proceed; 1703 bool new_cpu = false; 1704 const char *color = PERF_COLOR_NORMAL; 1705 char stimestamp[32]; 1706 const char *str; 1707 int ret = -1; 1708 1709 /* perf.data is untrusted input — CPU may be absent or corrupted */ 1710 if (this_cpu.cpu >= MAX_CPUS || this_cpu.cpu < 0) { 1711 pr_warning("WARNING: at offset %#" PRIx64 ": out-of-bound sample CPU %d, skipping sample\n", 1712 sample->file_offset, this_cpu.cpu); 1713 return 0; 1714 } 1715 1716 if (this_cpu.cpu > sched->max_cpu.cpu) 1717 sched->max_cpu = this_cpu; 1718 1719 if (sched->map.comp) { 1720 cpus_nr = bitmap_weight(sched->map.comp_cpus_mask, MAX_CPUS); 1721 if (!__test_and_set_bit(this_cpu.cpu, sched->map.comp_cpus_mask)) { 1722 sched->map.comp_cpus[cpus_nr++] = this_cpu; 1723 new_cpu = true; 1724 } 1725 } else 1726 cpus_nr = sched->max_cpu.cpu + 1; 1727 1728 timestamp0 = sched->cpu_last_switched[this_cpu.cpu]; 1729 sched->cpu_last_switched[this_cpu.cpu] = timestamp; 1730 if (timestamp0) 1731 delta = timestamp - timestamp0; 1732 else 1733 delta = 0; 1734 1735 if (delta < 0) { 1736 pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta); 1737 return -1; 1738 } 1739 1740 sched_in = map__findnew_thread(sched, machine, -1, next_pid); 1741 sched_out = map__findnew_thread(sched, machine, -1, prev_pid); 1742 if (sched_in == NULL || sched_out == NULL) 1743 goto out; 1744 1745 tr = thread__get_runtime(sched_in); 1746 if (tr == NULL) 1747 goto out; 1748 1749 thread__put(sched->curr_thread[this_cpu.cpu]); 1750 thread__put(sched->curr_out_thread[this_cpu.cpu]); 1751 1752 sched->curr_thread[this_cpu.cpu] = thread__get(sched_in); 1753 sched->curr_out_thread[this_cpu.cpu] = thread__get(sched_out); 1754 1755 ret = 0; 1756 1757 str = thread__comm_str(sched_in); 1758 new_shortname = 0; 1759 if (!tr->shortname[0]) { 1760 if (!strcmp(thread__comm_str(sched_in), "swapper")) { 1761 /* 1762 * Don't allocate a letter-number for swapper:0 1763 * as a shortname. Instead, we use '.' for it. 1764 */ 1765 tr->shortname[0] = '.'; 1766 tr->shortname[1] = ' '; 1767 } else if (!sched->map.task_name || sched_match_task(sched, str)) { 1768 tr->shortname[0] = sched->next_shortname1; 1769 tr->shortname[1] = sched->next_shortname2; 1770 1771 if (sched->next_shortname1 < 'Z') { 1772 sched->next_shortname1++; 1773 } else { 1774 sched->next_shortname1 = 'A'; 1775 if (sched->next_shortname2 < '9') 1776 sched->next_shortname2++; 1777 else 1778 sched->next_shortname2 = '0'; 1779 } 1780 } else { 1781 tr->shortname[0] = '-'; 1782 tr->shortname[1] = ' '; 1783 } 1784 new_shortname = 1; 1785 } 1786 1787 if (sched->map.cpus && !perf_cpu_map__has(sched->map.cpus, this_cpu)) 1788 goto out; 1789 1790 proceed = 0; 1791 str = thread__comm_str(sched_in); 1792 /* 1793 * Check which of sched_in and sched_out matches the passed --task-name 1794 * arguments and call the corresponding print_sched_map. 1795 */ 1796 if (sched->map.task_name && !sched_match_task(sched, str)) { 1797 if (!sched_match_task(sched, thread__comm_str(sched_out))) 1798 goto out; 1799 else 1800 goto sched_out; 1801 1802 } else { 1803 str = thread__comm_str(sched_out); 1804 if (!(sched->map.task_name && !sched_match_task(sched, str))) 1805 proceed = 1; 1806 } 1807 1808 printf(" "); 1809 1810 print_sched_map(sched, this_cpu, cpus_nr, color, false); 1811 1812 timestamp__scnprintf_usec(timestamp, stimestamp, sizeof(stimestamp)); 1813 color_fprintf(stdout, color, " %12s secs ", stimestamp); 1814 if (new_shortname || tr->comm_changed || (verbose > 0 && thread__tid(sched_in))) { 1815 const char *pid_color = color; 1816 1817 if (thread__has_color(sched_in)) 1818 pid_color = COLOR_PIDS; 1819 1820 color_fprintf(stdout, pid_color, "%s => %s:%d", 1821 tr->shortname, thread__comm_str(sched_in), thread__tid(sched_in)); 1822 tr->comm_changed = false; 1823 } 1824 1825 if (sched->map.comp && new_cpu) 1826 color_fprintf(stdout, color, " (CPU %d)", this_cpu.cpu); 1827 1828 if (proceed != 1) { 1829 color_fprintf(stdout, color, "\n"); 1830 goto out; 1831 } 1832 1833 sched_out: 1834 if (sched->map.task_name) { 1835 tr = thread__get_runtime(sched->curr_out_thread[this_cpu.cpu]); 1836 if (strcmp(tr->shortname, "") == 0) 1837 goto out; 1838 1839 if (proceed == 1) 1840 color_fprintf(stdout, color, "\n"); 1841 1842 printf(" "); 1843 print_sched_map(sched, this_cpu, cpus_nr, color, true); 1844 timestamp__scnprintf_usec(timestamp, stimestamp, sizeof(stimestamp)); 1845 color_fprintf(stdout, color, " %12s secs ", stimestamp); 1846 } 1847 1848 color_fprintf(stdout, color, "\n"); 1849 1850 out: 1851 thread__put(sched_out); 1852 thread__put(sched_in); 1853 1854 return ret; 1855 } 1856 1857 static int process_sched_switch_event(const struct perf_tool *tool, 1858 struct perf_sample *sample, 1859 struct machine *machine) 1860 { 1861 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 1862 int this_cpu = sample->cpu, err = 0; 1863 u32 prev_pid = perf_sample__intval(sample, "prev_pid"), 1864 next_pid = perf_sample__intval(sample, "next_pid"); 1865 1866 /* perf.data is untrusted input — CPU may be absent or corrupted */ 1867 if (this_cpu < 0 || this_cpu >= MAX_CPUS) { 1868 pr_warning("WARNING: at offset %#" PRIx64 ": out-of-bound sample CPU %d, skipping sample\n", 1869 sample->file_offset, this_cpu); 1870 return 0; 1871 } 1872 1873 if (sched->curr_pid[this_cpu] != (u32)-1) { 1874 /* 1875 * Are we trying to switch away a PID that is 1876 * not current? 1877 */ 1878 if (sched->curr_pid[this_cpu] != prev_pid) 1879 sched->nr_context_switch_bugs++; 1880 } 1881 1882 if (sched->tp_handler->switch_event) 1883 err = sched->tp_handler->switch_event(sched, sample, machine); 1884 1885 sched->curr_pid[this_cpu] = next_pid; 1886 return err; 1887 } 1888 1889 static int process_sched_runtime_event(const struct perf_tool *tool, 1890 struct perf_sample *sample, 1891 struct machine *machine) 1892 { 1893 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 1894 1895 /* perf.data is untrusted input — CPU may be absent or corrupted */ 1896 if (sample->cpu >= MAX_CPUS) { 1897 pr_warning("WARNING: at offset %#" PRIx64 ": out-of-bound sample CPU %u, skipping sample\n", 1898 sample->file_offset, sample->cpu); 1899 return 0; 1900 } 1901 1902 if (sched->tp_handler->runtime_event) 1903 return sched->tp_handler->runtime_event(sched, sample, machine); 1904 1905 return 0; 1906 } 1907 1908 static int perf_sched__process_fork_event(const struct perf_tool *tool, 1909 union perf_event *event, 1910 struct perf_sample *sample, 1911 struct machine *machine) 1912 { 1913 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 1914 1915 /* run the fork event through the perf machinery */ 1916 perf_event__process_fork(tool, event, sample, machine); 1917 1918 /* and then run additional processing needed for this command */ 1919 if (sched->tp_handler->fork_event) 1920 return sched->tp_handler->fork_event(sched, event, machine); 1921 1922 return 0; 1923 } 1924 1925 static int process_sched_migrate_task_event(const struct perf_tool *tool, 1926 struct perf_sample *sample, 1927 struct machine *machine) 1928 { 1929 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 1930 1931 if (sched->tp_handler->migrate_task_event) 1932 return sched->tp_handler->migrate_task_event(sched, sample, machine); 1933 1934 return 0; 1935 } 1936 1937 typedef int (*tracepoint_handler)(const struct perf_tool *tool, 1938 struct perf_sample *sample, 1939 struct machine *machine); 1940 1941 static int perf_sched__process_tracepoint_sample(const struct perf_tool *tool __maybe_unused, 1942 union perf_event *event __maybe_unused, 1943 struct perf_sample *sample, 1944 struct machine *machine) 1945 { 1946 struct evsel *evsel = sample->evsel; 1947 int err = 0; 1948 1949 if (evsel->handler != NULL) { 1950 tracepoint_handler f = evsel->handler; 1951 err = f(tool, sample, machine); 1952 } 1953 1954 return err; 1955 } 1956 1957 static int perf_sched__process_comm(const struct perf_tool *tool __maybe_unused, 1958 union perf_event *event, 1959 struct perf_sample *sample, 1960 struct machine *machine) 1961 { 1962 struct thread *thread; 1963 struct thread_runtime *tr; 1964 int err; 1965 1966 err = perf_event__process_comm(tool, event, sample, machine); 1967 if (err) 1968 return err; 1969 1970 thread = machine__find_thread(machine, sample->pid, sample->tid); 1971 if (!thread) { 1972 pr_err("Internal error: can't find thread\n"); 1973 return -1; 1974 } 1975 1976 tr = thread__get_runtime(thread); 1977 if (tr == NULL) { 1978 thread__put(thread); 1979 return -1; 1980 } 1981 1982 tr->comm_changed = true; 1983 thread__put(thread); 1984 1985 return 0; 1986 } 1987 1988 static int perf_sched__read_events(struct perf_sched *sched) 1989 { 1990 struct evsel_str_handler handlers[] = { 1991 { "sched:sched_switch", process_sched_switch_event, }, 1992 { "sched:sched_stat_runtime", process_sched_runtime_event, }, 1993 { "sched:sched_wakeup", process_sched_wakeup_event, }, 1994 { "sched:sched_waking", process_sched_wakeup_event, }, 1995 { "sched:sched_wakeup_new", process_sched_wakeup_event, }, 1996 { "sched:sched_migrate_task", process_sched_migrate_task_event, }, 1997 }; 1998 struct perf_session *session; 1999 struct perf_data data = { 2000 .path = input_name, 2001 .mode = PERF_DATA_MODE_READ, 2002 .force = sched->force, 2003 }; 2004 int rc = -1; 2005 2006 session = perf_session__new(&data, &sched->tool); 2007 if (IS_ERR(session)) { 2008 pr_debug("Error creating perf session"); 2009 return PTR_ERR(session); 2010 } 2011 2012 symbol__init(perf_session__env(session)); 2013 2014 /* prefer sched_waking if it is captured */ 2015 if (evlist__find_tracepoint_by_name(session->evlist, "sched:sched_waking")) 2016 handlers[2].handler = process_sched_wakeup_ignore; 2017 2018 if (perf_session__set_tracepoints_handlers(session, handlers)) 2019 goto out_delete; 2020 2021 if (perf_session__has_traces(session, "record -R")) { 2022 int err = perf_session__process_events(session); 2023 if (err) { 2024 pr_err("Failed to process events, error %d", err); 2025 goto out_delete; 2026 } 2027 2028 sched->nr_events = session->evlist->stats.nr_events[0]; 2029 sched->nr_lost_events = session->evlist->stats.total_lost; 2030 sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST]; 2031 } 2032 2033 rc = 0; 2034 out_delete: 2035 perf_session__delete(session); 2036 return rc; 2037 } 2038 2039 /* 2040 * scheduling times are printed as msec.usec 2041 */ 2042 static inline void print_sched_time(unsigned long long nsecs, int width) 2043 { 2044 unsigned long msecs; 2045 unsigned long usecs; 2046 2047 msecs = nsecs / NSEC_PER_MSEC; 2048 nsecs -= msecs * NSEC_PER_MSEC; 2049 usecs = nsecs / NSEC_PER_USEC; 2050 printf("%*lu.%03lu ", width, msecs, usecs); 2051 } 2052 2053 /* 2054 * returns runtime data for event, allocating memory for it the 2055 * first time it is used. 2056 */ 2057 static struct evsel_runtime *evsel__get_runtime(struct evsel *evsel) 2058 { 2059 struct evsel_runtime *r = evsel->priv; 2060 2061 if (r == NULL) { 2062 r = zalloc(sizeof(struct evsel_runtime)); 2063 evsel->priv = r; 2064 } 2065 2066 return r; 2067 } 2068 2069 /* 2070 * save last time event was seen per cpu 2071 */ 2072 static void evsel__save_time(struct evsel *evsel, u64 timestamp, u32 cpu) 2073 { 2074 struct evsel_runtime *r = evsel__get_runtime(evsel); 2075 2076 if (r == NULL) 2077 return; 2078 2079 if ((cpu >= r->ncpu) || (r->last_time == NULL)) { 2080 int i, n = __roundup_pow_of_two(cpu+1); 2081 void *p = r->last_time; 2082 2083 p = realloc(r->last_time, n * sizeof(u64)); 2084 if (!p) 2085 return; 2086 2087 r->last_time = p; 2088 for (i = r->ncpu; i < n; ++i) 2089 r->last_time[i] = (u64) 0; 2090 2091 r->ncpu = n; 2092 } 2093 2094 r->last_time[cpu] = timestamp; 2095 } 2096 2097 /* returns last time this event was seen on the given cpu */ 2098 static u64 evsel__get_time(struct evsel *evsel, u32 cpu) 2099 { 2100 struct evsel_runtime *r = evsel__get_runtime(evsel); 2101 2102 if ((r == NULL) || (r->last_time == NULL) || (cpu >= r->ncpu)) 2103 return 0; 2104 2105 return r->last_time[cpu]; 2106 } 2107 2108 static void timehist__evsel_priv_destructor(void *priv) 2109 { 2110 struct evsel_runtime *r = priv; 2111 2112 if (r) { 2113 free(r->last_time); 2114 free(r); 2115 } 2116 } 2117 2118 static int comm_width = 30; 2119 2120 static char *timehist_get_commstr(struct thread *thread) 2121 { 2122 static char str[32]; 2123 const char *comm = thread__comm_str(thread); 2124 pid_t tid = thread__tid(thread); 2125 pid_t pid = thread__pid(thread); 2126 int n; 2127 2128 if (pid == 0) 2129 n = scnprintf(str, sizeof(str), "%s", comm); 2130 2131 else if (tid != pid) 2132 n = scnprintf(str, sizeof(str), "%s[%d/%d]", comm, tid, pid); 2133 2134 else 2135 n = scnprintf(str, sizeof(str), "%s[%d]", comm, tid); 2136 2137 if (n > comm_width) 2138 comm_width = n; 2139 2140 return str; 2141 } 2142 2143 /* prio field format: xxx or xxx->yyy */ 2144 #define MAX_PRIO_STR_LEN 8 2145 static char *timehist_get_priostr(struct thread *thread, 2146 struct perf_sample *sample) 2147 { 2148 static char prio_str[16]; 2149 int prev_prio = (int)perf_sample__intval(sample, "prev_prio"); 2150 struct thread_runtime *tr = thread__priv(thread); 2151 2152 if (tr->prio != prev_prio && tr->prio != -1) 2153 scnprintf(prio_str, sizeof(prio_str), "%d->%d", tr->prio, prev_prio); 2154 else 2155 scnprintf(prio_str, sizeof(prio_str), "%d", prev_prio); 2156 2157 return prio_str; 2158 } 2159 2160 static void timehist_header(struct perf_sched *sched) 2161 { 2162 u32 ncpus = sched->max_cpu.cpu + 1; 2163 u32 i, j; 2164 2165 printf("%15s %6s ", "time", "cpu"); 2166 2167 if (sched->show_cpu_visual) { 2168 printf(" "); 2169 for (i = 0, j = 0; i < ncpus; ++i) { 2170 printf("%x", j++); 2171 if (j > 15) 2172 j = 0; 2173 } 2174 printf(" "); 2175 } 2176 2177 printf(" %-*s", comm_width, "task name"); 2178 2179 if (sched->show_prio) 2180 printf(" %-*s", MAX_PRIO_STR_LEN, "prio"); 2181 2182 printf(" %9s %9s %9s", "wait time", "sch delay", "run time"); 2183 2184 if (sched->pre_migrations) 2185 printf(" %9s", "pre-mig time"); 2186 2187 if (sched->show_state) 2188 printf(" %s", "state"); 2189 2190 printf("\n"); 2191 2192 /* 2193 * units row 2194 */ 2195 printf("%15s %-6s ", "", ""); 2196 2197 if (sched->show_cpu_visual) 2198 printf(" %*s ", ncpus, ""); 2199 2200 printf(" %-*s", comm_width, "[tid/pid]"); 2201 2202 if (sched->show_prio) 2203 printf(" %-*s", MAX_PRIO_STR_LEN, ""); 2204 2205 printf(" %9s %9s %9s", "(msec)", "(msec)", "(msec)"); 2206 2207 if (sched->pre_migrations) 2208 printf(" %9s", "(msec)"); 2209 2210 printf("\n"); 2211 2212 /* 2213 * separator 2214 */ 2215 printf("%.15s %.6s ", graph_dotted_line, graph_dotted_line); 2216 2217 if (sched->show_cpu_visual) 2218 printf(" %.*s ", ncpus, graph_dotted_line); 2219 2220 printf(" %.*s", comm_width, graph_dotted_line); 2221 2222 if (sched->show_prio) 2223 printf(" %.*s", MAX_PRIO_STR_LEN, graph_dotted_line); 2224 2225 printf(" %.9s %.9s %.9s", graph_dotted_line, graph_dotted_line, graph_dotted_line); 2226 2227 if (sched->pre_migrations) 2228 printf(" %.9s", graph_dotted_line); 2229 2230 if (sched->show_state) 2231 printf(" %.5s", graph_dotted_line); 2232 2233 printf("\n"); 2234 } 2235 2236 static void timehist_print_sample(struct perf_sched *sched, 2237 struct perf_sample *sample, 2238 struct addr_location *al, 2239 struct thread *thread, 2240 u64 t, const char state) 2241 { 2242 struct thread_runtime *tr = thread__priv(thread); 2243 const char *next_comm = perf_sample__strval(sample, "next_comm"); 2244 const u32 next_pid = perf_sample__intval(sample, "next_pid"); 2245 u32 max_cpus = sched->max_cpu.cpu + 1; 2246 char tstr[64]; 2247 char nstr[30]; 2248 u64 wait_time; 2249 2250 if (cpu_list && (sample->cpu >= MAX_NR_CPUS || 2251 !test_bit(sample->cpu, cpu_bitmap))) 2252 return; 2253 2254 timestamp__scnprintf_usec(t, tstr, sizeof(tstr)); 2255 printf("%15s [%04d] ", tstr, sample->cpu); 2256 2257 if (sched->show_cpu_visual) { 2258 u32 i; 2259 char c; 2260 2261 printf(" "); 2262 for (i = 0; i < max_cpus; ++i) { 2263 /* flag idle times with 'i'; others are sched events */ 2264 if (i == sample->cpu) 2265 c = (thread__tid(thread) == 0) ? 'i' : 's'; 2266 else 2267 c = ' '; 2268 printf("%c", c); 2269 } 2270 printf(" "); 2271 } 2272 2273 if (!thread__comm_set(thread)) { 2274 const char *prev_comm = perf_sample__strval(sample, "prev_comm"); 2275 2276 thread__set_comm(thread, prev_comm, sample->time); 2277 } 2278 2279 printf(" %-*s ", comm_width, timehist_get_commstr(thread)); 2280 2281 if (sched->show_prio) 2282 printf(" %-*s ", MAX_PRIO_STR_LEN, timehist_get_priostr(thread, sample)); 2283 2284 wait_time = tr->dt_sleep + tr->dt_iowait + tr->dt_preempt; 2285 print_sched_time(wait_time, 6); 2286 2287 print_sched_time(tr->dt_delay, 6); 2288 print_sched_time(tr->dt_run, 6); 2289 if (sched->pre_migrations) 2290 print_sched_time(tr->dt_pre_mig, 6); 2291 2292 if (sched->show_state) 2293 printf(" %5c ", thread__tid(thread) == 0 ? 'I' : state); 2294 2295 if (sched->show_next) { 2296 snprintf(nstr, sizeof(nstr), "next: %s[%d]", next_comm, next_pid); 2297 printf(" %-*s", comm_width, nstr); 2298 } 2299 2300 if (sched->show_wakeups && !sched->show_next) 2301 printf(" %-*s", comm_width, ""); 2302 2303 if (thread__tid(thread) == 0) 2304 goto out; 2305 2306 if (sched->show_callchain) 2307 printf(" "); 2308 2309 sample__fprintf_sym(sample, al, 0, 2310 EVSEL__PRINT_SYM | EVSEL__PRINT_ONELINE | 2311 EVSEL__PRINT_CALLCHAIN_ARROW | 2312 EVSEL__PRINT_SKIP_IGNORED, 2313 get_tls_callchain_cursor(), symbol_conf.bt_stop_list, stdout); 2314 2315 out: 2316 printf("\n"); 2317 } 2318 2319 /* 2320 * Explanation of delta-time stats: 2321 * 2322 * t = time of current schedule out event 2323 * tprev = time of previous sched out event 2324 * also time of schedule-in event for current task 2325 * last_time = time of last sched change event for current task 2326 * (i.e, time process was last scheduled out) 2327 * ready_to_run = time of wakeup for current task 2328 * migrated = time of task migration to another CPU 2329 * 2330 * -----|-------------|-------------|-------------|-------------|----- 2331 * last ready migrated tprev t 2332 * time to run 2333 * 2334 * |---------------- dt_wait ----------------| 2335 * |--------- dt_delay ---------|-- dt_run --| 2336 * |- dt_pre_mig -| 2337 * 2338 * dt_run = run time of current task 2339 * dt_wait = time between last schedule out event for task and tprev 2340 * represents time spent off the cpu 2341 * dt_delay = time between wakeup and schedule-in of task 2342 * dt_pre_mig = time between wakeup and migration to another CPU 2343 */ 2344 2345 static void timehist_update_runtime_stats(struct thread_runtime *r, 2346 u64 t, u64 tprev) 2347 { 2348 r->dt_delay = 0; 2349 r->dt_sleep = 0; 2350 r->dt_iowait = 0; 2351 r->dt_preempt = 0; 2352 r->dt_run = 0; 2353 r->dt_pre_mig = 0; 2354 2355 if (tprev) { 2356 r->dt_run = t - tprev; 2357 if (r->ready_to_run) { 2358 if (r->ready_to_run > tprev) 2359 pr_debug("time travel: wakeup time for task > previous sched_switch event\n"); 2360 else 2361 r->dt_delay = tprev - r->ready_to_run; 2362 2363 if ((r->migrated > r->ready_to_run) && (r->migrated < tprev)) 2364 r->dt_pre_mig = r->migrated - r->ready_to_run; 2365 } 2366 2367 if (r->last_time > tprev) 2368 pr_debug("time travel: last sched out time for task > previous sched_switch event\n"); 2369 else if (r->last_time) { 2370 u64 dt_wait = tprev - r->last_time; 2371 2372 if (r->last_state == 'R') 2373 r->dt_preempt = dt_wait; 2374 else if (r->last_state == 'D') 2375 r->dt_iowait = dt_wait; 2376 else 2377 r->dt_sleep = dt_wait; 2378 } 2379 } 2380 2381 update_stats(&r->run_stats, r->dt_run); 2382 2383 r->total_run_time += r->dt_run; 2384 r->total_delay_time += r->dt_delay; 2385 r->total_sleep_time += r->dt_sleep; 2386 r->total_iowait_time += r->dt_iowait; 2387 r->total_preempt_time += r->dt_preempt; 2388 r->total_pre_mig_time += r->dt_pre_mig; 2389 } 2390 2391 static bool is_idle_sample(struct perf_sample *sample) 2392 { 2393 /* pid 0 == swapper == idle task */ 2394 if (evsel__name_is(sample->evsel, "sched:sched_switch")) 2395 return perf_sample__intval(sample, "prev_pid") == 0; 2396 2397 return sample->pid == 0; 2398 } 2399 2400 static void save_task_callchain(struct perf_sched *sched, 2401 struct perf_sample *sample, 2402 struct machine *machine) 2403 { 2404 struct callchain_cursor *cursor; 2405 struct thread *thread; 2406 2407 /* want main thread for process - has maps */ 2408 thread = machine__findnew_thread(machine, sample->pid, sample->pid); 2409 if (thread == NULL) { 2410 pr_debug("Failed to get thread for pid %d.\n", sample->pid); 2411 return; 2412 } 2413 2414 if (!sched->show_callchain || sample->callchain == NULL) { 2415 thread__put(thread); 2416 return; 2417 } 2418 2419 cursor = get_tls_callchain_cursor(); 2420 2421 if (thread__resolve_callchain(thread, cursor, sample, 2422 NULL, NULL, sched->max_stack + 2) != 0) { 2423 if (verbose > 0) 2424 pr_err("Failed to resolve callchain. Skipping\n"); 2425 2426 thread__put(thread); 2427 return; 2428 } 2429 2430 callchain_cursor_commit(cursor); 2431 thread__put(thread); 2432 2433 while (true) { 2434 struct callchain_cursor_node *node; 2435 struct symbol *sym; 2436 2437 node = callchain_cursor_current(cursor); 2438 if (node == NULL) 2439 break; 2440 2441 sym = node->ms.sym; 2442 if (sym) { 2443 if (!strcmp(sym->name, "schedule") || 2444 !strcmp(sym->name, "__schedule") || 2445 !strcmp(sym->name, "preempt_schedule")) 2446 symbol__set_ignore(sym, true); 2447 } 2448 2449 callchain_cursor_advance(cursor); 2450 } 2451 } 2452 2453 static int init_idle_thread(struct thread *thread) 2454 { 2455 struct idle_thread_runtime *itr; 2456 2457 thread__set_comm(thread, idle_comm, 0); 2458 2459 itr = zalloc(sizeof(*itr)); 2460 if (itr == NULL) 2461 return -ENOMEM; 2462 2463 init_prio(&itr->tr); 2464 init_stats(&itr->tr.run_stats); 2465 callchain_init(&itr->callchain); 2466 callchain_cursor_reset(&itr->cursor); 2467 thread__set_priv(thread, itr); 2468 2469 return 0; 2470 } 2471 2472 /* 2473 * Track idle stats per cpu by maintaining a local thread 2474 * struct for the idle task on each cpu. 2475 */ 2476 static int init_idle_threads(int ncpu) 2477 { 2478 int i, ret; 2479 2480 idle_threads = calloc(ncpu, sizeof(struct thread *)); 2481 if (!idle_threads) 2482 return -ENOMEM; 2483 2484 idle_max_cpu = ncpu; 2485 2486 /* allocate the actual thread struct if needed */ 2487 for (i = 0; i < ncpu; ++i) { 2488 idle_threads[i] = thread__new(0, 0); 2489 if (idle_threads[i] == NULL) 2490 return -ENOMEM; 2491 2492 ret = init_idle_thread(idle_threads[i]); 2493 if (ret < 0) 2494 return ret; 2495 } 2496 2497 return 0; 2498 } 2499 2500 static void free_idle_threads(void) 2501 { 2502 int i; 2503 2504 if (idle_threads == NULL) 2505 return; 2506 2507 for (i = 0; i < idle_max_cpu; ++i) { 2508 struct thread *idle = idle_threads[i]; 2509 2510 if (idle) { 2511 struct idle_thread_runtime *itr; 2512 2513 itr = thread__priv(idle); 2514 if (itr) { 2515 thread__put(itr->last_thread); 2516 free_callchain(&itr->callchain); 2517 callchain_cursor_cleanup(&itr->cursor); 2518 } 2519 2520 thread__put(idle); 2521 } 2522 } 2523 2524 free(idle_threads); 2525 } 2526 2527 static struct thread *get_idle_thread(int cpu) 2528 { 2529 /* 2530 * expand/allocate array of pointers to local thread 2531 * structs if needed 2532 */ 2533 if ((cpu >= idle_max_cpu) || (idle_threads == NULL)) { 2534 int i, j = __roundup_pow_of_two(cpu+1); 2535 void *p; 2536 2537 p = realloc(idle_threads, j * sizeof(struct thread *)); 2538 if (!p) 2539 return NULL; 2540 2541 idle_threads = (struct thread **) p; 2542 for (i = idle_max_cpu; i < j; ++i) 2543 idle_threads[i] = NULL; 2544 2545 idle_max_cpu = j; 2546 } 2547 2548 /* allocate a new thread struct if needed */ 2549 if (idle_threads[cpu] == NULL) { 2550 idle_threads[cpu] = thread__new(0, 0); 2551 2552 if (idle_threads[cpu]) { 2553 if (init_idle_thread(idle_threads[cpu]) < 0) { 2554 /* clean up so next call doesn't find a half-initialized thread */ 2555 thread__zput(idle_threads[cpu]); 2556 return NULL; 2557 } 2558 } 2559 } 2560 2561 return thread__get(idle_threads[cpu]); 2562 } 2563 2564 static void save_idle_callchain(struct perf_sched *sched, 2565 struct idle_thread_runtime *itr, 2566 struct perf_sample *sample) 2567 { 2568 struct callchain_cursor *cursor; 2569 2570 if (!sched->show_callchain || sample->callchain == NULL) 2571 return; 2572 2573 cursor = get_tls_callchain_cursor(); 2574 if (cursor == NULL) 2575 return; 2576 2577 callchain_cursor__copy(&itr->cursor, cursor); 2578 } 2579 2580 static struct thread *timehist_get_thread(struct perf_sched *sched, 2581 struct perf_sample *sample, 2582 struct machine *machine) 2583 { 2584 struct thread *thread; 2585 2586 if (is_idle_sample(sample)) { 2587 thread = get_idle_thread(sample->cpu); 2588 if (thread == NULL) 2589 pr_err("Failed to get idle thread for cpu %d.\n", sample->cpu); 2590 2591 } else { 2592 /* there were samples with tid 0 but non-zero pid */ 2593 thread = machine__findnew_thread(machine, sample->pid, 2594 sample->tid ?: sample->pid); 2595 if (thread == NULL) { 2596 pr_debug("Failed to get thread for tid %d. skipping sample.\n", 2597 sample->tid); 2598 } 2599 2600 save_task_callchain(sched, sample, machine); 2601 if (sched->idle_hist) { 2602 struct thread *idle; 2603 struct idle_thread_runtime *itr; 2604 2605 idle = get_idle_thread(sample->cpu); 2606 if (idle == NULL) { 2607 pr_err("Failed to get idle thread for cpu %d.\n", sample->cpu); 2608 thread__put(thread); 2609 return NULL; 2610 } 2611 2612 itr = thread__priv(idle); 2613 if (itr == NULL) { 2614 thread__put(idle); 2615 thread__put(thread); 2616 return NULL; 2617 } 2618 2619 thread__put(itr->last_thread); 2620 itr->last_thread = thread__get(thread); 2621 2622 /* copy task callchain when entering to idle */ 2623 if (perf_sample__intval(sample, "next_pid") == 0) 2624 save_idle_callchain(sched, itr, sample); 2625 2626 thread__put(idle); 2627 } 2628 } 2629 2630 return thread; 2631 } 2632 2633 static bool timehist_skip_sample(struct perf_sched *sched, 2634 struct thread *thread, 2635 struct perf_sample *sample) 2636 { 2637 bool rc = false; 2638 int prio = -1; 2639 struct thread_runtime *tr = NULL; 2640 2641 if (thread__is_filtered(thread)) { 2642 rc = true; 2643 sched->skipped_samples++; 2644 } 2645 2646 if (sched->prio_str) { 2647 /* 2648 * Because priority may be changed during task execution, 2649 * first read priority from prev sched_in event for current task. 2650 * If prev sched_in event is not saved, then read priority from 2651 * current task sched_out event. 2652 */ 2653 tr = thread__get_runtime(thread); 2654 if (tr && tr->prio != -1) 2655 prio = tr->prio; 2656 else if (evsel__name_is(sample->evsel, "sched:sched_switch")) 2657 prio = perf_sample__intval(sample, "prev_prio"); 2658 2659 /* negative prio means no info; out-of-range prio can't match the filter */ 2660 if (prio >= 0 && 2661 (prio >= MAX_PRIO || !test_bit(prio, sched->prio_bitmap))) { 2662 rc = true; 2663 sched->skipped_samples++; 2664 } 2665 } 2666 2667 if (sched->idle_hist) { 2668 if (!evsel__name_is(sample->evsel, "sched:sched_switch")) 2669 rc = true; 2670 else if (perf_sample__intval(sample, "prev_pid") != 0 && 2671 perf_sample__intval(sample, "next_pid") != 0) 2672 rc = true; 2673 } 2674 2675 return rc; 2676 } 2677 2678 static void timehist_print_wakeup_event(struct perf_sched *sched, 2679 struct perf_sample *sample, 2680 struct machine *machine, 2681 struct thread *awakened) 2682 { 2683 struct thread *thread; 2684 char tstr[64]; 2685 2686 thread = machine__findnew_thread(machine, sample->pid, sample->tid); 2687 if (thread == NULL) 2688 return; 2689 2690 /* show wakeup unless both awakee and awaker are filtered */ 2691 if (timehist_skip_sample(sched, thread, sample) && 2692 timehist_skip_sample(sched, awakened, sample)) { 2693 thread__put(thread); 2694 return; 2695 } 2696 2697 timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr)); 2698 printf("%15s [%04d] ", tstr, sample->cpu); 2699 if (sched->show_cpu_visual) 2700 printf(" %*s ", sched->max_cpu.cpu + 1, ""); 2701 2702 printf(" %-*s ", comm_width, timehist_get_commstr(thread)); 2703 2704 /* dt spacer */ 2705 printf(" %9s %9s %9s ", "", "", ""); 2706 2707 printf("awakened: %s", timehist_get_commstr(awakened)); 2708 2709 printf("\n"); 2710 2711 thread__put(thread); 2712 } 2713 2714 static int timehist_sched_wakeup_ignore(const struct perf_tool *tool __maybe_unused, 2715 union perf_event *event __maybe_unused, 2716 struct perf_sample *sample __maybe_unused, 2717 struct machine *machine __maybe_unused) 2718 { 2719 return 0; 2720 } 2721 2722 static int timehist_sched_wakeup_event(const struct perf_tool *tool, 2723 union perf_event *event __maybe_unused, 2724 struct perf_sample *sample, 2725 struct machine *machine) 2726 { 2727 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 2728 struct thread *thread; 2729 struct thread_runtime *tr = NULL; 2730 /* want pid of awakened task not pid in sample */ 2731 const u32 pid = perf_sample__intval(sample, "pid"); 2732 2733 thread = machine__findnew_thread(machine, 0, pid); 2734 if (thread == NULL) 2735 return -1; 2736 2737 tr = thread__get_runtime(thread); 2738 if (tr == NULL) { 2739 thread__put(thread); 2740 return -1; 2741 } 2742 2743 if (tr->ready_to_run == 0) 2744 tr->ready_to_run = sample->time; 2745 2746 /* show wakeups if requested */ 2747 if (sched->show_wakeups && 2748 !perf_time__skip_sample(&sched->ptime, sample->time)) 2749 timehist_print_wakeup_event(sched, sample, machine, thread); 2750 2751 thread__put(thread); 2752 return 0; 2753 } 2754 2755 static void timehist_print_migration_event(struct perf_sched *sched, 2756 struct perf_sample *sample, 2757 struct machine *machine, 2758 struct thread *migrated) 2759 { 2760 struct thread *thread; 2761 char tstr[64]; 2762 u32 max_cpus; 2763 u32 ocpu, dcpu; 2764 2765 if (sched->summary_only) 2766 return; 2767 2768 max_cpus = sched->max_cpu.cpu + 1; 2769 ocpu = perf_sample__intval(sample, "orig_cpu"); 2770 dcpu = perf_sample__intval(sample, "dest_cpu"); 2771 2772 thread = machine__findnew_thread(machine, sample->pid, sample->tid); 2773 if (thread == NULL) 2774 return; 2775 2776 if (timehist_skip_sample(sched, thread, sample) && 2777 timehist_skip_sample(sched, migrated, sample)) { 2778 thread__put(thread); 2779 return; 2780 } 2781 2782 timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr)); 2783 printf("%15s [%04d] ", tstr, sample->cpu); 2784 2785 if (sched->show_cpu_visual) { 2786 u32 i; 2787 char c; 2788 2789 printf(" "); 2790 for (i = 0; i < max_cpus; ++i) { 2791 c = (i == sample->cpu) ? 'm' : ' '; 2792 printf("%c", c); 2793 } 2794 printf(" "); 2795 } 2796 2797 printf(" %-*s ", comm_width, timehist_get_commstr(thread)); 2798 2799 /* dt spacer */ 2800 printf(" %9s %9s %9s ", "", "", ""); 2801 2802 printf("migrated: %s", timehist_get_commstr(migrated)); 2803 printf(" cpu %d => %d", ocpu, dcpu); 2804 2805 printf("\n"); 2806 thread__put(thread); 2807 } 2808 2809 static int timehist_migrate_task_event(const struct perf_tool *tool, 2810 union perf_event *event __maybe_unused, 2811 struct perf_sample *sample, 2812 struct machine *machine) 2813 { 2814 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 2815 struct thread *thread; 2816 struct thread_runtime *tr = NULL; 2817 /* want pid of migrated task not pid in sample */ 2818 const u32 pid = perf_sample__intval(sample, "pid"); 2819 2820 thread = machine__findnew_thread(machine, 0, pid); 2821 if (thread == NULL) 2822 return -1; 2823 2824 tr = thread__get_runtime(thread); 2825 if (tr == NULL) { 2826 thread__put(thread); 2827 return -1; 2828 } 2829 2830 tr->migrations++; 2831 tr->migrated = sample->time; 2832 2833 /* show migrations if requested */ 2834 if (sched->show_migrations) { 2835 timehist_print_migration_event(sched, sample, machine, thread); 2836 } 2837 thread__put(thread); 2838 2839 return 0; 2840 } 2841 2842 static void timehist_update_task_prio(struct perf_sample *sample, 2843 struct machine *machine) 2844 { 2845 struct thread *thread; 2846 struct thread_runtime *tr = NULL; 2847 const u32 next_pid = perf_sample__intval(sample, "next_pid"); 2848 const u32 next_prio = perf_sample__intval(sample, "next_prio"); 2849 2850 if (next_pid == 0) 2851 thread = get_idle_thread(sample->cpu); 2852 else 2853 thread = machine__findnew_thread(machine, -1, next_pid); 2854 2855 if (thread == NULL) 2856 return; 2857 2858 tr = thread__get_runtime(thread); 2859 if (tr != NULL) 2860 tr->prio = next_prio; 2861 2862 thread__put(thread); 2863 } 2864 2865 static int timehist_sched_change_event(const struct perf_tool *tool, 2866 union perf_event *event, 2867 struct perf_sample *sample, 2868 struct machine *machine) 2869 { 2870 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 2871 struct perf_time_interval *ptime = &sched->ptime; 2872 struct addr_location al; 2873 struct thread *thread = NULL; 2874 struct thread_runtime *tr = NULL; 2875 u64 tprev, t = sample->time; 2876 int rc = 0; 2877 const char state = perf_sample__taskstate(sample, "prev_state"); 2878 2879 /* perf.data is untrusted input — CPU may be absent or corrupted */ 2880 if (sample->cpu >= MAX_CPUS) { 2881 pr_warning("WARNING: at offset %#" PRIx64 ": out-of-bound sample CPU %d, skipping sample\n", 2882 sample->file_offset, sample->cpu); 2883 return 0; 2884 } 2885 2886 addr_location__init(&al); 2887 if (machine__resolve(machine, &al, sample) < 0) { 2888 pr_err("problem processing %s (%u) event at offset %#" PRIx64 ", skipping it\n", 2889 perf_event__name(event->header.type), event->header.type, 2890 sample->file_offset); 2891 rc = -1; 2892 goto out; 2893 } 2894 2895 if (sched->show_prio || sched->prio_str) 2896 timehist_update_task_prio(sample, machine); 2897 2898 thread = timehist_get_thread(sched, sample, machine); 2899 if (thread == NULL) { 2900 rc = -1; 2901 goto out; 2902 } 2903 2904 if (timehist_skip_sample(sched, thread, sample)) 2905 goto out; 2906 2907 tr = thread__get_runtime(thread); 2908 if (tr == NULL) { 2909 rc = -1; 2910 goto out; 2911 } 2912 2913 tprev = evsel__get_time(sample->evsel, sample->cpu); 2914 2915 /* 2916 * If start time given: 2917 * - sample time is under window user cares about - skip sample 2918 * - tprev is under window user cares about - reset to start of window 2919 */ 2920 if (ptime->start && ptime->start > t) 2921 goto out; 2922 2923 if (tprev && ptime->start > tprev) 2924 tprev = ptime->start; 2925 2926 /* 2927 * If end time given: 2928 * - previous sched event is out of window - we are done 2929 * - sample time is beyond window user cares about - reset it 2930 * to close out stats for time window interest 2931 * - If tprev is 0, that is, sched_in event for current task is 2932 * not recorded, cannot determine whether sched_in event is 2933 * within time window interest - ignore it 2934 */ 2935 if (ptime->end) { 2936 if (!tprev || tprev > ptime->end) 2937 goto out; 2938 2939 if (t > ptime->end) 2940 t = ptime->end; 2941 } 2942 2943 /* 2944 * Use is_idle_sample() not thread__tid() == 0: a crafted perf.data 2945 * can set common_pid=0 with prev_pid!=0, giving us a machine thread 2946 * whose priv is thread_runtime, not idle_thread_runtime — the cast 2947 * below would read past the allocation. 2948 */ 2949 if (!sched->idle_hist || is_idle_sample(sample)) { 2950 if (!cpu_list || (sample->cpu < MAX_NR_CPUS && 2951 test_bit(sample->cpu, cpu_bitmap))) 2952 timehist_update_runtime_stats(tr, t, tprev); 2953 2954 if (sched->idle_hist) { 2955 struct idle_thread_runtime *itr = (void *)tr; 2956 struct thread_runtime *last_tr; 2957 2958 if (itr->last_thread == NULL) 2959 goto out; 2960 2961 /* add current idle time as last thread's runtime */ 2962 last_tr = thread__get_runtime(itr->last_thread); 2963 if (last_tr == NULL) 2964 goto out; 2965 2966 timehist_update_runtime_stats(last_tr, t, tprev); 2967 /* 2968 * remove delta time of last thread as it's not updated 2969 * and otherwise it will show an invalid value next 2970 * time. we only care total run time and run stat. 2971 */ 2972 last_tr->dt_run = 0; 2973 last_tr->dt_delay = 0; 2974 last_tr->dt_sleep = 0; 2975 last_tr->dt_iowait = 0; 2976 last_tr->dt_preempt = 0; 2977 2978 if (itr->cursor.nr) 2979 callchain_append(&itr->callchain, &itr->cursor, t - tprev); 2980 2981 thread__zput(itr->last_thread); 2982 } 2983 2984 if (!sched->summary_only) 2985 timehist_print_sample(sched, sample, &al, thread, t, state); 2986 } 2987 2988 out: 2989 if (sched->hist_time.start == 0 && t >= ptime->start) 2990 sched->hist_time.start = t; 2991 if (ptime->end == 0 || t <= ptime->end) 2992 sched->hist_time.end = t; 2993 2994 if (tr) { 2995 /* time of this sched_switch event becomes last time task seen */ 2996 tr->last_time = sample->time; 2997 2998 /* last state is used to determine where to account wait time */ 2999 tr->last_state = state; 3000 3001 /* sched out event for task so reset ready to run time and migrated time */ 3002 if (state == 'R') 3003 tr->ready_to_run = t; 3004 else 3005 tr->ready_to_run = 0; 3006 3007 tr->migrated = 0; 3008 } 3009 3010 evsel__save_time(sample->evsel, sample->time, sample->cpu); 3011 3012 thread__put(thread); 3013 addr_location__exit(&al); 3014 return rc; 3015 } 3016 3017 static int timehist_sched_switch_event(const struct perf_tool *tool, 3018 union perf_event *event, 3019 struct perf_sample *sample, 3020 struct machine *machine __maybe_unused) 3021 { 3022 return timehist_sched_change_event(tool, event, sample, machine); 3023 } 3024 3025 static int process_lost(const struct perf_tool *tool __maybe_unused, 3026 union perf_event *event, 3027 struct perf_sample *sample, 3028 struct machine *machine __maybe_unused) 3029 { 3030 char tstr[64]; 3031 3032 timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr)); 3033 printf("%15s ", tstr); 3034 printf("lost %" PRI_lu64 " events on cpu %d\n", event->lost.lost, sample->cpu); 3035 3036 return 0; 3037 } 3038 3039 3040 static void print_thread_runtime(struct thread *t, 3041 struct thread_runtime *r) 3042 { 3043 double mean = avg_stats(&r->run_stats); 3044 float stddev; 3045 3046 printf("%*s %5d %9" PRIu64 " ", 3047 comm_width, timehist_get_commstr(t), thread__ppid(t), 3048 (u64) r->run_stats.n); 3049 3050 print_sched_time(r->total_run_time, 8); 3051 stddev = rel_stddev_stats(stddev_stats(&r->run_stats), mean); 3052 print_sched_time(r->run_stats.min, 6); 3053 printf(" "); 3054 print_sched_time((u64) mean, 6); 3055 printf(" "); 3056 print_sched_time(r->run_stats.max, 6); 3057 printf(" "); 3058 printf("%5.2f", stddev); 3059 printf(" %5" PRIu64, r->migrations); 3060 printf("\n"); 3061 } 3062 3063 static void print_thread_waittime(struct thread *t, 3064 struct thread_runtime *r) 3065 { 3066 printf("%*s %5d %9" PRIu64 " ", 3067 comm_width, timehist_get_commstr(t), thread__ppid(t), 3068 (u64) r->run_stats.n); 3069 3070 print_sched_time(r->total_run_time, 8); 3071 print_sched_time(r->total_sleep_time, 6); 3072 printf(" "); 3073 print_sched_time(r->total_iowait_time, 6); 3074 printf(" "); 3075 print_sched_time(r->total_preempt_time, 6); 3076 printf(" "); 3077 print_sched_time(r->total_delay_time, 6); 3078 printf("\n"); 3079 } 3080 3081 struct total_run_stats { 3082 struct perf_sched *sched; 3083 u64 sched_count; 3084 u64 task_count; 3085 u64 total_run_time; 3086 }; 3087 3088 static int show_thread_runtime(struct thread *t, void *priv) 3089 { 3090 struct total_run_stats *stats = priv; 3091 struct thread_runtime *r; 3092 3093 if (thread__is_filtered(t)) 3094 return 0; 3095 3096 r = thread__priv(t); 3097 if (r && r->run_stats.n) { 3098 stats->task_count++; 3099 stats->sched_count += r->run_stats.n; 3100 stats->total_run_time += r->total_run_time; 3101 3102 if (stats->sched->show_state) 3103 print_thread_waittime(t, r); 3104 else 3105 print_thread_runtime(t, r); 3106 } 3107 3108 return 0; 3109 } 3110 3111 static size_t callchain__fprintf_folded(FILE *fp, struct callchain_node *node) 3112 { 3113 const char *sep = " <- "; 3114 struct callchain_list *chain; 3115 size_t ret = 0; 3116 char bf[1024]; 3117 bool first; 3118 3119 if (node == NULL) 3120 return 0; 3121 3122 ret = callchain__fprintf_folded(fp, node->parent); 3123 first = (ret == 0); 3124 3125 list_for_each_entry(chain, &node->val, list) { 3126 if (chain->ip >= PERF_CONTEXT_MAX) 3127 continue; 3128 if (chain->ms.sym && symbol__ignore(chain->ms.sym)) 3129 continue; 3130 ret += fprintf(fp, "%s%s", first ? "" : sep, 3131 callchain_list__sym_name(chain, bf, sizeof(bf), 3132 false)); 3133 first = false; 3134 } 3135 3136 return ret; 3137 } 3138 3139 static size_t timehist_print_idlehist_callchain(struct rb_root_cached *root) 3140 { 3141 size_t ret = 0; 3142 FILE *fp = stdout; 3143 struct callchain_node *chain; 3144 /* sort() uses rb_insert_color() on rb_root, not rb_root_cached */ 3145 struct rb_node *rb_node = rb_first(&root->rb_root); 3146 3147 printf(" %16s %8s %s\n", "Idle time (msec)", "Count", "Callchains"); 3148 printf(" %.16s %.8s %.50s\n", graph_dotted_line, graph_dotted_line, 3149 graph_dotted_line); 3150 3151 while (rb_node) { 3152 chain = rb_entry(rb_node, struct callchain_node, rb_node); 3153 rb_node = rb_next(rb_node); 3154 3155 ret += fprintf(fp, " "); 3156 print_sched_time(chain->hit, 12); 3157 ret += 16; /* print_sched_time returns 2nd arg + 4 */ 3158 ret += fprintf(fp, " %8d ", chain->count); 3159 ret += callchain__fprintf_folded(fp, chain); 3160 ret += fprintf(fp, "\n"); 3161 } 3162 3163 return ret; 3164 } 3165 3166 static void timehist_print_summary(struct perf_sched *sched, 3167 struct perf_session *session) 3168 { 3169 struct machine *m = &session->machines.host; 3170 struct total_run_stats totals; 3171 u64 task_count; 3172 struct thread *t; 3173 struct thread_runtime *r; 3174 int i; 3175 u64 hist_time = sched->hist_time.end - sched->hist_time.start; 3176 3177 memset(&totals, 0, sizeof(totals)); 3178 totals.sched = sched; 3179 3180 if (sched->idle_hist) { 3181 printf("\nIdle-time summary\n"); 3182 printf("%*s parent sched-out ", comm_width, "comm"); 3183 printf(" idle-time min-idle avg-idle max-idle stddev migrations\n"); 3184 } else if (sched->show_state) { 3185 printf("\nWait-time summary\n"); 3186 printf("%*s parent sched-in ", comm_width, "comm"); 3187 printf(" run-time sleep iowait preempt delay\n"); 3188 } else { 3189 printf("\nRuntime summary\n"); 3190 printf("%*s parent sched-in ", comm_width, "comm"); 3191 printf(" run-time min-run avg-run max-run stddev migrations\n"); 3192 } 3193 printf("%*s (count) ", comm_width, ""); 3194 printf(" (msec) (msec) (msec) (msec) %s\n", 3195 sched->show_state ? "(msec)" : "%"); 3196 printf("%.117s\n", graph_dotted_line); 3197 3198 machine__for_each_thread(m, show_thread_runtime, &totals); 3199 task_count = totals.task_count; 3200 if (!task_count) 3201 printf("<no still running tasks>\n"); 3202 3203 /* CPU idle stats not tracked when samples were skipped */ 3204 if (sched->skipped_samples && !sched->idle_hist) 3205 return; 3206 3207 printf("\nIdle stats:\n"); 3208 for (i = 0; i < idle_max_cpu; ++i) { 3209 if (cpu_list && !test_bit(i, cpu_bitmap)) 3210 continue; 3211 3212 t = idle_threads[i]; 3213 if (!t) 3214 continue; 3215 3216 r = thread__priv(t); 3217 if (r && r->run_stats.n) { 3218 totals.sched_count += r->run_stats.n; 3219 printf(" CPU %2d idle for ", i); 3220 print_sched_time(r->total_run_time, 6); 3221 printf(" msec (%6.2f%%)\n", 100.0 * r->total_run_time / hist_time); 3222 } else 3223 printf(" CPU %2d idle entire time window\n", i); 3224 } 3225 3226 if (sched->idle_hist && sched->show_callchain) { 3227 callchain_param.mode = CHAIN_FOLDED; 3228 callchain_param.value = CCVAL_PERIOD; 3229 3230 callchain_register_param(&callchain_param); 3231 3232 printf("\nIdle stats by callchain:\n"); 3233 for (i = 0; i < idle_max_cpu; ++i) { 3234 struct idle_thread_runtime *itr; 3235 3236 t = idle_threads[i]; 3237 if (!t) 3238 continue; 3239 3240 itr = thread__priv(t); 3241 if (itr == NULL) 3242 continue; 3243 3244 callchain_param.sort(&itr->sorted_root.rb_root, &itr->callchain, 3245 0, &callchain_param); 3246 3247 printf(" CPU %2d:", i); 3248 print_sched_time(itr->tr.total_run_time, 6); 3249 printf(" msec\n"); 3250 timehist_print_idlehist_callchain(&itr->sorted_root); 3251 printf("\n"); 3252 } 3253 } 3254 3255 printf("\n" 3256 " Total number of unique tasks: %" PRIu64 "\n" 3257 "Total number of context switches: %" PRIu64 "\n", 3258 totals.task_count, totals.sched_count); 3259 3260 printf(" Total run time (msec): "); 3261 print_sched_time(totals.total_run_time, 2); 3262 printf("\n"); 3263 3264 printf(" Total scheduling time (msec): "); 3265 print_sched_time(hist_time, 2); 3266 printf(" (x %d)\n", sched->max_cpu.cpu); 3267 } 3268 3269 typedef int (*sched_handler)(const struct perf_tool *tool, 3270 union perf_event *event, 3271 struct perf_sample *sample, 3272 struct machine *machine); 3273 3274 static int perf_timehist__process_sample(const struct perf_tool *tool, 3275 union perf_event *event, 3276 struct perf_sample *sample, 3277 struct machine *machine) 3278 { 3279 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 3280 struct evsel *evsel = sample->evsel; 3281 int err = 0; 3282 struct perf_cpu this_cpu = { 3283 .cpu = sample->cpu, 3284 }; 3285 3286 /* max_cpu indexes arrays allocated with MAX_CPUS entries */ 3287 if (this_cpu.cpu >= 0 && this_cpu.cpu < MAX_CPUS && 3288 this_cpu.cpu > sched->max_cpu.cpu) 3289 sched->max_cpu = this_cpu; 3290 3291 if (evsel->handler != NULL) { 3292 sched_handler f = evsel->handler; 3293 3294 err = f(tool, event, sample, machine); 3295 } 3296 3297 return err; 3298 } 3299 3300 static int timehist_check_attr(struct perf_sched *sched, 3301 struct evlist *evlist) 3302 { 3303 struct evsel *evsel; 3304 struct evsel_runtime *er; 3305 3306 list_for_each_entry(evsel, &evlist->core.entries, core.node) { 3307 er = evsel__get_runtime(evsel); 3308 if (er == NULL) { 3309 pr_err("Failed to allocate memory for evsel runtime data\n"); 3310 return -1; 3311 } 3312 3313 /* only need to save callchain related to sched_switch event */ 3314 if (sched->show_callchain && 3315 evsel__name_is(evsel, "sched:sched_switch") && 3316 !evsel__has_callchain(evsel)) { 3317 pr_info("Samples of sched_switch event do not have callchains.\n"); 3318 sched->show_callchain = 0; 3319 symbol_conf.use_callchain = 0; 3320 } 3321 } 3322 3323 return 0; 3324 } 3325 3326 static int timehist_parse_prio_str(struct perf_sched *sched) 3327 { 3328 char *p; 3329 unsigned long start_prio, end_prio; 3330 const char *str = sched->prio_str; 3331 3332 if (!str) 3333 return 0; 3334 3335 while (isdigit(*str)) { 3336 p = NULL; 3337 start_prio = strtoul(str, &p, 0); 3338 if (start_prio >= MAX_PRIO || (*p != '\0' && *p != ',' && *p != '-')) 3339 return -1; 3340 3341 if (*p == '-') { 3342 str = ++p; 3343 p = NULL; 3344 end_prio = strtoul(str, &p, 0); 3345 3346 if (end_prio >= MAX_PRIO || (*p != '\0' && *p != ',')) 3347 return -1; 3348 3349 if (end_prio < start_prio) 3350 return -1; 3351 } else { 3352 end_prio = start_prio; 3353 } 3354 3355 for (; start_prio <= end_prio; start_prio++) 3356 __set_bit(start_prio, sched->prio_bitmap); 3357 3358 if (*p) 3359 ++p; 3360 3361 str = p; 3362 } 3363 3364 return 0; 3365 } 3366 3367 static int perf_sched__timehist(struct perf_sched *sched) 3368 { 3369 struct evsel_str_handler handlers[] = { 3370 { "sched:sched_switch", timehist_sched_switch_event, }, 3371 { "sched:sched_wakeup", timehist_sched_wakeup_event, }, 3372 { "sched:sched_waking", timehist_sched_wakeup_event, }, 3373 { "sched:sched_wakeup_new", timehist_sched_wakeup_event, }, 3374 }; 3375 const struct evsel_str_handler migrate_handlers[] = { 3376 { "sched:sched_migrate_task", timehist_migrate_task_event, }, 3377 }; 3378 struct perf_data data = { 3379 .path = input_name, 3380 .mode = PERF_DATA_MODE_READ, 3381 .force = sched->force, 3382 }; 3383 3384 struct perf_session *session; 3385 struct perf_env *env; 3386 struct evlist *evlist; 3387 int err = -1; 3388 3389 /* 3390 * event handlers for timehist option 3391 */ 3392 sched->tool.sample = perf_timehist__process_sample; 3393 sched->tool.mmap = perf_event__process_mmap; 3394 sched->tool.mmap2 = perf_event__process_mmap2; 3395 sched->tool.comm = perf_event__process_comm; 3396 sched->tool.exit = perf_event__process_exit; 3397 sched->tool.fork = perf_event__process_fork; 3398 sched->tool.lost = process_lost; 3399 sched->tool.attr = perf_event__process_attr; 3400 sched->tool.tracing_data = perf_event__process_tracing_data; 3401 sched->tool.build_id = perf_event__process_build_id; 3402 3403 sched->tool.ordering_requires_timestamps = true; 3404 3405 symbol_conf.use_callchain = sched->show_callchain; 3406 3407 session = perf_session__new(&data, &sched->tool); 3408 if (IS_ERR(session)) 3409 return PTR_ERR(session); 3410 3411 env = perf_session__env(session); 3412 if (cpu_list) { 3413 err = perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap); 3414 if (err < 0) 3415 goto out; 3416 } 3417 3418 evlist = session->evlist; 3419 3420 symbol__init(env); 3421 3422 if (perf_time__parse_str(&sched->ptime, sched->time_str) != 0) { 3423 pr_err("Invalid time string\n"); 3424 err = -EINVAL; 3425 goto out; 3426 } 3427 3428 if (timehist_check_attr(sched, evlist) != 0) 3429 goto out; 3430 3431 if (timehist_parse_prio_str(sched) != 0) { 3432 pr_err("Invalid prio string\n"); 3433 goto out; 3434 } 3435 3436 setup_pager(); 3437 3438 evsel__set_priv_destructor(timehist__evsel_priv_destructor); 3439 3440 /* prefer sched_waking if it is captured */ 3441 if (evlist__find_tracepoint_by_name(session->evlist, "sched:sched_waking")) 3442 handlers[1].handler = timehist_sched_wakeup_ignore; 3443 3444 /* setup per-evsel handlers */ 3445 if (perf_session__set_tracepoints_handlers(session, handlers)) 3446 goto out; 3447 3448 /* sched_switch event at a minimum needs to exist */ 3449 if (!evlist__find_tracepoint_by_name(session->evlist, "sched:sched_switch")) { 3450 pr_err("No sched_switch events found. Have you run 'perf sched record'?\n"); 3451 goto out; 3452 } 3453 3454 if ((sched->show_migrations || sched->pre_migrations) && 3455 perf_session__set_tracepoints_handlers(session, migrate_handlers)) 3456 goto out; 3457 3458 /* pre-allocate struct for per-CPU idle stats; cap to array bounds */ 3459 sched->max_cpu.cpu = min(env->nr_cpus_online, MAX_CPUS); 3460 if (sched->max_cpu.cpu == 0) 3461 sched->max_cpu.cpu = 4; 3462 if (init_idle_threads(sched->max_cpu.cpu)) 3463 goto out; 3464 3465 /* summary_only implies summary option, but don't overwrite summary if set */ 3466 if (sched->summary_only) 3467 sched->summary = sched->summary_only; 3468 3469 if (!sched->summary_only) 3470 timehist_header(sched); 3471 3472 err = perf_session__process_events(session); 3473 if (err) { 3474 pr_err("Failed to process events, error %d", err); 3475 goto out; 3476 } 3477 3478 sched->nr_events = evlist->stats.nr_events[0]; 3479 sched->nr_lost_events = evlist->stats.total_lost; 3480 sched->nr_lost_chunks = evlist->stats.nr_events[PERF_RECORD_LOST]; 3481 3482 if (sched->summary) 3483 timehist_print_summary(sched, session); 3484 3485 out: 3486 free_idle_threads(); 3487 perf_session__delete(session); 3488 3489 return err; 3490 } 3491 3492 3493 static void print_bad_events(struct perf_sched *sched) 3494 { 3495 if (sched->nr_unordered_timestamps && sched->nr_timestamps) { 3496 printf(" INFO: %.3f%% unordered timestamps (%ld out of %ld)\n", 3497 (double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0, 3498 sched->nr_unordered_timestamps, sched->nr_timestamps); 3499 } 3500 if (sched->nr_lost_events && sched->nr_events) { 3501 printf(" INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n", 3502 (double)sched->nr_lost_events/(double)sched->nr_events * 100.0, 3503 sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks); 3504 } 3505 if (sched->nr_context_switch_bugs && sched->nr_timestamps) { 3506 printf(" INFO: %.3f%% context switch bugs (%ld out of %ld)", 3507 (double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0, 3508 sched->nr_context_switch_bugs, sched->nr_timestamps); 3509 if (sched->nr_lost_events) 3510 printf(" (due to lost events?)"); 3511 printf("\n"); 3512 } 3513 } 3514 3515 static void __merge_work_atoms(struct rb_root_cached *root, struct work_atoms *data) 3516 { 3517 struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL; 3518 struct work_atoms *this; 3519 const char *comm = thread__comm_str(data->thread), *this_comm; 3520 bool leftmost = true; 3521 3522 while (*new) { 3523 int cmp; 3524 3525 this = container_of(*new, struct work_atoms, node); 3526 parent = *new; 3527 3528 this_comm = thread__comm_str(this->thread); 3529 cmp = strcmp(comm, this_comm); 3530 if (cmp > 0) { 3531 new = &((*new)->rb_left); 3532 } else if (cmp < 0) { 3533 new = &((*new)->rb_right); 3534 leftmost = false; 3535 } else { 3536 this->num_merged++; 3537 this->total_runtime += data->total_runtime; 3538 this->nb_atoms += data->nb_atoms; 3539 this->total_lat += data->total_lat; 3540 list_splice_init(&data->work_list, &this->work_list); 3541 if (this->max_lat < data->max_lat) { 3542 this->max_lat = data->max_lat; 3543 this->max_lat_start = data->max_lat_start; 3544 this->max_lat_end = data->max_lat_end; 3545 } 3546 free_work_atoms(data); 3547 return; 3548 } 3549 } 3550 3551 data->num_merged++; 3552 rb_link_node(&data->node, parent, new); 3553 rb_insert_color_cached(&data->node, root, leftmost); 3554 } 3555 3556 static void perf_sched__merge_lat(struct perf_sched *sched) 3557 { 3558 struct work_atoms *data; 3559 struct rb_node *node; 3560 3561 if (sched->skip_merge) 3562 return; 3563 3564 while ((node = rb_first_cached(&sched->atom_root))) { 3565 rb_erase_cached(node, &sched->atom_root); 3566 data = rb_entry(node, struct work_atoms, node); 3567 __merge_work_atoms(&sched->merged_atom_root, data); 3568 } 3569 } 3570 3571 static int setup_cpus_switch_event(struct perf_sched *sched) 3572 { 3573 unsigned int i; 3574 3575 sched->cpu_last_switched = calloc(MAX_CPUS, sizeof(*(sched->cpu_last_switched))); 3576 if (!sched->cpu_last_switched) 3577 return -1; 3578 3579 sched->curr_pid = calloc(MAX_CPUS, sizeof(*(sched->curr_pid))); 3580 if (!sched->curr_pid) { 3581 zfree(&sched->cpu_last_switched); 3582 return -1; 3583 } 3584 3585 for (i = 0; i < MAX_CPUS; i++) 3586 sched->curr_pid[i] = -1; 3587 3588 return 0; 3589 } 3590 3591 static void free_cpus_switch_event(struct perf_sched *sched) 3592 { 3593 zfree(&sched->curr_pid); 3594 zfree(&sched->cpu_last_switched); 3595 } 3596 3597 static int perf_sched__lat(struct perf_sched *sched) 3598 { 3599 int rc = -1; 3600 struct rb_node *next; 3601 3602 setup_pager(); 3603 3604 if (setup_cpus_switch_event(sched)) 3605 return rc; 3606 3607 if (perf_sched__read_events(sched)) 3608 goto out_free_cpus_switch_event; 3609 3610 perf_sched__merge_lat(sched); 3611 perf_sched__sort_lat(sched); 3612 3613 printf("\n -------------------------------------------------------------------------------------------------------------------------------------------\n"); 3614 printf(" Task | Runtime ms | Count | Avg delay ms | Max delay ms | Max delay start | Max delay end |\n"); 3615 printf(" -------------------------------------------------------------------------------------------------------------------------------------------\n"); 3616 3617 next = rb_first_cached(&sched->sorted_atom_root); 3618 3619 while (next) { 3620 struct work_atoms *work_list; 3621 3622 work_list = rb_entry(next, struct work_atoms, node); 3623 output_lat_thread(sched, work_list); 3624 next = rb_next(next); 3625 } 3626 3627 printf(" -----------------------------------------------------------------------------------------------------------------\n"); 3628 printf(" TOTAL: |%11.3f ms |%9" PRIu64 " |\n", 3629 (double)sched->all_runtime / NSEC_PER_MSEC, sched->all_count); 3630 3631 printf(" ---------------------------------------------------\n"); 3632 3633 print_bad_events(sched); 3634 printf("\n"); 3635 3636 rc = 0; 3637 3638 while ((next = rb_first_cached(&sched->sorted_atom_root))) { 3639 struct work_atoms *data; 3640 3641 data = rb_entry(next, struct work_atoms, node); 3642 rb_erase_cached(next, &sched->sorted_atom_root); 3643 free_work_atoms(data); 3644 } 3645 out_free_cpus_switch_event: 3646 free_cpus_switch_event(sched); 3647 return rc; 3648 } 3649 3650 static int setup_map_cpus(struct perf_sched *sched) 3651 { 3652 if (sched->map.comp) { 3653 sched->map.comp_cpus = calloc(MAX_CPUS, sizeof(*sched->map.comp_cpus)); 3654 if (!sched->map.comp_cpus) 3655 return -1; 3656 } 3657 3658 if (sched->map.cpus_str) { 3659 sched->map.cpus = perf_cpu_map__new(sched->map.cpus_str); 3660 if (!sched->map.cpus) { 3661 pr_err("failed to get cpus map from %s\n", sched->map.cpus_str); 3662 zfree(&sched->map.comp_cpus); 3663 return -1; 3664 } 3665 } 3666 3667 return 0; 3668 } 3669 3670 static int setup_color_pids(struct perf_sched *sched) 3671 { 3672 struct perf_thread_map *map; 3673 3674 if (!sched->map.color_pids_str) 3675 return 0; 3676 3677 map = thread_map__new_by_tid_str(sched->map.color_pids_str); 3678 if (!map) { 3679 pr_err("failed to get thread map from %s\n", sched->map.color_pids_str); 3680 return -1; 3681 } 3682 3683 sched->map.color_pids = map; 3684 return 0; 3685 } 3686 3687 static int setup_color_cpus(struct perf_sched *sched) 3688 { 3689 struct perf_cpu_map *map; 3690 3691 if (!sched->map.color_cpus_str) 3692 return 0; 3693 3694 map = perf_cpu_map__new(sched->map.color_cpus_str); 3695 if (!map) { 3696 pr_err("failed to get thread map from %s\n", sched->map.color_cpus_str); 3697 return -1; 3698 } 3699 3700 sched->map.color_cpus = map; 3701 return 0; 3702 } 3703 3704 static int perf_sched__map(struct perf_sched *sched) 3705 { 3706 int rc = -1; 3707 3708 sched->curr_thread = calloc(MAX_CPUS, sizeof(*(sched->curr_thread))); 3709 if (!sched->curr_thread) 3710 return rc; 3711 3712 sched->curr_out_thread = calloc(MAX_CPUS, sizeof(*(sched->curr_out_thread))); 3713 if (!sched->curr_out_thread) 3714 goto out_free_curr_thread; 3715 3716 if (setup_cpus_switch_event(sched)) 3717 goto out_free_curr_out_thread; 3718 3719 if (setup_map_cpus(sched)) 3720 goto out_free_cpus_switch_event; 3721 3722 if (setup_color_pids(sched)) 3723 goto out_put_map_cpus; 3724 3725 if (setup_color_cpus(sched)) 3726 goto out_put_color_pids; 3727 3728 setup_pager(); 3729 if (perf_sched__read_events(sched)) 3730 goto out_put_color_cpus; 3731 3732 rc = 0; 3733 print_bad_events(sched); 3734 3735 out_put_color_cpus: 3736 perf_cpu_map__put(sched->map.color_cpus); 3737 3738 out_put_color_pids: 3739 perf_thread_map__put(sched->map.color_pids); 3740 3741 out_put_map_cpus: 3742 zfree(&sched->map.comp_cpus); 3743 perf_cpu_map__put(sched->map.cpus); 3744 3745 out_free_cpus_switch_event: 3746 free_cpus_switch_event(sched); 3747 3748 out_free_curr_out_thread: 3749 for (int i = 0; i < MAX_CPUS; i++) 3750 thread__put(sched->curr_out_thread[i]); 3751 zfree(&sched->curr_out_thread); 3752 3753 out_free_curr_thread: 3754 for (int i = 0; i < MAX_CPUS; i++) 3755 thread__put(sched->curr_thread[i]); 3756 zfree(&sched->curr_thread); 3757 return rc; 3758 } 3759 3760 static int perf_sched__replay(struct perf_sched *sched) 3761 { 3762 int ret; 3763 unsigned long i; 3764 3765 mutex_init(&sched->start_work_mutex); 3766 mutex_init(&sched->work_done_wait_mutex); 3767 3768 ret = setup_cpus_switch_event(sched); 3769 if (ret) 3770 goto out_mutex_destroy; 3771 3772 calibrate_run_measurement_overhead(sched); 3773 calibrate_sleep_measurement_overhead(sched); 3774 3775 test_calibrations(sched); 3776 3777 ret = perf_sched__read_events(sched); 3778 if (ret) 3779 goto out_free_cpus_switch_event; 3780 3781 printf("nr_run_events: %ld\n", sched->nr_run_events); 3782 printf("nr_sleep_events: %ld\n", sched->nr_sleep_events); 3783 printf("nr_wakeup_events: %ld\n", sched->nr_wakeup_events); 3784 3785 if (sched->targetless_wakeups) 3786 printf("target-less wakeups: %ld\n", sched->targetless_wakeups); 3787 if (sched->multitarget_wakeups) 3788 printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups); 3789 if (sched->nr_run_events_optimized) 3790 printf("run atoms optimized: %ld\n", 3791 sched->nr_run_events_optimized); 3792 3793 print_task_traces(sched); 3794 add_cross_task_wakeups(sched); 3795 3796 sched->thread_funcs_exit = false; 3797 create_tasks(sched); 3798 printf("------------------------------------------------------------\n"); 3799 if (sched->replay_repeat == 0) 3800 sched->replay_repeat = UINT_MAX; 3801 3802 for (i = 0; i < sched->replay_repeat; i++) 3803 run_one_test(sched); 3804 3805 sched->thread_funcs_exit = true; 3806 destroy_tasks(sched); 3807 3808 out_free_cpus_switch_event: 3809 free_cpus_switch_event(sched); 3810 3811 out_mutex_destroy: 3812 mutex_destroy(&sched->start_work_mutex); 3813 mutex_destroy(&sched->work_done_wait_mutex); 3814 return ret; 3815 } 3816 3817 static void setup_sorting(struct perf_sched *sched, const struct option *options, 3818 const char * const usage_msg[]) 3819 { 3820 char *tmp, *tok, *str = strdup(sched->sort_order); 3821 3822 for (tok = strtok_r(str, ", ", &tmp); 3823 tok; tok = strtok_r(NULL, ", ", &tmp)) { 3824 if (sort_dimension__add(tok, &sched->sort_list) < 0) { 3825 usage_with_options_msg(usage_msg, options, 3826 "Unknown --sort key: `%s'", tok); 3827 } 3828 } 3829 3830 free(str); 3831 3832 sort_dimension__add("pid", &sched->cmp_pid); 3833 } 3834 3835 static int process_synthesized_schedstat_event(const struct perf_tool *tool, 3836 union perf_event *event, 3837 struct perf_sample *sample __maybe_unused, 3838 struct machine *machine __maybe_unused) 3839 { 3840 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 3841 3842 if (perf_data__write(sched->data, event, event->header.size) <= 0) { 3843 pr_err("failed to write perf data, error: %m\n"); 3844 return -1; 3845 } 3846 3847 sched->session->header.data_size += event->header.size; 3848 return 0; 3849 } 3850 3851 static volatile sig_atomic_t done; 3852 3853 static void sighandler(int sig __maybe_unused) 3854 { 3855 done = 1; 3856 } 3857 3858 static int enable_sched_schedstats(int *reset) 3859 { 3860 char path[PATH_MAX]; 3861 FILE *fp; 3862 char ch; 3863 3864 snprintf(path, PATH_MAX, "%s/sys/kernel/sched_schedstats", procfs__mountpoint()); 3865 fp = fopen(path, "w+"); 3866 if (!fp) { 3867 pr_err("Failed to open %s\n", path); 3868 return -1; 3869 } 3870 3871 ch = getc(fp); 3872 if (ch == '0') { 3873 *reset = 1; 3874 rewind(fp); 3875 putc('1', fp); 3876 fclose(fp); 3877 } 3878 return 0; 3879 } 3880 3881 static int disable_sched_schedstat(void) 3882 { 3883 char path[PATH_MAX]; 3884 FILE *fp; 3885 3886 snprintf(path, PATH_MAX, "%s/sys/kernel/sched_schedstats", procfs__mountpoint()); 3887 fp = fopen(path, "w"); 3888 if (!fp) { 3889 pr_err("Failed to open %s\n", path); 3890 return -1; 3891 } 3892 3893 putc('0', fp); 3894 fclose(fp); 3895 return 0; 3896 } 3897 3898 /* perf.data or any other output file name used by stats subcommand (only). */ 3899 const char *output_name; 3900 3901 static int perf_sched__schedstat_record(struct perf_sched *sched, 3902 int argc, const char **argv) 3903 { 3904 struct perf_session *session; 3905 struct target target = {}; 3906 struct evlist *evlist; 3907 int reset = 0; 3908 int err = 0; 3909 int fd; 3910 struct perf_data data = { 3911 .path = output_name, 3912 .mode = PERF_DATA_MODE_WRITE, 3913 }; 3914 3915 done = 0; 3916 signal(SIGINT, sighandler); 3917 signal(SIGCHLD, sighandler); 3918 signal(SIGTERM, sighandler); 3919 3920 evlist = evlist__new(); 3921 if (!evlist) 3922 return -ENOMEM; 3923 3924 session = perf_session__new(&data, &sched->tool); 3925 if (IS_ERR(session)) { 3926 pr_err("Perf session creation failed.\n"); 3927 evlist__delete(evlist); 3928 return PTR_ERR(session); 3929 } 3930 3931 session->evlist = evlist; 3932 3933 sched->session = session; 3934 sched->data = &data; 3935 3936 fd = perf_data__fd(&data); 3937 3938 /* 3939 * Capture all important metadata about the system. Although they are 3940 * not used by `perf sched stats` tool directly, they provide useful 3941 * information about profiled environment. 3942 */ 3943 perf_header__set_feat(&session->header, HEADER_HOSTNAME); 3944 perf_header__set_feat(&session->header, HEADER_OSRELEASE); 3945 perf_header__set_feat(&session->header, HEADER_VERSION); 3946 perf_header__set_feat(&session->header, HEADER_ARCH); 3947 perf_header__set_feat(&session->header, HEADER_NRCPUS); 3948 perf_header__set_feat(&session->header, HEADER_CPUDESC); 3949 perf_header__set_feat(&session->header, HEADER_CPUID); 3950 perf_header__set_feat(&session->header, HEADER_TOTAL_MEM); 3951 perf_header__set_feat(&session->header, HEADER_CMDLINE); 3952 perf_header__set_feat(&session->header, HEADER_CPU_TOPOLOGY); 3953 perf_header__set_feat(&session->header, HEADER_NUMA_TOPOLOGY); 3954 perf_header__set_feat(&session->header, HEADER_CACHE); 3955 perf_header__set_feat(&session->header, HEADER_MEM_TOPOLOGY); 3956 perf_header__set_feat(&session->header, HEADER_HYBRID_TOPOLOGY); 3957 perf_header__set_feat(&session->header, HEADER_CPU_DOMAIN_INFO); 3958 3959 err = perf_session__write_header(session, evlist, fd, false); 3960 if (err < 0) 3961 goto out; 3962 3963 /* 3964 * `perf sched stats` does not support workload profiling (-p pid) 3965 * since /proc/schedstat file contains cpu specific data only. Hence, a 3966 * profile target is either set of cpus or systemwide, never a process. 3967 * Note that, although `-- <workload>` is supported, profile data are 3968 * still cpu/systemwide. 3969 */ 3970 if (cpu_list) 3971 target.cpu_list = cpu_list; 3972 else 3973 target.system_wide = true; 3974 3975 if (argc) { 3976 err = evlist__prepare_workload(evlist, &target, argv, false, NULL); 3977 if (err) 3978 goto out; 3979 } 3980 3981 err = evlist__create_maps(evlist, &target); 3982 if (err < 0) 3983 goto out; 3984 3985 user_requested_cpus = evlist->core.user_requested_cpus; 3986 3987 err = perf_event__synthesize_schedstat(&(sched->tool), 3988 process_synthesized_schedstat_event, 3989 user_requested_cpus); 3990 if (err < 0) 3991 goto out; 3992 3993 err = enable_sched_schedstats(&reset); 3994 if (err < 0) 3995 goto out; 3996 3997 if (argc) 3998 evlist__start_workload(evlist); 3999 4000 while (!done) { 4001 if (argc && waitpid(evlist->workload.pid, NULL, WNOHANG) > 0) 4002 break; 4003 sleep(1); 4004 } 4005 4006 if (reset) { 4007 err = disable_sched_schedstat(); 4008 if (err < 0) 4009 goto out; 4010 } 4011 4012 err = perf_event__synthesize_schedstat(&(sched->tool), 4013 process_synthesized_schedstat_event, 4014 user_requested_cpus); 4015 if (err < 0) 4016 goto out; 4017 4018 err = perf_session__write_header(session, evlist, fd, true); 4019 4020 out: 4021 if (!err) 4022 fprintf(stderr, "[ perf sched stats: Wrote samples to %s ]\n", data.path); 4023 else 4024 fprintf(stderr, "[ perf sched stats: Failed !! ]\n"); 4025 4026 evlist__delete(evlist); 4027 close(fd); 4028 return err; 4029 } 4030 4031 struct schedstat_domain { 4032 struct list_head domain_list; 4033 struct perf_record_schedstat_domain *domain_data; 4034 }; 4035 4036 struct schedstat_cpu { 4037 struct list_head cpu_list; 4038 struct list_head domain_head; 4039 struct perf_record_schedstat_cpu *cpu_data; 4040 }; 4041 4042 static struct list_head cpu_head = LIST_HEAD_INIT(cpu_head); 4043 static struct schedstat_cpu *cpu_second_pass; 4044 static struct schedstat_domain *domain_second_pass; 4045 static bool after_workload_flag; 4046 static bool verbose_field; 4047 4048 static void free_schedstat(struct list_head *head); 4049 4050 static void store_schedstat_cpu_diff(struct schedstat_cpu *after_workload) 4051 { 4052 struct perf_record_schedstat_cpu *before = cpu_second_pass->cpu_data; 4053 struct perf_record_schedstat_cpu *after = after_workload->cpu_data; 4054 __u16 version = after_workload->cpu_data->version; 4055 4056 #define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver) \ 4057 (before->_ver._name = after->_ver._name - before->_ver._name) 4058 4059 if (version == 15) { 4060 #include <perf/schedstat-v15.h> 4061 } else if (version == 16) { 4062 #include <perf/schedstat-v16.h> 4063 } else if (version == 17) { 4064 #include <perf/schedstat-v17.h> 4065 } 4066 4067 #undef CPU_FIELD 4068 } 4069 4070 static void store_schedstat_domain_diff(struct schedstat_domain *after_workload) 4071 { 4072 struct perf_record_schedstat_domain *before = domain_second_pass->domain_data; 4073 struct perf_record_schedstat_domain *after = after_workload->domain_data; 4074 __u16 version = after_workload->domain_data->version; 4075 4076 #define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver) \ 4077 (before->_ver._name = after->_ver._name - before->_ver._name) 4078 4079 if (version == 15) { 4080 #include <perf/schedstat-v15.h> 4081 } else if (version == 16) { 4082 #include <perf/schedstat-v16.h> 4083 } else if (version == 17) { 4084 #include <perf/schedstat-v17.h> 4085 } 4086 #undef DOMAIN_FIELD 4087 } 4088 4089 #define PCT_CHNG(_x, _y) ((_x) ? ((double)((double)(_y) - (_x)) / (_x)) * 100 : 0.0) 4090 static inline void print_cpu_stats(struct perf_record_schedstat_cpu *cs1, 4091 struct perf_record_schedstat_cpu *cs2) 4092 { 4093 printf("%-65s ", "DESC"); 4094 if (!cs2) 4095 printf("%12s %12s", "COUNT", "PCT_CHANGE"); 4096 else 4097 printf("%12s %11s %12s %14s %10s", "COUNT1", "COUNT2", "PCT_CHANGE", 4098 "PCT_CHANGE1", "PCT_CHANGE2"); 4099 4100 printf("\n"); 4101 print_separator2(SEP_LEN, "", 0); 4102 4103 #define CALC_PCT(_x, _y) ((_y) ? ((double)(_x) / (_y)) * 100 : 0.0) 4104 4105 #define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver) \ 4106 do { \ 4107 printf("%-65s: " _format, verbose_field ? _desc : #_name, \ 4108 cs1->_ver._name); \ 4109 if (!cs2) { \ 4110 if (_is_pct) \ 4111 printf(" ( %8.2lf%% )", \ 4112 CALC_PCT(cs1->_ver._name, cs1->_ver._pct_of)); \ 4113 } else { \ 4114 printf("," _format " | %8.2lf%% |", cs2->_ver._name, \ 4115 PCT_CHNG(cs1->_ver._name, cs2->_ver._name)); \ 4116 if (_is_pct) \ 4117 printf(" ( %8.2lf%%, %8.2lf%% )", \ 4118 CALC_PCT(cs1->_ver._name, cs1->_ver._pct_of), \ 4119 CALC_PCT(cs2->_ver._name, cs2->_ver._pct_of)); \ 4120 } \ 4121 printf("\n"); \ 4122 } while (0) 4123 4124 if (cs1->version == 15) { 4125 #include <perf/schedstat-v15.h> 4126 } else if (cs1->version == 16) { 4127 #include <perf/schedstat-v16.h> 4128 } else if (cs1->version == 17) { 4129 #include <perf/schedstat-v17.h> 4130 } 4131 4132 #undef CPU_FIELD 4133 #undef CALC_PCT 4134 } 4135 4136 static inline void print_domain_stats(struct perf_record_schedstat_domain *ds1, 4137 struct perf_record_schedstat_domain *ds2, 4138 __u64 jiffies1, __u64 jiffies2) 4139 { 4140 printf("%-65s ", "DESC"); 4141 if (!ds2) 4142 printf("%12s %14s", "COUNT", "AVG_JIFFIES"); 4143 else 4144 printf("%12s %11s %12s %16s %12s", "COUNT1", "COUNT2", "PCT_CHANGE", 4145 "AVG_JIFFIES1", "AVG_JIFFIES2"); 4146 printf("\n"); 4147 4148 #define DOMAIN_CATEGORY(_desc) \ 4149 do { \ 4150 size_t _len = strlen(_desc); \ 4151 size_t _pre_dash_cnt = (SEP_LEN - _len) / 2; \ 4152 size_t _post_dash_cnt = SEP_LEN - _len - _pre_dash_cnt; \ 4153 print_separator2((int)_pre_dash_cnt, _desc, (int)_post_dash_cnt);\ 4154 } while (0) 4155 4156 #define CALC_AVG(_x, _y) ((_y) ? (long double)(_x) / (_y) : 0.0) 4157 4158 #define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver) \ 4159 do { \ 4160 printf("%-65s: " _format, verbose_field ? _desc : #_name, \ 4161 ds1->_ver._name); \ 4162 if (!ds2) { \ 4163 if (_is_jiffies) \ 4164 printf(" $ %11.2Lf $", \ 4165 CALC_AVG(jiffies1, ds1->_ver._name)); \ 4166 } else { \ 4167 printf("," _format " | %8.2lf%% |", ds2->_ver._name, \ 4168 PCT_CHNG(ds1->_ver._name, ds2->_ver._name)); \ 4169 if (_is_jiffies) \ 4170 printf(" $ %11.2Lf, %11.2Lf $", \ 4171 CALC_AVG(jiffies1, ds1->_ver._name), \ 4172 CALC_AVG(jiffies2, ds2->_ver._name)); \ 4173 } \ 4174 printf("\n"); \ 4175 } while (0) 4176 4177 #define DERIVED_CNT_FIELD(_name, _desc, _format, _x, _y, _z, _ver) \ 4178 do { \ 4179 __u32 t1 = ds1->_ver._x - ds1->_ver._y - ds1->_ver._z; \ 4180 printf("*%-64s: " _format, verbose_field ? _desc : #_name, t1); \ 4181 if (ds2) { \ 4182 __u32 t2 = ds2->_ver._x - ds2->_ver._y - ds2->_ver._z; \ 4183 printf("," _format " | %8.2lf%% |", t2, \ 4184 PCT_CHNG(t1, t2)); \ 4185 } \ 4186 printf("\n"); \ 4187 } while (0) 4188 4189 #define DERIVED_AVG_FIELD(_name, _desc, _format, _x, _y, _z, _w, _ver) \ 4190 do { \ 4191 __u32 t1 = ds1->_ver._x - ds1->_ver._y - ds1->_ver._z; \ 4192 printf("*%-64s: " _format, verbose_field ? _desc : #_name, \ 4193 CALC_AVG(ds1->_ver._w, t1)); \ 4194 if (ds2) { \ 4195 __u32 t2 = ds2->_ver._x - ds2->_ver._y - ds2->_ver._z; \ 4196 printf("," _format " | %8.2Lf%% |", \ 4197 CALC_AVG(ds2->_ver._w, t2), \ 4198 PCT_CHNG(CALC_AVG(ds1->_ver._w, t1), \ 4199 CALC_AVG(ds2->_ver._w, t2))); \ 4200 } \ 4201 printf("\n"); \ 4202 } while (0) 4203 4204 if (ds1->version == 15) { 4205 #include <perf/schedstat-v15.h> 4206 } else if (ds1->version == 16) { 4207 #include <perf/schedstat-v16.h> 4208 } else if (ds1->version == 17) { 4209 #include <perf/schedstat-v17.h> 4210 } 4211 4212 #undef DERIVED_AVG_FIELD 4213 #undef DERIVED_CNT_FIELD 4214 #undef DOMAIN_FIELD 4215 #undef CALC_AVG 4216 #undef DOMAIN_CATEGORY 4217 } 4218 #undef PCT_CHNG 4219 4220 static void summarize_schedstat_cpu(struct schedstat_cpu *summary_cpu, 4221 struct schedstat_cpu *cptr, 4222 int cnt, bool is_last) 4223 { 4224 struct perf_record_schedstat_cpu *summary_cs = summary_cpu->cpu_data, 4225 *temp_cs = cptr->cpu_data; 4226 4227 #define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver) \ 4228 do { \ 4229 summary_cs->_ver._name += temp_cs->_ver._name; \ 4230 if (is_last) \ 4231 summary_cs->_ver._name /= cnt; \ 4232 } while (0) 4233 4234 if (cptr->cpu_data->version == 15) { 4235 #include <perf/schedstat-v15.h> 4236 } else if (cptr->cpu_data->version == 16) { 4237 #include <perf/schedstat-v16.h> 4238 } else if (cptr->cpu_data->version == 17) { 4239 #include <perf/schedstat-v17.h> 4240 } 4241 #undef CPU_FIELD 4242 } 4243 4244 static void summarize_schedstat_domain(struct schedstat_domain *summary_domain, 4245 struct schedstat_domain *dptr, 4246 int cnt, bool is_last) 4247 { 4248 struct perf_record_schedstat_domain *summary_ds = summary_domain->domain_data, 4249 *temp_ds = dptr->domain_data; 4250 4251 #define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver) \ 4252 do { \ 4253 summary_ds->_ver._name += temp_ds->_ver._name; \ 4254 if (is_last) \ 4255 summary_ds->_ver._name /= cnt; \ 4256 } while (0) 4257 4258 if (dptr->domain_data->version == 15) { 4259 #include <perf/schedstat-v15.h> 4260 } else if (dptr->domain_data->version == 16) { 4261 #include <perf/schedstat-v16.h> 4262 } else if (dptr->domain_data->version == 17) { 4263 #include <perf/schedstat-v17.h> 4264 } 4265 #undef DOMAIN_FIELD 4266 } 4267 4268 /* 4269 * get_all_cpu_stats() appends the summary to the head of the list. 4270 */ 4271 static int get_all_cpu_stats(struct list_head *head) 4272 { 4273 struct schedstat_cpu *cptr, *summary_head = NULL; 4274 struct schedstat_domain *dptr, *tdptr; 4275 bool is_last = false; 4276 int cnt = 1; 4277 int ret = 0; 4278 struct list_head tmp_cleanup_list; 4279 4280 assert(!list_empty(head)); 4281 cptr = list_first_entry(head, struct schedstat_cpu, cpu_list); 4282 4283 INIT_LIST_HEAD(&tmp_cleanup_list); 4284 4285 summary_head = zalloc(sizeof(*summary_head)); 4286 if (!summary_head) 4287 return -ENOMEM; 4288 4289 INIT_LIST_HEAD(&summary_head->domain_head); 4290 INIT_LIST_HEAD(&summary_head->cpu_list); 4291 list_add(&summary_head->cpu_list, &tmp_cleanup_list); 4292 4293 summary_head->cpu_data = zalloc(sizeof(*summary_head->cpu_data)); 4294 if (!summary_head->cpu_data) { 4295 ret = -ENOMEM; 4296 goto out_cleanup; 4297 } 4298 memcpy(summary_head->cpu_data, cptr->cpu_data, sizeof(*summary_head->cpu_data)); 4299 4300 list_for_each_entry(dptr, &cptr->domain_head, domain_list) { 4301 tdptr = zalloc(sizeof(*tdptr)); 4302 if (!tdptr) { 4303 ret = -ENOMEM; 4304 goto out_cleanup; 4305 } 4306 INIT_LIST_HEAD(&tdptr->domain_list); 4307 4308 tdptr->domain_data = zalloc(sizeof(*tdptr->domain_data)); 4309 if (!tdptr->domain_data) { 4310 free(tdptr); 4311 ret = -ENOMEM; 4312 goto out_cleanup; 4313 } 4314 4315 memcpy(tdptr->domain_data, dptr->domain_data, sizeof(*tdptr->domain_data)); 4316 list_add_tail(&tdptr->domain_list, &summary_head->domain_head); 4317 } 4318 4319 list_for_each_entry(cptr, head, cpu_list) { 4320 if (list_is_first(&cptr->cpu_list, head)) 4321 continue; 4322 4323 if (list_is_last(&cptr->cpu_list, head)) 4324 is_last = true; 4325 4326 cnt++; 4327 summarize_schedstat_cpu(summary_head, cptr, cnt, is_last); 4328 if (list_empty(&summary_head->domain_head)) 4329 continue; 4330 4331 tdptr = list_first_entry(&summary_head->domain_head, struct schedstat_domain, 4332 domain_list); 4333 4334 list_for_each_entry(dptr, &cptr->domain_head, domain_list) { 4335 summarize_schedstat_domain(tdptr, dptr, cnt, is_last); 4336 if (list_is_last(&tdptr->domain_list, &summary_head->domain_head)) { 4337 tdptr = NULL; 4338 break; 4339 } 4340 tdptr = list_next_entry(tdptr, domain_list); 4341 } 4342 } 4343 4344 list_del_init(&summary_head->cpu_list); 4345 list_add(&summary_head->cpu_list, head); 4346 return 0; 4347 4348 out_cleanup: 4349 free_schedstat(&tmp_cleanup_list); 4350 return ret; 4351 } 4352 4353 static int show_schedstat_data(struct list_head *head1, struct cpu_domain_map **cd_map1, int nr1, 4354 struct list_head *head2, struct cpu_domain_map **cd_map2, int nr2, 4355 bool summary_only) 4356 { 4357 struct schedstat_cpu *cptr1 = list_first_entry(head1, struct schedstat_cpu, cpu_list); 4358 struct perf_record_schedstat_domain *ds1 = NULL, *ds2 = NULL; 4359 struct schedstat_domain *dptr1 = NULL, *dptr2 = NULL; 4360 struct schedstat_cpu *cptr2 = NULL; 4361 __u64 jiffies1 = 0, jiffies2 = 0; 4362 bool is_summary = true; 4363 int ret = 0; 4364 4365 if (!cd_map1) { 4366 pr_err("Error: CPU domain map 1 is missing.\n"); 4367 return -1; 4368 } 4369 if (head2 && !cd_map2) { 4370 pr_err("Error: CPU domain map 2 is missing.\n"); 4371 return -1; 4372 } 4373 4374 printf("Description\n"); 4375 print_separator2(SEP_LEN, "", 0); 4376 printf("%-30s-> %s\n", "DESC", "Description of the field"); 4377 printf("%-30s-> %s\n", "COUNT", "Value of the field"); 4378 printf("%-30s-> %s\n", "PCT_CHANGE", "Percent change with corresponding base value"); 4379 printf("%-30s-> %s\n", "AVG_JIFFIES", 4380 "Avg time in jiffies between two consecutive occurrence of event"); 4381 4382 print_separator2(SEP_LEN, "", 0); 4383 printf("\n"); 4384 4385 printf("%-65s: ", "Time elapsed (in jiffies)"); 4386 jiffies1 = cptr1->cpu_data->timestamp; 4387 printf("%11llu", jiffies1); 4388 if (head2) { 4389 cptr2 = list_first_entry(head2, struct schedstat_cpu, cpu_list); 4390 jiffies2 = cptr2->cpu_data->timestamp; 4391 printf(",%11llu", jiffies2); 4392 } 4393 printf("\n"); 4394 4395 ret = get_all_cpu_stats(head1); 4396 if (ret) 4397 return ret; 4398 if (cptr2) { 4399 ret = get_all_cpu_stats(head2); 4400 if (ret) 4401 return ret; 4402 cptr2 = list_first_entry(head2, struct schedstat_cpu, cpu_list); 4403 } 4404 4405 list_for_each_entry(cptr1, head1, cpu_list) { 4406 struct cpu_domain_map *cd_info1 = NULL, *cd_info2 = NULL; 4407 struct perf_record_schedstat_cpu *cs1 = cptr1->cpu_data; 4408 struct perf_record_schedstat_cpu *cs2 = NULL; 4409 4410 dptr2 = NULL; 4411 if (cs1->cpu >= (u32)nr1) { 4412 pr_err("Error: CPU %d exceeds domain map size %d\n", cs1->cpu, nr1); 4413 return -1; 4414 } 4415 cd_info1 = cd_map1[cs1->cpu]; 4416 if (!cd_info1) { 4417 pr_err("Error: CPU %d domain info is missing in map 1.\n", 4418 cs1->cpu); 4419 return -1; 4420 } 4421 if (cptr2) { 4422 cs2 = cptr2->cpu_data; 4423 if (cs2->cpu >= (u32)nr2) { 4424 pr_err("Error: CPU %d exceeds domain map size %d\n", cs2->cpu, nr2); 4425 return -1; 4426 } 4427 cd_info2 = cd_map2[cs2->cpu]; 4428 if (!cd_info2) { 4429 pr_err("Error: CPU %d domain info is missing in map 2.\n", 4430 cs2->cpu); 4431 return -1; 4432 } 4433 if (!list_empty(&cptr2->domain_head)) 4434 dptr2 = list_first_entry(&cptr2->domain_head, 4435 struct schedstat_domain, 4436 domain_list); 4437 } 4438 4439 if (cs2 && cs1->cpu != cs2->cpu) { 4440 pr_err("Failed because matching cpus not found for diff\n"); 4441 return -1; 4442 } 4443 4444 if (cd_info2 && cd_info1->nr_domains != cd_info2->nr_domains) { 4445 pr_err("Failed because nr_domains is not same for cpus\n"); 4446 return -1; 4447 } 4448 4449 print_separator2(SEP_LEN, "", 0); 4450 4451 if (is_summary) 4452 printf("CPU: <ALL CPUS SUMMARY>\n"); 4453 else 4454 printf("CPU: %d\n", cs1->cpu); 4455 4456 print_separator2(SEP_LEN, "", 0); 4457 print_cpu_stats(cs1, cs2); 4458 print_separator2(SEP_LEN, "", 0); 4459 4460 list_for_each_entry(dptr1, &cptr1->domain_head, domain_list) { 4461 struct domain_info *dinfo1 = NULL, *dinfo2 = NULL; 4462 4463 ds1 = dptr1->domain_data; 4464 ds2 = NULL; 4465 if (ds1->domain >= cd_info1->nr_domains) { 4466 pr_err("Error: Domain %d exceeds max domains %d for CPU %d in map 1.\n", 4467 ds1->domain, cd_info1->nr_domains, cs1->cpu); 4468 return -1; 4469 } 4470 dinfo1 = cd_info1->domains[ds1->domain]; 4471 if (!dinfo1) { 4472 pr_err("Error: Domain %d info is missing for CPU %d in map 1.\n", 4473 ds1->domain, cs1->cpu); 4474 return -1; 4475 } 4476 if (dptr2) { 4477 ds2 = dptr2->domain_data; 4478 if (ds2->domain >= cd_info2->nr_domains) { 4479 pr_err("Error: Domain %d exceeds max domains %d for CPU %d in map 2.\n", 4480 ds2->domain, cd_info2->nr_domains, cs2->cpu); 4481 return -1; 4482 } 4483 dinfo2 = cd_info2->domains[ds2->domain]; 4484 if (!dinfo2) { 4485 pr_err("Error: Domain %d info is missing for CPU %d in map 2.\n", 4486 ds2->domain, cs2->cpu); 4487 return -1; 4488 } 4489 } 4490 4491 if (dinfo2 && dinfo1->domain != dinfo2->domain) { 4492 pr_err("Failed because matching domain not found for diff\n"); 4493 return -1; 4494 } 4495 4496 if (is_summary) { 4497 if (dinfo1->dname) 4498 printf("CPU: <ALL CPUS SUMMARY> | DOMAIN: %s\n", 4499 dinfo1->dname); 4500 else 4501 printf("CPU: <ALL CPUS SUMMARY> | DOMAIN: %d\n", 4502 dinfo1->domain); 4503 } else { 4504 if (dinfo1->dname) 4505 printf("CPU: %d | DOMAIN: %s | DOMAIN_CPUS: ", 4506 cs1->cpu, dinfo1->dname); 4507 else 4508 printf("CPU: %d | DOMAIN: %d | DOMAIN_CPUS: ", 4509 cs1->cpu, dinfo1->domain); 4510 4511 printf("%s\n", dinfo1->cpulist); 4512 } 4513 print_separator2(SEP_LEN, "", 0); 4514 print_domain_stats(ds1, ds2, jiffies1, jiffies2); 4515 print_separator2(SEP_LEN, "", 0); 4516 4517 if (dptr2) { 4518 if (list_is_last(&dptr2->domain_list, &cptr2->domain_head)) 4519 dptr2 = NULL; 4520 else 4521 dptr2 = list_next_entry(dptr2, domain_list); 4522 } 4523 } 4524 if (summary_only) 4525 break; 4526 4527 if (cptr2) { 4528 if (list_is_last(&cptr2->cpu_list, head2)) 4529 cptr2 = NULL; 4530 else 4531 cptr2 = list_next_entry(cptr2, cpu_list); 4532 } 4533 4534 is_summary = false; 4535 } 4536 return ret; 4537 } 4538 4539 /* 4540 * Creates a linked list of cpu_data and domain_data. Below represents the structure of the linked 4541 * list where CPU0,CPU1,CPU2, ..., CPU(N-1) stores the cpu_data. Here N is the total number of cpus. 4542 * Each of the CPU points to the list of domain_data. Here DOMAIN0, DOMAIN1, DOMAIN2, ... represents 4543 * the domain_data. Here D0, D1, D2, ..., Dm are the number of domains in the respective cpus. 4544 * 4545 * +----------+ 4546 * | CPU_HEAD | 4547 * +----------+ 4548 * | 4549 * v 4550 * +----------+ +---------+ +---------+ +---------+ +--------------+ 4551 * | CPU0 | -> | DOMAIN0 | -> | DOMAIN1 | -> | DOMAIN2 | -> ... -> | DOMAIN(D0-1) | 4552 * +----------+ +---------+ +---------+ +---------+ +--------------+ 4553 * | 4554 * v 4555 * +----------+ +---------+ +---------+ +---------+ +--------------+ 4556 * | CPU1 | -> | DOMAIN0 | -> | DOMAIN1 | -> | DOMAIN2 | -> ... -> | DOMAIN(D1-1) | 4557 * +----------+ +---------+ +---------+ +---------+ +--------------+ 4558 * | 4559 * v 4560 * +----------+ +---------+ +---------+ +---------+ +--------------+ 4561 * | CPU2 | -> | DOMAIN0 | -> | DOMAIN1 | -> | DOMAIN2 | -> ... -> | DOMAIN(D2-1) | 4562 * +----------+ +---------+ +---------+ +---------+ +--------------+ 4563 * | 4564 * v 4565 * ... 4566 * | 4567 * v 4568 * +----------+ +---------+ +---------+ +---------+ +--------------+ 4569 * | CPU(N-1) | -> | DOMAIN0 | -> | DOMAIN1 | -> | DOMAIN2 | -> ... -> | DOMAIN(Dm-1) | 4570 * +----------+ +---------+ +---------+ +---------+ +--------------+ 4571 * 4572 * Each cpu as well as domain has 2 enties in the event list one before the workload starts and 4573 * other after completion of the workload. The above linked list stores the diff of the cpu and 4574 * domain statistics. 4575 */ 4576 static int perf_sched__process_schedstat(const struct perf_tool *tool __maybe_unused, 4577 struct perf_session *session __maybe_unused, 4578 union perf_event *event) 4579 { 4580 struct perf_cpu this_cpu; 4581 static __u32 initial_cpu; 4582 4583 switch (event->header.type) { 4584 case PERF_RECORD_SCHEDSTAT_CPU: 4585 this_cpu.cpu = event->schedstat_cpu.cpu; 4586 break; 4587 case PERF_RECORD_SCHEDSTAT_DOMAIN: 4588 this_cpu.cpu = event->schedstat_domain.cpu; 4589 break; 4590 default: 4591 return 0; 4592 } 4593 4594 if (user_requested_cpus && !perf_cpu_map__has(user_requested_cpus, this_cpu)) 4595 return 0; 4596 4597 if (event->header.type == PERF_RECORD_SCHEDSTAT_CPU) { 4598 struct schedstat_cpu *temp = zalloc(sizeof(*temp)); 4599 4600 if (!temp) 4601 return -ENOMEM; 4602 4603 temp->cpu_data = zalloc(sizeof(*temp->cpu_data)); 4604 if (!temp->cpu_data) 4605 return -ENOMEM; 4606 4607 memcpy(temp->cpu_data, &event->schedstat_cpu, sizeof(*temp->cpu_data)); 4608 4609 if (!list_empty(&cpu_head) && temp->cpu_data->cpu == initial_cpu) 4610 after_workload_flag = true; 4611 4612 if (!after_workload_flag) { 4613 if (list_empty(&cpu_head)) 4614 initial_cpu = temp->cpu_data->cpu; 4615 4616 list_add_tail(&temp->cpu_list, &cpu_head); 4617 INIT_LIST_HEAD(&temp->domain_head); 4618 } else { 4619 if (temp->cpu_data->cpu == initial_cpu) { 4620 cpu_second_pass = list_first_entry(&cpu_head, struct schedstat_cpu, 4621 cpu_list); 4622 cpu_second_pass->cpu_data->timestamp = 4623 temp->cpu_data->timestamp - cpu_second_pass->cpu_data->timestamp; 4624 } else { 4625 cpu_second_pass = list_next_entry(cpu_second_pass, cpu_list); 4626 } 4627 domain_second_pass = list_first_entry(&cpu_second_pass->domain_head, 4628 struct schedstat_domain, domain_list); 4629 store_schedstat_cpu_diff(temp); 4630 } 4631 } else if (event->header.type == PERF_RECORD_SCHEDSTAT_DOMAIN) { 4632 struct schedstat_cpu *cpu_tail; 4633 struct schedstat_domain *temp = zalloc(sizeof(*temp)); 4634 4635 if (!temp) 4636 return -ENOMEM; 4637 4638 temp->domain_data = zalloc(sizeof(*temp->domain_data)); 4639 if (!temp->domain_data) 4640 return -ENOMEM; 4641 4642 memcpy(temp->domain_data, &event->schedstat_domain, sizeof(*temp->domain_data)); 4643 4644 if (!after_workload_flag) { 4645 cpu_tail = list_last_entry(&cpu_head, struct schedstat_cpu, cpu_list); 4646 list_add_tail(&temp->domain_list, &cpu_tail->domain_head); 4647 } else { 4648 store_schedstat_domain_diff(temp); 4649 domain_second_pass = list_next_entry(domain_second_pass, domain_list); 4650 } 4651 } 4652 4653 return 0; 4654 } 4655 4656 static void free_schedstat(struct list_head *head) 4657 { 4658 struct schedstat_domain *dptr, *n1; 4659 struct schedstat_cpu *cptr, *n2; 4660 4661 list_for_each_entry_safe(cptr, n2, head, cpu_list) { 4662 list_for_each_entry_safe(dptr, n1, &cptr->domain_head, domain_list) { 4663 list_del_init(&dptr->domain_list); 4664 free(dptr->domain_data); 4665 free(dptr); 4666 } 4667 list_del_init(&cptr->cpu_list); 4668 free(cptr->cpu_data); 4669 free(cptr); 4670 } 4671 } 4672 4673 static int perf_sched__schedstat_report(struct perf_sched *sched) 4674 { 4675 struct cpu_domain_map **cd_map; 4676 struct perf_session *session; 4677 struct target target = {}; 4678 struct perf_data data = { 4679 .path = input_name, 4680 .mode = PERF_DATA_MODE_READ, 4681 }; 4682 int err = 0; 4683 4684 sched->tool.schedstat_cpu = perf_sched__process_schedstat; 4685 sched->tool.schedstat_domain = perf_sched__process_schedstat; 4686 4687 session = perf_session__new(&data, &sched->tool); 4688 if (IS_ERR(session)) { 4689 pr_err("Perf session creation failed.\n"); 4690 return PTR_ERR(session); 4691 } 4692 4693 if (cpu_list) 4694 target.cpu_list = cpu_list; 4695 else 4696 target.system_wide = true; 4697 4698 err = evlist__create_maps(session->evlist, &target); 4699 if (err < 0) 4700 goto out; 4701 4702 user_requested_cpus = session->evlist->core.user_requested_cpus; 4703 4704 err = perf_session__process_events(session); 4705 4706 if (!err) { 4707 setup_pager(); 4708 4709 if (list_empty(&cpu_head)) { 4710 pr_err("Data is not available\n"); 4711 err = -1; 4712 goto out; 4713 } 4714 4715 cd_map = session->header.env.cpu_domain; 4716 err = show_schedstat_data(&cpu_head, cd_map, 4717 session->header.env.nr_cpus_avail, 4718 NULL, NULL, 0, false); 4719 } 4720 4721 out: 4722 free_schedstat(&cpu_head); 4723 perf_session__delete(session); 4724 return err; 4725 } 4726 4727 static int perf_sched__schedstat_diff(struct perf_sched *sched, 4728 int argc, const char **argv) 4729 { 4730 struct cpu_domain_map **cd_map0 = NULL, **cd_map1 = NULL; 4731 struct list_head cpu_head_ses0, cpu_head_ses1; 4732 struct perf_session *session[2]; 4733 struct perf_data data[2] = {0}; 4734 int ret = 0, err = 0; 4735 static const char *defaults[] = { 4736 "perf.data.old", 4737 "perf.data", 4738 }; 4739 4740 if (argc) { 4741 if (argc == 1) 4742 defaults[1] = argv[0]; 4743 else if (argc == 2) { 4744 defaults[0] = argv[0]; 4745 defaults[1] = argv[1]; 4746 } else { 4747 pr_err("perf sched stats diff is not supported with more than 2 files.\n"); 4748 goto out_ret; 4749 } 4750 } 4751 4752 INIT_LIST_HEAD(&cpu_head_ses0); 4753 INIT_LIST_HEAD(&cpu_head_ses1); 4754 4755 sched->tool.schedstat_cpu = perf_sched__process_schedstat; 4756 sched->tool.schedstat_domain = perf_sched__process_schedstat; 4757 4758 data[0].path = defaults[0]; 4759 data[0].mode = PERF_DATA_MODE_READ; 4760 session[0] = perf_session__new(&data[0], &sched->tool); 4761 if (IS_ERR(session[0])) { 4762 ret = PTR_ERR(session[0]); 4763 pr_err("Failed to open %s\n", data[0].path); 4764 goto out_delete_ses0; 4765 } 4766 4767 err = perf_session__process_events(session[0]); 4768 if (err) { 4769 free_schedstat(&cpu_head); 4770 goto out_delete_ses0; 4771 } 4772 4773 cd_map0 = session[0]->header.env.cpu_domain; 4774 list_replace_init(&cpu_head, &cpu_head_ses0); 4775 after_workload_flag = false; 4776 4777 data[1].path = defaults[1]; 4778 data[1].mode = PERF_DATA_MODE_READ; 4779 session[1] = perf_session__new(&data[1], &sched->tool); 4780 if (IS_ERR(session[1])) { 4781 ret = PTR_ERR(session[1]); 4782 pr_err("Failed to open %s\n", data[1].path); 4783 goto out_delete_ses1; 4784 } 4785 4786 err = perf_session__process_events(session[1]); 4787 if (err) { 4788 free_schedstat(&cpu_head); 4789 goto out_delete_ses1; 4790 } 4791 4792 cd_map1 = session[1]->header.env.cpu_domain; 4793 list_replace_init(&cpu_head, &cpu_head_ses1); 4794 after_workload_flag = false; 4795 setup_pager(); 4796 4797 if (list_empty(&cpu_head_ses1)) { 4798 pr_err("Data is not available\n"); 4799 ret = -1; 4800 goto out_delete_ses1; 4801 } 4802 4803 if (list_empty(&cpu_head_ses0)) { 4804 pr_err("Data is not available\n"); 4805 ret = -1; 4806 goto out_delete_ses1; 4807 } 4808 4809 ret = show_schedstat_data(&cpu_head_ses0, cd_map0, session[0]->header.env.nr_cpus_avail, 4810 &cpu_head_ses1, cd_map1, session[1]->header.env.nr_cpus_avail, true); 4811 if (ret) 4812 goto out_delete_ses1; 4813 4814 out_delete_ses1: 4815 free_schedstat(&cpu_head_ses1); 4816 if (!IS_ERR(session[1])) 4817 perf_session__delete(session[1]); 4818 4819 out_delete_ses0: 4820 free_schedstat(&cpu_head_ses0); 4821 if (!IS_ERR(session[0])) 4822 perf_session__delete(session[0]); 4823 4824 out_ret: 4825 return ret; 4826 } 4827 4828 static int process_synthesized_event_live(const struct perf_tool *tool __maybe_unused, 4829 union perf_event *event, 4830 struct perf_sample *sample __maybe_unused, 4831 struct machine *machine __maybe_unused) 4832 { 4833 return perf_sched__process_schedstat(tool, NULL, event); 4834 } 4835 4836 static int perf_sched__schedstat_live(struct perf_sched *sched, 4837 int argc, const char **argv) 4838 { 4839 struct cpu_domain_map **cd_map = NULL; 4840 struct target target = {}; 4841 u32 __maybe_unused md; 4842 struct evlist *evlist; 4843 u32 nr = 0, sv; 4844 int reset = 0; 4845 int err = 0; 4846 4847 done = 0; 4848 signal(SIGINT, sighandler); 4849 signal(SIGCHLD, sighandler); 4850 signal(SIGTERM, sighandler); 4851 4852 evlist = evlist__new(); 4853 if (!evlist) 4854 return -ENOMEM; 4855 4856 /* 4857 * `perf sched schedstat` does not support workload profiling (-p pid) 4858 * since /proc/schedstat file contains cpu specific data only. Hence, a 4859 * profile target is either set of cpus or systemwide, never a process. 4860 * Note that, although `-- <workload>` is supported, profile data are 4861 * still cpu/systemwide. 4862 */ 4863 if (cpu_list) 4864 target.cpu_list = cpu_list; 4865 else 4866 target.system_wide = true; 4867 4868 if (argc) { 4869 err = evlist__prepare_workload(evlist, &target, argv, false, NULL); 4870 if (err) 4871 goto out; 4872 } 4873 4874 err = evlist__create_maps(evlist, &target); 4875 if (err < 0) 4876 goto out; 4877 4878 user_requested_cpus = evlist->core.user_requested_cpus; 4879 4880 err = perf_event__synthesize_schedstat(&(sched->tool), 4881 process_synthesized_event_live, 4882 user_requested_cpus); 4883 if (err < 0) 4884 goto out; 4885 4886 err = enable_sched_schedstats(&reset); 4887 if (err < 0) 4888 goto out; 4889 4890 if (argc) 4891 evlist__start_workload(evlist); 4892 4893 while (!done) { 4894 if (argc && waitpid(evlist->workload.pid, NULL, WNOHANG) > 0) 4895 break; 4896 sleep(1); 4897 } 4898 4899 if (reset) { 4900 err = disable_sched_schedstat(); 4901 if (err < 0) 4902 goto out; 4903 } 4904 4905 err = perf_event__synthesize_schedstat(&(sched->tool), 4906 process_synthesized_event_live, 4907 user_requested_cpus); 4908 if (err) 4909 goto out; 4910 4911 setup_pager(); 4912 4913 if (list_empty(&cpu_head)) { 4914 pr_err("Data is not available\n"); 4915 err = -1; 4916 goto out; 4917 } 4918 4919 nr = cpu__max_present_cpu().cpu; 4920 cd_map = build_cpu_domain_map(&sv, &md, nr); 4921 if (!cd_map) { 4922 pr_err("Unable to generate cpu-domain relation info"); 4923 goto out; 4924 } 4925 4926 err = show_schedstat_data(&cpu_head, cd_map, nr, NULL, NULL, 0, false); 4927 free_cpu_domain_info(cd_map, sv, nr); 4928 out: 4929 free_schedstat(&cpu_head); 4930 evlist__delete(evlist); 4931 return err; 4932 } 4933 4934 static bool schedstat_events_exposed(void) 4935 { 4936 /* 4937 * Select "sched:sched_stat_wait" event to check 4938 * whether schedstat tracepoints are exposed. 4939 */ 4940 return IS_ERR(trace_event__tp_format("sched", "sched_stat_wait")) ? 4941 false : true; 4942 } 4943 4944 static int __cmd_record(int argc, const char **argv) 4945 { 4946 unsigned int rec_argc, i, j; 4947 char **rec_argv; 4948 const char **rec_argv_copy; 4949 const char * const record_args[] = { 4950 "record", 4951 "-a", 4952 "-R", 4953 "-m", "1024", 4954 "-c", "1", 4955 "-e", "sched:sched_switch", 4956 "-e", "sched:sched_stat_runtime", 4957 "-e", "sched:sched_process_fork", 4958 "-e", "sched:sched_wakeup_new", 4959 "-e", "sched:sched_migrate_task", 4960 }; 4961 4962 /* 4963 * The tracepoints trace_sched_stat_{wait, sleep, iowait} 4964 * are not exposed to user if CONFIG_SCHEDSTATS is not set, 4965 * to prevent "perf sched record" execution failure, determine 4966 * whether to record schedstat events according to actual situation. 4967 */ 4968 const char * const schedstat_args[] = { 4969 "-e", "sched:sched_stat_wait", 4970 "-e", "sched:sched_stat_sleep", 4971 "-e", "sched:sched_stat_iowait", 4972 }; 4973 unsigned int schedstat_argc = schedstat_events_exposed() ? 4974 ARRAY_SIZE(schedstat_args) : 0; 4975 4976 struct tep_event *waking_event; 4977 int ret; 4978 4979 /* 4980 * +2 for either "-e", "sched:sched_wakeup" or 4981 * "-e", "sched:sched_waking" 4982 */ 4983 rec_argc = ARRAY_SIZE(record_args) + 2 + schedstat_argc + argc - 1; 4984 rec_argv = calloc(rec_argc + 1, sizeof(char *)); 4985 if (rec_argv == NULL) 4986 return -ENOMEM; 4987 rec_argv_copy = calloc(rec_argc + 1, sizeof(char *)); 4988 if (rec_argv_copy == NULL) { 4989 free(rec_argv); 4990 return -ENOMEM; 4991 } 4992 4993 for (i = 0; i < ARRAY_SIZE(record_args); i++) 4994 rec_argv[i] = strdup(record_args[i]); 4995 4996 rec_argv[i++] = strdup("-e"); 4997 waking_event = trace_event__tp_format("sched", "sched_waking"); 4998 if (!IS_ERR(waking_event)) 4999 rec_argv[i++] = strdup("sched:sched_waking"); 5000 else 5001 rec_argv[i++] = strdup("sched:sched_wakeup"); 5002 5003 for (j = 0; j < schedstat_argc; j++) 5004 rec_argv[i++] = strdup(schedstat_args[j]); 5005 5006 for (j = 1; j < (unsigned int)argc; j++, i++) 5007 rec_argv[i] = strdup(argv[j]); 5008 5009 BUG_ON(i != rec_argc); 5010 5011 memcpy(rec_argv_copy, rec_argv, sizeof(char *) * rec_argc); 5012 ret = cmd_record(rec_argc, rec_argv_copy); 5013 5014 for (i = 0; i < rec_argc; i++) 5015 free(rec_argv[i]); 5016 free(rec_argv); 5017 free(rec_argv_copy); 5018 5019 return ret; 5020 } 5021 5022 int cmd_sched(int argc, const char **argv) 5023 { 5024 static const char default_sort_order[] = "avg, max, switch, runtime"; 5025 struct perf_sched sched = { 5026 .cmp_pid = LIST_HEAD_INIT(sched.cmp_pid), 5027 .sort_list = LIST_HEAD_INIT(sched.sort_list), 5028 .sort_order = default_sort_order, 5029 .replay_repeat = 10, 5030 .profile_cpu = -1, 5031 .next_shortname1 = 'A', 5032 .next_shortname2 = '0', 5033 .skip_merge = 0, 5034 .show_callchain = 1, 5035 .max_stack = 5, 5036 }; 5037 const struct option sched_options[] = { 5038 OPT_STRING('i', "input", &input_name, "file", 5039 "input file name"), 5040 OPT_INCR('v', "verbose", &verbose, 5041 "be more verbose (show symbol address, etc)"), 5042 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, 5043 "dump raw trace in ASCII"), 5044 OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"), 5045 OPT_END() 5046 }; 5047 const struct option latency_options[] = { 5048 OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]", 5049 "sort by key(s): runtime, switch, avg, max"), 5050 OPT_INTEGER('C', "CPU", &sched.profile_cpu, 5051 "CPU to profile on"), 5052 OPT_BOOLEAN('p', "pids", &sched.skip_merge, 5053 "latency stats per pid instead of per comm"), 5054 OPT_PARENT(sched_options) 5055 }; 5056 const struct option replay_options[] = { 5057 OPT_UINTEGER('r', "repeat", &sched.replay_repeat, 5058 "repeat the workload replay N times (0: infinite)"), 5059 OPT_PARENT(sched_options) 5060 }; 5061 const struct option map_options[] = { 5062 OPT_BOOLEAN(0, "compact", &sched.map.comp, 5063 "map output in compact mode"), 5064 OPT_STRING(0, "color-pids", &sched.map.color_pids_str, "pids", 5065 "highlight given pids in map"), 5066 OPT_STRING(0, "color-cpus", &sched.map.color_cpus_str, "cpus", 5067 "highlight given CPUs in map"), 5068 OPT_STRING(0, "cpus", &sched.map.cpus_str, "cpus", 5069 "display given CPUs in map"), 5070 OPT_STRING(0, "task-name", &sched.map.task_name, "task", 5071 "map output only for the given task name(s)."), 5072 OPT_BOOLEAN(0, "fuzzy-name", &sched.map.fuzzy, 5073 "given command name can be partially matched (fuzzy matching)"), 5074 OPT_PARENT(sched_options) 5075 }; 5076 const struct option timehist_options[] = { 5077 OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name, 5078 "file", "vmlinux pathname"), 5079 OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name, 5080 "file", "kallsyms pathname"), 5081 OPT_BOOLEAN('g', "call-graph", &sched.show_callchain, 5082 "Display call chains if present (default on)"), 5083 OPT_UINTEGER(0, "max-stack", &sched.max_stack, 5084 "Maximum number of functions to display backtrace."), 5085 OPT_CALLBACK(0, "symfs", NULL, "directory[,layout]", SYMFS_HELP, 5086 symbol__config_symfs), 5087 OPT_BOOLEAN('s', "summary", &sched.summary_only, 5088 "Show only syscall summary with statistics"), 5089 OPT_BOOLEAN('S', "with-summary", &sched.summary, 5090 "Show all syscalls and summary with statistics"), 5091 OPT_BOOLEAN('w', "wakeups", &sched.show_wakeups, "Show wakeup events"), 5092 OPT_BOOLEAN('n', "next", &sched.show_next, "Show next task"), 5093 OPT_BOOLEAN('M', "migrations", &sched.show_migrations, "Show migration events"), 5094 OPT_BOOLEAN('V', "cpu-visual", &sched.show_cpu_visual, "Add CPU visual"), 5095 OPT_BOOLEAN('I', "idle-hist", &sched.idle_hist, "Show idle events only"), 5096 OPT_STRING(0, "time", &sched.time_str, "str", 5097 "Time span for analysis (start,stop)"), 5098 OPT_BOOLEAN(0, "state", &sched.show_state, "Show task state when sched-out"), 5099 OPT_STRING('p', "pid", &symbol_conf.pid_list_str, "pid[,pid...]", 5100 "analyze events only for given process id(s)"), 5101 OPT_STRING('t', "tid", &symbol_conf.tid_list_str, "tid[,tid...]", 5102 "analyze events only for given thread id(s)"), 5103 OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"), 5104 OPT_BOOLEAN(0, "show-prio", &sched.show_prio, "Show task priority"), 5105 OPT_STRING(0, "prio", &sched.prio_str, "prio", 5106 "analyze events only for given task priority(ies)"), 5107 OPT_BOOLEAN('P', "pre-migrations", &sched.pre_migrations, "Show pre-migration wait time"), 5108 OPT_PARENT(sched_options) 5109 }; 5110 const struct option stats_options[] = { 5111 OPT_STRING('i', "input", &input_name, "file", 5112 "`stats report` with input filename"), 5113 OPT_STRING('o', "output", &output_name, "file", 5114 "`stats record` with output filename"), 5115 OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"), 5116 OPT_BOOLEAN('v', "verbose", &verbose_field, "Show explanation for fields in the report"), 5117 OPT_END() 5118 }; 5119 5120 const char * const latency_usage[] = { 5121 "perf sched latency [<options>]", 5122 NULL 5123 }; 5124 const char * const replay_usage[] = { 5125 "perf sched replay [<options>]", 5126 NULL 5127 }; 5128 const char * const map_usage[] = { 5129 "perf sched map [<options>]", 5130 NULL 5131 }; 5132 const char * const timehist_usage[] = { 5133 "perf sched timehist [<options>]", 5134 NULL 5135 }; 5136 const char *stats_usage[] = { 5137 "perf sched stats {record|report} [<options>]", 5138 NULL 5139 }; 5140 const char *const sched_subcommands[] = { "record", "latency", "map", 5141 "replay", "script", 5142 "timehist", "stats", NULL }; 5143 const char *sched_usage[] = { 5144 NULL, 5145 NULL 5146 }; 5147 struct trace_sched_handler lat_ops = { 5148 .wakeup_event = latency_wakeup_event, 5149 .switch_event = latency_switch_event, 5150 .runtime_event = latency_runtime_event, 5151 .migrate_task_event = latency_migrate_task_event, 5152 }; 5153 struct trace_sched_handler map_ops = { 5154 .switch_event = map_switch_event, 5155 }; 5156 struct trace_sched_handler replay_ops = { 5157 .wakeup_event = replay_wakeup_event, 5158 .switch_event = replay_switch_event, 5159 .fork_event = replay_fork_event, 5160 }; 5161 struct trace_sched_handler stats_ops = {}; 5162 int ret; 5163 5164 perf_tool__init(&sched.tool, /*ordered_events=*/true); 5165 sched.tool.sample = perf_sched__process_tracepoint_sample; 5166 sched.tool.comm = perf_sched__process_comm; 5167 sched.tool.namespaces = perf_event__process_namespaces; 5168 sched.tool.lost = perf_event__process_lost; 5169 sched.tool.fork = perf_sched__process_fork_event; 5170 5171 argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands, 5172 sched_usage, PARSE_OPT_STOP_AT_NON_OPTION); 5173 if (!argc) 5174 usage_with_options(sched_usage, sched_options); 5175 5176 thread__set_priv_destructor(free); 5177 5178 /* 5179 * Aliased to 'perf script' for now: 5180 */ 5181 if (!strcmp(argv[0], "script")) { 5182 ret = cmd_script(argc, argv); 5183 } else if (strlen(argv[0]) > 2 && strstarts("record", argv[0])) { 5184 ret = __cmd_record(argc, argv); 5185 } else if (strlen(argv[0]) > 2 && strstarts("latency", argv[0])) { 5186 sched.tp_handler = &lat_ops; 5187 if (argc > 1) { 5188 argc = parse_options(argc, argv, latency_options, latency_usage, 0); 5189 if (argc) 5190 usage_with_options(latency_usage, latency_options); 5191 } 5192 setup_sorting(&sched, latency_options, latency_usage); 5193 ret = perf_sched__lat(&sched); 5194 } else if (!strcmp(argv[0], "map")) { 5195 if (argc) { 5196 argc = parse_options(argc, argv, map_options, map_usage, 0); 5197 if (argc) 5198 usage_with_options(map_usage, map_options); 5199 5200 if (sched.map.task_name) { 5201 sched.map.task_names = strlist__new(sched.map.task_name, NULL); 5202 if (sched.map.task_names == NULL) { 5203 fprintf(stderr, "Failed to parse task names\n"); 5204 ret = -1; 5205 goto out; 5206 } 5207 } 5208 } 5209 sched.tp_handler = &map_ops; 5210 setup_sorting(&sched, latency_options, latency_usage); 5211 ret = perf_sched__map(&sched); 5212 } else if (strlen(argv[0]) > 2 && strstarts("replay", argv[0])) { 5213 sched.tp_handler = &replay_ops; 5214 if (argc) { 5215 argc = parse_options(argc, argv, replay_options, replay_usage, 0); 5216 if (argc) 5217 usage_with_options(replay_usage, replay_options); 5218 } 5219 ret = perf_sched__replay(&sched); 5220 } else if (!strcmp(argv[0], "timehist")) { 5221 if (argc) { 5222 argc = parse_options(argc, argv, timehist_options, 5223 timehist_usage, 0); 5224 if (argc) 5225 usage_with_options(timehist_usage, timehist_options); 5226 } 5227 if ((sched.show_wakeups || sched.show_next) && 5228 sched.summary_only) { 5229 pr_err(" Error: -s and -[n|w] are mutually exclusive.\n"); 5230 parse_options_usage(timehist_usage, timehist_options, "s", true); 5231 if (sched.show_wakeups) 5232 parse_options_usage(NULL, timehist_options, "w", true); 5233 if (sched.show_next) 5234 parse_options_usage(NULL, timehist_options, "n", true); 5235 ret = -EINVAL; 5236 goto out; 5237 } 5238 ret = symbol__validate_sym_arguments(); 5239 if (!ret) 5240 ret = perf_sched__timehist(&sched); 5241 } else if (!strcmp(argv[0], "stats")) { 5242 const char *const stats_subcommands[] = {"record", "report", NULL}; 5243 5244 sched.tp_handler = &stats_ops; 5245 argc = parse_options_subcommand(argc, argv, stats_options, 5246 stats_subcommands, 5247 stats_usage, 5248 PARSE_OPT_STOP_AT_NON_OPTION); 5249 5250 if (argv[0] && !strcmp(argv[0], "record")) { 5251 if (argc) 5252 argc = parse_options(argc, argv, stats_options, 5253 stats_usage, 0); 5254 return perf_sched__schedstat_record(&sched, argc, argv); 5255 } else if (argv[0] && !strcmp(argv[0], "report")) { 5256 if (argc) 5257 argc = parse_options(argc, argv, stats_options, 5258 stats_usage, 0); 5259 return perf_sched__schedstat_report(&sched); 5260 } else if (argv[0] && !strcmp(argv[0], "diff")) { 5261 if (argc) 5262 argc = parse_options(argc, argv, stats_options, 5263 stats_usage, 0); 5264 return perf_sched__schedstat_diff(&sched, argc, argv); 5265 } 5266 return perf_sched__schedstat_live(&sched, argc, argv); 5267 } else { 5268 usage_with_options(sched_usage, sched_options); 5269 } 5270 5271 out: 5272 /* free usage string allocated by parse_options_subcommand */ 5273 free((void *)sched_usage[0]); 5274 5275 return ret; 5276 } 5277