1 #include "builtin.h" 2 #include "perf.h" 3 4 #include "util/util.h" 5 #include "util/evlist.h" 6 #include "util/cache.h" 7 #include "util/evsel.h" 8 #include "util/symbol.h" 9 #include "util/thread.h" 10 #include "util/header.h" 11 #include "util/session.h" 12 #include "util/tool.h" 13 #include "util/cloexec.h" 14 #include "util/thread_map.h" 15 #include "util/color.h" 16 17 #include <subcmd/parse-options.h> 18 #include "util/trace-event.h" 19 20 #include "util/debug.h" 21 22 #include <sys/prctl.h> 23 #include <sys/resource.h> 24 25 #include <semaphore.h> 26 #include <pthread.h> 27 #include <math.h> 28 #include <api/fs/fs.h> 29 #include <linux/time64.h> 30 31 #define PR_SET_NAME 15 /* Set process name */ 32 #define MAX_CPUS 4096 33 #define COMM_LEN 20 34 #define SYM_LEN 129 35 #define MAX_PID 1024000 36 37 struct sched_atom; 38 39 struct task_desc { 40 unsigned long nr; 41 unsigned long pid; 42 char comm[COMM_LEN]; 43 44 unsigned long nr_events; 45 unsigned long curr_event; 46 struct sched_atom **atoms; 47 48 pthread_t thread; 49 sem_t sleep_sem; 50 51 sem_t ready_for_work; 52 sem_t work_done_sem; 53 54 u64 cpu_usage; 55 }; 56 57 enum sched_event_type { 58 SCHED_EVENT_RUN, 59 SCHED_EVENT_SLEEP, 60 SCHED_EVENT_WAKEUP, 61 SCHED_EVENT_MIGRATION, 62 }; 63 64 struct sched_atom { 65 enum sched_event_type type; 66 int specific_wait; 67 u64 timestamp; 68 u64 duration; 69 unsigned long nr; 70 sem_t *wait_sem; 71 struct task_desc *wakee; 72 }; 73 74 #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP" 75 76 enum thread_state { 77 THREAD_SLEEPING = 0, 78 THREAD_WAIT_CPU, 79 THREAD_SCHED_IN, 80 THREAD_IGNORE 81 }; 82 83 struct work_atom { 84 struct list_head list; 85 enum thread_state state; 86 u64 sched_out_time; 87 u64 wake_up_time; 88 u64 sched_in_time; 89 u64 runtime; 90 }; 91 92 struct work_atoms { 93 struct list_head work_list; 94 struct thread *thread; 95 struct rb_node node; 96 u64 max_lat; 97 u64 max_lat_at; 98 u64 total_lat; 99 u64 nb_atoms; 100 u64 total_runtime; 101 int num_merged; 102 }; 103 104 typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *); 105 106 struct perf_sched; 107 108 struct trace_sched_handler { 109 int (*switch_event)(struct perf_sched *sched, struct perf_evsel *evsel, 110 struct perf_sample *sample, struct machine *machine); 111 112 int (*runtime_event)(struct perf_sched *sched, struct perf_evsel *evsel, 113 struct perf_sample *sample, struct machine *machine); 114 115 int (*wakeup_event)(struct perf_sched *sched, struct perf_evsel *evsel, 116 struct perf_sample *sample, struct machine *machine); 117 118 /* PERF_RECORD_FORK event, not sched_process_fork tracepoint */ 119 int (*fork_event)(struct perf_sched *sched, union perf_event *event, 120 struct machine *machine); 121 122 int (*migrate_task_event)(struct perf_sched *sched, 123 struct perf_evsel *evsel, 124 struct perf_sample *sample, 125 struct machine *machine); 126 }; 127 128 #define COLOR_PIDS PERF_COLOR_BLUE 129 #define COLOR_CPUS PERF_COLOR_BG_RED 130 131 struct perf_sched_map { 132 DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS); 133 int *comp_cpus; 134 bool comp; 135 struct thread_map *color_pids; 136 const char *color_pids_str; 137 struct cpu_map *color_cpus; 138 const char *color_cpus_str; 139 struct cpu_map *cpus; 140 const char *cpus_str; 141 }; 142 143 struct perf_sched { 144 struct perf_tool tool; 145 const char *sort_order; 146 unsigned long nr_tasks; 147 struct task_desc **pid_to_task; 148 struct task_desc **tasks; 149 const struct trace_sched_handler *tp_handler; 150 pthread_mutex_t start_work_mutex; 151 pthread_mutex_t work_done_wait_mutex; 152 int profile_cpu; 153 /* 154 * Track the current task - that way we can know whether there's any 155 * weird events, such as a task being switched away that is not current. 156 */ 157 int max_cpu; 158 u32 curr_pid[MAX_CPUS]; 159 struct thread *curr_thread[MAX_CPUS]; 160 char next_shortname1; 161 char next_shortname2; 162 unsigned int replay_repeat; 163 unsigned long nr_run_events; 164 unsigned long nr_sleep_events; 165 unsigned long nr_wakeup_events; 166 unsigned long nr_sleep_corrections; 167 unsigned long nr_run_events_optimized; 168 unsigned long targetless_wakeups; 169 unsigned long multitarget_wakeups; 170 unsigned long nr_runs; 171 unsigned long nr_timestamps; 172 unsigned long nr_unordered_timestamps; 173 unsigned long nr_context_switch_bugs; 174 unsigned long nr_events; 175 unsigned long nr_lost_chunks; 176 unsigned long nr_lost_events; 177 u64 run_measurement_overhead; 178 u64 sleep_measurement_overhead; 179 u64 start_time; 180 u64 cpu_usage; 181 u64 runavg_cpu_usage; 182 u64 parent_cpu_usage; 183 u64 runavg_parent_cpu_usage; 184 u64 sum_runtime; 185 u64 sum_fluct; 186 u64 run_avg; 187 u64 all_runtime; 188 u64 all_count; 189 u64 cpu_last_switched[MAX_CPUS]; 190 struct rb_root atom_root, sorted_atom_root, merged_atom_root; 191 struct list_head sort_list, cmp_pid; 192 bool force; 193 bool skip_merge; 194 struct perf_sched_map map; 195 }; 196 197 static u64 get_nsecs(void) 198 { 199 struct timespec ts; 200 201 clock_gettime(CLOCK_MONOTONIC, &ts); 202 203 return ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec; 204 } 205 206 static void burn_nsecs(struct perf_sched *sched, u64 nsecs) 207 { 208 u64 T0 = get_nsecs(), T1; 209 210 do { 211 T1 = get_nsecs(); 212 } while (T1 + sched->run_measurement_overhead < T0 + nsecs); 213 } 214 215 static void sleep_nsecs(u64 nsecs) 216 { 217 struct timespec ts; 218 219 ts.tv_nsec = nsecs % 999999999; 220 ts.tv_sec = nsecs / 999999999; 221 222 nanosleep(&ts, NULL); 223 } 224 225 static void calibrate_run_measurement_overhead(struct perf_sched *sched) 226 { 227 u64 T0, T1, delta, min_delta = NSEC_PER_SEC; 228 int i; 229 230 for (i = 0; i < 10; i++) { 231 T0 = get_nsecs(); 232 burn_nsecs(sched, 0); 233 T1 = get_nsecs(); 234 delta = T1-T0; 235 min_delta = min(min_delta, delta); 236 } 237 sched->run_measurement_overhead = min_delta; 238 239 printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta); 240 } 241 242 static void calibrate_sleep_measurement_overhead(struct perf_sched *sched) 243 { 244 u64 T0, T1, delta, min_delta = NSEC_PER_SEC; 245 int i; 246 247 for (i = 0; i < 10; i++) { 248 T0 = get_nsecs(); 249 sleep_nsecs(10000); 250 T1 = get_nsecs(); 251 delta = T1-T0; 252 min_delta = min(min_delta, delta); 253 } 254 min_delta -= 10000; 255 sched->sleep_measurement_overhead = min_delta; 256 257 printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta); 258 } 259 260 static struct sched_atom * 261 get_new_event(struct task_desc *task, u64 timestamp) 262 { 263 struct sched_atom *event = zalloc(sizeof(*event)); 264 unsigned long idx = task->nr_events; 265 size_t size; 266 267 event->timestamp = timestamp; 268 event->nr = idx; 269 270 task->nr_events++; 271 size = sizeof(struct sched_atom *) * task->nr_events; 272 task->atoms = realloc(task->atoms, size); 273 BUG_ON(!task->atoms); 274 275 task->atoms[idx] = event; 276 277 return event; 278 } 279 280 static struct sched_atom *last_event(struct task_desc *task) 281 { 282 if (!task->nr_events) 283 return NULL; 284 285 return task->atoms[task->nr_events - 1]; 286 } 287 288 static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task, 289 u64 timestamp, u64 duration) 290 { 291 struct sched_atom *event, *curr_event = last_event(task); 292 293 /* 294 * optimize an existing RUN event by merging this one 295 * to it: 296 */ 297 if (curr_event && curr_event->type == SCHED_EVENT_RUN) { 298 sched->nr_run_events_optimized++; 299 curr_event->duration += duration; 300 return; 301 } 302 303 event = get_new_event(task, timestamp); 304 305 event->type = SCHED_EVENT_RUN; 306 event->duration = duration; 307 308 sched->nr_run_events++; 309 } 310 311 static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task, 312 u64 timestamp, struct task_desc *wakee) 313 { 314 struct sched_atom *event, *wakee_event; 315 316 event = get_new_event(task, timestamp); 317 event->type = SCHED_EVENT_WAKEUP; 318 event->wakee = wakee; 319 320 wakee_event = last_event(wakee); 321 if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) { 322 sched->targetless_wakeups++; 323 return; 324 } 325 if (wakee_event->wait_sem) { 326 sched->multitarget_wakeups++; 327 return; 328 } 329 330 wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem)); 331 sem_init(wakee_event->wait_sem, 0, 0); 332 wakee_event->specific_wait = 1; 333 event->wait_sem = wakee_event->wait_sem; 334 335 sched->nr_wakeup_events++; 336 } 337 338 static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task, 339 u64 timestamp, u64 task_state __maybe_unused) 340 { 341 struct sched_atom *event = get_new_event(task, timestamp); 342 343 event->type = SCHED_EVENT_SLEEP; 344 345 sched->nr_sleep_events++; 346 } 347 348 static struct task_desc *register_pid(struct perf_sched *sched, 349 unsigned long pid, const char *comm) 350 { 351 struct task_desc *task; 352 static int pid_max; 353 354 if (sched->pid_to_task == NULL) { 355 if (sysctl__read_int("kernel/pid_max", &pid_max) < 0) 356 pid_max = MAX_PID; 357 BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL); 358 } 359 if (pid >= (unsigned long)pid_max) { 360 BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + 1) * 361 sizeof(struct task_desc *))) == NULL); 362 while (pid >= (unsigned long)pid_max) 363 sched->pid_to_task[pid_max++] = NULL; 364 } 365 366 task = sched->pid_to_task[pid]; 367 368 if (task) 369 return task; 370 371 task = zalloc(sizeof(*task)); 372 task->pid = pid; 373 task->nr = sched->nr_tasks; 374 strcpy(task->comm, comm); 375 /* 376 * every task starts in sleeping state - this gets ignored 377 * if there's no wakeup pointing to this sleep state: 378 */ 379 add_sched_event_sleep(sched, task, 0, 0); 380 381 sched->pid_to_task[pid] = task; 382 sched->nr_tasks++; 383 sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *)); 384 BUG_ON(!sched->tasks); 385 sched->tasks[task->nr] = task; 386 387 if (verbose) 388 printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm); 389 390 return task; 391 } 392 393 394 static void print_task_traces(struct perf_sched *sched) 395 { 396 struct task_desc *task; 397 unsigned long i; 398 399 for (i = 0; i < sched->nr_tasks; i++) { 400 task = sched->tasks[i]; 401 printf("task %6ld (%20s:%10ld), nr_events: %ld\n", 402 task->nr, task->comm, task->pid, task->nr_events); 403 } 404 } 405 406 static void add_cross_task_wakeups(struct perf_sched *sched) 407 { 408 struct task_desc *task1, *task2; 409 unsigned long i, j; 410 411 for (i = 0; i < sched->nr_tasks; i++) { 412 task1 = sched->tasks[i]; 413 j = i + 1; 414 if (j == sched->nr_tasks) 415 j = 0; 416 task2 = sched->tasks[j]; 417 add_sched_event_wakeup(sched, task1, 0, task2); 418 } 419 } 420 421 static void perf_sched__process_event(struct perf_sched *sched, 422 struct sched_atom *atom) 423 { 424 int ret = 0; 425 426 switch (atom->type) { 427 case SCHED_EVENT_RUN: 428 burn_nsecs(sched, atom->duration); 429 break; 430 case SCHED_EVENT_SLEEP: 431 if (atom->wait_sem) 432 ret = sem_wait(atom->wait_sem); 433 BUG_ON(ret); 434 break; 435 case SCHED_EVENT_WAKEUP: 436 if (atom->wait_sem) 437 ret = sem_post(atom->wait_sem); 438 BUG_ON(ret); 439 break; 440 case SCHED_EVENT_MIGRATION: 441 break; 442 default: 443 BUG_ON(1); 444 } 445 } 446 447 static u64 get_cpu_usage_nsec_parent(void) 448 { 449 struct rusage ru; 450 u64 sum; 451 int err; 452 453 err = getrusage(RUSAGE_SELF, &ru); 454 BUG_ON(err); 455 456 sum = ru.ru_utime.tv_sec * NSEC_PER_SEC + ru.ru_utime.tv_usec * NSEC_PER_USEC; 457 sum += ru.ru_stime.tv_sec * NSEC_PER_SEC + ru.ru_stime.tv_usec * NSEC_PER_USEC; 458 459 return sum; 460 } 461 462 static int self_open_counters(struct perf_sched *sched, unsigned long cur_task) 463 { 464 struct perf_event_attr attr; 465 char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE]; 466 int fd; 467 struct rlimit limit; 468 bool need_privilege = false; 469 470 memset(&attr, 0, sizeof(attr)); 471 472 attr.type = PERF_TYPE_SOFTWARE; 473 attr.config = PERF_COUNT_SW_TASK_CLOCK; 474 475 force_again: 476 fd = sys_perf_event_open(&attr, 0, -1, -1, 477 perf_event_open_cloexec_flag()); 478 479 if (fd < 0) { 480 if (errno == EMFILE) { 481 if (sched->force) { 482 BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1); 483 limit.rlim_cur += sched->nr_tasks - cur_task; 484 if (limit.rlim_cur > limit.rlim_max) { 485 limit.rlim_max = limit.rlim_cur; 486 need_privilege = true; 487 } 488 if (setrlimit(RLIMIT_NOFILE, &limit) == -1) { 489 if (need_privilege && errno == EPERM) 490 strcpy(info, "Need privilege\n"); 491 } else 492 goto force_again; 493 } else 494 strcpy(info, "Have a try with -f option\n"); 495 } 496 pr_err("Error: sys_perf_event_open() syscall returned " 497 "with %d (%s)\n%s", fd, 498 str_error_r(errno, sbuf, sizeof(sbuf)), info); 499 exit(EXIT_FAILURE); 500 } 501 return fd; 502 } 503 504 static u64 get_cpu_usage_nsec_self(int fd) 505 { 506 u64 runtime; 507 int ret; 508 509 ret = read(fd, &runtime, sizeof(runtime)); 510 BUG_ON(ret != sizeof(runtime)); 511 512 return runtime; 513 } 514 515 struct sched_thread_parms { 516 struct task_desc *task; 517 struct perf_sched *sched; 518 int fd; 519 }; 520 521 static void *thread_func(void *ctx) 522 { 523 struct sched_thread_parms *parms = ctx; 524 struct task_desc *this_task = parms->task; 525 struct perf_sched *sched = parms->sched; 526 u64 cpu_usage_0, cpu_usage_1; 527 unsigned long i, ret; 528 char comm2[22]; 529 int fd = parms->fd; 530 531 zfree(&parms); 532 533 sprintf(comm2, ":%s", this_task->comm); 534 prctl(PR_SET_NAME, comm2); 535 if (fd < 0) 536 return NULL; 537 again: 538 ret = sem_post(&this_task->ready_for_work); 539 BUG_ON(ret); 540 ret = pthread_mutex_lock(&sched->start_work_mutex); 541 BUG_ON(ret); 542 ret = pthread_mutex_unlock(&sched->start_work_mutex); 543 BUG_ON(ret); 544 545 cpu_usage_0 = get_cpu_usage_nsec_self(fd); 546 547 for (i = 0; i < this_task->nr_events; i++) { 548 this_task->curr_event = i; 549 perf_sched__process_event(sched, this_task->atoms[i]); 550 } 551 552 cpu_usage_1 = get_cpu_usage_nsec_self(fd); 553 this_task->cpu_usage = cpu_usage_1 - cpu_usage_0; 554 ret = sem_post(&this_task->work_done_sem); 555 BUG_ON(ret); 556 557 ret = pthread_mutex_lock(&sched->work_done_wait_mutex); 558 BUG_ON(ret); 559 ret = pthread_mutex_unlock(&sched->work_done_wait_mutex); 560 BUG_ON(ret); 561 562 goto again; 563 } 564 565 static void create_tasks(struct perf_sched *sched) 566 { 567 struct task_desc *task; 568 pthread_attr_t attr; 569 unsigned long i; 570 int err; 571 572 err = pthread_attr_init(&attr); 573 BUG_ON(err); 574 err = pthread_attr_setstacksize(&attr, 575 (size_t) max(16 * 1024, PTHREAD_STACK_MIN)); 576 BUG_ON(err); 577 err = pthread_mutex_lock(&sched->start_work_mutex); 578 BUG_ON(err); 579 err = pthread_mutex_lock(&sched->work_done_wait_mutex); 580 BUG_ON(err); 581 for (i = 0; i < sched->nr_tasks; i++) { 582 struct sched_thread_parms *parms = malloc(sizeof(*parms)); 583 BUG_ON(parms == NULL); 584 parms->task = task = sched->tasks[i]; 585 parms->sched = sched; 586 parms->fd = self_open_counters(sched, i); 587 sem_init(&task->sleep_sem, 0, 0); 588 sem_init(&task->ready_for_work, 0, 0); 589 sem_init(&task->work_done_sem, 0, 0); 590 task->curr_event = 0; 591 err = pthread_create(&task->thread, &attr, thread_func, parms); 592 BUG_ON(err); 593 } 594 } 595 596 static void wait_for_tasks(struct perf_sched *sched) 597 { 598 u64 cpu_usage_0, cpu_usage_1; 599 struct task_desc *task; 600 unsigned long i, ret; 601 602 sched->start_time = get_nsecs(); 603 sched->cpu_usage = 0; 604 pthread_mutex_unlock(&sched->work_done_wait_mutex); 605 606 for (i = 0; i < sched->nr_tasks; i++) { 607 task = sched->tasks[i]; 608 ret = sem_wait(&task->ready_for_work); 609 BUG_ON(ret); 610 sem_init(&task->ready_for_work, 0, 0); 611 } 612 ret = pthread_mutex_lock(&sched->work_done_wait_mutex); 613 BUG_ON(ret); 614 615 cpu_usage_0 = get_cpu_usage_nsec_parent(); 616 617 pthread_mutex_unlock(&sched->start_work_mutex); 618 619 for (i = 0; i < sched->nr_tasks; i++) { 620 task = sched->tasks[i]; 621 ret = sem_wait(&task->work_done_sem); 622 BUG_ON(ret); 623 sem_init(&task->work_done_sem, 0, 0); 624 sched->cpu_usage += task->cpu_usage; 625 task->cpu_usage = 0; 626 } 627 628 cpu_usage_1 = get_cpu_usage_nsec_parent(); 629 if (!sched->runavg_cpu_usage) 630 sched->runavg_cpu_usage = sched->cpu_usage; 631 sched->runavg_cpu_usage = (sched->runavg_cpu_usage * (sched->replay_repeat - 1) + sched->cpu_usage) / sched->replay_repeat; 632 633 sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0; 634 if (!sched->runavg_parent_cpu_usage) 635 sched->runavg_parent_cpu_usage = sched->parent_cpu_usage; 636 sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) + 637 sched->parent_cpu_usage)/sched->replay_repeat; 638 639 ret = pthread_mutex_lock(&sched->start_work_mutex); 640 BUG_ON(ret); 641 642 for (i = 0; i < sched->nr_tasks; i++) { 643 task = sched->tasks[i]; 644 sem_init(&task->sleep_sem, 0, 0); 645 task->curr_event = 0; 646 } 647 } 648 649 static void run_one_test(struct perf_sched *sched) 650 { 651 u64 T0, T1, delta, avg_delta, fluct; 652 653 T0 = get_nsecs(); 654 wait_for_tasks(sched); 655 T1 = get_nsecs(); 656 657 delta = T1 - T0; 658 sched->sum_runtime += delta; 659 sched->nr_runs++; 660 661 avg_delta = sched->sum_runtime / sched->nr_runs; 662 if (delta < avg_delta) 663 fluct = avg_delta - delta; 664 else 665 fluct = delta - avg_delta; 666 sched->sum_fluct += fluct; 667 if (!sched->run_avg) 668 sched->run_avg = delta; 669 sched->run_avg = (sched->run_avg * (sched->replay_repeat - 1) + delta) / sched->replay_repeat; 670 671 printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / NSEC_PER_MSEC); 672 673 printf("ravg: %0.2f, ", (double)sched->run_avg / NSEC_PER_MSEC); 674 675 printf("cpu: %0.2f / %0.2f", 676 (double)sched->cpu_usage / NSEC_PER_MSEC, (double)sched->runavg_cpu_usage / NSEC_PER_MSEC); 677 678 #if 0 679 /* 680 * rusage statistics done by the parent, these are less 681 * accurate than the sched->sum_exec_runtime based statistics: 682 */ 683 printf(" [%0.2f / %0.2f]", 684 (double)sched->parent_cpu_usage / NSEC_PER_MSEC, 685 (double)sched->runavg_parent_cpu_usage / NSEC_PER_MSEC); 686 #endif 687 688 printf("\n"); 689 690 if (sched->nr_sleep_corrections) 691 printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections); 692 sched->nr_sleep_corrections = 0; 693 } 694 695 static void test_calibrations(struct perf_sched *sched) 696 { 697 u64 T0, T1; 698 699 T0 = get_nsecs(); 700 burn_nsecs(sched, NSEC_PER_MSEC); 701 T1 = get_nsecs(); 702 703 printf("the run test took %" PRIu64 " nsecs\n", T1 - T0); 704 705 T0 = get_nsecs(); 706 sleep_nsecs(NSEC_PER_MSEC); 707 T1 = get_nsecs(); 708 709 printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0); 710 } 711 712 static int 713 replay_wakeup_event(struct perf_sched *sched, 714 struct perf_evsel *evsel, struct perf_sample *sample, 715 struct machine *machine __maybe_unused) 716 { 717 const char *comm = perf_evsel__strval(evsel, sample, "comm"); 718 const u32 pid = perf_evsel__intval(evsel, sample, "pid"); 719 struct task_desc *waker, *wakee; 720 721 if (verbose) { 722 printf("sched_wakeup event %p\n", evsel); 723 724 printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid); 725 } 726 727 waker = register_pid(sched, sample->tid, "<unknown>"); 728 wakee = register_pid(sched, pid, comm); 729 730 add_sched_event_wakeup(sched, waker, sample->time, wakee); 731 return 0; 732 } 733 734 static int replay_switch_event(struct perf_sched *sched, 735 struct perf_evsel *evsel, 736 struct perf_sample *sample, 737 struct machine *machine __maybe_unused) 738 { 739 const char *prev_comm = perf_evsel__strval(evsel, sample, "prev_comm"), 740 *next_comm = perf_evsel__strval(evsel, sample, "next_comm"); 741 const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"), 742 next_pid = perf_evsel__intval(evsel, sample, "next_pid"); 743 const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state"); 744 struct task_desc *prev, __maybe_unused *next; 745 u64 timestamp0, timestamp = sample->time; 746 int cpu = sample->cpu; 747 s64 delta; 748 749 if (verbose) 750 printf("sched_switch event %p\n", evsel); 751 752 if (cpu >= MAX_CPUS || cpu < 0) 753 return 0; 754 755 timestamp0 = sched->cpu_last_switched[cpu]; 756 if (timestamp0) 757 delta = timestamp - timestamp0; 758 else 759 delta = 0; 760 761 if (delta < 0) { 762 pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta); 763 return -1; 764 } 765 766 pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n", 767 prev_comm, prev_pid, next_comm, next_pid, delta); 768 769 prev = register_pid(sched, prev_pid, prev_comm); 770 next = register_pid(sched, next_pid, next_comm); 771 772 sched->cpu_last_switched[cpu] = timestamp; 773 774 add_sched_event_run(sched, prev, timestamp, delta); 775 add_sched_event_sleep(sched, prev, timestamp, prev_state); 776 777 return 0; 778 } 779 780 static int replay_fork_event(struct perf_sched *sched, 781 union perf_event *event, 782 struct machine *machine) 783 { 784 struct thread *child, *parent; 785 786 child = machine__findnew_thread(machine, event->fork.pid, 787 event->fork.tid); 788 parent = machine__findnew_thread(machine, event->fork.ppid, 789 event->fork.ptid); 790 791 if (child == NULL || parent == NULL) { 792 pr_debug("thread does not exist on fork event: child %p, parent %p\n", 793 child, parent); 794 goto out_put; 795 } 796 797 if (verbose) { 798 printf("fork event\n"); 799 printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid); 800 printf("... child: %s/%d\n", thread__comm_str(child), child->tid); 801 } 802 803 register_pid(sched, parent->tid, thread__comm_str(parent)); 804 register_pid(sched, child->tid, thread__comm_str(child)); 805 out_put: 806 thread__put(child); 807 thread__put(parent); 808 return 0; 809 } 810 811 struct sort_dimension { 812 const char *name; 813 sort_fn_t cmp; 814 struct list_head list; 815 }; 816 817 static int 818 thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r) 819 { 820 struct sort_dimension *sort; 821 int ret = 0; 822 823 BUG_ON(list_empty(list)); 824 825 list_for_each_entry(sort, list, list) { 826 ret = sort->cmp(l, r); 827 if (ret) 828 return ret; 829 } 830 831 return ret; 832 } 833 834 static struct work_atoms * 835 thread_atoms_search(struct rb_root *root, struct thread *thread, 836 struct list_head *sort_list) 837 { 838 struct rb_node *node = root->rb_node; 839 struct work_atoms key = { .thread = thread }; 840 841 while (node) { 842 struct work_atoms *atoms; 843 int cmp; 844 845 atoms = container_of(node, struct work_atoms, node); 846 847 cmp = thread_lat_cmp(sort_list, &key, atoms); 848 if (cmp > 0) 849 node = node->rb_left; 850 else if (cmp < 0) 851 node = node->rb_right; 852 else { 853 BUG_ON(thread != atoms->thread); 854 return atoms; 855 } 856 } 857 return NULL; 858 } 859 860 static void 861 __thread_latency_insert(struct rb_root *root, struct work_atoms *data, 862 struct list_head *sort_list) 863 { 864 struct rb_node **new = &(root->rb_node), *parent = NULL; 865 866 while (*new) { 867 struct work_atoms *this; 868 int cmp; 869 870 this = container_of(*new, struct work_atoms, node); 871 parent = *new; 872 873 cmp = thread_lat_cmp(sort_list, data, this); 874 875 if (cmp > 0) 876 new = &((*new)->rb_left); 877 else 878 new = &((*new)->rb_right); 879 } 880 881 rb_link_node(&data->node, parent, new); 882 rb_insert_color(&data->node, root); 883 } 884 885 static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread) 886 { 887 struct work_atoms *atoms = zalloc(sizeof(*atoms)); 888 if (!atoms) { 889 pr_err("No memory at %s\n", __func__); 890 return -1; 891 } 892 893 atoms->thread = thread__get(thread); 894 INIT_LIST_HEAD(&atoms->work_list); 895 __thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid); 896 return 0; 897 } 898 899 static char sched_out_state(u64 prev_state) 900 { 901 const char *str = TASK_STATE_TO_CHAR_STR; 902 903 return str[prev_state]; 904 } 905 906 static int 907 add_sched_out_event(struct work_atoms *atoms, 908 char run_state, 909 u64 timestamp) 910 { 911 struct work_atom *atom = zalloc(sizeof(*atom)); 912 if (!atom) { 913 pr_err("Non memory at %s", __func__); 914 return -1; 915 } 916 917 atom->sched_out_time = timestamp; 918 919 if (run_state == 'R') { 920 atom->state = THREAD_WAIT_CPU; 921 atom->wake_up_time = atom->sched_out_time; 922 } 923 924 list_add_tail(&atom->list, &atoms->work_list); 925 return 0; 926 } 927 928 static void 929 add_runtime_event(struct work_atoms *atoms, u64 delta, 930 u64 timestamp __maybe_unused) 931 { 932 struct work_atom *atom; 933 934 BUG_ON(list_empty(&atoms->work_list)); 935 936 atom = list_entry(atoms->work_list.prev, struct work_atom, list); 937 938 atom->runtime += delta; 939 atoms->total_runtime += delta; 940 } 941 942 static void 943 add_sched_in_event(struct work_atoms *atoms, u64 timestamp) 944 { 945 struct work_atom *atom; 946 u64 delta; 947 948 if (list_empty(&atoms->work_list)) 949 return; 950 951 atom = list_entry(atoms->work_list.prev, struct work_atom, list); 952 953 if (atom->state != THREAD_WAIT_CPU) 954 return; 955 956 if (timestamp < atom->wake_up_time) { 957 atom->state = THREAD_IGNORE; 958 return; 959 } 960 961 atom->state = THREAD_SCHED_IN; 962 atom->sched_in_time = timestamp; 963 964 delta = atom->sched_in_time - atom->wake_up_time; 965 atoms->total_lat += delta; 966 if (delta > atoms->max_lat) { 967 atoms->max_lat = delta; 968 atoms->max_lat_at = timestamp; 969 } 970 atoms->nb_atoms++; 971 } 972 973 static int latency_switch_event(struct perf_sched *sched, 974 struct perf_evsel *evsel, 975 struct perf_sample *sample, 976 struct machine *machine) 977 { 978 const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"), 979 next_pid = perf_evsel__intval(evsel, sample, "next_pid"); 980 const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state"); 981 struct work_atoms *out_events, *in_events; 982 struct thread *sched_out, *sched_in; 983 u64 timestamp0, timestamp = sample->time; 984 int cpu = sample->cpu, err = -1; 985 s64 delta; 986 987 BUG_ON(cpu >= MAX_CPUS || cpu < 0); 988 989 timestamp0 = sched->cpu_last_switched[cpu]; 990 sched->cpu_last_switched[cpu] = timestamp; 991 if (timestamp0) 992 delta = timestamp - timestamp0; 993 else 994 delta = 0; 995 996 if (delta < 0) { 997 pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta); 998 return -1; 999 } 1000 1001 sched_out = machine__findnew_thread(machine, -1, prev_pid); 1002 sched_in = machine__findnew_thread(machine, -1, next_pid); 1003 if (sched_out == NULL || sched_in == NULL) 1004 goto out_put; 1005 1006 out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid); 1007 if (!out_events) { 1008 if (thread_atoms_insert(sched, sched_out)) 1009 goto out_put; 1010 out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid); 1011 if (!out_events) { 1012 pr_err("out-event: Internal tree error"); 1013 goto out_put; 1014 } 1015 } 1016 if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp)) 1017 return -1; 1018 1019 in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid); 1020 if (!in_events) { 1021 if (thread_atoms_insert(sched, sched_in)) 1022 goto out_put; 1023 in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid); 1024 if (!in_events) { 1025 pr_err("in-event: Internal tree error"); 1026 goto out_put; 1027 } 1028 /* 1029 * Take came in we have not heard about yet, 1030 * add in an initial atom in runnable state: 1031 */ 1032 if (add_sched_out_event(in_events, 'R', timestamp)) 1033 goto out_put; 1034 } 1035 add_sched_in_event(in_events, timestamp); 1036 err = 0; 1037 out_put: 1038 thread__put(sched_out); 1039 thread__put(sched_in); 1040 return err; 1041 } 1042 1043 static int latency_runtime_event(struct perf_sched *sched, 1044 struct perf_evsel *evsel, 1045 struct perf_sample *sample, 1046 struct machine *machine) 1047 { 1048 const u32 pid = perf_evsel__intval(evsel, sample, "pid"); 1049 const u64 runtime = perf_evsel__intval(evsel, sample, "runtime"); 1050 struct thread *thread = machine__findnew_thread(machine, -1, pid); 1051 struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid); 1052 u64 timestamp = sample->time; 1053 int cpu = sample->cpu, err = -1; 1054 1055 if (thread == NULL) 1056 return -1; 1057 1058 BUG_ON(cpu >= MAX_CPUS || cpu < 0); 1059 if (!atoms) { 1060 if (thread_atoms_insert(sched, thread)) 1061 goto out_put; 1062 atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid); 1063 if (!atoms) { 1064 pr_err("in-event: Internal tree error"); 1065 goto out_put; 1066 } 1067 if (add_sched_out_event(atoms, 'R', timestamp)) 1068 goto out_put; 1069 } 1070 1071 add_runtime_event(atoms, runtime, timestamp); 1072 err = 0; 1073 out_put: 1074 thread__put(thread); 1075 return err; 1076 } 1077 1078 static int latency_wakeup_event(struct perf_sched *sched, 1079 struct perf_evsel *evsel, 1080 struct perf_sample *sample, 1081 struct machine *machine) 1082 { 1083 const u32 pid = perf_evsel__intval(evsel, sample, "pid"); 1084 struct work_atoms *atoms; 1085 struct work_atom *atom; 1086 struct thread *wakee; 1087 u64 timestamp = sample->time; 1088 int err = -1; 1089 1090 wakee = machine__findnew_thread(machine, -1, pid); 1091 if (wakee == NULL) 1092 return -1; 1093 atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid); 1094 if (!atoms) { 1095 if (thread_atoms_insert(sched, wakee)) 1096 goto out_put; 1097 atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid); 1098 if (!atoms) { 1099 pr_err("wakeup-event: Internal tree error"); 1100 goto out_put; 1101 } 1102 if (add_sched_out_event(atoms, 'S', timestamp)) 1103 goto out_put; 1104 } 1105 1106 BUG_ON(list_empty(&atoms->work_list)); 1107 1108 atom = list_entry(atoms->work_list.prev, struct work_atom, list); 1109 1110 /* 1111 * As we do not guarantee the wakeup event happens when 1112 * task is out of run queue, also may happen when task is 1113 * on run queue and wakeup only change ->state to TASK_RUNNING, 1114 * then we should not set the ->wake_up_time when wake up a 1115 * task which is on run queue. 1116 * 1117 * You WILL be missing events if you've recorded only 1118 * one CPU, or are only looking at only one, so don't 1119 * skip in this case. 1120 */ 1121 if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING) 1122 goto out_ok; 1123 1124 sched->nr_timestamps++; 1125 if (atom->sched_out_time > timestamp) { 1126 sched->nr_unordered_timestamps++; 1127 goto out_ok; 1128 } 1129 1130 atom->state = THREAD_WAIT_CPU; 1131 atom->wake_up_time = timestamp; 1132 out_ok: 1133 err = 0; 1134 out_put: 1135 thread__put(wakee); 1136 return err; 1137 } 1138 1139 static int latency_migrate_task_event(struct perf_sched *sched, 1140 struct perf_evsel *evsel, 1141 struct perf_sample *sample, 1142 struct machine *machine) 1143 { 1144 const u32 pid = perf_evsel__intval(evsel, sample, "pid"); 1145 u64 timestamp = sample->time; 1146 struct work_atoms *atoms; 1147 struct work_atom *atom; 1148 struct thread *migrant; 1149 int err = -1; 1150 1151 /* 1152 * Only need to worry about migration when profiling one CPU. 1153 */ 1154 if (sched->profile_cpu == -1) 1155 return 0; 1156 1157 migrant = machine__findnew_thread(machine, -1, pid); 1158 if (migrant == NULL) 1159 return -1; 1160 atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid); 1161 if (!atoms) { 1162 if (thread_atoms_insert(sched, migrant)) 1163 goto out_put; 1164 register_pid(sched, migrant->tid, thread__comm_str(migrant)); 1165 atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid); 1166 if (!atoms) { 1167 pr_err("migration-event: Internal tree error"); 1168 goto out_put; 1169 } 1170 if (add_sched_out_event(atoms, 'R', timestamp)) 1171 goto out_put; 1172 } 1173 1174 BUG_ON(list_empty(&atoms->work_list)); 1175 1176 atom = list_entry(atoms->work_list.prev, struct work_atom, list); 1177 atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp; 1178 1179 sched->nr_timestamps++; 1180 1181 if (atom->sched_out_time > timestamp) 1182 sched->nr_unordered_timestamps++; 1183 err = 0; 1184 out_put: 1185 thread__put(migrant); 1186 return err; 1187 } 1188 1189 static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list) 1190 { 1191 int i; 1192 int ret; 1193 u64 avg; 1194 char max_lat_at[32]; 1195 1196 if (!work_list->nb_atoms) 1197 return; 1198 /* 1199 * Ignore idle threads: 1200 */ 1201 if (!strcmp(thread__comm_str(work_list->thread), "swapper")) 1202 return; 1203 1204 sched->all_runtime += work_list->total_runtime; 1205 sched->all_count += work_list->nb_atoms; 1206 1207 if (work_list->num_merged > 1) 1208 ret = printf(" %s:(%d) ", thread__comm_str(work_list->thread), work_list->num_merged); 1209 else 1210 ret = printf(" %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid); 1211 1212 for (i = 0; i < 24 - ret; i++) 1213 printf(" "); 1214 1215 avg = work_list->total_lat / work_list->nb_atoms; 1216 timestamp__scnprintf_usec(work_list->max_lat_at, max_lat_at, sizeof(max_lat_at)); 1217 1218 printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %13s s\n", 1219 (double)work_list->total_runtime / NSEC_PER_MSEC, 1220 work_list->nb_atoms, (double)avg / NSEC_PER_MSEC, 1221 (double)work_list->max_lat / NSEC_PER_MSEC, 1222 max_lat_at); 1223 } 1224 1225 static int pid_cmp(struct work_atoms *l, struct work_atoms *r) 1226 { 1227 if (l->thread == r->thread) 1228 return 0; 1229 if (l->thread->tid < r->thread->tid) 1230 return -1; 1231 if (l->thread->tid > r->thread->tid) 1232 return 1; 1233 return (int)(l->thread - r->thread); 1234 } 1235 1236 static int avg_cmp(struct work_atoms *l, struct work_atoms *r) 1237 { 1238 u64 avgl, avgr; 1239 1240 if (!l->nb_atoms) 1241 return -1; 1242 1243 if (!r->nb_atoms) 1244 return 1; 1245 1246 avgl = l->total_lat / l->nb_atoms; 1247 avgr = r->total_lat / r->nb_atoms; 1248 1249 if (avgl < avgr) 1250 return -1; 1251 if (avgl > avgr) 1252 return 1; 1253 1254 return 0; 1255 } 1256 1257 static int max_cmp(struct work_atoms *l, struct work_atoms *r) 1258 { 1259 if (l->max_lat < r->max_lat) 1260 return -1; 1261 if (l->max_lat > r->max_lat) 1262 return 1; 1263 1264 return 0; 1265 } 1266 1267 static int switch_cmp(struct work_atoms *l, struct work_atoms *r) 1268 { 1269 if (l->nb_atoms < r->nb_atoms) 1270 return -1; 1271 if (l->nb_atoms > r->nb_atoms) 1272 return 1; 1273 1274 return 0; 1275 } 1276 1277 static int runtime_cmp(struct work_atoms *l, struct work_atoms *r) 1278 { 1279 if (l->total_runtime < r->total_runtime) 1280 return -1; 1281 if (l->total_runtime > r->total_runtime) 1282 return 1; 1283 1284 return 0; 1285 } 1286 1287 static int sort_dimension__add(const char *tok, struct list_head *list) 1288 { 1289 size_t i; 1290 static struct sort_dimension avg_sort_dimension = { 1291 .name = "avg", 1292 .cmp = avg_cmp, 1293 }; 1294 static struct sort_dimension max_sort_dimension = { 1295 .name = "max", 1296 .cmp = max_cmp, 1297 }; 1298 static struct sort_dimension pid_sort_dimension = { 1299 .name = "pid", 1300 .cmp = pid_cmp, 1301 }; 1302 static struct sort_dimension runtime_sort_dimension = { 1303 .name = "runtime", 1304 .cmp = runtime_cmp, 1305 }; 1306 static struct sort_dimension switch_sort_dimension = { 1307 .name = "switch", 1308 .cmp = switch_cmp, 1309 }; 1310 struct sort_dimension *available_sorts[] = { 1311 &pid_sort_dimension, 1312 &avg_sort_dimension, 1313 &max_sort_dimension, 1314 &switch_sort_dimension, 1315 &runtime_sort_dimension, 1316 }; 1317 1318 for (i = 0; i < ARRAY_SIZE(available_sorts); i++) { 1319 if (!strcmp(available_sorts[i]->name, tok)) { 1320 list_add_tail(&available_sorts[i]->list, list); 1321 1322 return 0; 1323 } 1324 } 1325 1326 return -1; 1327 } 1328 1329 static void perf_sched__sort_lat(struct perf_sched *sched) 1330 { 1331 struct rb_node *node; 1332 struct rb_root *root = &sched->atom_root; 1333 again: 1334 for (;;) { 1335 struct work_atoms *data; 1336 node = rb_first(root); 1337 if (!node) 1338 break; 1339 1340 rb_erase(node, root); 1341 data = rb_entry(node, struct work_atoms, node); 1342 __thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list); 1343 } 1344 if (root == &sched->atom_root) { 1345 root = &sched->merged_atom_root; 1346 goto again; 1347 } 1348 } 1349 1350 static int process_sched_wakeup_event(struct perf_tool *tool, 1351 struct perf_evsel *evsel, 1352 struct perf_sample *sample, 1353 struct machine *machine) 1354 { 1355 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 1356 1357 if (sched->tp_handler->wakeup_event) 1358 return sched->tp_handler->wakeup_event(sched, evsel, sample, machine); 1359 1360 return 0; 1361 } 1362 1363 union map_priv { 1364 void *ptr; 1365 bool color; 1366 }; 1367 1368 static bool thread__has_color(struct thread *thread) 1369 { 1370 union map_priv priv = { 1371 .ptr = thread__priv(thread), 1372 }; 1373 1374 return priv.color; 1375 } 1376 1377 static struct thread* 1378 map__findnew_thread(struct perf_sched *sched, struct machine *machine, pid_t pid, pid_t tid) 1379 { 1380 struct thread *thread = machine__findnew_thread(machine, pid, tid); 1381 union map_priv priv = { 1382 .color = false, 1383 }; 1384 1385 if (!sched->map.color_pids || !thread || thread__priv(thread)) 1386 return thread; 1387 1388 if (thread_map__has(sched->map.color_pids, tid)) 1389 priv.color = true; 1390 1391 thread__set_priv(thread, priv.ptr); 1392 return thread; 1393 } 1394 1395 static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, 1396 struct perf_sample *sample, struct machine *machine) 1397 { 1398 const u32 next_pid = perf_evsel__intval(evsel, sample, "next_pid"); 1399 struct thread *sched_in; 1400 int new_shortname; 1401 u64 timestamp0, timestamp = sample->time; 1402 s64 delta; 1403 int i, this_cpu = sample->cpu; 1404 int cpus_nr; 1405 bool new_cpu = false; 1406 const char *color = PERF_COLOR_NORMAL; 1407 char stimestamp[32]; 1408 1409 BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0); 1410 1411 if (this_cpu > sched->max_cpu) 1412 sched->max_cpu = this_cpu; 1413 1414 if (sched->map.comp) { 1415 cpus_nr = bitmap_weight(sched->map.comp_cpus_mask, MAX_CPUS); 1416 if (!test_and_set_bit(this_cpu, sched->map.comp_cpus_mask)) { 1417 sched->map.comp_cpus[cpus_nr++] = this_cpu; 1418 new_cpu = true; 1419 } 1420 } else 1421 cpus_nr = sched->max_cpu; 1422 1423 timestamp0 = sched->cpu_last_switched[this_cpu]; 1424 sched->cpu_last_switched[this_cpu] = timestamp; 1425 if (timestamp0) 1426 delta = timestamp - timestamp0; 1427 else 1428 delta = 0; 1429 1430 if (delta < 0) { 1431 pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta); 1432 return -1; 1433 } 1434 1435 sched_in = map__findnew_thread(sched, machine, -1, next_pid); 1436 if (sched_in == NULL) 1437 return -1; 1438 1439 sched->curr_thread[this_cpu] = thread__get(sched_in); 1440 1441 printf(" "); 1442 1443 new_shortname = 0; 1444 if (!sched_in->shortname[0]) { 1445 if (!strcmp(thread__comm_str(sched_in), "swapper")) { 1446 /* 1447 * Don't allocate a letter-number for swapper:0 1448 * as a shortname. Instead, we use '.' for it. 1449 */ 1450 sched_in->shortname[0] = '.'; 1451 sched_in->shortname[1] = ' '; 1452 } else { 1453 sched_in->shortname[0] = sched->next_shortname1; 1454 sched_in->shortname[1] = sched->next_shortname2; 1455 1456 if (sched->next_shortname1 < 'Z') { 1457 sched->next_shortname1++; 1458 } else { 1459 sched->next_shortname1 = 'A'; 1460 if (sched->next_shortname2 < '9') 1461 sched->next_shortname2++; 1462 else 1463 sched->next_shortname2 = '0'; 1464 } 1465 } 1466 new_shortname = 1; 1467 } 1468 1469 for (i = 0; i < cpus_nr; i++) { 1470 int cpu = sched->map.comp ? sched->map.comp_cpus[i] : i; 1471 struct thread *curr_thread = sched->curr_thread[cpu]; 1472 const char *pid_color = color; 1473 const char *cpu_color = color; 1474 1475 if (curr_thread && thread__has_color(curr_thread)) 1476 pid_color = COLOR_PIDS; 1477 1478 if (sched->map.cpus && !cpu_map__has(sched->map.cpus, cpu)) 1479 continue; 1480 1481 if (sched->map.color_cpus && cpu_map__has(sched->map.color_cpus, cpu)) 1482 cpu_color = COLOR_CPUS; 1483 1484 if (cpu != this_cpu) 1485 color_fprintf(stdout, color, " "); 1486 else 1487 color_fprintf(stdout, cpu_color, "*"); 1488 1489 if (sched->curr_thread[cpu]) 1490 color_fprintf(stdout, pid_color, "%2s ", sched->curr_thread[cpu]->shortname); 1491 else 1492 color_fprintf(stdout, color, " "); 1493 } 1494 1495 if (sched->map.cpus && !cpu_map__has(sched->map.cpus, this_cpu)) 1496 goto out; 1497 1498 timestamp__scnprintf_usec(timestamp, stimestamp, sizeof(stimestamp)); 1499 color_fprintf(stdout, color, " %12s secs ", stimestamp); 1500 if (new_shortname || (verbose && sched_in->tid)) { 1501 const char *pid_color = color; 1502 1503 if (thread__has_color(sched_in)) 1504 pid_color = COLOR_PIDS; 1505 1506 color_fprintf(stdout, pid_color, "%s => %s:%d", 1507 sched_in->shortname, thread__comm_str(sched_in), sched_in->tid); 1508 } 1509 1510 if (sched->map.comp && new_cpu) 1511 color_fprintf(stdout, color, " (CPU %d)", this_cpu); 1512 1513 out: 1514 color_fprintf(stdout, color, "\n"); 1515 1516 thread__put(sched_in); 1517 1518 return 0; 1519 } 1520 1521 static int process_sched_switch_event(struct perf_tool *tool, 1522 struct perf_evsel *evsel, 1523 struct perf_sample *sample, 1524 struct machine *machine) 1525 { 1526 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 1527 int this_cpu = sample->cpu, err = 0; 1528 u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"), 1529 next_pid = perf_evsel__intval(evsel, sample, "next_pid"); 1530 1531 if (sched->curr_pid[this_cpu] != (u32)-1) { 1532 /* 1533 * Are we trying to switch away a PID that is 1534 * not current? 1535 */ 1536 if (sched->curr_pid[this_cpu] != prev_pid) 1537 sched->nr_context_switch_bugs++; 1538 } 1539 1540 if (sched->tp_handler->switch_event) 1541 err = sched->tp_handler->switch_event(sched, evsel, sample, machine); 1542 1543 sched->curr_pid[this_cpu] = next_pid; 1544 return err; 1545 } 1546 1547 static int process_sched_runtime_event(struct perf_tool *tool, 1548 struct perf_evsel *evsel, 1549 struct perf_sample *sample, 1550 struct machine *machine) 1551 { 1552 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 1553 1554 if (sched->tp_handler->runtime_event) 1555 return sched->tp_handler->runtime_event(sched, evsel, sample, machine); 1556 1557 return 0; 1558 } 1559 1560 static int perf_sched__process_fork_event(struct perf_tool *tool, 1561 union perf_event *event, 1562 struct perf_sample *sample, 1563 struct machine *machine) 1564 { 1565 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 1566 1567 /* run the fork event through the perf machineruy */ 1568 perf_event__process_fork(tool, event, sample, machine); 1569 1570 /* and then run additional processing needed for this command */ 1571 if (sched->tp_handler->fork_event) 1572 return sched->tp_handler->fork_event(sched, event, machine); 1573 1574 return 0; 1575 } 1576 1577 static int process_sched_migrate_task_event(struct perf_tool *tool, 1578 struct perf_evsel *evsel, 1579 struct perf_sample *sample, 1580 struct machine *machine) 1581 { 1582 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 1583 1584 if (sched->tp_handler->migrate_task_event) 1585 return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine); 1586 1587 return 0; 1588 } 1589 1590 typedef int (*tracepoint_handler)(struct perf_tool *tool, 1591 struct perf_evsel *evsel, 1592 struct perf_sample *sample, 1593 struct machine *machine); 1594 1595 static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused, 1596 union perf_event *event __maybe_unused, 1597 struct perf_sample *sample, 1598 struct perf_evsel *evsel, 1599 struct machine *machine) 1600 { 1601 int err = 0; 1602 1603 if (evsel->handler != NULL) { 1604 tracepoint_handler f = evsel->handler; 1605 err = f(tool, evsel, sample, machine); 1606 } 1607 1608 return err; 1609 } 1610 1611 static int perf_sched__read_events(struct perf_sched *sched) 1612 { 1613 const struct perf_evsel_str_handler handlers[] = { 1614 { "sched:sched_switch", process_sched_switch_event, }, 1615 { "sched:sched_stat_runtime", process_sched_runtime_event, }, 1616 { "sched:sched_wakeup", process_sched_wakeup_event, }, 1617 { "sched:sched_wakeup_new", process_sched_wakeup_event, }, 1618 { "sched:sched_migrate_task", process_sched_migrate_task_event, }, 1619 }; 1620 struct perf_session *session; 1621 struct perf_data_file file = { 1622 .path = input_name, 1623 .mode = PERF_DATA_MODE_READ, 1624 .force = sched->force, 1625 }; 1626 int rc = -1; 1627 1628 session = perf_session__new(&file, false, &sched->tool); 1629 if (session == NULL) { 1630 pr_debug("No Memory for session\n"); 1631 return -1; 1632 } 1633 1634 symbol__init(&session->header.env); 1635 1636 if (perf_session__set_tracepoints_handlers(session, handlers)) 1637 goto out_delete; 1638 1639 if (perf_session__has_traces(session, "record -R")) { 1640 int err = perf_session__process_events(session); 1641 if (err) { 1642 pr_err("Failed to process events, error %d", err); 1643 goto out_delete; 1644 } 1645 1646 sched->nr_events = session->evlist->stats.nr_events[0]; 1647 sched->nr_lost_events = session->evlist->stats.total_lost; 1648 sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST]; 1649 } 1650 1651 rc = 0; 1652 out_delete: 1653 perf_session__delete(session); 1654 return rc; 1655 } 1656 1657 static void print_bad_events(struct perf_sched *sched) 1658 { 1659 if (sched->nr_unordered_timestamps && sched->nr_timestamps) { 1660 printf(" INFO: %.3f%% unordered timestamps (%ld out of %ld)\n", 1661 (double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0, 1662 sched->nr_unordered_timestamps, sched->nr_timestamps); 1663 } 1664 if (sched->nr_lost_events && sched->nr_events) { 1665 printf(" INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n", 1666 (double)sched->nr_lost_events/(double)sched->nr_events * 100.0, 1667 sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks); 1668 } 1669 if (sched->nr_context_switch_bugs && sched->nr_timestamps) { 1670 printf(" INFO: %.3f%% context switch bugs (%ld out of %ld)", 1671 (double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0, 1672 sched->nr_context_switch_bugs, sched->nr_timestamps); 1673 if (sched->nr_lost_events) 1674 printf(" (due to lost events?)"); 1675 printf("\n"); 1676 } 1677 } 1678 1679 static void __merge_work_atoms(struct rb_root *root, struct work_atoms *data) 1680 { 1681 struct rb_node **new = &(root->rb_node), *parent = NULL; 1682 struct work_atoms *this; 1683 const char *comm = thread__comm_str(data->thread), *this_comm; 1684 1685 while (*new) { 1686 int cmp; 1687 1688 this = container_of(*new, struct work_atoms, node); 1689 parent = *new; 1690 1691 this_comm = thread__comm_str(this->thread); 1692 cmp = strcmp(comm, this_comm); 1693 if (cmp > 0) { 1694 new = &((*new)->rb_left); 1695 } else if (cmp < 0) { 1696 new = &((*new)->rb_right); 1697 } else { 1698 this->num_merged++; 1699 this->total_runtime += data->total_runtime; 1700 this->nb_atoms += data->nb_atoms; 1701 this->total_lat += data->total_lat; 1702 list_splice(&data->work_list, &this->work_list); 1703 if (this->max_lat < data->max_lat) { 1704 this->max_lat = data->max_lat; 1705 this->max_lat_at = data->max_lat_at; 1706 } 1707 zfree(&data); 1708 return; 1709 } 1710 } 1711 1712 data->num_merged++; 1713 rb_link_node(&data->node, parent, new); 1714 rb_insert_color(&data->node, root); 1715 } 1716 1717 static void perf_sched__merge_lat(struct perf_sched *sched) 1718 { 1719 struct work_atoms *data; 1720 struct rb_node *node; 1721 1722 if (sched->skip_merge) 1723 return; 1724 1725 while ((node = rb_first(&sched->atom_root))) { 1726 rb_erase(node, &sched->atom_root); 1727 data = rb_entry(node, struct work_atoms, node); 1728 __merge_work_atoms(&sched->merged_atom_root, data); 1729 } 1730 } 1731 1732 static int perf_sched__lat(struct perf_sched *sched) 1733 { 1734 struct rb_node *next; 1735 1736 setup_pager(); 1737 1738 if (perf_sched__read_events(sched)) 1739 return -1; 1740 1741 perf_sched__merge_lat(sched); 1742 perf_sched__sort_lat(sched); 1743 1744 printf("\n -----------------------------------------------------------------------------------------------------------------\n"); 1745 printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | Maximum delay at |\n"); 1746 printf(" -----------------------------------------------------------------------------------------------------------------\n"); 1747 1748 next = rb_first(&sched->sorted_atom_root); 1749 1750 while (next) { 1751 struct work_atoms *work_list; 1752 1753 work_list = rb_entry(next, struct work_atoms, node); 1754 output_lat_thread(sched, work_list); 1755 next = rb_next(next); 1756 thread__zput(work_list->thread); 1757 } 1758 1759 printf(" -----------------------------------------------------------------------------------------------------------------\n"); 1760 printf(" TOTAL: |%11.3f ms |%9" PRIu64 " |\n", 1761 (double)sched->all_runtime / NSEC_PER_MSEC, sched->all_count); 1762 1763 printf(" ---------------------------------------------------\n"); 1764 1765 print_bad_events(sched); 1766 printf("\n"); 1767 1768 return 0; 1769 } 1770 1771 static int setup_map_cpus(struct perf_sched *sched) 1772 { 1773 struct cpu_map *map; 1774 1775 sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF); 1776 1777 if (sched->map.comp) { 1778 sched->map.comp_cpus = zalloc(sched->max_cpu * sizeof(int)); 1779 if (!sched->map.comp_cpus) 1780 return -1; 1781 } 1782 1783 if (!sched->map.cpus_str) 1784 return 0; 1785 1786 map = cpu_map__new(sched->map.cpus_str); 1787 if (!map) { 1788 pr_err("failed to get cpus map from %s\n", sched->map.cpus_str); 1789 return -1; 1790 } 1791 1792 sched->map.cpus = map; 1793 return 0; 1794 } 1795 1796 static int setup_color_pids(struct perf_sched *sched) 1797 { 1798 struct thread_map *map; 1799 1800 if (!sched->map.color_pids_str) 1801 return 0; 1802 1803 map = thread_map__new_by_tid_str(sched->map.color_pids_str); 1804 if (!map) { 1805 pr_err("failed to get thread map from %s\n", sched->map.color_pids_str); 1806 return -1; 1807 } 1808 1809 sched->map.color_pids = map; 1810 return 0; 1811 } 1812 1813 static int setup_color_cpus(struct perf_sched *sched) 1814 { 1815 struct cpu_map *map; 1816 1817 if (!sched->map.color_cpus_str) 1818 return 0; 1819 1820 map = cpu_map__new(sched->map.color_cpus_str); 1821 if (!map) { 1822 pr_err("failed to get thread map from %s\n", sched->map.color_cpus_str); 1823 return -1; 1824 } 1825 1826 sched->map.color_cpus = map; 1827 return 0; 1828 } 1829 1830 static int perf_sched__map(struct perf_sched *sched) 1831 { 1832 if (setup_map_cpus(sched)) 1833 return -1; 1834 1835 if (setup_color_pids(sched)) 1836 return -1; 1837 1838 if (setup_color_cpus(sched)) 1839 return -1; 1840 1841 setup_pager(); 1842 if (perf_sched__read_events(sched)) 1843 return -1; 1844 print_bad_events(sched); 1845 return 0; 1846 } 1847 1848 static int perf_sched__replay(struct perf_sched *sched) 1849 { 1850 unsigned long i; 1851 1852 calibrate_run_measurement_overhead(sched); 1853 calibrate_sleep_measurement_overhead(sched); 1854 1855 test_calibrations(sched); 1856 1857 if (perf_sched__read_events(sched)) 1858 return -1; 1859 1860 printf("nr_run_events: %ld\n", sched->nr_run_events); 1861 printf("nr_sleep_events: %ld\n", sched->nr_sleep_events); 1862 printf("nr_wakeup_events: %ld\n", sched->nr_wakeup_events); 1863 1864 if (sched->targetless_wakeups) 1865 printf("target-less wakeups: %ld\n", sched->targetless_wakeups); 1866 if (sched->multitarget_wakeups) 1867 printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups); 1868 if (sched->nr_run_events_optimized) 1869 printf("run atoms optimized: %ld\n", 1870 sched->nr_run_events_optimized); 1871 1872 print_task_traces(sched); 1873 add_cross_task_wakeups(sched); 1874 1875 create_tasks(sched); 1876 printf("------------------------------------------------------------\n"); 1877 for (i = 0; i < sched->replay_repeat; i++) 1878 run_one_test(sched); 1879 1880 return 0; 1881 } 1882 1883 static void setup_sorting(struct perf_sched *sched, const struct option *options, 1884 const char * const usage_msg[]) 1885 { 1886 char *tmp, *tok, *str = strdup(sched->sort_order); 1887 1888 for (tok = strtok_r(str, ", ", &tmp); 1889 tok; tok = strtok_r(NULL, ", ", &tmp)) { 1890 if (sort_dimension__add(tok, &sched->sort_list) < 0) { 1891 usage_with_options_msg(usage_msg, options, 1892 "Unknown --sort key: `%s'", tok); 1893 } 1894 } 1895 1896 free(str); 1897 1898 sort_dimension__add("pid", &sched->cmp_pid); 1899 } 1900 1901 static int __cmd_record(int argc, const char **argv) 1902 { 1903 unsigned int rec_argc, i, j; 1904 const char **rec_argv; 1905 const char * const record_args[] = { 1906 "record", 1907 "-a", 1908 "-R", 1909 "-m", "1024", 1910 "-c", "1", 1911 "-e", "sched:sched_switch", 1912 "-e", "sched:sched_stat_wait", 1913 "-e", "sched:sched_stat_sleep", 1914 "-e", "sched:sched_stat_iowait", 1915 "-e", "sched:sched_stat_runtime", 1916 "-e", "sched:sched_process_fork", 1917 "-e", "sched:sched_wakeup", 1918 "-e", "sched:sched_wakeup_new", 1919 "-e", "sched:sched_migrate_task", 1920 }; 1921 1922 rec_argc = ARRAY_SIZE(record_args) + argc - 1; 1923 rec_argv = calloc(rec_argc + 1, sizeof(char *)); 1924 1925 if (rec_argv == NULL) 1926 return -ENOMEM; 1927 1928 for (i = 0; i < ARRAY_SIZE(record_args); i++) 1929 rec_argv[i] = strdup(record_args[i]); 1930 1931 for (j = 1; j < (unsigned int)argc; j++, i++) 1932 rec_argv[i] = argv[j]; 1933 1934 BUG_ON(i != rec_argc); 1935 1936 return cmd_record(i, rec_argv, NULL); 1937 } 1938 1939 int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused) 1940 { 1941 const char default_sort_order[] = "avg, max, switch, runtime"; 1942 struct perf_sched sched = { 1943 .tool = { 1944 .sample = perf_sched__process_tracepoint_sample, 1945 .comm = perf_event__process_comm, 1946 .lost = perf_event__process_lost, 1947 .fork = perf_sched__process_fork_event, 1948 .ordered_events = true, 1949 }, 1950 .cmp_pid = LIST_HEAD_INIT(sched.cmp_pid), 1951 .sort_list = LIST_HEAD_INIT(sched.sort_list), 1952 .start_work_mutex = PTHREAD_MUTEX_INITIALIZER, 1953 .work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER, 1954 .sort_order = default_sort_order, 1955 .replay_repeat = 10, 1956 .profile_cpu = -1, 1957 .next_shortname1 = 'A', 1958 .next_shortname2 = '0', 1959 .skip_merge = 0, 1960 }; 1961 const struct option sched_options[] = { 1962 OPT_STRING('i', "input", &input_name, "file", 1963 "input file name"), 1964 OPT_INCR('v', "verbose", &verbose, 1965 "be more verbose (show symbol address, etc)"), 1966 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, 1967 "dump raw trace in ASCII"), 1968 OPT_END() 1969 }; 1970 const struct option latency_options[] = { 1971 OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]", 1972 "sort by key(s): runtime, switch, avg, max"), 1973 OPT_INCR('v', "verbose", &verbose, 1974 "be more verbose (show symbol address, etc)"), 1975 OPT_INTEGER('C', "CPU", &sched.profile_cpu, 1976 "CPU to profile on"), 1977 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, 1978 "dump raw trace in ASCII"), 1979 OPT_BOOLEAN('p', "pids", &sched.skip_merge, 1980 "latency stats per pid instead of per comm"), 1981 OPT_PARENT(sched_options) 1982 }; 1983 const struct option replay_options[] = { 1984 OPT_UINTEGER('r', "repeat", &sched.replay_repeat, 1985 "repeat the workload replay N times (-1: infinite)"), 1986 OPT_INCR('v', "verbose", &verbose, 1987 "be more verbose (show symbol address, etc)"), 1988 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, 1989 "dump raw trace in ASCII"), 1990 OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"), 1991 OPT_PARENT(sched_options) 1992 }; 1993 const struct option map_options[] = { 1994 OPT_BOOLEAN(0, "compact", &sched.map.comp, 1995 "map output in compact mode"), 1996 OPT_STRING(0, "color-pids", &sched.map.color_pids_str, "pids", 1997 "highlight given pids in map"), 1998 OPT_STRING(0, "color-cpus", &sched.map.color_cpus_str, "cpus", 1999 "highlight given CPUs in map"), 2000 OPT_STRING(0, "cpus", &sched.map.cpus_str, "cpus", 2001 "display given CPUs in map"), 2002 OPT_PARENT(sched_options) 2003 }; 2004 const char * const latency_usage[] = { 2005 "perf sched latency [<options>]", 2006 NULL 2007 }; 2008 const char * const replay_usage[] = { 2009 "perf sched replay [<options>]", 2010 NULL 2011 }; 2012 const char * const map_usage[] = { 2013 "perf sched map [<options>]", 2014 NULL 2015 }; 2016 const char *const sched_subcommands[] = { "record", "latency", "map", 2017 "replay", "script", NULL }; 2018 const char *sched_usage[] = { 2019 NULL, 2020 NULL 2021 }; 2022 struct trace_sched_handler lat_ops = { 2023 .wakeup_event = latency_wakeup_event, 2024 .switch_event = latency_switch_event, 2025 .runtime_event = latency_runtime_event, 2026 .migrate_task_event = latency_migrate_task_event, 2027 }; 2028 struct trace_sched_handler map_ops = { 2029 .switch_event = map_switch_event, 2030 }; 2031 struct trace_sched_handler replay_ops = { 2032 .wakeup_event = replay_wakeup_event, 2033 .switch_event = replay_switch_event, 2034 .fork_event = replay_fork_event, 2035 }; 2036 unsigned int i; 2037 2038 for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++) 2039 sched.curr_pid[i] = -1; 2040 2041 argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands, 2042 sched_usage, PARSE_OPT_STOP_AT_NON_OPTION); 2043 if (!argc) 2044 usage_with_options(sched_usage, sched_options); 2045 2046 /* 2047 * Aliased to 'perf script' for now: 2048 */ 2049 if (!strcmp(argv[0], "script")) 2050 return cmd_script(argc, argv, prefix); 2051 2052 if (!strncmp(argv[0], "rec", 3)) { 2053 return __cmd_record(argc, argv); 2054 } else if (!strncmp(argv[0], "lat", 3)) { 2055 sched.tp_handler = &lat_ops; 2056 if (argc > 1) { 2057 argc = parse_options(argc, argv, latency_options, latency_usage, 0); 2058 if (argc) 2059 usage_with_options(latency_usage, latency_options); 2060 } 2061 setup_sorting(&sched, latency_options, latency_usage); 2062 return perf_sched__lat(&sched); 2063 } else if (!strcmp(argv[0], "map")) { 2064 if (argc) { 2065 argc = parse_options(argc, argv, map_options, map_usage, 0); 2066 if (argc) 2067 usage_with_options(map_usage, map_options); 2068 } 2069 sched.tp_handler = &map_ops; 2070 setup_sorting(&sched, latency_options, latency_usage); 2071 return perf_sched__map(&sched); 2072 } else if (!strncmp(argv[0], "rep", 3)) { 2073 sched.tp_handler = &replay_ops; 2074 if (argc) { 2075 argc = parse_options(argc, argv, replay_options, replay_usage, 0); 2076 if (argc) 2077 usage_with_options(replay_usage, replay_options); 2078 } 2079 return perf_sched__replay(&sched); 2080 } else { 2081 usage_with_options(sched_usage, sched_options); 2082 } 2083 2084 return 0; 2085 } 2086