#include "builtin.h"
#include "perf.h"

#include "util/util.h"
#include "util/evlist.h"
#include "util/cache.h"
#include "util/evsel.h"
#include "util/symbol.h"
#include "util/thread.h"
#include "util/header.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/cloexec.h"
#include "util/thread_map.h"
#include "util/color.h"

#include <subcmd/parse-options.h>
#include "util/trace-event.h"

#include "util/debug.h"

#include <sys/prctl.h>
#include <sys/resource.h>

#include <semaphore.h>
#include <pthread.h>
#include <math.h>
#include <api/fs/fs.h>
#include <linux/time64.h>

#define PR_SET_NAME	15	/* Set process name */
#define MAX_CPUS	4096
#define COMM_LEN	20
#define SYM_LEN		129
#define MAX_PID		1024000

struct sched_atom;

struct task_desc {
	unsigned long nr;
	unsigned long pid;
	char comm[COMM_LEN];

	unsigned long nr_events;
	unsigned long curr_event;
	struct sched_atom **atoms;

	pthread_t thread;
	sem_t sleep_sem;

	sem_t ready_for_work;
	sem_t work_done_sem;

	u64 cpu_usage;
};

enum sched_event_type {
	SCHED_EVENT_RUN,
	SCHED_EVENT_SLEEP,
	SCHED_EVENT_WAKEUP,
	SCHED_EVENT_MIGRATION,
};

struct sched_atom {
	enum sched_event_type type;
	int specific_wait;
	u64 timestamp;
	u64 duration;
	unsigned long nr;
	sem_t *wait_sem;
	struct task_desc *wakee;
};

#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"

enum thread_state {
	THREAD_SLEEPING = 0,
	THREAD_WAIT_CPU,
	THREAD_SCHED_IN,
	THREAD_IGNORE
};

struct work_atom {
	struct list_head list;
	enum thread_state state;
	u64 sched_out_time;
	u64 wake_up_time;
	u64 sched_in_time;
	u64 runtime;
};

struct work_atoms {
	struct list_head work_list;
	struct thread *thread;
	struct rb_node node;
	u64 max_lat;
	u64 max_lat_at;
	u64 total_lat;
	u64 nb_atoms;
	u64 total_runtime;
	int num_merged;
};

typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);

struct perf_sched;

struct trace_sched_handler {
	int (*switch_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine);

	int (*runtime_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			     struct perf_sample *sample, struct machine *machine);

	int (*wakeup_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine);

	/* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
	int (*fork_event)(struct perf_sched *sched, union perf_event *event,
			  struct machine *machine);

	int (*migrate_task_event)(struct perf_sched *sched,
				  struct perf_evsel *evsel,
				  struct perf_sample *sample,
				  struct machine *machine);
};

#define COLOR_PIDS PERF_COLOR_BLUE
#define COLOR_CPUS PERF_COLOR_BG_RED

struct perf_sched_map {
	DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS);
	int *comp_cpus;
	bool comp;
	struct thread_map *color_pids;
	const char *color_pids_str;
	struct cpu_map *color_cpus;
	const char *color_cpus_str;
	struct cpu_map *cpus;
	const char *cpus_str;
};

struct perf_sched {
	struct perf_tool tool;
	const char *sort_order;
	unsigned long nr_tasks;
	struct task_desc **pid_to_task;
	struct task_desc **tasks;
	const struct trace_sched_handler *tp_handler;
	pthread_mutex_t start_work_mutex;
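	/*
	 * Replay synchronization: the parent holds these two mutexes while
	 * setting up a run; worker threads briefly lock/unlock them to wait
	 * for the start of a run and to signal its completion (see
	 * create_tasks() and wait_for_tasks()).
	 */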
	pthread_mutex_t work_done_wait_mutex;
	int profile_cpu;
	/*
	 * Track the current task - that way we can know whether there are any
	 * weird events, such as a task being switched away that is not current.
	 */
	int max_cpu;
	u32 curr_pid[MAX_CPUS];
	struct thread *curr_thread[MAX_CPUS];
	char next_shortname1;
	char next_shortname2;
	unsigned int replay_repeat;
	unsigned long nr_run_events;
	unsigned long nr_sleep_events;
	unsigned long nr_wakeup_events;
	unsigned long nr_sleep_corrections;
	unsigned long nr_run_events_optimized;
	unsigned long targetless_wakeups;
	unsigned long multitarget_wakeups;
	unsigned long nr_runs;
	unsigned long nr_timestamps;
	unsigned long nr_unordered_timestamps;
	unsigned long nr_context_switch_bugs;
	unsigned long nr_events;
	unsigned long nr_lost_chunks;
	unsigned long nr_lost_events;
	u64 run_measurement_overhead;
	u64 sleep_measurement_overhead;
	u64 start_time;
	u64 cpu_usage;
	u64 runavg_cpu_usage;
	u64 parent_cpu_usage;
	u64 runavg_parent_cpu_usage;
	u64 sum_runtime;
	u64 sum_fluct;
	u64 run_avg;
	u64 all_runtime;
	u64 all_count;
	u64 cpu_last_switched[MAX_CPUS];
	struct rb_root atom_root, sorted_atom_root, merged_atom_root;
	struct list_head sort_list, cmp_pid;
	bool force;
	bool skip_merge;
	struct perf_sched_map map;
};

static u64 get_nsecs(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);

	return ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
}

static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
{
	u64 T0 = get_nsecs(), T1;

	do {
		T1 = get_nsecs();
	} while (T1 + sched->run_measurement_overhead < T0 + nsecs);
}

static void sleep_nsecs(u64 nsecs)
{
	struct timespec ts;

	ts.tv_nsec = nsecs % 999999999;
	ts.tv_sec = nsecs / 999999999;

	nanosleep(&ts, NULL);
}

static void calibrate_run_measurement_overhead(struct perf_sched *sched)
{
	u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
		burn_nsecs(sched, 0);
		T1 = get_nsecs();
		delta = T1-T0;
		min_delta = min(min_delta, delta);
	}
	sched->run_measurement_overhead = min_delta;

	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}

static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
{
	u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
		sleep_nsecs(10000);
		T1 = get_nsecs();
		delta = T1-T0;
		min_delta = min(min_delta, delta);
	}
	min_delta -= 10000;
	sched->sleep_measurement_overhead = min_delta;

	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}

static struct sched_atom *
get_new_event(struct task_desc *task, u64 timestamp)
{
	struct sched_atom *event = zalloc(sizeof(*event));
	unsigned long idx = task->nr_events;
	size_t size;

	event->timestamp = timestamp;
	event->nr = idx;

	task->nr_events++;
	size = sizeof(struct sched_atom *) * task->nr_events;
	task->atoms = realloc(task->atoms, size);
	BUG_ON(!task->atoms);

	task->atoms[idx] = event;

	return event;
}

static struct sched_atom *last_event(struct task_desc *task)
{
	if (!task->nr_events)
		return NULL;

	return task->atoms[task->nr_events - 1];
}

static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
				u64 timestamp, u64 duration)
{
	struct sched_atom *event, *curr_event = last_event(task);

	/*
	 * optimize an existing RUN event by merging this one
	 * to it:
	 */
	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
		sched->nr_run_events_optimized++;
		curr_event->duration += duration;
		return;
	}

	event = get_new_event(task, timestamp);

	event->type = SCHED_EVENT_RUN;
	event->duration = duration;

	sched->nr_run_events++;
}

static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
				   u64 timestamp, struct task_desc *wakee)
{
	struct sched_atom *event, *wakee_event;

	event = get_new_event(task, timestamp);
	event->type = SCHED_EVENT_WAKEUP;
	event->wakee = wakee;

	wakee_event = last_event(wakee);
	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
		sched->targetless_wakeups++;
		return;
	}
	if (wakee_event->wait_sem) {
		sched->multitarget_wakeups++;
		return;
	}

	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
	sem_init(wakee_event->wait_sem, 0, 0);
	wakee_event->specific_wait = 1;
	event->wait_sem = wakee_event->wait_sem;

	sched->nr_wakeup_events++;
}

static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
				  u64 timestamp, u64 task_state __maybe_unused)
{
	struct sched_atom *event = get_new_event(task, timestamp);

	event->type = SCHED_EVENT_SLEEP;

	sched->nr_sleep_events++;
}

static struct task_desc *register_pid(struct perf_sched *sched,
				      unsigned long pid, const char *comm)
{
	struct task_desc *task;
	static int pid_max;

	if (sched->pid_to_task == NULL) {
		if (sysctl__read_int("kernel/pid_max", &pid_max) < 0)
			pid_max = MAX_PID;
		BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL);
	}
	if (pid >= (unsigned long)pid_max) {
		BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + 1) *
			sizeof(struct task_desc *))) == NULL);
		while (pid >= (unsigned long)pid_max)
			sched->pid_to_task[pid_max++] = NULL;
	}

	task = sched->pid_to_task[pid];

	if (task)
		return task;

	task = zalloc(sizeof(*task));
	task->pid = pid;
	task->nr = sched->nr_tasks;
	strcpy(task->comm, comm);
	/*
	 * every task starts in sleeping state - this gets ignored
	 * if there's no wakeup pointing to this sleep state:
	 */
	add_sched_event_sleep(sched, task, 0, 0);

	sched->pid_to_task[pid] = task;
	sched->nr_tasks++;
	sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
	BUG_ON(!sched->tasks);
	sched->tasks[task->nr] = task;

	if (verbose)
		printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);

	return task;
}


static void print_task_traces(struct perf_sched *sched)
{
	struct task_desc *task;
	unsigned long i;

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
			task->nr, task->comm, task->pid, task->nr_events);
	}
}

static void add_cross_task_wakeups(struct perf_sched *sched)
{
	struct task_desc *task1, *task2;
	unsigned long i, j;

	for (i = 0; i < sched->nr_tasks; i++) {
		task1 = sched->tasks[i];
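		/* wake the next task in registration order, wrapping to the first */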
		j = i + 1;
		if (j == sched->nr_tasks)
			j = 0;
		task2 = sched->tasks[j];
		add_sched_event_wakeup(sched, task1, 0, task2);
	}
}

static void perf_sched__process_event(struct perf_sched *sched,
				      struct sched_atom *atom)
{
	int ret = 0;

	switch (atom->type) {
		case SCHED_EVENT_RUN:
			burn_nsecs(sched, atom->duration);
			break;
		case SCHED_EVENT_SLEEP:
			if (atom->wait_sem)
				ret = sem_wait(atom->wait_sem);
			BUG_ON(ret);
			break;
		case SCHED_EVENT_WAKEUP:
			if (atom->wait_sem)
				ret = sem_post(atom->wait_sem);
			BUG_ON(ret);
			break;
		case SCHED_EVENT_MIGRATION:
			break;
		default:
			BUG_ON(1);
	}
}

static u64 get_cpu_usage_nsec_parent(void)
{
	struct rusage ru;
	u64 sum;
	int err;

	err = getrusage(RUSAGE_SELF, &ru);
	BUG_ON(err);

	sum = ru.ru_utime.tv_sec * NSEC_PER_SEC + ru.ru_utime.tv_usec * NSEC_PER_USEC;
	sum += ru.ru_stime.tv_sec * NSEC_PER_SEC + ru.ru_stime.tv_usec * NSEC_PER_USEC;

	return sum;
}

static int self_open_counters(struct perf_sched *sched, unsigned long cur_task)
{
	struct perf_event_attr attr;
	char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE];
	int fd;
	struct rlimit limit;
	bool need_privilege = false;

	memset(&attr, 0, sizeof(attr));

	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;

force_again:
	fd = sys_perf_event_open(&attr, 0, -1, -1,
				 perf_event_open_cloexec_flag());

	if (fd < 0) {
		if (errno == EMFILE) {
			if (sched->force) {
				BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1);
				limit.rlim_cur += sched->nr_tasks - cur_task;
				if (limit.rlim_cur > limit.rlim_max) {
					limit.rlim_max = limit.rlim_cur;
					need_privilege = true;
				}
				if (setrlimit(RLIMIT_NOFILE, &limit) == -1) {
					if (need_privilege && errno == EPERM)
						strcpy(info, "Need privilege\n");
				} else
					goto force_again;
			} else
				strcpy(info, "Have a try with -f option\n");
		}
		pr_err("Error: sys_perf_event_open() syscall returned "
		       "with %d (%s)\n%s", fd,
		       str_error_r(errno, sbuf, sizeof(sbuf)), info);
		exit(EXIT_FAILURE);
	}
	return fd;
}

static u64 get_cpu_usage_nsec_self(int fd)
{
	u64 runtime;
	int ret;

	ret = read(fd, &runtime, sizeof(runtime));
	BUG_ON(ret != sizeof(runtime));

	return runtime;
}

struct sched_thread_parms {
	struct task_desc *task;
	struct perf_sched *sched;
	int fd;
};

static void *thread_func(void *ctx)
{
	struct sched_thread_parms *parms = ctx;
	struct task_desc *this_task = parms->task;
	struct perf_sched *sched = parms->sched;
	u64 cpu_usage_0, cpu_usage_1;
	unsigned long i, ret;
	char comm2[22];
	int fd = parms->fd;

	zfree(&parms);

	sprintf(comm2, ":%s", this_task->comm);
	prctl(PR_SET_NAME, comm2);
	if (fd < 0)
		return NULL;
again:
	ret = sem_post(&this_task->ready_for_work);
	BUG_ON(ret);
	ret = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(ret);
	ret = pthread_mutex_unlock(&sched->start_work_mutex);
	BUG_ON(ret);

	cpu_usage_0 = get_cpu_usage_nsec_self(fd);

	for (i = 0; i < this_task->nr_events; i++) {
		this_task->curr_event = i;
		perf_sched__process_event(sched, this_task->atoms[i]);
	}

	cpu_usage_1 = get_cpu_usage_nsec_self(fd);
	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
	ret = sem_post(&this_task->work_done_sem);
	BUG_ON(ret);

	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(ret);
	ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
	BUG_ON(ret);

	goto again;
}

static void create_tasks(struct perf_sched *sched)
{
	struct task_desc *task;
	pthread_attr_t attr;
	unsigned long i;
	int err;

	err = pthread_attr_init(&attr);
	BUG_ON(err);
	err = pthread_attr_setstacksize(&attr,
			(size_t) max(16 * 1024, PTHREAD_STACK_MIN));
	BUG_ON(err);
	err = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(err);
	err = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(err);
	for (i = 0; i < sched->nr_tasks; i++) {
		struct sched_thread_parms *parms = malloc(sizeof(*parms));
		BUG_ON(parms == NULL);
		parms->task = task = sched->tasks[i];
		parms->sched = sched;
		parms->fd = self_open_counters(sched, i);
		sem_init(&task->sleep_sem, 0, 0);
		sem_init(&task->ready_for_work, 0, 0);
		sem_init(&task->work_done_sem, 0, 0);
		task->curr_event = 0;
		err = pthread_create(&task->thread, &attr, thread_func, parms);
		BUG_ON(err);
	}
}

static void wait_for_tasks(struct perf_sched *sched)
{
	u64 cpu_usage_0, cpu_usage_1;
	struct task_desc *task;
	unsigned long i, ret;

	sched->start_time = get_nsecs();
	sched->cpu_usage = 0;
	pthread_mutex_unlock(&sched->work_done_wait_mutex);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		ret = sem_wait(&task->ready_for_work);
		BUG_ON(ret);
		sem_init(&task->ready_for_work, 0, 0);
	}
	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(ret);

	cpu_usage_0 = get_cpu_usage_nsec_parent();

	pthread_mutex_unlock(&sched->start_work_mutex);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		ret = sem_wait(&task->work_done_sem);
		BUG_ON(ret);
		sem_init(&task->work_done_sem, 0, 0);
		sched->cpu_usage += task->cpu_usage;
		task->cpu_usage = 0;
	}

	cpu_usage_1 = get_cpu_usage_nsec_parent();
	if (!sched->runavg_cpu_usage)
		sched->runavg_cpu_usage = sched->cpu_usage;
	sched->runavg_cpu_usage = (sched->runavg_cpu_usage * (sched->replay_repeat - 1) + sched->cpu_usage) / sched->replay_repeat;

	sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
	if (!sched->runavg_parent_cpu_usage)
		sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
	sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) +
					  sched->parent_cpu_usage) / sched->replay_repeat;

	ret = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(ret);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		sem_init(&task->sleep_sem, 0, 0);
		task->curr_event = 0;
	}
}

static void run_one_test(struct perf_sched *sched)
{
	u64 T0, T1, delta, avg_delta, fluct;

	T0 = get_nsecs();
	wait_for_tasks(sched);
	T1 = get_nsecs();

	delta = T1 - T0;
	sched->sum_runtime += delta;
	sched->nr_runs++;

	avg_delta = sched->sum_runtime / sched->nr_runs;
	if (delta < avg_delta)
		fluct = avg_delta - delta;
	else
		fluct = delta - avg_delta;
	sched->sum_fluct += fluct;
	if (!sched->run_avg)
		sched->run_avg = delta;
	sched->run_avg = (sched->run_avg * (sched->replay_repeat - 1) + delta) / sched->replay_repeat;

	printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / NSEC_PER_MSEC);
	printf("ravg: %0.2f, ", (double)sched->run_avg / NSEC_PER_MSEC);

	printf("cpu: %0.2f / %0.2f",
		(double)sched->cpu_usage / NSEC_PER_MSEC, (double)sched->runavg_cpu_usage / NSEC_PER_MSEC);

#if 0
	/*
	 * rusage statistics done by the parent, these are less
	 * accurate than the sched->sum_exec_runtime based statistics:
	 */
	printf(" [%0.2f / %0.2f]",
		(double)sched->parent_cpu_usage / NSEC_PER_MSEC,
		(double)sched->runavg_parent_cpu_usage / NSEC_PER_MSEC);
#endif

	printf("\n");

	if (sched->nr_sleep_corrections)
		printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
	sched->nr_sleep_corrections = 0;
}

static void test_calibrations(struct perf_sched *sched)
{
	u64 T0, T1;

	T0 = get_nsecs();
	burn_nsecs(sched, NSEC_PER_MSEC);
	T1 = get_nsecs();

	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);

	T0 = get_nsecs();
	sleep_nsecs(NSEC_PER_MSEC);
	T1 = get_nsecs();

	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
}

static int
replay_wakeup_event(struct perf_sched *sched,
		    struct perf_evsel *evsel, struct perf_sample *sample,
		    struct machine *machine __maybe_unused)
{
	const char *comm = perf_evsel__strval(evsel, sample, "comm");
	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
	struct task_desc *waker, *wakee;

	if (verbose) {
		printf("sched_wakeup event %p\n", evsel);

		printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
	}

	waker = register_pid(sched, sample->tid, "<unknown>");
	wakee = register_pid(sched, pid, comm);

	add_sched_event_wakeup(sched, waker, sample->time, wakee);
	return 0;
}

static int replay_switch_event(struct perf_sched *sched,
			       struct perf_evsel *evsel,
			       struct perf_sample *sample,
			       struct machine *machine __maybe_unused)
{
	const char *prev_comm = perf_evsel__strval(evsel, sample, "prev_comm"),
		   *next_comm = perf_evsel__strval(evsel, sample, "next_comm");
	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
	struct task_desc *prev, __maybe_unused *next;
	u64 timestamp0, timestamp = sample->time;
	int cpu = sample->cpu;
	s64 delta;

	if (verbose)
		printf("sched_switch event %p\n", evsel);

	if (cpu >= MAX_CPUS || cpu < 0)
		return 0;

	timestamp0 = sched->cpu_last_switched[cpu];
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0) {
		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
		return -1;
	}

	pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
		 prev_comm, prev_pid, next_comm, next_pid, delta);
	prev = register_pid(sched, prev_pid, prev_comm);
	next = register_pid(sched, next_pid, next_comm);

	sched->cpu_last_switched[cpu] = timestamp;

	add_sched_event_run(sched, prev, timestamp, delta);
	add_sched_event_sleep(sched, prev, timestamp, prev_state);

	return 0;
}

static int replay_fork_event(struct perf_sched *sched,
			     union perf_event *event,
			     struct machine *machine)
{
	struct thread *child, *parent;

	child = machine__findnew_thread(machine, event->fork.pid,
					event->fork.tid);
	parent = machine__findnew_thread(machine, event->fork.ppid,
					 event->fork.ptid);

	if (child == NULL || parent == NULL) {
		pr_debug("thread does not exist on fork event: child %p, parent %p\n",
			 child, parent);
		goto out_put;
	}

	if (verbose) {
		printf("fork event\n");
		printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid);
		printf("... child: %s/%d\n", thread__comm_str(child), child->tid);
	}

	register_pid(sched, parent->tid, thread__comm_str(parent));
	register_pid(sched, child->tid, thread__comm_str(child));
out_put:
	thread__put(child);
	thread__put(parent);
	return 0;
}

struct sort_dimension {
	const char *name;
	sort_fn_t cmp;
	struct list_head list;
};

static int
thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
{
	struct sort_dimension *sort;
	int ret = 0;

	BUG_ON(list_empty(list));

	list_for_each_entry(sort, list, list) {
		ret = sort->cmp(l, r);
		if (ret)
			return ret;
	}

	return ret;
}

static struct work_atoms *
thread_atoms_search(struct rb_root *root, struct thread *thread,
		    struct list_head *sort_list)
{
	struct rb_node *node = root->rb_node;
	struct work_atoms key = { .thread = thread };

	while (node) {
		struct work_atoms *atoms;
		int cmp;

		atoms = container_of(node, struct work_atoms, node);

		cmp = thread_lat_cmp(sort_list, &key, atoms);
		if (cmp > 0)
			node = node->rb_left;
		else if (cmp < 0)
			node = node->rb_right;
		else {
			BUG_ON(thread != atoms->thread);
			return atoms;
		}
	}
	return NULL;
}

static void
__thread_latency_insert(struct rb_root *root, struct work_atoms *data,
			struct list_head *sort_list)
{
	struct rb_node **new = &(root->rb_node), *parent = NULL;

	while (*new) {
		struct work_atoms *this;
		int cmp;

		this = container_of(*new, struct work_atoms, node);
		parent = *new;

		cmp = thread_lat_cmp(sort_list, data, this);

		if (cmp > 0)
			new = &((*new)->rb_left);
		else
			new = &((*new)->rb_right);
	}

	rb_link_node(&data->node, parent, new);
	rb_insert_color(&data->node, root);
}

static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
{
	struct work_atoms *atoms = zalloc(sizeof(*atoms));
	if (!atoms) {
		pr_err("No memory at %s\n", __func__);
		return -1;
	}

	atoms->thread = thread__get(thread);
	INIT_LIST_HEAD(&atoms->work_list);
	__thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
	return 0;
}

static char sched_out_state(u64 prev_state)
{
	const char *str = TASK_STATE_TO_CHAR_STR;

	return str[prev_state];
}

static int
add_sched_out_event(struct work_atoms *atoms,
		    char run_state,
		    u64 timestamp)
{
	struct work_atom *atom = zalloc(sizeof(*atom));
	if (!atom) {
		pr_err("No memory at %s\n", __func__);
		return -1;
	}

	atom->sched_out_time = timestamp;

	if (run_state == 'R') {
		atom->state = THREAD_WAIT_CPU;
		atom->wake_up_time = atom->sched_out_time;
	}

	list_add_tail(&atom->list, &atoms->work_list);
	return 0;
}

static void
add_runtime_event(struct work_atoms *atoms, u64 delta,
		  u64 timestamp __maybe_unused)
{
	struct work_atom *atom;

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	atom->runtime += delta;
	atoms->total_runtime += delta;
}

static void
add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
{
	struct work_atom *atom;
	u64 delta;

	if (list_empty(&atoms->work_list))
		return;

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	if (atom->state != THREAD_WAIT_CPU)
		return;

	if (timestamp < atom->wake_up_time) {
		atom->state = THREAD_IGNORE;
		return;
	}

	atom->state = THREAD_SCHED_IN;
	atom->sched_in_time = timestamp;

	delta = atom->sched_in_time - atom->wake_up_time;
	atoms->total_lat += delta;
	if (delta > atoms->max_lat) {
		atoms->max_lat = delta;
		atoms->max_lat_at = timestamp;
	}
	atoms->nb_atoms++;
}

static int latency_switch_event(struct perf_sched *sched,
				struct perf_evsel *evsel,
				struct perf_sample *sample,
				struct machine *machine)
{
	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
	struct work_atoms *out_events, *in_events;
	struct thread *sched_out, *sched_in;
	u64 timestamp0, timestamp = sample->time;
	int cpu = sample->cpu, err = -1;
	s64 delta;

	BUG_ON(cpu >= MAX_CPUS || cpu < 0);

	timestamp0 = sched->cpu_last_switched[cpu];
	sched->cpu_last_switched[cpu] = timestamp;
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0) {
		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
		return -1;
	}

	sched_out = machine__findnew_thread(machine, -1, prev_pid);
	sched_in = machine__findnew_thread(machine, -1, next_pid);
	if (sched_out == NULL || sched_in == NULL)
		goto out_put;

	out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
	if (!out_events) {
		if (thread_atoms_insert(sched, sched_out))
			goto out_put;
		out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
		if (!out_events) {
			pr_err("out-event: Internal tree error");
			goto out_put;
		}
	}
	if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
		return -1;

	in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
	if (!in_events) {
		if (thread_atoms_insert(sched, sched_in))
			goto out_put;
		in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
		if (!in_events) {
			pr_err("in-event: Internal tree error");
			goto out_put;
		}
		/*
		 * Task came in that we have not heard about yet,
		 * add an initial atom in runnable state:
		 */
		if (add_sched_out_event(in_events, 'R', timestamp))
			goto out_put;
	}
	add_sched_in_event(in_events, timestamp);
	err = 0;
out_put:
	thread__put(sched_out);
	thread__put(sched_in);
	return err;
}

static int latency_runtime_event(struct perf_sched *sched,
				 struct perf_evsel *evsel,
				 struct perf_sample *sample,
				 struct machine *machine)
{
	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
	const u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	struct thread *thread = machine__findnew_thread(machine, -1, pid);
	struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
	u64 timestamp = sample->time;
	int cpu = sample->cpu, err = -1;

	if (thread == NULL)
		return -1;

	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
	if (!atoms) {
		if (thread_atoms_insert(sched, thread))
			goto out_put;
		atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
		if (!atoms) {
			pr_err("in-event: Internal tree error");
			goto out_put;
		}
		if (add_sched_out_event(atoms, 'R', timestamp))
			goto out_put;
	}

	add_runtime_event(atoms, runtime, timestamp);
	err = 0;
out_put:
	thread__put(thread);
	return err;
}

static int latency_wakeup_event(struct perf_sched *sched,
				struct perf_evsel *evsel,
				struct perf_sample *sample,
				struct machine *machine)
{
	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
	struct work_atoms *atoms;
	struct work_atom *atom;
	struct thread *wakee;
	u64 timestamp = sample->time;
	int err = -1;

	wakee = machine__findnew_thread(machine, -1, pid);
	if (wakee == NULL)
		return -1;
	atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
	if (!atoms) {
		if (thread_atoms_insert(sched, wakee))
			goto out_put;
		atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
		if (!atoms) {
			pr_err("wakeup-event: Internal tree error");
			goto out_put;
		}
		if (add_sched_out_event(atoms, 'S', timestamp))
			goto out_put;
	}

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	/*
	 * The wakeup event is not guaranteed to happen while the task is off
	 * the run queue: it may also fire while the task is still on the run
	 * queue and the wakeup merely changes ->state to TASK_RUNNING.  In
	 * that case we must not set ->wake_up_time for a task that is already
	 * on the run queue.
	 *
	 * You WILL be missing events if you've recorded only
	 * one CPU, or are only looking at one, so don't
	 * skip in this case.
	 */
	if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
		goto out_ok;

	sched->nr_timestamps++;
	if (atom->sched_out_time > timestamp) {
		sched->nr_unordered_timestamps++;
		goto out_ok;
	}

	atom->state = THREAD_WAIT_CPU;
	atom->wake_up_time = timestamp;
out_ok:
	err = 0;
out_put:
	thread__put(wakee);
	return err;
}

static int latency_migrate_task_event(struct perf_sched *sched,
				      struct perf_evsel *evsel,
				      struct perf_sample *sample,
				      struct machine *machine)
{
	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
	u64 timestamp = sample->time;
	struct work_atoms *atoms;
	struct work_atom *atom;
	struct thread *migrant;
	int err = -1;

	/*
	 * Only need to worry about migration when profiling one CPU.
	 */
	if (sched->profile_cpu == -1)
		return 0;

	migrant = machine__findnew_thread(machine, -1, pid);
	if (migrant == NULL)
		return -1;
	atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
	if (!atoms) {
		if (thread_atoms_insert(sched, migrant))
			goto out_put;
		register_pid(sched, migrant->tid, thread__comm_str(migrant));
		atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
		if (!atoms) {
			pr_err("migration-event: Internal tree error");
			goto out_put;
		}
		if (add_sched_out_event(atoms, 'R', timestamp))
			goto out_put;
	}

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
	atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;

	sched->nr_timestamps++;

	if (atom->sched_out_time > timestamp)
		sched->nr_unordered_timestamps++;
	err = 0;
out_put:
	thread__put(migrant);
	return err;
}

static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
{
	int i;
	int ret;
	u64 avg;

	if (!work_list->nb_atoms)
		return;
	/*
	 * Ignore idle threads:
	 */
	if (!strcmp(thread__comm_str(work_list->thread), "swapper"))
		return;

	sched->all_runtime += work_list->total_runtime;
	sched->all_count += work_list->nb_atoms;

	if (work_list->num_merged > 1)
		ret = printf(" %s:(%d) ", thread__comm_str(work_list->thread), work_list->num_merged);
	else
		ret = printf(" %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);

	for (i = 0; i < 24 - ret; i++)
		printf(" ");

	avg = work_list->total_lat / work_list->nb_atoms;

	printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %13.6f s\n",
		(double)work_list->total_runtime / NSEC_PER_MSEC,
		work_list->nb_atoms, (double)avg / NSEC_PER_MSEC,
		(double)work_list->max_lat / NSEC_PER_MSEC,
		(double)work_list->max_lat_at / NSEC_PER_SEC);
}

static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->thread == r->thread)
		return 0;
	if (l->thread->tid < r->thread->tid)
		return -1;
	if (l->thread->tid > r->thread->tid)
		return 1;
	return (int)(l->thread - r->thread);
}

static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
{
	u64 avgl, avgr;

	if (!l->nb_atoms)
		return -1;

	if (!r->nb_atoms)
		return 1;

	avgl = l->total_lat / l->nb_atoms;
	avgr = r->total_lat / r->nb_atoms;

	if (avgl < avgr)
		return -1;
	if (avgl > avgr)
		return 1;

	return 0;
}

static int max_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->max_lat < r->max_lat)
		return -1;
	if (l->max_lat > r->max_lat)
		return 1;

	return 0;
}

static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->nb_atoms < r->nb_atoms)
		return -1;
	if (l->nb_atoms > r->nb_atoms)
		return 1;

	return 0;
}

static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->total_runtime < r->total_runtime)
		return -1;
	if (l->total_runtime > r->total_runtime)
		return 1;

	return 0;
}

static int sort_dimension__add(const char *tok, struct list_head *list)
{
	size_t i;
	static struct sort_dimension avg_sort_dimension = {
		.name = "avg",
		.cmp = avg_cmp,
	};
	static struct sort_dimension max_sort_dimension = {
		.name = "max",
		.cmp = max_cmp,
	};
	static struct sort_dimension pid_sort_dimension = {
		.name = "pid",
		.cmp = pid_cmp,
	};
	static struct sort_dimension runtime_sort_dimension = {
		.name = "runtime",
		.cmp = runtime_cmp,
	};
	static struct sort_dimension switch_sort_dimension = {
		.name = "switch",
		.cmp = switch_cmp,
	};
	struct sort_dimension *available_sorts[] = {
		&pid_sort_dimension,
		&avg_sort_dimension,
		&max_sort_dimension,
		&switch_sort_dimension,
		&runtime_sort_dimension,
	};

	for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
		if (!strcmp(available_sorts[i]->name, tok)) {
			list_add_tail(&available_sorts[i]->list, list);

			return 0;
		}
	}

	return -1;
}

static void perf_sched__sort_lat(struct perf_sched *sched)
{
	struct rb_node *node;
	struct rb_root *root = &sched->atom_root;
again:
	for (;;) {
		struct work_atoms *data;
		node = rb_first(root);
		if (!node)
			break;

		rb_erase(node, root);
		data = rb_entry(node, struct work_atoms, node);
		__thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
	}
	if (root == &sched->atom_root) {
		root = &sched->merged_atom_root;
		goto again;
	}
}

static int process_sched_wakeup_event(struct perf_tool *tool,
				      struct perf_evsel *evsel,
				      struct perf_sample *sample,
				      struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);

	if (sched->tp_handler->wakeup_event)
		return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);

	return 0;
}

union map_priv {
	void *ptr;
	bool color;
};

static bool thread__has_color(struct thread *thread)
{
	union map_priv priv = {
		.ptr = thread__priv(thread),
	};

	return priv.color;
}

static struct thread*
map__findnew_thread(struct perf_sched *sched, struct machine *machine, pid_t pid, pid_t tid)
{
	struct thread *thread = machine__findnew_thread(machine, pid, tid);
	union map_priv priv = {
		.color = false,
	};

	if (!sched->map.color_pids || !thread || thread__priv(thread))
		return thread;

	if (thread_map__has(sched->map.color_pids, tid))
		priv.color = true;

	thread__set_priv(thread, priv.ptr);
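	/* the color decision is stashed in ->priv so thread__has_color() can read it back later */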
	return thread;
}

static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine)
{
	const u32 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	struct thread *sched_in;
	int new_shortname;
	u64 timestamp0, timestamp = sample->time;
	s64 delta;
	int i, this_cpu = sample->cpu;
	int cpus_nr;
	bool new_cpu = false;
	const char *color = PERF_COLOR_NORMAL;

	BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);

	if (this_cpu > sched->max_cpu)
		sched->max_cpu = this_cpu;

	if (sched->map.comp) {
		cpus_nr = bitmap_weight(sched->map.comp_cpus_mask, MAX_CPUS);
		if (!test_and_set_bit(this_cpu, sched->map.comp_cpus_mask)) {
			sched->map.comp_cpus[cpus_nr++] = this_cpu;
			new_cpu = true;
		}
	} else
		cpus_nr = sched->max_cpu;

	timestamp0 = sched->cpu_last_switched[this_cpu];
	sched->cpu_last_switched[this_cpu] = timestamp;
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0) {
		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
		return -1;
	}

	sched_in = map__findnew_thread(sched, machine, -1, next_pid);
	if (sched_in == NULL)
		return -1;

	sched->curr_thread[this_cpu] = thread__get(sched_in);

	printf(" ");

	new_shortname = 0;
	if (!sched_in->shortname[0]) {
		if (!strcmp(thread__comm_str(sched_in), "swapper")) {
			/*
			 * Don't allocate a letter-number for swapper:0
			 * as a shortname. Instead, we use '.' for it.
			 */
			sched_in->shortname[0] = '.';
			sched_in->shortname[1] = ' ';
		} else {
			sched_in->shortname[0] = sched->next_shortname1;
			sched_in->shortname[1] = sched->next_shortname2;

			if (sched->next_shortname1 < 'Z') {
				sched->next_shortname1++;
			} else {
				sched->next_shortname1 = 'A';
				if (sched->next_shortname2 < '9')
					sched->next_shortname2++;
				else
					sched->next_shortname2 = '0';
			}
		}
		new_shortname = 1;
	}

	for (i = 0; i < cpus_nr; i++) {
		int cpu = sched->map.comp ? sched->map.comp_cpus[i] : i;
		struct thread *curr_thread = sched->curr_thread[cpu];
		const char *pid_color = color;
		const char *cpu_color = color;

		if (curr_thread && thread__has_color(curr_thread))
			pid_color = COLOR_PIDS;

		if (sched->map.cpus && !cpu_map__has(sched->map.cpus, cpu))
			continue;

		if (sched->map.color_cpus && cpu_map__has(sched->map.color_cpus, cpu))
			cpu_color = COLOR_CPUS;

		if (cpu != this_cpu)
			color_fprintf(stdout, cpu_color, " ");
		else
			color_fprintf(stdout, cpu_color, "*");

		if (sched->curr_thread[cpu])
			color_fprintf(stdout, pid_color, "%2s ", sched->curr_thread[cpu]->shortname);
		else
			color_fprintf(stdout, color, " ");
	}

	if (sched->map.cpus && !cpu_map__has(sched->map.cpus, this_cpu))
		goto out;

	color_fprintf(stdout, color, " %12.6f secs ", (double)timestamp / NSEC_PER_SEC);
	if (new_shortname) {
		const char *pid_color = color;

		if (thread__has_color(sched_in))
			pid_color = COLOR_PIDS;

		color_fprintf(stdout, pid_color, "%s => %s:%d",
			sched_in->shortname, thread__comm_str(sched_in), sched_in->tid);
	}

	if (sched->map.comp && new_cpu)
		color_fprintf(stdout, color, " (CPU %d)", this_cpu);

out:
	color_fprintf(stdout, color, "\n");

	thread__put(sched_in);

	return 0;
}

static int process_sched_switch_event(struct perf_tool *tool,
				      struct perf_evsel *evsel,
				      struct perf_sample *sample,
				      struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
	int this_cpu = sample->cpu, err = 0;
	u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
	    next_pid = perf_evsel__intval(evsel, sample, "next_pid");

	if (sched->curr_pid[this_cpu] != (u32)-1) {
		/*
		 * Are we trying to switch away a PID that is
		 * not current?
		 */
		if (sched->curr_pid[this_cpu] != prev_pid)
			sched->nr_context_switch_bugs++;
	}

	if (sched->tp_handler->switch_event)
		err = sched->tp_handler->switch_event(sched, evsel, sample, machine);

	sched->curr_pid[this_cpu] = next_pid;
	return err;
}

static int process_sched_runtime_event(struct perf_tool *tool,
				       struct perf_evsel *evsel,
				       struct perf_sample *sample,
				       struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);

	if (sched->tp_handler->runtime_event)
		return sched->tp_handler->runtime_event(sched, evsel, sample, machine);

	return 0;
}

static int perf_sched__process_fork_event(struct perf_tool *tool,
					  union perf_event *event,
					  struct perf_sample *sample,
					  struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);

	/* run the fork event through the perf machinery */
	perf_event__process_fork(tool, event, sample, machine);

	/* and then run additional processing needed for this command */
	if (sched->tp_handler->fork_event)
		return sched->tp_handler->fork_event(sched, event, machine);

	return 0;
}

static int process_sched_migrate_task_event(struct perf_tool *tool,
					    struct perf_evsel *evsel,
					    struct perf_sample *sample,
					    struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);

	if (sched->tp_handler->migrate_task_event)
		return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);

	return 0;
}

typedef int (*tracepoint_handler)(struct perf_tool *tool,
				  struct perf_evsel *evsel,
				  struct perf_sample *sample,
				  struct machine *machine);

static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
						 union perf_event *event __maybe_unused,
						 struct perf_sample *sample,
						 struct perf_evsel *evsel,
						 struct machine *machine)
{
	int err = 0;

	if (evsel->handler != NULL) {
		tracepoint_handler f = evsel->handler;
		err = f(tool, evsel, sample, machine);
	}

	return err;
}

static int perf_sched__read_events(struct perf_sched *sched)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "sched:sched_switch", process_sched_switch_event, },
		{ "sched:sched_stat_runtime", process_sched_runtime_event, },
		{ "sched:sched_wakeup", process_sched_wakeup_event, },
		{ "sched:sched_wakeup_new", process_sched_wakeup_event, },
		{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
	};
	struct perf_session *session;
	struct perf_data_file file = {
		.path = input_name,
		.mode = PERF_DATA_MODE_READ,
		.force = sched->force,
	};
	int rc = -1;

	session = perf_session__new(&file, false, &sched->tool);
	if (session == NULL) {
		pr_debug("No Memory for session\n");
		return -1;
	}

	symbol__init(&session->header.env);

	if (perf_session__set_tracepoints_handlers(session, handlers))
		goto out_delete;

	if (perf_session__has_traces(session, "record -R")) {
		int err = perf_session__process_events(session);
		if (err) {
			pr_err("Failed to process events, error %d", err);
			goto out_delete;
		}

		sched->nr_events = session->evlist->stats.nr_events[0];
		sched->nr_lost_events = session->evlist->stats.total_lost;
		sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST];
	}

	rc = 0;
out_delete:
	perf_session__delete(session);
	return rc;
}

static void print_bad_events(struct perf_sched *sched)
{
	if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
		printf(" INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
			(double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
			sched->nr_unordered_timestamps, sched->nr_timestamps);
	}
	if (sched->nr_lost_events && sched->nr_events) {
		printf(" INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
			(double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
			sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
	}
	if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
		printf(" INFO: %.3f%% context switch bugs (%ld out of %ld)",
			(double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
			sched->nr_context_switch_bugs, sched->nr_timestamps);
		if (sched->nr_lost_events)
			printf(" (due to lost events?)");
		printf("\n");
	}
}

static void __merge_work_atoms(struct rb_root *root, struct work_atoms *data)
{
	struct rb_node **new = &(root->rb_node), *parent = NULL;
	struct work_atoms *this;
	const char *comm = thread__comm_str(data->thread), *this_comm;

	while (*new) {
		int cmp;

		this = container_of(*new, struct work_atoms, node);
		parent = *new;

		this_comm = thread__comm_str(this->thread);
		cmp = strcmp(comm, this_comm);
		if (cmp > 0) {
			new = &((*new)->rb_left);
		} else if (cmp < 0) {
			new = &((*new)->rb_right);
		} else {
			this->num_merged++;
			this->total_runtime += data->total_runtime;
			this->nb_atoms += data->nb_atoms;
			this->total_lat += data->total_lat;
			list_splice(&data->work_list, &this->work_list);
			if (this->max_lat < data->max_lat) {
				this->max_lat = data->max_lat;
				this->max_lat_at = data->max_lat_at;
			}
			zfree(&data);
			return;
		}
	}

	data->num_merged++;
	rb_link_node(&data->node, parent, new);
	rb_insert_color(&data->node, root);
}

static void perf_sched__merge_lat(struct perf_sched *sched)
{
	struct work_atoms *data;
	struct rb_node *node;

	if (sched->skip_merge)
		return;

	while ((node = rb_first(&sched->atom_root))) {
		rb_erase(node, &sched->atom_root);
		data = rb_entry(node, struct work_atoms, node);
		__merge_work_atoms(&sched->merged_atom_root, data);
	}
}

static int perf_sched__lat(struct perf_sched *sched)
{
	struct rb_node *next;

	setup_pager();

	if (perf_sched__read_events(sched))
		return -1;

	perf_sched__merge_lat(sched);
	perf_sched__sort_lat(sched);

	printf("\n -----------------------------------------------------------------------------------------------------------------\n");
	printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | Maximum delay at |\n");
	printf(" -----------------------------------------------------------------------------------------------------------------\n");

	next = rb_first(&sched->sorted_atom_root);

	while (next) {
		struct work_atoms *work_list;

		work_list = rb_entry(next, struct work_atoms, node);
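		/*
		 * Print one line per merged thread entry, then drop the
		 * thread reference taken when the node was inserted.
		 */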
		output_lat_thread(sched, work_list);
		next = rb_next(next);
		thread__zput(work_list->thread);
	}

	printf(" -----------------------------------------------------------------------------------------------------------------\n");
	printf(" TOTAL: |%11.3f ms |%9" PRIu64 " |\n",
		(double)sched->all_runtime / NSEC_PER_MSEC, sched->all_count);

	printf(" ---------------------------------------------------\n");

	print_bad_events(sched);
	printf("\n");

	return 0;
}

static int setup_map_cpus(struct perf_sched *sched)
{
	struct cpu_map *map;

	sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF);

	if (sched->map.comp) {
		sched->map.comp_cpus = zalloc(sched->max_cpu * sizeof(int));
		if (!sched->map.comp_cpus)
			return -1;
	}

	if (!sched->map.cpus_str)
		return 0;

	map = cpu_map__new(sched->map.cpus_str);
	if (!map) {
		pr_err("failed to get cpus map from %s\n", sched->map.cpus_str);
		return -1;
	}

	sched->map.cpus = map;
	return 0;
}

static int setup_color_pids(struct perf_sched *sched)
{
	struct thread_map *map;

	if (!sched->map.color_pids_str)
		return 0;

	map = thread_map__new_by_tid_str(sched->map.color_pids_str);
	if (!map) {
		pr_err("failed to get thread map from %s\n", sched->map.color_pids_str);
		return -1;
	}

	sched->map.color_pids = map;
	return 0;
}

static int setup_color_cpus(struct perf_sched *sched)
{
	struct cpu_map *map;

	if (!sched->map.color_cpus_str)
		return 0;

	map = cpu_map__new(sched->map.color_cpus_str);
	if (!map) {
		pr_err("failed to get cpus map from %s\n", sched->map.color_cpus_str);
		return -1;
	}

	sched->map.color_cpus = map;
	return 0;
}

static int perf_sched__map(struct perf_sched *sched)
{
	if (setup_map_cpus(sched))
		return -1;

	if (setup_color_pids(sched))
		return -1;

	if (setup_color_cpus(sched))
		return -1;

	setup_pager();
	if (perf_sched__read_events(sched))
		return -1;
	print_bad_events(sched);
	return 0;
}

static int perf_sched__replay(struct perf_sched *sched)
{
	unsigned long i;

	calibrate_run_measurement_overhead(sched);
	calibrate_sleep_measurement_overhead(sched);

	test_calibrations(sched);

	if (perf_sched__read_events(sched))
		return -1;

	printf("nr_run_events: %ld\n", sched->nr_run_events);
	printf("nr_sleep_events: %ld\n", sched->nr_sleep_events);
	printf("nr_wakeup_events: %ld\n", sched->nr_wakeup_events);

	if (sched->targetless_wakeups)
		printf("target-less wakeups: %ld\n", sched->targetless_wakeups);
	if (sched->multitarget_wakeups)
		printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
	if (sched->nr_run_events_optimized)
		printf("run atoms optimized: %ld\n",
			sched->nr_run_events_optimized);

	print_task_traces(sched);
	add_cross_task_wakeups(sched);

	create_tasks(sched);
	printf("------------------------------------------------------------\n");
	for (i = 0; i < sched->replay_repeat; i++)
		run_one_test(sched);

	return 0;
}

static void setup_sorting(struct perf_sched *sched, const struct option *options,
			  const char * const usage_msg[])
{
	char *tmp, *tok, *str = strdup(sched->sort_order);

	for (tok = strtok_r(str, ", ", &tmp);
			tok; tok = strtok_r(NULL, ", ", &tmp)) {
		if (sort_dimension__add(tok, &sched->sort_list) < 0) {
			usage_with_options_msg(usage_msg, options,
					"Unknown --sort key: `%s'", tok);
		}
	}

	free(str);

	sort_dimension__add("pid", &sched->cmp_pid);
}

static int __cmd_record(int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
	const char * const record_args[] = {
		"record",
		"-a",
		"-R",
		"-m", "1024",
		"-c", "1",
		"-e", "sched:sched_switch",
		"-e", "sched:sched_stat_wait",
		"-e", "sched:sched_stat_sleep",
		"-e", "sched:sched_stat_iowait",
		"-e", "sched:sched_stat_runtime",
		"-e", "sched:sched_process_fork",
		"-e", "sched:sched_wakeup",
		"-e", "sched:sched_wakeup_new",
		"-e", "sched:sched_migrate_task",
	};

	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL)
		return -ENOMEM;

	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[i] = strdup(record_args[i]);

	for (j = 1; j < (unsigned int)argc; j++, i++)
		rec_argv[i] = argv[j];

	BUG_ON(i != rec_argc);

	return cmd_record(i, rec_argv, NULL);
}

int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
{
	const char default_sort_order[] = "avg, max, switch, runtime";
	struct perf_sched sched = {
		.tool = {
			.sample	 = perf_sched__process_tracepoint_sample,
			.comm	 = perf_event__process_comm,
			.lost	 = perf_event__process_lost,
			.fork	 = perf_sched__process_fork_event,
			.ordered_events = true,
		},
		.cmp_pid	      = LIST_HEAD_INIT(sched.cmp_pid),
		.sort_list	      = LIST_HEAD_INIT(sched.sort_list),
		.start_work_mutex     = PTHREAD_MUTEX_INITIALIZER,
		.work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
		.sort_order	      = default_sort_order,
		.replay_repeat	      = 10,
		.profile_cpu	      = -1,
		.next_shortname1      = 'A',
		.next_shortname2      = '0',
		.skip_merge	      = 0,
	};
	const struct option latency_options[] = {
	OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
		   "sort by key(s): runtime, switch, avg, max"),
	OPT_INCR('v', "verbose", &verbose,
		   "be more verbose (show symbol address, etc)"),
	OPT_INTEGER('C', "CPU", &sched.profile_cpu,
		   "CPU to profile on"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		   "dump raw trace in ASCII"),
	OPT_BOOLEAN('p', "pids", &sched.skip_merge,
		   "latency stats per pid instead of per comm"),
	OPT_END()
	};
	const struct option replay_options[] = {
	OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
		   "repeat the workload replay N times (-1: infinite)"),
	OPT_INCR('v', "verbose", &verbose,
		   "be more verbose (show symbol address, etc)"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		   "dump raw trace in ASCII"),
	OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"),
	OPT_END()
	};
	const struct option sched_options[] = {
	OPT_STRING('i', "input", &input_name, "file",
		   "input file name"),
	OPT_INCR('v', "verbose", &verbose,
		   "be more verbose (show symbol address, etc)"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		   "dump raw trace in ASCII"),
	OPT_END()
	};
	const struct option map_options[] = {
	OPT_BOOLEAN(0, "compact", &sched.map.comp,
"compact", &sched.map.comp, 1991 "map output in compact mode"), 1992 OPT_STRING(0, "color-pids", &sched.map.color_pids_str, "pids", 1993 "highlight given pids in map"), 1994 OPT_STRING(0, "color-cpus", &sched.map.color_cpus_str, "cpus", 1995 "highlight given CPUs in map"), 1996 OPT_STRING(0, "cpus", &sched.map.cpus_str, "cpus", 1997 "display given CPUs in map"), 1998 OPT_END() 1999 }; 2000 const char * const latency_usage[] = { 2001 "perf sched latency [<options>]", 2002 NULL 2003 }; 2004 const char * const replay_usage[] = { 2005 "perf sched replay [<options>]", 2006 NULL 2007 }; 2008 const char * const map_usage[] = { 2009 "perf sched map [<options>]", 2010 NULL 2011 }; 2012 const char *const sched_subcommands[] = { "record", "latency", "map", 2013 "replay", "script", NULL }; 2014 const char *sched_usage[] = { 2015 NULL, 2016 NULL 2017 }; 2018 struct trace_sched_handler lat_ops = { 2019 .wakeup_event = latency_wakeup_event, 2020 .switch_event = latency_switch_event, 2021 .runtime_event = latency_runtime_event, 2022 .migrate_task_event = latency_migrate_task_event, 2023 }; 2024 struct trace_sched_handler map_ops = { 2025 .switch_event = map_switch_event, 2026 }; 2027 struct trace_sched_handler replay_ops = { 2028 .wakeup_event = replay_wakeup_event, 2029 .switch_event = replay_switch_event, 2030 .fork_event = replay_fork_event, 2031 }; 2032 unsigned int i; 2033 2034 for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++) 2035 sched.curr_pid[i] = -1; 2036 2037 argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands, 2038 sched_usage, PARSE_OPT_STOP_AT_NON_OPTION); 2039 if (!argc) 2040 usage_with_options(sched_usage, sched_options); 2041 2042 /* 2043 * Aliased to 'perf script' for now: 2044 */ 2045 if (!strcmp(argv[0], "script")) 2046 return cmd_script(argc, argv, prefix); 2047 2048 if (!strncmp(argv[0], "rec", 3)) { 2049 return __cmd_record(argc, argv); 2050 } else if (!strncmp(argv[0], "lat", 3)) { 2051 sched.tp_handler = &lat_ops; 2052 if (argc > 1) { 2053 argc = parse_options(argc, argv, latency_options, latency_usage, 0); 2054 if (argc) 2055 usage_with_options(latency_usage, latency_options); 2056 } 2057 setup_sorting(&sched, latency_options, latency_usage); 2058 return perf_sched__lat(&sched); 2059 } else if (!strcmp(argv[0], "map")) { 2060 if (argc) { 2061 argc = parse_options(argc, argv, map_options, map_usage, 0); 2062 if (argc) 2063 usage_with_options(map_usage, map_options); 2064 } 2065 sched.tp_handler = &map_ops; 2066 setup_sorting(&sched, latency_options, latency_usage); 2067 return perf_sched__map(&sched); 2068 } else if (!strncmp(argv[0], "rep", 3)) { 2069 sched.tp_handler = &replay_ops; 2070 if (argc) { 2071 argc = parse_options(argc, argv, replay_options, replay_usage, 0); 2072 if (argc) 2073 usage_with_options(replay_usage, replay_options); 2074 } 2075 return perf_sched__replay(&sched); 2076 } else { 2077 usage_with_options(sched_usage, sched_options); 2078 } 2079 2080 return 0; 2081 } 2082