1 /* 2 * builtin-top.c 3 * 4 * Builtin top command: Display a continuously updated profile of 5 * any workload, CPU or specific PID. 6 * 7 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com> 8 * 9 * Improvements and fixes by: 10 * 11 * Arjan van de Ven <arjan@linux.intel.com> 12 * Yanmin Zhang <yanmin.zhang@intel.com> 13 * Wu Fengguang <fengguang.wu@intel.com> 14 * Mike Galbraith <efault@gmx.de> 15 * Paul Mackerras <paulus@samba.org> 16 * 17 * Released under the GPL v2. (and only v2, not any later version) 18 */ 19 #include "builtin.h" 20 21 #include "perf.h" 22 23 #include "util/symbol.h" 24 #include "util/color.h" 25 #include "util/util.h" 26 #include <linux/rbtree.h> 27 #include "util/parse-options.h" 28 #include "util/parse-events.h" 29 30 #include <assert.h> 31 #include <fcntl.h> 32 33 #include <stdio.h> 34 35 #include <errno.h> 36 #include <time.h> 37 #include <sched.h> 38 #include <pthread.h> 39 40 #include <sys/syscall.h> 41 #include <sys/ioctl.h> 42 #include <sys/poll.h> 43 #include <sys/prctl.h> 44 #include <sys/wait.h> 45 #include <sys/uio.h> 46 #include <sys/mman.h> 47 48 #include <linux/unistd.h> 49 #include <linux/types.h> 50 51 static int fd[MAX_NR_CPUS][MAX_COUNTERS]; 52 53 static int system_wide = 0; 54 55 static int default_interval = 100000; 56 57 static u64 count_filter = 5; 58 static int print_entries = 15; 59 60 static int target_pid = -1; 61 static int inherit = 0; 62 static int profile_cpu = -1; 63 static int nr_cpus = 0; 64 static unsigned int realtime_prio = 0; 65 static int group = 0; 66 static unsigned int page_size; 67 static unsigned int mmap_pages = 16; 68 static int freq = 0; 69 static int verbose = 0; 70 static char *vmlinux = NULL; 71 72 static char *sym_filter; 73 static unsigned long filter_start; 74 static unsigned long filter_end; 75 76 static int delay_secs = 2; 77 static int zero; 78 static int dump_symtab; 79 80 /* 81 * Symbols 82 */ 83 84 static u64 min_ip; 85 static u64 max_ip = -1ll; 86 87 struct sym_entry { 88 struct rb_node rb_node; 89 struct list_head node; 90 unsigned long count[MAX_COUNTERS]; 91 unsigned long snap_count; 92 double weight; 93 int skip; 94 }; 95 96 struct sym_entry *sym_filter_entry; 97 98 struct dso *kernel_dso; 99 100 /* 101 * Symbols will be added here in record_ip and will get out 102 * after decayed. 103 */ 104 static LIST_HEAD(active_symbols); 105 static pthread_mutex_t active_symbols_lock = PTHREAD_MUTEX_INITIALIZER; 106 107 /* 108 * Ordering weight: count-1 * count-2 * ... / count-n 109 */ 110 static double sym_weight(const struct sym_entry *sym) 111 { 112 double weight = sym->snap_count; 113 int counter; 114 115 for (counter = 1; counter < nr_counters-1; counter++) 116 weight *= sym->count[counter]; 117 118 weight /= (sym->count[counter] + 1); 119 120 return weight; 121 } 122 123 static long samples; 124 static long userspace_samples; 125 static const char CONSOLE_CLEAR[] = "[H[2J"; 126 127 static void __list_insert_active_sym(struct sym_entry *syme) 128 { 129 list_add(&syme->node, &active_symbols); 130 } 131 132 static void list_remove_active_sym(struct sym_entry *syme) 133 { 134 pthread_mutex_lock(&active_symbols_lock); 135 list_del_init(&syme->node); 136 pthread_mutex_unlock(&active_symbols_lock); 137 } 138 139 static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se) 140 { 141 struct rb_node **p = &tree->rb_node; 142 struct rb_node *parent = NULL; 143 struct sym_entry *iter; 144 145 while (*p != NULL) { 146 parent = *p; 147 iter = rb_entry(parent, struct sym_entry, rb_node); 148 149 if (se->weight > iter->weight) 150 p = &(*p)->rb_left; 151 else 152 p = &(*p)->rb_right; 153 } 154 155 rb_link_node(&se->rb_node, parent, p); 156 rb_insert_color(&se->rb_node, tree); 157 } 158 159 static void print_sym_table(void) 160 { 161 int printed = 0, j; 162 int counter; 163 float samples_per_sec = samples/delay_secs; 164 float ksamples_per_sec = (samples-userspace_samples)/delay_secs; 165 float sum_ksamples = 0.0; 166 struct sym_entry *syme, *n; 167 struct rb_root tmp = RB_ROOT; 168 struct rb_node *nd; 169 170 samples = userspace_samples = 0; 171 172 /* Sort the active symbols */ 173 pthread_mutex_lock(&active_symbols_lock); 174 syme = list_entry(active_symbols.next, struct sym_entry, node); 175 pthread_mutex_unlock(&active_symbols_lock); 176 177 list_for_each_entry_safe_from(syme, n, &active_symbols, node) { 178 syme->snap_count = syme->count[0]; 179 if (syme->snap_count != 0) { 180 syme->weight = sym_weight(syme); 181 rb_insert_active_sym(&tmp, syme); 182 sum_ksamples += syme->snap_count; 183 184 for (j = 0; j < nr_counters; j++) 185 syme->count[j] = zero ? 0 : syme->count[j] * 7 / 8; 186 } else 187 list_remove_active_sym(syme); 188 } 189 190 puts(CONSOLE_CLEAR); 191 192 printf( 193 "------------------------------------------------------------------------------\n"); 194 printf( " PerfTop:%8.0f irqs/sec kernel:%4.1f%% [", 195 samples_per_sec, 196 100.0 - (100.0*((samples_per_sec-ksamples_per_sec)/samples_per_sec))); 197 198 if (nr_counters == 1) { 199 printf("%Ld", (u64)attrs[0].sample_period); 200 if (freq) 201 printf("Hz "); 202 else 203 printf(" "); 204 } 205 206 for (counter = 0; counter < nr_counters; counter++) { 207 if (counter) 208 printf("/"); 209 210 printf("%s", event_name(counter)); 211 } 212 213 printf( "], "); 214 215 if (target_pid != -1) 216 printf(" (target_pid: %d", target_pid); 217 else 218 printf(" (all"); 219 220 if (profile_cpu != -1) 221 printf(", cpu: %d)\n", profile_cpu); 222 else { 223 if (target_pid != -1) 224 printf(")\n"); 225 else 226 printf(", %d CPUs)\n", nr_cpus); 227 } 228 229 printf("------------------------------------------------------------------------------\n\n"); 230 231 if (nr_counters == 1) 232 printf(" samples pcnt"); 233 else 234 printf(" weight samples pcnt"); 235 236 printf(" RIP kernel function\n" 237 " ______ _______ _____ ________________ _______________\n\n" 238 ); 239 240 for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) { 241 struct sym_entry *syme = rb_entry(nd, struct sym_entry, rb_node); 242 struct symbol *sym = (struct symbol *)(syme + 1); 243 double pcnt; 244 245 if (++printed > print_entries || syme->snap_count < count_filter) 246 continue; 247 248 pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) / 249 sum_ksamples)); 250 251 if (nr_counters == 1) 252 printf("%20.2f - ", syme->weight); 253 else 254 printf("%9.1f %10ld - ", syme->weight, syme->snap_count); 255 256 percent_color_fprintf(stdout, "%4.1f%%", pcnt); 257 printf(" - %016llx : %s", sym->start, sym->name); 258 if (sym->module) 259 printf("\t[%s]", sym->module->name); 260 printf("\n"); 261 } 262 } 263 264 static void *display_thread(void *arg __used) 265 { 266 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; 267 int delay_msecs = delay_secs * 1000; 268 269 printf("PerfTop refresh period: %d seconds\n", delay_secs); 270 271 do { 272 print_sym_table(); 273 } while (!poll(&stdin_poll, 1, delay_msecs) == 1); 274 275 printf("key pressed - exiting.\n"); 276 exit(0); 277 278 return NULL; 279 } 280 281 /* Tag samples to be skipped. */ 282 static const char *skip_symbols[] = { 283 "default_idle", 284 "cpu_idle", 285 "enter_idle", 286 "exit_idle", 287 "mwait_idle", 288 "mwait_idle_with_hints", 289 "ppc64_runlatch_off", 290 "pseries_dedicated_idle_sleep", 291 NULL 292 }; 293 294 static int symbol_filter(struct dso *self, struct symbol *sym) 295 { 296 static int filter_match; 297 struct sym_entry *syme; 298 const char *name = sym->name; 299 int i; 300 301 /* 302 * ppc64 uses function descriptors and appends a '.' to the 303 * start of every instruction address. Remove it. 304 */ 305 if (name[0] == '.') 306 name++; 307 308 if (!strcmp(name, "_text") || 309 !strcmp(name, "_etext") || 310 !strcmp(name, "_sinittext") || 311 !strncmp("init_module", name, 11) || 312 !strncmp("cleanup_module", name, 14) || 313 strstr(name, "_text_start") || 314 strstr(name, "_text_end")) 315 return 1; 316 317 syme = dso__sym_priv(self, sym); 318 for (i = 0; skip_symbols[i]; i++) { 319 if (!strcmp(skip_symbols[i], name)) { 320 syme->skip = 1; 321 break; 322 } 323 } 324 325 if (filter_match == 1) { 326 filter_end = sym->start; 327 filter_match = -1; 328 if (filter_end - filter_start > 10000) { 329 fprintf(stderr, 330 "hm, too large filter symbol <%s> - skipping.\n", 331 sym_filter); 332 fprintf(stderr, "symbol filter start: %016lx\n", 333 filter_start); 334 fprintf(stderr, " end: %016lx\n", 335 filter_end); 336 filter_end = filter_start = 0; 337 sym_filter = NULL; 338 sleep(1); 339 } 340 } 341 342 if (filter_match == 0 && sym_filter && !strcmp(name, sym_filter)) { 343 filter_match = 1; 344 filter_start = sym->start; 345 } 346 347 348 return 0; 349 } 350 351 static int parse_symbols(void) 352 { 353 struct rb_node *node; 354 struct symbol *sym; 355 int modules = vmlinux ? 1 : 0; 356 357 kernel_dso = dso__new("[kernel]", sizeof(struct sym_entry)); 358 if (kernel_dso == NULL) 359 return -1; 360 361 if (dso__load_kernel(kernel_dso, vmlinux, symbol_filter, verbose, modules) <= 0) 362 goto out_delete_dso; 363 364 node = rb_first(&kernel_dso->syms); 365 sym = rb_entry(node, struct symbol, rb_node); 366 min_ip = sym->start; 367 368 node = rb_last(&kernel_dso->syms); 369 sym = rb_entry(node, struct symbol, rb_node); 370 max_ip = sym->end; 371 372 if (dump_symtab) 373 dso__fprintf(kernel_dso, stderr); 374 375 return 0; 376 377 out_delete_dso: 378 dso__delete(kernel_dso); 379 kernel_dso = NULL; 380 return -1; 381 } 382 383 #define TRACE_COUNT 3 384 385 /* 386 * Binary search in the histogram table and record the hit: 387 */ 388 static void record_ip(u64 ip, int counter) 389 { 390 struct symbol *sym = dso__find_symbol(kernel_dso, ip); 391 392 if (sym != NULL) { 393 struct sym_entry *syme = dso__sym_priv(kernel_dso, sym); 394 395 if (!syme->skip) { 396 syme->count[counter]++; 397 pthread_mutex_lock(&active_symbols_lock); 398 if (list_empty(&syme->node) || !syme->node.next) 399 __list_insert_active_sym(syme); 400 pthread_mutex_unlock(&active_symbols_lock); 401 return; 402 } 403 } 404 405 samples--; 406 } 407 408 static void process_event(u64 ip, int counter, int user) 409 { 410 samples++; 411 412 if (user) { 413 userspace_samples++; 414 return; 415 } 416 417 record_ip(ip, counter); 418 } 419 420 struct mmap_data { 421 int counter; 422 void *base; 423 int mask; 424 unsigned int prev; 425 }; 426 427 static unsigned int mmap_read_head(struct mmap_data *md) 428 { 429 struct perf_counter_mmap_page *pc = md->base; 430 int head; 431 432 head = pc->data_head; 433 rmb(); 434 435 return head; 436 } 437 438 struct timeval last_read, this_read; 439 440 static void mmap_read_counter(struct mmap_data *md) 441 { 442 unsigned int head = mmap_read_head(md); 443 unsigned int old = md->prev; 444 unsigned char *data = md->base + page_size; 445 int diff; 446 447 gettimeofday(&this_read, NULL); 448 449 /* 450 * If we're further behind than half the buffer, there's a chance 451 * the writer will bite our tail and mess up the samples under us. 452 * 453 * If we somehow ended up ahead of the head, we got messed up. 454 * 455 * In either case, truncate and restart at head. 456 */ 457 diff = head - old; 458 if (diff > md->mask / 2 || diff < 0) { 459 struct timeval iv; 460 unsigned long msecs; 461 462 timersub(&this_read, &last_read, &iv); 463 msecs = iv.tv_sec*1000 + iv.tv_usec/1000; 464 465 fprintf(stderr, "WARNING: failed to keep up with mmap data." 466 " Last read %lu msecs ago.\n", msecs); 467 468 /* 469 * head points to a known good entry, start there. 470 */ 471 old = head; 472 } 473 474 last_read = this_read; 475 476 for (; old != head;) { 477 struct ip_event { 478 struct perf_event_header header; 479 u64 ip; 480 u32 pid, target_pid; 481 }; 482 struct mmap_event { 483 struct perf_event_header header; 484 u32 pid, target_pid; 485 u64 start; 486 u64 len; 487 u64 pgoff; 488 char filename[PATH_MAX]; 489 }; 490 491 typedef union event_union { 492 struct perf_event_header header; 493 struct ip_event ip; 494 struct mmap_event mmap; 495 } event_t; 496 497 event_t *event = (event_t *)&data[old & md->mask]; 498 499 event_t event_copy; 500 501 size_t size = event->header.size; 502 503 /* 504 * Event straddles the mmap boundary -- header should always 505 * be inside due to u64 alignment of output. 506 */ 507 if ((old & md->mask) + size != ((old + size) & md->mask)) { 508 unsigned int offset = old; 509 unsigned int len = min(sizeof(*event), size), cpy; 510 void *dst = &event_copy; 511 512 do { 513 cpy = min(md->mask + 1 - (offset & md->mask), len); 514 memcpy(dst, &data[offset & md->mask], cpy); 515 offset += cpy; 516 dst += cpy; 517 len -= cpy; 518 } while (len); 519 520 event = &event_copy; 521 } 522 523 old += size; 524 525 if (event->header.type == PERF_EVENT_SAMPLE) { 526 int user = 527 (event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK) == PERF_EVENT_MISC_USER; 528 process_event(event->ip.ip, md->counter, user); 529 } 530 } 531 532 md->prev = old; 533 } 534 535 static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; 536 static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; 537 538 static void mmap_read(void) 539 { 540 int i, counter; 541 542 for (i = 0; i < nr_cpus; i++) { 543 for (counter = 0; counter < nr_counters; counter++) 544 mmap_read_counter(&mmap_array[i][counter]); 545 } 546 } 547 548 int nr_poll; 549 int group_fd; 550 551 static void start_counter(int i, int counter) 552 { 553 struct perf_counter_attr *attr; 554 int cpu; 555 556 cpu = profile_cpu; 557 if (target_pid == -1 && profile_cpu == -1) 558 cpu = i; 559 560 attr = attrs + counter; 561 562 attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID; 563 attr->freq = freq; 564 attr->inherit = (cpu < 0) && inherit; 565 566 try_again: 567 fd[i][counter] = sys_perf_counter_open(attr, target_pid, cpu, group_fd, 0); 568 569 if (fd[i][counter] < 0) { 570 int err = errno; 571 572 if (err == EPERM) 573 die("No permission - are you root?\n"); 574 /* 575 * If it's cycles then fall back to hrtimer 576 * based cpu-clock-tick sw counter, which 577 * is always available even if no PMU support: 578 */ 579 if (attr->type == PERF_TYPE_HARDWARE 580 && attr->config == PERF_COUNT_HW_CPU_CYCLES) { 581 582 if (verbose) 583 warning(" ... trying to fall back to cpu-clock-ticks\n"); 584 585 attr->type = PERF_TYPE_SOFTWARE; 586 attr->config = PERF_COUNT_SW_CPU_CLOCK; 587 goto try_again; 588 } 589 printf("\n"); 590 error("perfcounter syscall returned with %d (%s)\n", 591 fd[i][counter], strerror(err)); 592 die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n"); 593 exit(-1); 594 } 595 assert(fd[i][counter] >= 0); 596 fcntl(fd[i][counter], F_SETFL, O_NONBLOCK); 597 598 /* 599 * First counter acts as the group leader: 600 */ 601 if (group && group_fd == -1) 602 group_fd = fd[i][counter]; 603 604 event_array[nr_poll].fd = fd[i][counter]; 605 event_array[nr_poll].events = POLLIN; 606 nr_poll++; 607 608 mmap_array[i][counter].counter = counter; 609 mmap_array[i][counter].prev = 0; 610 mmap_array[i][counter].mask = mmap_pages*page_size - 1; 611 mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size, 612 PROT_READ, MAP_SHARED, fd[i][counter], 0); 613 if (mmap_array[i][counter].base == MAP_FAILED) 614 die("failed to mmap with %d (%s)\n", errno, strerror(errno)); 615 } 616 617 static int __cmd_top(void) 618 { 619 pthread_t thread; 620 int i, counter; 621 int ret; 622 623 for (i = 0; i < nr_cpus; i++) { 624 group_fd = -1; 625 for (counter = 0; counter < nr_counters; counter++) 626 start_counter(i, counter); 627 } 628 629 /* Wait for a minimal set of events before starting the snapshot */ 630 poll(event_array, nr_poll, 100); 631 632 mmap_read(); 633 634 if (pthread_create(&thread, NULL, display_thread, NULL)) { 635 printf("Could not create display thread.\n"); 636 exit(-1); 637 } 638 639 if (realtime_prio) { 640 struct sched_param param; 641 642 param.sched_priority = realtime_prio; 643 if (sched_setscheduler(0, SCHED_FIFO, ¶m)) { 644 printf("Could not set realtime priority.\n"); 645 exit(-1); 646 } 647 } 648 649 while (1) { 650 int hits = samples; 651 652 mmap_read(); 653 654 if (hits == samples) 655 ret = poll(event_array, nr_poll, 100); 656 } 657 658 return 0; 659 } 660 661 static const char * const top_usage[] = { 662 "perf top [<options>]", 663 NULL 664 }; 665 666 static const struct option options[] = { 667 OPT_CALLBACK('e', "event", NULL, "event", 668 "event selector. use 'perf list' to list available events", 669 parse_events), 670 OPT_INTEGER('c', "count", &default_interval, 671 "event period to sample"), 672 OPT_INTEGER('p', "pid", &target_pid, 673 "profile events on existing pid"), 674 OPT_BOOLEAN('a', "all-cpus", &system_wide, 675 "system-wide collection from all CPUs"), 676 OPT_INTEGER('C', "CPU", &profile_cpu, 677 "CPU to profile on"), 678 OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"), 679 OPT_INTEGER('m', "mmap-pages", &mmap_pages, 680 "number of mmap data pages"), 681 OPT_INTEGER('r', "realtime", &realtime_prio, 682 "collect data with this RT SCHED_FIFO priority"), 683 OPT_INTEGER('d', "delay", &delay_secs, 684 "number of seconds to delay between refreshes"), 685 OPT_BOOLEAN('D', "dump-symtab", &dump_symtab, 686 "dump the symbol table used for profiling"), 687 OPT_INTEGER('f', "count-filter", &count_filter, 688 "only display functions with more events than this"), 689 OPT_BOOLEAN('g', "group", &group, 690 "put the counters into a counter group"), 691 OPT_BOOLEAN('i', "inherit", &inherit, 692 "child tasks inherit counters"), 693 OPT_STRING('s', "sym-filter", &sym_filter, "pattern", 694 "only display symbols matchig this pattern"), 695 OPT_BOOLEAN('z', "zero", &zero, 696 "zero history across updates"), 697 OPT_INTEGER('F', "freq", &freq, 698 "profile at this frequency"), 699 OPT_INTEGER('E', "entries", &print_entries, 700 "display this many functions"), 701 OPT_BOOLEAN('v', "verbose", &verbose, 702 "be more verbose (show counter open errors, etc)"), 703 OPT_END() 704 }; 705 706 int cmd_top(int argc, const char **argv, const char *prefix __used) 707 { 708 int counter; 709 710 symbol__init(); 711 712 page_size = sysconf(_SC_PAGE_SIZE); 713 714 argc = parse_options(argc, argv, options, top_usage, 0); 715 if (argc) 716 usage_with_options(top_usage, options); 717 718 if (freq) { 719 default_interval = freq; 720 freq = 1; 721 } 722 723 /* CPU and PID are mutually exclusive */ 724 if (target_pid != -1 && profile_cpu != -1) { 725 printf("WARNING: PID switch overriding CPU\n"); 726 sleep(1); 727 profile_cpu = -1; 728 } 729 730 if (!nr_counters) 731 nr_counters = 1; 732 733 if (delay_secs < 1) 734 delay_secs = 1; 735 736 parse_symbols(); 737 738 /* 739 * Fill in the ones not specifically initialized via -c: 740 */ 741 for (counter = 0; counter < nr_counters; counter++) { 742 if (attrs[counter].sample_period) 743 continue; 744 745 attrs[counter].sample_period = default_interval; 746 } 747 748 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); 749 assert(nr_cpus <= MAX_NR_CPUS); 750 assert(nr_cpus >= 0); 751 752 if (target_pid != -1 || profile_cpu != -1) 753 nr_cpus = 1; 754 755 return __cmd_top(); 756 } 757