1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * intel_tpebs.c: Intel TPEBS support 4 */ 5 6 #include <api/fs/fs.h> 7 #include <sys/param.h> 8 #include <subcmd/run-command.h> 9 #include <thread.h> 10 #include "intel-tpebs.h" 11 #include <linux/list.h> 12 #include <linux/zalloc.h> 13 #include <linux/err.h> 14 #include "sample.h" 15 #include "counts.h" 16 #include "debug.h" 17 #include "evlist.h" 18 #include "evsel.h" 19 #include "mutex.h" 20 #include "session.h" 21 #include "stat.h" 22 #include "tool.h" 23 #include "cpumap.h" 24 #include "metricgroup.h" 25 #include <sys/stat.h> 26 #include <sys/file.h> 27 #include <errno.h> 28 #include <poll.h> 29 #include <math.h> 30 31 #define PERF_DATA "-" 32 33 bool tpebs_recording; 34 enum tpebs_mode tpebs_mode; 35 static LIST_HEAD(tpebs_results); 36 static pthread_t tpebs_reader_thread; 37 static struct child_process tpebs_cmd; 38 static int control_fd[2], ack_fd[2]; 39 static struct mutex tpebs_mtx; 40 static bool tpebs_stopping; 41 42 struct tpebs_retire_lat { 43 struct list_head nd; 44 /** @evsel: The evsel that opened the retire_lat event. */ 45 struct evsel *evsel; 46 /** @event: Event passed to perf record. */ 47 char *event; 48 /** @stats: Recorded retirement latency stats. */ 49 struct stats stats; 50 /** @last: Last retirement latency read. */ 51 uint64_t last; 52 /* Has the event been sent to perf record? */ 53 bool started; 54 }; 55 56 static void tpebs_init(void) 57 { 58 mutex_init(&tpebs_mtx); 59 control_fd[0] = control_fd[1] = -1; 60 ack_fd[0] = ack_fd[1] = -1; 61 } 62 63 static struct mutex *tpebs_mtx_get(void) 64 { 65 static pthread_once_t tpebs_once = PTHREAD_ONCE_INIT; 66 67 pthread_once(&tpebs_once, tpebs_init); 68 return &tpebs_mtx; 69 } 70 71 static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel) 72 EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get()); 73 74 static int evsel__tpebs_start_perf_record(struct evsel *evsel) 75 { 76 const char **record_argv; 77 int tpebs_event_size = 0, i = 0, ret; 78 char control_fd_buf[32]; 79 char cpumap_buf[50]; 80 struct tpebs_retire_lat *t; 81 82 list_for_each_entry(t, &tpebs_results, nd) 83 tpebs_event_size++; 84 85 record_argv = malloc((10 + 2 * tpebs_event_size) * sizeof(*record_argv)); 86 if (!record_argv) 87 return -ENOMEM; 88 89 record_argv[i++] = "perf"; 90 record_argv[i++] = "record"; 91 record_argv[i++] = "-W"; 92 record_argv[i++] = "--synth=no"; 93 94 scnprintf(control_fd_buf, sizeof(control_fd_buf), "--control=fd:%d,%d", 95 control_fd[0], ack_fd[1]); 96 record_argv[i++] = control_fd_buf; 97 98 record_argv[i++] = "-o"; 99 record_argv[i++] = PERF_DATA; 100 101 if (!perf_cpu_map__is_any_cpu_or_is_empty(evsel->evlist->core.user_requested_cpus)) { 102 cpu_map__snprint(evsel->evlist->core.user_requested_cpus, cpumap_buf, 103 sizeof(cpumap_buf)); 104 record_argv[i++] = "-C"; 105 record_argv[i++] = cpumap_buf; 106 } 107 108 list_for_each_entry(t, &tpebs_results, nd) { 109 record_argv[i++] = "-e"; 110 record_argv[i++] = t->event; 111 } 112 record_argv[i++] = NULL; 113 assert(i == 10 + 2 * tpebs_event_size || i == 8 + 2 * tpebs_event_size); 114 /* Note, no workload given so system wide is implied. */ 115 116 assert(tpebs_cmd.pid == 0); 117 memset(&tpebs_cmd, 0, sizeof(tpebs_cmd)); 118 tpebs_cmd.argv = record_argv; 119 tpebs_cmd.out = -1; 120 ret = start_command(&tpebs_cmd); 121 zfree(&tpebs_cmd.argv); 122 list_for_each_entry(t, &tpebs_results, nd) 123 t->started = true; 124 125 return ret; 126 } 127 128 static bool is_child_pid(pid_t parent, pid_t child) 129 { 130 if (parent < 0 || child < 0) 131 return false; 132 133 while (true) { 134 char path[PATH_MAX]; 135 char line[256]; 136 FILE *fp; 137 138 new_child: 139 if (parent == child) 140 return true; 141 142 if (child <= 0) 143 return false; 144 145 scnprintf(path, sizeof(path), "%s/%d/status", procfs__mountpoint(), child); 146 fp = fopen(path, "r"); 147 if (!fp) { 148 /* Presumably the process went away. Assume not a child. */ 149 return false; 150 } 151 while (fgets(line, sizeof(line), fp) != NULL) { 152 if (strncmp(line, "PPid:", 5) == 0) { 153 fclose(fp); 154 if (sscanf(line + 5, "%d", &child) != 1) { 155 /* Unexpected error parsing. */ 156 return false; 157 } 158 goto new_child; 159 } 160 } 161 /* Unexpected EOF. */ 162 fclose(fp); 163 return false; 164 } 165 } 166 167 static bool should_ignore_sample(const struct perf_sample *sample, const struct tpebs_retire_lat *t) 168 { 169 pid_t workload_pid, sample_pid = sample->pid; 170 171 /* 172 * During evlist__purge the evlist will be removed prior to the 173 * evsel__exit calling evsel__tpebs_close and taking the 174 * tpebs_mtx. Avoid a segfault by ignoring samples in this case. 175 */ 176 if (t->evsel->evlist == NULL) 177 return true; 178 179 workload_pid = t->evsel->evlist->workload.pid; 180 if (workload_pid < 0 || workload_pid == sample_pid) 181 return false; 182 183 if (!t->evsel->core.attr.inherit) 184 return true; 185 186 return !is_child_pid(workload_pid, sample_pid); 187 } 188 189 static int process_sample_event(const struct perf_tool *tool __maybe_unused, 190 union perf_event *event __maybe_unused, 191 struct perf_sample *sample, 192 struct machine *machine __maybe_unused) 193 { 194 struct tpebs_retire_lat *t; 195 196 mutex_lock(tpebs_mtx_get()); 197 if (tpebs_cmd.pid == 0) { 198 /* Record has terminated. */ 199 mutex_unlock(tpebs_mtx_get()); 200 return 0; 201 } 202 t = tpebs_retire_lat__find(sample->evsel); 203 if (!t) { 204 mutex_unlock(tpebs_mtx_get()); 205 return -EINVAL; 206 } 207 if (should_ignore_sample(sample, t)) { 208 mutex_unlock(tpebs_mtx_get()); 209 return 0; 210 } 211 /* 212 * Need to handle per core results? We are assuming average retire 213 * latency value will be used. Save the number of samples and the sum of 214 * retire latency value for each event. 215 */ 216 t->last = sample->weight3; 217 update_stats(&t->stats, sample->weight3); 218 mutex_unlock(tpebs_mtx_get()); 219 return 0; 220 } 221 222 static void *__sample_reader(void *arg __maybe_unused) 223 { 224 struct perf_session *session; 225 struct perf_data data = { 226 .mode = PERF_DATA_MODE_READ, 227 .path = PERF_DATA, 228 .file.fd = tpebs_cmd.out, 229 }; 230 struct perf_tool tool; 231 232 perf_tool__init(&tool, /*ordered_events=*/false); 233 tool.sample = process_sample_event; 234 tool.feature = perf_event__process_feature; 235 tool.attr = perf_event__process_attr; 236 237 session = perf_session__new(&data, &tool); 238 if (IS_ERR(session)) 239 return NULL; 240 perf_session__process_events(session); 241 perf_session__delete(session); 242 243 return NULL; 244 } 245 246 static int tpebs_send_record_cmd(const char *msg) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get()) 247 { 248 struct pollfd pollfd = { .events = POLLIN, }; 249 int ret, len, retries = 0; 250 char ack_buf[8]; 251 252 /* Check if the command exited before the send, done with the lock held. */ 253 if (tpebs_cmd.pid == 0) 254 return 0; 255 256 /* 257 * Let go of the lock while sending/receiving as blocking can starve the 258 * sample reading thread. 259 */ 260 mutex_unlock(tpebs_mtx_get()); 261 262 /* Send perf record command.*/ 263 len = strlen(msg); 264 ret = write(control_fd[1], msg, len); 265 if (ret != len) { 266 pr_err("perf record control write control message '%s' failed\n", msg); 267 ret = -EPIPE; 268 goto out; 269 } 270 271 if (!strcmp(msg, EVLIST_CTL_CMD_STOP_TAG)) { 272 ret = 0; 273 goto out; 274 } 275 276 /* Wait for an ack. */ 277 pollfd.fd = ack_fd[0]; 278 279 /* 280 * We need this poll to ensure the ack_fd PIPE will not hang 281 * when perf record failed for any reason. The timeout value 282 * 3000ms is an empirical selection. 283 */ 284 again: 285 if (!poll(&pollfd, 1, 500)) { 286 if (check_if_command_finished(&tpebs_cmd)) { 287 ret = 0; 288 goto out; 289 } 290 291 if (retries++ < 6) 292 goto again; 293 pr_err("tpebs failed: perf record ack timeout for '%s'\n", msg); 294 ret = -ETIMEDOUT; 295 goto out; 296 } 297 298 if (!(pollfd.revents & POLLIN)) { 299 if (check_if_command_finished(&tpebs_cmd)) { 300 ret = 0; 301 goto out; 302 } 303 304 pr_err("tpebs failed: did not received an ack for '%s'\n", msg); 305 ret = -EPIPE; 306 goto out; 307 } 308 309 ret = read(ack_fd[0], ack_buf, sizeof(ack_buf)); 310 if (ret > 0) 311 ret = strcmp(ack_buf, EVLIST_CTL_CMD_ACK_TAG); 312 else 313 pr_err("tpebs: perf record control ack failed\n"); 314 out: 315 /* Re-take lock as expected by caller. */ 316 mutex_lock(tpebs_mtx_get()); 317 return ret; 318 } 319 320 /* 321 * tpebs_stop - stop the sample data read thread and the perf record process. 322 */ 323 static int tpebs_stop(void) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get()) 324 { 325 int ret = 0; 326 327 if (tpebs_stopping) 328 return 0; 329 330 /* Like tpebs_start, we should only run tpebs_end once. */ 331 if (tpebs_cmd.pid != 0) { 332 pid_t actual_pid = tpebs_cmd.pid; 333 334 tpebs_stopping = true; 335 tpebs_send_record_cmd(EVLIST_CTL_CMD_STOP_TAG); 336 tpebs_cmd.pid = 0; 337 mutex_unlock(tpebs_mtx_get()); 338 pthread_join(tpebs_reader_thread, NULL); 339 mutex_lock(tpebs_mtx_get()); 340 if (control_fd[0] >= 0) { 341 close(control_fd[0]); 342 control_fd[0] = -1; 343 } 344 if (control_fd[1] >= 0) { 345 close(control_fd[1]); 346 control_fd[1] = -1; 347 } 348 if (ack_fd[0] >= 0) { 349 close(ack_fd[0]); 350 ack_fd[0] = -1; 351 } 352 if (ack_fd[1] >= 0) { 353 close(ack_fd[1]); 354 ack_fd[1] = -1; 355 } 356 if (tpebs_cmd.out >= 0) { 357 close(tpebs_cmd.out); 358 tpebs_cmd.out = -1; 359 } 360 tpebs_cmd.pid = actual_pid; 361 ret = finish_command(&tpebs_cmd); 362 tpebs_cmd.pid = 0; 363 tpebs_stopping = false; 364 if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL) 365 ret = 0; 366 } 367 return ret; 368 } 369 370 /** 371 * evsel__tpebs_event() - Create string event encoding to pass to `perf record`. 372 */ 373 static int evsel__tpebs_event(struct evsel *evsel, char **event) 374 { 375 char *name, *modifier; 376 int ret; 377 378 name = strdup(evsel->name); 379 if (!name) 380 return -ENOMEM; 381 382 modifier = strrchr(name, 'R'); 383 if (!modifier) { 384 ret = -EINVAL; 385 goto out; 386 } 387 *modifier = 'p'; 388 modifier = strchr(name, ':'); 389 if (!modifier) 390 modifier = strrchr(name, '/'); 391 if (!modifier) { 392 ret = -EINVAL; 393 goto out; 394 } 395 *modifier = '\0'; 396 if (asprintf(event, "%s/name=tpebs_event_%p/%s", name, evsel, modifier + 1) > 0) 397 ret = 0; 398 else 399 ret = -ENOMEM; 400 out: 401 if (ret) 402 pr_err("Tpebs event modifier broken '%s'\n", evsel->name); 403 free(name); 404 return ret; 405 } 406 407 static struct tpebs_retire_lat *tpebs_retire_lat__new(struct evsel *evsel) 408 { 409 struct tpebs_retire_lat *result = zalloc(sizeof(*result)); 410 int ret; 411 412 if (!result) 413 return NULL; 414 415 ret = evsel__tpebs_event(evsel, &result->event); 416 if (ret) { 417 free(result); 418 return NULL; 419 } 420 result->evsel = evsel; 421 return result; 422 } 423 424 static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r) 425 { 426 zfree(&r->event); 427 free(r); 428 } 429 430 static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel) 431 { 432 struct tpebs_retire_lat *t; 433 unsigned long num; 434 const char *evsel_name; 435 436 /* 437 * Evsels will match for evlist with the retirement latency event. The 438 * name with "tpebs_event_" prefix will be present on events being read 439 * from `perf record`. 440 */ 441 if (evsel__is_retire_lat(evsel)) { 442 list_for_each_entry(t, &tpebs_results, nd) { 443 if (t->evsel == evsel) 444 return t; 445 } 446 return NULL; 447 } 448 evsel_name = strstr(evsel->name, "tpebs_event_"); 449 if (!evsel_name) { 450 /* Unexpected that the perf record should have other events. */ 451 return NULL; 452 } 453 errno = 0; 454 num = strtoull(evsel_name + 12, NULL, 16); 455 if (errno) { 456 pr_err("Bad evsel for tpebs find '%s'\n", evsel->name); 457 return NULL; 458 } 459 list_for_each_entry(t, &tpebs_results, nd) { 460 if ((unsigned long)t->evsel == num) 461 return t; 462 } 463 return NULL; 464 } 465 466 /** 467 * evsel__tpebs_prepare - create tpebs data structures ready for opening. 468 * @evsel: retire_latency evsel, all evsels on its list will be prepared. 469 */ 470 static int evsel__tpebs_prepare(struct evsel *evsel) 471 { 472 struct evsel *pos; 473 struct tpebs_retire_lat *tpebs_event; 474 475 mutex_lock(tpebs_mtx_get()); 476 tpebs_event = tpebs_retire_lat__find(evsel); 477 if (tpebs_event) { 478 /* evsel, or an identically named one, was already prepared. */ 479 mutex_unlock(tpebs_mtx_get()); 480 return 0; 481 } 482 tpebs_event = tpebs_retire_lat__new(evsel); 483 if (!tpebs_event) { 484 mutex_unlock(tpebs_mtx_get()); 485 return -ENOMEM; 486 } 487 list_add_tail(&tpebs_event->nd, &tpebs_results); 488 mutex_unlock(tpebs_mtx_get()); 489 490 /* 491 * Eagerly prepare all other evsels on the list to try to ensure that by 492 * open they are all known. 493 */ 494 evlist__for_each_entry(evsel->evlist, pos) { 495 int ret; 496 497 if (pos == evsel || !pos->retire_lat) 498 continue; 499 500 ret = evsel__tpebs_prepare(pos); 501 if (ret) 502 return ret; 503 } 504 return 0; 505 } 506 507 /** 508 * evsel__tpebs_open - starts tpebs execution. 509 * @evsel: retire_latency evsel, all evsels on its list will be selected. Each 510 * evsel is sampled to get the average retire_latency value. 511 */ 512 int evsel__tpebs_open(struct evsel *evsel) 513 { 514 int ret; 515 bool tpebs_empty; 516 bool started_process = false; 517 518 /* We should only run tpebs_start when tpebs_recording is enabled. */ 519 if (!tpebs_recording) 520 return 0; 521 522 mutex_lock(tpebs_mtx_get()); 523 if (tpebs_stopping) { 524 mutex_unlock(tpebs_mtx_get()); 525 return -EBUSY; 526 } 527 /* Only start the events once. */ 528 if (tpebs_cmd.pid != 0) { 529 struct tpebs_retire_lat *t; 530 bool valid; 531 532 t = tpebs_retire_lat__find(evsel); 533 valid = t && t->started; 534 mutex_unlock(tpebs_mtx_get()); 535 /* May fail as the event wasn't started. */ 536 return valid ? 0 : -EBUSY; 537 } 538 mutex_unlock(tpebs_mtx_get()); 539 540 ret = evsel__tpebs_prepare(evsel); 541 if (ret) 542 return ret; 543 544 mutex_lock(tpebs_mtx_get()); 545 if (tpebs_stopping || tpebs_cmd.pid != 0) { 546 ret = -EBUSY; 547 goto out; 548 } 549 tpebs_empty = list_empty(&tpebs_results); 550 if (!tpebs_empty) { 551 started_process = true; 552 /*Create control and ack fd for --control*/ 553 if (pipe(control_fd) < 0) { 554 pr_err("tpebs: Failed to create control fifo"); 555 ret = -1; 556 goto out; 557 } 558 if (pipe(ack_fd) < 0) { 559 pr_err("tpebs: Failed to create control fifo"); 560 ret = -1; 561 goto out; 562 } 563 564 ret = evsel__tpebs_start_perf_record(evsel); 565 if (ret) 566 goto out; 567 568 if (pthread_create(&tpebs_reader_thread, /*attr=*/NULL, __sample_reader, 569 /*arg=*/NULL)) { 570 kill(tpebs_cmd.pid, SIGTERM); 571 pr_err("Could not create thread to process sample data.\n"); 572 ret = -1; 573 goto out; 574 } 575 ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_ENABLE_TAG); 576 } 577 out: 578 if (ret) { 579 struct tpebs_retire_lat *t = tpebs_retire_lat__find(evsel); 580 581 if (t) { 582 list_del_init(&t->nd); 583 tpebs_retire_lat__delete(t); 584 } 585 586 if (started_process) { 587 if (tpebs_cmd.pid > 0) { 588 kill(tpebs_cmd.pid, SIGTERM); 589 finish_command(&tpebs_cmd); 590 tpebs_cmd.pid = 0; 591 } 592 if (tpebs_cmd.out >= 0) { 593 close(tpebs_cmd.out); 594 tpebs_cmd.out = -1; 595 } 596 if (control_fd[0] >= 0) { 597 close(control_fd[0]); 598 control_fd[0] = -1; 599 } 600 if (control_fd[1] >= 0) { 601 close(control_fd[1]); 602 control_fd[1] = -1; 603 } 604 if (ack_fd[0] >= 0) { 605 close(ack_fd[0]); 606 ack_fd[0] = -1; 607 } 608 if (ack_fd[1] >= 0) { 609 close(ack_fd[1]); 610 ack_fd[1] = -1; 611 } 612 } 613 } 614 mutex_unlock(tpebs_mtx_get()); 615 return ret; 616 } 617 618 int evsel__tpebs_read(struct evsel *evsel, int cpu_map_idx, int thread) 619 { 620 struct perf_counts_values *count, *old_count = NULL; 621 struct tpebs_retire_lat *t; 622 uint64_t val; 623 int ret; 624 625 /* Only set retire_latency value to the first CPU and thread. */ 626 if (cpu_map_idx != 0 || thread != 0) 627 return 0; 628 629 if (evsel->prev_raw_counts) 630 old_count = perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread); 631 632 count = perf_counts(evsel->counts, cpu_map_idx, thread); 633 634 mutex_lock(tpebs_mtx_get()); 635 t = tpebs_retire_lat__find(evsel); 636 /* 637 * If reading the first tpebs result, send a ping to the record 638 * process. Allow the sample reader a chance to read by releasing and 639 * reacquiring the lock. 640 */ 641 if (t && &t->nd == tpebs_results.next) { 642 ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_PING_TAG); 643 mutex_unlock(tpebs_mtx_get()); 644 if (ret) 645 return ret; 646 mutex_lock(tpebs_mtx_get()); 647 } 648 if (t == NULL || t->stats.n == 0) { 649 /* No sample data, use default. */ 650 if (tpebs_recording) { 651 pr_warning_once( 652 "Using precomputed retirement latency data as no samples\n"); 653 } 654 val = 0; 655 switch (tpebs_mode) { 656 case TPEBS_MODE__MIN: 657 val = rint(evsel->retirement_latency.min); 658 break; 659 case TPEBS_MODE__MAX: 660 val = rint(evsel->retirement_latency.max); 661 break; 662 default: 663 case TPEBS_MODE__LAST: 664 case TPEBS_MODE__MEAN: 665 val = rint(evsel->retirement_latency.mean); 666 break; 667 } 668 } else { 669 switch (tpebs_mode) { 670 case TPEBS_MODE__MIN: 671 val = t->stats.min; 672 break; 673 case TPEBS_MODE__MAX: 674 val = t->stats.max; 675 break; 676 case TPEBS_MODE__LAST: 677 val = t->last; 678 break; 679 default: 680 case TPEBS_MODE__MEAN: 681 val = rint(t->stats.mean); 682 break; 683 } 684 } 685 mutex_unlock(tpebs_mtx_get()); 686 687 if (old_count) { 688 count->val = old_count->val + val; 689 count->run = old_count->run + 1; 690 count->ena = old_count->ena + 1; 691 } else { 692 count->val = val; 693 count->run++; 694 count->ena++; 695 } 696 return 0; 697 } 698 699 /** 700 * evsel__tpebs_close() - delete tpebs related data. If the last event, stop the 701 * created thread and process by calling tpebs_stop(). 702 * 703 * This function is called in evsel__close() to be symmetric with 704 * evsel__tpebs_open() being called in evsel__open(). 705 */ 706 void evsel__tpebs_close(struct evsel *evsel) 707 { 708 struct tpebs_retire_lat *t; 709 710 mutex_lock(tpebs_mtx_get()); 711 t = tpebs_retire_lat__find(evsel); 712 if (t) { 713 list_del_init(&t->nd); 714 tpebs_retire_lat__delete(t); 715 716 if (list_empty(&tpebs_results)) 717 tpebs_stop(); 718 } 719 mutex_unlock(tpebs_mtx_get()); 720 } 721