1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * intel_tpebs.c: Intel TPEBS support 4 */ 5 6 #include <api/fs/fs.h> 7 #include <sys/param.h> 8 #include <subcmd/run-command.h> 9 #include <thread.h> 10 #include "intel-tpebs.h" 11 #include <linux/list.h> 12 #include <linux/zalloc.h> 13 #include <linux/err.h> 14 #include "sample.h" 15 #include "counts.h" 16 #include "debug.h" 17 #include "evlist.h" 18 #include "evsel.h" 19 #include "mutex.h" 20 #include "session.h" 21 #include "stat.h" 22 #include "tool.h" 23 #include "cpumap.h" 24 #include "metricgroup.h" 25 #include "stat.h" 26 #include <sys/stat.h> 27 #include <sys/file.h> 28 #include <errno.h> 29 #include <poll.h> 30 #include <math.h> 31 32 #define PERF_DATA "-" 33 34 bool tpebs_recording; 35 enum tpebs_mode tpebs_mode; 36 static LIST_HEAD(tpebs_results); 37 static pthread_t tpebs_reader_thread; 38 static struct child_process tpebs_cmd; 39 static int control_fd[2], ack_fd[2]; 40 static struct mutex tpebs_mtx; 41 42 struct tpebs_retire_lat { 43 struct list_head nd; 44 /** @evsel: The evsel that opened the retire_lat event. */ 45 struct evsel *evsel; 46 /** @event: Event passed to perf record. */ 47 char *event; 48 /** @stats: Recorded retirement latency stats. */ 49 struct stats stats; 50 /** @last: Last retirement latency read. */ 51 uint64_t last; 52 /* Has the event been sent to perf record? */ 53 bool started; 54 }; 55 56 static void tpebs_mtx_init(void) 57 { 58 mutex_init(&tpebs_mtx); 59 } 60 61 static struct mutex *tpebs_mtx_get(void) 62 { 63 static pthread_once_t tpebs_mtx_once = PTHREAD_ONCE_INIT; 64 65 pthread_once(&tpebs_mtx_once, tpebs_mtx_init); 66 return &tpebs_mtx; 67 } 68 69 static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel) 70 EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get()); 71 72 static int evsel__tpebs_start_perf_record(struct evsel *evsel) 73 { 74 const char **record_argv; 75 int tpebs_event_size = 0, i = 0, ret; 76 char control_fd_buf[32]; 77 char cpumap_buf[50]; 78 struct tpebs_retire_lat *t; 79 80 list_for_each_entry(t, &tpebs_results, nd) 81 tpebs_event_size++; 82 83 record_argv = malloc((10 + 2 * tpebs_event_size) * sizeof(*record_argv)); 84 if (!record_argv) 85 return -ENOMEM; 86 87 record_argv[i++] = "perf"; 88 record_argv[i++] = "record"; 89 record_argv[i++] = "-W"; 90 record_argv[i++] = "--synth=no"; 91 92 scnprintf(control_fd_buf, sizeof(control_fd_buf), "--control=fd:%d,%d", 93 control_fd[0], ack_fd[1]); 94 record_argv[i++] = control_fd_buf; 95 96 record_argv[i++] = "-o"; 97 record_argv[i++] = PERF_DATA; 98 99 if (!perf_cpu_map__is_any_cpu_or_is_empty(evsel->evlist->core.user_requested_cpus)) { 100 cpu_map__snprint(evsel->evlist->core.user_requested_cpus, cpumap_buf, 101 sizeof(cpumap_buf)); 102 record_argv[i++] = "-C"; 103 record_argv[i++] = cpumap_buf; 104 } 105 106 list_for_each_entry(t, &tpebs_results, nd) { 107 record_argv[i++] = "-e"; 108 record_argv[i++] = t->event; 109 } 110 record_argv[i++] = NULL; 111 assert(i == 10 + 2 * tpebs_event_size || i == 8 + 2 * tpebs_event_size); 112 /* Note, no workload given so system wide is implied. */ 113 114 assert(tpebs_cmd.pid == 0); 115 tpebs_cmd.argv = record_argv; 116 tpebs_cmd.out = -1; 117 ret = start_command(&tpebs_cmd); 118 zfree(&tpebs_cmd.argv); 119 list_for_each_entry(t, &tpebs_results, nd) 120 t->started = true; 121 122 return ret; 123 } 124 125 static bool is_child_pid(pid_t parent, pid_t child) 126 { 127 if (parent < 0 || child < 0) 128 return false; 129 130 while (true) { 131 char path[PATH_MAX]; 132 char line[256]; 133 FILE *fp; 134 135 new_child: 136 if (parent == child) 137 return true; 138 139 if (child <= 0) 140 return false; 141 142 scnprintf(path, sizeof(path), "%s/%d/status", procfs__mountpoint(), child); 143 fp = fopen(path, "r"); 144 if (!fp) { 145 /* Presumably the process went away. Assume not a child. */ 146 return false; 147 } 148 while (fgets(line, sizeof(line), fp) != NULL) { 149 if (strncmp(line, "PPid:", 5) == 0) { 150 fclose(fp); 151 if (sscanf(line + 5, "%d", &child) != 1) { 152 /* Unexpected error parsing. */ 153 return false; 154 } 155 goto new_child; 156 } 157 } 158 /* Unexpected EOF. */ 159 fclose(fp); 160 return false; 161 } 162 } 163 164 static bool should_ignore_sample(const struct perf_sample *sample, const struct tpebs_retire_lat *t) 165 { 166 pid_t workload_pid, sample_pid = sample->pid; 167 168 /* 169 * During evlist__purge the evlist will be removed prior to the 170 * evsel__exit calling evsel__tpebs_close and taking the 171 * tpebs_mtx. Avoid a segfault by ignoring samples in this case. 172 */ 173 if (t->evsel->evlist == NULL) 174 return true; 175 176 workload_pid = t->evsel->evlist->workload.pid; 177 if (workload_pid < 0 || workload_pid == sample_pid) 178 return false; 179 180 if (!t->evsel->core.attr.inherit) 181 return true; 182 183 return !is_child_pid(workload_pid, sample_pid); 184 } 185 186 static int process_sample_event(const struct perf_tool *tool __maybe_unused, 187 union perf_event *event __maybe_unused, 188 struct perf_sample *sample, 189 struct evsel *evsel, 190 struct machine *machine __maybe_unused) 191 { 192 struct tpebs_retire_lat *t; 193 194 mutex_lock(tpebs_mtx_get()); 195 if (tpebs_cmd.pid == 0) { 196 /* Record has terminated. */ 197 mutex_unlock(tpebs_mtx_get()); 198 return 0; 199 } 200 t = tpebs_retire_lat__find(evsel); 201 if (!t) { 202 mutex_unlock(tpebs_mtx_get()); 203 return -EINVAL; 204 } 205 if (should_ignore_sample(sample, t)) { 206 mutex_unlock(tpebs_mtx_get()); 207 return 0; 208 } 209 /* 210 * Need to handle per core results? We are assuming average retire 211 * latency value will be used. Save the number of samples and the sum of 212 * retire latency value for each event. 213 */ 214 t->last = sample->weight3; 215 update_stats(&t->stats, sample->weight3); 216 mutex_unlock(tpebs_mtx_get()); 217 return 0; 218 } 219 220 static int process_feature_event(const struct perf_tool *tool __maybe_unused, 221 struct perf_session *session, 222 union perf_event *event) 223 { 224 if (event->feat.feat_id < HEADER_LAST_FEATURE) 225 return perf_event__process_feature(session, event); 226 return 0; 227 } 228 229 static void *__sample_reader(void *arg __maybe_unused) 230 { 231 struct perf_session *session; 232 struct perf_data data = { 233 .mode = PERF_DATA_MODE_READ, 234 .path = PERF_DATA, 235 .file.fd = tpebs_cmd.out, 236 }; 237 struct perf_tool tool; 238 239 perf_tool__init(&tool, /*ordered_events=*/false); 240 tool.sample = process_sample_event; 241 tool.feature = process_feature_event; 242 tool.attr = perf_event__process_attr; 243 244 session = perf_session__new(&data, &tool); 245 if (IS_ERR(session)) 246 return NULL; 247 perf_session__process_events(session); 248 perf_session__delete(session); 249 250 return NULL; 251 } 252 253 static int tpebs_send_record_cmd(const char *msg) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get()) 254 { 255 struct pollfd pollfd = { .events = POLLIN, }; 256 int ret, len, retries = 0; 257 char ack_buf[8]; 258 259 /* Check if the command exited before the send, done with the lock held. */ 260 if (tpebs_cmd.pid == 0) 261 return 0; 262 263 /* 264 * Let go of the lock while sending/receiving as blocking can starve the 265 * sample reading thread. 266 */ 267 mutex_unlock(tpebs_mtx_get()); 268 269 /* Send perf record command.*/ 270 len = strlen(msg); 271 ret = write(control_fd[1], msg, len); 272 if (ret != len) { 273 pr_err("perf record control write control message '%s' failed\n", msg); 274 ret = -EPIPE; 275 goto out; 276 } 277 278 if (!strcmp(msg, EVLIST_CTL_CMD_STOP_TAG)) { 279 ret = 0; 280 goto out; 281 } 282 283 /* Wait for an ack. */ 284 pollfd.fd = ack_fd[0]; 285 286 /* 287 * We need this poll to ensure the ack_fd PIPE will not hang 288 * when perf record failed for any reason. The timeout value 289 * 3000ms is an empirical selection. 290 */ 291 again: 292 if (!poll(&pollfd, 1, 500)) { 293 if (check_if_command_finished(&tpebs_cmd)) { 294 ret = 0; 295 goto out; 296 } 297 298 if (retries++ < 6) 299 goto again; 300 pr_err("tpebs failed: perf record ack timeout for '%s'\n", msg); 301 ret = -ETIMEDOUT; 302 goto out; 303 } 304 305 if (!(pollfd.revents & POLLIN)) { 306 if (check_if_command_finished(&tpebs_cmd)) { 307 ret = 0; 308 goto out; 309 } 310 311 pr_err("tpebs failed: did not received an ack for '%s'\n", msg); 312 ret = -EPIPE; 313 goto out; 314 } 315 316 ret = read(ack_fd[0], ack_buf, sizeof(ack_buf)); 317 if (ret > 0) 318 ret = strcmp(ack_buf, EVLIST_CTL_CMD_ACK_TAG); 319 else 320 pr_err("tpebs: perf record control ack failed\n"); 321 out: 322 /* Re-take lock as expected by caller. */ 323 mutex_lock(tpebs_mtx_get()); 324 return ret; 325 } 326 327 /* 328 * tpebs_stop - stop the sample data read thread and the perf record process. 329 */ 330 static int tpebs_stop(void) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get()) 331 { 332 int ret = 0; 333 334 /* Like tpebs_start, we should only run tpebs_end once. */ 335 if (tpebs_cmd.pid != 0) { 336 tpebs_send_record_cmd(EVLIST_CTL_CMD_STOP_TAG); 337 tpebs_cmd.pid = 0; 338 mutex_unlock(tpebs_mtx_get()); 339 pthread_join(tpebs_reader_thread, NULL); 340 mutex_lock(tpebs_mtx_get()); 341 close(control_fd[0]); 342 close(control_fd[1]); 343 close(ack_fd[0]); 344 close(ack_fd[1]); 345 close(tpebs_cmd.out); 346 ret = finish_command(&tpebs_cmd); 347 tpebs_cmd.pid = 0; 348 if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL) 349 ret = 0; 350 } 351 return ret; 352 } 353 354 /** 355 * evsel__tpebs_event() - Create string event encoding to pass to `perf record`. 356 */ 357 static int evsel__tpebs_event(struct evsel *evsel, char **event) 358 { 359 char *name, *modifier; 360 int ret; 361 362 name = strdup(evsel->name); 363 if (!name) 364 return -ENOMEM; 365 366 modifier = strrchr(name, 'R'); 367 if (!modifier) { 368 ret = -EINVAL; 369 goto out; 370 } 371 *modifier = 'p'; 372 modifier = strchr(name, ':'); 373 if (!modifier) 374 modifier = strrchr(name, '/'); 375 if (!modifier) { 376 ret = -EINVAL; 377 goto out; 378 } 379 *modifier = '\0'; 380 if (asprintf(event, "%s/name=tpebs_event_%p/%s", name, evsel, modifier + 1) > 0) 381 ret = 0; 382 else 383 ret = -ENOMEM; 384 out: 385 if (ret) 386 pr_err("Tpebs event modifier broken '%s'\n", evsel->name); 387 free(name); 388 return ret; 389 } 390 391 static struct tpebs_retire_lat *tpebs_retire_lat__new(struct evsel *evsel) 392 { 393 struct tpebs_retire_lat *result = zalloc(sizeof(*result)); 394 int ret; 395 396 if (!result) 397 return NULL; 398 399 ret = evsel__tpebs_event(evsel, &result->event); 400 if (ret) { 401 free(result); 402 return NULL; 403 } 404 result->evsel = evsel; 405 return result; 406 } 407 408 static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r) 409 { 410 zfree(&r->event); 411 free(r); 412 } 413 414 static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel) 415 { 416 struct tpebs_retire_lat *t; 417 unsigned long num; 418 const char *evsel_name; 419 420 /* 421 * Evsels will match for evlist with the retirement latency event. The 422 * name with "tpebs_event_" prefix will be present on events being read 423 * from `perf record`. 424 */ 425 if (evsel__is_retire_lat(evsel)) { 426 list_for_each_entry(t, &tpebs_results, nd) { 427 if (t->evsel == evsel) 428 return t; 429 } 430 return NULL; 431 } 432 evsel_name = strstr(evsel->name, "tpebs_event_"); 433 if (!evsel_name) { 434 /* Unexpected that the perf record should have other events. */ 435 return NULL; 436 } 437 errno = 0; 438 num = strtoull(evsel_name + 12, NULL, 16); 439 if (errno) { 440 pr_err("Bad evsel for tpebs find '%s'\n", evsel->name); 441 return NULL; 442 } 443 list_for_each_entry(t, &tpebs_results, nd) { 444 if ((unsigned long)t->evsel == num) 445 return t; 446 } 447 return NULL; 448 } 449 450 /** 451 * evsel__tpebs_prepare - create tpebs data structures ready for opening. 452 * @evsel: retire_latency evsel, all evsels on its list will be prepared. 453 */ 454 static int evsel__tpebs_prepare(struct evsel *evsel) 455 { 456 struct evsel *pos; 457 struct tpebs_retire_lat *tpebs_event; 458 459 mutex_lock(tpebs_mtx_get()); 460 tpebs_event = tpebs_retire_lat__find(evsel); 461 if (tpebs_event) { 462 /* evsel, or an identically named one, was already prepared. */ 463 mutex_unlock(tpebs_mtx_get()); 464 return 0; 465 } 466 tpebs_event = tpebs_retire_lat__new(evsel); 467 if (!tpebs_event) { 468 mutex_unlock(tpebs_mtx_get()); 469 return -ENOMEM; 470 } 471 list_add_tail(&tpebs_event->nd, &tpebs_results); 472 mutex_unlock(tpebs_mtx_get()); 473 474 /* 475 * Eagerly prepare all other evsels on the list to try to ensure that by 476 * open they are all known. 477 */ 478 evlist__for_each_entry(evsel->evlist, pos) { 479 int ret; 480 481 if (pos == evsel || !pos->retire_lat) 482 continue; 483 484 ret = evsel__tpebs_prepare(pos); 485 if (ret) 486 return ret; 487 } 488 return 0; 489 } 490 491 /** 492 * evsel__tpebs_open - starts tpebs execution. 493 * @evsel: retire_latency evsel, all evsels on its list will be selected. Each 494 * evsel is sampled to get the average retire_latency value. 495 */ 496 int evsel__tpebs_open(struct evsel *evsel) 497 { 498 int ret; 499 bool tpebs_empty; 500 501 /* We should only run tpebs_start when tpebs_recording is enabled. */ 502 if (!tpebs_recording) 503 return 0; 504 /* Only start the events once. */ 505 if (tpebs_cmd.pid != 0) { 506 struct tpebs_retire_lat *t; 507 bool valid; 508 509 mutex_lock(tpebs_mtx_get()); 510 t = tpebs_retire_lat__find(evsel); 511 valid = t && t->started; 512 mutex_unlock(tpebs_mtx_get()); 513 /* May fail as the event wasn't started. */ 514 return valid ? 0 : -EBUSY; 515 } 516 517 ret = evsel__tpebs_prepare(evsel); 518 if (ret) 519 return ret; 520 521 mutex_lock(tpebs_mtx_get()); 522 tpebs_empty = list_empty(&tpebs_results); 523 if (!tpebs_empty) { 524 /*Create control and ack fd for --control*/ 525 if (pipe(control_fd) < 0) { 526 pr_err("tpebs: Failed to create control fifo"); 527 ret = -1; 528 goto out; 529 } 530 if (pipe(ack_fd) < 0) { 531 pr_err("tpebs: Failed to create control fifo"); 532 ret = -1; 533 goto out; 534 } 535 536 ret = evsel__tpebs_start_perf_record(evsel); 537 if (ret) 538 goto out; 539 540 if (pthread_create(&tpebs_reader_thread, /*attr=*/NULL, __sample_reader, 541 /*arg=*/NULL)) { 542 kill(tpebs_cmd.pid, SIGTERM); 543 close(tpebs_cmd.out); 544 pr_err("Could not create thread to process sample data.\n"); 545 ret = -1; 546 goto out; 547 } 548 ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_ENABLE_TAG); 549 } 550 out: 551 if (ret) { 552 struct tpebs_retire_lat *t = tpebs_retire_lat__find(evsel); 553 554 list_del_init(&t->nd); 555 tpebs_retire_lat__delete(t); 556 } 557 mutex_unlock(tpebs_mtx_get()); 558 return ret; 559 } 560 561 int evsel__tpebs_read(struct evsel *evsel, int cpu_map_idx, int thread) 562 { 563 struct perf_counts_values *count, *old_count = NULL; 564 struct tpebs_retire_lat *t; 565 uint64_t val; 566 int ret; 567 568 /* Only set retire_latency value to the first CPU and thread. */ 569 if (cpu_map_idx != 0 || thread != 0) 570 return 0; 571 572 if (evsel->prev_raw_counts) 573 old_count = perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread); 574 575 count = perf_counts(evsel->counts, cpu_map_idx, thread); 576 577 mutex_lock(tpebs_mtx_get()); 578 t = tpebs_retire_lat__find(evsel); 579 /* 580 * If reading the first tpebs result, send a ping to the record 581 * process. Allow the sample reader a chance to read by releasing and 582 * reacquiring the lock. 583 */ 584 if (t && &t->nd == tpebs_results.next) { 585 ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_PING_TAG); 586 mutex_unlock(tpebs_mtx_get()); 587 if (ret) 588 return ret; 589 mutex_lock(tpebs_mtx_get()); 590 } 591 if (t == NULL || t->stats.n == 0) { 592 /* No sample data, use default. */ 593 if (tpebs_recording) { 594 pr_warning_once( 595 "Using precomputed retirement latency data as no samples\n"); 596 } 597 val = 0; 598 switch (tpebs_mode) { 599 case TPEBS_MODE__MIN: 600 val = rint(evsel->retirement_latency.min); 601 break; 602 case TPEBS_MODE__MAX: 603 val = rint(evsel->retirement_latency.max); 604 break; 605 default: 606 case TPEBS_MODE__LAST: 607 case TPEBS_MODE__MEAN: 608 val = rint(evsel->retirement_latency.mean); 609 break; 610 } 611 } else { 612 switch (tpebs_mode) { 613 case TPEBS_MODE__MIN: 614 val = t->stats.min; 615 break; 616 case TPEBS_MODE__MAX: 617 val = t->stats.max; 618 break; 619 case TPEBS_MODE__LAST: 620 val = t->last; 621 break; 622 default: 623 case TPEBS_MODE__MEAN: 624 val = rint(t->stats.mean); 625 break; 626 } 627 } 628 mutex_unlock(tpebs_mtx_get()); 629 630 if (old_count) { 631 count->val = old_count->val + val; 632 count->run = old_count->run + 1; 633 count->ena = old_count->ena + 1; 634 } else { 635 count->val = val; 636 count->run++; 637 count->ena++; 638 } 639 return 0; 640 } 641 642 /** 643 * evsel__tpebs_close() - delete tpebs related data. If the last event, stop the 644 * created thread and process by calling tpebs_stop(). 645 * 646 * This function is called in evsel__close() to be symmetric with 647 * evsel__tpebs_open() being called in evsel__open(). 648 */ 649 void evsel__tpebs_close(struct evsel *evsel) 650 { 651 struct tpebs_retire_lat *t; 652 653 mutex_lock(tpebs_mtx_get()); 654 t = tpebs_retire_lat__find(evsel); 655 if (t) { 656 list_del_init(&t->nd); 657 tpebs_retire_lat__delete(t); 658 659 if (list_empty(&tpebs_results)) 660 tpebs_stop(); 661 } 662 mutex_unlock(tpebs_mtx_get()); 663 } 664