// SPDX-License-Identifier: GPL-2.0-only
/*
 * intel_tpebs.c: Intel TPEBS support
 *
 * Retirement-latency ("retire_lat") values are gathered by forking a
 * `perf record` child process that samples the corresponding precise
 * events, and by parsing its perf.data stream on a dedicated reader
 * thread. Results are accumulated per event in tpebs_results.
 */

#include <api/fs/fs.h>
#include <sys/param.h>
#include <subcmd/run-command.h>
#include <thread.h>
#include "intel-tpebs.h"
#include <linux/list.h>
#include <linux/zalloc.h>
#include <linux/err.h>
#include "sample.h"
#include "counts.h"
#include "debug.h"
#include "evlist.h"
#include "evsel.h"
#include "mutex.h"
#include "session.h"
#include "stat.h"
#include "tool.h"
#include "cpumap.h"
#include "metricgroup.h"
#include "stat.h"
#include <sys/stat.h>
#include <sys/file.h>
#include <poll.h>
#include <math.h>

/* "-" makes the child `perf record` write its perf.data to stdout (a pipe). */
#define PERF_DATA "-"

bool tpebs_recording;
enum tpebs_mode tpebs_mode;
/* Results list, one node per retire_lat event; guarded by tpebs_mtx. */
static LIST_HEAD(tpebs_results);
/* Thread running __sample_reader() over the record child's output pipe. */
static pthread_t tpebs_reader_thread;
/* The forked `perf record` child; pid == 0 means not running. */
static struct child_process tpebs_cmd;
/* Pipes wired to `perf record --control=fd:...` for commands and acks. */
static int control_fd[2], ack_fd[2];
static struct mutex tpebs_mtx;

struct tpebs_retire_lat {
	struct list_head nd;
	/** @evsel: The evsel that opened the retire_lat event. */
	struct evsel *evsel;
	/** @event: Event passed to perf record. */
	char *event;
	/** @stats: Recorded retirement latency stats. */
	struct stats stats;
	/** @last: Last retirement latency read. */
	uint64_t last;
	/* Has the event been sent to perf record? */
	bool started;
};

static void tpebs_mtx_init(void)
{
	mutex_init(&tpebs_mtx);
}

/*
 * Return the process-wide mutex guarding tpebs_results/tpebs_cmd,
 * lazily initializing it exactly once via pthread_once().
 */
static struct mutex *tpebs_mtx_get(void)
{
	static pthread_once_t tpebs_mtx_once = PTHREAD_ONCE_INIT;

	pthread_once(&tpebs_mtx_once, tpebs_mtx_init);
	return &tpebs_mtx;
}

static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel)
	EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get());

/*
 * Build the argv for a `perf record` child sampling every event on
 * tpebs_results and start it, its output going to the tpebs_cmd.out pipe.
 * Caller is expected to hold tpebs_mtx (the list is walked unlocked here
 * only because all callers serialize on it).
 */
static int evsel__tpebs_start_perf_record(struct evsel *evsel)
{
	const char **record_argv;
	int tpebs_event_size = 0, i = 0, ret;
	char control_fd_buf[32];
	char cpumap_buf[50];
	struct tpebs_retire_lat *t;

	list_for_each_entry(t, &tpebs_results, nd)
		tpebs_event_size++;

	/* 10 fixed slots (incl. optional -C + mask and the NULL) + "-e EVENT" pairs. */
	record_argv = malloc((10 + 2 * tpebs_event_size) * sizeof(*record_argv));
	if (!record_argv)
		return -ENOMEM;

	record_argv[i++] = "perf";
	record_argv[i++] = "record";
	record_argv[i++] = "-W";
	record_argv[i++] = "--synth=no";

	/* Child reads commands from control_fd[0] and acks on ack_fd[1]. */
	scnprintf(control_fd_buf, sizeof(control_fd_buf), "--control=fd:%d,%d",
		  control_fd[0], ack_fd[1]);
	record_argv[i++] = control_fd_buf;

	record_argv[i++] = "-o";
	record_argv[i++] = PERF_DATA;

	if (!perf_cpu_map__is_any_cpu_or_is_empty(evsel->evlist->core.user_requested_cpus)) {
		cpu_map__snprint(evsel->evlist->core.user_requested_cpus, cpumap_buf,
				 sizeof(cpumap_buf));
		record_argv[i++] = "-C";
		record_argv[i++] = cpumap_buf;
	}

	list_for_each_entry(t, &tpebs_results, nd) {
		record_argv[i++] = "-e";
		record_argv[i++] = t->event;
	}
	record_argv[i++] = NULL;
	/* i is 8 + pairs when the "-C cpumap" pair was skipped. */
	assert(i == 10 + 2 * tpebs_event_size || i == 8 + 2 * tpebs_event_size);
	/* Note, no workload given so system wide is implied. */

	assert(tpebs_cmd.pid == 0);
	tpebs_cmd.argv = record_argv;
	tpebs_cmd.out = -1;
	ret = start_command(&tpebs_cmd);
	/* argv was copied by start_command; free our array (strings are borrowed). */
	zfree(&tpebs_cmd.argv);
	list_for_each_entry(t, &tpebs_results, nd)
		t->started = true;

	return ret;
}

/*
 * Is @child a descendant of @parent? Walks the PPid: chain in
 * /proc/<pid>/status until reaching @parent, pid 0/1, or an error.
 */
static bool is_child_pid(pid_t parent, pid_t child)
{
	if (parent < 0 || child < 0)
		return false;

	while (true) {
		char path[PATH_MAX];
		char line[256];
		FILE *fp;

new_child:
		if (parent == child)
			return true;

		if (child <= 0)
			return false;

		scnprintf(path, sizeof(path), "%s/%d/status", procfs__mountpoint(), child);
		fp = fopen(path, "r");
		if (!fp) {
			/* Presumably the process went away. Assume not a child. */
			return false;
		}
		while (fgets(line, sizeof(line), fp) != NULL) {
			if (strncmp(line, "PPid:", 5) == 0) {
				fclose(fp);
				if (sscanf(line + 5, "%d", &child) != 1) {
					/* Unexpected error parsing. */
					return false;
				}
				/* Continue the walk one level up the process tree. */
				goto new_child;
			}
		}
		/* Unexpected EOF. */
		fclose(fp);
		return false;
	}
}

/*
 * Filter samples that do not belong to the monitored workload (or its
 * children, when inherit is set). System-wide mode (workload_pid < 0)
 * keeps everything.
 */
static bool should_ignore_sample(const struct perf_sample *sample, const struct tpebs_retire_lat *t)
{
	pid_t workload_pid, sample_pid = sample->pid;

	/*
	 * During evlist__purge the evlist will be removed prior to the
	 * evsel__exit calling evsel__tpebs_close and taking the
	 * tpebs_mtx. Avoid a segfault by ignoring samples in this case.
	 */
	if (t->evsel->evlist == NULL)
		return true;

	workload_pid = t->evsel->evlist->workload.pid;
	if (workload_pid < 0 || workload_pid == sample_pid)
		return false;

	if (!t->evsel->core.attr.inherit)
		return true;

	return !is_child_pid(workload_pid, sample_pid);
}

/*
 * Tool callback invoked by the reader thread's session for each sample
 * from the `perf record` child: accumulate the retirement latency
 * (sample->weight3) into the matching tpebs_retire_lat.
 */
static int process_sample_event(const struct perf_tool *tool __maybe_unused,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample,
				struct evsel *evsel,
				struct machine *machine __maybe_unused)
{
	struct tpebs_retire_lat *t;

	mutex_lock(tpebs_mtx_get());
	if (tpebs_cmd.pid == 0) {
		/* Record has terminated. */
		mutex_unlock(tpebs_mtx_get());
		return 0;
	}
	t = tpebs_retire_lat__find(evsel);
	if (!t) {
		mutex_unlock(tpebs_mtx_get());
		return -EINVAL;
	}
	if (should_ignore_sample(sample, t)) {
		mutex_unlock(tpebs_mtx_get());
		return 0;
	}
	/*
	 * Need to handle per core results? We are assuming average retire
	 * latency value will be used. Save the number of samples and the sum of
	 * retire latency value for each event.
	 */
	t->last = sample->weight3;
	update_stats(&t->stats, sample->weight3);
	mutex_unlock(tpebs_mtx_get());
	return 0;
}

/* Forward only known header features; silently drop unknown ones. */
static int process_feature_event(const struct perf_tool *tool __maybe_unused,
				 struct perf_session *session,
				 union perf_event *event)
{
	if (event->feat.feat_id < HEADER_LAST_FEATURE)
		return perf_event__process_feature(session, event);
	return 0;
}

/*
 * Body of tpebs_reader_thread: consume the perf.data stream written by
 * the `perf record` child on tpebs_cmd.out until EOF (child exit).
 */
static void *__sample_reader(void *arg __maybe_unused)
{
	struct perf_session *session;
	struct perf_data data = {
		.mode = PERF_DATA_MODE_READ,
		.path = PERF_DATA,
		.file.fd = tpebs_cmd.out,
	};
	struct perf_tool tool;

	perf_tool__init(&tool, /*ordered_events=*/false);
	tool.sample = process_sample_event;
	tool.feature = process_feature_event;
	tool.attr = perf_event__process_attr;

	session = perf_session__new(&data, &tool);
	if (IS_ERR(session))
		return NULL;
	perf_session__process_events(session);
	perf_session__delete(session);

	return NULL;
}

/*
 * Send a control command (enable/ping/stop tag) to the `perf record`
 * child and, except for stop, wait for its ack. Called and returns with
 * tpebs_mtx held, but drops it for the blocking write/poll/read.
 */
static int tpebs_send_record_cmd(const char *msg) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get())
{
	struct pollfd pollfd = { .events = POLLIN, };
	int ret, len, retries = 0;
	char ack_buf[8];

	/* Check if the command exited before the send, done with the lock held. */
	if (tpebs_cmd.pid == 0)
		return 0;

	/*
	 * Let go of the lock while sending/receiving as blocking can starve the
	 * sample reading thread.
	 */
	mutex_unlock(tpebs_mtx_get());

	/* Send perf record command.*/
	len = strlen(msg);
	ret = write(control_fd[1], msg, len);
	if (ret != len) {
		pr_err("perf record control write control message '%s' failed\n", msg);
		ret = -EPIPE;
		goto out;
	}

	if (!strcmp(msg, EVLIST_CTL_CMD_STOP_TAG)) {
		/* Stop is fire-and-forget; the child is about to exit. */
		ret = 0;
		goto out;
	}

	/* Wait for an ack. */
	pollfd.fd = ack_fd[0];

	/*
	 * We need this poll to ensure the ack_fd PIPE will not hang
	 * when perf record failed for any reason. The timeout value
	 * 3000ms is an empirical selection.
	 * NOTE(review): 500ms x (1 + 6 retries) is up to ~3500ms, slightly
	 * above the 3000ms stated here — confirm which figure is intended.
	 */
again:
	if (!poll(&pollfd, 1, 500)) {
		if (check_if_command_finished(&tpebs_cmd)) {
			ret = 0;
			goto out;
		}

		if (retries++ < 6)
			goto again;
		pr_err("tpebs failed: perf record ack timeout for '%s'\n", msg);
		ret = -ETIMEDOUT;
		goto out;
	}

	if (!(pollfd.revents & POLLIN)) {
		if (check_if_command_finished(&tpebs_cmd)) {
			ret = 0;
			goto out;
		}

		pr_err("tpebs failed: did not received an ack for '%s'\n", msg);
		ret = -EPIPE;
		goto out;
	}

	/*
	 * NOTE(review): strcmp assumes the ack read is NUL-terminated within
	 * ack_buf — presumably the control protocol's ack tag includes the
	 * terminator; verify against the evlist control implementation.
	 */
	ret = read(ack_fd[0], ack_buf, sizeof(ack_buf));
	if (ret > 0)
		ret = strcmp(ack_buf, EVLIST_CTL_CMD_ACK_TAG);
	else
		pr_err("tpebs: perf record control ack failed\n");
out:
	/* Re-take lock as expected by caller. */
	mutex_lock(tpebs_mtx_get());
	return ret;
}

/*
 * tpebs_stop - stop the sample data read thread and the perf record process.
 */
static int tpebs_stop(void) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get())
{
	int ret = 0;

	/* Like tpebs_start, we should only run tpebs_end once. */
	if (tpebs_cmd.pid != 0) {
		tpebs_send_record_cmd(EVLIST_CTL_CMD_STOP_TAG);
		/* Clear pid before dropping the lock so concurrent callers see "stopped". */
		tpebs_cmd.pid = 0;
		/* Drop the lock: the reader thread takes it in process_sample_event. */
		mutex_unlock(tpebs_mtx_get());
		pthread_join(tpebs_reader_thread, NULL);
		mutex_lock(tpebs_mtx_get());
		close(control_fd[0]);
		close(control_fd[1]);
		close(ack_fd[0]);
		close(ack_fd[1]);
		close(tpebs_cmd.out);
		ret = finish_command(&tpebs_cmd);
		tpebs_cmd.pid = 0;
		/* The child is expected to die from the stop/SIGTERM; not an error. */
		if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
			ret = 0;
	}
	return ret;
}

/**
 * evsel__tpebs_event() - Create string event encoding to pass to `perf record`.
 *
 * Rewrites the retire_lat evsel's name: the trailing 'R' modifier becomes
 * 'p' (precise), and a "name=tpebs_event_<evsel pointer>" term is inserted
 * so samples read back from `perf record` can be matched to this evsel.
 * Returns 0 and sets *@event (caller frees) on success, negative errno
 * otherwise.
 */
static int evsel__tpebs_event(struct evsel *evsel, char **event)
{
	char *name, *modifier;
	int ret;

	name = strdup(evsel->name);
	if (!name)
		return -ENOMEM;

	modifier = strrchr(name, 'R');
	if (!modifier) {
		ret = -EINVAL;
		goto out;
	}
	*modifier = 'p';
	/* Split at the modifier separator (':' or the closing '/' of a PMU term). */
	modifier = strchr(name, ':');
	if (!modifier)
		modifier = strrchr(name, '/');
	if (!modifier) {
		ret = -EINVAL;
		goto out;
	}
	*modifier = '\0';
	if (asprintf(event, "%s/name=tpebs_event_%p/%s", name, evsel, modifier + 1) > 0)
		ret = 0;
	else
		ret = -ENOMEM;
out:
	if (ret)
		pr_err("Tpebs event modifier broken '%s'\n", evsel->name);
	free(name);
	return ret;
}

/* Allocate a result node for @evsel, deriving its perf record event string. */
static struct tpebs_retire_lat *tpebs_retire_lat__new(struct evsel *evsel)
{
	struct tpebs_retire_lat *result = zalloc(sizeof(*result));
	int ret;

	if (!result)
		return NULL;

	ret = evsel__tpebs_event(evsel, &result->event);
	if (ret) {
		free(result);
		return NULL;
	}
	result->evsel = evsel;
	return result;
}

/* Free a result node; the referenced evsel is not owned and not freed. */
static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r)
{
	zfree(&r->event);
	free(r);
}

static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel)
{
	struct tpebs_retire_lat *t;
	unsigned long num;
	const char *evsel_name;

	/*
	 * Evsels will match for evlist with the retirement latency event. The
	 * name with "tpebs_event_" prefix will be present on events being read
	 * from `perf record`.
	 */
	if (evsel__is_retire_lat(evsel)) {
		list_for_each_entry(t, &tpebs_results, nd) {
			if (t->evsel == evsel)
				return t;
		}
		return NULL;
	}
	evsel_name = strstr(evsel->name, "tpebs_event_");
	if (!evsel_name) {
		/* Unexpected that the perf record should have other events. */
		return NULL;
	}
	errno = 0;
	/* Recover the evsel pointer encoded as "%p" by evsel__tpebs_event(). */
	num = strtoull(evsel_name + 12, NULL, 16);
	if (errno) {
		pr_err("Bad evsel for tpebs find '%s'\n", evsel->name);
		return NULL;
	}
	list_for_each_entry(t, &tpebs_results, nd) {
		if ((unsigned long)t->evsel == num)
			return t;
	}
	return NULL;
}

/**
 * evsel__tpebs_prepare - create tpebs data structures ready for opening.
 * @evsel: retire_latency evsel, all evsels on its list will be prepared.
 */
static int evsel__tpebs_prepare(struct evsel *evsel)
{
	struct evsel *pos;
	struct tpebs_retire_lat *tpebs_event;

	mutex_lock(tpebs_mtx_get());
	tpebs_event = tpebs_retire_lat__find(evsel);
	if (tpebs_event) {
		/* evsel, or an identically named one, was already prepared. */
		mutex_unlock(tpebs_mtx_get());
		return 0;
	}
	tpebs_event = tpebs_retire_lat__new(evsel);
	if (!tpebs_event) {
		mutex_unlock(tpebs_mtx_get());
		return -ENOMEM;
	}
	list_add_tail(&tpebs_event->nd, &tpebs_results);
	mutex_unlock(tpebs_mtx_get());

	/*
	 * Eagerly prepare all other evsels on the list to try to ensure that by
	 * open they are all known.
	 */
	evlist__for_each_entry(evsel->evlist, pos) {
		int ret;

		if (pos == evsel || !pos->retire_lat)
			continue;

		ret = evsel__tpebs_prepare(pos);
		if (ret)
			return ret;
	}
	return 0;
}

/**
 * evsel__tpebs_open - starts tpebs execution.
 * @evsel: retire_latency evsel, all evsels on its list will be selected. Each
 * evsel is sampled to get the average retire_latency value.
 */
int evsel__tpebs_open(struct evsel *evsel)
{
	int ret;
	bool tpebs_empty;

	/* We should only run tpebs_start when tpebs_recording is enabled. */
	if (!tpebs_recording)
		return 0;
	/* Only start the events once. */
	if (tpebs_cmd.pid != 0) {
		struct tpebs_retire_lat *t;
		bool valid;

		mutex_lock(tpebs_mtx_get());
		t = tpebs_retire_lat__find(evsel);
		valid = t && t->started;
		mutex_unlock(tpebs_mtx_get());
		/* May fail as the event wasn't started. */
		return valid ? 0 : -EBUSY;
	}

	ret = evsel__tpebs_prepare(evsel);
	if (ret)
		return ret;

	mutex_lock(tpebs_mtx_get());
	tpebs_empty = list_empty(&tpebs_results);
	if (!tpebs_empty) {
		/*Create control and ack fd for --control*/
		if (pipe(control_fd) < 0) {
			pr_err("tpebs: Failed to create control fifo");
			ret = -1;
			goto out;
		}
		if (pipe(ack_fd) < 0) {
			/* NOTE(review): control_fd is leaked on this path — confirm. */
			pr_err("tpebs: Failed to create control fifo");
			ret = -1;
			goto out;
		}

		ret = evsel__tpebs_start_perf_record(evsel);
		if (ret)
			goto out;

		if (pthread_create(&tpebs_reader_thread, /*attr=*/NULL, __sample_reader,
				   /*arg=*/NULL)) {
			kill(tpebs_cmd.pid, SIGTERM);
			close(tpebs_cmd.out);
			pr_err("Could not create thread to process sample data.\n");
			ret = -1;
			goto out;
		}
		ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_ENABLE_TAG);
	}
out:
	if (ret) {
		/* Undo this evsel's prepare; the node was added under prepare above. */
		struct tpebs_retire_lat *t = tpebs_retire_lat__find(evsel);

		list_del_init(&t->nd);
		tpebs_retire_lat__delete(t);
	}
	mutex_unlock(tpebs_mtx_get());
	return ret;
}

/*
 * Fold the gathered (or precomputed fallback) retirement latency into the
 * evsel's counts according to tpebs_mode. Only CPU/thread index 0 is
 * updated; other indices report 0 so aggregate sums stay correct.
 */
int evsel__tpebs_read(struct evsel *evsel, int cpu_map_idx, int thread)
{
	struct perf_counts_values *count, *old_count = NULL;
	struct tpebs_retire_lat *t;
	uint64_t val;
	int ret;

	/* Only set retire_latency value to the first CPU and thread. */
	if (cpu_map_idx != 0 || thread != 0)
		return 0;

	if (evsel->prev_raw_counts)
		old_count = perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread);

	count = perf_counts(evsel->counts, cpu_map_idx, thread);

	mutex_lock(tpebs_mtx_get());
	t = tpebs_retire_lat__find(evsel);
	/*
	 * If reading the first tpebs result, send a ping to the record
	 * process. Allow the sample reader a chance to read by releasing and
	 * reacquiring the lock.
	 */
	if (t && &t->nd == tpebs_results.next) {
		ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_PING_TAG);
		mutex_unlock(tpebs_mtx_get());
		if (ret)
			return ret;
		mutex_lock(tpebs_mtx_get());
	}
	if (t == NULL || t->stats.n == 0) {
		/* No sample data, use default. */
		if (tpebs_recording) {
			pr_warning_once(
				"Using precomputed retirement latency data as no samples\n");
		}
		val = 0;
		switch (tpebs_mode) {
		case TPEBS_MODE__MIN:
			val = rint(evsel->retirement_latency.min);
			break;
		case TPEBS_MODE__MAX:
			val = rint(evsel->retirement_latency.max);
			break;
		default:
		case TPEBS_MODE__LAST:
		case TPEBS_MODE__MEAN:
			val = rint(evsel->retirement_latency.mean);
			break;
		}
	} else {
		switch (tpebs_mode) {
		case TPEBS_MODE__MIN:
			val = t->stats.min;
			break;
		case TPEBS_MODE__MAX:
			val = t->stats.max;
			break;
		case TPEBS_MODE__LAST:
			val = t->last;
			break;
		default:
		case TPEBS_MODE__MEAN:
			val = rint(t->stats.mean);
			break;
		}
	}
	mutex_unlock(tpebs_mtx_get());

	if (old_count) {
		count->val = old_count->val + val;
		count->run = old_count->run + 1;
		count->ena = old_count->ena + 1;
	} else {
		count->val = val;
		count->run++;
		count->ena++;
	}
	return 0;
}

/**
 * evsel__tpebs_close() - delete tpebs related data. If the last event, stop the
 * created thread and process by calling tpebs_stop().
 *
 * This function is called in evsel__close() to be symmetric with
 * evsel__tpebs_open() being called in evsel__open().
 */
void evsel__tpebs_close(struct evsel *evsel)
{
	struct tpebs_retire_lat *t;

	mutex_lock(tpebs_mtx_get());
	t = tpebs_retire_lat__find(evsel);
	if (t) {
		list_del_init(&t->nd);
		tpebs_retire_lat__delete(t);

		if (list_empty(&tpebs_results))
			tpebs_stop();
	}
	mutex_unlock(tpebs_mtx_get());
}