// SPDX-License-Identifier: GPL-2.0-only
/*
 * intel_tpebs.c: Intel TPEBS support
 *
 * TPEBS (Timed PEBS) retirement-latency support: a helper `perf record`
 * process is forked to sample the retire_lat events, and the samples are
 * streamed back over a pipe and aggregated per event so that stat-style
 * readers can report min/max/mean/last retirement latency.
 */

#include <api/fs/fs.h>
#include <sys/param.h>
#include <subcmd/run-command.h>
#include <thread.h>
#include "intel-tpebs.h"
#include <linux/list.h>
#include <linux/zalloc.h>
#include <linux/err.h>
#include "sample.h"
#include "counts.h"
#include "debug.h"
#include "evlist.h"
#include "evsel.h"
#include "mutex.h"
#include "session.h"
#include "stat.h"
#include "tool.h"
#include "cpumap.h"
#include "metricgroup.h"
#include "stat.h"
#include <sys/stat.h>
#include <sys/file.h>
#include <poll.h>
#include <math.h>

/* "-" makes the forked `perf record` write its perf.data to stdout (a pipe). */
#define PERF_DATA "-"

bool tpebs_recording;
enum tpebs_mode tpebs_mode;
/* List of struct tpebs_retire_lat, one per retire_lat evsel. Guarded by tpebs_mtx. */
static LIST_HEAD(tpebs_results);
/* Thread running __sample_reader() over the record process's output pipe. */
static pthread_t tpebs_reader_thread;
/* The forked `perf record` child process. pid == 0 means not running/stopped. */
static struct child_process tpebs_cmd;
/* Pipes for `perf record --control=fd:`; we write commands, record writes acks. */
static int control_fd[2], ack_fd[2];
/* Guards tpebs_results and tpebs_cmd; lazily initialized via tpebs_mtx_get(). */
static struct mutex tpebs_mtx;

struct tpebs_retire_lat {
	/* Node in tpebs_results. */
	struct list_head nd;
	/** @evsel: The evsel that opened the retire_lat event. */
	struct evsel *evsel;
	/** @event: Event passed to perf record. */
	char *event;
	/** @stats: Recorded retirement latency stats. */
	struct stats stats;
	/** @last: Last retirement latency read. */
	uint64_t last;
	/* Has the event been sent to perf record? */
	bool started;
};

/* pthread_once callback: initialize the global tpebs mutex exactly once. */
static void tpebs_mtx_init(void)
{
	mutex_init(&tpebs_mtx);
}

/* Return the lazily-initialized global mutex guarding tpebs state. */
static struct mutex *tpebs_mtx_get(void)
{
	static pthread_once_t tpebs_mtx_once = PTHREAD_ONCE_INIT;

	pthread_once(&tpebs_mtx_once, tpebs_mtx_init);
	return &tpebs_mtx;
}

static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel)
	EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get());

/*
 * Build the argv for and fork a `perf record` child that samples every event
 * currently on tpebs_results. Must be called with tpebs_mtx held (callers hold
 * it) since it walks tpebs_results and writes tpebs_cmd.
 */
static int evsel__tpebs_start_perf_record(struct evsel *evsel)
{
	const char **record_argv;
	int tpebs_event_size = 0, i = 0, ret;
	char control_fd_buf[32];
	char cpumap_buf[50];
	struct tpebs_retire_lat *t;

	list_for_each_entry(t, &tpebs_results, nd)
		tpebs_event_size++;

	/* 10 fixed slots (incl. trailing NULL) + "-e <event>" pair per event. */
	record_argv = malloc((10 + 2 * tpebs_event_size) * sizeof(*record_argv));
	if (!record_argv)
		return -ENOMEM;

	record_argv[i++] = "perf";
	record_argv[i++] = "record";
	record_argv[i++] = "-W";
	record_argv[i++] = "--synth=no";

	/* Child reads commands from control_fd[0] and acks on ack_fd[1]. */
	scnprintf(control_fd_buf, sizeof(control_fd_buf), "--control=fd:%d,%d",
		  control_fd[0], ack_fd[1]);
	record_argv[i++] = control_fd_buf;

	record_argv[i++] = "-o";
	record_argv[i++] = PERF_DATA;

	if (!perf_cpu_map__is_any_cpu_or_is_empty(evsel->evlist->core.user_requested_cpus)) {
		cpu_map__snprint(evsel->evlist->core.user_requested_cpus, cpumap_buf,
				 sizeof(cpumap_buf));
		record_argv[i++] = "-C";
		record_argv[i++] = cpumap_buf;
	}

	list_for_each_entry(t, &tpebs_results, nd) {
		record_argv[i++] = "-e";
		record_argv[i++] = t->event;
	}
	record_argv[i++] = NULL;
	/* i is 8 + 2*n when the optional "-C <cpus>" pair was omitted. */
	assert(i == 10 + 2 * tpebs_event_size || i == 8 + 2 * tpebs_event_size);
	/* Note, no workload given so system wide is implied. */

	assert(tpebs_cmd.pid == 0);
	tpebs_cmd.argv = record_argv;
	tpebs_cmd.out = -1;
	ret = start_command(&tpebs_cmd);
	/* argv strings are owned elsewhere/static; only the array is freed here. */
	zfree(&tpebs_cmd.argv);
	list_for_each_entry(t, &tpebs_results, nd)
		t->started = true;

	return ret;
}

/*
 * Is @child a descendant of @parent? Walks the PPid chain read from
 * /proc/<pid>/status until it reaches @parent, pid 0, or an error.
 */
static bool is_child_pid(pid_t parent, pid_t child)
{
	if (parent < 0 || child < 0)
		return false;

	while (true) {
		char path[PATH_MAX];
		char line[256];
		FILE *fp;

new_child:
		if (parent == child)
			return true;

		/* Reached init/idle without matching: not a descendant. */
		if (child <= 0)
			return false;

		scnprintf(path, sizeof(path), "%s/%d/status", procfs__mountpoint(), child);
		fp = fopen(path, "r");
		if (!fp) {
			/* Presumably the process went away. Assume not a child. */
			return false;
		}
		while (fgets(line, sizeof(line), fp) != NULL) {
			if (strncmp(line, "PPid:", 5) == 0) {
				fclose(fp);
				if (sscanf(line + 5, "%d", &child) != 1) {
					/* Unexpected error parsing. */
					return false;
				}
				/* Continue the walk one level up. */
				goto new_child;
			}
		}
		/* Unexpected EOF. */
		fclose(fp);
		return false;
	}
}

/*
 * When monitoring a workload (not system wide), drop samples whose pid is
 * neither the workload nor, for inherited events, one of its descendants.
 */
static bool should_ignore_sample(const struct perf_sample *sample, const struct tpebs_retire_lat *t)
{
	pid_t workload_pid = t->evsel->evlist->workload.pid;
	pid_t sample_pid = sample->pid;

	/* No workload (system wide) or a direct match: keep the sample. */
	if (workload_pid < 0 || workload_pid == sample_pid)
		return false;

	/* Without inherit, only the workload pid itself is of interest. */
	if (!t->evsel->core.attr.inherit)
		return true;

	return !is_child_pid(workload_pid, sample_pid);
}

/*
 * Sample callback for the reader session: accumulate the retirement latency
 * of each sample into the matching tpebs_retire_lat. Runs on the reader
 * thread, hence the locking.
 */
static int process_sample_event(const struct perf_tool *tool __maybe_unused,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample,
				struct evsel *evsel,
				struct machine *machine __maybe_unused)
{
	struct tpebs_retire_lat *t;

	mutex_lock(tpebs_mtx_get());
	if (tpebs_cmd.pid == 0) {
		/* Record has terminated. */
		mutex_unlock(tpebs_mtx_get());
		return 0;
	}
	t = tpebs_retire_lat__find(evsel);
	if (!t) {
		mutex_unlock(tpebs_mtx_get());
		return -EINVAL;
	}
	if (should_ignore_sample(sample, t)) {
		mutex_unlock(tpebs_mtx_get());
		return 0;
	}
	/*
	 * Need to handle per core results? We are assuming average retire
	 * latency value will be used. Save the number of samples and the sum of
	 * retire latency value for each event.
	 */
	t->last = sample->retire_lat;
	update_stats(&t->stats, sample->retire_lat);
	mutex_unlock(tpebs_mtx_get());
	return 0;
}

/* Forward only known feature events; silently skip ones we don't understand. */
static int process_feature_event(struct perf_session *session,
				 union perf_event *event)
{
	if (event->feat.feat_id < HEADER_LAST_FEATURE)
		return perf_event__process_feature(session, event);
	return 0;
}

/*
 * Reader thread body: consume the perf.data stream from the record child's
 * stdout pipe until EOF (i.e. until the child exits or the pipe is closed).
 */
static void *__sample_reader(void *arg __maybe_unused)
{
	struct perf_session *session;
	struct perf_data data = {
		.mode = PERF_DATA_MODE_READ,
		.path = PERF_DATA,
		.file.fd = tpebs_cmd.out,
	};
	struct perf_tool tool;

	perf_tool__init(&tool, /*ordered_events=*/false);
	tool.sample = process_sample_event;
	tool.feature = process_feature_event;
	tool.attr = perf_event__process_attr;

	session = perf_session__new(&data, &tool);
	if (IS_ERR(session))
		return NULL;
	perf_session__process_events(session);
	perf_session__delete(session);

	return NULL;
}

/*
 * Send a control command (enable/ping/stop) to the record child and, except
 * for stop, wait for its ack. Called and returns with tpebs_mtx held, but the
 * lock is dropped for the blocking I/O in between.
 */
static int tpebs_send_record_cmd(const char *msg) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get())
{
	struct pollfd pollfd = { .events = POLLIN, };
	int ret, len, retries = 0;
	char ack_buf[8];

	/* Check if the command exited before the send, done with the lock held. */
	if (tpebs_cmd.pid == 0)
		return 0;

	/*
	 * Let go of the lock while sending/receiving as blocking can starve the
	 * sample reading thread.
	 */
	mutex_unlock(tpebs_mtx_get());

	/* Send perf record command. */
	len = strlen(msg);
	ret = write(control_fd[1], msg, len);
	if (ret != len) {
		pr_err("perf record control write control message '%s' failed\n", msg);
		ret = -EPIPE;
		goto out;
	}

	/* Stop is fire-and-forget: the child is exiting and may not ack. */
	if (!strcmp(msg, EVLIST_CTL_CMD_STOP_TAG)) {
		ret = 0;
		goto out;
	}

	/* Wait for an ack. */
	pollfd.fd = ack_fd[0];

	/*
	 * We need this poll to ensure the ack_fd PIPE will not hang
	 * when perf record failed for any reason. The timeout value
	 * 3000ms is an empirical selection.
	 */
again:
	/* 6 retries x 500ms = the 3000ms total mentioned above. */
	if (!poll(&pollfd, 1, 500)) {
		if (check_if_command_finished(&tpebs_cmd)) {
			ret = 0;
			goto out;
		}

		if (retries++ < 6)
			goto again;
		pr_err("tpebs failed: perf record ack timeout for '%s'\n", msg);
		ret = -ETIMEDOUT;
		goto out;
	}

	if (!(pollfd.revents & POLLIN)) {
		if (check_if_command_finished(&tpebs_cmd)) {
			ret = 0;
			goto out;
		}

		pr_err("tpebs failed: did not received an ack for '%s'\n", msg);
		ret = -EPIPE;
		goto out;
	}

	ret = read(ack_fd[0], ack_buf, sizeof(ack_buf));
	if (ret > 0)
		ret = strcmp(ack_buf, EVLIST_CTL_CMD_ACK_TAG);
	else
		pr_err("tpebs: perf record control ack failed\n");
out:
	/* Re-take lock as expected by caller. */
	mutex_lock(tpebs_mtx_get());
	return ret;
}

/*
 * tpebs_stop - stop the sample data read thread and the perf record process.
 */
static int tpebs_stop(void) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get())
{
	int ret = 0;

	/* Like tpebs_start, we should only run tpebs_end once. */
	if (tpebs_cmd.pid != 0) {
		tpebs_send_record_cmd(EVLIST_CTL_CMD_STOP_TAG);
		/* Mark stopped so concurrent readers/senders bail out early. */
		tpebs_cmd.pid = 0;
		/* Drop the lock so the reader thread can drain and exit. */
		mutex_unlock(tpebs_mtx_get());
		pthread_join(tpebs_reader_thread, NULL);
		mutex_lock(tpebs_mtx_get());
		close(control_fd[0]);
		close(control_fd[1]);
		close(ack_fd[0]);
		close(ack_fd[1]);
		close(tpebs_cmd.out);
		ret = finish_command(&tpebs_cmd);
		tpebs_cmd.pid = 0;
		/* The child is expected to die from our stop/SIGTERM. */
		if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
			ret = 0;
	}
	return ret;
}

/**
 * evsel__tpebs_event() - Create string event encoding to pass to `perf record`.
 *
 * Rewrites the retire_lat evsel name, e.g. "event:R" or "pmu/event/R", into a
 * precise-sampling variant with an embedded "tpebs_event_<evsel ptr>" name so
 * that samples read back can be matched to the evsel (see
 * tpebs_retire_lat__find()). On success *@event holds a newly allocated
 * string owned by the caller; returns 0 or a negative errno.
 */
static int evsel__tpebs_event(struct evsel *evsel, char **event)
{
	char *name, *modifier;
	int ret;

	name = strdup(evsel->name);
	if (!name)
		return -ENOMEM;

	/* Replace the retire_lat 'R' modifier with precise-sampling 'p'. */
	modifier = strrchr(name, 'R');
	if (!modifier) {
		ret = -EINVAL;
		goto out;
	}
	*modifier = 'p';
	/* Split at the modifier separator (':' or the PMU's closing '/'). */
	modifier = strchr(name, ':');
	if (!modifier)
		modifier = strrchr(name, '/');
	if (!modifier) {
		ret = -EINVAL;
		goto out;
	}
	*modifier = '\0';
	if (asprintf(event, "%s/name=tpebs_event_%p/%s", name, evsel, modifier + 1) > 0)
		ret = 0;
	else
		ret = -ENOMEM;
out:
	if (ret)
		pr_err("Tpebs event modifier broken '%s'\n", evsel->name);
	free(name);
	return ret;
}

/* Allocate a tpebs_retire_lat for @evsel; NULL on allocation/encoding failure. */
static struct tpebs_retire_lat *tpebs_retire_lat__new(struct evsel *evsel)
{
	struct tpebs_retire_lat *result = zalloc(sizeof(*result));
	int ret;

	if (!result)
		return NULL;

	ret = evsel__tpebs_event(evsel, &result->event);
	if (ret) {
		free(result);
		return NULL;
	}
	result->evsel = evsel;
	return result;
}

/* Free a tpebs_retire_lat and its event string. Does not unlink from the list. */
static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r)
{
	zfree(&r->event);
	free(r);
}

static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel)
{
	struct tpebs_retire_lat *t;
	unsigned long num;
	const char *evsel_name;

	/*
	 * Evsels will match for evlist with the retirement latency event. The
	 * name with "tpebs_event_" prefix will be present on events being read
	 * from `perf record`.
	 */
	if (evsel__is_retire_lat(evsel)) {
		list_for_each_entry(t, &tpebs_results, nd) {
			if (t->evsel == evsel)
				return t;
		}
		return NULL;
	}
	evsel_name = strstr(evsel->name, "tpebs_event_");
	if (!evsel_name) {
		/* Unexpected that the perf record should have other events. */
		return NULL;
	}
	errno = 0;
	/* +12 skips "tpebs_event_"; the suffix is the %p-printed evsel pointer. */
	num = strtoull(evsel_name + 12, NULL, 16);
	if (errno) {
		pr_err("Bad evsel for tpebs find '%s'\n", evsel->name);
		return NULL;
	}
	/* Match the parsed pointer value back to the originating evsel. */
	list_for_each_entry(t, &tpebs_results, nd) {
		if ((unsigned long)t->evsel == num)
			return t;
	}
	return NULL;
}

/**
 * evsel__tpebs_prepare - create tpebs data structures ready for opening.
 * @evsel: retire_latency evsel, all evsels on its list will be prepared.
 */
static int evsel__tpebs_prepare(struct evsel *evsel)
{
	struct evsel *pos;
	struct tpebs_retire_lat *tpebs_event;

	mutex_lock(tpebs_mtx_get());
	tpebs_event = tpebs_retire_lat__find(evsel);
	if (tpebs_event) {
		/* evsel, or an identically named one, was already prepared. */
		mutex_unlock(tpebs_mtx_get());
		return 0;
	}
	tpebs_event = tpebs_retire_lat__new(evsel);
	if (!tpebs_event) {
		mutex_unlock(tpebs_mtx_get());
		return -ENOMEM;
	}
	list_add_tail(&tpebs_event->nd, &tpebs_results);
	mutex_unlock(tpebs_mtx_get());

	/*
	 * Eagerly prepare all other evsels on the list to try to ensure that by
	 * open they are all known.
	 */
	evlist__for_each_entry(evsel->evlist, pos) {
		int ret;

		if (pos == evsel || !pos->retire_lat)
			continue;

		ret = evsel__tpebs_prepare(pos);
		if (ret)
			return ret;
	}
	return 0;
}

/**
 * evsel__tpebs_open - starts tpebs execution.
 * @evsel: retire_latency evsel, all evsels on its list will be selected. Each
 *         evsel is sampled to get the average retire_latency value.
 *
 * Prepares all retire_lat evsels, creates the control/ack pipes, forks the
 * `perf record` child and the sample reader thread, then enables recording.
 * Subsequent calls while the child runs just validate the evsel was started.
 */
int evsel__tpebs_open(struct evsel *evsel)
{
	int ret;
	bool tpebs_empty;

	/* We should only run tpebs_start when tpebs_recording is enabled. */
	if (!tpebs_recording)
		return 0;
	/* Only start the events once. */
	if (tpebs_cmd.pid != 0) {
		struct tpebs_retire_lat *t;
		bool valid;

		mutex_lock(tpebs_mtx_get());
		t = tpebs_retire_lat__find(evsel);
		valid = t && t->started;
		mutex_unlock(tpebs_mtx_get());
		/* May fail as the event wasn't started. */
		return valid ? 0 : -EBUSY;
	}

	ret = evsel__tpebs_prepare(evsel);
	if (ret)
		return ret;

	mutex_lock(tpebs_mtx_get());
	tpebs_empty = list_empty(&tpebs_results);
	if (!tpebs_empty) {
		/* Create control and ack fd for --control. */
		if (pipe(control_fd) < 0) {
			pr_err("tpebs: Failed to create control fifo");
			ret = -1;
			goto out;
		}
		if (pipe(ack_fd) < 0) {
			pr_err("tpebs: Failed to create control fifo");
			ret = -1;
			goto out;
		}

		ret = evsel__tpebs_start_perf_record(evsel);
		if (ret)
			goto out;

		if (pthread_create(&tpebs_reader_thread, /*attr=*/NULL, __sample_reader,
				   /*arg=*/NULL)) {
			kill(tpebs_cmd.pid, SIGTERM);
			close(tpebs_cmd.out);
			pr_err("Could not create thread to process sample data.\n");
			ret = -1;
			goto out;
		}
		ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_ENABLE_TAG);
	}
out:
	if (ret) {
		/* Undo this evsel's prepare; it was added by the call above. */
		struct tpebs_retire_lat *t = tpebs_retire_lat__find(evsel);

		list_del_init(&t->nd);
		tpebs_retire_lat__delete(t);
	}
	mutex_unlock(tpebs_mtx_get());
	return ret;
}

/*
 * Report the aggregated retirement latency for @evsel as a counter value,
 * falling back to the event's precomputed retirement_latency when no samples
 * have arrived. Returns 0; negative errno only if pinging the record process
 * fails.
 */
int evsel__tpebs_read(struct evsel *evsel, int cpu_map_idx, int thread)
{
	struct perf_counts_values *count, *old_count = NULL;
	struct tpebs_retire_lat *t;
	uint64_t val;
	int ret;

	/* Only set retire_latency value to the first CPU and thread. */
	if (cpu_map_idx != 0 || thread != 0)
		return 0;

	if (evsel->prev_raw_counts)
		old_count = perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread);

	count = perf_counts(evsel->counts, cpu_map_idx, thread);

	mutex_lock(tpebs_mtx_get());
	t = tpebs_retire_lat__find(evsel);
	/*
	 * If reading the first tpebs result, send a ping to the record
	 * process. Allow the sample reader a chance to read by releasing and
	 * reacquiring the lock.
	 */
	if (t && &t->nd == tpebs_results.next) {
		ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_PING_TAG);
		mutex_unlock(tpebs_mtx_get());
		if (ret)
			return ret;
		mutex_lock(tpebs_mtx_get());
	}
	if (t == NULL || t->stats.n == 0) {
		/* No sample data, use default. */
		if (tpebs_recording) {
			pr_warning_once(
				"Using precomputed retirement latency data as no samples\n");
		}
		val = 0;
		switch (tpebs_mode) {
		case TPEBS_MODE__MIN:
			val = rint(evsel->retirement_latency.min);
			break;
		case TPEBS_MODE__MAX:
			val = rint(evsel->retirement_latency.max);
			break;
		default:
		case TPEBS_MODE__LAST:
		case TPEBS_MODE__MEAN:
			val = rint(evsel->retirement_latency.mean);
			break;
		}
	} else {
		switch (tpebs_mode) {
		case TPEBS_MODE__MIN:
			val = t->stats.min;
			break;
		case TPEBS_MODE__MAX:
			val = t->stats.max;
			break;
		case TPEBS_MODE__LAST:
			val = t->last;
			break;
		default:
		case TPEBS_MODE__MEAN:
			val = rint(t->stats.mean);
			break;
		}
	}
	mutex_unlock(tpebs_mtx_get());

	/* Accumulate on top of the previous interval's counts when present. */
	if (old_count) {
		count->val = old_count->val + val;
		count->run = old_count->run + 1;
		count->ena = old_count->ena + 1;
	} else {
		count->val = val;
		count->run++;
		count->ena++;
	}
	return 0;
}

/**
 * evsel__tpebs_close() - delete tpebs related data. If the last event, stop the
 * created thread and process by calling tpebs_stop().
 *
 * This function is called in evsel__close() to be symmetric with
 * evsel__tpebs_open() being called in evsel__open().
 */
void evsel__tpebs_close(struct evsel *evsel)
{
	struct tpebs_retire_lat *t;

	mutex_lock(tpebs_mtx_get());
	t = tpebs_retire_lat__find(evsel);
	if (t) {
		list_del_init(&t->nd);
		tpebs_retire_lat__delete(t);

		/* Last retire_lat event gone: tear down record process/thread. */
		if (list_empty(&tpebs_results))
			tpebs_stop();
	}
	mutex_unlock(tpebs_mtx_get());
}