1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * intel_tpebs.c: Intel TPEBS support 4 */ 5 6 7 #include <sys/param.h> 8 #include <subcmd/run-command.h> 9 #include <thread.h> 10 #include "intel-tpebs.h" 11 #include <linux/list.h> 12 #include <linux/zalloc.h> 13 #include <linux/err.h> 14 #include "sample.h" 15 #include "counts.h" 16 #include "debug.h" 17 #include "evlist.h" 18 #include "evsel.h" 19 #include "mutex.h" 20 #include "session.h" 21 #include "stat.h" 22 #include "tool.h" 23 #include "cpumap.h" 24 #include "metricgroup.h" 25 #include "stat.h" 26 #include <sys/stat.h> 27 #include <sys/file.h> 28 #include <poll.h> 29 #include <math.h> 30 31 #define PERF_DATA "-" 32 33 bool tpebs_recording; 34 enum tpebs_mode tpebs_mode; 35 static LIST_HEAD(tpebs_results); 36 static pthread_t tpebs_reader_thread; 37 static struct child_process tpebs_cmd; 38 static int control_fd[2], ack_fd[2]; 39 static struct mutex tpebs_mtx; 40 41 struct tpebs_retire_lat { 42 struct list_head nd; 43 /** @evsel: The evsel that opened the retire_lat event. */ 44 struct evsel *evsel; 45 /** @event: Event passed to perf record. */ 46 char *event; 47 /** @stats: Recorded retirement latency stats. */ 48 struct stats stats; 49 /** @last: Last retirement latency read. */ 50 uint64_t last; 51 /* Has the event been sent to perf record? */ 52 bool started; 53 }; 54 55 static void tpebs_mtx_init(void) 56 { 57 mutex_init(&tpebs_mtx); 58 } 59 60 static struct mutex *tpebs_mtx_get(void) 61 { 62 static pthread_once_t tpebs_mtx_once = PTHREAD_ONCE_INIT; 63 64 pthread_once(&tpebs_mtx_once, tpebs_mtx_init); 65 return &tpebs_mtx; 66 } 67 68 static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel) 69 EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get()); 70 71 static int evsel__tpebs_start_perf_record(struct evsel *evsel) 72 { 73 const char **record_argv; 74 int tpebs_event_size = 0, i = 0, ret; 75 char control_fd_buf[32]; 76 char cpumap_buf[50]; 77 struct tpebs_retire_lat *t; 78 79 list_for_each_entry(t, &tpebs_results, nd) 80 tpebs_event_size++; 81 82 record_argv = malloc((10 + 2 * tpebs_event_size) * sizeof(*record_argv)); 83 if (!record_argv) 84 return -ENOMEM; 85 86 record_argv[i++] = "perf"; 87 record_argv[i++] = "record"; 88 record_argv[i++] = "-W"; 89 record_argv[i++] = "--synth=no"; 90 91 scnprintf(control_fd_buf, sizeof(control_fd_buf), "--control=fd:%d,%d", 92 control_fd[0], ack_fd[1]); 93 record_argv[i++] = control_fd_buf; 94 95 record_argv[i++] = "-o"; 96 record_argv[i++] = PERF_DATA; 97 98 if (!perf_cpu_map__is_any_cpu_or_is_empty(evsel->evlist->core.user_requested_cpus)) { 99 cpu_map__snprint(evsel->evlist->core.user_requested_cpus, cpumap_buf, 100 sizeof(cpumap_buf)); 101 record_argv[i++] = "-C"; 102 record_argv[i++] = cpumap_buf; 103 } 104 105 list_for_each_entry(t, &tpebs_results, nd) { 106 record_argv[i++] = "-e"; 107 record_argv[i++] = t->event; 108 } 109 record_argv[i++] = NULL; 110 assert(i == 10 + 2 * tpebs_event_size || i == 8 + 2 * tpebs_event_size); 111 /* Note, no workload given so system wide is implied. */ 112 113 assert(tpebs_cmd.pid == 0); 114 tpebs_cmd.argv = record_argv; 115 tpebs_cmd.out = -1; 116 ret = start_command(&tpebs_cmd); 117 zfree(&tpebs_cmd.argv); 118 list_for_each_entry(t, &tpebs_results, nd) 119 t->started = true; 120 121 return ret; 122 } 123 124 static int process_sample_event(const struct perf_tool *tool __maybe_unused, 125 union perf_event *event __maybe_unused, 126 struct perf_sample *sample, 127 struct evsel *evsel, 128 struct machine *machine __maybe_unused) 129 { 130 struct tpebs_retire_lat *t; 131 132 mutex_lock(tpebs_mtx_get()); 133 if (tpebs_cmd.pid == 0) { 134 /* Record has terminated. */ 135 mutex_unlock(tpebs_mtx_get()); 136 return 0; 137 } 138 t = tpebs_retire_lat__find(evsel); 139 if (!t) { 140 mutex_unlock(tpebs_mtx_get()); 141 return -EINVAL; 142 } 143 /* 144 * Need to handle per core results? We are assuming average retire 145 * latency value will be used. Save the number of samples and the sum of 146 * retire latency value for each event. 147 */ 148 t->last = sample->retire_lat; 149 update_stats(&t->stats, sample->retire_lat); 150 mutex_unlock(tpebs_mtx_get()); 151 return 0; 152 } 153 154 static int process_feature_event(struct perf_session *session, 155 union perf_event *event) 156 { 157 if (event->feat.feat_id < HEADER_LAST_FEATURE) 158 return perf_event__process_feature(session, event); 159 return 0; 160 } 161 162 static void *__sample_reader(void *arg __maybe_unused) 163 { 164 struct perf_session *session; 165 struct perf_data data = { 166 .mode = PERF_DATA_MODE_READ, 167 .path = PERF_DATA, 168 .file.fd = tpebs_cmd.out, 169 }; 170 struct perf_tool tool; 171 172 perf_tool__init(&tool, /*ordered_events=*/false); 173 tool.sample = process_sample_event; 174 tool.feature = process_feature_event; 175 tool.attr = perf_event__process_attr; 176 177 session = perf_session__new(&data, &tool); 178 if (IS_ERR(session)) 179 return NULL; 180 perf_session__process_events(session); 181 perf_session__delete(session); 182 183 return NULL; 184 } 185 186 static int tpebs_send_record_cmd(const char *msg) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get()) 187 { 188 struct pollfd pollfd = { .events = POLLIN, }; 189 int ret, len, retries = 0; 190 char ack_buf[8]; 191 192 /* Check if the command exited before the send, done with the lock held. */ 193 if (tpebs_cmd.pid == 0) 194 return 0; 195 196 /* 197 * Let go of the lock while sending/receiving as blocking can starve the 198 * sample reading thread. 199 */ 200 mutex_unlock(tpebs_mtx_get()); 201 202 /* Send perf record command.*/ 203 len = strlen(msg); 204 ret = write(control_fd[1], msg, len); 205 if (ret != len) { 206 pr_err("perf record control write control message '%s' failed\n", msg); 207 ret = -EPIPE; 208 goto out; 209 } 210 211 if (!strcmp(msg, EVLIST_CTL_CMD_STOP_TAG)) { 212 ret = 0; 213 goto out; 214 } 215 216 /* Wait for an ack. */ 217 pollfd.fd = ack_fd[0]; 218 219 /* 220 * We need this poll to ensure the ack_fd PIPE will not hang 221 * when perf record failed for any reason. The timeout value 222 * 3000ms is an empirical selection. 223 */ 224 again: 225 if (!poll(&pollfd, 1, 500)) { 226 if (check_if_command_finished(&tpebs_cmd)) { 227 ret = 0; 228 goto out; 229 } 230 231 if (retries++ < 6) 232 goto again; 233 pr_err("tpebs failed: perf record ack timeout for '%s'\n", msg); 234 ret = -ETIMEDOUT; 235 goto out; 236 } 237 238 if (!(pollfd.revents & POLLIN)) { 239 if (check_if_command_finished(&tpebs_cmd)) { 240 ret = 0; 241 goto out; 242 } 243 244 pr_err("tpebs failed: did not received an ack for '%s'\n", msg); 245 ret = -EPIPE; 246 goto out; 247 } 248 249 ret = read(ack_fd[0], ack_buf, sizeof(ack_buf)); 250 if (ret > 0) 251 ret = strcmp(ack_buf, EVLIST_CTL_CMD_ACK_TAG); 252 else 253 pr_err("tpebs: perf record control ack failed\n"); 254 out: 255 /* Re-take lock as expected by caller. */ 256 mutex_lock(tpebs_mtx_get()); 257 return ret; 258 } 259 260 /* 261 * tpebs_stop - stop the sample data read thread and the perf record process. 262 */ 263 static int tpebs_stop(void) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get()) 264 { 265 int ret = 0; 266 267 /* Like tpebs_start, we should only run tpebs_end once. */ 268 if (tpebs_cmd.pid != 0) { 269 tpebs_send_record_cmd(EVLIST_CTL_CMD_STOP_TAG); 270 tpebs_cmd.pid = 0; 271 mutex_unlock(tpebs_mtx_get()); 272 pthread_join(tpebs_reader_thread, NULL); 273 mutex_lock(tpebs_mtx_get()); 274 close(control_fd[0]); 275 close(control_fd[1]); 276 close(ack_fd[0]); 277 close(ack_fd[1]); 278 close(tpebs_cmd.out); 279 ret = finish_command(&tpebs_cmd); 280 tpebs_cmd.pid = 0; 281 if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL) 282 ret = 0; 283 } 284 return ret; 285 } 286 287 /** 288 * evsel__tpebs_event() - Create string event encoding to pass to `perf record`. 289 */ 290 static int evsel__tpebs_event(struct evsel *evsel, char **event) 291 { 292 char *name, *modifier; 293 int ret; 294 295 name = strdup(evsel->name); 296 if (!name) 297 return -ENOMEM; 298 299 modifier = strrchr(name, 'R'); 300 if (!modifier) { 301 ret = -EINVAL; 302 goto out; 303 } 304 *modifier = 'p'; 305 modifier = strchr(name, ':'); 306 if (!modifier) 307 modifier = strrchr(name, '/'); 308 if (!modifier) { 309 ret = -EINVAL; 310 goto out; 311 } 312 *modifier = '\0'; 313 if (asprintf(event, "%s/name=tpebs_event_%p/%s", name, evsel, modifier + 1) > 0) 314 ret = 0; 315 else 316 ret = -ENOMEM; 317 out: 318 if (ret) 319 pr_err("Tpebs event modifier broken '%s'\n", evsel->name); 320 free(name); 321 return ret; 322 } 323 324 static struct tpebs_retire_lat *tpebs_retire_lat__new(struct evsel *evsel) 325 { 326 struct tpebs_retire_lat *result = zalloc(sizeof(*result)); 327 int ret; 328 329 if (!result) 330 return NULL; 331 332 ret = evsel__tpebs_event(evsel, &result->event); 333 if (ret) { 334 free(result); 335 return NULL; 336 } 337 result->evsel = evsel; 338 return result; 339 } 340 341 static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r) 342 { 343 zfree(&r->event); 344 free(r); 345 } 346 347 static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel) 348 { 349 struct tpebs_retire_lat *t; 350 unsigned long num; 351 const char *evsel_name; 352 353 /* 354 * Evsels will match for evlist with the retirement latency event. The 355 * name with "tpebs_event_" prefix will be present on events being read 356 * from `perf record`. 357 */ 358 if (evsel__is_retire_lat(evsel)) { 359 list_for_each_entry(t, &tpebs_results, nd) { 360 if (t->evsel == evsel) 361 return t; 362 } 363 return NULL; 364 } 365 evsel_name = strstr(evsel->name, "tpebs_event_"); 366 if (!evsel_name) { 367 /* Unexpected that the perf record should have other events. */ 368 return NULL; 369 } 370 errno = 0; 371 num = strtoull(evsel_name + 12, NULL, 16); 372 if (errno) { 373 pr_err("Bad evsel for tpebs find '%s'\n", evsel->name); 374 return NULL; 375 } 376 list_for_each_entry(t, &tpebs_results, nd) { 377 if ((unsigned long)t->evsel == num) 378 return t; 379 } 380 return NULL; 381 } 382 383 /** 384 * evsel__tpebs_prepare - create tpebs data structures ready for opening. 385 * @evsel: retire_latency evsel, all evsels on its list will be prepared. 386 */ 387 static int evsel__tpebs_prepare(struct evsel *evsel) 388 { 389 struct evsel *pos; 390 struct tpebs_retire_lat *tpebs_event; 391 392 mutex_lock(tpebs_mtx_get()); 393 tpebs_event = tpebs_retire_lat__find(evsel); 394 if (tpebs_event) { 395 /* evsel, or an identically named one, was already prepared. */ 396 mutex_unlock(tpebs_mtx_get()); 397 return 0; 398 } 399 tpebs_event = tpebs_retire_lat__new(evsel); 400 if (!tpebs_event) { 401 mutex_unlock(tpebs_mtx_get()); 402 return -ENOMEM; 403 } 404 list_add_tail(&tpebs_event->nd, &tpebs_results); 405 mutex_unlock(tpebs_mtx_get()); 406 407 /* 408 * Eagerly prepare all other evsels on the list to try to ensure that by 409 * open they are all known. 410 */ 411 evlist__for_each_entry(evsel->evlist, pos) { 412 int ret; 413 414 if (pos == evsel || !pos->retire_lat) 415 continue; 416 417 ret = evsel__tpebs_prepare(pos); 418 if (ret) 419 return ret; 420 } 421 return 0; 422 } 423 424 /** 425 * evsel__tpebs_open - starts tpebs execution. 426 * @evsel: retire_latency evsel, all evsels on its list will be selected. Each 427 * evsel is sampled to get the average retire_latency value. 428 */ 429 int evsel__tpebs_open(struct evsel *evsel) 430 { 431 int ret; 432 bool tpebs_empty; 433 434 /* We should only run tpebs_start when tpebs_recording is enabled. */ 435 if (!tpebs_recording) 436 return 0; 437 /* Only start the events once. */ 438 if (tpebs_cmd.pid != 0) { 439 struct tpebs_retire_lat *t; 440 bool valid; 441 442 mutex_lock(tpebs_mtx_get()); 443 t = tpebs_retire_lat__find(evsel); 444 valid = t && t->started; 445 mutex_unlock(tpebs_mtx_get()); 446 /* May fail as the event wasn't started. */ 447 return valid ? 0 : -EBUSY; 448 } 449 450 ret = evsel__tpebs_prepare(evsel); 451 if (ret) 452 return ret; 453 454 mutex_lock(tpebs_mtx_get()); 455 tpebs_empty = list_empty(&tpebs_results); 456 if (!tpebs_empty) { 457 /*Create control and ack fd for --control*/ 458 if (pipe(control_fd) < 0) { 459 pr_err("tpebs: Failed to create control fifo"); 460 ret = -1; 461 goto out; 462 } 463 if (pipe(ack_fd) < 0) { 464 pr_err("tpebs: Failed to create control fifo"); 465 ret = -1; 466 goto out; 467 } 468 469 ret = evsel__tpebs_start_perf_record(evsel); 470 if (ret) 471 goto out; 472 473 if (pthread_create(&tpebs_reader_thread, /*attr=*/NULL, __sample_reader, 474 /*arg=*/NULL)) { 475 kill(tpebs_cmd.pid, SIGTERM); 476 close(tpebs_cmd.out); 477 pr_err("Could not create thread to process sample data.\n"); 478 ret = -1; 479 goto out; 480 } 481 ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_ENABLE_TAG); 482 } 483 out: 484 if (ret) { 485 struct tpebs_retire_lat *t = tpebs_retire_lat__find(evsel); 486 487 list_del_init(&t->nd); 488 tpebs_retire_lat__delete(t); 489 } 490 mutex_unlock(tpebs_mtx_get()); 491 return ret; 492 } 493 494 int evsel__tpebs_read(struct evsel *evsel, int cpu_map_idx, int thread) 495 { 496 struct perf_counts_values *count, *old_count = NULL; 497 struct tpebs_retire_lat *t; 498 uint64_t val; 499 int ret; 500 501 /* Only set retire_latency value to the first CPU and thread. */ 502 if (cpu_map_idx != 0 || thread != 0) 503 return 0; 504 505 if (evsel->prev_raw_counts) 506 old_count = perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread); 507 508 count = perf_counts(evsel->counts, cpu_map_idx, thread); 509 510 mutex_lock(tpebs_mtx_get()); 511 t = tpebs_retire_lat__find(evsel); 512 /* 513 * If reading the first tpebs result, send a ping to the record 514 * process. Allow the sample reader a chance to read by releasing and 515 * reacquiring the lock. 516 */ 517 if (t && &t->nd == tpebs_results.next) { 518 ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_PING_TAG); 519 mutex_unlock(tpebs_mtx_get()); 520 if (ret) 521 return ret; 522 mutex_lock(tpebs_mtx_get()); 523 } 524 if (t == NULL || t->stats.n == 0) { 525 /* No sample data, use default. */ 526 if (tpebs_recording) { 527 pr_warning_once( 528 "Using precomputed retirement latency data as no samples\n"); 529 } 530 val = 0; 531 switch (tpebs_mode) { 532 case TPEBS_MODE__MIN: 533 val = rint(evsel->retirement_latency.min); 534 break; 535 case TPEBS_MODE__MAX: 536 val = rint(evsel->retirement_latency.max); 537 break; 538 default: 539 case TPEBS_MODE__LAST: 540 case TPEBS_MODE__MEAN: 541 val = rint(evsel->retirement_latency.mean); 542 break; 543 } 544 } else { 545 switch (tpebs_mode) { 546 case TPEBS_MODE__MIN: 547 val = t->stats.min; 548 break; 549 case TPEBS_MODE__MAX: 550 val = t->stats.max; 551 break; 552 case TPEBS_MODE__LAST: 553 val = t->last; 554 break; 555 default: 556 case TPEBS_MODE__MEAN: 557 val = rint(t->stats.mean); 558 break; 559 } 560 } 561 mutex_unlock(tpebs_mtx_get()); 562 563 if (old_count) { 564 count->val = old_count->val + val; 565 count->run = old_count->run + 1; 566 count->ena = old_count->ena + 1; 567 } else { 568 count->val = val; 569 count->run++; 570 count->ena++; 571 } 572 return 0; 573 } 574 575 /** 576 * evsel__tpebs_close() - delete tpebs related data. If the last event, stop the 577 * created thread and process by calling tpebs_stop(). 578 * 579 * This function is called in evsel__close() to be symmetric with 580 * evsel__tpebs_open() being called in evsel__open(). 581 */ 582 void evsel__tpebs_close(struct evsel *evsel) 583 { 584 struct tpebs_retire_lat *t; 585 586 mutex_lock(tpebs_mtx_get()); 587 t = tpebs_retire_lat__find(evsel); 588 if (t) { 589 list_del_init(&t->nd); 590 tpebs_retire_lat__delete(t); 591 592 if (list_empty(&tpebs_results)) 593 tpebs_stop(); 594 } 595 mutex_unlock(tpebs_mtx_get()); 596 } 597