// SPDX-License-Identifier: GPL-2.0-only
/*
 * intel_tpebs.c: Intel TPEBS (Timed PEBS) support
 *
 * Retirement latency for an event is sampled by forking a `perf record`
 * child process and aggregating the weight3 field of the samples it
 * writes back over a pipe.
 */

#include <api/fs/fs.h>
#include <sys/param.h>
#include <subcmd/run-command.h>
#include <thread.h>
#include "intel-tpebs.h"
#include <linux/list.h>
#include <linux/zalloc.h>
#include <linux/err.h>
#include "sample.h"
#include "counts.h"
#include "debug.h"
#include "evlist.h"
#include "evsel.h"
#include "mutex.h"
#include "session.h"
#include "stat.h"
#include "tool.h"
#include "cpumap.h"
#include "metricgroup.h"
#include <sys/stat.h>
#include <sys/file.h>
#include <errno.h>
#include <poll.h>
#include <math.h>

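/*
 * Output "file" for the forked `perf record`. "-" makes record write to its
 * stdout, which __sample_reader() below consumes via tpebs_cmd.out.
 */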
#define PERF_DATA "-"

bool tpebs_recording;
enum tpebs_mode tpebs_mode;
static LIST_HEAD(tpebs_results);
static pthread_t tpebs_reader_thread;
static struct child_process tpebs_cmd;
static int control_fd[2], ack_fd[2];
static struct mutex tpebs_mtx;

struct tpebs_retire_lat {
	struct list_head nd;
	/** @evsel: The evsel that opened the retire_lat event. */
	struct evsel *evsel;
	/** @event: Event passed to perf record. */
	char *event;
	/** @stats: Recorded retirement latency stats. */
	struct stats stats;
	/** @last: Last retirement latency read. */
	uint64_t last;
	/* Has the event been sent to perf record? */
	bool started;
};

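/*
 * tpebs_mtx guards the tpebs_results list and the tpebs_cmd/fd state. It is
 * lazily initialized via pthread_once() in tpebs_mtx_get().
 */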
static void tpebs_mtx_init(void)
{
	mutex_init(&tpebs_mtx);
}

static struct mutex *tpebs_mtx_get(void)
{
	static pthread_once_t tpebs_mtx_once = PTHREAD_ONCE_INIT;

	pthread_once(&tpebs_mtx_once, tpebs_mtx_init);
	return &tpebs_mtx;
}

static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel)
	EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get());

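/*
 * evsel__tpebs_start_perf_record - fork a `perf record` command sampling all
 * prepared retirement latency events, writing its data to a pipe and taking
 * enable/ping/stop commands over a --control fd pair. A sketch of the
 * resulting command line, with hypothetical fd numbers and one event:
 *
 *   perf record -W --synth=no --control=fd:10,13 -o - \
 *        -e event/name=tpebs_event_0x.../p
 */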
static int evsel__tpebs_start_perf_record(struct evsel *evsel)
{
	const char **record_argv;
	int tpebs_event_size = 0, i = 0, ret;
	char control_fd_buf[32];
	char cpumap_buf[50];
	struct tpebs_retire_lat *t;

	list_for_each_entry(t, &tpebs_results, nd)
		tpebs_event_size++;

	record_argv = malloc((10 + 2 * tpebs_event_size) * sizeof(*record_argv));
	if (!record_argv)
		return -ENOMEM;

	record_argv[i++] = "perf";
	record_argv[i++] = "record";
	record_argv[i++] = "-W";
	record_argv[i++] = "--synth=no";

	scnprintf(control_fd_buf, sizeof(control_fd_buf), "--control=fd:%d,%d",
		  control_fd[0], ack_fd[1]);
	record_argv[i++] = control_fd_buf;

	record_argv[i++] = "-o";
	record_argv[i++] = PERF_DATA;

	if (!perf_cpu_map__is_any_cpu_or_is_empty(evsel->evlist->core.user_requested_cpus)) {
		cpu_map__snprint(evsel->evlist->core.user_requested_cpus, cpumap_buf,
				 sizeof(cpumap_buf));
		record_argv[i++] = "-C";
		record_argv[i++] = cpumap_buf;
	}

	list_for_each_entry(t, &tpebs_results, nd) {
		record_argv[i++] = "-e";
		record_argv[i++] = t->event;
	}
	record_argv[i++] = NULL;
	assert(i == 10 + 2 * tpebs_event_size || i == 8 + 2 * tpebs_event_size);
	/* Note, no workload given so system wide is implied. */

	assert(tpebs_cmd.pid == 0);
	tpebs_cmd.argv = record_argv;
	tpebs_cmd.out = -1;
	ret = start_command(&tpebs_cmd);
	zfree(&tpebs_cmd.argv);
	list_for_each_entry(t, &tpebs_results, nd)
		t->started = true;

	return ret;
}

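/*
 * Determine whether @child is a descendant of @parent by walking the
 * "PPid:" chain in /proc/<pid>/status files.
 */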
static bool is_child_pid(pid_t parent, pid_t child)
{
	if (parent < 0 || child < 0)
		return false;

	while (true) {
		char path[PATH_MAX];
		char line[256];
		FILE *fp;

new_child:
		if (parent == child)
			return true;

		if (child <= 0)
			return false;

		scnprintf(path, sizeof(path), "%s/%d/status", procfs__mountpoint(), child);
		fp = fopen(path, "r");
		if (!fp) {
			/* Presumably the process went away. Assume not a child. */
			return false;
		}
		while (fgets(line, sizeof(line), fp) != NULL) {
			if (strncmp(line, "PPid:", 5) == 0) {
				fclose(fp);
				if (sscanf(line + 5, "%d", &child) != 1) {
					/* Unexpected error parsing. */
					return false;
				}
				goto new_child;
			}
		}
		/* Unexpected EOF. */
		fclose(fp);
		return false;
	}
}

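/*
 * Samples should only contribute to an evsel's stats if they come from the
 * workload process or, when inherit is set, one of its children.
 */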
static bool should_ignore_sample(const struct perf_sample *sample, const struct tpebs_retire_lat *t)
{
	pid_t workload_pid, sample_pid = sample->pid;

	/*
	 * During evlist__purge the evlist will be removed prior to the
	 * evsel__exit calling evsel__tpebs_close and taking the
	 * tpebs_mtx. Avoid a segfault by ignoring samples in this case.
	 */
	if (t->evsel->evlist == NULL)
		return true;

	workload_pid = t->evsel->evlist->workload.pid;
	if (workload_pid < 0 || workload_pid == sample_pid)
		return false;

	if (!t->evsel->core.attr.inherit)
		return true;

	return !is_child_pid(workload_pid, sample_pid);
}

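/*
 * Handle a sample read back from the `perf record` child: match it to its
 * tpebs_retire_lat and accumulate the retirement latency (weight3).
 */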
static int process_sample_event(const struct perf_tool *tool __maybe_unused,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample,
				struct evsel *evsel,
				struct machine *machine __maybe_unused)
{
	struct tpebs_retire_lat *t;

	mutex_lock(tpebs_mtx_get());
	if (tpebs_cmd.pid == 0) {
		/* Record has terminated. */
		mutex_unlock(tpebs_mtx_get());
		return 0;
	}
	t = tpebs_retire_lat__find(evsel);
	if (!t) {
		mutex_unlock(tpebs_mtx_get());
		return -EINVAL;
	}
	if (should_ignore_sample(sample, t)) {
		mutex_unlock(tpebs_mtx_get());
		return 0;
	}
	/*
	 * TODO: Do per-core results need handling? The average retire latency
	 * value is assumed to be used. Save the number of samples and the sum
	 * of retire latency values for each event.
	 */
	t->last = sample->weight3;
	update_stats(&t->stats, sample->weight3);
	mutex_unlock(tpebs_mtx_get());
	return 0;
}

static int process_feature_event(const struct perf_tool *tool __maybe_unused,
				 struct perf_session *session,
				 union perf_event *event)
{
	if (event->feat.feat_id < HEADER_LAST_FEATURE)
		return perf_event__process_feature(session, event);
	return 0;
}

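/*
 * Thread body that parses the perf.data stream from the `perf record`
 * child's pipe, dispatching each sample to process_sample_event().
 */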
static void *__sample_reader(void *arg __maybe_unused)
{
	struct perf_session *session;
	struct perf_data data = {
		.mode = PERF_DATA_MODE_READ,
		.path = PERF_DATA,
		.file.fd = tpebs_cmd.out,
	};
	struct perf_tool tool;

	perf_tool__init(&tool, /*ordered_events=*/false);
	tool.sample = process_sample_event;
	tool.feature = process_feature_event;
	tool.attr = perf_event__process_attr;

	session = perf_session__new(&data, &tool);
	if (IS_ERR(session))
		return NULL;
	perf_session__process_events(session);
	perf_session__delete(session);

	return NULL;
}

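/*
 * tpebs_send_record_cmd - write a control command (e.g.
 * EVLIST_CTL_CMD_ENABLE_TAG) to the `perf record` child and, except for the
 * stop command, wait for its acknowledgement.
 */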
static int tpebs_send_record_cmd(const char *msg) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get())
{
	struct pollfd pollfd = { .events = POLLIN, };
	int ret, len, retries = 0;
	char ack_buf[8];

	/* Check if the command exited before the send, done with the lock held. */
	if (tpebs_cmd.pid == 0)
		return 0;

	/*
	 * Let go of the lock while sending/receiving as blocking can starve the
	 * sample reading thread.
	 */
	mutex_unlock(tpebs_mtx_get());

	/* Send the perf record command. */
	len = strlen(msg);
	ret = write(control_fd[1], msg, len);
	if (ret != len) {
		pr_err("perf record control write control message '%s' failed\n", msg);
		ret = -EPIPE;
		goto out;
	}

	if (!strcmp(msg, EVLIST_CTL_CMD_STOP_TAG)) {
		ret = 0;
		goto out;
	}

	/* Wait for an ack. */
	pollfd.fd = ack_fd[0];

	/*
	 * This poll ensures the ack_fd pipe read will not hang when perf
	 * record fails for any reason. Each poll waits 500ms and up to 6
	 * retries are made, giving an empirically selected timeout of ~3.5s.
	 */
again:
	if (!poll(&pollfd, 1, 500)) {
		if (check_if_command_finished(&tpebs_cmd)) {
			ret = 0;
			goto out;
		}

		if (retries++ < 6)
			goto again;
		pr_err("tpebs failed: perf record ack timeout for '%s'\n", msg);
		ret = -ETIMEDOUT;
		goto out;
	}

	if (!(pollfd.revents & POLLIN)) {
		if (check_if_command_finished(&tpebs_cmd)) {
			ret = 0;
			goto out;
		}

		pr_err("tpebs failed: did not receive an ack for '%s'\n", msg);
		ret = -EPIPE;
		goto out;
	}

	ret = read(ack_fd[0], ack_buf, sizeof(ack_buf));
	if (ret > 0)
		ret = strcmp(ack_buf, EVLIST_CTL_CMD_ACK_TAG);
	else
		pr_err("tpebs: perf record control ack failed\n");
out:
	/* Re-take lock as expected by caller. */
	mutex_lock(tpebs_mtx_get());
	return ret;
}

/*
 * tpebs_stop - stop the sample data read thread and the perf record process.
 */
static int tpebs_stop(void) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get())
{
	int ret = 0;

	/* Like evsel__tpebs_open, tpebs_stop should only run once. */
	if (tpebs_cmd.pid != 0) {
		tpebs_send_record_cmd(EVLIST_CTL_CMD_STOP_TAG);
		/* Mark the command terminated for the sample reader thread. */
		tpebs_cmd.pid = 0;
		mutex_unlock(tpebs_mtx_get());
		pthread_join(tpebs_reader_thread, NULL);
		mutex_lock(tpebs_mtx_get());
		close(control_fd[0]);
		close(control_fd[1]);
		close(ack_fd[0]);
		close(ack_fd[1]);
		close(tpebs_cmd.out);
		ret = finish_command(&tpebs_cmd);
		tpebs_cmd.pid = 0;
		if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
			ret = 0;
	}
	return ret;
}

/**
 * evsel__tpebs_event() - Create string event encoding to pass to `perf record`.
 * @evsel: The retirement latency evsel whose name is transformed.
 * @event: Out parameter for the allocated event string.
 *
 * The trailing 'R' (retirement latency) modifier is replaced with 'p'
 * (precise) and a name= term embedding the evsel pointer is added, so that
 * samples read back from `perf record` can be matched to the evsel in
 * tpebs_retire_lat__find(). For example, a hypothetical "mem_load:R" becomes
 * "mem_load/name=tpebs_event_0x.../p".
 */
static int evsel__tpebs_event(struct evsel *evsel, char **event)
{
	char *name, *modifier;
	int ret;

	name = strdup(evsel->name);
	if (!name)
		return -ENOMEM;

	modifier = strrchr(name, 'R');
	if (!modifier) {
		ret = -EINVAL;
		goto out;
	}
	*modifier = 'p';
	modifier = strchr(name, ':');
	if (!modifier)
		modifier = strrchr(name, '/');
	if (!modifier) {
		ret = -EINVAL;
		goto out;
	}
	*modifier = '\0';
	if (asprintf(event, "%s/name=tpebs_event_%p/%s", name, evsel, modifier + 1) > 0)
		ret = 0;
	else
		ret = -ENOMEM;
out:
	if (ret)
		pr_err("Tpebs event modifier broken '%s'\n", evsel->name);
	free(name);
	return ret;
}

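/* Allocate a tpebs_retire_lat for @evsel along with its perf record event string. */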
static struct tpebs_retire_lat *tpebs_retire_lat__new(struct evsel *evsel)
{
	struct tpebs_retire_lat *result = zalloc(sizeof(*result));
	int ret;

	if (!result)
		return NULL;

	ret = evsel__tpebs_event(evsel, &result->event);
	if (ret) {
		free(result);
		return NULL;
	}
	result->evsel = evsel;
	return result;
}

static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r)
{
	zfree(&r->event);
	free(r);
}

static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel)
{
	struct tpebs_retire_lat *t;
	unsigned long num;
	const char *evsel_name;

	/*
	 * Evsels on the evlist match the retirement latency evsel directly.
	 * Events being read back from `perf record` instead have a name with
	 * the "tpebs_event_" prefix, which encodes the original evsel pointer.
	 */
	if (evsel__is_retire_lat(evsel)) {
		list_for_each_entry(t, &tpebs_results, nd) {
			if (t->evsel == evsel)
				return t;
		}
		return NULL;
	}
	evsel_name = strstr(evsel->name, "tpebs_event_");
	if (!evsel_name) {
		/* Unexpected, the perf record session should have no other events. */
		return NULL;
	}
	errno = 0;
	/* Parse the pointer value following the "tpebs_event_" prefix (12 chars). */
	num = strtoull(evsel_name + 12, NULL, 16);
	if (errno) {
		pr_err("Bad evsel for tpebs find '%s'\n", evsel->name);
		return NULL;
	}
	list_for_each_entry(t, &tpebs_results, nd) {
		if ((unsigned long)t->evsel == num)
			return t;
	}
	return NULL;
}

/**
 * evsel__tpebs_prepare - create tpebs data structures ready for opening.
 * @evsel: retire_latency evsel, all evsels on its list will be prepared.
 */
static int evsel__tpebs_prepare(struct evsel *evsel)
{
	struct evsel *pos;
	struct tpebs_retire_lat *tpebs_event;

	mutex_lock(tpebs_mtx_get());
	tpebs_event = tpebs_retire_lat__find(evsel);
	if (tpebs_event) {
		/* evsel, or an identically named one, was already prepared. */
		mutex_unlock(tpebs_mtx_get());
		return 0;
	}
	tpebs_event = tpebs_retire_lat__new(evsel);
	if (!tpebs_event) {
		mutex_unlock(tpebs_mtx_get());
		return -ENOMEM;
	}
	list_add_tail(&tpebs_event->nd, &tpebs_results);
	mutex_unlock(tpebs_mtx_get());

	/*
	 * Eagerly prepare all other retirement latency evsels on the list so
	 * that they are all known by the time the events are opened.
	 */
	evlist__for_each_entry(evsel->evlist, pos) {
		int ret;

		if (pos == evsel || !pos->retire_lat)
			continue;

		ret = evsel__tpebs_prepare(pos);
		if (ret)
			return ret;
	}
	return 0;
}

/**
 * evsel__tpebs_open - starts tpebs execution.
 * @evsel: retire_latency evsel, all evsels on its list will be selected. Each
 * evsel is sampled to get the average retire_latency value.
 */
int evsel__tpebs_open(struct evsel *evsel)
{
	int ret;
	bool tpebs_empty;

	/* We should only start recording when tpebs_recording is enabled. */
	if (!tpebs_recording)
		return 0;
	/* Only start the events once. */
	if (tpebs_cmd.pid != 0) {
		struct tpebs_retire_lat *t;
		bool valid;

		mutex_lock(tpebs_mtx_get());
		t = tpebs_retire_lat__find(evsel);
		valid = t && t->started;
		mutex_unlock(tpebs_mtx_get());
		/* May fail as the event wasn't started. */
		return valid ? 0 : -EBUSY;
	}

	ret = evsel__tpebs_prepare(evsel);
	if (ret)
		return ret;

	mutex_lock(tpebs_mtx_get());
	tpebs_empty = list_empty(&tpebs_results);
	if (!tpebs_empty) {
		/* Create control and ack pipes for --control. */
		if (pipe(control_fd) < 0) {
			pr_err("tpebs: Failed to create control pipe\n");
			ret = -1;
			goto out;
		}
		if (pipe(ack_fd) < 0) {
			pr_err("tpebs: Failed to create ack pipe\n");
			ret = -1;
			goto out;
		}

		ret = evsel__tpebs_start_perf_record(evsel);
		if (ret)
			goto out;

		if (pthread_create(&tpebs_reader_thread, /*attr=*/NULL, __sample_reader,
				   /*arg=*/NULL)) {
			kill(tpebs_cmd.pid, SIGTERM);
			close(tpebs_cmd.out);
			pr_err("Could not create thread to process sample data.\n");
			ret = -1;
			goto out;
		}
		ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_ENABLE_TAG);
	}
out:
	if (ret) {
		struct tpebs_retire_lat *t = tpebs_retire_lat__find(evsel);

		if (t) {
			list_del_init(&t->nd);
			tpebs_retire_lat__delete(t);
		}
	}
	mutex_unlock(tpebs_mtx_get());
	return ret;
}

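/*
 * evsel__tpebs_read - present the aggregated retirement latency as if it
 * were a regular counter read. The value is the min/max/last/mean of the
 * recorded samples, per tpebs_mode, or a precomputed default for the event
 * when no samples have arrived.
 */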
int evsel__tpebs_read(struct evsel *evsel, int cpu_map_idx, int thread)
{
	struct perf_counts_values *count, *old_count = NULL;
	struct tpebs_retire_lat *t;
	uint64_t val;
	int ret;

	/* Only set the retire_latency value on the first CPU and thread. */
	if (cpu_map_idx != 0 || thread != 0)
		return 0;

	if (evsel->prev_raw_counts)
		old_count = perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread);

	count = perf_counts(evsel->counts, cpu_map_idx, thread);

	mutex_lock(tpebs_mtx_get());
	t = tpebs_retire_lat__find(evsel);
	/*
	 * If reading the first tpebs result, send a ping to the record
	 * process. Allow the sample reader a chance to read by releasing and
	 * reacquiring the lock.
	 */
	if (t && &t->nd == tpebs_results.next) {
		ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_PING_TAG);
		mutex_unlock(tpebs_mtx_get());
		if (ret)
			return ret;
		mutex_lock(tpebs_mtx_get());
	}
	if (t == NULL || t->stats.n == 0) {
		/* No sample data, use the precomputed default. */
		if (tpebs_recording) {
			pr_warning_once(
				"Using precomputed retirement latency data as there are no samples\n");
		}
		val = 0;
		switch (tpebs_mode) {
		case TPEBS_MODE__MIN:
			val = rint(evsel->retirement_latency.min);
			break;
		case TPEBS_MODE__MAX:
			val = rint(evsel->retirement_latency.max);
			break;
		default:
		case TPEBS_MODE__LAST:
		case TPEBS_MODE__MEAN:
			val = rint(evsel->retirement_latency.mean);
			break;
		}
	} else {
		switch (tpebs_mode) {
		case TPEBS_MODE__MIN:
			val = t->stats.min;
			break;
		case TPEBS_MODE__MAX:
			val = t->stats.max;
			break;
		case TPEBS_MODE__LAST:
			val = t->last;
			break;
		default:
		case TPEBS_MODE__MEAN:
			val = rint(t->stats.mean);
			break;
		}
	}
	mutex_unlock(tpebs_mtx_get());

	if (old_count) {
		count->val = old_count->val + val;
		count->run = old_count->run + 1;
		count->ena = old_count->ena + 1;
	} else {
		count->val = val;
		count->run++;
		count->ena++;
	}
	return 0;
}

/**
 * evsel__tpebs_close() - delete tpebs related data. If the last event, stop the
 * created thread and process by calling tpebs_stop().
 *
 * This function is called in evsel__close() to be symmetric with
 * evsel__tpebs_open() being called in evsel__open().
 */
void evsel__tpebs_close(struct evsel *evsel)
{
	struct tpebs_retire_lat *t;

	mutex_lock(tpebs_mtx_get());
	t = tpebs_retire_lat__find(evsel);
	if (t) {
		list_del_init(&t->nd);
		tpebs_retire_lat__delete(t);

		if (list_empty(&tpebs_results))
			tpebs_stop();
	}
	mutex_unlock(tpebs_mtx_get());
}