xref: /linux/tools/perf/util/intel-tpebs.c (revision 7f81907b7e3f93dfed2e903af52659baa4944341)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * intel_tpebs.c: Intel TPEBS support
4  */
5 
6 #include <api/fs/fs.h>
7 #include <sys/param.h>
8 #include <subcmd/run-command.h>
9 #include <thread.h>
10 #include "intel-tpebs.h"
11 #include <linux/list.h>
12 #include <linux/zalloc.h>
13 #include <linux/err.h>
14 #include "sample.h"
15 #include "counts.h"
16 #include "debug.h"
17 #include "evlist.h"
18 #include "evsel.h"
19 #include "mutex.h"
20 #include "session.h"
21 #include "stat.h"
22 #include "tool.h"
23 #include "cpumap.h"
24 #include "metricgroup.h"
25 #include "stat.h"
26 #include <sys/stat.h>
27 #include <sys/file.h>
28 #include <poll.h>
29 #include <math.h>
30 
/*
 * Path given to `perf record -o` and to the perf_data the reader thread opens.
 * NOTE(review): "-" presumably selects stdout/stdin (a pipe) - confirm.
 */
#define PERF_DATA		"-"

/* When false, evsel__tpebs_open() returns 0 without starting anything. */
bool tpebs_recording;
/* Selects which statistic (min/max/last/mean) evsel__tpebs_read() reports. */
enum tpebs_mode tpebs_mode;
/* List of struct tpebs_retire_lat entries, linked through their nd member. */
static LIST_HEAD(tpebs_results);
/* Thread running __sample_reader(), created by evsel__tpebs_open(). */
static pthread_t tpebs_reader_thread;
/* The `perf record` child; a pid of 0 is treated as "not running". */
static struct child_process tpebs_cmd;
/*
 * Pipes passed to `perf record --control=fd:`. Commands are written to
 * control_fd[1] and acks are read from ack_fd[0].
 */
static int control_fd[2], ack_fd[2];
/* Lock for the tpebs state above; always obtained via tpebs_mtx_get(). */
static struct mutex tpebs_mtx;
40 
/* Bookkeeping for one retirement latency event, kept on the tpebs_results list. */
struct tpebs_retire_lat {
	/** @nd: Node linking this entry into tpebs_results. */
	struct list_head nd;
	/** @evsel: The evsel that opened the retire_lat event. */
	struct evsel *evsel;
	/** @event: Event passed to perf record. */
	char *event;
	/** @stats: Recorded retirement latency stats. */
	struct stats stats;
	/** @last: Last retirement latency read. */
	uint64_t last;
	/** @started: Has the event been sent to perf record? */
	bool started;
};
54 
/* One-time initializer for tpebs_mtx; run through pthread_once() by tpebs_mtx_get(). */
static void tpebs_mtx_init(void)
{
	mutex_init(&tpebs_mtx);
}
59 
60 static struct mutex *tpebs_mtx_get(void)
61 {
62 	static pthread_once_t tpebs_mtx_once = PTHREAD_ONCE_INIT;
63 
64 	pthread_once(&tpebs_mtx_once, tpebs_mtx_init);
65 	return &tpebs_mtx;
66 }
67 
/*
 * Forward declaration: the lookup is used by process_sample_event() before its
 * definition. Callers must hold the lock returned by tpebs_mtx_get().
 */
static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel)
	EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get());
70 
71 static int evsel__tpebs_start_perf_record(struct evsel *evsel)
72 {
73 	const char **record_argv;
74 	int tpebs_event_size = 0, i = 0, ret;
75 	char control_fd_buf[32];
76 	char cpumap_buf[50];
77 	struct tpebs_retire_lat *t;
78 
79 	list_for_each_entry(t, &tpebs_results, nd)
80 		tpebs_event_size++;
81 
82 	record_argv = malloc((10 + 2 * tpebs_event_size) * sizeof(*record_argv));
83 	if (!record_argv)
84 		return -ENOMEM;
85 
86 	record_argv[i++] = "perf";
87 	record_argv[i++] = "record";
88 	record_argv[i++] = "-W";
89 	record_argv[i++] = "--synth=no";
90 
91 	scnprintf(control_fd_buf, sizeof(control_fd_buf), "--control=fd:%d,%d",
92 		  control_fd[0], ack_fd[1]);
93 	record_argv[i++] = control_fd_buf;
94 
95 	record_argv[i++] = "-o";
96 	record_argv[i++] = PERF_DATA;
97 
98 	if (!perf_cpu_map__is_any_cpu_or_is_empty(evsel->evlist->core.user_requested_cpus)) {
99 		cpu_map__snprint(evsel->evlist->core.user_requested_cpus, cpumap_buf,
100 				 sizeof(cpumap_buf));
101 		record_argv[i++] = "-C";
102 		record_argv[i++] = cpumap_buf;
103 	}
104 
105 	list_for_each_entry(t, &tpebs_results, nd) {
106 		record_argv[i++] = "-e";
107 		record_argv[i++] = t->event;
108 	}
109 	record_argv[i++] = NULL;
110 	assert(i == 10 + 2 * tpebs_event_size || i == 8 + 2 * tpebs_event_size);
111 	/* Note, no workload given so system wide is implied. */
112 
113 	assert(tpebs_cmd.pid == 0);
114 	tpebs_cmd.argv = record_argv;
115 	tpebs_cmd.out = -1;
116 	ret = start_command(&tpebs_cmd);
117 	zfree(&tpebs_cmd.argv);
118 	list_for_each_entry(t, &tpebs_results, nd)
119 		t->started = true;
120 
121 	return ret;
122 }
123 
124 static bool is_child_pid(pid_t parent, pid_t child)
125 {
126 	if (parent < 0 || child < 0)
127 		return false;
128 
129 	while (true) {
130 		char path[PATH_MAX];
131 		char line[256];
132 		FILE *fp;
133 
134 new_child:
135 		if (parent == child)
136 			return true;
137 
138 		if (child <= 0)
139 			return false;
140 
141 		scnprintf(path, sizeof(path), "%s/%d/status", procfs__mountpoint(), child);
142 		fp = fopen(path, "r");
143 		if (!fp) {
144 			/* Presumably the process went away. Assume not a child. */
145 			return false;
146 		}
147 		while (fgets(line, sizeof(line), fp) != NULL) {
148 			if (strncmp(line, "PPid:", 5) == 0) {
149 				fclose(fp);
150 				if (sscanf(line + 5, "%d", &child) != 1) {
151 					/* Unexpected error parsing. */
152 					return false;
153 				}
154 				goto new_child;
155 			}
156 		}
157 		/* Unexpected EOF. */
158 		fclose(fp);
159 		return false;
160 	}
161 }
162 
163 static bool should_ignore_sample(const struct perf_sample *sample, const struct tpebs_retire_lat *t)
164 {
165 	pid_t workload_pid, sample_pid = sample->pid;
166 
167 	/*
168 	 * During evlist__purge the evlist will be removed prior to the
169 	 * evsel__exit calling evsel__tpebs_close and taking the
170 	 * tpebs_mtx. Avoid a segfault by ignoring samples in this case.
171 	 */
172 	if (t->evsel->evlist == NULL)
173 		return true;
174 
175 	workload_pid = t->evsel->evlist->workload.pid;
176 	if (workload_pid < 0 || workload_pid == sample_pid)
177 		return false;
178 
179 	if (!t->evsel->core.attr.inherit)
180 		return true;
181 
182 	return !is_child_pid(workload_pid, sample_pid);
183 }
184 
185 static int process_sample_event(const struct perf_tool *tool __maybe_unused,
186 				union perf_event *event __maybe_unused,
187 				struct perf_sample *sample,
188 				struct evsel *evsel,
189 				struct machine *machine __maybe_unused)
190 {
191 	struct tpebs_retire_lat *t;
192 
193 	mutex_lock(tpebs_mtx_get());
194 	if (tpebs_cmd.pid == 0) {
195 		/* Record has terminated. */
196 		mutex_unlock(tpebs_mtx_get());
197 		return 0;
198 	}
199 	t = tpebs_retire_lat__find(evsel);
200 	if (!t) {
201 		mutex_unlock(tpebs_mtx_get());
202 		return -EINVAL;
203 	}
204 	if (should_ignore_sample(sample, t)) {
205 		mutex_unlock(tpebs_mtx_get());
206 		return 0;
207 	}
208 	/*
209 	 * Need to handle per core results? We are assuming average retire
210 	 * latency value will be used. Save the number of samples and the sum of
211 	 * retire latency value for each event.
212 	 */
213 	t->last = sample->retire_lat;
214 	update_stats(&t->stats, sample->retire_lat);
215 	mutex_unlock(tpebs_mtx_get());
216 	return 0;
217 }
218 
219 static int process_feature_event(struct perf_session *session,
220 				 union perf_event *event)
221 {
222 	if (event->feat.feat_id < HEADER_LAST_FEATURE)
223 		return perf_event__process_feature(session, event);
224 	return 0;
225 }
226 
227 static void *__sample_reader(void *arg __maybe_unused)
228 {
229 	struct perf_session *session;
230 	struct perf_data data = {
231 		.mode = PERF_DATA_MODE_READ,
232 		.path = PERF_DATA,
233 		.file.fd = tpebs_cmd.out,
234 	};
235 	struct perf_tool tool;
236 
237 	perf_tool__init(&tool, /*ordered_events=*/false);
238 	tool.sample = process_sample_event;
239 	tool.feature = process_feature_event;
240 	tool.attr = perf_event__process_attr;
241 
242 	session = perf_session__new(&data, &tool);
243 	if (IS_ERR(session))
244 		return NULL;
245 	perf_session__process_events(session);
246 	perf_session__delete(session);
247 
248 	return NULL;
249 }
250 
251 static int tpebs_send_record_cmd(const char *msg) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get())
252 {
253 	struct pollfd pollfd = { .events = POLLIN, };
254 	int ret, len, retries = 0;
255 	char ack_buf[8];
256 
257 	/* Check if the command exited before the send, done with the lock held. */
258 	if (tpebs_cmd.pid == 0)
259 		return 0;
260 
261 	/*
262 	 * Let go of the lock while sending/receiving as blocking can starve the
263 	 * sample reading thread.
264 	 */
265 	mutex_unlock(tpebs_mtx_get());
266 
267 	/* Send perf record command.*/
268 	len = strlen(msg);
269 	ret = write(control_fd[1], msg, len);
270 	if (ret != len) {
271 		pr_err("perf record control write control message '%s' failed\n", msg);
272 		ret = -EPIPE;
273 		goto out;
274 	}
275 
276 	if (!strcmp(msg, EVLIST_CTL_CMD_STOP_TAG)) {
277 		ret = 0;
278 		goto out;
279 	}
280 
281 	/* Wait for an ack. */
282 	pollfd.fd = ack_fd[0];
283 
284 	/*
285 	 * We need this poll to ensure the ack_fd PIPE will not hang
286 	 * when perf record failed for any reason. The timeout value
287 	 * 3000ms is an empirical selection.
288 	 */
289 again:
290 	if (!poll(&pollfd, 1, 500)) {
291 		if (check_if_command_finished(&tpebs_cmd)) {
292 			ret = 0;
293 			goto out;
294 		}
295 
296 		if (retries++ < 6)
297 			goto again;
298 		pr_err("tpebs failed: perf record ack timeout for '%s'\n", msg);
299 		ret = -ETIMEDOUT;
300 		goto out;
301 	}
302 
303 	if (!(pollfd.revents & POLLIN)) {
304 		if (check_if_command_finished(&tpebs_cmd)) {
305 			ret = 0;
306 			goto out;
307 		}
308 
309 		pr_err("tpebs failed: did not received an ack for '%s'\n", msg);
310 		ret = -EPIPE;
311 		goto out;
312 	}
313 
314 	ret = read(ack_fd[0], ack_buf, sizeof(ack_buf));
315 	if (ret > 0)
316 		ret = strcmp(ack_buf, EVLIST_CTL_CMD_ACK_TAG);
317 	else
318 		pr_err("tpebs: perf record control ack failed\n");
319 out:
320 	/* Re-take lock as expected by caller. */
321 	mutex_lock(tpebs_mtx_get());
322 	return ret;
323 }
324 
325 /*
326  * tpebs_stop - stop the sample data read thread and the perf record process.
327  */
328 static int tpebs_stop(void) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get())
329 {
330 	int ret = 0;
331 
332 	/* Like tpebs_start, we should only run tpebs_end once. */
333 	if (tpebs_cmd.pid != 0) {
334 		tpebs_send_record_cmd(EVLIST_CTL_CMD_STOP_TAG);
335 		tpebs_cmd.pid = 0;
336 		mutex_unlock(tpebs_mtx_get());
337 		pthread_join(tpebs_reader_thread, NULL);
338 		mutex_lock(tpebs_mtx_get());
339 		close(control_fd[0]);
340 		close(control_fd[1]);
341 		close(ack_fd[0]);
342 		close(ack_fd[1]);
343 		close(tpebs_cmd.out);
344 		ret = finish_command(&tpebs_cmd);
345 		tpebs_cmd.pid = 0;
346 		if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
347 			ret = 0;
348 	}
349 	return ret;
350 }
351 
352 /**
353  * evsel__tpebs_event() - Create string event encoding to pass to `perf record`.
354  */
355 static int evsel__tpebs_event(struct evsel *evsel, char **event)
356 {
357 	char *name, *modifier;
358 	int ret;
359 
360 	name = strdup(evsel->name);
361 	if (!name)
362 		return -ENOMEM;
363 
364 	modifier = strrchr(name, 'R');
365 	if (!modifier) {
366 		ret = -EINVAL;
367 		goto out;
368 	}
369 	*modifier = 'p';
370 	modifier = strchr(name, ':');
371 	if (!modifier)
372 		modifier = strrchr(name, '/');
373 	if (!modifier) {
374 		ret = -EINVAL;
375 		goto out;
376 	}
377 	*modifier = '\0';
378 	if (asprintf(event, "%s/name=tpebs_event_%p/%s", name, evsel, modifier + 1) > 0)
379 		ret = 0;
380 	else
381 		ret = -ENOMEM;
382 out:
383 	if (ret)
384 		pr_err("Tpebs event modifier broken '%s'\n", evsel->name);
385 	free(name);
386 	return ret;
387 }
388 
389 static struct tpebs_retire_lat *tpebs_retire_lat__new(struct evsel *evsel)
390 {
391 	struct tpebs_retire_lat *result = zalloc(sizeof(*result));
392 	int ret;
393 
394 	if (!result)
395 		return NULL;
396 
397 	ret = evsel__tpebs_event(evsel, &result->event);
398 	if (ret) {
399 		free(result);
400 		return NULL;
401 	}
402 	result->evsel = evsel;
403 	return result;
404 }
405 
/*
 * Free an entry and its event string. The entry is not unlinked here; callers
 * in this file remove it from tpebs_results first. @r must not be NULL.
 */
static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r)
{
	zfree(&r->event);
	free(r);
}
411 
412 static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel)
413 {
414 	struct tpebs_retire_lat *t;
415 	unsigned long num;
416 	const char *evsel_name;
417 
418 	/*
419 	 * Evsels will match for evlist with the retirement latency event. The
420 	 * name with "tpebs_event_" prefix will be present on events being read
421 	 * from `perf record`.
422 	 */
423 	if (evsel__is_retire_lat(evsel)) {
424 		list_for_each_entry(t, &tpebs_results, nd) {
425 			if (t->evsel == evsel)
426 				return t;
427 		}
428 		return NULL;
429 	}
430 	evsel_name = strstr(evsel->name, "tpebs_event_");
431 	if (!evsel_name) {
432 		/* Unexpected that the perf record should have other events. */
433 		return NULL;
434 	}
435 	errno = 0;
436 	num = strtoull(evsel_name + 12, NULL, 16);
437 	if (errno) {
438 		pr_err("Bad evsel for tpebs find '%s'\n", evsel->name);
439 		return NULL;
440 	}
441 	list_for_each_entry(t, &tpebs_results, nd) {
442 		if ((unsigned long)t->evsel == num)
443 			return t;
444 	}
445 	return NULL;
446 }
447 
448 /**
449  * evsel__tpebs_prepare - create tpebs data structures ready for opening.
450  * @evsel: retire_latency evsel, all evsels on its list will be prepared.
451  */
452 static int evsel__tpebs_prepare(struct evsel *evsel)
453 {
454 	struct evsel *pos;
455 	struct tpebs_retire_lat *tpebs_event;
456 
457 	mutex_lock(tpebs_mtx_get());
458 	tpebs_event = tpebs_retire_lat__find(evsel);
459 	if (tpebs_event) {
460 		/* evsel, or an identically named one, was already prepared. */
461 		mutex_unlock(tpebs_mtx_get());
462 		return 0;
463 	}
464 	tpebs_event = tpebs_retire_lat__new(evsel);
465 	if (!tpebs_event) {
466 		mutex_unlock(tpebs_mtx_get());
467 		return -ENOMEM;
468 	}
469 	list_add_tail(&tpebs_event->nd, &tpebs_results);
470 	mutex_unlock(tpebs_mtx_get());
471 
472 	/*
473 	 * Eagerly prepare all other evsels on the list to try to ensure that by
474 	 * open they are all known.
475 	 */
476 	evlist__for_each_entry(evsel->evlist, pos) {
477 		int ret;
478 
479 		if (pos == evsel || !pos->retire_lat)
480 			continue;
481 
482 		ret = evsel__tpebs_prepare(pos);
483 		if (ret)
484 			return ret;
485 	}
486 	return 0;
487 }
488 
489 /**
490  * evsel__tpebs_open - starts tpebs execution.
491  * @evsel: retire_latency evsel, all evsels on its list will be selected. Each
492  *         evsel is sampled to get the average retire_latency value.
493  */
494 int evsel__tpebs_open(struct evsel *evsel)
495 {
496 	int ret;
497 	bool tpebs_empty;
498 
499 	/* We should only run tpebs_start when tpebs_recording is enabled. */
500 	if (!tpebs_recording)
501 		return 0;
502 	/* Only start the events once. */
503 	if (tpebs_cmd.pid != 0) {
504 		struct tpebs_retire_lat *t;
505 		bool valid;
506 
507 		mutex_lock(tpebs_mtx_get());
508 		t = tpebs_retire_lat__find(evsel);
509 		valid = t && t->started;
510 		mutex_unlock(tpebs_mtx_get());
511 		/* May fail as the event wasn't started. */
512 		return valid ? 0 : -EBUSY;
513 	}
514 
515 	ret = evsel__tpebs_prepare(evsel);
516 	if (ret)
517 		return ret;
518 
519 	mutex_lock(tpebs_mtx_get());
520 	tpebs_empty = list_empty(&tpebs_results);
521 	if (!tpebs_empty) {
522 		/*Create control and ack fd for --control*/
523 		if (pipe(control_fd) < 0) {
524 			pr_err("tpebs: Failed to create control fifo");
525 			ret = -1;
526 			goto out;
527 		}
528 		if (pipe(ack_fd) < 0) {
529 			pr_err("tpebs: Failed to create control fifo");
530 			ret = -1;
531 			goto out;
532 		}
533 
534 		ret = evsel__tpebs_start_perf_record(evsel);
535 		if (ret)
536 			goto out;
537 
538 		if (pthread_create(&tpebs_reader_thread, /*attr=*/NULL, __sample_reader,
539 				   /*arg=*/NULL)) {
540 			kill(tpebs_cmd.pid, SIGTERM);
541 			close(tpebs_cmd.out);
542 			pr_err("Could not create thread to process sample data.\n");
543 			ret = -1;
544 			goto out;
545 		}
546 		ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_ENABLE_TAG);
547 	}
548 out:
549 	if (ret) {
550 		struct tpebs_retire_lat *t = tpebs_retire_lat__find(evsel);
551 
552 		list_del_init(&t->nd);
553 		tpebs_retire_lat__delete(t);
554 	}
555 	mutex_unlock(tpebs_mtx_get());
556 	return ret;
557 }
558 
559 int evsel__tpebs_read(struct evsel *evsel, int cpu_map_idx, int thread)
560 {
561 	struct perf_counts_values *count, *old_count = NULL;
562 	struct tpebs_retire_lat *t;
563 	uint64_t val;
564 	int ret;
565 
566 	/* Only set retire_latency value to the first CPU and thread. */
567 	if (cpu_map_idx != 0 || thread != 0)
568 		return 0;
569 
570 	if (evsel->prev_raw_counts)
571 		old_count = perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread);
572 
573 	count = perf_counts(evsel->counts, cpu_map_idx, thread);
574 
575 	mutex_lock(tpebs_mtx_get());
576 	t = tpebs_retire_lat__find(evsel);
577 	/*
578 	 * If reading the first tpebs result, send a ping to the record
579 	 * process. Allow the sample reader a chance to read by releasing and
580 	 * reacquiring the lock.
581 	 */
582 	if (t && &t->nd == tpebs_results.next) {
583 		ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_PING_TAG);
584 		mutex_unlock(tpebs_mtx_get());
585 		if (ret)
586 			return ret;
587 		mutex_lock(tpebs_mtx_get());
588 	}
589 	if (t == NULL || t->stats.n == 0) {
590 		/* No sample data, use default. */
591 		if (tpebs_recording) {
592 			pr_warning_once(
593 				"Using precomputed retirement latency data as no samples\n");
594 		}
595 		val = 0;
596 		switch (tpebs_mode) {
597 		case TPEBS_MODE__MIN:
598 			val = rint(evsel->retirement_latency.min);
599 			break;
600 		case TPEBS_MODE__MAX:
601 			val = rint(evsel->retirement_latency.max);
602 			break;
603 		default:
604 		case TPEBS_MODE__LAST:
605 		case TPEBS_MODE__MEAN:
606 			val = rint(evsel->retirement_latency.mean);
607 			break;
608 		}
609 	} else {
610 		switch (tpebs_mode) {
611 		case TPEBS_MODE__MIN:
612 			val = t->stats.min;
613 			break;
614 		case TPEBS_MODE__MAX:
615 			val = t->stats.max;
616 			break;
617 		case TPEBS_MODE__LAST:
618 			val = t->last;
619 			break;
620 		default:
621 		case TPEBS_MODE__MEAN:
622 			val = rint(t->stats.mean);
623 			break;
624 		}
625 	}
626 	mutex_unlock(tpebs_mtx_get());
627 
628 	if (old_count) {
629 		count->val = old_count->val + val;
630 		count->run = old_count->run + 1;
631 		count->ena = old_count->ena + 1;
632 	} else {
633 		count->val = val;
634 		count->run++;
635 		count->ena++;
636 	}
637 	return 0;
638 }
639 
640 /**
641  * evsel__tpebs_close() - delete tpebs related data. If the last event, stop the
642  * created thread and process by calling tpebs_stop().
643  *
644  * This function is called in evsel__close() to be symmetric with
645  * evsel__tpebs_open() being called in evsel__open().
646  */
647 void evsel__tpebs_close(struct evsel *evsel)
648 {
649 	struct tpebs_retire_lat *t;
650 
651 	mutex_lock(tpebs_mtx_get());
652 	t = tpebs_retire_lat__find(evsel);
653 	if (t) {
654 		list_del_init(&t->nd);
655 		tpebs_retire_lat__delete(t);
656 
657 		if (list_empty(&tpebs_results))
658 			tpebs_stop();
659 	}
660 	mutex_unlock(tpebs_mtx_get());
661 }
662