xref: /linux/tools/perf/util/intel-tpebs.c (revision 735a3ac37012bbb6bb96145e90e9281beb7c27f2)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * intel_tpebs.c: Intel TPEBS support
4  */
5 
6 #include <api/fs/fs.h>
7 #include <sys/param.h>
8 #include <subcmd/run-command.h>
9 #include <thread.h>
10 #include "intel-tpebs.h"
11 #include <linux/list.h>
12 #include <linux/zalloc.h>
13 #include <linux/err.h>
14 #include "sample.h"
15 #include "counts.h"
16 #include "debug.h"
17 #include "evlist.h"
18 #include "evsel.h"
19 #include "mutex.h"
20 #include "session.h"
21 #include "stat.h"
22 #include "tool.h"
23 #include "cpumap.h"
24 #include "metricgroup.h"
25 #include "stat.h"
26 #include <sys/stat.h>
27 #include <sys/file.h>
28 #include <poll.h>
29 #include <math.h>
30 
/*
 * Path given to `perf record -o` and used as the perf_data path of the session
 * that reads the samples back.
 */
#define PERF_DATA		"-"

/* When false evsel__tpebs_open() is a no-op and no `perf record` is started. */
bool tpebs_recording;
/* Selects which statistic (min/max/last/mean) evsel__tpebs_read() reports. */
enum tpebs_mode tpebs_mode;
/* List of struct tpebs_retire_lat linked through their nd member. */
static LIST_HEAD(tpebs_results);
/* Thread running __sample_reader(), joined in tpebs_stop(). */
static pthread_t tpebs_reader_thread;
/* The `perf record` child; a pid of 0 is treated as "not recording". */
static struct child_process tpebs_cmd;
/* Pipes passed to `perf record --control`: commands go out, acks come back. */
static int control_fd[2], ack_fd[2];
/* Guards the state above; always obtained through tpebs_mtx_get(). */
static struct mutex tpebs_mtx;
40 
/**
 * struct tpebs_retire_lat - Retirement latency bookkeeping for one evsel.
 *
 * Entries live on the tpebs_results list.
 */
struct tpebs_retire_lat {
	/** @nd: Node linking the entry into tpebs_results. */
	struct list_head nd;
	/** @evsel: The evsel that opened the retire_lat event. */
	struct evsel *evsel;
	/** @event: Event passed to perf record. */
	char *event;
	/** @stats: Recorded retirement latency stats. */
	struct stats stats;
	/** @last: Last retirement latency read. */
	uint64_t last;
	/** @started: Has the event been sent to perf record? */
	bool started;
};
54 
/* pthread_once() callback used by tpebs_mtx_get() to initialize tpebs_mtx. */
static void tpebs_mtx_init(void)
{
	mutex_init(&tpebs_mtx);
}
59 
60 static struct mutex *tpebs_mtx_get(void)
61 {
62 	static pthread_once_t tpebs_mtx_once = PTHREAD_ONCE_INIT;
63 
64 	pthread_once(&tpebs_mtx_once, tpebs_mtx_init);
65 	return &tpebs_mtx;
66 }
67 
/*
 * Forward declaration: the definition appears later in the file but
 * process_sample_event() needs it earlier. Callers must hold the tpebs mutex.
 */
static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel)
	EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get());
70 
71 static int evsel__tpebs_start_perf_record(struct evsel *evsel)
72 {
73 	const char **record_argv;
74 	int tpebs_event_size = 0, i = 0, ret;
75 	char control_fd_buf[32];
76 	char cpumap_buf[50];
77 	struct tpebs_retire_lat *t;
78 
79 	list_for_each_entry(t, &tpebs_results, nd)
80 		tpebs_event_size++;
81 
82 	record_argv = malloc((10 + 2 * tpebs_event_size) * sizeof(*record_argv));
83 	if (!record_argv)
84 		return -ENOMEM;
85 
86 	record_argv[i++] = "perf";
87 	record_argv[i++] = "record";
88 	record_argv[i++] = "-W";
89 	record_argv[i++] = "--synth=no";
90 
91 	scnprintf(control_fd_buf, sizeof(control_fd_buf), "--control=fd:%d,%d",
92 		  control_fd[0], ack_fd[1]);
93 	record_argv[i++] = control_fd_buf;
94 
95 	record_argv[i++] = "-o";
96 	record_argv[i++] = PERF_DATA;
97 
98 	if (!perf_cpu_map__is_any_cpu_or_is_empty(evsel->evlist->core.user_requested_cpus)) {
99 		cpu_map__snprint(evsel->evlist->core.user_requested_cpus, cpumap_buf,
100 				 sizeof(cpumap_buf));
101 		record_argv[i++] = "-C";
102 		record_argv[i++] = cpumap_buf;
103 	}
104 
105 	list_for_each_entry(t, &tpebs_results, nd) {
106 		record_argv[i++] = "-e";
107 		record_argv[i++] = t->event;
108 	}
109 	record_argv[i++] = NULL;
110 	assert(i == 10 + 2 * tpebs_event_size || i == 8 + 2 * tpebs_event_size);
111 	/* Note, no workload given so system wide is implied. */
112 
113 	assert(tpebs_cmd.pid == 0);
114 	tpebs_cmd.argv = record_argv;
115 	tpebs_cmd.out = -1;
116 	ret = start_command(&tpebs_cmd);
117 	zfree(&tpebs_cmd.argv);
118 	list_for_each_entry(t, &tpebs_results, nd)
119 		t->started = true;
120 
121 	return ret;
122 }
123 
124 static bool is_child_pid(pid_t parent, pid_t child)
125 {
126 	if (parent < 0 || child < 0)
127 		return false;
128 
129 	while (true) {
130 		char path[PATH_MAX];
131 		char line[256];
132 		FILE *fp;
133 
134 new_child:
135 		if (parent == child)
136 			return true;
137 
138 		if (child <= 0)
139 			return false;
140 
141 		scnprintf(path, sizeof(path), "%s/%d/status", procfs__mountpoint(), child);
142 		fp = fopen(path, "r");
143 		if (!fp) {
144 			/* Presumably the process went away. Assume not a child. */
145 			return false;
146 		}
147 		while (fgets(line, sizeof(line), fp) != NULL) {
148 			if (strncmp(line, "PPid:", 5) == 0) {
149 				fclose(fp);
150 				if (sscanf(line + 5, "%d", &child) != 1) {
151 					/* Unexpected error parsing. */
152 					return false;
153 				}
154 				goto new_child;
155 			}
156 		}
157 		/* Unexpected EOF. */
158 		fclose(fp);
159 		return false;
160 	}
161 }
162 
163 static bool should_ignore_sample(const struct perf_sample *sample, const struct tpebs_retire_lat *t)
164 {
165 	pid_t workload_pid = t->evsel->evlist->workload.pid;
166 	pid_t sample_pid = sample->pid;
167 
168 	if (workload_pid < 0 || workload_pid == sample_pid)
169 		return false;
170 
171 	if (!t->evsel->core.attr.inherit)
172 		return true;
173 
174 	return !is_child_pid(workload_pid, sample_pid);
175 }
176 
177 static int process_sample_event(const struct perf_tool *tool __maybe_unused,
178 				union perf_event *event __maybe_unused,
179 				struct perf_sample *sample,
180 				struct evsel *evsel,
181 				struct machine *machine __maybe_unused)
182 {
183 	struct tpebs_retire_lat *t;
184 
185 	mutex_lock(tpebs_mtx_get());
186 	if (tpebs_cmd.pid == 0) {
187 		/* Record has terminated. */
188 		mutex_unlock(tpebs_mtx_get());
189 		return 0;
190 	}
191 	t = tpebs_retire_lat__find(evsel);
192 	if (!t) {
193 		mutex_unlock(tpebs_mtx_get());
194 		return -EINVAL;
195 	}
196 	if (should_ignore_sample(sample, t)) {
197 		mutex_unlock(tpebs_mtx_get());
198 		return 0;
199 	}
200 	/*
201 	 * Need to handle per core results? We are assuming average retire
202 	 * latency value will be used. Save the number of samples and the sum of
203 	 * retire latency value for each event.
204 	 */
205 	t->last = sample->retire_lat;
206 	update_stats(&t->stats, sample->retire_lat);
207 	mutex_unlock(tpebs_mtx_get());
208 	return 0;
209 }
210 
211 static int process_feature_event(struct perf_session *session,
212 				 union perf_event *event)
213 {
214 	if (event->feat.feat_id < HEADER_LAST_FEATURE)
215 		return perf_event__process_feature(session, event);
216 	return 0;
217 }
218 
219 static void *__sample_reader(void *arg __maybe_unused)
220 {
221 	struct perf_session *session;
222 	struct perf_data data = {
223 		.mode = PERF_DATA_MODE_READ,
224 		.path = PERF_DATA,
225 		.file.fd = tpebs_cmd.out,
226 	};
227 	struct perf_tool tool;
228 
229 	perf_tool__init(&tool, /*ordered_events=*/false);
230 	tool.sample = process_sample_event;
231 	tool.feature = process_feature_event;
232 	tool.attr = perf_event__process_attr;
233 
234 	session = perf_session__new(&data, &tool);
235 	if (IS_ERR(session))
236 		return NULL;
237 	perf_session__process_events(session);
238 	perf_session__delete(session);
239 
240 	return NULL;
241 }
242 
243 static int tpebs_send_record_cmd(const char *msg) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get())
244 {
245 	struct pollfd pollfd = { .events = POLLIN, };
246 	int ret, len, retries = 0;
247 	char ack_buf[8];
248 
249 	/* Check if the command exited before the send, done with the lock held. */
250 	if (tpebs_cmd.pid == 0)
251 		return 0;
252 
253 	/*
254 	 * Let go of the lock while sending/receiving as blocking can starve the
255 	 * sample reading thread.
256 	 */
257 	mutex_unlock(tpebs_mtx_get());
258 
259 	/* Send perf record command.*/
260 	len = strlen(msg);
261 	ret = write(control_fd[1], msg, len);
262 	if (ret != len) {
263 		pr_err("perf record control write control message '%s' failed\n", msg);
264 		ret = -EPIPE;
265 		goto out;
266 	}
267 
268 	if (!strcmp(msg, EVLIST_CTL_CMD_STOP_TAG)) {
269 		ret = 0;
270 		goto out;
271 	}
272 
273 	/* Wait for an ack. */
274 	pollfd.fd = ack_fd[0];
275 
276 	/*
277 	 * We need this poll to ensure the ack_fd PIPE will not hang
278 	 * when perf record failed for any reason. The timeout value
279 	 * 3000ms is an empirical selection.
280 	 */
281 again:
282 	if (!poll(&pollfd, 1, 500)) {
283 		if (check_if_command_finished(&tpebs_cmd)) {
284 			ret = 0;
285 			goto out;
286 		}
287 
288 		if (retries++ < 6)
289 			goto again;
290 		pr_err("tpebs failed: perf record ack timeout for '%s'\n", msg);
291 		ret = -ETIMEDOUT;
292 		goto out;
293 	}
294 
295 	if (!(pollfd.revents & POLLIN)) {
296 		if (check_if_command_finished(&tpebs_cmd)) {
297 			ret = 0;
298 			goto out;
299 		}
300 
301 		pr_err("tpebs failed: did not received an ack for '%s'\n", msg);
302 		ret = -EPIPE;
303 		goto out;
304 	}
305 
306 	ret = read(ack_fd[0], ack_buf, sizeof(ack_buf));
307 	if (ret > 0)
308 		ret = strcmp(ack_buf, EVLIST_CTL_CMD_ACK_TAG);
309 	else
310 		pr_err("tpebs: perf record control ack failed\n");
311 out:
312 	/* Re-take lock as expected by caller. */
313 	mutex_lock(tpebs_mtx_get());
314 	return ret;
315 }
316 
317 /*
318  * tpebs_stop - stop the sample data read thread and the perf record process.
319  */
320 static int tpebs_stop(void) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get())
321 {
322 	int ret = 0;
323 
324 	/* Like tpebs_start, we should only run tpebs_end once. */
325 	if (tpebs_cmd.pid != 0) {
326 		tpebs_send_record_cmd(EVLIST_CTL_CMD_STOP_TAG);
327 		tpebs_cmd.pid = 0;
328 		mutex_unlock(tpebs_mtx_get());
329 		pthread_join(tpebs_reader_thread, NULL);
330 		mutex_lock(tpebs_mtx_get());
331 		close(control_fd[0]);
332 		close(control_fd[1]);
333 		close(ack_fd[0]);
334 		close(ack_fd[1]);
335 		close(tpebs_cmd.out);
336 		ret = finish_command(&tpebs_cmd);
337 		tpebs_cmd.pid = 0;
338 		if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
339 			ret = 0;
340 	}
341 	return ret;
342 }
343 
344 /**
345  * evsel__tpebs_event() - Create string event encoding to pass to `perf record`.
346  */
347 static int evsel__tpebs_event(struct evsel *evsel, char **event)
348 {
349 	char *name, *modifier;
350 	int ret;
351 
352 	name = strdup(evsel->name);
353 	if (!name)
354 		return -ENOMEM;
355 
356 	modifier = strrchr(name, 'R');
357 	if (!modifier) {
358 		ret = -EINVAL;
359 		goto out;
360 	}
361 	*modifier = 'p';
362 	modifier = strchr(name, ':');
363 	if (!modifier)
364 		modifier = strrchr(name, '/');
365 	if (!modifier) {
366 		ret = -EINVAL;
367 		goto out;
368 	}
369 	*modifier = '\0';
370 	if (asprintf(event, "%s/name=tpebs_event_%p/%s", name, evsel, modifier + 1) > 0)
371 		ret = 0;
372 	else
373 		ret = -ENOMEM;
374 out:
375 	if (ret)
376 		pr_err("Tpebs event modifier broken '%s'\n", evsel->name);
377 	free(name);
378 	return ret;
379 }
380 
381 static struct tpebs_retire_lat *tpebs_retire_lat__new(struct evsel *evsel)
382 {
383 	struct tpebs_retire_lat *result = zalloc(sizeof(*result));
384 	int ret;
385 
386 	if (!result)
387 		return NULL;
388 
389 	ret = evsel__tpebs_event(evsel, &result->event);
390 	if (ret) {
391 		free(result);
392 		return NULL;
393 	}
394 	result->evsel = evsel;
395 	return result;
396 }
397 
/*
 * Free a tpebs entry and its event string. The entry is not unlinked here; the
 * callers in this file remove it from tpebs_results first.
 */
static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r)
{
	zfree(&r->event);
	free(r);
}
403 
404 static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel)
405 {
406 	struct tpebs_retire_lat *t;
407 	unsigned long num;
408 	const char *evsel_name;
409 
410 	/*
411 	 * Evsels will match for evlist with the retirement latency event. The
412 	 * name with "tpebs_event_" prefix will be present on events being read
413 	 * from `perf record`.
414 	 */
415 	if (evsel__is_retire_lat(evsel)) {
416 		list_for_each_entry(t, &tpebs_results, nd) {
417 			if (t->evsel == evsel)
418 				return t;
419 		}
420 		return NULL;
421 	}
422 	evsel_name = strstr(evsel->name, "tpebs_event_");
423 	if (!evsel_name) {
424 		/* Unexpected that the perf record should have other events. */
425 		return NULL;
426 	}
427 	errno = 0;
428 	num = strtoull(evsel_name + 12, NULL, 16);
429 	if (errno) {
430 		pr_err("Bad evsel for tpebs find '%s'\n", evsel->name);
431 		return NULL;
432 	}
433 	list_for_each_entry(t, &tpebs_results, nd) {
434 		if ((unsigned long)t->evsel == num)
435 			return t;
436 	}
437 	return NULL;
438 }
439 
440 /**
441  * evsel__tpebs_prepare - create tpebs data structures ready for opening.
442  * @evsel: retire_latency evsel, all evsels on its list will be prepared.
443  */
444 static int evsel__tpebs_prepare(struct evsel *evsel)
445 {
446 	struct evsel *pos;
447 	struct tpebs_retire_lat *tpebs_event;
448 
449 	mutex_lock(tpebs_mtx_get());
450 	tpebs_event = tpebs_retire_lat__find(evsel);
451 	if (tpebs_event) {
452 		/* evsel, or an identically named one, was already prepared. */
453 		mutex_unlock(tpebs_mtx_get());
454 		return 0;
455 	}
456 	tpebs_event = tpebs_retire_lat__new(evsel);
457 	if (!tpebs_event) {
458 		mutex_unlock(tpebs_mtx_get());
459 		return -ENOMEM;
460 	}
461 	list_add_tail(&tpebs_event->nd, &tpebs_results);
462 	mutex_unlock(tpebs_mtx_get());
463 
464 	/*
465 	 * Eagerly prepare all other evsels on the list to try to ensure that by
466 	 * open they are all known.
467 	 */
468 	evlist__for_each_entry(evsel->evlist, pos) {
469 		int ret;
470 
471 		if (pos == evsel || !pos->retire_lat)
472 			continue;
473 
474 		ret = evsel__tpebs_prepare(pos);
475 		if (ret)
476 			return ret;
477 	}
478 	return 0;
479 }
480 
481 /**
482  * evsel__tpebs_open - starts tpebs execution.
483  * @evsel: retire_latency evsel, all evsels on its list will be selected. Each
484  *         evsel is sampled to get the average retire_latency value.
485  */
486 int evsel__tpebs_open(struct evsel *evsel)
487 {
488 	int ret;
489 	bool tpebs_empty;
490 
491 	/* We should only run tpebs_start when tpebs_recording is enabled. */
492 	if (!tpebs_recording)
493 		return 0;
494 	/* Only start the events once. */
495 	if (tpebs_cmd.pid != 0) {
496 		struct tpebs_retire_lat *t;
497 		bool valid;
498 
499 		mutex_lock(tpebs_mtx_get());
500 		t = tpebs_retire_lat__find(evsel);
501 		valid = t && t->started;
502 		mutex_unlock(tpebs_mtx_get());
503 		/* May fail as the event wasn't started. */
504 		return valid ? 0 : -EBUSY;
505 	}
506 
507 	ret = evsel__tpebs_prepare(evsel);
508 	if (ret)
509 		return ret;
510 
511 	mutex_lock(tpebs_mtx_get());
512 	tpebs_empty = list_empty(&tpebs_results);
513 	if (!tpebs_empty) {
514 		/*Create control and ack fd for --control*/
515 		if (pipe(control_fd) < 0) {
516 			pr_err("tpebs: Failed to create control fifo");
517 			ret = -1;
518 			goto out;
519 		}
520 		if (pipe(ack_fd) < 0) {
521 			pr_err("tpebs: Failed to create control fifo");
522 			ret = -1;
523 			goto out;
524 		}
525 
526 		ret = evsel__tpebs_start_perf_record(evsel);
527 		if (ret)
528 			goto out;
529 
530 		if (pthread_create(&tpebs_reader_thread, /*attr=*/NULL, __sample_reader,
531 				   /*arg=*/NULL)) {
532 			kill(tpebs_cmd.pid, SIGTERM);
533 			close(tpebs_cmd.out);
534 			pr_err("Could not create thread to process sample data.\n");
535 			ret = -1;
536 			goto out;
537 		}
538 		ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_ENABLE_TAG);
539 	}
540 out:
541 	if (ret) {
542 		struct tpebs_retire_lat *t = tpebs_retire_lat__find(evsel);
543 
544 		list_del_init(&t->nd);
545 		tpebs_retire_lat__delete(t);
546 	}
547 	mutex_unlock(tpebs_mtx_get());
548 	return ret;
549 }
550 
551 int evsel__tpebs_read(struct evsel *evsel, int cpu_map_idx, int thread)
552 {
553 	struct perf_counts_values *count, *old_count = NULL;
554 	struct tpebs_retire_lat *t;
555 	uint64_t val;
556 	int ret;
557 
558 	/* Only set retire_latency value to the first CPU and thread. */
559 	if (cpu_map_idx != 0 || thread != 0)
560 		return 0;
561 
562 	if (evsel->prev_raw_counts)
563 		old_count = perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread);
564 
565 	count = perf_counts(evsel->counts, cpu_map_idx, thread);
566 
567 	mutex_lock(tpebs_mtx_get());
568 	t = tpebs_retire_lat__find(evsel);
569 	/*
570 	 * If reading the first tpebs result, send a ping to the record
571 	 * process. Allow the sample reader a chance to read by releasing and
572 	 * reacquiring the lock.
573 	 */
574 	if (t && &t->nd == tpebs_results.next) {
575 		ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_PING_TAG);
576 		mutex_unlock(tpebs_mtx_get());
577 		if (ret)
578 			return ret;
579 		mutex_lock(tpebs_mtx_get());
580 	}
581 	if (t == NULL || t->stats.n == 0) {
582 		/* No sample data, use default. */
583 		if (tpebs_recording) {
584 			pr_warning_once(
585 				"Using precomputed retirement latency data as no samples\n");
586 		}
587 		val = 0;
588 		switch (tpebs_mode) {
589 		case TPEBS_MODE__MIN:
590 			val = rint(evsel->retirement_latency.min);
591 			break;
592 		case TPEBS_MODE__MAX:
593 			val = rint(evsel->retirement_latency.max);
594 			break;
595 		default:
596 		case TPEBS_MODE__LAST:
597 		case TPEBS_MODE__MEAN:
598 			val = rint(evsel->retirement_latency.mean);
599 			break;
600 		}
601 	} else {
602 		switch (tpebs_mode) {
603 		case TPEBS_MODE__MIN:
604 			val = t->stats.min;
605 			break;
606 		case TPEBS_MODE__MAX:
607 			val = t->stats.max;
608 			break;
609 		case TPEBS_MODE__LAST:
610 			val = t->last;
611 			break;
612 		default:
613 		case TPEBS_MODE__MEAN:
614 			val = rint(t->stats.mean);
615 			break;
616 		}
617 	}
618 	mutex_unlock(tpebs_mtx_get());
619 
620 	if (old_count) {
621 		count->val = old_count->val + val;
622 		count->run = old_count->run + 1;
623 		count->ena = old_count->ena + 1;
624 	} else {
625 		count->val = val;
626 		count->run++;
627 		count->ena++;
628 	}
629 	return 0;
630 }
631 
632 /**
633  * evsel__tpebs_close() - delete tpebs related data. If the last event, stop the
634  * created thread and process by calling tpebs_stop().
635  *
636  * This function is called in evsel__close() to be symmetric with
637  * evsel__tpebs_open() being called in evsel__open().
638  */
639 void evsel__tpebs_close(struct evsel *evsel)
640 {
641 	struct tpebs_retire_lat *t;
642 
643 	mutex_lock(tpebs_mtx_get());
644 	t = tpebs_retire_lat__find(evsel);
645 	if (t) {
646 		list_del_init(&t->nd);
647 		tpebs_retire_lat__delete(t);
648 
649 		if (list_empty(&tpebs_results))
650 			tpebs_stop();
651 	}
652 	mutex_unlock(tpebs_mtx_get());
653 }
654