// SPDX-License-Identifier: GPL-2.0-only
/*
 * intel-tpebs.c: Intel TPEBS support
 */

#include <sys/param.h>
#include <subcmd/run-command.h>
#include <thread.h>
#include "intel-tpebs.h"
#include <linux/list.h>
#include <linux/zalloc.h>
#include <linux/err.h>
#include "sample.h"
#include "counts.h"
#include "debug.h"
#include "evlist.h"
#include "evsel.h"
#include "mutex.h"
#include "session.h"
#include "stat.h"
#include "tool.h"
#include "cpumap.h"
#include "metricgroup.h"
#include <sys/stat.h>
#include <sys/file.h>
#include <poll.h>
#include <math.h>

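/*
 * TPEBS sampling is implemented by forking a background `perf record`
 * session for the retirement latency events, controlled through the
 * --control fd pipes below. A reader thread parses the perf.data stream
 * that session writes to a pipe and accumulates per-event retirement
 * latency statistics in tpebs_results, which evsel__tpebs_read() later
 * turns into counter values.
 */
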
#define PERF_DATA		"-"

bool tpebs_recording;
enum tpebs_mode tpebs_mode;
static LIST_HEAD(tpebs_results);
static pthread_t tpebs_reader_thread;
static struct child_process tpebs_cmd;
static int control_fd[2], ack_fd[2];
static struct mutex tpebs_mtx;

struct tpebs_retire_lat {
	struct list_head nd;
	/** @evsel: The evsel that opened the retire_lat event. */
	struct evsel *evsel;
	/** @event: Event passed to perf record. */
	char *event;
	/** @stats: Recorded retirement latency stats. */
	struct stats stats;
	/** @last: Last retirement latency read. */
	uint64_t last;
	/** @started: Has the event been sent to perf record? */
	bool started;
};

static void tpebs_mtx_init(void)
{
	mutex_init(&tpebs_mtx);
}

static struct mutex *tpebs_mtx_get(void)
{
	static pthread_once_t tpebs_mtx_once = PTHREAD_ONCE_INIT;

	pthread_once(&tpebs_mtx_once, tpebs_mtx_init);
	return &tpebs_mtx;
}

static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel)
	EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get());

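/*
 * Build the argv for and fork the background `perf record` session. The
 * resulting command line resembles (event names illustrative only):
 *
 *   perf record -W --synth=no --control=fd:<ctl>,<ack> -o - \
 *               [-C <cpus>] -e EVENT_A/name=tpebs_event_0x.../p ...
 *
 * Output goes to stdout ("-"), which start_command() redirects to the pipe
 * read by the sample reader thread.
 */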
static int evsel__tpebs_start_perf_record(struct evsel *evsel)
{
	const char **record_argv;
	int tpebs_event_size = 0, i = 0, ret;
	char control_fd_buf[32];
	char cpumap_buf[50];
	struct tpebs_retire_lat *t;

	list_for_each_entry(t, &tpebs_results, nd)
		tpebs_event_size++;

	record_argv = malloc((10 + 2 * tpebs_event_size) * sizeof(*record_argv));
	if (!record_argv)
		return -ENOMEM;

	record_argv[i++] = "perf";
	record_argv[i++] = "record";
	record_argv[i++] = "-W";
	record_argv[i++] = "--synth=no";

	scnprintf(control_fd_buf, sizeof(control_fd_buf), "--control=fd:%d,%d",
		  control_fd[0], ack_fd[1]);
	record_argv[i++] = control_fd_buf;

	record_argv[i++] = "-o";
	record_argv[i++] = PERF_DATA;

	if (!perf_cpu_map__is_any_cpu_or_is_empty(evsel->evlist->core.user_requested_cpus)) {
		cpu_map__snprint(evsel->evlist->core.user_requested_cpus, cpumap_buf,
				 sizeof(cpumap_buf));
		record_argv[i++] = "-C";
		record_argv[i++] = cpumap_buf;
	}

	list_for_each_entry(t, &tpebs_results, nd) {
		record_argv[i++] = "-e";
		record_argv[i++] = t->event;
	}
	record_argv[i++] = NULL;
	assert(i == 10 + 2 * tpebs_event_size || i == 8 + 2 * tpebs_event_size);
	/* Note, no workload given so system wide is implied. */

	assert(tpebs_cmd.pid == 0);
	tpebs_cmd.argv = record_argv;
	tpebs_cmd.out = -1;
	ret = start_command(&tpebs_cmd);
	zfree(&tpebs_cmd.argv);
	list_for_each_entry(t, &tpebs_results, nd)
		t->started = true;

	return ret;
}

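/*
 * Handle a PERF_RECORD_SAMPLE from the forked perf record session; runs on
 * the sample reader thread, with samples matched back to their evsel via
 * tpebs_retire_lat__find().
 */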
static int process_sample_event(const struct perf_tool *tool __maybe_unused,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample,
				struct evsel *evsel,
				struct machine *machine __maybe_unused)
{
	struct tpebs_retire_lat *t;

	mutex_lock(tpebs_mtx_get());
	if (tpebs_cmd.pid == 0) {
		/* Record has terminated. */
		mutex_unlock(tpebs_mtx_get());
		return 0;
	}
	t = tpebs_retire_lat__find(evsel);
	if (!t) {
		mutex_unlock(tpebs_mtx_get());
		return -EINVAL;
	}
	/*
	 * TODO: handle per-core results? For now assume the average retire
	 * latency value will be used. Save the number of samples and the sum
	 * of retire latency values for each event.
	 */
	t->last = sample->retire_lat;
	update_stats(&t->stats, sample->retire_lat);
	mutex_unlock(tpebs_mtx_get());
	return 0;
}

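/*
 * Process header feature records from the piped perf.data, skipping any
 * feature ids this build does not know about.
 */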
static int process_feature_event(struct perf_session *session,
				 union perf_event *event)
{
	if (event->feat.feat_id < HEADER_LAST_FEATURE)
		return perf_event__process_feature(session, event);
	return 0;
}

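/*
 * Body of the sample reader thread: parse the perf.data stream that the
 * forked perf record session writes to the pipe (tpebs_cmd.out).
 */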
static void *__sample_reader(void *arg __maybe_unused)
{
	struct perf_session *session;
	struct perf_data data = {
		.mode = PERF_DATA_MODE_READ,
		.path = PERF_DATA,
		.file.fd = tpebs_cmd.out,
	};
	struct perf_tool tool;

	perf_tool__init(&tool, /*ordered_events=*/false);
	tool.sample = process_sample_event;
	tool.feature = process_feature_event;
	tool.attr = perf_event__process_attr;

	session = perf_session__new(&data, &tool);
	if (IS_ERR(session))
		return NULL;
	perf_session__process_events(session);
	perf_session__delete(session);

	return NULL;
}

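/*
 * Send a control message such as enable, ping or stop to the forked perf
 * record session over control_fd and, except for stop, wait for an ack on
 * ack_fd. Called and returns with tpebs_mtx held, but the lock is dropped
 * while blocking.
 */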
static int tpebs_send_record_cmd(const char *msg) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get())
{
	struct pollfd pollfd = { .events = POLLIN, };
	int ret, len, retries = 0;
	char ack_buf[8];

	/* Check if the command exited before the send, done with the lock held. */
	if (tpebs_cmd.pid == 0)
		return 0;

	/*
	 * Let go of the lock while sending/receiving as blocking can starve the
	 * sample reading thread.
	 */
	mutex_unlock(tpebs_mtx_get());

	/* Send the perf record control command. */
	len = strlen(msg);
	ret = write(control_fd[1], msg, len);
	if (ret != len) {
		pr_err("perf record control write control message '%s' failed\n", msg);
		ret = -EPIPE;
		goto out;
	}

	if (!strcmp(msg, EVLIST_CTL_CMD_STOP_TAG)) {
		ret = 0;
		goto out;
	}

	/* Wait for an ack. */
	pollfd.fd = ack_fd[0];

	/*
	 * Poll so that the ack_fd pipe cannot hang when perf record fails for
	 * any reason. The 500ms poll with up to six retries (about 3.5 seconds
	 * in total) is an empirical selection.
	 */
again:
	if (!poll(&pollfd, 1, 500)) {
		if (check_if_command_finished(&tpebs_cmd)) {
			ret = 0;
			goto out;
		}

		if (retries++ < 6)
			goto again;
		pr_err("tpebs failed: perf record ack timeout for '%s'\n", msg);
		ret = -ETIMEDOUT;
		goto out;
	}

	if (!(pollfd.revents & POLLIN)) {
		if (check_if_command_finished(&tpebs_cmd)) {
			ret = 0;
			goto out;
		}

		pr_err("tpebs failed: did not receive an ack for '%s'\n", msg);
		ret = -EPIPE;
		goto out;
	}

	ret = read(ack_fd[0], ack_buf, sizeof(ack_buf));
	if (ret > 0)
		ret = strcmp(ack_buf, EVLIST_CTL_CMD_ACK_TAG);
	else
		pr_err("tpebs: perf record control ack failed\n");
out:
	/* Re-take lock as expected by caller. */
	mutex_lock(tpebs_mtx_get());
	return ret;
}

/*
 * tpebs_stop - stop the sample data read thread and the perf record process.
 */
static int tpebs_stop(void) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get())
{
	int ret = 0;

	/* The reader thread and the record process should only be stopped once. */
	if (tpebs_cmd.pid != 0) {
		tpebs_send_record_cmd(EVLIST_CTL_CMD_STOP_TAG);
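		/*
		 * Clear the pid while still holding the lock so that, once it
		 * is dropped for pthread_join(), the sample reader and any
		 * other caller treat the record session as terminated.
		 */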
		tpebs_cmd.pid = 0;
		mutex_unlock(tpebs_mtx_get());
		pthread_join(tpebs_reader_thread, NULL);
		mutex_lock(tpebs_mtx_get());
		close(control_fd[0]);
		close(control_fd[1]);
		close(ack_fd[0]);
		close(ack_fd[1]);
		close(tpebs_cmd.out);
		ret = finish_command(&tpebs_cmd);
		tpebs_cmd.pid = 0;
		if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
			ret = 0;
	}
	return ret;
}

/**
 * evsel__tpebs_event() - Create string event encoding to pass to `perf record`.
 * @evsel: The retirement latency evsel whose name carries the 'R' modifier.
 * @event: Out parameter set to a newly allocated event string for `perf record`.
 */
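/*
 * For example (event name purely illustrative), an evsel named "EVENT_A:R"
 * is rewritten to "EVENT_A/name=tpebs_event_0x.../p", where the embedded
 * pointer value lets tpebs_retire_lat__find() map samples read back from
 * perf record to the originating evsel.
 */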
static int evsel__tpebs_event(struct evsel *evsel, char **event)
{
	char *name, *modifier;
	int ret;

	name = strdup(evsel->name);
	if (!name)
		return -ENOMEM;

	modifier = strrchr(name, 'R');
	if (!modifier) {
		ret = -EINVAL;
		goto out;
	}
	*modifier = 'p';
	modifier = strchr(name, ':');
	if (!modifier)
		modifier = strrchr(name, '/');
	if (!modifier) {
		ret = -EINVAL;
		goto out;
	}
	*modifier = '\0';
	if (asprintf(event, "%s/name=tpebs_event_%p/%s", name, evsel, modifier + 1) > 0)
		ret = 0;
	else
		ret = -ENOMEM;
out:
	if (ret)
		pr_err("Tpebs event modifier broken '%s'\n", evsel->name);
	free(name);
	return ret;
}

static struct tpebs_retire_lat *tpebs_retire_lat__new(struct evsel *evsel)
{
	struct tpebs_retire_lat *result = zalloc(sizeof(*result));
	int ret;

	if (!result)
		return NULL;

	ret = evsel__tpebs_event(evsel, &result->event);
	if (ret) {
		free(result);
		return NULL;
	}
	result->evsel = evsel;
	return result;
}

static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r)
{
	zfree(&r->event);
	free(r);
}

static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel)
{
	struct tpebs_retire_lat *t;
	unsigned long num;
	const char *evsel_name;

	/*
	 * Evsels from the main evlist are matched by pointer for the
	 * retirement latency event. Events read back from `perf record`
	 * instead carry a "tpebs_event_" prefixed name that encodes the
	 * originating evsel's pointer.
	 */
	if (evsel__is_retire_lat(evsel)) {
		list_for_each_entry(t, &tpebs_results, nd) {
			if (t->evsel == evsel)
				return t;
		}
		return NULL;
	}
	evsel_name = strstr(evsel->name, "tpebs_event_");
	if (!evsel_name) {
		/* The perf record session is not expected to have other events. */
		return NULL;
	}
	errno = 0;
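	/* Skip the "tpebs_event_" prefix (12 characters) and parse the pointer. */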
	num = strtoull(evsel_name + 12, NULL, 16);
	if (errno) {
		pr_err("Bad evsel for tpebs find '%s'\n", evsel->name);
		return NULL;
	}
	list_for_each_entry(t, &tpebs_results, nd) {
		if ((unsigned long)t->evsel == num)
			return t;
	}
	return NULL;
}

/**
 * evsel__tpebs_prepare - create tpebs data structures ready for opening.
 * @evsel: retire_latency evsel, all evsels on its list will be prepared.
 */
static int evsel__tpebs_prepare(struct evsel *evsel)
{
	struct evsel *pos;
	struct tpebs_retire_lat *tpebs_event;

	mutex_lock(tpebs_mtx_get());
	tpebs_event = tpebs_retire_lat__find(evsel);
	if (tpebs_event) {
		/* evsel, or an identically named one, was already prepared. */
		mutex_unlock(tpebs_mtx_get());
		return 0;
	}
	tpebs_event = tpebs_retire_lat__new(evsel);
	if (!tpebs_event) {
		mutex_unlock(tpebs_mtx_get());
		return -ENOMEM;
	}
	list_add_tail(&tpebs_event->nd, &tpebs_results);
	mutex_unlock(tpebs_mtx_get());

	/*
	 * Eagerly prepare all other retirement latency evsels on the list so
	 * that they are all known by the time the events are opened.
	 */
	evlist__for_each_entry(evsel->evlist, pos) {
		int ret;

		if (pos == evsel || !pos->retire_lat)
			continue;

		ret = evsel__tpebs_prepare(pos);
		if (ret)
			return ret;
	}
	return 0;
}

/**
 * evsel__tpebs_open - starts tpebs execution.
 * @evsel: retire_latency evsel, all evsels on its list will be selected. Each
 *         evsel is sampled to get the average retire_latency value.
 */
int evsel__tpebs_open(struct evsel *evsel)
{
	int ret;
	bool tpebs_empty;

	/* Recording should only be started when tpebs_recording is enabled. */
	if (!tpebs_recording)
		return 0;
	/* Only start the events once. */
	if (tpebs_cmd.pid != 0) {
		struct tpebs_retire_lat *t;
		bool valid;

		mutex_lock(tpebs_mtx_get());
		t = tpebs_retire_lat__find(evsel);
		valid = t && t->started;
		mutex_unlock(tpebs_mtx_get());
		/* Fail if the event was not started with the record session. */
		return valid ? 0 : -EBUSY;
	}

	ret = evsel__tpebs_prepare(evsel);
	if (ret)
		return ret;

	mutex_lock(tpebs_mtx_get());
	tpebs_empty = list_empty(&tpebs_results);
	if (!tpebs_empty) {
		/* Create the control and ack pipes for --control. */
		if (pipe(control_fd) < 0) {
			pr_err("tpebs: Failed to create control pipe\n");
			ret = -1;
			goto out;
		}
		if (pipe(ack_fd) < 0) {
			pr_err("tpebs: Failed to create ack pipe\n");
			ret = -1;
			goto out;
		}

		ret = evsel__tpebs_start_perf_record(evsel);
		if (ret)
			goto out;

		if (pthread_create(&tpebs_reader_thread, /*attr=*/NULL, __sample_reader,
				   /*arg=*/NULL)) {
			kill(tpebs_cmd.pid, SIGTERM);
			close(tpebs_cmd.out);
			pr_err("Could not create thread to process sample data.\n");
			ret = -1;
			goto out;
		}
		ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_ENABLE_TAG);
	}
out:
	if (ret) {
		struct tpebs_retire_lat *t = tpebs_retire_lat__find(evsel);

		list_del_init(&t->nd);
		tpebs_retire_lat__delete(t);
	}
	mutex_unlock(tpebs_mtx_get());
	return ret;
}

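/**
 * evsel__tpebs_read() - Set the retirement latency count for the evsel.
 *
 * The count is the sampled retirement latency accumulated so far (mean, min,
 * max or last value depending on tpebs_mode), or a precomputed default when
 * no samples are available. It is only reported for the first CPU map index
 * and thread, added on top of any previous raw count.
 */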
int evsel__tpebs_read(struct evsel *evsel, int cpu_map_idx, int thread)
{
	struct perf_counts_values *count, *old_count = NULL;
	struct tpebs_retire_lat *t;
	uint64_t val;
	int ret;

	/* Only set the retire_latency value for the first CPU and thread. */
	if (cpu_map_idx != 0 || thread != 0)
		return 0;

	if (evsel->prev_raw_counts)
		old_count = perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread);

	count = perf_counts(evsel->counts, cpu_map_idx, thread);

	mutex_lock(tpebs_mtx_get());
	t = tpebs_retire_lat__find(evsel);
	/*
	 * If reading the first tpebs result, send a ping to the record
	 * process. Allow the sample reader a chance to read by releasing and
	 * reacquiring the lock.
	 */
	if (t && &t->nd == tpebs_results.next) {
		ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_PING_TAG);
		mutex_unlock(tpebs_mtx_get());
		if (ret)
			return ret;
		mutex_lock(tpebs_mtx_get());
	}
	if (t == NULL || t->stats.n == 0) {
		/* No sample data, use the precomputed default. */
		if (tpebs_recording) {
			pr_warning_once(
				"Using precomputed retirement latency data as no samples were recorded\n");
		}
		val = 0;
		switch (tpebs_mode) {
		case TPEBS_MODE__MIN:
			val = rint(evsel->retirement_latency.min);
			break;
		case TPEBS_MODE__MAX:
			val = rint(evsel->retirement_latency.max);
			break;
		default:
		case TPEBS_MODE__LAST:
		case TPEBS_MODE__MEAN:
			val = rint(evsel->retirement_latency.mean);
			break;
		}
	} else {
		switch (tpebs_mode) {
		case TPEBS_MODE__MIN:
			val = t->stats.min;
			break;
		case TPEBS_MODE__MAX:
			val = t->stats.max;
			break;
		case TPEBS_MODE__LAST:
			val = t->last;
			break;
		default:
		case TPEBS_MODE__MEAN:
			val = rint(t->stats.mean);
			break;
		}
	}
	mutex_unlock(tpebs_mtx_get());

	if (old_count) {
		count->val = old_count->val + val;
		count->run = old_count->run + 1;
		count->ena = old_count->ena + 1;
	} else {
		count->val = val;
		count->run++;
		count->ena++;
	}
	return 0;
}

/**
 * evsel__tpebs_close() - delete the evsel's tpebs data. If it is the last
 * event, stop the reader thread and the perf record process via tpebs_stop().
 *
 * This function is called in evsel__close() to be symmetric with
 * evsel__tpebs_open() being called in evsel__open().
 */
void evsel__tpebs_close(struct evsel *evsel)
{
	struct tpebs_retire_lat *t;

	mutex_lock(tpebs_mtx_get());
	t = tpebs_retire_lat__find(evsel);
	if (t) {
		list_del_init(&t->nd);
		tpebs_retire_lat__delete(t);

		if (list_empty(&tpebs_results))
			tpebs_stop();
	}
	mutex_unlock(tpebs_mtx_get());
}
597