xref: /linux/tools/perf/util/intel-tpebs.c (revision 754187ad73b73bcb44f106a8e5fc88789beff1bd)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * intel_tpebs.c: Intel TPEBS support
4  */
5 
6 #include <api/fs/fs.h>
7 #include <sys/param.h>
8 #include <subcmd/run-command.h>
9 #include <thread.h>
10 #include "intel-tpebs.h"
11 #include <linux/list.h>
12 #include <linux/zalloc.h>
13 #include <linux/err.h>
14 #include "sample.h"
15 #include "counts.h"
16 #include "debug.h"
17 #include "evlist.h"
18 #include "evsel.h"
19 #include "mutex.h"
20 #include "session.h"
21 #include "stat.h"
22 #include "tool.h"
23 #include "cpumap.h"
24 #include "metricgroup.h"
25 #include "stat.h"
26 #include <sys/stat.h>
27 #include <sys/file.h>
28 #include <poll.h>
29 #include <math.h>
30 
/* Output path for the forked perf record; "-" means write to a pipe. */
#define PERF_DATA		"-"

/* Should retirement latency be sampled via a forked `perf record`? */
bool tpebs_recording;
/* How sampled latencies are summarized on read (mean/min/max/last). */
enum tpebs_mode tpebs_mode;
/* List of struct tpebs_retire_lat; guarded by tpebs_mtx. */
static LIST_HEAD(tpebs_results);
/* Thread running __sample_reader() over the record output pipe. */
static pthread_t tpebs_reader_thread;
/* The forked `perf record` child process; pid == 0 means not running. */
static struct child_process tpebs_cmd;
/* Pipes used for `perf record --control` commands and their acks. */
static int control_fd[2], ack_fd[2];
/* Guards tpebs_results and tpebs_cmd; always obtain via tpebs_mtx_get(). */
static struct mutex tpebs_mtx;
40 
/* Per-event retirement latency state, one node per sampled event. */
struct tpebs_retire_lat {
	/** @nd: Node in the tpebs_results list. */
	struct list_head nd;
	/** @evsel: The evsel that opened the retire_lat event. */
	struct evsel *evsel;
	/** @event: Event passed to perf record. */
	char *event;
	/** @stats: Recorded retirement latency stats. */
	struct stats stats;
	/** @last: Last retirement latency read. */
	uint64_t last;
	/** @started: Has the event been sent to perf record? */
	bool started;
};
54 
/* One-time constructor for tpebs_mtx, invoked via pthread_once. */
static void tpebs_mtx_init(void)
{
	mutex_init(&tpebs_mtx);
}
59 
/*
 * Return the mutex guarding tpebs_results and tpebs_cmd, initializing it
 * exactly once in a thread-safe way on first use.
 */
static struct mutex *tpebs_mtx_get(void)
{
	static pthread_once_t tpebs_mtx_once = PTHREAD_ONCE_INIT;

	pthread_once(&tpebs_mtx_once, tpebs_mtx_init);
	return &tpebs_mtx;
}
67 
68 static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel)
69 	EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get());
70 
71 static int evsel__tpebs_start_perf_record(struct evsel *evsel)
72 {
73 	const char **record_argv;
74 	int tpebs_event_size = 0, i = 0, ret;
75 	char control_fd_buf[32];
76 	char cpumap_buf[50];
77 	struct tpebs_retire_lat *t;
78 
79 	list_for_each_entry(t, &tpebs_results, nd)
80 		tpebs_event_size++;
81 
82 	record_argv = malloc((10 + 2 * tpebs_event_size) * sizeof(*record_argv));
83 	if (!record_argv)
84 		return -ENOMEM;
85 
86 	record_argv[i++] = "perf";
87 	record_argv[i++] = "record";
88 	record_argv[i++] = "-W";
89 	record_argv[i++] = "--synth=no";
90 
91 	scnprintf(control_fd_buf, sizeof(control_fd_buf), "--control=fd:%d,%d",
92 		  control_fd[0], ack_fd[1]);
93 	record_argv[i++] = control_fd_buf;
94 
95 	record_argv[i++] = "-o";
96 	record_argv[i++] = PERF_DATA;
97 
98 	if (!perf_cpu_map__is_any_cpu_or_is_empty(evsel->evlist->core.user_requested_cpus)) {
99 		cpu_map__snprint(evsel->evlist->core.user_requested_cpus, cpumap_buf,
100 				 sizeof(cpumap_buf));
101 		record_argv[i++] = "-C";
102 		record_argv[i++] = cpumap_buf;
103 	}
104 
105 	list_for_each_entry(t, &tpebs_results, nd) {
106 		record_argv[i++] = "-e";
107 		record_argv[i++] = t->event;
108 	}
109 	record_argv[i++] = NULL;
110 	assert(i == 10 + 2 * tpebs_event_size || i == 8 + 2 * tpebs_event_size);
111 	/* Note, no workload given so system wide is implied. */
112 
113 	assert(tpebs_cmd.pid == 0);
114 	tpebs_cmd.argv = record_argv;
115 	tpebs_cmd.out = -1;
116 	ret = start_command(&tpebs_cmd);
117 	zfree(&tpebs_cmd.argv);
118 	list_for_each_entry(t, &tpebs_results, nd)
119 		t->started = true;
120 
121 	return ret;
122 }
123 
124 static bool is_child_pid(pid_t parent, pid_t child)
125 {
126 	if (parent < 0 || child < 0)
127 		return false;
128 
129 	while (true) {
130 		char path[PATH_MAX];
131 		char line[256];
132 		FILE *fp;
133 
134 new_child:
135 		if (parent == child)
136 			return true;
137 
138 		if (child <= 0)
139 			return false;
140 
141 		scnprintf(path, sizeof(path), "%s/%d/status", procfs__mountpoint(), child);
142 		fp = fopen(path, "r");
143 		if (!fp) {
144 			/* Presumably the process went away. Assume not a child. */
145 			return false;
146 		}
147 		while (fgets(line, sizeof(line), fp) != NULL) {
148 			if (strncmp(line, "PPid:", 5) == 0) {
149 				fclose(fp);
150 				if (sscanf(line + 5, "%d", &child) != 1) {
151 					/* Unexpected error parsing. */
152 					return false;
153 				}
154 				goto new_child;
155 			}
156 		}
157 		/* Unexpected EOF. */
158 		fclose(fp);
159 		return false;
160 	}
161 }
162 
163 static bool should_ignore_sample(const struct perf_sample *sample, const struct tpebs_retire_lat *t)
164 {
165 	pid_t workload_pid, sample_pid = sample->pid;
166 
167 	/*
168 	 * During evlist__purge the evlist will be removed prior to the
169 	 * evsel__exit calling evsel__tpebs_close and taking the
170 	 * tpebs_mtx. Avoid a segfault by ignoring samples in this case.
171 	 */
172 	if (t->evsel->evlist == NULL)
173 		return true;
174 
175 	workload_pid = t->evsel->evlist->workload.pid;
176 	if (workload_pid < 0 || workload_pid == sample_pid)
177 		return false;
178 
179 	if (!t->evsel->core.attr.inherit)
180 		return true;
181 
182 	return !is_child_pid(workload_pid, sample_pid);
183 }
184 
/*
 * Session callback for each sample from the forked `perf record`: folds the
 * sample's weight3 value into the matching event's stats under tpebs_mtx.
 */
static int process_sample_event(const struct perf_tool *tool __maybe_unused,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample,
				struct evsel *evsel,
				struct machine *machine __maybe_unused)
{
	struct tpebs_retire_lat *t;

	mutex_lock(tpebs_mtx_get());
	if (tpebs_cmd.pid == 0) {
		/* Record has terminated. */
		mutex_unlock(tpebs_mtx_get());
		return 0;
	}
	t = tpebs_retire_lat__find(evsel);
	if (!t) {
		/* Sample for an event this code never asked to record. */
		mutex_unlock(tpebs_mtx_get());
		return -EINVAL;
	}
	if (should_ignore_sample(sample, t)) {
		/* Not from the workload or its children; drop silently. */
		mutex_unlock(tpebs_mtx_get());
		return 0;
	}
	/*
	 * Need to handle per core results? We are assuming average retire
	 * latency value will be used. Save the number of samples and the sum of
	 * retire latency value for each event.
	 */
	t->last = sample->weight3;
	update_stats(&t->stats, sample->weight3);
	mutex_unlock(tpebs_mtx_get());
	return 0;
}
218 
219 static int process_feature_event(const struct perf_tool *tool __maybe_unused,
220 				 struct perf_session *session,
221 				 union perf_event *event)
222 {
223 	if (event->feat.feat_id < HEADER_LAST_FEATURE)
224 		return perf_event__process_feature(session, event);
225 	return 0;
226 }
227 
/*
 * Body of the sample reader thread: treats the record process's output pipe
 * (tpebs_cmd.out) as a perf session and processes its events until EOF,
 * dispatching samples to process_sample_event().
 */
static void *__sample_reader(void *arg __maybe_unused)
{
	struct perf_session *session;
	struct perf_data data = {
		.mode = PERF_DATA_MODE_READ,
		.path = PERF_DATA,	/* "-", data actually comes from file.fd. */
		.file.fd = tpebs_cmd.out,
	};
	struct perf_tool tool;

	perf_tool__init(&tool, /*ordered_events=*/false);
	tool.sample = process_sample_event;
	tool.feature = process_feature_event;
	tool.attr = perf_event__process_attr;

	session = perf_session__new(&data, &tool);
	if (IS_ERR(session))
		return NULL;
	perf_session__process_events(session);
	perf_session__delete(session);

	return NULL;
}
251 
/*
 * Send a control command (@msg) to the `perf record` child and, except for
 * the stop command, wait for its ack. Called with tpebs_mtx held; the lock
 * is dropped around the blocking I/O and re-taken before returning.
 * Returns 0 on success or if the child already exited, negative on error.
 */
static int tpebs_send_record_cmd(const char *msg) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get())
{
	struct pollfd pollfd = { .events = POLLIN, };
	int ret, len, retries = 0;
	char ack_buf[8];

	/* Check if the command exited before the send, done with the lock held. */
	if (tpebs_cmd.pid == 0)
		return 0;

	/*
	 * Let go of the lock while sending/receiving as blocking can starve the
	 * sample reading thread.
	 */
	mutex_unlock(tpebs_mtx_get());

	/* Send perf record command.*/
	len = strlen(msg);
	ret = write(control_fd[1], msg, len);
	if (ret != len) {
		pr_err("perf record control write control message '%s' failed\n", msg);
		ret = -EPIPE;
		goto out;
	}

	/* A stop command gets no ack; the child is exiting. */
	if (!strcmp(msg, EVLIST_CTL_CMD_STOP_TAG)) {
		ret = 0;
		goto out;
	}

	/* Wait for an ack. */
	pollfd.fd = ack_fd[0];

	/*
	 * We need this poll to ensure the ack_fd PIPE will not hang
	 * when perf record failed for any reason. The timeout value
	 * 3000ms is an empirical selection.
	 */
again:
	if (!poll(&pollfd, 1, 500)) {
		/* Poll timed out: a dead child means no ack will ever come. */
		if (check_if_command_finished(&tpebs_cmd)) {
			ret = 0;
			goto out;
		}

		/* Retry up to 6 x 500ms = the 3000ms budget mentioned above. */
		if (retries++ < 6)
			goto again;
		pr_err("tpebs failed: perf record ack timeout for '%s'\n", msg);
		ret = -ETIMEDOUT;
		goto out;
	}

	if (!(pollfd.revents & POLLIN)) {
		/* Pipe error/hangup rather than readable data. */
		if (check_if_command_finished(&tpebs_cmd)) {
			ret = 0;
			goto out;
		}

		pr_err("tpebs failed: did not received an ack for '%s'\n", msg);
		ret = -EPIPE;
		goto out;
	}

	/* Read the ack and verify it matches the expected tag. */
	ret = read(ack_fd[0], ack_buf, sizeof(ack_buf));
	if (ret > 0)
		ret = strcmp(ack_buf, EVLIST_CTL_CMD_ACK_TAG);
	else
		pr_err("tpebs: perf record control ack failed\n");
out:
	/* Re-take lock as expected by caller. */
	mutex_lock(tpebs_mtx_get());
	return ret;
}
325 
/*
 * tpebs_stop - stop the sample data read thread and the perf record process.
 *
 * Called with tpebs_mtx held; the lock is temporarily dropped so the reader
 * thread (which takes the same lock in process_sample_event) can drain and
 * exit before it is joined.
 */
static int tpebs_stop(void) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get())
{
	int ret = 0;

	/* Like tpebs_start, we should only run tpebs_end once. */
	if (tpebs_cmd.pid != 0) {
		tpebs_send_record_cmd(EVLIST_CTL_CMD_STOP_TAG);
		/* Clear pid so concurrent users see the record as terminated. */
		tpebs_cmd.pid = 0;
		mutex_unlock(tpebs_mtx_get());
		pthread_join(tpebs_reader_thread, NULL);
		mutex_lock(tpebs_mtx_get());
		close(control_fd[0]);
		close(control_fd[1]);
		close(ack_fd[0]);
		close(ack_fd[1]);
		close(tpebs_cmd.out);
		ret = finish_command(&tpebs_cmd);
		tpebs_cmd.pid = 0;
		/* The child dying from the stop signal isn't an error. */
		if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
			ret = 0;
	}
	return ret;
}
352 
353 /**
354  * evsel__tpebs_event() - Create string event encoding to pass to `perf record`.
355  */
356 static int evsel__tpebs_event(struct evsel *evsel, char **event)
357 {
358 	char *name, *modifier;
359 	int ret;
360 
361 	name = strdup(evsel->name);
362 	if (!name)
363 		return -ENOMEM;
364 
365 	modifier = strrchr(name, 'R');
366 	if (!modifier) {
367 		ret = -EINVAL;
368 		goto out;
369 	}
370 	*modifier = 'p';
371 	modifier = strchr(name, ':');
372 	if (!modifier)
373 		modifier = strrchr(name, '/');
374 	if (!modifier) {
375 		ret = -EINVAL;
376 		goto out;
377 	}
378 	*modifier = '\0';
379 	if (asprintf(event, "%s/name=tpebs_event_%p/%s", name, evsel, modifier + 1) > 0)
380 		ret = 0;
381 	else
382 		ret = -ENOMEM;
383 out:
384 	if (ret)
385 		pr_err("Tpebs event modifier broken '%s'\n", evsel->name);
386 	free(name);
387 	return ret;
388 }
389 
390 static struct tpebs_retire_lat *tpebs_retire_lat__new(struct evsel *evsel)
391 {
392 	struct tpebs_retire_lat *result = zalloc(sizeof(*result));
393 	int ret;
394 
395 	if (!result)
396 		return NULL;
397 
398 	ret = evsel__tpebs_event(evsel, &result->event);
399 	if (ret) {
400 		free(result);
401 		return NULL;
402 	}
403 	result->evsel = evsel;
404 	return result;
405 }
406 
/* Free a tpebs_retire_lat and the event string it owns. */
static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r)
{
	zfree(&r->event);
	free(r);
}
412 
/*
 * Look up the tpebs_retire_lat for @evsel. Retirement latency evsels from
 * the caller's evlist match by pointer; evsels coming from the `perf record`
 * session match via the "tpebs_event_<ptr>" name created in
 * evsel__tpebs_event(). Must be called with tpebs_mtx held.
 */
static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel)
{
	struct tpebs_retire_lat *t;
	unsigned long num;
	const char *evsel_name;

	/*
	 * Evsels will match for evlist with the retirement latency event. The
	 * name with "tpebs_event_" prefix will be present on events being read
	 * from `perf record`.
	 */
	if (evsel__is_retire_lat(evsel)) {
		list_for_each_entry(t, &tpebs_results, nd) {
			if (t->evsel == evsel)
				return t;
		}
		return NULL;
	}
	evsel_name = strstr(evsel->name, "tpebs_event_");
	if (!evsel_name) {
		/* Unexpected that the perf record should have other events. */
		return NULL;
	}
	errno = 0;
	/* Parse the %p-formatted evsel address that follows the prefix. */
	num = strtoull(evsel_name + 12, NULL, 16);
	if (errno) {
		pr_err("Bad evsel for tpebs find '%s'\n", evsel->name);
		return NULL;
	}
	list_for_each_entry(t, &tpebs_results, nd) {
		if ((unsigned long)t->evsel == num)
			return t;
	}
	return NULL;
}
448 
/**
 * evsel__tpebs_prepare - create tpebs data structures ready for opening.
 * @evsel: retire_latency evsel, all evsels on its list will be prepared.
 *
 * Adds a tpebs_retire_lat entry for @evsel unless one already exists, then
 * recursively prepares every other retire_lat evsel on the same evlist so
 * they are all known before `perf record` is started.
 *
 * Return: 0 on success, -ENOMEM on allocation failure.
 */
static int evsel__tpebs_prepare(struct evsel *evsel)
{
	struct evsel *pos;
	struct tpebs_retire_lat *tpebs_event;

	mutex_lock(tpebs_mtx_get());
	tpebs_event = tpebs_retire_lat__find(evsel);
	if (tpebs_event) {
		/* evsel, or an identically named one, was already prepared. */
		mutex_unlock(tpebs_mtx_get());
		return 0;
	}
	tpebs_event = tpebs_retire_lat__new(evsel);
	if (!tpebs_event) {
		mutex_unlock(tpebs_mtx_get());
		return -ENOMEM;
	}
	list_add_tail(&tpebs_event->nd, &tpebs_results);
	mutex_unlock(tpebs_mtx_get());

	/*
	 * Eagerly prepare all other evsels on the list to try to ensure that by
	 * open they are all known.
	 */
	evlist__for_each_entry(evsel->evlist, pos) {
		int ret;

		if (pos == evsel || !pos->retire_lat)
			continue;

		/* Recursion terminates: the entry above makes pos findable. */
		ret = evsel__tpebs_prepare(pos);
		if (ret)
			return ret;
	}
	return 0;
}
489 
490 /**
491  * evsel__tpebs_open - starts tpebs execution.
492  * @evsel: retire_latency evsel, all evsels on its list will be selected. Each
493  *         evsel is sampled to get the average retire_latency value.
494  */
495 int evsel__tpebs_open(struct evsel *evsel)
496 {
497 	int ret;
498 	bool tpebs_empty;
499 
500 	/* We should only run tpebs_start when tpebs_recording is enabled. */
501 	if (!tpebs_recording)
502 		return 0;
503 	/* Only start the events once. */
504 	if (tpebs_cmd.pid != 0) {
505 		struct tpebs_retire_lat *t;
506 		bool valid;
507 
508 		mutex_lock(tpebs_mtx_get());
509 		t = tpebs_retire_lat__find(evsel);
510 		valid = t && t->started;
511 		mutex_unlock(tpebs_mtx_get());
512 		/* May fail as the event wasn't started. */
513 		return valid ? 0 : -EBUSY;
514 	}
515 
516 	ret = evsel__tpebs_prepare(evsel);
517 	if (ret)
518 		return ret;
519 
520 	mutex_lock(tpebs_mtx_get());
521 	tpebs_empty = list_empty(&tpebs_results);
522 	if (!tpebs_empty) {
523 		/*Create control and ack fd for --control*/
524 		if (pipe(control_fd) < 0) {
525 			pr_err("tpebs: Failed to create control fifo");
526 			ret = -1;
527 			goto out;
528 		}
529 		if (pipe(ack_fd) < 0) {
530 			pr_err("tpebs: Failed to create control fifo");
531 			ret = -1;
532 			goto out;
533 		}
534 
535 		ret = evsel__tpebs_start_perf_record(evsel);
536 		if (ret)
537 			goto out;
538 
539 		if (pthread_create(&tpebs_reader_thread, /*attr=*/NULL, __sample_reader,
540 				   /*arg=*/NULL)) {
541 			kill(tpebs_cmd.pid, SIGTERM);
542 			close(tpebs_cmd.out);
543 			pr_err("Could not create thread to process sample data.\n");
544 			ret = -1;
545 			goto out;
546 		}
547 		ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_ENABLE_TAG);
548 	}
549 out:
550 	if (ret) {
551 		struct tpebs_retire_lat *t = tpebs_retire_lat__find(evsel);
552 
553 		list_del_init(&t->nd);
554 		tpebs_retire_lat__delete(t);
555 	}
556 	mutex_unlock(tpebs_mtx_get());
557 	return ret;
558 }
559 
/*
 * Read the summarized retirement latency for @evsel into its counts. Only
 * CPU index 0 / thread 0 carry a value; if no samples were gathered, fall
 * back to the evsel's precomputed retirement_latency statistics.
 */
int evsel__tpebs_read(struct evsel *evsel, int cpu_map_idx, int thread)
{
	struct perf_counts_values *count, *old_count = NULL;
	struct tpebs_retire_lat *t;
	uint64_t val;
	int ret;

	/* Only set retire_latency value to the first CPU and thread. */
	if (cpu_map_idx != 0 || thread != 0)
		return 0;

	if (evsel->prev_raw_counts)
		old_count = perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread);

	count = perf_counts(evsel->counts, cpu_map_idx, thread);

	mutex_lock(tpebs_mtx_get());
	t = tpebs_retire_lat__find(evsel);
	/*
	 * If reading the first tpebs result, send a ping to the record
	 * process. Allow the sample reader a chance to read by releasing and
	 * reacquiring the lock.
	 */
	if (t && &t->nd == tpebs_results.next) {
		ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_PING_TAG);
		mutex_unlock(tpebs_mtx_get());
		if (ret)
			return ret;
		mutex_lock(tpebs_mtx_get());
	}
	if (t == NULL || t->stats.n == 0) {
		/* No sample data, use default. */
		if (tpebs_recording) {
			pr_warning_once(
				"Using precomputed retirement latency data as no samples\n");
		}
		val = 0;
		switch (tpebs_mode) {
		case TPEBS_MODE__MIN:
			val = rint(evsel->retirement_latency.min);
			break;
		case TPEBS_MODE__MAX:
			val = rint(evsel->retirement_latency.max);
			break;
		default:
		case TPEBS_MODE__LAST:
		case TPEBS_MODE__MEAN:
			/* LAST has no precomputed equivalent; use the mean. */
			val = rint(evsel->retirement_latency.mean);
			break;
		}
	} else {
		/* Summarize the gathered samples per the configured mode. */
		switch (tpebs_mode) {
		case TPEBS_MODE__MIN:
			val = t->stats.min;
			break;
		case TPEBS_MODE__MAX:
			val = t->stats.max;
			break;
		case TPEBS_MODE__LAST:
			val = t->last;
			break;
		default:
		case TPEBS_MODE__MEAN:
			val = rint(t->stats.mean);
			break;
		}
	}
	mutex_unlock(tpebs_mtx_get());

	/* Accumulate on top of the previous counts when interval reading. */
	if (old_count) {
		count->val = old_count->val + val;
		count->run = old_count->run + 1;
		count->ena = old_count->ena + 1;
	} else {
		count->val = val;
		count->run++;
		count->ena++;
	}
	return 0;
}
640 
641 /**
642  * evsel__tpebs_close() - delete tpebs related data. If the last event, stop the
643  * created thread and process by calling tpebs_stop().
644  *
645  * This function is called in evsel__close() to be symmetric with
646  * evsel__tpebs_open() being called in evsel__open().
647  */
648 void evsel__tpebs_close(struct evsel *evsel)
649 {
650 	struct tpebs_retire_lat *t;
651 
652 	mutex_lock(tpebs_mtx_get());
653 	t = tpebs_retire_lat__find(evsel);
654 	if (t) {
655 		list_del_init(&t->nd);
656 		tpebs_retire_lat__delete(t);
657 
658 		if (list_empty(&tpebs_results))
659 			tpebs_stop();
660 	}
661 	mutex_unlock(tpebs_mtx_get());
662 }
663