xref: /linux/tools/perf/util/intel-tpebs.c (revision 9e906a9dead17d81d6c2687f65e159231d0e3286)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * intel_tpebs.c: Intel TPEBS support
4  */
5 
6 #include <api/fs/fs.h>
7 #include <sys/param.h>
8 #include <subcmd/run-command.h>
9 #include <thread.h>
10 #include "intel-tpebs.h"
11 #include <linux/list.h>
12 #include <linux/zalloc.h>
13 #include <linux/err.h>
14 #include "sample.h"
15 #include "counts.h"
16 #include "debug.h"
17 #include "evlist.h"
18 #include "evsel.h"
19 #include "mutex.h"
20 #include "session.h"
21 #include "stat.h"
22 #include "tool.h"
23 #include "cpumap.h"
24 #include "metricgroup.h"
25 #include "stat.h"
26 #include <sys/stat.h>
27 #include <sys/file.h>
28 #include <errno.h>
29 #include <poll.h>
30 #include <math.h>
31 
32 #define PERF_DATA		"-"
33 
34 bool tpebs_recording;
35 enum tpebs_mode tpebs_mode;
36 static LIST_HEAD(tpebs_results);
37 static pthread_t tpebs_reader_thread;
38 static struct child_process tpebs_cmd;
39 static int control_fd[2], ack_fd[2];
40 static struct mutex tpebs_mtx;
41 
/**
 * struct tpebs_retire_lat - per-event retirement latency state.
 *
 * One node on the global tpebs_results list; list membership and all fields
 * are protected by tpebs_mtx.
 */
struct tpebs_retire_lat {
	/** @nd: Node on the tpebs_results list. */
	struct list_head nd;
	/** @evsel: The evsel that opened the retire_lat event. */
	struct evsel *evsel;
	/** @event: Event passed to perf record. */
	char *event;
	/** @stats: Recorded retirement latency stats. */
	struct stats stats;
	/** @last: Last retirement latency read. */
	uint64_t last;
	/** @started: Has the event been sent to perf record? */
	bool started;
};
55 
tpebs_mtx_init(void)56 static void tpebs_mtx_init(void)
57 {
58 	mutex_init(&tpebs_mtx);
59 }
60 
tpebs_mtx_get(void)61 static struct mutex *tpebs_mtx_get(void)
62 {
63 	static pthread_once_t tpebs_mtx_once = PTHREAD_ONCE_INIT;
64 
65 	pthread_once(&tpebs_mtx_once, tpebs_mtx_init);
66 	return &tpebs_mtx;
67 }
68 
69 static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel)
70 	EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get());
71 
evsel__tpebs_start_perf_record(struct evsel * evsel)72 static int evsel__tpebs_start_perf_record(struct evsel *evsel)
73 {
74 	const char **record_argv;
75 	int tpebs_event_size = 0, i = 0, ret;
76 	char control_fd_buf[32];
77 	char cpumap_buf[50];
78 	struct tpebs_retire_lat *t;
79 
80 	list_for_each_entry(t, &tpebs_results, nd)
81 		tpebs_event_size++;
82 
83 	record_argv = malloc((10 + 2 * tpebs_event_size) * sizeof(*record_argv));
84 	if (!record_argv)
85 		return -ENOMEM;
86 
87 	record_argv[i++] = "perf";
88 	record_argv[i++] = "record";
89 	record_argv[i++] = "-W";
90 	record_argv[i++] = "--synth=no";
91 
92 	scnprintf(control_fd_buf, sizeof(control_fd_buf), "--control=fd:%d,%d",
93 		  control_fd[0], ack_fd[1]);
94 	record_argv[i++] = control_fd_buf;
95 
96 	record_argv[i++] = "-o";
97 	record_argv[i++] = PERF_DATA;
98 
99 	if (!perf_cpu_map__is_any_cpu_or_is_empty(evsel->evlist->core.user_requested_cpus)) {
100 		cpu_map__snprint(evsel->evlist->core.user_requested_cpus, cpumap_buf,
101 				 sizeof(cpumap_buf));
102 		record_argv[i++] = "-C";
103 		record_argv[i++] = cpumap_buf;
104 	}
105 
106 	list_for_each_entry(t, &tpebs_results, nd) {
107 		record_argv[i++] = "-e";
108 		record_argv[i++] = t->event;
109 	}
110 	record_argv[i++] = NULL;
111 	assert(i == 10 + 2 * tpebs_event_size || i == 8 + 2 * tpebs_event_size);
112 	/* Note, no workload given so system wide is implied. */
113 
114 	assert(tpebs_cmd.pid == 0);
115 	tpebs_cmd.argv = record_argv;
116 	tpebs_cmd.out = -1;
117 	ret = start_command(&tpebs_cmd);
118 	zfree(&tpebs_cmd.argv);
119 	list_for_each_entry(t, &tpebs_results, nd)
120 		t->started = true;
121 
122 	return ret;
123 }
124 
is_child_pid(pid_t parent,pid_t child)125 static bool is_child_pid(pid_t parent, pid_t child)
126 {
127 	if (parent < 0 || child < 0)
128 		return false;
129 
130 	while (true) {
131 		char path[PATH_MAX];
132 		char line[256];
133 		FILE *fp;
134 
135 new_child:
136 		if (parent == child)
137 			return true;
138 
139 		if (child <= 0)
140 			return false;
141 
142 		scnprintf(path, sizeof(path), "%s/%d/status", procfs__mountpoint(), child);
143 		fp = fopen(path, "r");
144 		if (!fp) {
145 			/* Presumably the process went away. Assume not a child. */
146 			return false;
147 		}
148 		while (fgets(line, sizeof(line), fp) != NULL) {
149 			if (strncmp(line, "PPid:", 5) == 0) {
150 				fclose(fp);
151 				if (sscanf(line + 5, "%d", &child) != 1) {
152 					/* Unexpected error parsing. */
153 					return false;
154 				}
155 				goto new_child;
156 			}
157 		}
158 		/* Unexpected EOF. */
159 		fclose(fp);
160 		return false;
161 	}
162 }
163 
should_ignore_sample(const struct perf_sample * sample,const struct tpebs_retire_lat * t)164 static bool should_ignore_sample(const struct perf_sample *sample, const struct tpebs_retire_lat *t)
165 {
166 	pid_t workload_pid, sample_pid = sample->pid;
167 
168 	/*
169 	 * During evlist__purge the evlist will be removed prior to the
170 	 * evsel__exit calling evsel__tpebs_close and taking the
171 	 * tpebs_mtx. Avoid a segfault by ignoring samples in this case.
172 	 */
173 	if (t->evsel->evlist == NULL)
174 		return true;
175 
176 	workload_pid = t->evsel->evlist->workload.pid;
177 	if (workload_pid < 0 || workload_pid == sample_pid)
178 		return false;
179 
180 	if (!t->evsel->core.attr.inherit)
181 		return true;
182 
183 	return !is_child_pid(workload_pid, sample_pid);
184 }
185 
/*
 * Fold one sample read back from `perf record` into the matching
 * tpebs_retire_lat's statistics (weight3 carries the recorded latency).
 */
static int process_sample_event(const struct perf_tool *tool __maybe_unused,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample,
				struct evsel *evsel,
				struct machine *machine __maybe_unused)
{
	struct tpebs_retire_lat *t;
	int ret = 0;

	mutex_lock(tpebs_mtx_get());
	/* Record has terminated; nothing to account. */
	if (tpebs_cmd.pid == 0)
		goto out;

	t = tpebs_retire_lat__find(evsel);
	if (!t) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * Need to handle per core results? We are assuming average retire
	 * latency value will be used. Save the number of samples and the sum of
	 * retire latency value for each event.
	 */
	if (!should_ignore_sample(sample, t)) {
		t->last = sample->weight3;
		update_stats(&t->stats, sample->weight3);
	}
out:
	mutex_unlock(tpebs_mtx_get());
	return ret;
}
219 
/* Forward only known header features to the generic feature handler. */
static int process_feature_event(const struct perf_tool *tool __maybe_unused,
				 struct perf_session *session,
				 union perf_event *event)
{
	if (event->feat.feat_id >= HEADER_LAST_FEATURE)
		return 0;

	return perf_event__process_feature(session, event);
}
228 
/*
 * Reader thread entry point: parse the perf.data stream the forked
 * `perf record` writes to tpebs_cmd.out, dispatching samples to
 * process_sample_event().
 */
static void *__sample_reader(void *arg __maybe_unused)
{
	struct perf_tool tool;
	struct perf_session *session;
	struct perf_data data = {
		.mode = PERF_DATA_MODE_READ,
		.path = PERF_DATA,
		.file.fd = tpebs_cmd.out,
	};

	perf_tool__init(&tool, /*ordered_events=*/false);
	tool.sample = process_sample_event;
	tool.feature = process_feature_event;
	tool.attr = perf_event__process_attr;

	session = perf_session__new(&data, &tool);
	if (!IS_ERR(session)) {
		perf_session__process_events(session);
		perf_session__delete(session);
	}
	return NULL;
}
252 
/**
 * tpebs_send_record_cmd - send a control message to the forked `perf record`
 * process and, except for the stop command, wait for its ack.
 * @msg: one of the EVLIST_CTL_CMD_*_TAG strings.
 *
 * Called and returns with tpebs_mtx held, but releases it while blocking on
 * the pipes. Returns 0 on success or when record has already exited, a
 * negative errno otherwise.
 */
static int tpebs_send_record_cmd(const char *msg) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get())
{
	struct pollfd pollfd = { .events = POLLIN, };
	int ret, len, retries = 0;
	char ack_buf[8];

	/* Check if the command exited before the send, done with the lock held. */
	if (tpebs_cmd.pid == 0)
		return 0;

	/*
	 * Let go of the lock while sending/receiving as blocking can starve the
	 * sample reading thread.
	 */
	mutex_unlock(tpebs_mtx_get());

	/* Send perf record command.*/
	len = strlen(msg);
	ret = write(control_fd[1], msg, len);
	if (ret != len) {
		pr_err("perf record control write control message '%s' failed\n", msg);
		ret = -EPIPE;
		goto out;
	}

	/* The stop command gets no ack; record exits instead of replying. */
	if (!strcmp(msg, EVLIST_CTL_CMD_STOP_TAG)) {
		ret = 0;
		goto out;
	}

	/* Wait for an ack. */
	pollfd.fd = ack_fd[0];

	/*
	 * We need this poll to ensure the ack_fd PIPE will not hang
	 * when perf record failed for any reason. The overall timeout of
	 * 7 x 500ms (3.5s) is an empirical selection.
	 */
again:
	if (!poll(&pollfd, 1, 500)) {
		/* Timed out; a finished record process will never ack. */
		if (check_if_command_finished(&tpebs_cmd)) {
			ret = 0;
			goto out;
		}

		if (retries++ < 6)
			goto again;
		pr_err("tpebs failed: perf record ack timeout for '%s'\n", msg);
		ret = -ETIMEDOUT;
		goto out;
	}

	if (!(pollfd.revents & POLLIN)) {
		/* Pipe error/hangup; again fine if record simply finished. */
		if (check_if_command_finished(&tpebs_cmd)) {
			ret = 0;
			goto out;
		}

		pr_err("tpebs failed: did not received an ack for '%s'\n", msg);
		ret = -EPIPE;
		goto out;
	}

	/* Nonzero strcmp (i.e. not the ack tag) is returned as failure. */
	ret = read(ack_fd[0], ack_buf, sizeof(ack_buf));
	if (ret > 0)
		ret = strcmp(ack_buf, EVLIST_CTL_CMD_ACK_TAG);
	else
		pr_err("tpebs: perf record control ack failed\n");
out:
	/* Re-take lock as expected by caller. */
	mutex_lock(tpebs_mtx_get());
	return ret;
}
326 
327 /*
328  * tpebs_stop - stop the sample data read thread and the perf record process.
329  */
tpebs_stop(void)330 static int tpebs_stop(void) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get())
331 {
332 	int ret = 0;
333 
334 	/* Like tpebs_start, we should only run tpebs_end once. */
335 	if (tpebs_cmd.pid != 0) {
336 		tpebs_send_record_cmd(EVLIST_CTL_CMD_STOP_TAG);
337 		tpebs_cmd.pid = 0;
338 		mutex_unlock(tpebs_mtx_get());
339 		pthread_join(tpebs_reader_thread, NULL);
340 		mutex_lock(tpebs_mtx_get());
341 		close(control_fd[0]);
342 		close(control_fd[1]);
343 		close(ack_fd[0]);
344 		close(ack_fd[1]);
345 		close(tpebs_cmd.out);
346 		ret = finish_command(&tpebs_cmd);
347 		tpebs_cmd.pid = 0;
348 		if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
349 			ret = 0;
350 	}
351 	return ret;
352 }
353 
354 /**
355  * evsel__tpebs_event() - Create string event encoding to pass to `perf record`.
356  */
evsel__tpebs_event(struct evsel * evsel,char ** event)357 static int evsel__tpebs_event(struct evsel *evsel, char **event)
358 {
359 	char *name, *modifier;
360 	int ret;
361 
362 	name = strdup(evsel->name);
363 	if (!name)
364 		return -ENOMEM;
365 
366 	modifier = strrchr(name, 'R');
367 	if (!modifier) {
368 		ret = -EINVAL;
369 		goto out;
370 	}
371 	*modifier = 'p';
372 	modifier = strchr(name, ':');
373 	if (!modifier)
374 		modifier = strrchr(name, '/');
375 	if (!modifier) {
376 		ret = -EINVAL;
377 		goto out;
378 	}
379 	*modifier = '\0';
380 	if (asprintf(event, "%s/name=tpebs_event_%p/%s", name, evsel, modifier + 1) > 0)
381 		ret = 0;
382 	else
383 		ret = -ENOMEM;
384 out:
385 	if (ret)
386 		pr_err("Tpebs event modifier broken '%s'\n", evsel->name);
387 	free(name);
388 	return ret;
389 }
390 
tpebs_retire_lat__new(struct evsel * evsel)391 static struct tpebs_retire_lat *tpebs_retire_lat__new(struct evsel *evsel)
392 {
393 	struct tpebs_retire_lat *result = zalloc(sizeof(*result));
394 	int ret;
395 
396 	if (!result)
397 		return NULL;
398 
399 	ret = evsel__tpebs_event(evsel, &result->event);
400 	if (ret) {
401 		free(result);
402 		return NULL;
403 	}
404 	result->evsel = evsel;
405 	return result;
406 }
407 
tpebs_retire_lat__delete(struct tpebs_retire_lat * r)408 static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r)
409 {
410 	zfree(&r->event);
411 	free(r);
412 }
413 
/**
 * tpebs_retire_lat__find - find the tpebs_results entry matching @evsel.
 * @evsel: either a retire_lat evsel from the user's evlist, or an evsel
 *         created while reading events back from `perf record`.
 *
 * Must be called with tpebs_mtx held (see the forward declaration's
 * EXCLUSIVE_LOCKS_REQUIRED annotation). Returns NULL when nothing matches.
 */
static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel)
{
	struct tpebs_retire_lat *t;
	unsigned long num;
	const char *evsel_name;

	/*
	 * Evsels will match for evlist with the retirement latency event. The
	 * name with "tpebs_event_" prefix will be present on events being read
	 * from `perf record`.
	 */
	if (evsel__is_retire_lat(evsel)) {
		/* User-side evsel: match by pointer identity. */
		list_for_each_entry(t, &tpebs_results, nd) {
			if (t->evsel == evsel)
				return t;
		}
		return NULL;
	}
	evsel_name = strstr(evsel->name, "tpebs_event_");
	if (!evsel_name) {
		/* Unexpected that the perf record should have other events. */
		return NULL;
	}
	/*
	 * evsel__tpebs_event() encoded the owning evsel's address into the
	 * event name with "%p"; parse it back (strlen("tpebs_event_") == 12)
	 * and match entries on that address.
	 */
	errno = 0;
	num = strtoull(evsel_name + 12, NULL, 16);
	if (errno) {
		pr_err("Bad evsel for tpebs find '%s'\n", evsel->name);
		return NULL;
	}
	list_for_each_entry(t, &tpebs_results, nd) {
		if ((unsigned long)t->evsel == num)
			return t;
	}
	return NULL;
}
449 
450 /**
451  * evsel__tpebs_prepare - create tpebs data structures ready for opening.
452  * @evsel: retire_latency evsel, all evsels on its list will be prepared.
453  */
evsel__tpebs_prepare(struct evsel * evsel)454 static int evsel__tpebs_prepare(struct evsel *evsel)
455 {
456 	struct evsel *pos;
457 	struct tpebs_retire_lat *tpebs_event;
458 
459 	mutex_lock(tpebs_mtx_get());
460 	tpebs_event = tpebs_retire_lat__find(evsel);
461 	if (tpebs_event) {
462 		/* evsel, or an identically named one, was already prepared. */
463 		mutex_unlock(tpebs_mtx_get());
464 		return 0;
465 	}
466 	tpebs_event = tpebs_retire_lat__new(evsel);
467 	if (!tpebs_event) {
468 		mutex_unlock(tpebs_mtx_get());
469 		return -ENOMEM;
470 	}
471 	list_add_tail(&tpebs_event->nd, &tpebs_results);
472 	mutex_unlock(tpebs_mtx_get());
473 
474 	/*
475 	 * Eagerly prepare all other evsels on the list to try to ensure that by
476 	 * open they are all known.
477 	 */
478 	evlist__for_each_entry(evsel->evlist, pos) {
479 		int ret;
480 
481 		if (pos == evsel || !pos->retire_lat)
482 			continue;
483 
484 		ret = evsel__tpebs_prepare(pos);
485 		if (ret)
486 			return ret;
487 	}
488 	return 0;
489 }
490 
491 /**
492  * evsel__tpebs_open - starts tpebs execution.
493  * @evsel: retire_latency evsel, all evsels on its list will be selected. Each
494  *         evsel is sampled to get the average retire_latency value.
495  */
evsel__tpebs_open(struct evsel * evsel)496 int evsel__tpebs_open(struct evsel *evsel)
497 {
498 	int ret;
499 	bool tpebs_empty;
500 
501 	/* We should only run tpebs_start when tpebs_recording is enabled. */
502 	if (!tpebs_recording)
503 		return 0;
504 	/* Only start the events once. */
505 	if (tpebs_cmd.pid != 0) {
506 		struct tpebs_retire_lat *t;
507 		bool valid;
508 
509 		mutex_lock(tpebs_mtx_get());
510 		t = tpebs_retire_lat__find(evsel);
511 		valid = t && t->started;
512 		mutex_unlock(tpebs_mtx_get());
513 		/* May fail as the event wasn't started. */
514 		return valid ? 0 : -EBUSY;
515 	}
516 
517 	ret = evsel__tpebs_prepare(evsel);
518 	if (ret)
519 		return ret;
520 
521 	mutex_lock(tpebs_mtx_get());
522 	tpebs_empty = list_empty(&tpebs_results);
523 	if (!tpebs_empty) {
524 		/*Create control and ack fd for --control*/
525 		if (pipe(control_fd) < 0) {
526 			pr_err("tpebs: Failed to create control fifo");
527 			ret = -1;
528 			goto out;
529 		}
530 		if (pipe(ack_fd) < 0) {
531 			pr_err("tpebs: Failed to create control fifo");
532 			ret = -1;
533 			goto out;
534 		}
535 
536 		ret = evsel__tpebs_start_perf_record(evsel);
537 		if (ret)
538 			goto out;
539 
540 		if (pthread_create(&tpebs_reader_thread, /*attr=*/NULL, __sample_reader,
541 				   /*arg=*/NULL)) {
542 			kill(tpebs_cmd.pid, SIGTERM);
543 			close(tpebs_cmd.out);
544 			pr_err("Could not create thread to process sample data.\n");
545 			ret = -1;
546 			goto out;
547 		}
548 		ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_ENABLE_TAG);
549 	}
550 out:
551 	if (ret) {
552 		struct tpebs_retire_lat *t = tpebs_retire_lat__find(evsel);
553 
554 		list_del_init(&t->nd);
555 		tpebs_retire_lat__delete(t);
556 	}
557 	mutex_unlock(tpebs_mtx_get());
558 	return ret;
559 }
560 
/**
 * evsel__tpebs_read - fill in @evsel's count with a retirement latency value.
 * @evsel: retire_lat evsel whose counts are being set.
 * @cpu_map_idx: only index 0 is updated; other indices return immediately.
 * @thread: only thread 0 is updated; other threads return immediately.
 *
 * Uses sampled data gathered by the reader thread when available; otherwise
 * falls back to the precomputed evsel->retirement_latency values. Which
 * statistic is used (mean/min/max/last) is chosen by the global tpebs_mode.
 */
int evsel__tpebs_read(struct evsel *evsel, int cpu_map_idx, int thread)
{
	struct perf_counts_values *count, *old_count = NULL;
	struct tpebs_retire_lat *t;
	uint64_t val;
	int ret;

	/* Only set retire_latency value to the first CPU and thread. */
	if (cpu_map_idx != 0 || thread != 0)
		return 0;

	if (evsel->prev_raw_counts)
		old_count = perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread);

	count = perf_counts(evsel->counts, cpu_map_idx, thread);

	mutex_lock(tpebs_mtx_get());
	t = tpebs_retire_lat__find(evsel);
	/*
	 * If reading the first tpebs result, send a ping to the record
	 * process. Allow the sample reader a chance to read by releasing and
	 * reacquiring the lock.
	 */
	if (t && &t->nd == tpebs_results.next) {
		ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_PING_TAG);
		mutex_unlock(tpebs_mtx_get());
		if (ret)
			return ret;
		mutex_lock(tpebs_mtx_get());
	}
	if (t == NULL || t->stats.n == 0) {
		/* No sample data, use default. */
		if (tpebs_recording) {
			pr_warning_once(
				"Using precomputed retirement latency data as no samples\n");
		}
		val = 0;
		switch (tpebs_mode) {
		case TPEBS_MODE__MIN:
			val = rint(evsel->retirement_latency.min);
			break;
		case TPEBS_MODE__MAX:
			val = rint(evsel->retirement_latency.max);
			break;
		default:
		case TPEBS_MODE__LAST:
		case TPEBS_MODE__MEAN:
			val = rint(evsel->retirement_latency.mean);
			break;
		}
	} else {
		/* Sampled data available; pick the statistic per tpebs_mode. */
		switch (tpebs_mode) {
		case TPEBS_MODE__MIN:
			val = t->stats.min;
			break;
		case TPEBS_MODE__MAX:
			val = t->stats.max;
			break;
		case TPEBS_MODE__LAST:
			val = t->last;
			break;
		default:
		case TPEBS_MODE__MEAN:
			val = rint(t->stats.mean);
			break;
		}
	}
	mutex_unlock(tpebs_mtx_get());

	/* Accumulate on top of the previous reading when interval counting. */
	if (old_count) {
		count->val = old_count->val + val;
		count->run = old_count->run + 1;
		count->ena = old_count->ena + 1;
	} else {
		count->val = val;
		count->run++;
		count->ena++;
	}
	return 0;
}
641 
642 /**
643  * evsel__tpebs_close() - delete tpebs related data. If the last event, stop the
644  * created thread and process by calling tpebs_stop().
645  *
646  * This function is called in evsel__close() to be symmetric with
647  * evsel__tpebs_open() being called in evsel__open().
648  */
evsel__tpebs_close(struct evsel * evsel)649 void evsel__tpebs_close(struct evsel *evsel)
650 {
651 	struct tpebs_retire_lat *t;
652 
653 	mutex_lock(tpebs_mtx_get());
654 	t = tpebs_retire_lat__find(evsel);
655 	if (t) {
656 		list_del_init(&t->nd);
657 		tpebs_retire_lat__delete(t);
658 
659 		if (list_empty(&tpebs_results))
660 			tpebs_stop();
661 	}
662 	mutex_unlock(tpebs_mtx_get());
663 }
664