// SPDX-License-Identifier: GPL-2.0-only
/*
 * intel_tpebs.c: Intel TPEBS support
 */

#include <errno.h>
#include <math.h>
#include <poll.h>
#include <pthread.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <subcmd/run-command.h>
#include <linux/err.h>
#include <linux/list.h>
#include <linux/zalloc.h>
#include "intel-tpebs.h"
#include "sample.h"
#include "debug.h"
#include "evlist.h"
#include "evsel.h"
#include "session.h"
#include "tool.h"
#include "cpumap.h"
#include "metricgroup.h"

#define PERF_DATA		"-"

bool tpebs_recording;
static pid_t tpebs_pid = -1;
static size_t tpebs_event_size;
static LIST_HEAD(tpebs_results);
static pthread_t tpebs_reader_thread;
static struct child_process *tpebs_cmd;

struct tpebs_retire_lat {
	struct list_head nd;
	/* Event name used for sampling (R modifier replaced with p) */
	const char *name;
	/* Event name with the TPEBS modifier R */
	const char *tpebs_name;
	/* Count of retire_latency values found in sample data */
	size_t count;
	/* Sum of all the retire_latency values in sample data */
	int sum;
	/* Average of retire_latency, val = sum / count */
	double val;
};

static int get_perf_record_args(const char **record_argv, char buf[],
				const char *cpumap_buf)
{
	struct tpebs_retire_lat *e;
	int i = 0;

	pr_debug("tpebs: Prepare perf record for retire_latency\n");

	record_argv[i++] = "perf";
	record_argv[i++] = "record";
	record_argv[i++] = "-W";
	record_argv[i++] = "--synth=no";
	record_argv[i++] = buf;

	if (!cpumap_buf) {
		pr_err("tpebs: Require cpumap list to run sampling\n");
		return -ECANCELED;
	}
	/* Use -C when cpumap_buf is not "-1" */
	if (strcmp(cpumap_buf, "-1")) {
		record_argv[i++] = "-C";
		record_argv[i++] = cpumap_buf;
	}

	list_for_each_entry(e, &tpebs_results, nd) {
		record_argv[i++] = "-e";
		record_argv[i++] = e->name;
	}

	record_argv[i++] = "-o";
	record_argv[i++] = PERF_DATA;

	return 0;
}

static int prepare_run_command(const char **argv)
{
	tpebs_cmd = zalloc(sizeof(struct child_process));
	if (!tpebs_cmd)
		return -ENOMEM;
	tpebs_cmd->argv = argv;
	tpebs_cmd->out = -1;
	return 0;
}

static int start_perf_record(int control_fd[], int ack_fd[],
			     const char *cpumap_buf)
{
	const char **record_argv;
	int ret;
	char buf[32];

	scnprintf(buf, sizeof(buf), "--control=fd:%d,%d", control_fd[0], ack_fd[1]);

	record_argv = calloc(12 + 2 * tpebs_event_size, sizeof(char *));
	if (!record_argv)
		return -ENOMEM;

	ret = get_perf_record_args(record_argv, buf, cpumap_buf);
	if (ret)
		goto out;

	ret = prepare_run_command(record_argv);
	if (ret)
		goto out;
	ret = start_command(tpebs_cmd);
out:
	free(record_argv);
	return ret;
}
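/*
 * For illustration only: with hypothetical control fds 10 and 11, a cpu list
 * of "0-1" and a single entry in tpebs_results, the argv assembled above
 * corresponds roughly to:
 *
 *   perf record -W --synth=no --control=fd:10,11 -C 0-1 -e <event> -o -
 *
 * The fd numbers, cpu list and event name are made up; the real values come
 * from pipe(), the evlist cpu map and the names stored in tpebs_results.
 */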
static int process_sample_event(const struct perf_tool *tool __maybe_unused,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample,
				struct evsel *evsel,
				struct machine *machine __maybe_unused)
{
	int ret = 0;
	const char *evname;
	struct tpebs_retire_lat *t;

	evname = evsel__name(evsel);

	/*
	 * Need to handle per core results? We are assuming the average retire
	 * latency value will be used. Save the number of samples and the sum
	 * of the retire latency values for each event.
	 */
	list_for_each_entry(t, &tpebs_results, nd) {
		if (!strcmp(evname, t->name)) {
			t->count += 1;
			t->sum += sample->retire_lat;
			t->val = (double) t->sum / t->count;
			break;
		}
	}

	return ret;
}
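/*
 * Worked example of the bookkeeping above (numbers are made up): three
 * samples with retire_lat 10, 30 and 20 leave count = 3, sum = 60 and
 * val = 60 / 3 = 20.0, which is what tpebs_set_evsel() later reports after
 * rounding with rint().
 */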
static int process_feature_event(struct perf_session *session,
				 union perf_event *event)
{
	if (event->feat.feat_id < HEADER_LAST_FEATURE)
		return perf_event__process_feature(session, event);
	return 0;
}

static void *__sample_reader(void *arg)
{
	struct child_process *child = arg;
	struct perf_session *session;
	struct perf_data data = {
		.mode = PERF_DATA_MODE_READ,
		.path = PERF_DATA,
		.file.fd = child->out,
	};
	struct perf_tool tool;

	perf_tool__init(&tool, /*ordered_events=*/false);
	tool.sample = process_sample_event;
	tool.feature = process_feature_event;
	tool.attr = perf_event__process_attr;

	session = perf_session__new(&data, &tool);
	if (IS_ERR(session))
		return NULL;
	perf_session__process_events(session);
	perf_session__delete(session);

	return NULL;
}

/*
 * tpebs_stop - stop the sample data read thread and the perf record process.
 */
static int tpebs_stop(void)
{
	int ret = 0;

	/* Like tpebs_start, we should only run tpebs_stop once. */
	if (tpebs_pid != -1) {
		kill(tpebs_cmd->pid, SIGTERM);
		tpebs_pid = -1;
		pthread_join(tpebs_reader_thread, NULL);
		close(tpebs_cmd->out);
		ret = finish_command(tpebs_cmd);
		if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
			ret = 0;
	}
	return ret;
}

/*
 * tpebs_start - start tpebs execution.
 * @evsel_list: retire_latency evsels in this list will be selected and
 * sampled to get the average retire_latency value.
 *
 * This function will be called from the evlist level later, once
 * evlist__open() is called consistently.
 */
int tpebs_start(struct evlist *evsel_list)
{
	int ret = 0;
	struct evsel *evsel;
	char cpumap_buf[50];

	/*
	 * We should only run tpebs_start when tpebs_recording is enabled.
	 * And we should only run it once with all the required events.
	 */
	if (tpebs_pid != -1 || !tpebs_recording)
		return 0;

	cpu_map__snprint(evsel_list->core.user_requested_cpus, cpumap_buf,
			 sizeof(cpumap_buf));

	/*
	 * Prepare perf record for sampling the retire_latency events before
	 * the workload is forked and prepared.
	 */
	evlist__for_each_entry(evsel_list, evsel) {
		int i;
		char *name;
		struct tpebs_retire_lat *new;

		if (!evsel->retire_lat)
			continue;

		pr_debug("tpebs: Retire_latency of event %s is required\n", evsel->name);
		for (i = strlen(evsel->name) - 1; i > 0; i--) {
			if (evsel->name[i] == 'R')
				break;
		}
		if (i <= 0 || evsel->name[i] != 'R') {
			ret = -1;
			goto err;
		}

		name = strdup(evsel->name);
		if (!name) {
			ret = -ENOMEM;
			goto err;
		}
		name[i] = 'p';

		new = zalloc(sizeof(*new));
		if (!new) {
			ret = -1;
			zfree(&name);
			goto err;
		}
		new->name = name;
		new->tpebs_name = evsel->name;
		list_add_tail(&new->nd, &tpebs_results);
		tpebs_event_size += 1;
	}

	if (tpebs_event_size > 0) {
		struct pollfd pollfd = { .events = POLLIN, };
		int control_fd[2], ack_fd[2], len;
		char ack_buf[8];

		/* Create control and ack fds for --control */
		if (pipe(control_fd) < 0) {
			pr_err("tpebs: Failed to create control fifo\n");
			ret = -1;
			goto out;
		}
		if (pipe(ack_fd) < 0) {
			pr_err("tpebs: Failed to create ack fifo\n");
			ret = -1;
			goto out;
		}

		ret = start_perf_record(control_fd, ack_fd, cpumap_buf);
		if (ret)
			goto out;
		tpebs_pid = tpebs_cmd->pid;
		if (pthread_create(&tpebs_reader_thread, NULL, __sample_reader, tpebs_cmd)) {
			kill(tpebs_cmd->pid, SIGTERM);
			close(tpebs_cmd->out);
			pr_err("Could not create thread to process sample data.\n");
			ret = -1;
			goto out;
		}
		/* Wait for perf record initialization. */
		len = strlen(EVLIST_CTL_CMD_ENABLE_TAG);
		ret = write(control_fd[1], EVLIST_CTL_CMD_ENABLE_TAG, len);
		if (ret != len) {
			pr_err("perf record control write control message failed\n");
			goto out;
		}

		/* Wait for an ack. */
		pollfd.fd = ack_fd[0];

		/*
		 * We need this poll to ensure the ack_fd PIPE will not hang
		 * when perf record failed for any reason. The timeout value
		 * 3000ms is an empirical selection.
		 */
		if (!poll(&pollfd, 1, 3000)) {
			pr_err("tpebs failed: perf record ack timeout\n");
			ret = -1;
			goto out;
		}

		if (!(pollfd.revents & POLLIN)) {
			pr_err("tpebs failed: did not receive an ack\n");
			ret = -1;
			goto out;
		}

		ret = read(ack_fd[0], ack_buf, sizeof(ack_buf));

		if (ret > 0)
			ret = strcmp(ack_buf, EVLIST_CTL_CMD_ACK_TAG);
		else {
			pr_err("tpebs: perf record control ack failed\n");
			goto out;
		}
out:
		close(control_fd[0]);
		close(control_fd[1]);
		close(ack_fd[0]);
		close(ack_fd[1]);
	}
err:
	if (ret)
		tpebs_delete();

	return ret;
}
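/*
 * Illustration of the name rewrite done in tpebs_start() above, using a
 * hypothetical event name: an evsel named "event_a:R" is stored with
 * name "event_a:p" (so the forked perf record samples it as a precise
 * event), while tpebs_name keeps the original "event_a:R" spelling for
 * the lookup in tpebs_set_evsel().
 */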
int tpebs_set_evsel(struct evsel *evsel, int cpu_map_idx, int thread)
{
	__u64 val;
	bool found = false;
	struct tpebs_retire_lat *t;
	struct perf_counts_values *count;

	/* Non retire_latency evsels should never enter this function. */
	if (!evsel__is_retire_lat(evsel))
		return -1;

	/*
	 * Stop the forked perf record first, to make sure the sampled data
	 * has been read from the PIPE and processed, and a non-zero
	 * retire_lat value is available for hybrid.
	 */
	tpebs_stop();
	count = perf_counts(evsel->counts, cpu_map_idx, thread);

	list_for_each_entry(t, &tpebs_results, nd) {
		if (t->tpebs_name == evsel->name ||
		    (evsel->metric_id && !strcmp(t->tpebs_name, evsel->metric_id))) {
			found = true;
			break;
		}
	}

	/* Set ena and run to non-zero */
	count->ena = count->run = 1;
	count->lost = 0;

	if (!found) {
		/*
		 * Default to 0 when no retire_latency for this event was
		 * found in the sampling data (record_tpebs not set or 0
		 * samples recorded).
		 */
		count->val = 0;
		return 0;
	}

	/*
	 * Only set the retire_latency value for the first CPU and thread.
	 */
	if (cpu_map_idx == 0 && thread == 0)
		val = rint(t->val);
	else
		val = 0;

	count->val = val;
	return 0;
}

static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r)
{
	zfree(&r->name);
	free(r);
}

/*
 * tpebs_delete - delete tpebs related data and stop the created thread and
 * process by calling tpebs_stop().
 *
 * This function is called from evlist_delete() and also from builtin-stat
 * stat_handle_error(). If tpebs_start() is called from places other than
 * perf stat, make sure tpebs_delete() is also called to safely free memory
 * and close the data read thread and the forked perf record process.
 *
 * This function is also called in evsel__close() to be symmetric with
 * tpebs_start() being called in evsel__open(). We will update this call site
 * when tpebs_start() is moved to the evlist level.
 */
void tpebs_delete(void)
{
	struct tpebs_retire_lat *r, *rtmp;

	if (tpebs_pid == -1)
		return;

	tpebs_stop();

	list_for_each_entry_safe(r, rtmp, &tpebs_results, nd) {
		list_del_init(&r->nd);
		tpebs_retire_lat__delete(r);
	}

	if (tpebs_cmd) {
		free(tpebs_cmd);
		tpebs_cmd = NULL;
	}
}
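/*
 * Rough lifecycle as driven by perf stat (a sketch pieced together from the
 * comments above, not a strict contract):
 *
 *   tpebs_recording = true;        // TPEBS recording requested by the user
 *   tpebs_start(evsel_list);       // fork perf record, spawn __sample_reader
 *   ...counting runs...
 *   tpebs_set_evsel(evsel, 0, 0);  // stops sampling, reports rint(average retire_lat)
 *   tpebs_delete();                // free tpebs_results and reap the child process
 */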