xref: /linux/tools/perf/arch/x86/util/pmu.c (revision 0fc8f6200d2313278fbf4539bbab74677c685531)
1 // SPDX-License-Identifier: GPL-2.0
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include <sys/types.h>
#include <dirent.h>
#include <fcntl.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/perf_event.h>
#include <api/fs/fs.h>
#include <api/io_dir.h>
#include <internal/cpumap.h>
#include <errno.h>
14 
15 #include "../../../util/intel-pt.h"
16 #include "../../../util/intel-bts.h"
17 #include "../../../util/pmu.h"
18 #include "../../../util/fncache.h"
19 #include "../../../util/pmus.h"
20 #include "mem-events.h"
21 #include "util/debug.h"
22 #include "util/env.h"
23 #include "util/header.h"
24 
25 static bool x86__is_intel_graniterapids(void)
26 {
27 	static bool checked_if_graniterapids;
28 	static bool is_graniterapids;
29 
30 	if (!checked_if_graniterapids) {
31 		const char *graniterapids_cpuid = "GenuineIntel-6-A[DE]";
32 		char *cpuid = get_cpuid_str((struct perf_cpu){0});
33 
34 		is_graniterapids = cpuid && strcmp_cpuid_str(graniterapids_cpuid, cpuid) == 0;
35 		free(cpuid);
36 		checked_if_graniterapids = true;
37 	}
38 	return is_graniterapids;
39 }
40 
/* Read a sysfs CPU list file (e.g. "0-39,80-119") and parse it into a
 * perf_cpu_map. Returns NULL if the file can't be read; the caller owns
 * the returned map (release with perf_cpu_map__put()).
 */
static struct perf_cpu_map *read_sysfs_cpu_map(const char *sysfs_path)
{
	struct perf_cpu_map *result = NULL;
	char *contents = NULL;
	size_t contents_len;

	if (sysfs__read_str(sysfs_path, &contents, &contents_len) >= 0) {
		result = perf_cpu_map__new(contents);
		free(contents);
	}
	return result;
}
54 
/*
 * Number of sub-NUMA cluster (SNC) nodes sharing an L3 cache, memoized on
 * first use. Computed as the ratio of CPUs sharing CPU0's L3 cache to CPUs
 * in NUMA node 0. Values <= 1 mean SNC is off or the topology couldn't be
 * determined.
 */
static int snc_nodes_per_l3_cache(void)
{
	static bool checked_snc;
	static int snc_nodes;

	if (!checked_snc) {
		struct perf_cpu_map *node_cpus =
			read_sysfs_cpu_map("devices/system/node/node0/cpulist");
		struct perf_cpu_map *cache_cpus =
			read_sysfs_cpu_map("devices/system/cpu/cpu0/cache/index3/shared_cpu_list");
		int num_node_cpus = perf_cpu_map__nr(node_cpus);

		/*
		 * Guard the division: an empty node0 cpulist would otherwise
		 * divide by zero. Fall back to 1 (i.e. "no SNC").
		 */
		if (num_node_cpus > 0)
			snc_nodes = perf_cpu_map__nr(cache_cpus) / num_node_cpus;
		else
			snc_nodes = 1;
		perf_cpu_map__put(cache_cpus);
		perf_cpu_map__put(node_cpus);
		checked_snc = true;
	}
	return snc_nodes;
}
73 
74 static int num_chas(void)
75 {
76 	static bool checked_chas;
77 	static int num_chas;
78 
79 	if (!checked_chas) {
80 		int fd = perf_pmu__event_source_devices_fd();
81 		struct io_dir dir;
82 		struct io_dirent64 *dent;
83 
84 		if (fd < 0)
85 			return -1;
86 
87 		io_dir__init(&dir, fd);
88 
89 		while ((dent = io_dir__readdir(&dir)) != NULL) {
90 			/* Note, dent->d_type will be DT_LNK and so isn't a useful filter. */
91 			if (strstarts(dent->d_name, "uncore_cha_"))
92 				num_chas++;
93 		}
94 		close(fd);
95 		checked_chas = true;
96 	}
97 	return num_chas;
98 }
99 
100 #define MAX_SNCS 6
101 
102 static int uncore_cha_snc(struct perf_pmu *pmu)
103 {
104 	// CHA SNC numbers are ordered correspond to the CHAs number.
105 	unsigned int cha_num;
106 	int num_cha, chas_per_node, cha_snc;
107 	int snc_nodes = snc_nodes_per_l3_cache();
108 
109 	if (snc_nodes <= 1)
110 		return 0;
111 
112 	num_cha = num_chas();
113 	if (num_cha <= 0) {
114 		pr_warning("Unexpected: no CHAs found\n");
115 		return 0;
116 	}
117 
118 	/* Compute SNC for PMU. */
119 	if (sscanf(pmu->name, "uncore_cha_%u", &cha_num) != 1) {
120 		pr_warning("Unexpected: unable to compute CHA number '%s'\n", pmu->name);
121 		return 0;
122 	}
123 	chas_per_node = num_cha / snc_nodes;
124 	cha_snc = cha_num / chas_per_node;
125 
126 	/* Range check cha_snc. for unexpected out of bounds. */
127 	return cha_snc >= MAX_SNCS ? 0 : cha_snc;
128 }
129 
130 static int uncore_imc_snc(struct perf_pmu *pmu)
131 {
132 	// Compute the IMC SNC using lookup tables.
133 	unsigned int imc_num;
134 	int snc_nodes = snc_nodes_per_l3_cache();
135 	const u8 snc2_map[] = {1, 1, 0, 0, 1, 1, 0, 0};
136 	const u8 snc3_map[] = {1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2};
137 	const u8 *snc_map;
138 	size_t snc_map_len;
139 
140 	switch (snc_nodes) {
141 	case 2:
142 		snc_map = snc2_map;
143 		snc_map_len = ARRAY_SIZE(snc2_map);
144 		break;
145 	case 3:
146 		snc_map = snc3_map;
147 		snc_map_len = ARRAY_SIZE(snc3_map);
148 		break;
149 	default:
150 		/* Error or no lookup support for SNC with >3 nodes. */
151 		return 0;
152 	}
153 
154 	/* Compute SNC for PMU. */
155 	if (sscanf(pmu->name, "uncore_imc_%u", &imc_num) != 1) {
156 		pr_warning("Unexpected: unable to compute IMC number '%s'\n", pmu->name);
157 		return 0;
158 	}
159 	if (imc_num >= snc_map_len) {
160 		pr_warning("Unexpected IMC %d for SNC%d mapping\n", imc_num, snc_nodes);
161 		return 0;
162 	}
163 	return snc_map[imc_num];
164 }
165 
166 static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
167 {
168 	static bool checked_cpu_adjust[MAX_SNCS];
169 	static int cpu_adjust[MAX_SNCS];
170 	struct perf_cpu_map *node_cpus;
171 	char node_path[] = "devices/system/node/node0/cpulist";
172 
173 	/* Was adjust already computed? */
174 	if (checked_cpu_adjust[pmu_snc])
175 		return cpu_adjust[pmu_snc];
176 
177 	/* SNC0 doesn't need an adjust. */
178 	if (pmu_snc == 0) {
179 		cpu_adjust[0] = 0;
180 		checked_cpu_adjust[0] = true;
181 		return 0;
182 	}
183 
184 	/*
185 	 * Use NUMA topology to compute first CPU of the NUMA node, we want to
186 	 * adjust CPU 0 to be this and similarly for other CPUs if there is >1
187 	 * socket.
188 	 */
189 	assert(pmu_snc >= 0 && pmu_snc <= 9);
190 	node_path[24] += pmu_snc; // Shift node0 to be node<pmu_snc>.
191 	node_cpus = read_sysfs_cpu_map(node_path);
192 	cpu_adjust[pmu_snc] = perf_cpu_map__cpu(node_cpus, 0).cpu;
193 	if (cpu_adjust[pmu_snc] < 0) {
194 		pr_debug("Failed to read valid CPU list from <sysfs>/%s\n", node_path);
195 		cpu_adjust[pmu_snc] = 0;
196 	} else {
197 		checked_cpu_adjust[pmu_snc] = true;
198 	}
199 	perf_cpu_map__put(node_cpus);
200 	return cpu_adjust[pmu_snc];
201 }
202 
/*
 * Rewrite the cpumask of a Granite Rapids CHA (cha=true) or IMC (cha=false)
 * uncore PMU so events are attributed to the correct SNC NUMA node.
 */
static void gnr_uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
{
	// With sub-NUMA clustering (SNC) there is a NUMA node per SNC in the
	// topology. For example, a two socket graniterapids machine may be set
	// up with 3-way SNC meaning there are 6 NUMA nodes that should be
	// displayed with --per-node. The cpumask of the CHA and IMC PMUs
	// reflects per-socket information meaning, for example, uncore_cha_60
	// on a two socket graniterapids machine with 120 cores per socket will
	// have a cpumask of "0,120". This cpumask needs adjusting to "40,160"
	// to reflect that uncore_cha_60 is used for the 2nd SNC of each
	// socket. Without the adjustment events on uncore_cha_60 will appear in
	// node 0 and node 3 (in our example 2 socket 3-way set up), but with
	// the adjustment they will appear in node 1 and node 4. The number of
	// CHAs is typically larger than the number of cores. The CHA numbers
	// are assumed to split evenly and inorder wrt core numbers. There are
	// fewer memory IMC PMUs than cores and mapping is handled using lookup
	// tables.
	//
	// One adjusted map is built per SNC node (for CHA and IMC separately)
	// and then shared by every PMU mapping to that SNC.
	static struct perf_cpu_map *cha_adjusted[MAX_SNCS];
	static struct perf_cpu_map *imc_adjusted[MAX_SNCS];
	struct perf_cpu_map **adjusted = cha ? cha_adjusted : imc_adjusted;
	unsigned int idx;
	int pmu_snc, cpu_adjust;
	struct perf_cpu cpu;
	bool alloc;

	// Cpus from the kernel holds first CPU of each socket. e.g. 0,120.
	if (perf_cpu_map__cpu(pmu->cpus, 0).cpu != 0) {
		pr_debug("Ignoring cpumask adjust for %s as unexpected first CPU\n", pmu->name);
		return;
	}

	pmu_snc = cha ? uncore_cha_snc(pmu) : uncore_imc_snc(pmu);
	if (pmu_snc == 0) {
		// No adjustment necessary for the first SNC.
		return;
	}

	// First PMU seen for this SNC builds the shared adjusted map.
	alloc = adjusted[pmu_snc] == NULL;
	if (alloc) {
		// Hold onto the perf_cpu_map globally to avoid recomputation.
		cpu_adjust = uncore_cha_imc_compute_cpu_adjust(pmu_snc);
		adjusted[pmu_snc] = perf_cpu_map__empty_new(perf_cpu_map__nr(pmu->cpus));
		if (!adjusted[pmu_snc])
			return;
	}

	perf_cpu_map__for_each_cpu(cpu, idx, pmu->cpus) {
		// Compute the new cpu map values or if not allocating, assert
		// that they match expectations. asserts will be removed to
		// avoid overhead in NDEBUG builds.
		if (alloc) {
			// Write the shifted CPU straight into the new map's slot.
			RC_CHK_ACCESS(adjusted[pmu_snc])->map[idx].cpu = cpu.cpu + cpu_adjust;
		} else if (idx == 0) {
			cpu_adjust = perf_cpu_map__cpu(adjusted[pmu_snc], idx).cpu - cpu.cpu;
			assert(uncore_cha_imc_compute_cpu_adjust(pmu_snc) == cpu_adjust);
		} else {
			assert(perf_cpu_map__cpu(adjusted[pmu_snc], idx).cpu ==
			       cpu.cpu + cpu_adjust);
		}
	}

	// Swap the PMU's map for a reference on the shared adjusted map.
	perf_cpu_map__put(pmu->cpus);
	pmu->cpus = perf_cpu_map__get(adjusted[pmu_snc]);
}
267 
268 void perf_pmu__arch_init(struct perf_pmu *pmu)
269 {
270 	struct perf_pmu_caps *ldlat_cap;
271 
272 	if (!strcmp(pmu->name, INTEL_PT_PMU_NAME)) {
273 		pmu->auxtrace = true;
274 		pmu->selectable = true;
275 		pmu->perf_event_attr_init_default = intel_pt_pmu_default_config;
276 	}
277 	if (!strcmp(pmu->name, INTEL_BTS_PMU_NAME)) {
278 		pmu->auxtrace = true;
279 		pmu->selectable = true;
280 	}
281 
282 	if (x86__is_amd_cpu()) {
283 		if (strcmp(pmu->name, "ibs_op"))
284 			return;
285 
286 		pmu->mem_events = perf_mem_events_amd;
287 
288 		if (!perf_pmu__caps_parse(pmu))
289 			return;
290 
291 		ldlat_cap = perf_pmu__get_cap(pmu, "ldlat");
292 		if (!ldlat_cap || strcmp(ldlat_cap->value, "1"))
293 			return;
294 
295 		perf_mem_events__loads_ldlat = 0;
296 		pmu->mem_events = perf_mem_events_amd_ldlat;
297 	} else {
298 		if (pmu->is_core) {
299 			if (perf_pmu__have_event(pmu, "mem-loads-aux"))
300 				pmu->mem_events = perf_mem_events_intel_aux;
301 			else
302 				pmu->mem_events = perf_mem_events_intel;
303 		} else if (x86__is_intel_graniterapids()) {
304 			if (strstarts(pmu->name, "uncore_cha_"))
305 				gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
306 			else if (strstarts(pmu->name, "uncore_imc_"))
307 				gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
308 		}
309 	}
310 }
311