// SPDX-License-Identifier: GPL-2.0
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>
#include <dirent.h>
#include <fcntl.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/perf_event.h>
#include <api/fs/fs.h>
#include <api/io_dir.h>
#include <internal/cpumap.h>
#include <errno.h>

#include "../../../util/intel-pt.h"
#include "../../../util/intel-bts.h"
#include "../../../util/pmu.h"
#include "../../../util/fncache.h"
#include "../../../util/pmus.h"
#include "mem-events.h"
#include "util/debug.h"
#include "util/env.h"
#include "util/header.h"

static bool x86__is_intel_graniterapids(void)
{
	static bool checked_if_graniterapids;
	static bool is_graniterapids;

	if (!checked_if_graniterapids) {
		const char *graniterapids_cpuid = "GenuineIntel-6-A[DE]";
		char *cpuid = get_cpuid_str((struct perf_cpu){0});

		is_graniterapids = cpuid && strcmp_cpuid_str(graniterapids_cpuid, cpuid) == 0;
		free(cpuid);
		checked_if_graniterapids = true;
	}
	return is_graniterapids;
}

static struct perf_cpu_map *read_sysfs_cpu_map(const char *sysfs_path)
{
	struct perf_cpu_map *cpus;
	char *buf = NULL;
	size_t buf_len;

	if (sysfs__read_str(sysfs_path, &buf, &buf_len) < 0)
		return NULL;

	cpus = perf_cpu_map__new(buf);
	free(buf);
	return cpus;
}

static int snc_nodes_per_l3_cache(void)
{
	static bool checked_snc;
	static int snc_nodes;

	if (!checked_snc) {
		struct perf_cpu_map *node_cpus =
			read_sysfs_cpu_map("devices/system/node/node0/cpulist");
		struct perf_cpu_map *cache_cpus =
			read_sysfs_cpu_map("devices/system/cpu/cpu0/cache/index3/shared_cpu_list");

		snc_nodes = perf_cpu_map__nr(cache_cpus) / perf_cpu_map__nr(node_cpus);
		perf_cpu_map__put(cache_cpus);
		perf_cpu_map__put(node_cpus);
		checked_snc = true;
	}
	return snc_nodes;
}

static int num_chas(void)
{
	static bool checked_chas;
	static int num_chas;

	if (!checked_chas) {
		int fd = perf_pmu__event_source_devices_fd();
		struct io_dir dir;
		struct io_dirent64 *dent;

		if (fd < 0)
			return -1;

		io_dir__init(&dir, fd);

		while ((dent = io_dir__readdir(&dir)) != NULL) {
			/* Note, dent->d_type will be DT_LNK and so isn't a useful filter. */
			if (strstarts(dent->d_name, "uncore_cha_"))
				num_chas++;
		}
		close(fd);
		checked_chas = true;
	}
	return num_chas;
}

#define MAX_SNCS 6

static int uncore_cha_snc(struct perf_pmu *pmu)
{
	// CHA SNC numbers are ordered to correspond to the CHA numbers.
	unsigned int cha_num;
	int num_cha, chas_per_node, cha_snc;
	int snc_nodes = snc_nodes_per_l3_cache();

	if (snc_nodes <= 1)
		return 0;

	num_cha = num_chas();
	if (num_cha <= 0) {
		pr_warning("Unexpected: no CHAs found\n");
		return 0;
	}

	/* Compute the SNC number for this PMU from its name. */
	if (sscanf(pmu->name, "uncore_cha_%u", &cha_num) != 1) {
		pr_warning("Unexpected: unable to compute CHA number '%s'\n", pmu->name);
		return 0;
	}
	chas_per_node = num_cha / snc_nodes;
	cha_snc = cha_num / chas_per_node;

	/* Range check cha_snc to guard against unexpected out-of-bounds values. */
	return cha_snc >= MAX_SNCS ? 0 : cha_snc;
}

static int uncore_imc_snc(struct perf_pmu *pmu)
{
	// Compute the IMC SNC using lookup tables.
	unsigned int imc_num;
	int snc_nodes = snc_nodes_per_l3_cache();
	const u8 snc2_map[] = {1, 1, 0, 0, 1, 1, 0, 0};
	const u8 snc3_map[] = {1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2};
	const u8 *snc_map;
	size_t snc_map_len;

	switch (snc_nodes) {
	case 2:
		snc_map = snc2_map;
		snc_map_len = ARRAY_SIZE(snc2_map);
		break;
	case 3:
		snc_map = snc3_map;
		snc_map_len = ARRAY_SIZE(snc3_map);
		break;
	default:
		/* Error or no lookup support for SNC with >3 nodes. */
		return 0;
	}

	/* Compute the SNC number for this PMU from its name. */
	if (sscanf(pmu->name, "uncore_imc_%u", &imc_num) != 1) {
		pr_warning("Unexpected: unable to compute IMC number '%s'\n", pmu->name);
		return 0;
	}
	if (imc_num >= snc_map_len) {
		pr_warning("Unexpected IMC %u for SNC%d mapping\n", imc_num, snc_nodes);
		return 0;
	}
	return snc_map[imc_num];
}
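
/*
 * Worked example of the two mappings above. The concrete values are
 * illustrative, assuming a hypothetical 3-way SNC socket exposing 120
 * uncore_cha_* PMUs and at least five uncore_imc_* PMUs:
 *
 *   uncore_cha_snc() for "uncore_cha_60":
 *     chas_per_node = 120 / 3 = 40
 *     cha_snc       = 60 / 40 = 1	-> 2nd SNC of the socket
 *
 *   uncore_imc_snc() for "uncore_imc_4":
 *     snc3_map[4]   = 2		-> 3rd SNC of the socket
 */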

static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
{
	static bool checked_cpu_adjust[MAX_SNCS];
	static int cpu_adjust[MAX_SNCS];
	struct perf_cpu_map *node_cpus;
	char node_path[] = "devices/system/node/node0/cpulist";

	/* Was the adjustment already computed? */
	if (checked_cpu_adjust[pmu_snc])
		return cpu_adjust[pmu_snc];

	/* SNC0 doesn't need an adjustment. */
	if (pmu_snc == 0) {
		cpu_adjust[0] = 0;
		checked_cpu_adjust[0] = true;
		return 0;
	}

	/*
	 * Use the NUMA topology to compute the first CPU of the NUMA node.
	 * We want to adjust CPU 0 to be this, and similarly for the other
	 * CPUs if there is more than one socket.
	 */
	assert(pmu_snc >= 0 && pmu_snc <= 9);
	node_path[24] += pmu_snc; // Shift "node0" to be "node<pmu_snc>".
	node_cpus = read_sysfs_cpu_map(node_path);
	cpu_adjust[pmu_snc] = perf_cpu_map__cpu(node_cpus, 0).cpu;
	if (cpu_adjust[pmu_snc] < 0) {
		pr_debug("Failed to read valid CPU list from <sysfs>/%s\n", node_path);
		cpu_adjust[pmu_snc] = 0;
	} else {
		checked_cpu_adjust[pmu_snc] = true;
	}
	perf_cpu_map__put(node_cpus);
	return cpu_adjust[pmu_snc];
}

static void gnr_uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
{
	// With sub-NUMA clustering (SNC) there is a NUMA node per SNC in the
	// topology. For example, a two-socket Granite Rapids machine may be
	// set up with 3-way SNC, meaning there are 6 NUMA nodes that should
	// be displayed with --per-node. The cpumask of the CHA and IMC PMUs
	// reflects per-socket information, meaning, for example, uncore_cha_60
	// on a two-socket Granite Rapids machine with 120 cores per socket
	// will have a cpumask of "0,120". This cpumask needs adjusting to
	// "40,160" to reflect that uncore_cha_60 is used for the 2nd SNC of
	// each socket. Without the adjustment, events on uncore_cha_60 will
	// appear in node 0 and node 3 (in our example 2-socket, 3-way setup),
	// but with the adjustment they will appear in node 1 and node 4. The
	// number of CHAs is typically larger than the number of cores. The
	// CHA numbers are assumed to split evenly and in order with respect
	// to the core numbers. There are fewer memory IMC PMUs than cores,
	// so their mapping is handled using lookup tables.
	static struct perf_cpu_map *cha_adjusted[MAX_SNCS];
	static struct perf_cpu_map *imc_adjusted[MAX_SNCS];
	struct perf_cpu_map **adjusted = cha ? cha_adjusted : imc_adjusted;
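
/*
 * Net effect of the adjustment above, reusing the 2-socket,
 * 120-cores-per-socket, 3-way SNC example from the function comment (the
 * concrete numbers are illustrative):
 *
 *   pmu->cpus from the kernel:	"0,120"		first CPU of each socket
 *   uncore_cha_snc(pmu):	1		for uncore_cha_60, 40 CHAs per node
 *   cpu_adjust:		40		first CPU of node1
 *   pmu->cpus after adjusting:	"40,160"	events now appear in nodes 1 and 4
 */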
	unsigned int idx;
	int pmu_snc, cpu_adjust;
	struct perf_cpu cpu;
	bool alloc;

	// The cpumask from the kernel holds the first CPU of each socket, e.g. "0,120".
	if (perf_cpu_map__cpu(pmu->cpus, 0).cpu != 0) {
		pr_debug("Ignoring cpumask adjust for %s as unexpected first CPU\n", pmu->name);
		return;
	}

	pmu_snc = cha ? uncore_cha_snc(pmu) : uncore_imc_snc(pmu);
	if (pmu_snc == 0) {
		// No adjustment necessary for the first SNC.
		return;
	}

	alloc = adjusted[pmu_snc] == NULL;
	if (alloc) {
		// Hold onto the perf_cpu_map globally to avoid recomputation.
		cpu_adjust = uncore_cha_imc_compute_cpu_adjust(pmu_snc);
		adjusted[pmu_snc] = perf_cpu_map__empty_new(perf_cpu_map__nr(pmu->cpus));
		if (!adjusted[pmu_snc])
			return;
	}

	perf_cpu_map__for_each_cpu(cpu, idx, pmu->cpus) {
		// Compute the new CPU map values or, if not allocating, assert
		// that they match expectations. The asserts are compiled out in
		// NDEBUG builds to avoid overhead.
		if (alloc) {
			RC_CHK_ACCESS(adjusted[pmu_snc])->map[idx].cpu = cpu.cpu + cpu_adjust;
		} else if (idx == 0) {
			cpu_adjust = perf_cpu_map__cpu(adjusted[pmu_snc], idx).cpu - cpu.cpu;
			assert(uncore_cha_imc_compute_cpu_adjust(pmu_snc) == cpu_adjust);
		} else {
			assert(perf_cpu_map__cpu(adjusted[pmu_snc], idx).cpu ==
			       cpu.cpu + cpu_adjust);
		}
	}

	perf_cpu_map__put(pmu->cpus);
	pmu->cpus = perf_cpu_map__get(adjusted[pmu_snc]);
}

void perf_pmu__arch_init(struct perf_pmu *pmu)
{
	struct perf_pmu_caps *ldlat_cap;

	if (!strcmp(pmu->name, INTEL_PT_PMU_NAME)) {
		pmu->auxtrace = true;
		pmu->selectable = true;
		pmu->perf_event_attr_init_default = intel_pt_pmu_default_config;
	}
	if (!strcmp(pmu->name, INTEL_BTS_PMU_NAME)) {
		pmu->auxtrace = true;
		pmu->selectable = true;
	}

	if (x86__is_amd_cpu()) {
		/* Only the ibs_op PMU needs memory-event setup on AMD. */
		if (strcmp(pmu->name, "ibs_op"))
			return;

		pmu->mem_events = perf_mem_events_amd;

		if (!perf_pmu__caps_parse(pmu))
			return;

		/* Prefer the load-latency events when the "ldlat" capability is advertised. */
		ldlat_cap = perf_pmu__get_cap(pmu, "ldlat");
		if (!ldlat_cap || strcmp(ldlat_cap->value, "1"))
			return;

		perf_mem_events__loads_ldlat = 0;
		pmu->mem_events = perf_mem_events_amd_ldlat;
	} else {
		if (pmu->is_core) {
			if (perf_pmu__have_event(pmu, "mem-loads-aux"))
				pmu->mem_events = perf_mem_events_intel_aux;
			else
				pmu->mem_events = perf_mem_events_intel;
		} else if (x86__is_intel_graniterapids()) {
			if (strstarts(pmu->name, "uncore_cha_"))
				gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
			else if (strstarts(pmu->name, "uncore_imc_"))
				gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
		}
	}
}
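
/*
 * Illustrative dispatch example (the scenario is hypothetical; the names
 * are from the code above): on a Granite Rapids system, an uncore PMU
 * named "uncore_imc_3" is not a core PMU, so perf_pmu__arch_init() reaches
 * gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, false), which rewrites
 * pmu->cpus from the kernel's per-socket cpumask to the per-SNC-node one.
 */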