// SPDX-License-Identifier: GPL-2.0
#include <string.h>
#include <stdio.h>
#include <sys/types.h>
#include <dirent.h>
#include <fcntl.h>
#include <linux/stddef.h>
#include <linux/perf_event.h>
#include <linux/zalloc.h>
#include <api/fs/fs.h>
#include <api/io_dir.h>
#include <internal/cpumap.h>
#include <errno.h>

#include "../../../util/intel-pt.h"
#include "../../../util/intel-bts.h"
#include "../../../util/pmu.h"
#include "../../../util/fncache.h"
#include "../../../util/pmus.h"
#include "mem-events.h"
#include "util/debug.h"
#include "util/env.h"
#include "util/header.h"

static bool x86__is_intel_graniterapids(void)
{
	static bool checked_if_graniterapids;
	static bool is_graniterapids;

	if (!checked_if_graniterapids) {
		const char *graniterapids_cpuid = "GenuineIntel-6-A[DE]";
		char *cpuid = get_cpuid_str((struct perf_cpu){0});

		is_graniterapids = cpuid && strcmp_cpuid_str(graniterapids_cpuid, cpuid) == 0;
		free(cpuid);
		checked_if_graniterapids = true;
	}
	return is_graniterapids;
}

static struct perf_cpu_map *read_sysfs_cpu_map(const char *sysfs_path)
{
	struct perf_cpu_map *cpus;
	char *buf = NULL;
	size_t buf_len;

	if (sysfs__read_str(sysfs_path, &buf, &buf_len) < 0)
		return NULL;

	cpus = perf_cpu_map__new(buf);
	free(buf);
	return cpus;
}

static int snc_nodes_per_l3_cache(void)
{
	static bool checked_snc;
	static int snc_nodes;

	if (!checked_snc) {
		struct perf_cpu_map *node_cpus =
			read_sysfs_cpu_map("devices/system/node/node0/cpulist");
		struct perf_cpu_map *cache_cpus =
			read_sysfs_cpu_map("devices/system/cpu/cpu0/cache/index3/shared_cpu_list");

		snc_nodes = perf_cpu_map__nr(cache_cpus) / perf_cpu_map__nr(node_cpus);
		perf_cpu_map__put(cache_cpus);
		perf_cpu_map__put(node_cpus);
		checked_snc = true;
	}
	return snc_nodes;
}

static bool starts_with(const char *str, const char *prefix)
{
	return !strncmp(prefix, str, strlen(prefix));
}

static int num_chas(void)
{
	static bool checked_chas;
	static int num_chas;

	if (!checked_chas) {
		int fd = perf_pmu__event_source_devices_fd();
		struct io_dir dir;
		struct io_dirent64 *dent;

		if (fd < 0)
			return -1;

		io_dir__init(&dir, fd);

		while ((dent = io_dir__readdir(&dir)) != NULL) {
			/* Note, dent->d_type will be DT_LNK and so isn't a useful filter. */
			if (starts_with(dent->d_name, "uncore_cha_"))
				num_chas++;
		}
		close(fd);
		checked_chas = true;
	}
	return num_chas;
}

#define MAX_SNCS 6

static int uncore_cha_snc(struct perf_pmu *pmu)
{
	// CHA SNC numbers are ordered to correspond to the CHA numbers.
	unsigned int cha_num;
	int num_cha, chas_per_node, cha_snc;
	int snc_nodes = snc_nodes_per_l3_cache();

	if (snc_nodes <= 1)
		return 0;

	num_cha = num_chas();
	if (num_cha <= 0) {
		pr_warning("Unexpected: no CHAs found\n");
		return 0;
	}

	/* Compute SNC for PMU. */
	if (sscanf(pmu->name, "uncore_cha_%u", &cha_num) != 1) {
		pr_warning("Unexpected: unable to compute CHA number '%s'\n", pmu->name);
		return 0;
	}
	chas_per_node = num_cha / snc_nodes;
	cha_snc = cha_num / chas_per_node;

	/* Range check cha_snc for unexpected out-of-bounds values. */
	return cha_snc >= MAX_SNCS ? 0 : cha_snc;
}

static int uncore_imc_snc(struct perf_pmu *pmu)
{
	// Compute the IMC SNC using lookup tables.
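	// The tables below map an IMC PMU number to its SNC node for the
	// supported SNC modes; e.g. with 2-way SNC uncore_imc_2 maps to SNC
	// node 0 (snc2_map[2]) and with 3-way SNC uncore_imc_4 maps to SNC
	// node 2 (snc3_map[4]).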
	unsigned int imc_num;
	int snc_nodes = snc_nodes_per_l3_cache();
	const u8 snc2_map[] = {1, 1, 0, 0, 1, 1, 0, 0};
	const u8 snc3_map[] = {1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2};
	const u8 *snc_map;
	size_t snc_map_len;

	switch (snc_nodes) {
	case 2:
		snc_map = snc2_map;
		snc_map_len = ARRAY_SIZE(snc2_map);
		break;
	case 3:
		snc_map = snc3_map;
		snc_map_len = ARRAY_SIZE(snc3_map);
		break;
	default:
		/* Error or no lookup support for SNC with >3 nodes. */
		return 0;
	}

	/* Compute SNC for PMU. */
	if (sscanf(pmu->name, "uncore_imc_%u", &imc_num) != 1) {
		pr_warning("Unexpected: unable to compute IMC number '%s'\n", pmu->name);
		return 0;
	}
	if (imc_num >= snc_map_len) {
		pr_warning("Unexpected IMC %d for SNC%d mapping\n", imc_num, snc_nodes);
		return 0;
	}
	return snc_map[imc_num];
}

static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
{
	static bool checked_cpu_adjust[MAX_SNCS];
	static int cpu_adjust[MAX_SNCS];
	struct perf_cpu_map *node_cpus;
	char node_path[] = "devices/system/node/node0/cpulist";

	/* Was adjust already computed? */
	if (checked_cpu_adjust[pmu_snc])
		return cpu_adjust[pmu_snc];

	/* SNC0 doesn't need an adjust. */
	if (pmu_snc == 0) {
		cpu_adjust[0] = 0;
		checked_cpu_adjust[0] = true;
		return 0;
	}

	/*
	 * Use the NUMA topology to compute the first CPU of the NUMA node; we
	 * want to adjust CPU 0 to be this, and similarly for other CPUs if
	 * there is >1 socket.
	 */
	assert(pmu_snc >= 0 && pmu_snc <= 9);
	node_path[24] += pmu_snc; // Shift node0 to be node<pmu_snc>.
	node_cpus = read_sysfs_cpu_map(node_path);
	cpu_adjust[pmu_snc] = perf_cpu_map__cpu(node_cpus, 0).cpu;
	if (cpu_adjust[pmu_snc] < 0) {
		pr_debug("Failed to read valid CPU list from <sysfs>/%s\n", node_path);
		cpu_adjust[pmu_snc] = 0;
	} else {
		checked_cpu_adjust[pmu_snc] = true;
	}
	perf_cpu_map__put(node_cpus);
	return cpu_adjust[pmu_snc];
}

static void gnr_uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
{
	// With sub-NUMA clustering (SNC) there is a NUMA node per SNC in the
	// topology. For example, a two socket graniterapids machine may be set
	// up with 3-way SNC, meaning there are 6 NUMA nodes that should be
	// displayed with --per-node. The cpumask of the CHA and IMC PMUs
	// reflects per-socket information, meaning, for example, that
	// uncore_cha_60 on a two socket graniterapids machine with 120 cores
	// per socket will have a cpumask of "0,120". This cpumask needs
	// adjusting to "40,160" to reflect that uncore_cha_60 is used for the
	// 2nd SNC of each socket. Without the adjustment, events on
	// uncore_cha_60 will appear in node 0 and node 3 (in our example 2
	// socket 3-way set up), but with the adjustment they will appear in
	// node 1 and node 4. The number of CHAs is typically larger than the
	// number of cores. The CHA numbers are assumed to split evenly and in
	// order with respect to the core numbers. There are fewer memory IMC
	// PMUs than cores and the mapping is handled using lookup tables.
	static struct perf_cpu_map *cha_adjusted[MAX_SNCS];
	static struct perf_cpu_map *imc_adjusted[MAX_SNCS];
	struct perf_cpu_map **adjusted = cha ? cha_adjusted : imc_adjusted;
	int idx, pmu_snc, cpu_adjust;
	struct perf_cpu cpu;
	bool alloc;

	// The cpumask from the kernel holds the first CPU of each socket, e.g. 0,120.
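	// If the first CPU isn't 0 the cpumask doesn't match that expectation,
	// so leave the PMU's cpumask unmodified.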
	if (perf_cpu_map__cpu(pmu->cpus, 0).cpu != 0) {
		pr_debug("Ignoring cpumask adjust for %s as unexpected first CPU\n", pmu->name);
		return;
	}

	pmu_snc = cha ? uncore_cha_snc(pmu) : uncore_imc_snc(pmu);
	if (pmu_snc == 0) {
		// No adjustment necessary for the first SNC.
		return;
	}

	alloc = adjusted[pmu_snc] == NULL;
	if (alloc) {
		// Hold onto the perf_cpu_map globally to avoid recomputation.
		cpu_adjust = uncore_cha_imc_compute_cpu_adjust(pmu_snc);
		adjusted[pmu_snc] = perf_cpu_map__empty_new(perf_cpu_map__nr(pmu->cpus));
		if (!adjusted[pmu_snc])
			return;
	}

	perf_cpu_map__for_each_cpu(cpu, idx, pmu->cpus) {
		// Compute the new cpu map values or, if not allocating, assert
		// that they match expectations. The asserts are compiled out in
		// NDEBUG builds to avoid overhead.
		if (alloc) {
			RC_CHK_ACCESS(adjusted[pmu_snc])->map[idx].cpu = cpu.cpu + cpu_adjust;
		} else if (idx == 0) {
			cpu_adjust = perf_cpu_map__cpu(adjusted[pmu_snc], idx).cpu - cpu.cpu;
			assert(uncore_cha_imc_compute_cpu_adjust(pmu_snc) == cpu_adjust);
		} else {
			assert(perf_cpu_map__cpu(adjusted[pmu_snc], idx).cpu ==
			       cpu.cpu + cpu_adjust);
		}
	}

	perf_cpu_map__put(pmu->cpus);
	pmu->cpus = perf_cpu_map__get(adjusted[pmu_snc]);
}

void perf_pmu__arch_init(struct perf_pmu *pmu)
{
	struct perf_pmu_caps *ldlat_cap;

#ifdef HAVE_AUXTRACE_SUPPORT
	if (!strcmp(pmu->name, INTEL_PT_PMU_NAME)) {
		pmu->auxtrace = true;
		pmu->selectable = true;
		pmu->perf_event_attr_init_default = intel_pt_pmu_default_config;
	}
	if (!strcmp(pmu->name, INTEL_BTS_PMU_NAME)) {
		pmu->auxtrace = true;
		pmu->selectable = true;
	}
#endif

	if (x86__is_amd_cpu()) {
		if (strcmp(pmu->name, "ibs_op"))
			return;

		pmu->mem_events = perf_mem_events_amd;

		if (!perf_pmu__caps_parse(pmu))
			return;

		ldlat_cap = perf_pmu__get_cap(pmu, "ldlat");
		if (!ldlat_cap || strcmp(ldlat_cap->value, "1"))
			return;

		perf_mem_events__loads_ldlat = 0;
		pmu->mem_events = perf_mem_events_amd_ldlat;
	} else {
		if (pmu->is_core) {
			if (perf_pmu__have_event(pmu, "mem-loads-aux"))
				pmu->mem_events = perf_mem_events_intel_aux;
			else
				pmu->mem_events = perf_mem_events_intel;
		} else if (x86__is_intel_graniterapids()) {
			if (starts_with(pmu->name, "uncore_cha_"))
				gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
			else if (starts_with(pmu->name, "uncore_imc_"))
				gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
		}
	}
}