xref: /linux/tools/power/x86/turbostat/turbostat.c (revision c652dc44192d96820d73a7ecd89d275ca7e4355d)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * turbostat -- show CPU frequency and C-state residency
4  * on modern Intel and AMD processors.
5  *
6  * Copyright (c) 2025 Intel Corporation.
7  * Len Brown <len.brown@intel.com>
8  */
9 
10 #define _GNU_SOURCE
11 #include MSRHEADER
12 
// copied from arch/x86/include/asm/cpu_device_id.h
/*
 * Field layout of a packed vendor/family/model ("VFM") value:
 * the model field starts at bit 0, the family field at bit 8 and the
 * vendor field at bit 16.  VFM_RSVD_BIT (24) is the first bit past
 * the vendor field.
 */
#define VFM_MODEL_BIT	0
#define VFM_FAMILY_BIT	8
#define VFM_VENDOR_BIT	16
#define VFM_RSVD_BIT	24

/* Each mask runs from its field's first bit up to the bit below the next field. */
#define	VFM_MODEL_MASK	GENMASK(VFM_FAMILY_BIT - 1, VFM_MODEL_BIT)
#define	VFM_FAMILY_MASK	GENMASK(VFM_VENDOR_BIT - 1, VFM_FAMILY_BIT)
#define	VFM_VENDOR_MASK	GENMASK(VFM_RSVD_BIT - 1, VFM_VENDOR_BIT)

/* Extract one field from a packed value: mask it, then shift it down to bit 0. */
#define VFM_MODEL(vfm)	(((vfm) & VFM_MODEL_MASK) >> VFM_MODEL_BIT)
#define VFM_FAMILY(vfm)	(((vfm) & VFM_FAMILY_MASK) >> VFM_FAMILY_BIT)
#define VFM_VENDOR(vfm)	(((vfm) & VFM_VENDOR_MASK) >> VFM_VENDOR_BIT)

/*
 * Build a packed value from its three fields.  The arguments are shifted
 * into place without masking, so each must already fit its field.
 */
#define	VFM_MAKE(_vendor, _family, _model) (	\
	((_model) << VFM_MODEL_BIT) |		\
	((_family) << VFM_FAMILY_BIT) |		\
	((_vendor) << VFM_VENDOR_BIT)		\
)
// end copied section
33 
/*
 * CPUID leaf number and a bit shift used with it.
 * NOTE(review): per the names, the shift locates the core-type field of
 * that leaf's output -- confirm against the users of these macros.
 */
#define CPUID_LEAF_MODEL_ID			0x1A
#define CPUID_LEAF_MODEL_ID_CORE_TYPE_SHIFT	24

/* Vendor number for Intel; presumably the _vendor argument of VFM_MAKE() -- confirm */
#define X86_VENDOR_INTEL	0
38 
39 #include INTEL_FAMILY_HEADER
40 #include BUILD_BUG_HEADER
41 #include <stdarg.h>
42 #include <stdio.h>
43 #include <err.h>
44 #include <unistd.h>
45 #include <sys/types.h>
46 #include <sys/wait.h>
47 #include <sys/stat.h>
48 #include <sys/select.h>
49 #include <sys/resource.h>
50 #include <sys/mman.h>
51 #include <fcntl.h>
52 #include <signal.h>
53 #include <sys/time.h>
54 #include <stdlib.h>
55 #include <getopt.h>
56 #include <dirent.h>
57 #include <string.h>
58 #include <ctype.h>
59 #include <sched.h>
60 #include <time.h>
61 #include <cpuid.h>
62 #include <sys/capability.h>
63 #include <errno.h>
64 #include <math.h>
65 #include <linux/perf_event.h>
66 #include <asm/unistd.h>
67 #include <stdbool.h>
68 #include <assert.h>
69 #include <linux/kernel.h>
70 #include <limits.h>
71 
/* Evaluate x and discard the result, e.g. to mark a deliberately unused variable. */
#define UNUSED(x) (void)(x)
73 
/*
 * The built-in counter list, bic[] (defined below), matches the column headers, except
 * 1. it holds built-in counters only; the sysfs counters are not here -- we learn of those at run-time
 * 2. Core and CPU are moved toward the end, because strings that contain them
 *    must not match on them for --show and --hide.
 */
80 
81 /*
82  * buffer size used by sscanf() for added column names
83  * Usually truncated to 7 characters, but also handles 18 columns for raw 64-bit counters
84  */
#define	NAME_BYTES 20
#define PATH_BYTES 128
#define PERF_NAME_BYTES 128

#define MAX_NOFILE 0x8000

/* Prefix string for perf-kind counters, and its length (computed by strlen() at each use) */
#define COUNTER_KIND_PERF_PREFIX "perf/"
#define COUNTER_KIND_PERF_PREFIX_LEN strlen(COUNTER_KIND_PERF_PREFIX)
#define PERF_DEV_NAME_BYTES 32
#define PERF_EVT_NAME_BYTES 32

#define INTEL_ECORE_TYPE	0x20
#define INTEL_PCORE_TYPE	0x40

/* Round n up to the next multiple of 0x1000 (4096) */
#define ROUND_UP_TO_PAGE_SIZE(n) (((n) + 0x1000UL-1UL) & ~(0x1000UL-1UL))

enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE };
enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC, COUNTER_K2M };
enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE };
enum counter_source { COUNTER_SOURCE_NONE, COUNTER_SOURCE_PERF, COUNTER_SOURCE_MSR };

/* One added perf counter; nodes are chained through "next". */
struct perf_counter_info {
	struct perf_counter_info *next;

	/* How to open the counter / What counter it is. */
	char device[PERF_DEV_NAME_BYTES];
	char event[PERF_EVT_NAME_BYTES];

	/* How to show/format the counter. */
	char name[PERF_NAME_BYTES];
	unsigned int width;
	enum counter_scope scope;
	enum counter_type type;
	enum counter_format format;
	double scale;

	/* For reading the counter. */
	int *fd_perf_per_domain;
	size_t num_domains;
};

/* One sysfs path with an associated id; nodes are chained through "next". */
struct sysfs_path {
	char path[PATH_BYTES];
	int id;
	struct sysfs_path *next;
};

/*
 * One counter column.  Also the element type of the built-in counter
 * table bic[], which initializes the members positionally, so the
 * member order here must not change.
 */
struct msr_counter {
	unsigned int msr_num;
	char name[NAME_BYTES];
	struct sysfs_path *sp;
	unsigned int width;
	enum counter_type type;
	enum counter_format format;
	struct msr_counter *next;
	unsigned int flags;
	/*
	 * Bits for "flags".  All three share one word, so each needs its
	 * own bit: SYSFS_PERCPU must not alias FLAGS_SHOW.
	 */
#define	FLAGS_HIDE	(1 << 0)
#define	FLAGS_SHOW	(1 << 1)
#define	SYSFS_PERCPU	(1 << 2)
};
145 
146 struct msr_counter bic[] = {
147 	{ 0x0, "usec", NULL, 0, 0, 0, NULL, 0 },
148 	{ 0x0, "Time_Of_Day_Seconds", NULL, 0, 0, 0, NULL, 0 },
149 	{ 0x0, "Package", NULL, 0, 0, 0, NULL, 0 },
150 	{ 0x0, "Node", NULL, 0, 0, 0, NULL, 0 },
151 	{ 0x0, "Avg_MHz", NULL, 0, 0, 0, NULL, 0 },
152 	{ 0x0, "Busy%", NULL, 0, 0, 0, NULL, 0 },
153 	{ 0x0, "Bzy_MHz", NULL, 0, 0, 0, NULL, 0 },
154 	{ 0x0, "TSC_MHz", NULL, 0, 0, 0, NULL, 0 },
155 	{ 0x0, "IRQ", NULL, 0, 0, 0, NULL, 0 },
156 	{ 0x0, "SMI", NULL, 32, 0, FORMAT_DELTA, NULL, 0 },
157 	{ 0x0, "cpuidle", NULL, 0, 0, 0, NULL, 0 },
158 	{ 0x0, "CPU%c1", NULL, 0, 0, 0, NULL, 0 },
159 	{ 0x0, "CPU%c3", NULL, 0, 0, 0, NULL, 0 },
160 	{ 0x0, "CPU%c6", NULL, 0, 0, 0, NULL, 0 },
161 	{ 0x0, "CPU%c7", NULL, 0, 0, 0, NULL, 0 },
162 	{ 0x0, "ThreadC", NULL, 0, 0, 0, NULL, 0 },
163 	{ 0x0, "CoreTmp", NULL, 0, 0, 0, NULL, 0 },
164 	{ 0x0, "CoreCnt", NULL, 0, 0, 0, NULL, 0 },
165 	{ 0x0, "PkgTmp", NULL, 0, 0, 0, NULL, 0 },
166 	{ 0x0, "GFX%rc6", NULL, 0, 0, 0, NULL, 0 },
167 	{ 0x0, "GFXMHz", NULL, 0, 0, 0, NULL, 0 },
168 	{ 0x0, "Pkg%pc2", NULL, 0, 0, 0, NULL, 0 },
169 	{ 0x0, "Pkg%pc3", NULL, 0, 0, 0, NULL, 0 },
170 	{ 0x0, "Pkg%pc6", NULL, 0, 0, 0, NULL, 0 },
171 	{ 0x0, "Pkg%pc7", NULL, 0, 0, 0, NULL, 0 },
172 	{ 0x0, "Pkg%pc8", NULL, 0, 0, 0, NULL, 0 },
173 	{ 0x0, "Pkg%pc9", NULL, 0, 0, 0, NULL, 0 },
174 	{ 0x0, "Pk%pc10", NULL, 0, 0, 0, NULL, 0 },
175 	{ 0x0, "CPU%LPI", NULL, 0, 0, 0, NULL, 0 },
176 	{ 0x0, "SYS%LPI", NULL, 0, 0, 0, NULL, 0 },
177 	{ 0x0, "PkgWatt", NULL, 0, 0, 0, NULL, 0 },
178 	{ 0x0, "CorWatt", NULL, 0, 0, 0, NULL, 0 },
179 	{ 0x0, "GFXWatt", NULL, 0, 0, 0, NULL, 0 },
180 	{ 0x0, "PkgCnt", NULL, 0, 0, 0, NULL, 0 },
181 	{ 0x0, "RAMWatt", NULL, 0, 0, 0, NULL, 0 },
182 	{ 0x0, "PKG_%", NULL, 0, 0, 0, NULL, 0 },
183 	{ 0x0, "RAM_%", NULL, 0, 0, 0, NULL, 0 },
184 	{ 0x0, "Pkg_J", NULL, 0, 0, 0, NULL, 0 },
185 	{ 0x0, "Cor_J", NULL, 0, 0, 0, NULL, 0 },
186 	{ 0x0, "GFX_J", NULL, 0, 0, 0, NULL, 0 },
187 	{ 0x0, "RAM_J", NULL, 0, 0, 0, NULL, 0 },
188 	{ 0x0, "Mod%c6", NULL, 0, 0, 0, NULL, 0 },
189 	{ 0x0, "Totl%C0", NULL, 0, 0, 0, NULL, 0 },
190 	{ 0x0, "Any%C0", NULL, 0, 0, 0, NULL, 0 },
191 	{ 0x0, "GFX%C0", NULL, 0, 0, 0, NULL, 0 },
192 	{ 0x0, "CPUGFX%", NULL, 0, 0, 0, NULL, 0 },
193 	{ 0x0, "Core", NULL, 0, 0, 0, NULL, 0 },
194 	{ 0x0, "CPU", NULL, 0, 0, 0, NULL, 0 },
195 	{ 0x0, "APIC", NULL, 0, 0, 0, NULL, 0 },
196 	{ 0x0, "X2APIC", NULL, 0, 0, 0, NULL, 0 },
197 	{ 0x0, "Die", NULL, 0, 0, 0, NULL, 0 },
198 	{ 0x0, "L3", NULL, 0, 0, 0, NULL, 0 },
199 	{ 0x0, "GFXAMHz", NULL, 0, 0, 0, NULL, 0 },
200 	{ 0x0, "IPC", NULL, 0, 0, 0, NULL, 0 },
201 	{ 0x0, "CoreThr", NULL, 0, 0, 0, NULL, 0 },
202 	{ 0x0, "UncMHz", NULL, 0, 0, 0, NULL, 0 },
203 	{ 0x0, "SAM%mc6", NULL, 0, 0, 0, NULL, 0 },
204 	{ 0x0, "SAMMHz", NULL, 0, 0, 0, NULL, 0 },
205 	{ 0x0, "SAMAMHz", NULL, 0, 0, 0, NULL, 0 },
206 	{ 0x0, "Die%c6", NULL, 0, 0, 0, NULL, 0 },
207 	{ 0x0, "SysWatt", NULL, 0, 0, 0, NULL, 0 },
208 	{ 0x0, "Sys_J", NULL, 0, 0, 0, NULL, 0 },
209 	{ 0x0, "NMI", NULL, 0, 0, 0, NULL, 0 },
210 	{ 0x0, "CPU%c1e", NULL, 0, 0, 0, NULL, 0 },
211 	{ 0x0, "pct_idle", NULL, 0, 0, 0, NULL, 0 },
212 };
213 
/*
 * Index of each built-in counter.
 * n.b. the enumerators must stay in the same order as the entries of bic[], above.
 * MAX_BIC is the number of built-in counters, not a counter itself.
 */
enum bic_names {
	/* 0 .. 9 */
	BIC_USEC = 0, BIC_TOD, BIC_Package, BIC_Node, BIC_Avg_MHz,
	BIC_Busy, BIC_Bzy_MHz, BIC_TSC_MHz, BIC_IRQ, BIC_SMI,
	/* 10 .. 19 */
	BIC_cpuidle, BIC_CPU_c1, BIC_CPU_c3, BIC_CPU_c6, BIC_CPU_c7,
	BIC_ThreadC, BIC_CoreTmp, BIC_CoreCnt, BIC_PkgTmp, BIC_GFX_rc6,
	/* 20 .. 29 */
	BIC_GFXMHz, BIC_Pkgpc2, BIC_Pkgpc3, BIC_Pkgpc6, BIC_Pkgpc7,
	BIC_Pkgpc8, BIC_Pkgpc9, BIC_Pkgpc10, BIC_CPU_LPI, BIC_SYS_LPI,
	/* 30 .. 39 */
	BIC_PkgWatt, BIC_CorWatt, BIC_GFXWatt, BIC_PkgCnt, BIC_RAMWatt,
	BIC_PKG__, BIC_RAM__, BIC_Pkg_J, BIC_Cor_J, BIC_GFX_J,
	/* 40 .. 49 */
	BIC_RAM_J, BIC_Mod_c6, BIC_Totl_c0, BIC_Any_c0, BIC_GFX_c0,
	BIC_CPUGFX, BIC_Core, BIC_CPU, BIC_APIC, BIC_X2APIC,
	/* 50 .. 59 */
	BIC_Die, BIC_L3, BIC_GFXACTMHz, BIC_IPC, BIC_CORE_THROT_CNT,
	BIC_UNCORE_MHZ, BIC_SAM_mc6, BIC_SAMMHz, BIC_SAMACTMHz, BIC_Diec6,
	/* 60 .. 64 */
	BIC_SysWatt, BIC_Sys_J, BIC_NMI, BIC_CPU_c1e, BIC_pct_idle,
	MAX_BIC
};
283 
284 void print_bic_set(char *s, cpu_set_t *set)
285 {
286 	int i;
287 
288 	assert(MAX_BIC < CPU_SETSIZE);
289 
290 	printf("%s:", s);
291 
292 	for (i = 0; i <= MAX_BIC; ++i) {
293 
294 		if (CPU_ISSET(i, set)) {
295 			assert(i < MAX_BIC);
296 			printf(" %s", bic[i].name);
297 		}
298 	}
299 	putchar('\n');
300 }
301 
/* Bit sets of built-in counters, one bit per enum bic_names value. */
static cpu_set_t bic_group_topology;
static cpu_set_t bic_group_thermal_pwr;
static cpu_set_t bic_group_frequency;
static cpu_set_t bic_group_hw_idle;
static cpu_set_t bic_group_sw_idle;
static cpu_set_t bic_group_idle;
static cpu_set_t bic_group_other;
static cpu_set_t bic_group_disabled_by_default;
static cpu_set_t bic_enabled;
static cpu_set_t bic_present;

/* modify: empty a set, or set/clear one counter's bit in it */
#define BIC_INIT(set) CPU_ZERO(set)

#define SET_BIC(COUNTER_NUMBER, set) CPU_SET(COUNTER_NUMBER, set)
#define CLR_BIC(COUNTER_NUMBER, set) CPU_CLR(COUNTER_NUMBER, set)

/* modify: mark one counter present / not present in bic_present */
#define BIC_PRESENT(COUNTER_NUMBER) CPU_SET(COUNTER_NUMBER, &bic_present)
#define BIC_NOT_PRESENT(COUNTER_NUMBER) CLR_BIC(COUNTER_NUMBER, &bic_present)

/* test: enabled, present, or both */
#define BIC_IS_ENABLED(COUNTER_NUMBER) CPU_ISSET(COUNTER_NUMBER, &bic_enabled)
#define DO_BIC_READ(COUNTER_NUMBER) CPU_ISSET(COUNTER_NUMBER, &bic_present)
#define DO_BIC(COUNTER_NUMBER) (BIC_IS_ENABLED(COUNTER_NUMBER) && DO_BIC_READ(COUNTER_NUMBER))
326 
327 static void bic_set_all(cpu_set_t *set)
328 {
329 	int i;
330 
331 	assert(MAX_BIC < CPU_SETSIZE);
332 
333 	for (i = 0; i < MAX_BIC; ++i)
334 		SET_BIC(i, set);
335 }
336 
337 /*
338  * bic_clear_bits()
339  * clear all the bits from "clr" in "dst"
340  */
341 static void bic_clear_bits(cpu_set_t *dst, cpu_set_t *clr)
342 {
343 	int i;
344 
345 	assert(MAX_BIC < CPU_SETSIZE);
346 
347 	for (i = 0; i < MAX_BIC; ++i)
348 		if (CPU_ISSET(i, clr))
349 			CLR_BIC(i, dst);
350 }
351 
352 static void bic_groups_init(void)
353 {
354 	BIC_INIT(&bic_group_topology);
355 	SET_BIC(BIC_Package, &bic_group_topology);
356 	SET_BIC(BIC_Node, &bic_group_topology);
357 	SET_BIC(BIC_CoreCnt, &bic_group_topology);
358 	SET_BIC(BIC_PkgCnt, &bic_group_topology);
359 	SET_BIC(BIC_Core, &bic_group_topology);
360 	SET_BIC(BIC_CPU, &bic_group_topology);
361 	SET_BIC(BIC_Die, &bic_group_topology);
362 	SET_BIC(BIC_L3, &bic_group_topology);
363 
364 	BIC_INIT(&bic_group_thermal_pwr);
365 	SET_BIC(BIC_CoreTmp, &bic_group_thermal_pwr);
366 	SET_BIC(BIC_PkgTmp, &bic_group_thermal_pwr);
367 	SET_BIC(BIC_PkgWatt, &bic_group_thermal_pwr);
368 	SET_BIC(BIC_CorWatt, &bic_group_thermal_pwr);
369 	SET_BIC(BIC_GFXWatt, &bic_group_thermal_pwr);
370 	SET_BIC(BIC_RAMWatt, &bic_group_thermal_pwr);
371 	SET_BIC(BIC_PKG__, &bic_group_thermal_pwr);
372 	SET_BIC(BIC_RAM__, &bic_group_thermal_pwr);
373 	SET_BIC(BIC_SysWatt, &bic_group_thermal_pwr);
374 
375 	BIC_INIT(&bic_group_frequency);
376 	SET_BIC(BIC_Avg_MHz, &bic_group_frequency);
377 	SET_BIC(BIC_Busy, &bic_group_frequency);
378 	SET_BIC(BIC_Bzy_MHz, &bic_group_frequency);
379 	SET_BIC(BIC_TSC_MHz, &bic_group_frequency);
380 	SET_BIC(BIC_GFXMHz, &bic_group_frequency);
381 	SET_BIC(BIC_GFXACTMHz, &bic_group_frequency);
382 	SET_BIC(BIC_SAMMHz, &bic_group_frequency);
383 	SET_BIC(BIC_SAMACTMHz, &bic_group_frequency);
384 	SET_BIC(BIC_UNCORE_MHZ, &bic_group_frequency);
385 
386 	BIC_INIT(&bic_group_hw_idle);
387 	SET_BIC(BIC_Busy, &bic_group_hw_idle);
388 	SET_BIC(BIC_CPU_c1, &bic_group_hw_idle);
389 	SET_BIC(BIC_CPU_c3, &bic_group_hw_idle);
390 	SET_BIC(BIC_CPU_c6, &bic_group_hw_idle);
391 	SET_BIC(BIC_CPU_c7, &bic_group_hw_idle);
392 	SET_BIC(BIC_GFX_rc6, &bic_group_hw_idle);
393 	SET_BIC(BIC_Pkgpc2, &bic_group_hw_idle);
394 	SET_BIC(BIC_Pkgpc3, &bic_group_hw_idle);
395 	SET_BIC(BIC_Pkgpc6, &bic_group_hw_idle);
396 	SET_BIC(BIC_Pkgpc7, &bic_group_hw_idle);
397 	SET_BIC(BIC_Pkgpc8, &bic_group_hw_idle);
398 	SET_BIC(BIC_Pkgpc9, &bic_group_hw_idle);
399 	SET_BIC(BIC_Pkgpc10, &bic_group_hw_idle);
400 	SET_BIC(BIC_CPU_LPI, &bic_group_hw_idle);
401 	SET_BIC(BIC_SYS_LPI, &bic_group_hw_idle);
402 	SET_BIC(BIC_Mod_c6, &bic_group_hw_idle);
403 	SET_BIC(BIC_Totl_c0, &bic_group_hw_idle);
404 	SET_BIC(BIC_Any_c0, &bic_group_hw_idle);
405 	SET_BIC(BIC_GFX_c0, &bic_group_hw_idle);
406 	SET_BIC(BIC_CPUGFX, &bic_group_hw_idle);
407 	SET_BIC(BIC_SAM_mc6, &bic_group_hw_idle);
408 	SET_BIC(BIC_Diec6, &bic_group_hw_idle);
409 
410 	BIC_INIT(&bic_group_sw_idle);
411 	SET_BIC(BIC_Busy, &bic_group_sw_idle);
412 	SET_BIC(BIC_cpuidle, &bic_group_sw_idle);
413 	SET_BIC(BIC_pct_idle, &bic_group_sw_idle);
414 
415 	BIC_INIT(&bic_group_idle);
416 	CPU_OR(&bic_group_idle, &bic_group_idle, &bic_group_hw_idle);
417 	SET_BIC(BIC_pct_idle, &bic_group_idle);
418 
419 	BIC_INIT(&bic_group_other);
420 	SET_BIC(BIC_IRQ, &bic_group_other);
421 	SET_BIC(BIC_NMI, &bic_group_other);
422 	SET_BIC(BIC_SMI, &bic_group_other);
423 	SET_BIC(BIC_ThreadC, &bic_group_other);
424 	SET_BIC(BIC_CoreTmp, &bic_group_other);
425 	SET_BIC(BIC_IPC, &bic_group_other);
426 
427 	BIC_INIT(&bic_group_disabled_by_default);
428 	SET_BIC(BIC_USEC, &bic_group_disabled_by_default);
429 	SET_BIC(BIC_TOD, &bic_group_disabled_by_default);
430 	SET_BIC(BIC_cpuidle, &bic_group_disabled_by_default);
431 	SET_BIC(BIC_APIC, &bic_group_disabled_by_default);
432 	SET_BIC(BIC_X2APIC, &bic_group_disabled_by_default);
433 
434 	BIC_INIT(&bic_enabled);
435 	bic_set_all(&bic_enabled);
436 	bic_clear_bits(&bic_enabled, &bic_group_disabled_by_default);
437 
438 	BIC_INIT(&bic_present);
439 	SET_BIC(BIC_USEC, &bic_present);
440 	SET_BIC(BIC_TOD, &bic_present);
441 	SET_BIC(BIC_cpuidle, &bic_present);
442 	SET_BIC(BIC_APIC, &bic_present);
443 	SET_BIC(BIC_X2APIC, &bic_present);
444 	SET_BIC(BIC_pct_idle, &bic_present);
445 }
446 
/*
 * MSR_PKG_CST_CONFIG_CONTROL decoding for pkg_cstate_limit:
 * If you change the values, note they are used both in comparisons
 * (>= PCL__7) and to index pkg_cstate_limit_strings[].
 *
 * The values are consecutive, 0 through 15, and deeper package C-states
 * get larger numbers, which is what makes ordered comparisons such as
 * ">= PCL__7" meaningful.
 */
#define PCLUKN 0		/* Unknown */
#define PCLRSV 1		/* Reserved */
#define PCL__0 2		/* PC0 */
#define PCL__1 3		/* PC1 */
#define PCL__2 4		/* PC2 */
#define PCL__3 5		/* PC3 */
#define PCL__4 6		/* PC4 */
#define PCL__6 7		/* PC6 */
#define PCL_6N 8		/* PC6 No Retention */
#define PCL_6R 9		/* PC6 Retention */
#define PCL__7 10		/* PC7 */
#define PCL_7S 11		/* PC7 Shrink */
#define PCL__8 12		/* PC8 */
#define PCL__9 13		/* PC9 */
#define PCL_10 14		/* PC10 */
#define PCLUNL 15		/* Unlimited */
468 
struct amperf_group_fd;

/*
 * File-scope state.  Everything without an explicit initializer
 * starts out as zero / NULL / false.
 */
char *proc_stat = "/proc/stat";
FILE *outf;
int *fd_percpu;
int *fd_instr_count_percpu;

/* the same default interval, 5 seconds, in both representations */
struct timeval interval_tv = { 5, 0 };
struct timespec interval_ts = { 5, 0 };

unsigned int num_iterations;
unsigned int header_iterations;
unsigned int debug;
unsigned int quiet;
unsigned int shown;
unsigned int sums_need_wide_columns;
unsigned int rapl_joules;
unsigned int summary_only;
unsigned int list_header_only;
unsigned int dump_only;
unsigned int force_load;
unsigned int has_aperf;
unsigned int has_aperf_access;
unsigned int has_epb;
unsigned int has_turbo;
unsigned int is_hybrid;
unsigned int units = 1000000;	/* MHz etc */
unsigned int genuine_intel;
unsigned int authentic_amd;
unsigned int hygon_genuine;
unsigned int max_level;
unsigned int max_extended_level;
unsigned int has_invariant_tsc;
unsigned int aperf_mperf_multiplier = 1;
double bclk;
double base_hz;
unsigned int has_base_hz;
double tsc_tweak = 1.0;
unsigned int show_pkg_only;
unsigned int show_core_only;
char *output_buffer;
char *outp;
unsigned int do_dts;
unsigned int do_ptm;
unsigned int do_ipc;
unsigned long long cpuidle_cur_cpu_lpi_us;
unsigned long long cpuidle_cur_sys_lpi_us;
unsigned int tj_max;
unsigned int tj_max_override;
double rapl_power_units;
double rapl_time_units;
double rapl_dram_energy_units;
double rapl_energy_units;
double rapl_psys_energy_units;
double rapl_joule_counter_range;
unsigned int crystal_hz;
unsigned long long tsc_hz;
int base_cpu;

/* IA32_PM_ENABLE, IA32_HWP_CAPABILITIES, IA32_HWP_REQUEST, IA32_HWP_STATUS */
unsigned int has_hwp;
/* IA32_HWP_INTERRUPT */
unsigned int has_hwp_notify;
/* IA32_HWP_REQUEST[bits 41:32] */
unsigned int has_hwp_activity_window;
/* IA32_HWP_REQUEST[bits 31:24] */
unsigned int has_hwp_epp;
/* IA32_HWP_REQUEST_PKG */
unsigned int has_hwp_pkg;

unsigned int first_counter_read = 1;

static struct timeval procsysfs_tv_begin;

int ignore_stdin;
bool no_msr;
bool no_perf;
534 
/* Slot numbers for gfx_info[]; GFX_MAX is the number of slots. */
enum gfx_sysfs_idx {
	GFX_rc6 = 0,
	GFX_MHz,
	GFX_ACTMHz,
	SAM_mc6,
	SAM_MHz,
	SAM_ACTMHz,
	GFX_MAX
};

/* Per-slot state: a stdio stream plus a value held in two widths. */
struct gfx_sysfs_info {
	FILE *fp;
	unsigned int val;
	unsigned long long val_ull;
};

/* One entry per enum gfx_sysfs_idx value */
static struct gfx_sysfs_info gfx_info[GFX_MAX];
552 
553 int get_msr(int cpu, off_t offset, unsigned long long *msr);
554 int add_counter(unsigned int msr_num, char *path, char *name,
555 		unsigned int width, enum counter_scope scope,
556 		enum counter_type type, enum counter_format format, int flags, int package_num);
557 
558 /* Model specific support Start */
559 
/*
 * Features that may diverge among different platforms.
 * The comment above each member names the MSRs or setting it stands for.
 */
struct platform_features {
	/* MSR_MISC_FEATURE_CONTROL */
	bool has_msr_misc_feature_control;
	/* MSR_MISC_PWR_MGMT */
	bool has_msr_misc_pwr_mgmt;
	/*
	 * MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT,
	 * MSR_PKG_CST_CONFIG_CONTROL, MSR_IA32_POWER_CTL, TRL MSRs
	 */
	bool has_nhm_msrs;
	/* MSR_CONFIG_TDP_NOMINAL/LEVEL_1/LEVEL_2/CONTROL, MSR_TURBO_ACTIVATION_RATIO */
	bool has_config_tdp;
	/* CPU base clock */
	int bclk_freq;
	/* Crystal clock to use when not available from CPUID.15 */
	int crystal_freq;
	/* Core cstates and Package cstates supported */
	int supported_cstates;
	/* MSR_PKG_CST_CONFIG_CONTROL */
	int cst_limit;
	/* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */
	bool has_cst_auto_convension;
	/* MSR_PKGC3/PKGC6/PKGC7/PKGC8/PKGC9/PKGC10_IRTL */
	bool has_irtl_msrs;
	/* MSR_CORE_C1_RES */
	bool has_msr_core_c1_res;
	/* MSR_MODULE_C6_RES_MS */
	bool has_msr_module_c6_res_ms;
	/* MSR_CC6_DEMOTION_POLICY_CONFIG/MSR_MC6_DEMOTION_POLICY_CONFIG */
	bool has_msr_c6_demotion_policy_config;
	/* MSR_ATOM_PKG_C6_RESIDENCY */
	bool has_msr_atom_pkg_c6_residency;
	/* MSR_KNL_CORE_C6_RESIDENCY */
	bool has_msr_knl_core_c6_residency;
	/*
	 * MSR_PKG_WEIGHTED_CORE_C0_RES/MSR_PKG_ANY_CORE_C0_RES/
	 * MSR_PKG_ANY_GFXE_C0_RES/MSR_PKG_BOTH_CORE_GFXE_C0_RES
	 */
	bool has_ext_cst_msrs;
	/* Cstate prewake bit in MSR_IA32_POWER_CTL */
	bool has_cst_prewake_bit;
	/* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */
	int trl_msrs;
	/* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */
	int plr_msrs;
	/* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */
	int rapl_msrs;
	/* Indicates cores energy collection is per-core, not per-package. AMD specific for now */
	bool has_per_core_rapl;
	/* Divisor for Energy unit raw value from MSR_RAPL_POWER_UNIT */
	bool has_rapl_divisor;
	/* Fixed Energy Unit used for DRAM RAPL Domain */
	bool has_fixed_rapl_unit;
	/* Fixed Energy Unit used for PSYS RAPL Domain */
	bool has_fixed_rapl_psys_unit;
	/* Hardcoded TDP value when cannot be retrieved from hardware */
	int rapl_quirk_tdp;
	/* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */
	int tcc_offset_bits;
	/* Use CPU Base freq instead of TSC freq for aperf/mperf counter */
	bool enable_tsc_tweak;
	/* mperf/aperf multiplier */
	bool need_perf_multiplier;
};
591 
/* Pairs one packed vendor/family/model number with a read-only feature set. */
struct platform_data {
	unsigned int vfm;
	const struct platform_features *features;
};
596 
/*
 * For BCLK: values of platform_features.bclk_freq.
 * Numbering starts at 1, so a zero bclk_freq matches none of these.
 */
enum bclk_freq {
	BCLK_100MHZ = 1,
	BCLK_133MHZ = 2,
	BCLK_SLV = 3,
};
603 
604 #define SLM_BCLK_FREQS 5
605 double slm_freq_table[SLM_BCLK_FREQS] = { 83.3, 100.0, 133.3, 116.7, 80.0 };
606 
607 double slm_bclk(void)
608 {
609 	unsigned long long msr = 3;
610 	unsigned int i;
611 	double freq;
612 
613 	if (get_msr(base_cpu, MSR_FSB_FREQ, &msr))
614 		fprintf(outf, "SLM BCLK: unknown\n");
615 
616 	i = msr & 0xf;
617 	if (i >= SLM_BCLK_FREQS) {
618 		fprintf(outf, "SLM BCLK[%d] invalid\n", i);
619 		i = 3;
620 	}
621 	freq = slm_freq_table[i];
622 
623 	if (!quiet)
624 		fprintf(outf, "SLM BCLK: %.1f Mhz\n", freq);
625 
626 	return freq;
627 }
628 
/*
 * For Package cstate limit: values of platform_features.cst_limit.
 * Numbering starts at 1, so a zero cst_limit matches none of these.
 */
enum package_cstate_limit {
	CST_LIMIT_NHM = 1,
	CST_LIMIT_SNB = 2,
	CST_LIMIT_HSW = 3,
	CST_LIMIT_SKX = 4,
	CST_LIMIT_ICX = 5,
	CST_LIMIT_SLV = 6,
	CST_LIMIT_AMT = 7,
	CST_LIMIT_KNL = 8,
	CST_LIMIT_GMT = 9,
};
641 
642 /* For Turbo Ratio Limit MSRs */
643 enum turbo_ratio_limit_msrs {
644 	TRL_BASE = BIT(0),
645 	TRL_LIMIT1 = BIT(1),
646 	TRL_LIMIT2 = BIT(2),
647 	TRL_ATOM = BIT(3),
648 	TRL_KNL = BIT(4),
649 	TRL_CORECOUNT = BIT(5),
650 };
651 
652 /* For Perf Limit Reason MSRs */
653 enum perf_limit_reason_msrs {
654 	PLR_CORE = BIT(0),
655 	PLR_GFX = BIT(1),
656 	PLR_RING = BIT(2),
657 };
658 
659 /* For RAPL MSRs */
660 enum rapl_msrs {
661 	RAPL_PKG_POWER_LIMIT = BIT(0),	/* 0x610 MSR_PKG_POWER_LIMIT */
662 	RAPL_PKG_ENERGY_STATUS = BIT(1),	/* 0x611 MSR_PKG_ENERGY_STATUS */
663 	RAPL_PKG_PERF_STATUS = BIT(2),	/* 0x613 MSR_PKG_PERF_STATUS */
664 	RAPL_PKG_POWER_INFO = BIT(3),	/* 0x614 MSR_PKG_POWER_INFO */
665 	RAPL_DRAM_POWER_LIMIT = BIT(4),	/* 0x618 MSR_DRAM_POWER_LIMIT */
666 	RAPL_DRAM_ENERGY_STATUS = BIT(5),	/* 0x619 MSR_DRAM_ENERGY_STATUS */
667 	RAPL_DRAM_PERF_STATUS = BIT(6),	/* 0x61b MSR_DRAM_PERF_STATUS */
668 	RAPL_DRAM_POWER_INFO = BIT(7),	/* 0x61c MSR_DRAM_POWER_INFO */
669 	RAPL_CORE_POWER_LIMIT = BIT(8),	/* 0x638 MSR_PP0_POWER_LIMIT */
670 	RAPL_CORE_ENERGY_STATUS = BIT(9),	/* 0x639 MSR_PP0_ENERGY_STATUS */
671 	RAPL_CORE_POLICY = BIT(10),	/* 0x63a MSR_PP0_POLICY */
672 	RAPL_GFX_POWER_LIMIT = BIT(11),	/* 0x640 MSR_PP1_POWER_LIMIT */
673 	RAPL_GFX_ENERGY_STATUS = BIT(12),	/* 0x641 MSR_PP1_ENERGY_STATUS */
674 	RAPL_GFX_POLICY = BIT(13),	/* 0x642 MSR_PP1_POLICY */
675 	RAPL_AMD_PWR_UNIT = BIT(14),	/* 0xc0010299 MSR_AMD_RAPL_POWER_UNIT */
676 	RAPL_AMD_CORE_ENERGY_STAT = BIT(15),	/* 0xc001029a MSR_AMD_CORE_ENERGY_STATUS */
677 	RAPL_AMD_PKG_ENERGY_STAT = BIT(16),	/* 0xc001029b MSR_AMD_PKG_ENERGY_STATUS */
678 	RAPL_PLATFORM_ENERGY_LIMIT = BIT(17),	/* 0x64c MSR_PLATFORM_ENERGY_LIMIT */
679 	RAPL_PLATFORM_ENERGY_STATUS = BIT(18),	/* 0x64d MSR_PLATFORM_ENERGY_STATUS */
680 };
681 
/* Per-domain basics: the power-limit and energy-status bits of each domain */
#define RAPL_PKG	(RAPL_PKG_POWER_LIMIT | RAPL_PKG_ENERGY_STATUS)
#define RAPL_DRAM	(RAPL_DRAM_POWER_LIMIT | RAPL_DRAM_ENERGY_STATUS)
#define RAPL_CORE	(RAPL_CORE_POWER_LIMIT | RAPL_CORE_ENERGY_STATUS)
#define RAPL_GFX	(RAPL_GFX_ENERGY_STATUS | RAPL_GFX_POWER_LIMIT)
#define RAPL_PSYS	(RAPL_PLATFORM_ENERGY_LIMIT | RAPL_PLATFORM_ENERGY_STATUS)

/* The basics plus the remaining bits of the same domain */
#define RAPL_PKG_ALL	(RAPL_PKG | RAPL_PKG_POWER_INFO | RAPL_PKG_PERF_STATUS)
#define RAPL_DRAM_ALL	(RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS)
#define RAPL_CORE_ALL	(RAPL_CORE_POLICY | RAPL_CORE)
#define RAPL_GFX_ALL	(RAPL_GFX_POLICY | RAPL_GFX)

/* The three RAPL_AMD_* bits together */
#define RAPL_AMD_F17H	(RAPL_AMD_PKG_ENERGY_STAT | RAPL_AMD_CORE_ENERGY_STAT | RAPL_AMD_PWR_UNIT)
694 
695 /* For Cstates */
696 enum cstates {
697 	CC1 = BIT(0),
698 	CC3 = BIT(1),
699 	CC6 = BIT(2),
700 	CC7 = BIT(3),
701 	PC2 = BIT(4),
702 	PC3 = BIT(5),
703 	PC6 = BIT(6),
704 	PC7 = BIT(7),
705 	PC8 = BIT(8),
706 	PC9 = BIT(9),
707 	PC10 = BIT(10),
708 };
709 
710 static const struct platform_features nhm_features = {
711 	.has_msr_misc_pwr_mgmt = 1,
712 	.has_nhm_msrs = 1,
713 	.bclk_freq = BCLK_133MHZ,
714 	.supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6,
715 	.cst_limit = CST_LIMIT_NHM,
716 	.trl_msrs = TRL_BASE,
717 };
718 
719 static const struct platform_features nhx_features = {
720 	.has_msr_misc_pwr_mgmt = 1,
721 	.has_nhm_msrs = 1,
722 	.bclk_freq = BCLK_133MHZ,
723 	.supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6,
724 	.cst_limit = CST_LIMIT_NHM,
725 };
726 
727 static const struct platform_features snb_features = {
728 	.has_msr_misc_feature_control = 1,
729 	.has_msr_misc_pwr_mgmt = 1,
730 	.has_nhm_msrs = 1,
731 	.bclk_freq = BCLK_100MHZ,
732 	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
733 	.cst_limit = CST_LIMIT_SNB,
734 	.has_irtl_msrs = 1,
735 	.trl_msrs = TRL_BASE,
736 	.rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
737 };
738 
739 static const struct platform_features snx_features = {
740 	.has_msr_misc_feature_control = 1,
741 	.has_msr_misc_pwr_mgmt = 1,
742 	.has_nhm_msrs = 1,
743 	.bclk_freq = BCLK_100MHZ,
744 	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
745 	.cst_limit = CST_LIMIT_SNB,
746 	.has_irtl_msrs = 1,
747 	.trl_msrs = TRL_BASE,
748 	.rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL,
749 };
750 
751 static const struct platform_features ivb_features = {
752 	.has_msr_misc_feature_control = 1,
753 	.has_msr_misc_pwr_mgmt = 1,
754 	.has_nhm_msrs = 1,
755 	.has_config_tdp = 1,
756 	.bclk_freq = BCLK_100MHZ,
757 	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
758 	.cst_limit = CST_LIMIT_SNB,
759 	.has_irtl_msrs = 1,
760 	.trl_msrs = TRL_BASE,
761 	.rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
762 };
763 
764 static const struct platform_features ivx_features = {
765 	.has_msr_misc_feature_control = 1,
766 	.has_msr_misc_pwr_mgmt = 1,
767 	.has_nhm_msrs = 1,
768 	.bclk_freq = BCLK_100MHZ,
769 	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
770 	.cst_limit = CST_LIMIT_SNB,
771 	.has_irtl_msrs = 1,
772 	.trl_msrs = TRL_BASE | TRL_LIMIT1,
773 	.rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL,
774 };
775 
776 static const struct platform_features hsw_features = {
777 	.has_msr_misc_feature_control = 1,
778 	.has_msr_misc_pwr_mgmt = 1,
779 	.has_nhm_msrs = 1,
780 	.has_config_tdp = 1,
781 	.bclk_freq = BCLK_100MHZ,
782 	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
783 	.cst_limit = CST_LIMIT_HSW,
784 	.has_irtl_msrs = 1,
785 	.trl_msrs = TRL_BASE,
786 	.plr_msrs = PLR_CORE | PLR_GFX | PLR_RING,
787 	.rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
788 };
789 
790 static const struct platform_features hsx_features = {
791 	.has_msr_misc_feature_control = 1,
792 	.has_msr_misc_pwr_mgmt = 1,
793 	.has_nhm_msrs = 1,
794 	.has_config_tdp = 1,
795 	.bclk_freq = BCLK_100MHZ,
796 	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
797 	.cst_limit = CST_LIMIT_HSW,
798 	.has_irtl_msrs = 1,
799 	.trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2,
800 	.plr_msrs = PLR_CORE | PLR_RING,
801 	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
802 	.has_fixed_rapl_unit = 1,
803 };
804 
805 static const struct platform_features hswl_features = {
806 	.has_msr_misc_feature_control = 1,
807 	.has_msr_misc_pwr_mgmt = 1,
808 	.has_nhm_msrs = 1,
809 	.has_config_tdp = 1,
810 	.bclk_freq = BCLK_100MHZ,
811 	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
812 	.cst_limit = CST_LIMIT_HSW,
813 	.has_irtl_msrs = 1,
814 	.trl_msrs = TRL_BASE,
815 	.plr_msrs = PLR_CORE | PLR_GFX | PLR_RING,
816 	.rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
817 };
818 
819 static const struct platform_features hswg_features = {
820 	.has_msr_misc_feature_control = 1,
821 	.has_msr_misc_pwr_mgmt = 1,
822 	.has_nhm_msrs = 1,
823 	.has_config_tdp = 1,
824 	.bclk_freq = BCLK_100MHZ,
825 	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
826 	.cst_limit = CST_LIMIT_HSW,
827 	.has_irtl_msrs = 1,
828 	.trl_msrs = TRL_BASE,
829 	.plr_msrs = PLR_CORE | PLR_GFX | PLR_RING,
830 	.rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
831 };
832 
833 static const struct platform_features bdw_features = {
834 	.has_msr_misc_feature_control = 1,
835 	.has_msr_misc_pwr_mgmt = 1,
836 	.has_nhm_msrs = 1,
837 	.has_config_tdp = 1,
838 	.bclk_freq = BCLK_100MHZ,
839 	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
840 	.cst_limit = CST_LIMIT_HSW,
841 	.has_irtl_msrs = 1,
842 	.trl_msrs = TRL_BASE,
843 	.rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
844 };
845 
846 static const struct platform_features bdwg_features = {
847 	.has_msr_misc_feature_control = 1,
848 	.has_msr_misc_pwr_mgmt = 1,
849 	.has_nhm_msrs = 1,
850 	.has_config_tdp = 1,
851 	.bclk_freq = BCLK_100MHZ,
852 	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
853 	.cst_limit = CST_LIMIT_HSW,
854 	.has_irtl_msrs = 1,
855 	.trl_msrs = TRL_BASE,
856 	.rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
857 };
858 
859 static const struct platform_features bdx_features = {
860 	.has_msr_misc_feature_control = 1,
861 	.has_msr_misc_pwr_mgmt = 1,
862 	.has_nhm_msrs = 1,
863 	.has_config_tdp = 1,
864 	.bclk_freq = BCLK_100MHZ,
865 	.supported_cstates = CC1 | CC3 | CC6 | PC2 | PC3 | PC6,
866 	.cst_limit = CST_LIMIT_HSW,
867 	.has_irtl_msrs = 1,
868 	.has_cst_auto_convension = 1,
869 	.trl_msrs = TRL_BASE,
870 	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
871 	.has_fixed_rapl_unit = 1,
872 };
873 
874 static const struct platform_features skl_features = {
875 	.has_msr_misc_feature_control = 1,
876 	.has_msr_misc_pwr_mgmt = 1,
877 	.has_nhm_msrs = 1,
878 	.has_config_tdp = 1,
879 	.bclk_freq = BCLK_100MHZ,
880 	.crystal_freq = 24000000,
881 	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
882 	.cst_limit = CST_LIMIT_HSW,
883 	.has_irtl_msrs = 1,
884 	.has_ext_cst_msrs = 1,
885 	.trl_msrs = TRL_BASE,
886 	.tcc_offset_bits = 6,
887 	.rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX | RAPL_PSYS,
888 	.enable_tsc_tweak = 1,
889 };
890 
891 static const struct platform_features cnl_features = {
892 	.has_msr_misc_feature_control = 1,
893 	.has_msr_misc_pwr_mgmt = 1,
894 	.has_nhm_msrs = 1,
895 	.has_config_tdp = 1,
896 	.bclk_freq = BCLK_100MHZ,
897 	.supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
898 	.cst_limit = CST_LIMIT_HSW,
899 	.has_irtl_msrs = 1,
900 	.has_msr_core_c1_res = 1,
901 	.has_ext_cst_msrs = 1,
902 	.trl_msrs = TRL_BASE,
903 	.tcc_offset_bits = 6,
904 	.rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX | RAPL_PSYS,
905 	.enable_tsc_tweak = 1,
906 };
907 
908 /* Copied from cnl_features, with PC7/PC9 removed */
909 static const struct platform_features adl_features = {
910 	.has_msr_misc_feature_control	= cnl_features.has_msr_misc_feature_control,
911 	.has_msr_misc_pwr_mgmt		= cnl_features.has_msr_misc_pwr_mgmt,
912 	.has_nhm_msrs			= cnl_features.has_nhm_msrs,
913 	.has_config_tdp			= cnl_features.has_config_tdp,
914 	.bclk_freq			= cnl_features.bclk_freq,
915 	.supported_cstates		= CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC8 | PC10,
916 	.cst_limit			= cnl_features.cst_limit,
917 	.has_irtl_msrs			= cnl_features.has_irtl_msrs,
918 	.has_msr_core_c1_res		= cnl_features.has_msr_core_c1_res,
919 	.has_ext_cst_msrs		= cnl_features.has_ext_cst_msrs,
920 	.trl_msrs			= cnl_features.trl_msrs,
921 	.tcc_offset_bits		= cnl_features.tcc_offset_bits,
922 	.rapl_msrs			= cnl_features.rapl_msrs,
923 	.enable_tsc_tweak		= cnl_features.enable_tsc_tweak,
924 };
925 
926 /* Copied from adl_features, with PC3/PC8 removed */
927 static const struct platform_features lnl_features = {
928 	.has_msr_misc_feature_control	= adl_features.has_msr_misc_feature_control,
929 	.has_msr_misc_pwr_mgmt		= adl_features.has_msr_misc_pwr_mgmt,
930 	.has_nhm_msrs			= adl_features.has_nhm_msrs,
931 	.has_config_tdp			= adl_features.has_config_tdp,
932 	.bclk_freq			= adl_features.bclk_freq,
933 	.supported_cstates		= CC1 | CC6 | CC7 | PC2 | PC6 | PC10,
934 	.cst_limit			= adl_features.cst_limit,
935 	.has_irtl_msrs			= adl_features.has_irtl_msrs,
936 	.has_msr_core_c1_res		= adl_features.has_msr_core_c1_res,
937 	.has_ext_cst_msrs		= adl_features.has_ext_cst_msrs,
938 	.trl_msrs			= adl_features.trl_msrs,
939 	.tcc_offset_bits		= adl_features.tcc_offset_bits,
940 	.rapl_msrs			= adl_features.rapl_msrs,
941 	.enable_tsc_tweak		= adl_features.enable_tsc_tweak,
942 };
943 
944 static const struct platform_features skx_features = {
945 	.has_msr_misc_feature_control = 1,
946 	.has_msr_misc_pwr_mgmt = 1,
947 	.has_nhm_msrs = 1,
948 	.has_config_tdp = 1,
949 	.bclk_freq = BCLK_100MHZ,
950 	.supported_cstates = CC1 | CC6 | PC2 | PC6,
951 	.cst_limit = CST_LIMIT_SKX,
952 	.has_irtl_msrs = 1,
953 	.has_cst_auto_convension = 1,
954 	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
955 	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
956 	.has_fixed_rapl_unit = 1,
957 };
958 
959 static const struct platform_features icx_features = {
960 	.has_msr_misc_feature_control = 1,
961 	.has_msr_misc_pwr_mgmt = 1,
962 	.has_nhm_msrs = 1,
963 	.has_config_tdp = 1,
964 	.bclk_freq = BCLK_100MHZ,
965 	.supported_cstates = CC1 | CC6 | PC2 | PC6,
966 	.cst_limit = CST_LIMIT_ICX,
967 	.has_msr_core_c1_res = 1,
968 	.has_irtl_msrs = 1,
969 	.has_cst_prewake_bit = 1,
970 	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
971 	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_PSYS,
972 	.has_fixed_rapl_unit = 1,
973 };
974 
975 static const struct platform_features spr_features = {
976 	.has_msr_misc_feature_control = 1,
977 	.has_msr_misc_pwr_mgmt = 1,
978 	.has_nhm_msrs = 1,
979 	.has_config_tdp = 1,
980 	.bclk_freq = BCLK_100MHZ,
981 	.supported_cstates = CC1 | CC6 | PC2 | PC6,
982 	.cst_limit = CST_LIMIT_SKX,
983 	.has_msr_core_c1_res = 1,
984 	.has_irtl_msrs = 1,
985 	.has_cst_prewake_bit = 1,
986 	.has_fixed_rapl_psys_unit = 1,
987 	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
988 	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_PSYS,
989 };
990 
991 static const struct platform_features dmr_features = {
992 	.has_msr_misc_feature_control	= spr_features.has_msr_misc_feature_control,
993 	.has_msr_misc_pwr_mgmt		= spr_features.has_msr_misc_pwr_mgmt,
994 	.has_nhm_msrs			= spr_features.has_nhm_msrs,
995 	.bclk_freq			= spr_features.bclk_freq,
996 	.supported_cstates		= spr_features.supported_cstates,
997 	.cst_limit			= spr_features.cst_limit,
998 	.has_msr_core_c1_res		= spr_features.has_msr_core_c1_res,
999 	.has_cst_prewake_bit		= spr_features.has_cst_prewake_bit,
1000 	.has_fixed_rapl_psys_unit	= spr_features.has_fixed_rapl_psys_unit,
1001 	.trl_msrs			= spr_features.trl_msrs,
1002 	.has_msr_module_c6_res_ms	= 1,	/* DMR has Dual-Core-Module and MC6 MSR */
1003 	.rapl_msrs			= 0,	/* DMR does not have RAPL MSRs */
1004 	.plr_msrs			= 0,	/* DMR does not have PLR  MSRs */
1005 	.has_irtl_msrs			= 0,	/* DMR does not have IRTL MSRs */
1006 	.has_config_tdp			= 0,	/* DMR does not have CTDP MSRs */
1007 };
1008 
1009 static const struct platform_features srf_features = {
1010 	.has_msr_misc_feature_control = 1,
1011 	.has_msr_misc_pwr_mgmt = 1,
1012 	.has_nhm_msrs = 1,
1013 	.has_config_tdp = 1,
1014 	.bclk_freq = BCLK_100MHZ,
1015 	.supported_cstates = CC1 | CC6 | PC2 | PC6,
1016 	.cst_limit = CST_LIMIT_SKX,
1017 	.has_msr_core_c1_res = 1,
1018 	.has_msr_module_c6_res_ms = 1,
1019 	.has_irtl_msrs = 1,
1020 	.has_cst_prewake_bit = 1,
1021 	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
1022 	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_PSYS,
1023 };
1024 
1025 static const struct platform_features grr_features = {
1026 	.has_msr_misc_feature_control = 1,
1027 	.has_msr_misc_pwr_mgmt = 1,
1028 	.has_nhm_msrs = 1,
1029 	.has_config_tdp = 1,
1030 	.bclk_freq = BCLK_100MHZ,
1031 	.supported_cstates = CC1 | CC6,
1032 	.cst_limit = CST_LIMIT_SKX,
1033 	.has_msr_core_c1_res = 1,
1034 	.has_msr_module_c6_res_ms = 1,
1035 	.has_irtl_msrs = 1,
1036 	.has_cst_prewake_bit = 1,
1037 	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
1038 	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_PSYS,
1039 };
1040 
1041 static const struct platform_features slv_features = {
1042 	.has_nhm_msrs = 1,
1043 	.bclk_freq = BCLK_SLV,
1044 	.supported_cstates = CC1 | CC6 | PC6,
1045 	.cst_limit = CST_LIMIT_SLV,
1046 	.has_msr_core_c1_res = 1,
1047 	.has_msr_module_c6_res_ms = 1,
1048 	.has_msr_c6_demotion_policy_config = 1,
1049 	.has_msr_atom_pkg_c6_residency = 1,
1050 	.trl_msrs = TRL_ATOM,
1051 	.rapl_msrs = RAPL_PKG | RAPL_CORE,
1052 	.has_rapl_divisor = 1,
1053 	.rapl_quirk_tdp = 30,
1054 };
1055 
1056 static const struct platform_features slvd_features = {
1057 	.has_msr_misc_pwr_mgmt = 1,
1058 	.has_nhm_msrs = 1,
1059 	.bclk_freq = BCLK_SLV,
1060 	.supported_cstates = CC1 | CC6 | PC3 | PC6,
1061 	.cst_limit = CST_LIMIT_SLV,
1062 	.has_msr_atom_pkg_c6_residency = 1,
1063 	.trl_msrs = TRL_BASE,
1064 	.rapl_msrs = RAPL_PKG | RAPL_CORE,
1065 	.rapl_quirk_tdp = 30,
1066 };
1067 
1068 static const struct platform_features amt_features = {
1069 	.has_nhm_msrs = 1,
1070 	.bclk_freq = BCLK_133MHZ,
1071 	.supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6,
1072 	.cst_limit = CST_LIMIT_AMT,
1073 	.trl_msrs = TRL_BASE,
1074 };
1075 
1076 static const struct platform_features gmt_features = {
1077 	.has_msr_misc_pwr_mgmt = 1,
1078 	.has_nhm_msrs = 1,
1079 	.bclk_freq = BCLK_100MHZ,
1080 	.crystal_freq = 19200000,
1081 	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
1082 	.cst_limit = CST_LIMIT_GMT,
1083 	.has_irtl_msrs = 1,
1084 	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
1085 	.rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO,
1086 };
1087 
1088 static const struct platform_features gmtd_features = {
1089 	.has_msr_misc_pwr_mgmt = 1,
1090 	.has_nhm_msrs = 1,
1091 	.bclk_freq = BCLK_100MHZ,
1092 	.crystal_freq = 25000000,
1093 	.supported_cstates = CC1 | CC6 | PC2 | PC6,
1094 	.cst_limit = CST_LIMIT_GMT,
1095 	.has_irtl_msrs = 1,
1096 	.has_msr_core_c1_res = 1,
1097 	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
1098 	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS,
1099 };
1100 
1101 static const struct platform_features gmtp_features = {
1102 	.has_msr_misc_pwr_mgmt = 1,
1103 	.has_nhm_msrs = 1,
1104 	.bclk_freq = BCLK_100MHZ,
1105 	.crystal_freq = 19200000,
1106 	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
1107 	.cst_limit = CST_LIMIT_GMT,
1108 	.has_irtl_msrs = 1,
1109 	.trl_msrs = TRL_BASE,
1110 	.rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO,
1111 };
1112 
1113 static const struct platform_features tmt_features = {
1114 	.has_msr_misc_pwr_mgmt = 1,
1115 	.has_nhm_msrs = 1,
1116 	.bclk_freq = BCLK_100MHZ,
1117 	.supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
1118 	.cst_limit = CST_LIMIT_GMT,
1119 	.has_irtl_msrs = 1,
1120 	.trl_msrs = TRL_BASE,
1121 	.rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX,
1122 	.enable_tsc_tweak = 1,
1123 };
1124 
1125 static const struct platform_features tmtd_features = {
1126 	.has_msr_misc_pwr_mgmt = 1,
1127 	.has_nhm_msrs = 1,
1128 	.bclk_freq = BCLK_100MHZ,
1129 	.supported_cstates = CC1 | CC6,
1130 	.cst_limit = CST_LIMIT_GMT,
1131 	.has_irtl_msrs = 1,
1132 	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
1133 	.rapl_msrs = RAPL_PKG_ALL,
1134 };
1135 
1136 static const struct platform_features knl_features = {
1137 	.has_msr_misc_pwr_mgmt = 1,
1138 	.has_nhm_msrs = 1,
1139 	.has_config_tdp = 1,
1140 	.bclk_freq = BCLK_100MHZ,
1141 	.supported_cstates = CC1 | CC6 | PC3 | PC6,
1142 	.cst_limit = CST_LIMIT_KNL,
1143 	.has_msr_knl_core_c6_residency = 1,
1144 	.trl_msrs = TRL_KNL,
1145 	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
1146 	.has_fixed_rapl_unit = 1,
1147 	.need_perf_multiplier = 1,
1148 };
1149 
1150 static const struct platform_features default_features = {
1151 };
1152 
1153 static const struct platform_features amd_features_with_rapl = {
1154 	.rapl_msrs = RAPL_AMD_F17H,
1155 	.has_per_core_rapl = 1,
1156 	.rapl_quirk_tdp = 280,	/* This is the max stock TDP of HEDT/Server Fam17h+ chips */
1157 };
1158 
1159 static const struct platform_data turbostat_pdata[] = {
1160 	{ INTEL_NEHALEM, &nhm_features },
1161 	{ INTEL_NEHALEM_G, &nhm_features },
1162 	{ INTEL_NEHALEM_EP, &nhm_features },
1163 	{ INTEL_NEHALEM_EX, &nhx_features },
1164 	{ INTEL_WESTMERE, &nhm_features },
1165 	{ INTEL_WESTMERE_EP, &nhm_features },
1166 	{ INTEL_WESTMERE_EX, &nhx_features },
1167 	{ INTEL_SANDYBRIDGE, &snb_features },
1168 	{ INTEL_SANDYBRIDGE_X, &snx_features },
1169 	{ INTEL_IVYBRIDGE, &ivb_features },
1170 	{ INTEL_IVYBRIDGE_X, &ivx_features },
1171 	{ INTEL_HASWELL, &hsw_features },
1172 	{ INTEL_HASWELL_X, &hsx_features },
1173 	{ INTEL_HASWELL_L, &hswl_features },
1174 	{ INTEL_HASWELL_G, &hswg_features },
1175 	{ INTEL_BROADWELL, &bdw_features },
1176 	{ INTEL_BROADWELL_G, &bdwg_features },
1177 	{ INTEL_BROADWELL_X, &bdx_features },
1178 	{ INTEL_BROADWELL_D, &bdx_features },
1179 	{ INTEL_SKYLAKE_L, &skl_features },
1180 	{ INTEL_SKYLAKE, &skl_features },
1181 	{ INTEL_SKYLAKE_X, &skx_features },
1182 	{ INTEL_KABYLAKE_L, &skl_features },
1183 	{ INTEL_KABYLAKE, &skl_features },
1184 	{ INTEL_COMETLAKE, &skl_features },
1185 	{ INTEL_COMETLAKE_L, &skl_features },
1186 	{ INTEL_CANNONLAKE_L, &cnl_features },
1187 	{ INTEL_ICELAKE_X, &icx_features },
1188 	{ INTEL_ICELAKE_D, &icx_features },
1189 	{ INTEL_ICELAKE_L, &cnl_features },
1190 	{ INTEL_ICELAKE_NNPI, &cnl_features },
1191 	{ INTEL_ROCKETLAKE, &cnl_features },
1192 	{ INTEL_TIGERLAKE_L, &cnl_features },
1193 	{ INTEL_TIGERLAKE, &cnl_features },
1194 	{ INTEL_SAPPHIRERAPIDS_X, &spr_features },
1195 	{ INTEL_EMERALDRAPIDS_X, &spr_features },
1196 	{ INTEL_GRANITERAPIDS_X, &spr_features },
1197 	{ INTEL_GRANITERAPIDS_D, &spr_features },
1198 	{ INTEL_PANTHERCOVE_X, &dmr_features },
1199 	{ INTEL_LAKEFIELD, &cnl_features },
1200 	{ INTEL_ALDERLAKE, &adl_features },
1201 	{ INTEL_ALDERLAKE_L, &adl_features },
1202 	{ INTEL_RAPTORLAKE, &adl_features },
1203 	{ INTEL_RAPTORLAKE_P, &adl_features },
1204 	{ INTEL_RAPTORLAKE_S, &adl_features },
1205 	{ INTEL_BARTLETTLAKE, &adl_features },
1206 	{ INTEL_METEORLAKE, &adl_features },
1207 	{ INTEL_METEORLAKE_L, &adl_features },
1208 	{ INTEL_ARROWLAKE_H, &adl_features },
1209 	{ INTEL_ARROWLAKE_U, &adl_features },
1210 	{ INTEL_ARROWLAKE, &adl_features },
1211 	{ INTEL_LUNARLAKE_M, &lnl_features },
1212 	{ INTEL_PANTHERLAKE_L, &lnl_features },
1213 	{ INTEL_ATOM_SILVERMONT, &slv_features },
1214 	{ INTEL_ATOM_SILVERMONT_D, &slvd_features },
1215 	{ INTEL_ATOM_AIRMONT, &amt_features },
1216 	{ INTEL_ATOM_GOLDMONT, &gmt_features },
1217 	{ INTEL_ATOM_GOLDMONT_D, &gmtd_features },
1218 	{ INTEL_ATOM_GOLDMONT_PLUS, &gmtp_features },
1219 	{ INTEL_ATOM_TREMONT_D, &tmtd_features },
1220 	{ INTEL_ATOM_TREMONT, &tmt_features },
1221 	{ INTEL_ATOM_TREMONT_L, &tmt_features },
1222 	{ INTEL_ATOM_GRACEMONT, &adl_features },
1223 	{ INTEL_ATOM_CRESTMONT_X, &srf_features },
1224 	{ INTEL_ATOM_CRESTMONT, &grr_features },
1225 	{ INTEL_ATOM_DARKMONT_X, &srf_features },
1226 	{ INTEL_XEON_PHI_KNL, &knl_features },
1227 	{ INTEL_XEON_PHI_KNM, &knl_features },
1228 	/*
1229 	 * Missing support for
1230 	 * INTEL_ICELAKE
1231 	 * INTEL_ATOM_SILVERMONT_MID
1232 	 * INTEL_ATOM_SILVERMONT_MID2
1233 	 * INTEL_ATOM_AIRMONT_NP
1234 	 */
1235 	{ 0, NULL },
1236 };
1237 
/*
 * Feature set of the CPU turbostat is running on.  It has static storage and
 * no initializer, so it is NULL until probe_platform_features() assigns it.
 */
static const struct platform_features *platform;
1239 
1240 void probe_platform_features(unsigned int family, unsigned int model)
1241 {
1242 	int i;
1243 
1244 	if (authentic_amd || hygon_genuine) {
1245 		/* fallback to default features on unsupported models */
1246 		force_load++;
1247 		if (max_extended_level >= 0x80000007) {
1248 			unsigned int eax, ebx, ecx, edx;
1249 
1250 			__cpuid(0x80000007, eax, ebx, ecx, edx);
1251 			/* RAPL (Fam 17h+) */
1252 			if ((edx & (1 << 14)) && family >= 0x17)
1253 				platform = &amd_features_with_rapl;
1254 		}
1255 		goto end;
1256 	}
1257 
1258 	if (!genuine_intel)
1259 		goto end;
1260 
1261 	for (i = 0; turbostat_pdata[i].features; i++) {
1262 		if (VFM_FAMILY(turbostat_pdata[i].vfm) == family && VFM_MODEL(turbostat_pdata[i].vfm) == model) {
1263 			platform = turbostat_pdata[i].features;
1264 			return;
1265 		}
1266 	}
1267 
1268 end:
1269 	if (force_load && !platform) {
1270 		fprintf(outf, "Forced to run on unsupported platform!\n");
1271 		platform = &default_features;
1272 	}
1273 
1274 	if (platform)
1275 		return;
1276 
1277 	fprintf(stderr, "Unsupported platform detected.\n\tSee RUN THE LATEST VERSION on turbostat(8)\n");
1278 	exit(1);
1279 }
1280 
1281 /* Model specific support End */
1282 
/* Default TjMax value. NOTE(review): presumably degrees Celsius - confirm at the use sites. */
#define	TJMAX_DEFAULT	100

/*
 * MSRs that are not yet in the kernel-provided header.
 * The two *_ENERGY_STAT MSRs are the ones used by the RAPL_AMD_F17H entries
 * of rapl_counter_arch_infos[].
 */
#define MSR_RAPL_PWR_UNIT	0xc0010299
#define MSR_CORE_ENERGY_STAT	0xc001029a
#define MSR_PKG_ENERGY_STAT	0xc001029b

/* Note: each argument is expanded twice, so avoid arguments with side effects. */
#define MAX(a, b) ((a) > (b) ? (a) : (b))

int backwards_count;
char *progname;

#define CPU_SUBSET_MAXCPUS	8192	/* need to use before probe... */
/* CPU sets and, below, one size variable per set (same order). */
cpu_set_t *cpu_present_set, *cpu_possible_set, *cpu_effective_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset;
size_t cpu_present_setsize, cpu_possible_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affinity_setsize,
    cpu_subset_size;
/* NOTE(review): presumably upper bounds on counters added at run time, per scope - confirm. */
#define MAX_ADDED_THREAD_COUNTERS 24
#define MAX_ADDED_CORE_COUNTERS 8
#define MAX_ADDED_PACKAGE_COUNTERS 16
#define PMT_MAX_ADDED_THREAD_COUNTERS 24
#define PMT_MAX_ADDED_CORE_COUNTERS 8
#define PMT_MAX_ADDED_PACKAGE_COUNTERS 16
#define BITMASK_SIZE 32

/*
 * Zero a whole array with memset(), using sizeof(arr) as the length.
 * NOTE(review): __must_be_array() is presumably a compile-time check from
 * BUILD_BUG_HEADER that rejects pointer arguments - confirm.
 */
#define ZERO_ARRAY(arr) (memset(arr, 0, sizeof(arr)) + __must_be_array(arr))
1308 
/*
 * Indexes used to map data read from perf and MSRs into global variables.
 * These values index the per-counter arrays in struct rapl_counter_info_t.
 */
enum rapl_rci_index {
	RAPL_RCI_INDEX_ENERGY_PKG = 0,
	RAPL_RCI_INDEX_ENERGY_CORES = 1,
	RAPL_RCI_INDEX_DRAM = 2,
	RAPL_RCI_INDEX_GFX = 3,
	RAPL_RCI_INDEX_PKG_PERF_STATUS = 4,
	RAPL_RCI_INDEX_DRAM_PERF_STATUS = 5,
	RAPL_RCI_INDEX_CORE_ENERGY = 6,
	RAPL_RCI_INDEX_ENERGY_PLATFORM = 7,
	/* Number of indexes above; sizes the arrays, so it must stay last. */
	NUM_RAPL_COUNTERS,
};
1321 
/* Unit tag stored alongside a RAPL counter value. */
enum rapl_unit {
	/* First enumerator, hence value 0: zero-initialized storage reads as "invalid". */
	RAPL_UNIT_INVALID,
	RAPL_UNIT_JOULES,
	RAPL_UNIT_WATTS,
};
1327 
/*
 * Per-domain RAPL counter state.  Every array has NUM_RAPL_COUNTERS slots,
 * i.e. one slot per enum rapl_rci_index value.
 */
struct rapl_counter_info_t {
	unsigned long long data[NUM_RAPL_COUNTERS];
	enum counter_source source[NUM_RAPL_COUNTERS];
	unsigned long long flags[NUM_RAPL_COUNTERS];
	double scale[NUM_RAPL_COUNTERS];
	enum rapl_unit unit[NUM_RAPL_COUNTERS];
	unsigned long long msr[NUM_RAPL_COUNTERS];
	unsigned long long msr_mask[NUM_RAPL_COUNTERS];
	int msr_shift[NUM_RAPL_COUNTERS];

	/* NOTE(review): presumably the perf event (group) file descriptor for this domain - confirm. */
	int fd_perf;
};

/* struct rapl_counter_info_t for each RAPL domain */
struct rapl_counter_info_t *rapl_counter_info_perdomain;
/* NOTE(review): presumably the element count of the array above - confirm. */
unsigned int rapl_counter_info_perdomain_size;
1344 
/* Bit flags for the 'flags' member of struct rapl_counter_arch_info. */
#define RAPL_COUNTER_FLAG_PLATFORM_COUNTER (1u << 0)
#define RAPL_COUNTER_FLAG_USE_MSR_SUM (1u << 1)

/*
 * Static description of one RAPL counter: where it can be read from (perf
 * event name and/or MSR) and how the raw value is to be interpreted.
 */
struct rapl_counter_arch_info {
	int feature_mask;	/* Mask for testing if the counter is supported on host */
	const char *perf_subsys;	/* NULL in entries that name no perf event */
	const char *perf_name;	/* NULL in entries that name no perf event */
	unsigned long long msr;
	unsigned long long msr_mask;
	int msr_shift;		/* Positive means shift right, negative means shift left */
	double *platform_rapl_msr_scale;	/* Scale applied to values read by MSR (platform dependent, filled at runtime) */
	unsigned int rci_index;	/* Maps data from perf counters to global variables */
	unsigned int bic_number;
	double compat_scale;	/* Some counters require constant scaling to be in the same range as other, similar ones */
	unsigned long long flags;	/* RAPL_COUNTER_FLAG_* bits */
};
1361 
1362 static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = {
1363 	{
1364 	 .feature_mask = RAPL_PKG,
1365 	 .perf_subsys = "power",
1366 	 .perf_name = "energy-pkg",
1367 	 .msr = MSR_PKG_ENERGY_STATUS,
1368 	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
1369 	 .msr_shift = 0,
1370 	 .platform_rapl_msr_scale = &rapl_energy_units,
1371 	 .rci_index = RAPL_RCI_INDEX_ENERGY_PKG,
1372 	 .bic_number = BIC_PkgWatt,
1373 	 .compat_scale = 1.0,
1374 	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
1375 	  },
1376 	{
1377 	 .feature_mask = RAPL_PKG,
1378 	 .perf_subsys = "power",
1379 	 .perf_name = "energy-pkg",
1380 	 .msr = MSR_PKG_ENERGY_STATUS,
1381 	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
1382 	 .msr_shift = 0,
1383 	 .platform_rapl_msr_scale = &rapl_energy_units,
1384 	 .rci_index = RAPL_RCI_INDEX_ENERGY_PKG,
1385 	 .bic_number = BIC_Pkg_J,
1386 	 .compat_scale = 1.0,
1387 	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
1388 	  },
1389 	{
1390 	 .feature_mask = RAPL_AMD_F17H,
1391 	 .perf_subsys = "power",
1392 	 .perf_name = "energy-pkg",
1393 	 .msr = MSR_PKG_ENERGY_STAT,
1394 	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
1395 	 .msr_shift = 0,
1396 	 .platform_rapl_msr_scale = &rapl_energy_units,
1397 	 .rci_index = RAPL_RCI_INDEX_ENERGY_PKG,
1398 	 .bic_number = BIC_PkgWatt,
1399 	 .compat_scale = 1.0,
1400 	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
1401 	  },
1402 	{
1403 	 .feature_mask = RAPL_AMD_F17H,
1404 	 .perf_subsys = "power",
1405 	 .perf_name = "energy-pkg",
1406 	 .msr = MSR_PKG_ENERGY_STAT,
1407 	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
1408 	 .msr_shift = 0,
1409 	 .platform_rapl_msr_scale = &rapl_energy_units,
1410 	 .rci_index = RAPL_RCI_INDEX_ENERGY_PKG,
1411 	 .bic_number = BIC_Pkg_J,
1412 	 .compat_scale = 1.0,
1413 	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
1414 	  },
1415 	{
1416 	 .feature_mask = RAPL_CORE_ENERGY_STATUS,
1417 	 .perf_subsys = "power",
1418 	 .perf_name = "energy-cores",
1419 	 .msr = MSR_PP0_ENERGY_STATUS,
1420 	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
1421 	 .msr_shift = 0,
1422 	 .platform_rapl_msr_scale = &rapl_energy_units,
1423 	 .rci_index = RAPL_RCI_INDEX_ENERGY_CORES,
1424 	 .bic_number = BIC_CorWatt,
1425 	 .compat_scale = 1.0,
1426 	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
1427 	  },
1428 	{
1429 	 .feature_mask = RAPL_CORE_ENERGY_STATUS,
1430 	 .perf_subsys = "power",
1431 	 .perf_name = "energy-cores",
1432 	 .msr = MSR_PP0_ENERGY_STATUS,
1433 	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
1434 	 .msr_shift = 0,
1435 	 .platform_rapl_msr_scale = &rapl_energy_units,
1436 	 .rci_index = RAPL_RCI_INDEX_ENERGY_CORES,
1437 	 .bic_number = BIC_Cor_J,
1438 	 .compat_scale = 1.0,
1439 	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
1440 	  },
1441 	{
1442 	 .feature_mask = RAPL_DRAM,
1443 	 .perf_subsys = "power",
1444 	 .perf_name = "energy-ram",
1445 	 .msr = MSR_DRAM_ENERGY_STATUS,
1446 	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
1447 	 .msr_shift = 0,
1448 	 .platform_rapl_msr_scale = &rapl_dram_energy_units,
1449 	 .rci_index = RAPL_RCI_INDEX_DRAM,
1450 	 .bic_number = BIC_RAMWatt,
1451 	 .compat_scale = 1.0,
1452 	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
1453 	  },
1454 	{
1455 	 .feature_mask = RAPL_DRAM,
1456 	 .perf_subsys = "power",
1457 	 .perf_name = "energy-ram",
1458 	 .msr = MSR_DRAM_ENERGY_STATUS,
1459 	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
1460 	 .msr_shift = 0,
1461 	 .platform_rapl_msr_scale = &rapl_dram_energy_units,
1462 	 .rci_index = RAPL_RCI_INDEX_DRAM,
1463 	 .bic_number = BIC_RAM_J,
1464 	 .compat_scale = 1.0,
1465 	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
1466 	  },
1467 	{
1468 	 .feature_mask = RAPL_GFX,
1469 	 .perf_subsys = "power",
1470 	 .perf_name = "energy-gpu",
1471 	 .msr = MSR_PP1_ENERGY_STATUS,
1472 	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
1473 	 .msr_shift = 0,
1474 	 .platform_rapl_msr_scale = &rapl_energy_units,
1475 	 .rci_index = RAPL_RCI_INDEX_GFX,
1476 	 .bic_number = BIC_GFXWatt,
1477 	 .compat_scale = 1.0,
1478 	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
1479 	  },
1480 	{
1481 	 .feature_mask = RAPL_GFX,
1482 	 .perf_subsys = "power",
1483 	 .perf_name = "energy-gpu",
1484 	 .msr = MSR_PP1_ENERGY_STATUS,
1485 	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
1486 	 .msr_shift = 0,
1487 	 .platform_rapl_msr_scale = &rapl_energy_units,
1488 	 .rci_index = RAPL_RCI_INDEX_GFX,
1489 	 .bic_number = BIC_GFX_J,
1490 	 .compat_scale = 1.0,
1491 	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
1492 	  },
1493 	{
1494 	 .feature_mask = RAPL_PKG_PERF_STATUS,
1495 	 .perf_subsys = NULL,
1496 	 .perf_name = NULL,
1497 	 .msr = MSR_PKG_PERF_STATUS,
1498 	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
1499 	 .msr_shift = 0,
1500 	 .platform_rapl_msr_scale = &rapl_time_units,
1501 	 .rci_index = RAPL_RCI_INDEX_PKG_PERF_STATUS,
1502 	 .bic_number = BIC_PKG__,
1503 	 .compat_scale = 100.0,
1504 	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
1505 	  },
1506 	{
1507 	 .feature_mask = RAPL_DRAM_PERF_STATUS,
1508 	 .perf_subsys = NULL,
1509 	 .perf_name = NULL,
1510 	 .msr = MSR_DRAM_PERF_STATUS,
1511 	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
1512 	 .msr_shift = 0,
1513 	 .platform_rapl_msr_scale = &rapl_time_units,
1514 	 .rci_index = RAPL_RCI_INDEX_DRAM_PERF_STATUS,
1515 	 .bic_number = BIC_RAM__,
1516 	 .compat_scale = 100.0,
1517 	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
1518 	  },
1519 	{
1520 	 .feature_mask = RAPL_AMD_F17H,
1521 	 .perf_subsys = NULL,
1522 	 .perf_name = NULL,
1523 	 .msr = MSR_CORE_ENERGY_STAT,
1524 	 .msr_mask = 0xFFFFFFFF,
1525 	 .msr_shift = 0,
1526 	 .platform_rapl_msr_scale = &rapl_energy_units,
1527 	 .rci_index = RAPL_RCI_INDEX_CORE_ENERGY,
1528 	 .bic_number = BIC_CorWatt,
1529 	 .compat_scale = 1.0,
1530 	 .flags = 0,
1531 	  },
1532 	{
1533 	 .feature_mask = RAPL_AMD_F17H,
1534 	 .perf_subsys = NULL,
1535 	 .perf_name = NULL,
1536 	 .msr = MSR_CORE_ENERGY_STAT,
1537 	 .msr_mask = 0xFFFFFFFF,
1538 	 .msr_shift = 0,
1539 	 .platform_rapl_msr_scale = &rapl_energy_units,
1540 	 .rci_index = RAPL_RCI_INDEX_CORE_ENERGY,
1541 	 .bic_number = BIC_Cor_J,
1542 	 .compat_scale = 1.0,
1543 	 .flags = 0,
1544 	  },
1545 	{
1546 	 .feature_mask = RAPL_PSYS,
1547 	 .perf_subsys = "power",
1548 	 .perf_name = "energy-psys",
1549 	 .msr = MSR_PLATFORM_ENERGY_STATUS,
1550 	 .msr_mask = 0x00000000FFFFFFFF,
1551 	 .msr_shift = 0,
1552 	 .platform_rapl_msr_scale = &rapl_psys_energy_units,
1553 	 .rci_index = RAPL_RCI_INDEX_ENERGY_PLATFORM,
1554 	 .bic_number = BIC_SysWatt,
1555 	 .compat_scale = 1.0,
1556 	 .flags = RAPL_COUNTER_FLAG_PLATFORM_COUNTER | RAPL_COUNTER_FLAG_USE_MSR_SUM,
1557 	  },
1558 	{
1559 	 .feature_mask = RAPL_PSYS,
1560 	 .perf_subsys = "power",
1561 	 .perf_name = "energy-psys",
1562 	 .msr = MSR_PLATFORM_ENERGY_STATUS,
1563 	 .msr_mask = 0x00000000FFFFFFFF,
1564 	 .msr_shift = 0,
1565 	 .platform_rapl_msr_scale = &rapl_psys_energy_units,
1566 	 .rci_index = RAPL_RCI_INDEX_ENERGY_PLATFORM,
1567 	 .bic_number = BIC_Sys_J,
1568 	 .compat_scale = 1.0,
1569 	 .flags = RAPL_COUNTER_FLAG_PLATFORM_COUNTER | RAPL_COUNTER_FLAG_USE_MSR_SUM,
1570 	  },
1571 };
1572 
/* One RAPL reading: the raw counter value plus the unit and scale that go with it. */
struct rapl_counter {
	unsigned long long raw_value;
	enum rapl_unit unit;
	/* NOTE(review): presumably the factor applied to raw_value to obtain 'unit' - confirm. */
	double scale;
};
1578 
/*
 * Indexes used to map data read from perf and MSRs into global variables.
 * Core C-state indexes (CCSTATE_*) come first, package C-state indexes
 * (PCSTATE_*) follow; both index the arrays in struct cstate_counter_info_t.
 */
enum ccstate_rci_index {
	CCSTATE_RCI_INDEX_C1_RESIDENCY = 0,
	CCSTATE_RCI_INDEX_C3_RESIDENCY = 1,
	CCSTATE_RCI_INDEX_C6_RESIDENCY = 2,
	CCSTATE_RCI_INDEX_C7_RESIDENCY = 3,
	PCSTATE_RCI_INDEX_C2_RESIDENCY = 4,
	PCSTATE_RCI_INDEX_C3_RESIDENCY = 5,
	PCSTATE_RCI_INDEX_C6_RESIDENCY = 6,
	PCSTATE_RCI_INDEX_C7_RESIDENCY = 7,
	PCSTATE_RCI_INDEX_C8_RESIDENCY = 8,
	PCSTATE_RCI_INDEX_C9_RESIDENCY = 9,
	PCSTATE_RCI_INDEX_C10_RESIDENCY = 10,
	/* Number of indexes above; sizes the arrays, so it must stay last. */
	NUM_CSTATE_COUNTERS,
};
1594 
/*
 * C-state residency counter state.  Every array has NUM_CSTATE_COUNTERS
 * slots, i.e. one slot per enum ccstate_rci_index value.
 */
struct cstate_counter_info_t {
	unsigned long long data[NUM_CSTATE_COUNTERS];
	enum counter_source source[NUM_CSTATE_COUNTERS];
	unsigned long long msr[NUM_CSTATE_COUNTERS];
	/* NOTE(review): presumably perf file descriptors for the core and package C-state events - confirm. */
	int fd_perf_core;
	int fd_perf_pkg;
};

struct cstate_counter_info_t *ccstate_counter_info;
/* NOTE(review): presumably the element count of ccstate_counter_info - confirm. */
unsigned int ccstate_counter_info_size;
1605 
/*
 * Bit flags for the 'flags' member of struct cstate_counter_arch_info.
 * COLLECT_PER_THREAD deliberately includes the COLLECT_PER_CORE bit, so a
 * test for COLLECT_PER_CORE also matches per-thread counters.
 */
#define CSTATE_COUNTER_FLAG_COLLECT_PER_CORE   (1u << 0)
#define CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD ((1u << 1) | CSTATE_COUNTER_FLAG_COLLECT_PER_CORE)
#define CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY (1u << 2)

/*
 * Static description of one C-state residency counter: the perf event name
 * and MSR it can be read from, and where its value is stored and shown.
 */
struct cstate_counter_arch_info {
	int feature_mask;	/* Mask for testing if the counter is supported on host */
	const char *perf_subsys;
	const char *perf_name;
	unsigned long long msr;
	unsigned int rci_index;	/* Maps data from perf counters to global variables */
	unsigned int bic_number;
	unsigned long long flags;	/* CSTATE_COUNTER_FLAG_* bits */
	int pkg_cstate_limit;	/* 0 in the core C-state entries, a PCL_* value in the package ones */
};
1620 
1621 static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = {
1622 	{
1623 	 .feature_mask = CC1,
1624 	 .perf_subsys = "cstate_core",
1625 	 .perf_name = "c1-residency",
1626 	 .msr = MSR_CORE_C1_RES,
1627 	 .rci_index = CCSTATE_RCI_INDEX_C1_RESIDENCY,
1628 	 .bic_number = BIC_CPU_c1,
1629 	 .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD,
1630 	 .pkg_cstate_limit = 0,
1631 	  },
1632 	{
1633 	 .feature_mask = CC3,
1634 	 .perf_subsys = "cstate_core",
1635 	 .perf_name = "c3-residency",
1636 	 .msr = MSR_CORE_C3_RESIDENCY,
1637 	 .rci_index = CCSTATE_RCI_INDEX_C3_RESIDENCY,
1638 	 .bic_number = BIC_CPU_c3,
1639 	 .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY,
1640 	 .pkg_cstate_limit = 0,
1641 	  },
1642 	{
1643 	 .feature_mask = CC6,
1644 	 .perf_subsys = "cstate_core",
1645 	 .perf_name = "c6-residency",
1646 	 .msr = MSR_CORE_C6_RESIDENCY,
1647 	 .rci_index = CCSTATE_RCI_INDEX_C6_RESIDENCY,
1648 	 .bic_number = BIC_CPU_c6,
1649 	 .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY,
1650 	 .pkg_cstate_limit = 0,
1651 	  },
1652 	{
1653 	 .feature_mask = CC7,
1654 	 .perf_subsys = "cstate_core",
1655 	 .perf_name = "c7-residency",
1656 	 .msr = MSR_CORE_C7_RESIDENCY,
1657 	 .rci_index = CCSTATE_RCI_INDEX_C7_RESIDENCY,
1658 	 .bic_number = BIC_CPU_c7,
1659 	 .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY,
1660 	 .pkg_cstate_limit = 0,
1661 	  },
1662 	{
1663 	 .feature_mask = PC2,
1664 	 .perf_subsys = "cstate_pkg",
1665 	 .perf_name = "c2-residency",
1666 	 .msr = MSR_PKG_C2_RESIDENCY,
1667 	 .rci_index = PCSTATE_RCI_INDEX_C2_RESIDENCY,
1668 	 .bic_number = BIC_Pkgpc2,
1669 	 .flags = 0,
1670 	 .pkg_cstate_limit = PCL__2,
1671 	  },
1672 	{
1673 	 .feature_mask = PC3,
1674 	 .perf_subsys = "cstate_pkg",
1675 	 .perf_name = "c3-residency",
1676 	 .msr = MSR_PKG_C3_RESIDENCY,
1677 	 .rci_index = PCSTATE_RCI_INDEX_C3_RESIDENCY,
1678 	 .bic_number = BIC_Pkgpc3,
1679 	 .flags = 0,
1680 	 .pkg_cstate_limit = PCL__3,
1681 	  },
1682 	{
1683 	 .feature_mask = PC6,
1684 	 .perf_subsys = "cstate_pkg",
1685 	 .perf_name = "c6-residency",
1686 	 .msr = MSR_PKG_C6_RESIDENCY,
1687 	 .rci_index = PCSTATE_RCI_INDEX_C6_RESIDENCY,
1688 	 .bic_number = BIC_Pkgpc6,
1689 	 .flags = 0,
1690 	 .pkg_cstate_limit = PCL__6,
1691 	  },
1692 	{
1693 	 .feature_mask = PC7,
1694 	 .perf_subsys = "cstate_pkg",
1695 	 .perf_name = "c7-residency",
1696 	 .msr = MSR_PKG_C7_RESIDENCY,
1697 	 .rci_index = PCSTATE_RCI_INDEX_C7_RESIDENCY,
1698 	 .bic_number = BIC_Pkgpc7,
1699 	 .flags = 0,
1700 	 .pkg_cstate_limit = PCL__7,
1701 	  },
1702 	{
1703 	 .feature_mask = PC8,
1704 	 .perf_subsys = "cstate_pkg",
1705 	 .perf_name = "c8-residency",
1706 	 .msr = MSR_PKG_C8_RESIDENCY,
1707 	 .rci_index = PCSTATE_RCI_INDEX_C8_RESIDENCY,
1708 	 .bic_number = BIC_Pkgpc8,
1709 	 .flags = 0,
1710 	 .pkg_cstate_limit = PCL__8,
1711 	  },
1712 	{
1713 	 .feature_mask = PC9,
1714 	 .perf_subsys = "cstate_pkg",
1715 	 .perf_name = "c9-residency",
1716 	 .msr = MSR_PKG_C9_RESIDENCY,
1717 	 .rci_index = PCSTATE_RCI_INDEX_C9_RESIDENCY,
1718 	 .bic_number = BIC_Pkgpc9,
1719 	 .flags = 0,
1720 	 .pkg_cstate_limit = PCL__9,
1721 	  },
1722 	{
1723 	 .feature_mask = PC10,
1724 	 .perf_subsys = "cstate_pkg",
1725 	 .perf_name = "c10-residency",
1726 	 .msr = MSR_PKG_C10_RESIDENCY,
1727 	 .rci_index = PCSTATE_RCI_INDEX_C10_RESIDENCY,
1728 	 .bic_number = BIC_Pkgpc10,
1729 	 .flags = 0,
1730 	 .pkg_cstate_limit = PCL_10,
1731 	  },
1732 };
1733 
/*
 * Indexes used to map data read from perf and MSRs into global variables.
 * These values index the per-counter arrays in struct msr_counter_info_t.
 */
enum msr_rci_index {
	MSR_RCI_INDEX_APERF = 0,
	MSR_RCI_INDEX_MPERF = 1,
	MSR_RCI_INDEX_SMI = 2,
	/* Number of indexes above; sizes the arrays, so it must stay last. */
	NUM_MSR_COUNTERS,
};
1741 
/*
 * State for the APERF/MPERF/SMI counters.  Every array has NUM_MSR_COUNTERS
 * slots, i.e. one slot per enum msr_rci_index value.
 */
struct msr_counter_info_t {
	unsigned long long data[NUM_MSR_COUNTERS];
	enum counter_source source[NUM_MSR_COUNTERS];
	unsigned long long msr[NUM_MSR_COUNTERS];
	unsigned long long msr_mask[NUM_MSR_COUNTERS];
	/* NOTE(review): presumably the perf event (group) file descriptor - confirm. */
	int fd_perf;
};

struct msr_counter_info_t *msr_counter_info;
/* NOTE(review): presumably the element count of msr_counter_info - confirm. */
unsigned int msr_counter_info_size;
1752 
/*
 * Description of one counter that can be read either through the named perf
 * event or through the given MSR.
 */
struct msr_counter_arch_info {
	const char *perf_subsys;
	const char *perf_name;
	unsigned long long msr;
	unsigned long long msr_mask;
	unsigned int rci_index;	/* Maps data from perf counters to global variables */
	/* Not set by the static table; NOTE(review): presumably filled in at run time - confirm. */
	bool needed;
	bool present;
};

/* Positions of the entries in msr_counter_arch_infos[] (used as designated array indexes there). */
enum msr_arch_info_index {
	MSR_ARCH_INFO_APERF_INDEX = 0,
	MSR_ARCH_INFO_MPERF_INDEX = 1,
	MSR_ARCH_INFO_SMI_INDEX = 2,
};
1768 
1769 static struct msr_counter_arch_info msr_counter_arch_infos[] = {
1770 	[MSR_ARCH_INFO_APERF_INDEX] = {
1771 				       .perf_subsys = "msr",
1772 				       .perf_name = "aperf",
1773 				       .msr = MSR_IA32_APERF,
1774 				       .msr_mask = 0xFFFFFFFFFFFFFFFF,
1775 				       .rci_index = MSR_RCI_INDEX_APERF,
1776 				        },
1777 
1778 	[MSR_ARCH_INFO_MPERF_INDEX] = {
1779 				       .perf_subsys = "msr",
1780 				       .perf_name = "mperf",
1781 				       .msr = MSR_IA32_MPERF,
1782 				       .msr_mask = 0xFFFFFFFFFFFFFFFF,
1783 				       .rci_index = MSR_RCI_INDEX_MPERF,
1784 				        },
1785 
1786 	[MSR_ARCH_INFO_SMI_INDEX] = {
1787 				     .perf_subsys = "msr",
1788 				     .perf_name = "smi",
1789 				     .msr = MSR_SMI_COUNT,
1790 				     .msr_mask = 0xFFFFFFFF,
1791 				     .rci_index = MSR_RCI_INDEX_SMI,
1792 				      },
1793 };
1794 
/* Can be redefined when compiling, useful for testing. */
#ifndef SYSFS_TELEM_PATH
#define SYSFS_TELEM_PATH "/sys/class/intel_pmt"
#endif

/*
 * Location of the "MTL DC6" PMT counter: offset, bit range (LSB..MSB), and
 * the GUID/sequence number identifying the telemetry region.
 * NOTE(review): offset units (presumably bytes) are not visible here - confirm.
 */
#define PMT_COUNTER_MTL_DC6_OFFSET 120
#define PMT_COUNTER_MTL_DC6_LSB    0
#define PMT_COUNTER_MTL_DC6_MSB    63
#define PMT_MTL_DC6_GUID           0x1a067102
#define PMT_MTL_DC6_SEQ            0

/*
 * Location of the "CWF MC1E" PMT counters: a base offset plus a fixed
 * increment, with module-per-file and CPU-per-module counts.
 * NOTE(review): presumably one counter per module, laid out at
 * OFFSET_BASE + module * OFFSET_INCREMENT - confirm against the user.
 */
#define PMT_COUNTER_CWF_MC1E_OFFSET_BASE          20936
#define PMT_COUNTER_CWF_MC1E_OFFSET_INCREMENT     24
#define PMT_COUNTER_CWF_MC1E_NUM_MODULES_PER_FILE 12
#define PMT_COUNTER_CWF_CPUS_PER_MODULE           4
#define PMT_COUNTER_CWF_MC1E_LSB                  0
#define PMT_COUNTER_CWF_MC1E_MSB                  63
#define PMT_CWF_MC1E_GUID                         0x14421519

/* Initial value 800000000, i.e. 800 MHz going by the _hz suffix; not const, so it may be changed elsewhere. */
unsigned long long tcore_clock_freq_hz = 800000000;

/* Buffer sizes in bytes; PMT_COUNTER_NAME_SIZE_BYTES sizes pmt_counter.name. */
#define PMT_COUNTER_NAME_SIZE_BYTES      16
#define PMT_COUNTER_TYPE_NAME_SIZE_BYTES 32
1818 
/*
 * One mapped PMT telemetry region, chained into a singly linked list through
 * 'next'.  The declaration also defines the global pointer pmt_mmios
 * (NOTE(review): presumably the list head - confirm).
 */
struct pmt_mmio {
	struct pmt_mmio *next;

	/* Identifier of the telemetry region. */
	unsigned int guid;
	/* NOTE(review): presumably the size of the mapping in bytes - confirm. */
	unsigned int size;

	/* Base pointer to the mmaped memory. */
	void *mmio_base;

	/*
	 * Offset to be applied to the mmio_base
	 * to get the beginning of the PMT counters for given GUID.
	 */
	unsigned long pmt_offset;
} *pmt_mmios;
1834 
/*
 * Type of a PMT counter (struct pmt_counter.type).
 *
 * print_header() gives PMT_TYPE_RAW counters a fixed-width column chosen
 * from the counter's bit width; the other two types are printed with the
 * plain counter name.
 * NOTE(review): the value conversions implied by XTAL_TIME and TCORE_CLOCK
 * are performed elsewhere in the file - confirm there.
 */
enum pmt_datatype {
	PMT_TYPE_RAW,
	PMT_TYPE_XTAL_TIME,
	PMT_TYPE_TCORE_CLOCK,
};
1840 
/*
 * Per-domain state of a PMT counter; struct pmt_counter owns an array of
 * these ('domains', 'num_domains' elements).
 */
struct pmt_domain_info {
	/*
	 * Pointer to the MMIO obtained by applying a counter offset
	 * to the mmio_base of the mmaped region for the given GUID.
	 *
	 * This is where to read the raw value of the counter from.
	 */
	unsigned long *pcounter;
};
1850 
/*
 * Description of one added PMT counter.
 * Counters are chained through 'next'; the list heads live in
 * struct sys_counters (pmt_tp, pmt_cp, pmt_pp).
 */
struct pmt_counter {
	struct pmt_counter *next;

	/* PMT metadata */
	char name[PMT_COUNTER_NAME_SIZE_BYTES];
	enum pmt_datatype type;
	enum counter_scope scope;
	/* Lowest and highest bit of the counter; width is (msb - lsb) + 1. */
	unsigned int lsb;
	unsigned int msb;

	/* BIC-like metadata */
	enum counter_format format;

	/* Array of 'num_domains' elements, grown by pmt_counter_resize(). */
	unsigned int num_domains;
	struct pmt_domain_info *domains;
};
1867 
/*
 * PMT telemetry directory iterator.
 * Used to iterate telemetry files in sysfs in correct order.
 *
 * Lifecycle: pmt_diriter_init() -> pmt_diriter_begin() ->
 * pmt_diriter_next()... -> pmt_diriter_remove().
 */
struct pmt_diriter_t {
	/* Non-NULL once the directory was opened by pmt_diriter_begin(). */
	DIR *dir;
	/* Entry list allocated by scandir(); released by pmt_diriter_remove(). */
	struct dirent **namelist;
	/* Number of entries in namelist. */
	unsigned int num_names;
	/* Index of the entry the next pmt_diriter_next() call returns. */
	unsigned int current_name_idx;
};
1878 
/*
 * scandir() filter selecting directory entries named "telem<N>".
 *
 * Returns 1 when d_name starts with "telem" followed by an unsigned
 * decimal number, 0 otherwise.
 *
 * sscanf() returns EOF (a non-zero value) when the input ends before the
 * first conversion completes, e.g. for an entry named "telem" or "tel".
 * Comparing against the expected number of conversions keeps such entries
 * from being accepted.
 */
int pmt_telemdir_filter(const struct dirent *e)
{
	unsigned int dummy;

	return sscanf(e->d_name, "telem%u", &dummy) == 1;
}
1885 
/*
 * scandir() comparator ordering "telem<N>" entries by their numeric index.
 *
 * Returns a negative value, zero or a positive value when the index of *a
 * is respectively lower than, equal to or greater than the index of *b,
 * as required from a qsort()-style comparison function.
 * Names that do not parse are treated as index 0.
 */
int pmt_telemdir_sort(const struct dirent **a, const struct dirent **b)
{
	unsigned int aidx = 0, bidx = 0;

	sscanf((*a)->d_name, "telem%u", &aidx);
	sscanf((*b)->d_name, "telem%u", &bidx);

	/* Three-way compare without the overflow risk of a subtraction. */
	return (aidx > bidx) - (aidx < bidx);
}
1895 
1896 const struct dirent *pmt_diriter_next(struct pmt_diriter_t *iter)
1897 {
1898 	const struct dirent *ret = NULL;
1899 
1900 	if (!iter->dir)
1901 		return NULL;
1902 
1903 	if (iter->current_name_idx >= iter->num_names)
1904 		return NULL;
1905 
1906 	ret = iter->namelist[iter->current_name_idx];
1907 	++iter->current_name_idx;
1908 
1909 	return ret;
1910 }
1911 
1912 const struct dirent *pmt_diriter_begin(struct pmt_diriter_t *iter, const char *pmt_root_path)
1913 {
1914 	int num_names = iter->num_names;
1915 
1916 	if (!iter->dir) {
1917 		iter->dir = opendir(pmt_root_path);
1918 		if (iter->dir == NULL)
1919 			return NULL;
1920 
1921 		num_names = scandir(pmt_root_path, &iter->namelist, pmt_telemdir_filter, pmt_telemdir_sort);
1922 		if (num_names == -1)
1923 			return NULL;
1924 	}
1925 
1926 	iter->current_name_idx = 0;
1927 	iter->num_names = num_names;
1928 
1929 	return pmt_diriter_next(iter);
1930 }
1931 
1932 void pmt_diriter_init(struct pmt_diriter_t *iter)
1933 {
1934 	memset(iter, 0, sizeof(*iter));
1935 }
1936 
1937 void pmt_diriter_remove(struct pmt_diriter_t *iter)
1938 {
1939 	if (iter->namelist) {
1940 		for (unsigned int i = 0; i < iter->num_names; i++) {
1941 			free(iter->namelist[i]);
1942 			iter->namelist[i] = NULL;
1943 		}
1944 	}
1945 
1946 	free(iter->namelist);
1947 	iter->namelist = NULL;
1948 	iter->num_names = 0;
1949 	iter->current_name_idx = 0;
1950 
1951 	closedir(iter->dir);
1952 	iter->dir = NULL;
1953 }
1954 
1955 unsigned int pmt_counter_get_width(const struct pmt_counter *p)
1956 {
1957 	return (p->msb - p->lsb) + 1;
1958 }
1959 
1960 void pmt_counter_resize_(struct pmt_counter *pcounter, unsigned int new_size)
1961 {
1962 	struct pmt_domain_info *new_mem;
1963 
1964 	new_mem = (struct pmt_domain_info *)reallocarray(pcounter->domains, new_size, sizeof(*pcounter->domains));
1965 	if (!new_mem) {
1966 		fprintf(stderr, "%s: failed to allocate memory for PMT counters\n", __func__);
1967 		exit(1);
1968 	}
1969 
1970 	/* Zero initialize just allocated memory. */
1971 	const size_t num_new_domains = new_size - pcounter->num_domains;
1972 
1973 	memset(&new_mem[pcounter->num_domains], 0, num_new_domains * sizeof(*pcounter->domains));
1974 
1975 	pcounter->num_domains = new_size;
1976 	pcounter->domains = new_mem;
1977 }
1978 
1979 void pmt_counter_resize(struct pmt_counter *pcounter, unsigned int new_size)
1980 {
1981 	/*
1982 	 * Allocate more memory ahead of time.
1983 	 *
1984 	 * Always allocate space for at least 8 elements
1985 	 * and double the size when growing.
1986 	 */
1987 	if (new_size < 8)
1988 		new_size = 8;
1989 	new_size = MAX(new_size, pcounter->num_domains * 2);
1990 
1991 	pmt_counter_resize_(pcounter, new_size);
1992 }
1993 
/*
 * Counter values collected per thread (logical CPU).
 *
 * Two sets exist, reached through thread_even and thread_odd; individual
 * records are located with GET_THREAD().
 */
struct thread_data {
	struct timeval tv_begin;
	struct timeval tv_end;
	struct timeval tv_delta;
	unsigned long long tsc;
	unsigned long long aperf;
	unsigned long long mperf;
	unsigned long long c1;
	unsigned long long instr_count;
	unsigned long long irq_count;
	unsigned long long nmi_count;
	unsigned int smi_count;
	/* CPU number; for_all_cpus() uses it for the allowed-CPU check. */
	unsigned int cpu_id;
	unsigned int apic_id;
	unsigned int x2apic_id;
	unsigned int flags;
	bool is_atom;
	/* Values of added MSR, perf and PMT counters with thread scope. */
	unsigned long long counter[MAX_ADDED_THREAD_COUNTERS];
	unsigned long long perf_counter[MAX_ADDED_THREAD_COUNTERS];
	unsigned long long pmt_counter[PMT_MAX_ADDED_THREAD_COUNTERS];
} *thread_even, *thread_odd;
2015 
/*
 * Counter values collected per core.
 *
 * Two sets exist, reached through core_even and core_odd; individual
 * records are located with GET_CORE().
 */
struct core_data {
	/* CPU compared against by is_cpu_first_thread_in_core(); negative matches any CPU. */
	int base_cpu;
	unsigned long long c3;
	unsigned long long c6;
	unsigned long long c7;
	unsigned long long mc6_us;	/* duplicate as per-core for now, even though per module */
	unsigned int core_temp_c;
	struct rapl_counter core_energy;	/* MSR_CORE_ENERGY_STAT */
	unsigned int core_id;
	unsigned long long core_throt_cnt;
	/* Values of added MSR, perf and PMT counters with core scope. */
	unsigned long long counter[MAX_ADDED_CORE_COUNTERS];
	unsigned long long perf_counter[MAX_ADDED_CORE_COUNTERS];
	unsigned long long pmt_counter[PMT_MAX_ADDED_CORE_COUNTERS];
} *core_even, *core_odd;
2030 
/*
 * Counter values collected per package.
 *
 * Two sets exist, reached through package_even and package_odd and
 * indexed by package number (see for_all_cpus()).
 */
struct pkg_data {
	/* CPU compared against by is_cpu_first_core_in_package(); negative matches any CPU. */
	int base_cpu;
	unsigned long long pc2;
	unsigned long long pc3;
	unsigned long long pc6;
	unsigned long long pc7;
	unsigned long long pc8;
	unsigned long long pc9;
	unsigned long long pc10;
	long long cpu_lpi;
	long long sys_lpi;
	unsigned long long pkg_wtd_core_c0;
	unsigned long long pkg_any_core_c0;
	unsigned long long pkg_any_gfxe_c0;
	unsigned long long pkg_both_core_gfxe_c0;
	long long gfx_rc6_ms;
	unsigned int gfx_mhz;
	unsigned int gfx_act_mhz;
	long long sam_mc6_ms;
	unsigned int sam_mhz;
	unsigned int sam_act_mhz;
	unsigned int package_id;
	struct rapl_counter energy_pkg;	/* MSR_PKG_ENERGY_STATUS */
	struct rapl_counter energy_dram;	/* MSR_DRAM_ENERGY_STATUS */
	struct rapl_counter energy_cores;	/* MSR_PP0_ENERGY_STATUS */
	struct rapl_counter energy_gfx;	/* MSR_PP1_ENERGY_STATUS */
	struct rapl_counter rapl_pkg_perf_status;	/* MSR_PKG_PERF_STATUS */
	struct rapl_counter rapl_dram_perf_status;	/* MSR_DRAM_PERF_STATUS */
	unsigned int pkg_temp_c;
	unsigned int uncore_mhz;
	unsigned long long die_c6;
	/* Values of added MSR, perf and PMT counters with package scope. */
	unsigned long long counter[MAX_ADDED_PACKAGE_COUNTERS];
	unsigned long long perf_counter[MAX_ADDED_PACKAGE_COUNTERS];
	unsigned long long pmt_counter[PMT_MAX_ADDED_PACKAGE_COUNTERS];
} *package_even, *package_odd;
2066 
/*
 * Argument triples (thread, core, package base pointers) selecting the
 * odd or the even counter set, e.g. for for_all_cpus().
 */
#define ODD_COUNTERS thread_odd, core_odd, package_odd
#define EVEN_COUNTERS thread_even, core_even, package_even

/*
 * Address of the thread_data record for the given (package, node, core,
 * thread) position. The records form one flat array ordered by package,
 * then node, then core, then thread, with dimensions taken from topo.
 */
#define GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no)	      \
	((thread_base) +						      \
	 ((pkg_no) *							      \
	  topo.nodes_per_pkg * topo.cores_per_node * topo.threads_per_core) + \
	 ((node_no) * topo.cores_per_node * topo.threads_per_core) +	      \
	 ((core_no) * topo.threads_per_core) +				      \
	 (thread_no))

/*
 * Address of the core_data record for the given (package, node, core)
 * position in the flat array ordered by package, then node, then core.
 */
#define GET_CORE(core_base, core_no, node_no, pkg_no)			\
	((core_base) +							\
	 ((pkg_no) *  topo.nodes_per_pkg * topo.cores_per_node) +	\
	 ((node_no) * topo.cores_per_node) +				\
	 (core_no))
2083 
/*
 * Indices of the MSRs for which an accumulated sum is maintained
 * (struct msr_sum_array.entries[]).
 *
 * Such an MSR is treated as a monotonically increasing counter; its value
 * is accumulated periodically into a sum, regardless of the bit width of
 * the register itself.
 *
 * idx_to_offset() and offset_to_idx() translate between these indices and
 * MSR offsets; IDX_COUNT is the number of entries.
 */
enum {
	IDX_PKG_ENERGY,
	IDX_DRAM_ENERGY,
	IDX_PP0_ENERGY,
	IDX_PP1_ENERGY,
	IDX_PKG_PERF,
	IDX_DRAM_PERF,
	IDX_PSYS_ENERGY,
	IDX_COUNT,
};
2099 
2100 int get_msr_sum(int cpu, off_t offset, unsigned long long *msr);
2101 
/* Accumulated MSR sums of one CPU, one entry per IDX_* index. */
struct msr_sum_array {
	/* get_msr_sum() = sum + (get_msr() - last) */
	struct {
		/* The accumulated MSR value, updated by the timer. */
		unsigned long long sum;
		/* The MSR value recorded by the last timer run. */
		unsigned long long last;
	} entries[IDX_COUNT];
};

/* The per-CPU MSR sum array. */
struct msr_sum_array *per_cpu_msr_sum;
2114 
2115 off_t idx_to_offset(int idx)
2116 {
2117 	off_t offset;
2118 
2119 	switch (idx) {
2120 	case IDX_PKG_ENERGY:
2121 		if (platform->rapl_msrs & RAPL_AMD_F17H)
2122 			offset = MSR_PKG_ENERGY_STAT;
2123 		else
2124 			offset = MSR_PKG_ENERGY_STATUS;
2125 		break;
2126 	case IDX_DRAM_ENERGY:
2127 		offset = MSR_DRAM_ENERGY_STATUS;
2128 		break;
2129 	case IDX_PP0_ENERGY:
2130 		offset = MSR_PP0_ENERGY_STATUS;
2131 		break;
2132 	case IDX_PP1_ENERGY:
2133 		offset = MSR_PP1_ENERGY_STATUS;
2134 		break;
2135 	case IDX_PKG_PERF:
2136 		offset = MSR_PKG_PERF_STATUS;
2137 		break;
2138 	case IDX_DRAM_PERF:
2139 		offset = MSR_DRAM_PERF_STATUS;
2140 		break;
2141 	case IDX_PSYS_ENERGY:
2142 		offset = MSR_PLATFORM_ENERGY_STATUS;
2143 		break;
2144 	default:
2145 		offset = -1;
2146 	}
2147 	return offset;
2148 }
2149 
2150 int offset_to_idx(off_t offset)
2151 {
2152 	int idx;
2153 
2154 	switch (offset) {
2155 	case MSR_PKG_ENERGY_STATUS:
2156 	case MSR_PKG_ENERGY_STAT:
2157 		idx = IDX_PKG_ENERGY;
2158 		break;
2159 	case MSR_DRAM_ENERGY_STATUS:
2160 		idx = IDX_DRAM_ENERGY;
2161 		break;
2162 	case MSR_PP0_ENERGY_STATUS:
2163 		idx = IDX_PP0_ENERGY;
2164 		break;
2165 	case MSR_PP1_ENERGY_STATUS:
2166 		idx = IDX_PP1_ENERGY;
2167 		break;
2168 	case MSR_PKG_PERF_STATUS:
2169 		idx = IDX_PKG_PERF;
2170 		break;
2171 	case MSR_DRAM_PERF_STATUS:
2172 		idx = IDX_DRAM_PERF;
2173 		break;
2174 	case MSR_PLATFORM_ENERGY_STATUS:
2175 		idx = IDX_PSYS_ENERGY;
2176 		break;
2177 	default:
2178 		idx = -1;
2179 	}
2180 	return idx;
2181 }
2182 
2183 int idx_valid(int idx)
2184 {
2185 	switch (idx) {
2186 	case IDX_PKG_ENERGY:
2187 		return platform->rapl_msrs & (RAPL_PKG | RAPL_AMD_F17H);
2188 	case IDX_DRAM_ENERGY:
2189 		return platform->rapl_msrs & RAPL_DRAM;
2190 	case IDX_PP0_ENERGY:
2191 		return platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS;
2192 	case IDX_PP1_ENERGY:
2193 		return platform->rapl_msrs & RAPL_GFX;
2194 	case IDX_PKG_PERF:
2195 		return platform->rapl_msrs & RAPL_PKG_PERF_STATUS;
2196 	case IDX_DRAM_PERF:
2197 		return platform->rapl_msrs & RAPL_DRAM_PERF_STATUS;
2198 	case IDX_PSYS_ENERGY:
2199 		return platform->rapl_msrs & RAPL_PSYS;
2200 	default:
2201 		return 0;
2202 	}
2203 }
2204 
/*
 * Registry of all added counters, instantiated once as 'sys'.
 * For each access method there is one linked list per scope
 * (thread, core, package).
 */
struct sys_counters {
	/* MSR added counters */
	unsigned int added_thread_counters;
	unsigned int added_core_counters;
	unsigned int added_package_counters;
	struct msr_counter *tp;
	struct msr_counter *cp;
	struct msr_counter *pp;

	/* perf added counters */
	unsigned int added_thread_perf_counters;
	unsigned int added_core_perf_counters;
	unsigned int added_package_perf_counters;
	struct perf_counter_info *perf_tp;
	struct perf_counter_info *perf_cp;
	struct perf_counter_info *perf_pp;

	/* PMT added counters */
	struct pmt_counter *pmt_tp;
	struct pmt_counter *pmt_cp;
	struct pmt_counter *pmt_pp;
} sys;
2226 
2227 static size_t free_msr_counters_(struct msr_counter **pp)
2228 {
2229 	struct msr_counter *p = NULL;
2230 	size_t num_freed = 0;
2231 
2232 	while (*pp) {
2233 		p = *pp;
2234 
2235 		if (p->msr_num != 0) {
2236 			*pp = p->next;
2237 
2238 			free(p);
2239 			++num_freed;
2240 
2241 			continue;
2242 		}
2243 
2244 		pp = &p->next;
2245 	}
2246 
2247 	return num_freed;
2248 }
2249 
2250 /*
2251  * Free all added counters accessed via msr.
2252  */
2253 static void free_sys_msr_counters(void)
2254 {
2255 	/* Thread counters */
2256 	sys.added_thread_counters -= free_msr_counters_(&sys.tp);
2257 
2258 	/* Core counters */
2259 	sys.added_core_counters -= free_msr_counters_(&sys.cp);
2260 
2261 	/* Package counters */
2262 	sys.added_package_counters -= free_msr_counters_(&sys.pp);
2263 }
2264 
/*
 * One record of each scope (thread, core, package), instantiated as the
 * global 'average'.
 * NOTE(review): presumably holds the system-wide summary values - confirm
 * where 'average' is filled in.
 */
struct system_summary {
	struct thread_data threads;
	struct core_data cores;
	struct pkg_data packages;
} average;
2270 
/* Platform-wide counters; an odd and an even instance exist. */
struct platform_counters {
	struct rapl_counter energy_psys;	/* MSR_PLATFORM_ENERGY_STATUS */
} platform_counters_odd, platform_counters_even;
2274 
/*
 * Topology information of one CPU.
 * 'cpus' is indexed by CPU number (see cpu_to_domain()).
 */
struct cpu_topology {
	int physical_package_id;
	int die_id;
	int l3_id;
	int logical_cpu_id;
	int physical_node_id;
	int logical_node_id;	/* 0-based count within the package */
	int physical_core_id;
	int thread_id;
	int type;
	cpu_set_t *put_ids;	/* Processing Unit/Thread IDs */
} *cpus;
2287 
/*
 * System-wide topology parameters, instantiated once as 'topo'.
 *
 * num_packages, nodes_per_pkg, cores_per_node and threads_per_core are the
 * dimensions used by GET_THREAD(), GET_CORE() and for_all_cpus() to lay out
 * and walk the counter arrays.
 */
struct topo_params {
	int num_packages;
	int num_die;
	int num_cpus;
	int num_cores;
	int allowed_packages;
	int allowed_cpus;
	int allowed_cores;
	int max_cpu_num;
	int max_core_id;
	int max_package_id;
	int max_die_id;
	int max_l3_id;
	int max_node_num;
	int nodes_per_pkg;
	int cores_per_node;
	int threads_per_core;
} topo;
2306 
2307 struct timeval tv_even, tv_odd, tv_delta;
2308 
2309 int *irq_column_2_cpu;		/* /proc/interrupts column numbers */
2310 int *irqs_per_cpu;		/* indexed by cpu_num */
2311 int *nmi_per_cpu;		/* indexed by cpu_num */
2312 
2313 void setup_all_buffers(bool startup);
2314 
2315 char *sys_lpi_file;
2316 char *sys_lpi_file_sysfs = "/sys/devices/system/cpu/cpuidle/low_power_idle_system_residency_us";
2317 char *sys_lpi_file_debugfs = "/sys/kernel/debug/pmc_core/slp_s0_residency_usec";
2318 
2319 int cpu_is_not_present(int cpu)
2320 {
2321 	return !CPU_ISSET_S(cpu, cpu_present_setsize, cpu_present_set);
2322 }
2323 
2324 int cpu_is_not_allowed(int cpu)
2325 {
2326 	return !CPU_ISSET_S(cpu, cpu_allowed_setsize, cpu_allowed_set);
2327 }
2328 
/*
 * run func(thread, core, package) in topology order
 * skip cpus that are not in the allowed set
 */
2333 
2334 #define PER_THREAD_PARAMS  struct thread_data *t, struct core_data *c, struct pkg_data *p
2335 
2336 int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pkg_data *),
2337 		 struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base)
2338 {
2339 	int retval, pkg_no, core_no, thread_no, node_no;
2340 
2341 	retval = 0;
2342 
2343 	for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) {
2344 		for (node_no = 0; node_no < topo.nodes_per_pkg; node_no++) {
2345 			for (core_no = 0; core_no < topo.cores_per_node; ++core_no) {
2346 				for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) {
2347 					struct thread_data *t;
2348 					struct core_data *c;
2349 
2350 					t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no);
2351 
2352 					if (cpu_is_not_allowed(t->cpu_id))
2353 						continue;
2354 
2355 					c = GET_CORE(core_base, core_no, node_no, pkg_no);
2356 
2357 					retval |= func(t, c, &pkg_base[pkg_no]);
2358 				}
2359 			}
2360 		}
2361 	}
2362 	return retval;
2363 }
2364 
2365 int is_cpu_first_thread_in_core(PER_THREAD_PARAMS)
2366 {
2367 	UNUSED(p);
2368 
2369 	return ((int)t->cpu_id == c->base_cpu || c->base_cpu < 0);
2370 }
2371 
2372 int is_cpu_first_core_in_package(PER_THREAD_PARAMS)
2373 {
2374 	UNUSED(c);
2375 
2376 	return ((int)t->cpu_id == p->base_cpu || p->base_cpu < 0);
2377 }
2378 
2379 int is_cpu_first_thread_in_package(PER_THREAD_PARAMS)
2380 {
2381 	return is_cpu_first_thread_in_core(t, c, p) && is_cpu_first_core_in_package(t, c, p);
2382 }
2383 
2384 int cpu_migrate(int cpu)
2385 {
2386 	CPU_ZERO_S(cpu_affinity_setsize, cpu_affinity_set);
2387 	CPU_SET_S(cpu, cpu_affinity_setsize, cpu_affinity_set);
2388 	if (sched_setaffinity(0, cpu_affinity_setsize, cpu_affinity_set) == -1)
2389 		return -1;
2390 	else
2391 		return 0;
2392 }
2393 
/*
 * Return a read-only file descriptor for the MSR device of the given cpu.
 *
 * Descriptors are cached in fd_percpu[]; a cached value of 0 means
 * "not opened yet". The device is /dev/msr<cpu> on Android builds and
 * /dev/cpu/<cpu>/msr otherwise.
 *
 * Does not return on failure: err() prints a hint and terminates the
 * program when the device cannot be opened.
 */
int get_msr_fd(int cpu)
{
	char pathname[32];
	int fd;

	fd = fd_percpu[cpu];

	/* Already opened earlier - reuse the cached descriptor. */
	if (fd)
		return fd;
#if defined(ANDROID)
	sprintf(pathname, "/dev/msr%d", cpu);
#else
	sprintf(pathname, "/dev/cpu/%d/msr", cpu);
#endif
	fd = open(pathname, O_RDONLY);
	if (fd < 0)
#if defined(ANDROID)
		err(-1, "%s open failed, try chown or chmod +r /dev/msr*, "
		    "or run with --no-msr, or run as root", pathname);
#else
		err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, "
		    "or run with --no-msr, or run as root", pathname);
#endif
	/* Remember the descriptor for subsequent calls. */
	fd_percpu[cpu] = fd;

	return fd;
}
2421 
/*
 * Clear the listed built-in counters in bic_enabled and free all added
 * counters that are accessed via MSR.
 * NOTE(review): presumably the listed counters are the ones that have no
 * source other than the MSR driver - confirm against their probe code.
 */
static void bic_disable_msr_access(void)
{
	CLR_BIC(BIC_Mod_c6, &bic_enabled);
	CLR_BIC(BIC_CoreTmp, &bic_enabled);
	CLR_BIC(BIC_Totl_c0, &bic_enabled);
	CLR_BIC(BIC_Any_c0, &bic_enabled);
	CLR_BIC(BIC_GFX_c0, &bic_enabled);
	CLR_BIC(BIC_CPUGFX, &bic_enabled);
	CLR_BIC(BIC_PkgTmp, &bic_enabled);

	free_sys_msr_counters();
}
2434 
2435 static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags)
2436 {
2437 	assert(!no_perf);
2438 
2439 	return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
2440 }
2441 
2442 static long open_perf_counter(int cpu, unsigned int type, unsigned int config, int group_fd, __u64 read_format)
2443 {
2444 	struct perf_event_attr attr;
2445 	const pid_t pid = -1;
2446 	const unsigned long flags = 0;
2447 
2448 	assert(!no_perf);
2449 
2450 	memset(&attr, 0, sizeof(struct perf_event_attr));
2451 
2452 	attr.type = type;
2453 	attr.size = sizeof(struct perf_event_attr);
2454 	attr.config = config;
2455 	attr.disabled = 0;
2456 	attr.sample_type = PERF_SAMPLE_IDENTIFIER;
2457 	attr.read_format = read_format;
2458 
2459 	const int fd = perf_event_open(&attr, pid, cpu, group_fd, flags);
2460 
2461 	return fd;
2462 }
2463 
2464 int get_instr_count_fd(int cpu)
2465 {
2466 	if (fd_instr_count_percpu[cpu])
2467 		return fd_instr_count_percpu[cpu];
2468 
2469 	fd_instr_count_percpu[cpu] = open_perf_counter(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0);
2470 
2471 	return fd_instr_count_percpu[cpu];
2472 }
2473 
2474 int get_msr(int cpu, off_t offset, unsigned long long *msr)
2475 {
2476 	ssize_t retval;
2477 
2478 	assert(!no_msr);
2479 
2480 	retval = pread(get_msr_fd(cpu), msr, sizeof(*msr), offset);
2481 
2482 	if (retval != sizeof *msr)
2483 		err(-1, "cpu%d: msr offset 0x%llx read failed", cpu, (unsigned long long)offset);
2484 
2485 	return 0;
2486 }
2487 
2488 int add_msr_counter(int cpu, off_t offset)
2489 {
2490 	ssize_t retval;
2491 	unsigned long long value;
2492 
2493 	if (no_msr)
2494 		return -1;
2495 
2496 	if (!offset)
2497 		return -1;
2498 
2499 	retval = pread(get_msr_fd(cpu), &value, sizeof(value), offset);
2500 
2501 	/* if the read failed, the probe fails */
2502 	if (retval != sizeof(value))
2503 		return -1;
2504 
2505 	if (value == 0)
2506 		return 0;
2507 
2508 	return 1;
2509 }
2510 
/*
 * Probe the MSR of the RAPL counter described by cai on the given cpu.
 *
 * Returns -1 when the platform does not advertise the counter's feature
 * (platform->rapl_msrs & cai->feature_mask) or when the MSR probe fails,
 * 1 otherwise.
 */
int add_rapl_msr_counter(int cpu, const struct rapl_counter_arch_info *cai)
{
	int ret;

	if (!(platform->rapl_msrs & cai->feature_mask))
		return -1;

	ret = add_msr_counter(cpu, cai->msr);
	if (ret < 0)
		return -1;

	/*
	 * NOTE(review): as written, this switch does not influence the
	 * result - an energy counter that reads as zero returns 1, exactly
	 * like the fall-through path below. Confirm whether a zero reading
	 * was meant to be treated differently.
	 */
	switch (cai->rci_index) {
	case RAPL_RCI_INDEX_ENERGY_PKG:
	case RAPL_RCI_INDEX_ENERGY_CORES:
	case RAPL_RCI_INDEX_DRAM:
	case RAPL_RCI_INDEX_GFX:
	case RAPL_RCI_INDEX_ENERGY_PLATFORM:
		if (ret == 0)
			return 1;
	}

	/* PKG,DRAM_PERF_STATUS MSRs, can return any value */
	return 1;
}
2535 
2536 /* Convert CPU ID to domain ID for given added perf counter. */
2537 unsigned int cpu_to_domain(const struct perf_counter_info *pc, int cpu)
2538 {
2539 	switch (pc->scope) {
2540 	case SCOPE_CPU:
2541 		return cpu;
2542 
2543 	case SCOPE_CORE:
2544 		return cpus[cpu].physical_core_id;
2545 
2546 	case SCOPE_PACKAGE:
2547 		return cpus[cpu].physical_package_id;
2548 	}
2549 
2550 	__builtin_unreachable();
2551 }
2552 
/*
 * Counter names that bic_lookup() did not recognize are stored here
 * (the "add" list in SHOW_LIST mode, the "skip" list otherwise);
 * the *_index variables give the number of names stored so far.
 * NOTE(review): the *_consumed counters are maintained elsewhere in the
 * file - confirm their meaning there.
 */
#define MAX_DEFERRED 16
char *deferred_add_names[MAX_DEFERRED];
char *deferred_skip_names[MAX_DEFERRED];
int deferred_add_index;
int deferred_skip_index;
unsigned int deferred_add_consumed;
unsigned int deferred_skip_consumed;
2560 
2561 /*
2562  * HIDE_LIST - hide this list of counters, show the rest [default]
2563  * SHOW_LIST - show this list of counters, hide the rest
2564  */
2565 enum show_hide_mode { SHOW_LIST, HIDE_LIST } global_show_hide_mode = HIDE_LIST;
2566 
/* Print the usage message with all command line options to outf. */
void help(void)
{
	fprintf(outf,
		"Usage: turbostat [OPTIONS][(--interval seconds) | COMMAND ...]\n"
		"\n"
		"Turbostat forks the specified COMMAND and prints statistics\n"
		"when COMMAND completes.\n"
		"If no COMMAND is specified, turbostat wakes every 5-seconds\n"
		"to print statistics, until interrupted.\n"
		"  -a, --add counter\n"
		"		add a counter\n"
		"		  eg. --add msr0x10,u64,cpu,delta,MY_TSC\n"
		"		  eg. --add perf/cstate_pkg/c2-residency,package,delta,percent,perfPC2\n"
		"		  eg. --add pmt,name=XTAL,type=raw,domain=package0,offset=0,lsb=0,msb=63,guid=0x1a067102\n"
		"  -c, --cpu cpu-set\n"
		"		limit output to summary plus cpu-set:\n"
		"		  {core | package | j,k,l..m,n-p }\n"
		"  -d, --debug\n"
		"		displays usec, Time_Of_Day_Seconds and more debugging\n"
		"		debug messages are printed to stderr\n"
		"  -D, --Dump\n"
		"		displays the raw counter values\n"
		"  -e, --enable [all | column]\n"
		"		shows all or the specified disabled column\n"
		"  -f, --force\n"
		"		force load turbostat with minimum default features on unsupported platforms.\n"
		"  -H, --hide [column | column,column,...]\n"
		"		hide the specified column(s)\n"
		"  -i, --interval sec.subsec\n"
		"		override default 5-second measurement interval\n"
		"  -J, --Joules\n"
		"		displays energy in Joules instead of Watts\n"
		"  -l, --list\n"
		"		list column headers only\n"
		"  -M, --no-msr\n"
		"		disable all uses of the MSR driver\n"
		"  -P, --no-perf\n"
		"		disable all uses of the perf API\n"
		"  -n, --num_iterations num\n"
		"		number of the measurement iterations\n"
		"  -N, --header_iterations num\n"
		"		print header every num iterations\n"
		"  -o, --out file\n"
		"		create or truncate \"file\" for all output\n"
		"  -q, --quiet\n"
		"		skip decoding system configuration header\n"
		"  -s, --show [column | column,column,...]\n"
		"		show only the specified column(s)\n"
		"  -S, --Summary\n"
		"		limits output to 1-line system summary per interval\n"
		"  -T, --TCC temperature\n"
		"		sets the Thermal Control Circuit temperature in\n"
		"		  degrees Celsius\n"
		"  -h, --help\n"
		"		print this help message\n"
		"  -v, --version\n\t\tprint version information\n\nFor more help, run \"man turbostat\"\n");
}
2624 
/*
 * bic_lookup
 * for all the strings in the comma separated name_list,
 * set the appropriate bit(s) in ret_set.
 *
 * A name may be a built-in counter name (bic[].name) or one of the group
 * names "all", "topology", "power", "idle", "swidle", "sysfs", "hwidle",
 * "frequency" and "other".
 *
 * Names that are not recognized are remembered for later: in
 * deferred_add_names[] when mode is SHOW_LIST, in deferred_skip_names[]
 * otherwise.
 *
 * name_list is modified in place (each comma is overwritten with '\0')
 * and the deferred lists keep pointers into it, so the string has to
 * stay valid for as long as those lists are used.
 */
void bic_lookup(cpu_set_t *ret_set, char *name_list, enum show_hide_mode mode)
{
	unsigned int i;

	while (name_list) {
		char *comma;

		comma = strchr(name_list, ',');

		/* Terminate the current name so it can be compared as a string. */
		if (comma)
			*comma = '\0';

		/*
		 * Every path that recognizes the name leaves the loop with
		 * break, so i == MAX_BIC afterwards means "not recognized".
		 * The group names do not depend on i; they are matched on the
		 * first pass whose bic[i].name differs from the name.
		 */
		for (i = 0; i < MAX_BIC; ++i) {
			if (!strcmp(name_list, bic[i].name)) {
				SET_BIC(i, ret_set);
				break;
			}
			if (!strcmp(name_list, "all")) {
				bic_set_all(ret_set);
				break;
			} else if (!strcmp(name_list, "topology")) {
				CPU_OR(ret_set, ret_set, &bic_group_topology);
				break;
			} else if (!strcmp(name_list, "power")) {
				CPU_OR(ret_set, ret_set, &bic_group_thermal_pwr);
				break;
			} else if (!strcmp(name_list, "idle")) {
				CPU_OR(ret_set, ret_set, &bic_group_idle);
				break;
			} else if (!strcmp(name_list, "swidle")) {
				CPU_OR(ret_set, ret_set, &bic_group_sw_idle);
				break;
			} else if (!strcmp(name_list, "sysfs")) {	/* legacy compatibility */
				CPU_OR(ret_set, ret_set, &bic_group_sw_idle);
				break;
			} else if (!strcmp(name_list, "hwidle")) {
				CPU_OR(ret_set, ret_set, &bic_group_hw_idle);
				break;
			} else if (!strcmp(name_list, "frequency")) {
				CPU_OR(ret_set, ret_set, &bic_group_frequency);
				break;
			} else if (!strcmp(name_list, "other")) {
				CPU_OR(ret_set, ret_set, &bic_group_other);
				break;
			}
		}
		if (i == MAX_BIC) {
			/*
			 * Unrecognized name: defer it.
			 * The limit is checked after the name was stored, so the
			 * program exits as soon as the MAX_DEFERRED-th name is
			 * added, i.e. at most MAX_DEFERRED - 1 names are accepted.
			 */
			if (mode == SHOW_LIST) {
				deferred_add_names[deferred_add_index++] = name_list;
				if (deferred_add_index >= MAX_DEFERRED) {
					fprintf(stderr, "More than max %d un-recognized --add options '%s'\n",
						MAX_DEFERRED, name_list);
					help();
					exit(1);
				}
			} else {
				deferred_skip_names[deferred_skip_index++] = name_list;
				if (debug)
					fprintf(stderr, "deferred \"%s\"\n", name_list);
				if (deferred_skip_index >= MAX_DEFERRED) {
					fprintf(stderr, "More than max %d un-recognized --skip options '%s'\n",
						MAX_DEFERRED, name_list);
					help();
					exit(1);
				}
			}
		}

		/* Continue after the comma, or stop when this was the last name. */
		name_list = comma;
		if (name_list)
			name_list++;

	}
}
2704 
2705 void print_header(char *delim)
2706 {
2707 	struct msr_counter *mp;
2708 	struct perf_counter_info *pp;
2709 	struct pmt_counter *ppmt;
2710 	int printed = 0;
2711 
2712 	if (DO_BIC(BIC_USEC))
2713 		outp += sprintf(outp, "%susec", (printed++ ? delim : ""));
2714 	if (DO_BIC(BIC_TOD))
2715 		outp += sprintf(outp, "%sTime_Of_Day_Seconds", (printed++ ? delim : ""));
2716 	if (DO_BIC(BIC_Package))
2717 		outp += sprintf(outp, "%sPackage", (printed++ ? delim : ""));
2718 	if (DO_BIC(BIC_Die))
2719 		outp += sprintf(outp, "%sDie", (printed++ ? delim : ""));
2720 	if (DO_BIC(BIC_L3))
2721 		outp += sprintf(outp, "%sL3", (printed++ ? delim : ""));
2722 	if (DO_BIC(BIC_Node))
2723 		outp += sprintf(outp, "%sNode", (printed++ ? delim : ""));
2724 	if (DO_BIC(BIC_Core))
2725 		outp += sprintf(outp, "%sCore", (printed++ ? delim : ""));
2726 	if (DO_BIC(BIC_CPU))
2727 		outp += sprintf(outp, "%sCPU", (printed++ ? delim : ""));
2728 	if (DO_BIC(BIC_APIC))
2729 		outp += sprintf(outp, "%sAPIC", (printed++ ? delim : ""));
2730 	if (DO_BIC(BIC_X2APIC))
2731 		outp += sprintf(outp, "%sX2APIC", (printed++ ? delim : ""));
2732 	if (DO_BIC(BIC_Avg_MHz))
2733 		outp += sprintf(outp, "%sAvg_MHz", (printed++ ? delim : ""));
2734 	if (DO_BIC(BIC_Busy))
2735 		outp += sprintf(outp, "%sBusy%%", (printed++ ? delim : ""));
2736 	if (DO_BIC(BIC_Bzy_MHz))
2737 		outp += sprintf(outp, "%sBzy_MHz", (printed++ ? delim : ""));
2738 	if (DO_BIC(BIC_TSC_MHz))
2739 		outp += sprintf(outp, "%sTSC_MHz", (printed++ ? delim : ""));
2740 
2741 	if (DO_BIC(BIC_IPC))
2742 		outp += sprintf(outp, "%sIPC", (printed++ ? delim : ""));
2743 
2744 	if (DO_BIC(BIC_IRQ)) {
2745 		if (sums_need_wide_columns)
2746 			outp += sprintf(outp, "%s     IRQ", (printed++ ? delim : ""));
2747 		else
2748 			outp += sprintf(outp, "%sIRQ", (printed++ ? delim : ""));
2749 	}
2750 	if (DO_BIC(BIC_NMI)) {
2751 		if (sums_need_wide_columns)
2752 			outp += sprintf(outp, "%s     NMI", (printed++ ? delim : ""));
2753 		else
2754 			outp += sprintf(outp, "%sNMI", (printed++ ? delim : ""));
2755 	}
2756 
2757 	if (DO_BIC(BIC_SMI))
2758 		outp += sprintf(outp, "%sSMI", (printed++ ? delim : ""));
2759 
2760 	for (mp = sys.tp; mp; mp = mp->next) {
2761 
2762 		if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) {
2763 			if (mp->width == 64)
2764 				outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), mp->name);
2765 			else
2766 				outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), mp->name);
2767 		} else {
2768 			if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
2769 				outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), mp->name);
2770 			else
2771 				outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), mp->name);
2772 		}
2773 	}
2774 
2775 	for (pp = sys.perf_tp; pp; pp = pp->next) {
2776 
2777 		if (pp->format == FORMAT_RAW) {
2778 			if (pp->width == 64)
2779 				outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name);
2780 			else
2781 				outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name);
2782 		} else {
2783 			if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
2784 				outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name);
2785 			else
2786 				outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name);
2787 		}
2788 	}
2789 
2790 	ppmt = sys.pmt_tp;
2791 	while (ppmt) {
2792 		switch (ppmt->type) {
2793 		case PMT_TYPE_RAW:
2794 			if (pmt_counter_get_width(ppmt) <= 32)
2795 				outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name);
2796 			else
2797 				outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name);
2798 
2799 			break;
2800 
2801 		case PMT_TYPE_XTAL_TIME:
2802 		case PMT_TYPE_TCORE_CLOCK:
2803 			outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), ppmt->name);
2804 			break;
2805 		}
2806 
2807 		ppmt = ppmt->next;
2808 	}
2809 
2810 	if (DO_BIC(BIC_CPU_c1))
2811 		outp += sprintf(outp, "%sCPU%%c1", (printed++ ? delim : ""));
2812 	if (DO_BIC(BIC_CPU_c3))
2813 		outp += sprintf(outp, "%sCPU%%c3", (printed++ ? delim : ""));
2814 	if (DO_BIC(BIC_CPU_c6))
2815 		outp += sprintf(outp, "%sCPU%%c6", (printed++ ? delim : ""));
2816 	if (DO_BIC(BIC_CPU_c7))
2817 		outp += sprintf(outp, "%sCPU%%c7", (printed++ ? delim : ""));
2818 
2819 	if (DO_BIC(BIC_Mod_c6))
2820 		outp += sprintf(outp, "%sMod%%c6", (printed++ ? delim : ""));
2821 
2822 	if (DO_BIC(BIC_CoreTmp))
2823 		outp += sprintf(outp, "%sCoreTmp", (printed++ ? delim : ""));
2824 
2825 	if (DO_BIC(BIC_CORE_THROT_CNT))
2826 		outp += sprintf(outp, "%sCoreThr", (printed++ ? delim : ""));
2827 
2828 	if (platform->rapl_msrs && !rapl_joules) {
2829 		if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl)
2830 			outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : ""));
2831 	} else if (platform->rapl_msrs && rapl_joules) {
2832 		if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl)
2833 			outp += sprintf(outp, "%sCor_J", (printed++ ? delim : ""));
2834 	}
2835 
2836 	for (mp = sys.cp; mp; mp = mp->next) {
2837 		if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) {
2838 			if (mp->width == 64)
2839 				outp += sprintf(outp, "%s%18.18s", delim, mp->name);
2840 			else
2841 				outp += sprintf(outp, "%s%10.10s", delim, mp->name);
2842 		} else {
2843 			if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
2844 				outp += sprintf(outp, "%s%8s", delim, mp->name);
2845 			else
2846 				outp += sprintf(outp, "%s%s", delim, mp->name);
2847 		}
2848 	}
2849 
2850 	for (pp = sys.perf_cp; pp; pp = pp->next) {
2851 
2852 		if (pp->format == FORMAT_RAW) {
2853 			if (pp->width == 64)
2854 				outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name);
2855 			else
2856 				outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name);
2857 		} else {
2858 			if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
2859 				outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name);
2860 			else
2861 				outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name);
2862 		}
2863 	}
2864 
2865 	ppmt = sys.pmt_cp;
2866 	while (ppmt) {
2867 		switch (ppmt->type) {
2868 		case PMT_TYPE_RAW:
2869 			if (pmt_counter_get_width(ppmt) <= 32)
2870 				outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name);
2871 			else
2872 				outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name);
2873 
2874 			break;
2875 
2876 		case PMT_TYPE_XTAL_TIME:
2877 		case PMT_TYPE_TCORE_CLOCK:
2878 			outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), ppmt->name);
2879 			break;
2880 		}
2881 
2882 		ppmt = ppmt->next;
2883 	}
2884 
2885 	if (DO_BIC(BIC_PkgTmp))
2886 		outp += sprintf(outp, "%sPkgTmp", (printed++ ? delim : ""));
2887 
2888 	if (DO_BIC(BIC_GFX_rc6))
2889 		outp += sprintf(outp, "%sGFX%%rc6", (printed++ ? delim : ""));
2890 
2891 	if (DO_BIC(BIC_GFXMHz))
2892 		outp += sprintf(outp, "%sGFXMHz", (printed++ ? delim : ""));
2893 
2894 	if (DO_BIC(BIC_GFXACTMHz))
2895 		outp += sprintf(outp, "%sGFXAMHz", (printed++ ? delim : ""));
2896 
2897 	if (DO_BIC(BIC_SAM_mc6))
2898 		outp += sprintf(outp, "%sSAM%%mc6", (printed++ ? delim : ""));
2899 
2900 	if (DO_BIC(BIC_SAMMHz))
2901 		outp += sprintf(outp, "%sSAMMHz", (printed++ ? delim : ""));
2902 
2903 	if (DO_BIC(BIC_SAMACTMHz))
2904 		outp += sprintf(outp, "%sSAMAMHz", (printed++ ? delim : ""));
2905 
2906 	if (DO_BIC(BIC_Totl_c0))
2907 		outp += sprintf(outp, "%sTotl%%C0", (printed++ ? delim : ""));
2908 	if (DO_BIC(BIC_Any_c0))
2909 		outp += sprintf(outp, "%sAny%%C0", (printed++ ? delim : ""));
2910 	if (DO_BIC(BIC_GFX_c0))
2911 		outp += sprintf(outp, "%sGFX%%C0", (printed++ ? delim : ""));
2912 	if (DO_BIC(BIC_CPUGFX))
2913 		outp += sprintf(outp, "%sCPUGFX%%", (printed++ ? delim : ""));
2914 
2915 	if (DO_BIC(BIC_Pkgpc2))
2916 		outp += sprintf(outp, "%sPkg%%pc2", (printed++ ? delim : ""));
2917 	if (DO_BIC(BIC_Pkgpc3))
2918 		outp += sprintf(outp, "%sPkg%%pc3", (printed++ ? delim : ""));
2919 	if (DO_BIC(BIC_Pkgpc6))
2920 		outp += sprintf(outp, "%sPkg%%pc6", (printed++ ? delim : ""));
2921 	if (DO_BIC(BIC_Pkgpc7))
2922 		outp += sprintf(outp, "%sPkg%%pc7", (printed++ ? delim : ""));
2923 	if (DO_BIC(BIC_Pkgpc8))
2924 		outp += sprintf(outp, "%sPkg%%pc8", (printed++ ? delim : ""));
2925 	if (DO_BIC(BIC_Pkgpc9))
2926 		outp += sprintf(outp, "%sPkg%%pc9", (printed++ ? delim : ""));
2927 	if (DO_BIC(BIC_Pkgpc10))
2928 		outp += sprintf(outp, "%sPk%%pc10", (printed++ ? delim : ""));
2929 	if (DO_BIC(BIC_Diec6))
2930 		outp += sprintf(outp, "%sDie%%c6", (printed++ ? delim : ""));
2931 	if (DO_BIC(BIC_CPU_LPI))
2932 		outp += sprintf(outp, "%sCPU%%LPI", (printed++ ? delim : ""));
2933 	if (DO_BIC(BIC_SYS_LPI))
2934 		outp += sprintf(outp, "%sSYS%%LPI", (printed++ ? delim : ""));
2935 
2936 	if (!rapl_joules) {
2937 		if (DO_BIC(BIC_PkgWatt))
2938 			outp += sprintf(outp, "%sPkgWatt", (printed++ ? delim : ""));
2939 		if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl)
2940 			outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : ""));
2941 		if (DO_BIC(BIC_GFXWatt))
2942 			outp += sprintf(outp, "%sGFXWatt", (printed++ ? delim : ""));
2943 		if (DO_BIC(BIC_RAMWatt))
2944 			outp += sprintf(outp, "%sRAMWatt", (printed++ ? delim : ""));
2945 		if (DO_BIC(BIC_PKG__))
2946 			outp += sprintf(outp, "%sPKG_%%", (printed++ ? delim : ""));
2947 		if (DO_BIC(BIC_RAM__))
2948 			outp += sprintf(outp, "%sRAM_%%", (printed++ ? delim : ""));
2949 	} else {
2950 		if (DO_BIC(BIC_Pkg_J))
2951 			outp += sprintf(outp, "%sPkg_J", (printed++ ? delim : ""));
2952 		if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl)
2953 			outp += sprintf(outp, "%sCor_J", (printed++ ? delim : ""));
2954 		if (DO_BIC(BIC_GFX_J))
2955 			outp += sprintf(outp, "%sGFX_J", (printed++ ? delim : ""));
2956 		if (DO_BIC(BIC_RAM_J))
2957 			outp += sprintf(outp, "%sRAM_J", (printed++ ? delim : ""));
2958 		if (DO_BIC(BIC_PKG__))
2959 			outp += sprintf(outp, "%sPKG_%%", (printed++ ? delim : ""));
2960 		if (DO_BIC(BIC_RAM__))
2961 			outp += sprintf(outp, "%sRAM_%%", (printed++ ? delim : ""));
2962 	}
2963 	if (DO_BIC(BIC_UNCORE_MHZ))
2964 		outp += sprintf(outp, "%sUncMHz", (printed++ ? delim : ""));
2965 
2966 	for (mp = sys.pp; mp; mp = mp->next) {
2967 		if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) {
2968 			if (mp->width == 64)
2969 				outp += sprintf(outp, "%s%18.18s", delim, mp->name);
2970 			else if (mp->width == 32)
2971 				outp += sprintf(outp, "%s%10.10s", delim, mp->name);
2972 			else
2973 				outp += sprintf(outp, "%s%7.7s", delim, mp->name);
2974 		} else {
2975 			if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
2976 				outp += sprintf(outp, "%s%8s", delim, mp->name);
2977 			else
2978 				outp += sprintf(outp, "%s%7.7s", delim, mp->name);
2979 		}
2980 	}
2981 
2982 	for (pp = sys.perf_pp; pp; pp = pp->next) {
2983 
2984 		if (pp->format == FORMAT_RAW) {
2985 			if (pp->width == 64)
2986 				outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name);
2987 			else
2988 				outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name);
2989 		} else {
2990 			if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
2991 				outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name);
2992 			else
2993 				outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name);
2994 		}
2995 	}
2996 
2997 	ppmt = sys.pmt_pp;
2998 	while (ppmt) {
2999 		switch (ppmt->type) {
3000 		case PMT_TYPE_RAW:
3001 			if (pmt_counter_get_width(ppmt) <= 32)
3002 				outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name);
3003 			else
3004 				outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name);
3005 
3006 			break;
3007 
3008 		case PMT_TYPE_XTAL_TIME:
3009 		case PMT_TYPE_TCORE_CLOCK:
3010 			outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), ppmt->name);
3011 			break;
3012 		}
3013 
3014 		ppmt = ppmt->next;
3015 	}
3016 
3017 	if (DO_BIC(BIC_SysWatt))
3018 		outp += sprintf(outp, "%sSysWatt", (printed++ ? delim : ""));
3019 	if (DO_BIC(BIC_Sys_J))
3020 		outp += sprintf(outp, "%sSys_J", (printed++ ? delim : ""));
3021 
3022 	outp += sprintf(outp, "\n");
3023 }
3024 
3025 int dump_counters(PER_THREAD_PARAMS)
3026 {
3027 	int i;
3028 	struct msr_counter *mp;
3029 	struct platform_counters *pplat_cnt = p == package_odd ? &platform_counters_odd : &platform_counters_even;
3030 
3031 	outp += sprintf(outp, "t %p, c %p, p %p\n", t, c, p);
3032 
3033 	if (t) {
3034 		outp += sprintf(outp, "CPU: %d flags 0x%x\n", t->cpu_id, t->flags);
3035 		outp += sprintf(outp, "TSC: %016llX\n", t->tsc);
3036 		outp += sprintf(outp, "aperf: %016llX\n", t->aperf);
3037 		outp += sprintf(outp, "mperf: %016llX\n", t->mperf);
3038 		outp += sprintf(outp, "c1: %016llX\n", t->c1);
3039 
3040 		if (DO_BIC(BIC_IPC))
3041 			outp += sprintf(outp, "IPC: %lld\n", t->instr_count);
3042 
3043 		if (DO_BIC(BIC_IRQ))
3044 			outp += sprintf(outp, "IRQ: %lld\n", t->irq_count);
3045 		if (DO_BIC(BIC_NMI))
3046 			outp += sprintf(outp, "IRQ: %lld\n", t->nmi_count);
3047 		if (DO_BIC(BIC_SMI))
3048 			outp += sprintf(outp, "SMI: %d\n", t->smi_count);
3049 
3050 		for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
3051 			outp +=
3052 			    sprintf(outp, "tADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num,
3053 				    t->counter[i], mp->sp->path);
3054 		}
3055 	}
3056 
3057 	if (c && is_cpu_first_thread_in_core(t, c, p)) {
3058 		outp += sprintf(outp, "core: %d\n", c->core_id);
3059 		outp += sprintf(outp, "c3: %016llX\n", c->c3);
3060 		outp += sprintf(outp, "c6: %016llX\n", c->c6);
3061 		outp += sprintf(outp, "c7: %016llX\n", c->c7);
3062 		outp += sprintf(outp, "DTS: %dC\n", c->core_temp_c);
3063 		outp += sprintf(outp, "cpu_throt_count: %016llX\n", c->core_throt_cnt);
3064 
3065 		const unsigned long long energy_value = c->core_energy.raw_value * c->core_energy.scale;
3066 		const double energy_scale = c->core_energy.scale;
3067 
3068 		if (c->core_energy.unit == RAPL_UNIT_JOULES)
3069 			outp += sprintf(outp, "Joules: %0llX (scale: %lf)\n", energy_value, energy_scale);
3070 
3071 		for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
3072 			outp +=
3073 			    sprintf(outp, "cADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num,
3074 				    c->counter[i], mp->sp->path);
3075 		}
3076 		outp += sprintf(outp, "mc6_us: %016llX\n", c->mc6_us);
3077 	}
3078 
3079 	if (p && is_cpu_first_core_in_package(t, c, p)) {
3080 		outp += sprintf(outp, "package: %d\n", p->package_id);
3081 
3082 		outp += sprintf(outp, "Weighted cores: %016llX\n", p->pkg_wtd_core_c0);
3083 		outp += sprintf(outp, "Any cores: %016llX\n", p->pkg_any_core_c0);
3084 		outp += sprintf(outp, "Any GFX: %016llX\n", p->pkg_any_gfxe_c0);
3085 		outp += sprintf(outp, "CPU + GFX: %016llX\n", p->pkg_both_core_gfxe_c0);
3086 
3087 		outp += sprintf(outp, "pc2: %016llX\n", p->pc2);
3088 		if (DO_BIC(BIC_Pkgpc3))
3089 			outp += sprintf(outp, "pc3: %016llX\n", p->pc3);
3090 		if (DO_BIC(BIC_Pkgpc6))
3091 			outp += sprintf(outp, "pc6: %016llX\n", p->pc6);
3092 		if (DO_BIC(BIC_Pkgpc7))
3093 			outp += sprintf(outp, "pc7: %016llX\n", p->pc7);
3094 		outp += sprintf(outp, "pc8: %016llX\n", p->pc8);
3095 		outp += sprintf(outp, "pc9: %016llX\n", p->pc9);
3096 		outp += sprintf(outp, "pc10: %016llX\n", p->pc10);
3097 		outp += sprintf(outp, "cpu_lpi: %016llX\n", p->cpu_lpi);
3098 		outp += sprintf(outp, "sys_lpi: %016llX\n", p->sys_lpi);
3099 		outp += sprintf(outp, "Joules PKG: %0llX\n", p->energy_pkg.raw_value);
3100 		outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores.raw_value);
3101 		outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx.raw_value);
3102 		outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram.raw_value);
3103 		outp += sprintf(outp, "Joules PSYS: %0llX\n", pplat_cnt->energy_psys.raw_value);
3104 		outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status.raw_value);
3105 		outp += sprintf(outp, "Throttle RAM: %0llX\n", p->rapl_dram_perf_status.raw_value);
3106 		outp += sprintf(outp, "PTM: %dC\n", p->pkg_temp_c);
3107 
3108 		for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
3109 			outp +=
3110 			    sprintf(outp, "pADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num,
3111 				    p->counter[i], mp->sp->path);
3112 		}
3113 	}
3114 
3115 	outp += sprintf(outp, "\n");
3116 
3117 	return 0;
3118 }
3119 
3120 double rapl_counter_get_value(const struct rapl_counter *c, enum rapl_unit desired_unit, double interval)
3121 {
3122 	assert(desired_unit != RAPL_UNIT_INVALID);
3123 
3124 	/*
3125 	 * For now we don't expect anything other than joules,
3126 	 * so just simplify the logic.
3127 	 */
3128 	assert(c->unit == RAPL_UNIT_JOULES);
3129 
3130 	const double scaled = c->raw_value * c->scale;
3131 
3132 	if (desired_unit == RAPL_UNIT_WATTS)
3133 		return scaled / interval;
3134 	return scaled;
3135 }
3136 
3137 /*
3138  * column formatting convention & formats
3139  */
3140 int format_counters(PER_THREAD_PARAMS)
3141 {
3142 	static int count;
3143 
3144 	struct platform_counters *pplat_cnt = NULL;
3145 	double interval_float, tsc;
3146 	char *fmt8;
3147 	int i;
3148 	struct msr_counter *mp;
3149 	struct perf_counter_info *pp;
3150 	struct pmt_counter *ppmt;
3151 	char *delim = "\t";
3152 	int printed = 0;
3153 
3154 	if (t == &average.threads) {
3155 		pplat_cnt = count & 1 ? &platform_counters_odd : &platform_counters_even;
3156 		++count;
3157 	}
3158 
3159 	/* if showing only 1st thread in core and this isn't one, bail out */
3160 	if (show_core_only && !is_cpu_first_thread_in_core(t, c, p))
3161 		return 0;
3162 
3163 	/* if showing only 1st thread in pkg and this isn't one, bail out */
3164 	if (show_pkg_only && !is_cpu_first_core_in_package(t, c, p))
3165 		return 0;
3166 
3167 	/*if not summary line and --cpu is used */
3168 	if ((t != &average.threads) && (cpu_subset && !CPU_ISSET_S(t->cpu_id, cpu_subset_size, cpu_subset)))
3169 		return 0;
3170 
3171 	if (DO_BIC(BIC_USEC)) {
3172 		/* on each row, print how many usec each timestamp took to gather */
3173 		struct timeval tv;
3174 
3175 		timersub(&t->tv_end, &t->tv_begin, &tv);
3176 		outp += sprintf(outp, "%5ld\t", tv.tv_sec * 1000000 + tv.tv_usec);
3177 	}
3178 
3179 	/* Time_Of_Day_Seconds: on each row, print sec.usec last timestamp taken */
3180 	if (DO_BIC(BIC_TOD))
3181 		outp += sprintf(outp, "%10ld.%06ld\t", t->tv_end.tv_sec, t->tv_end.tv_usec);
3182 
3183 	interval_float = t->tv_delta.tv_sec + t->tv_delta.tv_usec / 1000000.0;
3184 
3185 	tsc = t->tsc * tsc_tweak;
3186 
3187 	/* topo columns, print blanks on 1st (average) line */
3188 	if (t == &average.threads) {
3189 		if (DO_BIC(BIC_Package))
3190 			outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
3191 		if (DO_BIC(BIC_Die))
3192 			outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
3193 		if (DO_BIC(BIC_L3))
3194 			outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
3195 		if (DO_BIC(BIC_Node))
3196 			outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
3197 		if (DO_BIC(BIC_Core))
3198 			outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
3199 		if (DO_BIC(BIC_CPU))
3200 			outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
3201 		if (DO_BIC(BIC_APIC))
3202 			outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
3203 		if (DO_BIC(BIC_X2APIC))
3204 			outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
3205 	} else {
3206 		if (DO_BIC(BIC_Package)) {
3207 			if (p)
3208 				outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->package_id);
3209 			else
3210 				outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
3211 		}
3212 		if (DO_BIC(BIC_Die)) {
3213 			if (c)
3214 				outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), cpus[t->cpu_id].die_id);
3215 			else
3216 				outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
3217 		}
3218 		if (DO_BIC(BIC_L3)) {
3219 			if (c)
3220 				outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), cpus[t->cpu_id].l3_id);
3221 			else
3222 				outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
3223 		}
3224 		if (DO_BIC(BIC_Node)) {
3225 			if (t)
3226 				outp += sprintf(outp, "%s%d",
3227 						(printed++ ? delim : ""), cpus[t->cpu_id].physical_node_id);
3228 			else
3229 				outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
3230 		}
3231 		if (DO_BIC(BIC_Core)) {
3232 			if (c)
3233 				outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), c->core_id);
3234 			else
3235 				outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
3236 		}
3237 		if (DO_BIC(BIC_CPU))
3238 			outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->cpu_id);
3239 		if (DO_BIC(BIC_APIC))
3240 			outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->apic_id);
3241 		if (DO_BIC(BIC_X2APIC))
3242 			outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->x2apic_id);
3243 	}
3244 
3245 	if (DO_BIC(BIC_Avg_MHz))
3246 		outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 / units * t->aperf / interval_float);
3247 
3248 	if (DO_BIC(BIC_Busy))
3249 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->mperf / tsc);
3250 
3251 	if (DO_BIC(BIC_Bzy_MHz)) {
3252 		if (has_base_hz)
3253 			outp +=
3254 			    sprintf(outp, "%s%.0f", (printed++ ? delim : ""), base_hz / units * t->aperf / t->mperf);
3255 		else
3256 			outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""),
3257 					tsc / units * t->aperf / t->mperf / interval_float);
3258 	}
3259 
3260 	if (DO_BIC(BIC_TSC_MHz))
3261 		outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 * t->tsc / units / interval_float);
3262 
3263 	if (DO_BIC(BIC_IPC))
3264 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 1.0 * t->instr_count / t->aperf);
3265 
3266 	/* IRQ */
3267 	if (DO_BIC(BIC_IRQ)) {
3268 		if (sums_need_wide_columns)
3269 			outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), t->irq_count);
3270 		else
3271 			outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->irq_count);
3272 	}
3273 
3274 	/* NMI */
3275 	if (DO_BIC(BIC_NMI)) {
3276 		if (sums_need_wide_columns)
3277 			outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), t->nmi_count);
3278 		else
3279 			outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->nmi_count);
3280 	}
3281 
3282 	/* SMI */
3283 	if (DO_BIC(BIC_SMI))
3284 		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->smi_count);
3285 
3286 	/* Added counters */
3287 	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
3288 		if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) {
3289 			if (mp->width == 32)
3290 				outp +=
3291 				    sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)t->counter[i]);
3292 			else
3293 				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->counter[i]);
3294 		} else if (mp->format == FORMAT_DELTA) {
3295 			if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
3296 				outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), t->counter[i]);
3297 			else
3298 				outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->counter[i]);
3299 		} else if (mp->format == FORMAT_PERCENT) {
3300 			if (mp->type == COUNTER_USEC)
3301 				outp +=
3302 				    sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
3303 					    t->counter[i] / interval_float / 10000);
3304 			else
3305 				outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->counter[i] / tsc);
3306 		}
3307 	}
3308 
3309 	/* Added perf counters */
3310 	for (i = 0, pp = sys.perf_tp; pp; ++i, pp = pp->next) {
3311 		if (pp->format == FORMAT_RAW) {
3312 			if (pp->width == 32)
3313 				outp +=
3314 				    sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
3315 					    (unsigned int)t->perf_counter[i]);
3316 			else
3317 				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->perf_counter[i]);
3318 		} else if (pp->format == FORMAT_DELTA) {
3319 			if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
3320 				outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), t->perf_counter[i]);
3321 			else
3322 				outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->perf_counter[i]);
3323 		} else if (pp->format == FORMAT_PERCENT) {
3324 			if (pp->type == COUNTER_USEC)
3325 				outp +=
3326 				    sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
3327 					    t->perf_counter[i] / interval_float / 10000);
3328 			else
3329 				outp +=
3330 				    sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->perf_counter[i] / tsc);
3331 		}
3332 	}
3333 
3334 	for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) {
3335 		const unsigned long value_raw = t->pmt_counter[i];
3336 		double value_converted;
3337 		switch (ppmt->type) {
3338 		case PMT_TYPE_RAW:
3339 			if (pmt_counter_get_width(ppmt) <= 32)
3340 				outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
3341 						(unsigned int)t->pmt_counter[i]);
3342 			else
3343 				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->pmt_counter[i]);
3344 
3345 			break;
3346 
3347 		case PMT_TYPE_XTAL_TIME:
3348 			value_converted = 100.0 * value_raw / crystal_hz / interval_float;
3349 			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted);
3350 			break;
3351 
3352 		case PMT_TYPE_TCORE_CLOCK:
3353 			value_converted = 100.0 * value_raw / tcore_clock_freq_hz / interval_float;
3354 			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted);
3355 		}
3356 	}
3357 
3358 	/* C1 */
3359 	if (DO_BIC(BIC_CPU_c1))
3360 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1 / tsc);
3361 
3362 	/* print per-core data only for 1st thread in core */
3363 	if (!is_cpu_first_thread_in_core(t, c, p))
3364 		goto done;
3365 
3366 	if (DO_BIC(BIC_CPU_c3))
3367 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c3 / tsc);
3368 	if (DO_BIC(BIC_CPU_c6))
3369 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c6 / tsc);
3370 	if (DO_BIC(BIC_CPU_c7))
3371 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c7 / tsc);
3372 
3373 	/* Mod%c6 */
3374 	if (DO_BIC(BIC_Mod_c6))
3375 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->mc6_us / tsc);
3376 
3377 	if (DO_BIC(BIC_CoreTmp))
3378 		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), c->core_temp_c);
3379 
3380 	/* Core throttle count */
3381 	if (DO_BIC(BIC_CORE_THROT_CNT))
3382 		outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->core_throt_cnt);
3383 
3384 	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
3385 		if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) {
3386 			if (mp->width == 32)
3387 				outp +=
3388 				    sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)c->counter[i]);
3389 			else
3390 				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->counter[i]);
3391 		} else if (mp->format == FORMAT_DELTA) {
3392 			if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
3393 				outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), c->counter[i]);
3394 			else
3395 				outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->counter[i]);
3396 		} else if (mp->format == FORMAT_PERCENT) {
3397 			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->counter[i] / tsc);
3398 		}
3399 	}
3400 
3401 	for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) {
3402 		if (pp->format == FORMAT_RAW) {
3403 			if (pp->width == 32)
3404 				outp +=
3405 				    sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
3406 					    (unsigned int)c->perf_counter[i]);
3407 			else
3408 				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->perf_counter[i]);
3409 		} else if (pp->format == FORMAT_DELTA) {
3410 			if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
3411 				outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), c->perf_counter[i]);
3412 			else
3413 				outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->perf_counter[i]);
3414 		} else if (pp->format == FORMAT_PERCENT) {
3415 			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->perf_counter[i] / tsc);
3416 		}
3417 	}
3418 
3419 	for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) {
3420 		const unsigned long value_raw = c->pmt_counter[i];
3421 		double value_converted;
3422 		switch (ppmt->type) {
3423 		case PMT_TYPE_RAW:
3424 			if (pmt_counter_get_width(ppmt) <= 32)
3425 				outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
3426 						(unsigned int)c->pmt_counter[i]);
3427 			else
3428 				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->pmt_counter[i]);
3429 
3430 			break;
3431 
3432 		case PMT_TYPE_XTAL_TIME:
3433 			value_converted = 100.0 * value_raw / crystal_hz / interval_float;
3434 			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted);
3435 			break;
3436 
3437 		case PMT_TYPE_TCORE_CLOCK:
3438 			value_converted = 100.0 * value_raw / tcore_clock_freq_hz / interval_float;
3439 			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted);
3440 		}
3441 	}
3442 
3443 	fmt8 = "%s%.2f";
3444 
3445 	if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl)
3446 		outp +=
3447 		    sprintf(outp, fmt8, (printed++ ? delim : ""),
3448 			    rapl_counter_get_value(&c->core_energy, RAPL_UNIT_WATTS, interval_float));
3449 	if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl)
3450 		outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
3451 				rapl_counter_get_value(&c->core_energy, RAPL_UNIT_JOULES, interval_float));
3452 
3453 	/* print per-package data only for 1st core in package */
3454 	if (!is_cpu_first_core_in_package(t, c, p))
3455 		goto done;
3456 
3457 	/* PkgTmp */
3458 	if (DO_BIC(BIC_PkgTmp))
3459 		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->pkg_temp_c);
3460 
3461 	/* GFXrc6 */
3462 	if (DO_BIC(BIC_GFX_rc6)) {
3463 		if (p->gfx_rc6_ms == -1) {	/* detect GFX counter reset */
3464 			outp += sprintf(outp, "%s**.**", (printed++ ? delim : ""));
3465 		} else {
3466 			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
3467 					p->gfx_rc6_ms / 10.0 / interval_float);
3468 		}
3469 	}
3470 
3471 	/* GFXMHz */
3472 	if (DO_BIC(BIC_GFXMHz))
3473 		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->gfx_mhz);
3474 
3475 	/* GFXACTMHz */
3476 	if (DO_BIC(BIC_GFXACTMHz))
3477 		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->gfx_act_mhz);
3478 
3479 	/* SAMmc6 */
3480 	if (DO_BIC(BIC_SAM_mc6)) {
3481 		if (p->sam_mc6_ms == -1) {	/* detect GFX counter reset */
3482 			outp += sprintf(outp, "%s**.**", (printed++ ? delim : ""));
3483 		} else {
3484 			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
3485 					p->sam_mc6_ms / 10.0 / interval_float);
3486 		}
3487 	}
3488 
3489 	/* SAMMHz */
3490 	if (DO_BIC(BIC_SAMMHz))
3491 		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->sam_mhz);
3492 
3493 	/* SAMACTMHz */
3494 	if (DO_BIC(BIC_SAMACTMHz))
3495 		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->sam_act_mhz);
3496 
3497 	/* Totl%C0, Any%C0 GFX%C0 CPUGFX% */
3498 	if (DO_BIC(BIC_Totl_c0))
3499 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_wtd_core_c0 / tsc);
3500 	if (DO_BIC(BIC_Any_c0))
3501 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_core_c0 / tsc);
3502 	if (DO_BIC(BIC_GFX_c0))
3503 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_gfxe_c0 / tsc);
3504 	if (DO_BIC(BIC_CPUGFX))
3505 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_both_core_gfxe_c0 / tsc);
3506 
3507 	if (DO_BIC(BIC_Pkgpc2))
3508 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc2 / tsc);
3509 	if (DO_BIC(BIC_Pkgpc3))
3510 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc3 / tsc);
3511 	if (DO_BIC(BIC_Pkgpc6))
3512 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc6 / tsc);
3513 	if (DO_BIC(BIC_Pkgpc7))
3514 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc7 / tsc);
3515 	if (DO_BIC(BIC_Pkgpc8))
3516 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc8 / tsc);
3517 	if (DO_BIC(BIC_Pkgpc9))
3518 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc9 / tsc);
3519 	if (DO_BIC(BIC_Pkgpc10))
3520 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc);
3521 
3522 	if (DO_BIC(BIC_Diec6))
3523 		outp +=
3524 		    sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->die_c6 / crystal_hz / interval_float);
3525 
3526 	if (DO_BIC(BIC_CPU_LPI)) {
3527 		if (p->cpu_lpi >= 0)
3528 			outp +=
3529 			    sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
3530 				    100.0 * p->cpu_lpi / 1000000.0 / interval_float);
3531 		else
3532 			outp += sprintf(outp, "%s(neg)", (printed++ ? delim : ""));
3533 	}
3534 	if (DO_BIC(BIC_SYS_LPI)) {
3535 		if (p->sys_lpi >= 0)
3536 			outp +=
3537 			    sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
3538 				    100.0 * p->sys_lpi / 1000000.0 / interval_float);
3539 		else
3540 			outp += sprintf(outp, "%s(neg)", (printed++ ? delim : ""));
3541 	}
3542 
3543 	if (DO_BIC(BIC_PkgWatt))
3544 		outp +=
3545 		    sprintf(outp, fmt8, (printed++ ? delim : ""),
3546 			    rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_WATTS, interval_float));
3547 	if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl)
3548 		outp +=
3549 		    sprintf(outp, fmt8, (printed++ ? delim : ""),
3550 			    rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_WATTS, interval_float));
3551 	if (DO_BIC(BIC_GFXWatt))
3552 		outp +=
3553 		    sprintf(outp, fmt8, (printed++ ? delim : ""),
3554 			    rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_WATTS, interval_float));
3555 	if (DO_BIC(BIC_RAMWatt))
3556 		outp +=
3557 		    sprintf(outp, fmt8, (printed++ ? delim : ""),
3558 			    rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_WATTS, interval_float));
3559 	if (DO_BIC(BIC_Pkg_J))
3560 		outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
3561 				rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_JOULES, interval_float));
3562 	if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl)
3563 		outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
3564 				rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_JOULES, interval_float));
3565 	if (DO_BIC(BIC_GFX_J))
3566 		outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
3567 				rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_JOULES, interval_float));
3568 	if (DO_BIC(BIC_RAM_J))
3569 		outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
3570 				rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_JOULES, interval_float));
3571 	if (DO_BIC(BIC_PKG__))
3572 		outp +=
3573 		    sprintf(outp, fmt8, (printed++ ? delim : ""),
3574 			    rapl_counter_get_value(&p->rapl_pkg_perf_status, RAPL_UNIT_WATTS, interval_float));
3575 	if (DO_BIC(BIC_RAM__))
3576 		outp +=
3577 		    sprintf(outp, fmt8, (printed++ ? delim : ""),
3578 			    rapl_counter_get_value(&p->rapl_dram_perf_status, RAPL_UNIT_WATTS, interval_float));
3579 	/* UncMHz */
3580 	if (DO_BIC(BIC_UNCORE_MHZ))
3581 		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->uncore_mhz);
3582 
3583 	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
3584 		if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) {
3585 			if (mp->width == 32)
3586 				outp +=
3587 				    sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)p->counter[i]);
3588 			else
3589 				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->counter[i]);
3590 		} else if (mp->format == FORMAT_DELTA) {
3591 			if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
3592 				outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), p->counter[i]);
3593 			else
3594 				outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->counter[i]);
3595 		} else if (mp->format == FORMAT_PERCENT) {
3596 			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->counter[i] / tsc);
3597 		} else if (mp->type == COUNTER_K2M)
3598 			outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), (unsigned int)p->counter[i] / 1000);
3599 	}
3600 
3601 	for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) {
3602 		if (pp->format == FORMAT_RAW) {
3603 			if (pp->width == 32)
3604 				outp +=
3605 				    sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
3606 					    (unsigned int)p->perf_counter[i]);
3607 			else
3608 				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->perf_counter[i]);
3609 		} else if (pp->format == FORMAT_DELTA) {
3610 			if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
3611 				outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), p->perf_counter[i]);
3612 			else
3613 				outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->perf_counter[i]);
3614 		} else if (pp->format == FORMAT_PERCENT) {
3615 			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->perf_counter[i] / tsc);
3616 		} else if (pp->type == COUNTER_K2M) {
3617 			outp +=
3618 			    sprintf(outp, "%s%d", (printed++ ? delim : ""), (unsigned int)p->perf_counter[i] / 1000);
3619 		}
3620 	}
3621 
3622 	for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) {
3623 		const unsigned long value_raw = p->pmt_counter[i];
3624 		double value_converted;
3625 		switch (ppmt->type) {
3626 		case PMT_TYPE_RAW:
3627 			if (pmt_counter_get_width(ppmt) <= 32)
3628 				outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
3629 						(unsigned int)p->pmt_counter[i]);
3630 			else
3631 				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->pmt_counter[i]);
3632 
3633 			break;
3634 
3635 		case PMT_TYPE_XTAL_TIME:
3636 			value_converted = 100.0 * value_raw / crystal_hz / interval_float;
3637 			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted);
3638 			break;
3639 
3640 		case PMT_TYPE_TCORE_CLOCK:
3641 			value_converted = 100.0 * value_raw / tcore_clock_freq_hz / interval_float;
3642 			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted);
3643 		}
3644 	}
3645 
3646 	if (DO_BIC(BIC_SysWatt) && (t == &average.threads))
3647 		outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
3648 				rapl_counter_get_value(&pplat_cnt->energy_psys, RAPL_UNIT_WATTS, interval_float));
3649 	if (DO_BIC(BIC_Sys_J) && (t == &average.threads))
3650 		outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
3651 				rapl_counter_get_value(&pplat_cnt->energy_psys, RAPL_UNIT_JOULES, interval_float));
3652 
3653 done:
3654 	if (*(outp - 1) != '\n')
3655 		outp += sprintf(outp, "\n");
3656 
3657 	return 0;
3658 }
3659 
3660 void flush_output_stdout(void)
3661 {
3662 	FILE *filep;
3663 
3664 	if (outf == stderr)
3665 		filep = stdout;
3666 	else
3667 		filep = outf;
3668 
3669 	fputs(output_buffer, filep);
3670 	fflush(filep);
3671 
3672 	outp = output_buffer;
3673 }
3674 
3675 void flush_output_stderr(void)
3676 {
3677 	fputs(output_buffer, outf);
3678 	fflush(outf);
3679 	outp = output_buffer;
3680 }
3681 
3682 void format_all_counters(PER_THREAD_PARAMS)
3683 {
3684 	static int count;
3685 
3686 	if ((!count || (header_iterations && !(count % header_iterations))) || !summary_only)
3687 		print_header("\t");
3688 
3689 	format_counters(&average.threads, &average.cores, &average.packages);
3690 
3691 	count++;
3692 
3693 	if (summary_only)
3694 		return;
3695 
3696 	for_all_cpus(format_counters, t, c, p);
3697 }
3698 
/*
 * DELTA_WRAP32(new, old): old = (new - old) modulo 2^32.
 *
 * Both values are shifted into the upper 32 bits before subtracting so that
 * a counter which wrapped past 2^32 between the two samples still yields
 * the correct positive difference.
 *
 * Arguments are parenthesized so that compound expressions expand correctly.
 * The trailing semicolon is kept for compatibility with existing call sites.
 */
#define DELTA_WRAP32(new, old)			\
	(old) = ((((unsigned long long)(new) << 32) - ((unsigned long long)(old) << 32)) >> 32);
3701 
3702 int delta_package(struct pkg_data *new, struct pkg_data *old)
3703 {
3704 	int i;
3705 	struct msr_counter *mp;
3706 	struct perf_counter_info *pp;
3707 	struct pmt_counter *ppmt;
3708 
3709 	if (DO_BIC(BIC_Totl_c0))
3710 		old->pkg_wtd_core_c0 = new->pkg_wtd_core_c0 - old->pkg_wtd_core_c0;
3711 	if (DO_BIC(BIC_Any_c0))
3712 		old->pkg_any_core_c0 = new->pkg_any_core_c0 - old->pkg_any_core_c0;
3713 	if (DO_BIC(BIC_GFX_c0))
3714 		old->pkg_any_gfxe_c0 = new->pkg_any_gfxe_c0 - old->pkg_any_gfxe_c0;
3715 	if (DO_BIC(BIC_CPUGFX))
3716 		old->pkg_both_core_gfxe_c0 = new->pkg_both_core_gfxe_c0 - old->pkg_both_core_gfxe_c0;
3717 
3718 	old->pc2 = new->pc2 - old->pc2;
3719 	if (DO_BIC(BIC_Pkgpc3))
3720 		old->pc3 = new->pc3 - old->pc3;
3721 	if (DO_BIC(BIC_Pkgpc6))
3722 		old->pc6 = new->pc6 - old->pc6;
3723 	if (DO_BIC(BIC_Pkgpc7))
3724 		old->pc7 = new->pc7 - old->pc7;
3725 	old->pc8 = new->pc8 - old->pc8;
3726 	old->pc9 = new->pc9 - old->pc9;
3727 	old->pc10 = new->pc10 - old->pc10;
3728 	old->die_c6 = new->die_c6 - old->die_c6;
3729 	old->cpu_lpi = new->cpu_lpi - old->cpu_lpi;
3730 	old->sys_lpi = new->sys_lpi - old->sys_lpi;
3731 	old->pkg_temp_c = new->pkg_temp_c;
3732 
3733 	/* flag an error when rc6 counter resets/wraps */
3734 	if (old->gfx_rc6_ms > new->gfx_rc6_ms)
3735 		old->gfx_rc6_ms = -1;
3736 	else
3737 		old->gfx_rc6_ms = new->gfx_rc6_ms - old->gfx_rc6_ms;
3738 
3739 	old->uncore_mhz = new->uncore_mhz;
3740 	old->gfx_mhz = new->gfx_mhz;
3741 	old->gfx_act_mhz = new->gfx_act_mhz;
3742 
3743 	/* flag an error when mc6 counter resets/wraps */
3744 	if (old->sam_mc6_ms > new->sam_mc6_ms)
3745 		old->sam_mc6_ms = -1;
3746 	else
3747 		old->sam_mc6_ms = new->sam_mc6_ms - old->sam_mc6_ms;
3748 
3749 	old->sam_mhz = new->sam_mhz;
3750 	old->sam_act_mhz = new->sam_act_mhz;
3751 
3752 	old->energy_pkg.raw_value = new->energy_pkg.raw_value - old->energy_pkg.raw_value;
3753 	old->energy_cores.raw_value = new->energy_cores.raw_value - old->energy_cores.raw_value;
3754 	old->energy_gfx.raw_value = new->energy_gfx.raw_value - old->energy_gfx.raw_value;
3755 	old->energy_dram.raw_value = new->energy_dram.raw_value - old->energy_dram.raw_value;
3756 	old->rapl_pkg_perf_status.raw_value = new->rapl_pkg_perf_status.raw_value - old->rapl_pkg_perf_status.raw_value;
3757 	old->rapl_dram_perf_status.raw_value =
3758 	    new->rapl_dram_perf_status.raw_value - old->rapl_dram_perf_status.raw_value;
3759 
3760 	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
3761 		if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE)
3762 			old->counter[i] = new->counter[i];
3763 		else if (mp->format == FORMAT_AVERAGE)
3764 			old->counter[i] = new->counter[i];
3765 		else
3766 			old->counter[i] = new->counter[i] - old->counter[i];
3767 	}
3768 
3769 	for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) {
3770 		if (pp->format == FORMAT_RAW)
3771 			old->perf_counter[i] = new->perf_counter[i];
3772 		else if (pp->format == FORMAT_AVERAGE)
3773 			old->perf_counter[i] = new->perf_counter[i];
3774 		else
3775 			old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i];
3776 	}
3777 
3778 	for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) {
3779 		if (ppmt->format == FORMAT_RAW)
3780 			old->pmt_counter[i] = new->pmt_counter[i];
3781 		else
3782 			old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i];
3783 	}
3784 
3785 	return 0;
3786 }
3787 
3788 void delta_core(struct core_data *new, struct core_data *old)
3789 {
3790 	int i;
3791 	struct msr_counter *mp;
3792 	struct perf_counter_info *pp;
3793 	struct pmt_counter *ppmt;
3794 
3795 	old->c3 = new->c3 - old->c3;
3796 	old->c6 = new->c6 - old->c6;
3797 	old->c7 = new->c7 - old->c7;
3798 	old->core_temp_c = new->core_temp_c;
3799 	old->core_throt_cnt = new->core_throt_cnt - old->core_throt_cnt;
3800 	old->mc6_us = new->mc6_us - old->mc6_us;
3801 
3802 	DELTA_WRAP32(new->core_energy.raw_value, old->core_energy.raw_value);
3803 
3804 	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
3805 		if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE)
3806 			old->counter[i] = new->counter[i];
3807 		else
3808 			old->counter[i] = new->counter[i] - old->counter[i];
3809 	}
3810 
3811 	for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) {
3812 		if (pp->format == FORMAT_RAW)
3813 			old->perf_counter[i] = new->perf_counter[i];
3814 		else
3815 			old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i];
3816 	}
3817 
3818 	for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) {
3819 		if (ppmt->format == FORMAT_RAW)
3820 			old->pmt_counter[i] = new->pmt_counter[i];
3821 		else
3822 			old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i];
3823 	}
3824 }
3825 
3826 int soft_c1_residency_display(int bic)
3827 {
3828 	if (!DO_BIC(BIC_CPU_c1) || platform->has_msr_core_c1_res)
3829 		return 0;
3830 
3831 	return DO_BIC_READ(bic);
3832 }
3833 
/*
 * delta_thread()
 *
 * old = new - old
 *
 * Converts the thread snapshot in "old" into per-interval values using the
 * later snapshot in "new".  "core_delta" must already hold the core deltas
 * (c3/c6/c7), since they are used below to derive c1.
 *
 * Returns 0 on success, -1 if APERF or MPERF did not advance while a column
 * that depends on them is enabled.  Exits via errx() if the TSC advanced by
 * fewer than 1,000,000 cycles.
 */
int delta_thread(struct thread_data *new, struct thread_data *old, struct core_data *core_delta)
{
	int i;
	struct msr_counter *mp;
	struct perf_counter_info *pp;
	struct pmt_counter *ppmt;

	/* we run cpuid just the 1st time, copy the results */
	if (DO_BIC(BIC_APIC))
		new->apic_id = old->apic_id;
	if (DO_BIC(BIC_X2APIC))
		new->x2apic_id = old->x2apic_id;

	/*
	 * the timestamps from start of measurement interval are in "old"
	 * the timestamp from end of measurement interval are in "new"
	 * over-write old w/ new so we can print end of interval values
	 */

	timersub(&new->tv_begin, &old->tv_begin, &old->tv_delta);
	old->tv_begin = new->tv_begin;
	old->tv_end = new->tv_end;

	old->tsc = new->tsc - old->tsc;

	/* check for TSC < 1 Mcycles over interval */
	if (old->tsc < (1000 * 1000))
		errx(-3, "Insanely slow TSC rate, TSC stops in idle?\n"
		     "You can disable all c-states by booting with \"idle=poll\"\n"
		     "or just the deep ones with \"processor.max_cstate=1\"");

	/* may be overwritten below when c1 is derived rather than read */
	old->c1 = new->c1 - old->c1;

	if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC)
	    || soft_c1_residency_display(BIC_Avg_MHz)) {
		/* both counters must have strictly advanced, otherwise report failure */
		if ((new->aperf > old->aperf) && (new->mperf > old->mperf)) {
			old->aperf = new->aperf - old->aperf;
			old->mperf = new->mperf - old->mperf;
		} else {
			return -1;
		}
	}

	if (platform->has_msr_core_c1_res) {
		/*
		 * Some models have a dedicated C1 residency MSR,
		 * which should be more accurate than the derivation below.
		 */
	} else {
		/*
		 * As counter collection is not atomic,
		 * it is possible for mperf's non-halted cycles + idle states
		 * to exceed TSC's all cycles: show c1 = 0% in that case.
		 */
		if ((old->mperf + core_delta->c3 + core_delta->c6 + core_delta->c7) > (old->tsc * tsc_tweak))
			old->c1 = 0;
		else {
			/* normal case, derive c1 */
			old->c1 = (old->tsc * tsc_tweak) - old->mperf - core_delta->c3
			    - core_delta->c6 - core_delta->c7;
		}
	}

	if (old->mperf == 0) {
		if (debug > 1)
			fprintf(outf, "cpu%d MPERF 0!\n", old->cpu_id);
		old->mperf = 1;	/* divide by 0 protection */
	}

	if (DO_BIC(BIC_IPC))
		old->instr_count = new->instr_count - old->instr_count;

	if (DO_BIC(BIC_IRQ))
		old->irq_count = new->irq_count - old->irq_count;

	if (DO_BIC(BIC_NMI))
		old->nmi_count = new->nmi_count - old->nmi_count;

	if (DO_BIC(BIC_SMI))
		old->smi_count = new->smi_count - old->smi_count;

	/* added MSR/sysfs counters: RAW and AVERAGE keep the latest reading */
	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
		if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE)
			old->counter[i] = new->counter[i];
		else
			old->counter[i] = new->counter[i] - old->counter[i];
	}

	/* added perf counters: only RAW keeps the latest reading */
	for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) {
		if (pp->format == FORMAT_RAW)
			old->perf_counter[i] = new->perf_counter[i];
		else
			old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i];
	}

	/* added PMT counters: only RAW keeps the latest reading */
	for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) {
		if (ppmt->format == FORMAT_RAW)
			old->pmt_counter[i] = new->pmt_counter[i];
		else
			old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i];
	}

	return 0;
}
3941 
/*
 * delta_cpu()
 *
 * Compute the interval deltas for one CPU: (t, c, p) is the newer snapshot,
 * (t2, c2, p2) the older one, which is overwritten with the deltas.
 *
 * Returns the OR of the delta_thread() and (when run) delta_package()
 * return values.
 */
int delta_cpu(struct thread_data *t, struct core_data *c,
	      struct pkg_data *p, struct thread_data *t2, struct core_data *c2, struct pkg_data *p2)
{
	int status;

	/* the core delta is computed once per core, and before the thread delta uses it */
	if (is_cpu_first_thread_in_core(t, c, p))
		delta_core(c, c2);

	/* every thread gets its own delta; c2 now holds the core delta */
	status = delta_thread(t, t2, c2);

	/* the package delta is computed once per package */
	if (!is_cpu_first_core_in_package(t, c, p))
		return status;

	return status | delta_package(p, p2);
}
3960 
/*
 * delta_platform()
 *
 * old = new - old for the platform energy counter (energy_psys raw value).
 */
void delta_platform(struct platform_counters *new, struct platform_counters *old)
{
	old->energy_psys.raw_value = new->energy_psys.raw_value - old->energy_psys.raw_value;
}
3965 
3966 void rapl_counter_clear(struct rapl_counter *c)
3967 {
3968 	c->raw_value = 0;
3969 	c->scale = 0.0;
3970 	c->unit = RAPL_UNIT_INVALID;
3971 }
3972 
3973 void clear_counters(PER_THREAD_PARAMS)
3974 {
3975 	int i;
3976 	struct msr_counter *mp;
3977 
3978 	t->tv_begin.tv_sec = 0;
3979 	t->tv_begin.tv_usec = 0;
3980 	t->tv_end.tv_sec = 0;
3981 	t->tv_end.tv_usec = 0;
3982 	t->tv_delta.tv_sec = 0;
3983 	t->tv_delta.tv_usec = 0;
3984 
3985 	t->tsc = 0;
3986 	t->aperf = 0;
3987 	t->mperf = 0;
3988 	t->c1 = 0;
3989 
3990 	t->instr_count = 0;
3991 
3992 	t->irq_count = 0;
3993 	t->nmi_count = 0;
3994 	t->smi_count = 0;
3995 
3996 	c->c3 = 0;
3997 	c->c6 = 0;
3998 	c->c7 = 0;
3999 	c->mc6_us = 0;
4000 	c->core_temp_c = 0;
4001 	rapl_counter_clear(&c->core_energy);
4002 	c->core_throt_cnt = 0;
4003 
4004 	p->pkg_wtd_core_c0 = 0;
4005 	p->pkg_any_core_c0 = 0;
4006 	p->pkg_any_gfxe_c0 = 0;
4007 	p->pkg_both_core_gfxe_c0 = 0;
4008 
4009 	p->pc2 = 0;
4010 	if (DO_BIC(BIC_Pkgpc3))
4011 		p->pc3 = 0;
4012 	if (DO_BIC(BIC_Pkgpc6))
4013 		p->pc6 = 0;
4014 	if (DO_BIC(BIC_Pkgpc7))
4015 		p->pc7 = 0;
4016 	p->pc8 = 0;
4017 	p->pc9 = 0;
4018 	p->pc10 = 0;
4019 	p->die_c6 = 0;
4020 	p->cpu_lpi = 0;
4021 	p->sys_lpi = 0;
4022 
4023 	rapl_counter_clear(&p->energy_pkg);
4024 	rapl_counter_clear(&p->energy_dram);
4025 	rapl_counter_clear(&p->energy_cores);
4026 	rapl_counter_clear(&p->energy_gfx);
4027 	rapl_counter_clear(&p->rapl_pkg_perf_status);
4028 	rapl_counter_clear(&p->rapl_dram_perf_status);
4029 	p->pkg_temp_c = 0;
4030 
4031 	p->gfx_rc6_ms = 0;
4032 	p->uncore_mhz = 0;
4033 	p->gfx_mhz = 0;
4034 	p->gfx_act_mhz = 0;
4035 	p->sam_mc6_ms = 0;
4036 	p->sam_mhz = 0;
4037 	p->sam_act_mhz = 0;
4038 	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next)
4039 		t->counter[i] = 0;
4040 
4041 	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next)
4042 		c->counter[i] = 0;
4043 
4044 	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next)
4045 		p->counter[i] = 0;
4046 
4047 	memset(&t->perf_counter[0], 0, sizeof(t->perf_counter));
4048 	memset(&c->perf_counter[0], 0, sizeof(c->perf_counter));
4049 	memset(&p->perf_counter[0], 0, sizeof(p->perf_counter));
4050 
4051 	memset(&t->pmt_counter[0], 0, ARRAY_SIZE(t->pmt_counter));
4052 	memset(&c->pmt_counter[0], 0, ARRAY_SIZE(c->pmt_counter));
4053 	memset(&p->pmt_counter[0], 0, ARRAY_SIZE(p->pmt_counter));
4054 }
4055 
4056 void rapl_counter_accumulate(struct rapl_counter *dst, const struct rapl_counter *src)
4057 {
4058 	/* Copy unit and scale from src if dst is not initialized */
4059 	if (dst->unit == RAPL_UNIT_INVALID) {
4060 		dst->unit = src->unit;
4061 		dst->scale = src->scale;
4062 	}
4063 
4064 	assert(dst->unit == src->unit);
4065 	assert(dst->scale == src->scale);
4066 
4067 	dst->raw_value += src->raw_value;
4068 }
4069 
/*
 * sum_counters()
 *
 * Accumulate one CPU's thread/core/package values into the global "average"
 * records.  Thread values are added on every call; core values only for the
 * first thread in a core; package values only for the first core in a
 * package.  Temperatures and the throttle count keep the maximum seen;
 * several package readings are assigned rather than summed.
 *
 * Always returns 0.
 */
int sum_counters(PER_THREAD_PARAMS)
{
	int i;
	struct msr_counter *mp;
	struct perf_counter_info *pp;
	struct pmt_counter *ppmt;

	/* copy un-changing apic_id's */
	if (DO_BIC(BIC_APIC))
		average.threads.apic_id = t->apic_id;
	if (DO_BIC(BIC_X2APIC))
		average.threads.x2apic_id = t->x2apic_id;

	/* remember first tv_begin */
	if (average.threads.tv_begin.tv_sec == 0)
		average.threads.tv_begin = procsysfs_tv_begin;

	/* remember last tv_end */
	average.threads.tv_end = t->tv_end;

	average.threads.tsc += t->tsc;
	average.threads.aperf += t->aperf;
	average.threads.mperf += t->mperf;
	average.threads.c1 += t->c1;

	average.threads.instr_count += t->instr_count;

	average.threads.irq_count += t->irq_count;
	average.threads.nmi_count += t->nmi_count;
	average.threads.smi_count += t->smi_count;

	/* RAW-format added counters are not summed at thread scope */
	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
		if (mp->format == FORMAT_RAW)
			continue;
		average.threads.counter[i] += t->counter[i];
	}

	for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) {
		if (pp->format == FORMAT_RAW)
			continue;
		average.threads.perf_counter[i] += t->perf_counter[i];
	}

	/* PMT counters are summed regardless of format */
	for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) {
		average.threads.pmt_counter[i] += t->pmt_counter[i];
	}

	/* sum per-core values only for 1st thread in core */
	if (!is_cpu_first_thread_in_core(t, c, p))
		return 0;

	average.cores.c3 += c->c3;
	average.cores.c6 += c->c6;
	average.cores.c7 += c->c7;
	average.cores.mc6_us += c->mc6_us;

	/* keep the maximum across cores rather than a sum */
	average.cores.core_temp_c = MAX(average.cores.core_temp_c, c->core_temp_c);
	average.cores.core_throt_cnt = MAX(average.cores.core_throt_cnt, c->core_throt_cnt);

	rapl_counter_accumulate(&average.cores.core_energy, &c->core_energy);

	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
		if (mp->format == FORMAT_RAW)
			continue;
		average.cores.counter[i] += c->counter[i];
	}

	for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) {
		if (pp->format == FORMAT_RAW)
			continue;
		average.cores.perf_counter[i] += c->perf_counter[i];
	}

	for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) {
		average.cores.pmt_counter[i] += c->pmt_counter[i];
	}

	/* sum per-pkg values only for 1st core in pkg */
	if (!is_cpu_first_core_in_package(t, c, p))
		return 0;

	if (DO_BIC(BIC_Totl_c0))
		average.packages.pkg_wtd_core_c0 += p->pkg_wtd_core_c0;
	if (DO_BIC(BIC_Any_c0))
		average.packages.pkg_any_core_c0 += p->pkg_any_core_c0;
	if (DO_BIC(BIC_GFX_c0))
		average.packages.pkg_any_gfxe_c0 += p->pkg_any_gfxe_c0;
	if (DO_BIC(BIC_CPUGFX))
		average.packages.pkg_both_core_gfxe_c0 += p->pkg_both_core_gfxe_c0;

	average.packages.pc2 += p->pc2;
	if (DO_BIC(BIC_Pkgpc3))
		average.packages.pc3 += p->pc3;
	if (DO_BIC(BIC_Pkgpc6))
		average.packages.pc6 += p->pc6;
	if (DO_BIC(BIC_Pkgpc7))
		average.packages.pc7 += p->pc7;
	average.packages.pc8 += p->pc8;
	average.packages.pc9 += p->pc9;
	average.packages.pc10 += p->pc10;
	average.packages.die_c6 += p->die_c6;

	/* assigned, not summed: the last package visited wins */
	average.packages.cpu_lpi = p->cpu_lpi;
	average.packages.sys_lpi = p->sys_lpi;

	rapl_counter_accumulate(&average.packages.energy_pkg, &p->energy_pkg);
	rapl_counter_accumulate(&average.packages.energy_dram, &p->energy_dram);
	rapl_counter_accumulate(&average.packages.energy_cores, &p->energy_cores);
	rapl_counter_accumulate(&average.packages.energy_gfx, &p->energy_gfx);

	/* assigned, not summed: the last package visited wins */
	average.packages.gfx_rc6_ms = p->gfx_rc6_ms;
	average.packages.uncore_mhz = p->uncore_mhz;
	average.packages.gfx_mhz = p->gfx_mhz;
	average.packages.gfx_act_mhz = p->gfx_act_mhz;
	average.packages.sam_mc6_ms = p->sam_mc6_ms;
	average.packages.sam_mhz = p->sam_mhz;
	average.packages.sam_act_mhz = p->sam_act_mhz;

	/* keep the maximum across packages rather than a sum */
	average.packages.pkg_temp_c = MAX(average.packages.pkg_temp_c, p->pkg_temp_c);

	rapl_counter_accumulate(&average.packages.rapl_pkg_perf_status, &p->rapl_pkg_perf_status);
	rapl_counter_accumulate(&average.packages.rapl_dram_perf_status, &p->rapl_dram_perf_status);

	/*
	 * NOTE(review): RAW counters are copied only when topo.num_packages == 0;
	 * in every other case they are summed like the rest -- confirm intent.
	 */
	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
		if ((mp->format == FORMAT_RAW) && (topo.num_packages == 0))
			average.packages.counter[i] = p->counter[i];
		else
			average.packages.counter[i] += p->counter[i];
	}

	for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) {
		if ((pp->format == FORMAT_RAW) && (topo.num_packages == 0))
			average.packages.perf_counter[i] = p->perf_counter[i];
		else
			average.packages.perf_counter[i] += p->perf_counter[i];
	}

	for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) {
		average.packages.pmt_counter[i] += p->pmt_counter[i];
	}

	return 0;
}
4213 
/*
 * compute_average()
 *
 * sum the counters for all cpus in the system
 * compute the weighted average
 *
 * Clears the global "average" records, accumulates every CPU into them via
 * sum_counters(), then divides the sums by the number of allowed CPUs,
 * cores or packages as appropriate.  Also sets sums_need_wide_columns when
 * an item count exceeds 9999999.
 */
void compute_average(PER_THREAD_PARAMS)
{
	int i;
	struct msr_counter *mp;
	struct perf_counter_info *pp;
	struct pmt_counter *ppmt;

	clear_counters(&average.threads, &average.cores, &average.packages);

	for_all_cpus(sum_counters, t, c, p);

	/* Use the global time delta for the average. */
	average.threads.tv_delta = tv_delta;

	average.threads.tsc /= topo.allowed_cpus;
	average.threads.aperf /= topo.allowed_cpus;
	average.threads.mperf /= topo.allowed_cpus;
	average.threads.instr_count /= topo.allowed_cpus;
	average.threads.c1 /= topo.allowed_cpus;

	/* irq/nmi counts stay as sums; large sums need wider output columns */
	if (average.threads.irq_count > 9999999)
		sums_need_wide_columns = 1;
	if (average.threads.nmi_count > 9999999)
		sums_need_wide_columns = 1;

	average.cores.c3 /= topo.allowed_cores;
	average.cores.c6 /= topo.allowed_cores;
	average.cores.c7 /= topo.allowed_cores;
	average.cores.mc6_us /= topo.allowed_cores;

	if (DO_BIC(BIC_Totl_c0))
		average.packages.pkg_wtd_core_c0 /= topo.allowed_packages;
	if (DO_BIC(BIC_Any_c0))
		average.packages.pkg_any_core_c0 /= topo.allowed_packages;
	if (DO_BIC(BIC_GFX_c0))
		average.packages.pkg_any_gfxe_c0 /= topo.allowed_packages;
	if (DO_BIC(BIC_CPUGFX))
		average.packages.pkg_both_core_gfxe_c0 /= topo.allowed_packages;

	average.packages.pc2 /= topo.allowed_packages;
	if (DO_BIC(BIC_Pkgpc3))
		average.packages.pc3 /= topo.allowed_packages;
	if (DO_BIC(BIC_Pkgpc6))
		average.packages.pc6 /= topo.allowed_packages;
	if (DO_BIC(BIC_Pkgpc7))
		average.packages.pc7 /= topo.allowed_packages;

	average.packages.pc8 /= topo.allowed_packages;
	average.packages.pc9 /= topo.allowed_packages;
	average.packages.pc10 /= topo.allowed_packages;
	average.packages.die_c6 /= topo.allowed_packages;

	/*
	 * Added MSR/sysfs counters.  RAW counters are left untouched.
	 * Thread-scope COUNTER_ITEMS stay as sums (the "continue" skips the
	 * division); core- and package-scope COUNTER_ITEMS are still divided.
	 * NOTE(review): confirm the scope difference is intentional.
	 */
	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
		if (mp->format == FORMAT_RAW)
			continue;
		if (mp->type == COUNTER_ITEMS) {
			if (average.threads.counter[i] > 9999999)
				sums_need_wide_columns = 1;
			continue;
		}
		average.threads.counter[i] /= topo.allowed_cpus;
	}
	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
		if (mp->format == FORMAT_RAW)
			continue;
		if (mp->type == COUNTER_ITEMS) {
			if (average.cores.counter[i] > 9999999)
				sums_need_wide_columns = 1;
		}
		average.cores.counter[i] /= topo.allowed_cores;
	}
	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
		if (mp->format == FORMAT_RAW)
			continue;
		if (mp->type == COUNTER_ITEMS) {
			if (average.packages.counter[i] > 9999999)
				sums_need_wide_columns = 1;
		}
		average.packages.counter[i] /= topo.allowed_packages;
	}

	/* Added perf counters: same rules as the MSR/sysfs counters above. */
	for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) {
		if (pp->format == FORMAT_RAW)
			continue;
		if (pp->type == COUNTER_ITEMS) {
			if (average.threads.perf_counter[i] > 9999999)
				sums_need_wide_columns = 1;
			continue;
		}
		average.threads.perf_counter[i] /= topo.allowed_cpus;
	}
	for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) {
		if (pp->format == FORMAT_RAW)
			continue;
		if (pp->type == COUNTER_ITEMS) {
			if (average.cores.perf_counter[i] > 9999999)
				sums_need_wide_columns = 1;
		}
		average.cores.perf_counter[i] /= topo.allowed_cores;
	}
	for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) {
		if (pp->format == FORMAT_RAW)
			continue;
		if (pp->type == COUNTER_ITEMS) {
			if (average.packages.perf_counter[i] > 9999999)
				sums_need_wide_columns = 1;
		}
		average.packages.perf_counter[i] /= topo.allowed_packages;
	}

	/* Added PMT counters are always divided, regardless of format or type. */
	for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) {
		average.threads.pmt_counter[i] /= topo.allowed_cpus;
	}
	for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) {
		average.cores.pmt_counter[i] /= topo.allowed_cores;
	}
	for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) {
		average.packages.pmt_counter[i] /= topo.allowed_packages;
	}
}
4338 
4339 static unsigned long long rdtsc(void)
4340 {
4341 	unsigned int low, high;
4342 
4343 	asm volatile ("rdtsc":"=a" (low), "=d"(high));
4344 
4345 	return low | ((unsigned long long)high) << 32;
4346 }
4347 
/*
 * fopen_or_die()
 *
 * fopen() wrapper that never returns NULL: if the file cannot be opened,
 * the program exits with status 1 via err().
 */
FILE *fopen_or_die(const char *path, const char *mode)
{
	FILE *stream;

	stream = fopen(path, mode);
	if (stream == NULL)
		err(1, "%s: open failed", path);

	return stream;
}
4359 
/*
 * snapshot_sysfs_counter()
 *
 * return snapshot of given counter
 *
 * Opens "path", parses a single unsigned decimal integer from it and
 * returns that value.  Exits via err() if the file cannot be opened or
 * no number can be parsed.
 */
unsigned long long snapshot_sysfs_counter(char *path)
{
	FILE *fp;
	int retval;
	unsigned long long counter;

	fp = fopen_or_die(path, "r");

	/* %llu matches the unsigned destination; %lld would expect long long * */
	retval = fscanf(fp, "%llu", &counter);
	if (retval != 1)
		err(1, "snapshot_sysfs_counter(%s)", path);

	fclose(fp);

	return counter;
}
4381 
4382 int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp, char *counter_path)
4383 {
4384 	if (mp->msr_num != 0) {
4385 		assert(!no_msr);
4386 		if (get_msr(cpu, mp->msr_num, counterp))
4387 			return -1;
4388 	} else {
4389 		char path[128 + PATH_BYTES];
4390 
4391 		if (mp->flags & SYSFS_PERCPU) {
4392 			sprintf(path, "/sys/devices/system/cpu/cpu%d/%s", cpu, mp->sp->path);
4393 
4394 			*counterp = snapshot_sysfs_counter(path);
4395 		} else {
4396 			*counterp = snapshot_sysfs_counter(counter_path);
4397 		}
4398 	}
4399 
4400 	return 0;
4401 }
4402 
4403 unsigned long long get_legacy_uncore_mhz(int package)
4404 {
4405 	char path[128];
4406 	int die;
4407 	static int warn_once;
4408 
4409 	/*
4410 	 * for this package, use the first die_id that exists
4411 	 */
4412 	for (die = 0; die <= topo.max_die_id; ++die) {
4413 
4414 		sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz",
4415 			package, die);
4416 
4417 		if (access(path, R_OK) == 0)
4418 			return (snapshot_sysfs_counter(path) / 1000);
4419 	}
4420 	if (!warn_once) {
4421 		warnx("BUG: %s: No %s", __func__, path);
4422 		warn_once = 1;
4423 	}
4424 
4425 	return 0;
4426 }
4427 
4428 int get_epb(int cpu)
4429 {
4430 	char path[128 + PATH_BYTES];
4431 	unsigned long long msr;
4432 	int ret, epb = -1;
4433 	FILE *fp;
4434 
4435 	sprintf(path, "/sys/devices/system/cpu/cpu%d/power/energy_perf_bias", cpu);
4436 
4437 	fp = fopen(path, "r");
4438 	if (!fp)
4439 		goto msr_fallback;
4440 
4441 	ret = fscanf(fp, "%d", &epb);
4442 	if (ret != 1)
4443 		err(1, "%s(%s)", __func__, path);
4444 
4445 	fclose(fp);
4446 
4447 	return epb;
4448 
4449 msr_fallback:
4450 	if (no_msr)
4451 		return -1;
4452 
4453 	get_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, &msr);
4454 
4455 	return msr & 0xf;
4456 }
4457 
4458 void get_apic_id(struct thread_data *t)
4459 {
4460 	unsigned int eax, ebx, ecx, edx;
4461 
4462 	if (DO_BIC(BIC_APIC)) {
4463 		eax = ebx = ecx = edx = 0;
4464 		__cpuid(1, eax, ebx, ecx, edx);
4465 
4466 		t->apic_id = (ebx >> 24) & 0xff;
4467 	}
4468 
4469 	if (!DO_BIC(BIC_X2APIC))
4470 		return;
4471 
4472 	if (authentic_amd || hygon_genuine) {
4473 		unsigned int topology_extensions;
4474 
4475 		if (max_extended_level < 0x8000001e)
4476 			return;
4477 
4478 		eax = ebx = ecx = edx = 0;
4479 		__cpuid(0x80000001, eax, ebx, ecx, edx);
4480 		topology_extensions = ecx & (1 << 22);
4481 
4482 		if (topology_extensions == 0)
4483 			return;
4484 
4485 		eax = ebx = ecx = edx = 0;
4486 		__cpuid(0x8000001e, eax, ebx, ecx, edx);
4487 
4488 		t->x2apic_id = eax;
4489 		return;
4490 	}
4491 
4492 	if (!genuine_intel)
4493 		return;
4494 
4495 	if (max_level < 0xb)
4496 		return;
4497 
4498 	ecx = 0;
4499 	__cpuid(0xb, eax, ebx, ecx, edx);
4500 	t->x2apic_id = edx;
4501 
4502 	if (debug && (t->apic_id != (t->x2apic_id & 0xff)))
4503 		fprintf(outf, "cpu%d: BIOS BUG: apic 0x%x x2apic 0x%x\n", t->cpu_id, t->apic_id, t->x2apic_id);
4504 }
4505 
4506 int get_core_throt_cnt(int cpu, unsigned long long *cnt)
4507 {
4508 	char path[128 + PATH_BYTES];
4509 	unsigned long long tmp;
4510 	FILE *fp;
4511 	int ret;
4512 
4513 	sprintf(path, "/sys/devices/system/cpu/cpu%d/thermal_throttle/core_throttle_count", cpu);
4514 	fp = fopen(path, "r");
4515 	if (!fp)
4516 		return -1;
4517 	ret = fscanf(fp, "%lld", &tmp);
4518 	fclose(fp);
4519 	if (ret != 1)
4520 		return -1;
4521 	*cnt = tmp;
4522 
4523 	return 0;
4524 }
4525 
/*
 * Pair of descriptors for the APERF and MPERF counters.
 * NOTE(review): presumably perf event file descriptors opened as one
 * group -- confirm against the code that fills this in.
 */
struct amperf_group_fd {
	int aperf;		/* Also the group descriptor */
	int mperf;
};
4530 
4531 static int read_perf_counter_info(const char *const path, const char *const parse_format, void *value_ptr)
4532 {
4533 	int fdmt;
4534 	int bytes_read;
4535 	char buf[64];
4536 	int ret = -1;
4537 
4538 	fdmt = open(path, O_RDONLY, 0);
4539 	if (fdmt == -1) {
4540 		if (debug)
4541 			fprintf(stderr, "Failed to parse perf counter info %s\n", path);
4542 		ret = -1;
4543 		goto cleanup_and_exit;
4544 	}
4545 
4546 	bytes_read = read(fdmt, buf, sizeof(buf) - 1);
4547 	if (bytes_read <= 0 || bytes_read >= (int)sizeof(buf)) {
4548 		if (debug)
4549 			fprintf(stderr, "Failed to parse perf counter info %s\n", path);
4550 		ret = -1;
4551 		goto cleanup_and_exit;
4552 	}
4553 
4554 	buf[bytes_read] = '\0';
4555 
4556 	if (sscanf(buf, parse_format, value_ptr) != 1) {
4557 		if (debug)
4558 			fprintf(stderr, "Failed to parse perf counter info %s\n", path);
4559 		ret = -1;
4560 		goto cleanup_and_exit;
4561 	}
4562 
4563 	ret = 0;
4564 
4565 cleanup_and_exit:
4566 	close(fdmt);
4567 	return ret;
4568 }
4569 
/*
 * Read a single unsigned value from "path" using "parse_format".
 * Returns the value, or (unsigned int)-1 if it cannot be read.
 */
static unsigned int read_perf_counter_info_n(const char *const path, const char *const parse_format)
{
	unsigned int value;

	if (read_perf_counter_info(path, parse_format, &value) != 0)
		return (unsigned int)-1;

	return value;
}
4581 
/*
 * Return the number stored in the "type" file of the given event source,
 * or (unsigned int)-1 if it cannot be read.
 */
static unsigned int read_perf_type(const char *subsys)
{
	char type_path[128];

	snprintf(type_path, sizeof(type_path), "/sys/bus/event_source/devices/%s/type", subsys);

	return read_perf_counter_info_n(type_path, "%u");
}
4592 
/*
 * read_perf_config()
 *
 * Parse the sysfs description of a perf event ("event=0x..,umask=0x..")
 * and return (umask << 8) | event.  A missing umask counts as 0.
 *
 * Returns (unsigned int)-1 if the file cannot be opened or read, or if it
 * carries no "event=" term.
 */
static unsigned int read_perf_config(const char *subsys, const char *event_name)
{
	char path[128];
	char config_str[64];
	unsigned int config;
	unsigned int umask;
	bool has_config = false;
	bool has_umask = false;
	unsigned int ret = -1;
	char *cursor;
	FILE *fconfig;

	snprintf(path, sizeof(path), "/sys/bus/event_source/devices/%s/events/%s", subsys, event_name);

	fconfig = fopen(path, "r");
	if (!fconfig)
		return -1;

	if (fgets(config_str, ARRAY_SIZE(config_str), fconfig) != config_str)
		goto cleanup_and_exit;

	/* walk the comma-separated terms, one per iteration */
	cursor = &config_str[0];
	while (cursor) {
		if (sscanf(cursor, "event=%x", &config) == 1)
			has_config = true;
		else if (sscanf(cursor, "umask=%x", &umask) == 1)
			has_umask = true;

		/* terminate this term and step to the one after the comma */
		cursor = strchr(cursor, ',');
		if (cursor)
			*cursor++ = '\0';
	}

	if (!has_umask)
		umask = 0;

	if (has_config)
		ret = (umask << 8) | config;

cleanup_and_exit:
	fclose(fconfig);
	return ret;
}
4643 
4644 static unsigned int read_perf_rapl_unit(const char *subsys, const char *event_name)
4645 {
4646 	const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.unit";
4647 	const char *const format = "%s";
4648 	char path[128];
4649 	char unit_buffer[16];
4650 
4651 	snprintf(path, sizeof(path), path_format, subsys, event_name);
4652 
4653 	read_perf_counter_info(path, format, &unit_buffer);
4654 	if (strcmp("Joules", unit_buffer) == 0)
4655 		return RAPL_UNIT_JOULES;
4656 
4657 	return RAPL_UNIT_INVALID;
4658 }
4659 
/*
 * Read the "<event>.scale" sysfs file of a perf event.
 * Returns the parsed value, or 0.0 if it cannot be read.
 */
static double read_perf_scale(const char *subsys, const char *event_name)
{
	char scale_path[128];
	double scale;
	int status;

	snprintf(scale_path, sizeof(scale_path), "/sys/bus/event_source/devices/%s/events/%s.scale", subsys,
		 event_name);

	status = read_perf_counter_info(scale_path, "%lf", &scale);

	return status ? 0.0 : scale;
}
4674 
4675 size_t rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci)
4676 {
4677 	size_t ret = 0;
4678 
4679 	for (int i = 0; i < NUM_RAPL_COUNTERS; ++i)
4680 		if (rci->source[i] == COUNTER_SOURCE_PERF)
4681 			++ret;
4682 
4683 	return ret;
4684 }
4685 
4686 static size_t cstate_counter_info_count_perf(const struct cstate_counter_info_t *cci)
4687 {
4688 	size_t ret = 0;
4689 
4690 	for (int i = 0; i < NUM_CSTATE_COUNTERS; ++i)
4691 		if (cci->source[i] == COUNTER_SOURCE_PERF)
4692 			++ret;
4693 
4694 	return ret;
4695 }
4696 
4697 void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci, unsigned int idx)
4698 {
4699 	if (rci->source[idx] == COUNTER_SOURCE_NONE)
4700 		return;
4701 
4702 	rc->raw_value = rci->data[idx];
4703 	rc->unit = rci->unit[idx];
4704 	rc->scale = rci->scale[idx];
4705 }
4706 
4707 int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct pkg_data *p)
4708 {
4709 	struct platform_counters *pplat_cnt = p == package_odd ? &platform_counters_odd : &platform_counters_even;
4710 	unsigned long long perf_data[NUM_RAPL_COUNTERS + 1];
4711 	struct rapl_counter_info_t *rci;
4712 
4713 	if (debug >= 2)
4714 		fprintf(stderr, "%s: cpu%d domain%d\n", __func__, cpu, domain);
4715 
4716 	assert(rapl_counter_info_perdomain);
4717 	assert(domain < rapl_counter_info_perdomain_size);
4718 
4719 	rci = &rapl_counter_info_perdomain[domain];
4720 
4721 	/*
4722 	 * If we have any perf counters to read, read them all now, in bulk
4723 	 */
4724 	if (rci->fd_perf != -1) {
4725 		size_t num_perf_counters = rapl_counter_info_count_perf(rci);
4726 		const ssize_t expected_read_size = (num_perf_counters + 1) * sizeof(unsigned long long);
4727 		const ssize_t actual_read_size = read(rci->fd_perf, &perf_data[0], sizeof(perf_data));
4728 
4729 		if (actual_read_size != expected_read_size)
4730 			err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size,
4731 			    actual_read_size);
4732 	}
4733 
4734 	for (unsigned int i = 0, pi = 1; i < NUM_RAPL_COUNTERS; ++i) {
4735 		switch (rci->source[i]) {
4736 		case COUNTER_SOURCE_NONE:
4737 			rci->data[i] = 0;
4738 			break;
4739 
4740 		case COUNTER_SOURCE_PERF:
4741 			assert(pi < ARRAY_SIZE(perf_data));
4742 			assert(rci->fd_perf != -1);
4743 
4744 			if (debug >= 2)
4745 				fprintf(stderr, "Reading rapl counter via perf at %u (%llu %e %lf)\n",
4746 					i, perf_data[pi], rci->scale[i], perf_data[pi] * rci->scale[i]);
4747 
4748 			rci->data[i] = perf_data[pi];
4749 
4750 			++pi;
4751 			break;
4752 
4753 		case COUNTER_SOURCE_MSR:
4754 			if (debug >= 2)
4755 				fprintf(stderr, "Reading rapl counter via msr at %u\n", i);
4756 
4757 			assert(!no_msr);
4758 			if (rci->flags[i] & RAPL_COUNTER_FLAG_USE_MSR_SUM) {
4759 				if (get_msr_sum(cpu, rci->msr[i], &rci->data[i]))
4760 					return -13 - i;
4761 			} else {
4762 				if (get_msr(cpu, rci->msr[i], &rci->data[i]))
4763 					return -13 - i;
4764 			}
4765 
4766 			rci->data[i] &= rci->msr_mask[i];
4767 			if (rci->msr_shift[i] >= 0)
4768 				rci->data[i] >>= abs(rci->msr_shift[i]);
4769 			else
4770 				rci->data[i] <<= abs(rci->msr_shift[i]);
4771 
4772 			break;
4773 		}
4774 	}
4775 
4776 	BUILD_BUG_ON(NUM_RAPL_COUNTERS != 8);
4777 	write_rapl_counter(&p->energy_pkg, rci, RAPL_RCI_INDEX_ENERGY_PKG);
4778 	write_rapl_counter(&p->energy_cores, rci, RAPL_RCI_INDEX_ENERGY_CORES);
4779 	write_rapl_counter(&p->energy_dram, rci, RAPL_RCI_INDEX_DRAM);
4780 	write_rapl_counter(&p->energy_gfx, rci, RAPL_RCI_INDEX_GFX);
4781 	write_rapl_counter(&p->rapl_pkg_perf_status, rci, RAPL_RCI_INDEX_PKG_PERF_STATUS);
4782 	write_rapl_counter(&p->rapl_dram_perf_status, rci, RAPL_RCI_INDEX_DRAM_PERF_STATUS);
4783 	write_rapl_counter(&c->core_energy, rci, RAPL_RCI_INDEX_CORE_ENERGY);
4784 	write_rapl_counter(&pplat_cnt->energy_psys, rci, RAPL_RCI_INDEX_ENERGY_PLATFORM);
4785 
4786 	return 0;
4787 }
4788 
4789 char *find_sysfs_path_by_id(struct sysfs_path *sp, int id)
4790 {
4791 	while (sp) {
4792 		if (sp->id == id)
4793 			return (sp->path);
4794 		sp = sp->next;
4795 	}
4796 	if (debug)
4797 		warnx("%s: id%d not found", __func__, id);
4798 	return NULL;
4799 }
4800 
4801 int get_cstate_counters(unsigned int cpu, PER_THREAD_PARAMS)
4802 {
4803 	/*
4804 	 * Overcommit memory a little bit here,
4805 	 * but skip calculating exact sizes for the buffers.
4806 	 */
4807 	unsigned long long perf_data[NUM_CSTATE_COUNTERS];
4808 	unsigned long long perf_data_core[NUM_CSTATE_COUNTERS + 1];
4809 	unsigned long long perf_data_pkg[NUM_CSTATE_COUNTERS + 1];
4810 
4811 	struct cstate_counter_info_t *cci;
4812 
4813 	if (debug >= 2)
4814 		fprintf(stderr, "%s: cpu%d\n", __func__, cpu);
4815 
4816 	assert(ccstate_counter_info);
4817 	assert(cpu <= ccstate_counter_info_size);
4818 
4819 	ZERO_ARRAY(perf_data);
4820 	ZERO_ARRAY(perf_data_core);
4821 	ZERO_ARRAY(perf_data_pkg);
4822 
4823 	cci = &ccstate_counter_info[cpu];
4824 
4825 	/*
4826 	 * If we have any perf counters to read, read them all now, in bulk
4827 	 */
4828 	const size_t num_perf_counters = cstate_counter_info_count_perf(cci);
4829 	ssize_t expected_read_size = num_perf_counters * sizeof(unsigned long long);
4830 	ssize_t actual_read_size_core = 0, actual_read_size_pkg = 0;
4831 
4832 	if (cci->fd_perf_core != -1) {
4833 		/* Each descriptor read begins with number of counters read. */
4834 		expected_read_size += sizeof(unsigned long long);
4835 
4836 		actual_read_size_core = read(cci->fd_perf_core, &perf_data_core[0], sizeof(perf_data_core));
4837 
4838 		if (actual_read_size_core <= 0)
4839 			err(-1, "%s: read perf %s: %ld", __func__, "core", actual_read_size_core);
4840 	}
4841 
4842 	if (cci->fd_perf_pkg != -1) {
4843 		/* Each descriptor read begins with number of counters read. */
4844 		expected_read_size += sizeof(unsigned long long);
4845 
4846 		actual_read_size_pkg = read(cci->fd_perf_pkg, &perf_data_pkg[0], sizeof(perf_data_pkg));
4847 
4848 		if (actual_read_size_pkg <= 0)
4849 			err(-1, "%s: read perf %s: %ld", __func__, "pkg", actual_read_size_pkg);
4850 	}
4851 
4852 	const ssize_t actual_read_size_total = actual_read_size_core + actual_read_size_pkg;
4853 
4854 	if (actual_read_size_total != expected_read_size)
4855 		err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size, actual_read_size_total);
4856 
4857 	/*
4858 	 * Copy ccstate and pcstate data into unified buffer.
4859 	 *
4860 	 * Skip first element from core and pkg buffers.
4861 	 * Kernel puts there how many counters were read.
4862 	 */
4863 	const size_t num_core_counters = perf_data_core[0];
4864 	const size_t num_pkg_counters = perf_data_pkg[0];
4865 
4866 	assert(num_perf_counters == num_core_counters + num_pkg_counters);
4867 
4868 	/* Copy ccstate perf data */
4869 	memcpy(&perf_data[0], &perf_data_core[1], num_core_counters * sizeof(unsigned long long));
4870 
4871 	/* Copy pcstate perf data */
4872 	memcpy(&perf_data[num_core_counters], &perf_data_pkg[1], num_pkg_counters * sizeof(unsigned long long));
4873 
4874 	for (unsigned int i = 0, pi = 0; i < NUM_CSTATE_COUNTERS; ++i) {
4875 		switch (cci->source[i]) {
4876 		case COUNTER_SOURCE_NONE:
4877 			break;
4878 
4879 		case COUNTER_SOURCE_PERF:
4880 			assert(pi < ARRAY_SIZE(perf_data));
4881 			assert(cci->fd_perf_core != -1 || cci->fd_perf_pkg != -1);
4882 
4883 			if (debug >= 2)
4884 				fprintf(stderr, "cstate via %s %u: %llu\n", "perf", i, perf_data[pi]);
4885 
4886 			cci->data[i] = perf_data[pi];
4887 
4888 			++pi;
4889 			break;
4890 
4891 		case COUNTER_SOURCE_MSR:
4892 			assert(!no_msr);
4893 			if (get_msr(cpu, cci->msr[i], &cci->data[i]))
4894 				return -13 - i;
4895 
4896 			if (debug >= 2)
4897 				fprintf(stderr, "cstate via %s0x%llx %u: %llu\n", "msr", cci->msr[i], i, cci->data[i]);
4898 
4899 			break;
4900 		}
4901 	}
4902 
4903 	/*
4904 	 * Helper to write the data only if the source of
4905 	 * the counter for the current cpu is not none.
4906 	 *
4907 	 * Otherwise we would overwrite core data with 0 (default value),
4908 	 * when invoked for the thread sibling.
4909 	 */
4910 #define PERF_COUNTER_WRITE_DATA(out_counter, index) do {	\
4911 	if (cci->source[index] != COUNTER_SOURCE_NONE)		\
4912 		out_counter = cci->data[index];			\
4913 } while (0)
4914 
4915 	BUILD_BUG_ON(NUM_CSTATE_COUNTERS != 11);
4916 
4917 	PERF_COUNTER_WRITE_DATA(t->c1, CCSTATE_RCI_INDEX_C1_RESIDENCY);
4918 	PERF_COUNTER_WRITE_DATA(c->c3, CCSTATE_RCI_INDEX_C3_RESIDENCY);
4919 	PERF_COUNTER_WRITE_DATA(c->c6, CCSTATE_RCI_INDEX_C6_RESIDENCY);
4920 	PERF_COUNTER_WRITE_DATA(c->c7, CCSTATE_RCI_INDEX_C7_RESIDENCY);
4921 
4922 	PERF_COUNTER_WRITE_DATA(p->pc2, PCSTATE_RCI_INDEX_C2_RESIDENCY);
4923 	PERF_COUNTER_WRITE_DATA(p->pc3, PCSTATE_RCI_INDEX_C3_RESIDENCY);
4924 	PERF_COUNTER_WRITE_DATA(p->pc6, PCSTATE_RCI_INDEX_C6_RESIDENCY);
4925 	PERF_COUNTER_WRITE_DATA(p->pc7, PCSTATE_RCI_INDEX_C7_RESIDENCY);
4926 	PERF_COUNTER_WRITE_DATA(p->pc8, PCSTATE_RCI_INDEX_C8_RESIDENCY);
4927 	PERF_COUNTER_WRITE_DATA(p->pc9, PCSTATE_RCI_INDEX_C9_RESIDENCY);
4928 	PERF_COUNTER_WRITE_DATA(p->pc10, PCSTATE_RCI_INDEX_C10_RESIDENCY);
4929 
4930 #undef PERF_COUNTER_WRITE_DATA
4931 
4932 	return 0;
4933 }
4934 
4935 size_t msr_counter_info_count_perf(const struct msr_counter_info_t *mci)
4936 {
4937 	size_t ret = 0;
4938 
4939 	for (int i = 0; i < NUM_MSR_COUNTERS; ++i)
4940 		if (mci->source[i] == COUNTER_SOURCE_PERF)
4941 			++ret;
4942 
4943 	return ret;
4944 }
4945 
4946 int get_smi_aperf_mperf(unsigned int cpu, struct thread_data *t)
4947 {
4948 	unsigned long long perf_data[NUM_MSR_COUNTERS + 1];
4949 
4950 	struct msr_counter_info_t *mci;
4951 
4952 	if (debug >= 2)
4953 		fprintf(stderr, "%s: cpu%d\n", __func__, cpu);
4954 
4955 	assert(msr_counter_info);
4956 	assert(cpu <= msr_counter_info_size);
4957 
4958 	mci = &msr_counter_info[cpu];
4959 
4960 	ZERO_ARRAY(perf_data);
4961 	ZERO_ARRAY(mci->data);
4962 
4963 	if (mci->fd_perf != -1) {
4964 		const size_t num_perf_counters = msr_counter_info_count_perf(mci);
4965 		const ssize_t expected_read_size = (num_perf_counters + 1) * sizeof(unsigned long long);
4966 		const ssize_t actual_read_size = read(mci->fd_perf, &perf_data[0], sizeof(perf_data));
4967 
4968 		if (actual_read_size != expected_read_size)
4969 			err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size,
4970 			    actual_read_size);
4971 	}
4972 
4973 	for (unsigned int i = 0, pi = 1; i < NUM_MSR_COUNTERS; ++i) {
4974 		switch (mci->source[i]) {
4975 		case COUNTER_SOURCE_NONE:
4976 			break;
4977 
4978 		case COUNTER_SOURCE_PERF:
4979 			assert(pi < ARRAY_SIZE(perf_data));
4980 			assert(mci->fd_perf != -1);
4981 
4982 			if (debug >= 2)
4983 				fprintf(stderr, "Reading msr counter via perf at %u: %llu\n", i, perf_data[pi]);
4984 
4985 			mci->data[i] = perf_data[pi];
4986 
4987 			++pi;
4988 			break;
4989 
4990 		case COUNTER_SOURCE_MSR:
4991 			assert(!no_msr);
4992 
4993 			if (get_msr(cpu, mci->msr[i], &mci->data[i]))
4994 				return -2 - i;
4995 
4996 			mci->data[i] &= mci->msr_mask[i];
4997 
4998 			if (debug >= 2)
4999 				fprintf(stderr, "Reading msr counter via msr at %u: %llu\n", i, mci->data[i]);
5000 
5001 			break;
5002 		}
5003 	}
5004 
5005 	BUILD_BUG_ON(NUM_MSR_COUNTERS != 3);
5006 	t->aperf = mci->data[MSR_RCI_INDEX_APERF];
5007 	t->mperf = mci->data[MSR_RCI_INDEX_MPERF];
5008 	t->smi_count = mci->data[MSR_RCI_INDEX_SMI];
5009 
5010 	return 0;
5011 }
5012 
5013 int perf_counter_info_read_values(struct perf_counter_info *pp, int cpu, unsigned long long *out, size_t out_size)
5014 {
5015 	unsigned int domain;
5016 	unsigned long long value;
5017 	int fd_counter;
5018 
5019 	for (size_t i = 0; pp; ++i, pp = pp->next) {
5020 		domain = cpu_to_domain(pp, cpu);
5021 		assert(domain < pp->num_domains);
5022 
5023 		fd_counter = pp->fd_perf_per_domain[domain];
5024 
5025 		if (fd_counter == -1)
5026 			continue;
5027 
5028 		if (read(fd_counter, &value, sizeof(value)) != sizeof(value))
5029 			return 1;
5030 
5031 		assert(i < out_size);
5032 		out[i] = value * pp->scale;
5033 	}
5034 
5035 	return 0;
5036 }
5037 
/*
 * pmt_gen_value_mask()
 * Build a mask with bits @lsb through @msb (inclusive) set.
 *
 * All shifts are done on unsigned long. Shifting the int constant 1
 * would be undefined for bit positions of 31 and above and would
 * produce wrong masks for fields located in the upper half of a
 * 64-bit value.
 *
 * An @msb of 63 or more selects every bit from @lsb upwards.
 * @lsb must be smaller than the width of unsigned long.
 */
unsigned long pmt_gen_value_mask(unsigned int lsb, unsigned int msb)
{
	unsigned long mask;

	/* (1UL << 64) is not representable, so special-case the top bit */
	if (msb >= 63)
		mask = ~0UL;
	else
		mask = (1UL << (msb + 1)) - 1;

	/* Clear the bits below lsb */
	mask -= (1UL << lsb) - 1;

	return mask;
}
5051 
5052 unsigned long pmt_read_counter(struct pmt_counter *ppmt, unsigned int domain_id)
5053 {
5054 	if (domain_id >= ppmt->num_domains)
5055 		return 0;
5056 
5057 	const unsigned long *pmmio = ppmt->domains[domain_id].pcounter;
5058 	const unsigned long value = pmmio ? *pmmio : 0;
5059 	const unsigned long value_mask = pmt_gen_value_mask(ppmt->lsb, ppmt->msb);
5060 	const unsigned long value_shift = ppmt->lsb;
5061 
5062 	return (value & value_mask) >> value_shift;
5063 }
5064 
5065 /* Rapl domain enumeration helpers */
5066 static inline int get_rapl_num_domains(void)
5067 {
5068 	int num_packages = topo.max_package_id + 1;
5069 	int num_cores_per_package;
5070 	int num_cores;
5071 
5072 	if (!platform->has_per_core_rapl)
5073 		return num_packages;
5074 
5075 	num_cores_per_package = topo.max_core_id + 1;
5076 	num_cores = num_cores_per_package * num_packages;
5077 
5078 	return num_cores;
5079 }
5080 
5081 static inline int get_rapl_domain_id(int cpu)
5082 {
5083 	int nr_cores_per_package = topo.max_core_id + 1;
5084 	int rapl_core_id;
5085 
5086 	if (!platform->has_per_core_rapl)
5087 		return cpus[cpu].physical_package_id;
5088 
5089 	/* Compute the system-wide unique core-id for @cpu */
5090 	rapl_core_id = cpus[cpu].physical_core_id;
5091 	rapl_core_id += cpus[cpu].physical_package_id * nr_cores_per_package;
5092 
5093 	return rapl_core_id;
5094 }
5095 
5096 /*
5097  * get_counters(...)
5098  * migrate to cpu
5099  * acquire and record local counters for that cpu
5100  */
5101 int get_counters(PER_THREAD_PARAMS)
5102 {
5103 	int cpu = t->cpu_id;
5104 	unsigned long long msr;
5105 	struct msr_counter *mp;
5106 	struct pmt_counter *pp;
5107 	int i;
5108 	int status;
5109 
5110 	if (cpu_migrate(cpu)) {
5111 		fprintf(outf, "%s: Could not migrate to CPU %d\n", __func__, cpu);
5112 		return -1;
5113 	}
5114 
5115 	gettimeofday(&t->tv_begin, (struct timezone *)NULL);
5116 
5117 	if (first_counter_read)
5118 		get_apic_id(t);
5119 
5120 	t->tsc = rdtsc();	/* we are running on local CPU of interest */
5121 
5122 	get_smi_aperf_mperf(cpu, t);
5123 
5124 	if (DO_BIC(BIC_IPC))
5125 		if (read(get_instr_count_fd(cpu), &t->instr_count, sizeof(long long)) != sizeof(long long))
5126 			return -4;
5127 
5128 	if (DO_BIC(BIC_IRQ))
5129 		t->irq_count = irqs_per_cpu[cpu];
5130 	if (DO_BIC(BIC_NMI))
5131 		t->nmi_count = nmi_per_cpu[cpu];
5132 
5133 	get_cstate_counters(cpu, t, c, p);
5134 
5135 	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
5136 		if (get_mp(cpu, mp, &t->counter[i], mp->sp->path))
5137 			return -10;
5138 	}
5139 
5140 	if (perf_counter_info_read_values(sys.perf_tp, cpu, t->perf_counter, MAX_ADDED_THREAD_COUNTERS))
5141 		return -10;
5142 
5143 	for (i = 0, pp = sys.pmt_tp; pp; i++, pp = pp->next)
5144 		t->pmt_counter[i] = pmt_read_counter(pp, t->cpu_id);
5145 
5146 	/* collect core counters only for 1st thread in core */
5147 	if (!is_cpu_first_thread_in_core(t, c, p))
5148 		goto done;
5149 
5150 	if (platform->has_per_core_rapl) {
5151 		status = get_rapl_counters(cpu, get_rapl_domain_id(cpu), c, p);
5152 		if (status != 0)
5153 			return status;
5154 	}
5155 
5156 	if (DO_BIC(BIC_CPU_c7) && t->is_atom) {
5157 		/*
5158 		 * For Atom CPUs that has core cstate deeper than c6,
5159 		 * MSR_CORE_C6_RESIDENCY returns residency of cc6 and deeper.
5160 		 * Minus CC7 (and deeper cstates) residency to get
5161 		 * accturate cc6 residency.
5162 		 */
5163 		c->c6 -= c->c7;
5164 	}
5165 
5166 	if (DO_BIC(BIC_Mod_c6))
5167 		if (get_msr(cpu, MSR_MODULE_C6_RES_MS, &c->mc6_us))
5168 			return -8;
5169 
5170 	if (DO_BIC(BIC_CoreTmp)) {
5171 		if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr))
5172 			return -9;
5173 		c->core_temp_c = tj_max - ((msr >> 16) & 0x7F);
5174 	}
5175 
5176 	if (DO_BIC(BIC_CORE_THROT_CNT))
5177 		get_core_throt_cnt(cpu, &c->core_throt_cnt);
5178 
5179 	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
5180 		if (get_mp(cpu, mp, &c->counter[i], mp->sp->path))
5181 			return -10;
5182 	}
5183 
5184 	if (perf_counter_info_read_values(sys.perf_cp, cpu, c->perf_counter, MAX_ADDED_CORE_COUNTERS))
5185 		return -10;
5186 
5187 	for (i = 0, pp = sys.pmt_cp; pp; i++, pp = pp->next)
5188 		c->pmt_counter[i] = pmt_read_counter(pp, c->core_id);
5189 
5190 	/* collect package counters only for 1st core in package */
5191 	if (!is_cpu_first_core_in_package(t, c, p))
5192 		goto done;
5193 
5194 	if (DO_BIC(BIC_Totl_c0)) {
5195 		if (get_msr(cpu, MSR_PKG_WEIGHTED_CORE_C0_RES, &p->pkg_wtd_core_c0))
5196 			return -10;
5197 	}
5198 	if (DO_BIC(BIC_Any_c0)) {
5199 		if (get_msr(cpu, MSR_PKG_ANY_CORE_C0_RES, &p->pkg_any_core_c0))
5200 			return -11;
5201 	}
5202 	if (DO_BIC(BIC_GFX_c0)) {
5203 		if (get_msr(cpu, MSR_PKG_ANY_GFXE_C0_RES, &p->pkg_any_gfxe_c0))
5204 			return -12;
5205 	}
5206 	if (DO_BIC(BIC_CPUGFX)) {
5207 		if (get_msr(cpu, MSR_PKG_BOTH_CORE_GFXE_C0_RES, &p->pkg_both_core_gfxe_c0))
5208 			return -13;
5209 	}
5210 
5211 	if (DO_BIC(BIC_CPU_LPI))
5212 		p->cpu_lpi = cpuidle_cur_cpu_lpi_us;
5213 	if (DO_BIC(BIC_SYS_LPI))
5214 		p->sys_lpi = cpuidle_cur_sys_lpi_us;
5215 
5216 	if (!platform->has_per_core_rapl) {
5217 		status = get_rapl_counters(cpu, get_rapl_domain_id(cpu), c, p);
5218 		if (status != 0)
5219 			return status;
5220 	}
5221 
5222 	if (DO_BIC(BIC_PkgTmp)) {
5223 		if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr))
5224 			return -17;
5225 		p->pkg_temp_c = tj_max - ((msr >> 16) & 0x7F);
5226 	}
5227 
5228 	if (DO_BIC(BIC_UNCORE_MHZ))
5229 		p->uncore_mhz = get_legacy_uncore_mhz(p->package_id);
5230 
5231 	if (DO_BIC(BIC_GFX_rc6))
5232 		p->gfx_rc6_ms = gfx_info[GFX_rc6].val_ull;
5233 
5234 	if (DO_BIC(BIC_GFXMHz))
5235 		p->gfx_mhz = gfx_info[GFX_MHz].val;
5236 
5237 	if (DO_BIC(BIC_GFXACTMHz))
5238 		p->gfx_act_mhz = gfx_info[GFX_ACTMHz].val;
5239 
5240 	if (DO_BIC(BIC_SAM_mc6))
5241 		p->sam_mc6_ms = gfx_info[SAM_mc6].val_ull;
5242 
5243 	if (DO_BIC(BIC_SAMMHz))
5244 		p->sam_mhz = gfx_info[SAM_MHz].val;
5245 
5246 	if (DO_BIC(BIC_SAMACTMHz))
5247 		p->sam_act_mhz = gfx_info[SAM_ACTMHz].val;
5248 
5249 	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
5250 		char *path = NULL;
5251 
5252 		if (mp->msr_num == 0) {
5253 			path = find_sysfs_path_by_id(mp->sp, p->package_id);
5254 			if (path == NULL) {
5255 				warnx("%s: package_id %d not found", __func__, p->package_id);
5256 				return -10;
5257 			}
5258 		}
5259 		if (get_mp(cpu, mp, &p->counter[i], path))
5260 			return -10;
5261 	}
5262 
5263 	if (perf_counter_info_read_values(sys.perf_pp, cpu, p->perf_counter, MAX_ADDED_PACKAGE_COUNTERS))
5264 		return -10;
5265 
5266 	for (i = 0, pp = sys.pmt_pp; pp; i++, pp = pp->next)
5267 		p->pmt_counter[i] = pmt_read_counter(pp, p->package_id);
5268 
5269 done:
5270 	gettimeofday(&t->tv_end, (struct timezone *)NULL);
5271 
5272 	return 0;
5273 }
5274 
5275 int pkg_cstate_limit = PCLUKN;
5276 char *pkg_cstate_limit_strings[] = { "unknown", "reserved", "pc0", "pc1", "pc2",
5277 	"pc3", "pc4", "pc6", "pc6n", "pc6r", "pc7", "pc7s", "pc8", "pc9", "pc10", "unlimited"
5278 };
5279 
5280 int nhm_pkg_cstate_limits[16] =
5281     { PCL__0, PCL__1, PCL__3, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
5282 	PCLRSV, PCLRSV
5283 };
5284 
5285 int snb_pkg_cstate_limits[16] =
5286     { PCL__0, PCL__2, PCL_6N, PCL_6R, PCL__7, PCL_7S, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
5287 	PCLRSV, PCLRSV
5288 };
5289 
5290 int hsw_pkg_cstate_limits[16] =
5291     { PCL__0, PCL__2, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
5292 	PCLRSV, PCLRSV
5293 };
5294 
5295 int slv_pkg_cstate_limits[16] =
5296     { PCL__0, PCL__1, PCLRSV, PCLRSV, PCL__4, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
5297 	PCL__6, PCL__7
5298 };
5299 
5300 int amt_pkg_cstate_limits[16] =
5301     { PCLUNL, PCL__1, PCL__2, PCLRSV, PCLRSV, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
5302 	PCLRSV, PCLRSV
5303 };
5304 
5305 int phi_pkg_cstate_limits[16] =
5306     { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
5307 	PCLRSV, PCLRSV
5308 };
5309 
5310 int glm_pkg_cstate_limits[16] =
5311     { PCLUNL, PCL__1, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCL_10, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
5312 	PCLRSV, PCLRSV
5313 };
5314 
5315 int skx_pkg_cstate_limits[16] =
5316     { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
5317 	PCLRSV, PCLRSV
5318 };
5319 
5320 int icx_pkg_cstate_limits[16] =
5321     { PCL__0, PCL__2, PCL__6, PCL__6, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
5322 	PCLRSV, PCLRSV
5323 };
5324 
5325 void probe_cst_limit(void)
5326 {
5327 	unsigned long long msr;
5328 	int *pkg_cstate_limits;
5329 
5330 	if (!platform->has_nhm_msrs || no_msr)
5331 		return;
5332 
5333 	switch (platform->cst_limit) {
5334 	case CST_LIMIT_NHM:
5335 		pkg_cstate_limits = nhm_pkg_cstate_limits;
5336 		break;
5337 	case CST_LIMIT_SNB:
5338 		pkg_cstate_limits = snb_pkg_cstate_limits;
5339 		break;
5340 	case CST_LIMIT_HSW:
5341 		pkg_cstate_limits = hsw_pkg_cstate_limits;
5342 		break;
5343 	case CST_LIMIT_SKX:
5344 		pkg_cstate_limits = skx_pkg_cstate_limits;
5345 		break;
5346 	case CST_LIMIT_ICX:
5347 		pkg_cstate_limits = icx_pkg_cstate_limits;
5348 		break;
5349 	case CST_LIMIT_SLV:
5350 		pkg_cstate_limits = slv_pkg_cstate_limits;
5351 		break;
5352 	case CST_LIMIT_AMT:
5353 		pkg_cstate_limits = amt_pkg_cstate_limits;
5354 		break;
5355 	case CST_LIMIT_KNL:
5356 		pkg_cstate_limits = phi_pkg_cstate_limits;
5357 		break;
5358 	case CST_LIMIT_GMT:
5359 		pkg_cstate_limits = glm_pkg_cstate_limits;
5360 		break;
5361 	default:
5362 		return;
5363 	}
5364 
5365 	get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr);
5366 	pkg_cstate_limit = pkg_cstate_limits[msr & 0xF];
5367 }
5368 
5369 static void dump_platform_info(void)
5370 {
5371 	unsigned long long msr;
5372 	unsigned int ratio;
5373 
5374 	if (!platform->has_nhm_msrs || no_msr)
5375 		return;
5376 
5377 	get_msr(base_cpu, MSR_PLATFORM_INFO, &msr);
5378 
5379 	fprintf(outf, "cpu%d: MSR_PLATFORM_INFO: 0x%08llx\n", base_cpu, msr);
5380 
5381 	ratio = (msr >> 40) & 0xFF;
5382 	fprintf(outf, "%d * %.1f = %.1f MHz max efficiency frequency\n", ratio, bclk, ratio * bclk);
5383 
5384 	ratio = (msr >> 8) & 0xFF;
5385 	fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", ratio, bclk, ratio * bclk);
5386 }
5387 
5388 static void dump_power_ctl(void)
5389 {
5390 	unsigned long long msr;
5391 
5392 	if (!platform->has_nhm_msrs || no_msr)
5393 		return;
5394 
5395 	get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr);
5396 	fprintf(outf, "cpu%d: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n",
5397 		base_cpu, msr, msr & 0x2 ? "EN" : "DIS");
5398 
5399 	/* C-state Pre-wake Disable (CSTATE_PREWAKE_DISABLE) */
5400 	if (platform->has_cst_prewake_bit)
5401 		fprintf(outf, "C-state Pre-wake: %sabled\n", msr & 0x40000000 ? "DIS" : "EN");
5402 
5403 	return;
5404 }
5405 
5406 static void dump_turbo_ratio_limit2(void)
5407 {
5408 	unsigned long long msr;
5409 	unsigned int ratio;
5410 
5411 	get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT2, &msr);
5412 
5413 	fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT2: 0x%08llx\n", base_cpu, msr);
5414 
5415 	ratio = (msr >> 8) & 0xFF;
5416 	if (ratio)
5417 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 18 active cores\n", ratio, bclk, ratio * bclk);
5418 
5419 	ratio = (msr >> 0) & 0xFF;
5420 	if (ratio)
5421 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 17 active cores\n", ratio, bclk, ratio * bclk);
5422 	return;
5423 }
5424 
5425 static void dump_turbo_ratio_limit1(void)
5426 {
5427 	unsigned long long msr;
5428 	unsigned int ratio;
5429 
5430 	get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT1, &msr);
5431 
5432 	fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", base_cpu, msr);
5433 
5434 	ratio = (msr >> 56) & 0xFF;
5435 	if (ratio)
5436 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 16 active cores\n", ratio, bclk, ratio * bclk);
5437 
5438 	ratio = (msr >> 48) & 0xFF;
5439 	if (ratio)
5440 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 15 active cores\n", ratio, bclk, ratio * bclk);
5441 
5442 	ratio = (msr >> 40) & 0xFF;
5443 	if (ratio)
5444 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 14 active cores\n", ratio, bclk, ratio * bclk);
5445 
5446 	ratio = (msr >> 32) & 0xFF;
5447 	if (ratio)
5448 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 13 active cores\n", ratio, bclk, ratio * bclk);
5449 
5450 	ratio = (msr >> 24) & 0xFF;
5451 	if (ratio)
5452 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 12 active cores\n", ratio, bclk, ratio * bclk);
5453 
5454 	ratio = (msr >> 16) & 0xFF;
5455 	if (ratio)
5456 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 11 active cores\n", ratio, bclk, ratio * bclk);
5457 
5458 	ratio = (msr >> 8) & 0xFF;
5459 	if (ratio)
5460 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 10 active cores\n", ratio, bclk, ratio * bclk);
5461 
5462 	ratio = (msr >> 0) & 0xFF;
5463 	if (ratio)
5464 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 9 active cores\n", ratio, bclk, ratio * bclk);
5465 	return;
5466 }
5467 
5468 static void dump_turbo_ratio_limits(int trl_msr_offset)
5469 {
5470 	unsigned long long msr, core_counts;
5471 	int shift;
5472 
5473 	get_msr(base_cpu, trl_msr_offset, &msr);
5474 	fprintf(outf, "cpu%d: MSR_%sTURBO_RATIO_LIMIT: 0x%08llx\n",
5475 		base_cpu, trl_msr_offset == MSR_SECONDARY_TURBO_RATIO_LIMIT ? "SECONDARY_" : "", msr);
5476 
5477 	if (platform->trl_msrs & TRL_CORECOUNT) {
5478 		get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT1, &core_counts);
5479 		fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", base_cpu, core_counts);
5480 	} else {
5481 		core_counts = 0x0807060504030201;
5482 	}
5483 
5484 	for (shift = 56; shift >= 0; shift -= 8) {
5485 		unsigned int ratio, group_size;
5486 
5487 		ratio = (msr >> shift) & 0xFF;
5488 		group_size = (core_counts >> shift) & 0xFF;
5489 		if (ratio)
5490 			fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n",
5491 				ratio, bclk, ratio * bclk, group_size);
5492 	}
5493 
5494 	return;
5495 }
5496 
5497 static void dump_atom_turbo_ratio_limits(void)
5498 {
5499 	unsigned long long msr;
5500 	unsigned int ratio;
5501 
5502 	get_msr(base_cpu, MSR_ATOM_CORE_RATIOS, &msr);
5503 	fprintf(outf, "cpu%d: MSR_ATOM_CORE_RATIOS: 0x%08llx\n", base_cpu, msr & 0xFFFFFFFF);
5504 
5505 	ratio = (msr >> 0) & 0x3F;
5506 	if (ratio)
5507 		fprintf(outf, "%d * %.1f = %.1f MHz minimum operating frequency\n", ratio, bclk, ratio * bclk);
5508 
5509 	ratio = (msr >> 8) & 0x3F;
5510 	if (ratio)
5511 		fprintf(outf, "%d * %.1f = %.1f MHz low frequency mode (LFM)\n", ratio, bclk, ratio * bclk);
5512 
5513 	ratio = (msr >> 16) & 0x3F;
5514 	if (ratio)
5515 		fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", ratio, bclk, ratio * bclk);
5516 
5517 	get_msr(base_cpu, MSR_ATOM_CORE_TURBO_RATIOS, &msr);
5518 	fprintf(outf, "cpu%d: MSR_ATOM_CORE_TURBO_RATIOS: 0x%08llx\n", base_cpu, msr & 0xFFFFFFFF);
5519 
5520 	ratio = (msr >> 24) & 0x3F;
5521 	if (ratio)
5522 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 4 active cores\n", ratio, bclk, ratio * bclk);
5523 
5524 	ratio = (msr >> 16) & 0x3F;
5525 	if (ratio)
5526 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 3 active cores\n", ratio, bclk, ratio * bclk);
5527 
5528 	ratio = (msr >> 8) & 0x3F;
5529 	if (ratio)
5530 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 2 active cores\n", ratio, bclk, ratio * bclk);
5531 
5532 	ratio = (msr >> 0) & 0x3F;
5533 	if (ratio)
5534 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 1 active core\n", ratio, bclk, ratio * bclk);
5535 }
5536 
5537 static void dump_knl_turbo_ratio_limits(void)
5538 {
5539 	const unsigned int buckets_no = 7;
5540 
5541 	unsigned long long msr;
5542 	int delta_cores, delta_ratio;
5543 	int i, b_nr;
5544 	unsigned int cores[buckets_no];
5545 	unsigned int ratio[buckets_no];
5546 
5547 	get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT, &msr);
5548 
5549 	fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", base_cpu, msr);
5550 
5551 	/*
5552 	 * Turbo encoding in KNL is as follows:
5553 	 * [0] -- Reserved
5554 	 * [7:1] -- Base value of number of active cores of bucket 1.
5555 	 * [15:8] -- Base value of freq ratio of bucket 1.
5556 	 * [20:16] -- +ve delta of number of active cores of bucket 2.
5557 	 * i.e. active cores of bucket 2 =
5558 	 * active cores of bucket 1 + delta
5559 	 * [23:21] -- Negative delta of freq ratio of bucket 2.
5560 	 * i.e. freq ratio of bucket 2 =
5561 	 * freq ratio of bucket 1 - delta
5562 	 * [28:24]-- +ve delta of number of active cores of bucket 3.
5563 	 * [31:29]-- -ve delta of freq ratio of bucket 3.
5564 	 * [36:32]-- +ve delta of number of active cores of bucket 4.
5565 	 * [39:37]-- -ve delta of freq ratio of bucket 4.
5566 	 * [44:40]-- +ve delta of number of active cores of bucket 5.
5567 	 * [47:45]-- -ve delta of freq ratio of bucket 5.
5568 	 * [52:48]-- +ve delta of number of active cores of bucket 6.
5569 	 * [55:53]-- -ve delta of freq ratio of bucket 6.
5570 	 * [60:56]-- +ve delta of number of active cores of bucket 7.
5571 	 * [63:61]-- -ve delta of freq ratio of bucket 7.
5572 	 */
5573 
5574 	b_nr = 0;
5575 	cores[b_nr] = (msr & 0xFF) >> 1;
5576 	ratio[b_nr] = (msr >> 8) & 0xFF;
5577 
5578 	for (i = 16; i < 64; i += 8) {
5579 		delta_cores = (msr >> i) & 0x1F;
5580 		delta_ratio = (msr >> (i + 5)) & 0x7;
5581 
5582 		cores[b_nr + 1] = cores[b_nr] + delta_cores;
5583 		ratio[b_nr + 1] = ratio[b_nr] - delta_ratio;
5584 		b_nr++;
5585 	}
5586 
5587 	for (i = buckets_no - 1; i >= 0; i--)
5588 		if (i > 0 ? ratio[i] != ratio[i - 1] : 1)
5589 			fprintf(outf,
5590 				"%d * %.1f = %.1f MHz max turbo %d active cores\n",
5591 				ratio[i], bclk, ratio[i] * bclk, cores[i]);
5592 }
5593 
5594 static void dump_cst_cfg(void)
5595 {
5596 	unsigned long long msr;
5597 
5598 	if (!platform->has_nhm_msrs || no_msr)
5599 		return;
5600 
5601 	get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr);
5602 
5603 	fprintf(outf, "cpu%d: MSR_PKG_CST_CONFIG_CONTROL: 0x%08llx", base_cpu, msr);
5604 
5605 	fprintf(outf, " (%s%s%s%s%slocked, pkg-cstate-limit=%d (%s)",
5606 		(msr & SNB_C3_AUTO_UNDEMOTE) ? "UNdemote-C3, " : "",
5607 		(msr & SNB_C1_AUTO_UNDEMOTE) ? "UNdemote-C1, " : "",
5608 		(msr & NHM_C3_AUTO_DEMOTE) ? "demote-C3, " : "",
5609 		(msr & NHM_C1_AUTO_DEMOTE) ? "demote-C1, " : "",
5610 		(msr & (1 << 15)) ? "" : "UN", (unsigned int)msr & 0xF, pkg_cstate_limit_strings[pkg_cstate_limit]);
5611 
5612 #define AUTOMATIC_CSTATE_CONVERSION		(1UL << 16)
5613 	if (platform->has_cst_auto_convension) {
5614 		fprintf(outf, ", automatic c-state conversion=%s", (msr & AUTOMATIC_CSTATE_CONVERSION) ? "on" : "off");
5615 	}
5616 
5617 	fprintf(outf, ")\n");
5618 
5619 	return;
5620 }
5621 
5622 static void dump_config_tdp(void)
5623 {
5624 	unsigned long long msr;
5625 
5626 	get_msr(base_cpu, MSR_CONFIG_TDP_NOMINAL, &msr);
5627 	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_NOMINAL: 0x%08llx", base_cpu, msr);
5628 	fprintf(outf, " (base_ratio=%d)\n", (unsigned int)msr & 0xFF);
5629 
5630 	get_msr(base_cpu, MSR_CONFIG_TDP_LEVEL_1, &msr);
5631 	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_LEVEL_1: 0x%08llx (", base_cpu, msr);
5632 	if (msr) {
5633 		fprintf(outf, "PKG_MIN_PWR_LVL1=%d ", (unsigned int)(msr >> 48) & 0x7FFF);
5634 		fprintf(outf, "PKG_MAX_PWR_LVL1=%d ", (unsigned int)(msr >> 32) & 0x7FFF);
5635 		fprintf(outf, "LVL1_RATIO=%d ", (unsigned int)(msr >> 16) & 0xFF);
5636 		fprintf(outf, "PKG_TDP_LVL1=%d", (unsigned int)(msr) & 0x7FFF);
5637 	}
5638 	fprintf(outf, ")\n");
5639 
5640 	get_msr(base_cpu, MSR_CONFIG_TDP_LEVEL_2, &msr);
5641 	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_LEVEL_2: 0x%08llx (", base_cpu, msr);
5642 	if (msr) {
5643 		fprintf(outf, "PKG_MIN_PWR_LVL2=%d ", (unsigned int)(msr >> 48) & 0x7FFF);
5644 		fprintf(outf, "PKG_MAX_PWR_LVL2=%d ", (unsigned int)(msr >> 32) & 0x7FFF);
5645 		fprintf(outf, "LVL2_RATIO=%d ", (unsigned int)(msr >> 16) & 0xFF);
5646 		fprintf(outf, "PKG_TDP_LVL2=%d", (unsigned int)(msr) & 0x7FFF);
5647 	}
5648 	fprintf(outf, ")\n");
5649 
5650 	get_msr(base_cpu, MSR_CONFIG_TDP_CONTROL, &msr);
5651 	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_CONTROL: 0x%08llx (", base_cpu, msr);
5652 	if ((msr) & 0x3)
5653 		fprintf(outf, "TDP_LEVEL=%d ", (unsigned int)(msr) & 0x3);
5654 	fprintf(outf, " lock=%d", (unsigned int)(msr >> 31) & 1);
5655 	fprintf(outf, ")\n");
5656 
5657 	get_msr(base_cpu, MSR_TURBO_ACTIVATION_RATIO, &msr);
5658 	fprintf(outf, "cpu%d: MSR_TURBO_ACTIVATION_RATIO: 0x%08llx (", base_cpu, msr);
5659 	fprintf(outf, "MAX_NON_TURBO_RATIO=%d", (unsigned int)(msr) & 0xFF);
5660 	fprintf(outf, " lock=%d", (unsigned int)(msr >> 31) & 1);
5661 	fprintf(outf, ")\n");
5662 }
5663 
5664 unsigned int irtl_time_units[] = { 1, 32, 1024, 32768, 1048576, 33554432, 0, 0 };
5665 
5666 void print_irtl(void)
5667 {
5668 	unsigned long long msr;
5669 
5670 	if (!platform->has_irtl_msrs || no_msr)
5671 		return;
5672 
5673 	if (platform->supported_cstates & PC3) {
5674 		get_msr(base_cpu, MSR_PKGC3_IRTL, &msr);
5675 		fprintf(outf, "cpu%d: MSR_PKGC3_IRTL: 0x%08llx (", base_cpu, msr);
5676 		fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
5677 			(msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
5678 	}
5679 
5680 	if (platform->supported_cstates & PC6) {
5681 		get_msr(base_cpu, MSR_PKGC6_IRTL, &msr);
5682 		fprintf(outf, "cpu%d: MSR_PKGC6_IRTL: 0x%08llx (", base_cpu, msr);
5683 		fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
5684 			(msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
5685 	}
5686 
5687 	if (platform->supported_cstates & PC7) {
5688 		get_msr(base_cpu, MSR_PKGC7_IRTL, &msr);
5689 		fprintf(outf, "cpu%d: MSR_PKGC7_IRTL: 0x%08llx (", base_cpu, msr);
5690 		fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
5691 			(msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
5692 	}
5693 
5694 	if (platform->supported_cstates & PC8) {
5695 		get_msr(base_cpu, MSR_PKGC8_IRTL, &msr);
5696 		fprintf(outf, "cpu%d: MSR_PKGC8_IRTL: 0x%08llx (", base_cpu, msr);
5697 		fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
5698 			(msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
5699 	}
5700 
5701 	if (platform->supported_cstates & PC9) {
5702 		get_msr(base_cpu, MSR_PKGC9_IRTL, &msr);
5703 		fprintf(outf, "cpu%d: MSR_PKGC9_IRTL: 0x%08llx (", base_cpu, msr);
5704 		fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
5705 			(msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
5706 	}
5707 
5708 	if (platform->supported_cstates & PC10) {
5709 		get_msr(base_cpu, MSR_PKGC10_IRTL, &msr);
5710 		fprintf(outf, "cpu%d: MSR_PKGC10_IRTL: 0x%08llx (", base_cpu, msr);
5711 		fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
5712 			(msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
5713 	}
5714 }
5715 
5716 void free_fd_percpu(void)
5717 {
5718 	int i;
5719 
5720 	if (!fd_percpu)
5721 		return;
5722 
5723 	for (i = 0; i < topo.max_cpu_num + 1; ++i) {
5724 		if (fd_percpu[i] != 0)
5725 			close(fd_percpu[i]);
5726 	}
5727 
5728 	free(fd_percpu);
5729 	fd_percpu = NULL;
5730 }
5731 
5732 void free_fd_instr_count_percpu(void)
5733 {
5734 	if (!fd_instr_count_percpu)
5735 		return;
5736 
5737 	for (int i = 0; i < topo.max_cpu_num + 1; ++i) {
5738 		if (fd_instr_count_percpu[i] != 0)
5739 			close(fd_instr_count_percpu[i]);
5740 	}
5741 
5742 	free(fd_instr_count_percpu);
5743 	fd_instr_count_percpu = NULL;
5744 }
5745 
5746 void free_fd_cstate(void)
5747 {
5748 	if (!ccstate_counter_info)
5749 		return;
5750 
5751 	const int counter_info_num = ccstate_counter_info_size;
5752 
5753 	for (int counter_id = 0; counter_id < counter_info_num; ++counter_id) {
5754 		if (ccstate_counter_info[counter_id].fd_perf_core != -1)
5755 			close(ccstate_counter_info[counter_id].fd_perf_core);
5756 
5757 		if (ccstate_counter_info[counter_id].fd_perf_pkg != -1)
5758 			close(ccstate_counter_info[counter_id].fd_perf_pkg);
5759 	}
5760 
5761 	free(ccstate_counter_info);
5762 	ccstate_counter_info = NULL;
5763 	ccstate_counter_info_size = 0;
5764 }
5765 
5766 void free_fd_msr(void)
5767 {
5768 	if (!msr_counter_info)
5769 		return;
5770 
5771 	for (int cpu = 0; cpu < topo.max_cpu_num; ++cpu) {
5772 		if (msr_counter_info[cpu].fd_perf != -1)
5773 			close(msr_counter_info[cpu].fd_perf);
5774 	}
5775 
5776 	free(msr_counter_info);
5777 	msr_counter_info = NULL;
5778 	msr_counter_info_size = 0;
5779 }
5780 
5781 void free_fd_rapl_percpu(void)
5782 {
5783 	if (!rapl_counter_info_perdomain)
5784 		return;
5785 
5786 	const int num_domains = rapl_counter_info_perdomain_size;
5787 
5788 	for (int domain_id = 0; domain_id < num_domains; ++domain_id) {
5789 		if (rapl_counter_info_perdomain[domain_id].fd_perf != -1)
5790 			close(rapl_counter_info_perdomain[domain_id].fd_perf);
5791 	}
5792 
5793 	free(rapl_counter_info_perdomain);
5794 	rapl_counter_info_perdomain = NULL;
5795 	rapl_counter_info_perdomain_size = 0;
5796 }
5797 
5798 void free_fd_added_perf_counters_(struct perf_counter_info *pp)
5799 {
5800 	if (!pp)
5801 		return;
5802 
5803 	if (!pp->fd_perf_per_domain)
5804 		return;
5805 
5806 	while (pp) {
5807 		for (size_t domain = 0; domain < pp->num_domains; ++domain) {
5808 			if (pp->fd_perf_per_domain[domain] != -1) {
5809 				close(pp->fd_perf_per_domain[domain]);
5810 				pp->fd_perf_per_domain[domain] = -1;
5811 			}
5812 		}
5813 
5814 		free(pp->fd_perf_per_domain);
5815 		pp->fd_perf_per_domain = NULL;
5816 
5817 		pp = pp->next;
5818 	}
5819 }
5820 
5821 void free_fd_added_perf_counters(void)
5822 {
5823 	free_fd_added_perf_counters_(sys.perf_tp);
5824 	free_fd_added_perf_counters_(sys.perf_cp);
5825 	free_fd_added_perf_counters_(sys.perf_pp);
5826 }
5827 
5828 void free_all_buffers(void)
5829 {
5830 	int i;
5831 
5832 	CPU_FREE(cpu_present_set);
5833 	cpu_present_set = NULL;
5834 	cpu_present_setsize = 0;
5835 
5836 	CPU_FREE(cpu_effective_set);
5837 	cpu_effective_set = NULL;
5838 	cpu_effective_setsize = 0;
5839 
5840 	CPU_FREE(cpu_allowed_set);
5841 	cpu_allowed_set = NULL;
5842 	cpu_allowed_setsize = 0;
5843 
5844 	CPU_FREE(cpu_affinity_set);
5845 	cpu_affinity_set = NULL;
5846 	cpu_affinity_setsize = 0;
5847 
5848 	free(thread_even);
5849 	free(core_even);
5850 	free(package_even);
5851 
5852 	thread_even = NULL;
5853 	core_even = NULL;
5854 	package_even = NULL;
5855 
5856 	free(thread_odd);
5857 	free(core_odd);
5858 	free(package_odd);
5859 
5860 	thread_odd = NULL;
5861 	core_odd = NULL;
5862 	package_odd = NULL;
5863 
5864 	free(output_buffer);
5865 	output_buffer = NULL;
5866 	outp = NULL;
5867 
5868 	free_fd_percpu();
5869 	free_fd_instr_count_percpu();
5870 	free_fd_msr();
5871 	free_fd_rapl_percpu();
5872 	free_fd_cstate();
5873 	free_fd_added_perf_counters();
5874 
5875 	free(irq_column_2_cpu);
5876 	free(irqs_per_cpu);
5877 	free(nmi_per_cpu);
5878 
5879 	for (i = 0; i <= topo.max_cpu_num; ++i) {
5880 		if (cpus[i].put_ids)
5881 			CPU_FREE(cpus[i].put_ids);
5882 	}
5883 	free(cpus);
5884 }
5885 
/*
 * parse_int_file(fmt, ...)
 * Build a path from the printf-style arguments and read one decimal
 * int from the start of that file.
 *
 * Return the parsed value, or 0 when the file cannot be opened.
 * Exit via err() when the file opens but does not start with a number.
 */
int parse_int_file(const char *fmt, ...)
{
	char path[PATH_MAX];
	va_list ap;
	FILE *fp;
	int value, nr_parsed;

	va_start(ap, fmt);
	vsnprintf(path, sizeof(path), fmt, ap);
	va_end(ap);

	fp = fopen(path, "r");
	if (fp == NULL)
		return 0;

	nr_parsed = fscanf(fp, "%d", &value);
	if (nr_parsed != 1)
		err(1, "%s: failed to parse number from file", path);

	fclose(fp);
	return value;
}
5909 
/*
 * cpu_is_first_core_in_package(cpu)
 * Compare cpu with the leading number of its core_siblings_list file.
 * Return 1 when they are equal, 0 otherwise.
 */
int cpu_is_first_core_in_package(int cpu)
{
	int first_sibling;

	first_sibling = parse_int_file("/sys/devices/system/cpu/cpu%d/topology/core_siblings_list", cpu);

	return first_sibling == cpu;
}
5918 
/* Return the value read from cpu's topology/physical_package_id sysfs file. */
int get_physical_package_id(int cpu)
{
	int pkg_id;

	pkg_id = parse_int_file("/sys/devices/system/cpu/cpu%d/topology/physical_package_id", cpu);
	return pkg_id;
}
5923 
/* Return the value read from cpu's topology/die_id sysfs file. */
int get_die_id(int cpu)
{
	int die_id;

	die_id = parse_int_file("/sys/devices/system/cpu/cpu%d/topology/die_id", cpu);
	return die_id;
}
5928 
/* Return the value read from cpu's cache/index3/id sysfs file. */
int get_l3_id(int cpu)
{
	int l3_id;

	l3_id = parse_int_file("/sys/devices/system/cpu/cpu%d/cache/index3/id", cpu);
	return l3_id;
}
5933 
/* Return the value read from cpu's topology/core_id sysfs file. */
int get_core_id(int cpu)
{
	int core_id;

	core_id = parse_int_file("/sys/devices/system/cpu/cpu%d/topology/core_id", cpu);
	return core_id;
}
5938 
5939 void set_node_data(void)
5940 {
5941 	int pkg, node, lnode, cpu, cpux;
5942 	int cpu_count;
5943 
5944 	/* initialize logical_node_id */
5945 	for (cpu = 0; cpu <= topo.max_cpu_num; ++cpu)
5946 		cpus[cpu].logical_node_id = -1;
5947 
5948 	cpu_count = 0;
5949 	for (pkg = 0; pkg < topo.num_packages; pkg++) {
5950 		lnode = 0;
5951 		for (cpu = 0; cpu <= topo.max_cpu_num; ++cpu) {
5952 			if (cpus[cpu].physical_package_id != pkg)
5953 				continue;
5954 			/* find a cpu with an unset logical_node_id */
5955 			if (cpus[cpu].logical_node_id != -1)
5956 				continue;
5957 			cpus[cpu].logical_node_id = lnode;
5958 			node = cpus[cpu].physical_node_id;
5959 			cpu_count++;
5960 			/*
5961 			 * find all matching cpus on this pkg and set
5962 			 * the logical_node_id
5963 			 */
5964 			for (cpux = cpu; cpux <= topo.max_cpu_num; cpux++) {
5965 				if ((cpus[cpux].physical_package_id == pkg) && (cpus[cpux].physical_node_id == node)) {
5966 					cpus[cpux].logical_node_id = lnode;
5967 					cpu_count++;
5968 				}
5969 			}
5970 			lnode++;
5971 			if (lnode > topo.nodes_per_pkg)
5972 				topo.nodes_per_pkg = lnode;
5973 		}
5974 		if (cpu_count >= topo.max_cpu_num)
5975 			break;
5976 	}
5977 }
5978 
5979 int get_physical_node_id(struct cpu_topology *thiscpu)
5980 {
5981 	char path[80];
5982 	FILE *filep;
5983 	int i;
5984 	int cpu = thiscpu->logical_cpu_id;
5985 
5986 	for (i = 0; i <= topo.max_cpu_num; i++) {
5987 		sprintf(path, "/sys/devices/system/cpu/cpu%d/node%i/cpulist", cpu, i);
5988 		filep = fopen(path, "r");
5989 		if (!filep)
5990 			continue;
5991 		fclose(filep);
5992 		return i;
5993 	}
5994 	return -1;
5995 }
5996 
5997 static int parse_cpu_str(char *cpu_str, cpu_set_t *cpu_set, int cpu_set_size)
5998 {
5999 	unsigned int start, end;
6000 	char *next = cpu_str;
6001 
6002 	while (next && *next) {
6003 
6004 		if (*next == '-')	/* no negative cpu numbers */
6005 			return 1;
6006 
6007 		if (*next == '\0' || *next == '\n')
6008 			break;
6009 
6010 		start = strtoul(next, &next, 10);
6011 
6012 		if (start >= CPU_SUBSET_MAXCPUS)
6013 			return 1;
6014 		CPU_SET_S(start, cpu_set_size, cpu_set);
6015 
6016 		if (*next == '\0' || *next == '\n')
6017 			break;
6018 
6019 		if (*next == ',') {
6020 			next += 1;
6021 			continue;
6022 		}
6023 
6024 		if (*next == '-') {
6025 			next += 1;	/* start range */
6026 		} else if (*next == '.') {
6027 			next += 1;
6028 			if (*next == '.')
6029 				next += 1;	/* start range */
6030 			else
6031 				return 1;
6032 		}
6033 
6034 		end = strtoul(next, &next, 10);
6035 		if (end <= start)
6036 			return 1;
6037 
6038 		while (++start <= end) {
6039 			if (start >= CPU_SUBSET_MAXCPUS)
6040 				return 1;
6041 			CPU_SET_S(start, cpu_set_size, cpu_set);
6042 		}
6043 
6044 		if (*next == ',')
6045 			next += 1;
6046 		else if (*next != '\0' && *next != '\n')
6047 			return 1;
6048 	}
6049 
6050 	return 0;
6051 }
6052 
6053 int get_thread_siblings(struct cpu_topology *thiscpu)
6054 {
6055 	char path[80], character;
6056 	FILE *filep;
6057 	unsigned long map;
6058 	int so, shift, sib_core;
6059 	int cpu = thiscpu->logical_cpu_id;
6060 	int offset = topo.max_cpu_num + 1;
6061 	size_t size;
6062 	int thread_id = 0;
6063 
6064 	thiscpu->put_ids = CPU_ALLOC((topo.max_cpu_num + 1));
6065 	if (thiscpu->thread_id < 0)
6066 		thiscpu->thread_id = thread_id++;
6067 	if (!thiscpu->put_ids)
6068 		return -1;
6069 
6070 	size = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
6071 	CPU_ZERO_S(size, thiscpu->put_ids);
6072 
6073 	sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpu);
6074 	filep = fopen(path, "r");
6075 
6076 	if (!filep) {
6077 		warnx("%s: open failed", path);
6078 		return -1;
6079 	}
6080 	do {
6081 		offset -= BITMASK_SIZE;
6082 		if (fscanf(filep, "%lx%c", &map, &character) != 2)
6083 			err(1, "%s: failed to parse file", path);
6084 		for (shift = 0; shift < BITMASK_SIZE; shift++) {
6085 			if ((map >> shift) & 0x1) {
6086 				so = shift + offset;
6087 				sib_core = get_core_id(so);
6088 				if (sib_core == thiscpu->physical_core_id) {
6089 					CPU_SET_S(so, size, thiscpu->put_ids);
6090 					if ((so != cpu) && (cpus[so].thread_id < 0))
6091 						cpus[so].thread_id = thread_id++;
6092 				}
6093 			}
6094 		}
6095 	} while (character == ',');
6096 	fclose(filep);
6097 
6098 	return CPU_COUNT_S(size, thiscpu->put_ids);
6099 }
6100 
6101 /*
6102  * run func(thread, core, package) in topology order
6103  * skip non-present cpus
6104  */
6105 
6106 int for_all_cpus_2(int (func) (struct thread_data *, struct core_data *,
6107 			       struct pkg_data *, struct thread_data *, struct core_data *,
6108 			       struct pkg_data *), struct thread_data *thread_base,
6109 		   struct core_data *core_base, struct pkg_data *pkg_base,
6110 		   struct thread_data *thread_base2, struct core_data *core_base2, struct pkg_data *pkg_base2)
6111 {
6112 	int retval, pkg_no, node_no, core_no, thread_no;
6113 
6114 	retval = 0;
6115 
6116 	for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) {
6117 		for (node_no = 0; node_no < topo.nodes_per_pkg; ++node_no) {
6118 			for (core_no = 0; core_no < topo.cores_per_node; ++core_no) {
6119 				for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) {
6120 					struct thread_data *t, *t2;
6121 					struct core_data *c, *c2;
6122 
6123 					t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no);
6124 
6125 					if (cpu_is_not_allowed(t->cpu_id))
6126 						continue;
6127 
6128 					t2 = GET_THREAD(thread_base2, thread_no, core_no, node_no, pkg_no);
6129 
6130 					c = GET_CORE(core_base, core_no, node_no, pkg_no);
6131 					c2 = GET_CORE(core_base2, core_no, node_no, pkg_no);
6132 
6133 					retval |= func(t, c, &pkg_base[pkg_no], t2, c2, &pkg_base2[pkg_no]);
6134 				}
6135 			}
6136 		}
6137 	}
6138 	return retval;
6139 }
6140 
6141 /*
6142  * run func(cpu) on every cpu in /proc/stat
6143  * return max_cpu number
6144  */
6145 int for_all_proc_cpus(int (func) (int))
6146 {
6147 	FILE *fp;
6148 	int cpu_num;
6149 	int retval;
6150 
6151 	fp = fopen_or_die(proc_stat, "r");
6152 
6153 	retval = fscanf(fp, "cpu %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d\n");
6154 	if (retval != 0)
6155 		err(1, "%s: failed to parse format", proc_stat);
6156 
6157 	while (1) {
6158 		retval = fscanf(fp, "cpu%u %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d\n", &cpu_num);
6159 		if (retval != 1)
6160 			break;
6161 
6162 		retval = func(cpu_num);
6163 		if (retval) {
6164 			fclose(fp);
6165 			return (retval);
6166 		}
6167 	}
6168 	fclose(fp);
6169 	return 0;
6170 }
6171 
#define PATH_EFFECTIVE_CPUS	"/sys/fs/cgroup/cpuset.cpus.effective"

/* last line read from PATH_EFFECTIVE_CPUS; empty until a read succeeds */
static char cpu_effective_str[1024];

/*
 * update_effective_str()
 * Re-read the first line of PATH_EFFECTIVE_CPUS into cpu_effective_str.
 *
 * Return 1 when the cached string was replaced by different content.
 * Return 0 when it is unchanged, when the file cannot be opened, or
 * when called with startup == false while the cache is still empty.
 * Exit via err() when the file opens but no line can be read.
 */
static int update_effective_str(bool startup)
{
	char line[1024];
	FILE *fp;

	/* outside of startup, an empty cache means there is nothing to refresh */
	if (!startup && cpu_effective_str[0] == '\0')
		return 0;

	fp = fopen(PATH_EFFECTIVE_CPUS, "r");
	if (fp == NULL)
		return 0;

	if (fgets(line, 1024, fp) == NULL)
		err(1, "%s: file read failed\n", PATH_EFFECTIVE_CPUS);

	fclose(fp);

	if (strncmp(cpu_effective_str, line, 1024) == 0)
		return 0;

	strncpy(cpu_effective_str, line, 1024);
	return 1;
}
6203 
6204 static void update_effective_set(bool startup)
6205 {
6206 	update_effective_str(startup);
6207 
6208 	if (parse_cpu_str(cpu_effective_str, cpu_effective_set, cpu_effective_setsize))
6209 		err(1, "%s: cpu str malformat %s\n", PATH_EFFECTIVE_CPUS, cpu_effective_str);
6210 }
6211 
6212 void linux_perf_init(void);
6213 void msr_perf_init(void);
6214 void rapl_perf_init(void);
6215 void cstate_perf_init(void);
6216 void added_perf_counters_init(void);
6217 void pmt_init(void);
6218 
6219 void re_initialize(void)
6220 {
6221 	free_all_buffers();
6222 	setup_all_buffers(false);
6223 	linux_perf_init();
6224 	msr_perf_init();
6225 	rapl_perf_init();
6226 	cstate_perf_init();
6227 	added_perf_counters_init();
6228 	pmt_init();
6229 	fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus,
6230 		topo.allowed_cpus);
6231 }
6232 
6233 void set_max_cpu_num(void)
6234 {
6235 	FILE *filep;
6236 	int base_cpu;
6237 	unsigned long dummy;
6238 	char pathname[64];
6239 
6240 	base_cpu = sched_getcpu();
6241 	if (base_cpu < 0)
6242 		err(1, "cannot find calling cpu ID");
6243 	sprintf(pathname, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", base_cpu);
6244 
6245 	filep = fopen_or_die(pathname, "r");
6246 	topo.max_cpu_num = 0;
6247 	while (fscanf(filep, "%lx,", &dummy) == 1)
6248 		topo.max_cpu_num += BITMASK_SIZE;
6249 	fclose(filep);
6250 	topo.max_cpu_num--;	/* 0 based */
6251 }
6252 
6253 /*
6254  * count_cpus()
6255  * remember the last one seen, it will be the max
6256  */
6257 int count_cpus(int cpu)
6258 {
6259 	UNUSED(cpu);
6260 
6261 	topo.num_cpus++;
6262 	return 0;
6263 }
6264 
6265 int mark_cpu_present(int cpu)
6266 {
6267 	CPU_SET_S(cpu, cpu_present_setsize, cpu_present_set);
6268 	return 0;
6269 }
6270 
6271 int init_thread_id(int cpu)
6272 {
6273 	cpus[cpu].thread_id = -1;
6274 	return 0;
6275 }
6276 
6277 int set_my_cpu_type(void)
6278 {
6279 	unsigned int eax, ebx, ecx, edx;
6280 	unsigned int max_level;
6281 
6282 	__cpuid(0, max_level, ebx, ecx, edx);
6283 
6284 	if (max_level < CPUID_LEAF_MODEL_ID)
6285 		return 0;
6286 
6287 	__cpuid(CPUID_LEAF_MODEL_ID, eax, ebx, ecx, edx);
6288 
6289 	return (eax >> CPUID_LEAF_MODEL_ID_CORE_TYPE_SHIFT);
6290 }
6291 
6292 int set_cpu_hybrid_type(int cpu)
6293 {
6294 	if (cpu_migrate(cpu))
6295 		return -1;
6296 
6297 	int type = set_my_cpu_type();
6298 
6299 	cpus[cpu].type = type;
6300 	return 0;
6301 }
6302 
6303 /*
6304  * snapshot_proc_interrupts()
6305  *
6306  * read and record summary of /proc/interrupts
6307  *
6308  * return 1 if config change requires a restart, else return 0
6309  */
6310 int snapshot_proc_interrupts(void)
6311 {
6312 	static FILE *fp;
6313 	int column, retval;
6314 
6315 	if (fp == NULL)
6316 		fp = fopen_or_die("/proc/interrupts", "r");
6317 	else
6318 		rewind(fp);
6319 
6320 	/* read 1st line of /proc/interrupts to get cpu* name for each column */
6321 	for (column = 0; column < topo.num_cpus; ++column) {
6322 		int cpu_number;
6323 
6324 		retval = fscanf(fp, " CPU%d", &cpu_number);
6325 		if (retval != 1)
6326 			break;
6327 
6328 		if (cpu_number > topo.max_cpu_num) {
6329 			warn("/proc/interrupts: cpu%d: > %d", cpu_number, topo.max_cpu_num);
6330 			return 1;
6331 		}
6332 
6333 		irq_column_2_cpu[column] = cpu_number;
6334 		irqs_per_cpu[cpu_number] = 0;
6335 		nmi_per_cpu[cpu_number] = 0;
6336 	}
6337 
6338 	/* read /proc/interrupt count lines and sum up irqs per cpu */
6339 	while (1) {
6340 		int column;
6341 		char buf[64];
6342 		int this_row_is_nmi = 0;
6343 
6344 		retval = fscanf(fp, " %s:", buf);	/* irq# "N:" */
6345 		if (retval != 1)
6346 			break;
6347 
6348 		if (strncmp(buf, "NMI", strlen("NMI")) == 0)
6349 			this_row_is_nmi = 1;
6350 
6351 		/* read the count per cpu */
6352 		for (column = 0; column < topo.num_cpus; ++column) {
6353 
6354 			int cpu_number, irq_count;
6355 
6356 			retval = fscanf(fp, " %d", &irq_count);
6357 
6358 			if (retval != 1)
6359 				break;
6360 
6361 			cpu_number = irq_column_2_cpu[column];
6362 			irqs_per_cpu[cpu_number] += irq_count;
6363 			if (this_row_is_nmi)
6364 				nmi_per_cpu[cpu_number] += irq_count;
6365 		}
6366 		while (getc(fp) != '\n') ;	/* flush interrupt description */
6367 
6368 	}
6369 	return 0;
6370 }
6371 
6372 /*
6373  * snapshot_graphics()
6374  *
6375  * record snapshot of specified graphics sysfs knob
6376  *
6377  * return 1 if config change requires a restart, else return 0
6378  */
6379 int snapshot_graphics(int idx)
6380 {
6381 	int retval;
6382 
6383 	rewind(gfx_info[idx].fp);
6384 	fflush(gfx_info[idx].fp);
6385 
6386 	switch (idx) {
6387 	case GFX_rc6:
6388 	case SAM_mc6:
6389 		retval = fscanf(gfx_info[idx].fp, "%lld", &gfx_info[idx].val_ull);
6390 		if (retval != 1)
6391 			err(1, "rc6");
6392 		return 0;
6393 	case GFX_MHz:
6394 	case GFX_ACTMHz:
6395 	case SAM_MHz:
6396 	case SAM_ACTMHz:
6397 		retval = fscanf(gfx_info[idx].fp, "%d", &gfx_info[idx].val);
6398 		if (retval != 1)
6399 			err(1, "MHz");
6400 		return 0;
6401 	default:
6402 		return -EINVAL;
6403 	}
6404 }
6405 
6406 /*
6407  * snapshot_cpu_lpi()
6408  *
6409  * record snapshot of
6410  * /sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us
6411  */
6412 int snapshot_cpu_lpi_us(void)
6413 {
6414 	FILE *fp;
6415 	int retval;
6416 
6417 	fp = fopen_or_die("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", "r");
6418 
6419 	retval = fscanf(fp, "%lld", &cpuidle_cur_cpu_lpi_us);
6420 	if (retval != 1) {
6421 		fprintf(stderr, "Disabling Low Power Idle CPU output\n");
6422 		BIC_NOT_PRESENT(BIC_CPU_LPI);
6423 		fclose(fp);
6424 		return -1;
6425 	}
6426 
6427 	fclose(fp);
6428 
6429 	return 0;
6430 }
6431 
6432 /*
6433  * snapshot_sys_lpi()
6434  *
6435  * record snapshot of sys_lpi_file
6436  */
6437 int snapshot_sys_lpi_us(void)
6438 {
6439 	FILE *fp;
6440 	int retval;
6441 
6442 	fp = fopen_or_die(sys_lpi_file, "r");
6443 
6444 	retval = fscanf(fp, "%lld", &cpuidle_cur_sys_lpi_us);
6445 	if (retval != 1) {
6446 		fprintf(stderr, "Disabling Low Power Idle System output\n");
6447 		BIC_NOT_PRESENT(BIC_SYS_LPI);
6448 		fclose(fp);
6449 		return -1;
6450 	}
6451 	fclose(fp);
6452 
6453 	return 0;
6454 }
6455 
6456 /*
6457  * snapshot /proc and /sys files
6458  *
6459  * return 1 if configuration restart needed, else return 0
6460  */
6461 int snapshot_proc_sysfs_files(void)
6462 {
6463 	gettimeofday(&procsysfs_tv_begin, (struct timezone *)NULL);
6464 
6465 	if (DO_BIC(BIC_IRQ) || DO_BIC(BIC_NMI))
6466 		if (snapshot_proc_interrupts())
6467 			return 1;
6468 
6469 	if (DO_BIC(BIC_GFX_rc6))
6470 		snapshot_graphics(GFX_rc6);
6471 
6472 	if (DO_BIC(BIC_GFXMHz))
6473 		snapshot_graphics(GFX_MHz);
6474 
6475 	if (DO_BIC(BIC_GFXACTMHz))
6476 		snapshot_graphics(GFX_ACTMHz);
6477 
6478 	if (DO_BIC(BIC_SAM_mc6))
6479 		snapshot_graphics(SAM_mc6);
6480 
6481 	if (DO_BIC(BIC_SAMMHz))
6482 		snapshot_graphics(SAM_MHz);
6483 
6484 	if (DO_BIC(BIC_SAMACTMHz))
6485 		snapshot_graphics(SAM_ACTMHz);
6486 
6487 	if (DO_BIC(BIC_CPU_LPI))
6488 		snapshot_cpu_lpi_us();
6489 
6490 	if (DO_BIC(BIC_SYS_LPI))
6491 		snapshot_sys_lpi_us();
6492 
6493 	return 0;
6494 }
6495 
6496 int exit_requested;
6497 
6498 static void signal_handler(int signal)
6499 {
6500 	switch (signal) {
6501 	case SIGINT:
6502 		exit_requested = 1;
6503 		if (debug)
6504 			fprintf(stderr, " SIGINT\n");
6505 		break;
6506 	case SIGUSR1:
6507 		if (debug > 1)
6508 			fprintf(stderr, "SIGUSR1\n");
6509 		break;
6510 	}
6511 }
6512 
6513 void setup_signal_handler(void)
6514 {
6515 	struct sigaction sa;
6516 
6517 	memset(&sa, 0, sizeof(sa));
6518 
6519 	sa.sa_handler = &signal_handler;
6520 
6521 	if (sigaction(SIGINT, &sa, NULL) < 0)
6522 		err(1, "sigaction SIGINT");
6523 	if (sigaction(SIGUSR1, &sa, NULL) < 0)
6524 		err(1, "sigaction SIGUSR1");
6525 }
6526 
6527 void do_sleep(void)
6528 {
6529 	struct timeval tout;
6530 	struct timespec rest;
6531 	fd_set readfds;
6532 	int retval;
6533 
6534 	FD_ZERO(&readfds);
6535 	FD_SET(0, &readfds);
6536 
6537 	if (ignore_stdin) {
6538 		nanosleep(&interval_ts, NULL);
6539 		return;
6540 	}
6541 
6542 	tout = interval_tv;
6543 	retval = select(1, &readfds, NULL, NULL, &tout);
6544 
6545 	if (retval == 1) {
6546 		switch (getc(stdin)) {
6547 		case 'q':
6548 			exit_requested = 1;
6549 			break;
6550 		case EOF:
6551 			/*
6552 			 * 'stdin' is a pipe closed on the other end. There
6553 			 * won't be any further input.
6554 			 */
6555 			ignore_stdin = 1;
6556 			/* Sleep the rest of the time */
6557 			rest.tv_sec = (tout.tv_sec + tout.tv_usec / 1000000);
6558 			rest.tv_nsec = (tout.tv_usec % 1000000) * 1000;
6559 			nanosleep(&rest, NULL);
6560 		}
6561 	}
6562 }
6563 
6564 int get_msr_sum(int cpu, off_t offset, unsigned long long *msr)
6565 {
6566 	int ret, idx;
6567 	unsigned long long msr_cur, msr_last;
6568 
6569 	assert(!no_msr);
6570 
6571 	if (!per_cpu_msr_sum)
6572 		return 1;
6573 
6574 	idx = offset_to_idx(offset);
6575 	if (idx < 0)
6576 		return idx;
6577 	/* get_msr_sum() = sum + (get_msr() - last) */
6578 	ret = get_msr(cpu, offset, &msr_cur);
6579 	if (ret)
6580 		return ret;
6581 	msr_last = per_cpu_msr_sum[cpu].entries[idx].last;
6582 	DELTA_WRAP32(msr_cur, msr_last);
6583 	*msr = msr_last + per_cpu_msr_sum[cpu].entries[idx].sum;
6584 
6585 	return 0;
6586 }
6587 
6588 timer_t timerid;
6589 
6590 /* Timer callback, update the sum of MSRs periodically. */
6591 static int update_msr_sum(PER_THREAD_PARAMS)
6592 {
6593 	int i, ret;
6594 	int cpu = t->cpu_id;
6595 
6596 	UNUSED(c);
6597 	UNUSED(p);
6598 
6599 	assert(!no_msr);
6600 
6601 	for (i = IDX_PKG_ENERGY; i < IDX_COUNT; i++) {
6602 		unsigned long long msr_cur, msr_last;
6603 		off_t offset;
6604 
6605 		if (!idx_valid(i))
6606 			continue;
6607 		offset = idx_to_offset(i);
6608 		if (offset < 0)
6609 			continue;
6610 		ret = get_msr(cpu, offset, &msr_cur);
6611 		if (ret) {
6612 			fprintf(outf, "Can not update msr(0x%llx)\n", (unsigned long long)offset);
6613 			continue;
6614 		}
6615 
6616 		msr_last = per_cpu_msr_sum[cpu].entries[i].last;
6617 		per_cpu_msr_sum[cpu].entries[i].last = msr_cur & 0xffffffff;
6618 
6619 		DELTA_WRAP32(msr_cur, msr_last);
6620 		per_cpu_msr_sum[cpu].entries[i].sum += msr_last;
6621 	}
6622 	return 0;
6623 }
6624 
6625 static void msr_record_handler(union sigval v)
6626 {
6627 	UNUSED(v);
6628 
6629 	for_all_cpus(update_msr_sum, EVEN_COUNTERS);
6630 }
6631 
6632 void msr_sum_record(void)
6633 {
6634 	struct itimerspec its;
6635 	struct sigevent sev;
6636 
6637 	per_cpu_msr_sum = calloc(topo.max_cpu_num + 1, sizeof(struct msr_sum_array));
6638 	if (!per_cpu_msr_sum) {
6639 		fprintf(outf, "Can not allocate memory for long time MSR.\n");
6640 		return;
6641 	}
6642 	/*
6643 	 * Signal handler might be restricted, so use thread notifier instead.
6644 	 */
6645 	memset(&sev, 0, sizeof(struct sigevent));
6646 	sev.sigev_notify = SIGEV_THREAD;
6647 	sev.sigev_notify_function = msr_record_handler;
6648 
6649 	sev.sigev_value.sival_ptr = &timerid;
6650 	if (timer_create(CLOCK_REALTIME, &sev, &timerid) == -1) {
6651 		fprintf(outf, "Can not create timer.\n");
6652 		goto release_msr;
6653 	}
6654 
6655 	its.it_value.tv_sec = 0;
6656 	its.it_value.tv_nsec = 1;
6657 	/*
6658 	 * A wraparound time has been calculated early.
6659 	 * Some sources state that the peak power for a
6660 	 * microprocessor is usually 1.5 times the TDP rating,
6661 	 * use 2 * TDP for safety.
6662 	 */
6663 	its.it_interval.tv_sec = rapl_joule_counter_range / 2;
6664 	its.it_interval.tv_nsec = 0;
6665 
6666 	if (timer_settime(timerid, 0, &its, NULL) == -1) {
6667 		fprintf(outf, "Can not set timer.\n");
6668 		goto release_timer;
6669 	}
6670 	return;
6671 
6672 release_timer:
6673 	timer_delete(timerid);
6674 release_msr:
6675 	free(per_cpu_msr_sum);
6676 }
6677 
/*
 * set_my_sched_priority(pri)
 * Set the nice value of the calling process and read it back to make
 * sure it took effect.
 * return previous priority on success
 * return value < -20 on failure
 */
int set_my_sched_priority(int priority)
{
	int original_priority;

	/* -1 is a legal nice value, so errno is what tells an error apart */
	errno = 0;
	original_priority = getpriority(PRIO_PROCESS, 0);
	if ((original_priority == -1) && errno)
		return -21;

	if (setpriority(PRIO_PROCESS, 0, priority) != 0)
		return -21;

	errno = 0;
	if (getpriority(PRIO_PROCESS, 0) != priority)
		return -21;

	return original_priority;
}
6704 
6705 void turbostat_loop()
6706 {
6707 	int retval;
6708 	int restarted = 0;
6709 	unsigned int done_iters = 0;
6710 
6711 	setup_signal_handler();
6712 
6713 	/*
6714 	 * elevate own priority for interval mode
6715 	 *
6716 	 * ignore on error - we probably don't have permission to set it, but
6717 	 * it's not a big deal
6718 	 */
6719 	set_my_sched_priority(-20);
6720 
6721 restart:
6722 	restarted++;
6723 
6724 	snapshot_proc_sysfs_files();
6725 	retval = for_all_cpus(get_counters, EVEN_COUNTERS);
6726 	first_counter_read = 0;
6727 	if (retval < -1) {
6728 		exit(retval);
6729 	} else if (retval == -1) {
6730 		if (restarted > 10) {
6731 			exit(retval);
6732 		}
6733 		re_initialize();
6734 		goto restart;
6735 	}
6736 	restarted = 0;
6737 	done_iters = 0;
6738 	gettimeofday(&tv_even, (struct timezone *)NULL);
6739 
6740 	while (1) {
6741 		if (for_all_proc_cpus(cpu_is_not_present)) {
6742 			re_initialize();
6743 			goto restart;
6744 		}
6745 		if (update_effective_str(false)) {
6746 			re_initialize();
6747 			goto restart;
6748 		}
6749 		do_sleep();
6750 		if (snapshot_proc_sysfs_files())
6751 			goto restart;
6752 		retval = for_all_cpus(get_counters, ODD_COUNTERS);
6753 		if (retval < -1) {
6754 			exit(retval);
6755 		} else if (retval == -1) {
6756 			re_initialize();
6757 			goto restart;
6758 		}
6759 		gettimeofday(&tv_odd, (struct timezone *)NULL);
6760 		timersub(&tv_odd, &tv_even, &tv_delta);
6761 		if (for_all_cpus_2(delta_cpu, ODD_COUNTERS, EVEN_COUNTERS)) {
6762 			re_initialize();
6763 			goto restart;
6764 		}
6765 		delta_platform(&platform_counters_odd, &platform_counters_even);
6766 		compute_average(EVEN_COUNTERS);
6767 		format_all_counters(EVEN_COUNTERS);
6768 		flush_output_stdout();
6769 		if (exit_requested)
6770 			break;
6771 		if (num_iterations && ++done_iters >= num_iterations)
6772 			break;
6773 		do_sleep();
6774 		if (snapshot_proc_sysfs_files())
6775 			goto restart;
6776 		retval = for_all_cpus(get_counters, EVEN_COUNTERS);
6777 		if (retval < -1) {
6778 			exit(retval);
6779 		} else if (retval == -1) {
6780 			re_initialize();
6781 			goto restart;
6782 		}
6783 		gettimeofday(&tv_even, (struct timezone *)NULL);
6784 		timersub(&tv_even, &tv_odd, &tv_delta);
6785 		if (for_all_cpus_2(delta_cpu, EVEN_COUNTERS, ODD_COUNTERS)) {
6786 			re_initialize();
6787 			goto restart;
6788 		}
6789 		delta_platform(&platform_counters_even, &platform_counters_odd);
6790 		compute_average(ODD_COUNTERS);
6791 		format_all_counters(ODD_COUNTERS);
6792 		flush_output_stdout();
6793 		if (exit_requested)
6794 			break;
6795 		if (num_iterations && ++done_iters >= num_iterations)
6796 			break;
6797 	}
6798 }
6799 
6800 void check_dev_msr()
6801 {
6802 	struct stat sb;
6803 	char pathname[32];
6804 
6805 	if (no_msr)
6806 		return;
6807 #if defined(ANDROID)
6808 	sprintf(pathname, "/dev/msr%d", base_cpu);
6809 #else
6810 	sprintf(pathname, "/dev/cpu/%d/msr", base_cpu);
6811 #endif
6812 	if (stat(pathname, &sb))
6813 		if (system("/sbin/modprobe msr > /dev/null 2>&1"))
6814 			no_msr = 1;
6815 }
6816 
6817 /*
6818  * check for CAP_SYS_RAWIO
6819  * return 0 on success
6820  * return 1 on fail
6821  */
6822 int check_for_cap_sys_rawio(void)
6823 {
6824 	cap_t caps;
6825 	cap_flag_value_t cap_flag_value;
6826 	int ret = 0;
6827 
6828 	caps = cap_get_proc();
6829 	if (caps == NULL) {
6830 		/*
6831 		 * CONFIG_MULTIUSER=n kernels have no cap_get_proc()
6832 		 * Allow them to continue and attempt to access MSRs
6833 		 */
6834 		if (errno == ENOSYS)
6835 			return 0;
6836 
6837 		return 1;
6838 	}
6839 
6840 	if (cap_get_flag(caps, CAP_SYS_RAWIO, CAP_EFFECTIVE, &cap_flag_value)) {
6841 		ret = 1;
6842 		goto free_and_exit;
6843 	}
6844 
6845 	if (cap_flag_value != CAP_SET) {
6846 		ret = 1;
6847 		goto free_and_exit;
6848 	}
6849 
6850 free_and_exit:
6851 	if (cap_free(caps) == -1)
6852 		err(-6, "cap_free\n");
6853 
6854 	return ret;
6855 }
6856 
6857 void check_msr_permission(void)
6858 {
6859 	int failed = 0;
6860 	char pathname[32];
6861 
6862 	if (no_msr)
6863 		return;
6864 
6865 	/* check for CAP_SYS_RAWIO */
6866 	failed += check_for_cap_sys_rawio();
6867 
6868 	/* test file permissions */
6869 #if defined(ANDROID)
6870 	sprintf(pathname, "/dev/msr%d", base_cpu);
6871 #else
6872 	sprintf(pathname, "/dev/cpu/%d/msr", base_cpu);
6873 #endif
6874 	if (euidaccess(pathname, R_OK)) {
6875 		failed++;
6876 	}
6877 
6878 	if (failed) {
6879 		warnx("Failed to access %s. Some of the counters may not be available\n"
6880 		      "\tRun as root to enable them or use %s to disable the access explicitly", pathname, "--no-msr");
6881 		no_msr = 1;
6882 	}
6883 }
6884 
6885 void probe_bclk(void)
6886 {
6887 	unsigned long long msr;
6888 	unsigned int base_ratio;
6889 
6890 	if (!platform->has_nhm_msrs || no_msr)
6891 		return;
6892 
6893 	if (platform->bclk_freq == BCLK_100MHZ)
6894 		bclk = 100.00;
6895 	else if (platform->bclk_freq == BCLK_133MHZ)
6896 		bclk = 133.33;
6897 	else if (platform->bclk_freq == BCLK_SLV)
6898 		bclk = slm_bclk();
6899 	else
6900 		return;
6901 
6902 	get_msr(base_cpu, MSR_PLATFORM_INFO, &msr);
6903 	base_ratio = (msr >> 8) & 0xFF;
6904 
6905 	base_hz = base_ratio * bclk * 1000000;
6906 	has_base_hz = 1;
6907 
6908 	if (platform->enable_tsc_tweak)
6909 		tsc_tweak = base_hz / tsc_hz;
6910 }
6911 
/*
 * remove_underbar()
 * Delete every '_' character from the NUL-terminated string s, in place.
 * The remaining characters keep their relative order.
 */
static void remove_underbar(char *s)
{
	char *src, *dst;

	for (src = dst = s; *src != '\0'; src++) {
		if (*src == '_')
			continue;
		*dst++ = *src;
	}

	*dst = '\0';
}
6924 
6925 static void dump_turbo_ratio_info(void)
6926 {
6927 	if (!has_turbo)
6928 		return;
6929 
6930 	if (!platform->has_nhm_msrs || no_msr)
6931 		return;
6932 
6933 	if (platform->trl_msrs & TRL_LIMIT2)
6934 		dump_turbo_ratio_limit2();
6935 
6936 	if (platform->trl_msrs & TRL_LIMIT1)
6937 		dump_turbo_ratio_limit1();
6938 
6939 	if (platform->trl_msrs & TRL_BASE) {
6940 		dump_turbo_ratio_limits(MSR_TURBO_RATIO_LIMIT);
6941 
6942 		if (is_hybrid)
6943 			dump_turbo_ratio_limits(MSR_SECONDARY_TURBO_RATIO_LIMIT);
6944 	}
6945 
6946 	if (platform->trl_msrs & TRL_ATOM)
6947 		dump_atom_turbo_ratio_limits();
6948 
6949 	if (platform->trl_msrs & TRL_KNL)
6950 		dump_knl_turbo_ratio_limits();
6951 
6952 	if (platform->has_config_tdp)
6953 		dump_config_tdp();
6954 }
6955 
6956 static int read_sysfs_int(char *path)
6957 {
6958 	FILE *input;
6959 	int retval = -1;
6960 
6961 	input = fopen(path, "r");
6962 	if (input == NULL) {
6963 		if (debug)
6964 			fprintf(outf, "NSFOD %s\n", path);
6965 		return (-1);
6966 	}
6967 	if (fscanf(input, "%d", &retval) != 1)
6968 		err(1, "%s: failed to read int from file", path);
6969 	fclose(input);
6970 
6971 	return (retval);
6972 }
6973 
6974 static void dump_sysfs_file(char *path)
6975 {
6976 	FILE *input;
6977 	char cpuidle_buf[64];
6978 
6979 	input = fopen(path, "r");
6980 	if (input == NULL) {
6981 		if (debug)
6982 			fprintf(outf, "NSFOD %s\n", path);
6983 		return;
6984 	}
6985 	if (!fgets(cpuidle_buf, sizeof(cpuidle_buf), input))
6986 		err(1, "%s: failed to read file", path);
6987 	fclose(input);
6988 
6989 	fprintf(outf, "%s: %s", strrchr(path, '/') + 1, cpuidle_buf);
6990 }
6991 
6992 static void probe_intel_uncore_frequency_legacy(void)
6993 {
6994 	int i, j;
6995 	char path[256];
6996 
6997 	for (i = 0; i < topo.num_packages; ++i) {
6998 		for (j = 0; j <= topo.max_die_id; ++j) {
6999 			int k, l;
7000 			char path_base[128];
7001 
7002 			sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d", i,
7003 				j);
7004 
7005 			sprintf(path, "%s/current_freq_khz", path_base);
7006 			if (access(path, R_OK))
7007 				continue;
7008 
7009 			BIC_PRESENT(BIC_UNCORE_MHZ);
7010 
7011 			if (quiet)
7012 				return;
7013 
7014 			sprintf(path, "%s/min_freq_khz", path_base);
7015 			k = read_sysfs_int(path);
7016 			sprintf(path, "%s/max_freq_khz", path_base);
7017 			l = read_sysfs_int(path);
7018 			fprintf(outf, "Uncore Frequency package%d die%d: %d - %d MHz ", i, j, k / 1000, l / 1000);
7019 
7020 			sprintf(path, "%s/initial_min_freq_khz", path_base);
7021 			k = read_sysfs_int(path);
7022 			sprintf(path, "%s/initial_max_freq_khz", path_base);
7023 			l = read_sysfs_int(path);
7024 			fprintf(outf, "(%d - %d MHz)", k / 1000, l / 1000);
7025 
7026 			sprintf(path, "%s/current_freq_khz", path_base);
7027 			k = read_sysfs_int(path);
7028 			fprintf(outf, " %d MHz\n", k / 1000);
7029 		}
7030 	}
7031 }
7032 
7033 static void probe_intel_uncore_frequency_cluster(void)
7034 {
7035 	int i, uncore_max_id;
7036 	char path[256];
7037 	char path_base[128];
7038 
7039 	if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00/current_freq_khz", R_OK))
7040 		return;
7041 
7042 	for (uncore_max_id = 0;; ++uncore_max_id) {
7043 
7044 		sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", uncore_max_id);
7045 
7046 		/* uncore## start at 00 and skips no numbers, so stop upon first missing */
7047 		if (access(path_base, R_OK)) {
7048 			uncore_max_id -= 1;
7049 			break;
7050 		}
7051 	}
7052 	for (i = uncore_max_id; i >= 0; --i) {
7053 		int k, l;
7054 		int package_id, domain_id, cluster_id;
7055 		char name_buf[16];
7056 
7057 		sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", i);
7058 
7059 		if (access(path_base, R_OK))
7060 			err(1, "%s: %s\n", __func__, path_base);
7061 
7062 		sprintf(path, "%s/package_id", path_base);
7063 		package_id = read_sysfs_int(path);
7064 
7065 		sprintf(path, "%s/domain_id", path_base);
7066 		domain_id = read_sysfs_int(path);
7067 
7068 		sprintf(path, "%s/fabric_cluster_id", path_base);
7069 		cluster_id = read_sysfs_int(path);
7070 
7071 		sprintf(path, "%s/current_freq_khz", path_base);
7072 		sprintf(name_buf, "UMHz%d.%d", domain_id, cluster_id);
7073 
7074 		/*
7075 		 * Once add_couter() is called, that counter is always read
7076 		 * and reported -- So it is effectively (enabled & present).
7077 		 * Only call add_counter() here if legacy BIC_UNCORE_MHZ (UncMHz)
7078 		 * is (enabled).  Since we are in this routine, we
7079 		 * know we will not probe and set (present) the legacy counter.
7080 		 *
7081 		 * This allows "--show/--hide UncMHz" to be effective for
7082 		 * the clustered MHz counters, as a group.
7083 		 */
7084 		if BIC_IS_ENABLED
7085 			(BIC_UNCORE_MHZ)
7086 			    add_counter(0, path, name_buf, 0, SCOPE_PACKAGE, COUNTER_K2M, FORMAT_AVERAGE, 0,
7087 					package_id);
7088 
7089 		if (quiet)
7090 			continue;
7091 
7092 		sprintf(path, "%s/min_freq_khz", path_base);
7093 		k = read_sysfs_int(path);
7094 		sprintf(path, "%s/max_freq_khz", path_base);
7095 		l = read_sysfs_int(path);
7096 		fprintf(outf, "Uncore Frequency package%d domain%d cluster%d: %d - %d MHz ", package_id, domain_id,
7097 			cluster_id, k / 1000, l / 1000);
7098 
7099 		sprintf(path, "%s/initial_min_freq_khz", path_base);
7100 		k = read_sysfs_int(path);
7101 		sprintf(path, "%s/initial_max_freq_khz", path_base);
7102 		l = read_sysfs_int(path);
7103 		fprintf(outf, "(%d - %d MHz)", k / 1000, l / 1000);
7104 
7105 		sprintf(path, "%s/current_freq_khz", path_base);
7106 		k = read_sysfs_int(path);
7107 		fprintf(outf, " %d MHz\n", k / 1000);
7108 	}
7109 }
7110 
7111 static void probe_intel_uncore_frequency(void)
7112 {
7113 	if (!genuine_intel)
7114 		return;
7115 
7116 	if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00", R_OK) == 0)
7117 		probe_intel_uncore_frequency_cluster();
7118 	else
7119 		probe_intel_uncore_frequency_legacy();
7120 }
7121 
7122 static void set_graphics_fp(char *path, int idx)
7123 {
7124 	if (!access(path, R_OK))
7125 		gfx_info[idx].fp = fopen_or_die(path, "r");
7126 }
7127 
7128 /* Enlarge this if there are /sys/class/drm/card2 ... */
7129 #define GFX_MAX_CARDS	2
7130 
7131 static void probe_graphics(void)
7132 {
7133 	char path[PATH_MAX];
7134 	int i;
7135 
7136 	/* Xe graphics sysfs knobs */
7137 	if (!access("/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms", R_OK)) {
7138 		FILE *fp;
7139 		char buf[8];
7140 		bool gt0_is_gt;
7141 
7142 		fp = fopen("/sys/class/drm/card0/device/tile0/gt0/gtidle/name", "r");
7143 		if (!fp)
7144 			goto next;
7145 
7146 		if (!fread(buf, sizeof(char), 7, fp)) {
7147 			fclose(fp);
7148 			goto next;
7149 		}
7150 		fclose(fp);
7151 
7152 		if (!strncmp(buf, "gt0-rc", strlen("gt0-rc")))
7153 			gt0_is_gt = true;
7154 		else if (!strncmp(buf, "gt0-mc", strlen("gt0-mc")))
7155 			gt0_is_gt = false;
7156 		else
7157 			goto next;
7158 
7159 		set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms",
7160 				gt0_is_gt ? GFX_rc6 : SAM_mc6);
7161 
7162 		set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq", gt0_is_gt ? GFX_MHz : SAM_MHz);
7163 
7164 		set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq",
7165 				gt0_is_gt ? GFX_ACTMHz : SAM_ACTMHz);
7166 
7167 		set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms",
7168 				gt0_is_gt ? SAM_mc6 : GFX_rc6);
7169 
7170 		set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq", gt0_is_gt ? SAM_MHz : GFX_MHz);
7171 
7172 		set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq",
7173 				gt0_is_gt ? SAM_ACTMHz : GFX_ACTMHz);
7174 
7175 		goto end;
7176 	}
7177 
7178 next:
7179 	/* New i915 graphics sysfs knobs */
7180 	for (i = 0; i < GFX_MAX_CARDS; i++) {
7181 		snprintf(path, PATH_MAX, "/sys/class/drm/card%d/gt/gt0/rc6_residency_ms", i);
7182 		if (!access(path, R_OK))
7183 			break;
7184 	}
7185 
7186 	if (i == GFX_MAX_CARDS)
7187 		goto legacy_i915;
7188 
7189 	snprintf(path, PATH_MAX, "/sys/class/drm/card%d/gt/gt0/rc6_residency_ms", i);
7190 	set_graphics_fp(path, GFX_rc6);
7191 
7192 	snprintf(path, PATH_MAX, "/sys/class/drm/card%d/gt/gt0/rps_cur_freq_mhz", i);
7193 	set_graphics_fp(path, GFX_MHz);
7194 
7195 	snprintf(path, PATH_MAX, "/sys/class/drm/card%d/gt/gt0/rps_act_freq_mhz", i);
7196 	set_graphics_fp(path, GFX_ACTMHz);
7197 
7198 	snprintf(path, PATH_MAX, "/sys/class/drm/card%d/gt/gt1/rc6_residency_ms", i);
7199 	set_graphics_fp(path, SAM_mc6);
7200 
7201 	snprintf(path, PATH_MAX, "/sys/class/drm/card%d/gt/gt1/rps_cur_freq_mhz", i);
7202 	set_graphics_fp(path, SAM_MHz);
7203 
7204 	snprintf(path, PATH_MAX, "/sys/class/drm/card%d/gt/gt1/rps_act_freq_mhz", i);
7205 	set_graphics_fp(path, SAM_ACTMHz);
7206 
7207 	goto end;
7208 
7209 legacy_i915:
7210 	/* Fall back to traditional i915 graphics sysfs knobs */
7211 	set_graphics_fp("/sys/class/drm/card0/power/rc6_residency_ms", GFX_rc6);
7212 
7213 	set_graphics_fp("/sys/class/drm/card0/gt_cur_freq_mhz", GFX_MHz);
7214 	if (!gfx_info[GFX_MHz].fp)
7215 		set_graphics_fp("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", GFX_MHz);
7216 
7217 	set_graphics_fp("/sys/class/drm/card0/gt_act_freq_mhz", GFX_ACTMHz);
7218 	if (!gfx_info[GFX_ACTMHz].fp)
7219 		set_graphics_fp("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", GFX_ACTMHz);
7220 
7221 end:
7222 	if (gfx_info[GFX_rc6].fp)
7223 		BIC_PRESENT(BIC_GFX_rc6);
7224 	if (gfx_info[GFX_MHz].fp)
7225 		BIC_PRESENT(BIC_GFXMHz);
7226 	if (gfx_info[GFX_ACTMHz].fp)
7227 		BIC_PRESENT(BIC_GFXACTMHz);
7228 	if (gfx_info[SAM_mc6].fp)
7229 		BIC_PRESENT(BIC_SAM_mc6);
7230 	if (gfx_info[SAM_MHz].fp)
7231 		BIC_PRESENT(BIC_SAMMHz);
7232 	if (gfx_info[SAM_ACTMHz].fp)
7233 		BIC_PRESENT(BIC_SAMACTMHz);
7234 }
7235 
7236 static void dump_sysfs_cstate_config(void)
7237 {
7238 	char path[64];
7239 	char name_buf[16];
7240 	char desc[64];
7241 	FILE *input;
7242 	int state;
7243 	char *sp;
7244 
7245 	if (access("/sys/devices/system/cpu/cpuidle", R_OK)) {
7246 		fprintf(outf, "cpuidle not loaded\n");
7247 		return;
7248 	}
7249 
7250 	dump_sysfs_file("/sys/devices/system/cpu/cpuidle/current_driver");
7251 	dump_sysfs_file("/sys/devices/system/cpu/cpuidle/current_governor");
7252 	dump_sysfs_file("/sys/devices/system/cpu/cpuidle/current_governor_ro");
7253 
7254 	for (state = 0; state < 10; ++state) {
7255 
7256 		sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state);
7257 		input = fopen(path, "r");
7258 		if (input == NULL)
7259 			continue;
7260 		if (!fgets(name_buf, sizeof(name_buf), input))
7261 			err(1, "%s: failed to read file", path);
7262 
7263 		/* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */
7264 		sp = strchr(name_buf, '-');
7265 		if (!sp)
7266 			sp = strchrnul(name_buf, '\n');
7267 		*sp = '\0';
7268 		fclose(input);
7269 
7270 		remove_underbar(name_buf);
7271 
7272 		sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/desc", base_cpu, state);
7273 		input = fopen(path, "r");
7274 		if (input == NULL)
7275 			continue;
7276 		if (!fgets(desc, sizeof(desc), input))
7277 			err(1, "%s: failed to read file", path);
7278 
7279 		fprintf(outf, "cpu%d: %s: %s", base_cpu, name_buf, desc);
7280 		fclose(input);
7281 	}
7282 }
7283 
7284 static void dump_sysfs_pstate_config(void)
7285 {
7286 	char path[64];
7287 	char driver_buf[64];
7288 	char governor_buf[64];
7289 	FILE *input;
7290 	int turbo;
7291 
7292 	sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_driver", base_cpu);
7293 	input = fopen(path, "r");
7294 	if (input == NULL) {
7295 		fprintf(outf, "NSFOD %s\n", path);
7296 		return;
7297 	}
7298 	if (!fgets(driver_buf, sizeof(driver_buf), input))
7299 		err(1, "%s: failed to read file", path);
7300 	fclose(input);
7301 
7302 	sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", base_cpu);
7303 	input = fopen(path, "r");
7304 	if (input == NULL) {
7305 		fprintf(outf, "NSFOD %s\n", path);
7306 		return;
7307 	}
7308 	if (!fgets(governor_buf, sizeof(governor_buf), input))
7309 		err(1, "%s: failed to read file", path);
7310 	fclose(input);
7311 
7312 	fprintf(outf, "cpu%d: cpufreq driver: %s", base_cpu, driver_buf);
7313 	fprintf(outf, "cpu%d: cpufreq governor: %s", base_cpu, governor_buf);
7314 
7315 	sprintf(path, "/sys/devices/system/cpu/cpufreq/boost");
7316 	input = fopen(path, "r");
7317 	if (input != NULL) {
7318 		if (fscanf(input, "%d", &turbo) != 1)
7319 			err(1, "%s: failed to parse number from file", path);
7320 		fprintf(outf, "cpufreq boost: %d\n", turbo);
7321 		fclose(input);
7322 	}
7323 
7324 	sprintf(path, "/sys/devices/system/cpu/intel_pstate/no_turbo");
7325 	input = fopen(path, "r");
7326 	if (input != NULL) {
7327 		if (fscanf(input, "%d", &turbo) != 1)
7328 			err(1, "%s: failed to parse number from file", path);
7329 		fprintf(outf, "cpufreq intel_pstate no_turbo: %d\n", turbo);
7330 		fclose(input);
7331 	}
7332 }
7333 
7334 /*
7335  * print_epb()
7336  * Decode the ENERGY_PERF_BIAS MSR
7337  */
7338 int print_epb(PER_THREAD_PARAMS)
7339 {
7340 	char *epb_string;
7341 	int cpu, epb;
7342 
7343 	UNUSED(c);
7344 	UNUSED(p);
7345 
7346 	if (!has_epb)
7347 		return 0;
7348 
7349 	cpu = t->cpu_id;
7350 
7351 	/* EPB is per-package */
7352 	if (!is_cpu_first_thread_in_package(t, c, p))
7353 		return 0;
7354 
7355 	if (cpu_migrate(cpu)) {
7356 		fprintf(outf, "print_epb: Could not migrate to CPU %d\n", cpu);
7357 		return -1;
7358 	}
7359 
7360 	epb = get_epb(cpu);
7361 	if (epb < 0)
7362 		return 0;
7363 
7364 	switch (epb) {
7365 	case ENERGY_PERF_BIAS_PERFORMANCE:
7366 		epb_string = "performance";
7367 		break;
7368 	case ENERGY_PERF_BIAS_NORMAL:
7369 		epb_string = "balanced";
7370 		break;
7371 	case ENERGY_PERF_BIAS_POWERSAVE:
7372 		epb_string = "powersave";
7373 		break;
7374 	default:
7375 		epb_string = "custom";
7376 		break;
7377 	}
7378 	fprintf(outf, "cpu%d: EPB: %d (%s)\n", cpu, epb, epb_string);
7379 
7380 	return 0;
7381 }
7382 
7383 /*
7384  * print_hwp()
7385  * Decode the MSR_HWP_CAPABILITIES
7386  */
7387 int print_hwp(PER_THREAD_PARAMS)
7388 {
7389 	unsigned long long msr;
7390 	int cpu;
7391 
7392 	UNUSED(c);
7393 	UNUSED(p);
7394 
7395 	if (no_msr)
7396 		return 0;
7397 
7398 	if (!has_hwp)
7399 		return 0;
7400 
7401 	cpu = t->cpu_id;
7402 
7403 	/* MSR_HWP_CAPABILITIES is per-package */
7404 	if (!is_cpu_first_thread_in_package(t, c, p))
7405 		return 0;
7406 
7407 	if (cpu_migrate(cpu)) {
7408 		fprintf(outf, "print_hwp: Could not migrate to CPU %d\n", cpu);
7409 		return -1;
7410 	}
7411 
7412 	if (get_msr(cpu, MSR_PM_ENABLE, &msr))
7413 		return 0;
7414 
7415 	fprintf(outf, "cpu%d: MSR_PM_ENABLE: 0x%08llx (%sHWP)\n", cpu, msr, (msr & (1 << 0)) ? "" : "No-");
7416 
7417 	/* MSR_PM_ENABLE[1] == 1 if HWP is enabled and MSRs visible */
7418 	if ((msr & (1 << 0)) == 0)
7419 		return 0;
7420 
7421 	if (get_msr(cpu, MSR_HWP_CAPABILITIES, &msr))
7422 		return 0;
7423 
7424 	fprintf(outf, "cpu%d: MSR_HWP_CAPABILITIES: 0x%08llx "
7425 		"(high %d guar %d eff %d low %d)\n",
7426 		cpu, msr,
7427 		(unsigned int)HWP_HIGHEST_PERF(msr),
7428 		(unsigned int)HWP_GUARANTEED_PERF(msr),
7429 		(unsigned int)HWP_MOSTEFFICIENT_PERF(msr), (unsigned int)HWP_LOWEST_PERF(msr));
7430 
7431 	if (get_msr(cpu, MSR_HWP_REQUEST, &msr))
7432 		return 0;
7433 
7434 	fprintf(outf, "cpu%d: MSR_HWP_REQUEST: 0x%08llx "
7435 		"(min %d max %d des %d epp 0x%x window 0x%x pkg 0x%x)\n",
7436 		cpu, msr,
7437 		(unsigned int)(((msr) >> 0) & 0xff),
7438 		(unsigned int)(((msr) >> 8) & 0xff),
7439 		(unsigned int)(((msr) >> 16) & 0xff),
7440 		(unsigned int)(((msr) >> 24) & 0xff),
7441 		(unsigned int)(((msr) >> 32) & 0xff3), (unsigned int)(((msr) >> 42) & 0x1));
7442 
7443 	if (has_hwp_pkg) {
7444 		if (get_msr(cpu, MSR_HWP_REQUEST_PKG, &msr))
7445 			return 0;
7446 
7447 		fprintf(outf, "cpu%d: MSR_HWP_REQUEST_PKG: 0x%08llx "
7448 			"(min %d max %d des %d epp 0x%x window 0x%x)\n",
7449 			cpu, msr,
7450 			(unsigned int)(((msr) >> 0) & 0xff),
7451 			(unsigned int)(((msr) >> 8) & 0xff),
7452 			(unsigned int)(((msr) >> 16) & 0xff),
7453 			(unsigned int)(((msr) >> 24) & 0xff), (unsigned int)(((msr) >> 32) & 0xff3));
7454 	}
7455 	if (has_hwp_notify) {
7456 		if (get_msr(cpu, MSR_HWP_INTERRUPT, &msr))
7457 			return 0;
7458 
7459 		fprintf(outf, "cpu%d: MSR_HWP_INTERRUPT: 0x%08llx "
7460 			"(%s_Guaranteed_Perf_Change, %s_Excursion_Min)\n",
7461 			cpu, msr, ((msr) & 0x1) ? "EN" : "Dis", ((msr) & 0x2) ? "EN" : "Dis");
7462 	}
7463 	if (get_msr(cpu, MSR_HWP_STATUS, &msr))
7464 		return 0;
7465 
7466 	fprintf(outf, "cpu%d: MSR_HWP_STATUS: 0x%08llx "
7467 		"(%sGuaranteed_Perf_Change, %sExcursion_Min)\n",
7468 		cpu, msr, ((msr) & 0x1) ? "" : "No-", ((msr) & 0x4) ? "" : "No-");
7469 
7470 	return 0;
7471 }
7472 
7473 /*
7474  * print_perf_limit()
7475  */
7476 int print_perf_limit(PER_THREAD_PARAMS)
7477 {
7478 	unsigned long long msr;
7479 	int cpu;
7480 
7481 	UNUSED(c);
7482 	UNUSED(p);
7483 
7484 	if (no_msr)
7485 		return 0;
7486 
7487 	cpu = t->cpu_id;
7488 
7489 	/* per-package */
7490 	if (!is_cpu_first_thread_in_package(t, c, p))
7491 		return 0;
7492 
7493 	if (cpu_migrate(cpu)) {
7494 		fprintf(outf, "print_perf_limit: Could not migrate to CPU %d\n", cpu);
7495 		return -1;
7496 	}
7497 
7498 	if (platform->plr_msrs & PLR_CORE) {
7499 		get_msr(cpu, MSR_CORE_PERF_LIMIT_REASONS, &msr);
7500 		fprintf(outf, "cpu%d: MSR_CORE_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr);
7501 		fprintf(outf, " (Active: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)",
7502 			(msr & 1 << 15) ? "bit15, " : "",
7503 			(msr & 1 << 14) ? "bit14, " : "",
7504 			(msr & 1 << 13) ? "Transitions, " : "",
7505 			(msr & 1 << 12) ? "MultiCoreTurbo, " : "",
7506 			(msr & 1 << 11) ? "PkgPwrL2, " : "",
7507 			(msr & 1 << 10) ? "PkgPwrL1, " : "",
7508 			(msr & 1 << 9) ? "CorePwr, " : "",
7509 			(msr & 1 << 8) ? "Amps, " : "",
7510 			(msr & 1 << 6) ? "VR-Therm, " : "",
7511 			(msr & 1 << 5) ? "Auto-HWP, " : "",
7512 			(msr & 1 << 4) ? "Graphics, " : "",
7513 			(msr & 1 << 2) ? "bit2, " : "",
7514 			(msr & 1 << 1) ? "ThermStatus, " : "", (msr & 1 << 0) ? "PROCHOT, " : "");
7515 		fprintf(outf, " (Logged: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)\n",
7516 			(msr & 1 << 31) ? "bit31, " : "",
7517 			(msr & 1 << 30) ? "bit30, " : "",
7518 			(msr & 1 << 29) ? "Transitions, " : "",
7519 			(msr & 1 << 28) ? "MultiCoreTurbo, " : "",
7520 			(msr & 1 << 27) ? "PkgPwrL2, " : "",
7521 			(msr & 1 << 26) ? "PkgPwrL1, " : "",
7522 			(msr & 1 << 25) ? "CorePwr, " : "",
7523 			(msr & 1 << 24) ? "Amps, " : "",
7524 			(msr & 1 << 22) ? "VR-Therm, " : "",
7525 			(msr & 1 << 21) ? "Auto-HWP, " : "",
7526 			(msr & 1 << 20) ? "Graphics, " : "",
7527 			(msr & 1 << 18) ? "bit18, " : "",
7528 			(msr & 1 << 17) ? "ThermStatus, " : "", (msr & 1 << 16) ? "PROCHOT, " : "");
7529 
7530 	}
7531 	if (platform->plr_msrs & PLR_GFX) {
7532 		get_msr(cpu, MSR_GFX_PERF_LIMIT_REASONS, &msr);
7533 		fprintf(outf, "cpu%d: MSR_GFX_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr);
7534 		fprintf(outf, " (Active: %s%s%s%s%s%s%s%s)",
7535 			(msr & 1 << 0) ? "PROCHOT, " : "",
7536 			(msr & 1 << 1) ? "ThermStatus, " : "",
7537 			(msr & 1 << 4) ? "Graphics, " : "",
7538 			(msr & 1 << 6) ? "VR-Therm, " : "",
7539 			(msr & 1 << 8) ? "Amps, " : "",
7540 			(msr & 1 << 9) ? "GFXPwr, " : "",
7541 			(msr & 1 << 10) ? "PkgPwrL1, " : "", (msr & 1 << 11) ? "PkgPwrL2, " : "");
7542 		fprintf(outf, " (Logged: %s%s%s%s%s%s%s%s)\n",
7543 			(msr & 1 << 16) ? "PROCHOT, " : "",
7544 			(msr & 1 << 17) ? "ThermStatus, " : "",
7545 			(msr & 1 << 20) ? "Graphics, " : "",
7546 			(msr & 1 << 22) ? "VR-Therm, " : "",
7547 			(msr & 1 << 24) ? "Amps, " : "",
7548 			(msr & 1 << 25) ? "GFXPwr, " : "",
7549 			(msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? "PkgPwrL2, " : "");
7550 	}
7551 	if (platform->plr_msrs & PLR_RING) {
7552 		get_msr(cpu, MSR_RING_PERF_LIMIT_REASONS, &msr);
7553 		fprintf(outf, "cpu%d: MSR_RING_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr);
7554 		fprintf(outf, " (Active: %s%s%s%s%s%s)",
7555 			(msr & 1 << 0) ? "PROCHOT, " : "",
7556 			(msr & 1 << 1) ? "ThermStatus, " : "",
7557 			(msr & 1 << 6) ? "VR-Therm, " : "",
7558 			(msr & 1 << 8) ? "Amps, " : "",
7559 			(msr & 1 << 10) ? "PkgPwrL1, " : "", (msr & 1 << 11) ? "PkgPwrL2, " : "");
7560 		fprintf(outf, " (Logged: %s%s%s%s%s%s)\n",
7561 			(msr & 1 << 16) ? "PROCHOT, " : "",
7562 			(msr & 1 << 17) ? "ThermStatus, " : "",
7563 			(msr & 1 << 22) ? "VR-Therm, " : "",
7564 			(msr & 1 << 24) ? "Amps, " : "",
7565 			(msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? "PkgPwrL2, " : "");
7566 	}
7567 	return 0;
7568 }
7569 
7570 #define	RAPL_POWER_GRANULARITY	0x7FFF	/* 15 bit power granularity */
7571 #define	RAPL_TIME_GRANULARITY	0x3F	/* 6 bit time granularity */
7572 
7573 double get_quirk_tdp(void)
7574 {
7575 	if (platform->rapl_quirk_tdp)
7576 		return platform->rapl_quirk_tdp;
7577 
7578 	return 135.0;
7579 }
7580 
7581 double get_tdp_intel(void)
7582 {
7583 	unsigned long long msr;
7584 
7585 	if (platform->rapl_msrs & RAPL_PKG_POWER_INFO)
7586 		if (!get_msr(base_cpu, MSR_PKG_POWER_INFO, &msr))
7587 			return ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units;
7588 	return get_quirk_tdp();
7589 }
7590 
/*
 * get_tdp_amd()
 * Return the TDP used for AMD RAPL range estimates; this is always
 * the value supplied by get_quirk_tdp().
 */
double get_tdp_amd(void)
{
	double tdp = get_quirk_tdp();

	return tdp;
}
7595 
7596 void rapl_probe_intel(void)
7597 {
7598 	unsigned long long msr;
7599 	unsigned int time_unit;
7600 	double tdp;
7601 
7602 	if (rapl_joules) {
7603 		CLR_BIC(BIC_SysWatt, &bic_enabled);
7604 		CLR_BIC(BIC_PkgWatt, &bic_enabled);
7605 		CLR_BIC(BIC_CorWatt, &bic_enabled);
7606 		CLR_BIC(BIC_RAMWatt, &bic_enabled);
7607 		CLR_BIC(BIC_GFXWatt, &bic_enabled);
7608 	} else {
7609 		CLR_BIC(BIC_Sys_J, &bic_enabled);
7610 		CLR_BIC(BIC_Pkg_J, &bic_enabled);
7611 		CLR_BIC(BIC_Cor_J, &bic_enabled);
7612 		CLR_BIC(BIC_RAM_J, &bic_enabled);
7613 		CLR_BIC(BIC_GFX_J, &bic_enabled);
7614 	}
7615 
7616 	if (!platform->rapl_msrs || no_msr)
7617 		return;
7618 
7619 	if (!(platform->rapl_msrs & RAPL_PKG_PERF_STATUS))
7620 		CLR_BIC(BIC_PKG__, &bic_enabled);
7621 	if (!(platform->rapl_msrs & RAPL_DRAM_PERF_STATUS))
7622 		CLR_BIC(BIC_RAM__, &bic_enabled);
7623 
7624 	/* units on package 0, verify later other packages match */
7625 	if (get_msr(base_cpu, MSR_RAPL_POWER_UNIT, &msr))
7626 		return;
7627 
7628 	rapl_power_units = 1.0 / (1 << (msr & 0xF));
7629 	if (platform->has_rapl_divisor)
7630 		rapl_energy_units = 1.0 * (1 << (msr >> 8 & 0x1F)) / 1000000;
7631 	else
7632 		rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F));
7633 
7634 	if (platform->has_fixed_rapl_unit)
7635 		rapl_dram_energy_units = (15.3 / 1000000);
7636 	else
7637 		rapl_dram_energy_units = rapl_energy_units;
7638 
7639 	if (platform->has_fixed_rapl_psys_unit)
7640 		rapl_psys_energy_units = 1.0;
7641 	else
7642 		rapl_psys_energy_units = rapl_energy_units;
7643 
7644 	time_unit = msr >> 16 & 0xF;
7645 	if (time_unit == 0)
7646 		time_unit = 0xA;
7647 
7648 	rapl_time_units = 1.0 / (1 << (time_unit));
7649 
7650 	tdp = get_tdp_intel();
7651 
7652 	rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp;
7653 	if (!quiet)
7654 		fprintf(outf, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp);
7655 }
7656 
7657 void rapl_probe_amd(void)
7658 {
7659 	unsigned long long msr;
7660 	double tdp;
7661 
7662 	if (rapl_joules) {
7663 		CLR_BIC(BIC_SysWatt, &bic_enabled);
7664 		CLR_BIC(BIC_CorWatt, &bic_enabled);
7665 	} else {
7666 		CLR_BIC(BIC_Pkg_J, &bic_enabled);
7667 		CLR_BIC(BIC_Cor_J, &bic_enabled);
7668 	}
7669 
7670 	if (!platform->rapl_msrs || no_msr)
7671 		return;
7672 
7673 	if (get_msr(base_cpu, MSR_RAPL_PWR_UNIT, &msr))
7674 		return;
7675 
7676 	rapl_time_units = ldexp(1.0, -(msr >> 16 & 0xf));
7677 	rapl_energy_units = ldexp(1.0, -(msr >> 8 & 0x1f));
7678 	rapl_power_units = ldexp(1.0, -(msr & 0xf));
7679 
7680 	tdp = get_tdp_amd();
7681 
7682 	rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp;
7683 	if (!quiet)
7684 		fprintf(outf, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp);
7685 }
7686 
7687 void print_power_limit_msr(int cpu, unsigned long long msr, char *label)
7688 {
7689 	fprintf(outf, "cpu%d: %s: %sabled (%0.3f Watts, %f sec, clamp %sabled)\n",
7690 		cpu, label,
7691 		((msr >> 15) & 1) ? "EN" : "DIS",
7692 		((msr >> 0) & 0x7FFF) * rapl_power_units,
7693 		(1.0 + (((msr >> 22) & 0x3) / 4.0)) * (1 << ((msr >> 17) & 0x1F)) * rapl_time_units,
7694 		(((msr >> 16) & 1) ? "EN" : "DIS"));
7695 
7696 	return;
7697 }
7698 
/*
 * fread_int()
 * Parse one decimal int from the file at path into *val.
 *
 * Returns -1 if the file cannot be opened, otherwise the fscanf() result
 * (1 when a value was stored).
 */
static int fread_int(char *path, int *val)
{
	int matched;
	FILE *fp = fopen(path, "r");

	if (fp == NULL)
		return -1;

	matched = fscanf(fp, "%d", val);
	fclose(fp);

	return matched;
}
7712 
/*
 * fread_ull()
 * Parse one decimal unsigned long long from the file at path into *val.
 *
 * Returns -1 if the file cannot be opened, otherwise the fscanf() result
 * (1 when a value was stored).
 */
static int fread_ull(char *path, unsigned long long *val)
{
	int matched;
	FILE *fp = fopen(path, "r");

	if (fp == NULL)
		return -1;

	matched = fscanf(fp, "%llu", val);
	fclose(fp);

	return matched;
}
7726 
/*
 * fread_str()
 * Read the start of the file at path into buf as a C string.
 *
 * At most size - 1 bytes are read and the result is always
 * NUL-terminated; the string is cut at the first newline, if any.
 *
 * Returns the number of bytes read from the file (0 for an empty file),
 * or -1 if size is not positive or the file cannot be opened.
 */
static int fread_str(char *path, char *buf, int size)
{
	FILE *filep;
	size_t nread;
	char *cp;

	if (size <= 0)
		return -1;

	filep = fopen(path, "r");
	if (!filep)
		return -1;

	/* fread() does not terminate the data: keep one byte for the NUL */
	nread = fread(buf, 1, (size_t)size - 1, filep);
	fclose(filep);
	buf[nread] = '\0';

	/* replace '\n' with '\0' */
	cp = strchr(buf, '\n');
	if (cp != NULL)
		*cp = '\0';

	return (int)nread;
}
7747 
7748 #define PATH_RAPL_SYSFS	"/sys/class/powercap"
7749 
7750 static int dump_one_domain(char *domain_path)
7751 {
7752 	char path[PATH_MAX];
7753 	char str[PATH_MAX];
7754 	unsigned long long val;
7755 	int constraint;
7756 	int enable;
7757 	int ret;
7758 
7759 	snprintf(path, PATH_MAX, "%s/name", domain_path);
7760 	ret = fread_str(path, str, PATH_MAX);
7761 	if (ret <= 0)
7762 		return -1;
7763 
7764 	fprintf(outf, "%s: %s", domain_path + strlen(PATH_RAPL_SYSFS) + 1, str);
7765 
7766 	snprintf(path, PATH_MAX, "%s/enabled", domain_path);
7767 	ret = fread_int(path, &enable);
7768 	if (ret <= 0)
7769 		return -1;
7770 
7771 	if (!enable) {
7772 		fputs(" disabled\n", outf);
7773 		return 0;
7774 	}
7775 
7776 	for (constraint = 0;; constraint++) {
7777 		snprintf(path, PATH_MAX, "%s/constraint_%d_time_window_us", domain_path, constraint);
7778 		ret = fread_ull(path, &val);
7779 		if (ret <= 0)
7780 			break;
7781 
7782 		if (val > 1000000)
7783 			fprintf(outf, " %0.1fs", (double)val / 1000000);
7784 		else if (val > 1000)
7785 			fprintf(outf, " %0.1fms", (double)val / 1000);
7786 		else
7787 			fprintf(outf, " %0.1fus", (double)val);
7788 
7789 		snprintf(path, PATH_MAX, "%s/constraint_%d_power_limit_uw", domain_path, constraint);
7790 		ret = fread_ull(path, &val);
7791 		if (ret > 0 && val)
7792 			fprintf(outf, ":%lluW", val / 1000000);
7793 
7794 		snprintf(path, PATH_MAX, "%s/constraint_%d_max_power_uw", domain_path, constraint);
7795 		ret = fread_ull(path, &val);
7796 		if (ret > 0 && val)
7797 			fprintf(outf, ",max:%lluW", val / 1000000);
7798 	}
7799 	fputc('\n', outf);
7800 
7801 	return 0;
7802 }
7803 
7804 static int print_rapl_sysfs(void)
7805 {
7806 	DIR *dir, *cdir;
7807 	struct dirent *entry, *centry;
7808 	char path[PATH_MAX];
7809 	char str[PATH_MAX];
7810 
7811 	if ((dir = opendir(PATH_RAPL_SYSFS)) == NULL) {
7812 		warn("open %s failed", PATH_RAPL_SYSFS);
7813 		return 1;
7814 	}
7815 
7816 	while ((entry = readdir(dir)) != NULL) {
7817 		if (strlen(entry->d_name) > 100)
7818 			continue;
7819 
7820 		if (strncmp(entry->d_name, "intel-rapl", strlen("intel-rapl")))
7821 			continue;
7822 
7823 		snprintf(path, PATH_MAX, "%s/%s/name", PATH_RAPL_SYSFS, entry->d_name);
7824 
7825 		/* Parse top level domains first, including package and psys */
7826 		fread_str(path, str, PATH_MAX);
7827 		if (strncmp(str, "package", strlen("package")) && strncmp(str, "psys", strlen("psys")))
7828 			continue;
7829 
7830 		snprintf(path, PATH_MAX, "%s/%s", PATH_RAPL_SYSFS, entry->d_name);
7831 		if ((cdir = opendir(path)) == NULL) {
7832 			perror("opendir() error");
7833 			return 1;
7834 		}
7835 
7836 		dump_one_domain(path);
7837 
7838 		while ((centry = readdir(cdir)) != NULL) {
7839 			if (strncmp(centry->d_name, "intel-rapl", strlen("intel-rapl")))
7840 				continue;
7841 			snprintf(path, PATH_MAX, "%s/%s/%s", PATH_RAPL_SYSFS, entry->d_name, centry->d_name);
7842 			dump_one_domain(path);
7843 		}
7844 		closedir(cdir);
7845 	}
7846 
7847 	closedir(dir);
7848 	return 0;
7849 }
7850 
7851 int print_rapl(PER_THREAD_PARAMS)
7852 {
7853 	unsigned long long msr;
7854 	const char *msr_name;
7855 	int cpu;
7856 
7857 	UNUSED(c);
7858 	UNUSED(p);
7859 
7860 	if (!platform->rapl_msrs)
7861 		return 0;
7862 
7863 	/* RAPL counters are per package, so print only for 1st thread/package */
7864 	if (!is_cpu_first_thread_in_package(t, c, p))
7865 		return 0;
7866 
7867 	cpu = t->cpu_id;
7868 	if (cpu_migrate(cpu)) {
7869 		fprintf(outf, "print_rapl: Could not migrate to CPU %d\n", cpu);
7870 		return -1;
7871 	}
7872 
7873 	if (platform->rapl_msrs & RAPL_AMD_F17H) {
7874 		msr_name = "MSR_RAPL_PWR_UNIT";
7875 		if (get_msr(cpu, MSR_RAPL_PWR_UNIT, &msr))
7876 			return -1;
7877 	} else {
7878 		msr_name = "MSR_RAPL_POWER_UNIT";
7879 		if (get_msr(cpu, MSR_RAPL_POWER_UNIT, &msr))
7880 			return -1;
7881 	}
7882 
7883 	fprintf(outf, "cpu%d: %s: 0x%08llx (%f Watts, %f Joules, %f sec.)\n", cpu, msr_name, msr,
7884 		rapl_power_units, rapl_energy_units, rapl_time_units);
7885 
7886 	if (platform->rapl_msrs & RAPL_PKG_POWER_INFO) {
7887 
7888 		if (get_msr(cpu, MSR_PKG_POWER_INFO, &msr))
7889 			return -5;
7890 
7891 		fprintf(outf, "cpu%d: MSR_PKG_POWER_INFO: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n",
7892 			cpu, msr,
7893 			((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units,
7894 			((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units,
7895 			((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units,
7896 			((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units);
7897 
7898 	}
7899 	if (platform->rapl_msrs & RAPL_PKG) {
7900 
7901 		if (get_msr(cpu, MSR_PKG_POWER_LIMIT, &msr))
7902 			return -9;
7903 
7904 		fprintf(outf, "cpu%d: MSR_PKG_POWER_LIMIT: 0x%08llx (%slocked)\n",
7905 			cpu, msr, (msr >> 63) & 1 ? "" : "UN");
7906 
7907 		print_power_limit_msr(cpu, msr, "PKG Limit #1");
7908 		fprintf(outf, "cpu%d: PKG Limit #2: %sabled (%0.3f Watts, %f* sec, clamp %sabled)\n",
7909 			cpu,
7910 			((msr >> 47) & 1) ? "EN" : "DIS",
7911 			((msr >> 32) & 0x7FFF) * rapl_power_units,
7912 			(1.0 + (((msr >> 54) & 0x3) / 4.0)) * (1 << ((msr >> 49) & 0x1F)) * rapl_time_units,
7913 			((msr >> 48) & 1) ? "EN" : "DIS");
7914 
7915 		if (get_msr(cpu, MSR_VR_CURRENT_CONFIG, &msr))
7916 			return -9;
7917 
7918 		fprintf(outf, "cpu%d: MSR_VR_CURRENT_CONFIG: 0x%08llx\n", cpu, msr);
7919 		fprintf(outf, "cpu%d: PKG Limit #4: %f Watts (%slocked)\n",
7920 			cpu, ((msr >> 0) & 0x1FFF) * rapl_power_units, (msr >> 31) & 1 ? "" : "UN");
7921 	}
7922 
7923 	if (platform->rapl_msrs & RAPL_DRAM_POWER_INFO) {
7924 		if (get_msr(cpu, MSR_DRAM_POWER_INFO, &msr))
7925 			return -6;
7926 
7927 		fprintf(outf, "cpu%d: MSR_DRAM_POWER_INFO,: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n",
7928 			cpu, msr,
7929 			((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units,
7930 			((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units,
7931 			((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units,
7932 			((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units);
7933 	}
7934 	if (platform->rapl_msrs & RAPL_DRAM) {
7935 		if (get_msr(cpu, MSR_DRAM_POWER_LIMIT, &msr))
7936 			return -9;
7937 		fprintf(outf, "cpu%d: MSR_DRAM_POWER_LIMIT: 0x%08llx (%slocked)\n",
7938 			cpu, msr, (msr >> 31) & 1 ? "" : "UN");
7939 
7940 		print_power_limit_msr(cpu, msr, "DRAM Limit");
7941 	}
7942 	if (platform->rapl_msrs & RAPL_CORE_POLICY) {
7943 		if (get_msr(cpu, MSR_PP0_POLICY, &msr))
7944 			return -7;
7945 
7946 		fprintf(outf, "cpu%d: MSR_PP0_POLICY: %lld\n", cpu, msr & 0xF);
7947 	}
7948 	if (platform->rapl_msrs & RAPL_CORE_POWER_LIMIT) {
7949 		if (get_msr(cpu, MSR_PP0_POWER_LIMIT, &msr))
7950 			return -9;
7951 		fprintf(outf, "cpu%d: MSR_PP0_POWER_LIMIT: 0x%08llx (%slocked)\n",
7952 			cpu, msr, (msr >> 31) & 1 ? "" : "UN");
7953 		print_power_limit_msr(cpu, msr, "Cores Limit");
7954 	}
7955 	if (platform->rapl_msrs & RAPL_GFX) {
7956 		if (get_msr(cpu, MSR_PP1_POLICY, &msr))
7957 			return -8;
7958 
7959 		fprintf(outf, "cpu%d: MSR_PP1_POLICY: %lld\n", cpu, msr & 0xF);
7960 
7961 		if (get_msr(cpu, MSR_PP1_POWER_LIMIT, &msr))
7962 			return -9;
7963 		fprintf(outf, "cpu%d: MSR_PP1_POWER_LIMIT: 0x%08llx (%slocked)\n",
7964 			cpu, msr, (msr >> 31) & 1 ? "" : "UN");
7965 		print_power_limit_msr(cpu, msr, "GFX Limit");
7966 	}
7967 	return 0;
7968 }
7969 
7970 /*
7971  * probe_rapl()
7972  *
7973  * sets rapl_power_units, rapl_energy_units, rapl_time_units
7974  */
7975 void probe_rapl(void)
7976 {
7977 	if (genuine_intel)
7978 		rapl_probe_intel();
7979 	if (authentic_amd || hygon_genuine)
7980 		rapl_probe_amd();
7981 
7982 	if (quiet)
7983 		return;
7984 
7985 	print_rapl_sysfs();
7986 
7987 	if (!platform->rapl_msrs || no_msr)
7988 		return;
7989 
7990 	for_all_cpus(print_rapl, ODD_COUNTERS);
7991 }
7992 
7993 /*
7994  * MSR_IA32_TEMPERATURE_TARGET indicates the temperature where
7995  * the Thermal Control Circuit (TCC) activates.
7996  * This is usually equal to tjMax.
7997  *
7998  * Older processors do not have this MSR, so there we guess,
7999  * but also allow cmdline over-ride with -T.
8000  *
8001  * Several MSR temperature values are in units of degrees-C
8002  * below this value, including the Digital Thermal Sensor (DTS),
8003  * Package Thermal Management Sensor (PTM), and thermal event thresholds.
8004  */
8005 int set_temperature_target(PER_THREAD_PARAMS)
8006 {
8007 	unsigned long long msr;
8008 	unsigned int tcc_default, tcc_offset;
8009 	int cpu;
8010 
8011 	UNUSED(c);
8012 	UNUSED(p);
8013 
8014 	/* tj_max is used only for dts or ptm */
8015 	if (!(do_dts || do_ptm))
8016 		return 0;
8017 
8018 	/* this is a per-package concept */
8019 	if (!is_cpu_first_thread_in_package(t, c, p))
8020 		return 0;
8021 
8022 	cpu = t->cpu_id;
8023 	if (cpu_migrate(cpu)) {
8024 		fprintf(outf, "Could not migrate to CPU %d\n", cpu);
8025 		return -1;
8026 	}
8027 
8028 	if (tj_max_override != 0) {
8029 		tj_max = tj_max_override;
8030 		fprintf(outf, "cpu%d: Using cmdline TCC Target (%d C)\n", cpu, tj_max);
8031 		return 0;
8032 	}
8033 
8034 	/* Temperature Target MSR is Nehalem and newer only */
8035 	if (!platform->has_nhm_msrs || no_msr)
8036 		goto guess;
8037 
8038 	if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr))
8039 		goto guess;
8040 
8041 	tcc_default = (msr >> 16) & 0xFF;
8042 
8043 	if (!quiet) {
8044 		int bits = platform->tcc_offset_bits;
8045 		unsigned long long enabled = 0;
8046 
8047 		if (bits && !get_msr(base_cpu, MSR_PLATFORM_INFO, &enabled))
8048 			enabled = (enabled >> 30) & 1;
8049 
8050 		if (bits && enabled) {
8051 			tcc_offset = (msr >> 24) & GENMASK(bits - 1, 0);
8052 			fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n",
8053 				cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset);
8054 		} else {
8055 			fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", cpu, msr, tcc_default);
8056 		}
8057 	}
8058 
8059 	if (!tcc_default)
8060 		goto guess;
8061 
8062 	tj_max = tcc_default;
8063 
8064 	return 0;
8065 
8066 guess:
8067 	tj_max = TJMAX_DEFAULT;
8068 	fprintf(outf, "cpu%d: Guessing tjMax %d C, Please use -T to specify\n", cpu, tj_max);
8069 
8070 	return 0;
8071 }
8072 
8073 int print_thermal(PER_THREAD_PARAMS)
8074 {
8075 	unsigned long long msr;
8076 	unsigned int dts, dts2;
8077 	int cpu;
8078 
8079 	UNUSED(c);
8080 	UNUSED(p);
8081 
8082 	if (no_msr)
8083 		return 0;
8084 
8085 	if (!(do_dts || do_ptm))
8086 		return 0;
8087 
8088 	cpu = t->cpu_id;
8089 
8090 	/* DTS is per-core, no need to print for each thread */
8091 	if (!is_cpu_first_thread_in_core(t, c, p))
8092 		return 0;
8093 
8094 	if (cpu_migrate(cpu)) {
8095 		fprintf(outf, "print_thermal: Could not migrate to CPU %d\n", cpu);
8096 		return -1;
8097 	}
8098 
8099 	if (do_ptm && is_cpu_first_core_in_package(t, c, p)) {
8100 		if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr))
8101 			return 0;
8102 
8103 		dts = (msr >> 16) & 0x7F;
8104 		fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", cpu, msr, tj_max - dts);
8105 
8106 		if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &msr))
8107 			return 0;
8108 
8109 		dts = (msr >> 16) & 0x7F;
8110 		dts2 = (msr >> 8) & 0x7F;
8111 		fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n",
8112 			cpu, msr, tj_max - dts, tj_max - dts2);
8113 	}
8114 
8115 	if (do_dts && debug) {
8116 		unsigned int resolution;
8117 
8118 		if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr))
8119 			return 0;
8120 
8121 		dts = (msr >> 16) & 0x7F;
8122 		resolution = (msr >> 27) & 0xF;
8123 		fprintf(outf, "cpu%d: MSR_IA32_THERM_STATUS: 0x%08llx (%d C +/- %d)\n",
8124 			cpu, msr, tj_max - dts, resolution);
8125 
8126 		if (get_msr(cpu, MSR_IA32_THERM_INTERRUPT, &msr))
8127 			return 0;
8128 
8129 		dts = (msr >> 16) & 0x7F;
8130 		dts2 = (msr >> 8) & 0x7F;
8131 		fprintf(outf, "cpu%d: MSR_IA32_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n",
8132 			cpu, msr, tj_max - dts, tj_max - dts2);
8133 	}
8134 
8135 	return 0;
8136 }
8137 
8138 void probe_thermal(void)
8139 {
8140 	if (!access("/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count", R_OK))
8141 		BIC_PRESENT(BIC_CORE_THROT_CNT);
8142 	else
8143 		BIC_NOT_PRESENT(BIC_CORE_THROT_CNT);
8144 
8145 	for_all_cpus(set_temperature_target, ODD_COUNTERS);
8146 
8147 	if (quiet)
8148 		return;
8149 
8150 	for_all_cpus(print_thermal, ODD_COUNTERS);
8151 }
8152 
8153 int get_cpu_type(PER_THREAD_PARAMS)
8154 {
8155 	unsigned int eax, ebx, ecx, edx;
8156 
8157 	UNUSED(c);
8158 	UNUSED(p);
8159 
8160 	if (!genuine_intel)
8161 		return 0;
8162 
8163 	if (cpu_migrate(t->cpu_id)) {
8164 		fprintf(outf, "Could not migrate to CPU %d\n", t->cpu_id);
8165 		return -1;
8166 	}
8167 
8168 	if (max_level < 0x1a)
8169 		return 0;
8170 
8171 	__cpuid(0x1a, eax, ebx, ecx, edx);
8172 	eax = (eax >> 24) & 0xFF;
8173 	if (eax == 0x20)
8174 		t->is_atom = true;
8175 	return 0;
8176 }
8177 
8178 void decode_feature_control_msr(void)
8179 {
8180 	unsigned long long msr;
8181 
8182 	if (no_msr)
8183 		return;
8184 
8185 	if (!get_msr(base_cpu, MSR_IA32_FEAT_CTL, &msr))
8186 		fprintf(outf, "cpu%d: MSR_IA32_FEATURE_CONTROL: 0x%08llx (%sLocked %s)\n",
8187 			base_cpu, msr, msr & FEAT_CTL_LOCKED ? "" : "UN-", msr & (1 << 18) ? "SGX" : "");
8188 }
8189 
8190 void decode_misc_enable_msr(void)
8191 {
8192 	unsigned long long msr;
8193 
8194 	if (no_msr)
8195 		return;
8196 
8197 	if (!genuine_intel)
8198 		return;
8199 
8200 	if (!get_msr(base_cpu, MSR_IA32_MISC_ENABLE, &msr))
8201 		fprintf(outf, "cpu%d: MSR_IA32_MISC_ENABLE: 0x%08llx (%sTCC %sEIST %sMWAIT %sPREFETCH %sTURBO)\n",
8202 			base_cpu, msr,
8203 			msr & MSR_IA32_MISC_ENABLE_TM1 ? "" : "No-",
8204 			msr & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP ? "" : "No-",
8205 			msr & MSR_IA32_MISC_ENABLE_MWAIT ? "" : "No-",
8206 			msr & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE ? "No-" : "",
8207 			msr & MSR_IA32_MISC_ENABLE_TURBO_DISABLE ? "No-" : "");
8208 }
8209 
8210 void decode_misc_feature_control(void)
8211 {
8212 	unsigned long long msr;
8213 
8214 	if (no_msr)
8215 		return;
8216 
8217 	if (!platform->has_msr_misc_feature_control)
8218 		return;
8219 
8220 	if (!get_msr(base_cpu, MSR_MISC_FEATURE_CONTROL, &msr))
8221 		fprintf(outf,
8222 			"cpu%d: MSR_MISC_FEATURE_CONTROL: 0x%08llx (%sL2-Prefetch %sL2-Prefetch-pair %sL1-Prefetch %sL1-IP-Prefetch)\n",
8223 			base_cpu, msr, msr & (0 << 0) ? "No-" : "", msr & (1 << 0) ? "No-" : "",
8224 			msr & (2 << 0) ? "No-" : "", msr & (3 << 0) ? "No-" : "");
8225 }
8226 
8227 /*
8228  * Decode MSR_MISC_PWR_MGMT
8229  *
8230  * Decode the bits according to the Nehalem documentation
8231  * bit[0] seems to continue to have same meaning going forward
8232  * bit[1] less so...
8233  */
8234 void decode_misc_pwr_mgmt_msr(void)
8235 {
8236 	unsigned long long msr;
8237 
8238 	if (no_msr)
8239 		return;
8240 
8241 	if (!platform->has_msr_misc_pwr_mgmt)
8242 		return;
8243 
8244 	if (!get_msr(base_cpu, MSR_MISC_PWR_MGMT, &msr))
8245 		fprintf(outf, "cpu%d: MSR_MISC_PWR_MGMT: 0x%08llx (%sable-EIST_Coordination %sable-EPB %sable-OOB)\n",
8246 			base_cpu, msr,
8247 			msr & (1 << 0) ? "DIS" : "EN", msr & (1 << 1) ? "EN" : "DIS", msr & (1 << 8) ? "EN" : "DIS");
8248 }
8249 
8250 /*
8251  * Decode MSR_CC6_DEMOTION_POLICY_CONFIG, MSR_MC6_DEMOTION_POLICY_CONFIG
8252  *
8253  * This MSRs are present on Silvermont processors,
8254  * Intel Atom processor E3000 series (Baytrail), and friends.
8255  */
8256 void decode_c6_demotion_policy_msr(void)
8257 {
8258 	unsigned long long msr;
8259 
8260 	if (no_msr)
8261 		return;
8262 
8263 	if (!platform->has_msr_c6_demotion_policy_config)
8264 		return;
8265 
8266 	if (!get_msr(base_cpu, MSR_CC6_DEMOTION_POLICY_CONFIG, &msr))
8267 		fprintf(outf, "cpu%d: MSR_CC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-CC6-Demotion)\n",
8268 			base_cpu, msr, msr & (1 << 0) ? "EN" : "DIS");
8269 
8270 	if (!get_msr(base_cpu, MSR_MC6_DEMOTION_POLICY_CONFIG, &msr))
8271 		fprintf(outf, "cpu%d: MSR_MC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-MC6-Demotion)\n",
8272 			base_cpu, msr, msr & (1 << 0) ? "EN" : "DIS");
8273 }
8274 
8275 void print_dev_latency(void)
8276 {
8277 	char *path = "/dev/cpu_dma_latency";
8278 	int fd;
8279 	int value;
8280 	int retval;
8281 
8282 	fd = open(path, O_RDONLY);
8283 	if (fd < 0) {
8284 		if (debug)
8285 			warnx("Read %s failed", path);
8286 		return;
8287 	}
8288 
8289 	retval = read(fd, (void *)&value, sizeof(int));
8290 	if (retval != sizeof(int)) {
8291 		warn("read failed %s", path);
8292 		close(fd);
8293 		return;
8294 	}
8295 	fprintf(outf, "/dev/cpu_dma_latency: %d usec (%s)\n", value, value == 2000000000 ? "default" : "constrained");
8296 
8297 	close(fd);
8298 }
8299 
8300 static int has_instr_count_access(void)
8301 {
8302 	int fd;
8303 	int has_access;
8304 
8305 	if (no_perf)
8306 		return 0;
8307 
8308 	fd = open_perf_counter(base_cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0);
8309 	has_access = fd != -1;
8310 
8311 	if (fd != -1)
8312 		close(fd);
8313 
8314 	if (!has_access)
8315 		warnx("Failed to access %s. Some of the counters may not be available\n"
8316 		      "\tRun as root to enable them or use %s to disable the access explicitly",
8317 		      "instructions retired perf counter", "--no-perf");
8318 
8319 	return has_access;
8320 }
8321 
8322 int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai,
8323 			  double *scale_, enum rapl_unit *unit_)
8324 {
8325 	int ret = -1;
8326 
8327 	if (no_perf)
8328 		return -1;
8329 
8330 	if (!cai->perf_name)
8331 		return -1;
8332 
8333 	const double scale = read_perf_scale(cai->perf_subsys, cai->perf_name);
8334 
8335 	if (scale == 0.0)
8336 		goto end;
8337 
8338 	const enum rapl_unit unit = read_perf_rapl_unit(cai->perf_subsys, cai->perf_name);
8339 
8340 	if (unit == RAPL_UNIT_INVALID)
8341 		goto end;
8342 
8343 	const unsigned int rapl_type = read_perf_type(cai->perf_subsys);
8344 	const unsigned int rapl_energy_pkg_config = read_perf_config(cai->perf_subsys, cai->perf_name);
8345 
8346 	ret = open_perf_counter(cpu, rapl_type, rapl_energy_pkg_config, rci->fd_perf, PERF_FORMAT_GROUP);
8347 	if (ret == -1)
8348 		goto end;
8349 
8350 	/* If it's the first counter opened, make it a group descriptor */
8351 	if (rci->fd_perf == -1)
8352 		rci->fd_perf = ret;
8353 
8354 	*scale_ = scale;
8355 	*unit_ = unit;
8356 
8357 end:
8358 	if (debug >= 2)
8359 		fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu);
8360 
8361 	return ret;
8362 }
8363 
8364 /*
8365  * Linux-perf manages the HW instructions-retired counter
8366  * by enabling when requested, and hiding rollover
8367  */
8368 void linux_perf_init(void)
8369 {
8370 	if (access("/proc/sys/kernel/perf_event_paranoid", F_OK))
8371 		return;
8372 
8373 	if (BIC_IS_ENABLED(BIC_IPC) && has_aperf) {
8374 		fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
8375 		if (fd_instr_count_percpu == NULL)
8376 			err(-1, "calloc fd_instr_count_percpu");
8377 	}
8378 }
8379 
8380 void rapl_perf_init(void)
8381 {
8382 	const unsigned int num_domains = get_rapl_num_domains();
8383 	bool *domain_visited = calloc(num_domains, sizeof(bool));
8384 
8385 	rapl_counter_info_perdomain = calloc(num_domains, sizeof(*rapl_counter_info_perdomain));
8386 	if (rapl_counter_info_perdomain == NULL)
8387 		err(-1, "calloc rapl_counter_info_percpu");
8388 	rapl_counter_info_perdomain_size = num_domains;
8389 
8390 	/*
8391 	 * Initialize rapl_counter_info_percpu
8392 	 */
8393 	for (unsigned int domain_id = 0; domain_id < num_domains; ++domain_id) {
8394 		struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain_id];
8395 
8396 		rci->fd_perf = -1;
8397 		for (size_t i = 0; i < NUM_RAPL_COUNTERS; ++i) {
8398 			rci->data[i] = 0;
8399 			rci->source[i] = COUNTER_SOURCE_NONE;
8400 		}
8401 	}
8402 
8403 	/*
8404 	 * Open/probe the counters
8405 	 * If can't get it via perf, fallback to MSR
8406 	 */
8407 	for (size_t i = 0; i < ARRAY_SIZE(rapl_counter_arch_infos); ++i) {
8408 
8409 		const struct rapl_counter_arch_info *const cai = &rapl_counter_arch_infos[i];
8410 		bool has_counter = 0;
8411 		double scale;
8412 		enum rapl_unit unit;
8413 		unsigned int next_domain;
8414 
8415 		if (!BIC_IS_ENABLED(cai->bic_number))
8416 			continue;
8417 
8418 		memset(domain_visited, 0, num_domains * sizeof(*domain_visited));
8419 
8420 		for (int cpu = 0; cpu < topo.max_cpu_num + 1; ++cpu) {
8421 
8422 			if (cpu_is_not_allowed(cpu))
8423 				continue;
8424 
8425 			/* Skip already seen and handled RAPL domains */
8426 			next_domain = get_rapl_domain_id(cpu);
8427 
8428 			assert(next_domain < num_domains);
8429 
8430 			if (domain_visited[next_domain])
8431 				continue;
8432 
8433 			domain_visited[next_domain] = 1;
8434 
8435 			if ((cai->flags & RAPL_COUNTER_FLAG_PLATFORM_COUNTER) && (cpu != base_cpu))
8436 				continue;
8437 
8438 			struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[next_domain];
8439 
8440 			/*
8441 			 * rapl_counter_arch_infos[] can have multiple entries describing the same
8442 			 * counter, due to the difference from different platforms/Vendors.
8443 			 * E.g. rapl_counter_arch_infos[0] and rapl_counter_arch_infos[1] share the
8444 			 * same perf_subsys and perf_name, but with different MSR address.
8445 			 * rapl_counter_arch_infos[0] is for Intel and rapl_counter_arch_infos[1]
8446 			 * is for AMD.
8447 			 * In this case, it is possible that multiple rapl_counter_arch_infos[]
8448 			 * entries are probed just because their perf/msr is duplicate and valid.
8449 			 *
8450 			 * Thus need a check to avoid re-probe the same counters.
8451 			 */
8452 			if (rci->source[cai->rci_index] != COUNTER_SOURCE_NONE)
8453 				break;
8454 
8455 			/* Use perf API for this counter */
8456 			if (add_rapl_perf_counter(cpu, rci, cai, &scale, &unit) != -1) {
8457 				rci->source[cai->rci_index] = COUNTER_SOURCE_PERF;
8458 				rci->scale[cai->rci_index] = scale * cai->compat_scale;
8459 				rci->unit[cai->rci_index] = unit;
8460 				rci->flags[cai->rci_index] = cai->flags;
8461 
8462 				/* Use MSR for this counter */
8463 			} else if (add_rapl_msr_counter(cpu, cai) >= 0) {
8464 				rci->source[cai->rci_index] = COUNTER_SOURCE_MSR;
8465 				rci->msr[cai->rci_index] = cai->msr;
8466 				rci->msr_mask[cai->rci_index] = cai->msr_mask;
8467 				rci->msr_shift[cai->rci_index] = cai->msr_shift;
8468 				rci->unit[cai->rci_index] = RAPL_UNIT_JOULES;
8469 				rci->scale[cai->rci_index] = *cai->platform_rapl_msr_scale * cai->compat_scale;
8470 				rci->flags[cai->rci_index] = cai->flags;
8471 			}
8472 
8473 			if (rci->source[cai->rci_index] != COUNTER_SOURCE_NONE)
8474 				has_counter = 1;
8475 		}
8476 
8477 		/* If any CPU has access to the counter, make it present */
8478 		if (has_counter)
8479 			BIC_PRESENT(cai->bic_number);
8480 	}
8481 
8482 	free(domain_visited);
8483 }
8484 
8485 /* Assumes msr_counter_info is populated */
8486 static int has_amperf_access(void)
8487 {
8488 	return msr_counter_arch_infos[MSR_ARCH_INFO_APERF_INDEX].present &&
8489 	    msr_counter_arch_infos[MSR_ARCH_INFO_MPERF_INDEX].present;
8490 }
8491 
8492 int *get_cstate_perf_group_fd(struct cstate_counter_info_t *cci, const char *group_name)
8493 {
8494 	if (strcmp(group_name, "cstate_core") == 0)
8495 		return &cci->fd_perf_core;
8496 
8497 	if (strcmp(group_name, "cstate_pkg") == 0)
8498 		return &cci->fd_perf_pkg;
8499 
8500 	return NULL;
8501 }
8502 
8503 int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, const struct cstate_counter_arch_info *cai)
8504 {
8505 	int ret = -1;
8506 
8507 	if (no_perf)
8508 		return -1;
8509 
8510 	if (!cai->perf_name)
8511 		return -1;
8512 
8513 	int *pfd_group = get_cstate_perf_group_fd(cci, cai->perf_subsys);
8514 
8515 	if (pfd_group == NULL)
8516 		goto end;
8517 
8518 	const unsigned int type = read_perf_type(cai->perf_subsys);
8519 	const unsigned int config = read_perf_config(cai->perf_subsys, cai->perf_name);
8520 
8521 	ret = open_perf_counter(cpu, type, config, *pfd_group, PERF_FORMAT_GROUP);
8522 
8523 	if (ret == -1)
8524 		goto end;
8525 
8526 	/* If it's the first counter opened, make it a group descriptor */
8527 	if (*pfd_group == -1)
8528 		*pfd_group = ret;
8529 
8530 end:
8531 	if (debug >= 2)
8532 		fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu);
8533 
8534 	return ret;
8535 }
8536 
8537 int add_msr_perf_counter(int cpu, struct msr_counter_info_t *cci, const struct msr_counter_arch_info *cai)
8538 {
8539 	int ret = -1;
8540 
8541 	if (no_perf)
8542 		return -1;
8543 
8544 	if (!cai->perf_name)
8545 		return -1;
8546 
8547 	const unsigned int type = read_perf_type(cai->perf_subsys);
8548 	const unsigned int config = read_perf_config(cai->perf_subsys, cai->perf_name);
8549 
8550 	ret = open_perf_counter(cpu, type, config, cci->fd_perf, PERF_FORMAT_GROUP);
8551 
8552 	if (ret == -1)
8553 		goto end;
8554 
8555 	/* If it's the first counter opened, make it a group descriptor */
8556 	if (cci->fd_perf == -1)
8557 		cci->fd_perf = ret;
8558 
8559 end:
8560 	if (debug)
8561 		fprintf(stderr, "%s: %s/%s: %d (cpu: %d)\n", __func__, cai->perf_subsys, cai->perf_name, ret, cpu);
8562 
8563 	return ret;
8564 }
8565 
8566 void msr_perf_init_(void)
8567 {
8568 	const int mci_num = topo.max_cpu_num + 1;
8569 
8570 	msr_counter_info = calloc(mci_num, sizeof(*msr_counter_info));
8571 	if (!msr_counter_info)
8572 		err(1, "calloc msr_counter_info");
8573 	msr_counter_info_size = mci_num;
8574 
8575 	for (int cpu = 0; cpu < mci_num; ++cpu)
8576 		msr_counter_info[cpu].fd_perf = -1;
8577 
8578 	for (int cidx = 0; cidx < NUM_MSR_COUNTERS; ++cidx) {
8579 
8580 		struct msr_counter_arch_info *cai = &msr_counter_arch_infos[cidx];
8581 
8582 		cai->present = false;
8583 
8584 		for (int cpu = 0; cpu < mci_num; ++cpu) {
8585 
8586 			struct msr_counter_info_t *const cci = &msr_counter_info[cpu];
8587 
8588 			if (cpu_is_not_allowed(cpu))
8589 				continue;
8590 
8591 			if (cai->needed) {
8592 				/* Use perf API for this counter */
8593 				if (add_msr_perf_counter(cpu, cci, cai) != -1) {
8594 					cci->source[cai->rci_index] = COUNTER_SOURCE_PERF;
8595 					cai->present = true;
8596 
8597 					/* User MSR for this counter */
8598 				} else if (add_msr_counter(cpu, cai->msr) >= 0) {
8599 					cci->source[cai->rci_index] = COUNTER_SOURCE_MSR;
8600 					cci->msr[cai->rci_index] = cai->msr;
8601 					cci->msr_mask[cai->rci_index] = cai->msr_mask;
8602 					cai->present = true;
8603 				}
8604 			}
8605 		}
8606 	}
8607 }
8608 
8609 /* Initialize data for reading perf counters from the MSR group. */
8610 void msr_perf_init(void)
8611 {
8612 	bool need_amperf = false, need_smi = false;
8613 	const bool need_soft_c1 = (!platform->has_msr_core_c1_res) && (platform->supported_cstates & CC1);
8614 
8615 	need_amperf = BIC_IS_ENABLED(BIC_Avg_MHz) || BIC_IS_ENABLED(BIC_Busy) || BIC_IS_ENABLED(BIC_Bzy_MHz)
8616 	    || BIC_IS_ENABLED(BIC_IPC) || need_soft_c1;
8617 
8618 	if (BIC_IS_ENABLED(BIC_SMI))
8619 		need_smi = true;
8620 
8621 	/* Enable needed counters */
8622 	msr_counter_arch_infos[MSR_ARCH_INFO_APERF_INDEX].needed = need_amperf;
8623 	msr_counter_arch_infos[MSR_ARCH_INFO_MPERF_INDEX].needed = need_amperf;
8624 	msr_counter_arch_infos[MSR_ARCH_INFO_SMI_INDEX].needed = need_smi;
8625 
8626 	msr_perf_init_();
8627 
8628 	const bool has_amperf = has_amperf_access();
8629 	const bool has_smi = msr_counter_arch_infos[MSR_ARCH_INFO_SMI_INDEX].present;
8630 
8631 	has_aperf_access = has_amperf;
8632 
8633 	if (has_amperf) {
8634 		BIC_PRESENT(BIC_Avg_MHz);
8635 		BIC_PRESENT(BIC_Busy);
8636 		BIC_PRESENT(BIC_Bzy_MHz);
8637 		BIC_PRESENT(BIC_SMI);
8638 	}
8639 
8640 	if (has_smi)
8641 		BIC_PRESENT(BIC_SMI);
8642 }
8643 
8644 void cstate_perf_init_(bool soft_c1)
8645 {
8646 	bool has_counter;
8647 	bool *cores_visited = NULL, *pkg_visited = NULL;
8648 	const int cores_visited_elems = topo.max_core_id + 1;
8649 	const int pkg_visited_elems = topo.max_package_id + 1;
8650 	const int cci_num = topo.max_cpu_num + 1;
8651 
8652 	ccstate_counter_info = calloc(cci_num, sizeof(*ccstate_counter_info));
8653 	if (!ccstate_counter_info)
8654 		err(1, "calloc ccstate_counter_arch_info");
8655 	ccstate_counter_info_size = cci_num;
8656 
8657 	cores_visited = calloc(cores_visited_elems, sizeof(*cores_visited));
8658 	if (!cores_visited)
8659 		err(1, "calloc cores_visited");
8660 
8661 	pkg_visited = calloc(pkg_visited_elems, sizeof(*pkg_visited));
8662 	if (!pkg_visited)
8663 		err(1, "calloc pkg_visited");
8664 
8665 	/* Initialize cstate_counter_info_percpu */
8666 	for (int cpu = 0; cpu < cci_num; ++cpu) {
8667 		ccstate_counter_info[cpu].fd_perf_core = -1;
8668 		ccstate_counter_info[cpu].fd_perf_pkg = -1;
8669 	}
8670 
8671 	for (int cidx = 0; cidx < NUM_CSTATE_COUNTERS; ++cidx) {
8672 		has_counter = false;
8673 		memset(cores_visited, 0, cores_visited_elems * sizeof(*cores_visited));
8674 		memset(pkg_visited, 0, pkg_visited_elems * sizeof(*pkg_visited));
8675 
8676 		const struct cstate_counter_arch_info *cai = &ccstate_counter_arch_infos[cidx];
8677 
8678 		for (int cpu = 0; cpu < cci_num; ++cpu) {
8679 
8680 			struct cstate_counter_info_t *const cci = &ccstate_counter_info[cpu];
8681 
8682 			if (cpu_is_not_allowed(cpu))
8683 				continue;
8684 
8685 			const int core_id = cpus[cpu].physical_core_id;
8686 			const int pkg_id = cpus[cpu].physical_package_id;
8687 
8688 			assert(core_id < cores_visited_elems);
8689 			assert(pkg_id < pkg_visited_elems);
8690 
8691 			const bool per_thread = cai->flags & CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD;
8692 			const bool per_core = cai->flags & CSTATE_COUNTER_FLAG_COLLECT_PER_CORE;
8693 
8694 			if (!per_thread && cores_visited[core_id])
8695 				continue;
8696 
8697 			if (!per_core && pkg_visited[pkg_id])
8698 				continue;
8699 
8700 			const bool counter_needed = BIC_IS_ENABLED(cai->bic_number) ||
8701 			    (soft_c1 && (cai->flags & CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY));
8702 			const bool counter_supported = (platform->supported_cstates & cai->feature_mask);
8703 
8704 			if (counter_needed && counter_supported) {
8705 				/* Use perf API for this counter */
8706 				if (add_cstate_perf_counter(cpu, cci, cai) != -1) {
8707 
8708 					cci->source[cai->rci_index] = COUNTER_SOURCE_PERF;
8709 
8710 					/* User MSR for this counter */
8711 				} else if (pkg_cstate_limit >= cai->pkg_cstate_limit
8712 					   && add_msr_counter(cpu, cai->msr) >= 0) {
8713 					cci->source[cai->rci_index] = COUNTER_SOURCE_MSR;
8714 					cci->msr[cai->rci_index] = cai->msr;
8715 				}
8716 			}
8717 
8718 			if (cci->source[cai->rci_index] != COUNTER_SOURCE_NONE) {
8719 				has_counter = true;
8720 				cores_visited[core_id] = true;
8721 				pkg_visited[pkg_id] = true;
8722 			}
8723 		}
8724 
8725 		/* If any CPU has access to the counter, make it present */
8726 		if (has_counter)
8727 			BIC_PRESENT(cai->bic_number);
8728 	}
8729 
8730 	free(cores_visited);
8731 	free(pkg_visited);
8732 }
8733 
8734 void cstate_perf_init(void)
8735 {
8736 	/*
8737 	 * If we don't have a C1 residency MSR, we calculate it "in software",
8738 	 * but we need APERF, MPERF too.
8739 	 */
8740 	const bool soft_c1 = !platform->has_msr_core_c1_res && has_amperf_access()
8741 	    && platform->supported_cstates & CC1;
8742 
8743 	if (soft_c1)
8744 		BIC_PRESENT(BIC_CPU_c1);
8745 
8746 	cstate_perf_init_(soft_c1);
8747 }
8748 
8749 void probe_cstates(void)
8750 {
8751 	probe_cst_limit();
8752 
8753 	if (platform->has_msr_module_c6_res_ms)
8754 		BIC_PRESENT(BIC_Mod_c6);
8755 
8756 	if (platform->has_ext_cst_msrs && !no_msr) {
8757 		BIC_PRESENT(BIC_Totl_c0);
8758 		BIC_PRESENT(BIC_Any_c0);
8759 		BIC_PRESENT(BIC_GFX_c0);
8760 		BIC_PRESENT(BIC_CPUGFX);
8761 	}
8762 
8763 	if (quiet)
8764 		return;
8765 
8766 	dump_power_ctl();
8767 	dump_cst_cfg();
8768 	decode_c6_demotion_policy_msr();
8769 	print_dev_latency();
8770 	dump_sysfs_cstate_config();
8771 	print_irtl();
8772 }
8773 
8774 void probe_lpi(void)
8775 {
8776 	if (!access("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", R_OK))
8777 		BIC_PRESENT(BIC_CPU_LPI);
8778 	else
8779 		BIC_NOT_PRESENT(BIC_CPU_LPI);
8780 
8781 	if (!access(sys_lpi_file_sysfs, R_OK)) {
8782 		sys_lpi_file = sys_lpi_file_sysfs;
8783 		BIC_PRESENT(BIC_SYS_LPI);
8784 	} else if (!access(sys_lpi_file_debugfs, R_OK)) {
8785 		sys_lpi_file = sys_lpi_file_debugfs;
8786 		BIC_PRESENT(BIC_SYS_LPI);
8787 	} else {
8788 		sys_lpi_file_sysfs = NULL;
8789 		BIC_NOT_PRESENT(BIC_SYS_LPI);
8790 	}
8791 
8792 }
8793 
8794 void probe_pstates(void)
8795 {
8796 	probe_bclk();
8797 
8798 	if (quiet)
8799 		return;
8800 
8801 	dump_platform_info();
8802 	dump_turbo_ratio_info();
8803 	dump_sysfs_pstate_config();
8804 	decode_misc_pwr_mgmt_msr();
8805 
8806 	for_all_cpus(print_hwp, ODD_COUNTERS);
8807 	for_all_cpus(print_epb, ODD_COUNTERS);
8808 	for_all_cpus(print_perf_limit, ODD_COUNTERS);
8809 }
8810 
8811 void process_cpuid()
8812 {
8813 	unsigned int eax, ebx, ecx, edx;
8814 	unsigned int fms, family, model, stepping, ecx_flags, edx_flags;
8815 	unsigned long long ucode_patch = 0;
8816 	bool ucode_patch_valid = false;
8817 
8818 	eax = ebx = ecx = edx = 0;
8819 
8820 	__cpuid(0, max_level, ebx, ecx, edx);
8821 
8822 	if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
8823 		genuine_intel = 1;
8824 	else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
8825 		authentic_amd = 1;
8826 	else if (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e)
8827 		hygon_genuine = 1;
8828 
8829 	if (!quiet)
8830 		fprintf(outf, "CPUID(0): %.4s%.4s%.4s 0x%x CPUID levels\n",
8831 			(char *)&ebx, (char *)&edx, (char *)&ecx, max_level);
8832 
8833 	__cpuid(1, fms, ebx, ecx, edx);
8834 	family = (fms >> 8) & 0xf;
8835 	model = (fms >> 4) & 0xf;
8836 	stepping = fms & 0xf;
8837 	if (family == 0xf)
8838 		family += (fms >> 20) & 0xff;
8839 	if (family >= 6)
8840 		model += ((fms >> 16) & 0xf) << 4;
8841 	ecx_flags = ecx;
8842 	edx_flags = edx;
8843 
8844 	if (!no_msr) {
8845 		if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch))
8846 			warnx("get_msr(UCODE)");
8847 		else
8848 			ucode_patch_valid = true;
8849 	}
8850 
8851 	/*
8852 	 * check max extended function levels of CPUID.
8853 	 * This is needed to check for invariant TSC.
8854 	 * This check is valid for both Intel and AMD.
8855 	 */
8856 	ebx = ecx = edx = 0;
8857 	__cpuid(0x80000000, max_extended_level, ebx, ecx, edx);
8858 
8859 	if (!quiet) {
8860 		fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d)",
8861 			family, model, stepping, family, model, stepping);
8862 		if (ucode_patch_valid)
8863 			fprintf(outf, " microcode 0x%x", (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF));
8864 		fputc('\n', outf);
8865 
8866 		fprintf(outf, "CPUID(0x80000000): max_extended_levels: 0x%x\n", max_extended_level);
8867 		fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s %s %s\n",
8868 			ecx_flags & (1 << 0) ? "SSE3" : "-",
8869 			ecx_flags & (1 << 3) ? "MONITOR" : "-",
8870 			ecx_flags & (1 << 6) ? "SMX" : "-",
8871 			ecx_flags & (1 << 7) ? "EIST" : "-",
8872 			ecx_flags & (1 << 8) ? "TM2" : "-",
8873 			edx_flags & (1 << 4) ? "TSC" : "-",
8874 			edx_flags & (1 << 5) ? "MSR" : "-",
8875 			edx_flags & (1 << 22) ? "ACPI-TM" : "-",
8876 			edx_flags & (1 << 28) ? "HT" : "-", edx_flags & (1 << 29) ? "TM" : "-");
8877 	}
8878 
8879 	probe_platform_features(family, model);
8880 
8881 	if (!(edx_flags & (1 << 5)))
8882 		errx(1, "CPUID: no MSR");
8883 
8884 	if (max_extended_level >= 0x80000007) {
8885 
8886 		/*
8887 		 * Non-Stop TSC is advertised by CPUID.EAX=0x80000007: EDX.bit8
8888 		 * this check is valid for both Intel and AMD
8889 		 */
8890 		__cpuid(0x80000007, eax, ebx, ecx, edx);
8891 		has_invariant_tsc = edx & (1 << 8);
8892 	}
8893 
8894 	/*
8895 	 * APERF/MPERF is advertised by CPUID.EAX=0x6: ECX.bit0
8896 	 * this check is valid for both Intel and AMD
8897 	 */
8898 
8899 	__cpuid(0x6, eax, ebx, ecx, edx);
8900 	has_aperf = ecx & (1 << 0);
8901 	do_dts = eax & (1 << 0);
8902 	if (do_dts)
8903 		BIC_PRESENT(BIC_CoreTmp);
8904 	has_turbo = eax & (1 << 1);
8905 	do_ptm = eax & (1 << 6);
8906 	if (do_ptm)
8907 		BIC_PRESENT(BIC_PkgTmp);
8908 	has_hwp = eax & (1 << 7);
8909 	has_hwp_notify = eax & (1 << 8);
8910 	has_hwp_activity_window = eax & (1 << 9);
8911 	has_hwp_epp = eax & (1 << 10);
8912 	has_hwp_pkg = eax & (1 << 11);
8913 	has_epb = ecx & (1 << 3);
8914 
8915 	if (!quiet)
8916 		fprintf(outf, "CPUID(6): %sAPERF, %sTURBO, %sDTS, %sPTM, %sHWP, "
8917 			"%sHWPnotify, %sHWPwindow, %sHWPepp, %sHWPpkg, %sEPB\n",
8918 			has_aperf ? "" : "No-",
8919 			has_turbo ? "" : "No-",
8920 			do_dts ? "" : "No-",
8921 			do_ptm ? "" : "No-",
8922 			has_hwp ? "" : "No-",
8923 			has_hwp_notify ? "" : "No-",
8924 			has_hwp_activity_window ? "" : "No-",
8925 			has_hwp_epp ? "" : "No-", has_hwp_pkg ? "" : "No-", has_epb ? "" : "No-");
8926 
8927 	if (!quiet)
8928 		decode_misc_enable_msr();
8929 
8930 	if (max_level >= 0x7 && !quiet) {
8931 		int has_sgx;
8932 
8933 		ecx = 0;
8934 
8935 		__cpuid_count(0x7, 0, eax, ebx, ecx, edx);
8936 
8937 		has_sgx = ebx & (1 << 2);
8938 
8939 		is_hybrid = edx & (1 << 15);
8940 
8941 		fprintf(outf, "CPUID(7): %sSGX %sHybrid\n", has_sgx ? "" : "No-", is_hybrid ? "" : "No-");
8942 
8943 		if (has_sgx)
8944 			decode_feature_control_msr();
8945 	}
8946 
8947 	if (max_level >= 0x15) {
8948 		unsigned int eax_crystal;
8949 		unsigned int ebx_tsc;
8950 
8951 		/*
8952 		 * CPUID 15H TSC/Crystal ratio, possibly Crystal Hz
8953 		 */
8954 		eax_crystal = ebx_tsc = crystal_hz = edx = 0;
8955 		__cpuid(0x15, eax_crystal, ebx_tsc, crystal_hz, edx);
8956 
8957 		if (ebx_tsc != 0) {
8958 			if (!quiet && (ebx != 0))
8959 				fprintf(outf, "CPUID(0x15): eax_crystal: %d ebx_tsc: %d ecx_crystal_hz: %d\n",
8960 					eax_crystal, ebx_tsc, crystal_hz);
8961 
8962 			if (crystal_hz == 0)
8963 				crystal_hz = platform->crystal_freq;
8964 
8965 			if (crystal_hz) {
8966 				tsc_hz = (unsigned long long)crystal_hz *ebx_tsc / eax_crystal;
8967 				if (!quiet)
8968 					fprintf(outf, "TSC: %lld MHz (%d Hz * %d / %d / 1000000)\n",
8969 						tsc_hz / 1000000, crystal_hz, ebx_tsc, eax_crystal);
8970 			}
8971 		}
8972 	}
8973 	if (max_level >= 0x16) {
8974 		unsigned int base_mhz, max_mhz, bus_mhz, edx;
8975 
8976 		/*
8977 		 * CPUID 16H Base MHz, Max MHz, Bus MHz
8978 		 */
8979 		base_mhz = max_mhz = bus_mhz = edx = 0;
8980 
8981 		__cpuid(0x16, base_mhz, max_mhz, bus_mhz, edx);
8982 
8983 		bclk = bus_mhz;
8984 
8985 		base_hz = base_mhz * 1000000;
8986 		has_base_hz = 1;
8987 
8988 		if (platform->enable_tsc_tweak)
8989 			tsc_tweak = base_hz / tsc_hz;
8990 
8991 		if (!quiet)
8992 			fprintf(outf, "CPUID(0x16): base_mhz: %d max_mhz: %d bus_mhz: %d\n",
8993 				base_mhz, max_mhz, bus_mhz);
8994 	}
8995 
8996 	if (has_aperf)
8997 		aperf_mperf_multiplier = platform->need_perf_multiplier ? 1024 : 1;
8998 
8999 	BIC_PRESENT(BIC_IRQ);
9000 	BIC_PRESENT(BIC_NMI);
9001 	BIC_PRESENT(BIC_TSC_MHz);
9002 }
9003 
9004 static void counter_info_init(void)
9005 {
9006 	for (int i = 0; i < NUM_CSTATE_COUNTERS; ++i) {
9007 		struct cstate_counter_arch_info *const cai = &ccstate_counter_arch_infos[i];
9008 
9009 		if (platform->has_msr_knl_core_c6_residency && cai->msr == MSR_CORE_C6_RESIDENCY)
9010 			cai->msr = MSR_KNL_CORE_C6_RESIDENCY;
9011 
9012 		if (!platform->has_msr_core_c1_res && cai->msr == MSR_CORE_C1_RES)
9013 			cai->msr = 0;
9014 
9015 		if (platform->has_msr_atom_pkg_c6_residency && cai->msr == MSR_PKG_C6_RESIDENCY)
9016 			cai->msr = MSR_ATOM_PKG_C6_RESIDENCY;
9017 	}
9018 
9019 	for (int i = 0; i < NUM_MSR_COUNTERS; ++i) {
9020 		msr_counter_arch_infos[i].present = false;
9021 		msr_counter_arch_infos[i].needed = false;
9022 	}
9023 }
9024 
9025 void probe_pm_features(void)
9026 {
9027 	probe_pstates();
9028 
9029 	probe_cstates();
9030 
9031 	probe_lpi();
9032 
9033 	probe_intel_uncore_frequency();
9034 
9035 	probe_graphics();
9036 
9037 	probe_rapl();
9038 
9039 	probe_thermal();
9040 
9041 	if (platform->has_nhm_msrs && !no_msr)
9042 		BIC_PRESENT(BIC_SMI);
9043 
9044 	if (!quiet)
9045 		decode_misc_feature_control();
9046 }
9047 
/*
 * dir_filter()
 *
 * Directory-entry filter for /dev/cpu/: accept names that start with a
 * digit, i.e. filter out ".", "..", "microcode".
 *
 * Returns 1 to keep the entry, 0 to drop it.
 */
int dir_filter(const struct dirent *dirp)
{
	/*
	 * d_name is plain char, which may be signed; <ctype.h> functions are
	 * only defined for values representable as unsigned char (or EOF).
	 */
	return isdigit((unsigned char)dirp->d_name[0]) ? 1 : 0;
}
9059 
9060 char *possible_file = "/sys/devices/system/cpu/possible";
9061 char possible_buf[1024];
9062 
9063 int initialize_cpu_possible_set(void)
9064 {
9065 	FILE *fp;
9066 
9067 	fp = fopen(possible_file, "r");
9068 	if (!fp) {
9069 		warn("open %s", possible_file);
9070 		return -1;
9071 	}
9072 	if (fread(possible_buf, sizeof(char), 1024, fp) == 0) {
9073 		warn("read %s", possible_file);
9074 		goto err;
9075 	}
9076 	if (parse_cpu_str(possible_buf, cpu_possible_set, cpu_possible_setsize)) {
9077 		warnx("%s: cpu str malformat %s\n", possible_file, cpu_effective_str);
9078 		goto err;
9079 	}
9080 	return 0;
9081 
9082 err:
9083 	fclose(fp);
9084 	return -1;
9085 }
9086 
9087 void topology_probe(bool startup)
9088 {
9089 	int i;
9090 	int max_core_id = 0;
9091 	int max_package_id = 0;
9092 	int max_siblings = 0;
9093 
9094 	/* Initialize num_cpus, max_cpu_num */
9095 	set_max_cpu_num();
9096 	topo.num_cpus = 0;
9097 	for_all_proc_cpus(count_cpus);
9098 	if (!summary_only)
9099 		BIC_PRESENT(BIC_CPU);
9100 
9101 	if (debug > 1)
9102 		fprintf(outf, "num_cpus %d max_cpu_num %d\n", topo.num_cpus, topo.max_cpu_num);
9103 
9104 	cpus = calloc(1, (topo.max_cpu_num + 1) * sizeof(struct cpu_topology));
9105 	if (cpus == NULL)
9106 		err(1, "calloc cpus");
9107 
9108 	/*
9109 	 * Allocate and initialize cpu_present_set
9110 	 */
9111 	cpu_present_set = CPU_ALLOC((topo.max_cpu_num + 1));
9112 	if (cpu_present_set == NULL)
9113 		err(3, "CPU_ALLOC");
9114 	cpu_present_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
9115 	CPU_ZERO_S(cpu_present_setsize, cpu_present_set);
9116 	for_all_proc_cpus(mark_cpu_present);
9117 
9118 	/*
9119 	 * Allocate and initialize cpu_possible_set
9120 	 */
9121 	cpu_possible_set = CPU_ALLOC((topo.max_cpu_num + 1));
9122 	if (cpu_possible_set == NULL)
9123 		err(3, "CPU_ALLOC");
9124 	cpu_possible_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
9125 	CPU_ZERO_S(cpu_possible_setsize, cpu_possible_set);
9126 	initialize_cpu_possible_set();
9127 
9128 	/*
9129 	 * Allocate and initialize cpu_effective_set
9130 	 */
9131 	cpu_effective_set = CPU_ALLOC((topo.max_cpu_num + 1));
9132 	if (cpu_effective_set == NULL)
9133 		err(3, "CPU_ALLOC");
9134 	cpu_effective_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
9135 	CPU_ZERO_S(cpu_effective_setsize, cpu_effective_set);
9136 	update_effective_set(startup);
9137 
9138 	/*
9139 	 * Allocate and initialize cpu_allowed_set
9140 	 */
9141 	cpu_allowed_set = CPU_ALLOC((topo.max_cpu_num + 1));
9142 	if (cpu_allowed_set == NULL)
9143 		err(3, "CPU_ALLOC");
9144 	cpu_allowed_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
9145 	CPU_ZERO_S(cpu_allowed_setsize, cpu_allowed_set);
9146 
9147 	/*
9148 	 * Validate and update cpu_allowed_set.
9149 	 *
9150 	 * Make sure all cpus in cpu_subset are also in cpu_present_set during startup.
9151 	 * Give a warning when cpus in cpu_subset become unavailable at runtime.
9152 	 * Give a warning when cpus are not effective because of cgroup setting.
9153 	 *
9154 	 * cpu_allowed_set is the intersection of cpu_present_set/cpu_effective_set/cpu_subset.
9155 	 */
9156 	for (i = 0; i < CPU_SUBSET_MAXCPUS; ++i) {
9157 		if (cpu_subset && !CPU_ISSET_S(i, cpu_subset_size, cpu_subset))
9158 			continue;
9159 
9160 		if (!CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) {
9161 			if (cpu_subset) {
9162 				/* cpus in cpu_subset must be in cpu_present_set during startup */
9163 				if (startup)
9164 					err(1, "cpu%d not present", i);
9165 				else
9166 					fprintf(stderr, "cpu%d not present\n", i);
9167 			}
9168 			continue;
9169 		}
9170 
9171 		if (CPU_COUNT_S(cpu_effective_setsize, cpu_effective_set)) {
9172 			if (!CPU_ISSET_S(i, cpu_effective_setsize, cpu_effective_set)) {
9173 				fprintf(stderr, "cpu%d not effective\n", i);
9174 				continue;
9175 			}
9176 		}
9177 
9178 		CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set);
9179 	}
9180 
9181 	if (!CPU_COUNT_S(cpu_allowed_setsize, cpu_allowed_set))
9182 		err(-ENODEV, "No valid cpus found");
9183 	sched_setaffinity(0, cpu_allowed_setsize, cpu_allowed_set);
9184 
9185 	/*
9186 	 * Allocate and initialize cpu_affinity_set
9187 	 */
9188 	cpu_affinity_set = CPU_ALLOC((topo.max_cpu_num + 1));
9189 	if (cpu_affinity_set == NULL)
9190 		err(3, "CPU_ALLOC");
9191 	cpu_affinity_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
9192 	CPU_ZERO_S(cpu_affinity_setsize, cpu_affinity_set);
9193 
9194 	for_all_proc_cpus(init_thread_id);
9195 
9196 	for_all_proc_cpus(set_cpu_hybrid_type);
9197 
9198 	/*
9199 	 * For online cpus
9200 	 * find max_core_id, max_package_id
9201 	 */
9202 	for (i = 0; i <= topo.max_cpu_num; ++i) {
9203 		int siblings;
9204 
9205 		if (cpu_is_not_present(i)) {
9206 			if (debug > 1)
9207 				fprintf(outf, "cpu%d NOT PRESENT\n", i);
9208 			continue;
9209 		}
9210 
9211 		cpus[i].logical_cpu_id = i;
9212 
9213 		/* get package information */
9214 		cpus[i].physical_package_id = get_physical_package_id(i);
9215 		if (cpus[i].physical_package_id > max_package_id)
9216 			max_package_id = cpus[i].physical_package_id;
9217 
9218 		/* get die information */
9219 		cpus[i].die_id = get_die_id(i);
9220 		if (cpus[i].die_id > topo.max_die_id)
9221 			topo.max_die_id = cpus[i].die_id;
9222 
9223 		/* get l3 information */
9224 		cpus[i].l3_id = get_l3_id(i);
9225 		if (cpus[i].l3_id > topo.max_l3_id)
9226 			topo.max_l3_id = cpus[i].l3_id;
9227 
9228 		/* get numa node information */
9229 		cpus[i].physical_node_id = get_physical_node_id(&cpus[i]);
9230 		if (cpus[i].physical_node_id > topo.max_node_num)
9231 			topo.max_node_num = cpus[i].physical_node_id;
9232 
9233 		/* get core information */
9234 		cpus[i].physical_core_id = get_core_id(i);
9235 		if (cpus[i].physical_core_id > max_core_id)
9236 			max_core_id = cpus[i].physical_core_id;
9237 
9238 		/* get thread information */
9239 		siblings = get_thread_siblings(&cpus[i]);
9240 		if (siblings > max_siblings)
9241 			max_siblings = siblings;
9242 		if (cpus[i].thread_id == 0)
9243 			topo.num_cores++;
9244 	}
9245 	topo.max_core_id = max_core_id;
9246 	topo.max_package_id = max_package_id;
9247 
9248 	topo.cores_per_node = max_core_id + 1;
9249 	if (debug > 1)
9250 		fprintf(outf, "max_core_id %d, sizing for %d cores per package\n", max_core_id, topo.cores_per_node);
9251 	if (!summary_only)
9252 		BIC_PRESENT(BIC_Core);
9253 
9254 	topo.num_die = topo.max_die_id + 1;
9255 	if (debug > 1)
9256 		fprintf(outf, "max_die_id %d, sizing for %d die\n", topo.max_die_id, topo.num_die);
9257 	if (!summary_only && topo.num_die > 1)
9258 		BIC_PRESENT(BIC_Die);
9259 
9260 	if (!summary_only && topo.max_l3_id > 0)
9261 		BIC_PRESENT(BIC_L3);
9262 
9263 	topo.num_packages = max_package_id + 1;
9264 	if (debug > 1)
9265 		fprintf(outf, "max_package_id %d, sizing for %d packages\n", max_package_id, topo.num_packages);
9266 	if (!summary_only && topo.num_packages > 1)
9267 		BIC_PRESENT(BIC_Package);
9268 
9269 	set_node_data();
9270 	if (debug > 1)
9271 		fprintf(outf, "nodes_per_pkg %d\n", topo.nodes_per_pkg);
9272 	if (!summary_only && topo.nodes_per_pkg > 1)
9273 		BIC_PRESENT(BIC_Node);
9274 
9275 	topo.threads_per_core = max_siblings;
9276 	if (debug > 1)
9277 		fprintf(outf, "max_siblings %d\n", max_siblings);
9278 
9279 	if (debug < 1)
9280 		return;
9281 
9282 	for (i = 0; i <= topo.max_cpu_num; ++i) {
9283 		if (cpu_is_not_present(i))
9284 			continue;
9285 		fprintf(outf,
9286 			"cpu %d pkg %d die %d l3 %d node %d lnode %d core %d thread %d\n",
9287 			i, cpus[i].physical_package_id, cpus[i].die_id, cpus[i].l3_id,
9288 			cpus[i].physical_node_id, cpus[i].logical_node_id, cpus[i].physical_core_id, cpus[i].thread_id);
9289 	}
9290 
9291 }
9292 
9293 void allocate_counters(struct thread_data **t, struct core_data **c, struct pkg_data **p)
9294 {
9295 	int i;
9296 	int num_cores = topo.cores_per_node * topo.nodes_per_pkg * topo.num_packages;
9297 	int num_threads = topo.threads_per_core * num_cores;
9298 
9299 	*t = calloc(num_threads, sizeof(struct thread_data));
9300 	if (*t == NULL)
9301 		goto error;
9302 
9303 	for (i = 0; i < num_threads; i++)
9304 		(*t)[i].cpu_id = -1;
9305 
9306 	*c = calloc(num_cores, sizeof(struct core_data));
9307 	if (*c == NULL)
9308 		goto error;
9309 
9310 	for (i = 0; i < num_cores; i++) {
9311 		(*c)[i].core_id = -1;
9312 		(*c)[i].base_cpu = -1;
9313 	}
9314 
9315 	*p = calloc(topo.num_packages, sizeof(struct pkg_data));
9316 	if (*p == NULL)
9317 		goto error;
9318 
9319 	for (i = 0; i < topo.num_packages; i++) {
9320 		(*p)[i].package_id = i;
9321 		(*p)[i].base_cpu = -1;
9322 	}
9323 
9324 	return;
9325 error:
9326 	err(1, "calloc counters");
9327 }
9328 
9329 /*
9330  * init_counter()
9331  *
9332  * set FIRST_THREAD_IN_CORE and FIRST_CORE_IN_PACKAGE
9333  */
9334 void init_counter(struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base, int cpu_id)
9335 {
9336 	int pkg_id = cpus[cpu_id].physical_package_id;
9337 	int node_id = cpus[cpu_id].logical_node_id;
9338 	int core_id = cpus[cpu_id].physical_core_id;
9339 	int thread_id = cpus[cpu_id].thread_id;
9340 	struct thread_data *t;
9341 	struct core_data *c;
9342 
9343 	/* Workaround for systems where physical_node_id==-1
9344 	 * and logical_node_id==(-1 - topo.num_cpus)
9345 	 */
9346 	if (node_id < 0)
9347 		node_id = 0;
9348 
9349 	t = GET_THREAD(thread_base, thread_id, core_id, node_id, pkg_id);
9350 	c = GET_CORE(core_base, core_id, node_id, pkg_id);
9351 
9352 	t->cpu_id = cpu_id;
9353 	if (!cpu_is_not_allowed(cpu_id)) {
9354 		if (c->base_cpu < 0)
9355 			c->base_cpu = t->cpu_id;
9356 		if (pkg_base[pkg_id].base_cpu < 0)
9357 			pkg_base[pkg_id].base_cpu = t->cpu_id;
9358 	}
9359 
9360 	c->core_id = core_id;
9361 	pkg_base[pkg_id].package_id = pkg_id;
9362 }
9363 
9364 int initialize_counters(int cpu_id)
9365 {
9366 	init_counter(EVEN_COUNTERS, cpu_id);
9367 	init_counter(ODD_COUNTERS, cpu_id);
9368 	return 0;
9369 }
9370 
9371 void allocate_output_buffer()
9372 {
9373 	output_buffer = calloc(1, (1 + topo.num_cpus) * 2048);
9374 	outp = output_buffer;
9375 	if (outp == NULL)
9376 		err(-1, "calloc output buffer");
9377 }
9378 
9379 void allocate_fd_percpu(void)
9380 {
9381 	fd_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
9382 	if (fd_percpu == NULL)
9383 		err(-1, "calloc fd_percpu");
9384 }
9385 
9386 void allocate_irq_buffers(void)
9387 {
9388 	irq_column_2_cpu = calloc(topo.num_cpus, sizeof(int));
9389 	if (irq_column_2_cpu == NULL)
9390 		err(-1, "calloc %d", topo.num_cpus);
9391 
9392 	irqs_per_cpu = calloc(topo.max_cpu_num + 1, sizeof(int));
9393 	if (irqs_per_cpu == NULL)
9394 		err(-1, "calloc %d IRQ", topo.max_cpu_num + 1);
9395 
9396 	nmi_per_cpu = calloc(topo.max_cpu_num + 1, sizeof(int));
9397 	if (nmi_per_cpu == NULL)
9398 		err(-1, "calloc %d NMI", topo.max_cpu_num + 1);
9399 }
9400 
9401 int update_topo(PER_THREAD_PARAMS)
9402 {
9403 	topo.allowed_cpus++;
9404 	if ((int)t->cpu_id == c->base_cpu)
9405 		topo.allowed_cores++;
9406 	if ((int)t->cpu_id == p->base_cpu)
9407 		topo.allowed_packages++;
9408 
9409 	return 0;
9410 }
9411 
9412 void topology_update(void)
9413 {
9414 	topo.allowed_cpus = 0;
9415 	topo.allowed_cores = 0;
9416 	topo.allowed_packages = 0;
9417 	for_all_cpus(update_topo, ODD_COUNTERS);
9418 }
9419 
9420 void setup_all_buffers(bool startup)
9421 {
9422 	topology_probe(startup);
9423 	allocate_irq_buffers();
9424 	allocate_fd_percpu();
9425 	allocate_counters(&thread_even, &core_even, &package_even);
9426 	allocate_counters(&thread_odd, &core_odd, &package_odd);
9427 	allocate_output_buffer();
9428 	for_all_proc_cpus(initialize_counters);
9429 	topology_update();
9430 }
9431 
9432 void set_base_cpu(void)
9433 {
9434 	int i;
9435 
9436 	for (i = 0; i < topo.max_cpu_num + 1; ++i) {
9437 		if (cpu_is_not_allowed(i))
9438 			continue;
9439 		base_cpu = i;
9440 		if (debug > 1)
9441 			fprintf(outf, "base_cpu = %d\n", base_cpu);
9442 		return;
9443 	}
9444 	err(-ENODEV, "No valid cpus found");
9445 }
9446 
9447 bool has_added_counters(void)
9448 {
9449 	/*
9450 	 * It only makes sense to call this after the command line is parsed,
9451 	 * otherwise sys structure is not populated.
9452 	 */
9453 
9454 	return sys.added_core_counters | sys.added_thread_counters | sys.added_package_counters;
9455 }
9456 
9457 void check_msr_access(void)
9458 {
9459 	check_dev_msr();
9460 	check_msr_permission();
9461 
9462 	if (no_msr)
9463 		bic_disable_msr_access();
9464 }
9465 
9466 void check_perf_access(void)
9467 {
9468 	if (no_perf || !BIC_IS_ENABLED(BIC_IPC) || !has_instr_count_access())
9469 		CLR_BIC(BIC_IPC, &bic_enabled);
9470 }
9471 
/*
 * perf_has_hybrid_devices()
 *
 * Report whether both the "cpu_core" and "cpu_atom" perf event source
 * devices exist in sysfs. The answer is computed on the first call and
 * cached for later calls.
 */
bool perf_has_hybrid_devices(void)
{
	/*
	 *  0: not probed yet
	 *  1: separate perf devices for p and e cores exist
	 * -1: they do not
	 */
	static int cached;

	if (cached == 0) {
		if (access("/sys/bus/event_source/devices/cpu_core", F_OK) == 0 &&
		    access("/sys/bus/event_source/devices/cpu_atom", F_OK) == 0)
			cached = 1;
		else
			cached = -1;
	}

	return cached > 0;
}
9500 
9501 int added_perf_counters_init_(struct perf_counter_info *pinfo)
9502 {
9503 	size_t num_domains = 0;
9504 	unsigned int next_domain;
9505 	bool *domain_visited;
9506 	unsigned int perf_type, perf_config;
9507 	double perf_scale;
9508 	int fd_perf;
9509 
9510 	if (!pinfo)
9511 		return 0;
9512 
9513 	const size_t max_num_domains = MAX(topo.max_cpu_num + 1, MAX(topo.max_core_id + 1, topo.max_package_id + 1));
9514 
9515 	domain_visited = calloc(max_num_domains, sizeof(*domain_visited));
9516 
9517 	while (pinfo) {
9518 		switch (pinfo->scope) {
9519 		case SCOPE_CPU:
9520 			num_domains = topo.max_cpu_num + 1;
9521 			break;
9522 
9523 		case SCOPE_CORE:
9524 			num_domains = topo.max_core_id + 1;
9525 			break;
9526 
9527 		case SCOPE_PACKAGE:
9528 			num_domains = topo.max_package_id + 1;
9529 			break;
9530 		}
9531 
9532 		/* Allocate buffer for file descriptor for each domain. */
9533 		pinfo->fd_perf_per_domain = calloc(num_domains, sizeof(*pinfo->fd_perf_per_domain));
9534 		if (!pinfo->fd_perf_per_domain)
9535 			errx(1, "%s: alloc %s", __func__, "fd_perf_per_domain");
9536 
9537 		for (size_t i = 0; i < num_domains; ++i)
9538 			pinfo->fd_perf_per_domain[i] = -1;
9539 
9540 		pinfo->num_domains = num_domains;
9541 		pinfo->scale = 1.0;
9542 
9543 		memset(domain_visited, 0, max_num_domains * sizeof(*domain_visited));
9544 
9545 		for (int cpu = 0; cpu < topo.max_cpu_num + 1; ++cpu) {
9546 
9547 			next_domain = cpu_to_domain(pinfo, cpu);
9548 
9549 			assert(next_domain < num_domains);
9550 
9551 			if (cpu_is_not_allowed(cpu))
9552 				continue;
9553 
9554 			if (domain_visited[next_domain])
9555 				continue;
9556 
9557 			/*
9558 			 * Intel hybrid platforms expose different perf devices for P and E cores.
9559 			 * Instead of one, "/sys/bus/event_source/devices/cpu" device, there are
9560 			 * "/sys/bus/event_source/devices/{cpu_core,cpu_atom}".
9561 			 *
9562 			 * This makes it more complicated to the user, because most of the counters
9563 			 * are available on both and have to be handled manually, otherwise.
9564 			 *
9565 			 * Code below, allow user to use the old "cpu" name, which is translated accordingly.
9566 			 */
9567 			const char *perf_device = pinfo->device;
9568 
9569 			if (strcmp(perf_device, "cpu") == 0 && perf_has_hybrid_devices()) {
9570 				switch (cpus[cpu].type) {
9571 				case INTEL_PCORE_TYPE:
9572 					perf_device = "cpu_core";
9573 					break;
9574 
9575 				case INTEL_ECORE_TYPE:
9576 					perf_device = "cpu_atom";
9577 					break;
9578 
9579 				default:	/* Don't change, we will probably fail and report a problem soon. */
9580 					break;
9581 				}
9582 			}
9583 
9584 			perf_type = read_perf_type(perf_device);
9585 			if (perf_type == (unsigned int)-1) {
9586 				warnx("%s: perf/%s/%s: failed to read %s", __func__, perf_device, pinfo->event, "type");
9587 				continue;
9588 			}
9589 
9590 			perf_config = read_perf_config(perf_device, pinfo->event);
9591 			if (perf_config == (unsigned int)-1) {
9592 				warnx("%s: perf/%s/%s: failed to read %s",
9593 				      __func__, perf_device, pinfo->event, "config");
9594 				continue;
9595 			}
9596 
9597 			/* Scale is not required, some counters just don't have it. */
9598 			perf_scale = read_perf_scale(perf_device, pinfo->event);
9599 			if (perf_scale == 0.0)
9600 				perf_scale = 1.0;
9601 
9602 			fd_perf = open_perf_counter(cpu, perf_type, perf_config, -1, 0);
9603 			if (fd_perf == -1) {
9604 				warnx("%s: perf/%s/%s: failed to open counter on cpu%d",
9605 				      __func__, perf_device, pinfo->event, cpu);
9606 				continue;
9607 			}
9608 
9609 			domain_visited[next_domain] = 1;
9610 			pinfo->fd_perf_per_domain[next_domain] = fd_perf;
9611 			pinfo->scale = perf_scale;
9612 
9613 			if (debug)
9614 				fprintf(stderr, "Add perf/%s/%s cpu%d: %d\n",
9615 					perf_device, pinfo->event, cpu, pinfo->fd_perf_per_domain[next_domain]);
9616 		}
9617 
9618 		pinfo = pinfo->next;
9619 	}
9620 
9621 	free(domain_visited);
9622 
9623 	return 0;
9624 }
9625 
9626 void added_perf_counters_init(void)
9627 {
9628 	if (added_perf_counters_init_(sys.perf_tp))
9629 		errx(1, "%s: %s", __func__, "thread");
9630 
9631 	if (added_perf_counters_init_(sys.perf_cp))
9632 		errx(1, "%s: %s", __func__, "core");
9633 
9634 	if (added_perf_counters_init_(sys.perf_pp))
9635 		errx(1, "%s: %s", __func__, "package");
9636 }
9637 
/*
 * parse_telem_info_file()
 *
 * Open info_filename relative to the directory fd_dir and parse a single
 * unsigned long from it with the scanf-style conversion in format.
 *
 * Returns 0 and stores the value in *output on success. Returns -1, with
 * *output untouched, if the file cannot be opened or does not parse.
 */
int parse_telem_info_file(int fd_dir, const char *info_filename, const char *format, unsigned long *output)
{
	unsigned long parsed;
	FILE *stream;
	int matched;
	const int fd = openat(fd_dir, info_filename, O_RDONLY);

	if (fd == -1)
		return -1;

	stream = fdopen(fd, "r");
	if (stream == NULL) {
		/* No stream took ownership of the descriptor; close it directly. */
		close(fd);
		return -1;
	}

	matched = fscanf(stream, format, &parsed);

	/* Closing the stream also closes the descriptor under it. */
	fclose(stream);

	if (matched != 1)
		return -1;

	*output = parsed;

	return 0;
}
9665 
9666 struct pmt_mmio *pmt_mmio_open(unsigned int target_guid)
9667 {
9668 	struct pmt_diriter_t pmt_iter;
9669 	const struct dirent *entry;
9670 	struct stat st;
9671 	int fd_telem_dir, fd_pmt;
9672 	unsigned long guid, size, offset;
9673 	size_t mmap_size;
9674 	void *mmio;
9675 	struct pmt_mmio *head = NULL, *last = NULL;
9676 	struct pmt_mmio *new_pmt = NULL;
9677 
9678 	if (stat(SYSFS_TELEM_PATH, &st) == -1)
9679 		return NULL;
9680 
9681 	pmt_diriter_init(&pmt_iter);
9682 	entry = pmt_diriter_begin(&pmt_iter, SYSFS_TELEM_PATH);
9683 	if (!entry) {
9684 		pmt_diriter_remove(&pmt_iter);
9685 		return NULL;
9686 	}
9687 
9688 	for (; entry != NULL; entry = pmt_diriter_next(&pmt_iter)) {
9689 		if (fstatat(dirfd(pmt_iter.dir), entry->d_name, &st, 0) == -1)
9690 			break;
9691 
9692 		if (!S_ISDIR(st.st_mode))
9693 			continue;
9694 
9695 		fd_telem_dir = openat(dirfd(pmt_iter.dir), entry->d_name, O_RDONLY);
9696 		if (fd_telem_dir == -1)
9697 			break;
9698 
9699 		if (parse_telem_info_file(fd_telem_dir, "guid", "%lx", &guid)) {
9700 			close(fd_telem_dir);
9701 			break;
9702 		}
9703 
9704 		if (parse_telem_info_file(fd_telem_dir, "size", "%lu", &size)) {
9705 			close(fd_telem_dir);
9706 			break;
9707 		}
9708 
9709 		if (guid != target_guid) {
9710 			close(fd_telem_dir);
9711 			continue;
9712 		}
9713 
9714 		if (parse_telem_info_file(fd_telem_dir, "offset", "%lu", &offset)) {
9715 			close(fd_telem_dir);
9716 			break;
9717 		}
9718 
9719 		assert(offset == 0);
9720 
9721 		fd_pmt = openat(fd_telem_dir, "telem", O_RDONLY);
9722 		if (fd_pmt == -1)
9723 			goto loop_cleanup_and_break;
9724 
9725 		mmap_size = ROUND_UP_TO_PAGE_SIZE(size);
9726 		mmio = mmap(0, mmap_size, PROT_READ, MAP_SHARED, fd_pmt, 0);
9727 		if (mmio != MAP_FAILED) {
9728 			if (debug)
9729 				fprintf(stderr, "%s: 0x%lx mmaped at: %p\n", __func__, guid, mmio);
9730 
9731 			new_pmt = calloc(1, sizeof(*new_pmt));
9732 
9733 			if (!new_pmt) {
9734 				fprintf(stderr, "%s: Failed to allocate pmt_mmio\n", __func__);
9735 				exit(1);
9736 			}
9737 
9738 			/*
9739 			 * Create linked list of mmaped regions,
9740 			 * but preserve the ordering from sysfs.
9741 			 * Ordering is important for the user to
9742 			 * use the seq=%u parameter when adding a counter.
9743 			 */
9744 			new_pmt->guid = guid;
9745 			new_pmt->mmio_base = mmio;
9746 			new_pmt->pmt_offset = offset;
9747 			new_pmt->size = size;
9748 			new_pmt->next = pmt_mmios;
9749 
9750 			if (last)
9751 				last->next = new_pmt;
9752 			else
9753 				head = new_pmt;
9754 
9755 			last = new_pmt;
9756 		}
9757 
9758 loop_cleanup_and_break:
9759 		close(fd_pmt);
9760 		close(fd_telem_dir);
9761 	}
9762 
9763 	pmt_diriter_remove(&pmt_iter);
9764 
9765 	/*
9766 	 * If we found something, stick just
9767 	 * created linked list to the front.
9768 	 */
9769 	if (head)
9770 		pmt_mmios = head;
9771 
9772 	return head;
9773 }
9774 
9775 struct pmt_mmio *pmt_mmio_find(unsigned int guid)
9776 {
9777 	struct pmt_mmio *pmmio = pmt_mmios;
9778 
9779 	while (pmmio) {
9780 		if (pmmio->guid == guid)
9781 			return pmmio;
9782 
9783 		pmmio = pmmio->next;
9784 	}
9785 
9786 	return NULL;
9787 }
9788 
9789 void *pmt_get_counter_pointer(struct pmt_mmio *pmmio, unsigned long counter_offset)
9790 {
9791 	char *ret;
9792 
9793 	/* Get base of mmaped PMT file. */
9794 	ret = (char *)pmmio->mmio_base;
9795 
9796 	/*
9797 	 * Apply PMT MMIO offset to obtain beginning of the mmaped telemetry data.
9798 	 * It's not guaranteed that the mmaped memory begins with the telemetry data
9799 	 *      - we might have to apply the offset first.
9800 	 */
9801 	ret += pmmio->pmt_offset;
9802 
9803 	/* Apply the counter offset to get the address to the mmaped counter. */
9804 	ret += counter_offset;
9805 
9806 	return ret;
9807 }
9808 
9809 struct pmt_mmio *pmt_add_guid(unsigned int guid, unsigned int seq)
9810 {
9811 	struct pmt_mmio *ret;
9812 
9813 	ret = pmt_mmio_find(guid);
9814 	if (!ret)
9815 		ret = pmt_mmio_open(guid);
9816 
9817 	while (ret && seq) {
9818 		ret = ret->next;
9819 		--seq;
9820 	}
9821 
9822 	return ret;
9823 }
9824 
/* How a failure to open a PMT counter is treated. */
enum pmt_open_mode {
	PMT_OPEN_TRY,		/* Best effort: failing to open is tolerated. */
	PMT_OPEN_REQUIRED,	/* Mandatory: failing to open is fatal. */
};
9829 
9830 struct pmt_counter *pmt_find_counter(struct pmt_counter *pcounter, const char *name)
9831 {
9832 	while (pcounter) {
9833 		if (strcmp(pcounter->name, name) == 0)
9834 			break;
9835 
9836 		pcounter = pcounter->next;
9837 	}
9838 
9839 	return pcounter;
9840 }
9841 
9842 struct pmt_counter **pmt_get_scope_root(enum counter_scope scope)
9843 {
9844 	switch (scope) {
9845 	case SCOPE_CPU:
9846 		return &sys.pmt_tp;
9847 	case SCOPE_CORE:
9848 		return &sys.pmt_cp;
9849 	case SCOPE_PACKAGE:
9850 		return &sys.pmt_pp;
9851 	}
9852 
9853 	__builtin_unreachable();
9854 }
9855 
9856 void pmt_counter_add_domain(struct pmt_counter *pcounter, unsigned long *pmmio, unsigned int domain_id)
9857 {
9858 	/* Make sure the new domain fits. */
9859 	if (domain_id >= pcounter->num_domains)
9860 		pmt_counter_resize(pcounter, domain_id + 1);
9861 
9862 	assert(pcounter->domains);
9863 	assert(domain_id < pcounter->num_domains);
9864 
9865 	pcounter->domains[domain_id].pcounter = pmmio;
9866 }
9867 
9868 int pmt_add_counter(unsigned int guid, unsigned int seq, const char *name, enum pmt_datatype type,
9869 		    unsigned int lsb, unsigned int msb, unsigned int offset, enum counter_scope scope,
9870 		    enum counter_format format, unsigned int domain_id, enum pmt_open_mode mode)
9871 {
9872 	struct pmt_mmio *mmio;
9873 	struct pmt_counter *pcounter;
9874 	struct pmt_counter **const pmt_root = pmt_get_scope_root(scope);
9875 	bool new_counter = false;
9876 	int conflict = 0;
9877 
9878 	if (lsb > msb) {
9879 		fprintf(stderr, "%s: %s: `%s` must be satisfied\n", __func__, "lsb <= msb", name);
9880 		exit(1);
9881 	}
9882 
9883 	if (msb >= 64) {
9884 		fprintf(stderr, "%s: %s: `%s` must be satisfied\n", __func__, "msb < 64", name);
9885 		exit(1);
9886 	}
9887 
9888 	mmio = pmt_add_guid(guid, seq);
9889 	if (!mmio) {
9890 		if (mode != PMT_OPEN_TRY) {
9891 			fprintf(stderr, "%s: failed to map PMT MMIO for guid %x, seq %u\n", __func__, guid, seq);
9892 			exit(1);
9893 		}
9894 
9895 		return 1;
9896 	}
9897 
9898 	if (offset >= mmio->size) {
9899 		if (mode != PMT_OPEN_TRY) {
9900 			fprintf(stderr, "%s: offset %u outside of PMT MMIO size %u\n", __func__, offset, mmio->size);
9901 			exit(1);
9902 		}
9903 
9904 		return 1;
9905 	}
9906 
9907 	pcounter = pmt_find_counter(*pmt_root, name);
9908 	if (!pcounter) {
9909 		pcounter = calloc(1, sizeof(*pcounter));
9910 		new_counter = true;
9911 	}
9912 
9913 	if (new_counter) {
9914 		strncpy(pcounter->name, name, ARRAY_SIZE(pcounter->name) - 1);
9915 		pcounter->type = type;
9916 		pcounter->scope = scope;
9917 		pcounter->lsb = lsb;
9918 		pcounter->msb = msb;
9919 		pcounter->format = format;
9920 	} else {
9921 		conflict += pcounter->type != type;
9922 		conflict += pcounter->scope != scope;
9923 		conflict += pcounter->lsb != lsb;
9924 		conflict += pcounter->msb != msb;
9925 		conflict += pcounter->format != format;
9926 	}
9927 
9928 	if (conflict) {
9929 		fprintf(stderr, "%s: conflicting parameters for the PMT counter with the same name %s\n",
9930 			__func__, name);
9931 		exit(1);
9932 	}
9933 
9934 	pmt_counter_add_domain(pcounter, pmt_get_counter_pointer(mmio, offset), domain_id);
9935 
9936 	if (new_counter) {
9937 		pcounter->next = *pmt_root;
9938 		*pmt_root = pcounter;
9939 	}
9940 
9941 	return 0;
9942 }
9943 
9944 void pmt_init(void)
9945 {
9946 	int cpu_num;
9947 	unsigned long seq, offset, mod_num;
9948 
9949 	if (BIC_IS_ENABLED(BIC_Diec6)) {
9950 		pmt_add_counter(PMT_MTL_DC6_GUID, PMT_MTL_DC6_SEQ, "Die%c6", PMT_TYPE_XTAL_TIME,
9951 				PMT_COUNTER_MTL_DC6_LSB, PMT_COUNTER_MTL_DC6_MSB, PMT_COUNTER_MTL_DC6_OFFSET,
9952 				SCOPE_PACKAGE, FORMAT_DELTA, 0, PMT_OPEN_TRY);
9953 	}
9954 
9955 	if (BIC_IS_ENABLED(BIC_CPU_c1e)) {
9956 		seq = 0;
9957 		offset = PMT_COUNTER_CWF_MC1E_OFFSET_BASE;
9958 		mod_num = 0;	/* Relative module number for current PMT file. */
9959 
9960 		/* Open the counter for each CPU. */
9961 		for (cpu_num = 0; cpu_num < topo.max_cpu_num;) {
9962 
9963 			if (cpu_is_not_allowed(cpu_num))
9964 				goto next_loop_iter;
9965 
9966 			/*
9967 			 * Set the scope to CPU, even though CWF report the counter per module.
9968 			 * CPUs inside the same module will read from the same location, instead of reporting zeros.
9969 			 *
9970 			 * CWF with newer firmware might require a PMT_TYPE_XTAL_TIME intead of PMT_TYPE_TCORE_CLOCK.
9971 			 */
9972 			pmt_add_counter(PMT_CWF_MC1E_GUID, seq, "CPU%c1e", PMT_TYPE_TCORE_CLOCK,
9973 					PMT_COUNTER_CWF_MC1E_LSB, PMT_COUNTER_CWF_MC1E_MSB, offset, SCOPE_CPU,
9974 					FORMAT_DELTA, cpu_num, PMT_OPEN_TRY);
9975 
9976 			/*
9977 			 * Rather complex logic for each time we go to the next loop iteration,
9978 			 * so keep it as a label.
9979 			 */
9980 next_loop_iter:
9981 			/*
9982 			 * Advance the cpu number and check if we should also advance offset to
9983 			 * the next counter inside the PMT file.
9984 			 *
9985 			 * On Clearwater Forest platform, the counter is reported per module,
9986 			 * so open the same counter for all of the CPUs inside the module.
9987 			 * That way, reported table show the correct value for all of the CPUs inside the module,
9988 			 * instead of zeros.
9989 			 */
9990 			++cpu_num;
9991 			if (cpu_num % PMT_COUNTER_CWF_CPUS_PER_MODULE == 0) {
9992 				offset += PMT_COUNTER_CWF_MC1E_OFFSET_INCREMENT;
9993 				++mod_num;
9994 			}
9995 
9996 			/*
9997 			 * There are PMT_COUNTER_CWF_MC1E_NUM_MODULES_PER_FILE in each PMT file.
9998 			 *
9999 			 * If that number is reached, seq must be incremented to advance to the next file in a sequence.
10000 			 * Offset inside that file and a module counter has to be reset.
10001 			 */
10002 			if (mod_num == PMT_COUNTER_CWF_MC1E_NUM_MODULES_PER_FILE) {
10003 				++seq;
10004 				offset = PMT_COUNTER_CWF_MC1E_OFFSET_BASE;
10005 				mod_num = 0;
10006 			}
10007 		}
10008 	}
10009 }
10010 
10011 void turbostat_init()
10012 {
10013 	setup_all_buffers(true);
10014 	set_base_cpu();
10015 	check_msr_access();
10016 	check_perf_access();
10017 	process_cpuid();
10018 	counter_info_init();
10019 	probe_pm_features();
10020 	msr_perf_init();
10021 	linux_perf_init();
10022 	rapl_perf_init();
10023 	cstate_perf_init();
10024 	added_perf_counters_init();
10025 	pmt_init();
10026 
10027 	for_all_cpus(get_cpu_type, ODD_COUNTERS);
10028 	for_all_cpus(get_cpu_type, EVEN_COUNTERS);
10029 
10030 	if (BIC_IS_ENABLED(BIC_IPC) && has_aperf_access && get_instr_count_fd(base_cpu) != -1)
10031 		BIC_PRESENT(BIC_IPC);
10032 
10033 	/*
10034 	 * If TSC tweak is needed, but couldn't get it,
10035 	 * disable more BICs, since it can't be reported accurately.
10036 	 */
10037 	if (platform->enable_tsc_tweak && !has_base_hz) {
10038 		CLR_BIC(BIC_Busy, &bic_enabled);
10039 		CLR_BIC(BIC_Bzy_MHz, &bic_enabled);
10040 	}
10041 }
10042 
10043 void affinitize_child(void)
10044 {
10045 	/* Prefer cpu_possible_set, if available */
10046 	if (sched_setaffinity(0, cpu_possible_setsize, cpu_possible_set)) {
10047 		warn("sched_setaffinity cpu_possible_set");
10048 
10049 		/* Otherwise, allow child to run on same cpu set as turbostat */
10050 		if (sched_setaffinity(0, cpu_allowed_setsize, cpu_allowed_set))
10051 			warn("sched_setaffinity cpu_allowed_set");
10052 	}
10053 }
10054 
10055 int fork_it(char **argv)
10056 {
10057 	pid_t child_pid;
10058 	int status;
10059 
10060 	snapshot_proc_sysfs_files();
10061 	status = for_all_cpus(get_counters, EVEN_COUNTERS);
10062 	first_counter_read = 0;
10063 	if (status)
10064 		exit(status);
10065 	gettimeofday(&tv_even, (struct timezone *)NULL);
10066 
10067 	child_pid = fork();
10068 	if (!child_pid) {
10069 		/* child */
10070 		affinitize_child();
10071 		execvp(argv[0], argv);
10072 		err(errno, "exec %s", argv[0]);
10073 	} else {
10074 
10075 		/* parent */
10076 		if (child_pid == -1)
10077 			err(1, "fork");
10078 
10079 		signal(SIGINT, SIG_IGN);
10080 		signal(SIGQUIT, SIG_IGN);
10081 		if (waitpid(child_pid, &status, 0) == -1)
10082 			err(status, "waitpid");
10083 
10084 		if (WIFEXITED(status))
10085 			status = WEXITSTATUS(status);
10086 	}
10087 	/*
10088 	 * n.b. fork_it() does not check for errors from for_all_cpus()
10089 	 * because re-starting is problematic when forking
10090 	 */
10091 	snapshot_proc_sysfs_files();
10092 	for_all_cpus(get_counters, ODD_COUNTERS);
10093 	gettimeofday(&tv_odd, (struct timezone *)NULL);
10094 	timersub(&tv_odd, &tv_even, &tv_delta);
10095 	if (for_all_cpus_2(delta_cpu, ODD_COUNTERS, EVEN_COUNTERS))
10096 		fprintf(outf, "%s: Counter reset detected\n", progname);
10097 	delta_platform(&platform_counters_odd, &platform_counters_even);
10098 
10099 	compute_average(EVEN_COUNTERS);
10100 	format_all_counters(EVEN_COUNTERS);
10101 
10102 	fprintf(outf, "%.6f sec\n", tv_delta.tv_sec + tv_delta.tv_usec / 1000000.0);
10103 
10104 	flush_output_stderr();
10105 
10106 	return status;
10107 }
10108 
10109 int get_and_dump_counters(void)
10110 {
10111 	int status;
10112 
10113 	snapshot_proc_sysfs_files();
10114 	status = for_all_cpus(get_counters, ODD_COUNTERS);
10115 	if (status)
10116 		return status;
10117 
10118 	status = for_all_cpus(dump_counters, ODD_COUNTERS);
10119 	if (status)
10120 		return status;
10121 
10122 	flush_output_stdout();
10123 
10124 	return status;
10125 }
10126 
10127 void print_version()
10128 {
10129 	fprintf(outf, "turbostat version 2025.09.09 - Len Brown <lenb@kernel.org>\n");
10130 }
10131 
10132 #define COMMAND_LINE_SIZE 2048
10133 
10134 void print_bootcmd(void)
10135 {
10136 	char bootcmd[COMMAND_LINE_SIZE];
10137 	FILE *fp;
10138 	int ret;
10139 
10140 	memset(bootcmd, 0, COMMAND_LINE_SIZE);
10141 	fp = fopen("/proc/cmdline", "r");
10142 	if (!fp)
10143 		return;
10144 
10145 	ret = fread(bootcmd, sizeof(char), COMMAND_LINE_SIZE - 1, fp);
10146 	if (ret) {
10147 		bootcmd[ret] = '\0';
10148 		/* the last character is already '\n' */
10149 		fprintf(outf, "Kernel command line: %s", bootcmd);
10150 	}
10151 
10152 	fclose(fp);
10153 }
10154 
10155 struct msr_counter *find_msrp_by_name(struct msr_counter *head, char *name)
10156 {
10157 	struct msr_counter *mp;
10158 
10159 	for (mp = head; mp; mp = mp->next) {
10160 		if (debug)
10161 			fprintf(stderr, "%s: %s %s\n", __func__, name, mp->name);
10162 		if (!strcmp(name, mp->name))
10163 			return mp;
10164 	}
10165 	return NULL;
10166 }
10167 
10168 int add_counter(unsigned int msr_num, char *path, char *name,
10169 		unsigned int width, enum counter_scope scope,
10170 		enum counter_type type, enum counter_format format, int flags, int id)
10171 {
10172 	struct msr_counter *msrp;
10173 
10174 	if (no_msr && msr_num)
10175 		errx(1, "Requested MSR counter 0x%x, but in --no-msr mode", msr_num);
10176 
10177 	if (debug)
10178 		fprintf(stderr, "%s(msr%d, %s, %s, width%d, scope%d, type%d, format%d, flags%x, id%d)\n",
10179 			__func__, msr_num, path, name, width, scope, type, format, flags, id);
10180 
10181 	switch (scope) {
10182 
10183 	case SCOPE_CPU:
10184 		msrp = find_msrp_by_name(sys.tp, name);
10185 		if (msrp) {
10186 			if (debug)
10187 				fprintf(stderr, "%s: %s FOUND\n", __func__, name);
10188 			break;
10189 		}
10190 		if (sys.added_thread_counters++ >= MAX_ADDED_THREAD_COUNTERS) {
10191 			warnx("ignoring thread counter %s", name);
10192 			return -1;
10193 		}
10194 		break;
10195 	case SCOPE_CORE:
10196 		msrp = find_msrp_by_name(sys.cp, name);
10197 		if (msrp) {
10198 			if (debug)
10199 				fprintf(stderr, "%s: %s FOUND\n", __func__, name);
10200 			break;
10201 		}
10202 		if (sys.added_core_counters++ >= MAX_ADDED_CORE_COUNTERS) {
10203 			warnx("ignoring core counter %s", name);
10204 			return -1;
10205 		}
10206 		break;
10207 	case SCOPE_PACKAGE:
10208 		msrp = find_msrp_by_name(sys.pp, name);
10209 		if (msrp) {
10210 			if (debug)
10211 				fprintf(stderr, "%s: %s FOUND\n", __func__, name);
10212 			break;
10213 		}
10214 		if (sys.added_package_counters++ >= MAX_ADDED_PACKAGE_COUNTERS) {
10215 			warnx("ignoring package counter %s", name);
10216 			return -1;
10217 		}
10218 		break;
10219 	default:
10220 		warnx("ignoring counter %s with unknown scope", name);
10221 		return -1;
10222 	}
10223 
10224 	if (msrp == NULL) {
10225 		msrp = calloc(1, sizeof(struct msr_counter));
10226 		if (msrp == NULL)
10227 			err(-1, "calloc msr_counter");
10228 
10229 		msrp->msr_num = msr_num;
10230 		strncpy(msrp->name, name, NAME_BYTES - 1);
10231 		msrp->width = width;
10232 		msrp->type = type;
10233 		msrp->format = format;
10234 		msrp->flags = flags;
10235 
10236 		switch (scope) {
10237 		case SCOPE_CPU:
10238 			msrp->next = sys.tp;
10239 			sys.tp = msrp;
10240 			break;
10241 		case SCOPE_CORE:
10242 			msrp->next = sys.cp;
10243 			sys.cp = msrp;
10244 			break;
10245 		case SCOPE_PACKAGE:
10246 			msrp->next = sys.pp;
10247 			sys.pp = msrp;
10248 			break;
10249 		}
10250 	}
10251 
10252 	if (path) {
10253 		struct sysfs_path *sp;
10254 
10255 		sp = calloc(1, sizeof(struct sysfs_path));
10256 		if (sp == NULL) {
10257 			perror("calloc");
10258 			exit(1);
10259 		}
10260 		strncpy(sp->path, path, PATH_BYTES - 1);
10261 		sp->id = id;
10262 		sp->next = msrp->sp;
10263 		msrp->sp = sp;
10264 	}
10265 
10266 	return 0;
10267 }
10268 
10269 /*
10270  * Initialize the fields used for identifying and opening the counter.
10271  *
10272  * Defer the initialization of any runtime buffers for actually reading
10273  * the counters for when we initialize all perf counters, so we can later
10274  * easily call re_initialize().
10275  */
10276 struct perf_counter_info *make_perf_counter_info(const char *perf_device,
10277 						 const char *perf_event,
10278 						 const char *name,
10279 						 unsigned int width,
10280 						 enum counter_scope scope,
10281 						 enum counter_type type, enum counter_format format)
10282 {
10283 	struct perf_counter_info *pinfo;
10284 
10285 	pinfo = calloc(1, sizeof(*pinfo));
10286 	if (!pinfo)
10287 		errx(1, "%s: Failed to allocate %s/%s\n", __func__, perf_device, perf_event);
10288 
10289 	strncpy(pinfo->device, perf_device, ARRAY_SIZE(pinfo->device) - 1);
10290 	strncpy(pinfo->event, perf_event, ARRAY_SIZE(pinfo->event) - 1);
10291 
10292 	strncpy(pinfo->name, name, ARRAY_SIZE(pinfo->name) - 1);
10293 	pinfo->width = width;
10294 	pinfo->scope = scope;
10295 	pinfo->type = type;
10296 	pinfo->format = format;
10297 
10298 	return pinfo;
10299 }
10300 
10301 int add_perf_counter(const char *perf_device, const char *perf_event, const char *name_buffer, unsigned int width,
10302 		     enum counter_scope scope, enum counter_type type, enum counter_format format)
10303 {
10304 	struct perf_counter_info *pinfo;
10305 
10306 	switch (scope) {
10307 	case SCOPE_CPU:
10308 		if (sys.added_thread_perf_counters >= MAX_ADDED_THREAD_COUNTERS) {
10309 			warnx("ignoring thread counter perf/%s/%s", perf_device, perf_event);
10310 			return -1;
10311 		}
10312 		break;
10313 
10314 	case SCOPE_CORE:
10315 		if (sys.added_core_perf_counters >= MAX_ADDED_CORE_COUNTERS) {
10316 			warnx("ignoring core counter perf/%s/%s", perf_device, perf_event);
10317 			return -1;
10318 		}
10319 		break;
10320 
10321 	case SCOPE_PACKAGE:
10322 		if (sys.added_package_perf_counters >= MAX_ADDED_PACKAGE_COUNTERS) {
10323 			warnx("ignoring package counter perf/%s/%s", perf_device, perf_event);
10324 			return -1;
10325 		}
10326 		break;
10327 	}
10328 
10329 	pinfo = make_perf_counter_info(perf_device, perf_event, name_buffer, width, scope, type, format);
10330 
10331 	if (!pinfo)
10332 		return -1;
10333 
10334 	switch (scope) {
10335 	case SCOPE_CPU:
10336 		pinfo->next = sys.perf_tp;
10337 		sys.perf_tp = pinfo;
10338 		++sys.added_thread_perf_counters;
10339 		break;
10340 
10341 	case SCOPE_CORE:
10342 		pinfo->next = sys.perf_cp;
10343 		sys.perf_cp = pinfo;
10344 		++sys.added_core_perf_counters;
10345 		break;
10346 
10347 	case SCOPE_PACKAGE:
10348 		pinfo->next = sys.perf_pp;
10349 		sys.perf_pp = pinfo;
10350 		++sys.added_package_perf_counters;
10351 		break;
10352 	}
10353 
10354 	// FIXME: we might not have debug here yet
10355 	if (debug)
10356 		fprintf(stderr, "%s: %s/%s, name: %s, scope%d\n",
10357 			__func__, pinfo->device, pinfo->event, pinfo->name, pinfo->scope);
10358 
10359 	return 0;
10360 }
10361 
10362 void parse_add_command_msr(char *add_command)
10363 {
10364 	int msr_num = 0;
10365 	char *path = NULL;
10366 	char perf_device[PERF_DEV_NAME_BYTES] = "";
10367 	char perf_event[PERF_EVT_NAME_BYTES] = "";
10368 	char name_buffer[PERF_NAME_BYTES] = "";
10369 	int width = 64;
10370 	int fail = 0;
10371 	enum counter_scope scope = SCOPE_CPU;
10372 	enum counter_type type = COUNTER_CYCLES;
10373 	enum counter_format format = FORMAT_DELTA;
10374 
10375 	while (add_command) {
10376 
10377 		if (sscanf(add_command, "msr0x%x", &msr_num) == 1)
10378 			goto next;
10379 
10380 		if (sscanf(add_command, "msr%d", &msr_num) == 1)
10381 			goto next;
10382 
10383 		BUILD_BUG_ON(ARRAY_SIZE(perf_device) <= 31);
10384 		BUILD_BUG_ON(ARRAY_SIZE(perf_event) <= 31);
10385 		if (sscanf(add_command, "perf/%31[^/]/%31[^,]", &perf_device[0], &perf_event[0]) == 2)
10386 			goto next;
10387 
10388 		if (*add_command == '/') {
10389 			path = add_command;
10390 			goto next;
10391 		}
10392 
10393 		if (sscanf(add_command, "u%d", &width) == 1) {
10394 			if ((width == 32) || (width == 64))
10395 				goto next;
10396 			width = 64;
10397 		}
10398 		if (!strncmp(add_command, "cpu", strlen("cpu"))) {
10399 			scope = SCOPE_CPU;
10400 			goto next;
10401 		}
10402 		if (!strncmp(add_command, "core", strlen("core"))) {
10403 			scope = SCOPE_CORE;
10404 			goto next;
10405 		}
10406 		if (!strncmp(add_command, "package", strlen("package"))) {
10407 			scope = SCOPE_PACKAGE;
10408 			goto next;
10409 		}
10410 		if (!strncmp(add_command, "cycles", strlen("cycles"))) {
10411 			type = COUNTER_CYCLES;
10412 			goto next;
10413 		}
10414 		if (!strncmp(add_command, "seconds", strlen("seconds"))) {
10415 			type = COUNTER_SECONDS;
10416 			goto next;
10417 		}
10418 		if (!strncmp(add_command, "usec", strlen("usec"))) {
10419 			type = COUNTER_USEC;
10420 			goto next;
10421 		}
10422 		if (!strncmp(add_command, "raw", strlen("raw"))) {
10423 			format = FORMAT_RAW;
10424 			goto next;
10425 		}
10426 		if (!strncmp(add_command, "average", strlen("average"))) {
10427 			format = FORMAT_AVERAGE;
10428 			goto next;
10429 		}
10430 		if (!strncmp(add_command, "delta", strlen("delta"))) {
10431 			format = FORMAT_DELTA;
10432 			goto next;
10433 		}
10434 		if (!strncmp(add_command, "percent", strlen("percent"))) {
10435 			format = FORMAT_PERCENT;
10436 			goto next;
10437 		}
10438 
10439 		BUILD_BUG_ON(ARRAY_SIZE(name_buffer) <= 18);
10440 		if (sscanf(add_command, "%18s,%*s", name_buffer) == 1) {
10441 			char *eos;
10442 
10443 			eos = strchr(name_buffer, ',');
10444 			if (eos)
10445 				*eos = '\0';
10446 			goto next;
10447 		}
10448 
10449 next:
10450 		add_command = strchr(add_command, ',');
10451 		if (add_command) {
10452 			*add_command = '\0';
10453 			add_command++;
10454 		}
10455 
10456 	}
10457 	if ((msr_num == 0) && (path == NULL) && (perf_device[0] == '\0' || perf_event[0] == '\0')) {
10458 		fprintf(stderr, "--add: (msrDDD | msr0xXXX | /path_to_counter | perf/device/event) required\n");
10459 		fail++;
10460 	}
10461 
10462 	/* Test for non-empty perf_device and perf_event */
10463 	const bool is_perf_counter = perf_device[0] && perf_event[0];
10464 
10465 	/* generate default column header */
10466 	if (*name_buffer == '\0') {
10467 		if (is_perf_counter) {
10468 			snprintf(name_buffer, ARRAY_SIZE(name_buffer), "perf/%s", perf_event);
10469 		} else {
10470 			if (width == 32)
10471 				sprintf(name_buffer, "M0x%x%s", msr_num, format == FORMAT_PERCENT ? "%" : "");
10472 			else
10473 				sprintf(name_buffer, "M0X%x%s", msr_num, format == FORMAT_PERCENT ? "%" : "");
10474 		}
10475 	}
10476 
10477 	if (is_perf_counter) {
10478 		if (add_perf_counter(perf_device, perf_event, name_buffer, width, scope, type, format))
10479 			fail++;
10480 	} else {
10481 		if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0, 0))
10482 			fail++;
10483 	}
10484 
10485 	if (fail) {
10486 		help();
10487 		exit(1);
10488 	}
10489 }
10490 
/*
 * Return true when @str begins with @prefix.
 * An empty @prefix matches every string.
 */
bool starts_with(const char *str, const char *prefix)
{
	/* Walk both strings until the prefix is used up or a character differs. */
	while (*prefix != '\0') {
		if (*str != *prefix)
			return false;
		++str;
		++prefix;
	}

	return true;
}
10495 
10496 int pmt_parse_from_path(const char *target_path, unsigned int *out_guid, unsigned int *out_seq)
10497 {
10498 	struct pmt_diriter_t pmt_iter;
10499 	const struct dirent *dirname;
10500 	struct stat stat, target_stat;
10501 	int fd_telem_dir = -1;
10502 	int fd_target_dir;
10503 	unsigned int seq = 0;
10504 	unsigned long guid, target_guid;
10505 	int ret = -1;
10506 
10507 	fd_target_dir = open(target_path, O_RDONLY | O_DIRECTORY);
10508 	if (fd_target_dir == -1) {
10509 		return -1;
10510 	}
10511 
10512 	if (fstat(fd_target_dir, &target_stat) == -1) {
10513 		fprintf(stderr, "%s: Failed to stat the target: %s", __func__, strerror(errno));
10514 		exit(1);
10515 	}
10516 
10517 	if (parse_telem_info_file(fd_target_dir, "guid", "%lx", &target_guid)) {
10518 		fprintf(stderr, "%s: Failed to parse the target guid file: %s", __func__, strerror(errno));
10519 		exit(1);
10520 	}
10521 
10522 	close(fd_target_dir);
10523 
10524 	pmt_diriter_init(&pmt_iter);
10525 
10526 	for (dirname = pmt_diriter_begin(&pmt_iter, SYSFS_TELEM_PATH); dirname != NULL;
10527 	     dirname = pmt_diriter_next(&pmt_iter)) {
10528 
10529 		fd_telem_dir = openat(dirfd(pmt_iter.dir), dirname->d_name, O_RDONLY | O_DIRECTORY);
10530 		if (fd_telem_dir == -1)
10531 			continue;
10532 
10533 		if (parse_telem_info_file(fd_telem_dir, "guid", "%lx", &guid)) {
10534 			fprintf(stderr, "%s: Failed to parse the guid file: %s", __func__, strerror(errno));
10535 			continue;
10536 		}
10537 
10538 		if (fstat(fd_telem_dir, &stat) == -1) {
10539 			fprintf(stderr, "%s: Failed to stat %s directory: %s", __func__,
10540 				dirname->d_name, strerror(errno));
10541 			continue;
10542 		}
10543 
10544 		/*
10545 		 * If reached the same directory as target, exit the loop.
10546 		 * Seq has the correct value now.
10547 		 */
10548 		if (stat.st_dev == target_stat.st_dev && stat.st_ino == target_stat.st_ino) {
10549 			ret = 0;
10550 			break;
10551 		}
10552 
10553 		/*
10554 		 * If reached directory with the same guid,
10555 		 * but it's not the target directory yet,
10556 		 * increment seq and continue the search.
10557 		 */
10558 		if (guid == target_guid)
10559 			++seq;
10560 
10561 		close(fd_telem_dir);
10562 		fd_telem_dir = -1;
10563 	}
10564 
10565 	pmt_diriter_remove(&pmt_iter);
10566 
10567 	if (fd_telem_dir != -1)
10568 		close(fd_telem_dir);
10569 
10570 	if (!ret) {
10571 		*out_guid = target_guid;
10572 		*out_seq = seq;
10573 	}
10574 
10575 	return ret;
10576 }
10577 
10578 void parse_add_command_pmt(char *add_command)
10579 {
10580 	char *name = NULL;
10581 	char *type_name = NULL;
10582 	char *format_name = NULL;
10583 	char *direct_path = NULL;
10584 	static const char direct_path_prefix[] = "path=";
10585 	unsigned int offset;
10586 	unsigned int lsb;
10587 	unsigned int msb;
10588 	unsigned int guid;
10589 	unsigned int seq = 0;	/* By default, pick first file in a sequence with a given GUID. */
10590 	unsigned int domain_id;
10591 	enum counter_scope scope = 0;
10592 	enum pmt_datatype type = PMT_TYPE_RAW;
10593 	enum counter_format format = FORMAT_RAW;
10594 	bool has_offset = false;
10595 	bool has_lsb = false;
10596 	bool has_msb = false;
10597 	bool has_format = true;	/* Format has a default value. */
10598 	bool has_guid = false;
10599 	bool has_scope = false;
10600 	bool has_type = true;	/* Type has a default value. */
10601 
10602 	/* Consume the "pmt," prefix. */
10603 	add_command = strchr(add_command, ',');
10604 	if (!add_command) {
10605 		help();
10606 		exit(1);
10607 	}
10608 	++add_command;
10609 
10610 	while (add_command) {
10611 		if (starts_with(add_command, "name=")) {
10612 			name = add_command + strlen("name=");
10613 			goto next;
10614 		}
10615 
10616 		if (starts_with(add_command, "type=")) {
10617 			type_name = add_command + strlen("type=");
10618 			goto next;
10619 		}
10620 
10621 		if (starts_with(add_command, "domain=")) {
10622 			const size_t prefix_len = strlen("domain=");
10623 
10624 			if (sscanf(add_command + prefix_len, "cpu%u", &domain_id) == 1) {
10625 				scope = SCOPE_CPU;
10626 				has_scope = true;
10627 			} else if (sscanf(add_command + prefix_len, "core%u", &domain_id) == 1) {
10628 				scope = SCOPE_CORE;
10629 				has_scope = true;
10630 			} else if (sscanf(add_command + prefix_len, "package%u", &domain_id) == 1) {
10631 				scope = SCOPE_PACKAGE;
10632 				has_scope = true;
10633 			}
10634 
10635 			if (!has_scope) {
10636 				printf("%s: invalid value for scope. Expected cpu%%u, core%%u or package%%u.\n",
10637 				       __func__);
10638 				exit(1);
10639 			}
10640 
10641 			goto next;
10642 		}
10643 
10644 		if (starts_with(add_command, "format=")) {
10645 			format_name = add_command + strlen("format=");
10646 			goto next;
10647 		}
10648 
10649 		if (sscanf(add_command, "offset=%u", &offset) == 1) {
10650 			has_offset = true;
10651 			goto next;
10652 		}
10653 
10654 		if (sscanf(add_command, "lsb=%u", &lsb) == 1) {
10655 			has_lsb = true;
10656 			goto next;
10657 		}
10658 
10659 		if (sscanf(add_command, "msb=%u", &msb) == 1) {
10660 			has_msb = true;
10661 			goto next;
10662 		}
10663 
10664 		if (sscanf(add_command, "guid=%x", &guid) == 1) {
10665 			has_guid = true;
10666 			goto next;
10667 		}
10668 
10669 		if (sscanf(add_command, "seq=%x", &seq) == 1)
10670 			goto next;
10671 
10672 		if (strncmp(add_command, direct_path_prefix, strlen(direct_path_prefix)) == 0) {
10673 			direct_path = add_command + strlen(direct_path_prefix);
10674 			goto next;
10675 		}
10676 next:
10677 		add_command = strchr(add_command, ',');
10678 		if (add_command) {
10679 			*add_command = '\0';
10680 			add_command++;
10681 		}
10682 	}
10683 
10684 	if (!name) {
10685 		printf("%s: missing %s\n", __func__, "name");
10686 		exit(1);
10687 	}
10688 
10689 	if (strlen(name) >= PMT_COUNTER_NAME_SIZE_BYTES) {
10690 		printf("%s: name has to be at most %d characters long\n", __func__, PMT_COUNTER_NAME_SIZE_BYTES);
10691 		exit(1);
10692 	}
10693 
10694 	if (format_name) {
10695 		has_format = false;
10696 
10697 		if (strcmp("raw", format_name) == 0) {
10698 			format = FORMAT_RAW;
10699 			has_format = true;
10700 		}
10701 
10702 		if (strcmp("average", format_name) == 0) {
10703 			format = FORMAT_AVERAGE;
10704 			has_format = true;
10705 		}
10706 
10707 		if (strcmp("delta", format_name) == 0) {
10708 			format = FORMAT_DELTA;
10709 			has_format = true;
10710 		}
10711 
10712 		if (!has_format) {
10713 			fprintf(stderr, "%s: Invalid format %s. Expected raw, average or delta\n",
10714 				__func__, format_name);
10715 			exit(1);
10716 		}
10717 	}
10718 
10719 	if (type_name) {
10720 		has_type = false;
10721 
10722 		if (strcmp("raw", type_name) == 0) {
10723 			type = PMT_TYPE_RAW;
10724 			has_type = true;
10725 		}
10726 
10727 		if (strcmp("txtal_time", type_name) == 0) {
10728 			type = PMT_TYPE_XTAL_TIME;
10729 			has_type = true;
10730 		}
10731 
10732 		if (strcmp("tcore_clock", type_name) == 0) {
10733 			type = PMT_TYPE_TCORE_CLOCK;
10734 			has_type = true;
10735 		}
10736 
10737 		if (!has_type) {
10738 			printf("%s: invalid %s: %s\n", __func__, "type", type_name);
10739 			exit(1);
10740 		}
10741 	}
10742 
10743 	if (!has_offset) {
10744 		printf("%s : missing %s\n", __func__, "offset");
10745 		exit(1);
10746 	}
10747 
10748 	if (!has_lsb) {
10749 		printf("%s: missing %s\n", __func__, "lsb");
10750 		exit(1);
10751 	}
10752 
10753 	if (!has_msb) {
10754 		printf("%s: missing %s\n", __func__, "msb");
10755 		exit(1);
10756 	}
10757 
10758 	if (direct_path && has_guid) {
10759 		printf("%s: path and guid+seq parameters are mutually exclusive\n"
10760 		       "notice: passed guid=0x%x and path=%s\n", __func__, guid, direct_path);
10761 		exit(1);
10762 	}
10763 
10764 	if (direct_path) {
10765 		if (pmt_parse_from_path(direct_path, &guid, &seq)) {
10766 			printf("%s: failed to parse PMT file from %s\n", __func__, direct_path);
10767 			exit(1);
10768 		}
10769 
10770 		/* GUID was just infered from the direct path. */
10771 		has_guid = true;
10772 	}
10773 
10774 	if (!has_guid) {
10775 		printf("%s: missing %s\n", __func__, "guid or path");
10776 		exit(1);
10777 	}
10778 
10779 	if (!has_scope) {
10780 		printf("%s: missing %s\n", __func__, "scope");
10781 		exit(1);
10782 	}
10783 
10784 	if (lsb > msb) {
10785 		printf("%s: lsb > msb doesn't make sense\n", __func__);
10786 		exit(1);
10787 	}
10788 
10789 	pmt_add_counter(guid, seq, name, type, lsb, msb, offset, scope, format, domain_id, PMT_OPEN_REQUIRED);
10790 }
10791 
/*
 * Dispatch the argument of "--add": anything starting with "pmt" is parsed
 * as a PMT counter, everything else as an MSR, sysfs-path or perf counter.
 */
void parse_add_command(char *add_command)
{
	/* Plain calls: ISO C does not allow "return expr;" in a void function. */
	if (strncmp(add_command, "pmt", strlen("pmt")) == 0)
		parse_add_command_pmt(add_command);
	else
		parse_add_command_msr(add_command);
}
10798 
10799 int is_deferred_add(char *name)
10800 {
10801 	int i;
10802 
10803 	for (i = 0; i < deferred_add_index; ++i)
10804 		if (!strcmp(name, deferred_add_names[i])) {
10805 			deferred_add_consumed |= (1 << i);
10806 			return 1;
10807 		}
10808 	return 0;
10809 }
10810 
10811 int is_deferred_skip(char *name)
10812 {
10813 	int i;
10814 
10815 	for (i = 0; i < deferred_skip_index; ++i)
10816 		if (!strcmp(name, deferred_skip_names[i])) {
10817 			deferred_skip_consumed |= (1 << i);
10818 			return 1;
10819 		}
10820 	return 0;
10821 }
10822 
10823 void verify_deferred_consumed(void)
10824 {
10825 	int i;
10826 	int fail = 0;
10827 
10828 	for (i = 0; i < deferred_add_index; ++i) {
10829 		if (!(deferred_add_consumed & (1 << i))) {
10830 			warnx("Counter '%s' can not be added.", deferred_add_names[i]);
10831 			fail++;
10832 		}
10833 	}
10834 	for (i = 0; i < deferred_skip_index; ++i) {
10835 		if (!(deferred_skip_consumed & (1 << i))) {
10836 			warnx("Counter '%s' can not be skipped.", deferred_skip_names[i]);
10837 			fail++;
10838 		}
10839 	}
10840 	if (fail)
10841 		exit(-EINVAL);
10842 }
10843 
10844 void probe_cpuidle_residency(void)
10845 {
10846 	char path[64];
10847 	char name_buf[16];
10848 	FILE *input;
10849 	int state;
10850 	int min_state = 1024, max_state = 0;
10851 	char *sp;
10852 
10853 	for (state = 10; state >= 0; --state) {
10854 
10855 		sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state);
10856 		input = fopen(path, "r");
10857 		if (input == NULL)
10858 			continue;
10859 		if (!fgets(name_buf, sizeof(name_buf), input))
10860 			err(1, "%s: failed to read file", path);
10861 
10862 		/* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */
10863 		sp = strchr(name_buf, '-');
10864 		if (!sp)
10865 			sp = strchrnul(name_buf, '\n');
10866 		*sp = '%';
10867 		*(sp + 1) = '\0';
10868 
10869 		remove_underbar(name_buf);
10870 
10871 		fclose(input);
10872 
10873 		sprintf(path, "cpuidle/state%d/time", state);
10874 
10875 		if (!DO_BIC(BIC_pct_idle) && !is_deferred_add(name_buf))
10876 			continue;
10877 
10878 		if (is_deferred_skip(name_buf))
10879 			continue;
10880 
10881 		add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_USEC, FORMAT_PERCENT, SYSFS_PERCPU, 0);
10882 
10883 		if (state > max_state)
10884 			max_state = state;
10885 		if (state < min_state)
10886 			min_state = state;
10887 	}
10888 }
10889 
10890 void probe_cpuidle_counts(void)
10891 {
10892 	char path[64];
10893 	char name_buf[16];
10894 	FILE *input;
10895 	int state;
10896 	int min_state = 1024, max_state = 0;
10897 	char *sp;
10898 
10899 	if (!DO_BIC(BIC_cpuidle))
10900 		return;
10901 
10902 	for (state = 10; state >= 0; --state) {
10903 
10904 		sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state);
10905 		input = fopen(path, "r");
10906 		if (input == NULL)
10907 			continue;
10908 		if (!fgets(name_buf, sizeof(name_buf), input))
10909 			err(1, "%s: failed to read file", path);
10910 		fclose(input);
10911 
10912 		remove_underbar(name_buf);
10913 
10914 		if (!DO_BIC(BIC_cpuidle) && !is_deferred_add(name_buf))
10915 			continue;
10916 
10917 		if (is_deferred_skip(name_buf))
10918 			continue;
10919 
10920 		/* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */
10921 		sp = strchr(name_buf, '-');
10922 		if (!sp)
10923 			sp = strchrnul(name_buf, '\n');
10924 
10925 		/*
10926 		 * The 'below' sysfs file always contains 0 for the deepest state (largest index),
10927 		 * do not add it.
10928 		 */
10929 		if (state != max_state) {
10930 			/*
10931 			 * Add 'C1+' for C1, and so on. The 'below' sysfs file always contains 0 for
10932 			 * the last state, so do not add it.
10933 			 */
10934 
10935 			*sp = '+';
10936 			*(sp + 1) = '\0';
10937 			sprintf(path, "cpuidle/state%d/below", state);
10938 			add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_ITEMS, FORMAT_DELTA, SYSFS_PERCPU, 0);
10939 		}
10940 
10941 		*sp = '\0';
10942 		sprintf(path, "cpuidle/state%d/usage", state);
10943 		add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_ITEMS, FORMAT_DELTA, SYSFS_PERCPU, 0);
10944 
10945 		/*
10946 		 * The 'above' sysfs file always contains 0 for the shallowest state (smallest
10947 		 * index), do not add it.
10948 		 */
10949 		if (state != min_state) {
10950 			*sp = '-';
10951 			*(sp + 1) = '\0';
10952 			sprintf(path, "cpuidle/state%d/above", state);
10953 			add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_ITEMS, FORMAT_DELTA, SYSFS_PERCPU, 0);
10954 		}
10955 	}
10956 }
10957 
10958 /*
10959  * parse cpuset with following syntax
10960  * 1,2,4..6,8-10 and set bits in cpu_subset
10961  */
10962 void parse_cpu_command(char *optarg)
10963 {
10964 	if (!strcmp(optarg, "core")) {
10965 		if (cpu_subset)
10966 			goto error;
10967 		show_core_only++;
10968 		return;
10969 	}
10970 	if (!strcmp(optarg, "package")) {
10971 		if (cpu_subset)
10972 			goto error;
10973 		show_pkg_only++;
10974 		return;
10975 	}
10976 	if (show_core_only || show_pkg_only)
10977 		goto error;
10978 
10979 	cpu_subset = CPU_ALLOC(CPU_SUBSET_MAXCPUS);
10980 	if (cpu_subset == NULL)
10981 		err(3, "CPU_ALLOC");
10982 	cpu_subset_size = CPU_ALLOC_SIZE(CPU_SUBSET_MAXCPUS);
10983 
10984 	CPU_ZERO_S(cpu_subset_size, cpu_subset);
10985 
10986 	if (parse_cpu_str(optarg, cpu_subset, cpu_subset_size))
10987 		goto error;
10988 
10989 	return;
10990 
10991 error:
10992 	fprintf(stderr, "\"--cpu %s\" malformed\n", optarg);
10993 	help();
10994 	exit(-1);
10995 }
10996 
10997 void cmdline(int argc, char **argv)
10998 {
10999 	int opt;
11000 	int option_index = 0;
11001 	static struct option long_options[] = {
11002 		{ "add", required_argument, 0, 'a' },
11003 		{ "cpu", required_argument, 0, 'c' },
11004 		{ "Dump", no_argument, 0, 'D' },
11005 		{ "debug", no_argument, 0, 'd' },	/* internal, not documented */
11006 		{ "enable", required_argument, 0, 'e' },
11007 		{ "force", no_argument, 0, 'f' },
11008 		{ "interval", required_argument, 0, 'i' },
11009 		{ "IPC", no_argument, 0, 'I' },
11010 		{ "num_iterations", required_argument, 0, 'n' },
11011 		{ "header_iterations", required_argument, 0, 'N' },
11012 		{ "help", no_argument, 0, 'h' },
11013 		{ "hide", required_argument, 0, 'H' },	// meh, -h taken by --help
11014 		{ "Joules", no_argument, 0, 'J' },
11015 		{ "list", no_argument, 0, 'l' },
11016 		{ "out", required_argument, 0, 'o' },
11017 		{ "quiet", no_argument, 0, 'q' },
11018 		{ "no-msr", no_argument, 0, 'M' },
11019 		{ "no-perf", no_argument, 0, 'P' },
11020 		{ "show", required_argument, 0, 's' },
11021 		{ "Summary", no_argument, 0, 'S' },
11022 		{ "TCC", required_argument, 0, 'T' },
11023 		{ "version", no_argument, 0, 'v' },
11024 		{ 0, 0, 0, 0 }
11025 	};
11026 
11027 	progname = argv[0];
11028 
11029 	/*
11030 	 * Parse some options early, because they may make other options invalid,
11031 	 * like adding the MSR counter with --add and at the same time using --no-msr.
11032 	 */
11033 	while ((opt = getopt_long_only(argc, argv, "+MPn:", long_options, &option_index)) != -1) {
11034 		switch (opt) {
11035 		case 'M':
11036 			no_msr = 1;
11037 			break;
11038 		case 'P':
11039 			no_perf = 1;
11040 			break;
11041 		default:
11042 			break;
11043 		}
11044 	}
11045 	optind = 0;
11046 
11047 	while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qMST:v", long_options, &option_index)) != -1) {
11048 		switch (opt) {
11049 		case 'a':
11050 			parse_add_command(optarg);
11051 			break;
11052 		case 'c':
11053 			parse_cpu_command(optarg);
11054 			break;
11055 		case 'D':
11056 			dump_only++;
11057 			/*
11058 			 * Force the no_perf early to prevent using it as a source.
11059 			 * User asks for raw values, but perf returns them relative
11060 			 * to the opening of the file descriptor.
11061 			 */
11062 			no_perf = 1;
11063 			break;
11064 		case 'e':
11065 			/* --enable specified counter, without clearning existing list */
11066 			bic_lookup(&bic_enabled, optarg, SHOW_LIST);
11067 			break;
11068 		case 'f':
11069 			force_load++;
11070 			break;
11071 		case 'd':
11072 			debug++;
11073 			bic_set_all(&bic_enabled);
11074 			break;
11075 		case 'H':
11076 			/*
11077 			 * --hide: do not show those specified
11078 			 *  multiple invocations simply clear more bits in enabled mask
11079 			 */
11080 			{
11081 				cpu_set_t bic_group_hide;
11082 
11083 				BIC_INIT(&bic_group_hide);
11084 
11085 				bic_lookup(&bic_group_hide, optarg, HIDE_LIST);
11086 				bic_clear_bits(&bic_enabled, &bic_group_hide);
11087 			}
11088 			break;
11089 		case 'h':
11090 		default:
11091 			help();
11092 			exit(1);
11093 		case 'i':
11094 			{
11095 				double interval = strtod(optarg, NULL);
11096 
11097 				if (interval < 0.001) {
11098 					fprintf(outf, "interval %f seconds is too small\n", interval);
11099 					exit(2);
11100 				}
11101 
11102 				interval_tv.tv_sec = interval_ts.tv_sec = interval;
11103 				interval_tv.tv_usec = (interval - interval_tv.tv_sec) * 1000000;
11104 				interval_ts.tv_nsec = (interval - interval_ts.tv_sec) * 1000000000;
11105 			}
11106 			break;
11107 		case 'J':
11108 			rapl_joules++;
11109 			break;
11110 		case 'l':
11111 			bic_set_all(&bic_enabled);
11112 			list_header_only++;
11113 			quiet++;
11114 			break;
11115 		case 'o':
11116 			outf = fopen_or_die(optarg, "w");
11117 			break;
11118 		case 'q':
11119 			quiet = 1;
11120 			break;
11121 		case 'M':
11122 		case 'P':
11123 			/* Parsed earlier */
11124 			break;
11125 		case 'n':
11126 			num_iterations = strtod(optarg, NULL);
11127 
11128 			if (num_iterations <= 0) {
11129 				fprintf(outf, "iterations %d should be positive number\n", num_iterations);
11130 				exit(2);
11131 			}
11132 			break;
11133 		case 'N':
11134 			header_iterations = strtod(optarg, NULL);
11135 
11136 			if (header_iterations <= 0) {
11137 				fprintf(outf, "iterations %d should be positive number\n", header_iterations);
11138 				exit(2);
11139 			}
11140 			break;
11141 		case 's':
11142 			/*
11143 			 * --show: show only those specified
11144 			 *  The 1st invocation will clear and replace the enabled mask
11145 			 *  subsequent invocations can add to it.
11146 			 */
11147 			if (shown == 0)
11148 				BIC_INIT(&bic_enabled);
11149 			bic_lookup(&bic_enabled, optarg, SHOW_LIST);
11150 			shown = 1;
11151 			break;
11152 		case 'S':
11153 			summary_only++;
11154 			break;
11155 		case 'T':
11156 			tj_max_override = atoi(optarg);
11157 			break;
11158 		case 'v':
11159 			print_version();
11160 			exit(0);
11161 			break;
11162 		}
11163 	}
11164 }
11165 
11166 void set_rlimit(void)
11167 {
11168 	struct rlimit limit;
11169 
11170 	if (getrlimit(RLIMIT_NOFILE, &limit) < 0)
11171 		err(1, "Failed to get rlimit");
11172 
11173 	if (limit.rlim_max < MAX_NOFILE)
11174 		limit.rlim_max = MAX_NOFILE;
11175 	if (limit.rlim_cur < MAX_NOFILE)
11176 		limit.rlim_cur = MAX_NOFILE;
11177 
11178 	if (setrlimit(RLIMIT_NOFILE, &limit) < 0)
11179 		err(1, "Failed to set rlimit");
11180 }
11181 
11182 int main(int argc, char **argv)
11183 {
11184 	int fd, ret;
11185 
11186 	bic_groups_init();
11187 
11188 	fd = open("/sys/fs/cgroup/cgroup.procs", O_WRONLY);
11189 	if (fd < 0)
11190 		goto skip_cgroup_setting;
11191 
11192 	ret = write(fd, "0\n", 2);
11193 	if (ret == -1)
11194 		perror("Can't update cgroup\n");
11195 
11196 	close(fd);
11197 
11198 skip_cgroup_setting:
11199 	outf = stderr;
11200 	cmdline(argc, argv);
11201 
11202 	if (!quiet) {
11203 		print_version();
11204 		print_bootcmd();
11205 	}
11206 
11207 	probe_cpuidle_residency();
11208 	probe_cpuidle_counts();
11209 
11210 	verify_deferred_consumed();
11211 
11212 	if (!getuid())
11213 		set_rlimit();
11214 
11215 	turbostat_init();
11216 
11217 	if (!no_msr)
11218 		msr_sum_record();
11219 
11220 	/* dump counters and exit */
11221 	if (dump_only)
11222 		return get_and_dump_counters();
11223 
11224 	/* list header and exit */
11225 	if (list_header_only) {
11226 		print_header(",");
11227 		flush_output_stdout();
11228 		return 0;
11229 	}
11230 
11231 	/*
11232 	 * if any params left, it must be a command to fork
11233 	 */
11234 	if (argc - optind)
11235 		return fork_it(argv + optind);
11236 	else
11237 		turbostat_loop();
11238 
11239 	return 0;
11240 }
11241