xref: /linux/tools/power/x86/turbostat/turbostat.c (revision 001821b0e79716c4e17c71d8e053a23599a7a508)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * turbostat -- show CPU frequency and C-state residency
4  * on modern Intel and AMD processors.
5  *
6  * Copyright (c) 2024 Intel Corporation.
7  * Len Brown <len.brown@intel.com>
8  */
9 
10 #define _GNU_SOURCE
11 #include MSRHEADER
12 #include INTEL_FAMILY_HEADER
13 #include <stdarg.h>
14 #include <stdio.h>
15 #include <err.h>
16 #include <unistd.h>
17 #include <sys/types.h>
18 #include <sys/wait.h>
19 #include <sys/stat.h>
20 #include <sys/select.h>
21 #include <sys/resource.h>
22 #include <fcntl.h>
23 #include <signal.h>
24 #include <sys/time.h>
25 #include <stdlib.h>
26 #include <getopt.h>
27 #include <dirent.h>
28 #include <string.h>
29 #include <ctype.h>
30 #include <sched.h>
31 #include <time.h>
32 #include <cpuid.h>
33 #include <sys/capability.h>
34 #include <errno.h>
35 #include <math.h>
36 #include <linux/perf_event.h>
37 #include <asm/unistd.h>
38 #include <stdbool.h>
39 #include <assert.h>
40 #include <linux/kernel.h>
41 #include <linux/build_bug.h>
42 
/* Silence "unused parameter" warnings for intentionally-ignored arguments */
#define UNUSED(x) (void)(x)

/*
 * This list matches the column headers, except
 * 1. built-in only, the sysfs counters are not here -- we learn of those at run-time
 * 2. Core and CPU are moved to the end, we can't have strings that contain them
 *    matching on them for --show and --hide.
 */

/*
 * buffer size used by sscanf() for added column names
 * Usually truncated to 7 characters, but also handles 18 columns for raw 64-bit counters
 */
#define	NAME_BYTES 20
/* buffer size for sysfs counter paths stored in struct sysfs_path */
#define PATH_BYTES 128

/* NOTE(review): presumably the RLIMIT_NOFILE ceiling requested at startup
 * (one fd may be held open per CPU) -- confirm against setrlimit() caller */
#define MAX_NOFILE 0x8000
60 
/* Granularity at which a counter is collected: per-CPU, per-core, or per-package */
enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE };
/* Interpretation of a counter's raw value (items, cycles, time units, kHz-to-MHz) */
enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC, COUNTER_K2M };
/* Presentation of a counter column (see struct msr_counter.format) */
enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE };
/* Backend used to read APERF/MPERF: perf subsystem or direct MSR access */
enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR };
/* Backend used to read RAPL energy counters */
enum rapl_source { RAPL_SOURCE_NONE, RAPL_SOURCE_PERF, RAPL_SOURCE_MSR };
/* Backend used to read C-state residency counters */
enum cstate_source { CSTATE_SOURCE_NONE, CSTATE_SOURCE_PERF, CSTATE_SOURCE_MSR };
67 
/* Linked-list node holding one sysfs counter file path, tagged with an id */
struct sysfs_path {
	char path[PATH_BYTES];
	int id;
	struct sysfs_path *next;
};

/* Descriptor for one counter column (built-in or added at run-time) */
struct msr_counter {
	unsigned int msr_num;	/* MSR address; 0 for non-MSR counters (see bic[] below) */
	char name[NAME_BYTES];	/* column header text */
	struct sysfs_path *sp;	/* sysfs path list for sysfs-backed counters */
	unsigned int width;	/* output column width */
	enum counter_type type;
	enum counter_format format;
	struct msr_counter *next;
	unsigned int flags;
#define	FLAGS_HIDE	(1 << 0)
#define	FLAGS_SHOW	(1 << 1)
	/* NOTE(review): SYSFS_PERCPU shares bit 1 with FLAGS_SHOW -- the two
	 * appear to be used in separate contexts, but confirm before reusing */
#define	SYSFS_PERCPU	(1 << 1)
};
87 
/*
 * Built-In Counter table: one entry per supported column.
 * Order is significant -- entry N corresponds to bit N of the BIC_*
 * definitions below (e.g. "usec" <-> BIC_USEC, "SAMAMHz" <-> BIC_SAMACTMHz).
 * Most fields are zero here; details are filled in at run-time.
 */
struct msr_counter bic[] = {
	{ 0x0, "usec", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Time_Of_Day_Seconds", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Package", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Node", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Avg_MHz", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Busy%", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Bzy_MHz", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "TSC_MHz", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "IRQ", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "SMI", NULL, 32, 0, FORMAT_DELTA, NULL, 0 },	/* 32-bit delta column */
	{ 0x0, "sysfs", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "CPU%c1", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "CPU%c3", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "CPU%c6", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "CPU%c7", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "ThreadC", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "CoreTmp", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "CoreCnt", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "PkgTmp", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "GFX%rc6", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "GFXMHz", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Pkg%pc2", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Pkg%pc3", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Pkg%pc6", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Pkg%pc7", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Pkg%pc8", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Pkg%pc9", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Pk%pc10", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "CPU%LPI", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "SYS%LPI", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "PkgWatt", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "CorWatt", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "GFXWatt", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "PkgCnt", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "RAMWatt", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "PKG_%", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "RAM_%", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Pkg_J", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Cor_J", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "GFX_J", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "RAM_J", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Mod%c6", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Totl%C0", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Any%C0", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "GFX%C0", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "CPUGFX%", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Core", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "CPU", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "APIC", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "X2APIC", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Die", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "GFXAMHz", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "IPC", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "CoreThr", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "UncMHz", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "SAM%mc6", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "SAMMHz", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "SAMAMHz", NULL, 0, 0, 0, NULL, 0 },
};
148 
/* Number of built-in counters; must equal the number of BIC_* bits below */
#define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter))
/* One bit per bic[] entry, in table order (bit N <-> bic[N]) */
#define	BIC_USEC	(1ULL << 0)
#define	BIC_TOD		(1ULL << 1)
#define	BIC_Package	(1ULL << 2)
#define	BIC_Node	(1ULL << 3)
#define	BIC_Avg_MHz	(1ULL << 4)
#define	BIC_Busy	(1ULL << 5)
#define	BIC_Bzy_MHz	(1ULL << 6)
#define	BIC_TSC_MHz	(1ULL << 7)
#define	BIC_IRQ		(1ULL << 8)
#define	BIC_SMI		(1ULL << 9)
#define	BIC_sysfs	(1ULL << 10)
#define	BIC_CPU_c1	(1ULL << 11)
#define	BIC_CPU_c3	(1ULL << 12)
#define	BIC_CPU_c6	(1ULL << 13)
#define	BIC_CPU_c7	(1ULL << 14)
#define	BIC_ThreadC	(1ULL << 15)
#define	BIC_CoreTmp	(1ULL << 16)
#define	BIC_CoreCnt	(1ULL << 17)
#define	BIC_PkgTmp	(1ULL << 18)
#define	BIC_GFX_rc6	(1ULL << 19)
#define	BIC_GFXMHz	(1ULL << 20)
#define	BIC_Pkgpc2	(1ULL << 21)
#define	BIC_Pkgpc3	(1ULL << 22)
#define	BIC_Pkgpc6	(1ULL << 23)
#define	BIC_Pkgpc7	(1ULL << 24)
#define	BIC_Pkgpc8	(1ULL << 25)
#define	BIC_Pkgpc9	(1ULL << 26)
#define	BIC_Pkgpc10	(1ULL << 27)
#define BIC_CPU_LPI	(1ULL << 28)
#define BIC_SYS_LPI	(1ULL << 29)
#define	BIC_PkgWatt	(1ULL << 30)
#define	BIC_CorWatt	(1ULL << 31)
#define	BIC_GFXWatt	(1ULL << 32)
#define	BIC_PkgCnt	(1ULL << 33)
#define	BIC_RAMWatt	(1ULL << 34)
#define	BIC_PKG__	(1ULL << 35)
#define	BIC_RAM__	(1ULL << 36)
#define	BIC_Pkg_J	(1ULL << 37)
#define	BIC_Cor_J	(1ULL << 38)
#define	BIC_GFX_J	(1ULL << 39)
#define	BIC_RAM_J	(1ULL << 40)
#define	BIC_Mod_c6	(1ULL << 41)
#define	BIC_Totl_c0	(1ULL << 42)
#define	BIC_Any_c0	(1ULL << 43)
#define	BIC_GFX_c0	(1ULL << 44)
#define	BIC_CPUGFX	(1ULL << 45)
#define	BIC_Core	(1ULL << 46)
#define	BIC_CPU		(1ULL << 47)
#define	BIC_APIC	(1ULL << 48)
#define	BIC_X2APIC	(1ULL << 49)
#define	BIC_Die		(1ULL << 50)
#define	BIC_GFXACTMHz	(1ULL << 51)
#define	BIC_IPC		(1ULL << 52)
#define	BIC_CORE_THROT_CNT	(1ULL << 53)
#define	BIC_UNCORE_MHZ		(1ULL << 54)
#define	BIC_SAM_mc6		(1ULL << 55)
#define	BIC_SAMMHz		(1ULL << 56)
#define	BIC_SAMACTMHz		(1ULL << 57)

/* Category masks grouping related columns */
#define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die )
#define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__)
#define BIC_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ)
#define BIC_IDLE (BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6)
#define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC)

#define BIC_DISABLED_BY_DEFAULT	(BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC)

/* Columns the user wants shown: everything except the default-off set */
unsigned long long bic_enabled = (0xFFFFFFFFFFFFFFFFULL & ~BIC_DISABLED_BY_DEFAULT);
/* Columns available on this system; probing marks more via BIC_PRESENT() */
unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC_X2APIC;

/* A column is shown only when it is both enabled and present */
#define DO_BIC(COUNTER_NAME) (bic_enabled & bic_present & COUNTER_NAME)
#define DO_BIC_READ(COUNTER_NAME) (bic_present & COUNTER_NAME)
#define ENABLE_BIC(COUNTER_NAME) (bic_enabled |= COUNTER_NAME)
#define BIC_PRESENT(COUNTER_BIT) (bic_present |= COUNTER_BIT)
#define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT)
#define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & COUNTER_BIT)

/*
 * MSR_PKG_CST_CONFIG_CONTROL decoding for pkg_cstate_limit:
 * If you change the values, note they are used both in comparisons
 * (>= PCL__7) and to index pkg_cstate_limit_strings[].
 */
#define PCLUKN 0		/* Unknown */
#define PCLRSV 1		/* Reserved */
#define PCL__0 2		/* PC0 */
#define PCL__1 3		/* PC1 */
#define PCL__2 4		/* PC2 */
#define PCL__3 5		/* PC3 */
#define PCL__4 6		/* PC4 */
#define PCL__6 7		/* PC6 */
#define PCL_6N 8		/* PC6 No Retention */
#define PCL_6R 9		/* PC6 Retention */
#define PCL__7 10		/* PC7 */
#define PCL_7S 11		/* PC7 Shrink */
#define PCL__8 12		/* PC8 */
#define PCL__9 13		/* PC9 */
#define PCL_10 14		/* PC10 */
#define PCLUNL 15		/* Unlimited */
248 
struct amperf_group_fd;

char *proc_stat = "/proc/stat";	/* procfs statistics file read each interval */
FILE *outf;			/* destination for all program output */
int *fd_percpu;			/* per-CPU fds -- presumably /dev/cpu/N/msr; confirm at open site */
int *fd_instr_count_percpu;	/* per-CPU perf fds for the instruction counter (see do_ipc) */
struct amperf_group_fd *fd_amperf_percpu;	/* File descriptors for perf group with APERF and MPERF counters. */
struct timeval interval_tv = { 5, 0 };	/* default measurement interval: 5 seconds */
struct timespec interval_ts = { 5, 0 };	/* same interval, timespec form */

unsigned int num_iterations;
unsigned int header_iterations;
unsigned int debug;
unsigned int quiet;
unsigned int shown;
unsigned int sums_need_wide_columns;
unsigned int rapl_joules;
unsigned int summary_only;
unsigned int list_header_only;
unsigned int dump_only;
unsigned int has_aperf;
unsigned int has_epb;
unsigned int has_turbo;
unsigned int is_hybrid;
unsigned int units = 1000000;	/* MHz etc */
unsigned int genuine_intel;
unsigned int authentic_amd;
unsigned int hygon_genuine;
unsigned int max_level, max_extended_level;	/* highest CPUID basic/extended leaves -- set during CPUID probing */
unsigned int has_invariant_tsc;
unsigned int aperf_mperf_multiplier = 1;	/* see platform_features.need_perf_multiplier */
double bclk;
double base_hz;
unsigned int has_base_hz;
double tsc_tweak = 1.0;		/* see platform_features.enable_tsc_tweak */
unsigned int show_pkg_only;
unsigned int show_core_only;
char *output_buffer, *outp;	/* output is staged here, outp is the write cursor */
unsigned int do_dts;
unsigned int do_ptm;
unsigned int do_ipc;
unsigned long long cpuidle_cur_cpu_lpi_us;
unsigned long long cpuidle_cur_sys_lpi_us;
unsigned int tj_max;
unsigned int tj_max_override;
double rapl_power_units, rapl_time_units;
double rapl_dram_energy_units, rapl_energy_units;
double rapl_joule_counter_range;
unsigned int crystal_hz;
unsigned long long tsc_hz;
int base_cpu;			/* CPU used for system-wide MSR reads (see slm_bclk) */
unsigned int has_hwp;		/* IA32_PM_ENABLE, IA32_HWP_CAPABILITIES */
			/* IA32_HWP_REQUEST, IA32_HWP_STATUS */
unsigned int has_hwp_notify;	/* IA32_HWP_INTERRUPT */
unsigned int has_hwp_activity_window;	/* IA32_HWP_REQUEST[bits 41:32] */
unsigned int has_hwp_epp;	/* IA32_HWP_REQUEST[bits 31:24] */
unsigned int has_hwp_pkg;	/* IA32_HWP_REQUEST_PKG */
unsigned int first_counter_read = 1;
int ignore_stdin;
bool no_msr;			/* run without direct MSR access */
bool no_perf;			/* run without the perf subsystem */
enum amperf_source amperf_source;

/* Indices into gfx_info[] for the graphics/SA-media sysfs counters */
enum gfx_sysfs_idx {
	GFX_rc6,
	GFX_MHz,
	GFX_ACTMHz,
	SAM_mc6,
	SAM_MHz,
	SAM_ACTMHz,
	GFX_MAX
};

/* One cached sysfs attribute: its path, open handle, and last value read */
struct gfx_sysfs_info {
	const char *path;
	FILE *fp;		/* presumably kept open across samples -- confirm at reader */
	unsigned int val;
	unsigned long long val_ull;
};

static struct gfx_sysfs_info gfx_info[GFX_MAX];

/* Forward declarations; definitions appear later in this file */
int get_msr(int cpu, off_t offset, unsigned long long *msr);
int add_counter(unsigned int msr_num, char *path, char *name,
		unsigned int width, enum counter_scope scope,
		enum counter_type type, enum counter_format format, int flags, int package_num);
335 
/* Model specific support Start */

/* List of features that may diverge among different platforms */
struct platform_features {
	bool has_msr_misc_feature_control;	/* MSR_MISC_FEATURE_CONTROL */
	bool has_msr_misc_pwr_mgmt;	/* MSR_MISC_PWR_MGMT */
	bool has_nhm_msrs;	/* MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, MSR_IA32_POWER_CTL, TRL MSRs */
	bool has_config_tdp;	/* MSR_CONFIG_TDP_NOMINAL/LEVEL_1/LEVEL_2/CONTROL, MSR_TURBO_ACTIVATION_RATIO */
	int bclk_freq;		/* CPU base clock (enum bclk_freq) */
	int crystal_freq;	/* Crystal clock to use when not available from CPUID.15 */
	int supported_cstates;	/* Core cstates and Package cstates supported (enum cstates bits) */
	int cst_limit;		/* MSR_PKG_CST_CONFIG_CONTROL (enum package_cstate_limit) */
	bool has_cst_auto_convension;	/* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */
	bool has_irtl_msrs;	/* MSR_PKGC3/PKGC6/PKGC7/PKGC8/PKGC9/PKGC10_IRTL */
	bool has_msr_core_c1_res;	/* MSR_CORE_C1_RES */
	bool has_msr_module_c6_res_ms;	/* MSR_MODULE_C6_RES_MS */
	bool has_msr_c6_demotion_policy_config;	/* MSR_CC6_DEMOTION_POLICY_CONFIG/MSR_MC6_DEMOTION_POLICY_CONFIG */
	bool has_msr_atom_pkg_c6_residency;	/* MSR_ATOM_PKG_C6_RESIDENCY */
	bool has_msr_knl_core_c6_residency;	/* MSR_KNL_CORE_C6_RESIDENCY */
	bool has_ext_cst_msrs;	/* MSR_PKG_WEIGHTED_CORE_C0_RES/MSR_PKG_ANY_CORE_C0_RES/MSR_PKG_ANY_GFXE_C0_RES/MSR_PKG_BOTH_CORE_GFXE_C0_RES */
	bool has_cst_prewake_bit;	/* Cstate prewake bit in MSR_IA32_POWER_CTL */
	int trl_msrs;		/* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs (enum turbo_ratio_limit_msrs bits) */
	int plr_msrs;		/* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS (enum perf_limit_reason_msrs bits) */
	int rapl_msrs;		/* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs (enum rapl_msrs bits) */
	bool has_per_core_rapl;	/* Indicates cores energy collection is per-core, not per-package. AMD specific for now */
	bool has_rapl_divisor;	/* Divisor for Energy unit raw value from MSR_RAPL_POWER_UNIT */
	bool has_fixed_rapl_unit;	/* Fixed Energy Unit used for DRAM RAPL Domain */
	int rapl_quirk_tdp;	/* Hardcoded TDP value when cannot be retrieved from hardware */
	int tcc_offset_bits;	/* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */
	bool enable_tsc_tweak;	/* Use CPU Base freq instead of TSC freq for aperf/mperf counter */
	bool need_perf_multiplier;	/* mperf/aperf multiplier */
};

/* Binds one CPU model number to its feature table (see turbostat_pdata[]) */
struct platform_data {
	unsigned int model;
	const struct platform_features *features;
};
373 
/* For BCLK */
enum bclk_freq {
	BCLK_100MHZ = 1,
	BCLK_133MHZ,
	BCLK_SLV,		/* Silvermont: BCLK read from MSR_FSB_FREQ, see slm_bclk() */
};

#define SLM_BCLK_FREQS 5
/* BCLK in MHz, indexed by MSR_FSB_FREQ[3:0] (see slm_bclk()) */
double slm_freq_table[SLM_BCLK_FREQS] = { 83.3, 100.0, 133.3, 116.7, 80.0 };
383 
384 double slm_bclk(void)
385 {
386 	unsigned long long msr = 3;
387 	unsigned int i;
388 	double freq;
389 
390 	if (get_msr(base_cpu, MSR_FSB_FREQ, &msr))
391 		fprintf(outf, "SLM BCLK: unknown\n");
392 
393 	i = msr & 0xf;
394 	if (i >= SLM_BCLK_FREQS) {
395 		fprintf(outf, "SLM BCLK[%d] invalid\n", i);
396 		i = 3;
397 	}
398 	freq = slm_freq_table[i];
399 
400 	if (!quiet)
401 		fprintf(outf, "SLM BCLK: %.1f Mhz\n", freq);
402 
403 	return freq;
404 }
405 
/* For Package cstate limit */
/* Per-platform interpretation scheme for MSR_PKG_CST_CONFIG_CONTROL's limit field */
enum package_cstate_limit {
	CST_LIMIT_NHM = 1,
	CST_LIMIT_SNB,
	CST_LIMIT_HSW,
	CST_LIMIT_SKX,
	CST_LIMIT_ICX,
	CST_LIMIT_SLV,
	CST_LIMIT_AMT,
	CST_LIMIT_KNL,
	CST_LIMIT_GMT,
};

/* For Turbo Ratio Limit MSRs (bit flags for platform_features.trl_msrs) */
enum turbo_ratio_limit_msrs {
	TRL_BASE = BIT(0),	/* MSR_TURBO_RATIO_LIMIT */
	TRL_LIMIT1 = BIT(1),	/* MSR_TURBO_RATIO_LIMIT1 */
	TRL_LIMIT2 = BIT(2),	/* MSR_TURBO_RATIO_LIMIT2 */
	TRL_ATOM = BIT(3),	/* Atom TRL MSRs */
	TRL_KNL = BIT(4),	/* KNL TRL MSR */
	TRL_CORECOUNT = BIT(5),	/* TRL encodes per-core-count ratios -- confirm at decoder */
};

/* For Perf Limit Reason MSRs (bit flags for platform_features.plr_msrs) */
enum perf_limit_reason_msrs {
	PLR_CORE = BIT(0),	/* MSR_CORE_PERF_LIMIT_REASONS */
	PLR_GFX = BIT(1),	/* MSR_GFX_PERF_LIMIT_REASONS */
	PLR_RING = BIT(2),	/* MSR_RING_PERF_LIMIT_REASONS */
};
435 
436 /* For RAPL MSRs */
437 enum rapl_msrs {
438 	RAPL_PKG_POWER_LIMIT = BIT(0),	/* 0x610 MSR_PKG_POWER_LIMIT */
439 	RAPL_PKG_ENERGY_STATUS = BIT(1),	/* 0x611 MSR_PKG_ENERGY_STATUS */
440 	RAPL_PKG_PERF_STATUS = BIT(2),	/* 0x613 MSR_PKG_PERF_STATUS */
441 	RAPL_PKG_POWER_INFO = BIT(3),	/* 0x614 MSR_PKG_POWER_INFO */
442 	RAPL_DRAM_POWER_LIMIT = BIT(4),	/* 0x618 MSR_DRAM_POWER_LIMIT */
443 	RAPL_DRAM_ENERGY_STATUS = BIT(5),	/* 0x619 MSR_DRAM_ENERGY_STATUS */
444 	RAPL_DRAM_PERF_STATUS = BIT(6),	/* 0x61b MSR_DRAM_PERF_STATUS */
445 	RAPL_DRAM_POWER_INFO = BIT(7),	/* 0x61c MSR_DRAM_POWER_INFO */
446 	RAPL_CORE_POWER_LIMIT = BIT(8),	/* 0x638 MSR_PP0_POWER_LIMIT */
447 	RAPL_CORE_ENERGY_STATUS = BIT(9),	/* 0x639 MSR_PP0_ENERGY_STATUS */
448 	RAPL_CORE_POLICY = BIT(10),	/* 0x63a MSR_PP0_POLICY */
449 	RAPL_GFX_POWER_LIMIT = BIT(11),	/* 0x640 MSR_PP1_POWER_LIMIT */
450 	RAPL_GFX_ENERGY_STATUS = BIT(12),	/* 0x641 MSR_PP1_ENERGY_STATUS */
451 	RAPL_GFX_POLICY = BIT(13),	/* 0x642 MSR_PP1_POLICY */
452 	RAPL_AMD_PWR_UNIT = BIT(14),	/* 0xc0010299 MSR_AMD_RAPL_POWER_UNIT */
453 	RAPL_AMD_CORE_ENERGY_STAT = BIT(15),	/* 0xc001029a MSR_AMD_CORE_ENERGY_STATUS */
454 	RAPL_AMD_PKG_ENERGY_STAT = BIT(16),	/* 0xc001029b MSR_AMD_PKG_ENERGY_STATUS */
455 };
456 
457 #define RAPL_PKG	(RAPL_PKG_ENERGY_STATUS | RAPL_PKG_POWER_LIMIT)
458 #define RAPL_DRAM	(RAPL_DRAM_ENERGY_STATUS | RAPL_DRAM_POWER_LIMIT)
459 #define RAPL_CORE	(RAPL_CORE_ENERGY_STATUS | RAPL_CORE_POWER_LIMIT)
460 #define RAPL_GFX	(RAPL_GFX_POWER_LIMIT | RAPL_GFX_ENERGY_STATUS)
461 
462 #define RAPL_PKG_ALL	(RAPL_PKG | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO)
463 #define RAPL_DRAM_ALL	(RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_DRAM_POWER_INFO)
464 #define RAPL_CORE_ALL	(RAPL_CORE | RAPL_CORE_POLICY)
465 #define RAPL_GFX_ALL	(RAPL_GFX | RAPL_GFX_POLIGY)
466 
467 #define RAPL_AMD_F17H	(RAPL_AMD_PWR_UNIT | RAPL_AMD_CORE_ENERGY_STAT | RAPL_AMD_PKG_ENERGY_STAT)
468 
/* For Cstates */
/* Core (CCn) and package (PCn) C-state support bits,
 * combined into platform_features.supported_cstates */
enum cstates {
	CC1 = BIT(0),
	CC3 = BIT(1),
	CC6 = BIT(2),
	CC7 = BIT(3),
	PC2 = BIT(4),
	PC3 = BIT(5),
	PC6 = BIT(6),
	PC7 = BIT(7),
	PC8 = BIT(8),
	PC9 = BIT(9),
	PC10 = BIT(10),
};
483 
/* Nehalem / Westmere client and EP (see turbostat_pdata[]) */
static const struct platform_features nhm_features = {
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.bclk_freq = BCLK_133MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6,
	.cst_limit = CST_LIMIT_NHM,
	.trl_msrs = TRL_BASE,
};

/* Nehalem-EX / Westmere-EX: as nhm but no Turbo Ratio Limit MSR */
static const struct platform_features nhx_features = {
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.bclk_freq = BCLK_133MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6,
	.cst_limit = CST_LIMIT_NHM,
};

/* Sandy Bridge client */
static const struct platform_features snb_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
	.cst_limit = CST_LIMIT_SNB,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE,
	.rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
};

/* Sandy Bridge Xeon: DRAM RAPL instead of GFX */
static const struct platform_features snx_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
	.cst_limit = CST_LIMIT_SNB,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL,
};

/* Ivy Bridge client */
static const struct platform_features ivb_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
	.cst_limit = CST_LIMIT_SNB,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE,
	.rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
};

/* Ivy Bridge Xeon */
static const struct platform_features ivx_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
	.cst_limit = CST_LIMIT_SNB,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE | TRL_LIMIT1,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL,
};
549 
/* Haswell client (see turbostat_pdata[]) */
static const struct platform_features hsw_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
	.cst_limit = CST_LIMIT_HSW,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE,
	.plr_msrs = PLR_CORE | PLR_GFX | PLR_RING,
	.rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
};

/* Haswell Xeon */
static const struct platform_features hsx_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
	.cst_limit = CST_LIMIT_HSW,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2,
	.plr_msrs = PLR_CORE | PLR_RING,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
	.has_fixed_rapl_unit = 1,
};

/* Haswell-L: adds the deep package C-states PC8..PC10 */
static const struct platform_features hswl_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
	.cst_limit = CST_LIMIT_HSW,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE,
	.plr_msrs = PLR_CORE | PLR_GFX | PLR_RING,
	.rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
};

/* presumably Haswell-G -- mapping row is past this chunk; confirm in turbostat_pdata[] */
static const struct platform_features hswg_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
	.cst_limit = CST_LIMIT_HSW,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE,
	.plr_msrs = PLR_CORE | PLR_GFX | PLR_RING,
	.rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
};

/* presumably Broadwell client -- confirm in turbostat_pdata[] */
static const struct platform_features bdw_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
	.cst_limit = CST_LIMIT_HSW,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE,
	.rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
};

/* presumably Broadwell-G -- confirm in turbostat_pdata[] */
static const struct platform_features bdwg_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
	.cst_limit = CST_LIMIT_HSW,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE,
	.rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
};

/* presumably Broadwell Xeon -- confirm in turbostat_pdata[] */
static const struct platform_features bdx_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | PC2 | PC3 | PC6,
	.cst_limit = CST_LIMIT_HSW,
	.has_irtl_msrs = 1,
	.has_cst_auto_convension = 1,
	.trl_msrs = TRL_BASE,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
	.has_fixed_rapl_unit = 1,
};
647 
/*
 * NOTE(review): the tables below follow turbostat's microarchitecture
 * abbreviations (skl, cnl, adl, arl, skx, icx, spr, srf, grr); the model
 * rows binding them are in turbostat_pdata[], past this chunk -- confirm there.
 */
static const struct platform_features skl_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.crystal_freq = 24000000,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
	.cst_limit = CST_LIMIT_HSW,
	.has_irtl_msrs = 1,
	.has_ext_cst_msrs = 1,
	.trl_msrs = TRL_BASE,
	.tcc_offset_bits = 6,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX,
	.enable_tsc_tweak = 1,
};

static const struct platform_features cnl_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
	.cst_limit = CST_LIMIT_HSW,
	.has_irtl_msrs = 1,
	.has_msr_core_c1_res = 1,
	.has_ext_cst_msrs = 1,
	.trl_msrs = TRL_BASE,
	.tcc_offset_bits = 6,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX,
	.enable_tsc_tweak = 1,
};

static const struct platform_features adl_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC8 | PC10,
	.cst_limit = CST_LIMIT_HSW,
	.has_irtl_msrs = 1,
	.has_msr_core_c1_res = 1,
	.has_ext_cst_msrs = 1,
	.trl_msrs = TRL_BASE,
	.tcc_offset_bits = 6,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX,
	.enable_tsc_tweak = 1,
};

/* as adl but without PC8 */
static const struct platform_features arl_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC10,
	.cst_limit = CST_LIMIT_HSW,
	.has_irtl_msrs = 1,
	.has_msr_core_c1_res = 1,
	.has_ext_cst_msrs = 1,
	.trl_msrs = TRL_BASE,
	.tcc_offset_bits = 6,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX,
	.enable_tsc_tweak = 1,
};

static const struct platform_features skx_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC6 | PC2 | PC6,
	.cst_limit = CST_LIMIT_SKX,
	.has_irtl_msrs = 1,
	.has_cst_auto_convension = 1,
	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
	.has_fixed_rapl_unit = 1,
};

static const struct platform_features icx_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC6 | PC2 | PC6,
	.cst_limit = CST_LIMIT_ICX,
	.has_msr_core_c1_res = 1,
	.has_irtl_msrs = 1,
	.has_cst_prewake_bit = 1,
	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
	.has_fixed_rapl_unit = 1,
};

static const struct platform_features spr_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC6 | PC2 | PC6,
	.cst_limit = CST_LIMIT_SKX,
	.has_msr_core_c1_res = 1,
	.has_irtl_msrs = 1,
	.has_cst_prewake_bit = 1,
	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
};

/* as spr plus the module C6 residency MSR */
static const struct platform_features srf_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC6 | PC2 | PC6,
	.cst_limit = CST_LIMIT_SKX,
	.has_msr_core_c1_res = 1,
	.has_msr_module_c6_res_ms = 1,
	.has_irtl_msrs = 1,
	.has_cst_prewake_bit = 1,
	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
};

/* as srf but core C-states only (no package C-states) */
static const struct platform_features grr_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC6,
	.cst_limit = CST_LIMIT_SKX,
	.has_msr_core_c1_res = 1,
	.has_msr_module_c6_res_ms = 1,
	.has_irtl_msrs = 1,
	.has_cst_prewake_bit = 1,
	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
};
793 
794 static const struct platform_features slv_features = {
795 	.has_nhm_msrs = 1,
796 	.bclk_freq = BCLK_SLV,
797 	.supported_cstates = CC1 | CC6 | PC6,
798 	.cst_limit = CST_LIMIT_SLV,
799 	.has_msr_core_c1_res = 1,
800 	.has_msr_module_c6_res_ms = 1,
801 	.has_msr_c6_demotion_policy_config = 1,
802 	.has_msr_atom_pkg_c6_residency = 1,
803 	.trl_msrs = TRL_ATOM,
804 	.rapl_msrs = RAPL_PKG | RAPL_CORE,
805 	.has_rapl_divisor = 1,
806 	.rapl_quirk_tdp = 30,
807 };
808 
809 static const struct platform_features slvd_features = {
810 	.has_msr_misc_pwr_mgmt = 1,
811 	.has_nhm_msrs = 1,
812 	.bclk_freq = BCLK_SLV,
813 	.supported_cstates = CC1 | CC6 | PC3 | PC6,
814 	.cst_limit = CST_LIMIT_SLV,
815 	.has_msr_atom_pkg_c6_residency = 1,
816 	.trl_msrs = TRL_BASE,
817 	.rapl_msrs = RAPL_PKG | RAPL_CORE,
818 	.rapl_quirk_tdp = 30,
819 };
820 
821 static const struct platform_features amt_features = {
822 	.has_nhm_msrs = 1,
823 	.bclk_freq = BCLK_133MHZ,
824 	.supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6,
825 	.cst_limit = CST_LIMIT_AMT,
826 	.trl_msrs = TRL_BASE,
827 };
828 
829 static const struct platform_features gmt_features = {
830 	.has_msr_misc_pwr_mgmt = 1,
831 	.has_nhm_msrs = 1,
832 	.bclk_freq = BCLK_100MHZ,
833 	.crystal_freq = 19200000,
834 	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
835 	.cst_limit = CST_LIMIT_GMT,
836 	.has_irtl_msrs = 1,
837 	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
838 	.rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO,
839 };
840 
841 static const struct platform_features gmtd_features = {
842 	.has_msr_misc_pwr_mgmt = 1,
843 	.has_nhm_msrs = 1,
844 	.bclk_freq = BCLK_100MHZ,
845 	.crystal_freq = 25000000,
846 	.supported_cstates = CC1 | CC6 | PC2 | PC6,
847 	.cst_limit = CST_LIMIT_GMT,
848 	.has_irtl_msrs = 1,
849 	.has_msr_core_c1_res = 1,
850 	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
851 	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS,
852 };
853 
854 static const struct platform_features gmtp_features = {
855 	.has_msr_misc_pwr_mgmt = 1,
856 	.has_nhm_msrs = 1,
857 	.bclk_freq = BCLK_100MHZ,
858 	.crystal_freq = 19200000,
859 	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
860 	.cst_limit = CST_LIMIT_GMT,
861 	.has_irtl_msrs = 1,
862 	.trl_msrs = TRL_BASE,
863 	.rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO,
864 };
865 
/* Feature set for INTEL_FAM6_ATOM_TREMONT and _TREMONT_L (see turbostat_pdata[]). */
static const struct platform_features tmt_features = {
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
	.cst_limit = CST_LIMIT_GMT,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX,
	.enable_tsc_tweak = 1,
};
877 
/* Feature set for INTEL_FAM6_ATOM_TREMONT_D (see turbostat_pdata[]). */
static const struct platform_features tmtd_features = {
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC6,
	.cst_limit = CST_LIMIT_GMT,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
	.rapl_msrs = RAPL_PKG_ALL,
};
888 
/* Feature set for INTEL_FAM6_XEON_PHI_KNL and _KNM (see turbostat_pdata[]). */
static const struct platform_features knl_features = {
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC6 | PC3 | PC6,
	.cst_limit = CST_LIMIT_KNL,
	.has_msr_knl_core_c6_residency = 1,
	.trl_msrs = TRL_KNL,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
	.has_fixed_rapl_unit = 1,
	.need_perf_multiplier = 1,
};
902 
/* Empty fallback feature set used when the CPU model is not recognized. */
static const struct platform_features default_features = {
};
905 
/* Feature set for AMD/Hygon family 17h+ parts that advertise RAPL (CPUID 0x80000007 EDX bit 14). */
static const struct platform_features amd_features_with_rapl = {
	.rapl_msrs = RAPL_AMD_F17H,
	.has_per_core_rapl = 1,
	.rapl_quirk_tdp = 280,	/* This is the max stock TDP of HEDT/Server Fam17h+ chips */
};
911 
/*
 * CPU-model -> feature-set dispatch table, searched linearly by
 * probe_platform_features().  The list is terminated by the entry
 * whose .features pointer is NULL.
 */
static const struct platform_data turbostat_pdata[] = {
	{ INTEL_FAM6_NEHALEM, &nhm_features },
	{ INTEL_FAM6_NEHALEM_G, &nhm_features },
	{ INTEL_FAM6_NEHALEM_EP, &nhm_features },
	{ INTEL_FAM6_NEHALEM_EX, &nhx_features },
	{ INTEL_FAM6_WESTMERE, &nhm_features },
	{ INTEL_FAM6_WESTMERE_EP, &nhm_features },
	{ INTEL_FAM6_WESTMERE_EX, &nhx_features },
	{ INTEL_FAM6_SANDYBRIDGE, &snb_features },
	{ INTEL_FAM6_SANDYBRIDGE_X, &snx_features },
	{ INTEL_FAM6_IVYBRIDGE, &ivb_features },
	{ INTEL_FAM6_IVYBRIDGE_X, &ivx_features },
	{ INTEL_FAM6_HASWELL, &hsw_features },
	{ INTEL_FAM6_HASWELL_X, &hsx_features },
	{ INTEL_FAM6_HASWELL_L, &hswl_features },
	{ INTEL_FAM6_HASWELL_G, &hswg_features },
	{ INTEL_FAM6_BROADWELL, &bdw_features },
	{ INTEL_FAM6_BROADWELL_G, &bdwg_features },
	{ INTEL_FAM6_BROADWELL_X, &bdx_features },
	{ INTEL_FAM6_BROADWELL_D, &bdx_features },
	{ INTEL_FAM6_SKYLAKE_L, &skl_features },
	{ INTEL_FAM6_SKYLAKE, &skl_features },
	{ INTEL_FAM6_SKYLAKE_X, &skx_features },
	{ INTEL_FAM6_KABYLAKE_L, &skl_features },
	{ INTEL_FAM6_KABYLAKE, &skl_features },
	{ INTEL_FAM6_COMETLAKE, &skl_features },
	{ INTEL_FAM6_COMETLAKE_L, &skl_features },
	{ INTEL_FAM6_CANNONLAKE_L, &cnl_features },
	{ INTEL_FAM6_ICELAKE_X, &icx_features },
	{ INTEL_FAM6_ICELAKE_D, &icx_features },
	{ INTEL_FAM6_ICELAKE_L, &cnl_features },
	{ INTEL_FAM6_ICELAKE_NNPI, &cnl_features },
	{ INTEL_FAM6_ROCKETLAKE, &cnl_features },
	{ INTEL_FAM6_TIGERLAKE_L, &cnl_features },
	{ INTEL_FAM6_TIGERLAKE, &cnl_features },
	{ INTEL_FAM6_SAPPHIRERAPIDS_X, &spr_features },
	{ INTEL_FAM6_EMERALDRAPIDS_X, &spr_features },
	{ INTEL_FAM6_GRANITERAPIDS_X, &spr_features },
	{ INTEL_FAM6_LAKEFIELD, &cnl_features },
	{ INTEL_FAM6_ALDERLAKE, &adl_features },
	{ INTEL_FAM6_ALDERLAKE_L, &adl_features },
	{ INTEL_FAM6_RAPTORLAKE, &adl_features },
	{ INTEL_FAM6_RAPTORLAKE_P, &adl_features },
	{ INTEL_FAM6_RAPTORLAKE_S, &adl_features },
	{ INTEL_FAM6_METEORLAKE, &cnl_features },
	{ INTEL_FAM6_METEORLAKE_L, &cnl_features },
	{ INTEL_FAM6_ARROWLAKE_H, &arl_features },
	{ INTEL_FAM6_ARROWLAKE_U, &arl_features },
	{ INTEL_FAM6_ARROWLAKE, &arl_features },
	{ INTEL_FAM6_LUNARLAKE_M, &arl_features },
	{ INTEL_FAM6_ATOM_SILVERMONT, &slv_features },
	{ INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features },
	{ INTEL_FAM6_ATOM_AIRMONT, &amt_features },
	{ INTEL_FAM6_ATOM_GOLDMONT, &gmt_features },
	{ INTEL_FAM6_ATOM_GOLDMONT_D, &gmtd_features },
	{ INTEL_FAM6_ATOM_GOLDMONT_PLUS, &gmtp_features },
	{ INTEL_FAM6_ATOM_TREMONT_D, &tmtd_features },
	{ INTEL_FAM6_ATOM_TREMONT, &tmt_features },
	{ INTEL_FAM6_ATOM_TREMONT_L, &tmt_features },
	{ INTEL_FAM6_ATOM_GRACEMONT, &adl_features },
	{ INTEL_FAM6_ATOM_CRESTMONT_X, &srf_features },
	{ INTEL_FAM6_ATOM_CRESTMONT, &grr_features },
	{ INTEL_FAM6_XEON_PHI_KNL, &knl_features },
	{ INTEL_FAM6_XEON_PHI_KNM, &knl_features },
	/*
	 * Missing support for
	 * INTEL_FAM6_ICELAKE
	 * INTEL_FAM6_ATOM_SILVERMONT_MID
	 * INTEL_FAM6_ATOM_AIRMONT_MID
	 * INTEL_FAM6_ATOM_AIRMONT_NP
	 */
	{ 0, NULL },
};
985 
/* Feature set for the running CPU; set once by probe_platform_features(). */
static const struct platform_features *platform;
987 
988 void probe_platform_features(unsigned int family, unsigned int model)
989 {
990 	int i;
991 
992 	platform = &default_features;
993 
994 	if (authentic_amd || hygon_genuine) {
995 		if (max_extended_level >= 0x80000007) {
996 			unsigned int eax, ebx, ecx, edx;
997 
998 			__cpuid(0x80000007, eax, ebx, ecx, edx);
999 			/* RAPL (Fam 17h+) */
1000 			if ((edx & (1 << 14)) && family >= 0x17)
1001 				platform = &amd_features_with_rapl;
1002 		}
1003 		return;
1004 	}
1005 
1006 	if (!genuine_intel || family != 6)
1007 		return;
1008 
1009 	for (i = 0; turbostat_pdata[i].features; i++) {
1010 		if (turbostat_pdata[i].model == model) {
1011 			platform = turbostat_pdata[i].features;
1012 			return;
1013 		}
1014 	}
1015 }
1016 
1017 /* Model specific support End */
1018 
#define	TJMAX_DEFAULT	100

/* MSRs that are not yet in the kernel-provided header. */
#define MSR_RAPL_PWR_UNIT	0xc0010299
#define MSR_CORE_ENERGY_STAT	0xc001029a
#define MSR_PKG_ENERGY_STAT	0xc001029b

/* NOTE(review): evaluates each argument twice -- avoid side-effecting arguments. */
#define MAX(a, b) ((a) > (b) ? (a) : (b))

int backwards_count;
char *progname;

#define CPU_SUBSET_MAXCPUS	1024	/* need to use before probe... */
cpu_set_t *cpu_present_set, *cpu_effective_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset;
size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size;
#define MAX_ADDED_THREAD_COUNTERS 24
#define MAX_ADDED_CORE_COUNTERS 8
#define MAX_ADDED_PACKAGE_COUNTERS 16
#define BITMASK_SIZE 32
1038 
/* Indexes used to map data read from perf and MSRs into global variables */
enum rapl_rci_index {
	RAPL_RCI_INDEX_ENERGY_PKG = 0,
	RAPL_RCI_INDEX_ENERGY_CORES = 1,
	RAPL_RCI_INDEX_DRAM = 2,
	RAPL_RCI_INDEX_GFX = 3,
	RAPL_RCI_INDEX_PKG_PERF_STATUS = 4,
	RAPL_RCI_INDEX_DRAM_PERF_STATUS = 5,
	RAPL_RCI_INDEX_CORE_ENERGY = 6,
	NUM_RAPL_COUNTERS,	/* must stay last: sizes the per-domain arrays below */
};
1050 
/* Unit a RAPL counter is reported in (Watts when rates, Joules when raw energy). */
enum rapl_unit {
	RAPL_UNIT_INVALID,
	RAPL_UNIT_JOULES,
	RAPL_UNIT_WATTS,
};
1056 
/* Per-domain runtime state for every RAPL counter: latest readings plus
 * how each counter is sourced (perf fd or raw MSR description). */
struct rapl_counter_info_t {
	unsigned long long data[NUM_RAPL_COUNTERS];
	enum rapl_source source[NUM_RAPL_COUNTERS];
	unsigned long long flags[NUM_RAPL_COUNTERS];
	double scale[NUM_RAPL_COUNTERS];
	enum rapl_unit unit[NUM_RAPL_COUNTERS];

	union {
		/* Active when source == RAPL_SOURCE_MSR */
		struct {
			unsigned long long msr[NUM_RAPL_COUNTERS];
			unsigned long long msr_mask[NUM_RAPL_COUNTERS];
			int msr_shift[NUM_RAPL_COUNTERS];
		};
	};

	int fd_perf;
};
1075 
/* struct rapl_counter_info_t for each RAPL domain */
struct rapl_counter_info_t *rapl_counter_info_perdomain;
unsigned int rapl_counter_info_perdomain_size;

/* Accumulate this counter through the 64-bit MSR-sum machinery (see get_msr_sum()). */
#define RAPL_COUNTER_FLAG_USE_MSR_SUM (1u << 1)
1081 
/* Static, per-architecture description of one RAPL counter: how to probe
 * it, read it (perf event and/or MSR), and scale/present its value. */
struct rapl_counter_arch_info {
	int feature_mask;	/* Mask for testing if the counter is supported on host */
	const char *perf_subsys;
	const char *perf_name;
	unsigned long long msr;
	unsigned long long msr_mask;
	int msr_shift;		/* Positive mean shift right, negative mean shift left */
	double *platform_rapl_msr_scale;	/* Scale applied to values read by MSR (platform dependent, filled at runtime) */
	unsigned int rci_index;	/* Maps data from perf counters to global variables */
	unsigned long long bic;
	double compat_scale;	/* Some counters require constant scaling to be in the same range as other, similar ones */
	unsigned long long flags;
};
1095 
/*
 * Every RAPL counter turbostat knows how to read: its perf event (when
 * one exists), the fallback MSR, scaling, and the BIC (built-in-counter)
 * bits that select it.  Entries with a NULL perf_subsys are MSR-only.
 */
static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = {
	{
	 .feature_mask = RAPL_PKG,
	 .perf_subsys = "power",
	 .perf_name = "energy-pkg",
	 .msr = MSR_PKG_ENERGY_STATUS,
	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
	 .msr_shift = 0,
	 .platform_rapl_msr_scale = &rapl_energy_units,
	 .rci_index = RAPL_RCI_INDEX_ENERGY_PKG,
	 .bic = BIC_PkgWatt | BIC_Pkg_J,
	 .compat_scale = 1.0,
	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
	  },
	{
	 .feature_mask = RAPL_AMD_F17H,
	 .perf_subsys = "power",
	 .perf_name = "energy-pkg",
	 .msr = MSR_PKG_ENERGY_STAT,
	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
	 .msr_shift = 0,
	 .platform_rapl_msr_scale = &rapl_energy_units,
	 .rci_index = RAPL_RCI_INDEX_ENERGY_PKG,
	 .bic = BIC_PkgWatt | BIC_Pkg_J,
	 .compat_scale = 1.0,
	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
	  },
	{
	 .feature_mask = RAPL_CORE_ENERGY_STATUS,
	 .perf_subsys = "power",
	 .perf_name = "energy-cores",
	 .msr = MSR_PP0_ENERGY_STATUS,
	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
	 .msr_shift = 0,
	 .platform_rapl_msr_scale = &rapl_energy_units,
	 .rci_index = RAPL_RCI_INDEX_ENERGY_CORES,
	 .bic = BIC_CorWatt | BIC_Cor_J,
	 .compat_scale = 1.0,
	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
	  },
	{
	 .feature_mask = RAPL_DRAM,
	 .perf_subsys = "power",
	 .perf_name = "energy-ram",
	 .msr = MSR_DRAM_ENERGY_STATUS,
	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
	 .msr_shift = 0,
	 .platform_rapl_msr_scale = &rapl_dram_energy_units,
	 .rci_index = RAPL_RCI_INDEX_DRAM,
	 .bic = BIC_RAMWatt | BIC_RAM_J,
	 .compat_scale = 1.0,
	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
	  },
	{
	 .feature_mask = RAPL_GFX,
	 .perf_subsys = "power",
	 .perf_name = "energy-gpu",
	 .msr = MSR_PP1_ENERGY_STATUS,
	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
	 .msr_shift = 0,
	 .platform_rapl_msr_scale = &rapl_energy_units,
	 .rci_index = RAPL_RCI_INDEX_GFX,
	 .bic = BIC_GFXWatt | BIC_GFX_J,
	 .compat_scale = 1.0,
	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
	  },
	{
	 .feature_mask = RAPL_PKG_PERF_STATUS,
	 .perf_subsys = NULL,
	 .perf_name = NULL,
	 .msr = MSR_PKG_PERF_STATUS,
	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
	 .msr_shift = 0,
	 .platform_rapl_msr_scale = &rapl_time_units,
	 .rci_index = RAPL_RCI_INDEX_PKG_PERF_STATUS,
	 .bic = BIC_PKG__,
	 .compat_scale = 100.0,
	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
	  },
	{
	 .feature_mask = RAPL_DRAM_PERF_STATUS,
	 .perf_subsys = NULL,
	 .perf_name = NULL,
	 .msr = MSR_DRAM_PERF_STATUS,
	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
	 .msr_shift = 0,
	 .platform_rapl_msr_scale = &rapl_time_units,
	 .rci_index = RAPL_RCI_INDEX_DRAM_PERF_STATUS,
	 .bic = BIC_RAM__,
	 .compat_scale = 100.0,
	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
	  },
	{
	 .feature_mask = RAPL_AMD_F17H,
	 .perf_subsys = NULL,
	 .perf_name = NULL,
	 .msr = MSR_CORE_ENERGY_STAT,
	 .msr_mask = 0xFFFFFFFF,
	 .msr_shift = 0,
	 .platform_rapl_msr_scale = &rapl_energy_units,
	 .rci_index = RAPL_RCI_INDEX_CORE_ENERGY,
	 .bic = BIC_CorWatt | BIC_Cor_J,
	 .compat_scale = 1.0,
	 .flags = 0,
	  },
};
1202 
/* One sampled RAPL value: raw reading plus the unit/scale needed to present it. */
struct rapl_counter {
	unsigned long long raw_value;
	enum rapl_unit unit;
	double scale;
};
1208 
/* Indexes used to map data read from perf and MSRs into global variables */
enum ccstate_rci_index {
	CCSTATE_RCI_INDEX_C1_RESIDENCY = 0,
	CCSTATE_RCI_INDEX_C3_RESIDENCY = 1,
	CCSTATE_RCI_INDEX_C6_RESIDENCY = 2,
	CCSTATE_RCI_INDEX_C7_RESIDENCY = 3,
	PCSTATE_RCI_INDEX_C2_RESIDENCY = 4,
	PCSTATE_RCI_INDEX_C3_RESIDENCY = 5,
	PCSTATE_RCI_INDEX_C6_RESIDENCY = 6,
	PCSTATE_RCI_INDEX_C7_RESIDENCY = 7,
	PCSTATE_RCI_INDEX_C8_RESIDENCY = 8,
	PCSTATE_RCI_INDEX_C9_RESIDENCY = 9,
	PCSTATE_RCI_INDEX_C10_RESIDENCY = 10,
	NUM_CSTATE_COUNTERS,	/* must stay last: sizes the arrays below */
};
1224 
/* Runtime state for the core/package C-state residency counters: latest
 * readings, per-counter source, fallback MSR, and the two perf group fds. */
struct cstate_counter_info_t {
	unsigned long long data[NUM_CSTATE_COUNTERS];
	enum cstate_source source[NUM_CSTATE_COUNTERS];
	unsigned long long msr[NUM_CSTATE_COUNTERS];
	int fd_perf_core;
	int fd_perf_pkg;
};

struct cstate_counter_info_t *ccstate_counter_info;
unsigned int ccstate_counter_info_size;

/* PER_THREAD implies PER_CORE (the PER_CORE bit is folded in below). */
#define CSTATE_COUNTER_FLAG_COLLECT_PER_CORE   (1u << 0)
#define CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD ((1u << 1) | CSTATE_COUNTER_FLAG_COLLECT_PER_CORE)
#define CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY (1u << 2)
1239 
/* Static description of one C-state residency counter: probe mask, perf
 * event name, fallback MSR, and the BIC bit that selects it. */
struct cstate_counter_arch_info {
	int feature_mask;	/* Mask for testing if the counter is supported on host */
	const char *perf_subsys;
	const char *perf_name;
	unsigned long long msr;
	unsigned int rci_index;	/* Maps data from perf counters to global variables */
	unsigned long long bic;
	unsigned long long flags;
	int pkg_cstate_limit;
};
1250 
/*
 * Every core (cstate_core) and package (cstate_pkg) C-state residency
 * counter turbostat knows how to read, with its fallback MSR and the
 * package C-state limit (PCL_*) that gates the package entries.
 */
static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = {
	{
	 .feature_mask = CC1,
	 .perf_subsys = "cstate_core",
	 .perf_name = "c1-residency",
	 .msr = MSR_CORE_C1_RES,
	 .rci_index = CCSTATE_RCI_INDEX_C1_RESIDENCY,
	 .bic = BIC_CPU_c1,
	 .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD,
	 .pkg_cstate_limit = 0,
	  },
	{
	 .feature_mask = CC3,
	 .perf_subsys = "cstate_core",
	 .perf_name = "c3-residency",
	 .msr = MSR_CORE_C3_RESIDENCY,
	 .rci_index = CCSTATE_RCI_INDEX_C3_RESIDENCY,
	 .bic = BIC_CPU_c3,
	 .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY,
	 .pkg_cstate_limit = 0,
	  },
	{
	 .feature_mask = CC6,
	 .perf_subsys = "cstate_core",
	 .perf_name = "c6-residency",
	 .msr = MSR_CORE_C6_RESIDENCY,
	 .rci_index = CCSTATE_RCI_INDEX_C6_RESIDENCY,
	 .bic = BIC_CPU_c6,
	 .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY,
	 .pkg_cstate_limit = 0,
	  },
	{
	 .feature_mask = CC7,
	 .perf_subsys = "cstate_core",
	 .perf_name = "c7-residency",
	 .msr = MSR_CORE_C7_RESIDENCY,
	 .rci_index = CCSTATE_RCI_INDEX_C7_RESIDENCY,
	 .bic = BIC_CPU_c7,
	 .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY,
	 .pkg_cstate_limit = 0,
	  },
	{
	 .feature_mask = PC2,
	 .perf_subsys = "cstate_pkg",
	 .perf_name = "c2-residency",
	 .msr = MSR_PKG_C2_RESIDENCY,
	 .rci_index = PCSTATE_RCI_INDEX_C2_RESIDENCY,
	 .bic = BIC_Pkgpc2,
	 .flags = 0,
	 .pkg_cstate_limit = PCL__2,
	  },
	{
	 .feature_mask = PC3,
	 .perf_subsys = "cstate_pkg",
	 .perf_name = "c3-residency",
	 .msr = MSR_PKG_C3_RESIDENCY,
	 .rci_index = PCSTATE_RCI_INDEX_C3_RESIDENCY,
	 .bic = BIC_Pkgpc3,
	 .flags = 0,
	 .pkg_cstate_limit = PCL__3,
	  },
	{
	 .feature_mask = PC6,
	 .perf_subsys = "cstate_pkg",
	 .perf_name = "c6-residency",
	 .msr = MSR_PKG_C6_RESIDENCY,
	 .rci_index = PCSTATE_RCI_INDEX_C6_RESIDENCY,
	 .bic = BIC_Pkgpc6,
	 .flags = 0,
	 .pkg_cstate_limit = PCL__6,
	  },
	{
	 .feature_mask = PC7,
	 .perf_subsys = "cstate_pkg",
	 .perf_name = "c7-residency",
	 .msr = MSR_PKG_C7_RESIDENCY,
	 .rci_index = PCSTATE_RCI_INDEX_C7_RESIDENCY,
	 .bic = BIC_Pkgpc7,
	 .flags = 0,
	 .pkg_cstate_limit = PCL__7,
	  },
	{
	 .feature_mask = PC8,
	 .perf_subsys = "cstate_pkg",
	 .perf_name = "c8-residency",
	 .msr = MSR_PKG_C8_RESIDENCY,
	 .rci_index = PCSTATE_RCI_INDEX_C8_RESIDENCY,
	 .bic = BIC_Pkgpc8,
	 .flags = 0,
	 .pkg_cstate_limit = PCL__8,
	  },
	{
	 .feature_mask = PC9,
	 .perf_subsys = "cstate_pkg",
	 .perf_name = "c9-residency",
	 .msr = MSR_PKG_C9_RESIDENCY,
	 .rci_index = PCSTATE_RCI_INDEX_C9_RESIDENCY,
	 .bic = BIC_Pkgpc9,
	 .flags = 0,
	 .pkg_cstate_limit = PCL__9,
	  },
	{
	 .feature_mask = PC10,
	 .perf_subsys = "cstate_pkg",
	 .perf_name = "c10-residency",
	 .msr = MSR_PKG_C10_RESIDENCY,
	 .rci_index = PCSTATE_RCI_INDEX_C10_RESIDENCY,
	 .bic = BIC_Pkgpc10,
	 .flags = 0,
	 .pkg_cstate_limit = PCL_10,
	  },
};
1363 
/* Per-CPU (per-hyper-thread) sample.  Two arrays ("even"/"odd", see
 * EVEN_COUNTERS/ODD_COUNTERS) hold consecutive snapshots for delta math. */
struct thread_data {
	struct timeval tv_begin;
	struct timeval tv_end;
	struct timeval tv_delta;
	unsigned long long tsc;
	unsigned long long aperf;
	unsigned long long mperf;
	unsigned long long c1;
	unsigned long long instr_count;
	unsigned long long irq_count;
	unsigned int smi_count;
	unsigned int cpu_id;
	unsigned int apic_id;
	unsigned int x2apic_id;
	unsigned int flags;
	bool is_atom;
	unsigned long long counter[MAX_ADDED_THREAD_COUNTERS];	/* user-added (--add) counters */
} *thread_even, *thread_odd;
1382 
/* Per-core sample; see thread_data for the even/odd snapshot scheme. */
struct core_data {
	int base_cpu;
	unsigned long long c3;
	unsigned long long c6;
	unsigned long long c7;
	unsigned long long mc6_us;	/* duplicate as per-core for now, even though per module */
	unsigned int core_temp_c;
	struct rapl_counter core_energy;	/* MSR_CORE_ENERGY_STAT */
	unsigned int core_id;
	unsigned long long core_throt_cnt;
	unsigned long long counter[MAX_ADDED_CORE_COUNTERS];	/* user-added (--add) counters */
} *core_even, *core_odd;
1395 
/* Per-package sample; see thread_data for the even/odd snapshot scheme. */
struct pkg_data {
	int base_cpu;
	unsigned long long pc2;
	unsigned long long pc3;
	unsigned long long pc6;
	unsigned long long pc7;
	unsigned long long pc8;
	unsigned long long pc9;
	unsigned long long pc10;
	long long cpu_lpi;
	long long sys_lpi;
	unsigned long long pkg_wtd_core_c0;
	unsigned long long pkg_any_core_c0;
	unsigned long long pkg_any_gfxe_c0;
	unsigned long long pkg_both_core_gfxe_c0;
	long long gfx_rc6_ms;
	unsigned int gfx_mhz;
	unsigned int gfx_act_mhz;
	long long sam_mc6_ms;
	unsigned int sam_mhz;
	unsigned int sam_act_mhz;
	unsigned int package_id;
	struct rapl_counter energy_pkg;	/* MSR_PKG_ENERGY_STATUS */
	struct rapl_counter energy_dram;	/* MSR_DRAM_ENERGY_STATUS */
	struct rapl_counter energy_cores;	/* MSR_PP0_ENERGY_STATUS */
	struct rapl_counter energy_gfx;	/* MSR_PP1_ENERGY_STATUS */
	struct rapl_counter rapl_pkg_perf_status;	/* MSR_PKG_PERF_STATUS */
	struct rapl_counter rapl_dram_perf_status;	/* MSR_DRAM_PERF_STATUS */
	unsigned int pkg_temp_c;
	unsigned int uncore_mhz;
	unsigned long long counter[MAX_ADDED_PACKAGE_COUNTERS];	/* user-added (--add) counters */
} *package_even, *package_odd;
1428 
#define ODD_COUNTERS thread_odd, core_odd, package_odd
#define EVEN_COUNTERS thread_even, core_even, package_even

/*
 * Index into the flat thread/core/package arrays, which are laid out
 * package-major, then node, then core, then thread.
 */
#define GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no)	      \
	((thread_base) +						      \
	 ((pkg_no) *							      \
	  topo.nodes_per_pkg * topo.cores_per_node * topo.threads_per_core) + \
	 ((node_no) * topo.cores_per_node * topo.threads_per_core) +	      \
	 ((core_no) * topo.threads_per_core) +				      \
	 (thread_no))

#define GET_CORE(core_base, core_no, node_no, pkg_no)			\
	((core_base) +							\
	 ((pkg_no) *  topo.nodes_per_pkg * topo.cores_per_node) +	\
	 ((node_no) * topo.cores_per_node) +				\
	 (core_no))

#define GET_PKG(pkg_base, pkg_no) (pkg_base + pkg_no)
1447 
1448 /*
1449  * The accumulated sum of MSR is defined as a monotonic
1450  * increasing MSR, it will be accumulated periodically,
1451  * despite its register's bit width.
1452  */
1453 enum {
1454 	IDX_PKG_ENERGY,
1455 	IDX_DRAM_ENERGY,
1456 	IDX_PP0_ENERGY,
1457 	IDX_PP1_ENERGY,
1458 	IDX_PKG_PERF,
1459 	IDX_DRAM_PERF,
1460 	IDX_COUNT,
1461 };
1462 
1463 int get_msr_sum(int cpu, off_t offset, unsigned long long *msr);
1464 
struct msr_sum_array {
	/* get_msr_sum() = sum + (get_msr() - last) */
	struct {
		/* The accumulated MSR value, updated by the periodic timer. */
		unsigned long long sum;
		/* The raw MSR value recorded at the last timer tick. */
		unsigned long long last;
	} entries[IDX_COUNT];
};

/* The per-cpu MSR sum array. */
struct msr_sum_array *per_cpu_msr_sum;
1477 
1478 off_t idx_to_offset(int idx)
1479 {
1480 	off_t offset;
1481 
1482 	switch (idx) {
1483 	case IDX_PKG_ENERGY:
1484 		if (platform->rapl_msrs & RAPL_AMD_F17H)
1485 			offset = MSR_PKG_ENERGY_STAT;
1486 		else
1487 			offset = MSR_PKG_ENERGY_STATUS;
1488 		break;
1489 	case IDX_DRAM_ENERGY:
1490 		offset = MSR_DRAM_ENERGY_STATUS;
1491 		break;
1492 	case IDX_PP0_ENERGY:
1493 		offset = MSR_PP0_ENERGY_STATUS;
1494 		break;
1495 	case IDX_PP1_ENERGY:
1496 		offset = MSR_PP1_ENERGY_STATUS;
1497 		break;
1498 	case IDX_PKG_PERF:
1499 		offset = MSR_PKG_PERF_STATUS;
1500 		break;
1501 	case IDX_DRAM_PERF:
1502 		offset = MSR_DRAM_PERF_STATUS;
1503 		break;
1504 	default:
1505 		offset = -1;
1506 	}
1507 	return offset;
1508 }
1509 
1510 int offset_to_idx(off_t offset)
1511 {
1512 	int idx;
1513 
1514 	switch (offset) {
1515 	case MSR_PKG_ENERGY_STATUS:
1516 	case MSR_PKG_ENERGY_STAT:
1517 		idx = IDX_PKG_ENERGY;
1518 		break;
1519 	case MSR_DRAM_ENERGY_STATUS:
1520 		idx = IDX_DRAM_ENERGY;
1521 		break;
1522 	case MSR_PP0_ENERGY_STATUS:
1523 		idx = IDX_PP0_ENERGY;
1524 		break;
1525 	case MSR_PP1_ENERGY_STATUS:
1526 		idx = IDX_PP1_ENERGY;
1527 		break;
1528 	case MSR_PKG_PERF_STATUS:
1529 		idx = IDX_PKG_PERF;
1530 		break;
1531 	case MSR_DRAM_PERF_STATUS:
1532 		idx = IDX_DRAM_PERF;
1533 		break;
1534 	default:
1535 		idx = -1;
1536 	}
1537 	return idx;
1538 }
1539 
1540 int idx_valid(int idx)
1541 {
1542 	switch (idx) {
1543 	case IDX_PKG_ENERGY:
1544 		return platform->rapl_msrs & (RAPL_PKG | RAPL_AMD_F17H);
1545 	case IDX_DRAM_ENERGY:
1546 		return platform->rapl_msrs & RAPL_DRAM;
1547 	case IDX_PP0_ENERGY:
1548 		return platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS;
1549 	case IDX_PP1_ENERGY:
1550 		return platform->rapl_msrs & RAPL_GFX;
1551 	case IDX_PKG_PERF:
1552 		return platform->rapl_msrs & RAPL_PKG_PERF_STATUS;
1553 	case IDX_DRAM_PERF:
1554 		return platform->rapl_msrs & RAPL_DRAM_PERF_STATUS;
1555 	default:
1556 		return 0;
1557 	}
1558 }
1559 
/* Linked lists and counts of the user-added (--add) thread/core/package counters. */
struct sys_counters {
	unsigned int added_thread_counters;
	unsigned int added_core_counters;
	unsigned int added_package_counters;
	struct msr_counter *tp;
	struct msr_counter *cp;
	struct msr_counter *pp;
} sys;
1568 
/*
 * Unlink and free every MSR-based counter (msr_num != 0) from the list
 * headed at *pp, leaving non-MSR counters in place.  Returns the number
 * of entries freed so the caller can adjust its count.
 */
static size_t free_msr_counters_(struct msr_counter **pp)
{
	struct msr_counter *p = NULL;
	size_t num_freed = 0;

	while (*pp) {
		p = *pp;

		if (p->msr_num != 0) {
			/* Splice this node out of the list, then free it. */
			*pp = p->next;

			free(p);
			++num_freed;

			continue;
		}

		/* Keep this node: advance the link pointer past it. */
		pp = &p->next;
	}

	return num_freed;
}
1591 
1592 /*
1593  * Free all added counters accessed via msr.
1594  */
1595 static void free_sys_msr_counters(void)
1596 {
1597 	/* Thread counters */
1598 	sys.added_thread_counters -= free_msr_counters_(&sys.tp);
1599 
1600 	/* Core counters */
1601 	sys.added_core_counters -= free_msr_counters_(&sys.cp);
1602 
1603 	/* Package counters */
1604 	sys.added_package_counters -= free_msr_counters_(&sys.pp);
1605 }
1606 
/* System-wide averages printed on the summary line. */
struct system_summary {
	struct thread_data threads;
	struct core_data cores;
	struct pkg_data packages;
} average;
1612 
/* Topology record for one logical CPU; "cpus" is indexed by cpu number. */
struct cpu_topology {
	int physical_package_id;
	int die_id;
	int logical_cpu_id;
	int physical_node_id;
	int logical_node_id;	/* 0-based count within the package */
	int physical_core_id;
	int thread_id;
	cpu_set_t *put_ids;	/* Processing Unit/Thread IDs */
} *cpus;
1623 
/* System-wide topology counts; "allowed_*" reflect the --cpu subset. */
struct topo_params {
	int num_packages;
	int num_die;
	int num_cpus;
	int num_cores;
	int allowed_packages;
	int allowed_cpus;
	int allowed_cores;
	int max_cpu_num;
	int max_core_id;
	int max_package_id;
	int max_die_id;
	int max_node_num;
	int nodes_per_pkg;
	int cores_per_node;
	int threads_per_core;
} topo;
1641 
struct timeval tv_even, tv_odd, tv_delta;

int *irq_column_2_cpu;		/* /proc/interrupts column numbers */
int *irqs_per_cpu;		/* indexed by cpu_num */

void setup_all_buffers(bool startup);

/* sys_lpi_file is pointed at one of the two candidate paths below;
 * the selection happens elsewhere in this file. */
char *sys_lpi_file;
char *sys_lpi_file_sysfs = "/sys/devices/system/cpu/cpuidle/low_power_idle_system_residency_us";
char *sys_lpi_file_debugfs = "/sys/kernel/debug/pmc_core/slp_s0_residency_usec";
1652 
/* Return non-zero when "cpu" is absent from the present-CPU set. */
int cpu_is_not_present(int cpu)
{
	return !CPU_ISSET_S(cpu, cpu_present_setsize, cpu_present_set);
}
1657 
/* Return non-zero when "cpu" is excluded from the allowed-CPU set. */
int cpu_is_not_allowed(int cpu)
{
	return !CPU_ISSET_S(cpu, cpu_allowed_setsize, cpu_allowed_set);
}
1662 
1663 /*
1664  * run func(thread, core, package) in topology order
1665  * skip non-present cpus
1666  */
1667 
1668 int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pkg_data *),
1669 		 struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base)
1670 {
1671 	int retval, pkg_no, core_no, thread_no, node_no;
1672 
1673 	for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) {
1674 		for (node_no = 0; node_no < topo.nodes_per_pkg; node_no++) {
1675 			for (core_no = 0; core_no < topo.cores_per_node; ++core_no) {
1676 				for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) {
1677 					struct thread_data *t;
1678 					struct core_data *c;
1679 					struct pkg_data *p;
1680 					t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no);
1681 
1682 					if (cpu_is_not_allowed(t->cpu_id))
1683 						continue;
1684 
1685 					c = GET_CORE(core_base, core_no, node_no, pkg_no);
1686 					p = GET_PKG(pkg_base, pkg_no);
1687 
1688 					retval = func(t, c, p);
1689 					if (retval)
1690 						return retval;
1691 				}
1692 			}
1693 		}
1694 	}
1695 	return 0;
1696 }
1697 
/* True when t is its core's designated base CPU (or the core has none yet). */
int is_cpu_first_thread_in_core(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
	UNUSED(p);

	return ((int)t->cpu_id == c->base_cpu || c->base_cpu < 0);
}
1704 
/* True when t is its package's designated base CPU (or the package has none yet). */
int is_cpu_first_core_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
	UNUSED(c);

	return ((int)t->cpu_id == p->base_cpu || p->base_cpu < 0);
}
1711 
/* True when t is both first thread in its core and first core in its package. */
int is_cpu_first_thread_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
	return is_cpu_first_thread_in_core(t, c, p) && is_cpu_first_core_in_package(t, c, p);
}
1716 
1717 int cpu_migrate(int cpu)
1718 {
1719 	CPU_ZERO_S(cpu_affinity_setsize, cpu_affinity_set);
1720 	CPU_SET_S(cpu, cpu_affinity_setsize, cpu_affinity_set);
1721 	if (sched_setaffinity(0, cpu_affinity_setsize, cpu_affinity_set) == -1)
1722 		return -1;
1723 	else
1724 		return 0;
1725 }
1726 
1727 int get_msr_fd(int cpu)
1728 {
1729 	char pathname[32];
1730 	int fd;
1731 
1732 	fd = fd_percpu[cpu];
1733 
1734 	if (fd)
1735 		return fd;
1736 
1737 	sprintf(pathname, "/dev/cpu/%d/msr", cpu);
1738 	fd = open(pathname, O_RDONLY);
1739 	if (fd < 0)
1740 		err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, "
1741 		    "or run with --no-msr, or run as root", pathname);
1742 
1743 	fd_percpu[cpu] = fd;
1744 
1745 	return fd;
1746 }
1747 
/*
 * Called when MSR access is unavailable (--no-msr): disable the built-in
 * counters that can only be read via MSR and free user-added MSR counters.
 */
static void bic_disable_msr_access(void)
{
	const unsigned long bic_msrs = BIC_SMI | BIC_Mod_c6 | BIC_CoreTmp |
	    BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_PkgTmp;

	bic_enabled &= ~bic_msrs;

	free_sys_msr_counters();
}
1757 
/* Thin wrapper for the perf_event_open(2) syscall; must not run with --no-perf. */
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags)
{
	assert(!no_perf);

	return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
}
1764 
1765 static long open_perf_counter(int cpu, unsigned int type, unsigned int config, int group_fd, __u64 read_format)
1766 {
1767 	struct perf_event_attr attr;
1768 	const pid_t pid = -1;
1769 	const unsigned long flags = 0;
1770 
1771 	assert(!no_perf);
1772 
1773 	memset(&attr, 0, sizeof(struct perf_event_attr));
1774 
1775 	attr.type = type;
1776 	attr.size = sizeof(struct perf_event_attr);
1777 	attr.config = config;
1778 	attr.disabled = 0;
1779 	attr.sample_type = PERF_SAMPLE_IDENTIFIER;
1780 	attr.read_format = read_format;
1781 
1782 	const int fd = perf_event_open(&attr, pid, cpu, group_fd, flags);
1783 
1784 	return fd;
1785 }
1786 
/* Return a (cached) perf fd counting retired instructions on "cpu". */
int get_instr_count_fd(int cpu)
{
	if (fd_instr_count_percpu[cpu])
		return fd_instr_count_percpu[cpu];

	fd_instr_count_percpu[cpu] = open_perf_counter(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0);

	return fd_instr_count_percpu[cpu];
}
1796 
1797 int get_msr(int cpu, off_t offset, unsigned long long *msr)
1798 {
1799 	ssize_t retval;
1800 
1801 	assert(!no_msr);
1802 
1803 	retval = pread(get_msr_fd(cpu), msr, sizeof(*msr), offset);
1804 
1805 	if (retval != sizeof *msr)
1806 		err(-1, "cpu%d: msr offset 0x%llx read failed", cpu, (unsigned long long)offset);
1807 
1808 	return 0;
1809 }
1810 
1811 int probe_msr(int cpu, off_t offset)
1812 {
1813 	ssize_t retval;
1814 	unsigned long long dummy;
1815 
1816 	assert(!no_msr);
1817 
1818 	retval = pread(get_msr_fd(cpu), &dummy, sizeof(dummy), offset);
1819 
1820 	if (retval != sizeof(dummy))
1821 		return 1;
1822 
1823 	return 0;
1824 }
1825 
#define MAX_DEFERRED 16
/* --add/--show/--hide names seen before probing; applied after probe completes. */
char *deferred_add_names[MAX_DEFERRED];
char *deferred_skip_names[MAX_DEFERRED];
int deferred_add_index;
int deferred_skip_index;

/*
 * HIDE_LIST - hide this list of counters, show the rest [default]
 * SHOW_LIST - show this list of counters, hide the rest
 */
enum show_hide_mode { SHOW_LIST, HIDE_LIST } global_show_hide_mode = HIDE_LIST;
1837 
/* Print the command-line usage message to the output stream "outf". */
void help(void)
{
	fprintf(outf,
		"Usage: turbostat [OPTIONS][(--interval seconds) | COMMAND ...]\n"
		"\n"
		"Turbostat forks the specified COMMAND and prints statistics\n"
		"when COMMAND completes.\n"
		"If no COMMAND is specified, turbostat wakes every 5-seconds\n"
		"to print statistics, until interrupted.\n"
		"  -a, --add	add a counter\n"
		"		  eg. --add msr0x10,u64,cpu,delta,MY_TSC\n"
		"  -c, --cpu	cpu-set	limit output to summary plus cpu-set:\n"
		"		  {core | package | j,k,l..m,n-p }\n"
		"  -d, --debug	displays usec, Time_Of_Day_Seconds and more debugging\n"
		"  -D, --Dump	displays the raw counter values\n"
		"  -e, --enable	[all | column]\n"
		"		shows all or the specified disabled column\n"
		"  -H, --hide [column|column,column,...]\n"
		"		hide the specified column(s)\n"
		"  -i, --interval sec.subsec\n"
		"		Override default 5-second measurement interval\n"
		"  -J, --Joules	displays energy in Joules instead of Watts\n"
		"  -l, --list	list column headers only\n"
		"  -M, --no-msr Disable all uses of the MSR driver\n"
		"  -P, --no-perf Disable all uses of the perf API\n"
		"  -n, --num_iterations num\n"
		"		number of the measurement iterations\n"
		"  -N, --header_iterations num\n"
		"		print header every num iterations\n"
		"  -o, --out file\n"
		"		create or truncate \"file\" for all output\n"
		"  -q, --quiet	skip decoding system configuration header\n"
		"  -s, --show [column|column,column,...]\n"
		"		show only the specified column(s)\n"
		"  -S, --Summary\n"
		"		limits output to 1-line system summary per interval\n"
		"  -T, --TCC temperature\n"
		"		sets the Thermal Control Circuit temperature in\n"
		"		  degrees Celsius\n"
		"  -h, --help	print this help message\n"
		"  -v, --version	print version information\n" "\n" "For more help, run \"man turbostat\"\n");
}
1880 
1881 /*
1882  * bic_lookup
1883  * for all the strings in comma separate name_list,
1884  * set the approprate bit in return value.
1885  */
1886 unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode)
1887 {
1888 	unsigned int i;
1889 	unsigned long long retval = 0;
1890 
1891 	while (name_list) {
1892 		char *comma;
1893 
1894 		comma = strchr(name_list, ',');
1895 
1896 		if (comma)
1897 			*comma = '\0';
1898 
1899 		for (i = 0; i < MAX_BIC; ++i) {
1900 			if (!strcmp(name_list, bic[i].name)) {
1901 				retval |= (1ULL << i);
1902 				break;
1903 			}
1904 			if (!strcmp(name_list, "all")) {
1905 				retval |= ~0;
1906 				break;
1907 			} else if (!strcmp(name_list, "topology")) {
1908 				retval |= BIC_TOPOLOGY;
1909 				break;
1910 			} else if (!strcmp(name_list, "power")) {
1911 				retval |= BIC_THERMAL_PWR;
1912 				break;
1913 			} else if (!strcmp(name_list, "idle")) {
1914 				retval |= BIC_IDLE;
1915 				break;
1916 			} else if (!strcmp(name_list, "frequency")) {
1917 				retval |= BIC_FREQUENCY;
1918 				break;
1919 			} else if (!strcmp(name_list, "other")) {
1920 				retval |= BIC_OTHER;
1921 				break;
1922 			}
1923 
1924 		}
1925 		if (i == MAX_BIC) {
1926 			if (mode == SHOW_LIST) {
1927 				deferred_add_names[deferred_add_index++] = name_list;
1928 				if (deferred_add_index >= MAX_DEFERRED) {
1929 					fprintf(stderr, "More than max %d un-recognized --add options '%s'\n",
1930 						MAX_DEFERRED, name_list);
1931 					help();
1932 					exit(1);
1933 				}
1934 			} else {
1935 				deferred_skip_names[deferred_skip_index++] = name_list;
1936 				if (debug)
1937 					fprintf(stderr, "deferred \"%s\"\n", name_list);
1938 				if (deferred_skip_index >= MAX_DEFERRED) {
1939 					fprintf(stderr, "More than max %d un-recognized --skip options '%s'\n",
1940 						MAX_DEFERRED, name_list);
1941 					help();
1942 					exit(1);
1943 				}
1944 			}
1945 		}
1946 
1947 		name_list = comma;
1948 		if (name_list)
1949 			name_list++;
1950 
1951 	}
1952 	return retval;
1953 }
1954 
/*
 * print_header()
 * Emit one line of column headers into the output buffer at "outp",
 * separated by "delim".  The column order here must match the value
 * order produced by format_counters() below.
 */
void print_header(char *delim)
{
	struct msr_counter *mp;
	int printed = 0;	/* suppress the delimiter before the first column */

	if (DO_BIC(BIC_USEC))
		outp += sprintf(outp, "%susec", (printed++ ? delim : ""));
	if (DO_BIC(BIC_TOD))
		outp += sprintf(outp, "%sTime_Of_Day_Seconds", (printed++ ? delim : ""));
	if (DO_BIC(BIC_Package))
		outp += sprintf(outp, "%sPackage", (printed++ ? delim : ""));
	if (DO_BIC(BIC_Die))
		outp += sprintf(outp, "%sDie", (printed++ ? delim : ""));
	if (DO_BIC(BIC_Node))
		outp += sprintf(outp, "%sNode", (printed++ ? delim : ""));
	if (DO_BIC(BIC_Core))
		outp += sprintf(outp, "%sCore", (printed++ ? delim : ""));
	if (DO_BIC(BIC_CPU))
		outp += sprintf(outp, "%sCPU", (printed++ ? delim : ""));
	if (DO_BIC(BIC_APIC))
		outp += sprintf(outp, "%sAPIC", (printed++ ? delim : ""));
	if (DO_BIC(BIC_X2APIC))
		outp += sprintf(outp, "%sX2APIC", (printed++ ? delim : ""));
	if (DO_BIC(BIC_Avg_MHz))
		outp += sprintf(outp, "%sAvg_MHz", (printed++ ? delim : ""));
	if (DO_BIC(BIC_Busy))
		outp += sprintf(outp, "%sBusy%%", (printed++ ? delim : ""));
	if (DO_BIC(BIC_Bzy_MHz))
		outp += sprintf(outp, "%sBzy_MHz", (printed++ ? delim : ""));
	if (DO_BIC(BIC_TSC_MHz))
		outp += sprintf(outp, "%sTSC_MHz", (printed++ ? delim : ""));

	if (DO_BIC(BIC_IPC))
		outp += sprintf(outp, "%sIPC", (printed++ ? delim : ""));

	if (DO_BIC(BIC_IRQ)) {
		/* wide column when summed values may not fit the short format */
		if (sums_need_wide_columns)
			outp += sprintf(outp, "%s     IRQ", (printed++ ? delim : ""));
		else
			outp += sprintf(outp, "%sIRQ", (printed++ ? delim : ""));
	}

	if (DO_BIC(BIC_SMI))
		outp += sprintf(outp, "%sSMI", (printed++ ? delim : ""));

	/* user-added thread-scope counters */
	for (mp = sys.tp; mp; mp = mp->next) {

		if (mp->format == FORMAT_RAW) {
			if (mp->width == 64)
				outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), mp->name);
			else
				outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), mp->name);
		} else {
			if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
				outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), mp->name);
			else
				outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), mp->name);
		}
	}

	if (DO_BIC(BIC_CPU_c1))
		outp += sprintf(outp, "%sCPU%%c1", (printed++ ? delim : ""));
	if (DO_BIC(BIC_CPU_c3))
		outp += sprintf(outp, "%sCPU%%c3", (printed++ ? delim : ""));
	if (DO_BIC(BIC_CPU_c6))
		outp += sprintf(outp, "%sCPU%%c6", (printed++ ? delim : ""));
	if (DO_BIC(BIC_CPU_c7))
		outp += sprintf(outp, "%sCPU%%c7", (printed++ ? delim : ""));

	if (DO_BIC(BIC_Mod_c6))
		outp += sprintf(outp, "%sMod%%c6", (printed++ ? delim : ""));

	if (DO_BIC(BIC_CoreTmp))
		outp += sprintf(outp, "%sCoreTmp", (printed++ ? delim : ""));

	if (DO_BIC(BIC_CORE_THROT_CNT))
		outp += sprintf(outp, "%sCoreThr", (printed++ ? delim : ""));

	/* per-core RAPL column appears here only when the platform reports it per-core */
	if (platform->rapl_msrs && !rapl_joules) {
		if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl)
			outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : ""));
	} else if (platform->rapl_msrs && rapl_joules) {
		if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl)
			outp += sprintf(outp, "%sCor_J", (printed++ ? delim : ""));
	}

	/*
	 * user-added core-scope counters
	 * NOTE(review): this loop (and the sys.pp loop below) prints "delim"
	 * unconditionally instead of the (printed++ ? delim : "") gate used
	 * everywhere else -- confirm whether a leading delimiter is intended
	 * when no column preceded these.
	 */
	for (mp = sys.cp; mp; mp = mp->next) {
		if (mp->format == FORMAT_RAW) {
			if (mp->width == 64)
				outp += sprintf(outp, "%s%18.18s", delim, mp->name);
			else
				outp += sprintf(outp, "%s%10.10s", delim, mp->name);
		} else {
			if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
				outp += sprintf(outp, "%s%8s", delim, mp->name);
			else
				outp += sprintf(outp, "%s%s", delim, mp->name);
		}
	}

	if (DO_BIC(BIC_PkgTmp))
		outp += sprintf(outp, "%sPkgTmp", (printed++ ? delim : ""));

	if (DO_BIC(BIC_GFX_rc6))
		outp += sprintf(outp, "%sGFX%%rc6", (printed++ ? delim : ""));

	if (DO_BIC(BIC_GFXMHz))
		outp += sprintf(outp, "%sGFXMHz", (printed++ ? delim : ""));

	if (DO_BIC(BIC_GFXACTMHz))
		outp += sprintf(outp, "%sGFXAMHz", (printed++ ? delim : ""));

	if (DO_BIC(BIC_SAM_mc6))
		outp += sprintf(outp, "%sSAM%%mc6", (printed++ ? delim : ""));

	if (DO_BIC(BIC_SAMMHz))
		outp += sprintf(outp, "%sSAMMHz", (printed++ ? delim : ""));

	if (DO_BIC(BIC_SAMACTMHz))
		outp += sprintf(outp, "%sSAMAMHz", (printed++ ? delim : ""));

	if (DO_BIC(BIC_Totl_c0))
		outp += sprintf(outp, "%sTotl%%C0", (printed++ ? delim : ""));
	if (DO_BIC(BIC_Any_c0))
		outp += sprintf(outp, "%sAny%%C0", (printed++ ? delim : ""));
	if (DO_BIC(BIC_GFX_c0))
		outp += sprintf(outp, "%sGFX%%C0", (printed++ ? delim : ""));
	if (DO_BIC(BIC_CPUGFX))
		outp += sprintf(outp, "%sCPUGFX%%", (printed++ ? delim : ""));

	if (DO_BIC(BIC_Pkgpc2))
		outp += sprintf(outp, "%sPkg%%pc2", (printed++ ? delim : ""));
	if (DO_BIC(BIC_Pkgpc3))
		outp += sprintf(outp, "%sPkg%%pc3", (printed++ ? delim : ""));
	if (DO_BIC(BIC_Pkgpc6))
		outp += sprintf(outp, "%sPkg%%pc6", (printed++ ? delim : ""));
	if (DO_BIC(BIC_Pkgpc7))
		outp += sprintf(outp, "%sPkg%%pc7", (printed++ ? delim : ""));
	if (DO_BIC(BIC_Pkgpc8))
		outp += sprintf(outp, "%sPkg%%pc8", (printed++ ? delim : ""));
	if (DO_BIC(BIC_Pkgpc9))
		outp += sprintf(outp, "%sPkg%%pc9", (printed++ ? delim : ""));
	/* "Pk%pc10" is abbreviated deliberately to keep the column 7 chars wide */
	if (DO_BIC(BIC_Pkgpc10))
		outp += sprintf(outp, "%sPk%%pc10", (printed++ ? delim : ""));
	if (DO_BIC(BIC_CPU_LPI))
		outp += sprintf(outp, "%sCPU%%LPI", (printed++ ? delim : ""));
	if (DO_BIC(BIC_SYS_LPI))
		outp += sprintf(outp, "%sSYS%%LPI", (printed++ ? delim : ""));

	if (platform->rapl_msrs && !rapl_joules) {
		if (DO_BIC(BIC_PkgWatt))
			outp += sprintf(outp, "%sPkgWatt", (printed++ ? delim : ""));
		if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl)
			outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : ""));
		if (DO_BIC(BIC_GFXWatt))
			outp += sprintf(outp, "%sGFXWatt", (printed++ ? delim : ""));
		if (DO_BIC(BIC_RAMWatt))
			outp += sprintf(outp, "%sRAMWatt", (printed++ ? delim : ""));
		if (DO_BIC(BIC_PKG__))
			outp += sprintf(outp, "%sPKG_%%", (printed++ ? delim : ""));
		if (DO_BIC(BIC_RAM__))
			outp += sprintf(outp, "%sRAM_%%", (printed++ ? delim : ""));
	} else if (platform->rapl_msrs && rapl_joules) {
		if (DO_BIC(BIC_Pkg_J))
			outp += sprintf(outp, "%sPkg_J", (printed++ ? delim : ""));
		if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl)
			outp += sprintf(outp, "%sCor_J", (printed++ ? delim : ""));
		if (DO_BIC(BIC_GFX_J))
			outp += sprintf(outp, "%sGFX_J", (printed++ ? delim : ""));
		if (DO_BIC(BIC_RAM_J))
			outp += sprintf(outp, "%sRAM_J", (printed++ ? delim : ""));
		if (DO_BIC(BIC_PKG__))
			outp += sprintf(outp, "%sPKG_%%", (printed++ ? delim : ""));
		if (DO_BIC(BIC_RAM__))
			outp += sprintf(outp, "%sRAM_%%", (printed++ ? delim : ""));
	}
	if (DO_BIC(BIC_UNCORE_MHZ))
		outp += sprintf(outp, "%sUncMHz", (printed++ ? delim : ""));

	/* user-added package-scope counters */
	for (mp = sys.pp; mp; mp = mp->next) {
		if (mp->format == FORMAT_RAW) {
			if (mp->width == 64)
				outp += sprintf(outp, "%s%18.18s", delim, mp->name);
			else if (mp->width == 32)
				outp += sprintf(outp, "%s%10.10s", delim, mp->name);
			else
				outp += sprintf(outp, "%s%7.7s", delim, mp->name);
		} else {
			if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
				outp += sprintf(outp, "%s%8s", delim, mp->name);
			else
				outp += sprintf(outp, "%s%7.7s", delim, mp->name);
		}
	}

	outp += sprintf(outp, "\n");
}
2152 
/*
 * dump_counters()
 * Debug helper (--Dump): print the raw counter values for a thread,
 * its core (only for the first thread in the core) and its package
 * (only for the first core in the package).  Returns 0, the
 * for_all_cpus() callback convention.
 */
int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
	int i;
	struct msr_counter *mp;

	outp += sprintf(outp, "t %p, c %p, p %p\n", t, c, p);

	if (t) {
		outp += sprintf(outp, "CPU: %d flags 0x%x\n", t->cpu_id, t->flags);
		outp += sprintf(outp, "TSC: %016llX\n", t->tsc);
		outp += sprintf(outp, "aperf: %016llX\n", t->aperf);
		outp += sprintf(outp, "mperf: %016llX\n", t->mperf);
		outp += sprintf(outp, "c1: %016llX\n", t->c1);

		if (DO_BIC(BIC_IPC))
			outp += sprintf(outp, "IPC: %lld\n", t->instr_count);

		if (DO_BIC(BIC_IRQ))
			outp += sprintf(outp, "IRQ: %lld\n", t->irq_count);
		if (DO_BIC(BIC_SMI))
			outp += sprintf(outp, "SMI: %d\n", t->smi_count);

		/* user-added thread-scope counters */
		for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
			outp +=
			    sprintf(outp, "tADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num,
				    t->counter[i], mp->sp->path);
		}
	}

	if (c && is_cpu_first_thread_in_core(t, c, p)) {
		outp += sprintf(outp, "core: %d\n", c->core_id);
		outp += sprintf(outp, "c3: %016llX\n", c->c3);
		outp += sprintf(outp, "c6: %016llX\n", c->c6);
		outp += sprintf(outp, "c7: %016llX\n", c->c7);
		outp += sprintf(outp, "DTS: %dC\n", c->core_temp_c);
		outp += sprintf(outp, "cpu_throt_count: %016llX\n", c->core_throt_cnt);

		/*
		 * NOTE(review): raw_value * scale is a floating-point joule
		 * value truncated into an integer here, then printed in hex
		 * below -- confirm the truncation and %llX formatting are
		 * intended for this debug output.
		 */
		const unsigned long long energy_value = c->core_energy.raw_value * c->core_energy.scale;
		const double energy_scale = c->core_energy.scale;

		if (c->core_energy.unit == RAPL_UNIT_JOULES)
			outp += sprintf(outp, "Joules: %0llX (scale: %lf)\n", energy_value, energy_scale);

		/* user-added core-scope counters */
		for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
			outp +=
			    sprintf(outp, "cADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num,
				    c->counter[i], mp->sp->path);
		}
		outp += sprintf(outp, "mc6_us: %016llX\n", c->mc6_us);
	}

	if (p && is_cpu_first_core_in_package(t, c, p)) {
		outp += sprintf(outp, "package: %d\n", p->package_id);

		outp += sprintf(outp, "Weighted cores: %016llX\n", p->pkg_wtd_core_c0);
		outp += sprintf(outp, "Any cores: %016llX\n", p->pkg_any_core_c0);
		outp += sprintf(outp, "Any GFX: %016llX\n", p->pkg_any_gfxe_c0);
		outp += sprintf(outp, "CPU + GFX: %016llX\n", p->pkg_both_core_gfxe_c0);

		outp += sprintf(outp, "pc2: %016llX\n", p->pc2);
		if (DO_BIC(BIC_Pkgpc3))
			outp += sprintf(outp, "pc3: %016llX\n", p->pc3);
		if (DO_BIC(BIC_Pkgpc6))
			outp += sprintf(outp, "pc6: %016llX\n", p->pc6);
		if (DO_BIC(BIC_Pkgpc7))
			outp += sprintf(outp, "pc7: %016llX\n", p->pc7);
		outp += sprintf(outp, "pc8: %016llX\n", p->pc8);
		outp += sprintf(outp, "pc9: %016llX\n", p->pc9);
		outp += sprintf(outp, "pc10: %016llX\n", p->pc10);
		outp += sprintf(outp, "cpu_lpi: %016llX\n", p->cpu_lpi);
		outp += sprintf(outp, "sys_lpi: %016llX\n", p->sys_lpi);
		outp += sprintf(outp, "Joules PKG: %0llX\n", p->energy_pkg.raw_value);
		outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores.raw_value);
		outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx.raw_value);
		outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram.raw_value);
		outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status.raw_value);
		outp += sprintf(outp, "Throttle RAM: %0llX\n", p->rapl_dram_perf_status.raw_value);
		outp += sprintf(outp, "PTM: %dC\n", p->pkg_temp_c);

		/* user-added package-scope counters */
		for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
			outp +=
			    sprintf(outp, "pADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num,
				    p->counter[i], mp->sp->path);
		}
	}

	outp += sprintf(outp, "\n");

	return 0;
}
2243 
2244 double rapl_counter_get_value(const struct rapl_counter *c, enum rapl_unit desired_unit, double interval)
2245 {
2246 	assert(desired_unit != RAPL_UNIT_INVALID);
2247 
2248 	/*
2249 	 * For now we don't expect anything other than joules,
2250 	 * so just simplify the logic.
2251 	 */
2252 	assert(c->unit == RAPL_UNIT_JOULES);
2253 
2254 	const double scaled = c->raw_value * c->scale;
2255 
2256 	if (desired_unit == RAPL_UNIT_WATTS)
2257 		return scaled / interval;
2258 	return scaled;
2259 }
2260 
2261 /*
2262  * column formatting convention & formats
2263  */
/*
 * format_counters()
 * Render one output row into the buffer at "outp" for CPU "t"
 * (or the system summary row when t == &average.threads).
 * Column order must match print_header() above.
 * Returns 0, the for_all_cpus() callback convention.
 */
int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
	double interval_float, tsc;
	char *fmt8;
	int i;
	struct msr_counter *mp;
	char *delim = "\t";
	int printed = 0;	/* suppress the delimiter before the first column */

	/* if showing only 1st thread in core and this isn't one, bail out */
	if (show_core_only && !is_cpu_first_thread_in_core(t, c, p))
		return 0;

	/* if showing only 1st thread in pkg and this isn't one, bail out */
	if (show_pkg_only && !is_cpu_first_core_in_package(t, c, p))
		return 0;

	/* if not summary line and --cpu is used, skip CPUs outside the subset */
	if ((t != &average.threads) && (cpu_subset && !CPU_ISSET_S(t->cpu_id, cpu_subset_size, cpu_subset)))
		return 0;

	if (DO_BIC(BIC_USEC)) {
		/* on each row, print how many usec each timestamp took to gather */
		struct timeval tv;

		timersub(&t->tv_end, &t->tv_begin, &tv);
		outp += sprintf(outp, "%5ld\t", tv.tv_sec * 1000000 + tv.tv_usec);
	}

	/* Time_Of_Day_Seconds: on each row, print sec.usec last timestamp taken */
	if (DO_BIC(BIC_TOD))
		outp += sprintf(outp, "%10ld.%06ld\t", t->tv_end.tv_sec, t->tv_end.tv_usec);

	interval_float = t->tv_delta.tv_sec + t->tv_delta.tv_usec / 1000000.0;

	/* scale TSC by the platform-specific tweak factor */
	tsc = t->tsc * tsc_tweak;

	/* topo columns, print blanks on 1st (average) line */
	if (t == &average.threads) {
		if (DO_BIC(BIC_Package))
			outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
		if (DO_BIC(BIC_Die))
			outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
		if (DO_BIC(BIC_Node))
			outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
		if (DO_BIC(BIC_Core))
			outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
		if (DO_BIC(BIC_CPU))
			outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
		if (DO_BIC(BIC_APIC))
			outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
		if (DO_BIC(BIC_X2APIC))
			outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
	} else {
		if (DO_BIC(BIC_Package)) {
			if (p)
				outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->package_id);
			else
				outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
		}
		if (DO_BIC(BIC_Die)) {
			if (c)
				outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), cpus[t->cpu_id].die_id);
			else
				outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
		}
		if (DO_BIC(BIC_Node)) {
			if (t)
				outp += sprintf(outp, "%s%d",
						(printed++ ? delim : ""), cpus[t->cpu_id].physical_node_id);
			else
				outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
		}
		if (DO_BIC(BIC_Core)) {
			if (c)
				outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), c->core_id);
			else
				outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
		}
		if (DO_BIC(BIC_CPU))
			outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->cpu_id);
		if (DO_BIC(BIC_APIC))
			outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->apic_id);
		if (DO_BIC(BIC_X2APIC))
			outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->x2apic_id);
	}

	if (DO_BIC(BIC_Avg_MHz))
		outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 / units * t->aperf / interval_float);

	if (DO_BIC(BIC_Busy))
		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->mperf / tsc);

	if (DO_BIC(BIC_Bzy_MHz)) {
		if (has_base_hz)
			outp +=
			    sprintf(outp, "%s%.0f", (printed++ ? delim : ""), base_hz / units * t->aperf / t->mperf);
		else
			outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""),
					tsc / units * t->aperf / t->mperf / interval_float);
	}

	if (DO_BIC(BIC_TSC_MHz))
		outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 * t->tsc / units / interval_float);

	/* instructions retired per APERF cycle */
	if (DO_BIC(BIC_IPC))
		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 1.0 * t->instr_count / t->aperf);

	/* IRQ */
	if (DO_BIC(BIC_IRQ)) {
		if (sums_need_wide_columns)
			outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), t->irq_count);
		else
			outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->irq_count);
	}

	/* SMI */
	if (DO_BIC(BIC_SMI))
		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->smi_count);

	/* Added counters (thread scope) */
	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
		if (mp->format == FORMAT_RAW) {
			if (mp->width == 32)
				outp +=
				    sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)t->counter[i]);
			else
				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->counter[i]);
		} else if (mp->format == FORMAT_DELTA) {
			if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
				outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), t->counter[i]);
			else
				outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->counter[i]);
		} else if (mp->format == FORMAT_PERCENT) {
			if (mp->type == COUNTER_USEC)
				outp +=
				    sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
					    t->counter[i] / interval_float / 10000);
			else
				outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->counter[i] / tsc);
		}
	}

	/* C1 */
	if (DO_BIC(BIC_CPU_c1))
		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1 / tsc);

	/* print per-core data only for 1st thread in core */
	if (!is_cpu_first_thread_in_core(t, c, p))
		goto done;

	if (DO_BIC(BIC_CPU_c3))
		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c3 / tsc);
	if (DO_BIC(BIC_CPU_c6))
		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c6 / tsc);
	if (DO_BIC(BIC_CPU_c7))
		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c7 / tsc);

	/* Mod%c6 */
	if (DO_BIC(BIC_Mod_c6))
		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->mc6_us / tsc);

	if (DO_BIC(BIC_CoreTmp))
		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), c->core_temp_c);

	/* Core throttle count */
	if (DO_BIC(BIC_CORE_THROT_CNT))
		outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->core_throt_cnt);

	/* Added counters (core scope) */
	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
		if (mp->format == FORMAT_RAW) {
			if (mp->width == 32)
				outp +=
				    sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)c->counter[i]);
			else
				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->counter[i]);
		} else if (mp->format == FORMAT_DELTA) {
			if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
				outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), c->counter[i]);
			else
				outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->counter[i]);
		} else if (mp->format == FORMAT_PERCENT) {
			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->counter[i] / tsc);
		}
	}

	fmt8 = "%s%.2f";

	/* per-core RAPL, only when the platform reports energy per core */
	if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl)
		outp +=
		    sprintf(outp, fmt8, (printed++ ? delim : ""),
			    rapl_counter_get_value(&c->core_energy, RAPL_UNIT_WATTS, interval_float));
	if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl)
		outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
				rapl_counter_get_value(&c->core_energy, RAPL_UNIT_JOULES, interval_float));

	/* print per-package data only for 1st core in package */
	if (!is_cpu_first_core_in_package(t, c, p))
		goto done;

	/* PkgTmp */
	if (DO_BIC(BIC_PkgTmp))
		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->pkg_temp_c);

	/* GFXrc6 */
	if (DO_BIC(BIC_GFX_rc6)) {
		if (p->gfx_rc6_ms == -1) {	/* detect GFX counter reset */
			outp += sprintf(outp, "%s**.**", (printed++ ? delim : ""));
		} else {
			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
					p->gfx_rc6_ms / 10.0 / interval_float);
		}
	}

	/* GFXMHz */
	if (DO_BIC(BIC_GFXMHz))
		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->gfx_mhz);

	/* GFXACTMHz */
	if (DO_BIC(BIC_GFXACTMHz))
		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->gfx_act_mhz);

	/* SAMmc6 */
	if (DO_BIC(BIC_SAM_mc6)) {
		if (p->sam_mc6_ms == -1) {	/* detect GFX counter reset */
			outp += sprintf(outp, "%s**.**", (printed++ ? delim : ""));
		} else {
			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
					p->sam_mc6_ms / 10.0 / interval_float);
		}
	}

	/* SAMMHz */
	if (DO_BIC(BIC_SAMMHz))
		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->sam_mhz);

	/* SAMACTMHz */
	if (DO_BIC(BIC_SAMACTMHz))
		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->sam_act_mhz);

	/* Totl%C0, Any%C0 GFX%C0 CPUGFX% */
	if (DO_BIC(BIC_Totl_c0))
		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_wtd_core_c0 / tsc);
	if (DO_BIC(BIC_Any_c0))
		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_core_c0 / tsc);
	if (DO_BIC(BIC_GFX_c0))
		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_gfxe_c0 / tsc);
	if (DO_BIC(BIC_CPUGFX))
		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_both_core_gfxe_c0 / tsc);

	if (DO_BIC(BIC_Pkgpc2))
		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc2 / tsc);
	if (DO_BIC(BIC_Pkgpc3))
		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc3 / tsc);
	if (DO_BIC(BIC_Pkgpc6))
		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc6 / tsc);
	if (DO_BIC(BIC_Pkgpc7))
		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc7 / tsc);
	if (DO_BIC(BIC_Pkgpc8))
		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc8 / tsc);
	if (DO_BIC(BIC_Pkgpc9))
		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc9 / tsc);
	if (DO_BIC(BIC_Pkgpc10))
		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc);

	/*
	 * NOTE(review): the >= 0 guards below only catch a wrap/reset if
	 * cpu_lpi/sys_lpi are signed fields -- confirm the field types;
	 * if unsigned, the "(neg)" branch is unreachable.
	 */
	if (DO_BIC(BIC_CPU_LPI)) {
		if (p->cpu_lpi >= 0)
			outp +=
			    sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
				    100.0 * p->cpu_lpi / 1000000.0 / interval_float);
		else
			outp += sprintf(outp, "%s(neg)", (printed++ ? delim : ""));
	}
	if (DO_BIC(BIC_SYS_LPI)) {
		if (p->sys_lpi >= 0)
			outp +=
			    sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
				    100.0 * p->sys_lpi / 1000000.0 / interval_float);
		else
			outp += sprintf(outp, "%s(neg)", (printed++ ? delim : ""));
	}

	if (DO_BIC(BIC_PkgWatt))
		outp +=
		    sprintf(outp, fmt8, (printed++ ? delim : ""),
			    rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_WATTS, interval_float));
	if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl)
		outp +=
		    sprintf(outp, fmt8, (printed++ ? delim : ""),
			    rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_WATTS, interval_float));
	if (DO_BIC(BIC_GFXWatt))
		outp +=
		    sprintf(outp, fmt8, (printed++ ? delim : ""),
			    rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_WATTS, interval_float));
	if (DO_BIC(BIC_RAMWatt))
		outp +=
		    sprintf(outp, fmt8, (printed++ ? delim : ""),
			    rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_WATTS, interval_float));
	if (DO_BIC(BIC_Pkg_J))
		outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
				rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_JOULES, interval_float));
	if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl)
		outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
				rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_JOULES, interval_float));
	if (DO_BIC(BIC_GFX_J))
		outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
				rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_JOULES, interval_float));
	if (DO_BIC(BIC_RAM_J))
		outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
				rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_JOULES, interval_float));
	if (DO_BIC(BIC_PKG__))
		outp +=
		    sprintf(outp, fmt8, (printed++ ? delim : ""),
			    rapl_counter_get_value(&p->rapl_pkg_perf_status, RAPL_UNIT_WATTS, interval_float));
	if (DO_BIC(BIC_RAM__))
		outp +=
		    sprintf(outp, fmt8, (printed++ ? delim : ""),
			    rapl_counter_get_value(&p->rapl_dram_perf_status, RAPL_UNIT_WATTS, interval_float));
	/* UncMHz */
	if (DO_BIC(BIC_UNCORE_MHZ))
		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->uncore_mhz);

	/* Added counters (package scope) */
	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
		if (mp->format == FORMAT_RAW) {
			if (mp->width == 32)
				outp +=
				    sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)p->counter[i]);
			else
				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->counter[i]);
		} else if (mp->format == FORMAT_DELTA) {
			if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
				outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), p->counter[i]);
			else
				outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->counter[i]);
		} else if (mp->format == FORMAT_PERCENT) {
			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->counter[i] / tsc);
		} else if (mp->type == COUNTER_K2M)
			outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), (unsigned int)p->counter[i] / 1000);
	}

done:
	/* terminate the row; assumes at least one byte was already written to the buffer */
	if (*(outp - 1) != '\n')
		outp += sprintf(outp, "\n");

	return 0;
}
2610 
2611 void flush_output_stdout(void)
2612 {
2613 	FILE *filep;
2614 
2615 	if (outf == stderr)
2616 		filep = stdout;
2617 	else
2618 		filep = outf;
2619 
2620 	fputs(output_buffer, filep);
2621 	fflush(filep);
2622 
2623 	outp = output_buffer;
2624 }
2625 
2626 void flush_output_stderr(void)
2627 {
2628 	fputs(output_buffer, outf);
2629 	fflush(outf);
2630 	outp = output_buffer;
2631 }
2632 
2633 void format_all_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
2634 {
2635 	static int count;
2636 
2637 	if ((!count || (header_iterations && !(count % header_iterations))) || !summary_only)
2638 		print_header("\t");
2639 
2640 	format_counters(&average.threads, &average.cores, &average.packages);
2641 
2642 	count++;
2643 
2644 	if (summary_only)
2645 		return;
2646 
2647 	for_all_cpus(format_counters, t, c, p);
2648 }
2649 
2650 #define DELTA_WRAP32(new, old)			\
2651 	old = ((((unsigned long long)new << 32) - ((unsigned long long)old << 32)) >> 32);
2652 
/*
 * delta_package() - compute per-package interval deltas in place:
 * old = new - old for accumulating counters; instantaneous values
 * (temperature, frequencies) are copied from 'new'.  Returns 0.
 */
int delta_package(struct pkg_data *new, struct pkg_data *old)
{
	int i;
	struct msr_counter *mp;

	/* package-level C0 residency counters, only when displayed */
	if (DO_BIC(BIC_Totl_c0))
		old->pkg_wtd_core_c0 = new->pkg_wtd_core_c0 - old->pkg_wtd_core_c0;
	if (DO_BIC(BIC_Any_c0))
		old->pkg_any_core_c0 = new->pkg_any_core_c0 - old->pkg_any_core_c0;
	if (DO_BIC(BIC_GFX_c0))
		old->pkg_any_gfxe_c0 = new->pkg_any_gfxe_c0 - old->pkg_any_gfxe_c0;
	if (DO_BIC(BIC_CPUGFX))
		old->pkg_both_core_gfxe_c0 = new->pkg_both_core_gfxe_c0 - old->pkg_both_core_gfxe_c0;

	/* package C-state residency deltas */
	old->pc2 = new->pc2 - old->pc2;
	if (DO_BIC(BIC_Pkgpc3))
		old->pc3 = new->pc3 - old->pc3;
	if (DO_BIC(BIC_Pkgpc6))
		old->pc6 = new->pc6 - old->pc6;
	if (DO_BIC(BIC_Pkgpc7))
		old->pc7 = new->pc7 - old->pc7;
	old->pc8 = new->pc8 - old->pc8;
	old->pc9 = new->pc9 - old->pc9;
	old->pc10 = new->pc10 - old->pc10;
	old->cpu_lpi = new->cpu_lpi - old->cpu_lpi;
	old->sys_lpi = new->sys_lpi - old->sys_lpi;
	/* temperature is instantaneous, not a delta */
	old->pkg_temp_c = new->pkg_temp_c;

	/* flag an error when rc6 counter resets/wraps */
	if (old->gfx_rc6_ms > new->gfx_rc6_ms)
		old->gfx_rc6_ms = -1;
	else
		old->gfx_rc6_ms = new->gfx_rc6_ms - old->gfx_rc6_ms;

	/* frequencies are instantaneous samples */
	old->uncore_mhz = new->uncore_mhz;
	old->gfx_mhz = new->gfx_mhz;
	old->gfx_act_mhz = new->gfx_act_mhz;

	/* flag an error when mc6 counter resets/wraps */
	if (old->sam_mc6_ms > new->sam_mc6_ms)
		old->sam_mc6_ms = -1;
	else
		old->sam_mc6_ms = new->sam_mc6_ms - old->sam_mc6_ms;

	old->sam_mhz = new->sam_mhz;
	old->sam_act_mhz = new->sam_act_mhz;

	/* RAPL energy and perf-status counters accumulate: take deltas */
	old->energy_pkg.raw_value = new->energy_pkg.raw_value - old->energy_pkg.raw_value;
	old->energy_cores.raw_value = new->energy_cores.raw_value - old->energy_cores.raw_value;
	old->energy_gfx.raw_value = new->energy_gfx.raw_value - old->energy_gfx.raw_value;
	old->energy_dram.raw_value = new->energy_dram.raw_value - old->energy_dram.raw_value;
	old->rapl_pkg_perf_status.raw_value = new->rapl_pkg_perf_status.raw_value - old->rapl_pkg_perf_status.raw_value;
	old->rapl_dram_perf_status.raw_value =
	    new->rapl_dram_perf_status.raw_value - old->rapl_dram_perf_status.raw_value;

	/* user-added package counters: RAW/AVERAGE copy, others delta */
	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
		if (mp->format == FORMAT_RAW)
			old->counter[i] = new->counter[i];
		else if (mp->format == FORMAT_AVERAGE)
			old->counter[i] = new->counter[i];
		else
			old->counter[i] = new->counter[i] - old->counter[i];
	}

	return 0;
}
2719 
2720 void delta_core(struct core_data *new, struct core_data *old)
2721 {
2722 	int i;
2723 	struct msr_counter *mp;
2724 
2725 	old->c3 = new->c3 - old->c3;
2726 	old->c6 = new->c6 - old->c6;
2727 	old->c7 = new->c7 - old->c7;
2728 	old->core_temp_c = new->core_temp_c;
2729 	old->core_throt_cnt = new->core_throt_cnt;
2730 	old->mc6_us = new->mc6_us - old->mc6_us;
2731 
2732 	DELTA_WRAP32(new->core_energy.raw_value, old->core_energy.raw_value);
2733 
2734 	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
2735 		if (mp->format == FORMAT_RAW)
2736 			old->counter[i] = new->counter[i];
2737 		else
2738 			old->counter[i] = new->counter[i] - old->counter[i];
2739 	}
2740 }
2741 
2742 int soft_c1_residency_display(int bic)
2743 {
2744 	if (!DO_BIC(BIC_CPU_c1) || platform->has_msr_core_c1_res)
2745 		return 0;
2746 
2747 	return DO_BIC_READ(bic);
2748 }
2749 
/*
 * delta_thread() - compute per-thread interval deltas in place
 * (old = new - old), given the already-computed core delta.
 * Returns 0 on success, -1 when APERF/MPERF did not advance
 * (the sample must then be discarded by the caller).
 */
int delta_thread(struct thread_data *new, struct thread_data *old, struct core_data *core_delta)
{
	int i;
	struct msr_counter *mp;

	/* we run cpuid just the 1st time, copy the results */
	if (DO_BIC(BIC_APIC))
		new->apic_id = old->apic_id;
	if (DO_BIC(BIC_X2APIC))
		new->x2apic_id = old->x2apic_id;

	/*
	 * the timestamps from start of measurement interval are in "old"
	 * the timestamp from end of measurement interval are in "new"
	 * over-write old w/ new so we can print end of interval values
	 */

	timersub(&new->tv_begin, &old->tv_begin, &old->tv_delta);
	old->tv_begin = new->tv_begin;
	old->tv_end = new->tv_end;

	old->tsc = new->tsc - old->tsc;

	/* check for TSC < 1 Mcycles over interval */
	if (old->tsc < (1000 * 1000))
		errx(-3, "Insanely slow TSC rate, TSC stops in idle?\n"
		     "You can disable all c-states by booting with \"idle=poll\"\n"
		     "or just the deep ones with \"processor.max_cstate=1\"");

	old->c1 = new->c1 - old->c1;

	if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC)
	    || soft_c1_residency_display(BIC_Avg_MHz)) {
		/* APERF/MPERF must both advance, else the sample is invalid */
		if ((new->aperf > old->aperf) && (new->mperf > old->mperf)) {
			old->aperf = new->aperf - old->aperf;
			old->mperf = new->mperf - old->mperf;
		} else {
			return -1;
		}
	}

	if (platform->has_msr_core_c1_res) {
		/*
		 * Some models have a dedicated C1 residency MSR,
		 * which should be more accurate than the derivation below.
		 */
	} else {
		/*
		 * As counter collection is not atomic,
		 * it is possible for mperf's non-halted cycles + idle states
		 * to exceed TSC's all cycles: show c1 = 0% in that case.
		 */
		if ((old->mperf + core_delta->c3 + core_delta->c6 + core_delta->c7) > (old->tsc * tsc_tweak))
			old->c1 = 0;
		else {
			/* normal case, derive c1 */
			old->c1 = (old->tsc * tsc_tweak) - old->mperf - core_delta->c3
			    - core_delta->c6 - core_delta->c7;
		}
	}

	if (old->mperf == 0) {
		if (debug > 1)
			fprintf(outf, "cpu%d MPERF 0!\n", old->cpu_id);
		old->mperf = 1;	/* divide by 0 protection */
	}

	if (DO_BIC(BIC_IPC))
		old->instr_count = new->instr_count - old->instr_count;

	if (DO_BIC(BIC_IRQ))
		old->irq_count = new->irq_count - old->irq_count;

	if (DO_BIC(BIC_SMI))
		old->smi_count = new->smi_count - old->smi_count;

	/* user-added thread counters: RAW values copied, others delta'd */
	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
		if (mp->format == FORMAT_RAW)
			old->counter[i] = new->counter[i];
		else
			old->counter[i] = new->counter[i] - old->counter[i];
	}
	return 0;
}
2837 
/*
 * delta_cpu() - compute thread/core/package deltas for one CPU.
 * Core and package deltas are computed only once, via the topologically
 * first thread/core.  Returns 0 on success, non-zero on a bad sample.
 */
int delta_cpu(struct thread_data *t, struct core_data *c,
	      struct pkg_data *p, struct thread_data *t2, struct core_data *c2, struct pkg_data *p2)
{
	int rc;

	/* the core's first thread computes the core delta */
	if (is_cpu_first_thread_in_core(t, c, p))
		delta_core(c, c2);

	/* every thread computes its own delta; c2 now holds the core delta */
	rc = delta_thread(t, t2, c2);
	if (rc)
		return rc;

	/* the package's first core computes the package delta */
	if (is_cpu_first_core_in_package(t, c, p))
		return delta_package(p, p2);

	return 0;
}
2858 
2859 void rapl_counter_clear(struct rapl_counter *c)
2860 {
2861 	c->raw_value = 0;
2862 	c->scale = 0.0;
2863 	c->unit = RAPL_UNIT_INVALID;
2864 }
2865 
/*
 * clear_counters() - zero every accumulating field of the given
 * thread/core/package records, including user-added sysfs/MSR counters.
 * Used to reset the "average" accumulators before summing.
 */
void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
	int i;
	struct msr_counter *mp;

	/* interval timestamps */
	t->tv_begin.tv_sec = 0;
	t->tv_begin.tv_usec = 0;
	t->tv_end.tv_sec = 0;
	t->tv_end.tv_usec = 0;
	t->tv_delta.tv_sec = 0;
	t->tv_delta.tv_usec = 0;

	t->tsc = 0;
	t->aperf = 0;
	t->mperf = 0;
	t->c1 = 0;

	t->instr_count = 0;

	t->irq_count = 0;
	t->smi_count = 0;

	/* per-core counters */
	c->c3 = 0;
	c->c6 = 0;
	c->c7 = 0;
	c->mc6_us = 0;
	c->core_temp_c = 0;
	rapl_counter_clear(&c->core_energy);
	c->core_throt_cnt = 0;

	/* per-package counters */
	p->pkg_wtd_core_c0 = 0;
	p->pkg_any_core_c0 = 0;
	p->pkg_any_gfxe_c0 = 0;
	p->pkg_both_core_gfxe_c0 = 0;

	p->pc2 = 0;
	if (DO_BIC(BIC_Pkgpc3))
		p->pc3 = 0;
	if (DO_BIC(BIC_Pkgpc6))
		p->pc6 = 0;
	if (DO_BIC(BIC_Pkgpc7))
		p->pc7 = 0;
	p->pc8 = 0;
	p->pc9 = 0;
	p->pc10 = 0;
	p->cpu_lpi = 0;
	p->sys_lpi = 0;

	rapl_counter_clear(&p->energy_pkg);
	rapl_counter_clear(&p->energy_dram);
	rapl_counter_clear(&p->energy_cores);
	rapl_counter_clear(&p->energy_gfx);
	rapl_counter_clear(&p->rapl_pkg_perf_status);
	rapl_counter_clear(&p->rapl_dram_perf_status);
	p->pkg_temp_c = 0;

	p->gfx_rc6_ms = 0;
	p->uncore_mhz = 0;
	p->gfx_mhz = 0;
	p->gfx_act_mhz = 0;
	p->sam_mc6_ms = 0;
	p->sam_mhz = 0;
	p->sam_act_mhz = 0;
	/* user-added thread/core/package counters */
	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next)
		t->counter[i] = 0;

	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next)
		c->counter[i] = 0;

	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next)
		p->counter[i] = 0;
}
2938 
2939 void rapl_counter_accumulate(struct rapl_counter *dst, const struct rapl_counter *src)
2940 {
2941 	/* Copy unit and scale from src if dst is not initialized */
2942 	if (dst->unit == RAPL_UNIT_INVALID) {
2943 		dst->unit = src->unit;
2944 		dst->scale = src->scale;
2945 	}
2946 
2947 	assert(dst->unit == src->unit);
2948 	assert(dst->scale == src->scale);
2949 
2950 	dst->raw_value += src->raw_value;
2951 }
2952 
/*
 * sum_counters() - accumulate one CPU's counters into the global
 * "average" record.  Per-core fields are added only via the core's
 * first thread, per-package fields only via the package's first core.
 * Temperatures use MAX rather than sum.  Always returns 0.
 */
int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
	int i;
	struct msr_counter *mp;

	/* copy un-changing apic_id's */
	if (DO_BIC(BIC_APIC))
		average.threads.apic_id = t->apic_id;
	if (DO_BIC(BIC_X2APIC))
		average.threads.x2apic_id = t->x2apic_id;

	/* remember first tv_begin */
	if (average.threads.tv_begin.tv_sec == 0)
		average.threads.tv_begin = t->tv_begin;

	/* remember last tv_end */
	average.threads.tv_end = t->tv_end;

	average.threads.tsc += t->tsc;
	average.threads.aperf += t->aperf;
	average.threads.mperf += t->mperf;
	average.threads.c1 += t->c1;

	average.threads.instr_count += t->instr_count;

	average.threads.irq_count += t->irq_count;
	average.threads.smi_count += t->smi_count;

	/* user-added thread counters; RAW columns are not summed */
	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
		if (mp->format == FORMAT_RAW)
			continue;
		average.threads.counter[i] += t->counter[i];
	}

	/* sum per-core values only for 1st thread in core */
	if (!is_cpu_first_thread_in_core(t, c, p))
		return 0;

	average.cores.c3 += c->c3;
	average.cores.c6 += c->c6;
	average.cores.c7 += c->c7;
	average.cores.mc6_us += c->mc6_us;

	/* report the hottest core, not the sum */
	average.cores.core_temp_c = MAX(average.cores.core_temp_c, c->core_temp_c);
	average.cores.core_throt_cnt = MAX(average.cores.core_throt_cnt, c->core_throt_cnt);

	rapl_counter_accumulate(&average.cores.core_energy, &c->core_energy);

	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
		if (mp->format == FORMAT_RAW)
			continue;
		average.cores.counter[i] += c->counter[i];
	}

	/* sum per-pkg values only for 1st core in pkg */
	if (!is_cpu_first_core_in_package(t, c, p))
		return 0;

	if (DO_BIC(BIC_Totl_c0))
		average.packages.pkg_wtd_core_c0 += p->pkg_wtd_core_c0;
	if (DO_BIC(BIC_Any_c0))
		average.packages.pkg_any_core_c0 += p->pkg_any_core_c0;
	if (DO_BIC(BIC_GFX_c0))
		average.packages.pkg_any_gfxe_c0 += p->pkg_any_gfxe_c0;
	if (DO_BIC(BIC_CPUGFX))
		average.packages.pkg_both_core_gfxe_c0 += p->pkg_both_core_gfxe_c0;

	average.packages.pc2 += p->pc2;
	if (DO_BIC(BIC_Pkgpc3))
		average.packages.pc3 += p->pc3;
	if (DO_BIC(BIC_Pkgpc6))
		average.packages.pc6 += p->pc6;
	if (DO_BIC(BIC_Pkgpc7))
		average.packages.pc7 += p->pc7;
	average.packages.pc8 += p->pc8;
	average.packages.pc9 += p->pc9;
	average.packages.pc10 += p->pc10;

	/* LPI residencies are system-wide: copied, not summed */
	average.packages.cpu_lpi = p->cpu_lpi;
	average.packages.sys_lpi = p->sys_lpi;

	rapl_counter_accumulate(&average.packages.energy_pkg, &p->energy_pkg);
	rapl_counter_accumulate(&average.packages.energy_dram, &p->energy_dram);
	rapl_counter_accumulate(&average.packages.energy_cores, &p->energy_cores);
	rapl_counter_accumulate(&average.packages.energy_gfx, &p->energy_gfx);

	/* graphics/SA media values are system-wide samples: copied */
	average.packages.gfx_rc6_ms = p->gfx_rc6_ms;
	average.packages.uncore_mhz = p->uncore_mhz;
	average.packages.gfx_mhz = p->gfx_mhz;
	average.packages.gfx_act_mhz = p->gfx_act_mhz;
	average.packages.sam_mc6_ms = p->sam_mc6_ms;
	average.packages.sam_mhz = p->sam_mhz;
	average.packages.sam_act_mhz = p->sam_act_mhz;

	average.packages.pkg_temp_c = MAX(average.packages.pkg_temp_c, p->pkg_temp_c);

	rapl_counter_accumulate(&average.packages.rapl_pkg_perf_status, &p->rapl_pkg_perf_status);
	rapl_counter_accumulate(&average.packages.rapl_dram_perf_status, &p->rapl_dram_perf_status);

	/* NOTE(review): RAW + num_packages==0 copies instead of summing -- verify intent */
	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
		if ((mp->format == FORMAT_RAW) && (topo.num_packages == 0))
			average.packages.counter[i] = p->counter[i];
		else
			average.packages.counter[i] += p->counter[i];
	}
	return 0;
}
3060 
/*
 * sum the counters for all cpus in the system
 * compute the weighted average
 */
void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
	int i;
	struct msr_counter *mp;

	clear_counters(&average.threads, &average.cores, &average.packages);

	for_all_cpus(sum_counters, t, c, p);

	/* Use the global time delta for the average. */
	average.threads.tv_delta = tv_delta;

	/* divide sums by the number of contributing units */
	average.threads.tsc /= topo.allowed_cpus;
	average.threads.aperf /= topo.allowed_cpus;
	average.threads.mperf /= topo.allowed_cpus;
	average.threads.instr_count /= topo.allowed_cpus;
	average.threads.c1 /= topo.allowed_cpus;

	/* widen the output columns once any sum exceeds 7 digits */
	if (average.threads.irq_count > 9999999)
		sums_need_wide_columns = 1;

	average.cores.c3 /= topo.allowed_cores;
	average.cores.c6 /= topo.allowed_cores;
	average.cores.c7 /= topo.allowed_cores;
	average.cores.mc6_us /= topo.allowed_cores;

	if (DO_BIC(BIC_Totl_c0))
		average.packages.pkg_wtd_core_c0 /= topo.allowed_packages;
	if (DO_BIC(BIC_Any_c0))
		average.packages.pkg_any_core_c0 /= topo.allowed_packages;
	if (DO_BIC(BIC_GFX_c0))
		average.packages.pkg_any_gfxe_c0 /= topo.allowed_packages;
	if (DO_BIC(BIC_CPUGFX))
		average.packages.pkg_both_core_gfxe_c0 /= topo.allowed_packages;

	average.packages.pc2 /= topo.allowed_packages;
	if (DO_BIC(BIC_Pkgpc3))
		average.packages.pc3 /= topo.allowed_packages;
	if (DO_BIC(BIC_Pkgpc6))
		average.packages.pc6 /= topo.allowed_packages;
	if (DO_BIC(BIC_Pkgpc7))
		average.packages.pc7 /= topo.allowed_packages;

	average.packages.pc8 /= topo.allowed_packages;
	average.packages.pc9 /= topo.allowed_packages;
	average.packages.pc10 /= topo.allowed_packages;

	/* thread-scope COUNTER_ITEMS are left as sums (continue skips the divide) */
	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
		if (mp->format == FORMAT_RAW)
			continue;
		if (mp->type == COUNTER_ITEMS) {
			if (average.threads.counter[i] > 9999999)
				sums_need_wide_columns = 1;
			continue;
		}
		average.threads.counter[i] /= topo.allowed_cpus;
	}
	/*
	 * NOTE(review): unlike the thread loop above, the core and package
	 * loops below divide COUNTER_ITEMS too (no 'continue') -- confirm
	 * whether this asymmetry is intentional.
	 */
	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
		if (mp->format == FORMAT_RAW)
			continue;
		if (mp->type == COUNTER_ITEMS) {
			if (average.cores.counter[i] > 9999999)
				sums_need_wide_columns = 1;
		}
		average.cores.counter[i] /= topo.allowed_cores;
	}
	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
		if (mp->format == FORMAT_RAW)
			continue;
		if (mp->type == COUNTER_ITEMS) {
			if (average.packages.counter[i] > 9999999)
				sums_need_wide_columns = 1;
		}
		average.packages.counter[i] /= topo.allowed_packages;
	}
}
3141 
/*
 * rdtsc() - return the CPU's 64-bit Time Stamp Counter.
 * The RDTSC instruction puts the low 32 bits in EAX and the high
 * 32 bits in EDX; recombine them into one 64-bit value.
 */
static unsigned long long rdtsc(void)
{
	unsigned int low, high;

	asm volatile ("rdtsc":"=a" (low), "=d"(high));

	return low | ((unsigned long long)high) << 32;
}
3150 
/*
 * fopen_or_die() - fopen() wrapper that exits the program with an
 * error message instead of returning NULL.
 */
FILE *fopen_or_die(const char *path, const char *mode)
{
	FILE *fp = fopen(path, mode);

	if (fp == NULL)
		err(1, "%s: open failed", path);

	return fp;
}
3162 
/*
 * snapshot_sysfs_counter()
 *
 * Return the current value of the decimal counter in the given sysfs
 * file.  Exits via err() if the file cannot be opened or parsed.
 */
unsigned long long snapshot_sysfs_counter(char *path)
{
	FILE *fp;
	int retval;
	unsigned long long counter;

	fp = fopen_or_die(path, "r");

	/* %llu: destination is unsigned long long ("%lld" would mismatch) */
	retval = fscanf(fp, "%llu", &counter);
	if (retval != 1)
		err(1, "snapshot_sysfs_counter(%s)", path);

	fclose(fp);

	return counter;
}
3184 
3185 int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp, char *counter_path)
3186 {
3187 	if (mp->msr_num != 0) {
3188 		assert(!no_msr);
3189 		if (get_msr(cpu, mp->msr_num, counterp))
3190 			return -1;
3191 	} else {
3192 		char path[128 + PATH_BYTES];
3193 
3194 		if (mp->flags & SYSFS_PERCPU) {
3195 			sprintf(path, "/sys/devices/system/cpu/cpu%d/%s", cpu, mp->sp->path);
3196 
3197 			*counterp = snapshot_sysfs_counter(path);
3198 		} else {
3199 			*counterp = snapshot_sysfs_counter(counter_path);
3200 		}
3201 	}
3202 
3203 	return 0;
3204 }
3205 
3206 unsigned long long get_legacy_uncore_mhz(int package)
3207 {
3208 	char path[128];
3209 	int die;
3210 	static int warn_once;
3211 
3212 	/*
3213 	 * for this package, use the first die_id that exists
3214 	 */
3215 	for (die = 0; die <= topo.max_die_id; ++die) {
3216 
3217 		sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz",
3218 			package, die);
3219 
3220 		if (access(path, R_OK) == 0)
3221 			return (snapshot_sysfs_counter(path) / 1000);
3222 	}
3223 	if (!warn_once) {
3224 		warnx("BUG: %s: No %s", __func__, path);
3225 		warn_once = 1;
3226 	}
3227 
3228 	return 0;
3229 }
3230 
3231 int get_epb(int cpu)
3232 {
3233 	char path[128 + PATH_BYTES];
3234 	unsigned long long msr;
3235 	int ret, epb = -1;
3236 	FILE *fp;
3237 
3238 	sprintf(path, "/sys/devices/system/cpu/cpu%d/power/energy_perf_bias", cpu);
3239 
3240 	fp = fopen(path, "r");
3241 	if (!fp)
3242 		goto msr_fallback;
3243 
3244 	ret = fscanf(fp, "%d", &epb);
3245 	if (ret != 1)
3246 		err(1, "%s(%s)", __func__, path);
3247 
3248 	fclose(fp);
3249 
3250 	return epb;
3251 
3252 msr_fallback:
3253 	if (no_msr)
3254 		return -1;
3255 
3256 	get_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, &msr);
3257 
3258 	return msr & 0xf;
3259 }
3260 
/*
 * get_apic_id() - record this thread's (initial) APIC id and, when
 * requested, its x2APIC id, using the vendor-appropriate CPUID leaf.
 * Must run on the target cpu.
 */
void get_apic_id(struct thread_data *t)
{
	unsigned int eax, ebx, ecx, edx;

	if (DO_BIC(BIC_APIC)) {
		eax = ebx = ecx = edx = 0;
		__cpuid(1, eax, ebx, ecx, edx);

		/* CPUID.1: initial APIC id lives in EBX[31:24] */
		t->apic_id = (ebx >> 24) & 0xff;
	}

	if (!DO_BIC(BIC_X2APIC))
		return;

	if (authentic_amd || hygon_genuine) {
		unsigned int topology_extensions;

		if (max_extended_level < 0x8000001e)
			return;

		/* CPUID.80000001: ECX bit 22 advertises topology extensions */
		eax = ebx = ecx = edx = 0;
		__cpuid(0x80000001, eax, ebx, ecx, edx);
		topology_extensions = ecx & (1 << 22);

		if (topology_extensions == 0)
			return;

		/* CPUID.8000001E: EAX is the extended APIC id */
		eax = ebx = ecx = edx = 0;
		__cpuid(0x8000001e, eax, ebx, ecx, edx);

		t->x2apic_id = eax;
		return;
	}

	if (!genuine_intel)
		return;

	if (max_level < 0xb)
		return;

	/* CPUID.0B (x2APIC topology leaf): EDX is the x2APIC id */
	ecx = 0;
	__cpuid(0xb, eax, ebx, ecx, edx);
	t->x2apic_id = edx;

	/* the low byte of the x2APIC id should match the legacy APIC id */
	if (debug && (t->apic_id != (t->x2apic_id & 0xff)))
		fprintf(outf, "cpu%d: BIOS BUG: apic 0x%x x2apic 0x%x\n", t->cpu_id, t->apic_id, t->x2apic_id);
}
3308 
3309 int get_core_throt_cnt(int cpu, unsigned long long *cnt)
3310 {
3311 	char path[128 + PATH_BYTES];
3312 	unsigned long long tmp;
3313 	FILE *fp;
3314 	int ret;
3315 
3316 	sprintf(path, "/sys/devices/system/cpu/cpu%d/thermal_throttle/core_throttle_count", cpu);
3317 	fp = fopen(path, "r");
3318 	if (!fp)
3319 		return -1;
3320 	ret = fscanf(fp, "%lld", &tmp);
3321 	fclose(fp);
3322 	if (ret != 1)
3323 		return -1;
3324 	*cnt = tmp;
3325 
3326 	return 0;
3327 }
3328 
/*
 * File descriptors for a perf event group that reads APERF and MPERF
 * together; the aperf fd is the group leader.
 */
struct amperf_group_fd {
	int aperf;		/* Also the group descriptor */
	int mperf;
};
3333 
3334 static int read_perf_counter_info(const char *const path, const char *const parse_format, void *value_ptr)
3335 {
3336 	int fdmt;
3337 	int bytes_read;
3338 	char buf[64];
3339 	int ret = -1;
3340 
3341 	fdmt = open(path, O_RDONLY, 0);
3342 	if (fdmt == -1) {
3343 		if (debug)
3344 			fprintf(stderr, "Failed to parse perf counter info %s\n", path);
3345 		ret = -1;
3346 		goto cleanup_and_exit;
3347 	}
3348 
3349 	bytes_read = read(fdmt, buf, sizeof(buf) - 1);
3350 	if (bytes_read <= 0 || bytes_read >= (int)sizeof(buf)) {
3351 		if (debug)
3352 			fprintf(stderr, "Failed to parse perf counter info %s\n", path);
3353 		ret = -1;
3354 		goto cleanup_and_exit;
3355 	}
3356 
3357 	buf[bytes_read] = '\0';
3358 
3359 	if (sscanf(buf, parse_format, value_ptr) != 1) {
3360 		if (debug)
3361 			fprintf(stderr, "Failed to parse perf counter info %s\n", path);
3362 		ret = -1;
3363 		goto cleanup_and_exit;
3364 	}
3365 
3366 	ret = 0;
3367 
3368 cleanup_and_exit:
3369 	close(fdmt);
3370 	return ret;
3371 }
3372 
/*
 * read_perf_counter_info_n() - parse one unsigned value from a perf
 * sysfs info file; returns (unsigned)-1 on any failure.
 */
static unsigned int read_perf_counter_info_n(const char *const path, const char *const parse_format)
{
	unsigned int value = 0;

	if (read_perf_counter_info(path, parse_format, &value) != 0)
		return -1;

	return value;
}
3384 
/* Return the perf PMU type id of the "msr" event source. */
static unsigned int read_msr_type(void)
{
	return read_perf_counter_info_n("/sys/bus/event_source/devices/msr/type", "%u");
}
3392 
/* Return the perf event config value for the msr PMU's aperf event. */
static unsigned int read_aperf_config(void)
{
	return read_perf_counter_info_n("/sys/bus/event_source/devices/msr/events/aperf", "event=%x");
}
3400 
/* Return the perf event config value for the msr PMU's mperf event. */
static unsigned int read_mperf_config(void)
{
	return read_perf_counter_info_n("/sys/bus/event_source/devices/msr/events/mperf", "event=%x");
}
3408 
/* Return the perf PMU type id of the named event-source subsystem. */
static unsigned int read_perf_type(const char *subsys)
{
	char path[128];

	snprintf(path, sizeof(path), "/sys/bus/event_source/devices/%s/type", subsys);

	return read_perf_counter_info_n(path, "%u");
}
3419 
/* Return the perf event config value for the named RAPL event. */
static unsigned int read_rapl_config(const char *subsys, const char *event_name)
{
	char path[128];

	snprintf(path, sizeof(path), "/sys/bus/event_source/devices/%s/events/%s", subsys, event_name);

	return read_perf_counter_info_n(path, "event=%x");
}
3430 
3431 static unsigned int read_perf_rapl_unit(const char *subsys, const char *event_name)
3432 {
3433 	const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.unit";
3434 	const char *const format = "%s";
3435 	char path[128];
3436 	char unit_buffer[16];
3437 
3438 	snprintf(path, sizeof(path), path_format, subsys, event_name);
3439 
3440 	read_perf_counter_info(path, format, &unit_buffer);
3441 	if (strcmp("Joules", unit_buffer) == 0)
3442 		return RAPL_UNIT_JOULES;
3443 
3444 	return RAPL_UNIT_INVALID;
3445 }
3446 
/*
 * read_perf_rapl_scale() - return the scale factor advertised for the
 * named perf RAPL event, or 0.0 when the .scale file is unreadable.
 */
static double read_perf_rapl_scale(const char *subsys, const char *event_name)
{
	char path[128];
	double scale;

	snprintf(path, sizeof(path), "/sys/bus/event_source/devices/%s/events/%s.scale", subsys, event_name);

	if (read_perf_counter_info(path, "%lf", &scale) != 0)
		return 0.0;

	return scale;
}
3461 
3462 static struct amperf_group_fd open_amperf_fd(int cpu)
3463 {
3464 	const unsigned int msr_type = read_msr_type();
3465 	const unsigned int aperf_config = read_aperf_config();
3466 	const unsigned int mperf_config = read_mperf_config();
3467 	struct amperf_group_fd fds = {.aperf = -1, .mperf = -1 };
3468 
3469 	fds.aperf = open_perf_counter(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP);
3470 	fds.mperf = open_perf_counter(cpu, msr_type, mperf_config, fds.aperf, PERF_FORMAT_GROUP);
3471 
3472 	return fds;
3473 }
3474 
/*
 * get_amperf_fd() - return the cached APERF/MPERF perf group leader fd
 * for this cpu, opening the group on first use.
 */
static int get_amperf_fd(int cpu)
{
	assert(fd_amperf_percpu);

	/*
	 * NOTE(review): fd 0 is treated as "not opened yet", and a failed
	 * open caches -1 (truthy) permanently -- confirm both are intended.
	 */
	if (fd_amperf_percpu[cpu].aperf)
		return fd_amperf_percpu[cpu].aperf;

	fd_amperf_percpu[cpu] = open_amperf_fd(cpu);

	return fd_amperf_percpu[cpu].aperf;
}
3486 
/* Read APERF, MPERF and TSC using the perf API. */
static int read_aperf_mperf_tsc_perf(struct thread_data *t, int cpu)
{
	/*
	 * PERF_FORMAT_GROUP read layout: first the number of counters in
	 * the group, then one value per counter (aperf leads, then mperf).
	 */
	union {
		struct {
			unsigned long nr_entries;
			unsigned long aperf;
			unsigned long mperf;
		};

		unsigned long as_array[3];
	} cnt;

	const int fd_amperf = get_amperf_fd(cpu);

	/*
	 * Read the TSC with rdtsc, because we want the absolute value and not
	 * the offset from the start of the counter.
	 */
	t->tsc = rdtsc();

	const int n = read(fd_amperf, &cnt.as_array[0], sizeof(cnt.as_array));

	/* a short read means the group did not deliver both counters */
	if (n != sizeof(cnt.as_array))
		return -2;

	t->aperf = cnt.aperf * aperf_mperf_multiplier;
	t->mperf = cnt.mperf * aperf_mperf_multiplier;

	return 0;
}
3518 
/*
 * Read APERF, MPERF and TSC using the MSR driver and rdtsc instruction.
 * Returns 0 on success, -3/-4 on APERF/MPERF read failure.
 */
static int read_aperf_mperf_tsc_msr(struct thread_data *t, int cpu)
{
	unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time;
	int aperf_mperf_retry_count = 0;

	/*
	 * The TSC, APERF and MPERF must be read together for
	 * APERF/MPERF and MPERF/TSC to give accurate results.
	 *
	 * Unfortunately, APERF and MPERF are read by
	 * individual system call, so delays may occur
	 * between them.  If the time to read them
	 * varies by a large amount, we re-read them.
	 */

	/*
	 * This initial dummy APERF read has been seen to
	 * reduce jitter in the subsequent reads.
	 */

	if (get_msr(cpu, MSR_IA32_APERF, &t->aperf))
		return -3;

retry:
	t->tsc = rdtsc();	/* re-read close to APERF */

	tsc_before = t->tsc;

	if (get_msr(cpu, MSR_IA32_APERF, &t->aperf))
		return -3;

	tsc_between = rdtsc();

	if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf))
		return -4;

	tsc_after = rdtsc();

	/* TSC-measured duration of each of the two MSR reads */
	aperf_time = tsc_between - tsc_before;
	mperf_time = tsc_after - tsc_between;

	/*
	 * If the system call latency to read APERF and MPERF
	 * differ by more than 2x, then try again.
	 */
	if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) {
		aperf_mperf_retry_count++;
		if (aperf_mperf_retry_count < 5)
			goto retry;
		else
			/* give up after 5 attempts, but warn about the jitter */
			warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time);
	}
	aperf_mperf_retry_count = 0;

	t->aperf = t->aperf * aperf_mperf_multiplier;
	t->mperf = t->mperf * aperf_mperf_multiplier;

	return 0;
}
3579 
3580 size_t rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci)
3581 {
3582 	size_t ret = 0;
3583 
3584 	for (int i = 0; i < NUM_RAPL_COUNTERS; ++i)
3585 		if (rci->source[i] == RAPL_SOURCE_PERF)
3586 			++ret;
3587 
3588 	return ret;
3589 }
3590 
3591 static size_t cstate_counter_info_count_perf(const struct cstate_counter_info_t *cci)
3592 {
3593 	size_t ret = 0;
3594 
3595 	for (int i = 0; i < NUM_CSTATE_COUNTERS; ++i)
3596 		if (cci->source[i] == CSTATE_SOURCE_PERF)
3597 			++ret;
3598 
3599 	return ret;
3600 }
3601 
3602 void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci, unsigned int idx)
3603 {
3604 	rc->raw_value = rci->data[idx];
3605 	rc->unit = rci->unit[idx];
3606 	rc->scale = rci->scale[idx];
3607 }
3608 
/*
 * get_rapl_counters() - snapshot all RAPL counters for one domain,
 * reading perf-backed counters in a single bulk group read and the
 * rest via MSRs, then publish them into the core/package records.
 * Returns 0 on success, a negative value on MSR read failure.
 */
int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct pkg_data *p)
{
	/* +1: a PERF_FORMAT_GROUP read is prefixed by the counter count */
	unsigned long long perf_data[NUM_RAPL_COUNTERS + 1];
	struct rapl_counter_info_t *rci;

	if (debug)
		fprintf(stderr, "%s: cpu%d domain%d\n", __func__, cpu, domain);

	assert(rapl_counter_info_perdomain);
	assert(domain < rapl_counter_info_perdomain_size);

	rci = &rapl_counter_info_perdomain[domain];

	/*
	 * If we have any perf counters to read, read them all now, in bulk
	 */
	if (rci->fd_perf != -1) {
		size_t num_perf_counters = rapl_counter_info_count_perf(rci);
		const ssize_t expected_read_size = (num_perf_counters + 1) * sizeof(unsigned long long);
		const ssize_t actual_read_size = read(rci->fd_perf, &perf_data[0], sizeof(perf_data));

		if (actual_read_size != expected_read_size)
			err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size,
			    actual_read_size);
	}

	/* pi walks the perf group values; slot 0 is the group's counter count */
	for (unsigned int i = 0, pi = 1; i < NUM_RAPL_COUNTERS; ++i) {
		switch (rci->source[i]) {
		case RAPL_SOURCE_NONE:
			break;

		case RAPL_SOURCE_PERF:
			assert(pi < ARRAY_SIZE(perf_data));
			assert(rci->fd_perf != -1);

			if (debug)
				fprintf(stderr, "Reading rapl counter via perf at %u (%llu %e %lf)\n",
					i, perf_data[pi], rci->scale[i], perf_data[pi] * rci->scale[i]);

			rci->data[i] = perf_data[pi];

			++pi;
			break;

		case RAPL_SOURCE_MSR:
			if (debug)
				fprintf(stderr, "Reading rapl counter via msr at %u\n", i);

			assert(!no_msr);
			if (rci->flags[i] & RAPL_COUNTER_FLAG_USE_MSR_SUM) {
				if (get_msr_sum(cpu, rci->msr[i], &rci->data[i]))
					return -13 - i;
			} else {
				if (get_msr(cpu, rci->msr[i], &rci->data[i]))
					return -13 - i;
			}

			/* isolate the relevant field of the raw MSR value */
			rci->data[i] &= rci->msr_mask[i];
			if (rci->msr_shift[i] >= 0)
				rci->data[i] >>= abs(rci->msr_shift[i]);
			else
				rci->data[i] <<= abs(rci->msr_shift[i]);

			break;
		}
	}

	/* keep the explicit per-index writes below in sync with the enum */
	BUILD_BUG_ON(NUM_RAPL_COUNTERS != 7);
	write_rapl_counter(&p->energy_pkg, rci, RAPL_RCI_INDEX_ENERGY_PKG);
	write_rapl_counter(&p->energy_cores, rci, RAPL_RCI_INDEX_ENERGY_CORES);
	write_rapl_counter(&p->energy_dram, rci, RAPL_RCI_INDEX_DRAM);
	write_rapl_counter(&p->energy_gfx, rci, RAPL_RCI_INDEX_GFX);
	write_rapl_counter(&p->rapl_pkg_perf_status, rci, RAPL_RCI_INDEX_PKG_PERF_STATUS);
	write_rapl_counter(&p->rapl_dram_perf_status, rci, RAPL_RCI_INDEX_DRAM_PERF_STATUS);
	write_rapl_counter(&c->core_energy, rci, RAPL_RCI_INDEX_CORE_ENERGY);

	return 0;
}
3687 
3688 char *find_sysfs_path_by_id(struct sysfs_path *sp, int id)
3689 {
3690 	while (sp) {
3691 		if (sp->id == id)
3692 			return (sp->path);
3693 		sp = sp->next;
3694 	}
3695 	if (debug)
3696 		warnx("%s: id%d not found", __func__, id);
3697 	return NULL;
3698 }
3699 
3700 int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_data *c, struct pkg_data *p)
3701 {
3702 	/*
3703 	 * Overcommit memory a little bit here,
3704 	 * but skip calculating exact sizes for the buffers.
3705 	 */
3706 	unsigned long long perf_data[NUM_CSTATE_COUNTERS];
3707 	unsigned long long perf_data_core[NUM_CSTATE_COUNTERS + 1];
3708 	unsigned long long perf_data_pkg[NUM_CSTATE_COUNTERS + 1];
3709 
3710 	struct cstate_counter_info_t *cci;
3711 
3712 	if (debug)
3713 		fprintf(stderr, "%s: cpu%d\n", __func__, cpu);
3714 
3715 	assert(ccstate_counter_info);
3716 	assert(cpu <= ccstate_counter_info_size);
3717 
3718 	memset(perf_data, 0, sizeof(perf_data));
3719 	memset(perf_data_core, 0, sizeof(perf_data_core));
3720 	memset(perf_data_pkg, 0, sizeof(perf_data_pkg));
3721 
3722 	cci = &ccstate_counter_info[cpu];
3723 
3724 	/*
3725 	 * If we have any perf counters to read, read them all now, in bulk
3726 	 */
3727 	const size_t num_perf_counters = cstate_counter_info_count_perf(cci);
3728 	ssize_t expected_read_size = num_perf_counters * sizeof(unsigned long long);
3729 	ssize_t actual_read_size_core = 0, actual_read_size_pkg = 0;
3730 
3731 	if (cci->fd_perf_core != -1) {
3732 		/* Each descriptor read begins with number of counters read. */
3733 		expected_read_size += sizeof(unsigned long long);
3734 
3735 		actual_read_size_core = read(cci->fd_perf_core, &perf_data_core[0], sizeof(perf_data_core));
3736 
3737 		if (actual_read_size_core <= 0)
3738 			err(-1, "%s: read perf %s: %ld", __func__, "core", actual_read_size_core);
3739 	}
3740 
3741 	if (cci->fd_perf_pkg != -1) {
3742 		/* Each descriptor read begins with number of counters read. */
3743 		expected_read_size += sizeof(unsigned long long);
3744 
3745 		actual_read_size_pkg = read(cci->fd_perf_pkg, &perf_data_pkg[0], sizeof(perf_data_pkg));
3746 
3747 		if (actual_read_size_pkg <= 0)
3748 			err(-1, "%s: read perf %s: %ld", __func__, "pkg", actual_read_size_pkg);
3749 	}
3750 
3751 	const ssize_t actual_read_size_total = actual_read_size_core + actual_read_size_pkg;
3752 
3753 	if (actual_read_size_total != expected_read_size)
3754 		err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size, actual_read_size_total);
3755 
3756 	/*
3757 	 * Copy ccstate and pcstate data into unified buffer.
3758 	 *
3759 	 * Skip first element from core and pkg buffers.
3760 	 * Kernel puts there how many counters were read.
3761 	 */
3762 	const size_t num_core_counters = perf_data_core[0];
3763 	const size_t num_pkg_counters = perf_data_pkg[0];
3764 
3765 	assert(num_perf_counters == num_core_counters + num_pkg_counters);
3766 
3767 	/* Copy ccstate perf data */
3768 	memcpy(&perf_data[0], &perf_data_core[1], num_core_counters * sizeof(unsigned long long));
3769 
3770 	/* Copy pcstate perf data */
3771 	memcpy(&perf_data[num_core_counters], &perf_data_pkg[1], num_pkg_counters * sizeof(unsigned long long));
3772 
3773 	for (unsigned int i = 0, pi = 0; i < NUM_CSTATE_COUNTERS; ++i) {
3774 		switch (cci->source[i]) {
3775 		case CSTATE_SOURCE_NONE:
3776 			break;
3777 
3778 		case CSTATE_SOURCE_PERF:
3779 			assert(pi < ARRAY_SIZE(perf_data));
3780 			assert(cci->fd_perf_core != -1 || cci->fd_perf_pkg != -1);
3781 
3782 			if (debug) {
3783 				fprintf(stderr, "cstate via %s %u: %llu\n", "perf", i, perf_data[pi]);
3784 			}
3785 
3786 			cci->data[i] = perf_data[pi];
3787 
3788 			++pi;
3789 			break;
3790 
3791 		case CSTATE_SOURCE_MSR:
3792 			assert(!no_msr);
3793 			if (get_msr(cpu, cci->msr[i], &cci->data[i]))
3794 				return -13 - i;
3795 
3796 			if (debug) {
3797 				fprintf(stderr, "cstate via %s0x%llx %u: %llu\n", "msr", cci->msr[i], i, cci->data[i]);
3798 			}
3799 
3800 			break;
3801 		}
3802 	}
3803 
3804 	/*
3805 	 * Helper to write the data only if the source of
3806 	 * the counter for the current cpu is not none.
3807 	 *
3808 	 * Otherwise we would overwrite core data with 0 (default value),
3809 	 * when invoked for the thread sibling.
3810 	 */
3811 #define PERF_COUNTER_WRITE_DATA(out_counter, index) do {	\
3812 	if (cci->source[index] != CSTATE_SOURCE_NONE)		\
3813 		out_counter = cci->data[index];			\
3814 } while (0)
3815 
3816 	BUILD_BUG_ON(NUM_CSTATE_COUNTERS != 11);
3817 
3818 	PERF_COUNTER_WRITE_DATA(t->c1, CCSTATE_RCI_INDEX_C1_RESIDENCY);
3819 	PERF_COUNTER_WRITE_DATA(c->c3, CCSTATE_RCI_INDEX_C3_RESIDENCY);
3820 	PERF_COUNTER_WRITE_DATA(c->c6, CCSTATE_RCI_INDEX_C6_RESIDENCY);
3821 	PERF_COUNTER_WRITE_DATA(c->c7, CCSTATE_RCI_INDEX_C7_RESIDENCY);
3822 
3823 	PERF_COUNTER_WRITE_DATA(p->pc2, PCSTATE_RCI_INDEX_C2_RESIDENCY);
3824 	PERF_COUNTER_WRITE_DATA(p->pc3, PCSTATE_RCI_INDEX_C3_RESIDENCY);
3825 	PERF_COUNTER_WRITE_DATA(p->pc6, PCSTATE_RCI_INDEX_C6_RESIDENCY);
3826 	PERF_COUNTER_WRITE_DATA(p->pc7, PCSTATE_RCI_INDEX_C7_RESIDENCY);
3827 	PERF_COUNTER_WRITE_DATA(p->pc8, PCSTATE_RCI_INDEX_C8_RESIDENCY);
3828 	PERF_COUNTER_WRITE_DATA(p->pc9, PCSTATE_RCI_INDEX_C9_RESIDENCY);
3829 	PERF_COUNTER_WRITE_DATA(p->pc10, PCSTATE_RCI_INDEX_C10_RESIDENCY);
3830 
3831 #undef PERF_COUNTER_WRITE_DATA
3832 
3833 	return 0;
3834 }
3835 
3836 /*
3837  * get_counters(...)
3838  * migrate to cpu
3839  * acquire and record local counters for that cpu
3840  */
3841 int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
3842 {
3843 	int cpu = t->cpu_id;
3844 	unsigned long long msr;
3845 	struct msr_counter *mp;
3846 	int i;
3847 	int status;
3848 
3849 	if (cpu_migrate(cpu)) {
3850 		fprintf(outf, "%s: Could not migrate to CPU %d\n", __func__, cpu);
3851 		return -1;
3852 	}
3853 
3854 	gettimeofday(&t->tv_begin, (struct timezone *)NULL);
3855 
3856 	if (first_counter_read)
3857 		get_apic_id(t);
3858 
3859 	t->tsc = rdtsc();	/* we are running on local CPU of interest */
3860 
3861 	if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC)
3862 	    || soft_c1_residency_display(BIC_Avg_MHz)) {
3863 		int status = -1;
3864 
3865 		assert(!no_perf || !no_msr);
3866 
3867 		switch (amperf_source) {
3868 		case AMPERF_SOURCE_PERF:
3869 			status = read_aperf_mperf_tsc_perf(t, cpu);
3870 			break;
3871 		case AMPERF_SOURCE_MSR:
3872 			status = read_aperf_mperf_tsc_msr(t, cpu);
3873 			break;
3874 		}
3875 
3876 		if (status != 0)
3877 			return status;
3878 	}
3879 
3880 	if (DO_BIC(BIC_IPC))
3881 		if (read(get_instr_count_fd(cpu), &t->instr_count, sizeof(long long)) != sizeof(long long))
3882 			return -4;
3883 
3884 	if (DO_BIC(BIC_IRQ))
3885 		t->irq_count = irqs_per_cpu[cpu];
3886 	if (DO_BIC(BIC_SMI)) {
3887 		if (get_msr(cpu, MSR_SMI_COUNT, &msr))
3888 			return -5;
3889 		t->smi_count = msr & 0xFFFFFFFF;
3890 	}
3891 
3892 	get_cstate_counters(cpu, t, c, p);
3893 
3894 	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
3895 		if (get_mp(cpu, mp, &t->counter[i], mp->sp->path))
3896 			return -10;
3897 	}
3898 
3899 	/* collect core counters only for 1st thread in core */
3900 	if (!is_cpu_first_thread_in_core(t, c, p))
3901 		goto done;
3902 
3903 	if (platform->has_per_core_rapl) {
3904 		status = get_rapl_counters(cpu, c->core_id, c, p);
3905 		if (status != 0)
3906 			return status;
3907 	}
3908 
3909 	if (DO_BIC(BIC_CPU_c7) && t->is_atom) {
3910 		/*
3911 		 * For Atom CPUs that has core cstate deeper than c6,
3912 		 * MSR_CORE_C6_RESIDENCY returns residency of cc6 and deeper.
3913 		 * Minus CC7 (and deeper cstates) residency to get
3914 		 * accturate cc6 residency.
3915 		 */
3916 		c->c6 -= c->c7;
3917 	}
3918 
3919 	if (DO_BIC(BIC_Mod_c6))
3920 		if (get_msr(cpu, MSR_MODULE_C6_RES_MS, &c->mc6_us))
3921 			return -8;
3922 
3923 	if (DO_BIC(BIC_CoreTmp)) {
3924 		if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr))
3925 			return -9;
3926 		c->core_temp_c = tj_max - ((msr >> 16) & 0x7F);
3927 	}
3928 
3929 	if (DO_BIC(BIC_CORE_THROT_CNT))
3930 		get_core_throt_cnt(cpu, &c->core_throt_cnt);
3931 
3932 	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
3933 		if (get_mp(cpu, mp, &c->counter[i], mp->sp->path))
3934 			return -10;
3935 	}
3936 
3937 	/* collect package counters only for 1st core in package */
3938 	if (!is_cpu_first_core_in_package(t, c, p))
3939 		goto done;
3940 
3941 	if (DO_BIC(BIC_Totl_c0)) {
3942 		if (get_msr(cpu, MSR_PKG_WEIGHTED_CORE_C0_RES, &p->pkg_wtd_core_c0))
3943 			return -10;
3944 	}
3945 	if (DO_BIC(BIC_Any_c0)) {
3946 		if (get_msr(cpu, MSR_PKG_ANY_CORE_C0_RES, &p->pkg_any_core_c0))
3947 			return -11;
3948 	}
3949 	if (DO_BIC(BIC_GFX_c0)) {
3950 		if (get_msr(cpu, MSR_PKG_ANY_GFXE_C0_RES, &p->pkg_any_gfxe_c0))
3951 			return -12;
3952 	}
3953 	if (DO_BIC(BIC_CPUGFX)) {
3954 		if (get_msr(cpu, MSR_PKG_BOTH_CORE_GFXE_C0_RES, &p->pkg_both_core_gfxe_c0))
3955 			return -13;
3956 	}
3957 
3958 	if (DO_BIC(BIC_CPU_LPI))
3959 		p->cpu_lpi = cpuidle_cur_cpu_lpi_us;
3960 	if (DO_BIC(BIC_SYS_LPI))
3961 		p->sys_lpi = cpuidle_cur_sys_lpi_us;
3962 
3963 	if (!platform->has_per_core_rapl) {
3964 		status = get_rapl_counters(cpu, p->package_id, c, p);
3965 		if (status != 0)
3966 			return status;
3967 	}
3968 
3969 	if (DO_BIC(BIC_PkgTmp)) {
3970 		if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr))
3971 			return -17;
3972 		p->pkg_temp_c = tj_max - ((msr >> 16) & 0x7F);
3973 	}
3974 
3975 	if (DO_BIC(BIC_UNCORE_MHZ))
3976 		p->uncore_mhz = get_legacy_uncore_mhz(p->package_id);
3977 
3978 	if (DO_BIC(BIC_GFX_rc6))
3979 		p->gfx_rc6_ms = gfx_info[GFX_rc6].val_ull;
3980 
3981 	if (DO_BIC(BIC_GFXMHz))
3982 		p->gfx_mhz = gfx_info[GFX_MHz].val;
3983 
3984 	if (DO_BIC(BIC_GFXACTMHz))
3985 		p->gfx_act_mhz = gfx_info[GFX_ACTMHz].val;
3986 
3987 	if (DO_BIC(BIC_SAM_mc6))
3988 		p->sam_mc6_ms = gfx_info[SAM_mc6].val_ull;
3989 
3990 	if (DO_BIC(BIC_SAMMHz))
3991 		p->sam_mhz = gfx_info[SAM_MHz].val;
3992 
3993 	if (DO_BIC(BIC_SAMACTMHz))
3994 		p->sam_act_mhz = gfx_info[SAM_ACTMHz].val;
3995 
3996 	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
3997 		char *path = NULL;
3998 
3999 		if (mp->msr_num == 0) {
4000 			path = find_sysfs_path_by_id(mp->sp, p->package_id);
4001 			if (path == NULL) {
4002 				warnx("%s: package_id %d not found", __func__, p->package_id);
4003 				return -10;
4004 			}
4005 		}
4006 		if (get_mp(cpu, mp, &p->counter[i], path))
4007 			return -10;
4008 	}
4009 done:
4010 	gettimeofday(&t->tv_end, (struct timezone *)NULL);
4011 
4012 	return 0;
4013 }
4014 
/* Deepest package C-state the platform allows; decoded by probe_cst_limit(). */
int pkg_cstate_limit = PCLUKN;
/* Human-readable names indexed by the pkg_cstate_limit (PCL*) value. */
char *pkg_cstate_limit_strings[] = { "unknown", "reserved", "pc0", "pc1", "pc2",
	"pc3", "pc4", "pc6", "pc6n", "pc6r", "pc7", "pc7s", "pc8", "pc9", "pc10", "unlimited"
};
4019 
/*
 * Per-platform decode tables for the package C-state limit field,
 * MSR_PKG_CST_CONFIG_CONTROL bits [3:0].  Each table maps the 16
 * possible field values to a PCL* index into pkg_cstate_limit_strings[].
 * probe_cst_limit() selects the table from platform->cst_limit.
 */
int nhm_pkg_cstate_limits[16] =
    { PCL__0, PCL__1, PCL__3, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
	PCLRSV, PCLRSV
};

int snb_pkg_cstate_limits[16] =
    { PCL__0, PCL__2, PCL_6N, PCL_6R, PCL__7, PCL_7S, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
	PCLRSV, PCLRSV
};

int hsw_pkg_cstate_limits[16] =
    { PCL__0, PCL__2, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
	PCLRSV, PCLRSV
};

int slv_pkg_cstate_limits[16] =
    { PCL__0, PCL__1, PCLRSV, PCLRSV, PCL__4, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
	PCL__6, PCL__7
};

int amt_pkg_cstate_limits[16] =
    { PCLUNL, PCL__1, PCL__2, PCLRSV, PCLRSV, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
	PCLRSV, PCLRSV
};

int phi_pkg_cstate_limits[16] =
    { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
	PCLRSV, PCLRSV
};

int glm_pkg_cstate_limits[16] =
    { PCLUNL, PCL__1, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCL_10, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
	PCLRSV, PCLRSV
};

int skx_pkg_cstate_limits[16] =
    { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
	PCLRSV, PCLRSV
};

int icx_pkg_cstate_limits[16] =
    { PCL__0, PCL__2, PCL__6, PCL__6, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
	PCLRSV, PCLRSV
};
4064 
4065 void probe_cst_limit(void)
4066 {
4067 	unsigned long long msr;
4068 	int *pkg_cstate_limits;
4069 
4070 	if (!platform->has_nhm_msrs || no_msr)
4071 		return;
4072 
4073 	switch (platform->cst_limit) {
4074 	case CST_LIMIT_NHM:
4075 		pkg_cstate_limits = nhm_pkg_cstate_limits;
4076 		break;
4077 	case CST_LIMIT_SNB:
4078 		pkg_cstate_limits = snb_pkg_cstate_limits;
4079 		break;
4080 	case CST_LIMIT_HSW:
4081 		pkg_cstate_limits = hsw_pkg_cstate_limits;
4082 		break;
4083 	case CST_LIMIT_SKX:
4084 		pkg_cstate_limits = skx_pkg_cstate_limits;
4085 		break;
4086 	case CST_LIMIT_ICX:
4087 		pkg_cstate_limits = icx_pkg_cstate_limits;
4088 		break;
4089 	case CST_LIMIT_SLV:
4090 		pkg_cstate_limits = slv_pkg_cstate_limits;
4091 		break;
4092 	case CST_LIMIT_AMT:
4093 		pkg_cstate_limits = amt_pkg_cstate_limits;
4094 		break;
4095 	case CST_LIMIT_KNL:
4096 		pkg_cstate_limits = phi_pkg_cstate_limits;
4097 		break;
4098 	case CST_LIMIT_GMT:
4099 		pkg_cstate_limits = glm_pkg_cstate_limits;
4100 		break;
4101 	default:
4102 		return;
4103 	}
4104 
4105 	get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr);
4106 	pkg_cstate_limit = pkg_cstate_limits[msr & 0xF];
4107 }
4108 
4109 static void dump_platform_info(void)
4110 {
4111 	unsigned long long msr;
4112 	unsigned int ratio;
4113 
4114 	if (!platform->has_nhm_msrs || no_msr)
4115 		return;
4116 
4117 	get_msr(base_cpu, MSR_PLATFORM_INFO, &msr);
4118 
4119 	fprintf(outf, "cpu%d: MSR_PLATFORM_INFO: 0x%08llx\n", base_cpu, msr);
4120 
4121 	ratio = (msr >> 40) & 0xFF;
4122 	fprintf(outf, "%d * %.1f = %.1f MHz max efficiency frequency\n", ratio, bclk, ratio * bclk);
4123 
4124 	ratio = (msr >> 8) & 0xFF;
4125 	fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", ratio, bclk, ratio * bclk);
4126 }
4127 
4128 static void dump_power_ctl(void)
4129 {
4130 	unsigned long long msr;
4131 
4132 	if (!platform->has_nhm_msrs || no_msr)
4133 		return;
4134 
4135 	get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr);
4136 	fprintf(outf, "cpu%d: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n",
4137 		base_cpu, msr, msr & 0x2 ? "EN" : "DIS");
4138 
4139 	/* C-state Pre-wake Disable (CSTATE_PREWAKE_DISABLE) */
4140 	if (platform->has_cst_prewake_bit)
4141 		fprintf(outf, "C-state Pre-wake: %sabled\n", msr & 0x40000000 ? "DIS" : "EN");
4142 
4143 	return;
4144 }
4145 
4146 static void dump_turbo_ratio_limit2(void)
4147 {
4148 	unsigned long long msr;
4149 	unsigned int ratio;
4150 
4151 	get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT2, &msr);
4152 
4153 	fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT2: 0x%08llx\n", base_cpu, msr);
4154 
4155 	ratio = (msr >> 8) & 0xFF;
4156 	if (ratio)
4157 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 18 active cores\n", ratio, bclk, ratio * bclk);
4158 
4159 	ratio = (msr >> 0) & 0xFF;
4160 	if (ratio)
4161 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 17 active cores\n", ratio, bclk, ratio * bclk);
4162 	return;
4163 }
4164 
4165 static void dump_turbo_ratio_limit1(void)
4166 {
4167 	unsigned long long msr;
4168 	unsigned int ratio;
4169 
4170 	get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT1, &msr);
4171 
4172 	fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", base_cpu, msr);
4173 
4174 	ratio = (msr >> 56) & 0xFF;
4175 	if (ratio)
4176 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 16 active cores\n", ratio, bclk, ratio * bclk);
4177 
4178 	ratio = (msr >> 48) & 0xFF;
4179 	if (ratio)
4180 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 15 active cores\n", ratio, bclk, ratio * bclk);
4181 
4182 	ratio = (msr >> 40) & 0xFF;
4183 	if (ratio)
4184 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 14 active cores\n", ratio, bclk, ratio * bclk);
4185 
4186 	ratio = (msr >> 32) & 0xFF;
4187 	if (ratio)
4188 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 13 active cores\n", ratio, bclk, ratio * bclk);
4189 
4190 	ratio = (msr >> 24) & 0xFF;
4191 	if (ratio)
4192 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 12 active cores\n", ratio, bclk, ratio * bclk);
4193 
4194 	ratio = (msr >> 16) & 0xFF;
4195 	if (ratio)
4196 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 11 active cores\n", ratio, bclk, ratio * bclk);
4197 
4198 	ratio = (msr >> 8) & 0xFF;
4199 	if (ratio)
4200 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 10 active cores\n", ratio, bclk, ratio * bclk);
4201 
4202 	ratio = (msr >> 0) & 0xFF;
4203 	if (ratio)
4204 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 9 active cores\n", ratio, bclk, ratio * bclk);
4205 	return;
4206 }
4207 
4208 static void dump_turbo_ratio_limits(int trl_msr_offset)
4209 {
4210 	unsigned long long msr, core_counts;
4211 	int shift;
4212 
4213 	get_msr(base_cpu, trl_msr_offset, &msr);
4214 	fprintf(outf, "cpu%d: MSR_%sTURBO_RATIO_LIMIT: 0x%08llx\n",
4215 		base_cpu, trl_msr_offset == MSR_SECONDARY_TURBO_RATIO_LIMIT ? "SECONDARY_" : "", msr);
4216 
4217 	if (platform->trl_msrs & TRL_CORECOUNT) {
4218 		get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT1, &core_counts);
4219 		fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", base_cpu, core_counts);
4220 	} else {
4221 		core_counts = 0x0807060504030201;
4222 	}
4223 
4224 	for (shift = 56; shift >= 0; shift -= 8) {
4225 		unsigned int ratio, group_size;
4226 
4227 		ratio = (msr >> shift) & 0xFF;
4228 		group_size = (core_counts >> shift) & 0xFF;
4229 		if (ratio)
4230 			fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n",
4231 				ratio, bclk, ratio * bclk, group_size);
4232 	}
4233 
4234 	return;
4235 }
4236 
4237 static void dump_atom_turbo_ratio_limits(void)
4238 {
4239 	unsigned long long msr;
4240 	unsigned int ratio;
4241 
4242 	get_msr(base_cpu, MSR_ATOM_CORE_RATIOS, &msr);
4243 	fprintf(outf, "cpu%d: MSR_ATOM_CORE_RATIOS: 0x%08llx\n", base_cpu, msr & 0xFFFFFFFF);
4244 
4245 	ratio = (msr >> 0) & 0x3F;
4246 	if (ratio)
4247 		fprintf(outf, "%d * %.1f = %.1f MHz minimum operating frequency\n", ratio, bclk, ratio * bclk);
4248 
4249 	ratio = (msr >> 8) & 0x3F;
4250 	if (ratio)
4251 		fprintf(outf, "%d * %.1f = %.1f MHz low frequency mode (LFM)\n", ratio, bclk, ratio * bclk);
4252 
4253 	ratio = (msr >> 16) & 0x3F;
4254 	if (ratio)
4255 		fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", ratio, bclk, ratio * bclk);
4256 
4257 	get_msr(base_cpu, MSR_ATOM_CORE_TURBO_RATIOS, &msr);
4258 	fprintf(outf, "cpu%d: MSR_ATOM_CORE_TURBO_RATIOS: 0x%08llx\n", base_cpu, msr & 0xFFFFFFFF);
4259 
4260 	ratio = (msr >> 24) & 0x3F;
4261 	if (ratio)
4262 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 4 active cores\n", ratio, bclk, ratio * bclk);
4263 
4264 	ratio = (msr >> 16) & 0x3F;
4265 	if (ratio)
4266 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 3 active cores\n", ratio, bclk, ratio * bclk);
4267 
4268 	ratio = (msr >> 8) & 0x3F;
4269 	if (ratio)
4270 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 2 active cores\n", ratio, bclk, ratio * bclk);
4271 
4272 	ratio = (msr >> 0) & 0x3F;
4273 	if (ratio)
4274 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 1 active core\n", ratio, bclk, ratio * bclk);
4275 }
4276 
/* Decode and print the KNL (Xeon Phi) bucketed turbo ratio encoding. */
static void dump_knl_turbo_ratio_limits(void)
{
	const unsigned int buckets_no = 7;

	unsigned long long msr;
	int delta_cores, delta_ratio;
	int i, b_nr;
	unsigned int cores[buckets_no];
	unsigned int ratio[buckets_no];

	get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT, &msr);

	fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", base_cpu, msr);

	/*
	 * Turbo encoding in KNL is as follows:
	 * [0] -- Reserved
	 * [7:1] -- Base value of number of active cores of bucket 1.
	 * [15:8] -- Base value of freq ratio of bucket 1.
	 * [20:16] -- +ve delta of number of active cores of bucket 2.
	 * i.e. active cores of bucket 2 =
	 * active cores of bucket 1 + delta
	 * [23:21] -- Negative delta of freq ratio of bucket 2.
	 * i.e. freq ratio of bucket 2 =
	 * freq ratio of bucket 1 - delta
	 * [28:24]-- +ve delta of number of active cores of bucket 3.
	 * [31:29]-- -ve delta of freq ratio of bucket 3.
	 * [36:32]-- +ve delta of number of active cores of bucket 4.
	 * [39:37]-- -ve delta of freq ratio of bucket 4.
	 * [44:40]-- +ve delta of number of active cores of bucket 5.
	 * [47:45]-- -ve delta of freq ratio of bucket 5.
	 * [52:48]-- +ve delta of number of active cores of bucket 6.
	 * [55:53]-- -ve delta of freq ratio of bucket 6.
	 * [60:56]-- +ve delta of number of active cores of bucket 7.
	 * [63:61]-- -ve delta of freq ratio of bucket 7.
	 */

	/* bucket 1 is absolute; buckets 2..7 are deltas applied to the previous bucket */
	b_nr = 0;
	cores[b_nr] = (msr & 0xFF) >> 1;
	ratio[b_nr] = (msr >> 8) & 0xFF;

	for (i = 16; i < 64; i += 8) {
		delta_cores = (msr >> i) & 0x1F;
		delta_ratio = (msr >> (i + 5)) & 0x7;

		cores[b_nr + 1] = cores[b_nr] + delta_cores;
		ratio[b_nr + 1] = ratio[b_nr] - delta_ratio;
		b_nr++;
	}

	/* print deepest bucket first; skip buckets whose ratio equals the previous one */
	for (i = buckets_no - 1; i >= 0; i--)
		if (i > 0 ? ratio[i] != ratio[i - 1] : 1)
			fprintf(outf,
				"%d * %.1f = %.1f MHz max turbo %d active cores\n",
				ratio[i], bclk, ratio[i] * bclk, cores[i]);
}
4333 
/* Print MSR_PKG_CST_CONFIG_CONTROL: demotion/undemotion bits, lock, and pkg C-state limit. */
static void dump_cst_cfg(void)
{
	unsigned long long msr;

	if (!platform->has_nhm_msrs || no_msr)
		return;

	get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr);

	fprintf(outf, "cpu%d: MSR_PKG_CST_CONFIG_CONTROL: 0x%08llx", base_cpu, msr);

	/* bit 15 set -> "locked"; bits [3:0] -> package C-state limit (decoded name from probe_cst_limit()) */
	fprintf(outf, " (%s%s%s%s%slocked, pkg-cstate-limit=%d (%s)",
		(msr & SNB_C3_AUTO_UNDEMOTE) ? "UNdemote-C3, " : "",
		(msr & SNB_C1_AUTO_UNDEMOTE) ? "UNdemote-C1, " : "",
		(msr & NHM_C3_AUTO_DEMOTE) ? "demote-C3, " : "",
		(msr & NHM_C1_AUTO_DEMOTE) ? "demote-C1, " : "",
		(msr & (1 << 15)) ? "" : "UN", (unsigned int)msr & 0xF, pkg_cstate_limit_strings[pkg_cstate_limit]);

/* bit 16: automatic C-state conversion enable (platforms with has_cst_auto_convension) */
#define AUTOMATIC_CSTATE_CONVERSION		(1UL << 16)
	if (platform->has_cst_auto_convension) {
		fprintf(outf, ", automatic c-state conversion=%s", (msr & AUTOMATIC_CSTATE_CONVERSION) ? "on" : "off");
	}

	fprintf(outf, ")\n");

	return;
}
4361 
4362 static void dump_config_tdp(void)
4363 {
4364 	unsigned long long msr;
4365 
4366 	get_msr(base_cpu, MSR_CONFIG_TDP_NOMINAL, &msr);
4367 	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_NOMINAL: 0x%08llx", base_cpu, msr);
4368 	fprintf(outf, " (base_ratio=%d)\n", (unsigned int)msr & 0xFF);
4369 
4370 	get_msr(base_cpu, MSR_CONFIG_TDP_LEVEL_1, &msr);
4371 	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_LEVEL_1: 0x%08llx (", base_cpu, msr);
4372 	if (msr) {
4373 		fprintf(outf, "PKG_MIN_PWR_LVL1=%d ", (unsigned int)(msr >> 48) & 0x7FFF);
4374 		fprintf(outf, "PKG_MAX_PWR_LVL1=%d ", (unsigned int)(msr >> 32) & 0x7FFF);
4375 		fprintf(outf, "LVL1_RATIO=%d ", (unsigned int)(msr >> 16) & 0xFF);
4376 		fprintf(outf, "PKG_TDP_LVL1=%d", (unsigned int)(msr) & 0x7FFF);
4377 	}
4378 	fprintf(outf, ")\n");
4379 
4380 	get_msr(base_cpu, MSR_CONFIG_TDP_LEVEL_2, &msr);
4381 	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_LEVEL_2: 0x%08llx (", base_cpu, msr);
4382 	if (msr) {
4383 		fprintf(outf, "PKG_MIN_PWR_LVL2=%d ", (unsigned int)(msr >> 48) & 0x7FFF);
4384 		fprintf(outf, "PKG_MAX_PWR_LVL2=%d ", (unsigned int)(msr >> 32) & 0x7FFF);
4385 		fprintf(outf, "LVL2_RATIO=%d ", (unsigned int)(msr >> 16) & 0xFF);
4386 		fprintf(outf, "PKG_TDP_LVL2=%d", (unsigned int)(msr) & 0x7FFF);
4387 	}
4388 	fprintf(outf, ")\n");
4389 
4390 	get_msr(base_cpu, MSR_CONFIG_TDP_CONTROL, &msr);
4391 	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_CONTROL: 0x%08llx (", base_cpu, msr);
4392 	if ((msr) & 0x3)
4393 		fprintf(outf, "TDP_LEVEL=%d ", (unsigned int)(msr) & 0x3);
4394 	fprintf(outf, " lock=%d", (unsigned int)(msr >> 31) & 1);
4395 	fprintf(outf, ")\n");
4396 
4397 	get_msr(base_cpu, MSR_TURBO_ACTIVATION_RATIO, &msr);
4398 	fprintf(outf, "cpu%d: MSR_TURBO_ACTIVATION_RATIO: 0x%08llx (", base_cpu, msr);
4399 	fprintf(outf, "MAX_NON_TURBO_RATIO=%d", (unsigned int)(msr) & 0xFF);
4400 	fprintf(outf, " lock=%d", (unsigned int)(msr >> 31) & 1);
4401 	fprintf(outf, ")\n");
4402 }
4403 
4404 unsigned int irtl_time_units[] = { 1, 32, 1024, 32768, 1048576, 33554432, 0, 0 };
4405 
4406 void print_irtl(void)
4407 {
4408 	unsigned long long msr;
4409 
4410 	if (!platform->has_irtl_msrs || no_msr)
4411 		return;
4412 
4413 	if (platform->supported_cstates & PC3) {
4414 		get_msr(base_cpu, MSR_PKGC3_IRTL, &msr);
4415 		fprintf(outf, "cpu%d: MSR_PKGC3_IRTL: 0x%08llx (", base_cpu, msr);
4416 		fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
4417 			(msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
4418 	}
4419 
4420 	if (platform->supported_cstates & PC6) {
4421 		get_msr(base_cpu, MSR_PKGC6_IRTL, &msr);
4422 		fprintf(outf, "cpu%d: MSR_PKGC6_IRTL: 0x%08llx (", base_cpu, msr);
4423 		fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
4424 			(msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
4425 	}
4426 
4427 	if (platform->supported_cstates & PC7) {
4428 		get_msr(base_cpu, MSR_PKGC7_IRTL, &msr);
4429 		fprintf(outf, "cpu%d: MSR_PKGC7_IRTL: 0x%08llx (", base_cpu, msr);
4430 		fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
4431 			(msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
4432 	}
4433 
4434 	if (platform->supported_cstates & PC8) {
4435 		get_msr(base_cpu, MSR_PKGC8_IRTL, &msr);
4436 		fprintf(outf, "cpu%d: MSR_PKGC8_IRTL: 0x%08llx (", base_cpu, msr);
4437 		fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
4438 			(msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
4439 	}
4440 
4441 	if (platform->supported_cstates & PC9) {
4442 		get_msr(base_cpu, MSR_PKGC9_IRTL, &msr);
4443 		fprintf(outf, "cpu%d: MSR_PKGC9_IRTL: 0x%08llx (", base_cpu, msr);
4444 		fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
4445 			(msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
4446 	}
4447 
4448 	if (platform->supported_cstates & PC10) {
4449 		get_msr(base_cpu, MSR_PKGC10_IRTL, &msr);
4450 		fprintf(outf, "cpu%d: MSR_PKGC10_IRTL: 0x%08llx (", base_cpu, msr);
4451 		fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
4452 			(msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
4453 	}
4454 }
4455 
4456 void free_fd_percpu(void)
4457 {
4458 	int i;
4459 
4460 	if (!fd_percpu)
4461 		return;
4462 
4463 	for (i = 0; i < topo.max_cpu_num + 1; ++i) {
4464 		if (fd_percpu[i] != 0)
4465 			close(fd_percpu[i]);
4466 	}
4467 
4468 	free(fd_percpu);
4469 	fd_percpu = NULL;
4470 }
4471 
4472 void free_fd_amperf_percpu(void)
4473 {
4474 	int i;
4475 
4476 	if (!fd_amperf_percpu)
4477 		return;
4478 
4479 	for (i = 0; i < topo.max_cpu_num + 1; ++i) {
4480 		if (fd_amperf_percpu[i].mperf != 0)
4481 			close(fd_amperf_percpu[i].mperf);
4482 
4483 		if (fd_amperf_percpu[i].aperf != 0)
4484 			close(fd_amperf_percpu[i].aperf);
4485 	}
4486 
4487 	free(fd_amperf_percpu);
4488 	fd_amperf_percpu = NULL;
4489 }
4490 
4491 void free_fd_instr_count_percpu(void)
4492 {
4493 	if (!fd_instr_count_percpu)
4494 		return;
4495 
4496 	for (int i = 0; i < topo.max_cpu_num + 1; ++i) {
4497 		if (fd_instr_count_percpu[i] != 0)
4498 			close(fd_instr_count_percpu[i]);
4499 	}
4500 
4501 	free(fd_instr_count_percpu);
4502 	fd_instr_count_percpu = NULL;
4503 }
4504 
4505 void free_fd_cstate(void)
4506 {
4507 	if (!ccstate_counter_info)
4508 		return;
4509 
4510 	const int counter_info_num = ccstate_counter_info_size;
4511 
4512 	for (int counter_id = 0; counter_id < counter_info_num; ++counter_id) {
4513 		if (ccstate_counter_info[counter_id].fd_perf_core != -1)
4514 			close(ccstate_counter_info[counter_id].fd_perf_core);
4515 
4516 		if (ccstate_counter_info[counter_id].fd_perf_pkg != -1)
4517 			close(ccstate_counter_info[counter_id].fd_perf_pkg);
4518 	}
4519 
4520 	free(ccstate_counter_info);
4521 	ccstate_counter_info = NULL;
4522 	ccstate_counter_info_size = 0;
4523 }
4524 
4525 void free_fd_rapl_percpu(void)
4526 {
4527 	if (!rapl_counter_info_perdomain)
4528 		return;
4529 
4530 	const int num_domains = rapl_counter_info_perdomain_size;
4531 
4532 	for (int domain_id = 0; domain_id < num_domains; ++domain_id) {
4533 		if (rapl_counter_info_perdomain[domain_id].fd_perf != -1)
4534 			close(rapl_counter_info_perdomain[domain_id].fd_perf);
4535 	}
4536 
4537 	free(rapl_counter_info_perdomain);
4538 	rapl_counter_info_perdomain = NULL;
4539 	rapl_counter_info_perdomain_size = 0;
4540 }
4541 
/* Release every dynamically-allocated buffer and close every fd turbostat owns. */
void free_all_buffers(void)
{
	int i;

	/* cpu sets allocated with CPU_ALLOC() */
	CPU_FREE(cpu_present_set);
	cpu_present_set = NULL;
	cpu_present_setsize = 0;

	CPU_FREE(cpu_effective_set);
	cpu_effective_set = NULL;
	cpu_effective_setsize = 0;

	CPU_FREE(cpu_allowed_set);
	cpu_allowed_set = NULL;
	cpu_allowed_setsize = 0;

	CPU_FREE(cpu_affinity_set);
	cpu_affinity_set = NULL;
	cpu_affinity_setsize = 0;

	/* double-buffered counter snapshots (even/odd) */
	free(thread_even);
	free(core_even);
	free(package_even);

	thread_even = NULL;
	core_even = NULL;
	package_even = NULL;

	free(thread_odd);
	free(core_odd);
	free(package_odd);

	thread_odd = NULL;
	core_odd = NULL;
	package_odd = NULL;

	free(output_buffer);
	output_buffer = NULL;
	outp = NULL;

	/* close all fds via the per-subsystem helpers */
	free_fd_percpu();
	free_fd_instr_count_percpu();
	free_fd_amperf_percpu();
	free_fd_rapl_percpu();
	free_fd_cstate();

	free(irq_column_2_cpu);
	free(irqs_per_cpu);

	/* per-cpu topology data, including the put_ids cpu sets */
	for (i = 0; i <= topo.max_cpu_num; ++i) {
		if (cpus[i].put_ids)
			CPU_FREE(cpus[i].put_ids);
	}
	free(cpus);
}
4597 
4598 /*
4599  * Parse a file containing a single int.
4600  * Return 0 if file can not be opened
4601  * Exit if file can be opened, but can not be parsed
4602  */
4603 int parse_int_file(const char *fmt, ...)
4604 {
4605 	va_list args;
4606 	char path[PATH_MAX];
4607 	FILE *filep;
4608 	int value;
4609 
4610 	va_start(args, fmt);
4611 	vsnprintf(path, sizeof(path), fmt, args);
4612 	va_end(args);
4613 	filep = fopen(path, "r");
4614 	if (!filep)
4615 		return 0;
4616 	if (fscanf(filep, "%d", &value) != 1)
4617 		err(1, "%s: failed to parse number from file", path);
4618 	fclose(filep);
4619 	return value;
4620 }
4621 
4622 /*
4623  * cpu_is_first_core_in_package(cpu)
4624  * return 1 if given CPU is 1st core in package
4625  */
int cpu_is_first_core_in_package(int cpu)
{
	/* the sysfs list starts with the lowest-numbered sibling */
	int first_core = parse_int_file("/sys/devices/system/cpu/cpu%d/topology/core_siblings_list", cpu);

	return cpu == first_core;
}
4630 
/* Return the physical package id of @cpu, read from sysfs topology. */
int get_physical_package_id(int cpu)
{
	const char *path_fmt = "/sys/devices/system/cpu/cpu%d/topology/physical_package_id";

	return parse_int_file(path_fmt, cpu);
}
4635 
/* Return the die id of @cpu, read from sysfs topology. */
int get_die_id(int cpu)
{
	const char *path_fmt = "/sys/devices/system/cpu/cpu%d/topology/die_id";

	return parse_int_file(path_fmt, cpu);
}
4640 
/* Return the core id of @cpu, read from sysfs topology. */
int get_core_id(int cpu)
{
	const char *path_fmt = "/sys/devices/system/cpu/cpu%d/topology/core_id";

	return parse_int_file(path_fmt, cpu);
}
4645 
/*
 * Assign each cpu a package-relative logical node id, grouping cpus that
 * share both physical_package_id and physical_node_id.  Also updates
 * topo.nodes_per_pkg with the largest node count seen in any package.
 */
void set_node_data(void)
{
	int pkg, node, lnode, cpu, cpux;
	int cpu_count;

	/* initialize logical_node_id */
	for (cpu = 0; cpu <= topo.max_cpu_num; ++cpu)
		cpus[cpu].logical_node_id = -1;

	cpu_count = 0;
	for (pkg = 0; pkg < topo.num_packages; pkg++) {
		lnode = 0;
		for (cpu = 0; cpu <= topo.max_cpu_num; ++cpu) {
			if (cpus[cpu].physical_package_id != pkg)
				continue;
			/* find a cpu with an unset logical_node_id */
			if (cpus[cpu].logical_node_id != -1)
				continue;
			cpus[cpu].logical_node_id = lnode;
			node = cpus[cpu].physical_node_id;
			cpu_count++;
			/*
			 * find all matching cpus on this pkg and set
			 * the logical_node_id
			 */
			/* NOTE(review): the cpux loop starts at cpu, so this cpu is counted twice
			 * in cpu_count -- confirm the early-exit check below still terminates correctly */
			for (cpux = cpu; cpux <= topo.max_cpu_num; cpux++) {
				if ((cpus[cpux].physical_package_id == pkg) && (cpus[cpux].physical_node_id == node)) {
					cpus[cpux].logical_node_id = lnode;
					cpu_count++;
				}
			}
			lnode++;
			if (lnode > topo.nodes_per_pkg)
				topo.nodes_per_pkg = lnode;
		}
		/* stop scanning packages once every cpu has been assigned */
		if (cpu_count >= topo.max_cpu_num)
			break;
	}
}
4685 
4686 int get_physical_node_id(struct cpu_topology *thiscpu)
4687 {
4688 	char path[80];
4689 	FILE *filep;
4690 	int i;
4691 	int cpu = thiscpu->logical_cpu_id;
4692 
4693 	for (i = 0; i <= topo.max_cpu_num; i++) {
4694 		sprintf(path, "/sys/devices/system/cpu/cpu%d/node%i/cpulist", cpu, i);
4695 		filep = fopen(path, "r");
4696 		if (!filep)
4697 			continue;
4698 		fclose(filep);
4699 		return i;
4700 	}
4701 	return -1;
4702 }
4703 
/*
 * parse_cpu_str()
 *
 * Parse a cpu-list string such as "0-3,5,7..9" into cpu_set.
 * Entries are comma-separated; each is a single cpu number, a
 * "low-high" range, or a "low..high" range.
 * Return 0 on success, 1 on malformed input or any cpu number
 * >= CPU_SUBSET_MAXCPUS.
 */
static int parse_cpu_str(char *cpu_str, cpu_set_t *cpu_set, int cpu_set_size)
{
	unsigned int start, end;
	char *next = cpu_str;

	while (next && *next) {

		if (*next == '-')	/* no negative cpu numbers */
			return 1;

		start = strtoul(next, &next, 10);

		if (start >= CPU_SUBSET_MAXCPUS)
			return 1;
		CPU_SET_S(start, cpu_set_size, cpu_set);

		/* single number at end of string: done */
		if (*next == '\0' || *next == '\n')
			break;

		/* single number followed by a comma: next entry */
		if (*next == ',') {
			next += 1;
			continue;
		}

		if (*next == '-') {
			next += 1;	/* start range */
		} else if (*next == '.') {
			next += 1;
			if (*next == '.')
				next += 1;	/* start range */
			else
				return 1;
		}

		/* parse the range's upper bound; an empty range is malformed */
		end = strtoul(next, &next, 10);
		if (end <= start)
			return 1;

		while (++start <= end) {
			if (start >= CPU_SUBSET_MAXCPUS)
				return 1;
			CPU_SET_S(start, cpu_set_size, cpu_set);
		}

		if (*next == ',')
			next += 1;
		else if (*next != '\0' && *next != '\n')
			return 1;
	}

	return 0;
}
4756 
/*
 * get_thread_siblings()
 *
 * Parse this cpu's topology/thread_siblings hex bitmask, record in
 * thiscpu->put_ids the siblings that share its physical core, and
 * assign each newly-seen sibling a per-core thread_id.
 * Return the number of siblings recorded, or -1 on failure.
 */
int get_thread_siblings(struct cpu_topology *thiscpu)
{
	char path[80], character;
	FILE *filep;
	unsigned long map;
	int so, shift, sib_core;
	int cpu = thiscpu->logical_cpu_id;
	int offset = topo.max_cpu_num + 1;
	size_t size;
	int thread_id = 0;

	thiscpu->put_ids = CPU_ALLOC((topo.max_cpu_num + 1));
	if (thiscpu->thread_id < 0)
		thiscpu->thread_id = thread_id++;
	if (!thiscpu->put_ids)
		return -1;

	size = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
	CPU_ZERO_S(size, thiscpu->put_ids);

	sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpu);
	filep = fopen(path, "r");

	if (!filep) {
		warnx("%s: open failed", path);
		return -1;
	}
	do {
		/* sysfs prints the mask most-significant word first */
		offset -= BITMASK_SIZE;
		if (fscanf(filep, "%lx%c", &map, &character) != 2)
			err(1, "%s: failed to parse file", path);
		for (shift = 0; shift < BITMASK_SIZE; shift++) {
			if ((map >> shift) & 0x1) {
				so = shift + offset;
				sib_core = get_core_id(so);
				if (sib_core == thiscpu->physical_core_id) {
					CPU_SET_S(so, size, thiscpu->put_ids);
					/* give other siblings the next per-core thread_id */
					if ((so != cpu) && (cpus[so].thread_id < 0))
						cpus[so].thread_id = thread_id++;
				}
			}
		}
	} while (character == ',');
	fclose(filep);

	return CPU_COUNT_S(size, thiscpu->put_ids);
}
4804 
4805 /*
4806  * run func(thread, core, package) in topology order
4807  * skip non-present cpus
4808  */
4809 
int for_all_cpus_2(int (func) (struct thread_data *, struct core_data *,
			       struct pkg_data *, struct thread_data *, struct core_data *,
			       struct pkg_data *), struct thread_data *thread_base,
		   struct core_data *core_base, struct pkg_data *pkg_base,
		   struct thread_data *thread_base2, struct core_data *core_base2, struct pkg_data *pkg_base2)
{
	int retval, pkg_no, node_no, core_no, thread_no;

	/* walk pkg/node/core/thread in topology order over both counter sets */
	for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) {
		for (node_no = 0; node_no < topo.nodes_per_pkg; ++node_no) {
			for (core_no = 0; core_no < topo.cores_per_node; ++core_no) {
				for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) {
					struct thread_data *t, *t2;
					struct core_data *c, *c2;
					struct pkg_data *p, *p2;

					t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no);

					/* skip slots whose cpu is outside the allowed set */
					if (cpu_is_not_allowed(t->cpu_id))
						continue;

					t2 = GET_THREAD(thread_base2, thread_no, core_no, node_no, pkg_no);

					c = GET_CORE(core_base, core_no, node_no, pkg_no);
					c2 = GET_CORE(core_base2, core_no, node_no, pkg_no);

					p = GET_PKG(pkg_base, pkg_no);
					p2 = GET_PKG(pkg_base2, pkg_no);

					/* first failure aborts the walk */
					retval = func(t, c, p, t2, c2, p2);
					if (retval)
						return retval;
				}
			}
		}
	}
	return 0;
}
4848 
4849 /*
4850  * run func(cpu) on every cpu in /proc/stat
4851  * return max_cpu number
4852  */
4853 int for_all_proc_cpus(int (func) (int))
4854 {
4855 	FILE *fp;
4856 	int cpu_num;
4857 	int retval;
4858 
4859 	fp = fopen_or_die(proc_stat, "r");
4860 
4861 	retval = fscanf(fp, "cpu %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d\n");
4862 	if (retval != 0)
4863 		err(1, "%s: failed to parse format", proc_stat);
4864 
4865 	while (1) {
4866 		retval = fscanf(fp, "cpu%u %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d\n", &cpu_num);
4867 		if (retval != 1)
4868 			break;
4869 
4870 		retval = func(cpu_num);
4871 		if (retval) {
4872 			fclose(fp);
4873 			return (retval);
4874 		}
4875 	}
4876 	fclose(fp);
4877 	return 0;
4878 }
4879 
4880 #define PATH_EFFECTIVE_CPUS	"/sys/fs/cgroup/cpuset.cpus.effective"
4881 
4882 static char cpu_effective_str[1024];
4883 
4884 static int update_effective_str(bool startup)
4885 {
4886 	FILE *fp;
4887 	char *pos;
4888 	char buf[1024];
4889 	int ret;
4890 
4891 	if (cpu_effective_str[0] == '\0' && !startup)
4892 		return 0;
4893 
4894 	fp = fopen(PATH_EFFECTIVE_CPUS, "r");
4895 	if (!fp)
4896 		return 0;
4897 
4898 	pos = fgets(buf, 1024, fp);
4899 	if (!pos)
4900 		err(1, "%s: file read failed\n", PATH_EFFECTIVE_CPUS);
4901 
4902 	fclose(fp);
4903 
4904 	ret = strncmp(cpu_effective_str, buf, 1024);
4905 	if (!ret)
4906 		return 0;
4907 
4908 	strncpy(cpu_effective_str, buf, 1024);
4909 	return 1;
4910 }
4911 
/* refresh cpu_effective_str and re-parse it into cpu_effective_set */
static void update_effective_set(bool startup)
{
	update_effective_str(startup);

	if (parse_cpu_str(cpu_effective_str, cpu_effective_set, cpu_effective_setsize))
		err(1, "%s: cpu str malformat %s\n", PATH_EFFECTIVE_CPUS, cpu_effective_str);
}
4919 
4920 void linux_perf_init(void);
4921 void rapl_perf_init(void);
4922 void cstate_perf_init(void);
4923 
/* tear down and rebuild all per-cpu buffers and perf/rapl/cstate fds
 * after a topology or cpuset change is detected */
void re_initialize(void)
{
	free_all_buffers();
	setup_all_buffers(false);
	linux_perf_init();
	rapl_perf_init();
	cstate_perf_init();
	fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus,
		topo.allowed_cpus);
}
4934 
4935 void set_max_cpu_num(void)
4936 {
4937 	FILE *filep;
4938 	int base_cpu;
4939 	unsigned long dummy;
4940 	char pathname[64];
4941 
4942 	base_cpu = sched_getcpu();
4943 	if (base_cpu < 0)
4944 		err(1, "cannot find calling cpu ID");
4945 	sprintf(pathname, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", base_cpu);
4946 
4947 	filep = fopen_or_die(pathname, "r");
4948 	topo.max_cpu_num = 0;
4949 	while (fscanf(filep, "%lx,", &dummy) == 1)
4950 		topo.max_cpu_num += BITMASK_SIZE;
4951 	fclose(filep);
4952 	topo.max_cpu_num--;	/* 0 based */
4953 }
4954 
4955 /*
4956  * count_cpus()
4957  * remember the last one seen, it will be the max
4958  */
/* for_all_proc_cpus() callback: tally each cpu line into topo.num_cpus */
int count_cpus(int cpu)
{
	UNUSED(cpu);

	topo.num_cpus++;
	return 0;
}
4966 
/* for_all_proc_cpus() callback: record this cpu in cpu_present_set */
int mark_cpu_present(int cpu)
{
	CPU_SET_S(cpu, cpu_present_setsize, cpu_present_set);
	return 0;
}
4972 
/* for_all_proc_cpus() callback: mark thread_id unassigned;
 * get_thread_siblings() fills it in later */
int init_thread_id(int cpu)
{
	cpus[cpu].thread_id = -1;
	return 0;
}
4978 
4979 /*
4980  * snapshot_proc_interrupts()
4981  *
4982  * read and record summary of /proc/interrupts
4983  *
4984  * return 1 if config change requires a restart, else return 0
4985  */
4986 int snapshot_proc_interrupts(void)
4987 {
4988 	static FILE *fp;
4989 	int column, retval;
4990 
4991 	if (fp == NULL)
4992 		fp = fopen_or_die("/proc/interrupts", "r");
4993 	else
4994 		rewind(fp);
4995 
4996 	/* read 1st line of /proc/interrupts to get cpu* name for each column */
4997 	for (column = 0; column < topo.num_cpus; ++column) {
4998 		int cpu_number;
4999 
5000 		retval = fscanf(fp, " CPU%d", &cpu_number);
5001 		if (retval != 1)
5002 			break;
5003 
5004 		if (cpu_number > topo.max_cpu_num) {
5005 			warn("/proc/interrupts: cpu%d: > %d", cpu_number, topo.max_cpu_num);
5006 			return 1;
5007 		}
5008 
5009 		irq_column_2_cpu[column] = cpu_number;
5010 		irqs_per_cpu[cpu_number] = 0;
5011 	}
5012 
5013 	/* read /proc/interrupt count lines and sum up irqs per cpu */
5014 	while (1) {
5015 		int column;
5016 		char buf[64];
5017 
5018 		retval = fscanf(fp, " %s:", buf);	/* flush irq# "N:" */
5019 		if (retval != 1)
5020 			break;
5021 
5022 		/* read the count per cpu */
5023 		for (column = 0; column < topo.num_cpus; ++column) {
5024 
5025 			int cpu_number, irq_count;
5026 
5027 			retval = fscanf(fp, " %d", &irq_count);
5028 			if (retval != 1)
5029 				break;
5030 
5031 			cpu_number = irq_column_2_cpu[column];
5032 			irqs_per_cpu[cpu_number] += irq_count;
5033 
5034 		}
5035 
5036 		while (getc(fp) != '\n') ;	/* flush interrupt description */
5037 
5038 	}
5039 	return 0;
5040 }
5041 
5042 /*
5043  * snapshot_graphics()
5044  *
5045  * record snapshot of specified graphics sysfs knob
5046  *
5047  * return 1 if config change requires a restart, else return 0
5048  */
int snapshot_graphics(int idx)
{
	FILE *fp;
	int retval;

	switch (idx) {
	case GFX_rc6:
	case SAM_mc6:
		/* residency counters: open/read/close each sample */
		fp = fopen_or_die(gfx_info[idx].path, "r");
		/*
		 * NOTE(review): the field is named val_ull -- if it is
		 * declared unsigned, %llu would be the matching
		 * conversion; confirm against the gfx_info declaration.
		 */
		retval = fscanf(fp, "%lld", &gfx_info[idx].val_ull);
		if (retval != 1)
			err(1, "rc6");
		fclose(fp);
		return 0;
	case GFX_MHz:
	case GFX_ACTMHz:
	case SAM_MHz:
	case SAM_ACTMHz:
		/* frequency knobs: keep the FILE open, rewind between samples */
		if (gfx_info[idx].fp == NULL) {
			gfx_info[idx].fp = fopen_or_die(gfx_info[idx].path, "r");
		} else {
			rewind(gfx_info[idx].fp);
			fflush(gfx_info[idx].fp);
		}
		retval = fscanf(gfx_info[idx].fp, "%d", &gfx_info[idx].val);
		if (retval != 1)
			err(1, "MHz");
		return 0;
	default:
		return -EINVAL;
	}
}
5081 
5082 /*
5083  * snapshot_cpu_lpi()
5084  *
5085  * record snapshot of
5086  * /sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us
5087  */
5088 int snapshot_cpu_lpi_us(void)
5089 {
5090 	FILE *fp;
5091 	int retval;
5092 
5093 	fp = fopen_or_die("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", "r");
5094 
5095 	retval = fscanf(fp, "%lld", &cpuidle_cur_cpu_lpi_us);
5096 	if (retval != 1) {
5097 		fprintf(stderr, "Disabling Low Power Idle CPU output\n");
5098 		BIC_NOT_PRESENT(BIC_CPU_LPI);
5099 		fclose(fp);
5100 		return -1;
5101 	}
5102 
5103 	fclose(fp);
5104 
5105 	return 0;
5106 }
5107 
5108 /*
5109  * snapshot_sys_lpi()
5110  *
5111  * record snapshot of sys_lpi_file
5112  */
5113 int snapshot_sys_lpi_us(void)
5114 {
5115 	FILE *fp;
5116 	int retval;
5117 
5118 	fp = fopen_or_die(sys_lpi_file, "r");
5119 
5120 	retval = fscanf(fp, "%lld", &cpuidle_cur_sys_lpi_us);
5121 	if (retval != 1) {
5122 		fprintf(stderr, "Disabling Low Power Idle System output\n");
5123 		BIC_NOT_PRESENT(BIC_SYS_LPI);
5124 		fclose(fp);
5125 		return -1;
5126 	}
5127 	fclose(fp);
5128 
5129 	return 0;
5130 }
5131 
5132 /*
5133  * snapshot /proc and /sys files
5134  *
5135  * return 1 if configuration restart needed, else return 0
5136  */
int snapshot_proc_sysfs_files(void)
{
	/* IRQ counts are the only snapshot that can demand a restart */
	if (DO_BIC(BIC_IRQ))
		if (snapshot_proc_interrupts())
			return 1;

	/* graphics / low-power-idle snapshots are best-effort */
	if (DO_BIC(BIC_GFX_rc6))
		snapshot_graphics(GFX_rc6);

	if (DO_BIC(BIC_GFXMHz))
		snapshot_graphics(GFX_MHz);

	if (DO_BIC(BIC_GFXACTMHz))
		snapshot_graphics(GFX_ACTMHz);

	if (DO_BIC(BIC_SAM_mc6))
		snapshot_graphics(SAM_mc6);

	if (DO_BIC(BIC_SAMMHz))
		snapshot_graphics(SAM_MHz);

	if (DO_BIC(BIC_SAMACTMHz))
		snapshot_graphics(SAM_ACTMHz);

	if (DO_BIC(BIC_CPU_LPI))
		snapshot_cpu_lpi_us();

	if (DO_BIC(BIC_SYS_LPI))
		snapshot_sys_lpi_us();

	return 0;
}
5169 
/* set to 1 on SIGINT or a 'q' keypress; polled by turbostat_loop() to exit */
int exit_requested;

static void signal_handler(int signal)
{
	switch (signal) {
	case SIGINT:
		exit_requested = 1;
		if (debug)
			fprintf(stderr, " SIGINT\n");
		break;
	case SIGUSR1:
		/* SIGUSR1 only interrupts the current sleep; no state change */
		if (debug > 1)
			fprintf(stderr, "SIGUSR1\n");
		break;
	}
}
5186 
5187 void setup_signal_handler(void)
5188 {
5189 	struct sigaction sa;
5190 
5191 	memset(&sa, 0, sizeof(sa));
5192 
5193 	sa.sa_handler = &signal_handler;
5194 
5195 	if (sigaction(SIGINT, &sa, NULL) < 0)
5196 		err(1, "sigaction SIGINT");
5197 	if (sigaction(SIGUSR1, &sa, NULL) < 0)
5198 		err(1, "sigaction SIGUSR1");
5199 }
5200 
/*
 * do_sleep()
 *
 * Sleep for one measurement interval, while watching stdin:
 * 'q' requests exit; EOF on stdin (closed pipe) switches to
 * plain nanosleep for the rest of the run.
 */
void do_sleep(void)
{
	struct timeval tout;
	struct timespec rest;
	fd_set readfds;
	int retval;

	FD_ZERO(&readfds);
	FD_SET(0, &readfds);

	if (ignore_stdin) {
		nanosleep(&interval_ts, NULL);
		return;
	}

	/* select() both sleeps and watches stdin; tout is decremented in place */
	tout = interval_tv;
	retval = select(1, &readfds, NULL, NULL, &tout);

	if (retval == 1) {
		switch (getc(stdin)) {
		case 'q':
			exit_requested = 1;
			break;
		case EOF:
			/*
			 * 'stdin' is a pipe closed on the other end. There
			 * won't be any further input.
			 */
			ignore_stdin = 1;
			/* Sleep the rest of the time */
			rest.tv_sec = (tout.tv_sec + tout.tv_usec / 1000000);
			rest.tv_nsec = (tout.tv_usec % 1000000) * 1000;
			nanosleep(&rest, NULL);
		}
	}
}
5237 
/*
 * get_msr_sum()
 *
 * Return in *msr the wrap-corrected running total for this MSR:
 * the accumulated sum plus the 32-bit delta since the last timer
 * snapshot.  Return 0 on success, non-zero on failure.
 */
int get_msr_sum(int cpu, off_t offset, unsigned long long *msr)
{
	int ret, idx;
	unsigned long long msr_cur, msr_last;

	assert(!no_msr);

	if (!per_cpu_msr_sum)
		return 1;

	idx = offset_to_idx(offset);
	if (idx < 0)
		return idx;
	/* get_msr_sum() = sum + (get_msr() - last) */
	ret = get_msr(cpu, offset, &msr_cur);
	if (ret)
		return ret;
	msr_last = per_cpu_msr_sum[cpu].entries[idx].last;
	/* DELTA_WRAP32 leaves the 32-bit-wrap-corrected delta in msr_last */
	DELTA_WRAP32(msr_cur, msr_last);
	*msr = msr_last + per_cpu_msr_sum[cpu].entries[idx].sum;

	return 0;
}
5261 
/* periodic timer used to accumulate 32-bit MSRs before they wrap */
timer_t timerid;

/* Timer callback, update the sum of MSRs periodically. */
static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
	int i, ret;
	int cpu = t->cpu_id;

	UNUSED(c);
	UNUSED(p);

	assert(!no_msr);

	for (i = IDX_PKG_ENERGY; i < IDX_COUNT; i++) {
		unsigned long long msr_cur, msr_last;
		off_t offset;

		if (!idx_valid(i))
			continue;
		offset = idx_to_offset(i);
		if (offset < 0)
			continue;
		ret = get_msr(cpu, offset, &msr_cur);
		if (ret) {
			/* best effort: skip this MSR, keep accumulating the others */
			fprintf(outf, "Can not update msr(0x%llx)\n", (unsigned long long)offset);
			continue;
		}

		/* remember the low 32 bits for the next delta */
		msr_last = per_cpu_msr_sum[cpu].entries[i].last;
		per_cpu_msr_sum[cpu].entries[i].last = msr_cur & 0xffffffff;

		/* DELTA_WRAP32 leaves the wrap-corrected delta in msr_last */
		DELTA_WRAP32(msr_cur, msr_last);
		per_cpu_msr_sum[cpu].entries[i].sum += msr_last;
	}
	return 0;
}
5298 
/* SIGEV_THREAD notifier: accumulate all MSR sums on every cpu */
static void msr_record_handler(union sigval v)
{
	UNUSED(v);

	for_all_cpus(update_msr_sum, EVEN_COUNTERS);
}
5305 
5306 void msr_sum_record(void)
5307 {
5308 	struct itimerspec its;
5309 	struct sigevent sev;
5310 
5311 	per_cpu_msr_sum = calloc(topo.max_cpu_num + 1, sizeof(struct msr_sum_array));
5312 	if (!per_cpu_msr_sum) {
5313 		fprintf(outf, "Can not allocate memory for long time MSR.\n");
5314 		return;
5315 	}
5316 	/*
5317 	 * Signal handler might be restricted, so use thread notifier instead.
5318 	 */
5319 	memset(&sev, 0, sizeof(struct sigevent));
5320 	sev.sigev_notify = SIGEV_THREAD;
5321 	sev.sigev_notify_function = msr_record_handler;
5322 
5323 	sev.sigev_value.sival_ptr = &timerid;
5324 	if (timer_create(CLOCK_REALTIME, &sev, &timerid) == -1) {
5325 		fprintf(outf, "Can not create timer.\n");
5326 		goto release_msr;
5327 	}
5328 
5329 	its.it_value.tv_sec = 0;
5330 	its.it_value.tv_nsec = 1;
5331 	/*
5332 	 * A wraparound time has been calculated early.
5333 	 * Some sources state that the peak power for a
5334 	 * microprocessor is usually 1.5 times the TDP rating,
5335 	 * use 2 * TDP for safety.
5336 	 */
5337 	its.it_interval.tv_sec = rapl_joule_counter_range / 2;
5338 	its.it_interval.tv_nsec = 0;
5339 
5340 	if (timer_settime(timerid, 0, &its, NULL) == -1) {
5341 		fprintf(outf, "Can not set timer.\n");
5342 		goto release_timer;
5343 	}
5344 	return;
5345 
5346 release_timer:
5347 	timer_delete(timerid);
5348 release_msr:
5349 	free(per_cpu_msr_sum);
5350 }
5351 
5352 /*
5353  * set_my_sched_priority(pri)
5354  * return previous priority on success
5355  * return value < -20 on failure
5356  */
/*
 * set_my_sched_priority(pri)
 * return previous priority on success
 * return value < -20 on failure
 */
int set_my_sched_priority(int priority)
{
	int prev_priority;

	/* getpriority() can legitimately return -1, so check errno too */
	errno = 0;
	prev_priority = getpriority(PRIO_PROCESS, 0);
	if (prev_priority == -1 && errno)
		return -21;

	if (setpriority(PRIO_PROCESS, 0, priority) != 0)
		return -21;

	/* confirm the kernel accepted the requested value */
	errno = 0;
	if (getpriority(PRIO_PROCESS, 0) != priority)
		return -21;

	return prev_priority;
}
5378 
/*
 * turbostat_loop()
 *
 * Main interval-mode loop: alternate EVEN/ODD counter snapshots,
 * compute and print deltas each interval, and re-initialize on any
 * topology/cpuset change.  Exits on 'q', SIGINT, or --num_iterations.
 */
void turbostat_loop()
{
	int retval;
	int restarted = 0;
	unsigned int done_iters = 0;

	setup_signal_handler();

	/*
	 * elevate own priority for interval mode
	 *
	 * ignore on error - we probably don't have permission to set it, but
	 * it's not a big deal
	 */
	set_my_sched_priority(-20);

restart:
	restarted++;

	snapshot_proc_sysfs_files();
	retval = for_all_cpus(get_counters, EVEN_COUNTERS);
	first_counter_read = 0;
	if (retval < -1) {
		exit(retval);
	} else if (retval == -1) {
		/* -1 means topology changed under us; give up after 10 retries */
		if (restarted > 10) {
			exit(retval);
		}
		re_initialize();
		goto restart;
	}
	restarted = 0;
	done_iters = 0;
	gettimeofday(&tv_even, (struct timezone *)NULL);

	while (1) {
		/* restart if the present-cpu set or effective cpuset changed */
		if (for_all_proc_cpus(cpu_is_not_present)) {
			re_initialize();
			goto restart;
		}
		if (update_effective_str(false)) {
			re_initialize();
			goto restart;
		}
		do_sleep();
		if (snapshot_proc_sysfs_files())
			goto restart;
		/* ODD snapshot; delta against the previous EVEN one */
		retval = for_all_cpus(get_counters, ODD_COUNTERS);
		if (retval < -1) {
			exit(retval);
		} else if (retval == -1) {
			re_initialize();
			goto restart;
		}
		gettimeofday(&tv_odd, (struct timezone *)NULL);
		timersub(&tv_odd, &tv_even, &tv_delta);
		if (for_all_cpus_2(delta_cpu, ODD_COUNTERS, EVEN_COUNTERS)) {
			re_initialize();
			goto restart;
		}
		compute_average(EVEN_COUNTERS);
		format_all_counters(EVEN_COUNTERS);
		flush_output_stdout();
		if (exit_requested)
			break;
		if (num_iterations && ++done_iters >= num_iterations)
			break;
		do_sleep();
		if (snapshot_proc_sysfs_files())
			goto restart;
		/* EVEN snapshot; delta against the previous ODD one */
		retval = for_all_cpus(get_counters, EVEN_COUNTERS);
		if (retval < -1) {
			exit(retval);
		} else if (retval == -1) {
			re_initialize();
			goto restart;
		}
		gettimeofday(&tv_even, (struct timezone *)NULL);
		timersub(&tv_even, &tv_odd, &tv_delta);
		if (for_all_cpus_2(delta_cpu, EVEN_COUNTERS, ODD_COUNTERS)) {
			re_initialize();
			goto restart;
		}
		compute_average(ODD_COUNTERS);
		format_all_counters(ODD_COUNTERS);
		flush_output_stdout();
		if (exit_requested)
			break;
		if (num_iterations && ++done_iters >= num_iterations)
			break;
	}
}
5471 
5472 void check_dev_msr()
5473 {
5474 	struct stat sb;
5475 	char pathname[32];
5476 
5477 	if (no_msr)
5478 		return;
5479 
5480 	sprintf(pathname, "/dev/cpu/%d/msr", base_cpu);
5481 	if (stat(pathname, &sb))
5482 		if (system("/sbin/modprobe msr > /dev/null 2>&1"))
5483 			no_msr = 1;
5484 }
5485 
5486 /*
5487  * check for CAP_SYS_RAWIO
5488  * return 0 on success
5489  * return 1 on fail
5490  */
/*
 * check for CAP_SYS_RAWIO
 * return 0 on success
 * return 1 on fail
 */
int check_for_cap_sys_rawio(void)
{
	cap_flag_value_t flag_val;
	int ret = 0;
	cap_t caps = cap_get_proc();

	if (caps == NULL)
		return 1;

	/* CAP_SYS_RAWIO must be in the effective set to read MSRs */
	if (cap_get_flag(caps, CAP_SYS_RAWIO, CAP_EFFECTIVE, &flag_val) != 0)
		ret = 1;
	else if (flag_val != CAP_SET)
		ret = 1;

	if (cap_free(caps) == -1)
		err(-6, "cap_free\n");

	return ret;
}
5517 
5518 void check_msr_permission(void)
5519 {
5520 	int failed = 0;
5521 	char pathname[32];
5522 
5523 	if (no_msr)
5524 		return;
5525 
5526 	/* check for CAP_SYS_RAWIO */
5527 	failed += check_for_cap_sys_rawio();
5528 
5529 	/* test file permissions */
5530 	sprintf(pathname, "/dev/cpu/%d/msr", base_cpu);
5531 	if (euidaccess(pathname, R_OK)) {
5532 		failed++;
5533 	}
5534 
5535 	if (failed) {
5536 		warnx("Failed to access %s. Some of the counters may not be available\n"
5537 		      "\tRun as root to enable them or use %s to disable the access explicitly", pathname, "--no-msr");
5538 		no_msr = 1;
5539 	}
5540 }
5541 
/*
 * probe_bclk()
 *
 * Determine the bus clock for this platform, then read the base
 * ratio from MSR_PLATFORM_INFO to compute base_hz (and tsc_tweak
 * where the platform needs it).
 */
void probe_bclk(void)
{
	unsigned long long msr;
	unsigned int base_ratio;

	if (!platform->has_nhm_msrs || no_msr)
		return;

	if (platform->bclk_freq == BCLK_100MHZ)
		bclk = 100.00;
	else if (platform->bclk_freq == BCLK_133MHZ)
		bclk = 133.33;
	else if (platform->bclk_freq == BCLK_SLV)
		bclk = slm_bclk();
	else
		return;

	/* bits 15:8 of MSR_PLATFORM_INFO hold the base (non-turbo) ratio */
	get_msr(base_cpu, MSR_PLATFORM_INFO, &msr);
	base_ratio = (msr >> 8) & 0xFF;

	base_hz = base_ratio * bclk * 1000000;
	has_base_hz = 1;

	if (platform->enable_tsc_tweak)
		tsc_tweak = base_hz / tsc_hz;
}
5568 
/* strip every '_' from s, compacting the string in place */
static void remove_underbar(char *s)
{
	char *dst, *src;

	for (dst = src = s; *src; src++) {
		if (*src != '_')
			*dst++ = *src;
	}

	*dst = '\0';
}
5581 
/* dump every turbo-ratio-limit MSR this platform advertises */
static void dump_turbo_ratio_info(void)
{
	if (!has_turbo)
		return;

	if (!platform->has_nhm_msrs || no_msr)
		return;

	if (platform->trl_msrs & TRL_LIMIT2)
		dump_turbo_ratio_limit2();

	if (platform->trl_msrs & TRL_LIMIT1)
		dump_turbo_ratio_limit1();

	if (platform->trl_msrs & TRL_BASE) {
		dump_turbo_ratio_limits(MSR_TURBO_RATIO_LIMIT);

		/* hybrid parts carry a second limit MSR for the other core type */
		if (is_hybrid)
			dump_turbo_ratio_limits(MSR_SECONDARY_TURBO_RATIO_LIMIT);
	}

	if (platform->trl_msrs & TRL_ATOM)
		dump_atom_turbo_ratio_limits();

	if (platform->trl_msrs & TRL_KNL)
		dump_knl_turbo_ratio_limits();

	if (platform->has_config_tdp)
		dump_config_tdp();
}
5612 
5613 static int read_sysfs_int(char *path)
5614 {
5615 	FILE *input;
5616 	int retval = -1;
5617 
5618 	input = fopen(path, "r");
5619 	if (input == NULL) {
5620 		if (debug)
5621 			fprintf(outf, "NSFOD %s\n", path);
5622 		return (-1);
5623 	}
5624 	if (fscanf(input, "%d", &retval) != 1)
5625 		err(1, "%s: failed to read int from file", path);
5626 	fclose(input);
5627 
5628 	return (retval);
5629 }
5630 
5631 static void dump_sysfs_file(char *path)
5632 {
5633 	FILE *input;
5634 	char cpuidle_buf[64];
5635 
5636 	input = fopen(path, "r");
5637 	if (input == NULL) {
5638 		if (debug)
5639 			fprintf(outf, "NSFOD %s\n", path);
5640 		return;
5641 	}
5642 	if (!fgets(cpuidle_buf, sizeof(cpuidle_buf), input))
5643 		err(1, "%s: failed to read file", path);
5644 	fclose(input);
5645 
5646 	fprintf(outf, "%s: %s", strrchr(path, '/') + 1, cpuidle_buf);
5647 }
5648 
/*
 * probe_intel_uncore_frequency_legacy()
 *
 * Probe the per-package/per-die intel_uncore_frequency sysfs
 * interface; enable the UncMHz column if present and, unless quiet,
 * print the current/min/max/initial limits for each domain.
 */
static void probe_intel_uncore_frequency_legacy(void)
{
	int i, j;
	char path[256];

	for (i = 0; i < topo.num_packages; ++i) {
		for (j = 0; j <= topo.max_die_id; ++j) {
			int k, l;
			char path_base[128];

			sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d", i,
				j);

			if (access(path_base, R_OK))
				continue;

			BIC_PRESENT(BIC_UNCORE_MHZ);

			if (quiet)
				return;

			sprintf(path, "%s/min_freq_khz", path_base);
			k = read_sysfs_int(path);
			sprintf(path, "%s/max_freq_khz", path_base);
			l = read_sysfs_int(path);
			fprintf(outf, "Uncore Frequency package%d die%d: %d - %d MHz ", i, j, k / 1000, l / 1000);

			sprintf(path, "%s/initial_min_freq_khz", path_base);
			k = read_sysfs_int(path);
			sprintf(path, "%s/initial_max_freq_khz", path_base);
			l = read_sysfs_int(path);
			fprintf(outf, "(%d - %d MHz)", k / 1000, l / 1000);

			sprintf(path, "%s/current_freq_khz", path_base);
			k = read_sysfs_int(path);
			fprintf(outf, " %d MHz\n", k / 1000);
		}
	}
}
5688 
/*
 * probe_intel_uncore_frequency_cluster()
 *
 * Probe the per-cluster intel_uncore_frequency sysfs interface
 * (uncore00, uncore01, ...), print each cluster's frequency limits,
 * and register a UMHz counter column per cluster.
 *
 * NOTE(review): the early return on 'quiet' also skips the
 * add_counter() registration below, so no UMHz columns appear in
 * quiet mode -- confirm that is intended.
 */
static void probe_intel_uncore_frequency_cluster(void)
{
	int i, uncore_max_id;
	char path[256];
	char path_base[128];

	if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00/current_freq_khz", R_OK))
		return;

	if (quiet)
		return;

	for (uncore_max_id = 0;; ++uncore_max_id) {

		sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", uncore_max_id);

		/* uncore## start at 00 and skips no numbers, so stop upon first missing */
		if (access(path_base, R_OK)) {
			uncore_max_id -= 1;
			break;
		}
	}
	/* walk highest-numbered cluster down to 0 */
	for (i = uncore_max_id; i >= 0; --i) {
		int k, l;
		int package_id, domain_id, cluster_id;
		char name_buf[16];

		sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", i);

		/* the directory existed a moment ago; vanishing now is fatal */
		if (access(path_base, R_OK))
			err(1, "%s: %s\n", __func__, path_base);

		sprintf(path, "%s/package_id", path_base);
		package_id = read_sysfs_int(path);

		sprintf(path, "%s/domain_id", path_base);
		domain_id = read_sysfs_int(path);

		sprintf(path, "%s/fabric_cluster_id", path_base);
		cluster_id = read_sysfs_int(path);

		sprintf(path, "%s/min_freq_khz", path_base);
		k = read_sysfs_int(path);
		sprintf(path, "%s/max_freq_khz", path_base);
		l = read_sysfs_int(path);
		fprintf(outf, "Uncore Frequency package%d domain%d cluster%d: %d - %d MHz ", package_id, domain_id,
			cluster_id, k / 1000, l / 1000);

		sprintf(path, "%s/initial_min_freq_khz", path_base);
		k = read_sysfs_int(path);
		sprintf(path, "%s/initial_max_freq_khz", path_base);
		l = read_sysfs_int(path);
		fprintf(outf, "(%d - %d MHz)", k / 1000, l / 1000);

		sprintf(path, "%s/current_freq_khz", path_base);
		k = read_sysfs_int(path);
		fprintf(outf, " %d MHz\n", k / 1000);

		/* register a per-cluster UMHz column sampled from current_freq_khz */
		sprintf(path, "%s/current_freq_khz", path_base);
		sprintf(name_buf, "UMHz%d.%d", domain_id, cluster_id);

		add_counter(0, path, name_buf, 0, SCOPE_PACKAGE, COUNTER_K2M, FORMAT_AVERAGE, 0, package_id);
	}
}
5753 
5754 static void probe_intel_uncore_frequency(void)
5755 {
5756 	if (!genuine_intel)
5757 		return;
5758 
5759 	if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00", R_OK) == 0)
5760 		probe_intel_uncore_frequency_cluster();
5761 	else
5762 		probe_intel_uncore_frequency_legacy();
5763 }
5764 
/*
 * probe_graphics()
 *
 * Discover which graphics sysfs interface this system exposes
 * (Xe, new i915 per-gt, or traditional i915), record the paths in
 * gfx_info[], and enable the matching built-in columns.
 */
static void probe_graphics(void)
{
	/* Xe graphics sysfs knobs */
	if (!access("/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms", R_OK)) {
		FILE *fp;
		char buf[8];
		bool gt0_is_gt;
		int idx;

		/* gt0's name tells whether gt0 is render ("gt0-rc") or media ("gt0-mc") */
		fp = fopen("/sys/class/drm/card0/device/tile0/gt0/gtidle/name", "r");
		if (!fp)
			goto next;

		if (!fread(buf, sizeof(char), 7, fp)) {
			fclose(fp);
			goto next;
		}
		fclose(fp);

		if (!strncmp(buf, "gt0-rc", strlen("gt0-rc")))
			gt0_is_gt = true;
		else if (!strncmp(buf, "gt0-mc", strlen("gt0-mc")))
			gt0_is_gt = false;
		else
			goto next;

		/* gt0 knobs feed either the GFX or SAM columns, per gt0_is_gt */
		idx = gt0_is_gt ? GFX_rc6 : SAM_mc6;
		gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms";

		idx = gt0_is_gt ? GFX_MHz : SAM_MHz;
		if (!access("/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq", R_OK))
			gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq";

		idx = gt0_is_gt ? GFX_ACTMHz : SAM_ACTMHz;
		if (!access("/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq", R_OK))
			gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq";

		/* gt1 knobs feed the opposite set of columns */
		idx = gt0_is_gt ? SAM_mc6 : GFX_rc6;
		if (!access("/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms", R_OK))
			gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms";

		idx = gt0_is_gt ? SAM_MHz : GFX_MHz;
		if (!access("/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq", R_OK))
			gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq";

		idx = gt0_is_gt ? SAM_ACTMHz : GFX_ACTMHz;
		if (!access("/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq", R_OK))
			gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq";

		goto end;
	}

next:
	/* New i915 graphics sysfs knobs */
	if (!access("/sys/class/drm/card0/gt/gt0/rc6_residency_ms", R_OK)) {
		gfx_info[GFX_rc6].path = "/sys/class/drm/card0/gt/gt0/rc6_residency_ms";

		if (!access("/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz", R_OK))
			gfx_info[GFX_MHz].path = "/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz";

		if (!access("/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz", R_OK))
			gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz";

		if (!access("/sys/class/drm/card0/gt/gt1/rc6_residency_ms", R_OK))
			gfx_info[SAM_mc6].path = "/sys/class/drm/card0/gt/gt1/rc6_residency_ms";

		if (!access("/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz", R_OK))
			gfx_info[SAM_MHz].path = "/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz";

		if (!access("/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz", R_OK))
			gfx_info[SAM_ACTMHz].path = "/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz";

		goto end;
	}

	/* Fall back to traditional i915 graphics sysfs knobs */
	if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK))
		gfx_info[GFX_rc6].path = "/sys/class/drm/card0/power/rc6_residency_ms";

	if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK))
		gfx_info[GFX_MHz].path = "/sys/class/drm/card0/gt_cur_freq_mhz";
	else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK))
		gfx_info[GFX_MHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz";

	if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK))
		gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt_act_freq_mhz";
	else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK))
		gfx_info[GFX_ACTMHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz";

end:
	/* enable a column only if a path for it was found above */
	if (gfx_info[GFX_rc6].path)
		BIC_PRESENT(BIC_GFX_rc6);
	if (gfx_info[GFX_MHz].path)
		BIC_PRESENT(BIC_GFXMHz);
	if (gfx_info[GFX_ACTMHz].path)
		BIC_PRESENT(BIC_GFXACTMHz);
	if (gfx_info[SAM_mc6].path)
		BIC_PRESENT(BIC_SAM_mc6);
	if (gfx_info[SAM_MHz].path)
		BIC_PRESENT(BIC_SAMMHz);
	if (gfx_info[SAM_ACTMHz].path)
		BIC_PRESENT(BIC_SAMACTMHz);
}
5868 
5869 static void dump_sysfs_cstate_config(void)
5870 {
5871 	char path[64];
5872 	char name_buf[16];
5873 	char desc[64];
5874 	FILE *input;
5875 	int state;
5876 	char *sp;
5877 
5878 	if (access("/sys/devices/system/cpu/cpuidle", R_OK)) {
5879 		fprintf(outf, "cpuidle not loaded\n");
5880 		return;
5881 	}
5882 
5883 	dump_sysfs_file("/sys/devices/system/cpu/cpuidle/current_driver");
5884 	dump_sysfs_file("/sys/devices/system/cpu/cpuidle/current_governor");
5885 	dump_sysfs_file("/sys/devices/system/cpu/cpuidle/current_governor_ro");
5886 
5887 	for (state = 0; state < 10; ++state) {
5888 
5889 		sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state);
5890 		input = fopen(path, "r");
5891 		if (input == NULL)
5892 			continue;
5893 		if (!fgets(name_buf, sizeof(name_buf), input))
5894 			err(1, "%s: failed to read file", path);
5895 
5896 		/* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */
5897 		sp = strchr(name_buf, '-');
5898 		if (!sp)
5899 			sp = strchrnul(name_buf, '\n');
5900 		*sp = '\0';
5901 		fclose(input);
5902 
5903 		remove_underbar(name_buf);
5904 
5905 		sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/desc", base_cpu, state);
5906 		input = fopen(path, "r");
5907 		if (input == NULL)
5908 			continue;
5909 		if (!fgets(desc, sizeof(desc), input))
5910 			err(1, "%s: failed to read file", path);
5911 
5912 		fprintf(outf, "cpu%d: %s: %s", base_cpu, name_buf, desc);
5913 		fclose(input);
5914 	}
5915 }
5916 
5917 static void dump_sysfs_pstate_config(void)
5918 {
5919 	char path[64];
5920 	char driver_buf[64];
5921 	char governor_buf[64];
5922 	FILE *input;
5923 	int turbo;
5924 
5925 	sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_driver", base_cpu);
5926 	input = fopen(path, "r");
5927 	if (input == NULL) {
5928 		fprintf(outf, "NSFOD %s\n", path);
5929 		return;
5930 	}
5931 	if (!fgets(driver_buf, sizeof(driver_buf), input))
5932 		err(1, "%s: failed to read file", path);
5933 	fclose(input);
5934 
5935 	sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", base_cpu);
5936 	input = fopen(path, "r");
5937 	if (input == NULL) {
5938 		fprintf(outf, "NSFOD %s\n", path);
5939 		return;
5940 	}
5941 	if (!fgets(governor_buf, sizeof(governor_buf), input))
5942 		err(1, "%s: failed to read file", path);
5943 	fclose(input);
5944 
5945 	fprintf(outf, "cpu%d: cpufreq driver: %s", base_cpu, driver_buf);
5946 	fprintf(outf, "cpu%d: cpufreq governor: %s", base_cpu, governor_buf);
5947 
5948 	sprintf(path, "/sys/devices/system/cpu/cpufreq/boost");
5949 	input = fopen(path, "r");
5950 	if (input != NULL) {
5951 		if (fscanf(input, "%d", &turbo) != 1)
5952 			err(1, "%s: failed to parse number from file", path);
5953 		fprintf(outf, "cpufreq boost: %d\n", turbo);
5954 		fclose(input);
5955 	}
5956 
5957 	sprintf(path, "/sys/devices/system/cpu/intel_pstate/no_turbo");
5958 	input = fopen(path, "r");
5959 	if (input != NULL) {
5960 		if (fscanf(input, "%d", &turbo) != 1)
5961 			err(1, "%s: failed to parse number from file", path);
5962 		fprintf(outf, "cpufreq intel_pstate no_turbo: %d\n", turbo);
5963 		fclose(input);
5964 	}
5965 }
5966 
5967 /*
5968  * print_epb()
5969  * Decode the ENERGY_PERF_BIAS MSR
5970  */
5971 int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p)
5972 {
5973 	char *epb_string;
5974 	int cpu, epb;
5975 
5976 	UNUSED(c);
5977 	UNUSED(p);
5978 
5979 	if (!has_epb)
5980 		return 0;
5981 
5982 	cpu = t->cpu_id;
5983 
5984 	/* EPB is per-package */
5985 	if (!is_cpu_first_thread_in_package(t, c, p))
5986 		return 0;
5987 
5988 	if (cpu_migrate(cpu)) {
5989 		fprintf(outf, "print_epb: Could not migrate to CPU %d\n", cpu);
5990 		return -1;
5991 	}
5992 
5993 	epb = get_epb(cpu);
5994 	if (epb < 0)
5995 		return 0;
5996 
5997 	switch (epb) {
5998 	case ENERGY_PERF_BIAS_PERFORMANCE:
5999 		epb_string = "performance";
6000 		break;
6001 	case ENERGY_PERF_BIAS_NORMAL:
6002 		epb_string = "balanced";
6003 		break;
6004 	case ENERGY_PERF_BIAS_POWERSAVE:
6005 		epb_string = "powersave";
6006 		break;
6007 	default:
6008 		epb_string = "custom";
6009 		break;
6010 	}
6011 	fprintf(outf, "cpu%d: EPB: %d (%s)\n", cpu, epb, epb_string);
6012 
6013 	return 0;
6014 }
6015 
6016 /*
6017  * print_hwp()
6018  * Decode the MSR_HWP_CAPABILITIES
6019  */
6020 int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p)
6021 {
6022 	unsigned long long msr;
6023 	int cpu;
6024 
6025 	UNUSED(c);
6026 	UNUSED(p);
6027 
6028 	if (no_msr)
6029 		return 0;
6030 
6031 	if (!has_hwp)
6032 		return 0;
6033 
6034 	cpu = t->cpu_id;
6035 
6036 	/* MSR_HWP_CAPABILITIES is per-package */
6037 	if (!is_cpu_first_thread_in_package(t, c, p))
6038 		return 0;
6039 
6040 	if (cpu_migrate(cpu)) {
6041 		fprintf(outf, "print_hwp: Could not migrate to CPU %d\n", cpu);
6042 		return -1;
6043 	}
6044 
6045 	if (get_msr(cpu, MSR_PM_ENABLE, &msr))
6046 		return 0;
6047 
6048 	fprintf(outf, "cpu%d: MSR_PM_ENABLE: 0x%08llx (%sHWP)\n", cpu, msr, (msr & (1 << 0)) ? "" : "No-");
6049 
6050 	/* MSR_PM_ENABLE[1] == 1 if HWP is enabled and MSRs visible */
6051 	if ((msr & (1 << 0)) == 0)
6052 		return 0;
6053 
6054 	if (get_msr(cpu, MSR_HWP_CAPABILITIES, &msr))
6055 		return 0;
6056 
6057 	fprintf(outf, "cpu%d: MSR_HWP_CAPABILITIES: 0x%08llx "
6058 		"(high %d guar %d eff %d low %d)\n",
6059 		cpu, msr,
6060 		(unsigned int)HWP_HIGHEST_PERF(msr),
6061 		(unsigned int)HWP_GUARANTEED_PERF(msr),
6062 		(unsigned int)HWP_MOSTEFFICIENT_PERF(msr), (unsigned int)HWP_LOWEST_PERF(msr));
6063 
6064 	if (get_msr(cpu, MSR_HWP_REQUEST, &msr))
6065 		return 0;
6066 
6067 	fprintf(outf, "cpu%d: MSR_HWP_REQUEST: 0x%08llx "
6068 		"(min %d max %d des %d epp 0x%x window 0x%x pkg 0x%x)\n",
6069 		cpu, msr,
6070 		(unsigned int)(((msr) >> 0) & 0xff),
6071 		(unsigned int)(((msr) >> 8) & 0xff),
6072 		(unsigned int)(((msr) >> 16) & 0xff),
6073 		(unsigned int)(((msr) >> 24) & 0xff),
6074 		(unsigned int)(((msr) >> 32) & 0xff3), (unsigned int)(((msr) >> 42) & 0x1));
6075 
6076 	if (has_hwp_pkg) {
6077 		if (get_msr(cpu, MSR_HWP_REQUEST_PKG, &msr))
6078 			return 0;
6079 
6080 		fprintf(outf, "cpu%d: MSR_HWP_REQUEST_PKG: 0x%08llx "
6081 			"(min %d max %d des %d epp 0x%x window 0x%x)\n",
6082 			cpu, msr,
6083 			(unsigned int)(((msr) >> 0) & 0xff),
6084 			(unsigned int)(((msr) >> 8) & 0xff),
6085 			(unsigned int)(((msr) >> 16) & 0xff),
6086 			(unsigned int)(((msr) >> 24) & 0xff), (unsigned int)(((msr) >> 32) & 0xff3));
6087 	}
6088 	if (has_hwp_notify) {
6089 		if (get_msr(cpu, MSR_HWP_INTERRUPT, &msr))
6090 			return 0;
6091 
6092 		fprintf(outf, "cpu%d: MSR_HWP_INTERRUPT: 0x%08llx "
6093 			"(%s_Guaranteed_Perf_Change, %s_Excursion_Min)\n",
6094 			cpu, msr, ((msr) & 0x1) ? "EN" : "Dis", ((msr) & 0x2) ? "EN" : "Dis");
6095 	}
6096 	if (get_msr(cpu, MSR_HWP_STATUS, &msr))
6097 		return 0;
6098 
6099 	fprintf(outf, "cpu%d: MSR_HWP_STATUS: 0x%08llx "
6100 		"(%sGuaranteed_Perf_Change, %sExcursion_Min)\n",
6101 		cpu, msr, ((msr) & 0x1) ? "" : "No-", ((msr) & 0x4) ? "" : "No-");
6102 
6103 	return 0;
6104 }
6105 
/*
 * print_perf_limit()
 * Decode MSR_{CORE,GFX,RING}_PERF_LIMIT_REASONS for this package.
 * The low half of each MSR reports currently-Active throttle reasons;
 * the high half holds sticky "Logged" copies of the same bits.
 */
int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
	unsigned long long msr;
	int cpu;

	UNUSED(c);
	UNUSED(p);

	if (no_msr)
		return 0;

	cpu = t->cpu_id;

	/* per-package */
	if (!is_cpu_first_thread_in_package(t, c, p))
		return 0;

	if (cpu_migrate(cpu)) {
		fprintf(outf, "print_perf_limit: Could not migrate to CPU %d\n", cpu);
		return -1;
	}

	if (platform->plr_msrs & PLR_CORE) {
		get_msr(cpu, MSR_CORE_PERF_LIMIT_REASONS, &msr);
		fprintf(outf, "cpu%d: MSR_CORE_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr);
		/* Active reasons: bits 15:0 */
		fprintf(outf, " (Active: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)",
			(msr & 1 << 15) ? "bit15, " : "",
			(msr & 1 << 14) ? "bit14, " : "",
			(msr & 1 << 13) ? "Transitions, " : "",
			(msr & 1 << 12) ? "MultiCoreTurbo, " : "",
			(msr & 1 << 11) ? "PkgPwrL2, " : "",
			(msr & 1 << 10) ? "PkgPwrL1, " : "",
			(msr & 1 << 9) ? "CorePwr, " : "",
			(msr & 1 << 8) ? "Amps, " : "",
			(msr & 1 << 6) ? "VR-Therm, " : "",
			(msr & 1 << 5) ? "Auto-HWP, " : "",
			(msr & 1 << 4) ? "Graphics, " : "",
			(msr & 1 << 2) ? "bit2, " : "",
			(msr & 1 << 1) ? "ThermStatus, " : "", (msr & 1 << 0) ? "PROCHOT, " : "");
		/* Logged (sticky) reasons: bits 31:16, same layout shifted by 16 */
		fprintf(outf, " (Logged: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)\n",
			(msr & 1 << 31) ? "bit31, " : "",
			(msr & 1 << 30) ? "bit30, " : "",
			(msr & 1 << 29) ? "Transitions, " : "",
			(msr & 1 << 28) ? "MultiCoreTurbo, " : "",
			(msr & 1 << 27) ? "PkgPwrL2, " : "",
			(msr & 1 << 26) ? "PkgPwrL1, " : "",
			(msr & 1 << 25) ? "CorePwr, " : "",
			(msr & 1 << 24) ? "Amps, " : "",
			(msr & 1 << 22) ? "VR-Therm, " : "",
			(msr & 1 << 21) ? "Auto-HWP, " : "",
			(msr & 1 << 20) ? "Graphics, " : "",
			(msr & 1 << 18) ? "bit18, " : "",
			(msr & 1 << 17) ? "ThermStatus, " : "", (msr & 1 << 16) ? "PROCHOT, " : "");

	}
	if (platform->plr_msrs & PLR_GFX) {
		get_msr(cpu, MSR_GFX_PERF_LIMIT_REASONS, &msr);
		fprintf(outf, "cpu%d: MSR_GFX_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr);
		fprintf(outf, " (Active: %s%s%s%s%s%s%s%s)",
			(msr & 1 << 0) ? "PROCHOT, " : "",
			(msr & 1 << 1) ? "ThermStatus, " : "",
			(msr & 1 << 4) ? "Graphics, " : "",
			(msr & 1 << 6) ? "VR-Therm, " : "",
			(msr & 1 << 8) ? "Amps, " : "",
			(msr & 1 << 9) ? "GFXPwr, " : "",
			(msr & 1 << 10) ? "PkgPwrL1, " : "", (msr & 1 << 11) ? "PkgPwrL2, " : "");
		fprintf(outf, " (Logged: %s%s%s%s%s%s%s%s)\n",
			(msr & 1 << 16) ? "PROCHOT, " : "",
			(msr & 1 << 17) ? "ThermStatus, " : "",
			(msr & 1 << 20) ? "Graphics, " : "",
			(msr & 1 << 22) ? "VR-Therm, " : "",
			(msr & 1 << 24) ? "Amps, " : "",
			(msr & 1 << 25) ? "GFXPwr, " : "",
			(msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? "PkgPwrL2, " : "");
	}
	if (platform->plr_msrs & PLR_RING) {
		get_msr(cpu, MSR_RING_PERF_LIMIT_REASONS, &msr);
		fprintf(outf, "cpu%d: MSR_RING_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr);
		fprintf(outf, " (Active: %s%s%s%s%s%s)",
			(msr & 1 << 0) ? "PROCHOT, " : "",
			(msr & 1 << 1) ? "ThermStatus, " : "",
			(msr & 1 << 6) ? "VR-Therm, " : "",
			(msr & 1 << 8) ? "Amps, " : "",
			(msr & 1 << 10) ? "PkgPwrL1, " : "", (msr & 1 << 11) ? "PkgPwrL2, " : "");
		fprintf(outf, " (Logged: %s%s%s%s%s%s)\n",
			(msr & 1 << 16) ? "PROCHOT, " : "",
			(msr & 1 << 17) ? "ThermStatus, " : "",
			(msr & 1 << 22) ? "VR-Therm, " : "",
			(msr & 1 << 24) ? "Amps, " : "",
			(msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? "PkgPwrL2, " : "");
	}
	return 0;
}
6202 
6203 #define	RAPL_POWER_GRANULARITY	0x7FFF	/* 15 bit power granularity */
6204 #define	RAPL_TIME_GRANULARITY	0x3F	/* 6 bit time granularity */
6205 
6206 double get_quirk_tdp(void)
6207 {
6208 	if (platform->rapl_quirk_tdp)
6209 		return platform->rapl_quirk_tdp;
6210 
6211 	return 135.0;
6212 }
6213 
6214 double get_tdp_intel(void)
6215 {
6216 	unsigned long long msr;
6217 
6218 	if (platform->rapl_msrs & RAPL_PKG_POWER_INFO)
6219 		if (!get_msr(base_cpu, MSR_PKG_POWER_INFO, &msr))
6220 			return ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units;
6221 	return get_quirk_tdp();
6222 }
6223 
/*
 * get_tdp_amd()
 * No TDP-info MSR is read on AMD here; use the quirk/default value.
 */
double get_tdp_amd(void)
{
	return get_quirk_tdp();
}
6228 
/*
 * rapl_probe_intel()
 * Read the RAPL power/energy/time units from MSR_RAPL_POWER_UNIT,
 * apply platform unit quirks, and report how long the 32-bit energy
 * counter takes to wrap at TDP.  Also prunes the Watts vs. Joules
 * output columns according to the rapl_joules option.
 */
void rapl_probe_intel(void)
{
	unsigned long long msr;
	unsigned int time_unit;
	double tdp;
	const unsigned long long bic_watt_bits = BIC_PkgWatt | BIC_CorWatt | BIC_RAMWatt | BIC_GFXWatt;
	const unsigned long long bic_joules_bits = BIC_Pkg_J | BIC_Cor_J | BIC_RAM_J | BIC_GFX_J;

	/* Watts and Joules presentations are mutually exclusive */
	if (rapl_joules)
		bic_enabled &= ~bic_watt_bits;
	else
		bic_enabled &= ~bic_joules_bits;

	/* drop throttle-percentage columns whose status MSRs are absent */
	if (!(platform->rapl_msrs & RAPL_PKG_PERF_STATUS))
		bic_enabled &= ~BIC_PKG__;
	if (!(platform->rapl_msrs & RAPL_DRAM_PERF_STATUS))
		bic_enabled &= ~BIC_RAM__;

	/* units on package 0, verify later other packages match */
	if (get_msr(base_cpu, MSR_RAPL_POWER_UNIT, &msr))
		return;

	rapl_power_units = 1.0 / (1 << (msr & 0xF));
	if (platform->has_rapl_divisor)
		/* quirk: on these platforms the field is a divisor in micro-Joules */
		rapl_energy_units = 1.0 * (1 << (msr >> 8 & 0x1F)) / 1000000;
	else
		rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F));

	if (platform->has_fixed_rapl_unit)
		/* quirk: fixed 15.3 uJ DRAM energy unit, independent of the MSR */
		rapl_dram_energy_units = (15.3 / 1000000);
	else
		rapl_dram_energy_units = rapl_energy_units;

	time_unit = msr >> 16 & 0xF;
	if (time_unit == 0)
		time_unit = 0xA;	/* 0 means the architectural default, 1/1024 sec */

	rapl_time_units = 1.0 / (1 << (time_unit));

	tdp = get_tdp_intel();

	/* seconds for a 32-bit energy counter to wrap when drawing TDP */
	rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp;
	if (!quiet)
		fprintf(outf, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp);
}
6274 
6275 void rapl_probe_amd(void)
6276 {
6277 	unsigned long long msr;
6278 	double tdp;
6279 	const unsigned long long bic_watt_bits = BIC_PkgWatt | BIC_CorWatt;
6280 	const unsigned long long bic_joules_bits = BIC_Pkg_J | BIC_Cor_J;
6281 
6282 	if (rapl_joules)
6283 		bic_enabled &= ~bic_watt_bits;
6284 	else
6285 		bic_enabled &= ~bic_joules_bits;
6286 
6287 	if (get_msr(base_cpu, MSR_RAPL_PWR_UNIT, &msr))
6288 		return;
6289 
6290 	rapl_time_units = ldexp(1.0, -(msr >> 16 & 0xf));
6291 	rapl_energy_units = ldexp(1.0, -(msr >> 8 & 0x1f));
6292 	rapl_power_units = ldexp(1.0, -(msr & 0xf));
6293 
6294 	tdp = get_tdp_amd();
6295 
6296 	rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp;
6297 	if (!quiet)
6298 		fprintf(outf, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp);
6299 }
6300 
6301 void print_power_limit_msr(int cpu, unsigned long long msr, char *label)
6302 {
6303 	fprintf(outf, "cpu%d: %s: %sabled (%0.3f Watts, %f sec, clamp %sabled)\n",
6304 		cpu, label,
6305 		((msr >> 15) & 1) ? "EN" : "DIS",
6306 		((msr >> 0) & 0x7FFF) * rapl_power_units,
6307 		(1.0 + (((msr >> 22) & 0x3) / 4.0)) * (1 << ((msr >> 17) & 0x1F)) * rapl_time_units,
6308 		(((msr >> 16) & 1) ? "EN" : "DIS"));
6309 
6310 	return;
6311 }
6312 
/*
 * print_rapl()
 * Dump and decode the raw RAPL configuration MSRs for this package:
 * the unit register, PKG/DRAM power-info, and the PKG/DRAM/Cores/GFX
 * power-limit registers.  Returns a negative code on MSR read failure.
 */
int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
	unsigned long long msr;
	const char *msr_name;
	int cpu;

	UNUSED(c);
	UNUSED(p);

	if (!platform->rapl_msrs)
		return 0;

	/* RAPL counters are per package, so print only for 1st thread/package */
	if (!is_cpu_first_thread_in_package(t, c, p))
		return 0;

	cpu = t->cpu_id;
	if (cpu_migrate(cpu)) {
		fprintf(outf, "print_rapl: Could not migrate to CPU %d\n", cpu);
		return -1;
	}

	/* AMD F17h+ uses a differently-named (but similar) unit MSR */
	if (platform->rapl_msrs & RAPL_AMD_F17H) {
		msr_name = "MSR_RAPL_PWR_UNIT";
		if (get_msr(cpu, MSR_RAPL_PWR_UNIT, &msr))
			return -1;
	} else {
		msr_name = "MSR_RAPL_POWER_UNIT";
		if (get_msr(cpu, MSR_RAPL_POWER_UNIT, &msr))
			return -1;
	}

	fprintf(outf, "cpu%d: %s: 0x%08llx (%f Watts, %f Joules, %f sec.)\n", cpu, msr_name, msr,
		rapl_power_units, rapl_energy_units, rapl_time_units);

	if (platform->rapl_msrs & RAPL_PKG_POWER_INFO) {

		if (get_msr(cpu, MSR_PKG_POWER_INFO, &msr))
			return -5;

		/* fields: TDP[14:0], min power[30:16], max power[46:32], max window[53:48] */
		fprintf(outf, "cpu%d: MSR_PKG_POWER_INFO: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n",
			cpu, msr,
			((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units,
			((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units,
			((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units,
			((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units);

	}
	if (platform->rapl_msrs & RAPL_PKG) {

		if (get_msr(cpu, MSR_PKG_POWER_LIMIT, &msr))
			return -9;

		fprintf(outf, "cpu%d: MSR_PKG_POWER_LIMIT: 0x%08llx (%slocked)\n",
			cpu, msr, (msr >> 63) & 1 ? "" : "UN");

		/* limit #1 in the low half, limit #2 in the high half */
		print_power_limit_msr(cpu, msr, "PKG Limit #1");
		fprintf(outf, "cpu%d: PKG Limit #2: %sabled (%0.3f Watts, %f* sec, clamp %sabled)\n",
			cpu,
			((msr >> 47) & 1) ? "EN" : "DIS",
			((msr >> 32) & 0x7FFF) * rapl_power_units,
			(1.0 + (((msr >> 54) & 0x3) / 4.0)) * (1 << ((msr >> 49) & 0x1F)) * rapl_time_units,
			((msr >> 48) & 1) ? "EN" : "DIS");

		if (get_msr(cpu, MSR_VR_CURRENT_CONFIG, &msr))
			return -9;

		fprintf(outf, "cpu%d: MSR_VR_CURRENT_CONFIG: 0x%08llx\n", cpu, msr);
		fprintf(outf, "cpu%d: PKG Limit #4: %f Watts (%slocked)\n",
			cpu, ((msr >> 0) & 0x1FFF) * rapl_power_units, (msr >> 31) & 1 ? "" : "UN");
	}

	if (platform->rapl_msrs & RAPL_DRAM_POWER_INFO) {
		if (get_msr(cpu, MSR_DRAM_POWER_INFO, &msr))
			return -6;

		fprintf(outf, "cpu%d: MSR_DRAM_POWER_INFO,: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n",
			cpu, msr,
			((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units,
			((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units,
			((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units,
			((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units);
	}
	if (platform->rapl_msrs & RAPL_DRAM) {
		if (get_msr(cpu, MSR_DRAM_POWER_LIMIT, &msr))
			return -9;
		fprintf(outf, "cpu%d: MSR_DRAM_POWER_LIMIT: 0x%08llx (%slocked)\n",
			cpu, msr, (msr >> 31) & 1 ? "" : "UN");

		print_power_limit_msr(cpu, msr, "DRAM Limit");
	}
	if (platform->rapl_msrs & RAPL_CORE_POLICY) {
		if (get_msr(cpu, MSR_PP0_POLICY, &msr))
			return -7;

		fprintf(outf, "cpu%d: MSR_PP0_POLICY: %lld\n", cpu, msr & 0xF);
	}
	if (platform->rapl_msrs & RAPL_CORE_POWER_LIMIT) {
		if (get_msr(cpu, MSR_PP0_POWER_LIMIT, &msr))
			return -9;
		fprintf(outf, "cpu%d: MSR_PP0_POWER_LIMIT: 0x%08llx (%slocked)\n",
			cpu, msr, (msr >> 31) & 1 ? "" : "UN");
		print_power_limit_msr(cpu, msr, "Cores Limit");
	}
	if (platform->rapl_msrs & RAPL_GFX) {
		if (get_msr(cpu, MSR_PP1_POLICY, &msr))
			return -8;

		fprintf(outf, "cpu%d: MSR_PP1_POLICY: %lld\n", cpu, msr & 0xF);

		if (get_msr(cpu, MSR_PP1_POWER_LIMIT, &msr))
			return -9;
		fprintf(outf, "cpu%d: MSR_PP1_POWER_LIMIT: 0x%08llx (%slocked)\n",
			cpu, msr, (msr >> 31) & 1 ? "" : "UN");
		print_power_limit_msr(cpu, msr, "GFX Limit");
	}
	return 0;
}
6431 
6432 /*
6433  * probe_rapl()
6434  *
6435  * sets rapl_power_units, rapl_energy_units, rapl_time_units
6436  */
6437 void probe_rapl(void)
6438 {
6439 	if (!platform->rapl_msrs || no_msr)
6440 		return;
6441 
6442 	if (genuine_intel)
6443 		rapl_probe_intel();
6444 	if (authentic_amd || hygon_genuine)
6445 		rapl_probe_amd();
6446 
6447 	if (quiet)
6448 		return;
6449 
6450 	for_all_cpus(print_rapl, ODD_COUNTERS);
6451 }
6452 
/*
 * MSR_IA32_TEMPERATURE_TARGET indicates the temperature where
 * the Thermal Control Circuit (TCC) activates.
 * This is usually equal to tjMax.
 *
 * Older processors do not have this MSR, so there we guess,
 * but also allow cmdline over-ride with -T.
 *
 * Several MSR temperature values are in units of degrees-C
 * below this value, including the Digital Thermal Sensor (DTS),
 * Package Thermal Management Sensor (PTM), and thermal event thresholds.
 */
int set_temperature_target(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
	unsigned long long msr;
	unsigned int tcc_default, tcc_offset;
	int cpu;

	UNUSED(c);
	UNUSED(p);

	/* tj_max is used only for dts or ptm */
	if (!(do_dts || do_ptm))
		return 0;

	/* this is a per-package concept */
	if (!is_cpu_first_thread_in_package(t, c, p))
		return 0;

	cpu = t->cpu_id;
	if (cpu_migrate(cpu)) {
		fprintf(outf, "Could not migrate to CPU %d\n", cpu);
		return -1;
	}

	/* the -T command-line override takes precedence over everything */
	if (tj_max_override != 0) {
		tj_max = tj_max_override;
		fprintf(outf, "cpu%d: Using cmdline TCC Target (%d C)\n", cpu, tj_max);
		return 0;
	}

	/* Temperature Target MSR is Nehalem and newer only */
	if (!platform->has_nhm_msrs || no_msr)
		goto guess;

	if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr))
		goto guess;

	/* TCC activation temperature lives in bits 23:16 */
	tcc_default = (msr >> 16) & 0xFF;

	if (!quiet) {
		int bits = platform->tcc_offset_bits;
		unsigned long long enabled = 0;

		/* MSR_PLATFORM_INFO bit 30: programmable TCC offset supported */
		if (bits && !get_msr(base_cpu, MSR_PLATFORM_INFO, &enabled))
			enabled = (enabled >> 30) & 1;

		if (bits && enabled) {
			/* offset (degrees below the default target) starts at bit 24 */
			tcc_offset = (msr >> 24) & GENMASK(bits - 1, 0);
			fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n",
				cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset);
		} else {
			fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", cpu, msr, tcc_default);
		}
	}

	/* a zero target is not credible; fall back to the guess */
	if (!tcc_default)
		goto guess;

	tj_max = tcc_default;

	return 0;

guess:
	tj_max = TJMAX_DEFAULT;
	fprintf(outf, "cpu%d: Guessing tjMax %d C, Please use -T to specify\n", cpu, tj_max);

	return 0;
}
6532 
/*
 * print_thermal()
 * Dump the DTS/PTM thermal MSRs: package thermal status/interrupt
 * (once per package) and, under --debug, per-core thermal status.
 * Temperatures are printed as (tj_max - DTS reading) degrees C.
 */
int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
	unsigned long long msr;
	unsigned int dts, dts2;
	int cpu;

	UNUSED(c);
	UNUSED(p);

	if (no_msr)
		return 0;

	if (!(do_dts || do_ptm))
		return 0;

	cpu = t->cpu_id;

	/* DTS is per-core, no need to print for each thread */
	if (!is_cpu_first_thread_in_core(t, c, p))
		return 0;

	if (cpu_migrate(cpu)) {
		fprintf(outf, "print_thermal: Could not migrate to CPU %d\n", cpu);
		return -1;
	}

	if (do_ptm && is_cpu_first_core_in_package(t, c, p)) {
		if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr))
			return 0;

		/* DTS reading: degrees below tj_max, bits 22:16 */
		dts = (msr >> 16) & 0x7F;
		fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", cpu, msr, tj_max - dts);

		if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &msr))
			return 0;

		/* the two programmable thermal-interrupt thresholds */
		dts = (msr >> 16) & 0x7F;
		dts2 = (msr >> 8) & 0x7F;
		fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n",
			cpu, msr, tj_max - dts, tj_max - dts2);
	}

	if (do_dts && debug) {
		unsigned int resolution;

		if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr))
			return 0;

		dts = (msr >> 16) & 0x7F;
		resolution = (msr >> 27) & 0xF;	/* reading resolution in degrees C */
		fprintf(outf, "cpu%d: MSR_IA32_THERM_STATUS: 0x%08llx (%d C +/- %d)\n",
			cpu, msr, tj_max - dts, resolution);

		if (get_msr(cpu, MSR_IA32_THERM_INTERRUPT, &msr))
			return 0;

		dts = (msr >> 16) & 0x7F;
		dts2 = (msr >> 8) & 0x7F;
		fprintf(outf, "cpu%d: MSR_IA32_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n",
			cpu, msr, tj_max - dts, tj_max - dts2);
	}

	return 0;
}
6597 
6598 void probe_thermal(void)
6599 {
6600 	if (!access("/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count", R_OK))
6601 		BIC_PRESENT(BIC_CORE_THROT_CNT);
6602 	else
6603 		BIC_NOT_PRESENT(BIC_CORE_THROT_CNT);
6604 
6605 	for_all_cpus(set_temperature_target, ODD_COUNTERS);
6606 
6607 	if (quiet)
6608 		return;
6609 
6610 	for_all_cpus(print_thermal, ODD_COUNTERS);
6611 }
6612 
6613 int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p)
6614 {
6615 	unsigned int eax, ebx, ecx, edx;
6616 
6617 	UNUSED(c);
6618 	UNUSED(p);
6619 
6620 	if (!genuine_intel)
6621 		return 0;
6622 
6623 	if (cpu_migrate(t->cpu_id)) {
6624 		fprintf(outf, "Could not migrate to CPU %d\n", t->cpu_id);
6625 		return -1;
6626 	}
6627 
6628 	if (max_level < 0x1a)
6629 		return 0;
6630 
6631 	__cpuid(0x1a, eax, ebx, ecx, edx);
6632 	eax = (eax >> 24) & 0xFF;
6633 	if (eax == 0x20)
6634 		t->is_atom = true;
6635 	return 0;
6636 }
6637 
6638 void decode_feature_control_msr(void)
6639 {
6640 	unsigned long long msr;
6641 
6642 	if (no_msr)
6643 		return;
6644 
6645 	if (!get_msr(base_cpu, MSR_IA32_FEAT_CTL, &msr))
6646 		fprintf(outf, "cpu%d: MSR_IA32_FEATURE_CONTROL: 0x%08llx (%sLocked %s)\n",
6647 			base_cpu, msr, msr & FEAT_CTL_LOCKED ? "" : "UN-", msr & (1 << 18) ? "SGX" : "");
6648 }
6649 
6650 void decode_misc_enable_msr(void)
6651 {
6652 	unsigned long long msr;
6653 
6654 	if (no_msr)
6655 		return;
6656 
6657 	if (!genuine_intel)
6658 		return;
6659 
6660 	if (!get_msr(base_cpu, MSR_IA32_MISC_ENABLE, &msr))
6661 		fprintf(outf, "cpu%d: MSR_IA32_MISC_ENABLE: 0x%08llx (%sTCC %sEIST %sMWAIT %sPREFETCH %sTURBO)\n",
6662 			base_cpu, msr,
6663 			msr & MSR_IA32_MISC_ENABLE_TM1 ? "" : "No-",
6664 			msr & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP ? "" : "No-",
6665 			msr & MSR_IA32_MISC_ENABLE_MWAIT ? "" : "No-",
6666 			msr & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE ? "No-" : "",
6667 			msr & MSR_IA32_MISC_ENABLE_TURBO_DISABLE ? "No-" : "");
6668 }
6669 
6670 void decode_misc_feature_control(void)
6671 {
6672 	unsigned long long msr;
6673 
6674 	if (no_msr)
6675 		return;
6676 
6677 	if (!platform->has_msr_misc_feature_control)
6678 		return;
6679 
6680 	if (!get_msr(base_cpu, MSR_MISC_FEATURE_CONTROL, &msr))
6681 		fprintf(outf,
6682 			"cpu%d: MSR_MISC_FEATURE_CONTROL: 0x%08llx (%sL2-Prefetch %sL2-Prefetch-pair %sL1-Prefetch %sL1-IP-Prefetch)\n",
6683 			base_cpu, msr, msr & (0 << 0) ? "No-" : "", msr & (1 << 0) ? "No-" : "",
6684 			msr & (2 << 0) ? "No-" : "", msr & (3 << 0) ? "No-" : "");
6685 }
6686 
6687 /*
6688  * Decode MSR_MISC_PWR_MGMT
6689  *
6690  * Decode the bits according to the Nehalem documentation
6691  * bit[0] seems to continue to have same meaning going forward
6692  * bit[1] less so...
6693  */
6694 void decode_misc_pwr_mgmt_msr(void)
6695 {
6696 	unsigned long long msr;
6697 
6698 	if (no_msr)
6699 		return;
6700 
6701 	if (!platform->has_msr_misc_pwr_mgmt)
6702 		return;
6703 
6704 	if (!get_msr(base_cpu, MSR_MISC_PWR_MGMT, &msr))
6705 		fprintf(outf, "cpu%d: MSR_MISC_PWR_MGMT: 0x%08llx (%sable-EIST_Coordination %sable-EPB %sable-OOB)\n",
6706 			base_cpu, msr,
6707 			msr & (1 << 0) ? "DIS" : "EN", msr & (1 << 1) ? "EN" : "DIS", msr & (1 << 8) ? "EN" : "DIS");
6708 }
6709 
6710 /*
6711  * Decode MSR_CC6_DEMOTION_POLICY_CONFIG, MSR_MC6_DEMOTION_POLICY_CONFIG
6712  *
6713  * This MSRs are present on Silvermont processors,
6714  * Intel Atom processor E3000 series (Baytrail), and friends.
6715  */
6716 void decode_c6_demotion_policy_msr(void)
6717 {
6718 	unsigned long long msr;
6719 
6720 	if (no_msr)
6721 		return;
6722 
6723 	if (!platform->has_msr_c6_demotion_policy_config)
6724 		return;
6725 
6726 	if (!get_msr(base_cpu, MSR_CC6_DEMOTION_POLICY_CONFIG, &msr))
6727 		fprintf(outf, "cpu%d: MSR_CC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-CC6-Demotion)\n",
6728 			base_cpu, msr, msr & (1 << 0) ? "EN" : "DIS");
6729 
6730 	if (!get_msr(base_cpu, MSR_MC6_DEMOTION_POLICY_CONFIG, &msr))
6731 		fprintf(outf, "cpu%d: MSR_MC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-MC6-Demotion)\n",
6732 			base_cpu, msr, msr & (1 << 0) ? "EN" : "DIS");
6733 }
6734 
6735 void print_dev_latency(void)
6736 {
6737 	char *path = "/dev/cpu_dma_latency";
6738 	int fd;
6739 	int value;
6740 	int retval;
6741 
6742 	fd = open(path, O_RDONLY);
6743 	if (fd < 0) {
6744 		if (debug)
6745 			warnx("Read %s failed", path);
6746 		return;
6747 	}
6748 
6749 	retval = read(fd, (void *)&value, sizeof(int));
6750 	if (retval != sizeof(int)) {
6751 		warn("read failed %s", path);
6752 		close(fd);
6753 		return;
6754 	}
6755 	fprintf(outf, "/dev/cpu_dma_latency: %d usec (%s)\n", value, value == 2000000000 ? "default" : "constrained");
6756 
6757 	close(fd);
6758 }
6759 
6760 static int has_instr_count_access(void)
6761 {
6762 	int fd;
6763 	int has_access;
6764 
6765 	if (no_perf)
6766 		return 0;
6767 
6768 	fd = open_perf_counter(base_cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0);
6769 	has_access = fd != -1;
6770 
6771 	if (fd != -1)
6772 		close(fd);
6773 
6774 	if (!has_access)
6775 		warnx("Failed to access %s. Some of the counters may not be available\n"
6776 		      "\tRun as root to enable them or use %s to disable the access explicitly",
6777 		      "instructions retired perf counter", "--no-perf");
6778 
6779 	return has_access;
6780 }
6781 
6782 bool is_aperf_access_required(void)
6783 {
6784 	return BIC_IS_ENABLED(BIC_Avg_MHz)
6785 	    || BIC_IS_ENABLED(BIC_Busy)
6786 	    || BIC_IS_ENABLED(BIC_Bzy_MHz)
6787 	    || BIC_IS_ENABLED(BIC_IPC)
6788 	    || BIC_IS_ENABLED(BIC_CPU_c1);
6789 }
6790 
/*
 * add_rapl_perf_counter_()
 * Open a perf event for one RAPL counter and attach it to the domain's
 * perf event group.  Returns the new fd, or -1 on any failure.  On
 * success, *scale_ and *unit_ receive the counter's scale and unit as
 * reported by the perf sysfs interface.
 */
int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai,
			   double *scale_, enum rapl_unit *unit_)
{
	if (no_perf)
		return -1;

	const double scale = read_perf_rapl_scale(cai->perf_subsys, cai->perf_name);

	if (scale == 0.0)
		return -1;

	const enum rapl_unit unit = read_perf_rapl_unit(cai->perf_subsys, cai->perf_name);

	if (unit == RAPL_UNIT_INVALID)
		return -1;

	const unsigned int rapl_type = read_perf_type(cai->perf_subsys);
	const unsigned int rapl_energy_pkg_config = read_rapl_config(cai->perf_subsys, cai->perf_name);

	/* join the domain's existing group (rci->fd_perf), or open standalone if none yet */
	const int fd_counter =
	    open_perf_counter(cpu, rapl_type, rapl_energy_pkg_config, rci->fd_perf, PERF_FORMAT_GROUP);
	if (fd_counter == -1)
		return -1;

	/* If it's the first counter opened, make it a group descriptor */
	if (rci->fd_perf == -1)
		rci->fd_perf = fd_counter;

	*scale_ = scale;
	*unit_ = unit;
	return fd_counter;
}
6823 
6824 int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai,
6825 			  double *scale, enum rapl_unit *unit)
6826 {
6827 	int ret = add_rapl_perf_counter_(cpu, rci, cai, scale, unit);
6828 
6829 	if (debug)
6830 		fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu);
6831 
6832 	return ret;
6833 }
6834 
6835 /*
6836  * Linux-perf manages the HW instructions-retired counter
6837  * by enabling when requested, and hiding rollover
6838  */
6839 void linux_perf_init(void)
6840 {
6841 	if (access("/proc/sys/kernel/perf_event_paranoid", F_OK))
6842 		return;
6843 
6844 	if (BIC_IS_ENABLED(BIC_IPC) && has_aperf) {
6845 		fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
6846 		if (fd_instr_count_percpu == NULL)
6847 			err(-1, "calloc fd_instr_count_percpu");
6848 	}
6849 
6850 	const bool aperf_required = is_aperf_access_required();
6851 
6852 	if (aperf_required && has_aperf && amperf_source == AMPERF_SOURCE_PERF) {
6853 		fd_amperf_percpu = calloc(topo.max_cpu_num + 1, sizeof(*fd_amperf_percpu));
6854 		if (fd_amperf_percpu == NULL)
6855 			err(-1, "calloc fd_amperf_percpu");
6856 	}
6857 }
6858 
6859 void rapl_perf_init(void)
6860 {
6861 	const unsigned int num_domains = (platform->has_per_core_rapl ? topo.max_core_id : topo.max_package_id) + 1;
6862 	bool *domain_visited = calloc(num_domains, sizeof(bool));
6863 
6864 	rapl_counter_info_perdomain = calloc(num_domains, sizeof(*rapl_counter_info_perdomain));
6865 	if (rapl_counter_info_perdomain == NULL)
6866 		err(-1, "calloc rapl_counter_info_percpu");
6867 	rapl_counter_info_perdomain_size = num_domains;
6868 
6869 	/*
6870 	 * Initialize rapl_counter_info_percpu
6871 	 */
6872 	for (unsigned int domain_id = 0; domain_id < num_domains; ++domain_id) {
6873 		struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain_id];
6874 
6875 		rci->fd_perf = -1;
6876 		for (size_t i = 0; i < NUM_RAPL_COUNTERS; ++i) {
6877 			rci->data[i] = 0;
6878 			rci->source[i] = RAPL_SOURCE_NONE;
6879 		}
6880 	}
6881 
6882 	/*
6883 	 * Open/probe the counters
6884 	 * If can't get it via perf, fallback to MSR
6885 	 */
6886 	for (size_t i = 0; i < ARRAY_SIZE(rapl_counter_arch_infos); ++i) {
6887 
6888 		const struct rapl_counter_arch_info *const cai = &rapl_counter_arch_infos[i];
6889 		bool has_counter = 0;
6890 		double scale;
6891 		enum rapl_unit unit;
6892 		unsigned int next_domain;
6893 
6894 		memset(domain_visited, 0, num_domains * sizeof(*domain_visited));
6895 
6896 		for (int cpu = 0; cpu < topo.max_cpu_num + 1; ++cpu) {
6897 
6898 			if (cpu_is_not_allowed(cpu))
6899 				continue;
6900 
6901 			/* Skip already seen and handled RAPL domains */
6902 			next_domain =
6903 			    platform->has_per_core_rapl ? cpus[cpu].physical_core_id : cpus[cpu].physical_package_id;
6904 
6905 			assert(next_domain < num_domains);
6906 
6907 			if (domain_visited[next_domain])
6908 				continue;
6909 
6910 			domain_visited[next_domain] = 1;
6911 
6912 			struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[next_domain];
6913 
6914 			/* Check if the counter is enabled and accessible */
6915 			if (BIC_IS_ENABLED(cai->bic) && (platform->rapl_msrs & cai->feature_mask)) {
6916 
6917 				/* Use perf API for this counter */
6918 				if (!no_perf && cai->perf_name
6919 				    && add_rapl_perf_counter(cpu, rci, cai, &scale, &unit) != -1) {
6920 					rci->source[cai->rci_index] = RAPL_SOURCE_PERF;
6921 					rci->scale[cai->rci_index] = scale * cai->compat_scale;
6922 					rci->unit[cai->rci_index] = unit;
6923 					rci->flags[cai->rci_index] = cai->flags;
6924 
6925 					/* Use MSR for this counter */
6926 				} else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) {
6927 					rci->source[cai->rci_index] = RAPL_SOURCE_MSR;
6928 					rci->msr[cai->rci_index] = cai->msr;
6929 					rci->msr_mask[cai->rci_index] = cai->msr_mask;
6930 					rci->msr_shift[cai->rci_index] = cai->msr_shift;
6931 					rci->unit[cai->rci_index] = RAPL_UNIT_JOULES;
6932 					rci->scale[cai->rci_index] = *cai->platform_rapl_msr_scale * cai->compat_scale;
6933 					rci->flags[cai->rci_index] = cai->flags;
6934 				}
6935 			}
6936 
6937 			if (rci->source[cai->rci_index] != RAPL_SOURCE_NONE)
6938 				has_counter = 1;
6939 		}
6940 
6941 		/* If any CPU has access to the counter, make it present */
6942 		if (has_counter)
6943 			BIC_PRESENT(cai->bic);
6944 	}
6945 
6946 	free(domain_visited);
6947 }
6948 
6949 static int has_amperf_access_via_msr(void)
6950 {
6951 	if (no_msr)
6952 		return 0;
6953 
6954 	if (probe_msr(base_cpu, MSR_IA32_APERF))
6955 		return 0;
6956 
6957 	if (probe_msr(base_cpu, MSR_IA32_MPERF))
6958 		return 0;
6959 
6960 	return 1;
6961 }
6962 
6963 static int has_amperf_access_via_perf(void)
6964 {
6965 	struct amperf_group_fd fds;
6966 
6967 	/*
6968 	 * Cache the last result, so we don't warn the user multiple times
6969 	 *
6970 	 * Negative means cached, no access
6971 	 * Zero means not cached
6972 	 * Positive means cached, has access
6973 	 */
6974 	static int has_access_cached;
6975 
6976 	if (no_perf)
6977 		return 0;
6978 
6979 	if (has_access_cached != 0)
6980 		return has_access_cached > 0;
6981 
6982 	fds = open_amperf_fd(base_cpu);
6983 	has_access_cached = (fds.aperf != -1) && (fds.mperf != -1);
6984 
6985 	if (fds.aperf == -1)
6986 		warnx("Failed to access %s. Some of the counters may not be available\n"
6987 		      "\tRun as root to enable them or use %s to disable the access explicitly",
6988 		      "APERF perf counter", "--no-perf");
6989 	else
6990 		close(fds.aperf);
6991 
6992 	if (fds.mperf == -1)
6993 		warnx("Failed to access %s. Some of the counters may not be available\n"
6994 		      "\tRun as root to enable them or use %s to disable the access explicitly",
6995 		      "MPERF perf counter", "--no-perf");
6996 	else
6997 		close(fds.mperf);
6998 
6999 	if (has_access_cached == 0)
7000 		has_access_cached = -1;
7001 
7002 	return has_access_cached > 0;
7003 }
7004 
7005 /* Check if we can access APERF and MPERF */
7006 static int has_amperf_access(void)
7007 {
7008 	if (!is_aperf_access_required())
7009 		return 0;
7010 
7011 	if (!no_msr && has_amperf_access_via_msr())
7012 		return 1;
7013 
7014 	if (!no_perf && has_amperf_access_via_perf())
7015 		return 1;
7016 
7017 	return 0;
7018 }
7019 
7020 int *get_cstate_perf_group_fd(struct cstate_counter_info_t *cci, const char *group_name)
7021 {
7022 	if (strcmp(group_name, "cstate_core") == 0)
7023 		return &cci->fd_perf_core;
7024 
7025 	if (strcmp(group_name, "cstate_pkg") == 0)
7026 		return &cci->fd_perf_pkg;
7027 
7028 	return NULL;
7029 }
7030 
7031 int add_cstate_perf_counter_(int cpu, struct cstate_counter_info_t *cci, const struct cstate_counter_arch_info *cai)
7032 {
7033 	if (no_perf)
7034 		return -1;
7035 
7036 	int *pfd_group = get_cstate_perf_group_fd(cci, cai->perf_subsys);
7037 
7038 	if (pfd_group == NULL)
7039 		return -1;
7040 
7041 	const unsigned int type = read_perf_type(cai->perf_subsys);
7042 	const unsigned int config = read_rapl_config(cai->perf_subsys, cai->perf_name);
7043 
7044 	const int fd_counter = open_perf_counter(cpu, type, config, *pfd_group, PERF_FORMAT_GROUP);
7045 
7046 	if (fd_counter == -1)
7047 		return -1;
7048 
7049 	/* If it's the first counter opened, make it a group descriptor */
7050 	if (*pfd_group == -1)
7051 		*pfd_group = fd_counter;
7052 
7053 	return fd_counter;
7054 }
7055 
7056 int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, const struct cstate_counter_arch_info *cai)
7057 {
7058 	int ret = add_cstate_perf_counter_(cpu, cci, cai);
7059 
7060 	if (debug)
7061 		fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu);
7062 
7063 	return ret;
7064 }
7065 
7066 void cstate_perf_init_(bool soft_c1)
7067 {
7068 	bool has_counter;
7069 	bool *cores_visited = NULL, *pkg_visited = NULL;
7070 	const int cores_visited_elems = topo.max_core_id + 1;
7071 	const int pkg_visited_elems = topo.max_package_id + 1;
7072 	const int cci_num = topo.max_cpu_num + 1;
7073 
7074 	ccstate_counter_info = calloc(cci_num, sizeof(*ccstate_counter_info));
7075 	if (!ccstate_counter_info)
7076 		err(1, "calloc ccstate_counter_arch_info");
7077 	ccstate_counter_info_size = cci_num;
7078 
7079 	cores_visited = calloc(cores_visited_elems, sizeof(*cores_visited));
7080 	if (!cores_visited)
7081 		err(1, "calloc cores_visited");
7082 
7083 	pkg_visited = calloc(pkg_visited_elems, sizeof(*pkg_visited));
7084 	if (!pkg_visited)
7085 		err(1, "calloc pkg_visited");
7086 
7087 	/* Initialize cstate_counter_info_percpu */
7088 	for (int cpu = 0; cpu < cci_num; ++cpu) {
7089 		ccstate_counter_info[cpu].fd_perf_core = -1;
7090 		ccstate_counter_info[cpu].fd_perf_pkg = -1;
7091 	}
7092 
7093 	for (int cidx = 0; cidx < NUM_CSTATE_COUNTERS; ++cidx) {
7094 		has_counter = false;
7095 		memset(cores_visited, 0, cores_visited_elems * sizeof(*cores_visited));
7096 		memset(pkg_visited, 0, pkg_visited_elems * sizeof(*pkg_visited));
7097 
7098 		const struct cstate_counter_arch_info *cai = &ccstate_counter_arch_infos[cidx];
7099 
7100 		for (int cpu = 0; cpu < cci_num; ++cpu) {
7101 
7102 			struct cstate_counter_info_t *const cci = &ccstate_counter_info[cpu];
7103 
7104 			if (cpu_is_not_allowed(cpu))
7105 				continue;
7106 
7107 			const int core_id = cpus[cpu].physical_core_id;
7108 			const int pkg_id = cpus[cpu].physical_package_id;
7109 
7110 			assert(core_id < cores_visited_elems);
7111 			assert(pkg_id < pkg_visited_elems);
7112 
7113 			const bool per_thread = cai->flags & CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD;
7114 			const bool per_core = cai->flags & CSTATE_COUNTER_FLAG_COLLECT_PER_CORE;
7115 
7116 			if (!per_thread && cores_visited[core_id])
7117 				continue;
7118 
7119 			if (!per_core && pkg_visited[pkg_id])
7120 				continue;
7121 
7122 			const bool counter_needed = BIC_IS_ENABLED(cai->bic) ||
7123 			    (soft_c1 && (cai->flags & CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY));
7124 			const bool counter_supported = (platform->supported_cstates & cai->feature_mask);
7125 
7126 			if (counter_needed && counter_supported) {
7127 				/* Use perf API for this counter */
7128 				if (!no_perf && cai->perf_name && add_cstate_perf_counter(cpu, cci, cai) != -1) {
7129 
7130 					cci->source[cai->rci_index] = CSTATE_SOURCE_PERF;
7131 
7132 					/* User MSR for this counter */
7133 				} else if (!no_msr && cai->msr && pkg_cstate_limit >= cai->pkg_cstate_limit
7134 					   && probe_msr(cpu, cai->msr) == 0) {
7135 					cci->source[cai->rci_index] = CSTATE_SOURCE_MSR;
7136 					cci->msr[cai->rci_index] = cai->msr;
7137 				}
7138 			}
7139 
7140 			if (cci->source[cai->rci_index] != CSTATE_SOURCE_NONE) {
7141 				has_counter = true;
7142 				cores_visited[core_id] = true;
7143 				pkg_visited[pkg_id] = true;
7144 			}
7145 		}
7146 
7147 		/* If any CPU has access to the counter, make it present */
7148 		if (has_counter)
7149 			BIC_PRESENT(cai->bic);
7150 	}
7151 
7152 	free(cores_visited);
7153 	free(pkg_visited);
7154 }
7155 
7156 void cstate_perf_init(void)
7157 {
7158 	/*
7159 	 * If we don't have a C1 residency MSR, we calculate it "in software",
7160 	 * but we need APERF, MPERF too.
7161 	 */
7162 	const bool soft_c1 = !platform->has_msr_core_c1_res && has_amperf_access()
7163 	    && platform->supported_cstates & CC1;
7164 
7165 	if (soft_c1)
7166 		BIC_PRESENT(BIC_CPU_c1);
7167 
7168 	cstate_perf_init_(soft_c1);
7169 }
7170 
/* Probe cstate-related capabilities and, unless --quiet, dump the config. */
void probe_cstates(void)
{
	/* Reads the package cstate limit MSR (sets pkg_cstate_limit) */
	probe_cst_limit();

	if (platform->has_msr_module_c6_res_ms)
		BIC_PRESENT(BIC_Mod_c6);

	/* Extended cstate MSRs (total/any/GFX C0) need MSR access */
	if (platform->has_ext_cst_msrs && !no_msr) {
		BIC_PRESENT(BIC_Totl_c0);
		BIC_PRESENT(BIC_Any_c0);
		BIC_PRESENT(BIC_GFX_c0);
		BIC_PRESENT(BIC_CPUGFX);
	}

	if (quiet)
		return;

	/* Verbose-mode dumps of the cstate configuration */
	dump_power_ctl();
	dump_cst_cfg();
	decode_c6_demotion_policy_msr();
	print_dev_latency();
	dump_sysfs_cstate_config();
	print_irtl();
}
7195 
7196 void probe_lpi(void)
7197 {
7198 	if (!access("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", R_OK))
7199 		BIC_PRESENT(BIC_CPU_LPI);
7200 	else
7201 		BIC_NOT_PRESENT(BIC_CPU_LPI);
7202 
7203 	if (!access(sys_lpi_file_sysfs, R_OK)) {
7204 		sys_lpi_file = sys_lpi_file_sysfs;
7205 		BIC_PRESENT(BIC_SYS_LPI);
7206 	} else if (!access(sys_lpi_file_debugfs, R_OK)) {
7207 		sys_lpi_file = sys_lpi_file_debugfs;
7208 		BIC_PRESENT(BIC_SYS_LPI);
7209 	} else {
7210 		sys_lpi_file_sysfs = NULL;
7211 		BIC_NOT_PRESENT(BIC_SYS_LPI);
7212 	}
7213 
7214 }
7215 
/* Probe pstate-related capabilities and, unless --quiet, dump the config. */
void probe_pstates(void)
{
	/* Determines the base clock (bclk) */
	probe_bclk();

	if (quiet)
		return;

	/* Verbose-mode dumps of the pstate configuration */
	dump_platform_info();
	dump_turbo_ratio_info();
	dump_sysfs_pstate_config();
	decode_misc_pwr_mgmt_msr();

	/* Per-CPU HWP / EPB / perf-limit dumps */
	for_all_cpus(print_hwp, ODD_COUNTERS);
	for_all_cpus(print_epb, ODD_COUNTERS);
	for_all_cpus(print_perf_limit, ODD_COUNTERS);
}
7232 
/*
 * Identify the CPU via CPUID and set the feature/capability globals
 * (vendor, family/model/stepping, APERF/MPERF, HWP, TSC rate, ...),
 * marking the columns those features enable as present.
 */
void process_cpuid()
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int fms, family, model, stepping, ecx_flags, edx_flags;
	unsigned long long ucode_patch = 0;
	bool ucode_patch_valid = false;

	eax = ebx = ecx = edx = 0;

	__cpuid(0, max_level, ebx, ecx, edx);

	/* CPUID(0) returns the 12-byte vendor string in EBX:EDX:ECX */
	if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
		genuine_intel = 1;
	else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
		authentic_amd = 1;
	else if (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e)
		hygon_genuine = 1;

	if (!quiet)
		fprintf(outf, "CPUID(0): %.4s%.4s%.4s 0x%x CPUID levels\n",
			(char *)&ebx, (char *)&edx, (char *)&ecx, max_level);

	__cpuid(1, fms, ebx, ecx, edx);
	family = (fms >> 8) & 0xf;
	model = (fms >> 4) & 0xf;
	stepping = fms & 0xf;
	/* Fold in the extended family/model fields */
	if (family == 0xf)
		family += (fms >> 20) & 0xff;
	if (family >= 6)
		model += ((fms >> 16) & 0xf) << 4;
	/* Save CPUID(1) feature flags before later CPUID calls clobber ecx/edx */
	ecx_flags = ecx;
	edx_flags = edx;

	if (!no_msr) {
		if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch))
			warnx("get_msr(UCODE)");
		else
			ucode_patch_valid = true;
	}

	/*
	 * check max extended function levels of CPUID.
	 * This is needed to check for invariant TSC.
	 * This check is valid for both Intel and AMD.
	 */
	ebx = ecx = edx = 0;
	__cpuid(0x80000000, max_extended_level, ebx, ecx, edx);

	if (!quiet) {
		fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d)",
			family, model, stepping, family, model, stepping);
		if (ucode_patch_valid)
			fprintf(outf, " microcode 0x%x", (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF));
		fputc('\n', outf);

		fprintf(outf, "CPUID(0x80000000): max_extended_levels: 0x%x\n", max_extended_level);
		fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s %s %s\n",
			ecx_flags & (1 << 0) ? "SSE3" : "-",
			ecx_flags & (1 << 3) ? "MONITOR" : "-",
			ecx_flags & (1 << 6) ? "SMX" : "-",
			ecx_flags & (1 << 7) ? "EIST" : "-",
			ecx_flags & (1 << 8) ? "TM2" : "-",
			edx_flags & (1 << 4) ? "TSC" : "-",
			edx_flags & (1 << 5) ? "MSR" : "-",
			edx_flags & (1 << 22) ? "ACPI-TM" : "-",
			edx_flags & (1 << 28) ? "HT" : "-", edx_flags & (1 << 29) ? "TM" : "-");
	}

	probe_platform_features(family, model);

	/* turbostat cannot run at all without MSR support in the CPU */
	if (!(edx_flags & (1 << 5)))
		errx(1, "CPUID: no MSR");

	if (max_extended_level >= 0x80000007) {

		/*
		 * Non-Stop TSC is advertised by CPUID.EAX=0x80000007: EDX.bit8
		 * this check is valid for both Intel and AMD
		 */
		__cpuid(0x80000007, eax, ebx, ecx, edx);
		has_invariant_tsc = edx & (1 << 8);
	}

	/*
	 * APERF/MPERF is advertised by CPUID.EAX=0x6: ECX.bit0
	 * this check is valid for both Intel and AMD
	 */

	__cpuid(0x6, eax, ebx, ecx, edx);
	has_aperf = ecx & (1 << 0);
	if (has_aperf && has_amperf_access()) {
		BIC_PRESENT(BIC_Avg_MHz);
		BIC_PRESENT(BIC_Busy);
		BIC_PRESENT(BIC_Bzy_MHz);
		BIC_PRESENT(BIC_IPC);
	}
	/* CPUID(6) EAX: thermal/power management feature bits */
	do_dts = eax & (1 << 0);
	if (do_dts)
		BIC_PRESENT(BIC_CoreTmp);
	has_turbo = eax & (1 << 1);
	do_ptm = eax & (1 << 6);
	if (do_ptm)
		BIC_PRESENT(BIC_PkgTmp);
	has_hwp = eax & (1 << 7);
	has_hwp_notify = eax & (1 << 8);
	has_hwp_activity_window = eax & (1 << 9);
	has_hwp_epp = eax & (1 << 10);
	has_hwp_pkg = eax & (1 << 11);
	has_epb = ecx & (1 << 3);

	if (!quiet)
		fprintf(outf, "CPUID(6): %sAPERF, %sTURBO, %sDTS, %sPTM, %sHWP, "
			"%sHWPnotify, %sHWPwindow, %sHWPepp, %sHWPpkg, %sEPB\n",
			has_aperf ? "" : "No-",
			has_turbo ? "" : "No-",
			do_dts ? "" : "No-",
			do_ptm ? "" : "No-",
			has_hwp ? "" : "No-",
			has_hwp_notify ? "" : "No-",
			has_hwp_activity_window ? "" : "No-",
			has_hwp_epp ? "" : "No-", has_hwp_pkg ? "" : "No-", has_epb ? "" : "No-");

	if (!quiet)
		decode_misc_enable_msr();

	if (max_level >= 0x7 && !quiet) {
		int has_sgx;

		ecx = 0;

		__cpuid_count(0x7, 0, eax, ebx, ecx, edx);

		has_sgx = ebx & (1 << 2);

		is_hybrid = edx & (1 << 15);

		fprintf(outf, "CPUID(7): %sSGX %sHybrid\n", has_sgx ? "" : "No-", is_hybrid ? "" : "No-");

		if (has_sgx)
			decode_feature_control_msr();
	}

	if (max_level >= 0x15) {
		unsigned int eax_crystal;
		unsigned int ebx_tsc;

		/*
		 * CPUID 15H TSC/Crystal ratio, possibly Crystal Hz
		 */
		eax_crystal = ebx_tsc = crystal_hz = edx = 0;
		__cpuid(0x15, eax_crystal, ebx_tsc, crystal_hz, edx);

		if (ebx_tsc != 0) {
			/*
			 * NOTE(review): 'ebx' here is stale (last written by the
			 * CPUID 0x80000000 or 0x7 calls above); 'ebx_tsc' was
			 * probably intended -- confirm against upstream history.
			 */
			if (!quiet && (ebx != 0))
				fprintf(outf, "CPUID(0x15): eax_crystal: %d ebx_tsc: %d ecx_crystal_hz: %d\n",
					eax_crystal, ebx_tsc, crystal_hz);

			/* Crystal Hz not enumerated: use the per-platform default */
			if (crystal_hz == 0)
				crystal_hz = platform->crystal_freq;

			if (crystal_hz) {
				tsc_hz = (unsigned long long)crystal_hz *ebx_tsc / eax_crystal;
				if (!quiet)
					fprintf(outf, "TSC: %lld MHz (%d Hz * %d / %d / 1000000)\n",
						tsc_hz / 1000000, crystal_hz, ebx_tsc, eax_crystal);
			}
		}
	}
	if (max_level >= 0x16) {
		unsigned int base_mhz, max_mhz, bus_mhz, edx;

		/*
		 * CPUID 16H Base MHz, Max MHz, Bus MHz
		 */
		base_mhz = max_mhz = bus_mhz = edx = 0;

		__cpuid(0x16, base_mhz, max_mhz, bus_mhz, edx);

		bclk = bus_mhz;

		base_hz = base_mhz * 1000000;
		has_base_hz = 1;

		if (platform->enable_tsc_tweak)
			tsc_tweak = base_hz / tsc_hz;

		if (!quiet)
			fprintf(outf, "CPUID(0x16): base_mhz: %d max_mhz: %d bus_mhz: %d\n",
				base_mhz, max_mhz, bus_mhz);
	}

	if (has_aperf)
		aperf_mperf_multiplier = platform->need_perf_multiplier ? 1024 : 1;

	/* These columns are always available */
	BIC_PRESENT(BIC_IRQ);
	BIC_PRESENT(BIC_TSC_MHz);
}
7430 
7431 static void counter_info_init(void)
7432 {
7433 	for (int i = 0; i < NUM_CSTATE_COUNTERS; ++i) {
7434 		struct cstate_counter_arch_info *const cai = &ccstate_counter_arch_infos[i];
7435 
7436 		if (platform->has_msr_knl_core_c6_residency && cai->msr == MSR_CORE_C6_RESIDENCY)
7437 			cai->msr = MSR_KNL_CORE_C6_RESIDENCY;
7438 
7439 		if (!platform->has_msr_core_c1_res && cai->msr == MSR_CORE_C1_RES)
7440 			cai->msr = 0;
7441 
7442 		if (platform->has_msr_atom_pkg_c6_residency && cai->msr == MSR_PKG_C6_RESIDENCY)
7443 			cai->msr = MSR_ATOM_PKG_C6_RESIDENCY;
7444 	}
7445 }
7446 
/* Run every power-management capability probe, in dependency order. */
void probe_pm_features(void)
{
	probe_pstates();

	probe_cstates();

	probe_lpi();

	probe_intel_uncore_frequency();

	probe_graphics();

	probe_rapl();

	probe_thermal();

	/* SMI counter lives in an MSR on NHM-class platforms */
	if (platform->has_nhm_msrs && !no_msr)
		BIC_PRESENT(BIC_SMI);

	if (!quiet)
		decode_misc_feature_control();
}
7469 
7470 /*
7471  * in /dev/cpu/ return success for names that are numbers
7472  * ie. filter out ".", "..", "microcode".
7473  */
7474 int dir_filter(const struct dirent *dirp)
7475 {
7476 	if (isdigit(dirp->d_name[0]))
7477 		return 1;
7478 	else
7479 		return 0;
7480 }
7481 
/*
 * Discover the CPU topology: count CPUs, build the present/effective/
 * allowed cpu sets, read per-CPU package/die/node/core/thread IDs, and
 * size the topo globals that the counter allocators depend on.
 */
void topology_probe(bool startup)
{
	int i;
	int max_core_id = 0;
	int max_package_id = 0;
	int max_siblings = 0;

	/* Initialize num_cpus, max_cpu_num */
	set_max_cpu_num();
	topo.num_cpus = 0;
	for_all_proc_cpus(count_cpus);
	if (!summary_only && topo.num_cpus > 1)
		BIC_PRESENT(BIC_CPU);

	if (debug > 1)
		fprintf(outf, "num_cpus %d max_cpu_num %d\n", topo.num_cpus, topo.max_cpu_num);

	cpus = calloc(1, (topo.max_cpu_num + 1) * sizeof(struct cpu_topology));
	if (cpus == NULL)
		err(1, "calloc cpus");

	/*
	 * Allocate and initialize cpu_present_set
	 */
	cpu_present_set = CPU_ALLOC((topo.max_cpu_num + 1));
	if (cpu_present_set == NULL)
		err(3, "CPU_ALLOC");
	cpu_present_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
	CPU_ZERO_S(cpu_present_setsize, cpu_present_set);
	for_all_proc_cpus(mark_cpu_present);

	/*
	 * Allocate and initialize cpu_effective_set
	 */
	cpu_effective_set = CPU_ALLOC((topo.max_cpu_num + 1));
	if (cpu_effective_set == NULL)
		err(3, "CPU_ALLOC");
	cpu_effective_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
	CPU_ZERO_S(cpu_effective_setsize, cpu_effective_set);
	update_effective_set(startup);

	/*
	 * Allocate and initialize cpu_allowed_set
	 */
	cpu_allowed_set = CPU_ALLOC((topo.max_cpu_num + 1));
	if (cpu_allowed_set == NULL)
		err(3, "CPU_ALLOC");
	cpu_allowed_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
	CPU_ZERO_S(cpu_allowed_setsize, cpu_allowed_set);

	/*
	 * Validate and update cpu_allowed_set.
	 *
	 * Make sure all cpus in cpu_subset are also in cpu_present_set during startup.
	 * Give a warning when cpus in cpu_subset become unavailable at runtime.
	 * Give a warning when cpus are not effective because of cgroup setting.
	 *
	 * cpu_allowed_set is the intersection of cpu_present_set/cpu_effective_set/cpu_subset.
	 */
	for (i = 0; i < CPU_SUBSET_MAXCPUS; ++i) {
		if (cpu_subset && !CPU_ISSET_S(i, cpu_subset_size, cpu_subset))
			continue;

		if (!CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) {
			if (cpu_subset) {
				/* cpus in cpu_subset must be in cpu_present_set during startup */
				if (startup)
					err(1, "cpu%d not present", i);
				else
					fprintf(stderr, "cpu%d not present\n", i);
			}
			continue;
		}

		/* An empty effective set means no cgroup restriction applies */
		if (CPU_COUNT_S(cpu_effective_setsize, cpu_effective_set)) {
			if (!CPU_ISSET_S(i, cpu_effective_setsize, cpu_effective_set)) {
				fprintf(stderr, "cpu%d not effective\n", i);
				continue;
			}
		}

		CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set);
	}

	if (!CPU_COUNT_S(cpu_allowed_setsize, cpu_allowed_set))
		err(-ENODEV, "No valid cpus found");
	/* Restrict this process to the allowed cpus */
	sched_setaffinity(0, cpu_allowed_setsize, cpu_allowed_set);

	/*
	 * Allocate and initialize cpu_affinity_set
	 */
	cpu_affinity_set = CPU_ALLOC((topo.max_cpu_num + 1));
	if (cpu_affinity_set == NULL)
		err(3, "CPU_ALLOC");
	cpu_affinity_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
	CPU_ZERO_S(cpu_affinity_setsize, cpu_affinity_set);

	for_all_proc_cpus(init_thread_id);

	/*
	 * For online cpus
	 * find max_core_id, max_package_id
	 */
	for (i = 0; i <= topo.max_cpu_num; ++i) {
		int siblings;

		if (cpu_is_not_present(i)) {
			if (debug > 1)
				fprintf(outf, "cpu%d NOT PRESENT\n", i);
			continue;
		}

		cpus[i].logical_cpu_id = i;

		/* get package information */
		cpus[i].physical_package_id = get_physical_package_id(i);
		if (cpus[i].physical_package_id > max_package_id)
			max_package_id = cpus[i].physical_package_id;

		/* get die information */
		cpus[i].die_id = get_die_id(i);
		if (cpus[i].die_id > topo.max_die_id)
			topo.max_die_id = cpus[i].die_id;

		/* get numa node information */
		cpus[i].physical_node_id = get_physical_node_id(&cpus[i]);
		if (cpus[i].physical_node_id > topo.max_node_num)
			topo.max_node_num = cpus[i].physical_node_id;

		/* get core information */
		cpus[i].physical_core_id = get_core_id(i);
		if (cpus[i].physical_core_id > max_core_id)
			max_core_id = cpus[i].physical_core_id;

		/* get thread information */
		siblings = get_thread_siblings(&cpus[i]);
		if (siblings > max_siblings)
			max_siblings = siblings;
		if (cpus[i].thread_id == 0)
			topo.num_cores++;
	}
	topo.max_core_id = max_core_id;
	topo.max_package_id = max_package_id;

	topo.cores_per_node = max_core_id + 1;
	if (debug > 1)
		fprintf(outf, "max_core_id %d, sizing for %d cores per package\n", max_core_id, topo.cores_per_node);
	if (!summary_only && topo.cores_per_node > 1)
		BIC_PRESENT(BIC_Core);

	topo.num_die = topo.max_die_id + 1;
	if (debug > 1)
		fprintf(outf, "max_die_id %d, sizing for %d die\n", topo.max_die_id, topo.num_die);
	if (!summary_only && topo.num_die > 1)
		BIC_PRESENT(BIC_Die);

	topo.num_packages = max_package_id + 1;
	if (debug > 1)
		fprintf(outf, "max_package_id %d, sizing for %d packages\n", max_package_id, topo.num_packages);
	if (!summary_only && topo.num_packages > 1)
		BIC_PRESENT(BIC_Package);

	set_node_data();
	if (debug > 1)
		fprintf(outf, "nodes_per_pkg %d\n", topo.nodes_per_pkg);
	if (!summary_only && topo.nodes_per_pkg > 1)
		BIC_PRESENT(BIC_Node);

	topo.threads_per_core = max_siblings;
	if (debug > 1)
		fprintf(outf, "max_siblings %d\n", max_siblings);

	if (debug < 1)
		return;

	/* Debug dump of the full per-CPU topology */
	for (i = 0; i <= topo.max_cpu_num; ++i) {
		if (cpu_is_not_present(i))
			continue;
		fprintf(outf,
			"cpu %d pkg %d die %d node %d lnode %d core %d thread %d\n",
			i, cpus[i].physical_package_id, cpus[i].die_id,
			cpus[i].physical_node_id, cpus[i].logical_node_id, cpus[i].physical_core_id, cpus[i].thread_id);
	}

}
7667 
7668 void allocate_counters(struct thread_data **t, struct core_data **c, struct pkg_data **p)
7669 {
7670 	int i;
7671 	int num_cores = topo.cores_per_node * topo.nodes_per_pkg * topo.num_packages;
7672 	int num_threads = topo.threads_per_core * num_cores;
7673 
7674 	*t = calloc(num_threads, sizeof(struct thread_data));
7675 	if (*t == NULL)
7676 		goto error;
7677 
7678 	for (i = 0; i < num_threads; i++)
7679 		(*t)[i].cpu_id = -1;
7680 
7681 	*c = calloc(num_cores, sizeof(struct core_data));
7682 	if (*c == NULL)
7683 		goto error;
7684 
7685 	for (i = 0; i < num_cores; i++) {
7686 		(*c)[i].core_id = -1;
7687 		(*c)[i].base_cpu = -1;
7688 	}
7689 
7690 	*p = calloc(topo.num_packages, sizeof(struct pkg_data));
7691 	if (*p == NULL)
7692 		goto error;
7693 
7694 	for (i = 0; i < topo.num_packages; i++) {
7695 		(*p)[i].package_id = i;
7696 		(*p)[i].base_cpu = -1;
7697 	}
7698 
7699 	return;
7700 error:
7701 	err(1, "calloc counters");
7702 }
7703 
7704 /*
7705  * init_counter()
7706  *
7707  * set FIRST_THREAD_IN_CORE and FIRST_CORE_IN_PACKAGE
7708  */
7709 void init_counter(struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base, int cpu_id)
7710 {
7711 	int pkg_id = cpus[cpu_id].physical_package_id;
7712 	int node_id = cpus[cpu_id].logical_node_id;
7713 	int core_id = cpus[cpu_id].physical_core_id;
7714 	int thread_id = cpus[cpu_id].thread_id;
7715 	struct thread_data *t;
7716 	struct core_data *c;
7717 	struct pkg_data *p;
7718 
7719 	/* Workaround for systems where physical_node_id==-1
7720 	 * and logical_node_id==(-1 - topo.num_cpus)
7721 	 */
7722 	if (node_id < 0)
7723 		node_id = 0;
7724 
7725 	t = GET_THREAD(thread_base, thread_id, core_id, node_id, pkg_id);
7726 	c = GET_CORE(core_base, core_id, node_id, pkg_id);
7727 	p = GET_PKG(pkg_base, pkg_id);
7728 
7729 	t->cpu_id = cpu_id;
7730 	if (!cpu_is_not_allowed(cpu_id)) {
7731 		if (c->base_cpu < 0)
7732 			c->base_cpu = t->cpu_id;
7733 		if (p->base_cpu < 0)
7734 			p->base_cpu = t->cpu_id;
7735 	}
7736 
7737 	c->core_id = core_id;
7738 	p->package_id = pkg_id;
7739 }
7740 
7741 int initialize_counters(int cpu_id)
7742 {
7743 	init_counter(EVEN_COUNTERS, cpu_id);
7744 	init_counter(ODD_COUNTERS, cpu_id);
7745 	return 0;
7746 }
7747 
7748 void allocate_output_buffer()
7749 {
7750 	output_buffer = calloc(1, (1 + topo.num_cpus) * 2048);
7751 	outp = output_buffer;
7752 	if (outp == NULL)
7753 		err(-1, "calloc output buffer");
7754 }
7755 
7756 void allocate_fd_percpu(void)
7757 {
7758 	fd_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
7759 	if (fd_percpu == NULL)
7760 		err(-1, "calloc fd_percpu");
7761 }
7762 
7763 void allocate_irq_buffers(void)
7764 {
7765 	irq_column_2_cpu = calloc(topo.num_cpus, sizeof(int));
7766 	if (irq_column_2_cpu == NULL)
7767 		err(-1, "calloc %d", topo.num_cpus);
7768 
7769 	irqs_per_cpu = calloc(topo.max_cpu_num + 1, sizeof(int));
7770 	if (irqs_per_cpu == NULL)
7771 		err(-1, "calloc %d", topo.max_cpu_num + 1);
7772 }
7773 
7774 int update_topo(struct thread_data *t, struct core_data *c, struct pkg_data *p)
7775 {
7776 	topo.allowed_cpus++;
7777 	if ((int)t->cpu_id == c->base_cpu)
7778 		topo.allowed_cores++;
7779 	if ((int)t->cpu_id == p->base_cpu)
7780 		topo.allowed_packages++;
7781 
7782 	return 0;
7783 }
7784 
7785 void topology_update(void)
7786 {
7787 	topo.allowed_cpus = 0;
7788 	topo.allowed_cores = 0;
7789 	topo.allowed_packages = 0;
7790 	for_all_cpus(update_topo, ODD_COUNTERS);
7791 }
7792 
/*
 * Probe topology and allocate every buffer sized from it, in order:
 * topology first (sets topo.*), then the per-CPU/counter buffers.
 */
void setup_all_buffers(bool startup)
{
	topology_probe(startup);
	allocate_irq_buffers();
	allocate_fd_percpu();
	allocate_counters(&thread_even, &core_even, &package_even);
	allocate_counters(&thread_odd, &core_odd, &package_odd);
	allocate_output_buffer();
	for_all_proc_cpus(initialize_counters);
	topology_update();
}
7804 
7805 void set_base_cpu(void)
7806 {
7807 	int i;
7808 
7809 	for (i = 0; i < topo.max_cpu_num + 1; ++i) {
7810 		if (cpu_is_not_allowed(i))
7811 			continue;
7812 		base_cpu = i;
7813 		if (debug > 1)
7814 			fprintf(outf, "base_cpu = %d\n", base_cpu);
7815 		return;
7816 	}
7817 	err(-ENODEV, "No valid cpus found");
7818 }
7819 
7820 static void set_amperf_source(void)
7821 {
7822 	amperf_source = AMPERF_SOURCE_PERF;
7823 
7824 	const bool aperf_required = is_aperf_access_required();
7825 
7826 	if (no_perf || !aperf_required || !has_amperf_access_via_perf())
7827 		amperf_source = AMPERF_SOURCE_MSR;
7828 
7829 	if (quiet || !debug)
7830 		return;
7831 
7832 	fprintf(outf, "aperf/mperf source preference: %s\n", amperf_source == AMPERF_SOURCE_MSR ? "msr" : "perf");
7833 }
7834 
7835 bool has_added_counters(void)
7836 {
7837 	/*
7838 	 * It only makes sense to call this after the command line is parsed,
7839 	 * otherwise sys structure is not populated.
7840 	 */
7841 
7842 	return sys.added_core_counters | sys.added_thread_counters | sys.added_package_counters;
7843 }
7844 
7845 bool is_msr_access_required(void)
7846 {
7847 	if (no_msr)
7848 		return false;
7849 
7850 	if (has_added_counters())
7851 		return true;
7852 
7853 	return BIC_IS_ENABLED(BIC_SMI)
7854 	    || BIC_IS_ENABLED(BIC_CPU_c1)
7855 	    || BIC_IS_ENABLED(BIC_CPU_c3)
7856 	    || BIC_IS_ENABLED(BIC_CPU_c6)
7857 	    || BIC_IS_ENABLED(BIC_CPU_c7)
7858 	    || BIC_IS_ENABLED(BIC_Mod_c6)
7859 	    || BIC_IS_ENABLED(BIC_CoreTmp)
7860 	    || BIC_IS_ENABLED(BIC_Totl_c0)
7861 	    || BIC_IS_ENABLED(BIC_Any_c0)
7862 	    || BIC_IS_ENABLED(BIC_GFX_c0)
7863 	    || BIC_IS_ENABLED(BIC_CPUGFX)
7864 	    || BIC_IS_ENABLED(BIC_Pkgpc3)
7865 	    || BIC_IS_ENABLED(BIC_Pkgpc6)
7866 	    || BIC_IS_ENABLED(BIC_Pkgpc2)
7867 	    || BIC_IS_ENABLED(BIC_Pkgpc7)
7868 	    || BIC_IS_ENABLED(BIC_Pkgpc8)
7869 	    || BIC_IS_ENABLED(BIC_Pkgpc9)
7870 	    || BIC_IS_ENABLED(BIC_Pkgpc10)
7871 	    /* TODO: Multiplex access with perf */
7872 	    || BIC_IS_ENABLED(BIC_CorWatt)
7873 	    || BIC_IS_ENABLED(BIC_Cor_J)
7874 	    || BIC_IS_ENABLED(BIC_PkgWatt)
7875 	    || BIC_IS_ENABLED(BIC_CorWatt)
7876 	    || BIC_IS_ENABLED(BIC_GFXWatt)
7877 	    || BIC_IS_ENABLED(BIC_RAMWatt)
7878 	    || BIC_IS_ENABLED(BIC_Pkg_J)
7879 	    || BIC_IS_ENABLED(BIC_Cor_J)
7880 	    || BIC_IS_ENABLED(BIC_GFX_J)
7881 	    || BIC_IS_ENABLED(BIC_RAM_J)
7882 	    || BIC_IS_ENABLED(BIC_PKG__)
7883 	    || BIC_IS_ENABLED(BIC_RAM__)
7884 	    || BIC_IS_ENABLED(BIC_PkgTmp)
7885 	    || (is_aperf_access_required() && !has_amperf_access_via_perf());
7886 }
7887 
void check_msr_access(void)
{
	/*
	 * If nothing that is enabled needs MSRs, voluntarily switch to
	 * --no-msr mode so we don't require /dev/cpu/CPU/msr access.
	 */
	if (!is_msr_access_required())
		no_msr = 1;

	/* probe the MSR device node and our permission to read it */
	check_dev_msr();
	check_msr_permission();

	/* with no MSR access, disable every column that needs it */
	if (no_msr)
		bic_disable_msr_access();
}
7899 
7900 void check_perf_access(void)
7901 {
7902 	const bool intrcount_required = BIC_IS_ENABLED(BIC_IPC);
7903 
7904 	if (no_perf || !intrcount_required || !has_instr_count_access())
7905 		bic_enabled &= ~BIC_IPC;
7906 
7907 	const bool aperf_required = is_aperf_access_required();
7908 
7909 	if (!aperf_required || !has_amperf_access()) {
7910 		bic_enabled &= ~BIC_Avg_MHz;
7911 		bic_enabled &= ~BIC_Busy;
7912 		bic_enabled &= ~BIC_Bzy_MHz;
7913 		bic_enabled &= ~BIC_IPC;
7914 	}
7915 }
7916 
void turbostat_init()
{
	/*
	 * One-time initialization, in dependency order: topology buffers
	 * first, then access checks, CPU identification, and per-subsystem
	 * perf setup.  Order matters; do not reorder these calls.
	 */
	setup_all_buffers(true);
	set_base_cpu();
	check_msr_access();
	check_perf_access();
	process_cpuid();
	counter_info_init();
	probe_pm_features();
	set_amperf_source();
	linux_perf_init();
	rapl_perf_init();
	cstate_perf_init();

	/* record CPU type into both counter snapshot sets */
	for_all_cpus(get_cpu_type, ODD_COUNTERS);
	for_all_cpus(get_cpu_type, EVEN_COUNTERS);

	/* pre-open the instruction-count fd on the base CPU for IPC */
	if (DO_BIC(BIC_IPC))
		(void)get_instr_count_fd(base_cpu);

	/*
	 * If TSC tweak is needed, but couldn't get it,
	 * disable more BICs, since it can't be reported accurately.
	 */
	if (platform->enable_tsc_tweak && !has_base_hz) {
		bic_enabled &= ~BIC_Busy;
		bic_enabled &= ~BIC_Bzy_MHz;
	}
}
7946 
/*
 * Take a counter snapshot, fork/exec the given command, wait for it,
 * take a second snapshot, and print the delta over the command's run.
 * Returns the child's exit status (or the raw wait status if the child
 * did not exit normally).
 */
int fork_it(char **argv)
{
	pid_t child_pid;
	int status;

	snapshot_proc_sysfs_files();
	status = for_all_cpus(get_counters, EVEN_COUNTERS);
	first_counter_read = 0;
	if (status)
		exit(status);
	gettimeofday(&tv_even, (struct timezone *)NULL);

	child_pid = fork();
	if (!child_pid) {
		/* child */
		execvp(argv[0], argv);
		err(errno, "exec %s", argv[0]);
	} else {

		/* parent */
		if (child_pid == -1)
			err(1, "fork");

		/* ignore interrupts meant for the child while we wait */
		signal(SIGINT, SIG_IGN);
		signal(SIGQUIT, SIG_IGN);
		if (waitpid(child_pid, &status, 0) == -1)
			err(status, "waitpid");

		if (WIFEXITED(status))
			status = WEXITSTATUS(status);
	}
	/*
	 * n.b. fork_it() does not check for errors from for_all_cpus()
	 * because re-starting is problematic when forking
	 */
	snapshot_proc_sysfs_files();
	for_all_cpus(get_counters, ODD_COUNTERS);
	gettimeofday(&tv_odd, (struct timezone *)NULL);
	timersub(&tv_odd, &tv_even, &tv_delta);
	if (for_all_cpus_2(delta_cpu, ODD_COUNTERS, EVEN_COUNTERS))
		fprintf(outf, "%s: Counter reset detected\n", progname);
	else {
		compute_average(EVEN_COUNTERS);
		format_all_counters(EVEN_COUNTERS);
	}

	/* report elapsed wall-clock time of the child */
	fprintf(outf, "%.6f sec\n", tv_delta.tv_sec + tv_delta.tv_usec / 1000000.0);

	flush_output_stderr();

	return status;
}
7999 
8000 int get_and_dump_counters(void)
8001 {
8002 	int status;
8003 
8004 	snapshot_proc_sysfs_files();
8005 	status = for_all_cpus(get_counters, ODD_COUNTERS);
8006 	if (status)
8007 		return status;
8008 
8009 	status = for_all_cpus(dump_counters, ODD_COUNTERS);
8010 	if (status)
8011 		return status;
8012 
8013 	flush_output_stdout();
8014 
8015 	return status;
8016 }
8017 
/* Print the version banner to the current output stream. */
void print_version()
{
	fprintf(outf, "turbostat version 2024.05.10 - Len Brown <lenb@kernel.org>\n");
}
8022 
8023 #define COMMAND_LINE_SIZE 2048
8024 
8025 void print_bootcmd(void)
8026 {
8027 	char bootcmd[COMMAND_LINE_SIZE];
8028 	FILE *fp;
8029 	int ret;
8030 
8031 	memset(bootcmd, 0, COMMAND_LINE_SIZE);
8032 	fp = fopen("/proc/cmdline", "r");
8033 	if (!fp)
8034 		return;
8035 
8036 	ret = fread(bootcmd, sizeof(char), COMMAND_LINE_SIZE - 1, fp);
8037 	if (ret) {
8038 		bootcmd[ret] = '\0';
8039 		/* the last character is already '\n' */
8040 		fprintf(outf, "Kernel command line: %s", bootcmd);
8041 	}
8042 
8043 	fclose(fp);
8044 }
8045 
8046 struct msr_counter *find_msrp_by_name(struct msr_counter *head, char *name)
8047 {
8048 	struct msr_counter *mp;
8049 
8050 	for (mp = head; mp; mp = mp->next) {
8051 		if (debug)
8052 			printf("%s: %s %s\n", __func__, name, mp->name);
8053 		if (!strncmp(name, mp->name, strlen(mp->name)))
8054 			return mp;
8055 	}
8056 	return NULL;
8057 }
8058 
/*
 * Register a user-requested counter (from --add or sysfs probing).
 * A counter is identified by name within its scope list (thread, core,
 * or package); an existing entry of the same name is reused, and any
 * sysfs path supplied is chained onto that entry.
 * Returns 0 on success, -1 when the counter is rejected.
 */
int add_counter(unsigned int msr_num, char *path, char *name,
		unsigned int width, enum counter_scope scope,
		enum counter_type type, enum counter_format format, int flags, int id)
{
	struct msr_counter *msrp;

	if (no_msr && msr_num)
		errx(1, "Requested MSR counter 0x%x, but in --no-msr mode", msr_num);

	if (debug)
		printf("%s(msr%d, %s, %s, width%d, scope%d, type%d, format%d, flags%x, id%d)\n", __func__, msr_num,
		       path, name, width, scope, type, format, flags, id);

	/* look up (or reject over-limit) in the per-scope list */
	switch (scope) {

	case SCOPE_CPU:
		msrp = find_msrp_by_name(sys.tp, name);
		if (msrp) {
			if (debug)
				printf("%s: %s FOUND\n", __func__, name);
			break;
		}
		if (sys.added_thread_counters++ >= MAX_ADDED_THREAD_COUNTERS) {
			warnx("ignoring thread counter %s", name);
			return -1;
		}
		break;
	case SCOPE_CORE:
		msrp = find_msrp_by_name(sys.cp, name);
		if (msrp) {
			if (debug)
				printf("%s: %s FOUND\n", __func__, name);
			break;
		}
		if (sys.added_core_counters++ >= MAX_ADDED_CORE_COUNTERS) {
			warnx("ignoring core counter %s", name);
			return -1;
		}
		break;
	case SCOPE_PACKAGE:
		msrp = find_msrp_by_name(sys.pp, name);
		if (msrp) {
			if (debug)
				printf("%s: %s FOUND\n", __func__, name);
			break;
		}
		if (sys.added_package_counters++ >= MAX_ADDED_PACKAGE_COUNTERS) {
			warnx("ignoring package counter %s", name);
			return -1;
		}
		break;
	default:
		warnx("ignoring counter %s with unknown scope", name);
		return -1;
	}

	/* not found above: allocate and push onto the matching list head */
	if (msrp == NULL) {
		msrp = calloc(1, sizeof(struct msr_counter));
		if (msrp == NULL)
			err(-1, "calloc msr_counter");
		msrp->msr_num = msr_num;
		/* calloc zeroed the buffer, so the copy stays NUL-terminated */
		strncpy(msrp->name, name, NAME_BYTES - 1);
		msrp->width = width;
		msrp->type = type;
		msrp->format = format;
		msrp->flags = flags;

		switch (scope) {
		case SCOPE_CPU:
			msrp->next = sys.tp;
			sys.tp = msrp;
			break;
		case SCOPE_CORE:
			msrp->next = sys.cp;
			sys.cp = msrp;
			break;
		case SCOPE_PACKAGE:
			msrp->next = sys.pp;
			sys.pp = msrp;
			break;
		}
	}

	/* chain the sysfs path (new or existing counter alike) */
	if (path) {
		struct sysfs_path *sp;

		sp = calloc(1, sizeof(struct sysfs_path));
		if (sp == NULL) {
			perror("calloc");
			exit(1);
		}
		strncpy(sp->path, path, PATH_BYTES - 1);
		sp->id = id;
		sp->next = msrp->sp;
		msrp->sp = sp;
	}

	return 0;
}
8158 
/*
 * Parse one --add argument: a comma-separated list of keywords such as
 * "msr0x10,u64,cpu,delta,MY_NAME" or "/sys/...path".  Unrecognized
 * tokens fall through to become the column name.  Exits via help() on
 * malformed input.
 */
void parse_add_command(char *add_command)
{
	int msr_num = 0;
	char *path = NULL;
	char name_buffer[NAME_BYTES] = "";
	int width = 64;
	int fail = 0;
	enum counter_scope scope = SCOPE_CPU;
	enum counter_type type = COUNTER_CYCLES;
	enum counter_format format = FORMAT_DELTA;

	while (add_command) {

		/* "msr0x%x" must be tried before "msr%d", else 0x10 parses as 0 */
		if (sscanf(add_command, "msr0x%x", &msr_num) == 1)
			goto next;

		if (sscanf(add_command, "msr%d", &msr_num) == 1)
			goto next;

		/* a leading '/' means this token is a sysfs path */
		if (*add_command == '/') {
			path = add_command;
			goto next;
		}

		/* only u32 and u64 widths are accepted; others reset to 64 */
		if (sscanf(add_command, "u%d", &width) == 1) {
			if ((width == 32) || (width == 64))
				goto next;
			width = 64;
		}
		if (!strncmp(add_command, "cpu", strlen("cpu"))) {
			scope = SCOPE_CPU;
			goto next;
		}
		if (!strncmp(add_command, "core", strlen("core"))) {
			scope = SCOPE_CORE;
			goto next;
		}
		if (!strncmp(add_command, "package", strlen("package"))) {
			scope = SCOPE_PACKAGE;
			goto next;
		}
		if (!strncmp(add_command, "cycles", strlen("cycles"))) {
			type = COUNTER_CYCLES;
			goto next;
		}
		if (!strncmp(add_command, "seconds", strlen("seconds"))) {
			type = COUNTER_SECONDS;
			goto next;
		}
		if (!strncmp(add_command, "usec", strlen("usec"))) {
			type = COUNTER_USEC;
			goto next;
		}
		if (!strncmp(add_command, "raw", strlen("raw"))) {
			format = FORMAT_RAW;
			goto next;
		}
		if (!strncmp(add_command, "delta", strlen("delta"))) {
			format = FORMAT_DELTA;
			goto next;
		}
		if (!strncmp(add_command, "percent", strlen("percent"))) {
			format = FORMAT_PERCENT;
			goto next;
		}

		/* anything else becomes the column name (bounded copy) */
		if (sscanf(add_command, "%18s,%*s", name_buffer) == 1) {	/* 18 < NAME_BYTES */
			char *eos;

			eos = strchr(name_buffer, ',');
			if (eos)
				*eos = '\0';
			goto next;
		}

next:
		/* advance to the token after the next comma */
		add_command = strchr(add_command, ',');
		if (add_command) {
			*add_command = '\0';
			add_command++;
		}

	}
	if ((msr_num == 0) && (path == NULL)) {
		fprintf(stderr, "--add: (msrDDD | msr0xXXX | /path_to_counter ) required\n");
		fail++;
	}

	/* generate default column header */
	if (*name_buffer == '\0') {
		if (width == 32)
			sprintf(name_buffer, "M0x%x%s", msr_num, format == FORMAT_PERCENT ? "%" : "");
		else
			sprintf(name_buffer, "M0X%x%s", msr_num, format == FORMAT_PERCENT ? "%" : "");
	}

	if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0, 0))
		fail++;

	if (fail) {
		help();
		exit(1);
	}
}
8263 
8264 int is_deferred_add(char *name)
8265 {
8266 	int i;
8267 
8268 	for (i = 0; i < deferred_add_index; ++i)
8269 		if (!strcmp(name, deferred_add_names[i]))
8270 			return 1;
8271 	return 0;
8272 }
8273 
8274 int is_deferred_skip(char *name)
8275 {
8276 	int i;
8277 
8278 	for (i = 0; i < deferred_skip_index; ++i)
8279 		if (!strcmp(name, deferred_skip_names[i]))
8280 			return 1;
8281 	return 0;
8282 }
8283 
8284 void probe_sysfs(void)
8285 {
8286 	char path[64];
8287 	char name_buf[16];
8288 	FILE *input;
8289 	int state;
8290 	char *sp;
8291 
8292 	for (state = 10; state >= 0; --state) {
8293 
8294 		sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state);
8295 		input = fopen(path, "r");
8296 		if (input == NULL)
8297 			continue;
8298 		if (!fgets(name_buf, sizeof(name_buf), input))
8299 			err(1, "%s: failed to read file", path);
8300 
8301 		/* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */
8302 		sp = strchr(name_buf, '-');
8303 		if (!sp)
8304 			sp = strchrnul(name_buf, '\n');
8305 		*sp = '%';
8306 		*(sp + 1) = '\0';
8307 
8308 		remove_underbar(name_buf);
8309 
8310 		fclose(input);
8311 
8312 		sprintf(path, "cpuidle/state%d/time", state);
8313 
8314 		if (!DO_BIC(BIC_sysfs) && !is_deferred_add(name_buf))
8315 			continue;
8316 
8317 		if (is_deferred_skip(name_buf))
8318 			continue;
8319 
8320 		add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_USEC, FORMAT_PERCENT, SYSFS_PERCPU, 0);
8321 	}
8322 
8323 	for (state = 10; state >= 0; --state) {
8324 
8325 		sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state);
8326 		input = fopen(path, "r");
8327 		if (input == NULL)
8328 			continue;
8329 		if (!fgets(name_buf, sizeof(name_buf), input))
8330 			err(1, "%s: failed to read file", path);
8331 		/* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */
8332 		sp = strchr(name_buf, '-');
8333 		if (!sp)
8334 			sp = strchrnul(name_buf, '\n');
8335 		*sp = '\0';
8336 		fclose(input);
8337 
8338 		remove_underbar(name_buf);
8339 
8340 		sprintf(path, "cpuidle/state%d/usage", state);
8341 
8342 		if (!DO_BIC(BIC_sysfs) && !is_deferred_add(name_buf))
8343 			continue;
8344 
8345 		if (is_deferred_skip(name_buf))
8346 			continue;
8347 
8348 		add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_ITEMS, FORMAT_DELTA, SYSFS_PERCPU, 0);
8349 	}
8350 
8351 }
8352 
8353 /*
8354  * parse cpuset with following syntax
8355  * 1,2,4..6,8-10 and set bits in cpu_subset
8356  */
8357 void parse_cpu_command(char *optarg)
8358 {
8359 	if (!strcmp(optarg, "core")) {
8360 		if (cpu_subset)
8361 			goto error;
8362 		show_core_only++;
8363 		return;
8364 	}
8365 	if (!strcmp(optarg, "package")) {
8366 		if (cpu_subset)
8367 			goto error;
8368 		show_pkg_only++;
8369 		return;
8370 	}
8371 	if (show_core_only || show_pkg_only)
8372 		goto error;
8373 
8374 	cpu_subset = CPU_ALLOC(CPU_SUBSET_MAXCPUS);
8375 	if (cpu_subset == NULL)
8376 		err(3, "CPU_ALLOC");
8377 	cpu_subset_size = CPU_ALLOC_SIZE(CPU_SUBSET_MAXCPUS);
8378 
8379 	CPU_ZERO_S(cpu_subset_size, cpu_subset);
8380 
8381 	if (parse_cpu_str(optarg, cpu_subset, cpu_subset_size))
8382 		goto error;
8383 
8384 	return;
8385 
8386 error:
8387 	fprintf(stderr, "\"--cpu %s\" malformed\n", optarg);
8388 	help();
8389 	exit(-1);
8390 }
8391 
/*
 * Parse command-line options.  Runs getopt twice: a first pass handles
 * only --no-msr/--no-perf (they invalidate other options such as --add
 * with an MSR), then the scan is restarted for everything else.
 */
void cmdline(int argc, char **argv)
{
	int opt;
	int option_index = 0;
	static struct option long_options[] = {
		{ "add", required_argument, 0, 'a' },
		{ "cpu", required_argument, 0, 'c' },
		{ "Dump", no_argument, 0, 'D' },
		{ "debug", no_argument, 0, 'd' },	/* internal, not documented */
		{ "enable", required_argument, 0, 'e' },
		{ "interval", required_argument, 0, 'i' },
		{ "IPC", no_argument, 0, 'I' },
		{ "num_iterations", required_argument, 0, 'n' },
		{ "header_iterations", required_argument, 0, 'N' },
		{ "help", no_argument, 0, 'h' },
		{ "hide", required_argument, 0, 'H' },	// meh, -h taken by --help
		{ "Joules", no_argument, 0, 'J' },
		{ "list", no_argument, 0, 'l' },
		{ "out", required_argument, 0, 'o' },
		{ "quiet", no_argument, 0, 'q' },
		{ "no-msr", no_argument, 0, 'M' },
		{ "no-perf", no_argument, 0, 'P' },
		{ "show", required_argument, 0, 's' },
		{ "Summary", no_argument, 0, 'S' },
		{ "TCC", required_argument, 0, 'T' },
		{ "version", no_argument, 0, 'v' },
		{ 0, 0, 0, 0 }
	};

	progname = argv[0];

	/*
	 * Parse some options early, because they may make other options invalid,
	 * like adding the MSR counter with --add and at the same time using --no-msr.
	 */
	while ((opt = getopt_long_only(argc, argv, "MP", long_options, &option_index)) != -1) {
		switch (opt) {
		case 'M':
			no_msr = 1;
			break;
		case 'P':
			no_perf = 1;
			break;
		default:
			break;
		}
	}
	/* restart option scanning for the full pass (glibc: optind = 0) */
	optind = 0;

	while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qMST:v", long_options, &option_index)) != -1) {
		switch (opt) {
		case 'a':
			parse_add_command(optarg);
			break;
		case 'c':
			parse_cpu_command(optarg);
			break;
		case 'D':
			dump_only++;
			break;
		case 'e':
			/* --enable specified counter */
			bic_enabled = bic_enabled | bic_lookup(optarg, SHOW_LIST);
			break;
		case 'd':
			debug++;
			ENABLE_BIC(BIC_DISABLED_BY_DEFAULT);
			break;
		case 'H':
			/*
			 * --hide: do not show those specified
			 *  multiple invocations simply clear more bits in enabled mask
			 */
			bic_enabled &= ~bic_lookup(optarg, HIDE_LIST);
			break;
		case 'h':
		default:
			help();
			exit(1);
		case 'i':
			{
				double interval = strtod(optarg, NULL);

				if (interval < 0.001) {
					fprintf(outf, "interval %f seconds is too small\n", interval);
					exit(2);
				}

				/* split the fractional interval into sec + usec/nsec */
				interval_tv.tv_sec = interval_ts.tv_sec = interval;
				interval_tv.tv_usec = (interval - interval_tv.tv_sec) * 1000000;
				interval_ts.tv_nsec = (interval - interval_ts.tv_sec) * 1000000000;
			}
			break;
		case 'J':
			rapl_joules++;
			break;
		case 'l':
			ENABLE_BIC(BIC_DISABLED_BY_DEFAULT);
			list_header_only++;
			quiet++;
			break;
		case 'o':
			outf = fopen_or_die(optarg, "w");
			break;
		case 'q':
			quiet = 1;
			break;
		case 'M':
		case 'P':
			/* Parsed earlier */
			break;
		case 'n':
			num_iterations = strtod(optarg, NULL);

			if (num_iterations <= 0) {
				fprintf(outf, "iterations %d should be positive number\n", num_iterations);
				exit(2);
			}
			break;
		case 'N':
			header_iterations = strtod(optarg, NULL);

			if (header_iterations <= 0) {
				fprintf(outf, "iterations %d should be positive number\n", header_iterations);
				exit(2);
			}
			break;
		case 's':
			/*
			 * --show: show only those specified
			 *  The 1st invocation will clear and replace the enabled mask
			 *  subsequent invocations can add to it.
			 */
			if (shown == 0)
				bic_enabled = bic_lookup(optarg, SHOW_LIST);
			else
				bic_enabled |= bic_lookup(optarg, SHOW_LIST);
			shown = 1;
			break;
		case 'S':
			summary_only++;
			break;
		case 'T':
			tj_max_override = atoi(optarg);
			break;
		case 'v':
			print_version();
			exit(0);
			break;
		}
	}
}
8544 
8545 void set_rlimit(void)
8546 {
8547 	struct rlimit limit;
8548 
8549 	if (getrlimit(RLIMIT_NOFILE, &limit) < 0)
8550 		err(1, "Failed to get rlimit");
8551 
8552 	if (limit.rlim_max < MAX_NOFILE)
8553 		limit.rlim_max = MAX_NOFILE;
8554 	if (limit.rlim_cur < MAX_NOFILE)
8555 		limit.rlim_cur = MAX_NOFILE;
8556 
8557 	if (setrlimit(RLIMIT_NOFILE, &limit) < 0)
8558 		err(1, "Failed to set rlimit");
8559 }
8560 
int main(int argc, char **argv)
{
	int fd, ret;

	/* best-effort: move ourselves into the root cgroup */
	fd = open("/sys/fs/cgroup/cgroup.procs", O_WRONLY);
	if (fd < 0)
		goto skip_cgroup_setting;

	ret = write(fd, "0\n", 2);
	if (ret == -1)
		perror("Can't update cgroup\n");

	close(fd);

skip_cgroup_setting:
	outf = stderr;
	cmdline(argc, argv);

	if (!quiet) {
		print_version();
		print_bootcmd();
	}

	probe_sysfs();

	/* raising rlimits requires privilege; only attempt as root */
	if (!getuid())
		set_rlimit();

	turbostat_init();

	if (!no_msr)
		msr_sum_record();

	/* dump counters and exit */
	if (dump_only)
		return get_and_dump_counters();

	/* list header and exit */
	if (list_header_only) {
		print_header(",");
		flush_output_stdout();
		return 0;
	}

	/*
	 * if any params left, it must be a command to fork
	 */
	if (argc - optind)
		return fork_it(argv + optind);
	else
		turbostat_loop();

	return 0;
}
8615