xref: /linux/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json (revision a1ff5a7d78a036d6c2178ee5acd6ba4946243800)
156de5b63SAndi Kleen[
256de5b63SAndi Kleen    {
3100ee7c3SIan Rogers        "BriefDescription": "C2 residency percent per package",
4100ee7c3SIan Rogers        "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC",
556de5b63SAndi Kleen        "MetricGroup": "Power",
6100ee7c3SIan Rogers        "MetricName": "C2_Pkg_Residency",
7ecabdc6aSIan Rogers        "ScaleUnit": "100%"
8ecabdc6aSIan Rogers    },
9ecabdc6aSIan Rogers    {
10ecabdc6aSIan Rogers        "BriefDescription": "C3 residency percent per core",
11ecabdc6aSIan Rogers        "MetricExpr": "cstate_core@c3\\-residency@ / TSC",
12ecabdc6aSIan Rogers        "MetricGroup": "Power",
13ecabdc6aSIan Rogers        "MetricName": "C3_Core_Residency",
14ecabdc6aSIan Rogers        "ScaleUnit": "100%"
15ecabdc6aSIan Rogers    },
16ecabdc6aSIan Rogers    {
17ecabdc6aSIan Rogers        "BriefDescription": "C3 residency percent per package",
18ecabdc6aSIan Rogers        "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC",
19ecabdc6aSIan Rogers        "MetricGroup": "Power",
20ecabdc6aSIan Rogers        "MetricName": "C3_Pkg_Residency",
21ecabdc6aSIan Rogers        "ScaleUnit": "100%"
22ecabdc6aSIan Rogers    },
23ecabdc6aSIan Rogers    {
24100ee7c3SIan Rogers        "BriefDescription": "C6 residency percent per core",
25100ee7c3SIan Rogers        "MetricExpr": "cstate_core@c6\\-residency@ / TSC",
26100ee7c3SIan Rogers        "MetricGroup": "Power",
27100ee7c3SIan Rogers        "MetricName": "C6_Core_Residency",
28100ee7c3SIan Rogers        "ScaleUnit": "100%"
29100ee7c3SIan Rogers    },
30100ee7c3SIan Rogers    {
31ecabdc6aSIan Rogers        "BriefDescription": "C6 residency percent per package",
32ecabdc6aSIan Rogers        "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC",
33ecabdc6aSIan Rogers        "MetricGroup": "Power",
34ecabdc6aSIan Rogers        "MetricName": "C6_Pkg_Residency",
35ecabdc6aSIan Rogers        "ScaleUnit": "100%"
36ecabdc6aSIan Rogers    },
37ecabdc6aSIan Rogers    {
38100ee7c3SIan Rogers        "BriefDescription": "C7 residency percent per core",
39100ee7c3SIan Rogers        "MetricExpr": "cstate_core@c7\\-residency@ / TSC",
40100ee7c3SIan Rogers        "MetricGroup": "Power",
41100ee7c3SIan Rogers        "MetricName": "C7_Core_Residency",
42100ee7c3SIan Rogers        "ScaleUnit": "100%"
43100ee7c3SIan Rogers    },
44100ee7c3SIan Rogers    {
45ecabdc6aSIan Rogers        "BriefDescription": "C7 residency percent per package",
46ecabdc6aSIan Rogers        "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC",
47ecabdc6aSIan Rogers        "MetricGroup": "Power",
48ecabdc6aSIan Rogers        "MetricName": "C7_Pkg_Residency",
49ecabdc6aSIan Rogers        "ScaleUnit": "100%"
50100ee7c3SIan Rogers    },
51100ee7c3SIan Rogers    {
52100ee7c3SIan Rogers        "BriefDescription": "Uncore frequency per die [GHz]",
53b522c8afSIan Rogers        "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9",
54100ee7c3SIan Rogers        "MetricGroup": "SoC",
55100ee7c3SIan Rogers        "MetricName": "UNCORE_FREQ"
56100ee7c3SIan Rogers    },
57100ee7c3SIan Rogers    {
58b522c8afSIan Rogers        "BriefDescription": "Cycles per instruction retired; indicating how much time each executed instruction took; in units of cycles.",
59b522c8afSIan Rogers        "MetricExpr": "CPU_CLK_UNHALTED.THREAD / INST_RETIRED.ANY",
60b522c8afSIan Rogers        "MetricName": "cpi",
61b522c8afSIan Rogers        "ScaleUnit": "1per_instr"
62b522c8afSIan Rogers    },
63b522c8afSIan Rogers    {
64b522c8afSIan Rogers        "BriefDescription": "CPU operating frequency (in GHz)",
65b522c8afSIan Rogers        "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC * #SYSTEM_TSC_FREQ / 1e9",
66b522c8afSIan Rogers        "MetricName": "cpu_operating_frequency",
67b522c8afSIan Rogers        "ScaleUnit": "1GHz"
68b522c8afSIan Rogers    },
69b522c8afSIan Rogers    {
70b522c8afSIan Rogers        "BriefDescription": "Percentage of time spent in the active CPU power state C0",
71*4c10b96fSIan Rogers        "MetricExpr": "tma_info_system_cpus_utilized",
72b522c8afSIan Rogers        "MetricName": "cpu_utilization",
73b522c8afSIan Rogers        "ScaleUnit": "100%"
74b522c8afSIan Rogers    },
75b522c8afSIan Rogers    {
76b522c8afSIan Rogers        "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte page sizes) caused by demand data loads to the total number of completed instructions",
77b522c8afSIan Rogers        "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY",
78b522c8afSIan Rogers        "MetricName": "dtlb_2mb_large_page_load_mpi",
79b522c8afSIan Rogers        "PublicDescription": "Ratio of number of completed page walks (for 2 megabyte page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the Data Translation Lookaside Buffer (DTLB) and further levels of TLB.",
80b522c8afSIan Rogers        "ScaleUnit": "1per_instr"
81b522c8afSIan Rogers    },
82b522c8afSIan Rogers    {
83b522c8afSIan Rogers        "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions",
84b522c8afSIan Rogers        "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY",
85b522c8afSIan Rogers        "MetricName": "dtlb_load_mpi",
86b522c8afSIan Rogers        "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.",
87b522c8afSIan Rogers        "ScaleUnit": "1per_instr"
88b522c8afSIan Rogers    },
89b522c8afSIan Rogers    {
90b522c8afSIan Rogers        "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions",
91b522c8afSIan Rogers        "MetricExpr": "DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY",
92b522c8afSIan Rogers        "MetricName": "dtlb_store_mpi",
93b522c8afSIan Rogers        "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.",
94b522c8afSIan Rogers        "ScaleUnit": "1per_instr"
95b522c8afSIan Rogers    },
96b522c8afSIan Rogers    {
97b522c8afSIan Rogers        "BriefDescription": "Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU.",
98b522c8afSIan Rogers        "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3) * 4 / 1e6 / duration_time",
99b522c8afSIan Rogers        "MetricName": "io_bandwidth_read",
100b522c8afSIan Rogers        "ScaleUnit": "1MB/s"
101b522c8afSIan Rogers    },
102b522c8afSIan Rogers    {
103b522c8afSIan Rogers        "BriefDescription": "Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU.",
104b522c8afSIan Rogers        "MetricExpr": "(UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART0 + UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART1 + UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART2 + UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART3) * 4 / 1e6 / duration_time",
105b522c8afSIan Rogers        "MetricName": "io_bandwidth_write",
106b522c8afSIan Rogers        "ScaleUnit": "1MB/s"
107b522c8afSIan Rogers    },
108b522c8afSIan Rogers    {
109b522c8afSIan Rogers        "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions",
110b522c8afSIan Rogers        "MetricExpr": "ITLB_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY",
111b522c8afSIan Rogers        "MetricName": "itlb_large_page_mpi",
112b522c8afSIan Rogers        "PublicDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the Instruction Translation Lookaside Buffer (ITLB) and further levels of TLB.",
113b522c8afSIan Rogers        "ScaleUnit": "1per_instr"
114b522c8afSIan Rogers    },
115b522c8afSIan Rogers    {
116b522c8afSIan Rogers        "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions",
117b522c8afSIan Rogers        "MetricExpr": "ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY",
118b522c8afSIan Rogers        "MetricName": "itlb_mpi",
119b522c8afSIan Rogers        "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB.",
120b522c8afSIan Rogers        "ScaleUnit": "1per_instr"
121b522c8afSIan Rogers    },
122b522c8afSIan Rogers    {
123b522c8afSIan Rogers        "BriefDescription": "Ratio of number of code read requests missing in L1 instruction cache (includes prefetches) to the total number of completed instructions",
124b522c8afSIan Rogers        "MetricExpr": "L2_RQSTS.ALL_CODE_RD / INST_RETIRED.ANY",
125b522c8afSIan Rogers        "MetricName": "l1_i_code_read_misses_with_prefetches_per_instr",
126b522c8afSIan Rogers        "ScaleUnit": "1per_instr"
127b522c8afSIan Rogers    },
128b522c8afSIan Rogers    {
129b522c8afSIan Rogers        "BriefDescription": "Ratio of number of demand load requests hitting in L1 data cache to the total number of completed instructions",
130b522c8afSIan Rogers        "MetricExpr": "MEM_LOAD_RETIRED.L1_HIT / INST_RETIRED.ANY",
131b522c8afSIan Rogers        "MetricName": "l1d_demand_data_read_hits_per_instr",
132b522c8afSIan Rogers        "ScaleUnit": "1per_instr"
133b522c8afSIan Rogers    },
134b522c8afSIan Rogers    {
135b522c8afSIan Rogers        "BriefDescription": "Ratio of number of requests missing L1 data cache (includes data+rfo w/ prefetches) to the total number of completed instructions",
136b522c8afSIan Rogers        "MetricExpr": "L1D.REPLACEMENT / INST_RETIRED.ANY",
137b522c8afSIan Rogers        "MetricName": "l1d_mpi",
138b522c8afSIan Rogers        "ScaleUnit": "1per_instr"
139b522c8afSIan Rogers    },
140b522c8afSIan Rogers    {
141b522c8afSIan Rogers        "BriefDescription": "Ratio of number of code read requests missing L2 cache to the total number of completed instructions",
142b522c8afSIan Rogers        "MetricExpr": "L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY",
143b522c8afSIan Rogers        "MetricName": "l2_demand_code_mpi",
144b522c8afSIan Rogers        "ScaleUnit": "1per_instr"
145b522c8afSIan Rogers    },
146b522c8afSIan Rogers    {
147b522c8afSIan Rogers        "BriefDescription": "Ratio of number of completed demand load requests hitting in L2 cache to the total number of completed instructions",
148b522c8afSIan Rogers        "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT / INST_RETIRED.ANY",
149b522c8afSIan Rogers        "MetricName": "l2_demand_data_read_hits_per_instr",
150b522c8afSIan Rogers        "ScaleUnit": "1per_instr"
151b522c8afSIan Rogers    },
152b522c8afSIan Rogers    {
153b522c8afSIan Rogers        "BriefDescription": "Ratio of number of completed data read requests missing L2 cache to the total number of completed instructions",
154b522c8afSIan Rogers        "MetricExpr": "MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY",
155b522c8afSIan Rogers        "MetricName": "l2_demand_data_read_mpi",
156b522c8afSIan Rogers        "ScaleUnit": "1per_instr"
157b522c8afSIan Rogers    },
158b522c8afSIan Rogers    {
159b522c8afSIan Rogers        "BriefDescription": "Ratio of number of requests missing L2 cache (includes code+data+rfo w/ prefetches) to the total number of completed instructions",
160b522c8afSIan Rogers        "MetricExpr": "L2_LINES_IN.ALL / INST_RETIRED.ANY",
161b522c8afSIan Rogers        "MetricName": "l2_mpi",
162b522c8afSIan Rogers        "ScaleUnit": "1per_instr"
163b522c8afSIan Rogers    },
164b522c8afSIan Rogers    {
165b522c8afSIan Rogers        "BriefDescription": "Ratio of number of code read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions",
166*4c10b96fSIan Rogers        "MetricExpr": "cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x12cc0233@ / INST_RETIRED.ANY",
167b522c8afSIan Rogers        "MetricName": "llc_code_read_mpi_demand_plus_prefetch",
168b522c8afSIan Rogers        "ScaleUnit": "1per_instr"
169b522c8afSIan Rogers    },
170b522c8afSIan Rogers    {
171b522c8afSIan Rogers        "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) in nanoseconds",
172b522c8afSIan Rogers        "MetricExpr": "1e9 * (cha@UNC_CHA_TOR_OCCUPANCY.IA_MISS\\,config1\\=0x40433@ / cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40433@) / (UNC_CHA_CLOCKTICKS / (#num_cores / #num_packages * #num_packages)) * duration_time",
173b522c8afSIan Rogers        "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency",
174b522c8afSIan Rogers        "ScaleUnit": "1ns"
175b522c8afSIan Rogers    },
176b522c8afSIan Rogers    {
177b522c8afSIan Rogers        "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to local memory in nanoseconds",
178b522c8afSIan Rogers        "MetricExpr": "1e9 * (cha@UNC_CHA_TOR_OCCUPANCY.IA_MISS\\,config1\\=0x40432@ / cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40432@) / (UNC_CHA_CLOCKTICKS / (#num_cores / #num_packages * #num_packages)) * duration_time",
179b522c8afSIan Rogers        "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_local_requests",
180b522c8afSIan Rogers        "ScaleUnit": "1ns"
181b522c8afSIan Rogers    },
182b522c8afSIan Rogers    {
183b522c8afSIan Rogers        "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to remote memory in nanoseconds",
184b522c8afSIan Rogers        "MetricExpr": "1e9 * (cha@UNC_CHA_TOR_OCCUPANCY.IA_MISS\\,config1\\=0x40431@ / cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40431@) / (UNC_CHA_CLOCKTICKS / (#num_cores / #num_packages * #num_packages)) * duration_time",
185b522c8afSIan Rogers        "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_remote_requests",
186b522c8afSIan Rogers        "ScaleUnit": "1ns"
187b522c8afSIan Rogers    },
188b522c8afSIan Rogers    {
189b522c8afSIan Rogers        "BriefDescription": "Ratio of number of data read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions",
190*4c10b96fSIan Rogers        "MetricExpr": "cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x12d40433@ / INST_RETIRED.ANY",
191b522c8afSIan Rogers        "MetricName": "llc_data_read_mpi_demand_plus_prefetch",
192b522c8afSIan Rogers        "ScaleUnit": "1per_instr"
193b522c8afSIan Rogers    },
194b522c8afSIan Rogers    {
195b522c8afSIan Rogers        "BriefDescription": "Bandwidth (MB/sec) of read requests that miss the last level cache (LLC) and go to local memory.",
196b522c8afSIan Rogers        "MetricExpr": "UNC_CHA_REQUESTS.READS_LOCAL * 64 / 1e6 / duration_time",
197b522c8afSIan Rogers        "MetricName": "llc_miss_local_memory_bandwidth_read",
198b522c8afSIan Rogers        "ScaleUnit": "1MB/s"
199b522c8afSIan Rogers    },
200b522c8afSIan Rogers    {
201b522c8afSIan Rogers        "BriefDescription": "Bandwidth (MB/sec) of write requests that miss the last level cache (LLC) and go to local memory.",
202b522c8afSIan Rogers        "MetricExpr": "UNC_CHA_REQUESTS.WRITES_LOCAL * 64 / 1e6 / duration_time",
203b522c8afSIan Rogers        "MetricName": "llc_miss_local_memory_bandwidth_write",
204b522c8afSIan Rogers        "ScaleUnit": "1MB/s"
205b522c8afSIan Rogers    },
206b522c8afSIan Rogers    {
207b522c8afSIan Rogers        "BriefDescription": "Bandwidth (MB/sec) of read requests that miss the last level cache (LLC) and go to remote memory.",
208b522c8afSIan Rogers        "MetricExpr": "UNC_CHA_REQUESTS.READS_REMOTE * 64 / 1e6 / duration_time",
209b522c8afSIan Rogers        "MetricName": "llc_miss_remote_memory_bandwidth_read",
210b522c8afSIan Rogers        "ScaleUnit": "1MB/s"
211b522c8afSIan Rogers    },
212b522c8afSIan Rogers    {
213e2c8b40eSIan Rogers        "BriefDescription": "Bandwidth (MB/sec) of write requests that miss the last level cache (LLC) and go to remote memory.",
214e2c8b40eSIan Rogers        "MetricExpr": "UNC_CHA_REQUESTS.WRITES_REMOTE * 64 / 1e6 / duration_time",
215e2c8b40eSIan Rogers        "MetricName": "llc_miss_remote_memory_bandwidth_write",
216e2c8b40eSIan Rogers        "ScaleUnit": "1MB/s"
217e2c8b40eSIan Rogers    },
218e2c8b40eSIan Rogers    {
219b522c8afSIan Rogers        "BriefDescription": "The ratio of number of completed memory load instructions to the total number of completed instructions",
220b522c8afSIan Rogers        "MetricExpr": "MEM_INST_RETIRED.ALL_LOADS / INST_RETIRED.ANY",
221b522c8afSIan Rogers        "MetricName": "loads_per_instr",
222b522c8afSIan Rogers        "ScaleUnit": "1per_instr"
223b522c8afSIan Rogers    },
224b522c8afSIan Rogers    {
225b522c8afSIan Rogers        "BriefDescription": "DDR memory read bandwidth (MB/sec)",
226b522c8afSIan Rogers        "MetricExpr": "UNC_M_CAS_COUNT.RD * 64 / 1e6 / duration_time",
227b522c8afSIan Rogers        "MetricName": "memory_bandwidth_read",
228b522c8afSIan Rogers        "ScaleUnit": "1MB/s"
229b522c8afSIan Rogers    },
230b522c8afSIan Rogers    {
231b522c8afSIan Rogers        "BriefDescription": "DDR memory bandwidth (MB/sec)",
232b522c8afSIan Rogers        "MetricExpr": "(UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) * 64 / 1e6 / duration_time",
233b522c8afSIan Rogers        "MetricName": "memory_bandwidth_total",
234b522c8afSIan Rogers        "ScaleUnit": "1MB/s"
235b522c8afSIan Rogers    },
236b522c8afSIan Rogers    {
237b522c8afSIan Rogers        "BriefDescription": "DDR memory write bandwidth (MB/sec)",
238b522c8afSIan Rogers        "MetricExpr": "UNC_M_CAS_COUNT.WR * 64 / 1e6 / duration_time",
239b522c8afSIan Rogers        "MetricName": "memory_bandwidth_write",
240b522c8afSIan Rogers        "ScaleUnit": "1MB/s"
241b522c8afSIan Rogers    },
242b522c8afSIan Rogers    {
243b522c8afSIan Rogers        "BriefDescription": "Memory reads that miss the last level cache (LLC) addressed to local DRAM as a percentage of total memory read accesses, does not include LLC prefetches.",
244b522c8afSIan Rogers        "MetricExpr": "cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40432@ / (cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40432@ + cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40431@)",
245b522c8afSIan Rogers        "MetricName": "numa_reads_addressed_to_local_dram",
246b522c8afSIan Rogers        "ScaleUnit": "100%"
247b522c8afSIan Rogers    },
248b522c8afSIan Rogers    {
249b522c8afSIan Rogers        "BriefDescription": "Memory reads that miss the last level cache (LLC) addressed to remote DRAM as a percentage of total memory read accesses, does not include LLC prefetches.",
250b522c8afSIan Rogers        "MetricExpr": "cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40431@ / (cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40432@ + cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40431@)",
251b522c8afSIan Rogers        "MetricName": "numa_reads_addressed_to_remote_dram",
252b522c8afSIan Rogers        "ScaleUnit": "100%"
253b522c8afSIan Rogers    },
254b522c8afSIan Rogers    {
255b522c8afSIan Rogers        "BriefDescription": "Uops delivered from decoded instruction cache (decoded stream buffer or DSB) as a percent of total uops delivered to Instruction Decode Queue",
256b522c8afSIan Rogers        "MetricExpr": "IDQ.DSB_UOPS / UOPS_ISSUED.ANY",
257b522c8afSIan Rogers        "MetricName": "percent_uops_delivered_from_decoded_icache",
258b522c8afSIan Rogers        "ScaleUnit": "100%"
259b522c8afSIan Rogers    },
260b522c8afSIan Rogers    {
261b522c8afSIan Rogers        "BriefDescription": "Uops delivered from legacy decode pipeline (Micro-instruction Translation Engine or MITE) as a percent of total uops delivered to Instruction Decode Queue",
262b522c8afSIan Rogers        "MetricExpr": "IDQ.MITE_UOPS / UOPS_ISSUED.ANY",
263b522c8afSIan Rogers        "MetricName": "percent_uops_delivered_from_legacy_decode_pipeline",
264b522c8afSIan Rogers        "ScaleUnit": "100%"
265b522c8afSIan Rogers    },
266b522c8afSIan Rogers    {
267b522c8afSIan Rogers        "BriefDescription": "Uops delivered from microcode sequencer (MS) as a percent of total uops delivered to Instruction Decode Queue",
268b522c8afSIan Rogers        "MetricExpr": "IDQ.MS_UOPS / UOPS_ISSUED.ANY",
269b522c8afSIan Rogers        "MetricName": "percent_uops_delivered_from_microcode_sequencer",
270b522c8afSIan Rogers        "ScaleUnit": "100%"
271b522c8afSIan Rogers    },
272b522c8afSIan Rogers    {
273100ee7c3SIan Rogers        "BriefDescription": "Percentage of cycles spent in System Management Interrupts.",
274100ee7c3SIan Rogers        "MetricExpr": "((msr@aperf@ - cycles) / msr@aperf@ if msr@smi@ > 0 else 0)",
275100ee7c3SIan Rogers        "MetricGroup": "smi",
276100ee7c3SIan Rogers        "MetricName": "smi_cycles",
277100ee7c3SIan Rogers        "MetricThreshold": "smi_cycles > 0.1",
278100ee7c3SIan Rogers        "ScaleUnit": "100%"
279100ee7c3SIan Rogers    },
280100ee7c3SIan Rogers    {
281100ee7c3SIan Rogers        "BriefDescription": "Number of SMI interrupts.",
282100ee7c3SIan Rogers        "MetricExpr": "msr@smi@",
283100ee7c3SIan Rogers        "MetricGroup": "smi",
284100ee7c3SIan Rogers        "MetricName": "smi_num",
285100ee7c3SIan Rogers        "ScaleUnit": "1SMI#"
286100ee7c3SIan Rogers    },
287100ee7c3SIan Rogers    {
288b522c8afSIan Rogers        "BriefDescription": "The ratio of number of completed memory store instructions to the total number of completed instructions",
289b522c8afSIan Rogers        "MetricExpr": "MEM_INST_RETIRED.ALL_STORES / INST_RETIRED.ANY",
290b522c8afSIan Rogers        "MetricName": "stores_per_instr",
291b522c8afSIan Rogers        "ScaleUnit": "1per_instr"
292b522c8afSIan Rogers    },
293b522c8afSIan Rogers    {
294100ee7c3SIan Rogers        "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset",
295b522c8afSIan Rogers        "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks",
296100ee7c3SIan Rogers        "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
297100ee7c3SIan Rogers        "MetricName": "tma_4k_aliasing",
298100ee7c3SIan Rogers        "MetricThreshold": "tma_4k_aliasing > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
299100ee7c3SIan Rogers        "PublicDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset. False matches are possible; which incur a few cycles of load re-issue. However; the short re-issue duration is often hidden by the out-of-order core and HW optimizations; hence a user may safely ignore a high value of this metric unless it manages to propagate up into parent nodes of the hierarchy (e.g. to L1_Bound).",
300100ee7c3SIan Rogers        "ScaleUnit": "100%"
301100ee7c3SIan Rogers    },
302100ee7c3SIan Rogers    {
303100ee7c3SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.",
304b522c8afSIan Rogers        "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots",
305100ee7c3SIan Rogers        "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
306100ee7c3SIan Rogers        "MetricName": "tma_alu_op_utilization",
307e2c8b40eSIan Rogers        "MetricThreshold": "tma_alu_op_utilization > 0.4",
308100ee7c3SIan Rogers        "ScaleUnit": "100%"
309100ee7c3SIan Rogers    },
310100ee7c3SIan Rogers    {
311100ee7c3SIan Rogers        "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
312e2c8b40eSIan Rogers        "MetricExpr": "34 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / tma_info_thread_slots",
313*4c10b96fSIan Rogers        "MetricGroup": "BvIO;TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
314100ee7c3SIan Rogers        "MetricName": "tma_assists",
315100ee7c3SIan Rogers        "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
316100ee7c3SIan Rogers        "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. Sample with: OTHER_ASSISTS.ANY",
317100ee7c3SIan Rogers        "ScaleUnit": "100%"
318100ee7c3SIan Rogers    },
319100ee7c3SIan Rogers    {
320100ee7c3SIan Rogers        "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
321b522c8afSIan Rogers        "MetricExpr": "1 - tma_frontend_bound - (UOPS_ISSUED.ANY + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_thread_slots",
322*4c10b96fSIan Rogers        "MetricGroup": "BvOB;TmaL1;TopdownL1;tma_L1_group",
323100ee7c3SIan Rogers        "MetricName": "tma_backend_bound",
324100ee7c3SIan Rogers        "MetricThreshold": "tma_backend_bound > 0.2",
325ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL1",
326100ee7c3SIan Rogers        "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
327100ee7c3SIan Rogers        "ScaleUnit": "100%"
328100ee7c3SIan Rogers    },
329100ee7c3SIan Rogers    {
330100ee7c3SIan Rogers        "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations",
331b522c8afSIan Rogers        "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_thread_slots",
332100ee7c3SIan Rogers        "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
333100ee7c3SIan Rogers        "MetricName": "tma_bad_speculation",
334100ee7c3SIan Rogers        "MetricThreshold": "tma_bad_speculation > 0.15",
335ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL1",
336100ee7c3SIan Rogers        "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This includes slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to mispredicted branches is categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
337100ee7c3SIan Rogers        "ScaleUnit": "100%"
338100ee7c3SIan Rogers    },
339100ee7c3SIan Rogers    {
340100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction",
341100ee7c3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
342100ee7c3SIan Rogers        "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * tma_bad_speculation",
343*4c10b96fSIan Rogers        "MetricGroup": "BadSpec;BrMispredicts;BvMP;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
344100ee7c3SIan Rogers        "MetricName": "tma_branch_mispredicts",
345100ee7c3SIan Rogers        "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
346ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
347b522c8afSIan Rogers        "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers",
348100ee7c3SIan Rogers        "ScaleUnit": "100%"
349100ee7c3SIan Rogers    },
350100ee7c3SIan Rogers    {
351100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers",
352b522c8afSIan Rogers        "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks + tma_unknown_branches",
353100ee7c3SIan Rogers        "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group",
354100ee7c3SIan Rogers        "MetricName": "tma_branch_resteers",
355100ee7c3SIan Rogers        "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
356100ee7c3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of mispredicted branches. For example; branchy code with lots of mispredictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES",
357100ee7c3SIan Rogers        "ScaleUnit": "100%"
358100ee7c3SIan Rogers    },
359100ee7c3SIan Rogers    {
360100ee7c3SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction",
361100ee7c3SIan Rogers        "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)",
362100ee7c3SIan Rogers        "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
363100ee7c3SIan Rogers        "MetricName": "tma_cisc",
364100ee7c3SIan Rogers        "MetricThreshold": "tma_cisc > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
365100ee7c3SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources.",
366100ee7c3SIan Rogers        "ScaleUnit": "100%"
367100ee7c3SIan Rogers    },
368100ee7c3SIan Rogers    {
369100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears",
370b522c8afSIan Rogers        "MetricExpr": "(1 - BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks",
371100ee7c3SIan Rogers        "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueMC",
372100ee7c3SIan Rogers        "MetricName": "tma_clears_resteers",
373100ee7c3SIan Rogers        "MetricThreshold": "tma_clears_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
374100ee7c3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches",
375100ee7c3SIan Rogers        "ScaleUnit": "100%"
376100ee7c3SIan Rogers    },
377100ee7c3SIan Rogers    {
378100ee7c3SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
379100ee7c3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
380e2c8b40eSIan Rogers        "MetricExpr": "(44 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 44 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
381*4c10b96fSIan Rogers        "MetricGroup": "BvMS;DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
382100ee7c3SIan Rogers        "MetricName": "tma_contested_accesses",
383100ee7c3SIan Rogers        "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
384100ee7c3SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS_PS. Related metrics: tma_data_sharing, tma_false_sharing, tma_machine_clears, tma_remote_cache",
385100ee7c3SIan Rogers        "ScaleUnit": "100%"
386100ee7c3SIan Rogers    },
387100ee7c3SIan Rogers    {
388100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were a bottleneck",
389100ee7c3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
390100ee7c3SIan Rogers        "MetricExpr": "tma_backend_bound - tma_memory_bound",
391100ee7c3SIan Rogers        "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
392100ee7c3SIan Rogers        "MetricName": "tma_core_bound",
393100ee7c3SIan Rogers        "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
394ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
395100ee7c3SIan Rogers        "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were a bottleneck.  Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
396100ee7c3SIan Rogers        "ScaleUnit": "100%"
397100ee7c3SIan Rogers    },
398100ee7c3SIan Rogers    {
399100ee7c3SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses",
400100ee7c3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
401e2c8b40eSIan Rogers        "MetricExpr": "44 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
402*4c10b96fSIan Rogers        "MetricGroup": "BvMS;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
403100ee7c3SIan Rogers        "MetricName": "tma_data_sharing",
404100ee7c3SIan Rogers        "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
405100ee7c3SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT_PS. Related metrics: tma_contested_accesses, tma_false_sharing, tma_machine_clears, tma_remote_cache",
406100ee7c3SIan Rogers        "ScaleUnit": "100%"
407100ee7c3SIan Rogers    },
408100ee7c3SIan Rogers    {
409100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder",
410b522c8afSIan Rogers        "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2",
411100ee7c3SIan Rogers        "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group",
412100ee7c3SIan Rogers        "MetricName": "tma_decoder0_alone",
413e2c8b40eSIan Rogers        "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)",
414100ee7c3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions",
415100ee7c3SIan Rogers        "ScaleUnit": "100%"
416100ee7c3SIan Rogers    },
417100ee7c3SIan Rogers    {
418100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active",
419b522c8afSIan Rogers        "MetricExpr": "ARITH.DIVIDER_ACTIVE / tma_info_thread_clks",
420*4c10b96fSIan Rogers        "MetricGroup": "BvCB;TopdownL3;tma_L3_group;tma_core_bound_group",
421100ee7c3SIan Rogers        "MetricName": "tma_divider",
422100ee7c3SIan Rogers        "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
423100ee7c3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_ACTIVE",
424100ee7c3SIan Rogers        "ScaleUnit": "100%"
425100ee7c3SIan Rogers    },
426100ee7c3SIan Rogers    {
427100ee7c3SIan Rogers        "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads",
428100ee7c3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
429b522c8afSIan Rogers        "MetricExpr": "CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound",
430100ee7c3SIan Rogers        "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
431100ee7c3SIan Rogers        "MetricName": "tma_dram_bound",
432100ee7c3SIan Rogers        "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
433100ee7c3SIan Rogers        "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_MISS_PS",
434100ee7c3SIan Rogers        "ScaleUnit": "100%"
435100ee7c3SIan Rogers    },
436100ee7c3SIan Rogers    {
437100ee7c3SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline",
438e2c8b40eSIan Rogers        "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2",
439100ee7c3SIan Rogers        "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
440100ee7c3SIan Rogers        "MetricName": "tma_dsb",
441e2c8b40eSIan Rogers        "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2",
442100ee7c3SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline.  For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
443100ee7c3SIan Rogers        "ScaleUnit": "100%"
444100ee7c3SIan Rogers    },
445100ee7c3SIan Rogers    {
446100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines",
447b522c8afSIan Rogers        "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks",
448100ee7c3SIan Rogers        "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB",
449100ee7c3SIan Rogers        "MetricName": "tma_dsb_switches",
450100ee7c3SIan Rogers        "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
451*4c10b96fSIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivers higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
452100ee7c3SIan Rogers        "ScaleUnit": "100%"
453100ee7c3SIan Rogers    },
454100ee7c3SIan Rogers    {
455100ee7c3SIan Rogers        "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses",
456100ee7c3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS_NMI",
457b522c8afSIan Rogers        "MetricExpr": "min(9 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks",
458*4c10b96fSIan Rogers        "MetricGroup": "BvMT;MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group",
459100ee7c3SIan Rogers        "MetricName": "tma_dtlb_load",
460100ee7c3SIan Rogers        "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
461e2c8b40eSIan Rogers        "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization",
462100ee7c3SIan Rogers        "ScaleUnit": "100%"
463100ee7c3SIan Rogers    },
464100ee7c3SIan Rogers    {
465100ee7c3SIan Rogers        "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses",
466b522c8afSIan Rogers        "MetricExpr": "(9 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_core_clks",
467*4c10b96fSIan Rogers        "MetricGroup": "BvMT;MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group",
468100ee7c3SIan Rogers        "MetricName": "tma_dtlb_store",
469100ee7c3SIan Rogers        "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
470e2c8b40eSIan Rogers        "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses.  As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead.  Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page.  Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization",
471100ee7c3SIan Rogers        "ScaleUnit": "100%"
472100ee7c3SIan Rogers    },
473100ee7c3SIan Rogers    {
474100ee7c3SIan Rogers        "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing",
475100ee7c3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
476e2c8b40eSIan Rogers        "MetricExpr": "(110 * tma_info_system_core_frequency * (OFFCORE_RESPONSE.DEMAND_RFO.L3_MISS.REMOTE_HITM + OFFCORE_RESPONSE.PF_L2_RFO.L3_MISS.REMOTE_HITM) + 47.5 * tma_info_system_core_frequency * (OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.PF_L2_RFO.L3_HIT.HITM_OTHER_CORE)) / tma_info_thread_clks",
477*4c10b96fSIan Rogers        "MetricGroup": "BvMS;DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group",
478100ee7c3SIan Rogers        "MetricName": "tma_false_sharing",
479100ee7c3SIan Rogers        "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
480100ee7c3SIan Rogers        "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM. Related metrics: tma_contested_accesses, tma_data_sharing, tma_machine_clears, tma_remote_cache",
481100ee7c3SIan Rogers        "ScaleUnit": "100%"
482100ee7c3SIan Rogers    },
483100ee7c3SIan Rogers    {
484100ee7c3SIan Rogers        "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed",
485100ee7c3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS_NMI",
486b522c8afSIan Rogers        "MetricExpr": "tma_info_memory_load_miss_real_latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / tma_info_thread_clks",
487*4c10b96fSIan Rogers        "MetricGroup": "BvMS;MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group",
488100ee7c3SIan Rogers        "MetricName": "tma_fb_full",
489100ee7c3SIan Rogers        "MetricThreshold": "tma_fb_full > 0.3",
490e2c8b40eSIan Rogers        "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
491100ee7c3SIan Rogers        "ScaleUnit": "100%"
492100ee7c3SIan Rogers    },
493100ee7c3SIan Rogers    {
494100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues",
495100ee7c3SIan Rogers        "MetricExpr": "tma_frontend_bound - tma_fetch_latency",
496100ee7c3SIan Rogers        "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
497100ee7c3SIan Rogers        "MetricName": "tma_fetch_bandwidth",
498e2c8b40eSIan Rogers        "MetricThreshold": "tma_fetch_bandwidth > 0.2",
499ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
500*4c10b96fSIan Rogers        "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
501100ee7c3SIan Rogers        "ScaleUnit": "100%"
502100ee7c3SIan Rogers    },
503100ee7c3SIan Rogers    {
504100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues",
505b522c8afSIan Rogers        "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / tma_info_thread_slots",
506100ee7c3SIan Rogers        "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
507100ee7c3SIan Rogers        "MetricName": "tma_fetch_latency",
508100ee7c3SIan Rogers        "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
509ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
510100ee7c3SIan Rogers        "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues.  For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS",
511100ee7c3SIan Rogers        "ScaleUnit": "100%"
512100ee7c3SIan Rogers    },
513100ee7c3SIan Rogers    {
514100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops",
515*4c10b96fSIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS_NMI",
516100ee7c3SIan Rogers        "MetricExpr": "tma_heavy_operations - tma_microcode_sequencer",
517100ee7c3SIan Rogers        "MetricGroup": "TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueD0",
518100ee7c3SIan Rogers        "MetricName": "tma_few_uops_instructions",
519100ee7c3SIan Rogers        "MetricThreshold": "tma_few_uops_instructions > 0.05 & tma_heavy_operations > 0.1",
520100ee7c3SIan Rogers        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops. This highly-correlates with the number of uops in such instructions. Related metrics: tma_decoder0_alone",
521100ee7c3SIan Rogers        "ScaleUnit": "100%"
522100ee7c3SIan Rogers    },
523100ee7c3SIan Rogers    {
524100ee7c3SIan Rogers        "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)",
525100ee7c3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
526100ee7c3SIan Rogers        "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector",
527100ee7c3SIan Rogers        "MetricGroup": "HPC;TopdownL3;tma_L3_group;tma_light_operations_group",
528100ee7c3SIan Rogers        "MetricName": "tma_fp_arith",
529100ee7c3SIan Rogers        "MetricThreshold": "tma_fp_arith > 0.2 & tma_light_operations > 0.6",
530100ee7c3SIan Rogers        "PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.",
531100ee7c3SIan Rogers        "ScaleUnit": "100%"
532100ee7c3SIan Rogers    },
533100ee7c3SIan Rogers    {
534e2c8b40eSIan Rogers        "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handling Floating Point (FP) Assists",
535e2c8b40eSIan Rogers        "MetricExpr": "34 * FP_ASSIST.ANY / tma_info_thread_slots",
536e2c8b40eSIan Rogers        "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group",
537e2c8b40eSIan Rogers        "MetricName": "tma_fp_assists",
538e2c8b40eSIan Rogers        "MetricThreshold": "tma_fp_assists > 0.1",
539e2c8b40eSIan Rogers        "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handling Floating Point (FP) Assists. FP Assist may apply when working with very small floating point values (so-called Denormals).",
540e2c8b40eSIan Rogers        "ScaleUnit": "100%"
541e2c8b40eSIan Rogers    },
542e2c8b40eSIan Rogers    {
543100ee7c3SIan Rogers        "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired",
544*4c10b96fSIan Rogers        "MetricExpr": "FP_ARITH_INST_RETIRED.SCALAR / UOPS_RETIRED.RETIRE_SLOTS",
545100ee7c3SIan Rogers        "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
546100ee7c3SIan Rogers        "MetricName": "tma_fp_scalar",
547100ee7c3SIan Rogers        "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
548100ee7c3SIan Rogers        "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting. Related metrics: tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
549100ee7c3SIan Rogers        "ScaleUnit": "100%"
550100ee7c3SIan Rogers    },
551100ee7c3SIan Rogers    {
552100ee7c3SIan Rogers        "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths",
553100ee7c3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
554100ee7c3SIan Rogers        "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@ / UOPS_RETIRED.RETIRE_SLOTS",
555100ee7c3SIan Rogers        "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
556100ee7c3SIan Rogers        "MetricName": "tma_fp_vector",
557100ee7c3SIan Rogers        "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
558100ee7c3SIan Rogers        "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
559100ee7c3SIan Rogers        "ScaleUnit": "100%"
560100ee7c3SIan Rogers    },
561100ee7c3SIan Rogers    {
562100ee7c3SIan Rogers        "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors",
563100ee7c3SIan Rogers        "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS",
564100ee7c3SIan Rogers        "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P",
565100ee7c3SIan Rogers        "MetricName": "tma_fp_vector_128b",
566100ee7c3SIan Rogers        "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))",
567100ee7c3SIan Rogers        "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
568100ee7c3SIan Rogers        "ScaleUnit": "100%"
569100ee7c3SIan Rogers    },
570100ee7c3SIan Rogers    {
571100ee7c3SIan Rogers        "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors",
572100ee7c3SIan Rogers        "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS",
573100ee7c3SIan Rogers        "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P",
574100ee7c3SIan Rogers        "MetricName": "tma_fp_vector_256b",
575100ee7c3SIan Rogers        "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))",
576100ee7c3SIan Rogers        "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
577100ee7c3SIan Rogers        "ScaleUnit": "100%"
578100ee7c3SIan Rogers    },
579100ee7c3SIan Rogers    {
580100ee7c3SIan Rogers        "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors",
581100ee7c3SIan Rogers        "MetricExpr": "(FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS",
582100ee7c3SIan Rogers        "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P",
583100ee7c3SIan Rogers        "MetricName": "tma_fp_vector_512b",
584100ee7c3SIan Rogers        "MetricThreshold": "tma_fp_vector_512b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))",
585100ee7c3SIan Rogers        "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
586100ee7c3SIan Rogers        "ScaleUnit": "100%"
587100ee7c3SIan Rogers    },
588100ee7c3SIan Rogers    {
589100ee7c3SIan Rogers        "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend",
590b522c8afSIan Rogers        "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_thread_slots",
591*4c10b96fSIan Rogers        "MetricGroup": "BvFB;BvIO;PGO;TmaL1;TopdownL1;tma_L1_group",
592100ee7c3SIan Rogers        "MetricName": "tma_frontend_bound",
593100ee7c3SIan Rogers        "MetricThreshold": "tma_frontend_bound > 0.15",
594ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL1",
595100ee7c3SIan Rogers        "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS",
596100ee7c3SIan Rogers        "ScaleUnit": "100%"
597100ee7c3SIan Rogers    },
598100ee7c3SIan Rogers    {
599100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions",
600100ee7c3SIan Rogers        "MetricExpr": "tma_light_operations * UOPS_RETIRED.MACRO_FUSED / UOPS_RETIRED.RETIRE_SLOTS",
601*4c10b96fSIan Rogers        "MetricGroup": "Branches;BvBO;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
602100ee7c3SIan Rogers        "MetricName": "tma_fused_instructions",
603100ee7c3SIan Rogers        "MetricThreshold": "tma_fused_instructions > 0.1 & tma_light_operations > 0.6",
604e2c8b40eSIan Rogers        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. CMP+JCC or DEC+JCC are common examples of legacy fusions. ([MTL] Note new MOV+OP and Load+OP fusions appear under Other_Light_Ops in MTL!)",
605100ee7c3SIan Rogers        "ScaleUnit": "100%"
606100ee7c3SIan Rogers    },
607100ee7c3SIan Rogers    {
608100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences",
609b522c8afSIan Rogers        "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY) / tma_info_thread_slots",
610100ee7c3SIan Rogers        "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
611100ee7c3SIan Rogers        "MetricName": "tma_heavy_operations",
612100ee7c3SIan Rogers        "MetricThreshold": "tma_heavy_operations > 0.1",
613ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
614e2c8b40eSIan Rogers        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events.)",
615100ee7c3SIan Rogers        "ScaleUnit": "100%"
616100ee7c3SIan Rogers    },
617100ee7c3SIan Rogers    {
618100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses",
619b522c8afSIan Rogers        "MetricExpr": "(ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@) / tma_info_thread_clks",
620*4c10b96fSIan Rogers        "MetricGroup": "BigFootprint;BvBC;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
621100ee7c3SIan Rogers        "MetricName": "tma_icache_misses",
622100ee7c3SIan Rogers        "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
623100ee7c3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS",
624100ee7c3SIan Rogers        "ScaleUnit": "100%"
625100ee7c3SIan Rogers    },
626100ee7c3SIan Rogers    {
627100ee7c3SIan Rogers        "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
628e2c8b40eSIan Rogers        "MetricExpr": "tma_info_bottleneck_mispredictions * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES / 100",
629100ee7c3SIan Rogers        "MetricGroup": "Bad;BrMispredicts;tma_issueBM",
630b522c8afSIan Rogers        "MetricName": "tma_info_bad_spec_branch_misprediction_cost",
631b522c8afSIan Rogers        "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers"
632100ee7c3SIan Rogers    },
633100ee7c3SIan Rogers    {
634b522c8afSIan Rogers        "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).",
635e2c8b40eSIan Rogers        "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * BR_MISP_EXEC.INDIRECT)",
636b522c8afSIan Rogers        "MetricGroup": "Bad;BrMispredicts",
637b522c8afSIan Rogers        "MetricName": "tma_info_bad_spec_ipmisp_indirect",
638b522c8afSIan Rogers        "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3"
639100ee7c3SIan Rogers    },
640100ee7c3SIan Rogers    {
641b522c8afSIan Rogers        "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)",
642e2c8b40eSIan Rogers        "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
643b522c8afSIan Rogers        "MetricGroup": "Bad;BadSpec;BrMispredicts",
644b522c8afSIan Rogers        "MetricName": "tma_info_bad_spec_ipmispredict",
645b522c8afSIan Rogers        "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200"
646100ee7c3SIan Rogers    },
647100ee7c3SIan Rogers    {
648e2c8b40eSIan Rogers        "BriefDescription": "Speculative to Retired ratio of all clears (covering mispredicts and nukes)",
649e2c8b40eSIan Rogers        "MetricExpr": "INT_MISC.CLEARS_COUNT / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)",
650e2c8b40eSIan Rogers        "MetricGroup": "BrMispredicts",
651e2c8b40eSIan Rogers        "MetricName": "tma_info_bad_spec_spec_clears_ratio"
652e2c8b40eSIan Rogers    },
653e2c8b40eSIan Rogers    {
654e2c8b40eSIan Rogers        "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts",
655100ee7c3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
656b522c8afSIan Rogers        "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)",
657100ee7c3SIan Rogers        "MetricGroup": "Cor;SMT",
658b522c8afSIan Rogers        "MetricName": "tma_info_botlnk_l0_core_bound_likely",
659b522c8afSIan Rogers        "MetricThreshold": "tma_info_botlnk_l0_core_bound_likely > 0.5"
660100ee7c3SIan Rogers    },
661100ee7c3SIan Rogers    {
662*4c10b96fSIan Rogers        "BriefDescription": "Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck",
663*4c10b96fSIan Rogers        "MetricExpr": "100 * (tma_frontend_bound * (tma_fetch_bandwidth / (tma_fetch_bandwidth + tma_fetch_latency)) * (tma_dsb / (tma_dsb + tma_mite)))",
664*4c10b96fSIan Rogers        "MetricGroup": "DSB;FetchBW;tma_issueFB",
665*4c10b96fSIan Rogers        "MetricName": "tma_info_botlnk_l2_dsb_bandwidth",
666*4c10b96fSIan Rogers        "MetricThreshold": "tma_info_botlnk_l2_dsb_bandwidth > 10",
667*4c10b96fSIan Rogers        "PublicDescription": "Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp"
668*4c10b96fSIan Rogers    },
669*4c10b96fSIan Rogers    {
670100ee7c3SIan Rogers        "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck",
671100ee7c3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
672100ee7c3SIan Rogers        "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_mite))",
673100ee7c3SIan Rogers        "MetricGroup": "DSBmiss;Fed;tma_issueFB",
674b522c8afSIan Rogers        "MetricName": "tma_info_botlnk_l2_dsb_misses",
675b522c8afSIan Rogers        "MetricThreshold": "tma_info_botlnk_l2_dsb_misses > 10",
676*4c10b96fSIan Rogers        "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp"
677100ee7c3SIan Rogers    },
678100ee7c3SIan Rogers    {
679100ee7c3SIan Rogers        "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck",
680100ee7c3SIan Rogers        "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
681100ee7c3SIan Rogers        "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL",
682b522c8afSIan Rogers        "MetricName": "tma_info_botlnk_l2_ic_misses",
683b522c8afSIan Rogers        "MetricThreshold": "tma_info_botlnk_l2_ic_misses > 5",
684100ee7c3SIan Rogers        "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck."
685100ee7c3SIan Rogers    },
686100ee7c3SIan Rogers    {
687b522c8afSIan Rogers        "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache, TLB and BTB misses)",
688b522c8afSIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
689b522c8afSIan Rogers        "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)",
690*4c10b96fSIan Rogers        "MetricGroup": "BigFootprint;BvBC;Fed;Frontend;IcMiss;MemoryTLB",
691b522c8afSIan Rogers        "MetricName": "tma_info_bottleneck_big_code",
692e2c8b40eSIan Rogers        "MetricThreshold": "tma_info_bottleneck_big_code > 20"
693100ee7c3SIan Rogers    },
694100ee7c3SIan Rogers    {
695*4c10b96fSIan Rogers        "BriefDescription": "Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA",
696*4c10b96fSIan Rogers        "MetricExpr": "100 * ((BR_INST_RETIRED.ALL_BRANCHES + 2 * BR_INST_RETIRED.NEAR_CALL + INST_RETIRED.NOP) / tma_info_thread_slots)",
697*4c10b96fSIan Rogers        "MetricGroup": "BvBO;Ret",
698b522c8afSIan Rogers        "MetricName": "tma_info_bottleneck_branching_overhead",
699*4c10b96fSIan Rogers        "MetricThreshold": "tma_info_bottleneck_branching_overhead > 5",
700*4c10b96fSIan Rogers        "PublicDescription": "Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls, loops and alignments. (A lower bound)"
701e2c8b40eSIan Rogers    },
702e2c8b40eSIan Rogers    {
703e2c8b40eSIan Rogers        "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks",
704*4c10b96fSIan Rogers        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_hit_latency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))",
705*4c10b96fSIan Rogers        "MetricGroup": "BvMB;Mem;MemoryBW;Offcore;tma_issueBW",
706e2c8b40eSIan Rogers        "MetricName": "tma_info_bottleneck_cache_memory_bandwidth",
707e2c8b40eSIan Rogers        "MetricThreshold": "tma_info_bottleneck_cache_memory_bandwidth > 20",
708e2c8b40eSIan Rogers        "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full"
709e2c8b40eSIan Rogers    },
710e2c8b40eSIan Rogers    {
711e2c8b40eSIan Rogers        "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks",
712*4c10b96fSIan Rogers        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_hit_latency / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_hit_latency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))",
713*4c10b96fSIan Rogers        "MetricGroup": "BvML;Mem;MemoryLat;Offcore;tma_issueLat",
714e2c8b40eSIan Rogers        "MetricName": "tma_info_bottleneck_cache_memory_latency",
715e2c8b40eSIan Rogers        "MetricThreshold": "tma_info_bottleneck_cache_memory_latency > 20",
716e2c8b40eSIan Rogers        "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency"
717e2c8b40eSIan Rogers    },
718e2c8b40eSIan Rogers    {
719e2c8b40eSIan Rogers        "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation",
720e2c8b40eSIan Rogers        "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))",
721*4c10b96fSIan Rogers        "MetricGroup": "BvCB;Cor;tma_issueComp",
722e2c8b40eSIan Rogers        "MetricName": "tma_info_bottleneck_compute_bound_est",
723e2c8b40eSIan Rogers        "MetricThreshold": "tma_info_bottleneck_compute_bound_est > 20",
724e2c8b40eSIan Rogers        "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy."
725100ee7c3SIan Rogers    },
726100ee7c3SIan Rogers    {
727*4c10b96fSIan Rogers        "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end)",
728100ee7c3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
729e2c8b40eSIan Rogers        "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code",
730*4c10b96fSIan Rogers        "MetricGroup": "BvFB;Fed;FetchBW;Frontend",
731b522c8afSIan Rogers        "MetricName": "tma_info_bottleneck_instruction_fetch_bw",
732b522c8afSIan Rogers        "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20"
733100ee7c3SIan Rogers    },
734100ee7c3SIan Rogers    {
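735e2c8b40eSIan Rogers        "BriefDescription": "Total pipeline cost of irregular execution (e.g. FP-assists in HPC, Wait time with work imbalance multithreaded workloads, overhead in system services or virtualized environments)",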
736e2c8b40eSIan Rogers        "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
737*4c10b96fSIan Rogers        "MetricGroup": "Bad;BvIO;Cor;Ret;tma_issueMS",
738e2c8b40eSIan Rogers        "MetricName": "tma_info_bottleneck_irregular_overhead",
739e2c8b40eSIan Rogers        "MetricThreshold": "tma_info_bottleneck_irregular_overhead > 10",
740e2c8b40eSIan Rogers        "PublicDescription": "Total pipeline cost of irregular execution (e.g. FP-assists in HPC, Wait time with work imbalance multithreaded workloads, overhead in system services or virtualized environments). Related metrics: tma_microcode_sequencer, tma_ms_switches"
741100ee7c3SIan Rogers    },
742100ee7c3SIan Rogers    {
743100ee7c3SIan Rogers        "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)",
744100ee7c3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
745*4c10b96fSIan Rogers        "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_hit_latency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))",
746*4c10b96fSIan Rogers        "MetricGroup": "BvMT;Mem;MemoryTLB;Offcore;tma_issueTLB",
747b522c8afSIan Rogers        "MetricName": "tma_info_bottleneck_memory_data_tlbs",
748b522c8afSIan Rogers        "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20",
749e2c8b40eSIan Rogers        "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_synchronization"
750100ee7c3SIan Rogers    },
751100ee7c3SIan Rogers    {
752e2c8b40eSIan Rogers        "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)",
753e2c8b40eSIan Rogers        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) * tma_remote_cache / (tma_local_mem + tma_remote_cache + tma_remote_mem) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))",
754*4c10b96fSIan Rogers        "MetricGroup": "BvMS;Mem;Offcore;tma_issueTLB",
755e2c8b40eSIan Rogers        "MetricName": "tma_info_bottleneck_memory_synchronization",
756e2c8b40eSIan Rogers        "MetricThreshold": "tma_info_bottleneck_memory_synchronization > 10",
757e2c8b40eSIan Rogers        "PublicDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs"
758100ee7c3SIan Rogers    },
759100ee7c3SIan Rogers    {
760100ee7c3SIan Rogers        "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks",
761100ee7c3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
762e2c8b40eSIan Rogers        "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
763*4c10b96fSIan Rogers        "MetricGroup": "Bad;BadSpec;BrMispredicts;BvMP;tma_issueBM",
764b522c8afSIan Rogers        "MetricName": "tma_info_bottleneck_mispredictions",
765b522c8afSIan Rogers        "MetricThreshold": "tma_info_bottleneck_mispredictions > 20",
766b522c8afSIan Rogers        "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers"
767b522c8afSIan Rogers    },
768b522c8afSIan Rogers    {
769*4c10b96fSIan Rogers        "BriefDescription": "Total pipeline cost of remaining bottlenecks in the back-end",
770*4c10b96fSIan Rogers        "MetricExpr": "100 - (tma_info_bottleneck_big_code + tma_info_bottleneck_instruction_fetch_bw + tma_info_bottleneck_mispredictions + tma_info_bottleneck_cache_memory_bandwidth + tma_info_bottleneck_cache_memory_latency + tma_info_bottleneck_memory_data_tlbs + tma_info_bottleneck_memory_synchronization + tma_info_bottleneck_compute_bound_est + tma_info_bottleneck_irregular_overhead + tma_info_bottleneck_branching_overhead + tma_info_bottleneck_useful_work)",
771*4c10b96fSIan Rogers        "MetricGroup": "BvOB;Cor;Offcore",
772e2c8b40eSIan Rogers        "MetricName": "tma_info_bottleneck_other_bottlenecks",
773e2c8b40eSIan Rogers        "MetricThreshold": "tma_info_bottleneck_other_bottlenecks > 20",
774*4c10b96fSIan Rogers        "PublicDescription": "Total pipeline cost of remaining bottlenecks in the back-end. Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls."
775*4c10b96fSIan Rogers    },
776*4c10b96fSIan Rogers    {
777*4c10b96fSIan Rogers        "BriefDescription": "Total pipeline cost of \"useful operations\" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.",
778*4c10b96fSIan Rogers        "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + 2 * BR_INST_RETIRED.NEAR_CALL + INST_RETIRED.NOP) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
779*4c10b96fSIan Rogers        "MetricGroup": "BvUW;Ret",
780*4c10b96fSIan Rogers        "MetricName": "tma_info_bottleneck_useful_work",
781*4c10b96fSIan Rogers        "MetricThreshold": "tma_info_bottleneck_useful_work > 20"
782e2c8b40eSIan Rogers    },
783e2c8b40eSIan Rogers    {
784b522c8afSIan Rogers        "BriefDescription": "Fraction of branches that are CALL or RET",
785b522c8afSIan Rogers        "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES",
786b522c8afSIan Rogers        "MetricGroup": "Bad;Branches",
787b522c8afSIan Rogers        "MetricName": "tma_info_branches_callret"
788b522c8afSIan Rogers    },
789b522c8afSIan Rogers    {
790b522c8afSIan Rogers        "BriefDescription": "Fraction of branches that are non-taken conditionals",
791b522c8afSIan Rogers        "MetricExpr": "BR_INST_RETIRED.NOT_TAKEN / BR_INST_RETIRED.ALL_BRANCHES",
792b522c8afSIan Rogers        "MetricGroup": "Bad;Branches;CodeGen;PGO",
793b522c8afSIan Rogers        "MetricName": "tma_info_branches_cond_nt"
794b522c8afSIan Rogers    },
795b522c8afSIan Rogers    {
796b522c8afSIan Rogers        "BriefDescription": "Fraction of branches that are taken conditionals",
797b522c8afSIan Rogers        "MetricExpr": "(BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) / BR_INST_RETIRED.ALL_BRANCHES",
798b522c8afSIan Rogers        "MetricGroup": "Bad;Branches;CodeGen;PGO",
799b522c8afSIan Rogers        "MetricName": "tma_info_branches_cond_tk"
800b522c8afSIan Rogers    },
801b522c8afSIan Rogers    {
802b522c8afSIan Rogers        "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps",
803b522c8afSIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
804e2c8b40eSIan Rogers        "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.COND - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES",
805b522c8afSIan Rogers        "MetricGroup": "Bad;Branches",
806b522c8afSIan Rogers        "MetricName": "tma_info_branches_jump"
807b522c8afSIan Rogers    },
808b522c8afSIan Rogers    {
809b522c8afSIan Rogers        "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core",
810b522c8afSIan Rogers        "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_thread_clks))",
811b522c8afSIan Rogers        "MetricGroup": "SMT",
812b522c8afSIan Rogers        "MetricName": "tma_info_core_core_clks"
813b522c8afSIan Rogers    },
814b522c8afSIan Rogers    {
815b522c8afSIan Rogers        "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)",
816b522c8afSIan Rogers        "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks",
817b522c8afSIan Rogers        "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group",
818b522c8afSIan Rogers        "MetricName": "tma_info_core_coreipc"
819b522c8afSIan Rogers    },
820b522c8afSIan Rogers    {
821e2c8b40eSIan Rogers        "BriefDescription": "uops Executed per Cycle",
822e2c8b40eSIan Rogers        "MetricExpr": "UOPS_EXECUTED.THREAD / tma_info_thread_clks",
823e2c8b40eSIan Rogers        "MetricGroup": "Power",
824e2c8b40eSIan Rogers        "MetricName": "tma_info_core_epc"
825e2c8b40eSIan Rogers    },
826e2c8b40eSIan Rogers    {
827b522c8afSIan Rogers        "BriefDescription": "Floating Point Operations Per Cycle",
828b522c8afSIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
829e2c8b40eSIan Rogers        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks",
830b522c8afSIan Rogers        "MetricGroup": "Flops;Ret",
831b522c8afSIan Rogers        "MetricName": "tma_info_core_flopc"
832b522c8afSIan Rogers    },
833b522c8afSIan Rogers    {
834b522c8afSIan Rogers        "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)",
835*4c10b96fSIan Rogers        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@) / (2 * tma_info_core_core_clks)",
836b522c8afSIan Rogers        "MetricGroup": "Cor;Flops;HPC",
837b522c8afSIan Rogers        "MetricName": "tma_info_core_fp_arith_utilization",
838b522c8afSIan Rogers        "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)."
839b522c8afSIan Rogers    },
840b522c8afSIan Rogers    {
841e2c8b40eSIan Rogers        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)",
842e2c8b40eSIan Rogers        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
843b522c8afSIan Rogers        "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
844b522c8afSIan Rogers        "MetricName": "tma_info_core_ilp"
845b522c8afSIan Rogers    },
846b522c8afSIan Rogers    {
847b522c8afSIan Rogers        "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)",
848b522c8afSIan Rogers        "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)",
849b522c8afSIan Rogers        "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB",
850b522c8afSIan Rogers        "MetricName": "tma_info_frontend_dsb_coverage",
851b522c8afSIan Rogers        "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 4 > 0.35",
852*4c10b96fSIan Rogers        "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_inst_mix_iptb, tma_lcp"
853b522c8afSIan Rogers    },
854b522c8afSIan Rogers    {
855b522c8afSIan Rogers        "BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.",
856b522c8afSIan Rogers        "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / DSB2MITE_SWITCHES.COUNT",
857b522c8afSIan Rogers        "MetricGroup": "DSBmiss",
858b522c8afSIan Rogers        "MetricName": "tma_info_frontend_dsb_switch_cost"
859b522c8afSIan Rogers    },
860b522c8afSIan Rogers    {
861b522c8afSIan Rogers        "BriefDescription": "Average number of Uops issued by front-end when it issued something",
862b522c8afSIan Rogers        "MetricExpr": "UOPS_ISSUED.ANY / cpu@UOPS_ISSUED.ANY\\,cmask\\=1@",
863b522c8afSIan Rogers        "MetricGroup": "Fed;FetchBW",
864b522c8afSIan Rogers        "MetricName": "tma_info_frontend_fetch_upc"
865b522c8afSIan Rogers    },
866b522c8afSIan Rogers    {
867b522c8afSIan Rogers        "BriefDescription": "Average Latency for L1 instruction cache misses",
868b522c8afSIan Rogers        "MetricExpr": "ICACHE_16B.IFDATA_STALL / cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@ + 2",
869b522c8afSIan Rogers        "MetricGroup": "Fed;FetchLat;IcMiss",
870b522c8afSIan Rogers        "MetricName": "tma_info_frontend_icache_miss_latency"
871b522c8afSIan Rogers    },
872b522c8afSIan Rogers    {
873b522c8afSIan Rogers        "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)",
874b522c8afSIan Rogers        "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS",
875b522c8afSIan Rogers        "MetricGroup": "DSBmiss;Fed",
876b522c8afSIan Rogers        "MetricName": "tma_info_frontend_ipdsb_miss_ret",
877b522c8afSIan Rogers        "MetricThreshold": "tma_info_frontend_ipdsb_miss_ret < 50"
878b522c8afSIan Rogers    },
879b522c8afSIan Rogers    {
880b522c8afSIan Rogers        "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)",
881b522c8afSIan Rogers        "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY",
882b522c8afSIan Rogers        "MetricGroup": "Fed",
883b522c8afSIan Rogers        "MetricName": "tma_info_frontend_ipunknown_branch"
884b522c8afSIan Rogers    },
885b522c8afSIan Rogers    {
886b522c8afSIan Rogers        "BriefDescription": "L2 cache true code cacheline misses per kilo instruction",
887b522c8afSIan Rogers        "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY",
888b522c8afSIan Rogers        "MetricGroup": "IcMiss",
889b522c8afSIan Rogers        "MetricName": "tma_info_frontend_l2mpki_code"
890b522c8afSIan Rogers    },
891b522c8afSIan Rogers    {
892b522c8afSIan Rogers        "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction",
893b522c8afSIan Rogers        "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY",
894b522c8afSIan Rogers        "MetricGroup": "IcMiss",
895b522c8afSIan Rogers        "MetricName": "tma_info_frontend_l2mpki_code_all"
896b522c8afSIan Rogers    },
897b522c8afSIan Rogers    {
898b522c8afSIan Rogers        "BriefDescription": "Branch instructions per taken branch.",
899b522c8afSIan Rogers        "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN",
900b522c8afSIan Rogers        "MetricGroup": "Branches;Fed;PGO",
901b522c8afSIan Rogers        "MetricName": "tma_info_inst_mix_bptkbranch"
902b522c8afSIan Rogers    },
903b522c8afSIan Rogers    {
904b522c8afSIan Rogers        "BriefDescription": "Total number of retired Instructions",
905b522c8afSIan Rogers        "MetricExpr": "INST_RETIRED.ANY",
906b522c8afSIan Rogers        "MetricGroup": "Summary;TmaL1;tma_L1_group",
907b522c8afSIan Rogers        "MetricName": "tma_info_inst_mix_instructions",
908b522c8afSIan Rogers        "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST"
909b522c8afSIan Rogers    },
910b522c8afSIan Rogers    {
911b522c8afSIan Rogers        "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)",
912b522c8afSIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
913*4c10b96fSIan Rogers        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@)",
914b522c8afSIan Rogers        "MetricGroup": "Flops;InsType",
915b522c8afSIan Rogers        "MetricName": "tma_info_inst_mix_iparith",
916b522c8afSIan Rogers        "MetricThreshold": "tma_info_inst_mix_iparith < 10",
917e2c8b40eSIan Rogers        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW."
918b522c8afSIan Rogers    },
919b522c8afSIan Rogers    {
920b522c8afSIan Rogers        "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)",
921b522c8afSIan Rogers        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)",
922b522c8afSIan Rogers        "MetricGroup": "Flops;FpVector;InsType",
923b522c8afSIan Rogers        "MetricName": "tma_info_inst_mix_iparith_avx128",
924b522c8afSIan Rogers        "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10",
925e2c8b40eSIan Rogers        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
926b522c8afSIan Rogers    },
927b522c8afSIan Rogers    {
928b522c8afSIan Rogers        "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)",
929b522c8afSIan Rogers        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)",
930b522c8afSIan Rogers        "MetricGroup": "Flops;FpVector;InsType",
931b522c8afSIan Rogers        "MetricName": "tma_info_inst_mix_iparith_avx256",
932b522c8afSIan Rogers        "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10",
933e2c8b40eSIan Rogers        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
934b522c8afSIan Rogers    },
935b522c8afSIan Rogers    {
936b522c8afSIan Rogers        "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)",
937b522c8afSIan Rogers        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)",
938b522c8afSIan Rogers        "MetricGroup": "Flops;FpVector;InsType",
939b522c8afSIan Rogers        "MetricName": "tma_info_inst_mix_iparith_avx512",
940b522c8afSIan Rogers        "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10",
941e2c8b40eSIan Rogers        "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
942b522c8afSIan Rogers    },
943b522c8afSIan Rogers    {
944b522c8afSIan Rogers        "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)",
945b522c8afSIan Rogers        "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE",
946b522c8afSIan Rogers        "MetricGroup": "Flops;FpScalar;InsType",
947b522c8afSIan Rogers        "MetricName": "tma_info_inst_mix_iparith_scalar_dp",
948b522c8afSIan Rogers        "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10",
949e2c8b40eSIan Rogers        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
950b522c8afSIan Rogers    },
951b522c8afSIan Rogers    {
952b522c8afSIan Rogers        "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)",
953b522c8afSIan Rogers        "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE",
954b522c8afSIan Rogers        "MetricGroup": "Flops;FpScalar;InsType",
955b522c8afSIan Rogers        "MetricName": "tma_info_inst_mix_iparith_scalar_sp",
956b522c8afSIan Rogers        "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10",
957e2c8b40eSIan Rogers        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
958b522c8afSIan Rogers    },
959b522c8afSIan Rogers    {
960b522c8afSIan Rogers        "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)",
961b522c8afSIan Rogers        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES",
962b522c8afSIan Rogers        "MetricGroup": "Branches;Fed;InsType",
963b522c8afSIan Rogers        "MetricName": "tma_info_inst_mix_ipbranch",
964b522c8afSIan Rogers        "MetricThreshold": "tma_info_inst_mix_ipbranch < 8"
965b522c8afSIan Rogers    },
966b522c8afSIan Rogers    {
967b522c8afSIan Rogers        "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)",
968b522c8afSIan Rogers        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL",
969b522c8afSIan Rogers        "MetricGroup": "Branches;Fed;PGO",
970b522c8afSIan Rogers        "MetricName": "tma_info_inst_mix_ipcall",
971b522c8afSIan Rogers        "MetricThreshold": "tma_info_inst_mix_ipcall < 200"
972b522c8afSIan Rogers    },
973b522c8afSIan Rogers    {
974b522c8afSIan Rogers        "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)",
975b522c8afSIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
976e2c8b40eSIan Rogers        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)",
977b522c8afSIan Rogers        "MetricGroup": "Flops;InsType",
978b522c8afSIan Rogers        "MetricName": "tma_info_inst_mix_ipflop",
979b522c8afSIan Rogers        "MetricThreshold": "tma_info_inst_mix_ipflop < 10"
980b522c8afSIan Rogers    },
981b522c8afSIan Rogers    {
982b522c8afSIan Rogers        "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)",
983b522c8afSIan Rogers        "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_LOADS",
984b522c8afSIan Rogers        "MetricGroup": "InsType",
985b522c8afSIan Rogers        "MetricName": "tma_info_inst_mix_ipload",
986b522c8afSIan Rogers        "MetricThreshold": "tma_info_inst_mix_ipload < 3"
987b522c8afSIan Rogers    },
988b522c8afSIan Rogers    {
989b522c8afSIan Rogers        "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)",
990b522c8afSIan Rogers        "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES",
991b522c8afSIan Rogers        "MetricGroup": "InsType",
992b522c8afSIan Rogers        "MetricName": "tma_info_inst_mix_ipstore",
993b522c8afSIan Rogers        "MetricThreshold": "tma_info_inst_mix_ipstore < 8"
994b522c8afSIan Rogers    },
995b522c8afSIan Rogers    {
996b522c8afSIan Rogers        "BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)",
997b522c8afSIan Rogers        "MetricExpr": "INST_RETIRED.ANY / cpu@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@",
998b522c8afSIan Rogers        "MetricGroup": "Prefetches",
999b522c8afSIan Rogers        "MetricName": "tma_info_inst_mix_ipswpf",
1000b522c8afSIan Rogers        "MetricThreshold": "tma_info_inst_mix_ipswpf < 100"
1001b522c8afSIan Rogers    },
1002b522c8afSIan Rogers    {
1003*4c10b96fSIan Rogers        "BriefDescription": "Instructions per taken branch",
1004b522c8afSIan Rogers        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN",
1005b522c8afSIan Rogers        "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB",
1006b522c8afSIan Rogers        "MetricName": "tma_info_inst_mix_iptb",
1007b522c8afSIan Rogers        "MetricThreshold": "tma_info_inst_mix_iptb < 9",
1008*4c10b96fSIan Rogers        "PublicDescription": "Instructions per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp"
1009e2c8b40eSIan Rogers    },
1010e2c8b40eSIan Rogers    {
1011b522c8afSIan Rogers        "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
1012e2c8b40eSIan Rogers        "MetricExpr": "tma_info_memory_l1d_cache_fill_bw",
1013b522c8afSIan Rogers        "MetricGroup": "Mem;MemoryBW",
1014e2c8b40eSIan Rogers        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t"
1015b522c8afSIan Rogers    },
1016b522c8afSIan Rogers    {
1017b522c8afSIan Rogers        "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
1018e2c8b40eSIan Rogers        "MetricExpr": "tma_info_memory_l2_cache_fill_bw",
1019b522c8afSIan Rogers        "MetricGroup": "Mem;MemoryBW",
1020e2c8b40eSIan Rogers        "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t"
1021b522c8afSIan Rogers    },
1022b522c8afSIan Rogers    {
1023b522c8afSIan Rogers        "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction",
1024b522c8afSIan Rogers        "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / tma_info_inst_mix_instructions",
1025b522c8afSIan Rogers        "MetricGroup": "L2Evicts;Mem;Server",
1026b522c8afSIan Rogers        "MetricName": "tma_info_memory_core_l2_evictions_nonsilent_pki"
1027b522c8afSIan Rogers    },
1028b522c8afSIan Rogers    {
1029b522c8afSIan Rogers        "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)",
1030b522c8afSIan Rogers        "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / tma_info_inst_mix_instructions",
1031b522c8afSIan Rogers        "MetricGroup": "L2Evicts;Mem;Server",
1032b522c8afSIan Rogers        "MetricName": "tma_info_memory_core_l2_evictions_silent_pki"
1033b522c8afSIan Rogers    },
1034b522c8afSIan Rogers    {
1035b522c8afSIan Rogers        "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]",
1036e2c8b40eSIan Rogers        "MetricExpr": "tma_info_memory_l3_cache_access_bw",
1037b522c8afSIan Rogers        "MetricGroup": "Mem;MemoryBW;Offcore",
1038e2c8b40eSIan Rogers        "MetricName": "tma_info_memory_core_l3_cache_access_bw_2t"
1039b522c8afSIan Rogers    },
1040b522c8afSIan Rogers    {
1041b522c8afSIan Rogers        "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
1042e2c8b40eSIan Rogers        "MetricExpr": "tma_info_memory_l3_cache_fill_bw",
1043b522c8afSIan Rogers        "MetricGroup": "Mem;MemoryBW",
1044e2c8b40eSIan Rogers        "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t"
1045e2c8b40eSIan Rogers    },
1046e2c8b40eSIan Rogers    {
1047b522c8afSIan Rogers        "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)",
1048b522c8afSIan Rogers        "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY",
1049e2c8b40eSIan Rogers        "MetricGroup": "CacheHits;Mem",
1050b522c8afSIan Rogers        "MetricName": "tma_info_memory_fb_hpki"
1051b522c8afSIan Rogers    },
1052b522c8afSIan Rogers    {
1053*4c10b96fSIan Rogers        "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
1054e2c8b40eSIan Rogers        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
1055e2c8b40eSIan Rogers        "MetricGroup": "Mem;MemoryBW",
1056e2c8b40eSIan Rogers        "MetricName": "tma_info_memory_l1d_cache_fill_bw"
1057e2c8b40eSIan Rogers    },
1058e2c8b40eSIan Rogers    {
1059b522c8afSIan Rogers        "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
1060b522c8afSIan Rogers        "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY",
1061e2c8b40eSIan Rogers        "MetricGroup": "CacheHits;Mem",
1062b522c8afSIan Rogers        "MetricName": "tma_info_memory_l1mpki"
1063b522c8afSIan Rogers    },
1064b522c8afSIan Rogers    {
1065b522c8afSIan Rogers        "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)",
1066b522c8afSIan Rogers        "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY",
1067e2c8b40eSIan Rogers        "MetricGroup": "CacheHits;Mem",
1068b522c8afSIan Rogers        "MetricName": "tma_info_memory_l1mpki_load"
1069b522c8afSIan Rogers    },
1070b522c8afSIan Rogers    {
1071*4c10b96fSIan Rogers        "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
1072e2c8b40eSIan Rogers        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
1073e2c8b40eSIan Rogers        "MetricGroup": "Mem;MemoryBW",
1074e2c8b40eSIan Rogers        "MetricName": "tma_info_memory_l2_cache_fill_bw"
1075e2c8b40eSIan Rogers    },
1076e2c8b40eSIan Rogers    {
1077b522c8afSIan Rogers        "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
1078b522c8afSIan Rogers        "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY",
1079e2c8b40eSIan Rogers        "MetricGroup": "CacheHits;Mem",
1080b522c8afSIan Rogers        "MetricName": "tma_info_memory_l2hpki_all"
1081b522c8afSIan Rogers    },
1082b522c8afSIan Rogers    {
1083b522c8afSIan Rogers        "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)",
1084b522c8afSIan Rogers        "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY",
1085e2c8b40eSIan Rogers        "MetricGroup": "CacheHits;Mem",
1086b522c8afSIan Rogers        "MetricName": "tma_info_memory_l2hpki_load"
1087b522c8afSIan Rogers    },
1088b522c8afSIan Rogers    {
1089b522c8afSIan Rogers        "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
1090b522c8afSIan Rogers        "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY",
1091e2c8b40eSIan Rogers        "MetricGroup": "Backend;CacheHits;Mem",
1092b522c8afSIan Rogers        "MetricName": "tma_info_memory_l2mpki"
1093b522c8afSIan Rogers    },
1094b522c8afSIan Rogers    {
1095b522c8afSIan Rogers        "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
1096b522c8afSIan Rogers        "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY",
1097e2c8b40eSIan Rogers        "MetricGroup": "CacheHits;Mem;Offcore",
1098b522c8afSIan Rogers        "MetricName": "tma_info_memory_l2mpki_all"
1099b522c8afSIan Rogers    },
1100b522c8afSIan Rogers    {
1101b522c8afSIan Rogers        "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)",
1102b522c8afSIan Rogers        "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY",
1103e2c8b40eSIan Rogers        "MetricGroup": "CacheHits;Mem",
1104b522c8afSIan Rogers        "MetricName": "tma_info_memory_l2mpki_load"
1105b522c8afSIan Rogers    },
1106b522c8afSIan Rogers    {
1107*4c10b96fSIan Rogers        "BriefDescription": "Offcore requests (L2 cache miss) per kilo instruction for demand RFOs",
1108*4c10b96fSIan Rogers        "MetricExpr": "1e3 * OFFCORE_REQUESTS.DEMAND_RFO / INST_RETIRED.ANY",
1109*4c10b96fSIan Rogers        "MetricGroup": "CacheMisses;Offcore",
1110*4c10b96fSIan Rogers        "MetricName": "tma_info_memory_l2mpki_rfo"
1111*4c10b96fSIan Rogers    },
1112*4c10b96fSIan Rogers    {
1113*4c10b96fSIan Rogers        "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
1114e2c8b40eSIan Rogers        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time",
1115e2c8b40eSIan Rogers        "MetricGroup": "Mem;MemoryBW;Offcore",
1116e2c8b40eSIan Rogers        "MetricName": "tma_info_memory_l3_cache_access_bw"
1117e2c8b40eSIan Rogers    },
1118e2c8b40eSIan Rogers    {
1119*4c10b96fSIan Rogers        "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
1120e2c8b40eSIan Rogers        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
1121e2c8b40eSIan Rogers        "MetricGroup": "Mem;MemoryBW",
1122e2c8b40eSIan Rogers        "MetricName": "tma_info_memory_l3_cache_fill_bw"
1123e2c8b40eSIan Rogers    },
1124e2c8b40eSIan Rogers    {
1125b522c8afSIan Rogers        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
1126b522c8afSIan Rogers        "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY",
1127e2c8b40eSIan Rogers        "MetricGroup": "Mem",
1128b522c8afSIan Rogers        "MetricName": "tma_info_memory_l3mpki"
1129b522c8afSIan Rogers    },
1130b522c8afSIan Rogers    {
1131e2c8b40eSIan Rogers        "BriefDescription": "Average Parallel L2 cache miss data reads",
1132e2c8b40eSIan Rogers        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
1133e2c8b40eSIan Rogers        "MetricGroup": "Memory_BW;Offcore",
1134e2c8b40eSIan Rogers        "MetricName": "tma_info_memory_latency_data_l2_mlp"
1135e2c8b40eSIan Rogers    },
1136e2c8b40eSIan Rogers    {
1137e2c8b40eSIan Rogers        "BriefDescription": "Average Latency for L2 cache miss demand Loads",
1138*4c10b96fSIan Rogers        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
1139e2c8b40eSIan Rogers        "MetricGroup": "Memory_Lat;Offcore",
1140e2c8b40eSIan Rogers        "MetricName": "tma_info_memory_latency_load_l2_miss_latency"
1141e2c8b40eSIan Rogers    },
1142e2c8b40eSIan Rogers    {
1143e2c8b40eSIan Rogers        "BriefDescription": "Average Parallel L2 cache miss demand Loads",
1144e2c8b40eSIan Rogers        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD",
1145af34a16dSIan Rogers        "MetricGroup": "Memory_BW;Offcore",
1146*4c10b96fSIan Rogers        "MetricName": "tma_info_memory_latency_load_l2_mlp"
1147e2c8b40eSIan Rogers    },
1148e2c8b40eSIan Rogers    {
1149b522c8afSIan Rogers        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
1150b522c8afSIan Rogers        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)",
1151b522c8afSIan Rogers        "MetricGroup": "Mem;MemoryBound;MemoryLat",
1152b522c8afSIan Rogers        "MetricName": "tma_info_memory_load_miss_real_latency"
1153100ee7c3SIan Rogers    },
1154100ee7c3SIan Rogers    {
1155e2c8b40eSIan Rogers        "BriefDescription": "Un-cacheable retired load per kilo instruction",
1156*4c10b96fSIan Rogers        "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY",
1157e2c8b40eSIan Rogers        "MetricGroup": "Mem",
1158e2c8b40eSIan Rogers        "MetricName": "tma_info_memory_mix_uc_load_pki"
1159e2c8b40eSIan Rogers    },
1160e2c8b40eSIan Rogers    {
1161100ee7c3SIan Rogers        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss)",
1162100ee7c3SIan Rogers        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
1163100ee7c3SIan Rogers        "MetricGroup": "Mem;MemoryBW;MemoryBound",
1164b522c8afSIan Rogers        "MetricName": "tma_info_memory_mlp",
1165100ee7c3SIan Rogers        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
1166100ee7c3SIan Rogers    },
1167100ee7c3SIan Rogers    {
1168b522c8afSIan Rogers        "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
1169b522c8afSIan Rogers        "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY",
1170b522c8afSIan Rogers        "MetricGroup": "Fed;MemoryTLB",
1171b522c8afSIan Rogers        "MetricName": "tma_info_memory_tlb_code_stlb_mpki"
1172b522c8afSIan Rogers    },
1173b522c8afSIan Rogers    {
1174b522c8afSIan Rogers        "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
1175b522c8afSIan Rogers        "MetricExpr": "1e3 * DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY",
1176b522c8afSIan Rogers        "MetricGroup": "Mem;MemoryTLB",
1177b522c8afSIan Rogers        "MetricName": "tma_info_memory_tlb_load_stlb_mpki"
1178b522c8afSIan Rogers    },
1179b522c8afSIan Rogers    {
1180100ee7c3SIan Rogers        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
1181100ee7c3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS_NMI",
1182b522c8afSIan Rogers        "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING) / (2 * tma_info_core_core_clks)",
1183100ee7c3SIan Rogers        "MetricGroup": "Mem;MemoryTLB",
1184b522c8afSIan Rogers        "MetricName": "tma_info_memory_tlb_page_walks_utilization",
1185b522c8afSIan Rogers        "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5"
1186100ee7c3SIan Rogers    },
1187100ee7c3SIan Rogers    {
1188100ee7c3SIan Rogers        "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
1189100ee7c3SIan Rogers        "MetricExpr": "1e3 * DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY",
1190100ee7c3SIan Rogers        "MetricGroup": "Mem;MemoryTLB",
1191b522c8afSIan Rogers        "MetricName": "tma_info_memory_tlb_store_stlb_mpki"
1192b522c8afSIan Rogers    },
1193b522c8afSIan Rogers    {
1194*4c10b96fSIan Rogers        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per core",
1195e2c8b40eSIan Rogers        "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)",
1196b522c8afSIan Rogers        "MetricGroup": "Cor;Pipeline;PortsUtil;SMT",
1197b522c8afSIan Rogers        "MetricName": "tma_info_pipeline_execute"
1198b522c8afSIan Rogers    },
1199b522c8afSIan Rogers    {
1200*4c10b96fSIan Rogers        "BriefDescription": "Average number of uops fetched from DSB per cycle",
1201*4c10b96fSIan Rogers        "MetricExpr": "IDQ.DSB_UOPS / IDQ.DSB_CYCLES_ANY",
1202*4c10b96fSIan Rogers        "MetricGroup": "Fed;FetchBW",
1203*4c10b96fSIan Rogers        "MetricName": "tma_info_pipeline_fetch_dsb"
1204*4c10b96fSIan Rogers    },
1205*4c10b96fSIan Rogers    {
1206*4c10b96fSIan Rogers        "BriefDescription": "Average number of uops fetched from MITE per cycle",
1207*4c10b96fSIan Rogers        "MetricExpr": "IDQ.MITE_UOPS / IDQ.MITE_CYCLES",
1208*4c10b96fSIan Rogers        "MetricGroup": "Fed;FetchBW",
1209*4c10b96fSIan Rogers        "MetricName": "tma_info_pipeline_fetch_mite"
1210*4c10b96fSIan Rogers    },
1211*4c10b96fSIan Rogers    {
1212e2c8b40eSIan Rogers        "BriefDescription": "Instructions per a microcode Assist invocation",
1213e2c8b40eSIan Rogers        "MetricExpr": "INST_RETIRED.ANY / (FP_ASSIST.ANY + OTHER_ASSISTS.ANY)",
1214e2c8b40eSIan Rogers        "MetricGroup": "MicroSeq;Pipeline;Ret;Retire",
1215e2c8b40eSIan Rogers        "MetricName": "tma_info_pipeline_ipassist",
1216e2c8b40eSIan Rogers        "MetricThreshold": "tma_info_pipeline_ipassist < 100e3",
1217e2c8b40eSIan Rogers        "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)"
1218e2c8b40eSIan Rogers    },
1219e2c8b40eSIan Rogers    {
1220b522c8afSIan Rogers        "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.",
1221b522c8afSIan Rogers        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@",
1222b522c8afSIan Rogers        "MetricGroup": "Pipeline;Ret",
1223b522c8afSIan Rogers        "MetricName": "tma_info_pipeline_retire"
1224b522c8afSIan Rogers    },
1225b522c8afSIan Rogers    {
1226e2c8b40eSIan Rogers        "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]",
1227b522c8afSIan Rogers        "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
1228b522c8afSIan Rogers        "MetricGroup": "Power;Summary",
1229e2c8b40eSIan Rogers        "MetricName": "tma_info_system_core_frequency"
1230b522c8afSIan Rogers    },
1231b522c8afSIan Rogers    {
1232e2c8b40eSIan Rogers        "BriefDescription": "Average CPU Utilization (percentage)",
1233*4c10b96fSIan Rogers        "MetricExpr": "tma_info_system_cpus_utilized / #num_cpus_online",
1234b522c8afSIan Rogers        "MetricGroup": "HPC;Summary",
1235b522c8afSIan Rogers        "MetricName": "tma_info_system_cpu_utilization"
1236b522c8afSIan Rogers    },
1237b522c8afSIan Rogers    {
1238e2c8b40eSIan Rogers        "BriefDescription": "Average number of utilized CPUs",
1239*4c10b96fSIan Rogers        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
1240e2c8b40eSIan Rogers        "MetricGroup": "Summary",
1241e2c8b40eSIan Rogers        "MetricName": "tma_info_system_cpus_utilized"
1242e2c8b40eSIan Rogers    },
1243e2c8b40eSIan Rogers    {
1244b522c8afSIan Rogers        "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
1245b522c8afSIan Rogers        "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time",
1246e2c8b40eSIan Rogers        "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW",
1247b522c8afSIan Rogers        "MetricName": "tma_info_system_dram_bw_use",
1248e2c8b40eSIan Rogers        "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_mem_bandwidth, tma_sq_full"
1249b522c8afSIan Rogers    },
1250b522c8afSIan Rogers    {
1251b522c8afSIan Rogers        "BriefDescription": "Giga Floating Point Operations Per Second",
1252b522c8afSIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
1253e2c8b40eSIan Rogers        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time",
1254b522c8afSIan Rogers        "MetricGroup": "Cor;Flops;HPC",
1255b522c8afSIan Rogers        "MetricName": "tma_info_system_gflops",
1256e2c8b40eSIan Rogers        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width"
1257b522c8afSIan Rogers    },
1258b522c8afSIan Rogers    {
1259b522c8afSIan Rogers        "BriefDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]",
1260b522c8afSIan Rogers        "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3) * 4 / 1e9 / duration_time",
1261e2c8b40eSIan Rogers        "MetricGroup": "IoBW;MemOffcore;Server;SoC",
1262e2c8b40eSIan Rogers        "MetricName": "tma_info_system_io_read_bw",
1263e2c8b40eSIan Rogers        "PublicDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]. Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU"
1264b522c8afSIan Rogers    },
1265b522c8afSIan Rogers    {
1266b522c8afSIan Rogers        "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]",
1267b522c8afSIan Rogers        "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3) * 4 / 1e9 / duration_time",
1268e2c8b40eSIan Rogers        "MetricGroup": "IoBW;MemOffcore;Server;SoC",
1269e2c8b40eSIan Rogers        "MetricName": "tma_info_system_io_write_bw",
1270e2c8b40eSIan Rogers        "PublicDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]. Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU"
1271b522c8afSIan Rogers    },
1272b522c8afSIan Rogers    {
1273b522c8afSIan Rogers        "BriefDescription": "Instructions per Far Branch (Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
1274b522c8afSIan Rogers        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u",
1275b522c8afSIan Rogers        "MetricGroup": "Branches;OS",
1276b522c8afSIan Rogers        "MetricName": "tma_info_system_ipfarbranch",
1277b522c8afSIan Rogers        "MetricThreshold": "tma_info_system_ipfarbranch < 1e6"
1278b522c8afSIan Rogers    },
1279b522c8afSIan Rogers    {
1280b522c8afSIan Rogers        "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode",
1281b522c8afSIan Rogers        "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k",
1282b522c8afSIan Rogers        "MetricGroup": "OS",
1283b522c8afSIan Rogers        "MetricName": "tma_info_system_kernel_cpi"
1284b522c8afSIan Rogers    },
1285b522c8afSIan Rogers    {
1286b522c8afSIan Rogers        "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode",
1287b522c8afSIan Rogers        "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD",
1288b522c8afSIan Rogers        "MetricGroup": "OS",
1289b522c8afSIan Rogers        "MetricName": "tma_info_system_kernel_utilization",
1290b522c8afSIan Rogers        "MetricThreshold": "tma_info_system_kernel_utilization > 0.05"
1291b522c8afSIan Rogers    },
1292b522c8afSIan Rogers    {
1293b522c8afSIan Rogers        "BriefDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]",
1294b522c8afSIan Rogers        "MetricExpr": "1e9 * (UNC_M_RPQ_OCCUPANCY / UNC_M_RPQ_INSERTS) / imc_0@event\\=0x0@",
1295e2c8b40eSIan Rogers        "MetricGroup": "MemOffcore;MemoryLat;Server;SoC",
1296b522c8afSIan Rogers        "MetricName": "tma_info_system_mem_dram_read_latency",
1297b522c8afSIan Rogers        "PublicDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches"
1298b522c8afSIan Rogers    },
1299b522c8afSIan Rogers    {
1300b522c8afSIan Rogers        "BriefDescription": "Average number of parallel data read requests to external memory",
1301b522c8afSIan Rogers        "MetricExpr": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD@thresh\\=1@",
1302b522c8afSIan Rogers        "MetricGroup": "Mem;MemoryBW;SoC",
1303b522c8afSIan Rogers        "MetricName": "tma_info_system_mem_parallel_reads",
1304b522c8afSIan Rogers        "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches"
1305b522c8afSIan Rogers    },
1306b522c8afSIan Rogers    {
1307b522c8afSIan Rogers        "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)",
1308b522c8afSIan Rogers        "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (tma_info_system_socket_clks / duration_time)",
1309b522c8afSIan Rogers        "MetricGroup": "Mem;MemoryLat;SoC",
1310b522c8afSIan Rogers        "MetricName": "tma_info_system_mem_read_latency",
1311b522c8afSIan Rogers        "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)"
1312b522c8afSIan Rogers    },
1313b522c8afSIan Rogers    {
1314b522c8afSIan Rogers        "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0",
1315b522c8afSIan Rogers        "MetricExpr": "(CORE_POWER.LVL0_TURBO_LICENSE / 2 / tma_info_core_core_clks if #SMT_on else CORE_POWER.LVL0_TURBO_LICENSE / tma_info_core_core_clks)",
1316b522c8afSIan Rogers        "MetricGroup": "Power",
1317b522c8afSIan Rogers        "MetricName": "tma_info_system_power_license0_utilization",
1318b522c8afSIan Rogers        "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0.  This includes non-AVX codes, SSE, AVX 128-bit, and low-current AVX 256-bit codes."
1319b522c8afSIan Rogers    },
1320b522c8afSIan Rogers    {
1321b522c8afSIan Rogers        "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1",
1322b522c8afSIan Rogers        "MetricExpr": "(CORE_POWER.LVL1_TURBO_LICENSE / 2 / tma_info_core_core_clks if #SMT_on else CORE_POWER.LVL1_TURBO_LICENSE / tma_info_core_core_clks)",
1323b522c8afSIan Rogers        "MetricGroup": "Power",
1324b522c8afSIan Rogers        "MetricName": "tma_info_system_power_license1_utilization",
1325b522c8afSIan Rogers        "MetricThreshold": "tma_info_system_power_license1_utilization > 0.5",
1326b522c8afSIan Rogers        "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1.  This includes high current AVX 256-bit instructions as well as low current AVX 512-bit instructions."
1327b522c8afSIan Rogers    },
1328b522c8afSIan Rogers    {
1329b522c8afSIan Rogers        "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX)",
1330b522c8afSIan Rogers        "MetricExpr": "(CORE_POWER.LVL2_TURBO_LICENSE / 2 / tma_info_core_core_clks if #SMT_on else CORE_POWER.LVL2_TURBO_LICENSE / tma_info_core_core_clks)",
1331b522c8afSIan Rogers        "MetricGroup": "Power",
1332b522c8afSIan Rogers        "MetricName": "tma_info_system_power_license2_utilization",
1333b522c8afSIan Rogers        "MetricThreshold": "tma_info_system_power_license2_utilization > 0.5",
1334b522c8afSIan Rogers        "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX).  This includes high current AVX 512-bit instructions."
1335b522c8afSIan Rogers    },
1336b522c8afSIan Rogers    {
1337b522c8afSIan Rogers        "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active",
1338b522c8afSIan Rogers        "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)",
1339b522c8afSIan Rogers        "MetricGroup": "SMT",
1340b522c8afSIan Rogers        "MetricName": "tma_info_system_smt_2t_utilization"
1341b522c8afSIan Rogers    },
1342b522c8afSIan Rogers    {
1343b522c8afSIan Rogers        "BriefDescription": "Socket actual clocks when any core is active on that socket",
1344b522c8afSIan Rogers        "MetricExpr": "cha_0@event\\=0x0@",
1345b522c8afSIan Rogers        "MetricGroup": "SoC",
1346b522c8afSIan Rogers        "MetricName": "tma_info_system_socket_clks"
1347100ee7c3SIan Rogers    },
1348100ee7c3SIan Rogers    {
1349100ee7c3SIan Rogers        "BriefDescription": "Average Frequency Utilization relative to nominal frequency",
1350b522c8afSIan Rogers        "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC",
1351100ee7c3SIan Rogers        "MetricGroup": "Power",
1352b522c8afSIan Rogers        "MetricName": "tma_info_system_turbo_utilization"
1353b522c8afSIan Rogers    },
1354b522c8afSIan Rogers    {
1355e2c8b40eSIan Rogers        "BriefDescription": "Measured Average Uncore Frequency for the SoC [GHz]",
1356e2c8b40eSIan Rogers        "MetricExpr": "tma_info_system_socket_clks / 1e9 / duration_time",
1357e2c8b40eSIan Rogers        "MetricGroup": "SoC",
1358e2c8b40eSIan Rogers        "MetricName": "tma_info_system_uncore_frequency"
1359e2c8b40eSIan Rogers    },
1360e2c8b40eSIan Rogers    {
1361b522c8afSIan Rogers        "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.",
1362b522c8afSIan Rogers        "MetricExpr": "CPU_CLK_UNHALTED.THREAD",
1363b522c8afSIan Rogers        "MetricGroup": "Pipeline",
1364b522c8afSIan Rogers        "MetricName": "tma_info_thread_clks"
1365b522c8afSIan Rogers    },
1366b522c8afSIan Rogers    {
1367b522c8afSIan Rogers        "BriefDescription": "Cycles Per Instruction (per Logical Processor)",
1368b522c8afSIan Rogers        "MetricExpr": "1 / tma_info_thread_ipc",
1369b522c8afSIan Rogers        "MetricGroup": "Mem;Pipeline",
1370b522c8afSIan Rogers        "MetricName": "tma_info_thread_cpi"
1371b522c8afSIan Rogers    },
1372b522c8afSIan Rogers    {
1373b522c8afSIan Rogers        "BriefDescription": "The ratio of Executed- by Issued-Uops",
1374b522c8afSIan Rogers        "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY",
1375b522c8afSIan Rogers        "MetricGroup": "Cor;Pipeline",
1376b522c8afSIan Rogers        "MetricName": "tma_info_thread_execute_per_issue",
1377b522c8afSIan Rogers        "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggests high rate of \"execute\" at rename stage."
1378b522c8afSIan Rogers    },
1379b522c8afSIan Rogers    {
1380b522c8afSIan Rogers        "BriefDescription": "Instructions Per Cycle (per Logical Processor)",
1381b522c8afSIan Rogers        "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks",
1382b522c8afSIan Rogers        "MetricGroup": "Ret;Summary",
1383b522c8afSIan Rogers        "MetricName": "tma_info_thread_ipc"
1384b522c8afSIan Rogers    },
1385b522c8afSIan Rogers    {
1386b522c8afSIan Rogers        "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)",
1387b522c8afSIan Rogers        "MetricExpr": "4 * tma_info_core_core_clks",
1388b522c8afSIan Rogers        "MetricGroup": "TmaL1;tma_L1_group",
1389b522c8afSIan Rogers        "MetricName": "tma_info_thread_slots"
1390100ee7c3SIan Rogers    },
1391100ee7c3SIan Rogers    {
1392100ee7c3SIan Rogers        "BriefDescription": "Uops Per Instruction",
1393100ee7c3SIan Rogers        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY",
1394100ee7c3SIan Rogers        "MetricGroup": "Pipeline;Ret;Retire",
1395b522c8afSIan Rogers        "MetricName": "tma_info_thread_uoppi",
1396b522c8afSIan Rogers        "MetricThreshold": "tma_info_thread_uoppi > 1.05"
1397100ee7c3SIan Rogers    },
1398100ee7c3SIan Rogers    {
1399*4c10b96fSIan Rogers        "BriefDescription": "Uops per taken branch",
1400100ee7c3SIan Rogers        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / BR_INST_RETIRED.NEAR_TAKEN",
1401100ee7c3SIan Rogers        "MetricGroup": "Branches;Fed;FetchBW",
1402b522c8afSIan Rogers        "MetricName": "tma_info_thread_uptb",
1403b522c8afSIan Rogers        "MetricThreshold": "tma_info_thread_uptb < 6"
1404100ee7c3SIan Rogers    },
1405100ee7c3SIan Rogers    {
1406100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
1407e2c8b40eSIan Rogers        "MetricExpr": "ICACHE_TAG.STALLS / tma_info_thread_clks",
1408*4c10b96fSIan Rogers        "MetricGroup": "BigFootprint;BvBC;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
1409100ee7c3SIan Rogers        "MetricName": "tma_itlb_misses",
1410100ee7c3SIan Rogers        "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
1411100ee7c3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS",
1412100ee7c3SIan Rogers        "ScaleUnit": "100%"
1413100ee7c3SIan Rogers    },
1414100ee7c3SIan Rogers    {
1415100ee7c3SIan Rogers        "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
1416b522c8afSIan Rogers        "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)",
1417e2c8b40eSIan Rogers        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
1418100ee7c3SIan Rogers        "MetricName": "tma_l1_bound",
1419100ee7c3SIan Rogers        "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
1420100ee7c3SIan Rogers        "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache.  The L1 data cache typically has the shortest latency.  However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads that miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1",
1421100ee7c3SIan Rogers        "ScaleUnit": "100%"
1422100ee7c3SIan Rogers    },
1423100ee7c3SIan Rogers    {
1424*4c10b96fSIan Rogers        "BriefDescription": "This metric roughly estimates fraction of cycles with demand load accesses that hit the L1 cache",
1425*4c10b96fSIan Rogers        "MetricExpr": "min(2 * (MEM_INST_RETIRED.ALL_LOADS - MEM_LOAD_RETIRED.FB_HIT - MEM_LOAD_RETIRED.L1_MISS) * 20 / 100, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks",
1426*4c10b96fSIan Rogers        "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_l1_bound_group",
1427*4c10b96fSIan Rogers        "MetricName": "tma_l1_hit_latency",
1428*4c10b96fSIan Rogers        "MetricThreshold": "tma_l1_hit_latency > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
1429*4c10b96fSIan Rogers        "PublicDescription": "This metric roughly estimates fraction of cycles with demand load accesses that hit the L1 cache. The short latency of the L1 data cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT",
1430*4c10b96fSIan Rogers        "ScaleUnit": "100%"
1431*4c10b96fSIan Rogers    },
1432*4c10b96fSIan Rogers    {
1433100ee7c3SIan Rogers        "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
1434100ee7c3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
1435b522c8afSIan Rogers        "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)",
1436*4c10b96fSIan Rogers        "MetricGroup": "BvML;CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
1437100ee7c3SIan Rogers        "MetricName": "tma_l2_bound",
1438100ee7c3SIan Rogers        "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
1439100ee7c3SIan Rogers        "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads.  Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS",
1440100ee7c3SIan Rogers        "ScaleUnit": "100%"
1441100ee7c3SIan Rogers    },
1442100ee7c3SIan Rogers    {
1443100ee7c3SIan Rogers        "BriefDescription": "This metric estimates how often the CPU was stalled due to load accesses to L3 cache or contended with a sibling Core",
1444b522c8afSIan Rogers        "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks",
1445e2c8b40eSIan Rogers        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
1446100ee7c3SIan Rogers        "MetricName": "tma_l3_bound",
1447100ee7c3SIan Rogers        "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
1448100ee7c3SIan Rogers        "PublicDescription": "This metric estimates how often the CPU was stalled due to load accesses to L3 cache or contended with a sibling Core.  Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS",
1449100ee7c3SIan Rogers        "ScaleUnit": "100%"
1450100ee7c3SIan Rogers    },
1451100ee7c3SIan Rogers    {
1452e2c8b40eSIan Rogers        "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
1453e2c8b40eSIan Rogers        "MetricExpr": "17 * tma_info_system_core_frequency * (MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2)) / tma_info_thread_clks",
1454*4c10b96fSIan Rogers        "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
1455100ee7c3SIan Rogers        "MetricName": "tma_l3_hit_latency",
1456100ee7c3SIan Rogers        "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
1457e2c8b40eSIan Rogers        "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_cache_memory_latency, tma_mem_latency",
1458100ee7c3SIan Rogers        "ScaleUnit": "100%"
1459100ee7c3SIan Rogers    },
1460100ee7c3SIan Rogers    {
1461100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)",
1462e2c8b40eSIan Rogers        "MetricExpr": "DECODE.LCP / tma_info_thread_clks",
1463100ee7c3SIan Rogers        "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB",
1464100ee7c3SIan Rogers        "MetricName": "tma_lcp",
1465100ee7c3SIan Rogers        "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
1466*4c10b96fSIan Rogers        "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb",
1467100ee7c3SIan Rogers        "ScaleUnit": "100%"
1468100ee7c3SIan Rogers    },
1469100ee7c3SIan Rogers    {
1470100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)",
1471100ee7c3SIan Rogers        "MetricExpr": "tma_retiring - tma_heavy_operations",
1472100ee7c3SIan Rogers        "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
1473100ee7c3SIan Rogers        "MetricName": "tma_light_operations",
1474100ee7c3SIan Rogers        "MetricThreshold": "tma_light_operations > 0.6",
1475ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
1476e2c8b40eSIan Rogers        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST",
1477100ee7c3SIan Rogers        "ScaleUnit": "100%"
1478100ee7c3SIan Rogers    },
1479100ee7c3SIan Rogers    {
1480100ee7c3SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations",
1481b522c8afSIan Rogers        "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)",
1482100ee7c3SIan Rogers        "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
1483100ee7c3SIan Rogers        "MetricName": "tma_load_op_utilization",
1484100ee7c3SIan Rogers        "MetricThreshold": "tma_load_op_utilization > 0.6",
1485100ee7c3SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations. Sample with: UOPS_DISPATCHED.PORT_2_3",
1486100ee7c3SIan Rogers        "ScaleUnit": "100%"
1487100ee7c3SIan Rogers    },
1488100ee7c3SIan Rogers    {
1489100ee7c3SIan Rogers        "BriefDescription": "This metric roughly estimates the fraction of cycles where the (first level) DTLB was missed by load accesses, that later on hit in second-level TLB (STLB)",
1490100ee7c3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS_NMI",
1491100ee7c3SIan Rogers        "MetricExpr": "tma_dtlb_load - tma_load_stlb_miss",
1492100ee7c3SIan Rogers        "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_load_group",
1493100ee7c3SIan Rogers        "MetricName": "tma_load_stlb_hit",
1494100ee7c3SIan Rogers        "MetricThreshold": "tma_load_stlb_hit > 0.05 & (tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
1495100ee7c3SIan Rogers        "ScaleUnit": "100%"
1496100ee7c3SIan Rogers    },
1497100ee7c3SIan Rogers    {
1498100ee7c3SIan Rogers        "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk",
1499b522c8afSIan Rogers        "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_thread_clks",
1500100ee7c3SIan Rogers        "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_load_group",
1501100ee7c3SIan Rogers        "MetricName": "tma_load_stlb_miss",
1502100ee7c3SIan Rogers        "MetricThreshold": "tma_load_stlb_miss > 0.05 & (tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
1503100ee7c3SIan Rogers        "ScaleUnit": "100%"
1504100ee7c3SIan Rogers    },
1505100ee7c3SIan Rogers    {
1506100ee7c3SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory",
1507e2c8b40eSIan Rogers        "MetricExpr": "59.5 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
1508100ee7c3SIan Rogers        "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group",
1509e2c8b40eSIan Rogers        "MetricName": "tma_local_mem",
1510e2c8b40eSIan Rogers        "MetricThreshold": "tma_local_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
1511*4c10b96fSIan Rogers        "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. Sample with: MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM",
1512100ee7c3SIan Rogers        "ScaleUnit": "100%"
1513100ee7c3SIan Rogers    },
1514100ee7c3SIan Rogers    {
1515100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations",
1516b522c8afSIan Rogers        "MetricExpr": "(12 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (11 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_thread_clks",
1517100ee7c3SIan Rogers        "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group",
1518100ee7c3SIan Rogers        "MetricName": "tma_lock_latency",
1519100ee7c3SIan Rogers        "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
1520*4c10b96fSIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. Sample with: MEM_INST_RETIRED.LOCK_LOADS. Related metrics: tma_store_latency",
1521100ee7c3SIan Rogers        "ScaleUnit": "100%"
1522100ee7c3SIan Rogers    },
1523100ee7c3SIan Rogers    {
1524100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears",
1525100ee7c3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
1526100ee7c3SIan Rogers        "MetricExpr": "tma_bad_speculation - tma_branch_mispredicts",
1527*4c10b96fSIan Rogers        "MetricGroup": "BadSpec;BvMS;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
1528100ee7c3SIan Rogers        "MetricName": "tma_machine_clears",
1529100ee7c3SIan Rogers        "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
1530ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
1531100ee7c3SIan Rogers        "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears.  These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
1532100ee7c3SIan Rogers        "ScaleUnit": "100%"
1533100ee7c3SIan Rogers    },
1534100ee7c3SIan Rogers    {
1535e2c8b40eSIan Rogers        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)",
1536b522c8afSIan Rogers        "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks",
1537*4c10b96fSIan Rogers        "MetricGroup": "BvMS;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
1538100ee7c3SIan Rogers        "MetricName": "tma_mem_bandwidth",
1539100ee7c3SIan Rogers        "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
1540e2c8b40eSIan Rogers        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full",
1541100ee7c3SIan Rogers        "ScaleUnit": "100%"
1542100ee7c3SIan Rogers    },
1543100ee7c3SIan Rogers    {
1544e2c8b40eSIan Rogers        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)",
1545b522c8afSIan Rogers        "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth",
1546*4c10b96fSIan Rogers        "MetricGroup": "BvML;MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
1547100ee7c3SIan Rogers        "MetricName": "tma_mem_latency",
1548100ee7c3SIan Rogers        "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
1549e2c8b40eSIan Rogers        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_cache_memory_latency, tma_l3_hit_latency",
1550100ee7c3SIan Rogers        "ScaleUnit": "100%"
1551100ee7c3SIan Rogers    },
1552100ee7c3SIan Rogers    {
1553100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck",
1554100ee7c3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
1555100ee7c3SIan Rogers        "MetricExpr": "(CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES) * tma_backend_bound",
1556100ee7c3SIan Rogers        "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
1557100ee7c3SIan Rogers        "MetricName": "tma_memory_bound",
1558100ee7c3SIan Rogers        "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
1559ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
1560100ee7c3SIan Rogers        "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck.  Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
1561100ee7c3SIan Rogers        "ScaleUnit": "100%"
1562100ee7c3SIan Rogers    },
1563100ee7c3SIan Rogers    {
1564100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.",
1565100ee7c3SIan Rogers        "MetricExpr": "tma_light_operations * MEM_INST_RETIRED.ANY / INST_RETIRED.ANY",
1566100ee7c3SIan Rogers        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
1567100ee7c3SIan Rogers        "MetricName": "tma_memory_operations",
1568100ee7c3SIan Rogers        "MetricThreshold": "tma_memory_operations > 0.1 & tma_light_operations > 0.6",
1569100ee7c3SIan Rogers        "ScaleUnit": "100%"
1570100ee7c3SIan Rogers    },
1571100ee7c3SIan Rogers    {
1572100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit",
1573*4c10b96fSIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS_NMI",
1574b522c8afSIan Rogers        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots",
1575100ee7c3SIan Rogers        "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS",
1576100ee7c3SIan Rogers        "MetricName": "tma_microcode_sequencer",
1577100ee7c3SIan Rogers        "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1",
1578e2c8b40eSIan Rogers        "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit.  The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_ms_switches",
1579100ee7c3SIan Rogers        "ScaleUnit": "100%"
1580100ee7c3SIan Rogers    },
1581100ee7c3SIan Rogers    {
1582100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage",
1583b522c8afSIan Rogers        "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks",
1584*4c10b96fSIan Rogers        "MetricGroup": "BadSpec;BrMispredicts;BvMP;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM",
1585100ee7c3SIan Rogers        "MetricName": "tma_mispredicts_resteers",
1586100ee7c3SIan Rogers        "MetricThreshold": "tma_mispredicts_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
1587b522c8afSIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions",
1588100ee7c3SIan Rogers        "ScaleUnit": "100%"
1589100ee7c3SIan Rogers    },
1590100ee7c3SIan Rogers    {
1591100ee7c3SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)",
1592b522c8afSIan Rogers        "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
1593100ee7c3SIan Rogers        "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
1594100ee7c3SIan Rogers        "MetricName": "tma_mite",
1595e2c8b40eSIan Rogers        "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2",
1596100ee7c3SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS",
1597100ee7c3SIan Rogers        "ScaleUnit": "100%"
1598100ee7c3SIan Rogers    },
1599100ee7c3SIan Rogers    {
1600e2c8b40eSIan Rogers        "BriefDescription": "This metric estimates penalty in terms of percentage of ([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles)",
1601100ee7c3SIan Rogers        "MetricExpr": "UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY",
1602100ee7c3SIan Rogers        "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group",
1603100ee7c3SIan Rogers        "MetricName": "tma_mixing_vectors",
1604100ee7c3SIan Rogers        "MetricThreshold": "tma_mixing_vectors > 0.05",
1605e2c8b40eSIan Rogers        "PublicDescription": "This metric estimates penalty in terms of percentage of ([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles). Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches",
1606100ee7c3SIan Rogers        "ScaleUnit": "100%"
1607100ee7c3SIan Rogers    },
1608100ee7c3SIan Rogers    {
1609100ee7c3SIan Rogers        "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)",
1610b522c8afSIan Rogers        "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_thread_clks",
1611100ee7c3SIan Rogers        "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO",
1612100ee7c3SIan Rogers        "MetricName": "tma_ms_switches",
1613100ee7c3SIan Rogers        "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
1614e2c8b40eSIan Rogers        "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation",
1615100ee7c3SIan Rogers        "ScaleUnit": "100%"
1616100ee7c3SIan Rogers    },
1617100ee7c3SIan Rogers    {
1618100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused",
1619100ee7c3SIan Rogers        "MetricExpr": "tma_light_operations * (BR_INST_RETIRED.ALL_BRANCHES - UOPS_RETIRED.MACRO_FUSED) / UOPS_RETIRED.RETIRE_SLOTS",
1620*4c10b96fSIan Rogers        "MetricGroup": "Branches;BvBO;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
1621100ee7c3SIan Rogers        "MetricName": "tma_non_fused_branches",
1622100ee7c3SIan Rogers        "MetricThreshold": "tma_non_fused_branches > 0.1 & tma_light_operations > 0.6",
1623100ee7c3SIan Rogers        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused. Non-conditional branches like direct JMP or CALL would count here. Can be used to examine fusible conditional jumps that were not fused.",
1624100ee7c3SIan Rogers        "ScaleUnit": "100%"
1625100ee7c3SIan Rogers    },
1626100ee7c3SIan Rogers    {
1627100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions",
1628100ee7c3SIan Rogers        "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / UOPS_RETIRED.RETIRE_SLOTS",
1629*4c10b96fSIan Rogers        "MetricGroup": "BvBO;Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group",
1630100ee7c3SIan Rogers        "MetricName": "tma_nop_instructions",
1631e2c8b40eSIan Rogers        "MetricThreshold": "tma_nop_instructions > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)",
1632100ee7c3SIan Rogers        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP",
1633100ee7c3SIan Rogers        "ScaleUnit": "100%"
1634100ee7c3SIan Rogers    },
1635100ee7c3SIan Rogers    {
1636100ee7c3SIan Rogers        "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes",
1637e2c8b40eSIan Rogers        "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))",
1638100ee7c3SIan Rogers        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
1639100ee7c3SIan Rogers        "MetricName": "tma_other_light_ops",
1640100ee7c3SIan Rogers        "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6",
1641100ee7c3SIan Rogers        "PublicDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes. May undercount due to FMA double counting",
1642100ee7c3SIan Rogers        "ScaleUnit": "100%"
1643100ee7c3SIan Rogers    },
1644100ee7c3SIan Rogers    {
1645e2c8b40eSIan Rogers        "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).",
1646e2c8b40eSIan Rogers        "MetricExpr": "max(tma_branch_mispredicts * (1 - BR_MISP_RETIRED.ALL_BRANCHES / (INT_MISC.CLEARS_COUNT - MACHINE_CLEARS.COUNT)), 0.0001)",
1647*4c10b96fSIan Rogers        "MetricGroup": "BrMispredicts;BvIO;TopdownL3;tma_L3_group;tma_branch_mispredicts_group",
1648e2c8b40eSIan Rogers        "MetricName": "tma_other_mispredicts",
1649e2c8b40eSIan Rogers        "MetricThreshold": "tma_other_mispredicts > 0.05 & (tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15)",
1650e2c8b40eSIan Rogers        "ScaleUnit": "100%"
1651e2c8b40eSIan Rogers    },
1652e2c8b40eSIan Rogers    {
1653e2c8b40eSIan Rogers        "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.",
1654e2c8b40eSIan Rogers        "MetricExpr": "max(tma_machine_clears * (1 - MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.COUNT), 0.0001)",
1655*4c10b96fSIan Rogers        "MetricGroup": "BvIO;Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group",
1656e2c8b40eSIan Rogers        "MetricName": "tma_other_nukes",
1657e2c8b40eSIan Rogers        "MetricThreshold": "tma_other_nukes > 0.05 & (tma_machine_clears > 0.1 & tma_bad_speculation > 0.15)",
1658e2c8b40eSIan Rogers        "ScaleUnit": "100%"
1659e2c8b40eSIan Rogers    },
1660e2c8b40eSIan Rogers    {
1661100ee7c3SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)",
1662b522c8afSIan Rogers        "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_core_clks",
1663100ee7c3SIan Rogers        "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
1664100ee7c3SIan Rogers        "MetricName": "tma_port_0",
1665100ee7c3SIan Rogers        "MetricThreshold": "tma_port_0 > 0.6",
1666100ee7c3SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch). Sample with: UOPS_DISPATCHED_PORT.PORT_0. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
1667100ee7c3SIan Rogers        "ScaleUnit": "100%"
1668100ee7c3SIan Rogers    },
1669100ee7c3SIan Rogers    {
1670100ee7c3SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)",
1671b522c8afSIan Rogers        "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_core_clks",
1672100ee7c3SIan Rogers        "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
1673100ee7c3SIan Rogers        "MetricName": "tma_port_1",
1674100ee7c3SIan Rogers        "MetricThreshold": "tma_port_1 > 0.6",
1675100ee7c3SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_1. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_5, tma_port_6, tma_ports_utilized_2",
1676100ee7c3SIan Rogers        "ScaleUnit": "100%"
1677100ee7c3SIan Rogers    },
1678100ee7c3SIan Rogers    {
1679100ee7c3SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads)",
1680b522c8afSIan Rogers        "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_core_clks",
1681100ee7c3SIan Rogers        "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group",
1682100ee7c3SIan Rogers        "MetricName": "tma_port_2",
1683100ee7c3SIan Rogers        "MetricThreshold": "tma_port_2 > 0.6",
1684100ee7c3SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads). Sample with: UOPS_DISPATCHED_PORT.PORT_2",
1685100ee7c3SIan Rogers        "ScaleUnit": "100%"
1686100ee7c3SIan Rogers    },
1687100ee7c3SIan Rogers    {
1688100ee7c3SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads)",
1689b522c8afSIan Rogers        "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_core_clks",
1690100ee7c3SIan Rogers        "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group",
1691100ee7c3SIan Rogers        "MetricName": "tma_port_3",
1692100ee7c3SIan Rogers        "MetricThreshold": "tma_port_3 > 0.6",
1693100ee7c3SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads). Sample with: UOPS_DISPATCHED_PORT.PORT_3",
1694100ee7c3SIan Rogers        "ScaleUnit": "100%"
1695100ee7c3SIan Rogers    },
1696100ee7c3SIan Rogers    {
1697100ee7c3SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 4 (Store-data)",
1698100ee7c3SIan Rogers        "MetricExpr": "tma_store_op_utilization",
1699100ee7c3SIan Rogers        "MetricGroup": "TopdownL6;tma_L6_group;tma_issueSpSt;tma_store_op_utilization_group",
1700100ee7c3SIan Rogers        "MetricName": "tma_port_4",
1701100ee7c3SIan Rogers        "MetricThreshold": "tma_port_4 > 0.6",
1702100ee7c3SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 4 (Store-data). Sample with: UOPS_DISPATCHED_PORT.PORT_4. Related metrics: tma_split_stores",
1703100ee7c3SIan Rogers        "ScaleUnit": "100%"
1704100ee7c3SIan Rogers    },
1705100ee7c3SIan Rogers    {
1706100ee7c3SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU)",
1707b522c8afSIan Rogers        "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_core_clks",
1708100ee7c3SIan Rogers        "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
1709100ee7c3SIan Rogers        "MetricName": "tma_port_5",
1710100ee7c3SIan Rogers        "MetricThreshold": "tma_port_5 > 0.6",
1711100ee7c3SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_5. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_6, tma_ports_utilized_2",
1712100ee7c3SIan Rogers        "ScaleUnit": "100%"
1713100ee7c3SIan Rogers    },
1714100ee7c3SIan Rogers    {
1715100ee7c3SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)",
1716b522c8afSIan Rogers        "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks",
1717100ee7c3SIan Rogers        "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
1718100ee7c3SIan Rogers        "MetricName": "tma_port_6",
1719100ee7c3SIan Rogers        "MetricThreshold": "tma_port_6 > 0.6",
1720100ee7c3SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
1721100ee7c3SIan Rogers        "ScaleUnit": "100%"
1722100ee7c3SIan Rogers    },
1723100ee7c3SIan Rogers    {
1724100ee7c3SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address)",
1725b522c8afSIan Rogers        "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_core_clks",
1726100ee7c3SIan Rogers        "MetricGroup": "TopdownL6;tma_L6_group;tma_store_op_utilization_group",
1727100ee7c3SIan Rogers        "MetricName": "tma_port_7",
1728100ee7c3SIan Rogers        "MetricThreshold": "tma_port_7 > 0.6",
1729100ee7c3SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address). Sample with: UOPS_DISPATCHED_PORT.PORT_7",
1730100ee7c3SIan Rogers        "ScaleUnit": "100%"
1731100ee7c3SIan Rogers    },
1732100ee7c3SIan Rogers    {
1733100ee7c3SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)",
1734e2c8b40eSIan Rogers        "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)",
1735100ee7c3SIan Rogers        "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group",
1736100ee7c3SIan Rogers        "MetricName": "tma_ports_utilization",
1737100ee7c3SIan Rogers        "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
1738100ee7c3SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related).  Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.",
1739100ee7c3SIan Rogers        "ScaleUnit": "100%"
1740100ee7c3SIan Rogers    },
1741100ee7c3SIan Rogers    {
1742100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
1743*4c10b96fSIan Rogers        "MetricExpr": "EXE_ACTIVITY.EXE_BOUND_0_PORTS / tma_info_thread_clks",
1744100ee7c3SIan Rogers        "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
1745100ee7c3SIan Rogers        "MetricName": "tma_ports_utilized_0",
1746100ee7c3SIan Rogers        "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
1747100ee7c3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). Long-latency instructions like divides may contribute to this metric.",
1748100ee7c3SIan Rogers        "ScaleUnit": "100%"
1749100ee7c3SIan Rogers    },
1750100ee7c3SIan Rogers    {
1751100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
1752b522c8afSIan Rogers        "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_1 - UOPS_EXECUTED.CORE_CYCLES_GE_2) / 2 if #SMT_on else EXE_ACTIVITY.1_PORTS_UTIL) / tma_info_core_core_clks",
1753100ee7c3SIan Rogers        "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group",
1754100ee7c3SIan Rogers        "MetricName": "tma_ports_utilized_1",
1755100ee7c3SIan Rogers        "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
1756100ee7c3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful. Related metrics: tma_l1_bound",
1757100ee7c3SIan Rogers        "ScaleUnit": "100%"
1758100ee7c3SIan Rogers    },
1759100ee7c3SIan Rogers    {
1760100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
1761b522c8afSIan Rogers        "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_2 - UOPS_EXECUTED.CORE_CYCLES_GE_3) / 2 if #SMT_on else EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_core_core_clks",
1762100ee7c3SIan Rogers        "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group",
1763100ee7c3SIan Rogers        "MetricName": "tma_ports_utilized_2",
1764100ee7c3SIan Rogers        "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
1765100ee7c3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).  Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6",
1766100ee7c3SIan Rogers        "ScaleUnit": "100%"
1767100ee7c3SIan Rogers    },
1768100ee7c3SIan Rogers    {
1769100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).",
1770b522c8afSIan Rogers        "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_GE_3 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_3) / tma_info_core_core_clks",
1771*4c10b96fSIan Rogers        "MetricGroup": "BvCB;PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
1772100ee7c3SIan Rogers        "MetricName": "tma_ports_utilized_3m",
1773e2c8b40eSIan Rogers        "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
1774100ee7c3SIan Rogers        "ScaleUnit": "100%"
1775100ee7c3SIan Rogers    },
1776100ee7c3SIan Rogers    {
1777100ee7c3SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronization issues",
1778100ee7c3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS_NMI",
1779e2c8b40eSIan Rogers        "MetricExpr": "(89.5 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 89.5 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
1780100ee7c3SIan Rogers        "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_L5_group;tma_issueSyncxn;tma_mem_latency_group",
1781100ee7c3SIan Rogers        "MetricName": "tma_remote_cache",
1782100ee7c3SIan Rogers        "MetricThreshold": "tma_remote_cache > 0.05 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
1783100ee7c3SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronization issues. This is often caused by non-optimal NUMA allocations. #link to NUMA article. Sample with: MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM_PS;MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD_PS. Related metrics: tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_machine_clears",
1784100ee7c3SIan Rogers        "ScaleUnit": "100%"
1785100ee7c3SIan Rogers    },
1786100ee7c3SIan Rogers    {
1787100ee7c3SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory",
1788e2c8b40eSIan Rogers        "MetricExpr": "127 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
1789100ee7c3SIan Rogers        "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group",
1790e2c8b40eSIan Rogers        "MetricName": "tma_remote_mem",
1791e2c8b40eSIan Rogers        "MetricThreshold": "tma_remote_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
1792100ee7c3SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is often caused by non-optimal NUMA allocations. #link to NUMA article. Sample with: MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM_PS",
1793100ee7c3SIan Rogers        "ScaleUnit": "100%"
1794100ee7c3SIan Rogers    },
1795100ee7c3SIan Rogers    {
1796100ee7c3SIan Rogers        "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired",
1797b522c8afSIan Rogers        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_thread_slots",
1798*4c10b96fSIan Rogers        "MetricGroup": "BvUW;TmaL1;TopdownL1;tma_L1_group",
1799100ee7c3SIan Rogers        "MetricName": "tma_retiring",
1800100ee7c3SIan Rogers        "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
1801ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL1",
1802100ee7c3SIan Rogers        "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved.  Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessarily mean there is no room for more performance.  For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
1803100ee7c3SIan Rogers        "ScaleUnit": "100%"
1804100ee7c3SIan Rogers    },
1805100ee7c3SIan Rogers    {
1806100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations",
1807b522c8afSIan Rogers        "MetricExpr": "PARTIAL_RAT_STALLS.SCOREBOARD / tma_info_thread_clks",
1808*4c10b96fSIan Rogers        "MetricGroup": "BvIO;PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO",
1809100ee7c3SIan Rogers        "MetricName": "tma_serializing_operation",
1810e2c8b40eSIan Rogers        "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
1811100ee7c3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: PARTIAL_RAT_STALLS.SCOREBOARD. Related metrics: tma_ms_switches",
1812100ee7c3SIan Rogers        "ScaleUnit": "100%"
1813100ee7c3SIan Rogers    },
1814100ee7c3SIan Rogers    {
1815100ee7c3SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - loads that cross a 64-byte cache line boundary",
1816100ee7c3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS_NMI",
1817b522c8afSIan Rogers        "MetricExpr": "tma_info_memory_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_thread_clks",
1818100ee7c3SIan Rogers        "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
1819100ee7c3SIan Rogers        "MetricName": "tma_split_loads",
1820100ee7c3SIan Rogers        "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
1821100ee7c3SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - loads that cross a 64-byte cache line boundary. Sample with: MEM_INST_RETIRED.SPLIT_LOADS_PS",
1822100ee7c3SIan Rogers        "ScaleUnit": "100%"
1823100ee7c3SIan Rogers    },
1824100ee7c3SIan Rogers    {
1825100ee7c3SIan Rogers        "BriefDescription": "This metric represents rate of split store accesses",
1826b522c8afSIan Rogers        "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_core_clks",
1827100ee7c3SIan Rogers        "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group",
1828100ee7c3SIan Rogers        "MetricName": "tma_split_stores",
1829100ee7c3SIan Rogers        "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
1830100ee7c3SIan Rogers        "PublicDescription": "This metric represents rate of split store accesses.  Consider aligning your data to the 64-byte cache line granularity. Sample with: MEM_INST_RETIRED.SPLIT_STORES_PS. Related metrics: tma_port_4",
1831100ee7c3SIan Rogers        "ScaleUnit": "100%"
1832100ee7c3SIan Rogers    },
1833100ee7c3SIan Rogers    {
1834100ee7c3SIan Rogers        "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)",
1835b522c8afSIan Rogers        "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_core_clks",
1836*4c10b96fSIan Rogers        "MetricGroup": "BvMS;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group",
1837100ee7c3SIan Rogers        "MetricName": "tma_sq_full",
1838100ee7c3SIan Rogers        "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
1839e2c8b40eSIan Rogers        "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth",
1840100ee7c3SIan Rogers        "ScaleUnit": "100%"
1841100ee7c3SIan Rogers    },
1842100ee7c3SIan Rogers    {
1843100ee7c3SIan Rogers        "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO stores issue a read-for-ownership request before the write",
1844b522c8afSIan Rogers        "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_thread_clks",
1845100ee7c3SIan Rogers        "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
1846100ee7c3SIan Rogers        "MetricName": "tma_store_bound",
1847100ee7c3SIan Rogers        "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
1848100ee7c3SIan Rogers        "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO stores issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are a few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. Sample with: MEM_INST_RETIRED.ALL_STORES_PS",
1849100ee7c3SIan Rogers        "ScaleUnit": "100%"
1850100ee7c3SIan Rogers    },
1851100ee7c3SIan Rogers    {
1852100ee7c3SIan Rogers        "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores",
1853b522c8afSIan Rogers        "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks",
1854100ee7c3SIan Rogers        "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
1855100ee7c3SIan Rogers        "MetricName": "tma_store_fwd_blk",
1856100ee7c3SIan Rogers        "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
1857100ee7c3SIan Rogers        "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). However; in some cases the load may be blocked for a significant time pending the store forward. For example; when the prior store is writing a smaller region than the load is reading.",
1858100ee7c3SIan Rogers        "ScaleUnit": "100%"
1859100ee7c3SIan Rogers    },
1860100ee7c3SIan Rogers    {
1861100ee7c3SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses",
1862100ee7c3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS_NMI",
1863b522c8afSIan Rogers        "MetricExpr": "(L2_RQSTS.RFO_HIT * 11 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_thread_clks",
1864*4c10b96fSIan Rogers        "MetricGroup": "BvML;MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group",
1865100ee7c3SIan Rogers        "MetricName": "tma_store_latency",
1866100ee7c3SIan Rogers        "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
1867100ee7c3SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually have less impact on out-of-order core performance; however; holding resources for a longer time can lead to undesired implications (e.g. contention on L1D fill-buffer entries - see FB_Full). Related metrics: tma_fb_full, tma_lock_latency",
1868100ee7c3SIan Rogers        "ScaleUnit": "100%"
1869100ee7c3SIan Rogers    },
1870100ee7c3SIan Rogers    {
1871100ee7c3SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations",
1872b522c8afSIan Rogers        "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_core_clks",
1873100ee7c3SIan Rogers        "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
1874100ee7c3SIan Rogers        "MetricName": "tma_store_op_utilization",
1875100ee7c3SIan Rogers        "MetricThreshold": "tma_store_op_utilization > 0.6",
1876100ee7c3SIan Rogers        "ScaleUnit": "100%"
1877100ee7c3SIan Rogers    },
1878100ee7c3SIan Rogers    {
1879100ee7c3SIan Rogers        "BriefDescription": "This metric roughly estimates the fraction of cycles where the TLB was missed by store accesses, hitting in the second-level TLB (STLB)",
1880100ee7c3SIan Rogers        "MetricExpr": "tma_dtlb_store - tma_store_stlb_miss",
1881100ee7c3SIan Rogers        "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_store_group",
1882100ee7c3SIan Rogers        "MetricName": "tma_store_stlb_hit",
1883100ee7c3SIan Rogers        "MetricThreshold": "tma_store_stlb_hit > 0.05 & (tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
1884100ee7c3SIan Rogers        "ScaleUnit": "100%"
1885100ee7c3SIan Rogers    },
1886100ee7c3SIan Rogers    {
1887100ee7c3SIan Rogers        "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk",
1888b522c8afSIan Rogers        "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_core_clks",
1889100ee7c3SIan Rogers        "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_store_group",
1890100ee7c3SIan Rogers        "MetricName": "tma_store_stlb_miss",
1891100ee7c3SIan Rogers        "MetricThreshold": "tma_store_stlb_miss > 0.05 & (tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
1892100ee7c3SIan Rogers        "ScaleUnit": "100%"
1893100ee7c3SIan Rogers    },
1894100ee7c3SIan Rogers    {
1895100ee7c3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears",
1896b522c8afSIan Rogers        "MetricExpr": "9 * BACLEARS.ANY / tma_info_thread_clks",
1897*4c10b96fSIan Rogers        "MetricGroup": "BigFootprint;BvBC;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
1898100ee7c3SIan Rogers        "MetricName": "tma_unknown_branches",
1899100ee7c3SIan Rogers        "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
1900e2c8b40eSIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: BACLEARS.ANY",
1901100ee7c3SIan Rogers        "ScaleUnit": "100%"
1902100ee7c3SIan Rogers    },
1903100ee7c3SIan Rogers    {
1904100ee7c3SIan Rogers        "BriefDescription": "This metric serves as an approximation of legacy x87 usage",
1905100ee7c3SIan Rogers        "MetricExpr": "tma_retiring * UOPS_EXECUTED.X87 / UOPS_EXECUTED.THREAD",
1906100ee7c3SIan Rogers        "MetricGroup": "Compute;TopdownL4;tma_L4_group;tma_fp_arith_group",
1907100ee7c3SIan Rogers        "MetricName": "tma_x87_use",
1908100ee7c3SIan Rogers        "MetricThreshold": "tma_x87_use > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
1909100ee7c3SIan Rogers        "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.",
1910100ee7c3SIan Rogers        "ScaleUnit": "100%"
1911100ee7c3SIan Rogers    },
1912100ee7c3SIan Rogers    {
1913100ee7c3SIan Rogers        "BriefDescription": "Percentage of cycles in aborted transactions.",
19148076dc8cSIan Rogers        "MetricExpr": "(max(cycles\\-t - cycles\\-ct, 0) / cycles if has_event(cycles\\-t) else 0)",
1915100ee7c3SIan Rogers        "MetricGroup": "transaction",
1916100ee7c3SIan Rogers        "MetricName": "tsx_aborted_cycles",
1917100ee7c3SIan Rogers        "ScaleUnit": "100%"
1918100ee7c3SIan Rogers    },
1919100ee7c3SIan Rogers    {
1920100ee7c3SIan Rogers        "BriefDescription": "Number of cycles within a transaction divided by the number of elisions.",
1921c43c64f8SIan Rogers        "MetricExpr": "(cycles\\-t / el\\-start if has_event(el\\-start) else 0)",
1922100ee7c3SIan Rogers        "MetricGroup": "transaction",
1923100ee7c3SIan Rogers        "MetricName": "tsx_cycles_per_elision",
1924100ee7c3SIan Rogers        "ScaleUnit": "1cycles / elision"
1925100ee7c3SIan Rogers    },
1926100ee7c3SIan Rogers    {
1927100ee7c3SIan Rogers        "BriefDescription": "Number of cycles within a transaction divided by the number of transactions.",
19288076dc8cSIan Rogers        "MetricExpr": "(cycles\\-t / tx\\-start if has_event(cycles\\-t) else 0)",
1929100ee7c3SIan Rogers        "MetricGroup": "transaction",
1930100ee7c3SIan Rogers        "MetricName": "tsx_cycles_per_transaction",
1931100ee7c3SIan Rogers        "ScaleUnit": "1cycles / transaction"
1932100ee7c3SIan Rogers    },
1933100ee7c3SIan Rogers    {
1934100ee7c3SIan Rogers        "BriefDescription": "Percentage of cycles within a transaction region.",
19358076dc8cSIan Rogers        "MetricExpr": "(cycles\\-t / cycles if has_event(cycles\\-t) else 0)",
1936100ee7c3SIan Rogers        "MetricGroup": "transaction",
1937100ee7c3SIan Rogers        "MetricName": "tsx_transactional_cycles",
1938100ee7c3SIan Rogers        "ScaleUnit": "100%"
1939b522c8afSIan Rogers    },
1940b522c8afSIan Rogers    {
1941b522c8afSIan Rogers        "BriefDescription": "Uncore operating frequency in GHz",
1942b522c8afSIan Rogers        "MetricExpr": "UNC_CHA_CLOCKTICKS / (#num_cores / #num_packages * #num_packages) / 1e9 / duration_time",
1943b522c8afSIan Rogers        "MetricName": "uncore_frequency",
1944b522c8afSIan Rogers        "ScaleUnit": "1GHz"
1945b522c8afSIan Rogers    },
1946b522c8afSIan Rogers    {
194719dd49c9SIan Rogers        "BriefDescription": "Intel(R) Ultra Path Interconnect (UPI) data receive bandwidth (MB/sec)",
194819dd49c9SIan Rogers        "MetricExpr": "UNC_UPI_RxL_FLITS.ALL_DATA * 7.111111111111111 / 1e6 / duration_time",
194919dd49c9SIan Rogers        "MetricName": "upi_data_receive_bw",
195019dd49c9SIan Rogers        "ScaleUnit": "1MB/s"
195119dd49c9SIan Rogers    },
195219dd49c9SIan Rogers    {
1953b522c8afSIan Rogers        "BriefDescription": "Intel(R) Ultra Path Interconnect (UPI) data transmit bandwidth (MB/sec)",
1954b522c8afSIan Rogers        "MetricExpr": "UNC_UPI_TxL_FLITS.ALL_DATA * 7.111111111111111 / 1e6 / duration_time",
1955b522c8afSIan Rogers        "MetricName": "upi_data_transmit_bw",
1956b522c8afSIan Rogers        "ScaleUnit": "1MB/s"
195756de5b63SAndi Kleen    }
195856de5b63SAndi Kleen]
1959