xref: /freebsd/lib/libpmc/pmc.3 (revision 2b743a9e9ddc6736208dc8ca1ce06ce64ad20a19)
1.\" Copyright (c) 2003-2006 Joseph Koshy.  All rights reserved.
2.\"
3.\" Redistribution and use in source and binary forms, with or without
4.\" modification, are permitted provided that the following conditions
5.\" are met:
6.\" 1. Redistributions of source code must retain the above copyright
7.\"    notice, this list of conditions and the following disclaimer.
8.\" 2. Redistributions in binary form must reproduce the above copyright
9.\"    notice, this list of conditions and the following disclaimer in the
10.\"    documentation and/or other materials provided with the distribution.
11.\"
12.\" This software is provided by Joseph Koshy ``as is'' and
13.\" any express or implied warranties, including, but not limited to, the
14.\" implied warranties of merchantability and fitness for a particular purpose
15.\" are disclaimed.  in no event shall Joseph Koshy be liable
16.\" for any direct, indirect, incidental, special, exemplary, or consequential
17.\" damages (including, but not limited to, procurement of substitute goods
18.\" or services; loss of use, data, or profits; or business interruption)
19.\" however caused and on any theory of liability, whether in contract, strict
20.\" liability, or tort (including negligence or otherwise) arising in any way
21.\" out of the use of this software, even if advised of the possibility of
22.\" such damage.
23.\"
24.\" $FreeBSD$
25.\"
26.Dd February 25, 2006
27.Os
28.Dt PMC 3
29.Sh NAME
30.Nm pmc_allocate ,
31.Nm pmc_attach ,
32.Nm pmc_capabilities ,
33.Nm pmc_configure_logfile ,
34.Nm pmc_cpuinfo ,
35.Nm pmc_detach ,
36.Nm pmc_disable ,
37.Nm pmc_enable ,
38.Nm pmc_event_names_of_class ,
39.Nm pmc_flush_logfile ,
40.Nm pmc_get_driver_stats ,
41.Nm pmc_get_msr ,
42.Nm pmc_init ,
43.Nm pmc_name_of_capability ,
44.Nm pmc_name_of_class ,
45.Nm pmc_name_of_cputype ,
46.Nm pmc_name_of_event ,
47.Nm pmc_name_of_mode ,
48.Nm pmc_name_of_state ,
49.Nm pmc_ncpu ,
50.Nm pmc_npmc ,
51.Nm pmc_pmcinfo ,
52.Nm pmc_read ,
53.Nm pmc_release ,
54.Nm pmc_rw ,
55.Nm pmc_set ,
56.Nm pmc_start ,
57.Nm pmc_stop ,
58.Nm pmc_width ,
59.Nm pmc_write ,
60.Nm pmc_writelog
61.Nd programming API for using hardware performance monitoring counters
62.Sh LIBRARY
63.Lb libpmc
64.Sh SYNOPSIS
65.In pmc.h
66.Ft int
67.Fo pmc_allocate
68.Fa "const char *eventspecifier"
69.Fa "enum pmc_mode mode"
70.Fa "uint32_t flags"
71.Fa "int cpu"
72.Fa "pmc_id_t *pmcid"
73.Fc
74.Ft int
75.Fn pmc_attach "pmc_id_t pmcid" "pid_t pid"
76.Ft int
77.Fn pmc_capabilities "pmc_id_t pmc" "uint32_t *caps"
78.Ft int
79.Fn pmc_configure_logfile "int fd"
80.Ft int
81.Fn pmc_cpuinfo "const struct pmc_cpuinfo **cpu_info"
82.Ft int
83.Fn pmc_detach "pmc_id_t pmcid" "pid_t pid"
84.Ft int
85.Fn pmc_disable "int cpu" "int pmc"
86.Ft int
87.Fn pmc_enable "int cpu" "int pmc"
88.Ft int
89.Fo pmc_event_names_of_class
90.Fa "enum pmc_class cl"
91.Fa "const char ***eventnames"
92.Fa "int *nevents"
93.Fc
94.Ft int
95.Fn pmc_flush_logfile void
96.Ft int
97.Fn pmc_get_driver_stats "struct pmc_driverstats *gms"
98.Ft int
99.Fn pmc_get_msr "pmc_id_t pmc" "uint32_t *msr"
100.Ft int
101.Fn pmc_init void
102.Ft "const char *"
103.Fn pmc_name_of_capability "enum pmc_caps pc"
104.Ft "const char *"
105.Fn pmc_name_of_class "enum pmc_class pc"
106.Ft "const char *"
107.Fn pmc_name_of_cputype "enum pmc_cputype ct"
108.Ft "const char *"
109.Fn pmc_name_of_disposition "enum pmc_disp pd"
110.Ft "const char *"
111.Fn pmc_name_of_event "enum pmc_event pe"
112.Ft "const char *"
113.Fn pmc_name_of_mode "enum pmc_mode pm"
114.Ft "const char *"
115.Fn pmc_name_of_state "enum pmc_state ps"
116.Ft int
117.Fn pmc_ncpu void
118.Ft int
119.Fn pmc_npmc "int cpu"
120.Ft int
121.Fn pmc_pmcinfo "int cpu" "struct pmc_pmcinfo **pmc_info"
122.Ft int
123.Fn pmc_read "pmc_id_t pmc" "pmc_value_t *value"
124.Ft int
125.Fn pmc_release "pmc_id_t pmc"
126.Ft int
127.Fn pmc_rw "pmc_id_t pmc" "pmc_value_t newvalue" "pmc_value_t *oldvaluep"
128.Ft int
129.Fn pmc_set "pmc_id_t pmc" "pmc_value_t value"
130.Ft int
131.Fn pmc_start "pmc_id_t pmc"
132.Ft int
133.Fn pmc_stop "pmc_id_t pmc"
134.Ft int
135.Fn pmc_write "pmc_id_t pmc" "pmc_value_t value"
136.Ft int
137.Fn pmc_writelog "uint32_t userdata"
138.Ft int
139.Fn pmc_width "pmc_id_t pmc" "uint32_t *width"
140.Sh DESCRIPTION
141These functions implement a high-level library for using the
142system's hardware performance counters.
143.Pp
144PMCs are allocated using
145.Fn pmc_allocate ,
146released using
147.Fn pmc_release
148and read using
149.Fn pmc_read .
150Allocated PMCs may be started or stopped at any time using
151.Fn pmc_start
152and
153.Fn pmc_stop
154respectively.
155An allocated PMC may be of
156.Dq global
157scope, meaning that the PMC measures system-wide events, or
158.Dq process-private
159scope, meaning that the PMC only counts hardware events when
160the allocating process (or, optionally, its children)
161are active.
162.Pp
163PMCs may further be in
164.Dq "counting mode" ,
165or in
166.Dq "sampling mode" .
167Sampling mode PMCs deliver an interrupt to the CPU after
168a configured number of hardware events have been seen.
169A process-private sampling mode PMC will cause its owner
170process to get periodic
171.Dv SIGPROF
172interrupts, while a global sampling mode PMC is used to
173do system-wide statistical sampling (see
174.Xr hwpmc 4 ) .
175The sampling rate desired of a sampling-mode PMC is set using
176.Fn pmc_set .
177Counting mode PMCs do not interrupt the CPU; their values
178can be read using
179.Fn pmc_read .
180.Pp
181System-wide statistical sampling is configured by allocating
182at least one sampling mode PMC with
183global scope, and by configuring a log file using
184.Fn pmc_configure_logfile .
185The
186.Xr hwpmc 4
187driver manages system-wide statistical sampling; for more
188information please see
189.Xr hwpmc 4 .
190.Ss Application Programming Interface
191The function
192.Fn pmc_init
193initializes the
194.Nm pmc
195library.
196This function must be called first, before any of the other
197functions in the library.
198.Pp
199The function
200.Fn pmc_allocate
201allocates a counter that counts the events named by
202.Fa eventspecifier ,
203and writes the allocated counter ID to
204.Fa *pmcid .
205Argument
206.Fa eventspecifier
207comprises a PMC event name followed by an optional comma separated
208list of keywords and qualifiers.
209The allowed syntax for
210.Fa eventspecifier
211is processor architecture specific and is listed in section
212.Sx "EVENT SPECIFIERS"
213below.
214The desired PMC mode is specified by
215.Fa mode ,
216and any mode specific modifiers are specified using
217.Fa flags .
218The
219.Fa cpu
220argument is the value
221.Dv PMC_CPU_ANY ,
222or names the CPU the allocation is to be on.
223Requesting a specific CPU only makes sense for global PMCs;
224process-private PMC allocations should always specify
225.Dv PMC_CPU_ANY .
226.Pp
227By default, a PMC configured in process-virtual counting mode is set up
228to profile its owner process.
229The function
230.Fn pmc_attach
231may be used to attach the PMC to a different process.
232It
233needs to be called before the counter is first started
234with
235.Fn pmc_start .
236The function
237.Fn pmc_detach
238may be used to detach a PMC from a process it was attached to
239using a prior call to
240.Fn pmc_attach .
241.Pp
242The function
243.Fn pmc_release
244releases a PMC previously allocated with
245.Fn pmc_allocate .
246This function call implicitly detaches the PMC from all its target
247processes.
248.Pp
249An allocated PMC may be started and stopped using
250.Fn pmc_start
251and
252.Fn pmc_stop
253respectively.
254.Pp
255The current value of a PMC may be read with
256.Fn pmc_read
257and written using
258.Fn pmc_write ,
259provided the underlying hardware supports these operations on
260the allocated PMC.
261The read and write operation may be combined using
262.Fn pmc_rw .
263.Pp
264The function
265.Fn pmc_capabilities
266sets argument
267.Fa caps
268to a bitmask of capabilities supported by the PMC denoted by
269argument
270.Fa pmc .
271The function
272.Fn pmc_width
273sets argument
274.Fa width
275to the width of the PMC denoted by argument
276.Fa pmc .
277.Pp
278The
279.Fn pmc_configure_logfile
280function causes the
281.Xr hwpmc 4
282driver to log performance data to the file corresponding
283to the process' file handle
284.Fa fd .
285If argument
286.Fa fd
287is \-1, then any previously configured logging is reset
288and all data queued to be written are discarded.
289.Pp
290The
291.Fn pmc_flush_logfile
292function will send all data queued inside the
293.Xr hwpmc 4
294driver to the configured log file before returning.
295The
296.Fn pmc_writelog
297function will append a log entry containing the argument
298.Fa userdata
299to the log file.
300.Pp
301The function
302.Fn pmc_set
303configures a sampling PMC
304.Fa pmc
305to interrupt every
306.Fa value
307events.
308For counting PMCs,
309.Fn pmc_set
310sets the initial value of the PMC to
311.Fa value .
312.Pp
313The function
314.Fn pmc_get_driver_stats
315copies a snapshot of the usage statistics maintained by
316.Xr hwpmc 4
317into the memory area pointed to by argument
318.Fa gms .
319.Ss Signal Handling Requirements
320Applications using PMCs are required to handle the following signals:
321.Bl -tag -width indent
322.It Dv SIGBUS
323When the
324.Xr hwpmc 4
325module is unloaded using
326.Xr kldunload 8 ,
327processes that have PMCs allocated to them will be sent a
328.Dv SIGBUS
329signal.
330.It Dv SIGIO
331The
332.Xr hwpmc 4
333driver will send a PMC owning process a
334.Dv SIGIO
335signal if:
336.Bl -bullet
337.It
338Any process-mode PMC allocated by it loses all its
339target processes.
340.It
341The driver encounters an error when writing log data to a
342configured log file.
343This error may be retrieved by a subsequent call to
344.Fn pmc_flush_logfile .
345.El
346.El
347.Ss Convenience Functions
348The function
349.Fn pmc_ncpu
350returns the number of CPUs present in the system.
351.Pp
352The function
353.Fn pmc_npmc
354returns the number of PMCs supported on CPU
355.Fa cpu .
356The function
357.Fn pmc_cpuinfo
358sets argument
359.Fa cpu_info
360to point to a structure with information about the system's CPUs.
361Function
362.Fn pmc_pmcinfo
363returns information about the current state of CPU
364.Fa cpu Ns 's
365PMCs.
366This function sets argument
367.Fa *pmc_info
368to point to a memory area allocated with
369.Xr calloc 3 .
370The caller is expected to
371.Fn free
372the area when done.
373.Pp
374The functions
375.Fn pmc_name_of_capability ,
376.Fn pmc_name_of_class ,
377.Fn pmc_name_of_cputype ,
378.Fn pmc_name_of_disposition ,
379.Fn pmc_name_of_event ,
380.Fn pmc_name_of_mode
381and
382.Fn pmc_name_of_state
383are useful for code wanting to print error messages.
384They return
385.Vt "const char *"
386pointers to human-readable representations of their arguments.
387These return values should not be freed using
388.Xr free 3 .
389.Pp
390The function
391.Fn pmc_event_names_of_class
392returns a list of event names supported by a given PMC class
393.Fa cl .
394On successful return, an array of
395.Vt "const char *"
396pointers to the names of valid events supported by class
397.Fa cl
398is allocated by the library using
399.Xr malloc 3 ,
400and a pointer to this array is returned in the location pointed to by
401.Fa eventnames .
402The number of pointers allocated is returned in the location pointed
403to by
404.Fa nevents .
405.Ss Administration
406Individual PMCs may be enabled or disabled on a given CPU using
407.Fn pmc_enable
408and
409.Fn pmc_disable
410respectively.
411For these functions,
412.Fa cpu
413is the CPU number, and
414.Fa pmc
415is the index of the PMC to be operated on.
416Only the super-user is allowed to enable and disable PMCs.
417.Ss x86 Architecture Specific API
418The
419.Fn pmc_get_msr
420function returns the processor model specific register number
421associated with
422.Fa pmc .
423Applications may use the x86
424.Ic RDPMC
425instruction to directly read the contents of the PMC.
426.Sh EVENT SPECIFIERS
427Event specifiers are strings comprising an event name, followed by
428optional parameters modifying the semantics of the hardware event
429being probed.
430Event names are PMC architecture dependent, but the
431.Nm pmc
432library defines machine independent aliases for commonly used
433events.
434.Ss Event Name Aliases
435Event name aliases are CPU architecture independent names for commonly
436used events.
437The following aliases are known to this version of the
438.Nm pmc
439library:
440.Bl -tag -width indent
441.It Li branches
442Measure the number of branches retired.
443.It Li branch-mispredicts
444Measure the number of retired branches that were mispredicted.
445.It Li cycles
446Measure processor cycles.
447This event is implemented using the processor's Time Stamp Counter
448register.
449.It Li dc-misses
450Measure the number of data cache misses.
451.It Li ic-misses
452Measure the number of instruction cache misses.
453.It Li instructions
454Measure the number of instructions retired.
455.It Li interrupts
456Measure the number of interrupts seen.
457.It Li unhalted-cycles
458Measure the number of cycles the processor is not in a halted
459or sleep state.
460.El
461.Ss Time Stamp Counter (TSC)
462The timestamp counter is a monotonically non-decreasing counter that
463counts processor cycles.
464.Pp
465In the i386 architecture, this counter may
466be selected by requesting an event with event specifier
467.Dq Li tsc .
468The
469.Dq Li tsc
470event does not support any further qualifiers.
471It can only be allocated in system-wide counting mode,
472and is a read-only counter.
473Multiple processes are allowed to allocate the TSC.
474Once allocated, it may be read using the
475.Fn pmc_read
476function, or by using the RDTSC instruction.
477.Ss AMD (K7) PMCs
478These PMCs are present in the
479.Tn "AMD Athlon"
480series of CPUs and are documented in:
481.Rs
482.%B "AMD Athlon Processor x86 Code Optimization Guide"
483.%N "Publication No. 22007"
484.%D "February 2002"
485.%Q "Advanced Micro Devices, Inc."
486.Re
487.Pp
488Event specifiers for AMD K7 PMCs can have the following optional
489qualifiers:
490.Bl -tag -width indent
491.It Li count= Ns Ar value
492Configure the counter to increment only if the number of configured
493events measured in a cycle is greater than or equal to
494.Ar value .
495.It Li edge
496Configure the counter to only count negated-to-asserted transitions
497of the conditions expressed by the other qualifiers.
498In other words, the counter will increment only once whenever a given
499condition becomes true, irrespective of the number of clocks during
500which the condition remains true.
501.It Li inv
502Invert the sense of comparison when the
503.Dq Li count
504qualifier is present, making the counter increment when the
505number of events per cycle is less than the value specified by
506the
507.Dq Li count
508qualifier.
509.It Li os
510Configure the PMC to count events happening at privilege level 0.
511.It Li unitmask= Ns Ar mask
512This qualifier is used to further qualify a select few events,
513.Dq Li k7-dc-refills-from-l2 ,
514.Dq Li k7-dc-refills-from-system
515and
516.Dq Li k7-dc-writebacks .
517Here
518.Ar mask
519is a string of the following characters optionally separated by
520.Ql +
521characters:
522.Pp
523.Bl -tag -width indent -compact
524.It Li m
525Count operations for lines in the
526.Dq Modified
527state.
528.It Li o
529Count operations for lines in the
530.Dq Owner
531state.
532.It Li e
533Count operations for lines in the
534.Dq Exclusive
535state.
536.It Li s
537Count operations for lines in the
538.Dq Shared
539state.
540.It Li i
541Count operations for lines in the
542.Dq Invalid
543state.
544.El
545.Pp
546If no
547.Dq Li unitmask
548qualifier is specified, the default is to count events for cache
549lines in any of the above states.
550.It Li usr
551Configure the PMC to count events occurring at privilege levels 1, 2
552or 3.
553.El
554.Pp
555If neither of the
556.Dq Li os
557or
558.Dq Li usr
559qualifiers were specified, the default is to enable both.
560.Pp
561The event specifiers supported on AMD K7 PMCs are:
562.Bl -tag -width indent
563.It Li k7-dc-accesses
564Count data cache accesses.
565.It Li k7-dc-misses
566Count data cache misses.
567.It Li k7-dc-refills-from-l2 Op Li ,unitmask= Ns Ar mask
568Count data cache refills from L2 cache.
569This event may be further qualified using the
570.Dq Li unitmask
571qualifier.
572.It Li k7-dc-refills-from-system Op Li ,unitmask= Ns Ar mask
573Count data cache refills from system memory.
574This event may be further qualified using the
575.Dq Li unitmask
576qualifier.
577.It Li k7-dc-writebacks Op Li ,unitmask= Ns Ar mask
578Count data cache writebacks.
579This event may be further qualified using the
580.Dq Li unitmask
581qualifier.
582.It Li k7-l1-dtlb-miss-and-l2-dtlb-hits
583Count L1 DTLB misses and L2 DTLB hits.
584.It Li k7-l1-and-l2-dtlb-misses
585Count L1 and L2 DTLB misses.
586.It Li k7-misaligned-references
587Count misaligned data references.
588.It Li k7-ic-fetches
589Count instruction cache fetches.
590.It Li k7-ic-misses
591Count instruction cache misses.
592.It Li k7-l1-itlb-misses
593Count L1 ITLB misses that are L2 ITLB hits.
594.It Li k7-l1-l2-itlb-misses
595Count L1 (and L2) ITLB misses.
596.It Li k7-retired-instructions
597Count all retired instructions.
598.It Li k7-retired-ops
599Count retired ops.
600.It Li k7-retired-branches
601Count all retired branches (conditional, unconditional, exceptions
602and interrupts).
603.It Li k7-retired-branches-mispredicted
604Count all mispredicted retired branches.
605.It Li k7-retired-taken-branches
606Count retired taken branches.
607.It Li k7-retired-taken-branches-mispredicted
608Count mispredicted taken branches that were retired.
609.It Li k7-retired-far-control-transfers
610Count retired far control transfers.
611.It Li k7-retired-resync-branches
612Count retired resync branches (non control transfer branches).
613.It Li k7-interrupts-masked-cycles
614Count the number of cycles when the processor's
615.Va IF
616flag was zero.
617.It Li k7-interrupts-masked-while-pending-cycles
618Count the number of cycles interrupts were masked while pending due
619to the processor's
620.Va IF
621flag being zero.
622.It Li k7-hardware-interrupts
623Count the number of taken hardware interrupts.
624.El
625.Ss AMD (K8) PMCs
626These PMCs are present in the
627.Tn "AMD Athlon64"
628and
629.Tn "AMD Opteron"
630series of CPUs.
631They are documented in:
632.Rs
633.%B "BIOS and Kernel Developer's Guide for the AMD Athlon(tm) 64 and AMD Opteron Processors"
634.%N "Publication No. 26094"
635.%D "April 2004"
636.%Q "Advanced Micro Devices, Inc."
637.Re
638.Pp
639Event specifiers for AMD K8 PMCs can have the following optional
640qualifiers:
641.Bl -tag -width indent
642.It Li count= Ns Ar value
643Configure the counter to increment only if the number of configured
644events measured in a cycle is greater than or equal to
645.Ar value .
646.It Li edge
647Configure the counter to only count negated-to-asserted transitions
648of the conditions expressed by the other fields.
649In other words, the counter will increment only once whenever a given
650condition becomes true, irrespective of the number of clocks during
651which the condition remains true.
652.It Li inv
653Invert the sense of comparison when the
654.Dq Li count
655qualifier is present, making the counter increment when the
656number of events per cycle is less than the value specified by
657the
658.Dq Li count
659qualifier.
660.It Li mask= Ns Ar qualifier
661Many event specifiers for AMD K8 PMCs need to be additionally
662qualified using a mask qualifier.
663These additional qualifiers are event-specific and are documented
664along with their associated event specifiers below.
665.It Li os
666Configure the PMC to count events happening at privilege level 0.
667.It Li usr
668Configure the PMC to count events occurring at privilege levels 1, 2
669or 3.
670.El
671.Pp
672If neither of the
673.Dq Li os
674or
675.Dq Li usr
676qualifiers were specified, the default is to enable both.
677.Pp
678The event specifiers supported on AMD K8 PMCs are:
679.Bl -tag -width indent
680.It Li k8-bu-cpu-clk-unhalted
681Count the number of clock cycles when the CPU is not in the HLT or
682STPCLK states.
683.It Li k8-bu-fill-request-l2-miss Op Li ,mask= Ns Ar qualifier
684Count fill requests that missed in the L2 cache.
685This event may be further qualified using
686.Ar qualifier ,
687which is a
688.Ql +
689separated set of the following keywords:
690.Pp
691.Bl -tag -width indent -compact
692.It Li dc-fill
693Count data cache fill requests.
694.It Li ic-fill
695Count instruction cache fill requests.
696.It Li tlb-reload
697Count TLB reloads.
698.El
699.Pp
700The default is to count all types of requests.
701.It Li k8-bu-internal-l2-request Op Li ,mask= Ns Ar qualifier
702Count internally generated requests to the L2 cache.
703This event may be further qualified using
704.Ar qualifier ,
705which is a
706.Ql +
707separated set of the following keywords:
708.Pp
709.Bl -tag -width indent -compact
710.It Li cancelled
711Count cancelled requests.
712.It Li dc-fill
713Count data cache fill requests.
714.It Li ic-fill
715Count instruction cache fill requests.
716.It Li tag-snoop
717Count tag snoop requests.
718.It Li tlb-reload
719Count TLB reloads.
720.El
721.Pp
722The default is to count all types of requests.
723.It Li k8-dc-access
724Count data cache accesses including microcode scratchpad accesses.
725.It Li k8-dc-copyback Op Li ,mask= Ns Ar qualifier
726Count data cache copyback operations.
727This event may be further qualified using
728.Ar qualifier ,
729which is a
730.Ql +
731separated set of the following keywords:
732.Pp
733.Bl -tag -width indent -compact
734.It Li exclusive
735Count operations for lines in the
736.Dq exclusive
737state.
738.It Li invalid
739Count operations for lines in the
740.Dq invalid
741state.
742.It Li modified
743Count operations for lines in the
744.Dq modified
745state.
746.It Li owner
747Count operations for lines in the
748.Dq owner
749state.
750.It Li shared
751Count operations for lines in the
752.Dq shared
753state.
754.El
755.Pp
756The default is to count operations for lines in all the
757above states.
758.It Li k8-dc-dcache-accesses-by-locks Op Li ,mask= Ns Ar qualifier
759Count data cache accesses by lock instructions.
760This event is only available on processors of revision C or later
761vintage.
762This event may be further qualified using
763.Ar qualifier ,
764which is a
765.Ql +
766separated set of the following keywords:
767.Pp
768.Bl -tag -width indent -compact
769.It Li accesses
770Count data cache accesses by lock instructions.
771.It Li misses
772Count data cache misses by lock instructions.
773.El
774.Pp
775The default is to count all accesses.
776.It Li k8-dc-dispatched-prefetch-instructions Op Li ,mask= Ns Ar qualifier
777Count the number of dispatched prefetch instructions.
778This event may be further qualified using
779.Ar qualifier ,
780which is a
781.Ql +
782separated set of the following keywords:
783.Pp
784.Bl -tag -width indent -compact
785.It Li load
786Count load operations.
787.It Li nta
788Count non-temporal operations.
789.It Li store
790Count store operations.
791.El
792.Pp
793The default is to count all operations.
794.It Li k8-dc-l1-dtlb-miss-and-l2-dtlb-hit
795Count L1 DTLB misses that are L2 DTLB hits.
796.It Li k8-dc-l1-dtlb-miss-and-l2-dtlb-miss
797Count L1 DTLB misses that are also misses in the L2 DTLB.
798.It Li k8-dc-microarchitectural-early-cancel-of-an-access
799Count microarchitectural early cancels of data cache accesses.
800.It Li k8-dc-microarchitectural-late-cancel-of-an-access
801Count microarchitectural late cancels of data cache accesses.
802.It Li k8-dc-misaligned-data-reference
803Count misaligned data references.
804.It Li k8-dc-miss
805Count data cache misses.
806.It Li k8-dc-one-bit-ecc-error Op Li ,mask= Ns Ar qualifier
807Count one bit ECC errors found by the scrubber.
808This event may be further qualified using
809.Ar qualifier ,
810which is a
811.Ql +
812separated set of the following keywords:
813.Pp
814.Bl -tag -width indent -compact
815.It Li scrubber
816Count scrubber detected errors.
817.It Li piggyback
818Count piggyback scrubber errors.
819.El
820.Pp
821The default is to count both kinds of errors.
822.It Li k8-dc-refill-from-l2 Op Li ,mask= Ns Ar qualifier
823Count data cache refills from L2 cache.
824This event may be further qualified using
825.Ar qualifier ,
826which is a
827.Ql +
828separated set of the following keywords:
829.Pp
830.Bl -tag -width indent -compact
831.It Li exclusive
832Count operations for lines in the
833.Dq exclusive
834state.
835.It Li invalid
836Count operations for lines in the
837.Dq invalid
838state.
839.It Li modified
840Count operations for lines in the
841.Dq modified
842state.
843.It Li owner
844Count operations for lines in the
845.Dq owner
846state.
847.It Li shared
848Count operations for lines in the
849.Dq shared
850state.
851.El
852.Pp
853The default is to count operations for lines in all the
854above states.
855.It Li k8-dc-refill-from-system Op Li ,mask= Ns Ar qualifier
856Count data cache refills from system memory.
857This event may be further qualified using
858.Ar qualifier ,
859which is a
860.Ql +
861separated set of the following keywords:
862.Pp
863.Bl -tag -width indent -compact
864.It Li exclusive
865Count operations for lines in the
866.Dq exclusive
867state.
868.It Li invalid
869Count operations for lines in the
870.Dq invalid
871state.
872.It Li modified
873Count operations for lines in the
874.Dq modified
875state.
876.It Li owner
877Count operations for lines in the
878.Dq owner
879state.
880.It Li shared
881Count operations for lines in the
882.Dq shared
883state.
884.El
885.Pp
886The default is to count operations for lines in all the
887above states.
888.It Li k8-fp-dispatched-fpu-ops Op Li ,mask= Ns Ar qualifier
889Count the number of dispatched FPU ops.
890This event is supported in revision B and later CPUs.
891This event may be further qualified using
892.Ar qualifier ,
893which is a
894.Ql +
895separated set of the following keywords:
896.Pp
897.Bl -tag -width indent -compact
898.It Li add-pipe-excluding-junk-ops
899Count add pipe ops excluding junk ops.
900.It Li add-pipe-junk-ops
901Count junk ops in the add pipe.
902.It Li multiply-pipe-excluding-junk-ops
903Count multiply pipe ops excluding junk ops.
904.It Li multiply-pipe-junk-ops
905Count junk ops in the multiply pipe.
906.It Li store-pipe-excluding-junk-ops
907Count store pipe ops excluding junk ops.
908.It Li store-pipe-junk-ops
909Count junk ops in the store pipe.
910.El
911.Pp
912The default is to count all types of ops.
913.It Li k8-fp-cycles-with-no-fpu-ops-retired
914Count cycles when no FPU ops were retired.
915This event is supported in revision B and later CPUs.
916.It Li k8-fp-dispatched-fpu-fast-flag-ops
917Count dispatched FPU ops that use the fast flag interface.
918This event is supported in revision B and later CPUs.
919.It Li k8-fr-decoder-empty
920Count cycles when there was nothing to dispatch (i.e., the decoder
921was empty).
922.It Li k8-fr-dispatch-stalls
923Count all dispatch stalls.
924.It Li k8-fr-dispatch-stall-for-segment-load
925Count dispatch stalls for segment loads.
926.It Li k8-fr-dispatch-stall-for-serialization
927Count dispatch stalls for serialization.
928.It Li k8-fr-dispatch-stall-from-branch-abort-to-retire
929Count dispatch stalls from branch abort to retiral.
930.It Li k8-fr-dispatch-stall-when-fpu-is-full
931Count dispatch stalls when the FPU is full.
932.It Li k8-fr-dispatch-stall-when-ls-is-full
933Count dispatch stalls when the load/store unit is full.
934.It Li k8-fr-dispatch-stall-when-reorder-buffer-is-full
935Count dispatch stalls when the reorder buffer is full.
936.It Li k8-fr-dispatch-stall-when-reservation-stations-are-full
937Count dispatch stalls when reservation stations are full.
938.It Li k8-fr-dispatch-stall-when-waiting-for-all-to-be-quiet
939Count dispatch stalls when waiting for all to be quiet.
940.\" XXX What does "waiting for all to be quiet" mean?
941.It Li k8-fr-dispatch-stall-when-waiting-far-xfer-or-resync-branch-pending
942Count dispatch stalls when a far control transfer or a resync branch
943is pending.
944.It Li k8-fr-fpu-exceptions Op Li ,mask= Ns Ar qualifier
945Count FPU exceptions.
946This event is supported in revision B and later CPUs.
947This event may be further qualified using
948.Ar qualifier ,
949which is a
950.Ql +
951separated set of the following keywords:
952.Pp
953.Bl -tag -width indent -compact
954.It Li sse-and-x87-microtraps
955Count SSE and x87 microtraps.
956.It Li sse-reclass-microfaults
957Count SSE reclass microfaults.
958.It Li sse-retype-microfaults
959Count SSE retype microfaults.
960.It Li x87-reclass-microfaults
961Count x87 reclass microfaults.
962.El
963.Pp
964The default is to count all types of exceptions.
965.It Li k8-fr-interrupts-masked-cycles
966Count cycles when interrupts were masked (i.e., when CPU RFLAGS field IF was zero).
967.It Li k8-fr-interrupts-masked-while-pending-cycles
968Count cycles while interrupts were masked while pending (i.e., cycles
969when INTR was asserted while CPU RFLAGS field IF was zero).
970.It Li k8-fr-number-of-breakpoints-for-dr0
971Count the number of breakpoints for DR0.
972.It Li k8-fr-number-of-breakpoints-for-dr1
973Count the number of breakpoints for DR1.
974.It Li k8-fr-number-of-breakpoints-for-dr2
975Count the number of breakpoints for DR2.
976.It Li k8-fr-number-of-breakpoints-for-dr3
977Count the number of breakpoints for DR3.
978.It Li k8-fr-retired-branches
979Count retired branches including exceptions and interrupts.
980.It Li k8-fr-retired-branches-mispredicted
981Count mispredicted retired branches.
982.It Li k8-fr-retired-far-control-transfers
983Count retired far control transfers (which are always mispredicted).
984.It Li k8-fr-retired-fastpath-double-op-instructions Op Li ,mask= Ns Ar qualifier
985Count retired fastpath double op instructions.
986This event is supported in revision B and later CPUs.
987This event may be further qualified using
988.Ar qualifier ,
989which is a
990.Ql +
991separated set of the following keywords:
992.Pp
993.Bl -tag -width indent -compact
994.It Li low-op-pos-0
995Count instructions with the low op in position 0.
996.It Li low-op-pos-1
997Count instructions with the low op in position 1.
998.It Li low-op-pos-2
999Count instructions with the low op in position 2.
1000.El
1001.Pp
1002The default is to count all types of instructions.
1003.It Li k8-fr-retired-fpu-instructions Op Li ,mask= Ns Ar qualifier
1004Count retired FPU instructions.
1005This event is supported in revision B and later CPUs.
1006This event may be further qualified using
1007.Ar qualifier ,
1008which is a
1009.Ql +
1010separated set of the following keywords:
1011.Pp
1012.Bl -tag -width indent -compact
1013.It Li mmx-3dnow
1014Count MMX and 3DNow!\& instructions.
1015.It Li packed-sse-sse2
1016Count packed SSE and SSE2 instructions.
1017.It Li scalar-sse-sse2
1018Count scalar SSE and SSE2 instructions.
1019.It Li x87
1020Count x87 instructions.
1021.El
1022.Pp
1023The default is to count all types of instructions.
1024.It Li k8-fr-retired-near-returns
1025Count retired near returns.
1026.It Li k8-fr-retired-near-returns-mispredicted
1027Count mispredicted near returns.
1028.It Li k8-fr-retired-resyncs
1029Count retired resyncs (non-control transfer branches).
1030.It Li k8-fr-retired-taken-hardware-interrupts
1031Count retired taken hardware interrupts.
1032.It Li k8-fr-retired-taken-branches
1033Count retired taken branches.
1034.It Li k8-fr-retired-taken-branches-mispredicted
1035Count retired taken branches that were mispredicted.
1036.It Li k8-fr-retired-taken-branches-mispredicted-by-addr-miscompare
1037Count retired taken branches that were mispredicted only due to an
1038address miscompare.
1039.It Li k8-fr-retired-uops
1040Count retired uops.
1041.It Li k8-fr-retired-x86-instructions
1042Count retired x86 instructions including exceptions and interrupts.
1043.It Li k8-ic-fetch
1044Count instruction cache fetches.
1045.It Li k8-ic-instruction-fetch-stall
1046Count cycles in stalls due to instruction fetch.
1047.It Li k8-ic-l1-itlb-miss-and-l2-itlb-hit
1048Count L1 ITLB misses that are L2 ITLB hits.
1049.It Li k8-ic-l1-itlb-miss-and-l2-itlb-miss
1050Count ITLB misses that miss in both L1 and L2 ITLBs.
1051.It Li k8-ic-microarchitectural-resync-by-snoop
1052Count microarchitectural resyncs caused by snoops.
1053.It Li k8-ic-miss
1054Count instruction cache misses.
1055.It Li k8-ic-refill-from-l2
1056Count instruction cache refills from L2 cache.
1057.It Li k8-ic-refill-from-system
1058Count instruction cache refills from system memory.
1059.It Li k8-ic-return-stack-hits
1060Count hits to the return stack.
1061.It Li k8-ic-return-stack-overflow
1062Count overflows of the return stack.
1063.It Li k8-ls-buffer2-full
1064Count load/store buffer2 full events.
1065.It Li k8-ls-locked-operation Op Li ,mask= Ns Ar qualifier
1066Count locked operations.
1067For revision C and later CPUs, the following qualifiers are supported:
1068.Pp
1069.Bl -tag -width indent -compact
1070.It Li cycles-in-request
1071Count the number of cycles in the lock request/grant stage.
1072.It Li cycles-to-complete
1073Count the number of cycles a lock takes to complete once it is
1074non-speculative and is the oldest load/store operation.
1075.It Li locked-instructions
1076Count the number of lock instructions executed.
1077.El
1078.Pp
1079The default is to count the number of lock instructions executed.
1080.It Li k8-ls-microarchitectural-late-cancel
1081Count microarchitectural late cancels of operations in the load/store
1082unit.
1083.It Li k8-ls-microarchitectural-resync-by-self-modifying-code
1084Count microarchitectural resyncs caused by self-modifying code.
1085.It Li k8-ls-microarchitectural-resync-by-snoop
1086Count microarchitectural resyncs caused by snoops.
1087.It Li k8-ls-retired-cflush-instructions
1088Count retired CLFLUSH instructions.
1089.It Li k8-ls-retired-cpuid-instructions
1090Count retired CPUID instructions.
1091.It Li k8-ls-segment-register-load Op Li ,mask= Ns Ar qualifier
1092Count segment register loads.
1093This event may be further qualified using
1094.Ar qualifier ,
1095which is a
1096.Ql +
1097separated set of the following keywords:
1098.Bl -tag -width indent -compact
1099.It Li cs
1100Count CS register loads.
1101.It Li ds
1102Count DS register loads.
1103.It Li es
1104Count ES register loads.
1105.It Li fs
1106Count FS register loads.
1107.It Li gs
1108Count GS register loads.
1109.\" .It Li hs
1110.\" Count HS register loads.
1111.\" XXX "HS" register?
1112.It Li ss
1113Count SS register loads.
1114.El
1115.Pp
1116The default is to count all types of loads.
1117.It Li k8-nb-memory-controller-bypass-saturation Op Li ,mask= Ns Ar qualifier
1118Count memory controller bypass counter saturation events.
1119This event may be further qualified using
1120.Ar qualifier ,
1121which is a
1122.Ql +
1123separated set of the following keywords:
1124.Pp
1125.Bl -tag -width indent -compact
1126.It Li dram-controller-interface-bypass
1127Count DRAM controller interface bypass.
1128.It Li dram-controller-queue-bypass
1129Count DRAM controller queue bypass.
1130.It Li memory-controller-hi-pri-bypass
1131Count memory controller high priority bypasses.
1132.It Li memory-controller-lo-pri-bypass
1133Count memory controller low priority bypasses.
1134.El
1135.Pp
1136.It Li k8-nb-memory-controller-dram-slots-missed
1137Count memory controller DRAM command slots missed (in MemClks).
1138.It Li k8-nb-memory-controller-page-access-event Op Li ,mask= Ns Ar qualifier
1139Count memory controller page access events.
1140This event may be further qualified using
1141.Ar qualifier ,
1142which is a
1143.Ql +
1144separated set of the following keywords:
1145.Pp
1146.Bl -tag -width indent -compact
1147.It Li page-conflict
1148Count page conflicts.
1149.It Li page-hit
1150Count page hits.
1151.It Li page-miss
1152Count page misses.
1153.El
1154.Pp
1155The default is to count all types of events.
1156.It Li k8-nb-memory-controller-page-table-overflow
1157Count memory controller page table overflow events.
1158.It Li k8-nb-probe-result Op Li ,mask= Ns Ar qualifier
1159Count probe events.
1160This event may be further qualified using
1161.Ar qualifier ,
1162which is a
1163.Ql +
1164separated set of the following keywords:
1165.Pp
1166.Bl -tag -width indent -compact
1167.It Li probe-hit
1168Count all probe hits.
1169.It Li probe-hit-dirty-no-memory-cancel
1170Count probe hits without memory cancels.
1171.It Li probe-hit-dirty-with-memory-cancel
1172Count probe hits with memory cancels.
1173.It Li probe-miss
1174Count probe misses.
1175.El
1176.It Li k8-nb-sized-commands Op Li ,mask= Ns Ar qualifier
1177Count sized commands issued.
1178This event may be further qualified using
1179.Ar qualifier ,
1180which is a
1181.Ql +
1182separated set of the following keywords:
1183.Pp
1184.Bl -tag -width indent -compact
1185.It Li nonpostwrszbyte
1186.It Li nonpostwrszdword
1187.It Li postwrszbyte
1188.It Li postwrszdword
1189.It Li rdszbyte
1190.It Li rdszdword
1191.It Li rdmodwr
1192.El
1193.Pp
1194The default is to count all types of commands.
1195.It Li k8-nb-memory-controller-turnaround Op Li ,mask= Ns Ar qualifier
1196Count memory controller turnaround events.
1197This event may be further qualified using
1198.Ar qualifier ,
1199which is a
1200.Ql +
1201separated set of the following keywords:
1202.Pp
1203.Bl -tag -width indent -compact
1204.\" XXX doc is unclear whether these are cycle counts or event counts
1205.It Li dimm-turnaround
1206Count DIMM turnarounds.
1207.It Li read-to-write-turnaround
1208Count read to write turnarounds.
1209.It Li write-to-read-turnaround
1210Count write to read turnarounds.
1211.El
1212.Pp
1213The default is to count all types of events.
1214.It Li k8-nb-ht-bus0-bandwidth Op Li ,mask= Ns Ar qualifier
1215.It Li k8-nb-ht-bus1-bandwidth Op Li ,mask= Ns Ar qualifier
1216.It Li k8-nb-ht-bus2-bandwidth Op Li ,mask= Ns Ar qualifier
1217Count events on the HyperTransport(tm) buses.
1218These events may be further qualified using
1219.Ar qualifier ,
1220which is a
1221.Ql +
1222separated set of the following keywords:
1223.Pp
1224.Bl -tag -width indent -compact
1225.It Li buffer-release
1226Count buffer release messages sent.
1227.It Li command
1228Count command messages sent.
1229.It Li data
1230Count data messages sent.
1231.It Li nop
1232Count nop messages sent.
1233.El
1234.Pp
1235The default is to count all types of messages.
1236.El
1237.Ss Intel P6 PMCS
1238Intel P6 PMCs are present in Intel
1239.Tn "Pentium Pro" ,
1240.Tn "Pentium II" ,
1241.Tn Celeron ,
1242.Tn "Pentium III"
1243and
1244.Tn "Pentium M"
1245processors.
1246.Pp
1247These CPUs have two counters.
1248Some events may only be used on specific counters and some events are
1249defined only on specific processor models.
1250.Pp
1251These PMCs are documented in
1252.Rs
1253.%B "IA-32 Intel(R) Architecture Software Developer's Manual"
1254.%T "Volume 3: System Programming Guide"
1255.%N "Order Number 245472-012"
1256.%D 2003
1257.%Q "Intel Corporation"
1258.Re
1259.Pp
1260Some of these events are affected by processor errata described in
1261.Rs
1262.%B "Intel(R) Pentium(R) III Processor Specification Update"
1263.%N "Document Number: 244453-054"
1264.%D "April 2005"
1265.%Q "Intel Corporation"
1266.Re
1267.Pp
1268Event specifiers for Intel P6 PMCs can have the following common
1269qualifiers:
1270.Bl -tag -width indent
1271.It Li cmask= Ns Ar value
1272Configure the PMC to increment only if the number of configured
1273events measured in a cycle is greater than or equal to
1274.Ar value .
1275.It Li edge
1276Configure the PMC to count the number of deasserted to asserted
1277transitions of the conditions expressed by the other qualifiers.
1278If specified, the counter will increment only once whenever a
1279condition becomes true, irrespective of the number of clocks during
1280which the condition remains true.
1281.It Li inv
1282Invert the sense of comparison when the
1283.Dq Li cmask
1284qualifier is present, making the counter increment when the number of
1285events per cycle is less than the value specified by the
1286.Dq Li cmask
1287qualifier.
1288.It Li os
1289Configure the PMC to count events happening at processor privilege
1290level 0.
1291.It Li umask= Ns Ar value
1292This qualifier is used to further qualify the event selected (see
1293below).
1294.It Li usr
1295Configure the PMC to count events occurring at privilege levels 1, 2
1296or 3.
1297.El
1298.Pp
1299If neither of the
1300.Dq Li os
1301or
1302.Dq Li usr
1303qualifiers is specified, the default is to enable both.
1304.Pp
1305The event specifiers supported by Intel P6 PMCs are:
1306.Bl -tag -width indent
1307.It Li p6-baclears
1308Count the number of times a static branch prediction was made by the
1309branch decoder because the BTB did not have a prediction.
1310.It Li p6-br-bac-missp-exec
1311.Pq Tn "Pentium M"
1312Count the number of branch instructions executed that were
1313mispredicted at the Front End (BAC).
1314.It Li p6-br-bogus
1315Count the number of bogus branches.
1316.It Li p6-br-call-exec
1317.Pq Tn "Pentium M"
1318Count the number of call instructions executed.
1319.It Li p6-br-call-missp-exec
1320.Pq Tn "Pentium M"
1321Count the number of call instructions executed that were mispredicted.
1322.It Li p6-br-cnd-exec
1323.Pq Tn "Pentium M"
1324Count the number of conditional branch instructions executed.
1325.It Li p6-br-cnd-missp-exec
1326.Pq Tn "Pentium M"
1327Count the number of conditional branch instructions executed that were
1328mispredicted.
1329.It Li p6-br-ind-call-exec
1330.Pq Tn "Pentium M"
1331Count the number of indirect call instructions executed.
1332.It Li p6-br-ind-exec
1333.Pq Tn "Pentium M"
1334Count the number of indirect branch instructions executed.
1335.It Li p6-br-ind-missp-exec
1336.Pq Tn "Pentium M"
1337Count the number of indirect branch instructions executed that were
1338mispredicted.
1339.It Li p6-br-inst-decoded
1340Count the number of branch instructions decoded.
1341.It Li p6-br-inst-exec
1342.Pq Tn "Pentium M"
1343Count the number of branch instructions executed but not necessarily retired.
1344.It Li p6-br-inst-retired
1345Count the number of branch instructions retired.
1346.It Li p6-br-miss-pred-retired
1347Count the number of mispredicted branch instructions retired.
1348.It Li p6-br-miss-pred-taken-ret
1349Count the number of taken mispredicted branches retired.
1350.It Li p6-br-missp-exec
1351.Pq Tn "Pentium M"
1352Count the number of branch instructions executed that were
1353mispredicted at execution.
1354.It Li p6-br-ret-bac-missp-exec
1355.Pq Tn "Pentium M"
1356Count the number of return instructions executed that were
1357mispredicted at the Front End (BAC).
1358.It Li p6-br-ret-exec
1359.Pq Tn "Pentium M"
1360Count the number of return instructions executed.
1361.It Li p6-br-ret-missp-exec
1362.Pq Tn "Pentium M"
1363Count the number of return instructions executed that were
1364mispredicted at execution.
1365.It Li p6-br-taken-retired
1366Count the number of taken branches retired.
1367.It Li p6-btb-misses
1368Count the number of branches for which the BTB did not produce a
1369prediction.
1370.It Li p6-bus-bnr-drv
1371Count the number of bus clock cycles during which this processor is
1372driving the BNR# pin.
1373.It Li p6-bus-data-rcv
1374Count the number of bus clock cycles during which this processor is
1375receiving data.
1376.It Li p6-bus-drdy-clocks Op Li ,umask= Ns Ar qualifier
1377Count the number of clocks during which DRDY# is asserted.
1378An additional qualifier may be specified, and comprises one of the
1379following keywords:
1380.Pp
1381.Bl -tag -width indent -compact
1382.It Li any
1383Count transactions generated by any agent on the bus.
1384.It Li self
1385Count transactions generated by this processor.
1386.El
1387.Pp
1388The default is to count operations generated by this processor.
1389.It Li p6-bus-hit-drv
1390Count the number of bus clock cycles during which this processor is
1391driving the HIT# pin.
1392.It Li p6-bus-hitm-drv
1393Count the number of bus clock cycles during which this processor is
1394driving the HITM# pin.
1395.It Li p6-bus-lock-clocks Op Li ,umask= Ns Ar qualifier
1396Count the number of clocks during which LOCK# is asserted on the
1397external system bus.
1398An additional qualifier may be specified and comprises one of the following
1399keywords:
1400.Pp
1401.Bl -tag -width indent -compact
1402.It Li any
1403Count transactions generated by any agent on the bus.
1404.It Li self
1405Count transactions generated by this processor.
1406.El
1407.Pp
1408The default is to count operations generated by this processor.
1409.It Li p6-bus-req-outstanding
1410Count the number of bus requests outstanding in any given cycle.
1411.It Li p6-bus-snoop-stall
1412Count the number of clock cycles during which the bus is snoop stalled.
1413.It Li p6-bus-tran-any Op Li ,umask= Ns Ar qualifier
1414Count the number of completed bus transactions of any kind.
1415An additional qualifier may be specified and comprises one of the following
1416keywords:
1417.Pp
1418.Bl -tag -width indent -compact
1419.It Li any
1420Count transactions generated by any agent on the bus.
1421.It Li self
1422Count transactions generated by this processor.
1423.El
1424.Pp
1425The default is to count operations generated by this processor.
1426.It Li p6-bus-tran-brd Op Li ,umask= Ns Ar qualifier
1427Count the number of burst read transactions.
1428An additional qualifier may be specified and comprises one of the following
1429keywords:
1430.Pp
1431.Bl -tag -width indent -compact
1432.It Li any
1433Count transactions generated by any agent on the bus.
1434.It Li self
1435Count transactions generated by this processor.
1436.El
1437.Pp
1438The default is to count operations generated by this processor.
1439.It Li p6-bus-tran-burst Op Li ,umask= Ns Ar qualifier
1440Count the number of completed burst transactions.
1441An additional qualifier may be specified and comprises one of the following
1442keywords:
1443.Pp
1444.Bl -tag -width indent -compact
1445.It Li any
1446Count transactions generated by any agent on the bus.
1447.It Li self
1448Count transactions generated by this processor.
1449.El
1450.Pp
1451The default is to count operations generated by this processor.
1452.It Li p6-bus-tran-def Op Li ,umask= Ns Ar qualifier
1453Count the number of completed deferred transactions.
1454An additional qualifier may be specified and comprises one of the following
1455keywords:
1456.Pp
1457.Bl -tag -width indent -compact
1458.It Li any
1459Count transactions generated by any agent on the bus.
1460.It Li self
1461Count transactions generated by this processor.
1462.El
1463.Pp
1464The default is to count operations generated by this processor.
1465.It Li p6-bus-tran-ifetch Op Li ,umask= Ns Ar qualifier
1466Count the number of completed instruction fetch transactions.
1467An additional qualifier may be specified and comprises one of the following
1468keywords:
1469.Pp
1470.Bl -tag -width indent -compact
1471.It Li any
1472Count transactions generated by any agent on the bus.
1473.It Li self
1474Count transactions generated by this processor.
1475.El
1476.Pp
1477The default is to count operations generated by this processor.
1478.It Li p6-bus-tran-inval Op Li ,umask= Ns Ar qualifier
1479Count the number of completed invalidate transactions.
1480An additional qualifier may be specified and comprises one of the following
1481keywords:
1482.Pp
1483.Bl -tag -width indent -compact
1484.It Li any
1485Count transactions generated by any agent on the bus.
1486.It Li self
1487Count transactions generated by this processor.
1488.El
1489.Pp
1490The default is to count operations generated by this processor.
1491.It Li p6-bus-tran-mem Op Li ,umask= Ns Ar qualifier
1492Count the number of completed memory transactions.
1493An additional qualifier may be specified and comprises one of the following
1494keywords:
1495.Pp
1496.Bl -tag -width indent -compact
1497.It Li any
1498Count transactions generated by any agent on the bus.
1499.It Li self
1500Count transactions generated by this processor.
1501.El
1502.Pp
1503The default is to count operations generated by this processor.
1504.It Li p6-bus-tran-pwr Op Li ,umask= Ns Ar qualifier
1505Count the number of completed partial write transactions.
1506An additional qualifier may be specified and comprises one of the following
1507keywords:
1508.Pp
1509.Bl -tag -width indent -compact
1510.It Li any
1511Count transactions generated by any agent on the bus.
1512.It Li self
1513Count transactions generated by this processor.
1514.El
1515.Pp
1516The default is to count operations generated by this processor.
1517.It Li p6-bus-tran-rfo Op Li ,umask= Ns Ar qualifier
1518Count the number of completed read-for-ownership transactions.
1519An additional qualifier may be specified and comprises one of the following
1520keywords:
1521.Pp
1522.Bl -tag -width indent -compact
1523.It Li any
1524Count transactions generated by any agent on the bus.
1525.It Li self
1526Count transactions generated by this processor.
1527.El
1528.Pp
1529The default is to count operations generated by this processor.
1530.It Li p6-bus-trans-io Op Li ,umask= Ns Ar qualifier
1531Count the number of completed I/O transactions.
1532An additional qualifier may be specified and comprises one of the following
1533keywords:
1534.Pp
1535.Bl -tag -width indent -compact
1536.It Li any
1537Count transactions generated by any agent on the bus.
1538.It Li self
1539Count transactions generated by this processor.
1540.El
1541.Pp
1542The default is to count operations generated by this processor.
1543.It Li p6-bus-trans-p Op Li ,umask= Ns Ar qualifier
1544Count the number of completed partial transactions.
1545An additional qualifier may be specified and comprises one of the following
1546keywords:
1547.Pp
1548.Bl -tag -width indent -compact
1549.It Li any
1550Count transactions generated by any agent on the bus.
1551.It Li self
1552Count transactions generated by this processor.
1553.El
1554.Pp
1555The default is to count operations generated by this processor.
1556.It Li p6-bus-trans-wb Op Li ,umask= Ns Ar qualifier
1557Count the number of completed write-back transactions.
1558An additional qualifier may be specified and comprises one of the following
1559keywords:
1560.Pp
1561.Bl -tag -width indent -compact
1562.It Li any
1563Count transactions generated by any agent on the bus.
1564.It Li self
1565Count transactions generated by this processor.
1566.El
1567.Pp
1568The default is to count operations generated by this processor.
1569.It Li p6-cpu-clk-unhalted
1570Count the number of cycles during which the processor was not halted.
1571.Pp
1572.Pq Tn "Pentium M"
1573Count the number of cycles during which the processor was not halted
1574and not in a thermal trip.
1575.It Li p6-cycles-div-busy
1576Count the number of cycles during which the divider is busy and cannot
1577accept new divides.
1578This event is only allocated on counter 0.
1579.It Li p6-cycles-in-pending-and-masked
1580Count the number of processor cycles for which interrupts were
1581disabled and interrupts were pending.
1582.It Li p6-cycles-int-masked
1583Count the number of processor cycles for which interrupts were
1584disabled.
1585.It Li p6-data-mem-refs
1586Count all loads and all stores using any memory type, including
1587internal retries.
1588Each part of a split store is counted separately.
1589.It Li p6-dcu-lines-in
1590Count the total lines allocated in the data cache unit.
1591.It Li p6-dcu-m-lines-in
1592Count the number of M state lines allocated in the data cache unit.
1593.It Li p6-dcu-m-lines-out
1594Count the number of M state lines evicted from the data cache unit.
1595.It Li p6-dcu-miss-outstanding
1596Count the weighted number of cycles while a data cache unit miss is
1597outstanding, incremented by the number of outstanding cache misses at
1598any time.
1599.It Li p6-div
1600Count the number of integer and floating point divides, including speculative divides.
1601This event is only allocated on counter 1.
1602.It Li p6-emon-esp-uops
1603.Pq Tn "Pentium M"
1604Count the total number of micro-ops.
1605.It Li p6-emon-est-trans Op Li ,umask= Ns Ar qualifier
1606.Pq Tn "Pentium M"
1607Count the number of
1608.Tn "Enhanced Intel SpeedStep"
1609transitions.
1610An additional qualifier may be specified, and can be one of the
1611following keywords:
1612.Pp
1613.Bl -tag -width indent -compact
1614.It Li all
1615Count all transitions.
1616.It Li freq
1617Count only frequency transitions.
1618.El
1619.Pp
1620The default is to count all transitions.
1621.It Li p6-emon-fused-uops-ret Op Li ,umask= Ns Ar qualifier
1622.Pq Tn "Pentium M"
1623Count the number of retired fused micro-ops.
1624An additional qualifier may be specified, and may be one of the
1625following keywords:
1626.Pp
1627.Bl -tag -width indent -compact
1628.It Li all
1629Count all fused micro-ops.
1630.It Li loadop
1631Count only load and op micro-ops.
1632.It Li stdsta
1633Count only STD/STA micro-ops.
1634.El
1635.Pp
1636The default is to count all fused micro-ops.
1637.It Li p6-emon-kni-comp-inst-ret Op Li ,umask= Ns Ar qualifier
1638.Pq Tn "Pentium III"
1639Count the number of SSE computational instructions retired.
1640An additional qualifier may be specified, and comprises one of the
1641following keywords:
1642.Pp
1643.Bl -tag -width indent -compact
1644.It Li packed-and-scalar
1645Count packed and scalar operations.
1646.It Li scalar
1647Count scalar operations only.
1648.El
1649.Pp
1650The default is to count packed and scalar operations.
1651.It Li p6-emon-kni-inst-retired Op Li ,umask= Ns Ar qualifier
1652.Pq Tn "Pentium III"
1653Count the number of SSE instructions retired.
1654An additional qualifier may be specified, and comprises one of the
1655following keywords:
1656.Pp
1657.Bl -tag -width indent -compact
1658.It Li packed-and-scalar
1659Count packed and scalar operations.
1660.It Li scalar
1661Count scalar operations only.
1662.El
1663.Pp
1664The default is to count packed and scalar operations.
1665.It Li p6-emon-kni-pref-dispatched Op Li ,umask= Ns Ar qualifier
1666.Pq Tn "Pentium III"
1667Count the number of SSE prefetch or weakly ordered instructions
1668dispatched (including speculative prefetches).
1669An additional qualifier may be specified, and comprises one of the
1670following keywords:
1671.Pp
1672.Bl -tag -width indent -compact
1673.It Li nta
1674Count non-temporal prefetches.
1675.It Li t1
1676Count prefetches to L1.
1677.It Li t2
1678Count prefetches to L2.
1679.It Li wos
1680Count weakly ordered stores.
1681.El
1682.Pp
1683The default is to count non-temporal prefetches.
1684.It Li p6-emon-kni-pref-miss Op Li ,umask= Ns Ar qualifier
1685.Pq Tn "Pentium III"
1686Count the number of prefetch or weakly ordered instructions that miss
1687all caches.
1688An additional qualifier may be specified, and comprises one of the
1689following keywords:
1690.Pp
1691.Bl -tag -width indent -compact
1692.It Li nta
1693Count non-temporal prefetches.
1694.It Li t1
1695Count prefetches to L1.
1696.It Li t2
1697Count prefetches to L2.
1698.It Li wos
1699Count weakly ordered stores.
1700.El
1701.Pp
1702The default is to count non-temporal prefetches.
1703.It Li p6-emon-pref-rqsts-dn
1704.Pq Tn "Pentium M"
1705Count the number of downward prefetches issued.
1706.It Li p6-emon-pref-rqsts-up
1707.Pq Tn "Pentium M"
1708Count the number of upward prefetches issued.
1709.It Li p6-emon-simd-instr-retired
1710.Pq Tn "Pentium M"
1711Count the number of retired
1712.Tn MMX
1713instructions.
1714.It Li p6-emon-sse-sse2-comp-inst-retired Op Li ,umask= Ns Ar qualifier
1715.Pq Tn "Pentium M"
1716Count the number of computational SSE instructions retired.
1717An additional qualifier may be specified and can be one of the
1718following keywords:
1719.Pp
1720.Bl -tag -width indent -compact
1721.It Li sse-packed-single
1722Count SSE packed-single instructions.
1723.It Li sse-scalar-single
1724Count SSE scalar-single instructions.
1725.It Li sse2-packed-double
1726Count SSE2 packed-double instructions.
1727.It Li sse2-scalar-double
1728Count SSE2 scalar-double instructions.
1729.El
1730.Pp
1731The default is to count SSE packed-single instructions.
1732.It Li p6-emon-sse-sse2-inst-retired Op Li ,umask= Ns Ar qualifier
1733.Pp
1734.Pq Tn "Pentium M"
1735Count the number of SSE instructions retired.
1736An additional qualifier can be specified, and can be one of the
1737following keywords:
1738.Pp
1739.Bl -tag -width indent -compact
1740.It Li sse-packed-single
1741Count SSE packed-single instructions.
1742.It Li sse-packed-single-scalar-single
1743Count SSE packed-single and scalar-single instructions.
1744.It Li sse2-packed-double
1745Count SSE2 packed-double instructions.
1746.It Li sse2-scalar-double
1747Count SSE2 scalar-double instructions.
1748.El
1749.Pp
1750The default is to count SSE packed-single instructions.
1751.It Li p6-emon-synch-uops
1752.Pq Tn "Pentium M"
1753Count the number of sync micro-ops.
1754.It Li p6-emon-thermal-trip
1755.Pq Tn "Pentium M"
1756Count the duration or occurrences of thermal trips.
1757Use the
1758.Dq Li edge
1759qualifier to count occurrences of thermal trips.
1760.It Li p6-emon-unfusion
1761.Pq Tn "Pentium M"
1762Count the number of unfusion events in the reorder buffer.
1763.It Li p6-flops
1764Count the number of computational floating point operations retired.
1765This event is only allocated on counter 0.
1766.It Li p6-fp-assist
1767Count the number of floating point exceptions handled by microcode.
1768This event is only allocated on counter 1.
1769.It Li p6-fp-comps-ops-exe
1770Count the number of computational floating point operations executed.
1771This event is only allocated on counter 0.
1772.It Li p6-fp-mmx-trans Op Li ,umask= Ns Ar qualifier
1773.Pq Tn "Pentium II" , Tn "Pentium III"
1774Count the number of transitions between MMX and floating-point
1775instructions.
1776An additional qualifier may be specified, and comprises one of the
1777following keywords:
1778.Pp
1779.Bl -tag -width indent -compact
1780.It Li mmxtofp
1781Count transitions from MMX instructions to floating-point instructions.
1782.It Li fptommx
1783Count transitions from floating-point instructions to MMX instructions.
1784.El
1785.Pp
1786The default is to count MMX to floating-point transitions.
1787.It Li p6-hw-int-rx
1788Count the number of hardware interrupts received.
1789.It Li p6-ifu-fetch
1790Count the number of instruction fetches, both cacheable and non-cacheable.
1791.It Li p6-ifu-fetch-miss
1792Count the number of instruction fetch misses (i.e., those that produce
1793memory accesses).
1794.It Li p6-ifu-mem-stall
1795Count the number of cycles instruction fetch is stalled for any reason.
1796.It Li p6-ild-stall
1797Count the number of cycles the instruction length decoder is stalled.
1798.It Li p6-inst-decoded
1799Count the number of instructions decoded.
1800.It Li p6-inst-retired
1801Count the number of instructions retired.
1802.It Li p6-itlb-miss
1803Count the number of instruction TLB misses.
1804.It Li p6-l2-ads
1805Count the number of L2 address strobes.
1806.It Li p6-l2-dbus-busy
1807Count the number of cycles during which the L2 cache data bus was busy.
1808.It Li p6-l2-dbus-busy-rd
1809Count the number of cycles during which the L2 cache data bus was busy
1810transferring read data from L2 to the processor.
1811.It Li p6-l2-ifetch Op Li ,umask= Ns Ar qualifier
1812Count the number of L2 instruction fetches.
1813An additional qualifier may be specified and comprises a list of the following
1814keywords separated by
1815.Ql +
1816characters:
1817.Pp
1818.Bl -tag -width indent -compact
1819.It Li e
1820Count operations affecting E (exclusive) state lines.
1821.It Li i
1822Count operations affecting I (invalid) state lines.
1823.It Li m
1824Count operations affecting M (modified) state lines.
1825.It Li s
1826Count operations affecting S (shared) state lines.
1827.El
1828.Pp
1829The default is to count operations affecting all (MESI) state lines.
1830.It Li p6-l2-ld Op Li ,umask= Ns Ar qualifier
1831Count the number of L2 data loads.
1832An additional qualifier may be specified and comprises a list of the following
1833keywords separated by
1834.Ql +
1835characters:
1836.Pp
1837.Bl -tag -width indent -compact
1838.It Li both
1839.Pq Tn "Pentium M"
1840Count both hardware-prefetched lines and non-hardware-prefetched lines.
1841.It Li e
1842Count operations affecting E (exclusive) state lines.
1843.It Li hw
1844.Pq Tn "Pentium M"
1845Count hardware-prefetched lines only.
1846.It Li i
1847Count operations affecting I (invalid) state lines.
1848.It Li m
1849Count operations affecting M (modified) state lines.
1850.It Li nonhw
1851.Pq Tn "Pentium M"
1852Exclude hardware-prefetched lines.
1853.It Li s
1854Count operations affecting S (shared) state lines.
1855.El
1856.Pp
1857The default on processors other than
1858.Tn "Pentium M"
1859processors is to count operations affecting all (MESI) state lines.
1860The default on
1861.Tn "Pentium M"
1862processors is to count both hardware-prefetched and
1863non-hardware-prefetch operations on all (MESI) state lines.
1864.Pq Errata
1865This event is affected by processor errata E53.
1866.It Li p6-l2-lines-in Op Li ,umask= Ns Ar qualifier
1867Count the number of L2 lines allocated.
1868An additional qualifier may be specified and comprises a list of the following
1869keywords separated by
1870.Ql +
1871characters:
1872.Pp
1873.Bl -tag -width indent -compact
1874.It Li both
1875.Pq Tn "Pentium M"
1876Count both hardware-prefetched lines and non-hardware-prefetched lines.
1877.It Li e
1878Count operations affecting E (exclusive) state lines.
1879.It Li hw
1880.Pq Tn "Pentium M"
1881Count hardware-prefetched lines only.
1882.It Li i
1883Count operations affecting I (invalid) state lines.
1884.It Li m
1885Count operations affecting M (modified) state lines.
1886.It Li nonhw
1887.Pq Tn "Pentium M"
1888Exclude hardware-prefetched lines.
1889.It Li s
1890Count operations affecting S (shared) state lines.
1891.El
1892.Pp
1893The default on processors other than
1894.Tn "Pentium M"
1895processors is to count operations affecting all (MESI) state lines.
1896The default on
1897.Tn "Pentium M"
1898processors is to count both hardware-prefetched and
1899non-hardware-prefetch operations on all (MESI) state lines.
1900.Pq Errata
1901This event is affected by processor errata E45.
1902.It Li p6-l2-lines-out Op Li ,umask= Ns Ar qualifier
1903Count the number of L2 lines evicted.
1904An additional qualifier may be specified and comprises a list of the following
1905keywords separated by
1906.Ql +
1907characters:
1908.Pp
1909.Bl -tag -width indent -compact
1910.It Li both
1911.Pq Tn "Pentium M"
1912Count both hardware-prefetched lines and non-hardware-prefetched lines.
1913.It Li e
1914Count operations affecting E (exclusive) state lines.
1915.It Li hw
1916.Pq Tn "Pentium M"
1917Count hardware-prefetched lines only.
1918.It Li i
1919Count operations affecting I (invalid) state lines.
1920.It Li m
1921Count operations affecting M (modified) state lines.
1922.It Li nonhw
1923.Pq Tn "Pentium M" only
1924Exclude hardware-prefetched lines.
1925.It Li s
1926Count operations affecting S (shared) state lines.
1927.El
1928.Pp
1929The default on processors other than
1930.Tn "Pentium M"
1931processors is to count operations affecting all (MESI) state lines.
1932The default on
1933.Tn "Pentium M"
1934processors is to count both hardware-prefetched and
1935non-hardware-prefetched operations on all (MESI) state lines.
1936.Pq Errata
1937This event is affected by processor errata E45.
1938.It Li p6-l2-m-lines-inm
1939Count the number of modified lines allocated in L2 cache.
1940.It Li p6-l2-m-lines-outm Op Li ,umask= Ns Ar qualifier
1941Count the number of L2 M-state lines evicted.
1942.Pp
1943.Pq Tn "Pentium M"
1944On these processors an additional qualifier may be specified and
1945comprises a list of the following keywords separated by
1946.Ql +
1947characters:
1948.Pp
1949.Bl -tag -width indent -compact
1950.It Li both
1951Count both hardware-prefetched lines and non-hardware-prefetched lines.
1952.It Li hw
1953Count hardware-prefetched lines only.
1954.It Li nonhw
1955Exclude hardware-prefetched lines.
1956.El
1957.Pp
1958The default is to count both hardware-prefetched and
1959non-hardware-prefetched operations.
1960.Pq Errata
1961This event is affected by processor errata E53.
1962.It Li p6-l2-rqsts Op Li ,umask= Ns Ar qualifier
1963Count the total number of L2 requests.
1964An additional qualifier may be specified and comprises a list of the following
1965keywords separated by
1966.Ql +
1967characters:
1968.Pp
1969.Bl -tag -width indent -compact
1970.It Li e
1971Count operations affecting E (exclusive) state lines.
1972.It Li i
1973Count operations affecting I (invalid) state lines.
1974.It Li m
1975Count operations affecting M (modified) state lines.
1976.It Li s
1977Count operations affecting S (shared) state lines.
1978.El
1979.Pp
1980The default is to count operations affecting all (MESI) state lines.
1981.It Li p6-l2-st Op Li ,umask= Ns Ar qualifier
1982Count the number of L2 data stores.
1983An additional qualifier may be specified and comprises a list of the following
1984keywords separated by
1985.Ql +
1986characters:
1987.Pp
1988.Bl -tag -width indent -compact
1989.It Li e
1990Count operations affecting E (exclusive) state lines.
1991.It Li i
1992Count operations affecting I (invalid) state lines.
1993.It Li m
1994Count operations affecting M (modified) state lines.
1995.It Li s
1996Count operations affecting S (shared) state lines.
1997.El
1998.Pp
1999The default is to count operations affecting all (MESI) state lines.
2000.It Li p6-ld-blocks
2001Count the number of load operations delayed due to store buffer blocks.
2002.It Li p6-misalign-mem-ref
2003Count the number of misaligned data memory references (crossing a 64
2004bit boundary).
2005.It Li p6-mmx-assist
2006.Pq Tn "Pentium II" , Tn "Pentium III"
2007Count the number of MMX assists executed.
2008.It Li p6-mmx-instr-exec
2009.Pq Tn Celeron , Tn "Pentium II"
2010Count the number of MMX instructions executed, except MOVQ and MOVD
2011stores from register to memory.
2012.It Li p6-mmx-instr-ret
2013.Pq Tn "Pentium II"
2014Count the number of MMX instructions retired.
2015.It Li p6-mmx-instr-type-exec Op Li ,umask= Ns Ar qualifier
2016.Pq Tn "Pentium II" , Tn "Pentium III"
2017Count the number of MMX instructions executed.
2018An additional qualifier may be specified and comprises a list of
2019the following keywords separated by
2020.Ql +
2021characters:
2022.Pp
2023.Bl -tag -width indent -compact
2024.It Li pack
2025Count MMX pack operation instructions.
2026.It Li packed-arithmetic
2027Count MMX packed arithmetic instructions.
2028.It Li packed-logical
2029Count MMX packed logical instructions.
2030.It Li packed-multiply
2031Count MMX packed multiply instructions.
2032.It Li packed-shift
2033Count MMX packed shift instructions.
2034.It Li unpack
2035Count MMX unpack operation instructions.
2036.El
2037.Pp
2038The default is to count all operations.
2039.It Li p6-mmx-sat-instr-exec
2040.Pq Tn "Pentium II" , Tn "Pentium III"
2041Count the number of MMX saturating instructions executed.
2042.It Li p6-mmx-uops-exec
2043.Pq Tn "Pentium II" , Tn "Pentium III"
2044Count the number of MMX micro-ops executed.
2045.It Li p6-mul
2046Count the number of floating point multiplies.
2047This event is only allocated on counter 1.
2048.It Li p6-partial-rat-stalls
2049Count the number of cycles or events for partial stalls.
2050.It Li p6-resource-stalls
2051Count the number of cycles there was a resource related stall of any kind.
2052.It Li p6-ret-seg-renames
2053.Pq Tn "Pentium II" , Tn "Pentium III"
2054Count the number of segment register rename events retired.
2055.It Li p6-sb-drains
2056Count the number of cycles the store buffer is draining.
2057.It Li p6-seg-reg-renames Op Li ,umask= Ns Ar qualifier
2058.Pq Tn "Pentium II" , Tn "Pentium III"
2059Count the number of segment register renames.
2060An additional qualifier may be specified, and comprises a list of the
2061following keywords separated by
2062.Ql +
2063characters:
2064.Pp
2065.Bl -tag -width indent -compact
2066.It Li ds
2067Count renames for segment register DS.
2068.It Li es
2069Count renames for segment register ES.
2070.It Li fs
2071Count renames for segment register FS.
2072.It Li gs
2073Count renames for segment register GS.
2074.El
2075.Pp
2076The default is to count operations affecting all segment registers.
2077.It Li p6-seg-rename-stalls Op Li ,umask= Ns Ar qualifier
2078.Pq Tn "Pentium II" , Tn "Pentium III"
2079Count the number of segment register renaming stalls.
2080An additional qualifier may be specified, and comprises a list of the
2081following keywords separated by
2082.Ql +
2083characters:
2084.Pp
2085.Bl -tag -width indent -compact
2086.It Li ds
2087Count stalls for segment register DS.
2088.It Li es
2089Count stalls for segment register ES.
2090.It Li fs
2091Count stalls for segment register FS.
2092.It Li gs
2093Count stalls for segment register GS.
2094.El
2095.Pp
2096The default is to count operations affecting all the segment registers.
2097.It Li p6-segment-reg-loads
2098Count the number of segment register loads.
2099.It Li p6-uops-retired
2100Count the number of micro-ops retired.
2101.El
2102.Ss Intel P4 PMCS
2103Intel P4 PMCs are present in Intel
2104.Tn "Pentium 4"
2105and
2106.Tn Xeon
2107processors.
2108These PMCs are documented in
2109.Rs
2110.%B "IA-32 Intel(R) Architecture Software Developer's Manual"
2111.%T "Volume 3: System Programming Guide"
2112.%N "Order Number 245472-012"
2113.%D 2003
2114.%Q "Intel Corporation"
2115.Re
2116Further information about using these PMCs may be found in
2117.Rs
2118.%B "IA-32 Intel(R) Architecture Optimization Guide"
2119.%D 2003
2120.%N "Order Number 248966-009"
2121.%Q "Intel Corporation"
2122.Re
2123Some of these events are affected by processor errata described in
2124.Rs
2125.%B "Intel(R) Pentium(R) 4 Processor Specification Update"
2126.%N "Document Number: 249199-059"
2127.%D "April 2005"
2128.%Q "Intel Corporation"
2129.Re
2130.Pp
2131Event specifiers for Intel P4 PMCs can have the following common
2132qualifiers:
2133.Bl -tag -width indent
2134.It Li active= Ns Ar choice
2135(On P4 HTT CPUs) Filter event counting based on which logical
2136processors are active.
2137The allowed values of
2138.Ar choice
2139are:
2140.Pp
2141.Bl -tag -width indent -compact
2142.It Li any
2143Count when either logical processor is active.
2144.It Li both
2145Count when both logical processors are active.
2146.It Li none
2147Count only when neither logical processor is active.
2148.It Li single
2149Count only when one logical processor is active.
2150.El
2151.Pp
2152The default is
2153.Dq Li both .
2154.It Li cascade
2155Configure the PMC to cascade onto its partner.
2156See
2157.Sx "Cascading P4 PMCs"
2158below for more information.
2159.It Li edge
2160Configure the counter to count false to true transitions of the threshold
2161comparison output.
2162This qualifier only takes effect if a threshold qualifier has also been
2163specified.
2164.It Li complement
2165Configure the counter to increment only when the event count seen is
2166less than the threshold qualifier value specified.
2167.It Li mask= Ns Ar qualifier
2168Many event specifiers for Intel P4 PMCs need to be additionally
2169qualified using a mask qualifier.
2170The allowed syntax for these qualifiers is event specific and is
2171described along with the events.
2172.It Li os
2173Configure the PMC to count when the CPL of the processor is 0.
2174.It Li precise
2175Select precise event based sampling.
2176Precise sampling is supported by the hardware for a limited set of
2177events.
2178.It Li tag= Ns Ar value
2179Configure the PMC to tag the internal uop selected by the other
2180fields in this event specifier with value
2181.Ar value .
2182This feature is used when cascading PMCs.
2183.It Li threshold= Ns Ar value
2184Configure the PMC to increment only when the event counts seen are
2185greater than the specified threshold value
2186.Ar value .
2187.It Li usr
2188Configure the PMC to count when the CPL of the processor is 1, 2 or 3.
2189.El
2190.Pp
2191If neither of the
2192.Dq Li os
2193or
2194.Dq Li usr
2195qualifiers is specified, the default is to enable both.
2196.Pp
2197On Intel Pentium 4 processors with HTT, events are
2198divided into two classes:
2199.Pp
2200.Bl -tag -width indent -compact
2201.It "TS Events"
2202are those where hardware can differentiate between events
2203generated on one logical processor from those generated on the
2204other.
2205.It "TI Events"
2206are those where hardware cannot differentiate between events
2207generated by multiple logical processors in a package.
2208.El
2209.Pp
2210Only TS events are allowed for use with process-mode PMCs on
2211Pentium-4/HTT CPUs.
2212.Pp
2213The event specifiers supported by Intel P4 PMCs are:
2214.Pp
2215.Bl -tag -width indent
2216.It Li p4-128bit-mmx-uop Op Li ,mask= Ns Ar flags
2217.Pq "TI event"
2218Count integer SIMD SSE2 instructions that operate on 128 bit SIMD
2219operands.
2220Qualifier
2221.Ar flags
2222can take the following value (which is also the default):
2223.Pp
2224.Bl -tag -width indent -compact
2225.It Li all
2226Count all uops operating on 128 bit SIMD integer operands in memory or
2227XMM register.
2228.El
2229.Pp
2230If an instruction contains more than one 128 bit MMX uop, then each
2231uop will be counted.
2232.It Li p4-64bit-mmx-uop Op Li ,mask= Ns Ar flags
2233.Pq "TI event"
2234Count MMX instructions that operate on 64 bit SIMD operands.
2235Qualifier
2236.Ar flags
2237can take the following value (which is also the default):
2238.Pp
2239.Bl -tag -width indent -compact
2240.It Li all
2241Count all uops operating on 64 bit SIMD integer operands in memory or
2242in MMX registers.
2243.El
2244.Pp
2245If an instruction contains more than one 64 bit MMX uop, then each
2246uop will be counted.
2247.It Li p4-b2b-cycles
2248.Pq "TI event"
2249Count back-to-back bus cycles.
2250Further documentation for this event is unavailable.
2251.It Li p4-bnr
2252.Pq "TI event"
2253Count bus-not-ready conditions.
2254Further documentation for this event is unavailable.
2255.It Li p4-bpu-fetch-request Op Li ,mask= Ns Ar qualifier
2256.Pq "TS event"
2257Count instruction fetch requests qualified by additional
2258flags specified in
2259.Ar qualifier .
2260At this point only one flag is supported:
2261.Pp
2262.Bl -tag -width indent -compact
2263.It Li tcmiss
2264Count trace cache lookup misses.
2265.El
2266.Pp
2267The default qualifier is also
2268.Dq Li mask=tcmiss .
2269.It Li p4-branch-retired Op Li ,mask= Ns Ar flags
2270.Pq "TS event"
2271Counts retired branches.
2272Qualifier
2273.Ar flags
2274is a list of the following
2275.Ql +
2276separated strings:
2277.Pp
2278.Bl -tag -width indent -compact
2279.It Li mmnp
2280Count branches not-taken and predicted.
2281.It Li mmnm
2282Count branches not-taken and mis-predicted.
2283.It Li mmtp
2284Count branches taken and predicted.
2285.It Li mmtm
2286Count branches taken and mis-predicted.
2287.El
2288.Pp
2289The default qualifier counts all four kinds of branches.
2290.It Li p4-bsq-active-entries Op Li ,mask= Ns Ar qualifier
2291.Pq "TS event"
2292Count the number of entries (clipped at 15) currently active in the
2293BSQ.
2294Qualifier
2295.Ar qualifier
2296is a
2297.Ql +
2298separated set of the following flags:
2299.Pp
2300.Bl -tag -width indent -compact
2301.It Li req-type0 , Li req-type1
2302Forms a 2-bit number used to select the request type encoding:
2303.Pp
2304.Bl -tag -width indent -compact
2305.It Li 0
2306reads excluding read invalidate
2307.It Li 1
2308read invalidates
2309.It Li 2
2310writes other than writebacks
2311.It Li 3
2312writebacks
2313.El
2314.Pp
2315Bit
2316.Dq Li req-type1
2317is the MSB for this two bit number.
2318.It Li req-len0 , Li req-len1
2319Forms a two-bit number that specifies the request length encoding:
2320.Pp
2321.Bl -tag -width indent -compact
2322.It Li 0
23230 chunks
2324.It Li 1
23251 chunk
2326.It Li 3
23278 chunks
2328.El
2329.Pp
2330Bit
2331.Dq Li req-len1
2332is the MSB for this two bit number.
2333.It Li req-io-type
2334Count requests that are input or output requests.
2335.It Li req-lock-type
2336Count requests that lock the bus.
2337.It Li req-lock-cache
2338Count requests that lock the cache.
2339.It Li req-split-type
2340Count requests that are bus 8-byte chunks split across an
23418-byte boundary.
2342.It Li req-dem-type
2343Count requests that are demand (not prefetches) if set.
2344Count requests that are prefetches if not set.
2345.It Li req-ord-type
2346Count requests that are ordered.
2347.It Li mem-type0 , Li mem-type1 , Li mem-type2
2348Forms a 3-bit number that specifies a memory type encoding:
2349.Pp
2350.Bl -tag -width indent -compact
2351.It Li 0
2352UC
2353.It Li 1
2354USWC
2355.It Li 4
2356WT
2357.It Li 5
2358WP
2359.It Li 6
2360WB
2361.El
2362.Pp
2363Bit
2364.Dq Li mem-type2
2365is the MSB of this 3-bit number.
2366.El
2367.Pp
2368The default qualifier has all the above bits set.
2369.Pp
2370Edge triggering using the
2371.Dq Li edge
2372qualifier should not be used with this event when counting cycles.
2373.It Li p4-bsq-allocation Op Li ,mask= Ns Ar qualifier
2374.Pq "TS event"
2375Count allocations in the bus sequence unit according to the flags
2376specified in
2377.Ar qualifier ,
2378which is a
2379.Ql +
2380separated set of the following flags:
2381.Pp
2382.Bl -tag -width indent -compact
2383.It Li req-type0 , Li req-type1
2384Forms a 2-bit number used to select the request type encoding:
2385.Pp
2386.Bl -tag -width indent -compact
2387.It Li 0
2388reads excluding read invalidate
2389.It Li 1
2390read invalidates
2391.It Li 2
2392writes other than writebacks
2393.It Li 3
2394writebacks
2395.El
2396.Pp
2397Bit
2398.Dq Li req-type1
2399is the MSB for this two bit number.
2400.It Li req-len0 , Li req-len1
2401Forms a two-bit number that specifies the request length encoding:
2402.Pp
2403.Bl -tag -width indent -compact
2404.It Li 0
24050 chunks
2406.It Li 1
24071 chunk
2408.It Li 3
24098 chunks
2410.El
2411.Pp
2412Bit
2413.Dq Li req-len1
2414is the MSB for this two bit number.
2415.It Li req-io-type
2416Count requests that are input or output requests.
2417.It Li req-lock-type
2418Count requests that lock the bus.
2419.It Li req-lock-cache
2420Count requests that lock the cache.
2421.It Li req-split-type
2422Count requests that are bus 8-byte chunks split across an
24238-byte boundary.
2424.It Li req-dem-type
2425Count requests that are demand (not prefetches) if set.
2426Count requests that are prefetches if not set.
2427.It Li req-ord-type
2428Count requests that are ordered.
2429.It Li mem-type0 , Li mem-type1 , Li mem-type2
2430Forms a 3-bit number that specifies a memory type encoding:
2431.Pp
2432.Bl -tag -width indent -compact
2433.It Li 0
2434UC
2435.It Li 1
2436USWC
2437.It Li 4
2438WT
2439.It Li 5
2440WP
2441.It Li 6
2442WB
2443.El
2444.Pp
2445Bit
2446.Dq Li mem-type2
2447is the MSB of this 3-bit number.
2448.El
2449.Pp
2450The default qualifier has all the above bits set.
2451.Pp
2452This event is usually used along with the
2453.Dq Li edge
2454qualifier to avoid multiple counting.
2455.It Li p4-bsq-cache-reference Op Li ,mask= Ns Ar qualifier
2456.Pq "TS event"
2457Count cache references as seen by the bus unit (2nd or 3rd level
2458cache references).
2459Qualifier
2460.Ar qualifier
2461is a
2462.Ql +
2463separated list of the following keywords:
2464.Pp
2465.Bl -tag -width indent -compact
2466.It Li rd-2ndl-hits
2467Count 2nd level cache hits in the shared state.
2468.It Li rd-2ndl-hite
2469Count 2nd level cache hits in the exclusive state.
2470.It Li rd-2ndl-hitm
2471Count 2nd level cache hits in the modified state.
2472.It Li rd-3rdl-hits
2473Count 3rd level cache hits in the shared state.
2474.It Li rd-3rdl-hite
2475Count 3rd level cache hits in the exclusive state.
2476.It Li rd-3rdl-hitm
2477Count 3rd level cache hits in the modified state.
2478.It Li rd-2ndl-miss
2479Count 2nd level cache misses.
2480.It Li rd-3rdl-miss
2481Count 3rd level cache misses.
2482.It Li wr-2ndl-miss
2483Count write-back lookups from the data access cache that miss the 2nd
2484level cache.
2485.El
2486.Pp
2487The default is to count all the above events.
2488.It Li p4-execution-event Op Li ,mask= Ns Ar flags
2489.Pq "TS event"
2490Count the retirement of tagged uops selected through the execution
2491tagging mechanism.
2492Qualifier
2493.Ar flags
2494can contain the following strings separated by
2495.Ql +
2496characters:
2497.Pp
2498.Bl -tag -width indent -compact
2499.It Li nbogus0 , Li nbogus1 , Li nbogus2 , Li nbogus3
2500The marked uops are not bogus.
2501.It Li bogus0 , Li bogus1 , Li bogus2 , Li bogus3
2502The marked uops are bogus.
2503.El
2504.Pp
2505This event requires additional (upstream) events to be allocated to
2506perform the desired uop tagging.
2507The default is to set all the above flags.
2508This event can be used for precise event based sampling.
2509.It Li p4-front-end-event Op Li ,mask= Ns Ar flags
2510.Pq "TS event"
2511Count the retirement of tagged uops selected through the front-end
2512tagging mechanism.
2513Qualifier
2514.Ar flags
2515can contain the following strings separated by
2516.Ql +
2517characters:
2518.Pp
2519.Bl -tag -width indent -compact
2520.It Li nbogus
2521The marked uops are not bogus.
2522.It Li bogus
2523The marked uops are bogus.
2524.El
2525.Pp
2526This event requires additional (upstream) events to be allocated to
2527perform the desired uop tagging.
2528The default is to select both kinds of events.
2529This event can be used for precise event based sampling.
2530.It Li p4-fsb-data-activity Op Li ,mask= Ns Ar flags
2531.Pq "TI event"
2532Count each DBSY or DRDY event selected by qualifier
2533.Ar flags .
2534Qualifier
2535.Ar flags
2536is a
2537.Ql +
2538separated set of the following flags:
2539.Pp
2540.Bl -tag -width indent -compact
2541.It Li drdy-drv
2542Count when this processor is driving data onto the bus.
2543.It Li drdy-own
2544Count when this processor is reading data from the bus.
2545.It Li drdy-other
2546Count when data is on the bus but not being sampled by this processor.
2547.It Li dbsy-drv
2548Count when this processor reserves the bus for use in the next cycle
2549in order to drive data.
2550.It Li dbsy-own
2551Count when some agent reserves the bus for use in the next bus cycle
2552to drive data that this processor will sample.
2553.It Li dbsy-other
2554Count when some agent reserves the bus for use in the next bus cycle
2555to drive data that this processor will not sample.
2556.El
2557.Pp
2558Flags
2559.Dq Li drdy-own
2560and
2561.Dq Li drdy-other
2562are mutually exclusive.
2563Flags
2564.Dq Li dbsy-own
2565and
2566.Dq Li dbsy-other
2567are mutually exclusive.
2568The default value for
2569.Ar flags
2570is
2571.Dq Li drdy-drv+drdy-own+dbsy-drv+dbsy-own .
2572.It Li p4-global-power-events Op Li ,mask= Ns Ar flags
2573.Pq "TS event"
2574Count cycles during which the processor is not stopped.
2575Qualifier
2576.Ar flags
2577can take the following value (which is also the default):
2578.Pp
2579.Bl -tag -width indent -compact
2580.It Li running
2581Count cycles when the processor is active.
2582.El
2584.It Li p4-instr-retired Op Li ,mask= Ns Ar flags
2585.Pq "TS event"
2586Count instructions retired during a clock cycle.
2587Qualifier
2588.Ar flags
2589comprises the following strings separated by
2590.Ql +
2591characters:
2592.Pp
2593.Bl -tag -width indent -compact
2594.It Li nbogusntag
2595Count non-bogus instructions that are not tagged.
2596.It Li nbogustag
2597Count non-bogus instructions that are tagged.
2598.It Li bogusntag
2599Count bogus instructions that are not tagged.
2600.It Li bogustag
2601Count bogus instructions that are tagged.
2602.El
2603.Pp
2604The default qualifier counts all the above kinds of instructions.
2605.It Li p4-ioq-active-entries Xo
2606.Op Li ,mask= Ns Ar qualifier
2607.Op Li ,busreqtype= Ns Ar req-type
2608.Xc
2609.Pq "TS event"
2610Count the number of entries (clipped at 15) in the IOQ that are
2611active.
2612The event masks are specified by qualifier
2613.Ar qualifier
2614and
2615.Ar req-type .
2616.Pp
2617Qualifier
2618.Ar qualifier
2619is a
2620.Ql +
2621separated set of the following flags:
2622.Pp
2623.Bl -tag -width indent -compact
2624.It Li all-read
2625Count read entries.
2626.It Li all-write
2627Count write entries.
2628.It Li mem-uc
2629Count entries accessing uncacheable memory.
2630.It Li mem-wc
2631Count entries accessing write-combining memory.
2632.It Li mem-wt
2633Count entries accessing write-through memory.
2634.It Li mem-wp
2635Count entries accessing write-protected memory.
2636.It Li mem-wb
2637Count entries accessing write-back memory.
2638.It Li own
2639Count store requests driven by the processor (i.e., not by other
2640processors or by DMA).
2641.It Li other
2642Count store requests driven by other processors or by DMA.
2643.It Li prefetch
2644Include hardware and software prefetch requests in the count.
2645.El
2646.Pp
2647The default value for
2648.Ar qualifier
2649is to enable all the above flags.
2650.Pp
2651The
2652.Ar req-type
2653qualifier is a 5-bit number that can be additionally used to select a
2654specific bus request type.
2655The default is 0.
2656.Pp
2657The
2658.Dq Li edge
2659qualifier should not be used when counting cycles with this event.
2660The exact behaviour of this event depends on the processor revision.
2661.It Li p4-ioq-allocation Xo
2662.Op Li ,mask= Ns Ar qualifier
2663.Op Li ,busreqtype= Ns Ar req-type
2664.Xc
2665.Pq "TS event"
2666Count various types of transactions on the bus matching the flags set
2667in
2668.Ar qualifier
2669and
2670.Ar req-type .
2671.Pp
2672Qualifier
2673.Ar qualifier
2674is a
2675.Ql +
2676separated set of the following flags:
2677.Pp
2678.Bl -tag -width indent -compact
2679.It Li all-read
2680Count read entries.
2681.It Li all-write
2682Count write entries.
2683.It Li mem-uc
2684Count entries accessing uncacheable memory.
2685.It Li mem-wc
2686Count entries accessing write-combining memory.
2687.It Li mem-wt
2688Count entries accessing write-through memory.
2689.It Li mem-wp
2690Count entries accessing write-protected memory.
2691.It Li mem-wb
2692Count entries accessing write-back memory.
2693.It Li own
2694Count store requests driven by the processor (i.e., not by other
2695processors or by DMA).
2696.It Li other
2697Count store requests driven by other processors or by DMA.
2698.It Li prefetch
2699Include hardware and software prefetch requests in the count.
2700.El
2701.Pp
2702The default value for
2703.Ar qualifier
2704is to enable all the above flags.
2705.Pp
2706The
2707.Ar req-type
2708qualifier is a 5-bit number that can be additionally used to select a
2709specific bus request type.
2710The default is 0.
2711.Pp
2712The
2713.Dq Li edge
2714qualifier is normally used with this event to prevent multiple
2715counting.
2716The exact behaviour of this event depends on the processor revision.
2717.It Li p4-itlb-reference Op Li ,mask= Ns Ar qualifier
2718.Pq "TS event"
2719Count translations using the instruction translation look-aside
2720buffer.
2721The
2722.Ar qualifier
2723argument is a list of the following strings separated by
2724.Ql +
2725characters:
2726.Pp
2727.Bl -tag -width indent -compact
2728.It Li hit
2729Count ITLB hits.
2730.It Li miss
2731Count ITLB misses.
2732.It Li hit-uc
2733Count uncacheable ITLB hits.
2734.El
2735.Pp
2736If no
2737.Ar qualifier
2738is specified the default is to count all the three kinds of ITLB
2739translations.
2740.It Li p4-load-port-replay Op Li ,mask= Ns Ar qualifier
2741.Pq "TS event"
2742Count replayed events at the load port.
2743Qualifier
2744.Ar qualifier
2745can take on one value:
2746.Pp
2747.Bl -tag -width indent -compact
2748.It Li split-ld
2749Count split loads.
2750.El
2751.Pp
2752The default value for
2753.Ar qualifier
2754is
2755.Dq Li split-ld .
2756.It Li p4-mispred-branch-retired Op Li ,mask= Ns Ar flags
2757.Pq "TS event"
2758Count mispredicted IA-32 branch instructions.
2759Qualifier
2760.Ar flags
2761can take the following value (which is also the default):
2762.Pp
2763.Bl -tag -width indent -compact
2764.It Li nbogus
2765Count non-bogus retired branch instructions.
2766.El
2767.It Li p4-machine-clear Op Li ,mask= Ns Ar flags
2768.Pq "TS event"
2769Count the number of pipeline clears seen by the processor.
2770Qualifier
2771.Ar flags
2772is a list of the following strings separated by
2773.Ql +
2774characters:
2775.Pp
2776.Bl -tag -width indent -compact
2777.It Li clear
2778Count for a portion of the many cycles when the machine is being
2779cleared for any reason.
2780.It Li moclear
2781Count machine clears due to memory ordering issues.
2782.It Li smclear
2783Count machine clears due to self-modifying code.
2784.El
2785.Pp
2786Use qualifier
2787.Dq Li edge
2788to get a count of occurrences of machine clears.
2789The default qualifier is
2790.Dq Li clear .
2791.It Li p4-memory-cancel Op Li ,mask= Ns Ar event-list
2792.Pq "TS event"
2793Count the cancelling of various kinds of requests in the data cache
2794address control unit of the CPU.
2795The qualifier
2796.Ar event-list
2797is a list of the following strings separated by
2798.Ql +
2799characters:
2800.Pp
2801.Bl -tag -width indent -compact
2802.It Li st-rb-full
2803Requests cancelled because no store request buffer was available.
2804.It Li 64k-conf
2805Requests that conflict due to 64K aliasing.
2806.El
2807.Pp
2808If
2809.Ar event-list
2810is not specified, then the default is to count both kinds of events.
2811.It Li p4-memory-complete Op Li ,mask= Ns Ar event-list
2812.Pq "TS event"
2813Count the completion of load split, store split, uncacheable split and
2814uncacheable load operations selected by qualifier
2815.Ar event-list .
2816The qualifier
2817.Ar event-list
2818is a
2819.Ql +
2820separated list of the following flags:
2821.Pp
2822.Bl -tag -width indent -compact
2823.It Li lsc
2824Count load splits completed, excluding loads from uncacheable or
2825write-combining areas.
2826.It Li ssc
2827Count any split stores completed.
2828.El
2829.Pp
2830The default is to count both kinds of operations.
2831.It Li p4-mob-load-replay Op Li ,mask= Ns Ar qualifier
2832.Pq "TS event"
2833Count load replays triggered by the memory order buffer.
2834Qualifier
2835.Ar qualifier
2836can be a
2837.Ql +
2838separated list of the following flags:
2839.Pp
2840.Bl -tag -width indent -compact
2841.It Li no-sta
2842Count replays because of unknown store addresses.
2843.It Li no-std
2844Count replays because of unknown store data.
2845.It Li partial-data
2846Count replays because of partially overlapped data accesses between
2847load and store operations.
2848.It Li unalgn-addr
2849Count replays because of mismatches in the lower 4 bits of load and
2850store operations.
2851.El
2852.Pp
2853The default qualifier is
2854.Dq Li no-sta+no-std+partial-data+unalgn-addr .
2855.It Li p4-packed-dp-uop Op Li ,mask= Ns Ar flags
2856.Pq "TI event"
2857Count packed double-precision uops.
2858Qualifier
2859.Ar flags
2860can take the following value (which is also the default):
2861.Pp
2862.Bl -tag -width indent -compact
2863.It Li all
2864Count all uops operating on packed double-precision operands.
2865.El
2866.It Li p4-packed-sp-uop Op Li ,mask= Ns Ar flags
2867.Pq "TI event"
2868Count packed single-precision uops.
2869Qualifier
2870.Ar flags
2871can take the following value (which is also the default):
2872.Pp
2873.Bl -tag -width indent -compact
2874.It Li all
2875Count all uops operating on packed single-precision operands.
2876.El
2877.It Li p4-page-walk-type Op Li ,mask= Ns Ar qualifier
2878.Pq "TI event"
2879Count page walks performed by the page miss handler.
2880Qualifier
2881.Ar qualifier
2882can be a
2883.Ql +
2884separated list of the following keywords:
2885.Pp
2886.Bl -tag -width indent -compact
2887.It Li dtmiss
2888Count page walks for data TLB misses.
2889.It Li itmiss
2890Count page walks for instruction TLB misses.
2891.El
2892.Pp
2893The default value for
2894.Ar qualifier
2895is
2896.Dq Li dtmiss+itmiss .
2897.It Li p4-replay-event Op Li ,mask= Ns Ar flags
2898.Pq "TS event"
2899Count the retirement of tagged uops selected through the replay
2900tagging mechanism.
2901Qualifier
2902.Ar flags
2903contains a
2904.Ql +
2905separated set of the following strings:
2906.Pp
2907.Bl -tag -width indent -compact
2908.It Li nbogus
2909The marked uops are not bogus.
2910.It Li bogus
2911The marked uops are bogus.
2912.El
2913.Pp
2914This event requires additional (upstream) events to be allocated to
2915perform the desired uop tagging.
2916The default qualifier counts both kinds of uops.
2917This event can be used for precise event based sampling.
2918.It Li p4-resource-stall Op Li ,mask= Ns Ar flags
2919.Pq "TS event"
2920Count the occurrence or latency of stalls in the allocator.
2921Qualifier
2922.Ar flags
2923can take the following value (which is also the default):
2924.Pp
2925.Bl -tag -width indent -compact
2926.It Li sbfull
2927A stall due to the lack of store buffers.
2928.El
2929.It Li p4-response
2930.Pq "TI event"
2931Count different types of responses.
2932Further documentation on this event is not available.
2933.It Li p4-retired-branch-type Op Li ,mask= Ns Ar flags
2934.Pq "TS event"
2935Count branches retired.
2936Qualifier
2937.Ar flags
2938contains a
2939.Ql +
2940separated list of strings:
2941.Pp
2942.Bl -tag -width indent -compact
2943.It Li conditional
2944Count conditional jumps.
2945.It Li call
2946Count direct and indirect call branches.
2947.It Li return
2948Count return branches.
2949.It Li indirect
2950Count returns, indirect calls or indirect jumps.
2951.El
2952.Pp
2953The default qualifier counts all the above branch types.
2954.It Li p4-retired-mispred-branch-type Op Li ,mask= Ns Ar flags
2955.Pq "TS event"
2956Count mispredicted branches retired.
2957Qualifier
2958.Ar flags
2959contains a
2960.Ql +
2961separated list of strings:
2962.Pp
2963.Bl -tag -width indent -compact
2964.It Li conditional
2965Count conditional jumps.
2966.It Li call
2967Count indirect call branches.
2968.It Li return
2969Count return branches.
2970.It Li indirect
2971Count returns, indirect calls or indirect jumps.
2972.El
2973.Pp
2974The default qualifier counts all the above branch types.
2975.It Li p4-scalar-dp-uop Op Li ,mask= Ns Ar flags
2976.Pq "TI event"
2977Count the number of scalar double-precision uops.
2978Qualifier
2979.Ar flags
2980can take the following value (which is also the default):
2981.Pp
2982.Bl -tag -width indent -compact
2983.It Li all
2984Count the number of scalar double-precision uops.
2985.El
2986.It Li p4-scalar-sp-uop Op Li ,mask= Ns Ar flags
2987.Pq "TI event"
2988Count the number of scalar single-precision uops.
2989Qualifier
2990.Ar flags
2991can take the following value (which is also the default):
2992.Pp
2993.Bl -tag -width indent -compact
2994.It Li all
2995Count all uops operating on scalar single-precision operands.
2996.El
2997.It Li p4-snoop
2998.Pq "TI event"
2999Count snoop traffic.
3000Further documentation on this event is not available.
3001.It Li p4-sse-input-assist Op Li ,mask= Ns Ar flags
3002.Pq "TI event"
3003Count the number of times an assist is required to handle problems
3004with the operands for SSE and SSE2 operations.
3005Qualifier
3006.Ar flags
3007can take the following value (which is also the default):
3008.Pp
3009.Bl -tag -width indent -compact
3010.It Li all
3011Count assists for all SSE and SSE2 uops.
3012.El
3013.It Li p4-store-port-replay Op Li ,mask= Ns Ar qualifier
3014.Pq "TS event"
3015Count events replayed at the store port.
3016Qualifier
3017.Ar qualifier
3018can take on one value:
3019.Pp
3020.Bl -tag -width indent -compact
3021.It Li split-st
3022Count split stores.
3023.El
3024.Pp
3025The default value for
3026.Ar qualifier
3027is
3028.Dq Li split-st .
3029.It Li p4-tc-deliver-mode Op Li ,mask= Ns Ar qualifier
3030.Pq "TI event"
3031Count the duration in cycles of operating modes of the trace cache and
3032decode engine.
3033The desired operating mode is selected by
3034.Ar qualifier ,
3035which is a list of the following strings separated by
3036.Ql +
3037characters:
3038.Pp
3039.Bl -tag -width indent -compact
3040.It Li DD
3041Both logical processors are in deliver mode.
3042.It Li DB
3043Logical processor 0 is in deliver mode while logical processor 1 is in
3044build mode.
3045.It Li DI
3046Logical processor 0 is in deliver mode while logical processor 1 is
3047halted, or in machine clear, or transitioning to a long microcode
3048flow.
3049.It Li BD
3050Logical processor 0 is in build mode while logical processor 1 is in
3051deliver mode.
3052.It Li BB
3053Both logical processors are in build mode.
3054.It Li BI
3055Logical processor 0 is in build mode while logical processor 1 is
3056halted, or in machine clear or transitioning to a long microcode
3057flow.
3058.It Li ID
3059Logical processor 0 is halted, or in machine clear or transitioning to
3060a long microcode flow while logical processor 1 is in deliver mode.
3061.It Li IB
3062Logical processor 0 is halted, or in machine clear or transitioning to
3063a long microcode flow while logical processor 1 is in build mode.
3064.El
3065.Pp
3066If there is only one logical processor in the processor package then
3067the qualifier for logical processor 1 is ignored.
3068If no qualifier is specified, the default qualifier is
3069.Dq Li DD+DB+DI+BD+BB+BI+ID+IB .
3070.It Li p4-tc-ms-xfer Op Li ,mask= Ns Ar flags
3071.Pq "TI event"
3072Count the number of times uop delivery changed from the trace cache to
3073MS ROM.
3074Qualifier
3075.Ar flags
3076can take the following value (which is also the default):
3077.Pp
3078.Bl -tag -width indent -compact
3079.It Li cisc
3080Count TC to MS transfers.
3081.El
3082.It Li p4-uop-queue-writes Op Li ,mask= Ns Ar flags
3083.Pq "TS event"
3084Count the number of valid uops written to the uop queue.
3085Qualifier
3086.Ar flags
3087is a list of the following strings, separated by
3088.Ql +
3089characters:
3090.Pp
3091.Bl -tag -width indent -compact
3092.It Li from-tc-build
3093Count uops being written from the trace cache in build mode.
3094.It Li from-tc-deliver
3095Count uops being written from the trace cache in deliver mode.
3096.It Li from-rom
3097Count uops being written from microcode ROM.
3098.El
3099.Pp
3100The default qualifier counts all the above kinds of uops.
3101.It Li p4-uop-type Op Li ,mask= Ns Ar flags
3102.Pq "TS event"
3103This event is used in conjunction with the front-end at-retirement
3104mechanism to tag load and store uops.
3105Qualifier
3106.Ar flags
3107comprises the following strings separated by
3108.Ql +
3109characters:
3110.Pp
3111.Bl -tag -width indent -compact
3112.It Li tagloads
3113Mark uops that are load operations.
3114.It Li tagstores
3115Mark uops that are store operations.
3116.El
3117.Pp
3118The default qualifier counts both kinds of uops.
3119.It Li p4-uops-retired Op Li ,mask= Ns Ar flags
3120.Pq "TS event"
3121Count uops retired during a clock cycle.
3122Qualifier
3123.Ar flags
3124comprises the following strings separated by
3125.Ql +
3126characters:
3127.Pp
3128.Bl -tag -width indent -compact
3129.It Li nbogus
3130Count marked uops that are not bogus.
3131.It Li bogus
3132Count marked uops that are bogus.
3133.El
3134.Pp
3135The default qualifier counts both kinds of uops.
3136.It Li p4-wc-buffer Op Li ,mask= Ns Ar flags
3137.Pq "TI event"
3138Count write-combining buffer operations.
3139Qualifier
3140.Ar flags
3141contains the following strings separated by
3142.Ql +
3143characters:
3144.Pp
3145.Bl -tag -width indent -compact
3146.It Li wcb-evicts
3147WC buffer evictions due to any cause.
3148.It Li wcb-full-evict
3149WC buffer evictions due to no WC buffer being available.
3150.El
3151.Pp
3152The default qualifier counts both kinds of evictions.
3153.It Li p4-x87-assist Op Li ,mask= Ns Ar flags
3154.Pq "TS event"
3155Count the retirement of x87 instructions that required special
3156handling.
3157Qualifier
3158.Ar flags
3159contains the following strings separated by
3160.Ql +
3161characters:
3162.Pp
3163.Bl -tag -width indent -compact
3164.It Li fpsu
3165Count instructions that saw an FP stack underflow.
3166.It Li fpso
3167Count instructions that saw an FP stack overflow.
3168.It Li poao
3169Count instructions that saw an x87 output overflow.
3170.It Li poau
3171Count instructions that saw an x87 output underflow.
3172.It Li prea
3173Count instructions that needed an x87 input assist.
3174.El
3175.Pp
3176The default qualifier counts all the above types of instruction
3177retirements.
3178.It Li p4-x87-fp-uop Op Li ,mask= Ns Ar flags
3179.Pq "TI event"
3180Count x87 floating-point uops.
3181Qualifier
3182.Ar flags
3183can take the following value (which is also the default):
3184.Pp
3185.Bl -tag -width indent -compact
3186.It Li all
3187Count all x87 floating-point uops.
3188.El
3189.Pp
3190If an instruction contains more than one x87 floating-point uop, then
3191all x87 floating-point uops will be counted.
3192This event does not count x87 floating-point data movement operations.
3193.It Li p4-x87-simd-moves-uop Op Li ,mask= Ns Ar flags
3194.Pq "TI event"
3195Count each x87 FPU, MMX, SSE, or SSE2 uop that loads data, stores
3196data, or performs a register-to-register move.
3197This event does not count integer move uops.
3198Qualifier
3199.Ar flags
3200may contain the following keywords separated by
3201.Ql +
3202characters:
3203.Pp
3204.Bl -tag -width indent -compact
3205.It Li allp0
3206Count all x87 and SIMD store and move uops.
3207.It Li allp2
3208Count all x87 and SIMD load uops.
3209.El
3210.Pp
3211The default is to count all uops.
3212.Pq Errata
3213This event may be affected by processor errata N43.
3214.El
3215.Ss "Cascading P4 PMCs"
3216PMC cascading support is currently poorly implemented.
3217While individual event counters may be allocated with a
3218.Dq Li cascade
3219qualifier, the current API does not offer the ability
3220to name and allocate all the resources needed for a
3221cascaded event counter pair in a single operation.
3222.Ss "Precise Event Based Sampling"
3223Support for precise event based sampling is currently
3224unimplemented in
3225.Xr hwpmc 4 .
3226.Sh IMPLEMENTATION NOTES
3227On the i386 architecture,
3228.Fx
3229has historically allowed the use of the RDTSC instruction from
3230user-mode (i.e., at a processor CPL of 3) by any process.
3231This behaviour is preserved by
3232.Xr hwpmc 4 .
3233.Sh RETURN VALUES
3234The
3235.Fn pmc_name_of_capability ,
3236.Fn pmc_name_of_class ,
3237.Fn pmc_name_of_cputype ,
3238.Fn pmc_name_of_disposition ,
3239.Fn pmc_name_of_event ,
3240.Fn pmc_name_of_mode ,
3241and
3242.Fn pmc_name_of_state
3243functions return a pointer to the human readable form of their argument.
3244These pointers may point to statically allocated storage and must
3245not be passed to
3246.Fn free .
3247In case of an error, these functions return
3248.Dv NULL
3249and set the global variable
3250.Va errno .
3251.Pp
3252The functions
3253.Fn pmc_ncpu
3254and
3255.Fn pmc_npmc
3256return the number of CPUs and number of PMCs configured respectively;
3257in case of an error they return the value
3258\-1
3259and set the global variable
3260.Va errno .
3261.Pp
3262All other functions return the value
32630
3264if successful; otherwise the value
3265\-1
3266is returned and the global variable
3267.Va errno
3268is set to indicate the error.
3269.Sh COMPATIBILITY
3270The interface between the
3271.Nm pmc
3272library and the
3273.Xr hwpmc 4
3274driver is intended to be private to the implementation and may
3275change.
3276In order to ease forward compatibility with future versions of the
3277.Xr hwpmc 4
3278driver, applications are urged to dynamically link with the
3279.Nm pmc
3280library.
3281.Pp
3282The
3283.Nm pmc
3284API is
3285.Ud
3286.Sh ERRORS
3287A call to
3288.Fn pmc_init
3289may fail with the following errors in addition to those returned by
3290.Xr modfind 2 ,
3291.Xr modstat 2
3292and
3293.Xr hwpmc 4 :
3294.Bl -tag -width Er
3295.It Bq Er ENXIO
3296An unknown CPU type was encountered during initialization.
3297.It Bq Er EPROGMISMATCH
3298The version number of the
3299.Xr hwpmc 4
3300kernel module did not match that compiled into the
3301.Nm pmc
3302library.
3303.El
3304.Pp
3305A call to
3306.Fn pmc_capabilities ,
3307.Fn pmc_name_of_capability ,
3308.Fn pmc_name_of_disposition ,
3309.Fn pmc_name_of_state ,
3310.Fn pmc_name_of_event ,
3311.Fn pmc_name_of_mode ,
3312.Fn pmc_name_of_class ,
3313and
3314.Fn pmc_width
3315may fail with the following error:
3316.Bl -tag -width Er
3317.It Bq Er EINVAL
3318An invalid argument was passed to the function.
3319.El
3320.Pp
3321A call to
3322.Fn pmc_cpuinfo
3323or
3324.Fn pmc_ncpu
3325may fail with the following error:
3326.Bl -tag -width Er
3327.It Bq Er ENXIO
3328The
3329.Nm pmc
3330library has not been initialized.
3331.El
3332.Pp
3333A call to
3334.Fn pmc_npmc
3335may fail with the following errors:
3336.Bl -tag -width Er
3337.It Bq Er EINVAL
3338The argument passed in was out of range.
3339.It Bq Er ENXIO
3340The
3341.Nm pmc
3342library has not been initialized.
3343.El
3344.Pp
3345A call to
3346.Fn pmc_pmcinfo
3347may fail with the following errors, in addition to those returned by
3348.Xr hwpmc 4 :
3349.Bl -tag -width Er
3350.It Bq Er ENXIO
3351The
3352.Nm pmc
3353library is not yet initialized.
3354.El
3355.Pp
3356A call to
3357.Fn pmc_allocate
3358may fail with the following errors, in addition to those returned by
3359.Xr hwpmc 4 :
3360.Bl -tag -width Er
3361.It Bq Er EINVAL
3362The
3363.Fa mode
3364argument passed in had an illegal value, or the event specification
3365.Fa ctrspec
3366was unrecognized for this CPU type.
3367.El
3368.Pp
3369Calls to
3370.Fn pmc_attach ,
3371.Fn pmc_configure_logfile ,
3372.Fn pmc_detach ,
3373.Fn pmc_disable ,
3374.Fn pmc_enable ,
3375.Fn pmc_get_driver_stats ,
3376.Fn pmc_get_msr ,
3377.Fn pmc_read ,
3378.Fn pmc_release ,
3379.Fn pmc_rw ,
3380.Fn pmc_set ,
3381.Fn pmc_start ,
3382.Fn pmc_stop ,
3383.Fn pmc_write ,
3384and
3385.Fn pmc_writelog
3386may fail with the errors described in
3387.Xr hwpmc 4 .
3388.Pp
3389If a log file was configured using
3390.Fn pmc_configure_logfile
3391and the
3392.Xr hwpmc 4
3393driver encountered an error while logging data to it, then
3394logging will be stopped and a subsequent call to
3395.Fn pmc_flush_logfile
3396will fail with the error code seen by the
3397.Xr hwpmc 4
3398driver.
3399.Sh SEE ALSO
3400.Xr modfind 2 ,
3401.Xr modstat 2 ,
3402.Xr calloc 3 ,
3403.Xr pmclog 3 ,
3404.Xr hwpmc 4 ,
3405.Xr pmccontrol 8 ,
3406.Xr pmcstat 8
3407.Sh HISTORY
3408The
3409.Nm pmc
3410library first appeared in
3411.Fx 6.0 .
3412.Sh BUGS
3413The information returned by
3414.Fn pmc_cpuinfo ,
3415.Fn pmc_ncpu
3416and possibly
3417.Fn pmc_npmc
3418should really be available all the time, through a better designed
3419interface and not just when
3420.Xr hwpmc 4
3421is present in the kernel.
3422