xref: /freebsd/lib/libpmc/pmc.3 (revision 5138c36a1bf9c514074a375465592947064e0bbd)
1.\" Copyright (c) 2003-2005 Joseph Koshy.  All rights reserved.
2.\"
3.\" Redistribution and use in source and binary forms, with or without
4.\" modification, are permitted provided that the following conditions
5.\" are met:
6.\" 1. Redistributions of source code must retain the above copyright
7.\"    notice, this list of conditions and the following disclaimer.
8.\" 2. Redistributions in binary form must reproduce the above copyright
9.\"    notice, this list of conditions and the following disclaimer in the
10.\"    documentation and/or other materials provided with the distribution.
11.\"
12.\" This software is provided by Joseph Koshy ``as is'' and
13.\" any express or implied warranties, including, but not limited to, the
14.\" implied warranties of merchantability and fitness for a particular purpose
15.\" are disclaimed.  in no event shall Joseph Koshy be liable
16.\" for any direct, indirect, incidental, special, exemplary, or consequential
17.\" damages (including, but not limited to, procurement of substitute goods
18.\" or services; loss of use, data, or profits; or business interruption)
19.\" however caused and on any theory of liability, whether in contract, strict
20.\" liability, or tort (including negligence or otherwise) arising in any way
21.\" out of the use of this software, even if advised of the possibility of
22.\" such damage.
23.\"
24.\" $FreeBSD$
25.\"
26.Dd June 24, 2005
27.Os
28.Dt PMC 3
29.Sh NAME
30.Nm pmc_allocate ,
31.Nm pmc_attach ,
32.Nm pmc_capabilities ,
33.Nm pmc_configure_logfile ,
34.Nm pmc_cpuinfo ,
35.Nm pmc_detach ,
36.Nm pmc_disable ,
37.Nm pmc_enable ,
38.Nm pmc_event_names_of_class ,
39.Nm pmc_flush_logfile ,
40.Nm pmc_get_driver_stats ,
41.Nm pmc_get_msr ,
42.Nm pmc_init ,
43.Nm pmc_name_of_capability ,
44.Nm pmc_name_of_class ,
45.Nm pmc_name_of_cputype ,
46.Nm pmc_name_of_event ,
47.Nm pmc_name_of_mode ,
48.Nm pmc_name_of_state ,
49.Nm pmc_ncpu ,
50.Nm pmc_npmc ,
51.Nm pmc_pmcinfo ,
52.Nm pmc_read ,
53.Nm pmc_release ,
54.Nm pmc_rw ,
55.Nm pmc_set ,
56.Nm pmc_start ,
57.Nm pmc_stop ,
58.Nm pmc_width ,
59.Nm pmc_write ,
60.Nm pmc_writelog
61.Nd programming API for using hardware performance monitoring counters
62.Sh LIBRARY
63.Lb libpmc
64.Sh SYNOPSIS
65.In pmc.h
66.Ft int
67.Fo pmc_allocate
68.Fa "const char *eventspecifier"
69.Fa "enum pmc_mode mode"
70.Fa "uint32_t flags"
71.Fa "uint32_t cpu"
72.Fa "pmc_id_t *pmcid"
73.Fc
74.Ft int
75.Fn pmc_attach "pmc_id_t pmcid" "pid_t pid"
76.Ft int
77.Fn pmc_capabilities "pmc_id_t pmc" "uint32_t *caps"
78.Ft int
79.Fn pmc_configure_logfile "int fd"
80.Ft int
81.Fn pmc_cpuinfo "const struct pmc_cpuinfo **cpu_info"
82.Ft int
83.Fn pmc_detach "pmc_id_t pmcid" "pid_t pid"
84.Ft int
85.Fn pmc_disable "uint32_t cpu" "int pmc"
86.Ft int
87.Fn pmc_enable "uint32_t cpu" "int pmc"
88.Ft int
89.Fo pmc_event_names_of_class
90.Fa "enum pmc_class cl"
91.Fa "const char ***eventnames"
92.Fa "int *nevents"
93.Fc
94.Ft int
95.Fn pmc_flush_logfile void
96.Ft int
97.Fn pmc_get_driver_stats "struct pmc_driverstats *gms"
98.Ft int
99.Fn pmc_get_msr "pmc_id_t pmc" "uint32_t *msr"
100.Ft int
101.Fn pmc_init void
102.Ft "const char *"
103.Fn pmc_name_of_capability "enum pmc_caps pc"
104.Ft "const char *"
105.Fn pmc_name_of_class "enum pmc_class pc"
106.Ft "const char *"
107.Fn pmc_name_of_cputype "enum pmc_cputype ct"
108.Ft "const char *"
109.Fn pmc_name_of_disposition "enum pmc_disp pd"
110.Ft "const char *"
111.Fn pmc_name_of_event "enum pmc_event pe"
112.Ft "const char *"
113.Fn pmc_name_of_mode "enum pmc_mode pm"
114.Ft "const char *"
115.Fn pmc_name_of_state "enum pmc_state ps"
116.Ft int
117.Fn pmc_ncpu void
118.Ft int
119.Fn pmc_npmc "uint32_t cpu"
120.Ft int
121.Fn pmc_pmcinfo "uint32_t cpu" "struct pmc_pmcinfo **pmc_info"
122.Ft int
123.Fn pmc_read "pmc_id_t pmc" "pmc_value_t *value"
124.Ft int
125.Fn pmc_release "pmc_id_t pmc"
126.Ft int
127.Fn pmc_rw "pmc_id_t pmc" "pmc_value_t newvalue" "pmc_value_t *oldvaluep"
128.Ft int
129.Fn pmc_set "pmc_id_t pmc" "pmc_value_t value"
130.Ft int
131.Fn pmc_start "pmc_id_t pmc"
132.Ft int
133.Fn pmc_stop "pmc_id_t pmc"
134.Ft int
135.Fn pmc_write "pmc_id_t pmc" "pmc_value_t value"
136.Ft int
137.Fn pmc_writelog "uint32_t userdata"
138.Ft int
139.Fn pmc_width "pmc_id_t pmc" "uint32_t *width"
140.Sh DESCRIPTION
141These functions implement a high-level library for using the
142system's hardware performance counters.
143.Pp
144PMCs are allocated using
145.Fn pmc_allocate ,
146released using
147.Fn pmc_release
148and read using
149.Fn pmc_read .
150Allocated PMCs may be started or stopped at any time using
151.Fn pmc_start
152and
153.Fn pmc_stop
154respectively.
155An allocated PMC may be of
156.Dq global
157scope, meaning that the PMC measures system-wide events, or
158.Dq process-private
159scope, meaning that the PMC only counts hardware events when
160the allocating process (or, optionally, its children)
161are active.
162.Pp
163PMCs may further be in
164.Dq "counting mode" ,
165or in
166.Dq "sampling mode" .
167Sampling mode PMCs deliver an interrupt to the CPU after
168a configured number of hardware events have been seen.
169A process-private sampling mode PMC will cause its owner
170process to get periodic
171.Dv SIGPROF
172interrupts, while a global sampling mode PMC is used to
173do system-wide statistical sampling (see
174.Xr hwpmc 4 ) .
175The sampling rate desired of a sampling-mode PMC is set using
176.Fn pmc_set .
177Counting mode PMCs do not interrupt the CPU; their values
178can be read using
179.Fn pmc_read .
180.Pp
181System-wide statistical sampling is enabled by allocating
182at least one sampling mode PMC with
183global scope, and by configuring a log file using
184.Fn pmc_configure_logfile .
185The
186.Xr hwpmc 4
187driver manages system-wide statistical sampling; for more
188information please see
189.Xr hwpmc 4 .
190.Ss Application Programming Interface
191The function
192.Fn pmc_init
193initializes the
194.Xr pmc 3
195library.
196This function must be called first, before any of the other
197functions in the library.
198.Pp
199The function
200.Fn pmc_allocate
201allocates a counter that counts the events named by
202.Fa eventspecifier ,
203and writes the allocated counter ID to
204.Fa *pmcid .
205Argument
206.Fa eventspecifier
207comprises a PMC event name followed by an optional comma separated
208list of keywords and qualifiers.
209The allowed syntax for
210.Fa eventspecifier
211is processor architecture specific and is listed in section
212.Sx "EVENT SPECIFIERS"
213below.
214The desired PMC mode is specified by
215.Fa mode ,
216and any mode specific modifiers are specified using
217.Fa flags .
218The
219.Fa cpu
220argument is the value
221.Dv PMC_CPU_ANY ,
222or names the CPU the allocation is to be on.
223Requesting a specific CPU only makes sense for global PMCs;
224process-private PMC allocations should always specify
225.Dv PMC_CPU_ANY .
226.Pp
227By default, a PMC configured in process-virtual counting mode is set up
228to profile its owner process.
229The function
230.Fn pmc_attach
231may be used to attach the PMC to a different process.
232It
233needs to be called before the counter is first started
234with
235.Fn pmc_start .
236The function
237.Fn pmc_detach
238may be used to detach a PMC from a process it was attached to
239using a prior call to
240.Fn pmc_attach .
241.Pp
242The function
243.Fn pmc_release
244releases a PMC previously allocated with
245.Fn pmc_allocate .
246This function call implicitly detaches the PMC from all its target
247processes.
248.Pp
249An allocated PMC may be started and stopped using
250.Fn pmc_start
251and
252.Fn pmc_stop
253respectively.
254.Pp
255The current value of a PMC may be read with
256.Fn pmc_read
257and written using
258.Fn pmc_write ,
259provided the underlying hardware supports these operations on
260the allocated PMC.
261The read and write operations may be combined using
262.Fn pmc_rw .
263.Pp
264The function
265.Fn pmc_capabilities
266sets argument
267.Fa caps
268to a bitmask of capabilities supported by the PMC denoted by
269argument
270.Fa pmc .
271The function
272.Fn pmc_width
273sets argument
274.Fa width
275to the width of the PMC denoted by argument
276.Fa pmc .
277.Pp
278The
279.Fn pmc_configure_logfile
280function causes the
281.Xr hwpmc 4
282driver to log performance data to the file corresponding
283to the process' file handle
284.Fa fd .
285If argument
286.Fa fd
287is \-1, then any previously configured logging is reset
288and all data queued to be written are discarded.
289.Pp
290The
291.Fn pmc_flush_logfile
292function will send all data queued inside the
293.Xr hwpmc 4
294driver to the configured log file before returning.
295The
296.Fn pmc_writelog
297function will append a log entry containing the argument
298.Fa userdata
299to the log file.
300.Pp
301The function
302.Fn pmc_set
303configures a sampling PMC
304.Fa pmc
305to interrupt every
306.Fa value
307events.
308For counting PMCs,
309.Fn pmc_set
310sets the initial value of the PMC to
311.Fa value .
312.Pp
313The function
314.Fn pmc_get_driver_stats
315copies a snapshot of the usage statistics maintained by
316.Xr hwpmc 4
317into the memory area pointed to by argument
318.Fa gms .
319.Ss Signal Handling Requirements
320Applications using PMCs are required to handle the following signals:
321.Bl -tag -width indent
322.It Dv SIGBUS
323When the
324.Xr hwpmc 4
325module is unloaded using
326.Xr kldunload 8 ,
327processes that have PMCs allocated to them will be sent a
328.Dv SIGBUS
329signal.
330.It Dv SIGIO
331The
332.Xr hwpmc 4
333driver will send a PMC owning process a
334.Dv SIGIO
335signal if:
336.Bl -bullet
337.It
338Any process-mode PMC allocated by it loses all its
339target processes.
340.It
341The driver encounters an error when writing log data to a
342configured log file.
343This error may be retrieved by a subsequent call to
344.Fn pmc_flush_logfile .
345.El
346.El
347.Ss Convenience Functions
348The function
349.Fn pmc_ncpu
350returns the number of CPUs present in the system.
351.Pp
352The function
353.Fn pmc_npmc
354returns the number of PMCs supported on CPU
355.Fa cpu .
356The function
357.Fn pmc_cpuinfo
358sets argument
359.Fa cpu_info
360to point to a structure with information about the system's CPUs.
361Function
362.Fn pmc_pmcinfo
363returns information about the current state of CPU
364.Fa cpu Ns 's
365PMCs.
366This function sets argument
367.Fa *pmc_info
368to point to a memory area allocated with
369.Xr calloc 3 .
370The caller is expected to
371.Fn free
372the area when done.
373.Pp
374The functions
375.Fn pmc_name_of_capability ,
376.Fn pmc_name_of_class ,
377.Fn pmc_name_of_cputype ,
378.Fn pmc_name_of_disposition ,
379.Fn pmc_name_of_event ,
380.Fn pmc_name_of_mode
381and
382.Fn pmc_name_of_state
383are useful for code wanting to print error messages.
384They return
385.Vt "const char *"
386pointers to human-readable representations of their arguments.
387These return values should not be freed using
388.Xr free 3 .
389.Pp
390The function
391.Fn pmc_event_names_of_class
392returns a list of event names supported by a given PMC class
393.Fa cl .
394On successful return, an array of
395.Vt "const char *"
396pointers to the names of valid events supported by class
397.Fa cl
398is allocated by the library using
399.Xr malloc 3 ,
400and a pointer to this array is returned in the location pointed to by
401.Fa eventnames .
402The number of pointers allocated is returned in the location pointed
403to by
404.Fa nevents .
405.Ss Administration
406Individual PMCs may be enabled or disabled on a given CPU using
407.Fn pmc_enable
408and
409.Fn pmc_disable
410respectively.
411For these functions,
412.Fa cpu
413is the CPU number, and
414.Fa pmc
415is the index of the PMC to be operated on.
416Only the super-user is allowed to enable and disable PMCs.
417.Ss x86 Architecture Specific API
418The
419.Fn pmc_get_msr
420function returns the processor model specific register number
421associated with
422.Fa pmc .
423Applications may use the x86
424.Ic RDPMC
425instruction to directly read the contents of the PMC.
426.Sh EVENT SPECIFIERS
427Event specifiers are strings comprising an event name, followed by
428optional parameters modifying the semantics of the hardware event
429being probed.
430Event names are PMC architecture dependent, but the
431.Xr pmc 3
432library defines machine independent aliases for commonly used
433events.
434.Ss Event Name Aliases
435Event name aliases are CPU architecture independent names for commonly
436used events.
437The following aliases are known to this version of the
438.Xr pmc 3
439library:
440.Bl -tag -width indent
441.It Li branches
442Measure the number of branches retired.
443.It Li branch-mispredicts
444Measure the number of retired branches that were mispredicted.
445.It Li cycles
446Measure processor cycles.
447This event is implemented using the processor's Time Stamp Counter
448register.
449.It Li dc-misses
450Measure the number of data cache misses.
451.It Li ic-misses
452Measure the number of instruction cache misses.
453.It Li instructions
454Measure the number of instructions retired.
455.It Li interrupts
456Measure the number of interrupts seen.
457.El
458.Ss Time Stamp Counter (TSC)
459The timestamp counter is a monotonically non-decreasing counter that
460counts processor cycles.
461.Pp
462In the i386 architecture, this counter may
463be selected by requesting an event with event specifier
464.Dq Li tsc .
465The
466.Dq Li tsc
467event does not support any further qualifiers.
468It can only be allocated in system-wide counting mode,
469and is a read-only counter.
470Multiple processes are allowed to allocate the TSC.
471Once allocated, it may be read using the
472.Fn pmc_read
473function, or by using the RDTSC instruction.
474.Ss AMD (K7) PMCs
475These PMCs are present in the
476.Tn "AMD Athlon"
477series of CPUs and are documented in:
478.Rs
479.%B "AMD Athlon Processor x86 Code Optimization Guide"
480.%N "Publication No. 22007"
481.%D "February 2002"
482.%Q "Advanced Micro Devices, Inc."
483.Re
484.Pp
485Event specifiers for AMD K7 PMCs can have the following optional
486qualifiers:
487.Bl -tag -width indent
488.It Li count= Ns Ar value
489Configure the counter to increment only if the number of configured
490events measured in a cycle is greater than or equal to
491.Ar value .
492.It Li edge
493Configure the counter to only count negated-to-asserted transitions
494of the conditions expressed by the other qualifiers.
495In other words, the counter will increment only once whenever a given
496condition becomes true, irrespective of the number of clocks during
497which the condition remains true.
498.It Li inv
499Invert the sense of comparison when the
500.Dq Li count
501qualifier is present, making the counter increment when the
502number of events per cycle is less than the value specified by
503the
504.Dq Li count
505qualifier.
506.It Li os
507Configure the PMC to count events happening at privilege level 0.
508.It Li unitmask= Ns Ar mask
509This qualifier is used to further qualify a select few events,
510.Dq Li k7-dc-refills-from-l2 ,
511.Dq Li k7-dc-refills-from-system
512and
513.Dq Li k7-dc-writebacks .
514Here
515.Ar mask
516is a string of the following characters optionally separated by
517.Ql +
518characters:
519.Pp
520.Bl -tag -width indent -compact
521.It Li m
522Count operations for lines in the
523.Dq Modified
524state.
525.It Li o
526Count operations for lines in the
527.Dq Owner
528state.
529.It Li e
530Count operations for lines in the
531.Dq Exclusive
532state.
533.It Li s
534Count operations for lines in the
535.Dq Shared
536state.
537.It Li i
538Count operations for lines in the
539.Dq Invalid
540state.
541.El
542.Pp
543If no
544.Dq Li unitmask
545qualifier is specified, the default is to count events for cache
546lines in any of the above states.
547.It Li usr
548Configure the PMC to count events occurring at privilege levels 1, 2
549or 3.
550.El
551.Pp
552If neither of the
553.Dq Li os
554or
555.Dq Li usr
556qualifiers is specified, the default is to enable both.
557.Pp
558The event specifiers supported on AMD K7 PMCs are:
559.Bl -tag -width indent
560.It Li k7-dc-accesses
561Count data cache accesses.
562.It Li k7-dc-misses
563Count data cache misses.
564.It Li k7-dc-refills-from-l2 Op Li ,unitmask= Ns Ar mask
565Count data cache refills from L2 cache.
566This event may be further qualified using the
567.Dq Li unitmask
568qualifier.
569.It Li k7-dc-refills-from-system Op Li ,unitmask= Ns Ar mask
570Count data cache refills from system memory.
571This event may be further qualified using the
572.Dq Li unitmask
573qualifier.
574.It Li k7-dc-writebacks Op Li ,unitmask= Ns Ar mask
575Count data cache writebacks.
576This event may be further qualified using the
577.Dq Li unitmask
578qualifier.
579.It Li k7-l1-dtlb-miss-and-l2-dtlb-hits
580Count L1 DTLB misses and L2 DTLB hits.
581.It Li k7-l1-and-l2-dtlb-misses
582Count L1 and L2 DTLB misses.
583.It Li k7-misaligned-references
584Count misaligned data references.
585.It Li k7-ic-fetches
586Count instruction cache fetches.
587.It Li k7-ic-misses
588Count instruction cache misses.
589.It Li k7-l1-itlb-misses
590Count L1 ITLB misses that are L2 ITLB hits.
591.It Li k7-l1-l2-itlb-misses
592Count L1 (and L2) ITLB misses.
593.It Li k7-retired-instructions
594Count all retired instructions.
595.It Li k7-retired-ops
596Count retired ops.
597.It Li k7-retired-branches
598Count all retired branches (conditional, unconditional, exceptions
599and interrupts).
600.It Li k7-retired-branches-mispredicted
601Count all mispredicted retired branches.
602.It Li k7-retired-taken-branches
603Count retired taken branches.
604.It Li k7-retired-taken-branches-mispredicted
605Count mispredicted taken branches that were retired.
606.It Li k7-retired-far-control-transfers
607Count retired far control transfers.
608.It Li k7-retired-resync-branches
609Count retired resync branches (non control transfer branches).
610.It Li k7-interrupts-masked-cycles
611Count the number of cycles when the processor's
612.Va IF
613flag was zero.
614.It Li k7-interrupts-masked-while-pending-cycles
615Count the number of cycles interrupts were masked while pending due
616to the processor's
617.Va IF
618flag being zero.
619.It Li k7-hardware-interrupts
620Count the number of taken hardware interrupts.
621.El
622.Ss AMD (K8) PMCs
623These PMCs are present in the
624.Tn "AMD Athlon64"
625and
626.Tn "AMD Opteron"
627series of CPUs.
628They are documented in:
629.Rs
630.%B "BIOS and Kernel Developer's Guide for the AMD Athlon(tm) 64 and AMD Opteron Processors"
631.%N "Publication No. 26094"
632.%D "April 2004"
633.%Q "Advanced Micro Devices, Inc."
634.Re
635.Pp
636Event specifiers for AMD K8 PMCs can have the following optional
637qualifiers:
638.Bl -tag -width indent
639.It Li count= Ns Ar value
640Configure the counter to increment only if the number of configured
641events measured in a cycle is greater than or equal to
642.Ar value .
643.It Li edge
644Configure the counter to only count negated-to-asserted transitions
645of the conditions expressed by the other fields.
646In other words, the counter will increment only once whenever a given
647condition becomes true, irrespective of the number of clocks during
648which the condition remains true.
649.It Li inv
650Invert the sense of comparison when the
651.Dq Li count
652qualifier is present, making the counter increment when the
653number of events per cycle is less than the value specified by
654the
655.Dq Li count
656qualifier.
657.It Li mask= Ns Ar qualifier
658Many event specifiers for AMD K8 PMCs need to be additionally
659qualified using a mask qualifier.
660These additional qualifiers are event-specific and are documented
661along with their associated event specifiers below.
662.It Li os
663Configure the PMC to count events happening at privilege level 0.
664.It Li usr
665Configure the PMC to count events occurring at privilege levels 1, 2
666or 3.
667.El
668.Pp
669If neither of the
670.Dq Li os
671or
672.Dq Li usr
673qualifiers is specified, the default is to enable both.
674.Pp
675The event specifiers supported on AMD K8 PMCs are:
676.Bl -tag -width indent
677.It Li k8-bu-cpu-clk-unhalted
678Count the number of clock cycles when the CPU is not in the HLT or
679STPCLK states.
680.It Li k8-bu-fill-request-l2-miss Op Li ,mask= Ns Ar qualifier
681Count fill requests that missed in the L2 cache.
682This event may be further qualified using
683.Ar qualifier ,
684which is a
685.Ql +
686separated set of the following keywords:
687.Pp
688.Bl -tag -width indent -compact
689.It Li dc-fill
690Count data cache fill requests.
691.It Li ic-fill
692Count instruction cache fill requests.
693.It Li tlb-reload
694Count TLB reloads.
695.El
696.Pp
697The default is to count all types of requests.
698.It Li k8-bu-internal-l2-request Op Li ,mask= Ns Ar qualifier
699Count internally generated requests to the L2 cache.
700This event may be further qualified using
701.Ar qualifier ,
702which is a
703.Ql +
704separated set of the following keywords:
705.Pp
706.Bl -tag -width indent -compact
707.It Li cancelled
708Count cancelled requests.
709.It Li dc-fill
710Count data cache fill requests.
711.It Li ic-fill
712Count instruction cache fill requests.
713.It Li tag-snoop
714Count tag snoop requests.
715.It Li tlb-reload
716Count TLB reloads.
717.El
718.Pp
719The default is to count all types of requests.
720.It Li k8-dc-access
721Count data cache accesses including microcode scratchpad accesses.
722.It Li k8-dc-copyback Op Li ,mask= Ns Ar qualifier
723Count data cache copyback operations.
724This event may be further qualified using
725.Ar qualifier ,
726which is a
727.Ql +
728separated set of the following keywords:
729.Pp
730.Bl -tag -width indent -compact
731.It Li exclusive
732Count operations for lines in the
733.Dq exclusive
734state.
735.It Li invalid
736Count operations for lines in the
737.Dq invalid
738state.
739.It Li modified
740Count operations for lines in the
741.Dq modified
742state.
743.It Li owner
744Count operations for lines in the
745.Dq owner
746state.
747.It Li shared
748Count operations for lines in the
749.Dq shared
750state.
751.El
752.Pp
753The default is to count operations for lines in all the
754above states.
755.It Li k8-dc-dcache-accesses-by-locks Op Li ,mask= Ns Ar qualifier
756Count data cache accesses by lock instructions.
757This event is only available on processors of revision C or later
758vintage.
759This event may be further qualified using
760.Ar qualifier ,
761which is a
762.Ql +
763separated set of the following keywords:
764.Pp
765.Bl -tag -width indent -compact
766.It Li accesses
767Count data cache accesses by lock instructions.
768.It Li misses
769Count data cache misses by lock instructions.
770.El
771.Pp
772The default is to count all accesses.
773.It Li k8-dc-dispatched-prefetch-instructions Op Li ,mask= Ns Ar qualifier
774Count the number of dispatched prefetch instructions.
775This event may be further qualified using
776.Ar qualifier ,
777which is a
778.Ql +
779separated set of the following keywords:
780.Pp
781.Bl -tag -width indent -compact
782.It Li load
783Count load operations.
784.It Li nta
785Count non-temporal operations.
786.It Li store
787Count store operations.
788.El
789.Pp
790The default is to count all operations.
791.It Li k8-dc-l1-dtlb-miss-and-l2-dtlb-hit
792Count L1 DTLB misses that are L2 DTLB hits.
793.It Li k8-dc-l1-dtlb-miss-and-l2-dtlb-miss
794Count L1 DTLB misses that are also misses in the L2 DTLB.
795.It Li k8-dc-microarchitectural-early-cancel-of-an-access
796Count microarchitectural early cancels of data cache accesses.
797.It Li k8-dc-microarchitectural-late-cancel-of-an-access
798Count microarchitectural late cancels of data cache accesses.
799.It Li k8-dc-misaligned-data-reference
800Count misaligned data references.
801.It Li k8-dc-miss
802Count data cache misses.
803.It Li k8-dc-one-bit-ecc-error Op Li ,mask= Ns Ar qualifier
804Count one bit ECC errors found by the scrubber.
805This event may be further qualified using
806.Ar qualifier ,
807which is a
808.Ql +
809separated set of the following keywords:
810.Pp
811.Bl -tag -width indent -compact
812.It Li scrubber
813Count scrubber detected errors.
814.It Li piggyback
815Count piggyback scrubber errors.
816.El
817.Pp
818The default is to count both kinds of errors.
819.It Li k8-dc-refill-from-l2 Op Li ,mask= Ns Ar qualifier
820Count data cache refills from L2 cache.
821This event may be further qualified using
822.Ar qualifier ,
823which is a
824.Ql +
825separated set of the following keywords:
826.Pp
827.Bl -tag -width indent -compact
828.It Li exclusive
829Count operations for lines in the
830.Dq exclusive
831state.
832.It Li invalid
833Count operations for lines in the
834.Dq invalid
835state.
836.It Li modified
837Count operations for lines in the
838.Dq modified
839state.
840.It Li owner
841Count operations for lines in the
842.Dq owner
843state.
844.It Li shared
845Count operations for lines in the
846.Dq shared
847state.
848.El
849.Pp
850The default is to count operations for lines in all the
851above states.
852.It Li k8-dc-refill-from-system Op Li ,mask= Ns Ar qualifier
853Count data cache refills from system memory.
854This event may be further qualified using
855.Ar qualifier ,
856which is a
857.Ql +
858separated set of the following keywords:
859.Pp
860.Bl -tag -width indent -compact
861.It Li exclusive
862Count operations for lines in the
863.Dq exclusive
864state.
865.It Li invalid
866Count operations for lines in the
867.Dq invalid
868state.
869.It Li modified
870Count operations for lines in the
871.Dq modified
872state.
873.It Li owner
874Count operations for lines in the
875.Dq owner
876state.
877.It Li shared
878Count operations for lines in the
879.Dq shared
880state.
881.El
882.Pp
883The default is to count operations for lines in all the
884above states.
885.It Li k8-fp-dispatched-fpu-ops Op Li ,mask= Ns Ar qualifier
886Count the number of dispatched FPU ops.
887This event is supported in revision B and later CPUs.
888This event may be further qualified using
889.Ar qualifier ,
890which is a
891.Ql +
892separated set of the following keywords:
893.Pp
894.Bl -tag -width indent -compact
895.It Li add-pipe-excluding-junk-ops
896Count add pipe ops excluding junk ops.
897.It Li add-pipe-junk-ops
898Count junk ops in the add pipe.
899.It Li multiply-pipe-excluding-junk-ops
900Count multiply pipe ops excluding junk ops.
901.It Li multiply-pipe-junk-ops
902Count junk ops in the multiply pipe.
903.It Li store-pipe-excluding-junk-ops
904Count store pipe ops excluding junk ops.
905.It Li store-pipe-junk-ops
906Count junk ops in the store pipe.
907.El
908.Pp
909The default is to count all types of ops.
910.It Li k8-fp-cycles-with-no-fpu-ops-retired
911Count cycles when no FPU ops were retired.
912This event is supported in revision B and later CPUs.
913.It Li k8-fp-dispatched-fpu-fast-flag-ops
914Count dispatched FPU ops that use the fast flag interface.
915This event is supported in revision B and later CPUs.
916.It Li k8-fr-decoder-empty
917Count cycles when there was nothing to dispatch (i.e., the decoder
918was empty).
919.It Li k8-fr-dispatch-stalls
920Count all dispatch stalls.
921.It Li k8-fr-dispatch-stall-for-segment-load
922Count dispatch stalls for segment loads.
923.It Li k8-fr-dispatch-stall-for-serialization
924Count dispatch stalls for serialization.
925.It Li k8-fr-dispatch-stall-from-branch-abort-to-retire
926Count dispatch stalls from branch abort to retiral.
927.It Li k8-fr-dispatch-stall-when-fpu-is-full
928Count dispatch stalls when the FPU is full.
929.It Li k8-fr-dispatch-stall-when-ls-is-full
930Count dispatch stalls when the load/store unit is full.
931.It Li k8-fr-dispatch-stall-when-reorder-buffer-is-full
932Count dispatch stalls when the reorder buffer is full.
933.It Li k8-fr-dispatch-stall-when-reservation-stations-are-full
934Count dispatch stalls when reservation stations are full.
935.It Li k8-fr-dispatch-stall-when-waiting-for-all-to-be-quiet
936Count dispatch stalls when waiting for all to be quiet.
937.\" XXX What does "waiting for all to be quiet" mean?
938.It Li k8-fr-dispatch-stall-when-waiting-far-xfer-or-resync-branch-pending
939Count dispatch stalls when a far control transfer or a resync branch
940is pending.
941.It Li k8-fr-fpu-exceptions Op Li ,mask= Ns Ar qualifier
942Count FPU exceptions.
943This event is supported in revision B and later CPUs.
944This event may be further qualified using
945.Ar qualifier ,
946which is a
947.Ql +
948separated set of the following keywords:
949.Pp
950.Bl -tag -width indent -compact
951.It Li sse-and-x87-microtraps
952Count SSE and x87 microtraps.
953.It Li sse-reclass-microfaults
954Count SSE reclass microfaults.
955.It Li sse-retype-microfaults
956Count SSE retype microfaults.
957.It Li x87-reclass-microfaults
958Count x87 reclass microfaults.
959.El
960.Pp
961The default is to count all types of exceptions.
962.It Li k8-fr-interrupts-masked-cycles
963Count cycles when interrupts were masked (i.e., CPU RFLAGS field IF was zero).
964.It Li k8-fr-interrupts-masked-while-pending-cycles
965Count cycles when interrupts were masked while pending (i.e., cycles
966when INTR was asserted while CPU RFLAGS field IF was zero).
967.It Li k8-fr-number-of-breakpoints-for-dr0
968Count the number of breakpoints for DR0.
969.It Li k8-fr-number-of-breakpoints-for-dr1
970Count the number of breakpoints for DR1.
971.It Li k8-fr-number-of-breakpoints-for-dr2
972Count the number of breakpoints for DR2.
973.It Li k8-fr-number-of-breakpoints-for-dr3
974Count the number of breakpoints for DR3.
975.It Li k8-fr-retired-branches
976Count retired branches including exceptions and interrupts.
977.It Li k8-fr-retired-branches-mispredicted
978Count mispredicted retired branches.
979.It Li k8-fr-retired-far-control-transfers
980Count retired far control transfers (which are always mispredicted).
981.It Li k8-fr-retired-fastpath-double-op-instructions Op Li ,mask= Ns Ar qualifier
982Count retired fastpath double op instructions.
983This event is supported in revision B and later CPUs.
984This event may be further qualified using
985.Ar qualifier ,
986which is a
987.Ql +
988separated set of the following keywords:
989.Pp
990.Bl -tag -width indent -compact
991.It Li low-op-pos-0
992Count instructions with the low op in position 0.
993.It Li low-op-pos-1
994Count instructions with the low op in position 1.
995.It Li low-op-pos-2
996Count instructions with the low op in position 2.
997.El
998.Pp
999The default is to count all types of instructions.
1000.It Li k8-fr-retired-fpu-instructions Op Li ,mask= Ns Ar qualifier
1001Count retired FPU instructions.
1002This event is supported in revision B and later CPUs.
1003This event may be further qualified using
1004.Ar qualifier ,
1005which is a
1006.Ql +
1007separated set of the following keywords:
1008.Pp
1009.Bl -tag -width indent -compact
1010.It Li mmx-3dnow
1011Count MMX and 3DNow!\& instructions.
1012.It Li packed-sse-sse2
1013Count packed SSE and SSE2 instructions.
1014.It Li scalar-sse-sse2
1015Count scalar SSE and SSE2 instructions.
1016.It Li x87
1017Count x87 instructions.
1018.El
1019.Pp
1020The default is to count all types of instructions.
1021.It Li k8-fr-retired-near-returns
1022Count retired near returns.
1023.It Li k8-fr-retired-near-returns-mispredicted
1024Count mispredicted near returns.
1025.It Li k8-fr-retired-resyncs
1026Count retired resyncs (non-control transfer branches).
1027.It Li k8-fr-retired-taken-hardware-interrupts
1028Count retired taken hardware interrupts.
1029.It Li k8-fr-retired-taken-branches
1030Count retired taken branches.
1031.It Li k8-fr-retired-taken-branches-mispredicted
1032Count retired taken branches that were mispredicted.
1033.It Li k8-fr-retired-taken-branches-mispredicted-by-addr-miscompare
1034Count retired taken branches that were mispredicted only due to an
1035address miscompare.
1036.It Li k8-fr-retired-uops
1037Count retired uops.
1038.It Li k8-fr-retired-x86-instructions
1039Count retired x86 instructions including exceptions and interrupts.
1040.It Li k8-ic-fetch
1041Count instruction cache fetches.
1042.It Li k8-ic-instruction-fetch-stall
1043Count cycles in stalls due to instruction fetch.
1044.It Li k8-ic-l1-itlb-miss-and-l2-itlb-hit
1045Count L1 ITLB misses that are L2 ITLB hits.
1046.It Li k8-ic-l1-itlb-miss-and-l2-itlb-miss
1047Count ITLB misses that miss in both L1 and L2 ITLBs.
1048.It Li k8-ic-microarchitectural-resync-by-snoop
1049Count microarchitectural resyncs caused by snoops.
1050.It Li k8-ic-miss
1051Count instruction cache misses.
1052.It Li k8-ic-refill-from-l2
1053Count instruction cache refills from L2 cache.
1054.It Li k8-ic-refill-from-system
1055Count instruction cache refills from system memory.
1056.It Li k8-ic-return-stack-hits
1057Count hits to the return stack.
1058.It Li k8-ic-return-stack-overflow
1059Count overflows of the return stack.
1060.It Li k8-ls-buffer2-full
1061Count load/store buffer2 full events.
1062.It Li k8-ls-locked-operation Op Li ,mask= Ns Ar qualifier
1063Count locked operations.
1064For revision C and later CPUs, the following qualifiers are supported:
1065.Pp
1066.Bl -tag -width indent -compact
1067.It Li cycles-in-request
1068Count the number of cycles in the lock request/grant stage.
1069.It Li cycles-to-complete
1070Count the number of cycles a lock takes to complete once it is
1071non-speculative and is the oldest load/store operation.
1072.It Li locked-instructions
1073Count the number of lock instructions executed.
1074.El
1075.Pp
1076The default is to count the number of lock instructions executed.
1077.It Li k8-ls-microarchitectural-late-cancel
1078Count microarchitectural late cancels of operations in the load/store
1079unit.
1080.It Li k8-ls-microarchitectural-resync-by-self-modifying-code
1081Count microarchitectural resyncs caused by self-modifying code.
1082.It Li k8-ls-microarchitectural-resync-by-snoop
1083Count microarchitectural resyncs caused by snoops.
1084.It Li k8-ls-retired-cflush-instructions
1085Count retired CLFLUSH instructions.
1086.It Li k8-ls-retired-cpuid-instructions
1087Count retired CPUID instructions.
1088.It Li k8-ls-segment-register-load Op Li ,mask= Ns Ar qualifier
1089Count segment register loads.
1090This event may be further qualified using
1091.Ar qualifier ,
1092which is a
1093.Ql +
1094separated set of the following keywords:
1095.Bl -tag -width indent -compact
1096.It Li cs
1097Count CS register loads.
1098.It Li ds
1099Count DS register loads.
1100.It Li es
1101Count ES register loads.
1102.It Li fs
1103Count FS register loads.
1104.It Li gs
1105Count GS register loads.
1106.\" .It Li hs
1107.\" Count HS register loads.
1108.\" XXX "HS" register?
1109.It Li ss
1110Count SS register loads.
1111.El
1112.Pp
1113The default is to count all types of loads.
1114.It Li k8-nb-memory-controller-bypass-saturation Op Li ,mask= Ns Ar qualifier
1115Count memory controller bypass counter saturation events.
1116This event may be further qualified using
1117.Ar qualifier ,
1118which is a
1119.Ql +
1120separated set of the following keywords:
1121.Pp
1122.Bl -tag -width indent -compact
1123.It Li dram-controller-interface-bypass
1124Count DRAM controller interface bypass.
1125.It Li dram-controller-queue-bypass
1126Count DRAM controller queue bypass.
1127.It Li memory-controller-hi-pri-bypass
1128Count memory controller high priority bypasses.
1129.It Li memory-controller-lo-pri-bypass
1130Count memory controller low priority bypasses.
1131.El
1132.Pp
1133.It Li k8-nb-memory-controller-dram-slots-missed
1134Count memory controller DRAM command slots missed (in MemClks).
1135.It Li k8-nb-memory-controller-page-access-event Op Li ,mask= Ns Ar qualifier
1136Count memory controller page access events.
1137This event may be further qualified using
1138.Ar qualifier ,
1139which is a
1140.Ql +
1141separated set of the following keywords:
1142.Pp
1143.Bl -tag -width indent -compact
1144.It Li page-conflict
1145Count page conflicts.
1146.It Li page-hit
1147Count page hits.
1148.It Li page-miss
1149Count page misses.
1150.El
1151.Pp
1152The default is to count all types of events.
1153.It Li k8-nb-memory-controller-page-table-overflow
1154Count memory controller page table overflow events.
1155.It Li k8-nb-probe-result Op Li ,mask= Ns Ar qualifier
1156Count probe events.
1157This event may be further qualified using
1158.Ar qualifier ,
1159which is a
1160.Ql +
1161separated set of the following keywords:
1162.Pp
1163.Bl -tag -width indent -compact
1164.It Li probe-hit
1165Count all probe hits.
1166.It Li probe-hit-dirty-no-memory-cancel
1167Count probe hits without memory cancels.
1168.It Li probe-hit-dirty-with-memory-cancel
1169Count probe hits with memory cancels.
1170.It Li probe-miss
1171Count probe misses.
1172.El
1173.It Li k8-nb-sized-commands Op Li ,mask= Ns Ar qualifier
1174Count sized commands issued.
1175This event may be further qualified using
1176.Ar qualifier ,
1177which is a
1178.Ql +
1179separated set of the following keywords:
1180.Pp
1181.Bl -tag -width indent -compact
1182.It Li nonpostwrszbyte
1183.It Li nonpostwrszdword
1184.It Li postwrszbyte
1185.It Li postwrszdword
1186.It Li rdszbyte
1187.It Li rdszdword
1188.It Li rdmodwr
1189.El
1190.Pp
1191The default is to count all types of commands.
1192.It Li k8-nb-memory-controller-turnaround Op Li ,mask= Ns Ar qualifier
1193Count memory controller turnaround events.
1194This event may be further qualified using
1195.Ar qualifier ,
1196which is a
1197.Ql +
1198separated set of the following keywords:
1199.Pp
1200.Bl -tag -width indent -compact
1201.\" XXX doc is unclear whether these are cycle counts or event counts
1202.It Li dimm-turnaround
1203Count DIMM turnarounds.
1204.It Li read-to-write-turnaround
1205Count read to write turnarounds.
1206.It Li write-to-read-turnaround
1207Count write to read turnarounds.
1208.El
1209.Pp
1210The default is to count all types of events.
1211.It Li k8-nb-ht-bus0-bandwidth Op Li ,mask= Ns Ar qualifier
1212.It Li k8-nb-ht-bus1-bandwidth Op Li ,mask= Ns Ar qualifier
1213.It Li k8-nb-ht-bus2-bandwidth Op Li ,mask= Ns Ar qualifier
1214Count events on the HyperTransport(tm) buses.
1215These events may be further qualified using
1216.Ar qualifier ,
1217which is a
1218.Ql +
1219separated set of the following keywords:
1220.Pp
1221.Bl -tag -width indent -compact
1222.It Li buffer-release
1223Count buffer release messages sent.
1224.It Li command
1225Count command messages sent.
1226.It Li data
1227Count data messages sent.
1228.It Li nop
1229Count nop messages sent.
1230.El
1231.Pp
1232The default is to count all types of messages.
1233.El
1234.Ss Intel P6 PMCs
1235Intel P6 PMCs are present in Intel
1236.Tn "Pentium Pro" ,
1237.Tn "Pentium II" ,
1238.Tn Celeron ,
1239.Tn "Pentium III"
1240and
1241.Tn "Pentium M"
1242processors.
1243.Pp
1244These CPUs have two counters.
1245Some events may only be used on specific counters and some events are
1246defined only on specific processor models.
1247.Pp
1248These PMCs are documented in
1249.Rs
1250.%B "IA-32 Intel(R) Architecture Software Developer's Manual"
1251.%T "Volume 3: System Programming Guide"
1252.%N "Order Number 245472-012"
1253.%D 2003
1254.%Q "Intel Corporation"
1255.Re
1256.Pp
1257Some of these events are affected by processor errata described in
1258.Rs
1259.%B "Intel(R) Pentium(R) III Processor Specification Update"
1260.%N "Document Number: 244453-054"
1261.%D "April 2005"
1262.%Q "Intel Corporation"
1263.Re
1264.Pp
1265Event specifiers for Intel P6 PMCs can have the following common
1266qualifiers:
1267.Bl -tag -width indent
1268.It Li cmask= Ns Ar value
1269Configure the PMC to increment only if the number of configured
1270events measured in a cycle is greater than or equal to
1271.Ar value .
1272.It Li edge
1273Configure the PMC to count the number of deasserted to asserted
1274transitions of the conditions expressed by the other qualifiers.
1275If specified, the counter will increment only once whenever a
1276condition becomes true, irrespective of the number of clocks during
1277which the condition remains true.
1278.It Li inv
1279Invert the sense of comparison when the
1280.Dq Li cmask
1281qualifier is present, making the counter increment when the number of
1282events per cycle is less than the value specified by the
1283.Dq Li cmask
1284qualifier.
1285.It Li os
1286Configure the PMC to count events happening at processor privilege
1287level 0.
1288.It Li umask= Ns Ar value
1289This qualifier is used to further qualify the event selected (see
1290below).
1291.It Li usr
1292Configure the PMC to count events occurring at privilege levels 1, 2
1293or 3.
1294.El
1295.Pp
1296If neither of the
1297.Dq Li os
1298or
1299.Dq Li usr
1300qualifiers are specified, the default is to enable both.
1301.Pp
1302The event specifiers supported by Intel P6 PMCs are:
1303.Bl -tag -width indent
1304.It Li p6-baclears
1305Count the number of times a static branch prediction was made by the
1306branch decoder because the BTB did not have a prediction.
1307.It Li p6-br-bac-missp-exec
1308.Pq Tn "Pentium M"
1309Count the number of branch instructions executed that were
1310mispredicted at the Front End (BAC).
1311.It Li p6-br-bogus
1312Count the number of bogus branches.
1313.It Li p6-br-call-exec
1314.Pq Tn "Pentium M"
1315Count the number of call instructions executed.
1316.It Li p6-br-call-missp-exec
1317.Pq Tn "Pentium M"
1318Count the number of call instructions executed that were mispredicted.
1319.It Li p6-br-cnd-exec
1320.Pq Tn "Pentium M"
1321Count the number of conditional branch instructions executed.
1322.It Li p6-br-cnd-missp-exec
1323.Pq Tn "Pentium M"
1324Count the number of conditional branch instructions executed that were
1325mispredicted.
1326.It Li p6-br-ind-call-exec
1327.Pq Tn "Pentium M"
1328Count the number of indirect call instructions executed.
1329.It Li p6-br-ind-exec
1330.Pq Tn "Pentium M"
1331Count the number of indirect branch instructions executed.
1332.It Li p6-br-ind-missp-exec
1333.Pq Tn "Pentium M"
1334Count the number of indirect branch instructions executed that were
1335mispredicted.
1336.It Li p6-br-inst-decoded
1337Count the number of branch instructions decoded.
1338.It Li p6-br-inst-exec
1339.Pq Tn "Pentium M"
1340Count the number of branch instructions executed but not necessarily retired.
1341.It Li p6-br-inst-retired
1342Count the number of branch instructions retired.
1343.It Li p6-br-miss-pred-retired
1344Count the number of mispredicted branch instructions retired.
1345.It Li p6-br-miss-pred-taken-ret
1346Count the number of taken mispredicted branches retired.
1347.It Li p6-br-missp-exec
1348.Pq Tn "Pentium M"
1349Count the number of branch instructions executed that were
1350mispredicted at execution.
1351.It Li p6-br-ret-bac-missp-exec
1352.Pq Tn "Pentium M"
1353Count the number of return instructions executed that were
1354mispredicted at the Front End (BAC).
1355.It Li p6-br-ret-exec
1356.Pq Tn "Pentium M"
1357Count the number of return instructions executed.
1358.It Li p6-br-ret-missp-exec
1359.Pq Tn "Pentium M"
1360Count the number of return instructions executed that were
1361mispredicted at execution.
1362.It Li p6-br-taken-retired
1363Count the number of taken branches retired.
1364.It Li p6-btb-misses
1365Count the number of branches for which the BTB did not produce a
1366prediction.
1367.It Li p6-bus-bnr-drv
1368Count the number of bus clock cycles during which this processor is
1369driving the BNR# pin.
1370.It Li p6-bus-data-rcv
1371Count the number of bus clock cycles during which this processor is
1372receiving data.
1373.It Li p6-bus-drdy-clocks Op Li ,umask= Ns Ar qualifier
1374Count the number of clocks during which DRDY# is asserted.
1375An additional qualifier may be specified, and comprises one of the
1376following keywords:
1377.Pp
1378.Bl -tag -width indent -compact
1379.It Li any
1380Count transactions generated by any agent on the bus.
1381.It Li self
1382Count transactions generated by this processor.
1383.El
1384.Pp
1385The default is to count operations generated by this processor.
1386.It Li p6-bus-hit-drv
1387Count the number of bus clock cycles during which this processor is
1388driving the HIT# pin.
1389.It Li p6-bus-hitm-drv
1390Count the number of bus clock cycles during which this processor is
1391driving the HITM# pin.
1392.It Li p6-bus-lock-clocks Op Li ,umask= Ns Ar qualifier
1393Count the number of clocks during which LOCK# is asserted on the
1394external system bus.
1395An additional qualifier may be specified and comprises one of the following
1396keywords:
1397.Pp
1398.Bl -tag -width indent -compact
1399.It Li any
1400Count transactions generated by any agent on the bus.
1401.It Li self
1402Count transactions generated by this processor.
1403.El
1404.Pp
1405The default is to count operations generated by this processor.
1406.It Li p6-bus-req-outstanding
1407Count the number of bus requests outstanding in any given cycle.
1408.It Li p6-bus-snoop-stall
1409Count the number of clock cycles during which the bus is snoop stalled.
1410.It Li p6-bus-tran-any Op Li ,umask= Ns Ar qualifier
1411Count the number of completed bus transactions of any kind.
1412An additional qualifier may be specified and comprises one of the following
1413keywords:
1414.Pp
1415.Bl -tag -width indent -compact
1416.It Li any
1417Count transactions generated by any agent on the bus.
1418.It Li self
1419Count transactions generated by this processor.
1420.El
1421.Pp
1422The default is to count operations generated by this processor.
1423.It Li p6-bus-tran-brd Op Li ,umask= Ns Ar qualifier
1424Count the number of burst read transactions.
1425An additional qualifier may be specified and comprises one of the following
1426keywords:
1427.Pp
1428.Bl -tag -width indent -compact
1429.It Li any
1430Count transactions generated by any agent on the bus.
1431.It Li self
1432Count transactions generated by this processor.
1433.El
1434.Pp
1435The default is to count operations generated by this processor.
1436.It Li p6-bus-tran-burst Op Li ,umask= Ns Ar qualifier
1437Count the number of completed burst transactions.
1438An additional qualifier may be specified and comprises one of the following
1439keywords:
1440.Pp
1441.Bl -tag -width indent -compact
1442.It Li any
1443Count transactions generated by any agent on the bus.
1444.It Li self
1445Count transactions generated by this processor.
1446.El
1447.Pp
1448The default is to count operations generated by this processor.
1449.It Li p6-bus-tran-def Op Li ,umask= Ns Ar qualifier
1450Count the number of completed deferred transactions.
1451An additional qualifier may be specified and comprises one of the following
1452keywords:
1453.Pp
1454.Bl -tag -width indent -compact
1455.It Li any
1456Count transactions generated by any agent on the bus.
1457.It Li self
1458Count transactions generated by this processor.
1459.El
1460.Pp
1461The default is to count operations generated by this processor.
1462.It Li p6-bus-tran-ifetch Op Li ,umask= Ns Ar qualifier
1463Count the number of completed instruction fetch transactions.
1464An additional qualifier may be specified and comprises one of the following
1465keywords:
1466.Pp
1467.Bl -tag -width indent -compact
1468.It Li any
1469Count transactions generated by any agent on the bus.
1470.It Li self
1471Count transactions generated by this processor.
1472.El
1473.Pp
1474The default is to count operations generated by this processor.
1475.It Li p6-bus-tran-inval Op Li ,umask= Ns Ar qualifier
1476Count the number of completed invalidate transactions.
1477An additional qualifier may be specified and comprises one of the following
1478keywords:
1479.Pp
1480.Bl -tag -width indent -compact
1481.It Li any
1482Count transactions generated by any agent on the bus.
1483.It Li self
1484Count transactions generated by this processor.
1485.El
1486.Pp
1487The default is to count operations generated by this processor.
1488.It Li p6-bus-tran-mem Op Li ,umask= Ns Ar qualifier
1489Count the number of completed memory transactions.
1490An additional qualifier may be specified and comprises one of the following
1491keywords:
1492.Pp
1493.Bl -tag -width indent -compact
1494.It Li any
1495Count transactions generated by any agent on the bus.
1496.It Li self
1497Count transactions generated by this processor.
1498.El
1499.Pp
1500The default is to count operations generated by this processor.
1501.It Li p6-bus-tran-pwr Op Li ,umask= Ns Ar qualifier
1502Count the number of completed partial write transactions.
1503An additional qualifier may be specified and comprises one of the following
1504keywords:
1505.Pp
1506.Bl -tag -width indent -compact
1507.It Li any
1508Count transactions generated by any agent on the bus.
1509.It Li self
1510Count transactions generated by this processor.
1511.El
1512.Pp
1513The default is to count operations generated by this processor.
1514.It Li p6-bus-tran-rfo Op Li ,umask= Ns Ar qualifier
1515Count the number of completed read-for-ownership transactions.
1516An additional qualifier may be specified and comprises one of the following
1517keywords:
1518.Pp
1519.Bl -tag -width indent -compact
1520.It Li any
1521Count transactions generated by any agent on the bus.
1522.It Li self
1523Count transactions generated by this processor.
1524.El
1525.Pp
1526The default is to count operations generated by this processor.
1527.It Li p6-bus-trans-io Op Li ,umask= Ns Ar qualifier
1528Count the number of completed I/O transactions.
1529An additional qualifier may be specified and comprises one of the following
1530keywords:
1531.Pp
1532.Bl -tag -width indent -compact
1533.It Li any
1534Count transactions generated by any agent on the bus.
1535.It Li self
1536Count transactions generated by this processor.
1537.El
1538.Pp
1539The default is to count operations generated by this processor.
1540.It Li p6-bus-trans-p Op Li ,umask= Ns Ar qualifier
1541Count the number of completed partial transactions.
1542An additional qualifier may be specified and comprises one of the following
1543keywords:
1544.Pp
1545.Bl -tag -width indent -compact
1546.It Li any
1547Count transactions generated by any agent on the bus.
1548.It Li self
1549Count transactions generated by this processor.
1550.El
1551.Pp
1552The default is to count operations generated by this processor.
1553.It Li p6-bus-trans-wb Op Li ,umask= Ns Ar qualifier
1554Count the number of completed write-back transactions.
1555An additional qualifier may be specified and comprises one of the following
1556keywords:
1557.Pp
1558.Bl -tag -width indent -compact
1559.It Li any
1560Count transactions generated by any agent on the bus.
1561.It Li self
1562Count transactions generated by this processor.
1563.El
1564.Pp
1565The default is to count operations generated by this processor.
1566.It Li p6-cpu-clk-unhalted
1567Count the number of cycles during which the processor was not halted.
1568.Pp
1569.Pq Tn "Pentium M"
1570Count the number of cycles during which the processor was not halted
1571and not in a thermal trip.
1572.It Li p6-cycles-div-busy
1573Count the number of cycles during which the divider is busy and cannot
1574accept new divides.
1575This event is only allocated on counter 0.
1576.It Li p6-cycles-int-pending-and-masked
1577Count the number of processor cycles for which interrupts were
1578disabled and interrupts were pending.
1579.It Li p6-cycles-int-masked
1580Count the number of processor cycles for which interrupts were
1581disabled.
1582.It Li p6-data-mem-refs
1583Count all loads and all stores using any memory type, including
1584internal retries.
1585Each part of a split store is counted separately.
1586.It Li p6-dcu-lines-in
1587Count the total lines allocated in the data cache unit.
1588.It Li p6-dcu-m-lines-in
1589Count the number of M state lines allocated in the data cache unit.
1590.It Li p6-dcu-m-lines-out
1591Count the number of M state lines evicted from the data cache unit.
1592.It Li p6-dcu-miss-outstanding
1593Count the weighted number of cycles while a data cache unit miss is
1594outstanding, incremented by the number of outstanding cache misses at
1595any time.
1596.It Li p6-div
1597Count the number of divides.
1598This event is only allocated on counter 1.
1599.It Li p6-emon-esp-uops
1600.Pq Tn "Pentium M"
1601Count the total number of micro-ops.
1602.It Li p6-emon-est-trans Op Li ,umask= Ns Ar qualifier
1603.Pq Tn "Pentium M"
1604Count the number of
1605.Tn "Enhanced Intel SpeedStep"
1606transitions.
1607An additional qualifier may be specified, and can be one of the
1608following keywords:
1609.Pp
1610.Bl -tag -width indent -compact
1611.It Li all
1612Count all transitions.
1613.It Li freq
1614Count only frequency transitions.
1615.El
1616.Pp
1617The default is to count all transitions.
1618.It Li p6-emon-fused-uops-ret Op Li ,umask= Ns Ar qualifier
1619.Pq Tn "Pentium M"
1620Count the number of retired fused micro-ops.
1621An additional qualifier may be specified, and may be one of the
1622following keywords:
1623.Pp
1624.Bl -tag -width indent -compact
1625.It Li all
1626Count all fused micro-ops.
1627.It Li loadop
1628Count only load and op micro-ops.
1629.It Li stdsta
1630Count only STD/STA micro-ops.
1631.El
1632.Pp
1633The default is to count all fused micro-ops.
1634.It Li p6-emon-kni-comp-inst-ret Op Li ,umask= Ns Ar qualifier
1635.Pq Tn "Pentium III"
1636Count the number of SSE computational instructions retired.
1637An additional qualifier may be specified, and comprises one of the
1638following keywords:
1639.Pp
1640.Bl -tag -width indent -compact
1641.It Li packed-and-scalar
1642Count packed and scalar operations.
1643.It Li scalar
1644Count scalar operations only.
1645.El
1646.Pp
1647The default is to count packed and scalar operations.
1648.It Li p6-emon-kni-inst-retired Op Li ,umask= Ns Ar qualifier
1649.Pq Tn "Pentium III"
1650Count the number of SSE instructions retired.
1651An additional qualifier may be specified, and comprises one of the
1652following keywords:
1653.Pp
1654.Bl -tag -width indent -compact
1655.It Li packed-and-scalar
1656Count packed and scalar operations.
1657.It Li scalar
1658Count scalar operations only.
1659.El
1660.Pp
1661The default is to count packed and scalar operations.
1662.It Li p6-emon-kni-pref-dispatched Op Li ,umask= Ns Ar qualifier
1663.Pq Tn "Pentium III"
1664Count the number of SSE prefetch or weakly ordered instructions
1665dispatched (including speculative prefetches).
1666An additional qualifier may be specified, and comprises one of the
1667following keywords:
1668.Pp
1669.Bl -tag -width indent -compact
1670.It Li nta
1671Count non-temporal prefetches.
1672.It Li t1
1673Count prefetches to L1.
1674.It Li t2
1675Count prefetches to L2.
1676.It Li wos
1677Count weakly ordered stores.
1678.El
1679.Pp
1680The default is to count non-temporal prefetches.
1681.It Li p6-emon-kni-pref-miss Op Li ,umask= Ns Ar qualifier
1682.Pq Tn "Pentium III"
1683Count the number of prefetch or weakly ordered instructions that miss
1684all caches.
1685An additional qualifier may be specified, and comprises one of the
1686following keywords:
1687.Pp
1688.Bl -tag -width indent -compact
1689.It Li nta
1690Count non-temporal prefetches.
1691.It Li t1
1692Count prefetches to L1.
1693.It Li t2
1694Count prefetches to L2.
1695.It Li wos
1696Count weakly ordered stores.
1697.El
1698.Pp
1699The default is to count non-temporal prefetches.
1700.It Li p6-emon-pref-rqsts-dn
1701.Pq Tn "Pentium M"
1702Count the number of downward prefetches issued.
1703.It Li p6-emon-pref-rqsts-up
1704.Pq Tn "Pentium M"
1705Count the number of upward prefetches issued.
1706.It Li p6-emon-simd-instr-retired
1707.Pq Tn "Pentium M"
1708Count the number of retired
1709.Tn MMX
1710instructions.
1711.It Li p6-emon-sse-sse2-comp-inst-retired Op Li ,umask= Ns Ar qualifier
1712.Pq Tn "Pentium M"
1713Count the number of computational SSE instructions retired.
1714An additional qualifier may be specified and can be one of the
1715following keywords:
1716.Pp
1717.Bl -tag -width indent -compact
1718.It Li sse-packed-single
1719Count SSE packed-single instructions.
1720.It Li sse-scalar-single
1721Count SSE scalar-single instructions.
1722.It Li sse2-packed-double
1723Count SSE2 packed-double instructions.
1724.It Li sse2-scalar-double
1725Count SSE2 scalar-double instructions.
1726.El
1727.Pp
1728The default is to count SSE packed-single instructions.
1729.It Li p6-emon-sse-sse2-inst-retired Op Li ,umask= Ns Ar qualifier
1730.Pp
1731.Pq Tn "Pentium M"
1732Count the number of SSE instructions retired.
1733An additional qualifier can be specified, and can be one of the
1734following keywords:
1735.Pp
1736.Bl -tag -width indent -compact
1737.It Li sse-packed-single
1738Count SSE packed-single instructions.
1739.It Li sse-packed-single-scalar-single
1740Count SSE packed-single and scalar-single instructions.
1741.It Li sse2-packed-double
1742Count SSE2 packed-double instructions.
1743.It Li sse2-scalar-double
1744Count SSE2 scalar-double instructions.
1745.El
1746.Pp
1747The default is to count SSE packed-single instructions.
1748.It Li p6-emon-synch-uops
1749.Pq Tn "Pentium M"
1750Count the number of sync micro-ops.
1751.It Li p6-emon-thermal-trip
1752.Pq Tn "Pentium M"
1753Count the duration or occurrences of thermal trips.
1754Use the
1755.Dq Li edge
1756qualifier to count occurrences of thermal trips.
1757.It Li p6-emon-unfusion
1758.Pq Tn "Pentium M"
1759Count the number of unfusion events in the reorder buffer.
1760.It Li p6-flops
1761Count the number of computational floating point operations retired.
1762This event is only allocated on counter 0.
1763.It Li p6-fp-assist
1764Count the number of floating point exceptions handled by microcode.
1765This event is only allocated on counter 1.
1766.It Li p6-fp-comps-ops-exe
1767Count the number of computational floating point operations executed.
1768This event is only allocated on counter 0.
1769.It Li p6-fp-mmx-trans Op Li ,umask= Ns Ar qualifier
1770.Pq Tn "Pentium II" , Tn "Pentium III"
1771Count the number of transitions between MMX and floating-point
1772instructions.
1773An additional qualifier may be specified, and comprises one of the
1774following keywords:
1775.Pp
1776.Bl -tag -width indent -compact
1777.It Li mmxtofp
1778Count transitions from MMX instructions to floating-point instructions.
1779.It Li fptommx
1780Count transitions from floating-point instructions to MMX instructions.
1781.El
1782.Pp
1783The default is to count MMX to floating-point transitions.
1784.It Li p6-hw-int-rx
1785Count the number of hardware interrupts received.
1786.It Li p6-ifu-fetch
1787Count the number of instruction fetches, both cacheable and non-cacheable.
1788.It Li p6-ifu-fetch-miss
1789Count the number of instruction fetch misses (i.e., those that produce
1790memory accesses).
1791.It Li p6-ifu-mem-stall
1792Count the number of cycles instruction fetch is stalled for any reason.
1793.It Li p6-ild-stall
1794Count the number of cycles the instruction length decoder is stalled.
1795.It Li p6-inst-decoded
1796Count the number of instructions decoded.
1797.It Li p6-inst-retired
1798Count the number of instructions retired.
1799.It Li p6-itlb-miss
1800Count the number of instruction TLB misses.
1801.It Li p6-l2-ads
1802Count the number of L2 address strobes.
1803.It Li p6-l2-dbus-busy
1804Count the number of cycles during which the L2 cache data bus was busy.
1805.It Li p6-l2-dbus-busy-rd
1806Count the number of cycles during which the L2 cache data bus was busy
1807transferring read data from L2 to the processor.
1808.It Li p6-l2-ifetch Op Li ,umask= Ns Ar qualifier
1809Count the number of L2 instruction fetches.
1810An additional qualifier may be specified and comprises a list of the following
1811keywords separated by
1812.Ql +
1813characters:
1814.Pp
1815.Bl -tag -width indent -compact
1816.It Li e
1817Count operations affecting E (exclusive) state lines.
1818.It Li i
1819Count operations affecting I (invalid) state lines.
1820.It Li m
1821Count operations affecting M (modified) state lines.
1822.It Li s
1823Count operations affecting S (shared) state lines.
1824.El
1825.Pp
1826The default is to count operations affecting all (MESI) state lines.
1827.It Li p6-l2-ld Op Li ,umask= Ns Ar qualifier
1828Count the number of L2 data loads.
1829An additional qualifier may be specified and comprises a list of the following
1830keywords separated by
1831.Ql +
1832characters:
1833.Pp
1834.Bl -tag -width indent -compact
1835.It Li both
1836.Pq Tn "Pentium M"
1837Count both hardware-prefetched lines and non-hardware-prefetched lines.
1838.It Li e
1839Count operations affecting E (exclusive) state lines.
1840.It Li hw
1841.Pq Tn "Pentium M"
1842Count hardware-prefetched lines only.
1843.It Li i
1844Count operations affecting I (invalid) state lines.
1845.It Li m
1846Count operations affecting M (modified) state lines.
1847.It Li nonhw
1848.Pq Tn "Pentium M"
1849Exclude hardware-prefetched lines.
1850.It Li s
1851Count operations affecting S (shared) state lines.
1852.El
1853.Pp
1854The default on processors other than
1855.Tn "Pentium M"
1856processors is to count operations affecting all (MESI) state lines.
1857The default on
1858.Tn "Pentium M"
1859processors is to count both hardware-prefetched and
1860non-hardware-prefetch operations on all (MESI) state lines.
1861.Pq Errata
1862This event is affected by processor errata E53.
1863.It Li p6-l2-lines-in Op Li ,umask= Ns Ar qualifier
1864Count the number of L2 lines allocated.
1865An additional qualifier may be specified and comprises a list of the following
1866keywords separated by
1867.Ql +
1868characters:
1869.Pp
1870.Bl -tag -width indent -compact
1871.It Li both
1872.Pq Tn "Pentium M"
1873Count both hardware-prefetched lines and non-hardware-prefetched lines.
1874.It Li e
1875Count operations affecting E (exclusive) state lines.
1876.It Li hw
1877.Pq Tn "Pentium M"
1878Count hardware-prefetched lines only.
1879.It Li i
1880Count operations affecting I (invalid) state lines.
1881.It Li m
1882Count operations affecting M (modified) state lines.
1883.It Li nonhw
1884.Pq Tn "Pentium M"
1885Exclude hardware-prefetched lines.
1886.It Li s
1887Count operations affecting S (shared) state lines.
1888.El
1889.Pp
1890The default on processors other than
1891.Tn "Pentium M"
1892processors is to count operations affecting all (MESI) state lines.
1893The default on
1894.Tn "Pentium M"
1895processors is to count both hardware-prefetched and
1896non-hardware-prefetch operations on all (MESI) state lines.
1897.Pq Errata
1898This event is affected by processor errata E45.
1899.It Li p6-l2-lines-out Op Li ,umask= Ns Ar qualifier
1900Count the number of L2 lines evicted.
1901An additional qualifier may be specified and comprises a list of the following
1902keywords separated by
1903.Ql +
1904characters:
1905.Pp
1906.Bl -tag -width indent -compact
1907.It Li both
1908.Pq Tn "Pentium M"
1909Count both hardware-prefetched lines and non-hardware-prefetched lines.
1910.It Li e
1911Count operations affecting E (exclusive) state lines.
1912.It Li hw
1913.Pq Tn "Pentium M"
1914Count hardware-prefetched lines only.
1915.It Li i
1916Count operations affecting I (invalid) state lines.
1917.It Li m
1918Count operations affecting M (modified) state lines.
1919.It Li nonhw
1920.Pq Tn "Pentium M"
1921Exclude hardware-prefetched lines.
1922.It Li s
1923Count operations affecting S (shared) state lines.
1924.El
1925.Pp
1926The default on processors other than
1927.Tn "Pentium M"
1928processors is to count operations affecting all (MESI) state lines.
1929The default on
1930.Tn "Pentium M"
1931processors is to count both hardware-prefetched and
1932non-hardware-prefetch operations on all (MESI) state lines.
1933.Pq Errata
1934This event is affected by processor errata E45.
1935.It Li p6-l2-m-lines-inm
1936Count the number of modified lines allocated in L2 cache.
1937.It Li p6-l2-m-lines-outm Op Li ,umask= Ns Ar qualifier
1938Count the number of L2 M-state lines evicted.
1939.Pp
1940.Pq Tn "Pentium M"
1941On these processors an additional qualifier may be specified and
1942comprises a list of the following keywords separated by
1943.Ql +
1944characters:
1945.Pp
1946.Bl -tag -width indent -compact
1947.It Li both
1948Count both hardware-prefetched lines and non-hardware-prefetched lines.
1949.It Li hw
1950Count hardware-prefetched lines only.
1951.It Li nonhw
1952Exclude hardware-prefetched lines.
1953.El
1954.Pp
1955The default is to count both hardware-prefetched and
1956non-hardware-prefetch operations.
1957.Pq Errata
1958This event is affected by processor errata E53.
1959.It Li p6-l2-rqsts Op Li ,umask= Ns Ar qualifier
1960Count the total number of L2 requests.
1961An additional qualifier may be specified and comprises a list of the following
1962keywords separated by
1963.Ql +
1964characters:
1965.Pp
1966.Bl -tag -width indent -compact
1967.It Li e
1968Count operations affecting E (exclusive) state lines.
1969.It Li i
1970Count operations affecting I (invalid) state lines.
1971.It Li m
1972Count operations affecting M (modified) state lines.
1973.It Li s
1974Count operations affecting S (shared) state lines.
1975.El
1976.Pp
1977The default is to count operations affecting all (MESI) state lines.
1978.It Li p6-l2-st Op Li ,umask= Ns Ar qualifier
1979Count the number of L2 data stores.
1980An additional qualifier may be specified and comprises a list of the following
1981keywords separated by
1982.Ql +
1983characters:
1984.Pp
1985.Bl -tag -width indent -compact
1986.It Li e
1987Count operations affecting E (exclusive) state lines.
1988.It Li i
1989Count operations affecting I (invalid) state lines.
1990.It Li m
1991Count operations affecting M (modified) state lines.
1992.It Li s
1993Count operations affecting S (shared) state lines.
1994.El
1995.Pp
1996The default is to count operations affecting all (MESI) state lines.
1997.It Li p6-ld-blocks
1998Count the number of load operations delayed due to store buffer blocks.
1999.It Li p6-misalign-mem-ref
2000Count the number of misaligned data memory references (crossing a 64
2001bit boundary).
2002.It Li p6-mmx-assist
2003.Pq Tn "Pentium II" , Tn "Pentium III"
2004Count the number of MMX assists executed.
2005.It Li p6-mmx-instr-exec
2006.Pq Tn Celeron , Tn "Pentium II"
2007Count the number of MMX instructions executed, except MOVQ and MOVD
2008stores from register to memory.
2009.It Li p6-mmx-instr-ret
2010.Pq Tn "Pentium II"
2011Count the number of MMX instructions retired.
2012.It Li p6-mmx-instr-type-exec Op Li ,umask= Ns Ar qualifier
2013.Pq Tn "Pentium II" , Tn "Pentium III"
2014Count the number of MMX instructions executed.
2015An additional qualifier may be specified and comprises a list of
2016the following keywords separated by
2017.Ql +
2018characters:
2019.Pp
2020.Bl -tag -width indent -compact
2021.It Li pack
2022Count MMX pack operation instructions.
2023.It Li packed-arithmetic
2024Count MMX packed arithmetic instructions.
2025.It Li packed-logical
2026Count MMX packed logical instructions.
2027.It Li packed-multiply
2028Count MMX packed multiply instructions.
2029.It Li packed-shift
2030Count MMX packed shift instructions.
2031.It Li unpack
2032Count MMX unpack operation instructions.
2033.El
2034.Pp
2035The default is to count all operations.
2036.It Li p6-mmx-sat-instr-exec
2037.Pq Tn "Pentium II" , Tn "Pentium III"
2038Count the number of MMX saturating instructions executed.
2039.It Li p6-mmx-uops-exec
2040.Pq Tn "Pentium II" , Tn "Pentium III"
2041Count the number of MMX micro-ops executed.
2042.It Li p6-mul
2043Count the number of floating point multiplies.
2044This event is only allocated on counter 1.
2045.It Li p6-partial-rat-stalls
2046Count the number of cycles or events for partial stalls.
2047.It Li p6-resource-stalls
2048Count the number of cycles there was a resource related stall of any kind.
2049.It Li p6-ret-seg-renames
2050.Pq Tn "Pentium II" , Tn "Pentium III"
2051Count the number of segment register rename events retired.
2052.It Li p6-sb-drains
2053Count the number of cycles the store buffer is draining.
2054.It Li p6-seg-reg-renames Op Li ,umask= Ns Ar qualifier
2055.Pq Tn "Pentium II" , Tn "Pentium III"
2056Count the number of segment register renames.
2057An additional qualifier may be specified, and comprises a list of the
2058following keywords separated by
2059.Ql +
2060characters:
2061.Pp
2062.Bl -tag -width indent -compact
2063.It Li ds
2064Count renames for segment register DS.
2065.It Li es
2066Count renames for segment register ES.
2067.It Li fs
2068Count renames for segment register FS.
2069.It Li gs
2070Count renames for segment register GS.
2071.El
2072.Pp
2073The default is to count operations affecting all segment registers.
2074.It Li p6-seg-rename-stalls Op Li ,umask= Ns Ar qualifier
2075.Pq Tn "Pentium II" , Tn "Pentium III"
2076Count the number of segment register renaming stalls.
2077An additional qualifier may be specified, and comprises a list of the
2078following keywords separated by
2079.Ql +
2080characters:
2081.Pp
2082.Bl -tag -width indent -compact
2083.It Li ds
2084Count stalls for segment register DS.
2085.It Li es
2086Count stalls for segment register ES.
2087.It Li fs
2088Count stalls for segment register FS.
2089.It Li gs
2090Count stalls for segment register GS.
2091.El
2092.Pp
2093The default is to count operations affecting all the segment registers.
2094.It Li p6-segment-reg-loads
2095Count the number of segment register loads.
2096.It Li p6-uops-retired
2097Count the number of micro-ops retired.
2098.El
2099.Ss Intel P4 PMCS
2100Intel P4 PMCs are present in Intel
2101.Tn "Pentium 4"
2102and
2103.Tn Xeon
2104processors.
2105These PMCs are documented in
2106.Rs
2107.%B "IA-32 Intel(R) Architecture Software Developer's Manual"
2108.%T "Volume 3: System Programming Guide"
2109.%N "Order Number 245472-012"
2110.%D 2003
2111.%Q "Intel Corporation"
2112.Re
2113Further information about using these PMCs may be found in
2114.Rs
2115.%B "IA-32 Intel(R) Architecture Optimization Guide"
2116.%D 2003
2117.%N "Order Number 248966-009"
2118.%Q "Intel Corporation"
2119.Re
2120Some of these events are affected by processor errata described in
2121.Rs
2122.%B "Intel(R) Pentium(R) 4 Processor Specification Update"
2123.%N "Document Number: 249199-059"
2124.%D "April 2005"
2125.%Q "Intel Corporation"
2126.Re
2127.Pp
2128Event specifiers for Intel P4 PMCs can have the following common
2129qualifiers:
2130.Bl -tag -width indent
2131.It Li active= Ns Ar choice
2132(On P4 HTT CPUs) Filter event counting based on which logical
2133processors are active.
2134The allowed values of
2135.Ar choice
2136are:
2137.Pp
2138.Bl -tag -width indent -compact
2139.It Li any
2140Count when either logical processor is active.
2141.It Li both
2142Count when both logical processors are active.
2143.It Li none
2144Count only when neither logical processor is active.
2145.It Li single
2146Count only when one logical processor is active.
2147.El
2148.Pp
2149The default is
2150.Dq Li both .
2151.It Li cascade
2152Configure the PMC to cascade onto its partner.
2153See
2154.Sx "Cascading P4 PMCs"
2155below for more information.
2156.It Li edge
2157Configure the counter to count false to true transitions of the threshold
2158comparison output.
2159This qualifier only takes effect if a threshold qualifier has also been
2160specified.
2161.It Li complement
2162Configure the counter to increment only when the event count seen is
2163less than the threshold qualifier value specified.
2164.It Li mask= Ns Ar qualifier
2165Many event specifiers for Intel P4 PMCs need to be additionally
2166qualified using a mask qualifier.
2167The allowed syntax for these qualifiers is event specific and is
2168described along with the events.
2169.It Li os
2170Configure the PMC to count when the CPL of the processor is 0.
2171.It Li precise
2172Select precise event based sampling.
2173Precise sampling is supported by the hardware for a limited set of
2174events.
2175.It Li tag= Ns Ar value
2176Configure the PMC to tag the internal uop selected by the other
2177fields in this event specifier with value
2178.Ar value .
2179This feature is used when cascading PMCs.
2180.It Li threshold= Ns Ar value
2181Configure the PMC to increment only when the event counts seen are
2182greater than the specified threshold value
2183.Ar value .
2184.It Li usr
2185Configure the PMC to count when the CPL of the processor is 1, 2 or 3.
2186.El
2187.Pp
2188If neither of the
2189.Dq Li os
2190or
2191.Dq Li usr
2192qualifiers are specified, the default is to enable both.
2193.Pp
2194On Intel Pentium 4 processors with HTT, events are
2195divided into two classes:
2196.Pp
2197.Bl -tag -width indent -compact
2198.It "TS Events"
2199are those where hardware can differentiate events
2200generated on one logical processor from those generated on the
2201other.
2202.It "TI Events"
2203are those where hardware cannot differentiate between events
2204generated by multiple logical processors in a package.
2205.El
2206.Pp
2207Only TS events are allowed for use with process-mode PMCs on
2208Pentium-4/HTT CPUs.
2209.Pp
2210The event specifiers supported by Intel P4 PMCs are:
2211.Pp
2212.Bl -tag -width indent
2213.It Li p4-128bit-mmx-uop Op Li ,mask= Ns Ar flags
2214.Pq "TI event"
2215Count integer SIMD SSE2 instructions that operate on 128 bit SIMD
2216operands.
2217Qualifier
2218.Ar flags
2219can take the following value (which is also the default):
2220.Pp
2221.Bl -tag -width indent -compact
2222.It Li all
2223Count all uops operating on 128 bit SIMD integer operands in memory or
2224XMM register.
2225.El
2226.Pp
2227If an instruction contains more than one 128 bit MMX uop, then each
2228uop will be counted.
2229.It Li p4-64bit-mmx-uop Op Li ,mask= Ns Ar flags
2230.Pq "TI event"
2231Count MMX instructions that operate on 64 bit SIMD operands.
2232Qualifier
2233.Ar flags
2234can take the following value (which is also the default):
2235.Pp
2236.Bl -tag -width indent -compact
2237.It Li all
2238Count all uops operating on 64 bit SIMD integer operands in memory or
2239in MMX registers.
2240.El
2241.Pp
2242If an instruction contains more than one 64 bit MMX uop, then each
2243uop will be counted.
2244.It Li p4-b2b-cycles
2245.Pq "TI event"
2246Count back-to-back bus cycles.
2247Further documentation for this event is unavailable.
2248.It Li p4-bnr
2249.Pq "TI event"
2250Count bus-not-ready conditions.
2251Further documentation for this event is unavailable.
2252.It Li p4-bpu-fetch-request Op Li ,mask= Ns Ar qualifier
2253.Pq "TS event"
2254Count instruction fetch requests qualified by additional
2255flags specified in
2256.Ar qualifier .
2257At this point only one flag is supported:
2258.Pp
2259.Bl -tag -width indent -compact
2260.It Li tcmiss
2261Count trace cache lookup misses.
2262.El
2263.Pp
2264The default qualifier is also
2265.Dq Li mask=tcmiss .
2266.It Li p4-branch-retired Op Li ,mask= Ns Ar flags
2267.Pq "TS event"
2268Count retired branches.
2269Qualifier
2270.Ar flags
2271is a list of the following
2272.Ql +
2273separated strings:
2274.Pp
2275.Bl -tag -width indent -compact
2276.It Li mmnp
2277Count branches not-taken and predicted.
2278.It Li mmnm
2279Count branches not-taken and mis-predicted.
2280.It Li mmtp
2281Count branches taken and predicted.
2282.It Li mmtm
2283Count branches taken and mis-predicted.
2284.El
2285.Pp
2286The default qualifier counts all four kinds of branches.
2287.It Li p4-bsq-active-entries Op Li ,mask= Ns Ar qualifier
2288.Pq "TS event"
2289Count the number of entries (clipped at 15) currently active in the
2290BSQ.
2291Qualifier
2292.Ar qualifier
2293is a
2294.Ql +
2295separated set of the following flags:
2296.Pp
2297.Bl -tag -width indent -compact
2298.It Li req-type0 , Li req-type1
2299Forms a 2-bit number used to select the request type encoding:
2300.Pp
2301.Bl -tag -width indent -compact
2302.It Li 0
2303reads excluding read invalidate
2304.It Li 1
2305read invalidates
2306.It Li 2
2307writes other than writebacks
2308.It Li 3
2309writebacks
2310.El
2311.Pp
2312Bit
2313.Dq Li req-type1
2314is the MSB for this two bit number.
2315.It Li req-len0 , Li req-len1
2316Forms a two-bit number that specifies the request length encoding:
2317.Pp
2318.Bl -tag -width indent -compact
2319.It Li 0
23200 chunks
2321.It Li 1
23221 chunk
2323.It Li 3
23248 chunks
2325.El
2326.Pp
2327Bit
2328.Dq Li req-len1
2329is the MSB for this two bit number.
2330.It Li req-io-type
2331Count requests that are input or output requests.
2332.It Li req-lock-type
2333Count requests that lock the bus.
2334.It Li req-lock-cache
2335Count requests that lock the cache.
2336.It Li req-split-type
2337Count requests that are bus 8-byte chunks split across an
23388-byte boundary.
2339.It Li req-dem-type
2340Count requests that are demand (not prefetches) if set.
2341Count requests that are prefetches if not set.
2342.It Li req-ord-type
2343Count requests that are ordered.
2344.It Li mem-type0 , Li mem-type1 , Li mem-type2
2345Forms a 3-bit number that specifies a memory type encoding:
2346.Pp
2347.Bl -tag -width indent -compact
2348.It Li 0
2349UC
2350.It Li 1
2351USWC
2352.It Li 4
2353WT
2354.It Li 5
2355WP
2356.It Li 6
2357WB
2358.El
2359.Pp
2360Bit
2361.Dq Li mem-type2
2362is the MSB of this 3-bit number.
2363.El
2364.Pp
2365The default qualifier has all the above bits set.
2366.Pp
2367Edge triggering using the
2368.Dq Li edge
2369qualifier should not be used with this event when counting cycles.
2370.It Li p4-bsq-allocation Op Li ,mask= Ns Ar qualifier
2371.Pq "TS event"
2372Count allocations in the bus sequence unit according to the flags
2373specified in
2374.Ar qualifier ,
2375which is a
2376.Ql +
2377separated set of the following flags:
2378.Pp
2379.Bl -tag -width indent -compact
2380.It Li req-type0 , Li req-type1
2381Forms a 2-bit number used to select the request type encoding:
2382.Pp
2383.Bl -tag -width indent -compact
2384.It Li 0
2385reads excluding read invalidate
2386.It Li 1
2387read invalidates
2388.It Li 2
2389writes other than writebacks
2390.It Li 3
2391writebacks
2392.El
2393.Pp
2394Bit
2395.Dq Li req-type1
2396is the MSB for this two bit number.
2397.It Li req-len0 , Li req-len1
2398Forms a two-bit number that specifies the request length encoding:
2399.Pp
2400.Bl -tag -width indent -compact
2401.It Li 0
24020 chunks
2403.It Li 1
24041 chunk
2405.It Li 3
24068 chunks
2407.El
2408.Pp
2409Bit
2410.Dq Li req-len1
2411is the MSB for this two bit number.
2412.It Li req-io-type
2413Count requests that are input or output requests.
2414.It Li req-lock-type
2415Count requests that lock the bus.
2416.It Li req-lock-cache
2417Count requests that lock the cache.
2418.It Li req-split-type
2419Count requests that are bus 8-byte chunks split across an
24208-byte boundary.
2421.It Li req-dem-type
2422Count requests that are demand (not prefetches) if set.
2423Count requests that are prefetches if not set.
2424.It Li req-ord-type
2425Count requests that are ordered.
2426.It Li mem-type0 , Li mem-type1 , Li mem-type2
2427Forms a 3-bit number that specifies a memory type encoding:
2428.Pp
2429.Bl -tag -width indent -compact
2430.It Li 0
2431UC
2432.It Li 1
2433USWC
2434.It Li 4
2435WT
2436.It Li 5
2437WP
2438.It Li 6
2439WB
2440.El
2441.Pp
2442Bit
2443.Dq Li mem-type2
2444is the MSB of this 3-bit number.
2445.El
2446.Pp
2447The default qualifier has all the above bits set.
2448.Pp
2449This event is usually used along with the
2450.Dq Li edge
2451qualifier to avoid multiple counting.
2452.It Li p4-bsq-cache-reference Op Li ,mask= Ns Ar qualifier
2453.Pq "TS event"
2454Count cache references as seen by the bus unit (2nd or 3rd level
2455cache references).
2456Qualifier
2457.Ar qualifier
2458is a
2459.Ql +
2460separated list of the following keywords:
2461.Pp
2462.Bl -tag -width indent -compact
2463.It Li rd-2ndl-hits
2464Count 2nd level cache hits in the shared state.
2465.It Li rd-2ndl-hite
2466Count 2nd level cache hits in the exclusive state.
2467.It Li rd-2ndl-hitm
2468Count 2nd level cache hits in the modified state.
2469.It Li rd-3rdl-hits
2470Count 3rd level cache hits in the shared state.
2471.It Li rd-3rdl-hite
2472Count 3rd level cache hits in the exclusive state.
2473.It Li rd-3rdl-hitm
2474Count 3rd level cache hits in the modified state.
2475.It Li rd-2ndl-miss
2476Count 2nd level cache misses.
2477.It Li rd-3rdl-miss
2478Count 3rd level cache misses.
2479.It Li wr-2ndl-miss
2480Count write-back lookups from the data access cache that miss the 2nd
2481level cache.
2482.El
2483.Pp
2484The default is to count all the above events.
2485.It Li p4-execution-event Op Li ,mask= Ns Ar flags
2486.Pq "TS event"
2487Count the retirement of tagged uops selected through the execution
2488tagging mechanism.
2489Qualifier
2490.Ar flags
2491can contain the following strings separated by
2492.Ql +
2493characters:
2494.Pp
2495.Bl -tag -width indent -compact
2496.It Li nbogus0 , Li nbogus1 , Li nbogus2 , Li nbogus3
2497The marked uops are not bogus.
2498.It Li bogus0 , Li bogus1 , Li bogus2 , Li bogus3
2499The marked uops are bogus.
2500.El
2501.Pp
2502This event requires additional (upstream) events to be allocated to
2503perform the desired uop tagging.
2504The default is to set all the above flags.
2505This event can be used for precise event based sampling.
2506.It Li p4-front-end-event Op Li ,mask= Ns Ar flags
2507.Pq "TS event"
2508Count the retirement of tagged uops selected through the front-end
2509tagging mechanism.
2510Qualifier
2511.Ar flags
2512can contain the following strings separated by
2513.Ql +
2514characters:
2515.Pp
2516.Bl -tag -width indent -compact
2517.It Li nbogus
2518The marked uops are not bogus.
2519.It Li bogus
2520The marked uops are bogus.
2521.El
2522.Pp
2523This event requires additional (upstream) events to be allocated to
2524perform the desired uop tagging.
2525The default is to select both kinds of events.
2526This event can be used for precise event based sampling.
2527.It Li p4-fsb-data-activity Op Li ,mask= Ns Ar flags
2528.Pq "TI event"
2529Count each DBSY or DRDY event selected by qualifier
2530.Ar flags .
2531Qualifier
2532.Ar flags
2533is a
2534.Ql +
2535separated set of the following flags:
2536.Pp
2537.Bl -tag -width indent -compact
2538.It Li drdy-drv
2539Count when this processor is driving data onto the bus.
2540.It Li drdy-own
2541Count when this processor is reading data from the bus.
2542.It Li drdy-other
2543Count when data is on the bus but not being sampled by this processor.
2544.It Li dbsy-drv
2545Count when this processor reserves the bus for use in the next cycle
2546in order to drive data.
2547.It Li dbsy-own
2548Count when some agent reserves the bus for use in the next bus cycle
2549to drive data that this processor will sample.
2550.It Li dbsy-other
2551Count when some agent reserves the bus for use in the next bus cycle
2552to drive data that this processor will not sample.
2553.El
2554.Pp
2555Flags
2556.Dq Li drdy-own
2557and
2558.Dq Li drdy-other
2559are mutually exclusive.
2560Flags
2561.Dq Li dbsy-own
2562and
2563.Dq Li dbsy-other
2564are mutually exclusive.
2565The default value for
2566.Ar flags
2567is
2568.Dq Li drdy-drv+drdy-own+dbsy-drv+dbsy-own .
2569.It Li p4-global-power-events Op Li ,mask= Ns Ar flags
2570.Pq "TS event"
2571Count cycles during which the processor is not stopped.
2572Qualifier
2573.Ar flags
2574can take the following value (which is also the default):
2575.Pp
2576.Bl -tag -width indent -compact
2577.It Li running
2578Count cycles when the processor is active.
2579.El
2581.It Li p4-instr-retired Op Li ,mask= Ns Ar flags
2582.Pq "TS event"
2583Count instructions retired during a clock cycle.
2584Qualifier
2585.Ar flags
2586comprises the following strings separated by
2587.Ql +
2588characters:
2589.Pp
2590.Bl -tag -width indent -compact
2591.It Li nbogusntag
2592Count non-bogus instructions that are not tagged.
2593.It Li nbogustag
2594Count non-bogus instructions that are tagged.
2595.It Li bogusntag
2596Count bogus instructions that are not tagged.
2597.It Li bogustag
2598Count bogus instructions that are tagged.
2599.El
2600.Pp
2601The default qualifier counts all the above kinds of instructions.
2602.It Li p4-ioq-active-entries Xo
2603.Op Li ,mask= Ns Ar qualifier
2604.Op Li ,busreqtype= Ns Ar req-type
2605.Xc
2606.Pq "TS event"
2607Count the number of entries (clipped at 15) in the IOQ that are
2608active.
2609The event masks are specified by qualifier
2610.Ar qualifier
2611and
2612.Ar req-type .
2613.Pp
2614Qualifier
2615.Ar qualifier
2616is a
2617.Ql +
2618separated set of the following flags:
2619.Pp
2620.Bl -tag -width indent -compact
2621.It Li all-read
2622Count read entries.
2623.It Li all-write
2624Count write entries.
2625.It Li mem-uc
2626Count entries accessing uncacheable memory.
2627.It Li mem-wc
2628Count entries accessing write-combining memory.
2629.It Li mem-wt
2630Count entries accessing write-through memory.
2631.It Li mem-wp
2632Count entries accessing write-protected memory.
2633.It Li mem-wb
2634Count entries accessing write-back memory.
2635.It Li own
2636Count store requests driven by the processor (i.e., not by other
2637processors or by DMA).
2638.It Li other
2639Count store requests driven by other processors or by DMA.
2640.It Li prefetch
2641Include hardware and software prefetch requests in the count.
2642.El
2643.Pp
2644The default value for
2645.Ar qualifier
2646is to enable all the above flags.
2647.Pp
2648The
2649.Ar req-type
2650qualifier is a 5-bit number that can be additionally used to select a
2651specific bus request type.
2652The default is 0.
2653.Pp
2654The
2655.Dq Li edge
2656qualifier should not be used when counting cycles with this event.
2657The exact behaviour of this event depends on the processor revision.
2658.It Li p4-ioq-allocation Xo
2659.Op Li ,mask= Ns Ar qualifier
2660.Op Li ,busreqtype= Ns Ar req-type
2661.Xc
2662.Pq "TS event"
2663Count various types of transactions on the bus matching the flags set
2664in
2665.Ar qualifier
2666and
2667.Ar req-type .
2668.Pp
2669Qualifier
2670.Ar qualifier
2671is a
2672.Ql +
2673separated set of the following flags:
2674.Pp
2675.Bl -tag -width indent -compact
2676.It Li all-read
2677Count read entries.
2678.It Li all-write
2679Count write entries.
2680.It Li mem-uc
2681Count entries accessing uncacheable memory.
2682.It Li mem-wc
2683Count entries accessing write-combining memory.
2684.It Li mem-wt
2685Count entries accessing write-through memory.
2686.It Li mem-wp
2687Count entries accessing write-protected memory.
2688.It Li mem-wb
2689Count entries accessing write-back memory.
2690.It Li own
2691Count store requests driven by the processor (i.e., not by other
2692processors or by DMA).
2693.It Li other
2694Count store requests driven by other processors or by DMA.
2695.It Li prefetch
2696Include hardware and software prefetch requests in the count.
2697.El
2698.Pp
2699The default value for
2700.Ar qualifier
2701is to enable all the above flags.
2702.Pp
2703The
2704.Ar req-type
2705qualifier is a 5-bit number that can be additionally used to select a
2706specific bus request type.
2707The default is 0.
2708.Pp
2709The
2710.Dq Li edge
2711qualifier is normally used with this event to prevent multiple
2712counting.
2713The exact behaviour of this event depends on the processor revision.
2714.It Li p4-itlb-reference Op Li ,mask= Ns Ar qualifier
2715.Pq "TS event"
2716Count translations using the instruction translation look-aside
2717buffer.
2718The
2719.Ar qualifier
2720argument is a list of the following strings separated by
2721.Ql +
2722characters:
2723.Pp
2724.Bl -tag -width indent -compact
2725.It Li hit
2726Count ITLB hits.
2727.It Li miss
2728Count ITLB misses.
2729.It Li hit-uc
2730Count uncacheable ITLB hits.
2731.El
2732.Pp
2733If no
2734.Ar qualifier
2735is specified the default is to count all the three kinds of ITLB
2736translations.
2737.It Li p4-load-port-replay Op Li ,mask= Ns Ar qualifier
2738.Pq "TS event"
2739Count replayed events at the load port.
2740Qualifier
2741.Ar qualifier
2742can take on one value:
2743.Pp
2744.Bl -tag -width indent -compact
2745.It Li split-ld
2746Count split loads.
2747.El
2748.Pp
2749The default value for
2750.Ar qualifier
2751is
2752.Dq Li split-ld .
2753.It Li p4-mispred-branch-retired Op Li ,mask= Ns Ar flags
2754.Pq "TS event"
2755Count mispredicted IA-32 branch instructions.
2756Qualifier
2757.Ar flags
2758can take the following value (which is also the default):
2759.Pp
2760.Bl -tag -width indent -compact
2761.It Li nbogus
2762Count non-bogus retired branch instructions.
2763.El
2764.It Li p4-machine-clear Op Li ,mask= Ns Ar flags
2765.Pq "TS event"
2766Count the number of pipeline clears seen by the processor.
2767Qualifier
2768.Ar flags
2769is a list of the following strings separated by
2770.Ql +
2771characters:
2772.Pp
2773.Bl -tag -width indent -compact
2774.It Li clear
2775Count for a portion of the many cycles when the machine is being
2776cleared for any reason.
2777.It Li moclear
2778Count machine clears due to memory ordering issues.
2779.It Li smclear
2780Count machine clears due to self-modifying code.
2781.El
2782.Pp
2783Use qualifier
2784.Dq Li edge
2785to get a count of occurrences of machine clears.
2786The default qualifier is
2787.Dq Li clear .
2788.It Li p4-memory-cancel Op Li ,mask= Ns Ar event-list
2789.Pq "TS event"
2790Count the cancelling of various kinds of requests in the data cache
2791address control unit of the CPU.
2792The qualifier
2793.Ar event-list
2794is a list of the following strings separated by
2795.Ql +
2796characters:
2797.Pp
2798.Bl -tag -width indent -compact
2799.It Li st-rb-full
2800Requests cancelled because no store request buffer was available.
2801.It Li 64k-conf
2802Requests that conflict due to 64K aliasing.
2803.El
2804.Pp
2805If
2806.Ar event-list
2807is not specified, then the default is to count both kinds of events.
2808.It Li p4-memory-complete Op Li ,mask= Ns Ar event-list
2809.Pq "TS event"
2810Count the completion of load split, store split, uncacheable split and
2811uncacheable load operations selected by qualifier
2812.Ar event-list .
2813The qualifier
2814.Ar event-list
2815is a
2816.Ql +
2817separated list of the following flags:
2818.Pp
2819.Bl -tag -width indent -compact
2820.It Li lsc
2821Count load splits completed, excluding loads from uncacheable or
2822write-combining areas.
2823.It Li ssc
2824Count any split stores completed.
2825.El
2826.Pp
2827The default is to count both kinds of operations.
2828.It Li p4-mob-load-replay Op Li ,mask= Ns Ar qualifier
2829.Pq "TS event"
2830Count load replays triggered by the memory order buffer.
2831Qualifier
2832.Ar qualifier
2833can be a
2834.Ql +
2835separated list of the following flags:
2836.Pp
2837.Bl -tag -width indent -compact
2838.It Li no-sta
2839Count replays because of unknown store addresses.
2840.It Li no-std
2841Count replays because of unknown store data.
2842.It Li partial-data
2843Count replays because of partially overlapped data accesses between
2844load and store operations.
2845.It Li unalgn-addr
2846Count replays because of mismatches in the lower 4 bits of load and
2847store operations.
2848.El
2849.Pp
2850The default qualifier is
2851.Dq Li no-sta+no-std+partial-data+unalgn-addr .
2852.It Li p4-packed-dp-uop Op Li ,mask= Ns Ar flags
2853.Pq "TI event"
2854Count packed double-precision uops.
2855Qualifier
2856.Ar flags
2857can take the following value (which is also the default):
2858.Pp
2859.Bl -tag -width indent -compact
2860.It Li all
2861Count all uops operating on packed double-precision operands.
2862.El
2863.It Li p4-packed-sp-uop Op Li ,mask= Ns Ar flags
2864.Pq "TI event"
2865Count packed single-precision uops.
2866Qualifier
2867.Ar flags
2868can take the following value (which is also the default):
2869.Pp
2870.Bl -tag -width indent -compact
2871.It Li all
2872Count all uops operating on packed single-precision operands.
2873.El
2874.It Li p4-page-walk-type Op Li ,mask= Ns Ar qualifier
2875.Pq "TI event"
2876Count page walks performed by the page miss handler.
2877Qualifier
2878.Ar qualifier
2879can be a
2880.Ql +
2881separated list of the following keywords:
2882.Pp
2883.Bl -tag -width indent -compact
2884.It Li dtmiss
2885Count page walks for data TLB misses.
2886.It Li itmiss
2887Count page walks for instruction TLB misses.
2888.El
2889.Pp
2890The default value for
2891.Ar qualifier
2892is
2893.Dq Li dtmiss+itmiss .
2894.It Li p4-replay-event Op Li ,mask= Ns Ar flags
2895.Pq "TS event"
2896Count the retirement of tagged uops selected through the replay
2897tagging mechanism.
2898Qualifier
2899.Ar flags
2900contains a
2901.Ql +
2902separated set of the following strings:
2903.Pp
2904.Bl -tag -width indent -compact
2905.It Li nbogus
2906The marked uops are not bogus.
2907.It Li bogus
2908The marked uops are bogus.
2909.El
2910.Pp
2911This event requires additional (upstream) events to be allocated to
2912perform the desired uop tagging.
2913The default qualifier counts both kinds of uops.
2914This event can be used for precise event based sampling.
2915.It Li p4-resource-stall Op Li ,mask= Ns Ar flags
2916.Pq "TS event"
2917Count the occurrence or latency of stalls in the allocator.
2918Qualifier
2919.Ar flags
2920can take the following value (which is also the default):
2921.Pp
2922.Bl -tag -width indent -compact
2923.It Li sbfull
2924A stall due to the lack of store buffers.
2925.El
2926.It Li p4-response
2927.Pq "TI event"
2928Count different types of responses.
2929Further documentation on this event is not available.
2930.It Li p4-retired-branch-type Op Li ,mask= Ns Ar flags
2931.Pq "TS event"
2932Count branches retired.
2933Qualifier
2934.Ar flags
2935contains a
2936.Ql +
2937separated list of strings:
2938.Pp
2939.Bl -tag -width indent -compact
2940.It Li conditional
2941Count conditional jumps.
2942.It Li call
2943Count direct and indirect call branches.
2944.It Li return
2945Count return branches.
2946.It Li indirect
2947Count returns, indirect calls or indirect jumps.
2948.El
2949.Pp
2950The default qualifier counts all the above branch types.
2951.It Li p4-retired-mispred-branch-type Op Li ,mask= Ns Ar flags
2952.Pq "TS event"
2953Count mispredicted branches retired.
2954Qualifier
2955.Ar flags
2956contains a
2957.Ql +
2958separated list of strings:
2959.Pp
2960.Bl -tag -width indent -compact
2961.It Li conditional
2962Count conditional jumps.
2963.It Li call
2964Count indirect call branches.
2965.It Li return
2966Count return branches.
2967.It Li indirect
2968Count returns, indirect calls or indirect jumps.
2969.El
2970.Pp
2971The default qualifier counts all the above branch types.
2972.It Li p4-scalar-dp-uop Op Li ,mask= Ns Ar flags
2973.Pq "TI event"
2974Count the number of scalar double-precision uops.
2975Qualifier
2976.Ar flags
2977can take the following value (which is also the default):
2978.Pp
2979.Bl -tag -width indent -compact
2980.It Li all
2981Count all uops operating on scalar double-precision operands.
2982.El
2983.It Li p4-scalar-sp-uop Op Li ,mask= Ns Ar flags
2984.Pq "TI event"
2985Count the number of scalar single-precision uops.
2986Qualifier
2987.Ar flags
2988can take the following value (which is also the default):
2989.Pp
2990.Bl -tag -width indent -compact
2991.It Li all
2992Count all uops operating on scalar single-precision operands.
2993.El
2994.It Li p4-snoop
2995.Pq "TI event"
2996Count snoop traffic.
2997Further documentation on this event is not available.
2998.It Li p4-sse-input-assist Op Li ,mask= Ns Ar flags
2999.Pq "TI event"
3000Count the number of times an assist is required to handle problems
3001with the operands for SSE and SSE2 operations.
3002Qualifier
3003.Ar flags
3004can take the following value (which is also the default):
3005.Pp
3006.Bl -tag -width indent -compact
3007.It Li all
3008Count assists for all SSE and SSE2 uops.
3009.El
3010.It Li p4-store-port-replay Op Li ,mask= Ns Ar qualifier
3011.Pq "TS event"
3012Count events replayed at the store port.
3013Qualifier
3014.Ar qualifier
3015can take on one value:
3016.Pp
3017.Bl -tag -width indent -compact
3018.It Li split-st
3019Count split stores.
3020.El
3021.Pp
3022The default value for
3023.Ar qualifier
3024is
3025.Dq Li split-st .
3026.It Li p4-tc-deliver-mode Op Li ,mask= Ns Ar qualifier
3027.Pq "TI event"
3028Count the duration in cycles of operating modes of the trace cache and
3029decode engine.
3030The desired operating mode is selected by
3031.Ar qualifier ,
3032which is a list of the following strings separated by
3033.Ql +
3034characters:
3035.Pp
3036.Bl -tag -width indent -compact
3037.It Li DD
3038Both logical processors are in deliver mode.
3039.It Li DB
3040Logical processor 0 is in deliver mode while logical processor 1 is in
3041build mode.
3042.It Li DI
3043Logical processor 0 is in deliver mode while logical processor 1 is
3044halted, or in machine clear, or transitioning to a long microcode
3045flow.
3046.It Li BD
3047Logical processor 0 is in build mode while logical processor 1 is in
3048deliver mode.
3049.It Li BB
3050Both logical processors are in build mode.
3051.It Li BI
3052Logical processor 0 is in build mode while logical processor 1 is
3053halted, or in machine clear or transitioning to a long microcode
3054flow.
3055.It Li ID
3056Logical processor 0 is halted, or in machine clear or transitioning to
3057a long microcode flow while logical processor 1 is in deliver mode.
3058.It Li IB
3059Logical processor 0 is halted, or in machine clear or transitioning to
3060a long microcode flow while logical processor 1 is in build mode.
3061.El
3062.Pp
3063If there is only one logical processor in the processor package then
3064the qualifier for logical processor 1 is ignored.
3065If no qualifier is specified, the default qualifier is
3066.Dq Li DD+DB+DI+BD+BB+BI+ID+IB .
3067.It Li p4-tc-ms-xfer Op Li ,mask= Ns Ar flags
3068.Pq "TI event"
3069Count the number of times uop delivery changed from the trace cache to
3070MS ROM.
3071Qualifier
3072.Ar flags
3073can take the following value (which is also the default):
3074.Pp
3075.Bl -tag -width indent -compact
3076.It Li cisc
3077Count TC to MS transfers.
3078.El
3079.It Li p4-uop-queue-writes Op Li ,mask= Ns Ar flags
3080.Pq "TS event"
3081Count the number of valid uops written to the uop queue.
3082Qualifier
3083.Ar flags
3084is a list of the following strings, separated by
3085.Ql +
3086characters:
3087.Pp
3088.Bl -tag -width indent -compact
3089.It Li from-tc-build
3090Count uops being written from the trace cache in build mode.
3091.It Li from-tc-deliver
3092Count uops being written from the trace cache in deliver mode.
3093.It Li from-rom
3094Count uops being written from microcode ROM.
3095.El
3096.Pp
3097The default qualifier counts all the above kinds of uops.
3098.It Li p4-uop-type Op Li ,mask= Ns Ar flags
3099.Pq "TS event"
3100This event is used in conjunction with the front-end at-retirement
3101mechanism to tag load and store uops.
3102Qualifier
3103.Ar flags
3104comprises the following strings separated by
3105.Ql +
3106characters:
3107.Pp
3108.Bl -tag -width indent -compact
3109.It Li tagloads
3110Mark uops that are load operations.
3111.It Li tagstores
3112Mark uops that are store operations.
3113.El
3114.Pp
3115The default qualifier counts both kinds of uops.
3116.It Li p4-uops-retired Op Li ,mask= Ns Ar flags
3117.Pq "TS event"
3118Count uops retired during a clock cycle.
3119Qualifier
3120.Ar flags
3121comprises the following strings separated by
3122.Ql +
3123characters:
3124.Pp
3125.Bl -tag -width indent -compact
3126.It Li nbogus
3127Count marked uops that are not bogus.
3128.It Li bogus
3129Count marked uops that are bogus.
3130.El
3131.Pp
3132The default qualifier counts both kinds of uops.
3133.It Li p4-wc-buffer Op Li ,mask= Ns Ar flags
3134.Pq "TI event"
3135Count write-combining buffer operations.
3136Qualifier
3137.Ar flags
3138contains the following strings separated by
3139.Ql +
3140characters:
3141.Pp
3142.Bl -tag -width indent -compact
3143.It Li wcb-evicts
3144WC buffer evictions due to any cause.
3145.It Li wcb-full-evict
3146WC buffer evictions due to no WC buffer being available.
3147.El
3148.Pp
3149The default qualifier counts both kinds of evictions.
3150.It Li p4-x87-assist Op Li ,mask= Ns Ar flags
3151.Pq "TS event"
3152Count the retirement of x87 instructions that required special
3153handling.
3154Qualifier
3155.Ar flags
3156contains the following strings separated by
3157.Ql +
3158characters:
3159.Pp
3160.Bl -tag -width indent -compact
3161.It Li fpsu
3162Count instructions that saw an FP stack underflow.
3163.It Li fpso
3164Count instructions that saw an FP stack overflow.
3165.It Li poao
3166Count instructions that saw an x87 output overflow.
3167.It Li poau
3168Count instructions that saw an x87 output underflow.
3169.It Li prea
3170Count instructions that needed an x87 input assist.
3171.El
3172.Pp
3173The default qualifier counts all the above types of instruction
3174retirements.
3175.It Li p4-x87-fp-uop Op Li ,mask= Ns Ar flags
3176.Pq "TI event"
3177Count x87 floating-point uops.
3178Qualifier
3179.Ar flags
3180can take the following value (which is also the default):
3181.Pp
3182.Bl -tag -width indent -compact
3183.It Li all
3184Count all x87 floating-point uops.
3185.El
3186.Pp
3187If an instruction contains more than one x87 floating-point uop, then
3188all x87 floating-point uops will be counted.
3189This event does not count x87 floating-point data movement operations.
3190.It Li p4-x87-simd-moves-uop Op Li ,mask= Ns Ar flags
3191.Pq "TI event"
3192Count each x87 FPU, MMX, SSE, or SSE2 uop that loads data, stores
3193data, or performs register-to-register moves.
3194This event does not count integer move uops.
3195Qualifier
3196.Ar flags
3197may contain the following keywords separated by
3198.Ql +
3199characters:
3200.Pp
3201.Bl -tag -width indent -compact
3202.It Li allp0
3203Count all x87 and SIMD store and move uops.
3204.It Li allp2
3205Count all x87 and SIMD load uops.
3206.El
3207.Pp
3208The default is to count all uops.
3209.Pq Errata
3210This event may be affected by processor erratum N43.
3211.El
3212.Ss "Cascading P4 PMCs"
3213PMC cascading support is currently poorly implemented.
3214While individual event counters may be allocated with a
3215.Dq Li cascade
3216qualifier, the current API does not offer the ability
3217to name and allocate all the resources needed for a
3218cascaded event counter pair in a single operation.
3219.Ss "Precise Event Based Sampling"
3220Support for precise event based sampling is currently
3221unimplemented in
3222.Xr hwpmc 4 .
3223.Sh IMPLEMENTATION NOTES
3224On the i386 architecture,
3225.Fx
3226has historically allowed the use of the RDTSC instruction from
3227user-mode (i.e., at a processor CPL of 3) by any process.
3228This behaviour is preserved by
3229.Xr hwpmc 4 .
3230.Sh RETURN VALUES
3231The
3232.Fn pmc_name_of_capability ,
3233.Fn pmc_name_of_class ,
3234.Fn pmc_name_of_cputype ,
3235.Fn pmc_name_of_disposition ,
3236.Fn pmc_name_of_event ,
3237.Fn pmc_name_of_mode ,
3238and
3239.Fn pmc_name_of_state
3240functions return a pointer to the human readable form of their argument.
3241These pointers may point to statically allocated storage and must
3242not be passed to
3243.Fn free .
3244In case of an error, these functions return
3245.Dv NULL
3246and set the global variable
3247.Va errno .
3248.Pp
3249The functions
3250.Fn pmc_ncpu
3251and
3252.Fn pmc_npmc
3253return the number of CPUs and number of PMCs configured, respectively;
3254in case of an error they return the value
3255\-1
3256and set the global variable
3257.Va errno .
3258.Pp
3259All other functions return the value
32600
3261if successful; otherwise the value
3262\-1
3263is returned and the global variable
3264.Va errno
3265is set to indicate the error.
3266.Sh COMPATIBILITY
3267The interface between the
3268.Xr pmc 3
3269library and the
3270.Xr hwpmc 4
3271driver is intended to be private to the implementation and may
3272change.
3273In order to ease forward compatibility with future versions of the
3274.Xr hwpmc 4
3275driver, applications are urged to dynamically link with the
3276.Xr pmc 3
3277library.
3278.Pp
3279The
3280.Xr pmc 3
3281API is
3282.Ud
3283.Sh ERRORS
3284A call to
3285.Fn pmc_init
3286may fail with the following errors in addition to those returned by
3287.Xr modfind 2 ,
3288.Xr modstat 2
3289and
3290.Xr hwpmc 4 :
3291.Bl -tag -width Er
3292.It Bq Er ENXIO
3293An unknown CPU type was encountered during initialization.
3294.It Bq Er EPROGMISMATCH
3295The version number of the
3296.Xr hwpmc 4
3297kernel module did not match that compiled into the
3298.Xr pmc 3
3299library.
3300.El
3301.Pp
3302A call to
3303.Fn pmc_capabilities ,
3304.Fn pmc_name_of_capability ,
3305.Fn pmc_name_of_disposition ,
3306.Fn pmc_name_of_state ,
3307.Fn pmc_name_of_event ,
3308.Fn pmc_name_of_mode ,
3309.Fn pmc_name_of_class
3310and
3311.Fn pmc_width
3312may fail with the following error:
3313.Bl -tag -width Er
3314.It Bq Er EINVAL
3315An invalid argument was passed to the function.
3316.El
3317.Pp
3318A call to
3319.Fn pmc_cpuinfo
3320or
3321.Fn pmc_ncpu
3322may fail with the following error:
3323.Bl -tag -width Er
3324.It Bq Er ENXIO
3325The
3326.Xr pmc 3
3327library has not been initialized.
3328.El
3329.Pp
3330A call to
3331.Fn pmc_npmc
3332may fail with the following errors:
3333.Bl -tag -width Er
3334.It Bq Er EINVAL
3335The argument passed in was out of range.
3336.It Bq Er ENXIO
3337The
3338.Xr pmc 3
3339library has not been initialized.
3340.El
3341.Pp
3342A call to
3343.Fn pmc_pmcinfo
3344may fail with the following errors, in addition to those returned by
3345.Xr hwpmc 4 :
3346.Bl -tag -width Er
3347.It Bq Er ENXIO
3348The
3349.Xr pmc 3
3350library is not yet initialized.
3351.El
3352.Pp
3353A call to
3354.Fn pmc_allocate
3355may fail with the following errors, in addition to those returned by
3356.Xr hwpmc 4 :
3357.Bl -tag -width Er
3358.It Bq Er EINVAL
3359The
3360.Fa mode
3361argument passed in had an illegal value, or the event specification
3362.Fa ctrspec
3363was unrecognized for this CPU type.
3364.El
3365.Pp
3366Calls to
3367.Fn pmc_attach ,
3368.Fn pmc_configure_logfile ,
3369.Fn pmc_detach ,
3370.Fn pmc_disable ,
3371.Fn pmc_enable ,
3372.Fn pmc_get_driver_stats ,
3373.Fn pmc_get_msr ,
3374.Fn pmc_read ,
3375.Fn pmc_release ,
3376.Fn pmc_rw ,
3377.Fn pmc_set ,
3378.Fn pmc_start ,
3379.Fn pmc_stop ,
3380.Fn pmc_write ,
3381and
3382.Fn pmc_writelog
3383may fail with the errors described in
3384.Xr hwpmc 4 .
3385.Pp
3386If a log file was configured using
3387.Fn pmc_configure_logfile
3388and the
3389.Xr hwpmc 4
3390driver encountered an error while logging data to it, then
3391logging will be stopped and a subsequent call to
3392.Fn pmc_flush_logfile
3393will fail with the error code seen by the
3394.Xr hwpmc 4
3395driver.
3396.Sh SEE ALSO
3397.Xr modfind 2 ,
3398.Xr modstat 2 ,
3399.Xr calloc 3 ,
3400.Xr pmclog 3 ,
3401.Xr hwpmc 4 ,
3402.Xr pmccontrol 8 ,
3403.Xr pmcstat 8
3404.Sh HISTORY
3405The
3406.Xr pmc 3
3407library first appeared in
3408.Fx 6.0 .
3409.Sh BUGS
3410The information returned by
3411.Fn pmc_cpuinfo ,
3412.Fn pmc_ncpu
3413and possibly
3414.Fn pmc_npmc
3415should really be available all the time, through a better designed
3416interface and not just when
3417.Xr hwpmc 4
3418is present in the kernel.
3419