xref: /freebsd/lib/libpmc/pmc.3 (revision d429ea332342fcb98d27a350d0c4944bf9aec3f9)
1.\" Copyright (c) 2003-2005 Joseph Koshy.  All rights reserved.
2.\"
3.\" Redistribution and use in source and binary forms, with or without
4.\" modification, are permitted provided that the following conditions
5.\" are met:
6.\" 1. Redistributions of source code must retain the above copyright
7.\"    notice, this list of conditions and the following disclaimer.
8.\" 2. Redistributions in binary form must reproduce the above copyright
9.\"    notice, this list of conditions and the following disclaimer in the
10.\"    documentation and/or other materials provided with the distribution.
11.\"
12.\" This software is provided by Joseph Koshy ``as is'' and
13.\" any express or implied warranties, including, but not limited to, the
14.\" implied warranties of merchantability and fitness for a particular purpose
15.\" are disclaimed.  in no event shall Joseph Koshy be liable
16.\" for any direct, indirect, incidental, special, exemplary, or consequential
17.\" damages (including, but not limited to, procurement of substitute goods
18.\" or services; loss of use, data, or profits; or business interruption)
19.\" however caused and on any theory of liability, whether in contract, strict
20.\" liability, or tort (including negligence or otherwise) arising in any way
21.\" out of the use of this software, even if advised of the possibility of
22.\" such damage.
23.\"
24.\" $FreeBSD$
25.\"
26.Dd April 15, 2005
27.Os
28.Dt PMC 3
29.Sh NAME
30.Nm pmc_allocate ,
31.Nm pmc_attach ,
32.Nm pmc_capabilities ,
33.Nm pmc_configure_logfile ,
34.Nm pmc_cpuinfo ,
35.Nm pmc_detach ,
36.Nm pmc_disable ,
37.Nm pmc_enable ,
38.Nm pmc_event_names_of_class ,
39.Nm pmc_flush_logfile ,
40.Nm pmc_get_driver_stats ,
41.Nm pmc_get_msr ,
42.Nm pmc_init ,
43.Nm pmc_name_of_capability ,
44.Nm pmc_name_of_class ,
45.Nm pmc_name_of_cputype ,
46.Nm pmc_name_of_event ,
47.Nm pmc_name_of_mode ,
48.Nm pmc_name_of_state ,
49.Nm pmc_ncpu ,
50.Nm pmc_npmc ,
51.Nm pmc_pmcinfo ,
52.Nm pmc_read ,
53.Nm pmc_release ,
54.Nm pmc_rw ,
55.Nm pmc_set ,
56.Nm pmc_start ,
57.Nm pmc_stop ,
58.Nm pmc_width ,
59.Nm pmc_write ,
60.Nm pmc_writelog
61.Nd programming API for using hardware performance monitoring counters
62.Sh LIBRARY
63.Lb libpmc
64.Sh SYNOPSIS
65.In pmc.h
66.Ft int
67.Fo pmc_allocate
68.Fa "const char *eventspecifier"
69.Fa "enum pmc_mode mode"
70.Fa "uint32_t flags"
71.Fa "uint32_t cpu"
72.Fa "pmc_id_t *pmcid"
73.Fc
74.Ft int
75.Fo pmc_attach
76.Fa "pmc_id_t pmcid"
77.Fa "pid_t pid"
78.Fc
79.Ft int
80.Fn pmc_capabilities "pmc_id_t pmc" "uint32_t *caps"
81.Ft int
82.Fn pmc_configure_logfile "int fd"
83.Ft int
84.Fn pmc_cpuinfo "const struct pmc_cpuinfo **cpu_info"
85.Ft int
86.Fo pmc_detach
87.Fa "pmc_id_t pmcid"
88.Fa "pid_t pid"
89.Fc
90.Ft int
91.Fn pmc_disable "uint32_t cpu" "int pmc"
92.Ft int
93.Fn pmc_enable "uint32_t cpu" "int pmc"
94.Ft int
95.Fo pmc_event_names_of_class
96.Fa "enum pmc_class cl"
97.Fa "const char ***eventnames"
98.Fa "int *nevents"
99.Fc
100.Ft int
101.Fn pmc_flush_logfile "void"
102.Ft int
103.Fn pmc_get_driver_stats "struct pmc_driverstats *gms"
104.Ft int
105.Fn pmc_get_msr "pmc_id_t pmc" "uint32_t *msr"
106.Ft int
107.Fn pmc_init "void"
108.Ft "const char *"
109.Fn pmc_name_of_capability "enum pmc_caps pc"
110.Ft "const char *"
111.Fn pmc_name_of_class "enum pmc_class pc"
112.Ft "const char *"
113.Fn pmc_name_of_cputype "enum pmc_cputype ct"
114.Ft "const char *"
115.Fn pmc_name_of_disposition "enum pmc_disp pd"
116.Ft "const char *"
117.Fn pmc_name_of_event "enum pmc_event pe"
118.Ft "const char *"
119.Fn pmc_name_of_mode "enum pmc_mode pm"
120.Ft "const char *"
121.Fn pmc_name_of_state "enum pmc_state ps"
122.Ft int
123.Fn pmc_ncpu "void"
124.Ft int
125.Fn pmc_npmc "uint32_t cpu"
126.Ft int
127.Fn pmc_pmcinfo "uint32_t cpu" "struct pmc_pmcinfo **pmc_info"
128.Ft int
129.Fn pmc_read "pmc_id_t pmc" "pmc_value_t *value"
130.Ft int
131.Fn pmc_release "pmc_id_t pmc"
132.Ft int
133.Fn pmc_rw "pmc_id_t pmc" "pmc_value_t newvalue" "pmc_value_t *oldvaluep"
134.Ft int
135.Fn pmc_set "pmc_id_t pmc" "pmc_value_t value"
136.Ft int
137.Fn pmc_start "pmc_id_t pmc"
138.Ft int
139.Fn pmc_stop "pmc_id_t pmc"
140.Ft int
141.Fn pmc_write "pmc_id_t pmc" "pmc_value_t value"
142.Ft int
143.Fn pmc_writelog "uint32_t userdata"
144.Ft int
145.Fn pmc_width "pmc_id_t pmc" "uint32_t *width"
146.Sh DESCRIPTION
147These functions implement a high-level library for using the
148system's hardware performance counters.
149.Pp
150PMCs are allocated using
151.Fn pmc_allocate ,
152released using
153.Fn pmc_release
154and read using
155.Fn pmc_read .
156Allocated PMCs may be started or stopped at any time using
157.Fn pmc_start
158and
159.Fn pmc_stop
160respectively.
161An allocated PMC may be of
162.Qq global
163scope, meaning that the PMC measures system-wide events, or
164.Qq process-private
165scope, meaning that the PMC only counts hardware events when
166the allocating process (or, optionally, its children)
167are active.
168.Pp
169PMCs may further be in
170.Qq "counting mode" ,
171or in
172.Qq "sampling mode" .
173Sampling mode PMCs deliver an interrupt to the CPU after
174a configured number of hardware events have been seen.
175A process-private sampling mode PMC will cause its owner
176process to get periodic
177.Sy SIGPROF
178interrupts, while a global sampling mode PMC is used to
179do system-wide statistical sampling (see
180.Xr hwpmc 4 ) .
181The sampling rate desired of a sampling-mode PMC is set using
182.Fn pmc_set .
183Counting mode PMCs do not interrupt the CPU; their values
184can be read using
185.Fn pmc_read .
186.Pp
187System-wide statistical sampling is configured by allocating
188at least one sampling mode PMC with
189global scope, and by configuring a log file using
190.Fn pmc_configure_logfile .
191The
192.Xr hwpmc 4
193driver manages system-wide statistical sampling; for more
194information please see
195.Xr hwpmc 4 .
196.Ss APPLICATION PROGRAMMING INTERFACE
197.Fn pmc_init
198initializes the
199.Xr pmc 3
200library.
201This function must be called first, before any of the other
202functions in the library.
203.Pp
204.Fn pmc_allocate
205allocates a counter that counts the events named by
206.Fa eventspecifier ,
207and writes the allocated counter id to
208.Fa *pmcid .
209Argument
210.Fa eventspecifier
211comprises a PMC event name followed by an optional comma-separated
212list of keywords and qualifiers.
213The allowed syntax for
214.Fa eventspecifier
215is processor architecture specific and is listed in section
216.Sx "EVENT SPECIFIERS"
217below.
218The desired PMC mode is specified by
219.Fa mode ,
220and any mode specific modifiers are specified using
221.Fa flags .
222The
223.Fa cpu
224argument is the value
225.Li PMC_CPU_ANY ,
226or names the cpu the allocation is to be on.
227Requesting a specific CPU only makes sense for global PMCs;
228process-private PMC allocations should always specify
229.Li PMC_CPU_ANY .
230.Pp
231By default a PMC configured in process-virtual counting mode is set up
232to profile its owner process.
233The function
234.Fn pmc_attach
235may be used to attach the PMC to a different process.
236.Fn pmc_attach
237needs to be called before the counter is first started
238with
239.Fn pmc_start .
240The function
241.Fn pmc_detach
242may be used to detach a PMC from a process it was attached to
243using a prior call to
244.Fn pmc_attach .
245.Pp
246.Fn pmc_release
247releases a PMC previously allocated with
248.Fn pmc_allocate .
249This function call implicitly detaches the PMC from all its target
250processes.
251.Pp
252An allocated PMC may be started and stopped using
253.Fn pmc_start
254and
255.Fn pmc_stop
256respectively.
257.Pp
258The current value of a PMC may be read with
259.Fn pmc_read
260and written using
261.Fn pmc_write ,
262provided the underlying hardware supports these operations on
263the allocated PMC.
264The read and write operations may be combined using
265.Fn pmc_rw .
266.Pp
267The function
268.Fn pmc_capabilities
269sets argument
270.Fa caps
271to a bitmask of capabilities supported by the PMC denoted by
272argument
273.Fa pmc .
274The function
275.Fn pmc_width
276sets argument
277.Fa width
278to the width of the PMC denoted by argument
279.Fa pmc .
280.Pp
281The
282.Fn pmc_configure_logfile
283function causes the
284.Xr hwpmc 4
285driver to log performance data to the file corresponding
286to the process' file handle
287.Fa fd .
288If argument
289.Fa fd
290is -1, then any previously configured logging is reset
291and all data queued to be written are discarded.
292.Pp
293The
294.Fn pmc_flush_logfile
295function will send all data queued inside the
296.Xr hwpmc 4
297driver to the configured log file before returning.
298The
299.Fn pmc_writelog
300function will append a log entry containing the argument
301.Fa userdata
302to the log file.
303.Pp
304.Fn pmc_set
305configures a sampling PMC
306.Fa pmc
307to interrupt every
308.Fa value
309events.
310For counting PMCs,
311.Fn pmc_set
312sets the initial value of the PMC to
313.Fa value .
314.Pp
315.Fn pmc_get_driver_stats
316copies a snapshot of the usage statistics maintained by
317.Xr hwpmc 4
318into the memory area pointed to by argument
319.Fa gms .
320.Ss SIGNAL HANDLING REQUIREMENTS
321Applications using PMCs are required to handle the following signals:
322.Bl -tag -width indent
323.It SIGBUS
324When the
325.Xr hwpmc 4
326module is unloaded using
327.Xr kldunload 8 ,
328processes that have PMCs allocated to them will be sent a
329SIGBUS signal.
330.It SIGIO
331The
332.Xr hwpmc 4
333driver will send a PMC owning process a SIGIO signal if:
334.Bl -bullet
335.It
336Any process-mode PMC allocated by it loses all its
337target processes.
338.It
339The driver encounters an error when writing log data to a
340configured log file.
341This error may be retrieved by a subsequent call to
342.Fn pmc_flush_logfile .
343.El
344.El
345.Ss CONVENIENCE FUNCTIONS
346.Fn pmc_ncpu
347returns the number of CPUs present in the system.
348.Pp
349.Fn pmc_npmc
350returns the number of PMCs supported on CPU
351.Fa cpu .
352.Fn pmc_cpuinfo
353sets argument
354.Fa cpu_info
355to point to a structure with information about the system's CPUs.
356Function
357.Fn pmc_pmcinfo
358returns information about the current state of CPU
359.Fa cpu Ap s
360PMCs.
361This function sets argument
362.Fa *pmc_info
363to point to a memory area allocated with
364.Xr calloc 3 .
365The caller is expected to
366.Fn free
367the area when done.
368.Pp
369The functions
370.Fn pmc_name_of_capability ,
371.Fn pmc_name_of_class ,
372.Fn pmc_name_of_cputype ,
373.Fn pmc_name_of_disposition ,
374.Fn pmc_name_of_event ,
375.Fn pmc_name_of_mode
376and
377.Fn pmc_name_of_state
378are useful for code wanting to print error messages.
379They return
380.Ft "const char *"
381pointers to human-readable representations of their arguments.
382These return values should not be freed using
383.Xr free 3 .
384.Pp
385.Fn pmc_event_names_of_class
386returns a list of event names supported by a given PMC class
387.Fa cl .
388On successful return, an array of
389.Ft "const char *"
390pointers to the names of valid events supported by class
391.Fa cl
392is allocated by the library using
393.Xr malloc 3 ,
394and a pointer to this array is returned in the location pointed to by
395.Fa eventnames .
396The number of pointers allocated is returned in the location pointed
397to by
398.Fa nevents .
399.Ss ADMINISTRATION
400Individual PMCs may be enabled or disabled on a given CPU using
401.Fn pmc_enable
402and
403.Fn pmc_disable
404respectively.
405For these functions,
406.Fa cpu
407is the CPU number, and
408.Fa pmc
409is the index of the PMC to be operated on.
410Only the super-user is allowed to enable and disable PMCs.
411.Ss X86 ARCHITECTURE SPECIFIC API
412The
413.Fn pmc_get_msr
414function returns the processor model specific register number
415associated with
416.Fa pmc .
417Applications may use the x86
418.Sy RDPMC
419instruction to directly read the contents of the PMC.
420.Sh EVENT SPECIFIERS
421Event specifiers are strings comprising an event name, followed by
422optional parameters modifying the semantics of the hardware event
423being probed.
424Event names are PMC architecture dependent, but the
425.Xr pmc 3
426library defines machine independent aliases for commonly used
427events.
428.Ss Event Name Aliases
429Event name aliases are CPU architecture independent names for commonly
430used events.
431The following aliases are known to this version of the
432.Xr pmc 3
433library:
434.Bl -tag -width indent
435.It Li branches
436Measure the number of branches retired.
437.It Li branch-mispredicts
438Measure the number of retired branches that were mispredicted.
439.It Li cycles
440Measure processor cycles.
441This event is implemented using the processor's Time Stamp Counter
442register.
443.It Li dc-misses
444Measure the number of data cache misses.
445.It Li ic-misses
446Measure the number of instruction cache misses.
447.It Li instructions
448Measure the number of instructions retired.
449.It Li interrupts
450Measure the number of interrupts seen.
451.El
452.Ss Time Stamp Counter (TSC)
453The timestamp counter is a monotonically non-decreasing counter that
454counts processor cycles.
455.Pp
456In the i386 architecture this counter may
457be selected by requesting an event with eventspecifier
458.Ic tsc .
459The
460.Ic tsc
461event does not support any further qualifiers.
462It can only be allocated in system-wide counting mode,
463and is a read-only counter.
464Multiple processes are allowed to allocate the TSC.
465Once allocated, it may be read using the
466.Fn pmc_read
467function, or by using the RDTSC instruction.
468.Ss AMD (K7) PMCs
469These PMCs are present in the
470.Tn "AMD Athlon"
471series of CPUs and are documented in:
472.Rs
473.%B "AMD Athlon Processor x86 Code Optimization Guide"
474.%N "Publication No. 22007"
475.%D "February 2002"
476.%Q "Advanced Micro Devices, Inc."
477.Re
478.Pp
479Event specifiers for AMD K7 PMCs can have the following optional
480qualifiers:
481.Bl -tag -width indent
482.It Li count= Ns Ar value
483Configure the counter to increment only if the number of configured
484events measured in a cycle is greater than or equal to
485.Ar value .
486.It Li edge
487Configure the counter to only count negated-to-asserted transitions
488of the conditions expressed by the other qualifiers.
489In other words, the counter will increment only once whenever a given
490condition becomes true, irrespective of the number of clocks during
491which the condition remains true.
492.It Li inv
493Invert the sense of comparison when the
494.Li count
495qualifier is present, making the counter increment when the
496number of events per cycle is less than the value specified by
497the
498.Li count
499qualifier.
500.It Li os
501Configure the PMC to count events happening at privilege level 0.
502.It Li unitmask= Ns Ar mask
503This qualifier is used to further qualify a select few events,
504.Li k7-dc-refills-from-l2 ,
505.Li k7-dc-refills-from-system
506and
507.Li k7-dc-writebacks .
508Here
509.Ar mask
510is a string of the following characters optionally separated by
511.Li "+"
512characters:
513.Bl -tag -width indent -compact
514.It Li m
515Count operations for lines in the
516.Dq Modified
517state.
518.It Li o
519Count operations for lines in the
520.Dq Owner
521state.
522.It Li e
523Count operations for lines in the
524.Dq Exclusive
525state.
526.It Li s
527Count operations for lines in the
528.Dq Shared
529state.
530.It Li i
531Count operations for lines in the
532.Dq Invalid
533state.
534.El
535If no
536.Ar unitmask
537qualifier is specified, the default is to count events for cache
538lines in any of the above states.
539.It Li usr
540Configure the PMC to count events occurring at privilege levels 1, 2
541or 3.
542.El
543If neither of the
544.Li os
545or
546.Li usr
547qualifiers were specified, the default is to enable both.
548.Pp
549The event specifiers supported on AMD K7 PMCs are:
550.Bl -tag -width indent
551.It Li k7-dc-accesses
552Count data cache accesses.
553.It Li k7-dc-misses
554Count data cache misses.
555.It Li k7-dc-refills-from-l2 Op Li ,unitmask= Ns Ar mask
556Count data cache refills from L2 cache.
557This event may be further qualified using the
558.Li unitmask
559qualifier.
560.It Li k7-dc-refills-from-system Op Li ,unitmask= Ns Ar mask
561Count data cache refills from system memory.
562This event may be further qualified using the
563.Li unitmask
564qualifier.
565.It Li k7-dc-writebacks Op Li ,unitmask= Ns Ar mask
566Count data cache writebacks.
567This event may be further qualified using the
568.Li unitmask
569qualifier.
570.It Li k7-l1-dtlb-miss-and-l2-dtlb-hits
571Count L1 DTLB misses and L2 DTLB hits.
572.It Li k7-l1-and-l2-dtlb-misses
573Count L1 and L2 DTLB misses.
574.It Li k7-misaligned-references
575Count misaligned data references.
576.It Li k7-ic-fetches
577Count instruction cache fetches.
578.It Li k7-ic-misses
579Count instruction cache misses.
580.It Li k7-l1-itlb-misses
581Count L1 ITLB misses that are L2 ITLB hits.
582.It Li k7-l1-l2-itlb-misses
583Count L1 (and L2) ITLB misses.
584.It Li k7-retired-instructions
585Count all retired instructions.
586.It Li k7-retired-ops
587Count retired ops.
588.It Li k7-retired-branches
589Count all retired branches (conditional, unconditional, exceptions
590and interrupts).
591.It Li k7-retired-branches-mispredicted
592Count all mispredicted retired branches.
593.It Li k7-retired-taken-branches
594Count retired taken branches.
595.It Li k7-retired-taken-branches-mispredicted
596Count mispredicted taken branches that were retired.
597.It Li k7-retired-far-control-transfers
598Count retired far control transfers.
599.It Li k7-retired-resync-branches
600Count retired resync branches (non control transfer branches).
601.It Li k7-interrupts-masked-cycles
602Count the number of cycles when the processor's
603.Li IF
604flag was zero.
605.It Li k7-interrupts-masked-while-pending-cycles
606Count the number of cycles interrupts were masked while pending due
607to the processor's
608.Li IF
609flag being zero.
610.It Li k7-hardware-interrupts
611Count the number of taken hardware interrupts.
612.El
613.Ss AMD (K8) PMCs
614These PMCs are present in the
615.Tn "AMD Athlon64"
616and
617.Tn "AMD Opteron"
618series of CPUs.
619They are documented in:
620.Rs
621.%B "BIOS and Kernel Developer's Guide for the AMD Athlon(tm) 64 and AMD Opteron Processors"
622.%N "Publication No. 26094"
623.%D "April 2004"
624.%Q "Advanced Micro Devices, Inc."
625.Re
626.Pp
627Event specifiers for AMD K8 PMCs can have the following optional
628qualifiers:
629.Bl -tag -width indent
630.It Li count= Ns Ar value
631Configure the counter to increment only if the number of configured
632events measured in a cycle is greater than or equal to
633.Ar value .
634.It Li edge
635Configure the counter to only count negated-to-asserted transitions
636of the conditions expressed by the other fields.
637In other words, the counter will increment only once whenever a given
638condition becomes true, irrespective of the number of clocks during
639which the condition remains true.
640.It Li inv
641Invert the sense of comparison when the
642.Li count
643qualifier is present, making the counter increment when the
644number of events per cycle is less than the value specified by
645the
646.Li count
647qualifier.
648.It Li mask= Ns Ar qualifier
649Many event specifiers for AMD K8 PMCs need to be additionally
650qualified using a mask qualifier.
651These additional qualifiers are event-specific and are documented
652along with their associated event specifiers below.
653.It Li os
654Configure the PMC to count events happening at privilege level 0.
655.It Li usr
656Configure the PMC to count events occurring at privilege levels 1, 2
657or 3.
658.El
659If neither of the
660.Li os
661or
662.Li usr
663qualifiers were specified, the default is to enable both.
664.Pp
665The event specifiers supported on AMD K8 PMCs are:
666.Bl -tag -width indent
667.It Li k8-bu-cpu-clk-unhalted
668Count the number of clock cycles when the CPU is not in the HLT or
669STPCLK states.
670.It Li k8-bu-fill-request-l2-miss Op Li ,mask= Ns Ar qualifier
671Count fill requests that missed in the L2 cache.
672This event may be further qualified using
673.Ar qualifier ,
674which is a
675.Li + Ns - Ns
676separated set of the following keywords:
677.Bl -tag -width "XXXXXXXXXX" -compact
678.It Li dc-fill
679Count data cache fill requests.
680.It Li ic-fill
681Count instruction cache fill requests.
682.It Li tlb-reload
683Count TLB reloads.
684.El
685The default is to count all types of requests.
686.It Li k8-bu-internal-l2-request Op Li ,mask= Ns Ar qualifier
687Count internally generated requests to the L2 cache.
688This event may be further qualified using
689.Ar qualifier ,
690which is a
691.Li "+" Ns - Ns
692separated set of the following keywords:
693.Bl -tag -width "XXXXXXXXXX" -compact
694.It Li cancelled
695Count cancelled requests.
696.It Li dc-fill
697Count data cache fill requests.
698.It Li ic-fill
699Count instruction cache fill requests.
700.It Li tag-snoop
701Count tag snoop requests.
702.It Li tlb-reload
703Count TLB reloads.
704.El
705The default is to count all types of requests.
706.It Li k8-dc-access
707Count data cache accesses including microcode scratchpad accesses.
708.It Li k8-dc-copyback Op Li ,mask= Ns Ar qualifier
709Count data cache copyback operations.
710This event may be further qualified using
711.Ar qualifier ,
712which is a
713.Li "+" Ns - Ns
714separated set of the following keywords:
715.Bl -tag -width "exclusive" -compact
716.It Li exclusive
717Count operations for lines in the
718.Dq exclusive
719state.
720.It Li invalid
721Count operations for lines in the
722.Dq invalid
723state.
724.It Li modified
725Count operations for lines in the
726.Dq modified
727state.
728.It Li owner
729Count operations for lines in the
730.Dq owner
731state.
732.It Li shared
733Count operations for lines in the
734.Dq shared
735state.
736.El
737The default is to count operations for lines in all the
738above states.
739.It Li k8-dc-dcache-accesses-by-locks Op Li ,mask= Ns Ar qualifier
740Count data cache accesses by lock instructions.
741This event is only available on processors of revision C or later
742vintage.
743This event may be further qualified using
744.Ar qualifier ,
745which is a
746.Li "+" Ns - Ns
747separated set of the following keywords:
748.Bl -tag -width "exclusive" -compact
749.It Li accesses
750Count data cache accesses by lock instructions.
751.It Li misses
752Count data cache misses by lock instructions.
753.El
754The default is to count all accesses.
755.It Li k8-dc-dispatched-prefetch-instructions Op Li ,mask= Ns Ar qualifier
756Count the number of dispatched prefetch instructions.
757This event may be further qualified using
758.Ar qualifier ,
759which is a
760.Li "+" Ns - Ns
761separated set of the following keywords:
762.Bl -tag -width "exclusive" -compact
763.It Li load
764Count load operations.
765.It Li nta
766Count non-temporal operations.
767.It Li store
768Count store operations.
769.El
770The default is to count all operations.
771.It Li k8-dc-l1-dtlb-miss-and-l2-dtlb-hit
772Count L1 DTLB misses that are L2 DTLB hits.
773.It Li k8-dc-l1-dtlb-miss-and-l2-dtlb-miss
774Count L1 DTLB misses that are also misses in the L2 DTLB.
775.It Li k8-dc-microarchitectural-early-cancel-of-an-access
776Count microarchitectural early cancels of data cache accesses.
777.It Li k8-dc-microarchitectural-late-cancel-of-an-access
778Count microarchitectural late cancels of data cache accesses.
779.It Li k8-dc-misaligned-data-reference
780Count misaligned data references.
781.It Li k8-dc-miss
782Count data cache misses.
783.It Li k8-dc-one-bit-ecc-error Op Li ,mask= Ns Ar qualifier
784Count one bit ECC errors found by the scrubber.
785This event may be further qualified using
786.Ar qualifier ,
787which is a
788.Li "+" Ns - Ns
789separated set of the following keywords:
790.Bl -tag -width "piggyback" -compact
791.It Li scrubber
792Count scrubber detected errors.
793.It Li piggyback
794Count piggyback scrubber errors.
795.El
796The default is to count both kinds of errors.
797.It Li k8-dc-refill-from-l2 Op Li ,mask= Ns Ar qualifier
798Count data cache refills from L2 cache.
799This event may be further qualified using
800.Ar qualifier ,
801which is a
802.Li "+" Ns - Ns
803separated set of the following keywords:
804.Bl -tag -width "exclusive" -compact
805.It Li exclusive
806Count operations for lines in the
807.Dq exclusive
808state.
809.It Li invalid
810Count operations for lines in the
811.Dq invalid
812state.
813.It Li modified
814Count operations for lines in the
815.Dq modified
816state.
817.It Li owner
818Count operations for lines in the
819.Dq owner
820state.
821.It Li shared
822Count operations for lines in the
823.Dq shared
824state.
825.El
826The default is to count operations for lines in all the
827above states.
828.It Li k8-dc-refill-from-system Op Li ,mask= Ns Ar qualifier
829Count data cache refills from system memory.
830This event may be further qualified using
831.Ar qualifier ,
832which is a
833.Li "+" Ns - Ns
834separated set of the following keywords:
835.Bl -tag -width "exclusive" -compact
836.It Li exclusive
837Count operations for lines in the
838.Dq exclusive
839state.
840.It Li invalid
841Count operations for lines in the
842.Dq invalid
843state.
844.It Li modified
845Count operations for lines in the
846.Dq modified
847state.
848.It Li owner
849Count operations for lines in the
850.Dq owner
851state.
852.It Li shared
853Count operations for lines in the
854.Dq shared
855state.
856.El
857The default is to count operations for lines in all the
858above states.
859.It Li k8-fp-dispatched-fpu-ops Op Li ,mask= Ns Ar qualifier
860Count the number of dispatched FPU ops.
861This event is supported in revision B and later CPUs.
862This event may be further qualified using
863.Ar qualifier ,
864which is a
865.Li "+" Ns - Ns
866separated set of the following keywords:
867.Bl -tag -width "XXXXXXXXXX" -compact
868.It Li add-pipe-excluding-junk-ops
869Count add pipe ops excluding junk ops.
870.It Li add-pipe-junk-ops
871Count junk ops in the add pipe.
872.It Li multiply-pipe-excluding-junk-ops
873Count multiply pipe ops excluding junk ops.
874.It Li multiply-pipe-junk-ops
875Count junk ops in the multiply pipe.
876.It Li store-pipe-excluding-junk-ops
877Count store pipe ops excluding junk ops.
878.It Li store-pipe-junk-ops
879Count junk ops in the store pipe.
880.El
881The default is to count all types of ops.
882.It Li k8-fp-cycles-with-no-fpu-ops-retired
883Count cycles when no FPU ops were retired.
884This event is supported in revision B and later CPUs.
885.It Li k8-fp-dispatched-fpu-fast-flag-ops
886Count dispatched FPU ops that use the fast flag interface.
887This event is supported in revision B and later CPUs.
888.It Li k8-fr-decoder-empty
889Count cycles when there was nothing to dispatch (i.e., the decoder
890was empty).
891.It Li k8-fr-dispatch-stalls
892Count all dispatch stalls.
893.It Li k8-fr-dispatch-stall-for-segment-load
894Count dispatch stalls for segment loads.
895.It Li k8-fr-dispatch-stall-for-serialization
896Count dispatch stalls for serialization.
897.It Li k8-fr-dispatch-stall-from-branch-abort-to-retire
898Count dispatch stalls from branch abort to retiral.
899.It Li k8-fr-dispatch-stall-when-fpu-is-full
900Count dispatch stalls when the FPU is full.
901.It Li k8-fr-dispatch-stall-when-ls-is-full
902Count dispatch stalls when the load/store unit is full.
903.It Li k8-fr-dispatch-stall-when-reorder-buffer-is-full
904Count dispatch stalls when the reorder buffer is full.
905.It Li k8-fr-dispatch-stall-when-reservation-stations-are-full
906Count dispatch stalls when reservation stations are full.
907.It Li k8-fr-dispatch-stall-when-waiting-for-all-to-be-quiet
908Count dispatch stalls when waiting for all to be quiet.
909.\" XXX What does "waiting for all to be quiet" mean?
910.It Li k8-fr-dispatch-stall-when-waiting-far-xfer-or-resync-branch-pending
911Count dispatch stalls when a far control transfer or a resync branch
912is pending.
913.It Li k8-fr-fpu-exceptions Op Li ,mask= Ns Ar qualifier
914Count FPU exceptions.
915This event is supported in revision B and later CPUs.
916This event may be further qualified using
917.Ar qualifier ,
918which is a
919.Li "+" Ns - Ns
920separated set of the following keywords:
921.Bl -tag -width "XXXXXXXXXX" -compact
922.It Li sse-and-x87-microtraps
923Count SSE and x87 microtraps.
924.It Li sse-reclass-microfaults
925Count SSE reclass microfaults.
926.It Li sse-retype-microfaults
927Count SSE retype microfaults.
928.It Li x87-reclass-microfaults
929Count x87 reclass microfaults.
930.El
931The default is to count all types of exceptions.
932.It Li k8-fr-interrupts-masked-cycles
933Count cycles when interrupts were masked (i.e., when CPU RFLAGS field IF was zero).
934.It Li k8-fr-interrupts-masked-while-pending-cycles
935Count cycles while interrupts were masked while pending (i.e., cycles
936when INTR was asserted while CPU RFLAGS field IF was zero).
937.It Li k8-fr-number-of-breakpoints-for-dr0
938Count the number of breakpoints for DR0.
939.It Li k8-fr-number-of-breakpoints-for-dr1
940Count the number of breakpoints for DR1.
941.It Li k8-fr-number-of-breakpoints-for-dr2
942Count the number of breakpoints for DR2.
943.It Li k8-fr-number-of-breakpoints-for-dr3
944Count the number of breakpoints for DR3.
945.It Li k8-fr-retired-branches
946Count retired branches including exceptions and interrupts.
947.It Li k8-fr-retired-branches-mispredicted
948Count mispredicted retired branches.
949.It Li k8-fr-retired-far-control-transfers
950Count retired far control transfers (which are always mispredicted).
951.It Li k8-fr-retired-fastpath-double-op-instructions Op Li ,mask= Ns Ar qualifier
952Count retired fastpath double op instructions.
953This event is supported in revision B and later CPUs.
954This event may be further qualified using
955.Ar qualifier ,
956which is a
957.Li "+" Ns - Ns
958separated set of the following keywords:
959.Bl -tag -width "XXXXXXXXXXXX" -compact
960.It Li low-op-pos-0
961Count instructions with the low op in position 0.
962.It Li low-op-pos-1
963Count instructions with the low op in position 1.
964.It Li low-op-pos-2
965Count instructions with the low op in position 2.
966.El
967The default is to count all types of instructions.
968.It Li k8-fr-retired-fpu-instructions Op Li ,mask= Ns Ar qualifier
969Count retired FPU instructions.
970This event is supported in revision B and later CPUs.
971This event may be further qualified using
972.Ar qualifier ,
973which is a
974.Li "+" Ns - Ns
975separated set of the following keywords:
976.Bl -tag -width "XXXXXXXXXX" -compact
977.It Li mmx-3dnow
978Count MMX and 3DNow! instructions.
979.It Li packed-sse-sse2
980Count packed SSE and SSE2 instructions.
981.It Li scalar-sse-sse2
982Count scalar SSE and SSE2 instructions.
983.It Li x87
984Count x87 instructions.
985.El
986The default is to count all types of instructions.
987.It Li k8-fr-retired-near-returns
988Count retired near returns.
989.It Li k8-fr-retired-near-returns-mispredicted
990Count mispredicted near returns.
991.It Li k8-fr-retired-resyncs
992Count retired resyncs (non-control transfer branches).
993.It Li k8-fr-retired-taken-hardware-interrupts
994Count retired taken hardware interrupts.
995.It Li k8-fr-retired-taken-branches
996Count retired taken branches.
997.It Li k8-fr-retired-taken-branches-mispredicted
998Count retired taken branches that were mispredicted.
999.It Li k8-fr-retired-taken-branches-mispredicted-by-addr-miscompare
1000Count retired taken branches that were mispredicted only due to an
1001address miscompare.
1002.It Li k8-fr-retired-uops
1003Count retired uops.
1004.It Li k8-fr-retired-x86-instructions
1005Count retired x86 instructions including exceptions and interrupts.
1006.It Li k8-ic-fetch
1007Count instruction cache fetches.
1008.It Li k8-ic-instruction-fetch-stall
1009Count cycles in stalls due to instruction fetch.
1010.It Li k8-ic-l1-itlb-miss-and-l2-itlb-hit
1011Count L1 ITLB misses that are L2 ITLB hits.
1012.It Li k8-ic-l1-itlb-miss-and-l2-itlb-miss
1013Count ITLB misses that miss in both L1 and L2 ITLBs.
1014.It Li k8-ic-microarchitectural-resync-by-snoop
1015Count microarchitectural resyncs caused by snoops.
1016.It Li k8-ic-miss
1017Count instruction cache misses.
1018.It Li k8-ic-refill-from-l2
1019Count instruction cache refills from L2 cache.
1020.It Li k8-ic-refill-from-system
1021Count instruction cache refills from system memory.
1022.It Li k8-ic-return-stack-hits
1023Count hits to the return stack.
1024.It Li k8-ic-return-stack-overflow
1025Count overflows of the return stack.
1026.It Li k8-ls-buffer2-full
1027Count load/store buffer2 full events.
1028.It Li k8-ls-locked-operation Op Li ,mask= Ns Ar qualifier
1029Count locked operations.
1030For revision C and later CPUs, the following qualifiers are supported:
1031.Bl -tag -width "XXXXXXXXXXXXX" -compact
1032.It Li cycles-in-request
1033Count the number of cycles in the lock request/grant stage.
1034.It Li cycles-to-complete
1035Count the number of cycles a lock takes to complete once it is
1036non-speculative and is the oldest load/store operation.
1037.It Li locked-instructions
1038Count the number of lock instructions executed.
1039.El
1040The default is to count the number of lock instructions executed.
1041.It Li k8-ls-microarchitectural-late-cancel
1042Count microarchitectural late cancels of operations in the load/store
1043unit.
1044.It Li k8-ls-microarchitectural-resync-by-self-modifying-code
1045Count microarchitectural resyncs caused by self-modifying code.
1046.It Li k8-ls-microarchitectural-resync-by-snoop
1047Count microarchitectural resyncs caused by snoops.
1048.It Li k8-ls-retired-cflush-instructions
1049Count retired CLFLUSH instructions.
1050.It Li k8-ls-retired-cpuid-instructions
1051Count retired CPUID instructions.
1052.It Li k8-ls-segment-register-load Op Li ,mask= Ns Ar qualifier
1053Count segment register loads.
1054This event may be further qualified using
1055.Ar qualifier ,
1056which is a
1057.Li "+" Ns - Ns
1058separated set of the following keywords:
1059.Bl -tag -width "XX" -compact
1060.It Li cs
1061Count CS register loads.
1062.It Li ds
1063Count DS register loads.
1064.It Li es
1065Count ES register loads.
1066.It Li fs
1067Count FS register loads.
1068.It Li gs
1069Count GS register loads.
1070.\" .It Ic hs
1071.\" Count HS register loads.
1072.\" XXX "HS" register?
1073.It Li ss
1074Count SS register loads.
1075.El
1076The default is to count all types of loads.
1077.It Li k8-nb-memory-controller-bypass-saturation Op Li ,mask= Ns Ar qualifier
1078Count memory controller bypass counter saturation events.
1079This event may be further qualified using
1080.Ar qualifier ,
1081which is a
1082.Li "+" Ns - Ns
1083separated set of the following keywords:
1084.Bl -tag -width "XXXXXXXXXX" -compact
1085.It Li dram-controller-interface-bypass
1086Count DRAM controller interface bypass.
1087.It Li dram-controller-queue-bypass
1088Count DRAM controller queue bypass.
1089.It Li memory-controller-hi-pri-bypass
1090Count memory controller high priority bypasses.
1091.It Li memory-controller-lo-pri-bypass
1092Count memory controller low priority bypasses.
1093.El
1094.It Li k8-nb-memory-controller-dram-slots-missed
1095Count memory controller DRAM command slots missed (in MemClks).
1096.It Li k8-nb-memory-controller-page-access-event Op Li ,mask= Ns Ar qualifier
1097Count memory controller page access events.
1098This event may be further qualified using
1099.Ar qualifier ,
1100which is a
1101.Li "+" Ns - Ns
1102separated set of the following keywords:
1103.Bl -tag -width "XXXXXXXXXX" -compact
1104.It Li page-conflict
1105Count page conflicts.
1106.It Li page-hit
1107Count page hits.
1108.It Li page-miss
1109Count page misses.
1110.El
1111The default is to count all types of events.
1112.It Li k8-nb-memory-controller-page-table-overflow
1113Count memory controller page table overflow events.
1114.It Li k8-nb-probe-result Op Li ,mask= Ns Ar qualifier
1115Count probe events.
1116This event may be further qualified using
1117.Ar qualifier ,
1118which is a
1119.Li "+" Ns - Ns
1120separated set of the following keywords:
1121.Bl -tag -width "exclusive" -compact
1122.It Li probe-hit
1123Count all probe hits.
1124.It Li probe-hit-dirty-no-memory-cancel
1125Count probe hits without memory cancels.
1126.It Li probe-hit-dirty-with-memory-cancel
1127Count probe hits with memory cancels.
1128.It Li probe-miss
1129Count probe misses.
1130.El
1131.It Li k8-nb-sized-commands Op Li ,mask= Ns Ar qualifier
1132Count sized commands issued.
1133This event may be further qualified using
1134.Ar qualifier ,
1135which is a
1136.Li "+" Ns - Ns
1137separated set of the following keywords:
1138.Bl -tag -width "exclusive" -compact
1139.It Li nonpostwrszbyte
1140.It Li nonpostwrszdword
1141.It Li postwrszbyte
1142.It Li postwrszdword
1143.It Li rdszbyte
1144.It Li rdszdword
1145.It Li rdmodwr
1146.El
1147The default is to count all types of commands.
1148.It Li k8-nb-memory-controller-turnaround Op Li ,mask= Ns Ar qualifier
1149Count memory controller turnaround events.
1150This event may be further qualified using
1151.Ar qualifier ,
1152which is a
1153.Li "+" Ns - Ns
1154separated set of the following keywords:
1155.Bl -tag -width "XXXXXXXXXX" -compact
1156.\" XXX doc is unclear whether these are cycle counts or event counts
1157.It Li dimm-turnaround
1158Count DIMM turnarounds.
1159.It Li read-to-write-turnaround
1160Count read to write turnarounds.
1161.It Li write-to-read-turnaround
1162Count write to read turnarounds.
1163.El
1164The default is to count all types of events.
1165.It Li k8-nb-ht-bus0-bandwidth Op Li ,mask= Ns Ar qualifier
1166.It Li k8-nb-ht-bus1-bandwidth Op Li ,mask= Ns Ar qualifier
1167.It Li k8-nb-ht-bus2-bandwidth Op Li ,mask= Ns Ar qualifier
1168Count events on the HyperTransport(tm) buses.
1169These events may be further qualified using
1170.Ar qualifier ,
1171which is a
1172.Li "+" Ns - Ns
1173separated set of the following keywords:
1174.Bl -tag -width "XXXXXXXXXX" -compact
1175.It Li buffer-release
1176Count buffer release messages sent.
1177.It Li command
1178Count command messages sent.
1179.It Li data
1180Count data messages sent.
1181.It Li nop
1182Count nop messages sent.
1183.El
1184The default is to count all types of messages.
1185.El
1186.Ss Intel P6 PMCS
1187Intel P6 PMCs are present in Intel
1188.Tn "Pentium Pro" ,
1189.Tn "Pentium II" ,
1190.Tn "Celeron" ,
1191.Tn "Pentium III"
1192and
1193.Tn "Pentium M"
1194processors.
1195.Pp
1196These CPUs have two counters.
1197Some events may only be used on specific counters and some events are
1198defined only on specific processor models.
1199.Pp
1200These PMCs are documented in
1201.Rs
1202.%B "IA-32 Intel(R) Architecture Software Developer's Manual"
1203.%T "Volume 3: System Programming Guide"
1204.%N "Order Number 245472-012"
1205.%D 2003
1206.%Q "Intel Corporation"
1207.Re
1208.Pp
1209Some of these events are affected by processor errata described in
1210.Rs
1211.%B "Intel(R) Pentium(R) III Processor Specification Update"
1212.%N "Document Number: 244453-054"
1213.%D "April 2005"
1214.%Q "Intel Corporation"
1215.Re
1216.Pp
1217Event specifiers for Intel P6 PMCs can have the following common
1218qualifiers:
1219.Bl -tag -width indent
1220.It Li cmask= Ns Ar value
1221Configure the PMC to increment only if the number of configured
1222events measured in a cycle is greater than or equal to
1223.Ar value .
1224.It Li edge
1225Configure the PMC to count the number of deasserted to asserted
1226transitions of the conditions expressed by the other qualifiers.
1227If specified, the counter will increment only once whenever a
1228condition becomes true, irrespective of the number of clocks during
1229which the condition remains true.
1230.It Li inv
1231Invert the sense of comparison when the
1232.Ar cmask
1233qualifier is present, making the counter increment when the number of
1234events per cycle is less than the value specified by the
1235.Ar cmask
1236qualifier.
1237.It Li os
1238Configure the PMC to count events happening at processor privilege
1239level 0.
1240.It Li umask= Ns Ar value
1241This qualifier is used to further qualify the event selected (see
1242below).
1243.It Li usr
1244Configure the PMC to count events occurring at privilege levels 1, 2
1245or 3.
1246.El
1247If neither of the
1248.Li os
1249or
1250.Li usr
1251qualifiers are specified, the default is to enable both.
1252.Pp
1253The event specifiers supported by Intel P6 PMCs are:
1254.Bl -tag -width indent
1255.It Li p6-baclears
1256Count the number of times a static branch prediction was made by the
1257branch decoder because the BTB did not have a prediction.
1258.It Li p6-br-bac-missp-exec
1259.Pq Tn "Pentium M"
1260Count the number of branch instructions executed that were
1261mispredicted at the Front End (BAC).
1262.It Li p6-br-bogus
1263Count the number of bogus branches.
1264.It Li p6-br-call-exec
1265.Pq Tn "Pentium M"
1266Count the number of call instructions executed.
1267.It Li p6-br-call-missp-exec
1268.Pq Tn "Pentium M"
1269Count the number of call instructions executed that were mispredicted.
1270.It Li p6-br-cnd-exec
1271.Pq Tn "Pentium M"
1272Count the number of conditional branch instructions executed.
1273.It Li p6-br-cnd-missp-exec
1274.Pq Tn "Pentium M"
1275Count the number of conditional branch instructions executed that were
1276mispredicted.
1277.It Li p6-br-ind-call-exec
1278.Pq Tn "Pentium M"
1279Count the number of indirect call instructions executed.
1280.It Li p6-br-ind-exec
1281.Pq Tn "Pentium M"
1282Count the number of indirect branch instructions executed.
1283.It Li p6-br-ind-missp-exec
1284.Pq Tn "Pentium M"
1285Count the number of indirect branch instructions executed that were
1286mispredicted.
1287.It Li p6-br-inst-decoded
1288Count the number of branch instructions decoded.
1289.It Li p6-br-inst-exec
1290.Pq Tn "Pentium M"
1291Count the number of branch instructions executed but not necessarily retired.
1292.It Li p6-br-inst-retired
1293Count the number of branch instructions retired.
1294.It Li p6-br-miss-pred-retired
1295Count the number of mispredicted branch instructions retired.
1296.It Li p6-br-miss-pred-taken-ret
1297Count the number of taken mispredicted branches retired.
1298.It Li p6-br-missp-exec
1299.Pq Tn "Pentium M"
1300Count the number of branch instructions executed that were
1301mispredicted at execution.
1302.It Li p6-br-ret-bac-missp-exec
1303.Pq Tn "Pentium M"
1304Count the number of return instructions executed that were
1305mispredicted at the Front End (BAC).
1306.It Li p6-br-ret-exec
1307.Pq Tn "Pentium M"
1308Count the number of return instructions executed.
1309.It Li p6-br-ret-missp-exec
1310.Pq Tn "Pentium M"
1311Count the number of return instructions executed that were
1312mispredicted at execution.
1313.It Li p6-br-taken-retired
1314Count the number of taken branches retired.
1315.It Li p6-btb-misses
1316Count the number of branches for which the BTB did not produce a
1317prediction.
1318.It Li p6-bus-bnr-drv
1319Count the number of bus clock cycles during which this processor is
1320driving the BNR# pin.
1321.It Li p6-bus-data-rcv
1322Count the number of bus clock cycles during which this processor is
1323receiving data.
1324.It Li p6-bus-drdy-clocks Op Li ,umask= Ns Ar qualifier
1325Count the number of clocks during which DRDY# is asserted.
1326An additional qualifier may be specified, and comprises one of the
1327following keywords:
1328.Bl -tag -width indent -compact
1329.It Li any
1330Count transactions generated by any agent on the bus.
1331.It Li self
1332Count transactions generated by this processor.
1333.El
1334The default is to count operations generated by this processor.
1335.It Li p6-bus-hit-drv
1336Count the number of bus clock cycles during which this processor is
1337driving the HIT# pin.
1338.It Li p6-bus-hitm-drv
1339Count the number of bus clock cycles during which this processor is
1340driving the HITM# pin.
1341.It Li p6-bus-lock-clocks Op Li ,umask= Ns Ar qualifier
1342Count the number of clocks during which LOCK# is asserted on the
1343external system bus.
1344An additional qualifier may be specified and comprises one of the following
1345keywords:
1346.Bl -tag -width indent -compact
1347.It Li any
1348Count transactions generated by any agent on the bus.
1349.It Li self
1350Count transactions generated by this processor.
1351.El
1352The default is to count operations generated by this processor.
1353.It Li p6-bus-req-outstanding
1354Count the number of bus requests outstanding in any given cycle.
1355.It Li p6-bus-snoop-stall
1356Count the number of clock cycles during which the bus is snoop stalled.
1357.It Li p6-bus-tran-any Op Li ,umask= Ns Ar qualifier
1358Count the number of completed bus transactions of any kind.
1359An additional qualifier may be specified and comprises one of the following
1360keywords:
1361.Bl -tag -width indent -compact
1362.It Li any
1363Count transactions generated by any agent on the bus.
1364.It Li self
1365Count transactions generated by this processor.
1366.El
1367The default is to count operations generated by this processor.
1368.It Li p6-bus-tran-brd Op Li ,umask= Ns Ar qualifier
1369Count the number of burst read transactions.
1370An additional qualifier may be specified and comprises one of the following
1371keywords:
1372.Bl -tag -width indent -compact
1373.It Li any
1374Count transactions generated by any agent on the bus.
1375.It Li self
1376Count transactions generated by this processor.
1377.El
1378The default is to count operations generated by this processor.
1379.It Li p6-bus-tran-burst Op Li ,umask= Ns Ar qualifier
1380Count the number of completed burst transactions.
1381An additional qualifier may be specified and comprises one of the following
1382keywords:
1383.Bl -tag -width indent -compact
1384.It Li any
1385Count transactions generated by any agent on the bus.
1386.It Li self
1387Count transactions generated by this processor.
1388.El
1389The default is to count operations generated by this processor.
1390.It Li p6-bus-tran-def Op Li ,umask= Ns Ar qualifier
1391Count the number of completed deferred transactions.
1392An additional qualifier may be specified and comprises one of the following
1393keywords:
1394.Bl -tag -width indent -compact
1395.It Li any
1396Count transactions generated by any agent on the bus.
1397.It Li self
1398Count transactions generated by this processor.
1399.El
1400The default is to count operations generated by this processor.
1401.It Li p6-bus-tran-ifetch Op Li ,umask= Ns Ar qualifier
1402Count the number of completed instruction fetch transactions.
1403An additional qualifier may be specified and comprises one of the following
1404keywords:
1405.Bl -tag -width indent -compact
1406.It Li any
1407Count transactions generated by any agent on the bus.
1408.It Li self
1409Count transactions generated by this processor.
1410.El
1411The default is to count operations generated by this processor.
1412.It Li p6-bus-tran-inval Op Li ,umask= Ns Ar qualifier
1413Count the number of completed invalidate transactions.
1414An additional qualifier may be specified and comprises one of the following
1415keywords:
1416.Bl -tag -width indent -compact
1417.It Li any
1418Count transactions generated by any agent on the bus.
1419.It Li self
1420Count transactions generated by this processor.
1421.El
1422The default is to count operations generated by this processor.
1423.It Li p6-bus-tran-mem Op Li ,umask= Ns Ar qualifier
1424Count the number of completed memory transactions.
1425An additional qualifier may be specified and comprises one of the following
1426keywords:
1427.Bl -tag -width indent -compact
1428.It Li any
1429Count transactions generated by any agent on the bus.
1430.It Li self
1431Count transactions generated by this processor.
1432.El
1433The default is to count operations generated by this processor.
1434.It Li p6-bus-tran-pwr Op Li ,umask= Ns Ar qualifier
1435Count the number of completed partial write transactions.
1436An additional qualifier may be specified and comprises one of the following
1437keywords:
1438.Bl -tag -width indent -compact
1439.It Li any
1440Count transactions generated by any agent on the bus.
1441.It Li self
1442Count transactions generated by this processor.
1443.El
1444The default is to count operations generated by this processor.
1445.It Li p6-bus-tran-rfo Op Li ,umask= Ns Ar qualifier
1446Count the number of completed read-for-ownership transactions.
1447An additional qualifier may be specified and comprises one of the following
1448keywords:
1449.Bl -tag -width indent -compact
1450.It Li any
1451Count transactions generated by any agent on the bus.
1452.It Li self
1453Count transactions generated by this processor.
1454.El
1455The default is to count operations generated by this processor.
1456.It Li p6-bus-trans-io Op Li ,umask= Ns Ar qualifier
1457Count the number of completed I/O transactions.
1458An additional qualifier may be specified and comprises one of the following
1459keywords:
1460.Bl -tag -width indent -compact
1461.It Li any
1462Count transactions generated by any agent on the bus.
1463.It Li self
1464Count transactions generated by this processor.
1465.El
1466The default is to count operations generated by this processor.
1467.It Li p6-bus-trans-p Op Li ,umask= Ns Ar qualifier
1468Count the number of completed partial transactions.
1469An additional qualifier may be specified and comprises one of the following
1470keywords:
1471.Bl -tag -width indent -compact
1472.It Li any
1473Count transactions generated by any agent on the bus.
1474.It Li self
1475Count transactions generated by this processor.
1476.El
1477The default is to count operations generated by this processor.
1478.It Li p6-bus-trans-wb Op Li ,umask= Ns Ar qualifier
1479Count the number of completed write-back transactions.
1480An additional qualifier may be specified and comprises one of the following
1481keywords:
1482.Bl -tag -width indent -compact
1483.It Li any
1484Count transactions generated by any agent on the bus.
1485.It Li self
1486Count transactions generated by this processor.
1487.El
1488The default is to count operations generated by this processor.
1489.It Li p6-cpu-clk-unhalted
1490Count the number of cycles during which the processor was not halted.
1491.Pp
1492.Pq Tn "Pentium M"
1493Count the number of cycles during which the processor was not halted
1494and not in a thermal trip.
1495.It Li p6-cycles-div-busy
1496Count the number of cycles during which the divider is busy and cannot
1497accept new divides.
1498This event is only allocated on counter 0.
1499.It Li p6-cycles-in-pending-and-masked
1500Count the number of processor cycles for which interrupts were
1501disabled and interrupts were pending.
1502.It Li p6-cycles-int-masked
1503Count the number of processor cycles for which interrupts were
1504disabled.
1505.It Li p6-data-mem-refs
1506Count all loads and all stores using any memory type, including
1507internal retries.
1508Each part of a split store is counted separately.
1509.It Li p6-dcu-lines-in
1510Count the total lines allocated in the data cache unit.
1511.It Li p6-dcu-m-lines-in
1512Count the number of M state lines allocated in the data cache unit.
1513.It Li p6-dcu-m-lines-out
1514Count the number of M state lines evicted from the data cache unit.
1515.It Li p6-dcu-miss-outstanding
1516Count the weighted number of cycles while a data cache unit miss is
1517outstanding, incremented by the number of outstanding cache misses at
1518any time.
1519.It Li p6-div
1520Count the number of integer and floating-point divides, including speculative divides.
1521This event is only allocated on counter 1.
1522.It Li p6-emon-esp-uops
1523.Pq Tn "Pentium M"
1524Count the total number of micro-ops.
1525.It Li p6-emon-est-trans Op Li ,umask= Ns Ar qualifier
1526.Pq Tn "Pentium M"
1527Count the number of
1528.Tn "Enhanced Intel SpeedStep"
1529transitions.
1530An additional qualifier may be specified, and can be one of the
1531following keywords:
1532.Bl -tag -width indent -compact
1533.It Li all
1534Count all transitions.
1535.It Li freq
1536Count only frequency transitions.
1537.El
1538The default is to count all transitions.
1539.It Li p6-emon-fused-uops-ret Op Li ,umask= Ns Ar qualifier
1540.Pq Tn "Pentium M"
1541Count the number of retired fused micro-ops.
1542An additional qualifier may be specified, and may be one of the
1543following keywords:
1544.Bl -tag -width indent -compact
1545.It Li all
1546Count all fused micro-ops.
1547.It Li loadop
1548Count only load and op micro-ops.
1549.It Li stdsta
1550Count only STD/STA micro-ops.
1551.El
1552The default is to count all fused micro-ops.
1553.It Li p6-emon-kni-comp-inst-ret Op Li ,umask= Ns Ar qualifier
1554.Pq Tn "Pentium III"
1555Count the number of SSE computational instructions retired.
1556An additional qualifier may be specified, and comprises one of the
1557following keywords:
1558.Bl -tag -width indent -compact
1559.It Li packed-and-scalar
1560Count packed and scalar operations.
1561.It Li scalar
1562Count scalar operations only.
1563.El
1564The default is to count packed and scalar operations.
1565.It Li p6-emon-kni-inst-retired Op Li ,umask= Ns Ar qualifier
1566.Pq Tn "Pentium III"
1567Count the number of SSE instructions retired.
1568An additional qualifier may be specified, and comprises one of the
1569following keywords:
1570.Bl -tag -width indent -compact
1571.It Li packed-and-scalar
1572Count packed and scalar operations.
1573.It Li scalar
1574Count scalar operations only.
1575.El
1576The default is to count packed and scalar operations.
1577.It Li p6-emon-kni-pref-dispatched Op Li ,umask= Ns Ar qualifier
1578.Pq Tn "Pentium III"
1579Count the number of SSE prefetch or weakly ordered instructions
1580dispatched (including speculative prefetches).
1581An additional qualifier may be specified, and comprises one of the
1582following keywords:
1583.Bl -tag -width indent -compact
1584.It Li nta
1585Count non-temporal prefetches.
1586.It Li t1
1587Count prefetches to L1.
1588.It Li t2
1589Count prefetches to L2.
1590.It Li wos
1591Count weakly ordered stores.
1592.El
1593The default is to count non-temporal prefetches.
1594.It Li p6-emon-kni-pref-miss Op Li ,umask= Ns Ar qualifier
1595.Pq Tn "Pentium III"
1596Count the number of prefetch or weakly ordered instructions that miss
1597all caches.
1598An additional qualifier may be specified, and comprises one of the
1599following keywords:
1600.Bl -tag -width indent -compact
1601.It Li nta
1602Count non-temporal prefetches.
1603.It Li t1
1604Count prefetches to L1.
1605.It Li t2
1606Count prefetches to L2.
1607.It Li wos
1608Count weakly ordered stores.
1609.El
1610The default is to count non-temporal prefetches.
1611.It Li p6-emon-pref-rqsts-dn
1612.Pq Tn "Pentium M"
1613Count the number of downward prefetches issued.
1614.It Li p6-emon-pref-rqsts-up
1615.Pq Tn "Pentium M"
1616Count the number of upward prefetches issued.
1617.It Li p6-emon-simd-instr-retired
1618.Pq Tn "Pentium M"
1619Count the number of retired
1620.Tn MMX
1621instructions.
1622.It Li p6-emon-sse-sse2-comp-inst-retired Op Li ,umask= Ns Ar qualifier
1623.Pq Tn "Pentium M"
1624Count the number of computational SSE instructions retired.
1625An additional qualifier may be specified and can be one of the
1626following keywords:
1627.Bl -tag -width indent -compact
1628.It Li sse-packed-single
1629Count SSE packed-single instructions.
1630.It Li sse-scalar-single
1631Count SSE scalar-single instructions.
1632.It Li sse2-packed-double
1633Count SSE2 packed-double instructions.
1634.It Li sse2-scalar-double
1635Count SSE2 scalar-double instructions.
1636.El
1637The default is to count SSE packed-single instructions.
1638.It Li p6-emon-sse-sse2-inst-retired Op Li ,umask= Ns Ar qualifier
1639.Pp
1640.Pq Tn "Pentium M"
1641Count the number of SSE instructions retired.
1642An additional qualifier can be specified, and can be one of the
1643following keywords:
1644.Bl -tag -width indent -compact
1645.It Li sse-packed-single
1646Count SSE packed-single instructions.
1647.It Li sse-packed-single-scalar-single
1648Count SSE packed-single and scalar-single instructions.
1649.It Li sse2-packed-double
1650Count SSE2 packed-double instructions.
1651.It Li sse2-scalar-double
1652Count SSE2 scalar-double instructions.
1653.El
1654The default is to count SSE packed-single instructions.
1655.It Li p6-emon-synch-uops
1656.Pq Tn "Pentium M"
1657Count the number of sync micro-ops.
1658.It Li p6-emon-thermal-trip
1659.Pq Tn "Pentium M"
1660Count the duration or occurrences of thermal trips.
1661Use the
1662.Ar edge
1663qualifier to count occurrences of thermal trips.
1664.It Li p6-emon-unfusion
1665.Pq Tn "Pentium M"
1666Count the number of unfusion events in the reorder buffer.
1667.It Li p6-flops
1668Count the number of computational floating point operations retired.
1669This event is only allocated on counter 0.
1670.It Li p6-fp-assist
1671Count the number of floating point exceptions handled by microcode.
1672This event is only allocated on counter 1.
1673.It Li p6-fp-comps-ops-exe
1674Count the number of computational floating point operations executed.
1675This event is only allocated on counter 0.
1676.It Li p6-fp-mmx-trans Op Li ,umask= Ns Ar qualifier
1677.Pq Tn "Pentium II" , Tn "Pentium III"
1678Count the number of transitions between MMX and floating-point
1679instructions.
1680An additional qualifier may be specified, and comprises one of the
1681following keywords:
1682.Bl -tag -width indent -compact
1683.It Li mmxtofp
1684Count transitions from MMX instructions to floating-point instructions.
1685.It Li fptommx
1686Count transitions from floating-point instructions to MMX instructions.
1687.El
1688The default is to count MMX to floating-point transitions.
1689.It Li p6-hw-int-rx
1690Count the number of hardware interrupts received.
1691.It Li p6-ifu-fetch
1692Count the number of instruction fetches, both cacheable and non-cacheable.
1693.It Li p6-ifu-fetch-miss
1694Count the number of instruction fetch misses (i.e., those that produce
1695memory accesses).
1696.It Li p6-ifu-mem-stall
1697Count the number of cycles instruction fetch is stalled for any reason.
1698.It Li p6-ild-stall
1699Count the number of cycles the instruction length decoder is stalled.
1700.It Li p6-inst-decoded
1701Count the number of instructions decoded.
1702.It Li p6-inst-retired
1703Count the number of instructions retired.
1704.It Li p6-itlb-miss
1705Count the number of instruction TLB misses.
1706.It Li p6-l2-ads
1707Count the number of L2 address strobes.
1708.It Li p6-l2-dbus-busy
1709Count the number of cycles during which the L2 cache data bus was busy.
1710.It Li p6-l2-dbus-busy-rd
1711Count the number of cycles during which the L2 cache data bus was busy
1712transferring read data from L2 to the processor.
1713.It Li p6-l2-ifetch Op Li ,umask= Ns Ar qualifier
1714Count the number of L2 instruction fetches.
1715An additional qualifier may be specified and comprises a list of the following
1716keywords separated by
1717.Li "+"
1718characters:
1719.Bl -tag -width indent -compact
1720.It Li e
1721Count operations affecting E (exclusive) state lines.
1722.It Li i
1723Count operations affecting I (invalid) state lines.
1724.It Li m
1725Count operations affecting M (modified) state lines.
1726.It Li s
1727Count operations affecting S (shared) state lines.
1728.El
1729The default is to count operations affecting all (MESI) state lines.
1730.It Li p6-l2-ld Op Li ,umask= Ns Ar qualifier
1731Count the number of L2 data loads.
1732An additional qualifier may be specified and comprises a list of the following
1733keywords separated by
1734.Li "+"
1735characters:
1736.Bl -tag -width indent -compact
1737.It Li both
1738.Pq Tn "Pentium M"
1739Count both hardware-prefetched lines and non-hardware-prefetched lines.
1740.It Li e
1741Count operations affecting E (exclusive) state lines.
1742.It Li hw
1743.Pq Tn "Pentium M"
1744Count hardware-prefetched lines only.
1745.It Li i
1746Count operations affecting I (invalid) state lines.
1747.It Li m
1748Count operations affecting M (modified) state lines.
1749.It Li nonhw
1750.Pq Tn "Pentium M"
1751Exclude hardware-prefetched lines.
1752.It Li s
1753Count operations affecting S (shared) state lines.
1754.El
1755The default on processors other than
1756.Tn "Pentium M"
1757processors is to count operations affecting all (MESI) state lines.
1758The default on
1759.Tn "Pentium M"
1760processors is to count both hardware-prefetched and
1761non-hardware-prefetch operations on all (MESI) state lines.
1762.Pq Errata
1763This event is affected by processor errata E53.
1764.It Li p6-l2-lines-in Op Li ,umask= Ns Ar qualifier
1765Count the number of L2 lines allocated.
1766An additional qualifier may be specified and comprises a list of the following
1767keywords separated by
1768.Li "+"
1769characters:
1770.Bl -tag -width indent -compact
1771.It Li both
1772.Pq Tn "Pentium M"
1773Count both hardware-prefetched lines and non-hardware-prefetched lines.
1774.It Li e
1775Count operations affecting E (exclusive) state lines.
1776.It Li hw
1777.Pq Tn "Pentium M"
1778Count hardware-prefetched lines only.
1779.It Li i
1780Count operations affecting I (invalid) state lines.
1781.It Li m
1782Count operations affecting M (modified) state lines.
1783.It Li nonhw
1784.Pq Tn "Pentium M"
1785Exclude hardware-prefetched lines.
1786.It Li s
1787Count operations affecting S (shared) state lines.
1788.El
1789The default on processors other than
1790.Tn "Pentium M"
1791processors is to count operations affecting all (MESI) state lines.
1792The default on
1793.Tn "Pentium M"
1794processors is to count both hardware-prefetched and
1795non-hardware-prefetch operations on all (MESI) state lines.
1796.Pq Errata
1797This event is affected by processor errata E45.
1798.It Li p6-l2-lines-out Op Li ,umask= Ns Ar qualifier
1799Count the number of L2 lines evicted.
1800An additional qualifier may be specified and comprises a list of the following
1801keywords separated by
1802.Li "+"
1803characters:
1804.Bl -tag -width indent -compact
1805.It Li both
1806.Pq Tn "Pentium M"
1807Count both hardware-prefetched lines and non-hardware-prefetched lines.
1808.It Li e
1809Count operations affecting E (exclusive) state lines.
1810.It Li hw
1811.Pq Tn "Pentium M"
1812Count hardware-prefetched lines only.
1813.It Li i
1814Count operations affecting I (invalid) state lines.
1815.It Li m
1816Count operations affecting M (modified) state lines.
1817.It Li nonhw
1818.Pq Tn "Pentium M" only
1819Exclude hardware-prefetched lines.
1820.It Li s
1821Count operations affecting S (shared) state lines.
1822.El
1823The default on processors other than
1824.Tn "Pentium M"
1825processors is to count operations affecting all (MESI) state lines.
1826The default on
1827.Tn "Pentium M"
1828processors is to count both hardware-prefetched and
1829non-hardware-prefetch operations on all (MESI) state lines.
1830.Pq Errata
1831This event is affected by processor errata E45.
1832.It Li p6-l2-m-lines-inm
1833Count the number of modified lines allocated in L2 cache.
1834.It Li p6-l2-m-lines-outm Op Li ,umask= Ns Ar qualifier
1835Count the number of L2 M-state lines evicted.
1836.Pp
1837.Pq Tn "Pentium M"
1838On these processors an additional qualifier may be specified and
1839comprises a list of the following keywords separated by
1840.Li "+"
1841characters:
1842.Bl -tag -width indent -compact
1843.It Li both
1844Count both hardware-prefetched lines and non-hardware-prefetched lines.
1845.It Li hw
1846Count hardware-prefetched lines only.
1847.It Li nonhw
1848Exclude hardware-prefetched lines.
1849.El
1850The default is to count both hardware-prefetched and
1851non-hardware-prefetched operations.
1852.Pq Errata
1853This event is affected by processor errata E53.
1854.It Li p6-l2-rqsts Op Li ,umask= Ns Ar qualifier
1855Count the total number of L2 requests.
1856An additional qualifier may be specified and comprises a list of the following
1857keywords separated by
1858.Li "+"
1859characters:
1860.Bl -tag -width indent -compact
1861.It Li e
1862Count operations affecting E (exclusive) state lines.
1863.It Li i
1864Count operations affecting I (invalid) state lines.
1865.It Li m
1866Count operations affecting M (modified) state lines.
1867.It Li s
1868Count operations affecting S (shared) state lines.
1869.El
1870The default is to count operations affecting all (MESI) state lines.
1871.It Li p6-l2-st Op Li ,umask= Ns Ar qualifier
1872Count the number of L2 data stores.
1873An additional qualifier may be specified and comprises a list of the following
1874keywords separated by
1875.Li "+"
1876characters:
1877.Bl -tag -width indent -compact
1878.It Li e
1879Count operations affecting E (exclusive) state lines.
1880.It Li i
1881Count operations affecting I (invalid) state lines.
1882.It Li m
1883Count operations affecting M (modified) state lines.
1884.It Li s
1885Count operations affecting S (shared) state lines.
1886.El
1887The default is to count operations affecting all (MESI) state lines.
1888.It Li p6-ld-blocks
1889Count the number of load operations delayed due to store buffer blocks.
1890.It Li p6-misalign-mem-ref
1891Count the number of misaligned data memory references (crossing a 64
1892bit boundary).
1893.It Li p6-mmx-assist
1894.Pq Tn "Pentium II" , Tn "Pentium III"
1895Count the number of MMX assists executed.
1896.It Li p6-mmx-instr-exec
1897.Pq Tn "Celeron" , Tn "Pentium II"
1898Count the number of MMX instructions executed, except MOVQ and MOVD
1899stores from register to memory.
1900.It Li p6-mmx-instr-ret
1901.Pq Tn "Pentium II"
1902Count the number of MMX instructions retired.
1903.It Li p6-mmx-instr-type-exec Op Li ,umask= Ns Ar qualifier
1904.Pq Tn "Pentium II" , Tn "Pentium III"
1905Count the number of MMX instructions executed.
1906An additional qualifier may be specified and comprises a list of
1907the following keywords separated by
1908.Li "+"
1909characters:
1910.Bl -tag -width indent -compact
1911.It Li pack
1912Count MMX pack operation instructions.
1913.It Li packed-arithmetic
1914Count MMX packed arithmetic instructions.
1915.It Li packed-logical
1916Count MMX packed logical instructions.
1917.It Li packed-multiply
1918Count MMX packed multiply instructions.
1919.It Li packed-shift
1920Count MMX packed shift instructions.
1921.It Li unpack
1922Count MMX unpack operation instructions.
1923.El
1924The default is to count all operations.
1925.It Li p6-mmx-sat-instr-exec
1926.Pq Tn "Pentium II" , Tn "Pentium III"
1927Count the number of MMX saturating instructions executed.
1928.It Li p6-mmx-uops-exec
1929.Pq Tn "Pentium II" , Tn "Pentium III"
1930Count the number of MMX micro-ops executed.
1931.It Li p6-mul
1932Count the number of floating point multiplies.
1933This event is only allocated on counter 1.
1934.It Li p6-partial-rat-stalls
1935Count the number of cycles or events for partial stalls.
1936.It Li p6-resource-stalls
1937Count the number of cycles there was a resource related stall of any kind.
1938.It Li p6-ret-seg-renames
1939.Pq Tn "Pentium II" , Tn "Pentium III"
1940Count the number of segment register rename events retired.
1941.It Li p6-sb-drains
1942Count the number of cycles the store buffer is draining.
1943.It Li p6-seg-reg-renames Op Li ,umask= Ns Ar qualifier
1944.Pq Tn "Pentium II" , Tn "Pentium III"
1945Count the number of segment register renames.
1946An additional qualifier may be specified, and comprises a list of the
1947following keywords separated by
1948.Li "+"
1949characters:
1950.Bl -tag -width indent -compact
1951.It Li ds
1952Count renames for segment register DS.
1953.It Li es
1954Count renames for segment register ES.
1955.It Li fs
1956Count renames for segment register FS.
1957.It Li gs
1958Count renames for segment register GS.
1959.El
1960The default is to count operations affecting all segment registers.
1961.It Li p6-seg-rename-stalls Op Li ,umask= Ns Ar qualifier
1962.Pq Tn "Pentium II" , Tn "Pentium III"
1963Count the number of segment register renaming stalls.
1964An additional qualifier may be specified, and comprises a list of the
1965following keywords separated by
1966.Li "+"
1967characters:
1968.Bl -tag -width indent -compact
1969.It Li ds
1970Count stalls for segment register DS.
1971.It Li es
1972Count stalls for segment register ES.
1973.It Li fs
1974Count stalls for segment register FS.
1975.It Li gs
1976Count stalls for segment register GS.
1977.El
1978The default is to count operations affecting all the segment registers.
1979.It Li p6-segment-reg-loads
1980Count the number of segment register loads.
1981.It Li p6-uops-retired
1982Count the number of micro-ops retired.
1983.El
1984.Ss Intel P4 PMCS
1985Intel P4 PMCs are present in Intel
1986.Tn "Pentium 4"
1987and
1988.Tn Xeon
1989processors.
1990These PMCs are documented in
1991.Rs
1992.%B "IA-32 Intel(R) Architecture Software Developer's Manual"
1993.%T "Volume 3: System Programming Guide"
1994.%N "Order Number 245472-012"
1995.%D 2003
1996.%Q "Intel Corporation"
1997.Re
1998Further information about using these PMCs may be found in
1999.Rs
2000.%B "IA-32 Intel(R) Architecture Optimization Guide"
2001.%D 2003
2002.%N "Order Number 248966-009"
2003.%Q "Intel Corporation"
2004.Re
2005Some of these events are affected by processor errata described in
2006.Rs
2007.%B "Intel(R) Pentium(R) 4 Processor Specification Update"
2008.%N "Document Number:  249199-059"
2009.%D "April 2005"
2010.%Q "Intel Corporation"
2011.Re
2012.Pp
2013Event specifiers for Intel P4 PMCs can have the following common
2014qualifiers:
2015.Bl -tag -width indent
2016.It Li active= Ns Ar choice
2017(On P4 HTT CPUs) Filter event counting based on which logical
2018processors are active.
2019The allowed values of
2020.Ar choice
2021are:
2022.Bl -tag -width indent -compact
2023.It Li any
2024Count when either logical processor is active.
2025.It Li both
2026Count when both logical processors are active.
2027.It Li none
2028Count only when neither logical processor is active.
2029.It Li single
2030Count only when one logical processor is active.
2031.El
2032The default is
2033.Li both .
2034.It Li cascade
2035Configure the PMC to cascade onto its partner.
2036The PMC for the partner must already have been allocated by the
2037current process.
2038See
2039.Sx "Cascading P4 PMCs"
2040below for more information.
2041.It Li edge
2042Configure the counter to count false to true transitions of the threshold
2043comparison output.
2044This qualifier only takes effect if a threshold qualifier has also been
2045specified.
2046.It Li complement
2047Configure the counter to increment only when the event count seen is
2048less than the threshold qualifier value specified.
2049.It Li mask= Ns Ar qualifier
2050Many event specifiers for Intel P4 PMCs need to be additionally
2051qualified using a mask qualifier.
2052The allowed syntax for these qualifiers is event specific and is
2053described along with the events.
2054.It Li os
2055Configure the PMC to count when the CPL of the processor is 0.
2056.It Li precise
2057Select precise event based sampling.
2058Precise sampling is supported by the hardware for a limited set of
2059events.
2060.It Li tag= Ns Ar value
2061Configure the PMC to tag the internal uop selected by the other
2062fields in this event specifier with value
2063.Ar value .
2064This feature is used when cascading PMCs.
2065.It Li threshold= Ns Ar value
2066Configure the PMC to increment only when the event counts seen are
2067greater than the specified threshold value
2068.Ar value .
2069.It Li usr
2070Configure the PMC to count when the CPL of the processor is 1, 2 or 3.
2071.El
2072If neither of the
2073.Li os
2074or
2075.Li usr
2076qualifiers are specified, the default is to enable both.
2077.Pp
2078On Intel Pentium 4 processors with HTT, events are
2079divided into two classes:
2080.Bl -tag -width "XXXXXXXXXX" -compact
2081.It "TS Events"
2082are those where hardware can differentiate events
2083generated on one logical processor from those generated on the
2084other.
2085.It "TI Events"
2086are those where hardware cannot differentiate between events
2087generated by multiple logical processors in a package.
2088.El
2089Only TS events are allowed for use with process-mode PMCs on
2090Pentium-4/HTT CPUs.
2091.Pp
2092The event specifiers supported by Intel P4 PMCs are:
2093.Bl -tag -width indent
2094.It Li p4-128bit-mmx-uop Op Li ,mask= Ns Ar flags
2095.Pq "TI event"
2096Count integer SIMD SSE2 instructions that operate on 128 bit SIMD
2097operands.
2098Qualifier
2099.Ar flags
2100can take the following value (which is also the default):
2101.Bl -tag -width indent -compact
2102.It Li all
2103Count all uops operating on 128 bit SIMD integer operands in memory or
2104XMM register.
2105.El
2106If an instruction contains more than one 128 bit MMX uop, then each
2107uop will be counted.
2108.It Li p4-64bit-mmx-uop Op Li ,mask= Ns Ar flags
2109.Pq "TI event"
2110Count MMX instructions that operate on 64 bit SIMD operands.
2111Qualifier
2112.Ar flags
2113can take the following value (which is also the default):
2114.Bl -tag -width indent -compact
2115.It Li all
2116Count all uops operating on 64 bit SIMD integer operands in memory or
2117in MMX registers.
2118.El
2119If an instruction contains more than one 64 bit MMX uop, then each
2120uop will be counted.
2121.It Li p4-b2b-cycles
2122.Pq "TI event"
2123Count back-to-back bus cycles.
2124Further documentation for this event is unavailable.
2125.It Li p4-bnr
2126.Pq "TI event"
2127Count bus-not-ready conditions.
2128Further documentation for this event is unavailable.
2129.It Li p4-bpu-fetch-request Op Li ,mask= Ns Ar qualifier
2130.Pq "TS event"
2131Count instruction fetch requests qualified by additional
2132flags specified in
2133.Ar qualifier .
2134At this point only one flag is supported:
2135.Bl -tag -width indent -compact
2136.It Li tcmiss
2137Count trace cache lookup misses.
2138.El
2139The default qualifier is also
2140.Ar mask=tcmiss .
2141.It Li p4-branch-retired Op Li ,mask= Ns Ar flags
2142.Pq "TS event"
2143Counts retired branches.
2144Qualifier
2145.Ar flags
2146is a list of the following
2147.Li +
2148separated strings:
2149.Bl -tag -width indent -compact
2150.It Li mmnp
2151Count branches not-taken and predicted.
2152.It Li mmnm
2153Count branches not-taken and mis-predicted.
2154.It Li mmtp
2155Count branches taken and predicted.
2156.It Li mmtm
2157Count branches taken and mis-predicted.
2158.El
2159The default qualifier counts all four kinds of branches.
2160.It Li p4-bsq-active-entries Op Li ,mask= Ns Ar qualifier
2161.Pq "TS event"
2162Count the number of entries (clipped at 15) currently active in the
2163BSQ.
2164Qualifier
2165.Ar qualifier
2166is a
2167.Li +
2168separated set of the following flags:
2169.Bl -tag -width indent -compact
2170.It Li req-type0 , Li req-type1
2171Forms a 2-bit number used to select the request type encoding:
2172.Bl -tag -width indent -compact
2173.It Li 0
2174reads excluding read invalidate
2175.It Li 1
2176read invalidates
2177.It Li 2
2178writes other than writebacks
2179.It Li 3
2180writebacks
2181.El
2182Bit
2183.Li req-type1
2184is the MSB for this two bit number.
2185.It Li req-len0 , Li req-len1
2186Forms a two-bit number that specifies the request length encoding:
2187.Bl -tag -width indent -compact
2188.It Li 0
21890 chunks
2190.It Li 1
21911 chunk
2192.It Li 3
21938 chunks
2194.El
2195Bit
2196.Li req-len1
2197is the MSB for this two bit number.
2198.It Li req-io-type
2199Count requests that are input or output requests.
2200.It Li req-lock-type
2201Count requests that lock the bus.
2202.It Li req-lock-cache
2203Count requests that lock the cache.
2204.It Li req-split-type
2205Count requests that are bus 8-byte chunks split across an
22068-byte boundary.
2207.It Li req-dem-type
2208Count requests that are demand (not prefetches) if set.
2209Count requests that are prefetches if not set.
2210.It Li req-ord-type
2211Count requests that are ordered.
2212.It Li mem-type0 , Li mem-type1 , Li mem-type2
2213Forms a 3-bit number that specifies a memory type encoding:
2214.Bl -tag -width indent -compact
2215.It Li 0
2216UC
2217.It Li 1
2218USWC
2219.It Li 4
2220WT
2221.It Li 5
2222WP
2223.It Li 6
2224WB
2225.El
2226Bit
2227.Li mem-type2
2228is the MSB of this 3-bit number.
2229.El
2230The default qualifier has all the above bits set.
2231.Pp
2232Edge triggering using the
2233.Li edge
2234qualifier should not be used with this event when counting cycles.
2235.It Li p4-bsq-allocation Op Li ,mask= Ns Ar qualifier
2236.Pq "TS event"
2237Count allocations in the bus sequence unit according to the flags
2238specified in
2239.Ar qualifier ,
2240which is a
2241.Li +
2242separated set of the following flags:
2243.Bl -tag -width indent -compact
2244.It Li req-type0 , Li req-type1
2245Forms a 2-bit number used to select the request type encoding:
2246.Bl -tag -width indent -compact
2247.It Li 0
2248reads excluding read invalidate
2249.It Li 1
2250read invalidates
2251.It Li 2
2252writes other than writebacks
2253.It Li 3
2254writebacks
2255.El
2256Bit
2257.Li req-type1
2258is the MSB for this two bit number.
2259.It Li req-len0 , Li req-len1
2260Forms a two-bit number that specifies the request length encoding:
2261.Bl -tag -width indent -compact
2262.It Li 0
22630 chunks
2264.It Li 1
22651 chunk
2266.It Li 3
22678 chunks
2268.El
2269Bit
2270.Li req-len1
2271is the MSB for this two bit number.
2272.It Li req-io-type
2273Count requests that are input or output requests.
2274.It Li req-lock-type
2275Count requests that lock the bus.
2276.It Li req-lock-cache
2277Count requests that lock the cache.
2278.It Li req-split-type
2279Count requests that are bus 8-byte chunks split across an
22808-byte boundary.
2281.It Li req-dem-type
2282Count requests that are demand (not prefetches) if set.
2283Count requests that are prefetches if not set.
2284.It Li req-ord-type
2285Count requests that are ordered.
2286.It Li mem-type0 , Li mem-type1 , Li mem-type2
2287Forms a 3-bit number that specifies a memory type encoding:
2288.Bl -tag -width indent -compact
2289.It Li 0
2290UC
2291.It Li 1
2292USWC
2293.It Li 4
2294WT
2295.It Li 5
2296WP
2297.It Li 6
2298WB
2299.El
2300Bit
2301.Li mem-type2
2302is the MSB of this 3-bit number.
2303.El
2304The default qualifier has all the above bits set.
2305.Pp
2306This event is usually used along with the
2307.Li edge
2308qualifier to avoid multiple counting.
2309.It Li p4-bsq-cache-reference Op Li ,mask= Ns Ar qualifier
2310.Pq "TS event"
2311Count cache references as seen by the bus unit (2nd or 3rd level
2312cache references).
2313Qualifier
2314.Ar qualifier
2315is a
2316.Li +
2317separated list of the following keywords:
2318.Bl -tag -width indent -compact
2319.It Li rd-2ndl-hits
2320Count 2nd level cache hits in the shared state.
2321.It Li rd-2ndl-hite
2322Count 2nd level cache hits in the exclusive state.
2323.It Li rd-2ndl-hitm
2324Count 2nd level cache hits in the modified state.
2325.It Li rd-3rdl-hits
2326Count 3rd level cache hits in the shared state.
2327.It Li rd-3rdl-hite
2328Count 3rd level cache hits in the exclusive state.
2329.It Li rd-3rdl-hitm
2330Count 3rd level cache hits in the modified state.
2331.It Li rd-2ndl-miss
2332Count 2nd level cache misses.
2333.It Li rd-3rdl-miss
2334Count 3rd level cache misses.
2335.It Li wr-2ndl-miss
2336Count write-back lookups from the data access cache that miss the 2nd
2337level cache.
2338.El
2339The default is to count all the above events.
2340.It Li p4-execution-event Op Li ,mask= Ns Ar flags
2341.Pq "TS event"
2342Count the retirement of tagged uops selected through the execution
2343tagging mechanism.
2344Qualifier
2345.Ar flags
2346can contain the following strings separated by
2347.Li +
2348characters:
2349.Bl -tag -width indent -compact
2350.It Li nbogus0 , Li nbogus1 , Li nbogus2 , Li nbogus3
2351The marked uops are not bogus.
2352.It Li bogus0 , Li bogus1 , Li bogus2 , Li bogus3
2353The marked uops are bogus.
2354.El
2355This event requires additional (upstream) events to be allocated to
2356perform the desired uop tagging.
2357The default is to set all the above flags.
2358This event can be used for precise event based sampling.
2359.It Li p4-front-end-event Op Li ,mask= Ns Ar flags
2360.Pq "TS event"
2361Count the retirement of tagged uops selected through the front-end
2362tagging mechanism.
2363Qualifier
2364.Ar flags
2365can contain the following strings separated by
2366.Li +
2367characters:
2368.Bl -tag -width indent -compact
2369.It Li nbogus
2370The marked uops are not bogus.
2371.It Li bogus
2372The marked uops are bogus.
2373.El
2374This event requires additional (upstream) events to be allocated to
2375perform the desired uop tagging.
2376The default is to select both kinds of events.
2377This event can be used for precise event based sampling.
2378.It Li p4-fsb-data-activity Op Li ,mask= Ns Ar flags
2379.Pq "TI event"
2380Count each DBSY or DRDY event selected by qualifier
2381.Ar flags .
2382Qualifier
2383.Ar flags
2384is a
2385.Li +
2386separated set of the following flags:
2387.Bl -tag -width indent -compact
2388.It Li drdy-drv
2389Count when this processor is driving data onto the bus.
2390.It Li drdy-own
2391Count when this processor is reading data from the bus.
2392.It Li drdy-other
2393Count when data is on the bus but not being sampled by this processor.
2394.It Li dbsy-drv
2395Count when this processor reserves the bus for use in the next cycle
2396in order to drive data.
2397.It Li dbsy-own
2398Count when some agent reserves the bus for use in the next bus cycle
2399to drive data that this processor will sample.
2400.It Li dbsy-other
2401Count when some agent reserves the bus for use in the next bus cycle
2402to drive data that this processor will not sample.
2403.El
2404Flags
2405.Li drdy-own
2406and
2407.Li drdy-other
2408are mutually exclusive.
2409Flags
2410.Li dbsy-own
2411and
2412.Li dbsy-other
2413are mutually exclusive.
2414The default value for
2415.Ar flags
2416is
2417.Li drdy-drv+drdy-own+dbsy-drv+dbsy-own .
2418.It Li p4-global-power-events Op Li ,mask= Ns Ar flags
2419.Pq "TS event"
2420Count cycles during which the processor is not stopped.
2421Qualifier
2422.Ar flags
2423can take the following value (which is also the default):
2424.Bl -tag -width indent -compact
2425.It Li running
2426Count cycles when the processor is active.
2427.El
2428.It Li p4-instr-retired Op Li ,mask= Ns Ar flags
2429.Pq "TS event"
2430Count instructions retired during a clock cycle.
2431Qualifier
2432.Ar flags
2433comprises the following strings separated by
2434.Li +
2435characters:
2436.Bl -tag -width indent -compact
2437.It Li nbogusntag
2438Count non-bogus instructions that are not tagged.
2439.It Li nbogustag
2440Count non-bogus instructions that are tagged.
2441.It Li bogusntag
2442Count bogus instructions that are not tagged.
2443.It Li bogustag
2444Count bogus instructions that are tagged.
2445.El
2446The default qualifier counts all the above kinds of instructions.
2447.It Li p4-ioq-active-entries Xo
2448.Op Li ,mask= Ns Ar qualifier
2449.Op Li ,busreqtype= Ns Ar req-type
2450.Xc
2451.Pq "TS event"
2452Count the number of entries (clipped at 15) in the IOQ that are
2453active.
2454The event masks are specified by qualifier
2455.Ar qualifier
2456and
2457.Ar req-type .
2458.Pp
2459Qualifier
2460.Ar qualifier
2461is a
2462.Li +
2463separated set of the following flags:
2464.Bl -tag -width indent -compact
2465.It Li all-read
2466Count read entries.
2467.It Li all-write
2468Count write entries.
2469.It Li mem-uc
2470Count entries accessing uncacheable memory.
2471.It Li mem-wc
2472Count entries accessing write-combining memory.
2473.It Li mem-wt
2474Count entries accessing write-through memory.
2475.It Li mem-wp
2476Count entries accessing write-protected memory.
2477.It Li mem-wb
2478Count entries accessing write-back memory.
2479.It Li own
2480Count store requests driven by the processor (i.e., not by other
2481processors or by DMA).
2482.It Li other
2483Count store requests driven by other processors or by DMA.
2484.It Li prefetch
2485Include hardware and software prefetch requests in the count.
2486.El
2487The default value for
2488.Ar qualifier
2489is to enable all the above flags.
2490.Pp
2491The
2492.Ar req-type
2493qualifier is a 5-bit number that can be additionally used to select a
2494specific bus request type.
2495The default is 0.
2496.Pp
2497The
2498.Li edge
2499qualifier should not be used when counting cycles with this event.
2500The exact behaviour of this event depends on the processor revision.
2501.It Li p4-ioq-allocation Xo
2502.Op Li ,mask= Ns Ar qualifier
2503.Op Li ,busreqtype= Ns Ar req-type
2504.Xc
2505.Pq "TS event"
2506Count various types of transactions on the bus matching the flags set
2507in
2508.Ar qualifier
2509and
2510.Ar req-type .
2511.Pp
2512Qualifier
2513.Ar qualifier
2514is a
2515.Li +
2516separated set of the following flags:
2517.Bl -tag -width indent -compact
2518.It Li all-read
2519Count read entries.
2520.It Li all-write
2521Count write entries.
2522.It Li mem-uc
2523Count entries accessing uncacheable memory.
2524.It Li mem-wc
2525Count entries accessing write-combining memory.
2526.It Li mem-wt
2527Count entries accessing write-through memory.
2528.It Li mem-wp
2529Count entries accessing write-protected memory.
2530.It Li mem-wb
2531Count entries accessing write-back memory.
2532.It Li own
2533Count store requests driven by the processor (i.e., not by other
2534processors or by DMA).
2535.It Li other
2536Count store requests driven by other processors or by DMA.
2537.It Li prefetch
2538Include hardware and software prefetch requests in the count.
2539.El
2540The default value for
2541.Ar qualifier
2542is to enable all the above flags.
2543.Pp
2544The
2545.Ar req-type
2546qualifier is a 5-bit number that can be additionally used to select a
2547specific bus request type.
2548The default is 0.
2549.Pp
2550The
2551.Li edge
2552qualifier is normally used with this event to prevent multiple
2553counting.
2554The exact behaviour of this event depends on the processor revision.
2555.It Li p4-itlb-reference Op Li ,mask= Ns Ar qualifier
2556.Pq "TS event"
2557Count translations using the instruction translation look-aside
2558buffer.
2559The
2560.Ar qualifier
2561argument is a list of the following strings separated by
2562.Li +
2563characters:
2564.Bl -tag -width indent -compact
2565.It Li hit
2566Count ITLB hits.
2567.It Li miss
2568Count ITLB misses.
2569.It Li hit-uc
2570Count uncacheable ITLB hits.
2571.El
2572If no
2573.Ar qualifier
2574is specified the default is to count all the three kinds of ITLB
2575translations.
2576.It Li p4-load-port-replay Op Li ,mask= Ns Ar qualifier
2577.Pq "TS event"
2578Count replayed events at the load port.
2579Qualifier
2580.Ar qualifier
2581can take on one value:
2582.Bl -tag -width indent -compact
2583.It Li split-ld
2584Count split loads.
2585.El
2586The default value for
2587.Ar qualifier
2588is
2589.Li split-ld .
2590.It Li p4-mispred-branch-retired Op Li ,mask= Ns Ar flags
2591.Pq "TS event"
2592Count mispredicted IA-32 branch instructions.
2593Qualifier
2594.Ar flags
2595can take the following value (which is also the default):
2596.Bl -tag -width indent -compact
2597.It Li nbogus
2598Count non-bogus retired branch instructions.
2599.El
2600.It Li p4-machine-clear Op Li ,mask= Ns Ar flags
2601.Pq "TS event"
2602Count the number of pipeline clears seen by the processor.
2603Qualifier
2604.Ar flags
2605is a list of the following strings separated by
2606.Li +
2607characters:
2608.Bl -tag -width indent -compact
2609.It Li clear
2610Count for a portion of the many cycles when the machine is being
2611cleared for any reason.
2612.It Li moclear
2613Count machine clears due to memory ordering issues.
2614.It Li smclear
2615Count machine clears due to self-modifying code.
2616.El
2617Use qualifier
2618.Li edge
2619to get a count of occurrences of machine clears.
2620The default qualifier is
2621.Li clear .
2622.It Li p4-memory-cancel Op Li ,mask= Ns Ar event-list
2623.Pq "TS event"
2624Count the cancelling of various kinds of requests in the data cache
2625address control unit of the CPU.
2626The qualifier
2627.Ar event-list
2628is a list of the following strings separated by
2629.Li "+"
2630characters:
2631.Bl -tag -width indent -compact
2632.It Li st-rb-full
2633Requests cancelled because no store request buffer was available.
2634.It Li 64k-conf
2635Requests that conflict due to 64K aliasing.
2636.El
2637If
2638.Ar event-list
2639is not specified, then the default is to count both kinds of events.
2640.It Li p4-memory-complete Op Li ,mask= Ns Ar event-list
2641.Pq "TS event"
2642Count the completion of load split, store split, uncacheable split and
2643uncacheable load operations selected by qualifier
2644.Ar event-list .
2645The qualifier
2646.Ar event-list
2647is a
2648.Li +
2649separated list of the following flags:
2650.Bl -tag -width indent -compact
2651.It Li lsc
2652Count load splits completed, excluding loads from uncacheable or
2653write-combining areas.
2654.It Li ssc
2655Count any split stores completed.
2656.El
2657The default is to count both kinds of operations.
2658.It Li p4-mob-load-replay Op Li ,mask= Ns Ar qualifier
2659.Pq "TS event"
2660Count load replays triggered by the memory order buffer.
2661Qualifier
2662.Ar qualifier
2663can be a
2664.Li +
2665separated list of the following flags:
2666.Bl -tag -width indent -compact
2667.It Li no-sta
2668Count replays because of unknown store addresses.
2669.It Li no-std
2670Count replays because of unknown store data.
2671.It Li partial-data
2672Count replays because of partially overlapped data accesses between
2673load and store operations.
2674.It Li unalgn-addr
2675Count replays because of mismatches in the lower 4 bits of load and
2676store operations.
2677.El
2678The default qualifier is
2679.Ar no-sta+no-std+partial-data+unalgn-addr .
2680.It Li p4-packed-dp-uop Op Li ,mask= Ns Ar flags
2681.Pq "TI event"
2682Count packed double-precision uops.
2683Qualifier
2684.Ar flags
2685can take the following value (which is also the default):
2686.Bl -tag -width indent -compact
2687.It Li all
2688Count all uops operating on packed double-precision operands.
2689.El
2690.It Li p4-packed-sp-uop Op Li ,mask= Ns Ar flags
2691.Pq "TI event"
2692Count packed single-precision uops.
2693Qualifier
2694.Ar flags
2695can take the following value (which is also the default):
2696.Bl -tag -width indent -compact
2697.It Li all
2698Count all uops operating on packed single-precision operands.
2699.El
2700.It Li p4-page-walk-type Op Li ,mask= Ns Ar qualifier
2701.Pq "TI event"
2702Count page walks performed by the page miss handler.
2703Qualifier
2704.Ar qualifier
2705can be a
2706.Li +
2707separated list of the following keywords:
2708.Bl -tag -width indent -compact
2709.It Li dtmiss
2710Count page walks for data TLB misses.
2711.It Li itmiss
2712Count page walks for instruction TLB misses.
2713.El
2714The default value for
2715.Ar qualifier
2716is
2717.Li dtmiss+itmiss .
2718.It Li p4-replay-event Op Li ,mask= Ns Ar flags
2719.Pq "TS event"
2720Count the retirement of tagged uops selected through the replay
2721tagging mechanism.
2722Qualifier
2723.Ar flags
2724contains a
2725.Li +
2726separated set of the following strings:
2727.Bl -tag -width indent -compact
2728.It Li nbogus
2729The marked uops are not bogus.
2730.It Li bogus
2731The marked uops are bogus.
2732.El
2733This event requires additional (upstream) events to be allocated to
2734perform the desired uop tagging.
2735The default qualifier counts both kinds of uops.
2736This event can be used for precise event based sampling.
2737.It Li p4-resource-stall Op Li ,mask= Ns Ar flags
2738.Pq "TS event"
2739Count the occurrence or latency of stalls in the allocator.
2740Qualifier
2741.Ar flags
2742can take the following value (which is also the default):
2743.Bl -tag -width indent -compact
2744.It Li sbfull
2745A stall due to the lack of store buffers.
2746.El
2747.It Li p4-response
2748.Pq "TI event"
2749Count different types of responses.
2750Further documentation on this event is not available.
2751.It Li p4-retired-branch-type Op Li ,mask= Ns Ar flags
2752.Pq "TS event"
2753Count branches retired.
2754Qualifier
2755.Ar flags
2756contains a
2757.Li +
2758separated list of strings:
2759.Bl -tag -width indent -compact
2760.It Li conditional
2761Count conditional jumps.
2762.It Li call
2763Count direct and indirect call branches.
2764.It Li return
2765Count return branches.
2766.It Li indirect
2767Count returns, indirect calls or indirect jumps.
2768.El
2769The default qualifier counts all the above branch types.
2770.It Li p4-retired-mispred-branch-type Op Li ,mask= Ns Ar flags
2771.Pq "TS event"
2772Count mispredicted branches retired.
2773Qualifier
2774.Ar flags
2775contains a
2776.Li +
2777separated list of strings:
2778.Bl -tag -width indent -compact
2779.It Li conditional
2780Count conditional jumps.
2781.It Li call
2782Count indirect call branches.
2783.It Li return
2784Count return branches.
2785.It Li indirect
2786Count returns, indirect calls or indirect jumps.
2787.El
2788The default qualifier counts all the above branch types.
2789.It Li p4-scalar-dp-uop Op Li ,mask= Ns Ar flags
2790.Pq "TI event"
2791Count the number of scalar double-precision uops.
2792Qualifier
2793.Ar flags
2794can take the following value (which is also the default):
2795.Bl -tag -width indent -compact
2796.It Li all
2797Count all uops operating on scalar double-precision operands.
2798.El
2799.It Li p4-scalar-sp-uop Op Li ,mask= Ns Ar flags
2800.Pq "TI event"
2801Count the number of scalar single-precision uops.
2802Qualifier
2803.Ar flags
2804can take the following value (which is also the default):
2805.Bl -tag -width indent -compact
2806.It Li all
2807Count all uops operating on scalar single-precision operands.
2808.El
2809.It Li p4-snoop
2810.Pq "TI event"
2811Count snoop traffic.
2812Further documentation on this event is not available.
2813.It Li p4-sse-input-assist Op Li ,mask= Ns Ar flags
2814.Pq "TI event"
2815Count the number of times an assist is required to handle problems
2816with the operands for SSE and SSE2 operations.
2817Qualifier
2818.Ar flags
2819can take the following value (which is also the default):
2820.Bl -tag -width indent -compact
2821.It Li all
2822Count assists for all SSE and SSE2 uops.
2823.El
2824.It Li p4-store-port-replay Op Li ,mask= Ns Ar qualifier
2825.Pq "TS event"
2826Count events replayed at the store port.
2827Qualifier
2828.Ar qualifier
2829can take on one value:
2830.Bl -tag -width indent -compact
2831.It Li split-st
2832Count split stores.
2833.El
2834The default value for
2835.Ar qualifier
2836is
2837.Li split-st .
2838.It Li p4-tc-deliver-mode Op Li ,mask= Ns Ar qualifier
2839.Pq "TI event"
2840Count the duration in cycles of operating modes of the trace cache and
2841decode engine.
2842The desired operating mode is selected by
2843.Ar qualifier ,
2844which is a list of the following strings separated by
2845.Li "+"
2846characters:
2847.Bl -tag -width indent -compact
2848.It Li DD
2849Both logical processors are in deliver mode.
2850.It Li DB
2851Logical processor 0 is in deliver mode while logical processor 1 is in
2852build mode.
2853.It Li DI
2854Logical processor 0 is in deliver mode while logical processor 1 is
2855halted, or in machine clear, or transitioning to a long microcode
2856flow.
2857.It Li BD
2858Logical processor 0 is in build mode while logical processor 1 is in
2859deliver mode.
2860.It Li BB
2861Both logical processors are in build mode.
2862.It Li BI
2863Logical processor 0 is in build mode while logical processor 1 is
2864halted, or in machine clear or transitioning to a long microcode
2865flow.
2866.It Li ID
2867Logical processor 0 is halted, or in machine clear or transitioning to
2868a long microcode flow while logical processor 1 is in deliver mode.
2869.It Li IB
2870Logical processor 0 is halted, or in machine clear or transitioning to
2871a long microcode flow while logical processor 1 is in build mode.
2872.El
2873If there is only one logical processor in the processor package then
2874the qualifier for logical processor 1 is ignored.
2875If no qualifier is specified, the default qualifier is
2876.Li DD+DB+DI+BD+BB+BI+ID+IB .
2877.It Li p4-tc-ms-xfer Op Li ,mask= Ns Ar flags
2878.Pq "TI event"
2879Count the number of times uop delivery changed from the trace cache to
2880MS ROM.
2881Qualifier
2882.Ar flags
2883can take the following value (which is also the default):
2884.Bl -tag -width indent -compact
2885.It Li cisc
2886Count TC to MS transfers.
2887.El
2888.It Li p4-uop-queue-writes Op Li ,mask= Ns Ar flags
2889.Pq "TS event"
2890Count the number of valid uops written to the uop queue.
2891Qualifier
2892.Ar flags
2893is a list of the following strings, separated by
2894.Li +
2895characters:
2896.Bl -tag -width indent -compact
2897.It Li from-tc-build
2898Count uops being written from the trace cache in build mode.
2899.It Li from-tc-deliver
2900Count uops being written from the trace cache in deliver mode.
2901.It Li from-rom
2902Count uops being written from microcode ROM.
2903.El
2904The default qualifier counts all the above kinds of uops.
2905.It Li p4-uop-type Op Li ,mask= Ns Ar flags
2906.Pq "TS event"
2907This event is used in conjunction with the front-end at-retirement
2908mechanism to tag load and store uops.
2909Qualifier
2910.Ar flags
2911comprises the following strings separated by
2912.Li +
2913characters:
2914.Bl -tag -width indent -compact
2915.It Li tagloads
2916Mark uops that are load operations.
2917.It Li tagstores
2918Mark uops that are store operations.
2919.El
2920The default qualifier counts both kinds of uops.
2921.It Li p4-uops-retired Op Li ,mask= Ns Ar flags
2922.Pq "TS event"
2923Count uops retired during a clock cycle.
2924Qualifier
2925.Ar flags
2926comprises the following strings separated by
2927.Li +
2928characters:
2929.Bl -tag -width indent -compact
2930.It Li nbogus
2931Count marked uops that are not bogus.
2932.It Li bogus
2933Count marked uops that are bogus.
2934.El
2935The default qualifier counts both kinds of uops.
2936.It Li p4-wc-buffer Op Li ,mask= Ns Ar flags
2937.Pq "TI event"
2938Count write-combining buffer operations.
2939Qualifier
2940.Ar flags
2941contains the following strings separated by
2942.Li +
2943characters:
2944.Bl -tag -width indent -compact
2945.It Li wcb-evicts
2946WC buffer evictions due to any cause.
2947.It Li wcb-full-evict
2948WC buffer evictions due to no WC buffer being available.
2949.El
2950The default qualifier counts both kinds of evictions.
2951.It Li p4-x87-assist Op Li ,mask= Ns Ar flags
2952.Pq "TS event"
2953Count the retirement of x87 instructions that required special
2954handling.
2955Qualifier
2956.Ar flags
2957contains the following strings separated by
2958.Li +
2959characters:
2960.Bl -tag -width indent -compact
2961.It Li fpsu
2962Count instructions that saw an FP stack underflow.
2963.It Li fpso
2964Count instructions that saw an FP stack overflow.
2965.It Li poao
2966Count instructions that saw an x87 output overflow.
2967.It Li poau
2968Count instructions that saw an x87 output underflow.
2969.It Li prea
2970Count instructions that needed an x87 input assist.
2971.El
2972The default qualifier counts all the above types of instruction
2973retirements.
2974.It Li p4-x87-fp-uop Op Li ,mask= Ns Ar flags
2975.Pq "TI event"
2976Count x87 floating-point uops.
2977Qualifier
2978.Ar flags
2979can take the following value (which is also the default):
2980.Bl -tag -width indent -compact
2981.It Li all
2982Count all x87 floating-point uops.
2983.El
2984If an instruction contains more than one x87 floating-point uop, then
2985all x87 floating-point uops will be counted.
2986This event does not count x87 floating-point data movement operations.
2987.It Li p4-x87-simd-moves-uop Op Li ,mask= Ns Ar flags
2988.Pq "TI event"
2989Count each x87 FPU, MMX, SSE, or SSE2 uop that loads data, stores
2990data, or performs register-to-register moves.
2991This event does not count integer move uops.
2992Qualifier
2993.Ar flags
2994may contain the following keywords separated by
2995.Li +
2996characters:
2997.Bl -tag -width indent -compact
2998.It Li allp0
2999Count all x87 and SIMD store and move uops.
3000.It Li allp2
3001Count all x87 and SIMD load uops.
3002.El
3003The default is to count all uops.
3004.Pq Errata
3005This event may be affected by processor errata N43.
3006.El
3007.Ss "Cascading P4 PMCs"
3008To be filled in.
3009.Ss "Precise Event Based Sampling"
3010To be filled in.
3011.Sh IMPLEMENTATION NOTES
3012On the i386 architecture,
3013.Fx
3014has historically allowed the use of the RDTSC instruction from
3015user-mode (i.e., at a processor CPL of 3) by any process.
3016This behaviour is preserved by
3017.Xr hwpmc 4 .
3018.Sh RETURN VALUES
3019The
3020.Fn pmc_name_of_capability ,
3021.Fn pmc_name_of_class ,
3022.Fn pmc_name_of_cputype ,
3023.Fn pmc_name_of_disposition ,
3024.Fn pmc_name_of_event ,
3025.Fn pmc_name_of_mode ,
3026and
3027.Fn pmc_name_of_state
3028functions return a pointer to the human-readable form of their argument.
3029These pointers may point to statically allocated storage and must
3030not be passed to
3031.Fn free .
3032In case of an error, these functions return
3033.Li NULL
3034and set the global variable
3035.Va errno .
3036.Pp
3037The functions
3038.Fn pmc_ncpu
3039and
3040.Fn pmc_npmc
3041return the number of CPUs and number of PMCs configured respectively;
3042in case of an error they return the value
3043.Li -1
3044and set the global variable
3045.Va errno .
3046.Pp
3047All other functions return the value
3048.Li 0
3049if successful; otherwise the value
3050.Li -1
3051is returned and the global variable
3052.Va errno
3053is set to indicate the error.
3054.Sh ERRORS
3055A call to
3056.Fn pmc_init
3057may fail with the following errors in addition to those returned by
3058.Xr modfind 2 ,
3059.Xr modstat 2
3060and
3061.Xr hwpmc 4 :
3062.Bl -tag -width Er
3063.It Bq Er ENXIO
3064An unknown CPU type was encountered during initialization.
3065.It Bq Er EPROGMISMATCH
3066The version number of the
3067.Xr hwpmc 4
3068kernel module did not match that compiled into the
3069.Xr pmc 3
3070library.
3071.El
3072.Pp
3073A call to
3074.Fn pmc_capabilities ,
3075.Fn pmc_name_of_capability ,
3076.Fn pmc_name_of_disposition ,
3077.Fn pmc_name_of_state ,
3078.Fn pmc_name_of_event ,
3079.Fn pmc_name_of_mode ,
3080.Fn pmc_name_of_class
3081and
3082.Fn pmc_width
3083may fail with the following error:
3084.Bl -tag -width Er
3085.It Bq Er EINVAL
3086An invalid argument was passed to the function.
3087.El
3088.Pp
3089A call to
3090.Fn pmc_cpuinfo
3091or
3092.Fn pmc_ncpu
3093may fail with the following error:
3094.Bl -tag -width Er
3095.It Bq Er ENXIO
3096The
3097.Xr pmc 3
3098library has not been initialized.
3099.El
3100.Pp
3101A call to
3102.Fn pmc_npmc
3103may fail with the following errors:
3104.Bl -tag -width Er
3105.It Bq Er EINVAL
3106The argument passed in was out of range.
3107.It Bq Er ENXIO
3108The
3109.Xr pmc 3
3110library has not been initialized.
3111.El
3112.Pp
3113A call to
3114.Fn pmc_pmcinfo
3115may fail with the following errors, in addition to those returned by
3116.Xr hwpmc 4 :
3117.Bl -tag -width Er
3118.It Bq Er ENXIO
3119The
3120.Xr pmc 3
3121library is not yet initialized.
3122.El
3123.Pp
3124A call to
3125.Fn pmc_allocate
3126may fail with the following errors, in addition to those returned by
3127.Xr hwpmc 4 :
3128.Bl -tag -width Er
3129.It Bq Er EINVAL
3130The
3131.Fa mode
3132argument passed in had an illegal value, or the event specification
3133.Fa ctrspec
3134was unrecognized for this CPU type.
3135.El
3136.Pp
3137Calls to
3138.Fn pmc_attach ,
3139.Fn pmc_configure_logfile ,
3140.Fn pmc_detach ,
3141.Fn pmc_disable ,
3142.Fn pmc_enable ,
3143.Fn pmc_get_driver_stats ,
3144.Fn pmc_get_msr ,
3145.Fn pmc_read ,
3146.Fn pmc_release ,
3147.Fn pmc_rw ,
3148.Fn pmc_set ,
3149.Fn pmc_start ,
3150.Fn pmc_stop ,
3151.Fn pmc_write ,
3152and
3153.Fn pmc_writelog
3154may fail with the errors described in
3155.Xr hwpmc 4 .
3156.Pp
3157If a log file was configured using
3158.Fn pmc_configure_logfile
3159and the
3160.Xr hwpmc 4
3161driver encountered an error while logging data to it, then
3162logging will be stopped and a subsequent call to
3163.Fn pmc_flush_logfile
3164will fail with the error code seen by the
3165.Xr hwpmc 4
3166driver.
3167.Sh SEE ALSO
3168.Xr modfind 2 ,
3169.Xr modstat 2 ,
3170.Xr calloc 3 ,
3171.Xr pmclog 3 ,
3172.Xr hwpmc 4 ,
3173.Xr pmccontrol 8 ,
3174.Xr pmcreport 8 ,
3175.Xr pmcstat 8
3176.Sh BUGS
3177The information returned by
3178.Fn pmc_cpuinfo ,
3179.Fn pmc_ncpu
3180and possibly
3181.Fn pmc_npmc
3182should really be available all the time, through a better designed
3183interface and not just when
3184.Xr hwpmc 4
3185is present in the kernel.
3186