xref: /freebsd/lib/libpmc/pmc.3 (revision ebccf1e3a6b11b97cbf5f813dd76636e892a9035)
1.\" Copyright (c) 2003 Joseph Koshy.  All rights reserved.
2.\"
3.\" Redistribution and use in source and binary forms, with or without
4.\" modification, are permitted provided that the following conditions
5.\" are met:
6.\" 1. Redistributions of source code must retain the above copyright
7.\"    notice, this list of conditions and the following disclaimer.
8.\" 2. Redistributions in binary form must reproduce the above copyright
9.\"    notice, this list of conditions and the following disclaimer in the
10.\"    documentation and/or other materials provided with the distribution.
11.\"
12.\" This software is provided by Joseph Koshy ``as is'' and
13.\" any express or implied warranties, including, but not limited to, the
14.\" implied warranties of merchantability and fitness for a particular purpose
15.\" are disclaimed.  in no event shall Joseph Koshy be liable
16.\" for any direct, indirect, incidental, special, exemplary, or consequential
17.\" damages (including, but not limited to, procurement of substitute goods
18.\" or services; loss of use, data, or profits; or business interruption)
19.\" however caused and on any theory of liability, whether in contract, strict
20.\" liability, or tort (including negligence or otherwise) arising in any way
21.\" out of the use of this software, even if advised of the possibility of
22.\" such damage.
23.\"
24.\" $FreeBSD$
25.\"
26.Dd April 15, 2005
27.Os
28.Dt PMC 3
29.Sh NAME
30.Nm pmc_allocate ,
31.Nm pmc_attach ,
32.Nm pmc_configure_logfile ,
33.Nm pmc_cpuinfo ,
34.Nm pmc_detach ,
35.Nm pmc_disable ,
36.Nm pmc_enable ,
37.Nm pmc_event_names_of_class ,
38.Nm pmc_get_driver_stats ,
39.Nm pmc_init ,
40.Nm pmc_name_of_capability ,
41.Nm pmc_name_of_class ,
42.Nm pmc_name_of_cputype ,
43.Nm pmc_name_of_event ,
44.Nm pmc_name_of_mode ,
45.Nm pmc_name_of_state ,
46.Nm pmc_ncpu ,
47.Nm pmc_npmc ,
48.Nm pmc_pmcinfo ,
49.Nm pmc_read ,
50.Nm pmc_release ,
51.Nm pmc_rw ,
52.Nm pmc_set ,
53.Nm pmc_start ,
54.Nm pmc_stop ,
55.Nm pmc_write ,
56.Nm pmc_x86_get_msr
57.Nd programming API for using hardware performance monitoring counters
58.Sh LIBRARY
59.Lb libpmc
60.Sh SYNOPSIS
61.In pmc.h
62.Ft int
63.Fo pmc_allocate
64.Fa "const char *eventspecifier"
65.Fa "enum pmc_mode mode"
66.Fa "uint32_t flags"
67.Fa "uint32_t cpu"
68.Fa "pmc_id_t *pmcid"
69.Fc
70.Ft int
71.Fo pmc_attach
72.Fa "pmc_id_t pmcid"
73.Fa "pid_t pid"
74.Fc
75.Ft int
76.Fn pmc_configure_logfile "int fd"
77.Ft int
78.Fn pmc_cpuinfo "const struct pmc_op_getcpuinfo **cpu_info"
79.Ft int
80.Fo pmc_detach
81.Fa "pmc_id_t pmcid"
82.Fa "pid_t pid"
83.Fc
84.Ft int
85.Fn pmc_disable "uint32_t cpu" "int pmc"
86.Ft int
87.Fn pmc_enable "uint32_t cpu" "int pmc"
88.Ft int
89.Fo pmc_event_names_of_class
90.Fa "enum pmc_class cl"
91.Fa "const char ***eventnames"
92.Fa "int *nevents"
93.Fc
94.Ft int
95.Fn pmc_get_driver_stats "struct pmc_op_getdriverstats *gms"
96.Ft int
97.Fn pmc_init "void"
98.Ft "const char *"
99.Fn pmc_name_of_capability "enum pmc_caps pc"
100.Ft "const char *"
101.Fn pmc_name_of_class "enum pmc_class pc"
102.Ft "const char *"
103.Fn pmc_name_of_cputype "enum pmc_cputype ct"
104.Ft "const char *"
105.Fn pmc_name_of_disposition "enum pmc_disp pd"
106.Ft "const char *"
107.Fn pmc_name_of_event "enum pmc_event pe"
108.Ft "const char *"
109.Fn pmc_name_of_mode "enum pmc_mode pm"
110.Ft "const char *"
111.Fn pmc_name_of_state "enum pmc_state ps"
112.Ft int
113.Fn pmc_ncpu "void"
114.Ft int
115.Fn pmc_npmc "uint32_t cpu"
116.Ft int
117.Fn pmc_pmcinfo "uint32_t cpu" "struct pmc_op_getpmcinfo **pmc_info"
118.Ft int
119.Fn pmc_read "pmc_id_t pmc" "pmc_value_t *value"
120.Ft int
121.Fn pmc_release "pmc_id_t pmc"
122.Ft int
123.Fn pmc_rw "pmc_id_t pmc" "pmc_value_t newvalue" "pmc_value_t *oldvaluep"
124.Ft int
125.Fn pmc_set "pmc_id_t pmc" "pmc_value_t value"
126.Ft int
127.Fn pmc_start "pmc_id_t pmc"
128.Ft int
129.Fn pmc_stop "pmc_id_t pmc"
130.Ft int
131.Fn pmc_write "pmc_id_t pmc" "pmc_value_t value"
132.Ft int
133.Fn pmc_x86_get_msr "int pmc" "uint32_t *msr"
134.Sh DESCRIPTION
135These functions implement a high-level library for using the
136system's hardware performance counters.
137.Pp
138PMCs are allocated using
139.Fn pmc_allocate ,
140released using
141.Fn pmc_release
142and read using
143.Fn pmc_read .
144Allocated PMCs may be started or stopped at any time using
145.Fn pmc_start
146and
147.Fn pmc_stop
148respectively.
149An allocated PMC may be of
150.Qq global
151scope, meaning that the PMC measures system-wide events, or
152.Qq process-private
153scope, meaning that the PMC only counts hardware events when
154the allocating process (or, optionally, its children)
155are active.
156.Pp
157PMCs may further be in
158.Qq "counting mode" ,
159or in
160.Qq "sampling mode" .
161Sampling mode PMCs deliver an interrupt to the CPU after
162a configured number of hardware events have been seen.
163A process-private sampling mode PMC will cause its owner
164process to get periodic
165.Sy SIGPROF
166interrupts, while a global sampling mode PMC is used to
167do system-wide statistical sampling (see
168.Xr hwpmc 4 ) .
169The sampling rate desired of a sampling-mode PMC is set using
170.Fn pmc_set .
171Counting mode PMCs do not interrupt the CPU; their values
172can be read using
173.Fn pmc_read .
174.Pp
175System-wide statistical sampling is configured by allocating
176at least one sampling mode PMC with
177global scope, and by configuring a log file using
178.Fn pmc_configure_logfile .
179The
180.Xr hwpmc 4
181driver manages system-wide statistical sampling; for more
182information please see
183.Xr hwpmc 4 .
184.Ss APPLICATION PROGRAMMING INTERFACE
185.Fn pmc_init
186initializes the
187.Xr pmc 3
188library.
189This function must be called first, before any of the other
190functions in the library.
191.Pp
192.Fn pmc_allocate
193allocates a counter that counts the events named by
194.Fa eventspecifier ,
195and writes the allocated counter id to
196.Fa *pmcid .
197Argument
198.Fa eventspecifier
199comprises a PMC event name followed by an optional comma separated
200list of keywords and qualifiers.
201The allowed syntax for
202.Fa eventspecifier
203is processor architecture specific and is listed in section
204.Sx "EVENT SPECIFIERS"
205below.
206The desired PMC mode is specified by
207.Fa mode ,
208and any mode specific modifiers are specified using
209.Fa flags .
210The
211.Fa cpu
212argument is the value
213.Li PMC_CPU_ANY ,
214or names the cpu the allocation is to be on.
215Requesting a specific CPU only makes sense for global PMCs;
216process-private PMC allocations should always specify
217.Li PMC_CPU_ANY .
218.Pp
219By default a PMC configured in process-virtual counting mode is set up
220to profile its owner process.
221The function
222.Fn pmc_attach
223may be used to attach the PMC to a different process.
224.Fn pmc_attach
225needs to be called before the counter is first started
226with
227.Fn pmc_start .
228The function
229.Fn pmc_detach
230may be used to detach a PMC from a process it was attached to
231using a prior call to
232.Fn pmc_attach .
233.Pp
234.Fn pmc_release
235releases a PMC previously allocated with
236.Fn pmc_allocate .
237This function call implicitly detaches the PMC from all its target
238processes.
239.Pp
240An allocated PMC may be started and stopped using
241.Fn pmc_start
242and
243.Fn pmc_stop
244respectively.
245.Pp
246The current value of a PMC may be read with
247.Fn pmc_read
248and written using
249.Fn pmc_write ,
250provided the underlying hardware supports these operations on
251the allocated PMC.
252The read and write operations may be combined using
253.Fn pmc_rw .
254.Pp
255The
256.Fn pmc_configure_logfile
257function causes the
258.Xr hwpmc 4
259driver to log system-wide performance data to the file corresponding
260to the process' file handle
261.Fa fd .
262.Pp
263.Fn pmc_set
264configures a sampling PMC
265.Fa pmc
266to interrupt every
267.Fa value
268events.
269For counting PMCs,
270.Fn pmc_set
271sets the initial value of the PMC to
272.Fa value .
273.Pp
274.Fn pmc_get_driver_stats
275copies a snapshot of the usage statistics maintained by
276.Xr hwpmc 4
277into the memory area pointed to by argument
278.Fa gms .
279.Ss SIGNAL HANDLING REQUIREMENTS
280Applications using PMCs are required to handle the following signals:
281.Bl -tag -width indent
282.It SIGBUS
283When the
284.Xr hwpmc 4
285module is unloaded using
286.Xr kldunload 8 ,
287processes that have PMCs allocated to them will be sent a
288SIGBUS signal.
289.It SIGIO
290Attempting to read a PMC that is not currently attached to a running
291process will cause a SIGIO signal to be sent to the reader.
292.El
293.Ss CONVENIENCE FUNCTIONS
294.Fn pmc_ncpu
295returns the number of CPUs present in the system.
296.Pp
297.Fn pmc_npmc
298returns the number of PMCs supported on CPU
299.Fa cpu .
300.Fn pmc_cpuinfo
301sets argument
302.Fa cpu_info
303to point to a structure with information about the system's CPUs.
304.Fn pmc_pmcinfo
305returns information about the current state of CPU
306.Fa cpu Ap s
307PMCs.
308.Pp
309The functions
310.Fn pmc_name_of_capability ,
311.Fn pmc_name_of_class ,
312.Fn pmc_name_of_cputype ,
313.Fn pmc_name_of_disposition ,
314.Fn pmc_name_of_event ,
315.Fn pmc_name_of_mode
316and
317.Fn pmc_name_of_state
318are useful for code wanting to print error messages.
319They return
320.Ft "const char *"
321pointers to human-readable representations of their arguments.
322These return values should not be freed using
323.Xr free 3 .
324.Pp
325.Fn pmc_event_names_of_class
326returns a list of event names supported by a given PMC class
327.Fa cl .
328On successful return, an array of
329.Ft "const char *"
330pointers to the names of valid events supported by class
331.Fa cl
332is allocated by the library using
333.Xr malloc 3 ,
334and a pointer to this array is returned in the location pointed to by
335.Fa eventnames .
336The number of pointers allocated is returned in the location pointed
337to by
338.Fa nevents .
339.Ss ADMINISTRATION
340Individual PMCs may be enabled or disabled on a given CPU using
341.Fn pmc_enable
342and
343.Fn pmc_disable
344respectively.
345For these functions,
346.Fa cpu
347is the CPU number, and
348.Fa pmc
349is the index of the PMC to be operated on.
350Only the super-user is allowed to enable and disable PMCs.
351.Ss X86 ARCHITECTURE SPECIFIC API
352The
353.Fn pmc_x86_get_msr
354function returns the processor model specific register number
355associated with
356.Fa pmc .
357Applications may use the x86
358.Sy RDPMC
359instruction to directly read the contents of the PMC.
360.Sh EVENT SPECIFIERS
361Event specifiers are strings comprising an event name, followed by
362optional parameters modifying the semantics of the hardware event
363being probed.
364Event names are PMC architecture dependent, but the
365.Xr pmc 3
366library defines machine independent aliases for commonly used
367events.
368.Ss Event Name Aliases
369Event name aliases are CPU architecture independent names for commonly
370used events.
371The following aliases are known to this version of the
372.Xr pmc 3
373library:
374.Bl -tag -width indent
375.It Li branches
376Measure the number of branches retired.
377.It Li branch-mispredicts
378Measure the number of retired branches that were mispredicted.
379.It Li cycles
380Measure processor cycles.
381This event is implemented using the processor's Time Stamp Counter
382register.
383.It Li dc-misses
384Measure the number of data cache misses.
385.It Li ic-misses
386Measure the number of instruction cache misses.
387.It Li instructions
388Measure the number of instructions retired.
389.It Li interrupts
390Measure the number of interrupts seen.
391.El
392.Ss Time Stamp Counter (TSC)
393The timestamp counter is a monotonically non-decreasing counter that
394counts processor cycles.
395.Pp
396In the i386 architecture this counter may
397be selected by requesting an event with eventspecifier
398.Ic tsc .
399The
400.Ic tsc
401event does not support any further qualifiers.
402It can only be allocated in system-wide counting mode,
403and is a read-only counter.
404Multiple processes are allowed to allocate the TSC.
405Once allocated, it may be read using the
406.Fn pmc_read
407function, or by using the RDTSC instruction.
408.Ss AMD (K7) PMCs
409These PMCs are present in the
410.Tn "AMD Athlon"
411series of CPUs and are documented in:
412.Rs
413.%B "AMD Athlon Processor x86 Code Optimization Guide"
414.%N "Publication No. 22007"
415.%D "February 2002"
416.%Q "Advanced Micro Devices, Inc."
417.Re
418.Pp
419Event specifiers for AMD K7 PMCs can have the following optional
420qualifiers:
421.Bl -tag -width indent
422.It Li count= Ns Ar value
423Configure the counter to increment only if the number of configured
424events measured in a cycle is greater than or equal to
425.Ar value .
426.It Li edge
427Configure the counter to only count negated-to-asserted transitions
428of the conditions expressed by the other qualifiers.
429In other words, the counter will increment only once whenever a given
430condition becomes true, irrespective of the number of clocks during
431which the condition remains true.
432.It Li inv
433Invert the sense of comparison when the
434.Li count
435qualifier is present, making the counter increment when the
436number of events per cycle is less than the value specified by
437the
438.Li count
439qualifier.
440.It Li os
441Configure the PMC to count events happening at privilege level 0.
442.It Li unitmask= Ns Ar mask
443This qualifier is used to further qualify a select few events,
444.Li k7-dc-refills-from-l2 ,
445.Li k7-dc-refills-from-system
446and
447.Li k7-dc-writebacks .
448Here
449.Ar mask
450is a string of the following characters optionally separated by
451.Li "+"
452characters:
453.Bl -tag -width indent -compact
454.It Li m
455Count operations for lines in the
456.Dq Modified
457state.
458.It Li o
459Count operations for lines in the
460.Dq Owner
461state.
462.It Li e
463Count operations for lines in the
464.Dq Exclusive
465state.
466.It Li s
467Count operations for lines in the
468.Dq Shared
469state.
470.It Li i
471Count operations for lines in the
472.Dq Invalid
473state.
474.El
475If no
476.Ar unitmask
477qualifier is specified, the default is to count events for cache
478lines in any of the above states.
479.It Li usr
480Configure the PMC to count events occurring at privilege levels 1, 2
481or 3.
482.El
483If neither of the
484.Li os
485or
486.Li usr
487qualifiers were specified, the default is to enable both.
488.Pp
489The event specifiers supported on AMD K7 PMCs are:
490.Bl -tag -width indent
491.It Li k7-dc-accesses
492Count data cache accesses.
493.It Li k7-dc-misses
494Count data cache misses.
495.It Li k7-dc-refills-from-l2 Op Li ,unitmask= Ns Ar mask
496Count data cache refills from L2 cache.
497This event may be further qualified using the
498.Li unitmask
499qualifier.
500.It Li k7-dc-refills-from-system Op Li ,unitmask= Ns Ar mask
501Count data cache refills from system memory.
502This event may be further qualified using the
503.Li unitmask
504qualifier.
505.It Li k7-dc-writebacks Op Li ,unitmask= Ns Ar mask
506Count data cache writebacks.
507This event may be further qualified using the
508.Li unitmask
509qualifier.
510.It Li k7-l1-dtlb-miss-and-l2-dtlb-hits
511Count L1 DTLB misses that are L2 DTLB hits.
512.It Li k7-l1-and-l2-dtlb-misses
513Count L1 and L2 DTLB misses.
514.It Li k7-misaligned-references
515Count misaligned data references.
516.It Li k7-ic-fetches
517Count instruction cache fetches.
518.It Li k7-ic-misses
519Count instruction cache misses.
520.It Li k7-l1-itlb-misses
521Count L1 ITLB misses that are L2 ITLB hits.
522.It Li k7-l1-l2-itlb-misses
523Count L1 (and L2) ITLB misses.
524.It Li k7-retired-instructions
525Count all retired instructions.
526.It Li k7-retired-ops
527Count retired ops.
528.It Li k7-retired-branches
529Count all retired branches (conditional, unconditional, exceptions
530and interrupts).
531.It Li k7-retired-branches-mispredicted
532Count all mispredicted retired branches.
533.It Li k7-retired-taken-branches
534Count retired taken branches.
535.It Li k7-retired-taken-branches-mispredicted
536Count mispredicted taken branches that were retired.
537.It Li k7-retired-far-control-transfers
538Count retired far control transfers.
539.It Li k7-retired-resync-branches
540Count retired resync branches (non control transfer branches).
541.It Li k7-interrupts-masked-cycles
542Count the number of cycles when the processor's
543.Li IF
544flag was zero.
545.It Li k7-interrupts-masked-while-pending-cycles
546Count the number of cycles interrupts were masked while pending due
547to the processor's
548.Li IF
549flag being zero.
550.It Li k7-hardware-interrupts
551Count the number of taken hardware interrupts.
552.El
553.Ss AMD (K8) PMCs
554These PMCs are present in the
555.Tn "AMD Athlon64"
556and
557.Tn "AMD Opteron"
558series of CPUs.
559They are documented in:
560.Rs
561.%B "BIOS and Kernel Developer's Guide for the AMD Athlon(tm) 64 and AMD Opteron Processors"
562.%N "Publication No. 26094"
563.%D "April 2004"
564.%Q "Advanced Micro Devices, Inc."
565.Re
566.Pp
567Event specifiers for AMD K8 PMCs can have the following optional
568qualifiers:
569.Bl -tag -width indent
570.It Li count= Ns Ar value
571Configure the counter to increment only if the number of configured
572events measured in a cycle is greater than or equal to
573.Ar value .
574.It Li edge
575Configure the counter to only count negated-to-asserted transitions
576of the conditions expressed by the other fields.
577In other words, the counter will increment only once whenever a given
578condition becomes true, irrespective of the number of clocks during
579which the condition remains true.
580.It Li inv
581Invert the sense of comparison when the
582.Li count
583qualifier is present, making the counter increment when the
584number of events per cycle is less than the value specified by
585the
586.Li count
587qualifier.
588.It Li mask= Ns Ar qualifier
589Many event specifiers for AMD K8 PMCs need to be additionally
590qualified using a mask qualifier.
591These additional qualifiers are event-specific and are documented
592along with their associated event specifiers below.
593.It Li os
594Configure the PMC to count events happening at privilege level 0.
595.It Li usr
596Configure the PMC to count events occurring at privilege levels 1, 2
597or 3.
598.El
599If neither of the
600.Li os
601or
602.Li usr
603qualifiers were specified, the default is to enable both.
604.Pp
605The event specifiers supported on AMD K8 PMCs are:
606.Bl -tag -width indent
607.It Li k8-bu-cpu-clk-unhalted
608Count the number of clock cycles when the CPU is not in the HLT or
609STPCLK states.
610.It Li k8-bu-fill-request-l2-miss Op Li ,mask= Ns Ar qualifier
611Count fill requests that missed in the L2 cache.
612This event may be further qualified using
613.Ar qualifier ,
614which is a
615.Li + Ns - Ns
616separated set of the following keywords:
617.Bl -tag -width "XXXXXXXXXX" -compact
618.It Li dc-fill
619Count data cache fill requests.
620.It Li ic-fill
621Count instruction cache fill requests.
622.It Li tlb-reload
623Count TLB reloads.
624.El
625The default is to count all types of requests.
626.It Li k8-bu-internal-l2-request Op Li ,mask= Ns Ar qualifier
627Count internally generated requests to the L2 cache.
628This event may be further qualified using
629.Ar qualifier ,
630which is a
631.Li "+" Ns - Ns
632separated set of the following keywords:
633.Bl -tag -width "XXXXXXXXXX" -compact
634.It Li cancelled
635Count cancelled requests.
636.It Li dc-fill
637Count data cache fill requests.
638.It Li ic-fill
639Count instruction cache fill requests.
640.It Li tag-snoop
641Count tag snoop requests.
642.It Li tlb-reload
643Count TLB reloads.
644.El
645The default is to count all types of requests.
646.It Li k8-dc-access
647Count data cache accesses including microcode scratchpad accesses.
648.It Li k8-dc-copyback Op Li ,mask= Ns Ar qualifier
649Count data cache copyback operations.
650This event may be further qualified using
651.Ar qualifier ,
652which is a
653.Li "+" Ns - Ns
654separated set of the following keywords:
655.Bl -tag -width "exclusive" -compact
656.It Li exclusive
657Count operations for lines in the
658.Dq exclusive
659state.
660.It Li invalid
661Count operations for lines in the
662.Dq invalid
663state.
664.It Li modified
665Count operations for lines in the
666.Dq modified
667state.
668.It Li owner
669Count operations for lines in the
670.Dq owner
671state.
672.It Li shared
673Count operations for lines in the
674.Dq shared
675state.
676.El
677The default is to count operations for lines in all the
678above states.
679.It Li k8-dc-dcache-accesses-by-locks Op Li ,mask= Ns Ar qualifier
680Count data cache accesses by lock instructions.
681This event is only available on processors of revision C or later
682vintage.
683This event may be further qualified using
684.Ar qualifier ,
685which is a
686.Li "+" Ns - Ns
687separated set of the following keywords:
688.Bl -tag -width "exclusive" -compact
689.It Li accesses
690Count data cache accesses by lock instructions.
691.It Li misses
692Count data cache misses by lock instructions.
693.El
694The default is to count all accesses.
695.It Li k8-dc-dispatched-prefetch-instructions Op Li ,mask= Ns Ar qualifier
696Count the number of dispatched prefetch instructions.
697This event may be further qualified using
698.Ar qualifier ,
699which is a
700.Li "+" Ns - Ns
701separated set of the following keywords:
702.Bl -tag -width "exclusive" -compact
703.It Li load
704Count load operations.
705.It Li nta
706Count non-temporal operations.
707.It Li store
708Count store operations.
709.El
710The default is to count all operations.
711.It Li k8-dc-l1-dtlb-miss-and-l2-dtlb-hit
712Count L1 DTLB misses that are L2 DTLB hits.
713.It Li k8-dc-l1-dtlb-miss-and-l2-dtlb-miss
714Count L1 DTLB misses that are also misses in the L2 DTLB.
715.It Li k8-dc-microarchitectural-early-cancel-of-an-access
716Count microarchitectural early cancels of data cache accesses.
717.It Li k8-dc-microarchitectural-late-cancel-of-an-access
718Count microarchitectural late cancels of data cache accesses.
719.It Li k8-dc-misaligned-data-reference
720Count misaligned data references.
721.It Li k8-dc-miss
722Count data cache misses.
723.It Li k8-dc-one-bit-ecc-error Op Li ,mask= Ns Ar qualifier
724Count one bit ECC errors found by the scrubber.
725This event may be further qualified using
726.Ar qualifier ,
727which is a
728.Li "+" Ns - Ns
729separated set of the following keywords:
730.Bl -tag -width "piggyback" -compact
731.It Li scrubber
732Count scrubber detected errors.
733.It Li piggyback
734Count piggyback scrubber errors.
735.El
736The default is to count both kinds of errors.
737.It Li k8-dc-refill-from-l2 Op Li ,mask= Ns Ar qualifier
738Count data cache refills from L2 cache.
739This event may be further qualified using
740.Ar qualifier ,
741which is a
742.Li "+" Ns - Ns
743separated set of the following keywords:
744.Bl -tag -width "exclusive" -compact
745.It Li exclusive
746Count operations for lines in the
747.Dq exclusive
748state.
749.It Li invalid
750Count operations for lines in the
751.Dq invalid
752state.
753.It Li modified
754Count operations for lines in the
755.Dq modified
756state.
757.It Li owner
758Count operations for lines in the
759.Dq owner
760state.
761.It Li shared
762Count operations for lines in the
763.Dq shared
764state.
765.El
766The default is to count operations for lines in all the
767above states.
768.It Li k8-dc-refill-from-system Op Li ,mask= Ns Ar qualifier
769Count data cache refills from system memory.
770This event may be further qualified using
771.Ar qualifier ,
772which is a
773.Li "+" Ns - Ns
774separated set of the following keywords:
775.Bl -tag -width "exclusive" -compact
776.It Li exclusive
777Count operations for lines in the
778.Dq exclusive
779state.
780.It Li invalid
781Count operations for lines in the
782.Dq invalid
783state.
784.It Li modified
785Count operations for lines in the
786.Dq modified
787state.
788.It Li owner
789Count operations for lines in the
790.Dq owner
791state.
792.It Li shared
793Count operations for lines in the
794.Dq shared
795state.
796.El
797The default is to count operations for lines in all the
798above states.
799.It Li k8-fp-dispatched-fpu-ops Op Li ,mask= Ns Ar qualifier
800Count the number of dispatched FPU ops.
801This event is supported in revision B and later CPUs.
802This event may be further qualified using
803.Ar qualifier ,
804which is a
805.Li "+" Ns - Ns
806separated set of the following keywords:
807.Bl -tag -width "XXXXXXXXXX" -compact
808.It Li add-pipe-excluding-junk-ops
809Count add pipe ops excluding junk ops.
810.It Li add-pipe-junk-ops
811Count junk ops in the add pipe.
812.It Li multiply-pipe-excluding-junk-ops
813Count multiply pipe ops excluding junk ops.
814.It Li multiply-pipe-junk-ops
815Count junk ops in the multiply pipe.
816.It Li store-pipe-excluding-junk-ops
817Count store pipe ops excluding junk ops.
818.It Li store-pipe-junk-ops
819Count junk ops in the store pipe.
820.El
821The default is to count all types of ops.
822.It Li k8-fp-cycles-with-no-fpu-ops-retired
823Count cycles when no FPU ops were retired.
824This event is supported in revision B and later CPUs.
825.It Li k8-fp-dispatched-fpu-fast-flag-ops
826Count dispatched FPU ops that use the fast flag interface.
827This event is supported in revision B and later CPUs.
828.It Li k8-fr-decoder-empty
829Count cycles when there was nothing to dispatch (i.e., the decoder
830was empty).
831.It Li k8-fr-dispatch-stalls
832Count all dispatch stalls.
833.It Li k8-fr-dispatch-stall-for-segment-load
834Count dispatch stalls for segment loads.
835.It Li k8-fr-dispatch-stall-for-serialization
836Count dispatch stalls for serialization.
837.It Li k8-fr-dispatch-stall-from-branch-abort-to-retire
838Count dispatch stalls from branch abort to retiral.
839.It Li k8-fr-dispatch-stall-when-fpu-is-full
840Count dispatch stalls when the FPU is full.
841.It Li k8-fr-dispatch-stall-when-ls-is-full
842Count dispatch stalls when the load/store unit is full.
843.It Li k8-fr-dispatch-stall-when-reorder-buffer-is-full
844Count dispatch stalls when the reorder buffer is full.
845.It Li k8-fr-dispatch-stall-when-reservation-stations-are-full
846Count dispatch stalls when reservation stations are full.
847.It Li k8-fr-dispatch-stall-when-waiting-for-all-to-be-quiet
848Count dispatch stalls when waiting for all to be quiet.
849.\" XXX What does "waiting for all to be quiet" mean?
850.It Li k8-fr-dispatch-stall-when-waiting-far-xfer-or-resync-branch-pending
851Count dispatch stalls when a far control transfer or a resync branch
852is pending.
853.It Li k8-fr-fpu-exceptions Op Li ,mask= Ns Ar qualifier
854Count FPU exceptions.
855This event is supported in revision B and later CPUs.
856This event may be further qualified using
857.Ar qualifier ,
858which is a
859.Li "+" Ns - Ns
860separated set of the following keywords:
861.Bl -tag -width "XXXXXXXXXX" -compact
862.It Li sse-and-x87-microtraps
863Count SSE and x87 microtraps.
864.It Li sse-reclass-microfaults
865Count SSE reclass microfaults.
866.It Li sse-retype-microfaults
867Count SSE retype microfaults.
868.It Li x87-reclass-microfaults
869Count x87 reclass microfaults.
870.El
871The default is to count all types of exceptions.
872.It Li k8-fr-interrupts-masked-cycles
873Count cycles when interrupts were masked (i.e., when CPU RFLAGS field IF was zero).
874.It Li k8-fr-interrupts-masked-while-pending-cycles
875Count cycles while interrupts were masked while pending (i.e., cycles
876when INTR was asserted while CPU RFLAGS field IF was zero).
877.It Li k8-fr-number-of-breakpoints-for-dr0
878Count the number of breakpoints for DR0.
879.It Li k8-fr-number-of-breakpoints-for-dr1
880Count the number of breakpoints for DR1.
881.It Li k8-fr-number-of-breakpoints-for-dr2
882Count the number of breakpoints for DR2.
883.It Li k8-fr-number-of-breakpoints-for-dr3
884Count the number of breakpoints for DR3.
885.It Li k8-fr-retired-branches
886Count retired branches including exceptions and interrupts.
887.It Li k8-fr-retired-branches-mispredicted
888Count mispredicted retired branches.
889.It Li k8-fr-retired-far-control-transfers
890Count retired far control transfers (which are always mispredicted).
891.It Li k8-fr-retired-fastpath-double-op-instructions Op Li ,mask= Ns Ar qualifier
892Count retired fastpath double op instructions.
893This event is supported in revision B and later CPUs.
894This event may be further qualified using
895.Ar qualifier ,
896which is a
897.Li "+" Ns - Ns
898separated set of the following keywords:
899.Bl -tag -width "XXXXXXXXXXXX" -compact
900.It Li low-op-pos-0
901Count instructions with the low op in position 0.
902.It Li low-op-pos-1
903Count instructions with the low op in position 1.
904.It Li low-op-pos-2
905Count instructions with the low op in position 2.
906.El
907The default is to count all types of instructions.
908.It Li k8-fr-retired-fpu-instructions Op Li ,mask= Ns Ar qualifier
909Count retired FPU instructions.
910This event is supported in revision B and later CPUs.
911This event may be further qualified using
912.Ar qualifier ,
913which is a
914.Li "+" Ns - Ns
915separated set of the following keywords:
916.Bl -tag -width "XXXXXXXXXX" -compact
917.It Li mmx-3dnow
918Count MMX and 3DNow! instructions.
919.It Li packed-sse-sse2
920Count packed SSE and SSE2 instructions.
921.It Li scalar-sse-sse2
922Count scalar SSE and SSE2 instructions.
923.It Li x87
924Count x87 instructions.
925.El
926The default is to count all types of instructions.
927.It Li k8-fr-retired-near-returns
928Count retired near returns.
929.It Li k8-fr-retired-near-returns-mispredicted
930Count mispredicted near returns.
931.It Li k8-fr-retired-resyncs
932Count retired resyncs (non-control transfer branches).
933.It Li k8-fr-retired-taken-hardware-interrupts
934Count retired taken hardware interrupts.
935.It Li k8-fr-retired-taken-branches
936Count retired taken branches.
937.It Li k8-fr-retired-taken-branches-mispredicted
938Count retired taken branches that were mispredicted.
939.It Li k8-fr-retired-taken-branches-mispredicted-by-addr-miscompare
940Count retired taken branches that were mispredicted only due to an
941address miscompare.
942.It Li k8-fr-retired-uops
943Count retired uops.
944.It Li k8-fr-retired-x86-instructions
945Count retired x86 instructions including exceptions and interrupts.
946.It Li k8-ic-fetch
947Count instruction cache fetches.
948.It Li k8-ic-instruction-fetch-stall
949Count cycles in stalls due to instruction fetch.
950.It Li k8-ic-l1-itlb-miss-and-l2-itlb-hit
951Count L1 ITLB misses that are L2 ITLB hits.
952.It Li k8-ic-l1-itlb-miss-and-l2-itlb-miss
953Count ITLB misses that miss in both L1 and L2 ITLBs.
954.It Li k8-ic-microarchitectural-resync-by-snoop
955Count microarchitectural resyncs caused by snoops.
956.It Li k8-ic-miss
957Count instruction cache misses.
958.It Li k8-ic-refill-from-l2
959Count instruction cache refills from L2 cache.
960.It Li k8-ic-refill-from-system
961Count instruction cache refills from system memory.
962.It Li k8-ic-return-stack-hits
963Count hits to the return stack.
964.It Li k8-ic-return-stack-overflow
965Count overflows of the return stack.
966.It Li k8-ls-buffer2-full
967Count load/store buffer2 full events.
968.It Li k8-ls-locked-operation Op Li ,mask= Ns Ar qualifier
969Count locked operations.
970For revision C and later CPUs, the following qualifiers are supported:
971.Bl -tag -width "XXXXXXXXXXXXX" -compact
972.It Li cycles-in-request
973Count the number of cycles in the lock request/grant stage.
974.It Li cycles-to-complete
975Count the number of cycles a lock takes to complete once it is
976non-speculative and is the older load/store operation.
977.It Li locked-instructions
978Count the number of lock instructions executed.
979.El
980The default is to count the number of lock instructions executed.
981.It Li k8-ls-microarchitectural-late-cancel
982Count microarchitectural late cancels of operations in the load/store
983unit.
984.It Li k8-ls-microarchitectural-resync-by-self-modifying-code
985Count microarchitectural resyncs caused by self-modifying code.
986.It Li k8-ls-microarchitectural-resync-by-snoop
987Count microarchitectural resyncs caused by snoops.
988.It Li k8-ls-retired-cflush-instructions
989Count retired CLFLUSH instructions.
990.It Li k8-ls-retired-cpuid-instructions
991Count retired CPUID instructions.
992.It Li k8-ls-segment-register-load Op Li ,mask= Ns Ar qualifier
993Count segment register loads.
994This event may be further qualified using
995.Ar qualifier ,
996which is a
997.Li "+" Ns - Ns
998separated set of the following keywords:
999.Bl -tag -width "XX" -compact
1000.It Li cs
1001Count CS register loads.
1002.It Li ds
1003Count DS register loads.
1004.It Li es
1005Count ES register loads.
1006.It Li fs
1007Count FS register loads.
1008.It Li gs
1009Count GS register loads.
1010.\" .It Ic hs
1011.\" Count HS register loads.
1012.\" XXX "HS" register?
1013.It Li ss
1014Count SS register loads.
1015.El
1016The default is to count all types of loads.
1017.It Li k8-nb-memory-controller-bypass-saturation Op Li ,mask= Ns Ar qualifier
1018Count memory controller bypass counter saturation events.
1019This event may be further qualified using
1020.Ar qualifier ,
1021which is a
1022.Li "+" Ns - Ns
1023separated set of the following keywords:
1024.Bl -tag -width "XXXXXXXXXX" -compact
1025.It Li dram-controller-interface-bypass
1026Count DRAM controller interface bypass.
1027.It Li dram-controller-queue-bypass
1028Count DRAM controller queue bypass.
1029.It Li memory-controller-hi-pri-bypass
1030Count memory controller high priority bypasses.
1031.It Li memory-controller-lo-pri-bypass
1032Count memory controller low priority bypasses.
1033.El
1034.It Li k8-nb-memory-controller-dram-slots-missed
1035Count memory controller DRAM command slots missed (in MemClks).
1036.It Li k8-nb-memory-controller-page-access-event Op Li ,mask= Ns Ar qualifier
1037Count memory controller page access events.
1038This event may be further qualified using
1039.Ar qualifier ,
1040which is a
1041.Li "+" Ns - Ns
1042separated set of the following keywords:
1043.Bl -tag -width "XXXXXXXXXX" -compact
1044.It Li page-conflict
1045Count page conflicts.
1046.It Li page-hit
1047Count page hits.
1048.It Li page-miss
1049Count page misses.
1050.El
1051The default is to count all types of events.
1052.It Li k8-nb-memory-controller-page-table-overflow
1053Count memory controller page table overflow events.
1054.It Li k8-nb-probe-result Op Li ,mask= Ns Ar qualifier
1055Count probe events.
1056This event may be further qualified using
1057.Ar qualifier ,
1058which is a
1059.Li "+" Ns - Ns
1060separated set of the following keywords:
1061.Bl -tag -width "exclusive" -compact
1062.It Li probe-hit
1063Count all probe hits.
1064.It Li probe-hit-dirty-no-memory-cancel
1065Count probe hits without memory cancels.
1066.It Li probe-hit-dirty-with-memory-cancel
1067Count probe hits with memory cancels.
1068.It Li probe-miss
1069Count probe misses.
1070.El
1071.It Li k8-nb-sized-commands Op Li ,mask= Ns Ar qualifier
1072Count sized commands issued.
1073This event may be further qualified using
1074.Ar qualifier ,
1075which is a
1076.Li "+" Ns - Ns
1077separated set of the following keywords:
1078.Bl -tag -width "exclusive" -compact
1079.It Li nonpostwrszbyte
1080.It Li nonpostwrszdword
1081.It Li postwrszbyte
1082.It Li postwrszdword
1083.It Li rdszbyte
1084.It Li rdszdword
1085.It Li rdmodwr
1086.El
1087The default is to count all types of commands.
1088.It Li k8-nb-memory-controller-turnaround Op Li ,mask= Ns Ar qualifier
1089Count memory controller turnaround events.
1090This event may be further qualified using
1091.Ar qualifier ,
1092which is a
1093.Li "+" Ns - Ns
1094separated set of the following keywords:
1095.Bl -tag -width "XXXXXXXXXX" -compact
1096.\" XXX doc is unclear whether these are cycle counts or event counts
1097.It Li dimm-turnaround
1098Count DIMM turnarounds.
1099.It Li read-to-write-turnaround
1100Count read to write turnarounds.
1101.It Li write-to-read-turnaround
1102Count write to read turnarounds.
1103.El
1104The default is to count all types of events.
1105.It Li k8-nb-ht-bus0-bandwidth Op Li ,mask= Ns Ar qualifier
1106.It Li k8-nb-ht-bus1-bandwidth Op Li ,mask= Ns Ar qualifier
1107.It Li k8-nb-ht-bus2-bandwidth Op Li ,mask= Ns Ar qualifier
1108Count events on the HyperTransport(tm) buses.
1109These events may be further qualified using
1110.Ar qualifier ,
1111which is a
1112.Li "+" Ns - Ns
1113separated set of the following keywords:
1114.Bl -tag -width "XXXXXXXXXX" -compact
1115.It Li buffer-release
1116Count buffer release messages sent.
1117.It Li command
1118Count command messages sent.
1119.It Li data
1120Count data messages sent.
1121.It Li nop
1122Count nop messages sent.
1123.El
1124The default is to count all types of messages.
1125.El
1126.Ss Intel P6 PMCS
1127Intel P6 PMCs are present in Intel
1128.Tn "Pentium Pro" ,
1129.Tn "Pentium II" ,
1130.Tn "Celeron" ,
1131.Tn "Pentium III"
1132and
1133.Tn "Pentium M"
1134processors.
1135.Pp
1136These CPUs have two counters.
1137Some events may only be used on specific counters and some events are
1138defined only on specific processor models.
1139.Pp
1140These PMCs are documented in
1141.Rs
1142.%B "IA-32 Intel(R) Architecture Software Developer's Manual"
1143.%T "Volume 3: System Programming Guide"
1144.%N "Order Number 245472-012"
1145.%D 2003
1146.%Q "Intel Corporation"
1147.Re
1148.Pp
1149Event specifiers for Intel P6 PMCs can have the following common
1150qualifiers:
1151.Bl -tag -width indent
1152.It Li cmask= Ns Ar value
1153Configure the PMC to increment only if the number of configured
1154events measured in a cycle is greater than or equal to
1155.Ar value .
1156.It Li edge
1157Configure the PMC to count the number of deasserted to asserted
1158transitions of the conditions expressed by the other qualifiers.
1159If specified, the counter will increment only once whenever a
1160condition becomes true, irrespective of the number of clocks during
1161which the condition remains true.
1162.It Li inv
1163Invert the sense of comparison when the
1164.Ar cmask
1165qualifier is present, making the counter increment when the number of
1166events per cycle is less than the value specified by the
1167.Ar cmask
1168qualifier.
1169.It Li os
1170Configure the PMC to count events happening at processor privilege
1171level 0.
1172.It Li umask= Ns Ar value
1173This qualifier is used to further qualify the event selected (see
1174below).
1175.It Li usr
1176Configure the PMC to count events occurring at privilege levels 1, 2
1177or 3.
1178.El
1179If neither of the
1180.Li os
1181or
1182.Li usr
1183qualifiers is specified, the default is to enable both.
1184.Pp
1185The event specifiers supported by Intel P6 PMCs are:
1186.Bl -tag -width indent
1187.It Li p6-baclears
1188Count the number of times a static branch prediction was made by the
1189branch decoder because the BTB did not have a prediction.
1190.It Li p6-br-bac-missp-exec
1191.Pq Tn "Pentium M"
1192Count the number of branch instructions executed that were
1193mispredicted at the Front End (BAC).
1194.It Li p6-br-bogus
1195Count the number of bogus branches.
1196.It Li p6-br-call-exec
1197.Pq Tn "Pentium M"
1198Count the number of call instructions executed.
1199.It Li p6-br-call-missp-exec
1200.Pq Tn "Pentium M"
1201Count the number of call instructions executed that were mispredicted.
1202.It Li p6-br-cnd-exec
1203.Pq Tn "Pentium M"
1204Count the number of conditional branch instructions executed.
1205.It Li p6-br-cnd-missp-exec
1206.Pq Tn "Pentium M"
1207Count the number of conditional branch instructions executed that were
1208mispredicted.
1209.It Li p6-br-ind-call-exec
1210.Pq Tn "Pentium M"
1211Count the number of indirect call instructions executed.
1212.It Li p6-br-ind-exec
1213.Pq Tn "Pentium M"
1214Count the number of indirect branch instructions executed.
1215.It Li p6-br-ind-missp-exec
1216.Pq Tn "Pentium M"
1217Count the number of indirect branch instructions executed that were
1218mispredicted.
1219.It Li p6-br-inst-decoded
1220Count the number of branch instructions decoded.
1221.It Li p6-br-inst-exec
1222.Pq Tn "Pentium M"
1223Count the number of branch instructions executed but not necessarily retired.
1224.It Li p6-br-inst-retired
1225Count the number of branch instructions retired.
1226.It Li p6-br-miss-pred-retired
1227Count the number of mispredicted branch instructions retired.
1228.It Li p6-br-miss-pred-taken-ret
1229Count the number of taken mispredicted branches retired.
1230.It Li p6-br-missp-exec
1231.Pq Tn "Pentium M"
1232Count the number of branch instructions executed that were
1233mispredicted at execution.
1234.It Li p6-br-ret-bac-missp-exec
1235.Pq Tn "Pentium M"
1236Count the number of return instructions executed that were
1237mispredicted at the Front End (BAC).
1238.It Li p6-br-ret-exec
1239.Pq Tn "Pentium M"
1240Count the number of return instructions executed.
1241.It Li p6-br-ret-missp-exec
1242.Pq Tn "Pentium M"
1243Count the number of return instructions executed that were
1244mispredicted at execution.
1245.It Li p6-br-taken-retired
1246Count the number of taken branches retired.
1247.It Li p6-btb-misses
1248Count the number of branches for which the BTB did not produce a
1249prediction.
1250.It Li p6-bus-bnr-drv
1251Count the number of bus clock cycles during which this processor is
1252driving the BNR# pin.
1253.It Li p6-bus-data-rcv
1254Count the number of bus clock cycles during which this processor is
1255receiving data.
1256.It Li p6-bus-drdy-clocks Op Li ,umask= Ns Ar qualifier
1257Count the number of clocks during which DRDY# is asserted.
1258An additional qualifier may be specified, and comprises one of the
1259following keywords:
1260.Bl -tag -width indent -compact
1261.It Li any
1262Count transactions generated by any agent on the bus.
1263.It Li self
1264Count transactions generated by this processor.
1265.El
1266The default is to count operations generated by this processor.
1267.It Li p6-bus-hit-drv
1268Count the number of bus clock cycles during which this processor is
1269driving the HIT# pin.
1270.It Li p6-bus-hitm-drv
1271Count the number of bus clock cycles during which this processor is
1272driving the HITM# pin.
1273.It Li p6-bus-lock-clocks Op Li ,umask= Ns Ar qualifier
1274Count the number of clocks during which LOCK# is asserted on the
1275external system bus.
1276An additional qualifier may be specified and comprises one of the following
1277keywords:
1278.Bl -tag -width indent -compact
1279.It Li any
1280Count transactions generated by any agent on the bus.
1281.It Li self
1282Count transactions generated by this processor.
1283.El
1284The default is to count operations generated by this processor.
1285.It Li p6-bus-req-outstanding
1286Count the number of bus requests outstanding in any given cycle.
1287.It Li p6-bus-snoop-stall
1288Count the number of clock cycles during which the bus is snoop stalled.
1289.It Li p6-bus-tran-any Op Li ,umask= Ns Ar qualifier
1290Count the number of completed bus transactions of any kind.
1291An additional qualifier may be specified and comprises one of the following
1292keywords:
1293.Bl -tag -width indent -compact
1294.It Li any
1295Count transactions generated by any agent on the bus.
1296.It Li self
1297Count transactions generated by this processor.
1298.El
1299The default is to count operations generated by this processor.
1300.It Li p6-bus-tran-brd Op Li ,umask= Ns Ar qualifier
1301Count the number of burst read transactions.
1302An additional qualifier may be specified and comprises one of the following
1303keywords:
1304.Bl -tag -width indent -compact
1305.It Li any
1306Count transactions generated by any agent on the bus.
1307.It Li self
1308Count transactions generated by this processor.
1309.El
1310The default is to count operations generated by this processor.
1311.It Li p6-bus-tran-burst Op Li ,umask= Ns Ar qualifier
1312Count the number of completed burst transactions.
1313An additional qualifier may be specified and comprises one of the following
1314keywords:
1315.Bl -tag -width indent -compact
1316.It Li any
1317Count transactions generated by any agent on the bus.
1318.It Li self
1319Count transactions generated by this processor.
1320.El
1321The default is to count operations generated by this processor.
1322.It Li p6-bus-tran-def Op Li ,umask= Ns Ar qualifier
1323Count the number of completed deferred transactions.
1324An additional qualifier may be specified and comprises one of the following
1325keywords:
1326.Bl -tag -width indent -compact
1327.It Li any
1328Count transactions generated by any agent on the bus.
1329.It Li self
1330Count transactions generated by this processor.
1331.El
1332The default is to count operations generated by this processor.
1333.It Li p6-bus-tran-ifetch Op Li ,umask= Ns Ar qualifier
1334Count the number of completed instruction fetch transactions.
1335An additional qualifier may be specified and comprises one of the following
1336keywords:
1337.Bl -tag -width indent -compact
1338.It Li any
1339Count transactions generated by any agent on the bus.
1340.It Li self
1341Count transactions generated by this processor.
1342.El
1343The default is to count operations generated by this processor.
1344.It Li p6-bus-tran-inval Op Li ,umask= Ns Ar qualifier
1345Count the number of completed invalidate transactions.
1346An additional qualifier may be specified and comprises one of the following
1347keywords:
1348.Bl -tag -width indent -compact
1349.It Li any
1350Count transactions generated by any agent on the bus.
1351.It Li self
1352Count transactions generated by this processor.
1353.El
1354The default is to count operations generated by this processor.
1355.It Li p6-bus-tran-mem Op Li ,umask= Ns Ar qualifier
1356Count the number of completed memory transactions.
1357An additional qualifier may be specified and comprises one of the following
1358keywords:
1359.Bl -tag -width indent -compact
1360.It Li any
1361Count transactions generated by any agent on the bus.
1362.It Li self
1363Count transactions generated by this processor.
1364.El
1365The default is to count operations generated by this processor.
1366.It Li p6-bus-tran-pwr Op Li ,umask= Ns Ar qualifier
1367Count the number of completed partial write transactions.
1368An additional qualifier may be specified and comprises one of the following
1369keywords:
1370.Bl -tag -width indent -compact
1371.It Li any
1372Count transactions generated by any agent on the bus.
1373.It Li self
1374Count transactions generated by this processor.
1375.El
1376The default is to count operations generated by this processor.
1377.It Li p6-bus-tran-rfo Op Li ,umask= Ns Ar qualifier
1378Count the number of completed read-for-ownership transactions.
1379An additional qualifier may be specified and comprises one of the following
1380keywords:
1381.Bl -tag -width indent -compact
1382.It Li any
1383Count transactions generated by any agent on the bus.
1384.It Li self
1385Count transactions generated by this processor.
1386.El
1387The default is to count operations generated by this processor.
1388.It Li p6-bus-trans-io Op Li ,umask= Ns Ar qualifier
1389Count the number of completed I/O transactions.
1390An additional qualifier may be specified and comprises one of the following
1391keywords:
1392.Bl -tag -width indent -compact
1393.It Li any
1394Count transactions generated by any agent on the bus.
1395.It Li self
1396Count transactions generated by this processor.
1397.El
1398The default is to count operations generated by this processor.
1399.It Li p6-bus-trans-p Op Li ,umask= Ns Ar qualifier
1400Count the number of completed partial transactions.
1401An additional qualifier may be specified and comprises one of the following
1402keywords:
1403.Bl -tag -width indent -compact
1404.It Li any
1405Count transactions generated by any agent on the bus.
1406.It Li self
1407Count transactions generated by this processor.
1408.El
1409The default is to count operations generated by this processor.
1410.It Li p6-bus-trans-wb Op Li ,umask= Ns Ar qualifier
1411Count the number of completed write-back transactions.
1412An additional qualifier may be specified and comprises one of the following
1413keywords:
1414.Bl -tag -width indent -compact
1415.It Li any
1416Count transactions generated by any agent on the bus.
1417.It Li self
1418Count transactions generated by this processor.
1419.El
1420The default is to count operations generated by this processor.
1421.It Li p6-cpu-clk-unhalted
1422Count the number of cycles during which the processor was not halted.
1423.Pp
1424.Pq Tn "Pentium M"
1425Count the number of cycles during which the processor was not halted
1426and not in a thermal trip.
1427.It Li p6-cycles-div-busy
1428Count the number of cycles during which the divider is busy and cannot
1429accept new divides.
1430This event is only allocated on counter 0.
1431.It Li p6-cycles-int-pending-and-masked
1432Count the number of processor cycles for which interrupts were
1433disabled and interrupts were pending.
1434.It Li p6-cycles-int-masked
1435Count the number of processor cycles for which interrupts were
1436disabled.
1437.It Li p6-data-mem-refs
1438Count all loads and all stores using any memory type, including
1439internal retries.
1440Each part of a split store is counted separately.
1441.It Li p6-dcu-lines-in
1442Count the total lines allocated in the data cache unit.
1443.It Li p6-dcu-m-lines-in
1444Count the number of M state lines allocated in the data cache unit.
1445.It Li p6-dcu-m-lines-out
1446Count the number of M state lines evicted from the data cache unit.
1447.It Li p6-dcu-miss-outstanding
1448Count the weighted number of cycles while a data cache unit miss is
1449outstanding, incremented by the number of outstanding cache misses at
1450any time.
1451.It Li p6-div
1452Count the number of divides.
1453This event is only allocated on counter 1.
1454.It Li p6-emon-esp-uops
1455.Pq Tn "Pentium M"
1456Count the total number of micro-ops.
1457.It Li p6-emon-est-trans Op Li ,umask= Ns Ar qualifier
1458.Pq Tn "Pentium M"
1459Count the number of
1460.Tn "Enhanced Intel SpeedStep"
1461transitions.
1462An additional qualifier may be specified, and can be one of the
1463following keywords:
1464.Bl -tag -width indent -compact
1465.It Li all
1466Count all transitions.
1467.It Li freq
1468Count only frequency transitions.
1469.El
1470The default is to count all transitions.
1471.It Li p6-emon-fused-uops-ret Op Li ,umask= Ns Ar qualifier
1472.Pq Tn "Pentium M"
1473Count the number of retired fused micro-ops.
1474An additional qualifier may be specified, and may be one of the
1475following keywords:
1476.Bl -tag -width indent -compact
1477.It Li all
1478Count all fused micro-ops.
1479.It Li loadop
1480Count only load and op micro-ops.
1481.It Li stdsta
1482Count only STD/STA micro-ops.
1483.El
1484The default is to count all fused micro-ops.
1485.It Li p6-emon-kni-comp-inst-ret Op Li ,umask= Ns Ar qualifier
1486.Pq Tn "Pentium III"
1487Count the number of SSE computational instructions retired.
1488An additional qualifier may be specified, and comprises one of the
1489following keywords:
1490.Bl -tag -width indent -compact
1491.It Li packed-and-scalar
1492Count packed and scalar operations.
1493.It Li scalar
1494Count scalar operations only.
1495.El
1496The default is to count packed and scalar operations.
1497.It Li p6-emon-kni-inst-retired Op Li ,umask= Ns Ar qualifier
1498.Pq Tn "Pentium III"
1499Count the number of SSE instructions retired.
1500An additional qualifier may be specified, and comprises one of the
1501following keywords:
1502.Bl -tag -width indent -compact
1503.It Li packed-and-scalar
1504Count packed and scalar operations.
1505.It Li scalar
1506Count scalar operations only.
1507.El
1508The default is to count packed and scalar operations.
1509.It Li p6-emon-kni-pref-dispatched Op Li ,umask= Ns Ar qualifier
1510.Pq Tn "Pentium III"
1511Count the number of SSE prefetch or weakly ordered instructions
1512dispatched (including speculative prefetches).
1513An additional qualifier may be specified, and comprises one of the
1514following keywords:
1515.Bl -tag -width indent -compact
1516.It Li nta
1517Count non-temporal prefetches.
1518.It Li t1
1519Count prefetches to L1.
1520.It Li t2
1521Count prefetches to L2.
1522.It Li wos
1523Count weakly ordered stores.
1524.El
1525The default is to count non-temporal prefetches.
1526.It Li p6-emon-kni-pref-miss Op Li ,umask= Ns Ar qualifier
1527.Pq Tn "Pentium III"
1528Count the number of prefetch or weakly ordered instructions that miss
1529all caches.
1530An additional qualifier may be specified, and comprises one of the
1531following keywords:
1532.Bl -tag -width indent -compact
1533.It Li nta
1534Count non-temporal prefetches.
1535.It Li t1
1536Count prefetches to L1.
1537.It Li t2
1538Count prefetches to L2.
1539.It Li wos
1540Count weakly ordered stores.
1541.El
1542The default is to count non-temporal prefetches.
1543.It Li p6-emon-pref-rqsts-dn
1544.Pq Tn "Pentium M"
1545Count the number of downward prefetches issued.
1546.It Li p6-emon-pref-rqsts-up
1547.Pq Tn "Pentium M"
1548Count the number of upward prefetches issued.
1549.It Li p6-emon-simd-instr-retired
1550.Pq Tn "Pentium M"
1551Count the number of retired
1552.Tn MMX
1553instructions.
1554.It Li p6-emon-sse-sse2-comp-inst-retired Op Li ,umask= Ns Ar qualifier
1555.Pq Tn "Pentium M"
1556Count the number of computational SSE instructions retired.
1557An additional qualifier may be specified and can be one of the
1558following keywords:
1559.Bl -tag -width indent -compact
1560.It Li sse-packed-single
1561Count SSE packed-single instructions.
1562.It Li sse-scalar-single
1563Count SSE scalar-single instructions.
1564.It Li sse2-packed-double
1565Count SSE2 packed-double instructions.
1566.It Li sse2-scalar-double
1567Count SSE2 scalar-double instructions.
1568.El
1569The default is to count SSE packed-single instructions.
1570.It Li p6-emon-sse-sse2-inst-retired Op Li ,umask= Ns Ar qualifier
1571.Pp
1572.Pq Tn "Pentium M"
1573Count the number of SSE instructions retired.
1574An additional qualifier can be specified, and can be one of the
1575following keywords:
1576.Bl -tag -width indent -compact
1577.It Li sse-packed-single
1578Count SSE packed-single instructions.
1579.It Li sse-packed-single-scalar-single
1580Count SSE packed-single and scalar-single instructions.
1581.It Li sse2-packed-double
1582Count SSE2 packed-double instructions.
1583.It Li sse2-scalar-double
1584Count SSE2 scalar-double instructions.
1585.El
1586The default is to count SSE packed-single instructions.
1587.It Li p6-emon-synch-uops
1588.Pq Tn "Pentium M"
1589Count the number of sync micro-ops.
1590.It Li p6-emon-thermal-trip
1591.Pq Tn "Pentium M"
1592Count the duration or occurrences of thermal trips.
1593Use the
1594.Ar edge
1595qualifier to count occurrences of thermal trips.
1596.It Li p6-emon-unfusion
1597.Pq Tn "Pentium M"
1598Count the number of unfusion events in the reorder buffer.
1599.It Li p6-flops
1600Count the number of computational floating point operations retired.
1601This event is only allocated on counter 0.
1602.It Li p6-fp-assist
1603Count the number of floating point exceptions handled by microcode.
1604This event is only allocated on counter 1.
1605.It Li p6-fp-comps-ops-exe
1606Count the number of computational floating point operations executed.
1607This event is only allocated on counter 0.
1608.It Li p6-fp-mmx-trans Op Li ,umask= Ns Ar qualifier
1609.Pq Tn "Pentium II" , Tn "Pentium III"
1610Count the number of transitions between MMX and floating-point
1611instructions.
1612An additional qualifier may be specified, and comprises one of the
1613following keywords:
1614.Bl -tag -width indent -compact
1615.It Li mmxtofp
1616Count transitions from MMX instructions to floating-point instructions.
1617.It Li fptommx
1618Count transitions from floating-point instructions to MMX instructions.
1619.El
1620The default is to count MMX to floating-point transitions.
1621.It Li p6-hw-int-rx
1622Count the number of hardware interrupts received.
1623.It Li p6-ifu-fetch
1624Count the number of instruction fetches, both cacheable and non-cacheable.
1625.It Li p6-ifu-fetch-miss
1626Count the number of instruction fetch misses (i.e., those that produce
1627memory accesses).
1628.It Li p6-ifu-mem-stall
1629Count the number of cycles instruction fetch is stalled for any reason.
1630.It Li p6-ild-stall
1631Count the number of cycles the instruction length decoder is stalled.
1632.It Li p6-inst-decoded
1633Count the number of instructions decoded.
1634.It Li p6-inst-retired
1635Count the number of instructions retired.
1636.It Li p6-itlb-miss
1637Count the number of instruction TLB misses.
1638.It Li p6-l2-ads
1639Count the number of L2 address strobes.
1640.It Li p6-l2-dbus-busy
1641Count the number of cycles during which the L2 cache data bus was busy.
1642.It Li p6-l2-dbus-busy-rd
1643Count the number of cycles during which the L2 cache data bus was busy
1644transferring read data from L2 to the processor.
1645.It Li p6-l2-ifetch Op Li ,umask= Ns Ar qualifier
1646Count the number of L2 instruction fetches.
1647An additional qualifier may be specified and comprises a list of the following
1648keywords separated by
1649.Li "+"
1650characters:
1651.Bl -tag -width indent -compact
1652.It Li e
1653Count operations affecting E (exclusive) state lines.
1654.It Li i
1655Count operations affecting I (invalid) state lines.
1656.It Li m
1657Count operations affecting M (modified) state lines.
1658.It Li s
1659Count operations affecting S (shared) state lines.
1660.El
1661The default is to count operations affecting all (MESI) state lines.
1662.It Li p6-l2-ld Op Li ,umask= Ns Ar qualifier
1663Count the number of L2 data loads.
1664An additional qualifier may be specified and comprises a list of the following
1665keywords separated by
1666.Li "+"
1667characters:
1668.Bl -tag -width indent -compact
1669.It Li both
1670.Pq Tn "Pentium M"
1671Count both hardware-prefetched lines and non-hardware-prefetched lines.
1672.It Li e
1673Count operations affecting E (exclusive) state lines.
1674.It Li hw
1675.Pq Tn "Pentium M"
1676Count hardware-prefetched lines only.
1677.It Li i
1678Count operations affecting I (invalid) state lines.
1679.It Li m
1680Count operations affecting M (modified) state lines.
1681.It Li nonhw
1682.Pq Tn "Pentium M"
1683Exclude hardware-prefetched lines.
1684.It Li s
1685Count operations affecting S (shared) state lines.
1686.El
1687The default on processors other than
1688.Tn "Pentium M"
1689processors is to count operations affecting all (MESI) state lines.
1690The default on
1691.Tn "Pentium M"
1692processors is to count both hardware-prefetched and
1693non-hardware-prefetch operations on all (MESI) state lines.
1694.It Li p6-l2-lines-in Op Li ,umask= Ns Ar qualifier
1695Count the number of L2 lines allocated.
1696An additional qualifier may be specified and comprises a list of the following
1697keywords separated by
1698.Li "+"
1699characters:
1700.Bl -tag -width indent -compact
1701.It Li both
1702.Pq Tn "Pentium M"
1703Count both hardware-prefetched lines and non-hardware-prefetched lines.
1704.It Li e
1705Count operations affecting E (exclusive) state lines.
1706.It Li hw
1707.Pq Tn "Pentium M"
1708Count hardware-prefetched lines only.
1709.It Li i
1710Count operations affecting I (invalid) state lines.
1711.It Li m
1712Count operations affecting M (modified) state lines.
1713.It Li nonhw
1714.Pq Tn "Pentium M"
1715Exclude hardware-prefetched lines.
1716.It Li s
1717Count operations affecting S (shared) state lines.
1718.El
1719The default on processors other than
1720.Tn "Pentium M"
1721processors is to count operations affecting all (MESI) state lines.
1722The default on
1723.Tn "Pentium M"
1724processors is to count both hardware-prefetched and
1725non-hardware-prefetch operations on all (MESI) state lines.
1726.It Li p6-l2-lines-out Op Li ,umask= Ns Ar qualifier
1727Count the number of L2 lines evicted.
1728An additional qualifier may be specified and comprises a list of the following
1729keywords separated by
1730.Li "+"
1731characters:
1732.Bl -tag -width indent -compact
1733.It Li both
1734.Pq Tn "Pentium M"
1735Count both hardware-prefetched lines and non-hardware-prefetched lines.
1736.It Li e
1737Count operations affecting E (exclusive) state lines.
1738.It Li hw
1739.Pq Tn "Pentium M"
1740Count hardware-prefetched lines only.
1741.It Li i
1742Count operations affecting I (invalid) state lines.
1743.It Li m
1744Count operations affecting M (modified) state lines.
1745.It Li nonhw
1746.Pq Tn "Pentium M" only
1747Exclude hardware-prefetched lines.
1748.It Li s
1749Count operations affecting S (shared) state lines.
1750.El
1751The default on processors other than
1752.Tn "Pentium M"
1753processors is to count operations affecting all (MESI) state lines.
1754The default on
1755.Tn "Pentium M"
1756processors is to count both hardware-prefetched and
1757non-hardware-prefetch operations on all (MESI) state lines.
1758.It Li p6-l2-m-lines-inm
1759Count the number of modified lines allocated in L2 cache.
1760.It Li p6-l2-m-lines-outm Op Li ,umask= Ns Ar qualifier
1761Count the number of L2 M-state lines evicted.
1762.Pp
1763.Pq Tn "Pentium M"
1764On these processors an additional qualifier may be specified and
1765comprises a list of the following keywords separated by
1766.Li "+"
1767characters:
1768.Bl -tag -width indent -compact
1769.It Li both
1770Count both hardware-prefetched lines and non-hardware-prefetched lines.
1771.It Li hw
1772Count hardware-prefetched lines only.
1773.It Li nonhw
1774Exclude hardware-prefetched lines.
1775.El
1776The default is to count both hardware-prefetched and
1777non-hardware-prefetch operations.
1778.It Li p6-l2-rqsts Op Li ,umask= Ns Ar qualifier
1779Count the total number of L2 requests.
1780An additional qualifier may be specified and comprises a list of the following
1781keywords separated by
1782.Li "+"
1783characters:
1784.Bl -tag -width indent -compact
1785.It Li e
1786Count operations affecting E (exclusive) state lines.
1787.It Li i
1788Count operations affecting I (invalid) state lines.
1789.It Li m
1790Count operations affecting M (modified) state lines.
1791.It Li s
1792Count operations affecting S (shared) state lines.
1793.El
1794The default is to count operations affecting all (MESI) state lines.
1795.It Li p6-l2-st Op Li ,umask= Ns Ar qualifier
1796Count the number of L2 data stores.
1797An additional qualifier may be specified and comprises a list of the following
1798keywords separated by
1799.Li "+"
1800characters:
1801.Bl -tag -width indent -compact
1802.It Li e
1803Count operations affecting E (exclusive) state lines.
1804.It Li i
1805Count operations affecting I (invalid) state lines.
1806.It Li m
1807Count operations affecting M (modified) state lines.
1808.It Li s
1809Count operations affecting S (shared) state lines.
1810.El
1811The default is to count operations affecting all (MESI) state lines.
1812.It Li p6-ld-blocks
1813Count the number of load operations delayed due to store buffer blocks.
1814.It Li p6-misalign-mem-ref
1815Count the number of misaligned data memory references (crossing a 64
1816bit boundary).
1817.It Li p6-mmx-assist
1818.Pq Tn "Pentium II" , Tn "Pentium III"
1819Count the number of MMX assists executed.
1820.It Li p6-mmx-instr-exec
1821.Pq Tn "Celeron" , Tn "Pentium II"
1822Count the number of MMX instructions executed, except MOVQ and MOVD
1823stores from register to memory.
1824.It Li p6-mmx-instr-ret
1825.Pq Tn "Pentium II"
1826Count the number of MMX instructions retired.
1827.It Li p6-mmx-instr-type-exec Op Li ,umask= Ns Ar qualifier
1828.Pq Tn "Pentium II" , Tn "Pentium III"
1829Count the number of MMX instructions executed.
1830An additional qualifier may be specified and comprises a list of
1831the following keywords separated by
1832.Li "+"
1833characters:
1834.Bl -tag -width indent -compact
1835.It Li pack
1836Count MMX pack operation instructions.
1837.It Li packed-arithmetic
1838Count MMX packed arithmetic instructions.
1839.It Li packed-logical
1840Count MMX packed logical instructions.
1841.It Li packed-multiply
1842Count MMX packed multiply instructions.
1843.It Li packed-shift
1844Count MMX packed shift instructions.
1845.It Li unpack
1846Count MMX unpack operation instructions.
1847.El
1848The default is to count all operations.
1849.It Li p6-mmx-sat-instr-exec
1850.Pq Tn "Pentium II" , Tn "Pentium III"
1851Count the number of MMX saturating instructions executed.
1852.It Li p6-mmx-uops-exec
1853.Pq Tn "Pentium II" , Tn "Pentium III"
1854Count the number of MMX micro-ops executed.
1855.It Li p6-mul
1856Count the number of floating point multiplies.
1857This event is only allocated on counter 1.
1858.It Li p6-partial-rat-stalls
1859Count the number of cycles or events for partial stalls.
1860.It Li p6-resource-stalls
1861Count the number of cycles there was a resource related stall of any kind.
1862.It Li p6-ret-seg-renames
1863.Pq Tn "Pentium II" , Tn "Pentium III"
1864Count the number of segment register rename events retired.
1865.It Li p6-sb-drains
1866Count the number of cycles the store buffer is draining.
1867.It Li p6-seg-reg-renames Op Li ,umask= Ns Ar qualifier
1868.Pq Tn "Pentium II" , Tn "Pentium III"
1869Count the number of segment register renames.
1870An additional qualifier may be specified, and comprises a list of the
1871following keywords separated by
1872.Li "+"
1873characters:
1874.Bl -tag -width indent -compact
1875.It Li ds
1876Count renames for segment register DS.
1877.It Li es
1878Count renames for segment register ES.
1879.It Li fs
1880Count renames for segment register FS.
1881.It Li gs
1882Count renames for segment register GS.
1883.El
1884The default is to count operations affecting all segment registers.
1885.It Li p6-seg-rename-stalls
1886.Pq Tn "Pentium II" , Tn "Pentium III"
1887Count the number of segment register renaming stalls.
1888An additional qualifier may be specified, and comprises a list of the
1889following keywords separated by
1890.Li "+"
1891characters:
1892.Bl -tag -width indent -compact
1893.It Li ds
1894Count stalls for segment register DS.
1895.It Li es
1896Count stalls for segment register ES.
1897.It Li fs
1898Count stalls for segment register FS.
1899.It Li gs
1900Count stalls for segment register GS.
1901.El
1902The default is to count operations affecting all the segment registers.
1903.It Li p6-segment-reg-loads
1904Count the number of segment register loads.
1905.It Li p6-uops-retired
1906Count the number of micro-ops retired.
1907.El
1908.Ss Intel P4 PMCS
1909Intel P4 PMCs are present in Intel
1910.Tn "Pentium 4"
1911and
1912.Tn Xeon
1913processors.
1914These PMCs are documented in
1915.Rs
1916.%B "IA-32 Intel(R) Architecture Software Developer's Manual"
1917.%T "Volume 3: System Programming Guide"
1918.%N "Order Number 245472-012"
1919.%D 2003
1920.%Q "Intel Corporation"
1921.Re
1922Further information about using these PMCs may be found in
1923.Rs
1924.%B "IA-32 Intel(R) Architecture Optimization Guide"
1925.%D 2003
1926.%N "Order Number 248966-009"
1927.%Q "Intel Corporation"
1928.Re
1929.Pp
1930Event specifiers for Intel P4 PMCs can have the following common
1931qualifiers:
1932.Bl -tag -width indent
1933.It Li active= Ns Ar choice
1934(On P4 HTT CPUs) Filter event counting based on which logical
1935processors are active.
1936The allowed values of
1937.Ar choice
1938are:
1939.Bl -tag -width indent -compact
1940.It Li any
1941Count when either logical processor is active.
1942.It Li both
1943Count when both logical processors are active.
1944.It Li none
1945Count only when neither logical processor is active.
1946.It Li single
1947Count only when one logical processor is active.
1948.El
1949The default is
1950.Li both .
1951.It Li cascade
1952Configure the PMC to cascade onto its partner.
1953The PMC for the partner must already have been allocated by the
1954current process.
1955See
1956.Sx "Cascading P4 PMCs"
1957below for more information.
1958.It Li edge
1959Configure the counter to count false to true transitions of the threshold
1960comparison output.
1961This qualifier only takes effect if a threshold qualifier has also been
1962specified.
1963.It Li complement
1964Configure the counter to increment only when the event count seen is
1965less than the threshold qualifier value specified.
1966.It Li mask= Ns Ar qualifier
1967Many event specifiers for Intel P4 PMCs need to be additionally
1968qualified using a mask qualifier.
1969The allowed syntax for these qualifiers is event specific and is
1970described along with the events.
1971.It Li os
1972Configure the PMC to count when the CPL of the processor is 0.
1973.It Li precise
1974Select precise event based sampling.
1975Precise sampling is supported by the hardware for a limited set of
1976events.
1977.It Li tag= Ns Ar value
1978Configure the PMC to tag the internal uop selected by the other
1979fields in this event specifier with value
1980.Ar value .
1981This feature is used when cascading PMCs.
1982.It Li threshold= Ns Ar value
1983Configure the PMC to increment only when the event counts seen are
1984greater than the specified threshold value
1985.Ar value .
1986.It Li usr
1987Configure the PMC to count when the CPL of the processor is 1, 2 or 3.
1988.El
1989If neither of the
1990.Li os
1991or
1992.Li usr
1993qualifiers are specified, the default is to enable both.
1994.Pp
1995On Intel Pentium 4 processors with HTT, events are
1996divided into two classes:
1997.Bl -tag -width "XXXXXXXXXX" -compact
1998.It "TS Events"
1999are those where hardware can differentiate between events
2000generated on one logical processor from those generated on the
2001other.
2002.It "TI Events"
2003are those where hardware cannot differentiate between events
2004generated by multiple logical processors in a package.
2005.El
2006Only TS events are allowed for use with process-mode PMCs on
2007Pentium-4/HTT CPUs.
2008.Pp
2009The event specifiers supported by Intel P4 PMCs are:
2010.Bl -tag -width indent
2011.It Li p4-128bit-mmx-uop Op Li ,mask= Ns Ar flags
2012.Pq "TI event"
2013Count integer SIMD SSE2 instructions that operate on 128 bit SIMD
2014operands.
2015Qualifier
2016.Ar flags
2017can take the following value (which is also the default):
2018.Bl -tag -width indent -compact
2019.It Li all
2020Count all uops operating on 128 bit SIMD integer operands in memory or
2021XMM register.
2022.El
2023If an instruction contains more than one 128 bit MMX uop, then each
2024uop will be counted.
2025.It Li p4-64bit-mmx-uop Op Li ,mask= Ns Ar flags
2026.Pq "TI event"
2027Count MMX instructions that operate on 64 bit SIMD operands.
2028Qualifier
2029.Ar flags
2030can take the following value (which is also the default):
2031.Bl -tag -width indent -compact
2032.It Li all
2033Count all uops operating on 64 bit SIMD integer operands in memory or
2034in MMX registers.
2035.El
2036If an instruction contains more than one 64 bit MMX uop, then each
2037uop will be counted.
2038.It Li p4-b2b-cycles
2039.Pq "TI event"
2040Count back-to-back bus cycles.
2041Further documentation for this event is unavailable.
2042.It Li p4-bnr
2043.Pq "TI event"
2044Count bus-not-ready conditions.
2045Further documentation for this event is unavailable.
2046.It Li p4-bpu-fetch-request Op Li ,mask= Ns Ar qualifier
2047.Pq "TS event"
2048Count instruction fetch requests qualified by additional
2049flags specified in
2050.Ar qualifier .
2051At this point only one flag is supported:
2052.Bl -tag -width indent -compact
2053.It Li tcmiss
2054Count trace cache lookup misses.
2055.El
2056The default qualifier is also
2057.Li mask=tcmiss .
2058.It Li p4-branch-retired Op Li ,mask= Ns Ar flags
2059.Pq "TS event"
2060Counts retired branches.
2061Qualifier
2062.Ar flags
2063is a list of the following
2064.Li +
2065separated strings:
2066.Bl -tag -width indent -compact
2067.It Li mmnp
2068Count branches not-taken and predicted.
2069.It Li mmnm
2070Count branches not-taken and mis-predicted.
2071.It Li mmtp
2072Count branches taken and predicted.
2073.It Li mmtm
2074Count branches taken and mis-predicted.
2075.El
2076The default qualifier counts all four kinds of branches.
2077.It Li p4-bsq-active-entries Op Li ,mask= Ns Ar qualifier
2078.Pq "TS event"
2079Count the number of entries (clipped at 15) currently active in the
2080BSQ.
2081Qualifier
2082.Ar qualifier
2083is a
2084.Li +
2085separated set of the following flags:
2086.Bl -tag -width indent -compact
2087.It Li req-type0 , Li req-type1
2088Forms a 2-bit number used to select the request type encoding:
2089.Bl -tag -width indent -compact
2090.It Li 0
2091reads excluding read invalidate
2092.It Li 1
2093read invalidates
2094.It Li 2
2095writes other than writebacks
2096.It Li 3
2097writebacks
2098.El
2099Bit
2100.Li req-type1
2101is the MSB for this two bit number.
2102.It Li req-len0 , Li req-len1
2103Forms a two-bit number that specifies the request length encoding:
2104.Bl -tag -width indent -compact
2105.It Li 0
21060 chunks
2107.It Li 1
21081 chunk
2109.It Li 3
21108 chunks
2111.El
2112Bit
2113.Li req-len1
2114is the MSB for this two bit number.
2115.It Li req-io-type
2116Count requests that are input or output requests.
2117.It Li req-lock-type
2118Count requests that lock the bus.
2119.It Li req-lock-cache
2120Count requests that lock the cache.
2121.It Li req-split-type
2122Count requests that are bus 8-byte chunks split across an
21238-byte boundary.
2124.It Li req-dem-type
2125Count requests that are demand (not prefetches) if set.
2126Count requests that are prefetches if not set.
2127.It Li req-ord-type
2128Count requests that are ordered.
2129.It Li mem-type0 , Li mem-type1 , Li mem-type2
2130Forms a 3-bit number that specifies a memory type encoding:
2131.Bl -tag -width indent -compact
2132.It Li 0
2133UC
2134.It Li 1
2135USWC
2136.It Li 4
2137WT
2138.It Li 5
2139WP
2140.It Li 6
2141WB
2142.El
2143Bit
2144.Li mem-type2
2145is the MSB of this 3-bit number.
2146.El
2147The default qualifier has all the above bits set.
2148.Pp
2149Edge triggering using the
2150.Li edge
2151qualifier should not be used with this event when counting cycles.
2152.It Li p4-bsq-allocation Op Li ,mask= Ns Ar qualifier
2153.Pq "TS event"
2154Count allocations in the bus sequence unit according to the flags
2155specified in
2156.Ar qualifier ,
2157which is a
2158.Li +
2159separated set of the following flags:
2160.Bl -tag -width indent -compact
2161.It Li req-type0 , Li req-type1
2162Forms a 2-bit number used to select the request type encoding:
2163.Bl -tag -width indent -compact
2164.It Li 0
2165reads excluding read invalidate
2166.It Li 1
2167read invalidates
2168.It Li 2
2169writes other than writebacks
2170.It Li 3
2171writebacks
2172.El
2173Bit
2174.Li req-type1
2175is the MSB for this two bit number.
2176.It Li req-len0 , Li req-len1
2177Forms a two-bit number that specifies the request length encoding:
2178.Bl -tag -width indent -compact
2179.It Li 0
21800 chunks
2181.It Li 1
21821 chunk
2183.It Li 3
21848 chunks
2185.El
2186Bit
2187.Li req-len1
2188is the MSB for this two bit number.
2189.It Li req-io-type
2190Count requests that are input or output requests.
2191.It Li req-lock-type
2192Count requests that lock the bus.
2193.It Li req-lock-cache
2194Count requests that lock the cache.
2195.It Li req-split-type
2196Count requests that are bus 8-byte chunks split across an
21978-byte boundary.
2198.It Li req-dem-type
2199Count requests that are demand (not prefetches) if set.
2200Count requests that are prefetches if not set.
2201.It Li req-ord-type
2202Count requests that are ordered.
2203.It Li mem-type0 , Li mem-type1 , Li mem-type2
2204Forms a 3-bit number that specifies a memory type encoding:
2205.Bl -tag -width indent -compact
2206.It Li 0
2207UC
2208.It Li 1
2209USWC
2210.It Li 4
2211WT
2212.It Li 5
2213WP
2214.It Li 6
2215WB
2216.El
2217Bit
2218.Li mem-type2
2219is the MSB of this 3-bit number.
2220.El
2221The default qualifier has all the above bits set.
2222.Pp
2223This event is usually used along with the
2224.Li edge
2225qualifier to avoid multiple counting.
2226.It Li p4-bsq-cache-reference Op Li ,mask= Ns Ar qualifier
2227.Pq "TS event"
2228Count cache references as seen by the bus unit (2nd or 3rd level
2229cache references).
2230Qualifier
2231.Ar qualifier
2232is a
2233.Li +
2234separated list of the following keywords:
2235.Bl -tag -width indent -compact
2236.It Li rd-2ndl-hits
2237Count 2nd level cache hits in the shared state.
2238.It Li rd-2ndl-hite
2239Count 2nd level cache hits in the exclusive state.
2240.It Li rd-2ndl-hitm
2241Count 2nd level cache hits in the modified state.
2242.It Li rd-3rdl-hits
2243Count 3rd level cache hits in the shared state.
2244.It Li rd-3rdl-hite
2245Count 3rd level cache hits in the exclusive state.
2246.It Li rd-3rdl-hitm
2247Count 3rd level cache hits in the modified state.
2248.It Li rd-2ndl-miss
2249Count 2nd level cache misses.
2250.It Li rd-3rdl-miss
2251Count 3rd level cache misses.
2252.It Li wr-2ndl-miss
2253Count write-back lookups from the data access cache that miss the 2nd
2254level cache.
2255.El
2256The default is to count all the above events.
2257.It Li p4-execution-event Op Li ,mask= Ns Ar flags
2258.Pq "TS event"
2259Count the retirement of tagged uops selected through the execution
2260tagging mechanism.
2261Qualifier
2262.Ar flags
2263can contain the following strings separated by
2264.Li +
2265characters:
2266.Bl -tag -width indent -compact
2267.It Li nbogus0 , Li nbogus1 , Li nbogus2 , Li nbogus3
2268The marked uops are not bogus.
2269.It Li bogus0 , Li bogus1 , Li bogus2 , Li bogus3
2270The marked uops are bogus.
2271.El
2272This event requires additional (upstream) events to be allocated to
2273perform the desired uop tagging.
2274The default is to set all the above flags.
2275This event can be used for precise event based sampling.
2276.It Li p4-front-end-event Op Li ,mask= Ns Ar flags
2277.Pq "TS event"
2278Count the retirement of tagged uops selected through the front-end
2279tagging mechanism.
2280Qualifier
2281.Ar flags
2282can contain the following strings separated by
2283.Li +
2284characters:
2285.Bl -tag -width indent -compact
2286.It Li nbogus
2287The marked uops are not bogus.
2288.It Li bogus
2289The marked uops are bogus.
2290.El
2291This event requires additional (upstream) events to be allocated to
2292perform the desired uop tagging.
2293The default is to select both kinds of events.
2294This event can be used for precise event based sampling.
2295.It Li p4-fsb-data-activity Op Li ,mask= Ns Ar flags
2296.Pq "TI event"
2297Count each DBSY or DRDY event selected by qualifier
2298.Ar flags .
2299Qualifier
2300.Ar flags
2301is a
2302.Li +
2303separated set of the following flags:
2304.Bl -tag -width indent -compact
2305.It Li drdy-drv
2306Count when this processor is driving data onto the bus.
2307.It Li drdy-own
2308Count when this processor is reading data from the bus.
2309.It Li drdy-other
2310Count when data is on the bus but not being sampled by this processor.
2311.It Li dbsy-drv
2312Count when this processor reserves the bus for use in the next cycle
2313in order to drive data.
2314.It Li dbsy-own
2315Count when some agent reserves the bus for use in the next bus cycle
2316to drive data that this processor will sample.
2317.It Li dbsy-other
2318Count when some agent reserves the bus for use in the next bus cycle
2319to drive data that this processor will not sample.
2320.El
2321Flags
2322.Li drdy-own
2323and
2324.Li drdy-other
2325are mutually exclusive.
2326Flags
2327.Li dbsy-own
2328and
2329.Li dbsy-other
2330are mutually exclusive.
2331The default value for
2332.Ar flags
2333is
2334.Li drdy-drv+drdy-own+dbsy-drv+dbsy-own .
2335.It Li p4-global-power-events Op Li ,mask= Ns Ar flags
2336.Pq "TS event"
2337Count cycles during which the processor is not stopped.
2338Qualifier
2339.Ar flags
2340can take the following value (which is also the default):
2341.Bl -tag -width indent -compact
2342.It Li running
2343Count cycles when the processor is active.
2344.El
2345.It Li p4-instr-retired Op Li ,mask= Ns Ar flags
2346.Pq "TS event"
2347Count instructions retired during a clock cycle.
2348Qualifier
2349.Ar flags
2350comprises the following strings separated by
2351.Li +
2352characters:
2353.Bl -tag -width indent -compact
2354.It Li nbogusntag
2355Count non-bogus instructions that are not tagged.
2356.It Li nbogustag
2357Count non-bogus instructions that are tagged.
2358.It Li bogusntag
2359Count bogus instructions that are not tagged.
2360.It Li bogustag
2361Count bogus instructions that are tagged.
2362.El
2363The default qualifier counts all the above kinds of instructions.
2364.It Li p4-ioq-active-entries Xo
2365.Op Li ,mask= Ns Ar qualifier
2366.Op Li ,busreqtype= Ns Ar req-type
2367.Xc
2368.Pq "TS event"
2369Count the number of entries (clipped at 15) in the IOQ that are
2370active.
2371The event masks are specified by qualifier
2372.Ar qualifier
2373and
2374.Ar req-type .
2375.Pp
2376Qualifier
2377.Ar qualifier
2378is a
2379.Li +
2380separated set of the following flags:
2381.Bl -tag -width indent -compact
2382.It Li all-read
2383Count read entries.
2384.It Li all-write
2385Count write entries.
2386.It Li mem-uc
2387Count entries accessing uncacheable memory.
2388.It Li mem-wc
2389Count entries accessing write-combining memory.
2390.It Li mem-wt
2391Count entries accessing write-through memory.
2392.It Li mem-wp
2393Count entries accessing write-protected memory.
2394.It Li mem-wb
2395Count entries accessing write-back memory.
2396.It Li own
2397Count store requests driven by the processor (i.e., not by other
2398processors or by DMA).
2399.It Li other
2400Count store requests driven by other processors or by DMA.
2401.It Li prefetch
2402Include hardware and software prefetch requests in the count.
2403.El
2404The default value for
2405.Ar qualifier
2406is to enable all the above flags.
2407.Pp
2408The
2409.Ar req-type
2410qualifier is a 5-bit number that can additionally be used to select a
2411specific bus request type.
2412The default is 0.
2413.Pp
2414The
2415.Li edge
2416qualifier should not be used when counting cycles with this event.
2417The exact behaviour of this event depends on the processor revision.
2418.It Li p4-ioq-allocation Xo
2419.Op Li ,mask= Ns Ar qualifier
2420.Op Li ,busreqtype= Ns Ar req-type
2421.Xc
2422.Pq "TS event"
2423Count various types of transactions on the bus matching the flags set
2424in
2425.Ar qualifier
2426and
2427.Ar req-type .
2428.Pp
2429Qualifier
2430.Ar qualifier
2431is a
2432.Li +
2433separated set of the following flags:
2434.Bl -tag -width indent -compact
2435.It Li all-read
2436Count read entries.
2437.It Li all-write
2438Count write entries.
2439.It Li mem-uc
2440Count entries accessing uncacheable memory.
2441.It Li mem-wc
2442Count entries accessing write-combining memory.
2443.It Li mem-wt
2444Count entries accessing write-through memory.
2445.It Li mem-wp
2446Count entries accessing write-protected memory.
2447.It Li mem-wb
2448Count entries accessing write-back memory.
2449.It Li own
2450Count store requests driven by the processor (i.e., not by other
2451processors or by DMA).
2452.It Li other
2453Count store requests driven by other processors or by DMA.
2454.It Li prefetch
2455Include hardware and software prefetch requests in the count.
2456.El
2457The default value for
2458.Ar qualifier
2459is to enable all the above flags.
2460.Pp
2461The
2462.Ar req-type
2463qualifier is a 5-bit number that can additionally be used to select a
2464specific bus request type.
2465The default is 0.
2466.Pp
2467The
2468.Li edge
2469qualifier is normally used with this event to prevent multiple
2470counting.
2471The exact behaviour of this event depends on the processor revision.
2472.It Li p4-itlb-reference Op Li ,mask= Ns Ar qualifier
2473.Pq "TS event"
2474Count translations using the instruction translation look-aside
2475buffer.
2476The
2477.Ar qualifier
2478argument is a list of the following strings separated by
2479.Li +
2480characters:
2481.Bl -tag -width indent -compact
2482.It Li hit
2483Count ITLB hits.
2484.It Li miss
2485Count ITLB misses.
2486.It Li hit-uc
2487Count uncacheable ITLB hits.
2488.El
2489If no
2490.Ar qualifier
2491is specified the default is to count all the three kinds of ITLB
2492translations.
2493.It Li p4-load-port-replay Op Li ,mask= Ns Ar qualifier
2494.Pq "TS event"
2495Count replayed events at the load port.
2496Qualifier
2497.Ar qualifier
2498can take on one value:
2499.Bl -tag -width indent -compact
2500.It Li split-ld
2501Count split loads.
2502.El
2503The default value for
2504.Ar qualifier
2505is
2506.Li split-ld .
2507.It Li p4-mispred-branch-retired Op Li ,mask= Ns Ar flags
2508.Pq "TS event"
2509Count mispredicted IA-32 branch instructions.
2510Qualifier
2511.Ar flags
2512can take the following value (which is also the default):
2513.Bl -tag -width indent -compact
2514.It Li nbogus
2515Count non-bogus retired branch instructions.
2516.El
2517.It Li p4-machine-clear Op Li ,mask= Ns Ar flags
2518.Pq "TS event"
2519Count the number of pipeline clears seen by the processor.
2520Qualifier
2521.Ar flags
2522is a list of the following strings separated by
2523.Li +
2524characters:
2525.Bl -tag -width indent -compact
2526.It Li clear
2527Count for a portion of the many cycles when the machine is being
2528cleared for any reason.
2529.It Li moclear
2530Count machine clears due to memory ordering issues.
2531.It Li smclear
2532Count machine clears due to self-modifying code.
2533.El
2534Use qualifier
2535.Li edge
2536to get a count of occurrences of machine clears.
2537The default qualifier is
2538.Li clear .
2539.It Li p4-memory-cancel Op Li ,mask= Ns Ar event-list
2540.Pq "TS event"
2541Count the cancelling of various kinds of requests in the data cache
2542address control unit of the CPU.
2543The qualifier
2544.Ar event-list
2545is a list of the following strings separated by
2546.Li "+"
2547characters:
2548.Bl -tag -width indent -compact
2549.It Li st-rb-full
2550Requests cancelled because no store request buffer was available.
2551.It Li 64k-conf
2552Requests that conflict due to 64K aliasing.
2553.El
2554If
2555.Ar event-list
2556is not specified, then the default is to count both kinds of events.
2557.It Li p4-memory-complete Op Li ,mask= Ns Ar event-list
2558.Pq "TS event"
2559Count the completion of load split, store split, uncacheable split and
2560uncacheable load operations selected by qualifier
2561.Ar event-list .
2562The qualifier
2563.Ar event-list
2564is a
2565.Li +
2566separated list of the following flags:
2567.Bl -tag -width indent -compact
2568.It Li lsc
2569Count load splits completed, excluding loads from uncacheable or
2570write-combining areas.
2571.It Li ssc
2572Count any split stores completed.
2573.El
2574The default is to count both kinds of operations.
2575.It Li p4-mob-load-replay Op Li ,mask= Ns Ar qualifier
2576.Pq "TS event"
2577Count load replays triggered by the memory order buffer.
2578Qualifier
2579.Ar qualifier
2580can be a
2581.Li +
2582separated list of the following flags:
2583.Bl -tag -width indent -compact
2584.It Li no-sta
2585Count replays because of unknown store addresses.
2586.It Li no-std
2587Count replays because of unknown store data.
2588.It Li partial-data
2589Count replays because of partially overlapped data accesses between
2590load and store operations.
2591.It Li unalgn-addr
2592Count replays because of mismatches in the lower 4 bits of load and
2593store operations.
2594.El
2595The default qualifier is
2596.Li no-sta+no-std+partial-data+unalgn-addr .
2597.It Li p4-packed-dp-uop Op Li ,mask= Ns Ar flags
2598.Pq "TI event"
2599Count packed double-precision uops.
2600Qualifier
2601.Ar flags
2602can take the following value (which is also the default):
2603.Bl -tag -width indent -compact
2604.It Li all
2605Count all uops operating on packed double-precision operands.
2606.El
2607.It Li p4-packed-sp-uop Op Li ,mask= Ns Ar flags
2608.Pq "TI event"
2609Count packed single-precision uops.
2610Qualifier
2611.Ar flags
2612can take the following value (which is also the default):
2613.Bl -tag -width indent -compact
2614.It Li all
2615Count all uops operating on packed single-precision operands.
2616.El
2617.It Li p4-page-walk-type Op Li ,mask= Ns Ar qualifier
2618.Pq "TI event"
2619Count page walks performed by the page miss handler.
2620Qualifier
2621.Ar qualifier
2622can be a
2623.Li +
2624separated list of the following keywords:
2625.Bl -tag -width indent -compact
2626.It Li dtmiss
2627Count page walks for data TLB misses.
2628.It Li itmiss
2629Count page walks for instruction TLB misses.
2630.El
2631The default value for
2632.Ar qualifier
2633is
2634.Li dtmiss+itmiss .
2635.It Li p4-replay-event Op Li ,mask= Ns Ar flags
2636.Pq "TS event"
2637Count the retirement of tagged uops selected through the replay
2638tagging mechanism.
2639Qualifier
2640.Ar flags
2641contains a
2642.Li +
2643separated set of the following strings:
2644.Bl -tag -width indent -compact
2645.It Li nbogus
2646The marked uops are not bogus.
2647.It Li bogus
2648The marked uops are bogus.
2649.El
2650This event requires additional (upstream) events to be allocated to
2651perform the desired uop tagging.
2652The default qualifier counts both kinds of uops.
2653This event can be used for precise event based sampling.
2654.It Li p4-resource-stall Op Li ,mask= Ns Ar flags
2655.Pq "TS event"
2656Count the occurrence or latency of stalls in the allocator.
2657Qualifier
2658.Ar flags
2659can take the following value (which is also the default):
2660.Bl -tag -width indent -compact
2661.It Li sbfull
2662A stall due to the lack of store buffers.
2663.El
2664.It Li p4-response
2665.Pq "TI event"
2666Count different types of responses.
2667Further documentation on this event is not available.
2668.It Li p4-retired-branch-type Op Li ,mask= Ns Ar flags
2669.Pq "TS event"
2670Count branches retired.
2671Qualifier
2672.Ar flags
2673contains a
2674.Li +
2675separated list of strings:
2676.Bl -tag -width indent -compact
2677.It Li conditional
2678Count conditional jumps.
2679.It Li call
2680Count direct and indirect call branches.
2681.It Li return
2682Count return branches.
2683.It Li indirect
2684Count returns, indirect calls or indirect jumps.
2685.El
2686The default qualifier counts all the above branch types.
2687.It Li p4-retired-mispred-branch-type Op Li ,mask= Ns Ar flags
2688.Pq "TS event"
2689Count mispredicted branches retired.
2690Qualifier
2691.Ar flags
2692contains a
2693.Li +
2694separated list of strings:
2695.Bl -tag -width indent -compact
2696.It Li conditional
2697Count conditional jumps.
2698.It Li call
2699Count indirect call branches.
2700.It Li return
2701Count return branches.
2702.It Li indirect
2703Count returns, indirect calls or indirect jumps.
2704.El
2705The default qualifier counts all the above branch types.
2706.It Li p4-scalar-dp-uop Op Li ,mask= Ns Ar flags
2707.Pq "TI event"
2708Count the number of scalar double-precision uops.
2709Qualifier
2710.Ar flags
2711can take the following value (which is also the default):
2712.Bl -tag -width indent -compact
2713.It Li all
2714Count all uops operating on scalar double-precision operands.
2715.El
2716.It Li p4-scalar-sp-uop Op Li ,mask= Ns Ar flags
2717.Pq "TI event"
2718Count the number of scalar single-precision uops.
2719Qualifier
2720.Ar flags
2721can take the following value (which is also the default):
2722.Bl -tag -width indent -compact
2723.It Li all
2724Count all uops operating on scalar single-precision operands.
2725.El
2726.It Li p4-snoop
2727.Pq "TI event"
2728Count snoop traffic.
2729Further documentation on this event is not available.
2730.It Li p4-sse-input-assist Op Li ,mask= Ns Ar flags
2731.Pq "TI event"
2732Count the number of times an assist is required to handle problems
2733with the operands for SSE and SSE2 operations.
2734Qualifier
2735.Ar flags
2736can take the following value (which is also the default):
2737.Bl -tag -width indent -compact
2738.It Li all
2739Count assists for all SSE and SSE2 uops.
2740.El
2741.It Li p4-store-port-replay Op Li ,mask= Ns Ar qualifier
2742.Pq "TS event"
2743Count events replayed at the store port.
2744Qualifier
2745.Ar qualifier
2746can take on one value:
2747.Bl -tag -width indent -compact
2748.It Li split-st
2749Count split stores.
2750.El
2751The default value for
2752.Ar qualifier
2753is
2754.Li split-st .
2755.It Li p4-tc-deliver-mode Op Li ,mask= Ns Ar qualifier
2756.Pq "TI event"
2757Count the duration in cycles of operating modes of the trace cache and
2758decode engine.
2759The desired operating mode is selected by
2760.Ar qualifier ,
2761which is a list of the following strings separated by
2762.Li "+"
2763characters:
2764.Bl -tag -width indent -compact
2765.It Li DD
2766Both logical processors are in deliver mode.
2767.It Li DB
2768Logical processor 0 is in deliver mode while logical processor 1 is in
2769build mode.
2770.It Li DI
2771Logical processor 0 is in deliver mode while logical processor 1 is
2772halted, or in machine clear, or transitioning to a long microcode
2773flow.
2774.It Li BD
2775Logical processor 0 is in build mode while logical processor 1 is in
2776deliver mode.
2777.It Li BB
2778Both logical processors are in build mode.
2779.It Li BI
2780Logical processor 0 is in build mode while logical processor 1 is
2781halted, or in machine clear or transitioning to a long microcode
2782flow.
2783.It Li ID
2784Logical processor 0 is halted, or in machine clear or transitioning to
2785a long microcode flow while logical processor 1 is in deliver mode.
2786.It Li IB
2787Logical processor 0 is halted, or in machine clear or transitioning to
2788a long microcode flow while logical processor 1 is in build mode.
2789.El
2790If there is only one logical processor in the processor package then
2791the qualifier for logical processor 1 is ignored.
2792If no qualifier is specified, the default qualifier is
2793.Li DD+DB+DI+BD+BB+BI+ID+IB .
2794.It Li p4-tc-ms-xfer Op Li ,mask= Ns Ar flags
2795.Pq "TI event"
2796Count the number of times uop delivery changed from the trace cache to
2797MS ROM.
2798Qualifier
2799.Ar flags
2800can take the following value (which is also the default):
2801.Bl -tag -width indent -compact
2802.It Li cisc
2803Count TC to MS transfers.
2804.El
2805.It Li p4-uop-queue-writes Op Li ,mask= Ns Ar flags
2806.Pq "TS event"
2807Count the number of valid uops written to the uop queue.
2808Qualifier
2809.Ar flags
2810is a list of the following strings, separated by
2811.Li +
2812characters:
2813.Bl -tag -width indent -compact
2814.It Li from-tc-build
2815Count uops being written from the trace cache in build mode.
2816.It Li from-tc-deliver
2817Count uops being written from the trace cache in deliver mode.
2818.It Li from-rom
2819Count uops being written from microcode ROM.
2820.El
2821The default qualifier counts all the above kinds of uops.
2822.It Li p4-uop-type Op Li ,mask= Ns Ar flags
2823.Pq "TS event"
2824This event is used in conjunction with the front-end at-retirement
2825mechanism to tag load and store uops.
2826Qualifier
2827.Ar flags
2828comprises the following strings separated by
2829.Li +
2830characters:
2831.Bl -tag -width indent -compact
2832.It Li tagloads
2833Mark uops that are load operations.
2834.It Li tagstores
2835Mark uops that are store operations.
2836.El
2837The default qualifier counts both kinds of uops.
2838.It Li p4-uops-retired Op Li ,mask= Ns Ar flags
2839.Pq "TS event"
2840Count uops retired during a clock cycle.
2841Qualifier
2842.Ar flags
2843comprises the following strings separated by
2844.Li +
2845characters:
2846.Bl -tag -width indent -compact
2847.It Li nbogus
2848Count marked uops that are not bogus.
2849.It Li bogus
2850Count marked uops that are bogus.
2851.El
2852The default qualifier counts both kinds of uops.
2853.It Li p4-wc-buffer Op Li ,mask= Ns Ar flags
2854.Pq "TI event"
2855Count write-combining buffer operations.
2856Qualifier
2857.Ar flags
2858contains the following strings separated by
2859.Li +
2860characters:
2861.Bl -tag -width indent -compact
2862.It Li wcb-evicts
2863WC buffer evictions due to any cause.
2864.It Li wcb-full-evict
2865WC buffer evictions due to no WC buffer being available.
2866.El
2867The default qualifier counts both kinds of evictions.
2868.It Li p4-x87-assist Op Li ,mask= Ns Ar flags
2869.Pq "TS event"
2870Count the retirement of x87 instructions that required special
2871handling.
2872Qualifier
2873.Ar flags
2874contains the following strings separated by
2875.Li +
2876characters:
2877.Bl -tag -width indent -compact
2878.It Li fpsu
2879Count instructions that saw an FP stack underflow.
2880.It Li fpso
2881Count instructions that saw an FP stack overflow.
2882.It Li poao
2883Count instructions that saw an x87 output overflow.
2884.It Li poau
2885Count instructions that saw an x87 output underflow.
2886.It Li prea
2887Count instructions that needed an x87 input assist.
2888.El
2889The default qualifier counts all the above types of instruction
2890retirements.
2891.It Li p4-x87-fp-uop Op Li ,mask= Ns Ar flags
2892.Pq "TI event"
2893Count x87 floating-point uops.
2894Qualifier
2895.Ar flags
2896can take the following value (which is also the default):
2897.Bl -tag -width indent -compact
2898.It Li all
2899Count all x87 floating-point uops.
2900.El
2901If an instruction contains more than one x87 floating-point uop, then
2902all x87 floating-point uops will be counted.
2903This event does not count x87 floating-point data movement operations.
2904.It Li p4-x87-simd-moves-uop Op Li ,mask= Ns Ar flags
2905.Pq "TI event"
2906Count each x87 FPU, MMX, SSE, or SSE2 uop that loads data, stores
2907data, or performs a register-to-register move.
2908This event does not count integer move uops.
2909Qualifier
2910.Ar flags
2911may contain the following keywords separated by
2912.Li +
2913characters:
2914.Bl -tag -width indent -compact
2915.It Li allp0
2916Count all x87 and SIMD store and move uops.
2917.It Li allp2
2918Count all x87 and SIMD load uops.
2919.El
2920The default is to count all uops.
2921.El
2922.Ss "Cascading P4 PMCs"
2923To be filled in.
2924.Ss "Precise Event Based Sampling"
2925To be filled in.
2926.Sh IMPLEMENTATION NOTES
2927On the i386 architecture,
2928.Fx
2929has historically allowed the use of the RDTSC instruction from
2930user-mode (i.e., at a processor CPL of 3) by any process.
2931This behaviour is preserved by
2932.Xr hwpmc 4 .
2933.Sh RETURN VALUES
2934The
2935.Fn pmc_name_of_capability ,
2936.Fn pmc_name_of_class ,
2937.Fn pmc_name_of_cputype ,
2938.Fn pmc_name_of_disposition ,
2939.Fn pmc_name_of_event ,
2940.Fn pmc_name_of_mode ,
2941and
2942.Fn pmc_name_of_state
2943functions return a pointer to the human-readable form of their argument.
2944These pointers may point to statically allocated storage and must
2945not be passed to
2946.Fn free .
2947In case of an error, these functions return
2948.Li NULL
2949and set the global variable
2950.Va errno .
2951.Pp
2952The functions
2953.Fn pmc_ncpu
2954and
2955.Fn pmc_npmc
2956return the number of CPUs and number of PMCs configured, respectively;
2957in case of an error they return the value
2958.Li -1
2959and set the global variable
2960.Va errno .
2961.Pp
2962All other functions return the value
2963.Li 0
2964if successful; otherwise the value
2965.Li -1
2966is returned and the global variable
2967.Va errno
2968is set to indicate the error.
2969.Sh ERRORS
2970A call to
2971.Fn pmc_init
2972may fail with the following errors in addition to those returned by
2973.Xr modfind 2 ,
2974.Xr modstat 2
2975and
2976.Xr hwpmc 4 :
2977.Bl -tag -width Er
2978.It Bq Er ENXIO
2979An unknown CPU type was encountered during initialization.
2980.It Bq Er EPROGMISMATCH
2981The version number of the
2982.Xr hwpmc 4
2983kernel module did not match that compiled into the
2984.Xr pmc 3
2985library.
2986.El
2987.Pp
2988A call to
2989.Fn pmc_name_of_capability ,
2990.Fn pmc_name_of_disposition ,
2991.Fn pmc_name_of_state ,
2992.Fn pmc_name_of_event ,
2993.Fn pmc_name_of_mode
2994and
2995.Fn pmc_name_of_class
2996may fail with the following error:
2997.Bl -tag -width Er
2998.It Bq Er EINVAL
2999An invalid argument was passed to the function.
3000.El
3001.Pp
3002A call to
3003.Fn pmc_cpuinfo
3004or
3005.Fn pmc_ncpu
3006may fail with the following error:
3007.Bl -tag -width Er
3008.It Bq Er ENXIO
3009The
3010.Xr pmc 3
3011library has not been initialized.
3012.El
3013.Pp
3014A call to
3015.Fn pmc_npmc
3016may fail with the following errors:
3017.Bl -tag -width Er
3018.It Bq Er EINVAL
3019The argument passed in was out of range.
3020.It Bq Er ENXIO
3021The
3022.Xr pmc 3
3023library has not been initialized.
3024.El
3025.Pp
3026A call to
3027.Fn pmc_pmcinfo
3028may fail with the following errors, in addition to those returned by
3029.Xr hwpmc 4 :
3030.Bl -tag -width Er
3031.It Bq Er ENXIO
3032The
3033.Xr pmc 3
3034library is not yet initialized.
3035.El
3036.Pp
3037A call to
3038.Fn pmc_allocate
3039may fail with the following errors, in addition to those returned by
3040.Xr hwpmc 4 :
3041.Bl -tag -width Er
3042.It Bq Er EINVAL
3043The
3044.Fa mode
3045argument passed in had an illegal value, or the event specification
3046.Fa ctrspec
3047was unrecognized for this CPU type.
3048.El
3049.Pp
3050Calls to
3051.Fn pmc_attach ,
3052.Fn pmc_detach ,
3053.Fn pmc_release ,
3054.Fn pmc_start ,
3055.Fn pmc_stop ,
3056.Fn pmc_read ,
3057.Fn pmc_write ,
3058.Fn pmc_rw ,
3059.Fn pmc_set ,
3060.Fn pmc_configure_logfile ,
3061.Fn pmc_get_driver_stats ,
3062.Fn pmc_enable ,
3063.Fn pmc_disable ,
3064and
3065.Fn pmc_x86_get_msr
3066may fail with the errors described in
3067.Xr hwpmc 4 .
3068.Sh SEE ALSO
3069.Xr modfind 2 ,
3070.Xr modstat 2 ,
3071.Xr hwpmc 4 ,
3072.Xr pmccontrol 8 ,
3073.Xr pmcreport 8 ,
3074.Xr pmcstat 8
3075.Sh BUGS
3076The information returned by
3077.Fn pmc_cpuinfo ,
3078.Fn pmc_ncpu
3079and possibly
3080.Fn pmc_npmc
3081should really be available all the time, through a better designed
3082interface.
3083.Pp
3084The API for
3085.Fn pmc_cpuinfo
3086and
3087.Fn pmc_pmcinfo
3088exposes too much of the underlying
3089.Xr hwpmc 4
3090driver's internals to userland.
3091