xref: /titanic_41/usr/src/uts/intel/pcbe/p4_pcbe.c (revision 275c9da86e89f8abf71135cf63d9fc23671b2e60)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Performance Counter Back-End for Pentium 4.
30  */
31 
32 #include <sys/cpuvar.h>
33 #include <sys/param.h>
34 #include <sys/cpc_impl.h>
35 #include <sys/cpc_pcbe.h>
36 #include <sys/inttypes.h>
37 #include <sys/errno.h>
38 #include <sys/systm.h>
39 #include <sys/archsystm.h>
40 #include <sys/x86_archext.h>
41 #include <sys/modctl.h>
42 #include <sys/sdt.h>
43 #include <sys/cred.h>
44 #include <sys/policy.h>
45 #include <sys/privregs.h>
46 
47 static int p4_pcbe_init(void);
48 static uint_t p4_pcbe_ncounters(void);
49 static const char *p4_pcbe_impl_name(void);
50 static const char *p4_pcbe_cpuref(void);
51 static char *p4_pcbe_list_events(uint_t picnum);
52 static char *p4_pcbe_list_attrs(void);
53 static uint64_t p4_pcbe_event_coverage(char *event);
54 static uint64_t p4_pcbe_overflow_bitmap(void);
55 static int p4_pcbe_configure(uint_t picnum, char *event, uint64_t preset,
56     uint32_t flags, uint_t nattrs, kcpc_attr_t *attrs, void **data,
57     void *token);
58 static void p4_pcbe_program(void *token);
59 static void p4_pcbe_allstop(void);
60 static void p4_pcbe_sample(void *token);
61 static void p4_pcbe_free(void *config);
62 
63 extern int cpuid_get_clogid(cpu_t *);
64 
/*
 * Operations vector handed to the generic kernel CPC framework; it
 * dispatches into the P4-specific routines defined below.  We advertise
 * precise overflow interrupts (the OVF bit identifies the counter).
 */
static pcbe_ops_t p4_pcbe_ops = {
	PCBE_VER_1,
	CPC_CAP_OVERFLOW_INTERRUPT | CPC_CAP_OVERFLOW_PRECISE,
	p4_pcbe_ncounters,
	p4_pcbe_impl_name,
	p4_pcbe_cpuref,
	p4_pcbe_list_events,
	p4_pcbe_list_attrs,
	p4_pcbe_event_coverage,
	p4_pcbe_overflow_bitmap,
	p4_pcbe_configure,
	p4_pcbe_program,
	p4_pcbe_allstop,
	p4_pcbe_sample,
	p4_pcbe_free
};
81 
82 /*
83  * P4 Configuration Flags.
84  */
#define	P4_THIS_USR	0x1 /* HTT: Measure usr events on this logical CPU */
#define	P4_THIS_SYS	0x2 /* HTT: Measure os events on this logical CPU */
#define	P4_SIBLING_USR	0x4 /* HTT: Measure usr events on other logical CPU */
#define	P4_SIBLING_SYS	0x8 /* HTT: Measure os events on other logical CPU */
#define	P4_PMI		0x10 /* HTT: Set PMI bit for local logical CPU */

/*
 * Per-counter configuration built by p4_pcbe_configure() and later
 * consumed by p4_pcbe_program()/p4_pcbe_sample()/p4_pcbe_free().
 */
typedef struct _p4_pcbe_config {
	uint8_t		p4_flags;	/* P4_* flags above (HTT only) */
	uint8_t		p4_picno;	/* From 0 to 17 */
	uint8_t		p4_escr_ndx;	/* Which ESCR to use */
	uint32_t	p4_escr;	/* Value to program in selected ESCR */
	uint32_t	p4_cccr;	/* Value to program in counter's CCCR */
	uint64_t	p4_rawpic;	/* 40-bit raw counter value */
} p4_pcbe_config_t;

/* NOTE(review): cntr_map_t appears unused in this file — confirm. */
typedef uint32_t cntr_map_t;

/*
 * Describes one Event Selection and Control Register (ESCR).
 */
typedef struct _p4_escr {
	int		pe_num;		/* ESCR select value for the CCCR */
	uint32_t	pe_addr;	/* MSR address of this ESCR */
	uint32_t	pe_map; /* bitmap of counters; bit 1 means ctr 0 */
} p4_escr_t;
107 
#define	MASK40			UINT64_C(0xffffffffff)	/* ctrs are 40 bits */

/*
 * CCCR field definitions.
 *
 * Note that the Intel Developer's Manual states that the reserved field at
 * bit location 16 and 17 must be set to 11.  (On HyperThreaded processors
 * this field is the active_thread field; see the note in
 * p4_pcbe_configure() where it is filled in.)
 */
#define	CCCR_ENABLE_SHIFT	12
#define	CCCR_ESCR_SEL_SHIFT	13
#define	CCCR_ACTV_THR_SHIFT	16
#define	CCCR_COMPARE_SHIFT	18
#define	CCCR_COMPLEMENT_SHIFT	19
#define	CCCR_THRESHOLD_SHIFT	20
#define	CCCR_EDGE_SHIFT		24
#define	CCCR_OVF_PMI_SHIFT	26
#define	CCCR_OVF_PMI_T0_SHIFT	26	/* aliases CCCR_OVF_PMI_SHIFT */
#define	CCCR_OVF_PMI_T1_SHIFT	27
#define	CCCR_OVF_SHIFT		31
#define	CCCR_ACTV_THR_MASK	0x3
#define	CCCR_THRESHOLD_MAX	0xF
#define	CCCR_ENABLE		(1U << CCCR_ENABLE_SHIFT)
#define	CCCR_COMPARE		(1U << CCCR_COMPARE_SHIFT)
#define	CCCR_COMPLEMENT		(1U << CCCR_COMPLEMENT_SHIFT)
#define	CCCR_EDGE		(1U << CCCR_EDGE_SHIFT)
#define	CCCR_OVF_PMI		(1U << CCCR_OVF_PMI_SHIFT)
#define	CCCR_OVF_PMI_T0		(1U << CCCR_OVF_PMI_T0_SHIFT)
#define	CCCR_OVF_PMI_T1		(1U << CCCR_OVF_PMI_T1_SHIFT)
#define	CCCR_INIT		CCCR_ENABLE
#define	CCCR_OVF		(1U << CCCR_OVF_SHIFT)

#define	ESCR_EVSEL_SHIFT	25
#define	ESCR_EVMASK_SHIFT	9
#define	ESCR_TAG_VALUE_SHIFT	5
#define	ESCR_TAG_VALUE_MAX	0xF
#define	ESCR_TAG_ENABLE_SHIFT	4
#define	ESCR_USR_SHIFT		2
#define	ESCR_OS_SHIFT		3
#define	ESCR_USR		(1U << ESCR_USR_SHIFT)
#define	ESCR_OS			(1U << ESCR_OS_SHIFT)
#define	ESCR_TAG_ENABLE		(1U << ESCR_TAG_ENABLE_SHIFT)

/*
 * HyperThreaded ESCR fields.  The T0 usr/os bits occupy the same
 * positions as ESCR_USR/ESCR_OS above.
 */
#define	ESCR_T0_OS_SHIFT	3
#define	ESCR_T0_USR_SHIFT	2
#define	ESCR_T1_OS_SHIFT	1
#define	ESCR_T1_USR_SHIFT	0
#define	ESCR_T0_OS		(1U << ESCR_T0_OS_SHIFT)
#define	ESCR_T0_USR		(1U << ESCR_T0_USR_SHIFT)
#define	ESCR_T1_OS		(1U << ESCR_T1_OS_SHIFT)
#define	ESCR_T1_USR		(1U << ESCR_T1_USR_SHIFT)

/*
 * ESCRs are grouped by counter; each group of ESCRs is associated with a
 * distinct group of counters. Use these macros to fill in the table below.
 */
#define	BPU0_map	(0x1 | 0x2)		/* Counters 0 and 1 */
#define	BPU2_map	(0x4 | 0x8)		/* Counters 2 and 3 */
#define	MS0_map		(0x10 | 0x20)		/* Counters 4 and 5 */
#define	MS2_map		(0x40 | 0x80)		/* Counters 6 and 7 */
#define	FLAME0_map	(0x100 | 0x200)		/* Counters 8 and 9 */
#define	FLAME2_map	(0x400 | 0x800)		/* Counters 10 and 11 */
#define	IQ0_map		(0x1000 | 0x2000 | 0x10000) /* Counters 12, 13, 16 */
#define	IQ2_map		(0x4000 | 0x8000 | 0x20000) /* Counters 14, 15, 17 */
174 
/*
 * Table describing the 45 Event Selection and Control Registers (ESCRs).
 * Each row gives the ESCR select number (programmed into the CCCR), the
 * ESCR's MSR address, and the bitmap of counters it can feed.  The
 * interleaved #defines name each ESCR's bit within an ESCR bitmap
 * (p4_event_t.pe_escr_map / p4_ctr_t.pc_map).
 */
const p4_escr_t p4_escrs[] = {
#define	BPU0 (1)
	{ 0, 0x3B2, BPU0_map },		/* 0 */
#define	IS0 (1ULL << 1)
	{ 1, 0x3B4, BPU0_map },		/* 1 */
#define	MOB0 (1ULL << 2)
	{ 2, 0x3AA, BPU0_map },		/* 2 */
#define	ITLB0 (1ULL << 3)
	{ 3, 0x3B6, BPU0_map },		/* 3 */
#define	PMH0 (1ULL << 4)
	{ 4, 0x3AC, BPU0_map },		/* 4 */
#define	IX0 (1ULL << 5)
	{ 5, 0x3C8, BPU0_map },		/* 5 */
#define	FSB0 (1ULL << 6)
	{ 6, 0x3A2, BPU0_map },		/* 6 */
#define	BSU0 (1ULL << 7)
	{ 7, 0x3A0, BPU0_map },		/* 7 */
#define	BPU1 (1ULL << 8)
	{ 0, 0x3B3, BPU2_map },		/* 8 */
#define	IS1 (1ULL << 9)
	{ 1, 0x3B5, BPU2_map },		/* 9 */
#define	MOB1 (1ULL << 10)
	{ 2, 0x3AB, BPU2_map },		/* 10 */
#define	ITLB1 (1ULL << 11)
	{ 3, 0x3B7, BPU2_map },		/* 11 */
#define	PMH1 (1ULL << 12)
	{ 4, 0x3AD, BPU2_map },		/* 12 */
#define	IX1 (1ULL << 13)
	{ 5, 0x3C9, BPU2_map },		/* 13 */
#define	FSB1 (1ULL << 14)
	{ 6, 0x3A3, BPU2_map },		/* 14 */
#define	BSU1 (1ULL << 15)
	{ 7, 0x3A1, BPU2_map },		/* 15 */
#define	MS0 (1ULL << 16)
	{ 0, 0x3C0, MS0_map },		/* 16 */
#define	TC0 (1ULL << 17)
	{ 1, 0x3C4, MS0_map },		/* 17 */
#define	TBPU0 (1ULL << 18)
	{ 2, 0x3C2, MS0_map },		/* 18 */
#define	MS1 (1ULL << 19)
	{ 0, 0x3C1, MS2_map },		/* 19 */
#define	TC1 (1ULL << 20)
	{ 1, 0x3C5, MS2_map },		/* 20 */
#define	TBPU1 (1ULL << 21)
	{ 2, 0x3C3, MS2_map },		/* 21 */
#define	FLAME0 (1ULL << 22)
	{ 0, 0x3A6, FLAME0_map },	/* 22 */
#define	FIRM0 (1ULL << 23)
	{ 1, 0x3A4, FLAME0_map },	/* 23 */
#define	SAAT0 (1ULL << 24)
	{ 2, 0x3AE, FLAME0_map },	/* 24 */
#define	U2L0 (1ULL << 25)
	{ 3, 0x3B0, FLAME0_map },	/* 25 */
#define	DAC0 (1ULL << 26)
	{ 5, 0x3A8, FLAME0_map },	/* 26 */
#define	FLAME1 (1ULL << 27)
	{ 0, 0x3A7, FLAME2_map },	/* 27 */
#define	FIRM1 (1ULL << 28)
	{ 1, 0x3A5, FLAME2_map },	/* 28 */
#define	SAAT1 (1ULL << 29)
	{ 2, 0x3AF, FLAME2_map },	/* 29 */
#define	U2L1 (1ULL << 30)
	{ 3, 0x3B1, FLAME2_map },	/* 30 */
#define	DAC1 (1ULL << 31)
	{ 5, 0x3A9, FLAME2_map },	/* 31 */
#define	IQ0 (1ULL << 32)
	{ 0, 0x3BA, IQ0_map },		/* 32 */
#define	ALF0 (1ULL << 33)
	{ 1, 0x3CA, IQ0_map },		/* 33 */
#define	RAT0 (1ULL << 34)
	{ 2, 0x3BC, IQ0_map },		/* 34 */
#define	SSU0 (1ULL << 35)
	{ 3, 0x3BE, IQ0_map },		/* 35 */
#define	CRU0 (1ULL << 36)
	{ 4, 0x3B8, IQ0_map },		/* 36 */
#define	CRU2 (1ULL << 37)
	{ 5, 0x3CC, IQ0_map },		/* 37 */
#define	CRU4 (1ULL << 38)
	{ 6, 0x3E0, IQ0_map },		/* 38 */
#define	IQ1 (1ULL << 39)
	{ 0, 0x3BB, IQ2_map },		/* 39 */
#define	ALF1 (1ULL << 40)
	{ 1, 0x3CB, IQ2_map },		/* 40 */
#define	RAT1 (1ULL << 41)
	{ 2, 0x3BD, IQ2_map },		/* 41 */
#define	CRU1 (1ULL << 42)
	{ 4, 0x3B9, IQ2_map },		/* 42 */
#define	CRU3 (1ULL << 43)
	{ 5, 0x3CD, IQ2_map },		/* 43 */
#define	CRU5 (1ULL << 44)
	{ 6, 0x3E1, IQ2_map }		/* 44 */
};

/* Index of the last valid entry in p4_escrs[] (45 entries, 0 through 44). */
#define	ESCR_MAX_INDEX 44
272 
/*
 * Describes one of the 18 performance counters.
 */
typedef struct _p4_ctr {
	uint32_t	pc_caddr;	/* counter MSR address */
	uint32_t	pc_ctladdr;	/* counter's CCCR MSR address */
	uint64_t	pc_map;		/* bitmap of ESCRs controlling ctr */
} p4_ctr_t;

const p4_ctr_t p4_ctrs[18] = {
{ /* BPU_COUNTER0 */ 0x300, 0x360, BSU0|FSB0|MOB0|PMH0|BPU0|IS0|ITLB0|IX0},
{ /* BPU_COUNTER1 */ 0x301, 0x361, BSU0|FSB0|MOB0|PMH0|BPU0|IS0|ITLB0|IX0},
{ /* BPU_COUNTER2 */ 0x302, 0x362, BSU1|FSB1|MOB1|PMH1|BPU1|IS1|ITLB1|IX1},
{ /* BPU_COUNTER3 */ 0x303, 0x363, BSU1|FSB1|MOB1|PMH1|BPU1|IS1|ITLB1|IX1},
{ /* MS_COUNTER0 */  0x304, 0x364, MS0|TBPU0|TC0 },
{ /* MS_COUNTER1 */  0x305, 0x365, MS0|TBPU0|TC0 },
{ /* MS_COUNTER2 */  0x306, 0x366, MS1|TBPU1|TC1 },
{ /* MS_COUNTER3 */  0x307, 0x367, MS1|TBPU1|TC1 },
{ /* FLAME_COUNTER0 */ 0x308, 0x368, FIRM0|FLAME0|DAC0|SAAT0|U2L0 },
{ /* FLAME_COUNTER1 */ 0x309, 0x369, FIRM0|FLAME0|DAC0|SAAT0|U2L0 },
{ /* FLAME_COUNTER2 */ 0x30A, 0x36A, FIRM1|FLAME1|DAC1|SAAT1|U2L1 },
{ /* FLAME_COUNTER3 */ 0x30B, 0x36B, FIRM1|FLAME1|DAC1|SAAT1|U2L1 },
{ /* IQ_COUNTER0 */  0x30C, 0x36C, CRU0|CRU2|CRU4|IQ0|RAT0|SSU0|ALF0 },
{ /* IQ_COUNTER1 */  0x30D, 0x36D, CRU0|CRU2|CRU4|IQ0|RAT0|SSU0|ALF0 },
{ /* IQ_COUNTER2 */  0x30E, 0x36E, CRU1|CRU3|CRU5|IQ1|RAT1|ALF1 },
{ /* IQ_COUNTER3 */  0x30F, 0x36F, CRU1|CRU3|CRU5|IQ1|RAT1|ALF1 },
{ /* IQ_COUNTER4 */  0x310, 0x370, CRU0|CRU2|CRU4|IQ0|RAT0|SSU0|ALF0 },
{ /* IQ_COUNTER5 */  0x311, 0x371, CRU1|CRU3|CRU5|IQ1|RAT1|ALF1 }
};
299 
/*
 * Describes one countable event: which ESCRs can count it and with which
 * select/mask values, and which counters can receive it.
 */
typedef struct _p4_event {
	char		*pe_name;	/* Name of event according to docs */
	uint64_t	pe_escr_map;	/* Bitmap of ESCRs capable of event */
	uint32_t	pe_escr_mask;	/* permissible ESCR event mask */
	uint8_t		pe_ev;		/* ESCR event select value */
	uint16_t	pe_cccr;	/* CCCR select value */
	uint32_t	pe_ctr_mask;	/* Bitmap of capable counters */
} p4_event_t;
308 
309 #define	C(n) (1 << n)
310 
/*
 * Table of events, terminated by an entry with a NULL pe_name.  Event
 * select/mask values come from the processor documentation referenced by
 * p4_pcbe_cpuref().
 */
p4_event_t p4_events[] = {
{ "branch_retired", CRU2|CRU3, 0xF, 0x6, 0x5, C(12)|C(13)|C(14)|C(15)|C(16) },
{ "mispred_branch_retired", CRU0|CRU1, 0x1, 0x3, 0x4,
	C(12)|C(13)|C(14)|C(15)|C(16) },
{ "TC_deliver_mode", TC0|TC1, 0xFF, 0x1, 0x1, C(4)|C(5)|C(6)|C(7) },
{ "BPU_fetch_request", BPU0|BPU1, 0x1, 0x3, 0x0, C(0)|C(1)|C(2)|C(3) },
{ "ITLB_reference", ITLB0|ITLB1, 0x7, 0x18, 0x3, C(0)|C(1)|C(2)|C(3) },
{ "memory_cancel", DAC0|DAC1, 0x6, 0x2, 0x5, C(8)|C(9)|C(10)|C(11) },
{ "memory_complete", SAAT0|SAAT1, 0x3, 0x8, 0x2, C(8)|C(9)|C(10)|C(11) },
{ "load_port_replay", SAAT0|SAAT1, 0x1, 0x4, 0x2, C(8)|C(9)|C(10)|C(11) },
{ "store_port_replay", SAAT0|SAAT1, 0x1, 0x5, 0x2, C(8)|C(9)|C(10)|C(11) },
{ "MOB_load_replay", MOB0|MOB1, 0x35, 0x3, 0x2, C(0)|C(1)|C(2)|C(3) },
{ "page_walk_type", PMH0|PMH1, 0x3, 0x1, 0x4, C(0)|C(1)|C(2)|C(3) },
{ "BSQ_cache_reference", BSU0|BSU1, 0x73F, 0xC, 0x7, C(0)|C(1)|C(2)|C(3) },
{ "IOQ_allocation", FSB0, 0xEFFF, 0x3, 0x6, C(0)|C(1) },
{ "IOQ_active_entries", FSB1, 0xEFFF, 0x1A, 0x6, C(2)|C(3) },
{ "FSB_data_activity", FSB0|FSB1, 0x3F, 0x17, 0x6, C(0)|C(1)|C(2)|C(3) },
{ "BSQ_allocation", BSU0, 0x3FEF, 0x5, 0x7, C(0)|C(1) },
{ "bsq_active_entries", BSU1, 0x3FEF, 0x6, 0x7, C(2)|C(3) },
{ "x87_assist", CRU2|CRU3, 0x1F, 0x3, 0x5, C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
{ "SSE_input_assist", FIRM0|FIRM1, 0x8000, 0x34, 0x1, C(8)|C(9)|C(10)|C(11) },
{ "packed_SP_uop", FIRM0|FIRM1, 0x8000, 0x8, 0x1, C(8)|C(9)|C(10)|C(11) },
{ "packed_DP_uop", FIRM0|FIRM1, 0x8000, 0xC, 0x1, C(8)|C(9)|C(10)|C(11) },
{ "scalar_SP_uop", FIRM0|FIRM1, 0x8000, 0xA, 0x1, C(8)|C(9)|C(10)|C(11) },
{ "scalar_DP_uop", FIRM0|FIRM1, 0x8000, 0xE, 0x1, C(8)|C(9)|C(10)|C(11) },
{ "64bit_MMX_uop", FIRM0|FIRM1, 0x8000, 0x2, 0x1, C(8)|C(9)|C(10)|C(11) },
{ "128bit_MMX_uop", FIRM0|FIRM1, 0x8000, 0x1A, 0x1, C(8)|C(9)|C(10)|C(11) },
{ "x87_FP_uop", FIRM0|FIRM1, 0x8000, 0x4, 0x1, C(8)|C(9)|C(10)|C(11) },
{ "x87_SIMD_moves_uop", FIRM0|FIRM1, 0x18, 0x2E, 0x1, C(8)|C(9)|C(10)|C(11) },
{ "machine_clear", CRU2|CRU3, 0xD, 0x2, 0x5,
	C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
{ "global_power_events", FSB0|FSB1, 0x1, 0x5, 0x6, C(0)|C(1)|C(2)|C(3) },
{ "tc_ms_xfer", MS0|MS1, 0x1, 0x5, 0x0, C(4)|C(5)|C(6)|C(7) },
{ "uop_queue_writes", MS0|MS1, 0x7, 0x9, 0x0, C(4)|C(5)|C(6)|C(7) },
{ "front_end_event", CRU2|CRU3, 0x3, 0x8, 0x5,
	C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
{ "execution_event", CRU2|CRU3, 0xFF, 0xC, 0x5,
	C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
{ "replay_event", CRU2|CRU3, 0x3, 0x9, 0x5,
	C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
{ "instr_retired", CRU0|CRU1, 0xF, 0x2, 0x4,
	C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
{ "uops_retired", CRU0|CRU1, 0x3, 0x1, 0x4,
	C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
{ "uop_type", RAT0|RAT1, 0x3, 0x2, 0x2, C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
{ "retired_mispred_branch_type", TBPU0|TBPU1, 0x1F, 0x5, 0x2,
	C(4)|C(5)|C(6)|C(7)},
{ "retired_branch_type", TBPU0|TBPU1, 0x1F, 0x4, 0x2, C(4)|C(5)|C(6)|C(7) },
{ NULL, 0, 0, 0, 0 }	/* terminator */
};
361 
/*
 * Indicates whether the "rdpmc" instruction is available on this processor.
 */
static int p4_rdpmc_avail = 0;

/* NOTE(review): p4_cccrstop appears to be unused in this file — confirm. */
static const uint64_t p4_cccrstop = 0;

/* Per-counter comma-separated event name lists, built in p4_pcbe_init(). */
static char *p4_eventlist[18];

/*
 * If set, this processor has HyperThreading.
 */
static int p4_htt = 0;

/* CPUID family reported by Pentium 4 processors. */
#define	P4_FAMILY	0xF
377 
378 static int
379 p4_pcbe_init(void)
380 {
381 	int		i;
382 	size_t		size;
383 	p4_event_t	*ev;
384 
385 	/*
386 	 * If we're not running on a P4, refuse to load.
387 	 */
388 	if (cpuid_getvendor(CPU) != X86_VENDOR_Intel ||
389 	    cpuid_getfamily(CPU) != P4_FAMILY)
390 		return (-1);
391 
392 	/*
393 	 * Set up the event lists for each counter.
394 	 *
395 	 * First pass calculates the size of the event list, and the second
396 	 * pass copies each event name into the event list.
397 	 */
398 	for (i = 0; i < 18; i++) {
399 		size = 0;
400 		for (ev = p4_events; ev->pe_name != NULL; ev++) {
401 			if (ev->pe_ctr_mask & C(i))
402 				size += strlen(ev->pe_name) + 1;
403 		}
404 
405 		/*
406 		 * We use 'size + 1' here to ensure room for the final
407 		 * strcat when it terminates the string.
408 		 */
409 		p4_eventlist[i] = (char *)kmem_alloc(size + 1, KM_SLEEP);
410 		*p4_eventlist[i] = '\0';
411 
412 		for (ev = p4_events; ev->pe_name != NULL; ev++) {
413 			if (ev->pe_ctr_mask & C(i)) {
414 				(void) strcat(p4_eventlist[i], ev->pe_name);
415 				(void) strcat(p4_eventlist[i], ",");
416 			}
417 		}
418 		/*
419 		 * Remove trailing ','
420 		 */
421 		p4_eventlist[i][size - 1] = '\0';
422 	}
423 
424 	if (x86_feature & X86_MMX)
425 		p4_rdpmc_avail = 1;
426 	/*
427 	 * The X86_HTT flag may disappear soon, so we'll isolate the impact of
428 	 * its demise to the following if().
429 	 */
430 	if (x86_feature & X86_HTT)
431 		p4_htt = 1;
432 
433 	return (0);
434 }
435 
/*
 * Return the number of performance counters: the P4 has 18.
 */
static uint_t
p4_pcbe_ncounters(void)
{
	return (18);
}
441 
442 static const char *
443 p4_pcbe_impl_name(void)
444 {
445 	if (p4_htt)
446 		return ("Pentium 4 with HyperThreading");
447 	return ("Pentium 4");
448 }
449 
/*
 * Point users at the reference documentation describing the raw events.
 * (Adjacent string literals concatenate; no line-splicing needed.)
 */
static const char *
p4_pcbe_cpuref(void)
{
	return ("See Appendix A.1 of the \"IA-32 Intel Architecture Software "
	    "Developer's Manual Volume 3: System Programming Guide,\" "
	    "Order # 245472-012, 2003");
}
457 
458 static char *
459 p4_pcbe_list_events(uint_t picnum)
460 {
461 	ASSERT(picnum >= 0 && picnum < 18);
462 
463 	return (p4_eventlist[picnum]);
464 }
465 
466 #define	P4_ATTRS "emask,tag,compare,complement,threshold,edge"
467 
468 static char *
469 p4_pcbe_list_attrs(void)
470 {
471 	if (p4_htt)
472 		return (P4_ATTRS ",active_thread,count_sibling_usr,"
473 		    "count_sibling_sys");
474 	return (P4_ATTRS);
475 }
476 
477 static uint64_t
478 p4_pcbe_event_coverage(char *event)
479 {
480 	p4_event_t *ev;
481 
482 	for (ev = p4_events; ev->pe_name != NULL; ev++) {
483 		if (strcmp(event, ev->pe_name) == 0)
484 			break;
485 	}
486 
487 	return (ev->pe_ctr_mask);
488 }
489 
/*
 * Return a bitmap with bit i set for each counter i that has overflowed,
 * and re-arm the overflow interrupt.
 */
static uint64_t
p4_pcbe_overflow_bitmap(void)
{
	extern int	kcpc_hw_overflow_intr_installed;
	uint64_t	ret = 0;
	int		i;

	/*
	 * The CCCR's OVF bit indicates that the corresponding counter has
	 * overflowed. It must be explicitly cleared by software, so it is
	 * safe to read the CCCR values here.
	 */
	for (i = 0; i < 18; i++) {
		if (rdmsr(p4_ctrs[i].pc_ctladdr) & CCCR_OVF)
			ret |= (1 << i);
	}

	/*
	 * Pentium 4 and Xeon turn off the CPC interrupt mask bit in the LVT at
	 * every overflow. Turn it back on here.
	 */
	ASSERT(kcpc_hw_overflow_intr_installed);
	(*kcpc_hw_enable_cpc_intr)();

	return (ret);
}
516 
517 static int
518 p4_escr_inuse(p4_pcbe_config_t **cfgs, int escr_ndx)
519 {
520 	int i;
521 
522 	for (i = 0; i < 18; i++) {
523 		if (cfgs[i] == NULL)
524 			continue;
525 		if (cfgs[i]->p4_escr_ndx == escr_ndx)
526 			return (1);
527 	}
528 
529 	return (0);
530 }
531 
532 static void
533 build_cfgs(p4_pcbe_config_t *cfgs[18], uint64_t *data[18], void *token)
534 {
535 	p4_pcbe_config_t	*cfg = NULL;
536 	uint64_t		*daddr;
537 
538 	bzero(cfgs, 18 * sizeof (p4_pcbe_config_t *));
539 
540 	do {
541 		cfg = (p4_pcbe_config_t *)kcpc_next_config(token, cfg, &daddr);
542 
543 		if (cfg != NULL) {
544 			ASSERT(cfg->p4_picno < 18);
545 			cfgs[cfg->p4_picno] = cfg;
546 			if (data != NULL) {
547 				ASSERT(daddr != NULL);
548 				data[cfg->p4_picno] = daddr;
549 			}
550 		}
551 	} while (cfg != NULL);
552 }
553 
554 /*
555  * Programming a counter:
556  *
557  * Select event.
558  * Choose an ESCR capable of counting that event.
559  * Set up the ESCR with the desired parameters (usr, sys, tag).
560  * Set up the CCCR to point to the selected ESCR.
561  * Set the CCCR parameters (overflow, cascade, edge, etc).
562  */
563 static int
564 p4_pcbe_configure(uint_t picnum, char *eventname, uint64_t preset,
565     uint32_t flags, uint_t nattrs, kcpc_attr_t *attrs, void **data,
566     void *token)
567 {
568 	p4_pcbe_config_t	*cfgs[18];
569 	p4_pcbe_config_t	*cfg;
570 	p4_event_t		*ev;
571 	int			escr_ndx;
572 	int			i;
573 	uint16_t		emask = 0;
574 	uint8_t			tag;
575 	int			use_tag = 0;
576 	int			active_thread = 0x3; /* default is "any" */
577 	int			compare = 0;
578 	int			complement = 0;
579 	int			threshold = 0;
580 	int			edge = 0;
581 	int			sibling_usr = 0; /* count usr on other cpu */
582 	int			sibling_sys = 0; /* count sys on other cpu */
583 
584 	/*
585 	 * If we've been handed an existing configuration, we need only preset
586 	 * the counter value.
587 	 */
588 	if (*data != NULL) {
589 		cfg = *data;
590 		cfg->p4_rawpic = preset & MASK40;
591 		return (0);
592 	}
593 
594 	if (picnum < 0 || picnum >= 18)
595 		return (CPC_INVALID_PICNUM);
596 
597 	for (ev = p4_events; ev->pe_name != NULL; ev++) {
598 		if (strcmp(eventname, ev->pe_name) == 0)
599 			break;
600 	}
601 	if (ev->pe_name == NULL)
602 		return (CPC_INVALID_EVENT);
603 
604 	build_cfgs(cfgs, NULL, token);
605 
606 	/*
607 	 * Find an ESCR capable of counting this event.
608 	 */
609 	for (escr_ndx = 0; escr_ndx < ESCR_MAX_INDEX; escr_ndx++) {
610 		if ((ev->pe_escr_map & (1ULL << escr_ndx)) &&
611 		    p4_escr_inuse(cfgs, escr_ndx) == 0)
612 			break;
613 	}
614 
615 	/*
616 	 * All ESCRs capable of counting this event are already being
617 	 * used.
618 	 */
619 	if (escr_ndx == ESCR_MAX_INDEX)
620 		return (CPC_RESOURCE_UNAVAIL);
621 
622 	/*
623 	 * At this point, ev points to the desired event and escr is the index
624 	 * of a capable and available ESCR.
625 	 *
626 	 * Now process and verify the attributes.
627 	 */
628 	for (i = 0; i < nattrs; i++) {
629 		if (strcmp("emask", attrs[i].ka_name) == 0) {
630 			if ((attrs[i].ka_val | ev->pe_escr_mask)
631 			    != ev->pe_escr_mask)
632 				return (CPC_ATTRIBUTE_OUT_OF_RANGE);
633 			emask = attrs[i].ka_val;
634 			continue;
635 		} else if (strcmp("tag", attrs[i].ka_name) == 0) {
636 			if (attrs[i].ka_val > ESCR_TAG_VALUE_MAX)
637 				return (CPC_ATTRIBUTE_OUT_OF_RANGE);
638 			tag = attrs[i].ka_val;
639 			use_tag = 1;
640 			continue;
641 		} else if (strcmp("compare", attrs[i].ka_name) == 0) {
642 			if (attrs[i].ka_val != 0)
643 				compare = 1;
644 			continue;
645 		} else if (strcmp("complement", attrs[i].ka_name) == 0) {
646 			if (attrs[i].ka_val != 0)
647 				complement = 1;
648 			continue;
649 		} else if (strcmp("threshold", attrs[i].ka_name) == 0) {
650 			if (attrs[i].ka_val > CCCR_THRESHOLD_MAX)
651 				return (CPC_ATTRIBUTE_OUT_OF_RANGE);
652 			threshold = attrs[i].ka_val;
653 			continue;
654 		} else if (strcmp("edge", attrs[i].ka_name) == 0) {
655 			if (attrs[i].ka_val != 0)
656 				edge = 1;
657 			continue;
658 		}
659 
660 		/*
661 		 * The remaining attributes are valid only on HyperThreaded P4s
662 		 * for processes with the "cpc_cpu" privilege.
663 		 */
664 		if (p4_htt == 0)
665 			return (CPC_INVALID_ATTRIBUTE);
666 
667 		if (secpolicy_cpc_cpu(crgetcred()) != 0)
668 			return (CPC_ATTR_REQUIRES_PRIVILEGE);
669 
670 		if (strcmp("active_thread", attrs[i].ka_name) == 0) {
671 			if ((attrs[i].ka_val | CCCR_ACTV_THR_MASK) !=
672 			    CCCR_ACTV_THR_MASK)
673 				return (CPC_ATTRIBUTE_OUT_OF_RANGE);
674 			active_thread = (int)attrs[i].ka_val;
675 		} else if (strcmp("count_sibling_usr", attrs[i].ka_name) == 0) {
676 			if (attrs[i].ka_val != 0)
677 				sibling_usr = 1;
678 		} else if (strcmp("count_sibling_sys", attrs[i].ka_name) == 0) {
679 			if (attrs[i].ka_val != 0)
680 				sibling_sys = 1;
681 		} else
682 			return (CPC_INVALID_ATTRIBUTE);
683 	}
684 
685 	/*
686 	 * Make sure the counter can count this event
687 	 */
688 	if ((ev->pe_ctr_mask & C(picnum)) == 0)
689 		return (CPC_PIC_NOT_CAPABLE);
690 
691 	/*
692 	 * Find an ESCR that lines up with the event _and_ the counter.
693 	 */
694 	for (escr_ndx = 0; escr_ndx < ESCR_MAX_INDEX; escr_ndx++) {
695 		if ((ev->pe_escr_map & (1ULL << escr_ndx)) &&
696 		    (p4_escrs[escr_ndx].pe_map & (1 << picnum)) &&
697 		    p4_escr_inuse(cfgs, escr_ndx) == 0)
698 			break;
699 	}
700 	if (escr_ndx == ESCR_MAX_INDEX)
701 		return (CPC_RESOURCE_UNAVAIL);
702 
703 	cfg = (p4_pcbe_config_t *)kmem_alloc(sizeof (p4_pcbe_config_t),
704 	    KM_SLEEP);
705 
706 	cfg->p4_flags = 0;
707 	cfg->p4_picno = picnum;
708 	cfg->p4_escr_ndx = escr_ndx;
709 	cfg->p4_escr = (ev->pe_ev << ESCR_EVSEL_SHIFT) |
710 	    (emask << ESCR_EVMASK_SHIFT);
711 
712 	if (use_tag == 1) {
713 		cfg->p4_escr |= tag << ESCR_TAG_VALUE_SHIFT;
714 		cfg->p4_escr |= ESCR_TAG_ENABLE;
715 	}
716 
717 	if (p4_htt) {
718 		/*
719 		 * This is a HyperThreaded P4.  Since we don't know which
720 		 * logical CPU this configuration will eventually be programmed
721 		 * on, we can't yet decide which fields of the ESCR to select.
722 		 *
723 		 * Record the necessary information in the flags for later.
724 		 */
725 		if (flags & CPC_COUNT_USER)
726 			cfg->p4_flags |= P4_THIS_USR;
727 		if (flags & CPC_COUNT_SYSTEM)
728 			cfg->p4_flags |= P4_THIS_SYS;
729 		if (p4_htt && sibling_usr)
730 			cfg->p4_flags |= P4_SIBLING_USR;
731 		if (p4_htt && sibling_sys)
732 			cfg->p4_flags |= P4_SIBLING_SYS;
733 	} else {
734 		/*
735 		 * This is not HyperThreaded, so we can determine the exact
736 		 * ESCR value necessary now.
737 		 */
738 		if (flags & CPC_COUNT_USER)
739 			cfg->p4_escr |= ESCR_USR;
740 		if (flags & CPC_COUNT_SYSTEM)
741 			cfg->p4_escr |= ESCR_OS;
742 	}
743 
744 	cfg->p4_rawpic = preset & MASK40;
745 
746 	/*
747 	 * Even on non-HT P4s, Intel states the active_thread field (marked as
748 	 * "reserved" for the non-HT chips) must be set to all 1s.
749 	 */
750 	cfg->p4_cccr = CCCR_INIT | (active_thread << CCCR_ACTV_THR_SHIFT);
751 	if (compare)
752 		cfg->p4_cccr |= CCCR_COMPARE;
753 	if (complement)
754 		cfg->p4_cccr |= CCCR_COMPLEMENT;
755 	cfg->p4_cccr |= threshold << CCCR_THRESHOLD_SHIFT;
756 	if (edge)
757 		cfg->p4_cccr |= CCCR_EDGE;
758 	cfg->p4_cccr |= p4_escrs[cfg->p4_escr_ndx].pe_num
759 	    << CCCR_ESCR_SEL_SHIFT;
760 	if (flags & CPC_OVF_NOTIFY_EMT) {
761 		if (p4_htt)
762 			cfg->p4_flags |= P4_PMI;
763 		else {
764 			/*
765 			 * If the user has asked for notification of overflows,
766 			 * we automatically program the hardware to generate an
767 			 * interrupt on overflow.
768 			 *
769 			 * This can only be programmed now if this P4 doesn't
770 			 * have HyperThreading. If it does, we must wait until
771 			 * we know which logical CPU we'll be programming.
772 			 */
773 			cfg->p4_cccr |= CCCR_OVF_PMI;
774 		}
775 	}
776 
777 	*data = cfg;
778 
779 	return (0);
780 }
781 
/*
 * Program this CPU's counters from the configurations attached to 'token':
 * stop everything, write the raw counter values and ESCRs, then write the
 * CCCRs (which is what actually enables counting).
 */
static void
p4_pcbe_program(void *token)
{
	int			i;
	uint64_t		cccr;
	p4_pcbe_config_t	*cfgs[18];

	p4_pcbe_allstop();

	build_cfgs(cfgs, NULL, token);

	/* Grant or revoke userland rdpmc access per this set's policy. */
	if (p4_rdpmc_avail) {
		ulong_t curcr4 = getcr4();
		if (kcpc_allow_nonpriv(token))
			setcr4(curcr4 | CR4_PCE);
		else
			setcr4(curcr4 & ~CR4_PCE);
	}

	/*
	 * Ideally we would start all counters with a single operation, but in
	 * P4 each counter is enabled individually via its CCCR. To minimize the
	 * probe effect of enabling the counters, we do it in two passes: the
	 * first programs the counter and ESCR, and the second programs the
	 * CCCR (and thus enables the counter).
	 */
	if (p4_htt) {
		int	lid = cpuid_get_clogid(CPU); /* Logical ID of CPU */

		for (i = 0; i < 18; i++) {
			uint64_t escr;

			if (cfgs[i] == NULL)
				continue;
			escr = (uint64_t)cfgs[i]->p4_escr;

			/*
			 * Select the T0 or T1 usr/os ESCR bits now that we
			 * know which logical CPU we are running on.
			 */
			if (cfgs[i]->p4_flags & P4_THIS_USR)
				escr |= (lid == 0) ? ESCR_T0_USR : ESCR_T1_USR;
			if (cfgs[i]->p4_flags & P4_THIS_SYS)
				escr |= (lid == 0) ? ESCR_T0_OS : ESCR_T1_OS;
			if (cfgs[i]->p4_flags & P4_SIBLING_USR)
				escr |= (lid == 0) ? ESCR_T1_USR : ESCR_T0_USR;
			if (cfgs[i]->p4_flags & P4_SIBLING_SYS)
				escr |= (lid == 0) ? ESCR_T1_OS : ESCR_T0_OS;

			wrmsr(p4_ctrs[i].pc_caddr, cfgs[i]->p4_rawpic);
			wrmsr(p4_escrs[cfgs[i]->p4_escr_ndx].pe_addr, escr);
		}

		for (i = 0; i < 18; i++) {
			if (cfgs[i] == NULL)
				continue;
			cccr = (uint64_t)cfgs[i]->p4_cccr;
			/*
			 * We always target the overflow interrupt at the
			 * logical CPU which is doing the counting.
			 */
			if (cfgs[i]->p4_flags & P4_PMI)
				cccr |= (lid == 0) ?
				    CCCR_OVF_PMI_T0 : CCCR_OVF_PMI_T1;
			wrmsr(p4_ctrs[i].pc_ctladdr, cccr);
		}
	} else {
		for (i = 0; i < 18; i++) {
			if (cfgs[i] == NULL)
				continue;
			wrmsr(p4_ctrs[i].pc_caddr, cfgs[i]->p4_rawpic);
			wrmsr(p4_escrs[cfgs[i]->p4_escr_ndx].pe_addr,
			    (uint64_t)cfgs[i]->p4_escr);
		}

		for (i = 0; i < 18; i++) {
			if (cfgs[i] == NULL)
				continue;
			wrmsr(p4_ctrs[i].pc_ctladdr,
			    (uint64_t)cfgs[i]->p4_cccr);
		}
	}
}
861 
/*
 * Stop all 18 counters by clearing their CCCRs, and revoke non-privileged
 * rdpmc access.
 */
static void
p4_pcbe_allstop(void)
{
	int		i;

	for (i = 0; i < 18; i++)
		wrmsr(p4_ctrs[i].pc_ctladdr, 0ULL);

	/* Clearing CR4.PCE restricts rdpmc to the kernel. */
	setcr4(getcr4() & ~CR4_PCE);
}
872 
873 
/*
 * Sample the hardware counters, folding the delta since the last sample
 * into each configuration's virtualized 64-bit counter.
 */
static void
p4_pcbe_sample(void *token)
{
	p4_pcbe_config_t	*cfgs[18];
	uint64_t		*addrs[18];
	uint64_t		curpic[18];
	int64_t			diff;
	int			i;

	/* Read all the raw counter values up front to minimize skew. */
	for (i = 0; i < 18; i++)
		curpic[i] = rdmsr(p4_ctrs[i].pc_caddr);

	build_cfgs(cfgs, addrs, token);

	for (i = 0; i < 18; i++) {
		if (cfgs[i] == NULL)
			continue;
		diff = curpic[i] - cfgs[i]->p4_rawpic;
		/*
		 * The hardware counters are 40 bits wide; a negative delta
		 * means the counter wrapped, so correct by adding 2^40.
		 */
		if (diff < 0)
			diff += (1ll << 40);
		*addrs[i] += diff;
		DTRACE_PROBE4(p4__pcbe__sample, int, i, uint64_t, *addrs[i],
		    uint64_t, curpic[i], uint64_t, cfgs[i]->p4_rawpic);
		/* Remember the (40-bit) baseline for the next sample. */
		cfgs[i]->p4_rawpic = *addrs[i] & MASK40;
	}
}
900 
/*
 * Release a configuration allocated by p4_pcbe_configure().
 */
static void
p4_pcbe_free(void *config)
{
	kmem_free(config, sizeof (p4_pcbe_config_t));
}
906 
/*
 * Loadable-module linkage for this PCBE.
 */
static struct modlpcbe modlpcbe = {
	&mod_pcbeops,
	"Pentium 4 Performance Counters v%I%",
	&p4_pcbe_ops
};

static struct modlinkage modl = {
	MODREV_1,
	&modlpcbe,
};
917 
918 int
919 _init(void)
920 {
921 	if (p4_pcbe_init() != 0)
922 		return (ENOTSUP);
923 	return (mod_install(&modl));
924 }
925 
/*
 * Module unload entry point.
 */
int
_fini(void)
{
	return (mod_remove(&modl));
}
931 
/*
 * Module information entry point.
 */
int
_info(struct modinfo *mi)
{
	return (mod_info(&modl, mi));
}
937