xref: /titanic_41/usr/src/uts/intel/pcbe/p4_pcbe.c (revision 8461248208fabd3a8230615f8615e5bf1b4dcdcb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Performance Counter Back-End for Pentium 4.
31  */
32 
33 #include <sys/cpuvar.h>
34 #include <sys/param.h>
35 #include <sys/cpc_impl.h>
36 #include <sys/cpc_pcbe.h>
37 #include <sys/inttypes.h>
38 #include <sys/errno.h>
39 #include <sys/systm.h>
40 #include <sys/archsystm.h>
41 #include <sys/x86_archext.h>
42 #include <sys/modctl.h>
43 #include <sys/sdt.h>
44 #include <sys/cred.h>
45 #include <sys/policy.h>
46 #include <sys/privregs.h>
47 
/*
 * Forward declarations for this module's entry points.  All but
 * p4_pcbe_init() are installed in the p4_pcbe_ops vector below;
 * p4_pcbe_init() is called from _init().
 */
static int p4_pcbe_init(void);
static uint_t p4_pcbe_ncounters(void);
static const char *p4_pcbe_impl_name(void);
static const char *p4_pcbe_cpuref(void);
static char *p4_pcbe_list_events(uint_t picnum);
static char *p4_pcbe_list_attrs(void);
static uint64_t p4_pcbe_event_coverage(char *event);
static uint64_t p4_pcbe_overflow_bitmap(void);
static int p4_pcbe_configure(uint_t picnum, char *event, uint64_t preset,
    uint32_t flags, uint_t nattrs, kcpc_attr_t *attrs, void **data,
    void *token);
static void p4_pcbe_program(void *token);
static void p4_pcbe_allstop(void);
static void p4_pcbe_sample(void *token);
static void p4_pcbe_free(void *config);

/*
 * Defined outside this file.  p4_pcbe_program() calls it on HyperThreaded
 * processors and treats a result of 0 as logical CPU T0, anything else as T1.
 */
extern int chip_plat_get_clogid(cpu_t *);
65 
/*
 * Operations vector for this back-end.  A pointer to it is stored in the
 * modlpcbe linkage structure near the bottom of this file.  The initializer
 * is positional, so the order of the entries must not change.
 */
static pcbe_ops_t p4_pcbe_ops = {
	PCBE_VER_1,
	CPC_CAP_OVERFLOW_INTERRUPT | CPC_CAP_OVERFLOW_PRECISE,
	p4_pcbe_ncounters,
	p4_pcbe_impl_name,
	p4_pcbe_cpuref,
	p4_pcbe_list_events,
	p4_pcbe_list_attrs,
	p4_pcbe_event_coverage,
	p4_pcbe_overflow_bitmap,
	p4_pcbe_configure,
	p4_pcbe_program,
	p4_pcbe_allstop,
	p4_pcbe_sample,
	p4_pcbe_free
};
82 
/*
 * P4 Configuration Flags.
 *
 * These are recorded in p4_pcbe_config_t.p4_flags by p4_pcbe_configure() on
 * HyperThreaded processors and turned into the matching T0/T1 ESCR and CCCR
 * bits by p4_pcbe_program(), once the logical CPU doing the counting is known.
 */
#define	P4_THIS_USR	0x1 /* HTT: Measure usr events on this logical CPU */
#define	P4_THIS_SYS	0x2 /* HTT: Measure os events on this logical CPU */
#define	P4_SIBLING_USR	0x4 /* HTT: Measure usr events on other logical CPU */
#define	P4_SIBLING_SYS	0x8 /* HTT: Measure os events on other logical CPU */
#define	P4_PMI		0x10 /* HTT: Set PMI bit for local logical CPU */
91 
/*
 * Per-counter configuration built by p4_pcbe_configure() and released by
 * p4_pcbe_free().
 */
typedef struct _p4_pcbe_config {
	uint8_t		p4_flags;	/* P4_* configuration flags */
	uint8_t		p4_picno;	/* From 0 to 17 */
	uint8_t		p4_escr_ndx;	/* Which ESCR to use */
	uint32_t	p4_escr;	/* Value to program in selected ESCR */
	uint32_t	p4_cccr;	/* Value to program in counter's CCCR */
	uint64_t	p4_rawpic;	/* 40-bit value loaded into the counter */
} p4_pcbe_config_t;
100 
/*
 * Bitmap of counters.
 * NOTE(review): this typedef is not referenced in the visible part of the
 * file; the counter bitmaps below are declared as plain uint32_t.
 */
typedef uint32_t cntr_map_t;
102 
/*
 * Description of one Event Selection and Control Register (ESCR).
 */
typedef struct _p4_escr {
	int		pe_num;	/* value placed in the CCCR's ESCR select field */
	uint32_t	pe_addr;	/* MSR address of this ESCR */
	uint32_t	pe_map; /* bitmap of counters; bit 0 means ctr 0 */
} p4_escr_t;
108 
/*
 * Mask selecting the low 40 bits of a value; applied to every value stored in
 * p4_pcbe_config_t.p4_rawpic.
 */
#define	MASK40			UINT64_C(0xffffffffff)
110 
/*
 * CCCR field definitions.
 *
 * Note that the Intel Developer's Manual states that the reserved field at
 * bit location 16 and 17 must be set to 11. (??)
 *
 * The *_SHIFT macros give bit positions within the CCCR; the unsuffixed
 * macros are the corresponding single-bit masks.  CCCR_OVF_PMI and
 * CCCR_OVF_PMI_T0 name the same bit (26).  CCCR_INIT is the base value that
 * p4_pcbe_configure() starts every CCCR from: just the enable bit.
 */
#define	CCCR_ENABLE_SHIFT	12
#define	CCCR_ESCR_SEL_SHIFT	13
#define	CCCR_ACTV_THR_SHIFT	16
#define	CCCR_COMPARE_SHIFT	18
#define	CCCR_COMPLEMENT_SHIFT	19
#define	CCCR_THRESHOLD_SHIFT	20
#define	CCCR_EDGE_SHIFT		24
#define	CCCR_OVF_PMI_SHIFT	26
#define	CCCR_OVF_PMI_T0_SHIFT	26
#define	CCCR_OVF_PMI_T1_SHIFT	27
#define	CCCR_OVF_SHIFT		31
#define	CCCR_ACTV_THR_MASK	0x3
#define	CCCR_THRESHOLD_MAX	0xF
#define	CCCR_ENABLE		(1U << CCCR_ENABLE_SHIFT)
#define	CCCR_COMPARE		(1U << CCCR_COMPARE_SHIFT)
#define	CCCR_COMPLEMENT		(1U << CCCR_COMPLEMENT_SHIFT)
#define	CCCR_EDGE		(1U << CCCR_EDGE_SHIFT)
#define	CCCR_OVF_PMI		(1U << CCCR_OVF_PMI_SHIFT)
#define	CCCR_OVF_PMI_T0		(1U << CCCR_OVF_PMI_T0_SHIFT)
#define	CCCR_OVF_PMI_T1		(1U << CCCR_OVF_PMI_T1_SHIFT)
#define	CCCR_INIT		CCCR_ENABLE
#define	CCCR_OVF		(1U << CCCR_OVF_SHIFT)
139 
/*
 * ESCR field definitions.
 *
 * The *_SHIFT macros give bit positions within the ESCR.  ESCR_USR and
 * ESCR_OS are the masks p4_pcbe_configure() uses on processors without
 * HyperThreading; ESCR_TAG_VALUE_MAX bounds the "tag" attribute.
 */
#define	ESCR_EVSEL_SHIFT	25
#define	ESCR_EVMASK_SHIFT	9
#define	ESCR_TAG_VALUE_SHIFT	5
#define	ESCR_TAG_VALUE_MAX	0xF
#define	ESCR_TAG_ENABLE_SHIFT	4
#define	ESCR_USR_SHIFT		2
#define	ESCR_OS_SHIFT		3
#define	ESCR_USR		(1U << ESCR_USR_SHIFT)
#define	ESCR_OS			(1U << ESCR_OS_SHIFT)
#define	ESCR_TAG_ENABLE		(1U << ESCR_TAG_ENABLE_SHIFT)
150 
/*
 * HyperThreaded ESCR fields.
 *
 * Separate usr/os enable bits exist for logical CPUs T0 and T1.  The T0 bits
 * occupy the same positions (2 and 3) as ESCR_USR and ESCR_OS.
 * p4_pcbe_program() picks the T0 or T1 variant according to the logical ID
 * of the CPU being programmed.
 */
#define	ESCR_T0_OS_SHIFT	3
#define	ESCR_T0_USR_SHIFT	2
#define	ESCR_T1_OS_SHIFT	1
#define	ESCR_T1_USR_SHIFT	0
#define	ESCR_T0_OS		(1U << ESCR_T0_OS_SHIFT)
#define	ESCR_T0_USR		(1U << ESCR_T0_USR_SHIFT)
#define	ESCR_T1_OS		(1U << ESCR_T1_OS_SHIFT)
#define	ESCR_T1_USR		(1U << ESCR_T1_USR_SHIFT)
162 
/*
 * ESCRs are grouped by counter; each group of ESCRs is associated with a
 * distinct group of counters. Use these macros to fill in the table below.
 *
 * Each value is a bitmap in which bit N stands for counter N; it is stored
 * in p4_escr_t.pe_map.
 */
#define	BPU0_map	(0x1 | 0x2)		/* Counters 0 and 1 */
#define	BPU2_map	(0x4 | 0x8)		/* Counters 2 and 3 */
#define	MS0_map		(0x10 | 0x20)		/* Counters 4 and 5 */
#define	MS2_map		(0x40 | 0x80)		/* Counters 6 and 7 */
#define	FLAME0_map	(0x100 | 0x200)		/* Counters 8 and 9 */
#define	FLAME2_map	(0x400 | 0x800)		/* Counters 10 and 11 */
#define	IQ0_map		(0x1000 | 0x2000 | 0x10000) /* Counters 12, 13, 16 */
#define	IQ2_map		(0x4000 | 0x8000 | 0x20000) /* Counters 14, 15, 17 */
175 
/*
 * Table describing the 45 Event Selection and Control Registers (ESCRs).
 *
 * Each entry is { pe_num, pe_addr, pe_map }: the value for the CCCR's ESCR
 * select field, the ESCR's MSR address, and the bitmap of counters the ESCR
 * can be paired with.  The trailing comment on each entry is its index in
 * the table.
 *
 * The #define preceding each entry names that ESCR as a single bit whose
 * position equals the entry's index.  Those names are OR-ed together to form
 * the ESCR bitmaps in p4_ctr_t.pc_map and p4_event_t.pe_escr_map, so the
 * order of the entries must stay in step with the bit positions.
 */
const p4_escr_t p4_escrs[] = {
#define	BPU0 (1)
	{ 0, 0x3B2, BPU0_map },		/* 0 */
#define	IS0 (1ULL << 1)
	{ 1, 0x3B4, BPU0_map },		/* 1 */
#define	MOB0 (1ULL << 2)
	{ 2, 0x3AA, BPU0_map },		/* 2 */
#define	ITLB0 (1ULL << 3)
	{ 3, 0x3B6, BPU0_map },		/* 3 */
#define	PMH0 (1ULL << 4)
	{ 4, 0x3AC, BPU0_map },		/* 4 */
#define	IX0 (1ULL << 5)
	{ 5, 0x3C8, BPU0_map },		/* 5 */
#define	FSB0 (1ULL << 6)
	{ 6, 0x3A2, BPU0_map },		/* 6 */
#define	BSU0 (1ULL << 7)
	{ 7, 0x3A0, BPU0_map },		/* 7 */
#define	BPU1 (1ULL << 8)
	{ 0, 0x3B3, BPU2_map },		/* 8 */
#define	IS1 (1ULL << 9)
	{ 1, 0x3B5, BPU2_map },		/* 9 */
#define	MOB1 (1ULL << 10)
	{ 2, 0x3AB, BPU2_map },		/* 10 */
#define	ITLB1 (1ULL << 11)
	{ 3, 0x3B7, BPU2_map },		/* 11 */
#define	PMH1 (1ULL << 12)
	{ 4, 0x3AD, BPU2_map },		/* 12 */
#define	IX1 (1ULL << 13)
	{ 5, 0x3C9, BPU2_map },		/* 13 */
#define	FSB1 (1ULL << 14)
	{ 6, 0x3A3, BPU2_map },		/* 14 */
#define	BSU1 (1ULL << 15)
	{ 7, 0x3A1, BPU2_map },		/* 15 */
#define	MS0 (1ULL << 16)
	{ 0, 0x3C0, MS0_map },		/* 16 */
#define	TC0 (1ULL << 17)
	{ 1, 0x3C4, MS0_map },		/* 17 */
#define	TBPU0 (1ULL << 18)
	{ 2, 0x3C2, MS0_map },		/* 18 */
#define	MS1 (1ULL << 19)
	{ 0, 0x3C1, MS2_map },		/* 19 */
#define	TC1 (1ULL << 20)
	{ 1, 0x3C5, MS2_map },		/* 20 */
#define	TBPU1 (1ULL << 21)
	{ 2, 0x3C3, MS2_map },		/* 21 */
#define	FLAME0 (1ULL << 22)
	{ 0, 0x3A6, FLAME0_map },	/* 22 */
#define	FIRM0 (1ULL << 23)
	{ 1, 0x3A4, FLAME0_map },	/* 23 */
#define	SAAT0 (1ULL << 24)
	{ 2, 0x3AE, FLAME0_map },	/* 24 */
#define	U2L0 (1ULL << 25)
	{ 3, 0x3B0, FLAME0_map },	/* 25 */
#define	DAC0 (1ULL << 26)
	{ 5, 0x3A8, FLAME0_map },	/* 26 */
#define	FLAME1 (1ULL << 27)
	{ 0, 0x3A7, FLAME2_map },	/* 27 */
#define	FIRM1 (1ULL << 28)
	{ 1, 0x3A5, FLAME2_map },	/* 28 */
#define	SAAT1 (1ULL << 29)
	{ 2, 0x3AF, FLAME2_map },	/* 29 */
#define	U2L1 (1ULL << 30)
	{ 3, 0x3B1, FLAME2_map },	/* 30 */
#define	DAC1 (1ULL << 31)
	{ 5, 0x3A9, FLAME2_map },	/* 31 */
#define	IQ0 (1ULL << 32)
	{ 0, 0x3BA, IQ0_map },		/* 32 */
#define	ALF0 (1ULL << 33)
	{ 1, 0x3CA, IQ0_map },		/* 33 */
#define	RAT0 (1ULL << 34)
	{ 2, 0x3BC, IQ0_map },		/* 34 */
#define	SSU0 (1ULL << 35)
	{ 3, 0x3BE, IQ0_map },		/* 35 */
#define	CRU0 (1ULL << 36)
	{ 4, 0x3B8, IQ0_map },		/* 36 */
#define	CRU2 (1ULL << 37)
	{ 5, 0x3CC, IQ0_map },		/* 37 */
#define	CRU4 (1ULL << 38)
	{ 6, 0x3E0, IQ0_map },		/* 38 */
#define	IQ1 (1ULL << 39)
	{ 0, 0x3BB, IQ2_map },		/* 39 */
#define	ALF1 (1ULL << 40)
	{ 1, 0x3CB, IQ2_map },		/* 40 */
#define	RAT1 (1ULL << 41)
	{ 2, 0x3BD, IQ2_map },		/* 41 */
#define	CRU1 (1ULL << 42)
	{ 4, 0x3B9, IQ2_map },		/* 42 */
#define	CRU3 (1ULL << 43)
	{ 5, 0x3CD, IQ2_map },		/* 43 */
#define	CRU5 (1ULL << 44)
	{ 6, 0x3E1, IQ2_map }		/* 44 */
};
271 
/*
 * Index of the last entry in p4_escrs[]; the table holds
 * ESCR_MAX_INDEX + 1 entries.
 */
#define	ESCR_MAX_INDEX 44
273 
/*
 * Description of one performance counter and its control register.
 */
typedef struct _p4_ctr {
	uint32_t	pc_caddr;	/* counter MSR address */
	uint32_t	pc_ctladdr;	/* counter's CCCR MSR address */
	uint64_t	pc_map;		/* bitmap of ESCRs controlling ctr */
} p4_ctr_t;
279 
/*
 * The 18 performance counters, indexed by counter (pic) number.  Each entry
 * is { pc_caddr, pc_ctladdr, pc_map }: the counter's MSR address, the MSR
 * address of its CCCR, and the bitmap of ESCRs (named by the macros defined
 * alongside p4_escrs[]) that can feed it.
 */
const p4_ctr_t p4_ctrs[18] = {
{ /* BPU_COUNTER0 */ 0x300, 0x360, BSU0|FSB0|MOB0|PMH0|BPU0|IS0|ITLB0|IX0},
{ /* BPU_COUNTER1 */ 0x301, 0x361, BSU0|FSB0|MOB0|PMH0|BPU0|IS0|ITLB0|IX0},
{ /* BPU_COUNTER2 */ 0x302, 0x362, BSU1|FSB1|MOB1|PMH1|BPU1|IS1|ITLB1|IX1},
{ /* BPU_COUNTER3 */ 0x303, 0x363, BSU1|FSB1|MOB1|PMH1|BPU1|IS1|ITLB1|IX1},
{ /* MS_COUNTER0 */  0x304, 0x364, MS0|TBPU0|TC0 },
{ /* MS_COUNTER1 */  0x305, 0x365, MS0|TBPU0|TC0 },
{ /* MS_COUNTER2 */  0x306, 0x366, MS1|TBPU1|TC1 },
{ /* MS_COUNTER3 */  0x307, 0x367, MS1|TBPU1|TC1 },
{ /* FLAME_COUNTER0 */ 0x308, 0x368, FIRM0|FLAME0|DAC0|SAAT0|U2L0 },
{ /* FLAME_COUNTER1 */ 0x309, 0x369, FIRM0|FLAME0|DAC0|SAAT0|U2L0 },
{ /* FLAME_COUNTER2 */ 0x30A, 0x36A, FIRM1|FLAME1|DAC1|SAAT1|U2L1 },
{ /* FLAME_COUNTER3 */ 0x30B, 0x36B, FIRM1|FLAME1|DAC1|SAAT1|U2L1 },
{ /* IQ_COUNTER0 */  0x30C, 0x36C, CRU0|CRU2|CRU4|IQ0|RAT0|SSU0|ALF0 },
{ /* IQ_COUNTER1 */  0x30D, 0x36D, CRU0|CRU2|CRU4|IQ0|RAT0|SSU0|ALF0 },
{ /* IQ_COUNTER2 */  0x30E, 0x36E, CRU1|CRU3|CRU5|IQ1|RAT1|ALF1 },
{ /* IQ_COUNTER3 */  0x30F, 0x36F, CRU1|CRU3|CRU5|IQ1|RAT1|ALF1 },
{ /* IQ_COUNTER4 */  0x310, 0x370, CRU0|CRU2|CRU4|IQ0|RAT0|SSU0|ALF0 },
{ /* IQ_COUNTER5 */  0x311, 0x371, CRU1|CRU3|CRU5|IQ1|RAT1|ALF1 }
};
300 
/*
 * Description of one countable event.
 *
 * NOTE(review): pe_cccr is not read anywhere in the visible code;
 * p4_pcbe_configure() takes the CCCR's ESCR select value from
 * p4_escrs[].pe_num instead.
 */
typedef struct _p4_event {
	char		*pe_name;	/* Name of event according to docs */
	uint64_t	pe_escr_map;	/* Bitmap of ESCRs capable of event */
	uint32_t	pe_escr_mask;	/* permissible ESCR event mask */
	uint8_t		pe_ev;		/* ESCR event select value */
	uint16_t	pe_cccr;	/* CCCR select value */
	uint32_t	pe_ctr_mask;	/* Bitmap of capable counters */
} p4_event_t;
309 
/*
 * Bitmap with only the bit for counter n set.  The argument is parenthesized
 * so that an expression such as C(a | b) shifts by the whole expression.
 */
#define	C(n) (1 << (n))
311 
312 p4_event_t p4_events[] = {
313 { "branch_retired", CRU2|CRU3, 0xF, 0x6, 0x5, C(12)|C(13)|C(14)|C(15)|C(16) },
314 { "mispred_branch_retired", CRU0|CRU1, 0x1, 0x3, 0x4,
315 	C(12)|C(13)|C(14)|C(15)|C(16) },
316 { "TC_deliver_mode", TC0|TC1, 0xFF, 0x1, 0x1, C(4)|C(5)|C(6)|C(7) },
317 { "BPU_fetch_request", BPU0|BPU1, 0x1, 0x3, 0x0, C(0)|C(1)|C(2)|C(3) },
318 { "ITLB_reference", ITLB0|ITLB1, 0x7, 0x18, 0x3, C(0)|C(1)|C(2)|C(3) },
319 { "memory_cancel", DAC0|DAC1, 0x6, 0x2, 0x5, C(8)|C(9)|C(10)|C(11) },
320 { "memory_complete", SAAT0|SAAT1, 0x3, 0x8, 0x2, C(8)|C(9)|C(10)|C(11) },
321 { "load_port_replay", SAAT0|SAAT1, 0x1, 0x4, 0x2, C(8)|C(9)|C(10)|C(11) },
322 { "store_port_replay", SAAT0|SAAT1, 0x1, 0x5, 0x2, C(8)|C(9)|C(10)|C(11) },
323 { "MOB_load_replay", MOB0|MOB1, 0x35, 0x3, 0x2, C(0)|C(1)|C(2)|C(3) },
324 { "page_walk_type", PMH0|PMH1, 0x3, 0x1, 0x4, C(0)|C(1)|C(2)|C(3) },
325 { "BSQ_cache_reference", BSU0|BSU1, 0x73F, 0xC, 0x7, C(0)|C(1)|C(2)|C(3) },
326 { "IOQ_allocation", FSB0, 0xEFFF, 0x3, 0x6, C(0)|C(1) },
327 { "IOQ_active_entries", FSB1, 0xEFFF, 0x1A, 0x6, C(2)|C(3) },
328 { "FSB_data_activity", FSB0|FSB1, 0x3F, 0x17, 0x6, C(0)|C(1)|C(2)|C(3) },
329 { "BSQ_allocation", BSU0, 0x3FEF, 0x5, 0x7, C(0)|C(1) },
330 { "bsq_active_entries", BSU1, 0x3FEF, 0x6, 0x7, C(2)|C(3) },
331 { "x87_assist", CRU2|CRU3, 0x1F, 0x3, 0x5, C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
332 { "SSE_input_assist", FIRM0|FIRM1, 0x8000, 0x34, 0x1, C(8)|C(9)|C(10)|C(11) },
333 { "packed_SP_uop", FIRM0|FIRM1, 0x8000, 0x8, 0x1, C(8)|C(9)|C(10)|C(11) },
334 { "packed_DP_uop", FIRM0|FIRM1, 0x8000, 0xC, 0x1, C(8)|C(9)|C(10)|C(11) },
335 { "scalar_SP_uop", FIRM0|FIRM1, 0x8000, 0xA, 0x1, C(8)|C(9)|C(10)|C(11) },
336 { "scalar_DP_uop", FIRM0|FIRM1, 0x8000, 0xE, 0x1, C(8)|C(9)|C(10)|C(11) },
337 { "64bit_MMX_uop", FIRM0|FIRM1, 0x8000, 0x2, 0x1, C(8)|C(9)|C(10)|C(11) },
338 { "128bit_MMX_uop", FIRM0|FIRM1, 0x8000, 0x1A, 0x1, C(8)|C(9)|C(10)|C(11) },
339 { "x87_FP_uop", FIRM0|FIRM1, 0x8000, 0x4, 0x1, C(8)|C(9)|C(10)|C(11) },
340 { "x87_SIMD_moves_uop", FIRM0|FIRM1, 0x18, 0x2E, 0x1, C(8)|C(9)|C(10)|C(11) },
341 { "machine_clear", CRU2|CRU3, 0xD, 0x2, 0x5,
342 	C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
343 { "global_power_events", FSB0|FSB1, 0x1, 0x5, 0x6, C(0)|C(1)|C(2)|C(3) },
344 { "tc_ms_xfer", MS0|MS1, 0x1, 0x5, 0x0, C(4)|C(5)|C(6)|C(7) },
345 { "uop_queue_writes", MS0|MS1, 0x7, 0x9, 0x0, C(4)|C(5)|C(6)|C(7) },
346 { "front_end_event", CRU2|CRU3, 0x3, 0x8, 0x5,
347 	C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
348 { "execution_event", CRU2|CRU3, 0xFF, 0xC, 0x5,
349 	C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
350 { "replay_event", CRU2|CRU3, 0x3, 0x9, 0x5,
351 	C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
352 { "instr_retired", CRU0|CRU1, 0xF, 0x2, 0x4,
353 	C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
354 { "uops_retired", CRU0|CRU1, 0x3, 0x1, 0x4,
355 	C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
356 { "uop_type", RAT0|RAT1, 0x3, 0x2, 0x2, C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
357 { "retired_mispred_branch_type", TBPU0|TBPU1, 0x1F, 0x5, 0x2,
358 	C(4)|C(5)|C(6)|C(7)},
359 { "retired_branch_type", TBPU0|TBPU1, 0x1F, 0x4, 0x2, C(4)|C(5)|C(6)|C(7) },
360 { NULL, 0, 0, 0, 0 }
361 };
362 
/*
 * Indicates whether the "rdpmc" instruction is available on this processor.
 * Set by p4_pcbe_init() when x86_feature has X86_MMX set; when set,
 * p4_pcbe_program() adjusts CR4_PCE according to kcpc_allow_nonpriv().
 */
static int p4_rdpmc_avail = 0;

/*
 * NOTE(review): not referenced in the visible part of this file;
 * p4_pcbe_allstop() writes a local zero to each CCCR instead.
 */
static const uint64_t p4_cccrstop = 0;

/*
 * Per-counter, comma-separated lists of event names.  Built once by
 * p4_pcbe_init() and handed out by p4_pcbe_list_events().
 */
static char *p4_eventlist[18];

/*
 * If set, this processor has HyperThreading.
 */
static int p4_htt = 0;

/* CPU family value that p4_pcbe_init() requires before it will load. */
#define	P4_FAMILY	0xF
378 
379 static int
380 p4_pcbe_init(void)
381 {
382 	int		i;
383 	size_t		size;
384 	p4_event_t	*ev;
385 
386 	/*
387 	 * If we're not running on a P4, refuse to load.
388 	 */
389 	if (cpuid_getvendor(CPU) != X86_VENDOR_Intel ||
390 	    cpuid_getfamily(CPU) != P4_FAMILY)
391 		return (-1);
392 
393 	/*
394 	 * Set up the event lists for each counter.
395 	 *
396 	 * First pass calculates the size of the event list, and the second
397 	 * pass copies each event name into the event list.
398 	 */
399 	for (i = 0; i < 18; i++) {
400 		size = 0;
401 		for (ev = p4_events; ev->pe_name != NULL; ev++) {
402 			if (ev->pe_ctr_mask & C(i))
403 				size += strlen(ev->pe_name) + 1;
404 		}
405 
406 		/*
407 		 * We use 'size + 1' here to ensure room for the final
408 		 * strcat when it terminates the string.
409 		 */
410 		p4_eventlist[i] = (char *)kmem_alloc(size + 1, KM_SLEEP);
411 		*p4_eventlist[i] = '\0';
412 
413 		for (ev = p4_events; ev->pe_name != NULL; ev++) {
414 			if (ev->pe_ctr_mask & C(i)) {
415 				(void) strcat(p4_eventlist[i], ev->pe_name);
416 				(void) strcat(p4_eventlist[i], ",");
417 			}
418 		}
419 		/*
420 		 * Remove trailing ','
421 		 */
422 		p4_eventlist[i][size - 1] = '\0';
423 	}
424 
425 	if (x86_feature & X86_MMX)
426 		p4_rdpmc_avail = 1;
427 	/*
428 	 * The X86_HTT flag may disappear soon, so we'll isolate the impact of
429 	 * its demise to the following if().
430 	 */
431 	if (x86_feature & X86_HTT)
432 		p4_htt = 1;
433 
434 	return (0);
435 }
436 
437 static uint_t
438 p4_pcbe_ncounters(void)
439 {
440 	return (18);
441 }
442 
443 static const char *
444 p4_pcbe_impl_name(void)
445 {
446 	if (p4_htt)
447 		return ("Pentium 4 with HyperThreading");
448 	return ("Pentium 4");
449 }
450 
/*
 * Return a pointer to the reference string naming the vendor manual that
 * documents the events for this processor.
 */
static const char *
p4_pcbe_cpuref(void)
{
	static const char	manual_ref[] =
	    "See Appendix A.1 of the \"IA-32 Intel Architecture Software "
	    "Developer's Manual Volume 3: System Programming Guide,\" "
	    "Order # 245472-012, 2003";

	return (manual_ref);
}
458 
459 static char *
460 p4_pcbe_list_events(uint_t picnum)
461 {
462 	ASSERT(picnum >= 0 && picnum < 18);
463 
464 	return (p4_eventlist[picnum]);
465 }
466 
467 #define	P4_ATTRS "emask,tag,compare,complement,threshold,edge"
468 
469 static char *
470 p4_pcbe_list_attrs(void)
471 {
472 	if (p4_htt)
473 		return (P4_ATTRS ",active_thread,count_sibling_usr,"
474 		    "count_sibling_sys");
475 	return (P4_ATTRS);
476 }
477 
478 static uint64_t
479 p4_pcbe_event_coverage(char *event)
480 {
481 	p4_event_t *ev;
482 
483 	for (ev = p4_events; ev->pe_name != NULL; ev++) {
484 		if (strcmp(event, ev->pe_name) == 0)
485 			break;
486 	}
487 
488 	return (ev->pe_ctr_mask);
489 }
490 
491 static uint64_t
492 p4_pcbe_overflow_bitmap(void)
493 {
494 	extern int	kcpc_hw_overflow_intr_installed;
495 	uint64_t	ret = 0;
496 	uint64_t	tmp;
497 	int		i;
498 
499 	/*
500 	 * The CCCR's OVF bit indicates that the corresponding counter has
501 	 * overflowed. It must be explicitly cleared by software, so it is
502 	 * safe to read the CCCR values here.
503 	 */
504 	for (i = 0; i < 18; i++) {
505 		(void) rdmsr(p4_ctrs[i].pc_ctladdr, &tmp);
506 		if (tmp & CCCR_OVF)
507 			ret |= (1 << i);
508 	}
509 
510 	/*
511 	 * Pentium 4 and Xeon turn off the CPC interrupt mask bit in the LVT at
512 	 * every overflow. Turn it back on here.
513 	 */
514 	ASSERT(kcpc_hw_overflow_intr_installed);
515 	(*kcpc_hw_enable_cpc_intr)();
516 
517 	return (ret);
518 }
519 
520 static int
521 p4_escr_inuse(p4_pcbe_config_t **cfgs, int escr_ndx)
522 {
523 	int i;
524 
525 	for (i = 0; i < 18; i++) {
526 		if (cfgs[i] == NULL)
527 			continue;
528 		if (cfgs[i]->p4_escr_ndx == escr_ndx)
529 			return (1);
530 	}
531 
532 	return (0);
533 }
534 
535 static void
536 build_cfgs(p4_pcbe_config_t *cfgs[18], uint64_t *data[18], void *token)
537 {
538 	p4_pcbe_config_t	*cfg = NULL;
539 	uint64_t		*daddr;
540 
541 	bzero(cfgs, 18 * sizeof (p4_pcbe_config_t *));
542 
543 	do {
544 		cfg = (p4_pcbe_config_t *)kcpc_next_config(token, cfg, &daddr);
545 
546 		if (cfg != NULL) {
547 			ASSERT(cfg->p4_picno < 18);
548 			cfgs[cfg->p4_picno] = cfg;
549 			if (data != NULL) {
550 				ASSERT(daddr != NULL);
551 				data[cfg->p4_picno] = daddr;
552 			}
553 		}
554 	} while (cfg != NULL);
555 }
556 
557 /*
558  * Programming a counter:
559  *
560  * Select event.
561  * Choose an ESCR capable of counting that event.
562  * Set up the ESCR with the desired parameters (usr, sys, tag).
563  * Set up the CCCR to point to the selected ESCR.
564  * Set the CCCR parameters (overflow, cascade, edge, etc).
565  */
566 static int
567 p4_pcbe_configure(uint_t picnum, char *eventname, uint64_t preset,
568     uint32_t flags, uint_t nattrs, kcpc_attr_t *attrs, void **data,
569     void *token)
570 {
571 	p4_pcbe_config_t	*cfgs[18];
572 	p4_pcbe_config_t	*cfg;
573 	p4_event_t		*ev;
574 	int			escr_ndx;
575 	int			i;
576 	uint16_t		emask = 0;
577 	uint8_t			tag;
578 	int			use_tag = 0;
579 	int			active_thread = 0x3; /* default is "any" */
580 	int			compare = 0;
581 	int			complement = 0;
582 	int			threshold = 0;
583 	int			edge = 0;
584 	int			sibling_usr = 0; /* count usr on other cpu */
585 	int			sibling_sys = 0; /* count sys on other cpu */
586 
587 	/*
588 	 * If we've been handed an existing configuration, we need only preset
589 	 * the counter value.
590 	 */
591 	if (*data != NULL) {
592 		cfg = *data;
593 		cfg->p4_rawpic = preset & MASK40;
594 		return (0);
595 	}
596 
597 	if (picnum < 0 || picnum >= 18)
598 		return (CPC_INVALID_PICNUM);
599 
600 	for (ev = p4_events; ev->pe_name != NULL; ev++) {
601 		if (strcmp(eventname, ev->pe_name) == 0)
602 			break;
603 	}
604 	if (ev->pe_name == NULL)
605 		return (CPC_INVALID_EVENT);
606 
607 	build_cfgs(cfgs, NULL, token);
608 
609 	/*
610 	 * Find an ESCR capable of counting this event.
611 	 */
612 	for (escr_ndx = 0; escr_ndx < ESCR_MAX_INDEX; escr_ndx++) {
613 		if ((ev->pe_escr_map & (1ULL << escr_ndx)) &&
614 		    p4_escr_inuse(cfgs, escr_ndx) == 0)
615 			break;
616 	}
617 
618 	/*
619 	 * All ESCRs capable of counting this event are already being
620 	 * used.
621 	 */
622 	if (escr_ndx == ESCR_MAX_INDEX)
623 		return (CPC_RESOURCE_UNAVAIL);
624 
625 	/*
626 	 * At this point, ev points to the desired event and escr is the index
627 	 * of a capable and available ESCR.
628 	 *
629 	 * Now process and verify the attributes.
630 	 */
631 	for (i = 0; i < nattrs; i++) {
632 		if (strcmp("emask", attrs[i].ka_name) == 0) {
633 			if ((attrs[i].ka_val | ev->pe_escr_mask)
634 			    != ev->pe_escr_mask)
635 				return (CPC_ATTRIBUTE_OUT_OF_RANGE);
636 			emask = attrs[i].ka_val;
637 			continue;
638 		} else if (strcmp("tag", attrs[i].ka_name) == 0) {
639 			if (attrs[i].ka_val > ESCR_TAG_VALUE_MAX)
640 				return (CPC_ATTRIBUTE_OUT_OF_RANGE);
641 			tag = attrs[i].ka_val;
642 			use_tag = 1;
643 			continue;
644 		} else if (strcmp("compare", attrs[i].ka_name) == 0) {
645 			if (attrs[i].ka_val != 0)
646 				compare = 1;
647 			continue;
648 		} else if (strcmp("complement", attrs[i].ka_name) == 0) {
649 			if (attrs[i].ka_val != 0)
650 				complement = 1;
651 			continue;
652 		} else if (strcmp("threshold", attrs[i].ka_name) == 0) {
653 			if (attrs[i].ka_val > CCCR_THRESHOLD_MAX)
654 				return (CPC_ATTRIBUTE_OUT_OF_RANGE);
655 			threshold = attrs[i].ka_val;
656 			continue;
657 		} else if (strcmp("edge", attrs[i].ka_name) == 0) {
658 			if (attrs[i].ka_val != 0)
659 				edge = 1;
660 			continue;
661 		}
662 
663 		/*
664 		 * The remaining attributes are valid only on HyperThreaded P4s
665 		 * for processes with the "cpc_cpu" privilege.
666 		 */
667 		if (p4_htt == 0)
668 			return (CPC_INVALID_ATTRIBUTE);
669 
670 		if (secpolicy_cpc_cpu(crgetcred()) != 0)
671 			return (CPC_ATTR_REQUIRES_PRIVILEGE);
672 
673 		if (strcmp("active_thread", attrs[i].ka_name) == 0) {
674 			if ((attrs[i].ka_val | CCCR_ACTV_THR_MASK) !=
675 			    CCCR_ACTV_THR_MASK)
676 				return (CPC_ATTRIBUTE_OUT_OF_RANGE);
677 			active_thread = (int)attrs[i].ka_val;
678 		} else if (strcmp("count_sibling_usr", attrs[i].ka_name) == 0) {
679 			if (attrs[i].ka_val != 0)
680 				sibling_usr = 1;
681 		} else if (strcmp("count_sibling_sys", attrs[i].ka_name) == 0) {
682 			if (attrs[i].ka_val != 0)
683 				sibling_sys = 1;
684 		} else
685 			return (CPC_INVALID_ATTRIBUTE);
686 	}
687 
688 	/*
689 	 * Make sure the counter can count this event
690 	 */
691 	if ((ev->pe_ctr_mask & C(picnum)) == 0)
692 		return (CPC_PIC_NOT_CAPABLE);
693 
694 	/*
695 	 * Find an ESCR that lines up with the event _and_ the counter.
696 	 */
697 	for (escr_ndx = 0; escr_ndx < ESCR_MAX_INDEX; escr_ndx++) {
698 		if ((ev->pe_escr_map & (1ULL << escr_ndx)) &&
699 		    (p4_escrs[escr_ndx].pe_map & (1 << picnum)) &&
700 		    p4_escr_inuse(cfgs, escr_ndx) == 0)
701 			break;
702 	}
703 	if (escr_ndx == ESCR_MAX_INDEX)
704 		return (CPC_RESOURCE_UNAVAIL);
705 
706 	cfg = (p4_pcbe_config_t *)kmem_alloc(sizeof (p4_pcbe_config_t),
707 	    KM_SLEEP);
708 
709 	cfg->p4_flags = 0;
710 	cfg->p4_picno = picnum;
711 	cfg->p4_escr_ndx = escr_ndx;
712 	cfg->p4_escr = (ev->pe_ev << ESCR_EVSEL_SHIFT) |
713 	    (emask << ESCR_EVMASK_SHIFT);
714 
715 	if (use_tag == 1) {
716 		cfg->p4_escr |= tag << ESCR_TAG_VALUE_SHIFT;
717 		cfg->p4_escr |= ESCR_TAG_ENABLE;
718 	}
719 
720 	if (p4_htt) {
721 		/*
722 		 * This is a HyperThreaded P4.  Since we don't know which
723 		 * logical CPU this configuration will eventually be programmed
724 		 * on, we can't yet decide which fields of the ESCR to select.
725 		 *
726 		 * Record the necessary information in the flags for later.
727 		 */
728 		if (flags & CPC_COUNT_USER)
729 			cfg->p4_flags |= P4_THIS_USR;
730 		if (flags & CPC_COUNT_SYSTEM)
731 			cfg->p4_flags |= P4_THIS_SYS;
732 		if (p4_htt && sibling_usr)
733 			cfg->p4_flags |= P4_SIBLING_USR;
734 		if (p4_htt && sibling_sys)
735 			cfg->p4_flags |= P4_SIBLING_SYS;
736 	} else {
737 		/*
738 		 * This is not HyperThreaded, so we can determine the exact
739 		 * ESCR value necessary now.
740 		 */
741 		if (flags & CPC_COUNT_USER)
742 			cfg->p4_escr |= ESCR_USR;
743 		if (flags & CPC_COUNT_SYSTEM)
744 			cfg->p4_escr |= ESCR_OS;
745 	}
746 
747 	cfg->p4_rawpic = preset & MASK40;
748 
749 	/*
750 	 * Even on non-HT P4s, Intel states the active_thread field (marked as
751 	 * "reserved" for the non-HT chips) must be set to all 1s.
752 	 */
753 	cfg->p4_cccr = CCCR_INIT | (active_thread << CCCR_ACTV_THR_SHIFT);
754 	if (compare)
755 		cfg->p4_cccr |= CCCR_COMPARE;
756 	if (complement)
757 		cfg->p4_cccr |= CCCR_COMPLEMENT;
758 	cfg->p4_cccr |= threshold << CCCR_THRESHOLD_SHIFT;
759 	if (edge)
760 		cfg->p4_cccr |= CCCR_EDGE;
761 	cfg->p4_cccr |= p4_escrs[cfg->p4_escr_ndx].pe_num
762 	    << CCCR_ESCR_SEL_SHIFT;
763 	if (flags & CPC_OVF_NOTIFY_EMT) {
764 		if (p4_htt)
765 			cfg->p4_flags |= P4_PMI;
766 		else {
767 			/*
768 			 * If the user has asked for notification of overflows,
769 			 * we automatically program the hardware to generate an
770 			 * interrupt on overflow.
771 			 *
772 			 * This can only be programmed now if this P4 doesn't
773 			 * have HyperThreading. If it does, we must wait until
774 			 * we know which logical CPU we'll be programming.
775 			 */
776 			cfg->p4_cccr |= CCCR_OVF_PMI;
777 		}
778 	}
779 
780 	*data = cfg;
781 
782 	return (0);
783 }
784 
785 static void
786 p4_pcbe_program(void *token)
787 {
788 	int			i;
789 	uint64_t		escr;
790 	uint64_t		cccr;
791 	p4_pcbe_config_t	*cfgs[18];
792 
793 	p4_pcbe_allstop();
794 
795 	build_cfgs(cfgs, NULL, token);
796 
797 	if (p4_rdpmc_avail) {
798 		uint32_t curcr4 = getcr4();
799 		if (kcpc_allow_nonpriv(token))
800 			setcr4(curcr4 | CR4_PCE);
801 		else
802 			setcr4(curcr4 & ~CR4_PCE);
803 	}
804 
805 	/*
806 	 * Ideally we would start all counters with a single operation, but in
807 	 * P4 each counter is enabled individually via its CCCR. To minimize the
808 	 * probe effect of enabling the counters, we do it in two passes: the
809 	 * first programs the counter and ESCR, and the second programs the
810 	 * CCCR (and thus enables the counter).
811 	 */
812 	if (p4_htt) {
813 		int	lid = chip_plat_get_clogid(CPU); /* Logical ID of CPU */
814 
815 		for (i = 0; i < 18; i++) {
816 			if (cfgs[i] == NULL)
817 				continue;
818 			escr = (uint64_t)cfgs[i]->p4_escr;
819 
820 			if (cfgs[i]->p4_flags & P4_THIS_USR)
821 				escr |= (lid == 0) ? ESCR_T0_USR : ESCR_T1_USR;
822 			if (cfgs[i]->p4_flags & P4_THIS_SYS)
823 				escr |= (lid == 0) ? ESCR_T0_OS : ESCR_T1_OS;
824 			if (cfgs[i]->p4_flags & P4_SIBLING_USR)
825 				escr |= (lid == 0) ? ESCR_T1_USR : ESCR_T0_USR;
826 			if (cfgs[i]->p4_flags & P4_SIBLING_SYS)
827 				escr |= (lid == 0) ? ESCR_T1_OS : ESCR_T0_OS;
828 
829 			wrmsr(p4_ctrs[i].pc_caddr, &cfgs[i]->p4_rawpic);
830 			wrmsr(p4_escrs[cfgs[i]->p4_escr_ndx].pe_addr, &escr);
831 		}
832 
833 		for (i = 0; i < 18; i++) {
834 			if (cfgs[i] == NULL)
835 				continue;
836 			cccr = (uint64_t)cfgs[i]->p4_cccr;
837 			/*
838 			 * We always target the overflow interrupt at the
839 			 * logical CPU which is doing the counting.
840 			 */
841 			if (cfgs[i]->p4_flags & P4_PMI)
842 				cccr |= (lid == 0) ?
843 				    CCCR_OVF_PMI_T0 : CCCR_OVF_PMI_T1;
844 			wrmsr(p4_ctrs[i].pc_ctladdr, &cccr);
845 		}
846 	} else {
847 		for (i = 0; i < 18; i++) {
848 			if (cfgs[i] == NULL)
849 				continue;
850 			escr = (uint64_t)cfgs[i]->p4_escr;
851 			wrmsr(p4_ctrs[i].pc_caddr, &cfgs[i]->p4_rawpic);
852 			wrmsr(p4_escrs[cfgs[i]->p4_escr_ndx].pe_addr, &escr);
853 		}
854 
855 		for (i = 0; i < 18; i++) {
856 			if (cfgs[i] == NULL)
857 				continue;
858 			cccr = (uint64_t)cfgs[i]->p4_cccr;
859 			wrmsr(p4_ctrs[i].pc_ctladdr, &cccr);
860 		}
861 	}
862 }
863 
864 static void
865 p4_pcbe_allstop(void)
866 {
867 	int		i;
868 	uint64_t	tmp = 0;
869 
870 	for (i = 0; i < 18; i++)
871 		wrmsr(p4_ctrs[i].pc_ctladdr, &tmp);
872 
873 	setcr4(getcr4() & ~CR4_PCE);
874 }
875 
876 
877 static void
878 p4_pcbe_sample(void *token)
879 {
880 	p4_pcbe_config_t	*cfgs[18];
881 	uint64_t		*addrs[18];
882 	uint64_t		curpic[18];
883 	int64_t			diff;
884 	int			i;
885 
886 	for (i = 0; i < 18; i++)
887 		(void) rdmsr(p4_ctrs[i].pc_caddr, &curpic[i]);
888 
889 	build_cfgs(cfgs, addrs, token);
890 
891 	for (i = 0; i < 18; i++) {
892 		if (cfgs[i] == NULL)
893 			continue;
894 		diff = curpic[i] - cfgs[i]->p4_rawpic;
895 		if (diff < 0)
896 			diff += (1ll << 40);
897 		*addrs[i] += diff;
898 		DTRACE_PROBE4(p4__pcbe__sample, int, i, uint64_t, *addrs[i],
899 		    uint64_t, curpic[i], uint64_t, cfgs[i]->p4_rawpic);
900 		cfgs[i]->p4_rawpic = *addrs[i] & MASK40;
901 	}
902 }
903 
904 static void
905 p4_pcbe_free(void *config)
906 {
907 	kmem_free(config, sizeof (p4_pcbe_config_t));
908 }
909 
/*
 * Module linkage for this back-end: the module-type ops, a description
 * string, and a pointer to this back-end's operations vector.
 */
static struct modlpcbe modlpcbe = {
	&mod_pcbeops,
	"Pentium 4 Performance Counters v%I%",
	&p4_pcbe_ops
};
915 
/*
 * Linkage structure passed to mod_install(), mod_remove() and mod_info()
 * by _init(), _fini() and _info().
 */
static struct modlinkage modl = {
	MODREV_1,
	&modlpcbe,
};
920 
921 int
922 _init(void)
923 {
924 	if (p4_pcbe_init() != 0)
925 		return (ENOTSUP);
926 	return (mod_install(&modl));
927 }
928 
929 int
930 _fini(void)
931 {
932 	return (mod_remove(&modl));
933 }
934 
935 int
936 _info(struct modinfo *mi)
937 {
938 	return (mod_info(&modl, mi));
939 }
940