xref: /titanic_41/usr/src/uts/intel/pcbe/p123_pcbe.c (revision d0e72dff6c5b858c797dfc1172d9c3aed2103940)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Performance Counter Back-End for Pentiums I, II, and III.
30  */
31 
32 #include <sys/cpuvar.h>
33 #include <sys/param.h>
34 #include <sys/cpc_impl.h>
35 #include <sys/cpc_pcbe.h>
36 #include <sys/modctl.h>
37 #include <sys/inttypes.h>
38 #include <sys/systm.h>
39 #include <sys/cmn_err.h>
40 #include <sys/x86_archext.h>
41 #include <sys/sdt.h>
42 #include <sys/archsystm.h>
43 #include <sys/privregs.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 
/*
 * Helpers that fold a raw hardware counter reading into the 64-bit
 * virtualized counter value; both are defined near the end of this file.
 */
static int64_t diff3931(uint64_t sample, uint64_t old);
static uint64_t trunc3931(uint64_t value);

/*
 * Forward declarations for the backend entry points.  Most of these are
 * referenced from the ptm_pcbe_ops vector; ptm_pcbe_init() is called from
 * _init().
 *
 * NOTE(review): ptm_pcbe_pic_index() is declared here, but no definition
 * or caller is visible in this file -- confirm whether the prototype is
 * stale.
 */
static int ptm_pcbe_init(void);
static uint_t ptm_pcbe_ncounters(void);
static const char *ptm_pcbe_impl_name(void);
static const char *ptm_pcbe_cpuref(void);
static char *ptm_pcbe_list_events(uint_t picnum);
static char *ptm_pcbe_list_attrs(void);
static uint64_t ptm_pcbe_event_coverage(char *event);
static int ptm_pcbe_pic_index(char *picname);
static uint64_t	ptm_pcbe_overflow_bitmap(void);
static int ptm_pcbe_configure(uint_t picnum, char *event, uint64_t preset,
    uint32_t flags, uint_t nattrs, kcpc_attr_t *attrs, void **data,
    void *token);
static void ptm_pcbe_program(void *token);
static void ptm_pcbe_allstop(void);
static void ptm_pcbe_sample(void *token);
static void ptm_pcbe_free(void *config);
66 
/*
 * Operations vector for this backend; it is published to the kernel through
 * the modlpcbe linkage structure at the bottom of this file.  The
 * initializer is positional, so the order of the entries must follow the
 * layout of pcbe_ops_t.
 *
 * The pcbe_caps member is adjusted at run time by ptm_pcbe_init(), which
 * sets CPC_CAP_OVERFLOW_INTERRUPT on family 6 processors (presumably that
 * is the slot initialized to 0 below -- confirm against pcbe_ops_t).
 */
pcbe_ops_t ptm_pcbe_ops = {
	PCBE_VER_1,
	0,
	ptm_pcbe_ncounters,
	ptm_pcbe_impl_name,
	ptm_pcbe_cpuref,
	ptm_pcbe_list_events,
	ptm_pcbe_list_attrs,
	ptm_pcbe_event_coverage,
	ptm_pcbe_overflow_bitmap,
	ptm_pcbe_configure,
	ptm_pcbe_program,
	ptm_pcbe_allstop,
	ptm_pcbe_sample,
	ptm_pcbe_free
};
83 
/*
 * Counter architecture of the running processor, chosen in ptm_pcbe_init()
 * from the CPUID family: family 5 selects PTM_VER_P5, family 6 selects
 * PTM_VER_P6.
 */
typedef enum _ptm_ver {
	PTM_VER_P5,
	PTM_VER_P6
} ptm_ver_t;

/*
 * Module state, all of it set once by ptm_pcbe_init():
 *	ptm_ver		counter architecture in use
 *	ptm_impl_name	string returned by ptm_pcbe_impl_name()
 *	ptm_cpuref	string returned by ptm_pcbe_cpuref()
 *	pic_events	per-counter comma-separated event name lists
 *			returned by ptm_pcbe_list_events()
 */
static ptm_ver_t ptm_ver;
static const char *ptm_impl_name;
static const char *ptm_cpuref;
static char *pic_events[2] = { NULL, NULL };

/*
 * Indicates whether the "rdpmc" instruction is available on this processor.
 * ptm_pcbe_init() sets this when the X86_MMX feature bit is present, and
 * ptm_pcbe_program() consults it before touching CR4_PCE.
 */
static int ptm_rdpmc_avail = 0;
98 
/*
 * Value written to the control register (P5_CESR or REG_PERFEVNT0) to
 * stop the counters.
 */
#define	ALL_STOPPED	0ULL

/*
 * Per-counter configuration.  Instances are allocated by
 * ptm_pcbe_configure() and released by ptm_pcbe_free().  ptm_rawpic holds
 * the counter value as reduced by trunc3931(); it is what gets written to
 * the hardware counter by ptm_pcbe_program() and is updated again by
 * ptm_pcbe_sample().
 */
typedef struct _ptm_pcbe_config {
	uint8_t		ptm_picno;	/* 0 for pic0 or 1 for pic1 */
	uint32_t	ptm_ctl;    /* P6: PerfEventSelect; P5: cesr, shifted */
	uint64_t	ptm_rawpic;
} ptm_pcbe_config_t;

/*
 * One entry of an event table: the event-select code and the event name
 * by which find_event() looks it up.
 */
struct nametable {
	uint8_t		bits;
	const char	*name;
};

/* Value of 'bits' that marks the last (sentinel) entry of an event table. */
#define	NT_END 0xFF
113 
/*
 * Basic Pentium events
 *
 * These entries are shared by both counters: the macro is expanded at the
 * head of each of the two per-counter tables that follow, which then append
 * their own counter-specific entries.
 */
#define	P5_EVENTS				\
	{0x0,	"data_read"},			\
	{0x1,	"data_write"},			\
	{0x2,	"data_tlb_miss"},		\
	{0x3,	"data_read_miss"},		\
	{0x4,	"data_write_miss"},		\
	{0x5,	"write_hit_to_M_or_E"},		\
	{0x6,	"dcache_lines_wrback"},		\
	{0x7,	"external_snoops"},		\
	{0x8,	"external_dcache_snoop_hits"},	\
	{0x9,	"memory_access_in_both_pipes"},	\
	{0xa,	"bank_conflicts"},		\
	{0xb,	"misaligned_ref"},		\
	{0xc,	"code_read"},			\
	{0xd,	"code_tlb_miss"},		\
	{0xe,	"code_cache_miss"},		\
	{0xf,	"any_segreg_loaded"},		\
	{0x12,	"branches"},			\
	{0x13,	"btb_hits"},			\
	{0x14,	"taken_or_btb_hit"},		\
	{0x15,	"pipeline_flushes"},		\
	{0x16,	"instr_exec"},			\
	{0x17,	"instr_exec_V_pipe"},		\
	{0x18,	"clks_bus_cycle"},		\
	{0x19,	"clks_full_wbufs"},		\
	{0x1a,	"pipe_stall_read"},		\
	{0x1b,	"stall_on_write_ME"},		\
	{0x1c,	"locked_bus_cycle"},		\
	{0x1d,	"io_rw_cycles"},		\
	{0x1e,	"reads_noncache_mem"},		\
	{0x1f,	"pipeline_agi_stalls"},		\
	{0x22,	"flops"},			\
	{0x23,	"bp_match_dr0"},		\
	{0x24,	"bp_match_dr1"},		\
	{0x25,	"bp_match_dr2"},		\
	{0x26,	"bp_match_dr3"},		\
	{0x27,	"hw_intrs"},			\
	{0x28,	"data_rw"},			\
	{0x29,	"data_rw_miss"}
156 
/*
 * Event table for counter 0 on family 5 processors: the shared P5_EVENTS
 * followed by the names that codes 0x2a-0x3b carry on this counter.  The
 * table is terminated by an NT_END entry.
 */
static const struct nametable P5mmx_names0[] = {
	P5_EVENTS,
	{0x2a,	"bus_ownership_latency"},
	{0x2b,	"mmx_instr_upipe"},
	{0x2c,	"cache_M_line_sharing"},
	{0x2d,	"emms_instr"},
	{0x2e,	"bus_util_processor"},
	{0x2f,	"sat_mmx_instr"},
	{0x30,	"clks_not_HLT"},
	{0x31,	"mmx_data_read"},
	{0x32,	"clks_fp_stall"},
	{0x33,	"d1_starv_fifo_0"},
	{0x34,	"mmx_data_write"},
	{0x35,	"pipe_flush_wbp"},
	{0x36,	"mmx_misalign_data_refs"},
	{0x37,	"rets_pred_incorrect"},
	{0x38,	"mmx_multiply_unit_interlock"},
	{0x39,	"rets"},
	{0x3a,	"btb_false_entries"},
	{0x3b,	"clocks_stall_full_wb"},
	{NT_END, ""}
};
179 
/*
 * Event table for counter 1 on family 5 processors: the shared P5_EVENTS
 * followed by the names that codes 0x2a-0x3b carry on this counter (the
 * same codes have different names in the counter 0 table).  The table is
 * terminated by an NT_END entry.
 */
static const struct nametable P5mmx_names1[] = {
	P5_EVENTS,
	{0x2a,	"bus_ownership_transfers"},
	{0x2b,	"mmx_instr_vpipe"},
	{0x2c,	"cache_lint_sharing"},
	{0x2d,	"mmx_fp_transitions"},
	{0x2e,	"writes_noncache_mem"},
	{0x2f,	"sats_performed"},
	{0x30,	"clks_dcache_tlb_miss"},
	{0x31,	"mmx_data_read_miss"},
	{0x32,	"taken_br"},
	{0x33,	"d1_starv_fifo_1"},
	{0x34,	"mmx_data_write_miss"},
	{0x35,	"pipe_flush_wbp_wb"},
	{0x36,	"mmx_pipe_stall_data_read"},
	{0x37,	"rets_pred"},
	{0x38,	"movd_movq_stall"},
	{0x39,	"rsb_overflow"},
	{0x3a,	"btb_mispred_nt"},
	{0x3b,	"mmx_stall_write_ME"},
	{NT_END, ""}
};
202 
/*
 * Family 5 event tables indexed by counter number; ptm_pcbe_init() points
 * 'events' at this array when it finds a family 5 processor.
 */
static const struct nametable *P5mmx_names[2] = {
	P5mmx_names0,
	P5mmx_names1
};
207 
/*
 * Pentium Pro and Pentium II events
 *
 * A single table serves both counters (see P6_names below).  Entries are
 * grouped by functional unit rather than sorted by event code, and the
 * table is terminated by an NT_END entry.
 */
static const struct nametable _P6_names[] = {
	/*
	 * Data cache unit
	 */
	{0x43,	"data_mem_refs"},
	{0x45,	"dcu_lines_in"},
	{0x46,	"dcu_m_lines_in"},
	{0x47,	"dcu_m_lines_out"},
	{0x48,	"dcu_miss_outstanding"},

	/*
	 * Instruction fetch unit
	 */
	{0x80,	"ifu_ifetch"},
	{0x81,	"ifu_ifetch_miss"},
	{0x85,	"itlb_miss"},
	{0x86,	"ifu_mem_stall"},
	{0x87,	"ild_stall"},

	/*
	 * L2 cache
	 */
	{0x28,	"l2_ifetch"},
	{0x29,	"l2_ld"},
	{0x2a,	"l2_st"},
	{0x24,	"l2_lines_in"},
	{0x26,	"l2_lines_out"},
	{0x25,	"l2_m_lines_inm"},
	{0x27,	"l2_m_lines_outm"},
	{0x2e,	"l2_rqsts"},
	{0x21,	"l2_ads"},
	{0x22,	"l2_dbus_busy"},
	{0x23,	"l2_dbus_busy_rd"},

	/*
	 * External bus logic
	 */
	{0x62,	"bus_drdy_clocks"},
	{0x63,	"bus_lock_clocks"},
	{0x60,	"bus_req_outstanding"},
	{0x65,	"bus_tran_brd"},
	{0x66,	"bus_tran_rfo"},
	{0x67,	"bus_trans_wb"},
	{0x68,	"bus_tran_ifetch"},
	{0x69,	"bus_tran_inval"},
	{0x6a,	"bus_tran_pwr"},
	{0x6b,	"bus_trans_p"},
	{0x6c,	"bus_trans_io"},
	{0x6d,	"bus_tran_def"},
	{0x6e,	"bus_tran_burst"},
	{0x70,	"bus_tran_any"},
	{0x6f,	"bus_tran_mem"},
	{0x64,	"bus_data_rcv"},
	{0x61,	"bus_bnr_drv"},
	{0x7a,	"bus_hit_drv"},
	{0x7b,	"bus_hitm_drv"},
	{0x7e,	"bus_snoop_stall"},

	/*
	 * Floating point unit
	 *
	 * The "0 only" / "1 only" notes mark events annotated as being
	 * available on just one of the two counters.
	 */
	{0xc1,	"flops"},		/* 0 only */
	{0x10,	"fp_comp_ops_exe"},	/* 0 only */
	{0x11,	"fp_assist"},		/* 1 only */
	{0x12,	"mul"},			/* 1 only */
	{0x13,	"div"},			/* 1 only */
	{0x14,	"cycles_div_busy"},	/* 0 only */

	/*
	 * Memory ordering
	 */
	{0x3,	"ld_blocks"},
	{0x4,	"sb_drains"},
	{0x5,	"misalign_mem_ref"},

	/*
	 * Instruction decoding and retirement
	 */
	{0xc0,	"inst_retired"},
	{0xc2,	"uops_retired"},
	{0xd0,	"inst_decoder"},

	/*
	 * Interrupts
	 */
	{0xc8,	"hw_int_rx"},
	{0xc6,	"cycles_int_masked"},
	{0xc7,	"cycles_int_pending_and_masked"},

	/*
	 * Branches
	 */
	{0xc4,	"br_inst_retired"},
	{0xc5,	"br_miss_pred_retired"},
	{0xc9,	"br_taken_retired"},
	{0xca,	"br_miss_pred_taken_ret"},
	{0xe0,	"br_inst_decoded"},
	{0xe2,	"btb_misses"},
	{0xe4,	"br_bogus"},
	{0xe6,	"baclears"},

	/*
	 * Stalls
	 */
	{0xa2,	"resource_stalls"},
	{0xd2,	"partial_rat_stalls"},

	/*
	 * Segment register loads
	 */
	{0x6,	"segment_reg_loads"},

	/*
	 * Clocks
	 */
	{0x79,	"cpu_clk_unhalted"},

	/*
	 * MMX
	 */
	{0xb0,	"mmx_instr_exec"},
	{0xb1,	"mmx_sat_instr_exec"},
	{0xb2,	"mmx_uops_exec"},
	{0xb3,	"mmx_instr_type_exec"},
	{0xcc,	"fp_mmx_trans"},
	{0xcd,	"mmx_assists"},
	{0xce,	"mmx_instr_ret"},
	{0xd4,	"seg_rename_stalls"},
	{0xd5,	"seg_reg_renames"},
	{0xd6,	"ret_seg_renames"},

	{NT_END, ""}
};
344 
/*
 * Family 6 event tables indexed by counter number.  Both slots refer to
 * the same table.
 *
 * NOTE(review): because the table is shared, find_event() will accept the
 * events annotated "0 only" or "1 only" in _P6_names for either counter --
 * confirm whether that is intended.
 */
static const struct nametable *P6_names[2] = {
	_P6_names,
	_P6_names
};

/*
 * Per-counter event tables for the running processor; set by
 * ptm_pcbe_init() to either P5mmx_names or P6_names.
 */
static const struct nametable **events;

/*
 * Extract bits u..l (inclusive, u >= l) of v.
 * NOTE(review): no use of this macro is visible in this file.
 */
#define	BITS(v, u, l)	\
	(((v) >> (l)) & ((1 << (1 + (u) - (l))) - 1))
354 
/*
 * "Well known" bit fields in the Pentium CES register
 * The interfaces in libcpc should make these #defines uninteresting.
 *
 * The *_SHIFT/*_MASK pairs describe the event-select field of each
 * counter; the remaining values are bit numbers.  The counter 1 bits sit
 * 16 positions above their counter 0 equivalents.
 */
#define	CPC_P5_CESR_ES0_SHIFT	0
#define	CPC_P5_CESR_ES0_MASK	0x3f
#define	CPC_P5_CESR_ES1_SHIFT	16
#define	CPC_P5_CESR_ES1_MASK	0x3f

#define	CPC_P5_CESR_OS0		6
#define	CPC_P5_CESR_USR0	7
#define	CPC_P5_CESR_CLK0	8
#define	CPC_P5_CESR_PC0		9
#define	CPC_P5_CESR_OS1		(CPC_P5_CESR_OS0 + 16)
#define	CPC_P5_CESR_USR1	(CPC_P5_CESR_USR0 + 16)
#define	CPC_P5_CESR_CLK1	(CPC_P5_CESR_CLK0 + 16)
#define	CPC_P5_CESR_PC1		(CPC_P5_CESR_PC0 + 16)

/*
 * "Well known" bit fields in the Pentium Pro PerfEvtSel registers
 * The interfaces in libcpc should make these #defines uninteresting.
 *
 * The first group are bit numbers; the *_SHIFT/*_MASK pairs describe the
 * multi-bit unit mask, counter mask and event-select fields.
 */
#define	CPC_P6_PES_INV		23
#define	CPC_P6_PES_EN		22
#define	CPC_P6_PES_INT		20
#define	CPC_P6_PES_PC		19
#define	CPC_P6_PES_E		18
#define	CPC_P6_PES_OS		17
#define	CPC_P6_PES_USR		16

#define	CPC_P6_PES_UMASK_SHIFT	8
#define	CPC_P6_PES_UMASK_MASK	(0xffu)

#define	CPC_P6_PES_CMASK_SHIFT	24
#define	CPC_P6_PES_CMASK_MASK	(0xffu)

#define	CPC_P6_PES_PIC0_MASK	(0xffu)
#define	CPC_P6_PES_PIC1_MASK	(0xffu)

/*
 * Single-bit masks built from the bit numbers above, used when testing a
 * PerfEvtSel value read back from the hardware.
 */
#define	P6_PES_EN	(UINT32_C(1) << CPC_P6_PES_EN)
#define	P6_PES_INT	(UINT32_C(1) << CPC_P6_PES_INT)
#define	P6_PES_OS	(UINT32_C(1) << CPC_P6_PES_OS)
397 
/*
 * Pentium 5 attributes
 *
 * Software-only flag bits that ptm_pcbe_configure() collects while
 * parsing the attribute list, before translating them into register
 * bits.
 */
#define	P5_NOEDGE	0x1	/* "noedge"	- no edge detection */
#define	P5_PC		0x2	/* "pc"		- pin control */

/*
 * Pentium 6 attributes
 *
 * Same scheme as above, with the two additional attributes that only
 * the family 6 branch of ptm_pcbe_configure() understands.
 */
#define	P6_NOEDGE	0x1
#define	P6_PC		0x2
#define	P6_INV		0x4	/* "inv" - count inverted transitions */
#define	P6_INT		0x8	/* "int" - interrupt on overflow */
411 
/*
 * CPU reference strings
 *
 * One of these is selected by ptm_pcbe_init() and handed back verbatim by
 * ptm_pcbe_cpuref().
 */

#define	P5_CPUREF	"See Appendix A.4 of the \"IA-32 Intel Architecture "  \
			"Software Developer's Manual Volume 3: System "	       \
			"Programming Guide,\" Order # 245472-012, 2003"

#define	P6_CPUREF	"See Appendix A.3 of the \"IA-32 Intel Architecture "  \
			"Software Developer's Manual Volume 3: System "	       \
			"Programming Guide,\" Order # 245472-012, 2003"
423 
424 static int
425 ptm_pcbe_init(void)
426 {
427 	const struct nametable	*n;
428 	int			i;
429 	size_t			size;
430 
431 	if (x86_feature & X86_MMX)
432 		ptm_rdpmc_avail = 1;
433 
434 	/*
435 	 * Discover type of CPU and set events pointer appropriately.
436 	 *
437 	 * Map family and model into the performance
438 	 * counter architectures we currently understand.
439 	 *
440 	 * See application note AP485 (from developer.intel.com)
441 	 * for further explanation.
442 	 */
443 	if (cpuid_getvendor(CPU) != X86_VENDOR_Intel)
444 		return (-1);
445 	switch (cpuid_getfamily(CPU)) {
446 	case 5:		/* Pentium and Pentium with MMX */
447 		events = P5mmx_names;
448 		ptm_ver = PTM_VER_P5;
449 		ptm_cpuref = P5_CPUREF;
450 		if (cpuid_getmodel(CPU) < 4)
451 			ptm_impl_name = "Pentium";
452 		else
453 			ptm_impl_name = "Pentium with MMX";
454 		break;
455 	case 6:		/* Pentium Pro and Pentium II and III */
456 		events = P6_names;
457 		ptm_ver = PTM_VER_P6;
458 		ptm_cpuref = P6_CPUREF;
459 		ptm_pcbe_ops.pcbe_caps = CPC_CAP_OVERFLOW_INTERRUPT;
460 		if (x86_feature & X86_MMX)
461 			ptm_impl_name = "Pentium Pro with MMX, Pentium II";
462 		else
463 			ptm_impl_name = "Pentium Pro, Pentium II";
464 		break;
465 	default:
466 		return (-1);
467 	}
468 
469 	/*
470 	 * Initialize the list of events for each PIC.
471 	 * Do two passes: one to compute the size necessary and another
472 	 * to copy the strings. Need room for event, comma, and NULL terminator.
473 	 */
474 	for (i = 0; i < 2; i++) {
475 		size = 0;
476 		for (n = events[i]; n->bits != NT_END; n++)
477 			size += strlen(n->name) + 1;
478 		pic_events[i] = kmem_alloc(size + 1, KM_SLEEP);
479 		*pic_events[i] = '\0';
480 		for (n = events[i]; n->bits != NT_END; n++) {
481 			(void) strcat(pic_events[i], n->name);
482 			(void) strcat(pic_events[i], ",");
483 		}
484 		/*
485 		 * Remove trailing comma.
486 		 */
487 		pic_events[i][size - 1] = '\0';
488 	}
489 
490 	return (0);
491 }
492 
493 static uint_t
494 ptm_pcbe_ncounters(void)
495 {
496 	return (2);
497 }
498 
499 static const char *
500 ptm_pcbe_impl_name(void)
501 {
502 	return (ptm_impl_name);
503 }
504 
505 static const char *
506 ptm_pcbe_cpuref(void)
507 {
508 	return (ptm_cpuref);
509 }
510 
511 static char *
512 ptm_pcbe_list_events(uint_t picnum)
513 {
514 	ASSERT(picnum >= 0 && picnum < cpc_ncounters);
515 
516 	if (pic_events[0] == NULL) {
517 		ASSERT(pic_events[1] == NULL);
518 	}
519 
520 	return (pic_events[picnum]);
521 }
522 
523 static char *
524 ptm_pcbe_list_attrs(void)
525 {
526 	if (ptm_ver == PTM_VER_P5)
527 		return ("noedge,pc");
528 	else
529 		return ("noedge,pc,inv,int,umask,cmask");
530 }
531 
532 static const struct nametable *
533 find_event(int regno, char *name)
534 {
535 	const struct nametable *n;
536 
537 	n = events[regno];
538 
539 	for (; n->bits != NT_END; n++)
540 		if (strcmp(name, n->name) == 0)
541 			return (n);
542 
543 	return (NULL);
544 }
545 
/*
 * Return a bitmap of the counters whose event table lists 'event':
 * bit 0 for pic0, bit 1 for pic1.
 */
static uint64_t
ptm_pcbe_event_coverage(char *event)
{
	uint64_t on_pic0 = (find_event(0, event) != NULL) ? 0x1 : 0;
	uint64_t on_pic1 = (find_event(1, event) != NULL) ? 0x2 : 0;

	return (on_pic0 | on_pic1);
}
558 
559 static uint64_t
560 ptm_pcbe_overflow_bitmap(void)
561 {
562 	uint64_t	ret = 0;
563 	uint64_t	pes[2];
564 
565 	/*
566 	 * P5 is not capable of generating interrupts.
567 	 */
568 	ASSERT(ptm_ver == PTM_VER_P6);
569 
570 	/*
571 	 * CPC could have caused an interrupt provided that
572 	 *
573 	 * 1) Counters are enabled
574 	 * 2) Either counter has requested an interrupt
575 	 */
576 
577 	pes[0] = rdmsr(REG_PERFEVNT0);
578 	if (((uint32_t)pes[0] & P6_PES_EN) != P6_PES_EN)
579 		return (0);
580 
581 	/*
582 	 * If a particular counter requested an interrupt, assume it caused
583 	 * this interrupt. There is no way to determine which counter overflowed
584 	 * on this hardware other than by using unreliable heuristics.
585 	 */
586 
587 	pes[1] = rdmsr(REG_PERFEVNT1);
588 	if ((uint32_t)pes[0] & P6_PES_INT)
589 		ret |= 0x1;
590 	if ((uint32_t)pes[1] & P6_PES_INT)
591 		ret |= 0x2;
592 
593 	return (ret);
594 }
595 
596 /*ARGSUSED*/
597 static int
598 ptm_pcbe_configure(uint_t picnum, char *eventname, uint64_t preset,
599     uint32_t flags, uint_t nattrs, kcpc_attr_t *attrs, void **data,
600     void *token)
601 {
602 	ptm_pcbe_config_t	*conf;
603 	const struct nametable	*n;
604 	struct nametable	nt_raw = { 0, "raw" };
605 	int			i;
606 	int			ptm_flags = 0;
607 
608 	/*
609 	 * If we've been handed an existing configuration, we need only preset
610 	 * the counter value.
611 	 */
612 	if (*data != NULL) {
613 		conf = *data;
614 		conf->ptm_rawpic = trunc3931(preset);
615 		return (0);
616 	}
617 
618 	if (picnum != 0 && picnum != 1)
619 		return (CPC_INVALID_PICNUM);
620 
621 	if ((n = find_event(picnum, eventname)) == NULL) {
622 		long tmp;
623 
624 		/*
625 		 * If ddi_strtol() likes this event, use it as a raw event code.
626 		 */
627 		if (ddi_strtol(eventname, NULL, 0, &tmp) != 0)
628 			return (CPC_INVALID_EVENT);
629 
630 		nt_raw.bits = tmp;
631 
632 		if (ptm_ver == PTM_VER_P5)
633 			nt_raw.bits &= CPC_P5_CESR_ES0_MASK;
634 		else
635 			nt_raw.bits &= CPC_P6_PES_PIC0_MASK;
636 
637 		n = &nt_raw;
638 	}
639 
640 	conf = kmem_alloc(sizeof (ptm_pcbe_config_t), KM_SLEEP);
641 
642 	conf->ptm_picno = picnum;
643 	conf->ptm_rawpic = trunc3931(preset);
644 	conf->ptm_ctl = 0;
645 
646 	if (ptm_ver == PTM_VER_P5) {
647 		int picshift;
648 		picshift = (picnum == 0) ? 0 : 16;
649 
650 		for (i = 0; i < nattrs; i++) {
651 			/*
652 			 * Value of these attributes is ignored; their presence
653 			 * alone tells us to set the corresponding flag.
654 			 */
655 			if (strncmp(attrs[i].ka_name, "noedge", 7) == 0) {
656 				if (attrs[i].ka_val != 0)
657 					ptm_flags |= P5_NOEDGE;
658 			} else if (strncmp(attrs[i].ka_name, "pc", 3) == 0) {
659 				if (attrs[i].ka_val != 0)
660 					ptm_flags |= P5_PC;
661 			} else {
662 				kmem_free(conf, sizeof (ptm_pcbe_config_t));
663 				return (CPC_INVALID_ATTRIBUTE);
664 			}
665 		}
666 
667 		if (flags & CPC_COUNT_USER)
668 			conf->ptm_ctl |= (1 << (CPC_P5_CESR_USR0 + picshift));
669 		if (flags & CPC_COUNT_SYSTEM)
670 			conf->ptm_ctl |= (1 << (CPC_P5_CESR_OS0 + picshift));
671 		if (ptm_flags & P5_NOEDGE)
672 			conf->ptm_ctl |= (1 << (CPC_P5_CESR_CLK0 + picshift));
673 		if (ptm_flags & P5_PC)
674 			conf->ptm_ctl |= (1 << (CPC_P5_CESR_PC0 + picshift));
675 
676 		ASSERT((n->bits | CPC_P5_CESR_ES0_MASK) ==
677 		    CPC_P5_CESR_ES0_MASK);
678 
679 		conf->ptm_ctl |= (n->bits << picshift);
680 	} else {
681 		for (i = 0; i < nattrs; i++) {
682 			if (strncmp(attrs[i].ka_name, "noedge", 6) == 0) {
683 				if (attrs[i].ka_val != 0)
684 					ptm_flags |= P6_NOEDGE;
685 			} else if (strncmp(attrs[i].ka_name, "pc", 2) == 0) {
686 				if (attrs[i].ka_val != 0)
687 					ptm_flags |= P6_PC;
688 			} else if (strncmp(attrs[i].ka_name, "inv", 3) == 0) {
689 				if (attrs[i].ka_val != 0)
690 					ptm_flags |= P6_INV;
691 			} else if (strncmp(attrs[i].ka_name, "umask", 5) == 0) {
692 				if ((attrs[i].ka_val | CPC_P6_PES_UMASK_MASK) !=
693 					CPC_P6_PES_UMASK_MASK) {
694 					kmem_free(conf,
695 					    sizeof (ptm_pcbe_config_t));
696 					return (CPC_ATTRIBUTE_OUT_OF_RANGE);
697 				}
698 				conf->ptm_ctl |= (uint8_t)attrs[i].ka_val <<
699 				    CPC_P6_PES_UMASK_SHIFT;
700 			} else if (strncmp(attrs[i].ka_name, "cmask", 5) == 0) {
701 				if ((attrs[i].ka_val | CPC_P6_PES_CMASK_MASK) !=
702 					CPC_P6_PES_CMASK_MASK) {
703 					kmem_free(conf,
704 					    sizeof (ptm_pcbe_config_t));
705 					return (CPC_ATTRIBUTE_OUT_OF_RANGE);
706 				}
707 				conf->ptm_ctl |= (uint8_t)attrs[i].ka_val <<
708 				    CPC_P6_PES_CMASK_SHIFT;
709 			} else if (strncmp(attrs[i].ka_name, "int", 3) == 0) {
710 				if (attrs[i].ka_val != 0)
711 					ptm_flags |= P6_INT;
712 			} else {
713 				kmem_free(conf, sizeof (ptm_pcbe_config_t));
714 				return (CPC_INVALID_ATTRIBUTE);
715 			}
716 		}
717 
718 		if (flags & CPC_OVF_NOTIFY_EMT)
719 			/*
720 			 * If the user has requested notification of overflows,
721 			 * we automatically program the hardware to generate
722 			 * overflow interrupts.
723 			 */
724 			ptm_flags |= P6_INT;
725 		if (flags & CPC_COUNT_USER)
726 			conf->ptm_ctl |= (1 << CPC_P6_PES_USR);
727 		if (flags & CPC_COUNT_SYSTEM)
728 			conf->ptm_ctl |= (1 << CPC_P6_PES_OS);
729 		if ((ptm_flags & P6_NOEDGE) == 0)
730 			conf->ptm_ctl |= (1 << CPC_P6_PES_E);
731 		if (ptm_flags & P6_PC)
732 			conf->ptm_ctl |= (1 << CPC_P6_PES_PC);
733 		if (ptm_flags & P6_INV)
734 			conf->ptm_ctl |= (1 << CPC_P6_PES_INV);
735 		if (ptm_flags & P6_INT)
736 			conf->ptm_ctl |= (1 << CPC_P6_PES_INT);
737 
738 		ASSERT((n->bits | CPC_P6_PES_PIC0_MASK) ==
739 		    CPC_P6_PES_PIC0_MASK);
740 
741 		conf->ptm_ctl |= n->bits;
742 	}
743 
744 	*data = conf;
745 	return (0);
746 }
747 
748 static void
749 ptm_pcbe_program(void *token)
750 {
751 	ptm_pcbe_config_t	*pic0;
752 	ptm_pcbe_config_t	*pic1;
753 	ptm_pcbe_config_t	*tmp;
754 	ptm_pcbe_config_t	empty = { 1, 0, 0 }; /* assume pic1 to start */
755 
756 	if ((pic0 = kcpc_next_config(token, NULL, NULL)) == NULL)
757 		panic("ptm_pcbe: token %p has no configs", token);
758 
759 	if ((pic1 = kcpc_next_config(token, pic0, NULL)) == NULL)
760 		pic1 = &empty;
761 
762 	if (pic0->ptm_picno != 0) {
763 		empty.ptm_picno = 0;
764 		tmp = pic1;
765 		pic1 = pic0;
766 		pic0 = tmp;
767 	}
768 
769 	ASSERT(pic0->ptm_picno == 0 && pic1->ptm_picno == 1);
770 
771 	if (ptm_rdpmc_avail) {
772 		uint32_t curcr4 = getcr4();
773 		if (kcpc_allow_nonpriv(token))
774 			setcr4(curcr4 | CR4_PCE);
775 		else
776 			setcr4(curcr4 & ~CR4_PCE);
777 	}
778 
779 	if (ptm_ver == PTM_VER_P5) {
780 		wrmsr(P5_CESR, ALL_STOPPED);
781 		wrmsr(P5_CTR0, pic0->ptm_rawpic);
782 		wrmsr(P5_CTR1, pic1->ptm_rawpic);
783 		wrmsr(P5_CESR, pic0->ptm_ctl | pic1->ptm_ctl);
784 		pic0->ptm_rawpic = rdmsr(P5_CTR0);
785 		pic1->ptm_rawpic = rdmsr(P5_CTR1);
786 	} else {
787 		uint64_t	pes;
788 		wrmsr(REG_PERFEVNT0, ALL_STOPPED);
789 		wrmsr(REG_PERFCTR0, pic0->ptm_rawpic);
790 		wrmsr(REG_PERFCTR1, pic1->ptm_rawpic);
791 		pes = pic1->ptm_ctl;
792 		DTRACE_PROBE1(ptm__pes1, uint64_t, pes);
793 		wrmsr(REG_PERFEVNT1, pes);
794 		pes = pic0->ptm_ctl | (1 << CPC_P6_PES_EN);
795 		DTRACE_PROBE1(ptm__pes0, uint64_t, pes);
796 		wrmsr(REG_PERFEVNT0, pes);
797 	}
798 }
799 
800 static void
801 ptm_pcbe_allstop(void)
802 {
803 	if (ptm_ver == PTM_VER_P5)
804 		wrmsr(P5_CESR, ALL_STOPPED);
805 	else {
806 		wrmsr(REG_PERFEVNT0, ALL_STOPPED);
807 		setcr4((uint32_t)getcr4() & ~CR4_PCE);
808 	}
809 }
810 
811 static void
812 ptm_pcbe_sample(void *token)
813 {
814 	ptm_pcbe_config_t	*pic0;
815 	ptm_pcbe_config_t	*pic1;
816 	ptm_pcbe_config_t	*swap;
817 	ptm_pcbe_config_t	empty = { 1, 0, 0 }; /* assume pic1 to start */
818 	uint64_t		tmp;
819 	uint64_t		*pic0_data;
820 	uint64_t		*pic1_data;
821 	uint64_t		*dtmp;
822 	uint64_t		curpic[2];
823 
824 	if ((pic0 = kcpc_next_config(token, NULL, &pic0_data)) == NULL)
825 		panic("ptm_pcbe: token %p has no configs", token);
826 
827 	if ((pic1 = kcpc_next_config(token, pic0, &pic1_data)) == NULL) {
828 		pic1 = &empty;
829 		pic1_data = &tmp;
830 	}
831 
832 	if (pic0->ptm_picno != 0) {
833 		empty.ptm_picno = 0;
834 		swap = pic0;
835 		pic0 = pic1;
836 		pic1 = swap;
837 		dtmp = pic0_data;
838 		pic0_data = pic1_data;
839 		pic1_data = dtmp;
840 	}
841 
842 	ASSERT(pic0->ptm_picno == 0 && pic1->ptm_picno == 1);
843 
844 	if (ptm_ver == PTM_VER_P5) {
845 		curpic[0] = rdmsr(P5_CTR0);
846 		curpic[1] = rdmsr(P5_CTR1);
847 	} else {
848 		curpic[0] = rdmsr(REG_PERFCTR0);
849 		curpic[1] = rdmsr(REG_PERFCTR1);
850 	}
851 
852 	DTRACE_PROBE1(ptm__curpic0, uint64_t, curpic[0]);
853 	DTRACE_PROBE1(ptm__curpic1, uint64_t, curpic[1]);
854 
855 	*pic0_data += diff3931(curpic[0], pic0->ptm_rawpic);
856 	pic0->ptm_rawpic = trunc3931(*pic0_data);
857 
858 	*pic1_data += diff3931(curpic[1], pic1->ptm_rawpic);
859 	pic1->ptm_rawpic = trunc3931(*pic1_data);
860 }
861 
862 static void
863 ptm_pcbe_free(void *config)
864 {
865 	kmem_free(config, sizeof (ptm_pcbe_config_t));
866 }
867 
868 /*
869  * Virtualizes the 40-bit field of the %pic
870  * register into a 64-bit software register.
871  *
872  * We can retrieve 40 (signed) bits from the counters,
873  * but we can set only 32 (signed) bits into the counters.
874  * This makes virtualizing more than 31-bits of registers
875  * quite tricky.
876  *
877  * If bits 39 to 31 are set in the virtualized pic register,
878  * then we can preset the counter to this value using the fact
879  * that wrmsr sign extends bit 31.   Though it might look easier
880  * to only use the bottom 31-bits of the register, we have to allow
881  * the full 40-bits to be used to perform overflow profiling.
882  */
883 
#define	MASK40		UINT64_C(0xffffffffff)
#define	MASK31		UINT64_C(0x7fffffff)
#define	BITS_39_31	UINT64_C(0xff80000000)

/*
 * Return how far the counter has moved from 'old' to 'sample'.
 *
 * When bits 39 to 31 of 'old' are all set, the comparison is made over
 * 40 bits; otherwise over 31 bits.  An apparently negative distance means
 * the counter wrapped within that width, and is corrected by adding the
 * width's modulus.
 */
static int64_t
diff3931(uint64_t sample, uint64_t old)
{
	const int	wide = ((old & BITS_39_31) == BITS_39_31);
	const uint64_t	mask = wide ? MASK40 : MASK31;
	const uint64_t	modulus = UINT64_C(1) << (wide ? 40 : 31);
	int64_t		delta;

	delta = (mask & sample) - old;
	if (delta < 0)
		delta += modulus;

	return (delta);
}

/*
 * Reduce a 64-bit virtualized value to what is kept as the raw counter
 * value: the low 40 bits when bits 39 to 31 are all set, the low 31 bits
 * otherwise.
 */
static uint64_t
trunc3931(uint64_t value)
{
	const uint64_t keep =
	    ((value & BITS_39_31) == BITS_39_31) ? MASK40 : MASK31;

	return (value & keep);
}
912 
/*
 * Module linkage: describes this module to the loader as a performance
 * counter backend and points it at the ptm_pcbe_ops vector.
 */
static struct modlpcbe modlpcbe = {
	&mod_pcbeops,
	"Pentium Performance Counters v%I%",
	&ptm_pcbe_ops
};

/* Linkage list passed to mod_install(), mod_remove() and mod_info(). */
static struct modlinkage modl = {
	MODREV_1,
	&modlpcbe,
};
923 
924 int
925 _init(void)
926 {
927 	if (ptm_pcbe_init() != 0)
928 		return (ENOTSUP);
929 	return (mod_install(&modl));
930 }
931 
932 int
933 _fini(void)
934 {
935 	return (mod_remove(&modl));
936 }
937 
938 int
939 _info(struct modinfo *mi)
940 {
941 	return (mod_info(&modl, mi));
942 }
943