/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" /* * Performance Counter Back-End for Pentium 4. */ #include <sys/cpuvar.h> #include <sys/param.h> #include <sys/cpc_impl.h> #include <sys/cpc_pcbe.h> #include <sys/inttypes.h> #include <sys/errno.h> #include <sys/systm.h> #include <sys/archsystm.h> #include <sys/x86_archext.h> #include <sys/modctl.h> #include <sys/sdt.h> #include <sys/cred.h> #include <sys/policy.h> #include <sys/privregs.h> static int p4_pcbe_init(void); static uint_t p4_pcbe_ncounters(void); static const char *p4_pcbe_impl_name(void); static const char *p4_pcbe_cpuref(void); static char *p4_pcbe_list_events(uint_t picnum); static char *p4_pcbe_list_attrs(void); static uint64_t p4_pcbe_event_coverage(char *event); static uint64_t p4_pcbe_overflow_bitmap(void); static int p4_pcbe_configure(uint_t picnum, char *event, uint64_t preset, uint32_t flags, uint_t nattrs, kcpc_attr_t *attrs, void **data, void *token); static void p4_pcbe_program(void *token); static void p4_pcbe_allstop(void); static void p4_pcbe_sample(void *token); static void p4_pcbe_free(void *config); extern int cpuid_get_clogid(cpu_t *); static pcbe_ops_t p4_pcbe_ops = { PCBE_VER_1, CPC_CAP_OVERFLOW_INTERRUPT | CPC_CAP_OVERFLOW_PRECISE, p4_pcbe_ncounters, p4_pcbe_impl_name, p4_pcbe_cpuref, p4_pcbe_list_events, p4_pcbe_list_attrs, p4_pcbe_event_coverage, p4_pcbe_overflow_bitmap, p4_pcbe_configure, p4_pcbe_program, p4_pcbe_allstop, p4_pcbe_sample, p4_pcbe_free }; /* * P4 Configuration Flags. */ #define P4_THIS_USR 0x1 /* HTT: Measure usr events on this logical CPU */ #define P4_THIS_SYS 0x2 /* HTT: Measure os events on this logical CPU */ #define P4_SIBLING_USR 0x4 /* HTT: Measure os events on other logical CPU */ #define P4_SIBLING_SYS 0x8 /* HTT: Measure usr events on other logical CPU */ #define P4_PMI 0x10 /* HTT: Set PMI bit for local logical CPU */ typedef struct _p4_pcbe_config { uint8_t p4_flags; uint8_t p4_picno; /* From 0 to 18 */ uint8_t p4_escr_ndx; /* Which ESCR to use */ uint32_t p4_escr; /* Value to program in selected ESCR */ uint32_t p4_cccr; /* Value to program in counter's CCCR */ uint64_t p4_rawpic; } p4_pcbe_config_t; typedef uint32_t cntr_map_t; typedef struct _p4_escr { int pe_num; uint32_t pe_addr; uint32_t pe_map; /* bitmap of counters; bit 1 means ctr 0 */ } p4_escr_t; #define MASK40 UINT64_C(0xffffffffff) /* * CCCR field definitions. * * Note that the Intel Developer's Manual states that the reserved field at * bit location 16 and 17 must be set to 11. (??) */ #define CCCR_ENABLE_SHIFT 12 #define CCCR_ESCR_SEL_SHIFT 13 #define CCCR_ACTV_THR_SHIFT 16 #define CCCR_COMPARE_SHIFT 18 #define CCCR_COMPLEMENT_SHIFT 19 #define CCCR_THRESHOLD_SHIFT 20 #define CCCR_EDGE_SHIFT 24 #define CCCR_OVF_PMI_SHIFT 26 #define CCCR_OVF_PMI_T0_SHIFT 26 #define CCCR_OVF_PMI_T1_SHIFT 27 #define CCCR_OVF_SHIFT 31 #define CCCR_ACTV_THR_MASK 0x3 #define CCCR_THRESHOLD_MAX 0xF #define CCCR_ENABLE (1U << CCCR_ENABLE_SHIFT) #define CCCR_COMPARE (1U << CCCR_COMPARE_SHIFT) #define CCCR_COMPLEMENT (1U << CCCR_COMPLEMENT_SHIFT) #define CCCR_EDGE (1U << CCCR_EDGE_SHIFT) #define CCCR_OVF_PMI (1U << CCCR_OVF_PMI_SHIFT) #define CCCR_OVF_PMI_T0 (1U << CCCR_OVF_PMI_T0_SHIFT) #define CCCR_OVF_PMI_T1 (1U << CCCR_OVF_PMI_T1_SHIFT) #define CCCR_INIT CCCR_ENABLE #define CCCR_OVF (1U << CCCR_OVF_SHIFT) #define ESCR_EVSEL_SHIFT 25 #define ESCR_EVMASK_SHIFT 9 #define ESCR_TAG_VALUE_SHIFT 5 #define ESCR_TAG_VALUE_MAX 0xF #define ESCR_TAG_ENABLE_SHIFT 4 #define ESCR_USR_SHIFT 2 #define ESCR_OS_SHIFT 3 #define ESCR_USR (1U << ESCR_USR_SHIFT) #define ESCR_OS (1U << ESCR_OS_SHIFT) #define ESCR_TAG_ENABLE (1U << ESCR_TAG_ENABLE_SHIFT) /* * HyperThreaded ESCR fields. */ #define ESCR_T0_OS_SHIFT 3 #define ESCR_T0_USR_SHIFT 2 #define ESCR_T1_OS_SHIFT 1 #define ESCR_T1_USR_SHIFT 0 #define ESCR_T0_OS (1U << ESCR_T0_OS_SHIFT) #define ESCR_T0_USR (1U << ESCR_T0_USR_SHIFT) #define ESCR_T1_OS (1U << ESCR_T1_OS_SHIFT) #define ESCR_T1_USR (1U << ESCR_T1_USR_SHIFT) /* * ESCRs are grouped by counter; each group of ESCRs is associated with a * distinct group of counters. Use these macros to fill in the table below. */ #define BPU0_map (0x1 | 0x2) /* Counters 0 and 1 */ #define BPU2_map (0x4 | 0x8) /* Counters 2 and 3 */ #define MS0_map (0x10 | 0x20) /* Counters 4 and 5 */ #define MS2_map (0x40 | 0x80) /* Counters 6 and 7 */ #define FLAME0_map (0x100 | 0x200) /* Counters 8 and 9 */ #define FLAME2_map (0x400 | 0x800) /* Counters 10 and 11 */ #define IQ0_map (0x1000 | 0x2000 | 0x10000) /* Counters 12, 13, 16 */ #define IQ2_map (0x4000 | 0x8000 | 0x20000) /* Counters 14, 15, 17 */ /* * Table describing the 45 Event Selection and Control Registers (ESCRs). */ const p4_escr_t p4_escrs[] = { #define BPU0 (1) { 0, 0x3B2, BPU0_map }, /* 0 */ #define IS0 (1ULL << 1) { 1, 0x3B4, BPU0_map }, /* 1 */ #define MOB0 (1ULL << 2) { 2, 0x3AA, BPU0_map }, /* 2 */ #define ITLB0 (1ULL << 3) { 3, 0x3B6, BPU0_map }, /* 3 */ #define PMH0 (1ULL << 4) { 4, 0x3AC, BPU0_map }, /* 4 */ #define IX0 (1ULL << 5) { 5, 0x3C8, BPU0_map }, /* 5 */ #define FSB0 (1ULL << 6) { 6, 0x3A2, BPU0_map }, /* 6 */ #define BSU0 (1ULL << 7) { 7, 0x3A0, BPU0_map }, /* 7 */ #define BPU1 (1ULL << 8) { 0, 0x3B3, BPU2_map }, /* 8 */ #define IS1 (1ULL << 9) { 1, 0x3B5, BPU2_map }, /* 9 */ #define MOB1 (1ULL << 10) { 2, 0x3AB, BPU2_map }, /* 10 */ #define ITLB1 (1ULL << 11) { 3, 0x3B7, BPU2_map }, /* 11 */ #define PMH1 (1ULL << 12) { 4, 0x3AD, BPU2_map }, /* 12 */ #define IX1 (1ULL << 13) { 5, 0x3C9, BPU2_map }, /* 13 */ #define FSB1 (1ULL << 14) { 6, 0x3A3, BPU2_map }, /* 14 */ #define BSU1 (1ULL << 15) { 7, 0x3A1, BPU2_map }, /* 15 */ #define MS0 (1ULL << 16) { 0, 0x3C0, MS0_map }, /* 16 */ #define TC0 (1ULL << 17) { 1, 0x3C4, MS0_map }, /* 17 */ #define TBPU0 (1ULL << 18) { 2, 0x3C2, MS0_map }, /* 18 */ #define MS1 (1ULL << 19) { 0, 0x3C1, MS2_map }, /* 19 */ #define TC1 (1ULL << 20) { 1, 0x3C5, MS2_map }, /* 20 */ #define TBPU1 (1ULL << 21) { 2, 0x3C3, MS2_map }, /* 21 */ #define FLAME0 (1ULL << 22) { 0, 0x3A6, FLAME0_map }, /* 22 */ #define FIRM0 (1ULL << 23) { 1, 0x3A4, FLAME0_map }, /* 23 */ #define SAAT0 (1ULL << 24) { 2, 0x3AE, FLAME0_map }, /* 24 */ #define U2L0 (1ULL << 25) { 3, 0x3B0, FLAME0_map }, /* 25 */ #define DAC0 (1ULL << 26) { 5, 0x3A8, FLAME0_map }, /* 26 */ #define FLAME1 (1ULL << 27) { 0, 0x3A7, FLAME2_map }, /* 27 */ #define FIRM1 (1ULL << 28) { 1, 0x3A5, FLAME2_map }, /* 28 */ #define SAAT1 (1ULL << 29) { 2, 0x3AF, FLAME2_map }, /* 29 */ #define U2L1 (1ULL << 30) { 3, 0x3B1, FLAME2_map }, /* 30 */ #define DAC1 (1ULL << 31) { 5, 0x3A9, FLAME2_map }, /* 31 */ #define IQ0 (1ULL << 32) { 0, 0x3BA, IQ0_map }, /* 32 */ #define ALF0 (1ULL << 33) { 1, 0x3CA, IQ0_map }, /* 33 */ #define RAT0 (1ULL << 34) { 2, 0x3BC, IQ0_map }, /* 34 */ #define SSU0 (1ULL << 35) { 3, 0x3BE, IQ0_map }, /* 35 */ #define CRU0 (1ULL << 36) { 4, 0x3B8, IQ0_map }, /* 36 */ #define CRU2 (1ULL << 37) { 5, 0x3CC, IQ0_map }, /* 37 */ #define CRU4 (1ULL << 38) { 6, 0x3E0, IQ0_map }, /* 38 */ #define IQ1 (1ULL << 39) { 0, 0x3BB, IQ2_map }, /* 39 */ #define ALF1 (1ULL << 40) { 1, 0x3CB, IQ2_map }, /* 40 */ #define RAT1 (1ULL << 41) { 2, 0x3BD, IQ2_map }, /* 41 */ #define CRU1 (1ULL << 42) { 4, 0x3B9, IQ2_map }, /* 42 */ #define CRU3 (1ULL << 43) { 5, 0x3CD, IQ2_map }, /* 43 */ #define CRU5 (1ULL << 44) { 6, 0x3E1, IQ2_map } /* 44 */ }; #define ESCR_MAX_INDEX 44 typedef struct _p4_ctr { uint32_t pc_caddr; /* counter MSR address */ uint32_t pc_ctladdr; /* counter's CCCR MSR address */ uint64_t pc_map; /* bitmap of ESCRs controlling ctr */ } p4_ctr_t; const p4_ctr_t p4_ctrs[18] = { { /* BPU_COUNTER0 */ 0x300, 0x360, BSU0|FSB0|MOB0|PMH0|BPU0|IS0|ITLB0|IX0}, { /* BPU_COUNTER1 */ 0x301, 0x361, BSU0|FSB0|MOB0|PMH0|BPU0|IS0|ITLB0|IX0}, { /* BPU_COUNTER2 */ 0x302, 0x362, BSU1|FSB1|MOB1|PMH1|BPU1|IS1|ITLB1|IX1}, { /* BPU_COUNTER3 */ 0x303, 0x363, BSU1|FSB1|MOB1|PMH1|BPU1|IS1|ITLB1|IX1}, { /* MS_COUNTER0 */ 0x304, 0x364, MS0|TBPU0|TC0 }, { /* MS_COUNTER1 */ 0x305, 0x365, MS0|TBPU0|TC0 }, { /* MS_COUNTER2 */ 0x306, 0x366, MS1|TBPU1|TC1 }, { /* MS_COUNTER3 */ 0x307, 0x367, MS1|TBPU1|TC1 }, { /* FLAME_COUNTER0 */ 0x308, 0x368, FIRM0|FLAME0|DAC0|SAAT0|U2L0 }, { /* FLAME_COUNTER1 */ 0x309, 0x369, FIRM0|FLAME0|DAC0|SAAT0|U2L0 }, { /* FLAME_COUNTER2 */ 0x30A, 0x36A, FIRM1|FLAME1|DAC1|SAAT1|U2L1 }, { /* FLAME_COUNTER3 */ 0x30B, 0x36B, FIRM1|FLAME1|DAC1|SAAT1|U2L1 }, { /* IQ_COUNTER0 */ 0x30C, 0x36C, CRU0|CRU2|CRU4|IQ0|RAT0|SSU0|ALF0 }, { /* IQ_COUNTER1 */ 0x30D, 0x36D, CRU0|CRU2|CRU4|IQ0|RAT0|SSU0|ALF0 }, { /* IQ_COUNTER2 */ 0x30E, 0x36E, CRU1|CRU3|CRU5|IQ1|RAT1|ALF1 }, { /* IQ_COUNTER3 */ 0x30F, 0x36F, CRU1|CRU3|CRU5|IQ1|RAT1|ALF1 }, { /* IQ_COUNTER4 */ 0x310, 0x370, CRU0|CRU2|CRU4|IQ0|RAT0|SSU0|ALF0 }, { /* IQ_COUNTER5 */ 0x311, 0x371, CRU1|CRU3|CRU5|IQ1|RAT1|ALF1 } }; typedef struct _p4_event { char *pe_name; /* Name of event according to docs */ uint64_t pe_escr_map; /* Bitmap of ESCRs capable of event */ uint32_t pe_escr_mask; /* permissible ESCR event mask */ uint8_t pe_ev; /* ESCR event select value */ uint16_t pe_cccr; /* CCCR select value */ uint32_t pe_ctr_mask; /* Bitmap of capable counters */ } p4_event_t; #define C(n) (1 << n) p4_event_t p4_events[] = { { "branch_retired", CRU2|CRU3, 0xF, 0x6, 0x5, C(12)|C(13)|C(14)|C(15)|C(16) }, { "mispred_branch_retired", CRU0|CRU1, 0x1, 0x3, 0x4, C(12)|C(13)|C(14)|C(15)|C(16) }, { "TC_deliver_mode", TC0|TC1, 0xFF, 0x1, 0x1, C(4)|C(5)|C(6)|C(7) }, { "BPU_fetch_request", BPU0|BPU1, 0x1, 0x3, 0x0, C(0)|C(1)|C(2)|C(3) }, { "ITLB_reference", ITLB0|ITLB1, 0x7, 0x18, 0x3, C(0)|C(1)|C(2)|C(3) }, { "memory_cancel", DAC0|DAC1, 0x6, 0x2, 0x5, C(8)|C(9)|C(10)|C(11) }, { "memory_complete", SAAT0|SAAT1, 0x3, 0x8, 0x2, C(8)|C(9)|C(10)|C(11) }, { "load_port_replay", SAAT0|SAAT1, 0x1, 0x4, 0x2, C(8)|C(9)|C(10)|C(11) }, { "store_port_replay", SAAT0|SAAT1, 0x1, 0x5, 0x2, C(8)|C(9)|C(10)|C(11) }, { "MOB_load_replay", MOB0|MOB1, 0x35, 0x3, 0x2, C(0)|C(1)|C(2)|C(3) }, { "page_walk_type", PMH0|PMH1, 0x3, 0x1, 0x4, C(0)|C(1)|C(2)|C(3) }, { "BSQ_cache_reference", BSU0|BSU1, 0x73F, 0xC, 0x7, C(0)|C(1)|C(2)|C(3) }, { "IOQ_allocation", FSB0, 0xEFFF, 0x3, 0x6, C(0)|C(1) }, { "IOQ_active_entries", FSB1, 0xEFFF, 0x1A, 0x6, C(2)|C(3) }, { "FSB_data_activity", FSB0|FSB1, 0x3F, 0x17, 0x6, C(0)|C(1)|C(2)|C(3) }, { "BSQ_allocation", BSU0, 0x3FEF, 0x5, 0x7, C(0)|C(1) }, { "bsq_active_entries", BSU1, 0x3FEF, 0x6, 0x7, C(2)|C(3) }, { "x87_assist", CRU2|CRU3, 0x1F, 0x3, 0x5, C(12)|C(13)|C(14)|C(15)|C(16)|C(17)}, { "SSE_input_assist", FIRM0|FIRM1, 0x8000, 0x34, 0x1, C(8)|C(9)|C(10)|C(11) }, { "packed_SP_uop", FIRM0|FIRM1, 0x8000, 0x8, 0x1, C(8)|C(9)|C(10)|C(11) }, { "packed_DP_uop", FIRM0|FIRM1, 0x8000, 0xC, 0x1, C(8)|C(9)|C(10)|C(11) }, { "scalar_SP_uop", FIRM0|FIRM1, 0x8000, 0xA, 0x1, C(8)|C(9)|C(10)|C(11) }, { "scalar_DP_uop", FIRM0|FIRM1, 0x8000, 0xE, 0x1, C(8)|C(9)|C(10)|C(11) }, { "64bit_MMX_uop", FIRM0|FIRM1, 0x8000, 0x2, 0x1, C(8)|C(9)|C(10)|C(11) }, { "128bit_MMX_uop", FIRM0|FIRM1, 0x8000, 0x1A, 0x1, C(8)|C(9)|C(10)|C(11) }, { "x87_FP_uop", FIRM0|FIRM1, 0x8000, 0x4, 0x1, C(8)|C(9)|C(10)|C(11) }, { "x87_SIMD_moves_uop", FIRM0|FIRM1, 0x18, 0x2E, 0x1, C(8)|C(9)|C(10)|C(11) }, { "machine_clear", CRU2|CRU3, 0xD, 0x2, 0x5, C(12)|C(13)|C(14)|C(15)|C(16)|C(17)}, { "global_power_events", FSB0|FSB1, 0x1, 0x5, 0x6, C(0)|C(1)|C(2)|C(3) }, { "tc_ms_xfer", MS0|MS1, 0x1, 0x5, 0x0, C(4)|C(5)|C(6)|C(7) }, { "uop_queue_writes", MS0|MS1, 0x7, 0x9, 0x0, C(4)|C(5)|C(6)|C(7) }, { "front_end_event", CRU2|CRU3, 0x3, 0x8, 0x5, C(12)|C(13)|C(14)|C(15)|C(16)|C(17)}, { "execution_event", CRU2|CRU3, 0xFF, 0xC, 0x5, C(12)|C(13)|C(14)|C(15)|C(16)|C(17)}, { "replay_event", CRU2|CRU3, 0x3, 0x9, 0x5, C(12)|C(13)|C(14)|C(15)|C(16)|C(17)}, { "instr_retired", CRU0|CRU1, 0xF, 0x2, 0x4, C(12)|C(13)|C(14)|C(15)|C(16)|C(17)}, { "uops_retired", CRU0|CRU1, 0x3, 0x1, 0x4, C(12)|C(13)|C(14)|C(15)|C(16)|C(17)}, { "uop_type", RAT0|RAT1, 0x3, 0x2, 0x2, C(12)|C(13)|C(14)|C(15)|C(16)|C(17)}, { "retired_mispred_branch_type", TBPU0|TBPU1, 0x1F, 0x5, 0x2, C(4)|C(5)|C(6)|C(7)}, { "retired_branch_type", TBPU0|TBPU1, 0x1F, 0x4, 0x2, C(4)|C(5)|C(6)|C(7) }, { NULL, 0, 0, 0, 0 } }; /* * Indicates whether the "rdpmc" instruction is available on this processor. */ static int p4_rdpmc_avail = 0; static const uint64_t p4_cccrstop = 0; static char *p4_eventlist[18]; /* * If set, this processor has HyperThreading. */ static int p4_htt = 0; #define P4_FAMILY 0xF static int p4_pcbe_init(void) { int i; size_t size; p4_event_t *ev; /* * If we're not running on a P4, refuse to load. */ if (cpuid_getvendor(CPU) != X86_VENDOR_Intel || cpuid_getfamily(CPU) != P4_FAMILY) return (-1); /* * Set up the event lists for each counter. * * First pass calculates the size of the event list, and the second * pass copies each event name into the event list. */ for (i = 0; i < 18; i++) { size = 0; for (ev = p4_events; ev->pe_name != NULL; ev++) { if (ev->pe_ctr_mask & C(i)) size += strlen(ev->pe_name) + 1; } /* * We use 'size + 1' here to ensure room for the final * strcat when it terminates the string. */ p4_eventlist[i] = (char *)kmem_alloc(size + 1, KM_SLEEP); *p4_eventlist[i] = '\0'; for (ev = p4_events; ev->pe_name != NULL; ev++) { if (ev->pe_ctr_mask & C(i)) { (void) strcat(p4_eventlist[i], ev->pe_name); (void) strcat(p4_eventlist[i], ","); } } /* * Remove trailing ',' */ p4_eventlist[i][size - 1] = '\0'; } if (x86_feature & X86_MMX) p4_rdpmc_avail = 1; /* * The X86_HTT flag may disappear soon, so we'll isolate the impact of * its demise to the following if(). */ if (x86_feature & X86_HTT) p4_htt = 1; return (0); } static uint_t p4_pcbe_ncounters(void) { return (18); } static const char * p4_pcbe_impl_name(void) { if (p4_htt) return ("Pentium 4 with HyperThreading"); return ("Pentium 4"); } static const char * p4_pcbe_cpuref(void) { return ("See Appendix A.1 of the \"IA-32 Intel Architecture Software " \ "Developer's Manual Volume 3: System Programming Guide,\" " \ "Order # 245472-012, 2003"); } static char * p4_pcbe_list_events(uint_t picnum) { ASSERT(picnum >= 0 && picnum < 18); return (p4_eventlist[picnum]); } #define P4_ATTRS "emask,tag,compare,complement,threshold,edge" static char * p4_pcbe_list_attrs(void) { if (p4_htt) return (P4_ATTRS ",active_thread,count_sibling_usr," "count_sibling_sys"); return (P4_ATTRS); } static uint64_t p4_pcbe_event_coverage(char *event) { p4_event_t *ev; for (ev = p4_events; ev->pe_name != NULL; ev++) { if (strcmp(event, ev->pe_name) == 0) break; } return (ev->pe_ctr_mask); } static uint64_t p4_pcbe_overflow_bitmap(void) { extern int kcpc_hw_overflow_intr_installed; uint64_t ret = 0; int i; /* * The CCCR's OVF bit indicates that the corresponding counter has * overflowed. It must be explicitly cleared by software, so it is * safe to read the CCCR values here. */ for (i = 0; i < 18; i++) { if (rdmsr(p4_ctrs[i].pc_ctladdr) & CCCR_OVF) ret |= (1 << i); } /* * Pentium 4 and Xeon turn off the CPC interrupt mask bit in the LVT at * every overflow. Turn it back on here. */ ASSERT(kcpc_hw_overflow_intr_installed); (*kcpc_hw_enable_cpc_intr)(); return (ret); } static int p4_escr_inuse(p4_pcbe_config_t **cfgs, int escr_ndx) { int i; for (i = 0; i < 18; i++) { if (cfgs[i] == NULL) continue; if (cfgs[i]->p4_escr_ndx == escr_ndx) return (1); } return (0); } static void build_cfgs(p4_pcbe_config_t *cfgs[18], uint64_t *data[18], void *token) { p4_pcbe_config_t *cfg = NULL; uint64_t *daddr; bzero(cfgs, 18 * sizeof (p4_pcbe_config_t *)); do { cfg = (p4_pcbe_config_t *)kcpc_next_config(token, cfg, &daddr); if (cfg != NULL) { ASSERT(cfg->p4_picno < 18); cfgs[cfg->p4_picno] = cfg; if (data != NULL) { ASSERT(daddr != NULL); data[cfg->p4_picno] = daddr; } } } while (cfg != NULL); } /* * Programming a counter: * * Select event. * Choose an ESCR capable of counting that event. * Set up the ESCR with the desired parameters (usr, sys, tag). * Set up the CCCR to point to the selected ESCR. * Set the CCCR parameters (overflow, cascade, edge, etc). */ static int p4_pcbe_configure(uint_t picnum, char *eventname, uint64_t preset, uint32_t flags, uint_t nattrs, kcpc_attr_t *attrs, void **data, void *token) { p4_pcbe_config_t *cfgs[18]; p4_pcbe_config_t *cfg; p4_event_t *ev; int escr_ndx; int i; uint16_t emask = 0; uint8_t tag; int use_tag = 0; int active_thread = 0x3; /* default is "any" */ int compare = 0; int complement = 0; int threshold = 0; int edge = 0; int sibling_usr = 0; /* count usr on other cpu */ int sibling_sys = 0; /* count sys on other cpu */ /* * If we've been handed an existing configuration, we need only preset * the counter value. */ if (*data != NULL) { cfg = *data; cfg->p4_rawpic = preset & MASK40; return (0); } if (picnum < 0 || picnum >= 18) return (CPC_INVALID_PICNUM); for (ev = p4_events; ev->pe_name != NULL; ev++) { if (strcmp(eventname, ev->pe_name) == 0) break; } if (ev->pe_name == NULL) return (CPC_INVALID_EVENT); build_cfgs(cfgs, NULL, token); /* * Find an ESCR capable of counting this event. */ for (escr_ndx = 0; escr_ndx < ESCR_MAX_INDEX; escr_ndx++) { if ((ev->pe_escr_map & (1ULL << escr_ndx)) && p4_escr_inuse(cfgs, escr_ndx) == 0) break; } /* * All ESCRs capable of counting this event are already being * used. */ if (escr_ndx == ESCR_MAX_INDEX) return (CPC_RESOURCE_UNAVAIL); /* * At this point, ev points to the desired event and escr is the index * of a capable and available ESCR. * * Now process and verify the attributes. */ for (i = 0; i < nattrs; i++) { if (strcmp("emask", attrs[i].ka_name) == 0) { if ((attrs[i].ka_val | ev->pe_escr_mask) != ev->pe_escr_mask) return (CPC_ATTRIBUTE_OUT_OF_RANGE); emask = attrs[i].ka_val; continue; } else if (strcmp("tag", attrs[i].ka_name) == 0) { if (attrs[i].ka_val > ESCR_TAG_VALUE_MAX) return (CPC_ATTRIBUTE_OUT_OF_RANGE); tag = attrs[i].ka_val; use_tag = 1; continue; } else if (strcmp("compare", attrs[i].ka_name) == 0) { if (attrs[i].ka_val != 0) compare = 1; continue; } else if (strcmp("complement", attrs[i].ka_name) == 0) { if (attrs[i].ka_val != 0) complement = 1; continue; } else if (strcmp("threshold", attrs[i].ka_name) == 0) { if (attrs[i].ka_val > CCCR_THRESHOLD_MAX) return (CPC_ATTRIBUTE_OUT_OF_RANGE); threshold = attrs[i].ka_val; continue; } else if (strcmp("edge", attrs[i].ka_name) == 0) { if (attrs[i].ka_val != 0) edge = 1; continue; } /* * The remaining attributes are valid only on HyperThreaded P4s * for processes with the "cpc_cpu" privilege. */ if (p4_htt == 0) return (CPC_INVALID_ATTRIBUTE); if (secpolicy_cpc_cpu(crgetcred()) != 0) return (CPC_ATTR_REQUIRES_PRIVILEGE); if (strcmp("active_thread", attrs[i].ka_name) == 0) { if ((attrs[i].ka_val | CCCR_ACTV_THR_MASK) != CCCR_ACTV_THR_MASK) return (CPC_ATTRIBUTE_OUT_OF_RANGE); active_thread = (int)attrs[i].ka_val; } else if (strcmp("count_sibling_usr", attrs[i].ka_name) == 0) { if (attrs[i].ka_val != 0) sibling_usr = 1; } else if (strcmp("count_sibling_sys", attrs[i].ka_name) == 0) { if (attrs[i].ka_val != 0) sibling_sys = 1; } else return (CPC_INVALID_ATTRIBUTE); } /* * Make sure the counter can count this event */ if ((ev->pe_ctr_mask & C(picnum)) == 0) return (CPC_PIC_NOT_CAPABLE); /* * Find an ESCR that lines up with the event _and_ the counter. */ for (escr_ndx = 0; escr_ndx < ESCR_MAX_INDEX; escr_ndx++) { if ((ev->pe_escr_map & (1ULL << escr_ndx)) && (p4_escrs[escr_ndx].pe_map & (1 << picnum)) && p4_escr_inuse(cfgs, escr_ndx) == 0) break; } if (escr_ndx == ESCR_MAX_INDEX) return (CPC_RESOURCE_UNAVAIL); cfg = (p4_pcbe_config_t *)kmem_alloc(sizeof (p4_pcbe_config_t), KM_SLEEP); cfg->p4_flags = 0; cfg->p4_picno = picnum; cfg->p4_escr_ndx = escr_ndx; cfg->p4_escr = (ev->pe_ev << ESCR_EVSEL_SHIFT) | (emask << ESCR_EVMASK_SHIFT); if (use_tag == 1) { cfg->p4_escr |= tag << ESCR_TAG_VALUE_SHIFT; cfg->p4_escr |= ESCR_TAG_ENABLE; } if (p4_htt) { /* * This is a HyperThreaded P4. Since we don't know which * logical CPU this configuration will eventually be programmed * on, we can't yet decide which fields of the ESCR to select. * * Record the necessary information in the flags for later. */ if (flags & CPC_COUNT_USER) cfg->p4_flags |= P4_THIS_USR; if (flags & CPC_COUNT_SYSTEM) cfg->p4_flags |= P4_THIS_SYS; if (p4_htt && sibling_usr) cfg->p4_flags |= P4_SIBLING_USR; if (p4_htt && sibling_sys) cfg->p4_flags |= P4_SIBLING_SYS; } else { /* * This is not HyperThreaded, so we can determine the exact * ESCR value necessary now. */ if (flags & CPC_COUNT_USER) cfg->p4_escr |= ESCR_USR; if (flags & CPC_COUNT_SYSTEM) cfg->p4_escr |= ESCR_OS; } cfg->p4_rawpic = preset & MASK40; /* * Even on non-HT P4s, Intel states the active_thread field (marked as * "reserved" for the non-HT chips) must be set to all 1s. */ cfg->p4_cccr = CCCR_INIT | (active_thread << CCCR_ACTV_THR_SHIFT); if (compare) cfg->p4_cccr |= CCCR_COMPARE; if (complement) cfg->p4_cccr |= CCCR_COMPLEMENT; cfg->p4_cccr |= threshold << CCCR_THRESHOLD_SHIFT; if (edge) cfg->p4_cccr |= CCCR_EDGE; cfg->p4_cccr |= p4_escrs[cfg->p4_escr_ndx].pe_num << CCCR_ESCR_SEL_SHIFT; if (flags & CPC_OVF_NOTIFY_EMT) { if (p4_htt) cfg->p4_flags |= P4_PMI; else { /* * If the user has asked for notification of overflows, * we automatically program the hardware to generate an * interrupt on overflow. * * This can only be programmed now if this P4 doesn't * have HyperThreading. If it does, we must wait until * we know which logical CPU we'll be programming. */ cfg->p4_cccr |= CCCR_OVF_PMI; } } *data = cfg; return (0); } static void p4_pcbe_program(void *token) { int i; uint64_t cccr; p4_pcbe_config_t *cfgs[18]; p4_pcbe_allstop(); build_cfgs(cfgs, NULL, token); if (p4_rdpmc_avail) { uint32_t curcr4 = getcr4(); if (kcpc_allow_nonpriv(token)) setcr4(curcr4 | CR4_PCE); else setcr4(curcr4 & ~CR4_PCE); } /* * Ideally we would start all counters with a single operation, but in * P4 each counter is enabled individually via its CCCR. To minimize the * probe effect of enabling the counters, we do it in two passes: the * first programs the counter and ESCR, and the second programs the * CCCR (and thus enables the counter). */ if (p4_htt) { int lid = cpuid_get_clogid(CPU); /* Logical ID of CPU */ for (i = 0; i < 18; i++) { uint64_t escr; if (cfgs[i] == NULL) continue; escr = (uint64_t)cfgs[i]->p4_escr; if (cfgs[i]->p4_flags & P4_THIS_USR) escr |= (lid == 0) ? ESCR_T0_USR : ESCR_T1_USR; if (cfgs[i]->p4_flags & P4_THIS_SYS) escr |= (lid == 0) ? ESCR_T0_OS : ESCR_T1_OS; if (cfgs[i]->p4_flags & P4_SIBLING_USR) escr |= (lid == 0) ? ESCR_T1_USR : ESCR_T0_USR; if (cfgs[i]->p4_flags & P4_SIBLING_SYS) escr |= (lid == 0) ? ESCR_T1_OS : ESCR_T0_OS; wrmsr(p4_ctrs[i].pc_caddr, cfgs[i]->p4_rawpic); wrmsr(p4_escrs[cfgs[i]->p4_escr_ndx].pe_addr, escr); } for (i = 0; i < 18; i++) { if (cfgs[i] == NULL) continue; cccr = (uint64_t)cfgs[i]->p4_cccr; /* * We always target the overflow interrupt at the * logical CPU which is doing the counting. */ if (cfgs[i]->p4_flags & P4_PMI) cccr |= (lid == 0) ? CCCR_OVF_PMI_T0 : CCCR_OVF_PMI_T1; wrmsr(p4_ctrs[i].pc_ctladdr, cccr); } } else { for (i = 0; i < 18; i++) { if (cfgs[i] == NULL) continue; wrmsr(p4_ctrs[i].pc_caddr, cfgs[i]->p4_rawpic); wrmsr(p4_escrs[cfgs[i]->p4_escr_ndx].pe_addr, (uint64_t)cfgs[i]->p4_escr); } for (i = 0; i < 18; i++) { if (cfgs[i] == NULL) continue; wrmsr(p4_ctrs[i].pc_ctladdr, (uint64_t)cfgs[i]->p4_cccr); } } } static void p4_pcbe_allstop(void) { int i; for (i = 0; i < 18; i++) wrmsr(p4_ctrs[i].pc_ctladdr, 0ULL); setcr4(getcr4() & ~CR4_PCE); } static void p4_pcbe_sample(void *token) { p4_pcbe_config_t *cfgs[18]; uint64_t *addrs[18]; uint64_t curpic[18]; int64_t diff; int i; for (i = 0; i < 18; i++) curpic[i] = rdmsr(p4_ctrs[i].pc_caddr); build_cfgs(cfgs, addrs, token); for (i = 0; i < 18; i++) { if (cfgs[i] == NULL) continue; diff = curpic[i] - cfgs[i]->p4_rawpic; if (diff < 0) diff += (1ll << 40); *addrs[i] += diff; DTRACE_PROBE4(p4__pcbe__sample, int, i, uint64_t, *addrs[i], uint64_t, curpic[i], uint64_t, cfgs[i]->p4_rawpic); cfgs[i]->p4_rawpic = *addrs[i] & MASK40; } } static void p4_pcbe_free(void *config) { kmem_free(config, sizeof (p4_pcbe_config_t)); } static struct modlpcbe modlpcbe = { &mod_pcbeops, "Pentium 4 Performance Counters v%I%", &p4_pcbe_ops }; static struct modlinkage modl = { MODREV_1, &modlpcbe, }; int _init(void) { if (p4_pcbe_init() != 0) return (ENOTSUP); return (mod_install(&modl)); } int _fini(void) { return (mod_remove(&modl)); } int _info(struct modinfo *mi) { return (mod_info(&modl, mi)); }