/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * AMD Athlon64/Opteron CPU Module Machine-Check Poller
 *
 * The AMD Opteron processor doesn't yet report correctable errors via #mc's.
 * Instead, it fixes the problem, silently updates the error state MSRs, and
 * resumes operation.  In order to discover occurrences of correctable errors,
 * we have to poll in the background using the omni cyclics mechanism.  The
 * error injector also has the ability to manually request an immediate poll.
 * Locking is fairly simple within the poller: the per-CPU mutex
 * ao->ao_mca.ao_mca_poll_lock ensures that only one poll request is active.
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/x86_archext.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/sdt.h>

#include "ao.h"

/*
 * Number of entries in each per-CPU poll trace buffer.  Tunable; the
 * buffers are only allocated when ao_mca_poll_trace_always is set at
 * ao_mca_poll_init() time (on by default in DEBUG kernels only).
 */
static uint_t ao_mca_poll_trace_nent = 100;
#ifdef DEBUG
static uint_t ao_mca_poll_trace_always = 1;
#else
static uint_t ao_mca_poll_trace_always = 0;
#endif

/*
 * Id of the omni cyclic registered by ao_mca_poll_start().  NOTE(review):
 * no cyclic_remove() of this id is visible in this file — presumably the
 * module is never unloaded once polling starts; confirm against the rest
 * of the module.
 */
static cyclic_id_t ao_mca_poll_cycid;

/*
 * Interval between background polls: 10 seconds.  Setting this to zero
 * before ao_mca_poll_start() runs disables background polling entirely.
 */
static hrtime_t ao_mca_poll_interval = NANOSEC * 10ULL;

/*
 * Record one event in the per-CPU poll trace ring buffer (if it was
 * allocated) and fire the ao__poll__trace SDT probe.  'what' identifies
 * the kind of event (AO_MPT_WHAT_*); 'nerr' is the error count, which is
 * only recorded for AO_MPT_WHAT_CYC_ERR events, clamped to UINT8_MAX.
 * Caller must hold ao_mca_poll_lock, which serializes buffer updates.
 */
static void
ao_mca_poll_trace(ao_mca_t *mca, uint32_t what, uint32_t nerr)
{
	uint_t next;
	ao_mca_poll_trace_t *pt;

	ASSERT(MUTEX_HELD(&mca->ao_mca_poll_lock));
	DTRACE_PROBE2(ao__poll__trace, uint32_t, what, uint32_t, nerr);

	if (mca->ao_mca_poll_trace == NULL)
		return;	/* poll trace buffer is disabled */

	next = (mca->ao_mca_poll_curtrace + 1) % ao_mca_poll_trace_nent;
	pt = &mca->ao_mca_poll_trace[next];

	/*
	 * Zero the timestamp first and set it last so that — presumably —
	 * an observer sees mpt_when == 0 while the entry is mid-update;
	 * the real timestamp is only stored once the payload is complete.
	 */
	pt->mpt_when = 0;
	pt->mpt_what = what;

	if (what == AO_MPT_WHAT_CYC_ERR)
		pt->mpt_nerr = MIN(nerr, UINT8_MAX);

	pt->mpt_when = gethrtime();
	mca->ao_mca_poll_curtrace = next;
}

/*
 * Common poll body shared by the cyclic handler and the injector poke
 * entry points.  'what' distinguishes the caller (AO_MPT_WHAT_CYC_ERR
 * for cyclic-driven polls, AO_MPT_WHAT_POKE_ERR for pokes) and is logged
 * in the poll trace.  Caller must hold ao_mca_poll_lock.
 */
static void
ao_mca_poll_common(ao_mca_t *mca, int what)
{
	ao_cpu_logout_t *acl = &mca->ao_mca_logout[AO_MCA_LOGOUT_POLLER];
	int i, n, fatal;

	if (mca->ao_mca_flags & AO_MCA_F_UNFAULTING) {
		mca->ao_mca_flags &= ~AO_MCA_F_UNFAULTING;
		ao_mca_poll_trace(mca, AO_MPT_WHAT_UNFAULTING, 0);

		/*
		 * On the first poll after re-enabling a faulty CPU we clear
		 * the status registers; see ao_faulted_exit() for more info.
		 * Only a cyclic-driven poll clears and returns here; a poke
		 * falls through and logs out the banks as usual.
		 */
		if (what == AO_MPT_WHAT_CYC_ERR) {
			for (i = 0; i < AMD_MCA_BANK_COUNT; i++)
				wrmsr(ao_bank_regs[i].abr_status, 0);
			return;
		}
	}

	/*
	 * Capture the current MCA bank state into the poller's logout area;
	 * ao_mca_logout() returns non-zero if an uncorrectable error was
	 * found, in which case we panic if so configured.
	 */
	fatal = ao_mca_logout(acl, NULL, &n);
	ao_mca_poll_trace(mca, what, n);

	if (fatal && cmi_panic_on_uncorrectable_error)
		fm_panic("Unrecoverable Machine-Check Error (polled)");
}

/*
 * Omni cyclic handler: runs periodically on each CPU.  mutex_tryenter()
 * (rather than mutex_enter) is used so the low-level cyclic simply skips
 * a poll that collides with an in-progress poke instead of blocking.
 * 'arg' may be NULL for CPUs this module does not support (see
 * ao_mca_poll_online), in which case the poll is skipped.
 */
static void
ao_mca_poll_cyclic(void *arg)
{
	ao_data_t *ao = arg;

	if (ao != NULL && mutex_tryenter(&ao->ao_mca.ao_mca_poll_lock)) {
		ao_mca_poll_common(&ao->ao_mca, AO_MPT_WHAT_CYC_ERR);
		mutex_exit(&ao->ao_mca.ao_mca_poll_lock);
	}
}

/*
 * Perform an immediate poll on behalf of the error injector.  Unlike the
 * cyclic handler this blocks on the poll lock, since the caller expects
 * the poll to actually happen.
 */
void
ao_mca_poke(void *arg)
{
	ao_data_t *ao = arg;

	mutex_enter(&ao->ao_mca.ao_mca_poll_lock);
	ao_mca_poll_common(&ao->ao_mca, AO_MPT_WHAT_POKE_ERR);
	mutex_exit(&ao->ao_mca.ao_mca_poll_lock);
}

/*
 * Omni cyclic online callback: configure the per-CPU cyclic handler and
 * firing time for a CPU coming online.
 */
/*ARGSUSED*/
static void
ao_mca_poll_online(void *arg, cpu_t *cpu, cyc_handler_t *cyh, cyc_time_t *cyt)
{
	cyt->cyt_when = 0;
	cyh->cyh_level = CY_LOW_LEVEL;

	/*
	 * If the CPU coming on-line isn't supported by this CPU module, then
	 * disable the cyclic by cranking cyt_interval and setting arg to NULL.
	 */
	if (cpu->cpu_m.mcpu_cmi != NULL &&
	    cpu->cpu_m.mcpu_cmi->cmi_ops != &_cmi_ops) {
		cyt->cyt_interval = INT64_MAX;
		cyh->cyh_func = ao_mca_poll_cyclic;
		cyh->cyh_arg = NULL;
	} else {
		cyt->cyt_interval = ao_mca_poll_interval;
		cyh->cyh_func = ao_mca_poll_cyclic;
		cyh->cyh_arg = cpu->cpu_m.mcpu_cmidata;
	}
}

/*
 * Omni cyclic offline callback: no per-CPU state to tear down.
 */
/*ARGSUSED*/
static void
ao_mca_poll_offline(void *arg, cpu_t *cpu, void *cyh_arg)
{
	/* nothing to do here */
}

/*
 * Per-CPU poller initialization: set up the poll lock and, when tracing
 * is enabled (DEBUG kernels by default), allocate the poll trace ring
 * buffer (KM_SLEEP — may block; must not be called from interrupt
 * context).
 */
void
ao_mca_poll_init(ao_mca_t *mca)
{
	mutex_init(&mca->ao_mca_poll_lock, NULL, MUTEX_DRIVER, NULL);

	if (ao_mca_poll_trace_always) {
		mca->ao_mca_poll_trace =
		    kmem_zalloc(sizeof (ao_mca_poll_trace_t) *
		    ao_mca_poll_trace_nent, KM_SLEEP);
		mca->ao_mca_poll_curtrace = 0;
	}
}

/*
 * System-wide poller start: register the omni cyclic that drives
 * background polling on every CPU.  cpu_lock is required around
 * cyclic_add_omni() per the cyclic subsystem's locking rules.
 */
void
ao_mca_poll_start(void)
{
	cyc_omni_handler_t cyo;

	if (ao_mca_poll_interval == 0)
		return;	/* if manually tuned to zero, disable polling */

	cyo.cyo_online = ao_mca_poll_online;
	cyo.cyo_offline = ao_mca_poll_offline;
	cyo.cyo_arg = NULL;

	mutex_enter(&cpu_lock);
	ao_mca_poll_cycid = cyclic_add_omni(&cyo);
	mutex_exit(&cpu_lock);
}