/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * AMD Athlon64/Opteron CPU Module Machine-Check Poller
 *
 * The AMD Opteron processor doesn't yet report correctable errors via #mc's.
 * Instead, it fixes the problem, silently updates the error state MSRs, and
 * resumes operation. In order to discover occurrences of correctable errors,
 * we have to poll in the background using the omni cyclics mechanism. The
 * error injector also has the ability to manually request an immediate poll.
 * Locking is fairly simple within the poller: the per-CPU mutex
 * ao->ao_mca.ao_mca_poll_lock ensures that only one poll request is active.
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/x86_archext.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/sdt.h>

#include "ao.h"

static uint_t ao_mca_poll_trace_nent = 100;
#ifdef DEBUG
static uint_t ao_mca_poll_trace_always = 1;
#else
static uint_t ao_mca_poll_trace_always = 0;
#endif

static cyclic_id_t ao_mca_poll_cycid;
static hrtime_t ao_mca_poll_interval = NANOSEC * 10ULL;

/*
 * Record a poll event in the per-CPU poll trace buffer, if one is allocated.
 */
static void
ao_mca_poll_trace(ao_mca_t *mca, uint32_t what, uint32_t nerr)
{
	uint_t next;
	ao_mca_poll_trace_t *pt;

	ASSERT(MUTEX_HELD(&mca->ao_mca_poll_lock));
	DTRACE_PROBE2(ao__poll__trace, uint32_t, what, uint32_t, nerr);

	if (mca->ao_mca_poll_trace == NULL)
		return; /* poll trace buffer is disabled */

	next = (mca->ao_mca_poll_curtrace + 1) % ao_mca_poll_trace_nent;
	pt = &mca->ao_mca_poll_trace[next];

	pt->mpt_when = 0;
	pt->mpt_what = what;

	if (what == AO_MPT_WHAT_CYC_ERR)
		pt->mpt_nerr = MIN(nerr, UINT8_MAX);

	pt->mpt_when = gethrtime();
	mca->ao_mca_poll_curtrace = next;
}

/*
 * Perform a single poll: log out the current error state and, if a fatal
 * error was found and we are configured to panic on uncorrectable errors,
 * do so.
 */
static void
ao_mca_poll_common(ao_mca_t *mca, int what)
{
	ao_cpu_logout_t *acl = &mca->ao_mca_logout[AO_MCA_LOGOUT_POLLER];
	int i, n, fatal;

	if (mca->ao_mca_flags & AO_MCA_F_UNFAULTING) {
		mca->ao_mca_flags &= ~AO_MCA_F_UNFAULTING;
		ao_mca_poll_trace(mca, AO_MPT_WHAT_UNFAULTING, 0);

		/*
		 * On the first poll after re-enabling a faulty CPU we clear
		 * the status registers; see ao_faulted_exit() for more info.
		 */
		if (what == AO_MPT_WHAT_CYC_ERR) {
			for (i = 0; i < AMD_MCA_BANK_COUNT; i++)
				wrmsr(ao_bank_regs[i].abr_status, 0);
			return;
		}
	}

	fatal = ao_mca_logout(acl, NULL, &n);
	ao_mca_poll_trace(mca, what, n);

	if (fatal && cmi_panic_on_uncorrectable_error)
		fm_panic("Unrecoverable Machine-Check Exception");
}

/*
 * Omni cyclic handler: poll this CPU unless another poll is already active.
 */
static void
ao_mca_poll_cyclic(void *arg)
{
	ao_data_t *ao = arg;

	if (ao != NULL && mutex_tryenter(&ao->ao_mca.ao_mca_poll_lock)) {
		ao_mca_poll_common(&ao->ao_mca, AO_MPT_WHAT_CYC_ERR);
		mutex_exit(&ao->ao_mca.ao_mca_poll_lock);
	}
}

/*
 * Entry point used by the error injector to request an immediate poll.
 */
void
ao_mca_poke(void *arg)
{
	ao_data_t *ao = arg;

	mutex_enter(&ao->ao_mca.ao_mca_poll_lock);
	ao_mca_poll_common(&ao->ao_mca, AO_MPT_WHAT_POKE_ERR);
	mutex_exit(&ao->ao_mca.ao_mca_poll_lock);
}

/*ARGSUSED*/
static void
ao_mca_poll_online(void *arg, cpu_t *cpu, cyc_handler_t *cyh, cyc_time_t *cyt)
{
	cyt->cyt_when = 0;
	cyh->cyh_level = CY_LOW_LEVEL;

	/*
	 * If the CPU coming on-line isn't supported by this CPU module, then
	 * disable the cyclic by cranking cyt_interval and setting arg to NULL.
	 */
	if (cpu->cpu_m.mcpu_cmi != NULL &&
	    cpu->cpu_m.mcpu_cmi->cmi_ops != &_cmi_ops) {
		cyt->cyt_interval = INT64_MAX;
		cyh->cyh_func = ao_mca_poll_cyclic;
		cyh->cyh_arg = NULL;
	} else {
		cyt->cyt_interval = ao_mca_poll_interval;
		cyh->cyh_func = ao_mca_poll_cyclic;
		cyh->cyh_arg = cpu->cpu_m.mcpu_cmidata;
	}
}

/*ARGSUSED*/
static void
ao_mca_poll_offline(void *arg, cpu_t *cpu, void *cyh_arg)
{
	/* nothing to do here */
}

/*
 * Initialize the per-CPU poller state: the poll lock and, if tracing is
 * enabled, the poll trace buffer.
 */
void
ao_mca_poll_init(ao_mca_t *mca)
{
	mutex_init(&mca->ao_mca_poll_lock, NULL, MUTEX_DRIVER, NULL);

	if (ao_mca_poll_trace_always) {
		mca->ao_mca_poll_trace =
		    kmem_zalloc(sizeof (ao_mca_poll_trace_t) *
		    ao_mca_poll_trace_nent, KM_SLEEP);
		mca->ao_mca_poll_curtrace = 0;
	}
}

/*
 * Install the omni cyclic that polls each CPU for correctable errors.
 */
void
ao_mca_poll_start(void)
{
	cyc_omni_handler_t cyo;

	if (ao_mca_poll_interval == 0)
		return; /* if manually tuned to zero, disable polling */

	cyo.cyo_online = ao_mca_poll_online;
	cyo.cyo_offline = ao_mca_poll_offline;
	cyo.cyo_arg = NULL;

	mutex_enter(&cpu_lock);
	ao_mca_poll_cycid = cyclic_add_omni(&cyo);
	mutex_exit(&cpu_lock);
}