xref: /illumos-gate/usr/src/uts/i86pc/cpu/amd_opteron/ao_poll.c (revision a07094369b21309434206d9b3601d162693466fc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
25  * Use is subject to license terms.
26  */
27 
28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
29 
30 /*
31  * AMD Athlon64/Opteron CPU Module Machine-Check Poller
32  *
33  * The AMD Opteron processor doesn't yet report correctable errors via #mc's.
34  * Instead, it fixes the problem, silently updates the error state MSRs, and
 * resumes operation.  In order to discover occurrences of correctable errors,
36  * we have to poll in the background using the omni cyclics mechanism.  The
37  * error injector also has the ability to manually request an immediate poll.
38  * Locking is fairly simple within the poller: the per-CPU mutex
39  * ao->ao_mca.ao_mca_poll_lock ensures that only one poll request is active.
40  */
41 
42 #include <sys/types.h>
43 #include <sys/sysmacros.h>
44 #include <sys/x86_archext.h>
45 #include <sys/ddi.h>
46 #include <sys/sunddi.h>
47 #include <sys/ksynch.h>
48 #include <sys/sdt.h>
49 
50 #include "ao.h"
51 
/* Number of entries in each per-CPU poll trace ring (see ao_mca_poll_trace) */
static uint_t ao_mca_poll_trace_nent = 100;

/*
 * When non-zero, ao_mca_poll_init() allocates a poll trace ring for every
 * CPU up front; on by default only on DEBUG kernels.
 */
#ifdef DEBUG
static uint_t ao_mca_poll_trace_always = 1;
#else
static uint_t ao_mca_poll_trace_always = 0;
#endif

static cyclic_id_t ao_mca_poll_cycid;	/* id of our omni cyclic */
static hrtime_t ao_mca_poll_interval = NANOSEC * 10ULL; /* 10s; 0 disables */
61 
62 static void
63 ao_mca_poll_trace(ao_mca_t *mca, uint32_t what, uint32_t nerr)
64 {
65 	uint_t next;
66 	ao_mca_poll_trace_t *pt;
67 
68 	ASSERT(MUTEX_HELD(&mca->ao_mca_poll_lock));
69 	DTRACE_PROBE2(ao__poll__trace, uint32_t, what, uint32_t, nerr);
70 
71 	if (mca->ao_mca_poll_trace == NULL)
72 		return; /* poll trace buffer is disabled */
73 
74 	next = (mca->ao_mca_poll_curtrace + 1) % ao_mca_poll_trace_nent;
75 	pt = &mca->ao_mca_poll_trace[next];
76 
77 	pt->mpt_when = 0;
78 	pt->mpt_what = what;
79 
80 	if (what == AO_MPT_WHAT_CYC_ERR)
81 		pt->mpt_nerr = MIN(nerr, UINT8_MAX);
82 
83 	pt->mpt_when = gethrtime();
84 	mca->ao_mca_poll_curtrace = next;
85 }
86 
/*
 * Common poll implementation shared by the cyclic handler and the error
 * injector's poke path.  The caller must hold mca->ao_mca_poll_lock.
 * 'what' is the AO_MPT_WHAT_* code identifying who requested the poll;
 * it is recorded in the poll trace ring.
 */
static void
ao_mca_poll_common(ao_mca_t *mca, int what)
{
	ao_cpu_logout_t *acl = &mca->ao_mca_logout[AO_MCA_LOGOUT_POLLER];
	int i, n, fatal;

	if (mca->ao_mca_flags & AO_MCA_F_UNFAULTING) {
		mca->ao_mca_flags &= ~AO_MCA_F_UNFAULTING;
		ao_mca_poll_trace(mca, AO_MPT_WHAT_UNFAULTING, 0);

		/*
		 * On the first poll after re-enabling a faulty CPU we clear
		 * the status registers; see ao_faulted_exit() for more info.
		 */
		if (what == AO_MPT_WHAT_CYC_ERR) {
			/* Zero every bank's status MSR instead of logging out */
			for (i = 0; i < AMD_MCA_BANK_COUNT; i++)
				wrmsr(ao_bank_regs[i].abr_status, 0);
			return;
		}
	}

	/* Log out the current MCA bank state; n is the error count observed */
	fatal = ao_mca_logout(acl, NULL, &n);
	ao_mca_poll_trace(mca, what, n);

	if (fatal && cmi_panic_on_uncorrectable_error)
		fm_panic("Unrecoverable Machine-Check Exception");
}
114 
115 static void
116 ao_mca_poll_cyclic(void *arg)
117 {
118 	ao_data_t *ao = arg;
119 
120 	if (ao != NULL && mutex_tryenter(&ao->ao_mca.ao_mca_poll_lock)) {
121 		ao_mca_poll_common(&ao->ao_mca, AO_MPT_WHAT_CYC_ERR);
122 		mutex_exit(&ao->ao_mca.ao_mca_poll_lock);
123 	}
124 }
125 
126 void
127 ao_mca_poke(void *arg)
128 {
129 	ao_data_t *ao = arg;
130 
131 	mutex_enter(&ao->ao_mca.ao_mca_poll_lock);
132 	ao_mca_poll_common(&ao->ao_mca, AO_MPT_WHAT_POKE_ERR);
133 	mutex_exit(&ao->ao_mca.ao_mca_poll_lock);
134 }
135 
136 /*ARGSUSED*/
137 static void
138 ao_mca_poll_online(void *arg, cpu_t *cpu, cyc_handler_t *cyh, cyc_time_t *cyt)
139 {
140 	cyt->cyt_when = 0;
141 	cyh->cyh_level = CY_LOW_LEVEL;
142 
143 	/*
144 	 * If the CPU coming on-line isn't supported by this CPU module, then
145 	 * disable the cylic by cranking cyt_interval and setting arg to NULL.
146 	 */
147 	if (cpu->cpu_m.mcpu_cmi != NULL &&
148 	    cpu->cpu_m.mcpu_cmi->cmi_ops != &_cmi_ops) {
149 		cyt->cyt_interval = INT64_MAX;
150 		cyh->cyh_func = ao_mca_poll_cyclic;
151 		cyh->cyh_arg = NULL;
152 	} else {
153 		cyt->cyt_interval = ao_mca_poll_interval;
154 		cyh->cyh_func = ao_mca_poll_cyclic;
155 		cyh->cyh_arg = cpu->cpu_m.mcpu_cmidata;
156 	}
157 }
158 
159 /*ARGSUSED*/
160 static void
161 ao_mca_poll_offline(void *arg, cpu_t *cpu, void *cyh_arg)
162 {
163 	/* nothing to do here */
164 }
165 
166 void
167 ao_mca_poll_init(ao_mca_t *mca)
168 {
169 	mutex_init(&mca->ao_mca_poll_lock, NULL, MUTEX_DRIVER, NULL);
170 
171 	if (ao_mca_poll_trace_always) {
172 		mca->ao_mca_poll_trace =
173 		    kmem_zalloc(sizeof (ao_mca_poll_trace_t) *
174 		    ao_mca_poll_trace_nent, KM_SLEEP);
175 		mca->ao_mca_poll_curtrace = 0;
176 	}
177 }
178 
179 void
180 ao_mca_poll_start(void)
181 {
182 	cyc_omni_handler_t cyo;
183 
184 	if (ao_mca_poll_interval == 0)
185 		return; /* if manually tuned to zero, disable polling */
186 
187 	cyo.cyo_online = ao_mca_poll_online;
188 	cyo.cyo_offline = ao_mca_poll_offline;
189 	cyo.cyo_arg = NULL;
190 
191 	mutex_enter(&cpu_lock);
192 	ao_mca_poll_cycid = cyclic_add_omni(&cyo);
193 	mutex_exit(&cpu_lock);
194 }
195