1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #include <cma.h>
27
28 #include <sys/fm/ldom.h>
29 #include <sys/fm/protocol.h>
30 #include <fm/fmd_fmri.h>
31 #include <fm/libtopo.h>
32
33 #include <assert.h>
34 #include <fcntl.h>
35 #include <unistd.h>
36 #include <errno.h>
37 #include <strings.h>
38
39 #include <sys/types.h>
40 #include <sys/processor.h>
41
42 extern ldom_hdl_t *cma_lhp;
43
44 /*ARGSUSED*/
45 int
cpu_blacklist_cmd(fmd_hdl_t * hdl,nvlist_t * fmri,boolean_t repair)46 cpu_blacklist_cmd(fmd_hdl_t *hdl, nvlist_t *fmri, boolean_t repair)
47 {
48 if (repair)
49 return (ldom_fmri_unblacklist(cma_lhp, fmri));
50 else
51 return (ldom_fmri_blacklist(cma_lhp, fmri));
52 }
53
54 int
cma_cpu_blacklist(fmd_hdl_t * hdl,nvlist_t * nvl,nvlist_t * asru,boolean_t repair)55 cma_cpu_blacklist(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
56 boolean_t repair)
57 {
58 nvlist_t *fmri;
59 int rc, err;
60
61 /*
62 * Some platforms have special unums for the E$ DIMMs. If we're dealing
63 * with a platform that has these unums, one will have been added to the
64 * fault as the resource. We'll use that for the blacklisting. If we
65 * can't find a resource, we'll fall back to the ASRU.
66 */
67 if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &fmri) != 0)
68 fmri = asru;
69
70 rc = cpu_blacklist_cmd(hdl, fmri, repair);
71 err = errno;
72
73 if (rc < 0 && err != ENOTSUP) {
74 errno = err;
75 return (-1);
76 }
77
78 return (0);
79 }
80
81 /*ARGSUSED*/
82 static int
cpu_cmd(fmd_hdl_t * hdl,nvlist_t * fmri,int cmd)83 cpu_cmd(fmd_hdl_t *hdl, nvlist_t *fmri, int cmd)
84 {
85 int rc = 0;
86 char *scheme;
87
88 /*
89 * We're using topo retire if the fmri is in "hc" scheme.
90 */
91 if (nvlist_lookup_string(fmri, FM_FMRI_SCHEME, &scheme) == 0 &&
92 strcmp(scheme, FM_FMRI_SCHEME_HC) == 0) {
93 if (cmd != P_STATUS) {
94 errno = EINVAL;
95 return (-1);
96 }
97 rc = fmd_nvl_fmri_service_state(hdl, fmri);
98 switch (rc) {
99 case FMD_SERVICE_STATE_UNUSABLE:
100 return (P_FAULTED);
101 case -1:
102 return (-1);
103 default:
104 return (P_ONLINE);
105 }
106 }
107
108 switch (cmd & ~P_FORCED) {
109 case P_STATUS:
110 rc = ldom_fmri_status(cma_lhp, fmri);
111 break;
112 case P_FAULTED:
113 rc = ldom_fmri_retire(cma_lhp, fmri);
114 break;
115 case P_ONLINE:
116 rc = ldom_fmri_unretire(cma_lhp, fmri);
117 break;
118 default:
119 errno = EINVAL;
120 return (-1);
121 }
122
123 if (rc != P_OFFLINE && rc != P_ONLINE && rc != P_FAULTED) {
124 errno = rc;
125 return (-1);
126 }
127
128 return (rc);
129 }
130
131 void
cma_cpu_start_retry(fmd_hdl_t * hdl,nvlist_t * fmri,const char * uuid,boolean_t repair)132 cma_cpu_start_retry(fmd_hdl_t *hdl, nvlist_t *fmri, const char *uuid,
133 boolean_t repair)
134 {
135 cma_cpu_t *cpu;
136 char *scheme;
137 uint_t cpuid;
138 nvlist_t *asru = NULL;
139 topo_hdl_t *thp;
140 int err;
141
142 if (repair || nvlist_lookup_string(fmri, FM_FMRI_SCHEME, &scheme) != 0)
143 return;
144 if (strcmp(scheme, FM_FMRI_SCHEME_CPU) == 0) {
145 if (nvlist_lookup_uint32(fmri, FM_FMRI_CPU_ID, &cpuid) != 0)
146 return;
147 } else if (strcmp(scheme, FM_FMRI_SCHEME_HC) != 0) {
148 return;
149 } else {
150 /* lookup cpuid from ASRU */
151 thp = fmd_fmri_topo_hold(TOPO_VERSION);
152 if (thp != NULL) {
153 (void) topo_fmri_asru(thp, fmri, &asru, &err);
154 fmd_fmri_topo_rele(thp);
155 }
156 if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
157 nvlist_free(asru);
158 return;
159 }
160 }
161
162 /*
163 * check to see if the cpu has been offline.
164 */
165 fmd_hdl_debug(hdl, "cpu %u is not offline yet - sleeping\n", cpuid);
166
167 /*
168 * Create a cpu node and add to the head of the cpu list
169 */
170 cpu = fmd_hdl_zalloc(hdl, sizeof (cma_cpu_t), FMD_SLEEP);
171 (void) nvlist_dup(fmri, &cpu->cpu_fmri, 0);
172 if (uuid != NULL)
173 cpu->cpu_uuid = fmd_hdl_strdup(hdl, uuid, FMD_SLEEP);
174
175 cpu->cpuid = cpuid;
176 cpu->cpu_next = cma.cma_cpus;
177 cma.cma_cpus = cpu;
178
179 if (cma.cma_cpu_timerid != 0)
180 fmd_timer_remove(hdl, cma.cma_cpu_timerid);
181
182 cma.cma_cpu_curdelay = cma.cma_cpu_mindelay;
183
184 cma.cma_cpu_timerid =
185 fmd_timer_install(hdl, NULL, NULL, cma.cma_cpu_curdelay);
186 }
187
188
189 int
cma_cpu_statechange(fmd_hdl_t * hdl,nvlist_t * asru,const char * uuid,int cpustate,boolean_t repair)190 cma_cpu_statechange(fmd_hdl_t *hdl, nvlist_t *asru, const char *uuid,
191 int cpustate, boolean_t repair)
192 {
193 int i;
194 uint_t cpuid;
195
196 if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
197 fmd_hdl_debug(hdl, "missing '%s'\n", FM_FMRI_CPU_ID);
198 cma_stats.bad_flts.fmds_value.ui64++;
199 return (CMA_RA_FAILURE);
200 }
201
202 /*
203 * cpu offlining using ldom_fmri_retire() may be asynchronous, so we
204 * have to set the timer and check the cpu status later.
205 */
206 for (i = 0; i < cma.cma_cpu_tries;
207 i++, (void) nanosleep(&cma.cma_cpu_delay, NULL)) {
208 if (cpu_cmd(hdl, asru, cpustate) != -1) {
209 if (repair)
210 cma_stats.cpu_repairs.fmds_value.ui64++;
211 else
212 cma_stats.cpu_flts.fmds_value.ui64++;
213 break;
214 }
215 }
216
217 if (i >= cma.cma_cpu_tries) {
218 cma_stats.cpu_fails.fmds_value.ui64++;
219 }
220
221 cma_cpu_start_retry(hdl, asru, uuid, repair);
222
223 return (CMA_RA_FAILURE);
224 }
225
226 static int
cpu_retry(fmd_hdl_t * hdl,cma_cpu_t * cpu)227 cpu_retry(fmd_hdl_t *hdl, cma_cpu_t *cpu)
228 {
229 int rc = 0;
230
231 fmd_hdl_debug(hdl, "cpu_retry()\n");
232
233 if (cpu->cpu_fmri == NULL) {
234 return (1);
235 }
236
237 if (!fmd_nvl_fmri_present(hdl, cpu->cpu_fmri)) {
238 fmd_hdl_debug(hdl, "cpu %u is not present", cpu->cpuid);
239 return (1);
240 }
241
242 rc = cpu_cmd(hdl, cpu->cpu_fmri, P_STATUS);
243 if (rc == P_FAULTED || rc == P_OFFLINE) {
244 fmd_hdl_debug(hdl, "cpu %u is offlined on retry %u\n",
245 cpu->cpuid, cpu->cpu_nretries);
246 cma_stats.cpu_flts.fmds_value.ui64++;
247
248 if (cpu->cpu_uuid != NULL)
249 fmd_case_uuclose(hdl, cpu->cpu_uuid);
250 return (1); /* success */
251 }
252
253 if (rc == -1) {
254 fmd_hdl_debug(hdl, "failed to retry cpu %u\n", cpu->cpuid);
255 cma_stats.page_fails.fmds_value.ui64++;
256 return (1); /* give up */
257 }
258
259 return (0);
260 }
261
262 static void
cma_cpu_free(fmd_hdl_t * hdl,cma_cpu_t * cpu)263 cma_cpu_free(fmd_hdl_t *hdl, cma_cpu_t *cpu)
264 {
265 if (cpu->cpu_fmri != NULL)
266 nvlist_free(cpu->cpu_fmri);
267 if (cpu->cpu_uuid != NULL)
268 fmd_hdl_strfree(hdl, cpu->cpu_uuid);
269 fmd_hdl_free(hdl, cpu, sizeof (cma_cpu_t));
270 }
271
272 void
cma_cpu_retry(fmd_hdl_t * hdl)273 cma_cpu_retry(fmd_hdl_t *hdl)
274 {
275 cma_cpu_t **cpup;
276
277 fmd_hdl_debug(hdl, "cma_cpu_retry: timer fired\n");
278
279 cma.cma_cpu_timerid = 0;
280
281 cpup = &cma.cma_cpus;
282 while (*cpup != NULL) {
283 cma_cpu_t *cpu = *cpup;
284
285 if (cpu_retry(hdl, cpu)) {
286 /*
287 * Successful retry or we're giving up - remove from
288 * the list
289 */
290 *cpup = cpu->cpu_next;
291
292 cma_cpu_free(hdl, cpu);
293 } else {
294 cpu->cpu_nretries++;
295 cpup = &cpu->cpu_next;
296 }
297 }
298
299 if (cma.cma_cpus == NULL)
300 return; /* no more cpus */
301
302 /*
303 * We still have cpus to check. Back the delay
304 * off, and schedule a retry.
305 */
306 cma.cma_cpu_curdelay = MIN(cma.cma_cpu_curdelay * 2,
307 cma.cma_cpu_maxdelay);
308
309 fmd_hdl_debug(hdl, "scheduled cpu offline retry for %llu secs\n",
310 (u_longlong_t)(cma.cma_cpu_curdelay / NANOSEC));
311
312 cma.cma_cpu_timerid =
313 fmd_timer_install(hdl, NULL, NULL, cma.cma_cpu_curdelay);
314 }
315
316 void
cma_cpu_fini(fmd_hdl_t * hdl)317 cma_cpu_fini(fmd_hdl_t *hdl)
318 {
319 cma_cpu_t *cpu;
320
321 while ((cpu = cma.cma_cpus) != NULL) {
322 cma.cma_cpus = cpu->cpu_next;
323 cma_cpu_free(hdl, cpu);
324 }
325 }
326