1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #include <cma.h>
27
28 #include <fcntl.h>
29 #include <unistd.h>
30 #include <strings.h>
31 #include <errno.h>
32 #include <time.h>
33 #include <fm/fmd_api.h>
34 #include <fm/fmd_agent.h>
35 #include <sys/fm/protocol.h>
36 #include <sys/bl.h>
37 #include <sys/processor.h>
38
39 static int cpu_statechange(fmd_hdl_t *, nvlist_t *, nvlist_t *, const char *,
40 uint32_t, boolean_t);
41
42 #ifndef opl
/*
 * Perform retire/unretire by invoking the topo methods registered in the
 * hc-scheme resource.
 *
 * If the fault was diagnosed under the old topology, its resource will not
 * exist in the current topology; in that case we fall back to the legacy
 * retire path (using the "cpu" scheme ASRU).
 */
51
52 static boolean_t
old_topo_fault(nvlist_t * nvl)53 old_topo_fault(nvlist_t *nvl)
54 {
55 nvlist_t *rsrc;
56 #ifdef i386
57 nvlist_t **hcl;
58 uint_t nhcl = 0;
59 char *name;
60 #endif
61
62 if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) != 0)
63 return (B_TRUE);
64 #ifdef i386
65 /*
66 * x86 has moved from "motherboard/chip/cpu" topo to
67 * "motherboard/chip/core/strand"
68 */
69 if (nvlist_lookup_nvlist_array(rsrc, FM_FMRI_HC_LIST, &hcl, &nhcl)
70 == 0 && nhcl == 3 &&
71 nvlist_lookup_string(hcl[0], FM_FMRI_HC_NAME, &name) == 0 &&
72 strcmp(name, "motherboard") == 0 &&
73 nvlist_lookup_string(hcl[1], FM_FMRI_HC_NAME, &name) == 0 &&
74 strcmp(name, "chip") == 0 &&
75 nvlist_lookup_string(hcl[2], FM_FMRI_HC_NAME, &name) == 0 &&
76 strcmp(name, "cpu") == 0)
77 return (B_TRUE);
78 #endif
79
80 return (B_FALSE);
81 }
82
83 /* ARGSUSED */
84 int
cma_cpu_hc_retire(fmd_hdl_t * hdl,nvlist_t * nvl,nvlist_t * asru,const char * uuid,boolean_t repair)85 cma_cpu_hc_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
86 const char *uuid, boolean_t repair)
87 {
88 int i, err;
89 int rc = CMA_RA_SUCCESS;
90 nvlist_t *rsrc;
91
92 /*
93 * For the cached faults which were diagnosed under the old
94 * topology, we fall back to retire by using cpu-scheme ASRUs.
95 * Under xVM Dom0, since logic cpuid in "cpu" scheme ASRU makes no
96 * sense, the fault should be ignored.
97 */
98 if (old_topo_fault(nvl)) {
99 #ifdef i386
100 if (! cma_is_native)
101 return (CMA_RA_FAILURE);
102 #endif
103 return (cma_cpu_cpu_retire(hdl, nvl, asru, uuid, repair));
104 }
105
106 /*
107 * Lookup the resource and call its topo methods to do retire/unretire
108 */
109 if ((! repair && ! cma.cma_cpu_dooffline) ||
110 (repair && ! cma.cma_cpu_doonline)) {
111 fmd_hdl_debug(hdl, "suppressed %s of CPU\n",
112 repair ? "unretire" : "retire");
113 cma_stats.cpu_supp.fmds_value.ui64++;
114 } else {
115 err = FMD_AGENT_RETIRE_FAIL;
116 if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0) {
117 if (repair) {
118 err = fmd_nvl_fmri_unretire(hdl, rsrc);
119 } else {
120 for (i = 0; i < cma.cma_cpu_tries; i++) {
121 err = fmd_nvl_fmri_retire(hdl, rsrc);
122 if (err == FMD_AGENT_RETIRE_DONE)
123 break;
124 (void) nanosleep(&cma.cma_cpu_delay,
125 NULL);
126 }
127 }
128 }
129 if (err == FMD_AGENT_RETIRE_DONE) {
130 if (repair)
131 cma_stats.cpu_repairs.fmds_value.ui64++;
132 else
133 cma_stats.cpu_flts.fmds_value.ui64++;
134 } else {
135 rc = CMA_RA_FAILURE;
136 cma_stats.bad_flts.fmds_value.ui64++;
137 #ifdef sun4v
138 /* libldom requests are processed asynchronously */
139 cma_cpu_start_retry(hdl, nvl, uuid, repair);
140 #endif
141 }
142 }
143
144 if ((! repair && ! cma.cma_cpu_doblacklist) ||
145 (repair && ! cma.cma_cpu_dounblacklist)) {
146 fmd_hdl_debug(hdl, "suppressed %s of CPU\n",
147 repair ? "unblacklist" : "blacklist");
148 cma_stats.cpu_blsupp.fmds_value.ui64++;
149 } else {
150 if (cma_cpu_blacklist(hdl, nvl, asru, repair) < 0)
151 cma_stats.cpu_blfails.fmds_value.ui64++;
152 }
153
154 return (rc);
155 }
156
157 #else /* opl */
158
159 /* ARGSUSED 4 */
160 int
cma_cpu_hc_retire(fmd_hdl_t * hdl,nvlist_t * nvl,nvlist_t * asru,const char * uuid,boolean_t repair)161 cma_cpu_hc_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
162 const char *uuid, boolean_t repair)
163 {
164 uint_t cpuid;
165 uint_t i, nprs;
166 nvlist_t **hc_prs = NULL, *hc_spec_nvl;
167
168 /* OPL has ASRU in "hc" scheme */
169 if (nvlist_lookup_nvlist(asru, FM_FMRI_HC_SPECIFIC,
170 &hc_spec_nvl) != 0) {
171 cma_stats.bad_flts.fmds_value.ui64++;
172 fmd_hdl_debug(hdl,
173 "cma_cpu_hc_retire lookup hc_spec_nvl failed\n");
174 return (CMA_RA_FAILURE);
175 }
176
177 if (nvlist_lookup_nvlist_array(hc_spec_nvl, FM_FMRI_HC_CPUIDS,
178 &hc_prs, &nprs) != 0) {
179 cma_stats.bad_flts.fmds_value.ui64++;
180 fmd_hdl_debug(hdl,
181 "cma_cpu_hc_retire lookup cpuid array failed\n");
182 return (CMA_RA_FAILURE);
183 }
184
185 for (i = 0; i < nprs; i++) {
186 if (nvlist_lookup_uint32(hc_prs[i],
187 FM_FMRI_CPU_ID, &cpuid) != 0) {
188 cma_stats.bad_flts.fmds_value.ui64++;
189 return (CMA_RA_FAILURE);
190 }
191
192 if (cpu_statechange(hdl, nvl, hc_prs[i], uuid, cpuid, repair)
193 != CMA_RA_SUCCESS) {
194 cma_stats.bad_flts.fmds_value.ui64++;
195 return (CMA_RA_FAILURE);
196 }
197 }
198
199 return (CMA_RA_SUCCESS);
200 }
201 #endif /* opl */
202
/*
 * The rest of this file performs retires using ASRUs.  This is no longer
 * the preferred approach, but it is still needed in circumstances where
 * retire via topo methods cannot work, i.e.:
 *
 * 1) Legacy platforms which don't have full topology.
 * 2) Resources in FMD-cached faults that are unset or absent from the
 *    up-to-date topology.
 */
212
213 /* ARGSUSED */
214 static int
cpu_online(fmd_hdl_t * hdl,nvlist_t * nvl,nvlist_t * asru,const char * uuid,uint32_t cpuid)215 cpu_online(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
216 uint32_t cpuid)
217 {
218 int err = CMA_RA_SUCCESS;
219
220 if (cma.cma_cpu_doonline) {
221 err = cma_cpu_statechange(hdl, asru, uuid, P_ONLINE,
222 B_TRUE);
223 } else {
224 fmd_hdl_debug(hdl, "suppressed online of CPU %u\n",
225 cpuid);
226 cma_stats.cpu_supp.fmds_value.ui64++;
227 }
228
229 /* OPL performs the blacklist in the service processor */
230 #ifndef opl
231 if (cma.cma_cpu_dounblacklist) {
232 if (cma_cpu_blacklist(hdl, nvl, asru, B_TRUE) < 0)
233 cma_stats.cpu_blfails.fmds_value.ui64++;
234 } else {
235 fmd_hdl_debug(hdl, "suppressed unblacklist of CPU %u\n", cpuid);
236 cma_stats.cpu_blsupp.fmds_value.ui64++;
237 }
238 #endif /* opl */
239
240 return (err);
241 }
242
243 /* ARGSUSED */
244 static int
cpu_offline(fmd_hdl_t * hdl,nvlist_t * nvl,nvlist_t * asru,const char * uuid,uint32_t cpuid)245 cpu_offline(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
246 uint32_t cpuid)
247 {
248 int err = CMA_RA_FAILURE;
249
250 if (cma.cma_cpu_dooffline) {
251 int cpustate = P_FAULTED;
252
253 if (cma.cma_cpu_forcedoffline)
254 cpustate |= P_FORCED;
255 err = cma_cpu_statechange(hdl, asru, uuid, cpustate,
256 B_FALSE);
257 } else {
258 fmd_hdl_debug(hdl, "suppressed offline of CPU %u\n",
259 cpuid);
260 cma_stats.cpu_supp.fmds_value.ui64++;
261 }
262
263 /* OPL performs the blacklist in the service processor */
264 #ifndef opl
265 if (cma.cma_cpu_doblacklist) {
266 if (cma_cpu_blacklist(hdl, nvl, asru, B_FALSE) < 0)
267 cma_stats.cpu_blfails.fmds_value.ui64++;
268 } else {
269 fmd_hdl_debug(hdl, "suppressed blacklist of CPU %u\n",
270 cpuid);
271 cma_stats.cpu_blsupp.fmds_value.ui64++;
272 }
273 #endif /* opl */
274
275 return (err);
276 }
277
278 static int
cpu_statechange(fmd_hdl_t * hdl,nvlist_t * nvl,nvlist_t * asru,const char * uuid,uint32_t cpuid,boolean_t repair)279 cpu_statechange(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
280 uint32_t cpuid, boolean_t repair)
281 {
282 if (repair)
283 return (cpu_online(hdl, nvl, asru, uuid, cpuid));
284 else
285 return (cpu_offline(hdl, nvl, asru, uuid, cpuid));
286 }
287
288 const char *
p_online_state_fmt(int state)289 p_online_state_fmt(int state)
290 {
291 state &= ~P_FORCED;
292 switch (state) {
293 case P_OFFLINE:
294 return (PS_OFFLINE);
295 case P_ONLINE:
296 return (PS_ONLINE);
297 case P_FAULTED:
298 return (PS_FAULTED);
299 case P_POWEROFF:
300 return (PS_POWEROFF);
301 case P_NOINTR:
302 return (PS_NOINTR);
303 case P_SPARE:
304 return (PS_SPARE);
305 default:
306 return ("unknown");
307 }
308 }
309
310 int
cma_cpu_cpu_retire(fmd_hdl_t * hdl,nvlist_t * nvl,nvlist_t * asru,const char * uuid,boolean_t repair)311 cma_cpu_cpu_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
312 const char *uuid, boolean_t repair)
313 {
314 uint_t cpuid;
315
316 if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
317 fmd_hdl_debug(hdl, "cpu fault missing '%s'\n", FM_FMRI_CPU_ID);
318 cma_stats.bad_flts.fmds_value.ui64++;
319 return (CMA_RA_FAILURE);
320 }
321
322 return (cpu_statechange(hdl, nvl, asru, uuid, cpuid, repair));
323 }
324