1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 *
25 * Copyright 2019 Joyent, Inc.
26 */
27
28 #include <cma.h>
29
30 #include <fcntl.h>
31 #include <unistd.h>
32 #include <strings.h>
33 #include <errno.h>
34 #include <time.h>
35 #include <fm/fmd_api.h>
36 #include <fm/fmd_agent.h>
37 #include <sys/fm/protocol.h>
38 #include <sys/bl.h>
39 #include <sys/processor.h>
40
41 static int cpu_statechange(fmd_hdl_t *, nvlist_t *, nvlist_t *, const char *,
42 uint32_t, boolean_t);
43
44 #ifndef opl
/*
 * Perform retire/unretire by invoking the topo methods registered in the
 * hc-scheme resource.
 *
 * If the fault was diagnosed under the old topology, its resource will not
 * exist in the current topology, so we fall back to legacy retire (using
 * the "cpu" scheme ASRU).
 */
53
54 static boolean_t
old_topo_fault(nvlist_t * nvl)55 old_topo_fault(nvlist_t *nvl)
56 {
57 nvlist_t *rsrc;
58 #ifdef i386
59 nvlist_t **hcl;
60 uint_t nhcl = 0;
61 char *name;
62 #endif
63
64 if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) != 0)
65 return (B_TRUE);
66 #ifdef i386
67 /*
68 * x86 has moved from "motherboard/chip/cpu" topo to
69 * "motherboard/chip/core/strand"
70 */
71 if (nvlist_lookup_nvlist_array(rsrc, FM_FMRI_HC_LIST, &hcl, &nhcl)
72 == 0 && nhcl == 3 &&
73 nvlist_lookup_string(hcl[0], FM_FMRI_HC_NAME, &name) == 0 &&
74 strcmp(name, "motherboard") == 0 &&
75 nvlist_lookup_string(hcl[1], FM_FMRI_HC_NAME, &name) == 0 &&
76 strcmp(name, "chip") == 0 &&
77 nvlist_lookup_string(hcl[2], FM_FMRI_HC_NAME, &name) == 0 &&
78 strcmp(name, "cpu") == 0)
79 return (B_TRUE);
80 #endif
81
82 return (B_FALSE);
83 }
84
85 /* ARGSUSED */
86 int
cma_cpu_hc_retire(fmd_hdl_t * hdl,nvlist_t * nvl,nvlist_t * asru,const char * uuid,boolean_t repair)87 cma_cpu_hc_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
88 const char *uuid, boolean_t repair)
89 {
90 int i, err;
91 int rc = CMA_RA_SUCCESS;
92 nvlist_t *rsrc;
93
94 /*
95 * For the cached faults which were diagnosed under the old
96 * topology, we fall back to retire by using cpu-scheme ASRUs.
97 * Under xVM Dom0, since logic cpuid in "cpu" scheme ASRU makes no
98 * sense, the fault should be ignored.
99 */
100 if (old_topo_fault(nvl)) {
101 #ifdef i386
102 if (! cma_is_native)
103 return (CMA_RA_FAILURE);
104 #endif
105 return (cma_cpu_cpu_retire(hdl, nvl, asru, uuid, repair));
106 }
107
108 /*
109 * Lookup the resource and call its topo methods to do retire/unretire
110 */
111 if ((! repair && ! cma.cma_cpu_dooffline) ||
112 (repair && ! cma.cma_cpu_doonline)) {
113 fmd_hdl_debug(hdl, "suppressed %s of CPU\n",
114 repair ? "unretire" : "retire");
115 cma_stats.cpu_supp.fmds_value.ui64++;
116 } else {
117 err = FMD_AGENT_RETIRE_FAIL;
118 if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0) {
119 if (repair) {
120 err = fmd_nvl_fmri_unretire(hdl, rsrc);
121 } else {
122 for (i = 0; i < cma.cma_cpu_tries; i++) {
123 err = fmd_nvl_fmri_retire(hdl, rsrc);
124 if (err == FMD_AGENT_RETIRE_DONE)
125 break;
126 (void) nanosleep(&cma.cma_cpu_delay,
127 NULL);
128 }
129 }
130 }
131 if (err == FMD_AGENT_RETIRE_DONE) {
132 if (repair)
133 cma_stats.cpu_repairs.fmds_value.ui64++;
134 else
135 cma_stats.cpu_flts.fmds_value.ui64++;
136 } else {
137 rc = CMA_RA_FAILURE;
138 cma_stats.bad_flts.fmds_value.ui64++;
139 #ifdef sun4v
140 /* libldom requests are processed asynchronously */
141 cma_cpu_start_retry(hdl, nvl, uuid, repair);
142 #endif
143 }
144 }
145
146 if ((! repair && ! cma.cma_cpu_doblacklist) ||
147 (repair && ! cma.cma_cpu_dounblacklist)) {
148 fmd_hdl_debug(hdl, "suppressed %s of CPU\n",
149 repair ? "unblacklist" : "blacklist");
150 cma_stats.cpu_blsupp.fmds_value.ui64++;
151 } else {
152 if (cma_cpu_blacklist(hdl, nvl, asru, repair) < 0)
153 cma_stats.cpu_blfails.fmds_value.ui64++;
154 }
155
156 return (rc);
157 }
158
159 #else /* opl */
160
161 /* ARGSUSED 4 */
162 int
cma_cpu_hc_retire(fmd_hdl_t * hdl,nvlist_t * nvl,nvlist_t * asru,const char * uuid,boolean_t repair)163 cma_cpu_hc_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
164 const char *uuid, boolean_t repair)
165 {
166 uint_t cpuid;
167 uint_t i, nprs;
168 nvlist_t **hc_prs = NULL, *hc_spec_nvl;
169
170 /* OPL has ASRU in "hc" scheme */
171 if (nvlist_lookup_nvlist(asru, FM_FMRI_HC_SPECIFIC,
172 &hc_spec_nvl) != 0) {
173 cma_stats.bad_flts.fmds_value.ui64++;
174 fmd_hdl_debug(hdl,
175 "cma_cpu_hc_retire lookup hc_spec_nvl failed\n");
176 return (CMA_RA_FAILURE);
177 }
178
179 if (nvlist_lookup_nvlist_array(hc_spec_nvl, FM_FMRI_HC_CPUIDS,
180 &hc_prs, &nprs) != 0) {
181 cma_stats.bad_flts.fmds_value.ui64++;
182 fmd_hdl_debug(hdl,
183 "cma_cpu_hc_retire lookup cpuid array failed\n");
184 return (CMA_RA_FAILURE);
185 }
186
187 for (i = 0; i < nprs; i++) {
188 if (nvlist_lookup_uint32(hc_prs[i],
189 FM_FMRI_CPU_ID, &cpuid) != 0) {
190 cma_stats.bad_flts.fmds_value.ui64++;
191 return (CMA_RA_FAILURE);
192 }
193
194 if (cpu_statechange(hdl, nvl, hc_prs[i], uuid, cpuid, repair)
195 != CMA_RA_SUCCESS) {
196 cma_stats.bad_flts.fmds_value.ui64++;
197 return (CMA_RA_FAILURE);
198 }
199 }
200
201 return (CMA_RA_SUCCESS);
202 }
203 #endif /* opl */
204
/*
 * The rest of this file uses ASRUs to do retire.  This is no longer the
 * preferred way, but it is still needed in some circumstances where retire
 * via topo methods can't work, i.e.
 *
 * 1) There are legacy platforms which don't have full topology.
 * 2) The resources in the FMD cached faults may not be set or exist in the
 *    up-to-date topology.
 */
214
215 /* ARGSUSED */
216 static int
cpu_online(fmd_hdl_t * hdl,nvlist_t * nvl,nvlist_t * asru,const char * uuid,uint32_t cpuid)217 cpu_online(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
218 uint32_t cpuid)
219 {
220 int err = CMA_RA_SUCCESS;
221
222 if (cma.cma_cpu_doonline) {
223 err = cma_cpu_statechange(hdl, asru, uuid, P_ONLINE,
224 B_TRUE);
225 } else {
226 fmd_hdl_debug(hdl, "suppressed online of CPU %u\n",
227 cpuid);
228 cma_stats.cpu_supp.fmds_value.ui64++;
229 }
230
231 /* OPL performs the blacklist in the service processor */
232 #ifndef opl
233 if (cma.cma_cpu_dounblacklist) {
234 if (cma_cpu_blacklist(hdl, nvl, asru, B_TRUE) < 0)
235 cma_stats.cpu_blfails.fmds_value.ui64++;
236 } else {
237 fmd_hdl_debug(hdl, "suppressed unblacklist of CPU %u\n", cpuid);
238 cma_stats.cpu_blsupp.fmds_value.ui64++;
239 }
240 #endif /* opl */
241
242 return (err);
243 }
244
245 /* ARGSUSED */
246 static int
cpu_offline(fmd_hdl_t * hdl,nvlist_t * nvl,nvlist_t * asru,const char * uuid,uint32_t cpuid)247 cpu_offline(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
248 uint32_t cpuid)
249 {
250 int err = CMA_RA_FAILURE;
251
252 if (cma.cma_cpu_dooffline) {
253 int cpustate = P_FAULTED;
254
255 if (cma.cma_cpu_forcedoffline)
256 cpustate |= P_FORCED;
257 err = cma_cpu_statechange(hdl, asru, uuid, cpustate,
258 B_FALSE);
259 } else {
260 fmd_hdl_debug(hdl, "suppressed offline of CPU %u\n",
261 cpuid);
262 cma_stats.cpu_supp.fmds_value.ui64++;
263 }
264
265 /* OPL performs the blacklist in the service processor */
266 #ifndef opl
267 if (cma.cma_cpu_doblacklist) {
268 if (cma_cpu_blacklist(hdl, nvl, asru, B_FALSE) < 0)
269 cma_stats.cpu_blfails.fmds_value.ui64++;
270 } else {
271 fmd_hdl_debug(hdl, "suppressed blacklist of CPU %u\n",
272 cpuid);
273 cma_stats.cpu_blsupp.fmds_value.ui64++;
274 }
275 #endif /* opl */
276
277 return (err);
278 }
279
280 static int
cpu_statechange(fmd_hdl_t * hdl,nvlist_t * nvl,nvlist_t * asru,const char * uuid,uint32_t cpuid,boolean_t repair)281 cpu_statechange(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
282 uint32_t cpuid, boolean_t repair)
283 {
284 if (repair)
285 return (cpu_online(hdl, nvl, asru, uuid, cpuid));
286 else
287 return (cpu_offline(hdl, nvl, asru, uuid, cpuid));
288 }
289
290 const char *
p_online_state_fmt(int state)291 p_online_state_fmt(int state)
292 {
293 state &= ~P_FORCED;
294 switch (state) {
295 case P_OFFLINE:
296 return (PS_OFFLINE);
297 case P_ONLINE:
298 return (PS_ONLINE);
299 case P_FAULTED:
300 return (PS_FAULTED);
301 case P_POWEROFF:
302 return (PS_POWEROFF);
303 case P_NOINTR:
304 return (PS_NOINTR);
305 case P_SPARE:
306 return (PS_SPARE);
307 case P_DISABLED:
308 return (PS_DISABLED);
309 default:
310 return ("unknown");
311 }
312 }
313
314 int
cma_cpu_cpu_retire(fmd_hdl_t * hdl,nvlist_t * nvl,nvlist_t * asru,const char * uuid,boolean_t repair)315 cma_cpu_cpu_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
316 const char *uuid, boolean_t repair)
317 {
318 uint_t cpuid;
319
320 if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
321 fmd_hdl_debug(hdl, "cpu fault missing '%s'\n", FM_FMRI_CPU_ID);
322 cma_stats.bad_flts.fmds_value.ui64++;
323 return (CMA_RA_FAILURE);
324 }
325
326 return (cpu_statechange(hdl, nvl, asru, uuid, cpuid, repair));
327 }
328