xref: /illumos-gate/usr/src/cmd/fm/modules/common/cpumem-retire/cma_cpu.c (revision 66582b606a8194f7f3ba5b3a3a6dca5b0d346361)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  * Copyright 2019 Joyent, Inc.
26  */
27 
28 #include <cma.h>
29 
30 #include <fcntl.h>
31 #include <unistd.h>
32 #include <strings.h>
33 #include <errno.h>
34 #include <time.h>
35 #include <fm/fmd_api.h>
36 #include <fm/fmd_agent.h>
37 #include <sys/fm/protocol.h>
38 #include <sys/bl.h>
39 #include <sys/processor.h>
40 
41 static int cpu_statechange(fmd_hdl_t *, nvlist_t *, nvlist_t *, const char *,
42     uint32_t, boolean_t);
43 
44 #ifndef opl
45 /*
46  * Perform retire/unretire by invoking the topo methods registered in the
47  * hc-scheme resource.
48  *
49  * If the fault is found to be diagnosed under the old topology, the resource
50  * will not exist in the current topology, then we fall back to legacy retire
51  * (using the "cpu" scheme ASRU).
52  */
53 
54 static boolean_t
55 old_topo_fault(nvlist_t *nvl)
56 {
57 	nvlist_t *rsrc;
58 #ifdef i386
59 	nvlist_t **hcl;
60 	uint_t nhcl = 0;
61 	char *name;
62 #endif
63 
64 	if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) != 0)
65 		return (B_TRUE);
66 #ifdef i386
67 	/*
68 	 * x86 has moved from "motherboard/chip/cpu" topo to
69 	 * "motherboard/chip/core/strand"
70 	 */
71 	if (nvlist_lookup_nvlist_array(rsrc, FM_FMRI_HC_LIST, &hcl, &nhcl)
72 	    == 0 && nhcl == 3 &&
73 	    nvlist_lookup_string(hcl[0], FM_FMRI_HC_NAME, &name) == 0 &&
74 	    strcmp(name, "motherboard") == 0 &&
75 	    nvlist_lookup_string(hcl[1], FM_FMRI_HC_NAME, &name) == 0 &&
76 	    strcmp(name, "chip") == 0 &&
77 	    nvlist_lookup_string(hcl[2], FM_FMRI_HC_NAME, &name) == 0 &&
78 	    strcmp(name, "cpu") == 0)
79 		return (B_TRUE);
80 #endif
81 
82 	return (B_FALSE);
83 }
84 
85 /* ARGSUSED */
86 int
87 cma_cpu_hc_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
88     const char *uuid, boolean_t repair)
89 {
90 	int i, err;
91 	int rc = CMA_RA_SUCCESS;
92 	nvlist_t *rsrc;
93 
94 	/*
95 	 * For the cached faults which were diagnosed under the old
96 	 * topology,  we fall back to retire by using cpu-scheme ASRUs.
97 	 * Under xVM Dom0, since logic cpuid in "cpu" scheme ASRU makes no
98 	 * sense, the fault should be ignored.
99 	 */
100 	if (old_topo_fault(nvl)) {
101 #ifdef i386
102 		if (! cma_is_native)
103 			return (CMA_RA_FAILURE);
104 #endif
105 		return (cma_cpu_cpu_retire(hdl, nvl, asru, uuid, repair));
106 	}
107 
108 	/*
109 	 * Lookup the resource and call its topo methods to do retire/unretire
110 	 */
111 	if ((! repair && ! cma.cma_cpu_dooffline) ||
112 	    (repair && ! cma.cma_cpu_doonline)) {
113 		fmd_hdl_debug(hdl, "suppressed %s of CPU\n",
114 		    repair ? "unretire" : "retire");
115 		cma_stats.cpu_supp.fmds_value.ui64++;
116 	} else {
117 		err = FMD_AGENT_RETIRE_FAIL;
118 		if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0) {
119 			if (repair) {
120 				err = fmd_nvl_fmri_unretire(hdl, rsrc);
121 			} else {
122 				for (i = 0; i < cma.cma_cpu_tries; i++) {
123 					err = fmd_nvl_fmri_retire(hdl, rsrc);
124 					if (err == FMD_AGENT_RETIRE_DONE)
125 						break;
126 					(void) nanosleep(&cma.cma_cpu_delay,
127 					    NULL);
128 				}
129 			}
130 		}
131 		if (err == FMD_AGENT_RETIRE_DONE) {
132 			if (repair)
133 				cma_stats.cpu_repairs.fmds_value.ui64++;
134 			else
135 				cma_stats.cpu_flts.fmds_value.ui64++;
136 		} else {
137 			rc = CMA_RA_FAILURE;
138 			cma_stats.bad_flts.fmds_value.ui64++;
139 #ifdef sun4v
140 			/* libldom requests are processed asynchronously */
141 			cma_cpu_start_retry(hdl, nvl, uuid, repair);
142 #endif
143 		}
144 	}
145 
146 	if ((! repair && ! cma.cma_cpu_doblacklist) ||
147 	    (repair && ! cma.cma_cpu_dounblacklist)) {
148 		fmd_hdl_debug(hdl, "suppressed %s of CPU\n",
149 		    repair ? "unblacklist" : "blacklist");
150 		cma_stats.cpu_blsupp.fmds_value.ui64++;
151 	} else {
152 		if (cma_cpu_blacklist(hdl, nvl, asru, repair) < 0)
153 			cma_stats.cpu_blfails.fmds_value.ui64++;
154 	}
155 
156 	return (rc);
157 }
158 
159 #else /* opl */
160 
161 /* ARGSUSED 4 */
162 int
163 cma_cpu_hc_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
164     const char *uuid, boolean_t repair)
165 {
166 	uint_t cpuid;
167 	uint_t i, nprs;
168 	nvlist_t **hc_prs = NULL, *hc_spec_nvl;
169 
170 	/* OPL has ASRU in "hc" scheme */
171 	if (nvlist_lookup_nvlist(asru, FM_FMRI_HC_SPECIFIC,
172 	    &hc_spec_nvl) != 0) {
173 		cma_stats.bad_flts.fmds_value.ui64++;
174 		fmd_hdl_debug(hdl,
175 		    "cma_cpu_hc_retire lookup hc_spec_nvl failed\n");
176 		return (CMA_RA_FAILURE);
177 	}
178 
179 	if (nvlist_lookup_nvlist_array(hc_spec_nvl, FM_FMRI_HC_CPUIDS,
180 	    &hc_prs, &nprs) != 0) {
181 		cma_stats.bad_flts.fmds_value.ui64++;
182 		fmd_hdl_debug(hdl,
183 		    "cma_cpu_hc_retire lookup cpuid array failed\n");
184 		return (CMA_RA_FAILURE);
185 	}
186 
187 	for (i = 0; i < nprs; i++) {
188 		if (nvlist_lookup_uint32(hc_prs[i],
189 		    FM_FMRI_CPU_ID, &cpuid) != 0) {
190 			cma_stats.bad_flts.fmds_value.ui64++;
191 			return (CMA_RA_FAILURE);
192 		}
193 
194 		if (cpu_statechange(hdl, nvl, hc_prs[i], uuid, cpuid, repair)
195 		    != CMA_RA_SUCCESS) {
196 			cma_stats.bad_flts.fmds_value.ui64++;
197 			return (CMA_RA_FAILURE);
198 		}
199 	}
200 
201 	return (CMA_RA_SUCCESS);
202 }
203 #endif /* opl */
204 
/*
 * The rest of this file uses ASRUs to do retire.  This is no longer the
 * preferred way, but it is still needed in some circumstances where
 * retire via topo methods can't work, i.e.
 *
 * 1) There are legacy platforms which don't have full topology.
 * 2) The resources in the FMD cached faults may not be set or exist in the
 *    up-to-date topology.
 */
214 
215 /* ARGSUSED */
216 static int
217 cpu_online(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
218     uint32_t cpuid)
219 {
220 	int err = CMA_RA_SUCCESS;
221 
222 	if (cma.cma_cpu_doonline) {
223 		err = cma_cpu_statechange(hdl, asru, uuid, P_ONLINE,
224 		    B_TRUE);
225 	} else {
226 		fmd_hdl_debug(hdl, "suppressed online of CPU %u\n",
227 		    cpuid);
228 		cma_stats.cpu_supp.fmds_value.ui64++;
229 	}
230 
231 	/* OPL performs the blacklist in the service processor */
232 #ifndef opl
233 	if (cma.cma_cpu_dounblacklist) {
234 		if (cma_cpu_blacklist(hdl, nvl, asru, B_TRUE) < 0)
235 			cma_stats.cpu_blfails.fmds_value.ui64++;
236 	} else {
237 		fmd_hdl_debug(hdl, "suppressed unblacklist of CPU %u\n", cpuid);
238 		cma_stats.cpu_blsupp.fmds_value.ui64++;
239 	}
240 #endif /* opl */
241 
242 	return (err);
243 }
244 
245 /* ARGSUSED */
246 static int
247 cpu_offline(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
248     uint32_t cpuid)
249 {
250 	int err = CMA_RA_FAILURE;
251 
252 	if (cma.cma_cpu_dooffline) {
253 		int cpustate = P_FAULTED;
254 
255 		if (cma.cma_cpu_forcedoffline)
256 			cpustate |= P_FORCED;
257 		err = cma_cpu_statechange(hdl, asru, uuid, cpustate,
258 		    B_FALSE);
259 	} else {
260 		fmd_hdl_debug(hdl, "suppressed offline of CPU %u\n",
261 		    cpuid);
262 		cma_stats.cpu_supp.fmds_value.ui64++;
263 	}
264 
265 	/* OPL performs the blacklist in the service processor */
266 #ifndef opl
267 	if (cma.cma_cpu_doblacklist) {
268 		if (cma_cpu_blacklist(hdl, nvl, asru, B_FALSE) < 0)
269 			cma_stats.cpu_blfails.fmds_value.ui64++;
270 	} else {
271 		fmd_hdl_debug(hdl, "suppressed blacklist of CPU %u\n",
272 		    cpuid);
273 		cma_stats.cpu_blsupp.fmds_value.ui64++;
274 	}
275 #endif /* opl */
276 
277 	return (err);
278 }
279 
280 static int
281 cpu_statechange(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
282     uint32_t cpuid, boolean_t repair)
283 {
284 	if (repair)
285 		return (cpu_online(hdl, nvl, asru, uuid, cpuid));
286 	else
287 		return (cpu_offline(hdl, nvl, asru, uuid, cpuid));
288 }
289 
290 const char *
291 p_online_state_fmt(int state)
292 {
293 	state &= ~P_FORCED;
294 	switch (state) {
295 	case P_OFFLINE:
296 		return (PS_OFFLINE);
297 	case P_ONLINE:
298 		return (PS_ONLINE);
299 	case P_FAULTED:
300 		return (PS_FAULTED);
301 	case P_POWEROFF:
302 		return (PS_POWEROFF);
303 	case P_NOINTR:
304 		return (PS_NOINTR);
305 	case P_SPARE:
306 		return (PS_SPARE);
307 	case P_DISABLED:
308 		return (PS_DISABLED);
309 	default:
310 		return ("unknown");
311 	}
312 }
313 
314 int
315 cma_cpu_cpu_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
316     const char *uuid, boolean_t repair)
317 {
318 	uint_t cpuid;
319 
320 	if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
321 		fmd_hdl_debug(hdl, "cpu fault missing '%s'\n", FM_FMRI_CPU_ID);
322 		cma_stats.bad_flts.fmds_value.ui64++;
323 		return (CMA_RA_FAILURE);
324 	}
325 
326 	return (cpu_statechange(hdl, nvl, asru, uuid, cpuid, repair));
327 }
328