xref: /illumos-gate/usr/src/cmd/fm/modules/common/cpumem-retire/cma_cpu.c (revision 35a5a3587fd94b666239c157d3722745250ccbd7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <cma.h>
27 
28 #include <fcntl.h>
29 #include <unistd.h>
30 #include <strings.h>
31 #include <errno.h>
32 #include <time.h>
33 #include <fm/fmd_api.h>
34 #include <fm/fmd_agent.h>
35 #include <sys/fm/protocol.h>
36 #include <sys/bl.h>
37 #include <sys/processor.h>
38 
/*
 * Forward declaration: cpu_statechange() is defined further down in this
 * file, but the opl build of cma_cpu_hc_retire() calls it before that
 * definition appears.
 */
static int cpu_statechange(fmd_hdl_t *, nvlist_t *, nvlist_t *, const char *,
    uint32_t, boolean_t);
41 
42 #ifndef opl
/*
 * Perform retire/unretire by invoking the topo methods registered in the
 * hc-scheme resource.
 *
 * If the fault was diagnosed under the old topology, its resource will not
 * exist in the current topology; in that case we fall back to legacy retire
 * (using the "cpu" scheme ASRU).
 */
51 
52 static boolean_t
53 old_topo_fault(nvlist_t *nvl)
54 {
55 	nvlist_t *rsrc;
56 #ifdef i386
57 	nvlist_t **hcl;
58 	uint_t nhcl = 0;
59 	char *name;
60 #endif
61 
62 	if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) != 0)
63 		return (B_TRUE);
64 #ifdef i386
65 	/*
66 	 * x86 has moved from "motherboard/chip/cpu" topo to
67 	 * "motherboard/chip/core/strand"
68 	 */
69 	if (nvlist_lookup_nvlist_array(rsrc, FM_FMRI_HC_LIST, &hcl, &nhcl)
70 	    == 0 && nhcl == 3 &&
71 	    nvlist_lookup_string(hcl[0], FM_FMRI_HC_NAME, &name) == 0 &&
72 	    strcmp(name, "motherboard") == 0 &&
73 	    nvlist_lookup_string(hcl[1], FM_FMRI_HC_NAME, &name) == 0 &&
74 	    strcmp(name, "chip") == 0 &&
75 	    nvlist_lookup_string(hcl[2], FM_FMRI_HC_NAME, &name) == 0 &&
76 	    strcmp(name, "cpu") == 0)
77 		return (B_TRUE);
78 #endif
79 
80 	return (B_FALSE);
81 }
82 
83 /* ARGSUSED */
84 int
85 cma_cpu_hc_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
86     const char *uuid, boolean_t repair)
87 {
88 	int i, err;
89 	int rc = CMA_RA_SUCCESS;
90 	nvlist_t *rsrc;
91 
92 	/*
93 	 * For the cached faults which were diagnosed under the old
94 	 * topology,  we fall back to retire by using cpu-scheme ASRUs.
95 	 * Under xVM Dom0, since logic cpuid in "cpu" scheme ASRU makes no
96 	 * sense, the fault should be ignored.
97 	 */
98 	if (old_topo_fault(nvl)) {
99 #ifdef i386
100 		if (! cma_is_native)
101 			return (CMA_RA_FAILURE);
102 #endif
103 		return (cma_cpu_cpu_retire(hdl, nvl, asru, uuid, repair));
104 	}
105 
106 	/*
107 	 * Lookup the resource and call its topo methods to do retire/unretire
108 	 */
109 	if ((! repair && ! cma.cma_cpu_dooffline) ||
110 	    (repair && ! cma.cma_cpu_doonline)) {
111 		fmd_hdl_debug(hdl, "suppressed %s of CPU\n",
112 		    repair ? "unretire" : "retire");
113 		cma_stats.cpu_supp.fmds_value.ui64++;
114 	} else {
115 		err = FMD_AGENT_RETIRE_FAIL;
116 		if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0) {
117 			if (repair) {
118 				err = fmd_nvl_fmri_unretire(hdl, rsrc);
119 			} else {
120 				for (i = 0; i < cma.cma_cpu_tries; i++) {
121 					err = fmd_nvl_fmri_retire(hdl, rsrc);
122 					if (err == FMD_AGENT_RETIRE_DONE)
123 						break;
124 					(void) nanosleep(&cma.cma_cpu_delay,
125 					    NULL);
126 				}
127 			}
128 		}
129 		if (err == FMD_AGENT_RETIRE_DONE) {
130 			if (repair)
131 				cma_stats.cpu_repairs.fmds_value.ui64++;
132 			else
133 				cma_stats.cpu_flts.fmds_value.ui64++;
134 		} else {
135 			rc = CMA_RA_FAILURE;
136 			cma_stats.bad_flts.fmds_value.ui64++;
137 #ifdef sun4v
138 			/* libldom requests are processed asynchronously */
139 			cma_cpu_start_retry(hdl, nvl, uuid, repair);
140 #endif
141 		}
142 	}
143 
144 	if ((! repair && ! cma.cma_cpu_doblacklist) ||
145 	    (repair && ! cma.cma_cpu_dounblacklist)) {
146 		fmd_hdl_debug(hdl, "suppressed %s of CPU\n",
147 		    repair ? "unblacklist" : "blacklist");
148 		cma_stats.cpu_blsupp.fmds_value.ui64++;
149 	} else {
150 		if (cma_cpu_blacklist(hdl, nvl, asru, repair) < 0)
151 			cma_stats.cpu_blfails.fmds_value.ui64++;
152 	}
153 
154 	return (rc);
155 }
156 
157 #else /* opl */
158 
159 /* ARGSUSED 4 */
160 int
161 cma_cpu_hc_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
162     const char *uuid, boolean_t repair)
163 {
164 	uint_t cpuid;
165 	uint_t i, nprs;
166 	nvlist_t **hc_prs = NULL, *hc_spec_nvl;
167 
168 	/* OPL has ASRU in "hc" scheme */
169 	if (nvlist_lookup_nvlist(asru, FM_FMRI_HC_SPECIFIC,
170 	    &hc_spec_nvl) != 0) {
171 		cma_stats.bad_flts.fmds_value.ui64++;
172 		fmd_hdl_debug(hdl,
173 		    "cma_cpu_hc_retire lookup hc_spec_nvl failed\n");
174 		return (CMA_RA_FAILURE);
175 	}
176 
177 	if (nvlist_lookup_nvlist_array(hc_spec_nvl, FM_FMRI_HC_CPUIDS,
178 	    &hc_prs, &nprs) != 0) {
179 		cma_stats.bad_flts.fmds_value.ui64++;
180 		fmd_hdl_debug(hdl,
181 		    "cma_cpu_hc_retire lookup cpuid array failed\n");
182 		return (CMA_RA_FAILURE);
183 	}
184 
185 	for (i = 0; i < nprs; i++) {
186 		if (nvlist_lookup_uint32(hc_prs[i],
187 		    FM_FMRI_CPU_ID, &cpuid) != 0) {
188 			cma_stats.bad_flts.fmds_value.ui64++;
189 			return (CMA_RA_FAILURE);
190 		}
191 
192 		if (cpu_statechange(hdl, nvl, hc_prs[i], uuid, cpuid, repair)
193 		    != CMA_RA_SUCCESS) {
194 			cma_stats.bad_flts.fmds_value.ui64++;
195 			return (CMA_RA_FAILURE);
196 		}
197 	}
198 
199 	return (CMA_RA_SUCCESS);
200 }
201 #endif /* opl */
202 
/*
 * The rest of this file uses ASRUs to do retire.  This is no longer the
 * preferred way, but it is still needed in some circumstances where
 * retire via topo methods can't work, i.e.
 *
 * 1) There are legacy platforms which don't have full topology.
 * 2) The resources in the FMD cached faults may not be set or may not exist
 *    in the up-to-date topology.
 */
212 
213 /* ARGSUSED */
214 static int
215 cpu_online(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
216     uint32_t cpuid)
217 {
218 	int err = CMA_RA_SUCCESS;
219 
220 	if (cma.cma_cpu_doonline) {
221 		err = cma_cpu_statechange(hdl, asru, uuid, P_ONLINE,
222 		    B_TRUE);
223 	} else {
224 		fmd_hdl_debug(hdl, "suppressed online of CPU %u\n",
225 		    cpuid);
226 		cma_stats.cpu_supp.fmds_value.ui64++;
227 	}
228 
229 	/* OPL performs the blacklist in the service processor */
230 #ifndef opl
231 	if (cma.cma_cpu_dounblacklist) {
232 		if (cma_cpu_blacklist(hdl, nvl, asru, B_TRUE) < 0)
233 			cma_stats.cpu_blfails.fmds_value.ui64++;
234 	} else {
235 		fmd_hdl_debug(hdl, "suppressed unblacklist of CPU %u\n", cpuid);
236 		cma_stats.cpu_blsupp.fmds_value.ui64++;
237 	}
238 #endif /* opl */
239 
240 	return (err);
241 }
242 
243 /* ARGSUSED */
244 static int
245 cpu_offline(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
246     uint32_t cpuid)
247 {
248 	int err = CMA_RA_FAILURE;
249 
250 	if (cma.cma_cpu_dooffline) {
251 		int cpustate = P_FAULTED;
252 
253 		if (cma.cma_cpu_forcedoffline)
254 			cpustate |= P_FORCED;
255 		err = cma_cpu_statechange(hdl, asru, uuid, cpustate,
256 		    B_FALSE);
257 	} else {
258 		fmd_hdl_debug(hdl, "suppressed offline of CPU %u\n",
259 		    cpuid);
260 		cma_stats.cpu_supp.fmds_value.ui64++;
261 	}
262 
263 	/* OPL performs the blacklist in the service processor */
264 #ifndef opl
265 	if (cma.cma_cpu_doblacklist) {
266 		if (cma_cpu_blacklist(hdl, nvl, asru, B_FALSE) < 0)
267 			cma_stats.cpu_blfails.fmds_value.ui64++;
268 	} else {
269 		fmd_hdl_debug(hdl, "suppressed blacklist of CPU %u\n",
270 		    cpuid);
271 		cma_stats.cpu_blsupp.fmds_value.ui64++;
272 	}
273 #endif /* opl */
274 
275 	return (err);
276 }
277 
278 static int
279 cpu_statechange(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
280     uint32_t cpuid, boolean_t repair)
281 {
282 	if (repair)
283 		return (cpu_online(hdl, nvl, asru, uuid, cpuid));
284 	else
285 		return (cpu_offline(hdl, nvl, asru, uuid, cpuid));
286 }
287 
288 const char *
289 p_online_state_fmt(int state)
290 {
291 	state &= ~P_FORCED;
292 	switch (state) {
293 	case P_OFFLINE:
294 		return (PS_OFFLINE);
295 	case P_ONLINE:
296 		return (PS_ONLINE);
297 	case P_FAULTED:
298 		return (PS_FAULTED);
299 	case P_POWEROFF:
300 		return (PS_POWEROFF);
301 	case P_NOINTR:
302 		return (PS_NOINTR);
303 	case P_SPARE:
304 		return (PS_SPARE);
305 	default:
306 		return ("unknown");
307 	}
308 }
309 
310 int
311 cma_cpu_cpu_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
312     const char *uuid, boolean_t repair)
313 {
314 	uint_t cpuid;
315 
316 	if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
317 		fmd_hdl_debug(hdl, "cpu fault missing '%s'\n", FM_FMRI_CPU_ID);
318 		cma_stats.bad_flts.fmds_value.ui64++;
319 		return (CMA_RA_FAILURE);
320 	}
321 
322 	return (cpu_statechange(hdl, nvl, asru, uuid, cpuid, repair));
323 }
324