xref: /illumos-gate/usr/src/cmd/fm/modules/common/cpumem-retire/cma_cpu.c (revision 62c8caf3fac65817982e780c1efa988846153bf0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <cma.h>
27 
28 #include <fcntl.h>
29 #include <unistd.h>
30 #include <strings.h>
31 #include <errno.h>
32 #include <time.h>
33 #include <fm/fmd_api.h>
34 #include <fm/fmd_agent.h>
35 #include <sys/fm/protocol.h>
36 #include <sys/bl.h>
37 #include <sys/processor.h>
38 
39 #ifdef i386
40 /*
41  * On x86, retire/unretire are done via the topo methods.
42  * To minimize the impact on existing/legacy sparc work, we leave
43  * some residual #ifdef ugliness.  The long-term intention would be to
44  * leave that legacy stuff to die a natural death when sparc diagnosis
45  * work can use the topo way of doing things.
46  */
47 
48 /*
49  * Check if the resource in the fault is in motherboard/chip/cpu topo.
50  */
51 static boolean_t
52 old_topo_fault(nvlist_t *nvl)
53 {
54 	nvlist_t *rsrc, **hcl;
55 	uint_t nhcl = 0;
56 	char *name;
57 
58 	if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0 &&
59 	    nvlist_lookup_nvlist_array(rsrc, FM_FMRI_HC_LIST, &hcl, &nhcl)
60 	    == 0 && nhcl == 3 &&
61 	    nvlist_lookup_string(hcl[0], FM_FMRI_HC_NAME, &name) == 0 &&
62 	    strcmp(name, "motherboard") == 0 &&
63 	    nvlist_lookup_string(hcl[1], FM_FMRI_HC_NAME, &name) == 0 &&
64 	    strcmp(name, "chip") == 0 &&
65 	    nvlist_lookup_string(hcl[2], FM_FMRI_HC_NAME, &name) == 0 &&
66 	    strcmp(name, "cpu") == 0)
67 		return (1);
68 
69 	return (0);
70 }
71 
72 /* ARGSUSED */
73 int
74 cma_cpu_hc_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
75     const char *uuid, boolean_t repair)
76 {
77 	int err;
78 	int rc = CMA_RA_SUCCESS;
79 	nvlist_t *rsrc;
80 
81 	/*
82 	 * For the cached faults which were diagnosed under the old
83 	 * chip/cpu topology, when in native, we call p_online(2) for the
84 	 * "cpu" scheme ASRUs.  Under Dom0, since logic cpuid in "cpu"
85 	 * scheme ASRU makes no sense, the fault should be ignored.
86 	 */
87 	if (old_topo_fault(nvl)) {
88 		if (cma_is_native)
89 			return (cma_cpu_retire(hdl, nvl, asru, uuid, repair));
90 		return (CMA_RA_FAILURE);
91 	}
92 
93 	/*
94 	 * Lookup the resource and call its topo methods to do retire/unretire
95 	 */
96 	if ((! repair && ! cma.cma_cpu_dooffline) ||
97 	    (repair && ! cma.cma_cpu_doonline)) {
98 		fmd_hdl_debug(hdl, "suppressed %s of CPU\n",
99 		    repair ? "unretire" : "retire");
100 		cma_stats.cpu_supp.fmds_value.ui64++;
101 	} else {
102 		err = FMD_AGENT_RETIRE_FAIL;
103 		if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0) {
104 			err = repair ? fmd_nvl_fmri_unretire(hdl, rsrc) :
105 			    fmd_nvl_fmri_retire(hdl, rsrc);
106 		}
107 		if (err == FMD_AGENT_RETIRE_DONE) {
108 			if (repair)
109 				cma_stats.cpu_repairs.fmds_value.ui64++;
110 			else
111 				cma_stats.cpu_flts.fmds_value.ui64++;
112 		} else {
113 			rc = CMA_RA_FAILURE;
114 			cma_stats.bad_flts.fmds_value.ui64++;
115 		}
116 	}
117 
118 	if ((! repair && ! cma.cma_cpu_doblacklist) ||
119 	    (repair && ! cma.cma_cpu_dounblacklist)) {
120 		fmd_hdl_debug(hdl, "suppressed %s of CPU\n",
121 		    repair ? "unblacklist" : "blacklist");
122 		cma_stats.cpu_blsupp.fmds_value.ui64++;
123 	} else {
124 		if (cma_cpu_blacklist(hdl, nvl, asru, repair) < 0)
125 			cma_stats.cpu_blfails.fmds_value.ui64++;
126 	}
127 
128 	return (rc);
129 }
130 #endif /* i386 */
131 
132 /* ARGSUSED */
133 static int
134 cpu_online(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
135     uint32_t cpuid)
136 {
137 	int err = CMA_RA_SUCCESS;
138 
139 	if (cma.cma_cpu_doonline) {
140 		err = cma_cpu_statechange(hdl, asru, uuid, P_ONLINE,
141 		    B_TRUE);
142 	} else {
143 		fmd_hdl_debug(hdl, "suppressed online of CPU %u\n",
144 		    cpuid);
145 		cma_stats.cpu_supp.fmds_value.ui64++;
146 	}
147 
148 	/* OPL performs the blacklist in the service processor */
149 #ifndef opl
150 	if (cma.cma_cpu_dounblacklist) {
151 		if (cma_cpu_blacklist(hdl, nvl, asru, B_TRUE) < 0)
152 			cma_stats.cpu_blfails.fmds_value.ui64++;
153 	} else {
154 		fmd_hdl_debug(hdl, "suppressed unblacklist of CPU %u\n", cpuid);
155 		cma_stats.cpu_blsupp.fmds_value.ui64++;
156 	}
157 #endif /* opl */
158 
159 	return (err);
160 }
161 
162 /* ARGSUSED */
163 static int
164 cpu_offline(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
165     uint32_t cpuid)
166 {
167 	int err = CMA_RA_FAILURE;
168 
169 	if (cma.cma_cpu_dooffline) {
170 		int cpustate = P_FAULTED;
171 
172 		if (cma.cma_cpu_forcedoffline)
173 			cpustate |= P_FORCED;
174 		err = cma_cpu_statechange(hdl, asru, uuid, cpustate,
175 		    B_FALSE);
176 	} else {
177 		fmd_hdl_debug(hdl, "suppressed offline of CPU %u\n",
178 		    cpuid);
179 		cma_stats.cpu_supp.fmds_value.ui64++;
180 	}
181 
182 	/* OPL performs the blacklist in the service processor */
183 #ifndef opl
184 	if (cma.cma_cpu_doblacklist) {
185 		if (cma_cpu_blacklist(hdl, nvl, asru, B_FALSE) < 0)
186 			cma_stats.cpu_blfails.fmds_value.ui64++;
187 	} else {
188 		fmd_hdl_debug(hdl, "suppressed blacklist of CPU %u\n",
189 		    cpuid);
190 		cma_stats.cpu_blsupp.fmds_value.ui64++;
191 	}
192 #endif /* opl */
193 
194 	return (err);
195 }
196 
197 static int
198 cpu_statechange(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
199     uint32_t cpuid, boolean_t repair)
200 {
201 	if (repair)
202 		return (cpu_online(hdl, nvl, asru, uuid, cpuid));
203 	else
204 		return (cpu_offline(hdl, nvl, asru, uuid, cpuid));
205 }
206 
207 const char *
208 p_online_state_fmt(int state)
209 {
210 	state &= ~P_FORCED;
211 	switch (state) {
212 	case P_OFFLINE:
213 		return (PS_OFFLINE);
214 	case P_ONLINE:
215 		return (PS_ONLINE);
216 	case P_FAULTED:
217 		return (PS_FAULTED);
218 	case P_POWEROFF:
219 		return (PS_POWEROFF);
220 	case P_NOINTR:
221 		return (PS_NOINTR);
222 	case P_SPARE:
223 		return (PS_SPARE);
224 	default:
225 		return ("unknown");
226 	}
227 }
228 
229 int
230 cma_cpu_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
231     boolean_t repair)
232 {
233 	uint_t cpuid;
234 
235 	if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
236 		fmd_hdl_debug(hdl, "cpu fault missing '%s'\n", FM_FMRI_CPU_ID);
237 		cma_stats.bad_flts.fmds_value.ui64++;
238 		return (CMA_RA_FAILURE);
239 	}
240 
241 	return (cpu_statechange(hdl, nvl, asru, uuid, cpuid, repair));
242 }
243 
244 #ifdef opl
245 /* ARGSUSED 4 */
246 int
247 cma_cpu_hc_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
248     const char *uuid, boolean_t repair)
249 {
250 	uint_t cpuid;
251 	uint_t i, nprs;
252 	nvlist_t **hc_prs = NULL, *hc_spec_nvl;
253 
254 	if (nvlist_lookup_nvlist(asru, FM_FMRI_HC_SPECIFIC,
255 	    &hc_spec_nvl) != 0) {
256 		cma_stats.bad_flts.fmds_value.ui64++;
257 		fmd_hdl_debug(hdl,
258 		    "cma_cpu_hc_retire lookup hc_spec_nvl failed\n");
259 		return (CMA_RA_FAILURE);
260 	}
261 
262 	if (nvlist_lookup_nvlist_array(hc_spec_nvl, FM_FMRI_HC_CPUIDS,
263 	    &hc_prs, &nprs) != 0) {
264 		cma_stats.bad_flts.fmds_value.ui64++;
265 		fmd_hdl_debug(hdl,
266 		    "cma_cpu_hc_retire lookup cpuid array failed\n");
267 		return (CMA_RA_FAILURE);
268 	}
269 
270 	for (i = 0; i < nprs; i++) {
271 		if (nvlist_lookup_uint32(hc_prs[i],
272 		    FM_FMRI_CPU_ID, &cpuid) != 0) {
273 			cma_stats.bad_flts.fmds_value.ui64++;
274 			return (CMA_RA_FAILURE);
275 		}
276 
277 		if (cpu_statechange(hdl, nvl, hc_prs[i], uuid, cpuid, repair)
278 		    != CMA_RA_SUCCESS) {
279 			cma_stats.bad_flts.fmds_value.ui64++;
280 			return (CMA_RA_FAILURE);
281 		}
282 	}
283 
284 	return (CMA_RA_SUCCESS);
285 }
286 #endif /* opl */
287