xref: /titanic_44/usr/src/cmd/fm/modules/sun4v/cpumem-retire/cma_cpu_sun4v.c (revision e2dcee5754c56d91c6e1ff847db294541069ca0d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <cma.h>
27 
28 #include <sys/fm/ldom.h>
29 #include <sys/fm/protocol.h>
30 #include <fm/fmd_fmri.h>
31 #include <fm/libtopo.h>
32 
33 #include <assert.h>
34 #include <fcntl.h>
35 #include <unistd.h>
36 #include <errno.h>
37 #include <strings.h>
38 
39 #include <sys/types.h>
40 #include <sys/processor.h>
41 
42 extern ldom_hdl_t *cma_lhp;
43 
44 /*ARGSUSED*/
45 int
cpu_blacklist_cmd(fmd_hdl_t * hdl,nvlist_t * fmri,boolean_t repair)46 cpu_blacklist_cmd(fmd_hdl_t *hdl, nvlist_t *fmri, boolean_t repair)
47 {
48 	if (repair)
49 		return (ldom_fmri_unblacklist(cma_lhp, fmri));
50 	else
51 		return (ldom_fmri_blacklist(cma_lhp, fmri));
52 }
53 
54 int
cma_cpu_blacklist(fmd_hdl_t * hdl,nvlist_t * nvl,nvlist_t * asru,boolean_t repair)55 cma_cpu_blacklist(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
56     boolean_t repair)
57 {
58 	nvlist_t *fmri;
59 	int rc, err;
60 
61 	/*
62 	 * Some platforms have special unums for the E$ DIMMs.	If we're dealing
63 	 * with a platform that has these unums, one will have been added to the
64 	 * fault as the resource.  We'll use that for the blacklisting.  If we
65 	 * can't find a resource, we'll fall back to the ASRU.
66 	 */
67 	if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &fmri) != 0)
68 		fmri = asru;
69 
70 	rc = cpu_blacklist_cmd(hdl, fmri, repair);
71 	err = errno;
72 
73 	if (rc < 0 && err != ENOTSUP) {
74 		errno = err;
75 		return (-1);
76 	}
77 
78 	return (0);
79 }
80 
81 /*ARGSUSED*/
82 static int
cpu_cmd(fmd_hdl_t * hdl,nvlist_t * fmri,int cmd)83 cpu_cmd(fmd_hdl_t *hdl, nvlist_t *fmri, int cmd)
84 {
85 	int rc = 0;
86 	char *scheme;
87 
88 	/*
89 	 * We're using topo retire if the fmri is in "hc" scheme.
90 	 */
91 	if (nvlist_lookup_string(fmri, FM_FMRI_SCHEME, &scheme) == 0 &&
92 	    strcmp(scheme, FM_FMRI_SCHEME_HC) == 0) {
93 		if (cmd != P_STATUS) {
94 			errno = EINVAL;
95 			return (-1);
96 		}
97 		rc = fmd_nvl_fmri_service_state(hdl, fmri);
98 		switch (rc) {
99 		case FMD_SERVICE_STATE_UNUSABLE:
100 			return (P_FAULTED);
101 		case -1:
102 			return (-1);
103 		default:
104 			return (P_ONLINE);
105 		}
106 	}
107 
108 	switch (cmd & ~P_FORCED) {
109 	case P_STATUS:
110 		rc = ldom_fmri_status(cma_lhp, fmri);
111 		break;
112 	case P_FAULTED:
113 		rc = ldom_fmri_retire(cma_lhp, fmri);
114 		break;
115 	case P_ONLINE:
116 		rc = ldom_fmri_unretire(cma_lhp, fmri);
117 		break;
118 	default:
119 		errno = EINVAL;
120 		return (-1);
121 	}
122 
123 	if (rc != P_OFFLINE && rc != P_ONLINE && rc != P_FAULTED) {
124 		errno = rc;
125 		return (-1);
126 	}
127 
128 	return (rc);
129 }
130 
131 void
cma_cpu_start_retry(fmd_hdl_t * hdl,nvlist_t * fmri,const char * uuid,boolean_t repair)132 cma_cpu_start_retry(fmd_hdl_t *hdl, nvlist_t *fmri, const char *uuid,
133     boolean_t repair)
134 {
135 	cma_cpu_t *cpu;
136 	char *scheme;
137 	uint_t cpuid;
138 	nvlist_t *asru = NULL;
139 	topo_hdl_t *thp;
140 	int err;
141 
142 	if (repair || nvlist_lookup_string(fmri, FM_FMRI_SCHEME, &scheme) != 0)
143 		return;
144 	if (strcmp(scheme, FM_FMRI_SCHEME_CPU) == 0) {
145 		if (nvlist_lookup_uint32(fmri, FM_FMRI_CPU_ID, &cpuid) != 0)
146 			return;
147 	} else if (strcmp(scheme, FM_FMRI_SCHEME_HC) != 0) {
148 		return;
149 	} else {
150 		/* lookup cpuid from ASRU */
151 		thp = fmd_fmri_topo_hold(TOPO_VERSION);
152 		if (thp != NULL) {
153 			(void) topo_fmri_asru(thp, fmri, &asru, &err);
154 			fmd_fmri_topo_rele(thp);
155 		}
156 		if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
157 			nvlist_free(asru);
158 			return;
159 		}
160 	}
161 
162 	/*
163 	 * check to see if the cpu has been offline.
164 	 */
165 	fmd_hdl_debug(hdl, "cpu %u is not offline yet - sleeping\n", cpuid);
166 
167 	/*
168 	 * Create a cpu node and add to the head of the cpu list
169 	 */
170 	cpu = fmd_hdl_zalloc(hdl, sizeof (cma_cpu_t), FMD_SLEEP);
171 	(void) nvlist_dup(fmri, &cpu->cpu_fmri, 0);
172 	if (uuid != NULL)
173 		cpu->cpu_uuid = fmd_hdl_strdup(hdl, uuid, FMD_SLEEP);
174 
175 	cpu->cpuid = cpuid;
176 	cpu->cpu_next = cma.cma_cpus;
177 	cma.cma_cpus = cpu;
178 
179 	if (cma.cma_cpu_timerid != 0)
180 		fmd_timer_remove(hdl, cma.cma_cpu_timerid);
181 
182 	cma.cma_cpu_curdelay = cma.cma_cpu_mindelay;
183 
184 	cma.cma_cpu_timerid =
185 	    fmd_timer_install(hdl, NULL, NULL, cma.cma_cpu_curdelay);
186 }
187 
188 
189 int
cma_cpu_statechange(fmd_hdl_t * hdl,nvlist_t * asru,const char * uuid,int cpustate,boolean_t repair)190 cma_cpu_statechange(fmd_hdl_t *hdl, nvlist_t *asru, const char *uuid,
191     int cpustate, boolean_t repair)
192 {
193 	int i;
194 	uint_t cpuid;
195 
196 	if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
197 		fmd_hdl_debug(hdl, "missing '%s'\n", FM_FMRI_CPU_ID);
198 		cma_stats.bad_flts.fmds_value.ui64++;
199 		return (CMA_RA_FAILURE);
200 	}
201 
202 	/*
203 	 * cpu offlining using ldom_fmri_retire() may be asynchronous, so we
204 	 * have to set the timer and check the cpu status later.
205 	 */
206 	for (i = 0; i < cma.cma_cpu_tries;
207 	    i++, (void) nanosleep(&cma.cma_cpu_delay, NULL)) {
208 		if (cpu_cmd(hdl, asru, cpustate) != -1) {
209 			if (repair)
210 				cma_stats.cpu_repairs.fmds_value.ui64++;
211 			else
212 				cma_stats.cpu_flts.fmds_value.ui64++;
213 			break;
214 		}
215 	}
216 
217 	if (i >= cma.cma_cpu_tries) {
218 		cma_stats.cpu_fails.fmds_value.ui64++;
219 	}
220 
221 	cma_cpu_start_retry(hdl, asru, uuid, repair);
222 
223 	return (CMA_RA_FAILURE);
224 }
225 
226 static int
cpu_retry(fmd_hdl_t * hdl,cma_cpu_t * cpu)227 cpu_retry(fmd_hdl_t *hdl, cma_cpu_t *cpu)
228 {
229 	int rc = 0;
230 
231 	fmd_hdl_debug(hdl, "cpu_retry()\n");
232 
233 	if (cpu->cpu_fmri == NULL) {
234 		return (1);
235 	}
236 
237 	if (!fmd_nvl_fmri_present(hdl, cpu->cpu_fmri)) {
238 		fmd_hdl_debug(hdl, "cpu %u is not present", cpu->cpuid);
239 		return (1);
240 	}
241 
242 	rc = cpu_cmd(hdl, cpu->cpu_fmri, P_STATUS);
243 	if (rc == P_FAULTED || rc == P_OFFLINE) {
244 		fmd_hdl_debug(hdl, "cpu %u is offlined on retry %u\n",
245 		    cpu->cpuid, cpu->cpu_nretries);
246 		cma_stats.cpu_flts.fmds_value.ui64++;
247 
248 		if (cpu->cpu_uuid != NULL)
249 			fmd_case_uuclose(hdl, cpu->cpu_uuid);
250 		return (1); /* success */
251 	}
252 
253 	if (rc == -1) {
254 		fmd_hdl_debug(hdl, "failed to retry cpu %u\n", cpu->cpuid);
255 		cma_stats.page_fails.fmds_value.ui64++;
256 		return (1); /* give up */
257 	}
258 
259 	return (0);
260 }
261 
262 static void
cma_cpu_free(fmd_hdl_t * hdl,cma_cpu_t * cpu)263 cma_cpu_free(fmd_hdl_t *hdl, cma_cpu_t *cpu)
264 {
265 	nvlist_free(cpu->cpu_fmri);
266 	if (cpu->cpu_uuid != NULL)
267 		fmd_hdl_strfree(hdl, cpu->cpu_uuid);
268 	fmd_hdl_free(hdl, cpu, sizeof (cma_cpu_t));
269 }
270 
271 void
cma_cpu_retry(fmd_hdl_t * hdl)272 cma_cpu_retry(fmd_hdl_t *hdl)
273 {
274 	cma_cpu_t **cpup;
275 
276 	fmd_hdl_debug(hdl, "cma_cpu_retry: timer fired\n");
277 
278 	cma.cma_cpu_timerid = 0;
279 
280 	cpup = &cma.cma_cpus;
281 	while (*cpup != NULL) {
282 		cma_cpu_t *cpu = *cpup;
283 
284 		if (cpu_retry(hdl, cpu)) {
285 			/*
286 			 * Successful retry or we're giving up - remove from
287 			 * the list
288 			 */
289 			*cpup = cpu->cpu_next;
290 
291 			cma_cpu_free(hdl, cpu);
292 		} else {
293 			cpu->cpu_nretries++;
294 			cpup = &cpu->cpu_next;
295 		}
296 	}
297 
298 	if (cma.cma_cpus == NULL)
299 		return; /* no more cpus */
300 
301 	/*
302 	 * We still have cpus to check.  Back the delay
303 	 * off, and schedule a retry.
304 	 */
305 	cma.cma_cpu_curdelay = MIN(cma.cma_cpu_curdelay * 2,
306 	    cma.cma_cpu_maxdelay);
307 
308 	fmd_hdl_debug(hdl, "scheduled cpu offline retry for %llu secs\n",
309 	    (u_longlong_t)(cma.cma_cpu_curdelay / NANOSEC));
310 
311 	cma.cma_cpu_timerid =
312 	    fmd_timer_install(hdl, NULL, NULL, cma.cma_cpu_curdelay);
313 }
314 
315 void
cma_cpu_fini(fmd_hdl_t * hdl)316 cma_cpu_fini(fmd_hdl_t *hdl)
317 {
318 	cma_cpu_t *cpu;
319 
320 	while ((cpu = cma.cma_cpus) != NULL) {
321 		cma.cma_cpus = cpu->cpu_next;
322 		cma_cpu_free(hdl, cpu);
323 	}
324 }
325