xref: /titanic_50/usr/src/cmd/fm/modules/sun4v/cpumem-diagnosis/cmd_cpu_arch.c (revision 050c9ebdc9d01dca610febe083c1796c5e013868)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Support routines for managing per-CPU state.
30  */
31 
32 #include <cmd_cpu.h>
33 #include <cmd_mem.h>
34 #include <cmd.h>
35 
36 #include <stdio.h>
37 #include <string.h>
38 #include <strings.h>
39 #include <errno.h>
40 #include <kstat.h>
41 #include <fm/fmd_api.h>
42 #include <sys/async.h>
43 #include <sys/fm/protocol.h>
44 #include <sys/fm/cpu/UltraSPARC-T1.h>
45 #include <sys/niagararegs.h>
46 #include <cmd_hc_sun4v.h>
47 
48 int cmd_afsr_check(fmd_hdl_t *,  uint64_t, cmd_errcl_t, uint8_t *);
49 
50 const errdata_t l3errdata =
51 	{ &cmd.cmd_l3data_serd, "l3cachedata", CMD_PTR_CPU_L3DATA  };
52 const errdata_t n1l2errdata =
53 	{ &cmd.cmd_l2data_serd, "l2cachedata", CMD_PTR_CPU_L2DATA };
54 const errdata_t n2ce_l2errdata =
55 	{ &cmd.cmd_l2data_serd, "l2data-c", CMD_PTR_CPU_L2DATA };
56 const errdata_t n2ue_l2errdata =
57 	{ &cmd.cmd_l2data_serd, "l2data-u", CMD_PTR_CPU_L2DATA };
58 const errdata_t miscregsdata =
59 	{ &cmd.cmd_miscregs_serd, "misc_reg", CMD_PTR_CPU_MISC_REGS };
60 const errdata_t dcachedata =
61 	{ &cmd.cmd_dcache_serd, "dcache", CMD_PTR_CPU_DCACHE };
62 const errdata_t icachedata =
63 	{ &cmd.cmd_icache_serd, "icache", CMD_PTR_CPU_ICACHE };
64 
65 static int
cmd_xr_error_type(cmd_errcl_t clcode)66 cmd_xr_error_type(cmd_errcl_t clcode)
67 {
68 	if (CMD_ERRCL_ISMISCREGS(clcode))
69 		return (MISCREGS_ERR);
70 	else if (CMD_ERRCL_ISL2XXCU(clcode))
71 		return (L2_ERR);
72 	else if (CMD_ERRCL_ISL2ND(clcode))
73 		return (L2ND_ERR);
74 	else if (CMD_ERRCL_ISMEM(clcode))
75 		return (MEM_ERR);
76 	else if (CMD_ERRCL_ISDCDP(clcode))
77 		return (DCDP_ERR);
78 	else if (CMD_ERRCL_ISICDP(clcode))
79 		return (ICDP_ERR);
80 	else if (CMD_ERRCL_REMOTEL2(clcode))
81 		return (REMOTE_L2ERR);
82 	else
83 		return (UNKNOWN_ERR);
84 }
85 
86 void
cmd_fill_errdata(cmd_errcl_t clcode,cmd_cpu_t * cpu,cmd_case_t ** cc,const errdata_t ** ed)87 cmd_fill_errdata(cmd_errcl_t clcode, cmd_cpu_t *cpu, cmd_case_t **cc,
88     const errdata_t **ed)
89 {
90 	int err_type;
91 
92 	err_type = cmd_xr_error_type(clcode);
93 	switch (err_type) {
94 		case MISCREGS_ERR:
95 			*ed = &miscregsdata;
96 			*cc = &cpu->cpu_misc_regs;
97 			break;
98 		case L2_ERR:
99 		case REMOTE_L2ERR:
100 			if (cpu->cpu_type == CPU_ULTRASPARC_T1) {
101 				*ed = &n1l2errdata;
102 				*cc = &cpu->cpu_l2data;
103 			} else {
104 				if (CMD_ERRCL_ISL2CE(clcode)) {
105 					*ed = &n2ce_l2errdata;
106 					*cc = &cpu->cpu_l2data;
107 				} else {
108 					*ed = &n2ue_l2errdata;
109 					*cc = &cpu->cpu_l2data;
110 				}
111 			}
112 			break;
113 		case DCDP_ERR:
114 			*ed = &dcachedata;
115 			*cc = &cpu->cpu_dcache;
116 			break;
117 		case ICDP_ERR:
118 			*ed = &icachedata;
119 			*cc = &cpu->cpu_icache;
120 			break;
121 		/*
122 		 * When an error goes through the train, it requires
123 		 * to have cmd_case_t & errdata_t structures even it is not
124 		 * diagnosed when the error is resolved. Sun4v does
125 		 * does not have a L3 error, but the L3 cpu case was defined,
126 		 * so its data structures are used for the default cases.
127 		 */
128 		default:
129 			*ed = &l3errdata;
130 			*cc = &cpu->cpu_l3data;
131 			break;
132 	}
133 }
134 
135 int
cmd_afar_status_check(uint8_t afar_status,cmd_errcl_t clcode)136 cmd_afar_status_check(uint8_t afar_status, cmd_errcl_t clcode)
137 {
138 
139 	/*
140 	 * There is no L2 data for a remote write back
141 	 * cache error in the ereport, so skip the status check
142 	 */
143 	if (clcode == CMD_ERRCL_WBUE)
144 		return (0);
145 
146 	if (afar_status == AFLT_STAT_VALID)
147 		return (0);
148 	return (-1);
149 }
150 
151 /*
152  * Search for the entry that matches the ena and the AFAR
153  * if we have a valid AFAR, otherwise search for the entry
154  * that its's ena is < delta ENA.
155  */
156 /*ARGSUSED*/
157 cmd_xxcu_trw_t *
cmd_trw_lookup(uint64_t ena,uint8_t afar_status,uint64_t afar)158 cmd_trw_lookup(uint64_t ena, uint8_t afar_status, uint64_t afar)
159 {
160 	int i;
161 
162 	if (afar_status == AFLT_STAT_VALID) {
163 		for (i = 0; i < cmd.cmd_xxcu_ntrw; i++) {
164 			if (cmd.cmd_xxcu_trw[i].trw_ena != 0) {
165 				if ((llabs(ena - cmd.cmd_xxcu_trw[i].trw_ena) <
166 				    cmd.cmd_delta_ena) &&
167 				    (cmd.cmd_xxcu_trw[i].trw_afar == afar))
168 					return (&cmd.cmd_xxcu_trw[i]);
169 			}
170 		}
171 	}
172 
173 	for (i = 0; i < cmd.cmd_xxcu_ntrw; i++) {
174 		if (cmd.cmd_xxcu_trw[i].trw_ena != 0) {
175 			if (llabs(ena - cmd.cmd_xxcu_trw[i].trw_ena)
176 			    < cmd.cmd_delta_ena)
177 				return (&cmd.cmd_xxcu_trw[i]);
178 		}
179 	}
180 
181 	return (NULL);
182 }
183 
184 cmd_errcl_t
cmd_get_nextbit(cmd_errcl_t trw_mask)185 cmd_get_nextbit(cmd_errcl_t trw_mask)
186 {
187 	cmd_errcl_t tmp_mask = 0;
188 	cmd_errcl_t tmp;
189 	int i;
190 
191 	for (i = 0; i < 64; i++) {
192 		tmp = (0x0000000000000001ULL << i);
193 		if (tmp & trw_mask) {
194 			tmp_mask = tmp;
195 			break;
196 		}
197 	}
198 	return (tmp_mask);
199 }
200 
201 /*
202  * For a resolved error, its error code will be paired with
203  * each error code in the train mask and compared against the
204  * pre-defined trains in the cmd_cpu.c to determine if the error
205  * is in the train.
206  */
207 cmd_errcl_t
cmd_combine_two_train(cmd_errcl_t trw_mask,cmd_errcl_t resolved_err)208 cmd_combine_two_train(cmd_errcl_t trw_mask, cmd_errcl_t resolved_err)
209 {
210 	cmd_errcl_t tmp_mask = 0;
211 	cmd_errcl_t train_mask = 0;
212 	cmd_errcl_t cause = 0;
213 	cmd_errcl_t error_mask = trw_mask ^ resolved_err;
214 
215 	while (error_mask) {
216 		tmp_mask = cmd_get_nextbit(error_mask);
217 		if (tmp_mask == 0)
218 			break;
219 		train_mask = tmp_mask | resolved_err;
220 		cause = cmd_xxcu_train_match(train_mask);
221 		if (cause) {
222 			return (cause);
223 		}
224 		error_mask = error_mask ^ tmp_mask;
225 	}
226 	return (0);
227 }
228 
229 cmd_errcl_t
cmd_train_match(cmd_errcl_t trw_mask,cmd_errcl_t resolved_err)230 cmd_train_match(cmd_errcl_t trw_mask, cmd_errcl_t resolved_err)
231 {
232 	return (cmd_combine_two_train(trw_mask, resolved_err));
233 }
234 
235 int
cmd_xr_fill(fmd_hdl_t * hdl,nvlist_t * nvl,cmd_xr_t * xr,cmd_errcl_t clcode)236 cmd_xr_fill(fmd_hdl_t *hdl, nvlist_t *nvl, cmd_xr_t *xr, cmd_errcl_t clcode)
237 {
238 	uint64_t niagara_l2_afsr = 0;
239 	int errtype;
240 
241 	errtype = cmd_xr_error_type(clcode);
242 	/*
243 	 * skip the fill data for the errors which is not L2 errors.
244 	 */
245 	if (errtype != L2_ERR) {
246 		fmd_hdl_debug(hdl, "Skip fill L2 data for errtype %d\n",
247 		    errtype);
248 		return (0);
249 	}
250 
251 	if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_AFSR,
252 	    &niagara_l2_afsr) != 0 &&
253 	    nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_ESR,
254 	    &niagara_l2_afsr) != 0) {
255 		fmd_hdl_debug(hdl, "No L2 AFSR data");
256 		return (-1);
257 	}
258 	if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_AFAR,
259 	    &xr->xr_afar) != 0 &&
260 	    nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_EAR,
261 	    &xr->xr_afar) != 0) {
262 		fmd_hdl_debug(hdl, "No L2 AFAR data");
263 		return (-1);
264 	}
265 	if (nvlist_lookup_uint32(nvl, FM_EREPORT_PAYLOAD_NAME_L2_SYND,
266 	    &xr->xr_synd) != 0) {
267 		/* Niagara-2 doesn't provide separate (redundant) l2-synd */
268 		xr->xr_synd = niagara_l2_afsr & NI2_L2AFSR_SYND;
269 	}
270 
271 	if (cmd_afsr_check(hdl, niagara_l2_afsr, clcode,
272 	    &xr->xr_synd_status) != 0) {
273 		fmd_hdl_debug(hdl, "Invalid L2 syndrome");
274 		return (-1);
275 	}
276 
277 	xr->xr_afar_status = xr->xr_synd_status;
278 	return (0);
279 }
280 
281 int
cmd_cpu_synd_check(uint32_t synd,cmd_errcl_t clcode)282 cmd_cpu_synd_check(uint32_t synd, cmd_errcl_t clcode)
283 {
284 	int i;
285 
286 	/*
287 	 * Niagara L2 fetches from a memory location containing a UE
288 	 * are given a poison syndrome in one or more 7 bit subsyndromes
289 	 * each covering one of 4 4 byte checkwords.
290 	 *
291 	 * 0 is an invalid syndrome because it denotes no error, but
292 	 * is associated with an ereport -- meaning there WAS an error.
293 	 */
294 	/*
295 	 * HW does not store the syndrome value for write-back cache
296 	 * error, so skip the synd check for L2 write-back error
297 	 */
298 	if (CMD_ERRCL_L2UE_WRITEBACK(clcode))
299 		return (0);
300 
301 	if (synd == 0)
302 		return (-1);
303 
304 	for (i = 0; i < 4; i++) {
305 		if (((synd >> i*NI_L2_POISON_SYND_SIZE) &
306 		    NI_L2_POISON_SYND_MASK) == NI_L2_POISON_SYND_FROM_DAU)
307 			return (-1);
308 	}
309 	return (0);
310 }
311 
312 int
cmd_afsr_check(fmd_hdl_t * hdl,uint64_t afsr,cmd_errcl_t clcode,uint8_t * stat_val)313 cmd_afsr_check(fmd_hdl_t *hdl, uint64_t afsr,
314     cmd_errcl_t clcode, uint8_t *stat_val)
315 {
316 	/*
317 	 * Set Niagara afar and synd validity.
318 	 * For a given set of error registers, the payload value is valid iff
319 	 * no higher priority error status bit is set.  See niagararegs.h
320 	 * for error status bit values and priority settings.
321 	 */
322 	switch (clcode) {
323 	case CMD_ERRCL_LDAU:
324 	case CMD_ERRCL_LDSU:
325 	case CMD_ERRCL_DL2U:
326 	case CMD_ERRCL_IL2U:
327 		*stat_val =
328 		    ((afsr & NI_L2AFSR_P02) == 0) ?
329 		    AFLT_STAT_VALID: AFLT_STAT_INVALID;
330 		break;
331 	case CMD_ERRCL_LDWU:
332 		*stat_val =
333 		    ((afsr & NI_L2AFSR_P03) == 0) ?
334 		    AFLT_STAT_VALID : AFLT_STAT_INVALID;
335 		break;
336 	case CMD_ERRCL_LDRU:
337 		*stat_val =
338 		    ((afsr & NI_L2AFSR_P04) == 0) ?
339 		    AFLT_STAT_VALID : AFLT_STAT_INVALID;
340 		break;
341 	case CMD_ERRCL_LDAC:
342 	case CMD_ERRCL_LDSC:
343 		*stat_val =
344 		    ((afsr & NI_L2AFSR_P08) == 0) ?
345 		    AFLT_STAT_VALID : AFLT_STAT_INVALID;
346 		break;
347 	case CMD_ERRCL_LDWC:
348 		*stat_val =
349 		    ((afsr & NI_L2AFSR_P09) == 0) ?
350 		    AFLT_STAT_VALID : AFLT_STAT_INVALID;
351 		break;
352 	case CMD_ERRCL_LDRC:
353 		*stat_val =
354 		    ((afsr & NI_L2AFSR_P10) == 0) ?
355 		    AFLT_STAT_VALID : AFLT_STAT_INVALID;
356 		break;
357 	default:
358 		fmd_hdl_debug(hdl, "Niagara unrecognized l2cache error\n");
359 		return (-1);
360 	}
361 	return (0);
362 }
363 
364 
365 int
cmd_afar_valid(fmd_hdl_t * hdl,nvlist_t * nvl,cmd_errcl_t clcode,uint64_t * afar)366 cmd_afar_valid(fmd_hdl_t *hdl, nvlist_t *nvl, cmd_errcl_t clcode,
367     uint64_t *afar)
368 {
369 	uint64_t niagara_l2_afsr = 0;
370 	uint8_t stat_val;
371 
372 	/*
373 	 * In Niagara-1, we carried forward the register names afsr and afar
374 	 * in ereports from sun4u, even though the hardware registers were
375 	 * named esr and ear respectively.  In Niagara-2 we decided to conform
376 	 * to the hardware names.
377 	 */
378 
379 	if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_AFSR,
380 	    &niagara_l2_afsr) != 0 &&
381 	    nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_ESR,
382 	    &niagara_l2_afsr) != 0)
383 		return (-1);
384 
385 	if (cmd_afsr_check(hdl, niagara_l2_afsr, clcode, &stat_val) != 0)
386 		return (-1);
387 
388 	if (stat_val == AFLT_STAT_VALID) {
389 		if (nvlist_lookup_uint64(nvl,
390 		    FM_EREPORT_PAYLOAD_NAME_L2_AFAR, afar) == 0 ||
391 		    nvlist_lookup_uint64(nvl,
392 		    FM_EREPORT_PAYLOAD_NAME_L2_EAR, afar) == 0)
393 			return (0);
394 	}
395 	return (-1);
396 }
397 
398 /*
399  * sun4v cmd_cpu_get_frustr expects a 'cpufru' element in 'detector' FMRI
400  * of ereport (which is stored as 'asru' of cmd_cpu_t).  For early sun4v,
401  * this was mistakenly spec'ed as "hc://MB" instead of "hc:///component=MB",
402  * so this situation must be remediated when found.
403  */
404 
405 char *
cmd_cpu_getfrustr(fmd_hdl_t * hdl,cmd_cpu_t * cp)406 cmd_cpu_getfrustr(fmd_hdl_t *hdl, cmd_cpu_t *cp)
407 {
408 	char *frustr;
409 	nvlist_t *asru = cp->cpu_asru_nvl;
410 
411 	if (nvlist_lookup_string(asru, FM_FMRI_CPU_CPUFRU, &frustr) == 0) {
412 		fmd_hdl_debug(hdl, "cmd_cpu_getfrustr: cpufru=%s\n", frustr);
413 		if (strncmp(frustr, CPU_FRU_FMRI,
414 		    sizeof (CPU_FRU_FMRI) -1) == 0)
415 			return (fmd_hdl_strdup(hdl, frustr, FMD_SLEEP));
416 		else {
417 			char *s1, *s2;
418 			size_t frustrlen;
419 
420 			s2 = strstr(frustr, "MB");
421 			if ((s2 == NULL) || strcmp(s2, EMPTY_STR) == 0) {
422 				fmd_hdl_debug(hdl,
423 				    "cmd_cpu_getfrustr: no cpufru");
424 				return (NULL);
425 			}
426 			frustrlen = strlen(s2) + sizeof (CPU_FRU_FMRI);
427 			s1 = fmd_hdl_alloc(hdl, frustrlen, FMD_SLEEP);
428 			s1 = strcpy(s1, CPU_FRU_FMRI);
429 			s1 = strcat(s1, s2);
430 			fmd_hdl_debug(hdl, "cmd_cpu_getfrustr frustr=%s\n", s1);
431 			return (s1);
432 		}
433 	}
434 	(void) cmd_set_errno(ENOENT);
435 	return (NULL);
436 }
437 
438 char *
cmd_cpu_getpartstr(fmd_hdl_t * hdl,cmd_cpu_t * cp)439 cmd_cpu_getpartstr(fmd_hdl_t *hdl, cmd_cpu_t *cp) {
440 	char *partstr;
441 	nvlist_t *asru = cp->cpu_asru_nvl;
442 
443 	if (nvlist_lookup_string(asru, FM_FMRI_HC_PART, &partstr) == 0)
444 		return (fmd_hdl_strdup(hdl, partstr, FMD_SLEEP));
445 	else
446 		return (NULL);
447 }
448 
449 char *
cmd_cpu_getserialstr(fmd_hdl_t * hdl,cmd_cpu_t * cp)450 cmd_cpu_getserialstr(fmd_hdl_t *hdl, cmd_cpu_t *cp) {
451 	char *serialstr;
452 	nvlist_t *asru = cp->cpu_asru_nvl;
453 
454 	if (nvlist_lookup_string(asru, FM_FMRI_HC_SERIAL_ID, &serialstr) == 0)
455 		return (fmd_hdl_strdup(hdl, serialstr, FMD_SLEEP));
456 	else
457 		return (NULL);
458 }
459 
460 nvlist_t *
cmd_cpu_mkfru(fmd_hdl_t * hdl,char * frustr,char * serialstr,char * partstr)461 cmd_cpu_mkfru(fmd_hdl_t *hdl, char *frustr, char *serialstr, char *partstr)
462 {
463 
464 	nvlist_t *fru;
465 	if (strncmp(frustr, CPU_FRU_FMRI, sizeof (CPU_FRU_FMRI) - 1) != 0)
466 		return (NULL);
467 	fru = cmd_mkboard_fru(hdl, frustr, serialstr, partstr);
468 	return (fru);
469 }
470