1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #pragma ident "%Z%%M% %I% %E% SMI"
27
28 /*
29 * Support routines for managing per-CPU state.
30 */
31
32 #include <cmd_cpu.h>
33 #include <cmd_mem.h>
34 #include <cmd.h>
35
36 #include <stdio.h>
37 #include <string.h>
38 #include <strings.h>
39 #include <errno.h>
40 #include <kstat.h>
41 #include <fm/fmd_api.h>
42 #include <sys/async.h>
43 #include <sys/fm/protocol.h>
44 #include <sys/fm/cpu/UltraSPARC-T1.h>
45 #include <sys/niagararegs.h>
46 #include <cmd_hc_sun4v.h>
47
48 int cmd_afsr_check(fmd_hdl_t *, uint64_t, cmd_errcl_t, uint8_t *);
49
50 const errdata_t l3errdata =
51 { &cmd.cmd_l3data_serd, "l3cachedata", CMD_PTR_CPU_L3DATA };
52 const errdata_t n1l2errdata =
53 { &cmd.cmd_l2data_serd, "l2cachedata", CMD_PTR_CPU_L2DATA };
54 const errdata_t n2ce_l2errdata =
55 { &cmd.cmd_l2data_serd, "l2data-c", CMD_PTR_CPU_L2DATA };
56 const errdata_t n2ue_l2errdata =
57 { &cmd.cmd_l2data_serd, "l2data-u", CMD_PTR_CPU_L2DATA };
58 const errdata_t miscregsdata =
59 { &cmd.cmd_miscregs_serd, "misc_reg", CMD_PTR_CPU_MISC_REGS };
60 const errdata_t dcachedata =
61 { &cmd.cmd_dcache_serd, "dcache", CMD_PTR_CPU_DCACHE };
62 const errdata_t icachedata =
63 { &cmd.cmd_icache_serd, "icache", CMD_PTR_CPU_ICACHE };
64
65 static int
cmd_xr_error_type(cmd_errcl_t clcode)66 cmd_xr_error_type(cmd_errcl_t clcode)
67 {
68 if (CMD_ERRCL_ISMISCREGS(clcode))
69 return (MISCREGS_ERR);
70 else if (CMD_ERRCL_ISL2XXCU(clcode))
71 return (L2_ERR);
72 else if (CMD_ERRCL_ISL2ND(clcode))
73 return (L2ND_ERR);
74 else if (CMD_ERRCL_ISMEM(clcode))
75 return (MEM_ERR);
76 else if (CMD_ERRCL_ISDCDP(clcode))
77 return (DCDP_ERR);
78 else if (CMD_ERRCL_ISICDP(clcode))
79 return (ICDP_ERR);
80 else if (CMD_ERRCL_REMOTEL2(clcode))
81 return (REMOTE_L2ERR);
82 else
83 return (UNKNOWN_ERR);
84 }
85
86 void
cmd_fill_errdata(cmd_errcl_t clcode,cmd_cpu_t * cpu,cmd_case_t ** cc,const errdata_t ** ed)87 cmd_fill_errdata(cmd_errcl_t clcode, cmd_cpu_t *cpu, cmd_case_t **cc,
88 const errdata_t **ed)
89 {
90 int err_type;
91
92 err_type = cmd_xr_error_type(clcode);
93 switch (err_type) {
94 case MISCREGS_ERR:
95 *ed = &miscregsdata;
96 *cc = &cpu->cpu_misc_regs;
97 break;
98 case L2_ERR:
99 case REMOTE_L2ERR:
100 if (cpu->cpu_type == CPU_ULTRASPARC_T1) {
101 *ed = &n1l2errdata;
102 *cc = &cpu->cpu_l2data;
103 } else {
104 if (CMD_ERRCL_ISL2CE(clcode)) {
105 *ed = &n2ce_l2errdata;
106 *cc = &cpu->cpu_l2data;
107 } else {
108 *ed = &n2ue_l2errdata;
109 *cc = &cpu->cpu_l2data;
110 }
111 }
112 break;
113 case DCDP_ERR:
114 *ed = &dcachedata;
115 *cc = &cpu->cpu_dcache;
116 break;
117 case ICDP_ERR:
118 *ed = &icachedata;
119 *cc = &cpu->cpu_icache;
120 break;
121 /*
122 * When an error goes through the train, it requires
123 * to have cmd_case_t & errdata_t structures even it is not
124 * diagnosed when the error is resolved. Sun4v does
125 * does not have a L3 error, but the L3 cpu case was defined,
126 * so its data structures are used for the default cases.
127 */
128 default:
129 *ed = &l3errdata;
130 *cc = &cpu->cpu_l3data;
131 break;
132 }
133 }
134
135 int
cmd_afar_status_check(uint8_t afar_status,cmd_errcl_t clcode)136 cmd_afar_status_check(uint8_t afar_status, cmd_errcl_t clcode)
137 {
138
139 /*
140 * There is no L2 data for a remote write back
141 * cache error in the ereport, so skip the status check
142 */
143 if (clcode == CMD_ERRCL_WBUE)
144 return (0);
145
146 if (afar_status == AFLT_STAT_VALID)
147 return (0);
148 return (-1);
149 }
150
151 /*
152 * Search for the entry that matches the ena and the AFAR
153 * if we have a valid AFAR, otherwise search for the entry
154 * that its's ena is < delta ENA.
155 */
156 /*ARGSUSED*/
157 cmd_xxcu_trw_t *
cmd_trw_lookup(uint64_t ena,uint8_t afar_status,uint64_t afar)158 cmd_trw_lookup(uint64_t ena, uint8_t afar_status, uint64_t afar)
159 {
160 int i;
161
162 if (afar_status == AFLT_STAT_VALID) {
163 for (i = 0; i < cmd.cmd_xxcu_ntrw; i++) {
164 if (cmd.cmd_xxcu_trw[i].trw_ena != 0) {
165 if ((llabs(ena - cmd.cmd_xxcu_trw[i].trw_ena) <
166 cmd.cmd_delta_ena) &&
167 (cmd.cmd_xxcu_trw[i].trw_afar == afar))
168 return (&cmd.cmd_xxcu_trw[i]);
169 }
170 }
171 }
172
173 for (i = 0; i < cmd.cmd_xxcu_ntrw; i++) {
174 if (cmd.cmd_xxcu_trw[i].trw_ena != 0) {
175 if (llabs(ena - cmd.cmd_xxcu_trw[i].trw_ena)
176 < cmd.cmd_delta_ena)
177 return (&cmd.cmd_xxcu_trw[i]);
178 }
179 }
180
181 return (NULL);
182 }
183
184 cmd_errcl_t
cmd_get_nextbit(cmd_errcl_t trw_mask)185 cmd_get_nextbit(cmd_errcl_t trw_mask)
186 {
187 cmd_errcl_t tmp_mask = 0;
188 cmd_errcl_t tmp;
189 int i;
190
191 for (i = 0; i < 64; i++) {
192 tmp = (0x0000000000000001ULL << i);
193 if (tmp & trw_mask) {
194 tmp_mask = tmp;
195 break;
196 }
197 }
198 return (tmp_mask);
199 }
200
201 /*
202 * For a resolved error, its error code will be paired with
203 * each error code in the train mask and compared against the
204 * pre-defined trains in the cmd_cpu.c to determine if the error
205 * is in the train.
206 */
207 cmd_errcl_t
cmd_combine_two_train(cmd_errcl_t trw_mask,cmd_errcl_t resolved_err)208 cmd_combine_two_train(cmd_errcl_t trw_mask, cmd_errcl_t resolved_err)
209 {
210 cmd_errcl_t tmp_mask = 0;
211 cmd_errcl_t train_mask = 0;
212 cmd_errcl_t cause = 0;
213 cmd_errcl_t error_mask = trw_mask ^ resolved_err;
214
215 while (error_mask) {
216 tmp_mask = cmd_get_nextbit(error_mask);
217 if (tmp_mask == 0)
218 break;
219 train_mask = tmp_mask | resolved_err;
220 cause = cmd_xxcu_train_match(train_mask);
221 if (cause) {
222 return (cause);
223 }
224 error_mask = error_mask ^ tmp_mask;
225 }
226 return (0);
227 }
228
229 cmd_errcl_t
cmd_train_match(cmd_errcl_t trw_mask,cmd_errcl_t resolved_err)230 cmd_train_match(cmd_errcl_t trw_mask, cmd_errcl_t resolved_err)
231 {
232 return (cmd_combine_two_train(trw_mask, resolved_err));
233 }
234
235 int
cmd_xr_fill(fmd_hdl_t * hdl,nvlist_t * nvl,cmd_xr_t * xr,cmd_errcl_t clcode)236 cmd_xr_fill(fmd_hdl_t *hdl, nvlist_t *nvl, cmd_xr_t *xr, cmd_errcl_t clcode)
237 {
238 uint64_t niagara_l2_afsr = 0;
239 int errtype;
240
241 errtype = cmd_xr_error_type(clcode);
242 /*
243 * skip the fill data for the errors which is not L2 errors.
244 */
245 if (errtype != L2_ERR) {
246 fmd_hdl_debug(hdl, "Skip fill L2 data for errtype %d\n",
247 errtype);
248 return (0);
249 }
250
251 if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_AFSR,
252 &niagara_l2_afsr) != 0 &&
253 nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_ESR,
254 &niagara_l2_afsr) != 0) {
255 fmd_hdl_debug(hdl, "No L2 AFSR data");
256 return (-1);
257 }
258 if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_AFAR,
259 &xr->xr_afar) != 0 &&
260 nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_EAR,
261 &xr->xr_afar) != 0) {
262 fmd_hdl_debug(hdl, "No L2 AFAR data");
263 return (-1);
264 }
265 if (nvlist_lookup_uint32(nvl, FM_EREPORT_PAYLOAD_NAME_L2_SYND,
266 &xr->xr_synd) != 0) {
267 /* Niagara-2 doesn't provide separate (redundant) l2-synd */
268 xr->xr_synd = niagara_l2_afsr & NI2_L2AFSR_SYND;
269 }
270
271 if (cmd_afsr_check(hdl, niagara_l2_afsr, clcode,
272 &xr->xr_synd_status) != 0) {
273 fmd_hdl_debug(hdl, "Invalid L2 syndrome");
274 return (-1);
275 }
276
277 xr->xr_afar_status = xr->xr_synd_status;
278 return (0);
279 }
280
281 int
cmd_cpu_synd_check(uint32_t synd,cmd_errcl_t clcode)282 cmd_cpu_synd_check(uint32_t synd, cmd_errcl_t clcode)
283 {
284 int i;
285
286 /*
287 * Niagara L2 fetches from a memory location containing a UE
288 * are given a poison syndrome in one or more 7 bit subsyndromes
289 * each covering one of 4 4 byte checkwords.
290 *
291 * 0 is an invalid syndrome because it denotes no error, but
292 * is associated with an ereport -- meaning there WAS an error.
293 */
294 /*
295 * HW does not store the syndrome value for write-back cache
296 * error, so skip the synd check for L2 write-back error
297 */
298 if (CMD_ERRCL_L2UE_WRITEBACK(clcode))
299 return (0);
300
301 if (synd == 0)
302 return (-1);
303
304 for (i = 0; i < 4; i++) {
305 if (((synd >> i*NI_L2_POISON_SYND_SIZE) &
306 NI_L2_POISON_SYND_MASK) == NI_L2_POISON_SYND_FROM_DAU)
307 return (-1);
308 }
309 return (0);
310 }
311
312 int
cmd_afsr_check(fmd_hdl_t * hdl,uint64_t afsr,cmd_errcl_t clcode,uint8_t * stat_val)313 cmd_afsr_check(fmd_hdl_t *hdl, uint64_t afsr,
314 cmd_errcl_t clcode, uint8_t *stat_val)
315 {
316 /*
317 * Set Niagara afar and synd validity.
318 * For a given set of error registers, the payload value is valid iff
319 * no higher priority error status bit is set. See niagararegs.h
320 * for error status bit values and priority settings.
321 */
322 switch (clcode) {
323 case CMD_ERRCL_LDAU:
324 case CMD_ERRCL_LDSU:
325 case CMD_ERRCL_DL2U:
326 case CMD_ERRCL_IL2U:
327 *stat_val =
328 ((afsr & NI_L2AFSR_P02) == 0) ?
329 AFLT_STAT_VALID: AFLT_STAT_INVALID;
330 break;
331 case CMD_ERRCL_LDWU:
332 *stat_val =
333 ((afsr & NI_L2AFSR_P03) == 0) ?
334 AFLT_STAT_VALID : AFLT_STAT_INVALID;
335 break;
336 case CMD_ERRCL_LDRU:
337 *stat_val =
338 ((afsr & NI_L2AFSR_P04) == 0) ?
339 AFLT_STAT_VALID : AFLT_STAT_INVALID;
340 break;
341 case CMD_ERRCL_LDAC:
342 case CMD_ERRCL_LDSC:
343 *stat_val =
344 ((afsr & NI_L2AFSR_P08) == 0) ?
345 AFLT_STAT_VALID : AFLT_STAT_INVALID;
346 break;
347 case CMD_ERRCL_LDWC:
348 *stat_val =
349 ((afsr & NI_L2AFSR_P09) == 0) ?
350 AFLT_STAT_VALID : AFLT_STAT_INVALID;
351 break;
352 case CMD_ERRCL_LDRC:
353 *stat_val =
354 ((afsr & NI_L2AFSR_P10) == 0) ?
355 AFLT_STAT_VALID : AFLT_STAT_INVALID;
356 break;
357 default:
358 fmd_hdl_debug(hdl, "Niagara unrecognized l2cache error\n");
359 return (-1);
360 }
361 return (0);
362 }
363
364
365 int
cmd_afar_valid(fmd_hdl_t * hdl,nvlist_t * nvl,cmd_errcl_t clcode,uint64_t * afar)366 cmd_afar_valid(fmd_hdl_t *hdl, nvlist_t *nvl, cmd_errcl_t clcode,
367 uint64_t *afar)
368 {
369 uint64_t niagara_l2_afsr = 0;
370 uint8_t stat_val;
371
372 /*
373 * In Niagara-1, we carried forward the register names afsr and afar
374 * in ereports from sun4u, even though the hardware registers were
375 * named esr and ear respectively. In Niagara-2 we decided to conform
376 * to the hardware names.
377 */
378
379 if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_AFSR,
380 &niagara_l2_afsr) != 0 &&
381 nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_ESR,
382 &niagara_l2_afsr) != 0)
383 return (-1);
384
385 if (cmd_afsr_check(hdl, niagara_l2_afsr, clcode, &stat_val) != 0)
386 return (-1);
387
388 if (stat_val == AFLT_STAT_VALID) {
389 if (nvlist_lookup_uint64(nvl,
390 FM_EREPORT_PAYLOAD_NAME_L2_AFAR, afar) == 0 ||
391 nvlist_lookup_uint64(nvl,
392 FM_EREPORT_PAYLOAD_NAME_L2_EAR, afar) == 0)
393 return (0);
394 }
395 return (-1);
396 }
397
398 /*
399 * sun4v cmd_cpu_get_frustr expects a 'cpufru' element in 'detector' FMRI
400 * of ereport (which is stored as 'asru' of cmd_cpu_t). For early sun4v,
401 * this was mistakenly spec'ed as "hc://MB" instead of "hc:///component=MB",
402 * so this situation must be remediated when found.
403 */
404
405 char *
cmd_cpu_getfrustr(fmd_hdl_t * hdl,cmd_cpu_t * cp)406 cmd_cpu_getfrustr(fmd_hdl_t *hdl, cmd_cpu_t *cp)
407 {
408 char *frustr;
409 nvlist_t *asru = cp->cpu_asru_nvl;
410
411 if (nvlist_lookup_string(asru, FM_FMRI_CPU_CPUFRU, &frustr) == 0) {
412 fmd_hdl_debug(hdl, "cmd_cpu_getfrustr: cpufru=%s\n", frustr);
413 if (strncmp(frustr, CPU_FRU_FMRI,
414 sizeof (CPU_FRU_FMRI) -1) == 0)
415 return (fmd_hdl_strdup(hdl, frustr, FMD_SLEEP));
416 else {
417 char *s1, *s2;
418 size_t frustrlen;
419
420 s2 = strstr(frustr, "MB");
421 if ((s2 == NULL) || strcmp(s2, EMPTY_STR) == 0) {
422 fmd_hdl_debug(hdl,
423 "cmd_cpu_getfrustr: no cpufru");
424 return (NULL);
425 }
426 frustrlen = strlen(s2) + sizeof (CPU_FRU_FMRI);
427 s1 = fmd_hdl_alloc(hdl, frustrlen, FMD_SLEEP);
428 s1 = strcpy(s1, CPU_FRU_FMRI);
429 s1 = strcat(s1, s2);
430 fmd_hdl_debug(hdl, "cmd_cpu_getfrustr frustr=%s\n", s1);
431 return (s1);
432 }
433 }
434 (void) cmd_set_errno(ENOENT);
435 return (NULL);
436 }
437
438 char *
cmd_cpu_getpartstr(fmd_hdl_t * hdl,cmd_cpu_t * cp)439 cmd_cpu_getpartstr(fmd_hdl_t *hdl, cmd_cpu_t *cp) {
440 char *partstr;
441 nvlist_t *asru = cp->cpu_asru_nvl;
442
443 if (nvlist_lookup_string(asru, FM_FMRI_HC_PART, &partstr) == 0)
444 return (fmd_hdl_strdup(hdl, partstr, FMD_SLEEP));
445 else
446 return (NULL);
447 }
448
449 char *
cmd_cpu_getserialstr(fmd_hdl_t * hdl,cmd_cpu_t * cp)450 cmd_cpu_getserialstr(fmd_hdl_t *hdl, cmd_cpu_t *cp) {
451 char *serialstr;
452 nvlist_t *asru = cp->cpu_asru_nvl;
453
454 if (nvlist_lookup_string(asru, FM_FMRI_HC_SERIAL_ID, &serialstr) == 0)
455 return (fmd_hdl_strdup(hdl, serialstr, FMD_SLEEP));
456 else
457 return (NULL);
458 }
459
460 nvlist_t *
cmd_cpu_mkfru(fmd_hdl_t * hdl,char * frustr,char * serialstr,char * partstr)461 cmd_cpu_mkfru(fmd_hdl_t *hdl, char *frustr, char *serialstr, char *partstr)
462 {
463
464 nvlist_t *fru;
465 if (strncmp(frustr, CPU_FRU_FMRI, sizeof (CPU_FRU_FMRI) - 1) != 0)
466 return (NULL);
467 fru = cmd_mkboard_fru(hdl, frustr, serialstr, partstr);
468 return (fru);
469 }
470