1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #pragma ident "%Z%%M% %I% %E% SMI"
27
28 #include <sys/plat_ecc_unum.h>
29 #include <sys/utsname.h>
30 #include <sys/cmn_err.h>
31 #include <sys/async.h>
32 #include <sys/errno.h>
33 #include <sys/fm/protocol.h>
34 #include <sys/fm/cpu/UltraSPARC-III.h>
35 #include <sys/bl.h>
36 #include <sys/taskq.h>
37 #include <sys/condvar.h>
38 #include <sys/plat_ecc_dimm.h>
39
/*
 * Pointer to platform specific function to initialize a cache of DIMM
 * serial ids.  plat_ecc_capability_sc_set() calls through this pointer,
 * when it is non-NULL, the first time it is invoked with the
 * PLAT_ECC_CAPABILITY_DIMM_SID bit set.
 */
int (*p2init_sid_cache)(void);
45
46 /*
47 * This file contains the common code that is used for parsing
48 * ecc unum data and logging it appropriately as the platform
49 * that calls this code implements.
50 */
51
52 int plat_ecc_dispatch_task(plat_ecc_message_t *);
53 static void plat_ecc_send_msg(void *);
54
/*
 * Leave the enclosing switch case when the preceding strchr()/strstr()
 * search came back empty.  The macro expands to a bare `break', so it is
 * only meaningful directly inside a switch (or loop) body, and it refers
 * to a local variable named unum_ptr that must be in scope where it is
 * used.
 */
#define CHECK_UNUM \
	if (unum_ptr == NULL) { \
		break; \
	}
59
/*
 * See plat_ecc_unum.h for the meaning of these variables.
 */

/* Consulted by plat_ecc_capability_sc_get() for PLAT_ECC_ERROR_MESSAGE. */
int ecc_log_fruid_enable = ECC_FRUID_ENABLE_DEFAULT;

/*
 * Capability bitmaps.  plat_ecc_capability_sc_get() tests the FMA bit of
 * the domain map and the ERROR2/INDICT2/DIMM_SID bits of the SC map; the
 * SC map is replaced wholesale by plat_ecc_capability_sc_set().
 */
uint32_t plat_ecc_capability_map_domain = PLAT_ECC_CAPABILITY_DOMAIN_DEFAULT;
uint32_t plat_ecc_capability_map_sc = PLAT_ECC_CAPABILITY_SC_DEFAULT;

/* Mask checked by plat_log_fruid_error2() before it sends a message. */
uint16_t ecc_error2_mailbox_flags = PLAT_ECC_ERROR2_SEND_DEFAULT;

/* Mask checked by plat_log_fruid_indictment2() before it sends a message. */
uint16_t ecc_indictment2_mailbox_flags = PLAT_ECC_SEND_INDICT2_DEFAULT;
69
70 /*
71 * We log all ECC errors using the function that is defined as
72 * plat_send_ecc_mailbox_msg(); We first parse the unum string and
73 * then pass the data to be logged to the plat_send_ecc_mailbox_msg
74 * function for logging. Each platform that uses this code needs to
75 * implement a suitable function for this purpose.
76 */
77 void
plat_log_fruid_error(int synd_code,struct async_flt * ecc,char * unum,uint64_t afsr_bit)78 plat_log_fruid_error(int synd_code, struct async_flt *ecc, char *unum,
79 uint64_t afsr_bit)
80 {
81 plat_ecc_error_data_t ecc_error_data;
82 enum plat_ecc_type ecc_type = PLAT_ECC_UNKNOWN;
83 int board_num;
84 int proc_position;
85 int invalid_unum = 1;
86
87 bzero(&ecc_error_data, sizeof (plat_ecc_error_data_t));
88 ecc_error_data.version = PLAT_ECC_VERSION;
89
90 switch (afsr_bit) {
91 case C_AFSR_CE:
92 ecc_error_data.error_code = PLAT_ERROR_CODE_CE;
93 break;
94 case C_AFSR_UE:
95 ecc_error_data.error_code = PLAT_ERROR_CODE_UE;
96 break;
97 case C_AFSR_EDC:
98 ecc_error_data.error_code = PLAT_ERROR_CODE_EDC;
99 break;
100 case C_AFSR_EDU:
101 ecc_error_data.error_code = PLAT_ERROR_CODE_EDU;
102 break;
103 case C_AFSR_WDC:
104 ecc_error_data.error_code = PLAT_ERROR_CODE_WDC;
105 break;
106 case C_AFSR_WDU:
107 ecc_error_data.error_code = PLAT_ERROR_CODE_WDU;
108 break;
109 case C_AFSR_CPC:
110 ecc_error_data.error_code = PLAT_ERROR_CODE_CPC;
111 break;
112 case C_AFSR_CPU:
113 ecc_error_data.error_code = PLAT_ERROR_CODE_CPU;
114 break;
115 case C_AFSR_UCC:
116 ecc_error_data.error_code = PLAT_ERROR_CODE_UCC;
117 break;
118 case C_AFSR_UCU:
119 ecc_error_data.error_code = PLAT_ERROR_CODE_UCU;
120 break;
121 case C_AFSR_EMC:
122 ecc_error_data.error_code = PLAT_ERROR_CODE_EMC;
123 break;
124 case C_AFSR_EMU:
125 ecc_error_data.error_code = PLAT_ERROR_CODE_EMU;
126 break;
127 default:
128 /*
129 * Do not send messages with unknown error codes, since
130 * the SC will not be able to tell what type of error
131 * occurred.
132 */
133 return;
134 }
135
136 ecc_error_data.detecting_proc = ecc->flt_bus_id;
137
138 if (ecc->flt_in_memory)
139 ecc_type = PLAT_ECC_MEMORY;
140 else if (ecc->flt_status & ECC_ECACHE)
141 ecc_type = PLAT_ECC_ECACHE;
142
143 switch (ecc_type) {
144 case PLAT_ECC_MEMORY: {
145 /*
146 * The unum string is expected to be in this form:
147 * "/N0/SB12/P0/B0/D2 J13500, ..."
148 * for serengeti. As this code is shared with Starcat
149 * if N is missing then it is set to 0.
150 * From that we will extract the bank number, dimm
151 * number, and Jnumber.
152 */
153 char *unum_ptr = unum;
154 char *jno_ptr = ecc_error_data.Jnumber;
155 int i;
156
157 /*
158 * On Serengeti we expect to find 'N' in the unum string
159 * however, on Starcat 'N' does not appear in the unum string.
160 * We do not want this code to break at this point, so the
161 * unum_ptr is reset to the start of unum string if we fail
162 * to find an 'N'.
163 */
164 unum_ptr = strchr(unum_ptr, 'N');
165 if (unum_ptr == NULL) {
166 ecc_error_data.node_no = 0;
167 unum_ptr = unum;
168 } else {
169 unum_ptr++;
170 ecc_error_data.node_no = stoi(&unum_ptr);
171 }
172
173 /*
174 * Now pull out the SB number
175 */
176 unum_ptr = strstr(unum_ptr, "SB");
177 CHECK_UNUM;
178 unum_ptr += 2;
179 board_num = stoi(&unum_ptr);
180
181 /*
182 * Now pull out the Proc position (relative to the board)
183 */
184 unum_ptr = strchr(unum_ptr, 'P');
185 CHECK_UNUM;
186 unum_ptr++;
187 proc_position = stoi(&unum_ptr);
188
189 /*
190 * Using the SB number and Proc position we create a FRU
191 * cpu id.
192 */
193 ecc_error_data.proc_num =
194 plat_make_fru_cpuid(board_num, 0, proc_position);
195
196 /*
197 * Now pull out the Memory Bank number
198 */
199 unum_ptr = strchr(unum_ptr, 'B');
200 CHECK_UNUM;
201 unum_ptr++;
202 ecc_error_data.bank_no = (stoi(&unum_ptr) & 0x01);
203
204 /*
205 * Now pull out the Dimm number within the Memory Bank.
206 */
207 unum_ptr = strchr(unum_ptr, 'D');
208 CHECK_UNUM;
209 unum_ptr++;
210 ecc_error_data.ecache_dimm_no = (stoi(&unum_ptr) & 0x03);
211
212 /*
213 * Now pull out the J-number.
214 */
215 unum_ptr = strchr(unum_ptr, 'J');
216 CHECK_UNUM;
217 unum_ptr++;
218 for (i = PLAT_ECC_JNUMBER_LENGTH;
219 i > 0 && *unum_ptr >= '0' && *unum_ptr <= '9'; i--)
220 *jno_ptr++ = *unum_ptr++;
221 *jno_ptr = NULL;
222
223 /*
224 * If we get here, we can assume the unum is valid
225 */
226 invalid_unum = 0;
227 break;
228 }
229 case PLAT_ECC_ECACHE: {
230 /*
231 * The unum string is expected to be in this form:
232 * "[/N0/][SB|IO]12/P0/E0 J13500, ..."
233 * for serengeti. As this code is shared with Starcat
234 * if N is missing then it is set to 0. IO may only appear
235 * on Starcats. From that we will extract the bank number,
236 * dimm number, and Jnumber.
237 */
238 char *unum_ptr = unum;
239 char *jno_ptr = ecc_error_data.Jnumber;
240 int is_maxcat = 0;
241 int i;
242
243 /*
244 * On Serengeti we expect to find 'N' in the unum string
245 * however, on Starcat 'N' does not appear in the unum string.
246 * We do not want this code to break at this point, so the
247 * unum_ptr is reset to the start of unum string if we fail
248 * to find an 'N'.
249 */
250 unum_ptr = strchr(unum_ptr, 'N');
251 if (unum_ptr == NULL) {
252 ecc_error_data.node_no = 0;
253 unum_ptr = unum;
254 } else {
255 unum_ptr++;
256 ecc_error_data.node_no = stoi(&unum_ptr);
257 }
258
259 /*
260 * Now pull out the SB/IO number
261 */
262 unum_ptr = strstr(unum_ptr, "SB");
263 if (unum_ptr == NULL) {
264
265 /*
266 * Since this is an E$ error, it must have occurred on
267 * either a System Board (represented by "SB" in the
268 * unum string) or a Maxcat board ("IO" in the unum
269 * string). Since we failed the "SB" check, we'll
270 * assume this is a maxcat board.
271 */
272 is_maxcat = 1;
273 unum_ptr = strstr(unum, "IO");
274 }
275 CHECK_UNUM;
276 unum_ptr += 2;
277 board_num = stoi(&unum_ptr);
278
279 /*
280 * Now pull out the Proc position (relative to the board)
281 */
282 unum_ptr = strchr(unum_ptr, 'P');
283 CHECK_UNUM;
284 unum_ptr++;
285 proc_position = stoi(&unum_ptr);
286
287 /*
288 * Using the SB/IO number, slot 0/1 value (is_maxcat), and
289 * proc position, we create the cpu id.
290 */
291 ecc_error_data.proc_num = plat_make_fru_cpuid(board_num,
292 is_maxcat, proc_position);
293
294 ecc_error_data.bank_no = 0; /* not used */
295
296 unum_ptr = strchr(unum_ptr, 'E');
297 CHECK_UNUM;
298 unum_ptr++;
299 ecc_error_data.ecache_dimm_no = (stoi(&unum_ptr) & 0x01);
300
301 unum_ptr = strchr(unum_ptr, 'J');
302 CHECK_UNUM;
303 unum_ptr++;
304 for (i = PLAT_ECC_JNUMBER_LENGTH;
305 i > 0 && *unum_ptr >= '0' && *unum_ptr <= '9'; i--)
306 *jno_ptr++ = *unum_ptr++;
307 *jno_ptr = NULL;
308
309 /*
310 * If we get here, we can assume the unum is valid
311 */
312 invalid_unum = 0;
313 break;
314 }
315 default:
316 /*
317 * Unknown error
318 */
319 break;
320 }
321
322 /*
323 * This is where CHECK_UNUM goes when it finds an error
324 */
325
326 if (ECC_SYND_DATA_BEGIN <= synd_code &&
327 synd_code < ECC_SYND_ECC_BEGIN) {
328 ecc_error_data.error_type = PLAT_ERROR_TYPE_SINGLE;
329 ecc_error_data.databit_type = PLAT_BIT_TYPE_DATA;
330 ecc_error_data.databit_no = synd_code;
331 } else if (ECC_SYND_ECC_BEGIN <= synd_code &&
332 synd_code < ECC_SYND_MTAG_BEGIN) {
333 ecc_error_data.error_type = PLAT_ERROR_TYPE_SINGLE;
334 ecc_error_data.databit_type = PLAT_BIT_TYPE_ECC;
335 ecc_error_data.databit_no = synd_code - ECC_SYND_ECC_BEGIN;
336 } else if (ECC_SYND_MTAG_BEGIN <= synd_code &&
337 synd_code < ECC_SYND_MECC_BEGIN) {
338 ecc_error_data.error_type = PLAT_ERROR_TYPE_SINGLE;
339 ecc_error_data.databit_type = PLAT_BIT_TYPE_MTAG_D;
340 ecc_error_data.databit_no = synd_code - ECC_SYND_MTAG_BEGIN;
341 } else if (ECC_SYND_MECC_BEGIN <= synd_code &&
342 synd_code < ECC_SYND_M2) {
343 ecc_error_data.error_type = PLAT_ERROR_TYPE_SINGLE;
344 ecc_error_data.databit_type = PLAT_BIT_TYPE_MTAG_E;
345 ecc_error_data.databit_no = synd_code - ECC_SYND_MECC_BEGIN;
346 } else {
347 switch (synd_code) {
348 case ECC_SYND_M2:
349 ecc_error_data.error_type = PLAT_ERROR_TYPE_M2;
350 break;
351 case ECC_SYND_M3:
352 ecc_error_data.error_type = PLAT_ERROR_TYPE_M3;
353 break;
354 case ECC_SYND_M4:
355 ecc_error_data.error_type = PLAT_ERROR_TYPE_M4;
356 break;
357 case ECC_SYND_M:
358 ecc_error_data.error_type = PLAT_ERROR_TYPE_M;
359 break;
360 default:
361 ecc_error_data.error_type = PLAT_ERROR_TYPE_UNK;
362 break;
363 }
364 ecc_error_data.databit_type = PLAT_BIT_TYPE_MULTI;
365 ecc_error_data.databit_no = 0; /* not used */
366 }
367
368 #ifdef DEBUG
369 if (invalid_unum &&
370 (ecc_error_data.error_code != PLAT_ERROR_CODE_UE) &&
371 unum && *unum)
372 cmn_err(CE_WARN, "Unexpected unum string format: %s\n", unum);
373 #endif
374
375 /*
376 * Send this data off as a mailbox message to the SC.
377 */
378 (void) plat_send_ecc_mailbox_msg(PLAT_ECC_ERROR_MESSAGE,
379 &ecc_error_data);
380 }
381
/*
 * The unum string for memory is expected to be in this form:
 *	"[/N0/]SB12/P0/B0/D2 [J13500]"
 * Or if the unum was generated as the result of a UE:
 *	"[/N0/]SB12/P0/B0 [J13500, ...]"
 * From that we will extract the board number, processor position,
 * bank number, DIMM number and jnumber.
 *
 * Returns 1 for an invalid unum string (including a NULL unum) and 0
 * otherwise.  Outputs are stored as they are parsed, so on a return of 1
 * only the leading outputs may have been written.  On a return of 0:
 *
 *   - if no 'D' follows the bank number (a bank unum), *dimm is -1 and
 *     *jnumber is 0;
 *   - if the unum is for an individual DIMM and there is no jnumber,
 *     *jnumber is -1 and the caller can decide if the unum is valid.
 *     This is because Serengeti does not have jnumbers for bank unums
 *     which may be used to create DIMM unums (e.g. for acquiring DIMM
 *     serial ids).
 *
 * The board number is truncated to 8 bits and the jnumber to 16 bits.
 */

int
parse_unum_memory(char *unum, int *board, int *pos, int *bank, int *dimm,
    int *jnumber)
{
	char *c;

	if (unum == NULL)
		return (1);

	if ((c = strstr(unum, "SB")) == NULL)
		return (1);
	c += 2;
	*board = (uint8_t)stoi(&c);

	/* "/P<n>" must follow the board number immediately. */
	if (*c++ != '/' || *c++ != 'P')
		return (1);
	*pos = stoi(&c);

	/* "/B<n>" must follow the processor position immediately. */
	if (*c++ != '/' || *c++ != 'B')
		return (1);
	*bank = stoi(&c);

	/* No DIMM component: this is a bank unum. */
	if ((c = strchr(c, 'D')) == NULL) {
		*dimm = -1;
		*jnumber = 0;
		return (0);
	}
	c++;
	*dimm = stoi(&c);

	/* DIMM unum without a jnumber: let the caller decide. */
	if ((c = strchr(c, 'J')) == NULL) {
		*jnumber = -1;
		return (0);
	}

	c++;
	*jnumber = (uint16_t)stoi(&c);

	return (0);
}
435
/*
 * The unum string for ecache is expected to be in this form:
 *	"[/N0/][SB|IO]12/P0/E0 J13500, ..."
 * From that we will extract the board number, processor position and
 * jnumber.  *maxcat is set to 1 when the board is named by "IO" (a
 * Maxcat board) and to 0 when it is named by "SB".
 *
 * Returns 1 for any invalid unum string (including a NULL unum), 0
 * otherwise.  Outputs are stored as they are parsed, so on a return of 1
 * only the leading outputs may have been written.  The board number is
 * truncated to 8 bits and the jnumber to 16 bits.
 */
static int
parse_unum_ecache(char *unum, int *board, int *pos, int *jnumber, int *maxcat)
{
	char *c;

	if (unum == NULL)
		return (1);

	if ((c = strstr(unum, "SB")) != NULL) {
		*maxcat = 0;
	} else {
		/*
		 * Since this is an E$ error, it must have occurred on
		 * either a System Board (represented by "SB" in the
		 * unum string) or a Maxcat board ("IO" in the unum
		 * string).
		 */
		if ((c = strstr(unum, "IO")) == NULL)
			return (1);
		*maxcat = 1;
	}

	c += 2;
	*board = (uint8_t)stoi(&c);

	/* "/P<n>" must follow the board number immediately. */
	if (*c++ != '/' || *c++ != 'P')
		return (1);
	*pos = stoi(&c);

	if ((c = strchr(c, 'J')) == NULL)
		return (1);

	c++;
	*jnumber = (uint16_t)stoi(&c);

	return (0);
}
476
/*
 * The following array maps the error to its corresponding set.
 *
 * It is indexed by the msg_type passed to plat_log_fruid_error2(); the
 * selected entry is ANDed with ecc_error2_mailbox_flags and the message
 * is sent only when the result is non-zero.  The trailing comments give
 * the index of each entry.
 */
static int plat_ecc_e2d_map[PLAT_ECC_ERROR2_NUMVALS] = {
	PLAT_ECC_ERROR2_NONE, /* 0x00 */
	PLAT_ECC_ERROR2_SEND_L2_XXC, /* 0x01 */
	PLAT_ECC_ERROR2_SEND_L2_XXU, /* 0x02 */
	PLAT_ECC_ERROR2_SEND_L3_XXC, /* 0x03 */
	PLAT_ECC_ERROR2_SEND_L3_XXU, /* 0x04 */
	PLAT_ECC_ERROR2_SEND_MEM_ERRS, /* 0x05 */
	PLAT_ECC_ERROR2_SEND_MEM_ERRS, /* 0x06 */
	PLAT_ECC_ERROR2_SEND_MEM_ERRS, /* 0x07 */
	PLAT_ECC_ERROR2_SEND_BUS_ERRS, /* 0x08 */
	PLAT_ECC_ERROR2_SEND_BUS_ERRS, /* 0x09 */
	PLAT_ECC_ERROR2_SEND_BUS_ERRS, /* 0x0a */
	PLAT_ECC_ERROR2_SEND_BUS_ERRS, /* 0x0b */
	PLAT_ECC_ERROR2_SEND_L2_TAG_ERRS, /* 0x0c */
	PLAT_ECC_ERROR2_SEND_L2_TAG_ERRS, /* 0x0d */
	PLAT_ECC_ERROR2_SEND_L3_TAG_ERRS, /* 0x0e */
	PLAT_ECC_ERROR2_SEND_L3_TAG_ERRS, /* 0x0f */
	PLAT_ECC_ERROR2_SEND_L1_PARITY, /* 0x10 */
	PLAT_ECC_ERROR2_SEND_L1_PARITY, /* 0x11 */
	PLAT_ECC_ERROR2_SEND_TLB_PARITY, /* 0x12 */
	PLAT_ECC_ERROR2_SEND_TLB_PARITY, /* 0x13 */
	PLAT_ECC_ERROR2_SEND_IV_ERRS, /* 0x14 */
	PLAT_ECC_ERROR2_SEND_IV_ERRS, /* 0x15 */
	PLAT_ECC_ERROR2_SEND_MTAG_XXC, /* 0x16 */
	PLAT_ECC_ERROR2_SEND_IV_MTAG_XXC, /* 0x17 */
	PLAT_ECC_ERROR2_SEND_L3_XXC, /* 0x18 */
	PLAT_ECC_ERROR2_SEND_PCACHE /* 0x19 */
};
506
507 /*
508 * log enhanced error information to SC.
509 */
510 void
plat_log_fruid_error2(int msg_type,char * unum,struct async_flt * aflt,plat_ecc_ch_async_flt_t * ecc_ch_flt)511 plat_log_fruid_error2(int msg_type, char *unum, struct async_flt *aflt,
512 plat_ecc_ch_async_flt_t *ecc_ch_flt)
513 {
514 plat_ecc_error2_data_t e2d = {0};
515 int board, pos, bank, dimm, jnumber;
516 int maxcat = 0;
517 uint16_t flags;
518
519 /* Check the flags */
520 flags = plat_ecc_e2d_map[msg_type];
521 if ((ecc_error2_mailbox_flags & flags) == 0)
522 return;
523
524 /* Fill the header */
525 e2d.ee2d_major_version = PLAT_ECC_ERROR2_VERSION_MAJOR;
526 e2d.ee2d_minor_version = PLAT_ECC_ERROR2_VERSION_MINOR;
527 e2d.ee2d_msg_type = PLAT_ECC_ERROR2_MESSAGE;
528 e2d.ee2d_msg_length = sizeof (plat_ecc_error2_data_t);
529
530 /* Fill the data */
531 if (aflt->flt_in_memory) {
532 if (parse_unum_memory(unum, &board, &pos, &bank, &dimm,
533 &jnumber) || (dimm != -1 && jnumber == -1))
534 return;
535 /*
536 * Using the SB number and Proc position we create a FRU
537 * cpu id.
538 */
539 e2d.ee2d_owning_proc = plat_make_fru_cpuid(board, 0, pos);
540 e2d.ee2d_jnumber = jnumber;
541 e2d.ee2d_bank_number = bank;
542 } else if (aflt->flt_status & ECC_ECACHE) {
543 if (parse_unum_ecache(unum, &board, &pos, &jnumber, &maxcat))
544 return;
545 /*
546 * Using the SB number and Proc position we create a FRU
547 * cpu id.
548 */
549 e2d.ee2d_owning_proc = plat_make_fru_cpuid(board, maxcat, pos);
550 e2d.ee2d_jnumber = jnumber;
551 e2d.ee2d_bank_number = (uint8_t)-1;
552 } else {
553 /*
554 * L1 Cache
555 */
556 e2d.ee2d_owning_proc = aflt->flt_bus_id;
557 e2d.ee2d_jnumber = (uint16_t)-1;
558 e2d.ee2d_bank_number = (uint8_t)-1;
559 }
560
561 e2d.ee2d_type = (uint8_t)msg_type;
562 e2d.ee2d_afar_status = (uint8_t)ecc_ch_flt->ecaf_afar_status;
563 e2d.ee2d_synd_status = (uint8_t)ecc_ch_flt->ecaf_synd_status;
564 e2d.ee2d_detecting_proc = aflt->flt_bus_id;
565 e2d.ee2d_cpu_impl = cpunodes[e2d.ee2d_owning_proc].implementation;
566 e2d.ee2d_timestamp = aflt->flt_id;
567 e2d.ee2d_afsr = aflt->flt_stat;
568 e2d.ee2d_afar = aflt->flt_addr;
569
570 e2d.ee2d_sdw_afsr = ecc_ch_flt->ecaf_sdw_afsr;
571 e2d.ee2d_sdw_afar = ecc_ch_flt->ecaf_sdw_afar;
572 e2d.ee2d_afsr_ext = ecc_ch_flt->ecaf_afsr_ext;
573 e2d.ee2d_sdw_afsr_ext = ecc_ch_flt->ecaf_sdw_afsr_ext;
574
575 /* Send the message to SC */
576 (void) plat_send_ecc_mailbox_msg(PLAT_ECC_ERROR2_MESSAGE, &e2d);
577 }
578
/*
 * Indictment gating.  ecc_indictment_mailbox_disable must be
 * PLAT_ECC_INDICTMENT_OK or PLAT_ECC_INDICTMENT_SUSPECT for either
 * indictment routine to send anything, and its value is copied into the
 * outgoing message.  ecc_indictment_mailbox_flags selects which
 * indictment types plat_log_fruid_indictment() is allowed to send.
 */
uint8_t ecc_indictment_mailbox_disable = PLAT_ECC_INDICTMENT_OK;
uint8_t ecc_indictment_mailbox_flags = PLAT_ECC_SEND_DEFAULT_INDICT;
581
582 /*
583 * We log all Solaris indictments of failing hardware. We pull the system
584 * board number and jnumber out of the unum string, and calculate the cpuid
585 * from some members of the unum string. The rest of the structure is filled
586 * in through the other arguments. The data structure is then passed to
587 * plat_ecc_dispatch_task(). This function should only be loaded into memory
588 * or called on platforms that define a plat_send_ecc_mailbox_msg() function.
589 */
590 static int
plat_log_fruid_indictment(int msg_type,struct async_flt * aflt,char * unum)591 plat_log_fruid_indictment(int msg_type, struct async_flt *aflt, char *unum)
592 {
593 plat_ecc_message_t *wrapperp;
594 plat_ecc_indict_msg_contents_t *contentsp;
595 char *unum_ptr;
596 int is_maxcat = 0;
597
598 switch (ecc_indictment_mailbox_disable) {
599 case (PLAT_ECC_INDICTMENT_OK):
600 case (PLAT_ECC_INDICTMENT_SUSPECT):
601 break;
602 case (PLAT_ECC_INDICTMENT_NO_SEND):
603 default:
604 return (ECONNREFUSED);
605 }
606
607 switch (msg_type) {
608 case (PLAT_ECC_INDICT_DIMM):
609 if ((ecc_indictment_mailbox_flags &
610 PLAT_ECC_SEND_DIMM_INDICT) == 0)
611 return (ECONNREFUSED);
612 break;
613 case (PLAT_ECC_INDICT_ECACHE_CORRECTABLES):
614 if ((ecc_indictment_mailbox_flags &
615 PLAT_ECC_SEND_ECACHE_XXC_INDICT) == 0)
616 return (ECONNREFUSED);
617 break;
618 case (PLAT_ECC_INDICT_ECACHE_UNCORRECTABLE):
619 if ((ecc_indictment_mailbox_flags &
620 PLAT_ECC_SEND_ECACHE_XXU_INDICT) == 0)
621 return (ECONNREFUSED);
622 break;
623 default:
624 return (ECONNREFUSED);
625 }
626
627 /* LINTED: E_TRUE_LOGICAL_EXPR */
628 ASSERT(sizeof (plat_ecc_indictment_data_t) == PLAT_ECC_INDICT_SIZE);
629
630 wrapperp = (plat_ecc_message_t *)
631 kmem_zalloc(sizeof (plat_ecc_message_t), KM_SLEEP);
632
633 wrapperp->ecc_msg_status = PLAT_ECC_NO_MSG_ACTIVE;
634 wrapperp->ecc_msg_type = PLAT_ECC_INDICTMENT_MESSAGE;
635 wrapperp->ecc_msg_len = sizeof (plat_ecc_indictment_data_t);
636 wrapperp->ecc_msg_data = kmem_zalloc(wrapperp->ecc_msg_len, KM_SLEEP);
637
638 contentsp = &(((plat_ecc_indictment_data_t *)
639 wrapperp->ecc_msg_data)->msg_contents);
640
641 /*
642 * Find board_num, jnumber, and proc position from the unum string.
643 * Use the board number, is_maxcat, and proc position to calculate
644 * cpuid.
645 */
646 unum_ptr = strstr(unum, "SB");
647 if (unum_ptr == NULL) {
648 is_maxcat = 1;
649 unum_ptr = strstr(unum, "IO");
650 if (unum_ptr == NULL) {
651 kmem_free(wrapperp->ecc_msg_data,
652 wrapperp->ecc_msg_len);
653 kmem_free(wrapperp, sizeof (plat_ecc_message_t));
654 return (EINVAL);
655 }
656 }
657 unum_ptr += 2;
658 contentsp->board_num = (uint8_t)stoi(&unum_ptr);
659
660 unum_ptr = strchr(unum_ptr, 'P');
661 if (unum_ptr == NULL) {
662 kmem_free(wrapperp->ecc_msg_data, wrapperp->ecc_msg_len);
663 kmem_free(wrapperp, sizeof (plat_ecc_message_t));
664 return (EINVAL);
665 }
666 unum_ptr++;
667 contentsp->detecting_proc =
668 (uint16_t)plat_make_fru_cpuid(contentsp->board_num, is_maxcat,
669 stoi(&unum_ptr));
670
671 unum_ptr = strchr(unum_ptr, 'J');
672 if (unum_ptr == NULL) {
673 kmem_free(wrapperp->ecc_msg_data, wrapperp->ecc_msg_len);
674 kmem_free(wrapperp, sizeof (plat_ecc_message_t));
675 return (EINVAL);
676 }
677 unum_ptr++;
678 contentsp->jnumber = (uint16_t)stoi(&unum_ptr);
679
680 /*
681 * Fill in the rest of the data
682 */
683 contentsp->version = PLAT_ECC_INDICTMENT_VERSION;
684 contentsp->indictment_type = msg_type;
685 contentsp->indictment_uncertain = ecc_indictment_mailbox_disable;
686 contentsp->syndrome = aflt->flt_synd;
687 contentsp->afsr = aflt->flt_stat;
688 contentsp->afar = aflt->flt_addr;
689
690 /*
691 * Build the solaris_version string:
692 */
693 (void) snprintf(contentsp->solaris_version,
694 PLAT_ECC_VERSION_LENGTH, "%s %s", utsname.release, utsname.version);
695
696 /*
697 * Send the data on to the queuing function
698 */
699 return (plat_ecc_dispatch_task(wrapperp));
700 }
701
/*
 * The following array maps the indictment to its corresponding set.
 *
 * It is indexed by the msg_type passed to plat_log_fruid_indictment2();
 * the selected entry is ANDed with ecc_indictment2_mailbox_flags and the
 * indictment is refused when the result is zero.  The trailing comments
 * give the index of each entry.
 */
static int plat_ecc_i2d_map[PLAT_ECC_INDICT2_NUMVALS] = {
	PLAT_ECC_INDICT2_NONE, /* 0x00 */
	PLAT_ECC_SEND_INDICT2_L2_XXU, /* 0x01 */
	PLAT_ECC_SEND_INDICT2_L2_XXC_SERD, /* 0x02 */
	PLAT_ECC_SEND_INDICT2_L2_TAG_SERD, /* 0x03 */
	PLAT_ECC_SEND_INDICT2_L3_XXU, /* 0x04 */
	PLAT_ECC_SEND_INDICT2_L3_XXC_SERD, /* 0x05 */
	PLAT_ECC_SEND_INDICT2_L3_TAG_SERD, /* 0x06 */
	PLAT_ECC_SEND_INDICT2_L1_SERD, /* 0x07 */
	PLAT_ECC_SEND_INDICT2_L1_SERD, /* 0x08 */
	PLAT_ECC_SEND_INDICT2_TLB_SERD, /* 0x09 */
	PLAT_ECC_SEND_INDICT2_TLB_SERD, /* 0x0a */
	PLAT_ECC_SEND_INDICT2_FPU, /* 0x0b */
	PLAT_ECC_SEND_INDICT2_PCACHE_SERD /* 0x0c */
};
718
719 static int
plat_log_fruid_indictment2(int msg_type,struct async_flt * aflt,char * unum)720 plat_log_fruid_indictment2(int msg_type, struct async_flt *aflt, char *unum)
721 {
722 plat_ecc_message_t *wrapperp;
723 plat_ecc_indictment2_data_t *i2d;
724 int board, pos, jnumber;
725 int maxcat = 0;
726 uint16_t flags;
727
728 /*
729 * If the unum is null or empty, skip parsing it
730 */
731 if (unum && unum[0] != '\0') {
732 if (parse_unum_ecache(unum, &board, &pos, &jnumber, &maxcat))
733 return (EINVAL);
734 }
735
736 if ((ecc_indictment_mailbox_disable != PLAT_ECC_INDICTMENT_OK) &&
737 (ecc_indictment_mailbox_disable != PLAT_ECC_INDICTMENT_SUSPECT))
738 return (ECONNREFUSED);
739
740 /* Check the flags */
741 flags = plat_ecc_i2d_map[msg_type];
742 if ((ecc_indictment2_mailbox_flags & flags) == 0)
743 return (ECONNREFUSED);
744
745 wrapperp = (plat_ecc_message_t *)
746 kmem_zalloc(sizeof (plat_ecc_message_t), KM_SLEEP);
747
748 /* Initialize the wrapper */
749 wrapperp->ecc_msg_status = PLAT_ECC_NO_MSG_ACTIVE;
750 wrapperp->ecc_msg_type = PLAT_ECC_INDICTMENT2_MESSAGE;
751 wrapperp->ecc_msg_len = sizeof (plat_ecc_indictment2_data_t);
752 wrapperp->ecc_msg_data = kmem_zalloc(wrapperp->ecc_msg_len, KM_SLEEP);
753
754 i2d = (plat_ecc_indictment2_data_t *)wrapperp->ecc_msg_data;
755
756 /* Fill the header */
757 i2d->ei2d_major_version = PLAT_ECC_INDICT2_MAJOR_VERSION;
758 i2d->ei2d_minor_version = PLAT_ECC_INDICT2_MINOR_VERSION;
759 i2d->ei2d_msg_type = PLAT_ECC_INDICTMENT2_MESSAGE;
760 i2d->ei2d_msg_length = sizeof (plat_ecc_indictment2_data_t);
761
762 /* Fill the data */
763 if (unum && unum[0] != '\0') {
764 i2d->ei2d_arraigned_proc = plat_make_fru_cpuid(board, maxcat,
765 pos);
766 i2d->ei2d_board_num = board;
767 i2d->ei2d_jnumber = jnumber;
768 } else {
769 i2d->ei2d_arraigned_proc = aflt->flt_inst;
770 i2d->ei2d_board_num = (uint8_t)
771 plat_make_fru_boardnum(i2d->ei2d_arraigned_proc);
772 i2d->ei2d_jnumber = (uint16_t)-1;
773 }
774
775 i2d->ei2d_type = msg_type;
776 i2d->ei2d_uncertain = ecc_indictment_mailbox_disable;
777 i2d->ei2d_cpu_impl = cpunodes[i2d->ei2d_arraigned_proc].implementation;
778 i2d->ei2d_timestamp = aflt->flt_id;
779
780 /*
781 * Send the data on to the queuing function
782 */
783 return (plat_ecc_dispatch_task(wrapperp));
784 }
785
786 int
plat_ecc_capability_send(void)787 plat_ecc_capability_send(void)
788 {
789 plat_ecc_message_t *wrapperp;
790 plat_capability_data_t *cap;
791 int ver_len;
792
793 wrapperp = kmem_zalloc(sizeof (plat_ecc_message_t), KM_SLEEP);
794
795 ver_len = strlen(utsname.release) + strlen(utsname.version) + 2;
796
797 /* Initialize the wrapper */
798 wrapperp->ecc_msg_status = PLAT_ECC_NO_MSG_ACTIVE;
799 wrapperp->ecc_msg_type = PLAT_ECC_CAPABILITY_MESSAGE;
800 wrapperp->ecc_msg_len = sizeof (plat_capability_data_t) + ver_len;
801 wrapperp->ecc_msg_data = kmem_zalloc(wrapperp->ecc_msg_len, KM_SLEEP);
802
803 cap = (plat_capability_data_t *)wrapperp->ecc_msg_data;
804
805 /* Fill the header */
806 cap->capd_major_version = PLAT_ECC_CAP_VERSION_MAJOR;
807 cap->capd_minor_version = PLAT_ECC_CAP_VERSION_MINOR;
808 cap->capd_msg_type = PLAT_ECC_CAPABILITY_MESSAGE;
809 cap->capd_msg_length = wrapperp->ecc_msg_len;
810
811 /* Set the default domain capability */
812 cap->capd_capability = PLAT_ECC_CAPABILITY_DOMAIN_DEFAULT;
813
814 /*
815 * Build the solaris_version string:
816 * utsname.release + " " + utsname.version
817 */
818 (void) snprintf(cap->capd_solaris_version, ver_len, "%s %s",
819 utsname.release, utsname.version);
820
821 /*
822 * Send the data on to the queuing function
823 */
824 return (plat_ecc_dispatch_task(wrapperp));
825 }
826
827 int
plat_ecc_capability_sc_get(int type)828 plat_ecc_capability_sc_get(int type)
829 {
830 switch (type) {
831 case PLAT_ECC_ERROR_MESSAGE:
832 if (ecc_log_fruid_enable &&
833 (!(plat_ecc_capability_map_sc &
834 PLAT_ECC_CAPABILITY_ERROR2)))
835 return (1);
836 break;
837 case PLAT_ECC_ERROR2_MESSAGE:
838 if (plat_ecc_capability_map_sc &
839 PLAT_ECC_CAPABILITY_ERROR2)
840 return (1);
841 break;
842 case PLAT_ECC_INDICTMENT_MESSAGE:
843 if (!(plat_ecc_capability_map_sc &
844 PLAT_ECC_CAPABILITY_INDICT2) ||
845 !(plat_ecc_capability_map_domain &
846 PLAT_ECC_CAPABILITY_FMA))
847 return (1);
848 break;
849 case PLAT_ECC_INDICTMENT2_MESSAGE:
850 if (plat_ecc_capability_map_sc &
851 PLAT_ECC_CAPABILITY_INDICT2)
852 return (1);
853 break;
854 case PLAT_ECC_DIMM_SID_MESSAGE:
855 if (plat_ecc_capability_map_sc &
856 PLAT_ECC_CAPABILITY_DIMM_SID)
857 return (1);
858 default:
859 return (0);
860 }
861 return (0);
862 }
863
864 int plat_ecc_cap_sc_set_cnt = 0;
865
866 void
plat_ecc_capability_sc_set(uint32_t cap)867 plat_ecc_capability_sc_set(uint32_t cap)
868 {
869 plat_ecc_capability_map_sc = cap;
870
871 if (!plat_ecc_cap_sc_set_cnt && (cap & PLAT_ECC_CAPABILITY_DIMM_SID))
872 if (p2init_sid_cache)
873 p2init_sid_cache();
874
875 plat_ecc_cap_sc_set_cnt++;
876 }
877
/*
 * The following table represents mapping between the indictment1 reason
 * to its type.
 *
 * flt_name_to_msg_type() scans this table in order and returns the first
 * entry whose reason string matches.  Because each reason appears twice,
 * that lookup always yields the ..._CORRECTABLES entry; the
 * ..._UNCORRECTABLE entries are never selected by it.
 */

static plat_ecc_bl_map_t plat_ecc_bl_map_v1[] = {
	{ "l2cachedata", PLAT_ECC_INDICT_ECACHE_CORRECTABLES },
	{ "l3cachedata", PLAT_ECC_INDICT_ECACHE_CORRECTABLES },
	{ "l2cachedata", PLAT_ECC_INDICT_ECACHE_UNCORRECTABLE },
	{ "l3cachedata", PLAT_ECC_INDICT_ECACHE_UNCORRECTABLE }
};
889
/*
 * The following table represents mapping between the indictment2 reason
 * to its type.
 *
 * flt_name_to_msg_type() scans this table in order and returns the first
 * entry whose reason string matches.  "l2cachedata" and "l3cachedata"
 * each appear twice, so that lookup always yields the ..._SERD entries;
 * the ..._UE entries are never selected by it.
 */

static plat_ecc_bl_map_t plat_ecc_bl_map_v2[] = {
	{ "l2cachedata", PLAT_ECC_INDICT2_L2_SERD },
	{ "l3cachedata", PLAT_ECC_INDICT2_L3_SERD },
	{ "l2cachedata", PLAT_ECC_INDICT2_L2_UE },
	{ "l3cachedata", PLAT_ECC_INDICT2_L3_UE },
	{ "l2cachetag", PLAT_ECC_INDICT2_L2_TAG_SERD },
	{ "l3cachetag", PLAT_ECC_INDICT2_L3_TAG_SERD },
	{ "icache", PLAT_ECC_INDICT2_ICACHE_SERD },
	{ "dcache", PLAT_ECC_INDICT2_DCACHE_SERD },
	{ "pcache", PLAT_ECC_INDICT2_PCACHE_SERD },
	{ "itlb", PLAT_ECC_INDICT2_ITLB_SERD },
	{ "dtlb", PLAT_ECC_INDICT2_DTLB_SERD },
	{ "fpu", PLAT_ECC_INDICT2_FPU }
};
909
910 /*
911 * The following function returns the indictment type for a given version
912 */
913 static int
flt_name_to_msg_type(const char * fault,int indict_version)914 flt_name_to_msg_type(const char *fault, int indict_version)
915 {
916 plat_ecc_bl_map_t *mapp;
917 char *fltnm = "fault.cpu.";
918 int mapsz;
919 char *p;
920 int i;
921
922 /* Check if it starts with proper fault name */
923 if (strncmp(fault, fltnm, strlen(fltnm)) != 0)
924 return (PLAT_ECC_INDICT_NONE);
925
926 fault += strlen(fltnm); /* c = "ultraSPARC-IV.icache" */
927
928 /* Skip the cpu type */
929 if ((p = strchr(fault, '.')) == NULL)
930 return (PLAT_ECC_INDICT_NONE);
931
932 p++; /* skip the "." */
933
934 if (indict_version == 0) {
935 mapp = plat_ecc_bl_map_v1;
936 mapsz = sizeof (plat_ecc_bl_map_v1) /
937 sizeof (plat_ecc_bl_map_t);
938 } else {
939 mapp = plat_ecc_bl_map_v2;
940 mapsz = sizeof (plat_ecc_bl_map_v2) /
941 sizeof (plat_ecc_bl_map_t);
942 }
943 for (i = 0; i < mapsz; i++) {
944 if (strcmp(p, mapp[i].ebm_reason) == 0) {
945 return (mapp[i].ebm_type);
946 }
947 }
948 return (PLAT_ECC_INDICT_NONE);
949 }
950
951 /*
952 * Blacklisting
953 */
954 int
plat_blacklist(int cmd,const char * scheme,nvlist_t * fmri,const char * class)955 plat_blacklist(int cmd, const char *scheme, nvlist_t *fmri, const char *class)
956 {
957 struct async_flt aflt;
958 char *unum;
959 int msg_type, is_old_indict;
960
961 if (fmri == NULL)
962 return (EINVAL);
963 if (cmd != BLIOC_INSERT)
964 return (ENOTSUP);
965
966 /*
967 * We support both the blacklisting of CPUs via mem-schemed
968 * FMRIs that name E$ J-numbers, and CPUs via cpu-schemed FMRIs
969 * that name the cpuid.
970 */
971 if (strcmp(scheme, FM_FMRI_SCHEME_MEM) == 0) {
972 if (nvlist_lookup_string(fmri, FM_FMRI_MEM_UNUM, &unum))
973 return (EINVAL);
974 aflt.flt_inst = (uint_t)-1;
975 } else if (strcmp(scheme, FM_FMRI_SCHEME_CPU) == 0) {
976 if (nvlist_lookup_uint32(fmri, FM_FMRI_CPU_ID, &aflt.flt_inst))
977 return (EINVAL);
978 unum = NULL;
979 } else {
980 return (ENOTSUP);
981 }
982
983 /*
984 * If the SC cannot handle indictment2, so fall back to old one.
985 * Also if the domain does not support FMA, then send only the old one.
986 */
987
988 is_old_indict = plat_ecc_capability_sc_get(PLAT_ECC_INDICTMENT_MESSAGE);
989
990 if (is_old_indict)
991 msg_type = flt_name_to_msg_type(class, 0);
992 else
993 msg_type = flt_name_to_msg_type(class, 1);
994
995 if (msg_type == PLAT_ECC_INDICT_NONE)
996 return (ENOTSUP);
997
998 /*
999 * The current blacklisting interfaces are designed for a world where
1000 * the SC is much more involved in the diagnosis and error reporting
1001 * process than it is in the FMA world. As such, the existing
1002 * interfaces want all kinds of information about the error that's
1003 * triggering the blacklist. In the FMA world, we don't have access
1004 * to any of that information by the time we're doing the blacklist,
1005 * so we fake values.
1006 */
1007 aflt.flt_id = gethrtime();
1008 aflt.flt_addr = -1;
1009 aflt.flt_stat = -1;
1010 aflt.flt_synd = (ushort_t)-1;
1011
1012 if (is_old_indict) {
1013 if (unum && unum[0] != '\0')
1014 return (plat_log_fruid_indictment(msg_type, &aflt,
1015 unum));
1016 else
1017 return (ENOTSUP);
1018 } else {
1019 return (plat_log_fruid_indictment2(msg_type, &aflt, unum));
1020 }
1021 }
1022
/*
 * Synchronization between plat_ecc_dispatch_task() and plat_ecc_send_msg().
 * Both examine and update a message's ecc_msg_status while holding
 * plat_ecc_mutex; plat_ecc_condvar is broadcast when a send completes;
 * plat_ecc_taskq is the taskq on which sends are dispatched.  All three
 * are set up in plat_ecc_init().
 */
static kcondvar_t plat_ecc_condvar;
static kmutex_t plat_ecc_mutex;
static taskq_t *plat_ecc_taskq;
1026
1027 /*
1028 * plat_ecc_dispatch_task: Dispatch the task on a taskq and wait for the
1029 * return value. We use cv_wait_sig to wait for the return values. If a
1030 * signal interrupts us, we return EINTR. Otherwise, we return the value
1031 * returned by the mailbox functions.
1032 *
1033 * To avoid overloading the lower-level mailbox routines, we use a taskq
1034 * to serialize all messages. Currently, it is expected that only one
1035 * process (fmd) will use this ioctl, so the delay caused by the taskq
1036 * should not have much of an effect.
1037 */
1038 int
plat_ecc_dispatch_task(plat_ecc_message_t * msg)1039 plat_ecc_dispatch_task(plat_ecc_message_t *msg)
1040 {
1041 int ret;
1042
1043 ASSERT(msg != NULL);
1044 ASSERT(plat_ecc_taskq != NULL);
1045
1046 if (taskq_dispatch(plat_ecc_taskq, plat_ecc_send_msg,
1047 (void *)msg, TQ_NOSLEEP) == NULL) {
1048 kmem_free(msg->ecc_msg_data, msg->ecc_msg_len);
1049 kmem_free(msg, sizeof (plat_ecc_message_t));
1050 return (ENOMEM);
1051 }
1052 mutex_enter(&plat_ecc_mutex);
1053
1054 /*
1055 * It's possible that the taskq function completed before we
1056 * acquired the mutex. Check for this first. If this did not
1057 * happen, we wait for the taskq function to signal us, or an
1058 * interrupt. We also check ecc_msg_status to protect against
1059 * spurious wakeups from cv_wait_sig.
1060 */
1061 if (msg->ecc_msg_status == PLAT_ECC_MSG_SENT) {
1062 ret = msg->ecc_msg_ret;
1063 kmem_free(msg->ecc_msg_data, msg->ecc_msg_len);
1064 kmem_free(msg, sizeof (plat_ecc_message_t));
1065 } else {
1066 msg->ecc_msg_status = PLAT_ECC_TASK_DISPATCHED;
1067
1068 while ((ret = cv_wait_sig(&plat_ecc_condvar,
1069 &plat_ecc_mutex)) != 0 &&
1070 msg->ecc_msg_status == PLAT_ECC_TASK_DISPATCHED)
1071 ;
1072
1073 if ((ret == 0) && (msg->ecc_msg_status != PLAT_ECC_MSG_SENT)) {
1074 /* An interrupt was received */
1075 msg->ecc_msg_status = PLAT_ECC_INTERRUPT_RECEIVED;
1076 ret = EINTR;
1077 } else {
1078 ret = msg->ecc_msg_ret;
1079 kmem_free(msg->ecc_msg_data, msg->ecc_msg_len);
1080 kmem_free(msg, sizeof (plat_ecc_message_t));
1081 }
1082 }
1083 mutex_exit(&plat_ecc_mutex);
1084 return (ret);
1085 }
1086
1087 static void
plat_ecc_send_msg(void * arg)1088 plat_ecc_send_msg(void *arg)
1089 {
1090 plat_ecc_message_t *msg = arg;
1091 int ret;
1092
1093 /*
1094 * Send this data off as a mailbox message to the SC.
1095 */
1096 ret = plat_send_ecc_mailbox_msg(msg->ecc_msg_type, msg->ecc_msg_data);
1097
1098 mutex_enter(&plat_ecc_mutex);
1099
1100 /*
1101 * If the dispatching function received an interrupt, don't bother
1102 * signalling it, and throw away the results. Otherwise, set the
1103 * return value and signal the condvar.
1104 */
1105 if (msg->ecc_msg_status == PLAT_ECC_INTERRUPT_RECEIVED) {
1106 kmem_free(msg->ecc_msg_data, msg->ecc_msg_len);
1107 kmem_free(msg, sizeof (plat_ecc_message_t));
1108 } else {
1109 msg->ecc_msg_ret = ret;
1110 msg->ecc_msg_status = PLAT_ECC_MSG_SENT;
1111 cv_broadcast(&plat_ecc_condvar);
1112 }
1113
1114 mutex_exit(&plat_ecc_mutex);
1115 }
1116
1117 void
plat_ecc_init(void)1118 plat_ecc_init(void)
1119 {
1120 int bd;
1121
1122 mutex_init(&plat_ecc_mutex, NULL, MUTEX_DEFAULT, NULL);
1123 cv_init(&plat_ecc_condvar, NULL, CV_DEFAULT, NULL);
1124 plat_ecc_taskq = taskq_create("plat_ecc_taskq", 1, minclsyspri,
1125 PLAT_ECC_TASKQ_MIN, PLAT_ECC_TASKQ_MAX, TASKQ_PREPOPULATE);
1126 ASSERT(plat_ecc_taskq != NULL);
1127
1128 for (bd = 0; bd < plat_max_cpumem_boards(); bd++) {
1129 mutex_init(&domain_dimm_sids[bd].pdsb_lock,
1130 NULL, MUTEX_DEFAULT, NULL);
1131 }
1132
1133 }
1134