1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #include <sys/plat_ecc_unum.h>
27 #include <sys/utsname.h>
28 #include <sys/cmn_err.h>
29 #include <sys/async.h>
30 #include <sys/errno.h>
31 #include <sys/fm/protocol.h>
32 #include <sys/fm/cpu/UltraSPARC-III.h>
33 #include <sys/bl.h>
34 #include <sys/taskq.h>
35 #include <sys/condvar.h>
36 #include <sys/plat_ecc_dimm.h>
37
/*
 * Hook through which a platform may supply the routine that initializes
 * its cache of DIMM serial ids.  It stays NULL unless something assigns it;
 * plat_ecc_capability_sc_set() checks for NULL before calling through it.
 */
int (*p2init_sid_cache)(void);
43
44 /*
45 * This file contains the common code that is used for parsing
46 * ecc unum data and logging it appropriately as the platform
47 * that calls this code implements.
48 */
49
50 int plat_ecc_dispatch_task(plat_ecc_message_t *);
51 static void plat_ecc_send_msg(void *);
52
53 #define CHECK_UNUM \
54 if (unum_ptr == NULL) { \
55 break; \
56 }
57
58 /*
59 * See plat_ecc_unum.h for the meaning of these variables.
60 */
61 int ecc_log_fruid_enable = ECC_FRUID_ENABLE_DEFAULT;
62
63 uint32_t plat_ecc_capability_map_domain = PLAT_ECC_CAPABILITY_DOMAIN_DEFAULT;
64 uint32_t plat_ecc_capability_map_sc = PLAT_ECC_CAPABILITY_SC_DEFAULT;
65 uint16_t ecc_error2_mailbox_flags = PLAT_ECC_ERROR2_SEND_DEFAULT;
66 uint16_t ecc_indictment2_mailbox_flags = PLAT_ECC_SEND_INDICT2_DEFAULT;
67
68 /*
69 * We log all ECC errors using the function that is defined as
70 * plat_send_ecc_mailbox_msg(); We first parse the unum string and
71 * then pass the data to be logged to the plat_send_ecc_mailbox_msg
72 * function for logging. Each platform that uses this code needs to
73 * implement a suitable function for this purpose.
74 */
75 void
plat_log_fruid_error(int synd_code,struct async_flt * ecc,char * unum,uint64_t afsr_bit)76 plat_log_fruid_error(int synd_code, struct async_flt *ecc, char *unum,
77 uint64_t afsr_bit)
78 {
79 plat_ecc_error_data_t ecc_error_data;
80 enum plat_ecc_type ecc_type = PLAT_ECC_UNKNOWN;
81 int board_num;
82 int proc_position;
83 int invalid_unum = 1;
84
85 bzero(&ecc_error_data, sizeof (plat_ecc_error_data_t));
86 ecc_error_data.version = PLAT_ECC_VERSION;
87
88 switch (afsr_bit) {
89 case C_AFSR_CE:
90 ecc_error_data.error_code = PLAT_ERROR_CODE_CE;
91 break;
92 case C_AFSR_UE:
93 ecc_error_data.error_code = PLAT_ERROR_CODE_UE;
94 break;
95 case C_AFSR_EDC:
96 ecc_error_data.error_code = PLAT_ERROR_CODE_EDC;
97 break;
98 case C_AFSR_EDU:
99 ecc_error_data.error_code = PLAT_ERROR_CODE_EDU;
100 break;
101 case C_AFSR_WDC:
102 ecc_error_data.error_code = PLAT_ERROR_CODE_WDC;
103 break;
104 case C_AFSR_WDU:
105 ecc_error_data.error_code = PLAT_ERROR_CODE_WDU;
106 break;
107 case C_AFSR_CPC:
108 ecc_error_data.error_code = PLAT_ERROR_CODE_CPC;
109 break;
110 case C_AFSR_CPU:
111 ecc_error_data.error_code = PLAT_ERROR_CODE_CPU;
112 break;
113 case C_AFSR_UCC:
114 ecc_error_data.error_code = PLAT_ERROR_CODE_UCC;
115 break;
116 case C_AFSR_UCU:
117 ecc_error_data.error_code = PLAT_ERROR_CODE_UCU;
118 break;
119 case C_AFSR_EMC:
120 ecc_error_data.error_code = PLAT_ERROR_CODE_EMC;
121 break;
122 case C_AFSR_EMU:
123 ecc_error_data.error_code = PLAT_ERROR_CODE_EMU;
124 break;
125 default:
126 /*
127 * Do not send messages with unknown error codes, since
128 * the SC will not be able to tell what type of error
129 * occurred.
130 */
131 return;
132 }
133
134 ecc_error_data.detecting_proc = ecc->flt_bus_id;
135
136 if (ecc->flt_in_memory)
137 ecc_type = PLAT_ECC_MEMORY;
138 else if (ecc->flt_status & ECC_ECACHE)
139 ecc_type = PLAT_ECC_ECACHE;
140
141 switch (ecc_type) {
142 case PLAT_ECC_MEMORY: {
143 /*
144 * The unum string is expected to be in this form:
145 * "/N0/SB12/P0/B0/D2 J13500, ..."
146 * for serengeti. As this code is shared with Starcat
147 * if N is missing then it is set to 0.
148 * From that we will extract the bank number, dimm
149 * number, and Jnumber.
150 */
151 char *unum_ptr = unum;
152 char *jno_ptr = ecc_error_data.Jnumber;
153 int i;
154
155 /*
156 * On Serengeti we expect to find 'N' in the unum string
157 * however, on Starcat 'N' does not appear in the unum string.
158 * We do not want this code to break at this point, so the
159 * unum_ptr is reset to the start of unum string if we fail
160 * to find an 'N'.
161 */
162 unum_ptr = strchr(unum_ptr, 'N');
163 if (unum_ptr == NULL) {
164 ecc_error_data.node_no = 0;
165 unum_ptr = unum;
166 } else {
167 unum_ptr++;
168 ecc_error_data.node_no = stoi(&unum_ptr);
169 }
170
171 /*
172 * Now pull out the SB number
173 */
174 unum_ptr = strstr(unum_ptr, "SB");
175 CHECK_UNUM;
176 unum_ptr += 2;
177 board_num = stoi(&unum_ptr);
178
179 /*
180 * Now pull out the Proc position (relative to the board)
181 */
182 unum_ptr = strchr(unum_ptr, 'P');
183 CHECK_UNUM;
184 unum_ptr++;
185 proc_position = stoi(&unum_ptr);
186
187 /*
188 * Using the SB number and Proc position we create a FRU
189 * cpu id.
190 */
191 ecc_error_data.proc_num =
192 plat_make_fru_cpuid(board_num, 0, proc_position);
193
194 /*
195 * Now pull out the Memory Bank number
196 */
197 unum_ptr = strchr(unum_ptr, 'B');
198 CHECK_UNUM;
199 unum_ptr++;
200 ecc_error_data.bank_no = (stoi(&unum_ptr) & 0x01);
201
202 /*
203 * Now pull out the Dimm number within the Memory Bank.
204 */
205 unum_ptr = strchr(unum_ptr, 'D');
206 CHECK_UNUM;
207 unum_ptr++;
208 ecc_error_data.ecache_dimm_no = (stoi(&unum_ptr) & 0x03);
209
210 /*
211 * Now pull out the J-number.
212 */
213 unum_ptr = strchr(unum_ptr, 'J');
214 CHECK_UNUM;
215 unum_ptr++;
216 for (i = PLAT_ECC_JNUMBER_LENGTH;
217 i > 0 && *unum_ptr >= '0' && *unum_ptr <= '9'; i--)
218 *jno_ptr++ = *unum_ptr++;
219 *jno_ptr = '\0';
220
221 /*
222 * If we get here, we can assume the unum is valid
223 */
224 invalid_unum = 0;
225 break;
226 }
227 case PLAT_ECC_ECACHE: {
228 /*
229 * The unum string is expected to be in this form:
230 * "[/N0/][SB|IO]12/P0/E0 J13500, ..."
231 * for serengeti. As this code is shared with Starcat
232 * if N is missing then it is set to 0. IO may only appear
233 * on Starcats. From that we will extract the bank number,
234 * dimm number, and Jnumber.
235 */
236 char *unum_ptr = unum;
237 char *jno_ptr = ecc_error_data.Jnumber;
238 int is_maxcat = 0;
239 int i;
240
241 /*
242 * On Serengeti we expect to find 'N' in the unum string
243 * however, on Starcat 'N' does not appear in the unum string.
244 * We do not want this code to break at this point, so the
245 * unum_ptr is reset to the start of unum string if we fail
246 * to find an 'N'.
247 */
248 unum_ptr = strchr(unum_ptr, 'N');
249 if (unum_ptr == NULL) {
250 ecc_error_data.node_no = 0;
251 unum_ptr = unum;
252 } else {
253 unum_ptr++;
254 ecc_error_data.node_no = stoi(&unum_ptr);
255 }
256
257 /*
258 * Now pull out the SB/IO number
259 */
260 unum_ptr = strstr(unum_ptr, "SB");
261 if (unum_ptr == NULL) {
262
263 /*
264 * Since this is an E$ error, it must have occurred on
265 * either a System Board (represented by "SB" in the
266 * unum string) or a Maxcat board ("IO" in the unum
267 * string). Since we failed the "SB" check, we'll
268 * assume this is a maxcat board.
269 */
270 is_maxcat = 1;
271 unum_ptr = strstr(unum, "IO");
272 }
273 CHECK_UNUM;
274 unum_ptr += 2;
275 board_num = stoi(&unum_ptr);
276
277 /*
278 * Now pull out the Proc position (relative to the board)
279 */
280 unum_ptr = strchr(unum_ptr, 'P');
281 CHECK_UNUM;
282 unum_ptr++;
283 proc_position = stoi(&unum_ptr);
284
285 /*
286 * Using the SB/IO number, slot 0/1 value (is_maxcat), and
287 * proc position, we create the cpu id.
288 */
289 ecc_error_data.proc_num = plat_make_fru_cpuid(board_num,
290 is_maxcat, proc_position);
291
292 ecc_error_data.bank_no = 0; /* not used */
293
294 unum_ptr = strchr(unum_ptr, 'E');
295 CHECK_UNUM;
296 unum_ptr++;
297 ecc_error_data.ecache_dimm_no = (stoi(&unum_ptr) & 0x01);
298
299 unum_ptr = strchr(unum_ptr, 'J');
300 CHECK_UNUM;
301 unum_ptr++;
302 for (i = PLAT_ECC_JNUMBER_LENGTH;
303 i > 0 && *unum_ptr >= '0' && *unum_ptr <= '9'; i--)
304 *jno_ptr++ = *unum_ptr++;
305 *jno_ptr = '\0';
306
307 /*
308 * If we get here, we can assume the unum is valid
309 */
310 invalid_unum = 0;
311 break;
312 }
313 default:
314 /*
315 * Unknown error
316 */
317 break;
318 }
319
320 /*
321 * This is where CHECK_UNUM goes when it finds an error
322 */
323
324 if (ECC_SYND_DATA_BEGIN <= synd_code &&
325 synd_code < ECC_SYND_ECC_BEGIN) {
326 ecc_error_data.error_type = PLAT_ERROR_TYPE_SINGLE;
327 ecc_error_data.databit_type = PLAT_BIT_TYPE_DATA;
328 ecc_error_data.databit_no = synd_code;
329 } else if (ECC_SYND_ECC_BEGIN <= synd_code &&
330 synd_code < ECC_SYND_MTAG_BEGIN) {
331 ecc_error_data.error_type = PLAT_ERROR_TYPE_SINGLE;
332 ecc_error_data.databit_type = PLAT_BIT_TYPE_ECC;
333 ecc_error_data.databit_no = synd_code - ECC_SYND_ECC_BEGIN;
334 } else if (ECC_SYND_MTAG_BEGIN <= synd_code &&
335 synd_code < ECC_SYND_MECC_BEGIN) {
336 ecc_error_data.error_type = PLAT_ERROR_TYPE_SINGLE;
337 ecc_error_data.databit_type = PLAT_BIT_TYPE_MTAG_D;
338 ecc_error_data.databit_no = synd_code - ECC_SYND_MTAG_BEGIN;
339 } else if (ECC_SYND_MECC_BEGIN <= synd_code &&
340 synd_code < ECC_SYND_M2) {
341 ecc_error_data.error_type = PLAT_ERROR_TYPE_SINGLE;
342 ecc_error_data.databit_type = PLAT_BIT_TYPE_MTAG_E;
343 ecc_error_data.databit_no = synd_code - ECC_SYND_MECC_BEGIN;
344 } else {
345 switch (synd_code) {
346 case ECC_SYND_M2:
347 ecc_error_data.error_type = PLAT_ERROR_TYPE_M2;
348 break;
349 case ECC_SYND_M3:
350 ecc_error_data.error_type = PLAT_ERROR_TYPE_M3;
351 break;
352 case ECC_SYND_M4:
353 ecc_error_data.error_type = PLAT_ERROR_TYPE_M4;
354 break;
355 case ECC_SYND_M:
356 ecc_error_data.error_type = PLAT_ERROR_TYPE_M;
357 break;
358 default:
359 ecc_error_data.error_type = PLAT_ERROR_TYPE_UNK;
360 break;
361 }
362 ecc_error_data.databit_type = PLAT_BIT_TYPE_MULTI;
363 ecc_error_data.databit_no = 0; /* not used */
364 }
365
366 #ifdef DEBUG
367 if (invalid_unum &&
368 (ecc_error_data.error_code != PLAT_ERROR_CODE_UE) &&
369 unum && *unum)
370 cmn_err(CE_WARN, "Unexpected unum string format: %s\n", unum);
371 #endif
372
373 /*
374 * Send this data off as a mailbox message to the SC.
375 */
376 (void) plat_send_ecc_mailbox_msg(PLAT_ECC_ERROR_MESSAGE,
377 &ecc_error_data);
378 }
379
/*
 * Pick apart a memory unum.  The expected shape is
 *	"[/N0/]SB12/P0/B0/D2 [J13500]"
 * or, when the unum was generated as the result of a UE,
 *	"[/N0/]SB12/P0/B0 [J13500, ...]"
 *
 * On success (return 0) *board, *pos and *bank are always set.  Beyond that:
 *   - no 'D' after the bank:	*dimm = -1 and *jnumber = 0
 *   - 'D' but no 'J' after it:	*dimm is set and *jnumber = -1; the caller
 *				decides whether that is acceptable, since
 *				Serengeti has no jnumbers for bank unums that
 *				may be used to build DIMM unums (e.g. for
 *				acquiring DIMM serial ids)
 *   - both present:		*dimm and *jnumber are set
 *
 * Returns 1 for an invalid unum: "SB" missing, or the board number not
 * followed immediately by "/P<pos>/B<bank>".
 */

int
parse_unum_memory(char *unum, int *board, int *pos, int *bank, int *dimm,
    int *jnumber)
{
	char *cur = strstr(unum, "SB");

	if (cur == NULL)
		return (1);
	cur += 2;
	*board = (uint8_t)stoi(&cur);

	/* "/P" must follow the board number directly. */
	if (cur[0] != '/' || cur[1] != 'P')
		return (1);
	cur += 2;
	*pos = stoi(&cur);

	/* ... and "/B" must follow the processor position directly. */
	if (cur[0] != '/' || cur[1] != 'B')
		return (1);
	cur += 2;
	*bank = stoi(&cur);

	/* A unum without a DIMM component names the whole bank. */
	cur = strchr(cur, 'D');
	if (cur == NULL) {
		*dimm = -1;
		*jnumber = 0;
		return (0);
	}
	cur++;
	*dimm = stoi(&cur);

	/* A DIMM unum may legitimately lack a jnumber; flag it with -1. */
	cur = strchr(cur, 'J');
	if (cur == NULL) {
		*jnumber = -1;
		return (0);
	}
	cur++;
	*jnumber = (uint16_t)stoi(&cur);

	return (0);
}
433
/*
 * The unum string for ecache is expected to be in this form:
 *	"[/N0/][SB|IO]12/P0/E0 J13500, ..."
 * From that we extract the board number, the processor position and the
 * jnumber, and note whether the board is a Maxcat ("IO") board.
 *
 * On success (return 0) all of *board, *pos, *jnumber and *maxcat are set;
 * *maxcat is 1 for an "IO" board and 0 for an "SB" board.  *maxcat is
 * assigned on both paths so the result does not depend on the caller
 * having zeroed it beforehand.
 *
 * Returns 1 for any invalid unum string: neither "SB" nor "IO" present,
 * the board number not followed immediately by "/P", or no 'J' after the
 * processor position.
 */
static int
parse_unum_ecache(char *unum, int *board, int *pos, int *jnumber, int *maxcat)
{
	char *c;

	/*
	 * Since this is an E$ error, it must have occurred on either a
	 * System Board (represented by "SB" in the unum string) or a
	 * Maxcat board ("IO" in the unum string).
	 */
	if ((c = strstr(unum, "SB")) != NULL) {
		*maxcat = 0;
	} else if ((c = strstr(unum, "IO")) != NULL) {
		*maxcat = 1;
	} else {
		return (1);
	}

	c += 2;
	*board = (uint8_t)stoi(&c);

	if (*c++ != '/' || *c++ != 'P')
		return (1);
	*pos = stoi(&c);

	if ((c = strchr(c, 'J')) == NULL)
		return (1);

	c++;
	*jnumber = (uint16_t)stoi(&c);

	return (0);
}
474
475 /* The following array maps the error to its corresponding set */
476 static int plat_ecc_e2d_map[PLAT_ECC_ERROR2_NUMVALS] = {
477 PLAT_ECC_ERROR2_NONE, /* 0x00 */
478 PLAT_ECC_ERROR2_SEND_L2_XXC, /* 0x01 */
479 PLAT_ECC_ERROR2_SEND_L2_XXU, /* 0x02 */
480 PLAT_ECC_ERROR2_SEND_L3_XXC, /* 0x03 */
481 PLAT_ECC_ERROR2_SEND_L3_XXU, /* 0x04 */
482 PLAT_ECC_ERROR2_SEND_MEM_ERRS, /* 0x05 */
483 PLAT_ECC_ERROR2_SEND_MEM_ERRS, /* 0x06 */
484 PLAT_ECC_ERROR2_SEND_MEM_ERRS, /* 0x07 */
485 PLAT_ECC_ERROR2_SEND_BUS_ERRS, /* 0x08 */
486 PLAT_ECC_ERROR2_SEND_BUS_ERRS, /* 0x09 */
487 PLAT_ECC_ERROR2_SEND_BUS_ERRS, /* 0x0a */
488 PLAT_ECC_ERROR2_SEND_BUS_ERRS, /* 0x0b */
489 PLAT_ECC_ERROR2_SEND_L2_TAG_ERRS, /* 0x0c */
490 PLAT_ECC_ERROR2_SEND_L2_TAG_ERRS, /* 0x0d */
491 PLAT_ECC_ERROR2_SEND_L3_TAG_ERRS, /* 0x0e */
492 PLAT_ECC_ERROR2_SEND_L3_TAG_ERRS, /* 0x0f */
493 PLAT_ECC_ERROR2_SEND_L1_PARITY, /* 0x10 */
494 PLAT_ECC_ERROR2_SEND_L1_PARITY, /* 0x11 */
495 PLAT_ECC_ERROR2_SEND_TLB_PARITY, /* 0x12 */
496 PLAT_ECC_ERROR2_SEND_TLB_PARITY, /* 0x13 */
497 PLAT_ECC_ERROR2_SEND_IV_ERRS, /* 0x14 */
498 PLAT_ECC_ERROR2_SEND_IV_ERRS, /* 0x15 */
499 PLAT_ECC_ERROR2_SEND_MTAG_XXC, /* 0x16 */
500 PLAT_ECC_ERROR2_SEND_IV_MTAG_XXC, /* 0x17 */
501 PLAT_ECC_ERROR2_SEND_L3_XXC, /* 0x18 */
502 PLAT_ECC_ERROR2_SEND_PCACHE /* 0x19 */
503 };
504
505 /*
506 * log enhanced error information to SC.
507 */
508 void
plat_log_fruid_error2(int msg_type,char * unum,struct async_flt * aflt,plat_ecc_ch_async_flt_t * ecc_ch_flt)509 plat_log_fruid_error2(int msg_type, char *unum, struct async_flt *aflt,
510 plat_ecc_ch_async_flt_t *ecc_ch_flt)
511 {
512 plat_ecc_error2_data_t e2d = {0};
513 int board, pos, bank, dimm, jnumber;
514 int maxcat = 0;
515 uint16_t flags;
516
517 /* Check the flags */
518 flags = plat_ecc_e2d_map[msg_type];
519 if ((ecc_error2_mailbox_flags & flags) == 0)
520 return;
521
522 /* Fill the header */
523 e2d.ee2d_major_version = PLAT_ECC_ERROR2_VERSION_MAJOR;
524 e2d.ee2d_minor_version = PLAT_ECC_ERROR2_VERSION_MINOR;
525 e2d.ee2d_msg_type = PLAT_ECC_ERROR2_MESSAGE;
526 e2d.ee2d_msg_length = sizeof (plat_ecc_error2_data_t);
527
528 /* Fill the data */
529 if (aflt->flt_in_memory) {
530 if (parse_unum_memory(unum, &board, &pos, &bank, &dimm,
531 &jnumber) || (dimm != -1 && jnumber == -1))
532 return;
533 /*
534 * Using the SB number and Proc position we create a FRU
535 * cpu id.
536 */
537 e2d.ee2d_owning_proc = plat_make_fru_cpuid(board, 0, pos);
538 e2d.ee2d_jnumber = jnumber;
539 e2d.ee2d_bank_number = bank;
540 } else if (aflt->flt_status & ECC_ECACHE) {
541 if (parse_unum_ecache(unum, &board, &pos, &jnumber, &maxcat))
542 return;
543 /*
544 * Using the SB number and Proc position we create a FRU
545 * cpu id.
546 */
547 e2d.ee2d_owning_proc = plat_make_fru_cpuid(board, maxcat, pos);
548 e2d.ee2d_jnumber = jnumber;
549 e2d.ee2d_bank_number = (uint8_t)-1;
550 } else {
551 /*
552 * L1 Cache
553 */
554 e2d.ee2d_owning_proc = aflt->flt_bus_id;
555 e2d.ee2d_jnumber = (uint16_t)-1;
556 e2d.ee2d_bank_number = (uint8_t)-1;
557 }
558
559 e2d.ee2d_type = (uint8_t)msg_type;
560 e2d.ee2d_afar_status = (uint8_t)ecc_ch_flt->ecaf_afar_status;
561 e2d.ee2d_synd_status = (uint8_t)ecc_ch_flt->ecaf_synd_status;
562 e2d.ee2d_detecting_proc = aflt->flt_bus_id;
563 e2d.ee2d_cpu_impl = cpunodes[e2d.ee2d_owning_proc].implementation;
564 e2d.ee2d_timestamp = aflt->flt_id;
565 e2d.ee2d_afsr = aflt->flt_stat;
566 e2d.ee2d_afar = aflt->flt_addr;
567
568 e2d.ee2d_sdw_afsr = ecc_ch_flt->ecaf_sdw_afsr;
569 e2d.ee2d_sdw_afar = ecc_ch_flt->ecaf_sdw_afar;
570 e2d.ee2d_afsr_ext = ecc_ch_flt->ecaf_afsr_ext;
571 e2d.ee2d_sdw_afsr_ext = ecc_ch_flt->ecaf_sdw_afsr_ext;
572
573 /* Send the message to SC */
574 (void) plat_send_ecc_mailbox_msg(PLAT_ECC_ERROR2_MESSAGE, &e2d);
575 }
576
577 uint8_t ecc_indictment_mailbox_disable = PLAT_ECC_INDICTMENT_OK;
578 uint8_t ecc_indictment_mailbox_flags = PLAT_ECC_SEND_DEFAULT_INDICT;
579
580 /*
581 * We log all Solaris indictments of failing hardware. We pull the system
582 * board number and jnumber out of the unum string, and calculate the cpuid
583 * from some members of the unum string. The rest of the structure is filled
584 * in through the other arguments. The data structure is then passed to
585 * plat_ecc_dispatch_task(). This function should only be loaded into memory
586 * or called on platforms that define a plat_send_ecc_mailbox_msg() function.
587 */
588 static int
plat_log_fruid_indictment(int msg_type,struct async_flt * aflt,char * unum)589 plat_log_fruid_indictment(int msg_type, struct async_flt *aflt, char *unum)
590 {
591 plat_ecc_message_t *wrapperp;
592 plat_ecc_indict_msg_contents_t *contentsp;
593 char *unum_ptr;
594 int is_maxcat = 0;
595
596 switch (ecc_indictment_mailbox_disable) {
597 case (PLAT_ECC_INDICTMENT_OK):
598 case (PLAT_ECC_INDICTMENT_SUSPECT):
599 break;
600 case (PLAT_ECC_INDICTMENT_NO_SEND):
601 default:
602 return (ECONNREFUSED);
603 }
604
605 switch (msg_type) {
606 case (PLAT_ECC_INDICT_DIMM):
607 if ((ecc_indictment_mailbox_flags &
608 PLAT_ECC_SEND_DIMM_INDICT) == 0)
609 return (ECONNREFUSED);
610 break;
611 case (PLAT_ECC_INDICT_ECACHE_CORRECTABLES):
612 if ((ecc_indictment_mailbox_flags &
613 PLAT_ECC_SEND_ECACHE_XXC_INDICT) == 0)
614 return (ECONNREFUSED);
615 break;
616 case (PLAT_ECC_INDICT_ECACHE_UNCORRECTABLE):
617 if ((ecc_indictment_mailbox_flags &
618 PLAT_ECC_SEND_ECACHE_XXU_INDICT) == 0)
619 return (ECONNREFUSED);
620 break;
621 default:
622 return (ECONNREFUSED);
623 }
624
625 /* LINTED: E_TRUE_LOGICAL_EXPR */
626 ASSERT(sizeof (plat_ecc_indictment_data_t) == PLAT_ECC_INDICT_SIZE);
627
628 wrapperp = (plat_ecc_message_t *)
629 kmem_zalloc(sizeof (plat_ecc_message_t), KM_SLEEP);
630
631 wrapperp->ecc_msg_status = PLAT_ECC_NO_MSG_ACTIVE;
632 wrapperp->ecc_msg_type = PLAT_ECC_INDICTMENT_MESSAGE;
633 wrapperp->ecc_msg_len = sizeof (plat_ecc_indictment_data_t);
634 wrapperp->ecc_msg_data = kmem_zalloc(wrapperp->ecc_msg_len, KM_SLEEP);
635
636 contentsp = &(((plat_ecc_indictment_data_t *)
637 wrapperp->ecc_msg_data)->msg_contents);
638
639 /*
640 * Find board_num, jnumber, and proc position from the unum string.
641 * Use the board number, is_maxcat, and proc position to calculate
642 * cpuid.
643 */
644 unum_ptr = strstr(unum, "SB");
645 if (unum_ptr == NULL) {
646 is_maxcat = 1;
647 unum_ptr = strstr(unum, "IO");
648 if (unum_ptr == NULL) {
649 kmem_free(wrapperp->ecc_msg_data,
650 wrapperp->ecc_msg_len);
651 kmem_free(wrapperp, sizeof (plat_ecc_message_t));
652 return (EINVAL);
653 }
654 }
655 unum_ptr += 2;
656 contentsp->board_num = (uint8_t)stoi(&unum_ptr);
657
658 unum_ptr = strchr(unum_ptr, 'P');
659 if (unum_ptr == NULL) {
660 kmem_free(wrapperp->ecc_msg_data, wrapperp->ecc_msg_len);
661 kmem_free(wrapperp, sizeof (plat_ecc_message_t));
662 return (EINVAL);
663 }
664 unum_ptr++;
665 contentsp->detecting_proc =
666 (uint16_t)plat_make_fru_cpuid(contentsp->board_num, is_maxcat,
667 stoi(&unum_ptr));
668
669 unum_ptr = strchr(unum_ptr, 'J');
670 if (unum_ptr == NULL) {
671 kmem_free(wrapperp->ecc_msg_data, wrapperp->ecc_msg_len);
672 kmem_free(wrapperp, sizeof (plat_ecc_message_t));
673 return (EINVAL);
674 }
675 unum_ptr++;
676 contentsp->jnumber = (uint16_t)stoi(&unum_ptr);
677
678 /*
679 * Fill in the rest of the data
680 */
681 contentsp->version = PLAT_ECC_INDICTMENT_VERSION;
682 contentsp->indictment_type = msg_type;
683 contentsp->indictment_uncertain = ecc_indictment_mailbox_disable;
684 contentsp->syndrome = aflt->flt_synd;
685 contentsp->afsr = aflt->flt_stat;
686 contentsp->afar = aflt->flt_addr;
687
688 /*
689 * Build the solaris_version string:
690 */
691 (void) snprintf(contentsp->solaris_version,
692 PLAT_ECC_VERSION_LENGTH, "%s %s", utsname.release, utsname.version);
693
694 /*
695 * Send the data on to the queuing function
696 */
697 return (plat_ecc_dispatch_task(wrapperp));
698 }
699
700 /* The following array maps the indictment to its corresponding set */
701 static int plat_ecc_i2d_map[PLAT_ECC_INDICT2_NUMVALS] = {
702 PLAT_ECC_INDICT2_NONE, /* 0x00 */
703 PLAT_ECC_SEND_INDICT2_L2_XXU, /* 0x01 */
704 PLAT_ECC_SEND_INDICT2_L2_XXC_SERD, /* 0x02 */
705 PLAT_ECC_SEND_INDICT2_L2_TAG_SERD, /* 0x03 */
706 PLAT_ECC_SEND_INDICT2_L3_XXU, /* 0x04 */
707 PLAT_ECC_SEND_INDICT2_L3_XXC_SERD, /* 0x05 */
708 PLAT_ECC_SEND_INDICT2_L3_TAG_SERD, /* 0x06 */
709 PLAT_ECC_SEND_INDICT2_L1_SERD, /* 0x07 */
710 PLAT_ECC_SEND_INDICT2_L1_SERD, /* 0x08 */
711 PLAT_ECC_SEND_INDICT2_TLB_SERD, /* 0x09 */
712 PLAT_ECC_SEND_INDICT2_TLB_SERD, /* 0x0a */
713 PLAT_ECC_SEND_INDICT2_FPU, /* 0x0b */
714 PLAT_ECC_SEND_INDICT2_PCACHE_SERD /* 0x0c */
715 };
716
717 static int
plat_log_fruid_indictment2(int msg_type,struct async_flt * aflt,char * unum)718 plat_log_fruid_indictment2(int msg_type, struct async_flt *aflt, char *unum)
719 {
720 plat_ecc_message_t *wrapperp;
721 plat_ecc_indictment2_data_t *i2d;
722 int board, pos, jnumber;
723 int maxcat = 0;
724 uint16_t flags;
725
726 /*
727 * If the unum is null or empty, skip parsing it
728 */
729 if (unum && unum[0] != '\0') {
730 if (parse_unum_ecache(unum, &board, &pos, &jnumber, &maxcat))
731 return (EINVAL);
732 }
733
734 if ((ecc_indictment_mailbox_disable != PLAT_ECC_INDICTMENT_OK) &&
735 (ecc_indictment_mailbox_disable != PLAT_ECC_INDICTMENT_SUSPECT))
736 return (ECONNREFUSED);
737
738 /* Check the flags */
739 flags = plat_ecc_i2d_map[msg_type];
740 if ((ecc_indictment2_mailbox_flags & flags) == 0)
741 return (ECONNREFUSED);
742
743 wrapperp = (plat_ecc_message_t *)
744 kmem_zalloc(sizeof (plat_ecc_message_t), KM_SLEEP);
745
746 /* Initialize the wrapper */
747 wrapperp->ecc_msg_status = PLAT_ECC_NO_MSG_ACTIVE;
748 wrapperp->ecc_msg_type = PLAT_ECC_INDICTMENT2_MESSAGE;
749 wrapperp->ecc_msg_len = sizeof (plat_ecc_indictment2_data_t);
750 wrapperp->ecc_msg_data = kmem_zalloc(wrapperp->ecc_msg_len, KM_SLEEP);
751
752 i2d = (plat_ecc_indictment2_data_t *)wrapperp->ecc_msg_data;
753
754 /* Fill the header */
755 i2d->ei2d_major_version = PLAT_ECC_INDICT2_MAJOR_VERSION;
756 i2d->ei2d_minor_version = PLAT_ECC_INDICT2_MINOR_VERSION;
757 i2d->ei2d_msg_type = PLAT_ECC_INDICTMENT2_MESSAGE;
758 i2d->ei2d_msg_length = sizeof (plat_ecc_indictment2_data_t);
759
760 /* Fill the data */
761 if (unum && unum[0] != '\0') {
762 i2d->ei2d_arraigned_proc = plat_make_fru_cpuid(board, maxcat,
763 pos);
764 i2d->ei2d_board_num = board;
765 i2d->ei2d_jnumber = jnumber;
766 } else {
767 i2d->ei2d_arraigned_proc = aflt->flt_inst;
768 i2d->ei2d_board_num = (uint8_t)
769 plat_make_fru_boardnum(i2d->ei2d_arraigned_proc);
770 i2d->ei2d_jnumber = (uint16_t)-1;
771 }
772
773 i2d->ei2d_type = msg_type;
774 i2d->ei2d_uncertain = ecc_indictment_mailbox_disable;
775 i2d->ei2d_cpu_impl = cpunodes[i2d->ei2d_arraigned_proc].implementation;
776 i2d->ei2d_timestamp = aflt->flt_id;
777
778 /*
779 * Send the data on to the queuing function
780 */
781 return (plat_ecc_dispatch_task(wrapperp));
782 }
783
784 int
plat_ecc_capability_send(void)785 plat_ecc_capability_send(void)
786 {
787 plat_ecc_message_t *wrapperp;
788 plat_capability_data_t *cap;
789 int ver_len;
790
791 wrapperp = kmem_zalloc(sizeof (plat_ecc_message_t), KM_SLEEP);
792
793 ver_len = strlen(utsname.release) + strlen(utsname.version) + 2;
794
795 /* Initialize the wrapper */
796 wrapperp->ecc_msg_status = PLAT_ECC_NO_MSG_ACTIVE;
797 wrapperp->ecc_msg_type = PLAT_ECC_CAPABILITY_MESSAGE;
798 wrapperp->ecc_msg_len = sizeof (plat_capability_data_t) + ver_len;
799 wrapperp->ecc_msg_data = kmem_zalloc(wrapperp->ecc_msg_len, KM_SLEEP);
800
801 cap = (plat_capability_data_t *)wrapperp->ecc_msg_data;
802
803 /* Fill the header */
804 cap->capd_major_version = PLAT_ECC_CAP_VERSION_MAJOR;
805 cap->capd_minor_version = PLAT_ECC_CAP_VERSION_MINOR;
806 cap->capd_msg_type = PLAT_ECC_CAPABILITY_MESSAGE;
807 cap->capd_msg_length = wrapperp->ecc_msg_len;
808
809 /* Set the default domain capability */
810 cap->capd_capability = PLAT_ECC_CAPABILITY_DOMAIN_DEFAULT;
811
812 /*
813 * Build the solaris_version string:
814 * utsname.release + " " + utsname.version
815 */
816 (void) snprintf(cap->capd_solaris_version, ver_len, "%s %s",
817 utsname.release, utsname.version);
818
819 /*
820 * Send the data on to the queuing function
821 */
822 return (plat_ecc_dispatch_task(wrapperp));
823 }
824
825 int
plat_ecc_capability_sc_get(int type)826 plat_ecc_capability_sc_get(int type)
827 {
828 switch (type) {
829 case PLAT_ECC_ERROR_MESSAGE:
830 if (ecc_log_fruid_enable &&
831 (!(plat_ecc_capability_map_sc &
832 PLAT_ECC_CAPABILITY_ERROR2)))
833 return (1);
834 break;
835 case PLAT_ECC_ERROR2_MESSAGE:
836 if (plat_ecc_capability_map_sc &
837 PLAT_ECC_CAPABILITY_ERROR2)
838 return (1);
839 break;
840 case PLAT_ECC_INDICTMENT_MESSAGE:
841 if (!(plat_ecc_capability_map_sc &
842 PLAT_ECC_CAPABILITY_INDICT2) ||
843 !(plat_ecc_capability_map_domain &
844 PLAT_ECC_CAPABILITY_FMA))
845 return (1);
846 break;
847 case PLAT_ECC_INDICTMENT2_MESSAGE:
848 if (plat_ecc_capability_map_sc &
849 PLAT_ECC_CAPABILITY_INDICT2)
850 return (1);
851 break;
852 case PLAT_ECC_DIMM_SID_MESSAGE:
853 if (plat_ecc_capability_map_sc &
854 PLAT_ECC_CAPABILITY_DIMM_SID)
855 return (1);
856 /* FALLTHROUGH */
857 default:
858 return (0);
859 }
860 return (0);
861 }
862
863 int plat_ecc_cap_sc_set_cnt = 0;
864
865 void
plat_ecc_capability_sc_set(uint32_t cap)866 plat_ecc_capability_sc_set(uint32_t cap)
867 {
868 plat_ecc_capability_map_sc = cap;
869
870 if (!plat_ecc_cap_sc_set_cnt && (cap & PLAT_ECC_CAPABILITY_DIMM_SID))
871 if (p2init_sid_cache)
872 p2init_sid_cache();
873
874 plat_ecc_cap_sc_set_cnt++;
875 }
876
877 /*
878 * The following table represents mapping between the indictment1 reason
879 * to its type.
880 */
881
882 static plat_ecc_bl_map_t plat_ecc_bl_map_v1[] = {
883 { "l2cachedata", PLAT_ECC_INDICT_ECACHE_CORRECTABLES },
884 { "l3cachedata", PLAT_ECC_INDICT_ECACHE_CORRECTABLES },
885 { "l2cachedata", PLAT_ECC_INDICT_ECACHE_UNCORRECTABLE },
886 { "l3cachedata", PLAT_ECC_INDICT_ECACHE_UNCORRECTABLE }
887 };
888
889 /*
890 * The following table represents mapping between the indictment2 reason
891 * to its type.
892 */
893
894 static plat_ecc_bl_map_t plat_ecc_bl_map_v2[] = {
895 { "l2cachedata", PLAT_ECC_INDICT2_L2_SERD },
896 { "l3cachedata", PLAT_ECC_INDICT2_L3_SERD },
897 { "l2cachedata", PLAT_ECC_INDICT2_L2_UE },
898 { "l3cachedata", PLAT_ECC_INDICT2_L3_UE },
899 { "l2cachetag", PLAT_ECC_INDICT2_L2_TAG_SERD },
900 { "l3cachetag", PLAT_ECC_INDICT2_L3_TAG_SERD },
901 { "icache", PLAT_ECC_INDICT2_ICACHE_SERD },
902 { "dcache", PLAT_ECC_INDICT2_DCACHE_SERD },
903 { "pcache", PLAT_ECC_INDICT2_PCACHE_SERD },
904 { "itlb", PLAT_ECC_INDICT2_ITLB_SERD },
905 { "dtlb", PLAT_ECC_INDICT2_DTLB_SERD },
906 { "fpu", PLAT_ECC_INDICT2_FPU }
907 };
908
909 /*
910 * The following function returns the indictment type for a given version
911 */
912 static int
flt_name_to_msg_type(const char * fault,int indict_version)913 flt_name_to_msg_type(const char *fault, int indict_version)
914 {
915 plat_ecc_bl_map_t *mapp;
916 char *fltnm = "fault.cpu.";
917 int mapsz;
918 char *p;
919 int i;
920
921 /* Check if it starts with proper fault name */
922 if (strncmp(fault, fltnm, strlen(fltnm)) != 0)
923 return (PLAT_ECC_INDICT_NONE);
924
925 fault += strlen(fltnm); /* c = "ultraSPARC-IV.icache" */
926
927 /* Skip the cpu type */
928 if ((p = strchr(fault, '.')) == NULL)
929 return (PLAT_ECC_INDICT_NONE);
930
931 p++; /* skip the "." */
932
933 if (indict_version == 0) {
934 mapp = plat_ecc_bl_map_v1;
935 mapsz = sizeof (plat_ecc_bl_map_v1) /
936 sizeof (plat_ecc_bl_map_t);
937 } else {
938 mapp = plat_ecc_bl_map_v2;
939 mapsz = sizeof (plat_ecc_bl_map_v2) /
940 sizeof (plat_ecc_bl_map_t);
941 }
942 for (i = 0; i < mapsz; i++) {
943 if (strcmp(p, mapp[i].ebm_reason) == 0) {
944 return (mapp[i].ebm_type);
945 }
946 }
947 return (PLAT_ECC_INDICT_NONE);
948 }
949
950 /*
951 * Blacklisting
952 */
953 int
plat_blacklist(int cmd,const char * scheme,nvlist_t * fmri,const char * class)954 plat_blacklist(int cmd, const char *scheme, nvlist_t *fmri, const char *class)
955 {
956 struct async_flt aflt;
957 char *unum;
958 int msg_type, is_old_indict;
959
960 if (fmri == NULL)
961 return (EINVAL);
962 if (cmd != BLIOC_INSERT)
963 return (ENOTSUP);
964
965 /*
966 * We support both the blacklisting of CPUs via mem-schemed
967 * FMRIs that name E$ J-numbers, and CPUs via cpu-schemed FMRIs
968 * that name the cpuid.
969 */
970 if (strcmp(scheme, FM_FMRI_SCHEME_MEM) == 0) {
971 if (nvlist_lookup_string(fmri, FM_FMRI_MEM_UNUM, &unum))
972 return (EINVAL);
973 aflt.flt_inst = (uint_t)-1;
974 } else if (strcmp(scheme, FM_FMRI_SCHEME_CPU) == 0) {
975 if (nvlist_lookup_uint32(fmri, FM_FMRI_CPU_ID, &aflt.flt_inst))
976 return (EINVAL);
977 unum = NULL;
978 } else {
979 return (ENOTSUP);
980 }
981
982 /*
983 * If the SC cannot handle indictment2, so fall back to old one.
984 * Also if the domain does not support FMA, then send only the old one.
985 */
986
987 is_old_indict = plat_ecc_capability_sc_get(PLAT_ECC_INDICTMENT_MESSAGE);
988
989 if (is_old_indict)
990 msg_type = flt_name_to_msg_type(class, 0);
991 else
992 msg_type = flt_name_to_msg_type(class, 1);
993
994 if (msg_type == PLAT_ECC_INDICT_NONE)
995 return (ENOTSUP);
996
997 /*
998 * The current blacklisting interfaces are designed for a world where
999 * the SC is much more involved in the diagnosis and error reporting
1000 * process than it is in the FMA world. As such, the existing
1001 * interfaces want all kinds of information about the error that's
1002 * triggering the blacklist. In the FMA world, we don't have access
1003 * to any of that information by the time we're doing the blacklist,
1004 * so we fake values.
1005 */
1006 aflt.flt_id = gethrtime();
1007 aflt.flt_addr = -1;
1008 aflt.flt_stat = -1;
1009 aflt.flt_synd = (ushort_t)-1;
1010
1011 if (is_old_indict) {
1012 if (unum && unum[0] != '\0')
1013 return (plat_log_fruid_indictment(msg_type, &aflt,
1014 unum));
1015 else
1016 return (ENOTSUP);
1017 } else {
1018 return (plat_log_fruid_indictment2(msg_type, &aflt, unum));
1019 }
1020 }
1021
1022 static kcondvar_t plat_ecc_condvar;
1023 static kmutex_t plat_ecc_mutex;
1024 static taskq_t *plat_ecc_taskq;
1025
1026 /*
1027 * plat_ecc_dispatch_task: Dispatch the task on a taskq and wait for the
1028 * return value. We use cv_wait_sig to wait for the return values. If a
1029 * signal interrupts us, we return EINTR. Otherwise, we return the value
1030 * returned by the mailbox functions.
1031 *
1032 * To avoid overloading the lower-level mailbox routines, we use a taskq
1033 * to serialize all messages. Currently, it is expected that only one
1034 * process (fmd) will use this ioctl, so the delay caused by the taskq
1035 * should not have much of an effect.
1036 */
1037 int
plat_ecc_dispatch_task(plat_ecc_message_t * msg)1038 plat_ecc_dispatch_task(plat_ecc_message_t *msg)
1039 {
1040 int ret;
1041
1042 ASSERT(msg != NULL);
1043 ASSERT(plat_ecc_taskq != NULL);
1044
1045 if (taskq_dispatch(plat_ecc_taskq, plat_ecc_send_msg,
1046 (void *)msg, TQ_NOSLEEP) == TASKQID_INVALID) {
1047 kmem_free(msg->ecc_msg_data, msg->ecc_msg_len);
1048 kmem_free(msg, sizeof (plat_ecc_message_t));
1049 return (ENOMEM);
1050 }
1051 mutex_enter(&plat_ecc_mutex);
1052
1053 /*
1054 * It's possible that the taskq function completed before we
1055 * acquired the mutex. Check for this first. If this did not
1056 * happen, we wait for the taskq function to signal us, or an
1057 * interrupt. We also check ecc_msg_status to protect against
1058 * spurious wakeups from cv_wait_sig.
1059 */
1060 if (msg->ecc_msg_status == PLAT_ECC_MSG_SENT) {
1061 ret = msg->ecc_msg_ret;
1062 kmem_free(msg->ecc_msg_data, msg->ecc_msg_len);
1063 kmem_free(msg, sizeof (plat_ecc_message_t));
1064 } else {
1065 msg->ecc_msg_status = PLAT_ECC_TASK_DISPATCHED;
1066
1067 while ((ret = cv_wait_sig(&plat_ecc_condvar,
1068 &plat_ecc_mutex)) != 0 &&
1069 msg->ecc_msg_status == PLAT_ECC_TASK_DISPATCHED)
1070 ;
1071
1072 if ((ret == 0) && (msg->ecc_msg_status != PLAT_ECC_MSG_SENT)) {
1073 /* An interrupt was received */
1074 msg->ecc_msg_status = PLAT_ECC_INTERRUPT_RECEIVED;
1075 ret = EINTR;
1076 } else {
1077 ret = msg->ecc_msg_ret;
1078 kmem_free(msg->ecc_msg_data, msg->ecc_msg_len);
1079 kmem_free(msg, sizeof (plat_ecc_message_t));
1080 }
1081 }
1082 mutex_exit(&plat_ecc_mutex);
1083 return (ret);
1084 }
1085
1086 static void
plat_ecc_send_msg(void * arg)1087 plat_ecc_send_msg(void *arg)
1088 {
1089 plat_ecc_message_t *msg = arg;
1090 int ret;
1091
1092 /*
1093 * Send this data off as a mailbox message to the SC.
1094 */
1095 ret = plat_send_ecc_mailbox_msg(msg->ecc_msg_type, msg->ecc_msg_data);
1096
1097 mutex_enter(&plat_ecc_mutex);
1098
1099 /*
1100 * If the dispatching function received an interrupt, don't bother
1101 * signalling it, and throw away the results. Otherwise, set the
1102 * return value and signal the condvar.
1103 */
1104 if (msg->ecc_msg_status == PLAT_ECC_INTERRUPT_RECEIVED) {
1105 kmem_free(msg->ecc_msg_data, msg->ecc_msg_len);
1106 kmem_free(msg, sizeof (plat_ecc_message_t));
1107 } else {
1108 msg->ecc_msg_ret = ret;
1109 msg->ecc_msg_status = PLAT_ECC_MSG_SENT;
1110 cv_broadcast(&plat_ecc_condvar);
1111 }
1112
1113 mutex_exit(&plat_ecc_mutex);
1114 }
1115
1116 void
plat_ecc_init(void)1117 plat_ecc_init(void)
1118 {
1119 int bd;
1120
1121 mutex_init(&plat_ecc_mutex, NULL, MUTEX_DEFAULT, NULL);
1122 cv_init(&plat_ecc_condvar, NULL, CV_DEFAULT, NULL);
1123 plat_ecc_taskq = taskq_create("plat_ecc_taskq", 1, minclsyspri,
1124 PLAT_ECC_TASKQ_MIN, PLAT_ECC_TASKQ_MAX, TASKQ_PREPOPULATE);
1125 ASSERT(plat_ecc_taskq != NULL);
1126
1127 for (bd = 0; bd < plat_max_cpumem_boards(); bd++) {
1128 mutex_init(&domain_dimm_sids[bd].pdsb_lock,
1129 NULL, MUTEX_DEFAULT, NULL);
1130 }
1131
1132 }
1133