1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/plat_ecc_unum.h> 29 #include <sys/utsname.h> 30 #include <sys/cmn_err.h> 31 #include <sys/async.h> 32 #include <sys/errno.h> 33 #include <sys/fm/protocol.h> 34 #include <sys/fm/cpu/UltraSPARC-III.h> 35 #include <sys/bl.h> 36 #include <sys/taskq.h> 37 #include <sys/condvar.h> 38 #include <sys/plat_ecc_dimm.h> 39 40 /* 41 * Pointer to platform specific function to initialize a cache of DIMM 42 * serial ids 43 */ 44 int (*p2init_sid_cache)(void); 45 46 /* 47 * This file contains the common code that is used for parsing 48 * ecc unum data and logging it appropriately as the platform 49 * that calls this code implements. 50 */ 51 52 int plat_ecc_dispatch_task(plat_ecc_message_t *); 53 static void plat_ecc_send_msg(void *); 54 55 #define CHECK_UNUM \ 56 if (unum_ptr == NULL) { \ 57 break; \ 58 } 59 60 /* 61 * See plat_ecc_unum.h for the meaning of these variables. 62 */ 63 int ecc_log_fruid_enable = ECC_FRUID_ENABLE_DEFAULT; 64 65 uint32_t plat_ecc_capability_map_domain = PLAT_ECC_CAPABILITY_DOMAIN_DEFAULT; 66 uint32_t plat_ecc_capability_map_sc = PLAT_ECC_CAPABILITY_SC_DEFAULT; 67 uint16_t ecc_error2_mailbox_flags = PLAT_ECC_ERROR2_SEND_DEFAULT; 68 uint16_t ecc_indictment2_mailbox_flags = PLAT_ECC_SEND_INDICT2_DEFAULT; 69 70 /* 71 * We log all ECC errors using the function that is defined as 72 * plat_send_ecc_mailbox_msg(); We first parse the unum string and 73 * then pass the data to be logged to the plat_send_ecc_mailbox_msg 74 * function for logging. Each platform that uses this code needs to 75 * implement a suitable function for this purpose. 76 */ 77 void 78 plat_log_fruid_error(int synd_code, struct async_flt *ecc, char *unum, 79 uint64_t afsr_bit) 80 { 81 plat_ecc_error_data_t ecc_error_data; 82 enum plat_ecc_type ecc_type = PLAT_ECC_UNKNOWN; 83 int board_num; 84 int proc_position; 85 int invalid_unum = 1; 86 87 bzero(&ecc_error_data, sizeof (plat_ecc_error_data_t)); 88 ecc_error_data.version = PLAT_ECC_VERSION; 89 90 switch (afsr_bit) { 91 case C_AFSR_CE: 92 ecc_error_data.error_code = PLAT_ERROR_CODE_CE; 93 break; 94 case C_AFSR_UE: 95 ecc_error_data.error_code = PLAT_ERROR_CODE_UE; 96 break; 97 case C_AFSR_EDC: 98 ecc_error_data.error_code = PLAT_ERROR_CODE_EDC; 99 break; 100 case C_AFSR_EDU: 101 ecc_error_data.error_code = PLAT_ERROR_CODE_EDU; 102 break; 103 case C_AFSR_WDC: 104 ecc_error_data.error_code = PLAT_ERROR_CODE_WDC; 105 break; 106 case C_AFSR_WDU: 107 ecc_error_data.error_code = PLAT_ERROR_CODE_WDU; 108 break; 109 case C_AFSR_CPC: 110 ecc_error_data.error_code = PLAT_ERROR_CODE_CPC; 111 break; 112 case C_AFSR_CPU: 113 ecc_error_data.error_code = PLAT_ERROR_CODE_CPU; 114 break; 115 case C_AFSR_UCC: 116 ecc_error_data.error_code = PLAT_ERROR_CODE_UCC; 117 break; 118 case C_AFSR_UCU: 119 ecc_error_data.error_code = PLAT_ERROR_CODE_UCU; 120 break; 121 case C_AFSR_EMC: 122 ecc_error_data.error_code = PLAT_ERROR_CODE_EMC; 123 break; 124 case C_AFSR_EMU: 125 ecc_error_data.error_code = PLAT_ERROR_CODE_EMU; 126 break; 127 default: 128 /* 129 * Do not send messages with unknown error codes, since 130 * the SC will not be able to tell what type of error 131 * occurred. 132 */ 133 return; 134 } 135 136 ecc_error_data.detecting_proc = ecc->flt_bus_id; 137 138 if (ecc->flt_in_memory) 139 ecc_type = PLAT_ECC_MEMORY; 140 else if (ecc->flt_status & ECC_ECACHE) 141 ecc_type = PLAT_ECC_ECACHE; 142 143 switch (ecc_type) { 144 case PLAT_ECC_MEMORY: { 145 /* 146 * The unum string is expected to be in this form: 147 * "/N0/SB12/P0/B0/D2 J13500, ..." 148 * for serengeti. As this code is shared with Starcat 149 * if N is missing then it is set to 0. 150 * From that we will extract the bank number, dimm 151 * number, and Jnumber. 152 */ 153 char *unum_ptr = unum; 154 char *jno_ptr = ecc_error_data.Jnumber; 155 int i; 156 157 /* 158 * On Serengeti we expect to find 'N' in the unum string 159 * however, on Starcat 'N' does not appear in the unum string. 160 * We do not want this code to break at this point, so the 161 * unum_ptr is reset to the start of unum string if we fail 162 * to find an 'N'. 163 */ 164 unum_ptr = strchr(unum_ptr, 'N'); 165 if (unum_ptr == NULL) { 166 ecc_error_data.node_no = 0; 167 unum_ptr = unum; 168 } else { 169 unum_ptr++; 170 ecc_error_data.node_no = stoi(&unum_ptr); 171 } 172 173 /* 174 * Now pull out the SB number 175 */ 176 unum_ptr = strstr(unum_ptr, "SB"); 177 CHECK_UNUM; 178 unum_ptr += 2; 179 board_num = stoi(&unum_ptr); 180 181 /* 182 * Now pull out the Proc position (relative to the board) 183 */ 184 unum_ptr = strchr(unum_ptr, 'P'); 185 CHECK_UNUM; 186 unum_ptr++; 187 proc_position = stoi(&unum_ptr); 188 189 /* 190 * Using the SB number and Proc position we create a FRU 191 * cpu id. 192 */ 193 ecc_error_data.proc_num = 194 plat_make_fru_cpuid(board_num, 0, proc_position); 195 196 /* 197 * Now pull out the Memory Bank number 198 */ 199 unum_ptr = strchr(unum_ptr, 'B'); 200 CHECK_UNUM; 201 unum_ptr++; 202 ecc_error_data.bank_no = (stoi(&unum_ptr) & 0x01); 203 204 /* 205 * Now pull out the Dimm number within the Memory Bank. 206 */ 207 unum_ptr = strchr(unum_ptr, 'D'); 208 CHECK_UNUM; 209 unum_ptr++; 210 ecc_error_data.ecache_dimm_no = (stoi(&unum_ptr) & 0x03); 211 212 /* 213 * Now pull out the J-number. 214 */ 215 unum_ptr = strchr(unum_ptr, 'J'); 216 CHECK_UNUM; 217 unum_ptr++; 218 for (i = PLAT_ECC_JNUMBER_LENGTH; 219 i > 0 && *unum_ptr >= '0' && *unum_ptr <= '9'; i--) 220 *jno_ptr++ = *unum_ptr++; 221 *jno_ptr = NULL; 222 223 /* 224 * If we get here, we can assume the unum is valid 225 */ 226 invalid_unum = 0; 227 break; 228 } 229 case PLAT_ECC_ECACHE: { 230 /* 231 * The unum string is expected to be in this form: 232 * "[/N0/][SB|IO]12/P0/E0 J13500, ..." 233 * for serengeti. As this code is shared with Starcat 234 * if N is missing then it is set to 0. IO may only appear 235 * on Starcats. From that we will extract the bank number, 236 * dimm number, and Jnumber. 237 */ 238 char *unum_ptr = unum; 239 char *jno_ptr = ecc_error_data.Jnumber; 240 int is_maxcat = 0; 241 int i; 242 243 /* 244 * On Serengeti we expect to find 'N' in the unum string 245 * however, on Starcat 'N' does not appear in the unum string. 246 * We do not want this code to break at this point, so the 247 * unum_ptr is reset to the start of unum string if we fail 248 * to find an 'N'. 249 */ 250 unum_ptr = strchr(unum_ptr, 'N'); 251 if (unum_ptr == NULL) { 252 ecc_error_data.node_no = 0; 253 unum_ptr = unum; 254 } else { 255 unum_ptr++; 256 ecc_error_data.node_no = stoi(&unum_ptr); 257 } 258 259 /* 260 * Now pull out the SB/IO number 261 */ 262 unum_ptr = strstr(unum_ptr, "SB"); 263 if (unum_ptr == NULL) { 264 265 /* 266 * Since this is an E$ error, it must have occurred on 267 * either a System Board (represented by "SB" in the 268 * unum string) or a Maxcat board ("IO" in the unum 269 * string). Since we failed the "SB" check, we'll 270 * assume this is a maxcat board. 271 */ 272 is_maxcat = 1; 273 unum_ptr = strstr(unum, "IO"); 274 } 275 CHECK_UNUM; 276 unum_ptr += 2; 277 board_num = stoi(&unum_ptr); 278 279 /* 280 * Now pull out the Proc position (relative to the board) 281 */ 282 unum_ptr = strchr(unum_ptr, 'P'); 283 CHECK_UNUM; 284 unum_ptr++; 285 proc_position = stoi(&unum_ptr); 286 287 /* 288 * Using the SB/IO number, slot 0/1 value (is_maxcat), and 289 * proc position, we create the cpu id. 290 */ 291 ecc_error_data.proc_num = plat_make_fru_cpuid(board_num, 292 is_maxcat, proc_position); 293 294 ecc_error_data.bank_no = 0; /* not used */ 295 296 unum_ptr = strchr(unum_ptr, 'E'); 297 CHECK_UNUM; 298 unum_ptr++; 299 ecc_error_data.ecache_dimm_no = (stoi(&unum_ptr) & 0x01); 300 301 unum_ptr = strchr(unum_ptr, 'J'); 302 CHECK_UNUM; 303 unum_ptr++; 304 for (i = PLAT_ECC_JNUMBER_LENGTH; 305 i > 0 && *unum_ptr >= '0' && *unum_ptr <= '9'; i--) 306 *jno_ptr++ = *unum_ptr++; 307 *jno_ptr = NULL; 308 309 /* 310 * If we get here, we can assume the unum is valid 311 */ 312 invalid_unum = 0; 313 break; 314 } 315 default: 316 /* 317 * Unknown error 318 */ 319 break; 320 } 321 322 /* 323 * This is where CHECK_UNUM goes when it finds an error 324 */ 325 326 if (ECC_SYND_DATA_BEGIN <= synd_code && 327 synd_code < ECC_SYND_ECC_BEGIN) { 328 ecc_error_data.error_type = PLAT_ERROR_TYPE_SINGLE; 329 ecc_error_data.databit_type = PLAT_BIT_TYPE_DATA; 330 ecc_error_data.databit_no = synd_code; 331 } else if (ECC_SYND_ECC_BEGIN <= synd_code && 332 synd_code < ECC_SYND_MTAG_BEGIN) { 333 ecc_error_data.error_type = PLAT_ERROR_TYPE_SINGLE; 334 ecc_error_data.databit_type = PLAT_BIT_TYPE_ECC; 335 ecc_error_data.databit_no = synd_code - ECC_SYND_ECC_BEGIN; 336 } else if (ECC_SYND_MTAG_BEGIN <= synd_code && 337 synd_code < ECC_SYND_MECC_BEGIN) { 338 ecc_error_data.error_type = PLAT_ERROR_TYPE_SINGLE; 339 ecc_error_data.databit_type = PLAT_BIT_TYPE_MTAG_D; 340 ecc_error_data.databit_no = synd_code - ECC_SYND_MTAG_BEGIN; 341 } else if (ECC_SYND_MECC_BEGIN <= synd_code && 342 synd_code < ECC_SYND_M2) { 343 ecc_error_data.error_type = PLAT_ERROR_TYPE_SINGLE; 344 ecc_error_data.databit_type = PLAT_BIT_TYPE_MTAG_E; 345 ecc_error_data.databit_no = synd_code - ECC_SYND_MECC_BEGIN; 346 } else { 347 switch (synd_code) { 348 case ECC_SYND_M2: 349 ecc_error_data.error_type = PLAT_ERROR_TYPE_M2; 350 break; 351 case ECC_SYND_M3: 352 ecc_error_data.error_type = PLAT_ERROR_TYPE_M3; 353 break; 354 case ECC_SYND_M4: 355 ecc_error_data.error_type = PLAT_ERROR_TYPE_M4; 356 break; 357 case ECC_SYND_M: 358 ecc_error_data.error_type = PLAT_ERROR_TYPE_M; 359 break; 360 default: 361 ecc_error_data.error_type = PLAT_ERROR_TYPE_UNK; 362 break; 363 } 364 ecc_error_data.databit_type = PLAT_BIT_TYPE_MULTI; 365 ecc_error_data.databit_no = 0; /* not used */ 366 } 367 368 #ifdef DEBUG 369 if (invalid_unum && 370 (ecc_error_data.error_code != PLAT_ERROR_CODE_UE) && 371 unum && *unum) 372 cmn_err(CE_WARN, "Unexpected unum string format: %s\n", unum); 373 #endif 374 375 /* 376 * Send this data off as a mailbox message to the SC. 377 */ 378 (void) plat_send_ecc_mailbox_msg(PLAT_ECC_ERROR_MESSAGE, 379 &ecc_error_data); 380 } 381 382 /* 383 * The unum string for memory is expected to be in this form: 384 * "[/N0/]SB12/P0/B0/D2 [J13500]" 385 * Or if the unum was generated as the result of a UE: 386 * "[/N0/]SB12/P0/B0 [J13500, ...]" 387 * From that we will extract the board number, processor position, 388 * bank number and jnumber. 389 * 390 * Return (1) for an invalid unum string. If the unum is for an 391 * individual DIMM and there is no jnumber, jnumber will be set 392 * to -1 and the caller can decide if the unum is valid. This 393 * is because Serengeti does not have jnumbers for bank unums 394 * which may be used to create DIMM unums (e.g. for acquiring 395 * DIMM serial ids). 396 */ 397 398 int 399 parse_unum_memory(char *unum, int *board, int *pos, int *bank, int *dimm, 400 int *jnumber) 401 { 402 char *c; 403 404 if ((c = strstr(unum, "SB")) == NULL) 405 return (1); 406 c += 2; 407 *board = (uint8_t)stoi(&c); 408 409 if (*c++ != '/' || *c++ != 'P') 410 return (1); 411 *pos = stoi(&c); 412 413 if (*c++ != '/' || *c++ != 'B') 414 return (1); 415 *bank = stoi(&c); 416 417 if ((c = strchr(c, 'D')) == NULL) { 418 *dimm = -1; 419 *jnumber = 0; 420 return (0); 421 } 422 c++; 423 *dimm = stoi(&c); 424 425 if ((c = strchr(c, 'J')) == NULL) { 426 *jnumber = -1; 427 return (0); 428 } 429 430 c++; 431 *jnumber = (uint16_t)stoi(&c); 432 433 return (0); 434 } 435 436 /* 437 * The unum string for ecache is expected to be in this form: 438 * "[/N0/][SB|IO]12/P0/E0 J13500, ..." 439 * From that we will extract the board number, processor position and 440 * junmber. 441 * 442 * return (1) for any invalid unum string. 443 */ 444 static int 445 parse_unum_ecache(char *unum, int *board, int *pos, int *jnumber, int *maxcat) 446 { 447 char *c; 448 449 if ((c = strstr(unum, "SB")) == NULL) { 450 /* 451 * Since this is an E$ error, it must have occurred on 452 * either a System Board (represented by "SB" in the 453 * unum string) or a Maxcat board ("IO" in the unum 454 * string). 455 */ 456 if ((c = strstr(unum, "IO")) == NULL) 457 return (1); 458 *maxcat = 1; 459 } 460 461 c += 2; 462 *board = (uint8_t)stoi(&c); 463 464 if (*c++ != '/' || *c++ != 'P') 465 return (1); 466 *pos = stoi(&c); 467 468 if ((c = strchr(c, 'J')) == NULL) 469 return (1); 470 471 c++; 472 *jnumber = (uint16_t)stoi(&c); 473 474 return (0); 475 } 476 477 /* The following array maps the error to its corresponding set */ 478 static int plat_ecc_e2d_map[PLAT_ECC_ERROR2_NUMVALS] = { 479 PLAT_ECC_ERROR2_NONE, /* 0x00 */ 480 PLAT_ECC_ERROR2_SEND_L2_XXC, /* 0x01 */ 481 PLAT_ECC_ERROR2_SEND_L2_XXU, /* 0x02 */ 482 PLAT_ECC_ERROR2_SEND_L3_XXC, /* 0x03 */ 483 PLAT_ECC_ERROR2_SEND_L3_XXU, /* 0x04 */ 484 PLAT_ECC_ERROR2_SEND_MEM_ERRS, /* 0x05 */ 485 PLAT_ECC_ERROR2_SEND_MEM_ERRS, /* 0x06 */ 486 PLAT_ECC_ERROR2_SEND_MEM_ERRS, /* 0x07 */ 487 PLAT_ECC_ERROR2_SEND_BUS_ERRS, /* 0x08 */ 488 PLAT_ECC_ERROR2_SEND_BUS_ERRS, /* 0x09 */ 489 PLAT_ECC_ERROR2_SEND_BUS_ERRS, /* 0x0a */ 490 PLAT_ECC_ERROR2_SEND_BUS_ERRS, /* 0x0b */ 491 PLAT_ECC_ERROR2_SEND_L2_TAG_ERRS, /* 0x0c */ 492 PLAT_ECC_ERROR2_SEND_L2_TAG_ERRS, /* 0x0d */ 493 PLAT_ECC_ERROR2_SEND_L3_TAG_ERRS, /* 0x0e */ 494 PLAT_ECC_ERROR2_SEND_L3_TAG_ERRS, /* 0x0f */ 495 PLAT_ECC_ERROR2_SEND_L1_PARITY, /* 0x10 */ 496 PLAT_ECC_ERROR2_SEND_L1_PARITY, /* 0x11 */ 497 PLAT_ECC_ERROR2_SEND_TLB_PARITY, /* 0x12 */ 498 PLAT_ECC_ERROR2_SEND_TLB_PARITY, /* 0x13 */ 499 PLAT_ECC_ERROR2_SEND_IV_ERRS, /* 0x14 */ 500 PLAT_ECC_ERROR2_SEND_IV_ERRS, /* 0x15 */ 501 PLAT_ECC_ERROR2_SEND_MTAG_XXC, /* 0x16 */ 502 PLAT_ECC_ERROR2_SEND_IV_MTAG_XXC, /* 0x17 */ 503 PLAT_ECC_ERROR2_SEND_L3_XXC, /* 0x18 */ 504 PLAT_ECC_ERROR2_SEND_PCACHE /* 0x19 */ 505 }; 506 507 /* 508 * log enhanced error information to SC. 509 */ 510 void 511 plat_log_fruid_error2(int msg_type, char *unum, struct async_flt *aflt, 512 plat_ecc_ch_async_flt_t *ecc_ch_flt) 513 { 514 plat_ecc_error2_data_t e2d = {0}; 515 int board, pos, bank, dimm, jnumber; 516 int maxcat = 0; 517 uint16_t flags; 518 519 /* Check the flags */ 520 flags = plat_ecc_e2d_map[msg_type]; 521 if ((ecc_error2_mailbox_flags & flags) == 0) 522 return; 523 524 /* Fill the header */ 525 e2d.ee2d_major_version = PLAT_ECC_ERROR2_VERSION_MAJOR; 526 e2d.ee2d_minor_version = PLAT_ECC_ERROR2_VERSION_MINOR; 527 e2d.ee2d_msg_type = PLAT_ECC_ERROR2_MESSAGE; 528 e2d.ee2d_msg_length = sizeof (plat_ecc_error2_data_t); 529 530 /* Fill the data */ 531 if (aflt->flt_in_memory) { 532 if (parse_unum_memory(unum, &board, &pos, &bank, &dimm, 533 &jnumber) || (dimm != -1 && jnumber == -1)) 534 return; 535 /* 536 * Using the SB number and Proc position we create a FRU 537 * cpu id. 538 */ 539 e2d.ee2d_owning_proc = plat_make_fru_cpuid(board, 0, pos); 540 e2d.ee2d_jnumber = jnumber; 541 e2d.ee2d_bank_number = bank; 542 } else if (aflt->flt_status & ECC_ECACHE) { 543 if (parse_unum_ecache(unum, &board, &pos, &jnumber, &maxcat)) 544 return; 545 /* 546 * Using the SB number and Proc position we create a FRU 547 * cpu id. 548 */ 549 e2d.ee2d_owning_proc = plat_make_fru_cpuid(board, maxcat, pos); 550 e2d.ee2d_jnumber = jnumber; 551 e2d.ee2d_bank_number = (uint8_t)-1; 552 } else { 553 /* 554 * L1 Cache 555 */ 556 e2d.ee2d_owning_proc = aflt->flt_bus_id; 557 e2d.ee2d_jnumber = (uint16_t)-1; 558 e2d.ee2d_bank_number = (uint8_t)-1; 559 } 560 561 e2d.ee2d_type = (uint8_t)msg_type; 562 e2d.ee2d_afar_status = (uint8_t)ecc_ch_flt->ecaf_afar_status; 563 e2d.ee2d_synd_status = (uint8_t)ecc_ch_flt->ecaf_synd_status; 564 e2d.ee2d_detecting_proc = aflt->flt_bus_id; 565 e2d.ee2d_cpu_impl = cpunodes[e2d.ee2d_owning_proc].implementation; 566 e2d.ee2d_timestamp = aflt->flt_id; 567 e2d.ee2d_afsr = aflt->flt_stat; 568 e2d.ee2d_afar = aflt->flt_addr; 569 570 e2d.ee2d_sdw_afsr = ecc_ch_flt->ecaf_sdw_afsr; 571 e2d.ee2d_sdw_afar = ecc_ch_flt->ecaf_sdw_afar; 572 e2d.ee2d_afsr_ext = ecc_ch_flt->ecaf_afsr_ext; 573 e2d.ee2d_sdw_afsr_ext = ecc_ch_flt->ecaf_sdw_afsr_ext; 574 575 /* Send the message to SC */ 576 (void) plat_send_ecc_mailbox_msg(PLAT_ECC_ERROR2_MESSAGE, &e2d); 577 } 578 579 uint8_t ecc_indictment_mailbox_disable = PLAT_ECC_INDICTMENT_OK; 580 uint8_t ecc_indictment_mailbox_flags = PLAT_ECC_SEND_DEFAULT_INDICT; 581 582 /* 583 * We log all Solaris indictments of failing hardware. We pull the system 584 * board number and jnumber out of the unum string, and calculate the cpuid 585 * from some members of the unum string. The rest of the structure is filled 586 * in through the other arguments. The data structure is then passed to 587 * plat_ecc_dispatch_task(). This function should only be loaded into memory 588 * or called on platforms that define a plat_send_ecc_mailbox_msg() function. 589 */ 590 static int 591 plat_log_fruid_indictment(int msg_type, struct async_flt *aflt, char *unum) 592 { 593 plat_ecc_message_t *wrapperp; 594 plat_ecc_indict_msg_contents_t *contentsp; 595 char *unum_ptr; 596 int is_maxcat = 0; 597 598 switch (ecc_indictment_mailbox_disable) { 599 case (PLAT_ECC_INDICTMENT_OK): 600 case (PLAT_ECC_INDICTMENT_SUSPECT): 601 break; 602 case (PLAT_ECC_INDICTMENT_NO_SEND): 603 default: 604 return (ECONNREFUSED); 605 } 606 607 switch (msg_type) { 608 case (PLAT_ECC_INDICT_DIMM): 609 if ((ecc_indictment_mailbox_flags & 610 PLAT_ECC_SEND_DIMM_INDICT) == 0) 611 return (ECONNREFUSED); 612 break; 613 case (PLAT_ECC_INDICT_ECACHE_CORRECTABLES): 614 if ((ecc_indictment_mailbox_flags & 615 PLAT_ECC_SEND_ECACHE_XXC_INDICT) == 0) 616 return (ECONNREFUSED); 617 break; 618 case (PLAT_ECC_INDICT_ECACHE_UNCORRECTABLE): 619 if ((ecc_indictment_mailbox_flags & 620 PLAT_ECC_SEND_ECACHE_XXU_INDICT) == 0) 621 return (ECONNREFUSED); 622 break; 623 default: 624 return (ECONNREFUSED); 625 } 626 627 /* LINTED: E_TRUE_LOGICAL_EXPR */ 628 ASSERT(sizeof (plat_ecc_indictment_data_t) == PLAT_ECC_INDICT_SIZE); 629 630 wrapperp = (plat_ecc_message_t *) 631 kmem_zalloc(sizeof (plat_ecc_message_t), KM_SLEEP); 632 633 wrapperp->ecc_msg_status = PLAT_ECC_NO_MSG_ACTIVE; 634 wrapperp->ecc_msg_type = PLAT_ECC_INDICTMENT_MESSAGE; 635 wrapperp->ecc_msg_len = sizeof (plat_ecc_indictment_data_t); 636 wrapperp->ecc_msg_data = kmem_zalloc(wrapperp->ecc_msg_len, KM_SLEEP); 637 638 contentsp = &(((plat_ecc_indictment_data_t *) 639 wrapperp->ecc_msg_data)->msg_contents); 640 641 /* 642 * Find board_num, jnumber, and proc position from the unum string. 643 * Use the board number, is_maxcat, and proc position to calculate 644 * cpuid. 645 */ 646 unum_ptr = strstr(unum, "SB"); 647 if (unum_ptr == NULL) { 648 is_maxcat = 1; 649 unum_ptr = strstr(unum, "IO"); 650 if (unum_ptr == NULL) { 651 kmem_free(wrapperp->ecc_msg_data, 652 wrapperp->ecc_msg_len); 653 kmem_free(wrapperp, sizeof (plat_ecc_message_t)); 654 return (EINVAL); 655 } 656 } 657 unum_ptr += 2; 658 contentsp->board_num = (uint8_t)stoi(&unum_ptr); 659 660 unum_ptr = strchr(unum_ptr, 'P'); 661 if (unum_ptr == NULL) { 662 kmem_free(wrapperp->ecc_msg_data, wrapperp->ecc_msg_len); 663 kmem_free(wrapperp, sizeof (plat_ecc_message_t)); 664 return (EINVAL); 665 } 666 unum_ptr++; 667 contentsp->detecting_proc = 668 (uint16_t)plat_make_fru_cpuid(contentsp->board_num, is_maxcat, 669 stoi(&unum_ptr)); 670 671 unum_ptr = strchr(unum_ptr, 'J'); 672 if (unum_ptr == NULL) { 673 kmem_free(wrapperp->ecc_msg_data, wrapperp->ecc_msg_len); 674 kmem_free(wrapperp, sizeof (plat_ecc_message_t)); 675 return (EINVAL); 676 } 677 unum_ptr++; 678 contentsp->jnumber = (uint16_t)stoi(&unum_ptr); 679 680 /* 681 * Fill in the rest of the data 682 */ 683 contentsp->version = PLAT_ECC_INDICTMENT_VERSION; 684 contentsp->indictment_type = msg_type; 685 contentsp->indictment_uncertain = ecc_indictment_mailbox_disable; 686 contentsp->syndrome = aflt->flt_synd; 687 contentsp->afsr = aflt->flt_stat; 688 contentsp->afar = aflt->flt_addr; 689 690 /* 691 * Build the solaris_version string: 692 */ 693 (void) snprintf(contentsp->solaris_version, 694 PLAT_ECC_VERSION_LENGTH, "%s %s", utsname.release, utsname.version); 695 696 /* 697 * Send the data on to the queuing function 698 */ 699 return (plat_ecc_dispatch_task(wrapperp)); 700 } 701 702 /* The following array maps the indictment to its corresponding set */ 703 static int plat_ecc_i2d_map[PLAT_ECC_INDICT2_NUMVALS] = { 704 PLAT_ECC_INDICT2_NONE, /* 0x00 */ 705 PLAT_ECC_SEND_INDICT2_L2_XXU, /* 0x01 */ 706 PLAT_ECC_SEND_INDICT2_L2_XXC_SERD, /* 0x02 */ 707 PLAT_ECC_SEND_INDICT2_L2_TAG_SERD, /* 0x03 */ 708 PLAT_ECC_SEND_INDICT2_L3_XXU, /* 0x04 */ 709 PLAT_ECC_SEND_INDICT2_L3_XXC_SERD, /* 0x05 */ 710 PLAT_ECC_SEND_INDICT2_L3_TAG_SERD, /* 0x06 */ 711 PLAT_ECC_SEND_INDICT2_L1_SERD, /* 0x07 */ 712 PLAT_ECC_SEND_INDICT2_L1_SERD, /* 0x08 */ 713 PLAT_ECC_SEND_INDICT2_TLB_SERD, /* 0x09 */ 714 PLAT_ECC_SEND_INDICT2_TLB_SERD, /* 0x0a */ 715 PLAT_ECC_SEND_INDICT2_FPU, /* 0x0b */ 716 PLAT_ECC_SEND_INDICT2_PCACHE_SERD /* 0x0c */ 717 }; 718 719 static int 720 plat_log_fruid_indictment2(int msg_type, struct async_flt *aflt, char *unum) 721 { 722 plat_ecc_message_t *wrapperp; 723 plat_ecc_indictment2_data_t *i2d; 724 int board, pos, jnumber; 725 int maxcat = 0; 726 uint16_t flags; 727 728 /* 729 * If the unum is null or empty, skip parsing it 730 */ 731 if (unum && unum[0] != '\0') { 732 if (parse_unum_ecache(unum, &board, &pos, &jnumber, &maxcat)) 733 return (EINVAL); 734 } 735 736 if ((ecc_indictment_mailbox_disable != PLAT_ECC_INDICTMENT_OK) && 737 (ecc_indictment_mailbox_disable != PLAT_ECC_INDICTMENT_SUSPECT)) 738 return (ECONNREFUSED); 739 740 /* Check the flags */ 741 flags = plat_ecc_i2d_map[msg_type]; 742 if ((ecc_indictment2_mailbox_flags & flags) == 0) 743 return (ECONNREFUSED); 744 745 wrapperp = (plat_ecc_message_t *) 746 kmem_zalloc(sizeof (plat_ecc_message_t), KM_SLEEP); 747 748 /* Initialize the wrapper */ 749 wrapperp->ecc_msg_status = PLAT_ECC_NO_MSG_ACTIVE; 750 wrapperp->ecc_msg_type = PLAT_ECC_INDICTMENT2_MESSAGE; 751 wrapperp->ecc_msg_len = sizeof (plat_ecc_indictment2_data_t); 752 wrapperp->ecc_msg_data = kmem_zalloc(wrapperp->ecc_msg_len, KM_SLEEP); 753 754 i2d = (plat_ecc_indictment2_data_t *)wrapperp->ecc_msg_data; 755 756 /* Fill the header */ 757 i2d->ei2d_major_version = PLAT_ECC_INDICT2_MAJOR_VERSION; 758 i2d->ei2d_minor_version = PLAT_ECC_INDICT2_MINOR_VERSION; 759 i2d->ei2d_msg_type = PLAT_ECC_INDICTMENT2_MESSAGE; 760 i2d->ei2d_msg_length = sizeof (plat_ecc_indictment2_data_t); 761 762 /* Fill the data */ 763 if (unum && unum[0] != '\0') { 764 i2d->ei2d_arraigned_proc = plat_make_fru_cpuid(board, maxcat, 765 pos); 766 i2d->ei2d_board_num = board; 767 i2d->ei2d_jnumber = jnumber; 768 } else { 769 i2d->ei2d_arraigned_proc = aflt->flt_inst; 770 i2d->ei2d_board_num = (uint8_t) 771 plat_make_fru_boardnum(i2d->ei2d_arraigned_proc); 772 i2d->ei2d_jnumber = (uint16_t)-1; 773 } 774 775 i2d->ei2d_type = msg_type; 776 i2d->ei2d_uncertain = ecc_indictment_mailbox_disable; 777 i2d->ei2d_cpu_impl = cpunodes[i2d->ei2d_arraigned_proc].implementation; 778 i2d->ei2d_timestamp = aflt->flt_id; 779 780 /* 781 * Send the data on to the queuing function 782 */ 783 return (plat_ecc_dispatch_task(wrapperp)); 784 } 785 786 int 787 plat_ecc_capability_send(void) 788 { 789 plat_ecc_message_t *wrapperp; 790 plat_capability_data_t *cap; 791 int ver_len; 792 793 wrapperp = kmem_zalloc(sizeof (plat_ecc_message_t), KM_SLEEP); 794 795 ver_len = strlen(utsname.release) + strlen(utsname.version) + 2; 796 797 /* Initialize the wrapper */ 798 wrapperp->ecc_msg_status = PLAT_ECC_NO_MSG_ACTIVE; 799 wrapperp->ecc_msg_type = PLAT_ECC_CAPABILITY_MESSAGE; 800 wrapperp->ecc_msg_len = sizeof (plat_capability_data_t) + ver_len; 801 wrapperp->ecc_msg_data = kmem_zalloc(wrapperp->ecc_msg_len, KM_SLEEP); 802 803 cap = (plat_capability_data_t *)wrapperp->ecc_msg_data; 804 805 /* Fill the header */ 806 cap->capd_major_version = PLAT_ECC_CAP_VERSION_MAJOR; 807 cap->capd_minor_version = PLAT_ECC_CAP_VERSION_MINOR; 808 cap->capd_msg_type = PLAT_ECC_CAPABILITY_MESSAGE; 809 cap->capd_msg_length = wrapperp->ecc_msg_len; 810 811 /* Set the default domain capability */ 812 cap->capd_capability = PLAT_ECC_CAPABILITY_DOMAIN_DEFAULT; 813 814 /* 815 * Build the solaris_version string: 816 * utsname.release + " " + utsname.version 817 */ 818 (void) snprintf(cap->capd_solaris_version, ver_len, "%s %s", 819 utsname.release, utsname.version); 820 821 /* 822 * Send the data on to the queuing function 823 */ 824 return (plat_ecc_dispatch_task(wrapperp)); 825 } 826 827 int 828 plat_ecc_capability_sc_get(int type) 829 { 830 switch (type) { 831 case PLAT_ECC_ERROR_MESSAGE: 832 if (ecc_log_fruid_enable && 833 (!(plat_ecc_capability_map_sc & 834 PLAT_ECC_CAPABILITY_ERROR2))) 835 return (1); 836 break; 837 case PLAT_ECC_ERROR2_MESSAGE: 838 if (plat_ecc_capability_map_sc & 839 PLAT_ECC_CAPABILITY_ERROR2) 840 return (1); 841 break; 842 case PLAT_ECC_INDICTMENT_MESSAGE: 843 if (!(plat_ecc_capability_map_sc & 844 PLAT_ECC_CAPABILITY_INDICT2) || 845 !(plat_ecc_capability_map_domain & 846 PLAT_ECC_CAPABILITY_FMA)) 847 return (1); 848 break; 849 case PLAT_ECC_INDICTMENT2_MESSAGE: 850 if (plat_ecc_capability_map_sc & 851 PLAT_ECC_CAPABILITY_INDICT2) 852 return (1); 853 break; 854 case PLAT_ECC_DIMM_SID_MESSAGE: 855 if (plat_ecc_capability_map_sc & 856 PLAT_ECC_CAPABILITY_DIMM_SID) 857 return (1); 858 default: 859 return (0); 860 } 861 return (0); 862 } 863 864 int plat_ecc_cap_sc_set_cnt = 0; 865 866 void 867 plat_ecc_capability_sc_set(uint32_t cap) 868 { 869 plat_ecc_capability_map_sc = cap; 870 871 if (!plat_ecc_cap_sc_set_cnt && (cap & PLAT_ECC_CAPABILITY_DIMM_SID)) 872 if (p2init_sid_cache) 873 p2init_sid_cache(); 874 875 plat_ecc_cap_sc_set_cnt++; 876 } 877 878 /* 879 * The following table represents mapping between the indictment1 reason 880 * to its type. 881 */ 882 883 static plat_ecc_bl_map_t plat_ecc_bl_map_v1[] = { 884 { "l2cachedata", PLAT_ECC_INDICT_ECACHE_CORRECTABLES }, 885 { "l3cachedata", PLAT_ECC_INDICT_ECACHE_CORRECTABLES }, 886 { "l2cachedata", PLAT_ECC_INDICT_ECACHE_UNCORRECTABLE }, 887 { "l3cachedata", PLAT_ECC_INDICT_ECACHE_UNCORRECTABLE } 888 }; 889 890 /* 891 * The following table represents mapping between the indictment2 reason 892 * to its type. 893 */ 894 895 static plat_ecc_bl_map_t plat_ecc_bl_map_v2[] = { 896 { "l2cachedata", PLAT_ECC_INDICT2_L2_SERD }, 897 { "l3cachedata", PLAT_ECC_INDICT2_L3_SERD }, 898 { "l2cachedata", PLAT_ECC_INDICT2_L2_UE }, 899 { "l3cachedata", PLAT_ECC_INDICT2_L3_UE }, 900 { "l2cachetag", PLAT_ECC_INDICT2_L2_TAG_SERD }, 901 { "l3cachetag", PLAT_ECC_INDICT2_L3_TAG_SERD }, 902 { "icache", PLAT_ECC_INDICT2_ICACHE_SERD }, 903 { "dcache", PLAT_ECC_INDICT2_DCACHE_SERD }, 904 { "pcache", PLAT_ECC_INDICT2_PCACHE_SERD }, 905 { "itlb", PLAT_ECC_INDICT2_ITLB_SERD }, 906 { "dtlb", PLAT_ECC_INDICT2_DTLB_SERD }, 907 { "fpu", PLAT_ECC_INDICT2_FPU } 908 }; 909 910 /* 911 * The following function returns the indictment type for a given version 912 */ 913 static int 914 flt_name_to_msg_type(const char *fault, int indict_version) 915 { 916 plat_ecc_bl_map_t *mapp; 917 char *fltnm = "fault.cpu."; 918 int mapsz; 919 char *p; 920 int i; 921 922 /* Check if it starts with proper fault name */ 923 if (strncmp(fault, fltnm, strlen(fltnm)) != 0) 924 return (PLAT_ECC_INDICT_NONE); 925 926 fault += strlen(fltnm); /* c = "ultraSPARC-IV.icache" */ 927 928 /* Skip the cpu type */ 929 if ((p = strchr(fault, '.')) == NULL) 930 return (PLAT_ECC_INDICT_NONE); 931 932 p++; /* skip the "." */ 933 934 if (indict_version == 0) { 935 mapp = plat_ecc_bl_map_v1; 936 mapsz = sizeof (plat_ecc_bl_map_v1) / 937 sizeof (plat_ecc_bl_map_t); 938 } else { 939 mapp = plat_ecc_bl_map_v2; 940 mapsz = sizeof (plat_ecc_bl_map_v2) / 941 sizeof (plat_ecc_bl_map_t); 942 } 943 for (i = 0; i < mapsz; i++) { 944 if (strcmp(p, mapp[i].ebm_reason) == 0) { 945 return (mapp[i].ebm_type); 946 } 947 } 948 return (PLAT_ECC_INDICT_NONE); 949 } 950 951 /* 952 * Blacklisting 953 */ 954 int 955 plat_blacklist(int cmd, const char *scheme, nvlist_t *fmri, const char *class) 956 { 957 struct async_flt aflt; 958 char *unum; 959 int msg_type, is_old_indict; 960 961 if (fmri == NULL) 962 return (EINVAL); 963 if (cmd != BLIOC_INSERT) 964 return (ENOTSUP); 965 966 /* 967 * We support both the blacklisting of CPUs via mem-schemed 968 * FMRIs that name E$ J-numbers, and CPUs via cpu-schemed FMRIs 969 * that name the cpuid. 970 */ 971 if (strcmp(scheme, FM_FMRI_SCHEME_MEM) == 0) { 972 if (nvlist_lookup_string(fmri, FM_FMRI_MEM_UNUM, &unum)) 973 return (EINVAL); 974 aflt.flt_inst = (uint_t)-1; 975 } else if (strcmp(scheme, FM_FMRI_SCHEME_CPU) == 0) { 976 if (nvlist_lookup_uint32(fmri, FM_FMRI_CPU_ID, &aflt.flt_inst)) 977 return (EINVAL); 978 unum = NULL; 979 } else { 980 return (ENOTSUP); 981 } 982 983 /* 984 * If the SC cannot handle indictment2, so fall back to old one. 985 * Also if the domain does not support FMA, then send only the old one. 986 */ 987 988 is_old_indict = plat_ecc_capability_sc_get(PLAT_ECC_INDICTMENT_MESSAGE); 989 990 if (is_old_indict) 991 msg_type = flt_name_to_msg_type(class, 0); 992 else 993 msg_type = flt_name_to_msg_type(class, 1); 994 995 if (msg_type == PLAT_ECC_INDICT_NONE) 996 return (ENOTSUP); 997 998 /* 999 * The current blacklisting interfaces are designed for a world where 1000 * the SC is much more involved in the diagnosis and error reporting 1001 * process than it is in the FMA world. As such, the existing 1002 * interfaces want all kinds of information about the error that's 1003 * triggering the blacklist. In the FMA world, we don't have access 1004 * to any of that information by the time we're doing the blacklist, 1005 * so we fake values. 1006 */ 1007 aflt.flt_id = gethrtime(); 1008 aflt.flt_addr = -1; 1009 aflt.flt_stat = -1; 1010 aflt.flt_synd = (ushort_t)-1; 1011 1012 if (is_old_indict) { 1013 if (unum && unum[0] != '\0') 1014 return (plat_log_fruid_indictment(msg_type, &aflt, 1015 unum)); 1016 else 1017 return (ENOTSUP); 1018 } else { 1019 return (plat_log_fruid_indictment2(msg_type, &aflt, unum)); 1020 } 1021 } 1022 1023 static kcondvar_t plat_ecc_condvar; 1024 static kmutex_t plat_ecc_mutex; 1025 static taskq_t *plat_ecc_taskq; 1026 1027 /* 1028 * plat_ecc_dispatch_task: Dispatch the task on a taskq and wait for the 1029 * return value. We use cv_wait_sig to wait for the return values. If a 1030 * signal interrupts us, we return EINTR. Otherwise, we return the value 1031 * returned by the mailbox functions. 1032 * 1033 * To avoid overloading the lower-level mailbox routines, we use a taskq 1034 * to serialize all messages. Currently, it is expected that only one 1035 * process (fmd) will use this ioctl, so the delay caused by the taskq 1036 * should not have much of an effect. 1037 */ 1038 int 1039 plat_ecc_dispatch_task(plat_ecc_message_t *msg) 1040 { 1041 int ret; 1042 1043 ASSERT(msg != NULL); 1044 ASSERT(plat_ecc_taskq != NULL); 1045 1046 if (taskq_dispatch(plat_ecc_taskq, plat_ecc_send_msg, 1047 (void *)msg, TQ_NOSLEEP) == TASKQID_INVALID) { 1048 kmem_free(msg->ecc_msg_data, msg->ecc_msg_len); 1049 kmem_free(msg, sizeof (plat_ecc_message_t)); 1050 return (ENOMEM); 1051 } 1052 mutex_enter(&plat_ecc_mutex); 1053 1054 /* 1055 * It's possible that the taskq function completed before we 1056 * acquired the mutex. Check for this first. If this did not 1057 * happen, we wait for the taskq function to signal us, or an 1058 * interrupt. We also check ecc_msg_status to protect against 1059 * spurious wakeups from cv_wait_sig. 1060 */ 1061 if (msg->ecc_msg_status == PLAT_ECC_MSG_SENT) { 1062 ret = msg->ecc_msg_ret; 1063 kmem_free(msg->ecc_msg_data, msg->ecc_msg_len); 1064 kmem_free(msg, sizeof (plat_ecc_message_t)); 1065 } else { 1066 msg->ecc_msg_status = PLAT_ECC_TASK_DISPATCHED; 1067 1068 while ((ret = cv_wait_sig(&plat_ecc_condvar, 1069 &plat_ecc_mutex)) != 0 && 1070 msg->ecc_msg_status == PLAT_ECC_TASK_DISPATCHED) 1071 ; 1072 1073 if ((ret == 0) && (msg->ecc_msg_status != PLAT_ECC_MSG_SENT)) { 1074 /* An interrupt was received */ 1075 msg->ecc_msg_status = PLAT_ECC_INTERRUPT_RECEIVED; 1076 ret = EINTR; 1077 } else { 1078 ret = msg->ecc_msg_ret; 1079 kmem_free(msg->ecc_msg_data, msg->ecc_msg_len); 1080 kmem_free(msg, sizeof (plat_ecc_message_t)); 1081 } 1082 } 1083 mutex_exit(&plat_ecc_mutex); 1084 return (ret); 1085 } 1086 1087 static void 1088 plat_ecc_send_msg(void *arg) 1089 { 1090 plat_ecc_message_t *msg = arg; 1091 int ret; 1092 1093 /* 1094 * Send this data off as a mailbox message to the SC. 1095 */ 1096 ret = plat_send_ecc_mailbox_msg(msg->ecc_msg_type, msg->ecc_msg_data); 1097 1098 mutex_enter(&plat_ecc_mutex); 1099 1100 /* 1101 * If the dispatching function received an interrupt, don't bother 1102 * signalling it, and throw away the results. Otherwise, set the 1103 * return value and signal the condvar. 1104 */ 1105 if (msg->ecc_msg_status == PLAT_ECC_INTERRUPT_RECEIVED) { 1106 kmem_free(msg->ecc_msg_data, msg->ecc_msg_len); 1107 kmem_free(msg, sizeof (plat_ecc_message_t)); 1108 } else { 1109 msg->ecc_msg_ret = ret; 1110 msg->ecc_msg_status = PLAT_ECC_MSG_SENT; 1111 cv_broadcast(&plat_ecc_condvar); 1112 } 1113 1114 mutex_exit(&plat_ecc_mutex); 1115 } 1116 1117 void 1118 plat_ecc_init(void) 1119 { 1120 int bd; 1121 1122 mutex_init(&plat_ecc_mutex, NULL, MUTEX_DEFAULT, NULL); 1123 cv_init(&plat_ecc_condvar, NULL, CV_DEFAULT, NULL); 1124 plat_ecc_taskq = taskq_create("plat_ecc_taskq", 1, minclsyspri, 1125 PLAT_ECC_TASKQ_MIN, PLAT_ECC_TASKQ_MAX, TASKQ_PREPOPULATE); 1126 ASSERT(plat_ecc_taskq != NULL); 1127 1128 for (bd = 0; bd < plat_max_cpumem_boards(); bd++) { 1129 mutex_init(&domain_dimm_sids[bd].pdsb_lock, 1130 NULL, MUTEX_DEFAULT, NULL); 1131 } 1132 1133 } 1134