/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/plat_ecc_unum.h>
#include <sys/utsname.h>
#include <sys/cmn_err.h>
#include <sys/async.h>
#include <sys/errno.h>
#include <sys/fm/protocol.h>
#include <sys/fm/cpu/UltraSPARC-III.h>
#include <sys/bl.h>
#include <sys/taskq.h>
#include <sys/condvar.h>
#include <sys/plat_ecc_dimm.h>

/*
 * Pointer to platform specific function to initialize a cache of DIMM
 * serial ids.  It may be left NULL; plat_ecc_capability_sc_set() checks
 * for NULL before calling through it.
 */
int (*p2init_sid_cache)(void);

/*
 * This file contains the common code that is used for parsing
 * ecc unum data and logging it appropriately as the platform
 * that calls this code implements.
 */

/*
 * Forward declarations: the indictment and capability senders queue
 * their messages through plat_ecc_dispatch_task(), which in turn runs
 * plat_ecc_send_msg() on a taskq.
 */
int plat_ecc_dispatch_task(plat_ecc_message_t *);
static void plat_ecc_send_msg(void *);

/*
 * CHECK_UNUM leaves the enclosing switch (or loop) as soon as the local
 * variable `unum_ptr' is NULL, i.e. as soon as a strchr()/strstr() step
 * of the unum parse fails.  It is deliberately a bare `if' rather than a
 * do { } while (0) wrapper: the `break' has to apply to the statement
 * enclosing the macro use, not to a loop inside the macro.
 */
#define	CHECK_UNUM		\
	if (unum_ptr == NULL) {	\
		break;		\
	}

/*
 * See plat_ecc_unum.h for the meaning of these variables.
60 */ 61 int ecc_log_fruid_enable = ECC_FRUID_ENABLE_DEFAULT; 62 63 uint32_t plat_ecc_capability_map_domain = PLAT_ECC_CAPABILITY_DOMAIN_DEFAULT; 64 uint32_t plat_ecc_capability_map_sc = PLAT_ECC_CAPABILITY_SC_DEFAULT; 65 uint16_t ecc_error2_mailbox_flags = PLAT_ECC_ERROR2_SEND_DEFAULT; 66 uint16_t ecc_indictment2_mailbox_flags = PLAT_ECC_SEND_INDICT2_DEFAULT; 67 68 /* 69 * We log all ECC errors using the function that is defined as 70 * plat_send_ecc_mailbox_msg(); We first parse the unum string and 71 * then pass the data to be logged to the plat_send_ecc_mailbox_msg 72 * function for logging. Each platform that uses this code needs to 73 * implement a suitable function for this purpose. 74 */ 75 void 76 plat_log_fruid_error(int synd_code, struct async_flt *ecc, char *unum, 77 uint64_t afsr_bit) 78 { 79 plat_ecc_error_data_t ecc_error_data; 80 enum plat_ecc_type ecc_type = PLAT_ECC_UNKNOWN; 81 int board_num; 82 int proc_position; 83 int invalid_unum = 1; 84 85 bzero(&ecc_error_data, sizeof (plat_ecc_error_data_t)); 86 ecc_error_data.version = PLAT_ECC_VERSION; 87 88 switch (afsr_bit) { 89 case C_AFSR_CE: 90 ecc_error_data.error_code = PLAT_ERROR_CODE_CE; 91 break; 92 case C_AFSR_UE: 93 ecc_error_data.error_code = PLAT_ERROR_CODE_UE; 94 break; 95 case C_AFSR_EDC: 96 ecc_error_data.error_code = PLAT_ERROR_CODE_EDC; 97 break; 98 case C_AFSR_EDU: 99 ecc_error_data.error_code = PLAT_ERROR_CODE_EDU; 100 break; 101 case C_AFSR_WDC: 102 ecc_error_data.error_code = PLAT_ERROR_CODE_WDC; 103 break; 104 case C_AFSR_WDU: 105 ecc_error_data.error_code = PLAT_ERROR_CODE_WDU; 106 break; 107 case C_AFSR_CPC: 108 ecc_error_data.error_code = PLAT_ERROR_CODE_CPC; 109 break; 110 case C_AFSR_CPU: 111 ecc_error_data.error_code = PLAT_ERROR_CODE_CPU; 112 break; 113 case C_AFSR_UCC: 114 ecc_error_data.error_code = PLAT_ERROR_CODE_UCC; 115 break; 116 case C_AFSR_UCU: 117 ecc_error_data.error_code = PLAT_ERROR_CODE_UCU; 118 break; 119 case C_AFSR_EMC: 120 ecc_error_data.error_code = 
PLAT_ERROR_CODE_EMC; 121 break; 122 case C_AFSR_EMU: 123 ecc_error_data.error_code = PLAT_ERROR_CODE_EMU; 124 break; 125 default: 126 /* 127 * Do not send messages with unknown error codes, since 128 * the SC will not be able to tell what type of error 129 * occurred. 130 */ 131 return; 132 } 133 134 ecc_error_data.detecting_proc = ecc->flt_bus_id; 135 136 if (ecc->flt_in_memory) 137 ecc_type = PLAT_ECC_MEMORY; 138 else if (ecc->flt_status & ECC_ECACHE) 139 ecc_type = PLAT_ECC_ECACHE; 140 141 switch (ecc_type) { 142 case PLAT_ECC_MEMORY: { 143 /* 144 * The unum string is expected to be in this form: 145 * "/N0/SB12/P0/B0/D2 J13500, ..." 146 * for serengeti. As this code is shared with Starcat 147 * if N is missing then it is set to 0. 148 * From that we will extract the bank number, dimm 149 * number, and Jnumber. 150 */ 151 char *unum_ptr = unum; 152 char *jno_ptr = ecc_error_data.Jnumber; 153 int i; 154 155 /* 156 * On Serengeti we expect to find 'N' in the unum string 157 * however, on Starcat 'N' does not appear in the unum string. 158 * We do not want this code to break at this point, so the 159 * unum_ptr is reset to the start of unum string if we fail 160 * to find an 'N'. 161 */ 162 unum_ptr = strchr(unum_ptr, 'N'); 163 if (unum_ptr == NULL) { 164 ecc_error_data.node_no = 0; 165 unum_ptr = unum; 166 } else { 167 unum_ptr++; 168 ecc_error_data.node_no = stoi(&unum_ptr); 169 } 170 171 /* 172 * Now pull out the SB number 173 */ 174 unum_ptr = strstr(unum_ptr, "SB"); 175 CHECK_UNUM; 176 unum_ptr += 2; 177 board_num = stoi(&unum_ptr); 178 179 /* 180 * Now pull out the Proc position (relative to the board) 181 */ 182 unum_ptr = strchr(unum_ptr, 'P'); 183 CHECK_UNUM; 184 unum_ptr++; 185 proc_position = stoi(&unum_ptr); 186 187 /* 188 * Using the SB number and Proc position we create a FRU 189 * cpu id. 
190 */ 191 ecc_error_data.proc_num = 192 plat_make_fru_cpuid(board_num, 0, proc_position); 193 194 /* 195 * Now pull out the Memory Bank number 196 */ 197 unum_ptr = strchr(unum_ptr, 'B'); 198 CHECK_UNUM; 199 unum_ptr++; 200 ecc_error_data.bank_no = (stoi(&unum_ptr) & 0x01); 201 202 /* 203 * Now pull out the Dimm number within the Memory Bank. 204 */ 205 unum_ptr = strchr(unum_ptr, 'D'); 206 CHECK_UNUM; 207 unum_ptr++; 208 ecc_error_data.ecache_dimm_no = (stoi(&unum_ptr) & 0x03); 209 210 /* 211 * Now pull out the J-number. 212 */ 213 unum_ptr = strchr(unum_ptr, 'J'); 214 CHECK_UNUM; 215 unum_ptr++; 216 for (i = PLAT_ECC_JNUMBER_LENGTH; 217 i > 0 && *unum_ptr >= '0' && *unum_ptr <= '9'; i--) 218 *jno_ptr++ = *unum_ptr++; 219 *jno_ptr = '\0'; 220 221 /* 222 * If we get here, we can assume the unum is valid 223 */ 224 invalid_unum = 0; 225 break; 226 } 227 case PLAT_ECC_ECACHE: { 228 /* 229 * The unum string is expected to be in this form: 230 * "[/N0/][SB|IO]12/P0/E0 J13500, ..." 231 * for serengeti. As this code is shared with Starcat 232 * if N is missing then it is set to 0. IO may only appear 233 * on Starcats. From that we will extract the bank number, 234 * dimm number, and Jnumber. 235 */ 236 char *unum_ptr = unum; 237 char *jno_ptr = ecc_error_data.Jnumber; 238 int is_maxcat = 0; 239 int i; 240 241 /* 242 * On Serengeti we expect to find 'N' in the unum string 243 * however, on Starcat 'N' does not appear in the unum string. 244 * We do not want this code to break at this point, so the 245 * unum_ptr is reset to the start of unum string if we fail 246 * to find an 'N'. 
247 */ 248 unum_ptr = strchr(unum_ptr, 'N'); 249 if (unum_ptr == NULL) { 250 ecc_error_data.node_no = 0; 251 unum_ptr = unum; 252 } else { 253 unum_ptr++; 254 ecc_error_data.node_no = stoi(&unum_ptr); 255 } 256 257 /* 258 * Now pull out the SB/IO number 259 */ 260 unum_ptr = strstr(unum_ptr, "SB"); 261 if (unum_ptr == NULL) { 262 263 /* 264 * Since this is an E$ error, it must have occurred on 265 * either a System Board (represented by "SB" in the 266 * unum string) or a Maxcat board ("IO" in the unum 267 * string). Since we failed the "SB" check, we'll 268 * assume this is a maxcat board. 269 */ 270 is_maxcat = 1; 271 unum_ptr = strstr(unum, "IO"); 272 } 273 CHECK_UNUM; 274 unum_ptr += 2; 275 board_num = stoi(&unum_ptr); 276 277 /* 278 * Now pull out the Proc position (relative to the board) 279 */ 280 unum_ptr = strchr(unum_ptr, 'P'); 281 CHECK_UNUM; 282 unum_ptr++; 283 proc_position = stoi(&unum_ptr); 284 285 /* 286 * Using the SB/IO number, slot 0/1 value (is_maxcat), and 287 * proc position, we create the cpu id. 
288 */ 289 ecc_error_data.proc_num = plat_make_fru_cpuid(board_num, 290 is_maxcat, proc_position); 291 292 ecc_error_data.bank_no = 0; /* not used */ 293 294 unum_ptr = strchr(unum_ptr, 'E'); 295 CHECK_UNUM; 296 unum_ptr++; 297 ecc_error_data.ecache_dimm_no = (stoi(&unum_ptr) & 0x01); 298 299 unum_ptr = strchr(unum_ptr, 'J'); 300 CHECK_UNUM; 301 unum_ptr++; 302 for (i = PLAT_ECC_JNUMBER_LENGTH; 303 i > 0 && *unum_ptr >= '0' && *unum_ptr <= '9'; i--) 304 *jno_ptr++ = *unum_ptr++; 305 *jno_ptr = '\0'; 306 307 /* 308 * If we get here, we can assume the unum is valid 309 */ 310 invalid_unum = 0; 311 break; 312 } 313 default: 314 /* 315 * Unknown error 316 */ 317 break; 318 } 319 320 /* 321 * This is where CHECK_UNUM goes when it finds an error 322 */ 323 324 if (ECC_SYND_DATA_BEGIN <= synd_code && 325 synd_code < ECC_SYND_ECC_BEGIN) { 326 ecc_error_data.error_type = PLAT_ERROR_TYPE_SINGLE; 327 ecc_error_data.databit_type = PLAT_BIT_TYPE_DATA; 328 ecc_error_data.databit_no = synd_code; 329 } else if (ECC_SYND_ECC_BEGIN <= synd_code && 330 synd_code < ECC_SYND_MTAG_BEGIN) { 331 ecc_error_data.error_type = PLAT_ERROR_TYPE_SINGLE; 332 ecc_error_data.databit_type = PLAT_BIT_TYPE_ECC; 333 ecc_error_data.databit_no = synd_code - ECC_SYND_ECC_BEGIN; 334 } else if (ECC_SYND_MTAG_BEGIN <= synd_code && 335 synd_code < ECC_SYND_MECC_BEGIN) { 336 ecc_error_data.error_type = PLAT_ERROR_TYPE_SINGLE; 337 ecc_error_data.databit_type = PLAT_BIT_TYPE_MTAG_D; 338 ecc_error_data.databit_no = synd_code - ECC_SYND_MTAG_BEGIN; 339 } else if (ECC_SYND_MECC_BEGIN <= synd_code && 340 synd_code < ECC_SYND_M2) { 341 ecc_error_data.error_type = PLAT_ERROR_TYPE_SINGLE; 342 ecc_error_data.databit_type = PLAT_BIT_TYPE_MTAG_E; 343 ecc_error_data.databit_no = synd_code - ECC_SYND_MECC_BEGIN; 344 } else { 345 switch (synd_code) { 346 case ECC_SYND_M2: 347 ecc_error_data.error_type = PLAT_ERROR_TYPE_M2; 348 break; 349 case ECC_SYND_M3: 350 ecc_error_data.error_type = PLAT_ERROR_TYPE_M3; 351 break; 352 
case ECC_SYND_M4: 353 ecc_error_data.error_type = PLAT_ERROR_TYPE_M4; 354 break; 355 case ECC_SYND_M: 356 ecc_error_data.error_type = PLAT_ERROR_TYPE_M; 357 break; 358 default: 359 ecc_error_data.error_type = PLAT_ERROR_TYPE_UNK; 360 break; 361 } 362 ecc_error_data.databit_type = PLAT_BIT_TYPE_MULTI; 363 ecc_error_data.databit_no = 0; /* not used */ 364 } 365 366 #ifdef DEBUG 367 if (invalid_unum && 368 (ecc_error_data.error_code != PLAT_ERROR_CODE_UE) && 369 unum && *unum) 370 cmn_err(CE_WARN, "Unexpected unum string format: %s\n", unum); 371 #endif 372 373 /* 374 * Send this data off as a mailbox message to the SC. 375 */ 376 (void) plat_send_ecc_mailbox_msg(PLAT_ECC_ERROR_MESSAGE, 377 &ecc_error_data); 378 } 379 380 /* 381 * The unum string for memory is expected to be in this form: 382 * "[/N0/]SB12/P0/B0/D2 [J13500]" 383 * Or if the unum was generated as the result of a UE: 384 * "[/N0/]SB12/P0/B0 [J13500, ...]" 385 * From that we will extract the board number, processor position, 386 * bank number and jnumber. 387 * 388 * Return (1) for an invalid unum string. If the unum is for an 389 * individual DIMM and there is no jnumber, jnumber will be set 390 * to -1 and the caller can decide if the unum is valid. This 391 * is because Serengeti does not have jnumbers for bank unums 392 * which may be used to create DIMM unums (e.g. for acquiring 393 * DIMM serial ids). 
394 */ 395 396 int 397 parse_unum_memory(char *unum, int *board, int *pos, int *bank, int *dimm, 398 int *jnumber) 399 { 400 char *c; 401 402 if ((c = strstr(unum, "SB")) == NULL) 403 return (1); 404 c += 2; 405 *board = (uint8_t)stoi(&c); 406 407 if (*c++ != '/' || *c++ != 'P') 408 return (1); 409 *pos = stoi(&c); 410 411 if (*c++ != '/' || *c++ != 'B') 412 return (1); 413 *bank = stoi(&c); 414 415 if ((c = strchr(c, 'D')) == NULL) { 416 *dimm = -1; 417 *jnumber = 0; 418 return (0); 419 } 420 c++; 421 *dimm = stoi(&c); 422 423 if ((c = strchr(c, 'J')) == NULL) { 424 *jnumber = -1; 425 return (0); 426 } 427 428 c++; 429 *jnumber = (uint16_t)stoi(&c); 430 431 return (0); 432 } 433 434 /* 435 * The unum string for ecache is expected to be in this form: 436 * "[/N0/][SB|IO]12/P0/E0 J13500, ..." 437 * From that we will extract the board number, processor position and 438 * junmber. 439 * 440 * return (1) for any invalid unum string. 441 */ 442 static int 443 parse_unum_ecache(char *unum, int *board, int *pos, int *jnumber, int *maxcat) 444 { 445 char *c; 446 447 if ((c = strstr(unum, "SB")) == NULL) { 448 /* 449 * Since this is an E$ error, it must have occurred on 450 * either a System Board (represented by "SB" in the 451 * unum string) or a Maxcat board ("IO" in the unum 452 * string). 
453 */ 454 if ((c = strstr(unum, "IO")) == NULL) 455 return (1); 456 *maxcat = 1; 457 } 458 459 c += 2; 460 *board = (uint8_t)stoi(&c); 461 462 if (*c++ != '/' || *c++ != 'P') 463 return (1); 464 *pos = stoi(&c); 465 466 if ((c = strchr(c, 'J')) == NULL) 467 return (1); 468 469 c++; 470 *jnumber = (uint16_t)stoi(&c); 471 472 return (0); 473 } 474 475 /* The following array maps the error to its corresponding set */ 476 static int plat_ecc_e2d_map[PLAT_ECC_ERROR2_NUMVALS] = { 477 PLAT_ECC_ERROR2_NONE, /* 0x00 */ 478 PLAT_ECC_ERROR2_SEND_L2_XXC, /* 0x01 */ 479 PLAT_ECC_ERROR2_SEND_L2_XXU, /* 0x02 */ 480 PLAT_ECC_ERROR2_SEND_L3_XXC, /* 0x03 */ 481 PLAT_ECC_ERROR2_SEND_L3_XXU, /* 0x04 */ 482 PLAT_ECC_ERROR2_SEND_MEM_ERRS, /* 0x05 */ 483 PLAT_ECC_ERROR2_SEND_MEM_ERRS, /* 0x06 */ 484 PLAT_ECC_ERROR2_SEND_MEM_ERRS, /* 0x07 */ 485 PLAT_ECC_ERROR2_SEND_BUS_ERRS, /* 0x08 */ 486 PLAT_ECC_ERROR2_SEND_BUS_ERRS, /* 0x09 */ 487 PLAT_ECC_ERROR2_SEND_BUS_ERRS, /* 0x0a */ 488 PLAT_ECC_ERROR2_SEND_BUS_ERRS, /* 0x0b */ 489 PLAT_ECC_ERROR2_SEND_L2_TAG_ERRS, /* 0x0c */ 490 PLAT_ECC_ERROR2_SEND_L2_TAG_ERRS, /* 0x0d */ 491 PLAT_ECC_ERROR2_SEND_L3_TAG_ERRS, /* 0x0e */ 492 PLAT_ECC_ERROR2_SEND_L3_TAG_ERRS, /* 0x0f */ 493 PLAT_ECC_ERROR2_SEND_L1_PARITY, /* 0x10 */ 494 PLAT_ECC_ERROR2_SEND_L1_PARITY, /* 0x11 */ 495 PLAT_ECC_ERROR2_SEND_TLB_PARITY, /* 0x12 */ 496 PLAT_ECC_ERROR2_SEND_TLB_PARITY, /* 0x13 */ 497 PLAT_ECC_ERROR2_SEND_IV_ERRS, /* 0x14 */ 498 PLAT_ECC_ERROR2_SEND_IV_ERRS, /* 0x15 */ 499 PLAT_ECC_ERROR2_SEND_MTAG_XXC, /* 0x16 */ 500 PLAT_ECC_ERROR2_SEND_IV_MTAG_XXC, /* 0x17 */ 501 PLAT_ECC_ERROR2_SEND_L3_XXC, /* 0x18 */ 502 PLAT_ECC_ERROR2_SEND_PCACHE /* 0x19 */ 503 }; 504 505 /* 506 * log enhanced error information to SC. 
507 */ 508 void 509 plat_log_fruid_error2(int msg_type, char *unum, struct async_flt *aflt, 510 plat_ecc_ch_async_flt_t *ecc_ch_flt) 511 { 512 plat_ecc_error2_data_t e2d = {0}; 513 int board, pos, bank, dimm, jnumber; 514 int maxcat = 0; 515 uint16_t flags; 516 517 /* Check the flags */ 518 flags = plat_ecc_e2d_map[msg_type]; 519 if ((ecc_error2_mailbox_flags & flags) == 0) 520 return; 521 522 /* Fill the header */ 523 e2d.ee2d_major_version = PLAT_ECC_ERROR2_VERSION_MAJOR; 524 e2d.ee2d_minor_version = PLAT_ECC_ERROR2_VERSION_MINOR; 525 e2d.ee2d_msg_type = PLAT_ECC_ERROR2_MESSAGE; 526 e2d.ee2d_msg_length = sizeof (plat_ecc_error2_data_t); 527 528 /* Fill the data */ 529 if (aflt->flt_in_memory) { 530 if (parse_unum_memory(unum, &board, &pos, &bank, &dimm, 531 &jnumber) || (dimm != -1 && jnumber == -1)) 532 return; 533 /* 534 * Using the SB number and Proc position we create a FRU 535 * cpu id. 536 */ 537 e2d.ee2d_owning_proc = plat_make_fru_cpuid(board, 0, pos); 538 e2d.ee2d_jnumber = jnumber; 539 e2d.ee2d_bank_number = bank; 540 } else if (aflt->flt_status & ECC_ECACHE) { 541 if (parse_unum_ecache(unum, &board, &pos, &jnumber, &maxcat)) 542 return; 543 /* 544 * Using the SB number and Proc position we create a FRU 545 * cpu id. 
546 */ 547 e2d.ee2d_owning_proc = plat_make_fru_cpuid(board, maxcat, pos); 548 e2d.ee2d_jnumber = jnumber; 549 e2d.ee2d_bank_number = (uint8_t)-1; 550 } else { 551 /* 552 * L1 Cache 553 */ 554 e2d.ee2d_owning_proc = aflt->flt_bus_id; 555 e2d.ee2d_jnumber = (uint16_t)-1; 556 e2d.ee2d_bank_number = (uint8_t)-1; 557 } 558 559 e2d.ee2d_type = (uint8_t)msg_type; 560 e2d.ee2d_afar_status = (uint8_t)ecc_ch_flt->ecaf_afar_status; 561 e2d.ee2d_synd_status = (uint8_t)ecc_ch_flt->ecaf_synd_status; 562 e2d.ee2d_detecting_proc = aflt->flt_bus_id; 563 e2d.ee2d_cpu_impl = cpunodes[e2d.ee2d_owning_proc].implementation; 564 e2d.ee2d_timestamp = aflt->flt_id; 565 e2d.ee2d_afsr = aflt->flt_stat; 566 e2d.ee2d_afar = aflt->flt_addr; 567 568 e2d.ee2d_sdw_afsr = ecc_ch_flt->ecaf_sdw_afsr; 569 e2d.ee2d_sdw_afar = ecc_ch_flt->ecaf_sdw_afar; 570 e2d.ee2d_afsr_ext = ecc_ch_flt->ecaf_afsr_ext; 571 e2d.ee2d_sdw_afsr_ext = ecc_ch_flt->ecaf_sdw_afsr_ext; 572 573 /* Send the message to SC */ 574 (void) plat_send_ecc_mailbox_msg(PLAT_ECC_ERROR2_MESSAGE, &e2d); 575 } 576 577 uint8_t ecc_indictment_mailbox_disable = PLAT_ECC_INDICTMENT_OK; 578 uint8_t ecc_indictment_mailbox_flags = PLAT_ECC_SEND_DEFAULT_INDICT; 579 580 /* 581 * We log all Solaris indictments of failing hardware. We pull the system 582 * board number and jnumber out of the unum string, and calculate the cpuid 583 * from some members of the unum string. The rest of the structure is filled 584 * in through the other arguments. The data structure is then passed to 585 * plat_ecc_dispatch_task(). This function should only be loaded into memory 586 * or called on platforms that define a plat_send_ecc_mailbox_msg() function. 
587 */ 588 static int 589 plat_log_fruid_indictment(int msg_type, struct async_flt *aflt, char *unum) 590 { 591 plat_ecc_message_t *wrapperp; 592 plat_ecc_indict_msg_contents_t *contentsp; 593 char *unum_ptr; 594 int is_maxcat = 0; 595 596 switch (ecc_indictment_mailbox_disable) { 597 case (PLAT_ECC_INDICTMENT_OK): 598 case (PLAT_ECC_INDICTMENT_SUSPECT): 599 break; 600 case (PLAT_ECC_INDICTMENT_NO_SEND): 601 default: 602 return (ECONNREFUSED); 603 } 604 605 switch (msg_type) { 606 case (PLAT_ECC_INDICT_DIMM): 607 if ((ecc_indictment_mailbox_flags & 608 PLAT_ECC_SEND_DIMM_INDICT) == 0) 609 return (ECONNREFUSED); 610 break; 611 case (PLAT_ECC_INDICT_ECACHE_CORRECTABLES): 612 if ((ecc_indictment_mailbox_flags & 613 PLAT_ECC_SEND_ECACHE_XXC_INDICT) == 0) 614 return (ECONNREFUSED); 615 break; 616 case (PLAT_ECC_INDICT_ECACHE_UNCORRECTABLE): 617 if ((ecc_indictment_mailbox_flags & 618 PLAT_ECC_SEND_ECACHE_XXU_INDICT) == 0) 619 return (ECONNREFUSED); 620 break; 621 default: 622 return (ECONNREFUSED); 623 } 624 625 /* LINTED: E_TRUE_LOGICAL_EXPR */ 626 ASSERT(sizeof (plat_ecc_indictment_data_t) == PLAT_ECC_INDICT_SIZE); 627 628 wrapperp = (plat_ecc_message_t *) 629 kmem_zalloc(sizeof (plat_ecc_message_t), KM_SLEEP); 630 631 wrapperp->ecc_msg_status = PLAT_ECC_NO_MSG_ACTIVE; 632 wrapperp->ecc_msg_type = PLAT_ECC_INDICTMENT_MESSAGE; 633 wrapperp->ecc_msg_len = sizeof (plat_ecc_indictment_data_t); 634 wrapperp->ecc_msg_data = kmem_zalloc(wrapperp->ecc_msg_len, KM_SLEEP); 635 636 contentsp = &(((plat_ecc_indictment_data_t *) 637 wrapperp->ecc_msg_data)->msg_contents); 638 639 /* 640 * Find board_num, jnumber, and proc position from the unum string. 641 * Use the board number, is_maxcat, and proc position to calculate 642 * cpuid. 
643 */ 644 unum_ptr = strstr(unum, "SB"); 645 if (unum_ptr == NULL) { 646 is_maxcat = 1; 647 unum_ptr = strstr(unum, "IO"); 648 if (unum_ptr == NULL) { 649 kmem_free(wrapperp->ecc_msg_data, 650 wrapperp->ecc_msg_len); 651 kmem_free(wrapperp, sizeof (plat_ecc_message_t)); 652 return (EINVAL); 653 } 654 } 655 unum_ptr += 2; 656 contentsp->board_num = (uint8_t)stoi(&unum_ptr); 657 658 unum_ptr = strchr(unum_ptr, 'P'); 659 if (unum_ptr == NULL) { 660 kmem_free(wrapperp->ecc_msg_data, wrapperp->ecc_msg_len); 661 kmem_free(wrapperp, sizeof (plat_ecc_message_t)); 662 return (EINVAL); 663 } 664 unum_ptr++; 665 contentsp->detecting_proc = 666 (uint16_t)plat_make_fru_cpuid(contentsp->board_num, is_maxcat, 667 stoi(&unum_ptr)); 668 669 unum_ptr = strchr(unum_ptr, 'J'); 670 if (unum_ptr == NULL) { 671 kmem_free(wrapperp->ecc_msg_data, wrapperp->ecc_msg_len); 672 kmem_free(wrapperp, sizeof (plat_ecc_message_t)); 673 return (EINVAL); 674 } 675 unum_ptr++; 676 contentsp->jnumber = (uint16_t)stoi(&unum_ptr); 677 678 /* 679 * Fill in the rest of the data 680 */ 681 contentsp->version = PLAT_ECC_INDICTMENT_VERSION; 682 contentsp->indictment_type = msg_type; 683 contentsp->indictment_uncertain = ecc_indictment_mailbox_disable; 684 contentsp->syndrome = aflt->flt_synd; 685 contentsp->afsr = aflt->flt_stat; 686 contentsp->afar = aflt->flt_addr; 687 688 /* 689 * Build the solaris_version string: 690 */ 691 (void) snprintf(contentsp->solaris_version, 692 PLAT_ECC_VERSION_LENGTH, "%s %s", utsname.release, utsname.version); 693 694 /* 695 * Send the data on to the queuing function 696 */ 697 return (plat_ecc_dispatch_task(wrapperp)); 698 } 699 700 /* The following array maps the indictment to its corresponding set */ 701 static int plat_ecc_i2d_map[PLAT_ECC_INDICT2_NUMVALS] = { 702 PLAT_ECC_INDICT2_NONE, /* 0x00 */ 703 PLAT_ECC_SEND_INDICT2_L2_XXU, /* 0x01 */ 704 PLAT_ECC_SEND_INDICT2_L2_XXC_SERD, /* 0x02 */ 705 PLAT_ECC_SEND_INDICT2_L2_TAG_SERD, /* 0x03 */ 706 
PLAT_ECC_SEND_INDICT2_L3_XXU, /* 0x04 */ 707 PLAT_ECC_SEND_INDICT2_L3_XXC_SERD, /* 0x05 */ 708 PLAT_ECC_SEND_INDICT2_L3_TAG_SERD, /* 0x06 */ 709 PLAT_ECC_SEND_INDICT2_L1_SERD, /* 0x07 */ 710 PLAT_ECC_SEND_INDICT2_L1_SERD, /* 0x08 */ 711 PLAT_ECC_SEND_INDICT2_TLB_SERD, /* 0x09 */ 712 PLAT_ECC_SEND_INDICT2_TLB_SERD, /* 0x0a */ 713 PLAT_ECC_SEND_INDICT2_FPU, /* 0x0b */ 714 PLAT_ECC_SEND_INDICT2_PCACHE_SERD /* 0x0c */ 715 }; 716 717 static int 718 plat_log_fruid_indictment2(int msg_type, struct async_flt *aflt, char *unum) 719 { 720 plat_ecc_message_t *wrapperp; 721 plat_ecc_indictment2_data_t *i2d; 722 int board, pos, jnumber; 723 int maxcat = 0; 724 uint16_t flags; 725 726 /* 727 * If the unum is null or empty, skip parsing it 728 */ 729 if (unum && unum[0] != '\0') { 730 if (parse_unum_ecache(unum, &board, &pos, &jnumber, &maxcat)) 731 return (EINVAL); 732 } 733 734 if ((ecc_indictment_mailbox_disable != PLAT_ECC_INDICTMENT_OK) && 735 (ecc_indictment_mailbox_disable != PLAT_ECC_INDICTMENT_SUSPECT)) 736 return (ECONNREFUSED); 737 738 /* Check the flags */ 739 flags = plat_ecc_i2d_map[msg_type]; 740 if ((ecc_indictment2_mailbox_flags & flags) == 0) 741 return (ECONNREFUSED); 742 743 wrapperp = (plat_ecc_message_t *) 744 kmem_zalloc(sizeof (plat_ecc_message_t), KM_SLEEP); 745 746 /* Initialize the wrapper */ 747 wrapperp->ecc_msg_status = PLAT_ECC_NO_MSG_ACTIVE; 748 wrapperp->ecc_msg_type = PLAT_ECC_INDICTMENT2_MESSAGE; 749 wrapperp->ecc_msg_len = sizeof (plat_ecc_indictment2_data_t); 750 wrapperp->ecc_msg_data = kmem_zalloc(wrapperp->ecc_msg_len, KM_SLEEP); 751 752 i2d = (plat_ecc_indictment2_data_t *)wrapperp->ecc_msg_data; 753 754 /* Fill the header */ 755 i2d->ei2d_major_version = PLAT_ECC_INDICT2_MAJOR_VERSION; 756 i2d->ei2d_minor_version = PLAT_ECC_INDICT2_MINOR_VERSION; 757 i2d->ei2d_msg_type = PLAT_ECC_INDICTMENT2_MESSAGE; 758 i2d->ei2d_msg_length = sizeof (plat_ecc_indictment2_data_t); 759 760 /* Fill the data */ 761 if (unum && unum[0] != '\0') { 762 
i2d->ei2d_arraigned_proc = plat_make_fru_cpuid(board, maxcat, 763 pos); 764 i2d->ei2d_board_num = board; 765 i2d->ei2d_jnumber = jnumber; 766 } else { 767 i2d->ei2d_arraigned_proc = aflt->flt_inst; 768 i2d->ei2d_board_num = (uint8_t) 769 plat_make_fru_boardnum(i2d->ei2d_arraigned_proc); 770 i2d->ei2d_jnumber = (uint16_t)-1; 771 } 772 773 i2d->ei2d_type = msg_type; 774 i2d->ei2d_uncertain = ecc_indictment_mailbox_disable; 775 i2d->ei2d_cpu_impl = cpunodes[i2d->ei2d_arraigned_proc].implementation; 776 i2d->ei2d_timestamp = aflt->flt_id; 777 778 /* 779 * Send the data on to the queuing function 780 */ 781 return (plat_ecc_dispatch_task(wrapperp)); 782 } 783 784 int 785 plat_ecc_capability_send(void) 786 { 787 plat_ecc_message_t *wrapperp; 788 plat_capability_data_t *cap; 789 int ver_len; 790 791 wrapperp = kmem_zalloc(sizeof (plat_ecc_message_t), KM_SLEEP); 792 793 ver_len = strlen(utsname.release) + strlen(utsname.version) + 2; 794 795 /* Initialize the wrapper */ 796 wrapperp->ecc_msg_status = PLAT_ECC_NO_MSG_ACTIVE; 797 wrapperp->ecc_msg_type = PLAT_ECC_CAPABILITY_MESSAGE; 798 wrapperp->ecc_msg_len = sizeof (plat_capability_data_t) + ver_len; 799 wrapperp->ecc_msg_data = kmem_zalloc(wrapperp->ecc_msg_len, KM_SLEEP); 800 801 cap = (plat_capability_data_t *)wrapperp->ecc_msg_data; 802 803 /* Fill the header */ 804 cap->capd_major_version = PLAT_ECC_CAP_VERSION_MAJOR; 805 cap->capd_minor_version = PLAT_ECC_CAP_VERSION_MINOR; 806 cap->capd_msg_type = PLAT_ECC_CAPABILITY_MESSAGE; 807 cap->capd_msg_length = wrapperp->ecc_msg_len; 808 809 /* Set the default domain capability */ 810 cap->capd_capability = PLAT_ECC_CAPABILITY_DOMAIN_DEFAULT; 811 812 /* 813 * Build the solaris_version string: 814 * utsname.release + " " + utsname.version 815 */ 816 (void) snprintf(cap->capd_solaris_version, ver_len, "%s %s", 817 utsname.release, utsname.version); 818 819 /* 820 * Send the data on to the queuing function 821 */ 822 return (plat_ecc_dispatch_task(wrapperp)); 823 } 824 825 int 
826 plat_ecc_capability_sc_get(int type) 827 { 828 switch (type) { 829 case PLAT_ECC_ERROR_MESSAGE: 830 if (ecc_log_fruid_enable && 831 (!(plat_ecc_capability_map_sc & 832 PLAT_ECC_CAPABILITY_ERROR2))) 833 return (1); 834 break; 835 case PLAT_ECC_ERROR2_MESSAGE: 836 if (plat_ecc_capability_map_sc & 837 PLAT_ECC_CAPABILITY_ERROR2) 838 return (1); 839 break; 840 case PLAT_ECC_INDICTMENT_MESSAGE: 841 if (!(plat_ecc_capability_map_sc & 842 PLAT_ECC_CAPABILITY_INDICT2) || 843 !(plat_ecc_capability_map_domain & 844 PLAT_ECC_CAPABILITY_FMA)) 845 return (1); 846 break; 847 case PLAT_ECC_INDICTMENT2_MESSAGE: 848 if (plat_ecc_capability_map_sc & 849 PLAT_ECC_CAPABILITY_INDICT2) 850 return (1); 851 break; 852 case PLAT_ECC_DIMM_SID_MESSAGE: 853 if (plat_ecc_capability_map_sc & 854 PLAT_ECC_CAPABILITY_DIMM_SID) 855 return (1); 856 default: 857 return (0); 858 } 859 return (0); 860 } 861 862 int plat_ecc_cap_sc_set_cnt = 0; 863 864 void 865 plat_ecc_capability_sc_set(uint32_t cap) 866 { 867 plat_ecc_capability_map_sc = cap; 868 869 if (!plat_ecc_cap_sc_set_cnt && (cap & PLAT_ECC_CAPABILITY_DIMM_SID)) 870 if (p2init_sid_cache) 871 p2init_sid_cache(); 872 873 plat_ecc_cap_sc_set_cnt++; 874 } 875 876 /* 877 * The following table represents mapping between the indictment1 reason 878 * to its type. 879 */ 880 881 static plat_ecc_bl_map_t plat_ecc_bl_map_v1[] = { 882 { "l2cachedata", PLAT_ECC_INDICT_ECACHE_CORRECTABLES }, 883 { "l3cachedata", PLAT_ECC_INDICT_ECACHE_CORRECTABLES }, 884 { "l2cachedata", PLAT_ECC_INDICT_ECACHE_UNCORRECTABLE }, 885 { "l3cachedata", PLAT_ECC_INDICT_ECACHE_UNCORRECTABLE } 886 }; 887 888 /* 889 * The following table represents mapping between the indictment2 reason 890 * to its type. 
891 */ 892 893 static plat_ecc_bl_map_t plat_ecc_bl_map_v2[] = { 894 { "l2cachedata", PLAT_ECC_INDICT2_L2_SERD }, 895 { "l3cachedata", PLAT_ECC_INDICT2_L3_SERD }, 896 { "l2cachedata", PLAT_ECC_INDICT2_L2_UE }, 897 { "l3cachedata", PLAT_ECC_INDICT2_L3_UE }, 898 { "l2cachetag", PLAT_ECC_INDICT2_L2_TAG_SERD }, 899 { "l3cachetag", PLAT_ECC_INDICT2_L3_TAG_SERD }, 900 { "icache", PLAT_ECC_INDICT2_ICACHE_SERD }, 901 { "dcache", PLAT_ECC_INDICT2_DCACHE_SERD }, 902 { "pcache", PLAT_ECC_INDICT2_PCACHE_SERD }, 903 { "itlb", PLAT_ECC_INDICT2_ITLB_SERD }, 904 { "dtlb", PLAT_ECC_INDICT2_DTLB_SERD }, 905 { "fpu", PLAT_ECC_INDICT2_FPU } 906 }; 907 908 /* 909 * The following function returns the indictment type for a given version 910 */ 911 static int 912 flt_name_to_msg_type(const char *fault, int indict_version) 913 { 914 plat_ecc_bl_map_t *mapp; 915 char *fltnm = "fault.cpu."; 916 int mapsz; 917 char *p; 918 int i; 919 920 /* Check if it starts with proper fault name */ 921 if (strncmp(fault, fltnm, strlen(fltnm)) != 0) 922 return (PLAT_ECC_INDICT_NONE); 923 924 fault += strlen(fltnm); /* c = "ultraSPARC-IV.icache" */ 925 926 /* Skip the cpu type */ 927 if ((p = strchr(fault, '.')) == NULL) 928 return (PLAT_ECC_INDICT_NONE); 929 930 p++; /* skip the "." 
*/ 931 932 if (indict_version == 0) { 933 mapp = plat_ecc_bl_map_v1; 934 mapsz = sizeof (plat_ecc_bl_map_v1) / 935 sizeof (plat_ecc_bl_map_t); 936 } else { 937 mapp = plat_ecc_bl_map_v2; 938 mapsz = sizeof (plat_ecc_bl_map_v2) / 939 sizeof (plat_ecc_bl_map_t); 940 } 941 for (i = 0; i < mapsz; i++) { 942 if (strcmp(p, mapp[i].ebm_reason) == 0) { 943 return (mapp[i].ebm_type); 944 } 945 } 946 return (PLAT_ECC_INDICT_NONE); 947 } 948 949 /* 950 * Blacklisting 951 */ 952 int 953 plat_blacklist(int cmd, const char *scheme, nvlist_t *fmri, const char *class) 954 { 955 struct async_flt aflt; 956 char *unum; 957 int msg_type, is_old_indict; 958 959 if (fmri == NULL) 960 return (EINVAL); 961 if (cmd != BLIOC_INSERT) 962 return (ENOTSUP); 963 964 /* 965 * We support both the blacklisting of CPUs via mem-schemed 966 * FMRIs that name E$ J-numbers, and CPUs via cpu-schemed FMRIs 967 * that name the cpuid. 968 */ 969 if (strcmp(scheme, FM_FMRI_SCHEME_MEM) == 0) { 970 if (nvlist_lookup_string(fmri, FM_FMRI_MEM_UNUM, &unum)) 971 return (EINVAL); 972 aflt.flt_inst = (uint_t)-1; 973 } else if (strcmp(scheme, FM_FMRI_SCHEME_CPU) == 0) { 974 if (nvlist_lookup_uint32(fmri, FM_FMRI_CPU_ID, &aflt.flt_inst)) 975 return (EINVAL); 976 unum = NULL; 977 } else { 978 return (ENOTSUP); 979 } 980 981 /* 982 * If the SC cannot handle indictment2, so fall back to old one. 983 * Also if the domain does not support FMA, then send only the old one. 984 */ 985 986 is_old_indict = plat_ecc_capability_sc_get(PLAT_ECC_INDICTMENT_MESSAGE); 987 988 if (is_old_indict) 989 msg_type = flt_name_to_msg_type(class, 0); 990 else 991 msg_type = flt_name_to_msg_type(class, 1); 992 993 if (msg_type == PLAT_ECC_INDICT_NONE) 994 return (ENOTSUP); 995 996 /* 997 * The current blacklisting interfaces are designed for a world where 998 * the SC is much more involved in the diagnosis and error reporting 999 * process than it is in the FMA world. 
As such, the existing 1000 * interfaces want all kinds of information about the error that's 1001 * triggering the blacklist. In the FMA world, we don't have access 1002 * to any of that information by the time we're doing the blacklist, 1003 * so we fake values. 1004 */ 1005 aflt.flt_id = gethrtime(); 1006 aflt.flt_addr = -1; 1007 aflt.flt_stat = -1; 1008 aflt.flt_synd = (ushort_t)-1; 1009 1010 if (is_old_indict) { 1011 if (unum && unum[0] != '\0') 1012 return (plat_log_fruid_indictment(msg_type, &aflt, 1013 unum)); 1014 else 1015 return (ENOTSUP); 1016 } else { 1017 return (plat_log_fruid_indictment2(msg_type, &aflt, unum)); 1018 } 1019 } 1020 1021 static kcondvar_t plat_ecc_condvar; 1022 static kmutex_t plat_ecc_mutex; 1023 static taskq_t *plat_ecc_taskq; 1024 1025 /* 1026 * plat_ecc_dispatch_task: Dispatch the task on a taskq and wait for the 1027 * return value. We use cv_wait_sig to wait for the return values. If a 1028 * signal interrupts us, we return EINTR. Otherwise, we return the value 1029 * returned by the mailbox functions. 1030 * 1031 * To avoid overloading the lower-level mailbox routines, we use a taskq 1032 * to serialize all messages. Currently, it is expected that only one 1033 * process (fmd) will use this ioctl, so the delay caused by the taskq 1034 * should not have much of an effect. 1035 */ 1036 int 1037 plat_ecc_dispatch_task(plat_ecc_message_t *msg) 1038 { 1039 int ret; 1040 1041 ASSERT(msg != NULL); 1042 ASSERT(plat_ecc_taskq != NULL); 1043 1044 if (taskq_dispatch(plat_ecc_taskq, plat_ecc_send_msg, 1045 (void *)msg, TQ_NOSLEEP) == TASKQID_INVALID) { 1046 kmem_free(msg->ecc_msg_data, msg->ecc_msg_len); 1047 kmem_free(msg, sizeof (plat_ecc_message_t)); 1048 return (ENOMEM); 1049 } 1050 mutex_enter(&plat_ecc_mutex); 1051 1052 /* 1053 * It's possible that the taskq function completed before we 1054 * acquired the mutex. Check for this first. If this did not 1055 * happen, we wait for the taskq function to signal us, or an 1056 * interrupt. 
We also check ecc_msg_status to protect against 1057 * spurious wakeups from cv_wait_sig. 1058 */ 1059 if (msg->ecc_msg_status == PLAT_ECC_MSG_SENT) { 1060 ret = msg->ecc_msg_ret; 1061 kmem_free(msg->ecc_msg_data, msg->ecc_msg_len); 1062 kmem_free(msg, sizeof (plat_ecc_message_t)); 1063 } else { 1064 msg->ecc_msg_status = PLAT_ECC_TASK_DISPATCHED; 1065 1066 while ((ret = cv_wait_sig(&plat_ecc_condvar, 1067 &plat_ecc_mutex)) != 0 && 1068 msg->ecc_msg_status == PLAT_ECC_TASK_DISPATCHED) 1069 ; 1070 1071 if ((ret == 0) && (msg->ecc_msg_status != PLAT_ECC_MSG_SENT)) { 1072 /* An interrupt was received */ 1073 msg->ecc_msg_status = PLAT_ECC_INTERRUPT_RECEIVED; 1074 ret = EINTR; 1075 } else { 1076 ret = msg->ecc_msg_ret; 1077 kmem_free(msg->ecc_msg_data, msg->ecc_msg_len); 1078 kmem_free(msg, sizeof (plat_ecc_message_t)); 1079 } 1080 } 1081 mutex_exit(&plat_ecc_mutex); 1082 return (ret); 1083 } 1084 1085 static void 1086 plat_ecc_send_msg(void *arg) 1087 { 1088 plat_ecc_message_t *msg = arg; 1089 int ret; 1090 1091 /* 1092 * Send this data off as a mailbox message to the SC. 1093 */ 1094 ret = plat_send_ecc_mailbox_msg(msg->ecc_msg_type, msg->ecc_msg_data); 1095 1096 mutex_enter(&plat_ecc_mutex); 1097 1098 /* 1099 * If the dispatching function received an interrupt, don't bother 1100 * signalling it, and throw away the results. Otherwise, set the 1101 * return value and signal the condvar. 
1102 */ 1103 if (msg->ecc_msg_status == PLAT_ECC_INTERRUPT_RECEIVED) { 1104 kmem_free(msg->ecc_msg_data, msg->ecc_msg_len); 1105 kmem_free(msg, sizeof (plat_ecc_message_t)); 1106 } else { 1107 msg->ecc_msg_ret = ret; 1108 msg->ecc_msg_status = PLAT_ECC_MSG_SENT; 1109 cv_broadcast(&plat_ecc_condvar); 1110 } 1111 1112 mutex_exit(&plat_ecc_mutex); 1113 } 1114 1115 void 1116 plat_ecc_init(void) 1117 { 1118 int bd; 1119 1120 mutex_init(&plat_ecc_mutex, NULL, MUTEX_DEFAULT, NULL); 1121 cv_init(&plat_ecc_condvar, NULL, CV_DEFAULT, NULL); 1122 plat_ecc_taskq = taskq_create("plat_ecc_taskq", 1, minclsyspri, 1123 PLAT_ECC_TASKQ_MIN, PLAT_ECC_TASKQ_MAX, TASKQ_PREPOPULATE); 1124 ASSERT(plat_ecc_taskq != NULL); 1125 1126 for (bd = 0; bd < plat_max_cpumem_boards(); bd++) { 1127 mutex_init(&domain_dimm_sids[bd].pdsb_lock, 1128 NULL, MUTEX_DEFAULT, NULL); 1129 } 1130 1131 } 1132