1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/plat_ecc_unum.h> 30 #include <sys/utsname.h> 31 #include <sys/cmn_err.h> 32 #include <sys/async.h> 33 #include <sys/errno.h> 34 #include <sys/fm/protocol.h> 35 #include <sys/fm/cpu/UltraSPARC-III.h> 36 #include <sys/bl.h> 37 #include <sys/taskq.h> 38 #include <sys/condvar.h> 39 #include <sys/plat_ecc_dimm.h> 40 41 /* 42 * Pointer to platform specific function to initialize a cache of DIMM 43 * serial ids 44 */ 45 int (*p2init_sid_cache)(void); 46 47 /* 48 * This file contains the common code that is used for parsing 49 * ecc unum data and logging it appropriately as the platform 50 * that calls this code implements. 51 */ 52 53 int plat_ecc_dispatch_task(plat_ecc_message_t *); 54 static void plat_ecc_send_msg(void *); 55 56 #define CHECK_UNUM \ 57 if (unum_ptr == NULL) { \ 58 break; \ 59 } 60 61 /* 62 * See plat_ecc_unum.h for the meaning of these variables. 63 */ 64 int ecc_log_fruid_enable = ECC_FRUID_ENABLE_DEFAULT; 65 66 uint32_t plat_ecc_capability_map_domain = PLAT_ECC_CAPABILITY_DOMAIN_DEFAULT; 67 uint32_t plat_ecc_capability_map_sc = PLAT_ECC_CAPABILITY_SC_DEFAULT; 68 uint16_t ecc_error2_mailbox_flags = PLAT_ECC_ERROR2_SEND_DEFAULT; 69 uint16_t ecc_indictment2_mailbox_flags = PLAT_ECC_SEND_INDICT2_DEFAULT; 70 71 /* 72 * We log all ECC errors using the function that is defined as 73 * plat_send_ecc_mailbox_msg(); We first parse the unum string and 74 * then pass the data to be logged to the plat_send_ecc_mailbox_msg 75 * function for logging. Each platform that uses this code needs to 76 * implement a suitable function for this purpose. 77 */ 78 void 79 plat_log_fruid_error(int synd_code, struct async_flt *ecc, char *unum, 80 uint64_t afsr_bit) 81 { 82 plat_ecc_error_data_t ecc_error_data; 83 enum plat_ecc_type ecc_type = PLAT_ECC_UNKNOWN; 84 int board_num; 85 int proc_position; 86 int invalid_unum = 1; 87 88 bzero(&ecc_error_data, sizeof (plat_ecc_error_data_t)); 89 ecc_error_data.version = PLAT_ECC_VERSION; 90 91 switch (afsr_bit) { 92 case C_AFSR_CE: 93 ecc_error_data.error_code = PLAT_ERROR_CODE_CE; 94 break; 95 case C_AFSR_UE: 96 ecc_error_data.error_code = PLAT_ERROR_CODE_UE; 97 break; 98 case C_AFSR_EDC: 99 ecc_error_data.error_code = PLAT_ERROR_CODE_EDC; 100 break; 101 case C_AFSR_EDU: 102 ecc_error_data.error_code = PLAT_ERROR_CODE_EDU; 103 break; 104 case C_AFSR_WDC: 105 ecc_error_data.error_code = PLAT_ERROR_CODE_WDC; 106 break; 107 case C_AFSR_WDU: 108 ecc_error_data.error_code = PLAT_ERROR_CODE_WDU; 109 break; 110 case C_AFSR_CPC: 111 ecc_error_data.error_code = PLAT_ERROR_CODE_CPC; 112 break; 113 case C_AFSR_CPU: 114 ecc_error_data.error_code = PLAT_ERROR_CODE_CPU; 115 break; 116 case C_AFSR_UCC: 117 ecc_error_data.error_code = PLAT_ERROR_CODE_UCC; 118 break; 119 case C_AFSR_UCU: 120 ecc_error_data.error_code = PLAT_ERROR_CODE_UCU; 121 break; 122 case C_AFSR_EMC: 123 ecc_error_data.error_code = PLAT_ERROR_CODE_EMC; 124 break; 125 case C_AFSR_EMU: 126 ecc_error_data.error_code = PLAT_ERROR_CODE_EMU; 127 break; 128 default: 129 /* 130 * Do not send messages with unknown error codes, since 131 * the SC will not be able to tell what type of error 132 * occurred. 133 */ 134 return; 135 } 136 137 ecc_error_data.detecting_proc = ecc->flt_bus_id; 138 139 if (ecc->flt_in_memory) 140 ecc_type = PLAT_ECC_MEMORY; 141 else if (ecc->flt_status & ECC_ECACHE) 142 ecc_type = PLAT_ECC_ECACHE; 143 144 switch (ecc_type) { 145 case PLAT_ECC_MEMORY: { 146 /* 147 * The unum string is expected to be in this form: 148 * "/N0/SB12/P0/B0/D2 J13500, ..." 149 * for serengeti. As this code is shared with Starcat 150 * if N is missing then it is set to 0. 151 * From that we will extract the bank number, dimm 152 * number, and Jnumber. 153 */ 154 char *unum_ptr = unum; 155 char *jno_ptr = ecc_error_data.Jnumber; 156 int i; 157 158 /* 159 * On Serengeti we expect to find 'N' in the unum string 160 * however, on Starcat 'N' does not appear in the unum string. 161 * We do not want this code to break at this point, so the 162 * unum_ptr is reset to the start of unum string if we fail 163 * to find an 'N'. 164 */ 165 unum_ptr = strchr(unum_ptr, 'N'); 166 if (unum_ptr == NULL) { 167 ecc_error_data.node_no = 0; 168 unum_ptr = unum; 169 } else { 170 unum_ptr++; 171 ecc_error_data.node_no = stoi(&unum_ptr); 172 } 173 174 /* 175 * Now pull out the SB number 176 */ 177 unum_ptr = strstr(unum_ptr, "SB"); 178 CHECK_UNUM; 179 unum_ptr += 2; 180 board_num = stoi(&unum_ptr); 181 182 /* 183 * Now pull out the Proc position (relative to the board) 184 */ 185 unum_ptr = strchr(unum_ptr, 'P'); 186 CHECK_UNUM; 187 unum_ptr++; 188 proc_position = stoi(&unum_ptr); 189 190 /* 191 * Using the SB number and Proc position we create a FRU 192 * cpu id. 193 */ 194 ecc_error_data.proc_num = 195 plat_make_fru_cpuid(board_num, 0, proc_position); 196 197 /* 198 * Now pull out the Memory Bank number 199 */ 200 unum_ptr = strchr(unum_ptr, 'B'); 201 CHECK_UNUM; 202 unum_ptr++; 203 ecc_error_data.bank_no = (stoi(&unum_ptr) & 0x01); 204 205 /* 206 * Now pull out the Dimm number within the Memory Bank. 207 */ 208 unum_ptr = strchr(unum_ptr, 'D'); 209 CHECK_UNUM; 210 unum_ptr++; 211 ecc_error_data.ecache_dimm_no = (stoi(&unum_ptr) & 0x03); 212 213 /* 214 * Now pull out the J-number. 215 */ 216 unum_ptr = strchr(unum_ptr, 'J'); 217 CHECK_UNUM; 218 unum_ptr++; 219 for (i = PLAT_ECC_JNUMBER_LENGTH; 220 i > 0 && *unum_ptr >= '0' && *unum_ptr <= '9'; i--) 221 *jno_ptr++ = *unum_ptr++; 222 *jno_ptr = NULL; 223 224 /* 225 * If we get here, we can assume the unum is valid 226 */ 227 invalid_unum = 0; 228 break; 229 } 230 case PLAT_ECC_ECACHE: { 231 /* 232 * The unum string is expected to be in this form: 233 * "[/N0/][SB|IO]12/P0/E0 J13500, ..." 234 * for serengeti. As this code is shared with Starcat 235 * if N is missing then it is set to 0. IO may only appear 236 * on Starcats. From that we will extract the bank number, 237 * dimm number, and Jnumber. 238 */ 239 char *unum_ptr = unum; 240 char *jno_ptr = ecc_error_data.Jnumber; 241 int is_maxcat = 0; 242 int i; 243 244 /* 245 * On Serengeti we expect to find 'N' in the unum string 246 * however, on Starcat 'N' does not appear in the unum string. 247 * We do not want this code to break at this point, so the 248 * unum_ptr is reset to the start of unum string if we fail 249 * to find an 'N'. 250 */ 251 unum_ptr = strchr(unum_ptr, 'N'); 252 if (unum_ptr == NULL) { 253 ecc_error_data.node_no = 0; 254 unum_ptr = unum; 255 } else { 256 unum_ptr++; 257 ecc_error_data.node_no = stoi(&unum_ptr); 258 } 259 260 /* 261 * Now pull out the SB/IO number 262 */ 263 unum_ptr = strstr(unum_ptr, "SB"); 264 if (unum_ptr == NULL) { 265 266 /* 267 * Since this is an E$ error, it must have occurred on 268 * either a System Board (represented by "SB" in the 269 * unum string) or a Maxcat board ("IO" in the unum 270 * string). Since we failed the "SB" check, we'll 271 * assume this is a maxcat board. 272 */ 273 is_maxcat = 1; 274 unum_ptr = strstr(unum, "IO"); 275 } 276 CHECK_UNUM; 277 unum_ptr += 2; 278 board_num = stoi(&unum_ptr); 279 280 /* 281 * Now pull out the Proc position (relative to the board) 282 */ 283 unum_ptr = strchr(unum_ptr, 'P'); 284 CHECK_UNUM; 285 unum_ptr++; 286 proc_position = stoi(&unum_ptr); 287 288 /* 289 * Using the SB/IO number, slot 0/1 value (is_maxcat), and 290 * proc position, we create the cpu id. 291 */ 292 ecc_error_data.proc_num = plat_make_fru_cpuid(board_num, 293 is_maxcat, proc_position); 294 295 ecc_error_data.bank_no = 0; /* not used */ 296 297 unum_ptr = strchr(unum_ptr, 'E'); 298 CHECK_UNUM; 299 unum_ptr++; 300 ecc_error_data.ecache_dimm_no = (stoi(&unum_ptr) & 0x01); 301 302 unum_ptr = strchr(unum_ptr, 'J'); 303 CHECK_UNUM; 304 unum_ptr++; 305 for (i = PLAT_ECC_JNUMBER_LENGTH; 306 i > 0 && *unum_ptr >= '0' && *unum_ptr <= '9'; i--) 307 *jno_ptr++ = *unum_ptr++; 308 *jno_ptr = NULL; 309 310 /* 311 * If we get here, we can assume the unum is valid 312 */ 313 invalid_unum = 0; 314 break; 315 } 316 default: 317 /* 318 * Unknown error 319 */ 320 break; 321 } 322 323 /* 324 * This is where CHECK_UNUM goes when it finds an error 325 */ 326 327 if (ECC_SYND_DATA_BEGIN <= synd_code && 328 synd_code < ECC_SYND_ECC_BEGIN) { 329 ecc_error_data.error_type = PLAT_ERROR_TYPE_SINGLE; 330 ecc_error_data.databit_type = PLAT_BIT_TYPE_DATA; 331 ecc_error_data.databit_no = synd_code; 332 } else if (ECC_SYND_ECC_BEGIN <= synd_code && 333 synd_code < ECC_SYND_MTAG_BEGIN) { 334 ecc_error_data.error_type = PLAT_ERROR_TYPE_SINGLE; 335 ecc_error_data.databit_type = PLAT_BIT_TYPE_ECC; 336 ecc_error_data.databit_no = synd_code - ECC_SYND_ECC_BEGIN; 337 } else if (ECC_SYND_MTAG_BEGIN <= synd_code && 338 synd_code < ECC_SYND_MECC_BEGIN) { 339 ecc_error_data.error_type = PLAT_ERROR_TYPE_SINGLE; 340 ecc_error_data.databit_type = PLAT_BIT_TYPE_MTAG_D; 341 ecc_error_data.databit_no = synd_code - ECC_SYND_MTAG_BEGIN; 342 } else if (ECC_SYND_MECC_BEGIN <= synd_code && 343 synd_code < ECC_SYND_M2) { 344 ecc_error_data.error_type = PLAT_ERROR_TYPE_SINGLE; 345 ecc_error_data.databit_type = PLAT_BIT_TYPE_MTAG_E; 346 ecc_error_data.databit_no = synd_code - ECC_SYND_MECC_BEGIN; 347 } else { 348 switch (synd_code) { 349 case ECC_SYND_M2: 350 ecc_error_data.error_type = PLAT_ERROR_TYPE_M2; 351 break; 352 case ECC_SYND_M3: 353 ecc_error_data.error_type = PLAT_ERROR_TYPE_M3; 354 break; 355 case ECC_SYND_M4: 356 ecc_error_data.error_type = PLAT_ERROR_TYPE_M4; 357 break; 358 case ECC_SYND_M: 359 ecc_error_data.error_type = PLAT_ERROR_TYPE_M; 360 break; 361 default: 362 ecc_error_data.error_type = PLAT_ERROR_TYPE_UNK; 363 break; 364 } 365 ecc_error_data.databit_type = PLAT_BIT_TYPE_MULTI; 366 ecc_error_data.databit_no = 0; /* not used */ 367 } 368 369 #ifdef DEBUG 370 if (invalid_unum && 371 (ecc_error_data.error_code != PLAT_ERROR_CODE_UE) && 372 unum && *unum) 373 cmn_err(CE_WARN, "Unexpected unum string format: %s\n", unum); 374 #endif 375 376 /* 377 * Send this data off as a mailbox message to the SC. 378 */ 379 (void) plat_send_ecc_mailbox_msg(PLAT_ECC_ERROR_MESSAGE, 380 &ecc_error_data); 381 } 382 383 /* 384 * The unum string for memory is expected to be in this form: 385 * "[/N0/]SB12/P0/B0/D2 [J13500]" 386 * Or if the unum was generated as the result of a UE: 387 * "[/N0/]SB12/P0/B0 [J13500, ...]" 388 * From that we will extract the board number, processor position, 389 * bank number and jnumber. 390 * 391 * Return (1) for an invalid unum string. If the unum is for an 392 * individual DIMM and there is no jnumber, jnumber will be set 393 * to -1 and the caller can decide if the unum is valid. This 394 * is because Serengeti does not have jnumbers for bank unums 395 * which may be used to create DIMM unums (e.g. for acquiring 396 * DIMM serial ids). 397 */ 398 399 int 400 parse_unum_memory(char *unum, int *board, int *pos, int *bank, int *dimm, 401 int *jnumber) 402 { 403 char *c; 404 405 if ((c = strstr(unum, "SB")) == NULL) 406 return (1); 407 c += 2; 408 *board = (uint8_t)stoi(&c); 409 410 if (*c++ != '/' || *c++ != 'P') 411 return (1); 412 *pos = stoi(&c); 413 414 if (*c++ != '/' || *c++ != 'B') 415 return (1); 416 *bank = stoi(&c); 417 418 if ((c = strchr(c, 'D')) == NULL) { 419 *dimm = -1; 420 *jnumber = 0; 421 return (0); 422 } 423 c++; 424 *dimm = stoi(&c); 425 426 if ((c = strchr(c, 'J')) == NULL) { 427 *jnumber = -1; 428 return (0); 429 } 430 431 c++; 432 *jnumber = (uint16_t)stoi(&c); 433 434 return (0); 435 } 436 437 /* 438 * The unum string for ecache is expected to be in this form: 439 * "[/N0/][SB|IO]12/P0/E0 J13500, ..." 440 * From that we will extract the board number, processor position and 441 * junmber. 442 * 443 * return (1) for any invalid unum string. 444 */ 445 static int 446 parse_unum_ecache(char *unum, int *board, int *pos, int *jnumber, int *maxcat) 447 { 448 char *c; 449 450 if ((c = strstr(unum, "SB")) == NULL) { 451 /* 452 * Since this is an E$ error, it must have occurred on 453 * either a System Board (represented by "SB" in the 454 * unum string) or a Maxcat board ("IO" in the unum 455 * string). 456 */ 457 if ((c = strstr(unum, "IO")) == NULL) 458 return (1); 459 *maxcat = 1; 460 } 461 462 c += 2; 463 *board = (uint8_t)stoi(&c); 464 465 if (*c++ != '/' || *c++ != 'P') 466 return (1); 467 *pos = stoi(&c); 468 469 if ((c = strchr(c, 'J')) == NULL) 470 return (1); 471 472 c++; 473 *jnumber = (uint16_t)stoi(&c); 474 475 return (0); 476 } 477 478 /* The following array maps the error to its corresponding set */ 479 static int plat_ecc_e2d_map[PLAT_ECC_ERROR2_NUMVALS] = { 480 PLAT_ECC_ERROR2_NONE, /* 0x00 */ 481 PLAT_ECC_ERROR2_SEND_L2_XXC, /* 0x01 */ 482 PLAT_ECC_ERROR2_SEND_L2_XXU, /* 0x02 */ 483 PLAT_ECC_ERROR2_SEND_L3_XXC, /* 0x03 */ 484 PLAT_ECC_ERROR2_SEND_L3_XXU, /* 0x04 */ 485 PLAT_ECC_ERROR2_SEND_MEM_ERRS, /* 0x05 */ 486 PLAT_ECC_ERROR2_SEND_MEM_ERRS, /* 0x06 */ 487 PLAT_ECC_ERROR2_SEND_MEM_ERRS, /* 0x07 */ 488 PLAT_ECC_ERROR2_SEND_BUS_ERRS, /* 0x08 */ 489 PLAT_ECC_ERROR2_SEND_BUS_ERRS, /* 0x09 */ 490 PLAT_ECC_ERROR2_SEND_BUS_ERRS, /* 0x0a */ 491 PLAT_ECC_ERROR2_SEND_BUS_ERRS, /* 0x0b */ 492 PLAT_ECC_ERROR2_SEND_L2_TAG_ERRS, /* 0x0c */ 493 PLAT_ECC_ERROR2_SEND_L2_TAG_ERRS, /* 0x0d */ 494 PLAT_ECC_ERROR2_SEND_L3_TAG_ERRS, /* 0x0e */ 495 PLAT_ECC_ERROR2_SEND_L3_TAG_ERRS, /* 0x0f */ 496 PLAT_ECC_ERROR2_SEND_L1_PARITY, /* 0x10 */ 497 PLAT_ECC_ERROR2_SEND_L1_PARITY, /* 0x11 */ 498 PLAT_ECC_ERROR2_SEND_TLB_PARITY, /* 0x12 */ 499 PLAT_ECC_ERROR2_SEND_TLB_PARITY, /* 0x13 */ 500 PLAT_ECC_ERROR2_SEND_IV_ERRS, /* 0x14 */ 501 PLAT_ECC_ERROR2_SEND_IV_ERRS, /* 0x15 */ 502 PLAT_ECC_ERROR2_SEND_MTAG_XXC, /* 0x16 */ 503 PLAT_ECC_ERROR2_SEND_IV_MTAG_XXC, /* 0x17 */ 504 PLAT_ECC_ERROR2_SEND_L3_XXC, /* 0x18 */ 505 PLAT_ECC_ERROR2_SEND_PCACHE /* 0x19 */ 506 }; 507 508 /* 509 * log enhanced error information to SC. 510 */ 511 void 512 plat_log_fruid_error2(int msg_type, char *unum, struct async_flt *aflt, 513 plat_ecc_ch_async_flt_t *ecc_ch_flt) 514 { 515 plat_ecc_error2_data_t e2d = {0}; 516 int board, pos, bank, dimm, jnumber; 517 int maxcat = 0; 518 uint16_t flags; 519 520 /* Check the flags */ 521 flags = plat_ecc_e2d_map[msg_type]; 522 if ((ecc_error2_mailbox_flags & flags) == 0) 523 return; 524 525 /* Fill the header */ 526 e2d.ee2d_major_version = PLAT_ECC_ERROR2_VERSION_MAJOR; 527 e2d.ee2d_minor_version = PLAT_ECC_ERROR2_VERSION_MINOR; 528 e2d.ee2d_msg_type = PLAT_ECC_ERROR2_MESSAGE; 529 e2d.ee2d_msg_length = sizeof (plat_ecc_error2_data_t); 530 531 /* Fill the data */ 532 if (aflt->flt_in_memory) { 533 if (parse_unum_memory(unum, &board, &pos, &bank, &dimm, 534 &jnumber) || (dimm != -1 && jnumber == -1)) 535 return; 536 /* 537 * Using the SB number and Proc position we create a FRU 538 * cpu id. 539 */ 540 e2d.ee2d_owning_proc = plat_make_fru_cpuid(board, 0, pos); 541 e2d.ee2d_jnumber = jnumber; 542 e2d.ee2d_bank_number = bank; 543 } else if (aflt->flt_status & ECC_ECACHE) { 544 if (parse_unum_ecache(unum, &board, &pos, &jnumber, &maxcat)) 545 return; 546 /* 547 * Using the SB number and Proc position we create a FRU 548 * cpu id. 549 */ 550 e2d.ee2d_owning_proc = plat_make_fru_cpuid(board, maxcat, pos); 551 e2d.ee2d_jnumber = jnumber; 552 e2d.ee2d_bank_number = -1; 553 } else { 554 /* 555 * L1 Cache 556 */ 557 e2d.ee2d_owning_proc = aflt->flt_bus_id; 558 e2d.ee2d_jnumber = -1; 559 e2d.ee2d_bank_number = -1; 560 } 561 562 e2d.ee2d_type = (uint8_t)msg_type; 563 e2d.ee2d_afar_status = (uint8_t)ecc_ch_flt->ecaf_afar_status; 564 e2d.ee2d_synd_status = (uint8_t)ecc_ch_flt->ecaf_synd_status; 565 e2d.ee2d_detecting_proc = aflt->flt_bus_id; 566 e2d.ee2d_cpu_impl = cpunodes[e2d.ee2d_owning_proc].implementation; 567 e2d.ee2d_timestamp = aflt->flt_id; 568 e2d.ee2d_afsr = aflt->flt_stat; 569 e2d.ee2d_afar = aflt->flt_addr; 570 571 e2d.ee2d_sdw_afsr = ecc_ch_flt->ecaf_sdw_afsr; 572 e2d.ee2d_sdw_afar = ecc_ch_flt->ecaf_sdw_afar; 573 e2d.ee2d_afsr_ext = ecc_ch_flt->ecaf_afsr_ext; 574 e2d.ee2d_sdw_afsr_ext = ecc_ch_flt->ecaf_sdw_afsr_ext; 575 576 /* Send the message to SC */ 577 (void) plat_send_ecc_mailbox_msg(PLAT_ECC_ERROR2_MESSAGE, &e2d); 578 } 579 580 uint8_t ecc_indictment_mailbox_disable = PLAT_ECC_INDICTMENT_OK; 581 uint8_t ecc_indictment_mailbox_flags = PLAT_ECC_SEND_DEFAULT_INDICT; 582 583 /* 584 * We log all Solaris indictments of failing hardware. We pull the system 585 * board number and jnumber out of the unum string, and calculate the cpuid 586 * from some members of the unum string. The rest of the structure is filled 587 * in through the other arguments. The data structure is then passed to 588 * plat_ecc_dispatch_task(). This function should only be loaded into memory 589 * or called on platforms that define a plat_send_ecc_mailbox_msg() function. 590 */ 591 static int 592 plat_log_fruid_indictment(int msg_type, struct async_flt *aflt, char *unum) 593 { 594 plat_ecc_message_t *wrapperp; 595 plat_ecc_indict_msg_contents_t *contentsp; 596 char *unum_ptr; 597 int is_maxcat = 0; 598 599 switch (ecc_indictment_mailbox_disable) { 600 case (PLAT_ECC_INDICTMENT_OK): 601 case (PLAT_ECC_INDICTMENT_SUSPECT): 602 break; 603 case (PLAT_ECC_INDICTMENT_NO_SEND): 604 default: 605 return (ECONNREFUSED); 606 } 607 608 switch (msg_type) { 609 case (PLAT_ECC_INDICT_DIMM): 610 if ((ecc_indictment_mailbox_flags & 611 PLAT_ECC_SEND_DIMM_INDICT) == 0) 612 return (ECONNREFUSED); 613 break; 614 case (PLAT_ECC_INDICT_ECACHE_CORRECTABLES): 615 if ((ecc_indictment_mailbox_flags & 616 PLAT_ECC_SEND_ECACHE_XXC_INDICT) == 0) 617 return (ECONNREFUSED); 618 break; 619 case (PLAT_ECC_INDICT_ECACHE_UNCORRECTABLE): 620 if ((ecc_indictment_mailbox_flags & 621 PLAT_ECC_SEND_ECACHE_XXU_INDICT) == 0) 622 return (ECONNREFUSED); 623 break; 624 default: 625 return (ECONNREFUSED); 626 } 627 628 /* LINTED: E_TRUE_LOGICAL_EXPR */ 629 ASSERT(sizeof (plat_ecc_indictment_data_t) == PLAT_ECC_INDICT_SIZE); 630 631 wrapperp = (plat_ecc_message_t *) 632 kmem_zalloc(sizeof (plat_ecc_message_t), KM_SLEEP); 633 634 wrapperp->ecc_msg_status = PLAT_ECC_NO_MSG_ACTIVE; 635 wrapperp->ecc_msg_type = PLAT_ECC_INDICTMENT_MESSAGE; 636 wrapperp->ecc_msg_len = sizeof (plat_ecc_indictment_data_t); 637 wrapperp->ecc_msg_data = kmem_zalloc(wrapperp->ecc_msg_len, KM_SLEEP); 638 639 contentsp = &(((plat_ecc_indictment_data_t *) 640 wrapperp->ecc_msg_data)->msg_contents); 641 642 /* 643 * Find board_num, jnumber, and proc position from the unum string. 644 * Use the board number, is_maxcat, and proc position to calculate 645 * cpuid. 646 */ 647 unum_ptr = strstr(unum, "SB"); 648 if (unum_ptr == NULL) { 649 is_maxcat = 1; 650 unum_ptr = strstr(unum, "IO"); 651 if (unum_ptr == NULL) { 652 kmem_free(wrapperp->ecc_msg_data, 653 wrapperp->ecc_msg_len); 654 kmem_free(wrapperp, sizeof (plat_ecc_message_t)); 655 return (EINVAL); 656 } 657 } 658 unum_ptr += 2; 659 contentsp->board_num = (uint8_t)stoi(&unum_ptr); 660 661 unum_ptr = strchr(unum_ptr, 'P'); 662 if (unum_ptr == NULL) { 663 kmem_free(wrapperp->ecc_msg_data, wrapperp->ecc_msg_len); 664 kmem_free(wrapperp, sizeof (plat_ecc_message_t)); 665 return (EINVAL); 666 } 667 unum_ptr++; 668 contentsp->detecting_proc = 669 (uint16_t)plat_make_fru_cpuid(contentsp->board_num, is_maxcat, 670 stoi(&unum_ptr)); 671 672 unum_ptr = strchr(unum_ptr, 'J'); 673 if (unum_ptr == NULL) { 674 kmem_free(wrapperp->ecc_msg_data, wrapperp->ecc_msg_len); 675 kmem_free(wrapperp, sizeof (plat_ecc_message_t)); 676 return (EINVAL); 677 } 678 unum_ptr++; 679 contentsp->jnumber = (uint16_t)stoi(&unum_ptr); 680 681 /* 682 * Fill in the rest of the data 683 */ 684 contentsp->version = PLAT_ECC_INDICTMENT_VERSION; 685 contentsp->indictment_type = msg_type; 686 contentsp->indictment_uncertain = ecc_indictment_mailbox_disable; 687 contentsp->syndrome = aflt->flt_synd; 688 contentsp->afsr = aflt->flt_stat; 689 contentsp->afar = aflt->flt_addr; 690 691 /* 692 * Build the solaris_version string: 693 */ 694 (void) snprintf(contentsp->solaris_version, 695 PLAT_ECC_VERSION_LENGTH, "%s %s", utsname.release, utsname.version); 696 697 /* 698 * Send the data on to the queuing function 699 */ 700 return (plat_ecc_dispatch_task(wrapperp)); 701 } 702 703 /* The following array maps the indictment to its corresponding set */ 704 static int plat_ecc_i2d_map[PLAT_ECC_INDICT2_NUMVALS] = { 705 PLAT_ECC_INDICT2_NONE, /* 0x00 */ 706 PLAT_ECC_SEND_INDICT2_L2_XXU, /* 0x01 */ 707 PLAT_ECC_SEND_INDICT2_L2_XXC_SERD, /* 0x02 */ 708 PLAT_ECC_SEND_INDICT2_L2_TAG_SERD, /* 0x03 */ 709 PLAT_ECC_SEND_INDICT2_L3_XXU, /* 0x04 */ 710 PLAT_ECC_SEND_INDICT2_L3_XXC_SERD, /* 0x05 */ 711 PLAT_ECC_SEND_INDICT2_L3_TAG_SERD, /* 0x06 */ 712 PLAT_ECC_SEND_INDICT2_L1_SERD, /* 0x07 */ 713 PLAT_ECC_SEND_INDICT2_L1_SERD, /* 0x08 */ 714 PLAT_ECC_SEND_INDICT2_TLB_SERD, /* 0x09 */ 715 PLAT_ECC_SEND_INDICT2_TLB_SERD, /* 0x0a */ 716 PLAT_ECC_SEND_INDICT2_FPU, /* 0x0b */ 717 PLAT_ECC_SEND_INDICT2_PCACHE_SERD /* 0x0c */ 718 }; 719 720 static int 721 plat_log_fruid_indictment2(int msg_type, struct async_flt *aflt, char *unum) 722 { 723 plat_ecc_message_t *wrapperp; 724 plat_ecc_indictment2_data_t *i2d; 725 int board, pos, jnumber; 726 int maxcat = 0; 727 uint16_t flags; 728 729 /* 730 * If the unum is null or empty, skip parsing it 731 */ 732 if (unum && unum[0] != '\0') { 733 if (parse_unum_ecache(unum, &board, &pos, &jnumber, &maxcat)) 734 return (EINVAL); 735 } 736 737 if ((ecc_indictment_mailbox_disable != PLAT_ECC_INDICTMENT_OK) && 738 (ecc_indictment_mailbox_disable != PLAT_ECC_INDICTMENT_SUSPECT)) 739 return (ECONNREFUSED); 740 741 /* Check the flags */ 742 flags = plat_ecc_i2d_map[msg_type]; 743 if ((ecc_indictment2_mailbox_flags & flags) == 0) 744 return (ECONNREFUSED); 745 746 wrapperp = (plat_ecc_message_t *) 747 kmem_zalloc(sizeof (plat_ecc_message_t), KM_SLEEP); 748 749 /* Initialize the wrapper */ 750 wrapperp->ecc_msg_status = PLAT_ECC_NO_MSG_ACTIVE; 751 wrapperp->ecc_msg_type = PLAT_ECC_INDICTMENT2_MESSAGE; 752 wrapperp->ecc_msg_len = sizeof (plat_ecc_indictment2_data_t); 753 wrapperp->ecc_msg_data = kmem_zalloc(wrapperp->ecc_msg_len, KM_SLEEP); 754 755 i2d = (plat_ecc_indictment2_data_t *)wrapperp->ecc_msg_data; 756 757 /* Fill the header */ 758 i2d->ei2d_major_version = PLAT_ECC_INDICT2_MAJOR_VERSION; 759 i2d->ei2d_minor_version = PLAT_ECC_INDICT2_MINOR_VERSION; 760 i2d->ei2d_msg_type = PLAT_ECC_INDICTMENT2_MESSAGE; 761 i2d->ei2d_msg_length = sizeof (plat_ecc_indictment2_data_t); 762 763 /* Fill the data */ 764 if (unum && unum[0] != '\0') { 765 i2d->ei2d_arraigned_proc = plat_make_fru_cpuid(board, maxcat, 766 pos); 767 i2d->ei2d_board_num = board; 768 i2d->ei2d_jnumber = jnumber; 769 } else { 770 i2d->ei2d_arraigned_proc = aflt->flt_inst; 771 i2d->ei2d_board_num = (uint8_t) 772 plat_make_fru_boardnum(i2d->ei2d_arraigned_proc); 773 i2d->ei2d_jnumber = -1; 774 } 775 776 i2d->ei2d_type = msg_type; 777 i2d->ei2d_uncertain = ecc_indictment_mailbox_disable; 778 i2d->ei2d_cpu_impl = cpunodes[i2d->ei2d_arraigned_proc].implementation; 779 i2d->ei2d_timestamp = aflt->flt_id; 780 781 /* 782 * Send the data on to the queuing function 783 */ 784 return (plat_ecc_dispatch_task(wrapperp)); 785 } 786 787 int 788 plat_ecc_capability_send(void) 789 { 790 plat_ecc_message_t *wrapperp; 791 plat_capability_data_t *cap; 792 int ver_len; 793 794 wrapperp = kmem_zalloc(sizeof (plat_ecc_message_t), KM_SLEEP); 795 796 ver_len = strlen(utsname.release) + strlen(utsname.version) + 2; 797 798 /* Initialize the wrapper */ 799 wrapperp->ecc_msg_status = PLAT_ECC_NO_MSG_ACTIVE; 800 wrapperp->ecc_msg_type = PLAT_ECC_CAPABILITY_MESSAGE; 801 wrapperp->ecc_msg_len = sizeof (plat_capability_data_t) + ver_len; 802 wrapperp->ecc_msg_data = kmem_zalloc(wrapperp->ecc_msg_len, KM_SLEEP); 803 804 cap = (plat_capability_data_t *)wrapperp->ecc_msg_data; 805 806 /* Fill the header */ 807 cap->capd_major_version = PLAT_ECC_CAP_VERSION_MAJOR; 808 cap->capd_minor_version = PLAT_ECC_CAP_VERSION_MINOR; 809 cap->capd_msg_type = PLAT_ECC_CAPABILITY_MESSAGE; 810 cap->capd_msg_length = wrapperp->ecc_msg_len; 811 812 /* Set the default domain capability */ 813 cap->capd_capability = PLAT_ECC_CAPABILITY_DOMAIN_DEFAULT; 814 815 /* 816 * Build the solaris_version string: 817 * utsname.release + " " + utsname.version 818 */ 819 (void) snprintf(cap->capd_solaris_version, ver_len, "%s %s", 820 utsname.release, utsname.version); 821 822 /* 823 * Send the data on to the queuing function 824 */ 825 return (plat_ecc_dispatch_task(wrapperp)); 826 } 827 828 int 829 plat_ecc_capability_sc_get(int type) 830 { 831 switch (type) { 832 case PLAT_ECC_ERROR_MESSAGE: 833 if (ecc_log_fruid_enable && 834 (!(plat_ecc_capability_map_sc & 835 PLAT_ECC_CAPABILITY_ERROR2))) 836 return (1); 837 break; 838 case PLAT_ECC_ERROR2_MESSAGE: 839 if (plat_ecc_capability_map_sc & 840 PLAT_ECC_CAPABILITY_ERROR2) 841 return (1); 842 break; 843 case PLAT_ECC_INDICTMENT_MESSAGE: 844 if (!(plat_ecc_capability_map_sc & 845 PLAT_ECC_CAPABILITY_INDICT2) || 846 !(plat_ecc_capability_map_domain & 847 PLAT_ECC_CAPABILITY_FMA)) 848 return (1); 849 break; 850 case PLAT_ECC_INDICTMENT2_MESSAGE: 851 if (plat_ecc_capability_map_sc & 852 PLAT_ECC_CAPABILITY_INDICT2) 853 return (1); 854 break; 855 case PLAT_ECC_DIMM_SID_MESSAGE: 856 if (plat_ecc_capability_map_sc & 857 PLAT_ECC_CAPABILITY_DIMM_SID) 858 return (1); 859 default: 860 return (0); 861 } 862 return (0); 863 } 864 865 int plat_ecc_cap_sc_set_cnt = 0; 866 867 void 868 plat_ecc_capability_sc_set(uint32_t cap) 869 { 870 plat_ecc_capability_map_sc = cap; 871 872 if (!plat_ecc_cap_sc_set_cnt && (cap & PLAT_ECC_CAPABILITY_DIMM_SID)) 873 if (p2init_sid_cache) 874 p2init_sid_cache(); 875 876 plat_ecc_cap_sc_set_cnt++; 877 } 878 879 /* 880 * The following table represents mapping between the indictment1 reason 881 * to its type. 882 */ 883 884 static plat_ecc_bl_map_t plat_ecc_bl_map_v1[] = { 885 { "l2cachedata", PLAT_ECC_INDICT_ECACHE_CORRECTABLES }, 886 { "l3cachedata", PLAT_ECC_INDICT_ECACHE_CORRECTABLES }, 887 { "l2cachedata", PLAT_ECC_INDICT_ECACHE_UNCORRECTABLE }, 888 { "l3cachedata", PLAT_ECC_INDICT_ECACHE_UNCORRECTABLE } 889 }; 890 891 /* 892 * The following table represents mapping between the indictment2 reason 893 * to its type. 894 */ 895 896 static plat_ecc_bl_map_t plat_ecc_bl_map_v2[] = { 897 { "l2cachedata", PLAT_ECC_INDICT2_L2_SERD }, 898 { "l3cachedata", PLAT_ECC_INDICT2_L3_SERD }, 899 { "l2cachedata", PLAT_ECC_INDICT2_L2_UE }, 900 { "l3cachedata", PLAT_ECC_INDICT2_L3_UE }, 901 { "l2cachetag", PLAT_ECC_INDICT2_L2_TAG_SERD }, 902 { "l3cachetag", PLAT_ECC_INDICT2_L3_TAG_SERD }, 903 { "icache", PLAT_ECC_INDICT2_ICACHE_SERD }, 904 { "dcache", PLAT_ECC_INDICT2_DCACHE_SERD }, 905 { "pcache", PLAT_ECC_INDICT2_PCACHE_SERD }, 906 { "itlb", PLAT_ECC_INDICT2_ITLB_SERD }, 907 { "dtlb", PLAT_ECC_INDICT2_DTLB_SERD }, 908 { "fpu", PLAT_ECC_INDICT2_FPU } 909 }; 910 911 /* 912 * The following function returns the indictment type for a given version 913 */ 914 static int 915 flt_name_to_msg_type(const char *fault, int indict_version) 916 { 917 plat_ecc_bl_map_t *mapp; 918 char *fltnm = "fault.cpu."; 919 int mapsz; 920 char *p; 921 int i; 922 923 /* Check if it starts with proper fault name */ 924 if (strncmp(fault, fltnm, strlen(fltnm)) != 0) 925 return (PLAT_ECC_INDICT_NONE); 926 927 fault += strlen(fltnm); /* c = "ultraSPARC-IV.icache" */ 928 929 /* Skip the cpu type */ 930 if ((p = strchr(fault, '.')) == NULL) 931 return (PLAT_ECC_INDICT_NONE); 932 933 p++; /* skip the "." */ 934 935 if (indict_version == 0) { 936 mapp = plat_ecc_bl_map_v1; 937 mapsz = sizeof (plat_ecc_bl_map_v1) / 938 sizeof (plat_ecc_bl_map_t); 939 } else { 940 mapp = plat_ecc_bl_map_v2; 941 mapsz = sizeof (plat_ecc_bl_map_v2) / 942 sizeof (plat_ecc_bl_map_t); 943 } 944 for (i = 0; i < mapsz; i++) { 945 if (strcmp(p, mapp[i].ebm_reason) == 0) { 946 return (mapp[i].ebm_type); 947 } 948 } 949 return (PLAT_ECC_INDICT_NONE); 950 } 951 952 /* 953 * Blacklisting 954 */ 955 int 956 plat_blacklist(int cmd, const char *scheme, nvlist_t *fmri, const char *class) 957 { 958 struct async_flt aflt; 959 char *unum; 960 int msg_type, is_old_indict; 961 962 if (fmri == NULL) 963 return (EINVAL); 964 if (cmd != BLIOC_INSERT) 965 return (ENOTSUP); 966 967 /* 968 * We support both the blacklisting of CPUs via mem-schemed 969 * FMRIs that name E$ J-numbers, and CPUs via cpu-schemed FMRIs 970 * that name the cpuid. 971 */ 972 if (strcmp(scheme, FM_FMRI_SCHEME_MEM) == 0) { 973 if (nvlist_lookup_string(fmri, FM_FMRI_MEM_UNUM, &unum)) 974 return (EINVAL); 975 aflt.flt_inst = -1; 976 } else if (strcmp(scheme, FM_FMRI_SCHEME_CPU) == 0) { 977 if (nvlist_lookup_uint32(fmri, FM_FMRI_CPU_ID, &aflt.flt_inst)) 978 return (EINVAL); 979 unum = NULL; 980 } else { 981 return (ENOTSUP); 982 } 983 984 /* 985 * If the SC cannot handle indictment2, so fall back to old one. 986 * Also if the domain does not support FMA, then send only the old one. 987 */ 988 989 is_old_indict = plat_ecc_capability_sc_get(PLAT_ECC_INDICTMENT_MESSAGE); 990 991 if (is_old_indict) 992 msg_type = flt_name_to_msg_type(class, 0); 993 else 994 msg_type = flt_name_to_msg_type(class, 1); 995 996 if (msg_type == PLAT_ECC_INDICT_NONE) 997 return (ENOTSUP); 998 999 /* 1000 * The current blacklisting interfaces are designed for a world where 1001 * the SC is much more involved in the diagnosis and error reporting 1002 * process than it is in the FMA world. As such, the existing 1003 * interfaces want all kinds of information about the error that's 1004 * triggering the blacklist. In the FMA world, we don't have access 1005 * to any of that information by the time we're doing the blacklist, 1006 * so we fake values. 1007 */ 1008 aflt.flt_id = gethrtime(); 1009 aflt.flt_addr = -1; 1010 aflt.flt_stat = -1; 1011 aflt.flt_synd = -1; 1012 1013 if (is_old_indict) { 1014 if (unum && unum[0] != '\0') 1015 return (plat_log_fruid_indictment(msg_type, &aflt, 1016 unum)); 1017 else 1018 return (ENOTSUP); 1019 } else { 1020 return (plat_log_fruid_indictment2(msg_type, &aflt, unum)); 1021 } 1022 } 1023 1024 static kcondvar_t plat_ecc_condvar; 1025 static kmutex_t plat_ecc_mutex; 1026 static taskq_t *plat_ecc_taskq; 1027 1028 /* 1029 * plat_ecc_dispatch_task: Dispatch the task on a taskq and wait for the 1030 * return value. We use cv_wait_sig to wait for the return values. If a 1031 * signal interrupts us, we return EINTR. Otherwise, we return the value 1032 * returned by the mailbox functions. 1033 * 1034 * To avoid overloading the lower-level mailbox routines, we use a taskq 1035 * to serialize all messages. Currently, it is expected that only one 1036 * process (fmd) will use this ioctl, so the delay caused by the taskq 1037 * should not have much of an effect. 1038 */ 1039 int 1040 plat_ecc_dispatch_task(plat_ecc_message_t *msg) 1041 { 1042 int ret; 1043 1044 ASSERT(msg != NULL); 1045 ASSERT(plat_ecc_taskq != NULL); 1046 1047 if (taskq_dispatch(plat_ecc_taskq, plat_ecc_send_msg, 1048 (void *)msg, TQ_NOSLEEP) == NULL) { 1049 kmem_free(msg->ecc_msg_data, msg->ecc_msg_len); 1050 kmem_free(msg, sizeof (plat_ecc_message_t)); 1051 return (ENOMEM); 1052 } 1053 mutex_enter(&plat_ecc_mutex); 1054 1055 /* 1056 * It's possible that the taskq function completed before we 1057 * acquired the mutex. Check for this first. If this did not 1058 * happen, we wait for the taskq function to signal us, or an 1059 * interrupt. We also check ecc_msg_status to protect against 1060 * spurious wakeups from cv_wait_sig. 1061 */ 1062 if (msg->ecc_msg_status == PLAT_ECC_MSG_SENT) { 1063 ret = msg->ecc_msg_ret; 1064 kmem_free(msg->ecc_msg_data, msg->ecc_msg_len); 1065 kmem_free(msg, sizeof (plat_ecc_message_t)); 1066 } else { 1067 msg->ecc_msg_status = PLAT_ECC_TASK_DISPATCHED; 1068 1069 while ((ret = cv_wait_sig(&plat_ecc_condvar, 1070 &plat_ecc_mutex)) != 0 && 1071 msg->ecc_msg_status == PLAT_ECC_TASK_DISPATCHED) 1072 ; 1073 1074 if ((ret == 0) && (msg->ecc_msg_status != PLAT_ECC_MSG_SENT)) { 1075 /* An interrupt was received */ 1076 msg->ecc_msg_status = PLAT_ECC_INTERRUPT_RECEIVED; 1077 ret = EINTR; 1078 } else { 1079 ret = msg->ecc_msg_ret; 1080 kmem_free(msg->ecc_msg_data, msg->ecc_msg_len); 1081 kmem_free(msg, sizeof (plat_ecc_message_t)); 1082 } 1083 } 1084 mutex_exit(&plat_ecc_mutex); 1085 return (ret); 1086 } 1087 1088 static void 1089 plat_ecc_send_msg(void *arg) 1090 { 1091 plat_ecc_message_t *msg = arg; 1092 int ret; 1093 1094 /* 1095 * Send this data off as a mailbox message to the SC. 1096 */ 1097 ret = plat_send_ecc_mailbox_msg(msg->ecc_msg_type, msg->ecc_msg_data); 1098 1099 mutex_enter(&plat_ecc_mutex); 1100 1101 /* 1102 * If the dispatching function received an interrupt, don't bother 1103 * signalling it, and throw away the results. Otherwise, set the 1104 * return value and signal the condvar. 1105 */ 1106 if (msg->ecc_msg_status == PLAT_ECC_INTERRUPT_RECEIVED) { 1107 kmem_free(msg->ecc_msg_data, msg->ecc_msg_len); 1108 kmem_free(msg, sizeof (plat_ecc_message_t)); 1109 } else { 1110 msg->ecc_msg_ret = ret; 1111 msg->ecc_msg_status = PLAT_ECC_MSG_SENT; 1112 cv_broadcast(&plat_ecc_condvar); 1113 } 1114 1115 mutex_exit(&plat_ecc_mutex); 1116 } 1117 1118 void 1119 plat_ecc_init(void) 1120 { 1121 int bd; 1122 1123 mutex_init(&plat_ecc_mutex, NULL, MUTEX_DEFAULT, NULL); 1124 cv_init(&plat_ecc_condvar, NULL, CV_DEFAULT, NULL); 1125 plat_ecc_taskq = taskq_create("plat_ecc_taskq", 1, minclsyspri, 1126 PLAT_ECC_TASKQ_MIN, PLAT_ECC_TASKQ_MAX, TASKQ_PREPOPULATE); 1127 ASSERT(plat_ecc_taskq != NULL); 1128 1129 for (bd = 0; bd < plat_max_cpumem_boards(); bd++) { 1130 mutex_init(&domain_dimm_sids[bd].pdsb_lock, 1131 NULL, MUTEX_DEFAULT, NULL); 1132 } 1133 1134 } 1135