/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Support routines for managing per-Lxcache state.
 */

#include <cmd_Lxcache.h>
#include <cmd_mem.h>
#include <cmd_cpu.h>
#include <cmd.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <strings.h>
#include <fm/fmd_api.h>
#include <sys/fm/protocol.h>
#include <sys/cheetahregs.h>
#include <sys/mem_cache.h>

#define PN_ECSTATE_NA       5

/*
 * These values are our threshold values for SERDing CPUs based on the
 * number of times we have retired a cache line for each category.
 */
#define CMD_CPU_SERD_AGG_1  64
#define CMD_CPU_SERD_AGG_2  64

static int8_t cmd_lowest_way[16] = {
/*  0x0 0x1 0x2 0x3 0x4 0x5 0x6 0x7 0x8 0x9 0xa 0xb 0xc 0xd 0xe 0xf */
    -1,  0,  1,  0,  2,  0,  1,  0,  3,  0,  1,  0,  2,  0,  1,  0 };
static int cmd_num_of_bits[16] = {
/*  0x0 0x1 0x2 0x3 0x4 0x5 0x6 0x7 0x8 0x9 0xa 0xb 0xc 0xd 0xe 0xf */
     0,  1,  1,  2,  1,  2,  2,  3,  1,  2,  2,  3,  2,  3,  3,  4 };
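
/*
 * For illustration: cmd_lowest_way maps a 4-bit mask of candidate ways to
 * the lowest way number present in the mask, and cmd_num_of_bits counts the
 * bits set in the mask.  For example, a mask of 0x6 (ways 1 and 2) yields
 * cmd_lowest_way[0x6] == 1 and cmd_num_of_bits[0x6] == 2, while an empty
 * mask yields cmd_lowest_way[0x0] == -1, meaning no way is available.
 */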

void
cmd_Lxcache_write(fmd_hdl_t *hdl, cmd_Lxcache_t *Lxcache)
{
    fmd_buf_write(hdl, NULL, Lxcache->Lxcache_bufname, Lxcache,
        sizeof (cmd_Lxcache_pers_t));
}

const char *
cmd_type_to_str(cmd_ptrsubtype_t pstype)
{
    switch (pstype) {
    case CMD_PTR_CPU_L2DATA:
        return ("l2data");
    case CMD_PTR_CPU_L3DATA:
        return ("l3data");
    case CMD_PTR_CPU_L2TAG:
        return ("l2tag");
    case CMD_PTR_CPU_L3TAG:
        return ("l3tag");
    default:
        return ("unknown");
    }
}

const char *
cmd_flags_to_str(int flags)
{
    switch (flags) {
    case CMD_LxCACHE_F_ACTIVE:
        return ("ACTIVE");
    case CMD_LxCACHE_F_FAULTING:
        return ("FAULTING");
    case CMD_LxCACHE_F_RETIRED:
        return ("RETIRED");
    case CMD_LxCACHE_F_UNRETIRED:
        return ("UNRETIRED");
    case CMD_LxCACHE_F_RERETIRED:
        return ("RERETIRED");
    default:
        return ("Unknown_flags");
    }
}

const char *
cmd_reason_to_str(int reason)
{
    switch (reason) {
    case CMD_LXSUSPECT_DATA:
        return ("SUSPECT_DATA");
    case CMD_LXSUSPECT_0_TAG:
        return ("SUSPECT_0_TAG");
    case CMD_LXSUSPECT_1_TAG:
        return ("SUSPECT_1_TAG");
    case CMD_LXCONVICTED:
        return ("CONVICTED");
    case CMD_LXFUNCTIONING:
        return ("FUNCTIONING");
    default:
        return ("Unknown_reason");
    }
}

static void
cmd_pretty_print_Lxcache(fmd_hdl_t *hdl, cmd_Lxcache_t *Lxcache)
{
    fmd_hdl_debug(hdl,
        "\n"
        "    cpu    = %s\n"
        "    type   = %s\n"
        "    index  = %d\n"
        "    way    = %d\n"
        "    bit    = %d\n"
        "    reason = %s\n"
        "    flags  = %s\n",
        Lxcache->Lxcache_cpu_bufname,
        cmd_type_to_str(Lxcache->Lxcache_type),
        Lxcache->Lxcache_index,
        Lxcache->Lxcache_way,
        Lxcache->Lxcache_bit,
        cmd_reason_to_str(Lxcache->Lxcache_reason),
        cmd_flags_to_str(Lxcache->Lxcache_flags));
}

void
cmd_Lxcache_free(fmd_hdl_t *hdl, cmd_cpu_t *cpu, cmd_Lxcache_t *Lxcache,
    int destroy)
{
    cmd_case_t *cc = &Lxcache->Lxcache_case;

    fmd_hdl_debug(hdl, "Entering cmd_Lxcache_free for %s destroy = %d\n",
        Lxcache->Lxcache_bufname, destroy);

    if (cc->cc_cp != NULL)
        cmd_case_fini(hdl, cc->cc_cp, destroy);
    if (cc->cc_serdnm != NULL) {
        if (fmd_serd_exists(hdl, cc->cc_serdnm) && destroy) {
            fmd_serd_destroy(hdl, cc->cc_serdnm);
            fmd_hdl_strfree(hdl, cc->cc_serdnm);
            cc->cc_serdnm = NULL;
        }
    }
    if (Lxcache->Lxcache_nvl) {
        nvlist_free(Lxcache->Lxcache_nvl);
        Lxcache->Lxcache_nvl = NULL;
    }
    /*
     * Clean up the SERD engine created to handle recheck of TAGS.
     * This SERD engine was created to save the event pointer.
     */
    if (Lxcache->Lxcache_serdnm != NULL) {
        if (fmd_serd_exists(hdl, Lxcache->Lxcache_serdnm) && destroy) {
            fmd_serd_destroy(hdl, Lxcache->Lxcache_serdnm);
            fmd_hdl_strfree(hdl, Lxcache->Lxcache_serdnm);
            Lxcache->Lxcache_serdnm = NULL;
        }
    }
    Lxcache->Lxcache_timeout_id = -1;
    Lxcache->Lxcache_ep = NULL;
    Lxcache->Lxcache_retry_count = 0;
    if (destroy)
        fmd_buf_destroy(hdl, NULL, Lxcache->Lxcache_bufname);
    cmd_fmri_fini(hdl, &Lxcache->Lxcache_asru, destroy);
    cmd_list_delete(&cpu->cpu_Lxcaches, Lxcache);
    fmd_hdl_free(hdl, Lxcache, sizeof (cmd_Lxcache_t));
}

void
cmd_Lxcache_destroy(fmd_hdl_t *hdl, cmd_cpu_t *cpu, cmd_Lxcache_t *Lxcache)
{
    cmd_Lxcache_free(hdl, cpu, Lxcache, FMD_B_TRUE);
}

cmd_Lxcache_t *
cmd_Lxcache_lookup_by_type_index_way_bit(cmd_cpu_t *cpu,
    cmd_ptrsubtype_t pstype, int32_t index, int8_t way, int16_t bit)
{
    cmd_Lxcache_t *Lxcache;

    for (Lxcache = cmd_list_next(&cpu->cpu_Lxcaches); Lxcache != NULL;
        Lxcache = cmd_list_next(Lxcache)) {
        if ((Lxcache->Lxcache_type == pstype) &&
            (Lxcache->Lxcache_index == (uint32_t)index) &&
            (Lxcache->Lxcache_way == (uint32_t)way) &&
            (Lxcache->Lxcache_bit == (uint16_t)bit))
            return (Lxcache);
    }

    return (NULL);
}
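
/*
 * For illustration, with hypothetical values (cpu_id 1, index 512, way 2,
 * bit 33): cmd_Lxcache_create() below would name the persistent buffer for
 * an l2data fault "Lxcache_l2data_1_512_2_33" (format
 * "Lxcache_%s_%d_%d_%d_%d") and the corresponding ASRU buffer
 * "l2data_asru_512_2_33" (format "%s_asru_%d_%d_%d").
 */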
cmd_Lxcache_t *
cmd_Lxcache_create(fmd_hdl_t *hdl, cmd_xr_t *xr, cmd_cpu_t *cpu,
    nvlist_t *modasru, cmd_ptrsubtype_t pstype, int32_t index,
    int8_t way, int16_t bit)
{
    cmd_Lxcache_t *Lxcache;
    nvlist_t *asru;
    const char *pstype_name;
    uint8_t fmri_Lxcache_type;

    pstype_name = cmd_type_to_str(pstype);
    fmd_hdl_debug(hdl,
        "\n%s:cpu_id %d:Creating new Lxcache for index=%d way=%d bit=%d\n",
        pstype_name, cpu->cpu_cpuid, index, way, bit);

    CMD_CPU_STAT_BUMP(cpu, Lxcache_creat);

    Lxcache = fmd_hdl_zalloc(hdl, sizeof (cmd_Lxcache_t), FMD_SLEEP);
    (void) strncpy(Lxcache->Lxcache_cpu_bufname,
        cpu->cpu_bufname, CMD_BUFNMLEN);
    Lxcache->Lxcache_nodetype = CMD_NT_LxCACHE;
    Lxcache->Lxcache_version = CMD_LxCACHE_VERSION;
    Lxcache->Lxcache_type = pstype;
    Lxcache->Lxcache_index = (uint32_t)index;
    Lxcache->Lxcache_way = (uint32_t)way;
    Lxcache->Lxcache_bit = (uint16_t)bit;
    Lxcache->Lxcache_reason = CMD_LXFUNCTIONING;
    Lxcache->Lxcache_flags = CMD_LxCACHE_F_ACTIVE;
    Lxcache->Lxcache_timeout_id = -1;
    Lxcache->Lxcache_retry_count = 0;
    Lxcache->Lxcache_nvl = NULL;
    Lxcache->Lxcache_ep = NULL;
    Lxcache->Lxcache_serdnm = NULL;
    Lxcache->Lxcache_clcode = 0;
    Lxcache->xr = xr;
    Lxcache->Lxcache_retired_fmri[0] = '\0';
    switch (pstype) {
    case CMD_PTR_CPU_L2DATA:
        fmri_Lxcache_type = FM_FMRI_CPU_CACHE_TYPE_L2;
        break;
    case CMD_PTR_CPU_L3DATA:
        fmri_Lxcache_type = FM_FMRI_CPU_CACHE_TYPE_L3;
        break;
    case CMD_PTR_CPU_L2TAG:
        fmri_Lxcache_type = FM_FMRI_CPU_CACHE_TYPE_L2;
        break;
    case CMD_PTR_CPU_L3TAG:
        fmri_Lxcache_type = FM_FMRI_CPU_CACHE_TYPE_L3;
        break;
    default:
        break;
    }

    cmd_bufname(Lxcache->Lxcache_bufname, sizeof (Lxcache->Lxcache_bufname),
        "Lxcache_%s_%d_%d_%d_%d", pstype_name, cpu->cpu_cpuid,
        index, way, bit);
    fmd_hdl_debug(hdl,
        "\n%s:cpu_id %d: new Lxcache name is %s\n",
        pstype_name, cpu->cpu_cpuid, Lxcache->Lxcache_bufname);
    if ((errno = nvlist_dup(modasru, &asru, 0)) != 0 ||
        (errno = nvlist_add_uint32(asru, FM_FMRI_CPU_CACHE_INDEX,
        index)) != 0 ||
        (errno = nvlist_add_uint32(asru, FM_FMRI_CPU_CACHE_WAY,
        (uint32_t)way)) != 0 ||
        (errno = nvlist_add_uint16(asru, FM_FMRI_CPU_CACHE_BIT,
        bit)) != 0 ||
        (errno = nvlist_add_uint8(asru, FM_FMRI_CPU_CACHE_TYPE,
        fmri_Lxcache_type)) != 0 ||
        (errno = fmd_nvl_fmri_expand(hdl, asru)) != 0)
        fmd_hdl_abort(hdl, "failed to build Lxcache fmri");
    asru->nvl_nvflag |= NV_UNIQUE_NAME_TYPE;

    cmd_fmri_init(hdl, &Lxcache->Lxcache_asru, asru,
        "%s_asru_%d_%d_%d", pstype_name, index, way, bit);

    nvlist_free(asru);

    cmd_list_append(&cpu->cpu_Lxcaches, Lxcache);
    cmd_Lxcache_write(hdl, Lxcache);

    return (Lxcache);
}

cmd_Lxcache_t *
cmd_Lxcache_lookup_by_index_way(cmd_cpu_t *cpu, cmd_ptrsubtype_t pstype,
    int32_t index, int8_t way)
{
    cmd_Lxcache_t *cache;

    for (cache = cmd_list_next(&cpu->cpu_Lxcaches); cache != NULL;
        cache = cmd_list_next(cache)) {
        if ((cache->Lxcache_index == (uint32_t)index) &&
            (cache->Lxcache_way == (uint32_t)way) &&
            (cache->Lxcache_type == pstype)) {
            return (cache);
        }
    }

    return (NULL);
}

static cmd_Lxcache_t *
Lxcache_wrapv1(fmd_hdl_t *hdl, cmd_Lxcache_pers_t *pers, size_t psz)
{
    cmd_Lxcache_t *Lxcache;

    if (psz != sizeof (cmd_Lxcache_pers_t)) {
        fmd_hdl_abort(hdl, "size of state doesn't match size of "
            "version 1 state (%u bytes).\n",
            sizeof (cmd_Lxcache_pers_t));
    }

    Lxcache = fmd_hdl_zalloc(hdl, sizeof (cmd_Lxcache_t), FMD_SLEEP);
    bcopy(pers, Lxcache, sizeof (cmd_Lxcache_pers_t));
    fmd_hdl_free(hdl, pers, psz);
    return (Lxcache);
}

void *
cmd_Lxcache_restore(fmd_hdl_t *hdl, fmd_case_t *cp, cmd_case_ptr_t *ptr)
{
    cmd_Lxcache_t *Lxcache;
    cmd_Lxcache_t *recovered_Lxcache;
    cmd_cpu_t *cpu;
    size_t Lxcachesz;
    char *serdnm;

    /*
     * We need to first extract the cpu name by reading directly
     * from fmd buffers in order to begin our search for the Lxcache in
     * the appropriate cpu list.
     * After we identify the cpu list using the buf name, we look
     * in that cpu list for our Lxcache state.
     */
    fmd_hdl_debug(hdl, "restoring Lxcache from %s\n", ptr->ptr_name);

    if ((Lxcachesz = fmd_buf_size(hdl, NULL, ptr->ptr_name)) == 0) {
        fmd_hdl_abort(hdl, "Lxcache referenced by case %s does "
            "not exist in saved state\n",
            fmd_case_uuid(hdl, cp));
    } else if (Lxcachesz != sizeof (cmd_Lxcache_pers_t)) {
        fmd_hdl_abort(hdl, "Lxcache buffer referenced by case %s "
            "is %d bytes. Expected size is %d bytes\n",
            fmd_case_uuid(hdl, cp), Lxcachesz,
            sizeof (cmd_Lxcache_pers_t));
    }

    if ((Lxcache = cmd_buf_read(hdl, NULL, ptr->ptr_name,
        Lxcachesz)) == NULL) {
        fmd_hdl_abort(hdl, "failed to read Lxcache buf %s",
            ptr->ptr_name);
    }
    cmd_pretty_print_Lxcache(hdl, Lxcache);

    fmd_hdl_debug(hdl, "found %d in version field\n",
        Lxcache->Lxcache_version);
    cpu = cmd_restore_cpu_only(hdl, cp, Lxcache->Lxcache_cpu_bufname);
    if (cpu == NULL) {
        fmd_hdl_debug(hdl,
            "\nCould not restore cpu %s\n",
            Lxcache->Lxcache_cpu_bufname);
        return (NULL);
    }
    recovered_Lxcache = Lxcache;    /* save the recovered Lxcache */

    for (Lxcache = cmd_list_next(&cpu->cpu_Lxcaches); Lxcache != NULL;
        Lxcache = cmd_list_next(Lxcache)) {
        if (strcmp(Lxcache->Lxcache_bufname, ptr->ptr_name) == 0)
            break;
    }

    if (Lxcache == NULL) {

        switch (recovered_Lxcache->Lxcache_version) {
        case CMD_LxCACHE_VERSION_1:
            Lxcache = Lxcache_wrapv1(hdl,
                (cmd_Lxcache_pers_t *)recovered_Lxcache,
                Lxcachesz);
            break;
        default:
            fmd_hdl_abort(hdl, "unknown version (found %d) "
                "for Lxcache state referenced by case %s.\n",
                recovered_Lxcache->Lxcache_version,
                fmd_case_uuid(hdl, cp));
            break;
        }

        cmd_fmri_restore(hdl, &Lxcache->Lxcache_asru);
        /*
         * We need to clean up the information associated with
         * the timeout routine because these are not checkpointed
         * and cannot be restored.
         */
        Lxcache->Lxcache_timeout_id = -1;
        Lxcache->Lxcache_retry_count = 0;
        Lxcache->Lxcache_nvl = NULL;
        Lxcache->Lxcache_ep = NULL;
        Lxcache->Lxcache_serdnm = NULL;

        cmd_list_append(&cpu->cpu_Lxcaches, Lxcache);
    }
    serdnm = cmd_Lxcache_serdnm_create(hdl, cpu->cpu_cpuid,
        Lxcache->Lxcache_type, Lxcache->Lxcache_index,
        Lxcache->Lxcache_way, Lxcache->Lxcache_bit);
    fmd_hdl_debug(hdl,
        "cpu_id %d: serdname for the case is %s\n",
        cpu->cpu_cpuid, serdnm);
    fmd_hdl_debug(hdl,
        "cpu_id %d: restoring the case for index %d way %d bit %d\n",
        cpu->cpu_cpuid, Lxcache->Lxcache_index,
        Lxcache->Lxcache_way, Lxcache->Lxcache_bit);
    cmd_case_restore(hdl, &Lxcache->Lxcache_case, cp, serdnm);

    return (Lxcache);
}

/*ARGSUSED*/
void
cmd_Lxcache_validate(fmd_hdl_t *hdl, cmd_cpu_t *cpu)
{
    cmd_Lxcache_t *Lxcache, *next;

    for (Lxcache = cmd_list_next(&cpu->cpu_Lxcaches);
        Lxcache != NULL; Lxcache = next) {
        next = cmd_list_next(Lxcache);

        if (fmd_nvl_fmri_unusable(hdl, Lxcache->Lxcache_asru_nvl)) {
            cmd_Lxcache_destroy(hdl, cpu, Lxcache);
        }
    }
}

void
cmd_Lxcache_dirty(fmd_hdl_t *hdl, cmd_Lxcache_t *Lxcache)
{
    if (fmd_buf_size(hdl, NULL, Lxcache->Lxcache_bufname) !=
        sizeof (cmd_Lxcache_pers_t))
        fmd_buf_destroy(hdl, NULL, Lxcache->Lxcache_bufname);

    /* No need to rewrite the FMRIs in the Lxcache - they don't change */
    fmd_buf_write(hdl, NULL,
        Lxcache->Lxcache_bufname, &Lxcache->Lxcache_pers,
        sizeof (cmd_Lxcache_pers_t));
}

void
cmd_Lxcache_fini(fmd_hdl_t *hdl, cmd_cpu_t *cpu)
{
    cmd_Lxcache_t *Lxcache;

    while ((Lxcache = cmd_list_next(&cpu->cpu_Lxcaches)) != NULL)
        cmd_Lxcache_free(hdl, cpu, Lxcache, FMD_B_FALSE);
}

char *
cmd_Lxcache_serdnm_create(fmd_hdl_t *hdl, uint32_t cpu_id,
    cmd_ptrsubtype_t pstype, int32_t index, int8_t way, int16_t bit)
{
    const char *fmt = "cpu_%d:%s_%d_%d_%d_serd";
    const char *serdbase;
    size_t sz;
    char *nm;

    serdbase = cmd_type_to_str(pstype);
    sz = (snprintf(NULL, 0, fmt, cpu_id, serdbase, index, way, bit) + 1);
    nm = fmd_hdl_alloc(hdl, sz, FMD_SLEEP);
    (void) snprintf(nm, sz, fmt, cpu_id, serdbase, index, way, bit);
    return (nm);
}

char *
cmd_Lxcache_anonymous_serdnm_create(fmd_hdl_t *hdl, uint32_t cpu_id,
    cmd_ptrsubtype_t pstype, int32_t index, int8_t way, int16_t bit)
{
    const char *fmt = "cpu_%d:%s_%d_%d_%d_anonymous_serd";
    const char *serdbase;
    size_t sz;
    char *nm;

    serdbase = cmd_type_to_str(pstype);
    sz = (snprintf(NULL, 0, fmt, cpu_id, serdbase, index, way, bit) + 1);
    nm = fmd_hdl_alloc(hdl, sz, FMD_SLEEP);
    (void) snprintf(nm, sz, fmt, cpu_id, serdbase, index, way, bit);
    return (nm);
}
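
/*
 * For illustration, with hypothetical values (cpu_id 1, pstype l3tag,
 * index 2048, way 3, bit 14), the two helpers above would produce the SERD
 * engine names "cpu_1:l3tag_2048_3_14_serd" and
 * "cpu_1:l3tag_2048_3_14_anonymous_serd" respectively.
 */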

/*
 * Count the number of SERD type 2 ways retired for a given cpu.
 * These are defined to be L3 cache data retirements.
 */
uint32_t
cmd_Lx_index_count_type2_ways(cmd_cpu_t *cpu)
{
    cmd_Lxcache_t *cache = NULL;
    uint32_t ret_count = 0;

    for (cache = cmd_list_next(&cpu->cpu_Lxcaches); cache != NULL;
        cache = cmd_list_next(cache)) {
        if ((cache->Lxcache_flags & CMD_LxCACHE_F_RETIRED) &&
            (cache->Lxcache_type == CMD_PTR_CPU_L3DATA)) {
            ret_count++;
        }
    }
    return (ret_count);
}

/*
 * Count the number of SERD type 1 ways retired for a given cpu.
 * These are defined to be L2 data, L2 tag and L3 tag retirements.
 */
uint32_t
cmd_Lx_index_count_type1_ways(cmd_cpu_t *cpu)
{
    cmd_Lxcache_t *cache = NULL;
    uint32_t ret_count = 0;

    for (cache = cmd_list_next(&cpu->cpu_Lxcaches); cache != NULL;
        cache = cmd_list_next(cache)) {
        if ((cache->Lxcache_flags & CMD_LxCACHE_F_RETIRED) &&
            ((cache->Lxcache_type == CMD_PTR_CPU_L2DATA) ||
            IS_TAG(cache->Lxcache_type))) {
            ret_count++;
        }
    }
    return (ret_count);
}

void
cmd_fault_the_cpu(fmd_hdl_t *hdl, cmd_cpu_t *cpu, cmd_ptrsubtype_t pstype,
    const char *fltnm)
{
    fmd_case_t *cp;
    const char *uuid;

    cp = cmd_case_create(hdl, &cpu->cpu_header, pstype, &uuid);
    fmd_hdl_debug(hdl,
        "\n%s:cpu_id %d Created case %s to retire CPU\n",
        fltnm, cpu->cpu_cpuid, uuid);

    if ((errno = fmd_nvl_fmri_expand(hdl, cpu->cpu_asru_nvl)) != 0)
        fmd_hdl_abort(hdl, "failed to build CPU fmri");

    cmd_cpu_create_faultlist(hdl, cp, cpu, fltnm, NULL, HUNDRED_PERCENT);
    fmd_case_solve(hdl, cp);
}

void
cmd_retire_cpu_if_limits_exceeded(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
    cmd_ptrsubtype_t pstype, const char *fltnm)
{
    int cpu_retired_1, cpu_retired_2;

    /* Retrieve the number of retired ways for each category */
    cpu_retired_1 = cmd_Lx_index_count_type1_ways(cpu);
    cpu_retired_2 = cmd_Lx_index_count_type2_ways(cpu);
    fmd_hdl_debug(hdl,
        "\n%s:CPU %d retired Type 1 way count is: %d\n",
        fltnm, cpu->cpu_cpuid, cpu_retired_1);
    fmd_hdl_debug(hdl, "\n%s:CPU %d retired Type 2 way count is: %d\n",
        fltnm, cpu->cpu_cpuid, cpu_retired_2);

    if (((cpu_retired_1 > CMD_CPU_SERD_AGG_1) ||
        (cpu_retired_2 > CMD_CPU_SERD_AGG_2)) &&
        (cpu->cpu_faulting != FMD_B_TRUE)) {
        cmd_fault_the_cpu(hdl, cpu, pstype, fltnm);
    }
}

void
cmd_Lxcache_fault(fmd_hdl_t *hdl, cmd_cpu_t *cpu, cmd_Lxcache_t *Lxcache,
    const char *fltnm, nvlist_t *rsrc, uint_t cert)
{
    char fltmsg[64];
    nvlist_t *flt;

    (void) snprintf(fltmsg, sizeof (fltmsg), "fault.cpu.%s.%s-line",
        cmd_cpu_type2name(hdl, cpu->cpu_type), fltnm);
    fmd_hdl_debug(hdl,
        "\n%s:cpu_id %d: fltmsg = %s\n",
        fltnm, cpu->cpu_cpuid, fltmsg);
    if (Lxcache->Lxcache_flags & CMD_LxCACHE_F_FAULTING) {
        return;
    }
    Lxcache->Lxcache_flags |= CMD_LxCACHE_F_FAULTING;
    flt = fmd_nvl_create_fault(hdl, fltmsg, cert,
        Lxcache->Lxcache_asru.fmri_nvl, cpu->cpu_fru_nvl, rsrc);
    if (nvlist_add_boolean_value(flt, FM_SUSPECT_MESSAGE, B_FALSE) != 0)
        fmd_hdl_abort(hdl, "failed to add no-message member to fault");

    fmd_hdl_debug(hdl,
        "\n%s:cpu_id %d: adding suspect list to case %s\n",
        fltnm, cpu->cpu_cpuid,
        fmd_case_uuid(hdl, Lxcache->Lxcache_case.cc_cp));
    fmd_case_add_suspect(hdl, Lxcache->Lxcache_case.cc_cp, flt);
    fmd_case_solve(hdl, Lxcache->Lxcache_case.cc_cp);
    if (Lxcache->Lxcache_retired_fmri[0] == 0) {
        if (cmd_fmri_nvl2str(hdl, Lxcache->Lxcache_asru.fmri_nvl,
            Lxcache->Lxcache_retired_fmri,
            sizeof (Lxcache->Lxcache_retired_fmri)) == -1)
            fmd_hdl_debug(hdl,
                "\n%s:cpu_id %d: Failed to save the"
                " retired fmri string\n",
                fltnm, cpu->cpu_cpuid);
        else
            fmd_hdl_debug(hdl,
                "\n%s:cpu_id %d:Saved the retired fmri string %s\n",
                fltnm, cpu->cpu_cpuid,
                Lxcache->Lxcache_retired_fmri);
    }
    Lxcache->Lxcache_flags &= ~(CMD_LxCACHE_F_FAULTING);
}

void
cmd_Lxcache_close(fmd_hdl_t *hdl, void *arg)
{
    cmd_cpu_t *cpu;
    cmd_Lxcache_t *Lxcache;
    cmd_case_t *cc;

    Lxcache = (cmd_Lxcache_t *)arg;
    fmd_hdl_debug(hdl, "cmd_Lxcache_close called for %s\n",
        Lxcache->Lxcache_bufname);
    cc = &Lxcache->Lxcache_case;

    for (cpu = cmd_list_next(&cmd.cmd_cpus); cpu != NULL;
        cpu = cmd_list_next(cpu)) {
        if (strcmp(cpu->cpu_bufname,
            Lxcache->Lxcache_cpu_bufname) == 0)
            break;
    }
    if (cpu == NULL)
        fmd_hdl_abort(hdl, "failed to find the cpu %s for %s\n",
            Lxcache->Lxcache_cpu_bufname,
            Lxcache->Lxcache_bufname);
    /*
     * We will destroy the case and SERD engine.
     * The rest will be destroyed when we retire the CPU;
     * until then we keep the Lxcache structures alive.
     */
    if (cc->cc_cp != NULL) {
        cmd_case_fini(hdl, cc->cc_cp, FMD_B_TRUE);
        cc->cc_cp = NULL;
    }
    if (cc->cc_serdnm != NULL) {
        if (fmd_serd_exists(hdl, cc->cc_serdnm))
            fmd_serd_destroy(hdl, cc->cc_serdnm);
        fmd_hdl_strfree(hdl, cc->cc_serdnm);
        cc->cc_serdnm = NULL;
    }
}

cmd_Lxcache_t *
cmd_Lxcache_lookup_by_timeout_id(id_t id)
{
    cmd_cpu_t *cpu;
    cmd_Lxcache_t *cmd_Lxcache;

    for (cpu = cmd_list_next(&cmd.cmd_cpus); cpu != NULL;
        cpu = cmd_list_next(cpu)) {
        for (cmd_Lxcache = cmd_list_next(&cpu->cpu_Lxcaches);
            cmd_Lxcache != NULL;
            cmd_Lxcache = cmd_list_next(cmd_Lxcache)) {
            if (cmd_Lxcache->Lxcache_timeout_id == id)
                return (cmd_Lxcache);
        }
    }
    return (NULL);
}

void
cmd_Lxcache_gc(fmd_hdl_t *hdl)
{
    cmd_cpu_t *cpu;

    for (cpu = cmd_list_next(&cmd.cmd_cpus); cpu != NULL;
        cpu = cmd_list_next(cpu))
        cmd_Lxcache_validate(hdl, cpu);
}

cmd_evdisp_t
get_tagdata(cmd_cpu_t *cpu, cmd_ptrsubtype_t pstype,
    int32_t index, uint64_t *tag_data)
{
    int fd;
    cache_info_t cache_info;

    fd = open(mem_cache_device, O_RDONLY);
    if (fd == -1) {
        (void) printf(
            "cpu_id = %d could not open %s to read tag info.\n",
            cpu->cpu_cpuid, mem_cache_device);
        return (CMD_EVD_BAD);
    }
    switch (pstype) {
    case CMD_PTR_CPU_L2TAG:
    case CMD_PTR_CPU_L2DATA:
        cache_info.cache = L2_CACHE_TAG;
        break;
    case CMD_PTR_CPU_L3TAG:
    case CMD_PTR_CPU_L3DATA:
        cache_info.cache = L3_CACHE_TAG;
        break;
    }
    cache_info.cpu_id = cpu->cpu_cpuid;
    cache_info.index = index;
    cache_info.datap = tag_data;
    cache_info.way = 0;

    if (test_mode) {
        if (ioctl(fd, MEM_CACHE_READ_ERROR_INJECTED_TAGS, &cache_info)
            == -1) {
            (void) printf("cpu_id = %d ioctl"
                " MEM_CACHE_READ_ERROR_INJECTED_TAGS failed"
                " errno = %d\n",
                cpu->cpu_cpuid, errno);
            (void) close(fd);
            return (CMD_EVD_BAD);
        }
    } else {
        if (ioctl(fd, MEM_CACHE_READ_TAGS, &cache_info)
            == -1) {
            (void) printf("cpu_id = %d ioctl"
                " MEM_CACHE_READ_TAGS failed"
                " errno = %d\n",
                cpu->cpu_cpuid, errno);
            (void) close(fd);
            return (CMD_EVD_BAD);
        }
    }
    (void) close(fd);
    return (CMD_EVD_OK);
}

int
get_index_retired_ways(cmd_cpu_t *cpu, cmd_ptrsubtype_t pstype, int32_t index)
{
    int i, retired_ways;
    uint64_t tag_data[PN_CACHE_NWAYS];

    if (get_tagdata(cpu, pstype, index, tag_data) != 0) {
        return (-1);
    }
    retired_ways = 0;
    for (i = 0; i < PN_CACHE_NWAYS; i++) {
        if ((tag_data[i] & CH_ECSTATE_MASK) ==
            PN_ECSTATE_NA)
            retired_ways++;
    }
    return (retired_ways);
}
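
/*
 * The two routines below drive line retirement through the mem_cache
 * driver: they fill in a cache_info_t with the cpu id, cache (L2/L3 tag or
 * data), index, way and bit taken from the Lxcache, then issue
 * MEM_CACHE_RETIRE or MEM_CACHE_UNRETIRE on mem_cache_device.  A failed
 * open or ioctl is reported to the caller as B_FALSE.
 */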
boolean_t
cmd_cache_way_retire(fmd_hdl_t *hdl, cmd_cpu_t *cpu, cmd_Lxcache_t *Lxcache)
{
    const char *fltnm;
    cache_info_t cache_info;
    int ret, fd;

    fltnm = cmd_type_to_str(Lxcache->Lxcache_type);
    fd = open(mem_cache_device, O_RDWR);
    if (fd == -1) {
        fmd_hdl_debug(hdl,
            "%s:cpu_id %d open of %s failed\n",
            fltnm, cpu->cpu_cpuid, mem_cache_device);
        return (B_FALSE);
    }
    cache_info.cpu_id = cpu->cpu_cpuid;
    cache_info.way = Lxcache->Lxcache_way;
    cache_info.bit = Lxcache->Lxcache_bit;
    cache_info.index = Lxcache->Lxcache_index;

    switch (Lxcache->Lxcache_type) {
    case CMD_PTR_CPU_L2TAG:
        cache_info.cache = L2_CACHE_TAG;
        break;
    case CMD_PTR_CPU_L2DATA:
        cache_info.cache = L2_CACHE_DATA;
        break;
    case CMD_PTR_CPU_L3TAG:
        cache_info.cache = L3_CACHE_TAG;
        break;
    case CMD_PTR_CPU_L3DATA:
        cache_info.cache = L3_CACHE_DATA;
        break;
    }

    fmd_hdl_debug(hdl,
        "\n%s:cpu %d: Retiring index %d, way %d bit %d\n",
        fltnm, cpu->cpu_cpuid, cache_info.index, cache_info.way,
        (int16_t)cache_info.bit);
    ret = ioctl(fd, MEM_CACHE_RETIRE, &cache_info);
    (void) close(fd);
    if (ret == -1) {
        fmd_hdl_debug(hdl,
            "%s:cpu_id %d MEM_CACHE_RETIRE ioctl failed\n",
            fltnm, cpu->cpu_cpuid);
        return (B_FALSE);
    }

    return (B_TRUE);
}

boolean_t
cmd_cache_way_unretire(fmd_hdl_t *hdl, cmd_cpu_t *cpu, cmd_Lxcache_t *Lxcache)
{
    const char *fltnm;
    cache_info_t cache_info;
    int ret, fd;

    fltnm = cmd_type_to_str(Lxcache->Lxcache_type);
    fd = open(mem_cache_device, O_RDWR);
    if (fd == -1) {
        fmd_hdl_debug(hdl,
            "%s:cpu_id %d open of %s failed\n",
            fltnm, cpu->cpu_cpuid, mem_cache_device);
        return (B_FALSE);
    }
    cache_info.cpu_id = cpu->cpu_cpuid;
    cache_info.way = Lxcache->Lxcache_way;
    cache_info.bit = Lxcache->Lxcache_bit;
    cache_info.index = Lxcache->Lxcache_index;

    switch (Lxcache->Lxcache_type) {
    case CMD_PTR_CPU_L2TAG:
        cache_info.cache = L2_CACHE_TAG;
        break;
    case CMD_PTR_CPU_L2DATA:
        cache_info.cache = L2_CACHE_DATA;
        break;
    case CMD_PTR_CPU_L3TAG:
        cache_info.cache = L3_CACHE_TAG;
        break;
    case CMD_PTR_CPU_L3DATA:
        cache_info.cache = L3_CACHE_DATA;
        break;
    }

    fmd_hdl_debug(hdl,
        "\n%s:cpu %d: Unretiring index %d, way %d bit %d\n",
        fltnm, cpu->cpu_cpuid, cache_info.index, cache_info.way,
        (int16_t)cache_info.bit);
    ret = ioctl(fd, MEM_CACHE_UNRETIRE, &cache_info);
    (void) close(fd);
    if (ret == -1) {
        fmd_hdl_debug(hdl,
            "%s:cpu_id %d MEM_CACHE_UNRETIRE ioctl failed\n",
            fltnm, cpu->cpu_cpuid);
        return (B_FALSE);
    }

    return (B_TRUE);
}

static cmd_Lxcache_t *
cmd_Lxcache_lookup_by_type_index_way_flags(cmd_cpu_t *cpu,
    cmd_ptrsubtype_t type, int32_t index, int8_t way, int32_t flags)
{
    cmd_Lxcache_t *cmd_Lxcache;

    for (cmd_Lxcache = cmd_list_next(&cpu->cpu_Lxcaches);
        cmd_Lxcache != NULL;
        cmd_Lxcache = cmd_list_next(cmd_Lxcache)) {
        if ((cmd_Lxcache->Lxcache_index == index) &&
            (cmd_Lxcache->Lxcache_way == way) &&
            (cmd_Lxcache->Lxcache_type == type) &&
            (cmd_Lxcache->Lxcache_flags & flags))
            return (cmd_Lxcache);
    }
    return (NULL);
}
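
/*
 * A worked example of the way-selection masks computed below, using
 * hypothetical state: suppose ways 0 and 2 of an index are already retired,
 * so bit_array_of_retired_ways is 0x5.  If none of those retirements must
 * be preserved for the fault type at hand, bit_array_of_unavailable_ways is
 * also 0x5 and the returned bit_array_of_available_ways is
 * (0x5 ^ 0xf) & 0xf == 0xa, from which cmd_lowest_way[0xa] selects way 1.
 * When three ways are already retired (cmd_num_of_bits[...] == 3), the lone
 * unretired way is folded into the unavailable set so that the last way of
 * an index is never retired.
 */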
static int8_t
cmd_Lxcache_get_bit_array_of_available_ways(cmd_cpu_t *cpu,
    cmd_ptrsubtype_t type, int32_t index)
{
    uint8_t bit_array_of_unavailable_ways;
    uint8_t bit_array_of_available_ways;
    cmd_ptrsubtype_t match_type;
    cmd_Lxcache_t *cmd_Lxcache;
    uint8_t bit_array_of_retired_ways;

    /*
     * We scan the Lxcache structures for this CPU and collect
     * the following two pieces of information:
     * - bit_array_of_retired_ways
     * - bit_array_of_unavailable_ways
     * If type is Lx_TAG then unavailable_ways will not include ways that
     * were retired due to DATA faults, because these ways can still be
     * re-retired for TAG faults.
     * If 3 ways have been retired then we protect the only remaining
     * unretired way by marking it as unavailable.
     */
    bit_array_of_unavailable_ways = 0;
    bit_array_of_retired_ways = 0;
    switch (type) {
    case CMD_PTR_CPU_L2TAG:
        match_type = CMD_PTR_CPU_L2DATA;
        break;
    case CMD_PTR_CPU_L2DATA:
        match_type = CMD_PTR_CPU_L2TAG;
        break;
    case CMD_PTR_CPU_L3TAG:
        match_type = CMD_PTR_CPU_L3DATA;
        break;
    case CMD_PTR_CPU_L3DATA:
        match_type = CMD_PTR_CPU_L3TAG;
        break;
    }

    for (cmd_Lxcache = cmd_list_next(&cpu->cpu_Lxcaches);
        cmd_Lxcache != NULL;
        cmd_Lxcache = cmd_list_next(cmd_Lxcache)) {
        if ((cmd_Lxcache->Lxcache_index == index) &&
            ((cmd_Lxcache->Lxcache_type == type) ||
            (cmd_Lxcache->Lxcache_type == match_type)) &&
            (cmd_Lxcache->Lxcache_flags &
            (CMD_LxCACHE_F_RETIRED | CMD_LxCACHE_F_RERETIRED))) {
            bit_array_of_retired_ways |=
                (1 << cmd_Lxcache->Lxcache_way);
            /*
             * If we are calling this while handling TAG errors
             * we can re-retire the cachelines retired due to DATA
             * errors.  We will ignore the cachelines that are
             * retired due to DATA faults.
             */
            if ((type == CMD_PTR_CPU_L2TAG) &&
                (cmd_Lxcache->Lxcache_type == CMD_PTR_CPU_L2DATA))
                continue;
            if ((type == CMD_PTR_CPU_L3TAG) &&
                (cmd_Lxcache->Lxcache_type == CMD_PTR_CPU_L3DATA))
                continue;
            bit_array_of_unavailable_ways |=
                (1 << cmd_Lxcache->Lxcache_way);
        }
    }
    if (cmd_num_of_bits[bit_array_of_retired_ways & 0xf] == 3) {
        /*
         * Special case: 3 ways are already retired.
         * The lone unretired way is set as 1, the rest are set as 0.
         * We now OR this with bit_array_of_unavailable_ways
         * so that this unretired way will not be allocated.
         */
        bit_array_of_retired_ways ^= 0xf;
        bit_array_of_retired_ways &= 0xf;
        bit_array_of_unavailable_ways |= bit_array_of_retired_ways;
    }
    bit_array_of_available_ways =
        ((bit_array_of_unavailable_ways ^ 0xf) & 0xf);
    return (bit_array_of_available_ways);
}

/*
 * Look for a way next to the specified way that is
 * not in a retired state.
 * We stop when way 3 is reached.
 */
int8_t
cmd_Lxcache_get_next_retirable_way(cmd_cpu_t *cpu,
    int32_t index, cmd_ptrsubtype_t pstype, int8_t specified_way)
{
    uint8_t bit_array_of_ways;
    int8_t mask;

    if (specified_way == 3)
        return (-1);
    bit_array_of_ways = cmd_Lxcache_get_bit_array_of_available_ways(
        cpu, pstype, index);
    if (specified_way == 2)
        mask = 0x8;
    else if (specified_way == 1)
        mask = 0xc;
    else
        mask = 0xe;
    return (cmd_lowest_way[bit_array_of_ways & mask]);
}

int8_t
cmd_Lxcache_get_lowest_retirable_way(cmd_cpu_t *cpu,
    int32_t index, cmd_ptrsubtype_t pstype)
{
    uint8_t bit_array_of_ways;

    bit_array_of_ways = cmd_Lxcache_get_bit_array_of_available_ways(
        cpu, pstype, index);
    return (cmd_lowest_way[bit_array_of_ways]);
}

cmd_Lxcache_t *
cmd_Lxcache_lookup_by_type_index_way_reason(cmd_cpu_t *cpu,
    cmd_ptrsubtype_t pstype, int32_t index, int8_t way, int32_t reason)
{
    cmd_Lxcache_t *cmd_Lxcache;

    for (cmd_Lxcache = cmd_list_next(&cpu->cpu_Lxcaches);
        cmd_Lxcache != NULL;
        cmd_Lxcache = cmd_list_next(cmd_Lxcache)) {
        if ((cmd_Lxcache->Lxcache_index == (uint32_t)index) &&
            (cmd_Lxcache->Lxcache_way == (uint32_t)way) &&
            (cmd_Lxcache->Lxcache_reason & reason) &&
            (cmd_Lxcache->Lxcache_type == pstype)) {
            return (cmd_Lxcache);
        }
    }
    return (NULL);
}

cmd_Lxcache_t *
cmd_Lxcache_lookup_by_type_index_bit_reason(cmd_cpu_t *cpu,
    cmd_ptrsubtype_t pstype, int32_t index, int16_t bit, int32_t reason)
{
    cmd_Lxcache_t *cmd_Lxcache;

    for (cmd_Lxcache = cmd_list_next(&cpu->cpu_Lxcaches);
        cmd_Lxcache != NULL;
        cmd_Lxcache = cmd_list_next(cmd_Lxcache)) {
        if ((cmd_Lxcache->Lxcache_index == (uint32_t)index) &&
            (cmd_Lxcache->Lxcache_bit == (uint16_t)bit) &&
            (cmd_Lxcache->Lxcache_reason & reason) &&
            (cmd_Lxcache->Lxcache_type == pstype)) {
            return (cmd_Lxcache);
        }
    }
    return (NULL);
}

void
cmd_Lxcache_destroy_anonymous_serd_engines(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
    cmd_ptrsubtype_t type, int32_t index, int16_t bit)
{
    cmd_Lxcache_t *cmd_Lxcache;
    cmd_case_t *cc;

    for (cmd_Lxcache = cmd_list_next(&cpu->cpu_Lxcaches);
        cmd_Lxcache != NULL;
        cmd_Lxcache = cmd_list_next(cmd_Lxcache)) {
        if ((cmd_Lxcache->Lxcache_type == type) &&
            (cmd_Lxcache->Lxcache_index == (uint32_t)index) &&
            (cmd_Lxcache->Lxcache_bit == (uint16_t)bit) &&
            (cmd_Lxcache->Lxcache_way == (uint32_t)CMD_ANON_WAY)) {
            cc = &cmd_Lxcache->Lxcache_case;
            if (cc == NULL)
                continue;
            if (cc->cc_serdnm != NULL) {
                if (fmd_serd_exists(hdl, cc->cc_serdnm)) {
                    fmd_hdl_debug(hdl,
                        "\n%s:cpu_id %d destroying SERD"
                        " engine %s\n",
                        cmd_type_to_str(type),
                        cpu->cpu_cpuid, cc->cc_serdnm);
                    fmd_serd_destroy(hdl, cc->cc_serdnm);
                }
                fmd_hdl_strfree(hdl, cc->cc_serdnm);
                cc->cc_serdnm = NULL;
            }
        }
    }
}
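
/*
 * cmd_fmri_nvl2str() below flattens a cache-line ASRU nvlist into a
 * "cpu:///..." string whose members (cpu id, serial id, cache index, way,
 * bit and type) follow the format string used in the function.  This is the
 * string cmd_Lxcache_fault() saves in Lxcache_retired_fmri and that
 * cmd_repair_fmri() later hands to fmd_repair_asru() when a retirement is
 * undone.
 */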
ssize_t
cmd_fmri_nvl2str(fmd_hdl_t *hdl, nvlist_t *nvl, char *buf, size_t buflen)
{
    uint8_t type;
    uint32_t cpuid, way;
    uint32_t index;
    uint16_t bit;
    char *serstr = NULL;
    char missing_list[128];

    missing_list[0] = 0;
    if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_ID, &cpuid) != 0)
        (void) strcat(missing_list, FM_FMRI_CPU_ID);
    if (nvlist_lookup_string(nvl, FM_FMRI_CPU_SERIAL_ID, &serstr) != 0)
        (void) strcat(missing_list, FM_FMRI_CPU_SERIAL_ID);
    if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_CACHE_INDEX, &index) != 0)
        (void) strcat(missing_list, FM_FMRI_CPU_CACHE_INDEX);
    if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_CACHE_WAY, &way) != 0)
        (void) strcat(missing_list, FM_FMRI_CPU_CACHE_WAY);
    if (nvlist_lookup_uint16(nvl, FM_FMRI_CPU_CACHE_BIT, &bit) != 0)
        (void) strcat(missing_list, FM_FMRI_CPU_CACHE_BIT);
    if (nvlist_lookup_uint8(nvl, FM_FMRI_CPU_CACHE_TYPE, &type) != 0)
        (void) strcat(missing_list, FM_FMRI_CPU_CACHE_TYPE);

    if (strlen(missing_list) != 0) {
        fmd_hdl_debug(hdl,
            "\ncmd_fmri_nvl2str: missing %s in fmri\n",
            missing_list);
        return (-1);
    }

    return (snprintf(buf, buflen,
        "cpu:///%s=%u/%s=%s/%s=%u/%s=%u/%s=%d/%s=%d",
        FM_FMRI_CPU_ID, cpuid,
        FM_FMRI_CPU_SERIAL_ID, serstr,
        FM_FMRI_CPU_CACHE_INDEX, index,
        FM_FMRI_CPU_CACHE_WAY, way,
        FM_FMRI_CPU_CACHE_BIT, bit,
        FM_FMRI_CPU_CACHE_TYPE, type));
}

boolean_t
cmd_create_case_for_Lxcache(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
    cmd_Lxcache_t *cmd_Lxcache)
{
    const char *fltnm;
    const char *uuid;

    if (cmd_Lxcache->Lxcache_case.cc_cp != NULL)
        return (B_TRUE);
    cmd_Lxcache->Lxcache_case.cc_cp = cmd_case_create(hdl,
        &cmd_Lxcache->Lxcache_header, CMD_PTR_LxCACHE_CASE,
        &uuid);
    fltnm = cmd_type_to_str(cmd_Lxcache->Lxcache_type);
    if (cmd_Lxcache->Lxcache_case.cc_cp == NULL) {
        fmd_hdl_debug(hdl,
            "\n%s:cpu_id %d:Failed to create a case for"
            " index %d way %d bit %d\n",
            fltnm, cpu->cpu_cpuid,
            cmd_Lxcache->Lxcache_index,
            cmd_Lxcache->Lxcache_way, cmd_Lxcache->Lxcache_bit);
        return (B_FALSE);
    }
    fmd_hdl_debug(hdl,
        "\n%s:cpu_id %d: New case %s created.\n",
        fltnm, cpu->cpu_cpuid, uuid);
    if (cmd_Lxcache->Lxcache_ep)
        fmd_case_add_ereport(hdl, cmd_Lxcache->Lxcache_case.cc_cp,
            cmd_Lxcache->Lxcache_ep);
    return (B_TRUE);
}

static int
cmd_repair_fmri(fmd_hdl_t *hdl, char *buf)
{
    int err;

    err = fmd_repair_asru(hdl, buf);
    if (err) {
        fmd_hdl_debug(hdl,
            "Failed to repair %s err = %d\n", buf, err);
    }
    return (err);
}

boolean_t
cmd_Lxcache_unretire(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
    cmd_Lxcache_t *unretire_this_Lxcache, const char *fltnm)
{
    cmd_ptrsubtype_t data_type;
    cmd_Lxcache_t *previously_retired_Lxcache;
    int found_reretired_cacheline = 0;
    int certainty;

    /*
     * If we are unretiring a cacheline retired due to a suspected TAG
     * fault, then we must first check if we are using a cacheline
     * that was retired earlier for a DATA fault.
     * If so we will not unretire the cacheline.
     * We will change the flags to reflect the current condition.
     * We will return success, though.
     */
    if (IS_TAG(unretire_this_Lxcache->Lxcache_type)) {
        if (unretire_this_Lxcache->Lxcache_type == CMD_PTR_CPU_L2TAG)
            data_type = CMD_PTR_CPU_L2DATA;
        if (unretire_this_Lxcache->Lxcache_type == CMD_PTR_CPU_L3TAG)
            data_type = CMD_PTR_CPU_L3DATA;
        fmd_hdl_debug(hdl,
            "\n%s:cpuid %d checking if there is a %s"
            " cacheline re-retired at this index %d and way %d\n",
            fltnm, cpu->cpu_cpuid, cmd_type_to_str(data_type),
            unretire_this_Lxcache->Lxcache_index,
            unretire_this_Lxcache->Lxcache_way);
        previously_retired_Lxcache =
            cmd_Lxcache_lookup_by_type_index_way_flags(
            cpu, data_type, unretire_this_Lxcache->Lxcache_index,
            unretire_this_Lxcache->Lxcache_way,
            CMD_LxCACHE_F_RERETIRED);
        if (previously_retired_Lxcache) {
            fmd_hdl_debug(hdl,
                "\n%s:cpuid %d Found a %s cacheline re-retired at"
                " this index %d and way %d. Will mark this"
                " RETIRED\n",
                fltnm, cpu->cpu_cpuid, cmd_type_to_str(data_type),
                unretire_this_Lxcache->Lxcache_index,
                unretire_this_Lxcache->Lxcache_way);
            /*
             * We call cmd_Lxcache_fault to inform fmd
             * about the suspect fmri. The cacheline is already
             * retired, but the existing suspect fmri is for the
             * TAG fault, which will be removed in this routine.
             */
            if (previously_retired_Lxcache->Lxcache_reason
                == CMD_LXCONVICTED)
                certainty = HUNDRED_PERCENT;
            else
                certainty = SUSPECT_PERCENT;
            cmd_Lxcache_fault(hdl, cpu, previously_retired_Lxcache,
                fltnm, cpu->cpu_fru_nvl, certainty);
            previously_retired_Lxcache->Lxcache_flags =
                CMD_LxCACHE_F_RETIRED;
            /*
             * Update persistent storage.
             */
            cmd_Lxcache_write(hdl, previously_retired_Lxcache);
            found_reretired_cacheline = 1;
        }
    } else {
        /*
         * We have been called to unretire a cacheline retired
         * earlier due to DATA errors.
         * If this cacheline is marked RERETIRED then it means that
         * the cacheline has been retired due to TAG errors and
         * we should not be unretiring the cacheline.
         */
        if (unretire_this_Lxcache->Lxcache_flags &
            CMD_LxCACHE_F_RERETIRED) {
            fmd_hdl_debug(hdl,
                "\n%s:cpuid %d The cacheline at index %d and"
                " way %d which we are attempting to unretire"
                " is in RERETIRED state. Therefore we will not"
                " unretire it but will mark it as RETIRED.\n",
                fltnm, cpu->cpu_cpuid,
                unretire_this_Lxcache->Lxcache_index,
                unretire_this_Lxcache->Lxcache_way);
            found_reretired_cacheline = 1;
        }
    }
    /*
     * If we did not find a RERETIRED cacheline above,
     * unretire the cacheline.
     */
    if (!found_reretired_cacheline) {
        if (cmd_cache_way_unretire(hdl, cpu, unretire_this_Lxcache)
            == B_FALSE)
            return (B_FALSE);
    }
    unretire_this_Lxcache->Lxcache_flags = CMD_LxCACHE_F_UNRETIRED;
    /*
     * We have exonerated the cacheline. We need to inform fmd
     * that we have repaired the suspect fmri that we retired earlier.
     * The cpumem agent will not unretire the cacheline in response to
     * the list.repair events it receives.
     */
    if (unretire_this_Lxcache->Lxcache_retired_fmri[0] != 0) {
        fmd_hdl_debug(hdl,
            "\n%s:cpuid %d Repairing the retired fmri %s",
            fltnm, cpu->cpu_cpuid,
            unretire_this_Lxcache->Lxcache_retired_fmri);
        if (cmd_repair_fmri(hdl,
            unretire_this_Lxcache->Lxcache_retired_fmri) != 0) {
            fmd_hdl_debug(hdl,
                "\n%s:cpuid %d Failed to repair retired fmri.",
                fltnm, cpu->cpu_cpuid);
            /*
             * We need to retire the cacheline that we just
             * unretired.
             */
            if (cmd_cache_way_retire(hdl, cpu,
                unretire_this_Lxcache) == B_FALSE) {
                /*
                 * A hopeless situation.
                 * We cannot maintain consistency of the
                 * cacheline state between fmd and the DE.
                 * Aborting the DE.
                 */
                fmd_hdl_abort(hdl,
                    "\n%s:cpuid %d We are unable to repair"
                    " the fmri we just unretired and are"
                    " unable to restore the DE and fmd to"
                    " a sane state.\n",
                    fltnm, cpu->cpu_cpuid);
            }
            return (B_FALSE);
        } else {
            unretire_this_Lxcache->Lxcache_retired_fmri[0] = 0;
        }
    }
    return (B_TRUE);
}

boolean_t
cmd_Lxcache_retire(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
    cmd_Lxcache_t *retire_this_Lxcache, const char *fltnm, uint_t cert)
{
    cmd_Lxcache_t *previously_retired_Lxcache;
    cmd_ptrsubtype_t data_type;
    const char *uuid;
    char suspect_list[128];

    fmd_hdl_debug(hdl,
        "\n%s:cpu_id %d: cmd_Lxcache_retire called for index %d"
        " way %d bit %d\n",
        fltnm, cpu->cpu_cpuid, retire_this_Lxcache->Lxcache_index,
        retire_this_Lxcache->Lxcache_way, retire_this_Lxcache->Lxcache_bit);
    if (fmd_case_solved(hdl, retire_this_Lxcache->Lxcache_case.cc_cp)) {
        /*
         * Case solved implies that the cacheline is already
         * retired as SUSPECT_0_TAG and we are here to retire it
         * as SUSPECT_1_TAG.
         * We will first repair the retired cacheline
         * so that it does not get retired during replay for the
         * wrong reason.
         * If we are able to repair the retired cacheline we close the
         * case and open a new case for it.
         */
        if (retire_this_Lxcache->Lxcache_reason !=
            CMD_LXSUSPECT_0_TAG) {
            fmd_hdl_debug(hdl,
                "\n%s:cpu_id %d: Unexpected condition encountered."
                " Expected the reason for retirement to be"
                " SUSPECT_0_TAG, however found the reason"
                " to be %s\n",
                fltnm, cpu->cpu_cpuid,
                cmd_reason_to_str(
                retire_this_Lxcache->Lxcache_reason));
            return (B_FALSE);
        }
        fmd_hdl_debug(hdl,
            "\n%s:cpu_id %d: We are re-retiring SUSPECT_0_TAG as"
            " SUSPECT_1_TAG index %d way %d bit %d\n",
            fltnm, cpu->cpu_cpuid,
            retire_this_Lxcache->Lxcache_index,
            retire_this_Lxcache->Lxcache_way,
            retire_this_Lxcache->Lxcache_bit);
        fmd_hdl_debug(hdl,
            "\n%s:cpu_id %d: The existing case for this Lxcache has"
            " already been solved. We will first repair the suspect"
            " cacheline and if we are successful then close this case"
            " and open a new case.\n",
            fltnm, cpu->cpu_cpuid);
        /*
         * Repair the retired cacheline.
         */
        if (retire_this_Lxcache->Lxcache_retired_fmri[0] != 0) {
            fmd_hdl_debug(hdl,
                "\n%s:cpuid %d Repairing the retired suspect"
                " cacheline %s\n",
                fltnm, cpu->cpu_cpuid,
                retire_this_Lxcache->Lxcache_retired_fmri);
            if (cmd_repair_fmri(hdl,
                retire_this_Lxcache->Lxcache_retired_fmri) != 0) {
                fmd_hdl_debug(hdl,
                    "\n%s:cpuid %d Failed to repair the"
                    " retired fmri.",
                    fltnm, cpu->cpu_cpuid);
                return (B_FALSE);
            } else {
                retire_this_Lxcache->Lxcache_retired_fmri[0] =
                    0;
            }
        }
        uuid = fmd_case_uuid(hdl,
            retire_this_Lxcache->Lxcache_case.cc_cp);
        fmd_hdl_debug(hdl,
            "\n%s:cpuid %d: Closing the case %s\n",
            fltnm, cpu->cpu_cpuid, uuid);
        cmd_case_fini(hdl, retire_this_Lxcache->Lxcache_case.cc_cp,
            FMD_B_TRUE);
        retire_this_Lxcache->Lxcache_case.cc_cp = NULL;
        if (cmd_create_case_for_Lxcache(hdl, cpu, retire_this_Lxcache)
            == B_FALSE)
            return (B_FALSE);
    } else {
        /*
         * Not a SUSPECT_0_TAG.
         * We should be entering this path if the cacheline is
         * transitioning from ACTIVE/UNRETIRED to RETIRED state.
         * If the cacheline state is not as expected we print a debug
         * message and return failure.
         */
        if ((retire_this_Lxcache->Lxcache_flags !=
            CMD_LxCACHE_F_ACTIVE) &&
            (retire_this_Lxcache->Lxcache_flags
            != CMD_LxCACHE_F_UNRETIRED)) {
            /*
             * Unexpected condition.
             */
            fmd_hdl_debug(hdl,
                "\n%s:cpu_id %d:Unexpected state %s for the"
                " cacheline at index %d way %d encountered.\n",
                fltnm, cpu->cpu_cpuid,
                cmd_flags_to_str(
                retire_this_Lxcache->Lxcache_flags),
                retire_this_Lxcache->Lxcache_index,
                retire_this_Lxcache->Lxcache_way);
            return (B_FALSE);
        }
    }
    suspect_list[0] = 0;
    (void) cmd_fmri_nvl2str(hdl, retire_this_Lxcache->Lxcache_asru.fmri_nvl,
        suspect_list, sizeof (suspect_list));
    fmd_hdl_debug(hdl,
        "\n%s:cpu_id %d:current suspect list is %s\n",
        fltnm, cpu->cpu_cpuid, suspect_list);
    cmd_Lxcache_fault(hdl, cpu, retire_this_Lxcache, fltnm,
        cpu->cpu_fru_nvl, cert);
    retire_this_Lxcache->Lxcache_flags = CMD_LxCACHE_F_RETIRED;
    if (IS_TAG(retire_this_Lxcache->Lxcache_type)) {
        /*
         * If the cacheline we just retired was retired earlier
         * due to DATA faults we mark the Lxcache
         * corresponding to DATA as RERETIRED.
         */
        if (retire_this_Lxcache->Lxcache_type == CMD_PTR_CPU_L2TAG)
            data_type = CMD_PTR_CPU_L2DATA;
        if (retire_this_Lxcache->Lxcache_type == CMD_PTR_CPU_L3TAG)
            data_type = CMD_PTR_CPU_L3DATA;
        fmd_hdl_debug(hdl,
            "\n%s:cpuid %d checking if there is a %s"
            " cacheline retired at this index %d way %d\n",
            fltnm, cpu->cpu_cpuid,
            cmd_type_to_str(data_type),
            retire_this_Lxcache->Lxcache_index,
            retire_this_Lxcache->Lxcache_way);
        previously_retired_Lxcache =
            cmd_Lxcache_lookup_by_type_index_way_flags(cpu,
            data_type, retire_this_Lxcache->Lxcache_index,
            retire_this_Lxcache->Lxcache_way, CMD_LxCACHE_F_RETIRED);
        if (previously_retired_Lxcache) {
            fmd_hdl_debug(hdl,
                "\n%s:cpu_id %d: Found index %d way %d"
                " retired earlier. Will mark this Lxcache"
                " as RERETIRED.\n",
                fltnm, cpu->cpu_cpuid,
                retire_this_Lxcache->Lxcache_index,
                retire_this_Lxcache->Lxcache_way);
            /*
             * First repair the retired cacheline and if successful
             * close the existing case and create a new case.
             */

            /*
             * This cacheline has already been retired for a
             * TAG fault.
             * Repair the previously retired DATA fault cacheline so
             * that it does not get retired by fmd during replay.
             */
            if (previously_retired_Lxcache->Lxcache_retired_fmri[0]
                != 0) {
                fmd_hdl_debug(hdl,
                    "\n%s:cpuid %d Repairing the cacheline"
                    " retired due to data errors. %s\n",
                    fltnm, cpu->cpu_cpuid,
                    previously_retired_Lxcache->
                    Lxcache_retired_fmri);
                if (cmd_repair_fmri(hdl,
                    previously_retired_Lxcache->
                    Lxcache_retired_fmri) != 0) {
                    fmd_hdl_debug(hdl,
                        "\n%s:cpuid %d Failed to repair the"
                        " retired fmri.",
                        fltnm, cpu->cpu_cpuid);
                    return (B_FALSE);
                } else {
                    previously_retired_Lxcache->
                        Lxcache_retired_fmri[0] = 0;
                }
            }
            cmd_case_fini(hdl,
                previously_retired_Lxcache->Lxcache_case.cc_cp,
                FMD_B_TRUE);
            previously_retired_Lxcache->Lxcache_case.cc_cp = NULL;
            previously_retired_Lxcache->Lxcache_flags =
                CMD_LxCACHE_F_RERETIRED;
            /*
             * Update persistent storage.
             */
            cmd_Lxcache_write(hdl, previously_retired_Lxcache);
            /*
             * Create a new case so that this Lxcache structure
             * gets restored on replay.
             */
            if (cmd_create_case_for_Lxcache(hdl, cpu,
                previously_retired_Lxcache) == B_FALSE)
                return (B_FALSE);
        }
    }
    cmd_retire_cpu_if_limits_exceeded(hdl, cpu,
        retire_this_Lxcache->Lxcache_type, fltnm);
    return (B_TRUE);
}