1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <cma.h> 28 29 #include <unistd.h> 30 #include <fcntl.h> 31 #include <strings.h> 32 #include <errno.h> 33 #include <time.h> 34 #include <fm/fmd_api.h> 35 #include <sys/fm/protocol.h> 36 #include <sys/systeminfo.h> 37 #include <sys/utsname.h> 38 39 #ifdef sun4v 40 #include <sys/fm/ldom.h> 41 42 static fmd_hdl_t *init_hdl; 43 ldom_hdl_t *cma_lhp; 44 #endif 45 46 #ifdef i386 47 boolean_t cma_is_native; 48 #endif 49 50 extern const char *fmd_fmri_get_platform(); 51 52 cma_t cma; 53 54 cma_stats_t cma_stats = { 55 { "cpu_flts", FMD_TYPE_UINT64, "cpu faults resolved" }, 56 { "cpu_repairs", FMD_TYPE_UINT64, "cpu faults repaired" }, 57 { "cpu_fails", FMD_TYPE_UINT64, "cpu faults unresolveable" }, 58 { "cpu_blfails", FMD_TYPE_UINT64, "failed cpu blacklists" }, 59 { "cpu_supp", FMD_TYPE_UINT64, "cpu offlines suppressed" }, 60 { "cpu_blsupp", FMD_TYPE_UINT64, "cpu blacklists suppressed" }, 61 { "page_flts", FMD_TYPE_UINT64, "page faults resolved" }, 62 { "page_repairs", FMD_TYPE_UINT64, "page faults repaired" }, 63 { "page_fails", FMD_TYPE_UINT64, "page faults unresolveable" }, 64 { "page_supp", FMD_TYPE_UINT64, "page retires suppressed" }, 65 { "page_nonent", FMD_TYPE_UINT64, "retires for non-existent fmris" }, 66 { "bad_flts", FMD_TYPE_UINT64, "invalid fault events received" }, 67 { "nop_flts", FMD_TYPE_UINT64, "inapplicable fault events received" }, 68 { "auto_flts", FMD_TYPE_UINT64, "auto-close faults received" } 69 }; 70 71 typedef struct cma_subscriber { 72 const char *subr_class; 73 const char *subr_sname; 74 uint_t subr_svers; 75 int (*subr_func)(fmd_hdl_t *, nvlist_t *, nvlist_t *, const char *, 76 boolean_t); 77 } cma_subscriber_t; 78 79 static const cma_subscriber_t cma_subrs[] = { 80 #if defined(i386) 81 /* 82 * On x86, the ASRUs are expected to be in hc scheme. When 83 * cpumem-retire wants to retire a cpu or mem page, it calls the 84 * methods registered in the topo node to do that. The topo 85 * enumerator, which necessarily knows all the config info that 86 * we'd ever need in deciding what/how to retire etc. This takes 87 * away much of that complexity from the agent into the entity 88 * that knows all config/topo information. 89 */ 90 { "fault.memory.page", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 91 cma_page_retire }, 92 { "fault.memory.page_sb", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 93 cma_page_retire }, 94 { "fault.memory.page_ck", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 95 cma_page_retire }, 96 { "fault.memory.page_ue", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 97 cma_page_retire }, 98 { "fault.memory.generic-x86.page_ce", FM_FMRI_SCHEME_HC, 99 FM_HC_SCHEME_VERSION, cma_page_retire }, 100 { "fault.memory.generic-x86.page_ue", FM_FMRI_SCHEME_HC, 101 FM_HC_SCHEME_VERSION, cma_page_retire }, 102 { "fault.memory.intel.page_ce", FM_FMRI_SCHEME_HC, 103 FM_HC_SCHEME_VERSION, cma_page_retire }, 104 { "fault.memory.intel.page_ue", FM_FMRI_SCHEME_HC, 105 FM_HC_SCHEME_VERSION, cma_page_retire }, 106 { "fault.memory.dimm", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 107 NULL }, 108 { "fault.memory.dimm_sb", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 109 NULL }, 110 { "fault.memory.dimm_ck", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 111 NULL }, 112 { "fault.memory.dimm_ue", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 113 NULL }, 114 { "fault.memory.generic-x86.dimm_ce", FM_FMRI_SCHEME_HC, 115 FM_HC_SCHEME_VERSION, NULL }, 116 { "fault.memory.generic-x86.dimm_ue", FM_FMRI_SCHEME_HC, 117 FM_HC_SCHEME_VERSION, NULL }, 118 { "fault.memory.intel.dimm_ce", FM_FMRI_SCHEME_HC, 119 FM_HC_SCHEME_VERSION, NULL }, 120 { "fault.memory.intel.dimm_ue", FM_FMRI_SCHEME_HC, 121 FM_HC_SCHEME_VERSION, NULL }, 122 { "fault.memory.intel.fbd.*", FM_FMRI_SCHEME_HC, 123 FM_HC_SCHEME_VERSION, NULL }, 124 { "fault.memory.dimm_testfail", FM_FMRI_SCHEME_HC, 125 FM_HC_SCHEME_VERSION, NULL }, 126 { "fault.memory.bank", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 127 NULL }, 128 { "fault.memory.datapath", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 129 NULL }, 130 { "fault.cpu.intel.quickpath.mem_scrubbing", FM_FMRI_SCHEME_HC, 131 FM_HC_SCHEME_VERSION, cma_page_retire }, 132 { "fault.cpu.intel.quickpath.*", FM_FMRI_SCHEME_HC, 133 FM_HC_SCHEME_VERSION, NULL }, 134 { "fault.cpu.generic-x86.mc", FM_FMRI_SCHEME_HC, 135 FM_HC_SCHEME_VERSION, NULL }, 136 { "fault.cpu.intel.dma", FM_FMRI_SCHEME_HC, 137 FM_HC_SCHEME_VERSION, NULL }, 138 { "fault.cpu.intel.dma", FM_FMRI_SCHEME_CPU, 139 FM_CPU_SCHEME_VERSION, NULL }, 140 141 /* 142 * The ASRU for cpu faults are in cpu scheme on native and in hc 143 * scheme on xpv. So each cpu fault class needs to be listed twice. 144 */ 145 146 /* 147 * The following faults do NOT retire a cpu thread, 148 * and therefore must be intercepted before 149 * the default "fault.cpu.*" dispatch to cma_cpu_hc_retire. 150 */ 151 { "fault.cpu.amd.dramchannel", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 152 NULL }, 153 { "fault.cpu.amd.dramchannel", FM_FMRI_SCHEME_CPU, 154 FM_CPU_SCHEME_VERSION, NULL }, 155 { "fault.cpu.generic-x86.bus_interconnect_memory", FM_FMRI_SCHEME_HC, 156 FM_HC_SCHEME_VERSION, NULL }, 157 { "fault.cpu.generic-x86.bus_interconnect_memory", FM_FMRI_SCHEME_CPU, 158 FM_CPU_SCHEME_VERSION, NULL }, 159 { "fault.cpu.generic-x86.bus_interconnect_io", FM_FMRI_SCHEME_HC, 160 FM_HC_SCHEME_VERSION, NULL }, 161 { "fault.cpu.generic-x86.bus_interconnect_io", FM_FMRI_SCHEME_CPU, 162 FM_CPU_SCHEME_VERSION, NULL }, 163 { "fault.cpu.generic-x86.bus_interconnect", FM_FMRI_SCHEME_HC, 164 FM_HC_SCHEME_VERSION, NULL }, 165 { "fault.cpu.generic-x86.bus_interconnect", FM_FMRI_SCHEME_CPU, 166 FM_CPU_SCHEME_VERSION, NULL }, 167 { "fault.cpu.intel.bus_interconnect_memory", FM_FMRI_SCHEME_HC, 168 FM_HC_SCHEME_VERSION, NULL }, 169 { "fault.cpu.intel.bus_interconnect_memory", FM_FMRI_SCHEME_CPU, 170 FM_CPU_SCHEME_VERSION, NULL }, 171 { "fault.cpu.intel.bus_interconnect_io", FM_FMRI_SCHEME_HC, 172 FM_HC_SCHEME_VERSION, NULL }, 173 { "fault.cpu.intel.bus_interconnect_io", FM_FMRI_SCHEME_CPU, 174 FM_CPU_SCHEME_VERSION, NULL }, 175 { "fault.cpu.intel.bus_interconnect", FM_FMRI_SCHEME_HC, 176 FM_HC_SCHEME_VERSION, NULL }, 177 { "fault.cpu.intel.bus_interconnect", FM_FMRI_SCHEME_CPU, 178 FM_CPU_SCHEME_VERSION, NULL }, 179 { "fault.cpu.intel.nb.*", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 180 NULL }, 181 { "fault.cpu.intel.nb.*", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 182 NULL }, 183 { "fault.cpu.intel.dma", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 184 NULL }, 185 { "fault.cpu.intel.dma", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 186 NULL }, 187 { "fault.cpu.*", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 188 cma_cpu_hc_retire }, 189 { "fault.cpu.*", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 190 cma_cpu_hc_retire }, 191 #elif defined(sun4v) 192 /* 193 * The following are PI sun4v faults 194 */ 195 { "fault.memory.memlink", FM_FMRI_SCHEME_HC, 196 FM_HC_SCHEME_VERSION, NULL }, 197 { "fault.memory.memlink-uc", FM_FMRI_SCHEME_HC, 198 FM_HC_SCHEME_VERSION, NULL }, 199 { "fault.memory.memlink-failover", FM_FMRI_SCHEME_HC, 200 FM_HC_SCHEME_VERSION, NULL }, 201 { "fault.memory.dimm-ue-imminent", FM_FMRI_SCHEME_HC, 202 FM_HC_SCHEME_VERSION, NULL }, 203 { "fault.memory.dram-ue-imminent", FM_FMRI_SCHEME_HC, 204 FM_HC_SCHEME_VERSION, NULL }, 205 { "fault.memory.dimm-page-retires-excessive", FM_FMRI_SCHEME_HC, 206 FM_HC_SCHEME_VERSION, NULL }, 207 { "fault.memory.page", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 208 cma_page_retire }, 209 { "fault.memory.dimm", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 210 NULL }, 211 { "fault.memory.dimm_sb", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 212 NULL }, 213 { "fault.memory.dimm_ck", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 214 NULL }, 215 { "fault.memory.dimm_ue", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 216 NULL }, 217 { "fault.memory.dimm-page-retires-excessive", FM_FMRI_SCHEME_MEM, 218 FM_MEM_SCHEME_VERSION, NULL }, 219 { "fault.memory.dimm-ue-imminent", FM_FMRI_SCHEME_MEM, 220 FM_MEM_SCHEME_VERSION, NULL }, 221 { "fault.memory.dram-ue-imminent", FM_FMRI_SCHEME_MEM, 222 FM_MEM_SCHEME_VERSION, NULL }, 223 { "fault.memory.bank", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 224 NULL }, 225 { "fault.memory.datapath", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 226 NULL }, 227 { "fault.memory.link-c", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 228 NULL }, 229 { "fault.memory.link-u", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 230 NULL }, 231 { "fault.memory.link-f", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 232 NULL }, 233 { "fault.memory.link-c", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 234 NULL }, 235 { "fault.memory.link-u", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 236 NULL }, 237 { "fault.memory.link-f", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 238 NULL }, 239 240 /* 241 * The following ultraSPARC-T1/T2 faults do NOT retire a cpu thread, 242 * and therefore must be intercepted before 243 * the default "fault.cpu.*" dispatch to cma_cpu_hc_retire. 244 */ 245 { "fault.cpu.*.l2cachedata", FM_FMRI_SCHEME_CPU, 246 FM_CPU_SCHEME_VERSION, NULL }, 247 { "fault.cpu.*.l2cachetag", FM_FMRI_SCHEME_CPU, 248 FM_CPU_SCHEME_VERSION, NULL }, 249 { "fault.cpu.*.l2cachectl", FM_FMRI_SCHEME_CPU, 250 FM_CPU_SCHEME_VERSION, NULL }, 251 { "fault.cpu.*.l2data-c", FM_FMRI_SCHEME_CPU, 252 FM_CPU_SCHEME_VERSION, NULL }, 253 { "fault.cpu.*.l2data-u", FM_FMRI_SCHEME_CPU, 254 FM_CPU_SCHEME_VERSION, NULL }, 255 { "fault.cpu.*.mau", FM_FMRI_SCHEME_CPU, 256 FM_CPU_SCHEME_VERSION, NULL }, 257 { "fault.cpu.*.lfu-u", FM_FMRI_SCHEME_CPU, 258 FM_CPU_SCHEME_VERSION, NULL }, 259 { "fault.cpu.*.lfu-f", FM_FMRI_SCHEME_CPU, 260 FM_CPU_SCHEME_VERSION, NULL }, 261 { "fault.cpu.*.lfu-p", FM_FMRI_SCHEME_CPU, 262 FM_CPU_SCHEME_VERSION, NULL }, 263 { "fault.cpu.ultraSPARC-T1.freg", FM_FMRI_SCHEME_CPU, 264 FM_CPU_SCHEME_VERSION, NULL }, 265 { "fault.cpu.ultraSPARC-T1.l2cachedata", FM_FMRI_SCHEME_CPU, 266 FM_CPU_SCHEME_VERSION, NULL }, 267 { "fault.cpu.ultraSPARC-T1.l2cachetag", FM_FMRI_SCHEME_CPU, 268 FM_CPU_SCHEME_VERSION, NULL }, 269 { "fault.cpu.ultraSPARC-T1.l2cachectl", FM_FMRI_SCHEME_CPU, 270 FM_CPU_SCHEME_VERSION, NULL }, 271 { "fault.cpu.ultraSPARC-T1.mau", FM_FMRI_SCHEME_CPU, 272 FM_CPU_SCHEME_VERSION, NULL }, 273 { "fault.cpu.ultraSPARC-T2plus.chip", FM_FMRI_SCHEME_HC, 274 FM_HC_SCHEME_VERSION, NULL }, 275 { "fault.cpu.*", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 276 cma_cpu_hc_retire }, 277 { "fault.cpu.*", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 278 cma_cpu_hc_retire }, 279 #elif defined(opl) 280 { "fault.memory.page", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 281 cma_page_retire }, 282 { "fault.memory.dimm", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 283 NULL }, 284 { "fault.memory.dimm-page-retires-excessive", FM_FMRI_SCHEME_MEM, 285 FM_MEM_SCHEME_VERSION, NULL }, 286 { "fault.memory.dimm-ue-imminent", FM_FMRI_SCHEME_MEM, 287 FM_MEM_SCHEME_VERSION, NULL }, 288 { "fault.memory.dram-ue-imminent", FM_FMRI_SCHEME_MEM, 289 FM_MEM_SCHEME_VERSION, NULL }, 290 { "fault.memory.bank", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 291 NULL }, 292 { "fault.cpu.SPARC64-VI.*", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 293 cma_cpu_cpu_retire }, 294 { "fault.cpu.SPARC64-VII.*", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 295 cma_cpu_cpu_retire }, 296 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VI.core.se", 297 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 298 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VI.core.se-offlinereq", 299 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 300 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VI.core.ce", 301 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 302 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VI.core.ce-offlinereq", 303 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 304 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VII.core.se", 305 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 306 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VII.core.se-offlinereq", 307 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 308 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VII.core.ce", 309 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 310 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VII.core.ce-offlinereq", 311 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 312 #else 313 /* 314 * For platforms excluding i386, sun4v and opl. 315 */ 316 { "fault.memory.page", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 317 cma_page_retire }, 318 { "fault.memory.page_sb", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 319 cma_page_retire }, 320 { "fault.memory.page_ck", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 321 cma_page_retire }, 322 { "fault.memory.page_ue", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 323 cma_page_retire }, 324 { "fault.memory.dimm", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 325 NULL }, 326 { "fault.memory.dimm_sb", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 327 NULL }, 328 { "fault.memory.dimm_ck", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 329 NULL }, 330 { "fault.memory.dimm_ue", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 331 NULL }, 332 { "fault.memory.dimm-page-retires-excessive", FM_FMRI_SCHEME_MEM, 333 FM_MEM_SCHEME_VERSION, NULL }, 334 { "fault.memory.dimm-ue-imminent", FM_FMRI_SCHEME_MEM, 335 FM_MEM_SCHEME_VERSION, NULL }, 336 { "fault.memory.dram-ue-imminent", FM_FMRI_SCHEME_MEM, 337 FM_MEM_SCHEME_VERSION, NULL }, 338 { "fault.memory.dimm_testfail", FM_FMRI_SCHEME_MEM, 339 FM_MEM_SCHEME_VERSION, NULL }, 340 { "fault.memory.bank", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 341 NULL }, 342 { "fault.memory.datapath", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 343 NULL }, 344 345 /* 346 * The following faults do NOT retire a cpu thread, 347 * and therefore must be intercepted before 348 * the default "fault.cpu.*" dispatch to cma_cpu_cpu_retire. 349 */ 350 { "fault.cpu.ultraSPARC-IVplus.l2cachedata-line", 351 FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 352 cma_cache_way_retire }, 353 { "fault.cpu.ultraSPARC-IVplus.l3cachedata-line", 354 FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 355 cma_cache_way_retire }, 356 { "fault.cpu.ultraSPARC-IVplus.l2cachetag-line", 357 FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 358 cma_cache_way_retire }, 359 { "fault.cpu.ultraSPARC-IVplus.l3cachetag-line", 360 FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 361 cma_cache_way_retire }, 362 363 /* 364 * Default "fault.cpu.*" for "cpu" scheme ASRU dispatch. 365 */ 366 { "fault.cpu.*", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 367 cma_cpu_cpu_retire }, 368 #endif 369 { NULL, NULL, 0, NULL } 370 }; 371 372 static const cma_subscriber_t * 373 nvl2subr(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t **asrup) 374 { 375 const cma_subscriber_t *sp; 376 nvlist_t *asru; 377 char *scheme; 378 uint8_t version; 379 boolean_t retire; 380 381 if (nvlist_lookup_boolean_value(nvl, FM_SUSPECT_RETIRE, &retire) == 0 && 382 retire == 0) { 383 fmd_hdl_debug(hdl, "cma_recv: retire suppressed"); 384 return (NULL); 385 } 386 387 if (nvlist_lookup_nvlist(nvl, FM_FAULT_ASRU, &asru) != 0 || 388 nvlist_lookup_string(asru, FM_FMRI_SCHEME, &scheme) != 0 || 389 nvlist_lookup_uint8(asru, FM_VERSION, &version) != 0) { 390 cma_stats.bad_flts.fmds_value.ui64++; 391 return (NULL); 392 } 393 394 for (sp = cma_subrs; sp->subr_class != NULL; sp++) { 395 if (fmd_nvl_class_match(hdl, nvl, sp->subr_class) && 396 strcmp(scheme, sp->subr_sname) == 0 && 397 version <= sp->subr_svers) { 398 *asrup = asru; 399 return (sp); 400 } 401 } 402 403 cma_stats.nop_flts.fmds_value.ui64++; 404 return (NULL); 405 } 406 407 static void 408 cma_recv_list(fmd_hdl_t *hdl, nvlist_t *nvl, const char *class) 409 { 410 char *uuid = NULL; 411 nvlist_t **nva, **save_nva; 412 uint_t nvc = 0, save_nvc; 413 uint_t keepopen; 414 int err = 0; 415 nvlist_t *asru = NULL; 416 uint32_t index; 417 418 err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid); 419 err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 420 &nva, &nvc); 421 if (err != 0) { 422 cma_stats.bad_flts.fmds_value.ui64++; 423 return; 424 } 425 426 save_nvc = keepopen = nvc; 427 save_nva = nva; 428 while (nvc-- != 0 && (strcmp(class, FM_LIST_SUSPECT_CLASS) != 0 || 429 !fmd_case_uuclosed(hdl, uuid))) { 430 nvlist_t *nvl = *nva++; 431 const cma_subscriber_t *subr; 432 int has_fault; 433 434 if ((subr = nvl2subr(hdl, nvl, &asru)) == NULL) 435 continue; 436 437 /* 438 * A handler returns CMA_RA_SUCCESS to indicate that 439 * from this suspects point-of-view the case may be 440 * closed, CMA_RA_FAILURE otherwise. 441 * A handler must not close the case itself. 442 */ 443 if (subr->subr_func != NULL) { 444 has_fault = fmd_nvl_fmri_has_fault(hdl, asru, 445 FMD_HAS_FAULT_ASRU, NULL); 446 if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0) { 447 if (has_fault == 1) 448 err = subr->subr_func(hdl, nvl, asru, 449 uuid, 0); 450 } else { 451 if (has_fault == 0) 452 err = subr->subr_func(hdl, nvl, asru, 453 uuid, 1); 454 } 455 if (err == CMA_RA_SUCCESS) 456 keepopen--; 457 } 458 } 459 460 /* 461 * Run though again to catch any new faults in list.updated. 462 */ 463 while (save_nvc-- != 0 && (strcmp(class, FM_LIST_UPDATED_CLASS) == 0)) { 464 nvlist_t *nvl = *save_nva++; 465 const cma_subscriber_t *subr; 466 int has_fault; 467 468 if ((subr = nvl2subr(hdl, nvl, &asru)) == NULL) 469 continue; 470 if (subr->subr_func != NULL) { 471 has_fault = fmd_nvl_fmri_has_fault(hdl, asru, 472 FMD_HAS_FAULT_ASRU, NULL); 473 if (has_fault == 1) 474 err = subr->subr_func(hdl, nvl, asru, uuid, 0); 475 } 476 } 477 478 /* 479 * Do not close the case if we are handling cache faults. 480 */ 481 if (asru != NULL) { 482 if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_CACHE_INDEX, 483 &index) != 0) { 484 if (!keepopen && strcmp(class, 485 FM_LIST_SUSPECT_CLASS) == 0) { 486 fmd_case_uuclose(hdl, uuid); 487 } 488 } 489 } 490 491 if (!keepopen && strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) 492 fmd_case_uuresolved(hdl, uuid); 493 } 494 495 static void 496 cma_recv_one(fmd_hdl_t *hdl, nvlist_t *nvl) 497 { 498 const cma_subscriber_t *subr; 499 nvlist_t *asru; 500 501 if ((subr = nvl2subr(hdl, nvl, &asru)) == NULL) 502 return; 503 504 if (subr->subr_func != NULL) { 505 if (fmd_nvl_fmri_has_fault(hdl, asru, 506 FMD_HAS_FAULT_ASRU, NULL) == 1) 507 (void) subr->subr_func(hdl, nvl, asru, NULL, 0); 508 } 509 } 510 511 /*ARGSUSED*/ 512 static void 513 cma_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) 514 { 515 fmd_hdl_debug(hdl, "received %s\n", class); 516 517 if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) 518 return; 519 520 if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 || 521 strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 || 522 strcmp(class, FM_LIST_UPDATED_CLASS) == 0) 523 cma_recv_list(hdl, nvl, class); 524 else 525 cma_recv_one(hdl, nvl); 526 } 527 528 /*ARGSUSED*/ 529 static void 530 cma_timeout(fmd_hdl_t *hdl, id_t id, void *arg) 531 { 532 if (id == cma.cma_page_timerid) 533 cma_page_retry(hdl); 534 #ifdef sun4v 535 /* 536 * cpu offline/online needs to be retried on sun4v because 537 * ldom request can be asynchronous. 538 */ 539 else if (id == cma.cma_cpu_timerid) 540 cma_cpu_retry(hdl); 541 #endif 542 } 543 544 #ifdef sun4v 545 static void * 546 cma_init_alloc(size_t size) 547 { 548 return (fmd_hdl_alloc(init_hdl, size, FMD_SLEEP)); 549 } 550 551 static void 552 cma_init_free(void *addr, size_t size) 553 { 554 fmd_hdl_free(init_hdl, addr, size); 555 } 556 #endif 557 558 static const fmd_hdl_ops_t fmd_ops = { 559 cma_recv, /* fmdo_recv */ 560 cma_timeout, /* fmdo_timeout */ 561 NULL, /* fmdo_close */ 562 NULL, /* fmdo_stats */ 563 NULL, /* fmdo_gc */ 564 }; 565 566 static const fmd_prop_t fmd_props[] = { 567 { "cpu_tries", FMD_TYPE_UINT32, "10" }, 568 { "cpu_delay", FMD_TYPE_TIME, "1sec" }, 569 #ifdef sun4v 570 { "cpu_ret_mindelay", FMD_TYPE_TIME, "5sec" }, 571 { "cpu_ret_maxdelay", FMD_TYPE_TIME, "5min" }, 572 #endif /* sun4v */ 573 { "cpu_offline_enable", FMD_TYPE_BOOL, "true" }, 574 { "cpu_online_enable", FMD_TYPE_BOOL, "true" }, 575 { "cpu_forced_offline", FMD_TYPE_BOOL, "true" }, 576 #ifdef opl 577 { "cpu_blacklist_enable", FMD_TYPE_BOOL, "false" }, 578 { "cpu_unblacklist_enable", FMD_TYPE_BOOL, "false" }, 579 #else 580 { "cpu_blacklist_enable", FMD_TYPE_BOOL, "true" }, 581 { "cpu_unblacklist_enable", FMD_TYPE_BOOL, "true" }, 582 #endif /* opl */ 583 { "page_ret_mindelay", FMD_TYPE_TIME, "1sec" }, 584 { "page_ret_maxdelay", FMD_TYPE_TIME, "5min" }, 585 { "page_retire_enable", FMD_TYPE_BOOL, "true" }, 586 { "page_unretire_enable", FMD_TYPE_BOOL, "true" }, 587 { NULL, 0, NULL } 588 }; 589 590 static const fmd_hdl_info_t fmd_info = { 591 "CPU/Memory Retire Agent", CMA_VERSION, &fmd_ops, fmd_props 592 }; 593 594 void 595 _fmd_init(fmd_hdl_t *hdl) 596 { 597 hrtime_t nsec; 598 #ifdef i386 599 char buf[BUFSIZ]; 600 const char *dom0 = "control_d"; 601 602 /* 603 * Abort the cpumem-retire module if Solaris is running under DomU. 604 */ 605 if (sysinfo(SI_PLATFORM, buf, sizeof (buf)) == -1) 606 return; 607 608 if (strncmp(buf, "i86pc", sizeof (buf)) == 0) { 609 cma_is_native = B_TRUE; 610 } else if (strncmp(buf, "i86xpv", sizeof (buf)) != 0) { 611 return; 612 } else { 613 int fd = open("/dev/xen/domcaps", O_RDONLY); 614 615 if (fd != -1) { 616 if (read(fd, buf, sizeof (buf)) <= 0 || 617 strncmp(buf, dom0, strlen(dom0)) != 0) { 618 (void) close(fd); 619 return; 620 } 621 (void) close(fd); 622 } 623 cma_is_native = B_FALSE; 624 } 625 #endif /* i386 */ 626 627 if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) 628 return; /* invalid data in configuration file */ 629 630 fmd_hdl_subscribe(hdl, "fault.cpu.*"); 631 fmd_hdl_subscribe(hdl, "fault.memory.*"); 632 #ifdef opl 633 fmd_hdl_subscribe(hdl, "fault.chassis.SPARC-Enterprise.cpu.*"); 634 #endif 635 636 (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (cma_stats) / 637 sizeof (fmd_stat_t), (fmd_stat_t *)&cma_stats); 638 639 cma.cma_cpu_tries = fmd_prop_get_int32(hdl, "cpu_tries"); 640 641 nsec = fmd_prop_get_int64(hdl, "cpu_delay"); 642 cma.cma_cpu_delay.tv_sec = nsec / NANOSEC; 643 cma.cma_cpu_delay.tv_nsec = nsec % NANOSEC; 644 645 cma.cma_page_mindelay = fmd_prop_get_int64(hdl, "page_ret_mindelay"); 646 cma.cma_page_maxdelay = fmd_prop_get_int64(hdl, "page_ret_maxdelay"); 647 648 #ifdef sun4v 649 cma.cma_cpu_mindelay = fmd_prop_get_int64(hdl, "cpu_ret_mindelay"); 650 cma.cma_cpu_maxdelay = fmd_prop_get_int64(hdl, "cpu_ret_maxdelay"); 651 #endif 652 653 cma.cma_cpu_dooffline = fmd_prop_get_int32(hdl, "cpu_offline_enable"); 654 cma.cma_cpu_forcedoffline = fmd_prop_get_int32(hdl, 655 "cpu_forced_offline"); 656 cma.cma_cpu_doonline = fmd_prop_get_int32(hdl, "cpu_online_enable"); 657 cma.cma_cpu_doblacklist = fmd_prop_get_int32(hdl, 658 "cpu_blacklist_enable"); 659 cma.cma_cpu_dounblacklist = fmd_prop_get_int32(hdl, 660 "cpu_unblacklist_enable"); 661 cma.cma_page_doretire = fmd_prop_get_int32(hdl, "page_retire_enable"); 662 cma.cma_page_dounretire = fmd_prop_get_int32(hdl, 663 "page_unretire_enable"); 664 665 if (cma.cma_page_maxdelay < cma.cma_page_mindelay) 666 fmd_hdl_abort(hdl, "page retirement delays conflict\n"); 667 668 #ifdef sun4v 669 init_hdl = hdl; 670 cma_lhp = ldom_init(cma_init_alloc, cma_init_free); 671 #endif 672 } 673 674 void 675 _fmd_fini(fmd_hdl_t *hdl) 676 { 677 #ifdef sun4v 678 ldom_fini(cma_lhp); 679 cma_cpu_fini(hdl); 680 #endif 681 cma_page_fini(hdl); 682 } 683