1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 #include <cma.h> 27 28 #include <unistd.h> 29 #include <fcntl.h> 30 #include <strings.h> 31 #include <errno.h> 32 #include <time.h> 33 #include <fm/fmd_api.h> 34 #include <sys/fm/protocol.h> 35 #include <sys/systeminfo.h> 36 #include <sys/utsname.h> 37 38 #ifdef sun4v 39 #include <sys/fm/ldom.h> 40 41 static fmd_hdl_t *init_hdl; 42 ldom_hdl_t *cma_lhp; 43 #endif 44 45 #ifdef i386 46 boolean_t cma_is_native; 47 #endif 48 49 extern const char *fmd_fmri_get_platform(); 50 51 cma_t cma; 52 53 cma_stats_t cma_stats = { 54 { "cpu_flts", FMD_TYPE_UINT64, "cpu faults resolved" }, 55 { "cpu_repairs", FMD_TYPE_UINT64, "cpu faults repaired" }, 56 { "cpu_fails", FMD_TYPE_UINT64, "cpu faults unresolveable" }, 57 { "cpu_blfails", FMD_TYPE_UINT64, "failed cpu blacklists" }, 58 { "cpu_supp", FMD_TYPE_UINT64, "cpu offlines suppressed" }, 59 { "cpu_blsupp", FMD_TYPE_UINT64, "cpu blacklists suppressed" }, 60 { "page_flts", FMD_TYPE_UINT64, "page faults resolved" }, 61 { "page_repairs", FMD_TYPE_UINT64, "page faults repaired" }, 62 { "page_fails", FMD_TYPE_UINT64, "page faults unresolveable" }, 63 { "page_supp", FMD_TYPE_UINT64, "page retires suppressed" }, 64 { "page_nonent", FMD_TYPE_UINT64, "retires for non-existent fmris" }, 65 { "bad_flts", FMD_TYPE_UINT64, "invalid fault events received" }, 66 { "nop_flts", FMD_TYPE_UINT64, "inapplicable fault events received" }, 67 { "auto_flts", FMD_TYPE_UINT64, "auto-close faults received" } 68 }; 69 70 typedef struct cma_subscriber { 71 const char *subr_class; 72 const char *subr_sname; 73 uint_t subr_svers; 74 int (*subr_func)(fmd_hdl_t *, nvlist_t *, nvlist_t *, const char *, 75 boolean_t); 76 } cma_subscriber_t; 77 78 static const cma_subscriber_t cma_subrs[] = { 79 #if defined(i386) 80 /* 81 * On x86, the ASRUs are expected to be in hc scheme. When 82 * cpumem-retire wants to retire a cpu or mem page, it calls the 83 * methods registered in the topo node to do that. The topo 84 * enumerator, which necessarily knows all the config info that 85 * we'd ever need in deciding what/how to retire etc. This takes 86 * away much of that complexity from the agent into the entity 87 * that knows all config/topo information. 88 */ 89 { "fault.memory.page", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 90 cma_page_retire }, 91 { "fault.memory.page_sb", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 92 cma_page_retire }, 93 { "fault.memory.page_ck", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 94 cma_page_retire }, 95 { "fault.memory.page_ue", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 96 cma_page_retire }, 97 { "fault.memory.generic-x86.page_ce", FM_FMRI_SCHEME_HC, 98 FM_HC_SCHEME_VERSION, cma_page_retire }, 99 { "fault.memory.generic-x86.page_ue", FM_FMRI_SCHEME_HC, 100 FM_HC_SCHEME_VERSION, cma_page_retire }, 101 { "fault.memory.intel.page_ce", FM_FMRI_SCHEME_HC, 102 FM_HC_SCHEME_VERSION, cma_page_retire }, 103 { "fault.memory.intel.page_ue", FM_FMRI_SCHEME_HC, 104 FM_HC_SCHEME_VERSION, cma_page_retire }, 105 { "fault.memory.dimm", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 106 NULL }, 107 { "fault.memory.dimm_sb", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 108 NULL }, 109 { "fault.memory.dimm_ck", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 110 NULL }, 111 { "fault.memory.dimm_ue", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 112 NULL }, 113 { "fault.memory.generic-x86.dimm_ce", FM_FMRI_SCHEME_HC, 114 FM_HC_SCHEME_VERSION, NULL }, 115 { "fault.memory.generic-x86.dimm_ue", FM_FMRI_SCHEME_HC, 116 FM_HC_SCHEME_VERSION, NULL }, 117 { "fault.memory.intel.dimm_ce", FM_FMRI_SCHEME_HC, 118 FM_HC_SCHEME_VERSION, NULL }, 119 { "fault.memory.intel.dimm_ue", FM_FMRI_SCHEME_HC, 120 FM_HC_SCHEME_VERSION, NULL }, 121 { "fault.memory.intel.fbd.*", FM_FMRI_SCHEME_HC, 122 FM_HC_SCHEME_VERSION, NULL }, 123 { "fault.memory.dimm_testfail", FM_FMRI_SCHEME_HC, 124 FM_HC_SCHEME_VERSION, NULL }, 125 { "fault.memory.bank", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 126 NULL }, 127 { "fault.memory.datapath", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 128 NULL }, 129 { "fault.cpu.intel.quickpath.mem_scrubbing", FM_FMRI_SCHEME_HC, 130 FM_HC_SCHEME_VERSION, cma_page_retire }, 131 { "fault.cpu.intel.quickpath.*", FM_FMRI_SCHEME_HC, 132 FM_HC_SCHEME_VERSION, NULL }, 133 { "fault.cpu.generic-x86.mc", FM_FMRI_SCHEME_HC, 134 FM_HC_SCHEME_VERSION, NULL }, 135 { "fault.cpu.intel.dma", FM_FMRI_SCHEME_HC, 136 FM_HC_SCHEME_VERSION, NULL }, 137 { "fault.cpu.intel.dma", FM_FMRI_SCHEME_CPU, 138 FM_CPU_SCHEME_VERSION, NULL }, 139 140 /* 141 * The ASRU for cpu faults are in cpu scheme on native and in hc 142 * scheme on xpv. So each cpu fault class needs to be listed twice. 143 */ 144 145 /* 146 * The following faults do NOT retire a cpu thread, 147 * and therefore must be intercepted before 148 * the default "fault.cpu.*" dispatch to cma_cpu_hc_retire. 149 */ 150 { "fault.cpu.amd.dramchannel", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 151 NULL }, 152 { "fault.cpu.amd.dramchannel", FM_FMRI_SCHEME_CPU, 153 FM_CPU_SCHEME_VERSION, NULL }, 154 { "fault.cpu.generic-x86.bus_interconnect_memory", FM_FMRI_SCHEME_HC, 155 FM_HC_SCHEME_VERSION, NULL }, 156 { "fault.cpu.generic-x86.bus_interconnect_memory", FM_FMRI_SCHEME_CPU, 157 FM_CPU_SCHEME_VERSION, NULL }, 158 { "fault.cpu.generic-x86.bus_interconnect_io", FM_FMRI_SCHEME_HC, 159 FM_HC_SCHEME_VERSION, NULL }, 160 { "fault.cpu.generic-x86.bus_interconnect_io", FM_FMRI_SCHEME_CPU, 161 FM_CPU_SCHEME_VERSION, NULL }, 162 { "fault.cpu.generic-x86.bus_interconnect", FM_FMRI_SCHEME_HC, 163 FM_HC_SCHEME_VERSION, NULL }, 164 { "fault.cpu.generic-x86.bus_interconnect", FM_FMRI_SCHEME_CPU, 165 FM_CPU_SCHEME_VERSION, NULL }, 166 { "fault.cpu.intel.bus_interconnect_memory", FM_FMRI_SCHEME_HC, 167 FM_HC_SCHEME_VERSION, NULL }, 168 { "fault.cpu.intel.bus_interconnect_memory", FM_FMRI_SCHEME_CPU, 169 FM_CPU_SCHEME_VERSION, NULL }, 170 { "fault.cpu.intel.bus_interconnect_io", FM_FMRI_SCHEME_HC, 171 FM_HC_SCHEME_VERSION, NULL }, 172 { "fault.cpu.intel.bus_interconnect_io", FM_FMRI_SCHEME_CPU, 173 FM_CPU_SCHEME_VERSION, NULL }, 174 { "fault.cpu.intel.bus_interconnect", FM_FMRI_SCHEME_HC, 175 FM_HC_SCHEME_VERSION, NULL }, 176 { "fault.cpu.intel.bus_interconnect", FM_FMRI_SCHEME_CPU, 177 FM_CPU_SCHEME_VERSION, NULL }, 178 { "fault.cpu.intel.nb.*", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 179 NULL }, 180 { "fault.cpu.intel.nb.*", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 181 NULL }, 182 { "fault.cpu.intel.dma", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 183 NULL }, 184 { "fault.cpu.intel.dma", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 185 NULL }, 186 { "fault.cpu.*", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 187 cma_cpu_hc_retire }, 188 { "fault.cpu.*", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 189 cma_cpu_hc_retire }, 190 #elif defined(sun4v) 191 /* 192 * The following are PI sun4v faults 193 */ 194 { "fault.memory.memlink", FM_FMRI_SCHEME_HC, 195 FM_HC_SCHEME_VERSION, NULL }, 196 { "fault.memory.memlink-uc", FM_FMRI_SCHEME_HC, 197 FM_HC_SCHEME_VERSION, NULL }, 198 { "fault.memory.memlink-failover", FM_FMRI_SCHEME_HC, 199 FM_HC_SCHEME_VERSION, NULL }, 200 { "fault.memory.dimm-ue-imminent", FM_FMRI_SCHEME_HC, 201 FM_HC_SCHEME_VERSION, NULL }, 202 { "fault.memory.dram-ue-imminent", FM_FMRI_SCHEME_HC, 203 FM_HC_SCHEME_VERSION, NULL }, 204 { "fault.memory.dimm-page-retires-excessive", FM_FMRI_SCHEME_HC, 205 FM_HC_SCHEME_VERSION, NULL }, 206 { "fault.memory.page", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 207 cma_page_retire }, 208 { "fault.memory.dimm", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 209 NULL }, 210 { "fault.memory.dimm_sb", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 211 NULL }, 212 { "fault.memory.dimm_ck", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 213 NULL }, 214 { "fault.memory.dimm_ue", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 215 NULL }, 216 { "fault.memory.dimm-page-retires-excessive", FM_FMRI_SCHEME_MEM, 217 FM_MEM_SCHEME_VERSION, NULL }, 218 { "fault.memory.dimm-ue-imminent", FM_FMRI_SCHEME_MEM, 219 FM_MEM_SCHEME_VERSION, NULL }, 220 { "fault.memory.dram-ue-imminent", FM_FMRI_SCHEME_MEM, 221 FM_MEM_SCHEME_VERSION, NULL }, 222 { "fault.memory.bank", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 223 NULL }, 224 { "fault.memory.datapath", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 225 NULL }, 226 { "fault.memory.datapath", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 227 NULL }, 228 { "fault.memory.link-c", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 229 NULL }, 230 { "fault.memory.link-u", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 231 NULL }, 232 { "fault.memory.link-f", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 233 NULL }, 234 { "fault.memory.link-c", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 235 NULL }, 236 { "fault.memory.link-u", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 237 NULL }, 238 { "fault.memory.link-f", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 239 NULL }, 240 241 /* 242 * The following ultraSPARC-T1/T2 faults do NOT retire a cpu thread, 243 * and therefore must be intercepted before 244 * the default "fault.cpu.*" dispatch to cma_cpu_hc_retire. 245 */ 246 { "fault.cpu.*.l2cachedata", FM_FMRI_SCHEME_CPU, 247 FM_CPU_SCHEME_VERSION, NULL }, 248 { "fault.cpu.*.l2cachetag", FM_FMRI_SCHEME_CPU, 249 FM_CPU_SCHEME_VERSION, NULL }, 250 { "fault.cpu.*.l2cachectl", FM_FMRI_SCHEME_CPU, 251 FM_CPU_SCHEME_VERSION, NULL }, 252 { "fault.cpu.*.l2data-c", FM_FMRI_SCHEME_CPU, 253 FM_CPU_SCHEME_VERSION, NULL }, 254 { "fault.cpu.*.l2data-u", FM_FMRI_SCHEME_CPU, 255 FM_CPU_SCHEME_VERSION, NULL }, 256 { "fault.cpu.*.mau", FM_FMRI_SCHEME_CPU, 257 FM_CPU_SCHEME_VERSION, NULL }, 258 { "fault.cpu.*.lfu-u", FM_FMRI_SCHEME_CPU, 259 FM_CPU_SCHEME_VERSION, NULL }, 260 { "fault.cpu.*.lfu-f", FM_FMRI_SCHEME_CPU, 261 FM_CPU_SCHEME_VERSION, NULL }, 262 { "fault.cpu.*.lfu-p", FM_FMRI_SCHEME_CPU, 263 FM_CPU_SCHEME_VERSION, NULL }, 264 { "fault.cpu.ultraSPARC-T1.freg", FM_FMRI_SCHEME_CPU, 265 FM_CPU_SCHEME_VERSION, NULL }, 266 { "fault.cpu.ultraSPARC-T1.l2cachedata", FM_FMRI_SCHEME_CPU, 267 FM_CPU_SCHEME_VERSION, NULL }, 268 { "fault.cpu.ultraSPARC-T1.l2cachetag", FM_FMRI_SCHEME_CPU, 269 FM_CPU_SCHEME_VERSION, NULL }, 270 { "fault.cpu.ultraSPARC-T1.l2cachectl", FM_FMRI_SCHEME_CPU, 271 FM_CPU_SCHEME_VERSION, NULL }, 272 { "fault.cpu.ultraSPARC-T1.mau", FM_FMRI_SCHEME_CPU, 273 FM_CPU_SCHEME_VERSION, NULL }, 274 { "fault.cpu.ultraSPARC-T2plus.chip", FM_FMRI_SCHEME_HC, 275 FM_HC_SCHEME_VERSION, NULL }, 276 { "fault.cpu.*", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 277 cma_cpu_hc_retire }, 278 { "fault.cpu.*", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 279 cma_cpu_hc_retire }, 280 #elif defined(opl) 281 { "fault.memory.page", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 282 cma_page_retire }, 283 { "fault.memory.dimm", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 284 NULL }, 285 { "fault.memory.dimm-page-retires-excessive", FM_FMRI_SCHEME_MEM, 286 FM_MEM_SCHEME_VERSION, NULL }, 287 { "fault.memory.dimm-ue-imminent", FM_FMRI_SCHEME_MEM, 288 FM_MEM_SCHEME_VERSION, NULL }, 289 { "fault.memory.dram-ue-imminent", FM_FMRI_SCHEME_MEM, 290 FM_MEM_SCHEME_VERSION, NULL }, 291 { "fault.memory.bank", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 292 NULL }, 293 { "fault.cpu.SPARC64-VI.*", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 294 cma_cpu_cpu_retire }, 295 { "fault.cpu.SPARC64-VII.*", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 296 cma_cpu_cpu_retire }, 297 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VI.core.se", 298 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 299 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VI.core.se-offlinereq", 300 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 301 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VI.core.ce", 302 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 303 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VI.core.ce-offlinereq", 304 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 305 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VII.core.se", 306 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 307 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VII.core.se-offlinereq", 308 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 309 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VII.core.ce", 310 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 311 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VII.core.ce-offlinereq", 312 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 313 #else 314 /* 315 * For platforms excluding i386, sun4v and opl. 316 */ 317 { "fault.memory.page", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 318 cma_page_retire }, 319 { "fault.memory.page_sb", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 320 cma_page_retire }, 321 { "fault.memory.page_ck", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 322 cma_page_retire }, 323 { "fault.memory.page_ue", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 324 cma_page_retire }, 325 { "fault.memory.dimm", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 326 NULL }, 327 { "fault.memory.dimm_sb", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 328 NULL }, 329 { "fault.memory.dimm_ck", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 330 NULL }, 331 { "fault.memory.dimm_ue", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 332 NULL }, 333 { "fault.memory.dimm-page-retires-excessive", FM_FMRI_SCHEME_MEM, 334 FM_MEM_SCHEME_VERSION, NULL }, 335 { "fault.memory.dimm-ue-imminent", FM_FMRI_SCHEME_MEM, 336 FM_MEM_SCHEME_VERSION, NULL }, 337 { "fault.memory.dram-ue-imminent", FM_FMRI_SCHEME_MEM, 338 FM_MEM_SCHEME_VERSION, NULL }, 339 { "fault.memory.dimm_testfail", FM_FMRI_SCHEME_MEM, 340 FM_MEM_SCHEME_VERSION, NULL }, 341 { "fault.memory.bank", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 342 NULL }, 343 { "fault.memory.datapath", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 344 NULL }, 345 { "fault.memory.datapath", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 346 NULL }, 347 { "fault.memory.datapath", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 348 NULL }, 349 350 /* 351 * The following faults do NOT retire a cpu thread, 352 * and therefore must be intercepted before 353 * the default "fault.cpu.*" dispatch to cma_cpu_cpu_retire. 354 */ 355 { "fault.cpu.ultraSPARC-IVplus.l2cachedata-line", 356 FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 357 cma_cache_way_retire }, 358 { "fault.cpu.ultraSPARC-IVplus.l3cachedata-line", 359 FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 360 cma_cache_way_retire }, 361 { "fault.cpu.ultraSPARC-IVplus.l2cachetag-line", 362 FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 363 cma_cache_way_retire }, 364 { "fault.cpu.ultraSPARC-IVplus.l3cachetag-line", 365 FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 366 cma_cache_way_retire }, 367 368 /* 369 * Default "fault.cpu.*" for "cpu" scheme ASRU dispatch. 370 */ 371 { "fault.cpu.*", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 372 cma_cpu_cpu_retire }, 373 #endif 374 { NULL, NULL, 0, NULL } 375 }; 376 377 static const cma_subscriber_t * 378 nvl2subr(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t **asrup) 379 { 380 const cma_subscriber_t *sp; 381 nvlist_t *asru; 382 char *scheme; 383 uint8_t version; 384 boolean_t retire; 385 386 if (nvlist_lookup_boolean_value(nvl, FM_SUSPECT_RETIRE, &retire) == 0 && 387 retire == 0) { 388 fmd_hdl_debug(hdl, "cma_recv: retire suppressed"); 389 return (NULL); 390 } 391 392 if (nvlist_lookup_nvlist(nvl, FM_FAULT_ASRU, &asru) != 0 || 393 nvlist_lookup_string(asru, FM_FMRI_SCHEME, &scheme) != 0 || 394 nvlist_lookup_uint8(asru, FM_VERSION, &version) != 0) { 395 cma_stats.bad_flts.fmds_value.ui64++; 396 return (NULL); 397 } 398 399 for (sp = cma_subrs; sp->subr_class != NULL; sp++) { 400 if (fmd_nvl_class_match(hdl, nvl, sp->subr_class) && 401 strcmp(scheme, sp->subr_sname) == 0 && 402 version <= sp->subr_svers) { 403 *asrup = asru; 404 return (sp); 405 } 406 } 407 408 cma_stats.nop_flts.fmds_value.ui64++; 409 return (NULL); 410 } 411 412 static void 413 cma_recv_list(fmd_hdl_t *hdl, nvlist_t *nvl, const char *class) 414 { 415 char *uuid = NULL; 416 nvlist_t **nva, **save_nva; 417 uint_t nvc = 0, save_nvc; 418 uint_t keepopen; 419 int err = 0; 420 nvlist_t *asru = NULL; 421 uint32_t index; 422 423 err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid); 424 err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 425 &nva, &nvc); 426 if (err != 0) { 427 cma_stats.bad_flts.fmds_value.ui64++; 428 return; 429 } 430 431 save_nvc = keepopen = nvc; 432 save_nva = nva; 433 while (nvc-- != 0 && (strcmp(class, FM_LIST_SUSPECT_CLASS) != 0 || 434 !fmd_case_uuclosed(hdl, uuid))) { 435 nvlist_t *nvl = *nva++; 436 const cma_subscriber_t *subr; 437 int has_fault; 438 439 if ((subr = nvl2subr(hdl, nvl, &asru)) == NULL) 440 continue; 441 442 /* 443 * A handler returns CMA_RA_SUCCESS to indicate that 444 * from this suspects point-of-view the case may be 445 * closed, CMA_RA_FAILURE otherwise. 446 * A handler must not close the case itself. 447 */ 448 if (subr->subr_func != NULL) { 449 has_fault = fmd_nvl_fmri_has_fault(hdl, asru, 450 FMD_HAS_FAULT_ASRU, NULL); 451 if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0) { 452 if (has_fault == 1) 453 err = subr->subr_func(hdl, nvl, asru, 454 uuid, 0); 455 } else { 456 if (has_fault == 0) 457 err = subr->subr_func(hdl, nvl, asru, 458 uuid, 1); 459 } 460 if (err == CMA_RA_SUCCESS) 461 keepopen--; 462 } 463 } 464 465 /* 466 * Run though again to catch any new faults in list.updated. 467 */ 468 while (save_nvc-- != 0 && (strcmp(class, FM_LIST_UPDATED_CLASS) == 0)) { 469 nvlist_t *nvl = *save_nva++; 470 const cma_subscriber_t *subr; 471 int has_fault; 472 473 if ((subr = nvl2subr(hdl, nvl, &asru)) == NULL) 474 continue; 475 if (subr->subr_func != NULL) { 476 has_fault = fmd_nvl_fmri_has_fault(hdl, asru, 477 FMD_HAS_FAULT_ASRU, NULL); 478 if (has_fault == 1) 479 err = subr->subr_func(hdl, nvl, asru, uuid, 0); 480 } 481 } 482 483 /* 484 * Do not close the case if we are handling cache faults. 485 */ 486 if (asru != NULL) { 487 if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_CACHE_INDEX, 488 &index) != 0) { 489 if (!keepopen && strcmp(class, 490 FM_LIST_SUSPECT_CLASS) == 0) { 491 fmd_case_uuclose(hdl, uuid); 492 } 493 } 494 } 495 496 if (!keepopen && strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) 497 fmd_case_uuresolved(hdl, uuid); 498 } 499 500 static void 501 cma_recv_one(fmd_hdl_t *hdl, nvlist_t *nvl) 502 { 503 const cma_subscriber_t *subr; 504 nvlist_t *asru; 505 506 if ((subr = nvl2subr(hdl, nvl, &asru)) == NULL) 507 return; 508 509 if (subr->subr_func != NULL) { 510 if (fmd_nvl_fmri_has_fault(hdl, asru, 511 FMD_HAS_FAULT_ASRU, NULL) == 1) 512 (void) subr->subr_func(hdl, nvl, asru, NULL, 0); 513 } 514 } 515 516 /*ARGSUSED*/ 517 static void 518 cma_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) 519 { 520 fmd_hdl_debug(hdl, "received %s\n", class); 521 522 if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) 523 return; 524 525 if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 || 526 strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 || 527 strcmp(class, FM_LIST_UPDATED_CLASS) == 0) 528 cma_recv_list(hdl, nvl, class); 529 else 530 cma_recv_one(hdl, nvl); 531 } 532 533 /*ARGSUSED*/ 534 static void 535 cma_timeout(fmd_hdl_t *hdl, id_t id, void *arg) 536 { 537 if (id == cma.cma_page_timerid) 538 cma_page_retry(hdl); 539 #ifdef sun4v 540 /* 541 * cpu offline/online needs to be retried on sun4v because 542 * ldom request can be asynchronous. 543 */ 544 else if (id == cma.cma_cpu_timerid) 545 cma_cpu_retry(hdl); 546 #endif 547 } 548 549 #ifdef sun4v 550 static void * 551 cma_init_alloc(size_t size) 552 { 553 return (fmd_hdl_alloc(init_hdl, size, FMD_SLEEP)); 554 } 555 556 static void 557 cma_init_free(void *addr, size_t size) 558 { 559 fmd_hdl_free(init_hdl, addr, size); 560 } 561 #endif 562 563 static const fmd_hdl_ops_t fmd_ops = { 564 cma_recv, /* fmdo_recv */ 565 cma_timeout, /* fmdo_timeout */ 566 NULL, /* fmdo_close */ 567 NULL, /* fmdo_stats */ 568 NULL, /* fmdo_gc */ 569 }; 570 571 static const fmd_prop_t fmd_props[] = { 572 { "cpu_tries", FMD_TYPE_UINT32, "10" }, 573 { "cpu_delay", FMD_TYPE_TIME, "1sec" }, 574 #ifdef sun4v 575 { "cpu_ret_mindelay", FMD_TYPE_TIME, "5sec" }, 576 { "cpu_ret_maxdelay", FMD_TYPE_TIME, "5min" }, 577 #endif /* sun4v */ 578 { "cpu_offline_enable", FMD_TYPE_BOOL, "true" }, 579 { "cpu_online_enable", FMD_TYPE_BOOL, "true" }, 580 { "cpu_forced_offline", FMD_TYPE_BOOL, "true" }, 581 #ifdef opl 582 { "cpu_blacklist_enable", FMD_TYPE_BOOL, "false" }, 583 { "cpu_unblacklist_enable", FMD_TYPE_BOOL, "false" }, 584 #else 585 { "cpu_blacklist_enable", FMD_TYPE_BOOL, "true" }, 586 { "cpu_unblacklist_enable", FMD_TYPE_BOOL, "true" }, 587 #endif /* opl */ 588 { "page_ret_mindelay", FMD_TYPE_TIME, "1sec" }, 589 { "page_ret_maxdelay", FMD_TYPE_TIME, "5min" }, 590 { "page_retire_enable", FMD_TYPE_BOOL, "true" }, 591 { "page_unretire_enable", FMD_TYPE_BOOL, "true" }, 592 { NULL, 0, NULL } 593 }; 594 595 static const fmd_hdl_info_t fmd_info = { 596 "CPU/Memory Retire Agent", CMA_VERSION, &fmd_ops, fmd_props 597 }; 598 599 void 600 _fmd_init(fmd_hdl_t *hdl) 601 { 602 hrtime_t nsec; 603 #ifdef i386 604 char buf[BUFSIZ]; 605 const char *dom0 = "control_d"; 606 607 /* 608 * Abort the cpumem-retire module if Solaris is running under DomU. 609 */ 610 if (sysinfo(SI_PLATFORM, buf, sizeof (buf)) == -1) 611 return; 612 613 if (strncmp(buf, "i86pc", sizeof (buf)) == 0) { 614 cma_is_native = B_TRUE; 615 } else if (strncmp(buf, "i86xpv", sizeof (buf)) != 0) { 616 return; 617 } else { 618 int fd = open("/dev/xen/domcaps", O_RDONLY); 619 620 if (fd != -1) { 621 if (read(fd, buf, sizeof (buf)) <= 0 || 622 strncmp(buf, dom0, strlen(dom0)) != 0) { 623 (void) close(fd); 624 return; 625 } 626 (void) close(fd); 627 } 628 cma_is_native = B_FALSE; 629 } 630 #endif /* i386 */ 631 632 if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) 633 return; /* invalid data in configuration file */ 634 635 fmd_hdl_subscribe(hdl, "fault.cpu.*"); 636 fmd_hdl_subscribe(hdl, "fault.memory.*"); 637 #ifdef opl 638 fmd_hdl_subscribe(hdl, "fault.chassis.SPARC-Enterprise.cpu.*"); 639 #endif 640 641 (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (cma_stats) / 642 sizeof (fmd_stat_t), (fmd_stat_t *)&cma_stats); 643 644 cma.cma_cpu_tries = fmd_prop_get_int32(hdl, "cpu_tries"); 645 646 nsec = fmd_prop_get_int64(hdl, "cpu_delay"); 647 cma.cma_cpu_delay.tv_sec = nsec / NANOSEC; 648 cma.cma_cpu_delay.tv_nsec = nsec % NANOSEC; 649 650 cma.cma_page_mindelay = fmd_prop_get_int64(hdl, "page_ret_mindelay"); 651 cma.cma_page_maxdelay = fmd_prop_get_int64(hdl, "page_ret_maxdelay"); 652 653 #ifdef sun4v 654 cma.cma_cpu_mindelay = fmd_prop_get_int64(hdl, "cpu_ret_mindelay"); 655 cma.cma_cpu_maxdelay = fmd_prop_get_int64(hdl, "cpu_ret_maxdelay"); 656 #endif 657 658 cma.cma_cpu_dooffline = fmd_prop_get_int32(hdl, "cpu_offline_enable"); 659 cma.cma_cpu_forcedoffline = fmd_prop_get_int32(hdl, 660 "cpu_forced_offline"); 661 cma.cma_cpu_doonline = fmd_prop_get_int32(hdl, "cpu_online_enable"); 662 cma.cma_cpu_doblacklist = fmd_prop_get_int32(hdl, 663 "cpu_blacklist_enable"); 664 cma.cma_cpu_dounblacklist = fmd_prop_get_int32(hdl, 665 "cpu_unblacklist_enable"); 666 cma.cma_page_doretire = fmd_prop_get_int32(hdl, "page_retire_enable"); 667 cma.cma_page_dounretire = fmd_prop_get_int32(hdl, 668 "page_unretire_enable"); 669 670 if (cma.cma_page_maxdelay < cma.cma_page_mindelay) 671 fmd_hdl_abort(hdl, "page retirement delays conflict\n"); 672 673 #ifdef sun4v 674 init_hdl = hdl; 675 cma_lhp = ldom_init(cma_init_alloc, cma_init_free); 676 #endif 677 } 678 679 void 680 _fmd_fini(fmd_hdl_t *hdl) 681 { 682 #ifdef sun4v 683 ldom_fini(cma_lhp); 684 cma_cpu_fini(hdl); 685 #endif 686 cma_page_fini(hdl); 687 } 688