1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 23 /* 24 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 25 * Use is subject to license terms. 26 */ 27 28 #pragma ident "%Z%%M% %I% %E% SMI" 29 30 #include <cma.h> 31 32 #include <strings.h> 33 #include <errno.h> 34 #include <time.h> 35 #include <fm/fmd_api.h> 36 #include <sys/fm/protocol.h> 37 38 cma_t cma; 39 40 cma_stats_t cma_stats = { 41 { "cpu_flts", FMD_TYPE_UINT64, "cpu faults resolved" }, 42 { "cpu_fails", FMD_TYPE_UINT64, "cpu faults unresolveable" }, 43 { "cpu_blfails", FMD_TYPE_UINT64, "failed cpu blacklists" }, 44 { "cpu_supp", FMD_TYPE_UINT64, "cpu offlines suppressed" }, 45 { "cpu_blsupp", FMD_TYPE_UINT64, "cpu blacklists suppressed" }, 46 { "page_flts", FMD_TYPE_UINT64, "page faults resolved" }, 47 { "page_fails", FMD_TYPE_UINT64, "page faults unresolveable" }, 48 { "page_supp", FMD_TYPE_UINT64, "page retires suppressed" }, 49 { "page_nonent", FMD_TYPE_UINT64, "retires for non-existent fmris" }, 50 { "page_retmax", FMD_TYPE_UINT64, "hit max retries for page retire" }, 51 { "bad_flts", FMD_TYPE_UINT64, "invalid fault events received" }, 52 { "nop_flts", FMD_TYPE_UINT64, "inapplicable fault events received" }, 53 { "auto_flts", FMD_TYPE_UINT64, "auto-close faults received" } 54 }; 55 56 typedef struct cma_subscriber { 57 const char *subr_class; 58 const char *subr_sname; 59 uint_t subr_svers; 60 void (*subr_func)(fmd_hdl_t *, nvlist_t *, nvlist_t *, const char *); 61 } cma_subscriber_t; 62 63 static const cma_subscriber_t cma_subrs[] = { 64 { "fault.cpu.*", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 65 cma_cpu_retire }, 66 { "fault.memory.page", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 67 cma_page_retire }, 68 { "fault.memory.dimm", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 69 NULL }, 70 { "fault.memory.dimm_sb", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 71 NULL }, 72 { "fault.memory.dimm_ck", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 73 NULL }, 74 { "fault.memory.dimm_ue", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 75 NULL }, 76 { "fault.memory.bank", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 77 NULL }, 78 { "fault.memory.datapath", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 79 NULL }, 80 81 /* 82 * The following ultraSPARC-T1 faults do NOT retire a cpu thread, 83 * and therefore must be intercepted before 84 * the default "fault.cpu.*" dispatch to cma_cpu_retire. 85 */ 86 { "fault.cpu.ultraSPARC-T1.freg", FM_FMRI_SCHEME_CPU, 87 FM_CPU_SCHEME_VERSION, NULL }, 88 { "fault.cpu.ultraSPARC-T1.l2cachedata", FM_FMRI_SCHEME_CPU, 89 FM_CPU_SCHEME_VERSION, NULL }, 90 { "fault.cpu.ultraSPARC-T1.l2cachetag", FM_FMRI_SCHEME_CPU, 91 FM_CPU_SCHEME_VERSION, NULL }, 92 { "fault.cpu.ultraSPARC-T1.l2cachectl", FM_FMRI_SCHEME_CPU, 93 FM_CPU_SCHEME_VERSION, NULL }, 94 { "fault.cpu.ultraSPARC-T1.mau", FM_FMRI_SCHEME_CPU, 95 FM_CPU_SCHEME_VERSION, NULL }, 96 { "fault.cpu.*", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 97 cma_cpu_retire }, 98 { NULL, NULL, 0, NULL } 99 }; 100 101 static const cma_subscriber_t * 102 nvl2subr(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t **asrup) 103 { 104 const cma_subscriber_t *sp; 105 nvlist_t *asru; 106 char *scheme; 107 uint8_t version; 108 109 if (nvlist_lookup_nvlist(nvl, FM_FAULT_ASRU, &asru) != 0 || 110 nvlist_lookup_string(asru, FM_FMRI_SCHEME, &scheme) != 0 || 111 nvlist_lookup_uint8(asru, FM_VERSION, &version) != 0) { 112 cma_stats.bad_flts.fmds_value.ui64++; 113 return (NULL); 114 } 115 116 for (sp = cma_subrs; sp->subr_class != NULL; sp++) { 117 if (fmd_nvl_class_match(hdl, nvl, sp->subr_class) && 118 strcmp(scheme, sp->subr_sname) == 0 && 119 version <= sp->subr_svers) { 120 *asrup = asru; 121 return (sp); 122 } 123 } 124 125 cma_stats.nop_flts.fmds_value.ui64++; 126 return (NULL); 127 } 128 129 static void 130 cma_recv_list(fmd_hdl_t *hdl, nvlist_t *nvl) 131 { 132 char *uuid = NULL; 133 nvlist_t **nva; 134 uint_t nvc = 0; 135 int err = 0; 136 137 err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid); 138 err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 139 &nva, &nvc); 140 if (err != 0) { 141 cma_stats.bad_flts.fmds_value.ui64++; 142 return; 143 } 144 145 while (nvc-- != 0 && !fmd_case_uuclosed(hdl, uuid)) { 146 nvlist_t *nvl = *nva++; 147 const cma_subscriber_t *subr; 148 nvlist_t *asru; 149 150 if ((subr = nvl2subr(hdl, nvl, &asru)) == NULL) 151 continue; 152 153 if (subr->subr_func != NULL) 154 subr->subr_func(hdl, nvl, asru, uuid); 155 } 156 } 157 158 static void 159 cma_recv_one(fmd_hdl_t *hdl, nvlist_t *nvl) 160 { 161 const cma_subscriber_t *subr; 162 nvlist_t *asru; 163 164 if ((subr = nvl2subr(hdl, nvl, &asru)) == NULL) 165 return; 166 167 if (subr->subr_func != NULL) 168 subr->subr_func(hdl, nvl, asru, NULL); 169 } 170 171 /*ARGSUSED*/ 172 static void 173 cma_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) 174 { 175 fmd_hdl_debug(hdl, "received %s\n", class); 176 177 if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0) 178 cma_recv_list(hdl, nvl); 179 else 180 cma_recv_one(hdl, nvl); 181 } 182 183 /*ARGSUSED*/ 184 static void 185 cma_timeout(fmd_hdl_t *hdl, id_t id, void *arg) 186 { 187 if (id == cma.cma_page_timerid) 188 cma_page_retry(hdl); 189 } 190 191 static const fmd_hdl_ops_t fmd_ops = { 192 cma_recv, /* fmdo_recv */ 193 cma_timeout, /* fmdo_timeout */ 194 NULL, /* fmdo_close */ 195 NULL, /* fmdo_stats */ 196 NULL, /* fmdo_gc */ 197 }; 198 199 static const fmd_prop_t fmd_props[] = { 200 { "cpu_tries", FMD_TYPE_UINT32, "10" }, 201 { "cpu_delay", FMD_TYPE_TIME, "1sec" }, 202 { "cpu_offline_enable", FMD_TYPE_BOOL, "true" }, 203 { "cpu_forced_offline", FMD_TYPE_BOOL, "true" }, 204 { "cpu_blacklist_enable", FMD_TYPE_BOOL, "true" }, 205 { "page_ret_mindelay", FMD_TYPE_TIME, "1sec" }, 206 { "page_ret_maxdelay", FMD_TYPE_TIME, "5min" }, 207 { "page_retire_enable", FMD_TYPE_BOOL, "true" }, 208 #ifdef i386 209 /* 210 * On i386, leaving cases open while we retry the 211 * retire can cause the eft module to use large amounts 212 * of memory. Until eft is fixed, we set a maximum number 213 * of retries on page retires, after which the case will 214 * be closed. 215 */ 216 { "page_retire_maxretries", FMD_TYPE_UINT32, "8" }, 217 #else 218 { "page_retire_maxretries", FMD_TYPE_UINT32, "0" }, 219 #endif /* i386 */ 220 { NULL, 0, NULL } 221 }; 222 223 static const fmd_hdl_info_t fmd_info = { 224 "CPU/Memory Retire Agent", CMA_VERSION, &fmd_ops, fmd_props 225 }; 226 227 void 228 _fmd_init(fmd_hdl_t *hdl) 229 { 230 hrtime_t nsec; 231 232 if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) 233 return; /* invalid data in configuration file */ 234 235 fmd_hdl_subscribe(hdl, "fault.cpu.*"); 236 fmd_hdl_subscribe(hdl, "fault.memory.*"); 237 238 (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (cma_stats) / 239 sizeof (fmd_stat_t), (fmd_stat_t *)&cma_stats); 240 241 cma.cma_cpu_tries = fmd_prop_get_int32(hdl, "cpu_tries"); 242 243 nsec = fmd_prop_get_int64(hdl, "cpu_delay"); 244 cma.cma_cpu_delay.tv_sec = nsec / NANOSEC; 245 cma.cma_cpu_delay.tv_nsec = nsec % NANOSEC; 246 247 cma.cma_page_mindelay = fmd_prop_get_int64(hdl, "page_ret_mindelay"); 248 cma.cma_page_maxdelay = fmd_prop_get_int64(hdl, "page_ret_maxdelay"); 249 250 cma.cma_cpu_dooffline = fmd_prop_get_int32(hdl, "cpu_offline_enable"); 251 cma.cma_cpu_forcedoffline = fmd_prop_get_int32(hdl, 252 "cpu_forced_offline"); 253 cma.cma_cpu_doblacklist = fmd_prop_get_int32(hdl, 254 "cpu_blacklist_enable"); 255 cma.cma_page_doretire = fmd_prop_get_int32(hdl, "page_retire_enable"); 256 cma.cma_page_maxretries = 257 fmd_prop_get_int32(hdl, "page_retire_maxretries"); 258 259 if (cma.cma_page_maxdelay < cma.cma_page_mindelay) 260 fmd_hdl_abort(hdl, "page retirement delays conflict\n"); 261 } 262 263 void 264 _fmd_fini(fmd_hdl_t *hdl) 265 { 266 cma_page_fini(hdl); 267 } 268