1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <cma.h> 30 31 #include <strings.h> 32 #include <errno.h> 33 #include <time.h> 34 #include <fm/fmd_api.h> 35 #include <sys/fm/protocol.h> 36 37 cma_t cma; 38 39 cma_stats_t cma_stats = { 40 { "cpu_flts", FMD_TYPE_UINT64, "cpu faults resolved" }, 41 { "cpu_fails", FMD_TYPE_UINT64, "cpu faults unresolveable" }, 42 { "cpu_blfails", FMD_TYPE_UINT64, "failed cpu blacklists" }, 43 { "cpu_supp", FMD_TYPE_UINT64, "cpu offlines suppressed" }, 44 { "cpu_blsupp", FMD_TYPE_UINT64, "cpu blacklists suppressed" }, 45 { "page_flts", FMD_TYPE_UINT64, "page faults resolved" }, 46 { "page_fails", FMD_TYPE_UINT64, "page faults unresolveable" }, 47 { "page_supp", FMD_TYPE_UINT64, "page retires suppressed" }, 48 { "page_nonent", FMD_TYPE_UINT64, "retires for non-existent fmris" }, 49 { "bad_flts", FMD_TYPE_UINT64, "invalid fault events received" }, 50 { "nop_flts", FMD_TYPE_UINT64, "inapplicable fault events received" }, 51 { "auto_flts", FMD_TYPE_UINT64, "auto-close faults received" } 52 }; 53 54 typedef struct cma_subscriber { 55 const char *subr_class; 56 const char *subr_sname; 57 uint_t subr_svers; 58 void (*subr_func)(fmd_hdl_t *, nvlist_t *, nvlist_t *, const char *); 59 } cma_subscriber_t; 60 61 static const cma_subscriber_t cma_subrs[] = { 62 { "fault.memory.page", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 63 cma_page_retire }, 64 { "fault.memory.dimm", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 65 NULL }, 66 { "fault.memory.bank", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 67 NULL }, 68 { "fault.memory.datapath", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 69 NULL }, 70 71 /* 72 * The following ultraSPARC-T1 faults do NOT retire a cpu thread, 73 * and therefore must be intercepted before 74 * the default "fault.cpu.*" dispatch to cma_cpu_retire. 75 */ 76 { "fault.cpu.ultraSPARC-T1.freg", FM_FMRI_SCHEME_CPU, 77 FM_CPU_SCHEME_VERSION, NULL }, 78 { "fault.cpu.ultraSPARC-T1.l2cachedata", FM_FMRI_SCHEME_CPU, 79 FM_CPU_SCHEME_VERSION, NULL }, 80 { "fault.cpu.ultraSPARC-T1.l2cachetag", FM_FMRI_SCHEME_CPU, 81 FM_CPU_SCHEME_VERSION, NULL }, 82 { "fault.cpu.ultraSPARC-T1.l2cachectl", FM_FMRI_SCHEME_CPU, 83 FM_CPU_SCHEME_VERSION, NULL }, 84 { "fault.cpu.ultraSPARC-T1.mau", FM_FMRI_SCHEME_CPU, 85 FM_CPU_SCHEME_VERSION, NULL }, 86 { "fault.cpu.*", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 87 cma_cpu_retire }, 88 { NULL, NULL, 0, NULL } 89 }; 90 91 static const cma_subscriber_t * 92 nvl2subr(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t **asrup) 93 { 94 const cma_subscriber_t *sp; 95 nvlist_t *asru; 96 char *scheme; 97 uint8_t version; 98 99 if (nvlist_lookup_nvlist(nvl, FM_FAULT_ASRU, &asru) != 0 || 100 nvlist_lookup_string(asru, FM_FMRI_SCHEME, &scheme) != 0 || 101 nvlist_lookup_uint8(asru, FM_VERSION, &version) != 0) { 102 cma_stats.bad_flts.fmds_value.ui64++; 103 return (NULL); 104 } 105 106 for (sp = cma_subrs; sp->subr_class != NULL; sp++) { 107 if (fmd_nvl_class_match(hdl, nvl, sp->subr_class) && 108 strcmp(scheme, sp->subr_sname) == 0 && 109 version <= sp->subr_svers) { 110 *asrup = asru; 111 return (sp); 112 } 113 } 114 115 cma_stats.nop_flts.fmds_value.ui64++; 116 return (NULL); 117 } 118 119 static void 120 cma_recv_list(fmd_hdl_t *hdl, nvlist_t *nvl) 121 { 122 char *uuid = NULL; 123 nvlist_t **nva; 124 uint_t nvc = 0; 125 int err = 0; 126 127 err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid); 128 err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 129 &nva, &nvc); 130 if (err != 0) { 131 cma_stats.bad_flts.fmds_value.ui64++; 132 return; 133 } 134 135 while (nvc-- != 0 && !fmd_case_uuclosed(hdl, uuid)) { 136 nvlist_t *nvl = *nva++; 137 const cma_subscriber_t *subr; 138 nvlist_t *asru; 139 140 if ((subr = nvl2subr(hdl, nvl, &asru)) == NULL) 141 continue; 142 143 if (uuid != NULL) 144 fmd_case_uuconvict(hdl, uuid, nvl); 145 146 if (subr->subr_func != NULL) 147 subr->subr_func(hdl, nvl, asru, uuid); 148 } 149 } 150 151 static void 152 cma_recv_one(fmd_hdl_t *hdl, nvlist_t *nvl) 153 { 154 const cma_subscriber_t *subr; 155 nvlist_t *asru; 156 157 if ((subr = nvl2subr(hdl, nvl, &asru)) == NULL) 158 return; 159 160 if (subr->subr_func != NULL) 161 subr->subr_func(hdl, nvl, asru, NULL); 162 } 163 164 /*ARGSUSED*/ 165 static void 166 cma_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) 167 { 168 fmd_hdl_debug(hdl, "received %s\n", class); 169 170 if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0) 171 cma_recv_list(hdl, nvl); 172 else 173 cma_recv_one(hdl, nvl); 174 } 175 176 /*ARGSUSED*/ 177 static void 178 cma_timeout(fmd_hdl_t *hdl, id_t id, void *arg) 179 { 180 if (id == cma.cma_page_timerid) 181 cma_page_retry(hdl); 182 } 183 184 static const fmd_hdl_ops_t fmd_ops = { 185 cma_recv, /* fmdo_recv */ 186 cma_timeout, /* fmdo_timeout */ 187 NULL, /* fmdo_close */ 188 NULL, /* fmdo_stats */ 189 NULL, /* fmdo_gc */ 190 }; 191 192 static const fmd_prop_t fmd_props[] = { 193 { "cpu_tries", FMD_TYPE_UINT32, "10" }, 194 { "cpu_delay", FMD_TYPE_TIME, "1sec" }, 195 { "cpu_offline_enable", FMD_TYPE_BOOL, "true" }, 196 { "cpu_forced_offline", FMD_TYPE_BOOL, "true" }, 197 { "cpu_blacklist_enable", FMD_TYPE_BOOL, "true" }, 198 { "page_ret_mindelay", FMD_TYPE_TIME, "1sec" }, 199 { "page_ret_maxdelay", FMD_TYPE_TIME, "5min" }, 200 { "page_retire_enable", FMD_TYPE_BOOL, "true" }, 201 { NULL, 0, NULL } 202 }; 203 204 static const fmd_hdl_info_t fmd_info = { 205 "CPU/Memory Retire Agent", CMA_VERSION, &fmd_ops, fmd_props 206 }; 207 208 void 209 _fmd_init(fmd_hdl_t *hdl) 210 { 211 hrtime_t nsec; 212 213 if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) 214 return; /* invalid data in configuration file */ 215 216 fmd_hdl_subscribe(hdl, "fault.cpu.*"); 217 fmd_hdl_subscribe(hdl, "fault.memory.*"); 218 219 (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (cma_stats) / 220 sizeof (fmd_stat_t), (fmd_stat_t *)&cma_stats); 221 222 cma.cma_cpu_tries = fmd_prop_get_int32(hdl, "cpu_tries"); 223 224 nsec = fmd_prop_get_int64(hdl, "cpu_delay"); 225 cma.cma_cpu_delay.tv_sec = nsec / NANOSEC; 226 cma.cma_cpu_delay.tv_nsec = nsec % NANOSEC; 227 228 cma.cma_page_mindelay = fmd_prop_get_int64(hdl, "page_ret_mindelay"); 229 cma.cma_page_maxdelay = fmd_prop_get_int64(hdl, "page_ret_maxdelay"); 230 231 cma.cma_cpu_dooffline = fmd_prop_get_int32(hdl, "cpu_offline_enable"); 232 cma.cma_cpu_forcedoffline = fmd_prop_get_int32(hdl, 233 "cpu_forced_offline"); 234 cma.cma_cpu_doblacklist = fmd_prop_get_int32(hdl, 235 "cpu_blacklist_enable"); 236 cma.cma_page_doretire = fmd_prop_get_int32(hdl, "page_retire_enable"); 237 238 if (cma.cma_page_maxdelay < cma.cma_page_mindelay) 239 fmd_hdl_abort(hdl, "page retirement delays conflict\n"); 240 } 241 242 void 243 _fmd_fini(fmd_hdl_t *hdl) 244 { 245 cma_page_fini(hdl); 246 } 247