/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <cma.h>

#include <fcntl.h>
#include <unistd.h>
#include <strings.h>
#include <errno.h>
#include <time.h>
#include <fm/fmd_api.h>
#include <fm/fmd_agent.h>
#include <sys/fm/protocol.h>
#include <sys/bl.h>
#include <sys/processor.h>

#ifdef i386
/*
 * On x86, retire/unretire are done via the topo methods.
 * To minimize the impact on existing/legacy sparc work, we leave
 * some residual #ifdef ugliness.  The long-term intention is to let
 * that legacy code die a natural death once the sparc diagnosis work
 * can use the topo way of doing things.
 */

/*
 * Check whether the fault's resource is in the old motherboard/chip/cpu
 * topology.
 */
static boolean_t
old_topo_fault(nvlist_t *nvl)
{
        nvlist_t *rsrc, **hcl;
        uint_t nhcl = 0;
        char *name;

        if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0 &&
            nvlist_lookup_nvlist_array(rsrc, FM_FMRI_HC_LIST, &hcl, &nhcl)
            == 0 && nhcl == 3 &&
            nvlist_lookup_string(hcl[0], FM_FMRI_HC_NAME, &name) == 0 &&
            strcmp(name, "motherboard") == 0 &&
            nvlist_lookup_string(hcl[1], FM_FMRI_HC_NAME, &name) == 0 &&
            strcmp(name, "chip") == 0 &&
            nvlist_lookup_string(hcl[2], FM_FMRI_HC_NAME, &name) == 0 &&
            strcmp(name, "cpu") == 0)
                return (B_TRUE);

        return (B_FALSE);
}

/* ARGSUSED */
int
cma_cpu_hc_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
    const char *uuid, boolean_t repair)
{
        int err;
        int rc = CMA_RA_SUCCESS;
        nvlist_t *rsrc;

        /*
         * For cached faults which were diagnosed under the old chip/cpu
         * topology, we call p_online(2) on the "cpu" scheme ASRU when
         * running native.  Under Dom0 the logical cpuid in a "cpu" scheme
         * ASRU is meaningless, so the fault is ignored.
         */
        if (old_topo_fault(nvl)) {
                if (cma_is_native)
                        return (cma_cpu_retire(hdl, nvl, asru, uuid, repair));
                return (CMA_RA_FAILURE);
        }

        /*
         * Look up the resource and call its topo methods to do the
         * retire/unretire.
         */
        if ((! repair && ! cma.cma_cpu_dooffline) ||
            (repair && ! cma.cma_cpu_doonline)) {
                fmd_hdl_debug(hdl, "suppressed %s of CPU\n",
                    repair ? "unretire" : "retire");
                cma_stats.cpu_supp.fmds_value.ui64++;
        } else {
                err = FMD_AGENT_RETIRE_FAIL;
                if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0) {
                        err = repair ? fmd_nvl_fmri_unretire(hdl, rsrc) :
                            fmd_nvl_fmri_retire(hdl, rsrc);
                }
                if (err == FMD_AGENT_RETIRE_DONE) {
                        if (repair)
                                cma_stats.cpu_repairs.fmds_value.ui64++;
                        else
                                cma_stats.cpu_flts.fmds_value.ui64++;
                } else {
                        rc = CMA_RA_FAILURE;
                        cma_stats.bad_flts.fmds_value.ui64++;
                }
        }

        if ((! repair && ! cma.cma_cpu_doblacklist) ||
            (repair && ! cma.cma_cpu_dounblacklist)) {
                fmd_hdl_debug(hdl, "suppressed %s of CPU\n",
                    repair ? "unblacklist" : "blacklist");
                cma_stats.cpu_blsupp.fmds_value.ui64++;
        } else {
                if (cma_cpu_blacklist(hdl, nvl, asru, repair) < 0)
                        cma_stats.cpu_blfails.fmds_value.ui64++;
        }

        return (rc);
}
#endif /* i386 */

/*
 * Bring a CPU back online as part of a repair; except on OPL (where the
 * service processor manages the blacklist), also unblacklist it.
 */
/* ARGSUSED */
static int
cpu_online(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
    uint32_t cpuid)
{
        int err = CMA_RA_SUCCESS;

        if (cma.cma_cpu_doonline) {
                err = cma_cpu_statechange(hdl, asru, uuid, P_ONLINE,
                    B_TRUE);
        } else {
                fmd_hdl_debug(hdl, "suppressed online of CPU %u\n",
                    cpuid);
                cma_stats.cpu_supp.fmds_value.ui64++;
        }

        /* OPL performs the blacklist in the service processor */
#ifndef opl
        if (cma.cma_cpu_dounblacklist) {
                if (cma_cpu_blacklist(hdl, nvl, asru, B_TRUE) < 0)
                        cma_stats.cpu_blfails.fmds_value.ui64++;
        } else {
                fmd_hdl_debug(hdl, "suppressed unblacklist of CPU %u\n",
                    cpuid);
                cma_stats.cpu_blsupp.fmds_value.ui64++;
        }
#endif /* opl */

        return (err);
}

/*
 * Take a faulted CPU offline (forcibly, if so configured); except on OPL
 * (where the service processor manages the blacklist), also blacklist it.
 */
/* ARGSUSED */
static int
cpu_offline(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
    uint32_t cpuid)
{
        int err = CMA_RA_FAILURE;

        if (cma.cma_cpu_dooffline) {
                int cpustate = P_FAULTED;

                if (cma.cma_cpu_forcedoffline)
                        cpustate |= P_FORCED;
                err = cma_cpu_statechange(hdl, asru, uuid, cpustate,
                    B_FALSE);
        } else {
                fmd_hdl_debug(hdl, "suppressed offline of CPU %u\n",
                    cpuid);
                cma_stats.cpu_supp.fmds_value.ui64++;
        }

        /* OPL performs the blacklist in the service processor */
#ifndef opl
        if (cma.cma_cpu_doblacklist) {
                if (cma_cpu_blacklist(hdl, nvl, asru, B_FALSE) < 0)
                        cma_stats.cpu_blfails.fmds_value.ui64++;
        } else {
                fmd_hdl_debug(hdl, "suppressed blacklist of CPU %u\n",
                    cpuid);
                cma_stats.cpu_blsupp.fmds_value.ui64++;
        }
#endif /* opl */

        return (err);
}

/*
 * Dispatch to cpu_online() or cpu_offline() depending on whether the
 * request is a repair (unretire) or a retire.
 */
static int
cpu_statechange(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
    uint32_t cpuid, boolean_t repair)
{
        if (repair)
                return (cpu_online(hdl, nvl, asru, uuid, cpuid));
        else
                return (cpu_offline(hdl, nvl, asru, uuid, cpuid));
}

/*
 * Return the string name of a p_online(2) processor state, ignoring the
 * P_FORCED flag if it is set.
 */
const char *
p_online_state_fmt(int state)
{
        state &= ~P_FORCED;
        switch (state) {
        case P_OFFLINE:
                return (PS_OFFLINE);
        case P_ONLINE:
                return (PS_ONLINE);
        case P_FAULTED:
                return (PS_FAULTED);
        case P_POWEROFF:
                return (PS_POWEROFF);
        case P_NOINTR:
                return (PS_NOINTR);
        case P_SPARE:
                return (PS_SPARE);
        default:
                return ("unknown");
        }
}

/*
 * Retire or unretire the CPU named by the cpuid in a "cpu" scheme ASRU.
 */
int
cma_cpu_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
    boolean_t repair)
{
        uint_t cpuid;

        if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
                fmd_hdl_debug(hdl, "cpu fault missing '%s'\n", FM_FMRI_CPU_ID);
                cma_stats.bad_flts.fmds_value.ui64++;
                return (CMA_RA_FAILURE);
        }

        return (cpu_statechange(hdl, nvl, asru, uuid, cpuid, repair));
}

#ifdef opl
/*
 * On OPL the hc-scheme ASRU carries the affected cpuids in the
 * FM_FMRI_HC_CPUIDS array of its hc-specific member; retire or unretire
 * each one in turn.
 */
/* ARGSUSED 4 */
int
cma_cpu_hc_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
    const char *uuid, boolean_t repair)
{
        uint_t cpuid;
        uint_t i, nprs;
        nvlist_t **hc_prs = NULL, *hc_spec_nvl;

        if (nvlist_lookup_nvlist(asru, FM_FMRI_HC_SPECIFIC,
            &hc_spec_nvl) != 0) {
                cma_stats.bad_flts.fmds_value.ui64++;
                fmd_hdl_debug(hdl,
                    "cma_cpu_hc_retire lookup hc_spec_nvl failed\n");
                return (CMA_RA_FAILURE);
        }

        if (nvlist_lookup_nvlist_array(hc_spec_nvl, FM_FMRI_HC_CPUIDS,
            &hc_prs, &nprs) != 0) {
                cma_stats.bad_flts.fmds_value.ui64++;
                fmd_hdl_debug(hdl,
                    "cma_cpu_hc_retire lookup cpuid array failed\n");
                return (CMA_RA_FAILURE);
        }

        for (i = 0; i < nprs; i++) {
                if (nvlist_lookup_uint32(hc_prs[i],
                    FM_FMRI_CPU_ID, &cpuid) != 0) {
                        cma_stats.bad_flts.fmds_value.ui64++;
                        return (CMA_RA_FAILURE);
                }

                if (cpu_statechange(hdl, nvl, hc_prs[i], uuid, cpuid, repair)
                    != CMA_RA_SUCCESS) {
                        cma_stats.bad_flts.fmds_value.ui64++;
                        return (CMA_RA_FAILURE);
                }
        }

        return (CMA_RA_SUCCESS);
}
#endif /* opl */