/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <cma.h>

#include <fcntl.h>
#include <unistd.h>
#include <strings.h>
#include <errno.h>
#include <time.h>
#include <fm/fmd_api.h>
#include <fm/fmd_agent.h>
#include <sys/fm/protocol.h>
#include <sys/bl.h>
#include <sys/processor.h>

static int cpu_statechange(fmd_hdl_t *, nvlist_t *, nvlist_t *, const char *,
    uint32_t, boolean_t);

#ifndef opl
/*
 * Perform the retire/unretire by invoking the topo methods registered in the
 * hc-scheme resource.
 *
 * If the fault was diagnosed under the old topology, the resource will not
 * exist in the current topology; in that case we fall back to the legacy
 * retire path (using the "cpu" scheme ASRU).
 */

static boolean_t
old_topo_fault(nvlist_t *nvl)
{
	nvlist_t *rsrc;
#ifdef i386
	nvlist_t **hcl;
	uint_t nhcl = 0;
	char *name;
#endif

	if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) != 0)
		return (B_TRUE);
#ifdef i386
	/*
	 * x86 has moved from the "motherboard/chip/cpu" topology to
	 * "motherboard/chip/core/strand".
	 */
	if (nvlist_lookup_nvlist_array(rsrc, FM_FMRI_HC_LIST, &hcl, &nhcl)
	    == 0 && nhcl == 3 &&
	    nvlist_lookup_string(hcl[0], FM_FMRI_HC_NAME, &name) == 0 &&
	    strcmp(name, "motherboard") == 0 &&
	    nvlist_lookup_string(hcl[1], FM_FMRI_HC_NAME, &name) == 0 &&
	    strcmp(name, "chip") == 0 &&
	    nvlist_lookup_string(hcl[2], FM_FMRI_HC_NAME, &name) == 0 &&
	    strcmp(name, "cpu") == 0)
		return (B_TRUE);
#endif

	return (B_FALSE);
}

/* ARGSUSED */
int
cma_cpu_hc_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
    const char *uuid, boolean_t repair)
{
	int i, err;
	int rc = CMA_RA_SUCCESS;
	nvlist_t *rsrc;

	/*
	 * For cached faults which were diagnosed under the old topology,
	 * fall back to retiring via the cpu-scheme ASRU.  Under xVM Dom0
	 * the logical cpuid in a "cpu" scheme ASRU is meaningless, so the
	 * fault is ignored there.
	 */
	if (old_topo_fault(nvl)) {
#ifdef i386
		if (! cma_is_native)
			return (CMA_RA_FAILURE);
#endif
		return (cma_cpu_cpu_retire(hdl, nvl, asru, uuid, repair));
	}

	/*
	 * Look up the resource and call its topo methods to do the
	 * retire/unretire.
	 */
	if ((! repair && ! cma.cma_cpu_dooffline) ||
	    (repair && ! cma.cma_cpu_doonline)) {
		fmd_hdl_debug(hdl, "suppressed %s of CPU\n",
		    repair ? "unretire" : "retire");
		cma_stats.cpu_supp.fmds_value.ui64++;
	} else {
		err = FMD_AGENT_RETIRE_FAIL;
		if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0) {
			if (repair) {
				err = fmd_nvl_fmri_unretire(hdl, rsrc);
			} else {
				for (i = 0; i < cma.cma_cpu_tries; i++) {
					err = fmd_nvl_fmri_retire(hdl, rsrc);
					if (err == FMD_AGENT_RETIRE_DONE)
						break;
					(void) nanosleep(&cma.cma_cpu_delay,
					    NULL);
				}
			}
		}
		if (err == FMD_AGENT_RETIRE_DONE) {
			if (repair)
				cma_stats.cpu_repairs.fmds_value.ui64++;
			else
				cma_stats.cpu_flts.fmds_value.ui64++;
		} else {
			rc = CMA_RA_FAILURE;
			cma_stats.bad_flts.fmds_value.ui64++;
#ifdef sun4v
			/* libldom requests are processed asynchronously */
			cma_cpu_start_retry(hdl, nvl, uuid, repair);
#endif
		}
	}

	if ((! repair && ! cma.cma_cpu_doblacklist) ||
	    (repair && ! cma.cma_cpu_dounblacklist)) {
		fmd_hdl_debug(hdl, "suppressed %s of CPU\n",
		    repair ? "unblacklist" : "blacklist");
		cma_stats.cpu_blsupp.fmds_value.ui64++;
	} else {
		if (cma_cpu_blacklist(hdl, nvl, asru, repair) < 0)
			cma_stats.cpu_blfails.fmds_value.ui64++;
	}

	return (rc);
}

#else /* opl */

/* ARGSUSED 4 */
int
cma_cpu_hc_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
    const char *uuid, boolean_t repair)
{
	uint_t cpuid;
	uint_t i, nprs;
	nvlist_t **hc_prs = NULL, *hc_spec_nvl;

	/* OPL has its ASRU in the "hc" scheme */
	if (nvlist_lookup_nvlist(asru, FM_FMRI_HC_SPECIFIC,
	    &hc_spec_nvl) != 0) {
		cma_stats.bad_flts.fmds_value.ui64++;
		fmd_hdl_debug(hdl,
		    "cma_cpu_hc_retire lookup hc_spec_nvl failed\n");
		return (CMA_RA_FAILURE);
	}

	if (nvlist_lookup_nvlist_array(hc_spec_nvl, FM_FMRI_HC_CPUIDS,
	    &hc_prs, &nprs) != 0) {
		cma_stats.bad_flts.fmds_value.ui64++;
		fmd_hdl_debug(hdl,
		    "cma_cpu_hc_retire lookup cpuid array failed\n");
		return (CMA_RA_FAILURE);
	}

	for (i = 0; i < nprs; i++) {
		if (nvlist_lookup_uint32(hc_prs[i],
		    FM_FMRI_CPU_ID, &cpuid) != 0) {
			cma_stats.bad_flts.fmds_value.ui64++;
			return (CMA_RA_FAILURE);
		}

		if (cpu_statechange(hdl, nvl, hc_prs[i], uuid, cpuid, repair)
		    != CMA_RA_SUCCESS) {
			cma_stats.bad_flts.fmds_value.ui64++;
			return (CMA_RA_FAILURE);
		}
	}

	return (CMA_RA_SUCCESS);
}
#endif /* opl */

/*
 * The rest of this file retires CPUs by ASRU.  This is no longer the
 * preferred method, but it is still needed in circumstances where retire
 * via the topo methods cannot work, e.g.:
 *
 * 1) There are legacy platforms which don't have full topology.
 * 2) The resource in an FMD-cached fault may not be set, or may not exist
 *    in the up-to-date topology.
 */

/* ARGSUSED */
static int
cpu_online(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
    uint32_t cpuid)
{
	int err = CMA_RA_SUCCESS;

	if (cma.cma_cpu_doonline) {
		err = cma_cpu_statechange(hdl, asru, uuid, P_ONLINE,
		    B_TRUE);
	} else {
		fmd_hdl_debug(hdl, "suppressed online of CPU %u\n",
		    cpuid);
		cma_stats.cpu_supp.fmds_value.ui64++;
	}

	/* OPL performs the blacklist in the service processor */
#ifndef opl
	if (cma.cma_cpu_dounblacklist) {
		if (cma_cpu_blacklist(hdl, nvl, asru, B_TRUE) < 0)
			cma_stats.cpu_blfails.fmds_value.ui64++;
	} else {
		fmd_hdl_debug(hdl, "suppressed unblacklist of CPU %u\n", cpuid);
		cma_stats.cpu_blsupp.fmds_value.ui64++;
	}
#endif /* opl */

	return (err);
}

/* ARGSUSED */
static int
cpu_offline(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
    uint32_t cpuid)
{
	int err = CMA_RA_FAILURE;

	if (cma.cma_cpu_dooffline) {
		int cpustate = P_FAULTED;

		if (cma.cma_cpu_forcedoffline)
			cpustate |= P_FORCED;
		err = cma_cpu_statechange(hdl, asru, uuid, cpustate,
		    B_FALSE);
	} else {
		fmd_hdl_debug(hdl, "suppressed offline of CPU %u\n",
		    cpuid);
		cma_stats.cpu_supp.fmds_value.ui64++;
	}

	/* OPL performs the blacklist in the service processor */
#ifndef opl
	if (cma.cma_cpu_doblacklist) {
		if (cma_cpu_blacklist(hdl, nvl, asru, B_FALSE) < 0)
			cma_stats.cpu_blfails.fmds_value.ui64++;
	} else {
		fmd_hdl_debug(hdl, "suppressed blacklist of CPU %u\n",
		    cpuid);
		cma_stats.cpu_blsupp.fmds_value.ui64++;
	}
#endif /* opl */

	return (err);
}

static int
cpu_statechange(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
    uint32_t cpuid, boolean_t repair)
{
	if (repair)
		return (cpu_online(hdl, nvl, asru, uuid, cpuid));
	else
		return (cpu_offline(hdl, nvl, asru, uuid, cpuid));
}

const char *
p_online_state_fmt(int state)
{
	state &= ~P_FORCED;
	switch (state) {
	case P_OFFLINE:
		return (PS_OFFLINE);
	case P_ONLINE:
		return (PS_ONLINE);
	case P_FAULTED:
		return (PS_FAULTED);
	case P_POWEROFF:
		return (PS_POWEROFF);
	case P_NOINTR:
		return (PS_NOINTR);
	case P_SPARE:
		return (PS_SPARE);
	default:
		return ("unknown");
	}
}

int
cma_cpu_cpu_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
    const char *uuid, boolean_t repair)
{
	uint_t cpuid;

	if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
		fmd_hdl_debug(hdl, "cpu fault missing '%s'\n", FM_FMRI_CPU_ID);
		cma_stats.bad_flts.fmds_value.ui64++;
		return (CMA_RA_FAILURE);
	}

	return (cpu_statechange(hdl, nvl, asru, uuid, cpuid, repair));
}
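
/*
 * Illustrative sketch (not part of this agent): a minimal cpu-scheme ASRU
 * as consumed by cma_cpu_cpu_retire() above.  The member names are from
 * <sys/fm/protocol.h>; the cpuid value here is hypothetical, and real
 * FMRIs carry additional members (e.g. a version and a serial).
 *
 *	nvlist_t *asru;
 *	(void) nvlist_alloc(&asru, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_add_string(asru, FM_FMRI_SCHEME, FM_FMRI_SCHEME_CPU);
 *	(void) nvlist_add_uint32(asru, FM_FMRI_CPU_ID, 4);
 *
 * With repair == B_FALSE, cpu_statechange() routes such an ASRU to
 * cpu_offline(), which requests P_FAULTED (ORing in P_FORCED when
 * cma_cpu_forcedoffline is set); with repair == B_TRUE it routes to
 * cpu_online(), which requests P_ONLINE.
 */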
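
/*
 * Formatting example (illustrative): p_online_state_fmt() masks off
 * P_FORCED before the switch, so forced and unforced requests format
 * identically:
 *
 *	p_online_state_fmt(P_FAULTED)			returns PS_FAULTED
 *	p_online_state_fmt(P_FAULTED | P_FORCED)	returns PS_FAULTED
 *
 * The P_* states and PS_* strings are defined in <sys/processor.h>.
 */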