1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 * 25 * Copyright 2019 Joyent, Inc. 26 */ 27 28 #include <cma.h> 29 30 #include <fcntl.h> 31 #include <unistd.h> 32 #include <strings.h> 33 #include <errno.h> 34 #include <time.h> 35 #include <fm/fmd_api.h> 36 #include <fm/fmd_agent.h> 37 #include <sys/fm/protocol.h> 38 #include <sys/bl.h> 39 #include <sys/processor.h> 40 41 static int cpu_statechange(fmd_hdl_t *, nvlist_t *, nvlist_t *, const char *, 42 uint32_t, boolean_t); 43 44 #ifndef opl 45 /* 46 * Perform retire/unretire by invoking the topo methods registered in the 47 * hc-scheme resource. 48 * 49 * If the fault is found to be diagnosed under the old topology, the resource 50 * will not exist in the current topology, then we fall back to legacy retire 51 * (using the "cpu" scheme ASRU). 52 */ 53 54 static boolean_t 55 old_topo_fault(nvlist_t *nvl) 56 { 57 nvlist_t *rsrc; 58 #ifdef i386 59 nvlist_t **hcl; 60 uint_t nhcl = 0; 61 char *name; 62 #endif 63 64 if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) != 0) 65 return (B_TRUE); 66 #ifdef i386 67 /* 68 * x86 has moved from "motherboard/chip/cpu" topo to 69 * "motherboard/chip/core/strand" 70 */ 71 if (nvlist_lookup_nvlist_array(rsrc, FM_FMRI_HC_LIST, &hcl, &nhcl) 72 == 0 && nhcl == 3 && 73 nvlist_lookup_string(hcl[0], FM_FMRI_HC_NAME, &name) == 0 && 74 strcmp(name, "motherboard") == 0 && 75 nvlist_lookup_string(hcl[1], FM_FMRI_HC_NAME, &name) == 0 && 76 strcmp(name, "chip") == 0 && 77 nvlist_lookup_string(hcl[2], FM_FMRI_HC_NAME, &name) == 0 && 78 strcmp(name, "cpu") == 0) 79 return (B_TRUE); 80 #endif 81 82 return (B_FALSE); 83 } 84 85 /* ARGSUSED */ 86 int 87 cma_cpu_hc_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, 88 const char *uuid, boolean_t repair) 89 { 90 int i, err; 91 int rc = CMA_RA_SUCCESS; 92 nvlist_t *rsrc; 93 94 /* 95 * For the cached faults which were diagnosed under the old 96 * topology, we fall back to retire by using cpu-scheme ASRUs. 97 * Under xVM Dom0, since logic cpuid in "cpu" scheme ASRU makes no 98 * sense, the fault should be ignored. 99 */ 100 if (old_topo_fault(nvl)) { 101 #ifdef i386 102 if (! cma_is_native) 103 return (CMA_RA_FAILURE); 104 #endif 105 return (cma_cpu_cpu_retire(hdl, nvl, asru, uuid, repair)); 106 } 107 108 /* 109 * Lookup the resource and call its topo methods to do retire/unretire 110 */ 111 if ((! repair && ! cma.cma_cpu_dooffline) || 112 (repair && ! cma.cma_cpu_doonline)) { 113 fmd_hdl_debug(hdl, "suppressed %s of CPU\n", 114 repair ? "unretire" : "retire"); 115 cma_stats.cpu_supp.fmds_value.ui64++; 116 } else { 117 err = FMD_AGENT_RETIRE_FAIL; 118 if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0) { 119 if (repair) { 120 err = fmd_nvl_fmri_unretire(hdl, rsrc); 121 } else { 122 for (i = 0; i < cma.cma_cpu_tries; i++) { 123 err = fmd_nvl_fmri_retire(hdl, rsrc); 124 if (err == FMD_AGENT_RETIRE_DONE) 125 break; 126 (void) nanosleep(&cma.cma_cpu_delay, 127 NULL); 128 } 129 } 130 } 131 if (err == FMD_AGENT_RETIRE_DONE) { 132 if (repair) 133 cma_stats.cpu_repairs.fmds_value.ui64++; 134 else 135 cma_stats.cpu_flts.fmds_value.ui64++; 136 } else { 137 rc = CMA_RA_FAILURE; 138 cma_stats.bad_flts.fmds_value.ui64++; 139 #ifdef sun4v 140 /* libldom requests are processed asynchronously */ 141 cma_cpu_start_retry(hdl, nvl, uuid, repair); 142 #endif 143 } 144 } 145 146 if ((! repair && ! cma.cma_cpu_doblacklist) || 147 (repair && ! cma.cma_cpu_dounblacklist)) { 148 fmd_hdl_debug(hdl, "suppressed %s of CPU\n", 149 repair ? "unblacklist" : "blacklist"); 150 cma_stats.cpu_blsupp.fmds_value.ui64++; 151 } else { 152 if (cma_cpu_blacklist(hdl, nvl, asru, repair) < 0) 153 cma_stats.cpu_blfails.fmds_value.ui64++; 154 } 155 156 return (rc); 157 } 158 159 #else /* opl */ 160 161 /* ARGSUSED 4 */ 162 int 163 cma_cpu_hc_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, 164 const char *uuid, boolean_t repair) 165 { 166 uint_t cpuid; 167 uint_t i, nprs; 168 nvlist_t **hc_prs = NULL, *hc_spec_nvl; 169 170 /* OPL has ASRU in "hc" scheme */ 171 if (nvlist_lookup_nvlist(asru, FM_FMRI_HC_SPECIFIC, 172 &hc_spec_nvl) != 0) { 173 cma_stats.bad_flts.fmds_value.ui64++; 174 fmd_hdl_debug(hdl, 175 "cma_cpu_hc_retire lookup hc_spec_nvl failed\n"); 176 return (CMA_RA_FAILURE); 177 } 178 179 if (nvlist_lookup_nvlist_array(hc_spec_nvl, FM_FMRI_HC_CPUIDS, 180 &hc_prs, &nprs) != 0) { 181 cma_stats.bad_flts.fmds_value.ui64++; 182 fmd_hdl_debug(hdl, 183 "cma_cpu_hc_retire lookup cpuid array failed\n"); 184 return (CMA_RA_FAILURE); 185 } 186 187 for (i = 0; i < nprs; i++) { 188 if (nvlist_lookup_uint32(hc_prs[i], 189 FM_FMRI_CPU_ID, &cpuid) != 0) { 190 cma_stats.bad_flts.fmds_value.ui64++; 191 return (CMA_RA_FAILURE); 192 } 193 194 if (cpu_statechange(hdl, nvl, hc_prs[i], uuid, cpuid, repair) 195 != CMA_RA_SUCCESS) { 196 cma_stats.bad_flts.fmds_value.ui64++; 197 return (CMA_RA_FAILURE); 198 } 199 } 200 201 return (CMA_RA_SUCCESS); 202 } 203 #endif /* opl */ 204 205 /* 206 * The rest of this file uses ASRUs to do retire, this is now not the 207 * preferable way, but it's still needed for some circumstances when 208 * retire via topo methods can't work, ie. 209 * 210 * 1) There are legacy platforms which don't have full topology. 211 * 2) The resources in the FMD cached faults may not be set or exist in the 212 * up-to-dated topology. 213 */ 214 215 /* ARGSUSED */ 216 static int 217 cpu_online(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid, 218 uint32_t cpuid) 219 { 220 int err = CMA_RA_SUCCESS; 221 222 if (cma.cma_cpu_doonline) { 223 err = cma_cpu_statechange(hdl, asru, uuid, P_ONLINE, 224 B_TRUE); 225 } else { 226 fmd_hdl_debug(hdl, "suppressed online of CPU %u\n", 227 cpuid); 228 cma_stats.cpu_supp.fmds_value.ui64++; 229 } 230 231 /* OPL performs the blacklist in the service processor */ 232 #ifndef opl 233 if (cma.cma_cpu_dounblacklist) { 234 if (cma_cpu_blacklist(hdl, nvl, asru, B_TRUE) < 0) 235 cma_stats.cpu_blfails.fmds_value.ui64++; 236 } else { 237 fmd_hdl_debug(hdl, "suppressed unblacklist of CPU %u\n", cpuid); 238 cma_stats.cpu_blsupp.fmds_value.ui64++; 239 } 240 #endif /* opl */ 241 242 return (err); 243 } 244 245 /* ARGSUSED */ 246 static int 247 cpu_offline(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid, 248 uint32_t cpuid) 249 { 250 int err = CMA_RA_FAILURE; 251 252 if (cma.cma_cpu_dooffline) { 253 int cpustate = P_FAULTED; 254 255 if (cma.cma_cpu_forcedoffline) 256 cpustate |= P_FORCED; 257 err = cma_cpu_statechange(hdl, asru, uuid, cpustate, 258 B_FALSE); 259 } else { 260 fmd_hdl_debug(hdl, "suppressed offline of CPU %u\n", 261 cpuid); 262 cma_stats.cpu_supp.fmds_value.ui64++; 263 } 264 265 /* OPL performs the blacklist in the service processor */ 266 #ifndef opl 267 if (cma.cma_cpu_doblacklist) { 268 if (cma_cpu_blacklist(hdl, nvl, asru, B_FALSE) < 0) 269 cma_stats.cpu_blfails.fmds_value.ui64++; 270 } else { 271 fmd_hdl_debug(hdl, "suppressed blacklist of CPU %u\n", 272 cpuid); 273 cma_stats.cpu_blsupp.fmds_value.ui64++; 274 } 275 #endif /* opl */ 276 277 return (err); 278 } 279 280 static int 281 cpu_statechange(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid, 282 uint32_t cpuid, boolean_t repair) 283 { 284 if (repair) 285 return (cpu_online(hdl, nvl, asru, uuid, cpuid)); 286 else 287 return (cpu_offline(hdl, nvl, asru, uuid, cpuid)); 288 } 289 290 const char * 291 p_online_state_fmt(int state) 292 { 293 state &= ~P_FORCED; 294 switch (state) { 295 case P_OFFLINE: 296 return (PS_OFFLINE); 297 case P_ONLINE: 298 return (PS_ONLINE); 299 case P_FAULTED: 300 return (PS_FAULTED); 301 case P_POWEROFF: 302 return (PS_POWEROFF); 303 case P_NOINTR: 304 return (PS_NOINTR); 305 case P_SPARE: 306 return (PS_SPARE); 307 case P_DISABLED: 308 return (PS_DISABLED); 309 default: 310 return ("unknown"); 311 } 312 } 313 314 int 315 cma_cpu_cpu_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, 316 const char *uuid, boolean_t repair) 317 { 318 uint_t cpuid; 319 320 if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) { 321 fmd_hdl_debug(hdl, "cpu fault missing '%s'\n", FM_FMRI_CPU_ID); 322 cma_stats.bad_flts.fmds_value.ui64++; 323 return (CMA_RA_FAILURE); 324 } 325 326 return (cpu_statechange(hdl, nvl, asru, uuid, cpuid, repair)); 327 } 328