1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <cma.h> 27 28 #include <sys/fm/ldom.h> 29 #include <sys/fm/protocol.h> 30 #include <fm/fmd_fmri.h> 31 #include <fm/libtopo.h> 32 33 #include <assert.h> 34 #include <fcntl.h> 35 #include <unistd.h> 36 #include <errno.h> 37 #include <strings.h> 38 39 #include <sys/types.h> 40 #include <sys/processor.h> 41 42 extern ldom_hdl_t *cma_lhp; 43 44 /*ARGSUSED*/ 45 int 46 cpu_blacklist_cmd(fmd_hdl_t *hdl, nvlist_t *fmri, boolean_t repair) 47 { 48 if (repair) 49 return (ldom_fmri_unblacklist(cma_lhp, fmri)); 50 else 51 return (ldom_fmri_blacklist(cma_lhp, fmri)); 52 } 53 54 int 55 cma_cpu_blacklist(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, 56 boolean_t repair) 57 { 58 nvlist_t *fmri; 59 int rc, err; 60 61 /* 62 * Some platforms have special unums for the E$ DIMMs. If we're dealing 63 * with a platform that has these unums, one will have been added to the 64 * fault as the resource. We'll use that for the blacklisting. If we 65 * can't find a resource, we'll fall back to the ASRU. 66 */ 67 if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &fmri) != 0) 68 fmri = asru; 69 70 rc = cpu_blacklist_cmd(hdl, fmri, repair); 71 err = errno; 72 73 if (rc < 0 && err != ENOTSUP) { 74 errno = err; 75 return (-1); 76 } 77 78 return (0); 79 } 80 81 /*ARGSUSED*/ 82 static int 83 cpu_cmd(fmd_hdl_t *hdl, nvlist_t *fmri, int cmd) 84 { 85 int rc = 0; 86 char *scheme; 87 88 /* 89 * We're using topo retire if the fmri is in "hc" scheme. 90 */ 91 if (nvlist_lookup_string(fmri, FM_FMRI_SCHEME, &scheme) == 0 && 92 strcmp(scheme, FM_FMRI_SCHEME_HC) == 0) { 93 if (cmd != P_STATUS) { 94 errno = EINVAL; 95 return (-1); 96 } 97 rc = fmd_nvl_fmri_service_state(hdl, fmri); 98 switch (rc) { 99 case FMD_SERVICE_STATE_UNUSABLE: 100 return (P_FAULTED); 101 case -1: 102 return (-1); 103 default: 104 return (P_ONLINE); 105 } 106 } 107 108 switch (cmd & ~P_FORCED) { 109 case P_STATUS: 110 rc = ldom_fmri_status(cma_lhp, fmri); 111 break; 112 case P_FAULTED: 113 rc = ldom_fmri_retire(cma_lhp, fmri); 114 break; 115 case P_ONLINE: 116 rc = ldom_fmri_unretire(cma_lhp, fmri); 117 break; 118 default: 119 errno = EINVAL; 120 return (-1); 121 } 122 123 if (rc != P_OFFLINE && rc != P_ONLINE && rc != P_FAULTED) { 124 errno = rc; 125 return (-1); 126 } 127 128 return (rc); 129 } 130 131 void 132 cma_cpu_start_retry(fmd_hdl_t *hdl, nvlist_t *fmri, const char *uuid, 133 boolean_t repair) 134 { 135 cma_cpu_t *cpu; 136 char *scheme; 137 uint_t cpuid; 138 nvlist_t *asru = NULL; 139 topo_hdl_t *thp; 140 int err; 141 142 if (repair || nvlist_lookup_string(fmri, FM_FMRI_SCHEME, &scheme) != 0) 143 return; 144 if (strcmp(scheme, FM_FMRI_SCHEME_CPU) == 0) { 145 if (nvlist_lookup_uint32(fmri, FM_FMRI_CPU_ID, &cpuid) != 0) 146 return; 147 } else if (strcmp(scheme, FM_FMRI_SCHEME_HC) != 0) { 148 return; 149 } else { 150 /* lookup cpuid from ASRU */ 151 thp = fmd_fmri_topo_hold(TOPO_VERSION); 152 if (thp != NULL) { 153 (void) topo_fmri_asru(thp, fmri, &asru, &err); 154 fmd_fmri_topo_rele(thp); 155 } 156 if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) { 157 nvlist_free(asru); 158 return; 159 } 160 } 161 162 /* 163 * check to see if the cpu has been offline. 164 */ 165 fmd_hdl_debug(hdl, "cpu %u is not offline yet - sleeping\n", cpuid); 166 167 /* 168 * Create a cpu node and add to the head of the cpu list 169 */ 170 cpu = fmd_hdl_zalloc(hdl, sizeof (cma_cpu_t), FMD_SLEEP); 171 (void) nvlist_dup(fmri, &cpu->cpu_fmri, 0); 172 if (uuid != NULL) 173 cpu->cpu_uuid = fmd_hdl_strdup(hdl, uuid, FMD_SLEEP); 174 175 cpu->cpuid = cpuid; 176 cpu->cpu_next = cma.cma_cpus; 177 cma.cma_cpus = cpu; 178 179 if (cma.cma_cpu_timerid != 0) 180 fmd_timer_remove(hdl, cma.cma_cpu_timerid); 181 182 cma.cma_cpu_curdelay = cma.cma_cpu_mindelay; 183 184 cma.cma_cpu_timerid = 185 fmd_timer_install(hdl, NULL, NULL, cma.cma_cpu_curdelay); 186 } 187 188 189 int 190 cma_cpu_statechange(fmd_hdl_t *hdl, nvlist_t *asru, const char *uuid, 191 int cpustate, boolean_t repair) 192 { 193 int i; 194 uint_t cpuid; 195 196 if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) { 197 fmd_hdl_debug(hdl, "missing '%s'\n", FM_FMRI_CPU_ID); 198 cma_stats.bad_flts.fmds_value.ui64++; 199 return (CMA_RA_FAILURE); 200 } 201 202 /* 203 * cpu offlining using ldom_fmri_retire() may be asynchronous, so we 204 * have to set the timer and check the cpu status later. 205 */ 206 for (i = 0; i < cma.cma_cpu_tries; 207 i++, (void) nanosleep(&cma.cma_cpu_delay, NULL)) { 208 if (cpu_cmd(hdl, asru, cpustate) != -1) { 209 if (repair) 210 cma_stats.cpu_repairs.fmds_value.ui64++; 211 else 212 cma_stats.cpu_flts.fmds_value.ui64++; 213 break; 214 } 215 } 216 217 if (i >= cma.cma_cpu_tries) { 218 cma_stats.cpu_fails.fmds_value.ui64++; 219 } 220 221 cma_cpu_start_retry(hdl, asru, uuid, repair); 222 223 return (CMA_RA_FAILURE); 224 } 225 226 static int 227 cpu_retry(fmd_hdl_t *hdl, cma_cpu_t *cpu) 228 { 229 int rc = 0; 230 231 fmd_hdl_debug(hdl, "cpu_retry()\n"); 232 233 if (cpu->cpu_fmri == NULL) { 234 return (1); 235 } 236 237 if (!fmd_nvl_fmri_present(hdl, cpu->cpu_fmri)) { 238 fmd_hdl_debug(hdl, "cpu %u is not present", cpu->cpuid); 239 return (1); 240 } 241 242 rc = cpu_cmd(hdl, cpu->cpu_fmri, P_STATUS); 243 if (rc == P_FAULTED || rc == P_OFFLINE) { 244 fmd_hdl_debug(hdl, "cpu %u is offlined on retry %u\n", 245 cpu->cpuid, cpu->cpu_nretries); 246 cma_stats.cpu_flts.fmds_value.ui64++; 247 248 if (cpu->cpu_uuid != NULL) 249 fmd_case_uuclose(hdl, cpu->cpu_uuid); 250 return (1); /* success */ 251 } 252 253 if (rc == -1) { 254 fmd_hdl_debug(hdl, "failed to retry cpu %u\n", cpu->cpuid); 255 cma_stats.page_fails.fmds_value.ui64++; 256 return (1); /* give up */ 257 } 258 259 return (0); 260 } 261 262 static void 263 cma_cpu_free(fmd_hdl_t *hdl, cma_cpu_t *cpu) 264 { 265 nvlist_free(cpu->cpu_fmri); 266 if (cpu->cpu_uuid != NULL) 267 fmd_hdl_strfree(hdl, cpu->cpu_uuid); 268 fmd_hdl_free(hdl, cpu, sizeof (cma_cpu_t)); 269 } 270 271 void 272 cma_cpu_retry(fmd_hdl_t *hdl) 273 { 274 cma_cpu_t **cpup; 275 276 fmd_hdl_debug(hdl, "cma_cpu_retry: timer fired\n"); 277 278 cma.cma_cpu_timerid = 0; 279 280 cpup = &cma.cma_cpus; 281 while (*cpup != NULL) { 282 cma_cpu_t *cpu = *cpup; 283 284 if (cpu_retry(hdl, cpu)) { 285 /* 286 * Successful retry or we're giving up - remove from 287 * the list 288 */ 289 *cpup = cpu->cpu_next; 290 291 cma_cpu_free(hdl, cpu); 292 } else { 293 cpu->cpu_nretries++; 294 cpup = &cpu->cpu_next; 295 } 296 } 297 298 if (cma.cma_cpus == NULL) 299 return; /* no more cpus */ 300 301 /* 302 * We still have cpus to check. Back the delay 303 * off, and schedule a retry. 304 */ 305 cma.cma_cpu_curdelay = MIN(cma.cma_cpu_curdelay * 2, 306 cma.cma_cpu_maxdelay); 307 308 fmd_hdl_debug(hdl, "scheduled cpu offline retry for %llu secs\n", 309 (u_longlong_t)(cma.cma_cpu_curdelay / NANOSEC)); 310 311 cma.cma_cpu_timerid = 312 fmd_timer_install(hdl, NULL, NULL, cma.cma_cpu_curdelay); 313 } 314 315 void 316 cma_cpu_fini(fmd_hdl_t *hdl) 317 { 318 cma_cpu_t *cpu; 319 320 while ((cpu = cma.cma_cpus) != NULL) { 321 cma.cma_cpus = cpu->cpu_next; 322 cma_cpu_free(hdl, cpu); 323 } 324 } 325