xref: /illumos-gate/usr/src/cmd/fm/modules/common/cpumem-retire/cma_cpu.c (revision a07094369b21309434206d9b3601d162693466fc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <cma.h>
30 
31 #include <fcntl.h>
32 #include <unistd.h>
33 #include <strings.h>
34 #include <errno.h>
35 #include <time.h>
36 #include <fm/fmd_api.h>
37 #include <sys/fm/protocol.h>
38 #include <sys/bl.h>
39 #include <sys/processor.h>
40 
41 static int
42 cpu_blacklist(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru)
43 {
44 	bl_req_t blr;
45 	nvlist_t *fmri;
46 	char *fmribuf;
47 	size_t fmrisz;
48 	int fd, rc, err;
49 	char *class;
50 
51 	/*
52 	 * Some platforms have special unums for the E$ DIMMs.  If we're dealing
53 	 * with a platform that has these unums, one will have been added to the
54 	 * fault as the resource.  We'll use that for the blacklisting.  If we
55 	 * can't find a resource, we'll fall back to the ASRU.
56 	 */
57 	if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &fmri) != 0)
58 		fmri = asru;
59 
60 	if ((nvlist_lookup_string(nvl, FM_CLASS, &class) != 0) ||
61 	    (class == NULL) || (*class == '\0')) {
62 		fmd_hdl_debug(hdl, "failed to get the fault class name\n");
63 		errno = EINVAL;
64 		return (-1);
65 	}
66 
67 	if ((fd = open("/dev/bl", O_RDONLY)) < 0)
68 		return (-1); /* errno is set for us */
69 
70 	if ((errno = nvlist_size(fmri, &fmrisz, NV_ENCODE_NATIVE)) != 0 ||
71 	    (fmribuf = fmd_hdl_alloc(hdl, fmrisz, FMD_SLEEP)) == NULL) {
72 		(void) close(fd);
73 		return (-1); /* errno is set for us */
74 	}
75 
76 	if ((errno = nvlist_pack(fmri, &fmribuf, &fmrisz,
77 	    NV_ENCODE_NATIVE, 0)) != 0) {
78 		fmd_hdl_free(hdl, fmribuf, fmrisz);
79 		(void) close(fd);
80 		return (-1); /* errno is set for us */
81 	}
82 
83 	blr.bl_fmri = fmribuf;
84 	blr.bl_fmrisz = fmrisz;
85 	blr.bl_class = class;
86 
87 	rc = ioctl(fd, BLIOC_INSERT, &blr);
88 	err = errno;
89 
90 	fmd_hdl_free(hdl, fmribuf, fmrisz);
91 	(void) close(fd);
92 
93 	if (rc < 0 && err != ENOTSUP) {
94 		errno = err;
95 		return (-1);
96 	}
97 
98 	return (0);
99 }
100 
101 static void
102 cpu_offline(fmd_hdl_t *hdl, const char *uuid, uint_t cpuid, int cpustate)
103 {
104 	int i;
105 
106 	for (i = 0; i < cma.cma_cpu_tries;
107 	    i++, (void) nanosleep(&cma.cma_cpu_delay, NULL)) {
108 		if (p_online(cpuid, cpustate) != -1) {
109 			fmd_hdl_debug(hdl, "offlined cpu %u\n", cpuid);
110 			cma_stats.cpu_flts.fmds_value.ui64++;
111 			if (uuid != NULL)
112 				fmd_case_uuclose(hdl, uuid);
113 			return;
114 		}
115 	}
116 
117 	fmd_hdl_debug(hdl, "failed to offline %u: %s\n", cpuid,
118 	    strerror(errno));
119 	cma_stats.cpu_fails.fmds_value.ui64++;
120 }
121 
122 void
123 cma_cpu_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid)
124 {
125 	uint_t cpuid, cpuvid;
126 
127 	/*
128 	 * This added expansion is needed to cover the situation where a
129 	 * cpu fault from the resource cache is replayed at fmd restart,
130 	 * and the cpu resource has been remapped or replaced.  The stored
131 	 * FMRI is expanded, but may have stale data.
132 	 */
133 	if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
134 		fmd_hdl_debug(hdl, "failed to expand cpu asru\n");
135 		cma_stats.bad_flts.fmds_value.ui64++;
136 		return;
137 	}
138 
139 	if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
140 		fmd_hdl_debug(hdl, "cpu fault missing '%s'\n", FM_FMRI_CPU_ID);
141 		cma_stats.bad_flts.fmds_value.ui64++;
142 		return;
143 	}
144 
145 	/*
146 	 * If this asru's FMRI contains a virtual CPU id, use that value for
147 	 * p_online() call instead of (physical) cpu id.
148 	 */
149 
150 	if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_VID, &cpuvid) == 0)
151 		cpuid = cpuvid;
152 
153 	if (cma.cma_cpu_dooffline) {
154 		int cpustate = P_FAULTED;
155 
156 		if (cma.cma_cpu_forcedoffline)
157 			cpustate |= P_FORCED;
158 
159 		cpu_offline(hdl, uuid, cpuid, cpustate);
160 	} else {
161 		fmd_hdl_debug(hdl, "suppressed offline of CPU %u\n", cpuid);
162 		cma_stats.cpu_supp.fmds_value.ui64++;
163 	}
164 
165 	if (cma.cma_cpu_doblacklist) {
166 		if (cpu_blacklist(hdl, nvl, asru) < 0)
167 			cma_stats.cpu_blfails.fmds_value.ui64++;
168 	} else {
169 		fmd_hdl_debug(hdl, "suppressed blacklist of CPU %u\n", cpuid);
170 		cma_stats.cpu_blsupp.fmds_value.ui64++;
171 	}
172 }
173