xref: /illumos-gate/usr/src/cmd/fm/modules/common/cpumem-retire/cma_main.c (revision a07094369b21309434206d9b3601d162693466fc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
25  * Use is subject to license terms.
26  */
27 
28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
29 
30 #include <cma.h>
31 
32 #include <strings.h>
33 #include <errno.h>
34 #include <time.h>
35 #include <fm/fmd_api.h>
36 #include <sys/fm/protocol.h>
37 
38 cma_t cma;
39 
40 cma_stats_t cma_stats = {
41 	{ "cpu_flts", FMD_TYPE_UINT64, "cpu faults resolved" },
42 	{ "cpu_fails", FMD_TYPE_UINT64, "cpu faults unresolveable" },
43 	{ "cpu_blfails", FMD_TYPE_UINT64, "failed cpu blacklists" },
44 	{ "cpu_supp", FMD_TYPE_UINT64, "cpu offlines suppressed" },
45 	{ "cpu_blsupp", FMD_TYPE_UINT64, "cpu blacklists suppressed" },
46 	{ "page_flts", FMD_TYPE_UINT64, "page faults resolved" },
47 	{ "page_fails", FMD_TYPE_UINT64, "page faults unresolveable" },
48 	{ "page_supp", FMD_TYPE_UINT64, "page retires suppressed" },
49 	{ "page_nonent", FMD_TYPE_UINT64, "retires for non-existent fmris" },
50 	{ "page_retmax", FMD_TYPE_UINT64, "hit max retries for page retire" },
51 	{ "bad_flts", FMD_TYPE_UINT64, "invalid fault events received" },
52 	{ "nop_flts", FMD_TYPE_UINT64, "inapplicable fault events received" },
53 	{ "auto_flts", FMD_TYPE_UINT64, "auto-close faults received" }
54 };
55 
56 typedef struct cma_subscriber {
57 	const char *subr_class;
58 	const char *subr_sname;
59 	uint_t subr_svers;
60 	void (*subr_func)(fmd_hdl_t *, nvlist_t *, nvlist_t *, const char *);
61 } cma_subscriber_t;
62 
63 static const cma_subscriber_t cma_subrs[] = {
64 	{ "fault.cpu.*", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION,
65 	    cma_cpu_retire },
66 	{ "fault.memory.page", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION,
67 	    cma_page_retire },
68 	{ "fault.memory.dimm", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION,
69 	    NULL },
70 	{ "fault.memory.dimm_sb", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION,
71 	    NULL },
72 	{ "fault.memory.dimm_ck", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION,
73 	    NULL },
74 	{ "fault.memory.dimm_ue", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION,
75 	    NULL },
76 	{ "fault.memory.bank", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION,
77 	    NULL },
78 	{ "fault.memory.datapath", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION,
79 	    NULL },
80 
81 	/*
82 	 * The following ultraSPARC-T1 faults do NOT retire a cpu thread,
83 	 * and therefore must be intercepted before
84 	 * the default "fault.cpu.*" dispatch to cma_cpu_retire.
85 	 */
86 	{ "fault.cpu.ultraSPARC-T1.freg", FM_FMRI_SCHEME_CPU,
87 	    FM_CPU_SCHEME_VERSION, NULL },
88 	{ "fault.cpu.ultraSPARC-T1.l2cachedata", FM_FMRI_SCHEME_CPU,
89 	    FM_CPU_SCHEME_VERSION, NULL },
90 	{ "fault.cpu.ultraSPARC-T1.l2cachetag", FM_FMRI_SCHEME_CPU,
91 	    FM_CPU_SCHEME_VERSION, NULL },
92 	{ "fault.cpu.ultraSPARC-T1.l2cachectl", FM_FMRI_SCHEME_CPU,
93 	    FM_CPU_SCHEME_VERSION, NULL },
94 	{ "fault.cpu.ultraSPARC-T1.mau", FM_FMRI_SCHEME_CPU,
95 	    FM_CPU_SCHEME_VERSION, NULL },
96 	{ "fault.cpu.*", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION,
97 	    cma_cpu_retire },
98 	{ NULL, NULL, 0, NULL }
99 };
100 
101 static const cma_subscriber_t *
102 nvl2subr(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t **asrup)
103 {
104 	const cma_subscriber_t *sp;
105 	nvlist_t *asru;
106 	char *scheme;
107 	uint8_t version;
108 
109 	if (nvlist_lookup_nvlist(nvl, FM_FAULT_ASRU, &asru) != 0 ||
110 	    nvlist_lookup_string(asru, FM_FMRI_SCHEME, &scheme) != 0 ||
111 	    nvlist_lookup_uint8(asru, FM_VERSION, &version) != 0) {
112 		cma_stats.bad_flts.fmds_value.ui64++;
113 		return (NULL);
114 	}
115 
116 	for (sp = cma_subrs; sp->subr_class != NULL; sp++) {
117 		if (fmd_nvl_class_match(hdl, nvl, sp->subr_class) &&
118 		    strcmp(scheme, sp->subr_sname) == 0 &&
119 		    version <= sp->subr_svers) {
120 			*asrup = asru;
121 			return (sp);
122 		}
123 	}
124 
125 	cma_stats.nop_flts.fmds_value.ui64++;
126 	return (NULL);
127 }
128 
129 static void
130 cma_recv_list(fmd_hdl_t *hdl, nvlist_t *nvl)
131 {
132 	char *uuid = NULL;
133 	nvlist_t **nva;
134 	uint_t nvc = 0;
135 	int err = 0;
136 
137 	err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
138 	err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
139 	    &nva, &nvc);
140 	if (err != 0) {
141 		cma_stats.bad_flts.fmds_value.ui64++;
142 		return;
143 	}
144 
145 	while (nvc-- != 0 && !fmd_case_uuclosed(hdl, uuid)) {
146 		nvlist_t *nvl = *nva++;
147 		const cma_subscriber_t *subr;
148 		nvlist_t *asru;
149 
150 		if ((subr = nvl2subr(hdl, nvl, &asru)) == NULL)
151 			continue;
152 
153 		if (subr->subr_func != NULL)
154 			subr->subr_func(hdl, nvl, asru, uuid);
155 	}
156 }
157 
158 static void
159 cma_recv_one(fmd_hdl_t *hdl, nvlist_t *nvl)
160 {
161 	const cma_subscriber_t *subr;
162 	nvlist_t *asru;
163 
164 	if ((subr = nvl2subr(hdl, nvl, &asru)) == NULL)
165 		return;
166 
167 	if (subr->subr_func != NULL)
168 		subr->subr_func(hdl, nvl, asru, NULL);
169 }
170 
171 /*ARGSUSED*/
172 static void
173 cma_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
174 {
175 	fmd_hdl_debug(hdl, "received %s\n", class);
176 
177 	if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0)
178 		cma_recv_list(hdl, nvl);
179 	else
180 		cma_recv_one(hdl, nvl);
181 }
182 
183 /*ARGSUSED*/
184 static void
185 cma_timeout(fmd_hdl_t *hdl, id_t id, void *arg)
186 {
187 	if (id == cma.cma_page_timerid)
188 		cma_page_retry(hdl);
189 }
190 
191 static const fmd_hdl_ops_t fmd_ops = {
192 	cma_recv,	/* fmdo_recv */
193 	cma_timeout,	/* fmdo_timeout */
194 	NULL,		/* fmdo_close */
195 	NULL,		/* fmdo_stats */
196 	NULL,		/* fmdo_gc */
197 };
198 
199 static const fmd_prop_t fmd_props[] = {
200 	{ "cpu_tries", FMD_TYPE_UINT32, "10" },
201 	{ "cpu_delay", FMD_TYPE_TIME, "1sec" },
202 	{ "cpu_offline_enable", FMD_TYPE_BOOL, "true" },
203 	{ "cpu_forced_offline", FMD_TYPE_BOOL, "true" },
204 	{ "cpu_blacklist_enable", FMD_TYPE_BOOL, "true" },
205 	{ "page_ret_mindelay", FMD_TYPE_TIME, "1sec" },
206 	{ "page_ret_maxdelay", FMD_TYPE_TIME, "5min" },
207 	{ "page_retire_enable", FMD_TYPE_BOOL, "true" },
208 #ifdef	i386
209 	/*
210 	 * On i386, leaving cases open while we retry the
211 	 * retire can cause the eft module to use large amounts
212 	 * of memory.  Until eft is fixed, we set a maximum number
213 	 * of retries on page retires, after which the case will
214 	 * be closed.
215 	 */
216 	{ "page_retire_maxretries", FMD_TYPE_UINT32, "8" },
217 #else
218 	{ "page_retire_maxretries", FMD_TYPE_UINT32, "0" },
219 #endif	/* i386 */
220 	{ NULL, 0, NULL }
221 };
222 
223 static const fmd_hdl_info_t fmd_info = {
224 	"CPU/Memory Retire Agent", CMA_VERSION, &fmd_ops, fmd_props
225 };
226 
227 void
228 _fmd_init(fmd_hdl_t *hdl)
229 {
230 	hrtime_t nsec;
231 
232 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0)
233 		return; /* invalid data in configuration file */
234 
235 	fmd_hdl_subscribe(hdl, "fault.cpu.*");
236 	fmd_hdl_subscribe(hdl, "fault.memory.*");
237 
238 	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (cma_stats) /
239 	    sizeof (fmd_stat_t), (fmd_stat_t *)&cma_stats);
240 
241 	cma.cma_cpu_tries = fmd_prop_get_int32(hdl, "cpu_tries");
242 
243 	nsec = fmd_prop_get_int64(hdl, "cpu_delay");
244 	cma.cma_cpu_delay.tv_sec = nsec / NANOSEC;
245 	cma.cma_cpu_delay.tv_nsec = nsec % NANOSEC;
246 
247 	cma.cma_page_mindelay = fmd_prop_get_int64(hdl, "page_ret_mindelay");
248 	cma.cma_page_maxdelay = fmd_prop_get_int64(hdl, "page_ret_maxdelay");
249 
250 	cma.cma_cpu_dooffline = fmd_prop_get_int32(hdl, "cpu_offline_enable");
251 	cma.cma_cpu_forcedoffline = fmd_prop_get_int32(hdl,
252 	    "cpu_forced_offline");
253 	cma.cma_cpu_doblacklist = fmd_prop_get_int32(hdl,
254 	    "cpu_blacklist_enable");
255 	cma.cma_page_doretire = fmd_prop_get_int32(hdl, "page_retire_enable");
256 	cma.cma_page_maxretries =
257 	    fmd_prop_get_int32(hdl, "page_retire_maxretries");
258 
259 	if (cma.cma_page_maxdelay < cma.cma_page_mindelay)
260 		fmd_hdl_abort(hdl, "page retirement delays conflict\n");
261 }
262 
263 void
264 _fmd_fini(fmd_hdl_t *hdl)
265 {
266 	cma_page_fini(hdl);
267 }
268