xref: /illumos-gate/usr/src/cmd/fm/modules/common/cpumem-retire/cma_main.c (revision 7f7322febbcfe774b7270abc3b191c094bfcc517)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
25  * Use is subject to license terms.
26  */
27 
28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
29 
30 #include <cma.h>
31 
32 #include <strings.h>
33 #include <errno.h>
34 #include <time.h>
35 #include <fm/fmd_api.h>
36 #include <sys/fm/protocol.h>
37 
38 cma_t cma;
39 
40 cma_stats_t cma_stats = {
41 	{ "cpu_flts", FMD_TYPE_UINT64, "cpu faults resolved" },
42 	{ "cpu_fails", FMD_TYPE_UINT64, "cpu faults unresolveable" },
43 	{ "cpu_blfails", FMD_TYPE_UINT64, "failed cpu blacklists" },
44 	{ "cpu_supp", FMD_TYPE_UINT64, "cpu offlines suppressed" },
45 	{ "cpu_blsupp", FMD_TYPE_UINT64, "cpu blacklists suppressed" },
46 	{ "page_flts", FMD_TYPE_UINT64, "page faults resolved" },
47 	{ "page_fails", FMD_TYPE_UINT64, "page faults unresolveable" },
48 	{ "page_supp", FMD_TYPE_UINT64, "page retires suppressed" },
49 	{ "page_nonent", FMD_TYPE_UINT64, "retires for non-existent fmris" },
50 	{ "bad_flts", FMD_TYPE_UINT64, "invalid fault events received" },
51 	{ "nop_flts", FMD_TYPE_UINT64, "inapplicable fault events received" },
52 	{ "auto_flts", FMD_TYPE_UINT64, "auto-close faults received" }
53 };
54 
55 typedef struct cma_subscriber {
56 	const char *subr_class;
57 	const char *subr_sname;
58 	uint_t subr_svers;
59 	void (*subr_func)(fmd_hdl_t *, nvlist_t *, nvlist_t *, const char *);
60 } cma_subscriber_t;
61 
62 static const cma_subscriber_t cma_subrs[] = {
63 	{ "fault.memory.page", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION,
64 	    cma_page_retire },
65 	{ "fault.memory.dimm", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION,
66 	    NULL },
67 	{ "fault.memory.bank", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION,
68 	    NULL },
69 	{ "fault.memory.datapath", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION,
70 	    NULL },
71 
72 	/*
73 	 * The following ultraSPARC-T1 faults do NOT retire a cpu thread,
74 	 * and therefore must be intercepted before
75 	 * the default "fault.cpu.*" dispatch to cma_cpu_retire.
76 	 */
77 	{ "fault.cpu.ultraSPARC-T1.freg", FM_FMRI_SCHEME_CPU,
78 	    FM_CPU_SCHEME_VERSION, NULL },
79 	{ "fault.cpu.ultraSPARC-T1.l2cachedata", FM_FMRI_SCHEME_CPU,
80 	    FM_CPU_SCHEME_VERSION, NULL },
81 	{ "fault.cpu.ultraSPARC-T1.l2cachetag", FM_FMRI_SCHEME_CPU,
82 	    FM_CPU_SCHEME_VERSION, NULL },
83 	{ "fault.cpu.ultraSPARC-T1.l2cachectl", FM_FMRI_SCHEME_CPU,
84 	    FM_CPU_SCHEME_VERSION, NULL },
85 	{ "fault.cpu.ultraSPARC-T1.mau", FM_FMRI_SCHEME_CPU,
86 	    FM_CPU_SCHEME_VERSION, NULL },
87 	{ "fault.cpu.*", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION,
88 	    cma_cpu_retire },
89 	{ NULL, NULL, 0, NULL }
90 };
91 
92 static const cma_subscriber_t *
93 nvl2subr(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t **asrup)
94 {
95 	const cma_subscriber_t *sp;
96 	nvlist_t *asru;
97 	char *scheme;
98 	uint8_t version;
99 
100 	if (nvlist_lookup_nvlist(nvl, FM_FAULT_ASRU, &asru) != 0 ||
101 	    nvlist_lookup_string(asru, FM_FMRI_SCHEME, &scheme) != 0 ||
102 	    nvlist_lookup_uint8(asru, FM_VERSION, &version) != 0) {
103 		cma_stats.bad_flts.fmds_value.ui64++;
104 		return (NULL);
105 	}
106 
107 	for (sp = cma_subrs; sp->subr_class != NULL; sp++) {
108 		if (fmd_nvl_class_match(hdl, nvl, sp->subr_class) &&
109 		    strcmp(scheme, sp->subr_sname) == 0 &&
110 		    version <= sp->subr_svers) {
111 			*asrup = asru;
112 			return (sp);
113 		}
114 	}
115 
116 	cma_stats.nop_flts.fmds_value.ui64++;
117 	return (NULL);
118 }
119 
120 static void
121 cma_recv_list(fmd_hdl_t *hdl, nvlist_t *nvl)
122 {
123 	char *uuid = NULL;
124 	nvlist_t **nva;
125 	uint_t nvc = 0;
126 	int err = 0;
127 
128 	err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
129 	err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
130 	    &nva, &nvc);
131 	if (err != 0) {
132 		cma_stats.bad_flts.fmds_value.ui64++;
133 		return;
134 	}
135 
136 	while (nvc-- != 0 && !fmd_case_uuclosed(hdl, uuid)) {
137 		nvlist_t *nvl = *nva++;
138 		const cma_subscriber_t *subr;
139 		nvlist_t *asru;
140 
141 		if ((subr = nvl2subr(hdl, nvl, &asru)) == NULL)
142 			continue;
143 
144 		if (subr->subr_func != NULL)
145 			subr->subr_func(hdl, nvl, asru, uuid);
146 	}
147 }
148 
149 static void
150 cma_recv_one(fmd_hdl_t *hdl, nvlist_t *nvl)
151 {
152 	const cma_subscriber_t *subr;
153 	nvlist_t *asru;
154 
155 	if ((subr = nvl2subr(hdl, nvl, &asru)) == NULL)
156 		return;
157 
158 	if (subr->subr_func != NULL)
159 		subr->subr_func(hdl, nvl, asru, NULL);
160 }
161 
162 /*ARGSUSED*/
163 static void
164 cma_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
165 {
166 	fmd_hdl_debug(hdl, "received %s\n", class);
167 
168 	if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0)
169 		cma_recv_list(hdl, nvl);
170 	else
171 		cma_recv_one(hdl, nvl);
172 }
173 
174 /*ARGSUSED*/
175 static void
176 cma_timeout(fmd_hdl_t *hdl, id_t id, void *arg)
177 {
178 	if (id == cma.cma_page_timerid)
179 		cma_page_retry(hdl);
180 }
181 
182 static const fmd_hdl_ops_t fmd_ops = {
183 	cma_recv,	/* fmdo_recv */
184 	cma_timeout,	/* fmdo_timeout */
185 	NULL,		/* fmdo_close */
186 	NULL,		/* fmdo_stats */
187 	NULL,		/* fmdo_gc */
188 };
189 
190 static const fmd_prop_t fmd_props[] = {
191 	{ "cpu_tries", FMD_TYPE_UINT32, "10" },
192 	{ "cpu_delay", FMD_TYPE_TIME, "1sec" },
193 	{ "cpu_offline_enable", FMD_TYPE_BOOL, "true" },
194 	{ "cpu_forced_offline", FMD_TYPE_BOOL, "true" },
195 	{ "cpu_blacklist_enable", FMD_TYPE_BOOL, "true" },
196 	{ "page_ret_mindelay", FMD_TYPE_TIME, "1sec" },
197 	{ "page_ret_maxdelay", FMD_TYPE_TIME, "5min" },
198 	{ "page_retire_enable", FMD_TYPE_BOOL, "true" },
199 	{ NULL, 0, NULL }
200 };
201 
202 static const fmd_hdl_info_t fmd_info = {
203 	"CPU/Memory Retire Agent", CMA_VERSION, &fmd_ops, fmd_props
204 };
205 
206 void
207 _fmd_init(fmd_hdl_t *hdl)
208 {
209 	hrtime_t nsec;
210 
211 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0)
212 		return; /* invalid data in configuration file */
213 
214 	fmd_hdl_subscribe(hdl, "fault.cpu.*");
215 	fmd_hdl_subscribe(hdl, "fault.memory.*");
216 
217 	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (cma_stats) /
218 	    sizeof (fmd_stat_t), (fmd_stat_t *)&cma_stats);
219 
220 	cma.cma_cpu_tries = fmd_prop_get_int32(hdl, "cpu_tries");
221 
222 	nsec = fmd_prop_get_int64(hdl, "cpu_delay");
223 	cma.cma_cpu_delay.tv_sec = nsec / NANOSEC;
224 	cma.cma_cpu_delay.tv_nsec = nsec % NANOSEC;
225 
226 	cma.cma_page_mindelay = fmd_prop_get_int64(hdl, "page_ret_mindelay");
227 	cma.cma_page_maxdelay = fmd_prop_get_int64(hdl, "page_ret_maxdelay");
228 
229 	cma.cma_cpu_dooffline = fmd_prop_get_int32(hdl, "cpu_offline_enable");
230 	cma.cma_cpu_forcedoffline = fmd_prop_get_int32(hdl,
231 	    "cpu_forced_offline");
232 	cma.cma_cpu_doblacklist = fmd_prop_get_int32(hdl,
233 	    "cpu_blacklist_enable");
234 	cma.cma_page_doretire = fmd_prop_get_int32(hdl, "page_retire_enable");
235 
236 	if (cma.cma_page_maxdelay < cma.cma_page_mindelay)
237 		fmd_hdl_abort(hdl, "page retirement delays conflict\n");
238 }
239 
240 void
241 _fmd_fini(fmd_hdl_t *hdl)
242 {
243 	cma_page_fini(hdl);
244 }
245