xref: /illumos-gate/usr/src/cmd/fm/modules/common/io-retire/rio_main.c (revision bb0ade0978a02d3fe0b0165cd4725fdcb593fbfb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/fm/protocol.h>
30 #include <fm/fmd_api.h>
31 #include <strings.h>
32 #include <libdevinfo.h>
33 #include <sys/modctl.h>
34 
/*
 * Nonzero when retires are disabled; loaded from the "global-disable"
 * property in _fmd_init() and consulted by rio_recv().
 */
static int	global_disable;

/*
 * One entry of the fault exception list.  el_fault is the string handed to
 * fmd_nvl_class_match() when a fault is checked against the list; el_next
 * links to the following entry.
 */
struct except_list {
	char			*el_fault;
	struct except_list	*el_next;
};

/* Head of the singly linked exception list; NULL while the list is empty. */
static struct except_list *except_list;
43 
44 static void
45 parse_exception_string(fmd_hdl_t *hdl, char *estr)
46 {
47 	char	*p;
48 	char	*next;
49 	size_t	len;
50 	struct except_list *elem;
51 
52 	len = strlen(estr);
53 
54 	p = estr;
55 	for (;;) {
56 		/* Remove leading ':' */
57 		while (*p == ':')
58 			p++;
59 		if (*p == '\0')
60 			break;
61 
62 		next = strchr(p, ':');
63 
64 		if (next)
65 			*next = '\0';
66 
67 		elem = fmd_hdl_alloc(hdl,
68 		    sizeof (struct except_list), FMD_SLEEP);
69 		elem->el_fault = fmd_hdl_strdup(hdl, p, FMD_SLEEP);
70 		elem->el_next = except_list;
71 		except_list = elem;
72 
73 		if (next) {
74 			*next = ':';
75 			p = next + 1;
76 		} else {
77 			break;
78 		}
79 	}
80 
81 	if (len != strlen(estr)) {
82 		fmd_hdl_abort(hdl, "Error parsing exception list: %s\n", estr);
83 	}
84 }
85 
86 /*
87  * Returns
88  *	1  if fault on exception list
89  *	0  otherwise
90  */
91 static int
92 fault_exception(fmd_hdl_t *hdl, nvlist_t *fault)
93 {
94 	struct except_list *elem;
95 
96 	for (elem = except_list; elem; elem = elem->el_next) {
97 		if (fmd_nvl_class_match(hdl, fault, elem->el_fault)) {
98 			fmd_hdl_debug(hdl, "rio_recv: Skipping fault "
99 			    "on exception list (%s)\n", elem->el_fault);
100 			return (1);
101 		}
102 	}
103 
104 	return (0);
105 }
106 
107 static void
108 free_exception_list(fmd_hdl_t *hdl)
109 {
110 	struct except_list *elem;
111 
112 	while (except_list) {
113 		elem = except_list;
114 		except_list = elem->el_next;
115 		fmd_hdl_strfree(hdl, elem->el_fault);
116 		fmd_hdl_free(hdl, elem, sizeof (*elem));
117 	}
118 }
119 
120 
121 /*ARGSUSED*/
122 static void
123 rio_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
124 {
125 	nvlist_t	**faults;
126 	nvlist_t	*asru;
127 	uint_t		nfaults;
128 	int		f;
129 	char		devpath[PATH_MAX];
130 	char		*path;
131 	char		*uuid;
132 	char		*scheme;
133 	di_retire_t	drt = {0};
134 	int		retire;
135 	int		rval;
136 	int		error;
137 	char		*snglfault = FM_FAULT_CLASS"."FM_ERROR_IO".";
138 	boolean_t	rtr;
139 
140 
141 	/*
142 	 * If disabled, we don't do retire. We still do unretires though
143 	 */
144 	if (global_disable && strcmp(class, FM_LIST_SUSPECT_CLASS) == 0) {
145 		fmd_hdl_debug(hdl, "rio_recv: retire disabled\n");
146 		return;
147 	}
148 
149 	drt.rt_abort = (void (*)(void *, const char *, ...))fmd_hdl_abort;
150 	drt.rt_debug = (void (*)(void *, const char *, ...))fmd_hdl_debug;
151 	drt.rt_hdl = hdl;
152 
153 	if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0) {
154 		retire = 1;
155 	} else if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) {
156 		retire = 0;
157 	} else if (strncmp(class, snglfault, strlen(snglfault)) == 0) {
158 		fmd_hdl_debug(hdl, "rio_recv: single fault: %s\n", class);
159 		return;
160 	} else {
161 		fmd_hdl_debug(hdl, "rio_recv: not list.* class: %s\n", class);
162 		return;
163 	}
164 
165 	faults = NULL;
166 	nfaults = 0;
167 	if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
168 	    &faults, &nfaults) != 0) {
169 		fmd_hdl_debug(hdl, "rio_recv: no fault list");
170 		return;
171 	}
172 
173 	devpath[0] = '\0';
174 	rval = 0;
175 	for (f = 0; f < nfaults; f++) {
176 		if (nvlist_lookup_boolean_value(faults[f], FM_SUSPECT_RETIRE,
177 		    &rtr) == 0 && !rtr) {
178 			fmd_hdl_debug(hdl, "rio_recv: retire suppressed");
179 			continue;
180 		}
181 
182 		if (nvlist_lookup_nvlist(faults[f], FM_FAULT_ASRU,
183 		    &asru) != 0) {
184 			fmd_hdl_debug(hdl, "rio_recv: no asru in fault");
185 			continue;
186 		}
187 
188 		scheme = NULL;
189 		if (nvlist_lookup_string(asru, FM_FMRI_SCHEME, &scheme) != 0 ||
190 		    strcmp(scheme, FM_FMRI_SCHEME_DEV) != 0) {
191 			fmd_hdl_debug(hdl, "rio_recv: not \"dev\" scheme: %s",
192 			    scheme ? scheme : "<NULL>");
193 			continue;
194 		}
195 
196 		if (retire && fault_exception(hdl, faults[f]))
197 			continue;
198 
199 		if (nvlist_lookup_string(asru, FM_FMRI_DEV_PATH,
200 		    &path) != 0 || path[0] == '\0') {
201 			fmd_hdl_debug(hdl, "rio_recv: no dev path in asru");
202 			continue;
203 		}
204 
205 		/*
206 		 * If retire, we retire only if a single ASRU is pinpointed.
207 		 * We don't do automatic retires if a fault event pinpoints
208 		 * more than one ASRU.
209 		 */
210 		if (retire) {
211 			if (devpath[0] != '\0' && strcmp(path, devpath) != 0) {
212 				fmd_hdl_debug(hdl,
213 				    "rio_recv: Skipping: multiple ASRU");
214 				return;
215 			} else if (devpath[0] == '\0') {
216 				(void) strlcpy(devpath, path, sizeof (devpath));
217 			}
218 		} else {
219 			error = di_unretire_device(path, &drt);
220 			if (error != 0) {
221 				fmd_hdl_debug(hdl, "rio_recv: "
222 				    "di_unretire_device failed: error: %d %s",
223 				    error, path);
224 				rval = -1;
225 			}
226 		}
227 	}
228 
229 	if (retire) {
230 		if (devpath[0] == '\0')
231 			return;
232 		error = di_retire_device(devpath, &drt, 0);
233 		if (error != 0) {
234 			fmd_hdl_debug(hdl, "rio_recv: di_retire_device "
235 			    "failed: error: %d %s", error, devpath);
236 			rval = -1;
237 		}
238 	}
239 
240 	/*
241 	 * The fmd framework takes care of moving a case to the repaired
242 	 * state. To move the case to the closed state however, we (the
243 	 * retire agent) need to call fmd_case_uuclose()
244 	 */
245 	if (retire && rval == 0) {
246 		if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0 &&
247 		    !fmd_case_uuclosed(hdl, uuid)) {
248 			fmd_case_uuclose(hdl, uuid);
249 		}
250 	}
251 }
252 
/*
 * Entry point table referenced from fmd_info below.  Only the receive
 * entry point is provided; every other slot is left NULL.
 */
static const fmd_hdl_ops_t fmd_ops = {
	rio_recv,	/* fmdo_recv */
	NULL,		/* fmdo_timeout */
	NULL,		/* fmdo_close */
	NULL,		/* fmdo_stats */
	NULL,		/* fmdo_gc */
};
260 
/*
 * Properties read by _fmd_init().  Each entry presumably gives the property
 * name, its type and its default value (TODO confirm against fmd_prop_t);
 * the table ends with an all-NULL/zero entry.
 */
static const fmd_prop_t rio_props[] = {
	{ "global-disable", FMD_TYPE_BOOL, "false" },
	{ "fault-exceptions", FMD_TYPE_STRING, NULL },
	{ NULL, 0, NULL }
};
266 
/*
 * Module information passed to fmd_hdl_register(): presumably the module
 * description and version strings (TODO confirm against fmd_hdl_info_t),
 * followed by the entry point table and the property table.
 */
static const fmd_hdl_info_t fmd_info = {
	"I/O Retire Agent", "2.0", &fmd_ops, rio_props
};
270 
271 void
272 _fmd_init(fmd_hdl_t *hdl)
273 {
274 	char	*estr;
275 	char	*estrdup;
276 
277 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
278 		fmd_hdl_debug(hdl, "failed to register handle\n");
279 		return;
280 	}
281 
282 	global_disable = fmd_prop_get_int32(hdl, "global-disable");
283 
284 	estrdup = NULL;
285 	if (estr = fmd_prop_get_string(hdl, "fault-exceptions")) {
286 		estrdup = fmd_hdl_strdup(hdl, estr, FMD_SLEEP);
287 		fmd_prop_free_string(hdl, estr);
288 		parse_exception_string(hdl, estrdup);
289 		fmd_hdl_strfree(hdl, estrdup);
290 	}
291 }
292 
/* Module teardown: releases every entry of the exception list. */
void
_fmd_fini(fmd_hdl_t *hdl)
{
	free_exception_list(hdl);
}
298