xref: /illumos-gate/usr/src/cmd/fm/modules/common/io-retire/rio_main.c (revision 533affcbc7fc4d0c8132976ea454aaa715fe2307)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/fm/protocol.h>
28 #include <fm/fmd_api.h>
29 #include <strings.h>
30 #include <libdevinfo.h>
31 #include <sys/modctl.h>
32 
/* Set from the "global-disable" property in _fmd_init(). */
static int	global_disable;

/*
 * One entry of the fault exception list. Entries are chained through
 * el_next; the last entry has el_next set to NULL.
 */
struct except_list {
	char			*el_fault;	/* class pattern to skip */
	struct except_list	*el_next;	/* next entry in the list */
};

/* Head of the exception list; NULL while the list is empty. */
static struct except_list *except_list;
41 
42 static void
43 parse_exception_string(fmd_hdl_t *hdl, char *estr)
44 {
45 	char	*p;
46 	char	*next;
47 	size_t	len;
48 	struct except_list *elem;
49 
50 	len = strlen(estr);
51 
52 	p = estr;
53 	for (;;) {
54 		/* Remove leading ':' */
55 		while (*p == ':')
56 			p++;
57 		if (*p == '\0')
58 			break;
59 
60 		next = strchr(p, ':');
61 
62 		if (next)
63 			*next = '\0';
64 
65 		elem = fmd_hdl_alloc(hdl,
66 		    sizeof (struct except_list), FMD_SLEEP);
67 		elem->el_fault = fmd_hdl_strdup(hdl, p, FMD_SLEEP);
68 		elem->el_next = except_list;
69 		except_list = elem;
70 
71 		if (next) {
72 			*next = ':';
73 			p = next + 1;
74 		} else {
75 			break;
76 		}
77 	}
78 
79 	if (len != strlen(estr)) {
80 		fmd_hdl_abort(hdl, "Error parsing exception list: %s\n", estr);
81 	}
82 }
83 
84 /*
85  * Returns
86  *	1  if fault on exception list
87  *	0  otherwise
88  */
89 static int
90 fault_exception(fmd_hdl_t *hdl, nvlist_t *fault)
91 {
92 	struct except_list *elem;
93 
94 	for (elem = except_list; elem; elem = elem->el_next) {
95 		if (fmd_nvl_class_match(hdl, fault, elem->el_fault)) {
96 			fmd_hdl_debug(hdl, "rio_recv: Skipping fault "
97 			    "on exception list (%s)\n", elem->el_fault);
98 			return (1);
99 		}
100 	}
101 
102 	return (0);
103 }
104 
105 static void
106 free_exception_list(fmd_hdl_t *hdl)
107 {
108 	struct except_list *elem;
109 
110 	while (except_list) {
111 		elem = except_list;
112 		except_list = elem->el_next;
113 		fmd_hdl_strfree(hdl, elem->el_fault);
114 		fmd_hdl_free(hdl, elem, sizeof (*elem));
115 	}
116 }
117 
118 
119 /*ARGSUSED*/
120 static void
121 rio_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
122 {
123 	nvlist_t	**faults = NULL;
124 	nvlist_t	*asru;
125 	uint_t		nfaults = 0;
126 	int		f;
127 	char		*path;
128 	char		*uuid;
129 	char		*scheme;
130 	di_retire_t	drt = {0};
131 	int		retire;
132 	int		rval = 0;
133 	int		valid_suspect = 0;
134 	int		error;
135 	char		*snglfault = FM_FAULT_CLASS"."FM_ERROR_IO".";
136 	boolean_t	rtr;
137 
138 
139 	/*
140 	 * If disabled, we don't do retire. We still do unretires though
141 	 */
142 	if (global_disable && (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 ||
143 	    strcmp(class, FM_LIST_UPDATED_CLASS) == 0)) {
144 		fmd_hdl_debug(hdl, "rio_recv: retire disabled\n");
145 		return;
146 	}
147 
148 	drt.rt_abort = (void (*)(void *, const char *, ...))fmd_hdl_abort;
149 	drt.rt_debug = (void (*)(void *, const char *, ...))fmd_hdl_debug;
150 	drt.rt_hdl = hdl;
151 
152 	if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0) {
153 		retire = 1;
154 	} else if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) {
155 		retire = 0;
156 	} else if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0) {
157 		retire = 0;
158 	} else if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) {
159 		return;
160 	} else if (strncmp(class, snglfault, strlen(snglfault)) == 0) {
161 		retire = 1;
162 		faults = &nvl;
163 		nfaults = 1;
164 	} else {
165 		fmd_hdl_debug(hdl, "rio_recv: not list.* class: %s\n", class);
166 		return;
167 	}
168 
169 	if (nfaults == 0 && nvlist_lookup_nvlist_array(nvl,
170 	    FM_SUSPECT_FAULT_LIST, &faults, &nfaults) != 0) {
171 		fmd_hdl_debug(hdl, "rio_recv: no fault list");
172 		return;
173 	}
174 
175 	for (f = 0; f < nfaults; f++) {
176 		if (nvlist_lookup_boolean_value(faults[f], FM_SUSPECT_RETIRE,
177 		    &rtr) == 0 && !rtr) {
178 			fmd_hdl_debug(hdl, "rio_recv: retire suppressed");
179 			continue;
180 		}
181 
182 		if (nvlist_lookup_nvlist(faults[f], FM_FAULT_ASRU,
183 		    &asru) != 0) {
184 			fmd_hdl_debug(hdl, "rio_recv: no asru in fault");
185 			continue;
186 		}
187 
188 		scheme = NULL;
189 		if (nvlist_lookup_string(asru, FM_FMRI_SCHEME, &scheme) != 0 ||
190 		    strcmp(scheme, FM_FMRI_SCHEME_DEV) != 0) {
191 			fmd_hdl_debug(hdl, "rio_recv: not \"dev\" scheme: %s",
192 			    scheme ? scheme : "<NULL>");
193 			continue;
194 		}
195 
196 		if (fault_exception(hdl, faults[f]))
197 			continue;
198 
199 		if (nvlist_lookup_string(asru, FM_FMRI_DEV_PATH,
200 		    &path) != 0 || path[0] == '\0') {
201 			fmd_hdl_debug(hdl, "rio_recv: no dev path in asru");
202 			continue;
203 		}
204 
205 		valid_suspect = 1;
206 		if (retire) {
207 			if (fmd_nvl_fmri_has_fault(hdl, asru,
208 			    FMD_HAS_FAULT_ASRU, NULL) == 1) {
209 				error = di_retire_device(path, &drt, 0);
210 				if (error != 0) {
211 					fmd_hdl_debug(hdl, "rio_recv:"
212 					    " di_retire_device failed:"
213 					    " error: %d %s", error, path);
214 					rval = -1;
215 				}
216 			}
217 		} else {
218 			if (fmd_nvl_fmri_has_fault(hdl, asru,
219 			    FMD_HAS_FAULT_ASRU, NULL) == 0) {
220 				error = di_unretire_device(path, &drt);
221 				if (error != 0) {
222 					fmd_hdl_debug(hdl, "rio_recv:"
223 					    " di_unretire_device failed:"
224 					    " error: %d %s", error, path);
225 					rval = -1;
226 				}
227 			}
228 		}
229 	}
230 	/*
231 	 * Run through again to handle new faults in a list.updated.
232 	 */
233 	for (f = 0; f < nfaults; f++) {
234 		if (nvlist_lookup_boolean_value(faults[f], FM_SUSPECT_RETIRE,
235 		    &rtr) == 0 && !rtr) {
236 			fmd_hdl_debug(hdl, "rio_recv: retire suppressed");
237 			continue;
238 		}
239 
240 		if (nvlist_lookup_nvlist(faults[f], FM_FAULT_ASRU,
241 		    &asru) != 0) {
242 			fmd_hdl_debug(hdl, "rio_recv: no asru in fault");
243 			continue;
244 		}
245 
246 		scheme = NULL;
247 		if (nvlist_lookup_string(asru, FM_FMRI_SCHEME, &scheme) != 0 ||
248 		    strcmp(scheme, FM_FMRI_SCHEME_DEV) != 0) {
249 			fmd_hdl_debug(hdl, "rio_recv: not \"dev\" scheme: %s",
250 			    scheme ? scheme : "<NULL>");
251 			continue;
252 		}
253 
254 		if (fault_exception(hdl, faults[f]))
255 			continue;
256 
257 		if (nvlist_lookup_string(asru, FM_FMRI_DEV_PATH,
258 		    &path) != 0 || path[0] == '\0') {
259 			fmd_hdl_debug(hdl, "rio_recv: no dev path in asru");
260 			continue;
261 		}
262 
263 		if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0) {
264 			if (fmd_nvl_fmri_has_fault(hdl, asru,
265 			    FMD_HAS_FAULT_ASRU, NULL) == 1) {
266 				error = di_retire_device(path, &drt, 0);
267 				if (error != 0) {
268 					fmd_hdl_debug(hdl, "rio_recv:"
269 					    " di_retire_device failed:"
270 					    " error: %d %s", error, path);
271 				}
272 			}
273 		}
274 	}
275 
276 	/*
277 	 * Don't send uuclose or uuresolved unless at least one suspect
278 	 * was valid for this retire agent and no retires/unretires failed.
279 	 */
280 	if (valid_suspect == 0)
281 		return;
282 
283 	/*
284 	 * The fmd framework takes care of moving a case to the repaired
285 	 * state. To move the case to the closed state however, we (the
286 	 * retire agent) need to call fmd_case_uuclose()
287 	 */
288 	if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 && rval == 0) {
289 		if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0 &&
290 		    !fmd_case_uuclosed(hdl, uuid)) {
291 			fmd_case_uuclose(hdl, uuid);
292 		}
293 	}
294 
295 	/*
296 	 * Similarly to move the case to the resolved state, we (the
297 	 * retire agent) need to call fmd_case_uuresolved()
298 	 */
299 	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && rval == 0 &&
300 	    nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0)
301 		fmd_case_uuresolved(hdl, uuid);
302 }
303 
304 static const fmd_hdl_ops_t fmd_ops = {
305 	rio_recv,	/* fmdo_recv */
306 	NULL,		/* fmdo_timeout */
307 	NULL,		/* fmdo_close */
308 	NULL,		/* fmdo_stats */
309 	NULL,		/* fmdo_gc */
310 };
311 
312 static const fmd_prop_t rio_props[] = {
313 	{ "global-disable", FMD_TYPE_BOOL, "false" },
314 	{ "fault-exceptions", FMD_TYPE_STRING, NULL },
315 	{ NULL, 0, NULL }
316 };
317 
318 static const fmd_hdl_info_t fmd_info = {
319 	"I/O Retire Agent", "2.0", &fmd_ops, rio_props
320 };
321 
322 void
323 _fmd_init(fmd_hdl_t *hdl)
324 {
325 	char	*estr;
326 	char	*estrdup;
327 
328 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
329 		fmd_hdl_debug(hdl, "failed to register handle\n");
330 		return;
331 	}
332 
333 	global_disable = fmd_prop_get_int32(hdl, "global-disable");
334 
335 	estrdup = NULL;
336 	if (estr = fmd_prop_get_string(hdl, "fault-exceptions")) {
337 		estrdup = fmd_hdl_strdup(hdl, estr, FMD_SLEEP);
338 		fmd_prop_free_string(hdl, estr);
339 		parse_exception_string(hdl, estrdup);
340 		fmd_hdl_strfree(hdl, estrdup);
341 	}
342 }
343 
344 void
345 _fmd_fini(fmd_hdl_t *hdl)
346 {
347 	free_exception_list(hdl);
348 }
349