xref: /illumos-gate/usr/src/cmd/fm/modules/common/io-retire/rio_main.c (revision 5750ef5c2584f7399d9b98bfd513c0ca9f79f66e)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
2125e8c5aaSvikram 
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
2625e8c5aaSvikram 
2725e8c5aaSvikram #include <sys/fm/protocol.h>
2825e8c5aaSvikram #include <fm/fmd_api.h>
2925e8c5aaSvikram #include <strings.h>
3025e8c5aaSvikram #include <libdevinfo.h>
3125e8c5aaSvikram #include <sys/modctl.h>
3225e8c5aaSvikram 
/*
 * Non-zero when the "global-disable" module property is set; loaded once
 * in _fmd_init() and consulted by rio_recv().
 */
static int	global_disable = 0;

/*
 * One entry of the fault exception list.  Entries are pushed onto the
 * front of a singly linked list by parse_exception_string().
 */
struct except_list {
	char			*el_fault;	/* pattern given to fmd_nvl_class_match() */
	struct except_list	*el_next;	/* next entry, NULL at the tail */
};

/* Head of the exception list; NULL when the list is empty. */
static struct except_list *except_list = NULL;
4125e8c5aaSvikram 
4225e8c5aaSvikram static void
parse_exception_string(fmd_hdl_t * hdl,char * estr)4325e8c5aaSvikram parse_exception_string(fmd_hdl_t *hdl, char *estr)
4425e8c5aaSvikram {
4525e8c5aaSvikram 	char	*p;
4625e8c5aaSvikram 	char	*next;
4725e8c5aaSvikram 	size_t	len;
4825e8c5aaSvikram 	struct except_list *elem;
4925e8c5aaSvikram 
5025e8c5aaSvikram 	len = strlen(estr);
5125e8c5aaSvikram 
5225e8c5aaSvikram 	p = estr;
5325e8c5aaSvikram 	for (;;) {
5425e8c5aaSvikram 		/* Remove leading ':' */
5525e8c5aaSvikram 		while (*p == ':')
5625e8c5aaSvikram 			p++;
5725e8c5aaSvikram 		if (*p == '\0')
5825e8c5aaSvikram 			break;
5925e8c5aaSvikram 
6025e8c5aaSvikram 		next = strchr(p, ':');
6125e8c5aaSvikram 
6225e8c5aaSvikram 		if (next)
6325e8c5aaSvikram 			*next = '\0';
6425e8c5aaSvikram 
6525e8c5aaSvikram 		elem = fmd_hdl_alloc(hdl,
6625e8c5aaSvikram 		    sizeof (struct except_list), FMD_SLEEP);
6725e8c5aaSvikram 		elem->el_fault = fmd_hdl_strdup(hdl, p, FMD_SLEEP);
6825e8c5aaSvikram 		elem->el_next = except_list;
6925e8c5aaSvikram 		except_list = elem;
7025e8c5aaSvikram 
7125e8c5aaSvikram 		if (next) {
7225e8c5aaSvikram 			*next = ':';
7325e8c5aaSvikram 			p = next + 1;
7425e8c5aaSvikram 		} else {
7525e8c5aaSvikram 			break;
7625e8c5aaSvikram 		}
7725e8c5aaSvikram 	}
7825e8c5aaSvikram 
7925e8c5aaSvikram 	if (len != strlen(estr)) {
8025e8c5aaSvikram 		fmd_hdl_abort(hdl, "Error parsing exception list: %s\n", estr);
8125e8c5aaSvikram 	}
8225e8c5aaSvikram }
8325e8c5aaSvikram 
8425e8c5aaSvikram /*
8525e8c5aaSvikram  * Returns
8625e8c5aaSvikram  *	1  if fault on exception list
8725e8c5aaSvikram  *	0  otherwise
8825e8c5aaSvikram  */
8925e8c5aaSvikram static int
fault_exception(fmd_hdl_t * hdl,nvlist_t * fault)9025e8c5aaSvikram fault_exception(fmd_hdl_t *hdl, nvlist_t *fault)
9125e8c5aaSvikram {
9225e8c5aaSvikram 	struct except_list *elem;
9325e8c5aaSvikram 
9425e8c5aaSvikram 	for (elem = except_list; elem; elem = elem->el_next) {
9525e8c5aaSvikram 		if (fmd_nvl_class_match(hdl, fault, elem->el_fault)) {
9625e8c5aaSvikram 			fmd_hdl_debug(hdl, "rio_recv: Skipping fault "
9725e8c5aaSvikram 			    "on exception list (%s)\n", elem->el_fault);
9825e8c5aaSvikram 			return (1);
9925e8c5aaSvikram 		}
10025e8c5aaSvikram 	}
10125e8c5aaSvikram 
10225e8c5aaSvikram 	return (0);
10325e8c5aaSvikram }
10425e8c5aaSvikram 
10525e8c5aaSvikram static void
free_exception_list(fmd_hdl_t * hdl)10625e8c5aaSvikram free_exception_list(fmd_hdl_t *hdl)
10725e8c5aaSvikram {
10825e8c5aaSvikram 	struct except_list *elem;
10925e8c5aaSvikram 
11025e8c5aaSvikram 	while (except_list) {
11125e8c5aaSvikram 		elem = except_list;
11225e8c5aaSvikram 		except_list = elem->el_next;
11325e8c5aaSvikram 		fmd_hdl_strfree(hdl, elem->el_fault);
11425e8c5aaSvikram 		fmd_hdl_free(hdl, elem, sizeof (*elem));
11525e8c5aaSvikram 	}
11625e8c5aaSvikram }
11725e8c5aaSvikram 
11825e8c5aaSvikram 
11925e8c5aaSvikram /*ARGSUSED*/
12025e8c5aaSvikram static void
rio_recv(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class)12125e8c5aaSvikram rio_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
12225e8c5aaSvikram {
12325c6ff4bSstephh 	nvlist_t	**faults = NULL;
12425e8c5aaSvikram 	nvlist_t	*asru;
12525c6ff4bSstephh 	uint_t		nfaults = 0;
12625e8c5aaSvikram 	int		f;
12725e8c5aaSvikram 	char		*path;
12825e8c5aaSvikram 	char		*uuid;
12925e8c5aaSvikram 	char		*scheme;
13025e8c5aaSvikram 	di_retire_t	drt = {0};
13125e8c5aaSvikram 	int		retire;
13225c6ff4bSstephh 	int		rval = 0;
133cbf75e67SStephen Hanson 	int		valid_suspect = 0;
13425e8c5aaSvikram 	int		error;
13525e8c5aaSvikram 	char		*snglfault = FM_FAULT_CLASS"."FM_ERROR_IO".";
136b7d3956bSstephh 	boolean_t	rtr;
13725e8c5aaSvikram 
13825e8c5aaSvikram 
13925e8c5aaSvikram 	/*
14025e8c5aaSvikram 	 * If disabled, we don't do retire. We still do unretires though
14125e8c5aaSvikram 	 */
142*5750ef5cSStephen Hanson 	if (global_disable && (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 ||
143*5750ef5cSStephen Hanson 	    strcmp(class, FM_LIST_UPDATED_CLASS) == 0)) {
14425e8c5aaSvikram 		fmd_hdl_debug(hdl, "rio_recv: retire disabled\n");
14525e8c5aaSvikram 		return;
14625e8c5aaSvikram 	}
14725e8c5aaSvikram 
14825e8c5aaSvikram 	drt.rt_abort = (void (*)(void *, const char *, ...))fmd_hdl_abort;
14925e8c5aaSvikram 	drt.rt_debug = (void (*)(void *, const char *, ...))fmd_hdl_debug;
15025e8c5aaSvikram 	drt.rt_hdl = hdl;
15125e8c5aaSvikram 
15225e8c5aaSvikram 	if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0) {
15325e8c5aaSvikram 		retire = 1;
15425e8c5aaSvikram 	} else if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) {
15525e8c5aaSvikram 		retire = 0;
15625c6ff4bSstephh 	} else if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0) {
15725c6ff4bSstephh 		retire = 0;
158cbf75e67SStephen Hanson 	} else if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) {
159cbf75e67SStephen Hanson 		return;
16025e8c5aaSvikram 	} else if (strncmp(class, snglfault, strlen(snglfault)) == 0) {
16125c6ff4bSstephh 		retire = 1;
16225c6ff4bSstephh 		faults = &nvl;
16325c6ff4bSstephh 		nfaults = 1;
16425e8c5aaSvikram 	} else {
16525e8c5aaSvikram 		fmd_hdl_debug(hdl, "rio_recv: not list.* class: %s\n", class);
16625e8c5aaSvikram 		return;
16725e8c5aaSvikram 	}
16825e8c5aaSvikram 
16925c6ff4bSstephh 	if (nfaults == 0 && nvlist_lookup_nvlist_array(nvl,
17025c6ff4bSstephh 	    FM_SUSPECT_FAULT_LIST, &faults, &nfaults) != 0) {
17125e8c5aaSvikram 		fmd_hdl_debug(hdl, "rio_recv: no fault list");
17225e8c5aaSvikram 		return;
17325e8c5aaSvikram 	}
17425e8c5aaSvikram 
17525e8c5aaSvikram 	for (f = 0; f < nfaults; f++) {
176b7d3956bSstephh 		if (nvlist_lookup_boolean_value(faults[f], FM_SUSPECT_RETIRE,
177b7d3956bSstephh 		    &rtr) == 0 && !rtr) {
178b7d3956bSstephh 			fmd_hdl_debug(hdl, "rio_recv: retire suppressed");
179b7d3956bSstephh 			continue;
180b7d3956bSstephh 		}
181b7d3956bSstephh 
18225e8c5aaSvikram 		if (nvlist_lookup_nvlist(faults[f], FM_FAULT_ASRU,
18325e8c5aaSvikram 		    &asru) != 0) {
18425e8c5aaSvikram 			fmd_hdl_debug(hdl, "rio_recv: no asru in fault");
18525e8c5aaSvikram 			continue;
18625e8c5aaSvikram 		}
18725e8c5aaSvikram 
18825e8c5aaSvikram 		scheme = NULL;
18925e8c5aaSvikram 		if (nvlist_lookup_string(asru, FM_FMRI_SCHEME, &scheme) != 0 ||
19025e8c5aaSvikram 		    strcmp(scheme, FM_FMRI_SCHEME_DEV) != 0) {
19125e8c5aaSvikram 			fmd_hdl_debug(hdl, "rio_recv: not \"dev\" scheme: %s",
19225e8c5aaSvikram 			    scheme ? scheme : "<NULL>");
19325e8c5aaSvikram 			continue;
19425e8c5aaSvikram 		}
19525e8c5aaSvikram 
19625c6ff4bSstephh 		if (fault_exception(hdl, faults[f]))
19725e8c5aaSvikram 			continue;
19825e8c5aaSvikram 
19925e8c5aaSvikram 		if (nvlist_lookup_string(asru, FM_FMRI_DEV_PATH,
20025e8c5aaSvikram 		    &path) != 0 || path[0] == '\0') {
20125e8c5aaSvikram 			fmd_hdl_debug(hdl, "rio_recv: no dev path in asru");
20225e8c5aaSvikram 			continue;
20325e8c5aaSvikram 		}
20425e8c5aaSvikram 
205cbf75e67SStephen Hanson 		valid_suspect = 1;
20625e8c5aaSvikram 		if (retire) {
20725c6ff4bSstephh 			if (fmd_nvl_fmri_has_fault(hdl, asru,
20825c6ff4bSstephh 			    FMD_HAS_FAULT_ASRU, NULL) == 1) {
20925c6ff4bSstephh 				error = di_retire_device(path, &drt, 0);
21025c6ff4bSstephh 				if (error != 0) {
21125c6ff4bSstephh 					fmd_hdl_debug(hdl, "rio_recv:"
21225c6ff4bSstephh 					    " di_retire_device failed:"
21325c6ff4bSstephh 					    " error: %d %s", error, path);
21425c6ff4bSstephh 					rval = -1;
21525c6ff4bSstephh 				}
21625e8c5aaSvikram 			}
21725e8c5aaSvikram 		} else {
21825c6ff4bSstephh 			if (fmd_nvl_fmri_has_fault(hdl, asru,
21925c6ff4bSstephh 			    FMD_HAS_FAULT_ASRU, NULL) == 0) {
22025e8c5aaSvikram 				error = di_unretire_device(path, &drt);
22125e8c5aaSvikram 				if (error != 0) {
22225e8c5aaSvikram 					fmd_hdl_debug(hdl, "rio_recv:"
22325c6ff4bSstephh 					    " di_unretire_device failed:"
22425c6ff4bSstephh 					    " error: %d %s", error, path);
22525e8c5aaSvikram 					rval = -1;
22625e8c5aaSvikram 				}
22725e8c5aaSvikram 			}
22825e8c5aaSvikram 		}
22925e8c5aaSvikram 	}
230*5750ef5cSStephen Hanson 	/*
231*5750ef5cSStephen Hanson 	 * Run through again to handle new faults in a list.updated.
232*5750ef5cSStephen Hanson 	 */
233*5750ef5cSStephen Hanson 	for (f = 0; f < nfaults; f++) {
234*5750ef5cSStephen Hanson 		if (nvlist_lookup_boolean_value(faults[f], FM_SUSPECT_RETIRE,
235*5750ef5cSStephen Hanson 		    &rtr) == 0 && !rtr) {
236*5750ef5cSStephen Hanson 			fmd_hdl_debug(hdl, "rio_recv: retire suppressed");
237*5750ef5cSStephen Hanson 			continue;
238*5750ef5cSStephen Hanson 		}
239*5750ef5cSStephen Hanson 
240*5750ef5cSStephen Hanson 		if (nvlist_lookup_nvlist(faults[f], FM_FAULT_ASRU,
241*5750ef5cSStephen Hanson 		    &asru) != 0) {
242*5750ef5cSStephen Hanson 			fmd_hdl_debug(hdl, "rio_recv: no asru in fault");
243*5750ef5cSStephen Hanson 			continue;
244*5750ef5cSStephen Hanson 		}
245*5750ef5cSStephen Hanson 
246*5750ef5cSStephen Hanson 		scheme = NULL;
247*5750ef5cSStephen Hanson 		if (nvlist_lookup_string(asru, FM_FMRI_SCHEME, &scheme) != 0 ||
248*5750ef5cSStephen Hanson 		    strcmp(scheme, FM_FMRI_SCHEME_DEV) != 0) {
249*5750ef5cSStephen Hanson 			fmd_hdl_debug(hdl, "rio_recv: not \"dev\" scheme: %s",
250*5750ef5cSStephen Hanson 			    scheme ? scheme : "<NULL>");
251*5750ef5cSStephen Hanson 			continue;
252*5750ef5cSStephen Hanson 		}
253*5750ef5cSStephen Hanson 
254*5750ef5cSStephen Hanson 		if (fault_exception(hdl, faults[f]))
255*5750ef5cSStephen Hanson 			continue;
256*5750ef5cSStephen Hanson 
257*5750ef5cSStephen Hanson 		if (nvlist_lookup_string(asru, FM_FMRI_DEV_PATH,
258*5750ef5cSStephen Hanson 		    &path) != 0 || path[0] == '\0') {
259*5750ef5cSStephen Hanson 			fmd_hdl_debug(hdl, "rio_recv: no dev path in asru");
260*5750ef5cSStephen Hanson 			continue;
261*5750ef5cSStephen Hanson 		}
262*5750ef5cSStephen Hanson 
263*5750ef5cSStephen Hanson 		if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0) {
264*5750ef5cSStephen Hanson 			if (fmd_nvl_fmri_has_fault(hdl, asru,
265*5750ef5cSStephen Hanson 			    FMD_HAS_FAULT_ASRU, NULL) == 1) {
266*5750ef5cSStephen Hanson 				error = di_retire_device(path, &drt, 0);
267*5750ef5cSStephen Hanson 				if (error != 0) {
268*5750ef5cSStephen Hanson 					fmd_hdl_debug(hdl, "rio_recv:"
269*5750ef5cSStephen Hanson 					    " di_retire_device failed:"
270*5750ef5cSStephen Hanson 					    " error: %d %s", error, path);
271*5750ef5cSStephen Hanson 				}
272*5750ef5cSStephen Hanson 			}
273*5750ef5cSStephen Hanson 		}
274*5750ef5cSStephen Hanson 	}
27525e8c5aaSvikram 
27625e8c5aaSvikram 	/*
277cbf75e67SStephen Hanson 	 * Don't send uuclose or uuresolved unless at least one suspect
278cbf75e67SStephen Hanson 	 * was valid for this retire agent and no retires/unretires failed.
279cbf75e67SStephen Hanson 	 */
280cbf75e67SStephen Hanson 	if (valid_suspect == 0)
281cbf75e67SStephen Hanson 		return;
282cbf75e67SStephen Hanson 
283cbf75e67SStephen Hanson 	/*
28425e8c5aaSvikram 	 * The fmd framework takes care of moving a case to the repaired
28525e8c5aaSvikram 	 * state. To move the case to the closed state however, we (the
28625e8c5aaSvikram 	 * retire agent) need to call fmd_case_uuclose()
28725e8c5aaSvikram 	 */
28825c6ff4bSstephh 	if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 && rval == 0) {
28925e8c5aaSvikram 		if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0 &&
29025e8c5aaSvikram 		    !fmd_case_uuclosed(hdl, uuid)) {
29125e8c5aaSvikram 			fmd_case_uuclose(hdl, uuid);
29225e8c5aaSvikram 		}
29325e8c5aaSvikram 	}
29425c6ff4bSstephh 
29525c6ff4bSstephh 	/*
29625c6ff4bSstephh 	 * Similarly to move the case to the resolved state, we (the
29725c6ff4bSstephh 	 * retire agent) need to call fmd_case_uuresolved()
29825c6ff4bSstephh 	 */
29925c6ff4bSstephh 	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && rval == 0 &&
30025c6ff4bSstephh 	    nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0)
30125c6ff4bSstephh 		fmd_case_uuresolved(hdl, uuid);
30225e8c5aaSvikram }
30325e8c5aaSvikram 
30425e8c5aaSvikram static const fmd_hdl_ops_t fmd_ops = {
30525e8c5aaSvikram 	rio_recv,	/* fmdo_recv */
30625e8c5aaSvikram 	NULL,		/* fmdo_timeout */
30725e8c5aaSvikram 	NULL,		/* fmdo_close */
30825e8c5aaSvikram 	NULL,		/* fmdo_stats */
30925e8c5aaSvikram 	NULL,		/* fmdo_gc */
31025e8c5aaSvikram };
31125e8c5aaSvikram 
31225e8c5aaSvikram static const fmd_prop_t rio_props[] = {
31325e8c5aaSvikram 	{ "global-disable", FMD_TYPE_BOOL, "false" },
31425e8c5aaSvikram 	{ "fault-exceptions", FMD_TYPE_STRING, NULL },
31525e8c5aaSvikram 	{ NULL, 0, NULL }
31625e8c5aaSvikram };
31725e8c5aaSvikram 
31825e8c5aaSvikram static const fmd_hdl_info_t fmd_info = {
31925e8c5aaSvikram 	"I/O Retire Agent", "2.0", &fmd_ops, rio_props
32025e8c5aaSvikram };
32125e8c5aaSvikram 
32225e8c5aaSvikram void
_fmd_init(fmd_hdl_t * hdl)32325e8c5aaSvikram _fmd_init(fmd_hdl_t *hdl)
32425e8c5aaSvikram {
32525e8c5aaSvikram 	char	*estr;
32625e8c5aaSvikram 	char	*estrdup;
32725e8c5aaSvikram 
32825e8c5aaSvikram 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
32925e8c5aaSvikram 		fmd_hdl_debug(hdl, "failed to register handle\n");
33025e8c5aaSvikram 		return;
33125e8c5aaSvikram 	}
33225e8c5aaSvikram 
33325e8c5aaSvikram 	global_disable = fmd_prop_get_int32(hdl, "global-disable");
33425e8c5aaSvikram 
33525e8c5aaSvikram 	estrdup = NULL;
33625e8c5aaSvikram 	if (estr = fmd_prop_get_string(hdl, "fault-exceptions")) {
33725e8c5aaSvikram 		estrdup = fmd_hdl_strdup(hdl, estr, FMD_SLEEP);
33825e8c5aaSvikram 		fmd_prop_free_string(hdl, estr);
33925e8c5aaSvikram 		parse_exception_string(hdl, estrdup);
34025e8c5aaSvikram 		fmd_hdl_strfree(hdl, estrdup);
34125e8c5aaSvikram 	}
34225e8c5aaSvikram }
34325e8c5aaSvikram 
34425e8c5aaSvikram void
_fmd_fini(fmd_hdl_t * hdl)34525e8c5aaSvikram _fmd_fini(fmd_hdl_t *hdl)
34625e8c5aaSvikram {
34725e8c5aaSvikram 	free_exception_list(hdl);
34825e8c5aaSvikram }
349