xref: /illumos-gate/usr/src/cmd/fm/modules/common/io-retire/rio_main.c (revision 89a7715a55deca73d03076f5c24463717f0aaa91)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/fm/protocol.h>
30 #include <fm/fmd_api.h>
31 #include <strings.h>
32 #include <libdevinfo.h>
33 #include <sys/modctl.h>
34 
/*
 * Cached value of the "global-disable" module property, loaded once in
 * _fmd_init(). When non-zero, rio_recv() ignores FM_LIST_SUSPECT_CLASS
 * events (no retires); other event classes are still processed.
 */
static int	global_disable;
36 
/*
 * One entry of the fault exception list. Faults whose class matches
 * el_fault (see fault_exception()) are skipped when retiring.
 */
struct except_list {
	/* fault class string, allocated with fmd_hdl_strdup() */
	char			*el_fault;
	/* next entry in the singly linked list, NULL at the tail */
	struct except_list	*el_next;
};

/*
 * Head of the exception list. Entries are pushed on the front by
 * parse_exception_string() and released by free_exception_list().
 */
static struct except_list *except_list;
43 
44 static void
45 parse_exception_string(fmd_hdl_t *hdl, char *estr)
46 {
47 	char	*p;
48 	char	*next;
49 	size_t	len;
50 	struct except_list *elem;
51 
52 	len = strlen(estr);
53 
54 	p = estr;
55 	for (;;) {
56 		/* Remove leading ':' */
57 		while (*p == ':')
58 			p++;
59 		if (*p == '\0')
60 			break;
61 
62 		next = strchr(p, ':');
63 
64 		if (next)
65 			*next = '\0';
66 
67 		elem = fmd_hdl_alloc(hdl,
68 		    sizeof (struct except_list), FMD_SLEEP);
69 		elem->el_fault = fmd_hdl_strdup(hdl, p, FMD_SLEEP);
70 		elem->el_next = except_list;
71 		except_list = elem;
72 
73 		if (next) {
74 			*next = ':';
75 			p = next + 1;
76 		} else {
77 			break;
78 		}
79 	}
80 
81 	if (len != strlen(estr)) {
82 		fmd_hdl_abort(hdl, "Error parsing exception list: %s\n", estr);
83 	}
84 }
85 
86 /*
87  * Returns
88  *	1  if fault on exception list
89  *	0  otherwise
90  */
91 static int
92 fault_exception(fmd_hdl_t *hdl, nvlist_t *fault)
93 {
94 	struct except_list *elem;
95 
96 	for (elem = except_list; elem; elem = elem->el_next) {
97 		if (fmd_nvl_class_match(hdl, fault, elem->el_fault)) {
98 			fmd_hdl_debug(hdl, "rio_recv: Skipping fault "
99 			    "on exception list (%s)\n", elem->el_fault);
100 			return (1);
101 		}
102 	}
103 
104 	return (0);
105 }
106 
107 static void
108 free_exception_list(fmd_hdl_t *hdl)
109 {
110 	struct except_list *elem;
111 
112 	while (except_list) {
113 		elem = except_list;
114 		except_list = elem->el_next;
115 		fmd_hdl_strfree(hdl, elem->el_fault);
116 		fmd_hdl_free(hdl, elem, sizeof (*elem));
117 	}
118 }
119 
120 
121 /*ARGSUSED*/
122 static void
123 rio_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
124 {
125 	nvlist_t	**faults;
126 	nvlist_t	*asru;
127 	uint_t		nfaults;
128 	int		f;
129 	char		devpath[PATH_MAX];
130 	char		*path;
131 	char		*uuid;
132 	char		*scheme;
133 	di_retire_t	drt = {0};
134 	int		retire;
135 	int		rval;
136 	int		error;
137 	char		*snglfault = FM_FAULT_CLASS"."FM_ERROR_IO".";
138 
139 
140 	/*
141 	 * If disabled, we don't do retire. We still do unretires though
142 	 */
143 	if (global_disable && strcmp(class, FM_LIST_SUSPECT_CLASS) == 0) {
144 		fmd_hdl_debug(hdl, "rio_recv: retire disabled\n");
145 		return;
146 	}
147 
148 	drt.rt_abort = (void (*)(void *, const char *, ...))fmd_hdl_abort;
149 	drt.rt_debug = (void (*)(void *, const char *, ...))fmd_hdl_debug;
150 	drt.rt_hdl = hdl;
151 
152 	if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0) {
153 		retire = 1;
154 	} else if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) {
155 		retire = 0;
156 	} else if (strncmp(class, snglfault, strlen(snglfault)) == 0) {
157 		fmd_hdl_debug(hdl, "rio_recv: single fault: %s\n", class);
158 		return;
159 	} else {
160 		fmd_hdl_debug(hdl, "rio_recv: not list.* class: %s\n", class);
161 		return;
162 	}
163 
164 	faults = NULL;
165 	nfaults = 0;
166 	if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
167 	    &faults, &nfaults) != 0) {
168 		fmd_hdl_debug(hdl, "rio_recv: no fault list");
169 		return;
170 	}
171 
172 	devpath[0] = '\0';
173 	rval = 0;
174 	for (f = 0; f < nfaults; f++) {
175 		if (nvlist_lookup_nvlist(faults[f], FM_FAULT_ASRU,
176 		    &asru) != 0) {
177 			fmd_hdl_debug(hdl, "rio_recv: no asru in fault");
178 			continue;
179 		}
180 
181 		scheme = NULL;
182 		if (nvlist_lookup_string(asru, FM_FMRI_SCHEME, &scheme) != 0 ||
183 		    strcmp(scheme, FM_FMRI_SCHEME_DEV) != 0) {
184 			fmd_hdl_debug(hdl, "rio_recv: not \"dev\" scheme: %s",
185 			    scheme ? scheme : "<NULL>");
186 			continue;
187 		}
188 
189 		if (retire && fault_exception(hdl, faults[f]))
190 			continue;
191 
192 		if (nvlist_lookup_string(asru, FM_FMRI_DEV_PATH,
193 		    &path) != 0 || path[0] == '\0') {
194 			fmd_hdl_debug(hdl, "rio_recv: no dev path in asru");
195 			continue;
196 		}
197 
198 		/*
199 		 * If retire, we retire only if a single ASRU is pinpointed.
200 		 * We don't do automatic retires if a fault event pinpoints
201 		 * more than one ASRU.
202 		 */
203 		if (retire) {
204 			if (devpath[0] != '\0' && strcmp(path, devpath) != 0) {
205 				fmd_hdl_debug(hdl,
206 				    "rio_recv: Skipping: multiple ASRU");
207 				return;
208 			} else if (devpath[0] == '\0') {
209 				(void) strlcpy(devpath, path, sizeof (devpath));
210 			}
211 		} else {
212 			error = di_unretire_device(path, &drt);
213 			if (error != 0) {
214 				fmd_hdl_debug(hdl, "rio_recv: "
215 				    "di_unretire_device failed: error: %d %s",
216 				    error, path);
217 				rval = -1;
218 			}
219 		}
220 	}
221 
222 	if (retire) {
223 		if (devpath[0] == '\0')
224 			return;
225 		error = di_retire_device(devpath, &drt, 0);
226 		if (error != 0) {
227 			fmd_hdl_debug(hdl, "rio_recv: di_retire_device "
228 			    "failed: error: %d %s", error, devpath);
229 			rval = -1;
230 		}
231 	}
232 
233 	/*
234 	 * The fmd framework takes care of moving a case to the repaired
235 	 * state. To move the case to the closed state however, we (the
236 	 * retire agent) need to call fmd_case_uuclose()
237 	 */
238 	if (retire && rval == 0) {
239 		if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0 &&
240 		    !fmd_case_uuclosed(hdl, uuid)) {
241 			fmd_case_uuclose(hdl, uuid);
242 		}
243 	}
244 }
245 
/*
 * Module entry points handed to fmd through fmd_info. Only the event
 * receive callback is provided; all other entry points are left NULL.
 */
static const fmd_hdl_ops_t fmd_ops = {
	rio_recv,	/* fmdo_recv */
	NULL,		/* fmdo_timeout */
	NULL,		/* fmdo_close */
	NULL,		/* fmdo_stats */
	NULL,		/* fmdo_gc */
};
253 
/*
 * Module properties, read in _fmd_init():
 *	global-disable		boolean, default "false"; stored in
 *				global_disable
 *	fault-exceptions	string, no default; ':'-separated list fed
 *				to parse_exception_string()
 * The all-NULL entry terminates the table.
 */
static const fmd_prop_t rio_props[] = {
	{ "global-disable", FMD_TYPE_BOOL, "false" },
	{ "fault-exceptions", FMD_TYPE_STRING, NULL },
	{ NULL, 0, NULL }
};
259 
/*
 * Module description passed to fmd_hdl_register(): module name, version
 * string, entry point table and property table.
 */
static const fmd_hdl_info_t fmd_info = {
	"I/O Retire Agent", "2.0", &fmd_ops, rio_props
};
263 
264 void
265 _fmd_init(fmd_hdl_t *hdl)
266 {
267 	char	*estr;
268 	char	*estrdup;
269 
270 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
271 		fmd_hdl_debug(hdl, "failed to register handle\n");
272 		return;
273 	}
274 
275 	global_disable = fmd_prop_get_int32(hdl, "global-disable");
276 
277 	estrdup = NULL;
278 	if (estr = fmd_prop_get_string(hdl, "fault-exceptions")) {
279 		estrdup = fmd_hdl_strdup(hdl, estr, FMD_SLEEP);
280 		fmd_prop_free_string(hdl, estr);
281 		parse_exception_string(hdl, estrdup);
282 		fmd_hdl_strfree(hdl, estrdup);
283 	}
284 }
285 
/*
 * Module unload entry point: release the exception list built by
 * _fmd_init().
 */
void
_fmd_fini(fmd_hdl_t *hdl)
{
	free_exception_list(hdl);
}
291