xref: /illumos-gate/usr/src/cmd/fm/modules/common/disk-monitor/disk_monitor.c (revision 0244979b1714a04f23ac9fa8367e59f6fb75d8f3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
26  */
27 
28 /*
29  * Disk Monitor
30  */
31 #include <sys/types.h>
32 #include <sys/stat.h>
33 #include <fcntl.h>
34 #include <time.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <strings.h>
38 #include <stdarg.h>
39 #include <errno.h>
40 #include <signal.h>
41 #include <unistd.h>
42 #include <pthread.h>
43 #include <libnvpair.h>
44 #include <fm/fmd_api.h>
45 #include <fm/fmd_fmri.h>
46 #include <sys/fm/protocol.h>
47 #include <sys/fm/io/disk.h>
48 #include <fm/libtopo.h>
49 
50 #include "disk_monitor.h"
51 #include "hotplug_mgr.h"
52 #include "schg_mgr.h"
53 #include "topo_gather.h"
54 #include "dm_platform.h"
55 
56 #define	THIS_FMD_MODULE_NAME "disk-monitor"
57 
/*
 * Bit flags OR'd into g_init_state by diskmon_init() as each manager is
 * successfully initialized; the failure path there tests them to decide
 * which managers need to be cleaned up.
 */
enum disk_init_state {
	INIT_STATE_NONE = 0,
	STATE_CHANGE_MGR_INITTED = (1 << 1),
	HOTPLUG_MGR_INITTED = (1 << 2)
};

static enum disk_init_state g_init_state = INIT_STATE_NONE;
63 
/*
 * Kind of fault list: a suspect list or a repaired list.
 * NOTE(review): nothing in this file appears to reference this type.
 */
typedef enum {
	LT_SUSPECT = 0,
	LT_REPAIRED = 1
} fm_list_type_t;
68 
69 /*
70  * Global verbosity flag -- controls chattiness of debug messages and
71  * warnings.  Its value is determined by the fmd property "log-level"
72  * settable in the DE's .conf file.
73  */
74 log_class_t			g_verbose = 0;
75 cfgdata_t			*config_data = NULL;
76 fmd_hdl_t			*g_fm_hdl = NULL;
77 
78 static const fmd_prop_t		fmd_props[];
79 
80 static void
diskmon_teardown_all(void)81 diskmon_teardown_all(void)
82 {
83 	cleanup_hotplug_manager();
84 	cleanup_state_change_manager(config_data);
85 	config_fini();
86 }
87 
88 static int
count_disks(diskmon_t * disklistp)89 count_disks(diskmon_t *disklistp)
90 {
91 	int i = 0;
92 
93 	while (disklistp != NULL) {
94 		i++;
95 		disklistp = disklistp->next;
96 	}
97 
98 	return (i);
99 }
100 
101 static int
diskmon_init(void)102 diskmon_init(void)
103 {
104 	/*
105 	 * Block the generation of state change events (generated by the
106 	 * hotplug manager thread) here; they will be unblocked after the
107 	 * state change manager thread is ready to accept state changes
108 	 * (shortly after it starts).
109 	 */
110 	block_state_change_events();
111 
112 	if (dm_platform_init() != 0)
113 		goto cleanup;
114 
115 	if (init_hotplug_manager() != 0)
116 		goto cleanup;
117 	else
118 		g_init_state |= HOTPLUG_MGR_INITTED;
119 
120 	if (init_state_change_manager(config_data) != 0)
121 		goto cleanup;
122 	else
123 		g_init_state |= STATE_CHANGE_MGR_INITTED;
124 
125 	return (E_SUCCESS);
126 
127 cleanup:
128 
129 	unblock_state_change_events();
130 
131 	/*
132 	 * The cleanup order here does matter, due to dependencies between the
133 	 * managers.
134 	 */
135 	if (g_init_state & HOTPLUG_MGR_INITTED)
136 		cleanup_hotplug_manager();
137 	if (g_init_state & STATE_CHANGE_MGR_INITTED)
138 		cleanup_state_change_manager(config_data);
139 	dm_platform_fini();
140 
141 	return (E_ERROR);
142 }
143 
144 static void
dm_fault_execute_actions(fmd_hdl_t * hdl,diskmon_t * diskp,nvlist_t * nvl)145 dm_fault_execute_actions(fmd_hdl_t *hdl, diskmon_t *diskp, nvlist_t *nvl)
146 {
147 	const char		*action_prop = NULL;
148 	const char		*action_string;
149 
150 	/*
151 	 * The predictive failure action is the activation of the fault
152 	 * indicator.
153 	 */
154 	if (fmd_nvl_class_match(hdl, nvl,
155 	    DISK_ERROR_CLASS "." FM_FAULT_DISK_OVERTEMP))
156 		action_prop = DISK_PROP_OTEMPACTION;
157 
158 	if (fmd_nvl_class_match(hdl, nvl,
159 	    DISK_ERROR_CLASS "." FM_FAULT_DISK_TESTFAIL))
160 		action_prop = DISK_PROP_STFAILACTION;
161 
162 	if (fmd_nvl_class_match(hdl, nvl,
163 	    DISK_ERROR_CLASS "." FM_FAULT_SSM_WEAROUT))
164 		action_prop = DISK_PROP_SSMWEAROUTACTION;
165 
166 	dm_fault_indicator_set(diskp, INDICATOR_ON);
167 
168 	if (action_prop != NULL &&
169 	    (action_string = dm_prop_lookup(diskp->props, action_prop))
170 	    != NULL) {
171 
172 		if (dm_platform_indicator_execute(action_string) != 0) {
173 			log_warn("Fault action `%s' did not successfully "
174 			    "complete.\n", action_string);
175 		}
176 	}
177 }
178 
179 static void
diskmon_agent_repair(fmd_hdl_t * hdl,nvlist_t * nvl,int repair)180 diskmon_agent_repair(fmd_hdl_t *hdl, nvlist_t *nvl, int repair)
181 {
182 	char		*uuid = NULL;
183 	nvlist_t	**nva;
184 	uint_t		nvc;
185 	diskmon_t	*diskp;
186 	nvlist_t	*fmri;
187 	nvlist_t	*fltnvl;
188 	int		err = 0;
189 
190 	err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
191 	err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
192 	    &nva, &nvc);
193 	if (err != 0)
194 		return;
195 
196 	while (nvc-- != 0) {
197 
198 		fltnvl = *nva++;
199 
200 		if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri)
201 		    != 0)
202 			continue;
203 
204 		if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
205 			continue;
206 
207 		log_msg(MM_MAIN, "Disk %s repaired!\n",
208 		    diskp->location);
209 
210 		dm_fault_indicator_set(diskp, INDICATOR_OFF);
211 
212 		dm_state_change(diskp, HPS_REPAIRED);
213 	}
214 
215 	if (repair)
216 		fmd_case_uuresolved(hdl, uuid);
217 
218 }
219 
220 static void
diskmon_agent_suspect(fmd_hdl_t * hdl,nvlist_t * nvl)221 diskmon_agent_suspect(fmd_hdl_t *hdl, nvlist_t *nvl)
222 {
223 	char		*uuid = NULL;
224 	nvlist_t	**nva;
225 	uint_t		nvc;
226 	diskmon_t	*diskp;
227 	nvlist_t	*fmri;
228 	nvlist_t	*fltnvl;
229 	int		err = 0;
230 
231 	err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
232 	err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
233 	    &nva, &nvc);
234 	if (err != 0)
235 		return;
236 
237 	while (nvc-- != 0 && !fmd_case_uuclosed(hdl, uuid)) {
238 
239 		fltnvl = *nva++;
240 
241 		if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri) != 0)
242 			continue;
243 
244 		if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
245 			continue;
246 
247 		/* Execute the actions associated with this fault */
248 		dm_fault_execute_actions(hdl, diskp,  fltnvl);
249 
250 		/*
251 		 * Send a state change event to the state change manager
252 		 */
253 		dm_state_change(diskp, HPS_FAULTED);
254 	}
255 
256 	if (!fmd_case_uuclosed(hdl, uuid)) {
257 		/* Case is closed */
258 		fmd_case_uuclose(hdl, uuid);
259 	}
260 }
261 
262 /*ARGSUSED*/
263 static void
diskmon_recv(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class)264 diskmon_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
265 {
266 	diskmon_t	*diskp;
267 	nvlist_t	*fmri;
268 
269 	if (g_verbose & MM_MAIN)
270 		nvlist_print(stderr, nvl);
271 
272 	/*
273 	 * Act on the fault suspect list or repaired list (embedded agent
274 	 * action).
275 	 */
276 	if (fmd_nvl_class_match(hdl, nvl, FM_LIST_REPAIRED_CLASS)) {
277 
278 		diskmon_agent_repair(hdl, nvl, 1);
279 		return;
280 
281 	} else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_UPDATED_CLASS)) {
282 
283 		diskmon_agent_repair(hdl, nvl, 0);
284 		return;
285 
286 	} else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_SUSPECT_CLASS)) {
287 
288 		diskmon_agent_suspect(hdl, nvl);
289 		return;
290 	} else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_RESOLVED_CLASS)) {
291 		return;
292 	}
293 
294 	/*
295 	 * If we get any replayed faults, set the diskmon's faulted
296 	 * flag for the appropriate fault, then change the diskmon's state
297 	 * to faulted.
298 	 */
299 	if (fmd_nvl_class_match(hdl, nvl, DISK_ERROR_CLASS ".*")) {
300 
301 		if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE,
302 		    &fmri) != 0)
303 			return;
304 
305 		if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
306 			return;
307 
308 		/* Execute the actions associated with this fault */
309 		dm_fault_execute_actions(hdl, diskp, nvl);
310 
311 		/*
312 		 * If the fault wasn't generated by this module, send a
313 		 * state change event to the state change manager
314 		 */
315 		dm_state_change(diskp, HPS_FAULTED);
316 		return;
317 	}
318 }
319 
320 static const fmd_hdl_ops_t fmd_ops = {
321 	diskmon_recv,	/* fmdo_recv */
322 	NULL,		/* fmdo_timeout */
323 	NULL,		/* fmdo_close */
324 	NULL,		/* fmdo_stats */
325 	NULL,		/* fmdo_gc */
326 };
327 
328 static const fmd_prop_t fmd_props[] = {
329 	{ GLOBAL_PROP_LOG_LEVEL, FMD_TYPE_UINT32, "0" },
330 	{ NULL, 0, NULL }
331 };
332 
333 static const fmd_hdl_info_t fmd_info = {
334 	"Disk Monitor",
335 	DISK_MONITOR_MODULE_VERSION,
336 	&fmd_ops,
337 	fmd_props
338 };
339 
340 void
_fmd_init(fmd_hdl_t * hdl)341 _fmd_init(fmd_hdl_t *hdl)
342 {
343 	fmd_case_t	*cp;
344 	int		disk_count;
345 
346 	g_fm_hdl = hdl;
347 
348 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
349 		return;
350 	}
351 
352 	if (config_init()) {
353 		log_err("Could not initialize configuration!\n");
354 		fmd_hdl_unregister(hdl);
355 		return;
356 	}
357 
358 	if (config_get(hdl, fmd_props)) {
359 		config_fini();
360 		log_err("Could not retrieve configuration from libtopo!\n");
361 		fmd_hdl_unregister(hdl);
362 		return;
363 	}
364 
365 	/*
366 	 * If there are no disks to monitor, bail out
367 	 */
368 	if ((disk_count = count_disks(config_data->disk_list)) == 0) {
369 		config_fini();
370 		fmd_hdl_unregister(hdl);
371 		return;
372 	}
373 
374 	if (diskmon_init() == E_ERROR) {
375 		config_fini();
376 		fmd_hdl_unregister(hdl);
377 		return;
378 	}
379 
380 	log_msg(MM_MAIN, "Monitoring %d disks.\n", disk_count);
381 
382 	/*
383 	 * Iterate over all active cases.
384 	 * Since we automatically solve all cases, these cases must have
385 	 * had the fault added, but the DE must have been interrupted
386 	 * before they were solved.
387 	 */
388 	for (cp = fmd_case_next(hdl, NULL);
389 	    cp != NULL; cp = fmd_case_next(hdl, cp)) {
390 
391 		if (!fmd_case_solved(hdl, cp))
392 			fmd_case_solve(hdl, cp);
393 	}
394 }
395 
396 /*ARGSUSED*/
397 void
_fmd_fini(fmd_hdl_t * hdl)398 _fmd_fini(fmd_hdl_t *hdl)
399 {
400 	diskmon_teardown_all();
401 	g_fm_hdl = NULL;
402 }
403