xref: /titanic_52/usr/src/cmd/fm/modules/common/disk-monitor/disk_monitor.c (revision cbf75e67acb6c32a2f4884f28a839d59f7988d37)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Disk Monitor
29  */
30 #include <sys/types.h>
31 #include <sys/stat.h>
32 #include <fcntl.h>
33 #include <time.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <strings.h>
37 #include <stdarg.h>
38 #include <errno.h>
39 #include <signal.h>
40 #include <unistd.h>
41 #include <pthread.h>
42 #include <libnvpair.h>
43 #include <fm/fmd_api.h>
44 #include <fm/fmd_fmri.h>
45 #include <sys/fm/protocol.h>
46 #include <sys/fm/io/disk.h>
47 #include <fm/libtopo.h>
48 
49 #include "disk_monitor.h"
50 #include "hotplug_mgr.h"
51 #include "schg_mgr.h"
52 #include "topo_gather.h"
53 #include "dm_platform.h"
54 
55 #define	THIS_FMD_MODULE_NAME "disk-monitor"
56 
/*
 * Bit flags recording which managers diskmon_init() has started so far,
 * so that its failure path only tears down what was actually brought up.
 */
static enum disk_init_state {
	INIT_STATE_NONE			= 0x0,
	STATE_CHANGE_MGR_INITTED	= 0x2,
	HOTPLUG_MGR_INITTED		= 0x4
} g_init_state = INIT_STATE_NONE;
62 
/*
 * List-type selector (suspect vs. repaired, going by the enumerator names).
 * NOTE(review): nothing in this file references this type -- confirm it is
 * still needed.
 */
typedef enum {
	LT_SUSPECT = 0,
	LT_REPAIRED = 1
} fm_list_type_t;
67 
68 /*
69  * Global verbosity flag -- controls chattiness of debug messages and
70  * warnings.  Its value is determined by the fmd property "log-level"
71  * settable in the DE's .conf file.
72  */
73 log_class_t			g_verbose = 0;
74 cfgdata_t			*config_data = NULL;
75 fmd_hdl_t			*g_fm_hdl = NULL;
76 
77 static const fmd_prop_t		fmd_props[];
78 
79 static void
80 diskmon_teardown_all(void)
81 {
82 	cleanup_hotplug_manager();
83 	cleanup_state_change_manager(config_data);
84 	config_fini();
85 }
86 
87 static int
88 count_disks(diskmon_t *disklistp)
89 {
90 	int i = 0;
91 
92 	while (disklistp != NULL) {
93 		i++;
94 		disklistp = disklistp->next;
95 	}
96 
97 	return (i);
98 }
99 
100 static int
101 diskmon_init(void)
102 {
103 	/*
104 	 * Block the generation of state change events (generated by the
105 	 * hotplug manager thread) here; they will be unblocked after the
106 	 * state change manager thread is ready to accept state changes
107 	 * (shortly after it starts).
108 	 */
109 	block_state_change_events();
110 
111 	if (dm_platform_init() != 0)
112 		goto cleanup;
113 
114 	if (init_hotplug_manager() != 0)
115 		goto cleanup;
116 	else
117 		g_init_state |= HOTPLUG_MGR_INITTED;
118 
119 	if (init_state_change_manager(config_data) != 0)
120 		goto cleanup;
121 	else
122 		g_init_state |= STATE_CHANGE_MGR_INITTED;
123 
124 	return (E_SUCCESS);
125 
126 cleanup:
127 
128 	unblock_state_change_events();
129 
130 	/*
131 	 * The cleanup order here does matter, due to dependencies between the
132 	 * managers.
133 	 */
134 	if (g_init_state & HOTPLUG_MGR_INITTED)
135 		cleanup_hotplug_manager();
136 	if (g_init_state & STATE_CHANGE_MGR_INITTED)
137 		cleanup_state_change_manager(config_data);
138 	dm_platform_fini();
139 
140 	return (E_ERROR);
141 }
142 
143 static void
144 dm_fault_execute_actions(fmd_hdl_t *hdl, diskmon_t *diskp, nvlist_t *nvl)
145 {
146 	const char		*action_prop = NULL;
147 	const char		*action_string;
148 
149 	/*
150 	 * The predictive failure action is the activation of the fault
151 	 * indicator.
152 	 */
153 	if (fmd_nvl_class_match(hdl, nvl,
154 	    DISK_ERROR_CLASS "." FM_FAULT_DISK_OVERTEMP))
155 		action_prop = DISK_PROP_OTEMPACTION;
156 
157 	if (fmd_nvl_class_match(hdl, nvl,
158 	    DISK_ERROR_CLASS "." FM_FAULT_DISK_TESTFAIL))
159 		action_prop = DISK_PROP_STFAILACTION;
160 
161 	dm_fault_indicator_set(diskp, INDICATOR_ON);
162 
163 	if (action_prop != NULL &&
164 	    (action_string = dm_prop_lookup(diskp->props, action_prop))
165 	    != NULL) {
166 
167 		if (dm_platform_indicator_execute(action_string) != 0) {
168 			log_warn("Fault action `%s' did not successfully "
169 			    "complete.\n", action_string);
170 		}
171 	}
172 }
173 
174 static void
175 diskmon_agent_repair(fmd_hdl_t *hdl, nvlist_t *nvl, int repair)
176 {
177 	char		*uuid = NULL;
178 	nvlist_t	**nva;
179 	uint_t		nvc;
180 	diskmon_t	*diskp;
181 	nvlist_t	*fmri;
182 	nvlist_t	*fltnvl;
183 	int		err = 0;
184 
185 	err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
186 	err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
187 	    &nva, &nvc);
188 	if (err != 0)
189 		return;
190 
191 	while (nvc-- != 0) {
192 
193 		fltnvl = *nva++;
194 
195 		if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri)
196 		    != 0)
197 			continue;
198 
199 		if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
200 			continue;
201 
202 		log_msg(MM_MAIN, "Disk %s repaired!\n",
203 		    diskp->location);
204 
205 		dm_fault_indicator_set(diskp, INDICATOR_OFF);
206 
207 		dm_state_change(diskp, HPS_REPAIRED);
208 	}
209 
210 	if (repair)
211 		fmd_case_uuresolved(hdl, uuid);
212 
213 }
214 
215 static void
216 diskmon_agent_suspect(fmd_hdl_t *hdl, nvlist_t *nvl)
217 {
218 	char		*uuid = NULL;
219 	nvlist_t	**nva;
220 	uint_t		nvc;
221 	diskmon_t	*diskp;
222 	nvlist_t	*fmri;
223 	nvlist_t	*fltnvl;
224 	int		err = 0;
225 
226 	err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
227 	err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
228 	    &nva, &nvc);
229 	if (err != 0)
230 		return;
231 
232 	while (nvc-- != 0 && !fmd_case_uuclosed(hdl, uuid)) {
233 
234 		fltnvl = *nva++;
235 
236 		if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri) != 0)
237 			continue;
238 
239 		if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
240 			continue;
241 
242 		/* Execute the actions associated with this fault */
243 		dm_fault_execute_actions(hdl, diskp,  fltnvl);
244 
245 		/*
246 		 * Send a state change event to the state change manager
247 		 */
248 		dm_state_change(diskp, HPS_FAULTED);
249 	}
250 
251 	if (!fmd_case_uuclosed(hdl, uuid)) {
252 		/* Case is closed */
253 		fmd_case_uuclose(hdl, uuid);
254 	}
255 }
256 
257 /*ARGSUSED*/
258 static void
259 diskmon_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
260 {
261 	diskmon_t	*diskp;
262 	nvlist_t	*fmri;
263 
264 	if (g_verbose & MM_MAIN)
265 		nvlist_print(stderr, nvl);
266 
267 	/*
268 	 * Act on the fault suspect list or repaired list (embedded agent
269 	 * action).
270 	 */
271 	if (fmd_nvl_class_match(hdl, nvl, FM_LIST_REPAIRED_CLASS)) {
272 
273 		diskmon_agent_repair(hdl, nvl, 1);
274 		return;
275 
276 	} else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_UPDATED_CLASS)) {
277 
278 		diskmon_agent_repair(hdl, nvl, 0);
279 		return;
280 
281 	} else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_SUSPECT_CLASS)) {
282 
283 		diskmon_agent_suspect(hdl, nvl);
284 		return;
285 	} else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_RESOLVED_CLASS)) {
286 		return;
287 	}
288 
289 	/*
290 	 * If we get any replayed faults, set the diskmon's faulted
291 	 * flag for the appropriate fault, then change the diskmon's state
292 	 * to faulted.
293 	 */
294 	if (fmd_nvl_class_match(hdl, nvl, DISK_ERROR_CLASS ".*")) {
295 
296 		if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE,
297 		    &fmri) != 0)
298 			return;
299 
300 		if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
301 			return;
302 
303 		/* Execute the actions associated with this fault */
304 		dm_fault_execute_actions(hdl, diskp, nvl);
305 
306 		/*
307 		 * If the fault wasn't generated by this module, send a
308 		 * state change event to the state change manager
309 		 */
310 		dm_state_change(diskp, HPS_FAULTED);
311 		return;
312 	}
313 }
314 
315 static const fmd_hdl_ops_t fmd_ops = {
316 	diskmon_recv,	/* fmdo_recv */
317 	NULL,		/* fmdo_timeout */
318 	NULL,		/* fmdo_close */
319 	NULL,		/* fmdo_stats */
320 	NULL,		/* fmdo_gc */
321 };
322 
323 static const fmd_prop_t fmd_props[] = {
324 	{ GLOBAL_PROP_LOG_LEVEL, FMD_TYPE_UINT32, "0" },
325 	{ NULL, 0, NULL }
326 };
327 
328 static const fmd_hdl_info_t fmd_info = {
329 	"Disk Monitor",
330 	DISK_MONITOR_MODULE_VERSION,
331 	&fmd_ops,
332 	fmd_props
333 };
334 
335 void
336 _fmd_init(fmd_hdl_t *hdl)
337 {
338 	fmd_case_t	*cp;
339 	int		disk_count;
340 
341 	g_fm_hdl = hdl;
342 
343 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
344 		return;
345 	}
346 
347 	if (config_init()) {
348 		log_err("Could not initialize configuration!\n");
349 		fmd_hdl_unregister(hdl);
350 		return;
351 	}
352 
353 	if (config_get(hdl, fmd_props)) {
354 		config_fini();
355 		log_err("Could not retrieve configuration from libtopo!\n");
356 		fmd_hdl_unregister(hdl);
357 		return;
358 	}
359 
360 	/*
361 	 * If there are no disks to monitor, bail out
362 	 */
363 	if ((disk_count = count_disks(config_data->disk_list)) == 0) {
364 		config_fini();
365 		fmd_hdl_unregister(hdl);
366 		return;
367 	}
368 
369 	if (diskmon_init() == E_ERROR) {
370 		config_fini();
371 		fmd_hdl_unregister(hdl);
372 		return;
373 	}
374 
375 	log_msg(MM_MAIN, "Monitoring %d disks.\n", disk_count);
376 
377 	/*
378 	 * Iterate over all active cases.
379 	 * Since we automatically solve all cases, these cases must have
380 	 * had the fault added, but the DE must have been interrupted
381 	 * before they were solved.
382 	 */
383 	for (cp = fmd_case_next(hdl, NULL);
384 	    cp != NULL; cp = fmd_case_next(hdl, cp)) {
385 
386 		if (!fmd_case_solved(hdl, cp))
387 			fmd_case_solve(hdl, cp);
388 	}
389 }
390 
391 /*ARGSUSED*/
392 void
393 _fmd_fini(fmd_hdl_t *hdl)
394 {
395 	diskmon_teardown_all();
396 	g_fm_hdl = NULL;
397 }
398