1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * Disk Monitor
29 */
30 #include <sys/types.h>
31 #include <sys/stat.h>
32 #include <fcntl.h>
33 #include <time.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <strings.h>
37 #include <stdarg.h>
38 #include <errno.h>
39 #include <signal.h>
40 #include <unistd.h>
41 #include <pthread.h>
42 #include <libnvpair.h>
43 #include <fm/fmd_api.h>
44 #include <fm/fmd_fmri.h>
45 #include <sys/fm/protocol.h>
46 #include <sys/fm/io/disk.h>
47 #include <fm/libtopo.h>
48
49 #include "disk_monitor.h"
50 #include "hotplug_mgr.h"
51 #include "schg_mgr.h"
52 #include "topo_gather.h"
53 #include "dm_platform.h"
54
55 #define THIS_FMD_MODULE_NAME "disk-monitor"
56
/*
 * Bit flags recording which managers diskmon_init() has brought up so far.
 * The failure path of diskmon_init() tests these bits to decide which
 * cleanup routines need to be called.
 */
static enum disk_init_state {
	INIT_STATE_NONE = 0,		/* nothing initialized yet */
	STATE_CHANGE_MGR_INITTED = 2,	/* init_state_change_manager() succeeded */
	HOTPLUG_MGR_INITTED = 4		/* init_hotplug_manager() succeeded */
} g_init_state = INIT_STATE_NONE;
62
/*
 * Presumably distinguishes a suspect list from a repaired list.
 * NOTE(review): this type is not referenced anywhere in the visible part of
 * this file -- confirm whether it is still needed.
 */
typedef enum {
	LT_SUSPECT,
	LT_REPAIRED
} fm_list_type_t;
67
/*
 * Global verbosity flag -- controls chattiness of debug messages and
 * warnings.  Its value is determined by the fmd property "log-level"
 * settable in the DE's .conf file.  It is tested as a bit mask (see the
 * MM_MAIN check in diskmon_recv()).
 */
log_class_t g_verbose = 0;

/*
 * Module configuration.  _fmd_init() reads its disk_list member to count the
 * disks to monitor, and the pointer is handed to the state change manager's
 * init and cleanup routines.  Presumably populated by config_init() /
 * config_get() -- that code is not in this file.
 */
cfgdata_t *config_data = NULL;

/*
 * The fmd handle for this module: set on entry to _fmd_init() and reset to
 * NULL by _fmd_fini().
 */
fmd_hdl_t *g_fm_hdl = NULL;

/*
 * Forward declaration of the property table defined further down.
 * NOTE(review): the full definition appears in this file before every
 * visible use of fmd_props, so this incomplete-type declaration looks
 * unnecessary -- confirm before removing.
 */
static const fmd_prop_t fmd_props[];
78
79 static void
diskmon_teardown_all(void)80 diskmon_teardown_all(void)
81 {
82 cleanup_hotplug_manager();
83 cleanup_state_change_manager(config_data);
84 config_fini();
85 }
86
87 static int
count_disks(diskmon_t * disklistp)88 count_disks(diskmon_t *disklistp)
89 {
90 int i = 0;
91
92 while (disklistp != NULL) {
93 i++;
94 disklistp = disklistp->next;
95 }
96
97 return (i);
98 }
99
100 static int
diskmon_init(void)101 diskmon_init(void)
102 {
103 /*
104 * Block the generation of state change events (generated by the
105 * hotplug manager thread) here; they will be unblocked after the
106 * state change manager thread is ready to accept state changes
107 * (shortly after it starts).
108 */
109 block_state_change_events();
110
111 if (dm_platform_init() != 0)
112 goto cleanup;
113
114 if (init_hotplug_manager() != 0)
115 goto cleanup;
116 else
117 g_init_state |= HOTPLUG_MGR_INITTED;
118
119 if (init_state_change_manager(config_data) != 0)
120 goto cleanup;
121 else
122 g_init_state |= STATE_CHANGE_MGR_INITTED;
123
124 return (E_SUCCESS);
125
126 cleanup:
127
128 unblock_state_change_events();
129
130 /*
131 * The cleanup order here does matter, due to dependencies between the
132 * managers.
133 */
134 if (g_init_state & HOTPLUG_MGR_INITTED)
135 cleanup_hotplug_manager();
136 if (g_init_state & STATE_CHANGE_MGR_INITTED)
137 cleanup_state_change_manager(config_data);
138 dm_platform_fini();
139
140 return (E_ERROR);
141 }
142
143 static void
dm_fault_execute_actions(fmd_hdl_t * hdl,diskmon_t * diskp,nvlist_t * nvl)144 dm_fault_execute_actions(fmd_hdl_t *hdl, diskmon_t *diskp, nvlist_t *nvl)
145 {
146 const char *action_prop = NULL;
147 const char *action_string;
148
149 /*
150 * The predictive failure action is the activation of the fault
151 * indicator.
152 */
153 if (fmd_nvl_class_match(hdl, nvl,
154 DISK_ERROR_CLASS "." FM_FAULT_DISK_OVERTEMP))
155 action_prop = DISK_PROP_OTEMPACTION;
156
157 if (fmd_nvl_class_match(hdl, nvl,
158 DISK_ERROR_CLASS "." FM_FAULT_DISK_TESTFAIL))
159 action_prop = DISK_PROP_STFAILACTION;
160
161 dm_fault_indicator_set(diskp, INDICATOR_ON);
162
163 if (action_prop != NULL &&
164 (action_string = dm_prop_lookup(diskp->props, action_prop))
165 != NULL) {
166
167 if (dm_platform_indicator_execute(action_string) != 0) {
168 log_warn("Fault action `%s' did not successfully "
169 "complete.\n", action_string);
170 }
171 }
172 }
173
174 static void
diskmon_agent_repair(fmd_hdl_t * hdl,nvlist_t * nvl,int repair)175 diskmon_agent_repair(fmd_hdl_t *hdl, nvlist_t *nvl, int repair)
176 {
177 char *uuid = NULL;
178 nvlist_t **nva;
179 uint_t nvc;
180 diskmon_t *diskp;
181 nvlist_t *fmri;
182 nvlist_t *fltnvl;
183 int err = 0;
184
185 err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
186 err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
187 &nva, &nvc);
188 if (err != 0)
189 return;
190
191 while (nvc-- != 0) {
192
193 fltnvl = *nva++;
194
195 if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri)
196 != 0)
197 continue;
198
199 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
200 continue;
201
202 log_msg(MM_MAIN, "Disk %s repaired!\n",
203 diskp->location);
204
205 dm_fault_indicator_set(diskp, INDICATOR_OFF);
206
207 dm_state_change(diskp, HPS_REPAIRED);
208 }
209
210 if (repair)
211 fmd_case_uuresolved(hdl, uuid);
212
213 }
214
215 static void
diskmon_agent_suspect(fmd_hdl_t * hdl,nvlist_t * nvl)216 diskmon_agent_suspect(fmd_hdl_t *hdl, nvlist_t *nvl)
217 {
218 char *uuid = NULL;
219 nvlist_t **nva;
220 uint_t nvc;
221 diskmon_t *diskp;
222 nvlist_t *fmri;
223 nvlist_t *fltnvl;
224 int err = 0;
225
226 err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
227 err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
228 &nva, &nvc);
229 if (err != 0)
230 return;
231
232 while (nvc-- != 0 && !fmd_case_uuclosed(hdl, uuid)) {
233
234 fltnvl = *nva++;
235
236 if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri) != 0)
237 continue;
238
239 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
240 continue;
241
242 /* Execute the actions associated with this fault */
243 dm_fault_execute_actions(hdl, diskp, fltnvl);
244
245 /*
246 * Send a state change event to the state change manager
247 */
248 dm_state_change(diskp, HPS_FAULTED);
249 }
250
251 if (!fmd_case_uuclosed(hdl, uuid)) {
252 /* Case is closed */
253 fmd_case_uuclose(hdl, uuid);
254 }
255 }
256
257 /*ARGSUSED*/
258 static void
diskmon_recv(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class)259 diskmon_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
260 {
261 diskmon_t *diskp;
262 nvlist_t *fmri;
263
264 if (g_verbose & MM_MAIN)
265 nvlist_print(stderr, nvl);
266
267 /*
268 * Act on the fault suspect list or repaired list (embedded agent
269 * action).
270 */
271 if (fmd_nvl_class_match(hdl, nvl, FM_LIST_REPAIRED_CLASS)) {
272
273 diskmon_agent_repair(hdl, nvl, 1);
274 return;
275
276 } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_UPDATED_CLASS)) {
277
278 diskmon_agent_repair(hdl, nvl, 0);
279 return;
280
281 } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_SUSPECT_CLASS)) {
282
283 diskmon_agent_suspect(hdl, nvl);
284 return;
285 } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_RESOLVED_CLASS)) {
286 return;
287 }
288
289 /*
290 * If we get any replayed faults, set the diskmon's faulted
291 * flag for the appropriate fault, then change the diskmon's state
292 * to faulted.
293 */
294 if (fmd_nvl_class_match(hdl, nvl, DISK_ERROR_CLASS ".*")) {
295
296 if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE,
297 &fmri) != 0)
298 return;
299
300 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
301 return;
302
303 /* Execute the actions associated with this fault */
304 dm_fault_execute_actions(hdl, diskp, nvl);
305
306 /*
307 * If the fault wasn't generated by this module, send a
308 * state change event to the state change manager
309 */
310 dm_state_change(diskp, HPS_FAULTED);
311 return;
312 }
313 }
314
/*
 * Entry points handed to fmd through fmd_info.  Only the event receive
 * callback is implemented; every other entry point is left NULL.
 */
static const fmd_hdl_ops_t fmd_ops = {
	diskmon_recv,	/* fmdo_recv */
	NULL,		/* fmdo_timeout */
	NULL,		/* fmdo_close */
	NULL,		/* fmdo_stats */
	NULL,		/* fmdo_gc */
};
322
/*
 * Properties this module declares to fmd: a single unsigned 32-bit
 * property, GLOBAL_PROP_LOG_LEVEL, whose default value is "0".  The final
 * all-NULL entry ends the table.  Besides being referenced from fmd_info,
 * the table is also passed to config_get() in _fmd_init().
 */
static const fmd_prop_t fmd_props[] = {
	{ GLOBAL_PROP_LOG_LEVEL, FMD_TYPE_UINT32, "0" },
	{ NULL, 0, NULL }
};
327
/*
 * Module description passed to fmd_hdl_register() in _fmd_init(): the
 * module's descriptive name, its version string, its entry points and its
 * property table.
 */
static const fmd_hdl_info_t fmd_info = {
	"Disk Monitor",
	DISK_MONITOR_MODULE_VERSION,
	&fmd_ops,
	fmd_props
};
334
335 void
_fmd_init(fmd_hdl_t * hdl)336 _fmd_init(fmd_hdl_t *hdl)
337 {
338 fmd_case_t *cp;
339 int disk_count;
340
341 g_fm_hdl = hdl;
342
343 if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
344 return;
345 }
346
347 if (config_init()) {
348 log_err("Could not initialize configuration!\n");
349 fmd_hdl_unregister(hdl);
350 return;
351 }
352
353 if (config_get(hdl, fmd_props)) {
354 config_fini();
355 log_err("Could not retrieve configuration from libtopo!\n");
356 fmd_hdl_unregister(hdl);
357 return;
358 }
359
360 /*
361 * If there are no disks to monitor, bail out
362 */
363 if ((disk_count = count_disks(config_data->disk_list)) == 0) {
364 config_fini();
365 fmd_hdl_unregister(hdl);
366 return;
367 }
368
369 if (diskmon_init() == E_ERROR) {
370 config_fini();
371 fmd_hdl_unregister(hdl);
372 return;
373 }
374
375 log_msg(MM_MAIN, "Monitoring %d disks.\n", disk_count);
376
377 /*
378 * Iterate over all active cases.
379 * Since we automatically solve all cases, these cases must have
380 * had the fault added, but the DE must have been interrupted
381 * before they were solved.
382 */
383 for (cp = fmd_case_next(hdl, NULL);
384 cp != NULL; cp = fmd_case_next(hdl, cp)) {
385
386 if (!fmd_case_solved(hdl, cp))
387 fmd_case_solve(hdl, cp);
388 }
389 }
390
/*
 * Module finalization entry point: tears down the managers and the
 * configuration via diskmon_teardown_all(), then clears the global module
 * handle.  The hdl argument itself is not used.
 */
/*ARGSUSED*/
void
_fmd_fini(fmd_hdl_t *hdl)
{
	diskmon_teardown_all();
	/* Cleared only after teardown has finished. */
	g_fm_hdl = NULL;
}
398