1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Disk Monitor 29 */ 30 #include <sys/types.h> 31 #include <sys/stat.h> 32 #include <fcntl.h> 33 #include <time.h> 34 #include <stdio.h> 35 #include <stdlib.h> 36 #include <strings.h> 37 #include <stdarg.h> 38 #include <errno.h> 39 #include <signal.h> 40 #include <unistd.h> 41 #include <pthread.h> 42 #include <libnvpair.h> 43 #include <fm/fmd_api.h> 44 #include <fm/fmd_fmri.h> 45 #include <sys/fm/protocol.h> 46 #include <sys/fm/io/disk.h> 47 #include <fm/libtopo.h> 48 49 #include "disk_monitor.h" 50 #include "hotplug_mgr.h" 51 #include "schg_mgr.h" 52 #include "topo_gather.h" 53 #include "dm_platform.h" 54 55 #define THIS_FMD_MODULE_NAME "disk-monitor" 56 57 static enum disk_init_state { 58 INIT_STATE_NONE = 0, 59 STATE_CHANGE_MGR_INITTED = 2, 60 HOTPLUG_MGR_INITTED = 4 61 } g_init_state = INIT_STATE_NONE; 62 63 typedef enum { 64 LT_SUSPECT, 65 LT_REPAIRED 66 } fm_list_type_t; 67 68 /* 69 * Global verbosity flag -- controls chattiness of debug messages and 70 * warnings. Its value is determined by the fmd property "log-level" 71 * settable in the DE's .conf file. 72 */ 73 log_class_t g_verbose = 0; 74 cfgdata_t *config_data = NULL; 75 fmd_hdl_t *g_fm_hdl = NULL; 76 77 static const fmd_prop_t fmd_props[]; 78 79 static void 80 diskmon_teardown_all(void) 81 { 82 cleanup_hotplug_manager(); 83 cleanup_state_change_manager(config_data); 84 config_fini(); 85 } 86 87 static int 88 count_disks(diskmon_t *disklistp) 89 { 90 int i = 0; 91 92 while (disklistp != NULL) { 93 i++; 94 disklistp = disklistp->next; 95 } 96 97 return (i); 98 } 99 100 static int 101 diskmon_init(void) 102 { 103 /* 104 * Block the generation of state change events (generated by the 105 * hotplug manager thread) here; they will be unblocked after the 106 * state change manager thread is ready to accept state changes 107 * (shortly after it starts). 108 */ 109 block_state_change_events(); 110 111 if (dm_platform_init() != 0) 112 goto cleanup; 113 114 if (init_hotplug_manager() != 0) 115 goto cleanup; 116 else 117 g_init_state |= HOTPLUG_MGR_INITTED; 118 119 if (init_state_change_manager(config_data) != 0) 120 goto cleanup; 121 else 122 g_init_state |= STATE_CHANGE_MGR_INITTED; 123 124 return (E_SUCCESS); 125 126 cleanup: 127 128 unblock_state_change_events(); 129 130 /* 131 * The cleanup order here does matter, due to dependencies between the 132 * managers. 133 */ 134 if (g_init_state & HOTPLUG_MGR_INITTED) 135 cleanup_hotplug_manager(); 136 if (g_init_state & STATE_CHANGE_MGR_INITTED) 137 cleanup_state_change_manager(config_data); 138 dm_platform_fini(); 139 140 return (E_ERROR); 141 } 142 143 static void 144 dm_fault_execute_actions(fmd_hdl_t *hdl, diskmon_t *diskp, nvlist_t *nvl) 145 { 146 const char *action_prop = NULL; 147 const char *action_string; 148 149 /* 150 * The predictive failure action is the activation of the fault 151 * indicator. 152 */ 153 if (fmd_nvl_class_match(hdl, nvl, 154 DISK_ERROR_CLASS "." FM_FAULT_DISK_OVERTEMP)) 155 action_prop = DISK_PROP_OTEMPACTION; 156 157 if (fmd_nvl_class_match(hdl, nvl, 158 DISK_ERROR_CLASS "." FM_FAULT_DISK_TESTFAIL)) 159 action_prop = DISK_PROP_STFAILACTION; 160 161 dm_fault_indicator_set(diskp, INDICATOR_ON); 162 163 if (action_prop != NULL && 164 (action_string = dm_prop_lookup(diskp->props, action_prop)) 165 != NULL) { 166 167 if (dm_platform_indicator_execute(action_string) != 0) { 168 log_warn("Fault action `%s' did not successfully " 169 "complete.\n", action_string); 170 } 171 } 172 } 173 174 static void 175 diskmon_agent_repair(fmd_hdl_t *hdl, nvlist_t *nvl, int repair) 176 { 177 char *uuid = NULL; 178 nvlist_t **nva; 179 uint_t nvc; 180 diskmon_t *diskp; 181 nvlist_t *fmri; 182 nvlist_t *fltnvl; 183 int err = 0; 184 185 err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid); 186 err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 187 &nva, &nvc); 188 if (err != 0) 189 return; 190 191 while (nvc-- != 0) { 192 193 fltnvl = *nva++; 194 195 if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri) 196 != 0) 197 continue; 198 199 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL) 200 continue; 201 202 log_msg(MM_MAIN, "Disk %s repaired!\n", 203 diskp->location); 204 205 dm_fault_indicator_set(diskp, INDICATOR_OFF); 206 207 dm_state_change(diskp, HPS_REPAIRED); 208 } 209 210 if (repair) 211 fmd_case_uuresolved(hdl, uuid); 212 213 } 214 215 static void 216 diskmon_agent_suspect(fmd_hdl_t *hdl, nvlist_t *nvl) 217 { 218 char *uuid = NULL; 219 nvlist_t **nva; 220 uint_t nvc; 221 diskmon_t *diskp; 222 nvlist_t *fmri; 223 nvlist_t *fltnvl; 224 int err = 0; 225 226 err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid); 227 err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 228 &nva, &nvc); 229 if (err != 0) 230 return; 231 232 while (nvc-- != 0 && !fmd_case_uuclosed(hdl, uuid)) { 233 234 fltnvl = *nva++; 235 236 if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri) != 0) 237 continue; 238 239 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL) 240 continue; 241 242 /* Execute the actions associated with this fault */ 243 dm_fault_execute_actions(hdl, diskp, fltnvl); 244 245 /* 246 * Send a state change event to the state change manager 247 */ 248 dm_state_change(diskp, HPS_FAULTED); 249 } 250 251 if (!fmd_case_uuclosed(hdl, uuid)) { 252 /* Case is closed */ 253 fmd_case_uuclose(hdl, uuid); 254 } 255 } 256 257 /*ARGSUSED*/ 258 static void 259 diskmon_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) 260 { 261 diskmon_t *diskp; 262 nvlist_t *fmri; 263 264 if (g_verbose & MM_MAIN) 265 nvlist_print(stderr, nvl); 266 267 /* 268 * Act on the fault suspect list or repaired list (embedded agent 269 * action). 270 */ 271 if (fmd_nvl_class_match(hdl, nvl, FM_LIST_REPAIRED_CLASS)) { 272 273 diskmon_agent_repair(hdl, nvl, 1); 274 return; 275 276 } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_UPDATED_CLASS)) { 277 278 diskmon_agent_repair(hdl, nvl, 0); 279 return; 280 281 } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_SUSPECT_CLASS)) { 282 283 diskmon_agent_suspect(hdl, nvl); 284 return; 285 } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_RESOLVED_CLASS)) { 286 return; 287 } 288 289 /* 290 * If we get any replayed faults, set the diskmon's faulted 291 * flag for the appropriate fault, then change the diskmon's state 292 * to faulted. 293 */ 294 if (fmd_nvl_class_match(hdl, nvl, DISK_ERROR_CLASS ".*")) { 295 296 if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, 297 &fmri) != 0) 298 return; 299 300 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL) 301 return; 302 303 /* Execute the actions associated with this fault */ 304 dm_fault_execute_actions(hdl, diskp, nvl); 305 306 /* 307 * If the fault wasn't generated by this module, send a 308 * state change event to the state change manager 309 */ 310 dm_state_change(diskp, HPS_FAULTED); 311 return; 312 } 313 } 314 315 static const fmd_hdl_ops_t fmd_ops = { 316 diskmon_recv, /* fmdo_recv */ 317 NULL, /* fmdo_timeout */ 318 NULL, /* fmdo_close */ 319 NULL, /* fmdo_stats */ 320 NULL, /* fmdo_gc */ 321 }; 322 323 static const fmd_prop_t fmd_props[] = { 324 { GLOBAL_PROP_LOG_LEVEL, FMD_TYPE_UINT32, "0" }, 325 { NULL, 0, NULL } 326 }; 327 328 static const fmd_hdl_info_t fmd_info = { 329 "Disk Monitor", 330 DISK_MONITOR_MODULE_VERSION, 331 &fmd_ops, 332 fmd_props 333 }; 334 335 void 336 _fmd_init(fmd_hdl_t *hdl) 337 { 338 fmd_case_t *cp; 339 int disk_count; 340 341 g_fm_hdl = hdl; 342 343 if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) { 344 return; 345 } 346 347 if (config_init()) { 348 log_err("Could not initialize configuration!\n"); 349 fmd_hdl_unregister(hdl); 350 return; 351 } 352 353 if (config_get(hdl, fmd_props)) { 354 config_fini(); 355 log_err("Could not retrieve configuration from libtopo!\n"); 356 fmd_hdl_unregister(hdl); 357 return; 358 } 359 360 /* 361 * If there are no disks to monitor, bail out 362 */ 363 if ((disk_count = count_disks(config_data->disk_list)) == 0) { 364 config_fini(); 365 fmd_hdl_unregister(hdl); 366 return; 367 } 368 369 if (diskmon_init() == E_ERROR) { 370 config_fini(); 371 fmd_hdl_unregister(hdl); 372 return; 373 } 374 375 log_msg(MM_MAIN, "Monitoring %d disks.\n", disk_count); 376 377 /* 378 * Iterate over all active cases. 379 * Since we automatically solve all cases, these cases must have 380 * had the fault added, but the DE must have been interrupted 381 * before they were solved. 382 */ 383 for (cp = fmd_case_next(hdl, NULL); 384 cp != NULL; cp = fmd_case_next(hdl, cp)) { 385 386 if (!fmd_case_solved(hdl, cp)) 387 fmd_case_solve(hdl, cp); 388 } 389 } 390 391 /*ARGSUSED*/ 392 void 393 _fmd_fini(fmd_hdl_t *hdl) 394 { 395 diskmon_teardown_all(); 396 g_fm_hdl = NULL; 397 } 398