xref: /illumos-gate/usr/src/cmd/fm/modules/common/disk-lights/disk_lights.c (revision 00277c9e43668ff248a12ee635ce125957750373)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
14  */
15 
16 /*
17  * Disk Lights Agent (FMA)
18  *
19  * This Fault Management Daemon (fmd) module periodically scans the topology
20  * tree, enumerates all disks with associated fault indicators, and then
21  * synchronises the fault status of resources in the FMA Resource Cache with
22  * the indicators.  In short: it turns the fault light on for befallen disks.
23  *
24  * Presently, we recognise associated fault indicators for disks by looking
25  * for the following structure in the topology tree:
26  *
27  *    /bay=N
28  *      |
29  *      +---- /disk=0   <---------------- our Disk
30  *      |
31  *      +---- /bay=N?indicator=fail <---- the Fault Light
32  *      \---- /bay=N?indicator=ident
33  *
34  * That is: a DISK node will have a parent BAY; that BAY will itself have
35  * child Facility nodes, one of which will be called "fail".  If any of the
36  * above does not hold, we simply do nothing for this disk.
37  */
38 
39 #include <string.h>
40 #include <strings.h>
41 #include <libnvpair.h>
42 #include <fm/libtopo.h>
43 #include <fm/topo_list.h>
44 #include <fm/topo_hc.h>
45 #include <fm/fmd_api.h>
46 #include <sys/fm/protocol.h>
47 
48 
49 typedef struct disk_lights {
50 	fmd_hdl_t *dl_fmd;
51 	uint64_t dl_poll_interval;
52 	uint64_t dl_coalesce_interval;
53 	id_t dl_timer;
54 	boolean_t dl_triggered;
55 } disk_lights_t;
56 
57 static void disklights_topo(fmd_hdl_t *, topo_hdl_t *);
58 static void disklights_recv(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
59     const char *);
60 static void disklights_timeout(fmd_hdl_t *, id_t, void *);
61 
62 static const fmd_hdl_ops_t fmd_ops = {
63 	disklights_recv,	/* fmdo_recv */
64 	disklights_timeout,	/* fmdo_timeout */
65 	NULL,			/* fmdo_close */
66 	NULL,			/* fmdo_stats */
67 	NULL,			/* fmdo_gc */
68 	NULL,			/* fmdo_send */
69 	disklights_topo,	/* fmdo_topo */
70 };
71 
72 /*
73  * POLL_INTERVAL is the period after which we perform an unsolicited poll
74  * to ensure we remain in sync with reality.
75  */
76 #define	DL_PROP_POLL_INTERVAL		"poll-interval"
77 
78 /*
79  * COALESCE_INTERVAL is how long we wait after we are trigged by either a
80  * topology change or a relevant list.* event, in order to allow a series
81  * of events to coalesce.
82  */
83 #define	DL_PROP_COALESCE_INTERVAL	"coalesce-interval"
84 
85 static const fmd_prop_t fmd_props[] = {
86 	{ DL_PROP_POLL_INTERVAL, FMD_TYPE_TIME, "5min" },
87 	{ DL_PROP_COALESCE_INTERVAL, FMD_TYPE_TIME, "3s" },
88 	{ NULL, 0, NULL }
89 };
90 
91 static const fmd_hdl_info_t fmd_info = {
92 	"Disk Lights Agent",
93 	"1.0",
94 	&fmd_ops,
95 	fmd_props
96 };
97 
98 /*
99  * Fetch the Facility Node properties (name, type) from the FMRI
100  * for this node, or return -1 if we can't.
101  */
102 static int
103 get_facility_props(topo_hdl_t *hdl, tnode_t *node, char **facname,
104     char **factype)
105 {
106 	int e, ret = -1;
107 	nvlist_t *fmri = NULL, *fnvl;
108 	char *nn = NULL, *tt = NULL;
109 
110 	if (topo_node_resource(node, &fmri, &e) != 0)
111 		goto out;
112 
113 	if (nvlist_lookup_nvlist(fmri, FM_FMRI_FACILITY, &fnvl) != 0)
114 		goto out;
115 
116 	if (nvlist_lookup_string(fnvl, FM_FMRI_FACILITY_NAME, &nn) != 0)
117 		goto out;
118 
119 	if (nvlist_lookup_string(fnvl, FM_FMRI_FACILITY_TYPE, &tt) != 0)
120 		goto out;
121 
122 	*facname = topo_hdl_strdup(hdl, nn);
123 	*factype = topo_hdl_strdup(hdl, tt);
124 	ret = 0;
125 
126 out:
127 	nvlist_free(fmri);
128 	return (ret);
129 }
130 
/*
 * Argument handed to dl_fault_walk_inner() for each child of a BAY node.
 */
typedef struct dl_fault_walk_inner {
	/* Facility name to match against, e.g. "fail". */
	char *fwi_name;

	/* LED mode to apply to the matching facility node. */
	uint32_t fwi_mode;
} dl_fault_walk_inner_t;
135 
136 static int
137 dl_fault_walk_inner(topo_hdl_t *thp, tnode_t *node, void *arg)
138 {
139 	dl_fault_walk_inner_t *fwi = arg;
140 	char *facname = NULL, *factype = NULL;
141 	int err;
142 
143 	/*
144 	 * We're only interested in BAY children that are valid Facility Nodes.
145 	 */
146 	if (topo_node_flags(node) != TOPO_NODE_FACILITY ||
147 	    get_facility_props(thp, node, &facname, &factype) != 0) {
148 		goto out;
149 	}
150 
151 	if (strcmp(fwi->fwi_name, facname) != 0)
152 		goto out;
153 
154 	/*
155 	 * Attempt to set the LED mode appropriately.  If this fails, give up
156 	 * and move on.
157 	 */
158 	(void) topo_prop_set_uint32(node, TOPO_PGROUP_FACILITY, TOPO_LED_MODE,
159 	    TOPO_PROP_MUTABLE, fwi->fwi_mode, &err);
160 
161 out:
162 	topo_hdl_strfree(thp, facname);
163 	topo_hdl_strfree(thp, factype);
164 	return (TOPO_WALK_NEXT);
165 }
166 
167 static int
168 dl_fault_walk_outer(topo_hdl_t *thp, tnode_t *node, void *arg)
169 {
170 	disk_lights_t *dl = arg;
171 	dl_fault_walk_inner_t fwi;
172 	tnode_t *pnode;
173 	int err, has_fault;
174 	nvlist_t *fmri = NULL;
175 
176 	bzero(&fwi, sizeof (fwi));
177 
178 	/*
179 	 * We are only looking for DISK nodes in the topology that have a parent
180 	 * BAY.
181 	 */
182 	if (strcmp(DISK, topo_node_name(node)) != 0 ||
183 	    (pnode = topo_node_parent(node)) == NULL ||
184 	    strcmp(BAY, topo_node_name(pnode)) != 0) {
185 		return (TOPO_WALK_NEXT);
186 	}
187 
188 	/*
189 	 * Check to see if the Resource this FMRI describes is Faulty:
190 	 */
191 	if (topo_node_resource(node, &fmri, &err) != 0)
192 		return (TOPO_WALK_NEXT);
193 	has_fault = fmd_nvl_fmri_has_fault(dl->dl_fmd, fmri,
194 	    FMD_HAS_FAULT_RESOURCE, NULL);
195 	nvlist_free(fmri);
196 
197 	/*
198 	 * Walk the children of this BAY and flush out our fault status if
199 	 * we find an appropriate indicator node.
200 	 */
201 	fwi.fwi_name = "fail";
202 	fwi.fwi_mode = has_fault ? TOPO_LED_STATE_ON : TOPO_LED_STATE_OFF;
203 	(void) topo_node_child_walk(thp, pnode, dl_fault_walk_inner, &fwi,
204 	    &err);
205 
206 	return (TOPO_WALK_NEXT);
207 }
208 
209 /*
210  * Walk all of the topology nodes looking for DISKs that match the structure
211  * described in the overview.  Once we find them, check their fault status
212  * and update their fault indiciator accordingly.
213  */
214 static void
215 dl_examine_topo(disk_lights_t *dl)
216 {
217 	int err;
218 	topo_hdl_t *thp = NULL;
219 	topo_walk_t *twp = NULL;
220 
221 	thp = fmd_hdl_topo_hold(dl->dl_fmd, TOPO_VERSION);
222 	if ((twp = topo_walk_init(thp, FM_FMRI_SCHEME_HC, dl_fault_walk_outer,
223 	    dl, &err)) == NULL) {
224 		fmd_hdl_error(dl->dl_fmd, "failed to get topology: %s\n",
225 		    topo_strerror(err));
226 		goto out;
227 	}
228 
229 	if (topo_walk_step(twp, TOPO_WALK_CHILD) == TOPO_WALK_ERR) {
230 		fmd_hdl_error(dl->dl_fmd, "failed to walk topology: %s\n",
231 		    topo_strerror(err));
232 		goto out;
233 	}
234 
235 out:
236 	if (twp != NULL)
237 		topo_walk_fini(twp);
238 	if (thp != NULL)
239 		fmd_hdl_topo_rele(dl->dl_fmd, thp);
240 }
241 
242 static void
243 dl_trigger_enum(disk_lights_t *dl)
244 {
245 	/*
246 	 * If we're already on the short-poll coalesce timer, then return
247 	 * immediately.
248 	 */
249 	if (dl->dl_triggered == B_TRUE)
250 		return;
251 	dl->dl_triggered = B_TRUE;
252 
253 	/*
254 	 * Replace existing poll timer with coalesce timer:
255 	 */
256 	if (dl->dl_timer != 0)
257 		fmd_timer_remove(dl->dl_fmd, dl->dl_timer);
258 	dl->dl_timer = fmd_timer_install(dl->dl_fmd, NULL, NULL,
259 	    dl->dl_coalesce_interval);
260 }
261 
262 /*ARGSUSED*/
263 static void
264 disklights_timeout(fmd_hdl_t *hdl, id_t id, void *data)
265 {
266 	disk_lights_t *dl = fmd_hdl_getspecific(hdl);
267 
268 	dl->dl_triggered = B_FALSE;
269 
270 	dl_examine_topo(dl);
271 
272 	/*
273 	 * Install the long-interval timer for the next poll.
274 	 */
275 	dl->dl_timer = fmd_timer_install(hdl, NULL, NULL, dl->dl_poll_interval);
276 }
277 
278 /*ARGSUSED*/
279 static void
280 disklights_topo(fmd_hdl_t *hdl, topo_hdl_t *thp)
281 {
282 	disk_lights_t *dl = fmd_hdl_getspecific(hdl);
283 
284 	dl_trigger_enum(dl);
285 }
286 
287 /*ARGSUSED*/
288 static void
289 disklights_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
290     const char *class)
291 {
292 	disk_lights_t *dl = fmd_hdl_getspecific(hdl);
293 
294 	dl_trigger_enum(dl);
295 }
296 
297 void
298 _fmd_init(fmd_hdl_t *hdl)
299 {
300 	disk_lights_t *dl;
301 
302 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0)
303 		return;
304 
305 	dl = fmd_hdl_zalloc(hdl, sizeof (*dl), FMD_SLEEP);
306 	fmd_hdl_setspecific(hdl, dl);
307 
308 	/*
309 	 * Load Configuration:
310 	 */
311 	dl->dl_fmd = hdl;
312 	dl->dl_poll_interval = fmd_prop_get_int64(hdl, DL_PROP_POLL_INTERVAL);
313 	dl->dl_coalesce_interval = fmd_prop_get_int64(hdl,
314 	    DL_PROP_COALESCE_INTERVAL);
315 
316 	/*
317 	 * Schedule the initial enumeration:
318 	 */
319 	dl_trigger_enum(dl);
320 }
321 
322 void
323 _fmd_fini(fmd_hdl_t *hdl)
324 {
325 	disk_lights_t *dl = fmd_hdl_getspecific(hdl);
326 
327 	fmd_hdl_free(hdl, dl, sizeof (*dl));
328 }
329