xref: /illumos-gate/usr/src/cmd/fm/modules/common/sensor-transport/sensor_transport.c (revision b210e77709da8e42dfe621e10ccf4be504206058)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2018, Joyent, Inc.
24  */
25 
26 #include <fm/fmd_api.h>
27 #include <fm/libtopo.h>
28 #include <fm/topo_hc.h>
29 #include <fm/topo_mod.h>
30 #include <fm/topo_method.h>
31 
32 #include <sys/fm/protocol.h>
33 #include <sys/systeminfo.h>
34 
35 #include <string.h>
36 
/*
 * Class of the ereport that st_check_component() posts when the sensors of
 * a monitored component indicate a failure.
 */
#define	ST_EREPORT_CLASS	"ereport.sensor.failure"
38 
/*
 * Per-FRU bookkeeping that is carried from one polling pass to the next.
 * Entries are kept on the singly-linked list headed by st_faults in
 * sensor_transport_t.
 */
typedef struct sensor_fault {
	/* next entry on the list, or NULL */
	struct sensor_fault	*sf_next;
	/* FRU FMRI in string form; used to look the entry up */
	char			*sf_fru;
	/* number of passes in which this FRU was seen faulted */
	uint32_t		sf_num_fails;
	/*
	 * Value of sf_faulted from the previous pass.  It is only refreshed
	 * once sf_num_fails has exceeded the tolerance.
	 */
	boolean_t		sf_last_faulted;
	/* a failure was reported for this FRU in the current pass */
	boolean_t		sf_faulted;
	/* the sensor failure method produced no result in the current pass */
	boolean_t		sf_unknown;
} sensor_fault_t;
47 
/*
 * Module-wide state.  A single instance is allocated in _fmd_init() and
 * attached to the fmd handle as its module-specific data.
 */
typedef struct sensor_transport {
	/* the fmd handle this module registered with */
	fmd_hdl_t	*st_hdl;
	/* transport on which sensor failure ereports are posted */
	fmd_xprt_t	*st_xprt;
	/* "interval" property: delay between polling passes */
	hrtime_t	st_interval;
	/* id returned by the most recent fmd_timer_install() */
	id_t		st_timer;
	/* head of the list of FRUs being tracked */
	sensor_fault_t	*st_faults;
	/* B_TRUE until the first polling pass has completed */
	boolean_t	st_first;
	/*
	 * The number of consecutive sensor readings indicating failure that
	 * we'll tolerate before sending an ereport.
	 */
	uint32_t	st_tolerance;
	/* parsed "spoof_sensor_state" property, or NULL if unset/invalid */
	nvlist_t	*st_spoofs;
} sensor_transport_t;
62 
/*
 * Statistics exported by this module.  The members are registered with fmd
 * as an array of fmd_stat_t in _fmd_init(), so the structure must contain
 * nothing but fmd_stat_t members.
 */
typedef struct st_stats {
	/* a node's resource or FRU FMRI could not be obtained or converted */
	fmd_stat_t st_bad_fmri;
	/* the topology walk could not be started or failed part-way */
	fmd_stat_t st_topo_errs;
	/* FRUs repaired because they were no longer reported as faulted */
	fmd_stat_t st_repairs;
} st_stats_t;

/*
 * The initializers are positional and must stay in the same order as the
 * members of st_stats_t.
 */
st_stats_t st_stats = {
	{ "bad_fmri", FMD_TYPE_UINT64, "bad or missing resource/FRU FMRI" },
	{ "topo_errors", FMD_TYPE_UINT64, "errors walking topology" },
	{ "repairs", FMD_TYPE_UINT64, "auto repairs" }
};
74 
/*
 * Used to limit the "Method ... not supported" debug message.
 * st_check_component_complaints counts how often the sensor failure method
 * turned out to be unsupported; once it is non-zero, st_timeout() bumps
 * have_complained at the start of its next pass, which silences the message
 * from then on.
 */
static int st_check_component_complaints;
static int have_complained;
/*
 * Value of the "spoof_sensor_state" property as fetched in _fmd_init();
 * released in _fmd_fini().
 */
static char *spoof_prop = NULL;
78 
79 static int
80 st_check_component(topo_hdl_t *thp, tnode_t *node, void *arg)
81 {
82 	sensor_transport_t *stp = arg;
83 	fmd_hdl_t *hdl = stp->st_hdl;
84 	const char *name = topo_node_name(node);
85 	nvlist_t *nvl, *props, *rsrc, *fru;
86 	char *fmri;
87 	int err, ret;
88 	int32_t last_source, source = -1;
89 	boolean_t nonrecov, faulted, predictive, source_diff, injected;
90 	nvpair_t *nvp;
91 	uint64_t ena;
92 	nvlist_t *event;
93 	sensor_fault_t *sfp, **current;
94 
95 	if (strcmp(name, FAN) != 0 && strcmp(name, PSU) != 0)
96 		return (0);
97 
98 	if (topo_node_resource(node, &rsrc, NULL) != 0) {
99 		st_stats.st_bad_fmri.fmds_value.ui64++;
100 		return (0);
101 	}
102 
103 	/*
104 	 * If the resource isn't present, don't bother invoking the sensor
105 	 * failure method.  It may be that the sensors aren't part of the same
106 	 * physical FRU and will report failure if the FRU is no longer there.
107 	 */
108 	if ((ret = topo_fmri_present(thp, rsrc, &err)) < 0) {
109 		fmd_hdl_debug(hdl, "topo_fmri_present() failed for %s=%d",
110 		    name, topo_node_instance(node));
111 		nvlist_free(rsrc);
112 		return (0);
113 	}
114 
115 	if (!ret) {
116 		fmd_hdl_debug(hdl, "%s=%d is not present, ignoring",
117 		    name, topo_node_instance(node));
118 		nvlist_free(rsrc);
119 		return (0);
120 	}
121 
122 	if (topo_method_invoke(node, TOPO_METH_SENSOR_FAILURE,
123 	    TOPO_METH_SENSOR_FAILURE_VERSION, stp->st_spoofs, &nvl, &err) !=
124 	    0) {
125 		if (err == ETOPO_METHOD_NOTSUP) {
126 			st_check_component_complaints++;
127 			if (!have_complained) {
128 				fmd_hdl_debug(hdl, "Method %s not supported "
129 				    "on %s=%d", TOPO_METH_SENSOR_FAILURE, name,
130 				    topo_node_instance(node));
131 			}
132 			nvlist_free(rsrc);
133 			return (0);
134 		}
135 		nvl = NULL;
136 	}
137 
138 	if (topo_node_fru(node, &fru, NULL, &err) != 0) {
139 		st_stats.st_bad_fmri.fmds_value.ui64++;
140 		nvlist_free(nvl);
141 		nvlist_free(rsrc);
142 		return (0);
143 	}
144 
145 	if (topo_fmri_nvl2str(thp, fru, &fmri, &err) != 0) {
146 		st_stats.st_bad_fmri.fmds_value.ui64++;
147 		nvlist_free(nvl);
148 		nvlist_free(fru);
149 		nvlist_free(rsrc);
150 		return (0);
151 	}
152 
153 	nvlist_free(fru);
154 
155 	faulted = nonrecov = source_diff = injected = B_FALSE;
156 	predictive = B_TRUE;
157 	if (nvl != NULL)  {
158 		nvp = NULL;
159 		while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
160 			if (nvpair_value_nvlist(nvp, &props) != 0)
161 				continue;
162 
163 			faulted = B_TRUE;
164 
165 			/*
166 			 * We need some simple rules to handle the case where
167 			 * there are multiple facility nodes that indicate
168 			 * a problem with this FRU, but disagree on the values
169 			 * of nonrecov, predictive or source:
170 			 *
171 			 * 1) nonrecov will be set to true if one or more
172 			 *   facility nodes indicates true.  Otherwise it will
173 			 *   default to false
174 			 *
175 			 * 2) predictive will default to false and remain false
176 			 *    if one or more facility nodes indicate false.
177 			 *
178 			 * 3) source will be set to unknown unless all facility
179 			 *    nodes agree on the source
180 			 *
181 			 * 4) injected defaults to false, but will be set to
182 			 *    true if any of the sensor states were injected.
183 			 */
184 			if (nonrecov == B_FALSE)
185 				if (nvlist_lookup_boolean_value(props,
186 				    "nonrecov", &nonrecov) != 0)
187 					nonrecov = B_FALSE;
188 			if (predictive == B_TRUE)
189 				if (nvlist_lookup_boolean_value(props,
190 				    "predictive", &predictive) != 0)
191 					predictive = B_FALSE;
192 			(void) nvlist_lookup_boolean_value(props,
193 			    "injected", &injected);
194 
195 			last_source = source;
196 			if (nvlist_lookup_uint32(props, "source",
197 			    (uint32_t *)&source) != 0)
198 				source = TOPO_SENSOR_ERRSRC_UNKNOWN;
199 			if (last_source != -1 && last_source != source)
200 				source_diff = B_TRUE;
201 		}
202 		if (source_diff)
203 			source = TOPO_SENSOR_ERRSRC_UNKNOWN;
204 	}
205 
206 	/*
207 	 * See if we know about this fru.
208 	 */
209 	for (current = &stp->st_faults; *current != NULL;
210 	    current = &(*current)->sf_next) {
211 		if (topo_fmri_strcmp(thp, fmri,
212 		    (*current)->sf_fru))
213 			break;
214 	}
215 
216 	sfp = *current;
217 	if (sfp == NULL) {
218 		/*
219 		 * We add this FRU to our list under two circumstances:
220 		 *
221 		 *	1. This FRU is faulted and needs to be remembered to
222 		 *	   avoid duplicate ereports.
223 		 *
224 		 *	2. This is the initial pass, and we want to repair the
225 		 *	   FRU in case it was repaired while we were offline.
226 		 */
227 		if (stp->st_first || faulted) {
228 			sfp = fmd_hdl_zalloc(hdl, sizeof (sensor_fault_t),
229 			    FMD_SLEEP);
230 			sfp->sf_fru = fmd_hdl_strdup(hdl, fmri, FMD_SLEEP);
231 			sfp->sf_next = stp->st_faults;
232 			stp->st_faults = sfp;
233 		} else {
234 			goto out;
235 		}
236 	}
237 
238 	if (faulted)
239 		sfp->sf_num_fails++;
240 
241 	if (nvl == NULL)
242 		sfp->sf_unknown = B_TRUE;
243 
244 	if (faulted) {
245 		/*
246 		 * Construct and post the ereport.
247 		 *
248 		 * XXFM we only post one ereport per fru.  It should be possible
249 		 * to uniquely identify faulty resources instead and post one
250 		 * per resource, even if they share the same FRU.
251 		 */
252 		if (!sfp->sf_last_faulted &&
253 		    (sfp->sf_num_fails > stp->st_tolerance)) {
254 			ena = fmd_event_ena_create(hdl);
255 			event = fmd_nvl_alloc(hdl, FMD_SLEEP);
256 
257 			(void) nvlist_add_string(event, "type", name);
258 			(void) nvlist_add_boolean_value(event, "nonrecov",
259 			    nonrecov);
260 			(void) nvlist_add_boolean_value(event, "predictive",
261 			    predictive);
262 			(void) nvlist_add_uint32(event, "source",
263 			    (uint32_t)source);
264 			(void) nvlist_add_nvlist(event, "details", nvl);
265 			(void) nvlist_add_string(event, FM_CLASS,
266 			    ST_EREPORT_CLASS);
267 			(void) nvlist_add_uint8(event, FM_VERSION,
268 			    FM_EREPORT_VERSION);
269 			(void) nvlist_add_uint64(event, FM_EREPORT_ENA, ena);
270 			(void) nvlist_add_nvlist(event, FM_EREPORT_DETECTOR,
271 			    rsrc);
272 			(void) nvlist_add_boolean_value(event, "__injected",
273 			    injected);
274 			fmd_xprt_post(hdl, stp->st_xprt, event, 0);
275 			fmd_hdl_debug(hdl, "posted ereport: %s",
276 			    ST_EREPORT_CLASS);
277 		}
278 
279 		sfp->sf_faulted = B_TRUE;
280 	}
281 
282 out:
283 	topo_hdl_strfree(thp, fmri);
284 	nvlist_free(rsrc);
285 	nvlist_free(nvl);
286 	return (0);
287 }
288 
/*
 * When non-zero, st_timeout() emits a debug message at the start of every
 * polling pass.
 */
int st_timeout_verbose = 0;
290 
291 /*ARGSUSED*/
292 static void
293 st_timeout(fmd_hdl_t *hdl, id_t id, void *data)
294 {
295 	sensor_transport_t *stp;
296 	sensor_fault_t *sfp, **current;
297 	topo_hdl_t *thp;
298 	topo_walk_t *twp;
299 	int err;
300 
301 	if (st_timeout_verbose)
302 		fmd_hdl_debug(hdl, "timeout: checking topology");
303 
304 	stp = fmd_hdl_getspecific(hdl);
305 	thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION);
306 
307 	if ((twp = topo_walk_init(thp, FM_FMRI_SCHEME_HC, st_check_component,
308 	    stp, &err)) == NULL) {
309 		fmd_hdl_topo_rele(hdl, thp);
310 		fmd_hdl_error(hdl, "failed to walk topology: %s\n",
311 		    topo_strerror(err));
312 		st_stats.st_topo_errs.fmds_value.ui64++;
313 		return;
314 	}
315 
316 	if (st_check_component_complaints)
317 		have_complained++;
318 
319 	/*
320 	 * Initialize values in our internal FRU list for this iteration of
321 	 * sensor reads.  Keep track of whether the FRU was faulted in the
322 	 * previous pass so we don't send multiple ereports for the same
323 	 * problem.
324 	 */
325 	for (sfp = stp->st_faults; sfp != NULL; sfp = sfp->sf_next) {
326 		sfp->sf_unknown = B_FALSE;
327 		if (sfp->sf_num_fails > stp->st_tolerance)
328 			sfp->sf_last_faulted = sfp->sf_faulted;
329 		sfp->sf_faulted = B_FALSE;
330 	}
331 
332 	if (topo_walk_step(twp, TOPO_WALK_CHILD) == TOPO_WALK_ERR) {
333 		topo_walk_fini(twp);
334 		fmd_hdl_topo_rele(hdl, thp);
335 		fmd_hdl_error(hdl, "failed to walk topology\n");
336 		st_stats.st_topo_errs.fmds_value.ui64++;
337 		return;
338 	}
339 
340 	/*
341 	 * Remove any faults that weren't seen in the last pass.
342 	 */
343 	for (current = &stp->st_faults; *current != NULL; ) {
344 		sfp = *current;
345 		if (!sfp->sf_faulted && !sfp->sf_unknown) {
346 			fmd_hdl_debug(hdl, "repairing %s", sfp->sf_fru);
347 			fmd_repair_fru(hdl, sfp->sf_fru);
348 			st_stats.st_repairs.fmds_value.ui64++;
349 			*current = sfp->sf_next;
350 			fmd_hdl_strfree(hdl, sfp->sf_fru);
351 			fmd_hdl_free(hdl, sfp, sizeof (sensor_fault_t));
352 		} else {
353 			current = &sfp->sf_next;
354 		}
355 	}
356 
357 	stp->st_first = B_FALSE;
358 	topo_walk_fini(twp);
359 	fmd_hdl_topo_rele(hdl, thp);
360 
361 	stp->st_timer = fmd_timer_install(hdl, NULL, NULL, stp->st_interval);
362 }
363 
364 /*
365  * Parse the value of the spoof-sensor-state module property and store the
366  * result in an nvlist of nvlists.  The format of the value is 3-tuple,
367  * delimited by colons, as follows:
368  *
369  * FMRIPATTERN:SENSORNAME:SENSORSTATE;...
370  *
371  * where FMRIPATTERN can be a string with wildcards that matches the FMRI
372  * of a node associated with the target sensor facility.
373  *
374  * where SENSORNAME is the node name of the target sensor facility
375  *
376  * where SENSORSTATE is the desired sensor state value to spoof.
377  *
378  * Multiple tuples can be specifed, delimited by semicolons.
379  *
380  * If any errors are encountered while parsing the value, all parsing is
381  * ceased and an ereport will be generated indicating a failure to parse
382  * the value.
383  */
384 /*ARGSUSED*/
385 static int
386 parse_spoof_param(fmd_hdl_t *hdl, char *param, sensor_transport_t *stp)
387 {
388 	char *sensor, *last_sensor, *field, *last_field;
389 	nvlist_t *spoof;
390 
391 	if (nvlist_alloc(&stp->st_spoofs, NV_UNIQUE_NAME, 0) != 0) {
392 		return (-1);
393 	}
394 
395 	sensor = strtok_r(param, ";", &last_sensor);
396 	while (sensor != NULL) {
397 		if (nvlist_alloc(&spoof, NV_UNIQUE_NAME, 0) != 0)
398 			goto err;
399 
400 		if ((field = strtok_r(sensor, ":", &last_field)) == NULL ||
401 		    nvlist_add_string(spoof, ST_SPOOF_FMRI, field) != 0)
402 			goto err;
403 
404 		if ((field = strtok_r(NULL, ":", &last_field)) == NULL ||
405 		    nvlist_add_string(spoof, ST_SPOOF_SENSOR, field) != 0)
406 			goto err;
407 
408 		if ((field = strtok_r(NULL, ":", &last_field)) == NULL ||
409 		    nvlist_add_uint32(spoof, ST_SPOOF_STATE,
410 		    strtol(field, NULL, 0)) != 0)
411 			goto err;
412 
413 		if (nvlist_add_nvlist(stp->st_spoofs, sensor, spoof) != 0)
414 			goto err;
415 
416 		spoof = NULL;
417 		sensor = strtok_r(NULL, ";", &last_sensor);
418 	}
419 
420 	return (0);
421 err:
422 	nvlist_free(spoof);
423 	nvlist_free(stp->st_spoofs);
424 	stp->st_spoofs = NULL;
425 	return (-1);
426 }
427 
/*
 * Module properties and their default values.  The table is terminated by
 * an all-NULL entry.
 */
static const fmd_prop_t fmd_props[] = {
	/* delay between two polling passes */
	{ "interval", FMD_TYPE_TIME, "1min" },
	/* failing reads tolerated before an ereport is sent */
	{ "tolerance", FMD_TYPE_UINT32, "1" },
	/* sensor states to spoof; see parse_spoof_param() for the format */
	{ "spoof_sensor_state", FMD_TYPE_STRING, NULL },
	{ NULL, 0, NULL }
};
434 
/*
 * Entry points offered to fmd.  This module is driven purely by its timer,
 * so the timeout handler is the only one supplied.
 */
static const fmd_hdl_ops_t fmd_ops = {
	NULL,			/* fmdo_recv */
	st_timeout,		/* fmdo_timeout */
	NULL,			/* fmdo_close */
	NULL,			/* fmdo_stats */
	NULL,			/* fmdo_gc */
	NULL,			/* fmdo_send */
	NULL			/* fmdo_topo */
};
444 
/*
 * Registration record passed to fmd_hdl_register() in _fmd_init(); it ties
 * together the entry points and the property table of this module.
 */
static const fmd_hdl_info_t fmd_info = {
	"Sensor Transport Agent", "1.1", &fmd_ops, fmd_props
};
448 
449 void
450 _fmd_init(fmd_hdl_t *hdl)
451 {
452 	sensor_transport_t *stp;
453 	char buf[SYS_NMLN];
454 
455 	/*
456 	 * The sensor-transport module is currently only supported on x86
457 	 * platforms.  So to avoid unnecessarily wasting cpu cycles on sparc
458 	 * walking the hc scheme tree every 60 seconds, we'll bail out before
459 	 * registering the handle.
460 	 */
461 	if ((sysinfo(SI_ARCHITECTURE, buf, sizeof (buf)) == -1) ||
462 	    (strcmp(buf, "i386") != 0))
463 		return;
464 
465 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0)
466 		return;
467 
468 	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC,
469 	    sizeof (st_stats) / sizeof (fmd_stat_t),
470 	    (fmd_stat_t *)&st_stats);
471 
472 	stp = fmd_hdl_zalloc(hdl, sizeof (sensor_transport_t), FMD_SLEEP);
473 	stp->st_interval = fmd_prop_get_int64(hdl, "interval");
474 	stp->st_tolerance = fmd_prop_get_int32(hdl, "tolerance");
475 	spoof_prop = fmd_prop_get_string(hdl, "spoof_sensor_state");
476 
477 	if (spoof_prop != NULL && parse_spoof_param(hdl, spoof_prop, stp) != 0)
478 		fmd_hdl_error(hdl, "Error parsing config file");
479 
480 	fmd_hdl_setspecific(hdl, stp);
481 
482 	stp->st_xprt = fmd_xprt_open(hdl, FMD_XPRT_RDONLY, NULL, NULL);
483 	stp->st_hdl = hdl;
484 	stp->st_first = B_TRUE;
485 
486 	/* kick off the first asynchronous discovery */
487 	stp->st_timer = fmd_timer_install(hdl, NULL, NULL, 0);
488 }
489 
490 void
491 _fmd_fini(fmd_hdl_t *hdl)
492 {
493 	sensor_transport_t *stp;
494 	sensor_fault_t *sfp;
495 
496 	stp = fmd_hdl_getspecific(hdl);
497 	if (stp != NULL) {
498 		fmd_xprt_close(hdl, stp->st_xprt);
499 
500 		while ((sfp = stp->st_faults) != NULL) {
501 			stp->st_faults = sfp->sf_next;
502 
503 			fmd_hdl_strfree(hdl, sfp->sf_fru);
504 			fmd_hdl_free(hdl, sfp, sizeof (sensor_fault_t));
505 		}
506 		nvlist_free(stp->st_spoofs);
507 		fmd_hdl_free(hdl, stp, sizeof (sensor_transport_t));
508 	}
509 	fmd_prop_free_string(hdl, spoof_prop);
510 }
511