xref: /titanic_50/usr/src/cmd/fm/fmd/common/fmd_sysevent.c (revision 3d09a4fec6be19a6f09e277d5d5d17942bb4abf4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/sysevent/eventdefs.h>
28 #include <sys/sysevent.h>
29 #include <sys/sysevent_impl.h>
30 #include <sys/fm/protocol.h>
31 #include <sys/sysmacros.h>
32 #include <sys/dumphdr.h>
33 #include <sys/dumpadm.h>
34 #include <sys/fm/util.h>
35 
36 #include <libsysevent.h>
37 #include <libnvpair.h>
38 #include <alloca.h>
39 #include <limits.h>
40 #include <strings.h>
41 #include <unistd.h>
42 #include <fcntl.h>
43 #include <errno.h>
44 
45 #undef MUTEX_HELD
46 #undef RW_READ_HELD
47 #undef RW_WRITE_HELD
48 
49 #include <fmd_api.h>
50 #include <fmd_log.h>
51 #include <fmd_subr.h>
52 #include <fmd_dispq.h>
53 #include <fmd_dr.h>
54 #include <fmd_module.h>
55 #include <fmd_protocol.h>
56 #include <fmd_scheme.h>
57 #include <fmd_error.h>
58 
59 #include <fmd.h>
60 
61 static char *sysev_channel;	/* event channel to which we are subscribed */
62 static char *sysev_class;	/* event class to which we are subscribed */
63 static char *sysev_device;	/* device path to use for replaying events */
64 static char *sysev_sid;		/* event channel subscriber identifier */
65 static void *sysev_evc;		/* event channel cookie from evc_bind */
66 
67 static fmd_xprt_t *sysev_xprt;
68 static int sysev_xprt_refcnt;
69 static fmd_hdl_t *sysev_hdl;
70 
71 static struct sysev_stats {
72 	fmd_stat_t dump_replay;
73 	fmd_stat_t dump_lost;
74 	fmd_stat_t bad_class;
75 	fmd_stat_t bad_attr;
76 	fmd_stat_t eagain;
77 } sysev_stats = {
78 	{ "dump_replay", FMD_TYPE_UINT64, "events replayed from dump device" },
79 	{ "dump_lost", FMD_TYPE_UINT64, "events lost from dump device" },
80 	{ "bad_class", FMD_TYPE_UINT64, "events dropped due to invalid class" },
81 	{ "bad_attr", FMD_TYPE_UINT64, "events dropped due to invalid nvlist" },
82 	{ "eagain", FMD_TYPE_UINT64, "events retried due to low memory" },
83 };
84 
/*
 * Synchronization state shared by sysev_recv(), sysev_replay() and
 * sysev_fini().  sysev_mutex protects the two flags below as well as
 * sysev_xprt_refcnt; sysev_cv is broadcast whenever one of them changes in a
 * way a waiter cares about.
 */
static pthread_mutex_t sysev_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t sysev_cv = PTHREAD_COND_INITIALIZER;
static int sysev_exiting;		/* set to 1 by sysev_fini() */
static int sysev_replay_wait = 1;	/* cleared when sysev_replay() ends */
89 
90 /*
91  * Entry point for legacy sysevents.  This function is responsible for two
92  * things: passing off interesting events to the DR handler, and converting
93  * sysevents into resource events that modules can then subscribe to.
94  */
95 static void
96 sysev_legacy(sysevent_t *sep)
97 {
98 	const char *class = sysevent_get_class_name(sep);
99 	const char *subclass = sysevent_get_subclass_name(sep);
100 	char *fullclass;
101 	size_t len;
102 	nvlist_t *attr, *nvl;
103 	fmd_event_t *e;
104 	hrtime_t hrt;
105 
106 	/* notify the DR subsystem of the event */
107 	fmd_dr_event(sep);
108 
109 	/* get the matching sysevent name */
110 	len = snprintf(NULL, 0, "%s%s.%s", SYSEVENT_RSRC_CLASS,
111 	    class, subclass);
112 	fullclass = alloca(len + 1);
113 	(void) snprintf(fullclass, len + 1, "%s%s.%s",
114 	    SYSEVENT_RSRC_CLASS, class, subclass);
115 
116 	/* construct the event payload */
117 	(void) nvlist_xalloc(&nvl, NV_UNIQUE_NAME, &fmd.d_nva);
118 	(void) nvlist_add_string(nvl, FM_CLASS, fullclass);
119 	(void) nvlist_add_uint8(nvl, FM_VERSION, FM_RSRC_VERSION);
120 	if (sysevent_get_attr_list(sep, &attr) == 0) {
121 		(void) nvlist_merge(nvl, attr, 0);
122 		nvlist_free(attr);
123 	}
124 
125 	/*
126 	 * Dispatch the event.  Ideally, we'd like to use the same transport
127 	 * interface as sysev_recv(), but because the legacy sysevent mechanism
128 	 * puts in a thread outside fmd's control, using the module APIs is
129 	 * impossible.
130 	 */
131 	sysevent_get_time(sep, &hrt);
132 	(void) nvlist_lookup_string(nvl, FM_CLASS, &fullclass);
133 	e = fmd_event_create(FMD_EVT_PROTOCOL, hrt, nvl, fullclass);
134 	fmd_dispq_dispatch(fmd.d_disp, e, fullclass);
135 }
136 
137 /*
138  * Receive an event from the SysEvent channel and post it to our transport.
139  * Under extreme low-memory situations where we cannot event unpack the event,
140  * we can request that SysEvent redeliver the event later by returning EAGAIN.
141  * If we do this too many times, the kernel will drop the event.  Rather than
142  * keeping state per-event, we simply attempt a garbage-collect, hoping that
143  * enough free memory will be available by the time the event is redelivered.
144  */
145 static int
146 sysev_recv(sysevent_t *sep, void *arg)
147 {
148 	uint64_t seq = sysevent_get_seq(sep);
149 	fmd_xprt_t *xp = arg;
150 	nvlist_t *nvl;
151 	hrtime_t hrt;
152 	int rc = 0;
153 
154 	(void) pthread_mutex_lock(&sysev_mutex);
155 	if (sysev_exiting == 1) {
156 		while (sysev_xprt_refcnt > 0)
157 			(void) pthread_cond_wait(&sysev_cv, &sysev_mutex);
158 		(void) pthread_mutex_unlock(&sysev_mutex);
159 		return (EAGAIN);
160 	}
161 	sysev_xprt_refcnt++;
162 	while (sysev_replay_wait)
163 		(void) pthread_cond_wait(&sysev_cv, &sysev_mutex);
164 	(void) pthread_mutex_unlock(&sysev_mutex);
165 
166 	if (strcmp(sysevent_get_class_name(sep), EC_FM) != 0) {
167 		fmd_hdl_error(sysev_hdl, "discarding event 0x%llx: unexpected"
168 		    " transport class %s\n", seq, sysevent_get_class_name(sep));
169 		sysev_stats.bad_class.fmds_value.ui64++;
170 	} else if (sysevent_get_attr_list(sep, &nvl) != 0) {
171 		if (errno == EAGAIN || errno == ENOMEM) {
172 			fmd_modhash_tryapply(fmd.d_mod_hash, fmd_module_trygc);
173 			fmd_scheme_hash_trygc(fmd.d_schemes);
174 			sysev_stats.eagain.fmds_value.ui64++;
175 			rc = EAGAIN;
176 		} else {
177 			fmd_hdl_error(sysev_hdl, "discarding event 0x%llx: "
178 			    "missing or invalid payload", seq);
179 			sysev_stats.bad_attr.fmds_value.ui64++;
180 		}
181 	} else {
182 		sysevent_get_time(sep, &hrt);
183 		fmd_xprt_post(sysev_hdl, xp, nvl, hrt);
184 	}
185 
186 	(void) pthread_mutex_lock(&sysev_mutex);
187 	if (--sysev_xprt_refcnt == 0 && sysev_exiting == 1)
188 		(void) pthread_cond_broadcast(&sysev_cv);
189 	(void) pthread_mutex_unlock(&sysev_mutex);
190 
191 	return (rc);
192 }
193 
/*
 * Compute the 32-bit checksum the dump transport uses to validate error
 * reports saved on the dump device (a copy of the kernel's checksum32()).
 * For every input byte the running sum is rotated right by one bit and the
 * byte is added to it; arithmetic wraps modulo 2^32.  A zero-length buffer
 * yields 0.
 */
static uint32_t
sysev_checksum(void *cp_arg, size_t length)
{
	const unsigned char *bytes = cp_arg;
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i < length; i++) {
		sum = (sum >> 1) | (sum << 31);	/* rotate right one bit */
		sum += bytes[i];
	}

	return (sum);
}
209 
210 /*
211  * Replay saved events from the dump transport.  This function is installed as
212  * the timer callback and is called only once during the module's lifetime.
213  */
214 /*ARGSUSED*/
215 static void
216 sysev_replay(fmd_hdl_t *hdl, id_t id, void *arg)
217 {
218 	char *dumpdev;
219 	off64_t off, off0;
220 	int fd, err;
221 
222 	/*
223 	 * Determine the appropriate dump device to use for replaying pending
224 	 * error reports.  If the device property is NULL (default), we
225 	 * open and query /dev/dump to determine the current dump device.
226 	 */
227 	if ((dumpdev = sysev_device) == NULL) {
228 		if ((fd = open("/dev/dump", O_RDONLY)) == -1) {
229 			fmd_hdl_error(hdl, "failed to open /dev/dump "
230 			    "to locate dump device for event replay");
231 			goto done;
232 		}
233 
234 		dumpdev = alloca(PATH_MAX);
235 		err = ioctl(fd, DIOCGETDEV, dumpdev);
236 		(void) close(fd);
237 
238 		if (err == -1) {
239 			if (errno != ENODEV) {
240 				fmd_hdl_error(hdl, "failed to obtain "
241 				    "path to dump device for event replay");
242 			}
243 			goto done;
244 		}
245 	}
246 
247 	if (strcmp(dumpdev, "/dev/null") == 0)
248 		goto done; /* return silently and skip replay for /dev/null */
249 
250 	/*
251 	 * Open the appropriate device and then determine the offset of the
252 	 * start of the ereport dump region located at the end of the device.
253 	 */
254 	if ((fd = open64(dumpdev, O_RDWR | O_DSYNC)) == -1) {
255 		fmd_hdl_error(hdl, "failed to open dump transport %s "
256 		    "(pending events will not be replayed)", dumpdev);
257 		goto done;
258 	}
259 
260 	off = DUMP_OFFSET + DUMP_LOGSIZE + DUMP_ERPTSIZE;
261 	off = off0 = lseek64(fd, -off, SEEK_END) & -DUMP_OFFSET;
262 
263 	if (off == (off64_t)-1LL) {
264 		fmd_hdl_error(hdl, "failed to seek dump transport %s "
265 		    "(pending events will not be replayed)", dumpdev);
266 		(void) close(fd);
267 		goto done;
268 	}
269 
270 	/*
271 	 * The ereport dump region is a sequence of erpt_dump_t headers each of
272 	 * which is followed by packed nvlist data.  We iterate over them in
273 	 * order, unpacking and dispatching each one to our dispatch queue.
274 	 */
275 	for (;;) {
276 		char nvbuf[ERPT_DATA_SZ];
277 		uint32_t chksum;
278 		erpt_dump_t ed;
279 		nvlist_t *nvl;
280 
281 		fmd_timeval_t ftv, tod;
282 		hrtime_t hrt;
283 		uint64_t ena;
284 
285 		if (pread64(fd, &ed, sizeof (ed), off) != sizeof (ed)) {
286 			fmd_hdl_error(hdl, "failed to read from dump "
287 			    "transport %s (pending events lost)", dumpdev);
288 			break;
289 		}
290 
291 		if (ed.ed_magic == 0 && ed.ed_size == 0)
292 			break; /* end of list: all zero */
293 
294 		if (ed.ed_magic == 0) {
295 			off += sizeof (ed) + ed.ed_size;
296 			continue; /* continue searching */
297 		}
298 
299 		if (ed.ed_magic != ERPT_MAGIC) {
300 			/*
301 			 * Stop reading silently if the first record has the
302 			 * wrong magic number; this likely indicates that we
303 			 * rebooted from non-FMA bits or paged over the dump.
304 			 */
305 			if (off == off0)
306 				break;
307 
308 			fmd_hdl_error(hdl, "invalid dump transport "
309 			    "record at %llx (magic number %x, expected %x)\n",
310 			    (u_longlong_t)off, ed.ed_magic, ERPT_MAGIC);
311 			break;
312 		}
313 
314 		if (ed.ed_size > ERPT_DATA_SZ) {
315 			fmd_hdl_error(hdl, "invalid dump transport "
316 			    "record at %llx size (%u exceeds limit)\n",
317 			    (u_longlong_t)off, ed.ed_size);
318 			break;
319 		}
320 
321 		if (pread64(fd, nvbuf, ed.ed_size,
322 		    off + sizeof (ed)) != ed.ed_size) {
323 			fmd_hdl_error(hdl, "failed to read dump "
324 			    "transport event (offset %llx)", (u_longlong_t)off);
325 
326 			sysev_stats.dump_lost.fmds_value.ui64++;
327 			goto next;
328 		}
329 
330 		if ((chksum = sysev_checksum(nvbuf,
331 		    ed.ed_size)) != ed.ed_chksum) {
332 			fmd_hdl_error(hdl, "dump transport event at "
333 			    "offset %llx is corrupt (checksum %x != %x)\n",
334 			    (u_longlong_t)off, chksum, ed.ed_chksum);
335 
336 			sysev_stats.dump_lost.fmds_value.ui64++;
337 			goto next;
338 		}
339 
340 		if ((err = nvlist_xunpack(nvbuf,
341 		    ed.ed_size, &nvl, &fmd.d_nva)) != 0) {
342 			fmd_hdl_error(hdl, "failed to unpack dump "
343 			    "transport event at offset %llx: %s\n",
344 			    (u_longlong_t)off, fmd_strerror(err));
345 
346 			sysev_stats.dump_lost.fmds_value.ui64++;
347 			goto next;
348 		}
349 
350 		/*
351 		 * If ed_hrt_nsec is set it contains the gethrtime() value from
352 		 * when the event was originally enqueued for the transport.
353 		 * If it is zero, we use the weaker bound ed_hrt_base instead.
354 		 */
355 		if (ed.ed_hrt_nsec != 0)
356 			hrt = ed.ed_hrt_nsec;
357 		else
358 			hrt = ed.ed_hrt_base;
359 
360 		/*
361 		 * If this is an FMA protocol event of class "ereport.*" that
362 		 * contains valid ENA, we can improve the precision of 'hrt'.
363 		 */
364 		if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) == 0)
365 			hrt = fmd_time_ena2hrt(hrt, ena);
366 
367 		/*
368 		 * Now convert 'hrt' to an adjustable TOD based on the values
369 		 * in ed_tod_base which correspond to one another and are
370 		 * sampled before reboot using the old gethrtime() clock.
371 		 * fmd_event_recreate() will use this TOD value to re-assign
372 		 * the event an updated gethrtime() value based on the current
373 		 * value of the non-adjustable gethrtime() clock.  Phew.
374 		 */
375 		tod.ftv_sec = ed.ed_tod_base.sec;
376 		tod.ftv_nsec = ed.ed_tod_base.nsec;
377 		fmd_time_hrt2tod(ed.ed_hrt_base, &tod, hrt, &ftv);
378 
379 		(void) nvlist_remove_all(nvl, FMD_EVN_TOD);
380 		(void) nvlist_add_uint64_array(nvl,
381 		    FMD_EVN_TOD, (uint64_t *)&ftv, 2);
382 
383 		fmd_xprt_post(hdl, sysev_xprt, nvl, 0);
384 		sysev_stats.dump_replay.fmds_value.ui64++;
385 
386 next:
387 		/*
388 		 * Reset the magic number for the event record to zero so that
389 		 * we do not replay the same event multiple times.
390 		 */
391 		ed.ed_magic = 0;
392 
393 		if (pwrite64(fd, &ed, sizeof (ed), off) != sizeof (ed)) {
394 			fmd_hdl_error(hdl, "failed to mark dump "
395 			    "transport event (offset %llx)", (u_longlong_t)off);
396 		}
397 
398 		off += sizeof (ed) + ed.ed_size;
399 	}
400 
401 	(void) close(fd);
402 done:
403 	(void) pthread_mutex_lock(&sysev_mutex);
404 	sysev_replay_wait = 0;
405 	(void) pthread_cond_broadcast(&sysev_cv);
406 	(void) pthread_mutex_unlock(&sysev_mutex);
407 }
408 
409 static const fmd_prop_t sysev_props[] = {
410 	{ "class", FMD_TYPE_STRING, EC_ALL },		/* event class */
411 	{ "device", FMD_TYPE_STRING, NULL },		/* replay device */
412 	{ "channel", FMD_TYPE_STRING, FM_ERROR_CHAN },	/* channel name */
413 	{ "sid", FMD_TYPE_STRING, "fmd" },		/* subscriber id */
414 	{ NULL, 0, NULL }
415 };
416 
417 static const fmd_hdl_ops_t sysev_ops = {
418 	NULL,		/* fmdo_recv */
419 	sysev_replay,	/* fmdo_timeout */
420 	NULL,		/* fmdo_close */
421 	NULL,		/* fmdo_stats */
422 	NULL,		/* fmdo_gc */
423 	NULL,		/* fmdo_send */
424 };
425 
426 static const fmd_hdl_info_t sysev_info = {
427 	"SysEvent Transport Agent", "1.0", &sysev_ops, sysev_props
428 };
429 
430 /*
431  * Bind to the sysevent channel we use for listening for error events and then
432  * subscribe to appropriate events received over this channel.  Setup the
433  * legacy sysevent handler for creating sysevent resources and forwarding DR
434  * events.
435  */
436 void
437 sysev_init(fmd_hdl_t *hdl)
438 {
439 	uint_t flags;
440 	const char *subclasses[] = { EC_SUB_ALL };
441 
442 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &sysev_info) != 0)
443 		return; /* invalid property settings */
444 
445 	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (sysev_stats) /
446 	    sizeof (fmd_stat_t), (fmd_stat_t *)&sysev_stats);
447 
448 	sysev_channel = fmd_prop_get_string(hdl, "channel");
449 	sysev_class = fmd_prop_get_string(hdl, "class");
450 	sysev_device = fmd_prop_get_string(hdl, "device");
451 	sysev_sid = fmd_prop_get_string(hdl, "sid");
452 
453 	if (sysev_channel == NULL)
454 		fmd_hdl_abort(hdl, "channel property must be defined\n");
455 
456 	if (sysev_sid == NULL)
457 		fmd_hdl_abort(hdl, "sid property must be defined\n");
458 
459 	if ((errno = sysevent_evc_bind(sysev_channel, &sysev_evc,
460 	    EVCH_CREAT | EVCH_HOLD_PEND)) != 0) {
461 		fmd_hdl_abort(hdl, "failed to bind to event transport "
462 		    "channel %s", sysev_channel);
463 	}
464 
465 	sysev_xprt = fmd_xprt_open(hdl, FMD_XPRT_RDONLY |
466 	    FMD_XPRT_CACHE_AS_LOCAL, NULL, NULL);
467 	sysev_hdl = hdl;
468 
469 	/*
470 	 * If we're subscribing to the default channel, keep our subscription
471 	 * active even if we die unexpectedly so we continue queuing events.
472 	 * If we're not (e.g. running under fmsim), do not specify SUB_KEEP so
473 	 * that our event channel will be destroyed if we die unpleasantly.
474 	 */
475 	if (strcmp(sysev_channel, FM_ERROR_CHAN) == 0)
476 		flags = EVCH_SUB_KEEP | EVCH_SUB_DUMP;
477 	else
478 		flags = EVCH_SUB_DUMP;
479 
480 	errno = sysevent_evc_subscribe(sysev_evc,
481 	    sysev_sid, sysev_class, sysev_recv, sysev_xprt, flags);
482 
483 	if (errno != 0) {
484 		if (errno == EEXIST) {
485 			fmd_hdl_abort(hdl, "another fault management daemon is "
486 			    "active on transport channel %s\n", sysev_channel);
487 		} else {
488 			fmd_hdl_abort(hdl, "failed to subscribe to %s on "
489 			    "transport channel %s", sysev_class, sysev_channel);
490 		}
491 	}
492 
493 	/*
494 	 * Once the transport is open, install a single timer to fire at once
495 	 * in the context of the module's thread to run sysev_replay().  This
496 	 * thread will block in its first fmd_xprt_post() until fmd is ready.
497 	 */
498 	fmd_hdl_debug(hdl, "transport '%s' open\n", sysev_channel);
499 	(void) fmd_timer_install(hdl, NULL, NULL, 0);
500 
501 	/*
502 	 * Open the legacy sysevent handle and subscribe to all events.  These
503 	 * are automatically converted to "resource.sysevent.*" events so that
504 	 * modules can manage these events without additional infrastructure.
505 	 */
506 	if (geteuid() != 0)
507 		return;
508 
509 	if ((fmd.d_sysev_hdl =
510 	    sysevent_bind_handle(sysev_legacy)) == NULL)
511 		fmd_hdl_abort(hdl, "failed to bind to legacy sysevent channel");
512 
513 	if (sysevent_subscribe_event(fmd.d_sysev_hdl, EC_ALL,
514 	    subclasses, 1) != 0)
515 		fmd_hdl_abort(hdl, "failed to subscribe to legacy sysevents");
516 }
517 
518 /*
519  * Close the channel by unsubscribing and unbinding.  We only do this when a
520  * a non-default channel has been selected.  If we're using FM_ERROR_CHAN,
521  * the system default, we do *not* want to unsubscribe because the kernel will
522  * remove the subscriber queue and any events published in our absence will
523  * therefore be lost.  This scenario may occur when, for example, fmd is sent
524  * a SIGTERM by init(1M) during reboot but an error is detected and makes it
525  * into the sysevent channel queue before init(1M) manages to call uadmin(2).
526  */
527 void
528 sysev_fini(fmd_hdl_t *hdl)
529 {
530 	if (strcmp(sysev_channel, FM_ERROR_CHAN) != 0) {
531 		sysevent_evc_unsubscribe(sysev_evc, sysev_sid);
532 		sysevent_evc_unbind(sysev_evc);
533 	}
534 
535 	if (fmd.d_sysev_hdl != NULL)
536 		sysevent_unbind_handle(fmd.d_sysev_hdl);
537 
538 	if (sysev_xprt != NULL) {
539 		/*
540 		 * Wait callback returns before destroy the transport.
541 		 */
542 		(void) pthread_mutex_lock(&sysev_mutex);
543 		sysev_exiting = 1;
544 		while (sysev_xprt_refcnt > 0)
545 			(void) pthread_cond_wait(&sysev_cv, &sysev_mutex);
546 		(void) pthread_mutex_unlock(&sysev_mutex);
547 		fmd_xprt_close(hdl, sysev_xprt);
548 	}
549 
550 	fmd_prop_free_string(hdl, sysev_class);
551 	fmd_prop_free_string(hdl, sysev_channel);
552 	fmd_prop_free_string(hdl, sysev_device);
553 	fmd_prop_free_string(hdl, sysev_sid);
554 }
555