xref: /titanic_50/usr/src/cmd/fm/fmd/common/fmd_sysevent.c (revision 047f6e6f42a3d50d3e38a05c00bf7dd3fafac726)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/sysevent/eventdefs.h>
28 #include <sys/sysevent.h>
29 #include <sys/sysevent_impl.h>
30 #include <sys/fm/protocol.h>
31 #include <sys/sysmacros.h>
32 #include <sys/dumphdr.h>
33 #include <sys/dumpadm.h>
34 #include <sys/fm/util.h>
35 
36 #include <libsysevent.h>
37 #include <libnvpair.h>
38 #include <alloca.h>
39 #include <limits.h>
40 #include <strings.h>
41 #include <unistd.h>
42 #include <fcntl.h>
43 #include <errno.h>
44 
45 #undef MUTEX_HELD
46 #undef RW_READ_HELD
47 #undef RW_WRITE_HELD
48 
49 #include <fmd_api.h>
50 #include <fmd_log.h>
51 #include <fmd_subr.h>
52 #include <fmd_dispq.h>
53 #include <fmd_dr.h>
54 #include <fmd_module.h>
55 #include <fmd_protocol.h>
56 #include <fmd_scheme.h>
57 #include <fmd_error.h>
58 
59 #include <fmd.h>
60 
61 static char *sysev_channel;	/* event channel to which we are subscribed */
62 static char *sysev_class;	/* event class to which we are subscribed */
63 static char *sysev_device;	/* device path to use for replaying events */
64 static char *sysev_sid;		/* event channel subscriber identifier */
65 static void *sysev_evc;		/* event channel cookie from evc_bind */
66 
67 static fmd_xprt_t *sysev_xprt;
68 static int sysev_xprt_refcnt;
69 static fmd_hdl_t *sysev_hdl;
70 
71 static struct sysev_stats {
72 	fmd_stat_t dump_replay;
73 	fmd_stat_t dump_lost;
74 	fmd_stat_t bad_class;
75 	fmd_stat_t bad_attr;
76 	fmd_stat_t eagain;
77 } sysev_stats = {
78 	{ "dump_replay", FMD_TYPE_UINT64, "events replayed from dump device" },
79 	{ "dump_lost", FMD_TYPE_UINT64, "events lost from dump device" },
80 	{ "bad_class", FMD_TYPE_UINT64, "events dropped due to invalid class" },
81 	{ "bad_attr", FMD_TYPE_UINT64, "events dropped due to invalid nvlist" },
82 	{ "eagain", FMD_TYPE_UINT64, "events retried due to low memory" },
83 };
84 
85 static pthread_cond_t sysev_cv = PTHREAD_COND_INITIALIZER;
86 static pthread_mutex_t sysev_mutex = PTHREAD_MUTEX_INITIALIZER;
87 static int sysev_replay_wait = 1;
88 static int sysev_exiting;
89 
90 /*
91  * Entry point for legacy sysevents.  This function is responsible for two
92  * things: passing off interesting events to the DR handler, and converting
93  * sysevents into resource events that modules can then subscribe to.
94  */
95 static void
96 sysev_legacy(sysevent_t *sep)
97 {
98 	const char *class = sysevent_get_class_name(sep);
99 	const char *subclass = sysevent_get_subclass_name(sep);
100 	char *fullclass;
101 	size_t len;
102 	nvlist_t *attr, *nvl;
103 	fmd_event_t *e;
104 	hrtime_t hrt;
105 
106 	/* notify the DR subsystem of the event */
107 	fmd_dr_event(sep);
108 
109 	/* get the matching sysevent name */
110 	len = snprintf(NULL, 0, "%s%s.%s", SYSEVENT_RSRC_CLASS,
111 	    class, subclass);
112 	fullclass = alloca(len + 1);
113 	(void) snprintf(fullclass, len + 1, "%s%s.%s",
114 	    SYSEVENT_RSRC_CLASS, class, subclass);
115 
116 	/* construct the event payload */
117 	(void) nvlist_xalloc(&nvl, NV_UNIQUE_NAME, &fmd.d_nva);
118 	if (sysevent_get_attr_list(sep, &attr) == 0) {
119 		(void) nvlist_merge(nvl, attr, 0);
120 		nvlist_free(attr);
121 	}
122 
123 	/*
124 	 * Add class and version after the nvlist_merge() just in case
125 	 * the sysevent has an attribute called class or version.
126 	 */
127 	(void) nvlist_add_string(nvl, FM_CLASS, fullclass);
128 	(void) nvlist_add_uint8(nvl, FM_VERSION, FM_RSRC_VERSION);
129 
130 	/*
131 	 * Dispatch the event.  Ideally, we'd like to use the same transport
132 	 * interface as sysev_recv(), but because the legacy sysevent mechanism
133 	 * puts in a thread outside fmd's control, using the module APIs is
134 	 * impossible.
135 	 */
136 	sysevent_get_time(sep, &hrt);
137 	(void) nvlist_lookup_string(nvl, FM_CLASS, &fullclass);
138 	e = fmd_event_create(FMD_EVT_PROTOCOL, hrt, nvl, fullclass);
139 	fmd_dispq_dispatch(fmd.d_disp, e, fullclass);
140 }
141 
142 /*
143  * Receive an event from the SysEvent channel and post it to our transport.
144  * Under extreme low-memory situations where we cannot event unpack the event,
145  * we can request that SysEvent redeliver the event later by returning EAGAIN.
146  * If we do this too many times, the kernel will drop the event.  Rather than
147  * keeping state per-event, we simply attempt a garbage-collect, hoping that
148  * enough free memory will be available by the time the event is redelivered.
149  */
150 static int
151 sysev_recv(sysevent_t *sep, void *arg)
152 {
153 	uint64_t seq = sysevent_get_seq(sep);
154 	fmd_xprt_t *xp = arg;
155 	nvlist_t *nvl;
156 	hrtime_t hrt;
157 	int rc = 0;
158 
159 	(void) pthread_mutex_lock(&sysev_mutex);
160 	if (sysev_exiting == 1) {
161 		while (sysev_xprt_refcnt > 0)
162 			(void) pthread_cond_wait(&sysev_cv, &sysev_mutex);
163 		(void) pthread_mutex_unlock(&sysev_mutex);
164 		return (EAGAIN);
165 	}
166 	sysev_xprt_refcnt++;
167 	while (sysev_replay_wait)
168 		(void) pthread_cond_wait(&sysev_cv, &sysev_mutex);
169 	(void) pthread_mutex_unlock(&sysev_mutex);
170 
171 	if (strcmp(sysevent_get_class_name(sep), EC_FM) != 0) {
172 		fmd_hdl_error(sysev_hdl, "discarding event 0x%llx: unexpected"
173 		    " transport class %s\n", seq, sysevent_get_class_name(sep));
174 		sysev_stats.bad_class.fmds_value.ui64++;
175 	} else if (sysevent_get_attr_list(sep, &nvl) != 0) {
176 		if (errno == EAGAIN || errno == ENOMEM) {
177 			fmd_modhash_tryapply(fmd.d_mod_hash, fmd_module_trygc);
178 			fmd_scheme_hash_trygc(fmd.d_schemes);
179 			sysev_stats.eagain.fmds_value.ui64++;
180 			rc = EAGAIN;
181 		} else {
182 			fmd_hdl_error(sysev_hdl, "discarding event 0x%llx: "
183 			    "missing or invalid payload", seq);
184 			sysev_stats.bad_attr.fmds_value.ui64++;
185 		}
186 	} else {
187 		sysevent_get_time(sep, &hrt);
188 		fmd_xprt_post(sysev_hdl, xp, nvl, hrt);
189 	}
190 
191 	(void) pthread_mutex_lock(&sysev_mutex);
192 	if (--sysev_xprt_refcnt == 0 && sysev_exiting == 1)
193 		(void) pthread_cond_broadcast(&sysev_cv);
194 	(void) pthread_mutex_unlock(&sysev_mutex);
195 
196 	return (rc);
197 }
198 
199 /*
200  * Checksum algorithm used by the dump transport for verifying the content of
201  * error reports saved on the dump device (copy of the kernel's checksum32()).
202  */
203 static uint32_t
204 sysev_checksum(void *cp_arg, size_t length)
205 {
206 	uchar_t *cp, *ep;
207 	uint32_t sum = 0;
208 
209 	for (cp = cp_arg, ep = cp + length; cp < ep; cp++)
210 		sum = ((sum >> 1) | (sum << 31)) + *cp;
211 
212 	return (sum);
213 }
214 
215 /*
216  * Replay saved events from the dump transport.  This function is installed as
217  * the timer callback and is called only once during the module's lifetime.
218  */
219 /*ARGSUSED*/
220 static void
221 sysev_replay(fmd_hdl_t *hdl, id_t id, void *arg)
222 {
223 	char *dumpdev;
224 	off64_t off, off0;
225 	int fd, err;
226 
227 	/*
228 	 * Determine the appropriate dump device to use for replaying pending
229 	 * error reports.  If the device property is NULL (default), we
230 	 * open and query /dev/dump to determine the current dump device.
231 	 */
232 	if ((dumpdev = sysev_device) == NULL) {
233 		if ((fd = open("/dev/dump", O_RDONLY)) == -1) {
234 			fmd_hdl_error(hdl, "failed to open /dev/dump "
235 			    "to locate dump device for event replay");
236 			goto done;
237 		}
238 
239 		dumpdev = alloca(PATH_MAX);
240 		err = ioctl(fd, DIOCGETDEV, dumpdev);
241 		(void) close(fd);
242 
243 		if (err == -1) {
244 			if (errno != ENODEV) {
245 				fmd_hdl_error(hdl, "failed to obtain "
246 				    "path to dump device for event replay");
247 			}
248 			goto done;
249 		}
250 	}
251 
252 	if (strcmp(dumpdev, "/dev/null") == 0)
253 		goto done; /* return silently and skip replay for /dev/null */
254 
255 	/*
256 	 * Open the appropriate device and then determine the offset of the
257 	 * start of the ereport dump region located at the end of the device.
258 	 */
259 	if ((fd = open64(dumpdev, O_RDWR | O_DSYNC)) == -1) {
260 		fmd_hdl_error(hdl, "failed to open dump transport %s "
261 		    "(pending events will not be replayed)", dumpdev);
262 		goto done;
263 	}
264 
265 	off = DUMP_OFFSET + DUMP_LOGSIZE + DUMP_ERPTSIZE;
266 	off = off0 = lseek64(fd, -off, SEEK_END) & -DUMP_OFFSET;
267 
268 	if (off == (off64_t)-1LL) {
269 		fmd_hdl_error(hdl, "failed to seek dump transport %s "
270 		    "(pending events will not be replayed)", dumpdev);
271 		(void) close(fd);
272 		goto done;
273 	}
274 
275 	/*
276 	 * The ereport dump region is a sequence of erpt_dump_t headers each of
277 	 * which is followed by packed nvlist data.  We iterate over them in
278 	 * order, unpacking and dispatching each one to our dispatch queue.
279 	 */
280 	for (;;) {
281 		char nvbuf[ERPT_DATA_SZ];
282 		uint32_t chksum;
283 		erpt_dump_t ed;
284 		nvlist_t *nvl;
285 
286 		fmd_timeval_t ftv, tod;
287 		hrtime_t hrt;
288 		uint64_t ena;
289 
290 		if (pread64(fd, &ed, sizeof (ed), off) != sizeof (ed)) {
291 			fmd_hdl_error(hdl, "failed to read from dump "
292 			    "transport %s (pending events lost)", dumpdev);
293 			break;
294 		}
295 
296 		if (ed.ed_magic == 0 && ed.ed_size == 0)
297 			break; /* end of list: all zero */
298 
299 		if (ed.ed_magic == 0) {
300 			off += sizeof (ed) + ed.ed_size;
301 			continue; /* continue searching */
302 		}
303 
304 		if (ed.ed_magic != ERPT_MAGIC) {
305 			/*
306 			 * Stop reading silently if the first record has the
307 			 * wrong magic number; this likely indicates that we
308 			 * rebooted from non-FMA bits or paged over the dump.
309 			 */
310 			if (off == off0)
311 				break;
312 
313 			fmd_hdl_error(hdl, "invalid dump transport "
314 			    "record at %llx (magic number %x, expected %x)\n",
315 			    (u_longlong_t)off, ed.ed_magic, ERPT_MAGIC);
316 			break;
317 		}
318 
319 		if (ed.ed_size > ERPT_DATA_SZ) {
320 			fmd_hdl_error(hdl, "invalid dump transport "
321 			    "record at %llx size (%u exceeds limit)\n",
322 			    (u_longlong_t)off, ed.ed_size);
323 			break;
324 		}
325 
326 		if (pread64(fd, nvbuf, ed.ed_size,
327 		    off + sizeof (ed)) != ed.ed_size) {
328 			fmd_hdl_error(hdl, "failed to read dump "
329 			    "transport event (offset %llx)", (u_longlong_t)off);
330 
331 			sysev_stats.dump_lost.fmds_value.ui64++;
332 			goto next;
333 		}
334 
335 		if ((chksum = sysev_checksum(nvbuf,
336 		    ed.ed_size)) != ed.ed_chksum) {
337 			fmd_hdl_error(hdl, "dump transport event at "
338 			    "offset %llx is corrupt (checksum %x != %x)\n",
339 			    (u_longlong_t)off, chksum, ed.ed_chksum);
340 
341 			sysev_stats.dump_lost.fmds_value.ui64++;
342 			goto next;
343 		}
344 
345 		if ((err = nvlist_xunpack(nvbuf,
346 		    ed.ed_size, &nvl, &fmd.d_nva)) != 0) {
347 			fmd_hdl_error(hdl, "failed to unpack dump "
348 			    "transport event at offset %llx: %s\n",
349 			    (u_longlong_t)off, fmd_strerror(err));
350 
351 			sysev_stats.dump_lost.fmds_value.ui64++;
352 			goto next;
353 		}
354 
355 		/*
356 		 * If ed_hrt_nsec is set it contains the gethrtime() value from
357 		 * when the event was originally enqueued for the transport.
358 		 * If it is zero, we use the weaker bound ed_hrt_base instead.
359 		 */
360 		if (ed.ed_hrt_nsec != 0)
361 			hrt = ed.ed_hrt_nsec;
362 		else
363 			hrt = ed.ed_hrt_base;
364 
365 		/*
366 		 * If this is an FMA protocol event of class "ereport.*" that
367 		 * contains valid ENA, we can improve the precision of 'hrt'.
368 		 */
369 		if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) == 0)
370 			hrt = fmd_time_ena2hrt(hrt, ena);
371 
372 		/*
373 		 * Now convert 'hrt' to an adjustable TOD based on the values
374 		 * in ed_tod_base which correspond to one another and are
375 		 * sampled before reboot using the old gethrtime() clock.
376 		 * fmd_event_recreate() will use this TOD value to re-assign
377 		 * the event an updated gethrtime() value based on the current
378 		 * value of the non-adjustable gethrtime() clock.  Phew.
379 		 */
380 		tod.ftv_sec = ed.ed_tod_base.sec;
381 		tod.ftv_nsec = ed.ed_tod_base.nsec;
382 		fmd_time_hrt2tod(ed.ed_hrt_base, &tod, hrt, &ftv);
383 
384 		(void) nvlist_remove_all(nvl, FMD_EVN_TOD);
385 		(void) nvlist_add_uint64_array(nvl,
386 		    FMD_EVN_TOD, (uint64_t *)&ftv, 2);
387 
388 		fmd_xprt_post(hdl, sysev_xprt, nvl, 0);
389 		sysev_stats.dump_replay.fmds_value.ui64++;
390 
391 next:
392 		/*
393 		 * Reset the magic number for the event record to zero so that
394 		 * we do not replay the same event multiple times.
395 		 */
396 		ed.ed_magic = 0;
397 
398 		if (pwrite64(fd, &ed, sizeof (ed), off) != sizeof (ed)) {
399 			fmd_hdl_error(hdl, "failed to mark dump "
400 			    "transport event (offset %llx)", (u_longlong_t)off);
401 		}
402 
403 		off += sizeof (ed) + ed.ed_size;
404 	}
405 
406 	(void) close(fd);
407 done:
408 	(void) pthread_mutex_lock(&sysev_mutex);
409 	sysev_replay_wait = 0;
410 	(void) pthread_cond_broadcast(&sysev_cv);
411 	(void) pthread_mutex_unlock(&sysev_mutex);
412 }
413 
414 static const fmd_prop_t sysev_props[] = {
415 	{ "class", FMD_TYPE_STRING, EC_ALL },		/* event class */
416 	{ "device", FMD_TYPE_STRING, NULL },		/* replay device */
417 	{ "channel", FMD_TYPE_STRING, FM_ERROR_CHAN },	/* channel name */
418 	{ "sid", FMD_TYPE_STRING, "fmd" },		/* subscriber id */
419 	{ NULL, 0, NULL }
420 };
421 
422 static const fmd_hdl_ops_t sysev_ops = {
423 	NULL,		/* fmdo_recv */
424 	sysev_replay,	/* fmdo_timeout */
425 	NULL,		/* fmdo_close */
426 	NULL,		/* fmdo_stats */
427 	NULL,		/* fmdo_gc */
428 	NULL,		/* fmdo_send */
429 };
430 
431 static const fmd_hdl_info_t sysev_info = {
432 	"SysEvent Transport Agent", "1.0", &sysev_ops, sysev_props
433 };
434 
435 /*
436  * Bind to the sysevent channel we use for listening for error events and then
437  * subscribe to appropriate events received over this channel.  Setup the
438  * legacy sysevent handler for creating sysevent resources and forwarding DR
439  * events.
440  */
441 void
442 sysev_init(fmd_hdl_t *hdl)
443 {
444 	uint_t flags;
445 	const char *subclasses[] = { EC_SUB_ALL };
446 
447 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &sysev_info) != 0)
448 		return; /* invalid property settings */
449 
450 	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (sysev_stats) /
451 	    sizeof (fmd_stat_t), (fmd_stat_t *)&sysev_stats);
452 
453 	sysev_channel = fmd_prop_get_string(hdl, "channel");
454 	sysev_class = fmd_prop_get_string(hdl, "class");
455 	sysev_device = fmd_prop_get_string(hdl, "device");
456 	sysev_sid = fmd_prop_get_string(hdl, "sid");
457 
458 	if (sysev_channel == NULL)
459 		fmd_hdl_abort(hdl, "channel property must be defined\n");
460 
461 	if (sysev_sid == NULL)
462 		fmd_hdl_abort(hdl, "sid property must be defined\n");
463 
464 	if ((errno = sysevent_evc_bind(sysev_channel, &sysev_evc,
465 	    EVCH_CREAT | EVCH_HOLD_PEND)) != 0) {
466 		fmd_hdl_abort(hdl, "failed to bind to event transport "
467 		    "channel %s", sysev_channel);
468 	}
469 
470 	sysev_xprt = fmd_xprt_open(hdl, FMD_XPRT_RDONLY |
471 	    FMD_XPRT_CACHE_AS_LOCAL, NULL, NULL);
472 	sysev_hdl = hdl;
473 
474 	/*
475 	 * If we're subscribing to the default channel, keep our subscription
476 	 * active even if we die unexpectedly so we continue queuing events.
477 	 * If we're not (e.g. running under fmsim), do not specify SUB_KEEP so
478 	 * that our event channel will be destroyed if we die unpleasantly.
479 	 */
480 	if (strcmp(sysev_channel, FM_ERROR_CHAN) == 0)
481 		flags = EVCH_SUB_KEEP | EVCH_SUB_DUMP;
482 	else
483 		flags = EVCH_SUB_DUMP;
484 
485 	errno = sysevent_evc_subscribe(sysev_evc,
486 	    sysev_sid, sysev_class, sysev_recv, sysev_xprt, flags);
487 
488 	if (errno != 0) {
489 		if (errno == EEXIST) {
490 			fmd_hdl_abort(hdl, "another fault management daemon is "
491 			    "active on transport channel %s\n", sysev_channel);
492 		} else {
493 			fmd_hdl_abort(hdl, "failed to subscribe to %s on "
494 			    "transport channel %s", sysev_class, sysev_channel);
495 		}
496 	}
497 
498 	/*
499 	 * Once the transport is open, install a single timer to fire at once
500 	 * in the context of the module's thread to run sysev_replay().  This
501 	 * thread will block in its first fmd_xprt_post() until fmd is ready.
502 	 */
503 	fmd_hdl_debug(hdl, "transport '%s' open\n", sysev_channel);
504 	(void) fmd_timer_install(hdl, NULL, NULL, 0);
505 
506 	/*
507 	 * Open the legacy sysevent handle and subscribe to all events.  These
508 	 * are automatically converted to "resource.sysevent.*" events so that
509 	 * modules can manage these events without additional infrastructure.
510 	 */
511 	if (geteuid() != 0)
512 		return;
513 
514 	if ((fmd.d_sysev_hdl =
515 	    sysevent_bind_handle(sysev_legacy)) == NULL)
516 		fmd_hdl_abort(hdl, "failed to bind to legacy sysevent channel");
517 
518 	if (sysevent_subscribe_event(fmd.d_sysev_hdl, EC_ALL,
519 	    subclasses, 1) != 0)
520 		fmd_hdl_abort(hdl, "failed to subscribe to legacy sysevents");
521 }
522 
523 /*
524  * Close the channel by unsubscribing and unbinding.  We only do this when a
525  * a non-default channel has been selected.  If we're using FM_ERROR_CHAN,
526  * the system default, we do *not* want to unsubscribe because the kernel will
527  * remove the subscriber queue and any events published in our absence will
528  * therefore be lost.  This scenario may occur when, for example, fmd is sent
529  * a SIGTERM by init(1M) during reboot but an error is detected and makes it
530  * into the sysevent channel queue before init(1M) manages to call uadmin(2).
531  */
532 void
533 sysev_fini(fmd_hdl_t *hdl)
534 {
535 	if (strcmp(sysev_channel, FM_ERROR_CHAN) != 0) {
536 		(void) sysevent_evc_unsubscribe(sysev_evc, sysev_sid);
537 		(void) sysevent_evc_unbind(sysev_evc);
538 	}
539 
540 	if (fmd.d_sysev_hdl != NULL)
541 		sysevent_unbind_handle(fmd.d_sysev_hdl);
542 
543 	if (sysev_xprt != NULL) {
544 		/*
545 		 * Wait callback returns before destroy the transport.
546 		 */
547 		(void) pthread_mutex_lock(&sysev_mutex);
548 		sysev_exiting = 1;
549 		while (sysev_xprt_refcnt > 0)
550 			(void) pthread_cond_wait(&sysev_cv, &sysev_mutex);
551 		(void) pthread_mutex_unlock(&sysev_mutex);
552 		fmd_xprt_close(hdl, sysev_xprt);
553 	}
554 
555 	fmd_prop_free_string(hdl, sysev_class);
556 	fmd_prop_free_string(hdl, sysev_channel);
557 	fmd_prop_free_string(hdl, sysev_device);
558 	fmd_prop_free_string(hdl, sysev_sid);
559 }
560