xref: /titanic_52/usr/src/cmd/fm/fmd/common/fmd_sysevent.c (revision d2ec54f7875f7e05edd56195adbeb593c947763f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/sysevent/eventdefs.h>
30 #include <sys/sysevent.h>
31 #include <sys/sysevent_impl.h>
32 #include <sys/fm/protocol.h>
33 #include <sys/sysmacros.h>
34 #include <sys/dumphdr.h>
35 #include <sys/dumpadm.h>
36 #include <sys/fm/util.h>
37 
38 #include <libsysevent.h>
39 #include <libnvpair.h>
40 #include <alloca.h>
41 #include <limits.h>
42 #include <strings.h>
43 #include <unistd.h>
44 #include <fcntl.h>
45 #include <errno.h>
46 
47 #undef MUTEX_HELD
48 #undef RW_READ_HELD
49 #undef RW_WRITE_HELD
50 
51 #include <fmd_api.h>
52 #include <fmd_log.h>
53 #include <fmd_subr.h>
54 #include <fmd_dispq.h>
55 #include <fmd_module.h>
56 #include <fmd_scheme.h>
57 #include <fmd_error.h>
58 
59 #include <fmd.h>
60 
61 static char *sysev_channel;	/* event channel to which we are subscribed */
62 static char *sysev_class;	/* event class to which we are subscribed */
63 static char *sysev_device;	/* device path to use for replaying events */
64 static char *sysev_sid;		/* event channel subscriber identifier */
65 static void *sysev_evc;		/* event channel cookie from evc_bind */
66 
67 static fmd_xprt_t *sysev_xprt;
68 static int sysev_xprt_refcnt;
69 static fmd_hdl_t *sysev_hdl;
70 
71 static struct sysev_stats {
72 	fmd_stat_t dump_replay;
73 	fmd_stat_t dump_lost;
74 	fmd_stat_t bad_class;
75 	fmd_stat_t bad_attr;
76 	fmd_stat_t eagain;
77 } sysev_stats = {
78 	{ "dump_replay", FMD_TYPE_UINT64, "events replayed from dump device" },
79 	{ "dump_lost", FMD_TYPE_UINT64, "events lost from dump device" },
80 	{ "bad_class", FMD_TYPE_UINT64, "events dropped due to invalid class" },
81 	{ "bad_attr", FMD_TYPE_UINT64, "events dropped due to invalid nvlist" },
82 	{ "eagain", FMD_TYPE_UINT64, "events retried due to low memory" },
83 };
84 
/*
 * sysev_mutex protects sysev_replay_wait, sysev_exiting and the transport
 * reference count; sysev_cv is broadcast whenever a waiter may make progress
 * (replay finished, or the last callback drained during shutdown).
 */
static pthread_mutex_t sysev_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t sysev_cv = PTHREAD_COND_INITIALIZER;

static int sysev_replay_wait = 1;	/* cleared once sysev_replay() ends */
static int sysev_exiting = 0;		/* set to 1 by sysev_fini() */
89 
90 
91 /*
92  * Receive an event from the SysEvent channel and post it to our transport.
93  * Under extreme low-memory situations where we cannot event unpack the event,
94  * we can request that SysEvent redeliver the event later by returning EAGAIN.
95  * If we do this too many times, the kernel will drop the event.  Rather than
96  * keeping state per-event, we simply attempt a garbage-collect, hoping that
97  * enough free memory will be available by the time the event is redelivered.
98  */
99 static int
100 sysev_recv(sysevent_t *sep, void *arg)
101 {
102 	uint64_t seq = sysevent_get_seq(sep);
103 	fmd_xprt_t *xp = arg;
104 	nvlist_t *nvl;
105 	hrtime_t hrt;
106 	int rc = 0;
107 
108 	(void) pthread_mutex_lock(&sysev_mutex);
109 	if (sysev_exiting == 1) {
110 		while (sysev_xprt_refcnt > 0)
111 			(void) pthread_cond_wait(&sysev_cv, &sysev_mutex);
112 		(void) pthread_mutex_unlock(&sysev_mutex);
113 		return (EAGAIN);
114 	}
115 	sysev_xprt_refcnt++;
116 	while (sysev_replay_wait)
117 		(void) pthread_cond_wait(&sysev_cv, &sysev_mutex);
118 	(void) pthread_mutex_unlock(&sysev_mutex);
119 
120 	if (strcmp(sysevent_get_class_name(sep), EC_FM) != 0) {
121 		fmd_hdl_error(sysev_hdl, "discarding event 0x%llx: unexpected"
122 		    " transport class %s\n", seq, sysevent_get_class_name(sep));
123 		sysev_stats.bad_class.fmds_value.ui64++;
124 	} else if (sysevent_get_attr_list(sep, &nvl) != 0) {
125 		if (errno == EAGAIN || errno == ENOMEM) {
126 			fmd_modhash_tryapply(fmd.d_mod_hash, fmd_module_trygc);
127 			fmd_scheme_hash_trygc(fmd.d_schemes);
128 			sysev_stats.eagain.fmds_value.ui64++;
129 			rc = EAGAIN;
130 		} else {
131 			fmd_hdl_error(sysev_hdl, "discarding event 0x%llx: "
132 			    "missing or invalid payload", seq);
133 			sysev_stats.bad_attr.fmds_value.ui64++;
134 		}
135 	} else {
136 		sysevent_get_time(sep, &hrt);
137 		fmd_xprt_post(sysev_hdl, xp, nvl, hrt);
138 	}
139 
140 	(void) pthread_mutex_lock(&sysev_mutex);
141 	if (--sysev_xprt_refcnt == 0 && sysev_exiting == 1)
142 		(void) pthread_cond_broadcast(&sysev_cv);
143 	(void) pthread_mutex_unlock(&sysev_mutex);
144 
145 	return (rc);
146 }
147 
/*
 * Checksum used by the dump transport to verify the content of error reports
 * saved on the dump device (a copy of the kernel's checksum32()).  For every
 * byte of the buffer the running 32-bit sum is rotated right by one bit and
 * the byte value is then added to it.  An empty buffer yields zero.
 */
static uint32_t
sysev_checksum(void *cp_arg, size_t length)
{
	const unsigned char *bytes = cp_arg;
	uint32_t acc = 0;
	size_t i;

	for (i = 0; i < length; i++) {
		uint32_t rotated = (acc >> 1) | (acc << 31);

		acc = rotated + bytes[i];
	}

	return (acc);
}
163 
164 /*
165  * Replay saved events from the dump transport.  This function is installed as
166  * the timer callback and is called only once during the module's lifetime.
167  */
168 /*ARGSUSED*/
169 static void
170 sysev_replay(fmd_hdl_t *hdl, id_t id, void *arg)
171 {
172 	char *dumpdev;
173 	off64_t off, off0;
174 	int fd, err;
175 
176 	/*
177 	 * Determine the appropriate dump device to use for replaying pending
178 	 * error reports.  If the device property is NULL (default), we
179 	 * open and query /dev/dump to determine the current dump device.
180 	 */
181 	if ((dumpdev = sysev_device) == NULL) {
182 		if ((fd = open("/dev/dump", O_RDONLY)) == -1) {
183 			fmd_hdl_error(hdl, "failed to open /dev/dump "
184 			    "to locate dump device for event replay");
185 			goto done;
186 		}
187 
188 		dumpdev = alloca(PATH_MAX);
189 		err = ioctl(fd, DIOCGETDEV, dumpdev);
190 		(void) close(fd);
191 
192 		if (err == -1) {
193 			if (errno != ENODEV) {
194 				fmd_hdl_error(hdl, "failed to obtain "
195 				    "path to dump device for event replay");
196 			}
197 			goto done;
198 		}
199 	}
200 
201 	if (strcmp(dumpdev, "/dev/null") == 0)
202 		goto done; /* return silently and skip replay for /dev/null */
203 
204 	/*
205 	 * Open the appropriate device and then determine the offset of the
206 	 * start of the ereport dump region located at the end of the device.
207 	 */
208 	if ((fd = open64(dumpdev, O_RDWR | O_DSYNC)) == -1) {
209 		fmd_hdl_error(hdl, "failed to open dump transport %s "
210 		    "(pending events will not be replayed)", dumpdev);
211 		goto done;
212 	}
213 
214 	off = DUMP_OFFSET + DUMP_LOGSIZE + DUMP_ERPTSIZE;
215 	off = off0 = lseek64(fd, -off, SEEK_END) & -DUMP_OFFSET;
216 
217 	if (off == (off64_t)-1LL) {
218 		fmd_hdl_error(hdl, "failed to seek dump transport %s "
219 		    "(pending events will not be replayed)", dumpdev);
220 		(void) close(fd);
221 		goto done;
222 	}
223 
224 	/*
225 	 * The ereport dump region is a sequence of erpt_dump_t headers each of
226 	 * which is followed by packed nvlist data.  We iterate over them in
227 	 * order, unpacking and dispatching each one to our dispatch queue.
228 	 */
229 	for (;;) {
230 		char nvbuf[ERPT_DATA_SZ];
231 		uint32_t chksum;
232 		erpt_dump_t ed;
233 		nvlist_t *nvl;
234 
235 		fmd_timeval_t ftv, tod;
236 		hrtime_t hrt;
237 		uint64_t ena;
238 
239 		if (pread64(fd, &ed, sizeof (ed), off) != sizeof (ed)) {
240 			fmd_hdl_error(hdl, "failed to read from dump "
241 			    "transport %s (pending events lost)", dumpdev);
242 			break;
243 		}
244 
245 		if (ed.ed_magic == 0 && ed.ed_size == 0)
246 			break; /* end of list: all zero */
247 
248 		if (ed.ed_magic == 0) {
249 			off += sizeof (ed) + ed.ed_size;
250 			continue; /* continue searching */
251 		}
252 
253 		if (ed.ed_magic != ERPT_MAGIC) {
254 			/*
255 			 * Stop reading silently if the first record has the
256 			 * wrong magic number; this likely indicates that we
257 			 * rebooted from non-FMA bits or paged over the dump.
258 			 */
259 			if (off == off0)
260 				break;
261 
262 			fmd_hdl_error(hdl, "invalid dump transport "
263 			    "record at %llx (magic number %x, expected %x)\n",
264 			    (u_longlong_t)off, ed.ed_magic, ERPT_MAGIC);
265 			break;
266 		}
267 
268 		if (ed.ed_size > ERPT_DATA_SZ) {
269 			fmd_hdl_error(hdl, "invalid dump transport "
270 			    "record at %llx size (%u exceeds limit)\n",
271 			    (u_longlong_t)off, ed.ed_size);
272 			break;
273 		}
274 
275 		if (pread64(fd, nvbuf, ed.ed_size,
276 		    off + sizeof (ed)) != ed.ed_size) {
277 			fmd_hdl_error(hdl, "failed to read dump "
278 			    "transport event (offset %llx)", (u_longlong_t)off);
279 
280 			sysev_stats.dump_lost.fmds_value.ui64++;
281 			goto next;
282 		}
283 
284 		if ((chksum = sysev_checksum(nvbuf,
285 		    ed.ed_size)) != ed.ed_chksum) {
286 			fmd_hdl_error(hdl, "dump transport event at "
287 			    "offset %llx is corrupt (checksum %x != %x)\n",
288 			    (u_longlong_t)off, chksum, ed.ed_chksum);
289 
290 			sysev_stats.dump_lost.fmds_value.ui64++;
291 			goto next;
292 		}
293 
294 		if ((err = nvlist_xunpack(nvbuf,
295 		    ed.ed_size, &nvl, &fmd.d_nva)) != 0) {
296 			fmd_hdl_error(hdl, "failed to unpack dump "
297 			    "transport event at offset %llx: %s\n",
298 			    (u_longlong_t)off, fmd_strerror(err));
299 
300 			sysev_stats.dump_lost.fmds_value.ui64++;
301 			goto next;
302 		}
303 
304 		/*
305 		 * If ed_hrt_nsec is set it contains the gethrtime() value from
306 		 * when the event was originally enqueued for the transport.
307 		 * If it is zero, we use the weaker bound ed_hrt_base instead.
308 		 */
309 		if (ed.ed_hrt_nsec != 0)
310 			hrt = ed.ed_hrt_nsec;
311 		else
312 			hrt = ed.ed_hrt_base;
313 
314 		/*
315 		 * If this is an FMA protocol event of class "ereport.*" that
316 		 * contains valid ENA, we can improve the precision of 'hrt'.
317 		 */
318 		if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) == 0)
319 			hrt = fmd_time_ena2hrt(hrt, ena);
320 
321 		/*
322 		 * Now convert 'hrt' to an adjustable TOD based on the values
323 		 * in ed_tod_base which correspond to one another and are
324 		 * sampled before reboot using the old gethrtime() clock.
325 		 * fmd_event_recreate() will use this TOD value to re-assign
326 		 * the event an updated gethrtime() value based on the current
327 		 * value of the non-adjustable gethrtime() clock.  Phew.
328 		 */
329 		tod.ftv_sec = ed.ed_tod_base.sec;
330 		tod.ftv_nsec = ed.ed_tod_base.nsec;
331 		fmd_time_hrt2tod(ed.ed_hrt_base, &tod, hrt, &ftv);
332 
333 		(void) nvlist_remove_all(nvl, FMD_EVN_TOD);
334 		(void) nvlist_add_uint64_array(nvl,
335 		    FMD_EVN_TOD, (uint64_t *)&ftv, 2);
336 
337 		fmd_xprt_post(hdl, sysev_xprt, nvl, 0);
338 		sysev_stats.dump_replay.fmds_value.ui64++;
339 
340 next:
341 		/*
342 		 * Reset the magic number for the event record to zero so that
343 		 * we do not replay the same event multiple times.
344 		 */
345 		ed.ed_magic = 0;
346 
347 		if (pwrite64(fd, &ed, sizeof (ed), off) != sizeof (ed)) {
348 			fmd_hdl_error(hdl, "failed to mark dump "
349 			    "transport event (offset %llx)", (u_longlong_t)off);
350 		}
351 
352 		off += sizeof (ed) + ed.ed_size;
353 	}
354 
355 	(void) close(fd);
356 done:
357 	(void) pthread_mutex_lock(&sysev_mutex);
358 	sysev_replay_wait = 0;
359 	(void) pthread_cond_broadcast(&sysev_cv);
360 	(void) pthread_mutex_unlock(&sysev_mutex);
361 }
362 
363 static const fmd_prop_t sysev_props[] = {
364 	{ "class", FMD_TYPE_STRING, EC_ALL },		/* event class */
365 	{ "device", FMD_TYPE_STRING, NULL },		/* replay device */
366 	{ "channel", FMD_TYPE_STRING, FM_ERROR_CHAN },	/* channel name */
367 	{ "sid", FMD_TYPE_STRING, "fmd" },		/* subscriber id */
368 	{ NULL, 0, NULL }
369 };
370 
371 static const fmd_hdl_ops_t sysev_ops = {
372 	NULL,		/* fmdo_recv */
373 	sysev_replay,	/* fmdo_timeout */
374 	NULL,		/* fmdo_close */
375 	NULL,		/* fmdo_stats */
376 	NULL,		/* fmdo_gc */
377 	NULL,		/* fmdo_send */
378 };
379 
380 static const fmd_hdl_info_t sysev_info = {
381 	"SysEvent Transport Agent", "1.0", &sysev_ops, sysev_props
382 };
383 
384 /*
385  * Bind to the sysevent channel we use for listening for error events and then
386  * subscribe to appropriate events received over this channel.
387  */
388 void
389 sysev_init(fmd_hdl_t *hdl)
390 {
391 	uint_t flags;
392 
393 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &sysev_info) != 0)
394 		return; /* invalid property settings */
395 
396 	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (sysev_stats) /
397 	    sizeof (fmd_stat_t), (fmd_stat_t *)&sysev_stats);
398 
399 	sysev_channel = fmd_prop_get_string(hdl, "channel");
400 	sysev_class = fmd_prop_get_string(hdl, "class");
401 	sysev_device = fmd_prop_get_string(hdl, "device");
402 	sysev_sid = fmd_prop_get_string(hdl, "sid");
403 
404 	if (sysev_channel == NULL)
405 		fmd_hdl_abort(hdl, "channel property must be defined\n");
406 
407 	if (sysev_sid == NULL)
408 		fmd_hdl_abort(hdl, "sid property must be defined\n");
409 
410 	if ((errno = sysevent_evc_bind(sysev_channel, &sysev_evc,
411 	    EVCH_CREAT | EVCH_HOLD_PEND)) != 0) {
412 		fmd_hdl_abort(hdl, "failed to bind to event transport "
413 		    "channel %s", sysev_channel);
414 	}
415 
416 	sysev_xprt = fmd_xprt_open(hdl, FMD_XPRT_RDONLY, NULL, NULL);
417 	sysev_hdl = hdl;
418 
419 	/*
420 	 * If we're subscribing to the default channel, keep our subscription
421 	 * active even if we die unexpectedly so we continue queuing events.
422 	 * If we're not (e.g. running under fmsim), do not specify SUB_KEEP so
423 	 * that our event channel will be destroyed if we die unpleasantly.
424 	 */
425 	if (strcmp(sysev_channel, FM_ERROR_CHAN) == 0)
426 		flags = EVCH_SUB_KEEP | EVCH_SUB_DUMP;
427 	else
428 		flags = EVCH_SUB_DUMP;
429 
430 	errno = sysevent_evc_subscribe(sysev_evc,
431 	    sysev_sid, sysev_class, sysev_recv, sysev_xprt, flags);
432 
433 	if (errno != 0) {
434 		if (errno == EEXIST) {
435 			fmd_hdl_abort(hdl, "another fault management daemon is "
436 			    "active on transport channel %s\n", sysev_channel);
437 		} else {
438 			fmd_hdl_abort(hdl, "failed to subscribe to %s on "
439 			    "transport channel %s", sysev_class, sysev_channel);
440 		}
441 	}
442 
443 	/*
444 	 * Once the transport is open, install a single timer to fire at once
445 	 * in the context of the module's thread to run sysev_replay().  This
446 	 * thread will block in its first fmd_xprt_post() until fmd is ready.
447 	 */
448 	fmd_hdl_debug(hdl, "transport '%s' open\n", sysev_channel);
449 	(void) fmd_timer_install(hdl, NULL, NULL, 0);
450 }
451 
452 /*
453  * Close the channel by unsubscribing and unbinding.  We only do this when a
454  * a non-default channel has been selected.  If we're using FM_ERROR_CHAN,
455  * the system default, we do *not* want to unsubscribe because the kernel will
456  * remove the subscriber queue and any events published in our absence will
457  * therefore be lost.  This scenario may occur when, for example, fmd is sent
458  * a SIGTERM by init(1M) during reboot but an error is detected and makes it
459  * into the sysevent channel queue before init(1M) manages to call uadmin(2).
460  */
461 void
462 sysev_fini(fmd_hdl_t *hdl)
463 {
464 	if (strcmp(sysev_channel, FM_ERROR_CHAN) != 0) {
465 		sysevent_evc_unsubscribe(sysev_evc, sysev_sid);
466 		sysevent_evc_unbind(sysev_evc);
467 	}
468 
469 	if (sysev_xprt != NULL) {
470 		/*
471 		 * Wait callback returns before destroy the transport.
472 		 */
473 		(void) pthread_mutex_lock(&sysev_mutex);
474 		sysev_exiting = 1;
475 		while (sysev_xprt_refcnt > 0)
476 			(void) pthread_cond_wait(&sysev_cv, &sysev_mutex);
477 		(void) pthread_mutex_unlock(&sysev_mutex);
478 		fmd_xprt_close(hdl, sysev_xprt);
479 	}
480 
481 	fmd_prop_free_string(hdl, sysev_class);
482 	fmd_prop_free_string(hdl, sysev_channel);
483 	fmd_prop_free_string(hdl, sysev_device);
484 	fmd_prop_free_string(hdl, sysev_sid);
485 }
486