xref: /titanic_50/usr/src/cmd/fm/fmd/common/fmd_sysevent.c (revision febcc4a52c3ed7fe3a106da2c2ba52c56afd5111)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/sysevent/eventdefs.h>
30 #include <sys/sysevent.h>
31 #include <sys/sysevent_impl.h>
32 #include <sys/fm/protocol.h>
33 #include <sys/sysmacros.h>
34 #include <sys/dumphdr.h>
35 #include <sys/dumpadm.h>
36 #include <sys/fm/util.h>
37 
38 #include <libsysevent.h>
39 #include <libnvpair.h>
40 #include <alloca.h>
41 #include <limits.h>
42 #include <strings.h>
43 #include <unistd.h>
44 #include <fcntl.h>
45 #include <errno.h>
46 
47 #undef MUTEX_HELD
48 #undef RW_READ_HELD
49 #undef RW_WRITE_HELD
50 
51 #include <fmd_api.h>
52 #include <fmd_log.h>
53 #include <fmd_subr.h>
54 #include <fmd_dispq.h>
55 #include <fmd_module.h>
56 #include <fmd_scheme.h>
57 #include <fmd_error.h>
58 
59 #include <fmd.h>
60 
/*
 * Module-wide state.  The four property strings are fetched with
 * fmd_prop_get_string() in sysev_init() and released with
 * fmd_prop_free_string() in sysev_fini().
 */
61 static char *sysev_channel;	/* event channel to which we are subscribed */
62 static char *sysev_class;	/* event class to which we are subscribed */
63 static char *sysev_device;	/* device path to use for replaying events */
64 static char *sysev_sid;		/* event channel subscriber identifier */
65 static void *sysev_evc;		/* event channel cookie from evc_bind */
66 
67 static fmd_xprt_t *sysev_xprt;	/* transport opened by sysev_init() */
68 static fmd_hdl_t *sysev_hdl;	/* module handle saved by sysev_init() */
69 
/*
 * Statistics kept by this module and handed to fmd_stat_create() in
 * sysev_init().  The initializer entries must stay in the same order as the
 * structure members because the structure is passed as an array of
 * fmd_stat_t.
 */
70 static struct sysev_stats {
71 	fmd_stat_t dump_replay;	/* bumped per record posted by sysev_replay() */
72 	fmd_stat_t dump_lost;	/* bumped on read, checksum, or unpack failure */
73 	fmd_stat_t bad_class;	/* bumped when sysev_recv() sees class != EC_FM */
74 	fmd_stat_t bad_attr;	/* bumped when the attribute list is unusable */
75 	fmd_stat_t eagain;	/* bumped when sysev_recv() returns EAGAIN */
76 } sysev_stats = {
77 	{ "dump_replay", FMD_TYPE_UINT64, "events replayed from dump device" },
78 	{ "dump_lost", FMD_TYPE_UINT64, "events lost from dump device" },
79 	{ "bad_class", FMD_TYPE_UINT64, "events dropped due to invalid class" },
80 	{ "bad_attr", FMD_TYPE_UINT64, "events dropped due to invalid nvlist" },
81 	{ "eagain", FMD_TYPE_UINT64, "events retried due to low memory" },
82 };
83 
/*
 * Gate that holds sysev_recv() callers until sysev_replay() has finished.
 * sysev_replay_wait starts out non-zero; sysev_replay() clears it and
 * broadcasts on sysev_replay_cv while holding sysev_replay_mutex, and
 * sysev_recv() waits on the condition variable for as long as it is non-zero.
 */
84 static pthread_cond_t sysev_replay_cv = PTHREAD_COND_INITIALIZER;
85 static pthread_mutex_t sysev_replay_mutex = PTHREAD_MUTEX_INITIALIZER;
86 static int sysev_replay_wait = 1;
87 
88 /*
89  * Receive an event from the SysEvent channel and post it to our transport.
90  * Under extreme low-memory situations where we cannot event unpack the event,
91  * we can request that SysEvent redeliver the event later by returning EAGAIN.
92  * If we do this too many times, the kernel will drop the event.  Rather than
93  * keeping state per-event, we simply attempt a garbage-collect, hoping that
94  * enough free memory will be available by the time the event is redelivered.
95  */
96 static int
97 sysev_recv(sysevent_t *sep, void *arg)
98 {
99 	uint64_t seq = sysevent_get_seq(sep);
100 	fmd_xprt_t *xp = arg;
101 	nvlist_t *nvl;
102 	hrtime_t hrt;
103 
104 	(void) pthread_mutex_lock(&sysev_replay_mutex);
105 	while (sysev_replay_wait)
106 		(void) pthread_cond_wait(&sysev_replay_cv, &sysev_replay_mutex);
107 	(void) pthread_mutex_unlock(&sysev_replay_mutex);
108 
109 	if (strcmp(sysevent_get_class_name(sep), EC_FM) != 0) {
110 		fmd_hdl_error(sysev_hdl, "discarding event 0x%llx: unexpected"
111 		    " transport class %s\n", seq, sysevent_get_class_name(sep));
112 		sysev_stats.bad_class.fmds_value.ui64++;
113 		return (0);
114 	}
115 
116 	if (sysevent_get_attr_list(sep, &nvl) != 0) {
117 		if (errno == EAGAIN || errno == ENOMEM) {
118 			fmd_modhash_tryapply(fmd.d_mod_hash, fmd_module_trygc);
119 			fmd_scheme_hash_trygc(fmd.d_schemes);
120 			sysev_stats.eagain.fmds_value.ui64++;
121 			return (EAGAIN);
122 		}
123 
124 		fmd_hdl_error(sysev_hdl, "discarding event 0x%llx: missing "
125 		    "or invalid payload", seq);
126 		sysev_stats.bad_attr.fmds_value.ui64++;
127 		return (0);
128 	}
129 
130 	sysevent_get_time(sep, &hrt);
131 	fmd_xprt_post(sysev_hdl, xp, nvl, hrt);
132 	return (0);
133 }
134 
/*
 * Compute the 32-bit checksum used by the dump transport to verify the
 * content of error reports saved on the dump device (a copy of the kernel's
 * checksum32()).  Starting from zero, the running sum is rotated right by one
 * bit and the next byte is added, for each of the 'length' bytes at 'cp_arg'.
 * A zero-length buffer yields a checksum of zero.
 */
static uint32_t
sysev_checksum(void *cp_arg, size_t length)
{
	const unsigned char *bytes = cp_arg;
	uint32_t cksum = 0;
	size_t i;

	for (i = 0; i < length; i++) {
		cksum = (cksum >> 1) | (cksum << 31);	/* rotate right by one */
		cksum += bytes[i];
	}

	return (cksum);
}
150 
151 /*
152  * Replay saved events from the dump transport.  This function is installed as
153  * the timer callback and is called only once during the module's lifetime.
154  */
155 /*ARGSUSED*/
156 static void
157 sysev_replay(fmd_hdl_t *hdl, id_t id, void *arg)
158 {
159 	char *dumpdev;
160 	off64_t off, off0;
161 	int fd, err;
162 
163 	/*
164 	 * Determine the appropriate dump device to use for replaying pending
165 	 * error reports.  If the device property is NULL (default), we
166 	 * open and query /dev/dump to determine the current dump device.
167 	 */
168 	if ((dumpdev = sysev_device) == NULL) {
169 		if ((fd = open("/dev/dump", O_RDONLY)) == -1) {
170 			fmd_hdl_error(hdl, "failed to open /dev/dump "
171 			    "to locate dump device for event replay");
172 			goto done;
173 		}
174 
175 		dumpdev = alloca(PATH_MAX);
176 		err = ioctl(fd, DIOCGETDEV, dumpdev);
177 		(void) close(fd);
178 
179 		if (err == -1) {
180 			if (errno != ENODEV) {
181 				fmd_hdl_error(hdl, "failed to obtain "
182 				    "path to dump device for event replay");
183 			}
184 			goto done;
185 		}
186 	}
187 
188 	if (strcmp(dumpdev, "/dev/null") == 0)
189 		goto done; /* return silently and skip replay for /dev/null */
190 
191 	/*
192 	 * Open the appropriate device and then determine the offset of the
193 	 * start of the ereport dump region located at the end of the device.
194 	 */
195 	if ((fd = open64(dumpdev, O_RDWR | O_DSYNC)) == -1) {
196 		fmd_hdl_error(hdl, "failed to open dump transport %s "
197 		    "(pending events will not be replayed)", dumpdev);
198 		goto done;
199 	}
200 
201 	off = DUMP_OFFSET + DUMP_LOGSIZE + DUMP_ERPTSIZE;
202 	off = off0 = lseek64(fd, -off, SEEK_END) & -DUMP_OFFSET;
203 
204 	if (off == (off64_t)-1LL) {
205 		fmd_hdl_error(hdl, "failed to seek dump transport %s "
206 		    "(pending events will not be replayed)", dumpdev);
207 		(void) close(fd);
208 		goto done;
209 	}
210 
211 	/*
212 	 * The ereport dump region is a sequence of erpt_dump_t headers each of
213 	 * which is followed by packed nvlist data.  We iterate over them in
214 	 * order, unpacking and dispatching each one to our dispatch queue.
215 	 */
216 	for (;;) {
217 		char nvbuf[ERPT_DATA_SZ];
218 		uint32_t chksum;
219 		erpt_dump_t ed;
220 		nvlist_t *nvl;
221 
222 		fmd_timeval_t ftv, tod;
223 		hrtime_t hrt;
224 		uint64_t ena;
225 
226 		if (pread64(fd, &ed, sizeof (ed), off) != sizeof (ed)) {
227 			fmd_hdl_error(hdl, "failed to read from dump "
228 			    "transport %s (pending events lost)", dumpdev);
229 			break;
230 		}
231 
232 		if (ed.ed_magic == 0 && ed.ed_size == 0)
233 			break; /* end of list: all zero */
234 
235 		if (ed.ed_magic == 0) {
236 			off += sizeof (ed) + ed.ed_size;
237 			continue; /* continue searching */
238 		}
239 
240 		if (ed.ed_magic != ERPT_MAGIC) {
241 			/*
242 			 * Stop reading silently if the first record has the
243 			 * wrong magic number; this likely indicates that we
244 			 * rebooted from non-FMA bits or paged over the dump.
245 			 */
246 			if (off == off0)
247 				break;
248 
249 			fmd_hdl_error(hdl, "invalid dump transport "
250 			    "record at %llx (magic number %x, expected %x)\n",
251 			    (u_longlong_t)off, ed.ed_magic, ERPT_MAGIC);
252 			break;
253 		}
254 
255 		if (ed.ed_size > ERPT_DATA_SZ) {
256 			fmd_hdl_error(hdl, "invalid dump transport "
257 			    "record at %llx size (%u exceeds limit)\n",
258 			    (u_longlong_t)off, ed.ed_size);
259 			break;
260 		}
261 
262 		if (pread64(fd, nvbuf, ed.ed_size,
263 		    off + sizeof (ed)) != ed.ed_size) {
264 			fmd_hdl_error(hdl, "failed to read dump "
265 			    "transport event (offset %llx)", (u_longlong_t)off);
266 
267 			sysev_stats.dump_lost.fmds_value.ui64++;
268 			goto next;
269 		}
270 
271 		if ((chksum = sysev_checksum(nvbuf,
272 		    ed.ed_size)) != ed.ed_chksum) {
273 			fmd_hdl_error(hdl, "dump transport event at "
274 			    "offset %llx is corrupt (checksum %x != %x)\n",
275 			    (u_longlong_t)off, chksum, ed.ed_chksum);
276 
277 			sysev_stats.dump_lost.fmds_value.ui64++;
278 			goto next;
279 		}
280 
281 		if ((err = nvlist_xunpack(nvbuf,
282 		    ed.ed_size, &nvl, &fmd.d_nva)) != 0) {
283 			fmd_hdl_error(hdl, "failed to unpack dump "
284 			    "transport event at offset %llx: %s\n",
285 			    (u_longlong_t)off, fmd_strerror(err));
286 
287 			sysev_stats.dump_lost.fmds_value.ui64++;
288 			goto next;
289 		}
290 
291 		/*
292 		 * If ed_hrt_nsec is set it contains the gethrtime() value from
293 		 * when the event was originally enqueued for the transport.
294 		 * If it is zero, we use the weaker bound ed_hrt_base instead.
295 		 */
296 		if (ed.ed_hrt_nsec != 0)
297 			hrt = ed.ed_hrt_nsec;
298 		else
299 			hrt = ed.ed_hrt_base;
300 
301 		/*
302 		 * If this is an FMA protocol event of class "ereport.*" that
303 		 * contains valid ENA, we can improve the precision of 'hrt'.
304 		 */
305 		if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) == 0)
306 			hrt = fmd_time_ena2hrt(hrt, ena);
307 
308 		/*
309 		 * Now convert 'hrt' to an adjustable TOD based on the values
310 		 * in ed_tod_base which correspond to one another and are
311 		 * sampled before reboot using the old gethrtime() clock.
312 		 * fmd_event_recreate() will use this TOD value to re-assign
313 		 * the event an updated gethrtime() value based on the current
314 		 * value of the non-adjustable gethrtime() clock.  Phew.
315 		 */
316 		tod.ftv_sec = ed.ed_tod_base.sec;
317 		tod.ftv_nsec = ed.ed_tod_base.nsec;
318 		fmd_time_hrt2tod(ed.ed_hrt_base, &tod, hrt, &ftv);
319 
320 		(void) nvlist_remove_all(nvl, FMD_EVN_TOD);
321 		(void) nvlist_add_uint64_array(nvl,
322 		    FMD_EVN_TOD, (uint64_t *)&ftv, 2);
323 
324 		fmd_xprt_post(hdl, sysev_xprt, nvl, 0);
325 		sysev_stats.dump_replay.fmds_value.ui64++;
326 
327 next:
328 		/*
329 		 * Reset the magic number for the event record to zero so that
330 		 * we do not replay the same event multiple times.
331 		 */
332 		ed.ed_magic = 0;
333 
334 		if (pwrite64(fd, &ed, sizeof (ed), off) != sizeof (ed)) {
335 			fmd_hdl_error(hdl, "failed to mark dump "
336 			    "transport event (offset %llx)", (u_longlong_t)off);
337 		}
338 
339 		off += sizeof (ed) + ed.ed_size;
340 	}
341 
342 	(void) close(fd);
343 done:
344 	(void) pthread_mutex_lock(&sysev_replay_mutex);
345 	sysev_replay_wait = 0;
346 	(void) pthread_cond_broadcast(&sysev_replay_cv);
347 	(void) pthread_mutex_unlock(&sysev_replay_mutex);
348 }
349 
/*
 * Properties understood by this module; sysev_init() reads each of them with
 * fmd_prop_get_string().  The third field of each entry appears to be the
 * default value (sysev_replay() treats a NULL "device" as the default).
 * The table is terminated by an all-NULL entry.
 */
350 static const fmd_prop_t sysev_props[] = {
351 	{ "class", FMD_TYPE_STRING, EC_ALL },		/* event class */
352 	{ "device", FMD_TYPE_STRING, NULL },		/* replay device */
353 	{ "channel", FMD_TYPE_STRING, FM_ERROR_CHAN },	/* channel name */
354 	{ "sid", FMD_TYPE_STRING, "fmd" },		/* subscriber id */
355 	{ NULL, 0, NULL }
356 };
357 
/*
 * Module entry points.  Only the timeout entry point is provided: it runs
 * sysev_replay() when the timer installed by sysev_init() fires.  The
 * initializer is positional, so the entries must stay in the order of the
 * fmd_hdl_ops_t members named in the trailing comments.
 */
358 static const fmd_hdl_ops_t sysev_ops = {
359 	NULL,		/* fmdo_recv */
360 	sysev_replay,	/* fmdo_timeout */
361 	NULL,		/* fmdo_close */
362 	NULL,		/* fmdo_stats */
363 	NULL,		/* fmdo_gc */
364 	NULL,		/* fmdo_send */
365 };
366 
/*
 * Registration record handed to fmd_hdl_register() by sysev_init().  It ties
 * together two strings (presumably the module description and version --
 * confirm against fmd_hdl_info_t), the entry points in sysev_ops, and the
 * property table sysev_props.
 */
367 static const fmd_hdl_info_t sysev_info = {
368 	"SysEvent Transport Agent", "1.0", &sysev_ops, sysev_props
369 };
370 
371 /*
372  * Bind to the sysevent channel we use for listening for error events and then
373  * subscribe to appropriate events received over this channel.
374  */
375 void
376 sysev_init(fmd_hdl_t *hdl)
377 {
378 	uint_t flags;
379 
380 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &sysev_info) != 0)
381 		return; /* invalid property settings */
382 
383 	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (sysev_stats) /
384 	    sizeof (fmd_stat_t), (fmd_stat_t *)&sysev_stats);
385 
386 	sysev_channel = fmd_prop_get_string(hdl, "channel");
387 	sysev_class = fmd_prop_get_string(hdl, "class");
388 	sysev_device = fmd_prop_get_string(hdl, "device");
389 	sysev_sid = fmd_prop_get_string(hdl, "sid");
390 
391 	if (sysev_channel == NULL)
392 		fmd_hdl_abort(hdl, "channel property must be defined\n");
393 
394 	if (sysev_sid == NULL)
395 		fmd_hdl_abort(hdl, "sid property must be defined\n");
396 
397 	if ((errno = sysevent_evc_bind(sysev_channel, &sysev_evc,
398 	    EVCH_CREAT | EVCH_HOLD_PEND)) != 0) {
399 		fmd_hdl_abort(hdl, "failed to bind to event transport "
400 		    "channel %s", sysev_channel);
401 	}
402 
403 	sysev_xprt = fmd_xprt_open(hdl, FMD_XPRT_RDONLY, NULL, NULL);
404 	sysev_hdl = hdl;
405 
406 	/*
407 	 * If we're subscribing to the default channel, keep our subscription
408 	 * active even if we die unexpectedly so we continue queuing events.
409 	 * If we're not (e.g. running under fmsim), do not specify SUB_KEEP so
410 	 * that our event channel will be destroyed if we die unpleasantly.
411 	 */
412 	if (strcmp(sysev_channel, FM_ERROR_CHAN) == 0)
413 		flags = EVCH_SUB_KEEP | EVCH_SUB_DUMP;
414 	else
415 		flags = EVCH_SUB_DUMP;
416 
417 	errno = sysevent_evc_subscribe(sysev_evc,
418 	    sysev_sid, sysev_class, sysev_recv, sysev_xprt, flags);
419 
420 	if (errno != 0) {
421 		if (errno == EEXIST) {
422 			fmd_hdl_abort(hdl, "another fault management daemon is "
423 			    "active on transport channel %s\n", sysev_channel);
424 		} else {
425 			fmd_hdl_abort(hdl, "failed to subscribe to %s on "
426 			    "transport channel %s", sysev_class, sysev_channel);
427 		}
428 	}
429 
430 	/*
431 	 * Once the transport is open, install a single timer to fire at once
432 	 * in the context of the module's thread to run sysev_replay().  This
433 	 * thread will block in its first fmd_xprt_post() until fmd is ready.
434 	 */
435 	fmd_hdl_debug(hdl, "transport '%s' open\n", sysev_channel);
436 	(void) fmd_timer_install(hdl, NULL, NULL, 0);
437 }
438 
439 /*
440  * Close the channel by unsubscribing and unbinding.  We only do this when a
441  * a non-default channel has been selected.  If we're using FM_ERROR_CHAN,
442  * the system default, we do *not* want to unsubscribe because the kernel will
443  * remove the subscriber queue and any events published in our absence will
444  * therefore be lost.  This scenario may occur when, for example, fmd is sent
445  * a SIGTERM by init(1M) during reboot but an error is detected and makes it
446  * into the sysevent channel queue before init(1M) manages to call uadmin(2).
447  */
448 void
449 sysev_fini(fmd_hdl_t *hdl)
450 {
451 	if (strcmp(sysev_channel, FM_ERROR_CHAN) != 0) {
452 		sysevent_evc_unsubscribe(sysev_evc, sysev_sid);
453 		sysevent_evc_unbind(sysev_evc);
454 	}
455 
456 	if (sysev_xprt != NULL)
457 		fmd_xprt_close(hdl, sysev_xprt);
458 
459 	fmd_prop_free_string(hdl, sysev_class);
460 	fmd_prop_free_string(hdl, sysev_channel);
461 	fmd_prop_free_string(hdl, sysev_device);
462 	fmd_prop_free_string(hdl, sysev_sid);
463 }
464