xref: /titanic_41/usr/src/cmd/fm/fmd/common/fmd_sysevent.c (revision 86aa80977b878fdd7d76a9ee3f2119010f4b8ec0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
25  * Use is subject to license terms.
26  */
27 
28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
29 
30 #include <sys/sysevent/eventdefs.h>
31 #include <sys/sysevent.h>
32 #include <sys/sysevent_impl.h>
33 #include <sys/fm/protocol.h>
34 #include <sys/sysmacros.h>
35 #include <sys/dumphdr.h>
36 #include <sys/dumpadm.h>
37 #include <sys/fm/util.h>
38 
39 #include <libsysevent.h>
40 #include <libnvpair.h>
41 #include <alloca.h>
42 #include <limits.h>
43 #include <strings.h>
44 #include <unistd.h>
45 #include <fcntl.h>
46 #include <errno.h>
47 
48 #undef MUTEX_HELD
49 #undef RW_READ_HELD
50 #undef RW_WRITE_HELD
51 
52 #include <fmd_api.h>
53 #include <fmd_log.h>
54 #include <fmd_subr.h>
55 #include <fmd_dispq.h>
56 #include <fmd_module.h>
57 #include <fmd_scheme.h>
58 #include <fmd_error.h>
59 
60 #include <fmd.h>
61 
static char *sysev_channel;	/* event channel to which we are subscribed */
static char *sysev_class;	/* event class to which we are subscribed */
static char *sysev_device;	/* device path to use for replaying events */
static char *sysev_sid;		/* event channel subscriber identifier */
static void *sysev_evc;		/* event channel cookie from evc_bind */

static fmd_xprt_t *sysev_xprt;	/* transport opened in sysev_init() */
static fmd_hdl_t *sysev_hdl;	/* module handle saved by sysev_init() */
70 
/*
 * Module statistics.  sysev_init() registers this structure with
 * fmd_stat_create() as an array of fmd_stat_t using FMD_STAT_NOALLOC, so
 * every member must be an fmd_stat_t.  The counters are incremented directly
 * by sysev_recv() and sysev_replay().
 */
static struct sysev_stats {
	fmd_stat_t dump_replay;
	fmd_stat_t dump_lost;
	fmd_stat_t bad_class;
	fmd_stat_t bad_attr;
	fmd_stat_t eagain;
} sysev_stats = {
	{ "dump_replay", FMD_TYPE_UINT64, "events replayed from dump device" },
	{ "dump_lost", FMD_TYPE_UINT64, "events lost from dump device" },
	{ "bad_class", FMD_TYPE_UINT64, "events dropped due to invalid class" },
	{ "bad_attr", FMD_TYPE_UINT64, "events dropped due to invalid nvlist" },
	{ "eagain", FMD_TYPE_UINT64, "events retried due to low memory" },
};
84 
85 /*
86  * Receive an event from the SysEvent channel and post it to our transport.
87  * Under extreme low-memory situations where we cannot even unpack the event,
88  * we can request that SysEvent redeliver the event later by returning EAGAIN.
89  * If we do this too many times, the kernel will drop the event.  Rather than
90  * keeping state per-event, we simply attempt a garbage-collect, hoping that
91  * enough free memory will be available by the time the event is redelivered.
92  */
93 static int
94 sysev_recv(sysevent_t *sep, void *arg)
95 {
96 	uint64_t seq = sysevent_get_seq(sep);
97 	fmd_xprt_t *xp = arg;
98 	nvlist_t *nvl;
99 	hrtime_t hrt;
100 
101 	if (strcmp(sysevent_get_class_name(sep), EC_FM) != 0) {
102 		fmd_hdl_error(sysev_hdl, "discarding event 0x%llx: unexpected"
103 		    " transport class %s\n", seq, sysevent_get_class_name(sep));
104 		sysev_stats.bad_class.fmds_value.ui64++;
105 		return (0);
106 	}
107 
108 	if (sysevent_get_attr_list(sep, &nvl) != 0) {
109 		if (errno == EAGAIN || errno == ENOMEM) {
110 			fmd_modhash_tryapply(fmd.d_mod_hash, fmd_module_trygc);
111 			fmd_scheme_hash_trygc(fmd.d_schemes);
112 			sysev_stats.eagain.fmds_value.ui64++;
113 			return (EAGAIN);
114 		}
115 
116 		fmd_hdl_error(sysev_hdl, "discarding event 0x%llx: missing "
117 		    "or invalid payload", seq);
118 		sysev_stats.bad_attr.fmds_value.ui64++;
119 		return (0);
120 	}
121 
122 	sysevent_get_time(sep, &hrt);
123 	fmd_xprt_post(sysev_hdl, xp, nvl, hrt);
124 	return (0);
125 }
126 
127 /*
128  * Checksum algorithm used by the dump transport for verifying the content of
129  * error reports saved on the dump device (copy of the kernel's checksum32()).
130  */
static uint32_t
sysev_checksum(const void *cp_arg, size_t length)
{
	/* the buffer is only read, so walk it through const byte pointers */
	const unsigned char *cp = cp_arg;
	const unsigned char *ep = cp + length;
	uint32_t sum = 0;

	/*
	 * For each byte: rotate the 32-bit running sum right by one bit and
	 * then add the byte value.  A zero-length buffer yields zero.
	 */
	for (; cp < ep; cp++)
		sum = ((sum >> 1) | (sum << 31)) + *cp;

	return (sum);
}
142 
143 /*
144  * Replay saved events from the dump transport.  This function is installed as
145  * the timer callback and is called only once during the module's lifetime.
146  */
147 /*ARGSUSED*/
148 static void
149 sysev_replay(fmd_hdl_t *hdl, id_t id, void *arg)
150 {
151 	char *dumpdev;
152 	off64_t off, off0;
153 	int fd, err;
154 
155 	/*
156 	 * Determine the appropriate dump device to use for replaying pending
157 	 * error reports.  If the device property is NULL (default), we
158 	 * open and query /dev/dump to determine the current dump device.
159 	 */
160 	if ((dumpdev = sysev_device) == NULL) {
161 		if ((fd = open("/dev/dump", O_RDONLY)) == -1) {
162 			fmd_hdl_error(hdl, "failed to open /dev/dump "
163 			    "to locate dump device for event replay");
164 			return;
165 		}
166 
167 		dumpdev = alloca(PATH_MAX);
168 		err = ioctl(fd, DIOCGETDEV, dumpdev);
169 		(void) close(fd);
170 
171 		if (err == -1) {
172 			if (errno != ENODEV) {
173 				fmd_hdl_error(hdl, "failed to obtain "
174 				    "path to dump device for event replay");
175 			}
176 			return;
177 		}
178 	}
179 
180 	if (strcmp(dumpdev, "/dev/null") == 0)
181 		return; /* return silently and skip replay for /dev/null */
182 
183 	/*
184 	 * Open the appropriate device and then determine the offset of the
185 	 * start of the ereport dump region located at the end of the device.
186 	 */
187 	if ((fd = open64(dumpdev, O_RDWR | O_DSYNC)) == -1) {
188 		fmd_hdl_error(hdl, "failed to open dump transport %s "
189 		    "(pending events will not be replayed)", dumpdev);
190 		return;
191 	}
192 
193 	off = DUMP_OFFSET + DUMP_LOGSIZE + DUMP_ERPTSIZE;
194 	off = off0 = lseek64(fd, -off, SEEK_END) & -DUMP_OFFSET;
195 
196 	if (off == (off64_t)-1LL) {
197 		fmd_hdl_error(hdl, "failed to seek dump transport %s "
198 		    "(pending events will not be replayed)", dumpdev);
199 		(void) close(fd);
200 		return;
201 	}
202 
203 	/*
204 	 * The ereport dump region is a sequence of erpt_dump_t headers each of
205 	 * which is followed by packed nvlist data.  We iterate over them in
206 	 * order, unpacking and dispatching each one to our dispatch queue.
207 	 */
208 	for (;;) {
209 		char nvbuf[ERPT_DATA_SZ];
210 		uint32_t chksum;
211 		erpt_dump_t ed;
212 		nvlist_t *nvl;
213 
214 		fmd_timeval_t ftv, tod;
215 		hrtime_t hrt;
216 		uint64_t ena;
217 
218 		if (pread64(fd, &ed, sizeof (ed), off) != sizeof (ed)) {
219 			fmd_hdl_error(hdl, "failed to read from dump "
220 			    "transport %s (pending events lost)", dumpdev);
221 			break;
222 		}
223 
224 		if (ed.ed_magic == 0 && ed.ed_size == 0)
225 			break; /* end of list: all zero */
226 
227 		if (ed.ed_magic == 0) {
228 			off += sizeof (ed) + ed.ed_size;
229 			continue; /* continue searching */
230 		}
231 
232 		if (ed.ed_magic != ERPT_MAGIC) {
233 			/*
234 			 * Stop reading silently if the first record has the
235 			 * wrong magic number; this likely indicates that we
236 			 * rebooted from non-FMA bits or paged over the dump.
237 			 */
238 			if (off == off0)
239 				break;
240 
241 			fmd_hdl_error(hdl, "invalid dump transport "
242 			    "record at %llx (magic number %x, expected %x)\n",
243 			    (u_longlong_t)off, ed.ed_magic, ERPT_MAGIC);
244 			break;
245 		}
246 
247 		if (ed.ed_size > ERPT_DATA_SZ) {
248 			fmd_hdl_error(hdl, "invalid dump transport "
249 			    "record at %llx size (%u exceeds limit)\n",
250 			    (u_longlong_t)off, ed.ed_size);
251 			break;
252 		}
253 
254 		if (pread64(fd, nvbuf, ed.ed_size,
255 		    off + sizeof (ed)) != ed.ed_size) {
256 			fmd_hdl_error(hdl, "failed to read dump "
257 			    "transport event (offset %llx)", (u_longlong_t)off);
258 
259 			sysev_stats.dump_lost.fmds_value.ui64++;
260 			goto next;
261 		}
262 
263 		if ((chksum = sysev_checksum(nvbuf,
264 		    ed.ed_size)) != ed.ed_chksum) {
265 			fmd_hdl_error(hdl, "dump transport event at "
266 			    "offset %llx is corrupt (checksum %x != %x)\n",
267 			    (u_longlong_t)off, chksum, ed.ed_chksum);
268 
269 			sysev_stats.dump_lost.fmds_value.ui64++;
270 			goto next;
271 		}
272 
273 		if ((err = nvlist_xunpack(nvbuf,
274 		    ed.ed_size, &nvl, &fmd.d_nva)) != 0) {
275 			fmd_hdl_error(hdl, "failed to unpack dump "
276 			    "transport event at offset %llx: %s\n",
277 			    (u_longlong_t)off, fmd_strerror(err));
278 
279 			sysev_stats.dump_lost.fmds_value.ui64++;
280 			goto next;
281 		}
282 
283 		/*
284 		 * If ed_hrt_nsec is set it contains the gethrtime() value from
285 		 * when the event was originally enqueued for the transport.
286 		 * If it is zero, we use the weaker bound ed_hrt_base instead.
287 		 */
288 		if (ed.ed_hrt_nsec != 0)
289 			hrt = ed.ed_hrt_nsec;
290 		else
291 			hrt = ed.ed_hrt_base;
292 
293 		/*
294 		 * If this is an FMA protocol event of class "ereport.*" that
295 		 * contains valid ENA, we can improve the precision of 'hrt'.
296 		 */
297 		if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) == 0)
298 			hrt = fmd_time_ena2hrt(hrt, ena);
299 
300 		/*
301 		 * Now convert 'hrt' to an adjustable TOD based on the values
302 		 * in ed_tod_base which correspond to one another and are
303 		 * sampled before reboot using the old gethrtime() clock.
304 		 * fmd_event_recreate() will use this TOD value to re-assign
305 		 * the event an updated gethrtime() value based on the current
306 		 * value of the non-adjustable gethrtime() clock.  Phew.
307 		 */
308 		tod.ftv_sec = ed.ed_tod_base.sec;
309 		tod.ftv_nsec = ed.ed_tod_base.nsec;
310 		fmd_time_hrt2tod(ed.ed_hrt_base, &tod, hrt, &ftv);
311 
312 		(void) nvlist_remove_all(nvl, FMD_EVN_TOD);
313 		(void) nvlist_add_uint64_array(nvl,
314 		    FMD_EVN_TOD, (uint64_t *)&ftv, 2);
315 
316 		fmd_xprt_post(hdl, sysev_xprt, nvl, 0);
317 		sysev_stats.dump_replay.fmds_value.ui64++;
318 
319 next:
320 		/*
321 		 * Reset the magic number for the event record to zero so that
322 		 * we do not replay the same event multiple times.
323 		 */
324 		ed.ed_magic = 0;
325 
326 		if (pwrite64(fd, &ed, sizeof (ed), off) != sizeof (ed)) {
327 			fmd_hdl_error(hdl, "failed to mark dump "
328 			    "transport event (offset %llx)", (u_longlong_t)off);
329 		}
330 
331 		off += sizeof (ed) + ed.ed_size;
332 	}
333 
334 	(void) close(fd);
335 }
336 
/*
 * Properties declared by this module along with their default values.  The
 * values are read back with fmd_prop_get_string() in sysev_init().  The table
 * is terminated by an entry whose name is NULL.
 */
static const fmd_prop_t sysev_props[] = {
	{ "class", FMD_TYPE_STRING, EC_ALL },		/* event class */
	{ "device", FMD_TYPE_STRING, NULL },		/* replay device */
	{ "channel", FMD_TYPE_STRING, FM_ERROR_CHAN },	/* channel name */
	{ "sid", FMD_TYPE_STRING, "fmd" },		/* subscriber id */
	{ NULL, 0, NULL }
};
344 
/*
 * Module entry points.  Only the timeout entry point is provided: it runs
 * sysev_replay() for the timer that sysev_init() installs.
 */
static const fmd_hdl_ops_t sysev_ops = {
	NULL,		/* fmdo_recv */
	sysev_replay,	/* fmdo_timeout */
	NULL,		/* fmdo_close */
	NULL,		/* fmdo_stats */
	NULL,		/* fmdo_gc */
	NULL,		/* fmdo_send */
};
353 
/*
 * Module information passed to fmd_hdl_register() in sysev_init().  The two
 * strings are presumably the module description and version; they are
 * followed by the entry point table and the property table defined above.
 */
static const fmd_hdl_info_t sysev_info = {
	"SysEvent Transport Agent", "1.0", &sysev_ops, sysev_props
};
357 
358 /*
359  * Bind to the sysevent channel we use for listening for error events and then
360  * subscribe to appropriate events received over this channel.
361  */
362 void
363 sysev_init(fmd_hdl_t *hdl)
364 {
365 	uint_t flags;
366 
367 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &sysev_info) != 0)
368 		return; /* invalid property settings */
369 
370 	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (sysev_stats) /
371 	    sizeof (fmd_stat_t), (fmd_stat_t *)&sysev_stats);
372 
373 	sysev_channel = fmd_prop_get_string(hdl, "channel");
374 	sysev_class = fmd_prop_get_string(hdl, "class");
375 	sysev_device = fmd_prop_get_string(hdl, "device");
376 	sysev_sid = fmd_prop_get_string(hdl, "sid");
377 
378 	if (sysev_channel == NULL)
379 		fmd_hdl_abort(hdl, "channel property must be defined\n");
380 
381 	if (sysev_sid == NULL)
382 		fmd_hdl_abort(hdl, "sid property must be defined\n");
383 
384 	if ((errno = sysevent_evc_bind(sysev_channel, &sysev_evc,
385 	    EVCH_CREAT | EVCH_HOLD_PEND)) != 0) {
386 		fmd_hdl_abort(hdl, "failed to bind to event transport "
387 		    "channel %s", sysev_channel);
388 	}
389 
390 	sysev_xprt = fmd_xprt_open(hdl, FMD_XPRT_RDONLY, NULL, NULL);
391 	sysev_hdl = hdl;
392 
393 	/*
394 	 * If we're subscribing to the default channel, keep our subscription
395 	 * active even if we die unexpectedly so we continue queuing events.
396 	 * If we're not (e.g. running under fmsim), do not specify SUB_KEEP so
397 	 * that our event channel will be destroyed if we die unpleasantly.
398 	 */
399 	if (strcmp(sysev_channel, FM_ERROR_CHAN) == 0)
400 		flags = EVCH_SUB_KEEP | EVCH_SUB_DUMP;
401 	else
402 		flags = EVCH_SUB_DUMP;
403 
404 	errno = sysevent_evc_subscribe(sysev_evc,
405 	    sysev_sid, sysev_class, sysev_recv, sysev_xprt, flags);
406 
407 	if (errno != 0) {
408 		if (errno == EEXIST) {
409 			fmd_hdl_abort(hdl, "another fault management daemon is "
410 			    "active on transport channel %s\n", sysev_channel);
411 		} else {
412 			fmd_hdl_abort(hdl, "failed to subscribe to %s on "
413 			    "transport channel %s", sysev_class, sysev_channel);
414 		}
415 	}
416 
417 	/*
418 	 * Once the transport is open, install a single timer to fire at once
419 	 * in the context of the module's thread to run sysev_replay().  This
420 	 * thread will block in its first fmd_xprt_post() until fmd is ready.
421 	 */
422 	fmd_hdl_debug(hdl, "transport '%s' open\n", sysev_channel);
423 	(void) fmd_timer_install(hdl, NULL, NULL, 0);
424 }
425 
426 /*
427  * Close the channel by unsubscribing and unbinding.  We only do this when
428  * a non-default channel has been selected.  If we're using FM_ERROR_CHAN,
429  * the system default, we do *not* want to unsubscribe because the kernel will
430  * remove the subscriber queue and any events published in our absence will
431  * therefore be lost.  This scenario may occur when, for example, fmd is sent
432  * a SIGTERM by init(1M) during reboot but an error is detected and makes it
433  * into the sysevent channel queue before init(1M) manages to call uadmin(2).
434  */
435 void
436 sysev_fini(fmd_hdl_t *hdl)
437 {
438 	if (strcmp(sysev_channel, FM_ERROR_CHAN) != 0) {
439 		sysevent_evc_unsubscribe(sysev_evc, sysev_sid);
440 		sysevent_evc_unbind(sysev_evc);
441 	}
442 
443 	if (sysev_xprt != NULL)
444 		fmd_xprt_close(hdl, sysev_xprt);
445 
446 	fmd_prop_free_string(hdl, sysev_class);
447 	fmd_prop_free_string(hdl, sysev_channel);
448 	fmd_prop_free_string(hdl, sysev_device);
449 	fmd_prop_free_string(hdl, sysev_sid);
450 }
451