xref: /titanic_52/usr/src/cmd/fm/fmd/common/fmd_sysevent.c (revision 29949e866e40b95795203f3ee46f44a197c946e4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
25  * Use is subject to license terms.
26  */
27 
28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
29 
30 #include <sys/sysevent/eventdefs.h>
31 #include <sys/sysevent.h>
32 #include <sys/sysevent_impl.h>
33 #include <sys/fm/protocol.h>
34 #include <sys/sysmacros.h>
35 #include <sys/dumphdr.h>
36 #include <sys/dumpadm.h>
37 
38 #include <libsysevent.h>
39 #include <libnvpair.h>
40 #include <alloca.h>
41 #include <limits.h>
42 #include <strings.h>
43 #include <unistd.h>
44 #include <fcntl.h>
45 #include <errno.h>
46 
47 #undef MUTEX_HELD
48 #undef RW_READ_HELD
49 #undef RW_WRITE_HELD
50 
51 #include <fmd_api.h>
52 #include <fmd_log.h>
53 #include <fmd_subr.h>
54 #include <fmd_dispq.h>
55 #include <fmd_module.h>
56 #include <fmd_scheme.h>
57 #include <fmd_error.h>
58 
59 #include <fmd.h>
60 
61 static char *sysev_channel;	/* event channel to which we are subscribed */
62 static char *sysev_class;	/* event class to which we are subscribed */
63 static char *sysev_device;	/* device path to use for replaying events */
64 static char *sysev_sid;		/* event channel subscriber identifier */
65 static void *sysev_evc;		/* event channel cookie from evc_bind */
66 
67 static fmd_xprt_t *sysev_xprt;
68 static fmd_hdl_t *sysev_hdl;
69 
70 static struct sysev_stats {
71 	fmd_stat_t dump_replay;
72 	fmd_stat_t dump_lost;
73 	fmd_stat_t bad_class;
74 	fmd_stat_t bad_attr;
75 	fmd_stat_t eagain;
76 } sysev_stats = {
77 	{ "dump_replay", FMD_TYPE_UINT64, "events replayed from dump device" },
78 	{ "dump_lost", FMD_TYPE_UINT64, "events lost from dump device" },
79 	{ "bad_class", FMD_TYPE_UINT64, "events dropped due to invalid class" },
80 	{ "bad_attr", FMD_TYPE_UINT64, "events dropped due to invalid nvlist" },
81 	{ "eagain", FMD_TYPE_UINT64, "events retried due to low memory" },
82 };
83 
84 /*
85  * Receive an event from the SysEvent channel and post it to our transport.
86  * Under extreme low-memory situations where we cannot event unpack the event,
87  * we can request that SysEvent redeliver the event later by returning EAGAIN.
88  * If we do this too many times, the kernel will drop the event.  Rather than
89  * keeping state per-event, we simply attempt a garbage-collect, hoping that
90  * enough free memory will be available by the time the event is redelivered.
91  */
92 static int
93 sysev_recv(sysevent_t *sep, void *arg)
94 {
95 	uint64_t seq = sysevent_get_seq(sep);
96 	fmd_xprt_t *xp = arg;
97 	nvlist_t *nvl;
98 	hrtime_t hrt;
99 
100 	if (strcmp(sysevent_get_class_name(sep), EC_FM) != 0) {
101 		fmd_hdl_error(sysev_hdl, "discarding event 0x%llx: unexpected"
102 		    " transport class %s\n", seq, sysevent_get_class_name(sep));
103 		sysev_stats.bad_class.fmds_value.ui64++;
104 		return (0);
105 	}
106 
107 	if (sysevent_get_attr_list(sep, &nvl) != 0) {
108 		if (errno == EAGAIN || errno == ENOMEM) {
109 			fmd_modhash_tryapply(fmd.d_mod_hash, fmd_module_trygc);
110 			fmd_scheme_hash_trygc(fmd.d_schemes);
111 			sysev_stats.eagain.fmds_value.ui64++;
112 			return (EAGAIN);
113 		}
114 
115 		fmd_hdl_error(sysev_hdl, "discarding event 0x%llx: missing "
116 		    "or invalid payload", seq);
117 		sysev_stats.bad_attr.fmds_value.ui64++;
118 		return (0);
119 	}
120 
121 	sysevent_get_time(sep, &hrt);
122 	fmd_xprt_post(sysev_hdl, xp, nvl, hrt);
123 	return (0);
124 }
125 
/*
 * Checksum algorithm used by the dump transport for verifying the content of
 * error reports saved on the dump device (copy of the kernel's checksum32()).
 *
 * The 32-bit sum is rotated right by one bit and then the next byte, taken as
 * an unsigned value, is added; this repeats for each of the 'length' bytes
 * starting at 'cp_arg'.  A zero-length buffer yields zero.  The buffer is
 * only read, so it is accepted through a pointer to const.
 */
static uint32_t
sysev_checksum(const void *cp_arg, size_t length)
{
	const unsigned char *cp = cp_arg;
	const unsigned char *ep = cp + length;
	uint32_t sum = 0;

	for (; cp < ep; cp++)
		sum = ((sum >> 1) | (sum << 31)) + *cp;

	return (sum);
}
141 
142 /*
143  * Replay saved events from the dump transport.  This function is installed as
144  * the timer callback and is called only once during the module's lifetime.
145  */
146 /*ARGSUSED*/
147 static void
148 sysev_replay(fmd_hdl_t *hdl, id_t id, void *arg)
149 {
150 	char *dumpdev;
151 	off64_t off, off0;
152 	int fd, err;
153 
154 	/*
155 	 * Determine the appropriate dump device to use for replaying pending
156 	 * error reports.  If the device property is NULL (default), we
157 	 * open and query /dev/dump to determine the current dump device.
158 	 */
159 	if ((dumpdev = sysev_device) == NULL) {
160 		if ((fd = open("/dev/dump", O_RDONLY)) == -1) {
161 			fmd_hdl_error(hdl, "failed to open /dev/dump "
162 			    "to locate dump device for event replay");
163 			return;
164 		}
165 
166 		dumpdev = alloca(PATH_MAX);
167 		err = ioctl(fd, DIOCGETDEV, dumpdev);
168 		(void) close(fd);
169 
170 		if (err == -1) {
171 			if (errno != ENODEV) {
172 				fmd_hdl_error(hdl, "failed to obtain "
173 				    "path to dump device for event replay");
174 			}
175 			return;
176 		}
177 	}
178 
179 	if (strcmp(dumpdev, "/dev/null") == 0)
180 		return; /* return silently and skip replay for /dev/null */
181 
182 	/*
183 	 * Open the appropriate device and then determine the offset of the
184 	 * start of the ereport dump region located at the end of the device.
185 	 */
186 	if ((fd = open64(dumpdev, O_RDWR | O_DSYNC)) == -1) {
187 		fmd_hdl_error(hdl, "failed to open dump transport %s "
188 		    "(pending events will not be replayed)", dumpdev);
189 		return;
190 	}
191 
192 	off = DUMP_OFFSET + DUMP_LOGSIZE + DUMP_ERPTSIZE;
193 	off = off0 = lseek64(fd, -off, SEEK_END) & -DUMP_OFFSET;
194 
195 	if (off == (off64_t)-1LL) {
196 		fmd_hdl_error(hdl, "failed to seek dump transport %s "
197 		    "(pending events will not be replayed)", dumpdev);
198 		(void) close(fd);
199 		return;
200 	}
201 
202 	/*
203 	 * The ereport dump region is a sequence of erpt_dump_t headers each of
204 	 * which is followed by packed nvlist data.  We iterate over them in
205 	 * order, unpacking and dispatching each one to our dispatch queue.
206 	 */
207 	for (;;) {
208 		char nvbuf[ERPT_DATA_SZ];
209 		uint32_t chksum;
210 		erpt_dump_t ed;
211 		nvlist_t *nvl;
212 
213 		fmd_timeval_t ftv, tod;
214 		hrtime_t hrt;
215 		uint64_t ena;
216 
217 		if (pread64(fd, &ed, sizeof (ed), off) != sizeof (ed)) {
218 			fmd_hdl_error(hdl, "failed to read from dump "
219 			    "transport %s (pending events lost)", dumpdev);
220 			break;
221 		}
222 
223 		if (ed.ed_magic == 0 && ed.ed_size == 0)
224 			break; /* end of list: all zero */
225 
226 		if (ed.ed_magic == 0) {
227 			off += sizeof (ed) + ed.ed_size;
228 			continue; /* continue searching */
229 		}
230 
231 		if (ed.ed_magic != ERPT_MAGIC) {
232 			/*
233 			 * Stop reading silently if the first record has the
234 			 * wrong magic number; this likely indicates that we
235 			 * rebooted from non-FMA bits or paged over the dump.
236 			 */
237 			if (off == off0)
238 				break;
239 
240 			fmd_hdl_error(hdl, "invalid dump transport "
241 			    "record at %llx (magic number %x, expected %x)\n",
242 			    (u_longlong_t)off, ed.ed_magic, ERPT_MAGIC);
243 			break;
244 		}
245 
246 		if (ed.ed_size > ERPT_DATA_SZ) {
247 			fmd_hdl_error(hdl, "invalid dump transport "
248 			    "record at %llx size (%u exceeds limit)\n",
249 			    (u_longlong_t)off, ed.ed_size);
250 			break;
251 		}
252 
253 		if (pread64(fd, nvbuf, ed.ed_size,
254 		    off + sizeof (ed)) != ed.ed_size) {
255 			fmd_hdl_error(hdl, "failed to read dump "
256 			    "transport event (offset %llx)", (u_longlong_t)off);
257 
258 			sysev_stats.dump_lost.fmds_value.ui64++;
259 			goto next;
260 		}
261 
262 		if ((chksum = sysev_checksum(nvbuf,
263 		    ed.ed_size)) != ed.ed_chksum) {
264 			fmd_hdl_error(hdl, "dump transport event at "
265 			    "offset %llx is corrupt (checksum %x != %x)\n",
266 			    (u_longlong_t)off, chksum, ed.ed_chksum);
267 
268 			sysev_stats.dump_lost.fmds_value.ui64++;
269 			goto next;
270 		}
271 
272 		if ((err = nvlist_xunpack(nvbuf,
273 		    ed.ed_size, &nvl, &fmd.d_nva)) != 0) {
274 			fmd_hdl_error(hdl, "failed to unpack dump "
275 			    "transport event at offset %llx: %s\n",
276 			    (u_longlong_t)off, fmd_strerror(err));
277 
278 			sysev_stats.dump_lost.fmds_value.ui64++;
279 			goto next;
280 		}
281 
282 		/*
283 		 * If ed_hrt_nsec is set it contains the gethrtime() value from
284 		 * when the event was originally enqueued for the transport.
285 		 * If it is zero, we use the weaker bound ed_hrt_base instead.
286 		 */
287 		if (ed.ed_hrt_nsec != 0)
288 			hrt = ed.ed_hrt_nsec;
289 		else
290 			hrt = ed.ed_hrt_base;
291 
292 		/*
293 		 * If this is an FMA protocol event of class "ereport.*" that
294 		 * contains valid ENA, we can improve the precision of 'hrt'.
295 		 */
296 		if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) == 0)
297 			hrt = fmd_time_ena2hrt(hrt, ena);
298 
299 		/*
300 		 * Now convert 'hrt' to an adjustable TOD based on the values
301 		 * in ed_tod_base which correspond to one another and are
302 		 * sampled before reboot using the old gethrtime() clock.
303 		 * fmd_event_recreate() will use this TOD value to re-assign
304 		 * the event an updated gethrtime() value based on the current
305 		 * value of the non-adjustable gethrtime() clock.  Phew.
306 		 */
307 		tod.ftv_sec = ed.ed_tod_base.sec;
308 		tod.ftv_nsec = ed.ed_tod_base.nsec;
309 		fmd_time_hrt2tod(ed.ed_hrt_base, &tod, hrt, &ftv);
310 
311 		(void) nvlist_remove_all(nvl, FMD_EVN_TOD);
312 		(void) nvlist_add_uint64_array(nvl,
313 		    FMD_EVN_TOD, (uint64_t *)&ftv, 2);
314 
315 		fmd_xprt_post(hdl, sysev_xprt, nvl, 0);
316 		sysev_stats.dump_replay.fmds_value.ui64++;
317 
318 next:
319 		/*
320 		 * Reset the magic number for the event record to zero so that
321 		 * we do not replay the same event multiple times.
322 		 */
323 		ed.ed_magic = 0;
324 
325 		if (pwrite64(fd, &ed, sizeof (ed), off) != sizeof (ed)) {
326 			fmd_hdl_error(hdl, "failed to mark dump "
327 			    "transport event (offset %llx)", (u_longlong_t)off);
328 		}
329 
330 		off += sizeof (ed) + ed.ed_size;
331 	}
332 
333 	(void) close(fd);
334 }
335 
336 static const fmd_prop_t sysev_props[] = {
337 	{ "class", FMD_TYPE_STRING, EC_ALL },		/* event class */
338 	{ "device", FMD_TYPE_STRING, NULL },		/* replay device */
339 	{ "channel", FMD_TYPE_STRING, FM_ERROR_CHAN },	/* channel name */
340 	{ "sid", FMD_TYPE_STRING, "fmd" },		/* subscriber id */
341 	{ NULL, 0, NULL }
342 };
343 
344 static const fmd_hdl_ops_t sysev_ops = {
345 	NULL,		/* fmdo_recv */
346 	sysev_replay,	/* fmdo_timeout */
347 	NULL,		/* fmdo_close */
348 	NULL,		/* fmdo_stats */
349 	NULL,		/* fmdo_gc */
350 	NULL,		/* fmdo_send */
351 };
352 
353 static const fmd_hdl_info_t sysev_info = {
354 	"SysEvent Transport Agent", "1.0", &sysev_ops, sysev_props
355 };
356 
357 /*
358  * Bind to the sysevent channel we use for listening for error events and then
359  * subscribe to appropriate events received over this channel.
360  */
361 void
362 sysev_init(fmd_hdl_t *hdl)
363 {
364 	uint_t flags;
365 
366 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &sysev_info) != 0)
367 		return; /* invalid property settings */
368 
369 	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (sysev_stats) /
370 	    sizeof (fmd_stat_t), (fmd_stat_t *)&sysev_stats);
371 
372 	sysev_channel = fmd_prop_get_string(hdl, "channel");
373 	sysev_class = fmd_prop_get_string(hdl, "class");
374 	sysev_device = fmd_prop_get_string(hdl, "device");
375 	sysev_sid = fmd_prop_get_string(hdl, "sid");
376 
377 	if (sysev_channel == NULL)
378 		fmd_hdl_abort(hdl, "channel property must be defined\n");
379 
380 	if (sysev_sid == NULL)
381 		fmd_hdl_abort(hdl, "sid property must be defined\n");
382 
383 	if ((errno = sysevent_evc_bind(sysev_channel, &sysev_evc,
384 	    EVCH_CREAT | EVCH_HOLD_PEND)) != 0) {
385 		fmd_hdl_abort(hdl, "failed to bind to event transport "
386 		    "channel %s", sysev_channel);
387 	}
388 
389 	sysev_xprt = fmd_xprt_open(hdl, FMD_XPRT_RDONLY, NULL, NULL);
390 	sysev_hdl = hdl;
391 
392 	/*
393 	 * If we're subscribing to the default channel, keep our subscription
394 	 * active even if we die unexpectedly so we continue queuing events.
395 	 * If we're not (e.g. running under fmsim), do not specify SUB_KEEP so
396 	 * that our event channel will be destroyed if we die unpleasantly.
397 	 */
398 	if (strcmp(sysev_channel, FM_ERROR_CHAN) == 0)
399 		flags = EVCH_SUB_KEEP | EVCH_SUB_DUMP;
400 	else
401 		flags = EVCH_SUB_DUMP;
402 
403 	errno = sysevent_evc_subscribe(sysev_evc,
404 	    sysev_sid, sysev_class, sysev_recv, sysev_xprt, flags);
405 
406 	if (errno != 0) {
407 		if (errno == EEXIST) {
408 			fmd_hdl_abort(hdl, "another fault management daemon is "
409 			    "active on transport channel %s\n", sysev_channel);
410 		} else {
411 			fmd_hdl_abort(hdl, "failed to subscribe to %s on "
412 			    "transport channel %s", sysev_class, sysev_channel);
413 		}
414 	}
415 
416 	/*
417 	 * Once the transport is open, install a single timer to fire at once
418 	 * in the context of the module's thread to run sysev_replay().  This
419 	 * thread will block in its first fmd_xprt_post() until fmd is ready.
420 	 */
421 	fmd_hdl_debug(hdl, "transport '%s' open\n", sysev_channel);
422 	(void) fmd_timer_install(hdl, NULL, NULL, 0);
423 }
424 
425 /*
426  * Close the channel by unsubscribing and unbinding.  We only do this when a
427  * a non-default channel has been selected.  If we're using FM_ERROR_CHAN,
428  * the system default, we do *not* want to unsubscribe because the kernel will
429  * remove the subscriber queue and any events published in our absence will
430  * therefore be lost.  This scenario may occur when, for example, fmd is sent
431  * a SIGTERM by init(1M) during reboot but an error is detected and makes it
432  * into the sysevent channel queue before init(1M) manages to call uadmin(2).
433  */
434 void
435 sysev_fini(fmd_hdl_t *hdl)
436 {
437 	if (strcmp(sysev_channel, FM_ERROR_CHAN) != 0) {
438 		sysevent_evc_unsubscribe(sysev_evc, sysev_sid);
439 		sysevent_evc_unbind(sysev_evc);
440 	}
441 
442 	if (sysev_xprt != NULL)
443 		fmd_xprt_close(hdl, sysev_xprt);
444 
445 	fmd_prop_free_string(hdl, sysev_class);
446 	fmd_prop_free_string(hdl, sysev_channel);
447 	fmd_prop_free_string(hdl, sysev_device);
448 	fmd_prop_free_string(hdl, sysev_sid);
449 }
450