1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 #include <sys/sysevent/eventdefs.h>
27 #include <sys/sysevent.h>
28 #include <sys/sysevent_impl.h>
29 #include <sys/fm/protocol.h>
30 #include <sys/sysmacros.h>
31 #include <sys/dumphdr.h>
32 #include <sys/dumpadm.h>
33 #include <sys/fm/util.h>
34
35 #include <libsysevent.h>
36 #include <libnvpair.h>
37 #include <alloca.h>
38 #include <limits.h>
39 #include <strings.h>
40 #include <unistd.h>
41 #include <fcntl.h>
42 #include <errno.h>
43 #include <zone.h>
44
45 #undef MUTEX_HELD
46 #undef RW_READ_HELD
47 #undef RW_WRITE_HELD
48
49 #include <fmd_api.h>
50 #include <fmd_log.h>
51 #include <fmd_subr.h>
52 #include <fmd_dispq.h>
53 #include <fmd_dr.h>
54 #include <fmd_module.h>
55 #include <fmd_protocol.h>
56 #include <fmd_scheme.h>
57 #include <fmd_error.h>
58
59 #include <fmd.h>
60
61 static char *sysev_channel; /* event channel to which we are subscribed */
62 static char *sysev_class; /* event class to which we are subscribed */
63 static char *sysev_device; /* device path to use for replaying events */
64 static char *sysev_sid; /* event channel subscriber identifier */
65 static void *sysev_evc; /* event channel cookie from evc_bind */
66
67 static fmd_xprt_t *sysev_xprt;
68 static int sysev_xprt_refcnt;
69 static fmd_hdl_t *sysev_hdl;
70
71 static struct sysev_stats {
72 fmd_stat_t dump_replay;
73 fmd_stat_t dump_lost;
74 fmd_stat_t bad_class;
75 fmd_stat_t bad_attr;
76 fmd_stat_t eagain;
77 } sysev_stats = {
78 { "dump_replay", FMD_TYPE_UINT64, "events replayed from dump device" },
79 { "dump_lost", FMD_TYPE_UINT64, "events lost from dump device" },
80 { "bad_class", FMD_TYPE_UINT64, "events dropped due to invalid class" },
81 { "bad_attr", FMD_TYPE_UINT64, "events dropped due to invalid nvlist" },
82 { "eagain", FMD_TYPE_UINT64, "events retried due to low memory" },
83 };
84
85 static pthread_cond_t sysev_cv = PTHREAD_COND_INITIALIZER;
86 static pthread_mutex_t sysev_mutex = PTHREAD_MUTEX_INITIALIZER;
87 static int sysev_replay_wait = 1;
88 static int sysev_exiting;
89
90 static sysevent_subattr_t *subattr;
91
92 /*
93 * Entry point for legacy sysevents. This function is responsible for two
94 * things: passing off interesting events to the DR handler, and converting
95 * sysevents into resource events that modules can then subscribe to.
96 */
97 static void
sysev_legacy(sysevent_t * sep)98 sysev_legacy(sysevent_t *sep)
99 {
100 const char *class = sysevent_get_class_name(sep);
101 const char *subclass = sysevent_get_subclass_name(sep);
102 char *fullclass;
103 size_t len;
104 nvlist_t *attr, *nvl;
105 hrtime_t hrt;
106
107 /* notify the DR subsystem of the event */
108 fmd_dr_event(sep);
109
110 /* get the matching sysevent name */
111 len = snprintf(NULL, 0, "%s%s.%s", SYSEVENT_RSRC_CLASS,
112 class, subclass);
113 fullclass = alloca(len + 1);
114 (void) snprintf(fullclass, len + 1, "%s%s.%s",
115 SYSEVENT_RSRC_CLASS, class, subclass);
116
117 /* construct the event payload */
118 (void) nvlist_xalloc(&nvl, NV_UNIQUE_NAME, &fmd.d_nva);
119 if (sysevent_get_attr_list(sep, &attr) == 0) {
120 (void) nvlist_merge(nvl, attr, 0);
121 nvlist_free(attr);
122 }
123
124 /*
125 * Add class and version after the nvlist_merge() just in case
126 * the sysevent has an attribute called class or version.
127 */
128 (void) nvlist_add_string(nvl, FM_CLASS, fullclass);
129 (void) nvlist_add_uint8(nvl, FM_VERSION, FM_RSRC_VERSION);
130
131 /*
132 * Dispatch the event. Because we have used sysevent_bind_xhandle
133 * the delivery thread is blessed as a proper fmd thread so
134 * we may use regular fmd api calls.
135 */
136 sysevent_get_time(sep, &hrt);
137 fmd_xprt_post(sysev_hdl, sysev_xprt, nvl, hrt);
138 }
139
140 /*
141 * Receive an event from the SysEvent channel and post it to our transport.
142 * Under extreme low-memory situations where we cannot event unpack the event,
143 * we can request that SysEvent redeliver the event later by returning EAGAIN.
144 * If we do this too many times, the kernel will drop the event. Rather than
145 * keeping state per-event, we simply attempt a garbage-collect, hoping that
146 * enough free memory will be available by the time the event is redelivered.
147 */
148 static int
sysev_recv(sysevent_t * sep,void * arg)149 sysev_recv(sysevent_t *sep, void *arg)
150 {
151 uint64_t seq = sysevent_get_seq(sep);
152 fmd_xprt_t *xp = arg;
153 nvlist_t *nvl;
154 hrtime_t hrt;
155 int rc = 0;
156
157 (void) pthread_mutex_lock(&sysev_mutex);
158 if (sysev_exiting == 1) {
159 while (sysev_xprt_refcnt > 0)
160 (void) pthread_cond_wait(&sysev_cv, &sysev_mutex);
161 (void) pthread_mutex_unlock(&sysev_mutex);
162 return (EAGAIN);
163 }
164 sysev_xprt_refcnt++;
165 while (sysev_replay_wait)
166 (void) pthread_cond_wait(&sysev_cv, &sysev_mutex);
167 (void) pthread_mutex_unlock(&sysev_mutex);
168
169 if (strcmp(sysevent_get_class_name(sep), EC_FM) != 0) {
170 fmd_hdl_error(sysev_hdl, "discarding event 0x%llx: unexpected"
171 " transport class %s\n", seq, sysevent_get_class_name(sep));
172 sysev_stats.bad_class.fmds_value.ui64++;
173 } else if (sysevent_get_attr_list(sep, &nvl) != 0) {
174 if (errno == EAGAIN || errno == ENOMEM) {
175 fmd_modhash_tryapply(fmd.d_mod_hash, fmd_module_trygc);
176 fmd_scheme_hash_trygc(fmd.d_schemes);
177 sysev_stats.eagain.fmds_value.ui64++;
178 rc = EAGAIN;
179 } else {
180 fmd_hdl_error(sysev_hdl, "discarding event 0x%llx: "
181 "missing or invalid payload", seq);
182 sysev_stats.bad_attr.fmds_value.ui64++;
183 }
184 } else {
185 sysevent_get_time(sep, &hrt);
186 fmd_xprt_post(sysev_hdl, xp, nvl, hrt);
187 }
188
189 (void) pthread_mutex_lock(&sysev_mutex);
190 if (--sysev_xprt_refcnt == 0 && sysev_exiting == 1)
191 (void) pthread_cond_broadcast(&sysev_cv);
192 (void) pthread_mutex_unlock(&sysev_mutex);
193
194 return (rc);
195 }
196
/*
 * Compute the 32-bit checksum that the dump transport stores with each saved
 * error report (a copy of the kernel's checksum32()).  For every input byte
 * the running sum is rotated right by one bit and the byte is then added.
 * A zero-length buffer yields 0.
 */
static uint32_t
sysev_checksum(void *cp_arg, size_t length)
{
	const unsigned char *bytes = cp_arg;
	uint32_t acc = 0;
	size_t i;

	for (i = 0; i < length; i++) {
		acc = (acc << 31) | (acc >> 1);	/* rotate right one bit */
		acc += bytes[i];
	}

	return (acc);
}
212
213 /*
214 * Replay saved events from the dump transport. This function is installed as
215 * the timer callback and is called only once during the module's lifetime.
216 */
217 /*ARGSUSED*/
218 static void
sysev_replay(fmd_hdl_t * hdl,id_t id,void * arg)219 sysev_replay(fmd_hdl_t *hdl, id_t id, void *arg)
220 {
221 char *dumpdev;
222 off64_t off, off0;
223 int fd, err;
224
225 /*
226 * Determine the appropriate dump device to use for replaying pending
227 * error reports. If the device property is NULL (default), we
228 * open and query /dev/dump to determine the current dump device.
229 */
230 if ((dumpdev = sysev_device) == NULL) {
231 if ((fd = open("/dev/dump", O_RDONLY)) == -1) {
232 fmd_hdl_error(hdl, "failed to open /dev/dump "
233 "to locate dump device for event replay");
234 goto done;
235 }
236
237 dumpdev = alloca(PATH_MAX);
238 err = ioctl(fd, DIOCGETDEV, dumpdev);
239 (void) close(fd);
240
241 if (err == -1) {
242 if (errno != ENODEV) {
243 fmd_hdl_error(hdl, "failed to obtain "
244 "path to dump device for event replay");
245 }
246 goto done;
247 }
248 }
249
250 if (strcmp(dumpdev, "/dev/null") == 0)
251 goto done; /* return silently and skip replay for /dev/null */
252
253 /*
254 * Open the appropriate device and then determine the offset of the
255 * start of the ereport dump region located at the end of the device.
256 */
257 if ((fd = open64(dumpdev, O_RDWR | O_DSYNC)) == -1) {
258 fmd_hdl_error(hdl, "failed to open dump transport %s "
259 "(pending events will not be replayed)", dumpdev);
260 goto done;
261 }
262
263 off = DUMP_OFFSET + DUMP_LOGSIZE + DUMP_ERPTSIZE;
264 off = off0 = lseek64(fd, -off, SEEK_END) & -DUMP_OFFSET;
265
266 if (off == (off64_t)-1LL) {
267 fmd_hdl_error(hdl, "failed to seek dump transport %s "
268 "(pending events will not be replayed)", dumpdev);
269 (void) close(fd);
270 goto done;
271 }
272
273 /*
274 * The ereport dump region is a sequence of erpt_dump_t headers each of
275 * which is followed by packed nvlist data. We iterate over them in
276 * order, unpacking and dispatching each one to our dispatch queue.
277 */
278 for (;;) {
279 char nvbuf[ERPT_DATA_SZ];
280 uint32_t chksum;
281 erpt_dump_t ed;
282 nvlist_t *nvl;
283
284 fmd_timeval_t ftv, tod;
285 hrtime_t hrt;
286 uint64_t ena;
287
288 if (pread64(fd, &ed, sizeof (ed), off) != sizeof (ed)) {
289 fmd_hdl_error(hdl, "failed to read from dump "
290 "transport %s (pending events lost)", dumpdev);
291 break;
292 }
293
294 if (ed.ed_magic == 0 && ed.ed_size == 0)
295 break; /* end of list: all zero */
296
297 if (ed.ed_magic == 0) {
298 off += sizeof (ed) + ed.ed_size;
299 continue; /* continue searching */
300 }
301
302 if (ed.ed_magic != ERPT_MAGIC) {
303 /*
304 * Stop reading silently if the first record has the
305 * wrong magic number; this likely indicates that we
306 * rebooted from non-FMA bits or paged over the dump.
307 */
308 if (off == off0)
309 break;
310
311 fmd_hdl_error(hdl, "invalid dump transport "
312 "record at %llx (magic number %x, expected %x)\n",
313 (u_longlong_t)off, ed.ed_magic, ERPT_MAGIC);
314 break;
315 }
316
317 if (ed.ed_size > ERPT_DATA_SZ) {
318 fmd_hdl_error(hdl, "invalid dump transport "
319 "record at %llx size (%u exceeds limit)\n",
320 (u_longlong_t)off, ed.ed_size);
321 break;
322 }
323
324 if (pread64(fd, nvbuf, ed.ed_size,
325 off + sizeof (ed)) != ed.ed_size) {
326 fmd_hdl_error(hdl, "failed to read dump "
327 "transport event (offset %llx)", (u_longlong_t)off);
328
329 sysev_stats.dump_lost.fmds_value.ui64++;
330 goto next;
331 }
332
333 if ((chksum = sysev_checksum(nvbuf,
334 ed.ed_size)) != ed.ed_chksum) {
335 fmd_hdl_error(hdl, "dump transport event at "
336 "offset %llx is corrupt (checksum %x != %x)\n",
337 (u_longlong_t)off, chksum, ed.ed_chksum);
338
339 sysev_stats.dump_lost.fmds_value.ui64++;
340 goto next;
341 }
342
343 if ((err = nvlist_xunpack(nvbuf,
344 ed.ed_size, &nvl, &fmd.d_nva)) != 0) {
345 fmd_hdl_error(hdl, "failed to unpack dump "
346 "transport event at offset %llx: %s\n",
347 (u_longlong_t)off, fmd_strerror(err));
348
349 sysev_stats.dump_lost.fmds_value.ui64++;
350 goto next;
351 }
352
353 /*
354 * If ed_hrt_nsec is set it contains the gethrtime() value from
355 * when the event was originally enqueued for the transport.
356 * If it is zero, we use the weaker bound ed_hrt_base instead.
357 */
358 if (ed.ed_hrt_nsec != 0)
359 hrt = ed.ed_hrt_nsec;
360 else
361 hrt = ed.ed_hrt_base;
362
363 /*
364 * If this is an FMA protocol event of class "ereport.*" that
365 * contains valid ENA, we can improve the precision of 'hrt'.
366 */
367 if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) == 0)
368 hrt = fmd_time_ena2hrt(hrt, ena);
369
370 /*
371 * Now convert 'hrt' to an adjustable TOD based on the values
372 * in ed_tod_base which correspond to one another and are
373 * sampled before reboot using the old gethrtime() clock.
374 * fmd_event_recreate() will use this TOD value to re-assign
375 * the event an updated gethrtime() value based on the current
376 * value of the non-adjustable gethrtime() clock. Phew.
377 */
378 tod.ftv_sec = ed.ed_tod_base.sec;
379 tod.ftv_nsec = ed.ed_tod_base.nsec;
380 fmd_time_hrt2tod(ed.ed_hrt_base, &tod, hrt, &ftv);
381
382 (void) nvlist_remove_all(nvl, FMD_EVN_TOD);
383 (void) nvlist_add_uint64_array(nvl,
384 FMD_EVN_TOD, (uint64_t *)&ftv, 2);
385
386 fmd_xprt_post(hdl, sysev_xprt, nvl, 0);
387 sysev_stats.dump_replay.fmds_value.ui64++;
388
389 next:
390 /*
391 * Reset the magic number for the event record to zero so that
392 * we do not replay the same event multiple times.
393 */
394 ed.ed_magic = 0;
395
396 if (pwrite64(fd, &ed, sizeof (ed), off) != sizeof (ed)) {
397 fmd_hdl_error(hdl, "failed to mark dump "
398 "transport event (offset %llx)", (u_longlong_t)off);
399 }
400
401 off += sizeof (ed) + ed.ed_size;
402 }
403
404 (void) close(fd);
405 done:
406 (void) pthread_mutex_lock(&sysev_mutex);
407 sysev_replay_wait = 0;
408 (void) pthread_cond_broadcast(&sysev_cv);
409 (void) pthread_mutex_unlock(&sysev_mutex);
410 }
411
412 static const fmd_prop_t sysev_props[] = {
413 { "class", FMD_TYPE_STRING, EC_ALL }, /* event class */
414 { "device", FMD_TYPE_STRING, NULL }, /* replay device */
415 { "channel", FMD_TYPE_STRING, FM_ERROR_CHAN }, /* channel name */
416 { "sid", FMD_TYPE_STRING, "fmd" }, /* subscriber id */
417 { NULL, 0, NULL }
418 };
419
420 static const fmd_hdl_ops_t sysev_ops = {
421 NULL, /* fmdo_recv */
422 sysev_replay, /* fmdo_timeout */
423 NULL, /* fmdo_close */
424 NULL, /* fmdo_stats */
425 NULL, /* fmdo_gc */
426 NULL, /* fmdo_send */
427 };
428
429 static const fmd_hdl_info_t sysev_info = {
430 "SysEvent Transport Agent", "1.0", &sysev_ops, sysev_props
431 };
432
433 /*
434 * Bind to the sysevent channel we use for listening for error events and then
435 * subscribe to appropriate events received over this channel. Setup the
436 * legacy sysevent handler for creating sysevent resources and forwarding DR
437 * events.
438 */
439 void
sysev_init(fmd_hdl_t * hdl)440 sysev_init(fmd_hdl_t *hdl)
441 {
442 uint_t flags;
443 const char *subclasses[] = { EC_SUB_ALL };
444
445 /* This builtin is for the global zone only */
446 if (getzoneid() != GLOBAL_ZONEID)
447 return;
448
449 if (fmd_hdl_register(hdl, FMD_API_VERSION, &sysev_info) != 0)
450 return; /* invalid property settings */
451
452 (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (sysev_stats) /
453 sizeof (fmd_stat_t), (fmd_stat_t *)&sysev_stats);
454
455 sysev_channel = fmd_prop_get_string(hdl, "channel");
456 sysev_class = fmd_prop_get_string(hdl, "class");
457 sysev_device = fmd_prop_get_string(hdl, "device");
458 sysev_sid = fmd_prop_get_string(hdl, "sid");
459
460 if (sysev_channel == NULL)
461 fmd_hdl_abort(hdl, "channel property must be defined\n");
462
463 if (sysev_sid == NULL)
464 fmd_hdl_abort(hdl, "sid property must be defined\n");
465
466 if ((errno = sysevent_evc_bind(sysev_channel, &sysev_evc,
467 EVCH_CREAT | EVCH_HOLD_PEND)) != 0) {
468 fmd_hdl_abort(hdl, "failed to bind to event transport "
469 "channel %s", sysev_channel);
470 }
471
472 sysev_xprt = fmd_xprt_open(hdl, FMD_XPRT_RDONLY |
473 FMD_XPRT_CACHE_AS_LOCAL, NULL, NULL);
474 sysev_hdl = hdl;
475
476 /*
477 * If we're subscribing to the default channel, keep our subscription
478 * active even if we die unexpectedly so we continue queuing events.
479 * If we're not (e.g. running under fmsim), do not specify SUB_KEEP so
480 * that our event channel will be destroyed if we die unpleasantly.
481 */
482 if (strcmp(sysev_channel, FM_ERROR_CHAN) == 0)
483 flags = EVCH_SUB_KEEP | EVCH_SUB_DUMP;
484 else
485 flags = EVCH_SUB_DUMP;
486
487 if ((subattr = sysevent_subattr_alloc()) == NULL)
488 fmd_hdl_abort(hdl, "failed to allocate subscription "
489 "attributes");
490
491 sysevent_subattr_thrcreate(subattr, fmd_doorthr_create, NULL);
492 sysevent_subattr_thrsetup(subattr, fmd_doorthr_setup, NULL);
493
494 errno = sysevent_evc_xsubscribe(sysev_evc,
495 sysev_sid, sysev_class, sysev_recv, sysev_xprt, flags, subattr);
496
497 if (errno != 0) {
498 if (errno == EEXIST) {
499 fmd_hdl_abort(hdl, "another fault management daemon is "
500 "active on transport channel %s\n", sysev_channel);
501 } else {
502 fmd_hdl_abort(hdl, "failed to xsubscribe to %s on "
503 "transport channel %s", sysev_class, sysev_channel);
504 }
505 }
506
507 /*
508 * Once the transport is open, install a single timer to fire at once
509 * in the context of the module's thread to run sysev_replay(). This
510 * thread will block in its first fmd_xprt_post() until fmd is ready.
511 */
512 fmd_hdl_debug(hdl, "transport '%s' open\n", sysev_channel);
513 (void) fmd_timer_install(hdl, NULL, NULL, 0);
514
515 /*
516 * Open the legacy sysevent handle and subscribe to all events. These
517 * are automatically converted to "resource.sysevent.*" events so that
518 * modules can manage these events without additional infrastructure.
519 */
520 if (geteuid() != 0)
521 return;
522
523 if ((fmd.d_sysev_hdl =
524 sysevent_bind_xhandle(sysev_legacy, subattr)) == NULL)
525 fmd_hdl_abort(hdl, "failed to bind to legacy sysevent channel");
526
527 if (sysevent_subscribe_event(fmd.d_sysev_hdl, EC_ALL,
528 subclasses, 1) != 0)
529 fmd_hdl_abort(hdl, "failed to subscribe to legacy sysevents");
530 }
531
532 /*
533 * Close the channel by unsubscribing and unbinding. We only do this when a
534 * a non-default channel has been selected. If we're using FM_ERROR_CHAN,
535 * the system default, we do *not* want to unsubscribe because the kernel will
536 * remove the subscriber queue and any events published in our absence will
537 * therefore be lost. This scenario may occur when, for example, fmd is sent
538 * a SIGTERM by init(1M) during reboot but an error is detected and makes it
539 * into the sysevent channel queue before init(1M) manages to call uadmin(2).
540 */
541 void
sysev_fini(fmd_hdl_t * hdl)542 sysev_fini(fmd_hdl_t *hdl)
543 {
544 if (strcmp(sysev_channel, FM_ERROR_CHAN) != 0) {
545 (void) sysevent_evc_unsubscribe(sysev_evc, sysev_sid);
546 (void) sysevent_evc_unbind(sysev_evc);
547 }
548
549 if (fmd.d_sysev_hdl != NULL)
550 sysevent_unbind_handle(fmd.d_sysev_hdl);
551
552 if (subattr != NULL) {
553 sysevent_subattr_free(subattr);
554 subattr = NULL;
555 }
556
557 if (sysev_xprt != NULL) {
558 /*
559 * Wait callback returns before destroy the transport.
560 */
561 (void) pthread_mutex_lock(&sysev_mutex);
562 sysev_exiting = 1;
563 while (sysev_xprt_refcnt > 0)
564 (void) pthread_cond_wait(&sysev_cv, &sysev_mutex);
565 (void) pthread_mutex_unlock(&sysev_mutex);
566 fmd_xprt_close(hdl, sysev_xprt);
567 }
568
569 fmd_prop_free_string(hdl, sysev_class);
570 fmd_prop_free_string(hdl, sysev_channel);
571 fmd_prop_free_string(hdl, sysev_device);
572 fmd_prop_free_string(hdl, sysev_sid);
573 }
574