1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 #include <sys/sysevent/eventdefs.h> 27 #include <sys/sysevent.h> 28 #include <sys/sysevent_impl.h> 29 #include <sys/fm/protocol.h> 30 #include <sys/sysmacros.h> 31 #include <sys/dumphdr.h> 32 #include <sys/dumpadm.h> 33 #include <sys/fm/util.h> 34 35 #include <libsysevent.h> 36 #include <libnvpair.h> 37 #include <alloca.h> 38 #include <limits.h> 39 #include <strings.h> 40 #include <unistd.h> 41 #include <fcntl.h> 42 #include <errno.h> 43 #include <zone.h> 44 45 #undef MUTEX_HELD 46 #undef RW_READ_HELD 47 #undef RW_WRITE_HELD 48 49 #include <fmd_api.h> 50 #include <fmd_log.h> 51 #include <fmd_subr.h> 52 #include <fmd_dispq.h> 53 #include <fmd_dr.h> 54 #include <fmd_module.h> 55 #include <fmd_protocol.h> 56 #include <fmd_scheme.h> 57 #include <fmd_error.h> 58 59 #include <fmd.h> 60 61 static char *sysev_channel; /* event channel to which we are subscribed */ 62 static char *sysev_class; /* event class to which we are subscribed */ 63 static char *sysev_device; /* device path to use for replaying events */ 64 static char *sysev_sid; /* event channel subscriber identifier */ 65 static void *sysev_evc; /* event channel cookie from evc_bind */ 66 67 static fmd_xprt_t *sysev_xprt; 68 static int sysev_xprt_refcnt; 69 static fmd_hdl_t *sysev_hdl; 70 71 static struct sysev_stats { 72 fmd_stat_t dump_replay; 73 fmd_stat_t dump_lost; 74 fmd_stat_t bad_class; 75 fmd_stat_t bad_attr; 76 fmd_stat_t eagain; 77 } sysev_stats = { 78 { "dump_replay", FMD_TYPE_UINT64, "events replayed from dump device" }, 79 { "dump_lost", FMD_TYPE_UINT64, "events lost from dump device" }, 80 { "bad_class", FMD_TYPE_UINT64, "events dropped due to invalid class" }, 81 { "bad_attr", FMD_TYPE_UINT64, "events dropped due to invalid nvlist" }, 82 { "eagain", FMD_TYPE_UINT64, "events retried due to low memory" }, 83 }; 84 85 static pthread_cond_t sysev_cv = PTHREAD_COND_INITIALIZER; 86 static pthread_mutex_t sysev_mutex = PTHREAD_MUTEX_INITIALIZER; 87 static int sysev_replay_wait = 1; 88 static int sysev_exiting; 89 90 static sysevent_subattr_t *subattr; 91 92 /* 93 * Entry point for legacy sysevents. This function is responsible for two 94 * things: passing off interesting events to the DR handler, and converting 95 * sysevents into resource events that modules can then subscribe to. 96 */ 97 static void 98 sysev_legacy(sysevent_t *sep) 99 { 100 const char *class = sysevent_get_class_name(sep); 101 const char *subclass = sysevent_get_subclass_name(sep); 102 char *fullclass; 103 size_t len; 104 nvlist_t *attr, *nvl; 105 hrtime_t hrt; 106 107 /* notify the DR subsystem of the event */ 108 fmd_dr_event(sep); 109 110 /* get the matching sysevent name */ 111 len = snprintf(NULL, 0, "%s%s.%s", SYSEVENT_RSRC_CLASS, 112 class, subclass); 113 fullclass = alloca(len + 1); 114 (void) snprintf(fullclass, len + 1, "%s%s.%s", 115 SYSEVENT_RSRC_CLASS, class, subclass); 116 117 /* construct the event payload */ 118 (void) nvlist_xalloc(&nvl, NV_UNIQUE_NAME, &fmd.d_nva); 119 if (sysevent_get_attr_list(sep, &attr) == 0) { 120 (void) nvlist_merge(nvl, attr, 0); 121 nvlist_free(attr); 122 } 123 124 /* 125 * Add class and version after the nvlist_merge() just in case 126 * the sysevent has an attribute called class or version. 127 */ 128 (void) nvlist_add_string(nvl, FM_CLASS, fullclass); 129 (void) nvlist_add_uint8(nvl, FM_VERSION, FM_RSRC_VERSION); 130 131 /* 132 * Dispatch the event. Because we have used sysevent_bind_xhandle 133 * the delivery thread is blessed as a proper fmd thread so 134 * we may use regular fmd api calls. 135 */ 136 sysevent_get_time(sep, &hrt); 137 fmd_xprt_post(sysev_hdl, sysev_xprt, nvl, hrt); 138 } 139 140 /* 141 * Receive an event from the SysEvent channel and post it to our transport. 142 * Under extreme low-memory situations where we cannot event unpack the event, 143 * we can request that SysEvent redeliver the event later by returning EAGAIN. 144 * If we do this too many times, the kernel will drop the event. Rather than 145 * keeping state per-event, we simply attempt a garbage-collect, hoping that 146 * enough free memory will be available by the time the event is redelivered. 147 */ 148 static int 149 sysev_recv(sysevent_t *sep, void *arg) 150 { 151 uint64_t seq = sysevent_get_seq(sep); 152 fmd_xprt_t *xp = arg; 153 nvlist_t *nvl; 154 hrtime_t hrt; 155 int rc = 0; 156 157 (void) pthread_mutex_lock(&sysev_mutex); 158 if (sysev_exiting == 1) { 159 while (sysev_xprt_refcnt > 0) 160 (void) pthread_cond_wait(&sysev_cv, &sysev_mutex); 161 (void) pthread_mutex_unlock(&sysev_mutex); 162 return (EAGAIN); 163 } 164 sysev_xprt_refcnt++; 165 while (sysev_replay_wait) 166 (void) pthread_cond_wait(&sysev_cv, &sysev_mutex); 167 (void) pthread_mutex_unlock(&sysev_mutex); 168 169 if (strcmp(sysevent_get_class_name(sep), EC_FM) != 0) { 170 fmd_hdl_error(sysev_hdl, "discarding event 0x%llx: unexpected" 171 " transport class %s\n", seq, sysevent_get_class_name(sep)); 172 sysev_stats.bad_class.fmds_value.ui64++; 173 } else if (sysevent_get_attr_list(sep, &nvl) != 0) { 174 if (errno == EAGAIN || errno == ENOMEM) { 175 fmd_modhash_tryapply(fmd.d_mod_hash, fmd_module_trygc); 176 fmd_scheme_hash_trygc(fmd.d_schemes); 177 sysev_stats.eagain.fmds_value.ui64++; 178 rc = EAGAIN; 179 } else { 180 fmd_hdl_error(sysev_hdl, "discarding event 0x%llx: " 181 "missing or invalid payload", seq); 182 sysev_stats.bad_attr.fmds_value.ui64++; 183 } 184 } else { 185 sysevent_get_time(sep, &hrt); 186 fmd_xprt_post(sysev_hdl, xp, nvl, hrt); 187 } 188 189 (void) pthread_mutex_lock(&sysev_mutex); 190 if (--sysev_xprt_refcnt == 0 && sysev_exiting == 1) 191 (void) pthread_cond_broadcast(&sysev_cv); 192 (void) pthread_mutex_unlock(&sysev_mutex); 193 194 return (rc); 195 } 196 197 /* 198 * Checksum algorithm used by the dump transport for verifying the content of 199 * error reports saved on the dump device (copy of the kernel's checksum32()). 200 */ 201 static uint32_t 202 sysev_checksum(void *cp_arg, size_t length) 203 { 204 uchar_t *cp, *ep; 205 uint32_t sum = 0; 206 207 for (cp = cp_arg, ep = cp + length; cp < ep; cp++) 208 sum = ((sum >> 1) | (sum << 31)) + *cp; 209 210 return (sum); 211 } 212 213 /* 214 * Replay saved events from the dump transport. This function is installed as 215 * the timer callback and is called only once during the module's lifetime. 216 */ 217 /*ARGSUSED*/ 218 static void 219 sysev_replay(fmd_hdl_t *hdl, id_t id, void *arg) 220 { 221 char *dumpdev; 222 off64_t off, off0; 223 int fd, err; 224 225 /* 226 * Determine the appropriate dump device to use for replaying pending 227 * error reports. If the device property is NULL (default), we 228 * open and query /dev/dump to determine the current dump device. 229 */ 230 if ((dumpdev = sysev_device) == NULL) { 231 if ((fd = open("/dev/dump", O_RDONLY)) == -1) { 232 fmd_hdl_error(hdl, "failed to open /dev/dump " 233 "to locate dump device for event replay"); 234 goto done; 235 } 236 237 dumpdev = alloca(PATH_MAX); 238 err = ioctl(fd, DIOCGETDEV, dumpdev); 239 (void) close(fd); 240 241 if (err == -1) { 242 if (errno != ENODEV) { 243 fmd_hdl_error(hdl, "failed to obtain " 244 "path to dump device for event replay"); 245 } 246 goto done; 247 } 248 } 249 250 if (strcmp(dumpdev, "/dev/null") == 0) 251 goto done; /* return silently and skip replay for /dev/null */ 252 253 /* 254 * Open the appropriate device and then determine the offset of the 255 * start of the ereport dump region located at the end of the device. 256 */ 257 if ((fd = open64(dumpdev, O_RDWR | O_DSYNC)) == -1) { 258 fmd_hdl_error(hdl, "failed to open dump transport %s " 259 "(pending events will not be replayed)", dumpdev); 260 goto done; 261 } 262 263 off = DUMP_OFFSET + DUMP_LOGSIZE + DUMP_ERPTSIZE; 264 off = off0 = lseek64(fd, -off, SEEK_END) & -DUMP_OFFSET; 265 266 if (off == (off64_t)-1LL) { 267 fmd_hdl_error(hdl, "failed to seek dump transport %s " 268 "(pending events will not be replayed)", dumpdev); 269 (void) close(fd); 270 goto done; 271 } 272 273 /* 274 * The ereport dump region is a sequence of erpt_dump_t headers each of 275 * which is followed by packed nvlist data. We iterate over them in 276 * order, unpacking and dispatching each one to our dispatch queue. 277 */ 278 for (;;) { 279 char nvbuf[ERPT_DATA_SZ]; 280 uint32_t chksum; 281 erpt_dump_t ed; 282 nvlist_t *nvl; 283 284 fmd_timeval_t ftv, tod; 285 hrtime_t hrt; 286 uint64_t ena; 287 288 if (pread64(fd, &ed, sizeof (ed), off) != sizeof (ed)) { 289 fmd_hdl_error(hdl, "failed to read from dump " 290 "transport %s (pending events lost)", dumpdev); 291 break; 292 } 293 294 if (ed.ed_magic == 0 && ed.ed_size == 0) 295 break; /* end of list: all zero */ 296 297 if (ed.ed_magic == 0) { 298 off += sizeof (ed) + ed.ed_size; 299 continue; /* continue searching */ 300 } 301 302 if (ed.ed_magic != ERPT_MAGIC) { 303 /* 304 * Stop reading silently if the first record has the 305 * wrong magic number; this likely indicates that we 306 * rebooted from non-FMA bits or paged over the dump. 307 */ 308 if (off == off0) 309 break; 310 311 fmd_hdl_error(hdl, "invalid dump transport " 312 "record at %llx (magic number %x, expected %x)\n", 313 (u_longlong_t)off, ed.ed_magic, ERPT_MAGIC); 314 break; 315 } 316 317 if (ed.ed_size > ERPT_DATA_SZ) { 318 fmd_hdl_error(hdl, "invalid dump transport " 319 "record at %llx size (%u exceeds limit)\n", 320 (u_longlong_t)off, ed.ed_size); 321 break; 322 } 323 324 if (pread64(fd, nvbuf, ed.ed_size, 325 off + sizeof (ed)) != ed.ed_size) { 326 fmd_hdl_error(hdl, "failed to read dump " 327 "transport event (offset %llx)", (u_longlong_t)off); 328 329 sysev_stats.dump_lost.fmds_value.ui64++; 330 goto next; 331 } 332 333 if ((chksum = sysev_checksum(nvbuf, 334 ed.ed_size)) != ed.ed_chksum) { 335 fmd_hdl_error(hdl, "dump transport event at " 336 "offset %llx is corrupt (checksum %x != %x)\n", 337 (u_longlong_t)off, chksum, ed.ed_chksum); 338 339 sysev_stats.dump_lost.fmds_value.ui64++; 340 goto next; 341 } 342 343 if ((err = nvlist_xunpack(nvbuf, 344 ed.ed_size, &nvl, &fmd.d_nva)) != 0) { 345 fmd_hdl_error(hdl, "failed to unpack dump " 346 "transport event at offset %llx: %s\n", 347 (u_longlong_t)off, fmd_strerror(err)); 348 349 sysev_stats.dump_lost.fmds_value.ui64++; 350 goto next; 351 } 352 353 /* 354 * If ed_hrt_nsec is set it contains the gethrtime() value from 355 * when the event was originally enqueued for the transport. 356 * If it is zero, we use the weaker bound ed_hrt_base instead. 357 */ 358 if (ed.ed_hrt_nsec != 0) 359 hrt = ed.ed_hrt_nsec; 360 else 361 hrt = ed.ed_hrt_base; 362 363 /* 364 * If this is an FMA protocol event of class "ereport.*" that 365 * contains valid ENA, we can improve the precision of 'hrt'. 366 */ 367 if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) == 0) 368 hrt = fmd_time_ena2hrt(hrt, ena); 369 370 /* 371 * Now convert 'hrt' to an adjustable TOD based on the values 372 * in ed_tod_base which correspond to one another and are 373 * sampled before reboot using the old gethrtime() clock. 374 * fmd_event_recreate() will use this TOD value to re-assign 375 * the event an updated gethrtime() value based on the current 376 * value of the non-adjustable gethrtime() clock. Phew. 377 */ 378 tod.ftv_sec = ed.ed_tod_base.sec; 379 tod.ftv_nsec = ed.ed_tod_base.nsec; 380 fmd_time_hrt2tod(ed.ed_hrt_base, &tod, hrt, &ftv); 381 382 (void) nvlist_remove_all(nvl, FMD_EVN_TOD); 383 (void) nvlist_add_uint64_array(nvl, 384 FMD_EVN_TOD, (uint64_t *)&ftv, 2); 385 386 fmd_xprt_post(hdl, sysev_xprt, nvl, 0); 387 sysev_stats.dump_replay.fmds_value.ui64++; 388 389 next: 390 /* 391 * Reset the magic number for the event record to zero so that 392 * we do not replay the same event multiple times. 393 */ 394 ed.ed_magic = 0; 395 396 if (pwrite64(fd, &ed, sizeof (ed), off) != sizeof (ed)) { 397 fmd_hdl_error(hdl, "failed to mark dump " 398 "transport event (offset %llx)", (u_longlong_t)off); 399 } 400 401 off += sizeof (ed) + ed.ed_size; 402 } 403 404 (void) close(fd); 405 done: 406 (void) pthread_mutex_lock(&sysev_mutex); 407 sysev_replay_wait = 0; 408 (void) pthread_cond_broadcast(&sysev_cv); 409 (void) pthread_mutex_unlock(&sysev_mutex); 410 } 411 412 static const fmd_prop_t sysev_props[] = { 413 { "class", FMD_TYPE_STRING, EC_ALL }, /* event class */ 414 { "device", FMD_TYPE_STRING, NULL }, /* replay device */ 415 { "channel", FMD_TYPE_STRING, FM_ERROR_CHAN }, /* channel name */ 416 { "sid", FMD_TYPE_STRING, "fmd" }, /* subscriber id */ 417 { NULL, 0, NULL } 418 }; 419 420 static const fmd_hdl_ops_t sysev_ops = { 421 NULL, /* fmdo_recv */ 422 sysev_replay, /* fmdo_timeout */ 423 NULL, /* fmdo_close */ 424 NULL, /* fmdo_stats */ 425 NULL, /* fmdo_gc */ 426 NULL, /* fmdo_send */ 427 }; 428 429 static const fmd_hdl_info_t sysev_info = { 430 "SysEvent Transport Agent", "1.0", &sysev_ops, sysev_props 431 }; 432 433 /* 434 * Bind to the sysevent channel we use for listening for error events and then 435 * subscribe to appropriate events received over this channel. Setup the 436 * legacy sysevent handler for creating sysevent resources and forwarding DR 437 * events. 438 */ 439 void 440 sysev_init(fmd_hdl_t *hdl) 441 { 442 uint_t flags; 443 const char *subclasses[] = { EC_SUB_ALL }; 444 445 /* This builtin is for the global zone only */ 446 if (getzoneid() != GLOBAL_ZONEID) 447 return; 448 449 if (fmd_hdl_register(hdl, FMD_API_VERSION, &sysev_info) != 0) 450 return; /* invalid property settings */ 451 452 (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (sysev_stats) / 453 sizeof (fmd_stat_t), (fmd_stat_t *)&sysev_stats); 454 455 sysev_channel = fmd_prop_get_string(hdl, "channel"); 456 sysev_class = fmd_prop_get_string(hdl, "class"); 457 sysev_device = fmd_prop_get_string(hdl, "device"); 458 sysev_sid = fmd_prop_get_string(hdl, "sid"); 459 460 if (sysev_channel == NULL) 461 fmd_hdl_abort(hdl, "channel property must be defined\n"); 462 463 if (sysev_sid == NULL) 464 fmd_hdl_abort(hdl, "sid property must be defined\n"); 465 466 if ((errno = sysevent_evc_bind(sysev_channel, &sysev_evc, 467 EVCH_CREAT | EVCH_HOLD_PEND)) != 0) { 468 fmd_hdl_abort(hdl, "failed to bind to event transport " 469 "channel %s", sysev_channel); 470 } 471 472 sysev_xprt = fmd_xprt_open(hdl, FMD_XPRT_RDONLY | 473 FMD_XPRT_CACHE_AS_LOCAL, NULL, NULL); 474 sysev_hdl = hdl; 475 476 /* 477 * If we're subscribing to the default channel, keep our subscription 478 * active even if we die unexpectedly so we continue queuing events. 479 * If we're not (e.g. running under fmsim), do not specify SUB_KEEP so 480 * that our event channel will be destroyed if we die unpleasantly. 481 */ 482 if (strcmp(sysev_channel, FM_ERROR_CHAN) == 0) 483 flags = EVCH_SUB_KEEP | EVCH_SUB_DUMP; 484 else 485 flags = EVCH_SUB_DUMP; 486 487 if ((subattr = sysevent_subattr_alloc()) == NULL) 488 fmd_hdl_abort(hdl, "failed to allocate subscription " 489 "attributes"); 490 491 sysevent_subattr_thrcreate(subattr, fmd_doorthr_create, NULL); 492 sysevent_subattr_thrsetup(subattr, fmd_doorthr_setup, NULL); 493 494 errno = sysevent_evc_xsubscribe(sysev_evc, 495 sysev_sid, sysev_class, sysev_recv, sysev_xprt, flags, subattr); 496 497 if (errno != 0) { 498 if (errno == EEXIST) { 499 fmd_hdl_abort(hdl, "another fault management daemon is " 500 "active on transport channel %s\n", sysev_channel); 501 } else { 502 fmd_hdl_abort(hdl, "failed to xsubscribe to %s on " 503 "transport channel %s", sysev_class, sysev_channel); 504 } 505 } 506 507 /* 508 * Once the transport is open, install a single timer to fire at once 509 * in the context of the module's thread to run sysev_replay(). This 510 * thread will block in its first fmd_xprt_post() until fmd is ready. 511 */ 512 fmd_hdl_debug(hdl, "transport '%s' open\n", sysev_channel); 513 (void) fmd_timer_install(hdl, NULL, NULL, 0); 514 515 /* 516 * Open the legacy sysevent handle and subscribe to all events. These 517 * are automatically converted to "resource.sysevent.*" events so that 518 * modules can manage these events without additional infrastructure. 519 */ 520 if (geteuid() != 0) 521 return; 522 523 if ((fmd.d_sysev_hdl = 524 sysevent_bind_xhandle(sysev_legacy, subattr)) == NULL) 525 fmd_hdl_abort(hdl, "failed to bind to legacy sysevent channel"); 526 527 if (sysevent_subscribe_event(fmd.d_sysev_hdl, EC_ALL, 528 subclasses, 1) != 0) 529 fmd_hdl_abort(hdl, "failed to subscribe to legacy sysevents"); 530 } 531 532 /* 533 * Close the channel by unsubscribing and unbinding. We only do this when a 534 * a non-default channel has been selected. If we're using FM_ERROR_CHAN, 535 * the system default, we do *not* want to unsubscribe because the kernel will 536 * remove the subscriber queue and any events published in our absence will 537 * therefore be lost. This scenario may occur when, for example, fmd is sent 538 * a SIGTERM by init(1M) during reboot but an error is detected and makes it 539 * into the sysevent channel queue before init(1M) manages to call uadmin(2). 540 */ 541 void 542 sysev_fini(fmd_hdl_t *hdl) 543 { 544 if (strcmp(sysev_channel, FM_ERROR_CHAN) != 0) { 545 (void) sysevent_evc_unsubscribe(sysev_evc, sysev_sid); 546 (void) sysevent_evc_unbind(sysev_evc); 547 } 548 549 if (fmd.d_sysev_hdl != NULL) 550 sysevent_unbind_handle(fmd.d_sysev_hdl); 551 552 if (subattr != NULL) { 553 sysevent_subattr_free(subattr); 554 subattr = NULL; 555 } 556 557 if (sysev_xprt != NULL) { 558 /* 559 * Wait callback returns before destroy the transport. 560 */ 561 (void) pthread_mutex_lock(&sysev_mutex); 562 sysev_exiting = 1; 563 while (sysev_xprt_refcnt > 0) 564 (void) pthread_cond_wait(&sysev_cv, &sysev_mutex); 565 (void) pthread_mutex_unlock(&sysev_mutex); 566 fmd_xprt_close(hdl, sysev_xprt); 567 } 568 569 fmd_prop_free_string(hdl, sysev_class); 570 fmd_prop_free_string(hdl, sysev_channel); 571 fmd_prop_free_string(hdl, sysev_device); 572 fmd_prop_free_string(hdl, sysev_sid); 573 } 574