1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/sysevent/eventdefs.h> 30 #include <sys/sysevent.h> 31 #include <sys/sysevent_impl.h> 32 #include <sys/fm/protocol.h> 33 #include <sys/sysmacros.h> 34 #include <sys/dumphdr.h> 35 #include <sys/dumpadm.h> 36 #include <sys/fm/util.h> 37 38 #include <libsysevent.h> 39 #include <libnvpair.h> 40 #include <alloca.h> 41 #include <limits.h> 42 #include <strings.h> 43 #include <unistd.h> 44 #include <fcntl.h> 45 #include <errno.h> 46 47 #undef MUTEX_HELD 48 #undef RW_READ_HELD 49 #undef RW_WRITE_HELD 50 51 #include <fmd_api.h> 52 #include <fmd_log.h> 53 #include <fmd_subr.h> 54 #include <fmd_dispq.h> 55 #include <fmd_module.h> 56 #include <fmd_scheme.h> 57 #include <fmd_error.h> 58 59 #include <fmd.h> 60 61 static char *sysev_channel; /* event channel to which we are subscribed */ 62 static char *sysev_class; /* event class to which we are subscribed */ 63 static char *sysev_device; /* device path to use for replaying events */ 64 static char *sysev_sid; /* event channel subscriber identifier */ 65 static void *sysev_evc; /* event channel cookie from evc_bind */ 66 67 static fmd_xprt_t *sysev_xprt; 68 static fmd_hdl_t *sysev_hdl; 69 70 static struct sysev_stats { 71 fmd_stat_t dump_replay; 72 fmd_stat_t dump_lost; 73 fmd_stat_t bad_class; 74 fmd_stat_t bad_attr; 75 fmd_stat_t eagain; 76 } sysev_stats = { 77 { "dump_replay", FMD_TYPE_UINT64, "events replayed from dump device" }, 78 { "dump_lost", FMD_TYPE_UINT64, "events lost from dump device" }, 79 { "bad_class", FMD_TYPE_UINT64, "events dropped due to invalid class" }, 80 { "bad_attr", FMD_TYPE_UINT64, "events dropped due to invalid nvlist" }, 81 { "eagain", FMD_TYPE_UINT64, "events retried due to low memory" }, 82 }; 83 84 static pthread_cond_t sysev_replay_cv = PTHREAD_COND_INITIALIZER; 85 static pthread_mutex_t sysev_replay_mutex = PTHREAD_MUTEX_INITIALIZER; 86 static int sysev_replay_wait = 1; 87 88 /* 89 * Receive an event from the SysEvent channel and post it to our transport. 90 * Under extreme low-memory situations where we cannot event unpack the event, 91 * we can request that SysEvent redeliver the event later by returning EAGAIN. 92 * If we do this too many times, the kernel will drop the event. Rather than 93 * keeping state per-event, we simply attempt a garbage-collect, hoping that 94 * enough free memory will be available by the time the event is redelivered. 95 */ 96 static int 97 sysev_recv(sysevent_t *sep, void *arg) 98 { 99 uint64_t seq = sysevent_get_seq(sep); 100 fmd_xprt_t *xp = arg; 101 nvlist_t *nvl; 102 hrtime_t hrt; 103 104 (void) pthread_mutex_lock(&sysev_replay_mutex); 105 while (sysev_replay_wait) 106 (void) pthread_cond_wait(&sysev_replay_cv, &sysev_replay_mutex); 107 (void) pthread_mutex_unlock(&sysev_replay_mutex); 108 109 if (strcmp(sysevent_get_class_name(sep), EC_FM) != 0) { 110 fmd_hdl_error(sysev_hdl, "discarding event 0x%llx: unexpected" 111 " transport class %s\n", seq, sysevent_get_class_name(sep)); 112 sysev_stats.bad_class.fmds_value.ui64++; 113 return (0); 114 } 115 116 if (sysevent_get_attr_list(sep, &nvl) != 0) { 117 if (errno == EAGAIN || errno == ENOMEM) { 118 fmd_modhash_tryapply(fmd.d_mod_hash, fmd_module_trygc); 119 fmd_scheme_hash_trygc(fmd.d_schemes); 120 sysev_stats.eagain.fmds_value.ui64++; 121 return (EAGAIN); 122 } 123 124 fmd_hdl_error(sysev_hdl, "discarding event 0x%llx: missing " 125 "or invalid payload", seq); 126 sysev_stats.bad_attr.fmds_value.ui64++; 127 return (0); 128 } 129 130 sysevent_get_time(sep, &hrt); 131 fmd_xprt_post(sysev_hdl, xp, nvl, hrt); 132 return (0); 133 } 134 135 /* 136 * Checksum algorithm used by the dump transport for verifying the content of 137 * error reports saved on the dump device (copy of the kernel's checksum32()). 138 */ 139 static uint32_t 140 sysev_checksum(void *cp_arg, size_t length) 141 { 142 uchar_t *cp, *ep; 143 uint32_t sum = 0; 144 145 for (cp = cp_arg, ep = cp + length; cp < ep; cp++) 146 sum = ((sum >> 1) | (sum << 31)) + *cp; 147 148 return (sum); 149 } 150 151 /* 152 * Replay saved events from the dump transport. This function is installed as 153 * the timer callback and is called only once during the module's lifetime. 154 */ 155 /*ARGSUSED*/ 156 static void 157 sysev_replay(fmd_hdl_t *hdl, id_t id, void *arg) 158 { 159 char *dumpdev; 160 off64_t off, off0; 161 int fd, err; 162 163 /* 164 * Determine the appropriate dump device to use for replaying pending 165 * error reports. If the device property is NULL (default), we 166 * open and query /dev/dump to determine the current dump device. 167 */ 168 if ((dumpdev = sysev_device) == NULL) { 169 if ((fd = open("/dev/dump", O_RDONLY)) == -1) { 170 fmd_hdl_error(hdl, "failed to open /dev/dump " 171 "to locate dump device for event replay"); 172 goto done; 173 } 174 175 dumpdev = alloca(PATH_MAX); 176 err = ioctl(fd, DIOCGETDEV, dumpdev); 177 (void) close(fd); 178 179 if (err == -1) { 180 if (errno != ENODEV) { 181 fmd_hdl_error(hdl, "failed to obtain " 182 "path to dump device for event replay"); 183 } 184 goto done; 185 } 186 } 187 188 if (strcmp(dumpdev, "/dev/null") == 0) 189 goto done; /* return silently and skip replay for /dev/null */ 190 191 /* 192 * Open the appropriate device and then determine the offset of the 193 * start of the ereport dump region located at the end of the device. 194 */ 195 if ((fd = open64(dumpdev, O_RDWR | O_DSYNC)) == -1) { 196 fmd_hdl_error(hdl, "failed to open dump transport %s " 197 "(pending events will not be replayed)", dumpdev); 198 goto done; 199 } 200 201 off = DUMP_OFFSET + DUMP_LOGSIZE + DUMP_ERPTSIZE; 202 off = off0 = lseek64(fd, -off, SEEK_END) & -DUMP_OFFSET; 203 204 if (off == (off64_t)-1LL) { 205 fmd_hdl_error(hdl, "failed to seek dump transport %s " 206 "(pending events will not be replayed)", dumpdev); 207 (void) close(fd); 208 goto done; 209 } 210 211 /* 212 * The ereport dump region is a sequence of erpt_dump_t headers each of 213 * which is followed by packed nvlist data. We iterate over them in 214 * order, unpacking and dispatching each one to our dispatch queue. 215 */ 216 for (;;) { 217 char nvbuf[ERPT_DATA_SZ]; 218 uint32_t chksum; 219 erpt_dump_t ed; 220 nvlist_t *nvl; 221 222 fmd_timeval_t ftv, tod; 223 hrtime_t hrt; 224 uint64_t ena; 225 226 if (pread64(fd, &ed, sizeof (ed), off) != sizeof (ed)) { 227 fmd_hdl_error(hdl, "failed to read from dump " 228 "transport %s (pending events lost)", dumpdev); 229 break; 230 } 231 232 if (ed.ed_magic == 0 && ed.ed_size == 0) 233 break; /* end of list: all zero */ 234 235 if (ed.ed_magic == 0) { 236 off += sizeof (ed) + ed.ed_size; 237 continue; /* continue searching */ 238 } 239 240 if (ed.ed_magic != ERPT_MAGIC) { 241 /* 242 * Stop reading silently if the first record has the 243 * wrong magic number; this likely indicates that we 244 * rebooted from non-FMA bits or paged over the dump. 245 */ 246 if (off == off0) 247 break; 248 249 fmd_hdl_error(hdl, "invalid dump transport " 250 "record at %llx (magic number %x, expected %x)\n", 251 (u_longlong_t)off, ed.ed_magic, ERPT_MAGIC); 252 break; 253 } 254 255 if (ed.ed_size > ERPT_DATA_SZ) { 256 fmd_hdl_error(hdl, "invalid dump transport " 257 "record at %llx size (%u exceeds limit)\n", 258 (u_longlong_t)off, ed.ed_size); 259 break; 260 } 261 262 if (pread64(fd, nvbuf, ed.ed_size, 263 off + sizeof (ed)) != ed.ed_size) { 264 fmd_hdl_error(hdl, "failed to read dump " 265 "transport event (offset %llx)", (u_longlong_t)off); 266 267 sysev_stats.dump_lost.fmds_value.ui64++; 268 goto next; 269 } 270 271 if ((chksum = sysev_checksum(nvbuf, 272 ed.ed_size)) != ed.ed_chksum) { 273 fmd_hdl_error(hdl, "dump transport event at " 274 "offset %llx is corrupt (checksum %x != %x)\n", 275 (u_longlong_t)off, chksum, ed.ed_chksum); 276 277 sysev_stats.dump_lost.fmds_value.ui64++; 278 goto next; 279 } 280 281 if ((err = nvlist_xunpack(nvbuf, 282 ed.ed_size, &nvl, &fmd.d_nva)) != 0) { 283 fmd_hdl_error(hdl, "failed to unpack dump " 284 "transport event at offset %llx: %s\n", 285 (u_longlong_t)off, fmd_strerror(err)); 286 287 sysev_stats.dump_lost.fmds_value.ui64++; 288 goto next; 289 } 290 291 /* 292 * If ed_hrt_nsec is set it contains the gethrtime() value from 293 * when the event was originally enqueued for the transport. 294 * If it is zero, we use the weaker bound ed_hrt_base instead. 295 */ 296 if (ed.ed_hrt_nsec != 0) 297 hrt = ed.ed_hrt_nsec; 298 else 299 hrt = ed.ed_hrt_base; 300 301 /* 302 * If this is an FMA protocol event of class "ereport.*" that 303 * contains valid ENA, we can improve the precision of 'hrt'. 304 */ 305 if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) == 0) 306 hrt = fmd_time_ena2hrt(hrt, ena); 307 308 /* 309 * Now convert 'hrt' to an adjustable TOD based on the values 310 * in ed_tod_base which correspond to one another and are 311 * sampled before reboot using the old gethrtime() clock. 312 * fmd_event_recreate() will use this TOD value to re-assign 313 * the event an updated gethrtime() value based on the current 314 * value of the non-adjustable gethrtime() clock. Phew. 315 */ 316 tod.ftv_sec = ed.ed_tod_base.sec; 317 tod.ftv_nsec = ed.ed_tod_base.nsec; 318 fmd_time_hrt2tod(ed.ed_hrt_base, &tod, hrt, &ftv); 319 320 (void) nvlist_remove_all(nvl, FMD_EVN_TOD); 321 (void) nvlist_add_uint64_array(nvl, 322 FMD_EVN_TOD, (uint64_t *)&ftv, 2); 323 324 fmd_xprt_post(hdl, sysev_xprt, nvl, 0); 325 sysev_stats.dump_replay.fmds_value.ui64++; 326 327 next: 328 /* 329 * Reset the magic number for the event record to zero so that 330 * we do not replay the same event multiple times. 331 */ 332 ed.ed_magic = 0; 333 334 if (pwrite64(fd, &ed, sizeof (ed), off) != sizeof (ed)) { 335 fmd_hdl_error(hdl, "failed to mark dump " 336 "transport event (offset %llx)", (u_longlong_t)off); 337 } 338 339 off += sizeof (ed) + ed.ed_size; 340 } 341 342 (void) close(fd); 343 done: 344 (void) pthread_mutex_lock(&sysev_replay_mutex); 345 sysev_replay_wait = 0; 346 (void) pthread_cond_broadcast(&sysev_replay_cv); 347 (void) pthread_mutex_unlock(&sysev_replay_mutex); 348 } 349 350 static const fmd_prop_t sysev_props[] = { 351 { "class", FMD_TYPE_STRING, EC_ALL }, /* event class */ 352 { "device", FMD_TYPE_STRING, NULL }, /* replay device */ 353 { "channel", FMD_TYPE_STRING, FM_ERROR_CHAN }, /* channel name */ 354 { "sid", FMD_TYPE_STRING, "fmd" }, /* subscriber id */ 355 { NULL, 0, NULL } 356 }; 357 358 static const fmd_hdl_ops_t sysev_ops = { 359 NULL, /* fmdo_recv */ 360 sysev_replay, /* fmdo_timeout */ 361 NULL, /* fmdo_close */ 362 NULL, /* fmdo_stats */ 363 NULL, /* fmdo_gc */ 364 NULL, /* fmdo_send */ 365 }; 366 367 static const fmd_hdl_info_t sysev_info = { 368 "SysEvent Transport Agent", "1.0", &sysev_ops, sysev_props 369 }; 370 371 /* 372 * Bind to the sysevent channel we use for listening for error events and then 373 * subscribe to appropriate events received over this channel. 374 */ 375 void 376 sysev_init(fmd_hdl_t *hdl) 377 { 378 uint_t flags; 379 380 if (fmd_hdl_register(hdl, FMD_API_VERSION, &sysev_info) != 0) 381 return; /* invalid property settings */ 382 383 (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (sysev_stats) / 384 sizeof (fmd_stat_t), (fmd_stat_t *)&sysev_stats); 385 386 sysev_channel = fmd_prop_get_string(hdl, "channel"); 387 sysev_class = fmd_prop_get_string(hdl, "class"); 388 sysev_device = fmd_prop_get_string(hdl, "device"); 389 sysev_sid = fmd_prop_get_string(hdl, "sid"); 390 391 if (sysev_channel == NULL) 392 fmd_hdl_abort(hdl, "channel property must be defined\n"); 393 394 if (sysev_sid == NULL) 395 fmd_hdl_abort(hdl, "sid property must be defined\n"); 396 397 if ((errno = sysevent_evc_bind(sysev_channel, &sysev_evc, 398 EVCH_CREAT | EVCH_HOLD_PEND)) != 0) { 399 fmd_hdl_abort(hdl, "failed to bind to event transport " 400 "channel %s", sysev_channel); 401 } 402 403 sysev_xprt = fmd_xprt_open(hdl, FMD_XPRT_RDONLY, NULL, NULL); 404 sysev_hdl = hdl; 405 406 /* 407 * If we're subscribing to the default channel, keep our subscription 408 * active even if we die unexpectedly so we continue queuing events. 409 * If we're not (e.g. running under fmsim), do not specify SUB_KEEP so 410 * that our event channel will be destroyed if we die unpleasantly. 411 */ 412 if (strcmp(sysev_channel, FM_ERROR_CHAN) == 0) 413 flags = EVCH_SUB_KEEP | EVCH_SUB_DUMP; 414 else 415 flags = EVCH_SUB_DUMP; 416 417 errno = sysevent_evc_subscribe(sysev_evc, 418 sysev_sid, sysev_class, sysev_recv, sysev_xprt, flags); 419 420 if (errno != 0) { 421 if (errno == EEXIST) { 422 fmd_hdl_abort(hdl, "another fault management daemon is " 423 "active on transport channel %s\n", sysev_channel); 424 } else { 425 fmd_hdl_abort(hdl, "failed to subscribe to %s on " 426 "transport channel %s", sysev_class, sysev_channel); 427 } 428 } 429 430 /* 431 * Once the transport is open, install a single timer to fire at once 432 * in the context of the module's thread to run sysev_replay(). This 433 * thread will block in its first fmd_xprt_post() until fmd is ready. 434 */ 435 fmd_hdl_debug(hdl, "transport '%s' open\n", sysev_channel); 436 (void) fmd_timer_install(hdl, NULL, NULL, 0); 437 } 438 439 /* 440 * Close the channel by unsubscribing and unbinding. We only do this when a 441 * a non-default channel has been selected. If we're using FM_ERROR_CHAN, 442 * the system default, we do *not* want to unsubscribe because the kernel will 443 * remove the subscriber queue and any events published in our absence will 444 * therefore be lost. This scenario may occur when, for example, fmd is sent 445 * a SIGTERM by init(1M) during reboot but an error is detected and makes it 446 * into the sysevent channel queue before init(1M) manages to call uadmin(2). 447 */ 448 void 449 sysev_fini(fmd_hdl_t *hdl) 450 { 451 if (strcmp(sysev_channel, FM_ERROR_CHAN) != 0) { 452 sysevent_evc_unsubscribe(sysev_evc, sysev_sid); 453 sysevent_evc_unbind(sysev_evc); 454 } 455 456 if (sysev_xprt != NULL) 457 fmd_xprt_close(hdl, sysev_xprt); 458 459 fmd_prop_free_string(hdl, sysev_class); 460 fmd_prop_free_string(hdl, sysev_channel); 461 fmd_prop_free_string(hdl, sysev_device); 462 fmd_prop_free_string(hdl, sysev_sid); 463 } 464