1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * Fault Management Architecture (FMA) Resource and Protocol Support 27 * 28 * The routines contained herein provide services to support kernel subsystems 29 * in publishing fault management telemetry (see PSARC 2002/412 and 2003/089). 30 * 31 * Name-Value Pair Lists 32 * 33 * The embodiment of an FMA protocol element (event, fmri or authority) is a 34 * name-value pair list (nvlist_t). FMA-specific nvlist constructor and 35 * destructor functions, fm_nvlist_create() and fm_nvlist_destroy(), are used 36 * to create an nvpair list using custom allocators. Callers may choose to 37 * allocate either from the kernel memory allocator, or from a preallocated 38 * buffer, useful in constrained contexts like high-level interrupt routines. 39 * 40 * Protocol Event and FMRI Construction 41 * 42 * Convenience routines are provided to construct nvlist events according to 43 * the FMA Event Protocol and Naming Schema specification for ereports and 44 * FMRIs for the dev, cpu, hc, mem, legacy hc and de schemes. 45 * 46 * ENA Manipulation 47 * 48 * Routines to generate ENA formats 0, 1 and 2 are available as well as 49 * routines to increment formats 1 and 2. Individual fields within the 50 * ENA are extractable via fm_ena_time_get(), fm_ena_id_get(), 51 * fm_ena_format_get() and fm_ena_gen_get(). 52 */ 53 54 #include <sys/types.h> 55 #include <sys/time.h> 56 #include <sys/list.h> 57 #include <sys/nvpair.h> 58 #include <sys/cmn_err.h> 59 #include <sys/sysmacros.h> 60 #include <sys/sunddi.h> 61 #include <sys/systeminfo.h> 62 #include <sys/fm/util.h> 63 #include <sys/fm/protocol.h> 64 #include <sys/kstat.h> 65 #include <sys/zfs_context.h> 66 #ifdef _KERNEL 67 #include <sys/atomic.h> 68 #include <sys/condvar.h> 69 #include <sys/zfs_ioctl.h> 70 71 int zfs_zevent_len_max = 512; 72 73 static int zevent_len_cur = 0; 74 static int zevent_waiters = 0; 75 static int zevent_flags = 0; 76 77 /* Num events rate limited since the last time zfs_zevent_next() was called */ 78 static uint64_t ratelimit_dropped = 0; 79 80 /* 81 * The EID (Event IDentifier) is used to uniquely tag a zevent when it is 82 * posted. The posted EIDs are monotonically increasing but not persistent. 83 * They will be reset to the initial value (1) each time the kernel module is 84 * loaded. 85 */ 86 static uint64_t zevent_eid = 0; 87 88 static kmutex_t zevent_lock; 89 static list_t zevent_list; 90 static kcondvar_t zevent_cv; 91 #endif /* _KERNEL */ 92 93 94 /* 95 * Common fault management kstats to record event generation failures 96 */ 97 98 struct erpt_kstat { 99 kstat_named_t erpt_dropped; /* num erpts dropped on post */ 100 kstat_named_t erpt_set_failed; /* num erpt set failures */ 101 kstat_named_t fmri_set_failed; /* num fmri set failures */ 102 kstat_named_t payload_set_failed; /* num payload set failures */ 103 kstat_named_t erpt_duplicates; /* num duplicate erpts */ 104 }; 105 106 static struct erpt_kstat erpt_kstat_data = { 107 { "erpt-dropped", KSTAT_DATA_UINT64 }, 108 { "erpt-set-failed", KSTAT_DATA_UINT64 }, 109 { "fmri-set-failed", KSTAT_DATA_UINT64 }, 110 { "payload-set-failed", KSTAT_DATA_UINT64 }, 111 { "erpt-duplicates", KSTAT_DATA_UINT64 } 112 }; 113 114 kstat_t *fm_ksp; 115 116 #ifdef _KERNEL 117 118 static zevent_t * 119 zfs_zevent_alloc(void) 120 { 121 zevent_t *ev; 122 123 ev = kmem_zalloc(sizeof (zevent_t), KM_SLEEP); 124 125 list_create(&ev->ev_ze_list, sizeof (zfs_zevent_t), 126 offsetof(zfs_zevent_t, ze_node)); 127 list_link_init(&ev->ev_node); 128 129 return (ev); 130 } 131 132 static void 133 zfs_zevent_free(zevent_t *ev) 134 { 135 /* Run provided cleanup callback */ 136 ev->ev_cb(ev->ev_nvl, ev->ev_detector); 137 138 list_destroy(&ev->ev_ze_list); 139 kmem_free(ev, sizeof (zevent_t)); 140 } 141 142 static void 143 zfs_zevent_drain(zevent_t *ev) 144 { 145 zfs_zevent_t *ze; 146 147 ASSERT(MUTEX_HELD(&zevent_lock)); 148 list_remove(&zevent_list, ev); 149 150 /* Remove references to this event in all private file data */ 151 while ((ze = list_head(&ev->ev_ze_list)) != NULL) { 152 list_remove(&ev->ev_ze_list, ze); 153 ze->ze_zevent = NULL; 154 ze->ze_dropped++; 155 } 156 157 zfs_zevent_free(ev); 158 } 159 160 void 161 zfs_zevent_drain_all(int *count) 162 { 163 zevent_t *ev; 164 165 mutex_enter(&zevent_lock); 166 while ((ev = list_head(&zevent_list)) != NULL) 167 zfs_zevent_drain(ev); 168 169 *count = zevent_len_cur; 170 zevent_len_cur = 0; 171 mutex_exit(&zevent_lock); 172 } 173 174 /* 175 * New zevents are inserted at the head. If the maximum queue 176 * length is exceeded a zevent will be drained from the tail. 177 * As part of this any user space processes which currently have 178 * a reference to this zevent_t in their private data will have 179 * this reference set to NULL. 180 */ 181 static void 182 zfs_zevent_insert(zevent_t *ev) 183 { 184 ASSERT(MUTEX_HELD(&zevent_lock)); 185 list_insert_head(&zevent_list, ev); 186 187 if (zevent_len_cur >= zfs_zevent_len_max) 188 zfs_zevent_drain(list_tail(&zevent_list)); 189 else 190 zevent_len_cur++; 191 } 192 193 /* 194 * Post a zevent. The cb will be called when nvl and detector are no longer 195 * needed, i.e.: 196 * - An error happened and a zevent can't be posted. In this case, cb is called 197 * before zfs_zevent_post() returns. 198 * - The event is being drained and freed. 199 */ 200 int 201 zfs_zevent_post(nvlist_t *nvl, nvlist_t *detector, zevent_cb_t *cb) 202 { 203 inode_timespec_t tv; 204 int64_t tv_array[2]; 205 uint64_t eid; 206 size_t nvl_size = 0; 207 zevent_t *ev; 208 int error; 209 210 ASSERT(cb != NULL); 211 212 gethrestime(&tv); 213 tv_array[0] = tv.tv_sec; 214 tv_array[1] = tv.tv_nsec; 215 216 error = nvlist_add_int64_array(nvl, FM_EREPORT_TIME, tv_array, 2); 217 if (error) { 218 atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); 219 goto out; 220 } 221 222 eid = atomic_inc_64_nv(&zevent_eid); 223 error = nvlist_add_uint64(nvl, FM_EREPORT_EID, eid); 224 if (error) { 225 atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); 226 goto out; 227 } 228 229 error = nvlist_size(nvl, &nvl_size, NV_ENCODE_NATIVE); 230 if (error) { 231 atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64); 232 goto out; 233 } 234 235 if (nvl_size > ERPT_DATA_SZ || nvl_size == 0) { 236 atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64); 237 error = EOVERFLOW; 238 goto out; 239 } 240 241 ev = zfs_zevent_alloc(); 242 if (ev == NULL) { 243 atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64); 244 error = ENOMEM; 245 goto out; 246 } 247 248 ev->ev_nvl = nvl; 249 ev->ev_detector = detector; 250 ev->ev_cb = cb; 251 ev->ev_eid = eid; 252 253 mutex_enter(&zevent_lock); 254 zfs_zevent_insert(ev); 255 cv_broadcast(&zevent_cv); 256 mutex_exit(&zevent_lock); 257 258 out: 259 if (error) 260 cb(nvl, detector); 261 262 return (error); 263 } 264 265 void 266 zfs_zevent_track_duplicate(void) 267 { 268 atomic_inc_64(&erpt_kstat_data.erpt_duplicates.value.ui64); 269 } 270 271 static int 272 zfs_zevent_minor_to_state(minor_t minor, zfs_zevent_t **ze) 273 { 274 *ze = zfsdev_get_state(minor, ZST_ZEVENT); 275 if (*ze == NULL) 276 return (SET_ERROR(EBADF)); 277 278 return (0); 279 } 280 281 zfs_file_t * 282 zfs_zevent_fd_hold(int fd, minor_t *minorp, zfs_zevent_t **ze) 283 { 284 zfs_file_t *fp = zfs_file_get(fd); 285 if (fp == NULL) 286 return (NULL); 287 288 int error = zfsdev_getminor(fp, minorp); 289 if (error == 0) 290 error = zfs_zevent_minor_to_state(*minorp, ze); 291 292 if (error) { 293 zfs_zevent_fd_rele(fp); 294 fp = NULL; 295 } 296 297 return (fp); 298 } 299 300 void 301 zfs_zevent_fd_rele(zfs_file_t *fp) 302 { 303 zfs_file_put(fp); 304 } 305 306 /* 307 * Get the next zevent in the stream and place a copy in 'event'. This 308 * may fail with ENOMEM if the encoded nvlist size exceeds the passed 309 * 'event_size'. In this case the stream pointer is not advanced and 310 * and 'event_size' is set to the minimum required buffer size. 311 */ 312 int 313 zfs_zevent_next(zfs_zevent_t *ze, nvlist_t **event, uint64_t *event_size, 314 uint64_t *dropped) 315 { 316 zevent_t *ev; 317 size_t size; 318 int error = 0; 319 320 mutex_enter(&zevent_lock); 321 if (ze->ze_zevent == NULL) { 322 /* New stream start at the beginning/tail */ 323 ev = list_tail(&zevent_list); 324 if (ev == NULL) { 325 error = ENOENT; 326 goto out; 327 } 328 } else { 329 /* 330 * Existing stream continue with the next element and remove 331 * ourselves from the wait queue for the previous element 332 */ 333 ev = list_prev(&zevent_list, ze->ze_zevent); 334 if (ev == NULL) { 335 error = ENOENT; 336 goto out; 337 } 338 } 339 340 VERIFY(nvlist_size(ev->ev_nvl, &size, NV_ENCODE_NATIVE) == 0); 341 if (size > *event_size) { 342 *event_size = size; 343 error = ENOMEM; 344 goto out; 345 } 346 347 if (ze->ze_zevent) 348 list_remove(&ze->ze_zevent->ev_ze_list, ze); 349 350 ze->ze_zevent = ev; 351 list_insert_head(&ev->ev_ze_list, ze); 352 (void) nvlist_dup(ev->ev_nvl, event, KM_SLEEP); 353 *dropped = ze->ze_dropped; 354 355 #ifdef _KERNEL 356 /* Include events dropped due to rate limiting */ 357 *dropped += atomic_swap_64(&ratelimit_dropped, 0); 358 #endif 359 ze->ze_dropped = 0; 360 out: 361 mutex_exit(&zevent_lock); 362 363 return (error); 364 } 365 366 /* 367 * Wait in an interruptible state for any new events. 368 */ 369 int 370 zfs_zevent_wait(zfs_zevent_t *ze) 371 { 372 int error = EAGAIN; 373 374 mutex_enter(&zevent_lock); 375 zevent_waiters++; 376 377 while (error == EAGAIN) { 378 if (zevent_flags & ZEVENT_SHUTDOWN) { 379 error = SET_ERROR(ESHUTDOWN); 380 break; 381 } 382 383 error = cv_wait_sig(&zevent_cv, &zevent_lock); 384 if (signal_pending(current)) { 385 error = SET_ERROR(EINTR); 386 break; 387 } else if (!list_is_empty(&zevent_list)) { 388 error = 0; 389 continue; 390 } else { 391 error = EAGAIN; 392 } 393 } 394 395 zevent_waiters--; 396 mutex_exit(&zevent_lock); 397 398 return (error); 399 } 400 401 /* 402 * The caller may seek to a specific EID by passing that EID. If the EID 403 * is still available in the posted list of events the cursor is positioned 404 * there. Otherwise ENOENT is returned and the cursor is not moved. 405 * 406 * There are two reserved EIDs which may be passed and will never fail. 407 * ZEVENT_SEEK_START positions the cursor at the start of the list, and 408 * ZEVENT_SEEK_END positions the cursor at the end of the list. 409 */ 410 int 411 zfs_zevent_seek(zfs_zevent_t *ze, uint64_t eid) 412 { 413 zevent_t *ev; 414 int error = 0; 415 416 mutex_enter(&zevent_lock); 417 418 if (eid == ZEVENT_SEEK_START) { 419 if (ze->ze_zevent) 420 list_remove(&ze->ze_zevent->ev_ze_list, ze); 421 422 ze->ze_zevent = NULL; 423 goto out; 424 } 425 426 if (eid == ZEVENT_SEEK_END) { 427 if (ze->ze_zevent) 428 list_remove(&ze->ze_zevent->ev_ze_list, ze); 429 430 ev = list_head(&zevent_list); 431 if (ev) { 432 ze->ze_zevent = ev; 433 list_insert_head(&ev->ev_ze_list, ze); 434 } else { 435 ze->ze_zevent = NULL; 436 } 437 438 goto out; 439 } 440 441 for (ev = list_tail(&zevent_list); ev != NULL; 442 ev = list_prev(&zevent_list, ev)) { 443 if (ev->ev_eid == eid) { 444 if (ze->ze_zevent) 445 list_remove(&ze->ze_zevent->ev_ze_list, ze); 446 447 ze->ze_zevent = ev; 448 list_insert_head(&ev->ev_ze_list, ze); 449 break; 450 } 451 } 452 453 if (ev == NULL) 454 error = ENOENT; 455 456 out: 457 mutex_exit(&zevent_lock); 458 459 return (error); 460 } 461 462 void 463 zfs_zevent_init(zfs_zevent_t **zep) 464 { 465 zfs_zevent_t *ze; 466 467 ze = *zep = kmem_zalloc(sizeof (zfs_zevent_t), KM_SLEEP); 468 list_link_init(&ze->ze_node); 469 } 470 471 void 472 zfs_zevent_destroy(zfs_zevent_t *ze) 473 { 474 mutex_enter(&zevent_lock); 475 if (ze->ze_zevent) 476 list_remove(&ze->ze_zevent->ev_ze_list, ze); 477 mutex_exit(&zevent_lock); 478 479 kmem_free(ze, sizeof (zfs_zevent_t)); 480 } 481 #endif /* _KERNEL */ 482 483 /* 484 * Wrappers for FM nvlist allocators 485 */ 486 /* ARGSUSED */ 487 static void * 488 i_fm_alloc(nv_alloc_t *nva, size_t size) 489 { 490 return (kmem_zalloc(size, KM_SLEEP)); 491 } 492 493 /* ARGSUSED */ 494 static void 495 i_fm_free(nv_alloc_t *nva, void *buf, size_t size) 496 { 497 kmem_free(buf, size); 498 } 499 500 const nv_alloc_ops_t fm_mem_alloc_ops = { 501 .nv_ao_init = NULL, 502 .nv_ao_fini = NULL, 503 .nv_ao_alloc = i_fm_alloc, 504 .nv_ao_free = i_fm_free, 505 .nv_ao_reset = NULL 506 }; 507 508 /* 509 * Create and initialize a new nv_alloc_t for a fixed buffer, buf. A pointer 510 * to the newly allocated nv_alloc_t structure is returned upon success or NULL 511 * is returned to indicate that the nv_alloc structure could not be created. 512 */ 513 nv_alloc_t * 514 fm_nva_xcreate(char *buf, size_t bufsz) 515 { 516 nv_alloc_t *nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP); 517 518 if (bufsz == 0 || nv_alloc_init(nvhdl, nv_fixed_ops, buf, bufsz) != 0) { 519 kmem_free(nvhdl, sizeof (nv_alloc_t)); 520 return (NULL); 521 } 522 523 return (nvhdl); 524 } 525 526 /* 527 * Destroy a previously allocated nv_alloc structure. The fixed buffer 528 * associated with nva must be freed by the caller. 529 */ 530 void 531 fm_nva_xdestroy(nv_alloc_t *nva) 532 { 533 nv_alloc_fini(nva); 534 kmem_free(nva, sizeof (nv_alloc_t)); 535 } 536 537 /* 538 * Create a new nv list. A pointer to a new nv list structure is returned 539 * upon success or NULL is returned to indicate that the structure could 540 * not be created. The newly created nv list is created and managed by the 541 * operations installed in nva. If nva is NULL, the default FMA nva 542 * operations are installed and used. 543 * 544 * When called from the kernel and nva == NULL, this function must be called 545 * from passive kernel context with no locks held that can prevent a 546 * sleeping memory allocation from occurring. Otherwise, this function may 547 * be called from other kernel contexts as long a valid nva created via 548 * fm_nva_create() is supplied. 549 */ 550 nvlist_t * 551 fm_nvlist_create(nv_alloc_t *nva) 552 { 553 int hdl_alloced = 0; 554 nvlist_t *nvl; 555 nv_alloc_t *nvhdl; 556 557 if (nva == NULL) { 558 nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP); 559 560 if (nv_alloc_init(nvhdl, &fm_mem_alloc_ops, NULL, 0) != 0) { 561 kmem_free(nvhdl, sizeof (nv_alloc_t)); 562 return (NULL); 563 } 564 hdl_alloced = 1; 565 } else { 566 nvhdl = nva; 567 } 568 569 if (nvlist_xalloc(&nvl, NV_UNIQUE_NAME, nvhdl) != 0) { 570 if (hdl_alloced) { 571 nv_alloc_fini(nvhdl); 572 kmem_free(nvhdl, sizeof (nv_alloc_t)); 573 } 574 return (NULL); 575 } 576 577 return (nvl); 578 } 579 580 /* 581 * Destroy a previously allocated nvlist structure. flag indicates whether 582 * or not the associated nva structure should be freed (FM_NVA_FREE) or 583 * retained (FM_NVA_RETAIN). Retaining the nv alloc structure allows 584 * it to be re-used for future nvlist creation operations. 585 */ 586 void 587 fm_nvlist_destroy(nvlist_t *nvl, int flag) 588 { 589 nv_alloc_t *nva = nvlist_lookup_nv_alloc(nvl); 590 591 nvlist_free(nvl); 592 593 if (nva != NULL) { 594 if (flag == FM_NVA_FREE) 595 fm_nva_xdestroy(nva); 596 } 597 } 598 599 int 600 i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap) 601 { 602 int nelem, ret = 0; 603 data_type_t type; 604 605 while (ret == 0 && name != NULL) { 606 type = va_arg(ap, data_type_t); 607 switch (type) { 608 case DATA_TYPE_BYTE: 609 ret = nvlist_add_byte(payload, name, 610 va_arg(ap, uint_t)); 611 break; 612 case DATA_TYPE_BYTE_ARRAY: 613 nelem = va_arg(ap, int); 614 ret = nvlist_add_byte_array(payload, name, 615 va_arg(ap, uchar_t *), nelem); 616 break; 617 case DATA_TYPE_BOOLEAN_VALUE: 618 ret = nvlist_add_boolean_value(payload, name, 619 va_arg(ap, boolean_t)); 620 break; 621 case DATA_TYPE_BOOLEAN_ARRAY: 622 nelem = va_arg(ap, int); 623 ret = nvlist_add_boolean_array(payload, name, 624 va_arg(ap, boolean_t *), nelem); 625 break; 626 case DATA_TYPE_INT8: 627 ret = nvlist_add_int8(payload, name, 628 va_arg(ap, int)); 629 break; 630 case DATA_TYPE_INT8_ARRAY: 631 nelem = va_arg(ap, int); 632 ret = nvlist_add_int8_array(payload, name, 633 va_arg(ap, int8_t *), nelem); 634 break; 635 case DATA_TYPE_UINT8: 636 ret = nvlist_add_uint8(payload, name, 637 va_arg(ap, uint_t)); 638 break; 639 case DATA_TYPE_UINT8_ARRAY: 640 nelem = va_arg(ap, int); 641 ret = nvlist_add_uint8_array(payload, name, 642 va_arg(ap, uint8_t *), nelem); 643 break; 644 case DATA_TYPE_INT16: 645 ret = nvlist_add_int16(payload, name, 646 va_arg(ap, int)); 647 break; 648 case DATA_TYPE_INT16_ARRAY: 649 nelem = va_arg(ap, int); 650 ret = nvlist_add_int16_array(payload, name, 651 va_arg(ap, int16_t *), nelem); 652 break; 653 case DATA_TYPE_UINT16: 654 ret = nvlist_add_uint16(payload, name, 655 va_arg(ap, uint_t)); 656 break; 657 case DATA_TYPE_UINT16_ARRAY: 658 nelem = va_arg(ap, int); 659 ret = nvlist_add_uint16_array(payload, name, 660 va_arg(ap, uint16_t *), nelem); 661 break; 662 case DATA_TYPE_INT32: 663 ret = nvlist_add_int32(payload, name, 664 va_arg(ap, int32_t)); 665 break; 666 case DATA_TYPE_INT32_ARRAY: 667 nelem = va_arg(ap, int); 668 ret = nvlist_add_int32_array(payload, name, 669 va_arg(ap, int32_t *), nelem); 670 break; 671 case DATA_TYPE_UINT32: 672 ret = nvlist_add_uint32(payload, name, 673 va_arg(ap, uint32_t)); 674 break; 675 case DATA_TYPE_UINT32_ARRAY: 676 nelem = va_arg(ap, int); 677 ret = nvlist_add_uint32_array(payload, name, 678 va_arg(ap, uint32_t *), nelem); 679 break; 680 case DATA_TYPE_INT64: 681 ret = nvlist_add_int64(payload, name, 682 va_arg(ap, int64_t)); 683 break; 684 case DATA_TYPE_INT64_ARRAY: 685 nelem = va_arg(ap, int); 686 ret = nvlist_add_int64_array(payload, name, 687 va_arg(ap, int64_t *), nelem); 688 break; 689 case DATA_TYPE_UINT64: 690 ret = nvlist_add_uint64(payload, name, 691 va_arg(ap, uint64_t)); 692 break; 693 case DATA_TYPE_UINT64_ARRAY: 694 nelem = va_arg(ap, int); 695 ret = nvlist_add_uint64_array(payload, name, 696 va_arg(ap, uint64_t *), nelem); 697 break; 698 case DATA_TYPE_STRING: 699 ret = nvlist_add_string(payload, name, 700 va_arg(ap, char *)); 701 break; 702 case DATA_TYPE_STRING_ARRAY: 703 nelem = va_arg(ap, int); 704 ret = nvlist_add_string_array(payload, name, 705 va_arg(ap, char **), nelem); 706 break; 707 case DATA_TYPE_NVLIST: 708 ret = nvlist_add_nvlist(payload, name, 709 va_arg(ap, nvlist_t *)); 710 break; 711 case DATA_TYPE_NVLIST_ARRAY: 712 nelem = va_arg(ap, int); 713 ret = nvlist_add_nvlist_array(payload, name, 714 va_arg(ap, nvlist_t **), nelem); 715 break; 716 default: 717 ret = EINVAL; 718 } 719 720 name = va_arg(ap, char *); 721 } 722 return (ret); 723 } 724 725 void 726 fm_payload_set(nvlist_t *payload, ...) 727 { 728 int ret; 729 const char *name; 730 va_list ap; 731 732 va_start(ap, payload); 733 name = va_arg(ap, char *); 734 ret = i_fm_payload_set(payload, name, ap); 735 va_end(ap); 736 737 if (ret) 738 atomic_inc_64(&erpt_kstat_data.payload_set_failed.value.ui64); 739 } 740 741 /* 742 * Set-up and validate the members of an ereport event according to: 743 * 744 * Member name Type Value 745 * ==================================================== 746 * class string ereport 747 * version uint8_t 0 748 * ena uint64_t <ena> 749 * detector nvlist_t <detector> 750 * ereport-payload nvlist_t <var args> 751 * 752 * We don't actually add a 'version' member to the payload. Really, 753 * the version quoted to us by our caller is that of the category 1 754 * "ereport" event class (and we require FM_EREPORT_VERS0) but 755 * the payload version of the actual leaf class event under construction 756 * may be something else. Callers should supply a version in the varargs, 757 * or (better) we could take two version arguments - one for the 758 * ereport category 1 classification (expect FM_EREPORT_VERS0) and one 759 * for the leaf class. 760 */ 761 void 762 fm_ereport_set(nvlist_t *ereport, int version, const char *erpt_class, 763 uint64_t ena, const nvlist_t *detector, ...) 764 { 765 char ereport_class[FM_MAX_CLASS]; 766 const char *name; 767 va_list ap; 768 int ret; 769 770 if (version != FM_EREPORT_VERS0) { 771 atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); 772 return; 773 } 774 775 (void) snprintf(ereport_class, FM_MAX_CLASS, "%s.%s", 776 FM_EREPORT_CLASS, erpt_class); 777 if (nvlist_add_string(ereport, FM_CLASS, ereport_class) != 0) { 778 atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); 779 return; 780 } 781 782 if (nvlist_add_uint64(ereport, FM_EREPORT_ENA, ena)) { 783 atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); 784 } 785 786 if (nvlist_add_nvlist(ereport, FM_EREPORT_DETECTOR, 787 (nvlist_t *)detector) != 0) { 788 atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); 789 } 790 791 va_start(ap, detector); 792 name = va_arg(ap, const char *); 793 ret = i_fm_payload_set(ereport, name, ap); 794 va_end(ap); 795 796 if (ret) 797 atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); 798 } 799 800 /* 801 * Set-up and validate the members of an hc fmri according to; 802 * 803 * Member name Type Value 804 * =================================================== 805 * version uint8_t 0 806 * auth nvlist_t <auth> 807 * hc-name string <name> 808 * hc-id string <id> 809 * 810 * Note that auth and hc-id are optional members. 811 */ 812 813 #define HC_MAXPAIRS 20 814 #define HC_MAXNAMELEN 50 815 816 static int 817 fm_fmri_hc_set_common(nvlist_t *fmri, int version, const nvlist_t *auth) 818 { 819 if (version != FM_HC_SCHEME_VERSION) { 820 atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); 821 return (0); 822 } 823 824 if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0 || 825 nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0) { 826 atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); 827 return (0); 828 } 829 830 if (auth != NULL && nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY, 831 (nvlist_t *)auth) != 0) { 832 atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); 833 return (0); 834 } 835 836 return (1); 837 } 838 839 void 840 fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth, 841 nvlist_t *snvl, int npairs, ...) 842 { 843 nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri); 844 nvlist_t *pairs[HC_MAXPAIRS]; 845 va_list ap; 846 int i; 847 848 if (!fm_fmri_hc_set_common(fmri, version, auth)) 849 return; 850 851 npairs = MIN(npairs, HC_MAXPAIRS); 852 853 va_start(ap, npairs); 854 for (i = 0; i < npairs; i++) { 855 const char *name = va_arg(ap, const char *); 856 uint32_t id = va_arg(ap, uint32_t); 857 char idstr[11]; 858 859 (void) snprintf(idstr, sizeof (idstr), "%u", id); 860 861 pairs[i] = fm_nvlist_create(nva); 862 if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 || 863 nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) { 864 atomic_inc_64( 865 &erpt_kstat_data.fmri_set_failed.value.ui64); 866 } 867 } 868 va_end(ap); 869 870 if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs) != 0) 871 atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); 872 873 for (i = 0; i < npairs; i++) 874 fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN); 875 876 if (snvl != NULL) { 877 if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) { 878 atomic_inc_64( 879 &erpt_kstat_data.fmri_set_failed.value.ui64); 880 } 881 } 882 } 883 884 void 885 fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth, 886 nvlist_t *snvl, nvlist_t *bboard, int npairs, ...) 887 { 888 nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri); 889 nvlist_t *pairs[HC_MAXPAIRS]; 890 nvlist_t **hcl; 891 uint_t n; 892 int i, j; 893 va_list ap; 894 char *hcname, *hcid; 895 896 if (!fm_fmri_hc_set_common(fmri, version, auth)) 897 return; 898 899 /* 900 * copy the bboard nvpairs to the pairs array 901 */ 902 if (nvlist_lookup_nvlist_array(bboard, FM_FMRI_HC_LIST, &hcl, &n) 903 != 0) { 904 atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); 905 return; 906 } 907 908 for (i = 0; i < n; i++) { 909 if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME, 910 &hcname) != 0) { 911 atomic_inc_64( 912 &erpt_kstat_data.fmri_set_failed.value.ui64); 913 return; 914 } 915 if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &hcid) != 0) { 916 atomic_inc_64( 917 &erpt_kstat_data.fmri_set_failed.value.ui64); 918 return; 919 } 920 921 pairs[i] = fm_nvlist_create(nva); 922 if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, hcname) != 0 || 923 nvlist_add_string(pairs[i], FM_FMRI_HC_ID, hcid) != 0) { 924 for (j = 0; j <= i; j++) { 925 if (pairs[j] != NULL) 926 fm_nvlist_destroy(pairs[j], 927 FM_NVA_RETAIN); 928 } 929 atomic_inc_64( 930 &erpt_kstat_data.fmri_set_failed.value.ui64); 931 return; 932 } 933 } 934 935 /* 936 * create the pairs from passed in pairs 937 */ 938 npairs = MIN(npairs, HC_MAXPAIRS); 939 940 va_start(ap, npairs); 941 for (i = n; i < npairs + n; i++) { 942 const char *name = va_arg(ap, const char *); 943 uint32_t id = va_arg(ap, uint32_t); 944 char idstr[11]; 945 (void) snprintf(idstr, sizeof (idstr), "%u", id); 946 pairs[i] = fm_nvlist_create(nva); 947 if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 || 948 nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) { 949 for (j = 0; j <= i; j++) { 950 if (pairs[j] != NULL) 951 fm_nvlist_destroy(pairs[j], 952 FM_NVA_RETAIN); 953 } 954 atomic_inc_64( 955 &erpt_kstat_data.fmri_set_failed.value.ui64); 956 return; 957 } 958 } 959 va_end(ap); 960 961 /* 962 * Create the fmri hc list 963 */ 964 if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, 965 npairs + n) != 0) { 966 atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); 967 return; 968 } 969 970 for (i = 0; i < npairs + n; i++) { 971 fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN); 972 } 973 974 if (snvl != NULL) { 975 if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) { 976 atomic_inc_64( 977 &erpt_kstat_data.fmri_set_failed.value.ui64); 978 return; 979 } 980 } 981 } 982 983 /* 984 * Set-up and validate the members of an dev fmri according to: 985 * 986 * Member name Type Value 987 * ==================================================== 988 * version uint8_t 0 989 * auth nvlist_t <auth> 990 * devpath string <devpath> 991 * [devid] string <devid> 992 * [target-port-l0id] string <target-port-lun0-id> 993 * 994 * Note that auth and devid are optional members. 995 */ 996 void 997 fm_fmri_dev_set(nvlist_t *fmri_dev, int version, const nvlist_t *auth, 998 const char *devpath, const char *devid, const char *tpl0) 999 { 1000 int err = 0; 1001 1002 if (version != DEV_SCHEME_VERSION0) { 1003 atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); 1004 return; 1005 } 1006 1007 err |= nvlist_add_uint8(fmri_dev, FM_VERSION, version); 1008 err |= nvlist_add_string(fmri_dev, FM_FMRI_SCHEME, FM_FMRI_SCHEME_DEV); 1009 1010 if (auth != NULL) { 1011 err |= nvlist_add_nvlist(fmri_dev, FM_FMRI_AUTHORITY, 1012 (nvlist_t *)auth); 1013 } 1014 1015 err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_PATH, devpath); 1016 1017 if (devid != NULL) 1018 err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_ID, devid); 1019 1020 if (tpl0 != NULL) 1021 err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_TGTPTLUN0, tpl0); 1022 1023 if (err) 1024 atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); 1025 1026 } 1027 1028 /* 1029 * Set-up and validate the members of an cpu fmri according to: 1030 * 1031 * Member name Type Value 1032 * ==================================================== 1033 * version uint8_t 0 1034 * auth nvlist_t <auth> 1035 * cpuid uint32_t <cpu_id> 1036 * cpumask uint8_t <cpu_mask> 1037 * serial uint64_t <serial_id> 1038 * 1039 * Note that auth, cpumask, serial are optional members. 1040 * 1041 */ 1042 void 1043 fm_fmri_cpu_set(nvlist_t *fmri_cpu, int version, const nvlist_t *auth, 1044 uint32_t cpu_id, uint8_t *cpu_maskp, const char *serial_idp) 1045 { 1046 uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64; 1047 1048 if (version < CPU_SCHEME_VERSION1) { 1049 atomic_inc_64(failedp); 1050 return; 1051 } 1052 1053 if (nvlist_add_uint8(fmri_cpu, FM_VERSION, version) != 0) { 1054 atomic_inc_64(failedp); 1055 return; 1056 } 1057 1058 if (nvlist_add_string(fmri_cpu, FM_FMRI_SCHEME, 1059 FM_FMRI_SCHEME_CPU) != 0) { 1060 atomic_inc_64(failedp); 1061 return; 1062 } 1063 1064 if (auth != NULL && nvlist_add_nvlist(fmri_cpu, FM_FMRI_AUTHORITY, 1065 (nvlist_t *)auth) != 0) 1066 atomic_inc_64(failedp); 1067 1068 if (nvlist_add_uint32(fmri_cpu, FM_FMRI_CPU_ID, cpu_id) != 0) 1069 atomic_inc_64(failedp); 1070 1071 if (cpu_maskp != NULL && nvlist_add_uint8(fmri_cpu, FM_FMRI_CPU_MASK, 1072 *cpu_maskp) != 0) 1073 atomic_inc_64(failedp); 1074 1075 if (serial_idp == NULL || nvlist_add_string(fmri_cpu, 1076 FM_FMRI_CPU_SERIAL_ID, (char *)serial_idp) != 0) 1077 atomic_inc_64(failedp); 1078 } 1079 1080 /* 1081 * Set-up and validate the members of a mem according to: 1082 * 1083 * Member name Type Value 1084 * ==================================================== 1085 * version uint8_t 0 1086 * auth nvlist_t <auth> [optional] 1087 * unum string <unum> 1088 * serial string <serial> [optional*] 1089 * offset uint64_t <offset> [optional] 1090 * 1091 * * serial is required if offset is present 1092 */ 1093 void 1094 fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth, 1095 const char *unum, const char *serial, uint64_t offset) 1096 { 1097 if (version != MEM_SCHEME_VERSION0) { 1098 atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); 1099 return; 1100 } 1101 1102 if (!serial && (offset != (uint64_t)-1)) { 1103 atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); 1104 return; 1105 } 1106 1107 if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) { 1108 atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); 1109 return; 1110 } 1111 1112 if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_MEM) != 0) { 1113 atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); 1114 return; 1115 } 1116 1117 if (auth != NULL) { 1118 if (nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY, 1119 (nvlist_t *)auth) != 0) { 1120 atomic_inc_64( 1121 &erpt_kstat_data.fmri_set_failed.value.ui64); 1122 } 1123 } 1124 1125 if (nvlist_add_string(fmri, FM_FMRI_MEM_UNUM, unum) != 0) { 1126 atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); 1127 } 1128 1129 if (serial != NULL) { 1130 if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID, 1131 (char **)&serial, 1) != 0) { 1132 atomic_inc_64( 1133 &erpt_kstat_data.fmri_set_failed.value.ui64); 1134 } 1135 if (offset != (uint64_t)-1 && nvlist_add_uint64(fmri, 1136 FM_FMRI_MEM_OFFSET, offset) != 0) { 1137 atomic_inc_64( 1138 &erpt_kstat_data.fmri_set_failed.value.ui64); 1139 } 1140 } 1141 } 1142 1143 void 1144 fm_fmri_zfs_set(nvlist_t *fmri, int version, uint64_t pool_guid, 1145 uint64_t vdev_guid) 1146 { 1147 if (version != ZFS_SCHEME_VERSION0) { 1148 atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); 1149 return; 1150 } 1151 1152 if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) { 1153 atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); 1154 return; 1155 } 1156 1157 if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) { 1158 atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); 1159 return; 1160 } 1161 1162 if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) { 1163 atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); 1164 } 1165 1166 if (vdev_guid != 0) { 1167 if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) { 1168 atomic_inc_64( 1169 &erpt_kstat_data.fmri_set_failed.value.ui64); 1170 } 1171 } 1172 } 1173 1174 uint64_t 1175 fm_ena_increment(uint64_t ena) 1176 { 1177 uint64_t new_ena; 1178 1179 switch (ENA_FORMAT(ena)) { 1180 case FM_ENA_FMT1: 1181 new_ena = ena + (1 << ENA_FMT1_GEN_SHFT); 1182 break; 1183 case FM_ENA_FMT2: 1184 new_ena = ena + (1 << ENA_FMT2_GEN_SHFT); 1185 break; 1186 default: 1187 new_ena = 0; 1188 } 1189 1190 return (new_ena); 1191 } 1192 1193 uint64_t 1194 fm_ena_generate_cpu(uint64_t timestamp, processorid_t cpuid, uchar_t format) 1195 { 1196 uint64_t ena = 0; 1197 1198 switch (format) { 1199 case FM_ENA_FMT1: 1200 if (timestamp) { 1201 ena = (uint64_t)((format & ENA_FORMAT_MASK) | 1202 ((cpuid << ENA_FMT1_CPUID_SHFT) & 1203 ENA_FMT1_CPUID_MASK) | 1204 ((timestamp << ENA_FMT1_TIME_SHFT) & 1205 ENA_FMT1_TIME_MASK)); 1206 } else { 1207 ena = (uint64_t)((format & ENA_FORMAT_MASK) | 1208 ((cpuid << ENA_FMT1_CPUID_SHFT) & 1209 ENA_FMT1_CPUID_MASK) | 1210 ((gethrtime() << ENA_FMT1_TIME_SHFT) & 1211 ENA_FMT1_TIME_MASK)); 1212 } 1213 break; 1214 case FM_ENA_FMT2: 1215 ena = (uint64_t)((format & ENA_FORMAT_MASK) | 1216 ((timestamp << ENA_FMT2_TIME_SHFT) & ENA_FMT2_TIME_MASK)); 1217 break; 1218 default: 1219 break; 1220 } 1221 1222 return (ena); 1223 } 1224 1225 uint64_t 1226 fm_ena_generate(uint64_t timestamp, uchar_t format) 1227 { 1228 uint64_t ena; 1229 1230 kpreempt_disable(); 1231 ena = fm_ena_generate_cpu(timestamp, getcpuid(), format); 1232 kpreempt_enable(); 1233 1234 return (ena); 1235 } 1236 1237 uint64_t 1238 fm_ena_generation_get(uint64_t ena) 1239 { 1240 uint64_t gen; 1241 1242 switch (ENA_FORMAT(ena)) { 1243 case FM_ENA_FMT1: 1244 gen = (ena & ENA_FMT1_GEN_MASK) >> ENA_FMT1_GEN_SHFT; 1245 break; 1246 case FM_ENA_FMT2: 1247 gen = (ena & ENA_FMT2_GEN_MASK) >> ENA_FMT2_GEN_SHFT; 1248 break; 1249 default: 1250 gen = 0; 1251 break; 1252 } 1253 1254 return (gen); 1255 } 1256 1257 uchar_t 1258 fm_ena_format_get(uint64_t ena) 1259 { 1260 1261 return (ENA_FORMAT(ena)); 1262 } 1263 1264 uint64_t 1265 fm_ena_id_get(uint64_t ena) 1266 { 1267 uint64_t id; 1268 1269 switch (ENA_FORMAT(ena)) { 1270 case FM_ENA_FMT1: 1271 id = (ena & ENA_FMT1_ID_MASK) >> ENA_FMT1_ID_SHFT; 1272 break; 1273 case FM_ENA_FMT2: 1274 id = (ena & ENA_FMT2_ID_MASK) >> ENA_FMT2_ID_SHFT; 1275 break; 1276 default: 1277 id = 0; 1278 } 1279 1280 return (id); 1281 } 1282 1283 uint64_t 1284 fm_ena_time_get(uint64_t ena) 1285 { 1286 uint64_t time; 1287 1288 switch (ENA_FORMAT(ena)) { 1289 case FM_ENA_FMT1: 1290 time = (ena & ENA_FMT1_TIME_MASK) >> ENA_FMT1_TIME_SHFT; 1291 break; 1292 case FM_ENA_FMT2: 1293 time = (ena & ENA_FMT2_TIME_MASK) >> ENA_FMT2_TIME_SHFT; 1294 break; 1295 default: 1296 time = 0; 1297 } 1298 1299 return (time); 1300 } 1301 1302 #ifdef _KERNEL 1303 /* 1304 * Helper function to increment ereport dropped count. Used by the event 1305 * rate limiting code to give feedback to the user about how many events were 1306 * rate limited by including them in the 'dropped' count. 1307 */ 1308 void 1309 fm_erpt_dropped_increment(void) 1310 { 1311 atomic_inc_64(&ratelimit_dropped); 1312 } 1313 1314 void 1315 fm_init(void) 1316 { 1317 zevent_len_cur = 0; 1318 zevent_flags = 0; 1319 1320 /* Initialize zevent allocation and generation kstats */ 1321 fm_ksp = kstat_create("zfs", 0, "fm", "misc", KSTAT_TYPE_NAMED, 1322 sizeof (struct erpt_kstat) / sizeof (kstat_named_t), 1323 KSTAT_FLAG_VIRTUAL); 1324 1325 if (fm_ksp != NULL) { 1326 fm_ksp->ks_data = &erpt_kstat_data; 1327 kstat_install(fm_ksp); 1328 } else { 1329 cmn_err(CE_NOTE, "failed to create fm/misc kstat\n"); 1330 } 1331 1332 mutex_init(&zevent_lock, NULL, MUTEX_DEFAULT, NULL); 1333 list_create(&zevent_list, sizeof (zevent_t), 1334 offsetof(zevent_t, ev_node)); 1335 cv_init(&zevent_cv, NULL, CV_DEFAULT, NULL); 1336 1337 zfs_ereport_init(); 1338 } 1339 1340 void 1341 fm_fini(void) 1342 { 1343 int count; 1344 1345 zfs_ereport_fini(); 1346 1347 zfs_zevent_drain_all(&count); 1348 1349 mutex_enter(&zevent_lock); 1350 cv_broadcast(&zevent_cv); 1351 1352 zevent_flags |= ZEVENT_SHUTDOWN; 1353 while (zevent_waiters > 0) { 1354 mutex_exit(&zevent_lock); 1355 schedule(); 1356 mutex_enter(&zevent_lock); 1357 } 1358 mutex_exit(&zevent_lock); 1359 1360 cv_destroy(&zevent_cv); 1361 list_destroy(&zevent_list); 1362 mutex_destroy(&zevent_lock); 1363 1364 if (fm_ksp != NULL) { 1365 kstat_delete(fm_ksp); 1366 fm_ksp = NULL; 1367 } 1368 } 1369 #endif /* _KERNEL */ 1370 1371 ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, len_max, INT, ZMOD_RW, 1372 "Max event queue length"); 1373