/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Fault Management Architecture (FMA) Resource and Protocol Support
 *
 * The routines contained herein provide services to support kernel subsystems
 * in publishing fault management telemetry (see PSARC 2002/412 and 2003/089).
 *
 * Name-Value Pair Lists
 *
 * The embodiment of an FMA protocol element (event, fmri or authority) is a
 * name-value pair list (nvlist_t).  FMA-specific nvlist constructor and
 * destructor functions, fm_nvlist_create() and fm_nvlist_destroy(), are used
 * to create an nvpair list using custom allocators.  Callers may choose to
 * allocate either from the kernel memory allocator, or from a preallocated
 * buffer, useful in constrained contexts like high-level interrupt routines.
 *
 * Protocol Event and FMRI Construction
 *
 * Convenience routines are provided to construct nvlist events according to
 * the FMA Event Protocol and Naming Schema specification for ereports and
 * FMRIs for the dev, cpu, hc, mem, legacy hc and de schemes.
 *
 * ENA Manipulation
 *
 * Routines to generate ENA formats 0, 1 and 2 are available as well as
 * routines to increment formats 1 and 2.  Individual fields within the
 * ENA are extractable via fm_ena_time_get(), fm_ena_id_get(),
 * fm_ena_format_get() and fm_ena_generation_get().
 */
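
/*
 * Illustrative sketch (not part of this file): the typical life cycle of
 * an FMA nvlist using the default kernel-memory allocator.  The payload
 * name and value below are hypothetical.
 *
 *	nvlist_t *nvl = fm_nvlist_create(NULL);
 *	if (nvl != NULL) {
 *		fm_payload_set(nvl,
 *		    "example-count", DATA_TYPE_UINT64, (uint64_t)42,
 *		    NULL);
 *		fm_nvlist_destroy(nvl, FM_NVA_FREE);
 *	}
 */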

#include <sys/types.h>
#include <sys/time.h>
#include <sys/list.h>
#include <sys/nvpair.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/sunddi.h>
#include <sys/systeminfo.h>
#include <sys/fm/util.h>
#include <sys/fm/protocol.h>
#include <sys/kstat.h>
#include <sys/zfs_context.h>
#ifdef _KERNEL
#include <sys/atomic.h>
#include <sys/condvar.h>
#include <sys/console.h>
#include <sys/zfs_ioctl.h>

int zfs_zevent_len_max = 0;
int zfs_zevent_cols = 80;
int zfs_zevent_console = 0;

static int zevent_len_cur = 0;
static int zevent_waiters = 0;
static int zevent_flags = 0;

/* Num events rate limited since the last time zfs_zevent_next() was called */
static uint64_t ratelimit_dropped = 0;

/*
 * The EID (Event IDentifier) is used to uniquely tag a zevent when it is
 * posted.  The posted EIDs are monotonically increasing but not persistent.
 * They will be reset to the initial value (1) each time the kernel module is
 * loaded.
 */
static uint64_t zevent_eid = 0;

static kmutex_t zevent_lock;
static list_t zevent_list;
static kcondvar_t zevent_cv;
#endif /* _KERNEL */


/*
 * Common fault management kstats to record event generation failures
 */

struct erpt_kstat {
	kstat_named_t	erpt_dropped;		/* num erpts dropped on post */
	kstat_named_t	erpt_set_failed;	/* num erpt set failures */
	kstat_named_t	fmri_set_failed;	/* num fmri set failures */
	kstat_named_t	payload_set_failed;	/* num payload set failures */
	kstat_named_t	erpt_duplicates;	/* num duplicate erpts */
};

static struct erpt_kstat erpt_kstat_data = {
	{ "erpt-dropped", KSTAT_DATA_UINT64 },
	{ "erpt-set-failed", KSTAT_DATA_UINT64 },
	{ "fmri-set-failed", KSTAT_DATA_UINT64 },
	{ "payload-set-failed", KSTAT_DATA_UINT64 },
	{ "erpt-duplicates", KSTAT_DATA_UINT64 }
};

kstat_t *fm_ksp;

#ifdef _KERNEL

/*
 * Formatting utility function for fm_nvprintr.  We attempt to wrap chunks of
 * output so they aren't split across console lines, and return the end column.
 */
/*PRINTFLIKE4*/
static int
fm_printf(int depth, int c, int cols, const char *format, ...)
{
	va_list ap;
	int width;
	char c1;

	va_start(ap, format);
	width = vsnprintf(&c1, sizeof (c1), format, ap);
	va_end(ap);

	if (c + width >= cols) {
		console_printf("\n");
		c = 0;
		if (format[0] != ' ' && depth > 0) {
			console_printf(" ");
			c++;
		}
	}

	va_start(ap, format);
	console_vprintf(format, ap);
	va_end(ap);

	return ((c + width) % cols);
}

/*
 * Recursively print an nvlist in the specified column width and return the
 * column we end up in.  This function is called by fm_nvprint(), below, and
 * calls itself recursively.  We generically format the entire nvpair using
 * hexadecimal integers and strings, and elide any integer arrays.  Arrays
 * are basically used for cache dumps right now, so we suppress them so as
 * not to overwhelm the amount of console output we produce at panic time.
 * This can be further enhanced as FMA technology grows based upon the needs
 * of consumers.  All FMA telemetry is logged using the dump device
 * transport, so the console output serves only as a fallback in case this
 * procedure is unsuccessful.
 */
static int
fm_nvprintr(nvlist_t *nvl, int d, int c, int cols)
{
	nvpair_t *nvp;

	for (nvp = nvlist_next_nvpair(nvl, NULL);
	    nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) {

		data_type_t type = nvpair_type(nvp);
		const char *name = nvpair_name(nvp);

		boolean_t b;
		uint8_t i8;
		uint16_t i16;
		uint32_t i32;
		uint64_t i64;
		char *str;
		nvlist_t *cnv;

		if (strcmp(name, FM_CLASS) == 0)
			continue; /* already printed by caller */

		c = fm_printf(d, c, cols, " %s=", name);

		switch (type) {
		case DATA_TYPE_BOOLEAN:
			c = fm_printf(d + 1, c, cols, " 1");
			break;

		case DATA_TYPE_BOOLEAN_VALUE:
			(void) nvpair_value_boolean_value(nvp, &b);
			c = fm_printf(d + 1, c, cols, b ? "1" : "0");
			break;

		case DATA_TYPE_BYTE:
			(void) nvpair_value_byte(nvp, &i8);
			c = fm_printf(d + 1, c, cols, "0x%x", i8);
			break;

		case DATA_TYPE_INT8:
			(void) nvpair_value_int8(nvp, (void *)&i8);
			c = fm_printf(d + 1, c, cols, "0x%x", i8);
			break;

		case DATA_TYPE_UINT8:
			(void) nvpair_value_uint8(nvp, &i8);
			c = fm_printf(d + 1, c, cols, "0x%x", i8);
			break;

		case DATA_TYPE_INT16:
			(void) nvpair_value_int16(nvp, (void *)&i16);
			c = fm_printf(d + 1, c, cols, "0x%x", i16);
			break;

		case DATA_TYPE_UINT16:
			(void) nvpair_value_uint16(nvp, &i16);
			c = fm_printf(d + 1, c, cols, "0x%x", i16);
			break;

		case DATA_TYPE_INT32:
			(void) nvpair_value_int32(nvp, (void *)&i32);
			c = fm_printf(d + 1, c, cols, "0x%x", i32);
			break;

		case DATA_TYPE_UINT32:
			(void) nvpair_value_uint32(nvp, &i32);
			c = fm_printf(d + 1, c, cols, "0x%x", i32);
			break;

		case DATA_TYPE_INT64:
			(void) nvpair_value_int64(nvp, (void *)&i64);
			c = fm_printf(d + 1, c, cols, "0x%llx",
			    (u_longlong_t)i64);
			break;

		case DATA_TYPE_UINT64:
			(void) nvpair_value_uint64(nvp, &i64);
			c = fm_printf(d + 1, c, cols, "0x%llx",
			    (u_longlong_t)i64);
			break;

		case DATA_TYPE_HRTIME:
			(void) nvpair_value_hrtime(nvp, (void *)&i64);
			c = fm_printf(d + 1, c, cols, "0x%llx",
			    (u_longlong_t)i64);
			break;

		case DATA_TYPE_STRING:
			(void) nvpair_value_string(nvp, &str);
			c = fm_printf(d + 1, c, cols, "\"%s\"",
			    str ? str : "<NULL>");
			break;

		case DATA_TYPE_NVLIST:
			c = fm_printf(d + 1, c, cols, "[");
			(void) nvpair_value_nvlist(nvp, &cnv);
			c = fm_nvprintr(cnv, d + 1, c, cols);
			c = fm_printf(d + 1, c, cols, " ]");
			break;

		case DATA_TYPE_NVLIST_ARRAY: {
			nvlist_t **val;
			uint_t i, nelem;

			c = fm_printf(d + 1, c, cols, "[");
			(void) nvpair_value_nvlist_array(nvp, &val, &nelem);
			for (i = 0; i < nelem; i++) {
				c = fm_nvprintr(val[i], d + 1, c, cols);
			}
			c = fm_printf(d + 1, c, cols, " ]");
			}
			break;

		case DATA_TYPE_INT8_ARRAY: {
			int8_t *val;
			uint_t i, nelem;

			c = fm_printf(d + 1, c, cols, "[ ");
			(void) nvpair_value_int8_array(nvp, &val, &nelem);
			for (i = 0; i < nelem; i++)
				c = fm_printf(d + 1, c, cols, "0x%llx ",
				    (u_longlong_t)val[i]);

			c = fm_printf(d + 1, c, cols, "]");
			break;
			}

		case DATA_TYPE_UINT8_ARRAY: {
			uint8_t *val;
			uint_t i, nelem;

			c = fm_printf(d + 1, c, cols, "[ ");
			(void) nvpair_value_uint8_array(nvp, &val, &nelem);
			for (i = 0; i < nelem; i++)
				c = fm_printf(d + 1, c, cols, "0x%llx ",
				    (u_longlong_t)val[i]);

			c = fm_printf(d + 1, c, cols, "]");
			break;
			}

		case DATA_TYPE_INT16_ARRAY: {
			int16_t *val;
			uint_t i, nelem;

			c = fm_printf(d + 1, c, cols, "[ ");
			(void) nvpair_value_int16_array(nvp, &val, &nelem);
			for (i = 0; i < nelem; i++)
				c = fm_printf(d + 1, c, cols, "0x%llx ",
				    (u_longlong_t)val[i]);

			c = fm_printf(d + 1, c, cols, "]");
			break;
			}

		case DATA_TYPE_UINT16_ARRAY: {
			uint16_t *val;
			uint_t i, nelem;

			c = fm_printf(d + 1, c, cols, "[ ");
			(void) nvpair_value_uint16_array(nvp, &val, &nelem);
			for (i = 0; i < nelem; i++)
				c = fm_printf(d + 1, c, cols, "0x%llx ",
				    (u_longlong_t)val[i]);

			c = fm_printf(d + 1, c, cols, "]");
			break;
			}

		case DATA_TYPE_INT32_ARRAY: {
			int32_t *val;
			uint_t i, nelem;

			c = fm_printf(d + 1, c, cols, "[ ");
			(void) nvpair_value_int32_array(nvp, &val, &nelem);
			for (i = 0; i < nelem; i++)
				c = fm_printf(d + 1, c, cols, "0x%llx ",
				    (u_longlong_t)val[i]);

			c = fm_printf(d + 1, c, cols, "]");
			break;
			}

		case DATA_TYPE_UINT32_ARRAY: {
			uint32_t *val;
			uint_t i, nelem;

			c = fm_printf(d + 1, c, cols, "[ ");
			(void) nvpair_value_uint32_array(nvp, &val, &nelem);
			for (i = 0; i < nelem; i++)
				c = fm_printf(d + 1, c, cols, "0x%llx ",
				    (u_longlong_t)val[i]);

			c = fm_printf(d + 1, c, cols, "]");
			break;
			}

		case DATA_TYPE_INT64_ARRAY: {
			int64_t *val;
			uint_t i, nelem;

			c = fm_printf(d + 1, c, cols, "[ ");
			(void) nvpair_value_int64_array(nvp, &val, &nelem);
			for (i = 0; i < nelem; i++)
				c = fm_printf(d + 1, c, cols, "0x%llx ",
				    (u_longlong_t)val[i]);

			c = fm_printf(d + 1, c, cols, "]");
			break;
			}

		case DATA_TYPE_UINT64_ARRAY: {
			uint64_t *val;
			uint_t i, nelem;

			c = fm_printf(d + 1, c, cols, "[ ");
			(void) nvpair_value_uint64_array(nvp, &val, &nelem);
			for (i = 0; i < nelem; i++)
				c = fm_printf(d + 1, c, cols, "0x%llx ",
				    (u_longlong_t)val[i]);

			c = fm_printf(d + 1, c, cols, "]");
			break;
			}

		case DATA_TYPE_STRING_ARRAY:
		case DATA_TYPE_BOOLEAN_ARRAY:
		case DATA_TYPE_BYTE_ARRAY:
			c = fm_printf(d + 1, c, cols, "[...]");
			break;

		case DATA_TYPE_UNKNOWN:
		case DATA_TYPE_DONTCARE:
			c = fm_printf(d + 1, c, cols, "<unknown>");
			break;
		}
	}

	return (c);
}

void
fm_nvprint(nvlist_t *nvl)
{
	char *class;
	int c = 0;

	console_printf("\n");

	if (nvlist_lookup_string(nvl, FM_CLASS, &class) == 0)
		c = fm_printf(0, c, zfs_zevent_cols, "%s", class);

	if (fm_nvprintr(nvl, 0, c, zfs_zevent_cols) != 0)
		console_printf("\n");

	console_printf("\n");
}

static zevent_t *
zfs_zevent_alloc(void)
{
	zevent_t *ev;

	ev = kmem_zalloc(sizeof (zevent_t), KM_SLEEP);

	list_create(&ev->ev_ze_list, sizeof (zfs_zevent_t),
	    offsetof(zfs_zevent_t, ze_node));
	list_link_init(&ev->ev_node);

	return (ev);
}

static void
zfs_zevent_free(zevent_t *ev)
{
	/* Run provided cleanup callback */
	ev->ev_cb(ev->ev_nvl, ev->ev_detector);

	list_destroy(&ev->ev_ze_list);
	kmem_free(ev, sizeof (zevent_t));
}

static void
zfs_zevent_drain(zevent_t *ev)
{
	zfs_zevent_t *ze;

	ASSERT(MUTEX_HELD(&zevent_lock));
	list_remove(&zevent_list, ev);

	/* Remove references to this event in all private file data */
	while ((ze = list_head(&ev->ev_ze_list)) != NULL) {
		list_remove(&ev->ev_ze_list, ze);
		ze->ze_zevent = NULL;
		ze->ze_dropped++;
	}

	zfs_zevent_free(ev);
}

void
zfs_zevent_drain_all(int *count)
{
	zevent_t *ev;

	mutex_enter(&zevent_lock);
	while ((ev = list_head(&zevent_list)) != NULL)
		zfs_zevent_drain(ev);

	*count = zevent_len_cur;
	zevent_len_cur = 0;
	mutex_exit(&zevent_lock);
}

/*
 * New zevents are inserted at the head.  If the maximum queue
 * length is exceeded a zevent will be drained from the tail.
 * As part of this any user space processes which currently have
 * a reference to this zevent_t in their private data will have
 * this reference set to NULL.
 */
static void
zfs_zevent_insert(zevent_t *ev)
{
	ASSERT(MUTEX_HELD(&zevent_lock));
	list_insert_head(&zevent_list, ev);

	if (zevent_len_cur >= zfs_zevent_len_max)
		zfs_zevent_drain(list_tail(&zevent_list));
	else
		zevent_len_cur++;
}

/*
 * Post a zevent. The cb will be called when nvl and detector are no longer
 * needed, i.e.:
 * - An error happened and a zevent can't be posted. In this case, cb is called
 *   before zfs_zevent_post() returns.
 * - The event is being drained and freed.
 */
int
zfs_zevent_post(nvlist_t *nvl, nvlist_t *detector, zevent_cb_t *cb)
{
	inode_timespec_t tv;
	int64_t tv_array[2];
	uint64_t eid;
	size_t nvl_size = 0;
	zevent_t *ev;
	int error;

	ASSERT(cb != NULL);

	gethrestime(&tv);
	tv_array[0] = tv.tv_sec;
	tv_array[1] = tv.tv_nsec;

	error = nvlist_add_int64_array(nvl, FM_EREPORT_TIME, tv_array, 2);
	if (error) {
		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
		goto out;
	}

	eid = atomic_inc_64_nv(&zevent_eid);
	error = nvlist_add_uint64(nvl, FM_EREPORT_EID, eid);
	if (error) {
		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
		goto out;
	}

	error = nvlist_size(nvl, &nvl_size, NV_ENCODE_NATIVE);
	if (error) {
		atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
		goto out;
	}

	if (nvl_size > ERPT_DATA_SZ || nvl_size == 0) {
		atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
		error = EOVERFLOW;
		goto out;
	}

	if (zfs_zevent_console)
		fm_nvprint(nvl);

	ev = zfs_zevent_alloc();
	if (ev == NULL) {
		atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
		error = ENOMEM;
		goto out;
	}

	ev->ev_nvl = nvl;
	ev->ev_detector = detector;
	ev->ev_cb = cb;
	ev->ev_eid = eid;

	mutex_enter(&zevent_lock);
	zfs_zevent_insert(ev);
	cv_broadcast(&zevent_cv);
	mutex_exit(&zevent_lock);

out:
	if (error)
		cb(nvl, detector);

	return (error);
}
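
/*
 * Illustrative sketch (hypothetical caller): the callback owns the final
 * cleanup of nvl and detector, whether the post succeeds (called when the
 * event is drained) or fails (called before zfs_zevent_post() returns).
 *
 *	static void
 *	example_zevent_cb(nvlist_t *nvl, nvlist_t *detector)
 *	{
 *		fm_nvlist_destroy(nvl, FM_NVA_FREE);
 *		if (detector != NULL)
 *			fm_nvlist_destroy(detector, FM_NVA_FREE);
 *	}
 *
 *	(void) zfs_zevent_post(nvl, detector, example_zevent_cb);
 */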

void
zfs_zevent_track_duplicate(void)
{
	atomic_inc_64(&erpt_kstat_data.erpt_duplicates.value.ui64);
}

static int
zfs_zevent_minor_to_state(minor_t minor, zfs_zevent_t **ze)
{
	*ze = zfsdev_get_state(minor, ZST_ZEVENT);
	if (*ze == NULL)
		return (SET_ERROR(EBADF));

	return (0);
}

int
zfs_zevent_fd_hold(int fd, minor_t *minorp, zfs_zevent_t **ze)
{
	int error;

	error = zfsdev_getminor(fd, minorp);
	if (error == 0)
		error = zfs_zevent_minor_to_state(*minorp, ze);

	if (error)
		zfs_zevent_fd_rele(fd);

	return (error);
}

void
zfs_zevent_fd_rele(int fd)
{
	zfs_file_put(fd);
}

/*
 * Get the next zevent in the stream and place a copy in 'event'.  This
 * may fail with ENOMEM if the encoded nvlist size exceeds the passed
 * 'event_size'.  In this case the stream pointer is not advanced and
 * 'event_size' is set to the minimum required buffer size.
 */
int
zfs_zevent_next(zfs_zevent_t *ze, nvlist_t **event, uint64_t *event_size,
    uint64_t *dropped)
{
	zevent_t *ev;
	size_t size;
	int error = 0;

	mutex_enter(&zevent_lock);
	if (ze->ze_zevent == NULL) {
		/* A new stream starts at the beginning/tail */
		ev = list_tail(&zevent_list);
		if (ev == NULL) {
			error = ENOENT;
			goto out;
		}
	} else {
		/*
		 * An existing stream continues with the next element and
		 * removes itself from the wait list of the previous element.
		 */
		ev = list_prev(&zevent_list, ze->ze_zevent);
		if (ev == NULL) {
			error = ENOENT;
			goto out;
		}
	}

	VERIFY(nvlist_size(ev->ev_nvl, &size, NV_ENCODE_NATIVE) == 0);
	if (size > *event_size) {
		*event_size = size;
		error = ENOMEM;
		goto out;
	}

	if (ze->ze_zevent)
		list_remove(&ze->ze_zevent->ev_ze_list, ze);

	ze->ze_zevent = ev;
	list_insert_head(&ev->ev_ze_list, ze);
	(void) nvlist_dup(ev->ev_nvl, event, KM_SLEEP);
	*dropped = ze->ze_dropped;

#ifdef _KERNEL
	/* Include events dropped due to rate limiting */
	*dropped += ratelimit_dropped;
	ratelimit_dropped = 0;
#endif
	ze->ze_dropped = 0;
out:
	mutex_exit(&zevent_lock);

	return (error);
}
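
/*
 * Illustrative sketch (hypothetical consumer): since a failed size check
 * does not advance the stream, ENOMEM can be handled by simply retrying
 * with the updated 'event_size'.
 *
 *	uint64_t size = 0, dropped = 0;
 *	nvlist_t *event = NULL;
 *	int error = zfs_zevent_next(ze, &event, &size, &dropped);
 *	if (error == ENOMEM)
 *		error = zfs_zevent_next(ze, &event, &size, &dropped);
 */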

/*
 * Wait in an interruptible state for any new events.
 */
int
zfs_zevent_wait(zfs_zevent_t *ze)
{
	int error = EAGAIN;

	mutex_enter(&zevent_lock);
	zevent_waiters++;

	while (error == EAGAIN) {
		if (zevent_flags & ZEVENT_SHUTDOWN) {
			error = SET_ERROR(ESHUTDOWN);
			break;
		}

		error = cv_wait_sig(&zevent_cv, &zevent_lock);
		if (signal_pending(current)) {
			error = SET_ERROR(EINTR);
			break;
		} else if (!list_is_empty(&zevent_list)) {
			error = 0;
			continue;
		} else {
			error = EAGAIN;
		}
	}

	zevent_waiters--;
	mutex_exit(&zevent_lock);

	return (error);
}

/*
 * The caller may seek to a specific EID by passing that EID.  If the EID
 * is still available in the posted list of events the cursor is positioned
 * there.  Otherwise ENOENT is returned and the cursor is not moved.
 *
 * There are two reserved EIDs which may be passed and will never fail.
 * ZEVENT_SEEK_START positions the cursor at the start of the list, and
 * ZEVENT_SEEK_END positions the cursor at the end of the list.
 */
int
zfs_zevent_seek(zfs_zevent_t *ze, uint64_t eid)
{
	zevent_t *ev;
	int error = 0;

	mutex_enter(&zevent_lock);

	if (eid == ZEVENT_SEEK_START) {
		if (ze->ze_zevent)
			list_remove(&ze->ze_zevent->ev_ze_list, ze);

		ze->ze_zevent = NULL;
		goto out;
	}

	if (eid == ZEVENT_SEEK_END) {
		if (ze->ze_zevent)
			list_remove(&ze->ze_zevent->ev_ze_list, ze);

		ev = list_head(&zevent_list);
		if (ev) {
			ze->ze_zevent = ev;
			list_insert_head(&ev->ev_ze_list, ze);
		} else {
			ze->ze_zevent = NULL;
		}

		goto out;
	}

	for (ev = list_tail(&zevent_list); ev != NULL;
	    ev = list_prev(&zevent_list, ev)) {
		if (ev->ev_eid == eid) {
			if (ze->ze_zevent)
				list_remove(&ze->ze_zevent->ev_ze_list, ze);

			ze->ze_zevent = ev;
			list_insert_head(&ev->ev_ze_list, ze);
			break;
		}
	}

	if (ev == NULL)
		error = ENOENT;

out:
	mutex_exit(&zevent_lock);

	return (error);
}
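
/*
 * Illustrative sketch: a consumer that only wants events posted from now
 * on seeks to the end of the list and then blocks for new arrivals.
 *
 *	(void) zfs_zevent_seek(ze, ZEVENT_SEEK_END);
 *	while (zfs_zevent_wait(ze) == 0) {
 *		... call zfs_zevent_next() until it returns ENOENT ...
 *	}
 */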

void
zfs_zevent_init(zfs_zevent_t **zep)
{
	zfs_zevent_t *ze;

	ze = *zep = kmem_zalloc(sizeof (zfs_zevent_t), KM_SLEEP);
	list_link_init(&ze->ze_node);
}

void
zfs_zevent_destroy(zfs_zevent_t *ze)
{
	mutex_enter(&zevent_lock);
	if (ze->ze_zevent)
		list_remove(&ze->ze_zevent->ev_ze_list, ze);
	mutex_exit(&zevent_lock);

	kmem_free(ze, sizeof (zfs_zevent_t));
}
#endif /* _KERNEL */

/*
 * Wrappers for FM nvlist allocators
 */
/* ARGSUSED */
static void *
i_fm_alloc(nv_alloc_t *nva, size_t size)
{
	return (kmem_zalloc(size, KM_SLEEP));
}

/* ARGSUSED */
static void
i_fm_free(nv_alloc_t *nva, void *buf, size_t size)
{
	kmem_free(buf, size);
}

const nv_alloc_ops_t fm_mem_alloc_ops = {
	.nv_ao_init = NULL,
	.nv_ao_fini = NULL,
	.nv_ao_alloc = i_fm_alloc,
	.nv_ao_free = i_fm_free,
	.nv_ao_reset = NULL
};

/*
 * Create and initialize a new nv_alloc_t for a fixed buffer, buf.  A pointer
 * to the newly allocated nv_alloc_t structure is returned upon success or NULL
 * is returned to indicate that the nv_alloc structure could not be created.
 */
nv_alloc_t *
fm_nva_xcreate(char *buf, size_t bufsz)
{
	nv_alloc_t *nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP);

	if (bufsz == 0 || nv_alloc_init(nvhdl, nv_fixed_ops, buf, bufsz) != 0) {
		kmem_free(nvhdl, sizeof (nv_alloc_t));
		return (NULL);
	}

	return (nvhdl);
}
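
/*
 * Illustrative sketch: building an nvlist out of a caller-supplied fixed
 * buffer, as might be done in a context where sleeping allocations are
 * not permitted.  The buffer here is hypothetical.
 *
 *	static char ebuf[ERPT_DATA_SZ];
 *	nv_alloc_t *nva = fm_nva_xcreate(ebuf, sizeof (ebuf));
 *	if (nva != NULL) {
 *		nvlist_t *nvl = fm_nvlist_create(nva);
 *		...
 *		fm_nvlist_destroy(nvl, FM_NVA_RETAIN);
 *		fm_nva_xdestroy(nva);
 *	}
 */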

/*
 * Destroy a previously allocated nv_alloc structure.  The fixed buffer
 * associated with nva must be freed by the caller.
 */
void
fm_nva_xdestroy(nv_alloc_t *nva)
{
	nv_alloc_fini(nva);
	kmem_free(nva, sizeof (nv_alloc_t));
}

/*
 * Create a new nv list.  A pointer to a new nv list structure is returned
 * upon success or NULL is returned to indicate that the structure could
 * not be created.  The newly created nv list is created and managed by the
 * operations installed in nva.  If nva is NULL, the default FMA nva
 * operations are installed and used.
 *
 * When called from the kernel and nva == NULL, this function must be called
 * from passive kernel context with no locks held that can prevent a
 * sleeping memory allocation from occurring.  Otherwise, this function may
 * be called from other kernel contexts as long as a valid nva created via
 * fm_nva_xcreate() is supplied.
 */
nvlist_t *
fm_nvlist_create(nv_alloc_t *nva)
{
	int hdl_alloced = 0;
	nvlist_t *nvl;
	nv_alloc_t *nvhdl;

	if (nva == NULL) {
		nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP);

		if (nv_alloc_init(nvhdl, &fm_mem_alloc_ops, NULL, 0) != 0) {
			kmem_free(nvhdl, sizeof (nv_alloc_t));
			return (NULL);
		}
		hdl_alloced = 1;
	} else {
		nvhdl = nva;
	}

	if (nvlist_xalloc(&nvl, NV_UNIQUE_NAME, nvhdl) != 0) {
		if (hdl_alloced) {
			nv_alloc_fini(nvhdl);
			kmem_free(nvhdl, sizeof (nv_alloc_t));
		}
		return (NULL);
	}

	return (nvl);
}

/*
 * Destroy a previously allocated nvlist structure.  flag indicates whether
 * or not the associated nva structure should be freed (FM_NVA_FREE) or
 * retained (FM_NVA_RETAIN).  Retaining the nv alloc structure allows
 * it to be re-used for future nvlist creation operations.
 */
void
fm_nvlist_destroy(nvlist_t *nvl, int flag)
{
	nv_alloc_t *nva = nvlist_lookup_nv_alloc(nvl);

	nvlist_free(nvl);

	if (nva != NULL) {
		if (flag == FM_NVA_FREE)
			fm_nva_xdestroy(nva);
	}
}

int
i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap)
{
	int nelem, ret = 0;
	data_type_t type;

	while (ret == 0 && name != NULL) {
		type = va_arg(ap, data_type_t);
		switch (type) {
		case DATA_TYPE_BYTE:
			ret = nvlist_add_byte(payload, name,
			    va_arg(ap, uint_t));
			break;
		case DATA_TYPE_BYTE_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_byte_array(payload, name,
			    va_arg(ap, uchar_t *), nelem);
			break;
		case DATA_TYPE_BOOLEAN_VALUE:
			ret = nvlist_add_boolean_value(payload, name,
			    va_arg(ap, boolean_t));
			break;
		case DATA_TYPE_BOOLEAN_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_boolean_array(payload, name,
			    va_arg(ap, boolean_t *), nelem);
			break;
		case DATA_TYPE_INT8:
			ret = nvlist_add_int8(payload, name,
			    va_arg(ap, int));
			break;
		case DATA_TYPE_INT8_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_int8_array(payload, name,
			    va_arg(ap, int8_t *), nelem);
			break;
		case DATA_TYPE_UINT8:
			ret = nvlist_add_uint8(payload, name,
			    va_arg(ap, uint_t));
			break;
		case DATA_TYPE_UINT8_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_uint8_array(payload, name,
			    va_arg(ap, uint8_t *), nelem);
			break;
		case DATA_TYPE_INT16:
			ret = nvlist_add_int16(payload, name,
			    va_arg(ap, int));
			break;
		case DATA_TYPE_INT16_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_int16_array(payload, name,
			    va_arg(ap, int16_t *), nelem);
			break;
		case DATA_TYPE_UINT16:
			ret = nvlist_add_uint16(payload, name,
			    va_arg(ap, uint_t));
			break;
		case DATA_TYPE_UINT16_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_uint16_array(payload, name,
			    va_arg(ap, uint16_t *), nelem);
			break;
		case DATA_TYPE_INT32:
			ret = nvlist_add_int32(payload, name,
			    va_arg(ap, int32_t));
			break;
		case DATA_TYPE_INT32_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_int32_array(payload, name,
			    va_arg(ap, int32_t *), nelem);
			break;
		case DATA_TYPE_UINT32:
			ret = nvlist_add_uint32(payload, name,
			    va_arg(ap, uint32_t));
			break;
		case DATA_TYPE_UINT32_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_uint32_array(payload, name,
			    va_arg(ap, uint32_t *), nelem);
			break;
		case DATA_TYPE_INT64:
			ret = nvlist_add_int64(payload, name,
			    va_arg(ap, int64_t));
			break;
		case DATA_TYPE_INT64_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_int64_array(payload, name,
			    va_arg(ap, int64_t *), nelem);
			break;
		case DATA_TYPE_UINT64:
			ret = nvlist_add_uint64(payload, name,
			    va_arg(ap, uint64_t));
			break;
		case DATA_TYPE_UINT64_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_uint64_array(payload, name,
			    va_arg(ap, uint64_t *), nelem);
			break;
		case DATA_TYPE_STRING:
			ret = nvlist_add_string(payload, name,
			    va_arg(ap, char *));
			break;
		case DATA_TYPE_STRING_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_string_array(payload, name,
			    va_arg(ap, char **), nelem);
			break;
		case DATA_TYPE_NVLIST:
			ret = nvlist_add_nvlist(payload, name,
			    va_arg(ap, nvlist_t *));
			break;
		case DATA_TYPE_NVLIST_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_nvlist_array(payload, name,
			    va_arg(ap, nvlist_t **), nelem);
			break;
		default:
			ret = EINVAL;
		}

		name = va_arg(ap, char *);
	}
	return (ret);
}

void
fm_payload_set(nvlist_t *payload, ...)
{
	int ret;
	const char *name;
	va_list ap;

	va_start(ap, payload);
	name = va_arg(ap, char *);
	ret = i_fm_payload_set(payload, name, ap);
	va_end(ap);

	if (ret)
		atomic_inc_64(&erpt_kstat_data.payload_set_failed.value.ui64);
}

/*
 * Set-up and validate the members of an ereport event according to:
 *
 *	Member name		Type		Value
 *	====================================================
 *	class			string		ereport
 *	version			uint8_t		0
 *	ena			uint64_t	<ena>
 *	detector		nvlist_t	<detector>
 *	ereport-payload		nvlist_t	<var args>
 *
 * We don't actually add a 'version' member to the payload.  Really,
 * the version quoted to us by our caller is that of the category 1
 * "ereport" event class (and we require FM_EREPORT_VERS0) but
 * the payload version of the actual leaf class event under construction
 * may be something else.  Callers should supply a version in the varargs,
 * or (better) we could take two version arguments - one for the
 * ereport category 1 classification (expect FM_EREPORT_VERS0) and one
 * for the leaf class.
 */
void
fm_ereport_set(nvlist_t *ereport, int version, const char *erpt_class,
    uint64_t ena, const nvlist_t *detector, ...)
{
	char ereport_class[FM_MAX_CLASS];
	const char *name;
	va_list ap;
	int ret;

	if (version != FM_EREPORT_VERS0) {
		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
		return;
	}

	(void) snprintf(ereport_class, FM_MAX_CLASS, "%s.%s",
	    FM_EREPORT_CLASS, erpt_class);
	if (nvlist_add_string(ereport, FM_CLASS, ereport_class) != 0) {
		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
		return;
	}

	if (nvlist_add_uint64(ereport, FM_EREPORT_ENA, ena)) {
		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
	}

	if (nvlist_add_nvlist(ereport, FM_EREPORT_DETECTOR,
	    (nvlist_t *)detector) != 0) {
		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
	}

	va_start(ap, detector);
	name = va_arg(ap, const char *);
	ret = i_fm_payload_set(ereport, name, ap);
	va_end(ap);

	if (ret)
		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
}
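
/*
 * Illustrative sketch of the varargs convention used above: the payload
 * is given as (name, type, value) groups (array types insert an element
 * count before the pointer) and is terminated by a NULL name.  The class
 * and payload names here are hypothetical.
 *
 *	fm_ereport_set(ereport, FM_EREPORT_VERS0, "fs.zfs.example",
 *	    fm_ena_generate(0, FM_ENA_FMT1), detector,
 *	    "example-count", DATA_TYPE_UINT32, (uint32_t)1,
 *	    NULL);
 */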

/*
 * Set-up and validate the members of an hc fmri according to:
 *
 *	Member name		Type		Value
 *	===================================================
 *	version			uint8_t		0
 *	auth			nvlist_t	<auth>
 *	hc-name			string		<name>
 *	hc-id			string		<id>
 *
 * Note that auth and hc-id are optional members.
 */

#define	HC_MAXPAIRS	20
#define	HC_MAXNAMELEN	50

static int
fm_fmri_hc_set_common(nvlist_t *fmri, int version, const nvlist_t *auth)
{
	if (version != FM_HC_SCHEME_VERSION) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return (0);
	}

	if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0 ||
	    nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return (0);
	}

	if (auth != NULL && nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
	    (nvlist_t *)auth) != 0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return (0);
	}

	return (1);
}

void
fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth,
    nvlist_t *snvl, int npairs, ...)
{
	nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
	nvlist_t *pairs[HC_MAXPAIRS];
	va_list ap;
	int i;

	if (!fm_fmri_hc_set_common(fmri, version, auth))
		return;

	npairs = MIN(npairs, HC_MAXPAIRS);

	va_start(ap, npairs);
	for (i = 0; i < npairs; i++) {
		const char *name = va_arg(ap, const char *);
		uint32_t id = va_arg(ap, uint32_t);
		char idstr[11];

		(void) snprintf(idstr, sizeof (idstr), "%u", id);

		pairs[i] = fm_nvlist_create(nva);
		if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
		    nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
			atomic_inc_64(
			    &erpt_kstat_data.fmri_set_failed.value.ui64);
		}
	}
	va_end(ap);

	if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs) != 0)
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);

	for (i = 0; i < npairs; i++)
		fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);

	if (snvl != NULL) {
		if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
			atomic_inc_64(
			    &erpt_kstat_data.fmri_set_failed.value.ui64);
		}
	}
}
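
/*
 * Illustrative sketch: an hc fmri for a hypothetical path such as
 * motherboard=0/cpu=1, supplied as npairs (name, uint32_t id) pairs.
 *
 *	fm_fmri_hc_set(fmri, FM_HC_SCHEME_VERSION, auth, NULL, 2,
 *	    "motherboard", (uint32_t)0,
 *	    "cpu", (uint32_t)1);
 */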

void
fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth,
    nvlist_t *snvl, nvlist_t *bboard, int npairs, ...)
{
	nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
	nvlist_t *pairs[HC_MAXPAIRS];
	nvlist_t **hcl;
	uint_t n;
	int i, j;
	va_list ap;
	char *hcname, *hcid;

	if (!fm_fmri_hc_set_common(fmri, version, auth))
		return;

	/*
	 * copy the bboard nvpairs to the pairs array
	 */
	if (nvlist_lookup_nvlist_array(bboard, FM_FMRI_HC_LIST, &hcl, &n)
	    != 0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return;
	}

	for (i = 0; i < n; i++) {
		if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME,
		    &hcname) != 0) {
			atomic_inc_64(
			    &erpt_kstat_data.fmri_set_failed.value.ui64);
			return;
		}
		if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &hcid) != 0) {
			atomic_inc_64(
			    &erpt_kstat_data.fmri_set_failed.value.ui64);
			return;
		}

		pairs[i] = fm_nvlist_create(nva);
		if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, hcname) != 0 ||
		    nvlist_add_string(pairs[i], FM_FMRI_HC_ID, hcid) != 0) {
			for (j = 0; j <= i; j++) {
				if (pairs[j] != NULL)
					fm_nvlist_destroy(pairs[j],
					    FM_NVA_RETAIN);
			}
			atomic_inc_64(
			    &erpt_kstat_data.fmri_set_failed.value.ui64);
			return;
		}
	}

	/*
	 * create the pairs from passed in pairs
	 */
	npairs = MIN(npairs, HC_MAXPAIRS);

	va_start(ap, npairs);
	for (i = n; i < npairs + n; i++) {
		const char *name = va_arg(ap, const char *);
		uint32_t id = va_arg(ap, uint32_t);
		char idstr[11];
		(void) snprintf(idstr, sizeof (idstr), "%u", id);
		pairs[i] = fm_nvlist_create(nva);
		if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
		    nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
			for (j = 0; j <= i; j++) {
				if (pairs[j] != NULL)
					fm_nvlist_destroy(pairs[j],
					    FM_NVA_RETAIN);
			}
			atomic_inc_64(
			    &erpt_kstat_data.fmri_set_failed.value.ui64);
			return;
		}
	}
	va_end(ap);

	/*
	 * Create the fmri hc list
	 */
	if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs,
	    npairs + n) != 0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return;
	}

	for (i = 0; i < npairs + n; i++) {
		fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
	}

	if (snvl != NULL) {
		if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
			atomic_inc_64(
			    &erpt_kstat_data.fmri_set_failed.value.ui64);
			return;
		}
	}
}

/*
 * Set-up and validate the members of a dev fmri according to:
 *
 *	Member name		Type		Value
 *	====================================================
 *	version			uint8_t		0
 *	auth			nvlist_t	<auth>
 *	devpath			string		<devpath>
 *	[devid]			string		<devid>
 *	[target-port-l0id]	string		<target-port-lun0-id>
 *
 * Note that auth and devid are optional members.
 */
void
fm_fmri_dev_set(nvlist_t *fmri_dev, int version, const nvlist_t *auth,
    const char *devpath, const char *devid, const char *tpl0)
{
	int err = 0;

	if (version != DEV_SCHEME_VERSION0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return;
	}

	err |= nvlist_add_uint8(fmri_dev, FM_VERSION, version);
	err |= nvlist_add_string(fmri_dev, FM_FMRI_SCHEME, FM_FMRI_SCHEME_DEV);

	if (auth != NULL) {
		err |= nvlist_add_nvlist(fmri_dev, FM_FMRI_AUTHORITY,
		    (nvlist_t *)auth);
	}

	err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_PATH, devpath);

	if (devid != NULL)
		err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_ID, devid);

	if (tpl0 != NULL)
		err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_TGTPTLUN0, tpl0);

	if (err)
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
}
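
/*
 * Illustrative sketch: a minimal dev fmri with the optional devid and
 * target-port members omitted.  The devpath string is hypothetical.
 *
 *	fm_fmri_dev_set(fmri, DEV_SCHEME_VERSION0, NULL,
 *	    "/pci@0,0/disk@0", NULL, NULL);
 */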

/*
 * Set-up and validate the members of a cpu fmri according to:
 *
 *	Member name		Type		Value
 *	====================================================
 *	version			uint8_t		0
 *	auth			nvlist_t	<auth>
 *	cpuid			uint32_t	<cpu_id>
 *	cpumask			uint8_t		<cpu_mask>
 *	serial			string		<serial_id>
 *
 * Note that auth, cpumask, serial are optional members.
 */
void
fm_fmri_cpu_set(nvlist_t *fmri_cpu, int version, const nvlist_t *auth,
    uint32_t cpu_id, uint8_t *cpu_maskp, const char *serial_idp)
{
	uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64;

	if (version < CPU_SCHEME_VERSION1) {
		atomic_inc_64(failedp);
		return;
	}

	if (nvlist_add_uint8(fmri_cpu, FM_VERSION, version) != 0) {
		atomic_inc_64(failedp);
		return;
	}

	if (nvlist_add_string(fmri_cpu, FM_FMRI_SCHEME,
	    FM_FMRI_SCHEME_CPU) != 0) {
		atomic_inc_64(failedp);
		return;
	}

	if (auth != NULL && nvlist_add_nvlist(fmri_cpu, FM_FMRI_AUTHORITY,
	    (nvlist_t *)auth) != 0)
		atomic_inc_64(failedp);

	if (nvlist_add_uint32(fmri_cpu, FM_FMRI_CPU_ID, cpu_id) != 0)
		atomic_inc_64(failedp);

	if (cpu_maskp != NULL && nvlist_add_uint8(fmri_cpu, FM_FMRI_CPU_MASK,
	    *cpu_maskp) != 0)
		atomic_inc_64(failedp);

	if (serial_idp != NULL && nvlist_add_string(fmri_cpu,
	    FM_FMRI_CPU_SERIAL_ID, (char *)serial_idp) != 0)
		atomic_inc_64(failedp);
}

/*
 * Set-up and validate the members of a mem fmri according to:
 *
 *	Member name		Type		Value
 *	====================================================
 *	version			uint8_t		0
 *	auth			nvlist_t	<auth>		[optional]
 *	unum			string		<unum>
 *	serial			string		<serial>	[optional*]
 *	offset			uint64_t	<offset>	[optional]
 *
 *	* serial is required if offset is present
 */
void
fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth,
    const char *unum, const char *serial, uint64_t offset)
{
	if (version != MEM_SCHEME_VERSION0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return;
	}

	if (!serial && (offset != (uint64_t)-1)) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return;
	}

	if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return;
	}

	if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_MEM) != 0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return;
	}

	if (auth != NULL) {
		if (nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
		    (nvlist_t *)auth) != 0) {
			atomic_inc_64(
			    &erpt_kstat_data.fmri_set_failed.value.ui64);
		}
	}

	if (nvlist_add_string(fmri, FM_FMRI_MEM_UNUM, unum) != 0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
	}

	if (serial != NULL) {
		if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID,
		    (char **)&serial, 1) != 0) {
			atomic_inc_64(
			    &erpt_kstat_data.fmri_set_failed.value.ui64);
		}
		if (offset != (uint64_t)-1 && nvlist_add_uint64(fmri,
		    FM_FMRI_MEM_OFFSET, offset) != 0) {
			atomic_inc_64(
			    &erpt_kstat_data.fmri_set_failed.value.ui64);
		}
	}
}
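
/*
 * Illustrative sketch: because an offset is only meaningful alongside a
 * serial number, callers without one pass (uint64_t)-1.  The unum string
 * is hypothetical.
 *
 *	fm_fmri_mem_set(fmri, MEM_SCHEME_VERSION0, NULL,
 *	    "DIMM0", NULL, (uint64_t)-1);
 */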

void
fm_fmri_zfs_set(nvlist_t *fmri, int version, uint64_t pool_guid,
    uint64_t vdev_guid)
{
	if (version != ZFS_SCHEME_VERSION0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return;
	}

	if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return;
	}

	if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return;
	}

	if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
	}

	if (vdev_guid != 0) {
		if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) {
			atomic_inc_64(
			    &erpt_kstat_data.fmri_set_failed.value.ui64);
		}
	}
}

uint64_t
fm_ena_increment(uint64_t ena)
{
	uint64_t new_ena;

	switch (ENA_FORMAT(ena)) {
	case FM_ENA_FMT1:
		new_ena = ena + (1 << ENA_FMT1_GEN_SHFT);
		break;
	case FM_ENA_FMT2:
		new_ena = ena + (1 << ENA_FMT2_GEN_SHFT);
		break;
	default:
		new_ena = 0;
	}

	return (new_ena);
}

uint64_t
fm_ena_generate_cpu(uint64_t timestamp, processorid_t cpuid, uchar_t format)
{
	uint64_t ena = 0;

	switch (format) {
	case FM_ENA_FMT1:
		if (timestamp) {
			ena = (uint64_t)((format & ENA_FORMAT_MASK) |
			    ((cpuid << ENA_FMT1_CPUID_SHFT) &
			    ENA_FMT1_CPUID_MASK) |
			    ((timestamp << ENA_FMT1_TIME_SHFT) &
			    ENA_FMT1_TIME_MASK));
		} else {
			ena = (uint64_t)((format & ENA_FORMAT_MASK) |
			    ((cpuid << ENA_FMT1_CPUID_SHFT) &
			    ENA_FMT1_CPUID_MASK) |
			    ((gethrtime() << ENA_FMT1_TIME_SHFT) &
			    ENA_FMT1_TIME_MASK));
		}
		break;
	case FM_ENA_FMT2:
		ena = (uint64_t)((format & ENA_FORMAT_MASK) |
		    ((timestamp << ENA_FMT2_TIME_SHFT) & ENA_FMT2_TIME_MASK));
		break;
	default:
		break;
	}

	return (ena);
}

uint64_t
fm_ena_generate(uint64_t timestamp, uchar_t format)
{
	uint64_t ena;

	kpreempt_disable();
	ena = fm_ena_generate_cpu(timestamp, getcpuid(), format);
	kpreempt_enable();

	return (ena);
}
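
/*
 * Illustrative sketch: generating a format 1 ENA from the current hrtime
 * and reading its fields back with the accessors below.
 *
 *	uint64_t ena = fm_ena_generate(0, FM_ENA_FMT1);
 *	uchar_t fmt = fm_ena_format_get(ena);		(FM_ENA_FMT1)
 *	uint64_t gen = fm_ena_generation_get(ena);	(initially 0)
 *	uint64_t id = fm_ena_id_get(ena);		(the generating cpuid)
 *	uint64_t t = fm_ena_time_get(ena);		(truncated hrtime)
 */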

uint64_t
fm_ena_generation_get(uint64_t ena)
{
	uint64_t gen;

	switch (ENA_FORMAT(ena)) {
	case FM_ENA_FMT1:
		gen = (ena & ENA_FMT1_GEN_MASK) >> ENA_FMT1_GEN_SHFT;
		break;
	case FM_ENA_FMT2:
		gen = (ena & ENA_FMT2_GEN_MASK) >> ENA_FMT2_GEN_SHFT;
		break;
	default:
		gen = 0;
		break;
	}

	return (gen);
}

uchar_t
fm_ena_format_get(uint64_t ena)
{
	return (ENA_FORMAT(ena));
}

uint64_t
fm_ena_id_get(uint64_t ena)
{
	uint64_t id;

	switch (ENA_FORMAT(ena)) {
	case FM_ENA_FMT1:
		id = (ena & ENA_FMT1_ID_MASK) >> ENA_FMT1_ID_SHFT;
		break;
	case FM_ENA_FMT2:
		id = (ena & ENA_FMT2_ID_MASK) >> ENA_FMT2_ID_SHFT;
		break;
	default:
		id = 0;
	}

	return (id);
}

uint64_t
fm_ena_time_get(uint64_t ena)
{
	uint64_t time;

	switch (ENA_FORMAT(ena)) {
	case FM_ENA_FMT1:
		time = (ena & ENA_FMT1_TIME_MASK) >> ENA_FMT1_TIME_SHFT;
		break;
	case FM_ENA_FMT2:
		time = (ena & ENA_FMT2_TIME_MASK) >> ENA_FMT2_TIME_SHFT;
		break;
	default:
		time = 0;
	}

	return (time);
}

#ifdef _KERNEL
/*
 * Helper function to increment ereport dropped count.  Used by the event
 * rate limiting code to give feedback to the user about how many events were
 * rate limited by including them in the 'dropped' count.
 */
void
fm_erpt_dropped_increment(void)
{
	atomic_inc_64(&ratelimit_dropped);
}

void
fm_init(void)
{
	zevent_len_cur = 0;
	zevent_flags = 0;

	if (zfs_zevent_len_max == 0)
		zfs_zevent_len_max = ERPT_MAX_ERRS * MAX(max_ncpus, 4);

	/* Initialize zevent allocation and generation kstats */
	fm_ksp = kstat_create("zfs", 0, "fm", "misc", KSTAT_TYPE_NAMED,
	    sizeof (struct erpt_kstat) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (fm_ksp != NULL) {
		fm_ksp->ks_data = &erpt_kstat_data;
		kstat_install(fm_ksp);
	} else {
		cmn_err(CE_NOTE, "failed to create fm/misc kstat\n");
	}

	mutex_init(&zevent_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zevent_list, sizeof (zevent_t),
	    offsetof(zevent_t, ev_node));
	cv_init(&zevent_cv, NULL, CV_DEFAULT, NULL);

	zfs_ereport_init();
}

void
fm_fini(void)
{
	int count;

	zfs_ereport_fini();

	zfs_zevent_drain_all(&count);

	mutex_enter(&zevent_lock);
	cv_broadcast(&zevent_cv);

	zevent_flags |= ZEVENT_SHUTDOWN;
	while (zevent_waiters > 0) {
		mutex_exit(&zevent_lock);
		schedule();
		mutex_enter(&zevent_lock);
	}
	mutex_exit(&zevent_lock);

	cv_destroy(&zevent_cv);
	list_destroy(&zevent_list);
	mutex_destroy(&zevent_lock);

	if (fm_ksp != NULL) {
		kstat_delete(fm_ksp);
		fm_ksp = NULL;
	}
}
#endif /* _KERNEL */

ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, len_max, INT, ZMOD_RW,
	"Max event queue length");

ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, cols, INT, ZMOD_RW,
	"Max event column width");

ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, console, INT, ZMOD_RW,
	"Log events to the console");