xref: /freebsd/sys/contrib/openzfs/cmd/zed/agents/zfs_diagnosis.c (revision d198b8774d2cfb6f140893e1c6236af9e97d1497)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
25  * Copyright (c) 2016, Intel Corporation.
26  */
27 
28 #include <stddef.h>
29 #include <string.h>
30 #include <libuutil.h>
31 #include <libzfs.h>
32 #include <sys/types.h>
33 #include <sys/time.h>
34 #include <sys/fs/zfs.h>
35 #include <sys/fm/protocol.h>
36 #include <sys/fm/fs/zfs.h>
37 #include <sys/zio.h>
38 
39 #include "zfs_agents.h"
40 #include "fmd_api.h"
41 
/*
 * Default values for the serd engine when processing checksum or io errors. The
 * semantics are N <events> in T <seconds>.  zfs_fm_recv() uses these only when
 * the ereport payload does not supply its own N or T value for the vdev.
 */
#define	DEFAULT_CHECKSUM_N	10	/* events */
#define	DEFAULT_CHECKSUM_T	600	/* seconds */
#define	DEFAULT_IO_N		10	/* events */
#define	DEFAULT_IO_T		600	/* seconds */
50 
/*
 * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'.  This
 * #define reserves enough space for two 64-bit hex values (16 digits each)
 * plus the length of the longest fixed text.  sizeof on the string literal
 * also counts its terminating NUL, so the result is a complete buffer size.
 */
#define	MAX_SERDLEN	(16 * 2 + sizeof ("zfs___checksum"))
57 
/*
 * On-disk case structure.  This must maintain backwards compatibility with
 * previous versions of the DE.  By default, any members appended to the end
 * will be filled with zeros if they don't exist in a previous version.
 *
 * Because of that rule, new members may only be appended; existing members
 * must keep their order and size.
 */
typedef struct zfs_case_data {
	/* CASE_DATA_VERSION_* of the writer; newer-than-known is rejected. */
	uint64_t	zc_version;
	/* ENA of the ereport that opened the case (0 if it carried none). */
	uint64_t	zc_ena;
	/* GUID of the pool this case is about. */
	uint64_t	zc_pool_guid;
	/* GUID of the vdev this case is about; 0 for a pool-level case. */
	uint64_t	zc_vdev_guid;
	/* Pool context (SPA_LOAD_*) taken from the opening ereport. */
	int		zc_pool_state;
	/* Name of the checksum SERD engine; empty string until created. */
	char		zc_serd_checksum[MAX_SERDLEN];
	/* Name of the I/O SERD engine; empty string until created. */
	char		zc_serd_io[MAX_SERDLEN];
	/* Non-zero while a remove timer is pending for this case. */
	int		zc_has_remove_timer;
} zfs_case_data_t;
73 
/*
 * Time-of-day, split into whole seconds and a nanosecond remainder.
 *
 * Used both for a pool's load time and for an ereport's timestamp so the two
 * can be ordered with timeval_earlier().  zfs_ereport_when() sets both fields
 * to UINT64_MAX when an ereport carries no usable timestamp.
 */
typedef struct er_timeval {
	uint64_t	ertv_sec;
	uint64_t	ertv_nsec;
} er_timeval_t;
81 
/*
 * In-core case structure.  One of these exists per open case and is linked
 * onto the global zfs_cases list through zc_node.
 */
typedef struct zfs_case {
	/* Scratch mark used by zfs_purge_cases(): pool/vdev still exists. */
	boolean_t	zc_present;
	/* NOTE(review): not referenced anywhere in this file. */
	uint32_t	zc_version;
	/* The persistent portion of the case (see zfs_case_data_t). */
	zfs_case_data_t	zc_data;
	/* The fmd case this structure is attached to. */
	fmd_case_t	*zc_case;
	/* Linkage for the zfs_cases list. */
	uu_list_node_t	zc_node;
	/* Timer id; meaningful only while zc_data.zc_has_remove_timer is set. */
	id_t		zc_remove_timer;
	/* NOTE(review): not referenced anywhere in this file. */
	char		*zc_fru;
	/* Load time of the owning pool; older ereports are dropped. */
	er_timeval_t	zc_when;
} zfs_case_t;
95 
/* Name of the per-case buffer that holds a zfs_case_data_t. */
#define	CASE_DATA			"data"
/* NOTE(review): CASE_FRU is not used anywhere in this file. */
#define	CASE_FRU			"fru"
/* Versions of zfs_case_data_t; the SERD version is the current one. */
#define	CASE_DATA_VERSION_INITIAL	1
#define	CASE_DATA_VERSION_SERD		2
100 
/*
 * Counters for events the diagnosis engine decided not to act on.  The
 * structure is handed to fmd_stat_create() as a flat array of fmd_stat_t,
 * so it must contain nothing but fmd_stat_t members.
 */
typedef struct zfs_de_stats {
	fmd_stat_t	old_drops;
	fmd_stat_t	dev_drops;
	fmd_stat_t	vdev_drops;
	fmd_stat_t	import_drops;
	fmd_stat_t	resource_drops;
} zfs_de_stats_t;
108 
/*
 * The module's statistics, in the same order as zfs_de_stats_t.  Note that
 * resource_drops is also bumped for sysevents and for ereport classes that
 * zfs_fm_recv() chooses not to track, not only for resource events.
 */
zfs_de_stats_t zfs_stats = {
	{ "old_drops", FMD_TYPE_UINT64, "ereports dropped (from before load)" },
	{ "dev_drops", FMD_TYPE_UINT64, "ereports dropped (dev during open)"},
	{ "vdev_drops", FMD_TYPE_UINT64, "ereports dropped (weird vdev types)"},
	{ "import_drops", FMD_TYPE_UINT64, "ereports dropped (during import)" },
	{ "resource_drops", FMD_TYPE_UINT64, "resource related ereports" }
};
116 
/*
 * Delay before an I/O diagnosis is committed, read from the
 * "remove_timeout" module property in _zfs_diagnosis_init().
 */
static hrtime_t zfs_remove_timeout;

/* List pool and list holding every in-core zfs_case_t. */
uu_list_pool_t *zfs_case_pool;
uu_list_t *zfs_cases;
121 
/*
 * Build a full event class string at compile time by string-literal
 * concatenation: <resource-or-ereport class> "." <zfs class> "." <type>.
 * 'type' must therefore itself be a string literal (or a macro for one).
 */
#define	ZFS_MAKE_RSRC(type)	\
    FM_RSRC_CLASS "." ZFS_ERROR_CLASS "." type
#define	ZFS_MAKE_EREPORT(type)	\
    FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type
126 
/*
 * Write out the persistent representation of an active case.
 *
 * As implemented here this only stamps the case data with the current
 * format version (CASE_DATA_VERSION_SERD); the function itself does not
 * write the data to any buffer.
 */
static void
zfs_case_serialize(zfs_case_t *zcp)
{
	zcp->zc_data.zc_version = CASE_DATA_VERSION_SERD;
}
135 
136 /*
137  * Read back the persistent representation of an active case.
138  */
139 static zfs_case_t *
140 zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
141 {
142 	zfs_case_t *zcp;
143 
144 	zcp = fmd_hdl_zalloc(hdl, sizeof (zfs_case_t), FMD_SLEEP);
145 	zcp->zc_case = cp;
146 
147 	fmd_buf_read(hdl, cp, CASE_DATA, &zcp->zc_data,
148 	    sizeof (zcp->zc_data));
149 
150 	if (zcp->zc_data.zc_version > CASE_DATA_VERSION_SERD) {
151 		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
152 		return (NULL);
153 	}
154 
155 	/*
156 	 * fmd_buf_read() will have already zeroed out the remainder of the
157 	 * buffer, so we don't have to do anything special if the version
158 	 * doesn't include the SERD engine name.
159 	 */
160 
161 	if (zcp->zc_data.zc_has_remove_timer)
162 		zcp->zc_remove_timer = fmd_timer_install(hdl, zcp,
163 		    NULL, zfs_remove_timeout);
164 
165 	uu_list_node_init(zcp, &zcp->zc_node, zfs_case_pool);
166 	(void) uu_list_insert_before(zfs_cases, NULL, zcp);
167 
168 	fmd_case_setspecific(hdl, cp, zcp);
169 
170 	return (zcp);
171 }
172 
173 /*
174  * Iterate over any active cases.  If any cases are associated with a pool or
175  * vdev which is no longer present on the system, close the associated case.
176  */
177 static void
178 zfs_mark_vdev(uint64_t pool_guid, nvlist_t *vd, er_timeval_t *loaded)
179 {
180 	uint64_t vdev_guid = 0;
181 	uint_t c, children;
182 	nvlist_t **child;
183 	zfs_case_t *zcp;
184 
185 	(void) nvlist_lookup_uint64(vd, ZPOOL_CONFIG_GUID, &vdev_guid);
186 
187 	/*
188 	 * Mark any cases associated with this (pool, vdev) pair.
189 	 */
190 	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
191 	    zcp = uu_list_next(zfs_cases, zcp)) {
192 		if (zcp->zc_data.zc_pool_guid == pool_guid &&
193 		    zcp->zc_data.zc_vdev_guid == vdev_guid) {
194 			zcp->zc_present = B_TRUE;
195 			zcp->zc_when = *loaded;
196 		}
197 	}
198 
199 	/*
200 	 * Iterate over all children.
201 	 */
202 	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_CHILDREN, &child,
203 	    &children) == 0) {
204 		for (c = 0; c < children; c++)
205 			zfs_mark_vdev(pool_guid, child[c], loaded);
206 	}
207 
208 	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_L2CACHE, &child,
209 	    &children) == 0) {
210 		for (c = 0; c < children; c++)
211 			zfs_mark_vdev(pool_guid, child[c], loaded);
212 	}
213 
214 	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_SPARES, &child,
215 	    &children) == 0) {
216 		for (c = 0; c < children; c++)
217 			zfs_mark_vdev(pool_guid, child[c], loaded);
218 	}
219 }
220 
221 static int
222 zfs_mark_pool(zpool_handle_t *zhp, void *unused)
223 {
224 	(void) unused;
225 	zfs_case_t *zcp;
226 	uint64_t pool_guid;
227 	uint64_t *tod;
228 	er_timeval_t loaded = { 0 };
229 	nvlist_t *config, *vd;
230 	uint_t nelem = 0;
231 	int ret;
232 
233 	pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
234 	/*
235 	 * Mark any cases associated with just this pool.
236 	 */
237 	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
238 	    zcp = uu_list_next(zfs_cases, zcp)) {
239 		if (zcp->zc_data.zc_pool_guid == pool_guid &&
240 		    zcp->zc_data.zc_vdev_guid == 0)
241 			zcp->zc_present = B_TRUE;
242 	}
243 
244 	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
245 		zpool_close(zhp);
246 		return (-1);
247 	}
248 
249 	(void) nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
250 	    &tod, &nelem);
251 	if (nelem == 2) {
252 		loaded.ertv_sec = tod[0];
253 		loaded.ertv_nsec = tod[1];
254 		for (zcp = uu_list_first(zfs_cases); zcp != NULL;
255 		    zcp = uu_list_next(zfs_cases, zcp)) {
256 			if (zcp->zc_data.zc_pool_guid == pool_guid &&
257 			    zcp->zc_data.zc_vdev_guid == 0) {
258 				zcp->zc_when = loaded;
259 			}
260 		}
261 	}
262 
263 	ret = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vd);
264 	if (ret) {
265 		zpool_close(zhp);
266 		return (-1);
267 	}
268 
269 	zfs_mark_vdev(pool_guid, vd, &loaded);
270 
271 	zpool_close(zhp);
272 
273 	return (0);
274 }
275 
/*
 * Argument block for zpool_find_load_time().
 */
struct load_time_arg {
	/* in: GUID of the pool being searched for */
	uint64_t lt_guid;
	/* out: where to store the pool's load time once found */
	er_timeval_t *lt_time;
	/* in/out: set to B_TRUE once *lt_time has been filled in */
	boolean_t lt_found;
};
281 
282 static int
283 zpool_find_load_time(zpool_handle_t *zhp, void *arg)
284 {
285 	struct load_time_arg *lta = arg;
286 	uint64_t pool_guid;
287 	uint64_t *tod;
288 	nvlist_t *config;
289 	uint_t nelem;
290 
291 	if (lta->lt_found) {
292 		zpool_close(zhp);
293 		return (0);
294 	}
295 
296 	pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
297 	if (pool_guid != lta->lt_guid) {
298 		zpool_close(zhp);
299 		return (0);
300 	}
301 
302 	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
303 		zpool_close(zhp);
304 		return (-1);
305 	}
306 
307 	if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
308 	    &tod, &nelem) == 0 && nelem == 2) {
309 		lta->lt_found = B_TRUE;
310 		lta->lt_time->ertv_sec = tod[0];
311 		lta->lt_time->ertv_nsec = tod[1];
312 	}
313 
314 	zpool_close(zhp);
315 
316 	return (0);
317 }
318 
319 static void
320 zfs_purge_cases(fmd_hdl_t *hdl)
321 {
322 	zfs_case_t *zcp;
323 	uu_list_walk_t *walk;
324 	libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
325 
326 	/*
327 	 * There is no way to open a pool by GUID, or lookup a vdev by GUID.  No
328 	 * matter what we do, we're going to have to stomach an O(vdevs * cases)
329 	 * algorithm.  In reality, both quantities are likely so small that
330 	 * neither will matter. Given that iterating over pools is more
331 	 * expensive than iterating over the in-memory case list, we opt for a
332 	 * 'present' flag in each case that starts off cleared.  We then iterate
333 	 * over all pools, marking those that are still present, and removing
334 	 * those that aren't found.
335 	 *
336 	 * Note that we could also construct an FMRI and rely on
337 	 * fmd_nvl_fmri_present(), but this would end up doing the same search.
338 	 */
339 
340 	/*
341 	 * Mark the cases as not present.
342 	 */
343 	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
344 	    zcp = uu_list_next(zfs_cases, zcp))
345 		zcp->zc_present = B_FALSE;
346 
347 	/*
348 	 * Iterate over all pools and mark the pools and vdevs found.  If this
349 	 * fails (most probably because we're out of memory), then don't close
350 	 * any of the cases and we cannot be sure they are accurate.
351 	 */
352 	if (zpool_iter(zhdl, zfs_mark_pool, NULL) != 0)
353 		return;
354 
355 	/*
356 	 * Remove those cases which were not found.
357 	 */
358 	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
359 	while ((zcp = uu_list_walk_next(walk)) != NULL) {
360 		if (!zcp->zc_present)
361 			fmd_case_close(hdl, zcp->zc_case);
362 	}
363 	uu_list_walk_end(walk);
364 }
365 
366 /*
367  * Construct the name of a serd engine given the pool/vdev GUID and type (io or
368  * checksum).
369  */
370 static void
371 zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid,
372     const char *type)
373 {
374 	(void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s",
375 	    (long long unsigned int)pool_guid,
376 	    (long long unsigned int)vdev_guid, type);
377 }
378 
379 /*
380  * Solve a given ZFS case.  This first checks to make sure the diagnosis is
381  * still valid, as well as cleaning up any pending timer associated with the
382  * case.
383  */
384 static void
385 zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname)
386 {
387 	nvlist_t *detector, *fault;
388 	boolean_t serialize;
389 	nvlist_t *fru = NULL;
390 	fmd_hdl_debug(hdl, "solving fault '%s'", faultname);
391 
392 	/*
393 	 * Construct the detector from the case data.  The detector is in the
394 	 * ZFS scheme, and is either the pool or the vdev, depending on whether
395 	 * this is a vdev or pool fault.
396 	 */
397 	detector = fmd_nvl_alloc(hdl, FMD_SLEEP);
398 
399 	(void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0);
400 	(void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
401 	(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL,
402 	    zcp->zc_data.zc_pool_guid);
403 	if (zcp->zc_data.zc_vdev_guid != 0) {
404 		(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV,
405 		    zcp->zc_data.zc_vdev_guid);
406 	}
407 
408 	fault = fmd_nvl_create_fault(hdl, faultname, 100, detector,
409 	    fru, detector);
410 	fmd_case_add_suspect(hdl, zcp->zc_case, fault);
411 
412 	nvlist_free(fru);
413 
414 	fmd_case_solve(hdl, zcp->zc_case);
415 
416 	serialize = B_FALSE;
417 	if (zcp->zc_data.zc_has_remove_timer) {
418 		fmd_timer_remove(hdl, zcp->zc_remove_timer);
419 		zcp->zc_data.zc_has_remove_timer = 0;
420 		serialize = B_TRUE;
421 	}
422 	if (serialize)
423 		zfs_case_serialize(zcp);
424 
425 	nvlist_free(detector);
426 }
427 
428 static boolean_t
429 timeval_earlier(er_timeval_t *a, er_timeval_t *b)
430 {
431 	return (a->ertv_sec < b->ertv_sec ||
432 	    (a->ertv_sec == b->ertv_sec && a->ertv_nsec < b->ertv_nsec));
433 }
434 
435 static void
436 zfs_ereport_when(fmd_hdl_t *hdl, nvlist_t *nvl, er_timeval_t *when)
437 {
438 	(void) hdl;
439 	int64_t *tod;
440 	uint_t	nelem;
441 
442 	if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tod,
443 	    &nelem) == 0 && nelem == 2) {
444 		when->ertv_sec = tod[0];
445 		when->ertv_nsec = tod[1];
446 	} else {
447 		when->ertv_sec = when->ertv_nsec = UINT64_MAX;
448 	}
449 }
450 
451 /*
452  * Main fmd entry point.
453  */
454 static void
455 zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
456 {
457 	zfs_case_t *zcp, *dcp;
458 	int32_t pool_state;
459 	uint64_t ena, pool_guid, vdev_guid;
460 	uint64_t checksum_n, checksum_t;
461 	uint64_t io_n, io_t;
462 	er_timeval_t pool_load;
463 	er_timeval_t er_when;
464 	nvlist_t *detector;
465 	boolean_t pool_found = B_FALSE;
466 	boolean_t isresource;
467 	const char *type;
468 
469 	/*
470 	 * We subscribe to notifications for vdev or pool removal.  In these
471 	 * cases, there may be cases that no longer apply.  Purge any cases
472 	 * that no longer apply.
473 	 */
474 	if (fmd_nvl_class_match(hdl, nvl, "sysevent.fs.zfs.*")) {
475 		fmd_hdl_debug(hdl, "purging orphaned cases from %s",
476 		    strrchr(class, '.') + 1);
477 		zfs_purge_cases(hdl);
478 		zfs_stats.resource_drops.fmds_value.ui64++;
479 		return;
480 	}
481 
482 	isresource = fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.*");
483 
484 	if (isresource) {
485 		/*
486 		 * For resources, we don't have a normal payload.
487 		 */
488 		if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
489 		    &vdev_guid) != 0)
490 			pool_state = SPA_LOAD_OPEN;
491 		else
492 			pool_state = SPA_LOAD_NONE;
493 		detector = NULL;
494 	} else {
495 		(void) nvlist_lookup_nvlist(nvl,
496 		    FM_EREPORT_DETECTOR, &detector);
497 		(void) nvlist_lookup_int32(nvl,
498 		    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, &pool_state);
499 	}
500 
501 	/*
502 	 * We also ignore all ereports generated during an import of a pool,
503 	 * since the only possible fault (.pool) would result in import failure,
504 	 * and hence no persistent fault.  Some day we may want to do something
505 	 * with these ereports, so we continue generating them internally.
506 	 */
507 	if (pool_state == SPA_LOAD_IMPORT) {
508 		zfs_stats.import_drops.fmds_value.ui64++;
509 		fmd_hdl_debug(hdl, "ignoring '%s' during import", class);
510 		return;
511 	}
512 
513 	/*
514 	 * Device I/O errors are ignored during pool open.
515 	 */
516 	if (pool_state == SPA_LOAD_OPEN &&
517 	    (fmd_nvl_class_match(hdl, nvl,
518 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
519 	    fmd_nvl_class_match(hdl, nvl,
520 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
521 	    fmd_nvl_class_match(hdl, nvl,
522 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE)))) {
523 		fmd_hdl_debug(hdl, "ignoring '%s' during pool open", class);
524 		zfs_stats.dev_drops.fmds_value.ui64++;
525 		return;
526 	}
527 
528 	/*
529 	 * We ignore ereports for anything except disks and files.
530 	 */
531 	if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
532 	    &type) == 0) {
533 		if (strcmp(type, VDEV_TYPE_DISK) != 0 &&
534 		    strcmp(type, VDEV_TYPE_FILE) != 0) {
535 			zfs_stats.vdev_drops.fmds_value.ui64++;
536 			return;
537 		}
538 	}
539 
540 	/*
541 	 * Determine if this ereport corresponds to an open case.
542 	 * Each vdev or pool can have a single case.
543 	 */
544 	(void) nvlist_lookup_uint64(nvl,
545 	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid);
546 	if (nvlist_lookup_uint64(nvl,
547 	    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
548 		vdev_guid = 0;
549 	if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0)
550 		ena = 0;
551 
552 	zfs_ereport_when(hdl, nvl, &er_when);
553 
554 	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
555 	    zcp = uu_list_next(zfs_cases, zcp)) {
556 		if (zcp->zc_data.zc_pool_guid == pool_guid) {
557 			pool_found = B_TRUE;
558 			pool_load = zcp->zc_when;
559 		}
560 		if (zcp->zc_data.zc_vdev_guid == vdev_guid)
561 			break;
562 	}
563 
564 	/*
565 	 * Avoid falsely accusing a pool of being faulty.  Do so by
566 	 * not replaying ereports that were generated prior to the
567 	 * current import.  If the failure that generated them was
568 	 * transient because the device was actually removed but we
569 	 * didn't receive the normal asynchronous notification, we
570 	 * don't want to mark it as faulted and potentially panic. If
571 	 * there is still a problem we'd expect not to be able to
572 	 * import the pool, or that new ereports will be generated
573 	 * once the pool is used.
574 	 */
575 	if (pool_found && timeval_earlier(&er_when, &pool_load)) {
576 		fmd_hdl_debug(hdl, "ignoring pool %llx, "
577 		    "ereport time %lld.%lld, pool load time = %lld.%lld",
578 		    pool_guid, er_when.ertv_sec, er_when.ertv_nsec,
579 		    pool_load.ertv_sec, pool_load.ertv_nsec);
580 		zfs_stats.old_drops.fmds_value.ui64++;
581 		return;
582 	}
583 
584 	if (!pool_found) {
585 		/*
586 		 * Haven't yet seen this pool, but same situation
587 		 * may apply.
588 		 */
589 		libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
590 		struct load_time_arg la;
591 
592 		la.lt_guid = pool_guid;
593 		la.lt_time = &pool_load;
594 		la.lt_found = B_FALSE;
595 
596 		if (zhdl != NULL &&
597 		    zpool_iter(zhdl, zpool_find_load_time, &la) == 0 &&
598 		    la.lt_found == B_TRUE) {
599 			pool_found = B_TRUE;
600 
601 			if (timeval_earlier(&er_when, &pool_load)) {
602 				fmd_hdl_debug(hdl, "ignoring pool %llx, "
603 				    "ereport time %lld.%lld, "
604 				    "pool load time = %lld.%lld",
605 				    pool_guid, er_when.ertv_sec,
606 				    er_when.ertv_nsec, pool_load.ertv_sec,
607 				    pool_load.ertv_nsec);
608 				zfs_stats.old_drops.fmds_value.ui64++;
609 				return;
610 			}
611 		}
612 	}
613 
614 	if (zcp == NULL) {
615 		fmd_case_t *cs;
616 		zfs_case_data_t data = { 0 };
617 
618 		/*
619 		 * If this is one of our 'fake' resource ereports, and there is
620 		 * no case open, simply discard it.
621 		 */
622 		if (isresource) {
623 			zfs_stats.resource_drops.fmds_value.ui64++;
624 			fmd_hdl_debug(hdl, "discarding '%s for vdev %llu",
625 			    class, vdev_guid);
626 			return;
627 		}
628 
629 		/*
630 		 * Skip tracking some ereports
631 		 */
632 		if (strcmp(class,
633 		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 ||
634 		    strcmp(class,
635 		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 ||
636 		    strcmp(class,
637 		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) {
638 			zfs_stats.resource_drops.fmds_value.ui64++;
639 			return;
640 		}
641 
642 		/*
643 		 * Open a new case.
644 		 */
645 		cs = fmd_case_open(hdl, NULL);
646 
647 		fmd_hdl_debug(hdl, "opening case for vdev %llu due to '%s'",
648 		    vdev_guid, class);
649 
650 		/*
651 		 * Initialize the case buffer.  To commonize code, we actually
652 		 * create the buffer with existing data, and then call
653 		 * zfs_case_unserialize() to instantiate the in-core structure.
654 		 */
655 		fmd_buf_create(hdl, cs, CASE_DATA, sizeof (zfs_case_data_t));
656 
657 		data.zc_version = CASE_DATA_VERSION_SERD;
658 		data.zc_ena = ena;
659 		data.zc_pool_guid = pool_guid;
660 		data.zc_vdev_guid = vdev_guid;
661 		data.zc_pool_state = (int)pool_state;
662 
663 		fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data));
664 
665 		zcp = zfs_case_unserialize(hdl, cs);
666 		assert(zcp != NULL);
667 		if (pool_found)
668 			zcp->zc_when = pool_load;
669 	}
670 
671 	if (isresource) {
672 		fmd_hdl_debug(hdl, "resource event '%s'", class);
673 
674 		if (fmd_nvl_class_match(hdl, nvl,
675 		    ZFS_MAKE_RSRC(FM_RESOURCE_AUTOREPLACE))) {
676 			/*
677 			 * The 'resource.fs.zfs.autoreplace' event indicates
678 			 * that the pool was loaded with the 'autoreplace'
679 			 * property set.  In this case, any pending device
680 			 * failures should be ignored, as the asynchronous
681 			 * autoreplace handling will take care of them.
682 			 */
683 			fmd_case_close(hdl, zcp->zc_case);
684 		} else if (fmd_nvl_class_match(hdl, nvl,
685 		    ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED))) {
686 			/*
687 			 * The 'resource.fs.zfs.removed' event indicates that
688 			 * device removal was detected, and the device was
689 			 * closed asynchronously.  If this is the case, we
690 			 * assume that any recent I/O errors were due to the
691 			 * device removal, not any fault of the device itself.
692 			 * We reset the SERD engine, and cancel any pending
693 			 * timers.
694 			 */
695 			if (zcp->zc_data.zc_has_remove_timer) {
696 				fmd_timer_remove(hdl, zcp->zc_remove_timer);
697 				zcp->zc_data.zc_has_remove_timer = 0;
698 				zfs_case_serialize(zcp);
699 			}
700 			if (zcp->zc_data.zc_serd_io[0] != '\0')
701 				fmd_serd_reset(hdl, zcp->zc_data.zc_serd_io);
702 			if (zcp->zc_data.zc_serd_checksum[0] != '\0')
703 				fmd_serd_reset(hdl,
704 				    zcp->zc_data.zc_serd_checksum);
705 		} else if (fmd_nvl_class_match(hdl, nvl,
706 		    ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) {
707 			uint64_t state = 0;
708 
709 			if (zcp != NULL &&
710 			    nvlist_lookup_uint64(nvl,
711 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state) == 0 &&
712 			    state == VDEV_STATE_HEALTHY) {
713 				fmd_hdl_debug(hdl, "closing case after a "
714 				    "device statechange to healthy");
715 				fmd_case_close(hdl, zcp->zc_case);
716 			}
717 		}
718 		zfs_stats.resource_drops.fmds_value.ui64++;
719 		return;
720 	}
721 
722 	/*
723 	 * Associate the ereport with this case.
724 	 */
725 	fmd_case_add_ereport(hdl, zcp->zc_case, ep);
726 
727 	/*
728 	 * Don't do anything else if this case is already solved.
729 	 */
730 	if (fmd_case_solved(hdl, zcp->zc_case))
731 		return;
732 
733 	fmd_hdl_debug(hdl, "error event '%s'", class);
734 
735 	/*
736 	 * Determine if we should solve the case and generate a fault.  We solve
737 	 * a case if:
738 	 *
739 	 * 	a. A pool failed to open (ereport.fs.zfs.pool)
740 	 * 	b. A device failed to open (ereport.fs.zfs.pool) while a pool
741 	 *	   was up and running.
742 	 *
743 	 * We may see a series of ereports associated with a pool open, all
744 	 * chained together by the same ENA.  If the pool open succeeds, then
745 	 * we'll see no further ereports.  To detect when a pool open has
746 	 * succeeded, we associate a timer with the event.  When it expires, we
747 	 * close the case.
748 	 */
749 	if (fmd_nvl_class_match(hdl, nvl,
750 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_POOL))) {
751 		/*
752 		 * Pool level fault.  Before solving the case, go through and
753 		 * close any open device cases that may be pending.
754 		 */
755 		for (dcp = uu_list_first(zfs_cases); dcp != NULL;
756 		    dcp = uu_list_next(zfs_cases, dcp)) {
757 			if (dcp->zc_data.zc_pool_guid ==
758 			    zcp->zc_data.zc_pool_guid &&
759 			    dcp->zc_data.zc_vdev_guid != 0)
760 				fmd_case_close(hdl, dcp->zc_case);
761 		}
762 
763 		zfs_case_solve(hdl, zcp, "fault.fs.zfs.pool");
764 	} else if (fmd_nvl_class_match(hdl, nvl,
765 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_LOG_REPLAY))) {
766 		/*
767 		 * Pool level fault for reading the intent logs.
768 		 */
769 		zfs_case_solve(hdl, zcp, "fault.fs.zfs.log_replay");
770 	} else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*")) {
771 		/*
772 		 * Device fault.
773 		 */
774 		zfs_case_solve(hdl, zcp, "fault.fs.zfs.device");
775 	} else if (fmd_nvl_class_match(hdl, nvl,
776 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
777 	    fmd_nvl_class_match(hdl, nvl,
778 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
779 	    fmd_nvl_class_match(hdl, nvl,
780 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) ||
781 	    fmd_nvl_class_match(hdl, nvl,
782 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
783 		const char *failmode = NULL;
784 		boolean_t checkremove = B_FALSE;
785 		uint32_t pri = 0;
786 		int32_t flags = 0;
787 
788 		/*
789 		 * If this is a checksum or I/O error, then toss it into the
790 		 * appropriate SERD engine and check to see if it has fired.
791 		 * Ideally, we want to do something more sophisticated,
792 		 * (persistent errors for a single data block, etc).  For now,
793 		 * a single SERD engine is sufficient.
794 		 */
795 		if (fmd_nvl_class_match(hdl, nvl,
796 		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO))) {
797 			if (zcp->zc_data.zc_serd_io[0] == '\0') {
798 				if (nvlist_lookup_uint64(nvl,
799 				    FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N,
800 				    &io_n) != 0) {
801 					io_n = DEFAULT_IO_N;
802 				}
803 				if (nvlist_lookup_uint64(nvl,
804 				    FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T,
805 				    &io_t) != 0) {
806 					io_t = DEFAULT_IO_T;
807 				}
808 				zfs_serd_name(zcp->zc_data.zc_serd_io,
809 				    pool_guid, vdev_guid, "io");
810 				fmd_serd_create(hdl, zcp->zc_data.zc_serd_io,
811 				    io_n,
812 				    SEC2NSEC(io_t));
813 				zfs_case_serialize(zcp);
814 			}
815 			if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep))
816 				checkremove = B_TRUE;
817 		} else if (fmd_nvl_class_match(hdl, nvl,
818 		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
819 			/*
820 			 * We ignore ereports for checksum errors generated by
821 			 * scrub/resilver I/O to avoid potentially further
822 			 * degrading the pool while it's being repaired.
823 			 */
824 			if (((nvlist_lookup_uint32(nvl,
825 			    FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, &pri) == 0) &&
826 			    (pri == ZIO_PRIORITY_SCRUB ||
827 			    pri == ZIO_PRIORITY_REBUILD)) ||
828 			    ((nvlist_lookup_int32(nvl,
829 			    FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags) == 0) &&
830 			    (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) {
831 				fmd_hdl_debug(hdl, "ignoring '%s' for "
832 				    "scrub/resilver I/O", class);
833 				return;
834 			}
835 
836 			if (zcp->zc_data.zc_serd_checksum[0] == '\0') {
837 				if (nvlist_lookup_uint64(nvl,
838 				    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N,
839 				    &checksum_n) != 0) {
840 					checksum_n = DEFAULT_CHECKSUM_N;
841 				}
842 				if (nvlist_lookup_uint64(nvl,
843 				    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T,
844 				    &checksum_t) != 0) {
845 					checksum_t = DEFAULT_CHECKSUM_T;
846 				}
847 
848 				zfs_serd_name(zcp->zc_data.zc_serd_checksum,
849 				    pool_guid, vdev_guid, "checksum");
850 				fmd_serd_create(hdl,
851 				    zcp->zc_data.zc_serd_checksum,
852 				    checksum_n,
853 				    SEC2NSEC(checksum_t));
854 				zfs_case_serialize(zcp);
855 			}
856 			if (fmd_serd_record(hdl,
857 			    zcp->zc_data.zc_serd_checksum, ep)) {
858 				zfs_case_solve(hdl, zcp,
859 				    "fault.fs.zfs.vdev.checksum");
860 			}
861 		} else if (fmd_nvl_class_match(hdl, nvl,
862 		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) &&
863 		    (nvlist_lookup_string(nvl,
864 		    FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, &failmode) == 0) &&
865 		    failmode != NULL) {
866 			if (strncmp(failmode, FM_EREPORT_FAILMODE_CONTINUE,
867 			    strlen(FM_EREPORT_FAILMODE_CONTINUE)) == 0) {
868 				zfs_case_solve(hdl, zcp,
869 				    "fault.fs.zfs.io_failure_continue");
870 			} else if (strncmp(failmode, FM_EREPORT_FAILMODE_WAIT,
871 			    strlen(FM_EREPORT_FAILMODE_WAIT)) == 0) {
872 				zfs_case_solve(hdl, zcp,
873 				    "fault.fs.zfs.io_failure_wait");
874 			}
875 		} else if (fmd_nvl_class_match(hdl, nvl,
876 		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
877 #ifndef __linux__
878 			/* This causes an unexpected fault diagnosis on linux */
879 			checkremove = B_TRUE;
880 #endif
881 		}
882 
883 		/*
884 		 * Because I/O errors may be due to device removal, we postpone
885 		 * any diagnosis until we're sure that we aren't about to
886 		 * receive a 'resource.fs.zfs.removed' event.
887 		 */
888 		if (checkremove) {
889 			if (zcp->zc_data.zc_has_remove_timer)
890 				fmd_timer_remove(hdl, zcp->zc_remove_timer);
891 			zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, NULL,
892 			    zfs_remove_timeout);
893 			if (!zcp->zc_data.zc_has_remove_timer) {
894 				zcp->zc_data.zc_has_remove_timer = 1;
895 				zfs_case_serialize(zcp);
896 			}
897 		}
898 	}
899 }
900 
901 /*
902  * The timeout is fired when we diagnosed an I/O error, and it was not due to
903  * device removal (which would cause the timeout to be cancelled).
904  */
905 static void
906 zfs_fm_timeout(fmd_hdl_t *hdl, id_t id, void *data)
907 {
908 	zfs_case_t *zcp = data;
909 
910 	if (id == zcp->zc_remove_timer)
911 		zfs_case_solve(hdl, zcp, "fault.fs.zfs.vdev.io");
912 }
913 
914 /*
915  * The specified case has been closed and any case-specific
916  * data structures should be deallocated.
917  */
918 static void
919 zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
920 {
921 	zfs_case_t *zcp = fmd_case_getspecific(hdl, cs);
922 
923 	if (zcp->zc_data.zc_serd_checksum[0] != '\0')
924 		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum);
925 	if (zcp->zc_data.zc_serd_io[0] != '\0')
926 		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io);
927 	if (zcp->zc_data.zc_has_remove_timer)
928 		fmd_timer_remove(hdl, zcp->zc_remove_timer);
929 
930 	uu_list_remove(zfs_cases, zcp);
931 	uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool);
932 	fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
933 }
934 
/*
 * We use the fmd gc entry point to look for old cases that no longer apply.
 * This allows us to keep our set of case data small in a long running system.
 * The work itself is done by zfs_purge_cases().
 */
static void
zfs_fm_gc(fmd_hdl_t *hdl)
{
	zfs_purge_cases(hdl);
}
944 
/*
 * Entry points this module hands to fmd at registration time.  No stats
 * callback is provided.
 */
static const fmd_hdl_ops_t fmd_ops = {
	zfs_fm_recv,	/* fmdo_recv */
	zfs_fm_timeout,	/* fmdo_timeout */
	zfs_fm_close,	/* fmdo_close */
	NULL,		/* fmdo_stats */
	zfs_fm_gc,	/* fmdo_gc */
};
952 
/*
 * Module properties and their default values; the list ends with an
 * all-NULL entry.  Within this file only "remove_timeout" is read back
 * (in _zfs_diagnosis_init()); the SERD thresholds used by zfs_fm_recv()
 * come from the ereport payload or the DEFAULT_* macros instead.
 */
static const fmd_prop_t fmd_props[] = {
	{ "checksum_N", FMD_TYPE_UINT32, "10" },
	{ "checksum_T", FMD_TYPE_TIME, "10min" },
	{ "io_N", FMD_TYPE_UINT32, "10" },
	{ "io_T", FMD_TYPE_TIME, "10min" },
	{ "remove_timeout", FMD_TYPE_TIME, "15sec" },
	{ NULL, 0, NULL }
};
961 
/*
 * Registration record passed to fmd_hdl_register(): module description,
 * module version string, entry points and properties.
 */
static const fmd_hdl_info_t fmd_info = {
	"ZFS Diagnosis Engine", "1.0", &fmd_ops, fmd_props
};
965 
966 void
967 _zfs_diagnosis_init(fmd_hdl_t *hdl)
968 {
969 	libzfs_handle_t *zhdl;
970 
971 	if ((zhdl = libzfs_init()) == NULL)
972 		return;
973 
974 	if ((zfs_case_pool = uu_list_pool_create("zfs_case_pool",
975 	    sizeof (zfs_case_t), offsetof(zfs_case_t, zc_node),
976 	    NULL, UU_LIST_POOL_DEBUG)) == NULL) {
977 		libzfs_fini(zhdl);
978 		return;
979 	}
980 
981 	if ((zfs_cases = uu_list_create(zfs_case_pool, NULL,
982 	    UU_LIST_DEBUG)) == NULL) {
983 		uu_list_pool_destroy(zfs_case_pool);
984 		libzfs_fini(zhdl);
985 		return;
986 	}
987 
988 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
989 		uu_list_destroy(zfs_cases);
990 		uu_list_pool_destroy(zfs_case_pool);
991 		libzfs_fini(zhdl);
992 		return;
993 	}
994 
995 	fmd_hdl_setspecific(hdl, zhdl);
996 
997 	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) /
998 	    sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats);
999 
1000 	zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout");
1001 }
1002 
1003 void
1004 _zfs_diagnosis_fini(fmd_hdl_t *hdl)
1005 {
1006 	zfs_case_t *zcp;
1007 	uu_list_walk_t *walk;
1008 	libzfs_handle_t *zhdl;
1009 
1010 	/*
1011 	 * Remove all active cases.
1012 	 */
1013 	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
1014 	while ((zcp = uu_list_walk_next(walk)) != NULL) {
1015 		fmd_hdl_debug(hdl, "removing case ena %llu",
1016 		    (long long unsigned)zcp->zc_data.zc_ena);
1017 		uu_list_remove(zfs_cases, zcp);
1018 		uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool);
1019 		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
1020 	}
1021 	uu_list_walk_end(walk);
1022 
1023 	uu_list_destroy(zfs_cases);
1024 	uu_list_pool_destroy(zfs_case_pool);
1025 
1026 	zhdl = fmd_hdl_getspecific(hdl);
1027 	libzfs_fini(zhdl);
1028 }
1029