1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * Copyright (c) 2012,2021 by Delphix. All rights reserved.
29 */
30
31 #include <sys/spa.h>
32 #include <sys/spa_impl.h>
33 #include <sys/vdev.h>
34 #include <sys/vdev_impl.h>
35 #include <sys/zio.h>
36 #include <sys/zio_checksum.h>
37
38 #include <sys/fm/fs/zfs.h>
39 #include <sys/fm/protocol.h>
40 #include <sys/fm/util.h>
41 #include <sys/sysevent.h>
42
43 /*
44 * This general routine is responsible for generating all the different ZFS
45 * ereports. The payload is dependent on the class, and which arguments are
46 * supplied to the function:
47 *
48 * EREPORT POOL VDEV IO
49 * block X X X
50 * data X X
51 * device X X
52 * pool X
53 *
54 * If we are in a loading state, all errors are chained together by the same
55 * SPA-wide ENA (Error Numeric Association).
56 *
57 * For isolated I/O requests, we get the ENA from the zio_t. The propagation
58 * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want
59 * to chain together all ereports associated with a logical piece of data. For
60 * read I/Os, there are basically three 'types' of I/O, which form a roughly
61 * layered diagram:
62 *
63 * +---------------+
64 * | Aggregate I/O | No associated logical data or device
65 * +---------------+
66 * |
67 * V
68 * +---------------+ Reads associated with a piece of logical data.
69 * | Read I/O | This includes reads on behalf of RAID-Z,
70 * +---------------+ mirrors, gang blocks, retries, etc.
71 * |
72 * V
73 * +---------------+ Reads associated with a particular device, but
74 * | Physical I/O | no logical data. Issued as part of vdev caching
75 * +---------------+ and I/O aggregation.
76 *
77 * Note that 'physical I/O' here is not the same terminology as used in the rest
78 * of ZIO. Typically, 'physical I/O' simply means that there is no attached
79 * block pointer. But I/O with no associated block pointer can still be related
80 * to a logical piece of data (i.e. RAID-Z requests).
81 *
82 * Purely physical I/Os always have unique ENAs. They are not related to a
83 * particular piece of logical data, and therefore cannot be chained together.
84 * We still generate an ereport, but the DE doesn't correlate it with any
85 * logical piece of data. When such an I/O fails, the delegated I/O requests
86 * will issue a retry, which will trigger the 'real' ereport with the correct
87 * ENA.
88 *
89 * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
90 * When a new logical I/O is issued, we set this to point to itself. Child I/Os
91 * then inherit this pointer, so that once it is first set, subsequent failures
92 * will use the same ENA. For vdev cache fill and queue aggregation I/O,
93 * this pointer is set to NULL, and no ereport will be generated (since it
94 * doesn't actually correspond to any particular device or piece of data,
95 * and the caller will always retry without caching or queueing anyway).
96 *
97 * For checksum errors, we want to include more information about the actual
98 * error which occurs. Accordingly, we build an ereport when the error is
99 * noticed, but instead of sending it in immediately, we hang it off of the
100 * io_cksum_report field of the logical IO. When the logical IO completes
101 * (successfully or not), zfs_ereport_finish_checksum() is called with the
102 * good and bad versions of the buffer (if available), and we annotate the
103 * ereport with information about the differences.
104 */
105
106 #ifdef _KERNEL
107 /*
108 * Duplicate ereport Detection
109 *
110 * Some ereports are retained momentarily for detecting duplicates. These
111 * are kept in a recent_events_node_t in both a time-ordered list and an AVL
112 * tree of recent unique ereports.
113 *
114 * The lifespan of these recent ereports is bounded (15 mins) and a cleaner
115 * task is used to purge stale entries.
116 */
117 static list_t recent_events_list;
118 static avl_tree_t recent_events_tree;
119 static kmutex_t recent_events_lock;
120 static taskqid_t recent_events_cleaner_tqid;
121
122 /*
123 * Each node is about 128 bytes so 2,000 would consume 1/4 MiB.
124 *
125 * This setting can be changed dynamically and setting it to zero
126 * disables duplicate detection.
127 */
128 static unsigned int zfs_zevent_retain_max = 2000;
129
130 /*
131 * The lifespan for a recent ereport entry. The default of 15 minutes is
132 * intended to outlive the zfs diagnosis engine's threshold of 10 errors
133 * over a period of 10 minutes.
134 */
135 static unsigned int zfs_zevent_retain_expire_secs = 900;
136
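/*
 * Ereport subclasses that participate in duplicate detection (see
 * zfs_ereport_is_duplicate() below).
 */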
137 typedef enum zfs_subclass {
138 ZSC_IO,
139 ZSC_DATA,
140 ZSC_CHECKSUM
141 } zfs_subclass_t;
142
143 typedef struct {
144 /* common criteria */
145 uint64_t re_pool_guid;
146 uint64_t re_vdev_guid;
147 int re_io_error;
148 uint64_t re_io_size;
149 uint64_t re_io_offset;
150 zfs_subclass_t re_subclass;
151 zio_priority_t re_io_priority;
152
153 /* logical zio criteria (optional) */
154 zbookmark_phys_t re_io_bookmark;
155
156 /* internal state */
157 avl_node_t re_tree_link;
158 list_node_t re_list_link;
159 uint64_t re_timestamp;
160 } recent_events_node_t;
161
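/*
 * AVL comparison function for recent ereport nodes.  Two nodes compare
 * equal only when every duplicate-detection criterion (subclass, pool and
 * vdev GUIDs, error, priority, size, offset, and bookmark) matches.
 */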
162 static int
163 recent_events_compare(const void *a, const void *b)
164 {
165 const recent_events_node_t *node1 = a;
166 const recent_events_node_t *node2 = b;
167 int cmp;
168
169 /*
170 * The comparison order here is somewhat arbitrary.
171 * What's important is that if every criterion matches, then it
172 * is a duplicate (i.e. compare returns 0)
173 */
174 if ((cmp = TREE_CMP(node1->re_subclass, node2->re_subclass)) != 0)
175 return (cmp);
176 if ((cmp = TREE_CMP(node1->re_pool_guid, node2->re_pool_guid)) != 0)
177 return (cmp);
178 if ((cmp = TREE_CMP(node1->re_vdev_guid, node2->re_vdev_guid)) != 0)
179 return (cmp);
180 if ((cmp = TREE_CMP(node1->re_io_error, node2->re_io_error)) != 0)
181 return (cmp);
182 if ((cmp = TREE_CMP(node1->re_io_priority, node2->re_io_priority)) != 0)
183 return (cmp);
184 if ((cmp = TREE_CMP(node1->re_io_size, node2->re_io_size)) != 0)
185 return (cmp);
186 if ((cmp = TREE_CMP(node1->re_io_offset, node2->re_io_offset)) != 0)
187 return (cmp);
188
189 const zbookmark_phys_t *zb1 = &node1->re_io_bookmark;
190 const zbookmark_phys_t *zb2 = &node2->re_io_bookmark;
191
192 if ((cmp = TREE_CMP(zb1->zb_objset, zb2->zb_objset)) != 0)
193 return (cmp);
194 if ((cmp = TREE_CMP(zb1->zb_object, zb2->zb_object)) != 0)
195 return (cmp);
196 if ((cmp = TREE_CMP(zb1->zb_level, zb2->zb_level)) != 0)
197 return (cmp);
198 if ((cmp = TREE_CMP(zb1->zb_blkid, zb2->zb_blkid)) != 0)
199 return (cmp);
200
201 return (0);
202 }
203
204 /*
205 * workaround: vdev properties don't have inheritance
206 */
207 static uint64_t
208 vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop)
209 {
210 uint64_t propdef, propval;
211
212 propdef = vdev_prop_default_numeric(prop);
213 switch (prop) {
214 case VDEV_PROP_CHECKSUM_N:
215 propval = vd->vdev_checksum_n;
216 break;
217 case VDEV_PROP_CHECKSUM_T:
218 propval = vd->vdev_checksum_t;
219 break;
220 case VDEV_PROP_IO_N:
221 propval = vd->vdev_io_n;
222 break;
223 case VDEV_PROP_IO_T:
224 propval = vd->vdev_io_t;
225 break;
226 case VDEV_PROP_SLOW_IO_EVENTS:
227 propval = vd->vdev_slow_io_events;
228 break;
229 case VDEV_PROP_SLOW_IO_N:
230 propval = vd->vdev_slow_io_n;
231 break;
232 case VDEV_PROP_SLOW_IO_T:
233 propval = vd->vdev_slow_io_t;
234 break;
235 default:
236 propval = propdef;
237 break;
238 }
239
240 if (propval != propdef)
241 return (propval);
242
243 if (vd->vdev_parent == NULL)
244 return (propdef);
245
246 return (vdev_prop_get_inherited(vd->vdev_parent, prop));
247 }
248
249 static void zfs_ereport_schedule_cleaner(void);
250
251 /*
252 * background task to clean stale recent event nodes.
253 */
254 static void
255 zfs_ereport_cleaner(void *arg)
256 {
257 recent_events_node_t *entry;
258 uint64_t now = gethrtime();
259
260 /*
261 * purge expired entries
262 */
263 mutex_enter(&recent_events_lock);
264 while ((entry = list_tail(&recent_events_list)) != NULL) {
265 uint64_t age = NSEC2SEC(now - entry->re_timestamp);
266 if (age <= zfs_zevent_retain_expire_secs)
267 break;
268
269 /* remove expired node */
270 avl_remove(&recent_events_tree, entry);
271 list_remove(&recent_events_list, entry);
272 kmem_free(entry, sizeof (*entry));
273 }
274
275 /* Restart the cleaner if more entries remain */
276 recent_events_cleaner_tqid = 0;
277 if (!list_is_empty(&recent_events_list))
278 zfs_ereport_schedule_cleaner();
279
280 mutex_exit(&recent_events_lock);
281 }
282
283 static void
284 zfs_ereport_schedule_cleaner(void)
285 {
286 ASSERT(MUTEX_HELD(&recent_events_lock));
287
288 uint64_t timeout = SEC2NSEC(zfs_zevent_retain_expire_secs + 1);
289
290 recent_events_cleaner_tqid = taskq_dispatch_delay(
291 system_delay_taskq, zfs_ereport_cleaner, NULL, TQ_SLEEP,
292 ddi_get_lbolt() + NSEC_TO_TICK(timeout));
293 }
294
295 /*
296 * Clear entries for a given vdev or all vdevs in a pool when vdev == NULL
297 */
298 void
299 zfs_ereport_clear(spa_t *spa, vdev_t *vd)
300 {
301 uint64_t vdev_guid, pool_guid;
302
303 ASSERT(vd != NULL || spa != NULL);
304 if (vd == NULL) {
305 vdev_guid = 0;
306 pool_guid = spa_guid(spa);
307 } else {
308 vdev_guid = vd->vdev_guid;
309 pool_guid = 0;
310 }
311
312 mutex_enter(&recent_events_lock);
313
314 recent_events_node_t *next = list_head(&recent_events_list);
315 while (next != NULL) {
316 recent_events_node_t *entry = next;
317
318 next = list_next(&recent_events_list, next);
319
320 if (entry->re_vdev_guid == vdev_guid ||
321 entry->re_pool_guid == pool_guid) {
322 avl_remove(&recent_events_tree, entry);
323 list_remove(&recent_events_list, entry);
324 kmem_free(entry, sizeof (*entry));
325 }
326 }
327
328 mutex_exit(&recent_events_lock);
329 }
330
331 /*
332 * Check if an ereport would be a duplicate of one recently posted.
333 *
334 * An ereport is considered a duplicate if the set of criteria in
335 * recent_events_node_t all match.
336 *
337 * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM
338 * are candidates for duplicate checking.
339 */
340 static boolean_t
341 zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd,
342 const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size)
343 {
344 recent_events_node_t search = {0}, *entry;
345
346 if (vd == NULL || zio == NULL)
347 return (B_FALSE);
348
349 if (zfs_zevent_retain_max == 0)
350 return (B_FALSE);
351
352 if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0)
353 search.re_subclass = ZSC_IO;
354 else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0)
355 search.re_subclass = ZSC_DATA;
356 else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0)
357 search.re_subclass = ZSC_CHECKSUM;
358 else
359 return (B_FALSE);
360
361 search.re_pool_guid = spa_guid(spa);
362 search.re_vdev_guid = vd->vdev_guid;
363 search.re_io_error = zio->io_error;
364 search.re_io_priority = zio->io_priority;
365 /* if size is supplied use it over what's in zio */
366 if (size) {
367 search.re_io_size = size;
368 search.re_io_offset = offset;
369 } else {
370 search.re_io_size = zio->io_size;
371 search.re_io_offset = zio->io_offset;
372 }
373
374 /* grab optional logical zio criteria */
375 if (zb != NULL) {
376 search.re_io_bookmark.zb_objset = zb->zb_objset;
377 search.re_io_bookmark.zb_object = zb->zb_object;
378 search.re_io_bookmark.zb_level = zb->zb_level;
379 search.re_io_bookmark.zb_blkid = zb->zb_blkid;
380 }
381
382 uint64_t now = gethrtime();
383
384 mutex_enter(&recent_events_lock);
385
386 /* check if we have seen this one recently */
387 entry = avl_find(&recent_events_tree, &search, NULL);
388 if (entry != NULL) {
389 uint64_t age = NSEC2SEC(now - entry->re_timestamp);
390
391 /*
392 * There is still an active cleaner (since we're here).
393 * Reset the last seen time for this duplicate entry
394 * so that its lifespan gets extended.
395 */
396 list_remove(&recent_events_list, entry);
397 list_insert_head(&recent_events_list, entry);
398 entry->re_timestamp = now;
399
400 zfs_zevent_track_duplicate();
401 mutex_exit(&recent_events_lock);
402
403 return (age <= zfs_zevent_retain_expire_secs);
404 }
405
406 if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) {
407 /* recycle oldest node */
408 entry = list_tail(&recent_events_list);
409 ASSERT(entry != NULL);
410 list_remove(&recent_events_list, entry);
411 avl_remove(&recent_events_tree, entry);
412 } else {
413 entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP);
414 }
415
416 /* record this as a recent ereport */
417 *entry = search;
418 avl_add(&recent_events_tree, entry);
419 list_insert_head(&recent_events_list, entry);
420 entry->re_timestamp = now;
421
422 /* Start a cleaner if not already scheduled */
423 if (recent_events_cleaner_tqid == 0)
424 zfs_ereport_schedule_cleaner();
425
426 mutex_exit(&recent_events_lock);
427 return (B_FALSE);
428 }
429
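/*
 * Cleanup callback for zfs_zevent_post(): frees the ereport and detector
 * nvlists once they are no longer needed.  Also called directly when an
 * ereport is dropped rather than posted.
 */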
430 void
431 zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector)
432 {
433 if (nvl)
434 fm_nvlist_destroy(nvl, FM_NVA_FREE);
435
436 if (detector)
437 fm_nvlist_destroy(detector, FM_NVA_FREE);
438 }
439
440 /*
441 * We want to rate limit ZIO delay, deadman, and checksum events so as to not
442 * flood zevent consumers when a disk is acting up.
443 *
444 * Returns 1 if we're ratelimiting, 0 if not.
445 */
446 static int
447 zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd)
448 {
449 int rc = 0;
450 /*
451 * zfs_ratelimit() returns 1 if we're *not* ratelimiting and 0 if we
452 * are. Invert it to get our return value.
453 */
454 if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
455 rc = !zfs_ratelimit(&vd->vdev_delay_rl);
456 } else if (strcmp(subclass, FM_EREPORT_ZFS_DEADMAN) == 0) {
457 rc = !zfs_ratelimit(&vd->vdev_deadman_rl);
458 } else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) {
459 rc = !zfs_ratelimit(&vd->vdev_checksum_rl);
460 }
461
462 if (rc) {
463 /* We're rate limiting */
464 fm_erpt_dropped_increment();
465 }
466
467 return (rc);
468 }
469
470 /*
471 * Return B_TRUE if the event actually posted, B_FALSE if not.
472 */
473 static boolean_t
474 zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
475 const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
476 zio_t *zio, uint64_t stateoroffset, uint64_t size)
477 {
478 nvlist_t *ereport, *detector;
479
480 uint64_t ena;
481 char class[64];
482
483 if ((ereport = fm_nvlist_create(NULL)) == NULL)
484 return (B_FALSE);
485
486 if ((detector = fm_nvlist_create(NULL)) == NULL) {
487 fm_nvlist_destroy(ereport, FM_NVA_FREE);
488 return (B_FALSE);
489 }
490
491 /*
492 * Serialize ereport generation
493 */
494 mutex_enter(&spa->spa_errlist_lock);
495
496 /*
497 * Determine the ENA to use for this event. If we are in a loading
498 * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use
499 * a root zio-wide ENA. Otherwise, simply use a unique ENA.
500 */
501 if (spa_load_state(spa) != SPA_LOAD_NONE) {
502 if (spa->spa_ena == 0)
503 spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
504 ena = spa->spa_ena;
505 } else if (zio != NULL && zio->io_logical != NULL) {
506 if (zio->io_logical->io_ena == 0)
507 zio->io_logical->io_ena =
508 fm_ena_generate(0, FM_ENA_FMT1);
509 ena = zio->io_logical->io_ena;
510 } else {
511 ena = fm_ena_generate(0, FM_ENA_FMT1);
512 }
513
514 /*
515 * Construct the full class, detector, and other standard FMA fields.
516 */
517 (void) snprintf(class, sizeof (class), "%s.%s",
518 ZFS_ERROR_CLASS, subclass);
519
520 fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
521 vd != NULL ? vd->vdev_guid : 0);
522
523 fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);
524
525 /*
526 * Construct the per-ereport payload, depending on which parameters are
527 * passed in.
528 */
529
530 /*
531 * Generic payload members common to all ereports.
532 */
533 fm_payload_set(ereport,
534 FM_EREPORT_PAYLOAD_ZFS_POOL, DATA_TYPE_STRING, spa_name(spa),
535 FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa),
536 FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, DATA_TYPE_UINT64,
537 (uint64_t)spa_state(spa),
538 FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
539 (int32_t)spa_load_state(spa), NULL);
540
541 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
542 DATA_TYPE_STRING,
543 spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
544 FM_EREPORT_FAILMODE_WAIT :
545 spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
546 FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC,
547 NULL);
548
549 if (vd != NULL) {
550 vdev_t *pvd = vd->vdev_parent;
551 vdev_queue_t *vq = &vd->vdev_queue;
552 vdev_stat_t *vs = &vd->vdev_stat;
553 vdev_t *spare_vd;
554 uint64_t *spare_guids;
555 char **spare_paths;
556 int i, spare_count;
557
558 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
559 DATA_TYPE_UINT64, vd->vdev_guid,
560 FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
561 DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
562 if (vd->vdev_path != NULL)
563 fm_payload_set(ereport,
564 FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
565 DATA_TYPE_STRING, vd->vdev_path, NULL);
566 if (vd->vdev_devid != NULL)
567 fm_payload_set(ereport,
568 FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
569 DATA_TYPE_STRING, vd->vdev_devid, NULL);
570 if (vd->vdev_fru != NULL)
571 fm_payload_set(ereport,
572 FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
573 DATA_TYPE_STRING, vd->vdev_fru, NULL);
574 if (vd->vdev_enc_sysfs_path != NULL)
575 fm_payload_set(ereport,
576 FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
577 DATA_TYPE_STRING, vd->vdev_enc_sysfs_path, NULL);
578 if (vd->vdev_ashift)
579 fm_payload_set(ereport,
580 FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT,
581 DATA_TYPE_UINT64, vd->vdev_ashift, NULL);
582
583 if (vq != NULL) {
584 fm_payload_set(ereport,
585 FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS,
586 DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL);
587 fm_payload_set(ereport,
588 FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS,
589 DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL);
590 }
591
592 if (vs != NULL) {
593 fm_payload_set(ereport,
594 FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS,
595 DATA_TYPE_UINT64, vs->vs_read_errors,
596 FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS,
597 DATA_TYPE_UINT64, vs->vs_write_errors,
598 FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS,
599 DATA_TYPE_UINT64, vs->vs_checksum_errors,
600 FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS,
601 DATA_TYPE_UINT64, vs->vs_slow_ios,
602 FM_EREPORT_PAYLOAD_ZFS_VDEV_DIO_VERIFY_ERRORS,
603 DATA_TYPE_UINT64, vs->vs_dio_verify_errors,
604 NULL);
605 }
606
607 if (pvd != NULL) {
608 fm_payload_set(ereport,
609 FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
610 DATA_TYPE_UINT64, pvd->vdev_guid,
611 FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
612 DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
613 NULL);
614 if (pvd->vdev_path)
615 fm_payload_set(ereport,
616 FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
617 DATA_TYPE_STRING, pvd->vdev_path, NULL);
618 if (pvd->vdev_devid)
619 fm_payload_set(ereport,
620 FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
621 DATA_TYPE_STRING, pvd->vdev_devid, NULL);
622 }
623
624 spare_count = spa->spa_spares.sav_count;
625 spare_paths = kmem_zalloc(sizeof (char *) * spare_count,
626 KM_SLEEP);
627 spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count,
628 KM_SLEEP);
629
630 for (i = 0; i < spare_count; i++) {
631 spare_vd = spa->spa_spares.sav_vdevs[i];
632 if (spare_vd) {
633 spare_paths[i] = spare_vd->vdev_path;
634 spare_guids[i] = spare_vd->vdev_guid;
635 }
636 }
637
638 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS,
639 DATA_TYPE_STRING_ARRAY, spare_count, spare_paths,
640 FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS,
641 DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL);
642
643 kmem_free(spare_guids, sizeof (uint64_t) * spare_count);
644 kmem_free(spare_paths, sizeof (char *) * spare_count);
645 }
646
647 if (zio != NULL) {
648 /*
649 * Payload common to all I/Os.
650 */
651 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
652 DATA_TYPE_INT32, zio->io_error, NULL);
653 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS,
654 DATA_TYPE_UINT64, zio->io_flags, NULL);
655 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE,
656 DATA_TYPE_UINT32, zio->io_stage, NULL);
657 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE,
658 DATA_TYPE_UINT32, zio->io_pipeline, NULL);
659 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY,
660 DATA_TYPE_UINT64, zio->io_delay, NULL);
661 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP,
662 DATA_TYPE_UINT64, zio->io_timestamp, NULL);
663 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA,
664 DATA_TYPE_UINT64, zio->io_delta, NULL);
665 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TYPE,
666 DATA_TYPE_UINT32, zio->io_type, NULL);
667 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY,
668 DATA_TYPE_UINT32, zio->io_priority, NULL);
669
670 /*
671 * If the 'size' parameter is non-zero, it indicates this is a
672 * RAID-Z or other I/O where the physical offset and length are
673 * provided for us, instead of within the zio_t.
674 */
675 if (vd != NULL) {
676 if (size)
677 fm_payload_set(ereport,
678 FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
679 DATA_TYPE_UINT64, stateoroffset,
680 FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
681 DATA_TYPE_UINT64, size, NULL);
682 else
683 fm_payload_set(ereport,
684 FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
685 DATA_TYPE_UINT64, zio->io_offset,
686 FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
687 DATA_TYPE_UINT64, zio->io_size, NULL);
688 }
689 } else if (vd != NULL) {
690 /*
691 * If we have a vdev but no zio, this is a device fault, and the
692 * 'stateoroffset' parameter indicates the previous state of the
693 * vdev.
694 */
695 fm_payload_set(ereport,
696 FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
697 DATA_TYPE_UINT64, stateoroffset, NULL);
698 }
699
700 /*
701 * Payload for I/Os with corresponding logical information.
702 */
703 if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) {
704 fm_payload_set(ereport,
705 FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
706 DATA_TYPE_UINT64, zb->zb_objset,
707 FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
708 DATA_TYPE_UINT64, zb->zb_object,
709 FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
710 DATA_TYPE_INT64, zb->zb_level,
711 FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
712 DATA_TYPE_UINT64, zb->zb_blkid, NULL);
713 }
714
715 /*
716 * Payload for tuning the zed
717 */
718 if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) {
719 uint64_t cksum_n, cksum_t;
720
721 cksum_n = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_N);
722 if (cksum_n != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N))
723 fm_payload_set(ereport,
724 FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N,
725 DATA_TYPE_UINT64,
726 cksum_n,
727 NULL);
728
729 cksum_t = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_T);
730 if (cksum_t != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T))
731 fm_payload_set(ereport,
732 FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T,
733 DATA_TYPE_UINT64,
734 cksum_t,
735 NULL);
736 }
737
738 if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) {
739 uint64_t io_n, io_t;
740
741 io_n = vdev_prop_get_inherited(vd, VDEV_PROP_IO_N);
742 if (io_n != vdev_prop_default_numeric(VDEV_PROP_IO_N))
743 fm_payload_set(ereport,
744 FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N,
745 DATA_TYPE_UINT64,
746 io_n,
747 NULL);
748
749 io_t = vdev_prop_get_inherited(vd, VDEV_PROP_IO_T);
750 if (io_t != vdev_prop_default_numeric(VDEV_PROP_IO_T))
751 fm_payload_set(ereport,
752 FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T,
753 DATA_TYPE_UINT64,
754 io_t,
755 NULL);
756 }
757
758 if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
759 uint64_t slow_io_n, slow_io_t;
760
761 slow_io_n = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_N);
762 if (slow_io_n != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N))
763 fm_payload_set(ereport,
764 FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N,
765 DATA_TYPE_UINT64,
766 slow_io_n,
767 NULL);
768
769 slow_io_t = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_T);
770 if (slow_io_t != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T))
771 fm_payload_set(ereport,
772 FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T,
773 DATA_TYPE_UINT64,
774 slow_io_t,
775 NULL);
776 }
777
778 mutex_exit(&spa->spa_errlist_lock);
779
780 *ereport_out = ereport;
781 *detector_out = detector;
782 return (B_TRUE);
783 }
784
785 /* if it's <= 128 bytes, save the corruption directly */
786 #define ZFM_MAX_INLINE (128 / sizeof (uint64_t))
787
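/*
 * Maximum number of distinct corruption ranges recorded per checksum
 * ereport; when the array fills up, nearby ranges are merged by
 * zei_shrink_ranges().
 */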
788 #define MAX_RANGES 16
789
790 typedef struct zfs_ecksum_info {
791 /* inline arrays of bits set and cleared. */
792 uint64_t zei_bits_set[ZFM_MAX_INLINE];
793 uint64_t zei_bits_cleared[ZFM_MAX_INLINE];
794
795 /*
796 * for each range, the number of bits set and cleared. The Hamming
797 * distance between the good and bad buffers is the sum of them all.
798 */
799 uint32_t zei_range_sets[MAX_RANGES];
800 uint32_t zei_range_clears[MAX_RANGES];
801
802 struct zei_ranges {
803 uint32_t zr_start;
804 uint32_t zr_end;
805 } zei_ranges[MAX_RANGES];
806
807 size_t zei_range_count;
808 uint32_t zei_mingap;
809 uint32_t zei_allowed_mingap;
810
811 } zfs_ecksum_info_t;
812
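/*
 * Add the number of bits set in value_arg (i.e. the number of differing
 * bits for one 64-bit word) to *count.
 */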
813 static void
814 update_bad_bits(uint64_t value_arg, uint32_t *count)
815 {
816 size_t i;
817 size_t bits = 0;
818 uint64_t value = BE_64(value_arg);
819
820 /* We store the bits in big-endian (largest-first) order */
821 for (i = 0; i < 64; i++) {
822 if (value & (1ull << i))
823 ++bits;
824 }
825 /* update the count of bits changed */
826 *count += bits;
827 }
828
829 /*
830 * We've now filled up the range array, and need to increase "mingap" and
831 * shrink the range list accordingly. zei_mingap is always the smallest
832 * distance between array entries, so we set the new_allowed_gap to be
833 * one greater than that. We then go through the list, joining together
834 * any ranges which are closer than the new_allowed_gap.
835 *
836 * By construction, at least one pair of ranges will be joined. We also update zei_mingap
837 * to the new smallest gap, to prepare for our next invocation.
838 */
839 static void
840 zei_shrink_ranges(zfs_ecksum_info_t *eip)
841 {
842 uint32_t mingap = UINT32_MAX;
843 uint32_t new_allowed_gap = eip->zei_mingap + 1;
844
845 size_t idx, output;
846 size_t max = eip->zei_range_count;
847
848 struct zei_ranges *r = eip->zei_ranges;
849
850 ASSERT3U(eip->zei_range_count, >, 0);
851 ASSERT3U(eip->zei_range_count, <=, MAX_RANGES);
852
853 output = idx = 0;
854 while (idx < max - 1) {
855 uint32_t start = r[idx].zr_start;
856 uint32_t end = r[idx].zr_end;
857
858 while (idx < max - 1) {
859 idx++;
860
861 uint32_t nstart = r[idx].zr_start;
862 uint32_t nend = r[idx].zr_end;
863
864 uint32_t gap = nstart - end;
865 if (gap < new_allowed_gap) {
866 end = nend;
867 continue;
868 }
869 if (gap < mingap)
870 mingap = gap;
871 break;
872 }
873 r[output].zr_start = start;
874 r[output].zr_end = end;
875 output++;
876 }
877 ASSERT3U(output, <, eip->zei_range_count);
878 eip->zei_range_count = output;
879 eip->zei_mingap = mingap;
880 eip->zei_allowed_mingap = new_allowed_gap;
881 }
882
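/*
 * Record a half-open range [start, end) of differing uint64_t words,
 * merging it into the previous range when the gap between them is smaller
 * than the currently allowed minimum.
 */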
883 static void
884 zei_add_range(zfs_ecksum_info_t *eip, int start, int end)
885 {
886 struct zei_ranges *r = eip->zei_ranges;
887 size_t count = eip->zei_range_count;
888
889 if (count >= MAX_RANGES) {
890 zei_shrink_ranges(eip);
891 count = eip->zei_range_count;
892 }
893 if (count == 0) {
894 eip->zei_mingap = UINT32_MAX;
895 eip->zei_allowed_mingap = 1;
896 } else {
897 int gap = start - r[count - 1].zr_end;
898
899 if (gap < eip->zei_allowed_mingap) {
900 r[count - 1].zr_end = end;
901 return;
902 }
903 if (gap < eip->zei_mingap)
904 eip->zei_mingap = gap;
905 }
906 r[count].zr_start = start;
907 r[count].zr_end = end;
908 eip->zei_range_count++;
909 }
910
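/*
 * Return the total number of differing uint64_t words across all recorded
 * ranges; used to decide whether the inline bit arrays can hold them.
 */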
911 static size_t
912 zei_range_total_size(zfs_ecksum_info_t *eip)
913 {
914 struct zei_ranges *r = eip->zei_ranges;
915 size_t count = eip->zei_range_count;
916 size_t result = 0;
917 size_t idx;
918
919 for (idx = 0; idx < count; idx++)
920 result += (r[idx].zr_end - r[idx].zr_start);
921
922 return (result);
923 }
924
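/*
 * Compare the good and bad buffers and annotate the checksum ereport with
 * the byte ranges that differ and, if they fit inline, the exact bits that
 * were set or cleared.  Returns NULL when the buffers are identical and
 * drop_if_identical is set, in which case no annotation is added.
 */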
925 static zfs_ecksum_info_t *
926 annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
927 const abd_t *goodabd, const abd_t *badabd, size_t size,
928 boolean_t drop_if_identical)
929 {
930 const uint64_t *good;
931 const uint64_t *bad;
932
933 size_t nui64s = size / sizeof (uint64_t);
934
935 size_t inline_size;
936 int no_inline = 0;
937 size_t idx;
938 size_t range;
939
940 size_t offset = 0;
941 ssize_t start = -1;
942
943 zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP);
944
945 /* don't do any annotation for injected checksum errors */
946 if (info != NULL && info->zbc_injected)
947 return (eip);
948
949 if (info != NULL && info->zbc_has_cksum) {
950 fm_payload_set(ereport,
951 FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
952 DATA_TYPE_STRING,
953 info->zbc_checksum_name,
954 NULL);
955
956 if (info->zbc_byteswapped) {
957 fm_payload_set(ereport,
958 FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP,
959 DATA_TYPE_BOOLEAN, 1,
960 NULL);
961 }
962 }
963
964 if (badabd == NULL || goodabd == NULL)
965 return (eip);
966
967 ASSERT3U(nui64s, <=, UINT32_MAX);
968 ASSERT3U(size, ==, nui64s * sizeof (uint64_t));
969 ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
970 ASSERT3U(size, <=, UINT32_MAX);
971
972 good = (const uint64_t *) abd_borrow_buf_copy((abd_t *)goodabd, size);
973 bad = (const uint64_t *) abd_borrow_buf_copy((abd_t *)badabd, size);
974
975 /* build up the range list by comparing the two buffers. */
976 for (idx = 0; idx < nui64s; idx++) {
977 if (good[idx] == bad[idx]) {
978 if (start == -1)
979 continue;
980
981 zei_add_range(eip, start, idx);
982 start = -1;
983 } else {
984 if (start != -1)
985 continue;
986
987 start = idx;
988 }
989 }
990 if (start != -1)
991 zei_add_range(eip, start, idx);
992
993 /* See if it will fit in our inline buffers */
994 inline_size = zei_range_total_size(eip);
995 if (inline_size > ZFM_MAX_INLINE)
996 no_inline = 1;
997
998 /*
999 * If there is no change and we want to drop if the buffers are
1000 * identical, do so.
1001 */
1002 if (inline_size == 0 && drop_if_identical) {
1003 kmem_free(eip, sizeof (*eip));
1004 abd_return_buf((abd_t *)goodabd, (void *)good, size);
1005 abd_return_buf((abd_t *)badabd, (void *)bad, size);
1006 return (NULL);
1007 }
1008
1009 /*
1010 * Now walk through the ranges, filling in the details of the
1011 * differences. Also convert our uint64_t-array offsets to byte
1012 * offsets.
1013 */
1014 for (range = 0; range < eip->zei_range_count; range++) {
1015 size_t start = eip->zei_ranges[range].zr_start;
1016 size_t end = eip->zei_ranges[range].zr_end;
1017
1018 for (idx = start; idx < end; idx++) {
1019 uint64_t set, cleared;
1020
1021 // bits set in bad, but not in good
1022 set = ((~good[idx]) & bad[idx]);
1023 // bits set in good, but not in bad
1024 cleared = (good[idx] & (~bad[idx]));
1025
1026 if (!no_inline) {
1027 ASSERT3U(offset, <, inline_size);
1028 eip->zei_bits_set[offset] = set;
1029 eip->zei_bits_cleared[offset] = cleared;
1030 offset++;
1031 }
1032
1033 update_bad_bits(set, &eip->zei_range_sets[range]);
1034 update_bad_bits(cleared, &eip->zei_range_clears[range]);
1035 }
1036
1037 /* convert to byte offsets */
1038 eip->zei_ranges[range].zr_start *= sizeof (uint64_t);
1039 eip->zei_ranges[range].zr_end *= sizeof (uint64_t);
1040 }
1041
1042 abd_return_buf((abd_t *)goodabd, (void *)good, size);
1043 abd_return_buf((abd_t *)badabd, (void *)bad, size);
1044
1045 eip->zei_allowed_mingap *= sizeof (uint64_t);
1046 inline_size *= sizeof (uint64_t);
1047
1048 /* fill in ereport */
1049 fm_payload_set(ereport,
1050 FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES,
1051 DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count,
1052 (uint32_t *)eip->zei_ranges,
1053 FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP,
1054 DATA_TYPE_UINT32, eip->zei_allowed_mingap,
1055 FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS,
1056 DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets,
1057 FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS,
1058 DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears,
1059 NULL);
1060
1061 if (!no_inline) {
1062 fm_payload_set(ereport,
1063 FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS,
1064 DATA_TYPE_UINT8_ARRAY,
1065 inline_size, (uint8_t *)eip->zei_bits_set,
1066 FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS,
1067 DATA_TYPE_UINT8_ARRAY,
1068 inline_size, (uint8_t *)eip->zei_bits_cleared,
1069 NULL);
1070 }
1071 return (eip);
1072 }
1073 #else
1074 void
1075 zfs_ereport_clear(spa_t *spa, vdev_t *vd)
1076 {
1077 (void) spa, (void) vd;
1078 }
1079 #endif
1080
1081 /*
1082 * Make sure our event is still valid for the given zio/vdev/pool. For example,
1083 * we don't want to keep logging events for a faulted or missing vdev.
1084 */
1085 boolean_t
1086 zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
1087 {
1088 #ifdef _KERNEL
1089 /*
1090 * If we are doing a spa_tryimport() or in recovery mode,
1091 * ignore errors.
1092 */
1093 if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
1094 spa_load_state(spa) == SPA_LOAD_RECOVER)
1095 return (B_FALSE);
1096
1097 /*
1098 * If we are in the middle of opening a pool, and the previous attempt
1099 * failed, don't bother logging any new ereports - we're just going to
1100 * get the same diagnosis anyway.
1101 */
1102 if (spa_load_state(spa) != SPA_LOAD_NONE &&
1103 spa->spa_last_open_failed)
1104 return (B_FALSE);
1105
1106 if (zio != NULL) {
1107 /* If this is not a read or write zio, ignore the error */
1108 if (zio->io_type != ZIO_TYPE_READ &&
1109 zio->io_type != ZIO_TYPE_WRITE)
1110 return (B_FALSE);
1111
1112 if (vd != NULL) {
1113 /*
1114 * If the vdev has already been marked as failing due
1115 * to a failed probe, then ignore any subsequent I/O
1116 * errors, as the DE will automatically fault the vdev
1117 * on the first such failure. This also catches cases
1118 * where vdev_remove_wanted is set and the device has
1119 * not yet been asynchronously placed into the REMOVED
1120 * state.
1121 */
1122 if (zio->io_vd == vd && !vdev_accessible(vd, zio))
1123 return (B_FALSE);
1124
1125 /*
1126 * Ignore checksum errors for reads from DTL regions of
1127 * leaf vdevs.
1128 */
1129 if (zio->io_type == ZIO_TYPE_READ &&
1130 zio->io_error == ECKSUM &&
1131 vd->vdev_ops->vdev_op_leaf &&
1132 vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
1133 return (B_FALSE);
1134 }
1135 }
1136
1137 /*
1138 * For probe failure, we want to avoid posting ereports if we've
1139 * already removed the device in the meantime.
1140 */
1141 if (vd != NULL &&
1142 strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
1143 (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
1144 return (B_FALSE);
1145
1146 /* Ignore bogus delay events (like from ioctls or unqueued IOs) */
1147 if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
1148 (zio != NULL) && (!zio->io_timestamp)) {
1149 return (B_FALSE);
1150 }
1151 #else
1152 (void) subclass, (void) spa, (void) vd, (void) zio;
1153 #endif
1154 return (B_TRUE);
1155 }
1156
1157 /*
1158 * Post an ereport for the given subclass
1159 *
1160 * Returns
1161 * - 0 if an event was posted
1162 * - EINVAL if there was a problem posting event
1163 * - EBUSY if the event was rate limited
1164 * - EALREADY if the event was already posted (duplicate)
1165 */
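/*
 * Illustrative call only (the spa, vd, and zio shown are placeholders, not
 * taken from this file):
 *
 *	(void) zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, spa, vd,
 *	    &zio->io_bookmark, zio, 0);
 */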
1166 int
1167 zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
1168 const zbookmark_phys_t *zb, zio_t *zio, uint64_t state)
1169 {
1170 int rc = 0;
1171 #ifdef _KERNEL
1172 nvlist_t *ereport = NULL;
1173 nvlist_t *detector = NULL;
1174
1175 if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
1176 return (EINVAL);
1177
1178 if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0))
1179 return (SET_ERROR(EALREADY));
1180
1181 if (zfs_is_ratelimiting_event(subclass, vd))
1182 return (SET_ERROR(EBUSY));
1183
1184 if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
1185 zb, zio, state, 0))
1186 return (SET_ERROR(EINVAL)); /* couldn't post event */
1187
1188 if (ereport == NULL)
1189 return (SET_ERROR(EINVAL));
1190
1191 /* Cleanup is handled by the callback function */
1192 rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
1193 #else
1194 (void) subclass, (void) spa, (void) vd, (void) zb, (void) zio,
1195 (void) state;
1196 #endif
1197 return (rc);
1198 }
1199
1200 /*
1201 * Prepare a checksum ereport
1202 *
1203 * Returns
1204 * - 0 if an event was posted
1205 * - EINVAL if there was a problem posting event
1206 * - EBUSY if the event was rate limited
1207 * - EALREADY if the event was already posted (duplicate)
1208 */
1209 int
1210 zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
1211 struct zio *zio, uint64_t offset, uint64_t length, zio_bad_cksum_t *info)
1212 {
1213 zio_cksum_report_t *report;
1214
1215 #ifdef _KERNEL
1216 if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
1217 return (SET_ERROR(EINVAL));
1218
1219 if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
1220 offset, length))
1221 return (SET_ERROR(EALREADY));
1222
1223 if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
1224 return (SET_ERROR(EBUSY));
1225 #else
1226 (void) zb, (void) offset;
1227 #endif
1228
1229 report = kmem_zalloc(sizeof (*report), KM_SLEEP);
1230
1231 zio_vsd_default_cksum_report(zio, report);
1232
1233 /* copy the checksum failure information if it was provided */
1234 if (info != NULL) {
1235 report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP);
1236 memcpy(report->zcr_ckinfo, info, sizeof (*info));
1237 }
1238
1239 report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift;
1240 report->zcr_align =
1241 vdev_psize_to_asize(vd->vdev_top, report->zcr_sector);
1242 report->zcr_length = length;
1243
1244 #ifdef _KERNEL
1245 (void) zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
1246 FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length);
1247
1248 if (report->zcr_ereport == NULL) {
1249 zfs_ereport_free_checksum(report);
1250 return (0);
1251 }
1252 #endif
1253
1254 mutex_enter(&spa->spa_errlist_lock);
1255 report->zcr_next = zio->io_logical->io_cksum_report;
1256 zio->io_logical->io_cksum_report = report;
1257 mutex_exit(&spa->spa_errlist_lock);
1258 return (0);
1259 }
1260
1261 void
1262 zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data,
1263 const abd_t *bad_data, boolean_t drop_if_identical)
1264 {
1265 #ifdef _KERNEL
1266 zfs_ecksum_info_t *info;
1267
1268 info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo,
1269 good_data, bad_data, report->zcr_length, drop_if_identical);
1270 if (info != NULL)
1271 zfs_zevent_post(report->zcr_ereport,
1272 report->zcr_detector, zfs_zevent_post_cb);
1273 else
1274 zfs_zevent_post_cb(report->zcr_ereport, report->zcr_detector);
1275
1276 report->zcr_ereport = report->zcr_detector = NULL;
1277 if (info != NULL)
1278 kmem_free(info, sizeof (*info));
1279 #else
1280 (void) report, (void) good_data, (void) bad_data,
1281 (void) drop_if_identical;
1282 #endif
1283 }
1284
1285 void
1286 zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
1287 {
1288 #ifdef _KERNEL
1289 if (rpt->zcr_ereport != NULL) {
1290 fm_nvlist_destroy(rpt->zcr_ereport,
1291 FM_NVA_FREE);
1292 fm_nvlist_destroy(rpt->zcr_detector,
1293 FM_NVA_FREE);
1294 }
1295 #endif
1296 rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo);
1297
1298 if (rpt->zcr_ckinfo != NULL)
1299 kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo));
1300
1301 kmem_free(rpt, sizeof (*rpt));
1302 }
1303
1304 /*
1305 * Post a checksum ereport
1306 *
1307 * Returns
1308 * - 0 if an event was posted
1309 * - EINVAL if there was a problem posting event
1310 * - EBUSY if the event was rate limited
1311 * - EALREADY if the event was already posted (duplicate)
1312 */
1313 int
1314 zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
1315 struct zio *zio, uint64_t offset, uint64_t length,
1316 const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc)
1317 {
1318 int rc = 0;
1319 #ifdef _KERNEL
1320 nvlist_t *ereport = NULL;
1321 nvlist_t *detector = NULL;
1322 zfs_ecksum_info_t *info;
1323
1324 if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
1325 return (SET_ERROR(EINVAL));
1326
1327 if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
1328 offset, length))
1329 return (SET_ERROR(EALREADY));
1330
1331 if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
1332 return (SET_ERROR(EBUSY));
1333
1334 if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
1335 spa, vd, zb, zio, offset, length) || (ereport == NULL)) {
1336 return (SET_ERROR(EINVAL));
1337 }
1338
1339 info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
1340 B_FALSE);
1341
1342 if (info != NULL) {
1343 rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
1344 kmem_free(info, sizeof (*info));
1345 }
1346 #else
1347 (void) spa, (void) vd, (void) zb, (void) zio, (void) offset,
1348 (void) length, (void) good_data, (void) bad_data, (void) zbc;
1349 #endif
1350 return (rc);
1351 }
1352
1353 /*
1354 * The 'sysevent.fs.zfs.*' events are signals posted to notify user space of
1355 * change in the pool. All sysevents are listed in sys/sysevent/eventdefs.h
1356 * and are designed to be consumed by the ZFS Event Daemon (ZED). For
1357 * additional details refer to the zed(8) man page.
1358 */
1359 nvlist_t *
1360 zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name,
1361 nvlist_t *aux)
1362 {
1363 nvlist_t *resource = NULL;
1364 #ifdef _KERNEL
1365 char class[64];
1366
1367 if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
1368 return (NULL);
1369
1370 if ((resource = fm_nvlist_create(NULL)) == NULL)
1371 return (NULL);
1372
1373 (void) snprintf(class, sizeof (class), "%s.%s.%s", type,
1374 ZFS_ERROR_CLASS, name);
1375 VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION));
1376 VERIFY0(nvlist_add_string(resource, FM_CLASS, class));
1377 VERIFY0(nvlist_add_string(resource,
1378 FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa)));
1379 VERIFY0(nvlist_add_uint64(resource,
1380 FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)));
1381 VERIFY0(nvlist_add_uint64(resource,
1382 FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, spa_state(spa)));
1383 VERIFY0(nvlist_add_int32(resource,
1384 FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa)));
1385
1386 if (vd) {
1387 VERIFY0(nvlist_add_uint64(resource,
1388 FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid));
1389 VERIFY0(nvlist_add_uint64(resource,
1390 FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state));
1391 if (vd->vdev_path != NULL)
1392 VERIFY0(nvlist_add_string(resource,
1393 FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path));
1394 if (vd->vdev_devid != NULL)
1395 VERIFY0(nvlist_add_string(resource,
1396 FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid));
1397 if (vd->vdev_fru != NULL)
1398 VERIFY0(nvlist_add_string(resource,
1399 FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru));
1400 if (vd->vdev_enc_sysfs_path != NULL)
1401 VERIFY0(nvlist_add_string(resource,
1402 FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
1403 vd->vdev_enc_sysfs_path));
1404 }
1405
1406 /* also copy any optional payload data */
1407 if (aux) {
1408 nvpair_t *elem = NULL;
1409
1410 while ((elem = nvlist_next_nvpair(aux, elem)) != NULL)
1411 (void) nvlist_add_nvpair(resource, elem);
1412 }
1413 #else
1414 (void) spa, (void) vd, (void) type, (void) name, (void) aux;
1415 #endif
1416 return (resource);
1417 }
1418
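/*
 * Create and post a resource/sysevent nvlist in one step.
 */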
1419 static void
1420 zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name,
1421 nvlist_t *aux)
1422 {
1423 #ifdef _KERNEL
1424 nvlist_t *resource;
1425
1426 resource = zfs_event_create(spa, vd, type, name, aux);
1427 if (resource)
1428 zfs_zevent_post(resource, NULL, zfs_zevent_post_cb);
1429 #else
1430 (void) spa, (void) vd, (void) type, (void) name, (void) aux;
1431 #endif
1432 }
1433
1434 /*
1435 * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev
1436 * has been removed from the system. This will cause the DE to ignore any
1437 * recent I/O errors, inferring that they are due to the asynchronous device
1438 * removal.
1439 */
1440 void
1441 zfs_post_remove(spa_t *spa, vdev_t *vd, boolean_t by_kernel)
1442 {
1443 nvlist_t *aux = NULL;
1444
1445 if (by_kernel) {
1446 /*
1447 * Add optional supplemental keys to payload
1448 */
1449 aux = fm_nvlist_create(NULL);
1450 if (aux)
1451 fnvlist_add_boolean(aux, "by_kernel");
1452 }
1453
1454 zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, aux);
1455
1456 if (by_kernel && aux)
1457 fm_nvlist_destroy(aux, FM_NVA_FREE);
1458 }
1459
1460 /*
1461 * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool
1462 * has the 'autoreplace' property set, and therefore any broken vdevs will be
1463 * handled by higher level logic, and no vdev fault should be generated.
1464 */
1465 void
1466 zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
1467 {
1468 zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_AUTOREPLACE, NULL);
1469 }
1470
1471 /*
1472 * The 'resource.fs.zfs.statechange' event is an internal signal that the
1473 * given vdev has transitioned its state to DEGRADED or HEALTHY. This will
1474 * cause the retire agent to repair any outstanding fault management cases
1475 * open because the device was not found (fault.fs.zfs.device).
1476 */
1477 void
1478 zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate)
1479 {
1480 #ifdef _KERNEL
1481 nvlist_t *aux;
1482
1483 /*
1484 * Add optional supplemental keys to payload
1485 */
1486 aux = fm_nvlist_create(NULL);
1487 if (vd && aux) {
1488 if (vd->vdev_physpath) {
1489 fnvlist_add_string(aux,
1490 FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH,
1491 vd->vdev_physpath);
1492 }
1493 if (vd->vdev_enc_sysfs_path) {
1494 fnvlist_add_string(aux,
1495 FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
1496 vd->vdev_enc_sysfs_path);
1497 }
1498
1499 fnvlist_add_uint64(aux,
1500 FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate);
1501 }
1502
1503 zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_STATECHANGE,
1504 aux);
1505
1506 if (aux)
1507 fm_nvlist_destroy(aux, FM_NVA_FREE);
1508 #else
1509 (void) spa, (void) vd, (void) laststate;
1510 #endif
1511 }
1512
1513 #ifdef _KERNEL
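/*
 * Initialize the lock, list, and AVL tree used for duplicate ereport
 * detection.
 */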
1514 void
1515 zfs_ereport_init(void)
1516 {
1517 mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL);
1518 list_create(&recent_events_list, sizeof (recent_events_node_t),
1519 offsetof(recent_events_node_t, re_list_link));
1520 avl_create(&recent_events_tree, recent_events_compare,
1521 sizeof (recent_events_node_t), offsetof(recent_events_node_t,
1522 re_tree_link));
1523 }
1524
1525 /*
1526 * This 'early' fini needs to run before zfs_fini() which on Linux waits
1527 * for the system_delay_taskq to drain.
1528 */
1529 void
1530 zfs_ereport_taskq_fini(void)
1531 {
1532 mutex_enter(&recent_events_lock);
1533 if (recent_events_cleaner_tqid != 0) {
1534 taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid);
1535 recent_events_cleaner_tqid = 0;
1536 }
1537 mutex_exit(&recent_events_lock);
1538 }
1539
1540 void
1541 zfs_ereport_fini(void)
1542 {
1543 recent_events_node_t *entry;
1544
1545 while ((entry = list_remove_head(&recent_events_list)) != NULL) {
1546 avl_remove(&recent_events_tree, entry);
1547 kmem_free(entry, sizeof (*entry));
1548 }
1549 avl_destroy(&recent_events_tree);
1550 list_destroy(&recent_events_list);
1551 mutex_destroy(&recent_events_lock);
1552 }
1553
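/*
 * Post a resource event carrying a snapshot name in its payload.
 */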
1554 void
1555 zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, const char *name)
1556 {
1557 nvlist_t *aux;
1558
1559 aux = fm_nvlist_create(NULL);
1560 fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME, name);
1561
1562 zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux);
1563 fm_nvlist_destroy(aux, FM_NVA_FREE);
1564 }
1565
1566 /*
1567 * Post an event when a zvol is created or removed
1568 *
1569 * This is currently only used by macOS, since it uses the event to create
1570 * symlinks between the volume name (mypool/myvol) and the actual /dev
1571 * device (/dev/disk3). For example:
1572 *
1573 * /var/run/zfs/dsk/mypool/myvol -> /dev/disk3
1574 *
1575 * name: The full name of the zvol ("mypool/myvol")
1576 * dev_name: The full /dev name for the zvol ("/dev/disk3")
1577 * raw_name: The raw /dev name for the zvol ("/dev/rdisk3")
1578 */
1579 void
1580 zfs_ereport_zvol_post(const char *subclass, const char *name,
1581 const char *dev_name, const char *raw_name)
1582 {
1583 nvlist_t *aux;
1584 char *r;
1585
1586 boolean_t locked = spa_namespace_held();
1587 if (!locked) spa_namespace_enter(FTAG);
1588 spa_t *spa = spa_lookup(name);
1589 if (!locked) spa_namespace_exit(FTAG);
1590
1591 if (spa == NULL)
1592 return;
1593
1594 aux = fm_nvlist_create(NULL);
1595 fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME, dev_name);
1596 fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME,
1597 raw_name);
1598 r = strchr(name, '/');
1599 if (r && r[1])
1600 fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VOLUME, &r[1]);
1601
1602 zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux);
1603 fm_nvlist_destroy(aux, FM_NVA_FREE);
1604 }
1605
1606 EXPORT_SYMBOL(zfs_ereport_post);
1607 EXPORT_SYMBOL(zfs_ereport_is_valid);
1608 EXPORT_SYMBOL(zfs_ereport_post_checksum);
1609 EXPORT_SYMBOL(zfs_post_remove);
1610 EXPORT_SYMBOL(zfs_post_autoreplace);
1611 EXPORT_SYMBOL(zfs_post_state_change);
1612
1613 ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW,
1614 "Maximum recent zevents records to retain for duplicate checking");
1615 ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW,
1616 "Expiration time for recent zevents records");
1617 #endif /* _KERNEL */
1618