1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * Copyright (c) 2012,2021 by Delphix. All rights reserved.
29 */
30
31 #include <sys/spa.h>
32 #include <sys/spa_impl.h>
33 #include <sys/vdev.h>
34 #include <sys/vdev_impl.h>
35 #include <sys/zio.h>
36 #include <sys/zio_checksum.h>
37
38 #include <sys/fm/fs/zfs.h>
39 #include <sys/fm/protocol.h>
40 #include <sys/fm/util.h>
41 #include <sys/sysevent.h>
42
43 /*
44 * This general routine is responsible for generating all the different ZFS
45 * ereports. The payload is dependent on the class, and which arguments are
46 * supplied to the function:
47 *
 *	EREPORT			POOL	VDEV	IO
 *	block			X	X	X
 *	data			X	-	X
 *	device			X	X	-
 *	pool			X	-	-
53 *
54 * If we are in a loading state, all errors are chained together by the same
55 * SPA-wide ENA (Error Numeric Association).
56 *
57 * For isolated I/O requests, we get the ENA from the zio_t. The propagation
58 * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want
59 * to chain together all ereports associated with a logical piece of data. For
60 * read I/Os, there are basically three 'types' of I/O, which form a roughly
61 * layered diagram:
62 *
63 * +---------------+
64 * | Aggregate I/O | No associated logical data or device
65 * +---------------+
66 * |
67 * V
68 * +---------------+ Reads associated with a piece of logical data.
69 * | Read I/O | This includes reads on behalf of RAID-Z,
70 * +---------------+ mirrors, gang blocks, retries, etc.
71 * |
72 * V
73 * +---------------+ Reads associated with a particular device, but
74 * | Physical I/O | no logical data. Issued as part of vdev caching
75 * +---------------+ and I/O aggregation.
76 *
77 * Note that 'physical I/O' here is not the same terminology as used in the rest
78 * of ZIO. Typically, 'physical I/O' simply means that there is no attached
79 * blockpointer. But I/O with no associated block pointer can still be related
80 * to a logical piece of data (i.e. RAID-Z requests).
81 *
82 * Purely physical I/O always have unique ENAs. They are not related to a
83 * particular piece of logical data, and therefore cannot be chained together.
84 * We still generate an ereport, but the DE doesn't correlate it with any
85 * logical piece of data. When such an I/O fails, the delegated I/O requests
86 * will issue a retry, which will trigger the 'real' ereport with the correct
87 * ENA.
88 *
89 * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
90 * When a new logical I/O is issued, we set this to point to itself. Child I/Os
91 * then inherit this pointer, so that when it is first set subsequent failures
92 * will use the same ENA. For vdev cache fill and queue aggregation I/O,
93 * this pointer is set to NULL, and no ereport will be generated (since it
94 * doesn't actually correspond to any particular device or piece of data,
95 * and the caller will always retry without caching or queueing anyway).
96 *
97 * For checksum errors, we want to include more information about the actual
98 * error which occurs. Accordingly, we build an ereport when the error is
99 * noticed, but instead of sending it in immediately, we hang it off of the
100 * io_cksum_report field of the logical IO. When the logical IO completes
101 * (successfully or not), zfs_ereport_finish_checksum() is called with the
102 * good and bad versions of the buffer (if available), and we annotate the
103 * ereport with information about the differences.
104 */
105
106 #ifdef _KERNEL
107 /*
108 * Duplicate ereport Detection
109 *
110 * Some ereports are retained momentarily for detecting duplicates. These
111 * are kept in a recent_events_node_t in both a time-ordered list and an AVL
112 * tree of recent unique ereports.
113 *
114 * The lifespan of these recent ereports is bounded (15 mins) and a cleaner
115 * task is used to purge stale entries.
116 */
/* Recent ereports in time order: newest at the head, oldest at the tail. */
static list_t recent_events_list;
/* The same nodes, indexed by recent_events_compare() for duplicate lookup. */
static avl_tree_t recent_events_tree;
/* Held while the list, the tree, or recent_events_cleaner_tqid is touched. */
static kmutex_t recent_events_lock;
/* Id of the pending cleaner task; 0 means no cleaner is scheduled. */
static taskqid_t recent_events_cleaner_tqid;
121
/*
 * Maximum number of recent ereports retained for duplicate detection.
 * Once the tree holds this many nodes the oldest one is recycled.
 *
 * Each node is about 128 bytes so 2,000 would consume 1/4 MiB.
 *
 * This setting can be changed dynamically and setting it to zero
 * disables duplicate detection.
 */
static unsigned int zfs_zevent_retain_max = 2000;
129
/*
 * The lifespan for a recent ereport entry. The default of 15 minutes is
 * intended to outlive the zfs diagnosis engine's threshold of 10 errors
 * over a period of 10 minutes.
 *
 * The value is in seconds. It is compared against an entry's age and is
 * also used (plus one second) as the delay when scheduling the cleaner.
 */
static unsigned int zfs_zevent_retain_expire_secs = 900;
136
/*
 * Compact form of the ereport subclasses that take part in duplicate
 * detection; stored in recent_events_node_t instead of the class string.
 */
typedef enum zfs_subclass {
	ZSC_IO,		/* FM_EREPORT_ZFS_IO */
	ZSC_DATA,	/* FM_EREPORT_ZFS_DATA */
	ZSC_CHECKSUM	/* FM_EREPORT_ZFS_CHECKSUM */
} zfs_subclass_t;
142
/*
 * One remembered ereport. The "criteria" members form the key compared by
 * recent_events_compare(); the "internal state" members are bookkeeping and
 * are not part of the key.
 */
typedef struct {
	/* common criteria */
	uint64_t	re_pool_guid;
	uint64_t	re_vdev_guid;
	int		re_io_error;	/* zio->io_error */
	/* caller-supplied size/offset when size != 0, else from the zio */
	uint64_t	re_io_size;
	uint64_t	re_io_offset;
	zfs_subclass_t	re_subclass;
	zio_priority_t	re_io_priority;	/* zio->io_priority */

	/* logical zio criteria (optional) */
	zbookmark_phys_t re_io_bookmark;	/* left zeroed when no bookmark */

	/* internal state */
	avl_node_t	re_tree_link;	/* linkage in recent_events_tree */
	list_node_t	re_list_link;	/* linkage in recent_events_list */
	uint64_t	re_timestamp;	/* gethrtime() when added/last matched */
} recent_events_node_t;
161
162 static int
recent_events_compare(const void * a,const void * b)163 recent_events_compare(const void *a, const void *b)
164 {
165 const recent_events_node_t *node1 = a;
166 const recent_events_node_t *node2 = b;
167 int cmp;
168
169 /*
170 * The comparison order here is somewhat arbitrary.
171 * What's important is that if every criteria matches, then it
172 * is a duplicate (i.e. compare returns 0)
173 */
174 if ((cmp = TREE_CMP(node1->re_subclass, node2->re_subclass)) != 0)
175 return (cmp);
176 if ((cmp = TREE_CMP(node1->re_pool_guid, node2->re_pool_guid)) != 0)
177 return (cmp);
178 if ((cmp = TREE_CMP(node1->re_vdev_guid, node2->re_vdev_guid)) != 0)
179 return (cmp);
180 if ((cmp = TREE_CMP(node1->re_io_error, node2->re_io_error)) != 0)
181 return (cmp);
182 if ((cmp = TREE_CMP(node1->re_io_priority, node2->re_io_priority)) != 0)
183 return (cmp);
184 if ((cmp = TREE_CMP(node1->re_io_size, node2->re_io_size)) != 0)
185 return (cmp);
186 if ((cmp = TREE_CMP(node1->re_io_offset, node2->re_io_offset)) != 0)
187 return (cmp);
188
189 const zbookmark_phys_t *zb1 = &node1->re_io_bookmark;
190 const zbookmark_phys_t *zb2 = &node2->re_io_bookmark;
191
192 if ((cmp = TREE_CMP(zb1->zb_objset, zb2->zb_objset)) != 0)
193 return (cmp);
194 if ((cmp = TREE_CMP(zb1->zb_object, zb2->zb_object)) != 0)
195 return (cmp);
196 if ((cmp = TREE_CMP(zb1->zb_level, zb2->zb_level)) != 0)
197 return (cmp);
198 if ((cmp = TREE_CMP(zb1->zb_blkid, zb2->zb_blkid)) != 0)
199 return (cmp);
200
201 return (0);
202 }
203
204 /*
205 * workaround: vdev properties don't have inheritance
206 */
207 static uint64_t
vdev_prop_get_inherited(vdev_t * vd,vdev_prop_t prop)208 vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop)
209 {
210 uint64_t propdef, propval;
211
212 propdef = vdev_prop_default_numeric(prop);
213 switch (prop) {
214 case VDEV_PROP_CHECKSUM_N:
215 propval = vd->vdev_checksum_n;
216 break;
217 case VDEV_PROP_CHECKSUM_T:
218 propval = vd->vdev_checksum_t;
219 break;
220 case VDEV_PROP_IO_N:
221 propval = vd->vdev_io_n;
222 break;
223 case VDEV_PROP_IO_T:
224 propval = vd->vdev_io_t;
225 break;
226 case VDEV_PROP_SLOW_IO_N:
227 propval = vd->vdev_slow_io_n;
228 break;
229 case VDEV_PROP_SLOW_IO_T:
230 propval = vd->vdev_slow_io_t;
231 break;
232 default:
233 propval = propdef;
234 break;
235 }
236
237 if (propval != propdef)
238 return (propval);
239
240 if (vd->vdev_parent == NULL)
241 return (propdef);
242
243 return (vdev_prop_get_inherited(vd->vdev_parent, prop));
244 }
245
/* Forward declaration: zfs_ereport_cleaner() re-arms itself through this. */
static void zfs_ereport_schedule_cleaner(void);
247
248 /*
249 * background task to clean stale recent event nodes.
250 */
251 static void
zfs_ereport_cleaner(void * arg)252 zfs_ereport_cleaner(void *arg)
253 {
254 recent_events_node_t *entry;
255 uint64_t now = gethrtime();
256
257 /*
258 * purge expired entries
259 */
260 mutex_enter(&recent_events_lock);
261 while ((entry = list_tail(&recent_events_list)) != NULL) {
262 uint64_t age = NSEC2SEC(now - entry->re_timestamp);
263 if (age <= zfs_zevent_retain_expire_secs)
264 break;
265
266 /* remove expired node */
267 avl_remove(&recent_events_tree, entry);
268 list_remove(&recent_events_list, entry);
269 kmem_free(entry, sizeof (*entry));
270 }
271
272 /* Restart the cleaner if more entries remain */
273 recent_events_cleaner_tqid = 0;
274 if (!list_is_empty(&recent_events_list))
275 zfs_ereport_schedule_cleaner();
276
277 mutex_exit(&recent_events_lock);
278 }
279
280 static void
zfs_ereport_schedule_cleaner(void)281 zfs_ereport_schedule_cleaner(void)
282 {
283 ASSERT(MUTEX_HELD(&recent_events_lock));
284
285 uint64_t timeout = SEC2NSEC(zfs_zevent_retain_expire_secs + 1);
286
287 recent_events_cleaner_tqid = taskq_dispatch_delay(
288 system_delay_taskq, zfs_ereport_cleaner, NULL, TQ_SLEEP,
289 ddi_get_lbolt() + NSEC_TO_TICK(timeout));
290 }
291
292 /*
293 * Clear entries for a given vdev or all vdevs in a pool when vdev == NULL
294 */
295 void
zfs_ereport_clear(spa_t * spa,vdev_t * vd)296 zfs_ereport_clear(spa_t *spa, vdev_t *vd)
297 {
298 uint64_t vdev_guid, pool_guid;
299
300 ASSERT(vd != NULL || spa != NULL);
301 if (vd == NULL) {
302 vdev_guid = 0;
303 pool_guid = spa_guid(spa);
304 } else {
305 vdev_guid = vd->vdev_guid;
306 pool_guid = 0;
307 }
308
309 mutex_enter(&recent_events_lock);
310
311 recent_events_node_t *next = list_head(&recent_events_list);
312 while (next != NULL) {
313 recent_events_node_t *entry = next;
314
315 next = list_next(&recent_events_list, next);
316
317 if (entry->re_vdev_guid == vdev_guid ||
318 entry->re_pool_guid == pool_guid) {
319 avl_remove(&recent_events_tree, entry);
320 list_remove(&recent_events_list, entry);
321 kmem_free(entry, sizeof (*entry));
322 }
323 }
324
325 mutex_exit(&recent_events_lock);
326 }
327
328 /*
329 * Check if an ereport would be a duplicate of one recently posted.
330 *
331 * An ereport is considered a duplicate if the set of criteria in
332 * recent_events_node_t all match.
333 *
334 * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM
335 * are candidates for duplicate checking.
336 */
337 static boolean_t
zfs_ereport_is_duplicate(const char * subclass,spa_t * spa,vdev_t * vd,const zbookmark_phys_t * zb,zio_t * zio,uint64_t offset,uint64_t size)338 zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd,
339 const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size)
340 {
341 recent_events_node_t search = {0}, *entry;
342
343 if (vd == NULL || zio == NULL)
344 return (B_FALSE);
345
346 if (zfs_zevent_retain_max == 0)
347 return (B_FALSE);
348
349 if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0)
350 search.re_subclass = ZSC_IO;
351 else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0)
352 search.re_subclass = ZSC_DATA;
353 else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0)
354 search.re_subclass = ZSC_CHECKSUM;
355 else
356 return (B_FALSE);
357
358 search.re_pool_guid = spa_guid(spa);
359 search.re_vdev_guid = vd->vdev_guid;
360 search.re_io_error = zio->io_error;
361 search.re_io_priority = zio->io_priority;
362 /* if size is supplied use it over what's in zio */
363 if (size) {
364 search.re_io_size = size;
365 search.re_io_offset = offset;
366 } else {
367 search.re_io_size = zio->io_size;
368 search.re_io_offset = zio->io_offset;
369 }
370
371 /* grab optional logical zio criteria */
372 if (zb != NULL) {
373 search.re_io_bookmark.zb_objset = zb->zb_objset;
374 search.re_io_bookmark.zb_object = zb->zb_object;
375 search.re_io_bookmark.zb_level = zb->zb_level;
376 search.re_io_bookmark.zb_blkid = zb->zb_blkid;
377 }
378
379 uint64_t now = gethrtime();
380
381 mutex_enter(&recent_events_lock);
382
383 /* check if we have seen this one recently */
384 entry = avl_find(&recent_events_tree, &search, NULL);
385 if (entry != NULL) {
386 uint64_t age = NSEC2SEC(now - entry->re_timestamp);
387
388 /*
389 * There is still an active cleaner (since we're here).
390 * Reset the last seen time for this duplicate entry
391 * so that its lifespand gets extended.
392 */
393 list_remove(&recent_events_list, entry);
394 list_insert_head(&recent_events_list, entry);
395 entry->re_timestamp = now;
396
397 zfs_zevent_track_duplicate();
398 mutex_exit(&recent_events_lock);
399
400 return (age <= zfs_zevent_retain_expire_secs);
401 }
402
403 if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) {
404 /* recycle oldest node */
405 entry = list_tail(&recent_events_list);
406 ASSERT(entry != NULL);
407 list_remove(&recent_events_list, entry);
408 avl_remove(&recent_events_tree, entry);
409 } else {
410 entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP);
411 }
412
413 /* record this as a recent ereport */
414 *entry = search;
415 avl_add(&recent_events_tree, entry);
416 list_insert_head(&recent_events_list, entry);
417 entry->re_timestamp = now;
418
419 /* Start a cleaner if not already scheduled */
420 if (recent_events_cleaner_tqid == 0)
421 zfs_ereport_schedule_cleaner();
422
423 mutex_exit(&recent_events_lock);
424 return (B_FALSE);
425 }
426
427 void
zfs_zevent_post_cb(nvlist_t * nvl,nvlist_t * detector)428 zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector)
429 {
430 if (nvl)
431 fm_nvlist_destroy(nvl, FM_NVA_FREE);
432
433 if (detector)
434 fm_nvlist_destroy(detector, FM_NVA_FREE);
435 }
436
437 /*
438 * We want to rate limit ZIO delay, deadman, and checksum events so as to not
439 * flood zevent consumers when a disk is acting up.
440 *
441 * Returns 1 if we're ratelimiting, 0 if not.
442 */
443 static int
zfs_is_ratelimiting_event(const char * subclass,vdev_t * vd)444 zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd)
445 {
446 int rc = 0;
447 /*
448 * zfs_ratelimit() returns 1 if we're *not* ratelimiting and 0 if we
449 * are. Invert it to get our return value.
450 */
451 if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
452 rc = !zfs_ratelimit(&vd->vdev_delay_rl);
453 } else if (strcmp(subclass, FM_EREPORT_ZFS_DEADMAN) == 0) {
454 rc = !zfs_ratelimit(&vd->vdev_deadman_rl);
455 } else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) {
456 rc = !zfs_ratelimit(&vd->vdev_checksum_rl);
457 }
458
459 if (rc) {
460 /* We're rate limiting */
461 fm_erpt_dropped_increment();
462 }
463
464 return (rc);
465 }
466
/*
 * Build (but do not post) an ereport and its detector FMRI.
 *
 * ereport_out, detector_out	receive the newly created nvlists on success
 * subclass			ereport subclass, appended to ZFS_ERROR_CLASS
 * spa				pool the event belongs to; must be non-NULL
 * vd				optional vdev; adds the vdev/parent/spare payload
 * zb				optional bookmark for the logical payload
 * zio				optional zio; adds the I/O payload and ENA source
 * stateoroffset		I/O offset when 'zio' and 'size' are given, or
 *				the previous vdev state when there is no zio
 * size				non-zero when the caller supplies offset/size
 *
 * Return B_TRUE if the ereport was built, B_FALSE if either nvlist could
 * not be created (in which case the out parameters are not written).
 */
static boolean_t
zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
    const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
    zio_t *zio, uint64_t stateoroffset, uint64_t size)
{
	nvlist_t *ereport, *detector;

	uint64_t ena;
	char class[64];

	if ((ereport = fm_nvlist_create(NULL)) == NULL)
		return (B_FALSE);

	if ((detector = fm_nvlist_create(NULL)) == NULL) {
		/* don't leak the first nvlist when the second one fails */
		fm_nvlist_destroy(ereport, FM_NVA_FREE);
		return (B_FALSE);
	}

	/*
	 * Serialize ereport generation
	 */
	mutex_enter(&spa->spa_errlist_lock);

	/*
	 * Determine the ENA to use for this event. If we are in a loading
	 * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use
	 * a root zio-wide ENA. Otherwise, simply use a unique ENA.
	 */
	if (spa_load_state(spa) != SPA_LOAD_NONE) {
		/* generated lazily, then shared by later ereports */
		if (spa->spa_ena == 0)
			spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
		ena = spa->spa_ena;
	} else if (zio != NULL && zio->io_logical != NULL) {
		/* generated lazily on the logical zio and reused thereafter */
		if (zio->io_logical->io_ena == 0)
			zio->io_logical->io_ena =
			    fm_ena_generate(0, FM_ENA_FMT1);
		ena = zio->io_logical->io_ena;
	} else {
		ena = fm_ena_generate(0, FM_ENA_FMT1);
	}

	/*
	 * Construct the full class, detector, and other standard FMA fields.
	 */
	(void) snprintf(class, sizeof (class), "%s.%s",
	    ZFS_ERROR_CLASS, subclass);

	/* a vdev guid of 0 is used when the event has no vdev */
	fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
	    vd != NULL ? vd->vdev_guid : 0);

	fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);

	/*
	 * Construct the per-ereport payload, depending on which parameters are
	 * passed in.
	 */

	/*
	 * Generic payload members common to all ereports.
	 */
	fm_payload_set(ereport,
	    FM_EREPORT_PAYLOAD_ZFS_POOL, DATA_TYPE_STRING, spa_name(spa),
	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa),
	    FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, DATA_TYPE_UINT64,
	    (uint64_t)spa_state(spa),
	    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
	    (int32_t)spa_load_state(spa), NULL);

	/* any failmode other than WAIT or CONTINUE is reported as PANIC */
	fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
	    DATA_TYPE_STRING,
	    spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
	    FM_EREPORT_FAILMODE_WAIT :
	    spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
	    FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC,
	    NULL);

	/*
	 * Payload describing the vdev, its parent, and the pool's spares.
	 */
	if (vd != NULL) {
		vdev_t *pvd = vd->vdev_parent;
		vdev_queue_t *vq = &vd->vdev_queue;
		vdev_stat_t *vs = &vd->vdev_stat;
		vdev_t *spare_vd;
		uint64_t *spare_guids;
		char **spare_paths;
		int i, spare_count;

		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
		    DATA_TYPE_UINT64, vd->vdev_guid,
		    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
		    DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
		/* the optional identity strings are added only when set */
		if (vd->vdev_path != NULL)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
			    DATA_TYPE_STRING, vd->vdev_path, NULL);
		if (vd->vdev_devid != NULL)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
			    DATA_TYPE_STRING, vd->vdev_devid, NULL);
		if (vd->vdev_fru != NULL)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
			    DATA_TYPE_STRING, vd->vdev_fru, NULL);
		if (vd->vdev_enc_sysfs_path != NULL)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
			    DATA_TYPE_STRING, vd->vdev_enc_sysfs_path, NULL);
		if (vd->vdev_ashift)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT,
			    DATA_TYPE_UINT64, vd->vdev_ashift, NULL);

		/*
		 * NOTE(review): vq and vs are addresses of members embedded
		 * in *vd, so these two NULL checks are always true.
		 */
		if (vq != NULL) {
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS,
			    DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL);
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS,
			    DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL);
		}

		if (vs != NULL) {
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS,
			    DATA_TYPE_UINT64, vs->vs_read_errors,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS,
			    DATA_TYPE_UINT64, vs->vs_write_errors,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS,
			    DATA_TYPE_UINT64, vs->vs_checksum_errors,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS,
			    DATA_TYPE_UINT64, vs->vs_slow_ios,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DIO_VERIFY_ERRORS,
			    DATA_TYPE_UINT64, vs->vs_dio_verify_errors,
			    NULL);
		}

		if (pvd != NULL) {
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
			    DATA_TYPE_UINT64, pvd->vdev_guid,
			    FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
			    DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
			    NULL);
			if (pvd->vdev_path)
				fm_payload_set(ereport,
				    FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
				    DATA_TYPE_STRING, pvd->vdev_path, NULL);
			if (pvd->vdev_devid)
				fm_payload_set(ereport,
				    FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
				    DATA_TYPE_STRING, pvd->vdev_devid, NULL);
		}

		/*
		 * Collect the path and guid of every spare into temporary
		 * arrays. The arrays are zero-filled, so a slot whose spare
		 * vdev is NULL keeps a NULL path and a guid of 0. The path
		 * pointers are borrowed from the spare vdevs, not copied.
		 */
		spare_count = spa->spa_spares.sav_count;
		spare_paths = kmem_zalloc(sizeof (char *) * spare_count,
		    KM_SLEEP);
		spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count,
		    KM_SLEEP);

		for (i = 0; i < spare_count; i++) {
			spare_vd = spa->spa_spares.sav_vdevs[i];
			if (spare_vd) {
				spare_paths[i] = spare_vd->vdev_path;
				spare_guids[i] = spare_vd->vdev_guid;
			}
		}

		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS,
		    DATA_TYPE_STRING_ARRAY, spare_count, spare_paths,
		    FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS,
		    DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL);

		/* only the temporary arrays are freed, not the path strings */
		kmem_free(spare_guids, sizeof (uint64_t) * spare_count);
		kmem_free(spare_paths, sizeof (char *) * spare_count);
	}

	if (zio != NULL) {
		/*
		 * Payload common to all I/Os.
		 */
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
		    DATA_TYPE_INT32, zio->io_error, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS,
		    DATA_TYPE_UINT64, zio->io_flags, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE,
		    DATA_TYPE_UINT32, zio->io_stage, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE,
		    DATA_TYPE_UINT32, zio->io_pipeline, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY,
		    DATA_TYPE_UINT64, zio->io_delay, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP,
		    DATA_TYPE_UINT64, zio->io_timestamp, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA,
		    DATA_TYPE_UINT64, zio->io_delta, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY,
		    DATA_TYPE_UINT32, zio->io_priority, NULL);

		/*
		 * If the 'size' parameter is non-zero, it indicates this is a
		 * RAID-Z or other I/O where the physical offset and length are
		 * provided for us, instead of within the zio_t.
		 */
		if (vd != NULL) {
			if (size)
				fm_payload_set(ereport,
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
				    DATA_TYPE_UINT64, stateoroffset,
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
				    DATA_TYPE_UINT64, size, NULL);
			else
				fm_payload_set(ereport,
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
				    DATA_TYPE_UINT64, zio->io_offset,
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
				    DATA_TYPE_UINT64, zio->io_size, NULL);
		}
	} else if (vd != NULL) {
		/*
		 * If we have a vdev but no zio, this is a device fault, and the
		 * 'stateoroffset' parameter indicates the previous state of the
		 * vdev.
		 */
		fm_payload_set(ereport,
		    FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
		    DATA_TYPE_UINT64, stateoroffset, NULL);
	}

	/*
	 * Payload for I/Os with corresponding logical information.
	 * A bookmark is reported when there is no zio at all, or when the
	 * zio is tied to a logical I/O.
	 */
	if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) {
		fm_payload_set(ereport,
		    FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
		    DATA_TYPE_UINT64, zb->zb_objset,
		    FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
		    DATA_TYPE_UINT64, zb->zb_object,
		    FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
		    DATA_TYPE_INT64, zb->zb_level,
		    FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
		    DATA_TYPE_UINT64, zb->zb_blkid, NULL);
	}

	/*
	 * Payload for tuning the zed
	 *
	 * For checksum, io and delay ereports the matching N/T vdev
	 * properties are included, but only when the value resolved through
	 * the vdev's ancestors differs from the property default.
	 */
	if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) {
		uint64_t cksum_n, cksum_t;

		cksum_n = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_N);
		if (cksum_n != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N))
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N,
			    DATA_TYPE_UINT64,
			    cksum_n,
			    NULL);

		cksum_t = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_T);
		if (cksum_t != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T))
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T,
			    DATA_TYPE_UINT64,
			    cksum_t,
			    NULL);
	}

	if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) {
		uint64_t io_n, io_t;

		io_n = vdev_prop_get_inherited(vd, VDEV_PROP_IO_N);
		if (io_n != vdev_prop_default_numeric(VDEV_PROP_IO_N))
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N,
			    DATA_TYPE_UINT64,
			    io_n,
			    NULL);

		io_t = vdev_prop_get_inherited(vd, VDEV_PROP_IO_T);
		if (io_t != vdev_prop_default_numeric(VDEV_PROP_IO_T))
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T,
			    DATA_TYPE_UINT64,
			    io_t,
			    NULL);
	}

	if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
		uint64_t slow_io_n, slow_io_t;

		slow_io_n = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_N);
		if (slow_io_n != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N))
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N,
			    DATA_TYPE_UINT64,
			    slow_io_n,
			    NULL);

		slow_io_t = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_T);
		if (slow_io_t != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T))
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T,
			    DATA_TYPE_UINT64,
			    slow_io_t,
			    NULL);
	}

	mutex_exit(&spa->spa_errlist_lock);

	/* ownership of both nvlists passes to the caller */
	*ereport_out = ereport;
	*detector_out = detector;
	return (B_TRUE);
}
779
/*
 * if it's <= 128 bytes, save the corruption directly
 *
 * The limit is expressed as a count of uint64_t words, which is also the
 * element count of the inline bit arrays in zfs_ecksum_info_t.
 */
#define	ZFM_MAX_INLINE		(128 / sizeof (uint64_t))

/* Capacity of the per-ereport range arrays in zfs_ecksum_info_t. */
#define	MAX_RANGES		16
784
/*
 * Summary of how a bad buffer differs from the good one, built by
 * annotate_ecksum().
 */
typedef struct zfs_ecksum_info {
	/*
	 * inline arrays of bits set and cleared: per differing word, the
	 * bits set in bad but not in good, and set in good but not in bad.
	 * Filled in only when all differing words fit in ZFM_MAX_INLINE.
	 */
	uint64_t zei_bits_set[ZFM_MAX_INLINE];
	uint64_t zei_bits_cleared[ZFM_MAX_INLINE];

	/*
	 * for each range, the number of bits set and cleared.  The Hamming
	 * distance between the good and bad buffers is the sum of them all.
	 */
	uint32_t zei_range_sets[MAX_RANGES];
	uint32_t zei_range_clears[MAX_RANGES];

	/*
	 * Differing ranges as [zr_start, zr_end). They hold uint64_t word
	 * indices while being built and are converted to byte offsets
	 * before being placed in the ereport.
	 */
	struct zei_ranges {
		uint32_t	zr_start;
		uint32_t	zr_end;
	} zei_ranges[MAX_RANGES];

	size_t	zei_range_count;	/* entries in use in the arrays above */
	uint32_t zei_mingap;		/* smallest gap between adjacent ranges */
	uint32_t zei_allowed_mingap;	/* ranges closer than this get merged */

} zfs_ecksum_info_t;
807
/*
 * Add the number of 1 bits in 'value_arg' to '*count'.
 *
 * A population count does not depend on byte order, so no endian
 * conversion of the value is needed before counting.
 */
static void
update_bad_bits(uint64_t value_arg, uint32_t *count)
{
	uint32_t bits = 0;

	/* Each iteration clears the lowest set bit. */
	while (value_arg != 0) {
		value_arg &= value_arg - 1;
		bits++;
	}

	/* update the count of bits changed */
	*count += bits;
}
823
824 /*
825 * We've now filled up the range array, and need to increase "mingap" and
826 * shrink the range list accordingly. zei_mingap is always the smallest
827 * distance between array entries, so we set the new_allowed_gap to be
828 * one greater than that. We then go through the list, joining together
829 * any ranges which are closer than the new_allowed_gap.
830 *
831 * By construction, there will be at least one. We also update zei_mingap
832 * to the new smallest gap, to prepare for our next invocation.
833 */
834 static void
zei_shrink_ranges(zfs_ecksum_info_t * eip)835 zei_shrink_ranges(zfs_ecksum_info_t *eip)
836 {
837 uint32_t mingap = UINT32_MAX;
838 uint32_t new_allowed_gap = eip->zei_mingap + 1;
839
840 size_t idx, output;
841 size_t max = eip->zei_range_count;
842
843 struct zei_ranges *r = eip->zei_ranges;
844
845 ASSERT3U(eip->zei_range_count, >, 0);
846 ASSERT3U(eip->zei_range_count, <=, MAX_RANGES);
847
848 output = idx = 0;
849 while (idx < max - 1) {
850 uint32_t start = r[idx].zr_start;
851 uint32_t end = r[idx].zr_end;
852
853 while (idx < max - 1) {
854 idx++;
855
856 uint32_t nstart = r[idx].zr_start;
857 uint32_t nend = r[idx].zr_end;
858
859 uint32_t gap = nstart - end;
860 if (gap < new_allowed_gap) {
861 end = nend;
862 continue;
863 }
864 if (gap < mingap)
865 mingap = gap;
866 break;
867 }
868 r[output].zr_start = start;
869 r[output].zr_end = end;
870 output++;
871 }
872 ASSERT3U(output, <, eip->zei_range_count);
873 eip->zei_range_count = output;
874 eip->zei_mingap = mingap;
875 eip->zei_allowed_mingap = new_allowed_gap;
876 }
877
878 static void
zei_add_range(zfs_ecksum_info_t * eip,int start,int end)879 zei_add_range(zfs_ecksum_info_t *eip, int start, int end)
880 {
881 struct zei_ranges *r = eip->zei_ranges;
882 size_t count = eip->zei_range_count;
883
884 if (count >= MAX_RANGES) {
885 zei_shrink_ranges(eip);
886 count = eip->zei_range_count;
887 }
888 if (count == 0) {
889 eip->zei_mingap = UINT32_MAX;
890 eip->zei_allowed_mingap = 1;
891 } else {
892 int gap = start - r[count - 1].zr_end;
893
894 if (gap < eip->zei_allowed_mingap) {
895 r[count - 1].zr_end = end;
896 return;
897 }
898 if (gap < eip->zei_mingap)
899 eip->zei_mingap = gap;
900 }
901 r[count].zr_start = start;
902 r[count].zr_end = end;
903 eip->zei_range_count++;
904 }
905
906 static size_t
zei_range_total_size(zfs_ecksum_info_t * eip)907 zei_range_total_size(zfs_ecksum_info_t *eip)
908 {
909 struct zei_ranges *r = eip->zei_ranges;
910 size_t count = eip->zei_range_count;
911 size_t result = 0;
912 size_t idx;
913
914 for (idx = 0; idx < count; idx++)
915 result += (r[idx].zr_end - r[idx].zr_start);
916
917 return (result);
918 }
919
920 static zfs_ecksum_info_t *
annotate_ecksum(nvlist_t * ereport,zio_bad_cksum_t * info,const abd_t * goodabd,const abd_t * badabd,size_t size,boolean_t drop_if_identical)921 annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
922 const abd_t *goodabd, const abd_t *badabd, size_t size,
923 boolean_t drop_if_identical)
924 {
925 const uint64_t *good;
926 const uint64_t *bad;
927
928 size_t nui64s = size / sizeof (uint64_t);
929
930 size_t inline_size;
931 int no_inline = 0;
932 size_t idx;
933 size_t range;
934
935 size_t offset = 0;
936 ssize_t start = -1;
937
938 zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP);
939
940 /* don't do any annotation for injected checksum errors */
941 if (info != NULL && info->zbc_injected)
942 return (eip);
943
944 if (info != NULL && info->zbc_has_cksum) {
945 fm_payload_set(ereport,
946 FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
947 DATA_TYPE_STRING,
948 info->zbc_checksum_name,
949 NULL);
950
951 if (info->zbc_byteswapped) {
952 fm_payload_set(ereport,
953 FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP,
954 DATA_TYPE_BOOLEAN, 1,
955 NULL);
956 }
957 }
958
959 if (badabd == NULL || goodabd == NULL)
960 return (eip);
961
962 ASSERT3U(nui64s, <=, UINT32_MAX);
963 ASSERT3U(size, ==, nui64s * sizeof (uint64_t));
964 ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
965 ASSERT3U(size, <=, UINT32_MAX);
966
967 good = (const uint64_t *) abd_borrow_buf_copy((abd_t *)goodabd, size);
968 bad = (const uint64_t *) abd_borrow_buf_copy((abd_t *)badabd, size);
969
970 /* build up the range list by comparing the two buffers. */
971 for (idx = 0; idx < nui64s; idx++) {
972 if (good[idx] == bad[idx]) {
973 if (start == -1)
974 continue;
975
976 zei_add_range(eip, start, idx);
977 start = -1;
978 } else {
979 if (start != -1)
980 continue;
981
982 start = idx;
983 }
984 }
985 if (start != -1)
986 zei_add_range(eip, start, idx);
987
988 /* See if it will fit in our inline buffers */
989 inline_size = zei_range_total_size(eip);
990 if (inline_size > ZFM_MAX_INLINE)
991 no_inline = 1;
992
993 /*
994 * If there is no change and we want to drop if the buffers are
995 * identical, do so.
996 */
997 if (inline_size == 0 && drop_if_identical) {
998 kmem_free(eip, sizeof (*eip));
999 abd_return_buf((abd_t *)goodabd, (void *)good, size);
1000 abd_return_buf((abd_t *)badabd, (void *)bad, size);
1001 return (NULL);
1002 }
1003
1004 /*
1005 * Now walk through the ranges, filling in the details of the
1006 * differences. Also convert our uint64_t-array offsets to byte
1007 * offsets.
1008 */
1009 for (range = 0; range < eip->zei_range_count; range++) {
1010 size_t start = eip->zei_ranges[range].zr_start;
1011 size_t end = eip->zei_ranges[range].zr_end;
1012
1013 for (idx = start; idx < end; idx++) {
1014 uint64_t set, cleared;
1015
1016 // bits set in bad, but not in good
1017 set = ((~good[idx]) & bad[idx]);
1018 // bits set in good, but not in bad
1019 cleared = (good[idx] & (~bad[idx]));
1020
1021 if (!no_inline) {
1022 ASSERT3U(offset, <, inline_size);
1023 eip->zei_bits_set[offset] = set;
1024 eip->zei_bits_cleared[offset] = cleared;
1025 offset++;
1026 }
1027
1028 update_bad_bits(set, &eip->zei_range_sets[range]);
1029 update_bad_bits(cleared, &eip->zei_range_clears[range]);
1030 }
1031
1032 /* convert to byte offsets */
1033 eip->zei_ranges[range].zr_start *= sizeof (uint64_t);
1034 eip->zei_ranges[range].zr_end *= sizeof (uint64_t);
1035 }
1036
1037 abd_return_buf((abd_t *)goodabd, (void *)good, size);
1038 abd_return_buf((abd_t *)badabd, (void *)bad, size);
1039
1040 eip->zei_allowed_mingap *= sizeof (uint64_t);
1041 inline_size *= sizeof (uint64_t);
1042
1043 /* fill in ereport */
1044 fm_payload_set(ereport,
1045 FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES,
1046 DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count,
1047 (uint32_t *)eip->zei_ranges,
1048 FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP,
1049 DATA_TYPE_UINT32, eip->zei_allowed_mingap,
1050 FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS,
1051 DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets,
1052 FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS,
1053 DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears,
1054 NULL);
1055
1056 if (!no_inline) {
1057 fm_payload_set(ereport,
1058 FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS,
1059 DATA_TYPE_UINT8_ARRAY,
1060 inline_size, (uint8_t *)eip->zei_bits_set,
1061 FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS,
1062 DATA_TYPE_UINT8_ARRAY,
1063 inline_size, (uint8_t *)eip->zei_bits_cleared,
1064 NULL);
1065 }
1066 return (eip);
1067 }
1068 #else
1069 void
zfs_ereport_clear(spa_t * spa,vdev_t * vd)1070 zfs_ereport_clear(spa_t *spa, vdev_t *vd)
1071 {
1072 (void) spa, (void) vd;
1073 }
1074 #endif
1075
1076 /*
1077 * Make sure our event is still valid for the given zio/vdev/pool. For example,
1078 * we don't want to keep logging events for a faulted or missing vdev.
1079 */
1080 boolean_t
zfs_ereport_is_valid(const char * subclass,spa_t * spa,vdev_t * vd,zio_t * zio)1081 zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
1082 {
1083 #ifdef _KERNEL
1084 /*
1085 * If we are doing a spa_tryimport() or in recovery mode,
1086 * ignore errors.
1087 */
1088 if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
1089 spa_load_state(spa) == SPA_LOAD_RECOVER)
1090 return (B_FALSE);
1091
1092 /*
1093 * If we are in the middle of opening a pool, and the previous attempt
1094 * failed, don't bother logging any new ereports - we're just going to
1095 * get the same diagnosis anyway.
1096 */
1097 if (spa_load_state(spa) != SPA_LOAD_NONE &&
1098 spa->spa_last_open_failed)
1099 return (B_FALSE);
1100
1101 if (zio != NULL) {
1102 /* If this is not a read or write zio, ignore the error */
1103 if (zio->io_type != ZIO_TYPE_READ &&
1104 zio->io_type != ZIO_TYPE_WRITE)
1105 return (B_FALSE);
1106
1107 if (vd != NULL) {
1108 /*
1109 * If the vdev has already been marked as failing due
1110 * to a failed probe, then ignore any subsequent I/O
1111 * errors, as the DE will automatically fault the vdev
1112 * on the first such failure. This also catches cases
1113 * where vdev_remove_wanted is set and the device has
1114 * not yet been asynchronously placed into the REMOVED
1115 * state.
1116 */
1117 if (zio->io_vd == vd && !vdev_accessible(vd, zio))
1118 return (B_FALSE);
1119
1120 /*
1121 * Ignore checksum errors for reads from DTL regions of
1122 * leaf vdevs.
1123 */
1124 if (zio->io_type == ZIO_TYPE_READ &&
1125 zio->io_error == ECKSUM &&
1126 vd->vdev_ops->vdev_op_leaf &&
1127 vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
1128 return (B_FALSE);
1129 }
1130 }
1131
1132 /*
1133 * For probe failure, we want to avoid posting ereports if we've
1134 * already removed the device in the meantime.
1135 */
1136 if (vd != NULL &&
1137 strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
1138 (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
1139 return (B_FALSE);
1140
1141 /* Ignore bogus delay events (like from ioctls or unqueued IOs) */
1142 if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
1143 (zio != NULL) && (!zio->io_timestamp)) {
1144 return (B_FALSE);
1145 }
1146 #else
1147 (void) subclass, (void) spa, (void) vd, (void) zio;
1148 #endif
1149 return (B_TRUE);
1150 }
1151
1152 /*
1153 * Post an ereport for the given subclass
1154 *
1155 * Returns
1156 * - 0 if an event was posted
1157 * - EINVAL if there was a problem posting event
1158 * - EBUSY if the event was rate limited
1159 * - EALREADY if the event was already posted (duplicate)
1160 */
1161 int
zfs_ereport_post(const char * subclass,spa_t * spa,vdev_t * vd,const zbookmark_phys_t * zb,zio_t * zio,uint64_t state)1162 zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
1163 const zbookmark_phys_t *zb, zio_t *zio, uint64_t state)
1164 {
1165 int rc = 0;
1166 #ifdef _KERNEL
1167 nvlist_t *ereport = NULL;
1168 nvlist_t *detector = NULL;
1169
1170 if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
1171 return (EINVAL);
1172
1173 if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0))
1174 return (SET_ERROR(EALREADY));
1175
1176 if (zfs_is_ratelimiting_event(subclass, vd))
1177 return (SET_ERROR(EBUSY));
1178
1179 if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
1180 zb, zio, state, 0))
1181 return (SET_ERROR(EINVAL)); /* couldn't post event */
1182
1183 if (ereport == NULL)
1184 return (SET_ERROR(EINVAL));
1185
1186 /* Cleanup is handled by the callback function */
1187 rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
1188 #else
1189 (void) subclass, (void) spa, (void) vd, (void) zb, (void) zio,
1190 (void) state;
1191 #endif
1192 return (rc);
1193 }
1194
1195 /*
1196 * Prepare a checksum ereport
1197 *
1198 * Returns
1199 * - 0 if an event was posted
1200 * - EINVAL if there was a problem posting event
1201 * - EBUSY if the event was rate limited
1202 * - EALREADY if the event was already posted (duplicate)
1203 */
1204 int
zfs_ereport_start_checksum(spa_t * spa,vdev_t * vd,const zbookmark_phys_t * zb,struct zio * zio,uint64_t offset,uint64_t length,zio_bad_cksum_t * info)1205 zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
1206 struct zio *zio, uint64_t offset, uint64_t length, zio_bad_cksum_t *info)
1207 {
1208 zio_cksum_report_t *report;
1209
1210 #ifdef _KERNEL
1211 if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
1212 return (SET_ERROR(EINVAL));
1213
1214 if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
1215 offset, length))
1216 return (SET_ERROR(EALREADY));
1217
1218 if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
1219 return (SET_ERROR(EBUSY));
1220 #else
1221 (void) zb, (void) offset;
1222 #endif
1223
1224 report = kmem_zalloc(sizeof (*report), KM_SLEEP);
1225
1226 zio_vsd_default_cksum_report(zio, report);
1227
1228 /* copy the checksum failure information if it was provided */
1229 if (info != NULL) {
1230 report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP);
1231 memcpy(report->zcr_ckinfo, info, sizeof (*info));
1232 }
1233
1234 report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift;
1235 report->zcr_align =
1236 vdev_psize_to_asize(vd->vdev_top, report->zcr_sector);
1237 report->zcr_length = length;
1238
1239 #ifdef _KERNEL
1240 (void) zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
1241 FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length);
1242
1243 if (report->zcr_ereport == NULL) {
1244 zfs_ereport_free_checksum(report);
1245 return (0);
1246 }
1247 #endif
1248
1249 mutex_enter(&spa->spa_errlist_lock);
1250 report->zcr_next = zio->io_logical->io_cksum_report;
1251 zio->io_logical->io_cksum_report = report;
1252 mutex_exit(&spa->spa_errlist_lock);
1253 return (0);
1254 }
1255
1256 void
zfs_ereport_finish_checksum(zio_cksum_report_t * report,const abd_t * good_data,const abd_t * bad_data,boolean_t drop_if_identical)1257 zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data,
1258 const abd_t *bad_data, boolean_t drop_if_identical)
1259 {
1260 #ifdef _KERNEL
1261 zfs_ecksum_info_t *info;
1262
1263 info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo,
1264 good_data, bad_data, report->zcr_length, drop_if_identical);
1265 if (info != NULL)
1266 zfs_zevent_post(report->zcr_ereport,
1267 report->zcr_detector, zfs_zevent_post_cb);
1268 else
1269 zfs_zevent_post_cb(report->zcr_ereport, report->zcr_detector);
1270
1271 report->zcr_ereport = report->zcr_detector = NULL;
1272 if (info != NULL)
1273 kmem_free(info, sizeof (*info));
1274 #else
1275 (void) report, (void) good_data, (void) bad_data,
1276 (void) drop_if_identical;
1277 #endif
1278 }
1279
1280 void
zfs_ereport_free_checksum(zio_cksum_report_t * rpt)1281 zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
1282 {
1283 #ifdef _KERNEL
1284 if (rpt->zcr_ereport != NULL) {
1285 fm_nvlist_destroy(rpt->zcr_ereport,
1286 FM_NVA_FREE);
1287 fm_nvlist_destroy(rpt->zcr_detector,
1288 FM_NVA_FREE);
1289 }
1290 #endif
1291 rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo);
1292
1293 if (rpt->zcr_ckinfo != NULL)
1294 kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo));
1295
1296 kmem_free(rpt, sizeof (*rpt));
1297 }
1298
1299 /*
1300 * Post a checksum ereport
1301 *
1302 * Returns
1303 * - 0 if an event was posted
1304 * - EINVAL if there was a problem posting event
1305 * - EBUSY if the event was rate limited
1306 * - EALREADY if the event was already posted (duplicate)
1307 */
1308 int
zfs_ereport_post_checksum(spa_t * spa,vdev_t * vd,const zbookmark_phys_t * zb,struct zio * zio,uint64_t offset,uint64_t length,const abd_t * good_data,const abd_t * bad_data,zio_bad_cksum_t * zbc)1309 zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
1310 struct zio *zio, uint64_t offset, uint64_t length,
1311 const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc)
1312 {
1313 int rc = 0;
1314 #ifdef _KERNEL
1315 nvlist_t *ereport = NULL;
1316 nvlist_t *detector = NULL;
1317 zfs_ecksum_info_t *info;
1318
1319 if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
1320 return (SET_ERROR(EINVAL));
1321
1322 if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
1323 offset, length))
1324 return (SET_ERROR(EALREADY));
1325
1326 if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
1327 return (SET_ERROR(EBUSY));
1328
1329 if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
1330 spa, vd, zb, zio, offset, length) || (ereport == NULL)) {
1331 return (SET_ERROR(EINVAL));
1332 }
1333
1334 info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
1335 B_FALSE);
1336
1337 if (info != NULL) {
1338 rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
1339 kmem_free(info, sizeof (*info));
1340 }
1341 #else
1342 (void) spa, (void) vd, (void) zb, (void) zio, (void) offset,
1343 (void) length, (void) good_data, (void) bad_data, (void) zbc;
1344 #endif
1345 return (rc);
1346 }
1347
1348 /*
1349 * The 'sysevent.fs.zfs.*' events are signals posted to notify user space of
1350 * change in the pool. All sysevents are listed in sys/sysevent/eventdefs.h
1351 * and are designed to be consumed by the ZFS Event Daemon (ZED). For
1352 * additional details refer to the zed(8) man page.
1353 */
1354 nvlist_t *
zfs_event_create(spa_t * spa,vdev_t * vd,const char * type,const char * name,nvlist_t * aux)1355 zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name,
1356 nvlist_t *aux)
1357 {
1358 nvlist_t *resource = NULL;
1359 #ifdef _KERNEL
1360 char class[64];
1361
1362 if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
1363 return (NULL);
1364
1365 if ((resource = fm_nvlist_create(NULL)) == NULL)
1366 return (NULL);
1367
1368 (void) snprintf(class, sizeof (class), "%s.%s.%s", type,
1369 ZFS_ERROR_CLASS, name);
1370 VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION));
1371 VERIFY0(nvlist_add_string(resource, FM_CLASS, class));
1372 VERIFY0(nvlist_add_string(resource,
1373 FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa)));
1374 VERIFY0(nvlist_add_uint64(resource,
1375 FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)));
1376 VERIFY0(nvlist_add_uint64(resource,
1377 FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, spa_state(spa)));
1378 VERIFY0(nvlist_add_int32(resource,
1379 FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa)));
1380
1381 if (vd) {
1382 VERIFY0(nvlist_add_uint64(resource,
1383 FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid));
1384 VERIFY0(nvlist_add_uint64(resource,
1385 FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state));
1386 if (vd->vdev_path != NULL)
1387 VERIFY0(nvlist_add_string(resource,
1388 FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path));
1389 if (vd->vdev_devid != NULL)
1390 VERIFY0(nvlist_add_string(resource,
1391 FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid));
1392 if (vd->vdev_fru != NULL)
1393 VERIFY0(nvlist_add_string(resource,
1394 FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru));
1395 if (vd->vdev_enc_sysfs_path != NULL)
1396 VERIFY0(nvlist_add_string(resource,
1397 FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
1398 vd->vdev_enc_sysfs_path));
1399 }
1400
1401 /* also copy any optional payload data */
1402 if (aux) {
1403 nvpair_t *elem = NULL;
1404
1405 while ((elem = nvlist_next_nvpair(aux, elem)) != NULL)
1406 (void) nvlist_add_nvpair(resource, elem);
1407 }
1408 #else
1409 (void) spa, (void) vd, (void) type, (void) name, (void) aux;
1410 #endif
1411 return (resource);
1412 }
1413
1414 static void
zfs_post_common(spa_t * spa,vdev_t * vd,const char * type,const char * name,nvlist_t * aux)1415 zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name,
1416 nvlist_t *aux)
1417 {
1418 #ifdef _KERNEL
1419 nvlist_t *resource;
1420
1421 resource = zfs_event_create(spa, vd, type, name, aux);
1422 if (resource)
1423 zfs_zevent_post(resource, NULL, zfs_zevent_post_cb);
1424 #else
1425 (void) spa, (void) vd, (void) type, (void) name, (void) aux;
1426 #endif
1427 }
1428
1429 /*
1430 * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev
1431 * has been removed from the system. This will cause the DE to ignore any
1432 * recent I/O errors, inferring that they are due to the asynchronous device
1433 * removal.
1434 */
1435 void
zfs_post_remove(spa_t * spa,vdev_t * vd)1436 zfs_post_remove(spa_t *spa, vdev_t *vd)
1437 {
1438 zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, NULL);
1439 }
1440
1441 /*
1442 * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool
1443 * has the 'autoreplace' property set, and therefore any broken vdevs will be
1444 * handled by higher level logic, and no vdev fault should be generated.
1445 */
1446 void
zfs_post_autoreplace(spa_t * spa,vdev_t * vd)1447 zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
1448 {
1449 zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_AUTOREPLACE, NULL);
1450 }
1451
1452 /*
1453 * The 'resource.fs.zfs.statechange' event is an internal signal that the
1454 * given vdev has transitioned its state to DEGRADED or HEALTHY. This will
1455 * cause the retire agent to repair any outstanding fault management cases
1456 * open because the device was not found (fault.fs.zfs.device).
1457 */
1458 void
zfs_post_state_change(spa_t * spa,vdev_t * vd,uint64_t laststate)1459 zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate)
1460 {
1461 #ifdef _KERNEL
1462 nvlist_t *aux;
1463
1464 /*
1465 * Add optional supplemental keys to payload
1466 */
1467 aux = fm_nvlist_create(NULL);
1468 if (vd && aux) {
1469 if (vd->vdev_physpath) {
1470 fnvlist_add_string(aux,
1471 FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH,
1472 vd->vdev_physpath);
1473 }
1474 if (vd->vdev_enc_sysfs_path) {
1475 fnvlist_add_string(aux,
1476 FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
1477 vd->vdev_enc_sysfs_path);
1478 }
1479
1480 fnvlist_add_uint64(aux,
1481 FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate);
1482 }
1483
1484 zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_STATECHANGE,
1485 aux);
1486
1487 if (aux)
1488 fm_nvlist_destroy(aux, FM_NVA_FREE);
1489 #else
1490 (void) spa, (void) vd, (void) laststate;
1491 #endif
1492 }
1493
1494 #ifdef _KERNEL
1495 void
zfs_ereport_init(void)1496 zfs_ereport_init(void)
1497 {
1498 mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL);
1499 list_create(&recent_events_list, sizeof (recent_events_node_t),
1500 offsetof(recent_events_node_t, re_list_link));
1501 avl_create(&recent_events_tree, recent_events_compare,
1502 sizeof (recent_events_node_t), offsetof(recent_events_node_t,
1503 re_tree_link));
1504 }
1505
1506 /*
1507 * This 'early' fini needs to run before zfs_fini() which on Linux waits
1508 * for the system_delay_taskq to drain.
1509 */
1510 void
zfs_ereport_taskq_fini(void)1511 zfs_ereport_taskq_fini(void)
1512 {
1513 mutex_enter(&recent_events_lock);
1514 if (recent_events_cleaner_tqid != 0) {
1515 taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid);
1516 recent_events_cleaner_tqid = 0;
1517 }
1518 mutex_exit(&recent_events_lock);
1519 }
1520
1521 void
zfs_ereport_fini(void)1522 zfs_ereport_fini(void)
1523 {
1524 recent_events_node_t *entry;
1525
1526 while ((entry = list_remove_head(&recent_events_list)) != NULL) {
1527 avl_remove(&recent_events_tree, entry);
1528 kmem_free(entry, sizeof (*entry));
1529 }
1530 avl_destroy(&recent_events_tree);
1531 list_destroy(&recent_events_list);
1532 mutex_destroy(&recent_events_lock);
1533 }
1534
1535 void
zfs_ereport_snapshot_post(const char * subclass,spa_t * spa,const char * name)1536 zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, const char *name)
1537 {
1538 nvlist_t *aux;
1539
1540 aux = fm_nvlist_create(NULL);
1541 fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME, name);
1542
1543 zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux);
1544 fm_nvlist_destroy(aux, FM_NVA_FREE);
1545 }
1546
1547 /*
1548 * Post when a event when a zvol is created or removed
1549 *
1550 * This is currently only used by macOS, since it uses the event to create
1551 * symlinks between the volume name (mypool/myvol) and the actual /dev
1552 * device (/dev/disk3). For example:
1553 *
1554 * /var/run/zfs/dsk/mypool/myvol -> /dev/disk3
1555 *
1556 * name: The full name of the zvol ("mypool/myvol")
1557 * dev_name: The full /dev name for the zvol ("/dev/disk3")
1558 * raw_name: The raw /dev name for the zvol ("/dev/rdisk3")
1559 */
1560 void
zfs_ereport_zvol_post(const char * subclass,const char * name,const char * dev_name,const char * raw_name)1561 zfs_ereport_zvol_post(const char *subclass, const char *name,
1562 const char *dev_name, const char *raw_name)
1563 {
1564 nvlist_t *aux;
1565 char *r;
1566
1567 boolean_t locked = mutex_owned(&spa_namespace_lock);
1568 if (!locked) mutex_enter(&spa_namespace_lock);
1569 spa_t *spa = spa_lookup(name);
1570 if (!locked) mutex_exit(&spa_namespace_lock);
1571
1572 if (spa == NULL)
1573 return;
1574
1575 aux = fm_nvlist_create(NULL);
1576 fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME, dev_name);
1577 fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME,
1578 raw_name);
1579 r = strchr(name, '/');
1580 if (r && r[1])
1581 fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VOLUME, &r[1]);
1582
1583 zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux);
1584 fm_nvlist_destroy(aux, FM_NVA_FREE);
1585 }
1586
/*
 * Entry points published through EXPORT_SYMBOL.
 * NOTE(review): presumably this makes them callable from other kernel
 * modules -- confirm against the platform definition of the macro.
 */
EXPORT_SYMBOL(zfs_ereport_post);
EXPORT_SYMBOL(zfs_ereport_is_valid);
EXPORT_SYMBOL(zfs_ereport_post_checksum);
EXPORT_SYMBOL(zfs_post_remove);
EXPORT_SYMBOL(zfs_post_autoreplace);
EXPORT_SYMBOL(zfs_post_state_change);

/*
 * Module parameters for the recent-event records used in duplicate
 * checking; the description string on each entry states what it controls.
 * NOTE(review): ZMOD_RW presumably marks them runtime-writable -- confirm.
 */
ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW,
	"Maximum recent zevents records to retain for duplicate checking");
ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW,
	"Expiration time for recent zevents records");
1598 #endif /* _KERNEL */
1599