1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * Copyright (c) 2012,2021 by Delphix. All rights reserved.
29 */
30
31 #include <sys/spa.h>
32 #include <sys/spa_impl.h>
33 #include <sys/vdev.h>
34 #include <sys/vdev_impl.h>
35 #include <sys/zio.h>
36 #include <sys/zio_checksum.h>
37
38 #include <sys/fm/fs/zfs.h>
39 #include <sys/fm/protocol.h>
40 #include <sys/fm/util.h>
41 #include <sys/sysevent.h>
42
43 /*
44 * This general routine is responsible for generating all the different ZFS
45 * ereports. The payload is dependent on the class, and which arguments are
46 * supplied to the function:
47 *
48 * EREPORT                 POOL    VDEV    IO
49 * block                    X       X      X
50 * data                     X              X
51 * device                   X       X
52 * pool                     X
53 *
54 * If we are in a loading state, all errors are chained together by the same
55 * SPA-wide ENA (Error Numeric Association).
56 *
57 * For isolated I/O requests, we get the ENA from the zio_t. The propagation
58 * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want
59 * to chain together all ereports associated with a logical piece of data. For
60 * read I/Os, there are basically three 'types' of I/O, which form a roughly
61 * layered diagram:
62 *
63 * +---------------+
64 * | Aggregate I/O | No associated logical data or device
65 * +---------------+
66 * |
67 * V
68 * +---------------+ Reads associated with a piece of logical data.
69 * | Read I/O | This includes reads on behalf of RAID-Z,
70 * +---------------+ mirrors, gang blocks, retries, etc.
71 * |
72 * V
73 * +---------------+ Reads associated with a particular device, but
74 * | Physical I/O | no logical data. Issued as part of vdev caching
75 * +---------------+ and I/O aggregation.
76 *
77 * Note that 'physical I/O' here is not the same terminology as used in the rest
78 * of ZIO. Typically, 'physical I/O' simply means that there is no attached
79 * blockpointer. But I/O with no associated block pointer can still be related
80 * to a logical piece of data (i.e. RAID-Z requests).
81 *
82 * Purely physical I/Os always have unique ENAs. They are not related to a
83 * particular piece of logical data, and therefore cannot be chained together.
84 * We still generate an ereport, but the DE doesn't correlate it with any
85 * logical piece of data. When such an I/O fails, the delegated I/O requests
86 * will issue a retry, which will trigger the 'real' ereport with the correct
87 * ENA.
88 *
89 * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
90 * When a new logical I/O is issued, we set this to point to itself. Child I/Os
91 * then inherit this pointer, so that once it is first set, subsequent failures
92 * will use the same ENA. For vdev cache fill and queue aggregation I/O,
93 * this pointer is set to NULL, and no ereport will be generated (since it
94 * doesn't actually correspond to any particular device or piece of data,
95 * and the caller will always retry without caching or queueing anyway).
96 *
97 * For checksum errors, we want to include more information about the actual
98 * error which occurs. Accordingly, we build an ereport when the error is
99 * noticed, but instead of sending it in immediately, we hang it off of the
100 * io_cksum_report field of the logical IO. When the logical IO completes
101 * (successfully or not), zfs_ereport_finish_checksum() is called with the
102 * good and bad versions of the buffer (if available), and we annotate the
103 * ereport with information about the differences.
104 */
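/*
 * In terms of the routines below, the checksum flow is roughly:
 *
 *	zfs_ereport_start_checksum()	builds the ereport and hangs a
 *					zio_cksum_report_t off of
 *					io_logical->io_cksum_report
 *	zfs_ereport_finish_checksum()	annotates the report via
 *					annotate_ecksum() and posts (or
 *					drops) the event
 *	zfs_ereport_free_checksum()	frees the report and any nvlists
 *					still attached to it
 */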
105
106 #ifdef _KERNEL
107 /*
108 * Duplicate ereport detection
109 *
110 * Some ereports are retained momentarily for detecting duplicates. These
111 * are kept in a recent_events_node_t in both a time-ordered list and an AVL
112 * tree of recent unique ereports.
113 *
114 * The lifespan of these recent ereports is bounded (15 mins) and a cleaner
115 * task is used to purge stale entries.
116 */
117 static list_t recent_events_list;
118 static avl_tree_t recent_events_tree;
119 static kmutex_t recent_events_lock;
120 static taskqid_t recent_events_cleaner_tqid;
121
122 /*
123 * Each node is about 128 bytes so 2,000 would consume 1/4 MiB.
124 *
125 * This setting can be changed dynamically and setting it to zero
126 * disables duplicate detection.
127 */
128 static unsigned int zfs_zevent_retain_max = 2000;
129
130 /*
131 * The lifespan for a recent ereport entry. The default of 15 minutes is
132 * intended to outlive the zfs diagnosis engine's threshold of 10 errors
133 * over a period of 10 minutes.
134 */
135 static unsigned int zfs_zevent_retain_expire_secs = 900;
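/*
 * Both of the above are exposed through the ZFS_MODULE_PARAM() declarations
 * at the bottom of this file; on Linux that typically means they can be
 * tuned at runtime, e.g. (illustrative paths):
 *
 *	echo 0 > /sys/module/zfs/parameters/zfs_zevent_retain_max
 *	echo 600 > /sys/module/zfs/parameters/zfs_zevent_retain_expire_secs
 */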
136
137 typedef enum zfs_subclass {
138 ZSC_IO,
139 ZSC_DATA,
140 ZSC_CHECKSUM
141 } zfs_subclass_t;
142
143 typedef struct {
144 /* common criteria */
145 uint64_t re_pool_guid;
146 uint64_t re_vdev_guid;
147 int re_io_error;
148 uint64_t re_io_size;
149 uint64_t re_io_offset;
150 zfs_subclass_t re_subclass;
151 zio_priority_t re_io_priority;
152
153 /* logical zio criteria (optional) */
154 zbookmark_phys_t re_io_bookmark;
155
156 /* internal state */
157 avl_node_t re_tree_link;
158 list_node_t re_list_link;
159 uint64_t re_timestamp;
160 } recent_events_node_t;
161
162 static int
163 recent_events_compare(const void *a, const void *b)
164 {
165 const recent_events_node_t *node1 = a;
166 const recent_events_node_t *node2 = b;
167 int cmp;
168
169 /*
170 * The comparison order here is somewhat arbitrary.
171 * What's important is that if every criterion matches, then it
172 * is a duplicate (i.e. compare returns 0).
173 */
174 if ((cmp = TREE_CMP(node1->re_subclass, node2->re_subclass)) != 0)
175 return (cmp);
176 if ((cmp = TREE_CMP(node1->re_pool_guid, node2->re_pool_guid)) != 0)
177 return (cmp);
178 if ((cmp = TREE_CMP(node1->re_vdev_guid, node2->re_vdev_guid)) != 0)
179 return (cmp);
180 if ((cmp = TREE_CMP(node1->re_io_error, node2->re_io_error)) != 0)
181 return (cmp);
182 if ((cmp = TREE_CMP(node1->re_io_priority, node2->re_io_priority)) != 0)
183 return (cmp);
184 if ((cmp = TREE_CMP(node1->re_io_size, node2->re_io_size)) != 0)
185 return (cmp);
186 if ((cmp = TREE_CMP(node1->re_io_offset, node2->re_io_offset)) != 0)
187 return (cmp);
188
189 const zbookmark_phys_t *zb1 = &node1->re_io_bookmark;
190 const zbookmark_phys_t *zb2 = &node2->re_io_bookmark;
191
192 if ((cmp = TREE_CMP(zb1->zb_objset, zb2->zb_objset)) != 0)
193 return (cmp);
194 if ((cmp = TREE_CMP(zb1->zb_object, zb2->zb_object)) != 0)
195 return (cmp);
196 if ((cmp = TREE_CMP(zb1->zb_level, zb2->zb_level)) != 0)
197 return (cmp);
198 if ((cmp = TREE_CMP(zb1->zb_blkid, zb2->zb_blkid)) != 0)
199 return (cmp);
200
201 return (0);
202 }
203
204 /*
205 * workaround: vdev properties don't have inheritance
206 */
207 static uint64_t
208 vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop)
209 {
210 uint64_t propdef, propval;
211
212 propdef = vdev_prop_default_numeric(prop);
213 switch (prop) {
214 case VDEV_PROP_CHECKSUM_N:
215 propval = vd->vdev_checksum_n;
216 break;
217 case VDEV_PROP_CHECKSUM_T:
218 propval = vd->vdev_checksum_t;
219 break;
220 case VDEV_PROP_IO_N:
221 propval = vd->vdev_io_n;
222 break;
223 case VDEV_PROP_IO_T:
224 propval = vd->vdev_io_t;
225 break;
226 case VDEV_PROP_SLOW_IO_N:
227 propval = vd->vdev_slow_io_n;
228 break;
229 case VDEV_PROP_SLOW_IO_T:
230 propval = vd->vdev_slow_io_t;
231 break;
232 default:
233 propval = propdef;
234 break;
235 }
236
237 if (propval != propdef)
238 return (propval);
239
240 if (vd->vdev_parent == NULL)
241 return (propdef);
242
243 return (vdev_prop_get_inherited(vd->vdev_parent, prop));
244 }
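/*
 * For example, if a leaf vdev leaves checksum_n at its default but its
 * top-level parent has checksum_n=10 set, the walk in
 * vdev_prop_get_inherited() above returns 10 for the leaf; if no ancestor
 * overrides the default, the default itself is returned.
 */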
245
246 static void zfs_ereport_schedule_cleaner(void);
247
248 /*
249 * background task to clean stale recent event nodes.
250 */
251 static void
252 zfs_ereport_cleaner(void *arg)
253 {
254 recent_events_node_t *entry;
255 uint64_t now = gethrtime();
256
257 /*
258 * purge expired entries
259 */
260 mutex_enter(&recent_events_lock);
261 while ((entry = list_tail(&recent_events_list)) != NULL) {
262 uint64_t age = NSEC2SEC(now - entry->re_timestamp);
263 if (age <= zfs_zevent_retain_expire_secs)
264 break;
265
266 /* remove expired node */
267 avl_remove(&recent_events_tree, entry);
268 list_remove(&recent_events_list, entry);
269 kmem_free(entry, sizeof (*entry));
270 }
271
272 /* Restart the cleaner if more entries remain */
273 recent_events_cleaner_tqid = 0;
274 if (!list_is_empty(&recent_events_list))
275 zfs_ereport_schedule_cleaner();
276
277 mutex_exit(&recent_events_lock);
278 }
279
280 static void
281 zfs_ereport_schedule_cleaner(void)
282 {
283 ASSERT(MUTEX_HELD(&recent_events_lock));
284
285 uint64_t timeout = SEC2NSEC(zfs_zevent_retain_expire_secs + 1);
286
287 recent_events_cleaner_tqid = taskq_dispatch_delay(
288 system_delay_taskq, zfs_ereport_cleaner, NULL, TQ_SLEEP,
289 ddi_get_lbolt() + NSEC_TO_TICK(timeout));
290 }
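/*
 * The extra second in zfs_ereport_schedule_cleaner() above guarantees that by
 * the time the cleaner fires, the oldest entry on the list has been resident
 * for at least the full zfs_zevent_retain_expire_secs lifespan.
 */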
291
292 /*
293 * Clear entries for a given vdev or all vdevs in a pool when vdev == NULL
294 */
295 void
296 zfs_ereport_clear(spa_t *spa, vdev_t *vd)
297 {
298 uint64_t vdev_guid, pool_guid;
299
300 ASSERT(vd != NULL || spa != NULL);
301 if (vd == NULL) {
302 vdev_guid = 0;
303 pool_guid = spa_guid(spa);
304 } else {
305 vdev_guid = vd->vdev_guid;
306 pool_guid = 0;
307 }
308
309 mutex_enter(&recent_events_lock);
310
311 recent_events_node_t *next = list_head(&recent_events_list);
312 while (next != NULL) {
313 recent_events_node_t *entry = next;
314
315 next = list_next(&recent_events_list, next);
316
317 if (entry->re_vdev_guid == vdev_guid ||
318 entry->re_pool_guid == pool_guid) {
319 avl_remove(&recent_events_tree, entry);
320 list_remove(&recent_events_list, entry);
321 kmem_free(entry, sizeof (*entry));
322 }
323 }
324
325 mutex_exit(&recent_events_lock);
326 }
327
328 /*
329 * Check if an ereport would be a duplicate of one recently posted.
330 *
331 * An ereport is considered a duplicate if the set of criteria in
332 * recent_events_node_t all match.
333 *
334 * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM
335 * are candidates for duplicate checking.
336 */
337 static boolean_t
338 zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd,
339 const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size)
340 {
341 recent_events_node_t search = {0}, *entry;
342
343 if (vd == NULL || zio == NULL)
344 return (B_FALSE);
345
346 if (zfs_zevent_retain_max == 0)
347 return (B_FALSE);
348
349 if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0)
350 search.re_subclass = ZSC_IO;
351 else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0)
352 search.re_subclass = ZSC_DATA;
353 else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0)
354 search.re_subclass = ZSC_CHECKSUM;
355 else
356 return (B_FALSE);
357
358 search.re_pool_guid = spa_guid(spa);
359 search.re_vdev_guid = vd->vdev_guid;
360 search.re_io_error = zio->io_error;
361 search.re_io_priority = zio->io_priority;
362 /* if size is supplied use it over what's in zio */
363 if (size) {
364 search.re_io_size = size;
365 search.re_io_offset = offset;
366 } else {
367 search.re_io_size = zio->io_size;
368 search.re_io_offset = zio->io_offset;
369 }
370
371 /* grab optional logical zio criteria */
372 if (zb != NULL) {
373 search.re_io_bookmark.zb_objset = zb->zb_objset;
374 search.re_io_bookmark.zb_object = zb->zb_object;
375 search.re_io_bookmark.zb_level = zb->zb_level;
376 search.re_io_bookmark.zb_blkid = zb->zb_blkid;
377 }
378
379 uint64_t now = gethrtime();
380
381 mutex_enter(&recent_events_lock);
382
383 /* check if we have seen this one recently */
384 entry = avl_find(&recent_events_tree, &search, NULL);
385 if (entry != NULL) {
386 uint64_t age = NSEC2SEC(now - entry->re_timestamp);
387
388 /*
389 * There is still an active cleaner (since we're here).
390 * Reset the last seen time for this duplicate entry
391 * so that its lifespan gets extended.
392 */
393 list_remove(&recent_events_list, entry);
394 list_insert_head(&recent_events_list, entry);
395 entry->re_timestamp = now;
396
397 zfs_zevent_track_duplicate();
398 mutex_exit(&recent_events_lock);
399
400 return (age <= zfs_zevent_retain_expire_secs);
401 }
402
403 if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) {
404 /* recycle oldest node */
405 entry = list_tail(&recent_events_list);
406 ASSERT(entry != NULL);
407 list_remove(&recent_events_list, entry);
408 avl_remove(&recent_events_tree, entry);
409 } else {
410 entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP);
411 }
412
413 /* record this as a recent ereport */
414 *entry = search;
415 avl_add(&recent_events_tree, entry);
416 list_insert_head(&recent_events_list, entry);
417 entry->re_timestamp = now;
418
419 /* Start a cleaner if not already scheduled */
420 if (recent_events_cleaner_tqid == 0)
421 zfs_ereport_schedule_cleaner();
422
423 mutex_exit(&recent_events_lock);
424 return (B_FALSE);
425 }
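/*
 * A B_TRUE return from zfs_ereport_is_duplicate() is treated as "already
 * posted" by the callers below: zfs_ereport_post() and the checksum paths
 * convert it into EALREADY and skip generating the event.
 */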
426
427 void
428 zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector)
429 {
430 if (nvl)
431 fm_nvlist_destroy(nvl, FM_NVA_FREE);
432
433 if (detector)
434 fm_nvlist_destroy(detector, FM_NVA_FREE);
435 }
436
437 /*
438 * We want to rate limit ZIO delay, deadman, and checksum events so as to not
439 * flood zevent consumers when a disk is acting up.
440 *
441 * Returns 1 if we're ratelimiting, 0 if not.
442 */
443 static int
444 zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd)
445 {
446 int rc = 0;
447 /*
448 * zfs_ratelimit() returns 1 if we're *not* ratelimiting and 0 if we
449 * are. Invert it to get our return value.
450 */
451 if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
452 rc = !zfs_ratelimit(&vd->vdev_delay_rl);
453 } else if (strcmp(subclass, FM_EREPORT_ZFS_DEADMAN) == 0) {
454 rc = !zfs_ratelimit(&vd->vdev_deadman_rl);
455 } else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) {
456 rc = !zfs_ratelimit(&vd->vdev_checksum_rl);
457 }
458
459 if (rc) {
460 /* We're rate limiting */
461 fm_erpt_dropped_increment();
462 }
463
464 return (rc);
465 }
466
467 /*
468 * Return B_TRUE if the event actually posted, B_FALSE if not.
469 */
470 static boolean_t
471 zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
472 const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
473 zio_t *zio, uint64_t stateoroffset, uint64_t size)
474 {
475 nvlist_t *ereport, *detector;
476
477 uint64_t ena;
478 char class[64];
479
480 if ((ereport = fm_nvlist_create(NULL)) == NULL)
481 return (B_FALSE);
482
483 if ((detector = fm_nvlist_create(NULL)) == NULL) {
484 fm_nvlist_destroy(ereport, FM_NVA_FREE);
485 return (B_FALSE);
486 }
487
488 /*
489 * Serialize ereport generation
490 */
491 mutex_enter(&spa->spa_errlist_lock);
492
493 /*
494 * Determine the ENA to use for this event. If we are in a loading
495 * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use
496 * a root zio-wide ENA. Otherwise, simply use a unique ENA.
497 */
498 if (spa_load_state(spa) != SPA_LOAD_NONE) {
499 if (spa->spa_ena == 0)
500 spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
501 ena = spa->spa_ena;
502 } else if (zio != NULL && zio->io_logical != NULL) {
503 if (zio->io_logical->io_ena == 0)
504 zio->io_logical->io_ena =
505 fm_ena_generate(0, FM_ENA_FMT1);
506 ena = zio->io_logical->io_ena;
507 } else {
508 ena = fm_ena_generate(0, FM_ENA_FMT1);
509 }
510
511 /*
512 * Construct the full class, detector, and other standard FMA fields.
513 */
514 (void) snprintf(class, sizeof (class), "%s.%s",
515 ZFS_ERROR_CLASS, subclass);
516
517 fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
518 vd != NULL ? vd->vdev_guid : 0);
519
520 fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);
521
522 /*
523 * Construct the per-ereport payload, depending on which parameters are
524 * passed in.
525 */
526
527 /*
528 * Generic payload members common to all ereports.
529 */
530 fm_payload_set(ereport,
531 FM_EREPORT_PAYLOAD_ZFS_POOL, DATA_TYPE_STRING, spa_name(spa),
532 FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa),
533 FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, DATA_TYPE_UINT64,
534 (uint64_t)spa_state(spa),
535 FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
536 (int32_t)spa_load_state(spa), NULL);
537
538 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
539 DATA_TYPE_STRING,
540 spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
541 FM_EREPORT_FAILMODE_WAIT :
542 spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
543 FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC,
544 NULL);
545
546 if (vd != NULL) {
547 vdev_t *pvd = vd->vdev_parent;
548 vdev_queue_t *vq = &vd->vdev_queue;
549 vdev_stat_t *vs = &vd->vdev_stat;
550 vdev_t *spare_vd;
551 uint64_t *spare_guids;
552 char **spare_paths;
553 int i, spare_count;
554
555 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
556 DATA_TYPE_UINT64, vd->vdev_guid,
557 FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
558 DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
559 if (vd->vdev_path != NULL)
560 fm_payload_set(ereport,
561 FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
562 DATA_TYPE_STRING, vd->vdev_path, NULL);
563 if (vd->vdev_devid != NULL)
564 fm_payload_set(ereport,
565 FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
566 DATA_TYPE_STRING, vd->vdev_devid, NULL);
567 if (vd->vdev_fru != NULL)
568 fm_payload_set(ereport,
569 FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
570 DATA_TYPE_STRING, vd->vdev_fru, NULL);
571 if (vd->vdev_enc_sysfs_path != NULL)
572 fm_payload_set(ereport,
573 FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
574 DATA_TYPE_STRING, vd->vdev_enc_sysfs_path, NULL);
575 if (vd->vdev_ashift)
576 fm_payload_set(ereport,
577 FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT,
578 DATA_TYPE_UINT64, vd->vdev_ashift, NULL);
579
580 if (vq != NULL) {
581 fm_payload_set(ereport,
582 FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS,
583 DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL);
584 fm_payload_set(ereport,
585 FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS,
586 DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL);
587 }
588
589 if (vs != NULL) {
590 fm_payload_set(ereport,
591 FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS,
592 DATA_TYPE_UINT64, vs->vs_read_errors,
593 FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS,
594 DATA_TYPE_UINT64, vs->vs_write_errors,
595 FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS,
596 DATA_TYPE_UINT64, vs->vs_checksum_errors,
597 FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS,
598 DATA_TYPE_UINT64, vs->vs_slow_ios,
599 FM_EREPORT_PAYLOAD_ZFS_VDEV_DIO_VERIFY_ERRORS,
600 DATA_TYPE_UINT64, vs->vs_dio_verify_errors,
601 NULL);
602 }
603
604 if (pvd != NULL) {
605 fm_payload_set(ereport,
606 FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
607 DATA_TYPE_UINT64, pvd->vdev_guid,
608 FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
609 DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
610 NULL);
611 if (pvd->vdev_path)
612 fm_payload_set(ereport,
613 FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
614 DATA_TYPE_STRING, pvd->vdev_path, NULL);
615 if (pvd->vdev_devid)
616 fm_payload_set(ereport,
617 FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
618 DATA_TYPE_STRING, pvd->vdev_devid, NULL);
619 }
620
621 spare_count = spa->spa_spares.sav_count;
622 spare_paths = kmem_zalloc(sizeof (char *) * spare_count,
623 KM_SLEEP);
624 spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count,
625 KM_SLEEP);
626
627 for (i = 0; i < spare_count; i++) {
628 spare_vd = spa->spa_spares.sav_vdevs[i];
629 if (spare_vd) {
630 spare_paths[i] = spare_vd->vdev_path;
631 spare_guids[i] = spare_vd->vdev_guid;
632 }
633 }
634
635 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS,
636 DATA_TYPE_STRING_ARRAY, spare_count, spare_paths,
637 FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS,
638 DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL);
639
640 kmem_free(spare_guids, sizeof (uint64_t) * spare_count);
641 kmem_free(spare_paths, sizeof (char *) * spare_count);
642 }
643
644 if (zio != NULL) {
645 /*
646 * Payload common to all I/Os.
647 */
648 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
649 DATA_TYPE_INT32, zio->io_error, NULL);
650 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS,
651 DATA_TYPE_UINT64, zio->io_flags, NULL);
652 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE,
653 DATA_TYPE_UINT32, zio->io_stage, NULL);
654 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE,
655 DATA_TYPE_UINT32, zio->io_pipeline, NULL);
656 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY,
657 DATA_TYPE_UINT64, zio->io_delay, NULL);
658 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP,
659 DATA_TYPE_UINT64, zio->io_timestamp, NULL);
660 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA,
661 DATA_TYPE_UINT64, zio->io_delta, NULL);
662 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TYPE,
663 DATA_TYPE_UINT32, zio->io_type, NULL);
664 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY,
665 DATA_TYPE_UINT32, zio->io_priority, NULL);
666
667 /*
668 * If the 'size' parameter is non-zero, it indicates this is a
669 * RAID-Z or other I/O where the physical offset and length are
670 * provided for us, instead of within the zio_t.
671 */
672 if (vd != NULL) {
673 if (size)
674 fm_payload_set(ereport,
675 FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
676 DATA_TYPE_UINT64, stateoroffset,
677 FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
678 DATA_TYPE_UINT64, size, NULL);
679 else
680 fm_payload_set(ereport,
681 FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
682 DATA_TYPE_UINT64, zio->io_offset,
683 FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
684 DATA_TYPE_UINT64, zio->io_size, NULL);
685 }
686 } else if (vd != NULL) {
687 /*
688 * If we have a vdev but no zio, this is a device fault, and the
689 * 'stateoroffset' parameter indicates the previous state of the
690 * vdev.
691 */
692 fm_payload_set(ereport,
693 FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
694 DATA_TYPE_UINT64, stateoroffset, NULL);
695 }
696
697 /*
698 * Payload for I/Os with corresponding logical information.
699 */
700 if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) {
701 fm_payload_set(ereport,
702 FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
703 DATA_TYPE_UINT64, zb->zb_objset,
704 FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
705 DATA_TYPE_UINT64, zb->zb_object,
706 FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
707 DATA_TYPE_INT64, zb->zb_level,
708 FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
709 DATA_TYPE_UINT64, zb->zb_blkid, NULL);
710 }
711
712 /*
713 * Payload for tuning the zed
714 */
715 if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) {
716 uint64_t cksum_n, cksum_t;
717
718 cksum_n = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_N);
719 if (cksum_n != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N))
720 fm_payload_set(ereport,
721 FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N,
722 DATA_TYPE_UINT64,
723 cksum_n,
724 NULL);
725
726 cksum_t = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_T);
727 if (cksum_t != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T))
728 fm_payload_set(ereport,
729 FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T,
730 DATA_TYPE_UINT64,
731 cksum_t,
732 NULL);
733 }
734
735 if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) {
736 uint64_t io_n, io_t;
737
738 io_n = vdev_prop_get_inherited(vd, VDEV_PROP_IO_N);
739 if (io_n != vdev_prop_default_numeric(VDEV_PROP_IO_N))
740 fm_payload_set(ereport,
741 FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N,
742 DATA_TYPE_UINT64,
743 io_n,
744 NULL);
745
746 io_t = vdev_prop_get_inherited(vd, VDEV_PROP_IO_T);
747 if (io_t != vdev_prop_default_numeric(VDEV_PROP_IO_T))
748 fm_payload_set(ereport,
749 FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T,
750 DATA_TYPE_UINT64,
751 io_t,
752 NULL);
753 }
754
755 if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
756 uint64_t slow_io_n, slow_io_t;
757
758 slow_io_n = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_N);
759 if (slow_io_n != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N))
760 fm_payload_set(ereport,
761 FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N,
762 DATA_TYPE_UINT64,
763 slow_io_n,
764 NULL);
765
766 slow_io_t = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_T);
767 if (slow_io_t != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T))
768 fm_payload_set(ereport,
769 FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T,
770 DATA_TYPE_UINT64,
771 slow_io_t,
772 NULL);
773 }
774
775 mutex_exit(&spa->spa_errlist_lock);
776
777 *ereport_out = ereport;
778 *detector_out = detector;
779 return (B_TRUE);
780 }
781
782 /* if it's <= 128 bytes, save the corruption directly */
783 #define ZFM_MAX_INLINE (128 / sizeof (uint64_t))
784
785 #define MAX_RANGES 16
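/*
 * ZFM_MAX_INLINE works out to 16 uint64_t words: when the total size of all
 * differing ranges fits within it, the raw set/cleared bit masks themselves
 * are embedded in the ereport, otherwise only per-range counts are reported.
 */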
786
787 typedef struct zfs_ecksum_info {
788 /* inline arrays of bits set and cleared. */
789 uint64_t zei_bits_set[ZFM_MAX_INLINE];
790 uint64_t zei_bits_cleared[ZFM_MAX_INLINE];
791
792 /*
793 * for each range, the number of bits set and cleared. The Hamming
794 * distance between the good and bad buffers is the sum of them all.
795 */
796 uint32_t zei_range_sets[MAX_RANGES];
797 uint32_t zei_range_clears[MAX_RANGES];
798
799 struct zei_ranges {
800 uint32_t zr_start;
801 uint32_t zr_end;
802 } zei_ranges[MAX_RANGES];
803
804 size_t zei_range_count;
805 uint32_t zei_mingap;
806 uint32_t zei_allowed_mingap;
807
808 } zfs_ecksum_info_t;
809
810 static void
811 update_bad_bits(uint64_t value_arg, uint32_t *count)
812 {
813 size_t i;
814 size_t bits = 0;
815 uint64_t value = BE_64(value_arg);
816
817 /* We store the bits in big-endian (largest-first) order */
818 for (i = 0; i < 64; i++) {
819 if (value & (1ull << i))
820 ++bits;
821 }
822 /* update the count of bits changed */
823 *count += bits;
824 }
825
826 /*
827 * We've now filled up the range array, and need to increase "mingap" and
828 * shrink the range list accordingly. zei_mingap is always the smallest
829 * distance between array entries, so we set the new_allowed_gap to be
830 * one greater than that. We then go through the list, joining together
831 * any ranges which are closer than the new_allowed_gap.
832 *
833 * By construction, there will be at least one such join. We also update zei_mingap
834 * to the new smallest gap, to prepare for our next invocation.
835 */
836 static void
837 zei_shrink_ranges(zfs_ecksum_info_t *eip)
838 {
839 uint32_t mingap = UINT32_MAX;
840 uint32_t new_allowed_gap = eip->zei_mingap + 1;
841
842 size_t idx, output;
843 size_t max = eip->zei_range_count;
844
845 struct zei_ranges *r = eip->zei_ranges;
846
847 ASSERT3U(eip->zei_range_count, >, 0);
848 ASSERT3U(eip->zei_range_count, <=, MAX_RANGES);
849
850 output = idx = 0;
851 while (idx < max - 1) {
852 uint32_t start = r[idx].zr_start;
853 uint32_t end = r[idx].zr_end;
854
855 while (idx < max - 1) {
856 idx++;
857
858 uint32_t nstart = r[idx].zr_start;
859 uint32_t nend = r[idx].zr_end;
860
861 uint32_t gap = nstart - end;
862 if (gap < new_allowed_gap) {
863 end = nend;
864 continue;
865 }
866 if (gap < mingap)
867 mingap = gap;
868 break;
869 }
870 r[output].zr_start = start;
871 r[output].zr_end = end;
872 output++;
873 }
874 ASSERT3U(output, <, eip->zei_range_count);
875 eip->zei_range_count = output;
876 eip->zei_mingap = mingap;
877 eip->zei_allowed_mingap = new_allowed_gap;
878 }
879
880 static void
881 zei_add_range(zfs_ecksum_info_t *eip, int start, int end)
882 {
883 struct zei_ranges *r = eip->zei_ranges;
884 size_t count = eip->zei_range_count;
885
886 if (count >= MAX_RANGES) {
887 zei_shrink_ranges(eip);
888 count = eip->zei_range_count;
889 }
890 if (count == 0) {
891 eip->zei_mingap = UINT32_MAX;
892 eip->zei_allowed_mingap = 1;
893 } else {
894 int gap = start - r[count - 1].zr_end;
895
896 if (gap < eip->zei_allowed_mingap) {
897 r[count - 1].zr_end = end;
898 return;
899 }
900 if (gap < eip->zei_mingap)
901 eip->zei_mingap = gap;
902 }
903 r[count].zr_start = start;
904 r[count].zr_end = end;
905 eip->zei_range_count++;
906 }
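/*
 * Note that zei_add_range() accumulates zr_start/zr_end in units of uint64_t
 * words; annotate_ecksum() converts them to byte offsets just before filling
 * in the ereport payload.
 */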
907
908 static size_t
909 zei_range_total_size(zfs_ecksum_info_t *eip)
910 {
911 struct zei_ranges *r = eip->zei_ranges;
912 size_t count = eip->zei_range_count;
913 size_t result = 0;
914 size_t idx;
915
916 for (idx = 0; idx < count; idx++)
917 result += (r[idx].zr_end - r[idx].zr_start);
918
919 return (result);
920 }
921
922 static zfs_ecksum_info_t *
923 annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
924 const abd_t *goodabd, const abd_t *badabd, size_t size,
925 boolean_t drop_if_identical)
926 {
927 const uint64_t *good;
928 const uint64_t *bad;
929
930 size_t nui64s = size / sizeof (uint64_t);
931
932 size_t inline_size;
933 int no_inline = 0;
934 size_t idx;
935 size_t range;
936
937 size_t offset = 0;
938 ssize_t start = -1;
939
940 zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP);
941
942 /* don't do any annotation for injected checksum errors */
943 if (info != NULL && info->zbc_injected)
944 return (eip);
945
946 if (info != NULL && info->zbc_has_cksum) {
947 fm_payload_set(ereport,
948 FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
949 DATA_TYPE_STRING,
950 info->zbc_checksum_name,
951 NULL);
952
953 if (info->zbc_byteswapped) {
954 fm_payload_set(ereport,
955 FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP,
956 DATA_TYPE_BOOLEAN, 1,
957 NULL);
958 }
959 }
960
961 if (badabd == NULL || goodabd == NULL)
962 return (eip);
963
964 ASSERT3U(nui64s, <=, UINT32_MAX);
965 ASSERT3U(size, ==, nui64s * sizeof (uint64_t));
966 ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
967 ASSERT3U(size, <=, UINT32_MAX);
968
969 good = (const uint64_t *) abd_borrow_buf_copy((abd_t *)goodabd, size);
970 bad = (const uint64_t *) abd_borrow_buf_copy((abd_t *)badabd, size);
971
972 /* build up the range list by comparing the two buffers. */
973 for (idx = 0; idx < nui64s; idx++) {
974 if (good[idx] == bad[idx]) {
975 if (start == -1)
976 continue;
977
978 zei_add_range(eip, start, idx);
979 start = -1;
980 } else {
981 if (start != -1)
982 continue;
983
984 start = idx;
985 }
986 }
987 if (start != -1)
988 zei_add_range(eip, start, idx);
989
990 /* See if it will fit in our inline buffers */
991 inline_size = zei_range_total_size(eip);
992 if (inline_size > ZFM_MAX_INLINE)
993 no_inline = 1;
994
995 /*
996 * If there is no change and we want to drop if the buffers are
997 * identical, do so.
998 */
999 if (inline_size == 0 && drop_if_identical) {
1000 kmem_free(eip, sizeof (*eip));
1001 abd_return_buf((abd_t *)goodabd, (void *)good, size);
1002 abd_return_buf((abd_t *)badabd, (void *)bad, size);
1003 return (NULL);
1004 }
1005
1006 /*
1007 * Now walk through the ranges, filling in the details of the
1008 * differences. Also convert our uint64_t-array offsets to byte
1009 * offsets.
1010 */
1011 for (range = 0; range < eip->zei_range_count; range++) {
1012 size_t start = eip->zei_ranges[range].zr_start;
1013 size_t end = eip->zei_ranges[range].zr_end;
1014
1015 for (idx = start; idx < end; idx++) {
1016 uint64_t set, cleared;
1017
1018 // bits set in bad, but not in good
1019 set = ((~good[idx]) & bad[idx]);
1020 // bits set in good, but not in bad
1021 cleared = (good[idx] & (~bad[idx]));
1022
1023 if (!no_inline) {
1024 ASSERT3U(offset, <, inline_size);
1025 eip->zei_bits_set[offset] = set;
1026 eip->zei_bits_cleared[offset] = cleared;
1027 offset++;
1028 }
1029
1030 update_bad_bits(set, &eip->zei_range_sets[range]);
1031 update_bad_bits(cleared, &eip->zei_range_clears[range]);
1032 }
1033
1034 /* convert to byte offsets */
1035 eip->zei_ranges[range].zr_start *= sizeof (uint64_t);
1036 eip->zei_ranges[range].zr_end *= sizeof (uint64_t);
1037 }
1038
1039 abd_return_buf((abd_t *)goodabd, (void *)good, size);
1040 abd_return_buf((abd_t *)badabd, (void *)bad, size);
1041
1042 eip->zei_allowed_mingap *= sizeof (uint64_t);
1043 inline_size *= sizeof (uint64_t);
1044
1045 /* fill in ereport */
1046 fm_payload_set(ereport,
1047 FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES,
1048 DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count,
1049 (uint32_t *)eip->zei_ranges,
1050 FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP,
1051 DATA_TYPE_UINT32, eip->zei_allowed_mingap,
1052 FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS,
1053 DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets,
1054 FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS,
1055 DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears,
1056 NULL);
1057
1058 if (!no_inline) {
1059 fm_payload_set(ereport,
1060 FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS,
1061 DATA_TYPE_UINT8_ARRAY,
1062 inline_size, (uint8_t *)eip->zei_bits_set,
1063 FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS,
1064 DATA_TYPE_UINT8_ARRAY,
1065 inline_size, (uint8_t *)eip->zei_bits_cleared,
1066 NULL);
1067 }
1068 return (eip);
1069 }
1070 #else
1071 void
1072 zfs_ereport_clear(spa_t *spa, vdev_t *vd)
1073 {
1074 (void) spa, (void) vd;
1075 }
1076 #endif
1077
1078 /*
1079 * Make sure our event is still valid for the given zio/vdev/pool. For example,
1080 * we don't want to keep logging events for a faulted or missing vdev.
1081 */
1082 boolean_t
1083 zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
1084 {
1085 #ifdef _KERNEL
1086 /*
1087 * If we are doing a spa_tryimport() or in recovery mode,
1088 * ignore errors.
1089 */
1090 if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
1091 spa_load_state(spa) == SPA_LOAD_RECOVER)
1092 return (B_FALSE);
1093
1094 /*
1095 * If we are in the middle of opening a pool, and the previous attempt
1096 * failed, don't bother logging any new ereports - we're just going to
1097 * get the same diagnosis anyway.
1098 */
1099 if (spa_load_state(spa) != SPA_LOAD_NONE &&
1100 spa->spa_last_open_failed)
1101 return (B_FALSE);
1102
1103 if (zio != NULL) {
1104 /* If this is not a read or write zio, ignore the error */
1105 if (zio->io_type != ZIO_TYPE_READ &&
1106 zio->io_type != ZIO_TYPE_WRITE)
1107 return (B_FALSE);
1108
1109 if (vd != NULL) {
1110 /*
1111 * If the vdev has already been marked as failing due
1112 * to a failed probe, then ignore any subsequent I/O
1113 * errors, as the DE will automatically fault the vdev
1114 * on the first such failure. This also catches cases
1115 * where vdev_remove_wanted is set and the device has
1116 * not yet been asynchronously placed into the REMOVED
1117 * state.
1118 */
1119 if (zio->io_vd == vd && !vdev_accessible(vd, zio))
1120 return (B_FALSE);
1121
1122 /*
1123 * Ignore checksum errors for reads from DTL regions of
1124 * leaf vdevs.
1125 */
1126 if (zio->io_type == ZIO_TYPE_READ &&
1127 zio->io_error == ECKSUM &&
1128 vd->vdev_ops->vdev_op_leaf &&
1129 vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
1130 return (B_FALSE);
1131 }
1132 }
1133
1134 /*
1135 * For probe failure, we want to avoid posting ereports if we've
1136 * already removed the device in the meantime.
1137 */
1138 if (vd != NULL &&
1139 strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
1140 (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
1141 return (B_FALSE);
1142
1143 /* Ignore bogus delay events (like from ioctls or unqueued IOs) */
1144 if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
1145 (zio != NULL) && (!zio->io_timestamp)) {
1146 return (B_FALSE);
1147 }
1148 #else
1149 (void) subclass, (void) spa, (void) vd, (void) zio;
1150 #endif
1151 return (B_TRUE);
1152 }
1153
1154 /*
1155 * Post an ereport for the given subclass
1156 *
1157 * Returns
1158 * - 0 if an event was posted
1159 * - EINVAL if there was a problem posting the event
1160 * - EBUSY if the event was rate limited
1161 * - EALREADY if the event was already posted (duplicate)
1162 */
1163 int
1164 zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
1165 const zbookmark_phys_t *zb, zio_t *zio, uint64_t state)
1166 {
1167 int rc = 0;
1168 #ifdef _KERNEL
1169 nvlist_t *ereport = NULL;
1170 nvlist_t *detector = NULL;
1171
1172 if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
1173 return (EINVAL);
1174
1175 if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0))
1176 return (SET_ERROR(EALREADY));
1177
1178 if (zfs_is_ratelimiting_event(subclass, vd))
1179 return (SET_ERROR(EBUSY));
1180
1181 if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
1182 zb, zio, state, 0))
1183 return (SET_ERROR(EINVAL)); /* couldn't post event */
1184
1185 if (ereport == NULL)
1186 return (SET_ERROR(EINVAL));
1187
1188 /* Cleanup is handled by the callback function */
1189 rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
1190 #else
1191 (void) subclass, (void) spa, (void) vd, (void) zb, (void) zio,
1192 (void) state;
1193 #endif
1194 return (rc);
1195 }
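/*
 * For example, an I/O error path would typically call zfs_ereport_post()
 * along these lines (illustrative only):
 *
 *	(void) zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd,
 *	    &zio->io_bookmark, zio, 0);
 *
 * and ignore EBUSY/EALREADY returns, since those simply mean the event was
 * rate limited or a recent duplicate.
 */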
1196
1197 /*
1198 * Prepare a checksum ereport
1199 *
1200 * Returns
1201 * - 0 if an event was posted
1202 * - EINVAL if there was a problem posting the event
1203 * - EBUSY if the event was rate limited
1204 * - EALREADY if the event was already posted (duplicate)
1205 */
1206 int
1207 zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
1208 struct zio *zio, uint64_t offset, uint64_t length, zio_bad_cksum_t *info)
1209 {
1210 zio_cksum_report_t *report;
1211
1212 #ifdef _KERNEL
1213 if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
1214 return (SET_ERROR(EINVAL));
1215
1216 if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
1217 offset, length))
1218 return (SET_ERROR(EALREADY));
1219
1220 if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
1221 return (SET_ERROR(EBUSY));
1222 #else
1223 (void) zb, (void) offset;
1224 #endif
1225
1226 report = kmem_zalloc(sizeof (*report), KM_SLEEP);
1227
1228 zio_vsd_default_cksum_report(zio, report);
1229
1230 /* copy the checksum failure information if it was provided */
1231 if (info != NULL) {
1232 report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP);
1233 memcpy(report->zcr_ckinfo, info, sizeof (*info));
1234 }
1235
1236 report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift;
1237 report->zcr_align =
1238 vdev_psize_to_asize(vd->vdev_top, report->zcr_sector);
1239 report->zcr_length = length;
1240
1241 #ifdef _KERNEL
1242 (void) zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
1243 FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length);
1244
1245 if (report->zcr_ereport == NULL) {
1246 zfs_ereport_free_checksum(report);
1247 return (0);
1248 }
1249 #endif
1250
1251 mutex_enter(&spa->spa_errlist_lock);
1252 report->zcr_next = zio->io_logical->io_cksum_report;
1253 zio->io_logical->io_cksum_report = report;
1254 mutex_exit(&spa->spa_errlist_lock);
1255 return (0);
1256 }
1257
1258 void
1259 zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data,
1260 const abd_t *bad_data, boolean_t drop_if_identical)
1261 {
1262 #ifdef _KERNEL
1263 zfs_ecksum_info_t *info;
1264
1265 info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo,
1266 good_data, bad_data, report->zcr_length, drop_if_identical);
1267 if (info != NULL)
1268 zfs_zevent_post(report->zcr_ereport,
1269 report->zcr_detector, zfs_zevent_post_cb);
1270 else
1271 zfs_zevent_post_cb(report->zcr_ereport, report->zcr_detector);
1272
1273 report->zcr_ereport = report->zcr_detector = NULL;
1274 if (info != NULL)
1275 kmem_free(info, sizeof (*info));
1276 #else
1277 (void) report, (void) good_data, (void) bad_data,
1278 (void) drop_if_identical;
1279 #endif
1280 }
1281
1282 void
1283 zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
1284 {
1285 #ifdef _KERNEL
1286 if (rpt->zcr_ereport != NULL) {
1287 fm_nvlist_destroy(rpt->zcr_ereport,
1288 FM_NVA_FREE);
1289 fm_nvlist_destroy(rpt->zcr_detector,
1290 FM_NVA_FREE);
1291 }
1292 #endif
1293 rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo);
1294
1295 if (rpt->zcr_ckinfo != NULL)
1296 kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo));
1297
1298 kmem_free(rpt, sizeof (*rpt));
1299 }
1300
1301 /*
1302 * Post a checksum ereport
1303 *
1304 * Returns
1305 * - 0 if an event was posted
1306 * - EINVAL if there was a problem posting the event
1307 * - EBUSY if the event was rate limited
1308 * - EALREADY if the event was already posted (duplicate)
1309 */
1310 int
1311 zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
1312 struct zio *zio, uint64_t offset, uint64_t length,
1313 const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc)
1314 {
1315 int rc = 0;
1316 #ifdef _KERNEL
1317 nvlist_t *ereport = NULL;
1318 nvlist_t *detector = NULL;
1319 zfs_ecksum_info_t *info;
1320
1321 if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
1322 return (SET_ERROR(EINVAL));
1323
1324 if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
1325 offset, length))
1326 return (SET_ERROR(EALREADY));
1327
1328 if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
1329 return (SET_ERROR(EBUSY));
1330
1331 if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
1332 spa, vd, zb, zio, offset, length) || (ereport == NULL)) {
1333 return (SET_ERROR(EINVAL));
1334 }
1335
1336 info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
1337 B_FALSE);
1338
1339 if (info != NULL) {
1340 rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
1341 kmem_free(info, sizeof (*info));
1342 }
1343 #else
1344 (void) spa, (void) vd, (void) zb, (void) zio, (void) offset,
1345 (void) length, (void) good_data, (void) bad_data, (void) zbc;
1346 #endif
1347 return (rc);
1348 }
1349
1350 /*
1351 * The 'sysevent.fs.zfs.*' events are signals posted to notify user space of
1352 * a change in the pool. All sysevents are listed in sys/sysevent/eventdefs.h
1353 * and are designed to be consumed by the ZFS Event Daemon (ZED). For
1354 * additional details refer to the zed(8) man page.
1355 */
1356 nvlist_t *
1357 zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name,
1358 nvlist_t *aux)
1359 {
1360 nvlist_t *resource = NULL;
1361 #ifdef _KERNEL
1362 char class[64];
1363
1364 if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
1365 return (NULL);
1366
1367 if ((resource = fm_nvlist_create(NULL)) == NULL)
1368 return (NULL);
1369
1370 (void) snprintf(class, sizeof (class), "%s.%s.%s", type,
1371 ZFS_ERROR_CLASS, name);
1372 VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION));
1373 VERIFY0(nvlist_add_string(resource, FM_CLASS, class));
1374 VERIFY0(nvlist_add_string(resource,
1375 FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa)));
1376 VERIFY0(nvlist_add_uint64(resource,
1377 FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)));
1378 VERIFY0(nvlist_add_uint64(resource,
1379 FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, spa_state(spa)));
1380 VERIFY0(nvlist_add_int32(resource,
1381 FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa)));
1382
1383 if (vd) {
1384 VERIFY0(nvlist_add_uint64(resource,
1385 FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid));
1386 VERIFY0(nvlist_add_uint64(resource,
1387 FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state));
1388 if (vd->vdev_path != NULL)
1389 VERIFY0(nvlist_add_string(resource,
1390 FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path));
1391 if (vd->vdev_devid != NULL)
1392 VERIFY0(nvlist_add_string(resource,
1393 FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid));
1394 if (vd->vdev_fru != NULL)
1395 VERIFY0(nvlist_add_string(resource,
1396 FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru));
1397 if (vd->vdev_enc_sysfs_path != NULL)
1398 VERIFY0(nvlist_add_string(resource,
1399 FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
1400 vd->vdev_enc_sysfs_path));
1401 }
1402
1403 /* also copy any optional payload data */
1404 if (aux) {
1405 nvpair_t *elem = NULL;
1406
1407 while ((elem = nvlist_next_nvpair(aux, elem)) != NULL)
1408 (void) nvlist_add_nvpair(resource, elem);
1409 }
1410 #else
1411 (void) spa, (void) vd, (void) type, (void) name, (void) aux;
1412 #endif
1413 return (resource);
1414 }
1415
1416 static void
1417 zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name,
1418 nvlist_t *aux)
1419 {
1420 #ifdef _KERNEL
1421 nvlist_t *resource;
1422
1423 resource = zfs_event_create(spa, vd, type, name, aux);
1424 if (resource)
1425 zfs_zevent_post(resource, NULL, zfs_zevent_post_cb);
1426 #else
1427 (void) spa, (void) vd, (void) type, (void) name, (void) aux;
1428 #endif
1429 }
1430
1431 /*
1432 * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev
1433 * has been removed from the system. This will cause the DE to ignore any
1434 * recent I/O errors, inferring that they are due to the asynchronous device
1435 * removal.
1436 */
1437 void
1438 zfs_post_remove(spa_t *spa, vdev_t *vd, boolean_t by_kernel)
1439 {
1440 nvlist_t *aux = NULL;
1441
1442 if (by_kernel) {
1443 /*
1444 * Add optional supplemental keys to payload
1445 */
1446 aux = fm_nvlist_create(NULL);
1447 if (aux)
1448 fnvlist_add_boolean(aux, "by_kernel");
1449 }
1450
1451 zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, aux);
1452
1453 if (by_kernel && aux)
1454 fm_nvlist_destroy(aux, FM_NVA_FREE);
1455 }
1456
1457 /*
1458 * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool
1459 * has the 'autoreplace' property set, and therefore any broken vdevs will be
1460 * handled by higher level logic, and no vdev fault should be generated.
1461 */
1462 void
1463 zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
1464 {
1465 zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_AUTOREPLACE, NULL);
1466 }
1467
1468 /*
1469 * The 'resource.fs.zfs.statechange' event is an internal signal that the
1470 * given vdev has transitioned its state to DEGRADED or HEALTHY. This will
1471 * cause the retire agent to repair any outstanding fault management cases
1472 * open because the device was not found (fault.fs.zfs.device).
1473 */
1474 void
1475 zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate)
1476 {
1477 #ifdef _KERNEL
1478 nvlist_t *aux;
1479
1480 /*
1481 * Add optional supplemental keys to payload
1482 */
1483 aux = fm_nvlist_create(NULL);
1484 if (vd && aux) {
1485 if (vd->vdev_physpath) {
1486 fnvlist_add_string(aux,
1487 FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH,
1488 vd->vdev_physpath);
1489 }
1490 if (vd->vdev_enc_sysfs_path) {
1491 fnvlist_add_string(aux,
1492 FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
1493 vd->vdev_enc_sysfs_path);
1494 }
1495
1496 fnvlist_add_uint64(aux,
1497 FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate);
1498 }
1499
1500 zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_STATECHANGE,
1501 aux);
1502
1503 if (aux)
1504 fm_nvlist_destroy(aux, FM_NVA_FREE);
1505 #else
1506 (void) spa, (void) vd, (void) laststate;
1507 #endif
1508 }
1509
1510 #ifdef _KERNEL
1511 void
1512 zfs_ereport_init(void)
1513 {
1514 mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL);
1515 list_create(&recent_events_list, sizeof (recent_events_node_t),
1516 offsetof(recent_events_node_t, re_list_link));
1517 avl_create(&recent_events_tree, recent_events_compare,
1518 sizeof (recent_events_node_t), offsetof(recent_events_node_t,
1519 re_tree_link));
1520 }
1521
1522 /*
1523 * This 'early' fini needs to run before zfs_fini(), which on Linux waits
1524 * for the system_delay_taskq to drain.
1525 */
1526 void
1527 zfs_ereport_taskq_fini(void)
1528 {
1529 mutex_enter(&recent_events_lock);
1530 if (recent_events_cleaner_tqid != 0) {
1531 taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid);
1532 recent_events_cleaner_tqid = 0;
1533 }
1534 mutex_exit(&recent_events_lock);
1535 }
1536
1537 void
1538 zfs_ereport_fini(void)
1539 {
1540 recent_events_node_t *entry;
1541
1542 while ((entry = list_remove_head(&recent_events_list)) != NULL) {
1543 avl_remove(&recent_events_tree, entry);
1544 kmem_free(entry, sizeof (*entry));
1545 }
1546 avl_destroy(&recent_events_tree);
1547 list_destroy(&recent_events_list);
1548 mutex_destroy(&recent_events_lock);
1549 }
1550
1551 void
1552 zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, const char *name)
1553 {
1554 nvlist_t *aux;
1555
1556 aux = fm_nvlist_create(NULL);
1557 fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME, name);
1558
1559 zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux);
1560 fm_nvlist_destroy(aux, FM_NVA_FREE);
1561 }
1562
1563 /*
1564 * Post an event when a zvol is created or removed
1565 *
1566 * This is currently only used by macOS, since it uses the event to create
1567 * symlinks between the volume name (mypool/myvol) and the actual /dev
1568 * device (/dev/disk3). For example:
1569 *
1570 * /var/run/zfs/dsk/mypool/myvol -> /dev/disk3
1571 *
1572 * name: The full name of the zvol ("mypool/myvol")
1573 * dev_name: The full /dev name for the zvol ("/dev/disk3")
1574 * raw_name: The raw /dev name for the zvol ("/dev/rdisk3")
1575 */
1576 void
1577 zfs_ereport_zvol_post(const char *subclass, const char *name,
1578 const char *dev_name, const char *raw_name)
1579 {
1580 nvlist_t *aux;
1581 char *r;
1582
1583 boolean_t locked = mutex_owned(&spa_namespace_lock);
1584 if (!locked) mutex_enter(&spa_namespace_lock);
1585 spa_t *spa = spa_lookup(name);
1586 if (!locked) mutex_exit(&spa_namespace_lock);
1587
1588 if (spa == NULL)
1589 return;
1590
1591 aux = fm_nvlist_create(NULL);
1592 fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME, dev_name);
1593 fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME,
1594 raw_name);
1595 r = strchr(name, '/');
1596 if (r && r[1])
1597 fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VOLUME, &r[1]);
1598
1599 zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux);
1600 fm_nvlist_destroy(aux, FM_NVA_FREE);
1601 }
1602
1603 EXPORT_SYMBOL(zfs_ereport_post);
1604 EXPORT_SYMBOL(zfs_ereport_is_valid);
1605 EXPORT_SYMBOL(zfs_ereport_post_checksum);
1606 EXPORT_SYMBOL(zfs_post_remove);
1607 EXPORT_SYMBOL(zfs_post_autoreplace);
1608 EXPORT_SYMBOL(zfs_post_state_change);
1609
1610 ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW,
1611 "Maximum recent zevents records to retain for duplicate checking");
1612 ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW,
1613 "Expiration time for recent zevents records");
1614 #endif /* _KERNEL */
1615