1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Copyright (c) 2012,2021 by Delphix. All rights reserved. 29 */ 30 31 #include <sys/spa.h> 32 #include <sys/spa_impl.h> 33 #include <sys/vdev.h> 34 #include <sys/vdev_impl.h> 35 #include <sys/zio.h> 36 #include <sys/zio_checksum.h> 37 38 #include <sys/fm/fs/zfs.h> 39 #include <sys/fm/protocol.h> 40 #include <sys/fm/util.h> 41 #include <sys/sysevent.h> 42 43 /* 44 * This general routine is responsible for generating all the different ZFS 45 * ereports. The payload is dependent on the class, and which arguments are 46 * supplied to the function: 47 * 48 * EREPORT POOL VDEV IO 49 * block X X X 50 * data X X 51 * device X X 52 * pool X 53 * 54 * If we are in a loading state, all errors are chained together by the same 55 * SPA-wide ENA (Error Numeric Association). 56 * 57 * For isolated I/O requests, we get the ENA from the zio_t. The propagation 58 * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want 59 * to chain together all ereports associated with a logical piece of data. For 60 * read I/Os, there are basically three 'types' of I/O, which form a roughly 61 * layered diagram: 62 * 63 * +---------------+ 64 * | Aggregate I/O | No associated logical data or device 65 * +---------------+ 66 * | 67 * V 68 * +---------------+ Reads associated with a piece of logical data. 69 * | Read I/O | This includes reads on behalf of RAID-Z, 70 * +---------------+ mirrors, gang blocks, retries, etc. 71 * | 72 * V 73 * +---------------+ Reads associated with a particular device, but 74 * | Physical I/O | no logical data. Issued as part of vdev caching 75 * +---------------+ and I/O aggregation. 76 * 77 * Note that 'physical I/O' here is not the same terminology as used in the rest 78 * of ZIO. Typically, 'physical I/O' simply means that there is no attached 79 * blockpointer. But I/O with no associated block pointer can still be related 80 * to a logical piece of data (i.e. RAID-Z requests). 81 * 82 * Purely physical I/O always have unique ENAs. They are not related to a 83 * particular piece of logical data, and therefore cannot be chained together. 84 * We still generate an ereport, but the DE doesn't correlate it with any 85 * logical piece of data. When such an I/O fails, the delegated I/O requests 86 * will issue a retry, which will trigger the 'real' ereport with the correct 87 * ENA. 88 * 89 * We keep track of the ENA for a ZIO chain through the 'io_logical' member. 90 * When a new logical I/O is issued, we set this to point to itself. Child I/Os 91 * then inherit this pointer, so that when it is first set subsequent failures 92 * will use the same ENA. For vdev cache fill and queue aggregation I/O, 93 * this pointer is set to NULL, and no ereport will be generated (since it 94 * doesn't actually correspond to any particular device or piece of data, 95 * and the caller will always retry without caching or queueing anyway). 96 * 97 * For checksum errors, we want to include more information about the actual 98 * error which occurs. Accordingly, we build an ereport when the error is 99 * noticed, but instead of sending it in immediately, we hang it off of the 100 * io_cksum_report field of the logical IO. When the logical IO completes 101 * (successfully or not), zfs_ereport_finish_checksum() is called with the 102 * good and bad versions of the buffer (if available), and we annotate the 103 * ereport with information about the differences. 104 */ 105 106 #ifdef _KERNEL 107 /* 108 * Duplicate ereport Detection 109 * 110 * Some ereports are retained momentarily for detecting duplicates. These 111 * are kept in a recent_events_node_t in both a time-ordered list and an AVL 112 * tree of recent unique ereports. 113 * 114 * The lifespan of these recent ereports is bounded (15 mins) and a cleaner 115 * task is used to purge stale entries. 116 */ 117 static list_t recent_events_list; 118 static avl_tree_t recent_events_tree; 119 static kmutex_t recent_events_lock; 120 static taskqid_t recent_events_cleaner_tqid; 121 122 /* 123 * Each node is about 128 bytes so 2,000 would consume 1/4 MiB. 124 * 125 * This setting can be changed dynamically and setting it to zero 126 * disables duplicate detection. 127 */ 128 static unsigned int zfs_zevent_retain_max = 2000; 129 130 /* 131 * The lifespan for a recent ereport entry. The default of 15 minutes is 132 * intended to outlive the zfs diagnosis engine's threshold of 10 errors 133 * over a period of 10 minutes. 134 */ 135 static unsigned int zfs_zevent_retain_expire_secs = 900; 136 137 typedef enum zfs_subclass { 138 ZSC_IO, 139 ZSC_DATA, 140 ZSC_CHECKSUM 141 } zfs_subclass_t; 142 143 typedef struct { 144 /* common criteria */ 145 uint64_t re_pool_guid; 146 uint64_t re_vdev_guid; 147 int re_io_error; 148 uint64_t re_io_size; 149 uint64_t re_io_offset; 150 zfs_subclass_t re_subclass; 151 zio_priority_t re_io_priority; 152 153 /* logical zio criteria (optional) */ 154 zbookmark_phys_t re_io_bookmark; 155 156 /* internal state */ 157 avl_node_t re_tree_link; 158 list_node_t re_list_link; 159 uint64_t re_timestamp; 160 } recent_events_node_t; 161 162 static int 163 recent_events_compare(const void *a, const void *b) 164 { 165 const recent_events_node_t *node1 = a; 166 const recent_events_node_t *node2 = b; 167 int cmp; 168 169 /* 170 * The comparison order here is somewhat arbitrary. 171 * What's important is that if every criteria matches, then it 172 * is a duplicate (i.e. compare returns 0) 173 */ 174 if ((cmp = TREE_CMP(node1->re_subclass, node2->re_subclass)) != 0) 175 return (cmp); 176 if ((cmp = TREE_CMP(node1->re_pool_guid, node2->re_pool_guid)) != 0) 177 return (cmp); 178 if ((cmp = TREE_CMP(node1->re_vdev_guid, node2->re_vdev_guid)) != 0) 179 return (cmp); 180 if ((cmp = TREE_CMP(node1->re_io_error, node2->re_io_error)) != 0) 181 return (cmp); 182 if ((cmp = TREE_CMP(node1->re_io_priority, node2->re_io_priority)) != 0) 183 return (cmp); 184 if ((cmp = TREE_CMP(node1->re_io_size, node2->re_io_size)) != 0) 185 return (cmp); 186 if ((cmp = TREE_CMP(node1->re_io_offset, node2->re_io_offset)) != 0) 187 return (cmp); 188 189 const zbookmark_phys_t *zb1 = &node1->re_io_bookmark; 190 const zbookmark_phys_t *zb2 = &node2->re_io_bookmark; 191 192 if ((cmp = TREE_CMP(zb1->zb_objset, zb2->zb_objset)) != 0) 193 return (cmp); 194 if ((cmp = TREE_CMP(zb1->zb_object, zb2->zb_object)) != 0) 195 return (cmp); 196 if ((cmp = TREE_CMP(zb1->zb_level, zb2->zb_level)) != 0) 197 return (cmp); 198 if ((cmp = TREE_CMP(zb1->zb_blkid, zb2->zb_blkid)) != 0) 199 return (cmp); 200 201 return (0); 202 } 203 204 /* 205 * workaround: vdev properties don't have inheritance 206 */ 207 static uint64_t 208 vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop) 209 { 210 uint64_t propdef, propval; 211 212 propdef = vdev_prop_default_numeric(prop); 213 switch (prop) { 214 case VDEV_PROP_CHECKSUM_N: 215 propval = vd->vdev_checksum_n; 216 break; 217 case VDEV_PROP_CHECKSUM_T: 218 propval = vd->vdev_checksum_t; 219 break; 220 case VDEV_PROP_IO_N: 221 propval = vd->vdev_io_n; 222 break; 223 case VDEV_PROP_IO_T: 224 propval = vd->vdev_io_t; 225 break; 226 case VDEV_PROP_SLOW_IO_N: 227 propval = vd->vdev_slow_io_n; 228 break; 229 case VDEV_PROP_SLOW_IO_T: 230 propval = vd->vdev_slow_io_t; 231 break; 232 default: 233 propval = propdef; 234 break; 235 } 236 237 if (propval != propdef) 238 return (propval); 239 240 if (vd->vdev_parent == NULL) 241 return (propdef); 242 243 return (vdev_prop_get_inherited(vd->vdev_parent, prop)); 244 } 245 246 static void zfs_ereport_schedule_cleaner(void); 247 248 /* 249 * background task to clean stale recent event nodes. 250 */ 251 static void 252 zfs_ereport_cleaner(void *arg) 253 { 254 recent_events_node_t *entry; 255 uint64_t now = gethrtime(); 256 257 /* 258 * purge expired entries 259 */ 260 mutex_enter(&recent_events_lock); 261 while ((entry = list_tail(&recent_events_list)) != NULL) { 262 uint64_t age = NSEC2SEC(now - entry->re_timestamp); 263 if (age <= zfs_zevent_retain_expire_secs) 264 break; 265 266 /* remove expired node */ 267 avl_remove(&recent_events_tree, entry); 268 list_remove(&recent_events_list, entry); 269 kmem_free(entry, sizeof (*entry)); 270 } 271 272 /* Restart the cleaner if more entries remain */ 273 recent_events_cleaner_tqid = 0; 274 if (!list_is_empty(&recent_events_list)) 275 zfs_ereport_schedule_cleaner(); 276 277 mutex_exit(&recent_events_lock); 278 } 279 280 static void 281 zfs_ereport_schedule_cleaner(void) 282 { 283 ASSERT(MUTEX_HELD(&recent_events_lock)); 284 285 uint64_t timeout = SEC2NSEC(zfs_zevent_retain_expire_secs + 1); 286 287 recent_events_cleaner_tqid = taskq_dispatch_delay( 288 system_delay_taskq, zfs_ereport_cleaner, NULL, TQ_SLEEP, 289 ddi_get_lbolt() + NSEC_TO_TICK(timeout)); 290 } 291 292 /* 293 * Clear entries for a given vdev or all vdevs in a pool when vdev == NULL 294 */ 295 void 296 zfs_ereport_clear(spa_t *spa, vdev_t *vd) 297 { 298 uint64_t vdev_guid, pool_guid; 299 300 ASSERT(vd != NULL || spa != NULL); 301 if (vd == NULL) { 302 vdev_guid = 0; 303 pool_guid = spa_guid(spa); 304 } else { 305 vdev_guid = vd->vdev_guid; 306 pool_guid = 0; 307 } 308 309 mutex_enter(&recent_events_lock); 310 311 recent_events_node_t *next = list_head(&recent_events_list); 312 while (next != NULL) { 313 recent_events_node_t *entry = next; 314 315 next = list_next(&recent_events_list, next); 316 317 if (entry->re_vdev_guid == vdev_guid || 318 entry->re_pool_guid == pool_guid) { 319 avl_remove(&recent_events_tree, entry); 320 list_remove(&recent_events_list, entry); 321 kmem_free(entry, sizeof (*entry)); 322 } 323 } 324 325 mutex_exit(&recent_events_lock); 326 } 327 328 /* 329 * Check if an ereport would be a duplicate of one recently posted. 330 * 331 * An ereport is considered a duplicate if the set of criteria in 332 * recent_events_node_t all match. 333 * 334 * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM 335 * are candidates for duplicate checking. 336 */ 337 static boolean_t 338 zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd, 339 const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size) 340 { 341 recent_events_node_t search = {0}, *entry; 342 343 if (vd == NULL || zio == NULL) 344 return (B_FALSE); 345 346 if (zfs_zevent_retain_max == 0) 347 return (B_FALSE); 348 349 if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) 350 search.re_subclass = ZSC_IO; 351 else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0) 352 search.re_subclass = ZSC_DATA; 353 else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) 354 search.re_subclass = ZSC_CHECKSUM; 355 else 356 return (B_FALSE); 357 358 search.re_pool_guid = spa_guid(spa); 359 search.re_vdev_guid = vd->vdev_guid; 360 search.re_io_error = zio->io_error; 361 search.re_io_priority = zio->io_priority; 362 /* if size is supplied use it over what's in zio */ 363 if (size) { 364 search.re_io_size = size; 365 search.re_io_offset = offset; 366 } else { 367 search.re_io_size = zio->io_size; 368 search.re_io_offset = zio->io_offset; 369 } 370 371 /* grab optional logical zio criteria */ 372 if (zb != NULL) { 373 search.re_io_bookmark.zb_objset = zb->zb_objset; 374 search.re_io_bookmark.zb_object = zb->zb_object; 375 search.re_io_bookmark.zb_level = zb->zb_level; 376 search.re_io_bookmark.zb_blkid = zb->zb_blkid; 377 } 378 379 uint64_t now = gethrtime(); 380 381 mutex_enter(&recent_events_lock); 382 383 /* check if we have seen this one recently */ 384 entry = avl_find(&recent_events_tree, &search, NULL); 385 if (entry != NULL) { 386 uint64_t age = NSEC2SEC(now - entry->re_timestamp); 387 388 /* 389 * There is still an active cleaner (since we're here). 390 * Reset the last seen time for this duplicate entry 391 * so that its lifespand gets extended. 392 */ 393 list_remove(&recent_events_list, entry); 394 list_insert_head(&recent_events_list, entry); 395 entry->re_timestamp = now; 396 397 zfs_zevent_track_duplicate(); 398 mutex_exit(&recent_events_lock); 399 400 return (age <= zfs_zevent_retain_expire_secs); 401 } 402 403 if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) { 404 /* recycle oldest node */ 405 entry = list_tail(&recent_events_list); 406 ASSERT(entry != NULL); 407 list_remove(&recent_events_list, entry); 408 avl_remove(&recent_events_tree, entry); 409 } else { 410 entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP); 411 } 412 413 /* record this as a recent ereport */ 414 *entry = search; 415 avl_add(&recent_events_tree, entry); 416 list_insert_head(&recent_events_list, entry); 417 entry->re_timestamp = now; 418 419 /* Start a cleaner if not already scheduled */ 420 if (recent_events_cleaner_tqid == 0) 421 zfs_ereport_schedule_cleaner(); 422 423 mutex_exit(&recent_events_lock); 424 return (B_FALSE); 425 } 426 427 void 428 zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector) 429 { 430 if (nvl) 431 fm_nvlist_destroy(nvl, FM_NVA_FREE); 432 433 if (detector) 434 fm_nvlist_destroy(detector, FM_NVA_FREE); 435 } 436 437 /* 438 * We want to rate limit ZIO delay, deadman, and checksum events so as to not 439 * flood zevent consumers when a disk is acting up. 440 * 441 * Returns 1 if we're ratelimiting, 0 if not. 442 */ 443 static int 444 zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd) 445 { 446 int rc = 0; 447 /* 448 * zfs_ratelimit() returns 1 if we're *not* ratelimiting and 0 if we 449 * are. Invert it to get our return value. 450 */ 451 if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) { 452 rc = !zfs_ratelimit(&vd->vdev_delay_rl); 453 } else if (strcmp(subclass, FM_EREPORT_ZFS_DEADMAN) == 0) { 454 rc = !zfs_ratelimit(&vd->vdev_deadman_rl); 455 } else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) { 456 rc = !zfs_ratelimit(&vd->vdev_checksum_rl); 457 } 458 459 if (rc) { 460 /* We're rate limiting */ 461 fm_erpt_dropped_increment(); 462 } 463 464 return (rc); 465 } 466 467 /* 468 * Return B_TRUE if the event actually posted, B_FALSE if not. 469 */ 470 static boolean_t 471 zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, 472 const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, 473 zio_t *zio, uint64_t stateoroffset, uint64_t size) 474 { 475 nvlist_t *ereport, *detector; 476 477 uint64_t ena; 478 char class[64]; 479 480 if ((ereport = fm_nvlist_create(NULL)) == NULL) 481 return (B_FALSE); 482 483 if ((detector = fm_nvlist_create(NULL)) == NULL) { 484 fm_nvlist_destroy(ereport, FM_NVA_FREE); 485 return (B_FALSE); 486 } 487 488 /* 489 * Serialize ereport generation 490 */ 491 mutex_enter(&spa->spa_errlist_lock); 492 493 /* 494 * Determine the ENA to use for this event. If we are in a loading 495 * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use 496 * a root zio-wide ENA. Otherwise, simply use a unique ENA. 497 */ 498 if (spa_load_state(spa) != SPA_LOAD_NONE) { 499 if (spa->spa_ena == 0) 500 spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); 501 ena = spa->spa_ena; 502 } else if (zio != NULL && zio->io_logical != NULL) { 503 if (zio->io_logical->io_ena == 0) 504 zio->io_logical->io_ena = 505 fm_ena_generate(0, FM_ENA_FMT1); 506 ena = zio->io_logical->io_ena; 507 } else { 508 ena = fm_ena_generate(0, FM_ENA_FMT1); 509 } 510 511 /* 512 * Construct the full class, detector, and other standard FMA fields. 513 */ 514 (void) snprintf(class, sizeof (class), "%s.%s", 515 ZFS_ERROR_CLASS, subclass); 516 517 fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa), 518 vd != NULL ? vd->vdev_guid : 0); 519 520 fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL); 521 522 /* 523 * Construct the per-ereport payload, depending on which parameters are 524 * passed in. 525 */ 526 527 /* 528 * Generic payload members common to all ereports. 529 */ 530 fm_payload_set(ereport, 531 FM_EREPORT_PAYLOAD_ZFS_POOL, DATA_TYPE_STRING, spa_name(spa), 532 FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa), 533 FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, DATA_TYPE_UINT64, 534 (uint64_t)spa_state(spa), 535 FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, 536 (int32_t)spa_load_state(spa), NULL); 537 538 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, 539 DATA_TYPE_STRING, 540 spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? 541 FM_EREPORT_FAILMODE_WAIT : 542 spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? 543 FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC, 544 NULL); 545 546 if (vd != NULL) { 547 vdev_t *pvd = vd->vdev_parent; 548 vdev_queue_t *vq = &vd->vdev_queue; 549 vdev_stat_t *vs = &vd->vdev_stat; 550 vdev_t *spare_vd; 551 uint64_t *spare_guids; 552 char **spare_paths; 553 int i, spare_count; 554 555 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, 556 DATA_TYPE_UINT64, vd->vdev_guid, 557 FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, 558 DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); 559 if (vd->vdev_path != NULL) 560 fm_payload_set(ereport, 561 FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, 562 DATA_TYPE_STRING, vd->vdev_path, NULL); 563 if (vd->vdev_devid != NULL) 564 fm_payload_set(ereport, 565 FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, 566 DATA_TYPE_STRING, vd->vdev_devid, NULL); 567 if (vd->vdev_fru != NULL) 568 fm_payload_set(ereport, 569 FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, 570 DATA_TYPE_STRING, vd->vdev_fru, NULL); 571 if (vd->vdev_enc_sysfs_path != NULL) 572 fm_payload_set(ereport, 573 FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, 574 DATA_TYPE_STRING, vd->vdev_enc_sysfs_path, NULL); 575 if (vd->vdev_ashift) 576 fm_payload_set(ereport, 577 FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT, 578 DATA_TYPE_UINT64, vd->vdev_ashift, NULL); 579 580 if (vq != NULL) { 581 fm_payload_set(ereport, 582 FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS, 583 DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL); 584 fm_payload_set(ereport, 585 FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS, 586 DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL); 587 } 588 589 if (vs != NULL) { 590 fm_payload_set(ereport, 591 FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS, 592 DATA_TYPE_UINT64, vs->vs_read_errors, 593 FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS, 594 DATA_TYPE_UINT64, vs->vs_write_errors, 595 FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS, 596 DATA_TYPE_UINT64, vs->vs_checksum_errors, 597 FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS, 598 DATA_TYPE_UINT64, vs->vs_slow_ios, 599 FM_EREPORT_PAYLOAD_ZFS_VDEV_DIO_VERIFY_ERRORS, 600 DATA_TYPE_UINT64, vs->vs_dio_verify_errors, 601 NULL); 602 } 603 604 if (pvd != NULL) { 605 fm_payload_set(ereport, 606 FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, 607 DATA_TYPE_UINT64, pvd->vdev_guid, 608 FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, 609 DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type, 610 NULL); 611 if (pvd->vdev_path) 612 fm_payload_set(ereport, 613 FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, 614 DATA_TYPE_STRING, pvd->vdev_path, NULL); 615 if (pvd->vdev_devid) 616 fm_payload_set(ereport, 617 FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, 618 DATA_TYPE_STRING, pvd->vdev_devid, NULL); 619 } 620 621 spare_count = spa->spa_spares.sav_count; 622 spare_paths = kmem_zalloc(sizeof (char *) * spare_count, 623 KM_SLEEP); 624 spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count, 625 KM_SLEEP); 626 627 for (i = 0; i < spare_count; i++) { 628 spare_vd = spa->spa_spares.sav_vdevs[i]; 629 if (spare_vd) { 630 spare_paths[i] = spare_vd->vdev_path; 631 spare_guids[i] = spare_vd->vdev_guid; 632 } 633 } 634 635 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS, 636 DATA_TYPE_STRING_ARRAY, spare_count, spare_paths, 637 FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS, 638 DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL); 639 640 kmem_free(spare_guids, sizeof (uint64_t) * spare_count); 641 kmem_free(spare_paths, sizeof (char *) * spare_count); 642 } 643 644 if (zio != NULL) { 645 /* 646 * Payload common to all I/Os. 647 */ 648 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, 649 DATA_TYPE_INT32, zio->io_error, NULL); 650 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, 651 DATA_TYPE_UINT64, zio->io_flags, NULL); 652 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE, 653 DATA_TYPE_UINT32, zio->io_stage, NULL); 654 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE, 655 DATA_TYPE_UINT32, zio->io_pipeline, NULL); 656 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY, 657 DATA_TYPE_UINT64, zio->io_delay, NULL); 658 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP, 659 DATA_TYPE_UINT64, zio->io_timestamp, NULL); 660 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA, 661 DATA_TYPE_UINT64, zio->io_delta, NULL); 662 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, 663 DATA_TYPE_UINT32, zio->io_priority, NULL); 664 665 /* 666 * If the 'size' parameter is non-zero, it indicates this is a 667 * RAID-Z or other I/O where the physical offset and length are 668 * provided for us, instead of within the zio_t. 669 */ 670 if (vd != NULL) { 671 if (size) 672 fm_payload_set(ereport, 673 FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, 674 DATA_TYPE_UINT64, stateoroffset, 675 FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, 676 DATA_TYPE_UINT64, size, NULL); 677 else 678 fm_payload_set(ereport, 679 FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, 680 DATA_TYPE_UINT64, zio->io_offset, 681 FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, 682 DATA_TYPE_UINT64, zio->io_size, NULL); 683 } 684 } else if (vd != NULL) { 685 /* 686 * If we have a vdev but no zio, this is a device fault, and the 687 * 'stateoroffset' parameter indicates the previous state of the 688 * vdev. 689 */ 690 fm_payload_set(ereport, 691 FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, 692 DATA_TYPE_UINT64, stateoroffset, NULL); 693 } 694 695 /* 696 * Payload for I/Os with corresponding logical information. 697 */ 698 if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) { 699 fm_payload_set(ereport, 700 FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, 701 DATA_TYPE_UINT64, zb->zb_objset, 702 FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, 703 DATA_TYPE_UINT64, zb->zb_object, 704 FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, 705 DATA_TYPE_INT64, zb->zb_level, 706 FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, 707 DATA_TYPE_UINT64, zb->zb_blkid, NULL); 708 } 709 710 /* 711 * Payload for tuning the zed 712 */ 713 if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) { 714 uint64_t cksum_n, cksum_t; 715 716 cksum_n = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_N); 717 if (cksum_n != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N)) 718 fm_payload_set(ereport, 719 FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N, 720 DATA_TYPE_UINT64, 721 cksum_n, 722 NULL); 723 724 cksum_t = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_T); 725 if (cksum_t != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T)) 726 fm_payload_set(ereport, 727 FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T, 728 DATA_TYPE_UINT64, 729 cksum_t, 730 NULL); 731 } 732 733 if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) { 734 uint64_t io_n, io_t; 735 736 io_n = vdev_prop_get_inherited(vd, VDEV_PROP_IO_N); 737 if (io_n != vdev_prop_default_numeric(VDEV_PROP_IO_N)) 738 fm_payload_set(ereport, 739 FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N, 740 DATA_TYPE_UINT64, 741 io_n, 742 NULL); 743 744 io_t = vdev_prop_get_inherited(vd, VDEV_PROP_IO_T); 745 if (io_t != vdev_prop_default_numeric(VDEV_PROP_IO_T)) 746 fm_payload_set(ereport, 747 FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T, 748 DATA_TYPE_UINT64, 749 io_t, 750 NULL); 751 } 752 753 if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) { 754 uint64_t slow_io_n, slow_io_t; 755 756 slow_io_n = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_N); 757 if (slow_io_n != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N)) 758 fm_payload_set(ereport, 759 FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N, 760 DATA_TYPE_UINT64, 761 slow_io_n, 762 NULL); 763 764 slow_io_t = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_T); 765 if (slow_io_t != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T)) 766 fm_payload_set(ereport, 767 FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T, 768 DATA_TYPE_UINT64, 769 slow_io_t, 770 NULL); 771 } 772 773 mutex_exit(&spa->spa_errlist_lock); 774 775 *ereport_out = ereport; 776 *detector_out = detector; 777 return (B_TRUE); 778 } 779 780 /* if it's <= 128 bytes, save the corruption directly */ 781 #define ZFM_MAX_INLINE (128 / sizeof (uint64_t)) 782 783 #define MAX_RANGES 16 784 785 typedef struct zfs_ecksum_info { 786 /* inline arrays of bits set and cleared. */ 787 uint64_t zei_bits_set[ZFM_MAX_INLINE]; 788 uint64_t zei_bits_cleared[ZFM_MAX_INLINE]; 789 790 /* 791 * for each range, the number of bits set and cleared. The Hamming 792 * distance between the good and bad buffers is the sum of them all. 793 */ 794 uint32_t zei_range_sets[MAX_RANGES]; 795 uint32_t zei_range_clears[MAX_RANGES]; 796 797 struct zei_ranges { 798 uint32_t zr_start; 799 uint32_t zr_end; 800 } zei_ranges[MAX_RANGES]; 801 802 size_t zei_range_count; 803 uint32_t zei_mingap; 804 uint32_t zei_allowed_mingap; 805 806 } zfs_ecksum_info_t; 807 808 static void 809 update_bad_bits(uint64_t value_arg, uint32_t *count) 810 { 811 size_t i; 812 size_t bits = 0; 813 uint64_t value = BE_64(value_arg); 814 815 /* We store the bits in big-endian (largest-first) order */ 816 for (i = 0; i < 64; i++) { 817 if (value & (1ull << i)) 818 ++bits; 819 } 820 /* update the count of bits changed */ 821 *count += bits; 822 } 823 824 /* 825 * We've now filled up the range array, and need to increase "mingap" and 826 * shrink the range list accordingly. zei_mingap is always the smallest 827 * distance between array entries, so we set the new_allowed_gap to be 828 * one greater than that. We then go through the list, joining together 829 * any ranges which are closer than the new_allowed_gap. 830 * 831 * By construction, there will be at least one. We also update zei_mingap 832 * to the new smallest gap, to prepare for our next invocation. 833 */ 834 static void 835 zei_shrink_ranges(zfs_ecksum_info_t *eip) 836 { 837 uint32_t mingap = UINT32_MAX; 838 uint32_t new_allowed_gap = eip->zei_mingap + 1; 839 840 size_t idx, output; 841 size_t max = eip->zei_range_count; 842 843 struct zei_ranges *r = eip->zei_ranges; 844 845 ASSERT3U(eip->zei_range_count, >, 0); 846 ASSERT3U(eip->zei_range_count, <=, MAX_RANGES); 847 848 output = idx = 0; 849 while (idx < max - 1) { 850 uint32_t start = r[idx].zr_start; 851 uint32_t end = r[idx].zr_end; 852 853 while (idx < max - 1) { 854 idx++; 855 856 uint32_t nstart = r[idx].zr_start; 857 uint32_t nend = r[idx].zr_end; 858 859 uint32_t gap = nstart - end; 860 if (gap < new_allowed_gap) { 861 end = nend; 862 continue; 863 } 864 if (gap < mingap) 865 mingap = gap; 866 break; 867 } 868 r[output].zr_start = start; 869 r[output].zr_end = end; 870 output++; 871 } 872 ASSERT3U(output, <, eip->zei_range_count); 873 eip->zei_range_count = output; 874 eip->zei_mingap = mingap; 875 eip->zei_allowed_mingap = new_allowed_gap; 876 } 877 878 static void 879 zei_add_range(zfs_ecksum_info_t *eip, int start, int end) 880 { 881 struct zei_ranges *r = eip->zei_ranges; 882 size_t count = eip->zei_range_count; 883 884 if (count >= MAX_RANGES) { 885 zei_shrink_ranges(eip); 886 count = eip->zei_range_count; 887 } 888 if (count == 0) { 889 eip->zei_mingap = UINT32_MAX; 890 eip->zei_allowed_mingap = 1; 891 } else { 892 int gap = start - r[count - 1].zr_end; 893 894 if (gap < eip->zei_allowed_mingap) { 895 r[count - 1].zr_end = end; 896 return; 897 } 898 if (gap < eip->zei_mingap) 899 eip->zei_mingap = gap; 900 } 901 r[count].zr_start = start; 902 r[count].zr_end = end; 903 eip->zei_range_count++; 904 } 905 906 static size_t 907 zei_range_total_size(zfs_ecksum_info_t *eip) 908 { 909 struct zei_ranges *r = eip->zei_ranges; 910 size_t count = eip->zei_range_count; 911 size_t result = 0; 912 size_t idx; 913 914 for (idx = 0; idx < count; idx++) 915 result += (r[idx].zr_end - r[idx].zr_start); 916 917 return (result); 918 } 919 920 static zfs_ecksum_info_t * 921 annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, 922 const abd_t *goodabd, const abd_t *badabd, size_t size, 923 boolean_t drop_if_identical) 924 { 925 const uint64_t *good; 926 const uint64_t *bad; 927 928 size_t nui64s = size / sizeof (uint64_t); 929 930 size_t inline_size; 931 int no_inline = 0; 932 size_t idx; 933 size_t range; 934 935 size_t offset = 0; 936 ssize_t start = -1; 937 938 zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP); 939 940 /* don't do any annotation for injected checksum errors */ 941 if (info != NULL && info->zbc_injected) 942 return (eip); 943 944 if (info != NULL && info->zbc_has_cksum) { 945 fm_payload_set(ereport, 946 FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO, 947 DATA_TYPE_STRING, 948 info->zbc_checksum_name, 949 NULL); 950 951 if (info->zbc_byteswapped) { 952 fm_payload_set(ereport, 953 FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP, 954 DATA_TYPE_BOOLEAN, 1, 955 NULL); 956 } 957 } 958 959 if (badabd == NULL || goodabd == NULL) 960 return (eip); 961 962 ASSERT3U(nui64s, <=, UINT32_MAX); 963 ASSERT3U(size, ==, nui64s * sizeof (uint64_t)); 964 ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 965 ASSERT3U(size, <=, UINT32_MAX); 966 967 good = (const uint64_t *) abd_borrow_buf_copy((abd_t *)goodabd, size); 968 bad = (const uint64_t *) abd_borrow_buf_copy((abd_t *)badabd, size); 969 970 /* build up the range list by comparing the two buffers. */ 971 for (idx = 0; idx < nui64s; idx++) { 972 if (good[idx] == bad[idx]) { 973 if (start == -1) 974 continue; 975 976 zei_add_range(eip, start, idx); 977 start = -1; 978 } else { 979 if (start != -1) 980 continue; 981 982 start = idx; 983 } 984 } 985 if (start != -1) 986 zei_add_range(eip, start, idx); 987 988 /* See if it will fit in our inline buffers */ 989 inline_size = zei_range_total_size(eip); 990 if (inline_size > ZFM_MAX_INLINE) 991 no_inline = 1; 992 993 /* 994 * If there is no change and we want to drop if the buffers are 995 * identical, do so. 996 */ 997 if (inline_size == 0 && drop_if_identical) { 998 kmem_free(eip, sizeof (*eip)); 999 abd_return_buf((abd_t *)goodabd, (void *)good, size); 1000 abd_return_buf((abd_t *)badabd, (void *)bad, size); 1001 return (NULL); 1002 } 1003 1004 /* 1005 * Now walk through the ranges, filling in the details of the 1006 * differences. Also convert our uint64_t-array offsets to byte 1007 * offsets. 1008 */ 1009 for (range = 0; range < eip->zei_range_count; range++) { 1010 size_t start = eip->zei_ranges[range].zr_start; 1011 size_t end = eip->zei_ranges[range].zr_end; 1012 1013 for (idx = start; idx < end; idx++) { 1014 uint64_t set, cleared; 1015 1016 // bits set in bad, but not in good 1017 set = ((~good[idx]) & bad[idx]); 1018 // bits set in good, but not in bad 1019 cleared = (good[idx] & (~bad[idx])); 1020 1021 if (!no_inline) { 1022 ASSERT3U(offset, <, inline_size); 1023 eip->zei_bits_set[offset] = set; 1024 eip->zei_bits_cleared[offset] = cleared; 1025 offset++; 1026 } 1027 1028 update_bad_bits(set, &eip->zei_range_sets[range]); 1029 update_bad_bits(cleared, &eip->zei_range_clears[range]); 1030 } 1031 1032 /* convert to byte offsets */ 1033 eip->zei_ranges[range].zr_start *= sizeof (uint64_t); 1034 eip->zei_ranges[range].zr_end *= sizeof (uint64_t); 1035 } 1036 1037 abd_return_buf((abd_t *)goodabd, (void *)good, size); 1038 abd_return_buf((abd_t *)badabd, (void *)bad, size); 1039 1040 eip->zei_allowed_mingap *= sizeof (uint64_t); 1041 inline_size *= sizeof (uint64_t); 1042 1043 /* fill in ereport */ 1044 fm_payload_set(ereport, 1045 FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES, 1046 DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count, 1047 (uint32_t *)eip->zei_ranges, 1048 FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP, 1049 DATA_TYPE_UINT32, eip->zei_allowed_mingap, 1050 FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS, 1051 DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets, 1052 FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS, 1053 DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears, 1054 NULL); 1055 1056 if (!no_inline) { 1057 fm_payload_set(ereport, 1058 FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS, 1059 DATA_TYPE_UINT8_ARRAY, 1060 inline_size, (uint8_t *)eip->zei_bits_set, 1061 FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS, 1062 DATA_TYPE_UINT8_ARRAY, 1063 inline_size, (uint8_t *)eip->zei_bits_cleared, 1064 NULL); 1065 } 1066 return (eip); 1067 } 1068 #else 1069 void 1070 zfs_ereport_clear(spa_t *spa, vdev_t *vd) 1071 { 1072 (void) spa, (void) vd; 1073 } 1074 #endif 1075 1076 /* 1077 * Make sure our event is still valid for the given zio/vdev/pool. For example, 1078 * we don't want to keep logging events for a faulted or missing vdev. 1079 */ 1080 boolean_t 1081 zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) 1082 { 1083 #ifdef _KERNEL 1084 /* 1085 * If we are doing a spa_tryimport() or in recovery mode, 1086 * ignore errors. 1087 */ 1088 if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || 1089 spa_load_state(spa) == SPA_LOAD_RECOVER) 1090 return (B_FALSE); 1091 1092 /* 1093 * If we are in the middle of opening a pool, and the previous attempt 1094 * failed, don't bother logging any new ereports - we're just going to 1095 * get the same diagnosis anyway. 1096 */ 1097 if (spa_load_state(spa) != SPA_LOAD_NONE && 1098 spa->spa_last_open_failed) 1099 return (B_FALSE); 1100 1101 if (zio != NULL) { 1102 /* If this is not a read or write zio, ignore the error */ 1103 if (zio->io_type != ZIO_TYPE_READ && 1104 zio->io_type != ZIO_TYPE_WRITE) 1105 return (B_FALSE); 1106 1107 if (vd != NULL) { 1108 /* 1109 * If the vdev has already been marked as failing due 1110 * to a failed probe, then ignore any subsequent I/O 1111 * errors, as the DE will automatically fault the vdev 1112 * on the first such failure. This also catches cases 1113 * where vdev_remove_wanted is set and the device has 1114 * not yet been asynchronously placed into the REMOVED 1115 * state. 1116 */ 1117 if (zio->io_vd == vd && !vdev_accessible(vd, zio)) 1118 return (B_FALSE); 1119 1120 /* 1121 * Ignore checksum errors for reads from DTL regions of 1122 * leaf vdevs. 1123 */ 1124 if (zio->io_type == ZIO_TYPE_READ && 1125 zio->io_error == ECKSUM && 1126 vd->vdev_ops->vdev_op_leaf && 1127 vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) 1128 return (B_FALSE); 1129 } 1130 } 1131 1132 /* 1133 * For probe failure, we want to avoid posting ereports if we've 1134 * already removed the device in the meantime. 1135 */ 1136 if (vd != NULL && 1137 strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && 1138 (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) 1139 return (B_FALSE); 1140 1141 /* Ignore bogus delay events (like from ioctls or unqueued IOs) */ 1142 if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) && 1143 (zio != NULL) && (!zio->io_timestamp)) { 1144 return (B_FALSE); 1145 } 1146 #else 1147 (void) subclass, (void) spa, (void) vd, (void) zio; 1148 #endif 1149 return (B_TRUE); 1150 } 1151 1152 /* 1153 * Post an ereport for the given subclass 1154 * 1155 * Returns 1156 * - 0 if an event was posted 1157 * - EINVAL if there was a problem posting event 1158 * - EBUSY if the event was rate limited 1159 * - EALREADY if the event was already posted (duplicate) 1160 */ 1161 int 1162 zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, 1163 const zbookmark_phys_t *zb, zio_t *zio, uint64_t state) 1164 { 1165 int rc = 0; 1166 #ifdef _KERNEL 1167 nvlist_t *ereport = NULL; 1168 nvlist_t *detector = NULL; 1169 1170 if (!zfs_ereport_is_valid(subclass, spa, vd, zio)) 1171 return (EINVAL); 1172 1173 if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0)) 1174 return (SET_ERROR(EALREADY)); 1175 1176 if (zfs_is_ratelimiting_event(subclass, vd)) 1177 return (SET_ERROR(EBUSY)); 1178 1179 if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd, 1180 zb, zio, state, 0)) 1181 return (SET_ERROR(EINVAL)); /* couldn't post event */ 1182 1183 if (ereport == NULL) 1184 return (SET_ERROR(EINVAL)); 1185 1186 /* Cleanup is handled by the callback function */ 1187 rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); 1188 #else 1189 (void) subclass, (void) spa, (void) vd, (void) zb, (void) zio, 1190 (void) state; 1191 #endif 1192 return (rc); 1193 } 1194 1195 /* 1196 * Prepare a checksum ereport 1197 * 1198 * Returns 1199 * - 0 if an event was posted 1200 * - EINVAL if there was a problem posting event 1201 * - EBUSY if the event was rate limited 1202 * - EALREADY if the event was already posted (duplicate) 1203 */ 1204 int 1205 zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, 1206 struct zio *zio, uint64_t offset, uint64_t length, zio_bad_cksum_t *info) 1207 { 1208 zio_cksum_report_t *report; 1209 1210 #ifdef _KERNEL 1211 if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) 1212 return (SET_ERROR(EINVAL)); 1213 1214 if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, 1215 offset, length)) 1216 return (SET_ERROR(EALREADY)); 1217 1218 if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) 1219 return (SET_ERROR(EBUSY)); 1220 #else 1221 (void) zb, (void) offset; 1222 #endif 1223 1224 report = kmem_zalloc(sizeof (*report), KM_SLEEP); 1225 1226 zio_vsd_default_cksum_report(zio, report); 1227 1228 /* copy the checksum failure information if it was provided */ 1229 if (info != NULL) { 1230 report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP); 1231 memcpy(report->zcr_ckinfo, info, sizeof (*info)); 1232 } 1233 1234 report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift; 1235 report->zcr_align = 1236 vdev_psize_to_asize(vd->vdev_top, report->zcr_sector); 1237 report->zcr_length = length; 1238 1239 #ifdef _KERNEL 1240 (void) zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, 1241 FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length); 1242 1243 if (report->zcr_ereport == NULL) { 1244 zfs_ereport_free_checksum(report); 1245 return (0); 1246 } 1247 #endif 1248 1249 mutex_enter(&spa->spa_errlist_lock); 1250 report->zcr_next = zio->io_logical->io_cksum_report; 1251 zio->io_logical->io_cksum_report = report; 1252 mutex_exit(&spa->spa_errlist_lock); 1253 return (0); 1254 } 1255 1256 void 1257 zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data, 1258 const abd_t *bad_data, boolean_t drop_if_identical) 1259 { 1260 #ifdef _KERNEL 1261 zfs_ecksum_info_t *info; 1262 1263 info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo, 1264 good_data, bad_data, report->zcr_length, drop_if_identical); 1265 if (info != NULL) 1266 zfs_zevent_post(report->zcr_ereport, 1267 report->zcr_detector, zfs_zevent_post_cb); 1268 else 1269 zfs_zevent_post_cb(report->zcr_ereport, report->zcr_detector); 1270 1271 report->zcr_ereport = report->zcr_detector = NULL; 1272 if (info != NULL) 1273 kmem_free(info, sizeof (*info)); 1274 #else 1275 (void) report, (void) good_data, (void) bad_data, 1276 (void) drop_if_identical; 1277 #endif 1278 } 1279 1280 void 1281 zfs_ereport_free_checksum(zio_cksum_report_t *rpt) 1282 { 1283 #ifdef _KERNEL 1284 if (rpt->zcr_ereport != NULL) { 1285 fm_nvlist_destroy(rpt->zcr_ereport, 1286 FM_NVA_FREE); 1287 fm_nvlist_destroy(rpt->zcr_detector, 1288 FM_NVA_FREE); 1289 } 1290 #endif 1291 rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo); 1292 1293 if (rpt->zcr_ckinfo != NULL) 1294 kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo)); 1295 1296 kmem_free(rpt, sizeof (*rpt)); 1297 } 1298 1299 /* 1300 * Post a checksum ereport 1301 * 1302 * Returns 1303 * - 0 if an event was posted 1304 * - EINVAL if there was a problem posting event 1305 * - EBUSY if the event was rate limited 1306 * - EALREADY if the event was already posted (duplicate) 1307 */ 1308 int 1309 zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, 1310 struct zio *zio, uint64_t offset, uint64_t length, 1311 const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc) 1312 { 1313 int rc = 0; 1314 #ifdef _KERNEL 1315 nvlist_t *ereport = NULL; 1316 nvlist_t *detector = NULL; 1317 zfs_ecksum_info_t *info; 1318 1319 if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) 1320 return (SET_ERROR(EINVAL)); 1321 1322 if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, 1323 offset, length)) 1324 return (SET_ERROR(EALREADY)); 1325 1326 if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) 1327 return (SET_ERROR(EBUSY)); 1328 1329 if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, 1330 spa, vd, zb, zio, offset, length) || (ereport == NULL)) { 1331 return (SET_ERROR(EINVAL)); 1332 } 1333 1334 info = annotate_ecksum(ereport, zbc, good_data, bad_data, length, 1335 B_FALSE); 1336 1337 if (info != NULL) { 1338 rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); 1339 kmem_free(info, sizeof (*info)); 1340 } 1341 #else 1342 (void) spa, (void) vd, (void) zb, (void) zio, (void) offset, 1343 (void) length, (void) good_data, (void) bad_data, (void) zbc; 1344 #endif 1345 return (rc); 1346 } 1347 1348 /* 1349 * The 'sysevent.fs.zfs.*' events are signals posted to notify user space of 1350 * change in the pool. All sysevents are listed in sys/sysevent/eventdefs.h 1351 * and are designed to be consumed by the ZFS Event Daemon (ZED). For 1352 * additional details refer to the zed(8) man page. 1353 */ 1354 nvlist_t * 1355 zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name, 1356 nvlist_t *aux) 1357 { 1358 nvlist_t *resource = NULL; 1359 #ifdef _KERNEL 1360 char class[64]; 1361 1362 if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) 1363 return (NULL); 1364 1365 if ((resource = fm_nvlist_create(NULL)) == NULL) 1366 return (NULL); 1367 1368 (void) snprintf(class, sizeof (class), "%s.%s.%s", type, 1369 ZFS_ERROR_CLASS, name); 1370 VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION)); 1371 VERIFY0(nvlist_add_string(resource, FM_CLASS, class)); 1372 VERIFY0(nvlist_add_string(resource, 1373 FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa))); 1374 VERIFY0(nvlist_add_uint64(resource, 1375 FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa))); 1376 VERIFY0(nvlist_add_uint64(resource, 1377 FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, spa_state(spa))); 1378 VERIFY0(nvlist_add_int32(resource, 1379 FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa))); 1380 1381 if (vd) { 1382 VERIFY0(nvlist_add_uint64(resource, 1383 FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid)); 1384 VERIFY0(nvlist_add_uint64(resource, 1385 FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state)); 1386 if (vd->vdev_path != NULL) 1387 VERIFY0(nvlist_add_string(resource, 1388 FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path)); 1389 if (vd->vdev_devid != NULL) 1390 VERIFY0(nvlist_add_string(resource, 1391 FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid)); 1392 if (vd->vdev_fru != NULL) 1393 VERIFY0(nvlist_add_string(resource, 1394 FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru)); 1395 if (vd->vdev_enc_sysfs_path != NULL) 1396 VERIFY0(nvlist_add_string(resource, 1397 FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, 1398 vd->vdev_enc_sysfs_path)); 1399 } 1400 1401 /* also copy any optional payload data */ 1402 if (aux) { 1403 nvpair_t *elem = NULL; 1404 1405 while ((elem = nvlist_next_nvpair(aux, elem)) != NULL) 1406 (void) nvlist_add_nvpair(resource, elem); 1407 } 1408 #else 1409 (void) spa, (void) vd, (void) type, (void) name, (void) aux; 1410 #endif 1411 return (resource); 1412 } 1413 1414 static void 1415 zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name, 1416 nvlist_t *aux) 1417 { 1418 #ifdef _KERNEL 1419 nvlist_t *resource; 1420 1421 resource = zfs_event_create(spa, vd, type, name, aux); 1422 if (resource) 1423 zfs_zevent_post(resource, NULL, zfs_zevent_post_cb); 1424 #else 1425 (void) spa, (void) vd, (void) type, (void) name, (void) aux; 1426 #endif 1427 } 1428 1429 /* 1430 * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev 1431 * has been removed from the system. This will cause the DE to ignore any 1432 * recent I/O errors, inferring that they are due to the asynchronous device 1433 * removal. 1434 */ 1435 void 1436 zfs_post_remove(spa_t *spa, vdev_t *vd) 1437 { 1438 zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, NULL); 1439 } 1440 1441 /* 1442 * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool 1443 * has the 'autoreplace' property set, and therefore any broken vdevs will be 1444 * handled by higher level logic, and no vdev fault should be generated. 1445 */ 1446 void 1447 zfs_post_autoreplace(spa_t *spa, vdev_t *vd) 1448 { 1449 zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_AUTOREPLACE, NULL); 1450 } 1451 1452 /* 1453 * The 'resource.fs.zfs.statechange' event is an internal signal that the 1454 * given vdev has transitioned its state to DEGRADED or HEALTHY. This will 1455 * cause the retire agent to repair any outstanding fault management cases 1456 * open because the device was not found (fault.fs.zfs.device). 1457 */ 1458 void 1459 zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate) 1460 { 1461 #ifdef _KERNEL 1462 nvlist_t *aux; 1463 1464 /* 1465 * Add optional supplemental keys to payload 1466 */ 1467 aux = fm_nvlist_create(NULL); 1468 if (vd && aux) { 1469 if (vd->vdev_physpath) { 1470 fnvlist_add_string(aux, 1471 FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH, 1472 vd->vdev_physpath); 1473 } 1474 if (vd->vdev_enc_sysfs_path) { 1475 fnvlist_add_string(aux, 1476 FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, 1477 vd->vdev_enc_sysfs_path); 1478 } 1479 1480 fnvlist_add_uint64(aux, 1481 FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate); 1482 } 1483 1484 zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_STATECHANGE, 1485 aux); 1486 1487 if (aux) 1488 fm_nvlist_destroy(aux, FM_NVA_FREE); 1489 #else 1490 (void) spa, (void) vd, (void) laststate; 1491 #endif 1492 } 1493 1494 #ifdef _KERNEL 1495 void 1496 zfs_ereport_init(void) 1497 { 1498 mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL); 1499 list_create(&recent_events_list, sizeof (recent_events_node_t), 1500 offsetof(recent_events_node_t, re_list_link)); 1501 avl_create(&recent_events_tree, recent_events_compare, 1502 sizeof (recent_events_node_t), offsetof(recent_events_node_t, 1503 re_tree_link)); 1504 } 1505 1506 /* 1507 * This 'early' fini needs to run before zfs_fini() which on Linux waits 1508 * for the system_delay_taskq to drain. 1509 */ 1510 void 1511 zfs_ereport_taskq_fini(void) 1512 { 1513 mutex_enter(&recent_events_lock); 1514 if (recent_events_cleaner_tqid != 0) { 1515 taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid); 1516 recent_events_cleaner_tqid = 0; 1517 } 1518 mutex_exit(&recent_events_lock); 1519 } 1520 1521 void 1522 zfs_ereport_fini(void) 1523 { 1524 recent_events_node_t *entry; 1525 1526 while ((entry = list_remove_head(&recent_events_list)) != NULL) { 1527 avl_remove(&recent_events_tree, entry); 1528 kmem_free(entry, sizeof (*entry)); 1529 } 1530 avl_destroy(&recent_events_tree); 1531 list_destroy(&recent_events_list); 1532 mutex_destroy(&recent_events_lock); 1533 } 1534 1535 void 1536 zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, const char *name) 1537 { 1538 nvlist_t *aux; 1539 1540 aux = fm_nvlist_create(NULL); 1541 fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME, name); 1542 1543 zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux); 1544 fm_nvlist_destroy(aux, FM_NVA_FREE); 1545 } 1546 1547 /* 1548 * Post when a event when a zvol is created or removed 1549 * 1550 * This is currently only used by macOS, since it uses the event to create 1551 * symlinks between the volume name (mypool/myvol) and the actual /dev 1552 * device (/dev/disk3). For example: 1553 * 1554 * /var/run/zfs/dsk/mypool/myvol -> /dev/disk3 1555 * 1556 * name: The full name of the zvol ("mypool/myvol") 1557 * dev_name: The full /dev name for the zvol ("/dev/disk3") 1558 * raw_name: The raw /dev name for the zvol ("/dev/rdisk3") 1559 */ 1560 void 1561 zfs_ereport_zvol_post(const char *subclass, const char *name, 1562 const char *dev_name, const char *raw_name) 1563 { 1564 nvlist_t *aux; 1565 char *r; 1566 1567 boolean_t locked = mutex_owned(&spa_namespace_lock); 1568 if (!locked) mutex_enter(&spa_namespace_lock); 1569 spa_t *spa = spa_lookup(name); 1570 if (!locked) mutex_exit(&spa_namespace_lock); 1571 1572 if (spa == NULL) 1573 return; 1574 1575 aux = fm_nvlist_create(NULL); 1576 fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME, dev_name); 1577 fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME, 1578 raw_name); 1579 r = strchr(name, '/'); 1580 if (r && r[1]) 1581 fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VOLUME, &r[1]); 1582 1583 zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux); 1584 fm_nvlist_destroy(aux, FM_NVA_FREE); 1585 } 1586 1587 EXPORT_SYMBOL(zfs_ereport_post); 1588 EXPORT_SYMBOL(zfs_ereport_is_valid); 1589 EXPORT_SYMBOL(zfs_ereport_post_checksum); 1590 EXPORT_SYMBOL(zfs_post_remove); 1591 EXPORT_SYMBOL(zfs_post_autoreplace); 1592 EXPORT_SYMBOL(zfs_post_state_change); 1593 1594 ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW, 1595 "Maximum recent zevents records to retain for duplicate checking"); 1596 ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW, 1597 "Expiration time for recent zevents records"); 1598 #endif /* _KERNEL */ 1599