// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2012,2021 by Delphix. All rights reserved.
 */

#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>

#include <sys/fm/fs/zfs.h>
#include <sys/fm/protocol.h>
#include <sys/fm/util.h>
#include <sys/sysevent.h>

/*
 * This general routine is responsible for generating all the different ZFS
 * ereports. The payload is dependent on the class, and which arguments are
 * supplied to the function:
 *
 *	EREPORT			POOL	VDEV	IO
 *	block			X	X	X
 *	data			X		X
 *	device			X	X
 *	pool			X
 *
 * If we are in a loading state, all errors are chained together by the same
 * SPA-wide ENA (Error Numeric Association).
 *
 * For isolated I/O requests, we get the ENA from the zio_t. The propagation
 * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want
 * to chain together all ereports associated with a logical piece of data. For
 * read I/Os, there are basically three 'types' of I/O, which form a roughly
 * layered diagram:
 *
 *	+---------------+
 *	| Aggregate I/O |	No associated logical data or device
 *	+---------------+
 *              |
 *              V
 *	+---------------+	Reads associated with a piece of logical data.
 *	|   Read I/O	|	This includes reads on behalf of RAID-Z,
 *	+---------------+	mirrors, gang blocks, retries, etc.
 *              |
 *              V
 *	+---------------+	Reads associated with a particular device, but
 *	| Physical I/O	|	no logical data. Issued as part of vdev caching
 *	+---------------+	and I/O aggregation.
 *
 * Note that 'physical I/O' here is not the same terminology as used in the
 * rest of ZIO. Typically, 'physical I/O' simply means that there is no
 * attached blockpointer. But I/O with no associated block pointer can still
 * be related to a logical piece of data (i.e. RAID-Z requests).
 *
 * Purely physical I/Os always have unique ENAs. They are not related to a
 * particular piece of logical data, and therefore cannot be chained together.
 * We still generate an ereport, but the DE doesn't correlate it with any
 * logical piece of data. When such an I/O fails, the delegated I/O requests
 * will issue a retry, which will trigger the 'real' ereport with the correct
 * ENA.
 *
 * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
 * When a new logical I/O is issued, we set this to point to itself. Child I/Os
 * then inherit this pointer, so that once it is first set, subsequent failures
 * will use the same ENA. For vdev cache fill and queue aggregation I/O,
 * this pointer is set to NULL, and no ereport will be generated (since it
 * doesn't actually correspond to any particular device or piece of data,
 * and the caller will always retry without caching or queueing anyway).
 *
 * For checksum errors, we want to include more information about the actual
 * error which occurs. Accordingly, we build an ereport when the error is
 * noticed, but instead of sending it in immediately, we hang it off of the
 * io_cksum_report field of the logical IO. When the logical IO completes
 * (successfully or not), zfs_ereport_finish_checksum() is called with the
 * good and bad versions of the buffer (if available), and we annotate the
 * ereport with information about the differences.
 */

#ifdef _KERNEL
/*
 * Duplicate ereport Detection
 *
 * Some ereports are retained momentarily for detecting duplicates. These
 * are kept in a recent_events_node_t in both a time-ordered list and an AVL
 * tree of recent unique ereports.
 *
 * The lifespan of these recent ereports is bounded (15 mins) and a cleaner
 * task is used to purge stale entries.
 */
static list_t recent_events_list;
static avl_tree_t recent_events_tree;
static kmutex_t recent_events_lock;
static taskqid_t recent_events_cleaner_tqid;

/*
 * Each node is about 128 bytes so 2,000 would consume 1/4 MiB.
 *
 * This setting can be changed dynamically and setting it to zero
 * disables duplicate detection.
 */
static unsigned int zfs_zevent_retain_max = 2000;

/*
 * The lifespan for a recent ereport entry. The default of 15 minutes is
 * intended to outlive the zfs diagnosis engine's threshold of 10 errors
 * over a period of 10 minutes.
 */
static unsigned int zfs_zevent_retain_expire_secs = 900;

typedef enum zfs_subclass {
	ZSC_IO,
	ZSC_DATA,
	ZSC_CHECKSUM
} zfs_subclass_t;

typedef struct {
	/* common criteria */
	uint64_t	re_pool_guid;
	uint64_t	re_vdev_guid;
	int		re_io_error;
	uint64_t	re_io_size;
	uint64_t	re_io_offset;
	zfs_subclass_t	re_subclass;
	zio_priority_t	re_io_priority;

	/* logical zio criteria (optional) */
	zbookmark_phys_t re_io_bookmark;

	/* internal state */
	avl_node_t	re_tree_link;
	list_node_t	re_list_link;
	uint64_t	re_timestamp;
} recent_events_node_t;
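
/*
 * Illustrative example (not part of the build): two checksum ereports are
 * treated as duplicates only when every criteria field above matches --
 * same pool and vdev GUIDs, same error, priority, size, and offset, and
 * (when supplied) the same bookmark -- so a straight retry of the same
 * failed read produces a single zevent. Duplicate detection can be
 * disabled at runtime by setting the zfs_zevent_retain_max module
 * parameter (declared at the bottom of this file) to zero; on Linux that
 * would look roughly like:
 *
 *	echo 0 > /sys/module/zfs/parameters/zfs_zevent_retain_max
 *
 * The sysfs path is shown only as an assumed example of where module
 * parameters are typically exposed.
 */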

static int
recent_events_compare(const void *a, const void *b)
{
	const recent_events_node_t *node1 = a;
	const recent_events_node_t *node2 = b;
	int cmp;

	/*
	 * The comparison order here is somewhat arbitrary.
	 * What's important is that if every criterion matches, then it
	 * is a duplicate (i.e. compare returns 0)
	 */
	if ((cmp = TREE_CMP(node1->re_subclass, node2->re_subclass)) != 0)
		return (cmp);
	if ((cmp = TREE_CMP(node1->re_pool_guid, node2->re_pool_guid)) != 0)
		return (cmp);
	if ((cmp = TREE_CMP(node1->re_vdev_guid, node2->re_vdev_guid)) != 0)
		return (cmp);
	if ((cmp = TREE_CMP(node1->re_io_error, node2->re_io_error)) != 0)
		return (cmp);
	if ((cmp = TREE_CMP(node1->re_io_priority, node2->re_io_priority)) != 0)
		return (cmp);
	if ((cmp = TREE_CMP(node1->re_io_size, node2->re_io_size)) != 0)
		return (cmp);
	if ((cmp = TREE_CMP(node1->re_io_offset, node2->re_io_offset)) != 0)
		return (cmp);

	const zbookmark_phys_t *zb1 = &node1->re_io_bookmark;
	const zbookmark_phys_t *zb2 = &node2->re_io_bookmark;

	if ((cmp = TREE_CMP(zb1->zb_objset, zb2->zb_objset)) != 0)
		return (cmp);
	if ((cmp = TREE_CMP(zb1->zb_object, zb2->zb_object)) != 0)
		return (cmp);
	if ((cmp = TREE_CMP(zb1->zb_level, zb2->zb_level)) != 0)
		return (cmp);
	if ((cmp = TREE_CMP(zb1->zb_blkid, zb2->zb_blkid)) != 0)
		return (cmp);

	return (0);
}

/*
 * workaround: vdev properties don't have inheritance
 */
static uint64_t
vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop)
{
	uint64_t propdef, propval;

	propdef = vdev_prop_default_numeric(prop);
	switch (prop) {
	case VDEV_PROP_CHECKSUM_N:
		propval = vd->vdev_checksum_n;
		break;
	case VDEV_PROP_CHECKSUM_T:
		propval = vd->vdev_checksum_t;
		break;
	case VDEV_PROP_IO_N:
		propval = vd->vdev_io_n;
		break;
	case VDEV_PROP_IO_T:
		propval = vd->vdev_io_t;
		break;
	case VDEV_PROP_SLOW_IO_EVENTS:
		propval = vd->vdev_slow_io_events;
		break;
	case VDEV_PROP_SLOW_IO_N:
		propval = vd->vdev_slow_io_n;
		break;
	case VDEV_PROP_SLOW_IO_T:
		propval = vd->vdev_slow_io_t;
		break;
	default:
		propval = propdef;
		break;
	}

	if (propval != propdef)
		return (propval);

	if (vd->vdev_parent == NULL)
		return (propdef);

	return (vdev_prop_get_inherited(vd->vdev_parent, prop));
}

static void zfs_ereport_schedule_cleaner(void);

/*
 * background task to clean stale recent event nodes.
 */
static void
zfs_ereport_cleaner(void *arg)
{
	recent_events_node_t *entry;
	uint64_t now = gethrtime();

	/*
	 * purge expired entries
	 */
	mutex_enter(&recent_events_lock);
	while ((entry = list_tail(&recent_events_list)) != NULL) {
		uint64_t age = NSEC2SEC(now - entry->re_timestamp);
		if (age <= zfs_zevent_retain_expire_secs)
			break;

		/* remove expired node */
		avl_remove(&recent_events_tree, entry);
		list_remove(&recent_events_list, entry);
		kmem_free(entry, sizeof (*entry));
	}

	/* Restart the cleaner if more entries remain */
	recent_events_cleaner_tqid = 0;
	if (!list_is_empty(&recent_events_list))
		zfs_ereport_schedule_cleaner();

	mutex_exit(&recent_events_lock);
}

static void
zfs_ereport_schedule_cleaner(void)
{
	ASSERT(MUTEX_HELD(&recent_events_lock));

	uint64_t timeout = SEC2NSEC(zfs_zevent_retain_expire_secs + 1);

	recent_events_cleaner_tqid = taskq_dispatch_delay(
	    system_delay_taskq, zfs_ereport_cleaner, NULL, TQ_SLEEP,
	    ddi_get_lbolt() + NSEC_TO_TICK(timeout));
}

/*
 * Clear entries for a given vdev or all vdevs in a pool when vdev == NULL
 */
void
zfs_ereport_clear(spa_t *spa, vdev_t *vd)
{
	uint64_t vdev_guid, pool_guid;

	ASSERT(vd != NULL || spa != NULL);
	if (vd == NULL) {
		vdev_guid = 0;
		pool_guid = spa_guid(spa);
	} else {
		vdev_guid = vd->vdev_guid;
		pool_guid = 0;
	}

	mutex_enter(&recent_events_lock);

	recent_events_node_t *next = list_head(&recent_events_list);
	while (next != NULL) {
		recent_events_node_t *entry = next;

		next = list_next(&recent_events_list, next);

		if (entry->re_vdev_guid == vdev_guid ||
		    entry->re_pool_guid == pool_guid) {
			avl_remove(&recent_events_tree, entry);
			list_remove(&recent_events_list, entry);
			kmem_free(entry, sizeof (*entry));
		}
	}

	mutex_exit(&recent_events_lock);
}

/*
 * Check if an ereport would be a duplicate of one recently posted.
 *
 * An ereport is considered a duplicate if the set of criteria in
 * recent_events_node_t all match.
 *
 * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM
 * are candidates for duplicate checking.
 */
static boolean_t
zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd,
    const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size)
{
	recent_events_node_t search = {0}, *entry;

	if (vd == NULL || zio == NULL)
		return (B_FALSE);

	if (zfs_zevent_retain_max == 0)
		return (B_FALSE);

	if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0)
		search.re_subclass = ZSC_IO;
	else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0)
		search.re_subclass = ZSC_DATA;
	else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0)
		search.re_subclass = ZSC_CHECKSUM;
	else
		return (B_FALSE);

	search.re_pool_guid = spa_guid(spa);
	search.re_vdev_guid = vd->vdev_guid;
	search.re_io_error = zio->io_error;
	search.re_io_priority = zio->io_priority;
	/* if size is supplied use it over what's in zio */
	if (size) {
		search.re_io_size = size;
		search.re_io_offset = offset;
	} else {
		search.re_io_size = zio->io_size;
		search.re_io_offset = zio->io_offset;
	}

	/* grab optional logical zio criteria */
	if (zb != NULL) {
		search.re_io_bookmark.zb_objset = zb->zb_objset;
		search.re_io_bookmark.zb_object = zb->zb_object;
		search.re_io_bookmark.zb_level = zb->zb_level;
		search.re_io_bookmark.zb_blkid = zb->zb_blkid;
	}

	uint64_t now = gethrtime();

	mutex_enter(&recent_events_lock);

	/* check if we have seen this one recently */
	entry = avl_find(&recent_events_tree, &search, NULL);
	if (entry != NULL) {
		uint64_t age = NSEC2SEC(now - entry->re_timestamp);

		/*
		 * There is still an active cleaner (since we're here).
		 * Reset the last seen time for this duplicate entry
		 * so that its lifespan gets extended.
		 */
		list_remove(&recent_events_list, entry);
		list_insert_head(&recent_events_list, entry);
		entry->re_timestamp = now;

		zfs_zevent_track_duplicate();
		mutex_exit(&recent_events_lock);

		return (age <= zfs_zevent_retain_expire_secs);
	}

	if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) {
		/* recycle oldest node */
		entry = list_tail(&recent_events_list);
		ASSERT(entry != NULL);
		list_remove(&recent_events_list, entry);
		avl_remove(&recent_events_tree, entry);
	} else {
		entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP);
	}

	/* record this as a recent ereport */
	*entry = search;
	avl_add(&recent_events_tree, entry);
	list_insert_head(&recent_events_list, entry);
	entry->re_timestamp = now;

	/* Start a cleaner if not already scheduled */
	if (recent_events_cleaner_tqid == 0)
		zfs_ereport_schedule_cleaner();

	mutex_exit(&recent_events_lock);
	return (B_FALSE);
}

void
zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector)
{
	if (nvl)
		fm_nvlist_destroy(nvl, FM_NVA_FREE);

	if (detector)
		fm_nvlist_destroy(detector, FM_NVA_FREE);
}
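
/*
 * Illustrative usage (sketch only): callers hand both nvlists to
 * zfs_zevent_post() and let this callback reclaim them once the event
 * has been queued, e.g.:
 *
 *	rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
 *
 * This is the pattern used by zfs_ereport_post() and
 * zfs_ereport_finish_checksum() below.
 */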

/*
 * We want to rate limit ZIO delay, deadman, and checksum events so as to not
 * flood zevent consumers when a disk is acting up.
 *
 * Returns 1 if we're ratelimiting, 0 if not.
 */
static int
zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd)
{
	int rc = 0;
	/*
	 * zfs_ratelimit() returns 1 if we're *not* ratelimiting and 0 if we
	 * are. Invert it to get our return value.
	 */
	if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
		rc = !zfs_ratelimit(&vd->vdev_delay_rl);
	} else if (strcmp(subclass, FM_EREPORT_ZFS_DEADMAN) == 0) {
		rc = !zfs_ratelimit(&vd->vdev_deadman_rl);
	} else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) {
		rc = !zfs_ratelimit(&vd->vdev_checksum_rl);
	}

	if (rc) {
		/* We're rate limiting */
		fm_erpt_dropped_increment();
	}

	return (rc);
}

/*
 * Return B_TRUE if the event actually posted, B_FALSE if not.
 */
static boolean_t
zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
    const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
    zio_t *zio, uint64_t stateoroffset, uint64_t size)
{
	nvlist_t *ereport, *detector;

	uint64_t ena;
	char class[64];

	if ((ereport = fm_nvlist_create(NULL)) == NULL)
		return (B_FALSE);

	if ((detector = fm_nvlist_create(NULL)) == NULL) {
		fm_nvlist_destroy(ereport, FM_NVA_FREE);
		return (B_FALSE);
	}

	/*
	 * Serialize ereport generation
	 */
	mutex_enter(&spa->spa_errlist_lock);

	/*
	 * Determine the ENA to use for this event. If we are in a loading
	 * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use
	 * a root zio-wide ENA. Otherwise, simply use a unique ENA.
	 */
	if (spa_load_state(spa) != SPA_LOAD_NONE) {
		if (spa->spa_ena == 0)
			spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
		ena = spa->spa_ena;
	} else if (zio != NULL && zio->io_logical != NULL) {
		if (zio->io_logical->io_ena == 0)
			zio->io_logical->io_ena =
			    fm_ena_generate(0, FM_ENA_FMT1);
		ena = zio->io_logical->io_ena;
	} else {
		ena = fm_ena_generate(0, FM_ENA_FMT1);
	}

	/*
	 * Construct the full class, detector, and other standard FMA fields.
	 */
	(void) snprintf(class, sizeof (class), "%s.%s",
	    ZFS_ERROR_CLASS, subclass);

	fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
	    vd != NULL ? vd->vdev_guid : 0);

	fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);

	/*
	 * Construct the per-ereport payload, depending on which parameters are
	 * passed in.
	 */

	/*
	 * Generic payload members common to all ereports.
	 */
	fm_payload_set(ereport,
	    FM_EREPORT_PAYLOAD_ZFS_POOL, DATA_TYPE_STRING, spa_name(spa),
	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa),
	    FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, DATA_TYPE_UINT64,
	    (uint64_t)spa_state(spa),
	    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
	    (int32_t)spa_load_state(spa), NULL);

	fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
	    DATA_TYPE_STRING,
	    spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
	    FM_EREPORT_FAILMODE_WAIT :
	    spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
	    FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC,
	    NULL);

	if (vd != NULL) {
		vdev_t *pvd = vd->vdev_parent;
		vdev_queue_t *vq = &vd->vdev_queue;
		vdev_stat_t *vs = &vd->vdev_stat;
		vdev_t *spare_vd;
		uint64_t *spare_guids;
		char **spare_paths;
		int i, spare_count;

		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
		    DATA_TYPE_UINT64, vd->vdev_guid,
		    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
		    DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
		if (vd->vdev_path != NULL)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
			    DATA_TYPE_STRING, vd->vdev_path, NULL);
		if (vd->vdev_devid != NULL)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
			    DATA_TYPE_STRING, vd->vdev_devid, NULL);
		if (vd->vdev_fru != NULL)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
			    DATA_TYPE_STRING, vd->vdev_fru, NULL);
		if (vd->vdev_enc_sysfs_path != NULL)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
			    DATA_TYPE_STRING, vd->vdev_enc_sysfs_path, NULL);
		if (vd->vdev_ashift)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT,
			    DATA_TYPE_UINT64, vd->vdev_ashift, NULL);

		if (vq != NULL) {
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS,
			    DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL);
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS,
			    DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL);
		}

		if (vs != NULL) {
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS,
			    DATA_TYPE_UINT64, vs->vs_read_errors,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS,
			    DATA_TYPE_UINT64, vs->vs_write_errors,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS,
			    DATA_TYPE_UINT64, vs->vs_checksum_errors,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS,
			    DATA_TYPE_UINT64, vs->vs_slow_ios,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DIO_VERIFY_ERRORS,
			    DATA_TYPE_UINT64, vs->vs_dio_verify_errors,
			    NULL);
		}

		if (pvd != NULL) {
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
			    DATA_TYPE_UINT64, pvd->vdev_guid,
			    FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
			    DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
			    NULL);
			if (pvd->vdev_path)
				fm_payload_set(ereport,
				    FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
				    DATA_TYPE_STRING, pvd->vdev_path, NULL);
			if (pvd->vdev_devid)
				fm_payload_set(ereport,
				    FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
				    DATA_TYPE_STRING, pvd->vdev_devid, NULL);
		}

		spare_count = spa->spa_spares.sav_count;
		spare_paths = kmem_zalloc(sizeof (char *) * spare_count,
		    KM_SLEEP);
		spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count,
		    KM_SLEEP);

		for (i = 0; i < spare_count; i++) {
			spare_vd = spa->spa_spares.sav_vdevs[i];
			if (spare_vd) {
				spare_paths[i] = spare_vd->vdev_path;
				spare_guids[i] = spare_vd->vdev_guid;
			}
		}

		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS,
		    DATA_TYPE_STRING_ARRAY, spare_count, spare_paths,
		    FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS,
		    DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL);

		kmem_free(spare_guids, sizeof (uint64_t) * spare_count);
		kmem_free(spare_paths, sizeof (char *) * spare_count);
	}

	if (zio != NULL) {
		/*
		 * Payload common to all I/Os.
		 */
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
		    DATA_TYPE_INT32, zio->io_error, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS,
		    DATA_TYPE_UINT64, zio->io_flags, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE,
		    DATA_TYPE_UINT32, zio->io_stage, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE,
		    DATA_TYPE_UINT32, zio->io_pipeline, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY,
		    DATA_TYPE_UINT64, zio->io_delay, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP,
		    DATA_TYPE_UINT64, zio->io_timestamp, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA,
		    DATA_TYPE_UINT64, zio->io_delta, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TYPE,
		    DATA_TYPE_UINT32, zio->io_type, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY,
		    DATA_TYPE_UINT32, zio->io_priority, NULL);

		/*
		 * If the 'size' parameter is non-zero, it indicates this is a
		 * RAID-Z or other I/O where the physical offset and length are
		 * provided for us, instead of within the zio_t.
		 */
		if (vd != NULL) {
			if (size)
				fm_payload_set(ereport,
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
				    DATA_TYPE_UINT64, stateoroffset,
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
				    DATA_TYPE_UINT64, size, NULL);
			else
				fm_payload_set(ereport,
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
				    DATA_TYPE_UINT64, zio->io_offset,
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
				    DATA_TYPE_UINT64, zio->io_size, NULL);
		}
	} else if (vd != NULL) {
		/*
		 * If we have a vdev but no zio, this is a device fault, and the
		 * 'stateoroffset' parameter indicates the previous state of the
		 * vdev.
		 */
		fm_payload_set(ereport,
		    FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
		    DATA_TYPE_UINT64, stateoroffset, NULL);
	}

	/*
	 * Payload for I/Os with corresponding logical information.
	 */
	if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) {
		fm_payload_set(ereport,
		    FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
		    DATA_TYPE_UINT64, zb->zb_objset,
		    FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
		    DATA_TYPE_UINT64, zb->zb_object,
		    FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
		    DATA_TYPE_INT64, zb->zb_level,
		    FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
		    DATA_TYPE_UINT64, zb->zb_blkid, NULL);
	}

	/*
	 * Payload for tuning the zed
	 */
	if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) {
		uint64_t cksum_n, cksum_t;

		cksum_n = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_N);
		if (cksum_n != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N))
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N,
			    DATA_TYPE_UINT64,
			    cksum_n,
			    NULL);

		cksum_t = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_T);
		if (cksum_t != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T))
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T,
			    DATA_TYPE_UINT64,
			    cksum_t,
			    NULL);
	}

	if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) {
		uint64_t io_n, io_t;

		io_n = vdev_prop_get_inherited(vd, VDEV_PROP_IO_N);
		if (io_n != vdev_prop_default_numeric(VDEV_PROP_IO_N))
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N,
			    DATA_TYPE_UINT64,
			    io_n,
			    NULL);

		io_t = vdev_prop_get_inherited(vd, VDEV_PROP_IO_T);
		if (io_t != vdev_prop_default_numeric(VDEV_PROP_IO_T))
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T,
			    DATA_TYPE_UINT64,
			    io_t,
			    NULL);
	}

	if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
		uint64_t slow_io_n, slow_io_t;

		slow_io_n = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_N);
		if (slow_io_n != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N))
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N,
			    DATA_TYPE_UINT64,
			    slow_io_n,
			    NULL);

		slow_io_t = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_T);
		if (slow_io_t != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T))
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T,
			    DATA_TYPE_UINT64,
			    slow_io_t,
			    NULL);
	}

	mutex_exit(&spa->spa_errlist_lock);

	*ereport_out = ereport;
	*detector_out = detector;
	return (B_TRUE);
}
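
/*
 * Example of the "tuning the zed" payload above (illustrative only, and
 * assuming the standard vdev property syntax): an administrator who sets
 * a non-default threshold on a vdev, e.g.
 *
 *	zpool set checksum_n=10 tank mirror-0
 *
 * will see FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N carried in subsequent
 * checksum ereports for that vdev (and, via vdev_prop_get_inherited(),
 * for its children that are still at the default), so the ZED can apply
 * the per-vdev limit.  "tank" and "mirror-0" are placeholder names.
 */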

/* if it's <= 128 bytes, save the corruption directly */
#define	ZFM_MAX_INLINE		(128 / sizeof (uint64_t))

#define	MAX_RANGES		16

typedef struct zfs_ecksum_info {
	/* inline arrays of bits set and cleared. */
	uint64_t zei_bits_set[ZFM_MAX_INLINE];
	uint64_t zei_bits_cleared[ZFM_MAX_INLINE];

	/*
	 * for each range, the number of bits set and cleared. The Hamming
	 * distance between the good and bad buffers is the sum of them all.
	 */
	uint32_t zei_range_sets[MAX_RANGES];
	uint32_t zei_range_clears[MAX_RANGES];

	struct zei_ranges {
		uint32_t zr_start;
		uint32_t zr_end;
	} zei_ranges[MAX_RANGES];

	size_t zei_range_count;
	uint32_t zei_mingap;
	uint32_t zei_allowed_mingap;

} zfs_ecksum_info_t;

static void
update_bad_bits(uint64_t value_arg, uint32_t *count)
{
	size_t i;
	size_t bits = 0;
	uint64_t value = BE_64(value_arg);

	/* We store the bits in big-endian (largest-first) order */
	for (i = 0; i < 64; i++) {
		if (value & (1ull << i))
			++bits;
	}
	/* update the count of bits changed */
	*count += bits;
}

/*
 * We've now filled up the range array, and need to increase "mingap" and
 * shrink the range list accordingly. zei_mingap is always the smallest
 * distance between array entries, so we set the new_allowed_gap to be
 * one greater than that. We then go through the list, joining together
 * any ranges which are closer than the new_allowed_gap.
 *
 * By construction, there will be at least one. We also update zei_mingap
 * to the new smallest gap, to prepare for our next invocation.
 */
static void
zei_shrink_ranges(zfs_ecksum_info_t *eip)
{
	uint32_t mingap = UINT32_MAX;
	uint32_t new_allowed_gap = eip->zei_mingap + 1;

	size_t idx, output;
	size_t max = eip->zei_range_count;

	struct zei_ranges *r = eip->zei_ranges;

	ASSERT3U(eip->zei_range_count, >, 0);
	ASSERT3U(eip->zei_range_count, <=, MAX_RANGES);

	output = idx = 0;
	while (idx < max - 1) {
		uint32_t start = r[idx].zr_start;
		uint32_t end = r[idx].zr_end;

		while (idx < max - 1) {
			idx++;

			uint32_t nstart = r[idx].zr_start;
			uint32_t nend = r[idx].zr_end;

			uint32_t gap = nstart - end;
			if (gap < new_allowed_gap) {
				end = nend;
				continue;
			}
			if (gap < mingap)
				mingap = gap;
			break;
		}
		r[output].zr_start = start;
		r[output].zr_end = end;
		output++;
	}
	ASSERT3U(output, <, eip->zei_range_count);
	eip->zei_range_count = output;
	eip->zei_mingap = mingap;
	eip->zei_allowed_mingap = new_allowed_gap;
}

static void
zei_add_range(zfs_ecksum_info_t *eip, int start, int end)
{
	struct zei_ranges *r = eip->zei_ranges;
	size_t count = eip->zei_range_count;

	if (count >= MAX_RANGES) {
		zei_shrink_ranges(eip);
		count = eip->zei_range_count;
	}
	if (count == 0) {
		eip->zei_mingap = UINT32_MAX;
		eip->zei_allowed_mingap = 1;
	} else {
		int gap = start - r[count - 1].zr_end;

		if (gap < eip->zei_allowed_mingap) {
			r[count - 1].zr_end = end;
			return;
		}
		if (gap < eip->zei_mingap)
			eip->zei_mingap = gap;
	}
	r[count].zr_start = start;
	r[count].zr_end = end;
	eip->zei_range_count++;
}

static size_t
zei_range_total_size(zfs_ecksum_info_t *eip)
{
	struct zei_ranges *r = eip->zei_ranges;
	size_t count = eip->zei_range_count;
	size_t result = 0;
	size_t idx;

	for (idx = 0; idx < count; idx++)
		result += (r[idx].zr_end - r[idx].zr_start);

	return (result);
}
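
/*
 * Worked example for the range logic above: zei_add_range() records
 * half-open ranges of differing 64-bit words. Adding [0,2) and then
 * [5,7) stores two ranges and remembers the gap of 3 words as
 * zei_mingap. When all MAX_RANGES slots fill up, zei_shrink_ranges()
 * raises the allowed gap to zei_mingap + 1 (4 in this example) and
 * joins every pair of neighbouring ranges closer than that, which
 * frees at least one slot; later additions whose gap is under the new
 * allowed gap simply extend the previous range instead of starting a
 * new one.
 */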

static zfs_ecksum_info_t *
annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
    const abd_t *goodabd, const abd_t *badabd, size_t size,
    boolean_t drop_if_identical)
{
	const uint64_t *good;
	const uint64_t *bad;

	size_t nui64s = size / sizeof (uint64_t);

	size_t inline_size;
	int no_inline = 0;
	size_t idx;
	size_t range;

	size_t offset = 0;
	ssize_t start = -1;

	zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP);

	/* don't do any annotation for injected checksum errors */
	if (info != NULL && info->zbc_injected)
		return (eip);

	if (info != NULL && info->zbc_has_cksum) {
		fm_payload_set(ereport,
		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
		    DATA_TYPE_STRING,
		    info->zbc_checksum_name,
		    NULL);

		if (info->zbc_byteswapped) {
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP,
			    DATA_TYPE_BOOLEAN, 1,
			    NULL);
		}
	}

	if (badabd == NULL || goodabd == NULL)
		return (eip);

	ASSERT3U(nui64s, <=, UINT32_MAX);
	ASSERT3U(size, ==, nui64s * sizeof (uint64_t));
	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT3U(size, <=, UINT32_MAX);

	good = (const uint64_t *) abd_borrow_buf_copy((abd_t *)goodabd, size);
	bad = (const uint64_t *) abd_borrow_buf_copy((abd_t *)badabd, size);

	/* build up the range list by comparing the two buffers. */
	for (idx = 0; idx < nui64s; idx++) {
		if (good[idx] == bad[idx]) {
			if (start == -1)
				continue;

			zei_add_range(eip, start, idx);
			start = -1;
		} else {
			if (start != -1)
				continue;

			start = idx;
		}
	}
	if (start != -1)
		zei_add_range(eip, start, idx);

	/* See if it will fit in our inline buffers */
	inline_size = zei_range_total_size(eip);
	if (inline_size > ZFM_MAX_INLINE)
		no_inline = 1;

	/*
	 * If there is no change and we want to drop if the buffers are
	 * identical, do so.
	 */
	if (inline_size == 0 && drop_if_identical) {
		kmem_free(eip, sizeof (*eip));
		abd_return_buf((abd_t *)goodabd, (void *)good, size);
		abd_return_buf((abd_t *)badabd, (void *)bad, size);
		return (NULL);
	}

	/*
	 * Now walk through the ranges, filling in the details of the
	 * differences. Also convert our uint64_t-array offsets to byte
	 * offsets.
	 */
	for (range = 0; range < eip->zei_range_count; range++) {
		size_t start = eip->zei_ranges[range].zr_start;
		size_t end = eip->zei_ranges[range].zr_end;

		for (idx = start; idx < end; idx++) {
			uint64_t set, cleared;

			// bits set in bad, but not in good
			set = ((~good[idx]) & bad[idx]);
			// bits set in good, but not in bad
			cleared = (good[idx] & (~bad[idx]));

			if (!no_inline) {
				ASSERT3U(offset, <, inline_size);
				eip->zei_bits_set[offset] = set;
				eip->zei_bits_cleared[offset] = cleared;
				offset++;
			}

			update_bad_bits(set, &eip->zei_range_sets[range]);
			update_bad_bits(cleared, &eip->zei_range_clears[range]);
		}

		/* convert to byte offsets */
		eip->zei_ranges[range].zr_start *= sizeof (uint64_t);
		eip->zei_ranges[range].zr_end *= sizeof (uint64_t);
	}

	abd_return_buf((abd_t *)goodabd, (void *)good, size);
	abd_return_buf((abd_t *)badabd, (void *)bad, size);

	eip->zei_allowed_mingap *= sizeof (uint64_t);
	inline_size *= sizeof (uint64_t);

	/* fill in ereport */
	fm_payload_set(ereport,
	    FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES,
	    DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count,
	    (uint32_t *)eip->zei_ranges,
	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP,
	    DATA_TYPE_UINT32, eip->zei_allowed_mingap,
	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS,
	    DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets,
	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS,
	    DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears,
	    NULL);

	if (!no_inline) {
		fm_payload_set(ereport,
		    FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS,
		    DATA_TYPE_UINT8_ARRAY,
		    inline_size, (uint8_t *)eip->zei_bits_set,
		    FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS,
		    DATA_TYPE_UINT8_ARRAY,
		    inline_size, (uint8_t *)eip->zei_bits_cleared,
		    NULL);
	}
	return (eip);
}
#else
void
zfs_ereport_clear(spa_t *spa, vdev_t *vd)
{
	(void) spa, (void) vd;
}
#endif

/*
 * Make sure our event is still valid for the given zio/vdev/pool. For example,
 * we don't want to keep logging events for a faulted or missing vdev.
 */
boolean_t
zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
{
#ifdef _KERNEL
	/*
	 * If we are doing a spa_tryimport() or in recovery mode,
	 * ignore errors.
	 */
	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
	    spa_load_state(spa) == SPA_LOAD_RECOVER)
		return (B_FALSE);

	/*
	 * If we are in the middle of opening a pool, and the previous attempt
	 * failed, don't bother logging any new ereports - we're just going to
	 * get the same diagnosis anyway.
	 */
	if (spa_load_state(spa) != SPA_LOAD_NONE &&
	    spa->spa_last_open_failed)
		return (B_FALSE);

	if (zio != NULL) {
		/* If this is not a read or write zio, ignore the error */
		if (zio->io_type != ZIO_TYPE_READ &&
		    zio->io_type != ZIO_TYPE_WRITE)
			return (B_FALSE);

		if (vd != NULL) {
			/*
			 * If the vdev has already been marked as failing due
			 * to a failed probe, then ignore any subsequent I/O
			 * errors, as the DE will automatically fault the vdev
			 * on the first such failure. This also catches cases
			 * where vdev_remove_wanted is set and the device has
			 * not yet been asynchronously placed into the REMOVED
			 * state.
			 */
			if (zio->io_vd == vd && !vdev_accessible(vd, zio))
				return (B_FALSE);

			/*
			 * Ignore checksum errors for reads from DTL regions of
			 * leaf vdevs.
			 */
			if (zio->io_type == ZIO_TYPE_READ &&
			    zio->io_error == ECKSUM &&
			    vd->vdev_ops->vdev_op_leaf &&
			    vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
				return (B_FALSE);
		}
	}

	/*
	 * For probe failure, we want to avoid posting ereports if we've
	 * already removed the device in the meantime.
	 */
	if (vd != NULL &&
	    strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
	    (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
		return (B_FALSE);

	/* Ignore bogus delay events (like from ioctls or unqueued IOs) */
	if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
	    (zio != NULL) && (!zio->io_timestamp)) {
		return (B_FALSE);
	}
#else
	(void) subclass, (void) spa, (void) vd, (void) zio;
#endif
	return (B_TRUE);
}

/*
 * Post an ereport for the given subclass
 *
 * Returns
 * - 0 if an event was posted
 * - EINVAL if there was a problem posting event
 * - EBUSY if the event was rate limited
 * - EALREADY if the event was already posted (duplicate)
 */
int
zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
    const zbookmark_phys_t *zb, zio_t *zio, uint64_t state)
{
	int rc = 0;
#ifdef _KERNEL
	nvlist_t *ereport = NULL;
	nvlist_t *detector = NULL;

	if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
		return (EINVAL);

	if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0))
		return (SET_ERROR(EALREADY));

	if (zfs_is_ratelimiting_event(subclass, vd))
		return (SET_ERROR(EBUSY));

	if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
	    zb, zio, state, 0))
		return (SET_ERROR(EINVAL));	/* couldn't post event */

	if (ereport == NULL)
		return (SET_ERROR(EINVAL));

	/* Cleanup is handled by the callback function */
	rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
#else
	(void) subclass, (void) spa, (void) vd, (void) zb, (void) zio,
	    (void) state;
#endif
	return (rc);
}
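
/*
 * Illustrative call (a sketch of how probe failures are typically
 * reported elsewhere in the code base, not logic belonging to this
 * file):
 *
 *	(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd,
 *	    NULL, NULL, 0);
 *
 * With no zio or bookmark supplied, only the pool- and vdev-level
 * payload members are attached and the 'state' argument (0 here) is
 * recorded as the previous vdev state.
 */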

/*
 * Prepare a checksum ereport
 *
 * Returns
 * - 0 if an event was posted
 * - EINVAL if there was a problem posting event
 * - EBUSY if the event was rate limited
 * - EALREADY if the event was already posted (duplicate)
 */
int
zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
    struct zio *zio, uint64_t offset, uint64_t length, zio_bad_cksum_t *info)
{
	zio_cksum_report_t *report;

#ifdef _KERNEL
	if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
		return (SET_ERROR(EINVAL));

	if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
	    offset, length))
		return (SET_ERROR(EALREADY));

	if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
		return (SET_ERROR(EBUSY));
#else
	(void) zb, (void) offset;
#endif

	report = kmem_zalloc(sizeof (*report), KM_SLEEP);

	zio_vsd_default_cksum_report(zio, report);

	/* copy the checksum failure information if it was provided */
	if (info != NULL) {
		report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP);
		memcpy(report->zcr_ckinfo, info, sizeof (*info));
	}

	report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift;
	report->zcr_align =
	    vdev_psize_to_asize(vd->vdev_top, report->zcr_sector);
	report->zcr_length = length;

#ifdef _KERNEL
	(void) zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
	    FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length);

	if (report->zcr_ereport == NULL) {
		zfs_ereport_free_checksum(report);
		return (0);
	}
#endif

	mutex_enter(&spa->spa_errlist_lock);
	report->zcr_next = zio->io_logical->io_cksum_report;
	zio->io_logical->io_cksum_report = report;
	mutex_exit(&spa->spa_errlist_lock);
	return (0);
}

void
zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data,
    const abd_t *bad_data, boolean_t drop_if_identical)
{
#ifdef _KERNEL
	zfs_ecksum_info_t *info;

	info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo,
	    good_data, bad_data, report->zcr_length, drop_if_identical);
	if (info != NULL)
		zfs_zevent_post(report->zcr_ereport,
		    report->zcr_detector, zfs_zevent_post_cb);
	else
		zfs_zevent_post_cb(report->zcr_ereport, report->zcr_detector);

	report->zcr_ereport = report->zcr_detector = NULL;
	if (info != NULL)
		kmem_free(info, sizeof (*info));
#else
	(void) report, (void) good_data, (void) bad_data,
	    (void) drop_if_identical;
#endif
}

void
zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
{
#ifdef _KERNEL
	if (rpt->zcr_ereport != NULL) {
		fm_nvlist_destroy(rpt->zcr_ereport,
		    FM_NVA_FREE);
		fm_nvlist_destroy(rpt->zcr_detector,
		    FM_NVA_FREE);
	}
#endif
	rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo);

	if (rpt->zcr_ckinfo != NULL)
		kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo));

	kmem_free(rpt, sizeof (*rpt));
}

/*
 * Post a checksum ereport
 *
 * Returns
 * - 0 if an event was posted
 * - EINVAL if there was a problem posting event
 * - EBUSY if the event was rate limited
 * - EALREADY if the event was already posted (duplicate)
 */
int
zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
    struct zio *zio, uint64_t offset, uint64_t length,
    const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc)
{
	int rc = 0;
#ifdef _KERNEL
	nvlist_t *ereport = NULL;
	nvlist_t *detector = NULL;
	zfs_ecksum_info_t *info;

	if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
		return (SET_ERROR(EINVAL));

	if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
	    offset, length))
		return (SET_ERROR(EALREADY));

	if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
		return (SET_ERROR(EBUSY));

	if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
	    spa, vd, zb, zio, offset, length) || (ereport == NULL)) {
		return (SET_ERROR(EINVAL));
	}

	info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
	    B_FALSE);

	if (info != NULL) {
		rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
		kmem_free(info, sizeof (*info));
	}
#else
	(void) spa, (void) vd, (void) zb, (void) zio, (void) offset,
	    (void) length, (void) good_data, (void) bad_data, (void) zbc;
#endif
	return (rc);
}

/*
 * The 'sysevent.fs.zfs.*' events are signals posted to notify user space of
 * a change in the pool. All sysevents are listed in sys/sysevent/eventdefs.h
 * and are designed to be consumed by the ZFS Event Daemon (ZED). For
 * additional details refer to the zed(8) man page.
 */
nvlist_t *
zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name,
    nvlist_t *aux)
{
	nvlist_t *resource = NULL;
#ifdef _KERNEL
	char class[64];

	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
		return (NULL);

	if ((resource = fm_nvlist_create(NULL)) == NULL)
		return (NULL);

	(void) snprintf(class, sizeof (class), "%s.%s.%s", type,
	    ZFS_ERROR_CLASS, name);
	VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION));
	VERIFY0(nvlist_add_string(resource, FM_CLASS, class));
	VERIFY0(nvlist_add_string(resource,
	    FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa)));
	VERIFY0(nvlist_add_uint64(resource,
	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)));
	VERIFY0(nvlist_add_uint64(resource,
	    FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, spa_state(spa)));
	VERIFY0(nvlist_add_int32(resource,
	    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa)));

	if (vd) {
		VERIFY0(nvlist_add_uint64(resource,
		    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid));
		VERIFY0(nvlist_add_uint64(resource,
		    FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state));
		if (vd->vdev_path != NULL)
			VERIFY0(nvlist_add_string(resource,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path));
		if (vd->vdev_devid != NULL)
			VERIFY0(nvlist_add_string(resource,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid));
		if (vd->vdev_fru != NULL)
			VERIFY0(nvlist_add_string(resource,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru));
		if (vd->vdev_enc_sysfs_path != NULL)
			VERIFY0(nvlist_add_string(resource,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
			    vd->vdev_enc_sysfs_path));
	}

	/* also copy any optional payload data */
	if (aux) {
		nvpair_t *elem = NULL;

		while ((elem = nvlist_next_nvpair(aux, elem)) != NULL)
			(void) nvlist_add_nvpair(resource, elem);
	}
#else
	(void) spa, (void) vd, (void) type, (void) name, (void) aux;
#endif
	return (resource);
}

static void
zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name,
    nvlist_t *aux)
{
#ifdef _KERNEL
	nvlist_t *resource;

	resource = zfs_event_create(spa, vd, type, name, aux);
	if (resource)
		zfs_zevent_post(resource, NULL, zfs_zevent_post_cb);
#else
	(void) spa, (void) vd, (void) type, (void) name, (void) aux;
#endif
}

/*
 * The 'resource.fs.zfs.removed' event is an internal signal that the given
 * vdev has been removed from the system. This will cause the DE to ignore
 * any recent I/O errors, inferring that they are due to the asynchronous
 * device removal.
 */
void
zfs_post_remove(spa_t *spa, vdev_t *vd, boolean_t by_kernel)
{
	nvlist_t *aux = NULL;

	if (by_kernel) {
		/*
		 * Add optional supplemental keys to payload
		 */
		aux = fm_nvlist_create(NULL);
		if (aux)
			fnvlist_add_boolean(aux, "by_kernel");
	}

	zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, aux);

	if (by_kernel && aux)
		fm_nvlist_destroy(aux, FM_NVA_FREE);
}

/*
 * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool
 * has the 'autoreplace' property set, and therefore any broken vdevs will be
 * handled by higher level logic, and no vdev fault should be generated.
 */
void
zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
{
	zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_AUTOREPLACE, NULL);
}

/*
 * The 'resource.fs.zfs.statechange' event is an internal signal that the
 * given vdev has transitioned its state to DEGRADED or HEALTHY. This will
 * cause the retire agent to repair any outstanding fault management cases
 * open because the device was not found (fault.fs.zfs.device).
 */
void
zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate)
{
#ifdef _KERNEL
	nvlist_t *aux;

	/*
	 * Add optional supplemental keys to payload
	 */
	aux = fm_nvlist_create(NULL);
	if (vd && aux) {
		if (vd->vdev_physpath) {
			fnvlist_add_string(aux,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH,
			    vd->vdev_physpath);
		}
		if (vd->vdev_enc_sysfs_path) {
			fnvlist_add_string(aux,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
			    vd->vdev_enc_sysfs_path);
		}

		fnvlist_add_uint64(aux,
		    FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate);
	}

	zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_STATECHANGE,
	    aux);

	if (aux)
		fm_nvlist_destroy(aux, FM_NVA_FREE);
#else
	(void) spa, (void) vd, (void) laststate;
#endif
}

#ifdef _KERNEL
void
zfs_ereport_init(void)
{
	mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&recent_events_list, sizeof (recent_events_node_t),
	    offsetof(recent_events_node_t, re_list_link));
	avl_create(&recent_events_tree, recent_events_compare,
	    sizeof (recent_events_node_t), offsetof(recent_events_node_t,
	    re_tree_link));
}

/*
 * This 'early' fini needs to run before zfs_fini() which on Linux waits
 * for the system_delay_taskq to drain.
 */
void
zfs_ereport_taskq_fini(void)
{
	mutex_enter(&recent_events_lock);
	if (recent_events_cleaner_tqid != 0) {
		taskq_cancel_id(system_delay_taskq,
		    recent_events_cleaner_tqid);
		recent_events_cleaner_tqid = 0;
	}
	mutex_exit(&recent_events_lock);
}

void
zfs_ereport_fini(void)
{
	recent_events_node_t *entry;

	while ((entry = list_remove_head(&recent_events_list)) != NULL) {
		avl_remove(&recent_events_tree, entry);
		kmem_free(entry, sizeof (*entry));
	}
	avl_destroy(&recent_events_tree);
	list_destroy(&recent_events_list);
	mutex_destroy(&recent_events_lock);
}

void
zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, const char *name)
{
	nvlist_t *aux;

	aux = fm_nvlist_create(NULL);
	fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME, name);

	zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux);
	fm_nvlist_destroy(aux, FM_NVA_FREE);
}

/*
 * Post an event when a zvol is created or removed
 *
 * This is currently only used by macOS, since it uses the event to create
 * symlinks between the volume name (mypool/myvol) and the actual /dev
 * device (/dev/disk3). For example:
 *
 * /var/run/zfs/dsk/mypool/myvol -> /dev/disk3
 *
 * name: The full name of the zvol ("mypool/myvol")
 * dev_name: The full /dev name for the zvol ("/dev/disk3")
 * raw_name: The raw /dev name for the zvol ("/dev/rdisk3")
 */
void
zfs_ereport_zvol_post(const char *subclass, const char *name,
    const char *dev_name, const char *raw_name)
{
	nvlist_t *aux;
	char *r;

	boolean_t locked = spa_namespace_held();
	if (!locked)
		spa_namespace_enter(FTAG);
	spa_t *spa = spa_lookup(name);
	if (!locked)
		spa_namespace_exit(FTAG);

	if (spa == NULL)
		return;

	aux = fm_nvlist_create(NULL);
	fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME, dev_name);
	fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME,
	    raw_name);
	r = strchr(name, '/');
	if (r && r[1])
		fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VOLUME, &r[1]);

	zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux);
	fm_nvlist_destroy(aux, FM_NVA_FREE);
}

EXPORT_SYMBOL(zfs_ereport_post);
EXPORT_SYMBOL(zfs_ereport_is_valid);
EXPORT_SYMBOL(zfs_ereport_post_checksum);
EXPORT_SYMBOL(zfs_post_remove);
EXPORT_SYMBOL(zfs_post_autoreplace);
EXPORT_SYMBOL(zfs_post_state_change);

ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW,
	"Maximum recent zevents records to retain for duplicate checking");
ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW,
	"Expiration time for recent zevents records");
#endif /* _KERNEL */