1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Copyright (c) 2012,2021 by Delphix. All rights reserved. 29 */ 30 31 #include <sys/spa.h> 32 #include <sys/spa_impl.h> 33 #include <sys/vdev.h> 34 #include <sys/vdev_impl.h> 35 #include <sys/zio.h> 36 #include <sys/zio_checksum.h> 37 38 #include <sys/fm/fs/zfs.h> 39 #include <sys/fm/protocol.h> 40 #include <sys/fm/util.h> 41 #include <sys/sysevent.h> 42 43 /* 44 * This general routine is responsible for generating all the different ZFS 45 * ereports. The payload is dependent on the class, and which arguments are 46 * supplied to the function: 47 * 48 * EREPORT POOL VDEV IO 49 * block X X X 50 * data X X 51 * device X X 52 * pool X 53 * 54 * If we are in a loading state, all errors are chained together by the same 55 * SPA-wide ENA (Error Numeric Association). 56 * 57 * For isolated I/O requests, we get the ENA from the zio_t. The propagation 58 * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want 59 * to chain together all ereports associated with a logical piece of data. For 60 * read I/Os, there are basically three 'types' of I/O, which form a roughly 61 * layered diagram: 62 * 63 * +---------------+ 64 * | Aggregate I/O | No associated logical data or device 65 * +---------------+ 66 * | 67 * V 68 * +---------------+ Reads associated with a piece of logical data. 69 * | Read I/O | This includes reads on behalf of RAID-Z, 70 * +---------------+ mirrors, gang blocks, retries, etc. 71 * | 72 * V 73 * +---------------+ Reads associated with a particular device, but 74 * | Physical I/O | no logical data. Issued as part of vdev caching 75 * +---------------+ and I/O aggregation. 76 * 77 * Note that 'physical I/O' here is not the same terminology as used in the rest 78 * of ZIO. Typically, 'physical I/O' simply means that there is no attached 79 * blockpointer. But I/O with no associated block pointer can still be related 80 * to a logical piece of data (i.e. RAID-Z requests). 81 * 82 * Purely physical I/O always have unique ENAs. They are not related to a 83 * particular piece of logical data, and therefore cannot be chained together. 84 * We still generate an ereport, but the DE doesn't correlate it with any 85 * logical piece of data. When such an I/O fails, the delegated I/O requests 86 * will issue a retry, which will trigger the 'real' ereport with the correct 87 * ENA. 88 * 89 * We keep track of the ENA for a ZIO chain through the 'io_logical' member. 90 * When a new logical I/O is issued, we set this to point to itself. Child I/Os 91 * then inherit this pointer, so that when it is first set subsequent failures 92 * will use the same ENA. For vdev cache fill and queue aggregation I/O, 93 * this pointer is set to NULL, and no ereport will be generated (since it 94 * doesn't actually correspond to any particular device or piece of data, 95 * and the caller will always retry without caching or queueing anyway). 96 * 97 * For checksum errors, we want to include more information about the actual 98 * error which occurs. Accordingly, we build an ereport when the error is 99 * noticed, but instead of sending it in immediately, we hang it off of the 100 * io_cksum_report field of the logical IO. When the logical IO completes 101 * (successfully or not), zfs_ereport_finish_checksum() is called with the 102 * good and bad versions of the buffer (if available), and we annotate the 103 * ereport with information about the differences. 104 */ 105 106 #ifdef _KERNEL 107 /* 108 * Duplicate ereport Detection 109 * 110 * Some ereports are retained momentarily for detecting duplicates. These 111 * are kept in a recent_events_node_t in both a time-ordered list and an AVL 112 * tree of recent unique ereports. 113 * 114 * The lifespan of these recent ereports is bounded (15 mins) and a cleaner 115 * task is used to purge stale entries. 116 */ 117 static list_t recent_events_list; 118 static avl_tree_t recent_events_tree; 119 static kmutex_t recent_events_lock; 120 static taskqid_t recent_events_cleaner_tqid; 121 122 /* 123 * Each node is about 128 bytes so 2,000 would consume 1/4 MiB. 124 * 125 * This setting can be changed dynamically and setting it to zero 126 * disables duplicate detection. 127 */ 128 static unsigned int zfs_zevent_retain_max = 2000; 129 130 /* 131 * The lifespan for a recent ereport entry. The default of 15 minutes is 132 * intended to outlive the zfs diagnosis engine's threshold of 10 errors 133 * over a period of 10 minutes. 134 */ 135 static unsigned int zfs_zevent_retain_expire_secs = 900; 136 137 typedef enum zfs_subclass { 138 ZSC_IO, 139 ZSC_DATA, 140 ZSC_CHECKSUM 141 } zfs_subclass_t; 142 143 typedef struct { 144 /* common criteria */ 145 uint64_t re_pool_guid; 146 uint64_t re_vdev_guid; 147 int re_io_error; 148 uint64_t re_io_size; 149 uint64_t re_io_offset; 150 zfs_subclass_t re_subclass; 151 zio_priority_t re_io_priority; 152 153 /* logical zio criteria (optional) */ 154 zbookmark_phys_t re_io_bookmark; 155 156 /* internal state */ 157 avl_node_t re_tree_link; 158 list_node_t re_list_link; 159 uint64_t re_timestamp; 160 } recent_events_node_t; 161 162 static int 163 recent_events_compare(const void *a, const void *b) 164 { 165 const recent_events_node_t *node1 = a; 166 const recent_events_node_t *node2 = b; 167 int cmp; 168 169 /* 170 * The comparison order here is somewhat arbitrary. 171 * What's important is that if every criteria matches, then it 172 * is a duplicate (i.e. compare returns 0) 173 */ 174 if ((cmp = TREE_CMP(node1->re_subclass, node2->re_subclass)) != 0) 175 return (cmp); 176 if ((cmp = TREE_CMP(node1->re_pool_guid, node2->re_pool_guid)) != 0) 177 return (cmp); 178 if ((cmp = TREE_CMP(node1->re_vdev_guid, node2->re_vdev_guid)) != 0) 179 return (cmp); 180 if ((cmp = TREE_CMP(node1->re_io_error, node2->re_io_error)) != 0) 181 return (cmp); 182 if ((cmp = TREE_CMP(node1->re_io_priority, node2->re_io_priority)) != 0) 183 return (cmp); 184 if ((cmp = TREE_CMP(node1->re_io_size, node2->re_io_size)) != 0) 185 return (cmp); 186 if ((cmp = TREE_CMP(node1->re_io_offset, node2->re_io_offset)) != 0) 187 return (cmp); 188 189 const zbookmark_phys_t *zb1 = &node1->re_io_bookmark; 190 const zbookmark_phys_t *zb2 = &node2->re_io_bookmark; 191 192 if ((cmp = TREE_CMP(zb1->zb_objset, zb2->zb_objset)) != 0) 193 return (cmp); 194 if ((cmp = TREE_CMP(zb1->zb_object, zb2->zb_object)) != 0) 195 return (cmp); 196 if ((cmp = TREE_CMP(zb1->zb_level, zb2->zb_level)) != 0) 197 return (cmp); 198 if ((cmp = TREE_CMP(zb1->zb_blkid, zb2->zb_blkid)) != 0) 199 return (cmp); 200 201 return (0); 202 } 203 204 /* 205 * workaround: vdev properties don't have inheritance 206 */ 207 static uint64_t 208 vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop) 209 { 210 uint64_t propdef, propval; 211 212 propdef = vdev_prop_default_numeric(prop); 213 switch (prop) { 214 case VDEV_PROP_CHECKSUM_N: 215 propval = vd->vdev_checksum_n; 216 break; 217 case VDEV_PROP_CHECKSUM_T: 218 propval = vd->vdev_checksum_t; 219 break; 220 case VDEV_PROP_IO_N: 221 propval = vd->vdev_io_n; 222 break; 223 case VDEV_PROP_IO_T: 224 propval = vd->vdev_io_t; 225 break; 226 case VDEV_PROP_SLOW_IO_N: 227 propval = vd->vdev_slow_io_n; 228 break; 229 case VDEV_PROP_SLOW_IO_T: 230 propval = vd->vdev_slow_io_t; 231 break; 232 default: 233 propval = propdef; 234 break; 235 } 236 237 if (propval != propdef) 238 return (propval); 239 240 if (vd->vdev_parent == NULL) 241 return (propdef); 242 243 return (vdev_prop_get_inherited(vd->vdev_parent, prop)); 244 } 245 246 static void zfs_ereport_schedule_cleaner(void); 247 248 /* 249 * background task to clean stale recent event nodes. 250 */ 251 static void 252 zfs_ereport_cleaner(void *arg) 253 { 254 recent_events_node_t *entry; 255 uint64_t now = gethrtime(); 256 257 /* 258 * purge expired entries 259 */ 260 mutex_enter(&recent_events_lock); 261 while ((entry = list_tail(&recent_events_list)) != NULL) { 262 uint64_t age = NSEC2SEC(now - entry->re_timestamp); 263 if (age <= zfs_zevent_retain_expire_secs) 264 break; 265 266 /* remove expired node */ 267 avl_remove(&recent_events_tree, entry); 268 list_remove(&recent_events_list, entry); 269 kmem_free(entry, sizeof (*entry)); 270 } 271 272 /* Restart the cleaner if more entries remain */ 273 recent_events_cleaner_tqid = 0; 274 if (!list_is_empty(&recent_events_list)) 275 zfs_ereport_schedule_cleaner(); 276 277 mutex_exit(&recent_events_lock); 278 } 279 280 static void 281 zfs_ereport_schedule_cleaner(void) 282 { 283 ASSERT(MUTEX_HELD(&recent_events_lock)); 284 285 uint64_t timeout = SEC2NSEC(zfs_zevent_retain_expire_secs + 1); 286 287 recent_events_cleaner_tqid = taskq_dispatch_delay( 288 system_delay_taskq, zfs_ereport_cleaner, NULL, TQ_SLEEP, 289 ddi_get_lbolt() + NSEC_TO_TICK(timeout)); 290 } 291 292 /* 293 * Clear entries for a given vdev or all vdevs in a pool when vdev == NULL 294 */ 295 void 296 zfs_ereport_clear(spa_t *spa, vdev_t *vd) 297 { 298 uint64_t vdev_guid, pool_guid; 299 300 ASSERT(vd != NULL || spa != NULL); 301 if (vd == NULL) { 302 vdev_guid = 0; 303 pool_guid = spa_guid(spa); 304 } else { 305 vdev_guid = vd->vdev_guid; 306 pool_guid = 0; 307 } 308 309 mutex_enter(&recent_events_lock); 310 311 recent_events_node_t *next = list_head(&recent_events_list); 312 while (next != NULL) { 313 recent_events_node_t *entry = next; 314 315 next = list_next(&recent_events_list, next); 316 317 if (entry->re_vdev_guid == vdev_guid || 318 entry->re_pool_guid == pool_guid) { 319 avl_remove(&recent_events_tree, entry); 320 list_remove(&recent_events_list, entry); 321 kmem_free(entry, sizeof (*entry)); 322 } 323 } 324 325 mutex_exit(&recent_events_lock); 326 } 327 328 /* 329 * Check if an ereport would be a duplicate of one recently posted. 330 * 331 * An ereport is considered a duplicate if the set of criteria in 332 * recent_events_node_t all match. 333 * 334 * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM 335 * are candidates for duplicate checking. 336 */ 337 static boolean_t 338 zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd, 339 const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size) 340 { 341 recent_events_node_t search = {0}, *entry; 342 343 if (vd == NULL || zio == NULL) 344 return (B_FALSE); 345 346 if (zfs_zevent_retain_max == 0) 347 return (B_FALSE); 348 349 if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) 350 search.re_subclass = ZSC_IO; 351 else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0) 352 search.re_subclass = ZSC_DATA; 353 else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) 354 search.re_subclass = ZSC_CHECKSUM; 355 else 356 return (B_FALSE); 357 358 search.re_pool_guid = spa_guid(spa); 359 search.re_vdev_guid = vd->vdev_guid; 360 search.re_io_error = zio->io_error; 361 search.re_io_priority = zio->io_priority; 362 /* if size is supplied use it over what's in zio */ 363 if (size) { 364 search.re_io_size = size; 365 search.re_io_offset = offset; 366 } else { 367 search.re_io_size = zio->io_size; 368 search.re_io_offset = zio->io_offset; 369 } 370 371 /* grab optional logical zio criteria */ 372 if (zb != NULL) { 373 search.re_io_bookmark.zb_objset = zb->zb_objset; 374 search.re_io_bookmark.zb_object = zb->zb_object; 375 search.re_io_bookmark.zb_level = zb->zb_level; 376 search.re_io_bookmark.zb_blkid = zb->zb_blkid; 377 } 378 379 uint64_t now = gethrtime(); 380 381 mutex_enter(&recent_events_lock); 382 383 /* check if we have seen this one recently */ 384 entry = avl_find(&recent_events_tree, &search, NULL); 385 if (entry != NULL) { 386 uint64_t age = NSEC2SEC(now - entry->re_timestamp); 387 388 /* 389 * There is still an active cleaner (since we're here). 390 * Reset the last seen time for this duplicate entry 391 * so that its lifespand gets extended. 392 */ 393 list_remove(&recent_events_list, entry); 394 list_insert_head(&recent_events_list, entry); 395 entry->re_timestamp = now; 396 397 zfs_zevent_track_duplicate(); 398 mutex_exit(&recent_events_lock); 399 400 return (age <= zfs_zevent_retain_expire_secs); 401 } 402 403 if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) { 404 /* recycle oldest node */ 405 entry = list_tail(&recent_events_list); 406 ASSERT(entry != NULL); 407 list_remove(&recent_events_list, entry); 408 avl_remove(&recent_events_tree, entry); 409 } else { 410 entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP); 411 } 412 413 /* record this as a recent ereport */ 414 *entry = search; 415 avl_add(&recent_events_tree, entry); 416 list_insert_head(&recent_events_list, entry); 417 entry->re_timestamp = now; 418 419 /* Start a cleaner if not already scheduled */ 420 if (recent_events_cleaner_tqid == 0) 421 zfs_ereport_schedule_cleaner(); 422 423 mutex_exit(&recent_events_lock); 424 return (B_FALSE); 425 } 426 427 void 428 zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector) 429 { 430 if (nvl) 431 fm_nvlist_destroy(nvl, FM_NVA_FREE); 432 433 if (detector) 434 fm_nvlist_destroy(detector, FM_NVA_FREE); 435 } 436 437 /* 438 * We want to rate limit ZIO delay, deadman, and checksum events so as to not 439 * flood zevent consumers when a disk is acting up. 440 * 441 * Returns 1 if we're ratelimiting, 0 if not. 442 */ 443 static int 444 zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd) 445 { 446 int rc = 0; 447 /* 448 * zfs_ratelimit() returns 1 if we're *not* ratelimiting and 0 if we 449 * are. Invert it to get our return value. 450 */ 451 if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) { 452 rc = !zfs_ratelimit(&vd->vdev_delay_rl); 453 } else if (strcmp(subclass, FM_EREPORT_ZFS_DEADMAN) == 0) { 454 rc = !zfs_ratelimit(&vd->vdev_deadman_rl); 455 } else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) { 456 rc = !zfs_ratelimit(&vd->vdev_checksum_rl); 457 } 458 459 if (rc) { 460 /* We're rate limiting */ 461 fm_erpt_dropped_increment(); 462 } 463 464 return (rc); 465 } 466 467 /* 468 * Return B_TRUE if the event actually posted, B_FALSE if not. 469 */ 470 static boolean_t 471 zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, 472 const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, 473 zio_t *zio, uint64_t stateoroffset, uint64_t size) 474 { 475 nvlist_t *ereport, *detector; 476 477 uint64_t ena; 478 char class[64]; 479 480 if ((ereport = fm_nvlist_create(NULL)) == NULL) 481 return (B_FALSE); 482 483 if ((detector = fm_nvlist_create(NULL)) == NULL) { 484 fm_nvlist_destroy(ereport, FM_NVA_FREE); 485 return (B_FALSE); 486 } 487 488 /* 489 * Serialize ereport generation 490 */ 491 mutex_enter(&spa->spa_errlist_lock); 492 493 /* 494 * Determine the ENA to use for this event. If we are in a loading 495 * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use 496 * a root zio-wide ENA. Otherwise, simply use a unique ENA. 497 */ 498 if (spa_load_state(spa) != SPA_LOAD_NONE) { 499 if (spa->spa_ena == 0) 500 spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); 501 ena = spa->spa_ena; 502 } else if (zio != NULL && zio->io_logical != NULL) { 503 if (zio->io_logical->io_ena == 0) 504 zio->io_logical->io_ena = 505 fm_ena_generate(0, FM_ENA_FMT1); 506 ena = zio->io_logical->io_ena; 507 } else { 508 ena = fm_ena_generate(0, FM_ENA_FMT1); 509 } 510 511 /* 512 * Construct the full class, detector, and other standard FMA fields. 513 */ 514 (void) snprintf(class, sizeof (class), "%s.%s", 515 ZFS_ERROR_CLASS, subclass); 516 517 fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa), 518 vd != NULL ? vd->vdev_guid : 0); 519 520 fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL); 521 522 /* 523 * Construct the per-ereport payload, depending on which parameters are 524 * passed in. 525 */ 526 527 /* 528 * Generic payload members common to all ereports. 529 */ 530 fm_payload_set(ereport, 531 FM_EREPORT_PAYLOAD_ZFS_POOL, DATA_TYPE_STRING, spa_name(spa), 532 FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa), 533 FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, DATA_TYPE_UINT64, 534 (uint64_t)spa_state(spa), 535 FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, 536 (int32_t)spa_load_state(spa), NULL); 537 538 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, 539 DATA_TYPE_STRING, 540 spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? 541 FM_EREPORT_FAILMODE_WAIT : 542 spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? 543 FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC, 544 NULL); 545 546 if (vd != NULL) { 547 vdev_t *pvd = vd->vdev_parent; 548 vdev_queue_t *vq = &vd->vdev_queue; 549 vdev_stat_t *vs = &vd->vdev_stat; 550 vdev_t *spare_vd; 551 uint64_t *spare_guids; 552 char **spare_paths; 553 int i, spare_count; 554 555 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, 556 DATA_TYPE_UINT64, vd->vdev_guid, 557 FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, 558 DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); 559 if (vd->vdev_path != NULL) 560 fm_payload_set(ereport, 561 FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, 562 DATA_TYPE_STRING, vd->vdev_path, NULL); 563 if (vd->vdev_devid != NULL) 564 fm_payload_set(ereport, 565 FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, 566 DATA_TYPE_STRING, vd->vdev_devid, NULL); 567 if (vd->vdev_fru != NULL) 568 fm_payload_set(ereport, 569 FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, 570 DATA_TYPE_STRING, vd->vdev_fru, NULL); 571 if (vd->vdev_enc_sysfs_path != NULL) 572 fm_payload_set(ereport, 573 FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, 574 DATA_TYPE_STRING, vd->vdev_enc_sysfs_path, NULL); 575 if (vd->vdev_ashift) 576 fm_payload_set(ereport, 577 FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT, 578 DATA_TYPE_UINT64, vd->vdev_ashift, NULL); 579 580 if (vq != NULL) { 581 fm_payload_set(ereport, 582 FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS, 583 DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL); 584 fm_payload_set(ereport, 585 FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS, 586 DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL); 587 } 588 589 if (vs != NULL) { 590 fm_payload_set(ereport, 591 FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS, 592 DATA_TYPE_UINT64, vs->vs_read_errors, 593 FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS, 594 DATA_TYPE_UINT64, vs->vs_write_errors, 595 FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS, 596 DATA_TYPE_UINT64, vs->vs_checksum_errors, 597 FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS, 598 DATA_TYPE_UINT64, vs->vs_slow_ios, 599 FM_EREPORT_PAYLOAD_ZFS_VDEV_DIO_VERIFY_ERRORS, 600 DATA_TYPE_UINT64, vs->vs_dio_verify_errors, 601 NULL); 602 } 603 604 if (pvd != NULL) { 605 fm_payload_set(ereport, 606 FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, 607 DATA_TYPE_UINT64, pvd->vdev_guid, 608 FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, 609 DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type, 610 NULL); 611 if (pvd->vdev_path) 612 fm_payload_set(ereport, 613 FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, 614 DATA_TYPE_STRING, pvd->vdev_path, NULL); 615 if (pvd->vdev_devid) 616 fm_payload_set(ereport, 617 FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, 618 DATA_TYPE_STRING, pvd->vdev_devid, NULL); 619 } 620 621 spare_count = spa->spa_spares.sav_count; 622 spare_paths = kmem_zalloc(sizeof (char *) * spare_count, 623 KM_SLEEP); 624 spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count, 625 KM_SLEEP); 626 627 for (i = 0; i < spare_count; i++) { 628 spare_vd = spa->spa_spares.sav_vdevs[i]; 629 if (spare_vd) { 630 spare_paths[i] = spare_vd->vdev_path; 631 spare_guids[i] = spare_vd->vdev_guid; 632 } 633 } 634 635 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS, 636 DATA_TYPE_STRING_ARRAY, spare_count, spare_paths, 637 FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS, 638 DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL); 639 640 kmem_free(spare_guids, sizeof (uint64_t) * spare_count); 641 kmem_free(spare_paths, sizeof (char *) * spare_count); 642 } 643 644 if (zio != NULL) { 645 /* 646 * Payload common to all I/Os. 647 */ 648 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, 649 DATA_TYPE_INT32, zio->io_error, NULL); 650 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, 651 DATA_TYPE_UINT64, zio->io_flags, NULL); 652 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE, 653 DATA_TYPE_UINT32, zio->io_stage, NULL); 654 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE, 655 DATA_TYPE_UINT32, zio->io_pipeline, NULL); 656 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY, 657 DATA_TYPE_UINT64, zio->io_delay, NULL); 658 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP, 659 DATA_TYPE_UINT64, zio->io_timestamp, NULL); 660 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA, 661 DATA_TYPE_UINT64, zio->io_delta, NULL); 662 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TYPE, 663 DATA_TYPE_UINT32, zio->io_type, NULL); 664 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, 665 DATA_TYPE_UINT32, zio->io_priority, NULL); 666 667 /* 668 * If the 'size' parameter is non-zero, it indicates this is a 669 * RAID-Z or other I/O where the physical offset and length are 670 * provided for us, instead of within the zio_t. 671 */ 672 if (vd != NULL) { 673 if (size) 674 fm_payload_set(ereport, 675 FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, 676 DATA_TYPE_UINT64, stateoroffset, 677 FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, 678 DATA_TYPE_UINT64, size, NULL); 679 else 680 fm_payload_set(ereport, 681 FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, 682 DATA_TYPE_UINT64, zio->io_offset, 683 FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, 684 DATA_TYPE_UINT64, zio->io_size, NULL); 685 } 686 } else if (vd != NULL) { 687 /* 688 * If we have a vdev but no zio, this is a device fault, and the 689 * 'stateoroffset' parameter indicates the previous state of the 690 * vdev. 691 */ 692 fm_payload_set(ereport, 693 FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, 694 DATA_TYPE_UINT64, stateoroffset, NULL); 695 } 696 697 /* 698 * Payload for I/Os with corresponding logical information. 699 */ 700 if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) { 701 fm_payload_set(ereport, 702 FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, 703 DATA_TYPE_UINT64, zb->zb_objset, 704 FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, 705 DATA_TYPE_UINT64, zb->zb_object, 706 FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, 707 DATA_TYPE_INT64, zb->zb_level, 708 FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, 709 DATA_TYPE_UINT64, zb->zb_blkid, NULL); 710 } 711 712 /* 713 * Payload for tuning the zed 714 */ 715 if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) { 716 uint64_t cksum_n, cksum_t; 717 718 cksum_n = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_N); 719 if (cksum_n != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N)) 720 fm_payload_set(ereport, 721 FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N, 722 DATA_TYPE_UINT64, 723 cksum_n, 724 NULL); 725 726 cksum_t = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_T); 727 if (cksum_t != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T)) 728 fm_payload_set(ereport, 729 FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T, 730 DATA_TYPE_UINT64, 731 cksum_t, 732 NULL); 733 } 734 735 if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) { 736 uint64_t io_n, io_t; 737 738 io_n = vdev_prop_get_inherited(vd, VDEV_PROP_IO_N); 739 if (io_n != vdev_prop_default_numeric(VDEV_PROP_IO_N)) 740 fm_payload_set(ereport, 741 FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N, 742 DATA_TYPE_UINT64, 743 io_n, 744 NULL); 745 746 io_t = vdev_prop_get_inherited(vd, VDEV_PROP_IO_T); 747 if (io_t != vdev_prop_default_numeric(VDEV_PROP_IO_T)) 748 fm_payload_set(ereport, 749 FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T, 750 DATA_TYPE_UINT64, 751 io_t, 752 NULL); 753 } 754 755 if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) { 756 uint64_t slow_io_n, slow_io_t; 757 758 slow_io_n = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_N); 759 if (slow_io_n != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N)) 760 fm_payload_set(ereport, 761 FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N, 762 DATA_TYPE_UINT64, 763 slow_io_n, 764 NULL); 765 766 slow_io_t = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_T); 767 if (slow_io_t != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T)) 768 fm_payload_set(ereport, 769 FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T, 770 DATA_TYPE_UINT64, 771 slow_io_t, 772 NULL); 773 } 774 775 mutex_exit(&spa->spa_errlist_lock); 776 777 *ereport_out = ereport; 778 *detector_out = detector; 779 return (B_TRUE); 780 } 781 782 /* if it's <= 128 bytes, save the corruption directly */ 783 #define ZFM_MAX_INLINE (128 / sizeof (uint64_t)) 784 785 #define MAX_RANGES 16 786 787 typedef struct zfs_ecksum_info { 788 /* inline arrays of bits set and cleared. */ 789 uint64_t zei_bits_set[ZFM_MAX_INLINE]; 790 uint64_t zei_bits_cleared[ZFM_MAX_INLINE]; 791 792 /* 793 * for each range, the number of bits set and cleared. The Hamming 794 * distance between the good and bad buffers is the sum of them all. 795 */ 796 uint32_t zei_range_sets[MAX_RANGES]; 797 uint32_t zei_range_clears[MAX_RANGES]; 798 799 struct zei_ranges { 800 uint32_t zr_start; 801 uint32_t zr_end; 802 } zei_ranges[MAX_RANGES]; 803 804 size_t zei_range_count; 805 uint32_t zei_mingap; 806 uint32_t zei_allowed_mingap; 807 808 } zfs_ecksum_info_t; 809 810 static void 811 update_bad_bits(uint64_t value_arg, uint32_t *count) 812 { 813 size_t i; 814 size_t bits = 0; 815 uint64_t value = BE_64(value_arg); 816 817 /* We store the bits in big-endian (largest-first) order */ 818 for (i = 0; i < 64; i++) { 819 if (value & (1ull << i)) 820 ++bits; 821 } 822 /* update the count of bits changed */ 823 *count += bits; 824 } 825 826 /* 827 * We've now filled up the range array, and need to increase "mingap" and 828 * shrink the range list accordingly. zei_mingap is always the smallest 829 * distance between array entries, so we set the new_allowed_gap to be 830 * one greater than that. We then go through the list, joining together 831 * any ranges which are closer than the new_allowed_gap. 832 * 833 * By construction, there will be at least one. We also update zei_mingap 834 * to the new smallest gap, to prepare for our next invocation. 835 */ 836 static void 837 zei_shrink_ranges(zfs_ecksum_info_t *eip) 838 { 839 uint32_t mingap = UINT32_MAX; 840 uint32_t new_allowed_gap = eip->zei_mingap + 1; 841 842 size_t idx, output; 843 size_t max = eip->zei_range_count; 844 845 struct zei_ranges *r = eip->zei_ranges; 846 847 ASSERT3U(eip->zei_range_count, >, 0); 848 ASSERT3U(eip->zei_range_count, <=, MAX_RANGES); 849 850 output = idx = 0; 851 while (idx < max - 1) { 852 uint32_t start = r[idx].zr_start; 853 uint32_t end = r[idx].zr_end; 854 855 while (idx < max - 1) { 856 idx++; 857 858 uint32_t nstart = r[idx].zr_start; 859 uint32_t nend = r[idx].zr_end; 860 861 uint32_t gap = nstart - end; 862 if (gap < new_allowed_gap) { 863 end = nend; 864 continue; 865 } 866 if (gap < mingap) 867 mingap = gap; 868 break; 869 } 870 r[output].zr_start = start; 871 r[output].zr_end = end; 872 output++; 873 } 874 ASSERT3U(output, <, eip->zei_range_count); 875 eip->zei_range_count = output; 876 eip->zei_mingap = mingap; 877 eip->zei_allowed_mingap = new_allowed_gap; 878 } 879 880 static void 881 zei_add_range(zfs_ecksum_info_t *eip, int start, int end) 882 { 883 struct zei_ranges *r = eip->zei_ranges; 884 size_t count = eip->zei_range_count; 885 886 if (count >= MAX_RANGES) { 887 zei_shrink_ranges(eip); 888 count = eip->zei_range_count; 889 } 890 if (count == 0) { 891 eip->zei_mingap = UINT32_MAX; 892 eip->zei_allowed_mingap = 1; 893 } else { 894 int gap = start - r[count - 1].zr_end; 895 896 if (gap < eip->zei_allowed_mingap) { 897 r[count - 1].zr_end = end; 898 return; 899 } 900 if (gap < eip->zei_mingap) 901 eip->zei_mingap = gap; 902 } 903 r[count].zr_start = start; 904 r[count].zr_end = end; 905 eip->zei_range_count++; 906 } 907 908 static size_t 909 zei_range_total_size(zfs_ecksum_info_t *eip) 910 { 911 struct zei_ranges *r = eip->zei_ranges; 912 size_t count = eip->zei_range_count; 913 size_t result = 0; 914 size_t idx; 915 916 for (idx = 0; idx < count; idx++) 917 result += (r[idx].zr_end - r[idx].zr_start); 918 919 return (result); 920 } 921 922 static zfs_ecksum_info_t * 923 annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, 924 const abd_t *goodabd, const abd_t *badabd, size_t size, 925 boolean_t drop_if_identical) 926 { 927 const uint64_t *good; 928 const uint64_t *bad; 929 930 size_t nui64s = size / sizeof (uint64_t); 931 932 size_t inline_size; 933 int no_inline = 0; 934 size_t idx; 935 size_t range; 936 937 size_t offset = 0; 938 ssize_t start = -1; 939 940 zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP); 941 942 /* don't do any annotation for injected checksum errors */ 943 if (info != NULL && info->zbc_injected) 944 return (eip); 945 946 if (info != NULL && info->zbc_has_cksum) { 947 fm_payload_set(ereport, 948 FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO, 949 DATA_TYPE_STRING, 950 info->zbc_checksum_name, 951 NULL); 952 953 if (info->zbc_byteswapped) { 954 fm_payload_set(ereport, 955 FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP, 956 DATA_TYPE_BOOLEAN, 1, 957 NULL); 958 } 959 } 960 961 if (badabd == NULL || goodabd == NULL) 962 return (eip); 963 964 ASSERT3U(nui64s, <=, UINT32_MAX); 965 ASSERT3U(size, ==, nui64s * sizeof (uint64_t)); 966 ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 967 ASSERT3U(size, <=, UINT32_MAX); 968 969 good = (const uint64_t *) abd_borrow_buf_copy((abd_t *)goodabd, size); 970 bad = (const uint64_t *) abd_borrow_buf_copy((abd_t *)badabd, size); 971 972 /* build up the range list by comparing the two buffers. */ 973 for (idx = 0; idx < nui64s; idx++) { 974 if (good[idx] == bad[idx]) { 975 if (start == -1) 976 continue; 977 978 zei_add_range(eip, start, idx); 979 start = -1; 980 } else { 981 if (start != -1) 982 continue; 983 984 start = idx; 985 } 986 } 987 if (start != -1) 988 zei_add_range(eip, start, idx); 989 990 /* See if it will fit in our inline buffers */ 991 inline_size = zei_range_total_size(eip); 992 if (inline_size > ZFM_MAX_INLINE) 993 no_inline = 1; 994 995 /* 996 * If there is no change and we want to drop if the buffers are 997 * identical, do so. 998 */ 999 if (inline_size == 0 && drop_if_identical) { 1000 kmem_free(eip, sizeof (*eip)); 1001 abd_return_buf((abd_t *)goodabd, (void *)good, size); 1002 abd_return_buf((abd_t *)badabd, (void *)bad, size); 1003 return (NULL); 1004 } 1005 1006 /* 1007 * Now walk through the ranges, filling in the details of the 1008 * differences. Also convert our uint64_t-array offsets to byte 1009 * offsets. 1010 */ 1011 for (range = 0; range < eip->zei_range_count; range++) { 1012 size_t start = eip->zei_ranges[range].zr_start; 1013 size_t end = eip->zei_ranges[range].zr_end; 1014 1015 for (idx = start; idx < end; idx++) { 1016 uint64_t set, cleared; 1017 1018 // bits set in bad, but not in good 1019 set = ((~good[idx]) & bad[idx]); 1020 // bits set in good, but not in bad 1021 cleared = (good[idx] & (~bad[idx])); 1022 1023 if (!no_inline) { 1024 ASSERT3U(offset, <, inline_size); 1025 eip->zei_bits_set[offset] = set; 1026 eip->zei_bits_cleared[offset] = cleared; 1027 offset++; 1028 } 1029 1030 update_bad_bits(set, &eip->zei_range_sets[range]); 1031 update_bad_bits(cleared, &eip->zei_range_clears[range]); 1032 } 1033 1034 /* convert to byte offsets */ 1035 eip->zei_ranges[range].zr_start *= sizeof (uint64_t); 1036 eip->zei_ranges[range].zr_end *= sizeof (uint64_t); 1037 } 1038 1039 abd_return_buf((abd_t *)goodabd, (void *)good, size); 1040 abd_return_buf((abd_t *)badabd, (void *)bad, size); 1041 1042 eip->zei_allowed_mingap *= sizeof (uint64_t); 1043 inline_size *= sizeof (uint64_t); 1044 1045 /* fill in ereport */ 1046 fm_payload_set(ereport, 1047 FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES, 1048 DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count, 1049 (uint32_t *)eip->zei_ranges, 1050 FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP, 1051 DATA_TYPE_UINT32, eip->zei_allowed_mingap, 1052 FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS, 1053 DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets, 1054 FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS, 1055 DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears, 1056 NULL); 1057 1058 if (!no_inline) { 1059 fm_payload_set(ereport, 1060 FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS, 1061 DATA_TYPE_UINT8_ARRAY, 1062 inline_size, (uint8_t *)eip->zei_bits_set, 1063 FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS, 1064 DATA_TYPE_UINT8_ARRAY, 1065 inline_size, (uint8_t *)eip->zei_bits_cleared, 1066 NULL); 1067 } 1068 return (eip); 1069 } 1070 #else 1071 void 1072 zfs_ereport_clear(spa_t *spa, vdev_t *vd) 1073 { 1074 (void) spa, (void) vd; 1075 } 1076 #endif 1077 1078 /* 1079 * Make sure our event is still valid for the given zio/vdev/pool. For example, 1080 * we don't want to keep logging events for a faulted or missing vdev. 1081 */ 1082 boolean_t 1083 zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) 1084 { 1085 #ifdef _KERNEL 1086 /* 1087 * If we are doing a spa_tryimport() or in recovery mode, 1088 * ignore errors. 1089 */ 1090 if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || 1091 spa_load_state(spa) == SPA_LOAD_RECOVER) 1092 return (B_FALSE); 1093 1094 /* 1095 * If we are in the middle of opening a pool, and the previous attempt 1096 * failed, don't bother logging any new ereports - we're just going to 1097 * get the same diagnosis anyway. 1098 */ 1099 if (spa_load_state(spa) != SPA_LOAD_NONE && 1100 spa->spa_last_open_failed) 1101 return (B_FALSE); 1102 1103 if (zio != NULL) { 1104 /* If this is not a read or write zio, ignore the error */ 1105 if (zio->io_type != ZIO_TYPE_READ && 1106 zio->io_type != ZIO_TYPE_WRITE) 1107 return (B_FALSE); 1108 1109 if (vd != NULL) { 1110 /* 1111 * If the vdev has already been marked as failing due 1112 * to a failed probe, then ignore any subsequent I/O 1113 * errors, as the DE will automatically fault the vdev 1114 * on the first such failure. This also catches cases 1115 * where vdev_remove_wanted is set and the device has 1116 * not yet been asynchronously placed into the REMOVED 1117 * state. 1118 */ 1119 if (zio->io_vd == vd && !vdev_accessible(vd, zio)) 1120 return (B_FALSE); 1121 1122 /* 1123 * Ignore checksum errors for reads from DTL regions of 1124 * leaf vdevs. 1125 */ 1126 if (zio->io_type == ZIO_TYPE_READ && 1127 zio->io_error == ECKSUM && 1128 vd->vdev_ops->vdev_op_leaf && 1129 vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) 1130 return (B_FALSE); 1131 } 1132 } 1133 1134 /* 1135 * For probe failure, we want to avoid posting ereports if we've 1136 * already removed the device in the meantime. 1137 */ 1138 if (vd != NULL && 1139 strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && 1140 (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) 1141 return (B_FALSE); 1142 1143 /* Ignore bogus delay events (like from ioctls or unqueued IOs) */ 1144 if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) && 1145 (zio != NULL) && (!zio->io_timestamp)) { 1146 return (B_FALSE); 1147 } 1148 #else 1149 (void) subclass, (void) spa, (void) vd, (void) zio; 1150 #endif 1151 return (B_TRUE); 1152 } 1153 1154 /* 1155 * Post an ereport for the given subclass 1156 * 1157 * Returns 1158 * - 0 if an event was posted 1159 * - EINVAL if there was a problem posting event 1160 * - EBUSY if the event was rate limited 1161 * - EALREADY if the event was already posted (duplicate) 1162 */ 1163 int 1164 zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, 1165 const zbookmark_phys_t *zb, zio_t *zio, uint64_t state) 1166 { 1167 int rc = 0; 1168 #ifdef _KERNEL 1169 nvlist_t *ereport = NULL; 1170 nvlist_t *detector = NULL; 1171 1172 if (!zfs_ereport_is_valid(subclass, spa, vd, zio)) 1173 return (EINVAL); 1174 1175 if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0)) 1176 return (SET_ERROR(EALREADY)); 1177 1178 if (zfs_is_ratelimiting_event(subclass, vd)) 1179 return (SET_ERROR(EBUSY)); 1180 1181 if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd, 1182 zb, zio, state, 0)) 1183 return (SET_ERROR(EINVAL)); /* couldn't post event */ 1184 1185 if (ereport == NULL) 1186 return (SET_ERROR(EINVAL)); 1187 1188 /* Cleanup is handled by the callback function */ 1189 rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); 1190 #else 1191 (void) subclass, (void) spa, (void) vd, (void) zb, (void) zio, 1192 (void) state; 1193 #endif 1194 return (rc); 1195 } 1196 1197 /* 1198 * Prepare a checksum ereport 1199 * 1200 * Returns 1201 * - 0 if an event was posted 1202 * - EINVAL if there was a problem posting event 1203 * - EBUSY if the event was rate limited 1204 * - EALREADY if the event was already posted (duplicate) 1205 */ 1206 int 1207 zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, 1208 struct zio *zio, uint64_t offset, uint64_t length, zio_bad_cksum_t *info) 1209 { 1210 zio_cksum_report_t *report; 1211 1212 #ifdef _KERNEL 1213 if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) 1214 return (SET_ERROR(EINVAL)); 1215 1216 if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, 1217 offset, length)) 1218 return (SET_ERROR(EALREADY)); 1219 1220 if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) 1221 return (SET_ERROR(EBUSY)); 1222 #else 1223 (void) zb, (void) offset; 1224 #endif 1225 1226 report = kmem_zalloc(sizeof (*report), KM_SLEEP); 1227 1228 zio_vsd_default_cksum_report(zio, report); 1229 1230 /* copy the checksum failure information if it was provided */ 1231 if (info != NULL) { 1232 report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP); 1233 memcpy(report->zcr_ckinfo, info, sizeof (*info)); 1234 } 1235 1236 report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift; 1237 report->zcr_align = 1238 vdev_psize_to_asize(vd->vdev_top, report->zcr_sector); 1239 report->zcr_length = length; 1240 1241 #ifdef _KERNEL 1242 (void) zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, 1243 FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length); 1244 1245 if (report->zcr_ereport == NULL) { 1246 zfs_ereport_free_checksum(report); 1247 return (0); 1248 } 1249 #endif 1250 1251 mutex_enter(&spa->spa_errlist_lock); 1252 report->zcr_next = zio->io_logical->io_cksum_report; 1253 zio->io_logical->io_cksum_report = report; 1254 mutex_exit(&spa->spa_errlist_lock); 1255 return (0); 1256 } 1257 1258 void 1259 zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data, 1260 const abd_t *bad_data, boolean_t drop_if_identical) 1261 { 1262 #ifdef _KERNEL 1263 zfs_ecksum_info_t *info; 1264 1265 info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo, 1266 good_data, bad_data, report->zcr_length, drop_if_identical); 1267 if (info != NULL) 1268 zfs_zevent_post(report->zcr_ereport, 1269 report->zcr_detector, zfs_zevent_post_cb); 1270 else 1271 zfs_zevent_post_cb(report->zcr_ereport, report->zcr_detector); 1272 1273 report->zcr_ereport = report->zcr_detector = NULL; 1274 if (info != NULL) 1275 kmem_free(info, sizeof (*info)); 1276 #else 1277 (void) report, (void) good_data, (void) bad_data, 1278 (void) drop_if_identical; 1279 #endif 1280 } 1281 1282 void 1283 zfs_ereport_free_checksum(zio_cksum_report_t *rpt) 1284 { 1285 #ifdef _KERNEL 1286 if (rpt->zcr_ereport != NULL) { 1287 fm_nvlist_destroy(rpt->zcr_ereport, 1288 FM_NVA_FREE); 1289 fm_nvlist_destroy(rpt->zcr_detector, 1290 FM_NVA_FREE); 1291 } 1292 #endif 1293 rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo); 1294 1295 if (rpt->zcr_ckinfo != NULL) 1296 kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo)); 1297 1298 kmem_free(rpt, sizeof (*rpt)); 1299 } 1300 1301 /* 1302 * Post a checksum ereport 1303 * 1304 * Returns 1305 * - 0 if an event was posted 1306 * - EINVAL if there was a problem posting event 1307 * - EBUSY if the event was rate limited 1308 * - EALREADY if the event was already posted (duplicate) 1309 */ 1310 int 1311 zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, 1312 struct zio *zio, uint64_t offset, uint64_t length, 1313 const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc) 1314 { 1315 int rc = 0; 1316 #ifdef _KERNEL 1317 nvlist_t *ereport = NULL; 1318 nvlist_t *detector = NULL; 1319 zfs_ecksum_info_t *info; 1320 1321 if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) 1322 return (SET_ERROR(EINVAL)); 1323 1324 if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, 1325 offset, length)) 1326 return (SET_ERROR(EALREADY)); 1327 1328 if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) 1329 return (SET_ERROR(EBUSY)); 1330 1331 if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, 1332 spa, vd, zb, zio, offset, length) || (ereport == NULL)) { 1333 return (SET_ERROR(EINVAL)); 1334 } 1335 1336 info = annotate_ecksum(ereport, zbc, good_data, bad_data, length, 1337 B_FALSE); 1338 1339 if (info != NULL) { 1340 rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); 1341 kmem_free(info, sizeof (*info)); 1342 } 1343 #else 1344 (void) spa, (void) vd, (void) zb, (void) zio, (void) offset, 1345 (void) length, (void) good_data, (void) bad_data, (void) zbc; 1346 #endif 1347 return (rc); 1348 } 1349 1350 /* 1351 * The 'sysevent.fs.zfs.*' events are signals posted to notify user space of 1352 * change in the pool. All sysevents are listed in sys/sysevent/eventdefs.h 1353 * and are designed to be consumed by the ZFS Event Daemon (ZED). For 1354 * additional details refer to the zed(8) man page. 1355 */ 1356 nvlist_t * 1357 zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name, 1358 nvlist_t *aux) 1359 { 1360 nvlist_t *resource = NULL; 1361 #ifdef _KERNEL 1362 char class[64]; 1363 1364 if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) 1365 return (NULL); 1366 1367 if ((resource = fm_nvlist_create(NULL)) == NULL) 1368 return (NULL); 1369 1370 (void) snprintf(class, sizeof (class), "%s.%s.%s", type, 1371 ZFS_ERROR_CLASS, name); 1372 VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION)); 1373 VERIFY0(nvlist_add_string(resource, FM_CLASS, class)); 1374 VERIFY0(nvlist_add_string(resource, 1375 FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa))); 1376 VERIFY0(nvlist_add_uint64(resource, 1377 FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa))); 1378 VERIFY0(nvlist_add_uint64(resource, 1379 FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, spa_state(spa))); 1380 VERIFY0(nvlist_add_int32(resource, 1381 FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa))); 1382 1383 if (vd) { 1384 VERIFY0(nvlist_add_uint64(resource, 1385 FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid)); 1386 VERIFY0(nvlist_add_uint64(resource, 1387 FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state)); 1388 if (vd->vdev_path != NULL) 1389 VERIFY0(nvlist_add_string(resource, 1390 FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path)); 1391 if (vd->vdev_devid != NULL) 1392 VERIFY0(nvlist_add_string(resource, 1393 FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid)); 1394 if (vd->vdev_fru != NULL) 1395 VERIFY0(nvlist_add_string(resource, 1396 FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru)); 1397 if (vd->vdev_enc_sysfs_path != NULL) 1398 VERIFY0(nvlist_add_string(resource, 1399 FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, 1400 vd->vdev_enc_sysfs_path)); 1401 } 1402 1403 /* also copy any optional payload data */ 1404 if (aux) { 1405 nvpair_t *elem = NULL; 1406 1407 while ((elem = nvlist_next_nvpair(aux, elem)) != NULL) 1408 (void) nvlist_add_nvpair(resource, elem); 1409 } 1410 #else 1411 (void) spa, (void) vd, (void) type, (void) name, (void) aux; 1412 #endif 1413 return (resource); 1414 } 1415 1416 static void 1417 zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name, 1418 nvlist_t *aux) 1419 { 1420 #ifdef _KERNEL 1421 nvlist_t *resource; 1422 1423 resource = zfs_event_create(spa, vd, type, name, aux); 1424 if (resource) 1425 zfs_zevent_post(resource, NULL, zfs_zevent_post_cb); 1426 #else 1427 (void) spa, (void) vd, (void) type, (void) name, (void) aux; 1428 #endif 1429 } 1430 1431 /* 1432 * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev 1433 * has been removed from the system. This will cause the DE to ignore any 1434 * recent I/O errors, inferring that they are due to the asynchronous device 1435 * removal. 1436 */ 1437 void 1438 zfs_post_remove(spa_t *spa, vdev_t *vd, boolean_t by_kernel) 1439 { 1440 nvlist_t *aux = NULL; 1441 1442 if (by_kernel) { 1443 /* 1444 * Add optional supplemental keys to payload 1445 */ 1446 aux = fm_nvlist_create(NULL); 1447 if (aux) 1448 fnvlist_add_boolean(aux, "by_kernel"); 1449 } 1450 1451 zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, aux); 1452 1453 if (by_kernel && aux) 1454 fm_nvlist_destroy(aux, FM_NVA_FREE); 1455 } 1456 1457 /* 1458 * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool 1459 * has the 'autoreplace' property set, and therefore any broken vdevs will be 1460 * handled by higher level logic, and no vdev fault should be generated. 1461 */ 1462 void 1463 zfs_post_autoreplace(spa_t *spa, vdev_t *vd) 1464 { 1465 zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_AUTOREPLACE, NULL); 1466 } 1467 1468 /* 1469 * The 'resource.fs.zfs.statechange' event is an internal signal that the 1470 * given vdev has transitioned its state to DEGRADED or HEALTHY. This will 1471 * cause the retire agent to repair any outstanding fault management cases 1472 * open because the device was not found (fault.fs.zfs.device). 1473 */ 1474 void 1475 zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate) 1476 { 1477 #ifdef _KERNEL 1478 nvlist_t *aux; 1479 1480 /* 1481 * Add optional supplemental keys to payload 1482 */ 1483 aux = fm_nvlist_create(NULL); 1484 if (vd && aux) { 1485 if (vd->vdev_physpath) { 1486 fnvlist_add_string(aux, 1487 FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH, 1488 vd->vdev_physpath); 1489 } 1490 if (vd->vdev_enc_sysfs_path) { 1491 fnvlist_add_string(aux, 1492 FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, 1493 vd->vdev_enc_sysfs_path); 1494 } 1495 1496 fnvlist_add_uint64(aux, 1497 FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate); 1498 } 1499 1500 zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_STATECHANGE, 1501 aux); 1502 1503 if (aux) 1504 fm_nvlist_destroy(aux, FM_NVA_FREE); 1505 #else 1506 (void) spa, (void) vd, (void) laststate; 1507 #endif 1508 } 1509 1510 #ifdef _KERNEL 1511 void 1512 zfs_ereport_init(void) 1513 { 1514 mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL); 1515 list_create(&recent_events_list, sizeof (recent_events_node_t), 1516 offsetof(recent_events_node_t, re_list_link)); 1517 avl_create(&recent_events_tree, recent_events_compare, 1518 sizeof (recent_events_node_t), offsetof(recent_events_node_t, 1519 re_tree_link)); 1520 } 1521 1522 /* 1523 * This 'early' fini needs to run before zfs_fini() which on Linux waits 1524 * for the system_delay_taskq to drain. 1525 */ 1526 void 1527 zfs_ereport_taskq_fini(void) 1528 { 1529 mutex_enter(&recent_events_lock); 1530 if (recent_events_cleaner_tqid != 0) { 1531 taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid); 1532 recent_events_cleaner_tqid = 0; 1533 } 1534 mutex_exit(&recent_events_lock); 1535 } 1536 1537 void 1538 zfs_ereport_fini(void) 1539 { 1540 recent_events_node_t *entry; 1541 1542 while ((entry = list_remove_head(&recent_events_list)) != NULL) { 1543 avl_remove(&recent_events_tree, entry); 1544 kmem_free(entry, sizeof (*entry)); 1545 } 1546 avl_destroy(&recent_events_tree); 1547 list_destroy(&recent_events_list); 1548 mutex_destroy(&recent_events_lock); 1549 } 1550 1551 void 1552 zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, const char *name) 1553 { 1554 nvlist_t *aux; 1555 1556 aux = fm_nvlist_create(NULL); 1557 fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME, name); 1558 1559 zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux); 1560 fm_nvlist_destroy(aux, FM_NVA_FREE); 1561 } 1562 1563 /* 1564 * Post when a event when a zvol is created or removed 1565 * 1566 * This is currently only used by macOS, since it uses the event to create 1567 * symlinks between the volume name (mypool/myvol) and the actual /dev 1568 * device (/dev/disk3). For example: 1569 * 1570 * /var/run/zfs/dsk/mypool/myvol -> /dev/disk3 1571 * 1572 * name: The full name of the zvol ("mypool/myvol") 1573 * dev_name: The full /dev name for the zvol ("/dev/disk3") 1574 * raw_name: The raw /dev name for the zvol ("/dev/rdisk3") 1575 */ 1576 void 1577 zfs_ereport_zvol_post(const char *subclass, const char *name, 1578 const char *dev_name, const char *raw_name) 1579 { 1580 nvlist_t *aux; 1581 char *r; 1582 1583 boolean_t locked = mutex_owned(&spa_namespace_lock); 1584 if (!locked) mutex_enter(&spa_namespace_lock); 1585 spa_t *spa = spa_lookup(name); 1586 if (!locked) mutex_exit(&spa_namespace_lock); 1587 1588 if (spa == NULL) 1589 return; 1590 1591 aux = fm_nvlist_create(NULL); 1592 fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME, dev_name); 1593 fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME, 1594 raw_name); 1595 r = strchr(name, '/'); 1596 if (r && r[1]) 1597 fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VOLUME, &r[1]); 1598 1599 zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux); 1600 fm_nvlist_destroy(aux, FM_NVA_FREE); 1601 } 1602 1603 EXPORT_SYMBOL(zfs_ereport_post); 1604 EXPORT_SYMBOL(zfs_ereport_is_valid); 1605 EXPORT_SYMBOL(zfs_ereport_post_checksum); 1606 EXPORT_SYMBOL(zfs_post_remove); 1607 EXPORT_SYMBOL(zfs_post_autoreplace); 1608 EXPORT_SYMBOL(zfs_post_state_change); 1609 1610 ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW, 1611 "Maximum recent zevents records to retain for duplicate checking"); 1612 ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW, 1613 "Expiration time for recent zevents records"); 1614 #endif /* _KERNEL */ 1615