1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 2012,2021 by Delphix. All rights reserved. 28 */ 29 30 #include <sys/spa.h> 31 #include <sys/spa_impl.h> 32 #include <sys/vdev.h> 33 #include <sys/vdev_impl.h> 34 #include <sys/zio.h> 35 #include <sys/zio_checksum.h> 36 37 #include <sys/fm/fs/zfs.h> 38 #include <sys/fm/protocol.h> 39 #include <sys/fm/util.h> 40 #include <sys/sysevent.h> 41 42 /* 43 * This general routine is responsible for generating all the different ZFS 44 * ereports. The payload is dependent on the class, and which arguments are 45 * supplied to the function: 46 * 47 * EREPORT POOL VDEV IO 48 * block X X X 49 * data X X 50 * device X X 51 * pool X 52 * 53 * If we are in a loading state, all errors are chained together by the same 54 * SPA-wide ENA (Error Numeric Association). 55 * 56 * For isolated I/O requests, we get the ENA from the zio_t. The propagation 57 * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want 58 * to chain together all ereports associated with a logical piece of data. For 59 * read I/Os, there are basically three 'types' of I/O, which form a roughly 60 * layered diagram: 61 * 62 * +---------------+ 63 * | Aggregate I/O | No associated logical data or device 64 * +---------------+ 65 * | 66 * V 67 * +---------------+ Reads associated with a piece of logical data. 68 * | Read I/O | This includes reads on behalf of RAID-Z, 69 * +---------------+ mirrors, gang blocks, retries, etc. 70 * | 71 * V 72 * +---------------+ Reads associated with a particular device, but 73 * | Physical I/O | no logical data. Issued as part of vdev caching 74 * +---------------+ and I/O aggregation. 75 * 76 * Note that 'physical I/O' here is not the same terminology as used in the rest 77 * of ZIO. Typically, 'physical I/O' simply means that there is no attached 78 * blockpointer. But I/O with no associated block pointer can still be related 79 * to a logical piece of data (i.e. RAID-Z requests). 80 * 81 * Purely physical I/O always have unique ENAs. They are not related to a 82 * particular piece of logical data, and therefore cannot be chained together. 83 * We still generate an ereport, but the DE doesn't correlate it with any 84 * logical piece of data. When such an I/O fails, the delegated I/O requests 85 * will issue a retry, which will trigger the 'real' ereport with the correct 86 * ENA. 87 * 88 * We keep track of the ENA for a ZIO chain through the 'io_logical' member. 89 * When a new logical I/O is issued, we set this to point to itself. Child I/Os 90 * then inherit this pointer, so that when it is first set subsequent failures 91 * will use the same ENA. For vdev cache fill and queue aggregation I/O, 92 * this pointer is set to NULL, and no ereport will be generated (since it 93 * doesn't actually correspond to any particular device or piece of data, 94 * and the caller will always retry without caching or queueing anyway). 95 * 96 * For checksum errors, we want to include more information about the actual 97 * error which occurs. Accordingly, we build an ereport when the error is 98 * noticed, but instead of sending it in immediately, we hang it off of the 99 * io_cksum_report field of the logical IO. When the logical IO completes 100 * (successfully or not), zfs_ereport_finish_checksum() is called with the 101 * good and bad versions of the buffer (if available), and we annotate the 102 * ereport with information about the differences. 103 */ 104 105 #ifdef _KERNEL 106 /* 107 * Duplicate ereport Detection 108 * 109 * Some ereports are retained momentarily for detecting duplicates. These 110 * are kept in a recent_events_node_t in both a time-ordered list and an AVL 111 * tree of recent unique ereports. 112 * 113 * The lifespan of these recent ereports is bounded (15 mins) and a cleaner 114 * task is used to purge stale entries. 115 */ 116 static list_t recent_events_list; 117 static avl_tree_t recent_events_tree; 118 static kmutex_t recent_events_lock; 119 static taskqid_t recent_events_cleaner_tqid; 120 121 /* 122 * Each node is about 128 bytes so 2,000 would consume 1/4 MiB. 123 * 124 * This setting can be changed dynamically and setting it to zero 125 * disables duplicate detection. 126 */ 127 unsigned int zfs_zevent_retain_max = 2000; 128 129 /* 130 * The lifespan for a recent ereport entry. The default of 15 minutes is 131 * intended to outlive the zfs diagnosis engine's threshold of 10 errors 132 * over a period of 10 minutes. 133 */ 134 unsigned int zfs_zevent_retain_expire_secs = 900; 135 136 typedef enum zfs_subclass { 137 ZSC_IO, 138 ZSC_DATA, 139 ZSC_CHECKSUM 140 } zfs_subclass_t; 141 142 typedef struct { 143 /* common criteria */ 144 uint64_t re_pool_guid; 145 uint64_t re_vdev_guid; 146 int re_io_error; 147 uint64_t re_io_size; 148 uint64_t re_io_offset; 149 zfs_subclass_t re_subclass; 150 zio_priority_t re_io_priority; 151 152 /* logical zio criteria (optional) */ 153 zbookmark_phys_t re_io_bookmark; 154 155 /* internal state */ 156 avl_node_t re_tree_link; 157 list_node_t re_list_link; 158 uint64_t re_timestamp; 159 } recent_events_node_t; 160 161 static int 162 recent_events_compare(const void *a, const void *b) 163 { 164 const recent_events_node_t *node1 = a; 165 const recent_events_node_t *node2 = b; 166 int cmp; 167 168 /* 169 * The comparison order here is somewhat arbitrary. 170 * What's important is that if every criteria matches, then it 171 * is a duplicate (i.e. compare returns 0) 172 */ 173 if ((cmp = TREE_CMP(node1->re_subclass, node2->re_subclass)) != 0) 174 return (cmp); 175 if ((cmp = TREE_CMP(node1->re_pool_guid, node2->re_pool_guid)) != 0) 176 return (cmp); 177 if ((cmp = TREE_CMP(node1->re_vdev_guid, node2->re_vdev_guid)) != 0) 178 return (cmp); 179 if ((cmp = TREE_CMP(node1->re_io_error, node2->re_io_error)) != 0) 180 return (cmp); 181 if ((cmp = TREE_CMP(node1->re_io_priority, node2->re_io_priority)) != 0) 182 return (cmp); 183 if ((cmp = TREE_CMP(node1->re_io_size, node2->re_io_size)) != 0) 184 return (cmp); 185 if ((cmp = TREE_CMP(node1->re_io_offset, node2->re_io_offset)) != 0) 186 return (cmp); 187 188 const zbookmark_phys_t *zb1 = &node1->re_io_bookmark; 189 const zbookmark_phys_t *zb2 = &node2->re_io_bookmark; 190 191 if ((cmp = TREE_CMP(zb1->zb_objset, zb2->zb_objset)) != 0) 192 return (cmp); 193 if ((cmp = TREE_CMP(zb1->zb_object, zb2->zb_object)) != 0) 194 return (cmp); 195 if ((cmp = TREE_CMP(zb1->zb_level, zb2->zb_level)) != 0) 196 return (cmp); 197 if ((cmp = TREE_CMP(zb1->zb_blkid, zb2->zb_blkid)) != 0) 198 return (cmp); 199 200 return (0); 201 } 202 203 static void zfs_ereport_schedule_cleaner(void); 204 205 /* 206 * background task to clean stale recent event nodes. 207 */ 208 /*ARGSUSED*/ 209 static void 210 zfs_ereport_cleaner(void *arg) 211 { 212 recent_events_node_t *entry; 213 uint64_t now = gethrtime(); 214 215 /* 216 * purge expired entries 217 */ 218 mutex_enter(&recent_events_lock); 219 while ((entry = list_tail(&recent_events_list)) != NULL) { 220 uint64_t age = NSEC2SEC(now - entry->re_timestamp); 221 if (age <= zfs_zevent_retain_expire_secs) 222 break; 223 224 /* remove expired node */ 225 avl_remove(&recent_events_tree, entry); 226 list_remove(&recent_events_list, entry); 227 kmem_free(entry, sizeof (*entry)); 228 } 229 230 /* Restart the cleaner if more entries remain */ 231 recent_events_cleaner_tqid = 0; 232 if (!list_is_empty(&recent_events_list)) 233 zfs_ereport_schedule_cleaner(); 234 235 mutex_exit(&recent_events_lock); 236 } 237 238 static void 239 zfs_ereport_schedule_cleaner(void) 240 { 241 ASSERT(MUTEX_HELD(&recent_events_lock)); 242 243 uint64_t timeout = SEC2NSEC(zfs_zevent_retain_expire_secs + 1); 244 245 recent_events_cleaner_tqid = taskq_dispatch_delay( 246 system_delay_taskq, zfs_ereport_cleaner, NULL, TQ_SLEEP, 247 ddi_get_lbolt() + NSEC_TO_TICK(timeout)); 248 } 249 250 /* 251 * Clear entries for a given vdev or all vdevs in a pool when vdev == NULL 252 */ 253 void 254 zfs_ereport_clear(spa_t *spa, vdev_t *vd) 255 { 256 uint64_t vdev_guid, pool_guid; 257 int cnt = 0; 258 259 ASSERT(vd != NULL || spa != NULL); 260 if (vd == NULL) { 261 vdev_guid = 0; 262 pool_guid = spa_guid(spa); 263 } else { 264 vdev_guid = vd->vdev_guid; 265 pool_guid = 0; 266 } 267 268 mutex_enter(&recent_events_lock); 269 270 recent_events_node_t *next = list_head(&recent_events_list); 271 while (next != NULL) { 272 recent_events_node_t *entry = next; 273 274 next = list_next(&recent_events_list, next); 275 276 if (entry->re_vdev_guid == vdev_guid || 277 entry->re_pool_guid == pool_guid) { 278 avl_remove(&recent_events_tree, entry); 279 list_remove(&recent_events_list, entry); 280 kmem_free(entry, sizeof (*entry)); 281 cnt++; 282 } 283 } 284 285 mutex_exit(&recent_events_lock); 286 } 287 288 /* 289 * Check if an ereport would be a duplicate of one recently posted. 290 * 291 * An ereport is considered a duplicate if the set of criteria in 292 * recent_events_node_t all match. 293 * 294 * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM 295 * are candidates for duplicate checking. 296 */ 297 static boolean_t 298 zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd, 299 const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size) 300 { 301 recent_events_node_t search = {0}, *entry; 302 303 if (vd == NULL || zio == NULL) 304 return (B_FALSE); 305 306 if (zfs_zevent_retain_max == 0) 307 return (B_FALSE); 308 309 if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) 310 search.re_subclass = ZSC_IO; 311 else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0) 312 search.re_subclass = ZSC_DATA; 313 else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) 314 search.re_subclass = ZSC_CHECKSUM; 315 else 316 return (B_FALSE); 317 318 search.re_pool_guid = spa_guid(spa); 319 search.re_vdev_guid = vd->vdev_guid; 320 search.re_io_error = zio->io_error; 321 search.re_io_priority = zio->io_priority; 322 /* if size is supplied use it over what's in zio */ 323 if (size) { 324 search.re_io_size = size; 325 search.re_io_offset = offset; 326 } else { 327 search.re_io_size = zio->io_size; 328 search.re_io_offset = zio->io_offset; 329 } 330 331 /* grab optional logical zio criteria */ 332 if (zb != NULL) { 333 search.re_io_bookmark.zb_objset = zb->zb_objset; 334 search.re_io_bookmark.zb_object = zb->zb_object; 335 search.re_io_bookmark.zb_level = zb->zb_level; 336 search.re_io_bookmark.zb_blkid = zb->zb_blkid; 337 } 338 339 uint64_t now = gethrtime(); 340 341 mutex_enter(&recent_events_lock); 342 343 /* check if we have seen this one recently */ 344 entry = avl_find(&recent_events_tree, &search, NULL); 345 if (entry != NULL) { 346 uint64_t age = NSEC2SEC(now - entry->re_timestamp); 347 348 /* 349 * There is still an active cleaner (since we're here). 350 * Reset the last seen time for this duplicate entry 351 * so that its lifespand gets extended. 352 */ 353 list_remove(&recent_events_list, entry); 354 list_insert_head(&recent_events_list, entry); 355 entry->re_timestamp = now; 356 357 zfs_zevent_track_duplicate(); 358 mutex_exit(&recent_events_lock); 359 360 return (age <= zfs_zevent_retain_expire_secs); 361 } 362 363 if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) { 364 /* recycle oldest node */ 365 entry = list_tail(&recent_events_list); 366 ASSERT(entry != NULL); 367 list_remove(&recent_events_list, entry); 368 avl_remove(&recent_events_tree, entry); 369 } else { 370 entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP); 371 } 372 373 /* record this as a recent ereport */ 374 *entry = search; 375 avl_add(&recent_events_tree, entry); 376 list_insert_head(&recent_events_list, entry); 377 entry->re_timestamp = now; 378 379 /* Start a cleaner if not already scheduled */ 380 if (recent_events_cleaner_tqid == 0) 381 zfs_ereport_schedule_cleaner(); 382 383 mutex_exit(&recent_events_lock); 384 return (B_FALSE); 385 } 386 387 void 388 zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector) 389 { 390 if (nvl) 391 fm_nvlist_destroy(nvl, FM_NVA_FREE); 392 393 if (detector) 394 fm_nvlist_destroy(detector, FM_NVA_FREE); 395 } 396 397 /* 398 * We want to rate limit ZIO delay and checksum events so as to not 399 * flood ZED when a disk is acting up. 400 * 401 * Returns 1 if we're ratelimiting, 0 if not. 402 */ 403 static int 404 zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd) 405 { 406 int rc = 0; 407 /* 408 * __ratelimit() returns 1 if we're *not* ratelimiting and 0 if we 409 * are. Invert it to get our return value. 410 */ 411 if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) { 412 rc = !zfs_ratelimit(&vd->vdev_delay_rl); 413 } else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) { 414 rc = !zfs_ratelimit(&vd->vdev_checksum_rl); 415 } 416 417 if (rc) { 418 /* We're rate limiting */ 419 fm_erpt_dropped_increment(); 420 } 421 422 return (rc); 423 } 424 425 /* 426 * Return B_TRUE if the event actually posted, B_FALSE if not. 427 */ 428 static boolean_t 429 zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, 430 const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, 431 zio_t *zio, uint64_t stateoroffset, uint64_t size) 432 { 433 nvlist_t *ereport, *detector; 434 435 uint64_t ena; 436 char class[64]; 437 438 if ((ereport = fm_nvlist_create(NULL)) == NULL) 439 return (B_FALSE); 440 441 if ((detector = fm_nvlist_create(NULL)) == NULL) { 442 fm_nvlist_destroy(ereport, FM_NVA_FREE); 443 return (B_FALSE); 444 } 445 446 /* 447 * Serialize ereport generation 448 */ 449 mutex_enter(&spa->spa_errlist_lock); 450 451 /* 452 * Determine the ENA to use for this event. If we are in a loading 453 * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use 454 * a root zio-wide ENA. Otherwise, simply use a unique ENA. 455 */ 456 if (spa_load_state(spa) != SPA_LOAD_NONE) { 457 if (spa->spa_ena == 0) 458 spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); 459 ena = spa->spa_ena; 460 } else if (zio != NULL && zio->io_logical != NULL) { 461 if (zio->io_logical->io_ena == 0) 462 zio->io_logical->io_ena = 463 fm_ena_generate(0, FM_ENA_FMT1); 464 ena = zio->io_logical->io_ena; 465 } else { 466 ena = fm_ena_generate(0, FM_ENA_FMT1); 467 } 468 469 /* 470 * Construct the full class, detector, and other standard FMA fields. 471 */ 472 (void) snprintf(class, sizeof (class), "%s.%s", 473 ZFS_ERROR_CLASS, subclass); 474 475 fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa), 476 vd != NULL ? vd->vdev_guid : 0); 477 478 fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL); 479 480 /* 481 * Construct the per-ereport payload, depending on which parameters are 482 * passed in. 483 */ 484 485 /* 486 * Generic payload members common to all ereports. 487 */ 488 fm_payload_set(ereport, 489 FM_EREPORT_PAYLOAD_ZFS_POOL, DATA_TYPE_STRING, spa_name(spa), 490 FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa), 491 FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, DATA_TYPE_UINT64, 492 (uint64_t)spa_state(spa), 493 FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, 494 (int32_t)spa_load_state(spa), NULL); 495 496 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, 497 DATA_TYPE_STRING, 498 spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? 499 FM_EREPORT_FAILMODE_WAIT : 500 spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? 501 FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC, 502 NULL); 503 504 if (vd != NULL) { 505 vdev_t *pvd = vd->vdev_parent; 506 vdev_queue_t *vq = &vd->vdev_queue; 507 vdev_stat_t *vs = &vd->vdev_stat; 508 vdev_t *spare_vd; 509 uint64_t *spare_guids; 510 char **spare_paths; 511 int i, spare_count; 512 513 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, 514 DATA_TYPE_UINT64, vd->vdev_guid, 515 FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, 516 DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); 517 if (vd->vdev_path != NULL) 518 fm_payload_set(ereport, 519 FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, 520 DATA_TYPE_STRING, vd->vdev_path, NULL); 521 if (vd->vdev_devid != NULL) 522 fm_payload_set(ereport, 523 FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, 524 DATA_TYPE_STRING, vd->vdev_devid, NULL); 525 if (vd->vdev_fru != NULL) 526 fm_payload_set(ereport, 527 FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, 528 DATA_TYPE_STRING, vd->vdev_fru, NULL); 529 if (vd->vdev_enc_sysfs_path != NULL) 530 fm_payload_set(ereport, 531 FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, 532 DATA_TYPE_STRING, vd->vdev_enc_sysfs_path, NULL); 533 if (vd->vdev_ashift) 534 fm_payload_set(ereport, 535 FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT, 536 DATA_TYPE_UINT64, vd->vdev_ashift, NULL); 537 538 if (vq != NULL) { 539 fm_payload_set(ereport, 540 FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS, 541 DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL); 542 fm_payload_set(ereport, 543 FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS, 544 DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL); 545 } 546 547 if (vs != NULL) { 548 fm_payload_set(ereport, 549 FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS, 550 DATA_TYPE_UINT64, vs->vs_read_errors, 551 FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS, 552 DATA_TYPE_UINT64, vs->vs_write_errors, 553 FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS, 554 DATA_TYPE_UINT64, vs->vs_checksum_errors, 555 FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS, 556 DATA_TYPE_UINT64, vs->vs_slow_ios, 557 NULL); 558 } 559 560 if (pvd != NULL) { 561 fm_payload_set(ereport, 562 FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, 563 DATA_TYPE_UINT64, pvd->vdev_guid, 564 FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, 565 DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type, 566 NULL); 567 if (pvd->vdev_path) 568 fm_payload_set(ereport, 569 FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, 570 DATA_TYPE_STRING, pvd->vdev_path, NULL); 571 if (pvd->vdev_devid) 572 fm_payload_set(ereport, 573 FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, 574 DATA_TYPE_STRING, pvd->vdev_devid, NULL); 575 } 576 577 spare_count = spa->spa_spares.sav_count; 578 spare_paths = kmem_zalloc(sizeof (char *) * spare_count, 579 KM_SLEEP); 580 spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count, 581 KM_SLEEP); 582 583 for (i = 0; i < spare_count; i++) { 584 spare_vd = spa->spa_spares.sav_vdevs[i]; 585 if (spare_vd) { 586 spare_paths[i] = spare_vd->vdev_path; 587 spare_guids[i] = spare_vd->vdev_guid; 588 } 589 } 590 591 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS, 592 DATA_TYPE_STRING_ARRAY, spare_count, spare_paths, 593 FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS, 594 DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL); 595 596 kmem_free(spare_guids, sizeof (uint64_t) * spare_count); 597 kmem_free(spare_paths, sizeof (char *) * spare_count); 598 } 599 600 if (zio != NULL) { 601 /* 602 * Payload common to all I/Os. 603 */ 604 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, 605 DATA_TYPE_INT32, zio->io_error, NULL); 606 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, 607 DATA_TYPE_INT32, zio->io_flags, NULL); 608 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE, 609 DATA_TYPE_UINT32, zio->io_stage, NULL); 610 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE, 611 DATA_TYPE_UINT32, zio->io_pipeline, NULL); 612 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY, 613 DATA_TYPE_UINT64, zio->io_delay, NULL); 614 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP, 615 DATA_TYPE_UINT64, zio->io_timestamp, NULL); 616 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA, 617 DATA_TYPE_UINT64, zio->io_delta, NULL); 618 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, 619 DATA_TYPE_UINT32, zio->io_priority, NULL); 620 621 /* 622 * If the 'size' parameter is non-zero, it indicates this is a 623 * RAID-Z or other I/O where the physical offset and length are 624 * provided for us, instead of within the zio_t. 625 */ 626 if (vd != NULL) { 627 if (size) 628 fm_payload_set(ereport, 629 FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, 630 DATA_TYPE_UINT64, stateoroffset, 631 FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, 632 DATA_TYPE_UINT64, size, NULL); 633 else 634 fm_payload_set(ereport, 635 FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, 636 DATA_TYPE_UINT64, zio->io_offset, 637 FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, 638 DATA_TYPE_UINT64, zio->io_size, NULL); 639 } 640 } else if (vd != NULL) { 641 /* 642 * If we have a vdev but no zio, this is a device fault, and the 643 * 'stateoroffset' parameter indicates the previous state of the 644 * vdev. 645 */ 646 fm_payload_set(ereport, 647 FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, 648 DATA_TYPE_UINT64, stateoroffset, NULL); 649 } 650 651 /* 652 * Payload for I/Os with corresponding logical information. 653 */ 654 if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) { 655 fm_payload_set(ereport, 656 FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, 657 DATA_TYPE_UINT64, zb->zb_objset, 658 FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, 659 DATA_TYPE_UINT64, zb->zb_object, 660 FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, 661 DATA_TYPE_INT64, zb->zb_level, 662 FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, 663 DATA_TYPE_UINT64, zb->zb_blkid, NULL); 664 } 665 666 mutex_exit(&spa->spa_errlist_lock); 667 668 *ereport_out = ereport; 669 *detector_out = detector; 670 return (B_TRUE); 671 } 672 673 /* if it's <= 128 bytes, save the corruption directly */ 674 #define ZFM_MAX_INLINE (128 / sizeof (uint64_t)) 675 676 #define MAX_RANGES 16 677 678 typedef struct zfs_ecksum_info { 679 /* histograms of set and cleared bits by bit number in a 64-bit word */ 680 uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY]; 681 uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY]; 682 683 /* inline arrays of bits set and cleared. */ 684 uint64_t zei_bits_set[ZFM_MAX_INLINE]; 685 uint64_t zei_bits_cleared[ZFM_MAX_INLINE]; 686 687 /* 688 * for each range, the number of bits set and cleared. The Hamming 689 * distance between the good and bad buffers is the sum of them all. 690 */ 691 uint32_t zei_range_sets[MAX_RANGES]; 692 uint32_t zei_range_clears[MAX_RANGES]; 693 694 struct zei_ranges { 695 uint32_t zr_start; 696 uint32_t zr_end; 697 } zei_ranges[MAX_RANGES]; 698 699 size_t zei_range_count; 700 uint32_t zei_mingap; 701 uint32_t zei_allowed_mingap; 702 703 } zfs_ecksum_info_t; 704 705 static void 706 update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count) 707 { 708 size_t i; 709 size_t bits = 0; 710 uint64_t value = BE_64(value_arg); 711 712 /* We store the bits in big-endian (largest-first) order */ 713 for (i = 0; i < 64; i++) { 714 if (value & (1ull << i)) { 715 hist[63 - i]++; 716 ++bits; 717 } 718 } 719 /* update the count of bits changed */ 720 *count += bits; 721 } 722 723 /* 724 * We've now filled up the range array, and need to increase "mingap" and 725 * shrink the range list accordingly. zei_mingap is always the smallest 726 * distance between array entries, so we set the new_allowed_gap to be 727 * one greater than that. We then go through the list, joining together 728 * any ranges which are closer than the new_allowed_gap. 729 * 730 * By construction, there will be at least one. We also update zei_mingap 731 * to the new smallest gap, to prepare for our next invocation. 732 */ 733 static void 734 zei_shrink_ranges(zfs_ecksum_info_t *eip) 735 { 736 uint32_t mingap = UINT32_MAX; 737 uint32_t new_allowed_gap = eip->zei_mingap + 1; 738 739 size_t idx, output; 740 size_t max = eip->zei_range_count; 741 742 struct zei_ranges *r = eip->zei_ranges; 743 744 ASSERT3U(eip->zei_range_count, >, 0); 745 ASSERT3U(eip->zei_range_count, <=, MAX_RANGES); 746 747 output = idx = 0; 748 while (idx < max - 1) { 749 uint32_t start = r[idx].zr_start; 750 uint32_t end = r[idx].zr_end; 751 752 while (idx < max - 1) { 753 idx++; 754 755 uint32_t nstart = r[idx].zr_start; 756 uint32_t nend = r[idx].zr_end; 757 758 uint32_t gap = nstart - end; 759 if (gap < new_allowed_gap) { 760 end = nend; 761 continue; 762 } 763 if (gap < mingap) 764 mingap = gap; 765 break; 766 } 767 r[output].zr_start = start; 768 r[output].zr_end = end; 769 output++; 770 } 771 ASSERT3U(output, <, eip->zei_range_count); 772 eip->zei_range_count = output; 773 eip->zei_mingap = mingap; 774 eip->zei_allowed_mingap = new_allowed_gap; 775 } 776 777 static void 778 zei_add_range(zfs_ecksum_info_t *eip, int start, int end) 779 { 780 struct zei_ranges *r = eip->zei_ranges; 781 size_t count = eip->zei_range_count; 782 783 if (count >= MAX_RANGES) { 784 zei_shrink_ranges(eip); 785 count = eip->zei_range_count; 786 } 787 if (count == 0) { 788 eip->zei_mingap = UINT32_MAX; 789 eip->zei_allowed_mingap = 1; 790 } else { 791 int gap = start - r[count - 1].zr_end; 792 793 if (gap < eip->zei_allowed_mingap) { 794 r[count - 1].zr_end = end; 795 return; 796 } 797 if (gap < eip->zei_mingap) 798 eip->zei_mingap = gap; 799 } 800 r[count].zr_start = start; 801 r[count].zr_end = end; 802 eip->zei_range_count++; 803 } 804 805 static size_t 806 zei_range_total_size(zfs_ecksum_info_t *eip) 807 { 808 struct zei_ranges *r = eip->zei_ranges; 809 size_t count = eip->zei_range_count; 810 size_t result = 0; 811 size_t idx; 812 813 for (idx = 0; idx < count; idx++) 814 result += (r[idx].zr_end - r[idx].zr_start); 815 816 return (result); 817 } 818 819 static zfs_ecksum_info_t * 820 annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, 821 const abd_t *goodabd, const abd_t *badabd, size_t size, 822 boolean_t drop_if_identical) 823 { 824 const uint64_t *good; 825 const uint64_t *bad; 826 827 uint64_t allset = 0; 828 uint64_t allcleared = 0; 829 830 size_t nui64s = size / sizeof (uint64_t); 831 832 size_t inline_size; 833 int no_inline = 0; 834 size_t idx; 835 size_t range; 836 837 size_t offset = 0; 838 ssize_t start = -1; 839 840 zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP); 841 842 /* don't do any annotation for injected checksum errors */ 843 if (info != NULL && info->zbc_injected) 844 return (eip); 845 846 if (info != NULL && info->zbc_has_cksum) { 847 fm_payload_set(ereport, 848 FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED, 849 DATA_TYPE_UINT64_ARRAY, 850 sizeof (info->zbc_expected) / sizeof (uint64_t), 851 (uint64_t *)&info->zbc_expected, 852 FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL, 853 DATA_TYPE_UINT64_ARRAY, 854 sizeof (info->zbc_actual) / sizeof (uint64_t), 855 (uint64_t *)&info->zbc_actual, 856 FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO, 857 DATA_TYPE_STRING, 858 info->zbc_checksum_name, 859 NULL); 860 861 if (info->zbc_byteswapped) { 862 fm_payload_set(ereport, 863 FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP, 864 DATA_TYPE_BOOLEAN, 1, 865 NULL); 866 } 867 } 868 869 if (badabd == NULL || goodabd == NULL) 870 return (eip); 871 872 ASSERT3U(nui64s, <=, UINT32_MAX); 873 ASSERT3U(size, ==, nui64s * sizeof (uint64_t)); 874 ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 875 ASSERT3U(size, <=, UINT32_MAX); 876 877 good = (const uint64_t *) abd_borrow_buf_copy((abd_t *)goodabd, size); 878 bad = (const uint64_t *) abd_borrow_buf_copy((abd_t *)badabd, size); 879 880 /* build up the range list by comparing the two buffers. */ 881 for (idx = 0; idx < nui64s; idx++) { 882 if (good[idx] == bad[idx]) { 883 if (start == -1) 884 continue; 885 886 zei_add_range(eip, start, idx); 887 start = -1; 888 } else { 889 if (start != -1) 890 continue; 891 892 start = idx; 893 } 894 } 895 if (start != -1) 896 zei_add_range(eip, start, idx); 897 898 /* See if it will fit in our inline buffers */ 899 inline_size = zei_range_total_size(eip); 900 if (inline_size > ZFM_MAX_INLINE) 901 no_inline = 1; 902 903 /* 904 * If there is no change and we want to drop if the buffers are 905 * identical, do so. 906 */ 907 if (inline_size == 0 && drop_if_identical) { 908 kmem_free(eip, sizeof (*eip)); 909 abd_return_buf((abd_t *)goodabd, (void *)good, size); 910 abd_return_buf((abd_t *)badabd, (void *)bad, size); 911 return (NULL); 912 } 913 914 /* 915 * Now walk through the ranges, filling in the details of the 916 * differences. Also convert our uint64_t-array offsets to byte 917 * offsets. 918 */ 919 for (range = 0; range < eip->zei_range_count; range++) { 920 size_t start = eip->zei_ranges[range].zr_start; 921 size_t end = eip->zei_ranges[range].zr_end; 922 923 for (idx = start; idx < end; idx++) { 924 uint64_t set, cleared; 925 926 // bits set in bad, but not in good 927 set = ((~good[idx]) & bad[idx]); 928 // bits set in good, but not in bad 929 cleared = (good[idx] & (~bad[idx])); 930 931 allset |= set; 932 allcleared |= cleared; 933 934 if (!no_inline) { 935 ASSERT3U(offset, <, inline_size); 936 eip->zei_bits_set[offset] = set; 937 eip->zei_bits_cleared[offset] = cleared; 938 offset++; 939 } 940 941 update_histogram(set, eip->zei_histogram_set, 942 &eip->zei_range_sets[range]); 943 update_histogram(cleared, eip->zei_histogram_cleared, 944 &eip->zei_range_clears[range]); 945 } 946 947 /* convert to byte offsets */ 948 eip->zei_ranges[range].zr_start *= sizeof (uint64_t); 949 eip->zei_ranges[range].zr_end *= sizeof (uint64_t); 950 } 951 952 abd_return_buf((abd_t *)goodabd, (void *)good, size); 953 abd_return_buf((abd_t *)badabd, (void *)bad, size); 954 955 eip->zei_allowed_mingap *= sizeof (uint64_t); 956 inline_size *= sizeof (uint64_t); 957 958 /* fill in ereport */ 959 fm_payload_set(ereport, 960 FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES, 961 DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count, 962 (uint32_t *)eip->zei_ranges, 963 FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP, 964 DATA_TYPE_UINT32, eip->zei_allowed_mingap, 965 FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS, 966 DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets, 967 FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS, 968 DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears, 969 NULL); 970 971 if (!no_inline) { 972 fm_payload_set(ereport, 973 FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS, 974 DATA_TYPE_UINT8_ARRAY, 975 inline_size, (uint8_t *)eip->zei_bits_set, 976 FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS, 977 DATA_TYPE_UINT8_ARRAY, 978 inline_size, (uint8_t *)eip->zei_bits_cleared, 979 NULL); 980 } else { 981 fm_payload_set(ereport, 982 FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM, 983 DATA_TYPE_UINT32_ARRAY, 984 NBBY * sizeof (uint64_t), eip->zei_histogram_set, 985 FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM, 986 DATA_TYPE_UINT32_ARRAY, 987 NBBY * sizeof (uint64_t), eip->zei_histogram_cleared, 988 NULL); 989 } 990 return (eip); 991 } 992 #else 993 /*ARGSUSED*/ 994 void 995 zfs_ereport_clear(spa_t *spa, vdev_t *vd) 996 { 997 } 998 #endif 999 1000 /* 1001 * Make sure our event is still valid for the given zio/vdev/pool. For example, 1002 * we don't want to keep logging events for a faulted or missing vdev. 1003 */ 1004 boolean_t 1005 zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) 1006 { 1007 #ifdef _KERNEL 1008 /* 1009 * If we are doing a spa_tryimport() or in recovery mode, 1010 * ignore errors. 1011 */ 1012 if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || 1013 spa_load_state(spa) == SPA_LOAD_RECOVER) 1014 return (B_FALSE); 1015 1016 /* 1017 * If we are in the middle of opening a pool, and the previous attempt 1018 * failed, don't bother logging any new ereports - we're just going to 1019 * get the same diagnosis anyway. 1020 */ 1021 if (spa_load_state(spa) != SPA_LOAD_NONE && 1022 spa->spa_last_open_failed) 1023 return (B_FALSE); 1024 1025 if (zio != NULL) { 1026 /* 1027 * If this is not a read or write zio, ignore the error. This 1028 * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. 1029 */ 1030 if (zio->io_type != ZIO_TYPE_READ && 1031 zio->io_type != ZIO_TYPE_WRITE) 1032 return (B_FALSE); 1033 1034 if (vd != NULL) { 1035 /* 1036 * If the vdev has already been marked as failing due 1037 * to a failed probe, then ignore any subsequent I/O 1038 * errors, as the DE will automatically fault the vdev 1039 * on the first such failure. This also catches cases 1040 * where vdev_remove_wanted is set and the device has 1041 * not yet been asynchronously placed into the REMOVED 1042 * state. 1043 */ 1044 if (zio->io_vd == vd && !vdev_accessible(vd, zio)) 1045 return (B_FALSE); 1046 1047 /* 1048 * Ignore checksum errors for reads from DTL regions of 1049 * leaf vdevs. 1050 */ 1051 if (zio->io_type == ZIO_TYPE_READ && 1052 zio->io_error == ECKSUM && 1053 vd->vdev_ops->vdev_op_leaf && 1054 vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) 1055 return (B_FALSE); 1056 } 1057 } 1058 1059 /* 1060 * For probe failure, we want to avoid posting ereports if we've 1061 * already removed the device in the meantime. 1062 */ 1063 if (vd != NULL && 1064 strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && 1065 (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) 1066 return (B_FALSE); 1067 1068 /* Ignore bogus delay events (like from ioctls or unqueued IOs) */ 1069 if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) && 1070 (zio != NULL) && (!zio->io_timestamp)) { 1071 return (B_FALSE); 1072 } 1073 #endif 1074 return (B_TRUE); 1075 } 1076 1077 /* 1078 * Post an ereport for the given subclass 1079 * 1080 * Returns 1081 * - 0 if an event was posted 1082 * - EINVAL if there was a problem posting event 1083 * - EBUSY if the event was rate limited 1084 * - EALREADY if the event was already posted (duplicate) 1085 */ 1086 int 1087 zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, 1088 const zbookmark_phys_t *zb, zio_t *zio, uint64_t state) 1089 { 1090 int rc = 0; 1091 #ifdef _KERNEL 1092 nvlist_t *ereport = NULL; 1093 nvlist_t *detector = NULL; 1094 1095 if (!zfs_ereport_is_valid(subclass, spa, vd, zio)) 1096 return (EINVAL); 1097 1098 if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0)) 1099 return (SET_ERROR(EALREADY)); 1100 1101 if (zfs_is_ratelimiting_event(subclass, vd)) 1102 return (SET_ERROR(EBUSY)); 1103 1104 if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd, 1105 zb, zio, state, 0)) 1106 return (SET_ERROR(EINVAL)); /* couldn't post event */ 1107 1108 if (ereport == NULL) 1109 return (SET_ERROR(EINVAL)); 1110 1111 /* Cleanup is handled by the callback function */ 1112 rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); 1113 #endif 1114 return (rc); 1115 } 1116 1117 /* 1118 * Prepare a checksum ereport 1119 * 1120 * Returns 1121 * - 0 if an event was posted 1122 * - EINVAL if there was a problem posting event 1123 * - EBUSY if the event was rate limited 1124 * - EALREADY if the event was already posted (duplicate) 1125 */ 1126 int 1127 zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, 1128 struct zio *zio, uint64_t offset, uint64_t length, void *arg, 1129 zio_bad_cksum_t *info) 1130 { 1131 zio_cksum_report_t *report; 1132 1133 #ifdef _KERNEL 1134 if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) 1135 return (SET_ERROR(EINVAL)); 1136 1137 if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, 1138 offset, length)) 1139 return (SET_ERROR(EALREADY)); 1140 1141 if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) 1142 return (SET_ERROR(EBUSY)); 1143 #endif 1144 1145 report = kmem_zalloc(sizeof (*report), KM_SLEEP); 1146 1147 if (zio->io_vsd != NULL) 1148 zio->io_vsd_ops->vsd_cksum_report(zio, report, arg); 1149 else 1150 zio_vsd_default_cksum_report(zio, report, arg); 1151 1152 /* copy the checksum failure information if it was provided */ 1153 if (info != NULL) { 1154 report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP); 1155 bcopy(info, report->zcr_ckinfo, sizeof (*info)); 1156 } 1157 1158 report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift; 1159 report->zcr_align = 1160 vdev_psize_to_asize(vd->vdev_top, report->zcr_sector); 1161 report->zcr_length = length; 1162 1163 #ifdef _KERNEL 1164 (void) zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, 1165 FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length); 1166 1167 if (report->zcr_ereport == NULL) { 1168 zfs_ereport_free_checksum(report); 1169 return (0); 1170 } 1171 #endif 1172 1173 mutex_enter(&spa->spa_errlist_lock); 1174 report->zcr_next = zio->io_logical->io_cksum_report; 1175 zio->io_logical->io_cksum_report = report; 1176 mutex_exit(&spa->spa_errlist_lock); 1177 return (0); 1178 } 1179 1180 void 1181 zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data, 1182 const abd_t *bad_data, boolean_t drop_if_identical) 1183 { 1184 #ifdef _KERNEL 1185 zfs_ecksum_info_t *info; 1186 1187 info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo, 1188 good_data, bad_data, report->zcr_length, drop_if_identical); 1189 if (info != NULL) 1190 zfs_zevent_post(report->zcr_ereport, 1191 report->zcr_detector, zfs_zevent_post_cb); 1192 else 1193 zfs_zevent_post_cb(report->zcr_ereport, report->zcr_detector); 1194 1195 report->zcr_ereport = report->zcr_detector = NULL; 1196 if (info != NULL) 1197 kmem_free(info, sizeof (*info)); 1198 #endif 1199 } 1200 1201 void 1202 zfs_ereport_free_checksum(zio_cksum_report_t *rpt) 1203 { 1204 #ifdef _KERNEL 1205 if (rpt->zcr_ereport != NULL) { 1206 fm_nvlist_destroy(rpt->zcr_ereport, 1207 FM_NVA_FREE); 1208 fm_nvlist_destroy(rpt->zcr_detector, 1209 FM_NVA_FREE); 1210 } 1211 #endif 1212 rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo); 1213 1214 if (rpt->zcr_ckinfo != NULL) 1215 kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo)); 1216 1217 kmem_free(rpt, sizeof (*rpt)); 1218 } 1219 1220 /* 1221 * Post a checksum ereport 1222 * 1223 * Returns 1224 * - 0 if an event was posted 1225 * - EINVAL if there was a problem posting event 1226 * - EBUSY if the event was rate limited 1227 * - EALREADY if the event was already posted (duplicate) 1228 */ 1229 int 1230 zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, 1231 struct zio *zio, uint64_t offset, uint64_t length, 1232 const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc) 1233 { 1234 int rc = 0; 1235 #ifdef _KERNEL 1236 nvlist_t *ereport = NULL; 1237 nvlist_t *detector = NULL; 1238 zfs_ecksum_info_t *info; 1239 1240 if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) 1241 return (SET_ERROR(EINVAL)); 1242 1243 if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, 1244 offset, length)) 1245 return (SET_ERROR(EALREADY)); 1246 1247 if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) 1248 return (SET_ERROR(EBUSY)); 1249 1250 if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, 1251 spa, vd, zb, zio, offset, length) || (ereport == NULL)) { 1252 return (SET_ERROR(EINVAL)); 1253 } 1254 1255 info = annotate_ecksum(ereport, zbc, good_data, bad_data, length, 1256 B_FALSE); 1257 1258 if (info != NULL) { 1259 rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); 1260 kmem_free(info, sizeof (*info)); 1261 } 1262 #endif 1263 return (rc); 1264 } 1265 1266 /* 1267 * The 'sysevent.fs.zfs.*' events are signals posted to notify user space of 1268 * change in the pool. All sysevents are listed in sys/sysevent/eventdefs.h 1269 * and are designed to be consumed by the ZFS Event Daemon (ZED). For 1270 * additional details refer to the zed(8) man page. 1271 */ 1272 nvlist_t * 1273 zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name, 1274 nvlist_t *aux) 1275 { 1276 nvlist_t *resource = NULL; 1277 #ifdef _KERNEL 1278 char class[64]; 1279 1280 if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) 1281 return (NULL); 1282 1283 if ((resource = fm_nvlist_create(NULL)) == NULL) 1284 return (NULL); 1285 1286 (void) snprintf(class, sizeof (class), "%s.%s.%s", type, 1287 ZFS_ERROR_CLASS, name); 1288 VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION)); 1289 VERIFY0(nvlist_add_string(resource, FM_CLASS, class)); 1290 VERIFY0(nvlist_add_string(resource, 1291 FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa))); 1292 VERIFY0(nvlist_add_uint64(resource, 1293 FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa))); 1294 VERIFY0(nvlist_add_uint64(resource, 1295 FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, spa_state(spa))); 1296 VERIFY0(nvlist_add_int32(resource, 1297 FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa))); 1298 1299 if (vd) { 1300 VERIFY0(nvlist_add_uint64(resource, 1301 FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid)); 1302 VERIFY0(nvlist_add_uint64(resource, 1303 FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state)); 1304 if (vd->vdev_path != NULL) 1305 VERIFY0(nvlist_add_string(resource, 1306 FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path)); 1307 if (vd->vdev_devid != NULL) 1308 VERIFY0(nvlist_add_string(resource, 1309 FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid)); 1310 if (vd->vdev_fru != NULL) 1311 VERIFY0(nvlist_add_string(resource, 1312 FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru)); 1313 if (vd->vdev_enc_sysfs_path != NULL) 1314 VERIFY0(nvlist_add_string(resource, 1315 FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, 1316 vd->vdev_enc_sysfs_path)); 1317 } 1318 1319 /* also copy any optional payload data */ 1320 if (aux) { 1321 nvpair_t *elem = NULL; 1322 1323 while ((elem = nvlist_next_nvpair(aux, elem)) != NULL) 1324 (void) nvlist_add_nvpair(resource, elem); 1325 } 1326 1327 #endif 1328 return (resource); 1329 } 1330 1331 static void 1332 zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name, 1333 nvlist_t *aux) 1334 { 1335 #ifdef _KERNEL 1336 nvlist_t *resource; 1337 1338 resource = zfs_event_create(spa, vd, type, name, aux); 1339 if (resource) 1340 zfs_zevent_post(resource, NULL, zfs_zevent_post_cb); 1341 #endif 1342 } 1343 1344 /* 1345 * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev 1346 * has been removed from the system. This will cause the DE to ignore any 1347 * recent I/O errors, inferring that they are due to the asynchronous device 1348 * removal. 1349 */ 1350 void 1351 zfs_post_remove(spa_t *spa, vdev_t *vd) 1352 { 1353 zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, NULL); 1354 } 1355 1356 /* 1357 * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool 1358 * has the 'autoreplace' property set, and therefore any broken vdevs will be 1359 * handled by higher level logic, and no vdev fault should be generated. 1360 */ 1361 void 1362 zfs_post_autoreplace(spa_t *spa, vdev_t *vd) 1363 { 1364 zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_AUTOREPLACE, NULL); 1365 } 1366 1367 /* 1368 * The 'resource.fs.zfs.statechange' event is an internal signal that the 1369 * given vdev has transitioned its state to DEGRADED or HEALTHY. This will 1370 * cause the retire agent to repair any outstanding fault management cases 1371 * open because the device was not found (fault.fs.zfs.device). 1372 */ 1373 void 1374 zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate) 1375 { 1376 #ifdef _KERNEL 1377 nvlist_t *aux; 1378 1379 /* 1380 * Add optional supplemental keys to payload 1381 */ 1382 aux = fm_nvlist_create(NULL); 1383 if (vd && aux) { 1384 if (vd->vdev_physpath) { 1385 (void) nvlist_add_string(aux, 1386 FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH, 1387 vd->vdev_physpath); 1388 } 1389 if (vd->vdev_enc_sysfs_path) { 1390 (void) nvlist_add_string(aux, 1391 FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, 1392 vd->vdev_enc_sysfs_path); 1393 } 1394 1395 (void) nvlist_add_uint64(aux, 1396 FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate); 1397 } 1398 1399 zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_STATECHANGE, 1400 aux); 1401 1402 if (aux) 1403 fm_nvlist_destroy(aux, FM_NVA_FREE); 1404 #endif 1405 } 1406 1407 #ifdef _KERNEL 1408 void 1409 zfs_ereport_init(void) 1410 { 1411 mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL); 1412 list_create(&recent_events_list, sizeof (recent_events_node_t), 1413 offsetof(recent_events_node_t, re_list_link)); 1414 avl_create(&recent_events_tree, recent_events_compare, 1415 sizeof (recent_events_node_t), offsetof(recent_events_node_t, 1416 re_tree_link)); 1417 } 1418 1419 /* 1420 * This 'early' fini needs to run before zfs_fini() which on Linux waits 1421 * for the system_delay_taskq to drain. 1422 */ 1423 void 1424 zfs_ereport_taskq_fini(void) 1425 { 1426 mutex_enter(&recent_events_lock); 1427 if (recent_events_cleaner_tqid != 0) { 1428 taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid); 1429 recent_events_cleaner_tqid = 0; 1430 } 1431 mutex_exit(&recent_events_lock); 1432 } 1433 1434 void 1435 zfs_ereport_fini(void) 1436 { 1437 recent_events_node_t *entry; 1438 1439 while ((entry = list_head(&recent_events_list)) != NULL) { 1440 avl_remove(&recent_events_tree, entry); 1441 list_remove(&recent_events_list, entry); 1442 kmem_free(entry, sizeof (*entry)); 1443 } 1444 avl_destroy(&recent_events_tree); 1445 list_destroy(&recent_events_list); 1446 mutex_destroy(&recent_events_lock); 1447 } 1448 1449 EXPORT_SYMBOL(zfs_ereport_post); 1450 EXPORT_SYMBOL(zfs_ereport_is_valid); 1451 EXPORT_SYMBOL(zfs_ereport_post_checksum); 1452 EXPORT_SYMBOL(zfs_post_remove); 1453 EXPORT_SYMBOL(zfs_post_autoreplace); 1454 EXPORT_SYMBOL(zfs_post_state_change); 1455 1456 ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW, 1457 "Maximum recent zevents records to retain for duplicate checking"); 1458 ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW, 1459 "Expiration time for recent zevents records"); 1460 #endif /* _KERNEL */ 1461