/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>

#include <sys/fm/fs/zfs.h>
#include <sys/fm/protocol.h>
#include <sys/fm/util.h>
#include <sys/sysevent.h>

/*
 * This general routine is responsible for generating all the different ZFS
 * ereports.  The payload is dependent on the class, and which arguments are
 * supplied to the function:
 *
 *      EREPORT                 POOL    VDEV    IO
 *      block                   X       X       X
 *      data                    X               X
 *      device                  X       X
 *      pool                    X
 *
 * If we are in a loading state, all errors are chained together by the same
 * SPA-wide ENA (Error Numeric Association).
 *
 * For isolated I/O requests, we get the ENA from the zio_t.  The propagation
 * gets very complicated due to RAID-Z, gang blocks, and vdev caching.  We want
 * to chain together all ereports associated with a logical piece of data.  For
 * read I/Os, there are basically three 'types' of I/O, which form a roughly
 * layered diagram:
 *
 *      +---------------+
 *      | Aggregate I/O |      No associated logical data or device
 *      +---------------+
 *              |
 *              V
 *      +---------------+      Reads associated with a piece of logical data.
 *      |   Read I/O    |      This includes reads on behalf of RAID-Z,
 *      +---------------+      mirrors, gang blocks, retries, etc.
 *              |
 *              V
 *      +---------------+      Reads associated with a particular device, but
 *      | Physical I/O  |      no logical data.  Issued as part of vdev caching
 *      +---------------+      and I/O aggregation.
 *
 * Note that 'physical I/O' here is not the same terminology as used in the rest
 * of ZIO.  Typically, 'physical I/O' simply means that there is no attached
 * blockpointer.  But I/O with no associated block pointer can still be related
 * to a logical piece of data (i.e. RAID-Z requests).
 *
 * Purely physical I/Os always have unique ENAs.  They are not related to a
 * particular piece of logical data, and therefore cannot be chained together.
 * We still generate an ereport, but the DE doesn't correlate it with any
 * logical piece of data.  When such an I/O fails, the delegated I/O requests
 * will issue a retry, which will trigger the 'real' ereport with the correct
 * ENA.
 *
 * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
 * When a new logical I/O is issued, we set this to point to itself.  Child I/Os
 * then inherit this pointer, so that when it is first set, subsequent failures
 * will use the same ENA.  For vdev cache fill and queue aggregation I/O,
 * this pointer is set to NULL, and no ereport will be generated (since it
 * doesn't actually correspond to any particular device or piece of data,
 * and the caller will always retry without caching or queueing anyway).
 *
 * For checksum errors, we want to include more information about the actual
 * error which occurs.  Accordingly, we build an ereport when the error is
 * noticed, but instead of sending it in immediately, we hang it off of the
 * io_cksum_report field of the logical IO.  When the logical IO completes
 * (successfully or not), zfs_ereport_finish_checksum() is called with the
 * good and bad versions of the buffer (if available), and we annotate the
 * ereport with information about the differences.
 */

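/*
 * Illustrative sketch (not part of the original comment above): a
 * hypothetical caller that detects a checksum mismatch and later obtains
 * both the expected and the on-disk buffer would drive the two-phase
 * interface defined later in this file roughly as follows.  The names
 * spa, vd, zb, zio, offset, length, zbc, good_abd and bad_abd stand in
 * for the caller's own context; the report itself is hung off
 * zio->io_logical->io_cksum_report by zfs_ereport_start_checksum().
 *
 *	zfs_ereport_start_checksum(spa, vd, zb, zio, offset, length,
 *	    NULL, &zbc);
 *	...
 *	report = zio->io_logical->io_cksum_report;
 *	zfs_ereport_finish_checksum(report, good_abd, bad_abd, B_TRUE);
 *
 * When drop_if_identical is B_TRUE and the two buffers do not differ,
 * the ereport is dropped instead of being posted.
 */
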
#ifdef _KERNEL
void
zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector)
{
	if (nvl)
		fm_nvlist_destroy(nvl, FM_NVA_FREE);

	if (detector)
		fm_nvlist_destroy(detector, FM_NVA_FREE);
}

/*
 * We want to rate limit ZIO delay and checksum events so as to not
 * flood ZED when a disk is acting up.
 *
 * Returns 1 if we're ratelimiting, 0 if not.
 */
static int
zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd)
{
	int rc = 0;
	/*
	 * __ratelimit() returns 1 if we're *not* ratelimiting and 0 if we
	 * are.  Invert it to get our return value.
	 */
	if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
		rc = !zfs_ratelimit(&vd->vdev_delay_rl);
	} else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) {
		rc = !zfs_ratelimit(&vd->vdev_checksum_rl);
	}

	if (rc) {
		/* We're rate limiting */
		fm_erpt_dropped_increment();
	}

	return (rc);
}

/*
 * Return B_TRUE if the event actually posted, B_FALSE if not.
 */
static boolean_t
zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
    const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
    zio_t *zio, uint64_t stateoroffset, uint64_t size)
{
	nvlist_t *ereport, *detector;

	uint64_t ena;
	char class[64];

	if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
		return (B_FALSE);

	if ((ereport = fm_nvlist_create(NULL)) == NULL)
		return (B_FALSE);

	if ((detector = fm_nvlist_create(NULL)) == NULL) {
		fm_nvlist_destroy(ereport, FM_NVA_FREE);
		return (B_FALSE);
	}

	/*
	 * Serialize ereport generation
	 */
	mutex_enter(&spa->spa_errlist_lock);

	/*
	 * Determine the ENA to use for this event.  If we are in a loading
	 * state, use a SPA-wide ENA.  Otherwise, if we are in an I/O state,
	 * use a root zio-wide ENA.  Otherwise, simply use a unique ENA.
	 */
	if (spa_load_state(spa) != SPA_LOAD_NONE) {
		if (spa->spa_ena == 0)
			spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
		ena = spa->spa_ena;
	} else if (zio != NULL && zio->io_logical != NULL) {
		if (zio->io_logical->io_ena == 0)
			zio->io_logical->io_ena =
			    fm_ena_generate(0, FM_ENA_FMT1);
		ena = zio->io_logical->io_ena;
	} else {
		ena = fm_ena_generate(0, FM_ENA_FMT1);
	}

	/*
	 * Construct the full class, detector, and other standard FMA fields.
	 */
	(void) snprintf(class, sizeof (class), "%s.%s",
	    ZFS_ERROR_CLASS, subclass);

	fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
	    vd != NULL ? vd->vdev_guid : 0);

	fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);

	/*
	 * Construct the per-ereport payload, depending on which parameters are
	 * passed in.
	 */

	/*
	 * Generic payload members common to all ereports.
	 */
	fm_payload_set(ereport,
	    FM_EREPORT_PAYLOAD_ZFS_POOL, DATA_TYPE_STRING, spa_name(spa),
	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa),
	    FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, DATA_TYPE_UINT64,
	    (uint64_t)spa_state(spa),
	    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
	    (int32_t)spa_load_state(spa), NULL);

	fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
	    DATA_TYPE_STRING,
	    spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
	    FM_EREPORT_FAILMODE_WAIT :
	    spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
	    FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC,
	    NULL);

	if (vd != NULL) {
		vdev_t *pvd = vd->vdev_parent;
		vdev_queue_t *vq = &vd->vdev_queue;
		vdev_stat_t *vs = &vd->vdev_stat;
		vdev_t *spare_vd;
		uint64_t *spare_guids;
		char **spare_paths;
		int i, spare_count;

		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
		    DATA_TYPE_UINT64, vd->vdev_guid,
		    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
		    DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
		if (vd->vdev_path != NULL)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
			    DATA_TYPE_STRING, vd->vdev_path, NULL);
		if (vd->vdev_devid != NULL)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
			    DATA_TYPE_STRING, vd->vdev_devid, NULL);
		if (vd->vdev_fru != NULL)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
			    DATA_TYPE_STRING, vd->vdev_fru, NULL);
		if (vd->vdev_enc_sysfs_path != NULL)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
			    DATA_TYPE_STRING, vd->vdev_enc_sysfs_path, NULL);
		if (vd->vdev_ashift)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT,
			    DATA_TYPE_UINT64, vd->vdev_ashift, NULL);

		if (vq != NULL) {
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS,
			    DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL);
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS,
			    DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL);
		}

		if (vs != NULL) {
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS,
			    DATA_TYPE_UINT64, vs->vs_read_errors,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS,
			    DATA_TYPE_UINT64, vs->vs_write_errors,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS,
			    DATA_TYPE_UINT64, vs->vs_checksum_errors,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS,
			    DATA_TYPE_UINT64, vs->vs_slow_ios,
			    NULL);
		}

		if (pvd != NULL) {
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
			    DATA_TYPE_UINT64, pvd->vdev_guid,
			    FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
			    DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
			    NULL);
			if (pvd->vdev_path)
				fm_payload_set(ereport,
				    FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
				    DATA_TYPE_STRING, pvd->vdev_path, NULL);
			if (pvd->vdev_devid)
				fm_payload_set(ereport,
				    FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
				    DATA_TYPE_STRING, pvd->vdev_devid, NULL);
		}

		spare_count = spa->spa_spares.sav_count;
		spare_paths = kmem_zalloc(sizeof (char *) * spare_count,
		    KM_SLEEP);
		spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count,
		    KM_SLEEP);

		for (i = 0; i < spare_count; i++) {
			spare_vd = spa->spa_spares.sav_vdevs[i];
			if (spare_vd) {
				spare_paths[i] = spare_vd->vdev_path;
				spare_guids[i] = spare_vd->vdev_guid;
			}
		}

		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS,
		    DATA_TYPE_STRING_ARRAY, spare_count, spare_paths,
		    FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS,
		    DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL);

		kmem_free(spare_guids, sizeof (uint64_t) * spare_count);
		kmem_free(spare_paths, sizeof (char *) * spare_count);
	}

	if (zio != NULL) {
		/*
		 * Payload common to all I/Os.
		 */
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
		    DATA_TYPE_INT32, zio->io_error, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS,
		    DATA_TYPE_INT32, zio->io_flags, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE,
		    DATA_TYPE_UINT32, zio->io_stage, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE,
		    DATA_TYPE_UINT32, zio->io_pipeline, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY,
		    DATA_TYPE_UINT64, zio->io_delay, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP,
		    DATA_TYPE_UINT64, zio->io_timestamp, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA,
		    DATA_TYPE_UINT64, zio->io_delta, NULL);

		/*
		 * If the 'size' parameter is non-zero, it indicates this is a
		 * RAID-Z or other I/O where the physical offset and length are
		 * provided for us, instead of within the zio_t.
		 */
		if (vd != NULL) {
			if (size)
				fm_payload_set(ereport,
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
				    DATA_TYPE_UINT64, stateoroffset,
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
				    DATA_TYPE_UINT64, size, NULL);
			else
				fm_payload_set(ereport,
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
				    DATA_TYPE_UINT64, zio->io_offset,
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
				    DATA_TYPE_UINT64, zio->io_size, NULL);
		}
	} else if (vd != NULL) {
		/*
		 * If we have a vdev but no zio, this is a device fault, and the
		 * 'stateoroffset' parameter indicates the previous state of the
		 * vdev.
		 */
		fm_payload_set(ereport,
		    FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
		    DATA_TYPE_UINT64, stateoroffset, NULL);
	}

	/*
	 * Payload for I/Os with corresponding logical information.
	 */
	if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) {
		fm_payload_set(ereport,
		    FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
		    DATA_TYPE_UINT64, zb->zb_objset,
		    FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
		    DATA_TYPE_UINT64, zb->zb_object,
		    FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
		    DATA_TYPE_INT64, zb->zb_level,
		    FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
		    DATA_TYPE_UINT64, zb->zb_blkid, NULL);
	}

	mutex_exit(&spa->spa_errlist_lock);

	*ereport_out = ereport;
	*detector_out = detector;
	return (B_TRUE);
}

/* if it's <= 128 bytes, save the corruption directly */
#define	ZFM_MAX_INLINE		(128 / sizeof (uint64_t))

#define	MAX_RANGES		16

typedef struct zfs_ecksum_info {
	/* histograms of set and cleared bits by bit number in a 64-bit word */
	uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY];
	uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY];

	/* inline arrays of bits set and cleared. */
	uint64_t zei_bits_set[ZFM_MAX_INLINE];
	uint64_t zei_bits_cleared[ZFM_MAX_INLINE];

	/*
	 * for each range, the number of bits set and cleared.  The Hamming
	 * distance between the good and bad buffers is the sum of them all.
	 */
	uint32_t zei_range_sets[MAX_RANGES];
	uint32_t zei_range_clears[MAX_RANGES];

	struct zei_ranges {
		uint32_t zr_start;
		uint32_t zr_end;
	} zei_ranges[MAX_RANGES];

	size_t zei_range_count;
	uint32_t zei_mingap;
	uint32_t zei_allowed_mingap;

} zfs_ecksum_info_t;

static void
update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count)
{
	size_t i;
	size_t bits = 0;
	uint64_t value = BE_64(value_arg);

	/* We store the bits in big-endian (largest-first) order */
	for (i = 0; i < 64; i++) {
		if (value & (1ull << i)) {
			hist[63 - i]++;
			++bits;
		}
	}
	/* update the count of bits changed */
	*count += bits;
}

/*
 * We've now filled up the range array, and need to increase "mingap" and
 * shrink the range list accordingly.  zei_mingap is always the smallest
 * distance between array entries, so we set the new_allowed_gap to be
 * one greater than that.  We then go through the list, joining together
 * any ranges which are closer than the new_allowed_gap.
 *
 * By construction, there will be at least one.  We also update zei_mingap
 * to the new smallest gap, to prepare for our next invocation.
 */
static void
zei_shrink_ranges(zfs_ecksum_info_t *eip)
{
	uint32_t mingap = UINT32_MAX;
	uint32_t new_allowed_gap = eip->zei_mingap + 1;

	size_t idx, output;
	size_t max = eip->zei_range_count;

	struct zei_ranges *r = eip->zei_ranges;

	ASSERT3U(eip->zei_range_count, >, 0);
	ASSERT3U(eip->zei_range_count, <=, MAX_RANGES);

	output = idx = 0;
	while (idx < max - 1) {
		uint32_t start = r[idx].zr_start;
		uint32_t end = r[idx].zr_end;

		while (idx < max - 1) {
			idx++;

			uint32_t nstart = r[idx].zr_start;
			uint32_t nend = r[idx].zr_end;

			uint32_t gap = nstart - end;
			if (gap < new_allowed_gap) {
				end = nend;
				continue;
			}
			if (gap < mingap)
				mingap = gap;
			break;
		}
		r[output].zr_start = start;
		r[output].zr_end = end;
		output++;
	}
	ASSERT3U(output, <, eip->zei_range_count);
	eip->zei_range_count = output;
	eip->zei_mingap = mingap;
	eip->zei_allowed_mingap = new_allowed_gap;
}

static void
zei_add_range(zfs_ecksum_info_t *eip, int start, int end)
{
	struct zei_ranges *r = eip->zei_ranges;
	size_t count = eip->zei_range_count;

	if (count >= MAX_RANGES) {
		zei_shrink_ranges(eip);
		count = eip->zei_range_count;
	}
	if (count == 0) {
		eip->zei_mingap = UINT32_MAX;
		eip->zei_allowed_mingap = 1;
	} else {
		int gap = start - r[count - 1].zr_end;

		if (gap < eip->zei_allowed_mingap) {
			r[count - 1].zr_end = end;
			return;
		}
		if (gap < eip->zei_mingap)
			eip->zei_mingap = gap;
	}
	r[count].zr_start = start;
	r[count].zr_end = end;
	eip->zei_range_count++;
}

static size_t
zei_range_total_size(zfs_ecksum_info_t *eip)
{
	struct zei_ranges *r = eip->zei_ranges;
	size_t count = eip->zei_range_count;
	size_t result = 0;
	size_t idx;

	for (idx = 0; idx < count; idx++)
		result += (r[idx].zr_end - r[idx].zr_start);

	return (result);
}

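/*
 * Worked example (hypothetical input, not from the code above): if the
 * 64-bit words at indices 10, 11, and 13 of the two buffers differ,
 * annotate_ecksum() below calls zei_add_range(eip, 10, 12) and then
 * zei_add_range(eip, 13, 14) (end indices are exclusive).  With
 * zei_allowed_mingap at its initial value of 1, the one-word gap between
 * the two ranges keeps them separate, so zei_ranges holds [10,12) and
 * [13,14) and zei_mingap becomes 1.  Only once all MAX_RANGES slots are
 * in use does zei_shrink_ranges() raise the allowed gap to
 * zei_mingap + 1 and coalesce neighboring ranges that are closer than
 * that; here that would merge the pair into the single range [10,14).
 */
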
static zfs_ecksum_info_t *
annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
    const abd_t *goodabd, const abd_t *badabd, size_t size,
    boolean_t drop_if_identical)
{
	const uint64_t *good;
	const uint64_t *bad;

	uint64_t allset = 0;
	uint64_t allcleared = 0;

	size_t nui64s = size / sizeof (uint64_t);

	size_t inline_size;
	int no_inline = 0;
	size_t idx;
	size_t range;

	size_t offset = 0;
	ssize_t start = -1;

	zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP);

	/* don't do any annotation for injected checksum errors */
	if (info != NULL && info->zbc_injected)
		return (eip);

	if (info != NULL && info->zbc_has_cksum) {
		fm_payload_set(ereport,
		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED,
		    DATA_TYPE_UINT64_ARRAY,
		    sizeof (info->zbc_expected) / sizeof (uint64_t),
		    (uint64_t *)&info->zbc_expected,
		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL,
		    DATA_TYPE_UINT64_ARRAY,
		    sizeof (info->zbc_actual) / sizeof (uint64_t),
		    (uint64_t *)&info->zbc_actual,
		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
		    DATA_TYPE_STRING,
		    info->zbc_checksum_name,
		    NULL);

		if (info->zbc_byteswapped) {
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP,
			    DATA_TYPE_BOOLEAN, 1,
			    NULL);
		}
	}

	if (badabd == NULL || goodabd == NULL)
		return (eip);

	ASSERT3U(nui64s, <=, UINT32_MAX);
	ASSERT3U(size, ==, nui64s * sizeof (uint64_t));
	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT3U(size, <=, UINT32_MAX);

	good = (const uint64_t *) abd_borrow_buf_copy((abd_t *)goodabd, size);
	bad = (const uint64_t *) abd_borrow_buf_copy((abd_t *)badabd, size);

	/* build up the range list by comparing the two buffers. */
	for (idx = 0; idx < nui64s; idx++) {
		if (good[idx] == bad[idx]) {
			if (start == -1)
				continue;

			zei_add_range(eip, start, idx);
			start = -1;
		} else {
			if (start != -1)
				continue;

			start = idx;
		}
	}
	if (start != -1)
		zei_add_range(eip, start, idx);

	/* See if it will fit in our inline buffers */
	inline_size = zei_range_total_size(eip);
	if (inline_size > ZFM_MAX_INLINE)
		no_inline = 1;

	/*
	 * If there is no change and we want to drop if the buffers are
	 * identical, do so.
	 */
	if (inline_size == 0 && drop_if_identical) {
		kmem_free(eip, sizeof (*eip));
		abd_return_buf((abd_t *)goodabd, (void *)good, size);
		abd_return_buf((abd_t *)badabd, (void *)bad, size);
		return (NULL);
	}

	/*
	 * Now walk through the ranges, filling in the details of the
	 * differences.  Also convert our uint64_t-array offsets to byte
	 * offsets.
	 */
	for (range = 0; range < eip->zei_range_count; range++) {
		size_t start = eip->zei_ranges[range].zr_start;
		size_t end = eip->zei_ranges[range].zr_end;

		for (idx = start; idx < end; idx++) {
			uint64_t set, cleared;

			// bits set in bad, but not in good
			set = ((~good[idx]) & bad[idx]);
			// bits set in good, but not in bad
			cleared = (good[idx] & (~bad[idx]));

			allset |= set;
			allcleared |= cleared;

			if (!no_inline) {
				ASSERT3U(offset, <, inline_size);
				eip->zei_bits_set[offset] = set;
				eip->zei_bits_cleared[offset] = cleared;
				offset++;
			}

			update_histogram(set, eip->zei_histogram_set,
			    &eip->zei_range_sets[range]);
			update_histogram(cleared, eip->zei_histogram_cleared,
			    &eip->zei_range_clears[range]);
		}

		/* convert to byte offsets */
		eip->zei_ranges[range].zr_start *= sizeof (uint64_t);
		eip->zei_ranges[range].zr_end *= sizeof (uint64_t);
	}

	abd_return_buf((abd_t *)goodabd, (void *)good, size);
	abd_return_buf((abd_t *)badabd, (void *)bad, size);

	eip->zei_allowed_mingap *= sizeof (uint64_t);
	inline_size *= sizeof (uint64_t);

	/* fill in ereport */
	fm_payload_set(ereport,
	    FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES,
	    DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count,
	    (uint32_t *)eip->zei_ranges,
	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP,
	    DATA_TYPE_UINT32, eip->zei_allowed_mingap,
	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS,
	    DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets,
	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS,
	    DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears,
	    NULL);

	if (!no_inline) {
		fm_payload_set(ereport,
		    FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS,
		    DATA_TYPE_UINT8_ARRAY,
		    inline_size, (uint8_t *)eip->zei_bits_set,
		    FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS,
		    DATA_TYPE_UINT8_ARRAY,
		    inline_size, (uint8_t *)eip->zei_bits_cleared,
		    NULL);
	} else {
		fm_payload_set(ereport,
		    FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM,
		    DATA_TYPE_UINT32_ARRAY,
		    NBBY * sizeof (uint64_t), eip->zei_histogram_set,
		    FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM,
		    DATA_TYPE_UINT32_ARRAY,
		    NBBY * sizeof (uint64_t), eip->zei_histogram_cleared,
		    NULL);
	}
	return (eip);
}
#endif

/*
 * Make sure our event is still valid for the given zio/vdev/pool.  For example,
 * we don't want to keep logging events for a faulted or missing vdev.
 */
boolean_t
zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
{
#ifdef _KERNEL
	/*
	 * If we are doing a spa_tryimport() or in recovery mode,
	 * ignore errors.
	 */
	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
	    spa_load_state(spa) == SPA_LOAD_RECOVER)
		return (B_FALSE);

	/*
	 * If we are in the middle of opening a pool, and the previous attempt
	 * failed, don't bother logging any new ereports - we're just going to
	 * get the same diagnosis anyway.
	 */
	if (spa_load_state(spa) != SPA_LOAD_NONE &&
	    spa->spa_last_open_failed)
		return (B_FALSE);

	if (zio != NULL) {
		/*
		 * If this is not a read or write zio, ignore the error.  This
		 * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
		 */
		if (zio->io_type != ZIO_TYPE_READ &&
		    zio->io_type != ZIO_TYPE_WRITE)
			return (B_FALSE);

		if (vd != NULL) {
			/*
			 * If the vdev has already been marked as failing due
			 * to a failed probe, then ignore any subsequent I/O
			 * errors, as the DE will automatically fault the vdev
			 * on the first such failure.  This also catches cases
			 * where vdev_remove_wanted is set and the device has
			 * not yet been asynchronously placed into the REMOVED
			 * state.
			 */
			if (zio->io_vd == vd && !vdev_accessible(vd, zio))
				return (B_FALSE);

			/*
			 * Ignore checksum errors for reads from DTL regions of
			 * leaf vdevs.
			 */
			if (zio->io_type == ZIO_TYPE_READ &&
			    zio->io_error == ECKSUM &&
			    vd->vdev_ops->vdev_op_leaf &&
			    vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
				return (B_FALSE);
		}
	}

	/*
	 * For probe failure, we want to avoid posting ereports if we've
	 * already removed the device in the meantime.
	 */
	if (vd != NULL &&
	    strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
	    (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
		return (B_FALSE);

	/* Ignore bogus delay events (like from ioctls or unqueued IOs) */
	if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
	    (zio != NULL) && (!zio->io_timestamp)) {
		return (B_FALSE);
	}
#endif
	return (B_TRUE);
}

/*
 * Return 0 if event was posted, EINVAL if there was a problem posting it or
 * EBUSY if the event was rate limited.
 */
int
zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
    const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset,
    uint64_t size)
{
	int rc = 0;
#ifdef _KERNEL
	nvlist_t *ereport = NULL;
	nvlist_t *detector = NULL;

	if (zfs_is_ratelimiting_event(subclass, vd))
		return (SET_ERROR(EBUSY));

	if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
	    zb, zio, stateoroffset, size))
		return (SET_ERROR(EINVAL));	/* couldn't post event */

	if (ereport == NULL)
		return (SET_ERROR(EINVAL));

	/* Cleanup is handled by the callback function */
	rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
#endif
	return (rc);
}

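/*
 * Illustrative caller sketch (not part of this file): an I/O completion
 * path that wants to report a failed read or write against a leaf vdev
 * might post a generic I/O ereport along these lines, assuming zio and
 * vd describe the failed I/O and its device:
 *
 *	(void) zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, vd,
 *	    &zio->io_bookmark, zio, 0, 0);
 *
 * Passing 0 for 'stateoroffset' and 'size' tells zfs_ereport_start() to
 * take the offset and size from the zio itself.
 */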

void
zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
    struct zio *zio, uint64_t offset, uint64_t length, void *arg,
    zio_bad_cksum_t *info)
{
	zio_cksum_report_t *report;

#ifdef _KERNEL
	if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
		return;
#endif

	report = kmem_zalloc(sizeof (*report), KM_SLEEP);

	if (zio->io_vsd != NULL)
		zio->io_vsd_ops->vsd_cksum_report(zio, report, arg);
	else
		zio_vsd_default_cksum_report(zio, report, arg);

	/* copy the checksum failure information if it was provided */
	if (info != NULL) {
		report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP);
		bcopy(info, report->zcr_ckinfo, sizeof (*info));
	}

	report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift;
	report->zcr_length = length;

#ifdef _KERNEL
	(void) zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
	    FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length);

	if (report->zcr_ereport == NULL) {
		zfs_ereport_free_checksum(report);
		return;
	}
#endif

	mutex_enter(&spa->spa_errlist_lock);
	report->zcr_next = zio->io_logical->io_cksum_report;
	zio->io_logical->io_cksum_report = report;
	mutex_exit(&spa->spa_errlist_lock);
}

void
zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data,
    const abd_t *bad_data, boolean_t drop_if_identical)
{
#ifdef _KERNEL
	zfs_ecksum_info_t *info;

	info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo,
	    good_data, bad_data, report->zcr_length, drop_if_identical);
	if (info != NULL)
		zfs_zevent_post(report->zcr_ereport,
		    report->zcr_detector, zfs_zevent_post_cb);
	else
		zfs_zevent_post_cb(report->zcr_ereport, report->zcr_detector);

	report->zcr_ereport = report->zcr_detector = NULL;
	if (info != NULL)
		kmem_free(info, sizeof (*info));
#endif
}

void
zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
{
#ifdef _KERNEL
	if (rpt->zcr_ereport != NULL) {
		fm_nvlist_destroy(rpt->zcr_ereport,
		    FM_NVA_FREE);
		fm_nvlist_destroy(rpt->zcr_detector,
		    FM_NVA_FREE);
	}
#endif
	rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo);

	if (rpt->zcr_ckinfo != NULL)
		kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo));

	kmem_free(rpt, sizeof (*rpt));
}


int
zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
    struct zio *zio, uint64_t offset, uint64_t length,
    const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc)
{
	int rc = 0;
#ifdef _KERNEL
	nvlist_t *ereport = NULL;
	nvlist_t *detector = NULL;
	zfs_ecksum_info_t *info;

	if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
		return (EBUSY);

	if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
	    spa, vd, zb, zio, offset, length) || (ereport == NULL)) {
		return (SET_ERROR(EINVAL));
	}

	info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
	    B_FALSE);

	if (info != NULL) {
		rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
		kmem_free(info, sizeof (*info));
	}
#endif
	return (rc);
}

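/*
 * Illustrative sketch (not part of this file): when a caller already has
 * both the expected and the actual buffer at the point of failure, it can
 * bypass the start/finish pairing above and post the annotated checksum
 * ereport in one step.  The names zio, vd, offset, length, good_abd,
 * bad_abd and zbc are placeholders for the caller's own context:
 *
 *	(void) zfs_ereport_post_checksum(zio->io_spa, vd, &zio->io_bookmark,
 *	    zio, offset, length, good_abd, bad_abd, &zbc);
 */
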
/*
 * The 'sysevent.fs.zfs.*' events are signals posted to notify user space of
 * change in the pool.  All sysevents are listed in sys/sysevent/eventdefs.h
 * and are designed to be consumed by the ZFS Event Daemon (ZED).  For
 * additional details refer to the zed(8) man page.
 */
nvlist_t *
zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name,
    nvlist_t *aux)
{
	nvlist_t *resource = NULL;
#ifdef _KERNEL
	char class[64];

	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
		return (NULL);

	if ((resource = fm_nvlist_create(NULL)) == NULL)
		return (NULL);

	(void) snprintf(class, sizeof (class), "%s.%s.%s", type,
	    ZFS_ERROR_CLASS, name);
	VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION));
	VERIFY0(nvlist_add_string(resource, FM_CLASS, class));
	VERIFY0(nvlist_add_string(resource,
	    FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa)));
	VERIFY0(nvlist_add_uint64(resource,
	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)));
	VERIFY0(nvlist_add_uint64(resource,
	    FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, spa_state(spa)));
	VERIFY0(nvlist_add_int32(resource,
	    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa)));

	if (vd) {
		VERIFY0(nvlist_add_uint64(resource,
		    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid));
		VERIFY0(nvlist_add_uint64(resource,
		    FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state));
		if (vd->vdev_path != NULL)
			VERIFY0(nvlist_add_string(resource,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path));
		if (vd->vdev_devid != NULL)
			VERIFY0(nvlist_add_string(resource,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid));
		if (vd->vdev_fru != NULL)
			VERIFY0(nvlist_add_string(resource,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru));
		if (vd->vdev_enc_sysfs_path != NULL)
			VERIFY0(nvlist_add_string(resource,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
			    vd->vdev_enc_sysfs_path));
	}

	/* also copy any optional payload data */
	if (aux) {
		nvpair_t *elem = NULL;

		while ((elem = nvlist_next_nvpair(aux, elem)) != NULL)
			(void) nvlist_add_nvpair(resource, elem);
	}

#endif
	return (resource);
}

static void
zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name,
    nvlist_t *aux)
{
#ifdef _KERNEL
	nvlist_t *resource;

	resource = zfs_event_create(spa, vd, type, name, aux);
	if (resource)
		zfs_zevent_post(resource, NULL, zfs_zevent_post_cb);
#endif
}

/*
 * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev
 * has been removed from the system.  This will cause the DE to ignore any
 * recent I/O errors, inferring that they are due to the asynchronous device
 * removal.
 */
void
zfs_post_remove(spa_t *spa, vdev_t *vd)
{
	zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, NULL);
}

/*
 * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool
 * has the 'autoreplace' property set, and therefore any broken vdevs will be
 * handled by higher level logic, and no vdev fault should be generated.
 */
void
zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
{
	zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_AUTOREPLACE, NULL);
}

/*
 * The 'resource.fs.zfs.statechange' event is an internal signal that the
 * given vdev has transitioned its state to DEGRADED or HEALTHY.  This will
 * cause the retire agent to repair any outstanding fault management cases
 * open because the device was not found (fault.fs.zfs.device).
 */
void
zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate)
{
#ifdef _KERNEL
	nvlist_t *aux;

	/*
	 * Add optional supplemental keys to payload
	 */
	aux = fm_nvlist_create(NULL);
	if (vd && aux) {
		if (vd->vdev_physpath) {
			(void) nvlist_add_string(aux,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH,
			    vd->vdev_physpath);
		}
		if (vd->vdev_enc_sysfs_path) {
			(void) nvlist_add_string(aux,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
			    vd->vdev_enc_sysfs_path);
		}

		(void) nvlist_add_uint64(aux,
		    FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate);
	}

	zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_STATECHANGE,
	    aux);

	if (aux)
		fm_nvlist_destroy(aux, FM_NVA_FREE);
#endif
}

#if defined(_KERNEL)
EXPORT_SYMBOL(zfs_ereport_post);
EXPORT_SYMBOL(zfs_ereport_is_valid);
EXPORT_SYMBOL(zfs_ereport_post_checksum);
EXPORT_SYMBOL(zfs_post_remove);
EXPORT_SYMBOL(zfs_post_autoreplace);
EXPORT_SYMBOL(zfs_post_state_change);
#endif /* _KERNEL */