/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 */

#include <assert.h>
#include <stddef.h>
#include <strings.h>
#include <inttypes.h>
#include <libuutil.h>
#include <libzfs.h>
#include <fm/fmd_api.h>
#include <fm/libtopo.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/fs/zfs.h>
#include <sys/fm/protocol.h>
#include <sys/fm/fs/zfs.h>

/*
 * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io,probe}'.
 * This #define reserves enough space for two 64-bit hex values plus the length
 * of the longest string.
 */
#define	MAX_SERDLEN	(16 * 2 + sizeof ("zfs___checksum"))

/*
 * On-disk case structure.  This must maintain backwards compatibility with
 * previous versions of the DE.  By default, any members appended to the end
 * will be filled with zeros if they don't exist in a previous version.
 */
typedef struct zfs_case_data {
	uint64_t	zc_version;
	uint64_t	zc_ena;
	uint64_t	zc_pool_guid;
	uint64_t	zc_vdev_guid;
	int		zc_has_timer;		/* defunct */
	int		zc_pool_state;
	char		zc_serd_checksum[MAX_SERDLEN];
	char		zc_serd_io[MAX_SERDLEN];
	int		zc_has_remove_timer;
	char		zc_serd_probe[MAX_SERDLEN];
} zfs_case_data_t;

/*
 * Time-of-day
 */
typedef struct er_timeval {
	uint64_t	ertv_sec;
	uint64_t	ertv_nsec;
} er_timeval_t;

/*
 * In-core case structure.
 */
typedef struct zfs_case {
	boolean_t	zc_present;
	uint32_t	zc_version;
	zfs_case_data_t	zc_data;
	fmd_case_t	*zc_case;
	uu_list_node_t	zc_node;
	id_t		zc_remove_timer;
	char		*zc_fru;
	er_timeval_t	zc_when;
} zfs_case_t;

#define	CASE_DATA			"data"
#define	CASE_FRU			"fru"
#define	CASE_DATA_VERSION_INITIAL	1
#define	CASE_DATA_VERSION_SERD		2

/* The length of the maximum uint64 rendered as a decimal string. */
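/* UINT64_MAX is 18446744073709551615: 20 digits, plus one byte for the NUL. */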
#define	MAX_ULL_STR	21

typedef struct zfs_de_stats {
	fmd_stat_t	old_drops;
	fmd_stat_t	dev_drops;
	fmd_stat_t	vdev_drops;
	fmd_stat_t	import_drops;
	fmd_stat_t	resource_drops;
	fmd_stat_t	pool_drops;
} zfs_de_stats_t;

zfs_de_stats_t zfs_stats = {
	{ "old_drops", FMD_TYPE_UINT64, "ereports dropped (from before load)" },
	{ "dev_drops", FMD_TYPE_UINT64, "ereports dropped (dev during open)" },
	{ "vdev_drops", FMD_TYPE_UINT64, "ereports dropped (weird vdev types)" },
	{ "import_drops", FMD_TYPE_UINT64, "ereports dropped (during import)" },
	{ "resource_drops", FMD_TYPE_UINT64, "resource related ereports" },
	{ "pool_drops", FMD_TYPE_UINT64, "ereports dropped (pool iter failed)" },
};

static hrtime_t zfs_remove_timeout;

uu_list_pool_t *zfs_case_pool;
uu_list_t *zfs_cases;

#define	ZFS_MAKE_RSRC(type)	\
	FM_RSRC_CLASS "." ZFS_ERROR_CLASS "." type
#define	ZFS_MAKE_EREPORT(type)	\
	FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type

/*
 * Write out the persistent representation of an active case.
 */
static void
zfs_case_serialize(fmd_hdl_t *hdl, zfs_case_t *zcp)
{
	/*
	 * Always update cases to the latest version, even if they were the
	 * previous version when unserialized.
	 */
	zcp->zc_data.zc_version = CASE_DATA_VERSION_SERD;
	fmd_buf_write(hdl, zcp->zc_case, CASE_DATA, &zcp->zc_data,
	    sizeof (zcp->zc_data));

	if (zcp->zc_fru != NULL)
		fmd_buf_write(hdl, zcp->zc_case, CASE_FRU, zcp->zc_fru,
		    strlen(zcp->zc_fru));
}

/*
 * Read back the persistent representation of an active case.
 */
static zfs_case_t *
zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
{
	zfs_case_t *zcp;
	size_t frulen;

	zcp = fmd_hdl_zalloc(hdl, sizeof (zfs_case_t), FMD_SLEEP);
	zcp->zc_case = cp;

	fmd_buf_read(hdl, cp, CASE_DATA, &zcp->zc_data,
	    sizeof (zcp->zc_data));

	if (zcp->zc_data.zc_version > CASE_DATA_VERSION_SERD) {
		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
		return (NULL);
	}

	if ((frulen = fmd_buf_size(hdl, zcp->zc_case, CASE_FRU)) > 0) {
		zcp->zc_fru = fmd_hdl_alloc(hdl, frulen + 1, FMD_SLEEP);
		fmd_buf_read(hdl, zcp->zc_case, CASE_FRU, zcp->zc_fru,
		    frulen);
		zcp->zc_fru[frulen] = '\0';
	}

	/*
	 * fmd_buf_read() will have already zeroed out the remainder of the
	 * buffer, so we don't have to do anything special if the version
	 * doesn't include the SERD engine name.
	 */

	if (zcp->zc_data.zc_has_remove_timer)
		zcp->zc_remove_timer = fmd_timer_install(hdl, zcp,
		    NULL, zfs_remove_timeout);

	(void) uu_list_insert_before(zfs_cases, NULL, zcp);

	fmd_case_setspecific(hdl, cp, zcp);

	return (zcp);
}

/*
 * Recursively mark, as still present, any cases associated with this vdev and
 * its children (including any cache and spare devices).
 */
static void
zfs_mark_vdev(uint64_t pool_guid, nvlist_t *vd, er_timeval_t *loaded)
{
	uint64_t vdev_guid;
	uint_t c, children;
	nvlist_t **child;
	zfs_case_t *zcp;
	int ret;

	ret = nvlist_lookup_uint64(vd, ZPOOL_CONFIG_GUID, &vdev_guid);
	assert(ret == 0);
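	/*
	 * Note that this walk is O(cases) and runs once per vdev, which is
	 * the O(vdevs * cases) behavior described in zfs_purge_cases().
	 */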
	/*
	 * Mark any cases associated with this (pool, vdev) pair.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == pool_guid &&
		    zcp->zc_data.zc_vdev_guid == vdev_guid) {
			zcp->zc_present = B_TRUE;
			zcp->zc_when = *loaded;
		}
	}

	/*
	 * Iterate over all children.
	 */
	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_CHILDREN, &child,
	    &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_mark_vdev(pool_guid, child[c], loaded);
	}

	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_L2CACHE, &child,
	    &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_mark_vdev(pool_guid, child[c], loaded);
	}

	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_SPARES, &child,
	    &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_mark_vdev(pool_guid, child[c], loaded);
	}
}

/*ARGSUSED*/
static int
zfs_mark_pool(zpool_handle_t *zhp, void *unused)
{
	zfs_case_t *zcp;
	uint64_t pool_guid;
	uint64_t *tod;
	er_timeval_t loaded = { 0 };
	nvlist_t *config, *vd;
	uint_t nelem = 0;
	int ret;

	pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
	/*
	 * Mark any cases associated with just this pool.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == pool_guid &&
		    zcp->zc_data.zc_vdev_guid == 0)
			zcp->zc_present = B_TRUE;
	}

	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
		zpool_close(zhp);
		return (-1);
	}

	(void) nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
	    &tod, &nelem);
	if (nelem == 2) {
		loaded.ertv_sec = tod[0];
		loaded.ertv_nsec = tod[1];
		for (zcp = uu_list_first(zfs_cases); zcp != NULL;
		    zcp = uu_list_next(zfs_cases, zcp)) {
			if (zcp->zc_data.zc_pool_guid == pool_guid &&
			    zcp->zc_data.zc_vdev_guid == 0) {
				zcp->zc_when = loaded;
			}
		}
	}

	ret = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vd);
	assert(ret == 0);

	zfs_mark_vdev(pool_guid, vd, &loaded);

	zpool_close(zhp);

	return (0);
}
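/*
 * Note that, as a zpool_iter() callback, zfs_mark_pool() owns the handle it
 * is passed and must close it on every path, including the error return.
 */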
/*
 * Find a pool with a matching GUID.
 */
typedef struct find_cbdata {
	uint64_t	cb_guid;
	zpool_handle_t	*cb_zhp;
} find_cbdata_t;

static int
find_pool(zpool_handle_t *zhp, void *data)
{
	find_cbdata_t *cbp = data;

	if (cbp->cb_guid ==
	    zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL)) {
		cbp->cb_zhp = zhp;
		return (0);
	}

	zpool_close(zhp);
	return (0);
}

struct load_time_arg {
	uint64_t	lt_guid;
	er_timeval_t	*lt_time;
	boolean_t	lt_found;
};

static int
zpool_find_load_time(zpool_handle_t *zhp, void *arg)
{
	struct load_time_arg *lta = arg;
	uint64_t pool_guid;
	uint64_t *tod;
	nvlist_t *config;
	uint_t nelem;

	if (lta->lt_found) {
		zpool_close(zhp);
		return (0);
	}

	pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
	if (pool_guid != lta->lt_guid) {
		zpool_close(zhp);
		return (0);
	}

	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
		zpool_close(zhp);
		return (-1);
	}

	if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
	    &tod, &nelem) == 0 && nelem == 2) {
		lta->lt_found = B_TRUE;
		lta->lt_time->ertv_sec = tod[0];
		lta->lt_time->ertv_nsec = tod[1];
	}

	zpool_close(zhp);

	return (0);
}

static void
zfs_purge_cases(fmd_hdl_t *hdl)
{
	zfs_case_t *zcp;
	uu_list_walk_t *walk;
	libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);

	/*
	 * There is no way to open a pool by GUID, or lookup a vdev by GUID.
	 * No matter what we do, we're going to have to stomach an
	 * O(vdevs * cases) algorithm.  In reality, both quantities are likely
	 * so small that neither will matter.  Given that iterating over pools
	 * is more expensive than iterating over the in-memory case list, we
	 * opt for a 'present' flag in each case that starts off cleared.  We
	 * then iterate over all pools, marking those that are still present,
	 * and removing those that aren't found.
	 *
	 * Note that we could also construct an FMRI and rely on
	 * fmd_nvl_fmri_present(), but this would end up doing the same search.
	 */

	/*
	 * Mark all cases as not present.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp))
		zcp->zc_present = B_FALSE;

	/*
	 * Iterate over all pools and mark the pools and vdevs found.  If this
	 * fails (most probably because we're out of memory), then don't close
	 * any of the cases, since we cannot be sure the marks are accurate.
	 */
	if (zpool_iter(zhdl, zfs_mark_pool, NULL) != 0)
		return;

	/*
	 * Remove those cases which were not found.
	 */
	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
	while ((zcp = uu_list_walk_next(walk)) != NULL) {
		if (!zcp->zc_present)
			fmd_case_close(hdl, zcp->zc_case);
	}
	uu_list_walk_end(walk);
}

/*
 * Construct the name of a serd engine given the pool/vdev GUID and type (io,
 * checksum, or probe).
 */
static void
zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid,
    const char *type)
{
	(void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s", pool_guid,
	    vdev_guid, type);
}
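/*
 * For example, pool GUID 0x1234 and vdev GUID 0xabcd yield the engine names
 * "zfs_1234_abcd_io", "zfs_1234_abcd_checksum", and "zfs_1234_abcd_probe".
 */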
/*
 * Solve a given ZFS case.  This first checks to make sure the diagnosis is
 * still valid, as well as cleaning up any pending timer associated with the
 * case.
 */
static void
zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname,
    boolean_t checkunusable)
{
	libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
	nvlist_t *detector, *fault;
	boolean_t serialize;
	nvlist_t *fmri, *fru;
	topo_hdl_t *thp;
	int err;

	/*
	 * Construct the detector from the case data.  The detector is in the
	 * ZFS scheme, and is either the pool or the vdev, depending on whether
	 * this is a vdev or pool fault.
	 */
	detector = fmd_nvl_alloc(hdl, FMD_SLEEP);

	(void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0);
	(void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
	(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL,
	    zcp->zc_data.zc_pool_guid);
	if (zcp->zc_data.zc_vdev_guid != 0) {
		(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV,
		    zcp->zc_data.zc_vdev_guid);
	}

	/*
	 * We also want to make sure that the detector (pool or vdev) properly
	 * reflects the diagnosed state, when the fault corresponds to internal
	 * ZFS state (i.e. not checksum or I/O error-induced).  Otherwise, a
	 * device which was unavailable early in boot (because the driver/file
	 * wasn't available) and is now healthy will be mis-diagnosed.
	 */
	if (!fmd_nvl_fmri_present(hdl, detector) ||
	    (checkunusable && !fmd_nvl_fmri_unusable(hdl, detector))) {
		fmd_case_close(hdl, zcp->zc_case);
		nvlist_free(detector);
		return;
	}

	fru = NULL;
	if (zcp->zc_fru != NULL &&
	    (thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION)) != NULL) {
		/*
		 * If the vdev had an associated FRU, then get the FRU nvlist
		 * from the topo handle and use that in the suspect list.  We
		 * explicitly lookup the FRU because the fmri reported from the
		 * kernel may not have up to date details about the disk itself
		 * (serial, part, etc).
		 */
		if (topo_fmri_str2nvl(thp, zcp->zc_fru, &fmri, &err) == 0) {
			/*
			 * If the disk is part of the system chassis, but the
			 * FRU indicates a different chassis ID than our
			 * current system, then ignore the error.  This
			 * indicates that the device was part of another
			 * cluster head, and for obvious reasons cannot be
			 * imported on this system.
			 */
			if (libzfs_fru_notself(zhdl, zcp->zc_fru)) {
				fmd_case_close(hdl, zcp->zc_case);
				nvlist_free(fmri);
				fmd_hdl_topo_rele(hdl, thp);
				nvlist_free(detector);
				return;
			}

			/*
			 * If the device is no longer present on the system, or
			 * topo_fmri_fru() fails for other reasons, then fall
			 * back to the fmri specified in the vdev.
			 */
			if (topo_fmri_fru(thp, fmri, &fru, &err) != 0)
				fru = fmd_nvl_dup(hdl, fmri, FMD_SLEEP);
			nvlist_free(fmri);
		}

		fmd_hdl_topo_rele(hdl, thp);
	}

	fault = fmd_nvl_create_fault(hdl, faultname, 100, detector,
	    fru, detector);
	fmd_case_add_suspect(hdl, zcp->zc_case, fault);

	nvlist_free(fru);

	fmd_case_solve(hdl, zcp->zc_case);

	serialize = B_FALSE;
	if (zcp->zc_data.zc_has_remove_timer) {
		fmd_timer_remove(hdl, zcp->zc_remove_timer);
		zcp->zc_data.zc_has_remove_timer = 0;
		serialize = B_TRUE;
	}
	if (serialize)
		zfs_case_serialize(hdl, zcp);

	nvlist_free(detector);
}
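/*
 * Note that zfs_case_solve() creates its suspect with 100% certainty and uses
 * the ZFS-scheme detector as both the ASRU and the resource of the fault.
 */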
/*
 * This #define and function access a private interface of the FMA
 * framework.  Ereports include a time-of-day upper bound.  We want to look
 * at that so we can compare it to when pools get loaded.
 */
#define	FMD_EVN_TOD	"__tod"

static boolean_t
timeval_earlier(er_timeval_t *a, er_timeval_t *b)
{
	return (a->ertv_sec < b->ertv_sec ||
	    (a->ertv_sec == b->ertv_sec && a->ertv_nsec < b->ertv_nsec));
}

/*ARGSUSED*/
static void
zfs_ereport_when(fmd_hdl_t *hdl, nvlist_t *nvl, er_timeval_t *when)
{
	uint64_t *tod;
	uint_t nelem;

	if (nvlist_lookup_uint64_array(nvl, FMD_EVN_TOD, &tod, &nelem) == 0 &&
	    nelem == 2) {
		when->ertv_sec = tod[0];
		when->ertv_nsec = tod[1];
	} else {
		when->ertv_sec = when->ertv_nsec = UINT64_MAX;
	}
}

/*
 * Main fmd entry point.
 */
/*ARGSUSED*/
static void
zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
{
	zfs_case_t *zcp, *dcp;
	libzfs_handle_t *zhdl;
	zpool_handle_t *zhp;
	int32_t pool_state;
	uint64_t ena, pool_guid, vdev_guid;
	er_timeval_t pool_load;
	er_timeval_t er_when;
	nvlist_t *detector;
	boolean_t pool_found = B_FALSE;
	boolean_t isresource;
	boolean_t is_inactive_spare = B_FALSE, islog, iscache;
	nvlist_t *vd_nvl = NULL;
	char *fru, *type, *vdg;
	find_cbdata_t cb;

	/*
	 * We subscribe to notifications for vdev or pool removal.  In these
	 * cases, there may be cases that no longer apply; purge any such
	 * cases.
	 */
	if (fmd_nvl_class_match(hdl, nvl, "resource.sysevent.EC_zfs.*")) {
		zfs_purge_cases(hdl);
		zfs_stats.resource_drops.fmds_value.ui64++;
		return;
	}

	isresource = fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.*");

	if (isresource) {
		/*
		 * For resources, we don't have a normal payload.
		 */
		if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
		    &vdev_guid) != 0)
			pool_state = SPA_LOAD_OPEN;
		else
			pool_state = SPA_LOAD_NONE;
		detector = NULL;
	} else {
		(void) nvlist_lookup_nvlist(nvl,
		    FM_EREPORT_DETECTOR, &detector);
		(void) nvlist_lookup_int32(nvl,
		    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, &pool_state);
	}

	/*
	 * We also ignore all ereports generated during an import of a pool,
	 * since the only possible fault (.pool) would result in import
	 * failure, and hence no persistent fault.  Some day we may want to do
	 * something with these ereports, so we continue generating them
	 * internally.
	 */
	if (pool_state == SPA_LOAD_IMPORT) {
		zfs_stats.import_drops.fmds_value.ui64++;
		return;
	}

	/*
	 * Device I/O errors are ignored during pool open.
	 */
	if (pool_state == SPA_LOAD_OPEN &&
	    (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE)))) {
		zfs_stats.dev_drops.fmds_value.ui64++;
		return;
	}

	/*
	 * We ignore ereports for anything except disks and files.
	 */
	if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
	    &type) == 0) {
		if (strcmp(type, VDEV_TYPE_DISK) != 0 &&
		    strcmp(type, VDEV_TYPE_FILE) != 0) {
			zfs_stats.vdev_drops.fmds_value.ui64++;
			return;
		}
	}
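	/*
	 * At this point we have an event for a disk or file vdev, or for the
	 * pool itself; interior vdev types (mirror, raidz, etc.) were counted
	 * in vdev_drops and dropped above.
	 */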
	/*
	 * Determine if this ereport corresponds to an open case.  Previous
	 * incarnations of this DE used the ENA to chain events together as
	 * part of the same case.  The problem with this is that we rely on
	 * global uniqueness of cases based on (pool_guid, vdev_guid) pair when
	 * generating SERD engines.  Instead, we have a case for each vdev or
	 * pool, regardless of the ENA.
	 */
	(void) nvlist_lookup_uint64(nvl,
	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid);
	if (nvlist_lookup_uint64(nvl,
	    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
		vdev_guid = 0;
	if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0)
		ena = 0;

	zfs_ereport_when(hdl, nvl, &er_when);

	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == pool_guid) {
			pool_found = B_TRUE;
			pool_load = zcp->zc_when;
		}
		if (zcp->zc_data.zc_vdev_guid == vdev_guid &&
		    zcp->zc_data.zc_pool_guid == pool_guid)
			break;
	}

	if (pool_found) {
		fmd_hdl_debug(hdl, "pool %llx, "
		    "ereport time %lld.%lld, pool load time = %lld.%lld\n",
		    pool_guid, er_when.ertv_sec, er_when.ertv_nsec,
		    pool_load.ertv_sec, pool_load.ertv_nsec);
	}

	/*
	 * Avoid falsely accusing a pool of being faulty.  Do so by
	 * not replaying ereports that were generated prior to the
	 * current import.  If the failure that generated them was
	 * transient because the device was actually removed but we
	 * didn't receive the normal asynchronous notification, we
	 * don't want to mark it as faulted and potentially panic.  If
	 * there is still a problem we'd expect not to be able to
	 * import the pool, or that new ereports will be generated
	 * once the pool is used.
	 */
	if (pool_found && timeval_earlier(&er_when, &pool_load)) {
		zfs_stats.old_drops.fmds_value.ui64++;
		return;
	}

	if (!pool_found) {
		/*
		 * We haven't yet seen this pool, but the same situation
		 * may apply.
		 */
		libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
		struct load_time_arg la;

		la.lt_guid = pool_guid;
		la.lt_time = &pool_load;
		la.lt_found = B_FALSE;

		if (zhdl != NULL &&
		    zpool_iter(zhdl, zpool_find_load_time, &la) == 0 &&
		    la.lt_found == B_TRUE) {
			pool_found = B_TRUE;
			fmd_hdl_debug(hdl, "pool %llx, "
			    "ereport time %lld.%lld, "
			    "pool load time = %lld.%lld\n",
			    pool_guid, er_when.ertv_sec, er_when.ertv_nsec,
			    pool_load.ertv_sec, pool_load.ertv_nsec);
			if (timeval_earlier(&er_when, &pool_load)) {
				zfs_stats.old_drops.fmds_value.ui64++;
				return;
			}
		}
	}

	if (zcp == NULL) {
		fmd_case_t *cs;
		zfs_case_data_t data = { 0 };

		/*
		 * If this is one of our 'fake' resource ereports, and there is
		 * no case open, simply discard it.
		 */
		if (isresource) {
			zfs_stats.resource_drops.fmds_value.ui64++;
			return;
		}

		/*
		 * Open a new case.
		 */
		cs = fmd_case_open(hdl, NULL);
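		/*
		 * fmd checkpoints the new case, so after a module restart it
		 * will be handed back to us via fmd_case_next() and
		 * unserialized in _fmd_init().
		 */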
		/*
		 * Initialize the case buffer.  To commonize code, we actually
		 * create the buffer with existing data, and then call
		 * zfs_case_unserialize() to instantiate the in-core structure.
		 */
		fmd_buf_create(hdl, cs, CASE_DATA,
		    sizeof (zfs_case_data_t));

		data.zc_version = CASE_DATA_VERSION_SERD;
		data.zc_ena = ena;
		data.zc_pool_guid = pool_guid;
		data.zc_vdev_guid = vdev_guid;
		data.zc_pool_state = (int)pool_state;

		fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data));

		zcp = zfs_case_unserialize(hdl, cs);
		assert(zcp != NULL);
		if (pool_found)
			zcp->zc_when = pool_load;
	}

	/*
	 * If this is an ereport for a case with an associated vdev FRU, make
	 * sure it is accurate and up to date.
	 */
	if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
	    &fru) == 0) {
		topo_hdl_t *thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION);
		if (zcp->zc_fru == NULL ||
		    !topo_fmri_strcmp(thp, zcp->zc_fru, fru)) {
			if (zcp->zc_fru != NULL) {
				fmd_hdl_strfree(hdl, zcp->zc_fru);
				fmd_buf_destroy(hdl, zcp->zc_case, CASE_FRU);
			}
			zcp->zc_fru = fmd_hdl_strdup(hdl, fru, FMD_SLEEP);
			zfs_case_serialize(hdl, zcp);
		}
		fmd_hdl_topo_rele(hdl, thp);
	}

	if (isresource) {
		if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_RSRC(FM_RESOURCE_AUTOREPLACE))) {
			/*
			 * The 'resource.fs.zfs.autoreplace' event indicates
			 * that the pool was loaded with the 'autoreplace'
			 * property set.  In this case, any pending device
			 * failures should be ignored, as the asynchronous
			 * autoreplace handling will take care of them.
			 */
			fmd_case_close(hdl, zcp->zc_case);
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED))) {
			/*
			 * The 'resource.fs.zfs.removed' event indicates that
			 * device removal was detected, and the device was
			 * closed asynchronously.  If this is the case, we
			 * assume that any recent I/O errors were due to the
			 * device removal, not any fault of the device itself.
			 * We reset the SERD engines, and cancel any pending
			 * timers.
			 */
			if (zcp->zc_data.zc_has_remove_timer) {
				fmd_timer_remove(hdl, zcp->zc_remove_timer);
				zcp->zc_data.zc_has_remove_timer = 0;
				zfs_case_serialize(hdl, zcp);
			}
			if (zcp->zc_data.zc_serd_io[0] != '\0')
				fmd_serd_reset(hdl,
				    zcp->zc_data.zc_serd_io);
			if (zcp->zc_data.zc_serd_checksum[0] != '\0')
				fmd_serd_reset(hdl,
				    zcp->zc_data.zc_serd_checksum);
			if (zcp->zc_data.zc_serd_probe[0] != '\0')
				fmd_serd_reset(hdl, zcp->zc_data.zc_serd_probe);
		}
		zfs_stats.resource_drops.fmds_value.ui64++;
		return;
	}

	/*
	 * Associate the ereport with this case.
	 */
	fmd_case_add_ereport(hdl, zcp->zc_case, ep);

	/*
	 * Don't do anything else if this case is already solved.
	 */
	if (fmd_case_solved(hdl, zcp->zc_case))
		return;

	zhdl = fmd_hdl_getspecific(hdl);

	/*
	 * Find the corresponding pool.
	 */
	cb.cb_guid = pool_guid;
	cb.cb_zhp = NULL;
	if (zhdl != NULL && zpool_iter(zhdl, find_pool, &cb) != 0) {
		zfs_stats.pool_drops.fmds_value.ui64++;
		return;
	}

	zhp = cb.cb_zhp; /* NULL if pool was not found. */
	if (zhp != NULL) {
		/*
		 * The libzfs API takes a string representation of a base-10
		 * guid here instead of a number, likely because the primary
		 * libzfs consumers are the CLI tools.
		 */
		vdg = fmd_hdl_zalloc(hdl, MAX_ULL_STR, FMD_SLEEP);
		(void) snprintf(vdg, MAX_ULL_STR, "%" PRIu64, vdev_guid);
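		/*
		 * MAX_ULL_STR is exactly large enough for the largest possible
		 * decimal rendering, so the snprintf() above cannot truncate.
		 */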
		/*
		 * According to libzfs the 'spare' bit is set when the spare is
		 * unused, and unset when in use.
		 *
		 * We don't really care about the returned nvlist.  We're only
		 * interested in the boolean flags.
		 */
		if ((vd_nvl = zpool_find_vdev(zhp, vdg,
		    &is_inactive_spare, &islog, &iscache)) != NULL) {
			nvlist_free(vd_nvl);
		}
		fmd_hdl_free(hdl, vdg, MAX_ULL_STR);
	}

	/*
	 * Determine if we should solve the case and generate a fault.  We
	 * solve a case if:
	 *
	 *	a. A pool failed to open (ereport.fs.zfs.pool)
	 *	b. A device failed to open (ereport.fs.zfs.vdev.*) while a
	 *	   pool was up and running.
	 *
	 * We may see a series of ereports associated with a pool open, all
	 * chained together by the same ENA.  If the pool open succeeds, then
	 * we'll see no further ereports.  To detect when a pool open has
	 * succeeded, we associate a timer with the event.  When it expires, we
	 * close the case.
	 */
	if (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_POOL))) {
		/*
		 * Pool level fault.  Before solving the case, go through and
		 * close any open device cases that may be pending.
		 */
		for (dcp = uu_list_first(zfs_cases); dcp != NULL;
		    dcp = uu_list_next(zfs_cases, dcp)) {
			if (dcp->zc_data.zc_pool_guid ==
			    zcp->zc_data.zc_pool_guid &&
			    dcp->zc_data.zc_vdev_guid != 0)
				fmd_case_close(hdl, dcp->zc_case);
		}

		zfs_case_solve(hdl, zcp, "fault.fs.zfs.pool", B_TRUE);
	} else if (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_LOG_REPLAY))) {
		/*
		 * Pool level fault for reading the intent logs.
		 */
		zfs_case_solve(hdl, zcp, "fault.fs.zfs.log_replay", B_TRUE);
	} else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*")) {
		/*
		 * Device fault.
		 */
		zfs_case_solve(hdl, zcp, "fault.fs.zfs.device", B_TRUE);
	} else if (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
		char *failmode = NULL;
		boolean_t checkremove = B_FALSE;
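		/*
		 * A SERD engine fires once N events are recorded within its
		 * sliding window T; the defaults (see fmd_props below) are 10
		 * events in 10 minutes for the io and checksum engines, and 5
		 * events in 24 hours for the probe engine.
		 */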
		/*
		 * If this is a checksum, I/O, or probe error, then toss it
		 * into the appropriate SERD engine and check to see if it has
		 * fired.  Ideally, we want to do something more sophisticated
		 * (persistent errors for a single data block, etc).  For now,
		 * a single SERD engine is sufficient.
		 */
		if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO))) {
			if (zcp->zc_data.zc_serd_io[0] == '\0') {
				zfs_serd_name(zcp->zc_data.zc_serd_io,
				    pool_guid, vdev_guid, "io");
				fmd_serd_create(hdl, zcp->zc_data.zc_serd_io,
				    fmd_prop_get_int32(hdl, "io_N"),
				    fmd_prop_get_int64(hdl, "io_T"));
				zfs_case_serialize(hdl, zcp);
			}
			if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep))
				checkremove = B_TRUE;
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
			if (zcp->zc_data.zc_serd_checksum[0] == '\0') {
				zfs_serd_name(zcp->zc_data.zc_serd_checksum,
				    pool_guid, vdev_guid, "checksum");
				fmd_serd_create(hdl,
				    zcp->zc_data.zc_serd_checksum,
				    fmd_prop_get_int32(hdl, "checksum_N"),
				    fmd_prop_get_int64(hdl, "checksum_T"));
				zfs_case_serialize(hdl, zcp);
			}
			if (fmd_serd_record(hdl,
			    zcp->zc_data.zc_serd_checksum, ep)) {
				zfs_case_solve(hdl, zcp,
				    "fault.fs.zfs.vdev.checksum", B_FALSE);
			}
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) &&
		    (nvlist_lookup_string(nvl,
		    FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, &failmode) == 0) &&
		    failmode != NULL) {
			if (strncmp(failmode, FM_EREPORT_FAILMODE_CONTINUE,
			    strlen(FM_EREPORT_FAILMODE_CONTINUE)) == 0) {
				zfs_case_solve(hdl, zcp,
				    "fault.fs.zfs.io_failure_continue",
				    B_FALSE);
			} else if (strncmp(failmode, FM_EREPORT_FAILMODE_WAIT,
			    strlen(FM_EREPORT_FAILMODE_WAIT)) == 0) {
				zfs_case_solve(hdl, zcp,
				    "fault.fs.zfs.io_failure_wait", B_FALSE);
			}
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
			if (zcp->zc_data.zc_serd_probe[0] == '\0') {
				zfs_serd_name(zcp->zc_data.zc_serd_probe,
				    pool_guid, vdev_guid, "probe");
				fmd_serd_create(hdl, zcp->zc_data.zc_serd_probe,
				    fmd_prop_get_int32(hdl, "probe_N"),
				    fmd_prop_get_int64(hdl, "probe_T"));
				zfs_case_serialize(hdl, zcp);
			}

			/*
			 * We only want to wait for SERD triggers for spare
			 * vdevs.  Normal pool vdevs should be diagnosed
			 * immediately if a probe failure is received.
			 */
			if (!is_inactive_spare || fmd_serd_record(hdl,
			    zcp->zc_data.zc_serd_probe, ep)) {
				checkremove = B_TRUE;
			}
		}

		/*
		 * Because I/O errors may be due to device removal, we postpone
		 * any diagnosis until we're sure that we aren't about to
		 * receive a 'resource.fs.zfs.removed' event.
		 */
		if (checkremove) {
			if (zcp->zc_data.zc_has_remove_timer)
				fmd_timer_remove(hdl, zcp->zc_remove_timer);
			zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, NULL,
			    zfs_remove_timeout);
			if (!zcp->zc_data.zc_has_remove_timer) {
				zcp->zc_data.zc_has_remove_timer = 1;
				zfs_case_serialize(hdl, zcp);
			}
		}
	}
}
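/*
 * The remove_timeout property (15 seconds by default; see fmd_props below)
 * bounds how long we wait for a possible 'resource.fs.zfs.removed' event
 * before the timer fires and the case is solved as fault.fs.zfs.vdev.io.
 */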
/*
 * The timer fires when we have diagnosed an I/O error and it was not due to
 * device removal (which would have cancelled the timer).
 */
/* ARGSUSED */
static void
zfs_fm_timeout(fmd_hdl_t *hdl, id_t id, void *data)
{
	zfs_case_t *zcp = data;

	if (id == zcp->zc_remove_timer)
		zfs_case_solve(hdl, zcp, "fault.fs.zfs.vdev.io", B_FALSE);
}

static void
zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
{
	zfs_case_t *zcp = fmd_case_getspecific(hdl, cs);

	if (zcp->zc_data.zc_serd_checksum[0] != '\0')
		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum);
	if (zcp->zc_data.zc_serd_io[0] != '\0')
		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io);
	if (zcp->zc_data.zc_serd_probe[0] != '\0')
		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_probe);
	if (zcp->zc_data.zc_has_remove_timer)
		fmd_timer_remove(hdl, zcp->zc_remove_timer);
	uu_list_remove(zfs_cases, zcp);
	fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
}

/*
 * We use the fmd gc entry point to look for old cases that no longer apply.
 * This allows us to keep our set of case data small in a long running system.
 */
static void
zfs_fm_gc(fmd_hdl_t *hdl)
{
	zfs_purge_cases(hdl);
}

static const fmd_hdl_ops_t fmd_ops = {
	zfs_fm_recv,	/* fmdo_recv */
	zfs_fm_timeout,	/* fmdo_timeout */
	zfs_fm_close,	/* fmdo_close */
	NULL,		/* fmdo_stats */
	zfs_fm_gc,	/* fmdo_gc */
};

static const fmd_prop_t fmd_props[] = {
	{ "checksum_N", FMD_TYPE_UINT32, "10" },
	{ "checksum_T", FMD_TYPE_TIME, "10min" },
	{ "io_N", FMD_TYPE_UINT32, "10" },
	{ "io_T", FMD_TYPE_TIME, "10min" },
	{ "probe_N", FMD_TYPE_UINT32, "5" },
	{ "probe_T", FMD_TYPE_TIME, "24hour" },
	{ "remove_timeout", FMD_TYPE_TIME, "15sec" },
	{ NULL, 0, NULL }
};

static const fmd_hdl_info_t fmd_info = {
	"ZFS Diagnosis Engine", "1.0", &fmd_ops, fmd_props
};

void
_fmd_init(fmd_hdl_t *hdl)
{
	fmd_case_t *cp;
	libzfs_handle_t *zhdl;

	if ((zhdl = libzfs_init()) == NULL)
		return;

	if ((zfs_case_pool = uu_list_pool_create("zfs_case_pool",
	    sizeof (zfs_case_t), offsetof(zfs_case_t, zc_node),
	    NULL, 0)) == NULL) {
		libzfs_fini(zhdl);
		return;
	}

	if ((zfs_cases = uu_list_create(zfs_case_pool, NULL, 0)) == NULL) {
		uu_list_pool_destroy(zfs_case_pool);
		libzfs_fini(zhdl);
		return;
	}

	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
		uu_list_destroy(zfs_cases);
		uu_list_pool_destroy(zfs_case_pool);
		libzfs_fini(zhdl);
		return;
	}

	fmd_hdl_setspecific(hdl, zhdl);

	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) /
	    sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats);

	/*
	 * Iterate over all active cases and unserialize the associated
	 * buffers, adding them to our list of open cases.
	 */
	for (cp = fmd_case_next(hdl, NULL);
	    cp != NULL; cp = fmd_case_next(hdl, cp))
		(void) zfs_case_unserialize(hdl, cp);

	/*
	 * Clear out any old cases that are no longer valid.
	 */
	zfs_purge_cases(hdl);

	zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout");
}

void
_fmd_fini(fmd_hdl_t *hdl)
{
	zfs_case_t *zcp;
	uu_list_walk_t *walk;
	libzfs_handle_t *zhdl;
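	/*
	 * Note that only the in-core state is torn down here; the persistent
	 * case buffers remain checkpointed by fmd and are unserialized again
	 * in _fmd_init().
	 */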
	/*
	 * Remove all active cases.
	 */
	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
	while ((zcp = uu_list_walk_next(walk)) != NULL) {
		uu_list_remove(zfs_cases, zcp);
		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
	}
	uu_list_walk_end(walk);

	uu_list_destroy(zfs_cases);
	uu_list_pool_destroy(zfs_case_pool);

	zhdl = fmd_hdl_getspecific(hdl);
	libzfs_fini(zhdl);
}