/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <assert.h>
#include <stddef.h>
#include <strings.h>
#include <libuutil.h>
#include <libzfs.h>
#include <fm/fmd_api.h>
#include <fm/libtopo.h>
#include <sys/fs/zfs.h>
#include <sys/fm/protocol.h>
#include <sys/fm/fs/zfs.h>

/*
 * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'. This
 * #define reserves enough space for two 64-bit hex values plus the length of
 * the longest string.
 */
#define	MAX_SERDLEN	(16 * 2 + sizeof ("zfs___checksum"))
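
/*
 * Worked example of the arithmetic above (editorial note): the longest
 * possible engine name is "zfs_ffffffffffffffff_ffffffffffffffff_checksum".
 * The literal "zfs___checksum" contributes 14 characters plus the
 * terminating NUL (15 bytes via sizeof), and 16 * 2 covers the two runs of
 * hex digits, for a total of 47 bytes.
 */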

/*
 * On-disk case structure. This must maintain backwards compatibility with
 * previous versions of the DE. By default, any members appended to the end
 * will be filled with zeros if they don't exist in a previous version.
 */
typedef struct zfs_case_data {
	uint64_t	zc_version;
	uint64_t	zc_ena;
	uint64_t	zc_pool_guid;
	uint64_t	zc_vdev_guid;
	int		zc_has_timer;		/* defunct */
	int		zc_pool_state;
	char		zc_serd_checksum[MAX_SERDLEN];
	char		zc_serd_io[MAX_SERDLEN];
	int		zc_has_remove_timer;
} zfs_case_data_t;
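
/*
 * Illustration of the compatibility rule above (editorial note): a buffer
 * checkpointed at CASE_DATA_VERSION_INITIAL presumably ends at
 * zc_pool_state, the SERD names and remove-timer flag having arrived with
 * CASE_DATA_VERSION_SERD. fmd_buf_read() zero-fills the missing tail, so
 * both engine names read back as empty strings and zc_has_remove_timer as
 * 0, which is exactly the state of a case that has never used them.
 */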

/*
 * In-core case structure.
 */
typedef struct zfs_case {
	boolean_t	zc_present;
	uint32_t	zc_version;
	zfs_case_data_t	zc_data;
	fmd_case_t	*zc_case;
	uu_list_node_t	zc_node;
	id_t		zc_remove_timer;
	char		*zc_fru;
} zfs_case_t;

#define	CASE_DATA			"data"
#define	CASE_FRU			"fru"
#define	CASE_DATA_VERSION_INITIAL	1
#define	CASE_DATA_VERSION_SERD		2

static hrtime_t zfs_remove_timeout;

uu_list_pool_t *zfs_case_pool;
uu_list_t *zfs_cases;

#define	ZFS_MAKE_RSRC(type)	\
	FM_RSRC_CLASS "." ZFS_ERROR_CLASS "." type
#define	ZFS_MAKE_EREPORT(type)	\
	FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type

/*
 * Write out the persistent representation of an active case.
 */
static void
zfs_case_serialize(fmd_hdl_t *hdl, zfs_case_t *zcp)
{
	/*
	 * Always update cases to the latest version, even if they were the
	 * previous version when unserialized.
	 */
	zcp->zc_data.zc_version = CASE_DATA_VERSION_SERD;
	fmd_buf_write(hdl, zcp->zc_case, CASE_DATA, &zcp->zc_data,
	    sizeof (zcp->zc_data));

	if (zcp->zc_fru != NULL)
		fmd_buf_write(hdl, zcp->zc_case, CASE_FRU, zcp->zc_fru,
		    strlen(zcp->zc_fru));
}

/*
 * Read back the persistent representation of an active case.
 */
static zfs_case_t *
zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
{
	zfs_case_t *zcp;
	size_t frulen;

	zcp = fmd_hdl_zalloc(hdl, sizeof (zfs_case_t), FMD_SLEEP);
	zcp->zc_case = cp;

	fmd_buf_read(hdl, cp, CASE_DATA, &zcp->zc_data,
	    sizeof (zcp->zc_data));

	if (zcp->zc_data.zc_version > CASE_DATA_VERSION_SERD) {
		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
		return (NULL);
	}

	if ((frulen = fmd_buf_size(hdl, zcp->zc_case, CASE_FRU)) > 0) {
		zcp->zc_fru = fmd_hdl_alloc(hdl, frulen + 1, FMD_SLEEP);
		fmd_buf_read(hdl, zcp->zc_case, CASE_FRU, zcp->zc_fru,
		    frulen);
		zcp->zc_fru[frulen] = '\0';
	}

	/*
	 * fmd_buf_read() will have already zeroed out the remainder of the
	 * buffer, so we don't have to do anything special if the version
	 * doesn't include the SERD engine name.
	 */

	if (zcp->zc_data.zc_has_remove_timer)
		zcp->zc_remove_timer = fmd_timer_install(hdl, zcp,
		    NULL, zfs_remove_timeout);

	(void) uu_list_insert_before(zfs_cases, NULL, zcp);

	fmd_case_setspecific(hdl, cp, zcp);

	return (zcp);
}
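
/*
 * Editorial note: zfs_case_unserialize() runs in two contexts: from
 * _fmd_init(), which replays each case checkpointed by fmd across module
 * restarts, and from zfs_fm_recv(), which writes a freshly initialized
 * buffer and then calls this function so that both paths share the same
 * in-core setup.
 */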

/*
 * Iterate over any active cases. If any cases are associated with a pool or
 * vdev which is no longer present on the system, close the associated case.
 */
static void
zfs_mark_vdev(uint64_t pool_guid, nvlist_t *vd)
{
	uint64_t vdev_guid;
	uint_t c, children;
	nvlist_t **child;
	zfs_case_t *zcp;
	int ret;

	ret = nvlist_lookup_uint64(vd, ZPOOL_CONFIG_GUID, &vdev_guid);
	assert(ret == 0);

	/*
	 * Mark any cases associated with this (pool, vdev) pair.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == pool_guid &&
		    zcp->zc_data.zc_vdev_guid == vdev_guid)
			zcp->zc_present = B_TRUE;
	}

	/*
	 * Iterate over all children.
	 */
	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_CHILDREN, &child,
	    &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_mark_vdev(pool_guid, child[c]);
	}

	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_L2CACHE, &child,
	    &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_mark_vdev(pool_guid, child[c]);
	}

	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_SPARES, &child,
	    &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_mark_vdev(pool_guid, child[c]);
	}
}

/*ARGSUSED*/
static int
zfs_mark_pool(zpool_handle_t *zhp, void *unused)
{
	zfs_case_t *zcp;
	uint64_t pool_guid;
	nvlist_t *config, *vd;
	int ret;

	pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
	/*
	 * Mark any cases associated with just this pool.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == pool_guid &&
		    zcp->zc_data.zc_vdev_guid == 0)
			zcp->zc_present = B_TRUE;
	}

	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
		zpool_close(zhp);
		return (-1);
	}

	ret = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vd);
	assert(ret == 0);

	zfs_mark_vdev(pool_guid, vd);

	zpool_close(zhp);

	return (0);
}

static void
zfs_purge_cases(fmd_hdl_t *hdl)
{
	zfs_case_t *zcp;
	uu_list_walk_t *walk;
	libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);

	/*
	 * There is no way to open a pool by GUID, or lookup a vdev by GUID.
	 * No matter what we do, we're going to have to stomach an
	 * O(vdevs * cases) algorithm. In reality, both quantities are likely
	 * so small that neither will matter. Given that iterating over pools
	 * is more expensive than iterating over the in-memory case list, we
	 * opt for a 'present' flag in each case that starts off cleared. We
	 * then iterate over all pools, marking those that are still present,
	 * and remove the cases that aren't found.
	 *
	 * Note that we could also construct an FMRI and rely on
	 * fmd_nvl_fmri_present(), but this would end up doing the same search.
	 */

	/*
	 * Mark all cases as not present.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp))
		zcp->zc_present = B_FALSE;

	/*
	 * Iterate over all pools and mark the pools and vdevs found. If this
	 * fails (most probably because we're out of memory), don't close any
	 * of the cases, since we cannot be sure the 'present' flags are
	 * accurate.
	 */
	if (zpool_iter(zhdl, zfs_mark_pool, NULL) != 0)
		return;

	/*
	 * Remove those cases which were not found.
	 */
	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
	while ((zcp = uu_list_walk_next(walk)) != NULL) {
		if (!zcp->zc_present)
			fmd_case_close(hdl, zcp->zc_case);
	}
	uu_list_walk_end(walk);
}

/*
 * Construct the name of a serd engine given the pool/vdev GUID and type (io or
 * checksum).
 */
static void
zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid,
    const char *type)
{
	(void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s", pool_guid,
	    vdev_guid, type);
}
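
/*
 * For example (illustrative GUID values), a pool GUID of
 * 0x1770fa345cdb6df3 and a vdev GUID of 0x8a7d6c5b4e3f2a19 yield the engine
 * names "zfs_1770fa345cdb6df3_8a7d6c5b4e3f2a19_io" and
 * "zfs_1770fa345cdb6df3_8a7d6c5b4e3f2a19_checksum".
 */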

/*
 * Solve a given ZFS case. This first checks to make sure the diagnosis is
 * still valid, as well as cleaning up any pending timer associated with the
 * case.
 */
static void
zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname,
    boolean_t checkunusable)
{
	libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
	nvlist_t *detector, *fault;
	boolean_t serialize;
	nvlist_t *fmri, *fru;
	topo_hdl_t *thp;
	int err;

	/*
	 * Construct the detector from the case data. The detector is in the
	 * ZFS scheme, and is either the pool or the vdev, depending on whether
	 * this is a vdev or pool fault.
	 */
	detector = fmd_nvl_alloc(hdl, FMD_SLEEP);

	(void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0);
	(void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
	(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL,
	    zcp->zc_data.zc_pool_guid);
	if (zcp->zc_data.zc_vdev_guid != 0) {
		(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV,
		    zcp->zc_data.zc_vdev_guid);
	}

	/*
	 * We also want to make sure that the detector (pool or vdev) properly
	 * reflects the diagnosed state, when the fault corresponds to internal
	 * ZFS state (i.e. not checksum or I/O error-induced). Otherwise, a
	 * device which was unavailable early in boot (because the driver/file
	 * wasn't available) and is now healthy will be mis-diagnosed.
	 */
	if (!fmd_nvl_fmri_present(hdl, detector) ||
	    (checkunusable && !fmd_nvl_fmri_unusable(hdl, detector))) {
		fmd_case_close(hdl, zcp->zc_case);
		nvlist_free(detector);
		return;
	}

	fru = NULL;
	if (zcp->zc_fru != NULL &&
	    (thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION)) != NULL) {
		/*
		 * If the vdev had an associated FRU, then get the FRU nvlist
		 * from the topo handle and use that in the suspect list. We
		 * explicitly lookup the FRU because the fmri reported from the
		 * kernel may not have up to date details about the disk itself
		 * (serial, part, etc).
		 */
		if (topo_fmri_str2nvl(thp, zcp->zc_fru, &fmri, &err) == 0) {
			/*
			 * If the disk is part of the system chassis, but the
			 * FRU indicates a different chassis ID than our
			 * current system, then ignore the error. This
			 * indicates that the device was part of another
			 * cluster head, and for obvious reasons cannot be
			 * imported on this system.
			 */
			if (libzfs_fru_notself(zhdl, zcp->zc_fru)) {
				fmd_case_close(hdl, zcp->zc_case);
				nvlist_free(fmri);
				fmd_hdl_topo_rele(hdl, thp);
				nvlist_free(detector);
				return;
			}

			/*
			 * If the device is no longer present on the system, or
			 * topo_fmri_fru() fails for other reasons, then fall
			 * back to the fmri specified in the vdev.
			 */
			if (topo_fmri_fru(thp, fmri, &fru, &err) != 0)
				fru = fmd_nvl_dup(hdl, fmri, FMD_SLEEP);
			nvlist_free(fmri);
		}

		fmd_hdl_topo_rele(hdl, thp);
	}
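
	/*
	 * Editorial note: the fmd_nvl_create_fault() arguments below are, in
	 * order, the fault class, the certainty (percent), the ASRU, the
	 * FRU, and the resource; the ZFS detector serves as both the ASRU
	 * and the resource, and the diagnosis is asserted with 100%
	 * certainty.
	 */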
	fault = fmd_nvl_create_fault(hdl, faultname, 100, detector,
	    fru, detector);
	fmd_case_add_suspect(hdl, zcp->zc_case, fault);

	nvlist_free(fru);

	fmd_case_solve(hdl, zcp->zc_case);

	serialize = B_FALSE;
	if (zcp->zc_data.zc_has_remove_timer) {
		fmd_timer_remove(hdl, zcp->zc_remove_timer);
		zcp->zc_data.zc_has_remove_timer = 0;
		serialize = B_TRUE;
	}
	if (serialize)
		zfs_case_serialize(hdl, zcp);

	nvlist_free(detector);
}

/*
 * Main fmd entry point.
 */
/*ARGSUSED*/
static void
zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
{
	zfs_case_t *zcp, *dcp;
	int32_t pool_state;
	uint64_t ena, pool_guid, vdev_guid;
	nvlist_t *detector;
	boolean_t isresource;
	char *fru, *type;

	/*
	 * We subscribe to notifications for vdev or pool removal. In these
	 * cases, there may be open cases that no longer apply; purge them.
	 */
	if (fmd_nvl_class_match(hdl, nvl, "resource.sysevent.EC_zfs.*")) {
		zfs_purge_cases(hdl);
		return;
	}

	isresource = fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.*");

	if (isresource) {
		/*
		 * For resources, we don't have a normal payload.
		 */
		if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
		    &vdev_guid) != 0)
			pool_state = SPA_LOAD_OPEN;
		else
			pool_state = SPA_LOAD_NONE;
		detector = NULL;
	} else {
		(void) nvlist_lookup_nvlist(nvl,
		    FM_EREPORT_DETECTOR, &detector);
		(void) nvlist_lookup_int32(nvl,
		    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, &pool_state);
	}

	/*
	 * We also ignore all ereports generated during an import of a pool,
	 * since the only possible fault (.pool) would result in import
	 * failure, and hence no persistent fault. Some day we may want to do
	 * something with these ereports, so we continue generating them
	 * internally.
	 */
	if (pool_state == SPA_LOAD_IMPORT)
		return;

	/*
	 * Device I/O errors are ignored during pool open.
	 */
	if (pool_state == SPA_LOAD_OPEN &&
	    (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))))
		return;

	/*
	 * We ignore ereports for anything except disks and files.
	 */
	if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
	    &type) == 0) {
		if (strcmp(type, VDEV_TYPE_DISK) != 0 &&
		    strcmp(type, VDEV_TYPE_FILE) != 0)
			return;
	}

	/*
	 * Determine if this ereport corresponds to an open case. Previous
	 * incarnations of this DE used the ENA to chain events together as
	 * part of the same case. The problem with this is that we rely on
	 * global uniqueness of cases based on (pool_guid, vdev_guid) pair when
	 * generating SERD engines. Instead, we have a case for each vdev or
	 * pool, regardless of the ENA.
	 */
	(void) nvlist_lookup_uint64(nvl,
	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid);
	if (nvlist_lookup_uint64(nvl,
	    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
		vdev_guid = 0;
	if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0)
		ena = 0;

	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == pool_guid &&
		    zcp->zc_data.zc_vdev_guid == vdev_guid)
			break;
	}

	if (zcp == NULL) {
		fmd_case_t *cs;
		zfs_case_data_t data = { 0 };

		/*
		 * If this is one of our 'fake' resource ereports, and there is
		 * no case open, simply discard it.
		 */
		if (isresource)
			return;

		/*
		 * Open a new case.
		 */
		cs = fmd_case_open(hdl, NULL);

		/*
		 * Initialize the case buffer. To commonize code, we actually
		 * create the buffer with existing data, and then call
		 * zfs_case_unserialize() to instantiate the in-core structure.
		 */
		fmd_buf_create(hdl, cs, CASE_DATA,
		    sizeof (zfs_case_data_t));

		data.zc_version = CASE_DATA_VERSION_SERD;
		data.zc_ena = ena;
		data.zc_pool_guid = pool_guid;
		data.zc_vdev_guid = vdev_guid;
		data.zc_pool_state = (int)pool_state;

		fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data));

		zcp = zfs_case_unserialize(hdl, cs);
		assert(zcp != NULL);
	}

	/*
	 * If this is an ereport for a case with an associated vdev FRU, make
	 * sure it is accurate and up to date.
	 */
	if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
	    &fru) == 0) {
		topo_hdl_t *thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION);
		if (zcp->zc_fru == NULL ||
		    !topo_fmri_strcmp(thp, zcp->zc_fru, fru)) {
			if (zcp->zc_fru != NULL) {
				fmd_hdl_strfree(hdl, zcp->zc_fru);
				fmd_buf_destroy(hdl, zcp->zc_case, CASE_FRU);
			}
			zcp->zc_fru = fmd_hdl_strdup(hdl, fru, FMD_SLEEP);
			zfs_case_serialize(hdl, zcp);
		}
		fmd_hdl_topo_rele(hdl, thp);
	}

	if (isresource) {
		if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_RSRC(FM_RESOURCE_AUTOREPLACE))) {
			/*
			 * The 'resource.fs.zfs.autoreplace' event indicates
			 * that the pool was loaded with the 'autoreplace'
			 * property set. In this case, any pending device
			 * failures should be ignored, as the asynchronous
			 * autoreplace handling will take care of them.
			 */
			fmd_case_close(hdl, zcp->zc_case);
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED))) {
			/*
			 * The 'resource.fs.zfs.removed' event indicates that
			 * device removal was detected, and the device was
			 * closed asynchronously. If this is the case, we
			 * assume that any recent I/O errors were due to the
			 * device removal, not any fault of the device itself.
			 * We reset the SERD engines, and cancel any pending
			 * timers.
			 */
			if (zcp->zc_data.zc_has_remove_timer) {
				fmd_timer_remove(hdl, zcp->zc_remove_timer);
				zcp->zc_data.zc_has_remove_timer = 0;
				zfs_case_serialize(hdl, zcp);
			}
			if (zcp->zc_data.zc_serd_io[0] != '\0')
				fmd_serd_reset(hdl,
				    zcp->zc_data.zc_serd_io);
			if (zcp->zc_data.zc_serd_checksum[0] != '\0')
				fmd_serd_reset(hdl,
				    zcp->zc_data.zc_serd_checksum);
		}
		return;
	}

	/*
	 * Associate the ereport with this case.
	 */
	fmd_case_add_ereport(hdl, zcp->zc_case, ep);

	/*
	 * Don't do anything else if this case is already solved.
	 */
	if (fmd_case_solved(hdl, zcp->zc_case))
		return;

	/*
	 * Determine if we should solve the case and generate a fault. We solve
	 * a case if:
	 *
	 *	a. A pool failed to open (ereport.fs.zfs.pool)
	 *	b. A device failed to open (ereport.fs.zfs.vdev.*) while a
	 *	   pool was up and running.
	 *
	 * We may see a series of ereports associated with a pool open, all
	 * chained together by the same ENA. If the pool open succeeds, then
	 * we'll see no further ereports. To detect when a pool open has
	 * succeeded, we associate a timer with the event. When it expires, we
	 * close the case.
	 */
	if (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_POOL))) {
		/*
		 * Pool level fault. Before solving the case, go through and
		 * close any open device cases that may be pending.
		 */
		for (dcp = uu_list_first(zfs_cases); dcp != NULL;
		    dcp = uu_list_next(zfs_cases, dcp)) {
			if (dcp->zc_data.zc_pool_guid ==
			    zcp->zc_data.zc_pool_guid &&
			    dcp->zc_data.zc_vdev_guid != 0)
				fmd_case_close(hdl, dcp->zc_case);
		}

		zfs_case_solve(hdl, zcp, "fault.fs.zfs.pool", B_TRUE);
	} else if (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_LOG_REPLAY))) {
		/*
		 * Pool level fault for reading the intent logs.
		 */
		zfs_case_solve(hdl, zcp, "fault.fs.zfs.log_replay", B_TRUE);
	} else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*")) {
		/*
		 * Device fault.
		 */
		zfs_case_solve(hdl, zcp, "fault.fs.zfs.device", B_TRUE);
	} else if (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
		char *failmode = NULL;
		boolean_t checkremove = B_FALSE;

		/*
		 * If this is a checksum or I/O error, then toss it into the
		 * appropriate SERD engine and check to see if it has fired.
		 * Ideally, we want to do something more sophisticated
		 * (persistent errors for a single data block, etc). For now,
		 * a single SERD engine is sufficient.
		 */
		if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO))) {
			if (zcp->zc_data.zc_serd_io[0] == '\0') {
				zfs_serd_name(zcp->zc_data.zc_serd_io,
				    pool_guid, vdev_guid, "io");
				fmd_serd_create(hdl, zcp->zc_data.zc_serd_io,
				    fmd_prop_get_int32(hdl, "io_N"),
				    fmd_prop_get_int64(hdl, "io_T"));
				zfs_case_serialize(hdl, zcp);
			}
			if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep))
				checkremove = B_TRUE;
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
			if (zcp->zc_data.zc_serd_checksum[0] == '\0') {
				zfs_serd_name(zcp->zc_data.zc_serd_checksum,
				    pool_guid, vdev_guid, "checksum");
				fmd_serd_create(hdl,
				    zcp->zc_data.zc_serd_checksum,
				    fmd_prop_get_int32(hdl, "checksum_N"),
				    fmd_prop_get_int64(hdl, "checksum_T"));
				zfs_case_serialize(hdl, zcp);
			}
			if (fmd_serd_record(hdl,
			    zcp->zc_data.zc_serd_checksum, ep)) {
				zfs_case_solve(hdl, zcp,
				    "fault.fs.zfs.vdev.checksum", B_FALSE);
			}
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) &&
		    (nvlist_lookup_string(nvl,
		    FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, &failmode) == 0) &&
		    failmode != NULL) {
			if (strncmp(failmode, FM_EREPORT_FAILMODE_CONTINUE,
			    strlen(FM_EREPORT_FAILMODE_CONTINUE)) == 0) {
				zfs_case_solve(hdl, zcp,
				    "fault.fs.zfs.io_failure_continue",
				    B_FALSE);
			} else if (strncmp(failmode, FM_EREPORT_FAILMODE_WAIT,
			    strlen(FM_EREPORT_FAILMODE_WAIT)) == 0) {
				zfs_case_solve(hdl, zcp,
				    "fault.fs.zfs.io_failure_wait", B_FALSE);
			}
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
			checkremove = B_TRUE;
		}

		/*
		 * Because I/O errors may be due to device removal, we postpone
		 * any diagnosis until we're sure that we aren't about to
		 * receive a 'resource.fs.zfs.removed' event.
		 */
		if (checkremove) {
			if (zcp->zc_data.zc_has_remove_timer)
				fmd_timer_remove(hdl, zcp->zc_remove_timer);
			zcp->zc_remove_timer = fmd_timer_install(hdl, zcp,
			    NULL, zfs_remove_timeout);
			if (!zcp->zc_data.zc_has_remove_timer) {
				zcp->zc_data.zc_has_remove_timer = 1;
				zfs_case_serialize(hdl, zcp);
			}
		}
	}
}

/*
 * The timeout is fired when we diagnosed an I/O error, and it was not due to
 * device removal (which would cause the timeout to be cancelled).
 */
/* ARGSUSED */
static void
zfs_fm_timeout(fmd_hdl_t *hdl, id_t id, void *data)
{
	zfs_case_t *zcp = data;

	if (id == zcp->zc_remove_timer)
		zfs_case_solve(hdl, zcp, "fault.fs.zfs.vdev.io", B_FALSE);
}

static void
zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
{
	zfs_case_t *zcp = fmd_case_getspecific(hdl, cs);

	if (zcp->zc_data.zc_serd_checksum[0] != '\0')
		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum);
	if (zcp->zc_data.zc_serd_io[0] != '\0')
		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io);
	if (zcp->zc_data.zc_has_remove_timer)
		fmd_timer_remove(hdl, zcp->zc_remove_timer);
	uu_list_remove(zfs_cases, zcp);
	fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
}

/*
 * We use the fmd gc entry point to look for old cases that no longer apply.
 * This allows us to keep our set of case data small in a long running system.
 */
static void
zfs_fm_gc(fmd_hdl_t *hdl)
{
	zfs_purge_cases(hdl);
}

static const fmd_hdl_ops_t fmd_ops = {
	zfs_fm_recv,	/* fmdo_recv */
	zfs_fm_timeout,	/* fmdo_timeout */
	zfs_fm_close,	/* fmdo_close */
	NULL,		/* fmdo_stats */
	zfs_fm_gc,	/* fmdo_gc */
};

static const fmd_prop_t fmd_props[] = {
	{ "checksum_N", FMD_TYPE_UINT32, "10" },
	{ "checksum_T", FMD_TYPE_TIME, "10min" },
	{ "io_N", FMD_TYPE_UINT32, "10" },
	{ "io_T", FMD_TYPE_TIME, "10min" },
	{ "remove_timeout", FMD_TYPE_TIME, "15sec" },
	{ NULL, 0, NULL }
};
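
/*
 * Editorial note on the defaults above: roughly, ten errors of a given type
 * within a ten-minute window trip the corresponding SERD engine, and an I/O
 * diagnosis is then held for remove_timeout (15 seconds) in case a
 * 'resource.fs.zfs.removed' event arrives to explain the errors away.
 */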

static const fmd_hdl_info_t fmd_info = {
	"ZFS Diagnosis Engine", "1.0", &fmd_ops, fmd_props
};

void
_fmd_init(fmd_hdl_t *hdl)
{
	fmd_case_t *cp;
	libzfs_handle_t *zhdl;

	if ((zhdl = libzfs_init()) == NULL)
		return;

	if ((zfs_case_pool = uu_list_pool_create("zfs_case_pool",
	    sizeof (zfs_case_t), offsetof(zfs_case_t, zc_node),
	    NULL, 0)) == NULL) {
		libzfs_fini(zhdl);
		return;
	}

	if ((zfs_cases = uu_list_create(zfs_case_pool, NULL, 0)) == NULL) {
		uu_list_pool_destroy(zfs_case_pool);
		libzfs_fini(zhdl);
		return;
	}

	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
		uu_list_destroy(zfs_cases);
		uu_list_pool_destroy(zfs_case_pool);
		libzfs_fini(zhdl);
		return;
	}

	fmd_hdl_setspecific(hdl, zhdl);

	/*
	 * Iterate over all active cases and unserialize the associated
	 * buffers, adding them to our list of open cases.
	 */
	for (cp = fmd_case_next(hdl, NULL);
	    cp != NULL; cp = fmd_case_next(hdl, cp))
		(void) zfs_case_unserialize(hdl, cp);

	/*
	 * Clear out any old cases that are no longer valid.
	 */
	zfs_purge_cases(hdl);

	zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout");
}
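
/*
 * A hypothetical zfs-diagnosis.conf stanza overriding the defaults
 * (illustrative, assuming the standard fmd 'setprop' directive); the
 * fmd_prop_get_*() calls above and in zfs_fm_recv() would then return the
 * tuned values:
 *
 *	setprop io_N 5
 *	setprop io_T 5min
 *	setprop remove_timeout 30sec
 */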

void
_fmd_fini(fmd_hdl_t *hdl)
{
	zfs_case_t *zcp;
	uu_list_walk_t *walk;
	libzfs_handle_t *zhdl;

	/*
	 * Remove all active cases.
	 */
	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
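	/*
	 * (Editorial note: UU_WALK_ROBUST permits uu_list_remove() on the
	 * element currently being visited without invalidating the walk.)
	 */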
	while ((zcp = uu_list_walk_next(walk)) != NULL) {
		uu_list_remove(zfs_cases, zcp);
		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
	}
	uu_list_walk_end(walk);

	uu_list_destroy(zfs_cases);
	uu_list_pool_destroy(zfs_case_pool);

	zhdl = fmd_hdl_getspecific(hdl);
	libzfs_fini(zhdl);
}