/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright 2019 Joyent, Inc.
 */

/*
 * The ZFS retire agent is responsible for managing hot spares across all
 * pools.  When we see a device fault or a device removal, we try to open the
 * associated pool and look for any hot spares.  We iterate over any available
 * hot spares and attempt a 'zpool replace' for each one.
 *
 * For vdevs diagnosed as faulty, the agent is also responsible for proactively
 * marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors).
 */

#include <fm/fmd_api.h>
#include <sys/fs/zfs.h>
#include <sys/fm/protocol.h>
#include <sys/fm/fs/zfs.h>
#include <libzfs.h>
#include <fm/libtopo.h>
#include <string.h>

typedef struct zfs_retire_repaired {
	struct zfs_retire_repaired	*zrr_next;
	uint64_t			zrr_pool;
	uint64_t			zrr_vdev;
} zfs_retire_repaired_t;

typedef struct zfs_retire_data {
	libzfs_handle_t			*zrd_hdl;
	zfs_retire_repaired_t		*zrd_repaired;
} zfs_retire_data_t;

static void
zfs_retire_clear_data(fmd_hdl_t *hdl, zfs_retire_data_t *zdp)
{
	zfs_retire_repaired_t *zrp;

	while ((zrp = zdp->zrd_repaired) != NULL) {
		zdp->zrd_repaired = zrp->zrr_next;
		fmd_hdl_free(hdl, zrp, sizeof (zfs_retire_repaired_t));
	}
}

/*
 * Find a pool with a matching GUID.
 */
typedef struct find_cbdata {
	uint64_t	cb_guid;
	const char	*cb_fru;
	zpool_handle_t	*cb_zhp;
	nvlist_t	*cb_vdev;
} find_cbdata_t;

static int
find_pool(zpool_handle_t *zhp, void *data)
{
	find_cbdata_t *cbp = data;

	if (cbp->cb_guid ==
	    zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL)) {
		cbp->cb_zhp = zhp;
		return (1);
	}

	zpool_close(zhp);
	return (0);
}

/*
 * Find a vdev within a tree with a matching GUID.
 */
static nvlist_t *
find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, const char *search_fru,
    uint64_t search_guid)
{
	uint64_t guid;
	nvlist_t **child;
	uint_t c, children;
	nvlist_t *ret;
	char *fru;

	if (search_fru != NULL) {
		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &fru) == 0 &&
		    libzfs_fru_compare(zhdl, fru, search_fru))
			return (nv);
	} else {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 &&
		    guid == search_guid)
			return (nv);
	}

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0)
		return (NULL);

	for (c = 0; c < children; c++) {
		if ((ret = find_vdev(zhdl, child[c], search_fru,
		    search_guid)) != NULL)
			return (ret);
	}

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
	    &child, &children) == 0) {
		for (c = 0; c < children; c++) {
			if ((ret = find_vdev(zhdl, child[c], search_fru,
			    search_guid)) != NULL)
				return (ret);
		}
	}

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
	    &child, &children) == 0) {
		for (c = 0; c < children; c++) {
			if ((ret = find_vdev(zhdl, child[c], search_fru,
			    search_guid)) != NULL)
				return (ret);
		}
	}

	return (NULL);
}

/*
 * Given a (pool, vdev) GUID pair, find the matching pool and vdev.
 */
static zpool_handle_t *
find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid,
    nvlist_t **vdevp)
{
	find_cbdata_t cb;
	zpool_handle_t *zhp;
	nvlist_t *config, *nvroot;

	/*
	 * Find the corresponding pool and make sure the vdev still exists.
	 */
	cb.cb_guid = pool_guid;
	if (zpool_iter(zhdl, find_pool, &cb) != 1)
		return (NULL);

	zhp = cb.cb_zhp;
	config = zpool_get_config(zhp, NULL);
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) != 0) {
		zpool_close(zhp);
		return (NULL);
	}

	if (vdev_guid != 0) {
		if ((*vdevp = find_vdev(zhdl, nvroot, NULL,
		    vdev_guid)) == NULL) {
			zpool_close(zhp);
			return (NULL);
		}
	}

	return (zhp);
}

static int
search_pool(zpool_handle_t *zhp, void *data)
{
	find_cbdata_t *cbp = data;
	nvlist_t *config;
	nvlist_t *nvroot;

	config = zpool_get_config(zhp, NULL);
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) != 0) {
		zpool_close(zhp);
		return (0);
	}

	if ((cbp->cb_vdev = find_vdev(zpool_get_handle(zhp), nvroot,
	    cbp->cb_fru, 0)) != NULL) {
		cbp->cb_zhp = zhp;
		return (1);
	}

	zpool_close(zhp);
	return (0);
}

/*
 * Given a FRU FMRI, find the matching pool and vdev.
 */
static zpool_handle_t *
find_by_fru(libzfs_handle_t *zhdl, const char *fru, nvlist_t **vdevp)
{
	find_cbdata_t cb;

	cb.cb_fru = fru;
	cb.cb_zhp = NULL;
	if (zpool_iter(zhdl, search_pool, &cb) != 1)
		return (NULL);

	*vdevp = cb.cb_vdev;
	return (cb.cb_zhp);
}

/*
 * Given a vdev, attempt to replace it with every known spare until one
 * succeeds.
 */
static void
replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
{
	nvlist_t *config, *nvroot, *replacement;
	nvlist_t **spares;
	uint_t s, nspares;
	char *dev_name;
	zprop_source_t source;
	int ashift;
	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
	libzfs_handle_t *zhdl = zdp->zrd_hdl;

	config = zpool_get_config(zhp, NULL);
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) != 0)
		return;

	/*
	 * Find out if there are any hot spares available in the pool.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0)
		return;

	/*
	 * Look up the "ashift" pool property; we may need it for the
	 * replacement.
	 */
	ashift = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &source);

	replacement = fmd_nvl_alloc(hdl, FMD_SLEEP);

	(void) nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT);

	dev_name = zpool_vdev_name(zhdl, zhp, vdev, B_FALSE);

	/*
	 * Try to replace each spare, ending when we successfully
	 * replace it.
	 */
	for (s = 0; s < nspares; s++) {
		char *spare_name;

		if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH,
		    &spare_name) != 0)
			continue;

		/* if set, add the "ashift" pool property to the spare nvlist */
		if (source != ZPROP_SRC_DEFAULT)
			(void) nvlist_add_uint64(spares[s],
			    ZPOOL_CONFIG_ASHIFT, ashift);

		(void) nvlist_add_nvlist_array(replacement,
		    ZPOOL_CONFIG_CHILDREN, &spares[s], 1);

		if (zpool_vdev_attach(zhp, dev_name, spare_name,
		    replacement, B_TRUE) == 0)
			break;
	}

	free(dev_name);
	nvlist_free(replacement);
}

/*
 * Repair this vdev if we had diagnosed a 'fault.fs.zfs.device' and the
 * ASRU is now usable.  ZFS has found the device to be present and
 * functioning.
 */
/*ARGSUSED*/
void
zfs_vdev_repair(fmd_hdl_t *hdl, nvlist_t *nvl)
{
	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
	zfs_retire_repaired_t *zrp;
	uint64_t pool_guid, vdev_guid;
	nvlist_t *asru;

	if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
	    &pool_guid) != 0 || nvlist_lookup_uint64(nvl,
	    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
		return;

	/*
	 * Before checking the state of the ASRU, go through and see if we've
	 * already made an attempt to repair this ASRU.  This list is cleared
	 * whenever we receive any kind of list event, and is designed to
	 * prevent us from generating a feedback loop when we attempt repairs
	 * against a faulted pool.  The problem is that checking the unusable
	 * state of the ASRU can involve opening the pool, which can post
	 * statechange events but otherwise leave the pool in the faulted
	 * state.  This list allows us to detect when a statechange event is
	 * due to our own request.
	 */
	for (zrp = zdp->zrd_repaired; zrp != NULL; zrp = zrp->zrr_next) {
		if (zrp->zrr_pool == pool_guid &&
		    zrp->zrr_vdev == vdev_guid)
			return;
	}

	asru = fmd_nvl_alloc(hdl, FMD_SLEEP);

	(void) nvlist_add_uint8(asru, FM_VERSION, ZFS_SCHEME_VERSION0);
	(void) nvlist_add_string(asru, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
	(void) nvlist_add_uint64(asru, FM_FMRI_ZFS_POOL, pool_guid);
	(void) nvlist_add_uint64(asru, FM_FMRI_ZFS_VDEV, vdev_guid);

	/*
	 * We explicitly check for the unusable state here to make sure we
	 * aren't responding to a transient state change.  As part of opening a
	 * vdev, it's possible to see the 'statechange' event, only to be
	 * followed by a vdev failure later.  If we don't check the current
	 * state of the vdev (or pool) before marking it repaired, then we risk
	 * generating spurious repair events followed immediately by the same
	 * diagnosis.
	 *
	 * This assumes that the ZFS scheme code associates unusable (i.e.
	 * isolated) with its own definition of faulty state.  In the case of a
	 * DEGRADED leaf vdev (due to checksum errors), this is not the case.
	 * This works, however, because the transient state change is not
	 * posted in this case.  This could be made more explicit by not
	 * relying on the scheme's unusable callback and instead directly
	 * checking the vdev state, where we could correctly account for
	 * DEGRADED state.
	 */
	if (!fmd_nvl_fmri_unusable(hdl, asru) && fmd_nvl_fmri_has_fault(hdl,
	    asru, FMD_HAS_FAULT_ASRU, NULL)) {
		topo_hdl_t *thp;
		char *fmri = NULL;
		int err;

		thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION);
		if (topo_fmri_nvl2str(thp, asru, &fmri, &err) == 0)
			(void) fmd_repair_asru(hdl, fmri);

		/* Free the FMRI string before releasing the topo handle. */
		topo_hdl_strfree(thp, fmri);
		fmd_hdl_topo_rele(hdl, thp);
	}
	nvlist_free(asru);

	zrp = fmd_hdl_alloc(hdl, sizeof (zfs_retire_repaired_t), FMD_SLEEP);
	zrp->zrr_next = zdp->zrd_repaired;
	zrp->zrr_pool = pool_guid;
	zrp->zrr_vdev = vdev_guid;
	zdp->zrd_repaired = zrp;
}

/*ARGSUSED*/
static void
zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
    const char *class)
{
	uint64_t pool_guid, vdev_guid;
	zpool_handle_t *zhp;
	nvlist_t *resource, *fault, *fru;
	nvlist_t **faults;
	uint_t f, nfaults;
	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
	libzfs_handle_t *zhdl = zdp->zrd_hdl;
	boolean_t fault_device, degrade_device;
	boolean_t is_repair;
	char *scheme, *fmri;
	nvlist_t *vdev;
	char *uuid;
	int repair_done = 0;
	boolean_t retire;
	boolean_t is_disk;
	vdev_aux_t aux;
	topo_hdl_t *thp;
	int err;

	/*
	 * If this is a resource notifying us of device removal, then simply
	 * check for an available spare and continue.
	 */
	if (strcmp(class, "resource.fs.zfs.removed") == 0) {
		if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
		    &pool_guid) != 0 ||
		    nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
		    &vdev_guid) != 0)
			return;

		if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid,
		    &vdev)) == NULL)
			return;

		if (fmd_prop_get_int32(hdl, "spare_on_remove"))
			replace_with_spare(hdl, zhp, vdev);
		zpool_close(zhp);
		return;
	}

	if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0)
		return;

	if (strcmp(class, "resource.fs.zfs.statechange") == 0 ||
	    strcmp(class,
	    "resource.sysevent.EC_zfs.ESC_ZFS_vdev_remove") == 0) {
		zfs_vdev_repair(hdl, nvl);
		return;
	}

	/*
	 * Any other kind of list event clears the record of repairs we've
	 * already attempted (see zfs_vdev_repair()).
	 */
	zfs_retire_clear_data(hdl, zdp);

	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0)
		is_repair = B_TRUE;
	else
		is_repair = B_FALSE;

	/*
	 * We subscribe to zfs faults as well as all repair events.
	 */
	if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
	    &faults, &nfaults) != 0)
		return;

	for (f = 0; f < nfaults; f++) {
		fault = faults[f];

		fault_device = B_FALSE;
		degrade_device = B_FALSE;
		is_disk = B_FALSE;

		if (nvlist_lookup_boolean_value(fault, FM_SUSPECT_RETIRE,
		    &retire) == 0 && retire == 0)
			continue;

		if (fmd_nvl_class_match(hdl, fault,
		    "fault.io.disk.ssm-wearout") &&
		    fmd_prop_get_int32(hdl, "ssm_wearout_skip_retire") ==
		    FMD_B_TRUE) {
			fmd_hdl_debug(hdl, "zfs-retire: ignoring SSM fault");
			continue;
		}

		/*
		 * While we subscribe to fault.fs.zfs.*, we only take action
		 * for faults targeting a specific vdev (open failure or SERD
		 * failure).  We also subscribe to fault.io.* events, so that
		 * faulty disks will be faulted in the ZFS configuration.
		 */
		if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.io")) {
			fault_device = B_TRUE;
		} else if (fmd_nvl_class_match(hdl, fault,
		    "fault.fs.zfs.vdev.checksum")) {
			degrade_device = B_TRUE;
		} else if (fmd_nvl_class_match(hdl, fault,
		    "fault.fs.zfs.device")) {
			fault_device = B_FALSE;
		} else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) {
			is_disk = B_TRUE;
			fault_device = B_TRUE;
		} else {
			continue;
		}

		if (is_disk) {
			/*
			 * This is a disk fault.  Look up the FRU, convert it
			 * to an FMRI string, and attempt to find a matching
			 * vdev.
			 */
			if (nvlist_lookup_nvlist(fault, FM_FAULT_FRU,
			    &fru) != 0 ||
			    nvlist_lookup_string(fru, FM_FMRI_SCHEME,
			    &scheme) != 0)
				continue;

			if (strcmp(scheme, FM_FMRI_SCHEME_HC) != 0)
				continue;

			thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION);
			if (topo_fmri_nvl2str(thp, fru, &fmri, &err) != 0) {
				fmd_hdl_topo_rele(hdl, thp);
				continue;
			}

			zhp = find_by_fru(zhdl, fmri, &vdev);
			topo_hdl_strfree(thp, fmri);
			fmd_hdl_topo_rele(hdl, thp);

			if (zhp == NULL)
				continue;

			(void) nvlist_lookup_uint64(vdev,
			    ZPOOL_CONFIG_GUID, &vdev_guid);
			aux = VDEV_AUX_EXTERNAL;
		} else {
			/*
			 * This is a ZFS fault.  Look up the resource and
			 * attempt to find the matching vdev.
			 */
			if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE,
			    &resource) != 0 ||
			    nvlist_lookup_string(resource, FM_FMRI_SCHEME,
			    &scheme) != 0)
				continue;

			if (strcmp(scheme, FM_FMRI_SCHEME_ZFS) != 0)
				continue;

			if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL,
			    &pool_guid) != 0)
				continue;

			if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV,
			    &vdev_guid) != 0) {
				if (is_repair)
					vdev_guid = 0;
				else
					continue;
			}

			if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid,
			    &vdev)) == NULL)
				continue;

			aux = VDEV_AUX_ERR_EXCEEDED;
		}

		if (vdev_guid == 0) {
			/*
			 * For pool-level repair events, clear the entire pool.
			 */
			(void) zpool_clear(zhp, NULL, NULL);
			zpool_close(zhp);
			continue;
		}

		/*
		 * If this is a repair event, then mark the vdev as repaired
		 * and continue.
		 */
		if (is_repair) {
			repair_done = 1;
			(void) zpool_vdev_clear(zhp, vdev_guid);
			zpool_close(zhp);
			continue;
		}

		/*
		 * Actively fault the device if needed.
		 */
		if (fault_device)
			(void) zpool_vdev_fault(zhp, vdev_guid, aux);
		if (degrade_device)
			(void) zpool_vdev_degrade(zhp, vdev_guid, aux);

		/*
		 * Attempt to substitute a hot spare.
		 */
		replace_with_spare(hdl, zhp, vdev);
		zpool_close(zhp);
	}

	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && repair_done &&
	    nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0)
		fmd_case_uuresolved(hdl, uuid);
}

static const fmd_hdl_ops_t fmd_ops = {
	zfs_retire_recv,	/* fmdo_recv */
	NULL,			/* fmdo_timeout */
	NULL,			/* fmdo_close */
	NULL,			/* fmdo_stats */
	NULL,			/* fmdo_gc */
};

static const fmd_prop_t fmd_props[] = {
	{ "spare_on_remove", FMD_TYPE_BOOL, "true" },
	{ "ssm_wearout_skip_retire", FMD_TYPE_BOOL, "true" },
	{ NULL, 0, NULL }
};

static const fmd_hdl_info_t fmd_info = {
	"ZFS Retire Agent", "1.0", &fmd_ops, fmd_props
};

void
_fmd_init(fmd_hdl_t *hdl)
{
	zfs_retire_data_t *zdp;
	libzfs_handle_t *zhdl;

	if ((zhdl = libzfs_init()) == NULL)
		return;

	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
		libzfs_fini(zhdl);
		return;
	}

	zdp = fmd_hdl_zalloc(hdl, sizeof (zfs_retire_data_t), FMD_SLEEP);
	zdp->zrd_hdl = zhdl;

	fmd_hdl_setspecific(hdl, zdp);
}

void
_fmd_fini(fmd_hdl_t *hdl)
{
	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);

	if (zdp != NULL) {
		zfs_retire_clear_data(hdl, zdp);
		libzfs_fini(zdp->zrd_hdl);
		fmd_hdl_free(hdl, zdp, sizeof (zfs_retire_data_t));
	}
}