1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 /* 25 * Copyright 2019 Joyent, Inc. 26 */ 27 28 /* 29 * The ZFS retire agent is responsible for managing hot spares across all pools. 30 * When we see a device fault or a device removal, we try to open the associated 31 * pool and look for any hot spares. We iterate over any available hot spares 32 * and attempt a 'zpool replace' for each one. 33 * 34 * For vdevs diagnosed as faulty, the agent is also responsible for proactively 35 * marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors). 36 */ 37 38 #include <fm/fmd_api.h> 39 #include <sys/fs/zfs.h> 40 #include <sys/fm/protocol.h> 41 #include <sys/fm/fs/zfs.h> 42 #include <libzfs.h> 43 #include <fm/libtopo.h> 44 #include <string.h> 45 46 typedef struct zfs_retire_repaired { 47 struct zfs_retire_repaired *zrr_next; 48 uint64_t zrr_pool; 49 uint64_t zrr_vdev; 50 } zfs_retire_repaired_t; 51 52 typedef struct zfs_retire_data { 53 libzfs_handle_t *zrd_hdl; 54 zfs_retire_repaired_t *zrd_repaired; 55 } zfs_retire_data_t; 56 57 static void 58 zfs_retire_clear_data(fmd_hdl_t *hdl, zfs_retire_data_t *zdp) 59 { 60 zfs_retire_repaired_t *zrp; 61 62 while ((zrp = zdp->zrd_repaired) != NULL) { 63 zdp->zrd_repaired = zrp->zrr_next; 64 fmd_hdl_free(hdl, zrp, sizeof (zfs_retire_repaired_t)); 65 } 66 } 67 68 /* 69 * Find a pool with a matching GUID. 70 */ 71 typedef struct find_cbdata { 72 uint64_t cb_guid; 73 const char *cb_fru; 74 zpool_handle_t *cb_zhp; 75 nvlist_t *cb_vdev; 76 } find_cbdata_t; 77 78 static int 79 find_pool(zpool_handle_t *zhp, void *data) 80 { 81 find_cbdata_t *cbp = data; 82 83 if (cbp->cb_guid == 84 zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL)) { 85 cbp->cb_zhp = zhp; 86 return (1); 87 } 88 89 zpool_close(zhp); 90 return (0); 91 } 92 93 /* 94 * Find a vdev within a tree with a matching GUID. 95 */ 96 static nvlist_t * 97 find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, const char *search_fru, 98 uint64_t search_guid) 99 { 100 uint64_t guid; 101 nvlist_t **child; 102 uint_t c, children; 103 nvlist_t *ret; 104 char *fru; 105 106 if (search_fru != NULL) { 107 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &fru) == 0 && 108 libzfs_fru_compare(zhdl, fru, search_fru)) 109 return (nv); 110 } else { 111 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && 112 guid == search_guid) 113 return (nv); 114 } 115 116 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 117 &child, &children) != 0) 118 return (NULL); 119 120 for (c = 0; c < children; c++) { 121 if ((ret = find_vdev(zhdl, child[c], search_fru, 122 search_guid)) != NULL) 123 return (ret); 124 } 125 126 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, 127 &child, &children) != 0) 128 return (NULL); 129 130 for (c = 0; c < children; c++) { 131 if ((ret = find_vdev(zhdl, child[c], search_fru, 132 search_guid)) != NULL) 133 return (ret); 134 } 135 136 return (NULL); 137 } 138 139 /* 140 * Given a (pool, vdev) GUID pair, find the matching pool and vdev. 141 */ 142 static zpool_handle_t * 143 find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, 144 nvlist_t **vdevp) 145 { 146 find_cbdata_t cb; 147 zpool_handle_t *zhp; 148 nvlist_t *config, *nvroot; 149 150 /* 151 * Find the corresponding pool and make sure the vdev still exists. 152 */ 153 cb.cb_guid = pool_guid; 154 if (zpool_iter(zhdl, find_pool, &cb) != 1) 155 return (NULL); 156 157 zhp = cb.cb_zhp; 158 config = zpool_get_config(zhp, NULL); 159 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 160 &nvroot) != 0) { 161 zpool_close(zhp); 162 return (NULL); 163 } 164 165 if (vdev_guid != 0) { 166 if ((*vdevp = find_vdev(zhdl, nvroot, NULL, 167 vdev_guid)) == NULL) { 168 zpool_close(zhp); 169 return (NULL); 170 } 171 } 172 173 return (zhp); 174 } 175 176 static int 177 search_pool(zpool_handle_t *zhp, void *data) 178 { 179 find_cbdata_t *cbp = data; 180 nvlist_t *config; 181 nvlist_t *nvroot; 182 183 config = zpool_get_config(zhp, NULL); 184 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 185 &nvroot) != 0) { 186 zpool_close(zhp); 187 return (0); 188 } 189 190 if ((cbp->cb_vdev = find_vdev(zpool_get_handle(zhp), nvroot, 191 cbp->cb_fru, 0)) != NULL) { 192 cbp->cb_zhp = zhp; 193 return (1); 194 } 195 196 zpool_close(zhp); 197 return (0); 198 } 199 200 /* 201 * Given a FRU FMRI, find the matching pool and vdev. 202 */ 203 static zpool_handle_t * 204 find_by_fru(libzfs_handle_t *zhdl, const char *fru, nvlist_t **vdevp) 205 { 206 find_cbdata_t cb; 207 208 cb.cb_fru = fru; 209 cb.cb_zhp = NULL; 210 if (zpool_iter(zhdl, search_pool, &cb) != 1) 211 return (NULL); 212 213 *vdevp = cb.cb_vdev; 214 return (cb.cb_zhp); 215 } 216 217 /* 218 * Given a vdev, attempt to replace it with every known spare until one 219 * succeeds. 220 */ 221 static void 222 replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) 223 { 224 nvlist_t *config, *nvroot, *replacement; 225 nvlist_t **spares; 226 uint_t s, nspares; 227 char *dev_name; 228 zprop_source_t source; 229 int ashift; 230 231 config = zpool_get_config(zhp, NULL); 232 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 233 &nvroot) != 0) 234 return; 235 236 /* 237 * Find out if there are any hot spares available in the pool. 238 */ 239 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 240 &spares, &nspares) != 0) 241 return; 242 243 /* 244 * lookup "ashift" pool property, we may need it for the replacement 245 */ 246 ashift = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &source); 247 248 replacement = fmd_nvl_alloc(hdl, FMD_SLEEP); 249 250 (void) nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE, 251 VDEV_TYPE_ROOT); 252 253 dev_name = zpool_vdev_name(NULL, zhp, vdev, B_FALSE); 254 255 /* 256 * Try to replace each spare, ending when we successfully 257 * replace it. 258 */ 259 for (s = 0; s < nspares; s++) { 260 char *spare_name; 261 262 if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH, 263 &spare_name) != 0) 264 continue; 265 266 /* if set, add the "ashift" pool property to the spare nvlist */ 267 if (source != ZPROP_SRC_DEFAULT) 268 (void) nvlist_add_uint64(spares[s], 269 ZPOOL_CONFIG_ASHIFT, ashift); 270 271 (void) nvlist_add_nvlist_array(replacement, 272 ZPOOL_CONFIG_CHILDREN, &spares[s], 1); 273 274 if (zpool_vdev_attach(zhp, dev_name, spare_name, 275 replacement, B_TRUE) == 0) 276 break; 277 } 278 279 free(dev_name); 280 nvlist_free(replacement); 281 } 282 283 /* 284 * Repair this vdev if we had diagnosed a 'fault.fs.zfs.device' and 285 * ASRU is now usable. ZFS has found the device to be present and 286 * functioning. 287 */ 288 /*ARGSUSED*/ 289 void 290 zfs_vdev_repair(fmd_hdl_t *hdl, nvlist_t *nvl) 291 { 292 zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); 293 zfs_retire_repaired_t *zrp; 294 uint64_t pool_guid, vdev_guid; 295 nvlist_t *asru; 296 297 if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, 298 &pool_guid) != 0 || nvlist_lookup_uint64(nvl, 299 FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) 300 return; 301 302 /* 303 * Before checking the state of the ASRU, go through and see if we've 304 * already made an attempt to repair this ASRU. This list is cleared 305 * whenever we receive any kind of list event, and is designed to 306 * prevent us from generating a feedback loop when we attempt repairs 307 * against a faulted pool. The problem is that checking the unusable 308 * state of the ASRU can involve opening the pool, which can post 309 * statechange events but otherwise leave the pool in the faulted 310 * state. This list allows us to detect when a statechange event is 311 * due to our own request. 312 */ 313 for (zrp = zdp->zrd_repaired; zrp != NULL; zrp = zrp->zrr_next) { 314 if (zrp->zrr_pool == pool_guid && 315 zrp->zrr_vdev == vdev_guid) 316 return; 317 } 318 319 asru = fmd_nvl_alloc(hdl, FMD_SLEEP); 320 321 (void) nvlist_add_uint8(asru, FM_VERSION, ZFS_SCHEME_VERSION0); 322 (void) nvlist_add_string(asru, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS); 323 (void) nvlist_add_uint64(asru, FM_FMRI_ZFS_POOL, pool_guid); 324 (void) nvlist_add_uint64(asru, FM_FMRI_ZFS_VDEV, vdev_guid); 325 326 /* 327 * We explicitly check for the unusable state here to make sure we 328 * aren't responding to a transient state change. As part of opening a 329 * vdev, it's possible to see the 'statechange' event, only to be 330 * followed by a vdev failure later. If we don't check the current 331 * state of the vdev (or pool) before marking it repaired, then we risk 332 * generating spurious repair events followed immediately by the same 333 * diagnosis. 334 * 335 * This assumes that the ZFS scheme code associated unusable (i.e. 336 * isolated) with its own definition of faulty state. In the case of a 337 * DEGRADED leaf vdev (due to checksum errors), this is not the case. 338 * This works, however, because the transient state change is not 339 * posted in this case. This could be made more explicit by not 340 * relying on the scheme's unusable callback and instead directly 341 * checking the vdev state, where we could correctly account for 342 * DEGRADED state. 343 */ 344 if (!fmd_nvl_fmri_unusable(hdl, asru) && fmd_nvl_fmri_has_fault(hdl, 345 asru, FMD_HAS_FAULT_ASRU, NULL)) { 346 topo_hdl_t *thp; 347 char *fmri = NULL; 348 int err; 349 350 thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION); 351 if (topo_fmri_nvl2str(thp, asru, &fmri, &err) == 0) 352 (void) fmd_repair_asru(hdl, fmri); 353 fmd_hdl_topo_rele(hdl, thp); 354 355 topo_hdl_strfree(thp, fmri); 356 } 357 nvlist_free(asru); 358 zrp = fmd_hdl_alloc(hdl, sizeof (zfs_retire_repaired_t), FMD_SLEEP); 359 zrp->zrr_next = zdp->zrd_repaired; 360 zrp->zrr_pool = pool_guid; 361 zrp->zrr_vdev = vdev_guid; 362 zdp->zrd_repaired = zrp; 363 } 364 365 /*ARGSUSED*/ 366 static void 367 zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, 368 const char *class) 369 { 370 uint64_t pool_guid, vdev_guid; 371 zpool_handle_t *zhp; 372 nvlist_t *resource, *fault, *fru; 373 nvlist_t **faults; 374 uint_t f, nfaults; 375 zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); 376 libzfs_handle_t *zhdl = zdp->zrd_hdl; 377 boolean_t fault_device, degrade_device; 378 boolean_t is_repair; 379 char *scheme, *fmri; 380 nvlist_t *vdev; 381 char *uuid; 382 int repair_done = 0; 383 boolean_t retire; 384 boolean_t is_disk; 385 vdev_aux_t aux; 386 topo_hdl_t *thp; 387 int err; 388 389 /* 390 * If this is a resource notifying us of device removal, then simply 391 * check for an available spare and continue. 392 */ 393 if (strcmp(class, "resource.fs.zfs.removed") == 0) { 394 if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, 395 &pool_guid) != 0 || 396 nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, 397 &vdev_guid) != 0) 398 return; 399 400 if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, 401 &vdev)) == NULL) 402 return; 403 404 if (fmd_prop_get_int32(hdl, "spare_on_remove")) 405 replace_with_spare(hdl, zhp, vdev); 406 zpool_close(zhp); 407 return; 408 } 409 410 if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) 411 return; 412 413 if (strcmp(class, "resource.fs.zfs.statechange") == 0 || 414 strcmp(class, 415 "resource.sysevent.EC_zfs.ESC_ZFS_vdev_remove") == 0) { 416 zfs_vdev_repair(hdl, nvl); 417 return; 418 } 419 420 zfs_retire_clear_data(hdl, zdp); 421 422 if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) 423 is_repair = B_TRUE; 424 else 425 is_repair = B_FALSE; 426 427 /* 428 * We subscribe to zfs faults as well as all repair events. 429 */ 430 if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 431 &faults, &nfaults) != 0) 432 return; 433 434 for (f = 0; f < nfaults; f++) { 435 fault = faults[f]; 436 437 fault_device = B_FALSE; 438 degrade_device = B_FALSE; 439 is_disk = B_FALSE; 440 441 if (nvlist_lookup_boolean_value(fault, FM_SUSPECT_RETIRE, 442 &retire) == 0 && retire == 0) 443 continue; 444 445 if (fmd_nvl_class_match(hdl, fault, 446 "fault.io.disk.ssm-wearout") && 447 fmd_prop_get_int32(hdl, "ssm_wearout_skip_retire") == 448 FMD_B_TRUE) { 449 fmd_hdl_debug(hdl, "zfs-retire: ignoring SSM fault"); 450 continue; 451 } 452 453 /* 454 * While we subscribe to fault.fs.zfs.*, we only take action 455 * for faults targeting a specific vdev (open failure or SERD 456 * failure). We also subscribe to fault.io.* events, so that 457 * faulty disks will be faulted in the ZFS configuration. 458 */ 459 if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.io")) { 460 fault_device = B_TRUE; 461 } else if (fmd_nvl_class_match(hdl, fault, 462 "fault.fs.zfs.vdev.checksum")) { 463 degrade_device = B_TRUE; 464 } else if (fmd_nvl_class_match(hdl, fault, 465 "fault.fs.zfs.device")) { 466 fault_device = B_FALSE; 467 } else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) { 468 is_disk = B_TRUE; 469 fault_device = B_TRUE; 470 } else { 471 continue; 472 } 473 474 if (is_disk) { 475 /* 476 * This is a disk fault. Lookup the FRU, convert it to 477 * an FMRI string, and attempt to find a matching vdev. 478 */ 479 if (nvlist_lookup_nvlist(fault, FM_FAULT_FRU, 480 &fru) != 0 || 481 nvlist_lookup_string(fru, FM_FMRI_SCHEME, 482 &scheme) != 0) 483 continue; 484 485 if (strcmp(scheme, FM_FMRI_SCHEME_HC) != 0) 486 continue; 487 488 thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION); 489 if (topo_fmri_nvl2str(thp, fru, &fmri, &err) != 0) { 490 fmd_hdl_topo_rele(hdl, thp); 491 continue; 492 } 493 494 zhp = find_by_fru(zhdl, fmri, &vdev); 495 topo_hdl_strfree(thp, fmri); 496 fmd_hdl_topo_rele(hdl, thp); 497 498 if (zhp == NULL) 499 continue; 500 501 (void) nvlist_lookup_uint64(vdev, 502 ZPOOL_CONFIG_GUID, &vdev_guid); 503 aux = VDEV_AUX_EXTERNAL; 504 } else { 505 /* 506 * This is a ZFS fault. Lookup the resource, and 507 * attempt to find the matching vdev. 508 */ 509 if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, 510 &resource) != 0 || 511 nvlist_lookup_string(resource, FM_FMRI_SCHEME, 512 &scheme) != 0) 513 continue; 514 515 if (strcmp(scheme, FM_FMRI_SCHEME_ZFS) != 0) 516 continue; 517 518 if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL, 519 &pool_guid) != 0) 520 continue; 521 522 if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV, 523 &vdev_guid) != 0) { 524 if (is_repair) 525 vdev_guid = 0; 526 else 527 continue; 528 } 529 530 if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, 531 &vdev)) == NULL) 532 continue; 533 534 aux = VDEV_AUX_ERR_EXCEEDED; 535 } 536 537 if (vdev_guid == 0) { 538 /* 539 * For pool-level repair events, clear the entire pool. 540 */ 541 (void) zpool_clear(zhp, NULL, NULL); 542 zpool_close(zhp); 543 continue; 544 } 545 546 /* 547 * If this is a repair event, then mark the vdev as repaired and 548 * continue. 549 */ 550 if (is_repair) { 551 repair_done = 1; 552 (void) zpool_vdev_clear(zhp, vdev_guid); 553 zpool_close(zhp); 554 continue; 555 } 556 557 /* 558 * Actively fault the device if needed. 559 */ 560 if (fault_device) 561 (void) zpool_vdev_fault(zhp, vdev_guid, aux); 562 if (degrade_device) 563 (void) zpool_vdev_degrade(zhp, vdev_guid, aux); 564 565 /* 566 * Attempt to substitute a hot spare. 567 */ 568 replace_with_spare(hdl, zhp, vdev); 569 zpool_close(zhp); 570 } 571 572 if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && repair_done && 573 nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0) 574 fmd_case_uuresolved(hdl, uuid); 575 } 576 577 static const fmd_hdl_ops_t fmd_ops = { 578 zfs_retire_recv, /* fmdo_recv */ 579 NULL, /* fmdo_timeout */ 580 NULL, /* fmdo_close */ 581 NULL, /* fmdo_stats */ 582 NULL, /* fmdo_gc */ 583 }; 584 585 static const fmd_prop_t fmd_props[] = { 586 { "spare_on_remove", FMD_TYPE_BOOL, "true" }, 587 { "ssm_wearout_skip_retire", FMD_TYPE_BOOL, "true"}, 588 { NULL, 0, NULL } 589 }; 590 591 static const fmd_hdl_info_t fmd_info = { 592 "ZFS Retire Agent", "1.0", &fmd_ops, fmd_props 593 }; 594 595 void 596 _fmd_init(fmd_hdl_t *hdl) 597 { 598 zfs_retire_data_t *zdp; 599 libzfs_handle_t *zhdl; 600 601 if ((zhdl = libzfs_init()) == NULL) 602 return; 603 604 if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) { 605 libzfs_fini(zhdl); 606 return; 607 } 608 609 zdp = fmd_hdl_zalloc(hdl, sizeof (zfs_retire_data_t), FMD_SLEEP); 610 zdp->zrd_hdl = zhdl; 611 612 fmd_hdl_setspecific(hdl, zdp); 613 } 614 615 void 616 _fmd_fini(fmd_hdl_t *hdl) 617 { 618 zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); 619 620 if (zdp != NULL) { 621 zfs_retire_clear_data(hdl, zdp); 622 libzfs_fini(zdp->zrd_hdl); 623 fmd_hdl_free(hdl, zdp, sizeof (zfs_retire_data_t)); 624 } 625 } 626