1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. 23 * 24 * Copyright (c) 2016, Intel Corporation. 25 * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com> 26 */ 27 28 /* 29 * The ZFS retire agent is responsible for managing hot spares across all pools. 30 * When we see a device fault or a device removal, we try to open the associated 31 * pool and look for any hot spares. We iterate over any available hot spares 32 * and attempt a 'zpool replace' for each one. 33 * 34 * For vdevs diagnosed as faulty, the agent is also responsible for proactively 35 * marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors). 36 */ 37 38 #include <sys/fs/zfs.h> 39 #include <sys/fm/protocol.h> 40 #include <sys/fm/fs/zfs.h> 41 #include <libzutil.h> 42 #include <libzfs.h> 43 #include <string.h> 44 #include <libgen.h> 45 46 #include "zfs_agents.h" 47 #include "fmd_api.h" 48 49 50 typedef struct zfs_retire_repaired { 51 struct zfs_retire_repaired *zrr_next; 52 uint64_t zrr_pool; 53 uint64_t zrr_vdev; 54 } zfs_retire_repaired_t; 55 56 typedef struct zfs_retire_data { 57 libzfs_handle_t *zrd_hdl; 58 zfs_retire_repaired_t *zrd_repaired; 59 } zfs_retire_data_t; 60 61 static void 62 zfs_retire_clear_data(fmd_hdl_t *hdl, zfs_retire_data_t *zdp) 63 { 64 zfs_retire_repaired_t *zrp; 65 66 while ((zrp = zdp->zrd_repaired) != NULL) { 67 zdp->zrd_repaired = zrp->zrr_next; 68 fmd_hdl_free(hdl, zrp, sizeof (zfs_retire_repaired_t)); 69 } 70 } 71 72 /* 73 * Find a pool with a matching GUID. 74 */ 75 typedef struct find_cbdata { 76 uint64_t cb_guid; 77 zpool_handle_t *cb_zhp; 78 nvlist_t *cb_vdev; 79 uint64_t cb_vdev_guid; 80 uint64_t cb_num_spares; 81 } find_cbdata_t; 82 83 static int 84 find_pool(zpool_handle_t *zhp, void *data) 85 { 86 find_cbdata_t *cbp = data; 87 88 if (cbp->cb_guid == 89 zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL)) { 90 cbp->cb_zhp = zhp; 91 return (1); 92 } 93 94 zpool_close(zhp); 95 return (0); 96 } 97 98 /* 99 * Find a vdev within a tree with a matching GUID. 100 */ 101 static nvlist_t * 102 find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid) 103 { 104 uint64_t guid; 105 nvlist_t **child; 106 uint_t c, children; 107 nvlist_t *ret; 108 109 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && 110 guid == search_guid) { 111 fmd_hdl_debug(fmd_module_hdl("zfs-retire"), 112 "matched vdev %llu", guid); 113 return (nv); 114 } 115 116 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 117 &child, &children) != 0) 118 return (NULL); 119 120 for (c = 0; c < children; c++) { 121 if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) 122 return (ret); 123 } 124 125 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, 126 &child, &children) != 0) 127 return (NULL); 128 129 for (c = 0; c < children; c++) { 130 if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) 131 return (ret); 132 } 133 134 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 135 &child, &children) != 0) 136 return (NULL); 137 138 for (c = 0; c < children; c++) { 139 if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) 140 return (ret); 141 } 142 143 return (NULL); 144 } 145 146 static int 147 remove_spares(zpool_handle_t *zhp, void *data) 148 { 149 nvlist_t *config, *nvroot; 150 nvlist_t **spares; 151 uint_t nspares; 152 char *devname; 153 find_cbdata_t *cbp = data; 154 uint64_t spareguid = 0; 155 vdev_stat_t *vs; 156 unsigned int c; 157 158 config = zpool_get_config(zhp, NULL); 159 if (nvlist_lookup_nvlist(config, 160 ZPOOL_CONFIG_VDEV_TREE, &nvroot) != 0) { 161 zpool_close(zhp); 162 return (0); 163 } 164 165 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 166 &spares, &nspares) != 0) { 167 zpool_close(zhp); 168 return (0); 169 } 170 171 for (int i = 0; i < nspares; i++) { 172 if (nvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID, 173 &spareguid) == 0 && spareguid == cbp->cb_vdev_guid) { 174 devname = zpool_vdev_name(NULL, zhp, spares[i], 175 B_FALSE); 176 nvlist_lookup_uint64_array(spares[i], 177 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c); 178 if (vs->vs_state != VDEV_STATE_REMOVED && 179 zpool_vdev_remove_wanted(zhp, devname) == 0) 180 cbp->cb_num_spares++; 181 break; 182 } 183 } 184 185 zpool_close(zhp); 186 return (0); 187 } 188 189 /* 190 * Given a vdev guid, find and remove all spares associated with it. 191 */ 192 static int 193 find_and_remove_spares(libzfs_handle_t *zhdl, uint64_t vdev_guid) 194 { 195 find_cbdata_t cb; 196 197 cb.cb_num_spares = 0; 198 cb.cb_vdev_guid = vdev_guid; 199 zpool_iter(zhdl, remove_spares, &cb); 200 201 return (cb.cb_num_spares); 202 } 203 204 /* 205 * Given a (pool, vdev) GUID pair, find the matching pool and vdev. 206 */ 207 static zpool_handle_t * 208 find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, 209 nvlist_t **vdevp) 210 { 211 find_cbdata_t cb; 212 zpool_handle_t *zhp; 213 nvlist_t *config, *nvroot; 214 215 /* 216 * Find the corresponding pool and make sure the vdev still exists. 217 */ 218 cb.cb_guid = pool_guid; 219 if (zpool_iter(zhdl, find_pool, &cb) != 1) 220 return (NULL); 221 222 zhp = cb.cb_zhp; 223 config = zpool_get_config(zhp, NULL); 224 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 225 &nvroot) != 0) { 226 zpool_close(zhp); 227 return (NULL); 228 } 229 230 if (vdev_guid != 0) { 231 if ((*vdevp = find_vdev(zhdl, nvroot, vdev_guid)) == NULL) { 232 zpool_close(zhp); 233 return (NULL); 234 } 235 } 236 237 return (zhp); 238 } 239 240 /* 241 * Given a vdev, attempt to replace it with every known spare until one 242 * succeeds or we run out of devices to try. 243 * Return whether we were successful or not in replacing the device. 244 */ 245 static boolean_t 246 replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) 247 { 248 nvlist_t *config, *nvroot, *replacement; 249 nvlist_t **spares; 250 uint_t s, nspares; 251 char *dev_name; 252 zprop_source_t source; 253 int ashift; 254 255 config = zpool_get_config(zhp, NULL); 256 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 257 &nvroot) != 0) 258 return (B_FALSE); 259 260 /* 261 * Find out if there are any hot spares available in the pool. 262 */ 263 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 264 &spares, &nspares) != 0) 265 return (B_FALSE); 266 267 /* 268 * lookup "ashift" pool property, we may need it for the replacement 269 */ 270 ashift = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &source); 271 272 replacement = fmd_nvl_alloc(hdl, FMD_SLEEP); 273 274 (void) nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE, 275 VDEV_TYPE_ROOT); 276 277 dev_name = zpool_vdev_name(NULL, zhp, vdev, B_FALSE); 278 279 /* 280 * Try to replace each spare, ending when we successfully 281 * replace it. 282 */ 283 for (s = 0; s < nspares; s++) { 284 boolean_t rebuild = B_FALSE; 285 const char *spare_name, *type; 286 287 if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH, 288 &spare_name) != 0) 289 continue; 290 291 /* prefer sequential resilvering for distributed spares */ 292 if ((nvlist_lookup_string(spares[s], ZPOOL_CONFIG_TYPE, 293 &type) == 0) && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) 294 rebuild = B_TRUE; 295 296 /* if set, add the "ashift" pool property to the spare nvlist */ 297 if (source != ZPROP_SRC_DEFAULT) 298 (void) nvlist_add_uint64(spares[s], 299 ZPOOL_CONFIG_ASHIFT, ashift); 300 301 (void) nvlist_add_nvlist_array(replacement, 302 ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)&spares[s], 1); 303 304 fmd_hdl_debug(hdl, "zpool_vdev_replace '%s' with spare '%s'", 305 dev_name, zfs_basename(spare_name)); 306 307 if (zpool_vdev_attach(zhp, dev_name, spare_name, 308 replacement, B_TRUE, rebuild) == 0) { 309 free(dev_name); 310 nvlist_free(replacement); 311 return (B_TRUE); 312 } 313 } 314 315 free(dev_name); 316 nvlist_free(replacement); 317 318 return (B_FALSE); 319 } 320 321 /* 322 * Repair this vdev if we had diagnosed a 'fault.fs.zfs.device' and 323 * ASRU is now usable. ZFS has found the device to be present and 324 * functioning. 325 */ 326 static void 327 zfs_vdev_repair(fmd_hdl_t *hdl, nvlist_t *nvl) 328 { 329 zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); 330 zfs_retire_repaired_t *zrp; 331 uint64_t pool_guid, vdev_guid; 332 if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, 333 &pool_guid) != 0 || nvlist_lookup_uint64(nvl, 334 FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) 335 return; 336 337 /* 338 * Before checking the state of the ASRU, go through and see if we've 339 * already made an attempt to repair this ASRU. This list is cleared 340 * whenever we receive any kind of list event, and is designed to 341 * prevent us from generating a feedback loop when we attempt repairs 342 * against a faulted pool. The problem is that checking the unusable 343 * state of the ASRU can involve opening the pool, which can post 344 * statechange events but otherwise leave the pool in the faulted 345 * state. This list allows us to detect when a statechange event is 346 * due to our own request. 347 */ 348 for (zrp = zdp->zrd_repaired; zrp != NULL; zrp = zrp->zrr_next) { 349 if (zrp->zrr_pool == pool_guid && 350 zrp->zrr_vdev == vdev_guid) 351 return; 352 } 353 354 zrp = fmd_hdl_alloc(hdl, sizeof (zfs_retire_repaired_t), FMD_SLEEP); 355 zrp->zrr_next = zdp->zrd_repaired; 356 zrp->zrr_pool = pool_guid; 357 zrp->zrr_vdev = vdev_guid; 358 zdp->zrd_repaired = zrp; 359 360 fmd_hdl_debug(hdl, "marking repaired vdev %llu on pool %llu", 361 vdev_guid, pool_guid); 362 } 363 364 static void 365 zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, 366 const char *class) 367 { 368 (void) ep; 369 uint64_t pool_guid, vdev_guid; 370 zpool_handle_t *zhp; 371 nvlist_t *resource, *fault; 372 nvlist_t **faults; 373 uint_t f, nfaults; 374 zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); 375 libzfs_handle_t *zhdl = zdp->zrd_hdl; 376 boolean_t fault_device, degrade_device; 377 boolean_t is_repair; 378 boolean_t l2arc = B_FALSE; 379 boolean_t spare = B_FALSE; 380 const char *scheme; 381 nvlist_t *vdev = NULL; 382 const char *uuid; 383 int repair_done = 0; 384 boolean_t retire; 385 boolean_t is_disk; 386 vdev_aux_t aux; 387 uint64_t state = 0; 388 vdev_stat_t *vs; 389 unsigned int c; 390 391 fmd_hdl_debug(hdl, "zfs_retire_recv: '%s'", class); 392 393 (void) nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, 394 &state); 395 396 /* 397 * If this is a resource notifying us of device removal then simply 398 * check for an available spare and continue unless the device is a 399 * l2arc vdev, in which case we just offline it. 400 */ 401 if (strcmp(class, "resource.fs.zfs.removed") == 0 || 402 (strcmp(class, "resource.fs.zfs.statechange") == 0 && 403 (state == VDEV_STATE_REMOVED || state == VDEV_STATE_FAULTED))) { 404 const char *devtype; 405 char *devname; 406 407 if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, 408 &devtype) == 0) { 409 if (strcmp(devtype, VDEV_TYPE_SPARE) == 0) 410 spare = B_TRUE; 411 else if (strcmp(devtype, VDEV_TYPE_L2CACHE) == 0) 412 l2arc = B_TRUE; 413 } 414 415 if (nvlist_lookup_uint64(nvl, 416 FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) 417 return; 418 419 if (vdev_guid == 0) { 420 fmd_hdl_debug(hdl, "Got a zero GUID"); 421 return; 422 } 423 424 if (spare) { 425 int nspares = find_and_remove_spares(zhdl, vdev_guid); 426 fmd_hdl_debug(hdl, "%d spares removed", nspares); 427 return; 428 } 429 430 if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, 431 &pool_guid) != 0) 432 return; 433 434 if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, 435 &vdev)) == NULL) 436 return; 437 438 devname = zpool_vdev_name(NULL, zhp, vdev, B_FALSE); 439 440 nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS, 441 (uint64_t **)&vs, &c); 442 443 /* 444 * If state removed is requested for already removed vdev, 445 * its a loopback event from spa_async_remove(). Just 446 * ignore it. 447 */ 448 if (vs->vs_state == VDEV_STATE_REMOVED && 449 state == VDEV_STATE_REMOVED) 450 return; 451 452 /* Remove the vdev since device is unplugged */ 453 int remove_status = 0; 454 if (l2arc || (strcmp(class, "resource.fs.zfs.removed") == 0)) { 455 remove_status = zpool_vdev_remove_wanted(zhp, devname); 456 fmd_hdl_debug(hdl, "zpool_vdev_remove_wanted '%s'" 457 ", err:%d", devname, libzfs_errno(zhdl)); 458 } 459 460 /* Replace the vdev with a spare if its not a l2arc */ 461 if (!l2arc && !remove_status && 462 (!fmd_prop_get_int32(hdl, "spare_on_remove") || 463 replace_with_spare(hdl, zhp, vdev) == B_FALSE)) { 464 /* Could not handle with spare */ 465 fmd_hdl_debug(hdl, "no spare for '%s'", devname); 466 } 467 468 free(devname); 469 zpool_close(zhp); 470 return; 471 } 472 473 if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) 474 return; 475 476 /* 477 * Note: on Linux statechange events are more than just 478 * healthy ones so we need to confirm the actual state value. 479 */ 480 if (strcmp(class, "resource.fs.zfs.statechange") == 0 && 481 state == VDEV_STATE_HEALTHY) { 482 zfs_vdev_repair(hdl, nvl); 483 return; 484 } 485 if (strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) { 486 zfs_vdev_repair(hdl, nvl); 487 return; 488 } 489 490 zfs_retire_clear_data(hdl, zdp); 491 492 if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) 493 is_repair = B_TRUE; 494 else 495 is_repair = B_FALSE; 496 497 /* 498 * We subscribe to zfs faults as well as all repair events. 499 */ 500 if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 501 &faults, &nfaults) != 0) 502 return; 503 504 for (f = 0; f < nfaults; f++) { 505 fault = faults[f]; 506 507 fault_device = B_FALSE; 508 degrade_device = B_FALSE; 509 is_disk = B_FALSE; 510 511 if (nvlist_lookup_boolean_value(fault, FM_SUSPECT_RETIRE, 512 &retire) == 0 && retire == 0) 513 continue; 514 515 /* 516 * While we subscribe to fault.fs.zfs.*, we only take action 517 * for faults targeting a specific vdev (open failure or SERD 518 * failure). We also subscribe to fault.io.* events, so that 519 * faulty disks will be faulted in the ZFS configuration. 520 */ 521 if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.io")) { 522 fault_device = B_TRUE; 523 } else if (fmd_nvl_class_match(hdl, fault, 524 "fault.fs.zfs.vdev.checksum")) { 525 degrade_device = B_TRUE; 526 } else if (fmd_nvl_class_match(hdl, fault, 527 "fault.fs.zfs.device")) { 528 fault_device = B_FALSE; 529 } else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) { 530 is_disk = B_TRUE; 531 fault_device = B_TRUE; 532 } else { 533 continue; 534 } 535 536 if (is_disk) { 537 continue; 538 } else { 539 /* 540 * This is a ZFS fault. Lookup the resource, and 541 * attempt to find the matching vdev. 542 */ 543 if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, 544 &resource) != 0 || 545 nvlist_lookup_string(resource, FM_FMRI_SCHEME, 546 &scheme) != 0) 547 continue; 548 549 if (strcmp(scheme, FM_FMRI_SCHEME_ZFS) != 0) 550 continue; 551 552 if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL, 553 &pool_guid) != 0) 554 continue; 555 556 if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV, 557 &vdev_guid) != 0) { 558 if (is_repair) 559 vdev_guid = 0; 560 else 561 continue; 562 } 563 564 if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, 565 &vdev)) == NULL) 566 continue; 567 568 aux = VDEV_AUX_ERR_EXCEEDED; 569 } 570 571 if (vdev_guid == 0) { 572 /* 573 * For pool-level repair events, clear the entire pool. 574 */ 575 fmd_hdl_debug(hdl, "zpool_clear of pool '%s'", 576 zpool_get_name(zhp)); 577 (void) zpool_clear(zhp, NULL, NULL); 578 zpool_close(zhp); 579 continue; 580 } 581 582 /* 583 * If this is a repair event, then mark the vdev as repaired and 584 * continue. 585 */ 586 if (is_repair) { 587 repair_done = 1; 588 fmd_hdl_debug(hdl, "zpool_clear of pool '%s' vdev %llu", 589 zpool_get_name(zhp), vdev_guid); 590 (void) zpool_vdev_clear(zhp, vdev_guid); 591 zpool_close(zhp); 592 continue; 593 } 594 595 /* 596 * Actively fault the device if needed. 597 */ 598 if (fault_device) 599 (void) zpool_vdev_fault(zhp, vdev_guid, aux); 600 if (degrade_device) 601 (void) zpool_vdev_degrade(zhp, vdev_guid, aux); 602 603 if (fault_device || degrade_device) 604 fmd_hdl_debug(hdl, "zpool_vdev_%s: vdev %llu on '%s'", 605 fault_device ? "fault" : "degrade", vdev_guid, 606 zpool_get_name(zhp)); 607 608 /* 609 * Attempt to substitute a hot spare. 610 */ 611 (void) replace_with_spare(hdl, zhp, vdev); 612 613 zpool_close(zhp); 614 } 615 616 if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && repair_done && 617 nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0) 618 fmd_case_uuresolved(hdl, uuid); 619 } 620 621 static const fmd_hdl_ops_t fmd_ops = { 622 zfs_retire_recv, /* fmdo_recv */ 623 NULL, /* fmdo_timeout */ 624 NULL, /* fmdo_close */ 625 NULL, /* fmdo_stats */ 626 NULL, /* fmdo_gc */ 627 }; 628 629 static const fmd_prop_t fmd_props[] = { 630 { "spare_on_remove", FMD_TYPE_BOOL, "true" }, 631 { NULL, 0, NULL } 632 }; 633 634 static const fmd_hdl_info_t fmd_info = { 635 "ZFS Retire Agent", "1.0", &fmd_ops, fmd_props 636 }; 637 638 void 639 _zfs_retire_init(fmd_hdl_t *hdl) 640 { 641 zfs_retire_data_t *zdp; 642 libzfs_handle_t *zhdl; 643 644 if ((zhdl = libzfs_init()) == NULL) 645 return; 646 647 if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) { 648 libzfs_fini(zhdl); 649 return; 650 } 651 652 zdp = fmd_hdl_zalloc(hdl, sizeof (zfs_retire_data_t), FMD_SLEEP); 653 zdp->zrd_hdl = zhdl; 654 655 fmd_hdl_setspecific(hdl, zdp); 656 } 657 658 void 659 _zfs_retire_fini(fmd_hdl_t *hdl) 660 { 661 zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); 662 663 if (zdp != NULL) { 664 zfs_retire_clear_data(hdl, zdp); 665 libzfs_fini(zdp->zrd_hdl); 666 fmd_hdl_free(hdl, zdp, sizeof (zfs_retire_data_t)); 667 } 668 } 669