1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. 23 * 24 * Copyright (c) 2016, Intel Corporation. 25 * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com> 26 */ 27 28 /* 29 * The ZFS retire agent is responsible for managing hot spares across all pools. 30 * When we see a device fault or a device removal, we try to open the associated 31 * pool and look for any hot spares. We iterate over any available hot spares 32 * and attempt a 'zpool replace' for each one. 33 * 34 * For vdevs diagnosed as faulty, the agent is also responsible for proactively 35 * marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors). 36 */ 37 38 #include <sys/fs/zfs.h> 39 #include <sys/fm/protocol.h> 40 #include <sys/fm/fs/zfs.h> 41 #include <libzutil.h> 42 #include <libzfs.h> 43 #include <string.h> 44 #include <libgen.h> 45 46 #include "zfs_agents.h" 47 #include "fmd_api.h" 48 49 50 typedef struct zfs_retire_repaired { 51 struct zfs_retire_repaired *zrr_next; 52 uint64_t zrr_pool; 53 uint64_t zrr_vdev; 54 } zfs_retire_repaired_t; 55 56 typedef struct zfs_retire_data { 57 libzfs_handle_t *zrd_hdl; 58 zfs_retire_repaired_t *zrd_repaired; 59 } zfs_retire_data_t; 60 61 static void 62 zfs_retire_clear_data(fmd_hdl_t *hdl, zfs_retire_data_t *zdp) 63 { 64 zfs_retire_repaired_t *zrp; 65 66 while ((zrp = zdp->zrd_repaired) != NULL) { 67 zdp->zrd_repaired = zrp->zrr_next; 68 fmd_hdl_free(hdl, zrp, sizeof (zfs_retire_repaired_t)); 69 } 70 } 71 72 /* 73 * Find a pool with a matching GUID. 74 */ 75 typedef struct find_cbdata { 76 uint64_t cb_guid; 77 zpool_handle_t *cb_zhp; 78 nvlist_t *cb_vdev; 79 } find_cbdata_t; 80 81 static int 82 find_pool(zpool_handle_t *zhp, void *data) 83 { 84 find_cbdata_t *cbp = data; 85 86 if (cbp->cb_guid == 87 zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL)) { 88 cbp->cb_zhp = zhp; 89 return (1); 90 } 91 92 zpool_close(zhp); 93 return (0); 94 } 95 96 /* 97 * Find a vdev within a tree with a matching GUID. 98 */ 99 static nvlist_t * 100 find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid) 101 { 102 uint64_t guid; 103 nvlist_t **child; 104 uint_t c, children; 105 nvlist_t *ret; 106 107 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && 108 guid == search_guid) { 109 fmd_hdl_debug(fmd_module_hdl("zfs-retire"), 110 "matched vdev %llu", guid); 111 return (nv); 112 } 113 114 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 115 &child, &children) != 0) 116 return (NULL); 117 118 for (c = 0; c < children; c++) { 119 if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) 120 return (ret); 121 } 122 123 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, 124 &child, &children) != 0) 125 return (NULL); 126 127 for (c = 0; c < children; c++) { 128 if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) 129 return (ret); 130 } 131 132 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 133 &child, &children) != 0) 134 return (NULL); 135 136 for (c = 0; c < children; c++) { 137 if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) 138 return (ret); 139 } 140 141 return (NULL); 142 } 143 144 /* 145 * Given a (pool, vdev) GUID pair, find the matching pool and vdev. 146 */ 147 static zpool_handle_t * 148 find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, 149 nvlist_t **vdevp) 150 { 151 find_cbdata_t cb; 152 zpool_handle_t *zhp; 153 nvlist_t *config, *nvroot; 154 155 /* 156 * Find the corresponding pool and make sure the vdev still exists. 157 */ 158 cb.cb_guid = pool_guid; 159 if (zpool_iter(zhdl, find_pool, &cb) != 1) 160 return (NULL); 161 162 zhp = cb.cb_zhp; 163 config = zpool_get_config(zhp, NULL); 164 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 165 &nvroot) != 0) { 166 zpool_close(zhp); 167 return (NULL); 168 } 169 170 if (vdev_guid != 0) { 171 if ((*vdevp = find_vdev(zhdl, nvroot, vdev_guid)) == NULL) { 172 zpool_close(zhp); 173 return (NULL); 174 } 175 } 176 177 return (zhp); 178 } 179 180 /* 181 * Given a vdev, attempt to replace it with every known spare until one 182 * succeeds or we run out of devices to try. 183 * Return whether we were successful or not in replacing the device. 184 */ 185 static boolean_t 186 replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) 187 { 188 nvlist_t *config, *nvroot, *replacement; 189 nvlist_t **spares; 190 uint_t s, nspares; 191 char *dev_name; 192 zprop_source_t source; 193 int ashift; 194 195 config = zpool_get_config(zhp, NULL); 196 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 197 &nvroot) != 0) 198 return (B_FALSE); 199 200 /* 201 * Find out if there are any hot spares available in the pool. 202 */ 203 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 204 &spares, &nspares) != 0) 205 return (B_FALSE); 206 207 /* 208 * lookup "ashift" pool property, we may need it for the replacement 209 */ 210 ashift = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &source); 211 212 replacement = fmd_nvl_alloc(hdl, FMD_SLEEP); 213 214 (void) nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE, 215 VDEV_TYPE_ROOT); 216 217 dev_name = zpool_vdev_name(NULL, zhp, vdev, B_FALSE); 218 219 /* 220 * Try to replace each spare, ending when we successfully 221 * replace it. 222 */ 223 for (s = 0; s < nspares; s++) { 224 boolean_t rebuild = B_FALSE; 225 char *spare_name, *type; 226 227 if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH, 228 &spare_name) != 0) 229 continue; 230 231 /* prefer sequential resilvering for distributed spares */ 232 if ((nvlist_lookup_string(spares[s], ZPOOL_CONFIG_TYPE, 233 &type) == 0) && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) 234 rebuild = B_TRUE; 235 236 /* if set, add the "ashift" pool property to the spare nvlist */ 237 if (source != ZPROP_SRC_DEFAULT) 238 (void) nvlist_add_uint64(spares[s], 239 ZPOOL_CONFIG_ASHIFT, ashift); 240 241 (void) nvlist_add_nvlist_array(replacement, 242 ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)&spares[s], 1); 243 244 fmd_hdl_debug(hdl, "zpool_vdev_replace '%s' with spare '%s'", 245 dev_name, zfs_basename(spare_name)); 246 247 if (zpool_vdev_attach(zhp, dev_name, spare_name, 248 replacement, B_TRUE, rebuild) == 0) { 249 free(dev_name); 250 nvlist_free(replacement); 251 return (B_TRUE); 252 } 253 } 254 255 free(dev_name); 256 nvlist_free(replacement); 257 258 return (B_FALSE); 259 } 260 261 /* 262 * Repair this vdev if we had diagnosed a 'fault.fs.zfs.device' and 263 * ASRU is now usable. ZFS has found the device to be present and 264 * functioning. 265 */ 266 static void 267 zfs_vdev_repair(fmd_hdl_t *hdl, nvlist_t *nvl) 268 { 269 zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); 270 zfs_retire_repaired_t *zrp; 271 uint64_t pool_guid, vdev_guid; 272 if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, 273 &pool_guid) != 0 || nvlist_lookup_uint64(nvl, 274 FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) 275 return; 276 277 /* 278 * Before checking the state of the ASRU, go through and see if we've 279 * already made an attempt to repair this ASRU. This list is cleared 280 * whenever we receive any kind of list event, and is designed to 281 * prevent us from generating a feedback loop when we attempt repairs 282 * against a faulted pool. The problem is that checking the unusable 283 * state of the ASRU can involve opening the pool, which can post 284 * statechange events but otherwise leave the pool in the faulted 285 * state. This list allows us to detect when a statechange event is 286 * due to our own request. 287 */ 288 for (zrp = zdp->zrd_repaired; zrp != NULL; zrp = zrp->zrr_next) { 289 if (zrp->zrr_pool == pool_guid && 290 zrp->zrr_vdev == vdev_guid) 291 return; 292 } 293 294 zrp = fmd_hdl_alloc(hdl, sizeof (zfs_retire_repaired_t), FMD_SLEEP); 295 zrp->zrr_next = zdp->zrd_repaired; 296 zrp->zrr_pool = pool_guid; 297 zrp->zrr_vdev = vdev_guid; 298 zdp->zrd_repaired = zrp; 299 300 fmd_hdl_debug(hdl, "marking repaired vdev %llu on pool %llu", 301 vdev_guid, pool_guid); 302 } 303 304 static void 305 zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, 306 const char *class) 307 { 308 (void) ep; 309 uint64_t pool_guid, vdev_guid; 310 zpool_handle_t *zhp; 311 nvlist_t *resource, *fault; 312 nvlist_t **faults; 313 uint_t f, nfaults; 314 zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); 315 libzfs_handle_t *zhdl = zdp->zrd_hdl; 316 boolean_t fault_device, degrade_device; 317 boolean_t is_repair; 318 char *scheme; 319 nvlist_t *vdev = NULL; 320 char *uuid; 321 int repair_done = 0; 322 boolean_t retire; 323 boolean_t is_disk; 324 vdev_aux_t aux; 325 uint64_t state = 0; 326 int l2arc; 327 vdev_stat_t *vs; 328 unsigned int c; 329 330 fmd_hdl_debug(hdl, "zfs_retire_recv: '%s'", class); 331 332 (void) nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, 333 &state); 334 335 /* 336 * If this is a resource notifying us of device removal then simply 337 * check for an available spare and continue unless the device is a 338 * l2arc vdev, in which case we just offline it. 339 */ 340 if (strcmp(class, "resource.fs.zfs.removed") == 0 || 341 (strcmp(class, "resource.fs.zfs.statechange") == 0 && 342 (state == VDEV_STATE_REMOVED || state == VDEV_STATE_FAULTED))) { 343 char *devtype; 344 char *devname; 345 346 if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, 347 &pool_guid) != 0 || 348 nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, 349 &vdev_guid) != 0) 350 return; 351 352 if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, 353 &vdev)) == NULL) 354 return; 355 356 devname = zpool_vdev_name(NULL, zhp, vdev, B_FALSE); 357 358 nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS, 359 (uint64_t **)&vs, &c); 360 361 /* 362 * If state removed is requested for already removed vdev, 363 * its a loopback event from spa_async_remove(). Just 364 * ignore it. 365 */ 366 if (vs->vs_state == VDEV_STATE_REMOVED && 367 state == VDEV_STATE_REMOVED) 368 return; 369 370 l2arc = (nvlist_lookup_string(nvl, 371 FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, &devtype) == 0 && 372 strcmp(devtype, VDEV_TYPE_L2CACHE) == 0); 373 374 /* Remove the vdev since device is unplugged */ 375 if (l2arc || (strcmp(class, "resource.fs.zfs.removed") == 0)) { 376 int status = zpool_vdev_remove_wanted(zhp, devname); 377 fmd_hdl_debug(hdl, "zpool_vdev_remove_wanted '%s'" 378 ", ret:%d", devname, status); 379 } 380 381 /* Replace the vdev with a spare if its not a l2arc */ 382 if (!l2arc && (!fmd_prop_get_int32(hdl, "spare_on_remove") || 383 replace_with_spare(hdl, zhp, vdev) == B_FALSE)) { 384 /* Could not handle with spare */ 385 fmd_hdl_debug(hdl, "no spare for '%s'", devname); 386 } 387 388 free(devname); 389 zpool_close(zhp); 390 return; 391 } 392 393 if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) 394 return; 395 396 /* 397 * Note: on Linux statechange events are more than just 398 * healthy ones so we need to confirm the actual state value. 399 */ 400 if (strcmp(class, "resource.fs.zfs.statechange") == 0 && 401 state == VDEV_STATE_HEALTHY) { 402 zfs_vdev_repair(hdl, nvl); 403 return; 404 } 405 if (strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) { 406 zfs_vdev_repair(hdl, nvl); 407 return; 408 } 409 410 zfs_retire_clear_data(hdl, zdp); 411 412 if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) 413 is_repair = B_TRUE; 414 else 415 is_repair = B_FALSE; 416 417 /* 418 * We subscribe to zfs faults as well as all repair events. 419 */ 420 if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 421 &faults, &nfaults) != 0) 422 return; 423 424 for (f = 0; f < nfaults; f++) { 425 fault = faults[f]; 426 427 fault_device = B_FALSE; 428 degrade_device = B_FALSE; 429 is_disk = B_FALSE; 430 431 if (nvlist_lookup_boolean_value(fault, FM_SUSPECT_RETIRE, 432 &retire) == 0 && retire == 0) 433 continue; 434 435 /* 436 * While we subscribe to fault.fs.zfs.*, we only take action 437 * for faults targeting a specific vdev (open failure or SERD 438 * failure). We also subscribe to fault.io.* events, so that 439 * faulty disks will be faulted in the ZFS configuration. 440 */ 441 if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.io")) { 442 fault_device = B_TRUE; 443 } else if (fmd_nvl_class_match(hdl, fault, 444 "fault.fs.zfs.vdev.checksum")) { 445 degrade_device = B_TRUE; 446 } else if (fmd_nvl_class_match(hdl, fault, 447 "fault.fs.zfs.device")) { 448 fault_device = B_FALSE; 449 } else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) { 450 is_disk = B_TRUE; 451 fault_device = B_TRUE; 452 } else { 453 continue; 454 } 455 456 if (is_disk) { 457 continue; 458 } else { 459 /* 460 * This is a ZFS fault. Lookup the resource, and 461 * attempt to find the matching vdev. 462 */ 463 if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, 464 &resource) != 0 || 465 nvlist_lookup_string(resource, FM_FMRI_SCHEME, 466 &scheme) != 0) 467 continue; 468 469 if (strcmp(scheme, FM_FMRI_SCHEME_ZFS) != 0) 470 continue; 471 472 if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL, 473 &pool_guid) != 0) 474 continue; 475 476 if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV, 477 &vdev_guid) != 0) { 478 if (is_repair) 479 vdev_guid = 0; 480 else 481 continue; 482 } 483 484 if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, 485 &vdev)) == NULL) 486 continue; 487 488 aux = VDEV_AUX_ERR_EXCEEDED; 489 } 490 491 if (vdev_guid == 0) { 492 /* 493 * For pool-level repair events, clear the entire pool. 494 */ 495 fmd_hdl_debug(hdl, "zpool_clear of pool '%s'", 496 zpool_get_name(zhp)); 497 (void) zpool_clear(zhp, NULL, NULL); 498 zpool_close(zhp); 499 continue; 500 } 501 502 /* 503 * If this is a repair event, then mark the vdev as repaired and 504 * continue. 505 */ 506 if (is_repair) { 507 repair_done = 1; 508 fmd_hdl_debug(hdl, "zpool_clear of pool '%s' vdev %llu", 509 zpool_get_name(zhp), vdev_guid); 510 (void) zpool_vdev_clear(zhp, vdev_guid); 511 zpool_close(zhp); 512 continue; 513 } 514 515 /* 516 * Actively fault the device if needed. 517 */ 518 if (fault_device) 519 (void) zpool_vdev_fault(zhp, vdev_guid, aux); 520 if (degrade_device) 521 (void) zpool_vdev_degrade(zhp, vdev_guid, aux); 522 523 if (fault_device || degrade_device) 524 fmd_hdl_debug(hdl, "zpool_vdev_%s: vdev %llu on '%s'", 525 fault_device ? "fault" : "degrade", vdev_guid, 526 zpool_get_name(zhp)); 527 528 /* 529 * Attempt to substitute a hot spare. 530 */ 531 (void) replace_with_spare(hdl, zhp, vdev); 532 533 zpool_close(zhp); 534 } 535 536 if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && repair_done && 537 nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0) 538 fmd_case_uuresolved(hdl, uuid); 539 } 540 541 static const fmd_hdl_ops_t fmd_ops = { 542 zfs_retire_recv, /* fmdo_recv */ 543 NULL, /* fmdo_timeout */ 544 NULL, /* fmdo_close */ 545 NULL, /* fmdo_stats */ 546 NULL, /* fmdo_gc */ 547 }; 548 549 static const fmd_prop_t fmd_props[] = { 550 { "spare_on_remove", FMD_TYPE_BOOL, "true" }, 551 { NULL, 0, NULL } 552 }; 553 554 static const fmd_hdl_info_t fmd_info = { 555 "ZFS Retire Agent", "1.0", &fmd_ops, fmd_props 556 }; 557 558 void 559 _zfs_retire_init(fmd_hdl_t *hdl) 560 { 561 zfs_retire_data_t *zdp; 562 libzfs_handle_t *zhdl; 563 564 if ((zhdl = libzfs_init()) == NULL) 565 return; 566 567 if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) { 568 libzfs_fini(zhdl); 569 return; 570 } 571 572 zdp = fmd_hdl_zalloc(hdl, sizeof (zfs_retire_data_t), FMD_SLEEP); 573 zdp->zrd_hdl = zhdl; 574 575 fmd_hdl_setspecific(hdl, zdp); 576 } 577 578 void 579 _zfs_retire_fini(fmd_hdl_t *hdl) 580 { 581 zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); 582 583 if (zdp != NULL) { 584 zfs_retire_clear_data(hdl, zdp); 585 libzfs_fini(zdp->zrd_hdl); 586 fmd_hdl_free(hdl, zdp, sizeof (zfs_retire_data_t)); 587 } 588 } 589