/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2016, Intel Corporation.
 * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>
 */

/*
 * The ZFS retire agent is responsible for managing hot spares across all pools.
 * When we see a device fault or a device removal, we try to open the associated
 * pool and look for any hot spares. We iterate over any available hot spares
 * and attempt a 'zpool replace' for each one.
 *
 * For vdevs diagnosed as faulty, the agent is also responsible for proactively
 * marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors).
 */

#include <sys/fs/zfs.h>
#include <sys/fm/protocol.h>
#include <sys/fm/fs/zfs.h>
#include <libzfs.h>
#include <string.h>

#include "zfs_agents.h"
#include "fmd_api.h"


typedef struct zfs_retire_repaired {
	struct zfs_retire_repaired	*zrr_next;
	uint64_t			zrr_pool;
	uint64_t			zrr_vdev;
} zfs_retire_repaired_t;

typedef struct zfs_retire_data {
	libzfs_handle_t			*zrd_hdl;
	zfs_retire_repaired_t		*zrd_repaired;
} zfs_retire_data_t;

static void
zfs_retire_clear_data(fmd_hdl_t *hdl, zfs_retire_data_t *zdp)
{
	zfs_retire_repaired_t *zrp;

	while ((zrp = zdp->zrd_repaired) != NULL) {
		zdp->zrd_repaired = zrp->zrr_next;
		fmd_hdl_free(hdl, zrp, sizeof (zfs_retire_repaired_t));
	}
}

/*
 * Find a pool with a matching GUID.
 */
typedef struct find_cbdata {
	uint64_t	cb_guid;
	zpool_handle_t	*cb_zhp;
	nvlist_t	*cb_vdev;
} find_cbdata_t;
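
/*
 * zpool_iter() callback: returning nonzero stops the iteration, so the
 * handle for the matching pool is left open for the caller to use and
 * eventually close, while every non-matching handle is closed here.
 */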
static int
find_pool(zpool_handle_t *zhp, void *data)
{
	find_cbdata_t *cbp = data;

	if (cbp->cb_guid ==
	    zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL)) {
		cbp->cb_zhp = zhp;
		return (1);
	}

	zpool_close(zhp);
	return (0);
}

/*
 * Find a vdev within a tree with a matching GUID.
 */
static nvlist_t *
find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid)
{
	uint64_t guid;
	nvlist_t **child;
	uint_t c, children;
	nvlist_t *ret;

	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 &&
	    guid == search_guid) {
		fmd_hdl_debug(fmd_module_hdl("zfs-retire"),
		    "matched vdev %llu", guid);
		return (nv);
	}

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0)
		return (NULL);

	for (c = 0; c < children; c++) {
		if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL)
			return (ret);
	}

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
	    &child, &children) != 0)
		return (NULL);

	for (c = 0; c < children; c++) {
		if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL)
			return (ret);
	}

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
	    &child, &children) != 0)
		return (NULL);

	for (c = 0; c < children; c++) {
		if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL)
			return (ret);
	}

	return (NULL);
}

/*
 * Given a (pool, vdev) GUID pair, find the matching pool and vdev.
 */
static zpool_handle_t *
find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid,
    nvlist_t **vdevp)
{
	find_cbdata_t cb;
	zpool_handle_t *zhp;
	nvlist_t *config, *nvroot;

	/*
	 * Find the corresponding pool and make sure the vdev still exists.
	 */
	cb.cb_guid = pool_guid;
	if (zpool_iter(zhdl, find_pool, &cb) != 1)
		return (NULL);

	zhp = cb.cb_zhp;
	config = zpool_get_config(zhp, NULL);
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) != 0) {
		zpool_close(zhp);
		return (NULL);
	}

	if (vdev_guid != 0) {
		if ((*vdevp = find_vdev(zhdl, nvroot, vdev_guid)) == NULL) {
			zpool_close(zhp);
			return (NULL);
		}
	}

	return (zhp);
}

/*
 * Given a vdev, attempt to replace it with every known spare until one
 * succeeds or we run out of devices to try.
 * Return whether we were successful or not in replacing the device.
 */
static boolean_t
replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
{
	nvlist_t *config, *nvroot, *replacement;
	nvlist_t **spares;
	uint_t s, nspares;
	char *dev_name;
	zprop_source_t source;
	int ashift;

	config = zpool_get_config(zhp, NULL);
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) != 0)
		return (B_FALSE);

	/*
	 * Find out if there are any hot spares available in the pool.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0)
		return (B_FALSE);

	/*
	 * lookup "ashift" pool property, we may need it for the replacement
	 */
	ashift = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &source);

	replacement = fmd_nvl_alloc(hdl, FMD_SLEEP);

	(void) nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT);

	dev_name = zpool_vdev_name(NULL, zhp, vdev, B_FALSE);
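
	/*
	 * The "replacement" nvlist mirrors the config that "zpool replace"
	 * builds: a root vdev whose only child is the candidate spare.
	 * Passing replacing=B_TRUE to zpool_vdev_attach() below performs a
	 * replacement rather than a mirror attach, and rebuild=B_TRUE
	 * requests a sequential (rebuild) resilver instead of a healing one.
	 */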

	/*
	 * Try to replace each spare, ending when we successfully
	 * replace it.
	 */
	for (s = 0; s < nspares; s++) {
		boolean_t rebuild = B_FALSE;
		char *spare_name, *type;

		if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH,
		    &spare_name) != 0)
			continue;

		/* prefer sequential resilvering for distributed spares */
		if ((nvlist_lookup_string(spares[s], ZPOOL_CONFIG_TYPE,
		    &type) == 0) && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0)
			rebuild = B_TRUE;

		/* if set, add the "ashift" pool property to the spare nvlist */
		if (source != ZPROP_SRC_DEFAULT)
			(void) nvlist_add_uint64(spares[s],
			    ZPOOL_CONFIG_ASHIFT, ashift);

		(void) nvlist_add_nvlist_array(replacement,
		    ZPOOL_CONFIG_CHILDREN, &spares[s], 1);

		fmd_hdl_debug(hdl, "zpool_vdev_replace '%s' with spare '%s'",
		    dev_name, basename(spare_name));

		if (zpool_vdev_attach(zhp, dev_name, spare_name,
		    replacement, B_TRUE, rebuild) == 0) {
			free(dev_name);
			nvlist_free(replacement);
			return (B_TRUE);
		}
	}

	free(dev_name);
	nvlist_free(replacement);

	return (B_FALSE);
}

/*
 * Repair this vdev if we had diagnosed a 'fault.fs.zfs.device' and
 * ASRU is now usable. ZFS has found the device to be present and
 * functioning.
 */
/*ARGSUSED*/
static void
zfs_vdev_repair(fmd_hdl_t *hdl, nvlist_t *nvl)
{
	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
	zfs_retire_repaired_t *zrp;
	uint64_t pool_guid, vdev_guid;
	if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
	    &pool_guid) != 0 || nvlist_lookup_uint64(nvl,
	    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
		return;

	/*
	 * Before checking the state of the ASRU, go through and see if we've
	 * already made an attempt to repair this ASRU. This list is cleared
	 * whenever we receive any kind of list event, and is designed to
	 * prevent us from generating a feedback loop when we attempt repairs
	 * against a faulted pool. The problem is that checking the unusable
	 * state of the ASRU can involve opening the pool, which can post
	 * statechange events but otherwise leave the pool in the faulted
	 * state. This list allows us to detect when a statechange event is
	 * due to our own request.
	 */
	for (zrp = zdp->zrd_repaired; zrp != NULL; zrp = zrp->zrr_next) {
		if (zrp->zrr_pool == pool_guid &&
		    zrp->zrr_vdev == vdev_guid)
			return;
	}

	zrp = fmd_hdl_alloc(hdl, sizeof (zfs_retire_repaired_t), FMD_SLEEP);
	zrp->zrr_next = zdp->zrd_repaired;
	zrp->zrr_pool = pool_guid;
	zrp->zrr_vdev = vdev_guid;
	zdp->zrd_repaired = zrp;

	fmd_hdl_debug(hdl, "marking repaired vdev %llu on pool %llu",
	    vdev_guid, pool_guid);
}
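
/*
 * Main event handler, dispatching on the event class: device removals and
 * REMOVED state changes trigger hot spare substitution (or an offline for
 * l2arc vdevs); HEALTHY state changes and vdev_remove sysevents mark the
 * vdev as repaired; fault list events fault or degrade the vdev and then
 * attempt to bring in a hot spare; repair list events clear the vdev (or
 * the whole pool when no vdev GUID is present).
 */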
/*ARGSUSED*/
static void
zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
    const char *class)
{
	uint64_t pool_guid, vdev_guid;
	zpool_handle_t *zhp;
	nvlist_t *resource, *fault;
	nvlist_t **faults;
	uint_t f, nfaults;
	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
	libzfs_handle_t *zhdl = zdp->zrd_hdl;
	boolean_t fault_device, degrade_device;
	boolean_t is_repair;
	char *scheme;
	nvlist_t *vdev = NULL;
	char *uuid;
	int repair_done = 0;
	boolean_t retire;
	boolean_t is_disk;
	vdev_aux_t aux;
	uint64_t state = 0;

	fmd_hdl_debug(hdl, "zfs_retire_recv: '%s'", class);

	nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state);

	/*
	 * If this is a resource notifying us of device removal then simply
	 * check for an available spare and continue unless the device is a
	 * l2arc vdev, in which case we just offline it.
	 */
	if (strcmp(class, "resource.fs.zfs.removed") == 0 ||
	    (strcmp(class, "resource.fs.zfs.statechange") == 0 &&
	    state == VDEV_STATE_REMOVED)) {
		char *devtype;
		char *devname;

		if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
		    &pool_guid) != 0 ||
		    nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
		    &vdev_guid) != 0)
			return;

		if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid,
		    &vdev)) == NULL)
			return;

		devname = zpool_vdev_name(NULL, zhp, vdev, B_FALSE);

		/* Can't replace l2arc with a spare: offline the device */
		if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
		    &devtype) == 0 && strcmp(devtype, VDEV_TYPE_L2CACHE) == 0) {
			fmd_hdl_debug(hdl, "zpool_vdev_offline '%s'", devname);
			zpool_vdev_offline(zhp, devname, B_TRUE);
		} else if (!fmd_prop_get_int32(hdl, "spare_on_remove") ||
		    replace_with_spare(hdl, zhp, vdev) == B_FALSE) {
			/* Could not handle with spare */
			fmd_hdl_debug(hdl, "no spare for '%s'", devname);
		}

		free(devname);
		zpool_close(zhp);
		return;
	}

	if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0)
		return;

	/*
	 * Note: on Linux statechange events are more than just
	 * healthy ones so we need to confirm the actual state value.
	 */
	if (strcmp(class, "resource.fs.zfs.statechange") == 0 &&
	    state == VDEV_STATE_HEALTHY) {
		zfs_vdev_repair(hdl, nvl);
		return;
	}
	if (strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) {
		zfs_vdev_repair(hdl, nvl);
		return;
	}
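
	/*
	 * Anything else invalidates the repaired-vdev tracking list
	 * (see the comment in zfs_vdev_repair()).
	 */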
	zfs_retire_clear_data(hdl, zdp);

	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0)
		is_repair = B_TRUE;
	else
		is_repair = B_FALSE;

	/*
	 * We subscribe to zfs faults as well as all repair events.
	 */
	if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
	    &faults, &nfaults) != 0)
		return;

	for (f = 0; f < nfaults; f++) {
		fault = faults[f];

		fault_device = B_FALSE;
		degrade_device = B_FALSE;
		is_disk = B_FALSE;

		if (nvlist_lookup_boolean_value(fault, FM_SUSPECT_RETIRE,
		    &retire) == 0 && retire == 0)
			continue;

		/*
		 * While we subscribe to fault.fs.zfs.*, we only take action
		 * for faults targeting a specific vdev (open failure or SERD
		 * failure). We also subscribe to fault.io.* events, so that
		 * faulty disks will be faulted in the ZFS configuration.
		 */
		if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.io")) {
			fault_device = B_TRUE;
		} else if (fmd_nvl_class_match(hdl, fault,
		    "fault.fs.zfs.vdev.checksum")) {
			degrade_device = B_TRUE;
		} else if (fmd_nvl_class_match(hdl, fault,
		    "fault.fs.zfs.device")) {
			fault_device = B_FALSE;
		} else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) {
			is_disk = B_TRUE;
			fault_device = B_TRUE;
		} else {
			continue;
		}

		if (is_disk) {
			continue;
		} else {
			/*
			 * This is a ZFS fault. Lookup the resource, and
			 * attempt to find the matching vdev.
			 */
			if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE,
			    &resource) != 0 ||
			    nvlist_lookup_string(resource, FM_FMRI_SCHEME,
			    &scheme) != 0)
				continue;

			if (strcmp(scheme, FM_FMRI_SCHEME_ZFS) != 0)
				continue;

			if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL,
			    &pool_guid) != 0)
				continue;

			if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV,
			    &vdev_guid) != 0) {
				if (is_repair)
					vdev_guid = 0;
				else
					continue;
			}

			if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid,
			    &vdev)) == NULL)
				continue;

			aux = VDEV_AUX_ERR_EXCEEDED;
		}

		if (vdev_guid == 0) {
			/*
			 * For pool-level repair events, clear the entire pool.
			 */
			fmd_hdl_debug(hdl, "zpool_clear of pool '%s'",
			    zpool_get_name(zhp));
			(void) zpool_clear(zhp, NULL, NULL);
			zpool_close(zhp);
			continue;
		}

		/*
		 * If this is a repair event, then mark the vdev as repaired and
		 * continue.
		 */
		if (is_repair) {
			repair_done = 1;
			fmd_hdl_debug(hdl, "zpool_clear of pool '%s' vdev %llu",
			    zpool_get_name(zhp), vdev_guid);
			(void) zpool_vdev_clear(zhp, vdev_guid);
			zpool_close(zhp);
			continue;
		}

		/*
		 * Actively fault the device if needed.
		 */
		if (fault_device)
			(void) zpool_vdev_fault(zhp, vdev_guid, aux);
		if (degrade_device)
			(void) zpool_vdev_degrade(zhp, vdev_guid, aux);

		if (fault_device || degrade_device)
			fmd_hdl_debug(hdl, "zpool_vdev_%s: vdev %llu on '%s'",
			    fault_device ? "fault" : "degrade", vdev_guid,
			    zpool_get_name(zhp));

		/*
		 * Attempt to substitute a hot spare.
		 */
		(void) replace_with_spare(hdl, zhp, vdev);

		zpool_close(zhp);
	}

	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && repair_done &&
	    nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0)
		fmd_case_uuresolved(hdl, uuid);
}

static const fmd_hdl_ops_t fmd_ops = {
	zfs_retire_recv,	/* fmdo_recv */
	NULL,			/* fmdo_timeout */
	NULL,			/* fmdo_close */
	NULL,			/* fmdo_stats */
	NULL,			/* fmdo_gc */
};

static const fmd_prop_t fmd_props[] = {
	{ "spare_on_remove", FMD_TYPE_BOOL, "true" },
	{ NULL, 0, NULL }
};

static const fmd_hdl_info_t fmd_info = {
	"ZFS Retire Agent", "1.0", &fmd_ops, fmd_props
};
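
/*
 * Module entry point: grab a libzfs handle and register the agent with fmd.
 * If libzfs can't be initialized the agent is left unregistered.
 */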
void
_zfs_retire_init(fmd_hdl_t *hdl)
{
	zfs_retire_data_t *zdp;
	libzfs_handle_t *zhdl;

	if ((zhdl = libzfs_init()) == NULL)
		return;

	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
		libzfs_fini(zhdl);
		return;
	}

	zdp = fmd_hdl_zalloc(hdl, sizeof (zfs_retire_data_t), FMD_SLEEP);
	zdp->zrd_hdl = zhdl;

	fmd_hdl_setspecific(hdl, zdp);
}

void
_zfs_retire_fini(fmd_hdl_t *hdl)
{
	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);

	if (zdp != NULL) {
		zfs_retire_clear_data(hdl, zdp);
		libzfs_fini(zdp->zrd_hdl);
		fmd_hdl_free(hdl, zdp, sizeof (zfs_retire_data_t));
	}
}