1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. 23 * 24 * Copyright (c) 2016, Intel Corporation. 25 * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com> 26 */ 27 28 /* 29 * The ZFS retire agent is responsible for managing hot spares across all pools. 30 * When we see a device fault or a device removal, we try to open the associated 31 * pool and look for any hot spares. We iterate over any available hot spares 32 * and attempt a 'zpool replace' for each one. 33 * 34 * For vdevs diagnosed as faulty, the agent is also responsible for proactively 35 * marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors). 36 */ 37 38 #include <sys/fs/zfs.h> 39 #include <sys/fm/protocol.h> 40 #include <sys/fm/fs/zfs.h> 41 #include <libzutil.h> 42 #include <libzfs.h> 43 #include <string.h> 44 45 #include "zfs_agents.h" 46 #include "fmd_api.h" 47 48 49 typedef struct zfs_retire_repaired { 50 struct zfs_retire_repaired *zrr_next; 51 uint64_t zrr_pool; 52 uint64_t zrr_vdev; 53 } zfs_retire_repaired_t; 54 55 typedef struct zfs_retire_data { 56 libzfs_handle_t *zrd_hdl; 57 zfs_retire_repaired_t *zrd_repaired; 58 } zfs_retire_data_t; 59 60 static void 61 zfs_retire_clear_data(fmd_hdl_t *hdl, zfs_retire_data_t *zdp) 62 { 63 zfs_retire_repaired_t *zrp; 64 65 while ((zrp = zdp->zrd_repaired) != NULL) { 66 zdp->zrd_repaired = zrp->zrr_next; 67 fmd_hdl_free(hdl, zrp, sizeof (zfs_retire_repaired_t)); 68 } 69 } 70 71 /* 72 * Find a pool with a matching GUID. 73 */ 74 typedef struct find_cbdata { 75 uint64_t cb_guid; 76 zpool_handle_t *cb_zhp; 77 nvlist_t *cb_vdev; 78 } find_cbdata_t; 79 80 static int 81 find_pool(zpool_handle_t *zhp, void *data) 82 { 83 find_cbdata_t *cbp = data; 84 85 if (cbp->cb_guid == 86 zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL)) { 87 cbp->cb_zhp = zhp; 88 return (1); 89 } 90 91 zpool_close(zhp); 92 return (0); 93 } 94 95 /* 96 * Find a vdev within a tree with a matching GUID. 97 */ 98 static nvlist_t * 99 find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid) 100 { 101 uint64_t guid; 102 nvlist_t **child; 103 uint_t c, children; 104 nvlist_t *ret; 105 106 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && 107 guid == search_guid) { 108 fmd_hdl_debug(fmd_module_hdl("zfs-retire"), 109 "matched vdev %llu", guid); 110 return (nv); 111 } 112 113 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 114 &child, &children) != 0) 115 return (NULL); 116 117 for (c = 0; c < children; c++) { 118 if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) 119 return (ret); 120 } 121 122 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, 123 &child, &children) != 0) 124 return (NULL); 125 126 for (c = 0; c < children; c++) { 127 if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) 128 return (ret); 129 } 130 131 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 132 &child, &children) != 0) 133 return (NULL); 134 135 for (c = 0; c < children; c++) { 136 if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) 137 return (ret); 138 } 139 140 return (NULL); 141 } 142 143 /* 144 * Given a (pool, vdev) GUID pair, find the matching pool and vdev. 145 */ 146 static zpool_handle_t * 147 find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, 148 nvlist_t **vdevp) 149 { 150 find_cbdata_t cb; 151 zpool_handle_t *zhp; 152 nvlist_t *config, *nvroot; 153 154 /* 155 * Find the corresponding pool and make sure the vdev still exists. 156 */ 157 cb.cb_guid = pool_guid; 158 if (zpool_iter(zhdl, find_pool, &cb) != 1) 159 return (NULL); 160 161 zhp = cb.cb_zhp; 162 config = zpool_get_config(zhp, NULL); 163 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 164 &nvroot) != 0) { 165 zpool_close(zhp); 166 return (NULL); 167 } 168 169 if (vdev_guid != 0) { 170 if ((*vdevp = find_vdev(zhdl, nvroot, vdev_guid)) == NULL) { 171 zpool_close(zhp); 172 return (NULL); 173 } 174 } 175 176 return (zhp); 177 } 178 179 /* 180 * Given a vdev, attempt to replace it with every known spare until one 181 * succeeds or we run out of devices to try. 182 * Return whether we were successful or not in replacing the device. 183 */ 184 static boolean_t 185 replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) 186 { 187 nvlist_t *config, *nvroot, *replacement; 188 nvlist_t **spares; 189 uint_t s, nspares; 190 char *dev_name; 191 zprop_source_t source; 192 int ashift; 193 194 config = zpool_get_config(zhp, NULL); 195 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 196 &nvroot) != 0) 197 return (B_FALSE); 198 199 /* 200 * Find out if there are any hot spares available in the pool. 201 */ 202 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 203 &spares, &nspares) != 0) 204 return (B_FALSE); 205 206 /* 207 * lookup "ashift" pool property, we may need it for the replacement 208 */ 209 ashift = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &source); 210 211 replacement = fmd_nvl_alloc(hdl, FMD_SLEEP); 212 213 (void) nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE, 214 VDEV_TYPE_ROOT); 215 216 dev_name = zpool_vdev_name(NULL, zhp, vdev, B_FALSE); 217 218 /* 219 * Try to replace each spare, ending when we successfully 220 * replace it. 221 */ 222 for (s = 0; s < nspares; s++) { 223 boolean_t rebuild = B_FALSE; 224 char *spare_name, *type; 225 226 if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH, 227 &spare_name) != 0) 228 continue; 229 230 /* prefer sequential resilvering for distributed spares */ 231 if ((nvlist_lookup_string(spares[s], ZPOOL_CONFIG_TYPE, 232 &type) == 0) && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) 233 rebuild = B_TRUE; 234 235 /* if set, add the "ashift" pool property to the spare nvlist */ 236 if (source != ZPROP_SRC_DEFAULT) 237 (void) nvlist_add_uint64(spares[s], 238 ZPOOL_CONFIG_ASHIFT, ashift); 239 240 (void) nvlist_add_nvlist_array(replacement, 241 ZPOOL_CONFIG_CHILDREN, &spares[s], 1); 242 243 fmd_hdl_debug(hdl, "zpool_vdev_replace '%s' with spare '%s'", 244 dev_name, zfs_basename(spare_name)); 245 246 if (zpool_vdev_attach(zhp, dev_name, spare_name, 247 replacement, B_TRUE, rebuild) == 0) { 248 free(dev_name); 249 nvlist_free(replacement); 250 return (B_TRUE); 251 } 252 } 253 254 free(dev_name); 255 nvlist_free(replacement); 256 257 return (B_FALSE); 258 } 259 260 /* 261 * Repair this vdev if we had diagnosed a 'fault.fs.zfs.device' and 262 * ASRU is now usable. ZFS has found the device to be present and 263 * functioning. 264 */ 265 /*ARGSUSED*/ 266 static void 267 zfs_vdev_repair(fmd_hdl_t *hdl, nvlist_t *nvl) 268 { 269 zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); 270 zfs_retire_repaired_t *zrp; 271 uint64_t pool_guid, vdev_guid; 272 if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, 273 &pool_guid) != 0 || nvlist_lookup_uint64(nvl, 274 FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) 275 return; 276 277 /* 278 * Before checking the state of the ASRU, go through and see if we've 279 * already made an attempt to repair this ASRU. This list is cleared 280 * whenever we receive any kind of list event, and is designed to 281 * prevent us from generating a feedback loop when we attempt repairs 282 * against a faulted pool. The problem is that checking the unusable 283 * state of the ASRU can involve opening the pool, which can post 284 * statechange events but otherwise leave the pool in the faulted 285 * state. This list allows us to detect when a statechange event is 286 * due to our own request. 287 */ 288 for (zrp = zdp->zrd_repaired; zrp != NULL; zrp = zrp->zrr_next) { 289 if (zrp->zrr_pool == pool_guid && 290 zrp->zrr_vdev == vdev_guid) 291 return; 292 } 293 294 zrp = fmd_hdl_alloc(hdl, sizeof (zfs_retire_repaired_t), FMD_SLEEP); 295 zrp->zrr_next = zdp->zrd_repaired; 296 zrp->zrr_pool = pool_guid; 297 zrp->zrr_vdev = vdev_guid; 298 zdp->zrd_repaired = zrp; 299 300 fmd_hdl_debug(hdl, "marking repaired vdev %llu on pool %llu", 301 vdev_guid, pool_guid); 302 } 303 304 /*ARGSUSED*/ 305 static void 306 zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, 307 const char *class) 308 { 309 uint64_t pool_guid, vdev_guid; 310 zpool_handle_t *zhp; 311 nvlist_t *resource, *fault; 312 nvlist_t **faults; 313 uint_t f, nfaults; 314 zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); 315 libzfs_handle_t *zhdl = zdp->zrd_hdl; 316 boolean_t fault_device, degrade_device; 317 boolean_t is_repair; 318 char *scheme; 319 nvlist_t *vdev = NULL; 320 char *uuid; 321 int repair_done = 0; 322 boolean_t retire; 323 boolean_t is_disk; 324 vdev_aux_t aux; 325 uint64_t state = 0; 326 327 fmd_hdl_debug(hdl, "zfs_retire_recv: '%s'", class); 328 329 nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state); 330 331 /* 332 * If this is a resource notifying us of device removal then simply 333 * check for an available spare and continue unless the device is a 334 * l2arc vdev, in which case we just offline it. 335 */ 336 if (strcmp(class, "resource.fs.zfs.removed") == 0 || 337 (strcmp(class, "resource.fs.zfs.statechange") == 0 && 338 (state == VDEV_STATE_REMOVED || state == VDEV_STATE_FAULTED))) { 339 char *devtype; 340 char *devname; 341 342 if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, 343 &pool_guid) != 0 || 344 nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, 345 &vdev_guid) != 0) 346 return; 347 348 if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, 349 &vdev)) == NULL) 350 return; 351 352 devname = zpool_vdev_name(NULL, zhp, vdev, B_FALSE); 353 354 /* Can't replace l2arc with a spare: offline the device */ 355 if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, 356 &devtype) == 0 && strcmp(devtype, VDEV_TYPE_L2CACHE) == 0) { 357 fmd_hdl_debug(hdl, "zpool_vdev_offline '%s'", devname); 358 zpool_vdev_offline(zhp, devname, B_TRUE); 359 } else if (!fmd_prop_get_int32(hdl, "spare_on_remove") || 360 replace_with_spare(hdl, zhp, vdev) == B_FALSE) { 361 /* Could not handle with spare */ 362 fmd_hdl_debug(hdl, "no spare for '%s'", devname); 363 } 364 365 free(devname); 366 zpool_close(zhp); 367 return; 368 } 369 370 if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) 371 return; 372 373 /* 374 * Note: on Linux statechange events are more than just 375 * healthy ones so we need to confirm the actual state value. 376 */ 377 if (strcmp(class, "resource.fs.zfs.statechange") == 0 && 378 state == VDEV_STATE_HEALTHY) { 379 zfs_vdev_repair(hdl, nvl); 380 return; 381 } 382 if (strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) { 383 zfs_vdev_repair(hdl, nvl); 384 return; 385 } 386 387 zfs_retire_clear_data(hdl, zdp); 388 389 if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) 390 is_repair = B_TRUE; 391 else 392 is_repair = B_FALSE; 393 394 /* 395 * We subscribe to zfs faults as well as all repair events. 396 */ 397 if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 398 &faults, &nfaults) != 0) 399 return; 400 401 for (f = 0; f < nfaults; f++) { 402 fault = faults[f]; 403 404 fault_device = B_FALSE; 405 degrade_device = B_FALSE; 406 is_disk = B_FALSE; 407 408 if (nvlist_lookup_boolean_value(fault, FM_SUSPECT_RETIRE, 409 &retire) == 0 && retire == 0) 410 continue; 411 412 /* 413 * While we subscribe to fault.fs.zfs.*, we only take action 414 * for faults targeting a specific vdev (open failure or SERD 415 * failure). We also subscribe to fault.io.* events, so that 416 * faulty disks will be faulted in the ZFS configuration. 417 */ 418 if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.io")) { 419 fault_device = B_TRUE; 420 } else if (fmd_nvl_class_match(hdl, fault, 421 "fault.fs.zfs.vdev.checksum")) { 422 degrade_device = B_TRUE; 423 } else if (fmd_nvl_class_match(hdl, fault, 424 "fault.fs.zfs.device")) { 425 fault_device = B_FALSE; 426 } else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) { 427 is_disk = B_TRUE; 428 fault_device = B_TRUE; 429 } else { 430 continue; 431 } 432 433 if (is_disk) { 434 continue; 435 } else { 436 /* 437 * This is a ZFS fault. Lookup the resource, and 438 * attempt to find the matching vdev. 439 */ 440 if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, 441 &resource) != 0 || 442 nvlist_lookup_string(resource, FM_FMRI_SCHEME, 443 &scheme) != 0) 444 continue; 445 446 if (strcmp(scheme, FM_FMRI_SCHEME_ZFS) != 0) 447 continue; 448 449 if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL, 450 &pool_guid) != 0) 451 continue; 452 453 if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV, 454 &vdev_guid) != 0) { 455 if (is_repair) 456 vdev_guid = 0; 457 else 458 continue; 459 } 460 461 if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, 462 &vdev)) == NULL) 463 continue; 464 465 aux = VDEV_AUX_ERR_EXCEEDED; 466 } 467 468 if (vdev_guid == 0) { 469 /* 470 * For pool-level repair events, clear the entire pool. 471 */ 472 fmd_hdl_debug(hdl, "zpool_clear of pool '%s'", 473 zpool_get_name(zhp)); 474 (void) zpool_clear(zhp, NULL, NULL); 475 zpool_close(zhp); 476 continue; 477 } 478 479 /* 480 * If this is a repair event, then mark the vdev as repaired and 481 * continue. 482 */ 483 if (is_repair) { 484 repair_done = 1; 485 fmd_hdl_debug(hdl, "zpool_clear of pool '%s' vdev %llu", 486 zpool_get_name(zhp), vdev_guid); 487 (void) zpool_vdev_clear(zhp, vdev_guid); 488 zpool_close(zhp); 489 continue; 490 } 491 492 /* 493 * Actively fault the device if needed. 494 */ 495 if (fault_device) 496 (void) zpool_vdev_fault(zhp, vdev_guid, aux); 497 if (degrade_device) 498 (void) zpool_vdev_degrade(zhp, vdev_guid, aux); 499 500 if (fault_device || degrade_device) 501 fmd_hdl_debug(hdl, "zpool_vdev_%s: vdev %llu on '%s'", 502 fault_device ? "fault" : "degrade", vdev_guid, 503 zpool_get_name(zhp)); 504 505 /* 506 * Attempt to substitute a hot spare. 507 */ 508 (void) replace_with_spare(hdl, zhp, vdev); 509 510 zpool_close(zhp); 511 } 512 513 if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && repair_done && 514 nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0) 515 fmd_case_uuresolved(hdl, uuid); 516 } 517 518 static const fmd_hdl_ops_t fmd_ops = { 519 zfs_retire_recv, /* fmdo_recv */ 520 NULL, /* fmdo_timeout */ 521 NULL, /* fmdo_close */ 522 NULL, /* fmdo_stats */ 523 NULL, /* fmdo_gc */ 524 }; 525 526 static const fmd_prop_t fmd_props[] = { 527 { "spare_on_remove", FMD_TYPE_BOOL, "true" }, 528 { NULL, 0, NULL } 529 }; 530 531 static const fmd_hdl_info_t fmd_info = { 532 "ZFS Retire Agent", "1.0", &fmd_ops, fmd_props 533 }; 534 535 void 536 _zfs_retire_init(fmd_hdl_t *hdl) 537 { 538 zfs_retire_data_t *zdp; 539 libzfs_handle_t *zhdl; 540 541 if ((zhdl = libzfs_init()) == NULL) 542 return; 543 544 if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) { 545 libzfs_fini(zhdl); 546 return; 547 } 548 549 zdp = fmd_hdl_zalloc(hdl, sizeof (zfs_retire_data_t), FMD_SLEEP); 550 zdp->zrd_hdl = zhdl; 551 552 fmd_hdl_setspecific(hdl, zdp); 553 } 554 555 void 556 _zfs_retire_fini(fmd_hdl_t *hdl) 557 { 558 zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); 559 560 if (zdp != NULL) { 561 zfs_retire_clear_data(hdl, zdp); 562 libzfs_fini(zdp->zrd_hdl); 563 fmd_hdl_free(hdl, zdp, sizeof (zfs_retire_data_t)); 564 } 565 } 566