/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * The ZFS retire agent is responsible for managing hot spares across all pools.
 * When we see a device fault or a device removal, we try to open the associated
 * pool and look for any hot spares.  We iterate over any available hot spares
 * and attempt a 'zpool replace' for each one.
 *
 * For vdevs diagnosed as faulty, the agent is also responsible for proactively
 * marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors).
34 */ 35 36 #include <fm/fmd_api.h> 37 #include <sys/fs/zfs.h> 38 #include <sys/fm/protocol.h> 39 #include <sys/fm/fs/zfs.h> 40 #include <libzfs.h> 41 #include <fm/libtopo.h> 42 #include <string.h> 43 44 typedef struct zfs_retire_repaired { 45 struct zfs_retire_repaired *zrr_next; 46 uint64_t zrr_pool; 47 uint64_t zrr_vdev; 48 } zfs_retire_repaired_t; 49 50 typedef struct zfs_retire_data { 51 libzfs_handle_t *zrd_hdl; 52 zfs_retire_repaired_t *zrd_repaired; 53 } zfs_retire_data_t; 54 55 static void 56 zfs_retire_clear_data(fmd_hdl_t *hdl, zfs_retire_data_t *zdp) 57 { 58 zfs_retire_repaired_t *zrp; 59 60 while ((zrp = zdp->zrd_repaired) != NULL) { 61 zdp->zrd_repaired = zrp->zrr_next; 62 fmd_hdl_free(hdl, zrp, sizeof (zfs_retire_repaired_t)); 63 } 64 } 65 66 /* 67 * Find a pool with a matching GUID. 68 */ 69 typedef struct find_cbdata { 70 uint64_t cb_guid; 71 const char *cb_fru; 72 zpool_handle_t *cb_zhp; 73 nvlist_t *cb_vdev; 74 } find_cbdata_t; 75 76 static int 77 find_pool(zpool_handle_t *zhp, void *data) 78 { 79 find_cbdata_t *cbp = data; 80 81 if (cbp->cb_guid == 82 zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL)) { 83 cbp->cb_zhp = zhp; 84 return (1); 85 } 86 87 zpool_close(zhp); 88 return (0); 89 } 90 91 /* 92 * Find a vdev within a tree with a matching GUID. 
93 */ 94 static nvlist_t * 95 find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, const char *search_fru, 96 uint64_t search_guid) 97 { 98 uint64_t guid; 99 nvlist_t **child; 100 uint_t c, children; 101 nvlist_t *ret; 102 char *fru; 103 104 if (search_fru != NULL) { 105 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &fru) == 0 && 106 libzfs_fru_compare(zhdl, fru, search_fru)) 107 return (nv); 108 } else { 109 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && 110 guid == search_guid) 111 return (nv); 112 } 113 114 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 115 &child, &children) != 0) 116 return (NULL); 117 118 for (c = 0; c < children; c++) { 119 if ((ret = find_vdev(zhdl, child[c], search_fru, 120 search_guid)) != NULL) 121 return (ret); 122 } 123 124 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, 125 &child, &children) != 0) 126 return (NULL); 127 128 for (c = 0; c < children; c++) { 129 if ((ret = find_vdev(zhdl, child[c], search_fru, 130 search_guid)) != NULL) 131 return (ret); 132 } 133 134 return (NULL); 135 } 136 137 /* 138 * Given a (pool, vdev) GUID pair, find the matching pool and vdev. 139 */ 140 static zpool_handle_t * 141 find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, 142 nvlist_t **vdevp) 143 { 144 find_cbdata_t cb; 145 zpool_handle_t *zhp; 146 nvlist_t *config, *nvroot; 147 148 /* 149 * Find the corresponding pool and make sure the vdev still exists. 
150 */ 151 cb.cb_guid = pool_guid; 152 if (zpool_iter(zhdl, find_pool, &cb) != 1) 153 return (NULL); 154 155 zhp = cb.cb_zhp; 156 config = zpool_get_config(zhp, NULL); 157 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 158 &nvroot) != 0) { 159 zpool_close(zhp); 160 return (NULL); 161 } 162 163 if (vdev_guid != 0) { 164 if ((*vdevp = find_vdev(zhdl, nvroot, NULL, 165 vdev_guid)) == NULL) { 166 zpool_close(zhp); 167 return (NULL); 168 } 169 } 170 171 return (zhp); 172 } 173 174 static int 175 search_pool(zpool_handle_t *zhp, void *data) 176 { 177 find_cbdata_t *cbp = data; 178 nvlist_t *config; 179 nvlist_t *nvroot; 180 181 config = zpool_get_config(zhp, NULL); 182 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 183 &nvroot) != 0) { 184 zpool_close(zhp); 185 return (0); 186 } 187 188 if ((cbp->cb_vdev = find_vdev(zpool_get_handle(zhp), nvroot, 189 cbp->cb_fru, 0)) != NULL) { 190 cbp->cb_zhp = zhp; 191 return (1); 192 } 193 194 zpool_close(zhp); 195 return (0); 196 } 197 198 /* 199 * Given a FRU FMRI, find the matching pool and vdev. 200 */ 201 static zpool_handle_t * 202 find_by_fru(libzfs_handle_t *zhdl, const char *fru, nvlist_t **vdevp) 203 { 204 find_cbdata_t cb; 205 206 cb.cb_fru = fru; 207 cb.cb_zhp = NULL; 208 if (zpool_iter(zhdl, search_pool, &cb) != 1) 209 return (NULL); 210 211 *vdevp = cb.cb_vdev; 212 return (cb.cb_zhp); 213 } 214 215 /* 216 * Given a vdev, attempt to replace it with every known spare until one 217 * succeeds. 218 */ 219 static void 220 replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) 221 { 222 nvlist_t *config, *nvroot, *replacement; 223 nvlist_t **spares; 224 uint_t s, nspares; 225 char *dev_name; 226 227 config = zpool_get_config(zhp, NULL); 228 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 229 &nvroot) != 0) 230 return; 231 232 /* 233 * Find out if there are any hot spares available in the pool. 
234 */ 235 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 236 &spares, &nspares) != 0) 237 return; 238 239 replacement = fmd_nvl_alloc(hdl, FMD_SLEEP); 240 241 (void) nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE, 242 VDEV_TYPE_ROOT); 243 244 dev_name = zpool_vdev_name(NULL, zhp, vdev, B_FALSE); 245 246 /* 247 * Try to replace each spare, ending when we successfully 248 * replace it. 249 */ 250 for (s = 0; s < nspares; s++) { 251 char *spare_name; 252 253 if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH, 254 &spare_name) != 0) 255 continue; 256 257 (void) nvlist_add_nvlist_array(replacement, 258 ZPOOL_CONFIG_CHILDREN, &spares[s], 1); 259 260 if (zpool_vdev_attach(zhp, dev_name, spare_name, 261 replacement, B_TRUE) == 0) 262 break; 263 } 264 265 free(dev_name); 266 nvlist_free(replacement); 267 } 268 269 /* 270 * Repair this vdev if we had diagnosed a 'fault.fs.zfs.device' and 271 * ASRU is now usable. ZFS has found the device to be present and 272 * functioning. 273 */ 274 /*ARGSUSED*/ 275 void 276 zfs_vdev_repair(fmd_hdl_t *hdl, nvlist_t *nvl) 277 { 278 zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); 279 zfs_retire_repaired_t *zrp; 280 uint64_t pool_guid, vdev_guid; 281 nvlist_t *asru; 282 283 if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, 284 &pool_guid) != 0 || nvlist_lookup_uint64(nvl, 285 FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) 286 return; 287 288 /* 289 * Before checking the state of the ASRU, go through and see if we've 290 * already made an attempt to repair this ASRU. This list is cleared 291 * whenever we receive any kind of list event, and is designed to 292 * prevent us from generating a feedback loop when we attempt repairs 293 * against a faulted pool. The problem is that checking the unusable 294 * state of the ASRU can involve opening the pool, which can post 295 * statechange events but otherwise leave the pool in the faulted 296 * state. 
This list allows us to detect when a statechange event is 297 * due to our own request. 298 */ 299 for (zrp = zdp->zrd_repaired; zrp != NULL; zrp = zrp->zrr_next) { 300 if (zrp->zrr_pool == pool_guid && 301 zrp->zrr_vdev == vdev_guid) 302 return; 303 } 304 305 asru = fmd_nvl_alloc(hdl, FMD_SLEEP); 306 307 (void) nvlist_add_uint8(asru, FM_VERSION, ZFS_SCHEME_VERSION0); 308 (void) nvlist_add_string(asru, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS); 309 (void) nvlist_add_uint64(asru, FM_FMRI_ZFS_POOL, pool_guid); 310 (void) nvlist_add_uint64(asru, FM_FMRI_ZFS_VDEV, vdev_guid); 311 312 /* 313 * We explicitly check for the unusable state here to make sure we 314 * aren't responding to a transient state change. As part of opening a 315 * vdev, it's possible to see the 'statechange' event, only to be 316 * followed by a vdev failure later. If we don't check the current 317 * state of the vdev (or pool) before marking it repaired, then we risk 318 * generating spurious repair events followed immediately by the same 319 * diagnosis. 320 * 321 * This assumes that the ZFS scheme code associated unusable (i.e. 322 * isolated) with its own definition of faulty state. In the case of a 323 * DEGRADED leaf vdev (due to checksum errors), this is not the case. 324 * This works, however, because the transient state change is not 325 * posted in this case. This could be made more explicit by not 326 * relying on the scheme's unusable callback and instead directly 327 * checking the vdev state, where we could correctly account for 328 * DEGRADED state. 
329 */ 330 if (!fmd_nvl_fmri_unusable(hdl, asru) && fmd_nvl_fmri_has_fault(hdl, 331 asru, FMD_HAS_FAULT_ASRU, NULL)) { 332 topo_hdl_t *thp; 333 char *fmri = NULL; 334 int err; 335 336 thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION); 337 if (topo_fmri_nvl2str(thp, asru, &fmri, &err) == 0) 338 (void) fmd_repair_asru(hdl, fmri); 339 fmd_hdl_topo_rele(hdl, thp); 340 341 topo_hdl_strfree(thp, fmri); 342 } 343 344 zrp = fmd_hdl_alloc(hdl, sizeof (zfs_retire_repaired_t), FMD_SLEEP); 345 zrp->zrr_next = zdp->zrd_repaired; 346 zrp->zrr_pool = pool_guid; 347 zrp->zrr_vdev = vdev_guid; 348 zdp->zrd_repaired = zrp; 349 } 350 351 /*ARGSUSED*/ 352 static void 353 zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, 354 const char *class) 355 { 356 uint64_t pool_guid, vdev_guid; 357 zpool_handle_t *zhp; 358 nvlist_t *resource, *fault, *fru; 359 nvlist_t **faults; 360 uint_t f, nfaults; 361 zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); 362 libzfs_handle_t *zhdl = zdp->zrd_hdl; 363 boolean_t fault_device, degrade_device; 364 boolean_t is_repair; 365 char *scheme, *fmri; 366 nvlist_t *vdev; 367 char *uuid; 368 int repair_done = 0; 369 boolean_t retire; 370 boolean_t is_disk; 371 vdev_aux_t aux; 372 topo_hdl_t *thp; 373 int err; 374 375 /* 376 * If this is a resource notifying us of device removal, then simply 377 * check for an available spare and continue. 
378 */ 379 if (strcmp(class, "resource.fs.zfs.removed") == 0) { 380 if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, 381 &pool_guid) != 0 || 382 nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, 383 &vdev_guid) != 0) 384 return; 385 386 if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, 387 &vdev)) == NULL) 388 return; 389 390 if (fmd_prop_get_int32(hdl, "spare_on_remove")) 391 replace_with_spare(hdl, zhp, vdev); 392 zpool_close(zhp); 393 return; 394 } 395 396 if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) 397 return; 398 399 if (strcmp(class, "resource.fs.zfs.statechange") == 0 || 400 strcmp(class, 401 "resource.sysevent.EC_zfs.ESC_ZFS_vdev_remove") == 0) { 402 zfs_vdev_repair(hdl, nvl); 403 return; 404 } 405 406 zfs_retire_clear_data(hdl, zdp); 407 408 if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) 409 is_repair = B_TRUE; 410 else 411 is_repair = B_FALSE; 412 413 /* 414 * We subscribe to zfs faults as well as all repair events. 415 */ 416 if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 417 &faults, &nfaults) != 0) 418 return; 419 420 for (f = 0; f < nfaults; f++) { 421 fault = faults[f]; 422 423 fault_device = B_FALSE; 424 degrade_device = B_FALSE; 425 is_disk = B_FALSE; 426 427 if (nvlist_lookup_boolean_value(fault, FM_SUSPECT_RETIRE, 428 &retire) == 0 && retire == 0) 429 continue; 430 431 /* 432 * While we subscribe to fault.fs.zfs.*, we only take action 433 * for faults targeting a specific vdev (open failure or SERD 434 * failure). We also subscribe to fault.io.* events, so that 435 * faulty disks will be faulted in the ZFS configuration. 
436 */ 437 if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.io")) { 438 fault_device = B_TRUE; 439 } else if (fmd_nvl_class_match(hdl, fault, 440 "fault.fs.zfs.vdev.checksum")) { 441 degrade_device = B_TRUE; 442 } else if (fmd_nvl_class_match(hdl, fault, 443 "fault.fs.zfs.device")) { 444 fault_device = B_FALSE; 445 } else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) { 446 is_disk = B_TRUE; 447 fault_device = B_TRUE; 448 } else { 449 continue; 450 } 451 452 if (is_disk) { 453 /* 454 * This is a disk fault. Lookup the FRU, convert it to 455 * an FMRI string, and attempt to find a matching vdev. 456 */ 457 if (nvlist_lookup_nvlist(fault, FM_FAULT_FRU, 458 &fru) != 0 || 459 nvlist_lookup_string(fru, FM_FMRI_SCHEME, 460 &scheme) != 0) 461 continue; 462 463 if (strcmp(scheme, FM_FMRI_SCHEME_HC) != 0) 464 continue; 465 466 thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION); 467 if (topo_fmri_nvl2str(thp, fru, &fmri, &err) != 0) { 468 fmd_hdl_topo_rele(hdl, thp); 469 continue; 470 } 471 472 zhp = find_by_fru(zhdl, fmri, &vdev); 473 topo_hdl_strfree(thp, fmri); 474 fmd_hdl_topo_rele(hdl, thp); 475 476 if (zhp == NULL) 477 continue; 478 479 (void) nvlist_lookup_uint64(vdev, 480 ZPOOL_CONFIG_GUID, &vdev_guid); 481 aux = VDEV_AUX_EXTERNAL; 482 } else { 483 /* 484 * This is a ZFS fault. Lookup the resource, and 485 * attempt to find the matching vdev. 
486 */ 487 if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, 488 &resource) != 0 || 489 nvlist_lookup_string(resource, FM_FMRI_SCHEME, 490 &scheme) != 0) 491 continue; 492 493 if (strcmp(scheme, FM_FMRI_SCHEME_ZFS) != 0) 494 continue; 495 496 if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL, 497 &pool_guid) != 0) 498 continue; 499 500 if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV, 501 &vdev_guid) != 0) { 502 if (is_repair) 503 vdev_guid = 0; 504 else 505 continue; 506 } 507 508 if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, 509 &vdev)) == NULL) 510 continue; 511 512 aux = VDEV_AUX_ERR_EXCEEDED; 513 } 514 515 if (vdev_guid == 0) { 516 /* 517 * For pool-level repair events, clear the entire pool. 518 */ 519 (void) zpool_clear(zhp, NULL, NULL); 520 zpool_close(zhp); 521 continue; 522 } 523 524 /* 525 * If this is a repair event, then mark the vdev as repaired and 526 * continue. 527 */ 528 if (is_repair) { 529 repair_done = 1; 530 (void) zpool_vdev_clear(zhp, vdev_guid); 531 zpool_close(zhp); 532 continue; 533 } 534 535 /* 536 * Actively fault the device if needed. 537 */ 538 if (fault_device) 539 (void) zpool_vdev_fault(zhp, vdev_guid, aux); 540 if (degrade_device) 541 (void) zpool_vdev_degrade(zhp, vdev_guid, aux); 542 543 /* 544 * Attempt to substitute a hot spare. 
545 */ 546 replace_with_spare(hdl, zhp, vdev); 547 zpool_close(zhp); 548 } 549 550 if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && repair_done && 551 nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0) 552 fmd_case_uuresolved(hdl, uuid); 553 } 554 555 static const fmd_hdl_ops_t fmd_ops = { 556 zfs_retire_recv, /* fmdo_recv */ 557 NULL, /* fmdo_timeout */ 558 NULL, /* fmdo_close */ 559 NULL, /* fmdo_stats */ 560 NULL, /* fmdo_gc */ 561 }; 562 563 static const fmd_prop_t fmd_props[] = { 564 { "spare_on_remove", FMD_TYPE_BOOL, "true" }, 565 { NULL, 0, NULL } 566 }; 567 568 static const fmd_hdl_info_t fmd_info = { 569 "ZFS Retire Agent", "1.0", &fmd_ops, fmd_props 570 }; 571 572 void 573 _fmd_init(fmd_hdl_t *hdl) 574 { 575 zfs_retire_data_t *zdp; 576 libzfs_handle_t *zhdl; 577 578 if ((zhdl = libzfs_init()) == NULL) 579 return; 580 581 if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) { 582 libzfs_fini(zhdl); 583 return; 584 } 585 586 zdp = fmd_hdl_zalloc(hdl, sizeof (zfs_retire_data_t), FMD_SLEEP); 587 zdp->zrd_hdl = zhdl; 588 589 fmd_hdl_setspecific(hdl, zdp); 590 } 591 592 void 593 _fmd_fini(fmd_hdl_t *hdl) 594 { 595 zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); 596 597 if (zdp != NULL) { 598 zfs_retire_clear_data(hdl, zdp); 599 libzfs_fini(zdp->zrd_hdl); 600 fmd_hdl_free(hdl, zdp, sizeof (zfs_retire_data_t)); 601 } 602 } 603