1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012 by Delphix. All rights reserved. 24 * Copyright 2014 Nexenta Systems, Inc. All rights reserved. 25 * Copyright (c) 2016, 2017, Intel Corporation. 26 * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. 27 */ 28 29 /* 30 * ZFS syseventd module. 31 * 32 * file origin: openzfs/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c 33 * 34 * The purpose of this module is to identify when devices are added to the 35 * system, and appropriately online or replace the affected vdevs. 36 * 37 * When a device is added to the system: 38 * 39 * 1. Search for any vdevs whose devid matches that of the newly added 40 * device. 41 * 42 * 2. If no vdevs are found, then search for any vdevs whose udev path 43 * matches that of the new device. 44 * 45 * 3. If no vdevs match by either method, then ignore the event. 46 * 47 * 4. Attempt to online the device with a flag to indicate that it should 48 * be unspared when resilvering completes. If this succeeds, then the 49 * same device was inserted and we should continue normally. 50 * 51 * 5. If the pool does not have the 'autoreplace' property set, attempt to 52 * online the device again without the unspare flag, which will 53 * generate a FMA fault. 54 * 55 * 6. If the pool has the 'autoreplace' property set, and the matching vdev 56 * is a whole disk, then label the new disk and attempt a 'zpool 57 * replace'. 58 * 59 * The module responds to EC_DEV_ADD events. The special ESC_ZFS_VDEV_CHECK 60 * event indicates that a device failed to open during pool load, but the 61 * autoreplace property was set. In this case, we deferred the associated 62 * FMA fault until our module had a chance to process the autoreplace logic. 63 * If the device could not be replaced, then the second online attempt will 64 * trigger the FMA fault that we skipped earlier. 65 * 66 * On Linux udev provides a disk insert for both the disk and the partition. 67 */ 68 69 #include <ctype.h> 70 #include <fcntl.h> 71 #include <libnvpair.h> 72 #include <libzfs.h> 73 #include <libzutil.h> 74 #include <limits.h> 75 #include <stddef.h> 76 #include <stdlib.h> 77 #include <string.h> 78 #include <syslog.h> 79 #include <sys/list.h> 80 #include <sys/sunddi.h> 81 #include <sys/sysevent/eventdefs.h> 82 #include <sys/sysevent/dev.h> 83 #include <thread_pool.h> 84 #include <pthread.h> 85 #include <unistd.h> 86 #include <errno.h> 87 #include "zfs_agents.h" 88 #include "../zed_log.h" 89 90 #define DEV_BYID_PATH "/dev/disk/by-id/" 91 #define DEV_BYPATH_PATH "/dev/disk/by-path/" 92 #define DEV_BYVDEV_PATH "/dev/disk/by-vdev/" 93 94 typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t); 95 96 libzfs_handle_t *g_zfshdl; 97 list_t g_pool_list; /* list of unavailable pools at initialization */ 98 list_t g_device_list; /* list of disks with asynchronous label request */ 99 tpool_t *g_tpool; 100 boolean_t g_enumeration_done; 101 pthread_t g_zfs_tid; /* zfs_enum_pools() thread */ 102 103 typedef struct unavailpool { 104 zpool_handle_t *uap_zhp; 105 list_node_t uap_node; 106 } unavailpool_t; 107 108 typedef struct pendingdev { 109 char pd_physpath[128]; 110 list_node_t pd_node; 111 } pendingdev_t; 112 113 static int 114 zfs_toplevel_state(zpool_handle_t *zhp) 115 { 116 nvlist_t *nvroot; 117 vdev_stat_t *vs; 118 unsigned int c; 119 120 verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), 121 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 122 verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, 123 (uint64_t **)&vs, &c) == 0); 124 return (vs->vs_state); 125 } 126 127 static int 128 zfs_unavail_pool(zpool_handle_t *zhp, void *data) 129 { 130 zed_log_msg(LOG_INFO, "zfs_unavail_pool: examining '%s' (state %d)", 131 zpool_get_name(zhp), (int)zfs_toplevel_state(zhp)); 132 133 if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED) { 134 unavailpool_t *uap; 135 uap = malloc(sizeof (unavailpool_t)); 136 uap->uap_zhp = zhp; 137 list_insert_tail((list_t *)data, uap); 138 } else { 139 zpool_close(zhp); 140 } 141 return (0); 142 } 143 144 /* 145 * Two stage replace on Linux 146 * since we get disk notifications 147 * we can wait for partitioned disk slice to show up! 148 * 149 * First stage tags the disk, initiates async partitioning, and returns 150 * Second stage finds the tag and proceeds to ZFS labeling/replace 151 * 152 * disk-add --> label-disk + tag-disk --> partition-add --> zpool_vdev_attach 153 * 154 * 1. physical match with no fs, no partition 155 * tag it top, partition disk 156 * 157 * 2. physical match again, see partition and tag 158 * 159 */ 160 161 /* 162 * The device associated with the given vdev (either by devid or physical path) 163 * has been added to the system. If 'isdisk' is set, then we only attempt a 164 * replacement if it's a whole disk. This also implies that we should label the 165 * disk first. 166 * 167 * First, we attempt to online the device (making sure to undo any spare 168 * operation when finished). If this succeeds, then we're done. If it fails, 169 * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened, 170 * but that the label was not what we expected. If the 'autoreplace' property 171 * is enabled, then we relabel the disk (if specified), and attempt a 'zpool 172 * replace'. If the online is successful, but the new state is something else 173 * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of 174 * race, and we should avoid attempting to relabel the disk. 175 * 176 * Also can arrive here from a ESC_ZFS_VDEV_CHECK event 177 */ 178 static void 179 zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) 180 { 181 char *path; 182 vdev_state_t newstate; 183 nvlist_t *nvroot, *newvd; 184 pendingdev_t *device; 185 uint64_t wholedisk = 0ULL; 186 uint64_t offline = 0ULL, faulted = 0ULL; 187 uint64_t guid = 0ULL; 188 char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL; 189 char rawpath[PATH_MAX], fullpath[PATH_MAX]; 190 char devpath[PATH_MAX]; 191 int ret; 192 boolean_t is_sd = B_FALSE; 193 boolean_t is_mpath_wholedisk = B_FALSE; 194 uint_t c; 195 vdev_stat_t *vs; 196 197 if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0) 198 return; 199 200 /* Skip healthy disks */ 201 verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS, 202 (uint64_t **)&vs, &c) == 0); 203 if (vs->vs_state == VDEV_STATE_HEALTHY) { 204 zed_log_msg(LOG_INFO, "%s: %s is already healthy, skip it.", 205 __func__, path); 206 return; 207 } 208 209 (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath); 210 (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, 211 &enc_sysfs_path); 212 (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); 213 (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline); 214 (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_FAULTED, &faulted); 215 216 (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &guid); 217 218 /* 219 * Special case: 220 * 221 * We've seen times where a disk won't have a ZPOOL_CONFIG_PHYS_PATH 222 * entry in their config. For example, on this force-faulted disk: 223 * 224 * children[0]: 225 * type: 'disk' 226 * id: 0 227 * guid: 14309659774640089719 228 * path: '/dev/disk/by-vdev/L28' 229 * whole_disk: 0 230 * DTL: 654 231 * create_txg: 4 232 * com.delphix:vdev_zap_leaf: 1161 233 * faulted: 1 234 * aux_state: 'external' 235 * children[1]: 236 * type: 'disk' 237 * id: 1 238 * guid: 16002508084177980912 239 * path: '/dev/disk/by-vdev/L29' 240 * devid: 'dm-uuid-mpath-35000c500a61d68a3' 241 * phys_path: 'L29' 242 * vdev_enc_sysfs_path: '/sys/class/enclosure/0:0:1:0/SLOT 30 32' 243 * whole_disk: 0 244 * DTL: 1028 245 * create_txg: 4 246 * com.delphix:vdev_zap_leaf: 131 247 * 248 * If the disk's path is a /dev/disk/by-vdev/ path, then we can infer 249 * the ZPOOL_CONFIG_PHYS_PATH from the by-vdev disk name. 250 */ 251 if (physpath == NULL && path != NULL) { 252 /* If path begins with "/dev/disk/by-vdev/" ... */ 253 if (strncmp(path, DEV_BYVDEV_PATH, 254 strlen(DEV_BYVDEV_PATH)) == 0) { 255 /* Set physpath to the char after "/dev/disk/by-vdev" */ 256 physpath = &path[strlen(DEV_BYVDEV_PATH)]; 257 } 258 } 259 260 /* 261 * We don't want to autoreplace offlined disks. However, we do want to 262 * replace force-faulted disks (`zpool offline -f`). Force-faulted 263 * disks have both offline=1 and faulted=1 in the nvlist. 264 */ 265 if (offline && !faulted) { 266 zed_log_msg(LOG_INFO, "%s: %s is offline, skip autoreplace", 267 __func__, path); 268 return; 269 } 270 271 is_mpath_wholedisk = is_mpath_whole_disk(path); 272 zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'" 273 " %s blank disk, %s mpath blank disk, %s labeled, enc sysfs '%s', " 274 "(guid %llu)", 275 zpool_get_name(zhp), path, 276 physpath ? physpath : "NULL", 277 wholedisk ? "is" : "not", 278 is_mpath_wholedisk? "is" : "not", 279 labeled ? "is" : "not", 280 enc_sysfs_path, 281 (long long unsigned int)guid); 282 283 /* 284 * The VDEV guid is preferred for identification (gets passed in path) 285 */ 286 if (guid != 0) { 287 (void) snprintf(fullpath, sizeof (fullpath), "%llu", 288 (long long unsigned int)guid); 289 } else { 290 /* 291 * otherwise use path sans partition suffix for whole disks 292 */ 293 (void) strlcpy(fullpath, path, sizeof (fullpath)); 294 if (wholedisk) { 295 char *spath = zfs_strip_partition(fullpath); 296 if (!spath) { 297 zed_log_msg(LOG_INFO, "%s: Can't alloc", 298 __func__); 299 return; 300 } 301 302 (void) strlcpy(fullpath, spath, sizeof (fullpath)); 303 free(spath); 304 } 305 } 306 307 /* 308 * Attempt to online the device. 309 */ 310 if (zpool_vdev_online(zhp, fullpath, 311 ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 && 312 (newstate == VDEV_STATE_HEALTHY || 313 newstate == VDEV_STATE_DEGRADED)) { 314 zed_log_msg(LOG_INFO, 315 " zpool_vdev_online: vdev '%s' ('%s') is " 316 "%s", fullpath, physpath, (newstate == VDEV_STATE_HEALTHY) ? 317 "HEALTHY" : "DEGRADED"); 318 return; 319 } 320 321 /* 322 * vdev_id alias rule for using scsi_debug devices (FMA automated 323 * testing) 324 */ 325 if (physpath != NULL && strcmp("scsidebug", physpath) == 0) 326 is_sd = B_TRUE; 327 328 /* 329 * If the pool doesn't have the autoreplace property set, then use 330 * vdev online to trigger a FMA fault by posting an ereport. 331 */ 332 if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) || 333 !(wholedisk || is_mpath_wholedisk) || (physpath == NULL)) { 334 (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, 335 &newstate); 336 zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or " 337 "not a blank disk for '%s' ('%s')", fullpath, 338 physpath); 339 return; 340 } 341 342 /* 343 * Convert physical path into its current device node. Rawpath 344 * needs to be /dev/disk/by-vdev for a scsi_debug device since 345 * /dev/disk/by-path will not be present. 346 */ 347 (void) snprintf(rawpath, sizeof (rawpath), "%s%s", 348 is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath); 349 350 if (realpath(rawpath, devpath) == NULL && !is_mpath_wholedisk) { 351 zed_log_msg(LOG_INFO, " realpath: %s failed (%s)", 352 rawpath, strerror(errno)); 353 354 (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, 355 &newstate); 356 357 zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s)", 358 fullpath, libzfs_error_description(g_zfshdl)); 359 return; 360 } 361 362 /* Only autoreplace bad disks */ 363 if ((vs->vs_state != VDEV_STATE_DEGRADED) && 364 (vs->vs_state != VDEV_STATE_FAULTED) && 365 (vs->vs_state != VDEV_STATE_CANT_OPEN)) { 366 zed_log_msg(LOG_INFO, " not autoreplacing since disk isn't in " 367 "a bad state (currently %d)", vs->vs_state); 368 return; 369 } 370 371 nvlist_lookup_string(vdev, "new_devid", &new_devid); 372 373 if (is_mpath_wholedisk) { 374 /* Don't label device mapper or multipath disks. */ 375 } else if (!labeled) { 376 /* 377 * we're auto-replacing a raw disk, so label it first 378 */ 379 char *leafname; 380 381 /* 382 * If this is a request to label a whole disk, then attempt to 383 * write out the label. Before we can label the disk, we need 384 * to map the physical string that was matched on to the under 385 * lying device node. 386 * 387 * If any part of this process fails, then do a force online 388 * to trigger a ZFS fault for the device (and any hot spare 389 * replacement). 390 */ 391 leafname = strrchr(devpath, '/') + 1; 392 393 /* 394 * If this is a request to label a whole disk, then attempt to 395 * write out the label. 396 */ 397 if (zpool_label_disk(g_zfshdl, zhp, leafname) != 0) { 398 zed_log_msg(LOG_INFO, " zpool_label_disk: could not " 399 "label '%s' (%s)", leafname, 400 libzfs_error_description(g_zfshdl)); 401 402 (void) zpool_vdev_online(zhp, fullpath, 403 ZFS_ONLINE_FORCEFAULT, &newstate); 404 return; 405 } 406 407 /* 408 * The disk labeling is asynchronous on Linux. Just record 409 * this label request and return as there will be another 410 * disk add event for the partition after the labeling is 411 * completed. 412 */ 413 device = malloc(sizeof (pendingdev_t)); 414 (void) strlcpy(device->pd_physpath, physpath, 415 sizeof (device->pd_physpath)); 416 list_insert_tail(&g_device_list, device); 417 418 zed_log_msg(LOG_INFO, " zpool_label_disk: async '%s' (%llu)", 419 leafname, (u_longlong_t)guid); 420 421 return; /* resumes at EC_DEV_ADD.ESC_DISK for partition */ 422 423 } else /* labeled */ { 424 boolean_t found = B_FALSE; 425 /* 426 * match up with request above to label the disk 427 */ 428 for (device = list_head(&g_device_list); device != NULL; 429 device = list_next(&g_device_list, device)) { 430 if (strcmp(physpath, device->pd_physpath) == 0) { 431 list_remove(&g_device_list, device); 432 free(device); 433 found = B_TRUE; 434 break; 435 } 436 zed_log_msg(LOG_INFO, "zpool_label_disk: %s != %s", 437 physpath, device->pd_physpath); 438 } 439 if (!found) { 440 /* unexpected partition slice encountered */ 441 zed_log_msg(LOG_INFO, "labeled disk %s unexpected here", 442 fullpath); 443 (void) zpool_vdev_online(zhp, fullpath, 444 ZFS_ONLINE_FORCEFAULT, &newstate); 445 return; 446 } 447 448 zed_log_msg(LOG_INFO, " zpool_label_disk: resume '%s' (%llu)", 449 physpath, (u_longlong_t)guid); 450 451 (void) snprintf(devpath, sizeof (devpath), "%s%s", 452 DEV_BYID_PATH, new_devid); 453 } 454 455 /* 456 * Construct the root vdev to pass to zpool_vdev_attach(). While adding 457 * the entire vdev structure is harmless, we construct a reduced set of 458 * path/physpath/wholedisk to keep it simple. 459 */ 460 if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) { 461 zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory"); 462 return; 463 } 464 if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) { 465 zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory"); 466 nvlist_free(nvroot); 467 return; 468 } 469 470 if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 || 471 nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 || 472 nvlist_add_string(newvd, ZPOOL_CONFIG_DEVID, new_devid) != 0 || 473 (physpath != NULL && nvlist_add_string(newvd, 474 ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) || 475 (enc_sysfs_path != NULL && nvlist_add_string(newvd, 476 ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, enc_sysfs_path) != 0) || 477 nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 || 478 nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 || 479 nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 480 (const nvlist_t **)&newvd, 1) != 0) { 481 zed_log_msg(LOG_WARNING, "zfs_mod: unable to add nvlist pairs"); 482 nvlist_free(newvd); 483 nvlist_free(nvroot); 484 return; 485 } 486 487 nvlist_free(newvd); 488 489 /* 490 * Wait for udev to verify the links exist, then auto-replace 491 * the leaf disk at same physical location. 492 */ 493 if (zpool_label_disk_wait(path, 3000) != 0) { 494 zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement " 495 "disk %s is missing", path); 496 nvlist_free(nvroot); 497 return; 498 } 499 500 /* 501 * Prefer sequential resilvering when supported (mirrors and dRAID), 502 * otherwise fallback to a traditional healing resilver. 503 */ 504 ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_TRUE); 505 if (ret != 0) { 506 ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, 507 B_TRUE, B_FALSE); 508 } 509 510 zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)", 511 fullpath, path, (ret == 0) ? "no errors" : 512 libzfs_error_description(g_zfshdl)); 513 514 nvlist_free(nvroot); 515 } 516 517 /* 518 * Utility functions to find a vdev matching given criteria. 519 */ 520 typedef struct dev_data { 521 const char *dd_compare; 522 const char *dd_prop; 523 zfs_process_func_t dd_func; 524 boolean_t dd_found; 525 boolean_t dd_islabeled; 526 uint64_t dd_pool_guid; 527 uint64_t dd_vdev_guid; 528 const char *dd_new_devid; 529 } dev_data_t; 530 531 static void 532 zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data) 533 { 534 dev_data_t *dp = data; 535 char *path = NULL; 536 uint_t c, children; 537 nvlist_t **child; 538 539 /* 540 * First iterate over any children. 541 */ 542 if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, 543 &child, &children) == 0) { 544 for (c = 0; c < children; c++) 545 zfs_iter_vdev(zhp, child[c], data); 546 } 547 548 /* 549 * Iterate over any spares and cache devices 550 */ 551 if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES, 552 &child, &children) == 0) { 553 for (c = 0; c < children; c++) 554 zfs_iter_vdev(zhp, child[c], data); 555 } 556 if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE, 557 &child, &children) == 0) { 558 for (c = 0; c < children; c++) 559 zfs_iter_vdev(zhp, child[c], data); 560 } 561 562 /* once a vdev was matched and processed there is nothing left to do */ 563 if (dp->dd_found) 564 return; 565 566 /* 567 * Match by GUID if available otherwise fallback to devid or physical 568 */ 569 if (dp->dd_vdev_guid != 0) { 570 uint64_t guid; 571 572 if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, 573 &guid) != 0 || guid != dp->dd_vdev_guid) { 574 return; 575 } 576 zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched on %llu", guid); 577 dp->dd_found = B_TRUE; 578 579 } else if (dp->dd_compare != NULL) { 580 /* 581 * NOTE: On Linux there is an event for partition, so unlike 582 * illumos, substring matching is not required to accommodate 583 * the partition suffix. An exact match will be present in 584 * the dp->dd_compare value. 585 */ 586 if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 || 587 strcmp(dp->dd_compare, path) != 0) { 588 zed_log_msg(LOG_INFO, " %s: no match (%s != vdev %s)", 589 __func__, dp->dd_compare, path); 590 return; 591 } 592 593 zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched %s on %s", 594 dp->dd_prop, path); 595 dp->dd_found = B_TRUE; 596 597 /* pass the new devid for use by replacing code */ 598 if (dp->dd_new_devid != NULL) { 599 (void) nvlist_add_string(nvl, "new_devid", 600 dp->dd_new_devid); 601 } 602 } 603 604 (dp->dd_func)(zhp, nvl, dp->dd_islabeled); 605 } 606 607 static void 608 zfs_enable_ds(void *arg) 609 { 610 unavailpool_t *pool = (unavailpool_t *)arg; 611 612 (void) zpool_enable_datasets(pool->uap_zhp, NULL, 0); 613 zpool_close(pool->uap_zhp); 614 free(pool); 615 } 616 617 static int 618 zfs_iter_pool(zpool_handle_t *zhp, void *data) 619 { 620 nvlist_t *config, *nvl; 621 dev_data_t *dp = data; 622 uint64_t pool_guid; 623 unavailpool_t *pool; 624 625 zed_log_msg(LOG_INFO, "zfs_iter_pool: evaluating vdevs on %s (by %s)", 626 zpool_get_name(zhp), dp->dd_vdev_guid ? "GUID" : dp->dd_prop); 627 628 /* 629 * For each vdev in this pool, look for a match to apply dd_func 630 */ 631 if ((config = zpool_get_config(zhp, NULL)) != NULL) { 632 if (dp->dd_pool_guid == 0 || 633 (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 634 &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) { 635 (void) nvlist_lookup_nvlist(config, 636 ZPOOL_CONFIG_VDEV_TREE, &nvl); 637 zfs_iter_vdev(zhp, nvl, data); 638 } 639 } else { 640 zed_log_msg(LOG_INFO, "%s: no config\n", __func__); 641 } 642 643 /* 644 * if this pool was originally unavailable, 645 * then enable its datasets asynchronously 646 */ 647 if (g_enumeration_done) { 648 for (pool = list_head(&g_pool_list); pool != NULL; 649 pool = list_next(&g_pool_list, pool)) { 650 651 if (strcmp(zpool_get_name(zhp), 652 zpool_get_name(pool->uap_zhp))) 653 continue; 654 if (zfs_toplevel_state(zhp) >= VDEV_STATE_DEGRADED) { 655 list_remove(&g_pool_list, pool); 656 (void) tpool_dispatch(g_tpool, zfs_enable_ds, 657 pool); 658 break; 659 } 660 } 661 } 662 663 zpool_close(zhp); 664 return (dp->dd_found); /* cease iteration after a match */ 665 } 666 667 /* 668 * Given a physical device location, iterate over all 669 * (pool, vdev) pairs which correspond to that location. 670 */ 671 static boolean_t 672 devphys_iter(const char *physical, const char *devid, zfs_process_func_t func, 673 boolean_t is_slice) 674 { 675 dev_data_t data = { 0 }; 676 677 data.dd_compare = physical; 678 data.dd_func = func; 679 data.dd_prop = ZPOOL_CONFIG_PHYS_PATH; 680 data.dd_found = B_FALSE; 681 data.dd_islabeled = is_slice; 682 data.dd_new_devid = devid; /* used by auto replace code */ 683 684 (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); 685 686 return (data.dd_found); 687 } 688 689 /* 690 * Given a device identifier, find any vdevs with a matching by-vdev 691 * path. Normally we shouldn't need this as the comparison would be 692 * made earlier in the devphys_iter(). For example, if we were replacing 693 * /dev/disk/by-vdev/L28, normally devphys_iter() would match the 694 * ZPOOL_CONFIG_PHYS_PATH of "L28" from the old disk config to "L28" 695 * of the new disk config. However, we've seen cases where 696 * ZPOOL_CONFIG_PHYS_PATH was not in the config for the old disk. Here's 697 * an example of a real 2-disk mirror pool where one disk was force 698 * faulted: 699 * 700 * com.delphix:vdev_zap_top: 129 701 * children[0]: 702 * type: 'disk' 703 * id: 0 704 * guid: 14309659774640089719 705 * path: '/dev/disk/by-vdev/L28' 706 * whole_disk: 0 707 * DTL: 654 708 * create_txg: 4 709 * com.delphix:vdev_zap_leaf: 1161 710 * faulted: 1 711 * aux_state: 'external' 712 * children[1]: 713 * type: 'disk' 714 * id: 1 715 * guid: 16002508084177980912 716 * path: '/dev/disk/by-vdev/L29' 717 * devid: 'dm-uuid-mpath-35000c500a61d68a3' 718 * phys_path: 'L29' 719 * vdev_enc_sysfs_path: '/sys/class/enclosure/0:0:1:0/SLOT 30 32' 720 * whole_disk: 0 721 * DTL: 1028 722 * create_txg: 4 723 * com.delphix:vdev_zap_leaf: 131 724 * 725 * So in the case above, the only thing we could compare is the path. 726 * 727 * We can do this because we assume by-vdev paths are authoritative as physical 728 * paths. We could not assume this for normal paths like /dev/sda since the 729 * physical location /dev/sda points to could change over time. 730 */ 731 static boolean_t 732 by_vdev_path_iter(const char *by_vdev_path, const char *devid, 733 zfs_process_func_t func, boolean_t is_slice) 734 { 735 dev_data_t data = { 0 }; 736 737 data.dd_compare = by_vdev_path; 738 data.dd_func = func; 739 data.dd_prop = ZPOOL_CONFIG_PATH; 740 data.dd_found = B_FALSE; 741 data.dd_islabeled = is_slice; 742 data.dd_new_devid = devid; 743 744 if (strncmp(by_vdev_path, DEV_BYVDEV_PATH, 745 strlen(DEV_BYVDEV_PATH)) != 0) { 746 /* by_vdev_path doesn't start with "/dev/disk/by-vdev/" */ 747 return (B_FALSE); 748 } 749 750 (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); 751 752 return (data.dd_found); 753 } 754 755 /* 756 * Given a device identifier, find any vdevs with a matching devid. 757 * On Linux we can match devid directly which is always a whole disk. 758 */ 759 static boolean_t 760 devid_iter(const char *devid, zfs_process_func_t func, boolean_t is_slice) 761 { 762 dev_data_t data = { 0 }; 763 764 data.dd_compare = devid; 765 data.dd_func = func; 766 data.dd_prop = ZPOOL_CONFIG_DEVID; 767 data.dd_found = B_FALSE; 768 data.dd_islabeled = is_slice; 769 data.dd_new_devid = devid; 770 771 (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); 772 773 return (data.dd_found); 774 } 775 776 /* 777 * Given a device guid, find any vdevs with a matching guid. 778 */ 779 static boolean_t 780 guid_iter(uint64_t pool_guid, uint64_t vdev_guid, const char *devid, 781 zfs_process_func_t func, boolean_t is_slice) 782 { 783 dev_data_t data = { 0 }; 784 785 data.dd_func = func; 786 data.dd_found = B_FALSE; 787 data.dd_pool_guid = pool_guid; 788 data.dd_vdev_guid = vdev_guid; 789 data.dd_islabeled = is_slice; 790 data.dd_new_devid = devid; 791 792 (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); 793 794 return (data.dd_found); 795 } 796 797 /* 798 * Handle a EC_DEV_ADD.ESC_DISK event. 799 * 800 * illumos 801 * Expects: DEV_PHYS_PATH string in schema 802 * Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID 803 * 804 * path: '/dev/dsk/c0t1d0s0' (persistent) 805 * devid: 'id1,sd@SATA_____Hitachi_HDS72101______JP2940HZ3H74MC/a' 806 * phys_path: '/pci@0,0/pci103c,1609@11/disk@1,0:a' 807 * 808 * linux 809 * provides: DEV_PHYS_PATH and DEV_IDENTIFIER strings in schema 810 * Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID 811 * 812 * path: '/dev/sdc1' (not persistent) 813 * devid: 'ata-SAMSUNG_HD204UI_S2HGJD2Z805891-part1' 814 * phys_path: 'pci-0000:04:00.0-sas-0x4433221106000000-lun-0' 815 */ 816 static int 817 zfs_deliver_add(nvlist_t *nvl) 818 { 819 char *devpath = NULL, *devid = NULL; 820 uint64_t pool_guid = 0, vdev_guid = 0; 821 boolean_t is_slice; 822 823 /* 824 * Expecting a devid string and an optional physical location and guid 825 */ 826 if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0) { 827 zed_log_msg(LOG_INFO, "%s: no dev identifier\n", __func__); 828 return (-1); 829 } 830 831 (void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath); 832 (void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid); 833 (void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid); 834 835 is_slice = (nvlist_lookup_boolean(nvl, DEV_IS_PART) == 0); 836 837 zed_log_msg(LOG_INFO, "zfs_deliver_add: adding %s (%s) (is_slice %d)", 838 devid, devpath ? devpath : "NULL", is_slice); 839 840 /* 841 * Iterate over all vdevs looking for a match in the following order: 842 * 1. ZPOOL_CONFIG_DEVID (identifies the unique disk) 843 * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location). 844 * 3. ZPOOL_CONFIG_GUID (identifies unique vdev). 845 * 4. ZPOOL_CONFIG_PATH for /dev/disk/by-vdev devices only (since 846 * by-vdev paths represent physical paths). 847 */ 848 if (devid_iter(devid, zfs_process_add, is_slice)) 849 return (0); 850 if (devpath != NULL && devphys_iter(devpath, devid, zfs_process_add, 851 is_slice)) 852 return (0); 853 if (vdev_guid != 0) 854 (void) guid_iter(pool_guid, vdev_guid, devid, zfs_process_add, 855 is_slice); 856 857 if (devpath != NULL) { 858 /* Can we match a /dev/disk/by-vdev/ path? */ 859 char by_vdev_path[MAXPATHLEN]; 860 snprintf(by_vdev_path, sizeof (by_vdev_path), 861 "/dev/disk/by-vdev/%s", devpath); 862 if (by_vdev_path_iter(by_vdev_path, devid, zfs_process_add, 863 is_slice)) 864 return (0); 865 } 866 867 return (0); 868 } 869 870 /* 871 * Called when we receive a VDEV_CHECK event, which indicates a device could not 872 * be opened during initial pool open, but the autoreplace property was set on 873 * the pool. In this case, we treat it as if it were an add event. 874 */ 875 static int 876 zfs_deliver_check(nvlist_t *nvl) 877 { 878 dev_data_t data = { 0 }; 879 880 if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, 881 &data.dd_pool_guid) != 0 || 882 nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, 883 &data.dd_vdev_guid) != 0 || 884 data.dd_vdev_guid == 0) 885 return (0); 886 887 zed_log_msg(LOG_INFO, "zfs_deliver_check: pool '%llu', vdev %llu", 888 data.dd_pool_guid, data.dd_vdev_guid); 889 890 data.dd_func = zfs_process_add; 891 892 (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); 893 894 return (0); 895 } 896 897 static int 898 zfsdle_vdev_online(zpool_handle_t *zhp, void *data) 899 { 900 char *devname = data; 901 boolean_t avail_spare, l2cache; 902 nvlist_t *tgt; 903 int error; 904 905 zed_log_msg(LOG_INFO, "zfsdle_vdev_online: searching for '%s' in '%s'", 906 devname, zpool_get_name(zhp)); 907 908 if ((tgt = zpool_find_vdev_by_physpath(zhp, devname, 909 &avail_spare, &l2cache, NULL)) != NULL) { 910 char *path, fullpath[MAXPATHLEN]; 911 uint64_t wholedisk; 912 913 error = nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &path); 914 if (error) { 915 zpool_close(zhp); 916 return (0); 917 } 918 919 error = nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, 920 &wholedisk); 921 if (error) 922 wholedisk = 0; 923 924 if (wholedisk) { 925 path = strrchr(path, '/'); 926 if (path != NULL) { 927 path = zfs_strip_partition(path + 1); 928 if (path == NULL) { 929 zpool_close(zhp); 930 return (0); 931 } 932 } else { 933 zpool_close(zhp); 934 return (0); 935 } 936 937 (void) strlcpy(fullpath, path, sizeof (fullpath)); 938 free(path); 939 940 /* 941 * We need to reopen the pool associated with this 942 * device so that the kernel can update the size of 943 * the expanded device. When expanding there is no 944 * need to restart the scrub from the beginning. 945 */ 946 boolean_t scrub_restart = B_FALSE; 947 (void) zpool_reopen_one(zhp, &scrub_restart); 948 } else { 949 (void) strlcpy(fullpath, path, sizeof (fullpath)); 950 } 951 952 if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) { 953 vdev_state_t newstate; 954 955 if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) { 956 error = zpool_vdev_online(zhp, fullpath, 0, 957 &newstate); 958 zed_log_msg(LOG_INFO, "zfsdle_vdev_online: " 959 "setting device '%s' to ONLINE state " 960 "in pool '%s': %d", fullpath, 961 zpool_get_name(zhp), error); 962 } 963 } 964 zpool_close(zhp); 965 return (1); 966 } 967 zpool_close(zhp); 968 return (0); 969 } 970 971 /* 972 * This function handles the ESC_DEV_DLE device change event. Use the 973 * provided vdev guid when looking up a disk or partition, when the guid 974 * is not present assume the entire disk is owned by ZFS and append the 975 * expected -part1 partition information then lookup by physical path. 976 */ 977 static int 978 zfs_deliver_dle(nvlist_t *nvl) 979 { 980 char *devname, name[MAXPATHLEN]; 981 uint64_t guid; 982 983 if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &guid) == 0) { 984 sprintf(name, "%llu", (u_longlong_t)guid); 985 } else if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) == 0) { 986 strlcpy(name, devname, MAXPATHLEN); 987 zfs_append_partition(name, MAXPATHLEN); 988 } else { 989 zed_log_msg(LOG_INFO, "zfs_deliver_dle: no guid or physpath"); 990 } 991 992 if (zpool_iter(g_zfshdl, zfsdle_vdev_online, name) != 1) { 993 zed_log_msg(LOG_INFO, "zfs_deliver_dle: device '%s' not " 994 "found", name); 995 return (1); 996 } 997 998 return (0); 999 } 1000 1001 /* 1002 * syseventd daemon module event handler 1003 * 1004 * Handles syseventd daemon zfs device related events: 1005 * 1006 * EC_DEV_ADD.ESC_DISK 1007 * EC_DEV_STATUS.ESC_DEV_DLE 1008 * EC_ZFS.ESC_ZFS_VDEV_CHECK 1009 * 1010 * Note: assumes only one thread active at a time (not thread safe) 1011 */ 1012 static int 1013 zfs_slm_deliver_event(const char *class, const char *subclass, nvlist_t *nvl) 1014 { 1015 int ret; 1016 boolean_t is_check = B_FALSE, is_dle = B_FALSE; 1017 1018 if (strcmp(class, EC_DEV_ADD) == 0) { 1019 /* 1020 * We're mainly interested in disk additions, but we also listen 1021 * for new loop devices, to allow for simplified testing. 1022 */ 1023 if (strcmp(subclass, ESC_DISK) != 0 && 1024 strcmp(subclass, ESC_LOFI) != 0) 1025 return (0); 1026 1027 is_check = B_FALSE; 1028 } else if (strcmp(class, EC_ZFS) == 0 && 1029 strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) { 1030 /* 1031 * This event signifies that a device failed to open 1032 * during pool load, but the 'autoreplace' property was 1033 * set, so we should pretend it's just been added. 1034 */ 1035 is_check = B_TRUE; 1036 } else if (strcmp(class, EC_DEV_STATUS) == 0 && 1037 strcmp(subclass, ESC_DEV_DLE) == 0) { 1038 is_dle = B_TRUE; 1039 } else { 1040 return (0); 1041 } 1042 1043 if (is_dle) 1044 ret = zfs_deliver_dle(nvl); 1045 else if (is_check) 1046 ret = zfs_deliver_check(nvl); 1047 else 1048 ret = zfs_deliver_add(nvl); 1049 1050 return (ret); 1051 } 1052 1053 static void * 1054 zfs_enum_pools(void *arg) 1055 { 1056 (void) arg; 1057 1058 (void) zpool_iter(g_zfshdl, zfs_unavail_pool, (void *)&g_pool_list); 1059 /* 1060 * Linux - instead of using a thread pool, each list entry 1061 * will spawn a thread when an unavailable pool transitions 1062 * to available. zfs_slm_fini will wait for these threads. 1063 */ 1064 g_enumeration_done = B_TRUE; 1065 return (NULL); 1066 } 1067 1068 /* 1069 * called from zed daemon at startup 1070 * 1071 * sent messages from zevents or udev monitor 1072 * 1073 * For now, each agent has its own libzfs instance 1074 */ 1075 int 1076 zfs_slm_init() 1077 { 1078 if ((g_zfshdl = libzfs_init()) == NULL) 1079 return (-1); 1080 1081 /* 1082 * collect a list of unavailable pools (asynchronously, 1083 * since this can take a while) 1084 */ 1085 list_create(&g_pool_list, sizeof (struct unavailpool), 1086 offsetof(struct unavailpool, uap_node)); 1087 1088 if (pthread_create(&g_zfs_tid, NULL, zfs_enum_pools, NULL) != 0) { 1089 list_destroy(&g_pool_list); 1090 libzfs_fini(g_zfshdl); 1091 return (-1); 1092 } 1093 1094 pthread_setname_np(g_zfs_tid, "enum-pools"); 1095 list_create(&g_device_list, sizeof (struct pendingdev), 1096 offsetof(struct pendingdev, pd_node)); 1097 1098 return (0); 1099 } 1100 1101 void 1102 zfs_slm_fini() 1103 { 1104 unavailpool_t *pool; 1105 pendingdev_t *device; 1106 1107 /* wait for zfs_enum_pools thread to complete */ 1108 (void) pthread_join(g_zfs_tid, NULL); 1109 /* destroy the thread pool */ 1110 if (g_tpool != NULL) { 1111 tpool_wait(g_tpool); 1112 tpool_destroy(g_tpool); 1113 } 1114 1115 while ((pool = (list_head(&g_pool_list))) != NULL) { 1116 list_remove(&g_pool_list, pool); 1117 zpool_close(pool->uap_zhp); 1118 free(pool); 1119 } 1120 list_destroy(&g_pool_list); 1121 1122 while ((device = (list_head(&g_device_list))) != NULL) { 1123 list_remove(&g_device_list, device); 1124 free(device); 1125 } 1126 list_destroy(&g_device_list); 1127 1128 libzfs_fini(g_zfshdl); 1129 } 1130 1131 void 1132 zfs_slm_event(const char *class, const char *subclass, nvlist_t *nvl) 1133 { 1134 zed_log_msg(LOG_INFO, "zfs_slm_event: %s.%s", class, subclass); 1135 (void) zfs_slm_deliver_event(class, subclass, nvl); 1136 } 1137