1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * ZFS syseventd module. 27 * 28 * The purpose of this module is to identify when devices are added to the 29 * system, and appropriately online or replace the affected vdevs. 30 * 31 * When a device is added to the system: 32 * 33 * 1. Search for any vdevs whose devid matches that of the newly added 34 * device. 35 * 36 * 2. If no vdevs are found, then search for any vdevs whose devfs path 37 * matches that of the new device. 38 * 39 * 3. If no vdevs match by either method, then ignore the event. 40 * 41 * 4. Attempt to online the device with a flag to indicate that it should 42 * be unspared when resilvering completes. If this succeeds, then the 43 * same device was inserted and we should continue normally. 44 * 45 * 5. If the pool does not have the 'autoreplace' property set, attempt to 46 * online the device again without the unspare flag, which will 47 * generate a FMA fault. 48 * 49 * 6. If the pool has the 'autoreplace' property set, and the matching vdev 50 * is a whole disk, then label the new disk and attempt a 'zpool 51 * replace'. 52 * 53 * The module responds to EC_DEV_ADD events for both disks and lofi devices, 54 * with the latter used for testing. The special ESC_ZFS_VDEV_CHECK event 55 * indicates that a device failed to open during pool load, but the autoreplace 56 * property was set. In this case, we deferred the associated FMA fault until 57 * our module had a chance to process the autoreplace logic. If the device 58 * could not be replaced, then the second online attempt will trigger the FMA 59 * fault that we skipped earlier. 60 */ 61 62 #include <alloca.h> 63 #include <devid.h> 64 #include <fcntl.h> 65 #include <libnvpair.h> 66 #include <libsysevent.h> 67 #include <libzfs.h> 68 #include <limits.h> 69 #include <stdlib.h> 70 #include <string.h> 71 #include <syslog.h> 72 #include <sys/sunddi.h> 73 #include <sys/sysevent/eventdefs.h> 74 #include <sys/sysevent/dev.h> 75 #include <unistd.h> 76 #include "syseventd.h" 77 78 #if defined(__i386) || defined(__amd64) 79 #define PHYS_PATH ":q" 80 #define RAW_SLICE "p0" 81 #elif defined(__sparc) 82 #define PHYS_PATH ":c" 83 #define RAW_SLICE "s2" 84 #else 85 #error Unknown architecture 86 #endif 87 88 typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t); 89 90 libzfs_handle_t *g_zfshdl; 91 92 /* 93 * The device associated with the given vdev (either by devid or physical path) 94 * has been added to the system. If 'isdisk' is set, then we only attempt a 95 * replacement if it's a whole disk. This also implies that we should label the 96 * disk first. 97 * 98 * First, we attempt to online the device (making sure to undo any spare 99 * operation when finished). If this succeeds, then we're done. If it fails, 100 * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened, 101 * but that the label was not what we expected. If the 'autoreplace' property 102 * is not set, then we relabel the disk (if specified), and attempt a 'zpool 103 * replace'. If the online is successful, but the new state is something else 104 * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of 105 * race, and we should avoid attempting to relabel the disk. 106 */ 107 static void 108 zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t isdisk) 109 { 110 char *path; 111 vdev_state_t newstate; 112 nvlist_t *nvroot, *newvd; 113 uint64_t wholedisk = 0ULL; 114 char *physpath = NULL; 115 char rawpath[PATH_MAX], fullpath[PATH_MAX]; 116 size_t len; 117 118 if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0) 119 return; 120 121 (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath); 122 (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); 123 124 /* 125 * We should have a way to online a device by guid. With the current 126 * interface, we are forced to chop off the 's0' for whole disks. 127 */ 128 (void) strlcpy(fullpath, path, sizeof (fullpath)); 129 if (wholedisk) 130 fullpath[strlen(fullpath) - 2] = '\0'; 131 132 /* 133 * Attempt to online the device. It would be nice to online this by 134 * GUID, but the current interface only supports lookup by path. 135 */ 136 if (zpool_vdev_online(zhp, fullpath, 137 ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 && 138 (newstate == VDEV_STATE_HEALTHY || newstate == VDEV_STATE_DEGRADED)) 139 return; 140 141 /* 142 * If the pool doesn't have the autoreplace property set, then attempt a 143 * true online (without the unspare flag), which will trigger a FMA 144 * fault. 145 */ 146 if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) || 147 (isdisk && !wholedisk)) { 148 (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, 149 &newstate); 150 return; 151 } 152 153 if (isdisk) { 154 /* 155 * If this is a request to label a whole disk, then attempt to 156 * write out the label. Before we can label the disk, we need 157 * access to a raw node. Ideally, we'd like to walk the devinfo 158 * tree and find a raw node from the corresponding parent node. 159 * This is overly complicated, and since we know how we labeled 160 * this device in the first place, we know it's save to switch 161 * from /dev/dsk to /dev/rdsk and append the backup slice. 162 * 163 * If any part of this process fails, then do a force online to 164 * trigger a ZFS fault for the device (and any hot spare 165 * replacement). 166 */ 167 if (strncmp(path, "/dev/dsk/", 9) != 0) { 168 (void) zpool_vdev_online(zhp, fullpath, 169 ZFS_ONLINE_FORCEFAULT, &newstate); 170 return; 171 } 172 173 (void) strlcpy(rawpath, path + 9, sizeof (rawpath)); 174 len = strlen(rawpath); 175 rawpath[len - 2] = '\0'; 176 177 if (zpool_label_disk(g_zfshdl, zhp, rawpath) != 0) { 178 (void) zpool_vdev_online(zhp, fullpath, 179 ZFS_ONLINE_FORCEFAULT, &newstate); 180 return; 181 } 182 } 183 184 /* 185 * Cosntruct the root vdev to pass to zpool_vdev_attach(). While adding 186 * the entire vdev structure is harmless, we construct a reduced set of 187 * path/physpath/wholedisk to keep it simple. 188 */ 189 if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) 190 return; 191 192 if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) { 193 nvlist_free(nvroot); 194 return; 195 } 196 197 if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 || 198 nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 || 199 (physpath != NULL && nvlist_add_string(newvd, 200 ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) || 201 nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 || 202 nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 || 203 nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd, 204 1) != 0) { 205 nvlist_free(newvd); 206 nvlist_free(nvroot); 207 return; 208 } 209 210 nvlist_free(newvd); 211 212 (void) zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE); 213 214 nvlist_free(nvroot); 215 216 } 217 218 /* 219 * Utility functions to find a vdev matching given criteria. 220 */ 221 typedef struct dev_data { 222 const char *dd_compare; 223 const char *dd_prop; 224 zfs_process_func_t dd_func; 225 boolean_t dd_found; 226 boolean_t dd_isdisk; 227 uint64_t dd_pool_guid; 228 uint64_t dd_vdev_guid; 229 } dev_data_t; 230 231 static void 232 zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data) 233 { 234 dev_data_t *dp = data; 235 char *path; 236 uint_t c, children; 237 nvlist_t **child; 238 size_t len; 239 uint64_t guid; 240 241 /* 242 * First iterate over any children. 243 */ 244 if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, 245 &child, &children) == 0) { 246 for (c = 0; c < children; c++) 247 zfs_iter_vdev(zhp, child[c], data); 248 return; 249 } 250 251 if (dp->dd_vdev_guid != 0) { 252 if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, 253 &guid) != 0 || guid != dp->dd_vdev_guid) 254 return; 255 } else { 256 len = strlen(dp->dd_compare); 257 258 if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 || 259 strncmp(dp->dd_compare, path, len) != 0) 260 return; 261 262 /* 263 * Normally, we want to have an exact match for the comparison 264 * string. However, we allow substring matches in the following 265 * cases: 266 * 267 * <path>: This is a devpath, and the target is one 268 * of its children. 269 * 270 * <path/> This is a devid for a whole disk, and 271 * the target is one of its children. 272 */ 273 if (path[len] != '\0' && path[len] != ':' && 274 path[len - 1] != '/') 275 return; 276 } 277 278 (dp->dd_func)(zhp, nvl, dp->dd_isdisk); 279 } 280 281 static int 282 zfs_iter_pool(zpool_handle_t *zhp, void *data) 283 { 284 nvlist_t *config, *nvl; 285 dev_data_t *dp = data; 286 uint64_t pool_guid; 287 288 if ((config = zpool_get_config(zhp, NULL)) != NULL) { 289 if (dp->dd_pool_guid == 0 || 290 (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 291 &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) { 292 (void) nvlist_lookup_nvlist(config, 293 ZPOOL_CONFIG_VDEV_TREE, &nvl); 294 zfs_iter_vdev(zhp, nvl, data); 295 } 296 } 297 298 zpool_close(zhp); 299 return (0); 300 } 301 302 /* 303 * Given a physical device path, iterate over all (pool, vdev) pairs which 304 * correspond to the given path. 305 */ 306 static boolean_t 307 devpath_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk) 308 { 309 dev_data_t data = { 0 }; 310 311 data.dd_compare = devpath; 312 data.dd_func = func; 313 data.dd_prop = ZPOOL_CONFIG_PHYS_PATH; 314 data.dd_found = B_FALSE; 315 data.dd_isdisk = wholedisk; 316 317 (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); 318 319 return (data.dd_found); 320 } 321 322 /* 323 * Given a /devices path, lookup the corresponding devid for each minor node, 324 * and find any vdevs with matching devids. Doing this straight up would be 325 * rather inefficient, O(minor nodes * vdevs in system), so we take advantage of 326 * the fact that each devid ends with "/<minornode>". Once we find any valid 327 * minor node, we chop off the portion after the last slash, and then search for 328 * matching vdevs, which is O(vdevs in system). 329 */ 330 static boolean_t 331 devid_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk) 332 { 333 size_t len = strlen(devpath) + sizeof ("/devices") + 334 sizeof (PHYS_PATH) - 1; 335 char *fullpath; 336 int fd; 337 ddi_devid_t devid; 338 char *devidstr, *fulldevid; 339 dev_data_t data = { 0 }; 340 341 /* 342 * Try to open a known minor node. 343 */ 344 fullpath = alloca(len); 345 (void) snprintf(fullpath, len, "/devices%s%s", devpath, PHYS_PATH); 346 if ((fd = open(fullpath, O_RDONLY)) < 0) 347 return (B_FALSE); 348 349 /* 350 * Determine the devid as a string, with no trailing slash for the minor 351 * node. 352 */ 353 if (devid_get(fd, &devid) != 0) { 354 (void) close(fd); 355 return (B_FALSE); 356 } 357 (void) close(fd); 358 359 if ((devidstr = devid_str_encode(devid, NULL)) == NULL) { 360 devid_free(devid); 361 return (B_FALSE); 362 } 363 364 len = strlen(devidstr) + 2; 365 fulldevid = alloca(len); 366 (void) snprintf(fulldevid, len, "%s/", devidstr); 367 368 data.dd_compare = fulldevid; 369 data.dd_func = func; 370 data.dd_prop = ZPOOL_CONFIG_DEVID; 371 data.dd_found = B_FALSE; 372 data.dd_isdisk = wholedisk; 373 374 (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); 375 376 devid_str_free(devidstr); 377 devid_free(devid); 378 379 return (data.dd_found); 380 } 381 382 /* 383 * This function is called when we receive a devfs add event. This can be 384 * either a disk event or a lofi event, and the behavior is slightly different 385 * depending on which it is. 386 */ 387 static int 388 zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi) 389 { 390 char *devpath, *devname; 391 char path[PATH_MAX], realpath[PATH_MAX]; 392 char *colon, *raw; 393 int ret; 394 395 /* 396 * The main unit of operation is the physical device path. For disks, 397 * this is the device node, as all minor nodes are affected. For lofi 398 * devices, this includes the minor path. Unfortunately, this isn't 399 * represented in the DEV_PHYS_PATH for various reasons. 400 */ 401 if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath) != 0) 402 return (-1); 403 404 /* 405 * If this is a lofi device, then also get the minor instance name. 406 * Unfortunately, the current payload doesn't include an easy way to get 407 * this information. So we cheat by resolving the 'dev_name' (which 408 * refers to the raw device) and taking the portion between ':(*),raw'. 409 */ 410 (void) strlcpy(realpath, devpath, sizeof (realpath)); 411 if (is_lofi) { 412 if (nvlist_lookup_string(nvl, DEV_NAME, 413 &devname) == 0 && 414 (ret = resolvepath(devname, path, 415 sizeof (path))) > 0) { 416 path[ret] = '\0'; 417 colon = strchr(path, ':'); 418 if (colon != NULL) 419 raw = strstr(colon + 1, ",raw"); 420 if (colon != NULL && raw != NULL) { 421 *raw = '\0'; 422 (void) snprintf(realpath, 423 sizeof (realpath), "%s%s", 424 devpath, colon); 425 *raw = ','; 426 } 427 } 428 } 429 430 /* 431 * Iterate over all vdevs with a matching devid, and then those with a 432 * matching /devices path. For disks, we only want to pay attention to 433 * vdevs marked as whole disks. For lofi, we don't care (because we're 434 * matching an exact minor name). 435 */ 436 if (!devid_iter(realpath, zfs_process_add, !is_lofi)) 437 (void) devpath_iter(realpath, zfs_process_add, !is_lofi); 438 439 return (0); 440 } 441 442 /* 443 * Called when we receive a VDEV_CHECK event, which indicates a device could not 444 * be opened during initial pool open, but the autoreplace property was set on 445 * the pool. In this case, we treat it as if it were an add event. 446 */ 447 static int 448 zfs_deliver_check(nvlist_t *nvl) 449 { 450 dev_data_t data = { 0 }; 451 452 if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, 453 &data.dd_pool_guid) != 0 || 454 nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, 455 &data.dd_vdev_guid) != 0) 456 return (0); 457 458 data.dd_isdisk = B_TRUE; 459 data.dd_func = zfs_process_add; 460 461 (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); 462 463 return (0); 464 } 465 466 #define DEVICE_PREFIX "/devices" 467 468 static int 469 zfsdle_vdev_online(zpool_handle_t *zhp, void *data) 470 { 471 char *devname = data; 472 boolean_t avail_spare, l2cache; 473 vdev_state_t newstate; 474 nvlist_t *tgt; 475 476 syseventd_print(9, "zfsdle_vdev_online: searching for %s in pool %s\n", 477 devname, zpool_get_name(zhp)); 478 479 if ((tgt = zpool_find_vdev_by_physpath(zhp, devname, 480 &avail_spare, &l2cache, NULL)) != NULL) { 481 char *path, fullpath[MAXPATHLEN]; 482 uint64_t wholedisk = 0ULL; 483 484 verify(nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, 485 &path) == 0); 486 verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, 487 &wholedisk) == 0); 488 489 (void) strlcpy(fullpath, path, sizeof (fullpath)); 490 if (wholedisk) 491 fullpath[strlen(fullpath) - 2] = '\0'; 492 493 if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) { 494 syseventd_print(9, "zfsdle_vdev_online: setting device" 495 " device %s to ONLINE state in pool %s.\n", 496 fullpath, zpool_get_name(zhp)); 497 if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) 498 (void) zpool_vdev_online(zhp, fullpath, 0, 499 &newstate); 500 } 501 zpool_close(zhp); 502 return (1); 503 } 504 zpool_close(zhp); 505 return (0); 506 } 507 508 int 509 zfs_deliver_dle(nvlist_t *nvl) 510 { 511 char *devname; 512 if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) != 0) { 513 syseventd_print(9, "zfs_deliver_event: no physpath\n"); 514 return (-1); 515 } 516 if (strncmp(devname, DEVICE_PREFIX, strlen(DEVICE_PREFIX)) != 0) { 517 syseventd_print(9, "zfs_deliver_event: invalid " 518 "device '%s'", devname); 519 return (-1); 520 } 521 522 /* 523 * We try to find the device using the physical 524 * path that has been supplied. We need to strip off 525 * the /devices prefix before starting our search. 526 */ 527 devname += strlen(DEVICE_PREFIX); 528 if (zpool_iter(g_zfshdl, zfsdle_vdev_online, devname) != 1) { 529 syseventd_print(9, "zfs_deliver_event: device '%s' not" 530 " found\n", devname); 531 return (1); 532 } 533 nvlist_free(nvl); 534 return (0); 535 } 536 537 538 /*ARGSUSED*/ 539 static int 540 zfs_deliver_event(sysevent_t *ev, int unused) 541 { 542 const char *class = sysevent_get_class_name(ev); 543 const char *subclass = sysevent_get_subclass_name(ev); 544 nvlist_t *nvl; 545 int ret; 546 boolean_t is_lofi, is_check, is_dle = B_FALSE; 547 548 if (strcmp(class, EC_DEV_ADD) == 0) { 549 /* 550 * We're mainly interested in disk additions, but we also listen 551 * for new lofi devices, to allow for simplified testing. 552 */ 553 if (strcmp(subclass, ESC_DISK) == 0) 554 is_lofi = B_FALSE; 555 else if (strcmp(subclass, ESC_LOFI) == 0) 556 is_lofi = B_TRUE; 557 else 558 return (0); 559 560 is_check = B_FALSE; 561 } else if (strcmp(class, EC_ZFS) == 0 && 562 strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) { 563 /* 564 * This event signifies that a device failed to open during pool 565 * load, but the 'autoreplace' property was set, so we should 566 * pretend it's just been added. 567 */ 568 is_check = B_TRUE; 569 } else if (strcmp(class, EC_DEV_STATUS) == 0 && 570 strcmp(subclass, ESC_DEV_DLE) == 0) { 571 is_dle = B_TRUE; 572 } else { 573 return (0); 574 } 575 576 if (sysevent_get_attr_list(ev, &nvl) != 0) 577 return (-1); 578 579 if (is_dle) 580 ret = zfs_deliver_dle(nvl); 581 else if (is_check) 582 ret = zfs_deliver_check(nvl); 583 else 584 ret = zfs_deliver_add(nvl, is_lofi); 585 586 nvlist_free(nvl); 587 return (ret); 588 } 589 590 static struct slm_mod_ops zfs_mod_ops = { 591 SE_MAJOR_VERSION, SE_MINOR_VERSION, 10, zfs_deliver_event 592 }; 593 594 struct slm_mod_ops * 595 slm_init() 596 { 597 if ((g_zfshdl = libzfs_init()) == NULL) 598 return (NULL); 599 600 return (&zfs_mod_ops); 601 } 602 603 void 604 slm_fini() 605 { 606 libzfs_fini(g_zfshdl); 607 } 608