1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * ZFS syseventd module. 30 * 31 * The purpose of this module is to identify when devices are added to the 32 * system, and appropriately online or replace the affected vdevs. 33 * 34 * When a device is added to the system: 35 * 36 * 1. Search for any vdevs whose devid matches that of the newly added 37 * device. 38 * 39 * 2. If no vdevs are found, then search for any vdevs whose devfs path 40 * matches that of the new device. 41 * 42 * 3. If no vdevs match by either method, then ignore the event. 43 * 44 * 4. Attempt to online the device with a flag to indicate that it should 45 * be unspared when resilvering completes. If this succeeds, then the 46 * same device was inserted and we should continue normally. 47 * 48 * 5. If the pool does not have the 'autoreplace' property set, attempt to 49 * online the device again without the unspare flag, which will 50 * generate a FMA fault. 51 * 52 * 6. If the pool has the 'autoreplace' property set, and the matching vdev 53 * is a whole disk, then label the new disk and attempt a 'zpool 54 * replace'. 55 * 56 * The module responds to EC_DEV_ADD events for both disks and lofi devices, 57 * with the latter used for testing. The special ESC_ZFS_VDEV_CHECK event 58 * indicates that a device failed to open during pool load, but the autoreplace 59 * property was set. In this case, we deferred the associated FMA fault until 60 * our module had a chance to process the autoreplace logic. If the device 61 * could not be replaced, then the second online attempt will trigger the FMA 62 * fault that we skipped earlier. 63 */ 64 65 #include <alloca.h> 66 #include <devid.h> 67 #include <fcntl.h> 68 #include <libnvpair.h> 69 #include <libsysevent.h> 70 #include <libzfs.h> 71 #include <limits.h> 72 #include <stdlib.h> 73 #include <string.h> 74 #include <syslog.h> 75 #include <sys/sunddi.h> 76 #include <sys/sysevent/eventdefs.h> 77 #include <sys/sysevent/dev.h> 78 #include <unistd.h> 79 80 #if defined(__i386) || defined(__amd64) 81 #define PHYS_PATH ":q" 82 #define RAW_SLICE "p0" 83 #elif defined(__sparc) 84 #define PHYS_PATH ":c" 85 #define RAW_SLICE "s2" 86 #else 87 #error Unknown architecture 88 #endif 89 90 typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t); 91 92 libzfs_handle_t *g_zfshdl; 93 94 /* 95 * The device associated with the given vdev (either by devid or physical path) 96 * has been added to the system. If 'isdisk' is set, then we only attempt a 97 * replacement if it's a whole disk. This also implies that we should label the 98 * disk first. 99 * 100 * First, we attempt to online the device (making sure to undo any spare 101 * operation when finished). If this succeeds, then we're done. If it fails, 102 * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened, 103 * but that the label was not what we expected. If the 'autoreplace' property 104 * is not set, then we relabel the disk (if specified), and attempt a 'zpool 105 * replace'. If the online is successful, but the new state is something else 106 * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of 107 * race, and we should avoid attempting to relabel the disk. 108 */ 109 static void 110 zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t isdisk) 111 { 112 char *path; 113 vdev_state_t newstate; 114 nvlist_t *nvroot, *newvd; 115 uint64_t wholedisk = 0ULL; 116 char *physpath = NULL; 117 char rawpath[PATH_MAX], fullpath[PATH_MAX]; 118 size_t len; 119 120 if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0) 121 return; 122 123 (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath); 124 (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); 125 126 /* 127 * We should have a way to online a device by guid. With the current 128 * interface, we are forced to chop off the 's0' for whole disks. 129 */ 130 (void) strlcpy(fullpath, path, sizeof (fullpath)); 131 if (wholedisk) 132 fullpath[strlen(fullpath) - 2] = '\0'; 133 134 /* 135 * Attempt to online the device. It would be nice to online this by 136 * GUID, but the current interface only supports lookup by path. 137 */ 138 if (zpool_vdev_online(zhp, fullpath, 139 ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 && 140 (newstate == VDEV_STATE_HEALTHY || newstate == VDEV_STATE_DEGRADED)) 141 return; 142 143 /* 144 * If the pool doesn't have the autoreplace property set, then attempt a 145 * true online (without the unspare flag), which will trigger a FMA 146 * fault. 147 */ 148 if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) || 149 (isdisk && !wholedisk)) { 150 (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, 151 &newstate); 152 return; 153 } 154 155 if (isdisk) { 156 /* 157 * If this is a request to label a whole disk, then attempt to 158 * write out the label. Before we can label the disk, we need 159 * access to a raw node. Ideally, we'd like to walk the devinfo 160 * tree and find a raw node from the corresponding parent node. 161 * This is overly complicated, and since we know how we labeled 162 * this device in the first place, we know it's save to switch 163 * from /dev/dsk to /dev/rdsk and append the backup slice. 164 * 165 * If any part of this process fails, then do a force online to 166 * trigger a ZFS fault for the device (and any hot spare 167 * replacement). 168 */ 169 if (strncmp(path, "/dev/dsk/", 9) != 0) { 170 (void) zpool_vdev_online(zhp, fullpath, 171 ZFS_ONLINE_FORCEFAULT, &newstate); 172 return; 173 } 174 175 (void) strlcpy(rawpath, path + 9, sizeof (rawpath)); 176 len = strlen(rawpath); 177 rawpath[len - 2] = '\0'; 178 179 if (zpool_label_disk(g_zfshdl, zhp, rawpath) != 0) { 180 (void) zpool_vdev_online(zhp, fullpath, 181 ZFS_ONLINE_FORCEFAULT, &newstate); 182 return; 183 } 184 } 185 186 /* 187 * Cosntruct the root vdev to pass to zpool_vdev_attach(). While adding 188 * the entire vdev structure is harmless, we construct a reduced set of 189 * path/physpath/wholedisk to keep it simple. 190 */ 191 if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) 192 return; 193 194 if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) { 195 nvlist_free(nvroot); 196 return; 197 } 198 199 if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 || 200 nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 || 201 (physpath != NULL && nvlist_add_string(newvd, 202 ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) || 203 nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 || 204 nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 || 205 nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd, 206 1) != 0) { 207 nvlist_free(newvd); 208 nvlist_free(nvroot); 209 return; 210 } 211 212 nvlist_free(newvd); 213 214 (void) zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE); 215 216 nvlist_free(nvroot); 217 218 } 219 220 /* 221 * Utility functions to find a vdev matching given criteria. 222 */ 223 typedef struct dev_data { 224 const char *dd_compare; 225 const char *dd_prop; 226 zfs_process_func_t dd_func; 227 boolean_t dd_found; 228 boolean_t dd_isdisk; 229 uint64_t dd_pool_guid; 230 uint64_t dd_vdev_guid; 231 } dev_data_t; 232 233 static void 234 zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data) 235 { 236 dev_data_t *dp = data; 237 char *path; 238 uint_t c, children; 239 nvlist_t **child; 240 size_t len; 241 uint64_t guid; 242 243 /* 244 * First iterate over any children. 245 */ 246 if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, 247 &child, &children) == 0) { 248 for (c = 0; c < children; c++) 249 zfs_iter_vdev(zhp, child[c], data); 250 return; 251 } 252 253 if (dp->dd_vdev_guid != 0) { 254 if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, 255 &guid) != 0 || guid != dp->dd_vdev_guid) 256 return; 257 } else { 258 len = strlen(dp->dd_compare); 259 260 if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 || 261 strncmp(dp->dd_compare, path, len) != 0) 262 return; 263 264 /* 265 * Normally, we want to have an exact match for the comparison 266 * string. However, we allow substring matches in the following 267 * cases: 268 * 269 * <path>: This is a devpath, and the target is one 270 * of its children. 271 * 272 * <path/> This is a devid for a whole disk, and 273 * the target is one of its children. 274 */ 275 if (path[len] != '\0' && path[len] != ':' && 276 path[len - 1] != '/') 277 return; 278 } 279 280 (dp->dd_func)(zhp, nvl, dp->dd_isdisk); 281 } 282 283 static int 284 zfs_iter_pool(zpool_handle_t *zhp, void *data) 285 { 286 nvlist_t *config, *nvl; 287 dev_data_t *dp = data; 288 uint64_t pool_guid; 289 290 if ((config = zpool_get_config(zhp, NULL)) != NULL) { 291 if (dp->dd_pool_guid == 0 || 292 (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 293 &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) { 294 (void) nvlist_lookup_nvlist(config, 295 ZPOOL_CONFIG_VDEV_TREE, &nvl); 296 zfs_iter_vdev(zhp, nvl, data); 297 } 298 } 299 300 zpool_close(zhp); 301 return (0); 302 } 303 304 /* 305 * Given a physical device path, iterate over all (pool, vdev) pairs which 306 * correspond to the given path. 307 */ 308 static boolean_t 309 devpath_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk) 310 { 311 dev_data_t data = { 0 }; 312 313 data.dd_compare = devpath; 314 data.dd_func = func; 315 data.dd_prop = ZPOOL_CONFIG_PHYS_PATH; 316 data.dd_found = B_FALSE; 317 data.dd_isdisk = wholedisk; 318 319 (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); 320 321 return (data.dd_found); 322 } 323 324 /* 325 * Given a /devices path, lookup the corresponding devid for each minor node, 326 * and find any vdevs with matching devids. Doing this straight up would be 327 * rather inefficient, O(minor nodes * vdevs in system), so we take advantage of 328 * the fact that each devid ends with "/<minornode>". Once we find any valid 329 * minor node, we chop off the portion after the last slash, and then search for 330 * matching vdevs, which is O(vdevs in system). 331 */ 332 static boolean_t 333 devid_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk) 334 { 335 size_t len = strlen(devpath) + sizeof ("/devices") + 336 sizeof (PHYS_PATH) - 1; 337 char *fullpath; 338 int fd; 339 ddi_devid_t devid; 340 char *devidstr, *fulldevid; 341 dev_data_t data = { 0 }; 342 343 /* 344 * Try to open a known minor node. 345 */ 346 fullpath = alloca(len); 347 (void) snprintf(fullpath, len, "/devices%s%s", devpath, PHYS_PATH); 348 if ((fd = open(fullpath, O_RDONLY)) < 0) 349 return (B_FALSE); 350 351 /* 352 * Determine the devid as a string, with no trailing slash for the minor 353 * node. 354 */ 355 if (devid_get(fd, &devid) != 0) { 356 (void) close(fd); 357 return (B_FALSE); 358 } 359 (void) close(fd); 360 361 if ((devidstr = devid_str_encode(devid, NULL)) == NULL) { 362 devid_free(devid); 363 return (B_FALSE); 364 } 365 366 len = strlen(devidstr) + 2; 367 fulldevid = alloca(len); 368 (void) snprintf(fulldevid, len, "%s/", devidstr); 369 370 data.dd_compare = fulldevid; 371 data.dd_func = func; 372 data.dd_prop = ZPOOL_CONFIG_DEVID; 373 data.dd_found = B_FALSE; 374 data.dd_isdisk = wholedisk; 375 376 (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); 377 378 devid_str_free(devidstr); 379 380 return (data.dd_found); 381 } 382 383 /* 384 * This function is called when we receive a devfs add event. This can be 385 * either a disk event or a lofi event, and the behavior is slightly different 386 * depending on which it is. 387 */ 388 static int 389 zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi) 390 { 391 char *devpath, *devname; 392 char path[PATH_MAX], realpath[PATH_MAX]; 393 char *colon, *raw; 394 int ret; 395 396 /* 397 * The main unit of operation is the physical device path. For disks, 398 * this is the device node, as all minor nodes are affected. For lofi 399 * devices, this includes the minor path. Unfortunately, this isn't 400 * represented in the DEV_PHYS_PATH for various reasons. 401 */ 402 if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath) != 0) 403 return (-1); 404 405 /* 406 * If this is a lofi device, then also get the minor instance name. 407 * Unfortunately, the current payload doesn't include an easy way to get 408 * this information. So we cheat by resolving the 'dev_name' (which 409 * refers to the raw device) and taking the portion between ':(*),raw'. 410 */ 411 (void) strlcpy(realpath, devpath, sizeof (realpath)); 412 if (is_lofi) { 413 if (nvlist_lookup_string(nvl, DEV_NAME, 414 &devname) == 0 && 415 (ret = resolvepath(devname, path, 416 sizeof (path))) > 0) { 417 path[ret] = '\0'; 418 colon = strchr(path, ':'); 419 if (colon != NULL) 420 raw = strstr(colon + 1, ",raw"); 421 if (colon != NULL && raw != NULL) { 422 *raw = '\0'; 423 (void) snprintf(realpath, 424 sizeof (realpath), "%s%s", 425 devpath, colon); 426 *raw = ','; 427 } 428 } 429 } 430 431 /* 432 * Iterate over all vdevs with a matching devid, and then those with a 433 * matching /devices path. For disks, we only want to pay attention to 434 * vdevs marked as whole disks. For lofi, we don't care (because we're 435 * matching an exact minor name). 436 */ 437 if (!devid_iter(realpath, zfs_process_add, !is_lofi)) 438 (void) devpath_iter(realpath, zfs_process_add, !is_lofi); 439 440 return (0); 441 } 442 443 /* 444 * Called when we receive a VDEV_CHECK event, which indicates a device could not 445 * be opened during initial pool open, but the autoreplace property was set on 446 * the pool. In this case, we treat it as if it were an add event. 447 */ 448 static int 449 zfs_deliver_check(nvlist_t *nvl) 450 { 451 dev_data_t data = { 0 }; 452 453 if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, 454 &data.dd_pool_guid) != 0 || 455 nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, 456 &data.dd_vdev_guid) != 0) 457 return (0); 458 459 data.dd_isdisk = B_TRUE; 460 data.dd_func = zfs_process_add; 461 462 (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); 463 464 return (0); 465 } 466 467 /*ARGSUSED*/ 468 static int 469 zfs_deliver_event(sysevent_t *ev, int unused) 470 { 471 const char *class = sysevent_get_class_name(ev); 472 const char *subclass = sysevent_get_subclass_name(ev); 473 nvlist_t *nvl; 474 int ret; 475 boolean_t is_lofi, is_check; 476 477 if (strcmp(class, EC_DEV_ADD) == 0) { 478 /* 479 * We're mainly interested in disk additions, but we also listen 480 * for new lofi devices, to allow for simplified testing. 481 */ 482 if (strcmp(subclass, ESC_DISK) == 0) 483 is_lofi = B_FALSE; 484 else if (strcmp(subclass, ESC_LOFI) == 0) 485 is_lofi = B_TRUE; 486 else 487 return (0); 488 489 is_check = B_FALSE; 490 } else if (strcmp(class, EC_ZFS) == 0 && 491 strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) { 492 /* 493 * This event signifies that a device failed to open during pool 494 * load, but the 'autoreplace' property was set, so we should 495 * pretend it's just been added. 496 */ 497 is_check = B_TRUE; 498 } else { 499 return (0); 500 } 501 502 if (sysevent_get_attr_list(ev, &nvl) != 0) 503 return (-1); 504 505 if (is_check) 506 ret = zfs_deliver_check(nvl); 507 else 508 ret = zfs_deliver_add(nvl, is_lofi); 509 510 511 nvlist_free(nvl); 512 return (ret); 513 } 514 515 static struct slm_mod_ops zfs_mod_ops = { 516 SE_MAJOR_VERSION, SE_MINOR_VERSION, 10, zfs_deliver_event 517 }; 518 519 struct slm_mod_ops * 520 slm_init() 521 { 522 if ((g_zfshdl = libzfs_init()) == NULL) 523 return (NULL); 524 525 return (&zfs_mod_ops); 526 } 527 528 void 529 slm_fini() 530 { 531 } 532