xref: /illumos-gate/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c (revision b4fb003914e70b41d96dec8011864f6af1faf3ef)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012 by Delphix. All rights reserved.
24  * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
25  * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
26  * Copyright 2022 Oxide Computer Company
27  */
28 
29 /*
30  * ZFS syseventd module.
31  *
32  * The purpose of this module is to identify when devices are added to the
33  * system, and appropriately online or replace the affected vdevs.
34  *
35  * When a device is added to the system:
36  *
37  *	1. Search for any vdevs whose devid matches that of the newly added
38  *	   device.
39  *
40  *	2. If no vdevs are found, then search for any vdevs whose devfs path
41  *	   matches that of the new device.
42  *
43  *	3. If no vdevs match by either method, then ignore the event.
44  *
45  *	4. Attempt to online the device with a flag to indicate that it should
46  *	   be unspared when resilvering completes.  If this succeeds, then the
47  *	   same device was inserted and we should continue normally.
48  *
49  *	5. If the pool does not have the 'autoreplace' property set, attempt to
50  *	   online the device again without the unspare flag, which will
51  *	   generate a FMA fault.
52  *
53  *	6. If the pool has the 'autoreplace' property set, and the matching vdev
54  *	   is a whole disk, then label the new disk and attempt a 'zpool
55  *	   replace'.
56  *
57  * The module responds to EC_DEV_ADD events for both disks and lofi devices,
58  * with the latter used for testing.  The special ESC_ZFS_VDEV_CHECK event
59  * indicates that a device failed to open during pool load, but the autoreplace
60  * property was set.  In this case, we deferred the associated FMA fault until
61  * our module had a chance to process the autoreplace logic.  If the device
62  * could not be replaced, then the second online attempt will trigger the FMA
63  * fault that we skipped earlier.
64  */
65 
66 #include <alloca.h>
67 #include <devid.h>
68 #include <fcntl.h>
69 #include <libnvpair.h>
70 #include <libsysevent.h>
71 #include <libzfs.h>
72 #include <limits.h>
73 #include <stdlib.h>
74 #include <string.h>
75 #include <syslog.h>
76 #include <sys/list.h>
77 #include <sys/sunddi.h>
78 #include <sys/sysevent/eventdefs.h>
79 #include <sys/sysevent/dev.h>
80 #include <thread_pool.h>
81 #include <unistd.h>
82 #include "syseventd.h"
83 
/*
 * Architecture-specific device naming.  PHYS_PATH is the minor node suffix
 * that devid_iter() appends to a /devices path in order to get a node it can
 * open.  RAW_SLICE is not referenced in the visible part of this file.
 */
#if defined(__i386) || defined(__amd64)
#define	PHYS_PATH	":q"
#define	RAW_SLICE	"p0"
#elif defined(__sparc)
#define	PHYS_PATH	":c"
#define	RAW_SLICE	"s2"
#else
#error Unknown architecture
#endif
93 
94 typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t);
95 
96 libzfs_handle_t *g_zfshdl;
97 list_t g_pool_list;
98 tpool_t *g_tpool;
99 boolean_t g_enumeration_done;
100 thread_t g_zfs_tid;
101 
102 typedef struct unavailpool {
103 	zpool_handle_t	*uap_zhp;
104 	list_node_t	uap_node;
105 } unavailpool_t;
106 
107 int
zfs_toplevel_state(zpool_handle_t * zhp)108 zfs_toplevel_state(zpool_handle_t *zhp)
109 {
110 	nvlist_t *nvroot;
111 	vdev_stat_t *vs;
112 	unsigned int c;
113 
114 	verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
115 	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
116 	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
117 	    (uint64_t **)&vs, &c) == 0);
118 	return (vs->vs_state);
119 }
120 
121 static int
zfs_unavail_pool(zpool_handle_t * zhp,void * data)122 zfs_unavail_pool(zpool_handle_t *zhp, void *data)
123 {
124 	if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED) {
125 		unavailpool_t *uap;
126 		uap = malloc(sizeof (unavailpool_t));
127 		uap->uap_zhp = zhp;
128 		list_insert_tail((list_t *)data, uap);
129 	} else {
130 		zpool_close(zhp);
131 	}
132 	return (0);
133 }
134 
135 /*
136  * The device associated with the given vdev (either by devid or physical path)
137  * has been added to the system.  If 'isdisk' is set, then we only attempt a
138  * replacement if it's a whole disk.  This also implies that we should label the
139  * disk first.
140  *
141  * First, we attempt to online the device (making sure to undo any spare
142  * operation when finished).  If this succeeds, then we're done.  If it fails,
143  * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened,
144  * but that the label was not what we expected.  If the 'autoreplace' property
145  * is not set, then we relabel the disk (if specified), and attempt a 'zpool
146  * replace'.  If the online is successful, but the new state is something else
147  * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of
148  * race, and we should avoid attempting to relabel the disk.
149  */
150 static void
zfs_process_add(zpool_handle_t * zhp,nvlist_t * vdev,boolean_t isdisk)151 zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t isdisk)
152 {
153 	char *path;
154 	vdev_state_t newstate;
155 	nvlist_t *nvroot, *newvd;
156 	uint64_t wholedisk = 0ULL;
157 	uint64_t offline = 0ULL;
158 	char *physpath = NULL;
159 	char rawpath[PATH_MAX], fullpath[PATH_MAX];
160 	zpool_boot_label_t boot_type;
161 	uint64_t boot_size;
162 	size_t len;
163 
164 	if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
165 		return;
166 
167 	(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath);
168 	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
169 	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline);
170 
171 	/*
172 	 * We should have a way to online a device by guid.  With the current
173 	 * interface, we are forced to chop off the 's0' for whole disks.
174 	 */
175 	(void) strlcpy(fullpath, path, sizeof (fullpath));
176 	if (wholedisk)
177 		fullpath[strlen(fullpath) - 2] = '\0';
178 
179 	/*
180 	 * Attempt to online the device.  It would be nice to online this by
181 	 * GUID, but the current interface only supports lookup by path.
182 	 */
183 	if (offline ||
184 	    (zpool_vdev_online(zhp, fullpath,
185 	    ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
186 	    (newstate == VDEV_STATE_HEALTHY ||
187 	    newstate == VDEV_STATE_DEGRADED)))
188 		return;
189 
190 	/*
191 	 * If the pool doesn't have the autoreplace property set, then attempt a
192 	 * true online (without the unspare flag), which will trigger a FMA
193 	 * fault.
194 	 */
195 	if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
196 	    (isdisk && !wholedisk)) {
197 		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
198 		    &newstate);
199 		return;
200 	}
201 
202 	if (isdisk) {
203 		/*
204 		 * If this is a request to label a whole disk, then attempt to
205 		 * write out the label.  Before we can label the disk, we need
206 		 * access to a raw node.  Ideally, we'd like to walk the devinfo
207 		 * tree and find a raw node from the corresponding parent node.
208 		 * This is overly complicated, and since we know how we labeled
209 		 * this device in the first place, we know it's save to switch
210 		 * from /dev/dsk to /dev/rdsk and append the backup slice.
211 		 *
212 		 * If any part of this process fails, then do a force online to
213 		 * trigger a ZFS fault for the device (and any hot spare
214 		 * replacement).
215 		 */
216 		if (strncmp(path, ZFS_DISK_ROOTD,
217 		    strlen(ZFS_DISK_ROOTD)) != 0) {
218 			(void) zpool_vdev_online(zhp, fullpath,
219 			    ZFS_ONLINE_FORCEFAULT, &newstate);
220 			return;
221 		}
222 
223 		(void) strlcpy(rawpath, path + 9, sizeof (rawpath));
224 		len = strlen(rawpath);
225 		rawpath[len - 2] = '\0';
226 
227 		if (zpool_is_bootable(zhp))
228 			boot_type = ZPOOL_COPY_BOOT_LABEL;
229 		else
230 			boot_type = ZPOOL_NO_BOOT_LABEL;
231 
232 		boot_size = zpool_get_prop_int(zhp, ZPOOL_PROP_BOOTSIZE, NULL);
233 		if (zpool_label_disk(g_zfshdl, zhp, rawpath,
234 		    boot_type, boot_size, NULL) != 0) {
235 			(void) zpool_vdev_online(zhp, fullpath,
236 			    ZFS_ONLINE_FORCEFAULT, &newstate);
237 			return;
238 		}
239 	}
240 
241 	/*
242 	 * Cosntruct the root vdev to pass to zpool_vdev_attach().  While adding
243 	 * the entire vdev structure is harmless, we construct a reduced set of
244 	 * path/physpath/wholedisk to keep it simple.
245 	 */
246 	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0)
247 		return;
248 
249 	if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
250 		nvlist_free(nvroot);
251 		return;
252 	}
253 
254 	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 ||
255 	    nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 ||
256 	    (physpath != NULL && nvlist_add_string(newvd,
257 	    ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) ||
258 	    nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 ||
259 	    nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 ||
260 	    nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd,
261 	    1) != 0) {
262 		nvlist_free(newvd);
263 		nvlist_free(nvroot);
264 		return;
265 	}
266 
267 	nvlist_free(newvd);
268 
269 	(void) zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE);
270 
271 	nvlist_free(nvroot);
272 
273 }
274 
275 /*
276  * Utility functions to find a vdev matching given criteria.
277  */
278 typedef struct dev_data {
279 	const char		*dd_compare;
280 	const char		*dd_prop;
281 	zfs_process_func_t	dd_func;
282 	boolean_t		dd_found;
283 	boolean_t		dd_isdisk;
284 	uint64_t		dd_pool_guid;
285 	uint64_t		dd_vdev_guid;
286 } dev_data_t;
287 
288 static void
zfs_iter_vdev(zpool_handle_t * zhp,nvlist_t * nvl,void * data)289 zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
290 {
291 	dev_data_t *dp = data;
292 	char *path;
293 	uint_t c, children;
294 	nvlist_t **child;
295 	size_t len;
296 	uint64_t guid;
297 
298 	/*
299 	 * First iterate over any children.
300 	 */
301 	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
302 	    &child, &children) == 0) {
303 		for (c = 0; c < children; c++)
304 			zfs_iter_vdev(zhp, child[c], data);
305 		return;
306 	}
307 
308 	if (dp->dd_vdev_guid != 0) {
309 		if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
310 		    &guid) != 0 || guid != dp->dd_vdev_guid)
311 			return;
312 	} else if (dp->dd_compare != NULL) {
313 		len = strlen(dp->dd_compare);
314 
315 		if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
316 		    strncmp(dp->dd_compare, path, len) != 0)
317 			return;
318 
319 		/*
320 		 * Normally, we want to have an exact match for the comparison
321 		 * string.  However, we allow substring matches in the following
322 		 * cases:
323 		 *
324 		 *	<path>:		This is a devpath, and the target is one
325 		 *			of its children.
326 		 *
327 		 *	<path/>		This is a devid for a whole disk, and
328 		 *			the target is one of its children.
329 		 */
330 		if (path[len] != '\0' && path[len] != ':' &&
331 		    path[len - 1] != '/')
332 			return;
333 	}
334 
335 	(dp->dd_func)(zhp, nvl, dp->dd_isdisk);
336 }
337 
338 void
zfs_enable_ds(void * arg)339 zfs_enable_ds(void *arg)
340 {
341 	unavailpool_t *pool = (unavailpool_t *)arg;
342 
343 	(void) zpool_enable_datasets(pool->uap_zhp, NULL, 0);
344 	zpool_close(pool->uap_zhp);
345 	free(pool);
346 }
347 
348 static int
zfs_iter_pool(zpool_handle_t * zhp,void * data)349 zfs_iter_pool(zpool_handle_t *zhp, void *data)
350 {
351 	nvlist_t *config, *nvl;
352 	dev_data_t *dp = data;
353 	uint64_t pool_guid;
354 	unavailpool_t *pool;
355 
356 	if ((config = zpool_get_config(zhp, NULL)) != NULL) {
357 		if (dp->dd_pool_guid == 0 ||
358 		    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
359 		    &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) {
360 			(void) nvlist_lookup_nvlist(config,
361 			    ZPOOL_CONFIG_VDEV_TREE, &nvl);
362 			zfs_iter_vdev(zhp, nvl, data);
363 		}
364 	}
365 	if (g_enumeration_done)  {
366 		for (pool = list_head(&g_pool_list); pool != NULL;
367 		    pool = list_next(&g_pool_list, pool)) {
368 
369 			if (strcmp(zpool_get_name(zhp),
370 			    zpool_get_name(pool->uap_zhp)))
371 				continue;
372 			if (zfs_toplevel_state(zhp) >= VDEV_STATE_DEGRADED) {
373 				list_remove(&g_pool_list, pool);
374 				(void) tpool_dispatch(g_tpool, zfs_enable_ds,
375 				    pool);
376 				break;
377 			}
378 		}
379 	}
380 
381 	zpool_close(zhp);
382 	return (0);
383 }
384 
385 /*
386  * Given a physical device path, iterate over all (pool, vdev) pairs which
387  * correspond to the given path.
388  */
389 static boolean_t
devpath_iter(const char * devpath,zfs_process_func_t func,boolean_t wholedisk)390 devpath_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk)
391 {
392 	dev_data_t data = { 0 };
393 
394 	data.dd_compare = devpath;
395 	data.dd_func = func;
396 	data.dd_prop = ZPOOL_CONFIG_PHYS_PATH;
397 	data.dd_found = B_FALSE;
398 	data.dd_isdisk = wholedisk;
399 
400 	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
401 
402 	return (data.dd_found);
403 }
404 
405 /*
406  * Given a /devices path, lookup the corresponding devid for each minor node,
407  * and find any vdevs with matching devids.  Doing this straight up would be
408  * rather inefficient, O(minor nodes * vdevs in system), so we take advantage of
409  * the fact that each devid ends with "/<minornode>".  Once we find any valid
410  * minor node, we chop off the portion after the last slash, and then search for
411  * matching vdevs, which is O(vdevs in system).
412  */
413 static boolean_t
devid_iter(const char * devpath,zfs_process_func_t func,boolean_t wholedisk)414 devid_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk)
415 {
416 	size_t len = strlen(devpath) + sizeof ("/devices") +
417 	    sizeof (PHYS_PATH) - 1;
418 	char *fullpath;
419 	int fd;
420 	ddi_devid_t devid;
421 	char *devidstr, *fulldevid;
422 	dev_data_t data = { 0 };
423 
424 	/*
425 	 * Try to open a known minor node.
426 	 */
427 	fullpath = alloca(len);
428 	(void) snprintf(fullpath, len, "/devices%s%s", devpath, PHYS_PATH);
429 	if ((fd = open(fullpath, O_RDONLY)) < 0)
430 		return (B_FALSE);
431 
432 	/*
433 	 * Determine the devid as a string, with no trailing slash for the minor
434 	 * node.
435 	 */
436 	if (devid_get(fd, &devid) != 0) {
437 		(void) close(fd);
438 		return (B_FALSE);
439 	}
440 	(void) close(fd);
441 
442 	if ((devidstr = devid_str_encode(devid, NULL)) == NULL) {
443 		devid_free(devid);
444 		return (B_FALSE);
445 	}
446 
447 	len = strlen(devidstr) + 2;
448 	fulldevid = alloca(len);
449 	(void) snprintf(fulldevid, len, "%s/", devidstr);
450 
451 	data.dd_compare = fulldevid;
452 	data.dd_func = func;
453 	data.dd_prop = ZPOOL_CONFIG_DEVID;
454 	data.dd_found = B_FALSE;
455 	data.dd_isdisk = wholedisk;
456 
457 	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
458 
459 	devid_str_free(devidstr);
460 	devid_free(devid);
461 
462 	return (data.dd_found);
463 }
464 
465 /*
466  * This function is called when we receive a devfs add event.  This can be
467  * either a disk event or a lofi event, and the behavior is slightly different
468  * depending on which it is.
469  */
470 static int
zfs_deliver_add(nvlist_t * nvl,boolean_t is_lofi)471 zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
472 {
473 	char *devpath, *devname;
474 	char path[PATH_MAX], realpath[PATH_MAX];
475 	char *colon, *raw;
476 	int ret;
477 
478 	/*
479 	 * The main unit of operation is the physical device path.  For disks,
480 	 * this is the device node, as all minor nodes are affected.  For lofi
481 	 * devices, this includes the minor path.  Unfortunately, this isn't
482 	 * represented in the DEV_PHYS_PATH for various reasons.
483 	 */
484 	if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath) != 0)
485 		return (-1);
486 
487 	/*
488 	 * If this is a lofi device, then also get the minor instance name.
489 	 * Unfortunately, the current payload doesn't include an easy way to get
490 	 * this information.  So we cheat by resolving the 'dev_name' (which
491 	 * refers to the raw device) and taking the portion between ':(*),raw'.
492 	 */
493 	(void) strlcpy(realpath, devpath, sizeof (realpath));
494 	if (is_lofi) {
495 		if (nvlist_lookup_string(nvl, DEV_NAME,
496 		    &devname) == 0 &&
497 		    (ret = resolvepath(devname, path,
498 		    sizeof (path))) > 0) {
499 			path[ret] = '\0';
500 			colon = strchr(path, ':');
501 			if (colon != NULL)
502 				raw = strstr(colon + 1, ",raw");
503 			if (colon != NULL && raw != NULL) {
504 				*raw = '\0';
505 				(void) snprintf(realpath,
506 				    sizeof (realpath), "%s%s",
507 				    devpath, colon);
508 				*raw = ',';
509 			}
510 		}
511 	}
512 
513 	/*
514 	 * Iterate over all vdevs with a matching devid, and then those with a
515 	 * matching /devices path.  For disks, we only want to pay attention to
516 	 * vdevs marked as whole disks.  For lofi, we don't care (because we're
517 	 * matching an exact minor name).
518 	 */
519 	if (!devid_iter(realpath, zfs_process_add, !is_lofi))
520 		(void) devpath_iter(realpath, zfs_process_add, !is_lofi);
521 
522 	return (0);
523 }
524 
525 /*
526  * Called when we receive a VDEV_CHECK event, which indicates a device could not
527  * be opened during initial pool open, but the autoreplace property was set on
528  * the pool.  In this case, we treat it as if it were an add event.
529  */
530 static int
zfs_deliver_check(nvlist_t * nvl)531 zfs_deliver_check(nvlist_t *nvl)
532 {
533 	dev_data_t data = { 0 };
534 
535 	if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID,
536 	    &data.dd_pool_guid) != 0 ||
537 	    nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID,
538 	    &data.dd_vdev_guid) != 0 ||
539 	    data.dd_vdev_guid == 0)
540 		return (0);
541 
542 	data.dd_isdisk = B_TRUE;
543 	data.dd_func = zfs_process_add;
544 
545 	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
546 
547 	return (0);
548 }
549 
550 #define	DEVICE_PREFIX	"/devices"
551 
552 static int
zfsdle_vdev_online(zpool_handle_t * zhp,void * data)553 zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
554 {
555 	char *devname = data;
556 	boolean_t avail_spare, l2cache;
557 	vdev_state_t newstate;
558 	nvlist_t *tgt;
559 
560 	syseventd_print(9, "%s: searching for %s in pool %s\n", __func__,
561 	    devname, zpool_get_name(zhp));
562 
563 	if ((tgt = zpool_find_vdev_by_physpath(zhp, devname,
564 	    &avail_spare, &l2cache, NULL)) != NULL) {
565 		char *path, fullpath[MAXPATHLEN];
566 		uint64_t wholedisk = 0ULL;
567 
568 		/*
569 		 * If the /dev path of the device is invalid because the disk
570 		 * has been moved to a new location, we need to try to refresh
571 		 * that path before onlining the device.
572 		 */
573 		zpool_vdev_refresh_path(g_zfshdl, zhp, tgt);
574 
575 		verify(nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH,
576 		    &path) == 0);
577 		verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
578 		    &wholedisk) == 0);
579 
580 		syseventd_print(9, "%s: "
581 		    "found %s in pool %s (wholedisk: %s)\n", __func__,
582 		    path, zpool_get_name(zhp),
583 		    wholedisk != 0 ? "true" : "false");
584 
585 		(void) strlcpy(fullpath, path, sizeof (fullpath));
586 		if (wholedisk) {
587 			fullpath[strlen(fullpath) - 2] = '\0';
588 
589 			/*
590 			 * We need to reopen the pool associated with this
591 			 * device so that the kernel can update the size
592 			 * of the expanded device.
593 			 */
594 			(void) zpool_reopen(zhp);
595 		}
596 
597 		if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) {
598 			syseventd_print(9, "%s: "
599 			    "setting device %s to ONLINE state in pool %s.\n",
600 			    __func__, fullpath, zpool_get_name(zhp));
601 			if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) {
602 				(void) zpool_vdev_online(zhp, fullpath, 0,
603 				    &newstate);
604 			}
605 		}
606 		zpool_close(zhp);
607 		return (1);
608 	}
609 	zpool_close(zhp);
610 	return (0);
611 }
612 
613 /*
614  * This function is called for each vdev of a pool for which any of the
615  * following events was recieved:
616  *  - ESC_ZFS_vdev_add
617  *  - ESC_ZFS_vdev_attach
618  *  - ESC_ZFS_vdev_clear
619  *  - ESC_ZFS_vdev_online
620  *  - ESC_ZFS_pool_create
621  *  - ESC_ZFS_pool_import
622  * It will update the vdevs FRU property if it is out of date.
623  */
624 /*ARGSUSED2*/
625 static void
zfs_update_vdev_fru(zpool_handle_t * zhp,nvlist_t * vdev,boolean_t isdisk)626 zfs_update_vdev_fru(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t isdisk)
627 {
628 	char *devpath, *cptr, *oldfru = NULL;
629 	const char *newfru;
630 	uint64_t vdev_guid;
631 
632 	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &vdev_guid);
633 	(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &devpath);
634 	(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_FRU, &oldfru);
635 
636 	/* remove :<slice> from devpath */
637 	cptr = strrchr(devpath, ':');
638 	if (cptr != NULL)
639 		*cptr = '\0';
640 
641 	newfru = libzfs_fru_lookup(g_zfshdl, devpath);
642 	if (newfru == NULL) {
643 		syseventd_print(9, "zfs_update_vdev_fru: no FRU for %s\n",
644 		    devpath);
645 		return;
646 	}
647 
648 	/* do nothing if the FRU hasn't changed */
649 	if (oldfru != NULL && libzfs_fru_compare(g_zfshdl, oldfru, newfru)) {
650 		syseventd_print(9, "zfs_update_vdev_fru: FRU unchanged\n");
651 		return;
652 	}
653 
654 	syseventd_print(9, "zfs_update_vdev_fru: devpath = %s\n", devpath);
655 	syseventd_print(9, "zfs_update_vdev_fru: FRU = %s\n", newfru);
656 
657 	(void) zpool_fru_set(zhp, vdev_guid, newfru);
658 }
659 
660 /*
661  * This function handles the following events:
662  *  - ESC_ZFS_vdev_add
663  *  - ESC_ZFS_vdev_attach
664  *  - ESC_ZFS_vdev_clear
665  *  - ESC_ZFS_vdev_online
666  *  - ESC_ZFS_pool_create
667  *  - ESC_ZFS_pool_import
668  * It will iterate over the pool vdevs to update the FRU property.
669  */
670 int
zfs_deliver_update(nvlist_t * nvl)671 zfs_deliver_update(nvlist_t *nvl)
672 {
673 	dev_data_t dd = { 0 };
674 	char *pname;
675 	zpool_handle_t *zhp;
676 	nvlist_t *config, *vdev;
677 
678 	if (nvlist_lookup_string(nvl, "pool_name", &pname) != 0) {
679 		syseventd_print(9, "zfs_deliver_update: no pool name\n");
680 		return (-1);
681 	}
682 
683 	/*
684 	 * If this event was triggered by a pool export or destroy we cannot
685 	 * open the pool. This is not an error, just return 0 as we don't care
686 	 * about these events.
687 	 */
688 	zhp = zpool_open_canfail(g_zfshdl, pname);
689 	if (zhp == NULL)
690 		return (0);
691 
692 	config = zpool_get_config(zhp, NULL);
693 	if (config == NULL) {
694 		syseventd_print(9, "zfs_deliver_update: "
695 		    "failed to get pool config for %s\n", pname);
696 		zpool_close(zhp);
697 		return (-1);
698 	}
699 
700 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vdev) != 0) {
701 		syseventd_print(0, "zfs_deliver_update: "
702 		    "failed to get vdev tree for %s\n", pname);
703 		zpool_close(zhp);
704 		return (-1);
705 	}
706 
707 	libzfs_fru_refresh(g_zfshdl);
708 
709 	dd.dd_func = zfs_update_vdev_fru;
710 	zfs_iter_vdev(zhp, vdev, &dd);
711 
712 	zpool_close(zhp);
713 	return (0);
714 }
715 
716 int
zfs_deliver_dle(nvlist_t * nvl)717 zfs_deliver_dle(nvlist_t *nvl)
718 {
719 	char *devname;
720 	if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) != 0) {
721 		syseventd_print(9, "zfs_deliver_event: no physpath\n");
722 		return (-1);
723 	}
724 	if (strncmp(devname, DEVICE_PREFIX, strlen(DEVICE_PREFIX)) != 0) {
725 		syseventd_print(9, "zfs_deliver_event: invalid "
726 		    "device '%s'", devname);
727 		return (-1);
728 	}
729 
730 	/*
731 	 * We try to find the device using the physical
732 	 * path that has been supplied. We need to strip off
733 	 * the /devices prefix before starting our search.
734 	 */
735 	devname += strlen(DEVICE_PREFIX);
736 	if (zpool_iter(g_zfshdl, zfsdle_vdev_online, devname) != 1) {
737 		syseventd_print(9, "zfs_deliver_event: device '%s' not"
738 		    " found\n", devname);
739 		return (1);
740 	}
741 	return (0);
742 }
743 
744 
745 /*ARGSUSED*/
746 static int
zfs_deliver_event(sysevent_t * ev,int unused)747 zfs_deliver_event(sysevent_t *ev, int unused)
748 {
749 	const char *class = sysevent_get_class_name(ev);
750 	const char *subclass = sysevent_get_subclass_name(ev);
751 	nvlist_t *nvl;
752 	int ret;
753 	boolean_t is_lofi = B_FALSE, is_check = B_FALSE;
754 	boolean_t is_dle = B_FALSE, is_update = B_FALSE;
755 
756 	if (strcmp(class, EC_DEV_ADD) == 0) {
757 		/*
758 		 * We're mainly interested in disk additions, but we also listen
759 		 * for new lofi devices, to allow for simplified testing.
760 		 */
761 		if (strcmp(subclass, ESC_DISK) == 0)
762 			is_lofi = B_FALSE;
763 		else if (strcmp(subclass, ESC_LOFI) == 0)
764 			is_lofi = B_TRUE;
765 		else
766 			return (0);
767 
768 		is_check = B_FALSE;
769 	} else if (strcmp(class, EC_ZFS) == 0) {
770 		if (strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) {
771 			/*
772 			 * This event signifies that a device failed to open
773 			 * during pool load, but the 'autoreplace' property was
774 			 * set, so we should pretend it's just been added.
775 			 */
776 			is_check = B_TRUE;
777 		} else if ((strcmp(subclass, ESC_ZFS_VDEV_ADD) == 0) ||
778 		    (strcmp(subclass, ESC_ZFS_VDEV_ATTACH) == 0) ||
779 		    (strcmp(subclass, ESC_ZFS_VDEV_CLEAR) == 0) ||
780 		    (strcmp(subclass, ESC_ZFS_VDEV_ONLINE) == 0) ||
781 		    (strcmp(subclass, ESC_ZFS_POOL_CREATE) == 0) ||
782 		    (strcmp(subclass, ESC_ZFS_POOL_IMPORT) == 0)) {
783 			/*
784 			 * When we receive these events we check the pool
785 			 * configuration and update the vdev FRUs if necessary.
786 			 */
787 			is_update = B_TRUE;
788 		}
789 	} else if (strcmp(class, EC_DEV_STATUS) == 0 &&
790 	    strcmp(subclass, ESC_DEV_DLE) == 0) {
791 		is_dle = B_TRUE;
792 	} else {
793 		return (0);
794 	}
795 
796 	if (sysevent_get_attr_list(ev, &nvl) != 0)
797 		return (-1);
798 
799 	if (is_dle)
800 		ret = zfs_deliver_dle(nvl);
801 	else if (is_update)
802 		ret = zfs_deliver_update(nvl);
803 	else if (is_check)
804 		ret = zfs_deliver_check(nvl);
805 	else
806 		ret = zfs_deliver_add(nvl, is_lofi);
807 
808 	nvlist_free(nvl);
809 	return (ret);
810 }
811 
812 /*ARGSUSED*/
813 void *
zfs_enum_pools(void * arg)814 zfs_enum_pools(void *arg)
815 {
816 	(void) zpool_iter(g_zfshdl, zfs_unavail_pool, (void *)&g_pool_list);
817 	if (!list_is_empty(&g_pool_list))
818 		g_tpool = tpool_create(1, sysconf(_SC_NPROCESSORS_ONLN),
819 		    0, NULL);
820 	g_enumeration_done = B_TRUE;
821 	return (NULL);
822 }
823 
824 static struct slm_mod_ops zfs_mod_ops = {
825 	SE_MAJOR_VERSION, SE_MINOR_VERSION, 10, zfs_deliver_event
826 };
827 
828 struct slm_mod_ops *
slm_init()829 slm_init()
830 {
831 	if ((g_zfshdl = libzfs_init()) == NULL)
832 		return (NULL);
833 	/*
834 	 * collect a list of unavailable pools (asynchronously,
835 	 * since this can take a while)
836 	 */
837 	list_create(&g_pool_list, sizeof (struct unavailpool),
838 	    offsetof(struct unavailpool, uap_node));
839 	if (thr_create(NULL, 0, zfs_enum_pools, NULL, 0, &g_zfs_tid) != 0)
840 		return (NULL);
841 	return (&zfs_mod_ops);
842 }
843 
844 void
slm_fini()845 slm_fini()
846 {
847 	unavailpool_t *pool;
848 
849 	(void) thr_join(g_zfs_tid, NULL, NULL);
850 	if (g_tpool != NULL) {
851 		tpool_wait(g_tpool);
852 		tpool_destroy(g_tpool);
853 	}
854 	while ((pool = (list_head(&g_pool_list))) != NULL) {
855 		list_remove(&g_pool_list, pool);
856 		zpool_close(pool->uap_zhp);
857 		free(pool);
858 	}
859 	list_destroy(&g_pool_list);
860 	libzfs_fini(g_zfshdl);
861 }
862