xref: /illumos-gate/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c (revision b1d7ec75953cd517f5b7c3d9cb427ff8ec5d7d07)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * ZFS syseventd module.
27  *
28  * The purpose of this module is to identify when devices are added to the
29  * system, and appropriately online or replace the affected vdevs.
30  *
31  * When a device is added to the system:
32  *
33  * 	1. Search for any vdevs whose devid matches that of the newly added
34  *	   device.
35  *
36  * 	2. If no vdevs are found, then search for any vdevs whose devfs path
37  *	   matches that of the new device.
38  *
39  *	3. If no vdevs match by either method, then ignore the event.
40  *
41  * 	4. Attempt to online the device with a flag to indicate that it should
42  *	   be unspared when resilvering completes.  If this succeeds, then the
43  *	   same device was inserted and we should continue normally.
44  *
45  *	5. If the pool does not have the 'autoreplace' property set, attempt to
46  *	   online the device again without the unspare flag, which will
47  *	   generate a FMA fault.
48  *
49  *	6. If the pool has the 'autoreplace' property set, and the matching vdev
50  *	   is a whole disk, then label the new disk and attempt a 'zpool
51  *	   replace'.
52  *
53  * The module responds to EC_DEV_ADD events for both disks and lofi devices,
54  * with the latter used for testing.  The special ESC_ZFS_VDEV_CHECK event
55  * indicates that a device failed to open during pool load, but the autoreplace
56  * property was set.  In this case, we deferred the associated FMA fault until
57  * our module had a chance to process the autoreplace logic.  If the device
58  * could not be replaced, then the second online attempt will trigger the FMA
59  * fault that we skipped earlier.
60  */
61 
62 #include <alloca.h>
63 #include <devid.h>
64 #include <fcntl.h>
65 #include <libnvpair.h>
66 #include <libsysevent.h>
67 #include <libzfs.h>
68 #include <limits.h>
69 #include <stdlib.h>
70 #include <string.h>
71 #include <syslog.h>
72 #include <sys/sunddi.h>
73 #include <sys/sysevent/eventdefs.h>
74 #include <sys/sysevent/dev.h>
75 #include <unistd.h>
76 #include "syseventd.h"
77 
/*
 * Architecture-specific device name suffixes.
 *
 * PHYS_PATH is the minor node suffix appended to a /devices path when
 * devid_iter() opens a known minor node in order to read the devid.
 * RAW_SLICE is defined alongside it but is not referenced by the code in
 * this file.
 */
#if defined(__i386) || defined(__amd64)
#define	PHYS_PATH	":q"
#define	RAW_SLICE	"p0"
#elif defined(__sparc)
#define	PHYS_PATH	":c"
#define	RAW_SLICE	"s2"
#else
#error Unknown architecture
#endif

/*
 * Callback invoked for every vdev that matches a search.  It receives the
 * pool handle, the vdev's nvlist, and the 'isdisk' flag of the search.
 */
typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t);

/*
 * libzfs handle shared by every function in this module.  It is created in
 * slm_init() and released in slm_fini().
 */
libzfs_handle_t *g_zfshdl;
91 
92 /*
93  * The device associated with the given vdev (either by devid or physical path)
94  * has been added to the system.  If 'isdisk' is set, then we only attempt a
95  * replacement if it's a whole disk.  This also implies that we should label the
96  * disk first.
97  *
98  * First, we attempt to online the device (making sure to undo any spare
99  * operation when finished).  If this succeeds, then we're done.  If it fails,
100  * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened,
101  * but that the label was not what we expected.  If the 'autoreplace' property
102  * is not set, then we relabel the disk (if specified), and attempt a 'zpool
103  * replace'.  If the online is successful, but the new state is something else
104  * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of
105  * race, and we should avoid attempting to relabel the disk.
106  */
107 static void
108 zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t isdisk)
109 {
110 	char *path;
111 	vdev_state_t newstate;
112 	nvlist_t *nvroot, *newvd;
113 	uint64_t wholedisk = 0ULL;
114 	char *physpath = NULL;
115 	char rawpath[PATH_MAX], fullpath[PATH_MAX];
116 	size_t len;
117 
118 	if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
119 		return;
120 
121 	(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath);
122 	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
123 
124 	/*
125 	 * We should have a way to online a device by guid.  With the current
126 	 * interface, we are forced to chop off the 's0' for whole disks.
127 	 */
128 	(void) strlcpy(fullpath, path, sizeof (fullpath));
129 	if (wholedisk)
130 		fullpath[strlen(fullpath) - 2] = '\0';
131 
132 	/*
133 	 * Attempt to online the device.  It would be nice to online this by
134 	 * GUID, but the current interface only supports lookup by path.
135 	 */
136 	if (zpool_vdev_online(zhp, fullpath,
137 	    ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
138 	    (newstate == VDEV_STATE_HEALTHY || newstate == VDEV_STATE_DEGRADED))
139 		return;
140 
141 	/*
142 	 * If the pool doesn't have the autoreplace property set, then attempt a
143 	 * true online (without the unspare flag), which will trigger a FMA
144 	 * fault.
145 	 */
146 	if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
147 	    (isdisk && !wholedisk)) {
148 		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
149 		    &newstate);
150 		return;
151 	}
152 
153 	if (isdisk) {
154 		/*
155 		 * If this is a request to label a whole disk, then attempt to
156 		 * write out the label.  Before we can label the disk, we need
157 		 * access to a raw node.  Ideally, we'd like to walk the devinfo
158 		 * tree and find a raw node from the corresponding parent node.
159 		 * This is overly complicated, and since we know how we labeled
160 		 * this device in the first place, we know it's save to switch
161 		 * from /dev/dsk to /dev/rdsk and append the backup slice.
162 		 *
163 		 * If any part of this process fails, then do a force online to
164 		 * trigger a ZFS fault for the device (and any hot spare
165 		 * replacement).
166 		 */
167 		if (strncmp(path, "/dev/dsk/", 9) != 0) {
168 			(void) zpool_vdev_online(zhp, fullpath,
169 			    ZFS_ONLINE_FORCEFAULT, &newstate);
170 			return;
171 		}
172 
173 		(void) strlcpy(rawpath, path + 9, sizeof (rawpath));
174 		len = strlen(rawpath);
175 		rawpath[len - 2] = '\0';
176 
177 		if (zpool_label_disk(g_zfshdl, zhp, rawpath) != 0) {
178 			(void) zpool_vdev_online(zhp, fullpath,
179 			    ZFS_ONLINE_FORCEFAULT, &newstate);
180 			return;
181 		}
182 	}
183 
184 	/*
185 	 * Cosntruct the root vdev to pass to zpool_vdev_attach().  While adding
186 	 * the entire vdev structure is harmless, we construct a reduced set of
187 	 * path/physpath/wholedisk to keep it simple.
188 	 */
189 	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0)
190 		return;
191 
192 	if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
193 		nvlist_free(nvroot);
194 		return;
195 	}
196 
197 	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 ||
198 	    nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 ||
199 	    (physpath != NULL && nvlist_add_string(newvd,
200 	    ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) ||
201 	    nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 ||
202 	    nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 ||
203 	    nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd,
204 	    1) != 0) {
205 		nvlist_free(newvd);
206 		nvlist_free(nvroot);
207 		return;
208 	}
209 
210 	nvlist_free(newvd);
211 
212 	(void) zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE);
213 
214 	nvlist_free(nvroot);
215 
216 }
217 
/*
 * Utility functions to find a vdev matching given criteria.
 *
 * A dev_data_t describes one search and is handed to zfs_iter_pool() and
 * zfs_iter_vdev() as the opaque 'data' argument of zpool_iter().
 */
typedef struct dev_data {
	/* string compared against each leaf vdev's dd_prop value */
	const char		*dd_compare;
	/* name of the nvlist string property to compare against */
	const char		*dd_prop;
	/* callback invoked for each matching vdev */
	zfs_process_func_t	dd_func;
	/* result flag reported by devpath_iter() and devid_iter() */
	boolean_t		dd_found;
	/* passed through unchanged as the last argument of dd_func */
	boolean_t		dd_isdisk;
	/* if nonzero, only the pool with this guid is searched */
	uint64_t		dd_pool_guid;
	/* if nonzero, match vdevs by this guid instead of by string */
	uint64_t		dd_vdev_guid;
} dev_data_t;
230 
231 static void
232 zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
233 {
234 	dev_data_t *dp = data;
235 	char *path;
236 	uint_t c, children;
237 	nvlist_t **child;
238 	size_t len;
239 	uint64_t guid;
240 
241 	/*
242 	 * First iterate over any children.
243 	 */
244 	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
245 	    &child, &children) == 0) {
246 		for (c = 0; c < children; c++)
247 			zfs_iter_vdev(zhp, child[c], data);
248 		return;
249 	}
250 
251 	if (dp->dd_vdev_guid != 0) {
252 		if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
253 		    &guid) != 0 || guid != dp->dd_vdev_guid)
254 			return;
255 	} else {
256 		len = strlen(dp->dd_compare);
257 
258 		if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
259 		    strncmp(dp->dd_compare, path, len) != 0)
260 			return;
261 
262 		/*
263 		 * Normally, we want to have an exact match for the comparison
264 		 * string.  However, we allow substring matches in the following
265 		 * cases:
266 		 *
267 		 * 	<path>:		This is a devpath, and the target is one
268 		 * 			of its children.
269 		 *
270 		 * 	<path/>		This is a devid for a whole disk, and
271 		 * 			the target is one of its children.
272 		 */
273 		if (path[len] != '\0' && path[len] != ':' &&
274 		    path[len - 1] != '/')
275 			return;
276 	}
277 
278 	(dp->dd_func)(zhp, nvl, dp->dd_isdisk);
279 }
280 
281 static int
282 zfs_iter_pool(zpool_handle_t *zhp, void *data)
283 {
284 	nvlist_t *config, *nvl;
285 	dev_data_t *dp = data;
286 	uint64_t pool_guid;
287 
288 	if ((config = zpool_get_config(zhp, NULL)) != NULL) {
289 		if (dp->dd_pool_guid == 0 ||
290 		    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
291 		    &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) {
292 			(void) nvlist_lookup_nvlist(config,
293 			    ZPOOL_CONFIG_VDEV_TREE, &nvl);
294 			zfs_iter_vdev(zhp, nvl, data);
295 		}
296 	}
297 
298 	zpool_close(zhp);
299 	return (0);
300 }
301 
302 /*
303  * Given a physical device path, iterate over all (pool, vdev) pairs which
304  * correspond to the given path.
305  */
306 static boolean_t
307 devpath_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk)
308 {
309 	dev_data_t data = { 0 };
310 
311 	data.dd_compare = devpath;
312 	data.dd_func = func;
313 	data.dd_prop = ZPOOL_CONFIG_PHYS_PATH;
314 	data.dd_found = B_FALSE;
315 	data.dd_isdisk = wholedisk;
316 
317 	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
318 
319 	return (data.dd_found);
320 }
321 
322 /*
323  * Given a /devices path, lookup the corresponding devid for each minor node,
324  * and find any vdevs with matching devids.  Doing this straight up would be
325  * rather inefficient, O(minor nodes * vdevs in system), so we take advantage of
326  * the fact that each devid ends with "/<minornode>".  Once we find any valid
327  * minor node, we chop off the portion after the last slash, and then search for
328  * matching vdevs, which is O(vdevs in system).
329  */
330 static boolean_t
331 devid_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk)
332 {
333 	size_t len = strlen(devpath) + sizeof ("/devices") +
334 	    sizeof (PHYS_PATH) - 1;
335 	char *fullpath;
336 	int fd;
337 	ddi_devid_t devid;
338 	char *devidstr, *fulldevid;
339 	dev_data_t data = { 0 };
340 
341 	/*
342 	 * Try to open a known minor node.
343 	 */
344 	fullpath = alloca(len);
345 	(void) snprintf(fullpath, len, "/devices%s%s", devpath, PHYS_PATH);
346 	if ((fd = open(fullpath, O_RDONLY)) < 0)
347 		return (B_FALSE);
348 
349 	/*
350 	 * Determine the devid as a string, with no trailing slash for the minor
351 	 * node.
352 	 */
353 	if (devid_get(fd, &devid) != 0) {
354 		(void) close(fd);
355 		return (B_FALSE);
356 	}
357 	(void) close(fd);
358 
359 	if ((devidstr = devid_str_encode(devid, NULL)) == NULL) {
360 		devid_free(devid);
361 		return (B_FALSE);
362 	}
363 
364 	len = strlen(devidstr) + 2;
365 	fulldevid = alloca(len);
366 	(void) snprintf(fulldevid, len, "%s/", devidstr);
367 
368 	data.dd_compare = fulldevid;
369 	data.dd_func = func;
370 	data.dd_prop = ZPOOL_CONFIG_DEVID;
371 	data.dd_found = B_FALSE;
372 	data.dd_isdisk = wholedisk;
373 
374 	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
375 
376 	devid_str_free(devidstr);
377 	devid_free(devid);
378 
379 	return (data.dd_found);
380 }
381 
382 /*
383  * This function is called when we receive a devfs add event.  This can be
384  * either a disk event or a lofi event, and the behavior is slightly different
385  * depending on which it is.
386  */
387 static int
388 zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
389 {
390 	char *devpath, *devname;
391 	char path[PATH_MAX], realpath[PATH_MAX];
392 	char *colon, *raw;
393 	int ret;
394 
395 	/*
396 	 * The main unit of operation is the physical device path.  For disks,
397 	 * this is the device node, as all minor nodes are affected.  For lofi
398 	 * devices, this includes the minor path.  Unfortunately, this isn't
399 	 * represented in the DEV_PHYS_PATH for various reasons.
400 	 */
401 	if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath) != 0)
402 		return (-1);
403 
404 	/*
405 	 * If this is a lofi device, then also get the minor instance name.
406 	 * Unfortunately, the current payload doesn't include an easy way to get
407 	 * this information.  So we cheat by resolving the 'dev_name' (which
408 	 * refers to the raw device) and taking the portion between ':(*),raw'.
409 	 */
410 	(void) strlcpy(realpath, devpath, sizeof (realpath));
411 	if (is_lofi) {
412 		if (nvlist_lookup_string(nvl, DEV_NAME,
413 		    &devname) == 0 &&
414 		    (ret = resolvepath(devname, path,
415 		    sizeof (path))) > 0) {
416 			path[ret] = '\0';
417 			colon = strchr(path, ':');
418 			if (colon != NULL)
419 				raw = strstr(colon + 1, ",raw");
420 			if (colon != NULL && raw != NULL) {
421 				*raw = '\0';
422 				(void) snprintf(realpath,
423 				    sizeof (realpath), "%s%s",
424 				    devpath, colon);
425 				*raw = ',';
426 			}
427 		}
428 	}
429 
430 	/*
431 	 * Iterate over all vdevs with a matching devid, and then those with a
432 	 * matching /devices path.  For disks, we only want to pay attention to
433 	 * vdevs marked as whole disks.  For lofi, we don't care (because we're
434 	 * matching an exact minor name).
435 	 */
436 	if (!devid_iter(realpath, zfs_process_add, !is_lofi))
437 		(void) devpath_iter(realpath, zfs_process_add, !is_lofi);
438 
439 	return (0);
440 }
441 
442 /*
443  * Called when we receive a VDEV_CHECK event, which indicates a device could not
444  * be opened during initial pool open, but the autoreplace property was set on
445  * the pool.  In this case, we treat it as if it were an add event.
446  */
447 static int
448 zfs_deliver_check(nvlist_t *nvl)
449 {
450 	dev_data_t data = { 0 };
451 
452 	if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID,
453 	    &data.dd_pool_guid) != 0 ||
454 	    nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID,
455 	    &data.dd_vdev_guid) != 0)
456 		return (0);
457 
458 	data.dd_isdisk = B_TRUE;
459 	data.dd_func = zfs_process_add;
460 
461 	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
462 
463 	return (0);
464 }
465 
466 #define	DEVICE_PREFIX	"/devices"
467 
468 static int
469 zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
470 {
471 	char *devname = data;
472 	boolean_t avail_spare, l2cache;
473 	vdev_state_t newstate;
474 	nvlist_t *tgt;
475 
476 	syseventd_print(9, "zfsdle_vdev_online: searching for %s in pool %s\n",
477 	    devname, zpool_get_name(zhp));
478 
479 	if ((tgt = zpool_find_vdev_by_physpath(zhp, devname,
480 	    &avail_spare, &l2cache, NULL)) != NULL) {
481 		char *path, fullpath[MAXPATHLEN];
482 		uint64_t wholedisk = 0ULL;
483 
484 		verify(nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH,
485 		    &path) == 0);
486 		verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
487 		    &wholedisk) == 0);
488 
489 		(void) strlcpy(fullpath, path, sizeof (fullpath));
490 		if (wholedisk)
491 			fullpath[strlen(fullpath) - 2] = '\0';
492 
493 		if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) {
494 			syseventd_print(9, "zfsdle_vdev_online: setting device"
495 			    " device %s to ONLINE state in pool %s.\n",
496 			    fullpath, zpool_get_name(zhp));
497 			if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL)
498 				(void) zpool_vdev_online(zhp, fullpath, 0,
499 				    &newstate);
500 		}
501 		zpool_close(zhp);
502 		return (1);
503 	}
504 	zpool_close(zhp);
505 	return (0);
506 }
507 
508 int
509 zfs_deliver_dle(nvlist_t *nvl)
510 {
511 	char *devname;
512 	if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) != 0) {
513 		syseventd_print(9, "zfs_deliver_event: no physpath\n");
514 		return (-1);
515 	}
516 	if (strncmp(devname, DEVICE_PREFIX, strlen(DEVICE_PREFIX)) != 0) {
517 		syseventd_print(9, "zfs_deliver_event: invalid "
518 		    "device '%s'", devname);
519 		return (-1);
520 	}
521 
522 	/*
523 	 * We try to find the device using the physical
524 	 * path that has been supplied. We need to strip off
525 	 * the /devices prefix before starting our search.
526 	 */
527 	devname += strlen(DEVICE_PREFIX);
528 	if (zpool_iter(g_zfshdl, zfsdle_vdev_online, devname) != 1) {
529 		syseventd_print(9, "zfs_deliver_event: device '%s' not"
530 		    " found\n", devname);
531 		return (1);
532 	}
533 	nvlist_free(nvl);
534 	return (0);
535 }
536 
537 
538 /*ARGSUSED*/
539 static int
540 zfs_deliver_event(sysevent_t *ev, int unused)
541 {
542 	const char *class = sysevent_get_class_name(ev);
543 	const char *subclass = sysevent_get_subclass_name(ev);
544 	nvlist_t *nvl;
545 	int ret;
546 	boolean_t is_lofi, is_check, is_dle = B_FALSE;
547 
548 	if (strcmp(class, EC_DEV_ADD) == 0) {
549 		/*
550 		 * We're mainly interested in disk additions, but we also listen
551 		 * for new lofi devices, to allow for simplified testing.
552 		 */
553 		if (strcmp(subclass, ESC_DISK) == 0)
554 			is_lofi = B_FALSE;
555 		else if (strcmp(subclass, ESC_LOFI) == 0)
556 			is_lofi = B_TRUE;
557 		else
558 			return (0);
559 
560 		is_check = B_FALSE;
561 	} else if (strcmp(class, EC_ZFS) == 0 &&
562 	    strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) {
563 		/*
564 		 * This event signifies that a device failed to open during pool
565 		 * load, but the 'autoreplace' property was set, so we should
566 		 * pretend it's just been added.
567 		 */
568 		is_check = B_TRUE;
569 	} else if (strcmp(class, EC_DEV_STATUS) == 0 &&
570 	    strcmp(subclass, ESC_DEV_DLE) == 0) {
571 		is_dle = B_TRUE;
572 	} else {
573 		return (0);
574 	}
575 
576 	if (sysevent_get_attr_list(ev, &nvl) != 0)
577 		return (-1);
578 
579 	if (is_dle)
580 		ret = zfs_deliver_dle(nvl);
581 	else if (is_check)
582 		ret = zfs_deliver_check(nvl);
583 	else
584 		ret = zfs_deliver_add(nvl, is_lofi);
585 
586 	nvlist_free(nvl);
587 	return (ret);
588 }
589 
/*
 * Module operations table handed back to the caller of slm_init(): the
 * SE_MAJOR_VERSION/SE_MINOR_VERSION pair, the constant 10, and the event
 * delivery function.
 * NOTE(review): the meaning of the third field (10) is defined by
 * struct slm_mod_ops in syseventd.h, presumably a retry limit -- confirm.
 */
static struct slm_mod_ops zfs_mod_ops = {
	SE_MAJOR_VERSION, SE_MINOR_VERSION, 10, zfs_deliver_event
};
593 
594 struct slm_mod_ops *
595 slm_init()
596 {
597 	if ((g_zfshdl = libzfs_init()) == NULL)
598 		return (NULL);
599 
600 	return (&zfs_mod_ops);
601 }
602 
/*
 * Module teardown.  Releases the libzfs handle created by slm_init().
 */
void
slm_fini()
{
	libzfs_fini(g_zfshdl);
}
608