xref: /illumos-gate/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c (revision 36cb57a52da3a1f84927aadcedf362cd3939f2b9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * ZFS syseventd module.
30  *
31  * The purpose of this module is to identify when devices are added to the
32  * system, and appropriately online or replace the affected vdevs.
33  *
34  * When a device is added to the system:
35  *
36  * 	1. Search for any vdevs whose devid matches that of the newly added
37  *	   device.
38  *
39  * 	2. If no vdevs are found, then search for any vdevs whose devfs path
40  *	   matches that of the new device.
41  *
42  *	3. If no vdevs match by either method, then ignore the event.
43  *
44  * 	4. Attempt to online the device with a flag to indicate that it should
45  *	   be unspared when resilvering completes.  If this succeeds, then the
46  *	   same device was inserted and we should continue normally.
47  *
48  *	5. If the pool does not have the 'autoreplace' property set, attempt to
49  *	   online the device again without the unspare flag, which will
50  *	   generate a FMA fault.
51  *
52  *	6. If the pool has the 'autoreplace' property set, and the matching vdev
53  *	   is a whole disk, then label the new disk and attempt a 'zpool
54  *	   replace'.
55  *
56  * The module responds to EC_DEV_ADD events for both disks and lofi devices,
57  * with the latter used for testing.  The special ESC_ZFS_VDEV_CHECK event
58  * indicates that a device failed to open during pool load, but the autoreplace
59  * property was set.  In this case, we deferred the associated FMA fault until
60  * our module had a chance to process the autoreplace logic.  If the device
61  * could not be replaced, then the second online attempt will trigger the FMA
62  * fault that we skipped earlier.
63  */
64 
65 #include <alloca.h>
66 #include <devid.h>
67 #include <fcntl.h>
68 #include <libnvpair.h>
69 #include <libsysevent.h>
70 #include <libzfs.h>
71 #include <limits.h>
72 #include <stdlib.h>
73 #include <string.h>
74 #include <syslog.h>
75 #include <sys/sunddi.h>
76 #include <sys/sysevent/eventdefs.h>
77 #include <sys/sysevent/dev.h>
78 #include <unistd.h>
79 
80 #if defined(__i386) || defined(__amd64)
81 #define	PHYS_PATH	":q"
82 #define	RAW_SLICE	"p0"
83 #elif defined(__sparc)
84 #define	PHYS_PATH	":c"
85 #define	RAW_SLICE	"s2"
86 #else
87 #error Unknown architecture
88 #endif
89 
90 typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t);
91 
92 libzfs_handle_t *g_zfshdl;
93 
94 /*
95  * The device associated with the given vdev (either by devid or physical path)
96  * has been added to the system.  If 'isdisk' is set, then we only attempt a
97  * replacement if it's a whole disk.  This also implies that we should label the
98  * disk first.
99  *
100  * First, we attempt to online the device (making sure to undo any spare
101  * operation when finished).  If this succeeds, then we're done.  If it fails,
102  * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened,
103  * but that the label was not what we expected.  If the 'autoreplace' property
104  * is not set, then we relabel the disk (if specified), and attempt a 'zpool
105  * replace'.  If the online is successful, but the new state is something else
106  * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of
107  * race, and we should avoid attempting to relabel the disk.
108  */
109 static void
110 zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t isdisk)
111 {
112 	char *path;
113 	vdev_state_t newstate;
114 	nvlist_t *nvroot, *newvd;
115 	uint64_t wholedisk = 0ULL;
116 	char *physpath = NULL;
117 	char rawpath[PATH_MAX], fullpath[PATH_MAX];
118 	size_t len;
119 
120 	if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
121 		return;
122 
123 	(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath);
124 	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
125 
126 	/*
127 	 * We should have a way to online a device by guid.  With the current
128 	 * interface, we are forced to chop off the 's0' for whole disks.
129 	 */
130 	(void) strlcpy(fullpath, path, sizeof (fullpath));
131 	if (wholedisk)
132 		fullpath[strlen(fullpath) - 2] = '\0';
133 
134 	/*
135 	 * Attempt to online the device.  It would be nice to online this by
136 	 * GUID, but the current interface only supports lookup by path.
137 	 */
138 	if (zpool_vdev_online(zhp, fullpath,
139 	    ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
140 	    (newstate == VDEV_STATE_HEALTHY || newstate == VDEV_STATE_DEGRADED))
141 		return;
142 
143 	/*
144 	 * If the pool doesn't have the autoreplace property set, then attempt a
145 	 * true online (without the unspare flag), which will trigger a FMA
146 	 * fault.
147 	 */
148 	if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
149 	    (isdisk && !wholedisk)) {
150 		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
151 		    &newstate);
152 		return;
153 	}
154 
155 	if (isdisk) {
156 		/*
157 		 * If this is a request to label a whole disk, then attempt to
158 		 * write out the label.  Before we can label the disk, we need
159 		 * access to a raw node.  Ideally, we'd like to walk the devinfo
160 		 * tree and find a raw node from the corresponding parent node.
161 		 * This is overly complicated, and since we know how we labeled
162 		 * this device in the first place, we know it's save to switch
163 		 * from /dev/dsk to /dev/rdsk and append the backup slice.
164 		 *
165 		 * If any part of this process fails, then do a force online to
166 		 * trigger a ZFS fault for the device (and any hot spare
167 		 * replacement).
168 		 */
169 		if (strncmp(path, "/dev/dsk/", 9) != 0) {
170 			(void) zpool_vdev_online(zhp, fullpath,
171 			    ZFS_ONLINE_FORCEFAULT, &newstate);
172 			return;
173 		}
174 
175 		(void) strlcpy(rawpath, path + 9, sizeof (rawpath));
176 		len = strlen(rawpath);
177 		rawpath[len - 2] = '\0';
178 
179 		if (zpool_label_disk(g_zfshdl, zhp, rawpath) != 0) {
180 			(void) zpool_vdev_online(zhp, fullpath,
181 			    ZFS_ONLINE_FORCEFAULT, &newstate);
182 			return;
183 		}
184 	}
185 
186 	/*
187 	 * Cosntruct the root vdev to pass to zpool_vdev_attach().  While adding
188 	 * the entire vdev structure is harmless, we construct a reduced set of
189 	 * path/physpath/wholedisk to keep it simple.
190 	 */
191 	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0)
192 		return;
193 
194 	if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
195 		nvlist_free(nvroot);
196 		return;
197 	}
198 
199 	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 ||
200 	    nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 ||
201 	    (physpath != NULL && nvlist_add_string(newvd,
202 	    ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) ||
203 	    nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 ||
204 	    nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 ||
205 	    nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd,
206 	    1) != 0) {
207 		nvlist_free(newvd);
208 		nvlist_free(nvroot);
209 		return;
210 	}
211 
212 	nvlist_free(newvd);
213 
214 	(void) zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE);
215 
216 	nvlist_free(nvroot);
217 
218 }
219 
220 /*
221  * Utility functions to find a vdev matching given criteria.
222  */
223 typedef struct dev_data {
224 	const char		*dd_compare;
225 	const char		*dd_prop;
226 	zfs_process_func_t	dd_func;
227 	boolean_t		dd_found;
228 	boolean_t		dd_isdisk;
229 	uint64_t		dd_pool_guid;
230 	uint64_t		dd_vdev_guid;
231 } dev_data_t;
232 
233 static void
234 zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
235 {
236 	dev_data_t *dp = data;
237 	char *path;
238 	uint_t c, children;
239 	nvlist_t **child;
240 	size_t len;
241 	uint64_t guid;
242 
243 	/*
244 	 * First iterate over any children.
245 	 */
246 	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
247 	    &child, &children) == 0) {
248 		for (c = 0; c < children; c++)
249 			zfs_iter_vdev(zhp, child[c], data);
250 		return;
251 	}
252 
253 	if (dp->dd_vdev_guid != 0) {
254 		if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
255 		    &guid) != 0 || guid != dp->dd_vdev_guid)
256 			return;
257 	} else {
258 		len = strlen(dp->dd_compare);
259 
260 		if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
261 		    strncmp(dp->dd_compare, path, len) != 0)
262 			return;
263 
264 		/*
265 		 * Normally, we want to have an exact match for the comparison
266 		 * string.  However, we allow substring matches in the following
267 		 * cases:
268 		 *
269 		 * 	<path>:		This is a devpath, and the target is one
270 		 * 			of its children.
271 		 *
272 		 * 	<path/>		This is a devid for a whole disk, and
273 		 * 			the target is one of its children.
274 		 */
275 		if (path[len] != '\0' && path[len] != ':' &&
276 		    path[len - 1] != '/')
277 			return;
278 	}
279 
280 	(dp->dd_func)(zhp, nvl, dp->dd_isdisk);
281 }
282 
283 static int
284 zfs_iter_pool(zpool_handle_t *zhp, void *data)
285 {
286 	nvlist_t *config, *nvl;
287 	dev_data_t *dp = data;
288 	uint64_t pool_guid;
289 
290 	if ((config = zpool_get_config(zhp, NULL)) != NULL) {
291 		if (dp->dd_pool_guid == 0 ||
292 		    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
293 		    &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) {
294 			(void) nvlist_lookup_nvlist(config,
295 			    ZPOOL_CONFIG_VDEV_TREE, &nvl);
296 			zfs_iter_vdev(zhp, nvl, data);
297 		}
298 	}
299 
300 	zpool_close(zhp);
301 	return (0);
302 }
303 
304 /*
305  * Given a physical device path, iterate over all (pool, vdev) pairs which
306  * correspond to the given path.
307  */
308 static boolean_t
309 devpath_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk)
310 {
311 	dev_data_t data = { 0 };
312 
313 	data.dd_compare = devpath;
314 	data.dd_func = func;
315 	data.dd_prop = ZPOOL_CONFIG_PHYS_PATH;
316 	data.dd_found = B_FALSE;
317 	data.dd_isdisk = wholedisk;
318 
319 	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
320 
321 	return (data.dd_found);
322 }
323 
324 /*
325  * Given a /devices path, lookup the corresponding devid for each minor node,
326  * and find any vdevs with matching devids.  Doing this straight up would be
327  * rather inefficient, O(minor nodes * vdevs in system), so we take advantage of
328  * the fact that each devid ends with "/<minornode>".  Once we find any valid
329  * minor node, we chop off the portion after the last slash, and then search for
330  * matching vdevs, which is O(vdevs in system).
331  */
332 static boolean_t
333 devid_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk)
334 {
335 	size_t len = strlen(devpath) + sizeof ("/devices") +
336 	    sizeof (PHYS_PATH) - 1;
337 	char *fullpath;
338 	int fd;
339 	ddi_devid_t devid;
340 	char *devidstr, *fulldevid;
341 	dev_data_t data = { 0 };
342 
343 	/*
344 	 * Try to open a known minor node.
345 	 */
346 	fullpath = alloca(len);
347 	(void) snprintf(fullpath, len, "/devices%s%s", devpath, PHYS_PATH);
348 	if ((fd = open(fullpath, O_RDONLY)) < 0)
349 		return (B_FALSE);
350 
351 	/*
352 	 * Determine the devid as a string, with no trailing slash for the minor
353 	 * node.
354 	 */
355 	if (devid_get(fd, &devid) != 0) {
356 		(void) close(fd);
357 		return (B_FALSE);
358 	}
359 	(void) close(fd);
360 
361 	if ((devidstr = devid_str_encode(devid, NULL)) == NULL) {
362 		devid_free(devid);
363 		return (B_FALSE);
364 	}
365 
366 	len = strlen(devidstr) + 2;
367 	fulldevid = alloca(len);
368 	(void) snprintf(fulldevid, len, "%s/", devidstr);
369 
370 	data.dd_compare = fulldevid;
371 	data.dd_func = func;
372 	data.dd_prop = ZPOOL_CONFIG_DEVID;
373 	data.dd_found = B_FALSE;
374 	data.dd_isdisk = wholedisk;
375 
376 	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
377 
378 	devid_str_free(devidstr);
379 
380 	return (data.dd_found);
381 }
382 
383 /*
384  * This function is called when we receive a devfs add event.  This can be
385  * either a disk event or a lofi event, and the behavior is slightly different
386  * depending on which it is.
387  */
388 static int
389 zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
390 {
391 	char *devpath, *devname;
392 	char path[PATH_MAX], realpath[PATH_MAX];
393 	char *colon, *raw;
394 	int ret;
395 
396 	/*
397 	 * The main unit of operation is the physical device path.  For disks,
398 	 * this is the device node, as all minor nodes are affected.  For lofi
399 	 * devices, this includes the minor path.  Unfortunately, this isn't
400 	 * represented in the DEV_PHYS_PATH for various reasons.
401 	 */
402 	if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath) != 0)
403 		return (-1);
404 
405 	/*
406 	 * If this is a lofi device, then also get the minor instance name.
407 	 * Unfortunately, the current payload doesn't include an easy way to get
408 	 * this information.  So we cheat by resolving the 'dev_name' (which
409 	 * refers to the raw device) and taking the portion between ':(*),raw'.
410 	 */
411 	(void) strlcpy(realpath, devpath, sizeof (realpath));
412 	if (is_lofi) {
413 		if (nvlist_lookup_string(nvl, DEV_NAME,
414 		    &devname) == 0 &&
415 		    (ret = resolvepath(devname, path,
416 		    sizeof (path))) > 0) {
417 			path[ret] = '\0';
418 			colon = strchr(path, ':');
419 			if (colon != NULL)
420 				raw = strstr(colon + 1, ",raw");
421 			if (colon != NULL && raw != NULL) {
422 				*raw = '\0';
423 				(void) snprintf(realpath,
424 				    sizeof (realpath), "%s%s",
425 				    devpath, colon);
426 				*raw = ',';
427 			}
428 		}
429 	}
430 
431 	/*
432 	 * Iterate over all vdevs with a matching devid, and then those with a
433 	 * matching /devices path.  For disks, we only want to pay attention to
434 	 * vdevs marked as whole disks.  For lofi, we don't care (because we're
435 	 * matching an exact minor name).
436 	 */
437 	if (!devid_iter(realpath, zfs_process_add, !is_lofi))
438 		(void) devpath_iter(realpath, zfs_process_add, !is_lofi);
439 
440 	return (0);
441 }
442 
443 /*
444  * Called when we receive a VDEV_CHECK event, which indicates a device could not
445  * be opened during initial pool open, but the autoreplace property was set on
446  * the pool.  In this case, we treat it as if it were an add event.
447  */
448 static int
449 zfs_deliver_check(nvlist_t *nvl)
450 {
451 	dev_data_t data = { 0 };
452 
453 	if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID,
454 	    &data.dd_pool_guid) != 0 ||
455 	    nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID,
456 	    &data.dd_vdev_guid) != 0)
457 		return (0);
458 
459 	data.dd_isdisk = B_TRUE;
460 	data.dd_func = zfs_process_add;
461 
462 	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
463 
464 	return (0);
465 }
466 
467 /*ARGSUSED*/
468 static int
469 zfs_deliver_event(sysevent_t *ev, int unused)
470 {
471 	const char *class = sysevent_get_class_name(ev);
472 	const char *subclass = sysevent_get_subclass_name(ev);
473 	nvlist_t *nvl;
474 	int ret;
475 	boolean_t is_lofi, is_check;
476 
477 	if (strcmp(class, EC_DEV_ADD) == 0) {
478 		/*
479 		 * We're mainly interested in disk additions, but we also listen
480 		 * for new lofi devices, to allow for simplified testing.
481 		 */
482 		if (strcmp(subclass, ESC_DISK) == 0)
483 			is_lofi = B_FALSE;
484 		else if (strcmp(subclass, ESC_LOFI) == 0)
485 			is_lofi = B_TRUE;
486 		else
487 			return (0);
488 
489 		is_check = B_FALSE;
490 	} else if (strcmp(class, EC_ZFS) == 0 &&
491 	    strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) {
492 		/*
493 		 * This event signifies that a device failed to open during pool
494 		 * load, but the 'autoreplace' property was set, so we should
495 		 * pretend it's just been added.
496 		 */
497 		is_check = B_TRUE;
498 	} else {
499 		return (0);
500 	}
501 
502 	if (sysevent_get_attr_list(ev, &nvl) != 0)
503 		return (-1);
504 
505 	if (is_check)
506 		ret = zfs_deliver_check(nvl);
507 	else
508 		ret = zfs_deliver_add(nvl, is_lofi);
509 
510 
511 	nvlist_free(nvl);
512 	return (ret);
513 }
514 
515 static struct slm_mod_ops zfs_mod_ops = {
516 	SE_MAJOR_VERSION, SE_MINOR_VERSION, 10, zfs_deliver_event
517 };
518 
519 struct slm_mod_ops *
520 slm_init()
521 {
522 	if ((g_zfshdl = libzfs_init()) == NULL)
523 		return (NULL);
524 
525 	return (&zfs_mod_ops);
526 }
527 
528 void
529 slm_fini()
530 {
531 }
532