// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright 2015 RackTop Systems.
 * Copyright (c) 2016, Intel Corporation.
 */

/*
 * Pool import support functions.
 *
 * Used by zpool, ztest, zdb, and zhack to locate importable configs. Since
 * these commands are expected to run in the global zone, we can assume
 * that the devices are all readable when called.
 *
 * To import a pool, we rely on reading the configuration information from the
 * ZFS label of each device.  If we successfully read the label, then we
 * organize the configuration information in the following hierarchy:
 *
 *	pool guid -> toplevel vdev guid -> label txg
 *
 * Duplicate entries matching this same tuple will be discarded.  Once we have
 * examined every device, we pick the best label txg config for each toplevel
 * vdev.  We then arrange these toplevel vdevs into a complete pool config, and
 * update any paths that have changed.  Finally, we attempt to import the pool
 * using our derived config, and record the results.
 */
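/*
 * For example (values illustrative, not from any real pool): if three
 * devices carry labels for the tuple (pool guid 0x1, toplevel vdev guid
 * 0x2) with label txgs 95, 100, and 100, one of the duplicate txg-100
 * entries is discarded, and the txg-100 config is preferred over the
 * stale txg-95 config for that toplevel vdev.
 */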

#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <libintl.h>
#include <libgen.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/dktp/fdisk.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>

#include <thread_pool.h>
#include <libzutil.h>
#include <libnvpair.h>
#include <libzfs.h>

#include "zutil_import.h"

#ifdef HAVE_LIBUDEV
#include <libudev.h>
#include <sched.h>
#endif
#include <blkid/blkid.h>

#define	DEV_BYID_PATH	"/dev/disk/by-id/"

/*
 * Skip devices with well known prefixes; opening them can have side
 * effects that must be avoided.
 *
 * hpet        - High Precision Event Timer
 * watchdog[N] - Watchdog must be closed in a special way.
 */
static boolean_t
should_skip_dev(const char *dev)
{
	return ((strcmp(dev, "watchdog") == 0) ||
	    (strncmp(dev, "watchdog", 8) == 0 && isdigit(dev[8])) ||
	    (strcmp(dev, "hpet") == 0));
}

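/*
 * Flush and invalidate the block device's buffer cache (BLKFLSBUF) so
 * that subsequent label reads observe the on-disk state.
 */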
int
zfs_dev_flush(int fd)
{
	return (ioctl(fd, BLKFLSBUF));
}

void
zpool_open_func(void *arg)
{
	rdsk_node_t *rn = arg;
	libpc_handle_t *hdl = rn->rn_hdl;
	struct stat64 statbuf;
	nvlist_t *config;
	uint64_t vdev_guid = 0;
	int error;
	int num_labels = 0;
	int fd;

	if (should_skip_dev(zfs_basename(rn->rn_name)))
		return;

	/*
	 * Ignore failed stats.  We only want regular files and block devices.
	 * Ignore files that are too small to hold a zpool.
	 */
	if (stat64(rn->rn_name, &statbuf) != 0 ||
	    (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)) ||
	    (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE))
		return;
	/*
	 * Preferentially open using O_DIRECT to bypass the block device
	 * cache, which may be stale for multipath devices.  An EINVAL errno
	 * indicates O_DIRECT is unsupported, so fall back to plain O_RDONLY.
	 */
	fd = open(rn->rn_name, O_RDONLY | O_DIRECT | O_CLOEXEC);
	if ((fd < 0) && (errno == EINVAL))
		fd = open(rn->rn_name, O_RDONLY | O_CLOEXEC);
	if ((fd < 0) && (errno == EACCES))
		hdl->lpc_open_access_error = B_TRUE;
	if (fd < 0)
		return;

	error = zpool_read_label(fd, &config, &num_labels);
	if (error != 0) {
		(void) close(fd);
		return;
	}

	if (num_labels == 0) {
		(void) close(fd);
		nvlist_free(config);
		return;
	}

	/*
	 * Check that the vdev is for the expected guid.  Additional entries
	 * are speculatively added based on the paths stored in the labels.
	 * Entries with valid paths but incorrect guids must be removed.
	 */
	error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
	if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) {
		(void) close(fd);
		nvlist_free(config);
		return;
	}

	(void) close(fd);

	rn->rn_config = config;
	rn->rn_num_labels = num_labels;

	/*
	 * Add additional entries for paths described by this label.
	 */
	if (rn->rn_labelpaths) {
		const char *path = NULL;
		const char *devid = NULL;
		rdsk_node_t *slice;
		avl_index_t where;
		int error;

		if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid))
			return;

		/*
		 * Allow devlinks to stabilize so all paths are available.
		 */
		zpool_disk_wait(rn->rn_name);

		if (path != NULL) {
			slice = zutil_alloc(hdl, sizeof (rdsk_node_t));
			slice->rn_name = zutil_strdup(hdl, path);
			slice->rn_vdev_guid = vdev_guid;
			slice->rn_avl = rn->rn_avl;
			slice->rn_hdl = hdl;
			slice->rn_order = IMPORT_ORDER_PREFERRED_1;
			slice->rn_labelpaths = B_FALSE;
			pthread_mutex_lock(rn->rn_lock);
			if (avl_find(rn->rn_avl, slice, &where)) {
				pthread_mutex_unlock(rn->rn_lock);
				free(slice->rn_name);
				free(slice);
			} else {
				avl_insert(rn->rn_avl, slice, where);
				pthread_mutex_unlock(rn->rn_lock);
				zpool_open_func(slice);
			}
		}

		if (devid != NULL) {
			slice = zutil_alloc(hdl, sizeof (rdsk_node_t));
			error = asprintf(&slice->rn_name, "%s%s",
			    DEV_BYID_PATH, devid);
			if (error == -1) {
				free(slice);
				return;
			}

			slice->rn_vdev_guid = vdev_guid;
			slice->rn_avl = rn->rn_avl;
			slice->rn_hdl = hdl;
			slice->rn_order = IMPORT_ORDER_PREFERRED_2;
			slice->rn_labelpaths = B_FALSE;
			pthread_mutex_lock(rn->rn_lock);
			if (avl_find(rn->rn_avl, slice, &where)) {
				pthread_mutex_unlock(rn->rn_lock);
				free(slice->rn_name);
				free(slice);
			} else {
				avl_insert(rn->rn_avl, slice, where);
				pthread_mutex_unlock(rn->rn_lock);
				zpool_open_func(slice);
			}
		}
	}
}

static const char * const
zpool_default_import_path[] = {
	"/dev/disk/by-vdev",	/* Custom rules, use first if they exist */
	"/dev/mapper",		/* Use multipath devices before components */
	"/dev/disk/by-partlabel", /* Single unique entry set by user */
	"/dev/disk/by-partuuid", /* Generated partition uuid */
	"/dev/disk/by-label",	/* Custom persistent labels */
	"/dev/disk/by-uuid",	/* Single unique entry and persistent */
	"/dev/disk/by-id",	/* May be multiple entries and persistent */
	"/dev/disk/by-path",	/* Encodes physical location and persistent */
	"/dev"			/* UNSAFE: device names will change */
};

const char * const *
zpool_default_search_paths(size_t *count)
{
	*count = ARRAY_SIZE(zpool_default_import_path);
	return (zpool_default_import_path);
}

/*
 * Given a full path to a device, determine whether that device appears in
 * the import search path.  If it does, return the first match and store
 * the index in the passed 'order' variable; otherwise return an error.
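 *
 * For example (illustrative): with
 * ZPOOL_IMPORT_PATH="/dev/disk/by-vdev:/dev/mapper", a name under
 * "/dev/mapper" is assigned order 1.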
 */
static int
zfs_path_order(const char *name, int *order)
{
	const char *env = getenv("ZPOOL_IMPORT_PATH");

	if (env) {
		for (int i = 0; ; ++i) {
			env += strspn(env, ":");
			size_t dirlen = strcspn(env, ":");
			if (dirlen) {
				if (strncmp(name, env, dirlen) == 0) {
					*order = i;
					return (0);
				}

				env += dirlen;
			} else
				break;
		}
	} else {
		for (int i = 0; i < ARRAY_SIZE(zpool_default_import_path);
		    ++i) {
			if (strncmp(name, zpool_default_import_path[i],
			    strlen(zpool_default_import_path[i])) == 0) {
				*order = i;
				return (0);
			}
		}
	}

	return (ENOENT);
}

/*
 * Use libblkid to quickly enumerate all known zfs devices.
 */
int
zpool_find_import_blkid(libpc_handle_t *hdl, pthread_mutex_t *lock,
    avl_tree_t **slice_cache)
{
	rdsk_node_t *slice;
	blkid_cache cache;
	blkid_dev_iterate iter;
	blkid_dev dev;
	avl_index_t where;
	int error;

	*slice_cache = NULL;

	error = blkid_get_cache(&cache, NULL);
	if (error != 0)
		return (error);

	error = blkid_probe_all_new(cache);
	if (error != 0) {
		blkid_put_cache(cache);
		return (error);
	}

	iter = blkid_dev_iterate_begin(cache);
	if (iter == NULL) {
		blkid_put_cache(cache);
		return (EINVAL);
	}

	/*
	 * blkid_dev_set_search() only takes const char * arguments since
	 * util-linux 2.32, so cast away const for older versions.
	 */
	error = blkid_dev_set_search(iter,
	    (char *)"TYPE", (char *)"zfs_member");
	if (error != 0) {
		blkid_dev_iterate_end(iter);
		blkid_put_cache(cache);
		return (error);
	}

	*slice_cache = zutil_alloc(hdl, sizeof (avl_tree_t));
	avl_create(*slice_cache, slice_cache_compare, sizeof (rdsk_node_t),
	    offsetof(rdsk_node_t, rn_node));

	while (blkid_dev_next(iter, &dev) == 0) {
		slice = zutil_alloc(hdl, sizeof (rdsk_node_t));
		slice->rn_name = zutil_strdup(hdl, blkid_dev_devname(dev));
		slice->rn_vdev_guid = 0;
		slice->rn_lock = lock;
		slice->rn_avl = *slice_cache;
		slice->rn_hdl = hdl;
		slice->rn_labelpaths = B_TRUE;

		error = zfs_path_order(slice->rn_name, &slice->rn_order);
		if (error == 0)
			slice->rn_order += IMPORT_ORDER_SCAN_OFFSET;
		else
			slice->rn_order = IMPORT_ORDER_DEFAULT;

		pthread_mutex_lock(lock);
		if (avl_find(*slice_cache, slice, &where)) {
			free(slice->rn_name);
			free(slice);
		} else {
			avl_insert(*slice_cache, slice, where);
		}
		pthread_mutex_unlock(lock);
	}

	blkid_dev_iterate_end(iter);
	blkid_put_cache(cache);

	return (0);
}

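/*
 * Illustrative sketch (simplified and serial; assumes an initialized
 * libpc_handle_t 'hdl'): the import code walks the returned AVL tree and
 * reads the label from each candidate device.
 *
 *	pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
 *	avl_tree_t *cache;
 *	if (zpool_find_import_blkid(hdl, &lock, &cache) == 0) {
 *		for (rdsk_node_t *rn = avl_first(cache); rn != NULL;
 *		    rn = AVL_NEXT(cache, rn))
 *			zpool_open_func(rn);
 *	}
 */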
/*
 * Linux persistent device strings for vdev labels
 *
 * based on libudev for consistency with libudev disk add/remove events
 */

typedef struct vdev_dev_strs {
	char	vds_devid[128];
	char	vds_devphys[128];
} vdev_dev_strs_t;

#ifdef HAVE_LIBUDEV

/*
 * Obtain the persistent device id string (describes what)
 *
 * used by ZED vdev matching for auto-{online,expand,replace}
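 *
 * Example result (illustrative): "scsi-MG03SCA300_350000494a8cb3d67" for
 * a SAS disk, or "dm-uuid-mpath-35000c5006304de3f" for a multipath node.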
 */
int
zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
{
	struct udev_list_entry *entry;
	const char *bus;
	char devbyid[MAXPATHLEN];

	/* The bus based by-id path is preferred */
	bus = udev_device_get_property_value(dev, "ID_BUS");

	if (bus == NULL) {
		const char *dm_uuid;

		/*
		 * For multipath nodes use the persistent uuid based identifier
		 *
		 * Example: /dev/disk/by-id/dm-uuid-mpath-35000c5006304de3f
		 */
		dm_uuid = udev_device_get_property_value(dev, "DM_UUID");
		if (dm_uuid != NULL) {
			(void) snprintf(bufptr, buflen, "dm-uuid-%s", dm_uuid);
			return (0);
		}

		/*
		 * For volumes use the persistent /dev/zvol/dataset identifier
		 */
		entry = udev_device_get_devlinks_list_entry(dev);
		while (entry != NULL) {
			const char *name;

			name = udev_list_entry_get_name(entry);
			if (strncmp(name, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
				(void) strlcpy(bufptr, name, buflen);
				return (0);
			}
			entry = udev_list_entry_get_next(entry);
		}

		/*
		 * NVMe 'by-id' symlinks are similar to the bus case
		 */
		struct udev_device *parent;

		parent = udev_device_get_parent_with_subsystem_devtype(dev,
		    "nvme", NULL);
		if (parent != NULL)
			bus = "nvme";	/* continue with bus symlink search */
		else
			return (ENODATA);
	}

	/*
	 * locate the bus specific by-id link
	 */
	(void) snprintf(devbyid, sizeof (devbyid), "%s%s-", DEV_BYID_PATH, bus);
	entry = udev_device_get_devlinks_list_entry(dev);
	while (entry != NULL) {
		const char *name;

		name = udev_list_entry_get_name(entry);
		if (strncmp(name, devbyid, strlen(devbyid)) == 0) {
			name += strlen(DEV_BYID_PATH);
			(void) strlcpy(bufptr, name, buflen);
			return (0);
		}
		entry = udev_list_entry_get_next(entry);
	}

	return (ENODATA);
}

/*
 * Obtain the persistent physical location string (describes where)
 *
 * used by ZED vdev matching for auto-{online,expand,replace}
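 *
 * Example result (illustrative):
 * "pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0"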
 */
int
zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
{
	const char *physpath = NULL;
	struct udev_list_entry *entry;

	/*
	 * Normal disks use ID_PATH for their physical path.
	 */
	physpath = udev_device_get_property_value(dev, "ID_PATH");
	if (physpath != NULL && strlen(physpath) > 0) {
		(void) strlcpy(bufptr, physpath, buflen);
		return (0);
	}

	/*
	 * Device mapper devices are virtual and don't have a physical
	 * path.  For them we use ID_VDEV instead, which is set up via the
	 * /etc/vdev_id.conf file.  ID_VDEV provides a persistent path
	 * to a virtual device.  If you don't have vdev_id.conf set up,
	 * you cannot use multipath autoreplace with device mapper.
	 */
	physpath = udev_device_get_property_value(dev, "ID_VDEV");
	if (physpath != NULL && strlen(physpath) > 0) {
		(void) strlcpy(bufptr, physpath, buflen);
		return (0);
	}

	/*
	 * For ZFS volumes use the persistent /dev/zvol/dataset identifier
	 */
	entry = udev_device_get_devlinks_list_entry(dev);
	while (entry != NULL) {
		physpath = udev_list_entry_get_name(entry);
		if (strncmp(physpath, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
			(void) strlcpy(bufptr, physpath, buflen);
			return (0);
		}
		entry = udev_list_entry_get_next(entry);
	}

	/*
	 * For all other devices fall back to using the by-uuid name.
	 */
	entry = udev_device_get_devlinks_list_entry(dev);
	while (entry != NULL) {
		physpath = udev_list_entry_get_name(entry);
		if (strncmp(physpath, "/dev/disk/by-uuid", 17) == 0) {
			(void) strlcpy(bufptr, physpath, buflen);
			return (0);
		}
		entry = udev_list_entry_get_next(entry);
	}

	return (ENODATA);
}

/*
 * A disk is considered a multipath whole disk when:
 *	DEVNAME key value has "dm-"
 *	DM_UUID key exists
 *	ID_PART_TABLE_TYPE key does not exist or is not gpt
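 *
 * Example (illustrative): DEVNAME=/dev/dm-0 with DM_UUID set and no
 * ID_PART_TABLE_TYPE property is treated as a multipath whole disk.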
 */
static boolean_t
udev_mpath_whole_disk(struct udev_device *dev)
{
	const char *devname, *type, *uuid;

	devname = udev_device_get_property_value(dev, "DEVNAME");
	type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
	uuid = udev_device_get_property_value(dev, "DM_UUID");

	if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) &&
	    ((type == NULL) || (strcmp(type, "gpt") != 0)) &&
	    (uuid != NULL)) {
		return (B_TRUE);
	}

	return (B_FALSE);
}

static int
udev_device_is_ready(struct udev_device *dev)
{
#ifdef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED
	return (udev_device_get_is_initialized(dev));
#else
	/* wait for DEVLINKS property to be initialized */
	return (udev_device_get_property_value(dev, "DEVLINKS") != NULL);
#endif
}

#else

int
zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
{
	(void) dev, (void) bufptr, (void) buflen;
	return (ENODATA);
}

int
zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
{
	(void) dev, (void) bufptr, (void) buflen;
	return (ENODATA);
}

#endif /* HAVE_LIBUDEV */

/*
 * Wait up to timeout_ms for udev to set up the device node.  The device is
 * considered ready when libudev determines it has been initialized, all of
 * the device links have been verified to exist, and it has been allowed to
 * settle.  At this point the device can be accessed reliably.  Depending on
 * the complexity of the udev rules, this process could take several seconds.
 */
int
zpool_label_disk_wait(const char *path, int timeout_ms)
{
#ifdef HAVE_LIBUDEV
	struct udev *udev;
	struct udev_device *dev = NULL;
	char nodepath[MAXPATHLEN];
	char *sysname = NULL;
	int ret = ENODEV;
	int settle_ms = 50;
	long sleep_ms = 10;
	hrtime_t start, settle;

	if ((udev = udev_new()) == NULL)
		return (ENXIO);

	start = gethrtime();
	settle = 0;

	do {
		if (sysname == NULL) {
			if (realpath(path, nodepath) != NULL) {
				sysname = strrchr(nodepath, '/') + 1;
			} else {
				(void) usleep(sleep_ms * MILLISEC);
				continue;
			}
		}

		dev = udev_device_new_from_subsystem_sysname(udev,
		    "block", sysname);
		if ((dev != NULL) && udev_device_is_ready(dev)) {
			struct udev_list_entry *links, *link = NULL;

			ret = 0;
			links = udev_device_get_devlinks_list_entry(dev);

			udev_list_entry_foreach(link, links) {
				struct stat64 statbuf;
				const char *name;

				name = udev_list_entry_get_name(link);
				errno = 0;
				if (stat64(name, &statbuf) == 0 && errno == 0)
					continue;

				settle = 0;
				ret = ENODEV;
				break;
			}

			if (ret == 0) {
				if (settle == 0) {
					settle = gethrtime();
				} else if (NSEC2MSEC(gethrtime() - settle) >=
				    settle_ms) {
					udev_device_unref(dev);
					break;
				}
			}
		}

		udev_device_unref(dev);
		(void) usleep(sleep_ms * MILLISEC);

	} while (NSEC2MSEC(gethrtime() - start) < timeout_ms);

	udev_unref(udev);

	return (ret);
#else
	int settle_ms = 50;
	long sleep_ms = 10;
	hrtime_t start, settle;
	struct stat64 statbuf;

	start = gethrtime();
	settle = 0;

	do {
		errno = 0;
		if ((stat64(path, &statbuf) == 0) && (errno == 0)) {
			if (settle == 0)
				settle = gethrtime();
			else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms)
				return (0);
		} else if (errno != ENOENT) {
			return (errno);
		}

		usleep(sleep_ms * MILLISEC);
	} while (NSEC2MSEC(gethrtime() - start) < timeout_ms);

	return (ENODEV);
#endif /* HAVE_LIBUDEV */
}

/*
 * Simplified version of zpool_label_disk_wait() where we wait for a device
 * to appear using the default timeouts.
 */
int
zpool_disk_wait(const char *path)
{
	int timeout;
	timeout = zpool_getenv_int("ZPOOL_IMPORT_UDEV_TIMEOUT_MS",
	    DISK_LABEL_WAIT);

	return (zpool_label_disk_wait(path, timeout));
}

/*
 * Encode the persistent device strings
 * used for the vdev disk label
 */
static int
encode_device_strings(const char *path, vdev_dev_strs_t *ds,
    boolean_t wholedisk)
{
#ifdef HAVE_LIBUDEV
	struct udev *udev;
	struct udev_device *dev = NULL;
	char nodepath[MAXPATHLEN];
	char *sysname;
	int ret = ENODEV;
	hrtime_t start;

	if ((udev = udev_new()) == NULL)
		return (ENXIO);

	/* resolve path to a runtime device node instance */
	if (realpath(path, nodepath) == NULL)
		goto no_dev;

	sysname = strrchr(nodepath, '/') + 1;

	/*
	 * Wait up to 3 seconds for udev to set up the device node context
	 */
	start = gethrtime();
	do {
		dev = udev_device_new_from_subsystem_sysname(udev, "block",
		    sysname);
		if (dev == NULL)
			goto no_dev;
		if (udev_device_is_ready(dev))
			break;  /* udev ready */

		udev_device_unref(dev);
		dev = NULL;

		if (NSEC2MSEC(gethrtime() - start) < 10)
			(void) sched_yield();	/* yield/busy wait up to 10ms */
		else
			(void) usleep(10 * MILLISEC);

	} while (NSEC2MSEC(gethrtime() - start) < (3 * MILLISEC));

	if (dev == NULL)
		goto no_dev;

	/*
	 * Only whole disks require extra device strings
	 */
	if (!wholedisk && !udev_mpath_whole_disk(dev))
		goto no_dev;

	ret = zfs_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid));
	if (ret != 0)
		goto no_dev_ref;

	/* physical location string (optional) */
	if (zfs_device_get_physical(dev, ds->vds_devphys,
	    sizeof (ds->vds_devphys)) != 0) {
		ds->vds_devphys[0] = '\0'; /* empty string --> not available */
	}

no_dev_ref:
	udev_device_unref(dev);
no_dev:
	udev_unref(udev);

	return (ret);
#else
	(void) path;
	(void) ds;
	(void) wholedisk;
	return (ENOENT);
#endif
}

/*
 * Rescan the enclosure sysfs path for turning on enclosure LEDs and store it
 * in the nvlist (if applicable).  Like:
 *    vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4'
 *
 * If an old path was in the nvlist, and the rescan cannot find a new path,
 * then keep the old path, since the disk may have been removed.
 *
 * path: The vdev path (value from ZPOOL_CONFIG_PATH)
 * key: The nvlist_t name (like ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH)
 */
void
update_vdev_config_dev_sysfs_path(nvlist_t *nv, const char *path,
    const char *key)
{
	char *upath, *spath;
	const char *oldpath = NULL;

	(void) nvlist_lookup_string(nv, key, &oldpath);

	/* Add enclosure sysfs path (if disk is in an enclosure). */
	upath = zfs_get_underlying_path(path);
	spath = zfs_get_enclosure_sysfs_path(upath);

	if (spath) {
		(void) nvlist_add_string(nv, key, spath);
	} else {
		/*
		 * We couldn't dynamically scan the disk's enclosure sysfs path.
		 * This could be because the disk went away.  If there's an old
		 * enclosure sysfs path in the nvlist, then keep using it.
		 */
		if (!oldpath) {
			(void) nvlist_remove_all(nv, key);
		}
	}

	free(upath);
	free(spath);
}

/*
 * This will get called for each leaf vdev.
 */
static int
sysfs_path_pool_vdev_iter_f(void *hdl_data, nvlist_t *nv, void *data)
{
	(void) hdl_data, (void) data;

	const char *path = NULL;
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
		return (1);

	/* Rescan our enclosure sysfs path for this vdev */
	update_vdev_config_dev_sysfs_path(nv, path,
	    ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
	return (0);
}

/*
 * Given an nvlist for our pool (with vdev tree), iterate over all the
 * leaf vdevs and update their ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH.
 */
void
update_vdevs_config_dev_sysfs_path(nvlist_t *config)
{
	nvlist_t *nvroot = NULL;
	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	for_each_vdev_in_nvlist(nvroot, sysfs_path_pool_vdev_iter_f, NULL);
}

/*
 * Update a leaf vdev's persistent device strings
 *
 * - only applies for a dedicated leaf vdev (aka whole disk)
 * - updated during pool create|add|attach|import
 * - used for device matching during auto-{online,expand,replace}
 * - stored in a leaf disk config label (i.e. alongside 'path' NVP)
 * - these strings are currently not used in the kernel (i.e. for
 *   vdev_disk_open)
 *
 * single device node example:
 *	devid:		'scsi-MG03SCA300_350000494a8cb3d67-part1'
 *	phys_path:	'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0'
 *
 * multipath device node example:
 *	devid:		'dm-uuid-mpath-35000c5006304de3f'
 *
 * We also store the enclosure sysfs path for turning on enclosure LEDs
 * (if applicable):
 *	vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4'
 */
void
update_vdev_config_dev_strs(nvlist_t *nv)
{
	vdev_dev_strs_t vds;
	const char *env, *type, *path;
	uint64_t wholedisk = 0;

	/*
	 * For the benefit of legacy ZFS implementations, allow
	 * for opting out of devid strings in the vdev label.
	 *
	 * example use:
	 *	env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer
	 *
	 * explanation:
	 * Older OpenZFS implementations had issues when attempting to
	 * display pool config VDEV names if a "devid" NVP value is
	 * present in the pool's config.
	 *
	 * For example, a pool that originated on the illumos platform would
	 * have a devid value in the config and "zpool status" would fail
	 * when listing the config.
	 *
	 * A pool can be stripped of any "devid" values on import or
	 * prevented from adding them on zpool create|add by setting
	 * ZFS_VDEV_DEVID_OPT_OUT.
	 */
	env = getenv("ZFS_VDEV_DEVID_OPT_OUT");
	if (env && (strtoul(env, NULL, 0) > 0 ||
	    !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) {
		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
		return;
	}

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0 ||
	    strcmp(type, VDEV_TYPE_DISK) != 0) {
		return;
	}
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
		return;
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);

	/*
	 * Update device string values in the config nvlist.
	 */
	if (encode_device_strings(path, &vds, (boolean_t)wholedisk) == 0) {
		(void) nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vds.vds_devid);
		if (vds.vds_devphys[0] != '\0') {
			(void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
			    vds.vds_devphys);
		}
		update_vdev_config_dev_sysfs_path(nv, path,
		    ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
	} else {
		/* Clear out any stale entries. */
		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
	}
}
925