// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright 2015 RackTop Systems.
 * Copyright (c) 2016, Intel Corporation.
 */

/*
 * Pool import support functions.
 *
 * Used by zpool, ztest, zdb, and zhack to locate importable configs. Since
 * these commands are expected to run in the global zone, we can assume
 * that the devices are all readable when called.
 *
 * To import a pool, we rely on reading the configuration information from the
 * ZFS label of each device. If we successfully read the label, then we
 * organize the configuration information in the following hierarchy:
 *
 *	pool guid -> toplevel vdev guid -> label txg
 *
 * Duplicate entries matching this same tuple will be discarded. Once we have
 * examined every device, we pick the best label txg config for each toplevel
 * vdev. We then arrange these toplevel vdevs into a complete pool config, and
 * update any paths that have changed. Finally, we attempt to import the pool
 * using our derived config, and record the results.
 */

#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <libintl.h>
#include <libgen.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/dktp/fdisk.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>

#include <thread_pool.h>
#include <libzutil.h>
#include <libnvpair.h>
#include <libzfs.h>

#include "zutil_import.h"

#ifdef HAVE_LIBUDEV
#include <libudev.h>
#include <sched.h>
#endif
#include <blkid/blkid.h>

#define	DEV_BYID_PATH	"/dev/disk/by-id/"

/*
 * Skip devices with well-known prefixes; there can be side effects when
 * opening these devices which need to be avoided.
 *
 * hpet        - High Precision Event Timer
 * watchdog[N] - Watchdogs must be closed in a special way.
 */
static boolean_t
should_skip_dev(const char *dev)
{
	return ((strcmp(dev, "watchdog") == 0) ||
	    (strncmp(dev, "watchdog", 8) == 0 && isdigit(dev[8])) ||
	    (strcmp(dev, "hpet") == 0));
}

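/*
 * Flush and invalidate the Linux block device cache for the open block
 * device descriptor 'fd' via the BLKFLSBUF ioctl.
 */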
int
zfs_dev_flush(int fd)
{
	return (ioctl(fd, BLKFLSBUF));
}

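/*
 * Worker function, typically dispatched on a thread pool with one rdsk_node_t
 * per candidate device. Reads the ZFS label from the device and, when a
 * valid label matching the expected vdev guid is found, attaches the config
 * nvlist to the node for later assembly into a pool config.
 */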
void
zpool_open_func(void *arg)
{
	rdsk_node_t *rn = arg;
	libpc_handle_t *hdl = rn->rn_hdl;
	struct stat64 statbuf;
	nvlist_t *config;
	uint64_t vdev_guid = 0;
	int error;
	int num_labels = 0;
	int fd;

	if (should_skip_dev(zfs_basename(rn->rn_name)))
		return;

	/*
	 * Ignore failed stats. We only want regular files and block devices.
	 * Ignore files that are too small to hold a zpool.
	 */
	if (stat64(rn->rn_name, &statbuf) != 0 ||
	    (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)) ||
	    (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE))
		return;

	/*
	 * Preferentially open using O_DIRECT to bypass the block device
	 * cache, which may be stale for multipath devices. An EINVAL errno
	 * indicates O_DIRECT is unsupported, so fall back to just O_RDONLY.
	 */
	fd = open(rn->rn_name, O_RDONLY | O_DIRECT | O_CLOEXEC);
	if ((fd < 0) && (errno == EINVAL))
		fd = open(rn->rn_name, O_RDONLY | O_CLOEXEC);
	if ((fd < 0) && (errno == EACCES))
		hdl->lpc_open_access_error = B_TRUE;
	if (fd < 0)
		return;

	error = zpool_read_label(fd, &config, &num_labels);
	if (error != 0) {
		(void) close(fd);
		return;
	}

	if (num_labels == 0) {
		(void) close(fd);
		nvlist_free(config);
		return;
	}

	/*
	 * Check that the vdev is for the expected guid. Additional entries
	 * are speculatively added based on the paths stored in the labels.
	 * Entries with valid paths but incorrect guids must be removed.
	 */
	error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
	if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) {
		(void) close(fd);
		nvlist_free(config);
		return;
	}

	(void) close(fd);

	rn->rn_config = config;
	rn->rn_num_labels = num_labels;

	/*
	 * Add additional entries for paths described by this label.
	 */
	if (rn->rn_labelpaths) {
		const char *path = NULL;
		const char *devid = NULL;
		rdsk_node_t *slice;
		avl_index_t where;
		int error;

		if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid))
			return;

		/*
		 * Allow devlinks to stabilize so all paths are available.
		 */
		zpool_disk_wait(rn->rn_name);

		if (path != NULL) {
			slice = zutil_alloc(hdl, sizeof (rdsk_node_t));
			slice->rn_name = zutil_strdup(hdl, path);
			slice->rn_vdev_guid = vdev_guid;
			slice->rn_avl = rn->rn_avl;
			slice->rn_hdl = hdl;
			slice->rn_order = IMPORT_ORDER_PREFERRED_1;
			slice->rn_labelpaths = B_FALSE;
			pthread_mutex_lock(rn->rn_lock);
			if (avl_find(rn->rn_avl, slice, &where)) {
				pthread_mutex_unlock(rn->rn_lock);
				free(slice->rn_name);
				free(slice);
			} else {
				avl_insert(rn->rn_avl, slice, where);
				pthread_mutex_unlock(rn->rn_lock);
				zpool_open_func(slice);
			}
		}

		if (devid != NULL) {
			slice = zutil_alloc(hdl, sizeof (rdsk_node_t));
			error = asprintf(&slice->rn_name, "%s%s",
			    DEV_BYID_PATH, devid);
			if (error == -1) {
				free(slice);
				return;
			}

			slice->rn_vdev_guid = vdev_guid;
			slice->rn_avl = rn->rn_avl;
			slice->rn_hdl = hdl;
			slice->rn_order = IMPORT_ORDER_PREFERRED_2;
			slice->rn_labelpaths = B_FALSE;
			pthread_mutex_lock(rn->rn_lock);
			if (avl_find(rn->rn_avl, slice, &where)) {
				pthread_mutex_unlock(rn->rn_lock);
				free(slice->rn_name);
				free(slice);
			} else {
				avl_insert(rn->rn_avl, slice, where);
				pthread_mutex_unlock(rn->rn_lock);
				zpool_open_func(slice);
			}
		}
	}
}

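/*
 * Default directories searched for device names, ordered from most to
 * least preferred; when the same device is found under several of these
 * paths, the entry from the earlier (lower-indexed) directory wins.
 */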
static const char * const
zpool_default_import_path[] = {
	"/dev/disk/by-vdev",	/* Custom rules, use first if they exist */
	"/dev/mapper",		/* Use multipath devices before components */
	"/dev/disk/by-partlabel", /* Single unique entry set by user */
	"/dev/disk/by-partuuid", /* Generated partition uuid */
	"/dev/disk/by-label",	/* Custom persistent labels */
	"/dev/disk/by-uuid",	/* Single unique entry and persistent */
	"/dev/disk/by-id",	/* May be multiple entries and persistent */
	"/dev/disk/by-path",	/* Encodes physical location and persistent */
	"/dev"			/* UNSAFE device names will change */
};

const char * const *
zpool_default_search_paths(size_t *count)
{
	*count = ARRAY_SIZE(zpool_default_import_path);
	return (zpool_default_import_path);
}

/*
 * Given a full path to a device, determine whether that device appears in
 * the import search path. If it does, store the index of the first match
 * in the passed 'order' variable and return 0; otherwise return an error.
 */
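/*
 * The search path may be overridden with the ZPOOL_IMPORT_PATH environment
 * variable, a colon-separated list of directories, e.g.:
 *
 *	ZPOOL_IMPORT_PATH="/dev/disk/by-vdev:/dev/mapper" zpool import
 */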
static int
zfs_path_order(const char *name, int *order)
{
	const char *env = getenv("ZPOOL_IMPORT_PATH");

	if (env) {
		for (int i = 0; ; ++i) {
			env += strspn(env, ":");
			size_t dirlen = strcspn(env, ":");
			if (dirlen) {
				if (strncmp(name, env, dirlen) == 0) {
					*order = i;
					return (0);
				}

				env += dirlen;
			} else
				break;
		}
	} else {
		for (int i = 0; i < ARRAY_SIZE(zpool_default_import_path);
		    ++i) {
			if (strncmp(name, zpool_default_import_path[i],
			    strlen(zpool_default_import_path[i])) == 0) {
				*order = i;
				return (0);
			}
		}
	}

	return (ENOENT);
}

/*
 * Use libblkid to quickly enumerate all known zfs devices.
 */
int
zpool_find_import_blkid(libpc_handle_t *hdl, pthread_mutex_t *lock,
    avl_tree_t **slice_cache)
{
	rdsk_node_t *slice;
	blkid_cache cache;
	blkid_dev_iterate iter;
	blkid_dev dev;
	avl_index_t where;
	int error;

	*slice_cache = NULL;

	error = blkid_get_cache(&cache, NULL);
	if (error != 0)
		return (error);

	error = blkid_probe_all_new(cache);
	if (error != 0) {
		blkid_put_cache(cache);
		return (error);
	}

	iter = blkid_dev_iterate_begin(cache);
	if (iter == NULL) {
		blkid_put_cache(cache);
		return (EINVAL);
	}

	/* Only const char *s since 2.32 */
	error = blkid_dev_set_search(iter,
	    (char *)"TYPE", (char *)"zfs_member");
	if (error != 0) {
		blkid_dev_iterate_end(iter);
		blkid_put_cache(cache);
		return (error);
	}

	*slice_cache = zutil_alloc(hdl, sizeof (avl_tree_t));
	avl_create(*slice_cache, slice_cache_compare, sizeof (rdsk_node_t),
	    offsetof(rdsk_node_t, rn_node));

	while (blkid_dev_next(iter, &dev) == 0) {
		slice = zutil_alloc(hdl, sizeof (rdsk_node_t));
		slice->rn_name = zutil_strdup(hdl, blkid_dev_devname(dev));
		slice->rn_vdev_guid = 0;
		slice->rn_lock = lock;
		slice->rn_avl = *slice_cache;
		slice->rn_hdl = hdl;
		slice->rn_labelpaths = B_TRUE;

		error = zfs_path_order(slice->rn_name, &slice->rn_order);
		if (error == 0)
			slice->rn_order += IMPORT_ORDER_SCAN_OFFSET;
		else
			slice->rn_order = IMPORT_ORDER_DEFAULT;

		pthread_mutex_lock(lock);
		if (avl_find(*slice_cache, slice, &where)) {
			free(slice->rn_name);
			free(slice);
		} else {
			avl_insert(*slice_cache, slice, where);
		}
		pthread_mutex_unlock(lock);
	}

	blkid_dev_iterate_end(iter);
	blkid_put_cache(cache);

	return (0);
}

/*
 * Linux persistent device strings for vdev labels
 *
 * based on libudev for consistency with libudev disk add/remove events
 */

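/*
 * vds_devid holds the persistent device id ("what" the device is) and
 * vds_devphys the persistent physical location ("where" it is attached).
 */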
typedef struct vdev_dev_strs {
	char vds_devid[128];
	char vds_devphys[128];
} vdev_dev_strs_t;

#ifdef HAVE_LIBUDEV

/*
 * Obtain the persistent device id string (describes what)
 *
 * used by ZED vdev matching for auto-{online,expand,replace}
 */
int
zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
{
	struct udev_list_entry *entry;
	const char *bus;
	char devbyid[MAXPATHLEN];

	/* The bus-based by-id path is preferred */
	bus = udev_device_get_property_value(dev, "ID_BUS");

	if (bus == NULL) {
		const char *dm_uuid;

		/*
		 * For multipath nodes use the persistent uuid based identifier
		 *
		 * Example: /dev/disk/by-id/dm-uuid-mpath-35000c5006304de3f
		 */
		dm_uuid = udev_device_get_property_value(dev, "DM_UUID");
		if (dm_uuid != NULL) {
			(void) snprintf(bufptr, buflen, "dm-uuid-%s", dm_uuid);
			return (0);
		}

		/*
		 * For volumes use the persistent /dev/zvol/dataset identifier
		 */
		entry = udev_device_get_devlinks_list_entry(dev);
		while (entry != NULL) {
			const char *name;

			name = udev_list_entry_get_name(entry);
			if (strncmp(name, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
				(void) strlcpy(bufptr, name, buflen);
				return (0);
			}
			entry = udev_list_entry_get_next(entry);
		}

		/*
		 * NVMe 'by-id' symlinks are similar to the bus case
		 */
		struct udev_device *parent;

		parent = udev_device_get_parent_with_subsystem_devtype(dev,
		    "nvme", NULL);
		if (parent != NULL)
			bus = "nvme";	/* continue with bus symlink search */
		else
			return (ENODATA);
	}

	/*
	 * Locate the bus-specific by-id link
	 */
	(void) snprintf(devbyid, sizeof (devbyid), "%s%s-", DEV_BYID_PATH, bus);
	entry = udev_device_get_devlinks_list_entry(dev);
	while (entry != NULL) {
		const char *name;

		name = udev_list_entry_get_name(entry);
		if (strncmp(name, devbyid, strlen(devbyid)) == 0) {
			name += strlen(DEV_BYID_PATH);
			(void) strlcpy(bufptr, name, buflen);
			return (0);
		}
		entry = udev_list_entry_get_next(entry);
	}

	return (ENODATA);
}

/*
 * Obtain the persistent physical location string (describes where)
 *
 * used by ZED vdev matching for auto-{online,expand,replace}
 */
int
zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
{
	const char *physpath = NULL;
	struct udev_list_entry *entry;

	/*
	 * Normal disks use ID_PATH for their physical path.
	 */
	physpath = udev_device_get_property_value(dev, "ID_PATH");
	if (physpath != NULL && strlen(physpath) > 0) {
		(void) strlcpy(bufptr, physpath, buflen);
		return (0);
	}

	/*
	 * Device mapper devices are virtual and don't have a physical
	 * path. For them we use ID_VDEV instead, which is set up via the
	 * /etc/vdev_id.conf file. ID_VDEV provides a persistent path
	 * to a virtual device. If you don't have vdev_id.conf set up,
	 * you cannot use multipath autoreplace with device mapper.
	 */
	physpath = udev_device_get_property_value(dev, "ID_VDEV");
	if (physpath != NULL && strlen(physpath) > 0) {
		(void) strlcpy(bufptr, physpath, buflen);
		return (0);
	}

	/*
	 * For ZFS volumes use the persistent /dev/zvol/dataset identifier
	 */
	entry = udev_device_get_devlinks_list_entry(dev);
	while (entry != NULL) {
		physpath = udev_list_entry_get_name(entry);
		if (strncmp(physpath, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
			(void) strlcpy(bufptr, physpath, buflen);
			return (0);
		}
		entry = udev_list_entry_get_next(entry);
	}

	/*
	 * For all other devices fall back to using the by-uuid name.
	 */
	entry = udev_device_get_devlinks_list_entry(dev);
	while (entry != NULL) {
		physpath = udev_list_entry_get_name(entry);
		if (strncmp(physpath, "/dev/disk/by-uuid", 17) == 0) {
			(void) strlcpy(bufptr, physpath, buflen);
			return (0);
		}
		entry = udev_list_entry_get_next(entry);
	}

	return (ENODATA);
}

/*
 * A disk is considered a multipath whole disk when:
 *	DEVNAME key value has "dm-"
 *	DM_NAME key value has "mpath" prefix
 *	DM_UUID key exists
 *	ID_PART_TABLE_TYPE key does not exist or is not gpt
 */
static boolean_t
udev_mpath_whole_disk(struct udev_device *dev)
{
	const char *devname, *type, *uuid;

	devname = udev_device_get_property_value(dev, "DEVNAME");
	type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
	uuid = udev_device_get_property_value(dev, "DM_UUID");

	if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) &&
	    ((type == NULL) || (strcmp(type, "gpt") != 0)) &&
	    (uuid != NULL)) {
		return (B_TRUE);
	}

	return (B_FALSE);
}

static int
udev_device_is_ready(struct udev_device *dev)
{
#ifdef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED
	return (udev_device_get_is_initialized(dev));
#else
	/* wait for DEVLINKS property to be initialized */
	return (udev_device_get_property_value(dev, "DEVLINKS") != NULL);
#endif
}

#else

int
zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
{
	(void) dev, (void) bufptr, (void) buflen;
	return (ENODATA);
}

int
zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
{
	(void) dev, (void) bufptr, (void) buflen;
	return (ENODATA);
}

#endif /* HAVE_LIBUDEV */

/*
 * Wait up to timeout_ms for udev to set up the device node. The device is
 * considered ready when libudev determines it has been initialized, all of
 * the device links have been verified to exist, and it has been allowed to
 * settle. At this point the device can be accessed reliably. Depending on
 * the complexity of the udev rules this process could take several seconds.
 */
int
zpool_label_disk_wait(const char *path, int timeout_ms)
{
#ifdef HAVE_LIBUDEV
	struct udev *udev;
	struct udev_device *dev = NULL;
	char nodepath[MAXPATHLEN];
	char *sysname = NULL;
	int ret = ENODEV;
	int settle_ms = 50;
	long sleep_ms = 10;
	hrtime_t start, settle;

	if ((udev = udev_new()) == NULL)
		return (ENXIO);

	start = gethrtime();
	settle = 0;

	do {
		if (sysname == NULL) {
			if (realpath(path, nodepath) != NULL) {
				sysname = strrchr(nodepath, '/') + 1;
			} else {
				(void) usleep(sleep_ms * MILLISEC);
				continue;
			}
		}

		dev = udev_device_new_from_subsystem_sysname(udev,
		    "block", sysname);
		if ((dev != NULL) && udev_device_is_ready(dev)) {
			struct udev_list_entry *links, *link = NULL;

			ret = 0;
			links = udev_device_get_devlinks_list_entry(dev);

			udev_list_entry_foreach(link, links) {
				struct stat64 statbuf;
				const char *name;

				name = udev_list_entry_get_name(link);
				errno = 0;
				if (stat64(name, &statbuf) == 0 && errno == 0)
					continue;

				settle = 0;
				ret = ENODEV;
				break;
			}

			if (ret == 0) {
				if (settle == 0) {
					settle = gethrtime();
				} else if (NSEC2MSEC(gethrtime() - settle) >=
				    settle_ms) {
					udev_device_unref(dev);
					break;
				}
			}
		}

		udev_device_unref(dev);
		(void) usleep(sleep_ms * MILLISEC);

	} while (NSEC2MSEC(gethrtime() - start) < timeout_ms);

	udev_unref(udev);

	return (ret);
#else
	int settle_ms = 50;
	long sleep_ms = 10;
	hrtime_t start, settle;
	struct stat64 statbuf;

	start = gethrtime();
	settle = 0;

	do {
		errno = 0;
		if ((stat64(path, &statbuf) == 0) && (errno == 0)) {
			if (settle == 0)
				settle = gethrtime();
			else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms)
				return (0);
		} else if (errno != ENOENT) {
			return (errno);
		}

		usleep(sleep_ms * MILLISEC);
	} while (NSEC2MSEC(gethrtime() - start) < timeout_ms);

	return (ENODEV);
#endif /* HAVE_LIBUDEV */
}

/*
 * Simplified version of zpool_label_disk_wait() where we wait for a device
 * to appear using the default timeouts.
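 * The default timeout may be overridden with the ZPOOL_IMPORT_UDEV_TIMEOUT_MS
 * environment variable (in milliseconds).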
 */
int
zpool_disk_wait(const char *path)
{
	int timeout;
	timeout = zpool_getenv_int("ZPOOL_IMPORT_UDEV_TIMEOUT_MS",
	    DISK_LABEL_WAIT);

	return (zpool_label_disk_wait(path, timeout));
}

/*
 * Encode the persistent device strings
 * used for the vdev disk label
 */
static int
encode_device_strings(const char *path, vdev_dev_strs_t *ds,
    boolean_t wholedisk)
{
#ifdef HAVE_LIBUDEV
	struct udev *udev;
	struct udev_device *dev = NULL;
	char nodepath[MAXPATHLEN];
	char *sysname;
	int ret = ENODEV;
	hrtime_t start;

	if ((udev = udev_new()) == NULL)
		return (ENXIO);

	/* resolve path to a runtime device node instance */
	if (realpath(path, nodepath) == NULL)
		goto no_dev;

	sysname = strrchr(nodepath, '/') + 1;

	/*
	 * Wait up to 3 seconds for udev to set up the device node context
	 */
	start = gethrtime();
	do {
		dev = udev_device_new_from_subsystem_sysname(udev, "block",
		    sysname);
		if (dev == NULL)
			goto no_dev;
		if (udev_device_is_ready(dev))
			break;	/* udev ready */

		udev_device_unref(dev);
		dev = NULL;

		if (NSEC2MSEC(gethrtime() - start) < 10)
			(void) sched_yield();	/* yield/busy wait up to 10ms */
		else
			(void) usleep(10 * MILLISEC);

	} while (NSEC2MSEC(gethrtime() - start) < (3 * MILLISEC));

	if (dev == NULL)
		goto no_dev;

	/*
	 * Only whole disks require extra device strings
	 */
	if (!wholedisk && !udev_mpath_whole_disk(dev))
		goto no_dev;

	ret = zfs_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid));
	if (ret != 0)
		goto no_dev_ref;

	/* physical location string (optional) */
	if (zfs_device_get_physical(dev, ds->vds_devphys,
	    sizeof (ds->vds_devphys)) != 0) {
		ds->vds_devphys[0] = '\0'; /* empty string --> not available */
	}

no_dev_ref:
	udev_device_unref(dev);
no_dev:
	udev_unref(udev);

	return (ret);
#else
	(void) path;
	(void) ds;
	(void) wholedisk;
	return (ENOENT);
#endif
}

/*
 * Rescan the enclosure sysfs path for turning on enclosure LEDs and store
 * it in the nvlist (if applicable). Like:
 *	vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4'
 *
 * If an old path was in the nvlist, and the rescan cannot find a new path,
 * then keep the old path, since the disk may have been removed.
 *
 * path: The vdev path (value from ZPOOL_CONFIG_PATH)
 * key: The nvlist_t name (like ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH)
 */
void
update_vdev_config_dev_sysfs_path(nvlist_t *nv, const char *path,
    const char *key)
{
	char *upath, *spath;
	const char *oldpath = NULL;

	(void) nvlist_lookup_string(nv, key, &oldpath);

	/* Add enclosure sysfs path (if disk is in an enclosure). */
	upath = zfs_get_underlying_path(path);
	spath = zfs_get_enclosure_sysfs_path(upath);

	if (spath) {
		(void) nvlist_add_string(nv, key, spath);
	} else {
		/*
		 * We couldn't dynamically scan the disk's enclosure sysfs
		 * path. This could be because the disk went away. If there's
		 * an old enclosure sysfs path in the nvlist, then keep
		 * using it.
		 */
		if (!oldpath) {
			(void) nvlist_remove_all(nv, key);
		}
	}

	free(upath);
	free(spath);
}

/*
 * This will get called for each leaf vdev.
 */
static int
sysfs_path_pool_vdev_iter_f(void *hdl_data, nvlist_t *nv, void *data)
{
	(void) hdl_data, (void) data;

	const char *path = NULL;
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
		return (1);

	/* Rescan our enclosure sysfs path for this vdev */
	update_vdev_config_dev_sysfs_path(nv, path,
	    ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
	return (0);
}

/*
 * Given an nvlist for our pool (with vdev tree), iterate over all the
 * leaf vdevs and update their ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH.
 */
void
update_vdevs_config_dev_sysfs_path(nvlist_t *config)
{
	nvlist_t *nvroot = NULL;
	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	for_each_vdev_in_nvlist(nvroot, sysfs_path_pool_vdev_iter_f, NULL);
}

/*
 * Update a leaf vdev's persistent device strings
 *
 * - only applies for a dedicated leaf vdev (aka whole disk)
 * - updated during pool create|add|attach|import
 * - used for device matching during auto-{online,expand,replace}
 * - stored in a leaf disk config label (i.e. alongside 'path' NVP)
 * - these strings are currently not used in the kernel (i.e. for
 *   vdev_disk_open)
 *
 * single device node example:
 *	devid: 'scsi-MG03SCA300_350000494a8cb3d67-part1'
 *	phys_path: 'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0'
 *
 * multipath device node example:
 *	devid: 'dm-uuid-mpath-35000c5006304de3f'
 *
 * We also store the enclosure sysfs path for turning on enclosure LEDs
 * (if applicable):
 *	vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4'
 */
void
update_vdev_config_dev_strs(nvlist_t *nv)
{
	vdev_dev_strs_t vds;
	const char *env, *type, *path;
	uint64_t wholedisk = 0;

	/*
	 * For the benefit of legacy ZFS implementations, allow
	 * for opting out of devid strings in the vdev label.
	 *
	 * example use:
	 *	env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer
	 *
	 * explanation:
	 * Older OpenZFS implementations had issues when attempting to
	 * display pool config VDEV names if a "devid" NVP value is
	 * present in the pool's config.
	 *
	 * For example, a pool that originated on the illumos platform would
	 * have a devid value in the config and "zpool status" would fail
	 * when listing the config.
	 *
	 * A pool can be stripped of any "devid" values on import or
	 * prevented from adding them on zpool create|add by setting
	 * ZFS_VDEV_DEVID_OPT_OUT.
	 */
	env = getenv("ZFS_VDEV_DEVID_OPT_OUT");
	if (env && (strtoul(env, NULL, 0) > 0 ||
	    !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) {
		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
		return;
	}

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0 ||
	    strcmp(type, VDEV_TYPE_DISK) != 0) {
		return;
	}
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
		return;
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);

	/*
	 * Update device string values in the config nvlist.
	 */
	if (encode_device_strings(path, &vds, (boolean_t)wholedisk) == 0) {
		(void) nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vds.vds_devid);
		if (vds.vds_devphys[0] != '\0') {
			(void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
			    vds.vds_devphys);
		}
		update_vdev_config_dev_sysfs_path(nv, path,
		    ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
	} else {
		/* Clear out any stale entries. */
		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
	}
}