xref: /freebsd/sys/contrib/openzfs/lib/libzutil/os/linux/zutil_device_path_os.c (revision efa8679e7f69c9cc225613827d9f75644cca5b3b)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25  */
26 
27 #include <ctype.h>
28 #include <dirent.h>
29 #include <fcntl.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <sys/efi_partition.h>
34 
35 #ifdef HAVE_LIBUDEV
36 #include <libudev.h>
37 #endif
38 
39 #include <libzutil.h>
40 
41 /*
42  * Append partition suffix to an otherwise fully qualified device path.
43  * This is used to generate the name the full path as its stored in
44  * ZPOOL_CONFIG_PATH for whole disk devices.  On success the new length
45  * of 'path' will be returned on error a negative value is returned.
46  */
47 int
zfs_append_partition(char * path,size_t max_len)48 zfs_append_partition(char *path, size_t max_len)
49 {
50 	int len = strlen(path);
51 
52 	if ((strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) ||
53 	    (strncmp(path, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0)) {
54 		if (len + 6 >= max_len)
55 			return (-1);
56 
57 		(void) strcat(path, "-part1");
58 		len += 6;
59 	} else {
60 		if (len + 2 >= max_len)
61 			return (-1);
62 
63 		if (isdigit(path[len-1])) {
64 			(void) strcat(path, "p1");
65 			len += 2;
66 		} else {
67 			(void) strcat(path, "1");
68 			len += 1;
69 		}
70 	}
71 
72 	return (len);
73 }
74 
/*
 * Remove the partition suffix from a vdev name.  Partition suffixes may take
 * three forms: "-partX", "pX", or "X", where X is a string of digits.  The
 * second case only applies when the 'p' is preceded by a digit, i.e. "md0p0".
 * The third case only applies when the name starts with "hd", "sd", "vd" or
 * "xvd" followed by at least one letter, i.e. a scsi, ide, virtio or xen
 * disk.
 *
 * The suffix is only removed when the digits run to the end of the name;
 * otherwise an unmodified copy of 'path' is returned.
 *
 * Returns a newly allocated string which the caller must free, or NULL if
 * the allocation fails.
 */
char *
zfs_strip_partition(const char *path)
{
	char *tmp = strdup(path);
	char *part = NULL, *d = NULL;

	if (tmp == NULL)
		return (NULL);

	/*
	 * Use the last "-part" in the name.  Matching the first one would
	 * miss the real suffix of names such as "ata-partner-part1".
	 */
	part = strstr(tmp, "-part");
	if (part != NULL) {
		char *next;

		while ((next = strstr(part + 1, "-part")) != NULL)
			part = next;
	}

	/* ctype functions require an unsigned char value, hence the casts */
	if (part != NULL && part != tmp) {
		d = part + 5;
	} else if ((part = strrchr(tmp, 'p')) != NULL &&
	    part > tmp + 1 && isdigit((unsigned char)*(part - 1))) {
		d = part + 1;
	} else if ((tmp[0] == 'h' || tmp[0] == 's' || tmp[0] == 'v') &&
	    tmp[1] == 'd') {
		/* 'part' ends up just past the drive letters, if any */
		part = NULL;
		for (d = &tmp[2]; isalpha((unsigned char)*d); part = ++d) { }
	} else if (strncmp("xvd", tmp, 3) == 0) {
		part = NULL;
		for (d = &tmp[3]; isalpha((unsigned char)*d); part = ++d) { }
	}

	/* Only truncate when everything after the marker is digits */
	if (part != NULL && d != NULL && *d != '\0') {
		for (; isdigit((unsigned char)*d); d++) { }
		if (*d == '\0')
			*part = '\0';
	}

	return (tmp);
}
111 
/*
 * Same as zfs_strip_partition(), but allows a directory prefix such as
 * "/dev/" to be part of the pathname.  Only the component after the last
 * '/' is stripped; a path without any '/' is treated as a bare device name.
 *
 * path:	/dev/sda1
 * returns:	/dev/sda
 *
 * Returns a newly allocated string which must be freed, or NULL if an
 * allocation fails.
 */
static char *
zfs_strip_partition_path(const char *path)
{
	char *newpath = strdup(path);
	char *slash;
	char *sd_offset;
	char *new_sd;

	if (newpath == NULL)
		return (NULL);

	/*
	 * Point to the "sda1" part of "/dev/sda1".  strrchr() returns NULL
	 * when there is no '/', in which case the whole string is the name.
	 */
	slash = strrchr(newpath, '/');
	sd_offset = (slash != NULL) ? slash + 1 : newpath;

	/* Get our new name "sda" */
	new_sd = zfs_strip_partition(sd_offset);
	if (new_sd == NULL) {
		free(newpath);
		return (NULL);
	}

	/* Paste the "sda" where "sda1" was; the copy is bounded by the old name */
	(void) strlcpy(sd_offset, new_sd, strlen(sd_offset) + 1);

	/* Free temporary "sda" */
	free(new_sd);

	return (newpath);
}
148 
/*
 * Strip the unwanted portion of a device path.
 *
 * If 'path' begins with one of the directories reported by
 * zpool_default_search_paths() followed by a '/', a pointer to the
 * character after that '/' is returned.  Otherwise 'path' itself is
 * returned.  The result always points into 'path'; nothing is allocated.
 */
const char *
zfs_strip_path(const char *path)
{
	size_t nprefixes;
	const char *const *prefixes = zpool_default_search_paths(&nprefixes);

	for (size_t idx = 0; idx < nprefixes; idx++) {
		const char *prefix = prefixes[idx];
		size_t plen = strlen(prefix);

		/* The search path must match in full ... */
		if (strncmp(path, prefix, plen) != 0)
			continue;

		/* ... and end on a directory boundary */
		if (path[plen] != '/')
			continue;

		return (path + plen + 1);
	}

	return (path);
}
165 
/*
 * Read the first line of a sysfs file into an allocated buffer and remove
 * the trailing newline, if there is one.
 *
 * This is useful for reading sysfs files that return a single string.  At
 * most sizeof (buf) - 1 bytes are read.  Returns an allocated string on
 * success, or NULL if the file cannot be opened, nothing could be read, or
 * the allocation fails.  The returned buffer must be freed by the caller.
 */
static char *
zfs_read_sysfs_file(const char *filepath)
{
	char buf[4096];	/* all sysfs files report 4k size */
	char *str = NULL;
	FILE *fp;

	fp = fopen(filepath, "r");
	if (fp == NULL)
		return (NULL);

	if (fgets(buf, sizeof (buf), fp) != NULL) {
		/*
		 * Remove the last newline (if any).  The length can be zero
		 * when the data starts with a NUL byte, so it must be checked
		 * before indexing buf[len - 1].
		 */
		size_t len = strlen(buf);

		if (len > 0 && buf[len - 1] == '\n')
			buf[len - 1] = '\0';

		str = strdup(buf);
	}

	(void) fclose(fp);

	return (str);
}
199 
/*
 * Given a dev name like "nvme0n1", return the full PCI slot sysfs path to
 * the drive (in /sys/bus/pci/slots).
 *
 * For example:
 *     dev_name:       "nvme0n1"
 *     returns:        "/sys/bus/pci/slots/0"
 *
 * 'dev_name' may carry a leading directory (like "/dev/"), which is ignored.
 * The remaining name must start with "nvme".
 *
 * Returned string must be freed.  Returns NULL on error, if 'dev_name' is
 * NULL or not an NVMe name, or if no matching slot is found.
 */
static char *
zfs_get_pci_slots_sys_path(const char *dev_name)
{
	DIR *dp = NULL;
	struct dirent *ep;
	char *address1 = NULL;
	char *address2 = NULL;
	char *path = NULL;
	char buf[MAXPATHLEN];
	const char *tmp;
	char *tmp2;

	if (dev_name == NULL)
		return (NULL);

	/* If they preface 'dev' with a path (like "/dev") then strip it off */
	tmp = strrchr(dev_name, '/');
	if (tmp != NULL)
		dev_name = tmp + 1;    /* +1 since we want the chr after '/' */

	if (strncmp("nvme", dev_name, 4) != 0)
		return (NULL);

	(void) snprintf(buf, sizeof (buf), "/sys/block/%s/device/address",
	    dev_name);

	address1 = zfs_read_sysfs_file(buf);
	if (address1 == NULL)
		return (NULL);

	/*
	 * /sys/block/nvme0n1/device/address format will
	 * be "0000:01:00.0" while /sys/bus/pci/slots/0/address will be
	 * "0000:01:00".  Just NUL terminate at the '.' so they match.
	 */
	tmp2 = strrchr(address1, '.');
	if (tmp2 != NULL)
		*tmp2 = '\0';

	dp = opendir("/sys/bus/pci/slots/");
	if (dp == NULL) {
		free(address1);
		return (NULL);
	}

	/*
	 * Look through all the /sys/bus/pci/slots/ subdirs
	 */
	while ((ep = readdir(dp)) != NULL) {
		int match;

		/*
		 * We only care about directory names that are a single number.
		 * Sometimes there's other directories like
		 * "/sys/bus/pci/slots/0-3/" in there - skip those.
		 */
		if (!zfs_isnumber(ep->d_name))
			continue;

		(void) snprintf(buf, sizeof (buf),
		    "/sys/bus/pci/slots/%s/address", ep->d_name);

		address2 = zfs_read_sysfs_file(buf);
		if (address2 == NULL)
			continue;

		match = (strcmp(address1, address2) == 0);
		free(address2);
		if (!match)
			continue;

		/*
		 * Addresses match, we're all done.  If asprintf() fails the
		 * contents of 'path' are undefined, so reset it rather than
		 * risk returning a garbage pointer.
		 */
		if (asprintf(&path, "/sys/bus/pci/slots/%s",
		    ep->d_name) == -1)
			path = NULL;
		break;
	}

	(void) closedir(dp);
	free(address1);

	return (path);
}
290 
/*
 * Given a dev name like "sda", return the full enclosure sysfs path to
 * the disk.  You can also pass in the name with "/dev" prepended
 * to it (like /dev/sda).  This works for both JBODs and NVMe PCI devices.
 *
 * For example, disk "sda" in enclosure slot 1:
 *     dev_name:       "sda"
 *     returns:        "/sys/class/enclosure/1:0:3:0/Slot 1"
 *
 * Or:
 *
 *      dev_name:   "nvme0n1"
 *      returns:    "/sys/bus/pci/slots/0"
 *
 * 'dev_name' must be a non-devicemapper device.
 *
 * Returned string must be freed.  Returns NULL on error, or when neither an
 * enclosure link nor a PCI slot is found for the device.
 */
char *
zfs_get_enclosure_sysfs_path(const char *dev_name)
{
	DIR *dp = NULL;
	struct dirent *ep;
	char buf[MAXPATHLEN];
	const char *slash;
	char *dev_dir = NULL;
	char *link_path = NULL;
	char *encl;
	char *path = NULL;
	ssize_t size;

	if (dev_name == NULL)
		return (NULL);

	/* If they preface 'dev' with a path (like "/dev") then strip it off */
	slash = strrchr(dev_name, '/');
	if (slash != NULL)
		dev_name = slash + 1;    /* +1 since we want the chr after '/' */

	if (asprintf(&dev_dir, "/sys/block/%s/device", dev_name) == -1) {
		/* If asprintf() fails, 'dev_dir' is undefined */
		dev_dir = NULL;
		goto end;
	}

	dp = opendir(dev_dir);
	if (dp == NULL)
		goto end;

	/*
	 * Look through all sysfs entries in /sys/block/<dev>/device for
	 * the enclosure symlink.
	 */
	while ((ep = readdir(dp)) != NULL) {
		/* Ignore everything that's not our enclosure_device link */
		if (strstr(ep->d_name, "enclosure_device") == NULL)
			continue;

		free(link_path);
		if (asprintf(&link_path, "%s/%s", dev_dir, ep->d_name) == -1) {
			link_path = NULL;
			break;
		}

		/* readlink() returns ssize_t: -1 on error, else the length */
		size = readlink(link_path, buf, sizeof (buf));

		/* Did readlink fail or crop the link name? */
		if (size < 0 || (size_t)size >= sizeof (buf))
			break;

		/*
		 * We got a valid link.  readlink() doesn't terminate strings
		 * so we have to do it.
		 */
		buf[size] = '\0';

		/*
		 * Our link will look like:
		 *
		 * "../../../../port-11:1:2/..STUFF../enclosure/1:0:3:0/SLOT 1"
		 *
		 * We want to grab the "enclosure/1:0:3:0/SLOT 1" part
		 */
		encl = strstr(buf, "enclosure");
		if (encl == NULL)
			break;

		free(path);
		if (asprintf(&path, "/sys/class/%s", encl) == -1) {
			/* If asprintf() fails, 'path' is undefined */
			path = NULL;
			break;
		}
	}

end:
	free(link_path);
	free(dev_dir);

	if (dp != NULL)
		(void) closedir(dp);

	if (path == NULL) {
		/*
		 * This particular disk isn't in a JBOD.  It could be an NVMe
		 * drive. If so, look up the NVMe device's path in
		 * /sys/bus/pci/slots/. Within that directory is an 'attention'
		 * file which controls the NVMe fault LED.
		 */
		path = zfs_get_pci_slots_sys_path(dev_name);
	}

	return (path);
}
408 
/*
 * Allocate and return the underlying device name for a device mapper device.
 *
 * For example, dm_name = "/dev/dm-0" could return "/dev/sda". Symlinks to a
 * DM device (like /dev/disk/by-vdev/A0) are also allowed.
 *
 * If the DM device has multiple underlying devices (like with multipath
 * DM devices), then favor underlying devices that have a symlink back to
 * their enclosure device in sysfs.  This will be useful for the zedlet
 * scripts that toggle the fault LED.
 *
 * Returns an underlying device name, or NULL on error or no match.  If dm_name
 * has no /sys/block/<name>/slaves/ directory (i.e. it is not a DM device)
 * then NULL is returned.
 *
 * NOTE: The returned name string must be *freed*.
 */
static char *
dm_get_underlying_path(const char *dm_name)
{
	DIR *dp = NULL;
	struct dirent *ep;
	char *realp;
	char *slash;
	char *slaves_dir = NULL;
	char *path = NULL;
	const char *dev_str;
	char *first_path = NULL;
	char *enclosure_path;

	if (dm_name == NULL)
		return (NULL);

	/* dm name may be a symlink (like /dev/disk/by-vdev/A0) */
	realp = realpath(dm_name, NULL);
	if (realp == NULL)
		return (NULL);

	/*
	 * If they preface 'dev' with a path (like "/dev") then strip it off.
	 * We just want the 'dm-N' part.  Without a '/' the whole string is
	 * the name; never hand a NULL pointer to the "%s" below.
	 */
	slash = strrchr(realp, '/');
	dev_str = (slash != NULL) ? slash + 1 : realp;

	if (asprintf(&slaves_dir, "/sys/block/%s/slaves/", dev_str) == -1) {
		/* If asprintf() fails, 'slaves_dir' is undefined */
		slaves_dir = NULL;
		goto end;
	}

	dp = opendir(slaves_dir);
	if (dp == NULL)
		goto end;

	/*
	 * A device-mapper device can have multiple paths to it (multipath).
	 * Favor paths that have a symlink back to their enclosure device.
	 * We have to do this since some enclosures may only provide a symlink
	 * back for one underlying path to a disk and not the other.
	 *
	 * If no paths have links back to their enclosure, then just return the
	 * first path.
	 */
	while ((ep = readdir(dp)) != NULL) {
		if (ep->d_type == DT_DIR)	/* skip "." and ".." dirs */
			continue;

		if (first_path == NULL)
			first_path = strdup(ep->d_name);

		enclosure_path = zfs_get_enclosure_sysfs_path(ep->d_name);
		if (enclosure_path == NULL)
			continue;

		if (asprintf(&path, "/dev/%s", ep->d_name) == -1)
			path = NULL;
		free(enclosure_path);
		break;
	}

end:
	if (dp != NULL)
		(void) closedir(dp);
	free(slaves_dir);
	free(realp);

	if (path == NULL && first_path != NULL) {
		/*
		 * None of the underlying paths had a link back to their
		 * enclosure devices.  Throw up our hands and return the first
		 * underlying path.
		 */
		if (asprintf(&path, "/dev/%s", first_path) == -1)
			path = NULL;
	}

	free(first_path);
	return (path);
}
510 
511 /*
512  * Return B_TRUE if device is a device mapper or multipath device.
513  * Return B_FALSE if not.
514  */
515 boolean_t
zfs_dev_is_dm(const char * dev_name)516 zfs_dev_is_dm(const char *dev_name)
517 {
518 
519 	char *tmp;
520 	tmp = dm_get_underlying_path(dev_name);
521 	if (tmp == NULL)
522 		return (B_FALSE);
523 
524 	free(tmp);
525 	return (B_TRUE);
526 }
527 
528 /*
529  * By "whole disk" we mean an entire physical disk (something we can
530  * label, toggle the write cache on, etc.) as opposed to the full
531  * capacity of a pseudo-device such as lofi or did.  We act as if we
532  * are labeling the disk, which should be a pretty good test of whether
533  * it's a viable device or not.  Returns B_TRUE if it is and B_FALSE if
534  * it isn't.
535  */
536 boolean_t
zfs_dev_is_whole_disk(const char * dev_name)537 zfs_dev_is_whole_disk(const char *dev_name)
538 {
539 	struct dk_gpt *label = NULL;
540 	int fd;
541 
542 	if ((fd = open(dev_name, O_RDONLY | O_DIRECT | O_CLOEXEC)) < 0)
543 		return (B_FALSE);
544 
545 	if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) {
546 		(void) close(fd);
547 		return (B_FALSE);
548 	}
549 
550 	efi_free(label);
551 	(void) close(fd);
552 
553 	return (B_TRUE);
554 }
555 
/*
 * Look up the underlying device for a device name.
 *
 * Often you'll have a symlink to a device, a partition device,
 * or a multipath device, and want to look up the underlying device.
 * This function returns the underlying device name.  If the device
 * name is already the underlying device, then just return the same
 * name.  If the device is a DM device with multiple underlying devices
 * then return the one chosen by dm_get_underlying_path().
 *
 * For example:
 *
 * 1. /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 -> ../../sda
 * dev_name:	/dev/disk/by-id/ata-QEMU_HARDDISK_QM00001
 * returns:	/dev/sda
 *
 * 2. /dev/mapper/mpatha (made up of /dev/sda and /dev/sdb)
 * dev_name:	/dev/mapper/mpatha
 * returns:	/dev/sda (first device)
 *
 * 3. /dev/sda (already the underlying device)
 * dev_name:	/dev/sda
 * returns:	/dev/sda
 *
 * 4. /dev/dm-3 (mapped to /dev/sda)
 * dev_name:	/dev/dm-3
 * returns:	/dev/sda
 *
 * 5. /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 -> ../../sdb9
 * dev_name:	/dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9
 * returns:	/dev/sdb
 *
 * 6. /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a -> ../../sda2
 * dev_name:	/dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a
 * returns:	/dev/sda
 *
 * Returns underlying device name, or NULL on error or no match.
 *
 * NOTE: The returned name string must be *freed*.
 */
char *
zfs_get_underlying_path(const char *dev_name)
{
	char *resolved;
	char *stripped;

	if (dev_name == NULL)
		return (NULL);

	resolved = dm_get_underlying_path(dev_name);
	if (resolved == NULL) {
		/* dev_name not a DM device, so just un-symlinkize it */
		resolved = realpath(dev_name, NULL);
		if (resolved == NULL)
			return (NULL);
	}

	/* Drop any partition suffix from the last path component */
	stripped = zfs_strip_partition_path(resolved);
	free(resolved);

	return (stripped);
}
618 
619 
620 #ifdef HAVE_LIBUDEV
621 
622 /*
623  * A disk is considered a multipath whole disk when:
624  *	DEVNAME key value has "dm-"
625  *	DM_UUID key exists and starts with 'mpath-'
626  *	ID_PART_TABLE_TYPE key does not exist or is not gpt
627  *	ID_FS_LABEL key does not exist (disk isn't labeled)
628  */
629 static boolean_t
is_mpath_udev_sane(struct udev_device * dev)630 is_mpath_udev_sane(struct udev_device *dev)
631 {
632 	const char *devname, *type, *uuid, *label;
633 
634 	devname = udev_device_get_property_value(dev, "DEVNAME");
635 	type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
636 	uuid = udev_device_get_property_value(dev, "DM_UUID");
637 	label = udev_device_get_property_value(dev, "ID_FS_LABEL");
638 
639 	if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) &&
640 	    ((type == NULL) || (strcmp(type, "gpt") != 0)) &&
641 	    ((uuid != NULL) && (strncmp(uuid, "mpath-", 6) == 0)) &&
642 	    (label == NULL)) {
643 		return (B_TRUE);
644 	}
645 
646 	return (B_FALSE);
647 }
648 
649 /*
650  * Check if a disk is a multipath "blank" disk:
651  *
652  * 1. The disk has udev values that suggest it's a multipath disk
653  * 2. The disk is not currently labeled with a filesystem of any type
654  * 3. There are no partitions on the disk
655  */
656 boolean_t
is_mpath_whole_disk(const char * path)657 is_mpath_whole_disk(const char *path)
658 {
659 	struct udev *udev;
660 	struct udev_device *dev = NULL;
661 	char nodepath[MAXPATHLEN];
662 	char *sysname;
663 
664 	if (realpath(path, nodepath) == NULL)
665 		return (B_FALSE);
666 	sysname = strrchr(nodepath, '/') + 1;
667 	if (strncmp(sysname, "dm-", 3) != 0)
668 		return (B_FALSE);
669 	if ((udev = udev_new()) == NULL)
670 		return (B_FALSE);
671 	if ((dev = udev_device_new_from_subsystem_sysname(udev, "block",
672 	    sysname)) == NULL) {
673 		udev_device_unref(dev);
674 		return (B_FALSE);
675 	}
676 
677 	/* Sanity check some udev values */
678 	boolean_t is_sane = is_mpath_udev_sane(dev);
679 	udev_device_unref(dev);
680 
681 	return (is_sane);
682 }
683 
684 #else /* HAVE_LIBUDEV */
685 
686 boolean_t
is_mpath_whole_disk(const char * path)687 is_mpath_whole_disk(const char *path)
688 {
689 	(void) path;
690 	return (B_FALSE);
691 }
692 
693 #endif /* HAVE_LIBUDEV */
694