1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23 /*
24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25 */
26
#include <ctype.h>
#include <dirent.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/efi_partition.h>
34
35 #ifdef HAVE_LIBUDEV
36 #include <libudev.h>
37 #endif
38
39 #include <libzutil.h>
40
41 /*
42 * Append partition suffix to an otherwise fully qualified device path.
43 * This is used to generate the name the full path as its stored in
44 * ZPOOL_CONFIG_PATH for whole disk devices. On success the new length
45 * of 'path' will be returned on error a negative value is returned.
46 */
47 int
zfs_append_partition(char * path,size_t max_len)48 zfs_append_partition(char *path, size_t max_len)
49 {
50 int len = strlen(path);
51
52 if ((strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) ||
53 (strncmp(path, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0)) {
54 if (len + 6 >= max_len)
55 return (-1);
56
57 (void) strcat(path, "-part1");
58 len += 6;
59 } else {
60 if (len + 2 >= max_len)
61 return (-1);
62
63 if (isdigit(path[len-1])) {
64 (void) strcat(path, "p1");
65 len += 2;
66 } else {
67 (void) strcat(path, "1");
68 len += 1;
69 }
70 }
71
72 return (len);
73 }
74
/*
 * Remove the partition suffix from a vdev name.  Partition suffixes may take
 * three forms: "-partX", "pX", or "X", where X is a string of digits.  The
 * second case only occurs when the suffix is preceded by a digit, i.e.
 * "md0p0".  The third case only occurs when preceded by a string matching
 * the regular expression "^([hsv]|xv)d[a-z]+", i.e. a scsi, ide, virtio or
 * xen disk.
 *
 * The suffix is only removed when everything after it, up to the end of the
 * string, is digits; otherwise an unmodified copy of 'path' is returned.
 *
 * Returns a newly allocated string which the caller must free, or NULL if
 * the copy could not be allocated.
 */
char *
zfs_strip_partition(const char *path)
{
	char *tmp = strdup(path);
	char *part = NULL;	/* where the string gets cut */
	char *d = NULL;		/* first character of the digit run */

	if (tmp == NULL)
		return (NULL);

	/*
	 * The <ctype.h> classifiers are only defined for values representable
	 * as unsigned char (or EOF), hence the casts below.
	 */
	if ((part = strstr(tmp, "-part")) != NULL && part != tmp) {
		d = part + 5;
	} else if ((part = strrchr(tmp, 'p')) != NULL &&
	    part > tmp + 1 && isdigit((unsigned char)*(part - 1))) {
		d = part + 1;
	} else if ((tmp[0] == 'h' || tmp[0] == 's' || tmp[0] == 'v') &&
	    tmp[1] == 'd') {
		/* 'part' is only updated once at least one letter was seen */
		for (d = &tmp[2]; isalpha((unsigned char)*d); part = ++d)
			;
	} else if (strncmp("xvd", tmp, 3) == 0) {
		for (d = &tmp[3]; isalpha((unsigned char)*d); part = ++d)
			;
	}

	if (part != NULL && d != NULL && *d != '\0') {
		while (isdigit((unsigned char)*d))
			d++;
		/* Only strip when the digits ran to the end of the name */
		if (*d == '\0')
			*part = '\0';
	}

	return (tmp);
}
111
/*
 * Same as zfs_strip_partition(), but allows a directory prefix such as
 * "/dev/" to be part of the pathname.  Only the component after the last
 * '/' is stripped; a name without any '/' is stripped as a whole.
 *
 * path:    /dev/sda1
 * returns: /dev/sda
 *
 * Returned string must be freed.  Returns NULL if memory could not be
 * allocated.
 */
static char *
zfs_strip_partition_path(const char *path)
{
	char *newpath = strdup(path);
	char *slash;
	char *sd_offset;
	char *new_sd;

	if (newpath == NULL)
		return (NULL);

	/*
	 * Point to the "sda1" part of "/dev/sda1".  Without a '/' the whole
	 * string is the device name; strrchr() returns NULL in that case and
	 * must not be used for pointer arithmetic.
	 */
	slash = strrchr(newpath, '/');
	sd_offset = (slash != NULL) ? slash + 1 : newpath;

	/* Get our new name "sda" */
	new_sd = zfs_strip_partition(sd_offset);
	if (new_sd == NULL) {
		free(newpath);
		return (NULL);
	}

	/* Paste the "sda" over the spot where "sda1" was */
	(void) strlcpy(sd_offset, new_sd, strlen(sd_offset) + 1);

	/* Free temporary "sda" */
	free(new_sd);

	return (newpath);
}
148
/*
 * Strip the unwanted portion of a device path.
 *
 * Each entry returned by zpool_default_search_paths() is tried in order.
 * For the first entry that is a prefix of 'path' and is immediately followed
 * by a '/', a pointer to the character after that '/' is returned.  When no
 * entry matches, 'path' itself is returned.
 *
 * The result points into 'path'; nothing is allocated.
 */
const char *
zfs_strip_path(const char *path)
{
	size_t nprefixes;
	const char *const *prefixes = zpool_default_search_paths(&nprefixes);
	size_t idx = 0;

	while (idx < nprefixes) {
		const char *prefix = prefixes[idx++];
		size_t plen = strlen(prefix);

		if (strncmp(path, prefix, plen) != 0)
			continue;

		/* The prefix must end exactly on a path separator */
		if (path[plen] != '/')
			continue;

		return (&path[plen + 1]);
	}

	return (path);
}
165
/*
 * Read the first line of a sysfs file into an allocated buffer and remove
 * its trailing newline, if there is one.
 *
 * This is useful for reading sysfs files that return a single string.  At
 * most sizeof (buf) - 1 bytes are read.
 *
 * Returns an allocated string on success.  Returns NULL if the file cannot
 * be opened, nothing could be read from it, or memory could not be
 * allocated.  The returned buffer must be freed by the caller.
 */
static char *
zfs_read_sysfs_file(const char *filepath)
{
	char buf[4096];	/* all sysfs files report 4k size */
	char *str = NULL;
	FILE *fp;

	fp = fopen(filepath, "r");
	if (fp == NULL)
		return (NULL);

	if (fgets(buf, sizeof (buf), fp) != NULL) {
		size_t len = strlen(buf);

		/*
		 * Remove the last newline (if any).  'len' is zero when the
		 * data starts with a NUL byte, so check it before indexing.
		 */
		if (len > 0 && buf[len - 1] == '\n')
			buf[len - 1] = '\0';

		str = strdup(buf);
	}

	(void) fclose(fp);

	return (str);
}
199
/*
 * Given a dev name like "nvme0n1", return the full PCI slot sysfs path to
 * the drive (in /sys/bus/pci/slots).
 *
 * For example:
 *	dev_name: "nvme0n1"
 *	returns:  "/sys/bus/pci/slots/0"
 *
 * A leading directory (like "/dev/") on 'dev_name' is ignored.  The name
 * must start with "nvme"; any other name yields NULL.
 *
 * The slot is found by comparing /sys/block/<dev>/device/address, cut at its
 * last '.', against the 'address' file of every numerically named directory
 * in /sys/bus/pci/slots/.
 *
 * Returned string must be freed.  Returns NULL on error or no sysfs path.
 */
static char *
zfs_get_pci_slots_sys_path(const char *dev_name)
{
	DIR *dp = NULL;
	struct dirent *ep;
	char *address1 = NULL;
	char *address2 = NULL;
	char *path = NULL;
	char buf[MAXPATHLEN];
	const char *tmp;
	char *tmp2;

	if (dev_name == NULL)
		return (NULL);

	/* If they preface 'dev' with a path (like "/dev") then strip it off */
	tmp = strrchr(dev_name, '/');
	if (tmp != NULL)
		dev_name = tmp + 1;    /* +1 since we want the chr after '/' */

	if (strncmp("nvme", dev_name, 4) != 0)
		return (NULL);

	(void) snprintf(buf, sizeof (buf), "/sys/block/%s/device/address",
	    dev_name);

	address1 = zfs_read_sysfs_file(buf);
	if (address1 == NULL)
		return (NULL);

	/*
	 * /sys/block/nvme0n1/device/address format will
	 * be "0000:01:00.0" while /sys/bus/pci/slots/0/address will be
	 * "0000:01:00".  Just NUL terminate at the '.' so they match.
	 */
	tmp2 = strrchr(address1, '.');
	if (tmp2 != NULL)
		*tmp2 = '\0';

	dp = opendir("/sys/bus/pci/slots/");
	if (dp == NULL) {
		free(address1);
		return (NULL);
	}

	/*
	 * Look through all the /sys/bus/pci/slots/ subdirs
	 */
	while ((ep = readdir(dp)) != NULL) {
		int match;

		/*
		 * We only care about directory names that are a single number.
		 * Sometimes there's other directories like
		 * "/sys/bus/pci/slots/0-3/" in there - skip those.
		 */
		if (!zfs_isnumber(ep->d_name))
			continue;

		(void) snprintf(buf, sizeof (buf),
		    "/sys/bus/pci/slots/%s/address", ep->d_name);

		address2 = zfs_read_sysfs_file(buf);
		if (address2 == NULL)
			continue;

		match = (strcmp(address1, address2) == 0);
		free(address2);
		if (!match)
			continue;

		/* Addresses match, we're all done */
		if (asprintf(&path, "/sys/bus/pci/slots/%s",
		    ep->d_name) == -1) {
			/*
			 * If asprintf() fails, 'path' is undefined.  Reset it
			 * so a garbage pointer can never be returned.
			 */
			path = NULL;
			continue;
		}
		break;
	}

	(void) closedir(dp);
	free(address1);

	return (path);
}
290
/*
 * Given a dev name like "sda", return the full enclosure sysfs path to
 * the disk.  You can also pass in the name with "/dev" prepended
 * to it (like /dev/sda).  This works for both JBODs and NVMe PCI devices.
 *
 * For example, disk "sda" in enclosure slot 1:
 *	dev_name: "sda"
 *	returns:  "/sys/class/enclosure/1:0:3:0/Slot 1"
 *
 * Or:
 *
 *	dev_name: "nvme0n1"
 *	returns:  "/sys/bus/pci/slots/0"
 *
 * 'dev_name' must be a non-devicemapper device.
 *
 * Returned string must be freed.  Returns NULL on error, or when 'dev_name'
 * is NULL.
 */
char *
zfs_get_enclosure_sysfs_path(const char *dev_name)
{
	DIR *dp = NULL;
	struct dirent *ep;
	char buf[MAXPATHLEN];
	const char *tmp0;
	char *tmp1 = NULL;	/* "/sys/block/<dev>/device" */
	char *tmp2 = NULL;	/* candidate enclosure_device link */
	char *tmp3 = NULL;	/* "enclosure..." tail of the link target */
	char *path = NULL;
	ssize_t size;

	if (dev_name == NULL)
		return (NULL);

	/* If they preface 'dev' with a path (like "/dev") then strip it off */
	tmp0 = strrchr(dev_name, '/');
	if (tmp0 != NULL)
		dev_name = tmp0 + 1;   /* +1 since we want the chr after '/' */

	if (asprintf(&tmp1, "/sys/block/%s/device", dev_name) == -1) {
		/* If asprintf() fails, 'tmp1' is undefined */
		tmp1 = NULL;
		goto end;
	}

	dp = opendir(tmp1);
	if (dp == NULL)
		goto end;

	/*
	 * Look through all sysfs entries in /sys/block/<dev>/device for
	 * the enclosure symlink.
	 */
	while ((ep = readdir(dp)) != NULL) {
		/* Ignore everything that's not our enclosure_device link */
		if (strstr(ep->d_name, "enclosure_device") == NULL)
			continue;

		free(tmp2);
		if (asprintf(&tmp2, "%s/%s", tmp1, ep->d_name) == -1) {
			tmp2 = NULL;
			break;
		}

		/* readlink() returns ssize_t: -1 on error, else a length */
		size = readlink(tmp2, buf, sizeof (buf));

		/* Did readlink fail or crop the link name? */
		if (size < 0 || (size_t)size >= sizeof (buf))
			break;

		/*
		 * We got a valid link.  readlink() doesn't terminate strings
		 * so we have to do it.
		 */
		buf[size] = '\0';

		/*
		 * Our link will look like:
		 *
		 * "../../../../port-11:1:2/..STUFF../enclosure/1:0:3:0/SLOT 1"
		 *
		 * We want to grab the "enclosure/1:0:3:0/SLOT 1" part
		 */
		tmp3 = strstr(buf, "enclosure");
		if (tmp3 == NULL)
			break;

		free(path);
		if (asprintf(&path, "/sys/class/%s", tmp3) == -1) {
			/* If asprintf() fails, 'path' is undefined */
			path = NULL;
			break;
		}
	}

end:
	free(tmp2);
	free(tmp1);

	if (dp != NULL)
		(void) closedir(dp);

	if (path == NULL) {
		/*
		 * This particular disk isn't in a JBOD.  It could be an NVMe
		 * drive.  If so, look up the NVMe device's path in
		 * /sys/bus/pci/slots/.  Within that directory is an
		 * 'attention' file which controls the NVMe fault LED.
		 */
		path = zfs_get_pci_slots_sys_path(dev_name);
	}

	return (path);
}
408
/*
 * Allocate and return the underlying device name for a device mapper device.
 *
 * For example, dm_name = "/dev/dm-0" could return "/dev/sda".  Symlinks to a
 * DM device (like /dev/disk/by-vdev/A0) are also allowed.
 *
 * If the DM device has multiple underlying devices (like with multipath
 * DM devices), then favor underlying devices that have a symlink back to
 * their enclosure device in sysfs.  This will be useful for the zedlet
 * scripts that toggle the fault LED.
 *
 * Returns an underlying device name, or NULL on error or no match.  If
 * dm_name is not a DM device then return NULL.
 *
 * NOTE: The returned name string must be *freed*.
 */
static char *
dm_get_underlying_path(const char *dm_name)
{
	DIR *dp = NULL;
	struct dirent *ep;
	char *realp;
	const char *slash;
	const char *dev_str;
	char *slaves_dir = NULL;
	char *path = NULL;
	char *first_path = NULL;
	char *enclosure_path;

	if (dm_name == NULL)
		return (NULL);

	/* dm name may be a symlink (like /dev/disk/by-vdev/A0) */
	realp = realpath(dm_name, NULL);
	if (realp == NULL)
		return (NULL);

	/*
	 * If they preface 'dev' with a path (like "/dev") then strip it off.
	 * We just want the 'dm-N' part.  Should there be no '/' at all, the
	 * whole string is the device name (never pass NULL to "%s").
	 */
	slash = strrchr(realp, '/');
	dev_str = (slash != NULL) ? slash + 1 : realp;

	if (asprintf(&slaves_dir, "/sys/block/%s/slaves/", dev_str) == -1) {
		/* If asprintf() fails, 'slaves_dir' is undefined */
		slaves_dir = NULL;
		goto end;
	}

	dp = opendir(slaves_dir);
	if (dp == NULL)
		goto end;

	/*
	 * A device-mapper device can have multiple paths to it (multipath).
	 * Favor paths that have a symlink back to their enclosure device.
	 * We have to do this since some enclosures may only provide a symlink
	 * back for one underlying path to a disk and not the other.
	 *
	 * If no paths have links back to their enclosure, then just return the
	 * first path.
	 */
	while ((ep = readdir(dp)) != NULL) {
		if (ep->d_type == DT_DIR)	/* skip "." and ".." dirs */
			continue;

		if (first_path == NULL)
			first_path = strdup(ep->d_name);

		enclosure_path = zfs_get_enclosure_sysfs_path(ep->d_name);
		if (enclosure_path == NULL)
			continue;

		if (asprintf(&path, "/dev/%s", ep->d_name) == -1)
			path = NULL;
		free(enclosure_path);
		break;
	}

end:
	if (dp != NULL)
		(void) closedir(dp);
	free(slaves_dir);
	free(realp);

	if (path == NULL && first_path != NULL) {
		/*
		 * None of the underlying paths had a link back to their
		 * enclosure devices.  Throw up our hands and return the first
		 * underlying path.
		 */
		if (asprintf(&path, "/dev/%s", first_path) == -1)
			path = NULL;
	}

	free(first_path);
	return (path);
}
510
511 /*
512 * Return B_TRUE if device is a device mapper or multipath device.
513 * Return B_FALSE if not.
514 */
515 boolean_t
zfs_dev_is_dm(const char * dev_name)516 zfs_dev_is_dm(const char *dev_name)
517 {
518
519 char *tmp;
520 tmp = dm_get_underlying_path(dev_name);
521 if (tmp == NULL)
522 return (B_FALSE);
523
524 free(tmp);
525 return (B_TRUE);
526 }
527
528 /*
529 * By "whole disk" we mean an entire physical disk (something we can
530 * label, toggle the write cache on, etc.) as opposed to the full
531 * capacity of a pseudo-device such as lofi or did. We act as if we
532 * are labeling the disk, which should be a pretty good test of whether
533 * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if
534 * it isn't.
535 */
536 boolean_t
zfs_dev_is_whole_disk(const char * dev_name)537 zfs_dev_is_whole_disk(const char *dev_name)
538 {
539 struct dk_gpt *label = NULL;
540 int fd;
541
542 if ((fd = open(dev_name, O_RDONLY | O_DIRECT | O_CLOEXEC)) < 0)
543 return (B_FALSE);
544
545 if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) {
546 (void) close(fd);
547 return (B_FALSE);
548 }
549
550 efi_free(label);
551 (void) close(fd);
552
553 return (B_TRUE);
554 }
555
/*
 * Look up the underlying device for a device name.
 *
 * Often you'll have a symlink to a device, a partition device, or a
 * multipath device, and want to look up the underlying device.  This
 * function returns the underlying device name with any partition suffix
 * removed.  If the name is already the underlying device, the same name is
 * returned.  For a DM device the name is whatever dm_get_underlying_path()
 * selects; for anything else the name is resolved with realpath().
 *
 * For example:
 *
 * 1. /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 -> ../../sda
 *	dev_name: /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001
 *	returns:  /dev/sda
 *
 * 2. /dev/mapper/mpatha (made up of /dev/sda and /dev/sdb)
 *	dev_name: /dev/mapper/mpatha
 *	returns:  /dev/sda (first device)
 *
 * 3. /dev/sda (already the underlying device)
 *	dev_name: /dev/sda
 *	returns:  /dev/sda
 *
 * 4. /dev/dm-3 (mapped to /dev/sda)
 *	dev_name: /dev/dm-3
 *	returns:  /dev/sda
 *
 * 5. /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 -> ../../sdb9
 *	dev_name: /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9
 *	returns:  /dev/sdb
 *
 * 6. /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a -> ../dev/sda2
 *	dev_name: /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a
 *	returns:  /dev/sda
 *
 * Returns underlying device name, or NULL on error or no match.
 *
 * NOTE: The returned name string must be *freed*.
 */
char *
zfs_get_underlying_path(const char *dev_name)
{
	char *resolved;
	char *stripped;

	if (dev_name == NULL)
		return (NULL);

	resolved = dm_get_underlying_path(dev_name);

	/* dev_name not a DM device, so just un-symlinkize it */
	if (resolved == NULL)
		resolved = realpath(dev_name, NULL);

	if (resolved == NULL)
		return (NULL);

	stripped = zfs_strip_partition_path(resolved);
	free(resolved);

	return (stripped);
}
618
619
620 #ifdef HAVE_LIBUDEV
621
622 /*
623 * A disk is considered a multipath whole disk when:
624 * DEVNAME key value has "dm-"
625 * DM_UUID key exists and starts with 'mpath-'
626 * ID_PART_TABLE_TYPE key does not exist or is not gpt
627 * ID_FS_LABEL key does not exist (disk isn't labeled)
628 */
629 static boolean_t
is_mpath_udev_sane(struct udev_device * dev)630 is_mpath_udev_sane(struct udev_device *dev)
631 {
632 const char *devname, *type, *uuid, *label;
633
634 devname = udev_device_get_property_value(dev, "DEVNAME");
635 type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
636 uuid = udev_device_get_property_value(dev, "DM_UUID");
637 label = udev_device_get_property_value(dev, "ID_FS_LABEL");
638
639 if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) &&
640 ((type == NULL) || (strcmp(type, "gpt") != 0)) &&
641 ((uuid != NULL) && (strncmp(uuid, "mpath-", 6) == 0)) &&
642 (label == NULL)) {
643 return (B_TRUE);
644 }
645
646 return (B_FALSE);
647 }
648
649 /*
650 * Check if a disk is a multipath "blank" disk:
651 *
652 * 1. The disk has udev values that suggest it's a multipath disk
653 * 2. The disk is not currently labeled with a filesystem of any type
654 * 3. There are no partitions on the disk
655 */
656 boolean_t
is_mpath_whole_disk(const char * path)657 is_mpath_whole_disk(const char *path)
658 {
659 struct udev *udev;
660 struct udev_device *dev = NULL;
661 char nodepath[MAXPATHLEN];
662 char *sysname;
663
664 if (realpath(path, nodepath) == NULL)
665 return (B_FALSE);
666 sysname = strrchr(nodepath, '/') + 1;
667 if (strncmp(sysname, "dm-", 3) != 0)
668 return (B_FALSE);
669 if ((udev = udev_new()) == NULL)
670 return (B_FALSE);
671 if ((dev = udev_device_new_from_subsystem_sysname(udev, "block",
672 sysname)) == NULL) {
673 udev_device_unref(dev);
674 return (B_FALSE);
675 }
676
677 /* Sanity check some udev values */
678 boolean_t is_sane = is_mpath_udev_sane(dev);
679 udev_device_unref(dev);
680
681 return (is_sane);
682 }
683
684 #else /* HAVE_LIBUDEV */
685
686 boolean_t
is_mpath_whole_disk(const char * path)687 is_mpath_whole_disk(const char *path)
688 {
689 (void) path;
690 return (B_FALSE);
691 }
692
693 #endif /* HAVE_LIBUDEV */
694