1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 #include <ctype.h>
27 #include <dirent.h>
28 #include <fcntl.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <sys/efi_partition.h>
33
34 #ifdef HAVE_LIBUDEV
35 #include <libudev.h>
36 #endif
37
38 #include <libzutil.h>
39
40 /*
41 * Append partition suffix to an otherwise fully qualified device path.
42 * This is used to generate the name the full path as its stored in
43 * ZPOOL_CONFIG_PATH for whole disk devices. On success the new length
44 * of 'path' will be returned on error a negative value is returned.
45 */
46 int
zfs_append_partition(char * path,size_t max_len)47 zfs_append_partition(char *path, size_t max_len)
48 {
49 int len = strlen(path);
50
51 if ((strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) ||
52 (strncmp(path, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0)) {
53 if (len + 6 >= max_len)
54 return (-1);
55
56 (void) strcat(path, "-part1");
57 len += 6;
58 } else {
59 if (len + 2 >= max_len)
60 return (-1);
61
62 if (isdigit(path[len-1])) {
63 (void) strcat(path, "p1");
64 len += 2;
65 } else {
66 (void) strcat(path, "1");
67 len += 1;
68 }
69 }
70
71 return (len);
72 }
73
/*
 * Remove the partition suffix from a vdev name.  Partition suffixes may take
 * three forms: "-partX", "pX", or "X", where X is a string of digits.
 *
 *  - "-partX" is recognized at the first "-part" in the name, provided it
 *    is not at the very start of the name.
 *  - "pX" is only recognized when the last 'p' in the name is preceded by
 *    a digit, e.g. "md0p0".
 *  - "X" is only recognized when the name starts with "hd", "sd", "vd" or
 *    "xvd" followed by letters, e.g. a scsi, ide, virtio or xen disk.
 *
 * In every case the suffix is removed only if nothing but digits follows it
 * up to the end of the name; otherwise the name is returned unchanged.
 *
 * Returns a newly allocated string that the caller must free, or NULL if
 * the allocation fails.
 */
char *
zfs_strip_partition(const char *path)
{
	char *tmp = strdup(path);
	char *part = NULL, *d = NULL;

	if (tmp == NULL)
		return (NULL);

	/*
	 * 'part' marks where the name would be cut and 'd' marks the first
	 * character that must be a digit for the cut to happen.  The
	 * <ctype.h> arguments are cast to unsigned char because passing a
	 * negative plain char to them is undefined.
	 */
	if ((part = strstr(tmp, "-part")) != NULL && part != tmp) {
		d = part + 5;
	} else if ((part = strrchr(tmp, 'p')) != NULL &&
	    part > tmp + 1 && isdigit((unsigned char)*(part - 1))) {
		d = part + 1;
	} else if ((tmp[0] == 'h' || tmp[0] == 's' || tmp[0] == 'v') &&
	    tmp[1] == 'd') {
		for (d = &tmp[2]; isalpha((unsigned char)*d); part = ++d) { }
	} else if (strncmp("xvd", tmp, 3) == 0) {
		for (d = &tmp[3]; isalpha((unsigned char)*d); part = ++d) { }
	}

	/* Truncate only when digits, and nothing else, run to the end. */
	if (part != NULL && d != NULL && *d != '\0') {
		for (; isdigit((unsigned char)*d); d++) { }
		if (*d == '\0')
			*part = '\0';
	}

	return (tmp);
}
110
/*
 * Same as zfs_strip_partition(), but allows a directory prefix such as
 * "/dev/" to be present in the pathname.  Only the final path component is
 * examined; everything up to and including the last '/' is preserved.
 *
 * path:    /dev/sda1
 * returns: /dev/sda
 *
 * Returns a newly allocated string that must be freed, or NULL if an
 * allocation fails.
 */
static char *
zfs_strip_partition_path(const char *path)
{
	char *newpath = strdup(path);
	char *slash;
	char *sd_offset;
	char *new_sd;

	if (newpath == NULL)
		return (NULL);

	/*
	 * Point to the "sda1" part of "/dev/sda1".  A path without any '/'
	 * is treated as a bare device name rather than dereferencing the
	 * NULL that strrchr() returns in that case.
	 */
	slash = strrchr(newpath, '/');
	sd_offset = (slash != NULL) ? slash + 1 : newpath;

	/* Get our new name "sda" */
	new_sd = zfs_strip_partition(sd_offset);
	if (new_sd == NULL) {
		free(newpath);
		return (NULL);
	}

	/*
	 * Paste the "sda" where "sda1" was.  The stripped name is never
	 * longer than the original component, so it always fits.
	 */
	(void) strlcpy(sd_offset, new_sd, strlen(sd_offset) + 1);

	/* Free temporary "sda" */
	free(new_sd);

	return (newpath);
}
147
/*
 * Strip a leading default search directory from a device path.
 *
 * If 'path' starts with one of the directories returned by
 * zpool_default_search_paths() and that directory is immediately followed
 * by a '/', a pointer to the character after that '/' is returned.
 * Otherwise 'path' itself is returned.  The result always points into
 * 'path'; nothing is allocated.
 */
const char *
zfs_strip_path(const char *path)
{
	size_t nroots;
	const char *const *roots = zpool_default_search_paths(&nroots);

	for (size_t idx = 0; idx < nroots; ++idx) {
		const char *root = roots[idx];
		size_t rootlen = strlen(root);

		if (strncmp(path, root, rootlen) != 0)
			continue;

		/* Require a full component match, not just a prefix. */
		if (path[rootlen] == '/')
			return (path + rootlen + 1);
	}

	return (path);
}
164
/*
 * Read the first line of a sysfs file into an allocated buffer and remove
 * the trailing newline, if any.
 *
 * This is useful for reading sysfs files that return a single string.  At
 * most sizeof (buf) - 1 bytes are read.  Returns an allocated string on
 * success, or NULL if the file cannot be opened, nothing can be read from
 * it, or the allocation fails.  The returned buffer must be freed by the
 * caller.
 */
static char *
zfs_read_sysfs_file(char *filepath)
{
	char buf[4096];	/* all sysfs files report 4k size */
	char *str = NULL;
	FILE *fp;

	fp = fopen(filepath, "r");
	if (fp == NULL)
		return (NULL);

	if (fgets(buf, sizeof (buf), fp) != NULL) {
		size_t len = strlen(buf);

		/*
		 * Remove the last newline (if any).  The length check
		 * matters when the data starts with a NUL byte: strlen()
		 * is then 0 and buf[len - 1] would be out of bounds.
		 */
		if (len > 0 && buf[len - 1] == '\n')
			buf[len - 1] = '\0';

		str = strdup(buf);
	}

	(void) fclose(fp);

	return (str);
}
198
/*
 * Given a dev name like "nvme0n1", return the full PCI slot sysfs path to
 * the drive (in /sys/bus/pci/slots).
 *
 * For example:
 *	dev_name: "nvme0n1"
 *	returns:  "/sys/bus/pci/slots/0"
 *
 * 'dev_name' may carry a directory prefix (like "/dev/"), which is ignored.
 * The remaining name must begin with "nvme"; anything else yields NULL.
 *
 * Returned string must be freed.  Returns NULL on error or when no slot
 * with a matching address is found.
 */
static char *
zfs_get_pci_slots_sys_path(const char *dev_name)
{
	DIR *dp = NULL;
	struct dirent *ep;
	char *address1 = NULL;
	char *address2 = NULL;
	char *path = NULL;
	char buf[MAXPATHLEN];
	const char *slash;
	char *dot;

	/* If they preface 'dev' with a path (like "/dev") then strip it off */
	slash = strrchr(dev_name, '/');
	if (slash != NULL)
		dev_name = slash + 1;	/* +1 since we want the chr after '/' */

	if (strncmp("nvme", dev_name, 4) != 0)
		return (NULL);

	(void) snprintf(buf, sizeof (buf), "/sys/block/%s/device/address",
	    dev_name);

	address1 = zfs_read_sysfs_file(buf);
	if (address1 == NULL)
		return (NULL);

	/*
	 * /sys/block/nvme0n1/device/address format will
	 * be "0000:01:00.0" while /sys/bus/pci/slots/0/address will be
	 * "0000:01:00".  Just NUL terminate at the '.' so they match.
	 */
	dot = strrchr(address1, '.');
	if (dot != NULL)
		*dot = '\0';

	dp = opendir("/sys/bus/pci/slots/");
	if (dp == NULL) {
		free(address1);
		return (NULL);
	}

	/*
	 * Look through all the /sys/bus/pci/slots/ subdirs
	 */
	while ((ep = readdir(dp)) != NULL) {
		int match;

		/*
		 * We only care about directory names that are a single number.
		 * Sometimes there's other directories like
		 * "/sys/bus/pci/slots/0-3/" in there - skip those.
		 */
		if (!zfs_isnumber(ep->d_name))
			continue;

		(void) snprintf(buf, sizeof (buf),
		    "/sys/bus/pci/slots/%s/address", ep->d_name);

		address2 = zfs_read_sysfs_file(buf);
		if (address2 == NULL)
			continue;

		match = (strcmp(address1, address2) == 0);
		free(address2);
		if (!match)
			continue;

		/* Addresses match, build the slot path and we're done. */
		if (asprintf(&path, "/sys/bus/pci/slots/%s",
		    ep->d_name) == -1) {
			/*
			 * If asprintf() fails, 'path' is undefined.  Reset
			 * it so a garbage pointer is never returned if the
			 * loop ends without another match.
			 */
			path = NULL;
			continue;
		}
		break;
	}

	(void) closedir(dp);
	free(address1);

	return (path);
}
288
/*
 * Given a dev name like "sda", return the full enclosure sysfs path to
 * the disk.  The name may also be passed with a directory prepended to it
 * (like "/dev/sda"); only the part after the last '/' is used.  This works
 * for both JBODs and NVMe PCI devices.
 *
 * For example, disk "sda" in enclosure slot 1:
 *	dev_name: "sda"
 *	returns:  "/sys/class/enclosure/1:0:3:0/Slot 1"
 *
 * Or:
 *
 *	dev_name: "nvme0n1"
 *	returns:  "/sys/bus/pci/slots/0"
 *
 * 'dev_name' must be a non-devicemapper device.
 *
 * Returned string must be freed.  Returns NULL on error, or when neither
 * an enclosure link nor a PCI slot can be found for the device.
 */
char *
zfs_get_enclosure_sysfs_path(const char *dev_name)
{
	DIR *dirp = NULL;
	struct dirent *entry;
	char target[MAXPATHLEN];
	char *dev_dir = NULL;
	char *link_path = NULL;
	char *path = NULL;
	const char *slash;
	int rc;

	if (dev_name == NULL)
		return (NULL);

	/* Keep only the final component when a directory was given. */
	slash = strrchr(dev_name, '/');
	if (slash != NULL)
		dev_name = slash + 1;

	rc = asprintf(&dev_dir, "/sys/block/%s/device", dev_name);
	if (rc == -1 || dev_dir == NULL) {
		/* asprintf() leaves its output undefined on failure. */
		dev_dir = NULL;
		goto end;
	}

	dirp = opendir(dev_dir);
	if (dirp == NULL)
		goto end;

	/*
	 * Walk every entry in /sys/block/<dev>/device looking for the
	 * enclosure symlink.
	 */
	while ((entry = readdir(dirp)) != NULL) {
		ssize_t nread;
		char *encl;

		/* Anything that is not an enclosure_device link is noise. */
		if (strstr(entry->d_name, "enclosure_device") == NULL)
			continue;

		free(link_path);
		if (asprintf(&link_path, "%s/%s", dev_dir,
		    entry->d_name) == -1) {
			link_path = NULL;
			break;
		}

		nread = readlink(link_path, target, sizeof (target));

		/* Stop if readlink() failed or the target was cropped. */
		if (nread < 0 || (size_t)nread >= sizeof (target))
			break;

		/* readlink() does not terminate the string for us. */
		target[nread] = '\0';

		/*
		 * The link target looks like:
		 *
		 * "../../../../port-11:1:2/..STUFF../enclosure/1:0:3:0/SLOT 1"
		 *
		 * Only the "enclosure/1:0:3:0/SLOT 1" tail is of interest.
		 */
		encl = strstr(target, "enclosure");
		if (encl == NULL)
			break;

		free(path);
		if (asprintf(&path, "/sys/class/%s", encl) == -1) {
			/* If asprintf() fails, 'path' is undefined */
			path = NULL;
			break;
		}
	}

end:
	free(link_path);
	free(dev_dir);

	if (dirp != NULL)
		closedir(dirp);

	if (path == NULL) {
		/*
		 * This particular disk isn't in a JBOD.  It could be an NVMe
		 * drive.  If so, look up the NVMe device's path in
		 * /sys/bus/pci/slots/.  Within that directory is an
		 * 'attention' file which controls the NVMe fault LED.
		 */
		path = zfs_get_pci_slots_sys_path(dev_name);
	}

	return (path);
}
405
/*
 * Allocate and return the underlying device name for a device mapper device.
 *
 * For example, dm_name = "/dev/dm-0" could return "/dev/sda".  Symlinks to a
 * DM device (like /dev/disk/by-vdev/A0) are also allowed.
 *
 * If the DM device has multiple underlying devices (like with multipath
 * DM devices), then favor underlying devices that have a symlink back to
 * their enclosure device in sysfs.  This will be useful for the zedlet
 * scripts that toggle the fault LED.
 *
 * Returns an underlying device name, or NULL on error or no match.  If
 * dm_name is not a DM device then return NULL.
 *
 * NOTE: The returned name string must be *freed*.
 */
static char *
dm_get_underlying_path(const char *dm_name)
{
	DIR *dp = NULL;
	struct dirent *ep;
	char *realp;
	char *slash;
	char *slaves_dir = NULL;
	char *path = NULL;
	const char *dev_str;
	char *first_path = NULL;
	char *enclosure_path;

	if (dm_name == NULL)
		return (NULL);

	/* dm name may be a symlink (like /dev/disk/by-vdev/A0) */
	realp = realpath(dm_name, NULL);
	if (realp == NULL)
		return (NULL);

	/*
	 * If they preface 'dev' with a path (like "/dev") then strip it off.
	 * We just want the 'dm-N' part.  With no '/' present the whole
	 * string is the name; never hand a NULL to the "%s" below.
	 */
	slash = strrchr(realp, '/');
	dev_str = (slash != NULL) ? slash + 1 : realp;

	if (asprintf(&slaves_dir, "/sys/block/%s/slaves/", dev_str) == -1) {
		/* If asprintf() fails, its output pointer is undefined */
		slaves_dir = NULL;
		goto end;
	}

	dp = opendir(slaves_dir);
	if (dp == NULL)
		goto end;

	/*
	 * A device-mapper device can have multiple paths to it (multipath).
	 * Favor paths that have a symlink back to their enclosure device.
	 * We have to do this since some enclosures may only provide a symlink
	 * back for one underlying path to a disk and not the other.
	 *
	 * If no paths have links back to their enclosure, then just return the
	 * first path.
	 */
	while ((ep = readdir(dp)) != NULL) {
		if (ep->d_type == DT_DIR)	/* skip "." and ".." dirs */
			continue;

		/* Remember the first candidate as the fallback answer. */
		if (first_path == NULL)
			first_path = strdup(ep->d_name);

		enclosure_path = zfs_get_enclosure_sysfs_path(ep->d_name);
		if (enclosure_path == NULL)
			continue;

		if (asprintf(&path, "/dev/%s", ep->d_name) == -1)
			path = NULL;
		free(enclosure_path);
		break;
	}

end:
	if (dp != NULL)
		closedir(dp);
	free(slaves_dir);
	free(realp);

	if (path == NULL && first_path != NULL) {
		/*
		 * None of the underlying paths had a link back to their
		 * enclosure devices.  Throw up our hands and return the first
		 * underlying path.
		 */
		if (asprintf(&path, "/dev/%s", first_path) == -1)
			path = NULL;
	}

	free(first_path);
	return (path);
}
507
508 /*
509 * Return B_TRUE if device is a device mapper or multipath device.
510 * Return B_FALSE if not.
511 */
512 boolean_t
zfs_dev_is_dm(const char * dev_name)513 zfs_dev_is_dm(const char *dev_name)
514 {
515
516 char *tmp;
517 tmp = dm_get_underlying_path(dev_name);
518 if (tmp == NULL)
519 return (B_FALSE);
520
521 free(tmp);
522 return (B_TRUE);
523 }
524
525 /*
526 * By "whole disk" we mean an entire physical disk (something we can
527 * label, toggle the write cache on, etc.) as opposed to the full
528 * capacity of a pseudo-device such as lofi or did. We act as if we
529 * are labeling the disk, which should be a pretty good test of whether
530 * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if
531 * it isn't.
532 */
533 boolean_t
zfs_dev_is_whole_disk(const char * dev_name)534 zfs_dev_is_whole_disk(const char *dev_name)
535 {
536 struct dk_gpt *label = NULL;
537 int fd;
538
539 if ((fd = open(dev_name, O_RDONLY | O_DIRECT | O_CLOEXEC)) < 0)
540 return (B_FALSE);
541
542 if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) {
543 (void) close(fd);
544 return (B_FALSE);
545 }
546
547 efi_free(label);
548 (void) close(fd);
549
550 return (B_TRUE);
551 }
552
/*
 * Look up the underlying device for a device name.
 *
 * Often you'll have a symlink to a device, a partition device,
 * or a multipath device, and want to look up the underlying device.
 * This function returns the underlying device name.  If the device
 * name is already the underlying device, then the same name is returned.
 * For a DM device the name comes from dm_get_underlying_path().
 *
 * For example:
 *
 * 1. /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 -> ../../sda
 *	dev_name: /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001
 *	returns:  /dev/sda
 *
 * 2. /dev/mapper/mpatha (made up of /dev/sda and /dev/sdb)
 *	dev_name: /dev/mapper/mpatha
 *	returns:  /dev/sda (first device)
 *
 * 3. /dev/sda (already the underlying device)
 *	dev_name: /dev/sda
 *	returns:  /dev/sda
 *
 * 4. /dev/dm-3 (mapped to /dev/sda)
 *	dev_name: /dev/dm-3
 *	returns:  /dev/sda
 *
 * 5. /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 -> ../../sdb9
 *	dev_name: /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9
 *	returns:  /dev/sdb
 *
 * 6. /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a -> ../../sda2
 *	dev_name: /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a
 *	returns:  /dev/sda
 *
 * Returns underlying device name, or NULL on error or no match.
 *
 * NOTE: The returned name string must be *freed*.
 */
char *
zfs_get_underlying_path(const char *dev_name)
{
	char *resolved;
	char *stripped;

	if (dev_name == NULL)
		return (NULL);

	/* Try the device-mapper lookup first ... */
	resolved = dm_get_underlying_path(dev_name);

	/* ... and fall back to plain symlink resolution for non-DM names. */
	if (resolved == NULL)
		resolved = realpath(dev_name, NULL);

	if (resolved == NULL)
		return (NULL);

	/* Drop any partition suffix from the resolved device node. */
	stripped = zfs_strip_partition_path(resolved);
	free(resolved);

	return (stripped);
}
615
616
617 #ifdef HAVE_LIBUDEV
618
619 /*
620 * A disk is considered a multipath whole disk when:
621 * DEVNAME key value has "dm-"
622 * DM_UUID key exists and starts with 'mpath-'
623 * ID_PART_TABLE_TYPE key does not exist or is not gpt
624 * ID_FS_LABEL key does not exist (disk isn't labeled)
625 */
626 static boolean_t
is_mpath_udev_sane(struct udev_device * dev)627 is_mpath_udev_sane(struct udev_device *dev)
628 {
629 const char *devname, *type, *uuid, *label;
630
631 devname = udev_device_get_property_value(dev, "DEVNAME");
632 type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
633 uuid = udev_device_get_property_value(dev, "DM_UUID");
634 label = udev_device_get_property_value(dev, "ID_FS_LABEL");
635
636 if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) &&
637 ((type == NULL) || (strcmp(type, "gpt") != 0)) &&
638 ((uuid != NULL) && (strncmp(uuid, "mpath-", 6) == 0)) &&
639 (label == NULL)) {
640 return (B_TRUE);
641 }
642
643 return (B_FALSE);
644 }
645
646 /*
647 * Check if a disk is a multipath "blank" disk:
648 *
649 * 1. The disk has udev values that suggest it's a multipath disk
650 * 2. The disk is not currently labeled with a filesystem of any type
651 * 3. There are no partitions on the disk
652 */
653 boolean_t
is_mpath_whole_disk(const char * path)654 is_mpath_whole_disk(const char *path)
655 {
656 struct udev *udev;
657 struct udev_device *dev = NULL;
658 char nodepath[MAXPATHLEN];
659 char *sysname;
660
661 if (realpath(path, nodepath) == NULL)
662 return (B_FALSE);
663 sysname = strrchr(nodepath, '/') + 1;
664 if (strncmp(sysname, "dm-", 3) != 0)
665 return (B_FALSE);
666 if ((udev = udev_new()) == NULL)
667 return (B_FALSE);
668 if ((dev = udev_device_new_from_subsystem_sysname(udev, "block",
669 sysname)) == NULL) {
670 udev_device_unref(dev);
671 return (B_FALSE);
672 }
673
674 /* Sanity check some udev values */
675 boolean_t is_sane = is_mpath_udev_sane(dev);
676 udev_device_unref(dev);
677
678 return (is_sane);
679 }
680
681 #else /* HAVE_LIBUDEV */
682
683 boolean_t
is_mpath_whole_disk(const char * path)684 is_mpath_whole_disk(const char *path)
685 {
686 (void) path;
687 return (B_FALSE);
688 }
689
690 #endif /* HAVE_LIBUDEV */
691