xref: /freebsd/sys/contrib/openzfs/cmd/zpool/os/linux/zpool_vdev_os.c (revision 61145dc2b94f12f6a47344fb9aac702321880e43)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25  * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
26  * Copyright (c) 2016, 2017 Intel Corporation.
27  * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
28  */
29 
30 /*
31  * Functions to convert between a list of vdevs and an nvlist representing the
32  * configuration.  Each entry in the list can be one of:
33  *
34  * 	Device vdevs
35  * 		disk=(path=..., devid=...)
36  * 		file=(path=...)
37  *
38  * 	Group vdevs
39  * 		raidz[1|2]=(...)
40  * 		mirror=(...)
41  *
42  * 	Hot spares
43  *
44  * While the underlying implementation supports it, group vdevs cannot contain
45  * other group vdevs.  All userland verification of devices is contained within
46  * this file.  If successful, the nvlist returned can be passed directly to the
47  * kernel; we've done as much verification as possible in userland.
48  *
49  * Hot spares are a special case, and passed down as an array of disk vdevs, at
50  * the same level as the root of the vdev tree.
51  *
52  * The only function exported by this file is 'make_root_vdev'.  The
53  * function performs several passes:
54  *
55  * 	1. Construct the vdev specification.  Performs syntax validation and
56  *         makes sure each device is valid.
 * 	2. Check for devices in use.  Uses libblkid to make sure that none of
 *         the devices are already in use.  Some conflicts can be overridden
 *         using the 'force' flag, others cannot.
 * 	3. Check for replication errors if the 'force' flag is not specified.
 *         Validates that the replication level is consistent across the
 *         entire pool.
63  * 	4. Call libzfs to label any whole disks with an EFI label.
64  */
65 
66 #include <assert.h>
67 #include <ctype.h>
68 #include <errno.h>
69 #include <fcntl.h>
70 #include <libintl.h>
71 #include <libnvpair.h>
72 #include <libzutil.h>
73 #include <limits.h>
74 #include <sys/spa.h>
75 #include <stdio.h>
76 #include <string.h>
77 #include <unistd.h>
78 #include "zpool_util.h"
79 #include <sys/zfs_context.h>
80 
81 #include <scsi/scsi.h>
82 #include <scsi/sg.h>
83 #include <sys/efi_partition.h>
84 #include <sys/stat.h>
85 #include <sys/mntent.h>
86 #include <uuid/uuid.h>
87 #include <blkid/blkid.h>
88 
/*
 * One row of the sector size override table: an identification string and
 * the sector size to use for a device whose identification matches it.
 */
typedef struct vdev_disk_db_entry {
	char id[24];		/* exactly 24 bytes; no room for a NUL */
	int sector_size;	/* sector size to use on a match */
} vdev_disk_db_entry_t;
94 
95 /*
96  * Database of block devices that lie about physical sector sizes.  The
97  * identification string must be precisely 24 characters to avoid false
98  * negatives
99  */
100 static vdev_disk_db_entry_t vdev_disk_database[] = {
101 	{"ATA     ADATA SSD S396 3", 8192},
102 	{"ATA     APPLE SSD SM128E", 8192},
103 	{"ATA     APPLE SSD SM256E", 8192},
104 	{"ATA     APPLE SSD SM512E", 8192},
105 	{"ATA     APPLE SSD SM768E", 8192},
106 	{"ATA     C400-MTFDDAC064M", 8192},
107 	{"ATA     C400-MTFDDAC128M", 8192},
108 	{"ATA     C400-MTFDDAC256M", 8192},
109 	{"ATA     C400-MTFDDAC512M", 8192},
110 	{"ATA     Corsair Force 3 ", 8192},
111 	{"ATA     Corsair Force GS", 8192},
112 	{"ATA     INTEL SSDSA2CT04", 8192},
113 	{"ATA     INTEL SSDSA2BZ10", 8192},
114 	{"ATA     INTEL SSDSA2BZ20", 8192},
115 	{"ATA     INTEL SSDSA2BZ30", 8192},
116 	{"ATA     INTEL SSDSA2CW04", 8192},
117 	{"ATA     INTEL SSDSA2CW08", 8192},
118 	{"ATA     INTEL SSDSA2CW12", 8192},
119 	{"ATA     INTEL SSDSA2CW16", 8192},
120 	{"ATA     INTEL SSDSA2CW30", 8192},
121 	{"ATA     INTEL SSDSA2CW60", 8192},
122 	{"ATA     INTEL SSDSC2CT06", 8192},
123 	{"ATA     INTEL SSDSC2CT12", 8192},
124 	{"ATA     INTEL SSDSC2CT18", 8192},
125 	{"ATA     INTEL SSDSC2CT24", 8192},
126 	{"ATA     INTEL SSDSC2CW06", 8192},
127 	{"ATA     INTEL SSDSC2CW12", 8192},
128 	{"ATA     INTEL SSDSC2CW18", 8192},
129 	{"ATA     INTEL SSDSC2CW24", 8192},
130 	{"ATA     INTEL SSDSC2CW48", 8192},
131 	{"ATA     KINGSTON SH100S3", 8192},
132 	{"ATA     KINGSTON SH103S3", 8192},
133 	{"ATA     M4-CT064M4SSD2  ", 8192},
134 	{"ATA     M4-CT128M4SSD2  ", 8192},
135 	{"ATA     M4-CT256M4SSD2  ", 8192},
136 	{"ATA     M4-CT512M4SSD2  ", 8192},
137 	{"ATA     OCZ-AGILITY2    ", 8192},
138 	{"ATA     OCZ-AGILITY3    ", 8192},
139 	{"ATA     OCZ-VERTEX2 3.5 ", 8192},
140 	{"ATA     OCZ-VERTEX3     ", 8192},
141 	{"ATA     OCZ-VERTEX3 LT  ", 8192},
142 	{"ATA     OCZ-VERTEX3 MI  ", 8192},
143 	{"ATA     OCZ-VERTEX4     ", 8192},
144 	{"ATA     SAMSUNG MZ7WD120", 8192},
145 	{"ATA     SAMSUNG MZ7WD240", 8192},
146 	{"ATA     SAMSUNG MZ7WD480", 8192},
147 	{"ATA     SAMSUNG MZ7WD960", 8192},
148 	{"ATA     SAMSUNG SSD 830 ", 8192},
149 	{"ATA     Samsung SSD 840 ", 8192},
150 	{"ATA     SanDisk SSD U100", 8192},
151 	{"ATA     TOSHIBA THNSNH06", 8192},
152 	{"ATA     TOSHIBA THNSNH12", 8192},
153 	{"ATA     TOSHIBA THNSNH25", 8192},
154 	{"ATA     TOSHIBA THNSNH51", 8192},
155 	{"ATA     APPLE SSD TS064C", 4096},
156 	{"ATA     APPLE SSD TS128C", 4096},
157 	{"ATA     APPLE SSD TS256C", 4096},
158 	{"ATA     APPLE SSD TS512C", 4096},
159 	{"ATA     INTEL SSDSA2M040", 4096},
160 	{"ATA     INTEL SSDSA2M080", 4096},
161 	{"ATA     INTEL SSDSA2M160", 4096},
162 	{"ATA     INTEL SSDSC2MH12", 4096},
163 	{"ATA     INTEL SSDSC2MH25", 4096},
164 	{"ATA     OCZ CORE_SSD    ", 4096},
165 	{"ATA     OCZ-VERTEX      ", 4096},
166 	{"ATA     SAMSUNG MCCOE32G", 4096},
167 	{"ATA     SAMSUNG MCCOE64G", 4096},
168 	{"ATA     SAMSUNG SSD PM80", 4096},
169 	/* Flash drives optimized for 4KB IOs on larger pages */
170 	{"ATA     INTEL SSDSC2BA10", 4096},
171 	{"ATA     INTEL SSDSC2BA20", 4096},
172 	{"ATA     INTEL SSDSC2BA40", 4096},
173 	{"ATA     INTEL SSDSC2BA80", 4096},
174 	{"ATA     INTEL SSDSC2BB08", 4096},
175 	{"ATA     INTEL SSDSC2BB12", 4096},
176 	{"ATA     INTEL SSDSC2BB16", 4096},
177 	{"ATA     INTEL SSDSC2BB24", 4096},
178 	{"ATA     INTEL SSDSC2BB30", 4096},
179 	{"ATA     INTEL SSDSC2BB40", 4096},
180 	{"ATA     INTEL SSDSC2BB48", 4096},
181 	{"ATA     INTEL SSDSC2BB60", 4096},
182 	{"ATA     INTEL SSDSC2BB80", 4096},
183 	{"ATA     INTEL SSDSC2BW24", 4096},
184 	{"ATA     INTEL SSDSC2BW48", 4096},
185 	{"ATA     INTEL SSDSC2BP24", 4096},
186 	{"ATA     INTEL SSDSC2BP48", 4096},
187 	{"NA      SmrtStorSDLKAE9W", 4096},
188 	{"NVMe    Amazon EC2 NVMe ", 4096},
189 	/* Imported from Open Solaris */
190 	{"ATA     MARVELL SD88SA02", 4096},
191 	/* Advanced format Hard drives */
192 	{"ATA     Hitachi HDS5C303", 4096},
193 	{"ATA     SAMSUNG HD204UI ", 4096},
194 	{"ATA     ST2000DL004 HD20", 4096},
195 	{"ATA     WDC WD10EARS-00M", 4096},
196 	{"ATA     WDC WD10EARS-00S", 4096},
197 	{"ATA     WDC WD10EARS-00Z", 4096},
198 	{"ATA     WDC WD15EARS-00M", 4096},
199 	{"ATA     WDC WD15EARS-00S", 4096},
200 	{"ATA     WDC WD15EARS-00Z", 4096},
201 	{"ATA     WDC WD20EARS-00M", 4096},
202 	{"ATA     WDC WD20EARS-00S", 4096},
203 	{"ATA     WDC WD20EARS-00Z", 4096},
204 	{"ATA     WDC WD1600BEVT-0", 4096},
205 	{"ATA     WDC WD2500BEVT-0", 4096},
206 	{"ATA     WDC WD3200BEVT-0", 4096},
207 	{"ATA     WDC WD5000BEVT-0", 4096},
208 };
209 
210 
211 #define	INQ_REPLY_LEN	96
212 #define	INQ_CMD_LEN	6
213 
214 static const int vdev_disk_database_size =
215 	sizeof (vdev_disk_database) / sizeof (vdev_disk_database[0]);
216 
217 boolean_t
check_sector_size_database(char * path,int * sector_size)218 check_sector_size_database(char *path, int *sector_size)
219 {
220 	unsigned char inq_buff[INQ_REPLY_LEN];
221 	unsigned char sense_buffer[32];
222 	unsigned char inq_cmd_blk[INQ_CMD_LEN] =
223 	    {INQUIRY, 0, 0, 0, INQ_REPLY_LEN, 0};
224 	sg_io_hdr_t io_hdr;
225 	int error;
226 	int fd;
227 	int i;
228 
229 	/* Prepare INQUIRY command */
230 	memset(&io_hdr, 0, sizeof (sg_io_hdr_t));
231 	io_hdr.interface_id = 'S';
232 	io_hdr.cmd_len = sizeof (inq_cmd_blk);
233 	io_hdr.mx_sb_len = sizeof (sense_buffer);
234 	io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
235 	io_hdr.dxfer_len = INQ_REPLY_LEN;
236 	io_hdr.dxferp = inq_buff;
237 	io_hdr.cmdp = inq_cmd_blk;
238 	io_hdr.sbp = sense_buffer;
239 	io_hdr.timeout = 10;		/* 10 milliseconds is ample time */
240 
241 	if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)
242 		return (B_FALSE);
243 
244 	error = ioctl(fd, SG_IO, (unsigned long) &io_hdr);
245 
246 	(void) close(fd);
247 
248 	if (error < 0)
249 		return (B_FALSE);
250 
251 	if ((io_hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK)
252 		return (B_FALSE);
253 
254 	for (i = 0; i < vdev_disk_database_size; i++) {
255 		if (memcmp(inq_buff + 8, vdev_disk_database[i].id, 24))
256 			continue;
257 
258 		*sector_size = vdev_disk_database[i].sector_size;
259 		return (B_TRUE);
260 	}
261 
262 	return (B_FALSE);
263 }
264 
265 static int
check_slice(const char * path,blkid_cache cache,int force,boolean_t isspare)266 check_slice(const char *path, blkid_cache cache, int force, boolean_t isspare)
267 {
268 	int err;
269 	char *value;
270 
271 	/* No valid type detected device is safe to use */
272 	value = blkid_get_tag_value(cache, "TYPE", path);
273 	if (value == NULL)
274 		return (0);
275 
276 	/*
277 	 * If libblkid detects a ZFS device, we check the device
278 	 * using check_file() to see if it's safe.  The one safe
279 	 * case is a spare device shared between multiple pools.
280 	 */
281 	if (strcmp(value, "zfs_member") == 0) {
282 		err = check_file(path, force, isspare);
283 	} else {
284 		if (force) {
285 			err = 0;
286 		} else {
287 			err = -1;
288 			vdev_error(gettext("%s contains a filesystem of "
289 			    "type '%s'\n"), path, value);
290 		}
291 	}
292 
293 	free(value);
294 
295 	return (err);
296 }
297 
298 /*
299  * Validate that a disk including all partitions are safe to use.
300  *
301  * For EFI labeled disks this can done relatively easily with the libefi
302  * library.  The partition numbers are extracted from the label and used
303  * to generate the expected /dev/ paths.  Each partition can then be
304  * checked for conflicts.
305  *
306  * For non-EFI labeled disks (MBR/EBR/etc) the same process is possible
307  * but due to the lack of a readily available libraries this scanning is
308  * not implemented.  Instead only the device path as given is checked.
309  */
310 static int
check_disk(const char * path,blkid_cache cache,int force,boolean_t isspare,boolean_t iswholedisk)311 check_disk(const char *path, blkid_cache cache, int force,
312     boolean_t isspare, boolean_t iswholedisk)
313 {
314 	struct dk_gpt *vtoc;
315 	char slice_path[MAXPATHLEN];
316 	int err = 0;
317 	int fd, i;
318 	int flags = O_RDONLY|O_DIRECT;
319 
320 	if (!iswholedisk)
321 		return (check_slice(path, cache, force, isspare));
322 
323 	/* only spares can be shared, other devices require exclusive access */
324 	if (!isspare)
325 		flags |= O_EXCL;
326 
327 	if ((fd = open(path, flags)) < 0) {
328 		char *value = blkid_get_tag_value(cache, "TYPE", path);
329 		(void) fprintf(stderr, gettext("%s is in use and contains "
330 		    "a %s filesystem.\n"), path, value ? value : "unknown");
331 		free(value);
332 		return (-1);
333 	}
334 
335 	/*
336 	 * Expected to fail for non-EFI labeled disks.  Just check the device
337 	 * as given and do not attempt to detect and scan partitions.
338 	 */
339 	err = efi_alloc_and_read(fd, &vtoc);
340 	if (err) {
341 		(void) close(fd);
342 		return (check_slice(path, cache, force, isspare));
343 	}
344 
345 	/*
346 	 * The primary efi partition label is damaged however the secondary
347 	 * label at the end of the device is intact.  Rather than use this
348 	 * label we should play it safe and treat this as a non efi device.
349 	 */
350 	if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) {
351 		efi_free(vtoc);
352 		(void) close(fd);
353 
354 		if (force) {
355 			/* Partitions will now be created using the backup */
356 			return (0);
357 		} else {
358 			vdev_error(gettext("%s contains a corrupt primary "
359 			    "EFI label.\n"), path);
360 			return (-1);
361 		}
362 	}
363 
364 	for (i = 0; i < vtoc->efi_nparts; i++) {
365 
366 		if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED ||
367 		    uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid))
368 			continue;
369 
370 		if (strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0)
371 			(void) snprintf(slice_path, sizeof (slice_path),
372 			    "%s%s%d", path, "-part", i+1);
373 		else
374 			(void) snprintf(slice_path, sizeof (slice_path),
375 			    "%s%s%d", path, isdigit(path[strlen(path)-1]) ?
376 			    "p" : "", i+1);
377 
378 		err = check_slice(slice_path, cache, force, isspare);
379 		if (err)
380 			break;
381 	}
382 
383 	efi_free(vtoc);
384 	(void) close(fd);
385 
386 	return (err);
387 }
388 
389 int
check_device(const char * path,boolean_t force,boolean_t isspare,boolean_t iswholedisk)390 check_device(const char *path, boolean_t force,
391     boolean_t isspare, boolean_t iswholedisk)
392 {
393 	blkid_cache cache;
394 	int error;
395 
396 	error = blkid_get_cache(&cache, NULL);
397 	if (error != 0) {
398 		(void) fprintf(stderr, gettext("unable to access the blkid "
399 		    "cache.\n"));
400 		return (-1);
401 	}
402 
403 	error = check_disk(path, cache, force, isspare, iswholedisk);
404 	blkid_put_cache(cache);
405 
406 	return (error);
407 }
408 
409 void
after_zpool_upgrade(zpool_handle_t * zhp)410 after_zpool_upgrade(zpool_handle_t *zhp)
411 {
412 	(void) zhp;
413 }
414 
415 int
check_file(const char * file,boolean_t force,boolean_t isspare)416 check_file(const char *file, boolean_t force, boolean_t isspare)
417 {
418 	return (check_file_generic(file, force, isspare));
419 }
420 
/*
 * Read the contents of a sysfs file into a freshly allocated string.  A
 * single trailing newline, if present, is stripped.
 *
 * Returns the string on success (the caller must free it), or NULL when the
 * file cannot be opened, stat'ed or read, or when memory is exhausted.
 */
static char *zpool_sysfs_gets(char *path)
{
	struct stat st;
	char *contents = NULL;
	ssize_t nread;
	int fd;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return (NULL);

	if (fstat(fd, &st) != 0)
		goto out;

	/* One extra zeroed byte keeps the result NUL-terminated */
	contents = calloc(st.st_size + 1, sizeof (*contents));
	if (contents == NULL)
		goto out;

	/*
	 * Reading fewer bytes than st_size is fine.  Sysfs files report a
	 * size of 4k even when they only return a short string.
	 */
	nread = read(fd, contents, st.st_size);
	if (nread < 0) {
		/* read() itself failed */
		free(contents);
		contents = NULL;
		goto out;
	}

	/* Strip the trailing newline */
	if (nread > 0 && contents[nread - 1] == '\n')
		contents[nread - 1] = '\0';

out:
	close(fd);

	return (contents);
}
469 
/*
 * Write a string to a sysfs file.
 *
 * Returns 0 on success, non-zero otherwise: -1 when the file cannot be
 * opened for writing, -2 when writing or closing the stream fails.
 */
static int zpool_sysfs_puts(const char *path, const char *str)
{
	FILE *file;
	int rc = 0;

	file = fopen(path, "w");
	if (file == NULL)
		return (-1);

	if (fputs(str, file) < 0)
		rc = -2;

	/*
	 * The stream is buffered, so the data may only reach the file when
	 * the stream is flushed by fclose().  A write that is rejected at
	 * that point is reported here and must not be treated as success.
	 */
	if (fclose(file) != 0)
		rc = -2;

	return (rc);
}
491 
492 /* Given a vdev nvlist_t, rescan its enclosure sysfs path */
493 static void
rescan_vdev_config_dev_sysfs_path(nvlist_t * vdev_nv)494 rescan_vdev_config_dev_sysfs_path(nvlist_t *vdev_nv)
495 {
496 	update_vdev_config_dev_sysfs_path(vdev_nv,
497 	    fnvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_PATH),
498 	    ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
499 }
500 
/*
 * Translate a power string into a numeric state.
 *
 * Returns 0 for the off values "off" and "0", 1 for the on values "on" and
 * "1", and -1 for anything else.  The comparison is exact.
 */
static int zpool_power_parse_value(char *str)
{
	static const struct {
		const char *word;
		int state;
	} known[] = {
		{ "off", 0 },
		{ "0", 0 },
		{ "on", 1 },
		{ "1", 1 },
	};

	for (size_t i = 0; i < sizeof (known) / sizeof (known[0]); i++) {
		if (strcmp(str, known[i].word) == 0)
			return (known[i].state);
	}

	return (-1);
}
515 
516 /*
517  * Given a vdev string return an allocated string containing the sysfs path to
518  * its power control file.  Also do a check if the power control file really
519  * exists and has correct permissions.
520  *
521  * Example returned strings:
522  *
523  * /sys/class/enclosure/0:0:122:0/10/power_status
524  * /sys/bus/pci/slots/10/power
525  *
526  * Returns allocated string on success (which must be freed), NULL on failure.
527  */
528 static char *
zpool_power_sysfs_path(zpool_handle_t * zhp,char * vdev)529 zpool_power_sysfs_path(zpool_handle_t *zhp, char *vdev)
530 {
531 	const char *enc_sysfs_dir = NULL;
532 	char *path = NULL;
533 	nvlist_t *vdev_nv = zpool_find_vdev(zhp, vdev, NULL, NULL, NULL);
534 
535 	if (vdev_nv == NULL) {
536 		return (NULL);
537 	}
538 
539 	/* Make sure we're getting the updated enclosure sysfs path */
540 	rescan_vdev_config_dev_sysfs_path(vdev_nv);
541 
542 	if (nvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
543 	    &enc_sysfs_dir) != 0) {
544 		return (NULL);
545 	}
546 
547 	if (asprintf(&path, "%s/power_status", enc_sysfs_dir) == -1)
548 		return (NULL);
549 
550 	if (access(path, W_OK) != 0) {
551 		free(path);
552 		path = NULL;
553 		/* No HDD 'power_control' file, maybe it's NVMe? */
554 		if (asprintf(&path, "%s/power", enc_sysfs_dir) == -1) {
555 			return (NULL);
556 		}
557 
558 		if (access(path, R_OK | W_OK) != 0) {
559 			/* Not NVMe either */
560 			free(path);
561 			return (NULL);
562 		}
563 	}
564 
565 	return (path);
566 }
567 
568 /*
569  * Given a path to a sysfs power control file, return B_TRUE if you should use
570  * "on/off" words to control it, or B_FALSE otherwise ("0/1" to control).
571  */
572 static boolean_t
zpool_power_use_word(char * sysfs_path)573 zpool_power_use_word(char *sysfs_path)
574 {
575 	if (strcmp(&sysfs_path[strlen(sysfs_path) - strlen("power_status")],
576 	    "power_status") == 0) {
577 		return (B_TRUE);
578 	}
579 	return (B_FALSE);
580 }
581 
582 /*
583  * Check the sysfs power control value for a vdev.
584  *
585  * Returns:
586  *  0 - Power is off
587  *  1 - Power is on
588  * -1 - Error or unsupported
589  */
590 int
zpool_power_current_state(zpool_handle_t * zhp,char * vdev)591 zpool_power_current_state(zpool_handle_t *zhp, char *vdev)
592 {
593 	char *val;
594 	int rc;
595 
596 	char *path = zpool_power_sysfs_path(zhp, vdev);
597 	if (path == NULL)
598 		return (-1);
599 
600 	val = zpool_sysfs_gets(path);
601 	if (val == NULL) {
602 		free(path);
603 		return (-1);
604 	}
605 
606 	rc = zpool_power_parse_value(val);
607 	free(val);
608 	free(path);
609 	return (rc);
610 }
611 
612 /*
613  * Turn on or off the slot to a device
614  *
615  * Device path is the full path to the device (like /dev/sda or /dev/sda1).
616  *
617  * Return code:
618  * 0:		Success
619  * ENOTSUP:	Power control not supported for OS
620  * EBADSLT:	Couldn't read current power state
621  * ENOENT:	No sysfs path to power control
622  * EIO:	Couldn't write sysfs power value
623  * EBADE:	Sysfs power value didn't change
624  */
625 int
zpool_power(zpool_handle_t * zhp,char * vdev,boolean_t turn_on)626 zpool_power(zpool_handle_t *zhp, char *vdev, boolean_t turn_on)
627 {
628 	char *sysfs_path;
629 	const char *val;
630 	int rc;
631 	int timeout_ms;
632 
633 	rc = zpool_power_current_state(zhp, vdev);
634 	if (rc == -1) {
635 		return (EBADSLT);
636 	}
637 
638 	/* Already correct value? */
639 	if (rc == (int)turn_on)
640 		return (0);
641 
642 	sysfs_path = zpool_power_sysfs_path(zhp, vdev);
643 	if (sysfs_path == NULL)
644 		return (ENOENT);
645 
646 	if (zpool_power_use_word(sysfs_path)) {
647 		val = turn_on ? "on" : "off";
648 	} else {
649 		val = turn_on ? "1" : "0";
650 	}
651 
652 	rc = zpool_sysfs_puts(sysfs_path, (char *)val);
653 
654 	free(sysfs_path);
655 	if (rc != 0) {
656 		return (EIO);
657 	}
658 
659 	/*
660 	 * Wait up to 30 seconds for sysfs power value to change after
661 	 * writing it.
662 	 */
663 	timeout_ms = zpool_getenv_int("ZPOOL_POWER_ON_SLOT_TIMEOUT_MS", 30000);
664 	for (int i = 0; i < MAX(1, timeout_ms / 200); i++) {
665 		rc = zpool_power_current_state(zhp, vdev);
666 		if (rc == (int)turn_on)
667 			return (0);	/* success */
668 
669 		fsleep(0.200);	/* 200ms */
670 	}
671 
672 	/* sysfs value never changed */
673 	return (EBADE);
674 }
675