xref: /freebsd/sys/contrib/openzfs/lib/libzfs/os/linux/libzfs_pool_os.c (revision 61145dc2b94f12f6a47344fb9aac702321880e43)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
25  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
26  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
27  * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
28  * Copyright (c) 2018 Datto Inc.
29  * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
30  * Copyright (c) 2017, Intel Corporation.
31  * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>
32  */
33 
34 #include <errno.h>
35 #include <libintl.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <unistd.h>
40 #include <libgen.h>
41 #include <zone.h>
42 #include <sys/stat.h>
43 #include <sys/efi_partition.h>
44 #include <sys/systeminfo.h>
45 #include <sys/zfs_ioctl.h>
46 #include <sys/vdev_disk.h>
47 #include <dlfcn.h>
48 #include <libzutil.h>
49 
50 #include "zfs_namecheck.h"
51 #include "zfs_prop.h"
52 #include "../../libzfs_impl.h"
53 #include "zfs_comutil.h"
54 #include "zfeature_common.h"
55 
56 /*
57  * If the device has being dynamically expanded then we need to relabel
58  * the disk to use the new unallocated space.
59  */
60 int
zpool_relabel_disk(libzfs_handle_t * hdl,const char * path,const char * msg)61 zpool_relabel_disk(libzfs_handle_t *hdl, const char *path, const char *msg)
62 {
63 	int fd, error;
64 
65 	if ((fd = open(path, O_RDWR|O_DIRECT|O_CLOEXEC)) < 0) {
66 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
67 		    "relabel '%s': unable to open device: %d"), path, errno);
68 		return (zfs_error(hdl, EZFS_OPENFAILED, msg));
69 	}
70 
71 	/*
72 	 * It's possible that we might encounter an error if the device
73 	 * does not have any unallocated space left. If so, we simply
74 	 * ignore that error and continue on.
75 	 */
76 	error = efi_use_whole_disk(fd);
77 
78 	/* Flush the buffers to disk and invalidate the page cache. */
79 	(void) fsync(fd);
80 	(void) ioctl(fd, BLKFLSBUF);
81 
82 	(void) close(fd);
83 	if (error && error != VT_ENOSPC) {
84 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
85 		    "relabel '%s': unable to read disk capacity"), path);
86 		return (zfs_error(hdl, EZFS_NOCAP, msg));
87 	}
88 	return (0);
89 }
90 
91 /*
92  * Read the EFI label from the config, if a label does not exist then
93  * pass back the error to the caller. If the caller has passed a non-NULL
94  * diskaddr argument then we set it to the starting address of the EFI
95  * partition.
96  */
97 static int
read_efi_label(nvlist_t * config,diskaddr_t * sb)98 read_efi_label(nvlist_t *config, diskaddr_t *sb)
99 {
100 	const char *path;
101 	int fd;
102 	char diskname[MAXPATHLEN];
103 	int err = -1;
104 
105 	if (nvlist_lookup_string(config, ZPOOL_CONFIG_PATH, &path) != 0)
106 		return (err);
107 
108 	(void) snprintf(diskname, sizeof (diskname), "%s%s", DISK_ROOT,
109 	    strrchr(path, '/'));
110 	if ((fd = open(diskname, O_RDONLY|O_DIRECT|O_CLOEXEC)) >= 0) {
111 		struct dk_gpt *vtoc;
112 
113 		if ((err = efi_alloc_and_read(fd, &vtoc)) >= 0) {
114 			if (sb != NULL)
115 				*sb = vtoc->efi_parts[0].p_start;
116 			efi_free(vtoc);
117 		}
118 		(void) close(fd);
119 	}
120 	return (err);
121 }
122 
123 /*
124  * determine where a partition starts on a disk in the current
125  * configuration
126  */
127 static diskaddr_t
find_start_block(nvlist_t * config)128 find_start_block(nvlist_t *config)
129 {
130 	nvlist_t **child;
131 	uint_t c, children;
132 	diskaddr_t sb = MAXOFFSET_T;
133 	uint64_t wholedisk;
134 
135 	if (nvlist_lookup_nvlist_array(config,
136 	    ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) {
137 		if (nvlist_lookup_uint64(config,
138 		    ZPOOL_CONFIG_WHOLE_DISK,
139 		    &wholedisk) != 0 || !wholedisk) {
140 			return (MAXOFFSET_T);
141 		}
142 		if (read_efi_label(config, &sb) < 0)
143 			sb = MAXOFFSET_T;
144 		return (sb);
145 	}
146 
147 	for (c = 0; c < children; c++) {
148 		sb = find_start_block(child[c]);
149 		if (sb != MAXOFFSET_T) {
150 			return (sb);
151 		}
152 	}
153 	return (MAXOFFSET_T);
154 }
155 
156 static int
zpool_label_disk_check(char * path)157 zpool_label_disk_check(char *path)
158 {
159 	struct dk_gpt *vtoc;
160 	int fd, err;
161 
162 	if ((fd = open(path, O_RDONLY|O_DIRECT|O_CLOEXEC)) < 0)
163 		return (errno);
164 
165 	if ((err = efi_alloc_and_read(fd, &vtoc)) != 0) {
166 		(void) close(fd);
167 		return (err);
168 	}
169 
170 	if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) {
171 		efi_free(vtoc);
172 		(void) close(fd);
173 		return (EIDRM);
174 	}
175 
176 	efi_free(vtoc);
177 	(void) close(fd);
178 	return (0);
179 }
180 
181 /*
182  * Generate a unique partition name for the ZFS member.  Partitions must
183  * have unique names to ensure udev will be able to create symlinks under
184  * /dev/disk/by-partlabel/ for all pool members.  The partition names are
185  * of the form <pool>-<unique-id>.
186  */
187 static void
zpool_label_name(char * label_name,int label_size)188 zpool_label_name(char *label_name, int label_size)
189 {
190 	uint64_t id = 0;
191 	int fd;
192 
193 	fd = open("/dev/urandom", O_RDONLY|O_CLOEXEC);
194 	if (fd >= 0) {
195 		if (read(fd, &id, sizeof (id)) != sizeof (id))
196 			id = 0;
197 
198 		close(fd);
199 	}
200 
201 	if (id == 0)
202 		id = (((uint64_t)rand()) << 32) | (uint64_t)rand();
203 
204 	snprintf(label_name, label_size, "zfs-%016llx", (u_longlong_t)id);
205 }
206 
207 /*
208  * Label an individual disk.  The name provided is the short name,
209  * stripped of any leading /dev path.
210  */
211 int
zpool_label_disk(libzfs_handle_t * hdl,zpool_handle_t * zhp,const char * name)212 zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name)
213 {
214 	char path[MAXPATHLEN];
215 	struct dk_gpt *vtoc;
216 	int rval, fd;
217 	size_t resv = EFI_MIN_RESV_SIZE;
218 	uint64_t slice_size;
219 	diskaddr_t start_block;
220 	char errbuf[ERRBUFLEN];
221 
222 	/* prepare an error message just in case */
223 	(void) snprintf(errbuf, sizeof (errbuf),
224 	    dgettext(TEXT_DOMAIN, "cannot label '%s'"), name);
225 
226 	if (zhp) {
227 		nvlist_t *nvroot = fnvlist_lookup_nvlist(zhp->zpool_config,
228 		    ZPOOL_CONFIG_VDEV_TREE);
229 
230 		if (zhp->zpool_start_block == 0)
231 			start_block = find_start_block(nvroot);
232 		else
233 			start_block = zhp->zpool_start_block;
234 		zhp->zpool_start_block = start_block;
235 	} else {
236 		/* new pool */
237 		start_block = NEW_START_BLOCK;
238 	}
239 
240 	(void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name);
241 
242 	if ((fd = open(path, O_RDWR|O_DIRECT|O_EXCL|O_CLOEXEC)) < 0) {
243 		/*
244 		 * This shouldn't happen.  We've long since verified that this
245 		 * is a valid device.
246 		 */
247 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
248 		    "label '%s': unable to open device: %d"), path, errno);
249 		return (zfs_error(hdl, EZFS_OPENFAILED, errbuf));
250 	}
251 
252 	if (efi_alloc_and_init(fd, EFI_NUMPAR, &vtoc) != 0) {
253 		/*
254 		 * The only way this can fail is if we run out of memory, or we
255 		 * were unable to read the disk's capacity
256 		 */
257 		if (errno == ENOMEM)
258 			(void) no_memory(hdl);
259 
260 		(void) close(fd);
261 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
262 		    "label '%s': unable to read disk capacity"), path);
263 
264 		return (zfs_error(hdl, EZFS_NOCAP, errbuf));
265 	}
266 
267 	slice_size = vtoc->efi_last_u_lba + 1;
268 	slice_size -= EFI_MIN_RESV_SIZE;
269 	if (start_block == MAXOFFSET_T)
270 		start_block = NEW_START_BLOCK;
271 	slice_size -= start_block;
272 	slice_size = P2ALIGN_TYPED(slice_size, PARTITION_END_ALIGNMENT,
273 	    uint64_t);
274 
275 	vtoc->efi_parts[0].p_start = start_block;
276 	vtoc->efi_parts[0].p_size = slice_size;
277 
278 	if (vtoc->efi_parts[0].p_size * vtoc->efi_lbasize < SPA_MINDEVSIZE) {
279 		(void) close(fd);
280 		efi_free(vtoc);
281 
282 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
283 		    "label '%s': partition would be less than the minimum "
284 		    "device size (64M)"), path);
285 		return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
286 	}
287 
288 	/*
289 	 * Why we use V_USR: V_BACKUP confuses users, and is considered
290 	 * disposable by some EFI utilities (since EFI doesn't have a backup
291 	 * slice).  V_UNASSIGNED is supposed to be used only for zero size
292 	 * partitions, and efi_write() will fail if we use it.
293 	 * Other available types were all pretty specific.
294 	 * V_USR is as close to reality as we
295 	 * can get, in the absence of V_OTHER.
296 	 */
297 	vtoc->efi_parts[0].p_tag = V_USR;
298 	zpool_label_name(vtoc->efi_parts[0].p_name, EFI_PART_NAME_LEN);
299 
300 	vtoc->efi_parts[8].p_start = slice_size + start_block;
301 	vtoc->efi_parts[8].p_size = resv;
302 	vtoc->efi_parts[8].p_tag = V_RESERVED;
303 
304 	rval = efi_write(fd, vtoc);
305 
306 	/* Flush the buffers to disk and invalidate the page cache. */
307 	(void) fsync(fd);
308 	(void) ioctl(fd, BLKFLSBUF);
309 
310 	if (rval == 0)
311 		rval = efi_rescan(fd);
312 
313 	/*
314 	 * Some block drivers (like pcata) may not support EFI GPT labels.
315 	 * Print out a helpful error message directing the user to manually
316 	 * label the disk and give a specific slice.
317 	 */
318 	if (rval != 0) {
319 		(void) close(fd);
320 		efi_free(vtoc);
321 
322 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "try using "
323 		    "parted(8) and then provide a specific slice: %d"), rval);
324 		return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
325 	}
326 
327 	(void) close(fd);
328 	efi_free(vtoc);
329 
330 	(void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name);
331 	(void) zfs_append_partition(path, MAXPATHLEN);
332 
333 	/* Wait to udev to signal use the device has settled. */
334 	rval = zpool_label_disk_wait(path, DISK_LABEL_WAIT);
335 	if (rval) {
336 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to "
337 		    "detect device partitions on '%s': %d"), path, rval);
338 		return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
339 	}
340 
341 	/* We can't be to paranoid.  Read the label back and verify it. */
342 	(void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name);
343 	rval = zpool_label_disk_check(path);
344 	if (rval) {
345 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "freshly written "
346 		    "EFI label on '%s' is damaged.  Ensure\nthis device "
347 		    "is not in use, and is functioning properly: %d"),
348 		    path, rval);
349 		return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
350 	}
351 	return (0);
352 }
353