xref: /freebsd/usr.sbin/makefs/zfs.c (revision cd4b9dac1a0dc6b868aa4376ac355aaf25430a77)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2022 The FreeBSD Foundation
5  *
6  * This software was developed by Mark Johnston under sponsorship from
7  * the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions are
11  * met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/param.h>
32 #include <sys/errno.h>
33 #include <sys/queue.h>
34 
35 #include <assert.h>
36 #include <ctype.h>
37 #include <fcntl.h>
38 #include <stdalign.h>
39 #include <stdbool.h>
40 #include <stddef.h>
41 #include <stdint.h>
42 #include <stdlib.h>
43 #include <string.h>
44 #include <unistd.h>
45 
46 #include <util.h>
47 
48 #include "makefs.h"
49 #include "zfs.h"
50 
51 #define	VDEV_LABEL_SPACE	\
52 	((off_t)(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE))
53 _Static_assert(VDEV_LABEL_SPACE <= MINDEVSIZE, "");
54 
55 #define	MINMSSIZE		((off_t)1 << 24) /* 16MB */
56 #define	DFLTMSSIZE		((off_t)1 << 29) /* 512MB */
57 #define	MAXMSSIZE		((off_t)1 << 34) /* 16GB */
58 
59 #define	INDIR_LEVELS		6
60 /* Indirect blocks are always 128KB. */
61 #define	BLKPTR_PER_INDIR	(MAXBLOCKSIZE / sizeof(blkptr_t))
62 
struct dnode_cursor {
	char		inddir[INDIR_LEVELS][MAXBLOCKSIZE]; /* pending indirect block per level */
	off_t		indloc;		/* next on-disk location for indirect blocks */
	off_t		indspace;	/* remaining pre-allocated indirect block space */
	dnode_phys_t	*dnode;		/* dnode whose block pointers are being filled */
	off_t		dataoff;	/* offset of the most recently visited data block */
	off_t		datablksz;	/* data block size for this dnode */
};
71 
void
zfs_prep_opts(fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	size_t align;

	/*
	 * The option structure embeds 64-bit fields, so allocate it with
	 * explicit uint64_t alignment.  aligned_alloc() requires the size to
	 * be a multiple of the alignment, hence the roundup2().
	 */
	align = alignof(uint64_t);
	zfs = aligned_alloc(align, roundup2(sizeof(*zfs), align));
	if (zfs == NULL)
		err(1, "aligned_alloc");
	memset(zfs, 0, sizeof(*zfs));

	/* Table of "-o" options understood by the ZFS backend. */
	const option_t zfs_options[] = {
		{ '\0', "bootfs", &zfs->bootfs, OPT_STRPTR,
		  0, 0, "Bootable dataset" },
		{ '\0', "mssize", &zfs->mssize, OPT_INT64,
		  MINMSSIZE, MAXMSSIZE, "Metaslab size" },
		{ '\0', "poolguid", &zfs->poolguid, OPT_INT64,
		  0, INT64_MAX, "ZFS pool GUID" },
		{ '\0', "poolname", &zfs->poolname, OPT_STRPTR,
		  0, 0, "ZFS pool name" },
		{ '\0', "rootpath", &zfs->rootpath, OPT_STRPTR,
		  0, 0, "Prefix for all dataset mount points" },
		{ '\0', "ashift", &zfs->ashift, OPT_INT32,
		  MINBLOCKSHIFT, MAXBLOCKSHIFT, "ZFS pool ashift" },
		{ '\0', "verify-txgs", &zfs->verify_txgs, OPT_BOOL,
		  0, 0, "Make OpenZFS verify data upon import" },
		{ '\0', "nowarn", &zfs->nowarn, OPT_BOOL,
		  0, 0, "Provided for backwards compatibility, ignored" },
		{ .name = NULL }
	};

	STAILQ_INIT(&zfs->datasetdescs);

	fsopts->fs_specific = zfs;
	fsopts->fs_options = copy_opts(zfs_options);
}
109 
110 int
zfs_parse_opts(const char * option,fsinfo_t * fsopts)111 zfs_parse_opts(const char *option, fsinfo_t *fsopts)
112 {
113 	zfs_opt_t *zfs;
114 	struct dataset_desc *dsdesc;
115 	char buf[BUFSIZ], *opt, *val;
116 	int rv;
117 
118 	zfs = fsopts->fs_specific;
119 
120 	opt = val = estrdup(option);
121 	opt = strsep(&val, "=");
122 	if (strcmp(opt, "fs") == 0) {
123 		if (val == NULL)
124 			errx(1, "invalid filesystem parameters `%s'", option);
125 
126 		/*
127 		 * Dataset descriptions will be parsed later, in dsl_init().
128 		 * Just stash them away for now.
129 		 */
130 		dsdesc = ecalloc(1, sizeof(*dsdesc));
131 		dsdesc->params = estrdup(val);
132 		free(opt);
133 		STAILQ_INSERT_TAIL(&zfs->datasetdescs, dsdesc, next);
134 		return (1);
135 	}
136 	free(opt);
137 
138 	rv = set_option(fsopts->fs_options, option, buf, sizeof(buf));
139 	return (rv == -1 ? 0 : 1);
140 }
141 
static void
zfs_size_vdev(fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	off_t asize, mssize, vdevsize, vdevsize1;

	zfs = fsopts->fs_specific;

	assert(fsopts->maxsize != 0);
	assert(zfs->ashift != 0);

	/*
	 * Figure out how big the vdev should be.  It must hold a whole number
	 * of sectors, i.e., be a multiple of 1 << ashift.
	 */
	vdevsize = rounddown2(fsopts->maxsize, 1 << zfs->ashift);
	if (vdevsize < MINDEVSIZE)
		errx(1, "maximum image size is too small");
	if (vdevsize < fsopts->minsize || vdevsize > fsopts->maxsize) {
		errx(1, "image size bounds must be multiples of %d",
		    1 << zfs->ashift);
	}
	/* The allocatable size excludes the front and back vdev labels. */
	asize = vdevsize - VDEV_LABEL_SPACE;

	/*
	 * Size metaslabs according to the following heuristic:
	 * - provide at least 8 metaslabs,
	 * - without using a metaslab size larger than 512MB.
	 * This approximates what OpenZFS does without being complicated.  In
	 * practice we expect pools to be expanded upon first use, and OpenZFS
	 * does not resize metaslabs in that case, so there is no right answer
	 * here.  In general we want to provide large metaslabs even if the
	 * image size is small, and 512MB is a reasonable size for pools up to
	 * several hundred gigabytes.
	 *
	 * The user may override this heuristic using the "-o mssize" option.
	 */
	mssize = zfs->mssize;
	if (mssize == 0) {
		mssize = MAX(MIN(asize / 8, DFLTMSSIZE), MINMSSIZE);
		/* Round the heuristic result down to a power of 2. */
		if (!powerof2(mssize))
			mssize = 1l << (flsll(mssize) - 1);
	}
	if (!powerof2(mssize))
		errx(1, "metaslab size must be a power of 2");

	/*
	 * If we have some slop left over, try to cover it by resizing the vdev,
	 * subject to the maxsize and minsize parameters.
	 */
	if (asize % mssize != 0) {
		vdevsize1 = rounddown2(asize, mssize) + VDEV_LABEL_SPACE;
		if (vdevsize1 < fsopts->minsize)
			vdevsize1 = roundup2(asize, mssize) + VDEV_LABEL_SPACE;
		if (vdevsize1 <= fsopts->maxsize)
			vdevsize = vdevsize1;
	}
	asize = vdevsize - VDEV_LABEL_SPACE;

	/* Publish the final geometry for the rest of the build. */
	zfs->asize = asize;
	zfs->vdevsize = vdevsize;
	zfs->mssize = mssize;
	zfs->msshift = flsll(mssize) - 1;
	zfs->mscount = asize / mssize;
}
206 
207 /*
208  * Validate options and set some default values.
209  */
210 static void
zfs_check_opts(fsinfo_t * fsopts)211 zfs_check_opts(fsinfo_t *fsopts)
212 {
213 	zfs_opt_t *zfs;
214 
215 	zfs = fsopts->fs_specific;
216 
217 	if (fsopts->offset != 0)
218 		errx(1, "unhandled offset option");
219 	if (fsopts->maxsize == 0)
220 		errx(1, "an image size must be specified");
221 
222 	if (zfs->poolname == NULL)
223 		errx(1, "a pool name must be specified");
224 	if (!isalpha(zfs->poolname[0]))
225 		errx(1, "the pool name must begin with a letter");
226 	for (size_t i = 0, len = strlen(zfs->poolname); i < len; i++) {
227 		if (!isalnum(zfs->poolname[i]) && zfs->poolname[i] != '_')
228 			errx(1, "invalid character '%c' in pool name",
229 			    zfs->poolname[i]);
230 	}
231 	if (strcmp(zfs->poolname, "mirror") == 0 ||
232 	    strcmp(zfs->poolname, "raidz") == 0 ||
233 	    strcmp(zfs->poolname, "draid") == 0) {
234 		errx(1, "pool name '%s' is reserved and cannot be used",
235 		    zfs->poolname);
236 	}
237 
238 	if (zfs->rootpath == NULL)
239 		easprintf(&zfs->rootpath, "/%s", zfs->poolname);
240 	if (zfs->rootpath[0] != '/')
241 		errx(1, "mountpoint `%s' must be absolute", zfs->rootpath);
242 
243 	if (zfs->ashift == 0)
244 		zfs->ashift = 12;
245 
246 	zfs_size_vdev(fsopts);
247 }
248 
249 void
zfs_cleanup_opts(fsinfo_t * fsopts)250 zfs_cleanup_opts(fsinfo_t *fsopts)
251 {
252 	struct dataset_desc *d, *tmp;
253 	zfs_opt_t *zfs;
254 
255 	zfs = fsopts->fs_specific;
256 	free(zfs->rootpath);
257 	free(zfs->bootfs);
258 	free(__DECONST(void *, zfs->poolname));
259 	STAILQ_FOREACH_SAFE(d, &zfs->datasetdescs, next, tmp) {
260 		free(d->params);
261 		free(d);
262 	}
263 	free(zfs);
264 	free(fsopts->fs_options);
265 }
266 
/*
 * Return the number of bytes needed to store the packed nvlist: the
 * fixed-size stream header followed by the exported payload.
 */
static size_t
nvlist_size(const nvlist_t *nvl)
{
	return (sizeof(nvl->nv_header) + nvl->nv_size);
}
272 
273 static void
nvlist_copy(const nvlist_t * nvl,char * buf,size_t sz)274 nvlist_copy(const nvlist_t *nvl, char *buf, size_t sz)
275 {
276 	assert(sz >= nvlist_size(nvl));
277 
278 	memcpy(buf, &nvl->nv_header, sizeof(nvl->nv_header));
279 	memcpy(buf + sizeof(nvl->nv_header), nvl->nv_data, nvl->nv_size);
280 }
281 
282 /*
283  * Avoid returning a GUID of 0, just to avoid the possibility that something
284  * will interpret that as meaning that the GUID is uninitialized.
285  */
uint64_t
randomguid(void)
{
	uint64_t guid;

	/*
	 * Combine two random() samples into one 64-bit value, retrying until
	 * the result is nonzero.
	 */
	for (guid = 0; guid == 0;)
		guid = ((uint64_t)random() << 32) | random();

	return (guid);
}
297 
/*
 * Build the pool-wide portion of the configuration nvlist: version, state,
 * pool and vdev GUIDs, and an empty features-for-read set.  The caller is
 * responsible for attaching a vdev tree before exporting the list.
 */
static nvlist_t *
pool_config_nvcreate(zfs_opt_t *zfs)
{
	nvlist_t *featuresnv, *poolnv;

	poolnv = nvlist_create(NV_UNIQUE_NAME);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_TXG, TXG);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VERSION, SPA_VERSION);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_EXPORTED);
	nvlist_add_string(poolnv, ZPOOL_CONFIG_POOL_NAME, zfs->poolname);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_GUID, zfs->poolguid);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_TOP_GUID, zfs->vdevguid);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VDEV_CHILDREN, 1);

	/* No optional features are enabled, but the (empty) map must exist. */
	featuresnv = nvlist_create(NV_UNIQUE_NAME);
	nvlist_add_nvlist(poolnv, ZPOOL_CONFIG_FEATURES_FOR_READ, featuresnv);
	nvlist_destroy(featuresnv);

	return (poolnv);
}
319 
/*
 * Describe the pool's single disk vdev: its geometry, GUID, and references to
 * the metaslab (space map) array.  A placeholder device path is used since
 * the image is not bound to a real device at build time.
 */
static nvlist_t *
pool_disk_vdev_config_nvcreate(zfs_opt_t *zfs)
{
	nvlist_t *diskvdevnv;

	/* The space map object array must have been allocated already. */
	assert(zfs->objarrid != 0);

	diskvdevnv = nvlist_create(NV_UNIQUE_NAME);
	nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASHIFT, zfs->ashift);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASIZE, zfs->asize);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ID, 0);
	nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_PATH, "/dev/null");
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_WHOLE_DISK, 1);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_ARRAY,
	    zfs->objarrid);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_SHIFT,
	    zfs->msshift);

	return (diskvdevnv);
}
343 
/*
 * Wrap the disk vdev configuration in a "root" meta-vdev node, the form in
 * which the vdev tree is stored in the MOS config object.
 */
static nvlist_t *
pool_root_vdev_config_nvcreate(zfs_opt_t *zfs)
{
	nvlist_t *diskvdevnv, *rootvdevnv;

	diskvdevnv = pool_disk_vdev_config_nvcreate(zfs);
	rootvdevnv = nvlist_create(NV_UNIQUE_NAME);

	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_ID, 0);
	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_GUID, zfs->poolguid);
	nvlist_add_string(rootvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
	/* The disk vdev is the root's only child. */
	nvlist_add_nvlist_array(rootvdevnv, ZPOOL_CONFIG_CHILDREN, &diskvdevnv,
	    1);
	nvlist_destroy(diskvdevnv);

	return (rootvdevnv);
}
362 
363 /*
364  * Create the pool's "config" object, which contains an nvlist describing pool
365  * parameters and the vdev topology.  It is similar but not identical to the
366  * nvlist stored in vdev labels.  The main difference is that vdev labels do not
367  * describe the full vdev tree and in particular do not contain the "root"
368  * meta-vdev.
369  */
static void
pool_init_objdir_config(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	dnode_phys_t *dnode;
	nvlist_t *poolconfig, *vdevconfig;
	void *configbuf;
	uint64_t dnid;
	off_t configloc, configblksz;
	int error;

	/* The dnode's bonus buffer records the packed nvlist's size. */
	dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_PACKED_NVLIST,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof(uint64_t), &dnid);

	poolconfig = pool_config_nvcreate(zfs);

	vdevconfig = pool_root_vdev_config_nvcreate(zfs);
	nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
	nvlist_destroy(vdevconfig);

	error = nvlist_export(poolconfig);
	if (error != 0)
		errc(1, error, "nvlist_export");

	/*
	 * Write the packed nvlist as this object's single data block.
	 * objset_space_alloc() can adjust the block size, so the buffer is
	 * only allocated afterwards.
	 */
	configblksz = nvlist_size(poolconfig);
	configloc = objset_space_alloc(zfs, zfs->mos, &configblksz);
	configbuf = ecalloc(1, configblksz);
	nvlist_copy(poolconfig, configbuf, configblksz);

	vdev_pwrite_dnode_data(zfs, dnode, configbuf, configblksz, configloc);

	dnode->dn_datablkszsec = configblksz >> MINBLOCKSHIFT;
	dnode->dn_flags = DNODE_FLAG_USED_BYTES;
	*(uint64_t *)DN_BONUS(dnode) = nvlist_size(poolconfig);

	/* Make the config object reachable from the object directory. */
	zap_add_uint64(objdir, DMU_POOL_CONFIG, dnid);

	nvlist_destroy(poolconfig);
	free(configbuf);
}
409 
410 /*
411  * Add objects block pointer list objects, used for deferred frees.  We don't do
412  * anything with them, but they need to be present or OpenZFS will refuse to
413  * import the pool.
414  */
415 static void
pool_init_objdir_bplists(zfs_opt_t * zfs __unused,zfs_zap_t * objdir)416 pool_init_objdir_bplists(zfs_opt_t *zfs __unused, zfs_zap_t *objdir)
417 {
418 	uint64_t dnid;
419 
420 	(void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
421 	    BPOBJ_SIZE_V2, &dnid);
422 	zap_add_uint64(objdir, DMU_POOL_FREE_BPOBJ, dnid);
423 
424 	(void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
425 	    BPOBJ_SIZE_V2, &dnid);
426 	zap_add_uint64(objdir, DMU_POOL_SYNC_BPLIST, dnid);
427 }
428 
429 /*
430  * Add required feature metadata objects.  We don't know anything about ZFS
431  * features, so the objects are just empty ZAPs.
432  */
static void
pool_init_objdir_feature_maps(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	dnode_phys_t *dnode;
	uint64_t dnid;

	/* Each feature map is an empty ZAP, written out immediately. */
	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_READ, dnid);
	zap_write(zfs, zap_alloc(zfs->mos, dnode));

	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_WRITE, dnid);
	zap_write(zfs, zap_alloc(zfs->mos, dnode));

	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FEATURE_DESCRIPTIONS, dnid);
	zap_write(zfs, zap_alloc(zfs->mos, dnode));
}
451 
/*
 * Record the root DSL directory in the object directory; it is the entry
 * point to all dataset metadata.
 */
static void
pool_init_objdir_dsl(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	zap_add_uint64(objdir, DMU_POOL_ROOT_DATASET,
	    dsl_dir_id(zfs->rootdsldir));
}
458 
459 static void
pool_init_objdir_poolprops(zfs_opt_t * zfs,zfs_zap_t * objdir)460 pool_init_objdir_poolprops(zfs_opt_t *zfs, zfs_zap_t *objdir)
461 {
462 	dnode_phys_t *dnode;
463 	uint64_t id;
464 
465 	dnode = objset_dnode_alloc(zfs->mos, DMU_OT_POOL_PROPS, &id);
466 	zap_add_uint64(objdir, DMU_POOL_PROPS, id);
467 
468 	zfs->poolprops = zap_alloc(zfs->mos, dnode);
469 }
470 
471 /*
472  * Initialize the MOS object directory, the root of virtually all of the pool's
473  * data and metadata.
474  */
static void
pool_init_objdir(zfs_opt_t *zfs)
{
	zfs_zap_t *zap;
	dnode_phys_t *objdir;

	/* The directory dnode itself was allocated in pool_init(). */
	objdir = objset_dnode_lookup(zfs->mos, DMU_POOL_DIRECTORY_OBJECT);

	/* Populate the directory ZAP, then write it out in one shot. */
	zap = zap_alloc(zfs->mos, objdir);
	pool_init_objdir_config(zfs, zap);
	pool_init_objdir_bplists(zfs, zap);
	pool_init_objdir_feature_maps(zfs, zap);
	pool_init_objdir_dsl(zfs, zap);
	pool_init_objdir_poolprops(zfs, zap);
	zap_write(zfs, zap);
}
491 
492 /*
493  * Initialize the meta-object set (MOS) and immediately write out several
494  * special objects whose contents are already finalized, including the object
495  * directory.
496  *
497  * Once the MOS is finalized, it'll look roughly like this:
498  *
499  *	object directory (ZAP)
500  *	|-> vdev config object (nvlist)
501  *	|-> features for read
502  *	|-> features for write
503  *	|-> feature descriptions
504  *	|-> sync bplist
505  *	|-> free bplist
506  *	|-> pool properties
507  *	L-> root DSL directory
508  *	    |-> DSL child directory (ZAP)
509  *	    |   |-> $MOS (DSL dir)
510  *	    |   |   |-> child map
511  *	    |   |   L-> props (ZAP)
512  *	    |   |-> $FREE (DSL dir)
513  *	    |   |   |-> child map
514  *	    |   |   L-> props (ZAP)
515  *	    |   |-> $ORIGIN (DSL dir)
516  *	    |   |   |-> child map
517  *	    |   |   |-> dataset
518  *	    |   |   |   L-> deadlist
519  *	    |   |   |-> snapshot
520  *	    |   |   |   |-> deadlist
521  *	    |   |   |   L-> snapshot names
522  *	    |   |   |-> props (ZAP)
523  *	    |   |   L-> clones (ZAP)
524  *	    |   |-> dataset 1 (DSL dir)
525  *	    |   |   |-> DSL dataset
526  *	    |   |   |   |-> snapshot names
527  *	    |   |   |   L-> deadlist
528  *	    |   |   |-> child map
529  *	    |   |   |   L-> ...
530  *	    |   |   L-> props
531  *	    |   |-> dataset 2
532  *	    |   |   L-> ...
533  *	    |   |-> ...
534  *	    |   L-> dataset n
535  *	    |-> DSL root dataset
536  *	    |   |-> snapshot names
537  *	    |   L-> deadlist
538  *	    L-> props (ZAP)
539  *	space map object array
540  *	|-> space map 1
541  *	|-> space map 2
542  *	|-> ...
543  *	L-> space map n (zfs->mscount)
544  *
545  * The space map object array is pointed to by the "msarray" property in the
546  * pool configuration.
547  */
static void
pool_init(zfs_opt_t *zfs)
{
	uint64_t dnid;

	/* The pool GUID may be fixed with "-o poolguid"; the vdev's is not. */
	if (zfs->poolguid == 0)
		zfs->poolguid = randomguid();
	zfs->vdevguid = randomguid();

	zfs->mos = objset_alloc(zfs, DMU_OST_META);

	/*
	 * The object directory must be the first object allocated in the MOS:
	 * it is looked up at the fixed ID DMU_POOL_DIRECTORY_OBJECT.
	 */
	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_DIRECTORY, &dnid);
	assert(dnid == DMU_POOL_DIRECTORY_OBJECT);

	/* Space map object array, referenced by the vdev configuration. */
	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_ARRAY, &zfs->objarrid);

	dsl_init(zfs);

	pool_init_objdir(zfs);
}
568 
/*
 * Construct the vdev label -- a packed config nvlist plus a ring of
 * uberblocks -- and write all of its copies onto the image.
 */
static void
pool_labels_write(zfs_opt_t *zfs)
{
	uberblock_t *ub;
	vdev_label_t *label;
	nvlist_t *poolconfig, *vdevconfig;
	int error;

	label = ecalloc(1, sizeof(*label));

	/*
	 * Assemble the vdev configuration and store it in the label.
	 */
	poolconfig = pool_config_nvcreate(zfs);
	vdevconfig = pool_disk_vdev_config_nvcreate(zfs);
	nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
	nvlist_destroy(vdevconfig);

	error = nvlist_export(poolconfig);
	if (error != 0)
		errc(1, error, "nvlist_export");
	nvlist_copy(poolconfig, label->vl_vdev_phys.vp_nvlist,
	    sizeof(label->vl_vdev_phys.vp_nvlist));
	nvlist_destroy(poolconfig);

	/*
	 * Fill out the uberblock.  Just make each one the same.  The embedded
	 * checksum is calculated in vdev_label_write().
	 */
	for (size_t uoff = 0; uoff < sizeof(label->vl_uberblock);
	    uoff += (1 << zfs->ashift)) {
		ub = (uberblock_t *)(&label->vl_uberblock[0] + uoff);
		ub->ub_magic = UBERBLOCK_MAGIC;
		ub->ub_version = SPA_VERSION;

		/*
		 * Upon import, OpenZFS will perform metadata verification of
		 * the last TXG by default.  If all data is written in the same
		 * TXG, it'll all get verified, which can be painfully slow in
		 * some cases, e.g., initial boot in a cloud environment with
		 * slow storage.  So, fabricate additional TXGs to avoid this
		 * overhead, unless the user requests otherwise.
		 */
		ub->ub_txg = TXG;
		if (!zfs->verify_txgs)
			ub->ub_txg += TXG_SIZE;
		ub->ub_guid_sum = zfs->poolguid + zfs->vdevguid;
		ub->ub_timestamp = 0;

		ub->ub_software_version = SPA_VERSION;
		ub->ub_mmp_magic = MMP_MAGIC;
		ub->ub_mmp_delay = 0;
		ub->ub_mmp_config = 0;
		ub->ub_checkpoint_txg = 0;
		/* Each uberblock points at the MOS root block. */
		objset_root_blkptr_copy(zfs->mos, &ub->ub_rootbp);
	}

	/*
	 * Write out four copies of the label: two at the beginning of the vdev
	 * and two at the end.
	 */
	for (int i = 0; i < VDEV_LABELS; i++)
		vdev_label_write(zfs, i, label);

	free(label);
}
635 
/*
 * Finalize the pool.  Order matters here: the labels embed the MOS root
 * block pointer, so they must be written after the MOS itself.
 */
static void
pool_fini(zfs_opt_t *zfs)
{
	zap_write(zfs, zfs->poolprops);
	dsl_write(zfs);
	objset_write(zfs, zfs->mos);
	pool_labels_write(zfs);
}
644 
/*
 * Set up a cursor for writing out a dnode's data blocks in order.  "size" is
 * the object's total size and "blksz" its data block size; pass blksz == 0 to
 * have a suitable power-of-2 size chosen here.  Space for all indirect blocks
 * is allocated up-front.
 */
struct dnode_cursor *
dnode_cursor_init(zfs_opt_t *zfs, zfs_objset_t *os, dnode_phys_t *dnode,
    off_t size, off_t blksz)
{
	struct dnode_cursor *c;
	uint64_t nbppindir, indlevel, ndatablks, nindblks;

	/* The cursor only handles dnodes with a single block pointer. */
	assert(dnode->dn_nblkptr == 1);
	assert(blksz <= MAXBLOCKSIZE);

	if (blksz == 0) {
		/* Must be between 1<<ashift and 128KB. */
		blksz = MIN(MAXBLOCKSIZE, MAX(1 << zfs->ashift,
		    powerof2(size) ? size : (1l << flsll(size))));
	}
	assert(powerof2(blksz));

	/*
	 * Do we need indirect blocks?  Figure out how many levels are needed
	 * (indlevel == 1 means no indirect blocks) and how much space is needed
	 * (it has to be allocated up-front to break the dependency cycle
	 * described in objset_write()).
	 */
	ndatablks = size == 0 ? 0 : howmany(size, blksz);
	nindblks = 0;
	for (indlevel = 1, nbppindir = 1; ndatablks > nbppindir; indlevel++) {
		nbppindir *= BLKPTR_PER_INDIR;
		nindblks += howmany(ndatablks, indlevel * nbppindir);
	}
	assert(indlevel < INDIR_LEVELS);

	dnode->dn_nlevels = (uint8_t)indlevel;
	dnode->dn_maxblkid = ndatablks > 0 ? ndatablks - 1 : 0;
	dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;

	c = ecalloc(1, sizeof(*c));
	if (nindblks > 0) {
		c->indspace = nindblks * MAXBLOCKSIZE;
		c->indloc = objset_space_alloc(zfs, os, &c->indspace);
	}
	c->dnode = dnode;
	c->dataoff = 0;
	c->datablksz = blksz;

	return (c);
}
691 
/*
 * Write out the lowest "levels" levels of the cursor's pending indirect
 * blocks, recording each block's pointer in its parent -- or in the dnode
 * itself when flushing the top of the tree.
 */
static void
_dnode_cursor_flush(zfs_opt_t *zfs, struct dnode_cursor *c, unsigned int levels)
{
	blkptr_t *bp, *pbp;
	void *buf;
	uint64_t fill;
	off_t blkid, blksz, loc;

	assert(levels > 0);
	assert(levels <= c->dnode->dn_nlevels - 1U);

	/* Indirect blocks are always 128KB. */
	blksz = MAXBLOCKSIZE;
	blkid = (c->dataoff / c->datablksz) / BLKPTR_PER_INDIR;
	for (unsigned int level = 1; level <= levels; level++) {
		buf = c->inddir[level - 1];

		if (level == c->dnode->dn_nlevels - 1U) {
			/* Top of the tree: the parent is the dnode. */
			pbp = &c->dnode->dn_blkptr[0];
		} else {
			uint64_t iblkid;

			/* Index of this block's slot in its parent. */
			iblkid = blkid & (BLKPTR_PER_INDIR - 1);
			pbp = (blkptr_t *)
			    &c->inddir[level][iblkid * sizeof(blkptr_t)];
		}

		/*
		 * Space for indirect blocks is allocated up-front; see the
		 * comment in objset_write().
		 */
		loc = c->indloc;
		c->indloc += blksz;
		assert(c->indspace >= blksz);
		c->indspace -= blksz;

		/* Sum the fill counts of all block pointers below us. */
		bp = buf;
		fill = 0;
		for (size_t i = 0; i < BLKPTR_PER_INDIR; i++)
			fill += BP_GET_FILL(&bp[i]);

		vdev_pwrite_dnode_indir(zfs, c->dnode, level, fill, buf, blksz,
		    loc, pbp);
		/* Reset this level's buffer for the next run of blkptrs. */
		memset(buf, 0, MAXBLOCKSIZE);

		blkid /= BLKPTR_PER_INDIR;
	}
}
739 
/*
 * Return the block pointer slot for the data block at offset "off", first
 * flushing any indirect blocks completed by advancing past them.  The cursor
 * assumes offsets are visited in increasing order.
 */
blkptr_t *
dnode_cursor_next(zfs_opt_t *zfs, struct dnode_cursor *c, off_t off)
{
	off_t blkid, l1id;
	unsigned int levels;

	if (c->dnode->dn_nlevels == 1) {
		/* No indirect blocks; the dnode points at the data directly. */
		assert(off < MAXBLOCKSIZE);
		return (&c->dnode->dn_blkptr[0]);
	}

	assert(off % c->datablksz == 0);

	/* Do we need to flush any full indirect blocks? */
	if (off > 0) {
		blkid = off / c->datablksz;
		/* Count how many levels just crossed a block boundary. */
		for (levels = 0; levels < c->dnode->dn_nlevels - 1U; levels++) {
			if (blkid % BLKPTR_PER_INDIR != 0)
				break;
			blkid /= BLKPTR_PER_INDIR;
		}
		if (levels > 0)
			_dnode_cursor_flush(zfs, c, levels);
	}

	c->dataoff = off;
	/* Slot index within the level-1 indirect block. */
	l1id = (off / c->datablksz) & (BLKPTR_PER_INDIR - 1);
	return ((blkptr_t *)&c->inddir[0][l1id * sizeof(blkptr_t)]);
}
769 
770 void
dnode_cursor_finish(zfs_opt_t * zfs,struct dnode_cursor * c)771 dnode_cursor_finish(zfs_opt_t *zfs, struct dnode_cursor *c)
772 {
773 	unsigned int levels;
774 
775 	assert(c->dnode->dn_nlevels > 0);
776 	levels = c->dnode->dn_nlevels - 1;
777 	if (levels > 0)
778 		_dnode_cursor_flush(zfs, c, levels);
779 	assert(c->indspace == 0);
780 	free(c);
781 }
782 
/*
 * makefs(8) entry point for ZFS: build a pool image at "image" from the
 * staged file tree rooted at "dir"/"root".
 */
void
zfs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	int dirfd;

	zfs = fsopts->fs_specific;

	/*
	 * Use a fixed seed to provide reproducible pseudo-random numbers for
	 * on-disk structures when needed (e.g., GUIDs, ZAP hash salts).
	 */
	srandom(1729);

	zfs_check_opts(fsopts);

	dirfd = open(dir, O_DIRECTORY | O_RDONLY);
	if (dirfd < 0)
		err(1, "open(%s)", dir);

	/* Lay down the pool skeleton, copy in the files, then finalize. */
	vdev_init(zfs, image);
	pool_init(zfs);
	fs_build(zfs, dirfd, root);
	pool_fini(zfs);
	vdev_fini(zfs);
}
809