xref: /freebsd/usr.sbin/makefs/zfs.c (revision d4eeb02986980bf33dd56c41ceb9fc5f180c0d47)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2022 The FreeBSD Foundation
5  *
6  * This software was developed by Mark Johnston under sponsorship from
7  * the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions are
11  * met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/param.h>
32 #include <sys/errno.h>
33 #include <sys/queue.h>
34 
35 #include <assert.h>
36 #include <fcntl.h>
37 #include <stdalign.h>
38 #include <stdbool.h>
39 #include <stddef.h>
40 #include <stdlib.h>
41 #include <string.h>
42 #include <unistd.h>
43 
44 #include <util.h>
45 
46 #include "makefs.h"
47 #include "zfs.h"
48 
49 #define	VDEV_LABEL_SPACE	\
50 	((off_t)(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE))
51 _Static_assert(VDEV_LABEL_SPACE <= MINDEVSIZE, "");
52 
53 #define	MINMSSIZE		((off_t)1 << 24) /* 16MB */
54 #define	DFLTMSSIZE		((off_t)1 << 29) /* 512MB */
55 #define	MAXMSSIZE		((off_t)1 << 34) /* 16GB */
56 
57 #define	INDIR_LEVELS		6
58 /* Indirect blocks are always 128KB. */
59 #define	BLKPTR_PER_INDIR	(MAXBLOCKSIZE / sizeof(blkptr_t))
60 
61 struct dnode_cursor {
62 	char		inddir[INDIR_LEVELS][MAXBLOCKSIZE];
63 	off_t		indloc;
64 	off_t		indspace;
65 	dnode_phys_t	*dnode;
66 	off_t		dataoff;
67 	off_t		datablksz;
68 };
69 
70 void
71 zfs_prep_opts(fsinfo_t *fsopts)
72 {
73 	zfs_opt_t *zfs;
74 	size_t align;
75 
76 	align = alignof(uint64_t);
77 	zfs = aligned_alloc(align, roundup2(sizeof(*zfs), align));
78 	if (zfs == NULL)
79 		err(1, "aligned_alloc");
80 	memset(zfs, 0, sizeof(*zfs));
81 
82 	const option_t zfs_options[] = {
83 		{ '\0', "bootfs", &zfs->bootfs, OPT_STRPTR,
84 		  0, 0, "Bootable dataset" },
85 		{ '\0', "mssize", &zfs->mssize, OPT_INT64,
86 		  MINMSSIZE, MAXMSSIZE, "Metaslab size" },
87 		{ '\0', "poolname", &zfs->poolname, OPT_STRPTR,
88 		  0, 0, "ZFS pool name" },
89 		{ '\0', "rootpath", &zfs->rootpath, OPT_STRPTR,
90 		  0, 0, "Prefix for all dataset mount points" },
91 		{ '\0', "ashift", &zfs->ashift, OPT_INT32,
92 		  MINBLOCKSHIFT, MAXBLOCKSHIFT, "ZFS pool ashift" },
93 		{ '\0', "nowarn", &zfs->nowarn, OPT_BOOL,
94 		  0, 0, "Suppress warning about experimental ZFS support" },
95 		{ .name = NULL }
96 	};
97 
98 	STAILQ_INIT(&zfs->datasetdescs);
99 
100 	fsopts->fs_specific = zfs;
101 	fsopts->fs_options = copy_opts(zfs_options);
102 }
103 
104 int
105 zfs_parse_opts(const char *option, fsinfo_t *fsopts)
106 {
107 	zfs_opt_t *zfs;
108 	struct dataset_desc *dsdesc;
109 	char buf[BUFSIZ], *opt, *val;
110 	int rv;
111 
112 	zfs = fsopts->fs_specific;
113 
114 	opt = val = estrdup(option);
115 	opt = strsep(&val, "=");
116 	if (strcmp(opt, "fs") == 0) {
117 		if (val == NULL)
118 			errx(1, "invalid filesystem parameters `%s'", option);
119 
120 		/*
121 		 * Dataset descriptions will be parsed later, in dsl_init().
122 		 * Just stash them away for now.
123 		 */
124 		dsdesc = ecalloc(1, sizeof(*dsdesc));
125 		dsdesc->params = estrdup(val);
126 		free(opt);
127 		STAILQ_INSERT_TAIL(&zfs->datasetdescs, dsdesc, next);
128 		return (1);
129 	}
130 	free(opt);
131 
132 	rv = set_option(fsopts->fs_options, option, buf, sizeof(buf));
133 	return (rv == -1 ? 0 : 1);
134 }
135 
136 static void
137 zfs_size_vdev(fsinfo_t *fsopts)
138 {
139 	zfs_opt_t *zfs;
140 	off_t asize, mssize, vdevsize, vdevsize1;
141 
142 	zfs = fsopts->fs_specific;
143 
144 	assert(fsopts->maxsize != 0);
145 	assert(zfs->ashift != 0);
146 
147 	/*
148 	 * Figure out how big the vdev should be.
149 	 */
150 	vdevsize = rounddown2(fsopts->maxsize, 1 << zfs->ashift);
151 	if (vdevsize < MINDEVSIZE)
152 		errx(1, "maximum image size is too small");
153 	if (vdevsize < fsopts->minsize || vdevsize > fsopts->maxsize) {
154 		errx(1, "image size bounds must be multiples of %d",
155 		    1 << zfs->ashift);
156 	}
157 	asize = vdevsize - VDEV_LABEL_SPACE;
158 
159 	/*
160 	 * Size metaslabs according to the following heuristic:
161 	 * - provide at least 8 metaslabs,
162 	 * - without using a metaslab size larger than 512MB.
163 	 * This approximates what OpenZFS does without being complicated.  In
164 	 * practice we expect pools to be expanded upon first use, and OpenZFS
165 	 * does not resize metaslabs in that case, so there is no right answer
166 	 * here.  In general we want to provide large metaslabs even if the
167 	 * image size is small, and 512MB is a reasonable size for pools up to
168 	 * several hundred gigabytes.
169 	 *
170 	 * The user may override this heuristic using the "-o mssize" option.
171 	 */
172 	mssize = zfs->mssize;
173 	if (mssize == 0) {
174 		mssize = MAX(MIN(asize / 8, DFLTMSSIZE), MINMSSIZE);
175 		if (!powerof2(mssize))
176 			mssize = 1l << (flsll(mssize) - 1);
177 	}
178 	if (!powerof2(mssize))
179 		errx(1, "metaslab size must be a power of 2");
180 
181 	/*
182 	 * If we have some slop left over, try to cover it by resizing the vdev,
183 	 * subject to the maxsize and minsize parameters.
184 	 */
185 	if (asize % mssize != 0) {
186 		vdevsize1 = rounddown2(asize, mssize) + VDEV_LABEL_SPACE;
187 		if (vdevsize1 < fsopts->minsize)
188 			vdevsize1 = roundup2(asize, mssize) + VDEV_LABEL_SPACE;
189 		if (vdevsize1 <= fsopts->maxsize)
190 			vdevsize = vdevsize1;
191 	}
192 	asize = vdevsize - VDEV_LABEL_SPACE;
193 
194 	zfs->asize = asize;
195 	zfs->vdevsize = vdevsize;
196 	zfs->mssize = mssize;
197 	zfs->msshift = flsll(mssize) - 1;
198 	zfs->mscount = asize / mssize;
199 }
200 
201 /*
202  * Validate options and set some default values.
203  */
204 static void
205 zfs_check_opts(fsinfo_t *fsopts)
206 {
207 	zfs_opt_t *zfs;
208 
209 	zfs = fsopts->fs_specific;
210 
211 	if (fsopts->offset != 0)
212 		errx(1, "unhandled offset option");
213 	if (fsopts->maxsize == 0)
214 		errx(1, "an image size must be specified");
215 
216 	if (zfs->poolname == NULL)
217 		errx(1, "a pool name must be specified");
218 
219 	if (zfs->rootpath == NULL)
220 		easprintf(&zfs->rootpath, "/%s", zfs->poolname);
221 	if (zfs->rootpath[0] != '/')
222 		errx(1, "mountpoint `%s' must be absolute", zfs->rootpath);
223 
224 	if (zfs->ashift == 0)
225 		zfs->ashift = 12;
226 
227 	zfs_size_vdev(fsopts);
228 }
229 
230 void
231 zfs_cleanup_opts(fsinfo_t *fsopts)
232 {
233 	struct dataset_desc *d, *tmp;
234 	zfs_opt_t *zfs;
235 
236 	zfs = fsopts->fs_specific;
237 	free(zfs->rootpath);
238 	free(zfs->bootfs);
239 	free(__DECONST(void *, zfs->poolname));
240 	STAILQ_FOREACH_SAFE(d, &zfs->datasetdescs, next, tmp) {
241 		free(d->params);
242 		free(d);
243 	}
244 	free(zfs);
245 	free(fsopts->fs_options);
246 }
247 
248 static size_t
249 nvlist_size(const nvlist_t *nvl)
250 {
251 	return (sizeof(nvl->nv_header) + nvl->nv_size);
252 }
253 
254 static void
255 nvlist_copy(const nvlist_t *nvl, char *buf, size_t sz)
256 {
257 	assert(sz >= nvlist_size(nvl));
258 
259 	memcpy(buf, &nvl->nv_header, sizeof(nvl->nv_header));
260 	memcpy(buf + sizeof(nvl->nv_header), nvl->nv_data, nvl->nv_size);
261 }
262 
263 static nvlist_t *
264 pool_config_nvcreate(zfs_opt_t *zfs)
265 {
266 	nvlist_t *featuresnv, *poolnv;
267 
268 	poolnv = nvlist_create(NV_UNIQUE_NAME);
269 	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_TXG, TXG);
270 	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VERSION, SPA_VERSION);
271 	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_EXPORTED);
272 	nvlist_add_string(poolnv, ZPOOL_CONFIG_POOL_NAME, zfs->poolname);
273 	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_GUID, zfs->poolguid);
274 	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_TOP_GUID, zfs->vdevguid);
275 	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
276 	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VDEV_CHILDREN, 1);
277 
278 	featuresnv = nvlist_create(NV_UNIQUE_NAME);
279 	nvlist_add_nvlist(poolnv, ZPOOL_CONFIG_FEATURES_FOR_READ, featuresnv);
280 	nvlist_destroy(featuresnv);
281 
282 	return (poolnv);
283 }
284 
285 static nvlist_t *
286 pool_disk_vdev_config_nvcreate(zfs_opt_t *zfs)
287 {
288 	nvlist_t *diskvdevnv;
289 
290 	assert(zfs->objarrid != 0);
291 
292 	diskvdevnv = nvlist_create(NV_UNIQUE_NAME);
293 	nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
294 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASHIFT, zfs->ashift);
295 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASIZE, zfs->asize);
296 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
297 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ID, 0);
298 	nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_PATH, "/dev/null");
299 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_WHOLE_DISK, 1);
300 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
301 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_ARRAY,
302 	    zfs->objarrid);
303 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_SHIFT,
304 	    zfs->msshift);
305 
306 	return (diskvdevnv);
307 }
308 
309 static nvlist_t *
310 pool_root_vdev_config_nvcreate(zfs_opt_t *zfs)
311 {
312 	nvlist_t *diskvdevnv, *rootvdevnv;
313 
314 	diskvdevnv = pool_disk_vdev_config_nvcreate(zfs);
315 	rootvdevnv = nvlist_create(NV_UNIQUE_NAME);
316 
317 	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_ID, 0);
318 	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_GUID, zfs->poolguid);
319 	nvlist_add_string(rootvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
320 	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
321 	nvlist_add_nvlist_array(rootvdevnv, ZPOOL_CONFIG_CHILDREN, &diskvdevnv,
322 	    1);
323 	nvlist_destroy(diskvdevnv);
324 
325 	return (rootvdevnv);
326 }
327 
328 /*
329  * Create the pool's "config" object, which contains an nvlist describing pool
330  * parameters and the vdev topology.  It is similar but not identical to the
331  * nvlist stored in vdev labels.  The main difference is that vdev labels do not
332  * describe the full vdev tree and in particular do not contain the "root"
333  * meta-vdev.
334  */
335 static void
336 pool_init_objdir_config(zfs_opt_t *zfs, zfs_zap_t *objdir)
337 {
338 	dnode_phys_t *dnode;
339 	nvlist_t *poolconfig, *vdevconfig;
340 	void *configbuf;
341 	uint64_t dnid;
342 	off_t configloc, configblksz;
343 	int error;
344 
345 	dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_PACKED_NVLIST,
346 	    DMU_OT_PACKED_NVLIST_SIZE, sizeof(uint64_t), &dnid);
347 
348 	poolconfig = pool_config_nvcreate(zfs);
349 
350 	vdevconfig = pool_root_vdev_config_nvcreate(zfs);
351 	nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
352 	nvlist_destroy(vdevconfig);
353 
354 	error = nvlist_export(poolconfig);
355 	if (error != 0)
356 		errc(1, error, "nvlist_export");
357 
358 	configblksz = nvlist_size(poolconfig);
359 	configloc = objset_space_alloc(zfs, zfs->mos, &configblksz);
360 	configbuf = ecalloc(1, configblksz);
361 	nvlist_copy(poolconfig, configbuf, configblksz);
362 
363 	vdev_pwrite_dnode_data(zfs, dnode, configbuf, configblksz, configloc);
364 
365 	dnode->dn_datablkszsec = configblksz >> MINBLOCKSHIFT;
366 	dnode->dn_flags = DNODE_FLAG_USED_BYTES;
367 	*(uint64_t *)DN_BONUS(dnode) = nvlist_size(poolconfig);
368 
369 	zap_add_uint64(objdir, DMU_POOL_CONFIG, dnid);
370 
371 	nvlist_destroy(poolconfig);
372 	free(configbuf);
373 }
374 
375 /*
376  * Add objects block pointer list objects, used for deferred frees.  We don't do
377  * anything with them, but they need to be present or OpenZFS will refuse to
378  * import the pool.
379  */
380 static void
381 pool_init_objdir_bplists(zfs_opt_t *zfs __unused, zfs_zap_t *objdir)
382 {
383 	uint64_t dnid;
384 
385 	(void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
386 	    BPOBJ_SIZE_V2, &dnid);
387 	zap_add_uint64(objdir, DMU_POOL_FREE_BPOBJ, dnid);
388 
389 	(void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
390 	    BPOBJ_SIZE_V2, &dnid);
391 	zap_add_uint64(objdir, DMU_POOL_SYNC_BPLIST, dnid);
392 }
393 
394 /*
395  * Add required feature metadata objects.  We don't know anything about ZFS
396  * features, so the objects are just empty ZAPs.
397  */
398 static void
399 pool_init_objdir_feature_maps(zfs_opt_t *zfs, zfs_zap_t *objdir)
400 {
401 	dnode_phys_t *dnode;
402 	uint64_t dnid;
403 
404 	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
405 	zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_READ, dnid);
406 	zap_write(zfs, zap_alloc(zfs->mos, dnode));
407 
408 	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
409 	zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_WRITE, dnid);
410 	zap_write(zfs, zap_alloc(zfs->mos, dnode));
411 
412 	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
413 	zap_add_uint64(objdir, DMU_POOL_FEATURE_DESCRIPTIONS, dnid);
414 	zap_write(zfs, zap_alloc(zfs->mos, dnode));
415 }
416 
417 static void
418 pool_init_objdir_dsl(zfs_opt_t *zfs, zfs_zap_t *objdir)
419 {
420 	zap_add_uint64(objdir, DMU_POOL_ROOT_DATASET,
421 	    dsl_dir_id(zfs->rootdsldir));
422 }
423 
424 static void
425 pool_init_objdir_poolprops(zfs_opt_t *zfs, zfs_zap_t *objdir)
426 {
427 	dnode_phys_t *dnode;
428 	uint64_t id;
429 
430 	dnode = objset_dnode_alloc(zfs->mos, DMU_OT_POOL_PROPS, &id);
431 	zap_add_uint64(objdir, DMU_POOL_PROPS, id);
432 
433 	zfs->poolprops = zap_alloc(zfs->mos, dnode);
434 }
435 
436 /*
437  * Initialize the MOS object directory, the root of virtually all of the pool's
438  * data and metadata.
439  */
440 static void
441 pool_init_objdir(zfs_opt_t *zfs)
442 {
443 	zfs_zap_t *zap;
444 	dnode_phys_t *objdir;
445 
446 	objdir = objset_dnode_lookup(zfs->mos, DMU_POOL_DIRECTORY_OBJECT);
447 
448 	zap = zap_alloc(zfs->mos, objdir);
449 	pool_init_objdir_config(zfs, zap);
450 	pool_init_objdir_bplists(zfs, zap);
451 	pool_init_objdir_feature_maps(zfs, zap);
452 	pool_init_objdir_dsl(zfs, zap);
453 	pool_init_objdir_poolprops(zfs, zap);
454 	zap_write(zfs, zap);
455 }
456 
457 /*
458  * Initialize the meta-object set (MOS) and immediately write out several
459  * special objects whose contents are already finalized, including the object
460  * directory.
461  *
462  * Once the MOS is finalized, it'll look roughly like this:
463  *
464  *	object directory (ZAP)
465  *	|-> vdev config object (nvlist)
466  *	|-> features for read
467  *	|-> features for write
468  *	|-> feature descriptions
469  *	|-> sync bplist
470  *	|-> free bplist
471  *	|-> pool properties
472  *	L-> root DSL directory
473  *	    |-> DSL child directory (ZAP)
474  *	    |   |-> $MOS (DSL dir)
475  *	    |   |   |-> child map
476  *	    |   |   L-> props (ZAP)
477  *	    |   |-> $FREE (DSL dir)
478  *	    |   |   |-> child map
479  *	    |   |   L-> props (ZAP)
480  *	    |   |-> $ORIGIN (DSL dir)
481  *	    |   |   |-> child map
482  *	    |   |   |-> dataset
483  *	    |   |   |   L-> deadlist
484  *	    |   |   |-> snapshot
485  *	    |   |   |   |-> deadlist
486  *	    |   |   |   L-> snapshot names
487  *	    |   |   |-> props (ZAP)
488  *	    |   |   L-> clones (ZAP)
489  *	    |   |-> dataset 1 (DSL dir)
490  *	    |   |   |-> DSL dataset
491  *	    |   |   |   |-> snapshot names
492  *	    |   |   |   L-> deadlist
493  *	    |   |   |-> child map
494  *	    |   |   |   L-> ...
495  *	    |   |   L-> props
496  *	    |   |-> dataset 2
497  *	    |   |   L-> ...
498  *	    |   |-> ...
499  *	    |   L-> dataset n
500  *	    |-> DSL root dataset
501  *	    |   |-> snapshot names
502  *	    |   L-> deadlist
503  *	    L-> props (ZAP)
504  *	space map object array
505  *	|-> space map 1
506  *	|-> space map 2
507  *	|-> ...
508  *	L-> space map n (zfs->mscount)
509  *
510  * The space map object array is pointed to by the "msarray" property in the
511  * pool configuration.
512  */
513 static void
514 pool_init(zfs_opt_t *zfs)
515 {
516 	uint64_t dnid;
517 
518 	zfs->poolguid = ((uint64_t)random() << 32) | random();
519 	zfs->vdevguid = ((uint64_t)random() << 32) | random();
520 
521 	zfs->mos = objset_alloc(zfs, DMU_OST_META);
522 
523 	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_DIRECTORY, &dnid);
524 	assert(dnid == DMU_POOL_DIRECTORY_OBJECT);
525 
526 	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_ARRAY, &zfs->objarrid);
527 
528 	dsl_init(zfs);
529 
530 	pool_init_objdir(zfs);
531 }
532 
533 static void
534 pool_labels_write(zfs_opt_t *zfs)
535 {
536 	uberblock_t *ub;
537 	vdev_label_t *label;
538 	nvlist_t *poolconfig, *vdevconfig;
539 	int error;
540 
541 	label = ecalloc(1, sizeof(*label));
542 
543 	/*
544 	 * Assemble the vdev configuration and store it in the label.
545 	 */
546 	poolconfig = pool_config_nvcreate(zfs);
547 	vdevconfig = pool_disk_vdev_config_nvcreate(zfs);
548 	nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
549 	nvlist_destroy(vdevconfig);
550 
551 	error = nvlist_export(poolconfig);
552 	if (error != 0)
553 		errc(1, error, "nvlist_export");
554 	nvlist_copy(poolconfig, label->vl_vdev_phys.vp_nvlist,
555 	    sizeof(label->vl_vdev_phys.vp_nvlist));
556 	nvlist_destroy(poolconfig);
557 
558 	/*
559 	 * Fill out the uberblock.  Just make each one the same.  The embedded
560 	 * checksum is calculated in vdev_label_write().
561 	 */
562 	for (size_t uoff = 0; uoff < sizeof(label->vl_uberblock);
563 	    uoff += (1 << zfs->ashift)) {
564 		ub = (uberblock_t *)(&label->vl_uberblock[0] + uoff);
565 		ub->ub_magic = UBERBLOCK_MAGIC;
566 		ub->ub_version = SPA_VERSION;
567 		ub->ub_txg = TXG;
568 		ub->ub_guid_sum = zfs->poolguid + zfs->vdevguid;
569 		ub->ub_timestamp = 0;
570 
571 		ub->ub_software_version = SPA_VERSION;
572 		ub->ub_mmp_magic = MMP_MAGIC;
573 		ub->ub_mmp_delay = 0;
574 		ub->ub_mmp_config = 0;
575 		ub->ub_checkpoint_txg = 0;
576 		objset_root_blkptr_copy(zfs->mos, &ub->ub_rootbp);
577 	}
578 
579 	/*
580 	 * Write out four copies of the label: two at the beginning of the vdev
581 	 * and two at the end.
582 	 */
583 	for (int i = 0; i < VDEV_LABELS; i++)
584 		vdev_label_write(zfs, i, label);
585 
586 	free(label);
587 }
588 
589 static void
590 pool_fini(zfs_opt_t *zfs)
591 {
592 	zap_write(zfs, zfs->poolprops);
593 	dsl_write(zfs);
594 	objset_write(zfs, zfs->mos);
595 	pool_labels_write(zfs);
596 }
597 
598 struct dnode_cursor *
599 dnode_cursor_init(zfs_opt_t *zfs, zfs_objset_t *os, dnode_phys_t *dnode,
600     off_t size, off_t blksz)
601 {
602 	struct dnode_cursor *c;
603 	uint64_t nbppindir, indlevel, ndatablks, nindblks;
604 
605 	assert(dnode->dn_nblkptr == 1);
606 	assert(blksz <= MAXBLOCKSIZE);
607 
608 	if (blksz == 0) {
609 		/* Must be between 1<<ashift and 128KB. */
610 		blksz = MIN(MAXBLOCKSIZE, MAX(1 << zfs->ashift,
611 		    powerof2(size) ? size : (1ul << flsll(size))));
612 	}
613 	assert(powerof2(blksz));
614 
615 	/*
616 	 * Do we need indirect blocks?  Figure out how many levels are needed
617 	 * (indlevel == 1 means no indirect blocks) and how much space is needed
618 	 * (it has to be allocated up-front to break the dependency cycle
619 	 * described in objset_write()).
620 	 */
621 	ndatablks = size == 0 ? 0 : howmany(size, blksz);
622 	nindblks = 0;
623 	for (indlevel = 1, nbppindir = 1; ndatablks > nbppindir; indlevel++) {
624 		nbppindir *= BLKPTR_PER_INDIR;
625 		nindblks += howmany(ndatablks, indlevel * nbppindir);
626 	}
627 	assert(indlevel < INDIR_LEVELS);
628 
629 	dnode->dn_nlevels = (uint8_t)indlevel;
630 	dnode->dn_maxblkid = ndatablks > 0 ? ndatablks - 1 : 0;
631 	dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;
632 
633 	c = ecalloc(1, sizeof(*c));
634 	if (nindblks > 0) {
635 		c->indspace = nindblks * MAXBLOCKSIZE;
636 		c->indloc = objset_space_alloc(zfs, os, &c->indspace);
637 	}
638 	c->dnode = dnode;
639 	c->dataoff = 0;
640 	c->datablksz = blksz;
641 
642 	return (c);
643 }
644 
645 static void
646 _dnode_cursor_flush(zfs_opt_t *zfs, struct dnode_cursor *c, int levels)
647 {
648 	blkptr_t *bp, *pbp;
649 	void *buf;
650 	uint64_t fill;
651 	off_t blkid, blksz, loc;
652 
653 	assert(levels > 0);
654 	assert(levels <= c->dnode->dn_nlevels - 1);
655 
656 	blksz = MAXBLOCKSIZE;
657 	blkid = (c->dataoff / c->datablksz) / BLKPTR_PER_INDIR;
658 	for (int level = 1; level <= levels; level++) {
659 		buf = c->inddir[level - 1];
660 
661 		if (level == c->dnode->dn_nlevels - 1) {
662 			pbp = &c->dnode->dn_blkptr[0];
663 		} else {
664 			uint64_t iblkid;
665 
666 			iblkid = blkid & (BLKPTR_PER_INDIR - 1);
667 			pbp = (blkptr_t *)
668 			    &c->inddir[level][iblkid * sizeof(blkptr_t)];
669 		}
670 
671 		/*
672 		 * Space for indirect blocks is allocated up-front; see the
673 		 * comment in objset_write().
674 		 */
675 		loc = c->indloc;
676 		c->indloc += blksz;
677 		assert(c->indspace >= blksz);
678 		c->indspace -= blksz;
679 
680 		bp = buf;
681 		fill = 0;
682 		for (size_t i = 0; i < BLKPTR_PER_INDIR; i++)
683 			fill += BP_GET_FILL(&bp[i]);
684 
685 		vdev_pwrite_dnode_indir(zfs, c->dnode, level, fill, buf, blksz,
686 		    loc, pbp);
687 		memset(buf, 0, MAXBLOCKSIZE);
688 
689 		blkid /= BLKPTR_PER_INDIR;
690 	}
691 }
692 
693 blkptr_t *
694 dnode_cursor_next(zfs_opt_t *zfs, struct dnode_cursor *c, off_t off)
695 {
696 	off_t blkid, l1id;
697 	int levels;
698 
699 	if (c->dnode->dn_nlevels == 1) {
700 		assert(off < MAXBLOCKSIZE);
701 		return (&c->dnode->dn_blkptr[0]);
702 	}
703 
704 	assert(off % c->datablksz == 0);
705 
706 	/* Do we need to flush any full indirect blocks? */
707 	if (off > 0) {
708 		blkid = off / c->datablksz;
709 		for (levels = 0; levels < c->dnode->dn_nlevels - 1; levels++) {
710 			if (blkid % BLKPTR_PER_INDIR != 0)
711 				break;
712 			blkid /= BLKPTR_PER_INDIR;
713 		}
714 		if (levels > 0)
715 			_dnode_cursor_flush(zfs, c, levels);
716 	}
717 
718 	c->dataoff = off;
719 	l1id = (off / c->datablksz) & (BLKPTR_PER_INDIR - 1);
720 	return ((blkptr_t *)&c->inddir[0][l1id * sizeof(blkptr_t)]);
721 }
722 
723 void
724 dnode_cursor_finish(zfs_opt_t *zfs, struct dnode_cursor *c)
725 {
726 	int levels;
727 
728 	levels = c->dnode->dn_nlevels - 1;
729 	if (levels > 0)
730 		_dnode_cursor_flush(zfs, c, levels);
731 	assert(c->indspace == 0);
732 	free(c);
733 }
734 
735 void
736 zfs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts)
737 {
738 	zfs_opt_t *zfs;
739 	int dirfd;
740 
741 	zfs = fsopts->fs_specific;
742 
743 	/*
744 	 * Use a fixed seed to provide reproducible pseudo-random numbers for
745 	 * on-disk structures when needed (e.g., GUIDs, ZAP hash salts).
746 	 */
747 	srandom(1729);
748 
749 	zfs_check_opts(fsopts);
750 
751 	if (!zfs->nowarn) {
752 		fprintf(stderr,
753 		    "ZFS support is currently considered experimental. "
754 		    "Do not use it for anything critical.\n");
755 	}
756 
757 	dirfd = open(dir, O_DIRECTORY | O_RDONLY);
758 	if (dirfd < 0)
759 		err(1, "open(%s)", dir);
760 
761 	vdev_init(zfs, image);
762 	pool_init(zfs);
763 	fs_build(zfs, dirfd, root);
764 	pool_fini(zfs);
765 	vdev_fini(zfs);
766 }
767