xref: /freebsd/usr.sbin/makefs/zfs.c (revision 3a3af6b2a160bea72509a9d5ef84e25906b0478a)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2022 The FreeBSD Foundation
5  *
6  * This software was developed by Mark Johnston under sponsorship from
7  * the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions are
11  * met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/param.h>
32 #include <sys/errno.h>
33 #include <sys/queue.h>
34 
35 #include <assert.h>
36 #include <fcntl.h>
37 #include <stdbool.h>
38 #include <stddef.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <unistd.h>
42 
43 #include <util.h>
44 
45 #include "makefs.h"
46 #include "zfs.h"
47 
48 #define	VDEV_LABEL_SPACE	\
49 	((off_t)(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE))
50 _Static_assert(VDEV_LABEL_SPACE <= MINDEVSIZE, "");
51 
52 #define	MINMSSIZE		((off_t)1 << 24) /* 16MB */
53 #define	DFLTMSSIZE		((off_t)1 << 29) /* 512MB */
54 #define	MAXMSSIZE		((off_t)1 << 34) /* 16GB */
55 
56 #define	INDIR_LEVELS		6
57 /* Indirect blocks are always 128KB. */
58 #define	BLKPTR_PER_INDIR	(MAXBLOCKSIZE / sizeof(blkptr_t))
59 
60 struct dnode_cursor {
61 	char		inddir[INDIR_LEVELS][MAXBLOCKSIZE];
62 	off_t		indloc;
63 	off_t		indspace;
64 	dnode_phys_t	*dnode;
65 	off_t		dataoff;
66 	off_t		datablksz;
67 };
68 
69 void
70 zfs_prep_opts(fsinfo_t *fsopts)
71 {
72 	zfs_opt_t *zfs = ecalloc(1, sizeof(*zfs));
73 
74 	const option_t zfs_options[] = {
75 		{ '\0', "bootfs", &zfs->bootfs, OPT_STRPTR,
76 		  0, 0, "Bootable dataset" },
77 		{ '\0', "mssize", &zfs->mssize, OPT_INT64,
78 		  MINMSSIZE, MAXMSSIZE, "Metaslab size" },
79 		{ '\0', "poolname", &zfs->poolname, OPT_STRPTR,
80 		  0, 0, "ZFS pool name" },
81 		{ '\0', "rootpath", &zfs->rootpath, OPT_STRPTR,
82 		  0, 0, "Prefix for all dataset mount points" },
83 		{ '\0', "ashift", &zfs->ashift, OPT_INT32,
84 		  MINBLOCKSHIFT, MAXBLOCKSHIFT, "ZFS pool ashift" },
85 		{ '\0', "nowarn", &zfs->nowarn, OPT_BOOL,
86 		  0, 0, "Suppress warning about experimental ZFS support" },
87 		{ .name = NULL }
88 	};
89 
90 	STAILQ_INIT(&zfs->datasetdescs);
91 
92 	fsopts->fs_specific = zfs;
93 	fsopts->fs_options = copy_opts(zfs_options);
94 }
95 
96 int
97 zfs_parse_opts(const char *option, fsinfo_t *fsopts)
98 {
99 	zfs_opt_t *zfs;
100 	struct dataset_desc *dsdesc;
101 	char buf[BUFSIZ], *opt, *val;
102 	int rv;
103 
104 	zfs = fsopts->fs_specific;
105 
106 	opt = val = estrdup(option);
107 	opt = strsep(&val, "=");
108 	if (strcmp(opt, "fs") == 0) {
109 		if (val == NULL)
110 			errx(1, "invalid filesystem parameters `%s'", option);
111 
112 		/*
113 		 * Dataset descriptions will be parsed later, in dsl_init().
114 		 * Just stash them away for now.
115 		 */
116 		dsdesc = ecalloc(1, sizeof(*dsdesc));
117 		dsdesc->params = estrdup(val);
118 		free(opt);
119 		STAILQ_INSERT_TAIL(&zfs->datasetdescs, dsdesc, next);
120 		return (1);
121 	}
122 	free(opt);
123 
124 	rv = set_option(fsopts->fs_options, option, buf, sizeof(buf));
125 	return (rv == -1 ? 0 : 1);
126 }
127 
128 static void
129 zfs_size_vdev(fsinfo_t *fsopts)
130 {
131 	zfs_opt_t *zfs;
132 	off_t asize, mssize, vdevsize, vdevsize1;
133 
134 	zfs = fsopts->fs_specific;
135 
136 	assert(fsopts->maxsize != 0);
137 	assert(zfs->ashift != 0);
138 
139 	/*
140 	 * Figure out how big the vdev should be.
141 	 */
142 	vdevsize = rounddown2(fsopts->maxsize, 1 << zfs->ashift);
143 	if (vdevsize < MINDEVSIZE)
144 		errx(1, "maximum image size is too small");
145 	if (vdevsize < fsopts->minsize || vdevsize > fsopts->maxsize) {
146 		errx(1, "image size bounds must be multiples of %d",
147 		    1 << zfs->ashift);
148 	}
149 	asize = vdevsize - VDEV_LABEL_SPACE;
150 
151 	/*
152 	 * Size metaslabs according to the following heuristic:
153 	 * - provide at least 8 metaslabs,
154 	 * - without using a metaslab size larger than 512MB.
155 	 * This approximates what OpenZFS does without being complicated.  In
156 	 * practice we expect pools to be expanded upon first use, and OpenZFS
157 	 * does not resize metaslabs in that case, so there is no right answer
158 	 * here.  In general we want to provide large metaslabs even if the
159 	 * image size is small, and 512MB is a reasonable size for pools up to
160 	 * several hundred gigabytes.
161 	 *
162 	 * The user may override this heuristic using the "-o mssize" option.
163 	 */
164 	mssize = zfs->mssize;
165 	if (mssize == 0) {
166 		mssize = MAX(MIN(asize / 8, DFLTMSSIZE), MINMSSIZE);
167 		if (!powerof2(mssize))
168 			mssize = 1l << (flsll(mssize) - 1);
169 	}
170 	if (!powerof2(mssize))
171 		errx(1, "metaslab size must be a power of 2");
172 
173 	/*
174 	 * If we have some slop left over, try to cover it by resizing the vdev,
175 	 * subject to the maxsize and minsize parameters.
176 	 */
177 	if (asize % mssize != 0) {
178 		vdevsize1 = rounddown2(asize, mssize) + VDEV_LABEL_SPACE;
179 		if (vdevsize1 < fsopts->minsize)
180 			vdevsize1 = roundup2(asize, mssize) + VDEV_LABEL_SPACE;
181 		if (vdevsize1 <= fsopts->maxsize)
182 			vdevsize = vdevsize1;
183 	}
184 	asize = vdevsize - VDEV_LABEL_SPACE;
185 
186 	zfs->asize = asize;
187 	zfs->vdevsize = vdevsize;
188 	zfs->mssize = mssize;
189 	zfs->msshift = flsll(mssize) - 1;
190 	zfs->mscount = asize / mssize;
191 }
192 
193 /*
194  * Validate options and set some default values.
195  */
196 static void
197 zfs_check_opts(fsinfo_t *fsopts)
198 {
199 	zfs_opt_t *zfs;
200 
201 	zfs = fsopts->fs_specific;
202 
203 	if (fsopts->offset != 0)
204 		errx(1, "unhandled offset option");
205 	if (fsopts->maxsize == 0)
206 		errx(1, "an image size must be specified");
207 
208 	if (zfs->poolname == NULL)
209 		errx(1, "a pool name must be specified");
210 
211 	if (zfs->rootpath == NULL)
212 		easprintf(&zfs->rootpath, "/%s", zfs->poolname);
213 	if (zfs->rootpath[0] != '/')
214 		errx(1, "mountpoint `%s' must be absolute", zfs->rootpath);
215 
216 	if (zfs->ashift == 0)
217 		zfs->ashift = 12;
218 
219 	zfs_size_vdev(fsopts);
220 }
221 
222 void
223 zfs_cleanup_opts(fsinfo_t *fsopts)
224 {
225 	struct dataset_desc *d, *tmp;
226 	zfs_opt_t *zfs;
227 
228 	zfs = fsopts->fs_specific;
229 	free(zfs->rootpath);
230 	free(zfs->bootfs);
231 	free(__DECONST(void *, zfs->poolname));
232 	STAILQ_FOREACH_SAFE(d, &zfs->datasetdescs, next, tmp) {
233 		free(d->params);
234 		free(d);
235 	}
236 	free(zfs);
237 	free(fsopts->fs_options);
238 }
239 
240 static size_t
241 nvlist_size(const nvlist_t *nvl)
242 {
243 	return (sizeof(nvl->nv_header) + nvl->nv_size);
244 }
245 
246 static void
247 nvlist_copy(const nvlist_t *nvl, char *buf, size_t sz)
248 {
249 	assert(sz >= nvlist_size(nvl));
250 
251 	memcpy(buf, &nvl->nv_header, sizeof(nvl->nv_header));
252 	memcpy(buf + sizeof(nvl->nv_header), nvl->nv_data, nvl->nv_size);
253 }
254 
255 static nvlist_t *
256 pool_config_nvcreate(zfs_opt_t *zfs)
257 {
258 	nvlist_t *featuresnv, *poolnv;
259 
260 	poolnv = nvlist_create(NV_UNIQUE_NAME);
261 	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_TXG, TXG);
262 	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VERSION, SPA_VERSION);
263 	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_EXPORTED);
264 	nvlist_add_string(poolnv, ZPOOL_CONFIG_POOL_NAME, zfs->poolname);
265 	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_GUID, zfs->poolguid);
266 	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_TOP_GUID, zfs->vdevguid);
267 	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
268 	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VDEV_CHILDREN, 1);
269 
270 	featuresnv = nvlist_create(NV_UNIQUE_NAME);
271 	nvlist_add_nvlist(poolnv, ZPOOL_CONFIG_FEATURES_FOR_READ, featuresnv);
272 	nvlist_destroy(featuresnv);
273 
274 	return (poolnv);
275 }
276 
277 static nvlist_t *
278 pool_disk_vdev_config_nvcreate(zfs_opt_t *zfs)
279 {
280 	nvlist_t *diskvdevnv;
281 
282 	assert(zfs->objarrid != 0);
283 
284 	diskvdevnv = nvlist_create(NV_UNIQUE_NAME);
285 	nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
286 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASHIFT, zfs->ashift);
287 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASIZE, zfs->asize);
288 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
289 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ID, 0);
290 	nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_PATH, "/dev/null");
291 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_WHOLE_DISK, 1);
292 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
293 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_ARRAY,
294 	    zfs->objarrid);
295 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_SHIFT,
296 	    zfs->msshift);
297 
298 	return (diskvdevnv);
299 }
300 
301 static nvlist_t *
302 pool_root_vdev_config_nvcreate(zfs_opt_t *zfs)
303 {
304 	nvlist_t *diskvdevnv, *rootvdevnv;
305 
306 	diskvdevnv = pool_disk_vdev_config_nvcreate(zfs);
307 	rootvdevnv = nvlist_create(NV_UNIQUE_NAME);
308 
309 	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_ID, 0);
310 	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_GUID, zfs->poolguid);
311 	nvlist_add_string(rootvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
312 	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
313 	nvlist_add_nvlist_array(rootvdevnv, ZPOOL_CONFIG_CHILDREN, &diskvdevnv,
314 	    1);
315 	nvlist_destroy(diskvdevnv);
316 
317 	return (rootvdevnv);
318 }
319 
320 /*
321  * Create the pool's "config" object, which contains an nvlist describing pool
322  * parameters and the vdev topology.  It is similar but not identical to the
323  * nvlist stored in vdev labels.  The main difference is that vdev labels do not
324  * describe the full vdev tree and in particular do not contain the "root"
325  * meta-vdev.
326  */
327 static void
328 pool_init_objdir_config(zfs_opt_t *zfs, zfs_zap_t *objdir)
329 {
330 	dnode_phys_t *dnode;
331 	nvlist_t *poolconfig, *vdevconfig;
332 	void *configbuf;
333 	uint64_t dnid;
334 	off_t configloc, configblksz;
335 	int error;
336 
337 	dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_PACKED_NVLIST,
338 	    DMU_OT_PACKED_NVLIST_SIZE, sizeof(uint64_t), &dnid);
339 
340 	poolconfig = pool_config_nvcreate(zfs);
341 
342 	vdevconfig = pool_root_vdev_config_nvcreate(zfs);
343 	nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
344 	nvlist_destroy(vdevconfig);
345 
346 	error = nvlist_export(poolconfig);
347 	if (error != 0)
348 		errc(1, error, "nvlist_export");
349 
350 	configblksz = nvlist_size(poolconfig);
351 	configloc = objset_space_alloc(zfs, zfs->mos, &configblksz);
352 	configbuf = ecalloc(1, configblksz);
353 	nvlist_copy(poolconfig, configbuf, configblksz);
354 
355 	vdev_pwrite_dnode_data(zfs, dnode, configbuf, configblksz, configloc);
356 
357 	dnode->dn_datablkszsec = configblksz >> MINBLOCKSHIFT;
358 	dnode->dn_flags = DNODE_FLAG_USED_BYTES;
359 	*(uint64_t *)DN_BONUS(dnode) = nvlist_size(poolconfig);
360 
361 	zap_add_uint64(objdir, DMU_POOL_CONFIG, dnid);
362 
363 	nvlist_destroy(poolconfig);
364 	free(configbuf);
365 }
366 
367 /*
368  * Add objects block pointer list objects, used for deferred frees.  We don't do
369  * anything with them, but they need to be present or OpenZFS will refuse to
370  * import the pool.
371  */
372 static void
373 pool_init_objdir_bplists(zfs_opt_t *zfs __unused, zfs_zap_t *objdir)
374 {
375 	uint64_t dnid;
376 
377 	(void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
378 	    BPOBJ_SIZE_V2, &dnid);
379 	zap_add_uint64(objdir, DMU_POOL_FREE_BPOBJ, dnid);
380 
381 	(void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
382 	    BPOBJ_SIZE_V2, &dnid);
383 	zap_add_uint64(objdir, DMU_POOL_SYNC_BPLIST, dnid);
384 }
385 
386 /*
387  * Add required feature metadata objects.  We don't know anything about ZFS
388  * features, so the objects are just empty ZAPs.
389  */
390 static void
391 pool_init_objdir_feature_maps(zfs_opt_t *zfs, zfs_zap_t *objdir)
392 {
393 	dnode_phys_t *dnode;
394 	uint64_t dnid;
395 
396 	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
397 	zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_READ, dnid);
398 	zap_write(zfs, zap_alloc(zfs->mos, dnode));
399 
400 	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
401 	zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_WRITE, dnid);
402 	zap_write(zfs, zap_alloc(zfs->mos, dnode));
403 
404 	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
405 	zap_add_uint64(objdir, DMU_POOL_FEATURE_DESCRIPTIONS, dnid);
406 	zap_write(zfs, zap_alloc(zfs->mos, dnode));
407 }
408 
409 static void
410 pool_init_objdir_dsl(zfs_opt_t *zfs, zfs_zap_t *objdir)
411 {
412 	zap_add_uint64(objdir, DMU_POOL_ROOT_DATASET,
413 	    dsl_dir_id(zfs->rootdsldir));
414 }
415 
416 static void
417 pool_init_objdir_poolprops(zfs_opt_t *zfs, zfs_zap_t *objdir)
418 {
419 	dnode_phys_t *dnode;
420 	uint64_t id;
421 
422 	dnode = objset_dnode_alloc(zfs->mos, DMU_OT_POOL_PROPS, &id);
423 	zap_add_uint64(objdir, DMU_POOL_PROPS, id);
424 
425 	zfs->poolprops = zap_alloc(zfs->mos, dnode);
426 }
427 
428 /*
429  * Initialize the MOS object directory, the root of virtually all of the pool's
430  * data and metadata.
431  */
432 static void
433 pool_init_objdir(zfs_opt_t *zfs)
434 {
435 	zfs_zap_t *zap;
436 	dnode_phys_t *objdir;
437 
438 	objdir = objset_dnode_lookup(zfs->mos, DMU_POOL_DIRECTORY_OBJECT);
439 
440 	zap = zap_alloc(zfs->mos, objdir);
441 	pool_init_objdir_config(zfs, zap);
442 	pool_init_objdir_bplists(zfs, zap);
443 	pool_init_objdir_feature_maps(zfs, zap);
444 	pool_init_objdir_dsl(zfs, zap);
445 	pool_init_objdir_poolprops(zfs, zap);
446 	zap_write(zfs, zap);
447 }
448 
449 /*
450  * Initialize the meta-object set (MOS) and immediately write out several
451  * special objects whose contents are already finalized, including the object
452  * directory.
453  *
454  * Once the MOS is finalized, it'll look roughly like this:
455  *
456  *	object directory (ZAP)
457  *	|-> vdev config object (nvlist)
458  *	|-> features for read
459  *	|-> features for write
460  *	|-> feature descriptions
461  *	|-> sync bplist
462  *	|-> free bplist
463  *	|-> pool properties
464  *	L-> root DSL directory
465  *	    |-> DSL child directory (ZAP)
466  *	    |   |-> $MOS (DSL dir)
467  *	    |   |   |-> child map
468  *	    |   |   L-> props (ZAP)
469  *	    |   |-> $FREE (DSL dir)
470  *	    |   |   |-> child map
471  *	    |   |   L-> props (ZAP)
472  *	    |   |-> $ORIGIN (DSL dir)
473  *	    |   |   |-> child map
474  *	    |   |   |-> dataset
475  *	    |   |   |   L-> deadlist
476  *	    |   |   |-> snapshot
477  *	    |   |   |   |-> deadlist
478  *	    |   |   |   L-> snapshot names
479  *	    |   |   |-> props (ZAP)
480  *	    |   |   L-> clones (ZAP)
481  *	    |   |-> dataset 1 (DSL dir)
482  *	    |   |   |-> DSL dataset
483  *	    |   |   |   |-> snapshot names
484  *	    |   |   |   L-> deadlist
485  *	    |   |   |-> child map
486  *	    |   |   |   L-> ...
487  *	    |   |   L-> props
488  *	    |   |-> dataset 2
489  *	    |   |   L-> ...
490  *	    |   |-> ...
491  *	    |   L-> dataset n
492  *	    |-> DSL root dataset
493  *	    |   |-> snapshot names
494  *	    |   L-> deadlist
495  *	    L-> props (ZAP)
496  *	space map object array
497  *	|-> space map 1
498  *	|-> space map 2
499  *	|-> ...
500  *	L-> space map n (zfs->mscount)
501  *
502  * The space map object array is pointed to by the "msarray" property in the
503  * pool configuration.
504  */
505 static void
506 pool_init(zfs_opt_t *zfs)
507 {
508 	uint64_t dnid;
509 
510 	zfs->poolguid = ((uint64_t)random() << 32) | random();
511 	zfs->vdevguid = ((uint64_t)random() << 32) | random();
512 
513 	zfs->mos = objset_alloc(zfs, DMU_OST_META);
514 
515 	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_DIRECTORY, &dnid);
516 	assert(dnid == DMU_POOL_DIRECTORY_OBJECT);
517 
518 	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_ARRAY, &zfs->objarrid);
519 
520 	dsl_init(zfs);
521 
522 	pool_init_objdir(zfs);
523 }
524 
525 static void
526 pool_labels_write(zfs_opt_t *zfs)
527 {
528 	uberblock_t *ub;
529 	vdev_label_t *label;
530 	nvlist_t *poolconfig, *vdevconfig;
531 	int error;
532 
533 	label = ecalloc(1, sizeof(*label));
534 
535 	/*
536 	 * Assemble the vdev configuration and store it in the label.
537 	 */
538 	poolconfig = pool_config_nvcreate(zfs);
539 	vdevconfig = pool_disk_vdev_config_nvcreate(zfs);
540 	nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
541 	nvlist_destroy(vdevconfig);
542 
543 	error = nvlist_export(poolconfig);
544 	if (error != 0)
545 		errc(1, error, "nvlist_export");
546 	nvlist_copy(poolconfig, label->vl_vdev_phys.vp_nvlist,
547 	    sizeof(label->vl_vdev_phys.vp_nvlist));
548 	nvlist_destroy(poolconfig);
549 
550 	/*
551 	 * Fill out the uberblock.  Just make each one the same.  The embedded
552 	 * checksum is calculated in vdev_label_write().
553 	 */
554 	for (size_t uoff = 0; uoff < sizeof(label->vl_uberblock);
555 	    uoff += (1 << zfs->ashift)) {
556 		ub = (uberblock_t *)(&label->vl_uberblock[0] + uoff);
557 		ub->ub_magic = UBERBLOCK_MAGIC;
558 		ub->ub_version = SPA_VERSION;
559 		ub->ub_txg = TXG;
560 		ub->ub_guid_sum = zfs->poolguid + zfs->vdevguid;
561 		ub->ub_timestamp = 0;
562 
563 		ub->ub_software_version = SPA_VERSION;
564 		ub->ub_mmp_magic = MMP_MAGIC;
565 		ub->ub_mmp_delay = 0;
566 		ub->ub_mmp_config = 0;
567 		ub->ub_checkpoint_txg = 0;
568 		objset_root_blkptr_copy(zfs->mos, &ub->ub_rootbp);
569 	}
570 
571 	/*
572 	 * Write out four copies of the label: two at the beginning of the vdev
573 	 * and two at the end.
574 	 */
575 	for (int i = 0; i < VDEV_LABELS; i++)
576 		vdev_label_write(zfs, i, label);
577 
578 	free(label);
579 }
580 
581 static void
582 pool_fini(zfs_opt_t *zfs)
583 {
584 	zap_write(zfs, zfs->poolprops);
585 	dsl_write(zfs);
586 	objset_write(zfs, zfs->mos);
587 	pool_labels_write(zfs);
588 }
589 
590 struct dnode_cursor *
591 dnode_cursor_init(zfs_opt_t *zfs, zfs_objset_t *os, dnode_phys_t *dnode,
592     off_t size, off_t blksz)
593 {
594 	struct dnode_cursor *c;
595 	uint64_t nbppindir, indlevel, ndatablks, nindblks;
596 
597 	assert(dnode->dn_nblkptr == 1);
598 	assert(blksz <= MAXBLOCKSIZE);
599 
600 	if (blksz == 0) {
601 		/* Must be between 1<<ashift and 128KB. */
602 		blksz = MIN(MAXBLOCKSIZE, MAX(1 << zfs->ashift,
603 		    powerof2(size) ? size : (1ul << flsll(size))));
604 	}
605 	assert(powerof2(blksz));
606 
607 	/*
608 	 * Do we need indirect blocks?  Figure out how many levels are needed
609 	 * (indlevel == 1 means no indirect blocks) and how much space is needed
610 	 * (it has to be allocated up-front to break the dependency cycle
611 	 * described in objset_write()).
612 	 */
613 	ndatablks = size == 0 ? 0 : howmany(size, blksz);
614 	nindblks = 0;
615 	for (indlevel = 1, nbppindir = 1; ndatablks > nbppindir; indlevel++) {
616 		nbppindir *= BLKPTR_PER_INDIR;
617 		nindblks += howmany(ndatablks, indlevel * nbppindir);
618 	}
619 	assert(indlevel < INDIR_LEVELS);
620 
621 	dnode->dn_nlevels = (uint8_t)indlevel;
622 	dnode->dn_maxblkid = ndatablks > 0 ? ndatablks - 1 : 0;
623 	dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;
624 
625 	c = ecalloc(1, sizeof(*c));
626 	if (nindblks > 0) {
627 		c->indspace = nindblks * MAXBLOCKSIZE;
628 		c->indloc = objset_space_alloc(zfs, os, &c->indspace);
629 	}
630 	c->dnode = dnode;
631 	c->dataoff = 0;
632 	c->datablksz = blksz;
633 
634 	return (c);
635 }
636 
637 static void
638 _dnode_cursor_flush(zfs_opt_t *zfs, struct dnode_cursor *c, int levels)
639 {
640 	blkptr_t *bp, *pbp;
641 	void *buf;
642 	uint64_t fill;
643 	off_t blkid, blksz, loc;
644 
645 	assert(levels > 0);
646 	assert(levels <= c->dnode->dn_nlevels - 1);
647 
648 	blksz = MAXBLOCKSIZE;
649 	blkid = (c->dataoff / c->datablksz) / BLKPTR_PER_INDIR;
650 	for (int level = 1; level <= levels; level++) {
651 		buf = c->inddir[level - 1];
652 
653 		if (level == c->dnode->dn_nlevels - 1) {
654 			pbp = &c->dnode->dn_blkptr[0];
655 		} else {
656 			uint64_t iblkid;
657 
658 			iblkid = blkid & (BLKPTR_PER_INDIR - 1);
659 			pbp = (blkptr_t *)
660 			    &c->inddir[level][iblkid * sizeof(blkptr_t)];
661 		}
662 
663 		/*
664 		 * Space for indirect blocks is allocated up-front; see the
665 		 * comment in objset_write().
666 		 */
667 		loc = c->indloc;
668 		c->indloc += blksz;
669 		assert(c->indspace >= blksz);
670 		c->indspace -= blksz;
671 
672 		bp = buf;
673 		fill = 0;
674 		for (size_t i = 0; i < BLKPTR_PER_INDIR; i++)
675 			fill += BP_GET_FILL(&bp[i]);
676 
677 		vdev_pwrite_dnode_indir(zfs, c->dnode, level, fill, buf, blksz,
678 		    loc, pbp);
679 		memset(buf, 0, MAXBLOCKSIZE);
680 
681 		blkid /= BLKPTR_PER_INDIR;
682 	}
683 }
684 
685 blkptr_t *
686 dnode_cursor_next(zfs_opt_t *zfs, struct dnode_cursor *c, off_t off)
687 {
688 	off_t blkid, l1id;
689 	int levels;
690 
691 	if (c->dnode->dn_nlevels == 1) {
692 		assert(off < MAXBLOCKSIZE);
693 		return (&c->dnode->dn_blkptr[0]);
694 	}
695 
696 	assert(off % c->datablksz == 0);
697 
698 	/* Do we need to flush any full indirect blocks? */
699 	if (off > 0) {
700 		blkid = off / c->datablksz;
701 		for (levels = 0; levels < c->dnode->dn_nlevels - 1; levels++) {
702 			if (blkid % BLKPTR_PER_INDIR != 0)
703 				break;
704 			blkid /= BLKPTR_PER_INDIR;
705 		}
706 		if (levels > 0)
707 			_dnode_cursor_flush(zfs, c, levels);
708 	}
709 
710 	c->dataoff = off;
711 	l1id = (off / c->datablksz) & (BLKPTR_PER_INDIR - 1);
712 	return ((blkptr_t *)&c->inddir[0][l1id * sizeof(blkptr_t)]);
713 }
714 
715 void
716 dnode_cursor_finish(zfs_opt_t *zfs, struct dnode_cursor *c)
717 {
718 	int levels;
719 
720 	levels = c->dnode->dn_nlevels - 1;
721 	if (levels > 0)
722 		_dnode_cursor_flush(zfs, c, levels);
723 	assert(c->indspace == 0);
724 	free(c);
725 }
726 
727 void
728 zfs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts)
729 {
730 	zfs_opt_t *zfs;
731 	int dirfd;
732 
733 	zfs = fsopts->fs_specific;
734 
735 	/*
736 	 * Use a fixed seed to provide reproducible pseudo-random numbers for
737 	 * on-disk structures when needed (e.g., GUIDs, ZAP hash salts).
738 	 */
739 	srandom(1729);
740 
741 	zfs_check_opts(fsopts);
742 
743 	if (!zfs->nowarn) {
744 		fprintf(stderr,
745 		    "ZFS support is currently considered experimental. "
746 		    "Do not use it for anything critical.\n");
747 	}
748 
749 	dirfd = open(dir, O_DIRECTORY | O_RDONLY);
750 	if (dirfd < 0)
751 		err(1, "open(%s)", dir);
752 
753 	vdev_init(zfs, image);
754 	pool_init(zfs);
755 	fs_build(zfs, dirfd, root);
756 	pool_fini(zfs);
757 	vdev_fini(zfs);
758 }
759