xref: /freebsd/usr.sbin/makefs/zfs.c (revision ebacd8013fe5f7fdf9f6a5b286f6680dd2891036)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2022 The FreeBSD Foundation
5  *
6  * This software was developed by Mark Johnston under sponsorship from
7  * the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions are
11  * met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/param.h>
32 #include <sys/errno.h>
33 #include <sys/queue.h>
34 
35 #include <assert.h>
36 #include <ctype.h>
37 #include <fcntl.h>
38 #include <stdalign.h>
39 #include <stdbool.h>
40 #include <stddef.h>
41 #include <stdlib.h>
42 #include <string.h>
43 #include <unistd.h>
44 
45 #include <util.h>
46 
47 #include "makefs.h"
48 #include "zfs.h"
49 
50 #define	VDEV_LABEL_SPACE	\
51 	((off_t)(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE))
52 _Static_assert(VDEV_LABEL_SPACE <= MINDEVSIZE, "");
53 
54 #define	MINMSSIZE		((off_t)1 << 24) /* 16MB */
55 #define	DFLTMSSIZE		((off_t)1 << 29) /* 512MB */
56 #define	MAXMSSIZE		((off_t)1 << 34) /* 16GB */
57 
58 #define	INDIR_LEVELS		6
59 /* Indirect blocks are always 128KB. */
60 #define	BLKPTR_PER_INDIR	(MAXBLOCKSIZE / sizeof(blkptr_t))
61 
62 struct dnode_cursor {
63 	char		inddir[INDIR_LEVELS][MAXBLOCKSIZE];
64 	off_t		indloc;
65 	off_t		indspace;
66 	dnode_phys_t	*dnode;
67 	off_t		dataoff;
68 	off_t		datablksz;
69 };
70 
71 void
72 zfs_prep_opts(fsinfo_t *fsopts)
73 {
74 	zfs_opt_t *zfs;
75 	size_t align;
76 
77 	align = alignof(uint64_t);
78 	zfs = aligned_alloc(align, roundup2(sizeof(*zfs), align));
79 	if (zfs == NULL)
80 		err(1, "aligned_alloc");
81 	memset(zfs, 0, sizeof(*zfs));
82 
83 	const option_t zfs_options[] = {
84 		{ '\0', "bootfs", &zfs->bootfs, OPT_STRPTR,
85 		  0, 0, "Bootable dataset" },
86 		{ '\0', "mssize", &zfs->mssize, OPT_INT64,
87 		  MINMSSIZE, MAXMSSIZE, "Metaslab size" },
88 		{ '\0', "poolname", &zfs->poolname, OPT_STRPTR,
89 		  0, 0, "ZFS pool name" },
90 		{ '\0', "rootpath", &zfs->rootpath, OPT_STRPTR,
91 		  0, 0, "Prefix for all dataset mount points" },
92 		{ '\0', "ashift", &zfs->ashift, OPT_INT32,
93 		  MINBLOCKSHIFT, MAXBLOCKSHIFT, "ZFS pool ashift" },
94 		{ '\0', "nowarn", &zfs->nowarn, OPT_BOOL,
95 		  0, 0, "Suppress warning about experimental ZFS support" },
96 		{ .name = NULL }
97 	};
98 
99 	STAILQ_INIT(&zfs->datasetdescs);
100 
101 	fsopts->fs_specific = zfs;
102 	fsopts->fs_options = copy_opts(zfs_options);
103 }
104 
105 int
106 zfs_parse_opts(const char *option, fsinfo_t *fsopts)
107 {
108 	zfs_opt_t *zfs;
109 	struct dataset_desc *dsdesc;
110 	char buf[BUFSIZ], *opt, *val;
111 	int rv;
112 
113 	zfs = fsopts->fs_specific;
114 
115 	opt = val = estrdup(option);
116 	opt = strsep(&val, "=");
117 	if (strcmp(opt, "fs") == 0) {
118 		if (val == NULL)
119 			errx(1, "invalid filesystem parameters `%s'", option);
120 
121 		/*
122 		 * Dataset descriptions will be parsed later, in dsl_init().
123 		 * Just stash them away for now.
124 		 */
125 		dsdesc = ecalloc(1, sizeof(*dsdesc));
126 		dsdesc->params = estrdup(val);
127 		free(opt);
128 		STAILQ_INSERT_TAIL(&zfs->datasetdescs, dsdesc, next);
129 		return (1);
130 	}
131 	free(opt);
132 
133 	rv = set_option(fsopts->fs_options, option, buf, sizeof(buf));
134 	return (rv == -1 ? 0 : 1);
135 }
136 
137 static void
138 zfs_size_vdev(fsinfo_t *fsopts)
139 {
140 	zfs_opt_t *zfs;
141 	off_t asize, mssize, vdevsize, vdevsize1;
142 
143 	zfs = fsopts->fs_specific;
144 
145 	assert(fsopts->maxsize != 0);
146 	assert(zfs->ashift != 0);
147 
148 	/*
149 	 * Figure out how big the vdev should be.
150 	 */
151 	vdevsize = rounddown2(fsopts->maxsize, 1 << zfs->ashift);
152 	if (vdevsize < MINDEVSIZE)
153 		errx(1, "maximum image size is too small");
154 	if (vdevsize < fsopts->minsize || vdevsize > fsopts->maxsize) {
155 		errx(1, "image size bounds must be multiples of %d",
156 		    1 << zfs->ashift);
157 	}
158 	asize = vdevsize - VDEV_LABEL_SPACE;
159 
160 	/*
161 	 * Size metaslabs according to the following heuristic:
162 	 * - provide at least 8 metaslabs,
163 	 * - without using a metaslab size larger than 512MB.
164 	 * This approximates what OpenZFS does without being complicated.  In
165 	 * practice we expect pools to be expanded upon first use, and OpenZFS
166 	 * does not resize metaslabs in that case, so there is no right answer
167 	 * here.  In general we want to provide large metaslabs even if the
168 	 * image size is small, and 512MB is a reasonable size for pools up to
169 	 * several hundred gigabytes.
170 	 *
171 	 * The user may override this heuristic using the "-o mssize" option.
172 	 */
173 	mssize = zfs->mssize;
174 	if (mssize == 0) {
175 		mssize = MAX(MIN(asize / 8, DFLTMSSIZE), MINMSSIZE);
176 		if (!powerof2(mssize))
177 			mssize = 1l << (flsll(mssize) - 1);
178 	}
179 	if (!powerof2(mssize))
180 		errx(1, "metaslab size must be a power of 2");
181 
182 	/*
183 	 * If we have some slop left over, try to cover it by resizing the vdev,
184 	 * subject to the maxsize and minsize parameters.
185 	 */
186 	if (asize % mssize != 0) {
187 		vdevsize1 = rounddown2(asize, mssize) + VDEV_LABEL_SPACE;
188 		if (vdevsize1 < fsopts->minsize)
189 			vdevsize1 = roundup2(asize, mssize) + VDEV_LABEL_SPACE;
190 		if (vdevsize1 <= fsopts->maxsize)
191 			vdevsize = vdevsize1;
192 	}
193 	asize = vdevsize - VDEV_LABEL_SPACE;
194 
195 	zfs->asize = asize;
196 	zfs->vdevsize = vdevsize;
197 	zfs->mssize = mssize;
198 	zfs->msshift = flsll(mssize) - 1;
199 	zfs->mscount = asize / mssize;
200 }
201 
202 /*
203  * Validate options and set some default values.
204  */
205 static void
206 zfs_check_opts(fsinfo_t *fsopts)
207 {
208 	zfs_opt_t *zfs;
209 
210 	zfs = fsopts->fs_specific;
211 
212 	if (fsopts->offset != 0)
213 		errx(1, "unhandled offset option");
214 	if (fsopts->maxsize == 0)
215 		errx(1, "an image size must be specified");
216 
217 	if (zfs->poolname == NULL)
218 		errx(1, "a pool name must be specified");
219 	if (!isalpha(zfs->poolname[0]))
220 		errx(1, "the pool name must begin with a letter");
221 	for (size_t i = 0, len = strlen(zfs->poolname); i < len; i++) {
222 		if (!isalnum(zfs->poolname[i]) && zfs->poolname[i] != '_')
223 			errx(1, "invalid character '%c' in pool name",
224 			    zfs->poolname[i]);
225 	}
226 	if (strcmp(zfs->poolname, "mirror") == 0 ||
227 	    strcmp(zfs->poolname, "raidz") == 0 ||
228 	    strcmp(zfs->poolname, "draid") == 0) {
229 		errx(1, "pool name '%s' is reserved and cannot be used",
230 		    zfs->poolname);
231 	}
232 
233 	if (zfs->rootpath == NULL)
234 		easprintf(&zfs->rootpath, "/%s", zfs->poolname);
235 	if (zfs->rootpath[0] != '/')
236 		errx(1, "mountpoint `%s' must be absolute", zfs->rootpath);
237 
238 	if (zfs->ashift == 0)
239 		zfs->ashift = 12;
240 
241 	zfs_size_vdev(fsopts);
242 }
243 
244 void
245 zfs_cleanup_opts(fsinfo_t *fsopts)
246 {
247 	struct dataset_desc *d, *tmp;
248 	zfs_opt_t *zfs;
249 
250 	zfs = fsopts->fs_specific;
251 	free(zfs->rootpath);
252 	free(zfs->bootfs);
253 	free(__DECONST(void *, zfs->poolname));
254 	STAILQ_FOREACH_SAFE(d, &zfs->datasetdescs, next, tmp) {
255 		free(d->params);
256 		free(d);
257 	}
258 	free(zfs);
259 	free(fsopts->fs_options);
260 }
261 
262 static size_t
263 nvlist_size(const nvlist_t *nvl)
264 {
265 	return (sizeof(nvl->nv_header) + nvl->nv_size);
266 }
267 
268 static void
269 nvlist_copy(const nvlist_t *nvl, char *buf, size_t sz)
270 {
271 	assert(sz >= nvlist_size(nvl));
272 
273 	memcpy(buf, &nvl->nv_header, sizeof(nvl->nv_header));
274 	memcpy(buf + sizeof(nvl->nv_header), nvl->nv_data, nvl->nv_size);
275 }
276 
277 static nvlist_t *
278 pool_config_nvcreate(zfs_opt_t *zfs)
279 {
280 	nvlist_t *featuresnv, *poolnv;
281 
282 	poolnv = nvlist_create(NV_UNIQUE_NAME);
283 	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_TXG, TXG);
284 	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VERSION, SPA_VERSION);
285 	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_EXPORTED);
286 	nvlist_add_string(poolnv, ZPOOL_CONFIG_POOL_NAME, zfs->poolname);
287 	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_GUID, zfs->poolguid);
288 	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_TOP_GUID, zfs->vdevguid);
289 	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
290 	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VDEV_CHILDREN, 1);
291 
292 	featuresnv = nvlist_create(NV_UNIQUE_NAME);
293 	nvlist_add_nvlist(poolnv, ZPOOL_CONFIG_FEATURES_FOR_READ, featuresnv);
294 	nvlist_destroy(featuresnv);
295 
296 	return (poolnv);
297 }
298 
299 static nvlist_t *
300 pool_disk_vdev_config_nvcreate(zfs_opt_t *zfs)
301 {
302 	nvlist_t *diskvdevnv;
303 
304 	assert(zfs->objarrid != 0);
305 
306 	diskvdevnv = nvlist_create(NV_UNIQUE_NAME);
307 	nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
308 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASHIFT, zfs->ashift);
309 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASIZE, zfs->asize);
310 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
311 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ID, 0);
312 	nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_PATH, "/dev/null");
313 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_WHOLE_DISK, 1);
314 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
315 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_ARRAY,
316 	    zfs->objarrid);
317 	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_SHIFT,
318 	    zfs->msshift);
319 
320 	return (diskvdevnv);
321 }
322 
323 static nvlist_t *
324 pool_root_vdev_config_nvcreate(zfs_opt_t *zfs)
325 {
326 	nvlist_t *diskvdevnv, *rootvdevnv;
327 
328 	diskvdevnv = pool_disk_vdev_config_nvcreate(zfs);
329 	rootvdevnv = nvlist_create(NV_UNIQUE_NAME);
330 
331 	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_ID, 0);
332 	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_GUID, zfs->poolguid);
333 	nvlist_add_string(rootvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
334 	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
335 	nvlist_add_nvlist_array(rootvdevnv, ZPOOL_CONFIG_CHILDREN, &diskvdevnv,
336 	    1);
337 	nvlist_destroy(diskvdevnv);
338 
339 	return (rootvdevnv);
340 }
341 
342 /*
343  * Create the pool's "config" object, which contains an nvlist describing pool
344  * parameters and the vdev topology.  It is similar but not identical to the
345  * nvlist stored in vdev labels.  The main difference is that vdev labels do not
346  * describe the full vdev tree and in particular do not contain the "root"
347  * meta-vdev.
348  */
349 static void
350 pool_init_objdir_config(zfs_opt_t *zfs, zfs_zap_t *objdir)
351 {
352 	dnode_phys_t *dnode;
353 	nvlist_t *poolconfig, *vdevconfig;
354 	void *configbuf;
355 	uint64_t dnid;
356 	off_t configloc, configblksz;
357 	int error;
358 
359 	dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_PACKED_NVLIST,
360 	    DMU_OT_PACKED_NVLIST_SIZE, sizeof(uint64_t), &dnid);
361 
362 	poolconfig = pool_config_nvcreate(zfs);
363 
364 	vdevconfig = pool_root_vdev_config_nvcreate(zfs);
365 	nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
366 	nvlist_destroy(vdevconfig);
367 
368 	error = nvlist_export(poolconfig);
369 	if (error != 0)
370 		errc(1, error, "nvlist_export");
371 
372 	configblksz = nvlist_size(poolconfig);
373 	configloc = objset_space_alloc(zfs, zfs->mos, &configblksz);
374 	configbuf = ecalloc(1, configblksz);
375 	nvlist_copy(poolconfig, configbuf, configblksz);
376 
377 	vdev_pwrite_dnode_data(zfs, dnode, configbuf, configblksz, configloc);
378 
379 	dnode->dn_datablkszsec = configblksz >> MINBLOCKSHIFT;
380 	dnode->dn_flags = DNODE_FLAG_USED_BYTES;
381 	*(uint64_t *)DN_BONUS(dnode) = nvlist_size(poolconfig);
382 
383 	zap_add_uint64(objdir, DMU_POOL_CONFIG, dnid);
384 
385 	nvlist_destroy(poolconfig);
386 	free(configbuf);
387 }
388 
389 /*
390  * Add objects block pointer list objects, used for deferred frees.  We don't do
391  * anything with them, but they need to be present or OpenZFS will refuse to
392  * import the pool.
393  */
394 static void
395 pool_init_objdir_bplists(zfs_opt_t *zfs __unused, zfs_zap_t *objdir)
396 {
397 	uint64_t dnid;
398 
399 	(void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
400 	    BPOBJ_SIZE_V2, &dnid);
401 	zap_add_uint64(objdir, DMU_POOL_FREE_BPOBJ, dnid);
402 
403 	(void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
404 	    BPOBJ_SIZE_V2, &dnid);
405 	zap_add_uint64(objdir, DMU_POOL_SYNC_BPLIST, dnid);
406 }
407 
408 /*
409  * Add required feature metadata objects.  We don't know anything about ZFS
410  * features, so the objects are just empty ZAPs.
411  */
412 static void
413 pool_init_objdir_feature_maps(zfs_opt_t *zfs, zfs_zap_t *objdir)
414 {
415 	dnode_phys_t *dnode;
416 	uint64_t dnid;
417 
418 	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
419 	zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_READ, dnid);
420 	zap_write(zfs, zap_alloc(zfs->mos, dnode));
421 
422 	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
423 	zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_WRITE, dnid);
424 	zap_write(zfs, zap_alloc(zfs->mos, dnode));
425 
426 	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
427 	zap_add_uint64(objdir, DMU_POOL_FEATURE_DESCRIPTIONS, dnid);
428 	zap_write(zfs, zap_alloc(zfs->mos, dnode));
429 }
430 
431 static void
432 pool_init_objdir_dsl(zfs_opt_t *zfs, zfs_zap_t *objdir)
433 {
434 	zap_add_uint64(objdir, DMU_POOL_ROOT_DATASET,
435 	    dsl_dir_id(zfs->rootdsldir));
436 }
437 
438 static void
439 pool_init_objdir_poolprops(zfs_opt_t *zfs, zfs_zap_t *objdir)
440 {
441 	dnode_phys_t *dnode;
442 	uint64_t id;
443 
444 	dnode = objset_dnode_alloc(zfs->mos, DMU_OT_POOL_PROPS, &id);
445 	zap_add_uint64(objdir, DMU_POOL_PROPS, id);
446 
447 	zfs->poolprops = zap_alloc(zfs->mos, dnode);
448 }
449 
450 /*
451  * Initialize the MOS object directory, the root of virtually all of the pool's
452  * data and metadata.
453  */
454 static void
455 pool_init_objdir(zfs_opt_t *zfs)
456 {
457 	zfs_zap_t *zap;
458 	dnode_phys_t *objdir;
459 
460 	objdir = objset_dnode_lookup(zfs->mos, DMU_POOL_DIRECTORY_OBJECT);
461 
462 	zap = zap_alloc(zfs->mos, objdir);
463 	pool_init_objdir_config(zfs, zap);
464 	pool_init_objdir_bplists(zfs, zap);
465 	pool_init_objdir_feature_maps(zfs, zap);
466 	pool_init_objdir_dsl(zfs, zap);
467 	pool_init_objdir_poolprops(zfs, zap);
468 	zap_write(zfs, zap);
469 }
470 
471 /*
472  * Initialize the meta-object set (MOS) and immediately write out several
473  * special objects whose contents are already finalized, including the object
474  * directory.
475  *
476  * Once the MOS is finalized, it'll look roughly like this:
477  *
478  *	object directory (ZAP)
479  *	|-> vdev config object (nvlist)
480  *	|-> features for read
481  *	|-> features for write
482  *	|-> feature descriptions
483  *	|-> sync bplist
484  *	|-> free bplist
485  *	|-> pool properties
486  *	L-> root DSL directory
487  *	    |-> DSL child directory (ZAP)
488  *	    |   |-> $MOS (DSL dir)
489  *	    |   |   |-> child map
490  *	    |   |   L-> props (ZAP)
491  *	    |   |-> $FREE (DSL dir)
492  *	    |   |   |-> child map
493  *	    |   |   L-> props (ZAP)
494  *	    |   |-> $ORIGIN (DSL dir)
495  *	    |   |   |-> child map
496  *	    |   |   |-> dataset
497  *	    |   |   |   L-> deadlist
498  *	    |   |   |-> snapshot
499  *	    |   |   |   |-> deadlist
500  *	    |   |   |   L-> snapshot names
501  *	    |   |   |-> props (ZAP)
502  *	    |   |   L-> clones (ZAP)
503  *	    |   |-> dataset 1 (DSL dir)
504  *	    |   |   |-> DSL dataset
505  *	    |   |   |   |-> snapshot names
506  *	    |   |   |   L-> deadlist
507  *	    |   |   |-> child map
508  *	    |   |   |   L-> ...
509  *	    |   |   L-> props
510  *	    |   |-> dataset 2
511  *	    |   |   L-> ...
512  *	    |   |-> ...
513  *	    |   L-> dataset n
514  *	    |-> DSL root dataset
515  *	    |   |-> snapshot names
516  *	    |   L-> deadlist
517  *	    L-> props (ZAP)
518  *	space map object array
519  *	|-> space map 1
520  *	|-> space map 2
521  *	|-> ...
522  *	L-> space map n (zfs->mscount)
523  *
524  * The space map object array is pointed to by the "msarray" property in the
525  * pool configuration.
526  */
527 static void
528 pool_init(zfs_opt_t *zfs)
529 {
530 	uint64_t dnid;
531 
532 	zfs->poolguid = ((uint64_t)random() << 32) | random();
533 	zfs->vdevguid = ((uint64_t)random() << 32) | random();
534 
535 	zfs->mos = objset_alloc(zfs, DMU_OST_META);
536 
537 	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_DIRECTORY, &dnid);
538 	assert(dnid == DMU_POOL_DIRECTORY_OBJECT);
539 
540 	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_ARRAY, &zfs->objarrid);
541 
542 	dsl_init(zfs);
543 
544 	pool_init_objdir(zfs);
545 }
546 
547 static void
548 pool_labels_write(zfs_opt_t *zfs)
549 {
550 	uberblock_t *ub;
551 	vdev_label_t *label;
552 	nvlist_t *poolconfig, *vdevconfig;
553 	int error;
554 
555 	label = ecalloc(1, sizeof(*label));
556 
557 	/*
558 	 * Assemble the vdev configuration and store it in the label.
559 	 */
560 	poolconfig = pool_config_nvcreate(zfs);
561 	vdevconfig = pool_disk_vdev_config_nvcreate(zfs);
562 	nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
563 	nvlist_destroy(vdevconfig);
564 
565 	error = nvlist_export(poolconfig);
566 	if (error != 0)
567 		errc(1, error, "nvlist_export");
568 	nvlist_copy(poolconfig, label->vl_vdev_phys.vp_nvlist,
569 	    sizeof(label->vl_vdev_phys.vp_nvlist));
570 	nvlist_destroy(poolconfig);
571 
572 	/*
573 	 * Fill out the uberblock.  Just make each one the same.  The embedded
574 	 * checksum is calculated in vdev_label_write().
575 	 */
576 	for (size_t uoff = 0; uoff < sizeof(label->vl_uberblock);
577 	    uoff += (1 << zfs->ashift)) {
578 		ub = (uberblock_t *)(&label->vl_uberblock[0] + uoff);
579 		ub->ub_magic = UBERBLOCK_MAGIC;
580 		ub->ub_version = SPA_VERSION;
581 		ub->ub_txg = TXG;
582 		ub->ub_guid_sum = zfs->poolguid + zfs->vdevguid;
583 		ub->ub_timestamp = 0;
584 
585 		ub->ub_software_version = SPA_VERSION;
586 		ub->ub_mmp_magic = MMP_MAGIC;
587 		ub->ub_mmp_delay = 0;
588 		ub->ub_mmp_config = 0;
589 		ub->ub_checkpoint_txg = 0;
590 		objset_root_blkptr_copy(zfs->mos, &ub->ub_rootbp);
591 	}
592 
593 	/*
594 	 * Write out four copies of the label: two at the beginning of the vdev
595 	 * and two at the end.
596 	 */
597 	for (int i = 0; i < VDEV_LABELS; i++)
598 		vdev_label_write(zfs, i, label);
599 
600 	free(label);
601 }
602 
603 static void
604 pool_fini(zfs_opt_t *zfs)
605 {
606 	zap_write(zfs, zfs->poolprops);
607 	dsl_write(zfs);
608 	objset_write(zfs, zfs->mos);
609 	pool_labels_write(zfs);
610 }
611 
612 struct dnode_cursor *
613 dnode_cursor_init(zfs_opt_t *zfs, zfs_objset_t *os, dnode_phys_t *dnode,
614     off_t size, off_t blksz)
615 {
616 	struct dnode_cursor *c;
617 	uint64_t nbppindir, indlevel, ndatablks, nindblks;
618 
619 	assert(dnode->dn_nblkptr == 1);
620 	assert(blksz <= MAXBLOCKSIZE);
621 
622 	if (blksz == 0) {
623 		/* Must be between 1<<ashift and 128KB. */
624 		blksz = MIN(MAXBLOCKSIZE, MAX(1 << zfs->ashift,
625 		    powerof2(size) ? size : (1l << flsll(size))));
626 	}
627 	assert(powerof2(blksz));
628 
629 	/*
630 	 * Do we need indirect blocks?  Figure out how many levels are needed
631 	 * (indlevel == 1 means no indirect blocks) and how much space is needed
632 	 * (it has to be allocated up-front to break the dependency cycle
633 	 * described in objset_write()).
634 	 */
635 	ndatablks = size == 0 ? 0 : howmany(size, blksz);
636 	nindblks = 0;
637 	for (indlevel = 1, nbppindir = 1; ndatablks > nbppindir; indlevel++) {
638 		nbppindir *= BLKPTR_PER_INDIR;
639 		nindblks += howmany(ndatablks, indlevel * nbppindir);
640 	}
641 	assert(indlevel < INDIR_LEVELS);
642 
643 	dnode->dn_nlevels = (uint8_t)indlevel;
644 	dnode->dn_maxblkid = ndatablks > 0 ? ndatablks - 1 : 0;
645 	dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;
646 
647 	c = ecalloc(1, sizeof(*c));
648 	if (nindblks > 0) {
649 		c->indspace = nindblks * MAXBLOCKSIZE;
650 		c->indloc = objset_space_alloc(zfs, os, &c->indspace);
651 	}
652 	c->dnode = dnode;
653 	c->dataoff = 0;
654 	c->datablksz = blksz;
655 
656 	return (c);
657 }
658 
659 static void
660 _dnode_cursor_flush(zfs_opt_t *zfs, struct dnode_cursor *c, int levels)
661 {
662 	blkptr_t *bp, *pbp;
663 	void *buf;
664 	uint64_t fill;
665 	off_t blkid, blksz, loc;
666 
667 	assert(levels > 0);
668 	assert(levels <= c->dnode->dn_nlevels - 1);
669 
670 	blksz = MAXBLOCKSIZE;
671 	blkid = (c->dataoff / c->datablksz) / BLKPTR_PER_INDIR;
672 	for (int level = 1; level <= levels; level++) {
673 		buf = c->inddir[level - 1];
674 
675 		if (level == c->dnode->dn_nlevels - 1) {
676 			pbp = &c->dnode->dn_blkptr[0];
677 		} else {
678 			uint64_t iblkid;
679 
680 			iblkid = blkid & (BLKPTR_PER_INDIR - 1);
681 			pbp = (blkptr_t *)
682 			    &c->inddir[level][iblkid * sizeof(blkptr_t)];
683 		}
684 
685 		/*
686 		 * Space for indirect blocks is allocated up-front; see the
687 		 * comment in objset_write().
688 		 */
689 		loc = c->indloc;
690 		c->indloc += blksz;
691 		assert(c->indspace >= blksz);
692 		c->indspace -= blksz;
693 
694 		bp = buf;
695 		fill = 0;
696 		for (size_t i = 0; i < BLKPTR_PER_INDIR; i++)
697 			fill += BP_GET_FILL(&bp[i]);
698 
699 		vdev_pwrite_dnode_indir(zfs, c->dnode, level, fill, buf, blksz,
700 		    loc, pbp);
701 		memset(buf, 0, MAXBLOCKSIZE);
702 
703 		blkid /= BLKPTR_PER_INDIR;
704 	}
705 }
706 
707 blkptr_t *
708 dnode_cursor_next(zfs_opt_t *zfs, struct dnode_cursor *c, off_t off)
709 {
710 	off_t blkid, l1id;
711 	int levels;
712 
713 	if (c->dnode->dn_nlevels == 1) {
714 		assert(off < MAXBLOCKSIZE);
715 		return (&c->dnode->dn_blkptr[0]);
716 	}
717 
718 	assert(off % c->datablksz == 0);
719 
720 	/* Do we need to flush any full indirect blocks? */
721 	if (off > 0) {
722 		blkid = off / c->datablksz;
723 		for (levels = 0; levels < c->dnode->dn_nlevels - 1; levels++) {
724 			if (blkid % BLKPTR_PER_INDIR != 0)
725 				break;
726 			blkid /= BLKPTR_PER_INDIR;
727 		}
728 		if (levels > 0)
729 			_dnode_cursor_flush(zfs, c, levels);
730 	}
731 
732 	c->dataoff = off;
733 	l1id = (off / c->datablksz) & (BLKPTR_PER_INDIR - 1);
734 	return ((blkptr_t *)&c->inddir[0][l1id * sizeof(blkptr_t)]);
735 }
736 
737 void
738 dnode_cursor_finish(zfs_opt_t *zfs, struct dnode_cursor *c)
739 {
740 	int levels;
741 
742 	levels = c->dnode->dn_nlevels - 1;
743 	if (levels > 0)
744 		_dnode_cursor_flush(zfs, c, levels);
745 	assert(c->indspace == 0);
746 	free(c);
747 }
748 
749 void
750 zfs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts)
751 {
752 	zfs_opt_t *zfs;
753 	int dirfd;
754 
755 	zfs = fsopts->fs_specific;
756 
757 	/*
758 	 * Use a fixed seed to provide reproducible pseudo-random numbers for
759 	 * on-disk structures when needed (e.g., GUIDs, ZAP hash salts).
760 	 */
761 	srandom(1729);
762 
763 	zfs_check_opts(fsopts);
764 
765 	if (!zfs->nowarn) {
766 		fprintf(stderr,
767 		    "ZFS support is currently considered experimental. "
768 		    "Do not use it for anything critical.\n");
769 	}
770 
771 	dirfd = open(dir, O_DIRECTORY | O_RDONLY);
772 	if (dirfd < 0)
773 		err(1, "open(%s)", dir);
774 
775 	vdev_init(zfs, image);
776 	pool_init(zfs);
777 	fs_build(zfs, dirfd, root);
778 	pool_fini(zfs);
779 	vdev_fini(zfs);
780 }
781