/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2022 The FreeBSD Foundation
 *
 * This software was developed by Mark Johnston under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/queue.h>

#include <assert.h>
#include <ctype.h>
#include <fcntl.h>
#include <stdalign.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <util.h>

#include "makefs.h"
#include "zfs.h"

#define	VDEV_LABEL_SPACE	\
	((off_t)(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE))
_Static_assert(VDEV_LABEL_SPACE <= MINDEVSIZE, "");

#define	MINMSSIZE	((off_t)1 << 24)	/* 16MB */
#define	DFLTMSSIZE	((off_t)1 << 29)	/* 512MB */
#define	MAXMSSIZE	((off_t)1 << 34)	/* 16GB */

#define	INDIR_LEVELS	6
/* Indirect blocks are always 128KB. */
#define	BLKPTR_PER_INDIR	(MAXBLOCKSIZE / sizeof(blkptr_t))

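/*
 * A cursor used to write out a dnode's data blocks in order of increasing
 * offset.  It caches one pending indirect block per level and the vdev space
 * preallocated for indirect blocks, flushing each indirect block once all of
 * its block pointers have been filled in.
 */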
struct dnode_cursor {
	char		inddir[INDIR_LEVELS][MAXBLOCKSIZE];
	off_t		indloc;
	off_t		indspace;
	dnode_phys_t	*dnode;
	off_t		dataoff;
	off_t		datablksz;
};

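/*
 * Allocate and zero the ZFS-specific option state and register the option
 * table with makefs.
 */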
void
zfs_prep_opts(fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	size_t align;

	align = alignof(uint64_t);
	zfs = aligned_alloc(align, roundup2(sizeof(*zfs), align));
	if (zfs == NULL)
		err(1, "aligned_alloc");
	memset(zfs, 0, sizeof(*zfs));

	const option_t zfs_options[] = {
		{ '\0', "bootfs", &zfs->bootfs, OPT_STRPTR,
		  0, 0, "Bootable dataset" },
		{ '\0', "mssize", &zfs->mssize, OPT_INT64,
		  MINMSSIZE, MAXMSSIZE, "Metaslab size" },
		{ '\0', "poolguid", &zfs->poolguid, OPT_INT64,
		  0, INT64_MAX, "ZFS pool GUID" },
		{ '\0', "poolname", &zfs->poolname, OPT_STRPTR,
		  0, 0, "ZFS pool name" },
		{ '\0', "rootpath", &zfs->rootpath, OPT_STRPTR,
		  0, 0, "Prefix for all dataset mount points" },
		{ '\0', "ashift", &zfs->ashift, OPT_INT32,
		  MINBLOCKSHIFT, MAXBLOCKSHIFT, "ZFS pool ashift" },
		{ '\0', "verify-txgs", &zfs->verify_txgs, OPT_BOOL,
		  0, 0, "Make OpenZFS verify data upon import" },
		{ '\0', "nowarn", &zfs->nowarn, OPT_BOOL,
		  0, 0, "Provided for backwards compatibility, ignored" },
		{ .name = NULL }
	};

	STAILQ_INIT(&zfs->datasetdescs);

	fsopts->fs_specific = zfs;
	fsopts->fs_options = copy_opts(zfs_options);
}

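/*
 * Parse a single -o option.  "fs=<params>" arguments are only queued here and
 * parsed later, in dsl_init(); everything else is handled by the generic
 * option table.
 */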
int
zfs_parse_opts(const char *option, fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	struct dataset_desc *dsdesc;
	char buf[BUFSIZ], *opt, *val;
	int rv;

	zfs = fsopts->fs_specific;

	opt = val = estrdup(option);
	opt = strsep(&val, "=");
	if (strcmp(opt, "fs") == 0) {
		if (val == NULL)
			errx(1, "invalid filesystem parameters `%s'", option);

		/*
		 * Dataset descriptions will be parsed later, in dsl_init().
		 * Just stash them away for now.
		 */
		dsdesc = ecalloc(1, sizeof(*dsdesc));
		dsdesc->params = estrdup(val);
		free(opt);
		STAILQ_INSERT_TAIL(&zfs->datasetdescs, dsdesc, next);
		return (1);
	}
	free(opt);

	rv = set_option(fsopts->fs_options, option, buf, sizeof(buf));
	return (rv == -1 ? 0 : 1);
}

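/*
 * Derive the vdev geometry from the image size constraints: the vdev size,
 * the allocatable size (asize), and the metaslab size, shift and count.
 */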
static void
zfs_size_vdev(fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	off_t asize, mssize, vdevsize, vdevsize1;

	zfs = fsopts->fs_specific;

	assert(fsopts->maxsize != 0);
	assert(zfs->ashift != 0);

	/*
	 * Figure out how big the vdev should be.
	 */
	vdevsize = rounddown2(fsopts->maxsize, 1 << zfs->ashift);
	if (vdevsize < MINDEVSIZE)
		errx(1, "maximum image size is too small");
	if (vdevsize < fsopts->minsize || vdevsize > fsopts->maxsize) {
		errx(1, "image size bounds must be multiples of %d",
		    1 << zfs->ashift);
	}
	asize = vdevsize - VDEV_LABEL_SPACE;

	/*
	 * Size metaslabs according to the following heuristic:
	 * - provide at least 8 metaslabs,
	 * - without using a metaslab size larger than 512MB.
	 * This approximates what OpenZFS does without being complicated.  In
	 * practice we expect pools to be expanded upon first use, and OpenZFS
	 * does not resize metaslabs in that case, so there is no right answer
	 * here.  In general we want to provide large metaslabs even if the
	 * image size is small, and 512MB is a reasonable size for pools up to
	 * several hundred gigabytes.
	 *
	 * The user may override this heuristic using the "-o mssize" option.
	 */
	mssize = zfs->mssize;
	if (mssize == 0) {
		mssize = MAX(MIN(asize / 8, DFLTMSSIZE), MINMSSIZE);
		if (!powerof2(mssize))
			mssize = 1l << (flsll(mssize) - 1);
	}
	if (!powerof2(mssize))
		errx(1, "metaslab size must be a power of 2");

	/*
	 * If we have some slop left over, try to cover it by resizing the
	 * vdev, subject to the maxsize and minsize parameters.
	 */
	if (asize % mssize != 0) {
		vdevsize1 = rounddown2(asize, mssize) + VDEV_LABEL_SPACE;
		if (vdevsize1 < fsopts->minsize)
			vdevsize1 = roundup2(asize, mssize) + VDEV_LABEL_SPACE;
		if (vdevsize1 <= fsopts->maxsize)
			vdevsize = vdevsize1;
	}
	asize = vdevsize - VDEV_LABEL_SPACE;

	zfs->asize = asize;
	zfs->vdevsize = vdevsize;
	zfs->mssize = mssize;
	zfs->msshift = flsll(mssize) - 1;
	zfs->mscount = asize / mssize;
}

/*
 * Validate options and set some default values.
 */
static void
zfs_check_opts(fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;

	zfs = fsopts->fs_specific;

	if (fsopts->offset != 0)
		errx(1, "unhandled offset option");
	if (fsopts->maxsize == 0)
		errx(1, "an image size must be specified");

	if (zfs->poolname == NULL)
		errx(1, "a pool name must be specified");
	if (!isalpha(zfs->poolname[0]))
		errx(1, "the pool name must begin with a letter");
	for (size_t i = 0, len = strlen(zfs->poolname); i < len; i++) {
		if (!isalnum(zfs->poolname[i]) && zfs->poolname[i] != '_')
			errx(1, "invalid character '%c' in pool name",
			    zfs->poolname[i]);
	}
	if (strcmp(zfs->poolname, "mirror") == 0 ||
	    strcmp(zfs->poolname, "raidz") == 0 ||
	    strcmp(zfs->poolname, "draid") == 0) {
		errx(1, "pool name '%s' is reserved and cannot be used",
		    zfs->poolname);
	}

	if (zfs->rootpath == NULL)
		easprintf(&zfs->rootpath, "/%s", zfs->poolname);
	if (zfs->rootpath[0] != '/')
		errx(1, "mountpoint `%s' must be absolute", zfs->rootpath);

	if (zfs->ashift == 0)
		zfs->ashift = 12;

	zfs_size_vdev(fsopts);
}

void
zfs_cleanup_opts(fsinfo_t *fsopts)
{
	struct dataset_desc *d, *tmp;
	zfs_opt_t *zfs;

	zfs = fsopts->fs_specific;
	free(zfs->rootpath);
	free(zfs->bootfs);
	free(__DECONST(void *, zfs->poolname));
	STAILQ_FOREACH_SAFE(d, &zfs->datasetdescs, next, tmp) {
		free(d->params);
		free(d);
	}
	free(zfs);
	free(fsopts->fs_options);
}

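/*
 * Return the number of bytes needed to store a packed nvlist, including its
 * header.
 */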
static size_t
nvlist_size(const nvlist_t *nvl)
{
	return (sizeof(nvl->nv_header) + nvl->nv_size);
}

static void
nvlist_copy(const nvlist_t *nvl, char *buf, size_t sz)
{
	assert(sz >= nvlist_size(nvl));

	memcpy(buf, &nvl->nv_header, sizeof(nvl->nv_header));
	memcpy(buf + sizeof(nvl->nv_header), nvl->nv_data, nvl->nv_size);
}

/*
 * Avoid returning a GUID of 0, just to avoid the possibility that something
 * will interpret that as meaning that the GUID is uninitialized.
 */
uint64_t
randomguid(void)
{
	uint64_t ret;

	do {
		ret = ((uint64_t)random() << 32) | random();
	} while (ret == 0);

	return (ret);
}

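/*
 * Build the base pool configuration nvlist, shared by the MOS config object
 * and the vdev labels.  The caller adds the vdev tree.
 */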
static nvlist_t *
pool_config_nvcreate(zfs_opt_t *zfs)
{
	nvlist_t *featuresnv, *poolnv;

	poolnv = nvlist_create(NV_UNIQUE_NAME);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_TXG, TXG);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VERSION, SPA_VERSION);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_EXPORTED);
	nvlist_add_string(poolnv, ZPOOL_CONFIG_POOL_NAME, zfs->poolname);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_GUID, zfs->poolguid);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_TOP_GUID, zfs->vdevguid);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VDEV_CHILDREN, 1);

	featuresnv = nvlist_create(NV_UNIQUE_NAME);
	nvlist_add_nvlist(poolnv, ZPOOL_CONFIG_FEATURES_FOR_READ, featuresnv);
	nvlist_destroy(featuresnv);

	return (poolnv);
}

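/*
 * Build the config nvlist for the pool's single disk vdev.
 */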
static nvlist_t *
pool_disk_vdev_config_nvcreate(zfs_opt_t *zfs)
{
	nvlist_t *diskvdevnv;

	assert(zfs->objarrid != 0);

	diskvdevnv = nvlist_create(NV_UNIQUE_NAME);
	nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASHIFT, zfs->ashift);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASIZE, zfs->asize);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ID, 0);
	nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_PATH, "/dev/null");
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_WHOLE_DISK, 1);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_ARRAY,
	    zfs->objarrid);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_SHIFT,
	    zfs->msshift);

	return (diskvdevnv);
}

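/*
 * Wrap the disk vdev config in a root vdev nvlist, yielding the full vdev
 * tree stored in the MOS config object.
 */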
static nvlist_t *
pool_root_vdev_config_nvcreate(zfs_opt_t *zfs)
{
	nvlist_t *diskvdevnv, *rootvdevnv;

	diskvdevnv = pool_disk_vdev_config_nvcreate(zfs);
	rootvdevnv = nvlist_create(NV_UNIQUE_NAME);

	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_ID, 0);
	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_GUID, zfs->poolguid);
	nvlist_add_string(rootvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
	nvlist_add_nvlist_array(rootvdevnv, ZPOOL_CONFIG_CHILDREN, &diskvdevnv,
	    1);
	nvlist_destroy(diskvdevnv);

	return (rootvdevnv);
}

/*
 * Create the pool's "config" object, which contains an nvlist describing pool
 * parameters and the vdev topology.  It is similar but not identical to the
 * nvlist stored in vdev labels.  The main difference is that vdev labels do
 * not describe the full vdev tree and in particular do not contain the "root"
 * meta-vdev.
 */
static void
pool_init_objdir_config(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	dnode_phys_t *dnode;
	nvlist_t *poolconfig, *vdevconfig;
	void *configbuf;
	uint64_t dnid;
	off_t configloc, configblksz;
	int error;

	dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_PACKED_NVLIST,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof(uint64_t), &dnid);

	poolconfig = pool_config_nvcreate(zfs);

	vdevconfig = pool_root_vdev_config_nvcreate(zfs);
	nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
	nvlist_destroy(vdevconfig);

	error = nvlist_export(poolconfig);
	if (error != 0)
		errc(1, error, "nvlist_export");

	configblksz = nvlist_size(poolconfig);
	configloc = objset_space_alloc(zfs, zfs->mos, &configblksz);
	configbuf = ecalloc(1, configblksz);
	nvlist_copy(poolconfig, configbuf, configblksz);

	vdev_pwrite_dnode_data(zfs, dnode, configbuf, configblksz, configloc);

	dnode->dn_datablkszsec = configblksz >> MINBLOCKSHIFT;
	dnode->dn_flags = DNODE_FLAG_USED_BYTES;
	*(uint64_t *)DN_BONUS(dnode) = nvlist_size(poolconfig);

	zap_add_uint64(objdir, DMU_POOL_CONFIG, dnid);

	nvlist_destroy(poolconfig);
	free(configbuf);
}

/*
 * Add the block pointer list objects used for deferred frees.  We don't do
 * anything with them, but they need to be present or OpenZFS will refuse to
 * import the pool.
 */
static void
pool_init_objdir_bplists(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	uint64_t dnid;

	(void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
	    BPOBJ_SIZE_V2, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FREE_BPOBJ, dnid);

	(void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
	    BPOBJ_SIZE_V2, &dnid);
	zap_add_uint64(objdir, DMU_POOL_SYNC_BPLIST, dnid);
}

/*
 * Add required feature metadata objects.  We don't know anything about ZFS
 * features, so the objects are just empty ZAPs.
 */
static void
pool_init_objdir_feature_maps(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	dnode_phys_t *dnode;
	uint64_t dnid;

	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_READ, dnid);
	zap_write(zfs, zap_alloc(zfs->mos, dnode));

	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_WRITE, dnid);
	zap_write(zfs, zap_alloc(zfs->mos, dnode));

	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FEATURE_DESCRIPTIONS, dnid);
	zap_write(zfs, zap_alloc(zfs->mos, dnode));
}

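/*
 * Record the root DSL directory's object ID in the object directory.
 */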
static void
pool_init_objdir_dsl(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	zap_add_uint64(objdir, DMU_POOL_ROOT_DATASET,
	    dsl_dir_id(zfs->rootdsldir));
}

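/*
 * Create the pool properties ZAP, which is written out later, in pool_fini().
 */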
static void
pool_init_objdir_poolprops(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	dnode_phys_t *dnode;
	uint64_t id;

	dnode = objset_dnode_alloc(zfs->mos, DMU_OT_POOL_PROPS, &id);
	zap_add_uint64(objdir, DMU_POOL_PROPS, id);

	zfs->poolprops = zap_alloc(zfs->mos, dnode);
}

/*
 * Initialize the MOS object directory, the root of virtually all of the
 * pool's data and metadata.
 */
static void
pool_init_objdir(zfs_opt_t *zfs)
{
	zfs_zap_t *zap;
	dnode_phys_t *objdir;

	objdir = objset_dnode_lookup(zfs->mos, DMU_POOL_DIRECTORY_OBJECT);

	zap = zap_alloc(zfs->mos, objdir);
	pool_init_objdir_config(zfs, zap);
	pool_init_objdir_bplists(zfs, zap);
	pool_init_objdir_feature_maps(zfs, zap);
	pool_init_objdir_dsl(zfs, zap);
	pool_init_objdir_poolprops(zfs, zap);
	zap_write(zfs, zap);
}

/*
 * Initialize the meta-object set (MOS) and immediately write out several
 * special objects whose contents are already finalized, including the object
 * directory.
 *
 * Once the MOS is finalized, it'll look roughly like this:
 *
 *	object directory (ZAP)
 *	|-> vdev config object (nvlist)
 *	|-> features for read
 *	|-> features for write
 *	|-> feature descriptions
 *	|-> sync bplist
 *	|-> free bplist
 *	|-> pool properties
 *	L-> root DSL directory
 *	    |-> DSL child directory (ZAP)
 *	    |   |-> $MOS (DSL dir)
 *	    |   |   |-> child map
 *	    |   |   L-> props (ZAP)
 *	    |   |-> $FREE (DSL dir)
 *	    |   |   |-> child map
 *	    |   |   L-> props (ZAP)
 *	    |   |-> $ORIGIN (DSL dir)
 *	    |   |   |-> child map
 *	    |   |   |-> dataset
 *	    |   |   |   L-> deadlist
 *	    |   |   |-> snapshot
 *	    |   |   |   |-> deadlist
 *	    |   |   |   L-> snapshot names
 *	    |   |   |-> props (ZAP)
 *	    |   |   L-> clones (ZAP)
 *	    |   |-> dataset 1 (DSL dir)
 *	    |   |   |-> DSL dataset
 *	    |   |   |   |-> snapshot names
 *	    |   |   |   L-> deadlist
 *	    |   |   |-> child map
 *	    |   |   |   L-> ...
 *	    |   |   L-> props
 *	    |   |-> dataset 2
 *	    |   |   L-> ...
 *	    |   |-> ...
 *	    |   L-> dataset n
 *	    |-> DSL root dataset
 *	    |   |-> snapshot names
 *	    |   L-> deadlist
 *	    L-> props (ZAP)
 *	space map object array
 *	|-> space map 1
 *	|-> space map 2
 *	|-> ...
 *	L-> space map n (zfs->mscount)
 *
 * The space map object array is pointed to by the "metaslab_array" property
 * (ZPOOL_CONFIG_METASLAB_ARRAY) in the pool configuration.
 */
static void
pool_init(zfs_opt_t *zfs)
{
	uint64_t dnid;

	if (zfs->poolguid == 0)
		zfs->poolguid = randomguid();
	zfs->vdevguid = randomguid();

	zfs->mos = objset_alloc(zfs, DMU_OST_META);

	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_DIRECTORY, &dnid);
	assert(dnid == DMU_POOL_DIRECTORY_OBJECT);

	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_ARRAY, &zfs->objarrid);

	dsl_init(zfs);

	pool_init_objdir(zfs);
}

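/*
 * Assemble the vdev label, containing the packed vdev config nvlist and the
 * uberblock ring, and write a copy to each of the label locations.
 */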
static void
pool_labels_write(zfs_opt_t *zfs)
{
	uberblock_t *ub;
	vdev_label_t *label;
	nvlist_t *poolconfig, *vdevconfig;
	int error;

	label = ecalloc(1, sizeof(*label));

	/*
	 * Assemble the vdev configuration and store it in the label.
	 */
	poolconfig = pool_config_nvcreate(zfs);
	vdevconfig = pool_disk_vdev_config_nvcreate(zfs);
	nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
	nvlist_destroy(vdevconfig);

	error = nvlist_export(poolconfig);
	if (error != 0)
		errc(1, error, "nvlist_export");
	nvlist_copy(poolconfig, label->vl_vdev_phys.vp_nvlist,
	    sizeof(label->vl_vdev_phys.vp_nvlist));
	nvlist_destroy(poolconfig);

	/*
	 * Fill out the uberblock.  Just make each one the same.  The embedded
	 * checksum is calculated in vdev_label_write().
	 */
	for (size_t uoff = 0; uoff < sizeof(label->vl_uberblock);
	    uoff += (1 << zfs->ashift)) {
		ub = (uberblock_t *)(&label->vl_uberblock[0] + uoff);
		ub->ub_magic = UBERBLOCK_MAGIC;
		ub->ub_version = SPA_VERSION;

		/*
		 * Upon import, OpenZFS will perform metadata verification of
		 * the last TXG by default.  If all data is written in the same
		 * TXG, it'll all get verified, which can be painfully slow in
		 * some cases, e.g., initial boot in a cloud environment with
		 * slow storage.  So, fabricate additional TXGs to avoid this
		 * overhead, unless the user requests otherwise.
		 */
		ub->ub_txg = TXG;
		if (!zfs->verify_txgs)
			ub->ub_txg += TXG_SIZE;
		ub->ub_guid_sum = zfs->poolguid + zfs->vdevguid;
		ub->ub_timestamp = 0;

		ub->ub_software_version = SPA_VERSION;
		ub->ub_mmp_magic = MMP_MAGIC;
		ub->ub_mmp_delay = 0;
		ub->ub_mmp_config = 0;
		ub->ub_checkpoint_txg = 0;
		objset_root_blkptr_copy(zfs->mos, &ub->ub_rootbp);
	}

	/*
	 * Write out four copies of the label: two at the beginning of the
	 * vdev and two at the end.
	 */
	for (int i = 0; i < VDEV_LABELS; i++)
		vdev_label_write(zfs, i, label);

	free(label);
}

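/*
 * Write out the remaining pool metadata: pool properties, the DSL layer, the
 * MOS itself, and finally the vdev labels.
 */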
static void
pool_fini(zfs_opt_t *zfs)
{
	zap_write(zfs, zfs->poolprops);
	dsl_write(zfs);
	objset_write(zfs, zfs->mos);
	pool_labels_write(zfs);
}

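/*
 * Set up a cursor for writing a dnode's data blocks.  Pick a data block size
 * if one wasn't specified, fill in the dnode's block size and indirection
 * level, and preallocate space for any indirect blocks that will be needed.
 */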
struct dnode_cursor *
dnode_cursor_init(zfs_opt_t *zfs, zfs_objset_t *os, dnode_phys_t *dnode,
    off_t size, off_t blksz)
{
	struct dnode_cursor *c;
	uint64_t nbppindir, indlevel, ndatablks, nindblks;

	assert(dnode->dn_nblkptr == 1);
	assert(blksz <= MAXBLOCKSIZE);

	if (blksz == 0) {
		/* Must be between 1<<ashift and 128KB. */
		blksz = MIN(MAXBLOCKSIZE, MAX(1 << zfs->ashift,
		    powerof2(size) ? size : (1l << flsll(size))));
	}
	assert(powerof2(blksz));

	/*
	 * Do we need indirect blocks?  Figure out how many levels are needed
	 * (indlevel == 1 means no indirect blocks) and how much space is
	 * needed (it has to be allocated up-front to break the dependency
	 * cycle described in objset_write()).
	 */
	ndatablks = size == 0 ? 0 : howmany(size, blksz);
	nindblks = 0;
	for (indlevel = 1, nbppindir = 1; ndatablks > nbppindir; indlevel++) {
		nbppindir *= BLKPTR_PER_INDIR;
		nindblks += howmany(ndatablks, indlevel * nbppindir);
	}
	assert(indlevel < INDIR_LEVELS);

	dnode->dn_nlevels = (uint8_t)indlevel;
	dnode->dn_maxblkid = ndatablks > 0 ? ndatablks - 1 : 0;
	dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;

	c = ecalloc(1, sizeof(*c));
	if (nindblks > 0) {
		c->indspace = nindblks * MAXBLOCKSIZE;
		c->indloc = objset_space_alloc(zfs, os, &c->indspace);
	}
	c->dnode = dnode;
	c->dataoff = 0;
	c->datablksz = blksz;

	return (c);
}

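/*
 * Flush the lowest "levels" levels of pending indirect blocks: write each one
 * to its preallocated location and record its block pointer and fill count in
 * the parent indirect block, or in the dnode itself at the top level.
 */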
static void
_dnode_cursor_flush(zfs_opt_t *zfs, struct dnode_cursor *c, unsigned int levels)
{
	blkptr_t *bp, *pbp;
	void *buf;
	uint64_t fill;
	off_t blkid, blksz, loc;

	assert(levels > 0);
	assert(levels <= c->dnode->dn_nlevels - 1U);

	blksz = MAXBLOCKSIZE;
	blkid = (c->dataoff / c->datablksz) / BLKPTR_PER_INDIR;
	for (unsigned int level = 1; level <= levels; level++) {
		buf = c->inddir[level - 1];

		if (level == c->dnode->dn_nlevels - 1U) {
			pbp = &c->dnode->dn_blkptr[0];
		} else {
			uint64_t iblkid;

			iblkid = blkid & (BLKPTR_PER_INDIR - 1);
			pbp = (blkptr_t *)
			    &c->inddir[level][iblkid * sizeof(blkptr_t)];
		}

		/*
		 * Space for indirect blocks is allocated up-front; see the
		 * comment in objset_write().
		 */
		loc = c->indloc;
		c->indloc += blksz;
		assert(c->indspace >= blksz);
		c->indspace -= blksz;

		bp = buf;
		fill = 0;
		for (size_t i = 0; i < BLKPTR_PER_INDIR; i++)
			fill += BP_GET_FILL(&bp[i]);

		vdev_pwrite_dnode_indir(zfs, c->dnode, level, fill, buf, blksz,
		    loc, pbp);
		memset(buf, 0, MAXBLOCKSIZE);

		blkid /= BLKPTR_PER_INDIR;
	}
}

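/*
 * Return a pointer to the block pointer slot for the data block at offset
 * "off", flushing any indirect blocks completed by earlier writes.  Offsets
 * must be visited in increasing order.
 */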
blkptr_t *
dnode_cursor_next(zfs_opt_t *zfs, struct dnode_cursor *c, off_t off)
{
	off_t blkid, l1id;
	unsigned int levels;

	if (c->dnode->dn_nlevels == 1) {
		assert(off < MAXBLOCKSIZE);
		return (&c->dnode->dn_blkptr[0]);
	}

	assert(off % c->datablksz == 0);

	/* Do we need to flush any full indirect blocks? */
	if (off > 0) {
		blkid = off / c->datablksz;
		for (levels = 0; levels < c->dnode->dn_nlevels - 1U; levels++) {
			if (blkid % BLKPTR_PER_INDIR != 0)
				break;
			blkid /= BLKPTR_PER_INDIR;
		}
		if (levels > 0)
			_dnode_cursor_flush(zfs, c, levels);
	}

	c->dataoff = off;
	l1id = (off / c->datablksz) & (BLKPTR_PER_INDIR - 1);
	return ((blkptr_t *)&c->inddir[0][l1id * sizeof(blkptr_t)]);
}

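/*
 * Flush any remaining indirect blocks and free the cursor.
 */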
void
dnode_cursor_finish(zfs_opt_t *zfs, struct dnode_cursor *c)
{
	unsigned int levels;

	assert(c->dnode->dn_nlevels > 0);
	levels = c->dnode->dn_nlevels - 1;
	if (levels > 0)
		_dnode_cursor_flush(zfs, c, levels);
	assert(c->indspace == 0);
	free(c);
}

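/*
 * makefs entry point for ZFS images: validate options, create the output
 * vdev, build the pool and filesystem contents from "dir", and finalize the
 * image.
 */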
void
zfs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	int dirfd;

	zfs = fsopts->fs_specific;

	/*
	 * Use a fixed seed to provide reproducible pseudo-random numbers for
	 * on-disk structures when needed (e.g., GUIDs, ZAP hash salts).
	 */
	srandom(1729);

	zfs_check_opts(fsopts);

	dirfd = open(dir, O_DIRECTORY | O_RDONLY);
	if (dirfd < 0)
		err(1, "open(%s)", dir);

	vdev_init(zfs, image);
	pool_init(zfs);
	fs_build(zfs, dirfd, root);
	pool_fini(zfs);
	vdev_fini(zfs);
}