/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2022 The FreeBSD Foundation
 *
 * This software was developed by Mark Johnston under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/queue.h>

#include <assert.h>
#include <ctype.h>
#include <fcntl.h>
#include <stdalign.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <util.h>

#include "makefs.h"
#include "zfs.h"

#define VDEV_LABEL_SPACE \
        ((off_t)(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE))
_Static_assert(VDEV_LABEL_SPACE <= MINDEVSIZE, "");
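
/*
 * VDEV_LABEL_START_SIZE accounts for the two leading vdev labels plus the
 * embedded boot block region, and VDEV_LABEL_END_SIZE for the two trailing
 * labels; together they amount to a few megabytes of per-vdev overhead that
 * is excluded from the pool's allocatable "asize" below.
 */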

#define MINMSSIZE       ((off_t)1 << 24)        /* 16MB */
#define DFLTMSSIZE      ((off_t)1 << 29)        /* 512MB */
#define MAXMSSIZE       ((off_t)1 << 34)        /* 16GB */

#define INDIR_LEVELS    6
/* Indirect blocks are always 128KB. */
#define BLKPTR_PER_INDIR        (MAXBLOCKSIZE / sizeof(blkptr_t))
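
/*
 * A block pointer is 128 bytes on disk, so a 128KB indirect block holds
 * MAXBLOCKSIZE / sizeof(blkptr_t) == 1024 block pointers.
 */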

struct dnode_cursor {
        /* In-memory buffers for pending indirect blocks, one per level. */
        char inddir[INDIR_LEVELS][MAXBLOCKSIZE];
        off_t indloc;           /* vdev offset of preallocated indirect space */
        off_t indspace;         /* preallocated indirect space remaining */
        dnode_phys_t *dnode;    /* dnode whose block pointer tree is built */
        off_t dataoff;          /* file offset of the current data block */
        off_t datablksz;        /* data block size */
};

void
zfs_prep_opts(fsinfo_t *fsopts)
{
        zfs_opt_t *zfs;
        size_t align;

        align = alignof(uint64_t);
        zfs = aligned_alloc(align, roundup2(sizeof(*zfs), align));
        if (zfs == NULL)
                err(1, "aligned_alloc");
        memset(zfs, 0, sizeof(*zfs));

        const option_t zfs_options[] = {
                { '\0', "bootfs", &zfs->bootfs, OPT_STRPTR,
                  0, 0, "Bootable dataset" },
                { '\0', "mssize", &zfs->mssize, OPT_INT64,
                  MINMSSIZE, MAXMSSIZE, "Metaslab size" },
                { '\0', "poolname", &zfs->poolname, OPT_STRPTR,
                  0, 0, "ZFS pool name" },
                { '\0', "rootpath", &zfs->rootpath, OPT_STRPTR,
                  0, 0, "Prefix for all dataset mount points" },
                { '\0', "ashift", &zfs->ashift, OPT_INT32,
                  MINBLOCKSHIFT, MAXBLOCKSHIFT, "ZFS pool ashift" },
                { '\0', "verify-txgs", &zfs->verify_txgs, OPT_BOOL,
                  0, 0, "Make OpenZFS verify data upon import" },
                { '\0', "nowarn", &zfs->nowarn, OPT_BOOL,
                  0, 0, "Provided for backwards compatibility, ignored" },
                { .name = NULL }
        };

        STAILQ_INIT(&zfs->datasetdescs);

        fsopts->fs_specific = zfs;
        fsopts->fs_options = copy_opts(zfs_options);
}
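
/*
 * For illustration, an invocation using the options above might look
 * something like this (the exact dataset parameter syntax is described in
 * makefs(8)):
 *
 *   makefs -t zfs -s 4g -o poolname=zroot -o rootpath=/ -o bootfs=zroot \
 *       -o fs=zroot -o fs=zroot/usr zfs.img /path/to/rootdir
 *
 * Plain "-o name=value" options are handled by the table above, while each
 * "-o fs=..." argument is queued by zfs_parse_opts() and parsed later in
 * dsl_init().
 */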

int
zfs_parse_opts(const char *option, fsinfo_t *fsopts)
{
        zfs_opt_t *zfs;
        struct dataset_desc *dsdesc;
        char buf[BUFSIZ], *opt, *val;
        int rv;

        zfs = fsopts->fs_specific;

        opt = val = estrdup(option);
        opt = strsep(&val, "=");
        if (strcmp(opt, "fs") == 0) {
                if (val == NULL)
                        errx(1, "invalid filesystem parameters `%s'", option);

                /*
                 * Dataset descriptions will be parsed later, in dsl_init().
                 * Just stash them away for now.
                 */
                dsdesc = ecalloc(1, sizeof(*dsdesc));
                dsdesc->params = estrdup(val);
                free(opt);
                STAILQ_INSERT_TAIL(&zfs->datasetdescs, dsdesc, next);
                return (1);
        }
        free(opt);

        rv = set_option(fsopts->fs_options, option, buf, sizeof(buf));
        return (rv == -1 ? 0 : 1);
}

static void
zfs_size_vdev(fsinfo_t *fsopts)
{
        zfs_opt_t *zfs;
        off_t asize, mssize, vdevsize, vdevsize1;

        zfs = fsopts->fs_specific;

        assert(fsopts->maxsize != 0);
        assert(zfs->ashift != 0);

        /*
         * Figure out how big the vdev should be.
         */
        vdevsize = rounddown2(fsopts->maxsize, 1 << zfs->ashift);
        if (vdevsize < MINDEVSIZE)
                errx(1, "maximum image size is too small");
        if (vdevsize < fsopts->minsize || vdevsize > fsopts->maxsize) {
                errx(1, "image size bounds must be multiples of %d",
                    1 << zfs->ashift);
        }
        asize = vdevsize - VDEV_LABEL_SPACE;

        /*
         * Size metaslabs according to the following heuristic:
         * - provide at least 8 metaslabs,
         * - without using a metaslab size larger than 512MB.
         * This approximates what OpenZFS does without being complicated.  In
         * practice we expect pools to be expanded upon first use, and OpenZFS
         * does not resize metaslabs in that case, so there is no right answer
         * here.  In general we want to provide large metaslabs even if the
         * image size is small, and 512MB is a reasonable size for pools up to
         * several hundred gigabytes.
         *
         * The user may override this heuristic using the "-o mssize" option.
         */
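        /*
         * For example, an 8GB image yields an asize slightly under 8GB, so
         * asize / 8 is about 1GB; the MIN() below clamps that to the 512MB
         * default, giving roughly sixteen 512MB metaslabs.  The slop handling
         * further below then resizes the vdev, when the minsize/maxsize
         * bounds permit, so that asize becomes an exact multiple of the
         * metaslab size.
         */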
        mssize = zfs->mssize;
        if (mssize == 0) {
                mssize = MAX(MIN(asize / 8, DFLTMSSIZE), MINMSSIZE);
                if (!powerof2(mssize))
                        mssize = 1l << (flsll(mssize) - 1);
        }
        if (!powerof2(mssize))
                errx(1, "metaslab size must be a power of 2");

        /*
         * If we have some slop left over, try to cover it by resizing the
         * vdev, subject to the maxsize and minsize parameters.
         */
        if (asize % mssize != 0) {
                vdevsize1 = rounddown2(asize, mssize) + VDEV_LABEL_SPACE;
                if (vdevsize1 < fsopts->minsize)
                        vdevsize1 = roundup2(asize, mssize) + VDEV_LABEL_SPACE;
                if (vdevsize1 <= fsopts->maxsize)
                        vdevsize = vdevsize1;
        }
        asize = vdevsize - VDEV_LABEL_SPACE;

        zfs->asize = asize;
        zfs->vdevsize = vdevsize;
        zfs->mssize = mssize;
        zfs->msshift = flsll(mssize) - 1;
        zfs->mscount = asize / mssize;
}

/*
 * Validate options and set some default values.
 */
static void
zfs_check_opts(fsinfo_t *fsopts)
{
        zfs_opt_t *zfs;

        zfs = fsopts->fs_specific;

        if (fsopts->offset != 0)
                errx(1, "unhandled offset option");
        if (fsopts->maxsize == 0)
                errx(1, "an image size must be specified");

        if (zfs->poolname == NULL)
                errx(1, "a pool name must be specified");
        if (!isalpha(zfs->poolname[0]))
                errx(1, "the pool name must begin with a letter");
        for (size_t i = 0, len = strlen(zfs->poolname); i < len; i++) {
                if (!isalnum(zfs->poolname[i]) && zfs->poolname[i] != '_')
                        errx(1, "invalid character '%c' in pool name",
                            zfs->poolname[i]);
        }
        if (strcmp(zfs->poolname, "mirror") == 0 ||
            strcmp(zfs->poolname, "raidz") == 0 ||
            strcmp(zfs->poolname, "draid") == 0) {
                errx(1, "pool name '%s' is reserved and cannot be used",
                    zfs->poolname);
        }

        if (zfs->rootpath == NULL)
                easprintf(&zfs->rootpath, "/%s", zfs->poolname);
        if (zfs->rootpath[0] != '/')
                errx(1, "mountpoint `%s' must be absolute", zfs->rootpath);

        if (zfs->ashift == 0)
                zfs->ashift = 12;

        zfs_size_vdev(fsopts);
}

void
zfs_cleanup_opts(fsinfo_t *fsopts)
{
        struct dataset_desc *d, *tmp;
        zfs_opt_t *zfs;

        zfs = fsopts->fs_specific;
        free(zfs->rootpath);
        free(zfs->bootfs);
        free(__DECONST(void *, zfs->poolname));
        STAILQ_FOREACH_SAFE(d, &zfs->datasetdescs, next, tmp) {
                free(d->params);
                free(d);
        }
        free(zfs);
        free(fsopts->fs_options);
}

static size_t
nvlist_size(const nvlist_t *nvl)
{
        return (sizeof(nvl->nv_header) + nvl->nv_size);
}

static void
nvlist_copy(const nvlist_t *nvl, char *buf, size_t sz)
{
        assert(sz >= nvlist_size(nvl));

        memcpy(buf, &nvl->nv_header, sizeof(nvl->nv_header));
        memcpy(buf + sizeof(nvl->nv_header), nvl->nv_data, nvl->nv_size);
}

/*
 * Avoid returning a GUID of 0, just to avoid the possibility that something
 * will interpret that as meaning that the GUID is uninitialized.
 */
uint64_t
randomguid(void)
{
        uint64_t ret;

        do {
                ret = ((uint64_t)random() << 32) | random();
        } while (ret == 0);

        return (ret);
}

static nvlist_t *
pool_config_nvcreate(zfs_opt_t *zfs)
{
        nvlist_t *featuresnv, *poolnv;

        poolnv = nvlist_create(NV_UNIQUE_NAME);
        nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_TXG, TXG);
        nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VERSION, SPA_VERSION);
        nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_EXPORTED);
        nvlist_add_string(poolnv, ZPOOL_CONFIG_POOL_NAME, zfs->poolname);
        nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_GUID, zfs->poolguid);
        nvlist_add_uint64(poolnv, ZPOOL_CONFIG_TOP_GUID, zfs->vdevguid);
        nvlist_add_uint64(poolnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
        nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VDEV_CHILDREN, 1);

        featuresnv = nvlist_create(NV_UNIQUE_NAME);
        nvlist_add_nvlist(poolnv, ZPOOL_CONFIG_FEATURES_FOR_READ, featuresnv);
        nvlist_destroy(featuresnv);

        return (poolnv);
}

static nvlist_t *
pool_disk_vdev_config_nvcreate(zfs_opt_t *zfs)
{
        nvlist_t *diskvdevnv;

        assert(zfs->objarrid != 0);

        diskvdevnv = nvlist_create(NV_UNIQUE_NAME);
        nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
        nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASHIFT, zfs->ashift);
        nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASIZE, zfs->asize);
        nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
        nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ID, 0);
        nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_PATH, "/dev/null");
        nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_WHOLE_DISK, 1);
        nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
        nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_ARRAY,
            zfs->objarrid);
        nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_SHIFT,
            zfs->msshift);

        return (diskvdevnv);
}

static nvlist_t *
pool_root_vdev_config_nvcreate(zfs_opt_t *zfs)
{
        nvlist_t *diskvdevnv, *rootvdevnv;

        diskvdevnv = pool_disk_vdev_config_nvcreate(zfs);
        rootvdevnv = nvlist_create(NV_UNIQUE_NAME);

        nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_ID, 0);
        nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_GUID, zfs->poolguid);
        nvlist_add_string(rootvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
        nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
        nvlist_add_nvlist_array(rootvdevnv, ZPOOL_CONFIG_CHILDREN, &diskvdevnv,
            1);
        nvlist_destroy(diskvdevnv);

        return (rootvdevnv);
}
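
/*
 * Taken together, the functions above produce a configuration nvlist of
 * roughly the following shape (pool_config_nvcreate() supplies the top-level
 * pairs, pool_root_vdev_config_nvcreate() the vdev tree):
 *
 *   txg, version, state, name, pool_guid, top_guid, guid, vdev_children
 *   features_for_read: { }
 *   vdev_tree: {
 *       type: "root", id: 0, guid: <pool GUID>, create_txg
 *       children[0]: {
 *           type: "disk", ashift, asize, guid: <vdev GUID>, id: 0,
 *           path: "/dev/null", whole_disk, create_txg,
 *           metaslab_array, metaslab_shift
 *       }
 *   }
 */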

/*
 * Create the pool's "config" object, which contains an nvlist describing pool
 * parameters and the vdev topology.  It is similar but not identical to the
 * nvlist stored in vdev labels.  The main difference is that vdev labels do not
 * describe the full vdev tree and in particular do not contain the "root"
 * meta-vdev.
 */
static void
pool_init_objdir_config(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
        dnode_phys_t *dnode;
        nvlist_t *poolconfig, *vdevconfig;
        void *configbuf;
        uint64_t dnid;
        off_t configloc, configblksz;
        int error;

        dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_PACKED_NVLIST,
            DMU_OT_PACKED_NVLIST_SIZE, sizeof(uint64_t), &dnid);

        poolconfig = pool_config_nvcreate(zfs);

        vdevconfig = pool_root_vdev_config_nvcreate(zfs);
        nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
        nvlist_destroy(vdevconfig);

        error = nvlist_export(poolconfig);
        if (error != 0)
                errc(1, error, "nvlist_export");

        configblksz = nvlist_size(poolconfig);
        configloc = objset_space_alloc(zfs, zfs->mos, &configblksz);
        configbuf = ecalloc(1, configblksz);
        nvlist_copy(poolconfig, configbuf, configblksz);

        vdev_pwrite_dnode_data(zfs, dnode, configbuf, configblksz, configloc);

        dnode->dn_datablkszsec = configblksz >> MINBLOCKSHIFT;
        dnode->dn_flags = DNODE_FLAG_USED_BYTES;
        *(uint64_t *)DN_BONUS(dnode) = nvlist_size(poolconfig);

        zap_add_uint64(objdir, DMU_POOL_CONFIG, dnid);

        nvlist_destroy(poolconfig);
        free(configbuf);
}

/*
 * Add block pointer list objects, used for deferred frees.  We don't do
 * anything with them, but they need to be present or OpenZFS will refuse to
 * import the pool.
 */
static void
pool_init_objdir_bplists(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
        uint64_t dnid;

        (void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
            BPOBJ_SIZE_V2, &dnid);
        zap_add_uint64(objdir, DMU_POOL_FREE_BPOBJ, dnid);

        (void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
            BPOBJ_SIZE_V2, &dnid);
        zap_add_uint64(objdir, DMU_POOL_SYNC_BPLIST, dnid);
}

/*
 * Add required feature metadata objects.  We don't know anything about ZFS
 * features, so the objects are just empty ZAPs.
 */
static void
pool_init_objdir_feature_maps(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
        dnode_phys_t *dnode;
        uint64_t dnid;

        dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
        zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_READ, dnid);
        zap_write(zfs, zap_alloc(zfs->mos, dnode));

        dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
        zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_WRITE, dnid);
        zap_write(zfs, zap_alloc(zfs->mos, dnode));

        dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
        zap_add_uint64(objdir, DMU_POOL_FEATURE_DESCRIPTIONS, dnid);
        zap_write(zfs, zap_alloc(zfs->mos, dnode));
}

static void
pool_init_objdir_dsl(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
        zap_add_uint64(objdir, DMU_POOL_ROOT_DATASET,
            dsl_dir_id(zfs->rootdsldir));
}

static void
pool_init_objdir_poolprops(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
        dnode_phys_t *dnode;
        uint64_t id;

        dnode = objset_dnode_alloc(zfs->mos, DMU_OT_POOL_PROPS, &id);
        zap_add_uint64(objdir, DMU_POOL_PROPS, id);

        zfs->poolprops = zap_alloc(zfs->mos, dnode);
}

/*
 * Initialize the MOS object directory, the root of virtually all of the pool's
 * data and metadata.
 */
static void
pool_init_objdir(zfs_opt_t *zfs)
{
        zfs_zap_t *zap;
        dnode_phys_t *objdir;

        objdir = objset_dnode_lookup(zfs->mos, DMU_POOL_DIRECTORY_OBJECT);

        zap = zap_alloc(zfs->mos, objdir);
        pool_init_objdir_config(zfs, zap);
        pool_init_objdir_bplists(zfs, zap);
        pool_init_objdir_feature_maps(zfs, zap);
        pool_init_objdir_dsl(zfs, zap);
        pool_init_objdir_poolprops(zfs, zap);
        zap_write(zfs, zap);
}
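
/*
 * After pool_init_objdir() runs, the object directory ZAP contains entries
 * for the pool config, the two deferred-free block pointer lists, the three
 * feature ZAPs, the pool properties ZAP and the root DSL directory; this
 * corresponds to the top of the MOS layout sketched below.
 */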

/*
 * Initialize the meta-object set (MOS) and immediately write out several
 * special objects whose contents are already finalized, including the object
 * directory.
 *
 * Once the MOS is finalized, it'll look roughly like this:
 *
 * object directory (ZAP)
 * |-> vdev config object (nvlist)
 * |-> features for read
 * |-> features for write
 * |-> feature descriptions
 * |-> sync bplist
 * |-> free bplist
 * |-> pool properties
 * L-> root DSL directory
 *     |-> DSL child directory (ZAP)
 *     |   |-> $MOS (DSL dir)
 *     |   |   |-> child map
 *     |   |   L-> props (ZAP)
 *     |   |-> $FREE (DSL dir)
 *     |   |   |-> child map
 *     |   |   L-> props (ZAP)
 *     |   |-> $ORIGIN (DSL dir)
 *     |   |   |-> child map
 *     |   |   |-> dataset
 *     |   |   |   L-> deadlist
 *     |   |   |-> snapshot
 *     |   |   |   |-> deadlist
 *     |   |   |   L-> snapshot names
 *     |   |   |-> props (ZAP)
 *     |   |   L-> clones (ZAP)
 *     |   |-> dataset 1 (DSL dir)
 *     |   |   |-> DSL dataset
 *     |   |   |   |-> snapshot names
 *     |   |   |   L-> deadlist
 *     |   |   |-> child map
 *     |   |   |   L-> ...
 *     |   |   L-> props
 *     |   |-> dataset 2
 *     |   |   L-> ...
 *     |   |-> ...
 *     |   L-> dataset n
 *     |-> DSL root dataset
 *     |   |-> snapshot names
 *     |   L-> deadlist
 *     L-> props (ZAP)
 * space map object array
 * |-> space map 1
 * |-> space map 2
 * |-> ...
 * L-> space map n (zfs->mscount)
 *
 * The space map object array is pointed to by the "metaslab_array"
 * (ZPOOL_CONFIG_METASLAB_ARRAY) property in the pool configuration.
 */
static void
pool_init(zfs_opt_t *zfs)
{
        uint64_t dnid;

        zfs->poolguid = randomguid();
        zfs->vdevguid = randomguid();

        zfs->mos = objset_alloc(zfs, DMU_OST_META);

        (void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_DIRECTORY, &dnid);
        assert(dnid == DMU_POOL_DIRECTORY_OBJECT);

        (void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_ARRAY, &zfs->objarrid);

        dsl_init(zfs);

        pool_init_objdir(zfs);
}

static void
pool_labels_write(zfs_opt_t *zfs)
{
        uberblock_t *ub;
        vdev_label_t *label;
        nvlist_t *poolconfig, *vdevconfig;
        int error;

        label = ecalloc(1, sizeof(*label));

        /*
         * Assemble the vdev configuration and store it in the label.
         */
        poolconfig = pool_config_nvcreate(zfs);
        vdevconfig = pool_disk_vdev_config_nvcreate(zfs);
        nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
        nvlist_destroy(vdevconfig);

        error = nvlist_export(poolconfig);
        if (error != 0)
                errc(1, error, "nvlist_export");
        nvlist_copy(poolconfig, label->vl_vdev_phys.vp_nvlist,
            sizeof(label->vl_vdev_phys.vp_nvlist));
        nvlist_destroy(poolconfig);

        /*
         * Fill out the uberblock.  Just make each one the same.  The embedded
         * checksum is calculated in vdev_label_write().
         */
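        /*
         * Note (illustrative): each (1 << ashift)-sized slot in the label's
         * uberblock ring receives an identical copy; with the default ashift
         * of 12 and the conventional 128KB ring, that works out to 32 copies
         * per label.
         */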
        for (size_t uoff = 0; uoff < sizeof(label->vl_uberblock);
            uoff += (1 << zfs->ashift)) {
                ub = (uberblock_t *)(&label->vl_uberblock[0] + uoff);
                ub->ub_magic = UBERBLOCK_MAGIC;
                ub->ub_version = SPA_VERSION;

                /*
                 * Upon import, OpenZFS will perform metadata verification of
                 * the last TXG by default.  If all data is written in the same
                 * TXG, it'll all get verified, which can be painfully slow in
                 * some cases, e.g., initial boot in a cloud environment with
                 * slow storage.  So, fabricate additional TXGs to avoid this
                 * overhead, unless the user requests otherwise.
                 */
                ub->ub_txg = TXG;
                if (!zfs->verify_txgs)
                        ub->ub_txg += TXG_SIZE;
                ub->ub_guid_sum = zfs->poolguid + zfs->vdevguid;
                ub->ub_timestamp = 0;

                ub->ub_software_version = SPA_VERSION;
                ub->ub_mmp_magic = MMP_MAGIC;
                ub->ub_mmp_delay = 0;
                ub->ub_mmp_config = 0;
                ub->ub_checkpoint_txg = 0;
                objset_root_blkptr_copy(zfs->mos, &ub->ub_rootbp);
        }

        /*
         * Write out four copies of the label: two at the beginning of the vdev
         * and two at the end.
         */
        for (int i = 0; i < VDEV_LABELS; i++)
                vdev_label_write(zfs, i, label);

        free(label);
}

static void
pool_fini(zfs_opt_t *zfs)
{
        zap_write(zfs, zfs->poolprops);
        dsl_write(zfs);
        objset_write(zfs, zfs->mos);
        pool_labels_write(zfs);
}

struct dnode_cursor *
dnode_cursor_init(zfs_opt_t *zfs, zfs_objset_t *os, dnode_phys_t *dnode,
    off_t size, off_t blksz)
{
        struct dnode_cursor *c;
        uint64_t nbppindir, indlevel, ndatablks, nindblks;

        assert(dnode->dn_nblkptr == 1);
        assert(blksz <= MAXBLOCKSIZE);

        if (blksz == 0) {
                /* Must be between 1<<ashift and 128KB. */
                blksz = MIN(MAXBLOCKSIZE, MAX(1 << zfs->ashift,
                    powerof2(size) ? size : (1l << flsll(size))));
        }
        assert(powerof2(blksz));
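
        /*
         * For example, with ashift == 12, a 5000-byte file gets a single 8KB
         * data block (5000 rounded up to the next power of 2), while files of
         * 128KB or more are capped at 128KB data blocks and may require the
         * indirect blocks computed below.
         */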

        /*
         * Do we need indirect blocks?  Figure out how many levels are needed
         * (indlevel == 1 means no indirect blocks) and how much space is needed
         * (it has to be allocated up-front to break the dependency cycle
         * described in objset_write()).
         */
        ndatablks = size == 0 ? 0 : howmany(size, blksz);
        nindblks = 0;
        for (indlevel = 1, nbppindir = 1; ndatablks > nbppindir; indlevel++) {
                nbppindir *= BLKPTR_PER_INDIR;
                nindblks += howmany(ndatablks, indlevel * nbppindir);
        }
        assert(indlevel < INDIR_LEVELS);
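
        /*
         * For example, a 1GB file with 128KB data blocks has 8192 data
         * blocks; each 128KB indirect block maps 1024 of them, so the file
         * needs 8 level-1 indirect blocks plus one level-2 indirect block
         * referenced by the dnode's single block pointer, i.e., dn_nlevels
         * is 3 and nindblks is 9.
         */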

        dnode->dn_nlevels = (uint8_t)indlevel;
        dnode->dn_maxblkid = ndatablks > 0 ? ndatablks - 1 : 0;
        dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;

        c = ecalloc(1, sizeof(*c));
        if (nindblks > 0) {
                c->indspace = nindblks * MAXBLOCKSIZE;
                c->indloc = objset_space_alloc(zfs, os, &c->indspace);
        }
        c->dnode = dnode;
        c->dataoff = 0;
        c->datablksz = blksz;

        return (c);
}

static void
_dnode_cursor_flush(zfs_opt_t *zfs, struct dnode_cursor *c, unsigned int levels)
{
        blkptr_t *bp, *pbp;
        void *buf;
        uint64_t fill;
        off_t blkid, blksz, loc;

        assert(levels > 0);
        assert(levels <= c->dnode->dn_nlevels - 1U);

        blksz = MAXBLOCKSIZE;
        blkid = (c->dataoff / c->datablksz) / BLKPTR_PER_INDIR;
        for (unsigned int level = 1; level <= levels; level++) {
                buf = c->inddir[level - 1];

                if (level == c->dnode->dn_nlevels - 1U) {
                        pbp = &c->dnode->dn_blkptr[0];
                } else {
                        uint64_t iblkid;

                        iblkid = blkid & (BLKPTR_PER_INDIR - 1);
                        pbp = (blkptr_t *)
                            &c->inddir[level][iblkid * sizeof(blkptr_t)];
                }

                /*
                 * Space for indirect blocks is allocated up-front; see the
                 * comment in objset_write().
                 */
                loc = c->indloc;
                c->indloc += blksz;
                assert(c->indspace >= blksz);
                c->indspace -= blksz;

                bp = buf;
                fill = 0;
                for (size_t i = 0; i < BLKPTR_PER_INDIR; i++)
                        fill += BP_GET_FILL(&bp[i]);

                vdev_pwrite_dnode_indir(zfs, c->dnode, level, fill, buf, blksz,
                    loc, pbp);
                memset(buf, 0, MAXBLOCKSIZE);

                blkid /= BLKPTR_PER_INDIR;
        }
}

blkptr_t *
dnode_cursor_next(zfs_opt_t *zfs, struct dnode_cursor *c, off_t off)
{
        off_t blkid, l1id;
        unsigned int levels;

        if (c->dnode->dn_nlevels == 1) {
                assert(off < MAXBLOCKSIZE);
                return (&c->dnode->dn_blkptr[0]);
        }

        assert(off % c->datablksz == 0);

        /* Do we need to flush any full indirect blocks? */
        if (off > 0) {
                blkid = off / c->datablksz;
                for (levels = 0; levels < c->dnode->dn_nlevels - 1U; levels++) {
                        if (blkid % BLKPTR_PER_INDIR != 0)
                                break;
                        blkid /= BLKPTR_PER_INDIR;
                }
                if (levels > 0)
                        _dnode_cursor_flush(zfs, c, levels);
        }
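
        /*
         * To illustrate the check above: with 128KB data blocks, a level-1
         * indirect block maps 1024 * 128KB = 128MB of data, so one level is
         * flushed every 128MB of file offset, two levels every 128GB, and so
         * on.
         */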

        c->dataoff = off;
        l1id = (off / c->datablksz) & (BLKPTR_PER_INDIR - 1);
        return ((blkptr_t *)&c->inddir[0][l1id * sizeof(blkptr_t)]);
}

void
dnode_cursor_finish(zfs_opt_t *zfs, struct dnode_cursor *c)
{
        unsigned int levels;

        assert(c->dnode->dn_nlevels > 0);
        levels = c->dnode->dn_nlevels - 1;
        if (levels > 0)
                _dnode_cursor_flush(zfs, c, levels);
        assert(c->indspace == 0);
        free(c);
}

void
zfs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts)
{
        zfs_opt_t *zfs;
        int dirfd;

        zfs = fsopts->fs_specific;

        /*
         * Use a fixed seed to provide reproducible pseudo-random numbers for
         * on-disk structures when needed (e.g., GUIDs, ZAP hash salts).
         */
        srandom(1729);

        zfs_check_opts(fsopts);

        dirfd = open(dir, O_DIRECTORY | O_RDONLY);
        if (dirfd < 0)
                err(1, "open(%s)", dir);

        vdev_init(zfs, image);
        pool_init(zfs);
        fs_build(zfs, dirfd, root);
        pool_fini(zfs);
        vdev_fini(zfs);
}