1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2022 The FreeBSD Foundation 5 * 6 * This software was developed by Mark Johnston under sponsorship from 7 * the FreeBSD Foundation. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions are 11 * met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 31 #include <sys/param.h> 32 #include <sys/errno.h> 33 #include <sys/queue.h> 34 35 #include <assert.h> 36 #include <ctype.h> 37 #include <fcntl.h> 38 #include <stdalign.h> 39 #include <stdbool.h> 40 #include <stddef.h> 41 #include <stdint.h> 42 #include <stdlib.h> 43 #include <string.h> 44 #include <unistd.h> 45 46 #include <util.h> 47 48 #include "makefs.h" 49 #include "zfs.h" 50 51 #define VDEV_LABEL_SPACE \ 52 ((off_t)(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) 53 _Static_assert(VDEV_LABEL_SPACE <= MINDEVSIZE, ""); 54 55 #define MINMSSIZE ((off_t)1 << 24) /* 16MB */ 56 #define DFLTMSSIZE ((off_t)1 << 29) /* 512MB */ 57 #define MAXMSSIZE ((off_t)1 << 34) /* 16GB */ 58 59 #define INDIR_LEVELS 6 60 /* Indirect blocks are always 128KB. */ 61 #define BLKPTR_PER_INDIR (MAXBLOCKSIZE / sizeof(blkptr_t)) 62 63 struct dnode_cursor { 64 char inddir[INDIR_LEVELS][MAXBLOCKSIZE]; 65 off_t indloc; 66 off_t indspace; 67 dnode_phys_t *dnode; 68 off_t dataoff; 69 off_t datablksz; 70 }; 71 72 void 73 zfs_prep_opts(fsinfo_t *fsopts) 74 { 75 zfs_opt_t *zfs; 76 size_t align; 77 78 align = alignof(uint64_t); 79 zfs = aligned_alloc(align, roundup2(sizeof(*zfs), align)); 80 if (zfs == NULL) 81 err(1, "aligned_alloc"); 82 memset(zfs, 0, sizeof(*zfs)); 83 84 const option_t zfs_options[] = { 85 { '\0', "bootfs", &zfs->bootfs, OPT_STRPTR, 86 0, 0, "Bootable dataset" }, 87 { '\0', "mssize", &zfs->mssize, OPT_INT64, 88 MINMSSIZE, MAXMSSIZE, "Metaslab size" }, 89 { '\0', "poolguid", &zfs->poolguid, OPT_INT64, 90 0, INT64_MAX, "ZFS pool GUID" }, 91 { '\0', "poolname", &zfs->poolname, OPT_STRPTR, 92 0, 0, "ZFS pool name" }, 93 { '\0', "rootpath", &zfs->rootpath, OPT_STRPTR, 94 0, 0, "Prefix for all dataset mount points" }, 95 { '\0', "ashift", &zfs->ashift, OPT_INT32, 96 MINBLOCKSHIFT, MAXBLOCKSHIFT, "ZFS pool ashift" }, 97 { '\0', "verify-txgs", &zfs->verify_txgs, OPT_BOOL, 98 0, 0, "Make OpenZFS verify data upon import" }, 99 { '\0', "nowarn", &zfs->nowarn, OPT_BOOL, 100 0, 0, "Provided for backwards compatibility, ignored" }, 101 { .name = NULL } 102 }; 103 104 STAILQ_INIT(&zfs->datasetdescs); 105 106 fsopts->fs_specific = zfs; 107 fsopts->fs_options = copy_opts(zfs_options); 108 } 109 110 int 111 zfs_parse_opts(const char *option, fsinfo_t *fsopts) 112 { 113 zfs_opt_t *zfs; 114 struct dataset_desc *dsdesc; 115 char buf[BUFSIZ], *opt, *val; 116 int rv; 117 118 zfs = fsopts->fs_specific; 119 120 opt = val = estrdup(option); 121 opt = strsep(&val, "="); 122 if (strcmp(opt, "fs") == 0) { 123 if (val == NULL) 124 errx(1, "invalid filesystem parameters `%s'", option); 125 126 /* 127 * Dataset descriptions will be parsed later, in dsl_init(). 128 * Just stash them away for now. 129 */ 130 dsdesc = ecalloc(1, sizeof(*dsdesc)); 131 dsdesc->params = estrdup(val); 132 free(opt); 133 STAILQ_INSERT_TAIL(&zfs->datasetdescs, dsdesc, next); 134 return (1); 135 } 136 free(opt); 137 138 rv = set_option(fsopts->fs_options, option, buf, sizeof(buf)); 139 return (rv == -1 ? 0 : 1); 140 } 141 142 static void 143 zfs_size_vdev(fsinfo_t *fsopts) 144 { 145 zfs_opt_t *zfs; 146 off_t asize, mssize, vdevsize, vdevsize1; 147 148 zfs = fsopts->fs_specific; 149 150 assert(fsopts->maxsize != 0); 151 assert(zfs->ashift != 0); 152 153 /* 154 * Figure out how big the vdev should be. 155 */ 156 vdevsize = rounddown2(fsopts->maxsize, 1 << zfs->ashift); 157 if (vdevsize < MINDEVSIZE) 158 errx(1, "maximum image size is too small"); 159 if (vdevsize < fsopts->minsize || vdevsize > fsopts->maxsize) { 160 errx(1, "image size bounds must be multiples of %d", 161 1 << zfs->ashift); 162 } 163 asize = vdevsize - VDEV_LABEL_SPACE; 164 165 /* 166 * Size metaslabs according to the following heuristic: 167 * - provide at least 8 metaslabs, 168 * - without using a metaslab size larger than 512MB. 169 * This approximates what OpenZFS does without being complicated. In 170 * practice we expect pools to be expanded upon first use, and OpenZFS 171 * does not resize metaslabs in that case, so there is no right answer 172 * here. In general we want to provide large metaslabs even if the 173 * image size is small, and 512MB is a reasonable size for pools up to 174 * several hundred gigabytes. 175 * 176 * The user may override this heuristic using the "-o mssize" option. 177 */ 178 mssize = zfs->mssize; 179 if (mssize == 0) { 180 mssize = MAX(MIN(asize / 8, DFLTMSSIZE), MINMSSIZE); 181 if (!powerof2(mssize)) 182 mssize = 1l << (flsll(mssize) - 1); 183 } 184 if (!powerof2(mssize)) 185 errx(1, "metaslab size must be a power of 2"); 186 187 /* 188 * If we have some slop left over, try to cover it by resizing the vdev, 189 * subject to the maxsize and minsize parameters. 190 */ 191 if (asize % mssize != 0) { 192 vdevsize1 = rounddown2(asize, mssize) + VDEV_LABEL_SPACE; 193 if (vdevsize1 < fsopts->minsize) 194 vdevsize1 = roundup2(asize, mssize) + VDEV_LABEL_SPACE; 195 if (vdevsize1 <= fsopts->maxsize) 196 vdevsize = vdevsize1; 197 } 198 asize = vdevsize - VDEV_LABEL_SPACE; 199 200 zfs->asize = asize; 201 zfs->vdevsize = vdevsize; 202 zfs->mssize = mssize; 203 zfs->msshift = flsll(mssize) - 1; 204 zfs->mscount = asize / mssize; 205 } 206 207 /* 208 * Validate options and set some default values. 209 */ 210 static void 211 zfs_check_opts(fsinfo_t *fsopts) 212 { 213 zfs_opt_t *zfs; 214 215 zfs = fsopts->fs_specific; 216 217 if (fsopts->offset != 0) 218 errx(1, "unhandled offset option"); 219 if (fsopts->maxsize == 0) 220 errx(1, "an image size must be specified"); 221 222 if (zfs->poolname == NULL) 223 errx(1, "a pool name must be specified"); 224 if (!isalpha(zfs->poolname[0])) 225 errx(1, "the pool name must begin with a letter"); 226 for (size_t i = 0, len = strlen(zfs->poolname); i < len; i++) { 227 if (!isalnum(zfs->poolname[i]) && zfs->poolname[i] != '_') 228 errx(1, "invalid character '%c' in pool name", 229 zfs->poolname[i]); 230 } 231 if (strcmp(zfs->poolname, "mirror") == 0 || 232 strcmp(zfs->poolname, "raidz") == 0 || 233 strcmp(zfs->poolname, "draid") == 0) { 234 errx(1, "pool name '%s' is reserved and cannot be used", 235 zfs->poolname); 236 } 237 238 if (zfs->rootpath == NULL) 239 easprintf(&zfs->rootpath, "/%s", zfs->poolname); 240 if (zfs->rootpath[0] != '/') 241 errx(1, "mountpoint `%s' must be absolute", zfs->rootpath); 242 243 if (zfs->ashift == 0) 244 zfs->ashift = 12; 245 246 zfs_size_vdev(fsopts); 247 } 248 249 void 250 zfs_cleanup_opts(fsinfo_t *fsopts) 251 { 252 struct dataset_desc *d, *tmp; 253 zfs_opt_t *zfs; 254 255 zfs = fsopts->fs_specific; 256 free(zfs->rootpath); 257 free(zfs->bootfs); 258 free(__DECONST(void *, zfs->poolname)); 259 STAILQ_FOREACH_SAFE(d, &zfs->datasetdescs, next, tmp) { 260 free(d->params); 261 free(d); 262 } 263 free(zfs); 264 free(fsopts->fs_options); 265 } 266 267 static size_t 268 nvlist_size(const nvlist_t *nvl) 269 { 270 return (sizeof(nvl->nv_header) + nvl->nv_size); 271 } 272 273 static void 274 nvlist_copy(const nvlist_t *nvl, char *buf, size_t sz) 275 { 276 assert(sz >= nvlist_size(nvl)); 277 278 memcpy(buf, &nvl->nv_header, sizeof(nvl->nv_header)); 279 memcpy(buf + sizeof(nvl->nv_header), nvl->nv_data, nvl->nv_size); 280 } 281 282 /* 283 * Avoid returning a GUID of 0, just to avoid the possibility that something 284 * will interpret that as meaning that the GUID is uninitialized. 285 */ 286 uint64_t 287 randomguid(void) 288 { 289 uint64_t ret; 290 291 do { 292 ret = ((uint64_t)random() << 32) | random(); 293 } while (ret == 0); 294 295 return (ret); 296 } 297 298 static nvlist_t * 299 pool_config_nvcreate(zfs_opt_t *zfs) 300 { 301 nvlist_t *featuresnv, *poolnv; 302 303 poolnv = nvlist_create(NV_UNIQUE_NAME); 304 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_TXG, TXG); 305 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VERSION, SPA_VERSION); 306 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_EXPORTED); 307 nvlist_add_string(poolnv, ZPOOL_CONFIG_POOL_NAME, zfs->poolname); 308 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_GUID, zfs->poolguid); 309 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_TOP_GUID, zfs->vdevguid); 310 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_GUID, zfs->vdevguid); 311 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VDEV_CHILDREN, 1); 312 313 featuresnv = nvlist_create(NV_UNIQUE_NAME); 314 nvlist_add_nvlist(poolnv, ZPOOL_CONFIG_FEATURES_FOR_READ, featuresnv); 315 nvlist_destroy(featuresnv); 316 317 return (poolnv); 318 } 319 320 static nvlist_t * 321 pool_disk_vdev_config_nvcreate(zfs_opt_t *zfs) 322 { 323 nvlist_t *diskvdevnv; 324 325 assert(zfs->objarrid != 0); 326 327 diskvdevnv = nvlist_create(NV_UNIQUE_NAME); 328 nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK); 329 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASHIFT, zfs->ashift); 330 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASIZE, zfs->asize); 331 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_GUID, zfs->vdevguid); 332 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ID, 0); 333 nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_PATH, "/dev/null"); 334 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_WHOLE_DISK, 1); 335 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG); 336 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_ARRAY, 337 zfs->objarrid); 338 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_SHIFT, 339 zfs->msshift); 340 341 return (diskvdevnv); 342 } 343 344 static nvlist_t * 345 pool_root_vdev_config_nvcreate(zfs_opt_t *zfs) 346 { 347 nvlist_t *diskvdevnv, *rootvdevnv; 348 349 diskvdevnv = pool_disk_vdev_config_nvcreate(zfs); 350 rootvdevnv = nvlist_create(NV_UNIQUE_NAME); 351 352 nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_ID, 0); 353 nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_GUID, zfs->poolguid); 354 nvlist_add_string(rootvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 355 nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG); 356 nvlist_add_nvlist_array(rootvdevnv, ZPOOL_CONFIG_CHILDREN, &diskvdevnv, 357 1); 358 nvlist_destroy(diskvdevnv); 359 360 return (rootvdevnv); 361 } 362 363 /* 364 * Create the pool's "config" object, which contains an nvlist describing pool 365 * parameters and the vdev topology. It is similar but not identical to the 366 * nvlist stored in vdev labels. The main difference is that vdev labels do not 367 * describe the full vdev tree and in particular do not contain the "root" 368 * meta-vdev. 369 */ 370 static void 371 pool_init_objdir_config(zfs_opt_t *zfs, zfs_zap_t *objdir) 372 { 373 dnode_phys_t *dnode; 374 nvlist_t *poolconfig, *vdevconfig; 375 void *configbuf; 376 uint64_t dnid; 377 off_t configloc, configblksz; 378 int error; 379 380 dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_PACKED_NVLIST, 381 DMU_OT_PACKED_NVLIST_SIZE, sizeof(uint64_t), &dnid); 382 383 poolconfig = pool_config_nvcreate(zfs); 384 385 vdevconfig = pool_root_vdev_config_nvcreate(zfs); 386 nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig); 387 nvlist_destroy(vdevconfig); 388 389 error = nvlist_export(poolconfig); 390 if (error != 0) 391 errc(1, error, "nvlist_export"); 392 393 configblksz = nvlist_size(poolconfig); 394 configloc = objset_space_alloc(zfs, zfs->mos, &configblksz); 395 configbuf = ecalloc(1, configblksz); 396 nvlist_copy(poolconfig, configbuf, configblksz); 397 398 vdev_pwrite_dnode_data(zfs, dnode, configbuf, configblksz, configloc); 399 400 dnode->dn_datablkszsec = configblksz >> MINBLOCKSHIFT; 401 dnode->dn_flags = DNODE_FLAG_USED_BYTES; 402 *(uint64_t *)DN_BONUS(dnode) = nvlist_size(poolconfig); 403 404 zap_add_uint64(objdir, DMU_POOL_CONFIG, dnid); 405 406 nvlist_destroy(poolconfig); 407 free(configbuf); 408 } 409 410 /* 411 * Add objects block pointer list objects, used for deferred frees. We don't do 412 * anything with them, but they need to be present or OpenZFS will refuse to 413 * import the pool. 414 */ 415 static void 416 pool_init_objdir_bplists(zfs_opt_t *zfs __unused, zfs_zap_t *objdir) 417 { 418 uint64_t dnid; 419 420 (void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR, 421 BPOBJ_SIZE_V2, &dnid); 422 zap_add_uint64(objdir, DMU_POOL_FREE_BPOBJ, dnid); 423 424 (void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR, 425 BPOBJ_SIZE_V2, &dnid); 426 zap_add_uint64(objdir, DMU_POOL_SYNC_BPLIST, dnid); 427 } 428 429 /* 430 * Add required feature metadata objects. We don't know anything about ZFS 431 * features, so the objects are just empty ZAPs. 432 */ 433 static void 434 pool_init_objdir_feature_maps(zfs_opt_t *zfs, zfs_zap_t *objdir) 435 { 436 dnode_phys_t *dnode; 437 uint64_t dnid; 438 439 dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid); 440 zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_READ, dnid); 441 zap_write(zfs, zap_alloc(zfs->mos, dnode)); 442 443 dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid); 444 zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_WRITE, dnid); 445 zap_write(zfs, zap_alloc(zfs->mos, dnode)); 446 447 dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid); 448 zap_add_uint64(objdir, DMU_POOL_FEATURE_DESCRIPTIONS, dnid); 449 zap_write(zfs, zap_alloc(zfs->mos, dnode)); 450 } 451 452 static void 453 pool_init_objdir_dsl(zfs_opt_t *zfs, zfs_zap_t *objdir) 454 { 455 zap_add_uint64(objdir, DMU_POOL_ROOT_DATASET, 456 dsl_dir_id(zfs->rootdsldir)); 457 } 458 459 static void 460 pool_init_objdir_poolprops(zfs_opt_t *zfs, zfs_zap_t *objdir) 461 { 462 dnode_phys_t *dnode; 463 uint64_t id; 464 465 dnode = objset_dnode_alloc(zfs->mos, DMU_OT_POOL_PROPS, &id); 466 zap_add_uint64(objdir, DMU_POOL_PROPS, id); 467 468 zfs->poolprops = zap_alloc(zfs->mos, dnode); 469 } 470 471 /* 472 * Initialize the MOS object directory, the root of virtually all of the pool's 473 * data and metadata. 474 */ 475 static void 476 pool_init_objdir(zfs_opt_t *zfs) 477 { 478 zfs_zap_t *zap; 479 dnode_phys_t *objdir; 480 481 objdir = objset_dnode_lookup(zfs->mos, DMU_POOL_DIRECTORY_OBJECT); 482 483 zap = zap_alloc(zfs->mos, objdir); 484 pool_init_objdir_config(zfs, zap); 485 pool_init_objdir_bplists(zfs, zap); 486 pool_init_objdir_feature_maps(zfs, zap); 487 pool_init_objdir_dsl(zfs, zap); 488 pool_init_objdir_poolprops(zfs, zap); 489 zap_write(zfs, zap); 490 } 491 492 /* 493 * Initialize the meta-object set (MOS) and immediately write out several 494 * special objects whose contents are already finalized, including the object 495 * directory. 496 * 497 * Once the MOS is finalized, it'll look roughly like this: 498 * 499 * object directory (ZAP) 500 * |-> vdev config object (nvlist) 501 * |-> features for read 502 * |-> features for write 503 * |-> feature descriptions 504 * |-> sync bplist 505 * |-> free bplist 506 * |-> pool properties 507 * L-> root DSL directory 508 * |-> DSL child directory (ZAP) 509 * | |-> $MOS (DSL dir) 510 * | | |-> child map 511 * | | L-> props (ZAP) 512 * | |-> $FREE (DSL dir) 513 * | | |-> child map 514 * | | L-> props (ZAP) 515 * | |-> $ORIGIN (DSL dir) 516 * | | |-> child map 517 * | | |-> dataset 518 * | | | L-> deadlist 519 * | | |-> snapshot 520 * | | | |-> deadlist 521 * | | | L-> snapshot names 522 * | | |-> props (ZAP) 523 * | | L-> clones (ZAP) 524 * | |-> dataset 1 (DSL dir) 525 * | | |-> DSL dataset 526 * | | | |-> snapshot names 527 * | | | L-> deadlist 528 * | | |-> child map 529 * | | | L-> ... 530 * | | L-> props 531 * | |-> dataset 2 532 * | | L-> ... 533 * | |-> ... 534 * | L-> dataset n 535 * |-> DSL root dataset 536 * | |-> snapshot names 537 * | L-> deadlist 538 * L-> props (ZAP) 539 * space map object array 540 * |-> space map 1 541 * |-> space map 2 542 * |-> ... 543 * L-> space map n (zfs->mscount) 544 * 545 * The space map object array is pointed to by the "msarray" property in the 546 * pool configuration. 547 */ 548 static void 549 pool_init(zfs_opt_t *zfs) 550 { 551 uint64_t dnid; 552 553 if (zfs->poolguid == 0) 554 zfs->poolguid = randomguid(); 555 zfs->vdevguid = randomguid(); 556 557 zfs->mos = objset_alloc(zfs, DMU_OST_META); 558 559 (void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_DIRECTORY, &dnid); 560 assert(dnid == DMU_POOL_DIRECTORY_OBJECT); 561 562 (void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_ARRAY, &zfs->objarrid); 563 564 dsl_init(zfs); 565 566 pool_init_objdir(zfs); 567 } 568 569 static void 570 pool_labels_write(zfs_opt_t *zfs) 571 { 572 uberblock_t *ub; 573 vdev_label_t *label; 574 nvlist_t *poolconfig, *vdevconfig; 575 int error; 576 577 label = ecalloc(1, sizeof(*label)); 578 579 /* 580 * Assemble the vdev configuration and store it in the label. 581 */ 582 poolconfig = pool_config_nvcreate(zfs); 583 vdevconfig = pool_disk_vdev_config_nvcreate(zfs); 584 nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig); 585 nvlist_destroy(vdevconfig); 586 587 error = nvlist_export(poolconfig); 588 if (error != 0) 589 errc(1, error, "nvlist_export"); 590 nvlist_copy(poolconfig, label->vl_vdev_phys.vp_nvlist, 591 sizeof(label->vl_vdev_phys.vp_nvlist)); 592 nvlist_destroy(poolconfig); 593 594 /* 595 * Fill out the uberblock. Just make each one the same. The embedded 596 * checksum is calculated in vdev_label_write(). 597 */ 598 for (size_t uoff = 0; uoff < sizeof(label->vl_uberblock); 599 uoff += ASHIFT_UBERBLOCK_SIZE(zfs->ashift)) { 600 ub = (uberblock_t *)(&label->vl_uberblock[0] + uoff); 601 ub->ub_magic = UBERBLOCK_MAGIC; 602 ub->ub_version = SPA_VERSION; 603 604 /* 605 * Upon import, OpenZFS will perform metadata verification of 606 * the last TXG by default. If all data is written in the same 607 * TXG, it'll all get verified, which can be painfully slow in 608 * some cases, e.g., initial boot in a cloud environment with 609 * slow storage. So, fabricate additional TXGs to avoid this 610 * overhead, unless the user requests otherwise. 611 */ 612 ub->ub_txg = TXG; 613 if (!zfs->verify_txgs) 614 ub->ub_txg += TXG_SIZE; 615 ub->ub_guid_sum = zfs->poolguid + zfs->vdevguid; 616 ub->ub_timestamp = 0; 617 618 ub->ub_software_version = SPA_VERSION; 619 ub->ub_mmp_magic = MMP_MAGIC; 620 ub->ub_mmp_delay = 0; 621 ub->ub_mmp_config = 0; 622 ub->ub_checkpoint_txg = 0; 623 objset_root_blkptr_copy(zfs->mos, &ub->ub_rootbp); 624 } 625 626 /* 627 * Write out four copies of the label: two at the beginning of the vdev 628 * and two at the end. 629 */ 630 for (int i = 0; i < VDEV_LABELS; i++) 631 vdev_label_write(zfs, i, label); 632 633 free(label); 634 } 635 636 static void 637 pool_fini(zfs_opt_t *zfs) 638 { 639 zap_write(zfs, zfs->poolprops); 640 dsl_write(zfs); 641 objset_write(zfs, zfs->mos); 642 pool_labels_write(zfs); 643 } 644 645 struct dnode_cursor * 646 dnode_cursor_init(zfs_opt_t *zfs, zfs_objset_t *os, dnode_phys_t *dnode, 647 off_t size, off_t blksz) 648 { 649 struct dnode_cursor *c; 650 uint64_t nbppindir, indlevel, ndatablks, nindblks; 651 652 assert(dnode->dn_nblkptr == 1); 653 assert(blksz <= MAXBLOCKSIZE); 654 655 if (blksz == 0) { 656 /* Must be between 1<<ashift and 128KB. */ 657 blksz = MIN(MAXBLOCKSIZE, MAX(1 << zfs->ashift, 658 powerof2(size) ? size : (1l << flsll(size)))); 659 } 660 assert(powerof2(blksz)); 661 662 /* 663 * Do we need indirect blocks? Figure out how many levels are needed 664 * (indlevel == 1 means no indirect blocks) and how much space is needed 665 * (it has to be allocated up-front to break the dependency cycle 666 * described in objset_write()). 667 */ 668 ndatablks = size == 0 ? 0 : howmany(size, blksz); 669 nindblks = 0; 670 for (indlevel = 1, nbppindir = 1; ndatablks > nbppindir; indlevel++) { 671 nbppindir *= BLKPTR_PER_INDIR; 672 nindblks += howmany(ndatablks, indlevel * nbppindir); 673 } 674 assert(indlevel < INDIR_LEVELS); 675 676 dnode->dn_nlevels = (uint8_t)indlevel; 677 dnode->dn_maxblkid = ndatablks > 0 ? ndatablks - 1 : 0; 678 dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT; 679 680 c = ecalloc(1, sizeof(*c)); 681 if (nindblks > 0) { 682 c->indspace = nindblks * MAXBLOCKSIZE; 683 c->indloc = objset_space_alloc(zfs, os, &c->indspace); 684 } 685 c->dnode = dnode; 686 c->dataoff = 0; 687 c->datablksz = blksz; 688 689 return (c); 690 } 691 692 static void 693 _dnode_cursor_flush(zfs_opt_t *zfs, struct dnode_cursor *c, unsigned int levels) 694 { 695 blkptr_t *bp, *pbp; 696 void *buf; 697 uint64_t fill; 698 off_t blkid, blksz, loc; 699 700 assert(levels > 0); 701 assert(levels <= c->dnode->dn_nlevels - 1U); 702 703 blksz = MAXBLOCKSIZE; 704 blkid = (c->dataoff / c->datablksz) / BLKPTR_PER_INDIR; 705 for (unsigned int level = 1; level <= levels; level++) { 706 buf = c->inddir[level - 1]; 707 708 if (level == c->dnode->dn_nlevels - 1U) { 709 pbp = &c->dnode->dn_blkptr[0]; 710 } else { 711 uint64_t iblkid; 712 713 iblkid = blkid & (BLKPTR_PER_INDIR - 1); 714 pbp = (blkptr_t *) 715 &c->inddir[level][iblkid * sizeof(blkptr_t)]; 716 } 717 718 /* 719 * Space for indirect blocks is allocated up-front; see the 720 * comment in objset_write(). 721 */ 722 loc = c->indloc; 723 c->indloc += blksz; 724 assert(c->indspace >= blksz); 725 c->indspace -= blksz; 726 727 bp = buf; 728 fill = 0; 729 for (size_t i = 0; i < BLKPTR_PER_INDIR; i++) 730 fill += BP_GET_FILL(&bp[i]); 731 732 vdev_pwrite_dnode_indir(zfs, c->dnode, level, fill, buf, blksz, 733 loc, pbp); 734 memset(buf, 0, MAXBLOCKSIZE); 735 736 blkid /= BLKPTR_PER_INDIR; 737 } 738 } 739 740 blkptr_t * 741 dnode_cursor_next(zfs_opt_t *zfs, struct dnode_cursor *c, off_t off) 742 { 743 off_t blkid, l1id; 744 unsigned int levels; 745 746 if (c->dnode->dn_nlevels == 1) { 747 assert(off < MAXBLOCKSIZE); 748 return (&c->dnode->dn_blkptr[0]); 749 } 750 751 assert(off % c->datablksz == 0); 752 753 /* Do we need to flush any full indirect blocks? */ 754 if (off > 0) { 755 blkid = off / c->datablksz; 756 for (levels = 0; levels < c->dnode->dn_nlevels - 1U; levels++) { 757 if (blkid % BLKPTR_PER_INDIR != 0) 758 break; 759 blkid /= BLKPTR_PER_INDIR; 760 } 761 if (levels > 0) 762 _dnode_cursor_flush(zfs, c, levels); 763 } 764 765 c->dataoff = off; 766 l1id = (off / c->datablksz) & (BLKPTR_PER_INDIR - 1); 767 return ((blkptr_t *)&c->inddir[0][l1id * sizeof(blkptr_t)]); 768 } 769 770 void 771 dnode_cursor_finish(zfs_opt_t *zfs, struct dnode_cursor *c) 772 { 773 unsigned int levels; 774 775 assert(c->dnode->dn_nlevels > 0); 776 levels = c->dnode->dn_nlevels - 1; 777 if (levels > 0) 778 _dnode_cursor_flush(zfs, c, levels); 779 assert(c->indspace == 0); 780 free(c); 781 } 782 783 void 784 zfs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts) 785 { 786 zfs_opt_t *zfs; 787 int dirfd; 788 789 zfs = fsopts->fs_specific; 790 791 /* 792 * Use a fixed seed to provide reproducible pseudo-random numbers for 793 * on-disk structures when needed (e.g., GUIDs, ZAP hash salts). 794 */ 795 srandom(1729); 796 797 zfs_check_opts(fsopts); 798 799 dirfd = open(dir, O_DIRECTORY | O_RDONLY); 800 if (dirfd < 0) 801 err(1, "open(%s)", dir); 802 803 vdev_init(zfs, image); 804 pool_init(zfs); 805 fs_build(zfs, dirfd, root); 806 pool_fini(zfs); 807 vdev_fini(zfs); 808 } 809