1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2022 The FreeBSD Foundation 5 * 6 * This software was developed by Mark Johnston under sponsorship from 7 * the FreeBSD Foundation. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions are 11 * met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 
 */

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/queue.h>

#include <assert.h>
#include <ctype.h>
#include <fcntl.h>
#include <stdalign.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <util.h>

#include "makefs.h"
#include "zfs.h"

/* Total space consumed by vdev labels (two at each end of the vdev). */
#define	VDEV_LABEL_SPACE	\
	((off_t)(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE))
_Static_assert(VDEV_LABEL_SPACE <= MINDEVSIZE, "");

#define	MINMSSIZE	((off_t)1 << 24) /* 16MB */
#define	DFLTMSSIZE	((off_t)1 << 29) /* 512MB */
#define	MAXMSSIZE	((off_t)1 << 34) /* 16GB */

#define	INDIR_LEVELS	6
/* Indirect blocks are always 128KB. */
#define	BLKPTR_PER_INDIR	(MAXBLOCKSIZE / sizeof(blkptr_t))

/*
 * State used when writing out a dnode's data blocks and the indirect
 * blocks that reference them.  Space for indirect blocks is allocated
 * up front (indloc/indspace); inddir[] provides one in-memory scratch
 * indirect block per level.
 */
struct dnode_cursor {
	char inddir[INDIR_LEVELS][MAXBLOCKSIZE];
	off_t indloc;		/* next free offset in preallocated indirect space */
	off_t indspace;		/* remaining preallocated indirect space */
	dnode_phys_t *dnode;	/* dnode whose blocks are being written */
	off_t dataoff;		/* file offset of the most recent data block */
	off_t datablksz;	/* data block size for this dnode */
};

/*
 * Allocate the ZFS option state and register the option table with makefs.
 */
void
zfs_prep_opts(fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	size_t align;

	/* The option state requires uint64_t alignment. */
	align = alignof(uint64_t);
	zfs = aligned_alloc(align, roundup2(sizeof(*zfs), align));
	if (zfs == NULL)
		err(1, "aligned_alloc");
	memset(zfs, 0, sizeof(*zfs));

	const option_t zfs_options[] = {
		{ '\0', "bootfs", &zfs->bootfs, OPT_STRPTR,
		  0, 0, "Bootable dataset" },
		{ '\0', "mssize", &zfs->mssize, OPT_INT64,
		  MINMSSIZE, MAXMSSIZE, "Metaslab size" },
		{ '\0', "poolname", &zfs->poolname, OPT_STRPTR,
		  0, 0, "ZFS pool name" },
		{ '\0', "rootpath", &zfs->rootpath, OPT_STRPTR,
		  0, 0, "Prefix for all dataset mount points" },
		{ '\0', "ashift", &zfs->ashift, OPT_INT32,
		  MINBLOCKSHIFT, MAXBLOCKSHIFT, "ZFS pool ashift" },
		{ '\0', "verify-txgs", &zfs->verify_txgs, OPT_BOOL,
		  0, 0, "Make OpenZFS verify data upon import" },
		{ '\0', "nowarn", &zfs->nowarn, OPT_BOOL,
		  0, 0, "Provided for backwards compatibility, ignored" },
		{ .name = NULL }
	};

	STAILQ_INIT(&zfs->datasetdescs);

	fsopts->fs_specific = zfs;
	fsopts->fs_options = copy_opts(zfs_options);
}

/*
 * Handle a single "-o" option.  "fs" options describe datasets and are
 * stashed for later parsing; everything else goes through set_option().
 * Returns 1 on success, 0 if the option was not recognized.
 */
int
zfs_parse_opts(const char *option, fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	struct dataset_desc *dsdesc;
	char buf[BUFSIZ], *opt, *val;
	int rv;

	zfs = fsopts->fs_specific;

	opt = val = estrdup(option);
	opt = strsep(&val, "=");
	if (strcmp(opt, "fs") == 0) {
		if (val == NULL)
			errx(1, "invalid filesystem parameters `%s'", option);

		/*
		 * Dataset descriptions will be parsed later, in dsl_init().
		 * Just stash them away for now.
		 */
		dsdesc = ecalloc(1, sizeof(*dsdesc));
		dsdesc->params = estrdup(val);
		free(opt);
		STAILQ_INSERT_TAIL(&zfs->datasetdescs, dsdesc, next);
		return (1);
	}
	free(opt);

	rv = set_option(fsopts->fs_options, option, buf, sizeof(buf));
	return (rv == -1 ? 0 : 1);
}

/*
 * Derive the vdev size, metaslab size and metaslab count from the image
 * size bounds and the (already validated) pool options.
 */
static void
zfs_size_vdev(fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	off_t asize, mssize, vdevsize, vdevsize1;

	zfs = fsopts->fs_specific;

	assert(fsopts->maxsize != 0);
	assert(zfs->ashift != 0);

	/*
	 * Figure out how big the vdev should be.
	 */
	vdevsize = rounddown2(fsopts->maxsize, 1 << zfs->ashift);
	if (vdevsize < MINDEVSIZE)
		errx(1, "maximum image size is too small");
	if (vdevsize < fsopts->minsize || vdevsize > fsopts->maxsize) {
		errx(1, "image size bounds must be multiples of %d",
		    1 << zfs->ashift);
	}
	asize = vdevsize - VDEV_LABEL_SPACE;

	/*
	 * Size metaslabs according to the following heuristic:
	 * - provide at least 8 metaslabs,
	 * - without using a metaslab size larger than 512MB.
	 * This approximates what OpenZFS does without being complicated.  In
	 * practice we expect pools to be expanded upon first use, and OpenZFS
	 * does not resize metaslabs in that case, so there is no right answer
 * here.  In general we want to provide large metaslabs even if the
	 * image size is small, and 512MB is a reasonable size for pools up to
	 * several hundred gigabytes.
	 *
	 * The user may override this heuristic using the "-o mssize" option.
	 */
	mssize = zfs->mssize;
	if (mssize == 0) {
		mssize = MAX(MIN(asize / 8, DFLTMSSIZE), MINMSSIZE);
		/* Round down to the nearest power of two. */
		if (!powerof2(mssize))
			mssize = 1l << (flsll(mssize) - 1);
	}
	if (!powerof2(mssize))
		errx(1, "metaslab size must be a power of 2");

	/*
	 * If we have some slop left over, try to cover it by resizing the vdev,
	 * subject to the maxsize and minsize parameters.
	 */
	if (asize % mssize != 0) {
		vdevsize1 = rounddown2(asize, mssize) + VDEV_LABEL_SPACE;
		if (vdevsize1 < fsopts->minsize)
			vdevsize1 = roundup2(asize, mssize) + VDEV_LABEL_SPACE;
		if (vdevsize1 <= fsopts->maxsize)
			vdevsize = vdevsize1;
	}
	asize = vdevsize - VDEV_LABEL_SPACE;

	zfs->asize = asize;
	zfs->vdevsize = vdevsize;
	zfs->mssize = mssize;
	zfs->msshift = flsll(mssize) - 1;
	zfs->mscount = asize / mssize;
}

/*
 * Validate options and set some default values.
206 */ 207 static void 208 zfs_check_opts(fsinfo_t *fsopts) 209 { 210 zfs_opt_t *zfs; 211 212 zfs = fsopts->fs_specific; 213 214 if (fsopts->offset != 0) 215 errx(1, "unhandled offset option"); 216 if (fsopts->maxsize == 0) 217 errx(1, "an image size must be specified"); 218 219 if (zfs->poolname == NULL) 220 errx(1, "a pool name must be specified"); 221 if (!isalpha(zfs->poolname[0])) 222 errx(1, "the pool name must begin with a letter"); 223 for (size_t i = 0, len = strlen(zfs->poolname); i < len; i++) { 224 if (!isalnum(zfs->poolname[i]) && zfs->poolname[i] != '_') 225 errx(1, "invalid character '%c' in pool name", 226 zfs->poolname[i]); 227 } 228 if (strcmp(zfs->poolname, "mirror") == 0 || 229 strcmp(zfs->poolname, "raidz") == 0 || 230 strcmp(zfs->poolname, "draid") == 0) { 231 errx(1, "pool name '%s' is reserved and cannot be used", 232 zfs->poolname); 233 } 234 235 if (zfs->rootpath == NULL) 236 easprintf(&zfs->rootpath, "/%s", zfs->poolname); 237 if (zfs->rootpath[0] != '/') 238 errx(1, "mountpoint `%s' must be absolute", zfs->rootpath); 239 240 if (zfs->ashift == 0) 241 zfs->ashift = 12; 242 243 zfs_size_vdev(fsopts); 244 } 245 246 void 247 zfs_cleanup_opts(fsinfo_t *fsopts) 248 { 249 struct dataset_desc *d, *tmp; 250 zfs_opt_t *zfs; 251 252 zfs = fsopts->fs_specific; 253 free(zfs->rootpath); 254 free(zfs->bootfs); 255 free(__DECONST(void *, zfs->poolname)); 256 STAILQ_FOREACH_SAFE(d, &zfs->datasetdescs, next, tmp) { 257 free(d->params); 258 free(d); 259 } 260 free(zfs); 261 free(fsopts->fs_options); 262 } 263 264 static size_t 265 nvlist_size(const nvlist_t *nvl) 266 { 267 return (sizeof(nvl->nv_header) + nvl->nv_size); 268 } 269 270 static void 271 nvlist_copy(const nvlist_t *nvl, char *buf, size_t sz) 272 { 273 assert(sz >= nvlist_size(nvl)); 274 275 memcpy(buf, &nvl->nv_header, sizeof(nvl->nv_header)); 276 memcpy(buf + sizeof(nvl->nv_header), nvl->nv_data, nvl->nv_size); 277 } 278 279 /* 280 * Avoid returning a GUID of 0, just to avoid the possibility 
 * that something
 * will interpret that as meaning that the GUID is uninitialized.
 */
uint64_t
randomguid(void)
{
	uint64_t ret;

	/* random() yields 31-bit values; combine two for a 64-bit GUID. */
	do {
		ret = ((uint64_t)random() << 32) | random();
	} while (ret == 0);

	return (ret);
}

/*
 * Build the pool-level portion of the configuration nvlist: pool name,
 * GUIDs, version and state, plus an (empty) features-for-read list.
 */
static nvlist_t *
pool_config_nvcreate(zfs_opt_t *zfs)
{
	nvlist_t *featuresnv, *poolnv;

	poolnv = nvlist_create(NV_UNIQUE_NAME);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_TXG, TXG);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VERSION, SPA_VERSION);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_EXPORTED);
	nvlist_add_string(poolnv, ZPOOL_CONFIG_POOL_NAME, zfs->poolname);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_GUID, zfs->poolguid);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_TOP_GUID, zfs->vdevguid);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VDEV_CHILDREN, 1);

	featuresnv = nvlist_create(NV_UNIQUE_NAME);
	nvlist_add_nvlist(poolnv, ZPOOL_CONFIG_FEATURES_FOR_READ, featuresnv);
	nvlist_destroy(featuresnv);

	return (poolnv);
}

/*
 * Build the config nvlist for the single disk vdev backing the pool.
 */
static nvlist_t *
pool_disk_vdev_config_nvcreate(zfs_opt_t *zfs)
{
	nvlist_t *diskvdevnv;

	assert(zfs->objarrid != 0);

	diskvdevnv = nvlist_create(NV_UNIQUE_NAME);
	nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASHIFT, zfs->ashift);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASIZE, zfs->asize);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ID, 0);
	nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_PATH, "/dev/null");
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_WHOLE_DISK, 1);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_ARRAY,
	    zfs->objarrid);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_SHIFT,
	    zfs->msshift);

	return (diskvdevnv);
}

/*
 * Build the config nvlist for the root meta-vdev, with the disk vdev as
 * its sole child.
 */
static nvlist_t *
pool_root_vdev_config_nvcreate(zfs_opt_t *zfs)
{
	nvlist_t *diskvdevnv, *rootvdevnv;

	diskvdevnv = pool_disk_vdev_config_nvcreate(zfs);
	rootvdevnv = nvlist_create(NV_UNIQUE_NAME);

	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_ID, 0);
	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_GUID, zfs->poolguid);
	nvlist_add_string(rootvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
	nvlist_add_nvlist_array(rootvdevnv, ZPOOL_CONFIG_CHILDREN, &diskvdevnv,
	    1);
	nvlist_destroy(diskvdevnv);

	return (rootvdevnv);
}

/*
 * Create the pool's "config" object, which contains an nvlist describing pool
 * parameters and the vdev topology.  It is similar but not identical to the
 * nvlist stored in vdev labels.  The main difference is that vdev labels do not
 * describe the full vdev tree and in particular do not contain the "root"
 * meta-vdev.
366 */ 367 static void 368 pool_init_objdir_config(zfs_opt_t *zfs, zfs_zap_t *objdir) 369 { 370 dnode_phys_t *dnode; 371 nvlist_t *poolconfig, *vdevconfig; 372 void *configbuf; 373 uint64_t dnid; 374 off_t configloc, configblksz; 375 int error; 376 377 dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_PACKED_NVLIST, 378 DMU_OT_PACKED_NVLIST_SIZE, sizeof(uint64_t), &dnid); 379 380 poolconfig = pool_config_nvcreate(zfs); 381 382 vdevconfig = pool_root_vdev_config_nvcreate(zfs); 383 nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig); 384 nvlist_destroy(vdevconfig); 385 386 error = nvlist_export(poolconfig); 387 if (error != 0) 388 errc(1, error, "nvlist_export"); 389 390 configblksz = nvlist_size(poolconfig); 391 configloc = objset_space_alloc(zfs, zfs->mos, &configblksz); 392 configbuf = ecalloc(1, configblksz); 393 nvlist_copy(poolconfig, configbuf, configblksz); 394 395 vdev_pwrite_dnode_data(zfs, dnode, configbuf, configblksz, configloc); 396 397 dnode->dn_datablkszsec = configblksz >> MINBLOCKSHIFT; 398 dnode->dn_flags = DNODE_FLAG_USED_BYTES; 399 *(uint64_t *)DN_BONUS(dnode) = nvlist_size(poolconfig); 400 401 zap_add_uint64(objdir, DMU_POOL_CONFIG, dnid); 402 403 nvlist_destroy(poolconfig); 404 free(configbuf); 405 } 406 407 /* 408 * Add objects block pointer list objects, used for deferred frees. We don't do 409 * anything with them, but they need to be present or OpenZFS will refuse to 410 * import the pool. 411 */ 412 static void 413 pool_init_objdir_bplists(zfs_opt_t *zfs __unused, zfs_zap_t *objdir) 414 { 415 uint64_t dnid; 416 417 (void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR, 418 BPOBJ_SIZE_V2, &dnid); 419 zap_add_uint64(objdir, DMU_POOL_FREE_BPOBJ, dnid); 420 421 (void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR, 422 BPOBJ_SIZE_V2, &dnid); 423 zap_add_uint64(objdir, DMU_POOL_SYNC_BPLIST, dnid); 424 } 425 426 /* 427 * Add required feature metadata objects. 
 * We don't know anything about ZFS
 * features, so the objects are just empty ZAPs.
 */
static void
pool_init_objdir_feature_maps(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	dnode_phys_t *dnode;
	uint64_t dnid;

	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_READ, dnid);
	zap_write(zfs, zap_alloc(zfs->mos, dnode));

	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_WRITE, dnid);
	zap_write(zfs, zap_alloc(zfs->mos, dnode));

	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FEATURE_DESCRIPTIONS, dnid);
	zap_write(zfs, zap_alloc(zfs->mos, dnode));
}

/*
 * Record the root DSL directory's object ID in the object directory.
 */
static void
pool_init_objdir_dsl(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	zap_add_uint64(objdir, DMU_POOL_ROOT_DATASET,
	    dsl_dir_id(zfs->rootdsldir));
}

/*
 * Allocate the pool properties ZAP and link it into the object directory.
 * The ZAP is written later, in pool_fini(), after properties are filled in.
 */
static void
pool_init_objdir_poolprops(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	dnode_phys_t *dnode;
	uint64_t id;

	dnode = objset_dnode_alloc(zfs->mos, DMU_OT_POOL_PROPS, &id);
	zap_add_uint64(objdir, DMU_POOL_PROPS, id);

	zfs->poolprops = zap_alloc(zfs->mos, dnode);
}

/*
 * Initialize the MOS object directory, the root of virtually all of the pool's
 * data and metadata.
 */
static void
pool_init_objdir(zfs_opt_t *zfs)
{
	zfs_zap_t *zap;
	dnode_phys_t *objdir;

	objdir = objset_dnode_lookup(zfs->mos, DMU_POOL_DIRECTORY_OBJECT);

	zap = zap_alloc(zfs->mos, objdir);
	pool_init_objdir_config(zfs, zap);
	pool_init_objdir_bplists(zfs, zap);
	pool_init_objdir_feature_maps(zfs, zap);
	pool_init_objdir_dsl(zfs, zap);
	pool_init_objdir_poolprops(zfs, zap);
	zap_write(zfs, zap);
}

/*
 * Initialize the meta-object set (MOS) and immediately write out several
 * special objects whose contents are already finalized, including the object
 * directory.
 *
 * Once the MOS is finalized, it'll look roughly like this:
 *
 *	object directory (ZAP)
 *	|-> vdev config object (nvlist)
 *	|-> features for read
 *	|-> features for write
 *	|-> feature descriptions
 *	|-> sync bplist
 *	|-> free bplist
 *	|-> pool properties
 *	L-> root DSL directory
 *	    |-> DSL child directory (ZAP)
 *	    |   |-> $MOS (DSL dir)
 *	    |   |   |-> child map
 *	    |   |   L-> props (ZAP)
 *	    |   |-> $FREE (DSL dir)
 *	    |   |   |-> child map
 *	    |   |   L-> props (ZAP)
 *	    |   |-> $ORIGIN (DSL dir)
 *	    |   |   |-> child map
 *	    |   |   |-> dataset
 *	    |   |   |   L-> deadlist
 *	    |   |   |-> snapshot
 *	    |   |   |   |-> deadlist
 *	    |   |   |   L-> snapshot names
 *	    |   |   |-> props (ZAP)
 *	    |   |   L-> clones (ZAP)
 *	    |   |-> dataset 1 (DSL dir)
 *	    |   |   |-> DSL dataset
 *	    |   |   |   |-> snapshot names
 *	    |   |   |   L-> deadlist
 *	    |   |   |-> child map
 *	    |   |   |   L-> ...
 *	    |   |   L-> props
 *	    |   |-> dataset 2
 *	    |   |   L-> ...
 *	    |   |-> ...
 *	    |   L-> dataset n
 *	    |-> DSL root dataset
 *	    |   |-> snapshot names
 *	    |   L-> deadlist
 *	    L-> props (ZAP)
 *	space map object array
 *	|-> space map 1
 *	|-> space map 2
 *	|-> ...
 *	L-> space map n (zfs->mscount)
 *
 * The space map object array is pointed to by the "msarray" property in the
 * pool configuration.
 */
static void
pool_init(zfs_opt_t *zfs)
{
	uint64_t dnid;

	zfs->poolguid = randomguid();
	zfs->vdevguid = randomguid();

	zfs->mos = objset_alloc(zfs, DMU_OST_META);

	/* The object directory must be the first MOS object. */
	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_DIRECTORY, &dnid);
	assert(dnid == DMU_POOL_DIRECTORY_OBJECT);

	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_ARRAY, &zfs->objarrid);

	dsl_init(zfs);

	pool_init_objdir(zfs);
}

/*
 * Assemble a vdev label and write all four copies of it to the image.
 */
static void
pool_labels_write(zfs_opt_t *zfs)
{
	uberblock_t *ub;
	vdev_label_t *label;
	nvlist_t *poolconfig, *vdevconfig;
	int error;

	label = ecalloc(1, sizeof(*label));

	/*
	 * Assemble the vdev configuration and store it in the label.
	 */
	poolconfig = pool_config_nvcreate(zfs);
	vdevconfig = pool_disk_vdev_config_nvcreate(zfs);
	nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
	nvlist_destroy(vdevconfig);

	error = nvlist_export(poolconfig);
	if (error != 0)
		errc(1, error, "nvlist_export");
	nvlist_copy(poolconfig, label->vl_vdev_phys.vp_nvlist,
	    sizeof(label->vl_vdev_phys.vp_nvlist));
	nvlist_destroy(poolconfig);

	/*
	 * Fill out the uberblock.  Just make each one the same.  The embedded
	 * checksum is calculated in vdev_label_write().
	 */
	for (size_t uoff = 0; uoff < sizeof(label->vl_uberblock);
	    uoff += (1 << zfs->ashift)) {
		ub = (uberblock_t *)(&label->vl_uberblock[0] + uoff);
		ub->ub_magic = UBERBLOCK_MAGIC;
		ub->ub_version = SPA_VERSION;

		/*
		 * Upon import, OpenZFS will perform metadata verification of
		 * the last TXG by default.  If all data is written in the same
		 * TXG, it'll all get verified, which can be painfully slow in
		 * some cases, e.g., initial boot in a cloud environment with
		 * slow storage.  So, fabricate additional TXGs to avoid this
		 * overhead, unless the user requests otherwise.
607 */ 608 ub->ub_txg = TXG; 609 if (!zfs->verify_txgs) 610 ub->ub_txg += TXG_SIZE; 611 ub->ub_guid_sum = zfs->poolguid + zfs->vdevguid; 612 ub->ub_timestamp = 0; 613 614 ub->ub_software_version = SPA_VERSION; 615 ub->ub_mmp_magic = MMP_MAGIC; 616 ub->ub_mmp_delay = 0; 617 ub->ub_mmp_config = 0; 618 ub->ub_checkpoint_txg = 0; 619 objset_root_blkptr_copy(zfs->mos, &ub->ub_rootbp); 620 } 621 622 /* 623 * Write out four copies of the label: two at the beginning of the vdev 624 * and two at the end. 625 */ 626 for (int i = 0; i < VDEV_LABELS; i++) 627 vdev_label_write(zfs, i, label); 628 629 free(label); 630 } 631 632 static void 633 pool_fini(zfs_opt_t *zfs) 634 { 635 zap_write(zfs, zfs->poolprops); 636 dsl_write(zfs); 637 objset_write(zfs, zfs->mos); 638 pool_labels_write(zfs); 639 } 640 641 struct dnode_cursor * 642 dnode_cursor_init(zfs_opt_t *zfs, zfs_objset_t *os, dnode_phys_t *dnode, 643 off_t size, off_t blksz) 644 { 645 struct dnode_cursor *c; 646 uint64_t nbppindir, indlevel, ndatablks, nindblks; 647 648 assert(dnode->dn_nblkptr == 1); 649 assert(blksz <= MAXBLOCKSIZE); 650 651 if (blksz == 0) { 652 /* Must be between 1<<ashift and 128KB. */ 653 blksz = MIN(MAXBLOCKSIZE, MAX(1 << zfs->ashift, 654 powerof2(size) ? size : (1l << flsll(size)))); 655 } 656 assert(powerof2(blksz)); 657 658 /* 659 * Do we need indirect blocks? Figure out how many levels are needed 660 * (indlevel == 1 means no indirect blocks) and how much space is needed 661 * (it has to be allocated up-front to break the dependency cycle 662 * described in objset_write()). 663 */ 664 ndatablks = size == 0 ? 0 : howmany(size, blksz); 665 nindblks = 0; 666 for (indlevel = 1, nbppindir = 1; ndatablks > nbppindir; indlevel++) { 667 nbppindir *= BLKPTR_PER_INDIR; 668 nindblks += howmany(ndatablks, indlevel * nbppindir); 669 } 670 assert(indlevel < INDIR_LEVELS); 671 672 dnode->dn_nlevels = (uint8_t)indlevel; 673 dnode->dn_maxblkid = ndatablks > 0 ? 
ndatablks - 1 : 0; 674 dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT; 675 676 c = ecalloc(1, sizeof(*c)); 677 if (nindblks > 0) { 678 c->indspace = nindblks * MAXBLOCKSIZE; 679 c->indloc = objset_space_alloc(zfs, os, &c->indspace); 680 } 681 c->dnode = dnode; 682 c->dataoff = 0; 683 c->datablksz = blksz; 684 685 return (c); 686 } 687 688 static void 689 _dnode_cursor_flush(zfs_opt_t *zfs, struct dnode_cursor *c, unsigned int levels) 690 { 691 blkptr_t *bp, *pbp; 692 void *buf; 693 uint64_t fill; 694 off_t blkid, blksz, loc; 695 696 assert(levels > 0); 697 assert(levels <= c->dnode->dn_nlevels - 1U); 698 699 blksz = MAXBLOCKSIZE; 700 blkid = (c->dataoff / c->datablksz) / BLKPTR_PER_INDIR; 701 for (unsigned int level = 1; level <= levels; level++) { 702 buf = c->inddir[level - 1]; 703 704 if (level == c->dnode->dn_nlevels - 1U) { 705 pbp = &c->dnode->dn_blkptr[0]; 706 } else { 707 uint64_t iblkid; 708 709 iblkid = blkid & (BLKPTR_PER_INDIR - 1); 710 pbp = (blkptr_t *) 711 &c->inddir[level][iblkid * sizeof(blkptr_t)]; 712 } 713 714 /* 715 * Space for indirect blocks is allocated up-front; see the 716 * comment in objset_write(). 717 */ 718 loc = c->indloc; 719 c->indloc += blksz; 720 assert(c->indspace >= blksz); 721 c->indspace -= blksz; 722 723 bp = buf; 724 fill = 0; 725 for (size_t i = 0; i < BLKPTR_PER_INDIR; i++) 726 fill += BP_GET_FILL(&bp[i]); 727 728 vdev_pwrite_dnode_indir(zfs, c->dnode, level, fill, buf, blksz, 729 loc, pbp); 730 memset(buf, 0, MAXBLOCKSIZE); 731 732 blkid /= BLKPTR_PER_INDIR; 733 } 734 } 735 736 blkptr_t * 737 dnode_cursor_next(zfs_opt_t *zfs, struct dnode_cursor *c, off_t off) 738 { 739 off_t blkid, l1id; 740 unsigned int levels; 741 742 if (c->dnode->dn_nlevels == 1) { 743 assert(off < MAXBLOCKSIZE); 744 return (&c->dnode->dn_blkptr[0]); 745 } 746 747 assert(off % c->datablksz == 0); 748 749 /* Do we need to flush any full indirect blocks? 
 */
	if (off > 0) {
		blkid = off / c->datablksz;
		for (levels = 0; levels < c->dnode->dn_nlevels - 1U; levels++) {
			if (blkid % BLKPTR_PER_INDIR != 0)
				break;
			blkid /= BLKPTR_PER_INDIR;
		}
		if (levels > 0)
			_dnode_cursor_flush(zfs, c, levels);
	}

	c->dataoff = off;
	l1id = (off / c->datablksz) & (BLKPTR_PER_INDIR - 1);
	return ((blkptr_t *)&c->inddir[0][l1id * sizeof(blkptr_t)]);
}

/*
 * Flush any remaining indirect blocks and free the cursor.  All
 * preallocated indirect space must have been consumed by this point.
 */
void
dnode_cursor_finish(zfs_opt_t *zfs, struct dnode_cursor *c)
{
	unsigned int levels;

	assert(c->dnode->dn_nlevels > 0);
	levels = c->dnode->dn_nlevels - 1;
	if (levels > 0)
		_dnode_cursor_flush(zfs, c, levels);
	assert(c->indspace == 0);
	free(c);
}

/*
 * Entry point: build a ZFS pool image at "image" from the staged tree
 * rooted at "dir"/"root".
 */
void
zfs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	int dirfd;

	zfs = fsopts->fs_specific;

	/*
	 * Use a fixed seed to provide reproducible pseudo-random numbers for
	 * on-disk structures when needed (e.g., GUIDs, ZAP hash salts).
	 */
	srandom(1729);

	zfs_check_opts(fsopts);

	dirfd = open(dir, O_DIRECTORY | O_RDONLY);
	if (dirfd < 0)
		err(1, "open(%s)", dir);

	vdev_init(zfs, image);
	pool_init(zfs);
	fs_build(zfs, dirfd, root);
	pool_fini(zfs);
	vdev_fini(zfs);
}