/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2022 The FreeBSD Foundation
 *
 * This software was developed by Mark Johnston under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/queue.h>

#include <assert.h>
#include <ctype.h>
#include <fcntl.h>
#include <stdalign.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <util.h>

#include "makefs.h"
#include "zfs.h"

#define	VDEV_LABEL_SPACE	\
	((off_t)(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE))
_Static_assert(VDEV_LABEL_SPACE <= MINDEVSIZE, "");

#define	MINMSSIZE	((off_t)1 << 24)	/* 16MB */
#define	DFLTMSSIZE	((off_t)1 << 29)	/* 512MB */
#define	MAXMSSIZE	((off_t)1 << 34)	/* 16GB */

#define	INDIR_LEVELS	6
/* Indirect blocks are always 128KB. */
#define	BLKPTR_PER_INDIR	(MAXBLOCKSIZE / sizeof(blkptr_t))
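
/*
 * Tracks the progress of writing out a single dnode's data and indirect
 * blocks: "inddir" buffers one pending 128KB indirect block per level,
 * "indloc" and "indspace" describe the pre-allocated region into which filled
 * indirect blocks are written, "dataoff" is the offset of the data block most
 * recently handed out by dnode_cursor_next(), and "datablksz" is the dnode's
 * data block size.
 */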
struct dnode_cursor {
	char		inddir[INDIR_LEVELS][MAXBLOCKSIZE];
	off_t		indloc;
	off_t		indspace;
	dnode_phys_t	*dnode;
	off_t		dataoff;
	off_t		datablksz;
};

void
zfs_prep_opts(fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	size_t align;

	align = alignof(uint64_t);
	zfs = aligned_alloc(align, roundup2(sizeof(*zfs), align));
	if (zfs == NULL)
		err(1, "aligned_alloc");
	memset(zfs, 0, sizeof(*zfs));

	const option_t zfs_options[] = {
		{ '\0', "bootfs", &zfs->bootfs, OPT_STRPTR,
		  0, 0, "Bootable dataset" },
		{ '\0', "mssize", &zfs->mssize, OPT_INT64,
		  MINMSSIZE, MAXMSSIZE, "Metaslab size" },
		{ '\0', "poolname", &zfs->poolname, OPT_STRPTR,
		  0, 0, "ZFS pool name" },
		{ '\0', "rootpath", &zfs->rootpath, OPT_STRPTR,
		  0, 0, "Prefix for all dataset mount points" },
		{ '\0', "ashift", &zfs->ashift, OPT_INT32,
		  MINBLOCKSHIFT, MAXBLOCKSHIFT, "ZFS pool ashift" },
		{ '\0', "nowarn", &zfs->nowarn, OPT_BOOL,
		  0, 0, "Suppress warning about experimental ZFS support" },
		{ .name = NULL }
	};

	STAILQ_INIT(&zfs->datasetdescs);

	fsopts->fs_specific = zfs;
	fsopts->fs_options = copy_opts(zfs_options);
}

int
zfs_parse_opts(const char *option, fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	struct dataset_desc *dsdesc;
	char buf[BUFSIZ], *opt, *val;
	int rv;

	zfs = fsopts->fs_specific;

	opt = val = estrdup(option);
	opt = strsep(&val, "=");
	if (strcmp(opt, "fs") == 0) {
		if (val == NULL)
			errx(1, "invalid filesystem parameters `%s'", option);

		/*
		 * Dataset descriptions will be parsed later, in dsl_init().
		 * Just stash them away for now.
		 */
		dsdesc = ecalloc(1, sizeof(*dsdesc));
		dsdesc->params = estrdup(val);
		free(opt);
		STAILQ_INSERT_TAIL(&zfs->datasetdescs, dsdesc, next);
		return (1);
	}
	free(opt);

	rv = set_option(fsopts->fs_options, option, buf, sizeof(buf));
	return (rv == -1 ? 0 : 1);
}

static void
zfs_size_vdev(fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	off_t asize, mssize, vdevsize, vdevsize1;

	zfs = fsopts->fs_specific;

	assert(fsopts->maxsize != 0);
	assert(zfs->ashift != 0);

	/*
	 * Figure out how big the vdev should be.
	 */
	vdevsize = rounddown2(fsopts->maxsize, 1 << zfs->ashift);
	if (vdevsize < MINDEVSIZE)
		errx(1, "maximum image size is too small");
	if (vdevsize < fsopts->minsize || vdevsize > fsopts->maxsize) {
		errx(1, "image size bounds must be multiples of %d",
		    1 << zfs->ashift);
	}
	asize = vdevsize - VDEV_LABEL_SPACE;

	/*
	 * Size metaslabs according to the following heuristic:
	 * - provide at least 8 metaslabs,
	 * - without using a metaslab size larger than 512MB.
	 * This approximates what OpenZFS does without being complicated.  In
	 * practice we expect pools to be expanded upon first use, and OpenZFS
	 * does not resize metaslabs in that case, so there is no right answer
	 * here.  In general we want to provide large metaslabs even if the
	 * image size is small, and 512MB is a reasonable size for pools up to
	 * several hundred gigabytes.
	 *
	 * The user may override this heuristic using the "-o mssize" option.
	 */
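	/*
	 * Illustrative numbers: a 100GB image hits the 512MB cap and ends up
	 * with roughly 200 metaslabs, while a 1GB image gets 64MB metaslabs
	 * (1GB / 8, rounded down to a power of 2), somewhat more than the
	 * target of 8.
	 */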
	mssize = zfs->mssize;
	if (mssize == 0) {
		mssize = MAX(MIN(asize / 8, DFLTMSSIZE), MINMSSIZE);
		if (!powerof2(mssize))
			mssize = 1l << (flsll(mssize) - 1);
	}
	if (!powerof2(mssize))
		errx(1, "metaslab size must be a power of 2");

	/*
	 * If we have some slop left over, try to cover it by resizing the
	 * vdev, subject to the maxsize and minsize parameters.
	 */
	if (asize % mssize != 0) {
		vdevsize1 = rounddown2(asize, mssize) + VDEV_LABEL_SPACE;
		if (vdevsize1 < fsopts->minsize)
			vdevsize1 = roundup2(asize, mssize) + VDEV_LABEL_SPACE;
		if (vdevsize1 <= fsopts->maxsize)
			vdevsize = vdevsize1;
	}
	asize = vdevsize - VDEV_LABEL_SPACE;

	zfs->asize = asize;
	zfs->vdevsize = vdevsize;
	zfs->mssize = mssize;
	zfs->msshift = flsll(mssize) - 1;
	zfs->mscount = asize / mssize;
}

/*
 * Validate options and set some default values.
 */
static void
zfs_check_opts(fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;

	zfs = fsopts->fs_specific;

	if (fsopts->offset != 0)
		errx(1, "unhandled offset option");
	if (fsopts->maxsize == 0)
		errx(1, "an image size must be specified");

	if (zfs->poolname == NULL)
		errx(1, "a pool name must be specified");
	if (!isalpha(zfs->poolname[0]))
		errx(1, "the pool name must begin with a letter");
	for (size_t i = 0, len = strlen(zfs->poolname); i < len; i++) {
		if (!isalnum(zfs->poolname[i]) && zfs->poolname[i] != '_')
			errx(1, "invalid character '%c' in pool name",
			    zfs->poolname[i]);
	}
	if (strcmp(zfs->poolname, "mirror") == 0 ||
	    strcmp(zfs->poolname, "raidz") == 0 ||
	    strcmp(zfs->poolname, "draid") == 0) {
		errx(1, "pool name '%s' is reserved and cannot be used",
		    zfs->poolname);
	}

	if (zfs->rootpath == NULL)
		easprintf(&zfs->rootpath, "/%s", zfs->poolname);
	if (zfs->rootpath[0] != '/')
		errx(1, "mountpoint `%s' must be absolute", zfs->rootpath);

	if (zfs->ashift == 0)
		zfs->ashift = 12;

	zfs_size_vdev(fsopts);
}

void
zfs_cleanup_opts(fsinfo_t *fsopts)
{
	struct dataset_desc *d, *tmp;
	zfs_opt_t *zfs;

	zfs = fsopts->fs_specific;
	free(zfs->rootpath);
	free(zfs->bootfs);
	free(__DECONST(void *, zfs->poolname));
	STAILQ_FOREACH_SAFE(d, &zfs->datasetdescs, next, tmp) {
		free(d->params);
		free(d);
	}
	free(zfs);
	free(fsopts->fs_options);
}

static size_t
nvlist_size(const nvlist_t *nvl)
{
	return (sizeof(nvl->nv_header) + nvl->nv_size);
}

static void
nvlist_copy(const nvlist_t *nvl, char *buf, size_t sz)
{
	assert(sz >= nvlist_size(nvl));

	memcpy(buf, &nvl->nv_header, sizeof(nvl->nv_header));
	memcpy(buf + sizeof(nvl->nv_header), nvl->nv_data, nvl->nv_size);
}

static nvlist_t *
pool_config_nvcreate(zfs_opt_t *zfs)
{
	nvlist_t *featuresnv, *poolnv;

	poolnv = nvlist_create(NV_UNIQUE_NAME);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_TXG, TXG);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VERSION, SPA_VERSION);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_EXPORTED);
	nvlist_add_string(poolnv, ZPOOL_CONFIG_POOL_NAME, zfs->poolname);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_GUID, zfs->poolguid);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_TOP_GUID, zfs->vdevguid);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VDEV_CHILDREN, 1);
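
	/*
	 * The features-for-read nvlist is left empty: no feature flags are
	 * required to read this pool.
	 */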
	featuresnv = nvlist_create(NV_UNIQUE_NAME);
	nvlist_add_nvlist(poolnv, ZPOOL_CONFIG_FEATURES_FOR_READ, featuresnv);
	nvlist_destroy(featuresnv);

	return (poolnv);
}

static nvlist_t *
pool_disk_vdev_config_nvcreate(zfs_opt_t *zfs)
{
	nvlist_t *diskvdevnv;

	assert(zfs->objarrid != 0);

	diskvdevnv = nvlist_create(NV_UNIQUE_NAME);
	nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASHIFT, zfs->ashift);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASIZE, zfs->asize);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ID, 0);
	nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_PATH, "/dev/null");
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_WHOLE_DISK, 1);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_ARRAY,
	    zfs->objarrid);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_SHIFT,
	    zfs->msshift);

	return (diskvdevnv);
}
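
/*
 * Wrap the disk vdev config in a "root" vdev nvlist.  The result looks
 * roughly like:
 *
 *	vdev_tree: type "root", id 0, guid <pool GUID>
 *	    children[0]: type "disk", id 0, guid <vdev GUID>,
 *		ashift, asize, metaslab_array, metaslab_shift, path, ...
 */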
static nvlist_t *
pool_root_vdev_config_nvcreate(zfs_opt_t *zfs)
{
	nvlist_t *diskvdevnv, *rootvdevnv;

	diskvdevnv = pool_disk_vdev_config_nvcreate(zfs);
	rootvdevnv = nvlist_create(NV_UNIQUE_NAME);

	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_ID, 0);
	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_GUID, zfs->poolguid);
	nvlist_add_string(rootvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
	nvlist_add_nvlist_array(rootvdevnv, ZPOOL_CONFIG_CHILDREN, &diskvdevnv,
	    1);
	nvlist_destroy(diskvdevnv);

	return (rootvdevnv);
}

/*
 * Create the pool's "config" object, which contains an nvlist describing pool
 * parameters and the vdev topology.  It is similar but not identical to the
 * nvlist stored in vdev labels.  The main difference is that vdev labels do
 * not describe the full vdev tree and in particular do not contain the "root"
 * meta-vdev.
 */
static void
pool_init_objdir_config(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	dnode_phys_t *dnode;
	nvlist_t *poolconfig, *vdevconfig;
	void *configbuf;
	uint64_t dnid;
	off_t configloc, configblksz;
	int error;

	dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_PACKED_NVLIST,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof(uint64_t), &dnid);

	poolconfig = pool_config_nvcreate(zfs);

	vdevconfig = pool_root_vdev_config_nvcreate(zfs);
	nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
	nvlist_destroy(vdevconfig);

	error = nvlist_export(poolconfig);
	if (error != 0)
		errc(1, error, "nvlist_export");

	configblksz = nvlist_size(poolconfig);
	configloc = objset_space_alloc(zfs, zfs->mos, &configblksz);
	configbuf = ecalloc(1, configblksz);
	nvlist_copy(poolconfig, configbuf, configblksz);

	vdev_pwrite_dnode_data(zfs, dnode, configbuf, configblksz, configloc);

	dnode->dn_datablkszsec = configblksz >> MINBLOCKSHIFT;
	dnode->dn_flags = DNODE_FLAG_USED_BYTES;
	*(uint64_t *)DN_BONUS(dnode) = nvlist_size(poolconfig);

	zap_add_uint64(objdir, DMU_POOL_CONFIG, dnid);

	nvlist_destroy(poolconfig);
	free(configbuf);
}

/*
 * Add block pointer list objects, used for deferred frees.  We don't do
 * anything with them, but they need to be present or OpenZFS will refuse to
 * import the pool.
 */
static void
pool_init_objdir_bplists(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	uint64_t dnid;

	(void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
	    BPOBJ_SIZE_V2, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FREE_BPOBJ, dnid);

	(void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
	    BPOBJ_SIZE_V2, &dnid);
	zap_add_uint64(objdir, DMU_POOL_SYNC_BPLIST, dnid);
}

/*
 * Add required feature metadata objects.  We don't know anything about ZFS
 * features, so the objects are just empty ZAPs.
 */
static void
pool_init_objdir_feature_maps(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	dnode_phys_t *dnode;
	uint64_t dnid;

	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_READ, dnid);
	zap_write(zfs, zap_alloc(zfs->mos, dnode));

	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_WRITE, dnid);
	zap_write(zfs, zap_alloc(zfs->mos, dnode));

	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FEATURE_DESCRIPTIONS, dnid);
	zap_write(zfs, zap_alloc(zfs->mos, dnode));
}

static void
pool_init_objdir_dsl(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	zap_add_uint64(objdir, DMU_POOL_ROOT_DATASET,
	    dsl_dir_id(zfs->rootdsldir));
}

static void
pool_init_objdir_poolprops(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	dnode_phys_t *dnode;
	uint64_t id;

	dnode = objset_dnode_alloc(zfs->mos, DMU_OT_POOL_PROPS, &id);
	zap_add_uint64(objdir, DMU_POOL_PROPS, id);

	zfs->poolprops = zap_alloc(zfs->mos, dnode);
}

/*
 * Initialize the MOS object directory, the root of virtually all of the
 * pool's data and metadata.
 */
static void
pool_init_objdir(zfs_opt_t *zfs)
{
	zfs_zap_t *zap;
	dnode_phys_t *objdir;

	objdir = objset_dnode_lookup(zfs->mos, DMU_POOL_DIRECTORY_OBJECT);

	zap = zap_alloc(zfs->mos, objdir);
	pool_init_objdir_config(zfs, zap);
	pool_init_objdir_bplists(zfs, zap);
	pool_init_objdir_feature_maps(zfs, zap);
	pool_init_objdir_dsl(zfs, zap);
	pool_init_objdir_poolprops(zfs, zap);
	zap_write(zfs, zap);
}

/*
 * Initialize the meta-object set (MOS) and immediately write out several
 * special objects whose contents are already finalized, including the object
 * directory.
 *
 * Once the MOS is finalized, it'll look roughly like this:
 *
 *	object directory (ZAP)
 *	|-> vdev config object (nvlist)
 *	|-> features for read
 *	|-> features for write
 *	|-> feature descriptions
 *	|-> sync bplist
 *	|-> free bplist
 *	|-> pool properties
 *	L-> root DSL directory
 *	    |-> DSL child directory (ZAP)
 *	    |   |-> $MOS (DSL dir)
 *	    |   |   |-> child map
 *	    |   |   L-> props (ZAP)
 *	    |   |-> $FREE (DSL dir)
 *	    |   |   |-> child map
 *	    |   |   L-> props (ZAP)
 *	    |   |-> $ORIGIN (DSL dir)
 *	    |   |   |-> child map
 *	    |   |   |-> dataset
 *	    |   |   |   L-> deadlist
 *	    |   |   |-> snapshot
 *	    |   |   |   |-> deadlist
 *	    |   |   |   L-> snapshot names
 *	    |   |   |-> props (ZAP)
 *	    |   |   L-> clones (ZAP)
 *	    |   |-> dataset 1 (DSL dir)
 *	    |   |   |-> DSL dataset
 *	    |   |   |   |-> snapshot names
 *	    |   |   |   L-> deadlist
 *	    |   |   |-> child map
 *	    |   |   |   L-> ...
 *	    |   |   L-> props
 *	    |   |-> dataset 2
 *	    |   |   L-> ...
 *	    |   |-> ...
 *	    |   L-> dataset n
 *	    |-> DSL root dataset
 *	    |   |-> snapshot names
 *	    |   L-> deadlist
 *	    L-> props (ZAP)
 *	space map object array
 *	|-> space map 1
 *	|-> space map 2
 *	|-> ...
 *	L-> space map n (zfs->mscount)
 *
 * The space map object array is pointed to by the "metaslab_array"
 * (ZPOOL_CONFIG_METASLAB_ARRAY) property in the pool configuration.
 */
static void
pool_init(zfs_opt_t *zfs)
{
	uint64_t dnid;

	zfs->poolguid = ((uint64_t)random() << 32) | random();
	zfs->vdevguid = ((uint64_t)random() << 32) | random();

	zfs->mos = objset_alloc(zfs, DMU_OST_META);

	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_DIRECTORY, &dnid);
	assert(dnid == DMU_POOL_DIRECTORY_OBJECT);

	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_ARRAY, &zfs->objarrid);

	dsl_init(zfs);

	pool_init_objdir(zfs);
}
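
/*
 * Write the vdev labels.  Each label carries a copy of the vdev config nvlist
 * and a ring of identical uberblocks whose root block pointer references the
 * MOS.
 */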
static void
pool_labels_write(zfs_opt_t *zfs)
{
	uberblock_t *ub;
	vdev_label_t *label;
	nvlist_t *poolconfig, *vdevconfig;
	int error;

	label = ecalloc(1, sizeof(*label));

	/*
	 * Assemble the vdev configuration and store it in the label.
	 */
	poolconfig = pool_config_nvcreate(zfs);
	vdevconfig = pool_disk_vdev_config_nvcreate(zfs);
	nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
	nvlist_destroy(vdevconfig);

	error = nvlist_export(poolconfig);
	if (error != 0)
		errc(1, error, "nvlist_export");
	nvlist_copy(poolconfig, label->vl_vdev_phys.vp_nvlist,
	    sizeof(label->vl_vdev_phys.vp_nvlist));
	nvlist_destroy(poolconfig);

	/*
	 * Fill out the uberblock ring, making each copy identical.  The
	 * embedded checksum is calculated in vdev_label_write().
	 */
	for (size_t uoff = 0; uoff < sizeof(label->vl_uberblock);
	    uoff += (1 << zfs->ashift)) {
		ub = (uberblock_t *)(&label->vl_uberblock[0] + uoff);
		ub->ub_magic = UBERBLOCK_MAGIC;
		ub->ub_version = SPA_VERSION;
		ub->ub_txg = TXG;
		ub->ub_guid_sum = zfs->poolguid + zfs->vdevguid;
		ub->ub_timestamp = 0;

		ub->ub_software_version = SPA_VERSION;
		ub->ub_mmp_magic = MMP_MAGIC;
		ub->ub_mmp_delay = 0;
		ub->ub_mmp_config = 0;
		ub->ub_checkpoint_txg = 0;
		objset_root_blkptr_copy(zfs->mos, &ub->ub_rootbp);
	}

	/*
	 * Write out four copies of the label: two at the beginning of the
	 * vdev and two at the end.
	 */
	for (int i = 0; i < VDEV_LABELS; i++)
		vdev_label_write(zfs, i, label);

	free(label);
}

static void
pool_fini(zfs_opt_t *zfs)
{
	zap_write(zfs, zfs->poolprops);
	dsl_write(zfs);
	objset_write(zfs, zfs->mos);
	pool_labels_write(zfs);
}
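
/*
 * Set up a cursor for writing out a dnode's data blocks: "size" bytes in
 * blocks of "blksz" bytes (0 means pick a default).  The caller fetches a
 * block pointer for each data block with dnode_cursor_next(), writes the
 * block, and finally calls dnode_cursor_finish(); the cursor takes care of
 * building and writing the indirect block tree.
 */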
struct dnode_cursor *
dnode_cursor_init(zfs_opt_t *zfs, zfs_objset_t *os, dnode_phys_t *dnode,
    off_t size, off_t blksz)
{
	struct dnode_cursor *c;
	uint64_t nbppindir, indlevel, ndatablks, nindblks;

	assert(dnode->dn_nblkptr == 1);
	assert(blksz <= MAXBLOCKSIZE);

	if (blksz == 0) {
		/* Must be between 1<<ashift and 128KB. */
		blksz = MIN(MAXBLOCKSIZE, MAX(1 << zfs->ashift,
		    powerof2(size) ? size : (1l << flsll(size))));
	}
	assert(powerof2(blksz));

	/*
	 * Do we need indirect blocks?  Figure out how many levels are needed
	 * (indlevel == 1 means no indirect blocks) and how much space is
	 * needed (it has to be allocated up-front to break the dependency
	 * cycle described in objset_write()).
	 */
	ndatablks = size == 0 ? 0 : howmany(size, blksz);
	nindblks = 0;
	for (indlevel = 1, nbppindir = 1; ndatablks > nbppindir; indlevel++) {
		nbppindir *= BLKPTR_PER_INDIR;
		nindblks += howmany(ndatablks, indlevel * nbppindir);
	}
	assert(indlevel < INDIR_LEVELS);

	dnode->dn_nlevels = (uint8_t)indlevel;
	dnode->dn_maxblkid = ndatablks > 0 ? ndatablks - 1 : 0;
	dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;

	c = ecalloc(1, sizeof(*c));
	if (nindblks > 0) {
		c->indspace = nindblks * MAXBLOCKSIZE;
		c->indloc = objset_space_alloc(zfs, os, &c->indspace);
	}
	c->dnode = dnode;
	c->dataoff = 0;
	c->datablksz = blksz;

	return (c);
}

static void
_dnode_cursor_flush(zfs_opt_t *zfs, struct dnode_cursor *c, int levels)
{
	blkptr_t *bp, *pbp;
	void *buf;
	uint64_t fill;
	off_t blkid, blksz, loc;

	assert(levels > 0);
	assert(levels <= c->dnode->dn_nlevels - 1);

	blksz = MAXBLOCKSIZE;
	blkid = (c->dataoff / c->datablksz) / BLKPTR_PER_INDIR;
	for (int level = 1; level <= levels; level++) {
		buf = c->inddir[level - 1];

		if (level == c->dnode->dn_nlevels - 1) {
			pbp = &c->dnode->dn_blkptr[0];
		} else {
			uint64_t iblkid;

			iblkid = blkid & (BLKPTR_PER_INDIR - 1);
			pbp = (blkptr_t *)
			    &c->inddir[level][iblkid * sizeof(blkptr_t)];
		}

		/*
		 * Space for indirect blocks is allocated up-front; see the
		 * comment in objset_write().
		 */
		loc = c->indloc;
		c->indloc += blksz;
		assert(c->indspace >= blksz);
		c->indspace -= blksz;

		bp = buf;
		fill = 0;
		for (size_t i = 0; i < BLKPTR_PER_INDIR; i++)
			fill += BP_GET_FILL(&bp[i]);

		vdev_pwrite_dnode_indir(zfs, c->dnode, level, fill, buf, blksz,
		    loc, pbp);
		memset(buf, 0, MAXBLOCKSIZE);

		blkid /= BLKPTR_PER_INDIR;
	}
}

blkptr_t *
dnode_cursor_next(zfs_opt_t *zfs, struct dnode_cursor *c, off_t off)
{
	off_t blkid, l1id;
	int levels;

	if (c->dnode->dn_nlevels == 1) {
		assert(off < MAXBLOCKSIZE);
		return (&c->dnode->dn_blkptr[0]);
	}

	assert(off % c->datablksz == 0);

	/* Do we need to flush any full indirect blocks? */
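	/*
	 * The block about to be written completes a level-n indirect block
	 * whenever its index is a multiple of BLKPTR_PER_INDIR^n, so count
	 * how many of the lowest levels just filled and flush them from the
	 * bottom up.
	 */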
	if (off > 0) {
		blkid = off / c->datablksz;
		for (levels = 0; levels < c->dnode->dn_nlevels - 1; levels++) {
			if (blkid % BLKPTR_PER_INDIR != 0)
				break;
			blkid /= BLKPTR_PER_INDIR;
		}
		if (levels > 0)
			_dnode_cursor_flush(zfs, c, levels);
	}

	c->dataoff = off;
	l1id = (off / c->datablksz) & (BLKPTR_PER_INDIR - 1);
	return ((blkptr_t *)&c->inddir[0][l1id * sizeof(blkptr_t)]);
}

void
dnode_cursor_finish(zfs_opt_t *zfs, struct dnode_cursor *c)
{
	int levels;

	levels = c->dnode->dn_nlevels - 1;
	if (levels > 0)
		_dnode_cursor_flush(zfs, c, levels);
	assert(c->indspace == 0);
	free(c);
}

void
zfs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	int dirfd;

	zfs = fsopts->fs_specific;

	/*
	 * Use a fixed seed to provide reproducible pseudo-random numbers for
	 * on-disk structures when needed (e.g., GUIDs, ZAP hash salts).
	 */
	srandom(1729);

	zfs_check_opts(fsopts);

	if (!zfs->nowarn) {
		fprintf(stderr,
		    "ZFS support is currently considered experimental. "
		    "Do not use it for anything critical.\n");
	}

	dirfd = open(dir, O_DIRECTORY | O_RDONLY);
	if (dirfd < 0)
		err(1, "open(%s)", dir);

	vdev_init(zfs, image);
	pool_init(zfs);
	fs_build(zfs, dirfd, root);
	pool_fini(zfs);
	vdev_fini(zfs);
}