/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2022 The FreeBSD Foundation
 *
 * This software was developed by Mark Johnston under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/queue.h>

#include <assert.h>
#include <fcntl.h>
#include <stdalign.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <util.h>

#include "makefs.h"
#include "zfs.h"

#define	VDEV_LABEL_SPACE	\
	((off_t)(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE))
_Static_assert(VDEV_LABEL_SPACE <= MINDEVSIZE, "");

#define	MINMSSIZE	((off_t)1 << 24)	/* 16MB */
#define	DFLTMSSIZE	((off_t)1 << 29)	/* 512MB */
#define	MAXMSSIZE	((off_t)1 << 34)	/* 16GB */

#define	INDIR_LEVELS	6
/* Indirect blocks are always 128KB. */
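/* (A blkptr_t is 128 bytes, so each indirect block holds 1024 blkptrs.) */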
#define	BLKPTR_PER_INDIR	(MAXBLOCKSIZE / sizeof(blkptr_t))

struct dnode_cursor {
	char		inddir[INDIR_LEVELS][MAXBLOCKSIZE];
	off_t		indloc;
	off_t		indspace;
	dnode_phys_t	*dnode;
	off_t		dataoff;
	off_t		datablksz;
};

void
zfs_prep_opts(fsinfo_t *fsopts)
{
	size_t align;

	align = alignof(uint64_t);
	zfs_opt_t *zfs = aligned_alloc(align, roundup2(sizeof(*zfs), align));
	if (zfs == NULL)
		err(1, "aligned_alloc");
	memset(zfs, 0, sizeof(*zfs));

	const option_t zfs_options[] = {
		{ '\0', "bootfs", &zfs->bootfs, OPT_STRPTR,
		  0, 0, "Bootable dataset" },
		{ '\0', "mssize", &zfs->mssize, OPT_INT64,
		  MINMSSIZE, MAXMSSIZE, "Metaslab size" },
		{ '\0', "poolname", &zfs->poolname, OPT_STRPTR,
		  0, 0, "ZFS pool name" },
		{ '\0', "rootpath", &zfs->rootpath, OPT_STRPTR,
		  0, 0, "Prefix for all dataset mount points" },
		{ '\0', "ashift", &zfs->ashift, OPT_INT32,
		  MINBLOCKSHIFT, MAXBLOCKSHIFT, "ZFS pool ashift" },
		{ '\0', "nowarn", &zfs->nowarn, OPT_BOOL,
		  0, 0, "Suppress warning about experimental ZFS support" },
		{ .name = NULL }
	};

	STAILQ_INIT(&zfs->datasetdescs);

	fsopts->fs_specific = zfs;
	fsopts->fs_options = copy_opts(zfs_options);
}

int
zfs_parse_opts(const char *option, fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	struct dataset_desc *dsdesc;
	char buf[BUFSIZ], *opt, *val;
	int rv;

	zfs = fsopts->fs_specific;

	opt = val = estrdup(option);
	opt = strsep(&val, "=");
	if (strcmp(opt, "fs") == 0) {
		if (val == NULL)
			errx(1, "invalid filesystem parameters `%s'", option);

		/*
		 * Dataset descriptions will be parsed later, in dsl_init().
		 * Just stash them away for now.
		 */
		dsdesc = ecalloc(1, sizeof(*dsdesc));
		dsdesc->params = estrdup(val);
		free(opt);
		STAILQ_INSERT_TAIL(&zfs->datasetdescs, dsdesc, next);
		return (1);
	}
	free(opt);

	rv = set_option(fsopts->fs_options, option, buf, sizeof(buf));
	return (rv == -1 ? 0 : 1);
}

static void
zfs_size_vdev(fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	off_t asize, mssize, vdevsize, vdevsize1;

	zfs = fsopts->fs_specific;

	assert(fsopts->maxsize != 0);
	assert(zfs->ashift != 0);

	/*
	 * Figure out how big the vdev should be.
	 */
	vdevsize = rounddown2(fsopts->maxsize, 1 << zfs->ashift);
	if (vdevsize < MINDEVSIZE)
		errx(1, "maximum image size is too small");
	if (vdevsize < fsopts->minsize || vdevsize > fsopts->maxsize) {
		errx(1, "image size bounds must be multiples of %d",
		    1 << zfs->ashift);
	}
	asize = vdevsize - VDEV_LABEL_SPACE;

	/*
	 * Size metaslabs according to the following heuristic:
	 * - provide at least 8 metaslabs,
	 * - without using a metaslab size larger than 512MB.
	 * This approximates what OpenZFS does without being complicated.  In
	 * practice we expect pools to be expanded upon first use, and OpenZFS
	 * does not resize metaslabs in that case, so there is no right answer
	 * here.  In general we want to provide large metaslabs even if the
	 * image size is small, and 512MB is a reasonable size for pools up to
	 * several hundred gigabytes.
	 *
	 * The user may override this heuristic using the "-o mssize" option.
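	 *
	 * Worked example (illustrative): a 128MB image yields asize/8 of
	 * roughly 15MB, which is clamped up to MINMSSIZE (16MB); a 1GB image
	 * yields roughly 127MB, which is rounded down to the 64MB power of 2;
	 * images larger than about 4GB hit the 512MB cap.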
	 */
	mssize = zfs->mssize;
	if (mssize == 0) {
		mssize = MAX(MIN(asize / 8, DFLTMSSIZE), MINMSSIZE);
		if (!powerof2(mssize))
			mssize = 1l << (flsll(mssize) - 1);
	}
	if (!powerof2(mssize))
		errx(1, "metaslab size must be a power of 2");

	/*
	 * If we have some slop left over, try to cover it by resizing the
	 * vdev, subject to the maxsize and minsize parameters.
	 */
	if (asize % mssize != 0) {
		vdevsize1 = rounddown2(asize, mssize) + VDEV_LABEL_SPACE;
		if (vdevsize1 < fsopts->minsize)
			vdevsize1 = roundup2(asize, mssize) + VDEV_LABEL_SPACE;
		if (vdevsize1 <= fsopts->maxsize)
			vdevsize = vdevsize1;
	}
	asize = vdevsize - VDEV_LABEL_SPACE;

	zfs->asize = asize;
	zfs->vdevsize = vdevsize;
	zfs->mssize = mssize;
	zfs->msshift = flsll(mssize) - 1;
	zfs->mscount = asize / mssize;
}

/*
 * Validate options and set some default values.
 */
static void
zfs_check_opts(fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;

	zfs = fsopts->fs_specific;

	if (fsopts->offset != 0)
		errx(1, "unhandled offset option");
	if (fsopts->maxsize == 0)
		errx(1, "an image size must be specified");

	if (zfs->poolname == NULL)
		errx(1, "a pool name must be specified");

	if (zfs->rootpath == NULL)
		easprintf(&zfs->rootpath, "/%s", zfs->poolname);
	if (zfs->rootpath[0] != '/')
		errx(1, "mountpoint `%s' must be absolute", zfs->rootpath);

	if (zfs->ashift == 0)
		zfs->ashift = 12;

	zfs_size_vdev(fsopts);
}

void
zfs_cleanup_opts(fsinfo_t *fsopts)
{
	struct dataset_desc *d, *tmp;
	zfs_opt_t *zfs;

	zfs = fsopts->fs_specific;
	free(zfs->rootpath);
	free(zfs->bootfs);
	free(__DECONST(void *, zfs->poolname));
	STAILQ_FOREACH_SAFE(d, &zfs->datasetdescs, next, tmp) {
		free(d->params);
		free(d);
	}
	free(zfs);
	free(fsopts->fs_options);
}

static size_t
nvlist_size(const nvlist_t *nvl)
{
	return (sizeof(nvl->nv_header) + nvl->nv_size);
}

static void
nvlist_copy(const nvlist_t *nvl, char *buf, size_t sz)
{
	assert(sz >= nvlist_size(nvl));

	memcpy(buf, &nvl->nv_header, sizeof(nvl->nv_header));
	memcpy(buf + sizeof(nvl->nv_header), nvl->nv_data, nvl->nv_size);
}

static nvlist_t *
pool_config_nvcreate(zfs_opt_t *zfs)
{
	nvlist_t *featuresnv, *poolnv;

	poolnv = nvlist_create(NV_UNIQUE_NAME);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_TXG, TXG);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VERSION, SPA_VERSION);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_EXPORTED);
	nvlist_add_string(poolnv, ZPOOL_CONFIG_POOL_NAME, zfs->poolname);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_GUID, zfs->poolguid);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_TOP_GUID, zfs->vdevguid);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VDEV_CHILDREN, 1);

	featuresnv = nvlist_create(NV_UNIQUE_NAME);
	nvlist_add_nvlist(poolnv, ZPOOL_CONFIG_FEATURES_FOR_READ, featuresnv);
	nvlist_destroy(featuresnv);

	return (poolnv);
}

static nvlist_t *
pool_disk_vdev_config_nvcreate(zfs_opt_t *zfs)
{
	nvlist_t *diskvdevnv;

	assert(zfs->objarrid != 0);

	diskvdevnv = nvlist_create(NV_UNIQUE_NAME);
	nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASHIFT,
	    zfs->ashift);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASIZE, zfs->asize);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ID, 0);
	nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_PATH, "/dev/null");
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_WHOLE_DISK, 1);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_ARRAY,
	    zfs->objarrid);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_SHIFT,
	    zfs->msshift);

	return (diskvdevnv);
}

static nvlist_t *
pool_root_vdev_config_nvcreate(zfs_opt_t *zfs)
{
	nvlist_t *diskvdevnv, *rootvdevnv;

	diskvdevnv = pool_disk_vdev_config_nvcreate(zfs);
	rootvdevnv = nvlist_create(NV_UNIQUE_NAME);

	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_ID, 0);
	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_GUID, zfs->poolguid);
	nvlist_add_string(rootvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
	nvlist_add_nvlist_array(rootvdevnv, ZPOOL_CONFIG_CHILDREN, &diskvdevnv,
	    1);
	nvlist_destroy(diskvdevnv);

	return (rootvdevnv);
}

/*
 * Create the pool's "config" object, which contains an nvlist describing pool
 * parameters and the vdev topology.  It is similar but not identical to the
 * nvlist stored in vdev labels.  The main difference is that vdev labels do
 * not describe the full vdev tree and in particular do not contain the "root"
 * meta-vdev.
 */
static void
pool_init_objdir_config(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	dnode_phys_t *dnode;
	nvlist_t *poolconfig, *vdevconfig;
	void *configbuf;
	uint64_t dnid;
	off_t configloc, configblksz;
	int error;

	dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_PACKED_NVLIST,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof(uint64_t), &dnid);

	poolconfig = pool_config_nvcreate(zfs);

	vdevconfig = pool_root_vdev_config_nvcreate(zfs);
	nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
	nvlist_destroy(vdevconfig);

	error = nvlist_export(poolconfig);
	if (error != 0)
		errc(1, error, "nvlist_export");

	configblksz = nvlist_size(poolconfig);
	configloc = objset_space_alloc(zfs, zfs->mos, &configblksz);
	configbuf = ecalloc(1, configblksz);
	nvlist_copy(poolconfig, configbuf, configblksz);

	vdev_pwrite_dnode_data(zfs, dnode, configbuf, configblksz, configloc);

	dnode->dn_datablkszsec = configblksz >> MINBLOCKSHIFT;
	dnode->dn_flags = DNODE_FLAG_USED_BYTES;
	*(uint64_t *)DN_BONUS(dnode) = nvlist_size(poolconfig);

	zap_add_uint64(objdir, DMU_POOL_CONFIG, dnid);

	nvlist_destroy(poolconfig);
	free(configbuf);
}

/*
 * Add block pointer list objects, used for deferred frees.  We don't do
 * anything with them, but they need to be present or OpenZFS will refuse to
 * import the pool.
 */
static void
pool_init_objdir_bplists(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	uint64_t dnid;

	(void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
	    BPOBJ_SIZE_V2, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FREE_BPOBJ, dnid);

	(void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
	    BPOBJ_SIZE_V2, &dnid);
	zap_add_uint64(objdir, DMU_POOL_SYNC_BPLIST, dnid);
}

/*
 * Add required feature metadata objects.
 * We don't know anything about ZFS features, so the objects are just empty
 * ZAPs.
 */
static void
pool_init_objdir_feature_maps(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	dnode_phys_t *dnode;
	uint64_t dnid;

	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_READ, dnid);
	zap_write(zfs, zap_alloc(zfs->mos, dnode));

	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_WRITE, dnid);
	zap_write(zfs, zap_alloc(zfs->mos, dnode));

	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FEATURE_DESCRIPTIONS, dnid);
	zap_write(zfs, zap_alloc(zfs->mos, dnode));
}

static void
pool_init_objdir_dsl(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	zap_add_uint64(objdir, DMU_POOL_ROOT_DATASET,
	    dsl_dir_id(zfs->rootdsldir));
}

static void
pool_init_objdir_poolprops(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	dnode_phys_t *dnode;
	uint64_t id;

	dnode = objset_dnode_alloc(zfs->mos, DMU_OT_POOL_PROPS, &id);
	zap_add_uint64(objdir, DMU_POOL_PROPS, id);

	zfs->poolprops = zap_alloc(zfs->mos, dnode);
}

/*
 * Initialize the MOS object directory, the root of virtually all of the pool's
 * data and metadata.
 */
static void
pool_init_objdir(zfs_opt_t *zfs)
{
	zfs_zap_t *zap;
	dnode_phys_t *objdir;

	objdir = objset_dnode_lookup(zfs->mos, DMU_POOL_DIRECTORY_OBJECT);

	zap = zap_alloc(zfs->mos, objdir);
	pool_init_objdir_config(zfs, zap);
	pool_init_objdir_bplists(zfs, zap);
	pool_init_objdir_feature_maps(zfs, zap);
	pool_init_objdir_dsl(zfs, zap);
	pool_init_objdir_poolprops(zfs, zap);
	zap_write(zfs, zap);
}

/*
 * Initialize the meta-object set (MOS) and immediately write out several
 * special objects whose contents are already finalized, including the object
 * directory.
 *
 * Once the MOS is finalized, it'll look roughly like this:
 *
 *	object directory (ZAP)
 *	|-> vdev config object (nvlist)
 *	|-> features for read
 *	|-> features for write
 *	|-> feature descriptions
 *	|-> sync bplist
 *	|-> free bplist
 *	|-> pool properties
 *	L-> root DSL directory
 *	    |-> DSL child directory (ZAP)
 *	    |   |-> $MOS (DSL dir)
 *	    |   |   |-> child map
 *	    |   |   L-> props (ZAP)
 *	    |   |-> $FREE (DSL dir)
 *	    |   |   |-> child map
 *	    |   |   L-> props (ZAP)
 *	    |   |-> $ORIGIN (DSL dir)
 *	    |   |   |-> child map
 *	    |   |   |-> dataset
 *	    |   |   |   L-> deadlist
 *	    |   |   |-> snapshot
 *	    |   |   |   |-> deadlist
 *	    |   |   |   L-> snapshot names
 *	    |   |   |-> props (ZAP)
 *	    |   |   L-> clones (ZAP)
 *	    |   |-> dataset 1 (DSL dir)
 *	    |   |   |-> DSL dataset
 *	    |   |   |   |-> snapshot names
 *	    |   |   |   L-> deadlist
 *	    |   |   |-> child map
 *	    |   |   |   L-> ...
 *	    |   |   L-> props
 *	    |   |-> dataset 2
 *	    |   |   L-> ...
 *	    |   |-> ...
 *	    |   L-> dataset n
 *	    |-> DSL root dataset
 *	    |   |-> snapshot names
 *	    |   L-> deadlist
 *	    L-> props (ZAP)
 *	space map object array
 *	|-> space map 1
 *	|-> space map 2
 *	|-> ...
 *	L-> space map n (zfs->mscount)
 *
 * The space map object array is pointed to by the "msarray" property in the
 * pool configuration.
 */
static void
pool_init(zfs_opt_t *zfs)
{
	uint64_t dnid;

	zfs->poolguid = ((uint64_t)random() << 32) | random();
	zfs->vdevguid = ((uint64_t)random() << 32) | random();

	zfs->mos = objset_alloc(zfs, DMU_OST_META);

	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_DIRECTORY, &dnid);
	assert(dnid == DMU_POOL_DIRECTORY_OBJECT);

	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_ARRAY, &zfs->objarrid);

	dsl_init(zfs);

	pool_init_objdir(zfs);
}

static void
pool_labels_write(zfs_opt_t *zfs)
{
	uberblock_t *ub;
	vdev_label_t *label;
	nvlist_t *poolconfig, *vdevconfig;
	int error;

	label = ecalloc(1, sizeof(*label));

	/*
	 * Assemble the vdev configuration and store it in the label.
	 */
	poolconfig = pool_config_nvcreate(zfs);
	vdevconfig = pool_disk_vdev_config_nvcreate(zfs);
	nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
	nvlist_destroy(vdevconfig);

	error = nvlist_export(poolconfig);
	if (error != 0)
		errc(1, error, "nvlist_export");
	nvlist_copy(poolconfig, label->vl_vdev_phys.vp_nvlist,
	    sizeof(label->vl_vdev_phys.vp_nvlist));
	nvlist_destroy(poolconfig);

	/*
	 * Fill out the uberblock.  Just make each one the same.  The embedded
	 * checksum is calculated in vdev_label_write().
	 */
	for (size_t uoff = 0; uoff < sizeof(label->vl_uberblock);
	    uoff += (1 << zfs->ashift)) {
		ub = (uberblock_t *)(&label->vl_uberblock[0] + uoff);
		ub->ub_magic = UBERBLOCK_MAGIC;
		ub->ub_version = SPA_VERSION;
		ub->ub_txg = TXG;
		ub->ub_guid_sum = zfs->poolguid + zfs->vdevguid;
		ub->ub_timestamp = 0;

		ub->ub_software_version = SPA_VERSION;
		ub->ub_mmp_magic = MMP_MAGIC;
		ub->ub_mmp_delay = 0;
		ub->ub_mmp_config = 0;
		ub->ub_checkpoint_txg = 0;
		objset_root_blkptr_copy(zfs->mos, &ub->ub_rootbp);
	}

	/*
	 * Write out four copies of the label: two at the beginning of the vdev
	 * and two at the end.
	 */
	for (int i = 0; i < VDEV_LABELS; i++)
		vdev_label_write(zfs, i, label);

	free(label);
}

static void
pool_fini(zfs_opt_t *zfs)
{
	zap_write(zfs, zfs->poolprops);
	dsl_write(zfs);
	objset_write(zfs, zfs->mos);
	pool_labels_write(zfs);
}

struct dnode_cursor *
dnode_cursor_init(zfs_opt_t *zfs, zfs_objset_t *os, dnode_phys_t *dnode,
    off_t size, off_t blksz)
{
	struct dnode_cursor *c;
	uint64_t nbppindir, indlevel, ndatablks, nindblks;

	assert(dnode->dn_nblkptr == 1);
	assert(blksz <= MAXBLOCKSIZE);

	if (blksz == 0) {
		/* Must be between 1<<ashift and 128KB. */
		blksz = MIN(MAXBLOCKSIZE, MAX(1 << zfs->ashift,
		    powerof2(size) ? size : (1ul << flsll(size))));
	}
	assert(powerof2(blksz));

	/*
	 * Do we need indirect blocks?  Figure out how many levels are needed
	 * (indlevel == 1 means no indirect blocks) and how much space is
	 * needed (it has to be allocated up-front to break the dependency
	 * cycle described in objset_write()).
	 */
	ndatablks = size == 0 ? 0 : howmany(size, blksz);
	nindblks = 0;
	for (indlevel = 1, nbppindir = 1; ndatablks > nbppindir; indlevel++) {
		nbppindir *= BLKPTR_PER_INDIR;
		nindblks += howmany(ndatablks, indlevel * nbppindir);
	}
	assert(indlevel < INDIR_LEVELS);
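
	/*
	 * For example (illustrative arithmetic): with the maximum 128KB data
	 * block size, the single L0 blkptr covers objects up to 128KB, one
	 * level of indirection (1024 blkptrs) covers up to 128MB, and two
	 * levels cover up to 128GB.
	 */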
	dnode->dn_nlevels = (uint8_t)indlevel;
	dnode->dn_maxblkid = ndatablks > 0 ? ndatablks - 1 : 0;
	dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;

	c = ecalloc(1, sizeof(*c));
	if (nindblks > 0) {
		c->indspace = nindblks * MAXBLOCKSIZE;
		c->indloc = objset_space_alloc(zfs, os, &c->indspace);
	}
	c->dnode = dnode;
	c->dataoff = 0;
	c->datablksz = blksz;

	return (c);
}

static void
_dnode_cursor_flush(zfs_opt_t *zfs, struct dnode_cursor *c, int levels)
{
	blkptr_t *bp, *pbp;
	void *buf;
	uint64_t fill;
	off_t blkid, blksz, loc;

	assert(levels > 0);
	assert(levels <= c->dnode->dn_nlevels - 1);

	blksz = MAXBLOCKSIZE;
	blkid = (c->dataoff / c->datablksz) / BLKPTR_PER_INDIR;
	for (int level = 1; level <= levels; level++) {
		buf = c->inddir[level - 1];

		if (level == c->dnode->dn_nlevels - 1) {
			pbp = &c->dnode->dn_blkptr[0];
		} else {
			uint64_t iblkid;

			iblkid = blkid & (BLKPTR_PER_INDIR - 1);
			pbp = (blkptr_t *)
			    &c->inddir[level][iblkid * sizeof(blkptr_t)];
		}

		/*
		 * Space for indirect blocks is allocated up-front; see the
		 * comment in objset_write().
		 */
		loc = c->indloc;
		c->indloc += blksz;
		assert(c->indspace >= blksz);
		c->indspace -= blksz;

		bp = buf;
		fill = 0;
		for (size_t i = 0; i < BLKPTR_PER_INDIR; i++)
			fill += BP_GET_FILL(&bp[i]);

		vdev_pwrite_dnode_indir(zfs, c->dnode, level, fill, buf, blksz,
		    loc, pbp);
		memset(buf, 0, MAXBLOCKSIZE);

		blkid /= BLKPTR_PER_INDIR;
	}
}

blkptr_t *
dnode_cursor_next(zfs_opt_t *zfs, struct dnode_cursor *c, off_t off)
{
	off_t blkid, l1id;
	int levels;

	if (c->dnode->dn_nlevels == 1) {
		assert(off < MAXBLOCKSIZE);
		return (&c->dnode->dn_blkptr[0]);
	}

	assert(off % c->datablksz == 0);

	/* Do we need to flush any full indirect blocks? */
	if (off > 0) {
		blkid = off / c->datablksz;
		for (levels = 0; levels < c->dnode->dn_nlevels - 1; levels++) {
			if (blkid % BLKPTR_PER_INDIR != 0)
				break;
			blkid /= BLKPTR_PER_INDIR;
		}
		if (levels > 0)
			_dnode_cursor_flush(zfs, c, levels);
	}

	c->dataoff = off;
	l1id = (off / c->datablksz) & (BLKPTR_PER_INDIR - 1);
	return ((blkptr_t *)&c->inddir[0][l1id * sizeof(blkptr_t)]);
}

void
dnode_cursor_finish(zfs_opt_t *zfs, struct dnode_cursor *c)
{
	int levels;

	levels = c->dnode->dn_nlevels - 1;
	if (levels > 0)
		_dnode_cursor_flush(zfs, c, levels);
	assert(c->indspace == 0);
	free(c);
}

void
zfs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	int dirfd;

	zfs = fsopts->fs_specific;

	/*
	 * Use a fixed seed to provide reproducible pseudo-random numbers for
	 * on-disk structures when needed (e.g., GUIDs, ZAP hash salts).
	 */
	srandom(1729);

	zfs_check_opts(fsopts);

	if (!zfs->nowarn) {
		fprintf(stderr,
		    "ZFS support is currently considered experimental. "
		    "Do not use it for anything critical.\n");
	}

	dirfd = open(dir, O_DIRECTORY | O_RDONLY);
	if (dirfd < 0)
		err(1, "open(%s)", dir);

	vdev_init(zfs, image);
	pool_init(zfs);
	fs_build(zfs, dirfd, root);
	pool_fini(zfs);
	vdev_fini(zfs);
}