1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2022 The FreeBSD Foundation 5 * 6 * This software was developed by Mark Johnston under sponsorship from 7 * the FreeBSD Foundation. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions are 11 * met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 31 #include <sys/param.h> 32 #include <sys/errno.h> 33 #include <sys/queue.h> 34 35 #include <assert.h> 36 #include <fcntl.h> 37 #include <stdalign.h> 38 #include <stdbool.h> 39 #include <stddef.h> 40 #include <stdlib.h> 41 #include <string.h> 42 #include <unistd.h> 43 44 #include <util.h> 45 46 #include "makefs.h" 47 #include "zfs.h" 48 49 #define VDEV_LABEL_SPACE \ 50 ((off_t)(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) 51 _Static_assert(VDEV_LABEL_SPACE <= MINDEVSIZE, ""); 52 53 #define MINMSSIZE ((off_t)1 << 24) /* 16MB */ 54 #define DFLTMSSIZE ((off_t)1 << 29) /* 512MB */ 55 #define MAXMSSIZE ((off_t)1 << 34) /* 16GB */ 56 57 #define INDIR_LEVELS 6 58 /* Indirect blocks are always 128KB. */ 59 #define BLKPTR_PER_INDIR (MAXBLOCKSIZE / sizeof(blkptr_t)) 60 61 struct dnode_cursor { 62 char inddir[INDIR_LEVELS][MAXBLOCKSIZE]; 63 off_t indloc; 64 off_t indspace; 65 dnode_phys_t *dnode; 66 off_t dataoff; 67 off_t datablksz; 68 }; 69 70 void 71 zfs_prep_opts(fsinfo_t *fsopts) 72 { 73 zfs_opt_t *zfs; 74 size_t align; 75 76 align = alignof(uint64_t); 77 zfs = aligned_alloc(align, roundup2(sizeof(*zfs), align)); 78 if (zfs == NULL) 79 err(1, "aligned_alloc"); 80 memset(zfs, 0, sizeof(*zfs)); 81 82 const option_t zfs_options[] = { 83 { '\0', "bootfs", &zfs->bootfs, OPT_STRPTR, 84 0, 0, "Bootable dataset" }, 85 { '\0', "mssize", &zfs->mssize, OPT_INT64, 86 MINMSSIZE, MAXMSSIZE, "Metaslab size" }, 87 { '\0', "poolname", &zfs->poolname, OPT_STRPTR, 88 0, 0, "ZFS pool name" }, 89 { '\0', "rootpath", &zfs->rootpath, OPT_STRPTR, 90 0, 0, "Prefix for all dataset mount points" }, 91 { '\0', "ashift", &zfs->ashift, OPT_INT32, 92 MINBLOCKSHIFT, MAXBLOCKSHIFT, "ZFS pool ashift" }, 93 { '\0', "nowarn", &zfs->nowarn, OPT_BOOL, 94 0, 0, "Suppress warning about experimental ZFS support" }, 95 { .name = NULL } 96 }; 97 98 STAILQ_INIT(&zfs->datasetdescs); 99 100 fsopts->fs_specific = zfs; 101 fsopts->fs_options = copy_opts(zfs_options); 102 } 103 104 int 105 zfs_parse_opts(const char *option, fsinfo_t *fsopts) 106 { 107 zfs_opt_t *zfs; 108 struct dataset_desc *dsdesc; 109 char buf[BUFSIZ], *opt, *val; 110 int rv; 111 112 zfs = fsopts->fs_specific; 113 114 opt = val = estrdup(option); 115 opt = strsep(&val, "="); 116 if (strcmp(opt, "fs") == 0) { 117 if (val == NULL) 118 errx(1, "invalid filesystem parameters `%s'", option); 119 120 /* 121 * Dataset descriptions will be parsed later, in dsl_init(). 122 * Just stash them away for now. 123 */ 124 dsdesc = ecalloc(1, sizeof(*dsdesc)); 125 dsdesc->params = estrdup(val); 126 free(opt); 127 STAILQ_INSERT_TAIL(&zfs->datasetdescs, dsdesc, next); 128 return (1); 129 } 130 free(opt); 131 132 rv = set_option(fsopts->fs_options, option, buf, sizeof(buf)); 133 return (rv == -1 ? 0 : 1); 134 } 135 136 static void 137 zfs_size_vdev(fsinfo_t *fsopts) 138 { 139 zfs_opt_t *zfs; 140 off_t asize, mssize, vdevsize, vdevsize1; 141 142 zfs = fsopts->fs_specific; 143 144 assert(fsopts->maxsize != 0); 145 assert(zfs->ashift != 0); 146 147 /* 148 * Figure out how big the vdev should be. 149 */ 150 vdevsize = rounddown2(fsopts->maxsize, 1 << zfs->ashift); 151 if (vdevsize < MINDEVSIZE) 152 errx(1, "maximum image size is too small"); 153 if (vdevsize < fsopts->minsize || vdevsize > fsopts->maxsize) { 154 errx(1, "image size bounds must be multiples of %d", 155 1 << zfs->ashift); 156 } 157 asize = vdevsize - VDEV_LABEL_SPACE; 158 159 /* 160 * Size metaslabs according to the following heuristic: 161 * - provide at least 8 metaslabs, 162 * - without using a metaslab size larger than 512MB. 163 * This approximates what OpenZFS does without being complicated. In 164 * practice we expect pools to be expanded upon first use, and OpenZFS 165 * does not resize metaslabs in that case, so there is no right answer 166 * here. In general we want to provide large metaslabs even if the 167 * image size is small, and 512MB is a reasonable size for pools up to 168 * several hundred gigabytes. 169 * 170 * The user may override this heuristic using the "-o mssize" option. 171 */ 172 mssize = zfs->mssize; 173 if (mssize == 0) { 174 mssize = MAX(MIN(asize / 8, DFLTMSSIZE), MINMSSIZE); 175 if (!powerof2(mssize)) 176 mssize = 1l << (flsll(mssize) - 1); 177 } 178 if (!powerof2(mssize)) 179 errx(1, "metaslab size must be a power of 2"); 180 181 /* 182 * If we have some slop left over, try to cover it by resizing the vdev, 183 * subject to the maxsize and minsize parameters. 184 */ 185 if (asize % mssize != 0) { 186 vdevsize1 = rounddown2(asize, mssize) + VDEV_LABEL_SPACE; 187 if (vdevsize1 < fsopts->minsize) 188 vdevsize1 = roundup2(asize, mssize) + VDEV_LABEL_SPACE; 189 if (vdevsize1 <= fsopts->maxsize) 190 vdevsize = vdevsize1; 191 } 192 asize = vdevsize - VDEV_LABEL_SPACE; 193 194 zfs->asize = asize; 195 zfs->vdevsize = vdevsize; 196 zfs->mssize = mssize; 197 zfs->msshift = flsll(mssize) - 1; 198 zfs->mscount = asize / mssize; 199 } 200 201 /* 202 * Validate options and set some default values. 203 */ 204 static void 205 zfs_check_opts(fsinfo_t *fsopts) 206 { 207 zfs_opt_t *zfs; 208 209 zfs = fsopts->fs_specific; 210 211 if (fsopts->offset != 0) 212 errx(1, "unhandled offset option"); 213 if (fsopts->maxsize == 0) 214 errx(1, "an image size must be specified"); 215 216 if (zfs->poolname == NULL) 217 errx(1, "a pool name must be specified"); 218 219 if (zfs->rootpath == NULL) 220 easprintf(&zfs->rootpath, "/%s", zfs->poolname); 221 if (zfs->rootpath[0] != '/') 222 errx(1, "mountpoint `%s' must be absolute", zfs->rootpath); 223 224 if (zfs->ashift == 0) 225 zfs->ashift = 12; 226 227 zfs_size_vdev(fsopts); 228 } 229 230 void 231 zfs_cleanup_opts(fsinfo_t *fsopts) 232 { 233 struct dataset_desc *d, *tmp; 234 zfs_opt_t *zfs; 235 236 zfs = fsopts->fs_specific; 237 free(zfs->rootpath); 238 free(zfs->bootfs); 239 free(__DECONST(void *, zfs->poolname)); 240 STAILQ_FOREACH_SAFE(d, &zfs->datasetdescs, next, tmp) { 241 free(d->params); 242 free(d); 243 } 244 free(zfs); 245 free(fsopts->fs_options); 246 } 247 248 static size_t 249 nvlist_size(const nvlist_t *nvl) 250 { 251 return (sizeof(nvl->nv_header) + nvl->nv_size); 252 } 253 254 static void 255 nvlist_copy(const nvlist_t *nvl, char *buf, size_t sz) 256 { 257 assert(sz >= nvlist_size(nvl)); 258 259 memcpy(buf, &nvl->nv_header, sizeof(nvl->nv_header)); 260 memcpy(buf + sizeof(nvl->nv_header), nvl->nv_data, nvl->nv_size); 261 } 262 263 static nvlist_t * 264 pool_config_nvcreate(zfs_opt_t *zfs) 265 { 266 nvlist_t *featuresnv, *poolnv; 267 268 poolnv = nvlist_create(NV_UNIQUE_NAME); 269 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_TXG, TXG); 270 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VERSION, SPA_VERSION); 271 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_EXPORTED); 272 nvlist_add_string(poolnv, ZPOOL_CONFIG_POOL_NAME, zfs->poolname); 273 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_GUID, zfs->poolguid); 274 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_TOP_GUID, zfs->vdevguid); 275 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_GUID, zfs->vdevguid); 276 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VDEV_CHILDREN, 1); 277 278 featuresnv = nvlist_create(NV_UNIQUE_NAME); 279 nvlist_add_nvlist(poolnv, ZPOOL_CONFIG_FEATURES_FOR_READ, featuresnv); 280 nvlist_destroy(featuresnv); 281 282 return (poolnv); 283 } 284 285 static nvlist_t * 286 pool_disk_vdev_config_nvcreate(zfs_opt_t *zfs) 287 { 288 nvlist_t *diskvdevnv; 289 290 assert(zfs->objarrid != 0); 291 292 diskvdevnv = nvlist_create(NV_UNIQUE_NAME); 293 nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK); 294 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASHIFT, zfs->ashift); 295 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASIZE, zfs->asize); 296 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_GUID, zfs->vdevguid); 297 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ID, 0); 298 nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_PATH, "/dev/null"); 299 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_WHOLE_DISK, 1); 300 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG); 301 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_ARRAY, 302 zfs->objarrid); 303 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_SHIFT, 304 zfs->msshift); 305 306 return (diskvdevnv); 307 } 308 309 static nvlist_t * 310 pool_root_vdev_config_nvcreate(zfs_opt_t *zfs) 311 { 312 nvlist_t *diskvdevnv, *rootvdevnv; 313 314 diskvdevnv = pool_disk_vdev_config_nvcreate(zfs); 315 rootvdevnv = nvlist_create(NV_UNIQUE_NAME); 316 317 nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_ID, 0); 318 nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_GUID, zfs->poolguid); 319 nvlist_add_string(rootvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 320 nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG); 321 nvlist_add_nvlist_array(rootvdevnv, ZPOOL_CONFIG_CHILDREN, &diskvdevnv, 322 1); 323 nvlist_destroy(diskvdevnv); 324 325 return (rootvdevnv); 326 } 327 328 /* 329 * Create the pool's "config" object, which contains an nvlist describing pool 330 * parameters and the vdev topology. It is similar but not identical to the 331 * nvlist stored in vdev labels. The main difference is that vdev labels do not 332 * describe the full vdev tree and in particular do not contain the "root" 333 * meta-vdev. 334 */ 335 static void 336 pool_init_objdir_config(zfs_opt_t *zfs, zfs_zap_t *objdir) 337 { 338 dnode_phys_t *dnode; 339 nvlist_t *poolconfig, *vdevconfig; 340 void *configbuf; 341 uint64_t dnid; 342 off_t configloc, configblksz; 343 int error; 344 345 dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_PACKED_NVLIST, 346 DMU_OT_PACKED_NVLIST_SIZE, sizeof(uint64_t), &dnid); 347 348 poolconfig = pool_config_nvcreate(zfs); 349 350 vdevconfig = pool_root_vdev_config_nvcreate(zfs); 351 nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig); 352 nvlist_destroy(vdevconfig); 353 354 error = nvlist_export(poolconfig); 355 if (error != 0) 356 errc(1, error, "nvlist_export"); 357 358 configblksz = nvlist_size(poolconfig); 359 configloc = objset_space_alloc(zfs, zfs->mos, &configblksz); 360 configbuf = ecalloc(1, configblksz); 361 nvlist_copy(poolconfig, configbuf, configblksz); 362 363 vdev_pwrite_dnode_data(zfs, dnode, configbuf, configblksz, configloc); 364 365 dnode->dn_datablkszsec = configblksz >> MINBLOCKSHIFT; 366 dnode->dn_flags = DNODE_FLAG_USED_BYTES; 367 *(uint64_t *)DN_BONUS(dnode) = nvlist_size(poolconfig); 368 369 zap_add_uint64(objdir, DMU_POOL_CONFIG, dnid); 370 371 nvlist_destroy(poolconfig); 372 free(configbuf); 373 } 374 375 /* 376 * Add objects block pointer list objects, used for deferred frees. We don't do 377 * anything with them, but they need to be present or OpenZFS will refuse to 378 * import the pool. 379 */ 380 static void 381 pool_init_objdir_bplists(zfs_opt_t *zfs __unused, zfs_zap_t *objdir) 382 { 383 uint64_t dnid; 384 385 (void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR, 386 BPOBJ_SIZE_V2, &dnid); 387 zap_add_uint64(objdir, DMU_POOL_FREE_BPOBJ, dnid); 388 389 (void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR, 390 BPOBJ_SIZE_V2, &dnid); 391 zap_add_uint64(objdir, DMU_POOL_SYNC_BPLIST, dnid); 392 } 393 394 /* 395 * Add required feature metadata objects. We don't know anything about ZFS 396 * features, so the objects are just empty ZAPs. 397 */ 398 static void 399 pool_init_objdir_feature_maps(zfs_opt_t *zfs, zfs_zap_t *objdir) 400 { 401 dnode_phys_t *dnode; 402 uint64_t dnid; 403 404 dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid); 405 zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_READ, dnid); 406 zap_write(zfs, zap_alloc(zfs->mos, dnode)); 407 408 dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid); 409 zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_WRITE, dnid); 410 zap_write(zfs, zap_alloc(zfs->mos, dnode)); 411 412 dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid); 413 zap_add_uint64(objdir, DMU_POOL_FEATURE_DESCRIPTIONS, dnid); 414 zap_write(zfs, zap_alloc(zfs->mos, dnode)); 415 } 416 417 static void 418 pool_init_objdir_dsl(zfs_opt_t *zfs, zfs_zap_t *objdir) 419 { 420 zap_add_uint64(objdir, DMU_POOL_ROOT_DATASET, 421 dsl_dir_id(zfs->rootdsldir)); 422 } 423 424 static void 425 pool_init_objdir_poolprops(zfs_opt_t *zfs, zfs_zap_t *objdir) 426 { 427 dnode_phys_t *dnode; 428 uint64_t id; 429 430 dnode = objset_dnode_alloc(zfs->mos, DMU_OT_POOL_PROPS, &id); 431 zap_add_uint64(objdir, DMU_POOL_PROPS, id); 432 433 zfs->poolprops = zap_alloc(zfs->mos, dnode); 434 } 435 436 /* 437 * Initialize the MOS object directory, the root of virtually all of the pool's 438 * data and metadata. 439 */ 440 static void 441 pool_init_objdir(zfs_opt_t *zfs) 442 { 443 zfs_zap_t *zap; 444 dnode_phys_t *objdir; 445 446 objdir = objset_dnode_lookup(zfs->mos, DMU_POOL_DIRECTORY_OBJECT); 447 448 zap = zap_alloc(zfs->mos, objdir); 449 pool_init_objdir_config(zfs, zap); 450 pool_init_objdir_bplists(zfs, zap); 451 pool_init_objdir_feature_maps(zfs, zap); 452 pool_init_objdir_dsl(zfs, zap); 453 pool_init_objdir_poolprops(zfs, zap); 454 zap_write(zfs, zap); 455 } 456 457 /* 458 * Initialize the meta-object set (MOS) and immediately write out several 459 * special objects whose contents are already finalized, including the object 460 * directory. 461 * 462 * Once the MOS is finalized, it'll look roughly like this: 463 * 464 * object directory (ZAP) 465 * |-> vdev config object (nvlist) 466 * |-> features for read 467 * |-> features for write 468 * |-> feature descriptions 469 * |-> sync bplist 470 * |-> free bplist 471 * |-> pool properties 472 * L-> root DSL directory 473 * |-> DSL child directory (ZAP) 474 * | |-> $MOS (DSL dir) 475 * | | |-> child map 476 * | | L-> props (ZAP) 477 * | |-> $FREE (DSL dir) 478 * | | |-> child map 479 * | | L-> props (ZAP) 480 * | |-> $ORIGIN (DSL dir) 481 * | | |-> child map 482 * | | |-> dataset 483 * | | | L-> deadlist 484 * | | |-> snapshot 485 * | | | |-> deadlist 486 * | | | L-> snapshot names 487 * | | |-> props (ZAP) 488 * | | L-> clones (ZAP) 489 * | |-> dataset 1 (DSL dir) 490 * | | |-> DSL dataset 491 * | | | |-> snapshot names 492 * | | | L-> deadlist 493 * | | |-> child map 494 * | | | L-> ... 495 * | | L-> props 496 * | |-> dataset 2 497 * | | L-> ... 498 * | |-> ... 499 * | L-> dataset n 500 * |-> DSL root dataset 501 * | |-> snapshot names 502 * | L-> deadlist 503 * L-> props (ZAP) 504 * space map object array 505 * |-> space map 1 506 * |-> space map 2 507 * |-> ... 508 * L-> space map n (zfs->mscount) 509 * 510 * The space map object array is pointed to by the "msarray" property in the 511 * pool configuration. 512 */ 513 static void 514 pool_init(zfs_opt_t *zfs) 515 { 516 uint64_t dnid; 517 518 zfs->poolguid = ((uint64_t)random() << 32) | random(); 519 zfs->vdevguid = ((uint64_t)random() << 32) | random(); 520 521 zfs->mos = objset_alloc(zfs, DMU_OST_META); 522 523 (void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_DIRECTORY, &dnid); 524 assert(dnid == DMU_POOL_DIRECTORY_OBJECT); 525 526 (void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_ARRAY, &zfs->objarrid); 527 528 dsl_init(zfs); 529 530 pool_init_objdir(zfs); 531 } 532 533 static void 534 pool_labels_write(zfs_opt_t *zfs) 535 { 536 uberblock_t *ub; 537 vdev_label_t *label; 538 nvlist_t *poolconfig, *vdevconfig; 539 int error; 540 541 label = ecalloc(1, sizeof(*label)); 542 543 /* 544 * Assemble the vdev configuration and store it in the label. 545 */ 546 poolconfig = pool_config_nvcreate(zfs); 547 vdevconfig = pool_disk_vdev_config_nvcreate(zfs); 548 nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig); 549 nvlist_destroy(vdevconfig); 550 551 error = nvlist_export(poolconfig); 552 if (error != 0) 553 errc(1, error, "nvlist_export"); 554 nvlist_copy(poolconfig, label->vl_vdev_phys.vp_nvlist, 555 sizeof(label->vl_vdev_phys.vp_nvlist)); 556 nvlist_destroy(poolconfig); 557 558 /* 559 * Fill out the uberblock. Just make each one the same. The embedded 560 * checksum is calculated in vdev_label_write(). 561 */ 562 for (size_t uoff = 0; uoff < sizeof(label->vl_uberblock); 563 uoff += (1 << zfs->ashift)) { 564 ub = (uberblock_t *)(&label->vl_uberblock[0] + uoff); 565 ub->ub_magic = UBERBLOCK_MAGIC; 566 ub->ub_version = SPA_VERSION; 567 ub->ub_txg = TXG; 568 ub->ub_guid_sum = zfs->poolguid + zfs->vdevguid; 569 ub->ub_timestamp = 0; 570 571 ub->ub_software_version = SPA_VERSION; 572 ub->ub_mmp_magic = MMP_MAGIC; 573 ub->ub_mmp_delay = 0; 574 ub->ub_mmp_config = 0; 575 ub->ub_checkpoint_txg = 0; 576 objset_root_blkptr_copy(zfs->mos, &ub->ub_rootbp); 577 } 578 579 /* 580 * Write out four copies of the label: two at the beginning of the vdev 581 * and two at the end. 582 */ 583 for (int i = 0; i < VDEV_LABELS; i++) 584 vdev_label_write(zfs, i, label); 585 586 free(label); 587 } 588 589 static void 590 pool_fini(zfs_opt_t *zfs) 591 { 592 zap_write(zfs, zfs->poolprops); 593 dsl_write(zfs); 594 objset_write(zfs, zfs->mos); 595 pool_labels_write(zfs); 596 } 597 598 struct dnode_cursor * 599 dnode_cursor_init(zfs_opt_t *zfs, zfs_objset_t *os, dnode_phys_t *dnode, 600 off_t size, off_t blksz) 601 { 602 struct dnode_cursor *c; 603 uint64_t nbppindir, indlevel, ndatablks, nindblks; 604 605 assert(dnode->dn_nblkptr == 1); 606 assert(blksz <= MAXBLOCKSIZE); 607 608 if (blksz == 0) { 609 /* Must be between 1<<ashift and 128KB. */ 610 blksz = MIN(MAXBLOCKSIZE, MAX(1 << zfs->ashift, 611 powerof2(size) ? size : (1l << flsll(size)))); 612 } 613 assert(powerof2(blksz)); 614 615 /* 616 * Do we need indirect blocks? Figure out how many levels are needed 617 * (indlevel == 1 means no indirect blocks) and how much space is needed 618 * (it has to be allocated up-front to break the dependency cycle 619 * described in objset_write()). 620 */ 621 ndatablks = size == 0 ? 0 : howmany(size, blksz); 622 nindblks = 0; 623 for (indlevel = 1, nbppindir = 1; ndatablks > nbppindir; indlevel++) { 624 nbppindir *= BLKPTR_PER_INDIR; 625 nindblks += howmany(ndatablks, indlevel * nbppindir); 626 } 627 assert(indlevel < INDIR_LEVELS); 628 629 dnode->dn_nlevels = (uint8_t)indlevel; 630 dnode->dn_maxblkid = ndatablks > 0 ? ndatablks - 1 : 0; 631 dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT; 632 633 c = ecalloc(1, sizeof(*c)); 634 if (nindblks > 0) { 635 c->indspace = nindblks * MAXBLOCKSIZE; 636 c->indloc = objset_space_alloc(zfs, os, &c->indspace); 637 } 638 c->dnode = dnode; 639 c->dataoff = 0; 640 c->datablksz = blksz; 641 642 return (c); 643 } 644 645 static void 646 _dnode_cursor_flush(zfs_opt_t *zfs, struct dnode_cursor *c, int levels) 647 { 648 blkptr_t *bp, *pbp; 649 void *buf; 650 uint64_t fill; 651 off_t blkid, blksz, loc; 652 653 assert(levels > 0); 654 assert(levels <= c->dnode->dn_nlevels - 1); 655 656 blksz = MAXBLOCKSIZE; 657 blkid = (c->dataoff / c->datablksz) / BLKPTR_PER_INDIR; 658 for (int level = 1; level <= levels; level++) { 659 buf = c->inddir[level - 1]; 660 661 if (level == c->dnode->dn_nlevels - 1) { 662 pbp = &c->dnode->dn_blkptr[0]; 663 } else { 664 uint64_t iblkid; 665 666 iblkid = blkid & (BLKPTR_PER_INDIR - 1); 667 pbp = (blkptr_t *) 668 &c->inddir[level][iblkid * sizeof(blkptr_t)]; 669 } 670 671 /* 672 * Space for indirect blocks is allocated up-front; see the 673 * comment in objset_write(). 674 */ 675 loc = c->indloc; 676 c->indloc += blksz; 677 assert(c->indspace >= blksz); 678 c->indspace -= blksz; 679 680 bp = buf; 681 fill = 0; 682 for (size_t i = 0; i < BLKPTR_PER_INDIR; i++) 683 fill += BP_GET_FILL(&bp[i]); 684 685 vdev_pwrite_dnode_indir(zfs, c->dnode, level, fill, buf, blksz, 686 loc, pbp); 687 memset(buf, 0, MAXBLOCKSIZE); 688 689 blkid /= BLKPTR_PER_INDIR; 690 } 691 } 692 693 blkptr_t * 694 dnode_cursor_next(zfs_opt_t *zfs, struct dnode_cursor *c, off_t off) 695 { 696 off_t blkid, l1id; 697 int levels; 698 699 if (c->dnode->dn_nlevels == 1) { 700 assert(off < MAXBLOCKSIZE); 701 return (&c->dnode->dn_blkptr[0]); 702 } 703 704 assert(off % c->datablksz == 0); 705 706 /* Do we need to flush any full indirect blocks? */ 707 if (off > 0) { 708 blkid = off / c->datablksz; 709 for (levels = 0; levels < c->dnode->dn_nlevels - 1; levels++) { 710 if (blkid % BLKPTR_PER_INDIR != 0) 711 break; 712 blkid /= BLKPTR_PER_INDIR; 713 } 714 if (levels > 0) 715 _dnode_cursor_flush(zfs, c, levels); 716 } 717 718 c->dataoff = off; 719 l1id = (off / c->datablksz) & (BLKPTR_PER_INDIR - 1); 720 return ((blkptr_t *)&c->inddir[0][l1id * sizeof(blkptr_t)]); 721 } 722 723 void 724 dnode_cursor_finish(zfs_opt_t *zfs, struct dnode_cursor *c) 725 { 726 int levels; 727 728 levels = c->dnode->dn_nlevels - 1; 729 if (levels > 0) 730 _dnode_cursor_flush(zfs, c, levels); 731 assert(c->indspace == 0); 732 free(c); 733 } 734 735 void 736 zfs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts) 737 { 738 zfs_opt_t *zfs; 739 int dirfd; 740 741 zfs = fsopts->fs_specific; 742 743 /* 744 * Use a fixed seed to provide reproducible pseudo-random numbers for 745 * on-disk structures when needed (e.g., GUIDs, ZAP hash salts). 746 */ 747 srandom(1729); 748 749 zfs_check_opts(fsopts); 750 751 if (!zfs->nowarn) { 752 fprintf(stderr, 753 "ZFS support is currently considered experimental. " 754 "Do not use it for anything critical.\n"); 755 } 756 757 dirfd = open(dir, O_DIRECTORY | O_RDONLY); 758 if (dirfd < 0) 759 err(1, "open(%s)", dir); 760 761 vdev_init(zfs, image); 762 pool_init(zfs); 763 fs_build(zfs, dirfd, root); 764 pool_fini(zfs); 765 vdev_fini(zfs); 766 } 767