1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2022 The FreeBSD Foundation 5 * 6 * This software was developed by Mark Johnston under sponsorship from 7 * the FreeBSD Foundation. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions are 11 * met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 31 #include <sys/param.h> 32 #include <sys/errno.h> 33 #include <sys/queue.h> 34 35 #include <assert.h> 36 #include <fcntl.h> 37 #include <stdbool.h> 38 #include <stddef.h> 39 #include <stdlib.h> 40 #include <string.h> 41 #include <unistd.h> 42 43 #include <util.h> 44 45 #include "makefs.h" 46 #include "zfs.h" 47 48 #define VDEV_LABEL_SPACE \ 49 ((off_t)(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) 50 _Static_assert(VDEV_LABEL_SPACE <= MINDEVSIZE, ""); 51 52 #define MINMSSIZE ((off_t)1 << 24) /* 16MB */ 53 #define DFLTMSSIZE ((off_t)1 << 29) /* 512MB */ 54 #define MAXMSSIZE ((off_t)1 << 34) /* 16GB */ 55 56 #define INDIR_LEVELS 6 57 /* Indirect blocks are always 128KB. */ 58 #define BLKPTR_PER_INDIR (MAXBLOCKSIZE / sizeof(blkptr_t)) 59 60 struct dnode_cursor { 61 char inddir[INDIR_LEVELS][MAXBLOCKSIZE]; 62 off_t indloc; 63 off_t indspace; 64 dnode_phys_t *dnode; 65 off_t dataoff; 66 off_t datablksz; 67 }; 68 69 void 70 zfs_prep_opts(fsinfo_t *fsopts) 71 { 72 zfs_opt_t *zfs = ecalloc(1, sizeof(*zfs)); 73 74 const option_t zfs_options[] = { 75 { '\0', "bootfs", &zfs->bootfs, OPT_STRPTR, 76 0, 0, "Bootable dataset" }, 77 { '\0', "mssize", &zfs->mssize, OPT_INT64, 78 MINMSSIZE, MAXMSSIZE, "Metaslab size" }, 79 { '\0', "poolname", &zfs->poolname, OPT_STRPTR, 80 0, 0, "ZFS pool name" }, 81 { '\0', "rootpath", &zfs->rootpath, OPT_STRPTR, 82 0, 0, "Prefix for all dataset mount points" }, 83 { '\0', "ashift", &zfs->ashift, OPT_INT32, 84 MINBLOCKSHIFT, MAXBLOCKSHIFT, "ZFS pool ashift" }, 85 { '\0', "nowarn", &zfs->nowarn, OPT_BOOL, 86 0, 0, "Suppress warning about experimental ZFS support" }, 87 { .name = NULL } 88 }; 89 90 STAILQ_INIT(&zfs->datasetdescs); 91 92 fsopts->fs_specific = zfs; 93 fsopts->fs_options = copy_opts(zfs_options); 94 } 95 96 int 97 zfs_parse_opts(const char *option, fsinfo_t *fsopts) 98 { 99 zfs_opt_t *zfs; 100 struct dataset_desc *dsdesc; 101 char buf[BUFSIZ], *opt, *val; 102 int rv; 103 104 zfs = fsopts->fs_specific; 105 106 opt = val = estrdup(option); 107 opt = strsep(&val, "="); 108 if (strcmp(opt, "fs") == 0) { 109 if (val == NULL) 110 errx(1, "invalid filesystem parameters `%s'", option); 111 112 /* 113 * Dataset descriptions will be parsed later, in dsl_init(). 114 * Just stash them away for now. 115 */ 116 dsdesc = ecalloc(1, sizeof(*dsdesc)); 117 dsdesc->params = estrdup(val); 118 free(opt); 119 STAILQ_INSERT_TAIL(&zfs->datasetdescs, dsdesc, next); 120 return (1); 121 } 122 free(opt); 123 124 rv = set_option(fsopts->fs_options, option, buf, sizeof(buf)); 125 return (rv == -1 ? 0 : 1); 126 } 127 128 static void 129 zfs_size_vdev(fsinfo_t *fsopts) 130 { 131 zfs_opt_t *zfs; 132 off_t asize, mssize, vdevsize, vdevsize1; 133 134 zfs = fsopts->fs_specific; 135 136 assert(fsopts->maxsize != 0); 137 assert(zfs->ashift != 0); 138 139 /* 140 * Figure out how big the vdev should be. 141 */ 142 vdevsize = rounddown2(fsopts->maxsize, 1 << zfs->ashift); 143 if (vdevsize < MINDEVSIZE) 144 errx(1, "maximum image size is too small"); 145 if (vdevsize < fsopts->minsize || vdevsize > fsopts->maxsize) { 146 errx(1, "image size bounds must be multiples of %d", 147 1 << zfs->ashift); 148 } 149 asize = vdevsize - VDEV_LABEL_SPACE; 150 151 /* 152 * Size metaslabs according to the following heuristic: 153 * - provide at least 8 metaslabs, 154 * - without using a metaslab size larger than 512MB. 155 * This approximates what OpenZFS does without being complicated. In 156 * practice we expect pools to be expanded upon first use, and OpenZFS 157 * does not resize metaslabs in that case, so there is no right answer 158 * here. In general we want to provide large metaslabs even if the 159 * image size is small, and 512MB is a reasonable size for pools up to 160 * several hundred gigabytes. 161 * 162 * The user may override this heuristic using the "-o mssize" option. 163 */ 164 mssize = zfs->mssize; 165 if (mssize == 0) { 166 mssize = MAX(MIN(asize / 8, DFLTMSSIZE), MINMSSIZE); 167 if (!powerof2(mssize)) 168 mssize = 1l << (flsll(mssize) - 1); 169 } 170 if (!powerof2(mssize)) 171 errx(1, "metaslab size must be a power of 2"); 172 173 /* 174 * If we have some slop left over, try to cover it by resizing the vdev, 175 * subject to the maxsize and minsize parameters. 176 */ 177 if (asize % mssize != 0) { 178 vdevsize1 = rounddown2(asize, mssize) + VDEV_LABEL_SPACE; 179 if (vdevsize1 < fsopts->minsize) 180 vdevsize1 = roundup2(asize, mssize) + VDEV_LABEL_SPACE; 181 if (vdevsize1 <= fsopts->maxsize) 182 vdevsize = vdevsize1; 183 } 184 asize = vdevsize - VDEV_LABEL_SPACE; 185 186 zfs->asize = asize; 187 zfs->vdevsize = vdevsize; 188 zfs->mssize = mssize; 189 zfs->msshift = flsll(mssize) - 1; 190 zfs->mscount = asize / mssize; 191 } 192 193 /* 194 * Validate options and set some default values. 195 */ 196 static void 197 zfs_check_opts(fsinfo_t *fsopts) 198 { 199 zfs_opt_t *zfs; 200 201 zfs = fsopts->fs_specific; 202 203 if (fsopts->offset != 0) 204 errx(1, "unhandled offset option"); 205 if (fsopts->maxsize == 0) 206 errx(1, "an image size must be specified"); 207 208 if (zfs->poolname == NULL) 209 errx(1, "a pool name must be specified"); 210 211 if (zfs->rootpath == NULL) 212 easprintf(&zfs->rootpath, "/%s", zfs->poolname); 213 if (zfs->rootpath[0] != '/') 214 errx(1, "mountpoint `%s' must be absolute", zfs->rootpath); 215 216 if (zfs->ashift == 0) 217 zfs->ashift = 12; 218 219 zfs_size_vdev(fsopts); 220 } 221 222 void 223 zfs_cleanup_opts(fsinfo_t *fsopts) 224 { 225 struct dataset_desc *d, *tmp; 226 zfs_opt_t *zfs; 227 228 zfs = fsopts->fs_specific; 229 free(zfs->rootpath); 230 free(zfs->bootfs); 231 free(__DECONST(void *, zfs->poolname)); 232 STAILQ_FOREACH_SAFE(d, &zfs->datasetdescs, next, tmp) { 233 free(d->params); 234 free(d); 235 } 236 free(zfs); 237 free(fsopts->fs_options); 238 } 239 240 static size_t 241 nvlist_size(const nvlist_t *nvl) 242 { 243 return (sizeof(nvl->nv_header) + nvl->nv_size); 244 } 245 246 static void 247 nvlist_copy(const nvlist_t *nvl, char *buf, size_t sz) 248 { 249 assert(sz >= nvlist_size(nvl)); 250 251 memcpy(buf, &nvl->nv_header, sizeof(nvl->nv_header)); 252 memcpy(buf + sizeof(nvl->nv_header), nvl->nv_data, nvl->nv_size); 253 } 254 255 static nvlist_t * 256 pool_config_nvcreate(zfs_opt_t *zfs) 257 { 258 nvlist_t *featuresnv, *poolnv; 259 260 poolnv = nvlist_create(NV_UNIQUE_NAME); 261 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_TXG, TXG); 262 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VERSION, SPA_VERSION); 263 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_EXPORTED); 264 nvlist_add_string(poolnv, ZPOOL_CONFIG_POOL_NAME, zfs->poolname); 265 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_GUID, zfs->poolguid); 266 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_TOP_GUID, zfs->vdevguid); 267 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_GUID, zfs->vdevguid); 268 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VDEV_CHILDREN, 1); 269 270 featuresnv = nvlist_create(NV_UNIQUE_NAME); 271 nvlist_add_nvlist(poolnv, ZPOOL_CONFIG_FEATURES_FOR_READ, featuresnv); 272 nvlist_destroy(featuresnv); 273 274 return (poolnv); 275 } 276 277 static nvlist_t * 278 pool_disk_vdev_config_nvcreate(zfs_opt_t *zfs) 279 { 280 nvlist_t *diskvdevnv; 281 282 assert(zfs->objarrid != 0); 283 284 diskvdevnv = nvlist_create(NV_UNIQUE_NAME); 285 nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK); 286 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASHIFT, zfs->ashift); 287 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASIZE, zfs->asize); 288 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_GUID, zfs->vdevguid); 289 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ID, 0); 290 nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_PATH, "/dev/null"); 291 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_WHOLE_DISK, 1); 292 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG); 293 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_ARRAY, 294 zfs->objarrid); 295 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_SHIFT, 296 zfs->msshift); 297 298 return (diskvdevnv); 299 } 300 301 static nvlist_t * 302 pool_root_vdev_config_nvcreate(zfs_opt_t *zfs) 303 { 304 nvlist_t *diskvdevnv, *rootvdevnv; 305 306 diskvdevnv = pool_disk_vdev_config_nvcreate(zfs); 307 rootvdevnv = nvlist_create(NV_UNIQUE_NAME); 308 309 nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_ID, 0); 310 nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_GUID, zfs->poolguid); 311 nvlist_add_string(rootvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 312 nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG); 313 nvlist_add_nvlist_array(rootvdevnv, ZPOOL_CONFIG_CHILDREN, &diskvdevnv, 314 1); 315 nvlist_destroy(diskvdevnv); 316 317 return (rootvdevnv); 318 } 319 320 /* 321 * Create the pool's "config" object, which contains an nvlist describing pool 322 * parameters and the vdev topology. It is similar but not identical to the 323 * nvlist stored in vdev labels. The main difference is that vdev labels do not 324 * describe the full vdev tree and in particular do not contain the "root" 325 * meta-vdev. 326 */ 327 static void 328 pool_init_objdir_config(zfs_opt_t *zfs, zfs_zap_t *objdir) 329 { 330 dnode_phys_t *dnode; 331 nvlist_t *poolconfig, *vdevconfig; 332 void *configbuf; 333 uint64_t dnid; 334 off_t configloc, configblksz; 335 int error; 336 337 dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_PACKED_NVLIST, 338 DMU_OT_PACKED_NVLIST_SIZE, sizeof(uint64_t), &dnid); 339 340 poolconfig = pool_config_nvcreate(zfs); 341 342 vdevconfig = pool_root_vdev_config_nvcreate(zfs); 343 nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig); 344 nvlist_destroy(vdevconfig); 345 346 error = nvlist_export(poolconfig); 347 if (error != 0) 348 errc(1, error, "nvlist_export"); 349 350 configblksz = nvlist_size(poolconfig); 351 configloc = objset_space_alloc(zfs, zfs->mos, &configblksz); 352 configbuf = ecalloc(1, configblksz); 353 nvlist_copy(poolconfig, configbuf, configblksz); 354 355 vdev_pwrite_dnode_data(zfs, dnode, configbuf, configblksz, configloc); 356 357 dnode->dn_datablkszsec = configblksz >> MINBLOCKSHIFT; 358 dnode->dn_flags = DNODE_FLAG_USED_BYTES; 359 *(uint64_t *)DN_BONUS(dnode) = nvlist_size(poolconfig); 360 361 zap_add_uint64(objdir, DMU_POOL_CONFIG, dnid); 362 363 nvlist_destroy(poolconfig); 364 free(configbuf); 365 } 366 367 /* 368 * Add objects block pointer list objects, used for deferred frees. We don't do 369 * anything with them, but they need to be present or OpenZFS will refuse to 370 * import the pool. 371 */ 372 static void 373 pool_init_objdir_bplists(zfs_opt_t *zfs __unused, zfs_zap_t *objdir) 374 { 375 uint64_t dnid; 376 377 (void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR, 378 BPOBJ_SIZE_V2, &dnid); 379 zap_add_uint64(objdir, DMU_POOL_FREE_BPOBJ, dnid); 380 381 (void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR, 382 BPOBJ_SIZE_V2, &dnid); 383 zap_add_uint64(objdir, DMU_POOL_SYNC_BPLIST, dnid); 384 } 385 386 /* 387 * Add required feature metadata objects. We don't know anything about ZFS 388 * features, so the objects are just empty ZAPs. 389 */ 390 static void 391 pool_init_objdir_feature_maps(zfs_opt_t *zfs, zfs_zap_t *objdir) 392 { 393 dnode_phys_t *dnode; 394 uint64_t dnid; 395 396 dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid); 397 zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_READ, dnid); 398 zap_write(zfs, zap_alloc(zfs->mos, dnode)); 399 400 dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid); 401 zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_WRITE, dnid); 402 zap_write(zfs, zap_alloc(zfs->mos, dnode)); 403 404 dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid); 405 zap_add_uint64(objdir, DMU_POOL_FEATURE_DESCRIPTIONS, dnid); 406 zap_write(zfs, zap_alloc(zfs->mos, dnode)); 407 } 408 409 static void 410 pool_init_objdir_dsl(zfs_opt_t *zfs, zfs_zap_t *objdir) 411 { 412 zap_add_uint64(objdir, DMU_POOL_ROOT_DATASET, 413 dsl_dir_id(zfs->rootdsldir)); 414 } 415 416 static void 417 pool_init_objdir_poolprops(zfs_opt_t *zfs, zfs_zap_t *objdir) 418 { 419 dnode_phys_t *dnode; 420 uint64_t id; 421 422 dnode = objset_dnode_alloc(zfs->mos, DMU_OT_POOL_PROPS, &id); 423 zap_add_uint64(objdir, DMU_POOL_PROPS, id); 424 425 zfs->poolprops = zap_alloc(zfs->mos, dnode); 426 } 427 428 /* 429 * Initialize the MOS object directory, the root of virtually all of the pool's 430 * data and metadata. 431 */ 432 static void 433 pool_init_objdir(zfs_opt_t *zfs) 434 { 435 zfs_zap_t *zap; 436 dnode_phys_t *objdir; 437 438 objdir = objset_dnode_lookup(zfs->mos, DMU_POOL_DIRECTORY_OBJECT); 439 440 zap = zap_alloc(zfs->mos, objdir); 441 pool_init_objdir_config(zfs, zap); 442 pool_init_objdir_bplists(zfs, zap); 443 pool_init_objdir_feature_maps(zfs, zap); 444 pool_init_objdir_dsl(zfs, zap); 445 pool_init_objdir_poolprops(zfs, zap); 446 zap_write(zfs, zap); 447 } 448 449 /* 450 * Initialize the meta-object set (MOS) and immediately write out several 451 * special objects whose contents are already finalized, including the object 452 * directory. 453 * 454 * Once the MOS is finalized, it'll look roughly like this: 455 * 456 * object directory (ZAP) 457 * |-> vdev config object (nvlist) 458 * |-> features for read 459 * |-> features for write 460 * |-> feature descriptions 461 * |-> sync bplist 462 * |-> free bplist 463 * |-> pool properties 464 * L-> root DSL directory 465 * |-> DSL child directory (ZAP) 466 * | |-> $MOS (DSL dir) 467 * | | |-> child map 468 * | | L-> props (ZAP) 469 * | |-> $FREE (DSL dir) 470 * | | |-> child map 471 * | | L-> props (ZAP) 472 * | |-> $ORIGIN (DSL dir) 473 * | | |-> child map 474 * | | |-> dataset 475 * | | | L-> deadlist 476 * | | |-> snapshot 477 * | | | |-> deadlist 478 * | | | L-> snapshot names 479 * | | |-> props (ZAP) 480 * | | L-> clones (ZAP) 481 * | |-> dataset 1 (DSL dir) 482 * | | |-> DSL dataset 483 * | | | |-> snapshot names 484 * | | | L-> deadlist 485 * | | |-> child map 486 * | | | L-> ... 487 * | | L-> props 488 * | |-> dataset 2 489 * | | L-> ... 490 * | |-> ... 491 * | L-> dataset n 492 * |-> DSL root dataset 493 * | |-> snapshot names 494 * | L-> deadlist 495 * L-> props (ZAP) 496 * space map object array 497 * |-> space map 1 498 * |-> space map 2 499 * |-> ... 500 * L-> space map n (zfs->mscount) 501 * 502 * The space map object array is pointed to by the "msarray" property in the 503 * pool configuration. 504 */ 505 static void 506 pool_init(zfs_opt_t *zfs) 507 { 508 uint64_t dnid; 509 510 zfs->poolguid = ((uint64_t)random() << 32) | random(); 511 zfs->vdevguid = ((uint64_t)random() << 32) | random(); 512 513 zfs->mos = objset_alloc(zfs, DMU_OST_META); 514 515 (void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_DIRECTORY, &dnid); 516 assert(dnid == DMU_POOL_DIRECTORY_OBJECT); 517 518 (void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_ARRAY, &zfs->objarrid); 519 520 dsl_init(zfs); 521 522 pool_init_objdir(zfs); 523 } 524 525 static void 526 pool_labels_write(zfs_opt_t *zfs) 527 { 528 uberblock_t *ub; 529 vdev_label_t *label; 530 nvlist_t *poolconfig, *vdevconfig; 531 int error; 532 533 label = ecalloc(1, sizeof(*label)); 534 535 /* 536 * Assemble the vdev configuration and store it in the label. 537 */ 538 poolconfig = pool_config_nvcreate(zfs); 539 vdevconfig = pool_disk_vdev_config_nvcreate(zfs); 540 nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig); 541 nvlist_destroy(vdevconfig); 542 543 error = nvlist_export(poolconfig); 544 if (error != 0) 545 errc(1, error, "nvlist_export"); 546 nvlist_copy(poolconfig, label->vl_vdev_phys.vp_nvlist, 547 sizeof(label->vl_vdev_phys.vp_nvlist)); 548 nvlist_destroy(poolconfig); 549 550 /* 551 * Fill out the uberblock. Just make each one the same. The embedded 552 * checksum is calculated in vdev_label_write(). 553 */ 554 for (size_t uoff = 0; uoff < sizeof(label->vl_uberblock); 555 uoff += (1 << zfs->ashift)) { 556 ub = (uberblock_t *)(&label->vl_uberblock[0] + uoff); 557 ub->ub_magic = UBERBLOCK_MAGIC; 558 ub->ub_version = SPA_VERSION; 559 ub->ub_txg = TXG; 560 ub->ub_guid_sum = zfs->poolguid + zfs->vdevguid; 561 ub->ub_timestamp = 0; 562 563 ub->ub_software_version = SPA_VERSION; 564 ub->ub_mmp_magic = MMP_MAGIC; 565 ub->ub_mmp_delay = 0; 566 ub->ub_mmp_config = 0; 567 ub->ub_checkpoint_txg = 0; 568 objset_root_blkptr_copy(zfs->mos, &ub->ub_rootbp); 569 } 570 571 /* 572 * Write out four copies of the label: two at the beginning of the vdev 573 * and two at the end. 574 */ 575 for (int i = 0; i < VDEV_LABELS; i++) 576 vdev_label_write(zfs, i, label); 577 578 free(label); 579 } 580 581 static void 582 pool_fini(zfs_opt_t *zfs) 583 { 584 zap_write(zfs, zfs->poolprops); 585 dsl_write(zfs); 586 objset_write(zfs, zfs->mos); 587 pool_labels_write(zfs); 588 } 589 590 struct dnode_cursor * 591 dnode_cursor_init(zfs_opt_t *zfs, zfs_objset_t *os, dnode_phys_t *dnode, 592 off_t size, off_t blksz) 593 { 594 struct dnode_cursor *c; 595 uint64_t nbppindir, indlevel, ndatablks, nindblks; 596 597 assert(dnode->dn_nblkptr == 1); 598 assert(blksz <= MAXBLOCKSIZE); 599 600 if (blksz == 0) { 601 /* Must be between 1<<ashift and 128KB. */ 602 blksz = MIN(MAXBLOCKSIZE, MAX(1 << zfs->ashift, 603 powerof2(size) ? size : (1ul << flsll(size)))); 604 } 605 assert(powerof2(blksz)); 606 607 /* 608 * Do we need indirect blocks? Figure out how many levels are needed 609 * (indlevel == 1 means no indirect blocks) and how much space is needed 610 * (it has to be allocated up-front to break the dependency cycle 611 * described in objset_write()). 612 */ 613 ndatablks = size == 0 ? 0 : howmany(size, blksz); 614 nindblks = 0; 615 for (indlevel = 1, nbppindir = 1; ndatablks > nbppindir; indlevel++) { 616 nbppindir *= BLKPTR_PER_INDIR; 617 nindblks += howmany(ndatablks, indlevel * nbppindir); 618 } 619 assert(indlevel < INDIR_LEVELS); 620 621 dnode->dn_nlevels = (uint8_t)indlevel; 622 dnode->dn_maxblkid = ndatablks > 0 ? ndatablks - 1 : 0; 623 dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT; 624 625 c = ecalloc(1, sizeof(*c)); 626 if (nindblks > 0) { 627 c->indspace = nindblks * MAXBLOCKSIZE; 628 c->indloc = objset_space_alloc(zfs, os, &c->indspace); 629 } 630 c->dnode = dnode; 631 c->dataoff = 0; 632 c->datablksz = blksz; 633 634 return (c); 635 } 636 637 static void 638 _dnode_cursor_flush(zfs_opt_t *zfs, struct dnode_cursor *c, int levels) 639 { 640 blkptr_t *bp, *pbp; 641 void *buf; 642 uint64_t fill; 643 off_t blkid, blksz, loc; 644 645 assert(levels > 0); 646 assert(levels <= c->dnode->dn_nlevels - 1); 647 648 blksz = MAXBLOCKSIZE; 649 blkid = (c->dataoff / c->datablksz) / BLKPTR_PER_INDIR; 650 for (int level = 1; level <= levels; level++) { 651 buf = c->inddir[level - 1]; 652 653 if (level == c->dnode->dn_nlevels - 1) { 654 pbp = &c->dnode->dn_blkptr[0]; 655 } else { 656 uint64_t iblkid; 657 658 iblkid = blkid & (BLKPTR_PER_INDIR - 1); 659 pbp = (blkptr_t *) 660 &c->inddir[level][iblkid * sizeof(blkptr_t)]; 661 } 662 663 /* 664 * Space for indirect blocks is allocated up-front; see the 665 * comment in objset_write(). 666 */ 667 loc = c->indloc; 668 c->indloc += blksz; 669 assert(c->indspace >= blksz); 670 c->indspace -= blksz; 671 672 bp = buf; 673 fill = 0; 674 for (size_t i = 0; i < BLKPTR_PER_INDIR; i++) 675 fill += BP_GET_FILL(&bp[i]); 676 677 vdev_pwrite_dnode_indir(zfs, c->dnode, level, fill, buf, blksz, 678 loc, pbp); 679 memset(buf, 0, MAXBLOCKSIZE); 680 681 blkid /= BLKPTR_PER_INDIR; 682 } 683 } 684 685 blkptr_t * 686 dnode_cursor_next(zfs_opt_t *zfs, struct dnode_cursor *c, off_t off) 687 { 688 off_t blkid, l1id; 689 int levels; 690 691 if (c->dnode->dn_nlevels == 1) { 692 assert(off < MAXBLOCKSIZE); 693 return (&c->dnode->dn_blkptr[0]); 694 } 695 696 assert(off % c->datablksz == 0); 697 698 /* Do we need to flush any full indirect blocks? */ 699 if (off > 0) { 700 blkid = off / c->datablksz; 701 for (levels = 0; levels < c->dnode->dn_nlevels - 1; levels++) { 702 if (blkid % BLKPTR_PER_INDIR != 0) 703 break; 704 blkid /= BLKPTR_PER_INDIR; 705 } 706 if (levels > 0) 707 _dnode_cursor_flush(zfs, c, levels); 708 } 709 710 c->dataoff = off; 711 l1id = (off / c->datablksz) & (BLKPTR_PER_INDIR - 1); 712 return ((blkptr_t *)&c->inddir[0][l1id * sizeof(blkptr_t)]); 713 } 714 715 void 716 dnode_cursor_finish(zfs_opt_t *zfs, struct dnode_cursor *c) 717 { 718 int levels; 719 720 levels = c->dnode->dn_nlevels - 1; 721 if (levels > 0) 722 _dnode_cursor_flush(zfs, c, levels); 723 assert(c->indspace == 0); 724 free(c); 725 } 726 727 void 728 zfs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts) 729 { 730 zfs_opt_t *zfs; 731 int dirfd; 732 733 zfs = fsopts->fs_specific; 734 735 /* 736 * Use a fixed seed to provide reproducible pseudo-random numbers for 737 * on-disk structures when needed (e.g., GUIDs, ZAP hash salts). 738 */ 739 srandom(1729); 740 741 zfs_check_opts(fsopts); 742 743 if (!zfs->nowarn) { 744 fprintf(stderr, 745 "ZFS support is currently considered experimental. " 746 "Do not use it for anything critical.\n"); 747 } 748 749 dirfd = open(dir, O_DIRECTORY | O_RDONLY); 750 if (dirfd < 0) 751 err(1, "open(%s)", dir); 752 753 vdev_init(zfs, image); 754 pool_init(zfs); 755 fs_build(zfs, dirfd, root); 756 pool_fini(zfs); 757 vdev_fini(zfs); 758 } 759