/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2022 The FreeBSD Foundation
 *
 * This software was developed by Mark Johnston under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/queue.h>

#include <assert.h>
#include <ctype.h>
#include <fcntl.h>
#include <stdalign.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <util.h>

#include "makefs.h"
#include "zfs.h"

#define	VDEV_LABEL_SPACE	\
	((off_t)(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE))
_Static_assert(VDEV_LABEL_SPACE <= MINDEVSIZE, "");

#define	MINMSSIZE	((off_t)1 << 24)	/* 16MB */
#define	DFLTMSSIZE	((off_t)1 << 29)	/* 512MB */
#define	MAXMSSIZE	((off_t)1 << 34)	/* 16GB */

#define	INDIR_LEVELS	6
/* Indirect blocks are always 128KB. */
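/* With 128-byte block pointers, that is 1024 blkptrs per indirect block. */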
#define	BLKPTR_PER_INDIR	(MAXBLOCKSIZE / sizeof(blkptr_t))

struct dnode_cursor {
	char		inddir[INDIR_LEVELS][MAXBLOCKSIZE];
	off_t		indloc;
	off_t		indspace;
	dnode_phys_t	*dnode;
	off_t		dataoff;
	off_t		datablksz;
};

void
zfs_prep_opts(fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	size_t align;

	align = alignof(uint64_t);
	zfs = aligned_alloc(align, roundup2(sizeof(*zfs), align));
	if (zfs == NULL)
		err(1, "aligned_alloc");
	memset(zfs, 0, sizeof(*zfs));

	const option_t zfs_options[] = {
		{ '\0', "bootfs", &zfs->bootfs, OPT_STRPTR,
		  0, 0, "Bootable dataset" },
		{ '\0', "mssize", &zfs->mssize, OPT_INT64,
		  MINMSSIZE, MAXMSSIZE, "Metaslab size" },
		{ '\0', "poolname", &zfs->poolname, OPT_STRPTR,
		  0, 0, "ZFS pool name" },
		{ '\0', "rootpath", &zfs->rootpath, OPT_STRPTR,
		  0, 0, "Prefix for all dataset mount points" },
		{ '\0', "ashift", &zfs->ashift, OPT_INT32,
		  MINBLOCKSHIFT, MAXBLOCKSHIFT, "ZFS pool ashift" },
		{ '\0', "nowarn", &zfs->nowarn, OPT_BOOL,
		  0, 0, "Suppress warning about experimental ZFS support" },
		{ .name = NULL }
	};

	STAILQ_INIT(&zfs->datasetdescs);

	fsopts->fs_specific = zfs;
	fsopts->fs_options = copy_opts(zfs_options);
}

int
zfs_parse_opts(const char *option, fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	struct dataset_desc *dsdesc;
	char buf[BUFSIZ], *opt, *val;
	int rv;

	zfs = fsopts->fs_specific;

	opt = val = estrdup(option);
	opt = strsep(&val, "=");
	if (strcmp(opt, "fs") == 0) {
		if (val == NULL)
			errx(1, "invalid filesystem parameters `%s'", option);

		/*
		 * Dataset descriptions will be parsed later, in dsl_init().
		 * Just stash them away for now.
		 */
		dsdesc = ecalloc(1, sizeof(*dsdesc));
		dsdesc->params = estrdup(val);
		free(opt);
		STAILQ_INSERT_TAIL(&zfs->datasetdescs, dsdesc, next);
		return (1);
	}
	free(opt);

	rv = set_option(fsopts->fs_options, option, buf, sizeof(buf));
	return (rv == -1 ? 0 : 1);
}

static void
zfs_size_vdev(fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	off_t asize, mssize, vdevsize, vdevsize1;

	zfs = fsopts->fs_specific;

	assert(fsopts->maxsize != 0);
	assert(zfs->ashift != 0);

	/*
	 * Figure out how big the vdev should be.
	 */
	vdevsize = rounddown2(fsopts->maxsize, 1 << zfs->ashift);
	if (vdevsize < MINDEVSIZE)
		errx(1, "maximum image size is too small");
	if (vdevsize < fsopts->minsize || vdevsize > fsopts->maxsize) {
		errx(1, "image size bounds must be multiples of %d",
		    1 << zfs->ashift);
	}
	asize = vdevsize - VDEV_LABEL_SPACE;

	/*
	 * Size metaslabs according to the following heuristic:
	 * - provide at least 8 metaslabs,
	 * - without using a metaslab size larger than 512MB.
	 * This approximates what OpenZFS does without being complicated.  In
	 * practice we expect pools to be expanded upon first use, and OpenZFS
	 * does not resize metaslabs in that case, so there is no right answer
	 * here.  In general we want to provide large metaslabs even if the
	 * image size is small, and 512MB is a reasonable size for pools up to
	 * several hundred gigabytes.
	 *
	 * The user may override this heuristic using the "-o mssize" option.
	 */
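	/*
	 * As a rough example: a 4GB image gives an asize/8 of just under
	 * 512MB, which rounds down to a 256MB metaslab size and therefore
	 * about 15 metaslabs.
	 */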
	mssize = zfs->mssize;
	if (mssize == 0) {
		mssize = MAX(MIN(asize / 8, DFLTMSSIZE), MINMSSIZE);
		if (!powerof2(mssize))
			mssize = 1l << (flsll(mssize) - 1);
	}
	if (!powerof2(mssize))
		errx(1, "metaslab size must be a power of 2");

	/*
	 * If we have some slop left over, try to cover it by resizing the
	 * vdev, subject to the maxsize and minsize parameters.
	 */
	if (asize % mssize != 0) {
		vdevsize1 = rounddown2(asize, mssize) + VDEV_LABEL_SPACE;
		if (vdevsize1 < fsopts->minsize)
			vdevsize1 = roundup2(asize, mssize) + VDEV_LABEL_SPACE;
		if (vdevsize1 <= fsopts->maxsize)
			vdevsize = vdevsize1;
	}
	asize = vdevsize - VDEV_LABEL_SPACE;

	zfs->asize = asize;
	zfs->vdevsize = vdevsize;
	zfs->mssize = mssize;
	zfs->msshift = flsll(mssize) - 1;
	zfs->mscount = asize / mssize;
}

/*
 * Validate options and set some default values.
 */
static void
zfs_check_opts(fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;

	zfs = fsopts->fs_specific;

	if (fsopts->offset != 0)
		errx(1, "unhandled offset option");
	if (fsopts->maxsize == 0)
		errx(1, "an image size must be specified");

	if (zfs->poolname == NULL)
		errx(1, "a pool name must be specified");
	if (!isalpha(zfs->poolname[0]))
		errx(1, "the pool name must begin with a letter");
	for (size_t i = 0, len = strlen(zfs->poolname); i < len; i++) {
		if (!isalnum(zfs->poolname[i]) && zfs->poolname[i] != '_')
			errx(1, "invalid character '%c' in pool name",
			    zfs->poolname[i]);
	}
	if (strcmp(zfs->poolname, "mirror") == 0 ||
	    strcmp(zfs->poolname, "raidz") == 0 ||
	    strcmp(zfs->poolname, "draid") == 0) {
		errx(1, "pool name '%s' is reserved and cannot be used",
		    zfs->poolname);
	}

	if (zfs->rootpath == NULL)
		easprintf(&zfs->rootpath, "/%s", zfs->poolname);
	if (zfs->rootpath[0] != '/')
		errx(1, "mountpoint `%s' must be absolute", zfs->rootpath);

	if (zfs->ashift == 0)
		zfs->ashift = 12;

	zfs_size_vdev(fsopts);
}

void
zfs_cleanup_opts(fsinfo_t *fsopts)
{
	struct dataset_desc *d, *tmp;
	zfs_opt_t *zfs;

	zfs = fsopts->fs_specific;
	free(zfs->rootpath);
	free(zfs->bootfs);
	free(__DECONST(void *, zfs->poolname));
	STAILQ_FOREACH_SAFE(d, &zfs->datasetdescs, next, tmp) {
		free(d->params);
		free(d);
	}
	free(zfs);
	free(fsopts->fs_options);
}

static size_t
nvlist_size(const nvlist_t *nvl)
{
	return (sizeof(nvl->nv_header) + nvl->nv_size);
}

static void
nvlist_copy(const nvlist_t *nvl, char *buf, size_t sz)
{
	assert(sz >= nvlist_size(nvl));

	memcpy(buf, &nvl->nv_header, sizeof(nvl->nv_header));
	memcpy(buf + sizeof(nvl->nv_header), nvl->nv_data, nvl->nv_size);
}

/*
 * Generate a random GUID.  Avoid returning 0, since something might interpret
 * that as meaning that the GUID is uninitialized.
 */
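/*
 * Note that random(3) returns 31 bits per call, so bits 31 and 63 of the
 * result are always clear; that is fine for makefs's purposes.
 */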
280 */ 281 uint64_t 282 randomguid(void) 283 { 284 uint64_t ret; 285 286 do { 287 ret = ((uint64_t)random() << 32) | random(); 288 } while (ret == 0); 289 290 return (ret); 291 } 292 293 static nvlist_t * 294 pool_config_nvcreate(zfs_opt_t *zfs) 295 { 296 nvlist_t *featuresnv, *poolnv; 297 298 poolnv = nvlist_create(NV_UNIQUE_NAME); 299 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_TXG, TXG); 300 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VERSION, SPA_VERSION); 301 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_EXPORTED); 302 nvlist_add_string(poolnv, ZPOOL_CONFIG_POOL_NAME, zfs->poolname); 303 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_GUID, zfs->poolguid); 304 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_TOP_GUID, zfs->vdevguid); 305 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_GUID, zfs->vdevguid); 306 nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VDEV_CHILDREN, 1); 307 308 featuresnv = nvlist_create(NV_UNIQUE_NAME); 309 nvlist_add_nvlist(poolnv, ZPOOL_CONFIG_FEATURES_FOR_READ, featuresnv); 310 nvlist_destroy(featuresnv); 311 312 return (poolnv); 313 } 314 315 static nvlist_t * 316 pool_disk_vdev_config_nvcreate(zfs_opt_t *zfs) 317 { 318 nvlist_t *diskvdevnv; 319 320 assert(zfs->objarrid != 0); 321 322 diskvdevnv = nvlist_create(NV_UNIQUE_NAME); 323 nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK); 324 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASHIFT, zfs->ashift); 325 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASIZE, zfs->asize); 326 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_GUID, zfs->vdevguid); 327 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ID, 0); 328 nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_PATH, "/dev/null"); 329 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_WHOLE_DISK, 1); 330 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG); 331 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_ARRAY, 332 zfs->objarrid); 333 nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_SHIFT, 334 zfs->msshift); 335 336 return (diskvdevnv); 337 } 338 339 static nvlist_t * 340 pool_root_vdev_config_nvcreate(zfs_opt_t *zfs) 341 { 342 nvlist_t *diskvdevnv, *rootvdevnv; 343 344 diskvdevnv = pool_disk_vdev_config_nvcreate(zfs); 345 rootvdevnv = nvlist_create(NV_UNIQUE_NAME); 346 347 nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_ID, 0); 348 nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_GUID, zfs->poolguid); 349 nvlist_add_string(rootvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 350 nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG); 351 nvlist_add_nvlist_array(rootvdevnv, ZPOOL_CONFIG_CHILDREN, &diskvdevnv, 352 1); 353 nvlist_destroy(diskvdevnv); 354 355 return (rootvdevnv); 356 } 357 358 /* 359 * Create the pool's "config" object, which contains an nvlist describing pool 360 * parameters and the vdev topology. It is similar but not identical to the 361 * nvlist stored in vdev labels. The main difference is that vdev labels do not 362 * describe the full vdev tree and in particular do not contain the "root" 363 * meta-vdev. 
364 */ 365 static void 366 pool_init_objdir_config(zfs_opt_t *zfs, zfs_zap_t *objdir) 367 { 368 dnode_phys_t *dnode; 369 nvlist_t *poolconfig, *vdevconfig; 370 void *configbuf; 371 uint64_t dnid; 372 off_t configloc, configblksz; 373 int error; 374 375 dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_PACKED_NVLIST, 376 DMU_OT_PACKED_NVLIST_SIZE, sizeof(uint64_t), &dnid); 377 378 poolconfig = pool_config_nvcreate(zfs); 379 380 vdevconfig = pool_root_vdev_config_nvcreate(zfs); 381 nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig); 382 nvlist_destroy(vdevconfig); 383 384 error = nvlist_export(poolconfig); 385 if (error != 0) 386 errc(1, error, "nvlist_export"); 387 388 configblksz = nvlist_size(poolconfig); 389 configloc = objset_space_alloc(zfs, zfs->mos, &configblksz); 390 configbuf = ecalloc(1, configblksz); 391 nvlist_copy(poolconfig, configbuf, configblksz); 392 393 vdev_pwrite_dnode_data(zfs, dnode, configbuf, configblksz, configloc); 394 395 dnode->dn_datablkszsec = configblksz >> MINBLOCKSHIFT; 396 dnode->dn_flags = DNODE_FLAG_USED_BYTES; 397 *(uint64_t *)DN_BONUS(dnode) = nvlist_size(poolconfig); 398 399 zap_add_uint64(objdir, DMU_POOL_CONFIG, dnid); 400 401 nvlist_destroy(poolconfig); 402 free(configbuf); 403 } 404 405 /* 406 * Add objects block pointer list objects, used for deferred frees. We don't do 407 * anything with them, but they need to be present or OpenZFS will refuse to 408 * import the pool. 409 */ 410 static void 411 pool_init_objdir_bplists(zfs_opt_t *zfs __unused, zfs_zap_t *objdir) 412 { 413 uint64_t dnid; 414 415 (void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR, 416 BPOBJ_SIZE_V2, &dnid); 417 zap_add_uint64(objdir, DMU_POOL_FREE_BPOBJ, dnid); 418 419 (void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR, 420 BPOBJ_SIZE_V2, &dnid); 421 zap_add_uint64(objdir, DMU_POOL_SYNC_BPLIST, dnid); 422 } 423 424 /* 425 * Add required feature metadata objects. We don't know anything about ZFS 426 * features, so the objects are just empty ZAPs. 427 */ 428 static void 429 pool_init_objdir_feature_maps(zfs_opt_t *zfs, zfs_zap_t *objdir) 430 { 431 dnode_phys_t *dnode; 432 uint64_t dnid; 433 434 dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid); 435 zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_READ, dnid); 436 zap_write(zfs, zap_alloc(zfs->mos, dnode)); 437 438 dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid); 439 zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_WRITE, dnid); 440 zap_write(zfs, zap_alloc(zfs->mos, dnode)); 441 442 dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid); 443 zap_add_uint64(objdir, DMU_POOL_FEATURE_DESCRIPTIONS, dnid); 444 zap_write(zfs, zap_alloc(zfs->mos, dnode)); 445 } 446 447 static void 448 pool_init_objdir_dsl(zfs_opt_t *zfs, zfs_zap_t *objdir) 449 { 450 zap_add_uint64(objdir, DMU_POOL_ROOT_DATASET, 451 dsl_dir_id(zfs->rootdsldir)); 452 } 453 454 static void 455 pool_init_objdir_poolprops(zfs_opt_t *zfs, zfs_zap_t *objdir) 456 { 457 dnode_phys_t *dnode; 458 uint64_t id; 459 460 dnode = objset_dnode_alloc(zfs->mos, DMU_OT_POOL_PROPS, &id); 461 zap_add_uint64(objdir, DMU_POOL_PROPS, id); 462 463 zfs->poolprops = zap_alloc(zfs->mos, dnode); 464 } 465 466 /* 467 * Initialize the MOS object directory, the root of virtually all of the pool's 468 * data and metadata. 
469 */ 470 static void 471 pool_init_objdir(zfs_opt_t *zfs) 472 { 473 zfs_zap_t *zap; 474 dnode_phys_t *objdir; 475 476 objdir = objset_dnode_lookup(zfs->mos, DMU_POOL_DIRECTORY_OBJECT); 477 478 zap = zap_alloc(zfs->mos, objdir); 479 pool_init_objdir_config(zfs, zap); 480 pool_init_objdir_bplists(zfs, zap); 481 pool_init_objdir_feature_maps(zfs, zap); 482 pool_init_objdir_dsl(zfs, zap); 483 pool_init_objdir_poolprops(zfs, zap); 484 zap_write(zfs, zap); 485 } 486 487 /* 488 * Initialize the meta-object set (MOS) and immediately write out several 489 * special objects whose contents are already finalized, including the object 490 * directory. 491 * 492 * Once the MOS is finalized, it'll look roughly like this: 493 * 494 * object directory (ZAP) 495 * |-> vdev config object (nvlist) 496 * |-> features for read 497 * |-> features for write 498 * |-> feature descriptions 499 * |-> sync bplist 500 * |-> free bplist 501 * |-> pool properties 502 * L-> root DSL directory 503 * |-> DSL child directory (ZAP) 504 * | |-> $MOS (DSL dir) 505 * | | |-> child map 506 * | | L-> props (ZAP) 507 * | |-> $FREE (DSL dir) 508 * | | |-> child map 509 * | | L-> props (ZAP) 510 * | |-> $ORIGIN (DSL dir) 511 * | | |-> child map 512 * | | |-> dataset 513 * | | | L-> deadlist 514 * | | |-> snapshot 515 * | | | |-> deadlist 516 * | | | L-> snapshot names 517 * | | |-> props (ZAP) 518 * | | L-> clones (ZAP) 519 * | |-> dataset 1 (DSL dir) 520 * | | |-> DSL dataset 521 * | | | |-> snapshot names 522 * | | | L-> deadlist 523 * | | |-> child map 524 * | | | L-> ... 525 * | | L-> props 526 * | |-> dataset 2 527 * | | L-> ... 528 * | |-> ... 529 * | L-> dataset n 530 * |-> DSL root dataset 531 * | |-> snapshot names 532 * | L-> deadlist 533 * L-> props (ZAP) 534 * space map object array 535 * |-> space map 1 536 * |-> space map 2 537 * |-> ... 538 * L-> space map n (zfs->mscount) 539 * 540 * The space map object array is pointed to by the "msarray" property in the 541 * pool configuration. 542 */ 543 static void 544 pool_init(zfs_opt_t *zfs) 545 { 546 uint64_t dnid; 547 548 zfs->poolguid = randomguid(); 549 zfs->vdevguid = randomguid(); 550 551 zfs->mos = objset_alloc(zfs, DMU_OST_META); 552 553 (void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_DIRECTORY, &dnid); 554 assert(dnid == DMU_POOL_DIRECTORY_OBJECT); 555 556 (void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_ARRAY, &zfs->objarrid); 557 558 dsl_init(zfs); 559 560 pool_init_objdir(zfs); 561 } 562 563 static void 564 pool_labels_write(zfs_opt_t *zfs) 565 { 566 uberblock_t *ub; 567 vdev_label_t *label; 568 nvlist_t *poolconfig, *vdevconfig; 569 int error; 570 571 label = ecalloc(1, sizeof(*label)); 572 573 /* 574 * Assemble the vdev configuration and store it in the label. 575 */ 576 poolconfig = pool_config_nvcreate(zfs); 577 vdevconfig = pool_disk_vdev_config_nvcreate(zfs); 578 nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig); 579 nvlist_destroy(vdevconfig); 580 581 error = nvlist_export(poolconfig); 582 if (error != 0) 583 errc(1, error, "nvlist_export"); 584 nvlist_copy(poolconfig, label->vl_vdev_phys.vp_nvlist, 585 sizeof(label->vl_vdev_phys.vp_nvlist)); 586 nvlist_destroy(poolconfig); 587 588 /* 589 * Fill out the uberblock. Just make each one the same. The embedded 590 * checksum is calculated in vdev_label_write(). 
591 */ 592 for (size_t uoff = 0; uoff < sizeof(label->vl_uberblock); 593 uoff += (1 << zfs->ashift)) { 594 ub = (uberblock_t *)(&label->vl_uberblock[0] + uoff); 595 ub->ub_magic = UBERBLOCK_MAGIC; 596 ub->ub_version = SPA_VERSION; 597 ub->ub_txg = TXG; 598 ub->ub_guid_sum = zfs->poolguid + zfs->vdevguid; 599 ub->ub_timestamp = 0; 600 601 ub->ub_software_version = SPA_VERSION; 602 ub->ub_mmp_magic = MMP_MAGIC; 603 ub->ub_mmp_delay = 0; 604 ub->ub_mmp_config = 0; 605 ub->ub_checkpoint_txg = 0; 606 objset_root_blkptr_copy(zfs->mos, &ub->ub_rootbp); 607 } 608 609 /* 610 * Write out four copies of the label: two at the beginning of the vdev 611 * and two at the end. 612 */ 613 for (int i = 0; i < VDEV_LABELS; i++) 614 vdev_label_write(zfs, i, label); 615 616 free(label); 617 } 618 619 static void 620 pool_fini(zfs_opt_t *zfs) 621 { 622 zap_write(zfs, zfs->poolprops); 623 dsl_write(zfs); 624 objset_write(zfs, zfs->mos); 625 pool_labels_write(zfs); 626 } 627 628 struct dnode_cursor * 629 dnode_cursor_init(zfs_opt_t *zfs, zfs_objset_t *os, dnode_phys_t *dnode, 630 off_t size, off_t blksz) 631 { 632 struct dnode_cursor *c; 633 uint64_t nbppindir, indlevel, ndatablks, nindblks; 634 635 assert(dnode->dn_nblkptr == 1); 636 assert(blksz <= MAXBLOCKSIZE); 637 638 if (blksz == 0) { 639 /* Must be between 1<<ashift and 128KB. */ 640 blksz = MIN(MAXBLOCKSIZE, MAX(1 << zfs->ashift, 641 powerof2(size) ? size : (1l << flsll(size)))); 642 } 643 assert(powerof2(blksz)); 644 645 /* 646 * Do we need indirect blocks? Figure out how many levels are needed 647 * (indlevel == 1 means no indirect blocks) and how much space is needed 648 * (it has to be allocated up-front to break the dependency cycle 649 * described in objset_write()). 650 */ 651 ndatablks = size == 0 ? 0 : howmany(size, blksz); 652 nindblks = 0; 653 for (indlevel = 1, nbppindir = 1; ndatablks > nbppindir; indlevel++) { 654 nbppindir *= BLKPTR_PER_INDIR; 655 nindblks += howmany(ndatablks, indlevel * nbppindir); 656 } 657 assert(indlevel < INDIR_LEVELS); 658 659 dnode->dn_nlevels = (uint8_t)indlevel; 660 dnode->dn_maxblkid = ndatablks > 0 ? ndatablks - 1 : 0; 661 dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT; 662 663 c = ecalloc(1, sizeof(*c)); 664 if (nindblks > 0) { 665 c->indspace = nindblks * MAXBLOCKSIZE; 666 c->indloc = objset_space_alloc(zfs, os, &c->indspace); 667 } 668 c->dnode = dnode; 669 c->dataoff = 0; 670 c->datablksz = blksz; 671 672 return (c); 673 } 674 675 static void 676 _dnode_cursor_flush(zfs_opt_t *zfs, struct dnode_cursor *c, unsigned int levels) 677 { 678 blkptr_t *bp, *pbp; 679 void *buf; 680 uint64_t fill; 681 off_t blkid, blksz, loc; 682 683 assert(levels > 0); 684 assert(levels <= c->dnode->dn_nlevels - 1U); 685 686 blksz = MAXBLOCKSIZE; 687 blkid = (c->dataoff / c->datablksz) / BLKPTR_PER_INDIR; 688 for (unsigned int level = 1; level <= levels; level++) { 689 buf = c->inddir[level - 1]; 690 691 if (level == c->dnode->dn_nlevels - 1U) { 692 pbp = &c->dnode->dn_blkptr[0]; 693 } else { 694 uint64_t iblkid; 695 696 iblkid = blkid & (BLKPTR_PER_INDIR - 1); 697 pbp = (blkptr_t *) 698 &c->inddir[level][iblkid * sizeof(blkptr_t)]; 699 } 700 701 /* 702 * Space for indirect blocks is allocated up-front; see the 703 * comment in objset_write(). 
704 */ 705 loc = c->indloc; 706 c->indloc += blksz; 707 assert(c->indspace >= blksz); 708 c->indspace -= blksz; 709 710 bp = buf; 711 fill = 0; 712 for (size_t i = 0; i < BLKPTR_PER_INDIR; i++) 713 fill += BP_GET_FILL(&bp[i]); 714 715 vdev_pwrite_dnode_indir(zfs, c->dnode, level, fill, buf, blksz, 716 loc, pbp); 717 memset(buf, 0, MAXBLOCKSIZE); 718 719 blkid /= BLKPTR_PER_INDIR; 720 } 721 } 722 723 blkptr_t * 724 dnode_cursor_next(zfs_opt_t *zfs, struct dnode_cursor *c, off_t off) 725 { 726 off_t blkid, l1id; 727 unsigned int levels; 728 729 if (c->dnode->dn_nlevels == 1) { 730 assert(off < MAXBLOCKSIZE); 731 return (&c->dnode->dn_blkptr[0]); 732 } 733 734 assert(off % c->datablksz == 0); 735 736 /* Do we need to flush any full indirect blocks? */ 737 if (off > 0) { 738 blkid = off / c->datablksz; 739 for (levels = 0; levels < c->dnode->dn_nlevels - 1U; levels++) { 740 if (blkid % BLKPTR_PER_INDIR != 0) 741 break; 742 blkid /= BLKPTR_PER_INDIR; 743 } 744 if (levels > 0) 745 _dnode_cursor_flush(zfs, c, levels); 746 } 747 748 c->dataoff = off; 749 l1id = (off / c->datablksz) & (BLKPTR_PER_INDIR - 1); 750 return ((blkptr_t *)&c->inddir[0][l1id * sizeof(blkptr_t)]); 751 } 752 753 void 754 dnode_cursor_finish(zfs_opt_t *zfs, struct dnode_cursor *c) 755 { 756 unsigned int levels; 757 758 assert(c->dnode->dn_nlevels > 0); 759 levels = c->dnode->dn_nlevels - 1; 760 if (levels > 0) 761 _dnode_cursor_flush(zfs, c, levels); 762 assert(c->indspace == 0); 763 free(c); 764 } 765 766 void 767 zfs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts) 768 { 769 zfs_opt_t *zfs; 770 int dirfd; 771 772 zfs = fsopts->fs_specific; 773 774 /* 775 * Use a fixed seed to provide reproducible pseudo-random numbers for 776 * on-disk structures when needed (e.g., GUIDs, ZAP hash salts). 777 */ 778 srandom(1729); 779 780 zfs_check_opts(fsopts); 781 782 if (!zfs->nowarn) { 783 fprintf(stderr, 784 "ZFS support is currently considered experimental. " 785 "Do not use it for anything critical.\n"); 786 } 787 788 dirfd = open(dir, O_DIRECTORY | O_RDONLY); 789 if (dirfd < 0) 790 err(1, "open(%s)", dir); 791 792 vdev_init(zfs, image); 793 pool_init(zfs); 794 fs_build(zfs, dirfd, root); 795 pool_fini(zfs); 796 vdev_fini(zfs); 797 } 798