/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2022 The FreeBSD Foundation
 *
 * This software was developed by Mark Johnston under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
29 */ 30 31 #include <sys/param.h> 32 #include <assert.h> 33 #include <fcntl.h> 34 #include <stdlib.h> 35 #include <string.h> 36 #include <unistd.h> 37 38 #include <util.h> 39 40 #include "zfs.h" 41 42 #pragma GCC diagnostic push 43 #pragma GCC diagnostic ignored "-Wunused-function" 44 #include "zfs/fletcher.c" 45 #include "zfs/sha256.c" 46 #pragma GCC diagnostic pop 47 48 static void 49 blkptr_set(blkptr_t *bp, off_t off, off_t size, uint8_t dntype, uint8_t level, 50 uint64_t fill, enum zio_checksum cksumt, zio_cksum_t *cksum) 51 { 52 dva_t *dva; 53 54 assert(powerof2(size)); 55 56 BP_ZERO(bp); 57 BP_SET_LSIZE(bp, size); 58 BP_SET_PSIZE(bp, size); 59 BP_SET_CHECKSUM(bp, cksumt); 60 BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 61 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 62 BP_SET_BIRTH(bp, TXG, TXG); 63 BP_SET_LEVEL(bp, level); 64 BP_SET_FILL(bp, fill); 65 BP_SET_TYPE(bp, dntype); 66 67 dva = BP_IDENTITY(bp); 68 DVA_SET_VDEV(dva, 0); 69 DVA_SET_OFFSET(dva, off); 70 DVA_SET_ASIZE(dva, size); 71 memcpy(&bp->blk_cksum, cksum, sizeof(*cksum)); 72 } 73 74 /* 75 * Write a block of data to the vdev. The offset is always relative to the end 76 * of the second leading vdev label. 77 * 78 * Consumers should generally use the helpers below, which provide block 79 * pointers and update dnode accounting, rather than calling this function 80 * directly. 81 */ 82 static void 83 vdev_pwrite(const zfs_opt_t *zfs, const void *buf, size_t len, off_t off) 84 { 85 ssize_t n; 86 87 assert(off >= 0 && off < zfs->asize); 88 assert(powerof2(len)); 89 assert((off_t)len > 0 && off + (off_t)len > off && 90 off + (off_t)len < zfs->asize); 91 if (zfs->spacemap != NULL) { 92 /* 93 * Verify that the blocks being written were in fact allocated. 94 * 95 * The space map isn't available once the on-disk space map is 96 * finalized, so this check doesn't quite catch everything. 
97 */ 98 assert(bit_ntest(zfs->spacemap, off >> zfs->ashift, 99 (off + len - 1) >> zfs->ashift, 1)); 100 } 101 102 off += VDEV_LABEL_START_SIZE; 103 for (size_t sofar = 0; sofar < len; sofar += n) { 104 n = pwrite(zfs->fd, (const char *)buf + sofar, len - sofar, 105 off + sofar); 106 if (n < 0) 107 err(1, "pwrite"); 108 assert(n > 0); 109 } 110 } 111 112 void 113 vdev_pwrite_data(zfs_opt_t *zfs, uint8_t datatype, uint8_t cksumtype, 114 uint8_t level, uint64_t fill, const void *data, off_t sz, off_t loc, 115 blkptr_t *bp) 116 { 117 zio_cksum_t cksum; 118 119 assert(cksumtype == ZIO_CHECKSUM_FLETCHER_4); 120 121 fletcher_4_native(data, sz, NULL, &cksum); 122 blkptr_set(bp, loc, sz, datatype, level, fill, cksumtype, &cksum); 123 vdev_pwrite(zfs, data, sz, loc); 124 } 125 126 void 127 vdev_pwrite_dnode_indir(zfs_opt_t *zfs, dnode_phys_t *dnode, uint8_t level, 128 uint64_t fill, const void *data, off_t sz, off_t loc, blkptr_t *bp) 129 { 130 vdev_pwrite_data(zfs, dnode->dn_type, dnode->dn_checksum, level, fill, 131 data, sz, loc, bp); 132 133 assert((dnode->dn_flags & DNODE_FLAG_USED_BYTES) != 0); 134 dnode->dn_used += sz; 135 } 136 137 void 138 vdev_pwrite_dnode_data(zfs_opt_t *zfs, dnode_phys_t *dnode, const void *data, 139 off_t sz, off_t loc) 140 { 141 vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, data, sz, loc, 142 &dnode->dn_blkptr[0]); 143 } 144 145 static void 146 vdev_label_set_checksum(void *buf, off_t off, off_t size) 147 { 148 zio_cksum_t cksum; 149 zio_eck_t *eck; 150 151 assert(size > 0 && (size_t)size >= sizeof(zio_eck_t)); 152 153 eck = (zio_eck_t *)((char *)buf + size) - 1; 154 eck->zec_magic = ZEC_MAGIC; 155 ZIO_SET_CHECKSUM(&eck->zec_cksum, off, 0, 0, 0); 156 zio_checksum_SHA256(buf, size, NULL, &cksum); 157 eck->zec_cksum = cksum; 158 } 159 160 /* 161 * Set embedded checksums and write the label at the specified index. 
162 */ 163 void 164 vdev_label_write(zfs_opt_t *zfs, int ind, const vdev_label_t *labelp) 165 { 166 vdev_label_t *label; 167 ssize_t n; 168 off_t blksz, loff; 169 170 assert(ind >= 0 && ind < VDEV_LABELS); 171 172 /* 173 * Make a copy since we have to modify the label to set checksums. 174 */ 175 label = ecalloc(1, sizeof(*label)); 176 memcpy(label, labelp, sizeof(*label)); 177 178 if (ind < 2) 179 loff = ind * sizeof(*label); 180 else 181 loff = zfs->vdevsize - (VDEV_LABELS - ind) * sizeof(*label); 182 183 /* 184 * Set the verifier checksum for the boot block. We don't use it, but 185 * the FreeBSD loader reads it and will complain if the checksum isn't 186 * valid. 187 */ 188 vdev_label_set_checksum(&label->vl_be, 189 loff + __offsetof(vdev_label_t, vl_be), sizeof(label->vl_be)); 190 191 /* 192 * Set the verifier checksum for the label. 193 */ 194 vdev_label_set_checksum(&label->vl_vdev_phys, 195 loff + __offsetof(vdev_label_t, vl_vdev_phys), 196 sizeof(label->vl_vdev_phys)); 197 198 /* 199 * Set the verifier checksum for the uberblocks. There is one uberblock 200 * per sector; for example, with an ashift of 12 we end up with 201 * 128KB/4KB=32 copies of the uberblock in the ring. 202 */ 203 blksz = ASHIFT_UBERBLOCK_SIZE(zfs->ashift); 204 assert(sizeof(label->vl_uberblock) % blksz == 0); 205 for (size_t roff = 0; roff < sizeof(label->vl_uberblock); 206 roff += blksz) { 207 vdev_label_set_checksum(&label->vl_uberblock[0] + roff, 208 loff + __offsetof(vdev_label_t, vl_uberblock) + roff, 209 blksz); 210 } 211 212 n = pwrite(zfs->fd, label, sizeof(*label), loff); 213 if (n < 0) 214 err(1, "writing vdev label"); 215 assert(n == sizeof(*label)); 216 217 free(label); 218 } 219 220 /* 221 * Find a chunk of contiguous free space of length *lenp, according to the 222 * following rules: 223 * 1. If the length is less than or equal to 128KB, the returned run's length 224 * will be the smallest power of 2 equal to or larger than the length. 225 * 2. 
If the length is larger than 128KB, the returned run's length will be 226 * the smallest multiple of 128KB that is larger than the length. 227 * 3. The returned run's length will be size-aligned up to 128KB. 228 * 229 * XXX-MJ the third rule isn't actually required, so this can just be a dumb 230 * bump allocator. Maybe there's some benefit to keeping large blocks aligned, 231 * so let's keep it for now and hope we don't get too much fragmentation. 232 * Alternately we could try to allocate all blocks of a certain size from the 233 * same metaslab. 234 */ 235 off_t 236 vdev_space_alloc(zfs_opt_t *zfs, off_t *lenp) 237 { 238 off_t len; 239 int align, loc, minblksz, nbits; 240 241 minblksz = 1 << zfs->ashift; 242 len = roundup2(*lenp, minblksz); 243 244 assert(len != 0); 245 assert(len / minblksz <= INT_MAX); 246 247 if (len < MAXBLOCKSIZE) { 248 if ((len & (len - 1)) != 0) 249 len = (off_t)1 << flsll(len); 250 align = len / minblksz; 251 } else { 252 len = roundup2(len, MAXBLOCKSIZE); 253 align = MAXBLOCKSIZE / minblksz; 254 } 255 256 for (loc = 0, nbits = len / minblksz;; loc = roundup2(loc, align)) { 257 bit_ffc_area_at(zfs->spacemap, loc, zfs->spacemapbits, nbits, 258 &loc); 259 if (loc == -1) { 260 errx(1, "failed to find %ju bytes of space", 261 (uintmax_t)len); 262 } 263 if ((loc & (align - 1)) == 0) 264 break; 265 } 266 assert(loc + nbits > loc); 267 bit_nset(zfs->spacemap, loc, loc + nbits - 1); 268 *lenp = len; 269 270 return ((off_t)loc << zfs->ashift); 271 } 272 273 static void 274 vdev_spacemap_init(zfs_opt_t *zfs) 275 { 276 uint64_t nbits; 277 278 assert(powerof2(zfs->mssize)); 279 280 nbits = rounddown2(zfs->asize, zfs->mssize) >> zfs->ashift; 281 if (nbits > INT_MAX) { 282 /* 283 * With the smallest block size of 512B, the limit on the image 284 * size is 2TB. That should be enough for anyone. 
285 */ 286 errx(1, "image size is too large"); 287 } 288 zfs->spacemapbits = (int)nbits; 289 zfs->spacemap = bit_alloc(zfs->spacemapbits); 290 if (zfs->spacemap == NULL) 291 err(1, "bitstring allocation failed"); 292 } 293 294 void 295 vdev_spacemap_write(zfs_opt_t *zfs) 296 { 297 dnode_phys_t *objarr; 298 bitstr_t *spacemap; 299 uint64_t *objarrblk; 300 off_t smblksz, objarrblksz, objarrloc; 301 302 struct { 303 dnode_phys_t *dnode; 304 uint64_t dnid; 305 off_t loc; 306 } *sma; 307 308 objarrblksz = sizeof(uint64_t) * zfs->mscount; 309 assert(objarrblksz <= MAXBLOCKSIZE); 310 objarrloc = objset_space_alloc(zfs, zfs->mos, &objarrblksz); 311 objarrblk = ecalloc(1, objarrblksz); 312 313 objarr = objset_dnode_lookup(zfs->mos, zfs->objarrid); 314 objarr->dn_datablkszsec = objarrblksz >> MINBLOCKSHIFT; 315 316 /* 317 * Use the smallest block size for space maps. The space allocation 318 * algorithm should aim to minimize the number of holes. 319 */ 320 smblksz = 1 << zfs->ashift; 321 322 /* 323 * First allocate dnodes and space for all of our space maps. No more 324 * space can be allocated from the vdev after this point. 325 */ 326 sma = ecalloc(zfs->mscount, sizeof(*sma)); 327 for (uint64_t i = 0; i < zfs->mscount; i++) { 328 sma[i].dnode = objset_dnode_bonus_alloc(zfs->mos, 329 DMU_OT_SPACE_MAP, DMU_OT_SPACE_MAP_HEADER, 330 sizeof(space_map_phys_t), &sma[i].dnid); 331 sma[i].loc = objset_space_alloc(zfs, zfs->mos, &smblksz); 332 } 333 spacemap = zfs->spacemap; 334 zfs->spacemap = NULL; 335 336 /* 337 * Now that the set of allocated space is finalized, populate each space 338 * map and write it to the vdev. 339 */ 340 for (uint64_t i = 0; i < zfs->mscount; i++) { 341 space_map_phys_t *sm; 342 uint64_t alloc, length, *smblk; 343 int shift, startb, endb, srunb, erunb; 344 345 /* 346 * We only allocate a single block for this space map, but 347 * OpenZFS assumes that a space map object with sufficient bonus 348 * space supports histograms. 
349 */ 350 sma[i].dnode->dn_nblkptr = 3; 351 sma[i].dnode->dn_datablkszsec = smblksz >> MINBLOCKSHIFT; 352 353 smblk = ecalloc(1, smblksz); 354 355 alloc = length = 0; 356 shift = zfs->msshift - zfs->ashift; 357 for (srunb = startb = i * (1 << shift), 358 endb = (i + 1) * (1 << shift); 359 srunb < endb; srunb = erunb) { 360 uint64_t runlen, runoff; 361 362 /* Find a run of allocated space. */ 363 bit_ffs_at(spacemap, srunb, zfs->spacemapbits, &srunb); 364 if (srunb == -1 || srunb >= endb) 365 break; 366 367 bit_ffc_at(spacemap, srunb, zfs->spacemapbits, &erunb); 368 if (erunb == -1 || erunb > endb) 369 erunb = endb; 370 371 /* 372 * The space represented by [srunb, erunb) has been 373 * allocated. Add a record to the space map to indicate 374 * this. Run offsets are relative to the beginning of 375 * the metaslab. 376 */ 377 runlen = erunb - srunb; 378 runoff = srunb - startb; 379 380 assert(length * sizeof(uint64_t) < (uint64_t)smblksz); 381 smblk[length] = SM_PREFIX_ENCODE(SM2_PREFIX) | 382 SM2_RUN_ENCODE(runlen) | SM2_VDEV_ENCODE(0); 383 smblk[length + 1] = SM2_TYPE_ENCODE(SM_ALLOC) | 384 SM2_OFFSET_ENCODE(runoff); 385 386 alloc += runlen << zfs->ashift; 387 length += 2; 388 } 389 390 sm = DN_BONUS(sma[i].dnode); 391 sm->smp_length = length * sizeof(uint64_t); 392 sm->smp_alloc = alloc; 393 394 vdev_pwrite_dnode_data(zfs, sma[i].dnode, smblk, smblksz, 395 sma[i].loc); 396 free(smblk); 397 398 /* Record this space map in the space map object array. */ 399 objarrblk[i] = sma[i].dnid; 400 } 401 402 /* 403 * All of the space maps are written, now write the object array. 
404 */ 405 vdev_pwrite_dnode_data(zfs, objarr, objarrblk, objarrblksz, objarrloc); 406 free(objarrblk); 407 408 assert(zfs->spacemap == NULL); 409 free(spacemap); 410 free(sma); 411 } 412 413 void 414 vdev_init(zfs_opt_t *zfs, const char *image) 415 { 416 assert(zfs->ashift >= MINBLOCKSHIFT); 417 418 zfs->fd = open(image, O_RDWR | O_CREAT | O_TRUNC, 0644); 419 if (zfs->fd == -1) 420 err(1, "Can't open `%s' for writing", image); 421 if (ftruncate(zfs->fd, zfs->vdevsize) != 0) 422 err(1, "Failed to extend image file `%s'", image); 423 424 vdev_spacemap_init(zfs); 425 } 426 427 void 428 vdev_fini(zfs_opt_t *zfs) 429 { 430 assert(zfs->spacemap == NULL); 431 432 if (zfs->fd != -1) { 433 if (close(zfs->fd) != 0) 434 err(1, "close"); 435 zfs->fd = -1; 436 } 437 } 438