1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2022 The FreeBSD Foundation 5 * 6 * This software was developed by Mark Johnston under sponsorship from 7 * the FreeBSD Foundation. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions are 11 * met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 31 #include <assert.h> 32 #include <fcntl.h> 33 #include <stdlib.h> 34 #include <string.h> 35 #include <unistd.h> 36 37 #include <util.h> 38 39 #include "zfs.h" 40 41 #pragma GCC diagnostic push 42 #pragma GCC diagnostic ignored "-Wunused-function" 43 #include "zfs/fletcher.c" 44 #include "zfs/sha256.c" 45 #pragma GCC diagnostic pop 46 47 static void 48 blkptr_set(blkptr_t *bp, off_t off, off_t size, uint8_t dntype, uint8_t level, 49 uint64_t fill, enum zio_checksum cksumt, zio_cksum_t *cksum) 50 { 51 dva_t *dva; 52 53 assert(powerof2(size)); 54 55 BP_ZERO(bp); 56 BP_SET_LSIZE(bp, size); 57 BP_SET_PSIZE(bp, size); 58 BP_SET_CHECKSUM(bp, cksumt); 59 BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 60 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 61 BP_SET_BIRTH(bp, TXG, TXG); 62 BP_SET_LEVEL(bp, level); 63 BP_SET_FILL(bp, fill); 64 BP_SET_TYPE(bp, dntype); 65 66 dva = BP_IDENTITY(bp); 67 DVA_SET_VDEV(dva, 0); 68 DVA_SET_OFFSET(dva, off); 69 DVA_SET_ASIZE(dva, size); 70 memcpy(&bp->blk_cksum, cksum, sizeof(*cksum)); 71 } 72 73 /* 74 * Write a block of data to the vdev. The offset is always relative to the end 75 * of the second leading vdev label. 76 * 77 * Consumers should generally use the helpers below, which provide block 78 * pointers and update dnode accounting, rather than calling this function 79 * directly. 80 */ 81 static void 82 vdev_pwrite(const zfs_opt_t *zfs, const void *buf, size_t len, off_t off) 83 { 84 ssize_t n; 85 86 assert(off >= 0 && off < zfs->asize); 87 assert(powerof2(len)); 88 assert((off_t)len > 0 && off + (off_t)len > off && 89 off + (off_t)len < zfs->asize); 90 if (zfs->spacemap != NULL) { 91 /* 92 * Verify that the blocks being written were in fact allocated. 93 * 94 * The space map isn't available once the on-disk space map is 95 * finalized, so this check doesn't quite catch everything. 96 */ 97 assert(bit_ntest(zfs->spacemap, off >> zfs->ashift, 98 (off + len - 1) >> zfs->ashift, 1)); 99 } 100 101 off += VDEV_LABEL_START_SIZE; 102 for (size_t sofar = 0; sofar < len; sofar += n) { 103 n = pwrite(zfs->fd, (const char *)buf + sofar, len - sofar, 104 off + sofar); 105 if (n < 0) 106 err(1, "pwrite"); 107 assert(n > 0); 108 } 109 } 110 111 void 112 vdev_pwrite_data(zfs_opt_t *zfs, uint8_t datatype, uint8_t cksumtype, 113 uint8_t level, uint64_t fill, const void *data, off_t sz, off_t loc, 114 blkptr_t *bp) 115 { 116 zio_cksum_t cksum; 117 118 assert(cksumtype == ZIO_CHECKSUM_FLETCHER_4); 119 120 fletcher_4_native(data, sz, NULL, &cksum); 121 blkptr_set(bp, loc, sz, datatype, level, fill, cksumtype, &cksum); 122 vdev_pwrite(zfs, data, sz, loc); 123 } 124 125 void 126 vdev_pwrite_dnode_indir(zfs_opt_t *zfs, dnode_phys_t *dnode, uint8_t level, 127 uint64_t fill, const void *data, off_t sz, off_t loc, blkptr_t *bp) 128 { 129 vdev_pwrite_data(zfs, dnode->dn_type, dnode->dn_checksum, level, fill, 130 data, sz, loc, bp); 131 132 assert((dnode->dn_flags & DNODE_FLAG_USED_BYTES) != 0); 133 dnode->dn_used += sz; 134 } 135 136 void 137 vdev_pwrite_dnode_data(zfs_opt_t *zfs, dnode_phys_t *dnode, const void *data, 138 off_t sz, off_t loc) 139 { 140 vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, data, sz, loc, 141 &dnode->dn_blkptr[0]); 142 } 143 144 static void 145 vdev_label_set_checksum(void *buf, off_t off, off_t size) 146 { 147 zio_cksum_t cksum; 148 zio_eck_t *eck; 149 150 assert(size > 0 && (size_t)size >= sizeof(zio_eck_t)); 151 152 eck = (zio_eck_t *)((char *)buf + size) - 1; 153 eck->zec_magic = ZEC_MAGIC; 154 ZIO_SET_CHECKSUM(&eck->zec_cksum, off, 0, 0, 0); 155 zio_checksum_SHA256(buf, size, NULL, &cksum); 156 eck->zec_cksum = cksum; 157 } 158 159 /* 160 * Set embedded checksums and write the label at the specified index. 161 */ 162 void 163 vdev_label_write(zfs_opt_t *zfs, int ind, const vdev_label_t *labelp) 164 { 165 vdev_label_t *label; 166 ssize_t n; 167 off_t blksz, loff; 168 169 assert(ind >= 0 && ind < VDEV_LABELS); 170 171 /* 172 * Make a copy since we have to modify the label to set checksums. 173 */ 174 label = ecalloc(1, sizeof(*label)); 175 memcpy(label, labelp, sizeof(*label)); 176 177 if (ind < 2) 178 loff = ind * sizeof(*label); 179 else 180 loff = zfs->vdevsize - (VDEV_LABELS - ind) * sizeof(*label); 181 182 /* 183 * Set the verifier checksum for the boot block. We don't use it, but 184 * the FreeBSD loader reads it and will complain if the checksum isn't 185 * valid. 186 */ 187 vdev_label_set_checksum(&label->vl_be, 188 loff + __offsetof(vdev_label_t, vl_be), sizeof(label->vl_be)); 189 190 /* 191 * Set the verifier checksum for the label. 192 */ 193 vdev_label_set_checksum(&label->vl_vdev_phys, 194 loff + __offsetof(vdev_label_t, vl_vdev_phys), 195 sizeof(label->vl_vdev_phys)); 196 197 /* 198 * Set the verifier checksum for the uberblocks. There is one uberblock 199 * per sector; for example, with an ashift of 12 we end up with 200 * 128KB/4KB=32 copies of the uberblock in the ring. 201 */ 202 blksz = 1 << zfs->ashift; 203 assert(sizeof(label->vl_uberblock) % blksz == 0); 204 for (size_t roff = 0; roff < sizeof(label->vl_uberblock); 205 roff += blksz) { 206 vdev_label_set_checksum(&label->vl_uberblock[0] + roff, 207 loff + __offsetof(vdev_label_t, vl_uberblock) + roff, 208 blksz); 209 } 210 211 n = pwrite(zfs->fd, label, sizeof(*label), loff); 212 if (n < 0) 213 err(1, "writing vdev label"); 214 assert(n == sizeof(*label)); 215 216 free(label); 217 } 218 219 /* 220 * Find a chunk of contiguous free space of length *lenp, according to the 221 * following rules: 222 * 1. If the length is less than or equal to 128KB, the returned run's length 223 * will be the smallest power of 2 equal to or larger than the length. 224 * 2. If the length is larger than 128KB, the returned run's length will be 225 * the smallest multiple of 128KB that is larger than the length. 226 * 3. The returned run's length will be size-aligned up to 128KB. 227 * 228 * XXX-MJ the third rule isn't actually required, so this can just be a dumb 229 * bump allocator. Maybe there's some benefit to keeping large blocks aligned, 230 * so let's keep it for now and hope we don't get too much fragmentation. 231 * Alternately we could try to allocate all blocks of a certain size from the 232 * same metaslab. 233 */ 234 off_t 235 vdev_space_alloc(zfs_opt_t *zfs, off_t *lenp) 236 { 237 off_t len; 238 int align, loc, minblksz, nbits; 239 240 minblksz = 1 << zfs->ashift; 241 len = roundup2(*lenp, minblksz); 242 243 assert(len != 0); 244 assert(len / minblksz <= INT_MAX); 245 246 if (len < MAXBLOCKSIZE) { 247 if ((len & (len - 1)) != 0) 248 len = (off_t)1 << flsll(len); 249 align = len / minblksz; 250 } else { 251 len = roundup2(len, MAXBLOCKSIZE); 252 align = MAXBLOCKSIZE / minblksz; 253 } 254 255 for (loc = 0, nbits = len / minblksz;; loc = roundup2(loc, align)) { 256 bit_ffc_area_at(zfs->spacemap, loc, zfs->spacemapbits, nbits, 257 &loc); 258 if (loc == -1) { 259 errx(1, "failed to find %ju bytes of space", 260 (uintmax_t)len); 261 } 262 if ((loc & (align - 1)) == 0) 263 break; 264 } 265 assert(loc + nbits > loc); 266 bit_nset(zfs->spacemap, loc, loc + nbits - 1); 267 *lenp = len; 268 269 return ((off_t)loc << zfs->ashift); 270 } 271 272 static void 273 vdev_spacemap_init(zfs_opt_t *zfs) 274 { 275 uint64_t nbits; 276 277 assert(powerof2(zfs->mssize)); 278 279 nbits = rounddown2(zfs->asize, zfs->mssize) >> zfs->ashift; 280 if (nbits > INT_MAX) { 281 /* 282 * With the smallest block size of 512B, the limit on the image 283 * size is 2TB. That should be enough for anyone. 284 */ 285 errx(1, "image size is too large"); 286 } 287 zfs->spacemapbits = (int)nbits; 288 zfs->spacemap = bit_alloc(zfs->spacemapbits); 289 if (zfs->spacemap == NULL) 290 err(1, "bitstring allocation failed"); 291 } 292 293 void 294 vdev_spacemap_write(zfs_opt_t *zfs) 295 { 296 dnode_phys_t *objarr; 297 bitstr_t *spacemap; 298 uint64_t *objarrblk; 299 off_t smblksz, objarrblksz, objarrloc; 300 301 struct { 302 dnode_phys_t *dnode; 303 uint64_t dnid; 304 off_t loc; 305 } *sma; 306 307 objarrblksz = sizeof(uint64_t) * zfs->mscount; 308 assert(objarrblksz <= MAXBLOCKSIZE); 309 objarrloc = objset_space_alloc(zfs, zfs->mos, &objarrblksz); 310 objarrblk = ecalloc(1, objarrblksz); 311 312 objarr = objset_dnode_lookup(zfs->mos, zfs->objarrid); 313 objarr->dn_datablkszsec = objarrblksz >> MINBLOCKSHIFT; 314 315 /* 316 * Use the smallest block size for space maps. The space allocation 317 * algorithm should aim to minimize the number of holes. 318 */ 319 smblksz = 1 << zfs->ashift; 320 321 /* 322 * First allocate dnodes and space for all of our space maps. No more 323 * space can be allocated from the vdev after this point. 324 */ 325 sma = ecalloc(zfs->mscount, sizeof(*sma)); 326 for (uint64_t i = 0; i < zfs->mscount; i++) { 327 sma[i].dnode = objset_dnode_bonus_alloc(zfs->mos, 328 DMU_OT_SPACE_MAP, DMU_OT_SPACE_MAP_HEADER, 329 sizeof(space_map_phys_t), &sma[i].dnid); 330 sma[i].loc = objset_space_alloc(zfs, zfs->mos, &smblksz); 331 } 332 spacemap = zfs->spacemap; 333 zfs->spacemap = NULL; 334 335 /* 336 * Now that the set of allocated space is finalized, populate each space 337 * map and write it to the vdev. 338 */ 339 for (uint64_t i = 0; i < zfs->mscount; i++) { 340 space_map_phys_t *sm; 341 uint64_t alloc, length, *smblk; 342 int shift, startb, endb, srunb, erunb; 343 344 /* 345 * We only allocate a single block for this space map, but 346 * OpenZFS assumes that a space map object with sufficient bonus 347 * space supports histograms. 348 */ 349 sma[i].dnode->dn_nblkptr = 3; 350 sma[i].dnode->dn_datablkszsec = smblksz >> MINBLOCKSHIFT; 351 352 smblk = ecalloc(1, smblksz); 353 354 alloc = length = 0; 355 shift = zfs->msshift - zfs->ashift; 356 for (srunb = startb = i * (1 << shift), 357 endb = (i + 1) * (1 << shift); 358 srunb < endb; srunb = erunb) { 359 uint64_t runlen, runoff; 360 361 /* Find a run of allocated space. */ 362 bit_ffs_at(spacemap, srunb, zfs->spacemapbits, &srunb); 363 if (srunb == -1 || srunb >= endb) 364 break; 365 366 bit_ffc_at(spacemap, srunb, zfs->spacemapbits, &erunb); 367 if (erunb == -1 || erunb > endb) 368 erunb = endb; 369 370 /* 371 * The space represented by [srunb, erunb) has been 372 * allocated. Add a record to the space map to indicate 373 * this. Run offsets are relative to the beginning of 374 * the metaslab. 375 */ 376 runlen = erunb - srunb; 377 runoff = srunb - startb; 378 379 assert(length * sizeof(uint64_t) < (uint64_t)smblksz); 380 smblk[length] = SM_PREFIX_ENCODE(SM2_PREFIX) | 381 SM2_RUN_ENCODE(runlen) | SM2_VDEV_ENCODE(0); 382 smblk[length + 1] = SM2_TYPE_ENCODE(SM_ALLOC) | 383 SM2_OFFSET_ENCODE(runoff); 384 385 alloc += runlen << zfs->ashift; 386 length += 2; 387 } 388 389 sm = DN_BONUS(sma[i].dnode); 390 sm->smp_length = length * sizeof(uint64_t); 391 sm->smp_alloc = alloc; 392 393 vdev_pwrite_dnode_data(zfs, sma[i].dnode, smblk, smblksz, 394 sma[i].loc); 395 free(smblk); 396 397 /* Record this space map in the space map object array. */ 398 objarrblk[i] = sma[i].dnid; 399 } 400 401 /* 402 * All of the space maps are written, now write the object array. 403 */ 404 vdev_pwrite_dnode_data(zfs, objarr, objarrblk, objarrblksz, objarrloc); 405 free(objarrblk); 406 407 assert(zfs->spacemap == NULL); 408 free(spacemap); 409 free(sma); 410 } 411 412 void 413 vdev_init(zfs_opt_t *zfs, const char *image) 414 { 415 assert(zfs->ashift >= MINBLOCKSHIFT); 416 417 zfs->fd = open(image, O_RDWR | O_CREAT | O_TRUNC, 0644); 418 if (zfs->fd == -1) 419 err(1, "Can't open `%s' for writing", image); 420 if (ftruncate(zfs->fd, zfs->vdevsize) != 0) 421 err(1, "Failed to extend image file `%s'", image); 422 423 vdev_spacemap_init(zfs); 424 } 425 426 void 427 vdev_fini(zfs_opt_t *zfs) 428 { 429 assert(zfs->spacemap == NULL); 430 431 if (zfs->fd != -1) { 432 if (close(zfs->fd) != 0) 433 err(1, "close"); 434 zfs->fd = -1; 435 } 436 } 437