1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2022 The FreeBSD Foundation 5 * 6 * This software was developed by Mark Johnston under sponsorship from 7 * the FreeBSD Foundation. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions are 11 * met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 31 #include <assert.h> 32 #include <fcntl.h> 33 #include <string.h> 34 #include <unistd.h> 35 36 #include <util.h> 37 38 #include "zfs.h" 39 40 #pragma clang diagnostic push 41 #pragma clang diagnostic ignored "-Wunused-function" 42 #include "zfs/fletcher.c" 43 #include "zfs/sha256.c" 44 #pragma clang diagnostic pop 45 46 static void 47 blkptr_set(blkptr_t *bp, off_t off, off_t size, uint8_t dntype, uint8_t level, 48 uint64_t fill, enum zio_checksum cksumt, zio_cksum_t *cksum) 49 { 50 dva_t *dva; 51 52 assert(powerof2(size)); 53 54 BP_ZERO(bp); 55 BP_SET_LSIZE(bp, size); 56 BP_SET_PSIZE(bp, size); 57 BP_SET_CHECKSUM(bp, cksumt); 58 BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 59 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 60 BP_SET_BIRTH(bp, TXG, TXG); 61 BP_SET_LEVEL(bp, level); 62 BP_SET_FILL(bp, fill); 63 BP_SET_TYPE(bp, dntype); 64 65 dva = BP_IDENTITY(bp); 66 DVA_SET_VDEV(dva, 0); 67 DVA_SET_OFFSET(dva, off); 68 DVA_SET_ASIZE(dva, size); 69 memcpy(&bp->blk_cksum, cksum, sizeof(*cksum)); 70 } 71 72 /* 73 * Write a block of data to the vdev. The offset is always relative to the end 74 * of the second leading vdev label. 75 * 76 * Consumers should generally use the helpers below, which provide block 77 * pointers and update dnode accounting, rather than calling this function 78 * directly. 79 */ 80 static void 81 vdev_pwrite(const zfs_opt_t *zfs, const void *buf, size_t len, off_t off) 82 { 83 ssize_t n; 84 85 assert(off >= 0 && off < zfs->asize); 86 assert(powerof2(len)); 87 assert((off_t)len > 0 && off + (off_t)len > off && 88 off + (off_t)len < zfs->asize); 89 if (zfs->spacemap != NULL) { 90 /* 91 * Verify that the blocks being written were in fact allocated. 92 * 93 * The space map isn't available once the on-disk space map is 94 * finalized, so this check doesn't quite catch everything. 95 */ 96 assert(bit_ntest(zfs->spacemap, off >> zfs->ashift, 97 (off + len - 1) >> zfs->ashift, 1)); 98 } 99 100 off += VDEV_LABEL_START_SIZE; 101 for (size_t sofar = 0; sofar < len; sofar += n) { 102 n = pwrite(zfs->fd, (const char *)buf + sofar, len - sofar, 103 off + sofar); 104 if (n < 0) 105 err(1, "pwrite"); 106 assert(n > 0); 107 } 108 } 109 110 void 111 vdev_pwrite_data(zfs_opt_t *zfs, uint8_t datatype, uint8_t cksumtype, 112 uint8_t level, uint64_t fill, const void *data, off_t sz, off_t loc, 113 blkptr_t *bp) 114 { 115 zio_cksum_t cksum; 116 117 assert(cksumtype == ZIO_CHECKSUM_FLETCHER_4); 118 119 fletcher_4_native(data, sz, NULL, &cksum); 120 blkptr_set(bp, loc, sz, datatype, level, fill, cksumtype, &cksum); 121 vdev_pwrite(zfs, data, sz, loc); 122 } 123 124 void 125 vdev_pwrite_dnode_indir(zfs_opt_t *zfs, dnode_phys_t *dnode, uint8_t level, 126 uint64_t fill, const void *data, off_t sz, off_t loc, blkptr_t *bp) 127 { 128 vdev_pwrite_data(zfs, dnode->dn_type, dnode->dn_checksum, level, fill, 129 data, sz, loc, bp); 130 131 assert((dnode->dn_flags & DNODE_FLAG_USED_BYTES) != 0); 132 dnode->dn_used += sz; 133 } 134 135 void 136 vdev_pwrite_dnode_data(zfs_opt_t *zfs, dnode_phys_t *dnode, const void *data, 137 off_t sz, off_t loc) 138 { 139 vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, data, sz, loc, 140 &dnode->dn_blkptr[0]); 141 } 142 143 static void 144 vdev_label_set_checksum(void *buf, off_t off, off_t size) 145 { 146 zio_cksum_t cksum; 147 zio_eck_t *eck; 148 149 assert(size > 0 && (size_t)size >= sizeof(zio_eck_t)); 150 151 eck = (zio_eck_t *)((char *)buf + size) - 1; 152 eck->zec_magic = ZEC_MAGIC; 153 ZIO_SET_CHECKSUM(&eck->zec_cksum, off, 0, 0, 0); 154 zio_checksum_SHA256(buf, size, NULL, &cksum); 155 eck->zec_cksum = cksum; 156 } 157 158 /* 159 * Set embedded checksums and write the label at the specified index. 160 */ 161 void 162 vdev_label_write(zfs_opt_t *zfs, int ind, const vdev_label_t *labelp) 163 { 164 vdev_label_t *label; 165 ssize_t n; 166 off_t blksz, loff; 167 168 assert(ind >= 0 && ind < VDEV_LABELS); 169 170 /* 171 * Make a copy since we have to modify the label to set checksums. 172 */ 173 label = ecalloc(1, sizeof(*label)); 174 memcpy(label, labelp, sizeof(*label)); 175 176 if (ind < 2) 177 loff = ind * sizeof(*label); 178 else 179 loff = zfs->vdevsize - (VDEV_LABELS - ind) * sizeof(*label); 180 181 /* 182 * Set the verifier checksum for the boot block. We don't use it, but 183 * the FreeBSD loader reads it and will complain if the checksum isn't 184 * valid. 185 */ 186 vdev_label_set_checksum(&label->vl_be, 187 loff + __offsetof(vdev_label_t, vl_be), sizeof(label->vl_be)); 188 189 /* 190 * Set the verifier checksum for the label. 191 */ 192 vdev_label_set_checksum(&label->vl_vdev_phys, 193 loff + __offsetof(vdev_label_t, vl_vdev_phys), 194 sizeof(label->vl_vdev_phys)); 195 196 /* 197 * Set the verifier checksum for the uberblocks. There is one uberblock 198 * per sector; for example, with an ashift of 12 we end up with 199 * 128KB/4KB=32 copies of the uberblock in the ring. 200 */ 201 blksz = 1 << zfs->ashift; 202 assert(sizeof(label->vl_uberblock) % blksz == 0); 203 for (size_t roff = 0; roff < sizeof(label->vl_uberblock); 204 roff += blksz) { 205 vdev_label_set_checksum(&label->vl_uberblock[0] + roff, 206 loff + __offsetof(vdev_label_t, vl_uberblock) + roff, 207 blksz); 208 } 209 210 n = pwrite(zfs->fd, label, sizeof(*label), loff); 211 if (n < 0) 212 err(1, "writing vdev label"); 213 assert(n == sizeof(*label)); 214 215 free(label); 216 } 217 218 /* 219 * Find a chunk of contiguous free space of length *lenp, according to the 220 * following rules: 221 * 1. If the length is less than or equal to 128KB, the returned run's length 222 * will be the smallest power of 2 equal to or larger than the length. 223 * 2. If the length is larger than 128KB, the returned run's length will be 224 * the smallest multiple of 128KB that is larger than the length. 225 * 3. The returned run's length will be size-aligned up to 128KB. 226 * 227 * XXX-MJ the third rule isn't actually required, so this can just be a dumb 228 * bump allocator. Maybe there's some benefit to keeping large blocks aligned, 229 * so let's keep it for now and hope we don't get too much fragmentation. 230 * Alternately we could try to allocate all blocks of a certain size from the 231 * same metaslab. 232 */ 233 off_t 234 vdev_space_alloc(zfs_opt_t *zfs, off_t *lenp) 235 { 236 off_t len; 237 int align, loc, minblksz, nbits; 238 239 minblksz = 1 << zfs->ashift; 240 len = roundup2(*lenp, minblksz); 241 242 assert(len != 0); 243 assert(len / minblksz <= INT_MAX); 244 245 if (len < MAXBLOCKSIZE) { 246 if ((len & (len - 1)) != 0) 247 len = (off_t)1 << flsll(len); 248 align = len / minblksz; 249 } else { 250 len = roundup2(len, MAXBLOCKSIZE); 251 align = MAXBLOCKSIZE / minblksz; 252 } 253 254 for (loc = 0, nbits = len / minblksz;; loc = roundup2(loc, align)) { 255 bit_ffc_area_at(zfs->spacemap, loc, zfs->spacemapbits, nbits, 256 &loc); 257 if (loc == -1) { 258 errx(1, "failed to find %ju bytes of space", 259 (uintmax_t)len); 260 } 261 if ((loc & (align - 1)) == 0) 262 break; 263 } 264 assert(loc + nbits > loc); 265 bit_nset(zfs->spacemap, loc, loc + nbits - 1); 266 *lenp = len; 267 268 return ((off_t)loc << zfs->ashift); 269 } 270 271 static void 272 vdev_spacemap_init(zfs_opt_t *zfs) 273 { 274 uint64_t nbits; 275 276 assert(powerof2(zfs->mssize)); 277 278 nbits = rounddown2(zfs->asize, zfs->mssize) >> zfs->ashift; 279 if (nbits > INT_MAX) { 280 /* 281 * With the smallest block size of 512B, the limit on the image 282 * size is 2TB. That should be enough for anyone. 283 */ 284 errx(1, "image size is too large"); 285 } 286 zfs->spacemapbits = (int)nbits; 287 zfs->spacemap = bit_alloc(zfs->spacemapbits); 288 if (zfs->spacemap == NULL) 289 err(1, "bitstring allocation failed"); 290 } 291 292 void 293 vdev_spacemap_write(zfs_opt_t *zfs) 294 { 295 dnode_phys_t *objarr; 296 bitstr_t *spacemap; 297 uint64_t *objarrblk; 298 off_t smblksz, objarrblksz, objarrloc; 299 300 struct { 301 dnode_phys_t *dnode; 302 uint64_t dnid; 303 off_t loc; 304 } *sma; 305 306 objarrblksz = sizeof(uint64_t) * zfs->mscount; 307 assert(objarrblksz <= MAXBLOCKSIZE); 308 objarrloc = objset_space_alloc(zfs, zfs->mos, &objarrblksz); 309 objarrblk = ecalloc(1, objarrblksz); 310 311 objarr = objset_dnode_lookup(zfs->mos, zfs->objarrid); 312 objarr->dn_datablkszsec = objarrblksz >> MINBLOCKSHIFT; 313 314 /* 315 * Use the smallest block size for space maps. The space allocation 316 * algorithm should aim to minimize the number of holes. 317 */ 318 smblksz = 1 << zfs->ashift; 319 320 /* 321 * First allocate dnodes and space for all of our space maps. No more 322 * space can be allocated from the vdev after this point. 323 */ 324 sma = ecalloc(zfs->mscount, sizeof(*sma)); 325 for (uint64_t i = 0; i < zfs->mscount; i++) { 326 sma[i].dnode = objset_dnode_bonus_alloc(zfs->mos, 327 DMU_OT_SPACE_MAP, DMU_OT_SPACE_MAP_HEADER, 328 sizeof(space_map_phys_t), &sma[i].dnid); 329 sma[i].loc = objset_space_alloc(zfs, zfs->mos, &smblksz); 330 } 331 spacemap = zfs->spacemap; 332 zfs->spacemap = NULL; 333 334 /* 335 * Now that the set of allocated space is finalized, populate each space 336 * map and write it to the vdev. 337 */ 338 for (uint64_t i = 0; i < zfs->mscount; i++) { 339 space_map_phys_t *sm; 340 uint64_t alloc, length, *smblk; 341 int shift, startb, endb, srunb, erunb; 342 343 /* 344 * We only allocate a single block for this space map, but 345 * OpenZFS assumes that a space map object with sufficient bonus 346 * space supports histograms. 347 */ 348 sma[i].dnode->dn_nblkptr = 3; 349 sma[i].dnode->dn_datablkszsec = smblksz >> MINBLOCKSHIFT; 350 351 smblk = ecalloc(1, smblksz); 352 353 alloc = length = 0; 354 shift = zfs->msshift - zfs->ashift; 355 for (srunb = startb = i * (1 << shift), 356 endb = (i + 1) * (1 << shift); 357 srunb < endb; srunb = erunb) { 358 uint64_t runlen, runoff; 359 360 /* Find a run of allocated space. */ 361 bit_ffs_at(spacemap, srunb, zfs->spacemapbits, &srunb); 362 if (srunb == -1 || srunb >= endb) 363 break; 364 365 bit_ffc_at(spacemap, srunb, zfs->spacemapbits, &erunb); 366 if (erunb == -1 || erunb > endb) 367 erunb = endb; 368 369 /* 370 * The space represented by [srunb, erunb) has been 371 * allocated. Add a record to the space map to indicate 372 * this. Run offsets are relative to the beginning of 373 * the metaslab. 374 */ 375 runlen = erunb - srunb; 376 runoff = srunb - startb; 377 378 assert(length * sizeof(uint64_t) < (uint64_t)smblksz); 379 smblk[length] = SM_PREFIX_ENCODE(SM2_PREFIX) | 380 SM2_RUN_ENCODE(runlen) | SM2_VDEV_ENCODE(0); 381 smblk[length + 1] = SM2_TYPE_ENCODE(SM_ALLOC) | 382 SM2_OFFSET_ENCODE(runoff); 383 384 alloc += runlen << zfs->ashift; 385 length += 2; 386 } 387 388 sm = DN_BONUS(sma[i].dnode); 389 sm->smp_length = length * sizeof(uint64_t); 390 sm->smp_alloc = alloc; 391 392 vdev_pwrite_dnode_data(zfs, sma[i].dnode, smblk, smblksz, 393 sma[i].loc); 394 free(smblk); 395 396 /* Record this space map in the space map object array. */ 397 objarrblk[i] = sma[i].dnid; 398 } 399 400 /* 401 * All of the space maps are written, now write the object array. 402 */ 403 vdev_pwrite_dnode_data(zfs, objarr, objarrblk, objarrblksz, objarrloc); 404 free(objarrblk); 405 406 assert(zfs->spacemap == NULL); 407 free(spacemap); 408 free(sma); 409 } 410 411 void 412 vdev_init(zfs_opt_t *zfs, const char *image) 413 { 414 assert(zfs->ashift >= MINBLOCKSHIFT); 415 416 zfs->fd = open(image, O_RDWR | O_CREAT | O_TRUNC, 0644); 417 if (zfs->fd == -1) 418 err(1, "Can't open `%s' for writing", image); 419 if (ftruncate(zfs->fd, zfs->vdevsize) != 0) 420 err(1, "Failed to extend image file `%s'", image); 421 422 vdev_spacemap_init(zfs); 423 } 424 425 void 426 vdev_fini(zfs_opt_t *zfs) 427 { 428 assert(zfs->spacemap == NULL); 429 430 if (zfs->fd != -1) { 431 if (close(zfs->fd) != 0) 432 err(1, "close"); 433 zfs->fd = -1; 434 } 435 } 436