xref: /freebsd/usr.sbin/makefs/zfs/vdev.c (revision 4f0c9b76cf75724ef0b9c59bb8c182be24361d7c)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2022 The FreeBSD Foundation
5  *
6  * This software was developed by Mark Johnston under sponsorship from
7  * the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions are
11  * met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <assert.h>
32 #include <fcntl.h>
33 #include <string.h>
34 #include <unistd.h>
35 
36 #include <util.h>
37 
38 #include "zfs.h"
39 
40 #pragma clang diagnostic push
41 #pragma clang diagnostic ignored "-Wunused-function"
42 #include "zfs/fletcher.c"
43 #include "zfs/sha256.c"
44 #pragma clang diagnostic pop
45 
/*
 * Populate a block pointer describing a block of "size" bytes written at
 * offset "off" on vdev 0.  The logical, physical and allocated sizes are all
 * set to "size" (hence the power-of-2 requirement), compression is disabled,
 * and the caller supplies the precomputed checksum.  Only the first DVA is
 * filled in.
 */
static void
blkptr_set(blkptr_t *bp, off_t off, off_t size, uint8_t dntype, uint8_t level,
    uint64_t fill, enum zio_checksum cksumt, zio_cksum_t *cksum)
{
	dva_t *dva;

	assert(powerof2(size));

	BP_ZERO(bp);
	/* Uncompressed: logical and physical sizes are identical. */
	BP_SET_LSIZE(bp, size);
	BP_SET_PSIZE(bp, size);
	BP_SET_CHECKSUM(bp, cksumt);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
	/* Everything in the image is written in a single transaction group. */
	BP_SET_BIRTH(bp, TXG, TXG);
	BP_SET_LEVEL(bp, level);
	BP_SET_FILL(bp, fill);
	BP_SET_TYPE(bp, dntype);

	/* Single-vdev pool: the one DVA always points at vdev 0. */
	dva = BP_IDENTITY(bp);
	DVA_SET_VDEV(dva, 0);
	DVA_SET_OFFSET(dva, off);
	DVA_SET_ASIZE(dva, size);
	memcpy(&bp->blk_cksum, cksum, sizeof(*cksum));
}
71 
72 /*
73  * Write a block of data to the vdev.  The offset is always relative to the end
74  * of the second leading vdev label.
75  *
76  * Consumers should generally use the helpers below, which provide block
77  * pointers and update dnode accounting, rather than calling this function
78  * directly.
79  */
80 static void
81 vdev_pwrite(const zfs_opt_t *zfs, const void *buf, size_t len, off_t off)
82 {
83 	ssize_t n;
84 
85 	assert(off >= 0 && off < zfs->asize);
86 	assert(powerof2(len));
87 	assert((off_t)len > 0 && off + (off_t)len > off &&
88 	    off + (off_t)len < zfs->asize);
89 	if (zfs->spacemap != NULL) {
90 		/*
91 		 * Verify that the blocks being written were in fact allocated.
92 		 *
93 		 * The space map isn't available once the on-disk space map is
94 		 * finalized, so this check doesn't quite catch everything.
95 		 */
96 		assert(bit_ntest(zfs->spacemap, off >> zfs->ashift,
97 		    (off + len - 1) >> zfs->ashift, 1));
98 	}
99 
100 	off += VDEV_LABEL_START_SIZE;
101 	for (size_t sofar = 0; sofar < len; sofar += n) {
102 		n = pwrite(zfs->fd, (const char *)buf + sofar, len - sofar,
103 		    off + sofar);
104 		if (n < 0)
105 			err(1, "pwrite");
106 		assert(n > 0);
107 	}
108 }
109 
110 void
111 vdev_pwrite_data(zfs_opt_t *zfs, uint8_t datatype, uint8_t cksumtype,
112     uint8_t level, uint64_t fill, const void *data, off_t sz, off_t loc,
113     blkptr_t *bp)
114 {
115 	zio_cksum_t cksum;
116 
117 	assert(cksumtype == ZIO_CHECKSUM_FLETCHER_4);
118 
119 	fletcher_4_native(data, sz, NULL, &cksum);
120 	blkptr_set(bp, loc, sz, datatype, level, fill, cksumtype, &cksum);
121 	vdev_pwrite(zfs, data, sz, loc);
122 }
123 
124 void
125 vdev_pwrite_dnode_indir(zfs_opt_t *zfs, dnode_phys_t *dnode, uint8_t level,
126     uint64_t fill, const void *data, off_t sz, off_t loc, blkptr_t *bp)
127 {
128 	vdev_pwrite_data(zfs, dnode->dn_type, dnode->dn_checksum, level, fill,
129 	    data, sz, loc, bp);
130 
131 	assert((dnode->dn_flags & DNODE_FLAG_USED_BYTES) != 0);
132 	dnode->dn_used += sz;
133 }
134 
135 void
136 vdev_pwrite_dnode_data(zfs_opt_t *zfs, dnode_phys_t *dnode, const void *data,
137     off_t sz, off_t loc)
138 {
139 	vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, data, sz, loc,
140 	    &dnode->dn_blkptr[0]);
141 }
142 
143 static void
144 vdev_label_set_checksum(void *buf, off_t off, off_t size)
145 {
146 	zio_cksum_t cksum;
147 	zio_eck_t *eck;
148 
149 	assert(size > 0 && (size_t)size >= sizeof(zio_eck_t));
150 
151 	eck = (zio_eck_t *)((char *)buf + size) - 1;
152 	eck->zec_magic = ZEC_MAGIC;
153 	ZIO_SET_CHECKSUM(&eck->zec_cksum, off, 0, 0, 0);
154 	zio_checksum_SHA256(buf, size, NULL, &cksum);
155 	eck->zec_cksum = cksum;
156 }
157 
158 /*
159  * Set embedded checksums and write the label at the specified index.
160  */
161 void
162 vdev_label_write(zfs_opt_t *zfs, int ind, const vdev_label_t *labelp)
163 {
164 	vdev_label_t *label;
165 	ssize_t n;
166 	off_t blksz, loff;
167 
168 	assert(ind >= 0 && ind < VDEV_LABELS);
169 
170 	/*
171 	 * Make a copy since we have to modify the label to set checksums.
172 	 */
173 	label = ecalloc(1, sizeof(*label));
174 	memcpy(label, labelp, sizeof(*label));
175 
176 	if (ind < 2)
177 		loff = ind * sizeof(*label);
178 	else
179 		loff = zfs->vdevsize - (VDEV_LABELS - ind) * sizeof(*label);
180 
181 	/*
182 	 * Set the verifier checksum for the boot block.  We don't use it, but
183 	 * the FreeBSD loader reads it and will complain if the checksum isn't
184 	 * valid.
185 	 */
186 	vdev_label_set_checksum(&label->vl_be,
187 	    loff + __offsetof(vdev_label_t, vl_be), sizeof(label->vl_be));
188 
189 	/*
190 	 * Set the verifier checksum for the label.
191 	 */
192 	vdev_label_set_checksum(&label->vl_vdev_phys,
193 	    loff + __offsetof(vdev_label_t, vl_vdev_phys),
194 	    sizeof(label->vl_vdev_phys));
195 
196 	/*
197 	 * Set the verifier checksum for the uberblocks.  There is one uberblock
198 	 * per sector; for example, with an ashift of 12 we end up with
199 	 * 128KB/4KB=32 copies of the uberblock in the ring.
200 	 */
201 	blksz = 1 << zfs->ashift;
202 	assert(sizeof(label->vl_uberblock) % blksz == 0);
203 	for (size_t roff = 0; roff < sizeof(label->vl_uberblock);
204 	    roff += blksz) {
205 		vdev_label_set_checksum(&label->vl_uberblock[0] + roff,
206 		    loff + __offsetof(vdev_label_t, vl_uberblock) + roff,
207 		    blksz);
208 	}
209 
210 	n = pwrite(zfs->fd, label, sizeof(*label), loff);
211 	if (n < 0)
212 		err(1, "writing vdev label");
213 	assert(n == sizeof(*label));
214 
215 	free(label);
216 }
217 
218 /*
219  * Find a chunk of contiguous free space of length *lenp, according to the
220  * following rules:
221  * 1. If the length is less than or equal to 128KB, the returned run's length
222  *    will be the smallest power of 2 equal to or larger than the length.
223  * 2. If the length is larger than 128KB, the returned run's length will be
224  *    the smallest multiple of 128KB that is larger than the length.
225  * 3. The returned run's length will be size-aligned up to 128KB.
226  *
227  * XXX-MJ the third rule isn't actually required, so this can just be a dumb
228  * bump allocator.  Maybe there's some benefit to keeping large blocks aligned,
229  * so let's keep it for now and hope we don't get too much fragmentation.
230  * Alternately we could try to allocate all blocks of a certain size from the
231  * same metaslab.
232  */
off_t
vdev_space_alloc(zfs_opt_t *zfs, off_t *lenp)
{
	off_t len;
	int align, loc, minblksz, nbits;

	/* All allocations are in whole sectors. */
	minblksz = 1 << zfs->ashift;
	len = roundup2(*lenp, minblksz);

	assert(len != 0);
	assert(len / minblksz <= INT_MAX);

	if (len < MAXBLOCKSIZE) {
		/* Rule 1: round a sub-128KB request up to a power of 2. */
		if ((len & (len - 1)) != 0)
			len = (off_t)1 << flsll(len);
		/* Rule 3: size-align the run (align is in sector units). */
		align = len / minblksz;
	} else {
		/* Rule 2: round larger requests up to a multiple of 128KB. */
		len = roundup2(len, MAXBLOCKSIZE);
		/* Rule 3: alignment caps out at 128KB. */
		align = MAXBLOCKSIZE / minblksz;
	}

	/*
	 * First-fit search: find a free run of nbits sectors, then retry from
	 * the next aligned position until a suitably aligned run turns up.
	 */
	for (loc = 0, nbits = len / minblksz;; loc = roundup2(loc, align)) {
		bit_ffc_area_at(zfs->spacemap, loc, zfs->spacemapbits, nbits,
		    &loc);
		if (loc == -1) {
			errx(1, "failed to find %ju bytes of space",
			    (uintmax_t)len);
		}
		/* align is a power of 2, so this is an alignment check. */
		if ((loc & (align - 1)) == 0)
			break;
	}
	/* Guard against overflow of the bit index. */
	assert(loc + nbits > loc);
	/* Mark the run allocated and report the (possibly rounded) length. */
	bit_nset(zfs->spacemap, loc, loc + nbits - 1);
	*lenp = len;

	return ((off_t)loc << zfs->ashift);
}
270 
271 static void
272 vdev_spacemap_init(zfs_opt_t *zfs)
273 {
274 	uint64_t nbits;
275 
276 	assert(powerof2(zfs->mssize));
277 
278 	nbits = rounddown2(zfs->asize, zfs->mssize) >> zfs->ashift;
279 	if (nbits > INT_MAX) {
280 		/*
281 		 * With the smallest block size of 512B, the limit on the image
282 		 * size is 2TB.  That should be enough for anyone.
283 		 */
284 		errx(1, "image size is too large");
285 	}
286 	zfs->spacemapbits = (int)nbits;
287 	zfs->spacemap = bit_alloc(zfs->spacemapbits);
288 	if (zfs->spacemap == NULL)
289 		err(1, "bitstring allocation failed");
290 }
291 
/*
 * Convert the in-memory space map into on-disk space maps (one per metaslab,
 * SM2 encoding) plus the object array that indexes them, and write everything
 * to the vdev.  Consumes zfs->spacemap: no further space can be allocated
 * from the vdev once this returns.
 */
void
vdev_spacemap_write(zfs_opt_t *zfs)
{
	dnode_phys_t *objarr;
	bitstr_t *spacemap;
	uint64_t *objarrblk;
	off_t smblksz, objarrblksz, objarrloc;

	/* Per-metaslab space map state: dnode, object ID, block location. */
	struct {
		dnode_phys_t	*dnode;
		uint64_t	dnid;
		off_t		loc;
	} *sma;

	/* One object ID per metaslab in the array block. */
	objarrblksz = sizeof(uint64_t) * zfs->mscount;
	assert(objarrblksz <= MAXBLOCKSIZE);
	objarrloc = objset_space_alloc(zfs, zfs->mos, &objarrblksz);
	objarrblk = ecalloc(1, objarrblksz);

	objarr = objset_dnode_lookup(zfs->mos, zfs->objarrid);
	objarr->dn_datablkszsec = objarrblksz >> MINBLOCKSHIFT;

	/*
	 * Use the smallest block size for space maps.  The space allocation
	 * algorithm should aim to minimize the number of holes.
	 */
	smblksz = 1 << zfs->ashift;

	/*
	 * First allocate dnodes and space for all of our space maps.  No more
	 * space can be allocated from the vdev after this point.
	 */
	sma = ecalloc(zfs->mscount, sizeof(*sma));
	for (uint64_t i = 0; i < zfs->mscount; i++) {
		sma[i].dnode = objset_dnode_bonus_alloc(zfs->mos,
		    DMU_OT_SPACE_MAP, DMU_OT_SPACE_MAP_HEADER,
		    sizeof(space_map_phys_t), &sma[i].dnid);
		sma[i].loc = objset_space_alloc(zfs, zfs->mos, &smblksz);
	}
	/* Detach the map so vdev_pwrite() stops validating against it. */
	spacemap = zfs->spacemap;
	zfs->spacemap = NULL;

	/*
	 * Now that the set of allocated space is finalized, populate each space
	 * map and write it to the vdev.
	 */
	for (uint64_t i = 0; i < zfs->mscount; i++) {
		space_map_phys_t *sm;
		uint64_t alloc, length, *smblk;
		int shift, startb, endb, srunb, erunb;

		/*
		 * We only allocate a single block for this space map, but
		 * OpenZFS assumes that a space map object with sufficient bonus
		 * space supports histograms.
		 */
		sma[i].dnode->dn_nblkptr = 3;
		sma[i].dnode->dn_datablkszsec = smblksz >> MINBLOCKSHIFT;

		smblk = ecalloc(1, smblksz);

		/* Walk this metaslab's bit range [startb, endb) run by run. */
		alloc = length = 0;
		shift = zfs->msshift - zfs->ashift;
		for (srunb = startb = i * (1 << shift),
		    endb = (i + 1) * (1 << shift);
		    srunb < endb; srunb = erunb) {
			uint64_t runlen, runoff;

			/* Find a run of allocated space. */
			bit_ffs_at(spacemap, srunb, zfs->spacemapbits, &srunb);
			if (srunb == -1 || srunb >= endb)
				break;

			/* The run ends at the next clear bit (clamped). */
			bit_ffc_at(spacemap, srunb, zfs->spacemapbits, &erunb);
			if (erunb == -1 || erunb > endb)
				erunb = endb;

			/*
			 * The space represented by [srunb, erunb) has been
			 * allocated.  Add a record to the space map to indicate
			 * this.  Run offsets are relative to the beginning of
			 * the metaslab.
			 */
			runlen = erunb - srunb;
			runoff = srunb - startb;

			/* Each SM2 entry is a pair of 64-bit words. */
			assert(length * sizeof(uint64_t) < (uint64_t)smblksz);
			smblk[length] = SM_PREFIX_ENCODE(SM2_PREFIX) |
			    SM2_RUN_ENCODE(runlen) | SM2_VDEV_ENCODE(0);
			smblk[length + 1] = SM2_TYPE_ENCODE(SM_ALLOC) |
			    SM2_OFFSET_ENCODE(runoff);

			alloc += runlen << zfs->ashift;
			length += 2;
		}

		/* Fill in the space map header in the dnode's bonus buffer. */
		sm = DN_BONUS(sma[i].dnode);
		sm->smp_length = length * sizeof(uint64_t);
		sm->smp_alloc = alloc;

		vdev_pwrite_dnode_data(zfs, sma[i].dnode, smblk, smblksz,
		    sma[i].loc);
		free(smblk);

		/* Record this space map in the space map object array. */
		objarrblk[i] = sma[i].dnid;
	}

	/*
	 * All of the space maps are written, now write the object array.
	 */
	vdev_pwrite_dnode_data(zfs, objarr, objarrblk, objarrblksz, objarrloc);
	free(objarrblk);

	assert(zfs->spacemap == NULL);
	free(spacemap);
	free(sma);
}
410 
411 void
412 vdev_init(zfs_opt_t *zfs, const char *image)
413 {
414 	assert(zfs->ashift >= MINBLOCKSHIFT);
415 
416 	zfs->fd = open(image, O_RDWR | O_CREAT | O_TRUNC, 0644);
417 	if (zfs->fd == -1)
418 		err(1, "Can't open `%s' for writing", image);
419 	if (ftruncate(zfs->fd, zfs->vdevsize) != 0)
420 		err(1, "Failed to extend image file `%s'", image);
421 
422 	vdev_spacemap_init(zfs);
423 }
424 
425 void
426 vdev_fini(zfs_opt_t *zfs)
427 {
428 	assert(zfs->spacemap == NULL);
429 
430 	if (zfs->fd != -1) {
431 		if (close(zfs->fd) != 0)
432 			err(1, "close");
433 		zfs->fd = -1;
434 	}
435 }
436