xref: /freebsd/sys/contrib/openzfs/include/sys/brt_impl.h (revision 61145dc2b94f12f6a47344fb9aac702321880e43)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
24  */
25 
26 #ifndef _SYS_BRT_IMPL_H
27 #define	_SYS_BRT_IMPL_H
28 
29 #ifdef	__cplusplus
30 extern "C" {
31 #endif
32 
33 /*
34  * BRT - Block Reference Table.
 *
 * BRT_OBJECT_VDEV_PREFIX is a string prefix. Judging by its text it is
 * presumably combined with a vdev identifier to name per-vdev BRT objects
 * -- TODO confirm against the code that uses it.
35  */
36 #define	BRT_OBJECT_VDEV_PREFIX	"com.fudosecurity:brt:vdev:"
37 
38 /*
39  * We divide each VDEV into 16MB chunks. Each chunk is represented in memory
40  * by a 16-bit counter, so a 1TB VDEV requires 128kB: (1TB / 16MB) * 2B.
41  * Each element in this array represents how many BRT entries we have in this
42  * chunk of storage. We always load the entire array into memory and update it
43  * as needed. Having it in memory lets us quickly tell (during zio_free()) if
44  * there are any BRT entries that we might need to update.
45  *
46  * This value cannot be larger than 16MB, at least as long as we support
47  * 512 byte block sizes. With 512 byte block size we can have exactly
48  * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too
49  * many for a 16-bit counter. The _Static_assert below checks this limit
 * against SPA_MINBLOCKSIZE at compile time.
50  */
51 #define	BRT_RANGESIZE	(16 * 1024 * 1024)
52 _Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
53 	"BRT_RANGESIZE is too large.");
54 /*
55  * We don't want to update the whole structure every time. Maintain a bitmap
56  * of dirty blocks within the regions, so that a single bit represents a
57  * block size of entcounts. For example if we have a 1PB vdev then all
58  * entcounts take 128MB of memory ((1PB / 16MB) * 2B). We can divide this
59  * 128MB array of entcounts into 32kB disk blocks, as we don't want to update
60  * the whole 128MB on disk when we have updated only a single entcount.
61  * We maintain a bitmap where each 32kB disk block within 128MB entcounts array
62  * is represented by a single bit. This gives us 4096 bits. A set bit in the
63  * bitmap means that we had a change in at least one of the 16384 entcounts
64  * that reside on a 32kB disk block (32kB / sizeof (uint16_t)).
 *
 * NOTE(review): BRT_RANGESIZE_TO_NBLOCKS() divides by BRT_BLOCKSIZE and then
 * again by sizeof (uint16_t), i.e. by 65536 in total, before rounding up.
 * If `size` is a count of entcounts, that yields one block per 65536
 * entcounts rather than one per 16384 as described above -- confirm against
 * how callers size and index bv_bitmap before relying on either figure.
65  */
66 #define	BRT_BLOCKSIZE	(32 * 1024)
67 #define	BRT_RANGESIZE_TO_NBLOCKS(size)					\
68 	(((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1)
69 
/*
 * Byte-order identifiers. BRT_NATIVE_BYTEORDER resolves to the little-endian
 * value when _ZFS_LITTLE_ENDIAN is defined and to the big-endian value
 * otherwise; BRT_NON_NATIVE_BYTEORDER is always the opposite one.
 * Presumably these are the values kept in brt_vdev_phys.bvp_byteorder --
 * TODO confirm.
 */
70 #define	BRT_LITTLE_ENDIAN	0
71 #define	BRT_BIG_ENDIAN		1
72 #ifdef _ZFS_LITTLE_ENDIAN
73 #define	BRT_NATIVE_BYTEORDER		BRT_LITTLE_ENDIAN
74 #define	BRT_NON_NATIVE_BYTEORDER	BRT_BIG_ENDIAN
75 #else
76 #define	BRT_NATIVE_BYTEORDER		BRT_BIG_ENDIAN
77 #define	BRT_NON_NATIVE_BYTEORDER	BRT_LITTLE_ENDIAN
78 #endif
79 
/*
 * Per-vdev BRT state in its physical form (struct brt_vdev refers to
 * "brt_vdev_phys" as something that is updated on disk). All fields are
 * fixed-width 64-bit integers; do not reorder or resize them.
 * NOTE(review): the per-field notes below are inferred from the matching
 * bv_* fields of struct brt_vdev -- confirm against the code that fills
 * this structure in.
 */
80 typedef struct brt_vdev_phys {
81 	uint64_t	bvp_mos_entries;	/* cf. bv_mos_entries */
82 	uint64_t	bvp_size;		/* cf. bv_size */
83 	uint64_t	bvp_byteorder;		/* presumably BRT_*_ENDIAN */
84 	uint64_t	bvp_totalcount;		/* cf. bv_totalcount */
85 	uint64_t	bvp_rangesize;		/* presumably BRT_RANGESIZE */
86 	uint64_t	bvp_usedspace;		/* cf. bv_usedspace */
87 	uint64_t	bvp_savedspace;		/* cf. bv_savedspace */
88 } brt_vdev_phys_t;
89 
/*
 * Per-vdev BRT state kept in memory: the pending-change trees, the MOS
 * object numbers, the entcount array with its dirty bitmap, and the space
 * accounting counters. The lock members and bv_vdevid are each cache-line
 * aligned.
 */
90 struct brt_vdev {
91 	/*
92 	 * Pending changes from open contexts.
93 	 */
94 	kmutex_t	bv_pending_lock;
95 	avl_tree_t	bv_pending_tree[TXG_SIZE];
96 	/*
97 	 * Protects bv_mos_*.
98 	 */
99 	krwlock_t	bv_mos_entries_lock ____cacheline_aligned;
100 	/*
101 	 * Protects all the fields starting from bv_initiated.
102 	 */
103 	krwlock_t	bv_lock ____cacheline_aligned;
104 	/*
105 	 * VDEV id.
106 	 */
107 	uint64_t	bv_vdevid ____cacheline_aligned;
108 	/*
109 	 * Object number in the MOS for the entcount array and brt_vdev_phys.
110 	 */
111 	uint64_t	bv_mos_brtvdev;
112 	/*
113 	 * Object number in the MOS and dnode for the entries table.
114 	 */
115 	uint64_t	bv_mos_entries;
116 	dnode_t		*bv_mos_entries_dnode;
117 	/*
118 	 * Is the structure initiated?
119 	 * (bv_entcount and bv_bitmap are allocated?)
120 	 */
121 	boolean_t	bv_initiated;
122 	/*
123 	 * Does the bv_entcount[] array need byte swapping?
124 	 */
125 	boolean_t	bv_need_byteswap;
126 	/*
127 	 * Number of entries in the bv_entcount[] array.
128 	 */
129 	uint64_t	bv_size;
130 	/*
131 	 * This is the array with BRT entry count per BRT_RANGESIZE.
132 	 */
133 	uint16_t	*bv_entcount;
134 	/*
135 	 * bv_entcount[] potentially can be a bit too big to synchronize it all
136 	 * when we have changed just a few entcounts. The fields below allow us
137 	 * to track updates to bv_entcount[] array since the last sync.
138 	 * A single bit in the bv_bitmap represents as many entcounts as can
139 	 * fit into a single BRT_BLOCKSIZE.
140 	 * For example we have 65536 entcounts in the bv_entcount array
141 	 * (so the whole array is 128kB). We updated bv_entcount[2] and
142 	 * bv_entcount[5]. In that case only first bit in the bv_bitmap will
143 	 * be set and we will write only first BRT_BLOCKSIZE out of 128kB.
144 	 */
145 	ulong_t		*bv_bitmap;
146 	/*
147 	 * bv_entcount[] needs updating on disk.
148 	 */
149 	boolean_t	bv_entcount_dirty;
150 	/*
151 	 * brt_vdev_phys needs updating on disk.
152 	 */
153 	boolean_t	bv_meta_dirty;
154 	/*
155 	 * Sum of all bv_entcount[]s.
156 	 */
157 	uint64_t	bv_totalcount;
158 	/*
159 	 * Space on disk occupied by cloned blocks (without compression).
160 	 */
161 	uint64_t	bv_usedspace;
162 	/*
163 	 * How much additional space would be occupied without block cloning.
164 	 */
165 	uint64_t	bv_savedspace;
166 	/*
167 	 * Entries to sync.
168 	 */
169 	avl_tree_t	bv_tree;
170 };
171 
172 /* Key length in uint64_t words: size of offset / sizeof (uint64_t). */
173 #define	BRT_KEY_WORDS	(1)
174 
/* Offset of the first DVA (blk_dva[0]) of a brt_entry's block pointer. */
175 #define	BRE_OFFSET(bre)	(DVA_GET_OFFSET(&(bre)->bre_bp.blk_dva[0]))
176 
177 /*
178  * In-core brt entry.
179  * On-disk we use ZAP with offset as the key and count as the value.
 * Presumably the offset in question is the one BRE_OFFSET() extracts from
 * bre_bp -- TODO confirm.
180  */
181 typedef struct brt_entry {
182 	avl_node_t	bre_node;	/* AVL tree node */
183 	blkptr_t	bre_bp;		/* block pointer; see BRE_OFFSET() */
184 	uint64_t	bre_count;	/* presumably the count kept in ZAP */
185 	uint64_t	bre_pcount;	/* NOTE(review): meaning not shown here */
186 } brt_entry_t;
187 
188 #ifdef	__cplusplus
189 }
190 #endif
191 
192 #endif	/* _SYS_BRT_IMPL_H */
193