1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek 24 */ 25 26 #ifndef _SYS_BRT_IMPL_H 27 #define _SYS_BRT_IMPL_H 28 29 #ifdef __cplusplus 30 extern "C" { 31 #endif 32 33 /* 34 * BRT - Block Reference Table. 35 */ 36 #define BRT_OBJECT_VDEV_PREFIX "com.fudosecurity:brt:vdev:" 37 38 /* 39 * We divide each VDEV into 16MB chunks. Each chunk is represented in memory 40 * by a 16bit counter, thus 1TB VDEV requires 128kB of memory: (1TB / 16MB) * 2B 41 * Each element in this array represents how many BRT entries do we have in this 42 * chunk of storage. We always load this entire array into memory and update as 43 * needed. By having it in memory we can quickly tell (during zio_free()) if 44 * there are any BRT entries that we might need to update. 45 * 46 * This value cannot be larger than 16MB, at least as long as we support 47 * 512 byte block sizes. With 512 byte block size we can have exactly 48 * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too 49 * many for a 16bit counter. 50 */ 51 #define BRT_RANGESIZE (16 * 1024 * 1024) 52 _Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX, 53 "BRT_RANGESIZE is too large."); 54 /* 55 * We don't want to update the whole structure every time. Maintain bitmap 56 * of dirty blocks within the regions, so that a single bit represents a 57 * block size of entcounts. For example if we have a 1PB vdev then all 58 * entcounts take 128MB of memory ((64TB / 16MB) * 2B). We can divide this 59 * 128MB array of entcounts into 32kB disk blocks, as we don't want to update 60 * the whole 128MB on disk when we have updated only a single entcount. 61 * We maintain a bitmap where each 32kB disk block within 128MB entcounts array 62 * is represented by a single bit. This gives us 4096 bits. A set bit in the 63 * bitmap means that we had a change in at least one of the 16384 entcounts 64 * that reside on a 32kB disk block (32kB / sizeof (uint16_t)). 65 */ 66 #define BRT_BLOCKSIZE (32 * 1024) 67 #define BRT_RANGESIZE_TO_NBLOCKS(size) \ 68 (((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1) 69 70 #define BRT_LITTLE_ENDIAN 0 71 #define BRT_BIG_ENDIAN 1 72 #ifdef _ZFS_LITTLE_ENDIAN 73 #define BRT_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN 74 #define BRT_NON_NATIVE_BYTEORDER BRT_BIG_ENDIAN 75 #else 76 #define BRT_NATIVE_BYTEORDER BRT_BIG_ENDIAN 77 #define BRT_NON_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN 78 #endif 79 80 typedef struct brt_vdev_phys { 81 uint64_t bvp_mos_entries; 82 uint64_t bvp_size; 83 uint64_t bvp_byteorder; 84 uint64_t bvp_totalcount; 85 uint64_t bvp_rangesize; 86 uint64_t bvp_usedspace; 87 uint64_t bvp_savedspace; 88 } brt_vdev_phys_t; 89 90 struct brt_vdev { 91 /* 92 * Pending changes from open contexts. 93 */ 94 kmutex_t bv_pending_lock; 95 avl_tree_t bv_pending_tree[TXG_SIZE]; 96 /* 97 * Protects bv_mos_*. 98 */ 99 krwlock_t bv_mos_entries_lock ____cacheline_aligned; 100 /* 101 * Protects all the fields starting from bv_initiated. 102 */ 103 krwlock_t bv_lock ____cacheline_aligned; 104 /* 105 * VDEV id. 106 */ 107 uint64_t bv_vdevid ____cacheline_aligned; 108 /* 109 * Object number in the MOS for the entcount array and brt_vdev_phys. 110 */ 111 uint64_t bv_mos_brtvdev; 112 /* 113 * Object number in the MOS and dnode for the entries table. 114 */ 115 uint64_t bv_mos_entries; 116 dnode_t *bv_mos_entries_dnode; 117 /* 118 * Is the structure initiated? 119 * (bv_entcount and bv_bitmap are allocated?) 120 */ 121 boolean_t bv_initiated; 122 /* 123 * Does the bv_entcount[] array needs byte swapping? 124 */ 125 boolean_t bv_need_byteswap; 126 /* 127 * Number of entries in the bv_entcount[] array. 128 */ 129 uint64_t bv_size; 130 /* 131 * This is the array with BRT entry count per BRT_RANGESIZE. 132 */ 133 uint16_t *bv_entcount; 134 /* 135 * bv_entcount[] potentially can be a bit too big to sychronize it all 136 * when we just changed few entcounts. The fields below allow us to 137 * track updates to bv_entcount[] array since the last sync. 138 * A single bit in the bv_bitmap represents as many entcounts as can 139 * fit into a single BRT_BLOCKSIZE. 140 * For example we have 65536 entcounts in the bv_entcount array 141 * (so the whole array is 128kB). We updated bv_entcount[2] and 142 * bv_entcount[5]. In that case only first bit in the bv_bitmap will 143 * be set and we will write only first BRT_BLOCKSIZE out of 128kB. 144 */ 145 ulong_t *bv_bitmap; 146 /* 147 * bv_entcount[] needs updating on disk. 148 */ 149 boolean_t bv_entcount_dirty; 150 /* 151 * brt_vdev_phys needs updating on disk. 152 */ 153 boolean_t bv_meta_dirty; 154 /* 155 * Sum of all bv_entcount[]s. 156 */ 157 uint64_t bv_totalcount; 158 /* 159 * Space on disk occupied by cloned blocks (without compression). 160 */ 161 uint64_t bv_usedspace; 162 /* 163 * How much additional space would be occupied without block cloning. 164 */ 165 uint64_t bv_savedspace; 166 /* 167 * Entries to sync. 168 */ 169 avl_tree_t bv_tree; 170 }; 171 172 /* Size of offset / sizeof (uint64_t). */ 173 #define BRT_KEY_WORDS (1) 174 175 #define BRE_OFFSET(bre) (DVA_GET_OFFSET(&(bre)->bre_bp.blk_dva[0])) 176 177 /* 178 * In-core brt entry. 179 * On-disk we use ZAP with offset as the key and count as the value. 180 */ 181 typedef struct brt_entry { 182 avl_node_t bre_node; 183 blkptr_t bre_bp; 184 uint64_t bre_count; 185 uint64_t bre_pcount; 186 } brt_entry_t; 187 188 #ifdef __cplusplus 189 } 190 #endif 191 192 #endif /* _SYS_BRT_IMPL_H */ 193