/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/brt.h>
#include <sys/ddt.h>
#include <sys/bitmap.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/vdev_impl.h>
#include <sys/kstat.h>
#include <sys/wmsum.h>

/*
 * Block Cloning design.
 *
 * Block Cloning allows one to manually clone a file (or a subset of its
 * blocks) into another (or the same) file by just creating additional
 * references to the data blocks without copying the data itself. Those
 * references are kept in the Block Reference Tables (BRTs).
 *
 * In many ways this is similar to the existing deduplication, but there are
 * some important differences:
 *
 * - Deduplication is automatic and Block Cloning is not - one has to use a
 *   dedicated system call(s) to clone the given file/blocks.
 * - Deduplication keeps all data blocks in its table, even those referenced
 *   just once. Block Cloning creates an entry in its tables only when there
 *   are at least two references to the given data block. If the block was
 *   never explicitly cloned or the second to last reference was dropped,
 *   there will be neither space nor performance overhead.
 * - Deduplication needs data to work - one needs to pass real data to the
 *   write(2) syscall, so the hash can be calculated. Block Cloning doesn't
 *   require data, just block pointers to the data, so it is extremely fast,
 *   as we pay neither the cost of reading the data nor the cost of writing
 *   the data - we operate exclusively on metadata.
 * - If the D (dedup) bit is not set in the block pointer, it means that
 *   the block is not in the dedup table (DDT) and we won't consult the DDT
 *   when we need to free the block. The BRT must be consulted on every free,
 *   because we cannot modify the source BP (eg. by setting something similar
 *   to the D bit), thus we have no hint whether the block is in the
 *   Block Reference Table (BRT), so we need to look into the BRT. There is
 *   an optimization in place that allows us to eliminate the majority of BRT
 *   lookups, which is described below in the "Minimizing free penalty"
 *   section.
 * - The BRT entry is much smaller than the DDT entry - for BRT we only store
 *   a 64bit offset and a 64bit reference counter.
 * - Dedup keys are cryptographic hashes, so two blocks that are close to each
 *   other on disk are most likely in totally different parts of the DDT.
 *   The BRT entry keys are offsets into a single top-level VDEV, so data
 *   blocks from one file should have BRT entries close to each other.
 * - Scrub will only do a single pass over a block that is referenced multiple
 *   times in the DDT. Unfortunately this is not currently (if at all)
 *   possible with Block Cloning, so a block referenced multiple times will be
 *   scrubbed multiple times. The new, sorted scrub should be able to
 *   eliminate duplicated reads given enough memory.
 * - Deduplication requires a cryptographically strong hash as a checksum or
 *   additional data verification. Block Cloning works with any checksum
 *   algorithm or even with checksumming disabled.
 *
 * As mentioned above, the BRT entries are much smaller than the DDT entries.
 * To uniquely identify a block we just need its vdev id and offset. We also
 * need to maintain a reference counter. The vdev id will often repeat, as
 * there is a small number of top-level VDEVs and a large number of blocks
 * stored in each VDEV. We take advantage of that to reduce the BRT entry
 * size further by maintaining one BRT for each top-level VDEV, so we can
 * then have only offset and counter as the BRT entry.
 *
 * Minimizing free penalty.
 *
 * Block Cloning allows creating additional references to any existing block.
 * When we free a block there is no hint in the block pointer whether the
 * block was cloned or not, so on each free we have to check if there is a
 * corresponding entry in the BRT or not. If there is, we need to decrease
 * the reference counter. Doing a BRT lookup on every free can potentially be
 * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
 * This is the main problem with deduplication, so we've learned our lesson
 * and try not to repeat the same mistake here. How do we do that? We divide
 * each top-level VDEV into 16MB regions. For each region we maintain a
 * counter that is a sum of all the BRT entries that have offsets within the
 * region. This creates the entry counters array of 16bit numbers for each
 * top-level VDEV. The entry counters array is always kept in memory and
 * updated on disk in the same transaction group as the BRT updates to keep
 * everything in-sync. We can keep the array in memory, because it is very
 * small. With 16MB regions and a 1TB VDEV the array requires only 128kB of
 * memory (we may decide to decrease the region size even further in the
 * future). Now, when we want to free a block, we first consult the array.
 * If the counter for the whole region is zero, there is no need to look for
 * the BRT entry, as there isn't one for sure. If the counter for the region
 * is greater than zero, only then will we do a BRT lookup, and if an entry
 * is found we will decrease the reference counter in the BRT entry and in
 * the entry counters array.
 *
 * The entry counters array is small, but can potentially be larger for very
 * large VDEVs or smaller regions. In this case we don't want to rewrite the
 * entire array on every change. We then divide the array into 32kB blocks
 * and keep a bitmap of dirty blocks within a transaction group. When we sync
 * the transaction group we can then update only the parts of the entry
 * counters array that were modified. Note: Keeping track of the dirty parts
 * of the entry counters array is implemented, but updating only parts of the
 * array on disk is not yet implemented - for now we will update the entire
 * array if there was any change.
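 *
 * A worked example of the free-path check above (an editorial sketch using
 * the names defined later in this file): freeing a block at offset
 * 0x12345600 on a vdev with 16MB regions consults index
 * 0x12345600 / 16MB = 18 of that vdev's entry counters array, and only if
 * that 16bit counter is non-zero do we pay for an actual BRT lookup:
 *
 *	idx = bre->bre_offset / brt->brt_rangesize;
 *	if (brt_vdev_entcount_get(brtvd, idx) == 0)
 *		return (FALSE);		... surely not in the BRT ...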
 *
 * The implementation tries to be economical: if the BRT is not used, or is
 * no longer used, there will be no entries in the MOS and no additional
 * memory used (eg. the entry counters array is only allocated if needed).
 *
 * Interaction between Deduplication and Block Cloning.
 *
 * If both functionalities are in use, we could end up with a block that is
 * referenced multiple times in both DDT and BRT. When we free one of the
 * references we cannot tell where it belongs, so we would have to decide
 * which table takes precedence: do we first clear DDT references or BRT
 * references? To avoid this dilemma BRT cooperates with DDT - if a given
 * block is being cloned using BRT and the BP has the D (dedup) bit set, BRT
 * will look up the DDT entry instead and increase the counter there. No BRT
 * entry will be created for a block which has the D (dedup) bit set.
 * BRT may be more efficient for manual deduplication, but if the block is
 * already in the DDT, then creating an additional BRT entry would be less
 * efficient. This clever idea was proposed by Allan Jude.
 *
 * Block Cloning across datasets.
 *
 * Block Cloning is not limited to cloning blocks within the same dataset.
 * It is possible (and very useful) to clone blocks between different
 * datasets. One use case is recovering files from snapshots. By cloning the
 * files into the dataset we need no additional storage. Without Block
 * Cloning we would need additional space for those files.
 * Another interesting use case is moving the files between datasets
 * (copying the file content to the new dataset and removing the source
 * file). In that case Block Cloning will only be used briefly, because the
 * BRT entries will be removed when the source is removed.
 * Note: currently it is not possible to clone blocks between encrypted
 * datasets, even if those datasets use the same encryption key (this
 * includes snapshots of encrypted datasets). Cloning blocks between datasets
 * that use the same keys should be possible and should be implemented in
 * the future.
 *
 * Block Cloning flow through ZFS layers.
 *
 * Note: Block Cloning can be used both for cloning file system blocks and
 * ZVOL blocks. As of this writing no interface is implemented that allows
 * for block cloning within a ZVOL.
 * FreeBSD and Linux provide the copy_file_range(2) system call and we will
 * use it for block cloning.
 *
 *	ssize_t
 *	copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
 *	    size_t len, unsigned int flags);
 *
 * Even though offsets and length represent bytes, they have to be
 * block-aligned or we will return the EXDEV error so the upper layer can
 * fall back to the generic mechanism that will just copy the data.
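 *
 * A minimal sketch of that alignment test (hypothetical variable names;
 * the real checks live in the OS-specific callers of zfs_clone_range()):
 *
 *	if (!IS_P2ALIGNED(inoff, blksz) || !IS_P2ALIGNED(outoff, blksz) ||
 *	    !IS_P2ALIGNED(len, blksz))
 *		return (SET_ERROR(EXDEV));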
 *
 * copy_file_range(2) calls into the OS-independent zfs_clone_range()
 * function. This function was implemented based on zfs_write(), but instead
 * of writing the given data we first read block pointers using the new
 * dmu_read_l0_bps() function from the source file. Once we have BPs from the
 * source file we call the dmu_brt_clone() function on the destination file.
 * This function allocates BPs for us. We iterate over all source BPs. If the
 * given BP is a hole or an embedded block, we just copy the BP as-is. If it
 * points to real data we place this BP on a BRT pending list using the
 * brt_pending_add() function.
 *
 * We use this pending list to keep track of all BPs that got new references
 * within this transaction group.
 *
 * Some special cases to consider and how we address them:
 * - The block we want to clone may have been created within the same
 *   transaction group that we are trying to clone. Such a block has no BP
 *   allocated yet, so it cannot be immediately cloned. We return EXDEV.
 * - The block we want to clone may have been modified within the same
 *   transaction group. We return EXDEV.
 * - A block may be cloned multiple times during one transaction group
 *   (that's why the pending list is actually a tree and not an append-only
 *   list - this way we can figure out faster whether this block is cloned
 *   for the first time in this txg or a consecutive time).
 * - A block may be cloned and freed within the same transaction group
 *   (see dbuf_undirty()).
 * - A block may be cloned and within the same transaction group the clone
 *   can be cloned again (see dmu_read_l0_bps()).
 * - A file might have been deleted, but the caller still has a file
 *   descriptor open to this file and clones it.
 *
 * When we free a block we have an additional step in the ZIO pipeline where
 * we call the zio_brt_free() function. We then call brt_entry_decref(),
 * which loads the corresponding BRT entry (if one exists) and decreases the
 * reference counter. If this is not the last reference we will stop the ZIO
 * pipeline here. If this is the last reference or the block is not in the
 * BRT, we continue the pipeline and free the block as usual.
 *
 * At the beginning of spa_sync(), where there can be no more block cloning,
 * but before issuing frees, we call brt_pending_apply(). This function
 * applies all the new clones to the BRT table - we load BRT entries and
 * update reference counters. To sync new BRT entries to disk, we use the
 * brt_sync() function. This function will sync all dirty per-top-level-vdev
 * BRTs, the entry counters arrays, etc.
 *
 * Block Cloning and ZIL.
 *
 * Every clone operation is divided into chunks (similar to write) and each
 * chunk is cloned in a separate transaction. The chunk size is determined by
 * how many BPs we can fit into a single ZIL entry.
 * Replaying a clone operation is different from a regular clone operation,
 * as when we log clone operations we cannot use the source object - it may
 * reside on a different dataset, so we log the BPs we want to clone.
 * The ZIL is replayed when we mount the given dataset, not when the pool is
 * imported. Taking this into account it is possible that the pool is
 * imported without mounting datasets and the source dataset is destroyed
 * before the destination dataset is mounted and its ZIL replayed.
 * To address this situation we leverage the zil_claim() mechanism where ZFS
 * will parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
 * entries, we will bump reference counters for their BPs in the BRT and then
 * on mount and ZIL replay we will just attach BPs to the file without
 * bumping reference counters.
 * Note it is still possible that after zil_claim() we never mount the
 * destination, so we never replay its ZIL and we destroy it. This way we
 * would end up with leaked references in the BRT. We address that too as
 * ZFS gives us a chance to clean this up on dataset destroy (see
 * zil_free_clone_range()).
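 *
 * In short, the claim/replay split described above (an editorial summary;
 * the exact entry points live outside this file):
 *
 *	zil_claim()  - bump BRT refcounts for TX_CLONE_RANGE BPs
 *	zil_replay() - only attach the BPs; references were already claimed
 *	zil destroy  - zil_free_clone_range() drops claimed-but-unreplayed
 *	               references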
 */

/*
 * BRT - Block Reference Table.
 */
#define	BRT_OBJECT_VDEV_PREFIX	"com.fudosecurity:brt:vdev:"

/*
 * We divide each VDEV into 16MB chunks. Each chunk is represented in memory
 * by a 16bit counter, thus a 1TB VDEV requires 128kB of memory:
 * (1TB / 16MB) * 2B. Each element in this array represents how many BRT
 * entries we have in this chunk of storage. We always load this entire array
 * into memory and update it as needed. By having it in memory we can quickly
 * tell (during zio_free()) if there are any BRT entries that we might need
 * to update.
 *
 * This value cannot be larger than 16MB, at least as long as we support
 * 512 byte block sizes. With 512 byte block size we can have exactly
 * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too
 * many for a 16bit counter.
 */
#define	BRT_RANGESIZE	(16 * 1024 * 1024)
_Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
    "BRT_RANGESIZE is too large.");
/*
 * We don't want to update the whole structure every time. Maintain a bitmap
 * of dirty blocks within the regions, so that a single bit represents a
 * block size of entcounts. For example if we have a 1PB vdev then all
 * entcounts take 128MB of memory ((1PB / 16MB) * 2B). We can divide this
 * 128MB array of entcounts into 32kB disk blocks, as we don't want to update
 * the whole 128MB on disk when we have updated only a single entcount.
 * We maintain a bitmap where each 32kB disk block within the 128MB entcounts
 * array is represented by a single bit. This gives us 4096 bits. A set bit
 * in the bitmap means that we had a change in at least one of the 16384
 * entcounts that reside on a 32kB disk block (32kB / sizeof (uint16_t)).
 */
#define	BRT_BLOCKSIZE	(32 * 1024)
/*
 * One dirty-bitmap bit per BRT_BLOCKSIZE-sized block of 16bit entcounts,
 * i.e. per BRT_BLOCKSIZE / sizeof (uint16_t) entcounts.
 */
#define	BRT_RANGESIZE_TO_NBLOCKS(size)					\
	(((size) - 1) / (BRT_BLOCKSIZE / sizeof (uint16_t)) + 1)
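
/*
 * Compile-time illustrations of the sizing math above (editorial additions,
 * not load-bearing): a 1TB vdev has 65536 16MB regions, whose 128kB of
 * entcounts span four 32kB blocks; the 1PB example above yields 4096 bits.
 */
_Static_assert(BRT_RANGESIZE_TO_NBLOCKS(65536) == 4,
    "example: 128kB of entcounts -> 4 dirty-bitmap bits");
_Static_assert(BRT_RANGESIZE_TO_NBLOCKS(67108864) == 4096,
    "example: 1PB vdev -> 128MB of entcounts -> 4096 dirty-bitmap bits");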
339 */ 340 uint64_t bv_totalcount; 341 /* 342 * Space on disk occupied by cloned blocks (without compression). 343 */ 344 uint64_t bv_usedspace; 345 /* 346 * How much additional space would be occupied without block cloning. 347 */ 348 uint64_t bv_savedspace; 349 /* 350 * brt_vdev_phys needs updating on disk. 351 */ 352 boolean_t bv_meta_dirty; 353 /* 354 * bv_entcount[] needs updating on disk. 355 */ 356 boolean_t bv_entcount_dirty; 357 /* 358 * bv_entcount[] potentially can be a bit too big to sychronize it all 359 * when we just changed few entcounts. The fields below allow us to 360 * track updates to bv_entcount[] array since the last sync. 361 * A single bit in the bv_bitmap represents as many entcounts as can 362 * fit into a single BRT_BLOCKSIZE. 363 * For example we have 65536 entcounts in the bv_entcount array 364 * (so the whole array is 128kB). We updated bv_entcount[2] and 365 * bv_entcount[5]. In that case only first bit in the bv_bitmap will 366 * be set and we will write only first BRT_BLOCKSIZE out of 128kB. 367 */ 368 ulong_t *bv_bitmap; 369 uint64_t bv_nblocks; 370 } brt_vdev_t; 371 372 /* 373 * In-core brt 374 */ 375 typedef struct brt { 376 krwlock_t brt_lock; 377 spa_t *brt_spa; 378 #define brt_mos brt_spa->spa_meta_objset 379 uint64_t brt_rangesize; 380 uint64_t brt_usedspace; 381 uint64_t brt_savedspace; 382 avl_tree_t brt_pending_tree[TXG_SIZE]; 383 kmutex_t brt_pending_lock[TXG_SIZE]; 384 /* Sum of all entries across all bv_trees. */ 385 uint64_t brt_nentries; 386 brt_vdev_t *brt_vdevs; 387 uint64_t brt_nvdevs; 388 } brt_t; 389 390 /* Size of bre_offset / sizeof (uint64_t). */ 391 #define BRT_KEY_WORDS (1) 392 393 /* 394 * In-core brt entry. 395 * On-disk we use bre_offset as the key and bre_refcount as the value. 396 */ 397 typedef struct brt_entry { 398 uint64_t bre_offset; 399 uint64_t bre_refcount; 400 avl_node_t bre_node; 401 } brt_entry_t; 402 403 typedef struct brt_pending_entry { 404 blkptr_t bpe_bp; 405 int bpe_count; 406 avl_node_t bpe_node; 407 } brt_pending_entry_t; 408 409 static kmem_cache_t *brt_entry_cache; 410 static kmem_cache_t *brt_pending_entry_cache; 411 412 /* 413 * Enable/disable prefetching of BRT entries that we are going to modify. 414 */ 415 int zfs_brt_prefetch = 1; 416 417 #ifdef ZFS_DEBUG 418 #define BRT_DEBUG(...) do { \ 419 if ((zfs_flags & ZFS_DEBUG_BRT) != 0) { \ 420 __dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \ 421 } \ 422 } while (0) 423 #else 424 #define BRT_DEBUG(...) 

typedef struct brt_pending_entry {
	blkptr_t	bpe_bp;
	int		bpe_count;
	avl_node_t	bpe_node;
} brt_pending_entry_t;

static kmem_cache_t *brt_entry_cache;
static kmem_cache_t *brt_pending_entry_cache;

/*
 * Enable/disable prefetching of BRT entries that we are going to modify.
 */
int zfs_brt_prefetch = 1;

#ifdef ZFS_DEBUG
#define	BRT_DEBUG(...)	do {						\
	if ((zfs_flags & ZFS_DEBUG_BRT) != 0) {				\
		__dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
	}								\
} while (0)
#else
#define	BRT_DEBUG(...)	do { } while (0)
#endif

int brt_zap_leaf_blockshift = 12;
int brt_zap_indirect_blockshift = 12;

static kstat_t	*brt_ksp;

typedef struct brt_stats {
	kstat_named_t brt_addref_entry_in_memory;
	kstat_named_t brt_addref_entry_not_on_disk;
	kstat_named_t brt_addref_entry_on_disk;
	kstat_named_t brt_addref_entry_read_lost_race;
	kstat_named_t brt_decref_entry_in_memory;
	kstat_named_t brt_decref_entry_loaded_from_disk;
	kstat_named_t brt_decref_entry_not_in_memory;
	kstat_named_t brt_decref_entry_not_on_disk;
	kstat_named_t brt_decref_entry_read_lost_race;
	kstat_named_t brt_decref_entry_still_referenced;
	kstat_named_t brt_decref_free_data_later;
	kstat_named_t brt_decref_free_data_now;
	kstat_named_t brt_decref_no_entry;
} brt_stats_t;

static brt_stats_t brt_stats = {
	{ "addref_entry_in_memory",		KSTAT_DATA_UINT64 },
	{ "addref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
	{ "addref_entry_on_disk",		KSTAT_DATA_UINT64 },
	{ "addref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
	{ "decref_entry_in_memory",		KSTAT_DATA_UINT64 },
	{ "decref_entry_loaded_from_disk",	KSTAT_DATA_UINT64 },
	{ "decref_entry_not_in_memory",		KSTAT_DATA_UINT64 },
	{ "decref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
	{ "decref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
	{ "decref_entry_still_referenced",	KSTAT_DATA_UINT64 },
	{ "decref_free_data_later",		KSTAT_DATA_UINT64 },
	{ "decref_free_data_now",		KSTAT_DATA_UINT64 },
	{ "decref_no_entry",			KSTAT_DATA_UINT64 }
};

struct {
	wmsum_t brt_addref_entry_in_memory;
	wmsum_t brt_addref_entry_not_on_disk;
	wmsum_t brt_addref_entry_on_disk;
	wmsum_t brt_addref_entry_read_lost_race;
	wmsum_t brt_decref_entry_in_memory;
	wmsum_t brt_decref_entry_loaded_from_disk;
	wmsum_t brt_decref_entry_not_in_memory;
	wmsum_t brt_decref_entry_not_on_disk;
	wmsum_t brt_decref_entry_read_lost_race;
	wmsum_t brt_decref_entry_still_referenced;
	wmsum_t brt_decref_free_data_later;
	wmsum_t brt_decref_free_data_now;
	wmsum_t brt_decref_no_entry;
} brt_sums;

#define	BRTSTAT_BUMP(stat)	wmsum_add(&brt_sums.stat, 1)

static int brt_entry_compare(const void *x1, const void *x2);
static int brt_pending_entry_compare(const void *x1, const void *x2);

static void
brt_rlock(brt_t *brt)
{
	rw_enter(&brt->brt_lock, RW_READER);
}

static void
brt_wlock(brt_t *brt)
{
	rw_enter(&brt->brt_lock, RW_WRITER);
}

static void
brt_unlock(brt_t *brt)
{
	rw_exit(&brt->brt_lock);
}

static uint16_t
brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
{

	ASSERT3U(idx, <, brtvd->bv_size);

	if (brtvd->bv_need_byteswap) {
		return (BSWAP_16(brtvd->bv_entcount[idx]));
	} else {
		return (brtvd->bv_entcount[idx]);
	}
}

static void
brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
{

	ASSERT3U(idx, <, brtvd->bv_size);

	if (brtvd->bv_need_byteswap) {
		brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
	} else {
		brtvd->bv_entcount[idx] = entcnt;
	}
}

static void
brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
{
	uint16_t entcnt;

	ASSERT3U(idx, <, brtvd->bv_size);

	entcnt = brt_vdev_entcount_get(brtvd, idx);
	ASSERT(entcnt < UINT16_MAX);

	brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
}

static void
brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
{
	uint16_t entcnt;

	ASSERT3U(idx, <, brtvd->bv_size);

	entcnt = brt_vdev_entcount_get(brtvd, idx);
	ASSERT(entcnt > 0);

	brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
}

#ifdef ZFS_DEBUG
static void
brt_vdev_dump(brt_t *brt)
{
	brt_vdev_t *brtvd;
	uint64_t vdevid;

	if ((zfs_flags & ZFS_DEBUG_BRT) == 0) {
		return;
	}

	if (brt->brt_nvdevs == 0) {
		zfs_dbgmsg("BRT empty");
		return;
	}

	zfs_dbgmsg("BRT vdev dump:");
	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
		uint64_t idx;

		brtvd = &brt->brt_vdevs[vdevid];
		zfs_dbgmsg("  vdevid=%llu/%llu meta_dirty=%d entcount_dirty=%d "
		    "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
		    (u_longlong_t)vdevid, (u_longlong_t)brtvd->bv_vdevid,
		    brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
		    (u_longlong_t)brtvd->bv_size,
		    (u_longlong_t)brtvd->bv_totalcount,
		    (u_longlong_t)brtvd->bv_nblocks,
		    (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
		if (brtvd->bv_totalcount > 0) {
			zfs_dbgmsg("    entcounts:");
			for (idx = 0; idx < brtvd->bv_size; idx++) {
				if (brt_vdev_entcount_get(brtvd, idx) > 0) {
					zfs_dbgmsg("      [%04llu] %hu",
					    (u_longlong_t)idx,
					    brt_vdev_entcount_get(brtvd, idx));
				}
			}
		}
		if (brtvd->bv_entcount_dirty) {
			char *bitmap;

			bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
			for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
				bitmap[idx] =
				    BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
			}
			bitmap[idx] = '\0';
			zfs_dbgmsg("    bitmap: %s", bitmap);
			kmem_free(bitmap, brtvd->bv_nblocks + 1);
		}
	}
}
#endif

static brt_vdev_t *
brt_vdev(brt_t *brt, uint64_t vdevid)
{
	brt_vdev_t *brtvd;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));

	if (vdevid < brt->brt_nvdevs) {
		brtvd = &brt->brt_vdevs[vdevid];
	} else {
		brtvd = NULL;
	}

	return (brtvd);
}

static void
brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
	char name[64];

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT0(brtvd->bv_mos_brtvdev);
	ASSERT0(brtvd->bv_mos_entries);
	ASSERT(brtvd->bv_entcount != NULL);
	ASSERT(brtvd->bv_size > 0);
	ASSERT(brtvd->bv_bitmap != NULL);
	ASSERT(brtvd->bv_nblocks > 0);

	brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0,
	    ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
	    brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE,
	    0, tx);
	VERIFY(brtvd->bv_mos_entries != 0);
	BRT_DEBUG("MOS entries created, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_entries);

	/*
	 * We allocate a DMU buffer to store the bv_entcount[] array.
	 * We will keep the array size (bv_size) and the cumulative count for
	 * all bv_entcount[]s (bv_totalcount) in the bonus buffer.
	 */
	brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos,
	    DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
	    DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
	VERIFY(brtvd->bv_mos_brtvdev != 0);
	BRT_DEBUG("MOS BRT VDEV created, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_brtvdev);

	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
	    (u_longlong_t)brtvd->bv_vdevid);
	VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
	BRT_DEBUG("Pool directory object created, object=%s", name);

	spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
}
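
/*
 * The resulting MOS layout for vdev N (an editorial illustration of what
 * brt_vdev_create() above sets up):
 *
 *	DMU_POOL_DIRECTORY_OBJECT:
 *	    "com.fudosecurity:brt:vdev:N" -> bv_mos_brtvdev
 *	bv_mos_brtvdev: uint64 array holding bv_entcount[],
 *	    bonus buffer = brt_vdev_phys_t (which records bv_mos_entries)
 *	bv_mos_entries: uint64-keyed ZAP, offset -> refcount
 */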

static void
brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd)
{
	vdev_t *vd;
	uint16_t *entcount;
	ulong_t *bitmap;
	uint64_t nblocks, size;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));

	spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid);
	size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1;
	spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG);

	entcount = kmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
	nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
	bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);

	if (!brtvd->bv_initiated) {
		ASSERT0(brtvd->bv_size);
		ASSERT(brtvd->bv_entcount == NULL);
		ASSERT(brtvd->bv_bitmap == NULL);
		ASSERT0(brtvd->bv_nblocks);

		avl_create(&brtvd->bv_tree, brt_entry_compare,
		    sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
	} else {
		ASSERT(brtvd->bv_size > 0);
		ASSERT(brtvd->bv_entcount != NULL);
		ASSERT(brtvd->bv_bitmap != NULL);
		ASSERT(brtvd->bv_nblocks > 0);
		/*
		 * TODO: Allow vdev shrinking. We only need to implement
		 * shrinking the on-disk BRT VDEV object.
		 * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset,
		 *     size, tx);
		 */
		ASSERT3U(brtvd->bv_size, <=, size);

		memcpy(entcount, brtvd->bv_entcount,
		    sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
		memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
		    BT_SIZEOFMAP(brtvd->bv_nblocks)));
		kmem_free(brtvd->bv_entcount,
		    sizeof (entcount[0]) * brtvd->bv_size);
		kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
	}

	brtvd->bv_size = size;
	brtvd->bv_entcount = entcount;
	brtvd->bv_bitmap = bitmap;
	brtvd->bv_nblocks = nblocks;
	if (!brtvd->bv_initiated) {
		brtvd->bv_need_byteswap = FALSE;
		brtvd->bv_initiated = TRUE;
		BRT_DEBUG("BRT VDEV %llu initiated.",
		    (u_longlong_t)brtvd->bv_vdevid);
	}
}

static void
brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd)
{
	char name[64];
	dmu_buf_t *db;
	brt_vdev_phys_t *bvphys;
	int error;

	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
	    (u_longlong_t)brtvd->bv_vdevid);
	error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev);
	if (error != 0)
		return;
	ASSERT(brtvd->bv_mos_brtvdev != 0);

	error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db);
	ASSERT0(error);
	if (error != 0)
		return;

	bvphys = db->db_data;
	if (brt->brt_rangesize == 0) {
		brt->brt_rangesize = bvphys->bvp_rangesize;
	} else {
		ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize);
	}

	ASSERT(!brtvd->bv_initiated);
	brt_vdev_realloc(brt, brtvd);

	/* TODO: We don't support VDEV shrinking. */
	ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);

	/*
	 * If the VDEV grew, we will leave new bv_entcount[] entries zeroed
	 * out.
	 */
	error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
	    MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
	    brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
	ASSERT0(error);

	brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
	ASSERT(brtvd->bv_mos_entries != 0);
	brtvd->bv_need_byteswap =
	    (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
	brtvd->bv_totalcount = bvphys->bvp_totalcount;
	brtvd->bv_usedspace = bvphys->bvp_usedspace;
	brtvd->bv_savedspace = bvphys->bvp_savedspace;
	brt->brt_usedspace += brtvd->bv_usedspace;
	brt->brt_savedspace += brtvd->bv_savedspace;

	dmu_buf_rele(db, FTAG);

	BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu",
	    name, (u_longlong_t)brtvd->bv_mos_brtvdev,
	    (u_longlong_t)brtvd->bv_mos_entries);
}

static void
brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd)
{

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_initiated);

	kmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
	brtvd->bv_entcount = NULL;
	kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
	brtvd->bv_bitmap = NULL;
	ASSERT0(avl_numnodes(&brtvd->bv_tree));
	avl_destroy(&brtvd->bv_tree);

	brtvd->bv_size = 0;
	brtvd->bv_nblocks = 0;

	brtvd->bv_initiated = FALSE;
	BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
}

static void
brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
	char name[64];
	uint64_t count;
	dmu_buf_t *db;
	brt_vdev_phys_t *bvphys;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_mos_brtvdev != 0);
	ASSERT(brtvd->bv_mos_entries != 0);

	VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count));
	VERIFY0(count);
	VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx));
	BRT_DEBUG("MOS entries destroyed, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_entries);
	brtvd->bv_mos_entries = 0;

	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
	bvphys = db->db_data;
	ASSERT0(bvphys->bvp_totalcount);
	ASSERT0(bvphys->bvp_usedspace);
	ASSERT0(bvphys->bvp_savedspace);
	dmu_buf_rele(db, FTAG);

	VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx));
	BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_brtvdev);
	brtvd->bv_mos_brtvdev = 0;

	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
	    (u_longlong_t)brtvd->bv_vdevid);
	VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx));
	BRT_DEBUG("Pool directory object removed, object=%s", name);

	brt_vdev_dealloc(brt, brtvd);

	spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
}

static void
brt_vdevs_expand(brt_t *brt, uint64_t nvdevs)
{
	brt_vdev_t *brtvd, *vdevs;
	uint64_t vdevid;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT3U(nvdevs, >, brt->brt_nvdevs);

	vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP);
	if (brt->brt_nvdevs > 0) {
		ASSERT(brt->brt_vdevs != NULL);

		memcpy(vdevs, brt->brt_vdevs,
		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
		kmem_free(brt->brt_vdevs,
		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
	}
	for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) {
		brtvd = &vdevs[vdevid];

		brtvd->bv_vdevid = vdevid;
		brtvd->bv_initiated = FALSE;
	}

	BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
	    (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs);

	brt->brt_vdevs = vdevs;
	brt->brt_nvdevs = nvdevs;
}

static boolean_t
brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre)
{
	uint64_t idx;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));

	idx = bre->bre_offset / brt->brt_rangesize;
	if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) {
		/* VDEV wasn't expanded. */
		return (brt_vdev_entcount_get(brtvd, idx) > 0);
	}

	return (FALSE);
}

static void
brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
    uint64_t dsize)
{
	uint64_t idx;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
	ASSERT(brtvd != NULL);
	ASSERT(brtvd->bv_entcount != NULL);

	brt->brt_savedspace += dsize;
	brtvd->bv_savedspace += dsize;
	brtvd->bv_meta_dirty = TRUE;

	if (bre->bre_refcount > 1) {
		return;
	}

	brt->brt_usedspace += dsize;
	brtvd->bv_usedspace += dsize;

	idx = bre->bre_offset / brt->brt_rangesize;
	if (idx >= brtvd->bv_size) {
		/* VDEV has been expanded. */
		brt_vdev_realloc(brt, brtvd);
	}

	ASSERT3U(idx, <, brtvd->bv_size);

	brtvd->bv_totalcount++;
	brt_vdev_entcount_inc(brtvd, idx);
	brtvd->bv_entcount_dirty = TRUE;
	/* Mark the BRT_BLOCKSIZE-sized block holding this entcount dirty. */
	idx = idx / (BRT_BLOCKSIZE / sizeof (uint16_t));
	BT_SET(brtvd->bv_bitmap, idx);

#ifdef ZFS_DEBUG
	brt_vdev_dump(brt);
#endif
}

static void
brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
    uint64_t dsize)
{
	uint64_t idx;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd != NULL);
	ASSERT(brtvd->bv_entcount != NULL);

	brt->brt_savedspace -= dsize;
	brtvd->bv_savedspace -= dsize;
	brtvd->bv_meta_dirty = TRUE;

	if (bre->bre_refcount > 0) {
		return;
	}

	brt->brt_usedspace -= dsize;
	brtvd->bv_usedspace -= dsize;

	idx = bre->bre_offset / brt->brt_rangesize;
	ASSERT3U(idx, <, brtvd->bv_size);

	ASSERT(brtvd->bv_totalcount > 0);
	brtvd->bv_totalcount--;
	brt_vdev_entcount_dec(brtvd, idx);
	brtvd->bv_entcount_dirty = TRUE;
	/* Mark the BRT_BLOCKSIZE-sized block holding this entcount dirty. */
	idx = idx / (BRT_BLOCKSIZE / sizeof (uint16_t));
	BT_SET(brtvd->bv_bitmap, idx);

#ifdef ZFS_DEBUG
	brt_vdev_dump(brt);
#endif
}
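
/*
 * Example of the dirty-bitmap index math used above (editorial): with 32kB
 * blocks of 16bit entcounts there are 16384 entcounts per bit, so updating
 * entcount index 20000 sets bit 20000 / 16384 = 1, and brt_vdev_sync()
 * could (once partial updates are implemented) rewrite only the second 32kB
 * block of the on-disk array.
 */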
989 */ 990 dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0, 991 brtvd->bv_size * sizeof (brtvd->bv_entcount[0]), 992 brtvd->bv_entcount, tx); 993 memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks)); 994 brtvd->bv_entcount_dirty = FALSE; 995 } 996 997 dmu_buf_will_dirty(db, tx); 998 bvphys = db->db_data; 999 bvphys->bvp_mos_entries = brtvd->bv_mos_entries; 1000 bvphys->bvp_size = brtvd->bv_size; 1001 if (brtvd->bv_need_byteswap) { 1002 bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER; 1003 } else { 1004 bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER; 1005 } 1006 bvphys->bvp_totalcount = brtvd->bv_totalcount; 1007 bvphys->bvp_rangesize = brt->brt_rangesize; 1008 bvphys->bvp_usedspace = brtvd->bv_usedspace; 1009 bvphys->bvp_savedspace = brtvd->bv_savedspace; 1010 dmu_buf_rele(db, FTAG); 1011 1012 brtvd->bv_meta_dirty = FALSE; 1013 } 1014 1015 static void 1016 brt_vdevs_alloc(brt_t *brt, boolean_t load) 1017 { 1018 brt_vdev_t *brtvd; 1019 uint64_t vdevid; 1020 1021 brt_wlock(brt); 1022 1023 brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children); 1024 1025 if (load) { 1026 for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { 1027 brtvd = &brt->brt_vdevs[vdevid]; 1028 ASSERT(brtvd->bv_entcount == NULL); 1029 1030 brt_vdev_load(brt, brtvd); 1031 } 1032 } 1033 1034 if (brt->brt_rangesize == 0) { 1035 brt->brt_rangesize = BRT_RANGESIZE; 1036 } 1037 1038 brt_unlock(brt); 1039 } 1040 1041 static void 1042 brt_vdevs_free(brt_t *brt) 1043 { 1044 brt_vdev_t *brtvd; 1045 uint64_t vdevid; 1046 1047 brt_wlock(brt); 1048 1049 for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { 1050 brtvd = &brt->brt_vdevs[vdevid]; 1051 if (brtvd->bv_initiated) 1052 brt_vdev_dealloc(brt, brtvd); 1053 } 1054 kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs); 1055 1056 brt_unlock(brt); 1057 } 1058 1059 static void 1060 brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp) 1061 { 1062 1063 bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]); 1064 bre->bre_refcount = 0; 1065 1066 *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]); 1067 } 1068 1069 static int 1070 brt_entry_compare(const void *x1, const void *x2) 1071 { 1072 const brt_entry_t *bre1 = x1; 1073 const brt_entry_t *bre2 = x2; 1074 1075 return (TREE_CMP(bre1->bre_offset, bre2->bre_offset)); 1076 } 1077 1078 static int 1079 brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre) 1080 { 1081 uint64_t mos_entries; 1082 uint64_t one, physsize; 1083 int error; 1084 1085 ASSERT(RW_LOCK_HELD(&brt->brt_lock)); 1086 1087 if (!brt_vdev_lookup(brt, brtvd, bre)) 1088 return (SET_ERROR(ENOENT)); 1089 1090 /* 1091 * Remember mos_entries object number. After we reacquire the BRT lock, 1092 * the brtvd pointer may be invalid. 1093 */ 1094 mos_entries = brtvd->bv_mos_entries; 1095 if (mos_entries == 0) 1096 return (SET_ERROR(ENOENT)); 1097 1098 brt_unlock(brt); 1099 1100 error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, 1101 BRT_KEY_WORDS, &one, &physsize); 1102 if (error == 0) { 1103 ASSERT3U(one, ==, 1); 1104 ASSERT3U(physsize, ==, sizeof (bre->bre_refcount)); 1105 1106 error = zap_lookup_uint64(brt->brt_mos, mos_entries, 1107 &bre->bre_offset, BRT_KEY_WORDS, 1, 1108 sizeof (bre->bre_refcount), &bre->bre_refcount); 1109 BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu " 1110 "count=%llu error=%d", (u_longlong_t)mos_entries, 1111 (u_longlong_t)brtvd->bv_vdevid, 1112 (u_longlong_t)bre->bre_offset, 1113 error == 0 ? 

static void
brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
{
	brt_vdev_t *brtvd;
	uint64_t mos_entries = 0;

	brt_rlock(brt);
	brtvd = brt_vdev(brt, vdevid);
	if (brtvd != NULL)
		mos_entries = brtvd->bv_mos_entries;
	brt_unlock(brt);

	if (mos_entries == 0)
		return;

	BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu",
	    (u_longlong_t)mos_entries, (u_longlong_t)vdevid,
	    (u_longlong_t)bre->bre_offset);
	(void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
}

static int
brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
{
	int error;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_mos_entries != 0);
	ASSERT(bre->bre_refcount > 0);

	error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries,
	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1,
	    sizeof (bre->bre_refcount), &bre->bre_refcount, tx);
	BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu "
	    "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
	    (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
	    (u_longlong_t)bre->bre_refcount, error);

	return (error);
}

static int
brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
{
	int error;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_mos_entries != 0);
	ASSERT0(bre->bre_refcount);

	error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries,
	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx);
	BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu "
	    "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
	    (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
	    (u_longlong_t)bre->bre_refcount, error);

	return (error);
}

/*
 * Return TRUE if we _can_ have a BRT entry for this bp. It might be a false
 * positive, but it gives us a quick answer whether we should look into the
 * BRT, which may require reads and thus be more expensive.
 */
1186 */ 1187 boolean_t 1188 brt_maybe_exists(spa_t *spa, const blkptr_t *bp) 1189 { 1190 brt_t *brt = spa->spa_brt; 1191 brt_vdev_t *brtvd; 1192 brt_entry_t bre_search; 1193 boolean_t mayexists = FALSE; 1194 uint64_t vdevid; 1195 1196 brt_entry_fill(bp, &bre_search, &vdevid); 1197 1198 brt_rlock(brt); 1199 1200 brtvd = brt_vdev(brt, vdevid); 1201 if (brtvd != NULL && brtvd->bv_initiated) { 1202 if (!avl_is_empty(&brtvd->bv_tree) || 1203 brt_vdev_lookup(brt, brtvd, &bre_search)) { 1204 mayexists = TRUE; 1205 } 1206 } 1207 1208 brt_unlock(brt); 1209 1210 return (mayexists); 1211 } 1212 1213 uint64_t 1214 brt_get_dspace(spa_t *spa) 1215 { 1216 brt_t *brt = spa->spa_brt; 1217 1218 if (brt == NULL) 1219 return (0); 1220 1221 return (brt->brt_savedspace); 1222 } 1223 1224 uint64_t 1225 brt_get_used(spa_t *spa) 1226 { 1227 brt_t *brt = spa->spa_brt; 1228 1229 if (brt == NULL) 1230 return (0); 1231 1232 return (brt->brt_usedspace); 1233 } 1234 1235 uint64_t 1236 brt_get_saved(spa_t *spa) 1237 { 1238 brt_t *brt = spa->spa_brt; 1239 1240 if (brt == NULL) 1241 return (0); 1242 1243 return (brt->brt_savedspace); 1244 } 1245 1246 uint64_t 1247 brt_get_ratio(spa_t *spa) 1248 { 1249 brt_t *brt = spa->spa_brt; 1250 1251 if (brt->brt_usedspace == 0) 1252 return (100); 1253 1254 return ((brt->brt_usedspace + brt->brt_savedspace) * 100 / 1255 brt->brt_usedspace); 1256 } 1257 1258 static int 1259 brt_kstats_update(kstat_t *ksp, int rw) 1260 { 1261 brt_stats_t *bs = ksp->ks_data; 1262 1263 if (rw == KSTAT_WRITE) 1264 return (EACCES); 1265 1266 bs->brt_addref_entry_in_memory.value.ui64 = 1267 wmsum_value(&brt_sums.brt_addref_entry_in_memory); 1268 bs->brt_addref_entry_not_on_disk.value.ui64 = 1269 wmsum_value(&brt_sums.brt_addref_entry_not_on_disk); 1270 bs->brt_addref_entry_on_disk.value.ui64 = 1271 wmsum_value(&brt_sums.brt_addref_entry_on_disk); 1272 bs->brt_addref_entry_read_lost_race.value.ui64 = 1273 wmsum_value(&brt_sums.brt_addref_entry_read_lost_race); 1274 bs->brt_decref_entry_in_memory.value.ui64 = 1275 wmsum_value(&brt_sums.brt_decref_entry_in_memory); 1276 bs->brt_decref_entry_loaded_from_disk.value.ui64 = 1277 wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk); 1278 bs->brt_decref_entry_not_in_memory.value.ui64 = 1279 wmsum_value(&brt_sums.brt_decref_entry_not_in_memory); 1280 bs->brt_decref_entry_not_on_disk.value.ui64 = 1281 wmsum_value(&brt_sums.brt_decref_entry_not_on_disk); 1282 bs->brt_decref_entry_read_lost_race.value.ui64 = 1283 wmsum_value(&brt_sums.brt_decref_entry_read_lost_race); 1284 bs->brt_decref_entry_still_referenced.value.ui64 = 1285 wmsum_value(&brt_sums.brt_decref_entry_still_referenced); 1286 bs->brt_decref_free_data_later.value.ui64 = 1287 wmsum_value(&brt_sums.brt_decref_free_data_later); 1288 bs->brt_decref_free_data_now.value.ui64 = 1289 wmsum_value(&brt_sums.brt_decref_free_data_now); 1290 bs->brt_decref_no_entry.value.ui64 = 1291 wmsum_value(&brt_sums.brt_decref_no_entry); 1292 1293 return (0); 1294 } 1295 1296 static void 1297 brt_stat_init(void) 1298 { 1299 1300 wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0); 1301 wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0); 1302 wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0); 1303 wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0); 1304 wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0); 1305 wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0); 1306 wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0); 1307 wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0); 1308 

uint64_t
brt_get_dspace(spa_t *spa)
{
	brt_t *brt = spa->spa_brt;

	if (brt == NULL)
		return (0);

	return (brt->brt_savedspace);
}

uint64_t
brt_get_used(spa_t *spa)
{
	brt_t *brt = spa->spa_brt;

	if (brt == NULL)
		return (0);

	return (brt->brt_usedspace);
}

uint64_t
brt_get_saved(spa_t *spa)
{
	brt_t *brt = spa->spa_brt;

	if (brt == NULL)
		return (0);

	return (brt->brt_savedspace);
}

uint64_t
brt_get_ratio(spa_t *spa)
{
	brt_t *brt = spa->spa_brt;

	if (brt->brt_usedspace == 0)
		return (100);

	return ((brt->brt_usedspace + brt->brt_savedspace) * 100 /
	    brt->brt_usedspace);
}

static int
brt_kstats_update(kstat_t *ksp, int rw)
{
	brt_stats_t *bs = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	bs->brt_addref_entry_in_memory.value.ui64 =
	    wmsum_value(&brt_sums.brt_addref_entry_in_memory);
	bs->brt_addref_entry_not_on_disk.value.ui64 =
	    wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
	bs->brt_addref_entry_on_disk.value.ui64 =
	    wmsum_value(&brt_sums.brt_addref_entry_on_disk);
	bs->brt_addref_entry_read_lost_race.value.ui64 =
	    wmsum_value(&brt_sums.brt_addref_entry_read_lost_race);
	bs->brt_decref_entry_in_memory.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_in_memory);
	bs->brt_decref_entry_loaded_from_disk.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
	bs->brt_decref_entry_not_in_memory.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
	bs->brt_decref_entry_not_on_disk.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_not_on_disk);
	bs->brt_decref_entry_read_lost_race.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
	bs->brt_decref_entry_still_referenced.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
	bs->brt_decref_free_data_later.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_free_data_later);
	bs->brt_decref_free_data_now.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_free_data_now);
	bs->brt_decref_no_entry.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_no_entry);

	return (0);
}

static void
brt_stat_init(void)
{

	wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0);
	wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
	wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
	wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0);
	wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
	wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
	wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
	wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0);
	wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
	wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
	wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
	wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
	wmsum_init(&brt_sums.brt_decref_no_entry, 0);

	brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (brt_ksp != NULL) {
		brt_ksp->ks_data = &brt_stats;
		brt_ksp->ks_update = brt_kstats_update;
		kstat_install(brt_ksp);
	}
}

static void
brt_stat_fini(void)
{
	if (brt_ksp != NULL) {
		kstat_delete(brt_ksp);
		brt_ksp = NULL;
	}

	wmsum_fini(&brt_sums.brt_addref_entry_in_memory);
	wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
	wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
	wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race);
	wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
	wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
	wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
	wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk);
	wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
	wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
	wmsum_fini(&brt_sums.brt_decref_free_data_later);
	wmsum_fini(&brt_sums.brt_decref_free_data_now);
	wmsum_fini(&brt_sums.brt_decref_no_entry);
}

void
brt_init(void)
{
	brt_entry_cache = kmem_cache_create("brt_entry_cache",
	    sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache",
	    sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	brt_stat_init();
}

void
brt_fini(void)
{
	brt_stat_fini();

	kmem_cache_destroy(brt_entry_cache);
	kmem_cache_destroy(brt_pending_entry_cache);
}

static brt_entry_t *
brt_entry_alloc(const brt_entry_t *bre_init)
{
	brt_entry_t *bre;

	bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
	bre->bre_offset = bre_init->bre_offset;
	bre->bre_refcount = bre_init->bre_refcount;

	return (bre);
}

static void
brt_entry_free(brt_entry_t *bre)
{

	kmem_cache_free(brt_entry_cache, bre);
}
1420 */ 1421 error = brt_entry_lookup(brt, brtvd, &bre_search); 1422 /* bre_search now contains correct bre_refcount */ 1423 ASSERT(error == 0 || error == ENOENT); 1424 if (error == 0) 1425 BRTSTAT_BUMP(brt_addref_entry_on_disk); 1426 else 1427 BRTSTAT_BUMP(brt_addref_entry_not_on_disk); 1428 /* 1429 * When the BRT lock was dropped, brt_vdevs[] may have been 1430 * expanded and reallocated, we need to update brtvd's pointer. 1431 */ 1432 brtvd = brt_vdev(brt, vdevid); 1433 ASSERT(brtvd != NULL); 1434 1435 racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); 1436 if (racebre == NULL) { 1437 bre = brt_entry_alloc(&bre_search); 1438 ASSERT(RW_WRITE_HELD(&brt->brt_lock)); 1439 avl_insert(&brtvd->bv_tree, bre, where); 1440 brt->brt_nentries++; 1441 } else { 1442 /* 1443 * The entry was added when the BRT lock was dropped in 1444 * brt_entry_lookup(). 1445 */ 1446 BRTSTAT_BUMP(brt_addref_entry_read_lost_race); 1447 bre = racebre; 1448 } 1449 } 1450 bre->bre_refcount++; 1451 brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp)); 1452 1453 brt_unlock(brt); 1454 } 1455 1456 /* Return TRUE if block should be freed immediately. */ 1457 boolean_t 1458 brt_entry_decref(spa_t *spa, const blkptr_t *bp) 1459 { 1460 brt_t *brt = spa->spa_brt; 1461 brt_vdev_t *brtvd; 1462 brt_entry_t *bre, *racebre; 1463 brt_entry_t bre_search; 1464 avl_index_t where; 1465 uint64_t vdevid; 1466 int error; 1467 1468 brt_entry_fill(bp, &bre_search, &vdevid); 1469 1470 brt_wlock(brt); 1471 1472 brtvd = brt_vdev(brt, vdevid); 1473 ASSERT(brtvd != NULL); 1474 1475 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); 1476 if (bre != NULL) { 1477 BRTSTAT_BUMP(brt_decref_entry_in_memory); 1478 goto out; 1479 } else { 1480 BRTSTAT_BUMP(brt_decref_entry_not_in_memory); 1481 } 1482 1483 /* 1484 * brt_entry_lookup() may drop the BRT lock and reacquire it. 1485 */ 1486 error = brt_entry_lookup(brt, brtvd, &bre_search); 1487 /* bre_search now contains correct bre_refcount */ 1488 ASSERT(error == 0 || error == ENOENT); 1489 /* 1490 * When the BRT lock was dropped, brt_vdevs[] may have been expanded 1491 * and reallocated, we need to update brtvd's pointer. 1492 */ 1493 brtvd = brt_vdev(brt, vdevid); 1494 ASSERT(brtvd != NULL); 1495 1496 if (error == ENOENT) { 1497 BRTSTAT_BUMP(brt_decref_entry_not_on_disk); 1498 bre = NULL; 1499 goto out; 1500 } 1501 1502 racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); 1503 if (racebre != NULL) { 1504 /* 1505 * The entry was added when the BRT lock was dropped in 1506 * brt_entry_lookup(). 1507 */ 1508 BRTSTAT_BUMP(brt_decref_entry_read_lost_race); 1509 bre = racebre; 1510 goto out; 1511 } 1512 1513 BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk); 1514 bre = brt_entry_alloc(&bre_search); 1515 ASSERT(RW_WRITE_HELD(&brt->brt_lock)); 1516 avl_insert(&brtvd->bv_tree, bre, where); 1517 brt->brt_nentries++; 1518 1519 out: 1520 if (bre == NULL) { 1521 /* 1522 * This is a free of a regular (not cloned) block. 
1523 */ 1524 brt_unlock(brt); 1525 BRTSTAT_BUMP(brt_decref_no_entry); 1526 return (B_TRUE); 1527 } 1528 if (bre->bre_refcount == 0) { 1529 brt_unlock(brt); 1530 BRTSTAT_BUMP(brt_decref_free_data_now); 1531 return (B_TRUE); 1532 } 1533 1534 ASSERT(bre->bre_refcount > 0); 1535 bre->bre_refcount--; 1536 if (bre->bre_refcount == 0) 1537 BRTSTAT_BUMP(brt_decref_free_data_later); 1538 else 1539 BRTSTAT_BUMP(brt_decref_entry_still_referenced); 1540 brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp)); 1541 1542 brt_unlock(brt); 1543 1544 return (B_FALSE); 1545 } 1546 1547 static void 1548 brt_prefetch(brt_t *brt, const blkptr_t *bp) 1549 { 1550 brt_entry_t bre; 1551 uint64_t vdevid; 1552 1553 ASSERT(bp != NULL); 1554 1555 if (!zfs_brt_prefetch) 1556 return; 1557 1558 brt_entry_fill(bp, &bre, &vdevid); 1559 1560 brt_entry_prefetch(brt, vdevid, &bre); 1561 } 1562 1563 static int 1564 brt_pending_entry_compare(const void *x1, const void *x2) 1565 { 1566 const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2; 1567 const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp; 1568 int cmp; 1569 1570 cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2)); 1571 if (cmp == 0) { 1572 cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]), 1573 DVA_GET_VDEV(&bp2->blk_dva[0])); 1574 if (cmp == 0) { 1575 cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), 1576 DVA_GET_OFFSET(&bp2->blk_dva[0])); 1577 } 1578 } 1579 1580 return (cmp); 1581 } 1582 1583 void 1584 brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) 1585 { 1586 brt_t *brt; 1587 avl_tree_t *pending_tree; 1588 kmutex_t *pending_lock; 1589 brt_pending_entry_t *bpe, *newbpe; 1590 avl_index_t where; 1591 uint64_t txg; 1592 1593 brt = spa->spa_brt; 1594 txg = dmu_tx_get_txg(tx); 1595 ASSERT3U(txg, !=, 0); 1596 pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; 1597 pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; 1598 1599 newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP); 1600 newbpe->bpe_bp = *bp; 1601 newbpe->bpe_count = 1; 1602 1603 mutex_enter(pending_lock); 1604 1605 bpe = avl_find(pending_tree, newbpe, &where); 1606 if (bpe == NULL) { 1607 avl_insert(pending_tree, newbpe, where); 1608 newbpe = NULL; 1609 } else { 1610 bpe->bpe_count++; 1611 } 1612 1613 mutex_exit(pending_lock); 1614 1615 if (newbpe != NULL) { 1616 ASSERT(bpe != NULL); 1617 ASSERT(bpe != newbpe); 1618 kmem_cache_free(brt_pending_entry_cache, newbpe); 1619 } else { 1620 ASSERT(bpe == NULL); 1621 } 1622 1623 /* Prefetch BRT entry, as we will need it in the syncing context. */ 1624 brt_prefetch(brt, bp); 1625 } 1626 1627 void 1628 brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) 1629 { 1630 brt_t *brt; 1631 avl_tree_t *pending_tree; 1632 kmutex_t *pending_lock; 1633 brt_pending_entry_t *bpe, bpe_search; 1634 uint64_t txg; 1635 1636 brt = spa->spa_brt; 1637 txg = dmu_tx_get_txg(tx); 1638 ASSERT3U(txg, !=, 0); 1639 pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; 1640 pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; 1641 1642 bpe_search.bpe_bp = *bp; 1643 1644 mutex_enter(pending_lock); 1645 1646 bpe = avl_find(pending_tree, &bpe_search, NULL); 1647 /* I believe we should always find bpe when this function is called. 

void
brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
{
	brt_t *brt;
	avl_tree_t *pending_tree;
	kmutex_t *pending_lock;
	brt_pending_entry_t *bpe, bpe_search;
	uint64_t txg;

	brt = spa->spa_brt;
	txg = dmu_tx_get_txg(tx);
	ASSERT3U(txg, !=, 0);
	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];

	bpe_search.bpe_bp = *bp;

	mutex_enter(pending_lock);

	bpe = avl_find(pending_tree, &bpe_search, NULL);
	/* I believe we should always find bpe when this function is called. */
	if (bpe != NULL) {
		ASSERT(bpe->bpe_count > 0);

		bpe->bpe_count--;
		if (bpe->bpe_count == 0) {
			avl_remove(pending_tree, bpe);
			kmem_cache_free(brt_pending_entry_cache, bpe);
		}
	}

	mutex_exit(pending_lock);
}

void
brt_pending_apply(spa_t *spa, uint64_t txg)
{
	brt_t *brt;
	brt_pending_entry_t *bpe;
	avl_tree_t *pending_tree;
	kmutex_t *pending_lock;
	void *c;

	ASSERT3U(txg, !=, 0);

	brt = spa->spa_brt;
	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];

	mutex_enter(pending_lock);

	c = NULL;
	while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) {
		boolean_t added_to_ddt;

		mutex_exit(pending_lock);

		for (int i = 0; i < bpe->bpe_count; i++) {
			/*
			 * If the block has the DEDUP bit set, it means that
			 * it already exists in the DEDUP table, so we can
			 * just use that instead of creating a new entry in
			 * the BRT table.
			 */
			if (BP_GET_DEDUP(&bpe->bpe_bp)) {
				added_to_ddt = ddt_addref(spa, &bpe->bpe_bp);
			} else {
				added_to_ddt = B_FALSE;
			}
			if (!added_to_ddt)
				brt_entry_addref(brt, &bpe->bpe_bp);
		}

		kmem_cache_free(brt_pending_entry_cache, bpe);
		mutex_enter(pending_lock);
	}

	mutex_exit(pending_lock);
}
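
/*
 * Ordering note (editorial restatement of the design comment above):
 * spa_sync() calls brt_pending_apply() before it issues any frees for the
 * txg, so every new reference is counted before brt_entry_decref() can run,
 * and brt_sync() below later writes the dirty BRT state out in the same txg.
 */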

static void
brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
{

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_mos_entries != 0);

	if (bre->bre_refcount == 0) {
		int error;

		error = brt_entry_remove(brt, brtvd, bre, tx);
		ASSERT(error == 0 || error == ENOENT);
		/*
		 * If error == ENOENT then zfs_clone_range() was done from a
		 * removed (but still open) file (open(), unlink()).
		 */
		ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT);
	} else {
		VERIFY0(brt_entry_update(brt, brtvd, bre, tx));
	}
}

static void
brt_sync_table(brt_t *brt, dmu_tx_t *tx)
{
	brt_vdev_t *brtvd;
	brt_entry_t *bre;
	uint64_t vdevid;
	void *c;

	brt_wlock(brt);

	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
		brtvd = &brt->brt_vdevs[vdevid];

		if (!brtvd->bv_initiated)
			continue;

		if (!brtvd->bv_meta_dirty) {
			ASSERT(!brtvd->bv_entcount_dirty);
			ASSERT0(avl_numnodes(&brtvd->bv_tree));
			continue;
		}

		ASSERT(!brtvd->bv_entcount_dirty ||
		    avl_numnodes(&brtvd->bv_tree) != 0);

		if (brtvd->bv_mos_brtvdev == 0)
			brt_vdev_create(brt, brtvd, tx);

		c = NULL;
		while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
			brt_sync_entry(brt, brtvd, bre, tx);
			brt_entry_free(bre);
			ASSERT(brt->brt_nentries > 0);
			brt->brt_nentries--;
		}

		brt_vdev_sync(brt, brtvd, tx);

		if (brtvd->bv_totalcount == 0)
			brt_vdev_destroy(brt, brtvd, tx);
	}

	ASSERT0(brt->brt_nentries);

	brt_unlock(brt);
}

void
brt_sync(spa_t *spa, uint64_t txg)
{
	dmu_tx_t *tx;
	brt_t *brt;

	ASSERT(spa_syncing_txg(spa) == txg);

	brt = spa->spa_brt;
	brt_rlock(brt);
	if (brt->brt_nentries == 0) {
		/* No changes. */
		brt_unlock(brt);
		return;
	}
	brt_unlock(brt);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	brt_sync_table(brt, tx);

	dmu_tx_commit(tx);
}

static void
brt_table_alloc(brt_t *brt)
{

	for (int i = 0; i < TXG_SIZE; i++) {
		avl_create(&brt->brt_pending_tree[i],
		    brt_pending_entry_compare,
		    sizeof (brt_pending_entry_t),
		    offsetof(brt_pending_entry_t, bpe_node));
		mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT,
		    NULL);
	}
}

static void
brt_table_free(brt_t *brt)
{

	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT(avl_is_empty(&brt->brt_pending_tree[i]));

		avl_destroy(&brt->brt_pending_tree[i]);
		mutex_destroy(&brt->brt_pending_lock[i]);
	}
}

static void
brt_alloc(spa_t *spa)
{
	brt_t *brt;

	ASSERT(spa->spa_brt == NULL);

	brt = kmem_zalloc(sizeof (*brt), KM_SLEEP);
	rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL);
	brt->brt_spa = spa;
	brt->brt_rangesize = 0;
	brt->brt_nentries = 0;
	brt->brt_vdevs = NULL;
	brt->brt_nvdevs = 0;
	brt_table_alloc(brt);

	spa->spa_brt = brt;
}

void
brt_create(spa_t *spa)
{

	brt_alloc(spa);
	brt_vdevs_alloc(spa->spa_brt, B_FALSE);
}

int
brt_load(spa_t *spa)
{

	brt_alloc(spa);
	brt_vdevs_alloc(spa->spa_brt, B_TRUE);

	return (0);
}

void
brt_unload(spa_t *spa)
{
	brt_t *brt = spa->spa_brt;

	if (brt == NULL)
		return;

	brt_vdevs_free(brt);
	brt_table_free(brt);
	rw_destroy(&brt->brt_lock);
	kmem_free(brt, sizeof (*brt));
	spa->spa_brt = NULL;
}

/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW,
	"Enable prefetching of BRT entries");
#ifdef ZFS_BRT_DEBUG
ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug");
#endif
/* END CSTYLED */