/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/brt.h>
#include <sys/brt_impl.h>
#include <sys/ddt.h>
#include <sys/bitmap.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/vdev_impl.h>
#include <sys/kstat.h>
#include <sys/wmsum.h>

/*
 * Block Cloning design.
 *
 * Block Cloning allows a file (or a subset of its blocks) to be manually
 * cloned into another (or the same) file by just creating additional
 * references to the data blocks without copying the data itself. Those
 * references are kept in the Block Reference Tables (BRTs).
 *
 * In many ways this is similar to the existing deduplication, but there are
 * some important differences:
 *
 * - Deduplication is automatic and Block Cloning is not - one has to use
 *   dedicated system call(s) to clone the given file/blocks.
 * - Deduplication keeps all data blocks in its table, even those referenced
 *   just once. Block Cloning creates an entry in its tables only when there
 *   are at least two references to the given data block. If the block was
 *   never explicitly cloned or the second to last reference was dropped,
 *   there will be neither space nor performance overhead.
 * - Deduplication needs data to work - one needs to pass real data to the
 *   write(2) syscall, so the hash can be calculated. Block Cloning doesn't
 *   require data, just block pointers to the data, so it is extremely fast,
 *   as we pay neither the cost of reading the data, nor the cost of writing
 *   the data - we operate exclusively on metadata.
 * - If the D (dedup) bit is not set in the block pointer, it means that
 *   the block is not in the dedup table (DDT) and we won't consult the DDT
 *   when we need to free the block. Block Cloning must be consulted on every
 *   free, because we cannot modify the source BP (eg. by setting something
 *   similar to the D bit), thus we have no hint if the block is in the
 *   Block Reference Table (BRT), so we need to look into the BRT. There is
 *   an optimization in place that allows us to eliminate the majority of BRT
 *   lookups, which is described below in the "Minimizing free penalty"
 *   section.
 * - The BRT entry is much smaller than the DDT entry - for BRT we only store
 *   a 64bit offset and a 64bit reference counter.
 * - Dedup keys are cryptographic hashes, so two blocks that are close to each
 *   other on disk are most likely in totally different parts of the DDT.
 *   The BRT entry keys are offsets into a single top-level VDEV, so data
 *   blocks from one file should have BRT entries close to each other.
 * - Scrub will only do a single pass over a block that is referenced multiple
 *   times in the DDT. Unfortunately it is not currently (if at all) possible
 *   with Block Cloning and a block referenced multiple times will be scrubbed
 *   multiple times. The new, sorted scrub should be able to eliminate
 *   duplicated reads given enough memory.
 * - Deduplication requires a cryptographically strong hash as a checksum or
 *   additional data verification. Block Cloning works with any checksum
 *   algorithm or even with checksumming disabled.
 *
 * As mentioned above, the BRT entries are much smaller than the DDT entries.
 * To uniquely identify a block we just need its vdev id and offset. We also
 * need to maintain a reference counter. The vdev id will often repeat, as
 * there is a small number of top-level VDEVs and a large number of blocks
 * stored in each VDEV. We take advantage of that to reduce the BRT entry
 * size further by maintaining one BRT for each top-level VDEV, so we can
 * then have only offset and counter as the BRT entry.
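 *
 * In memory, such an entry then boils down to the pair used throughout this
 * file: bre_offset (the lookup key) and bre_refcount. As a conceptual
 * sketch (the real brt_entry_t in brt_impl.h additionally carries the
 * AVL-tree linkage):
 *
 *	struct brt_entry {
 *		uint64_t bre_offset;
 *		uint64_t bre_refcount;
 *	};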
 *
 * Minimizing free penalty.
 *
 * Block Cloning allows creating additional references to any existing block.
 * When we free a block there is no hint in the block pointer whether the
 * block was cloned or not, so on each free we have to check if there is a
 * corresponding entry in the BRT or not. If there is, we need to decrease
 * the reference counter. Doing a BRT lookup on every free can potentially be
 * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
 * This is the main problem with deduplication, so we've learned our lesson
 * and try not to repeat the same mistake here. How do we do that? We divide
 * each top-level VDEV into 16MB regions. For each region we maintain a
 * counter that is a sum of all the BRT entries that have offsets within the
 * region. This creates the entries count array of 16bit numbers for each
 * top-level VDEV. The entries count array is always kept in memory and
 * updated on disk in the same transaction group as the BRT updates to keep
 * everything in-sync. We can keep the array in memory, because it is very
 * small. With 16MB regions and a 1TB VDEV the array requires only 128kB of
 * memory (we may decide to decrease the region size even further in the
 * future). Now, when we want to free a block, we first consult the array.
 * If the counter for the whole region is zero, there is no need to look for
 * the BRT entry, as there isn't one for sure. If the counter for the region
 * is greater than zero, only then will we do a BRT lookup, and if an entry
 * is found we will decrease the reference counter in the BRT entry and in
 * the entry counters array.
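 *
 * In pseudo-C, the fast path on free then reduces to the check below (a
 * sketch of the logic implemented by brt_vdev_lookup() later in this file):
 *
 *	idx = offset / brt_rangesize;
 *	if (brt_vdev_entcount_get(brtvd, idx) == 0)
 *		return (no BRT entry for sure, free the block as usual);
 *	(only now pay for the full BRT ZAP lookup)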
 *
 * The entry counters array is small, but can potentially be larger for very
 * large VDEVs or smaller regions. In this case we don't want to rewrite the
 * entire array on every change. We then divide the array into 32kB blocks
 * and keep a bitmap of dirty blocks within a transaction group. When we
 * sync the transaction group we can update only the parts of the entry
 * counters array that were modified. Note: Keeping track of the dirty parts
 * of the entry counters array is implemented, but updating only parts of the
 * array on disk is not yet implemented - for now we will update the entire
 * array if there was any change.
 *
 * The implementation tries to be economical: if the BRT is not used, or no
 * longer used, there will be no entries in the MOS and no additional memory
 * used (eg. the entry counters array is only allocated if needed).
 *
 * Interaction between Deduplication and Block Cloning.
 *
 * If both functionalities are in use, we could end up with a block that is
 * referenced multiple times in both DDT and BRT. When we free one of the
 * references we couldn't tell where it belongs, so we would have to decide
 * which table takes the precedence: do we first clear DDT references or BRT
 * references? To avoid this dilemma BRT cooperates with DDT - if a given
 * block is being cloned using BRT and the BP has the D (dedup) bit set, BRT
 * will look up the DDT entry instead and increase the counter there. No BRT
 * entry will be created for a block which has the D (dedup) bit set.
 * BRT may be more efficient for manual deduplication, but if the block is
 * already in the DDT, then creating an additional BRT entry would be less
 * efficient. This clever idea was proposed by Allan Jude.
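 *
 * A sketch of that decision, as made by brt_pending_apply() later in this
 * file when the pending clones are applied:
 *
 *	if (!BP_GET_DEDUP(bp) || !ddt_addref(spa, bp))
 *		brt_entry_addref(brt, bp);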
 *
 * Block Cloning across datasets.
 *
 * Block Cloning is not limited to cloning blocks within the same dataset.
 * It is possible (and very useful) to clone blocks between different
 * datasets. One use case is recovering files from snapshots. By cloning the
 * files into the dataset we need no additional storage. Without Block
 * Cloning we would need additional space for those files.
 * Another interesting use case is moving the files between datasets
 * (copying the file content to the new dataset and removing the source
 * file). In that case Block Cloning will only be used briefly, because the
 * BRT entries will be removed when the source is removed.
 * Note: currently it is not possible to clone blocks between encrypted
 * datasets, even if those datasets use the same encryption key (this
 * includes snapshots of encrypted datasets). Cloning blocks between datasets
 * that use the same keys should be possible and should be implemented in
 * the future.
 *
 * Block Cloning flow through ZFS layers.
 *
 * Note: Block Cloning can be used both for cloning file system blocks and
 * ZVOL blocks. As of this writing no interface is implemented that allows
 * for block cloning within a ZVOL.
 * FreeBSD and Linux provide the copy_file_range(2) system call and we will
 * use it for block cloning.
 *
 *	ssize_t
 *	copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
 *	    size_t len, unsigned int flags);
 *
 * Even though offsets and length represent bytes, they have to be
 * block-aligned or we will return an error so the upper layer can
 * fall back to the generic mechanism that will just copy the data.
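 *
 * For illustration, a minimal user-space sketch that clones len bytes from
 * infd into outfd (error handling omitted; offsets and len are assumed to
 * be block-aligned, otherwise the upper layer falls back to a plain copy as
 * described above):
 *
 *	off_t inoff = 0, outoff = 0;
 *	ssize_t n;
 *
 *	while (len > 0 &&
 *	    (n = copy_file_range(infd, &inoff, outfd, &outoff, len, 0)) > 0)
 *		len -= n;
 *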
 * Using copy_file_range(2) will call the OS-independent zfs_clone_range()
 * function. This function was implemented based on zfs_write(), but instead
 * of writing the given data we first read block pointers using the new
 * dmu_read_l0_bps() function from the source file. Once we have BPs from the
 * source file we call the dmu_brt_clone() function on the destination file.
 * This function allocates BPs for us. We iterate over all source BPs. If the
 * given BP is a hole or an embedded block, we just copy the BP as-is. If it
 * points to real data we place this BP on a BRT pending list using the
 * brt_pending_add() function.
 *
 * We use this pending list to keep track of all BPs that got new references
 * within this transaction group.
 *
 * Some special cases to consider and how we address them:
 * - The block we want to clone may have been created within the same
 *   transaction group that we are trying to clone. Such a block has no BP
 *   allocated yet, so it cannot be immediately cloned. We return EAGAIN.
 * - The block we want to clone may have been modified within the same
 *   transaction group. We return EAGAIN.
 * - A block may be cloned multiple times during one transaction group
 *   (that's why the pending list is actually a tree and not an append-only
 *   list - this way we can figure out faster if this block is cloned for
 *   the first time in this txg or a consecutive time).
 * - A block may be cloned and freed within the same transaction group
 *   (see dbuf_undirty()).
 * - A block may be cloned and within the same transaction group the clone
 *   can be cloned again (see dmu_read_l0_bps()).
 * - A file might have been deleted, but the caller still has a file
 *   descriptor open to this file and clones it.
 *
 * When we free a block we have an additional step in the ZIO pipeline where
 * we call the zio_brt_free() function. We then call the brt_entry_decref()
 * that loads the corresponding BRT entry (if one exists) and decreases the
 * reference counter. If this is not the last reference we will stop the ZIO
 * pipeline here. If this is the last reference or the block is not in the
 * BRT, we continue the pipeline and free the block as usual.
 *
 * At the beginning of spa_sync() where there can be no more block cloning,
 * but before issuing frees we call brt_pending_apply(). This function
 * applies all the new clones to the BRT table - we load BRT entries and
 * update reference counters. To sync new BRT entries to disk, we use the
 * brt_sync() function. This function will sync all dirty per-top-level-vdev
 * BRTs, the entry counters arrays, etc.
 *
 * Block Cloning and ZIL.
 *
 * Every clone operation is divided into chunks (similar to write) and each
 * chunk is cloned in a separate transaction. The chunk size is determined by
 * how many BPs we can fit into a single ZIL entry.
 * Replaying a clone operation is different from a regular clone operation,
 * as when we log clone operations we cannot use the source object - it may
 * reside on a different dataset, so we log BPs we want to clone.
 * The ZIL is replayed when we mount the given dataset, not when the pool is
 * imported. Taking this into account it is possible that the pool is
 * imported without mounting datasets and the source dataset is destroyed
 * before the destination dataset is mounted and its ZIL replayed.
 * To address this situation we leverage the zil_claim() mechanism where ZFS
 * will parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
 * entries, we will bump reference counters for their BPs in the BRT and then
 * on mount and ZIL replay we will just attach BPs to the file without
 * bumping reference counters.
 * Note it is still possible that after zil_claim() we never mount the
 * destination, so we never replay its ZIL and we destroy it. This way we
 * would end up with leaked references in the BRT. We address that too as ZFS
 * gives us a chance to clean this up on dataset destroy (see
 * zil_free_clone_range()).
 */
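
/*
 * A condensed view of the life cycle described above:
 *
 *   clone:	zfs_clone_range() -> dmu_read_l0_bps() -> dmu_brt_clone()
 *		-> brt_pending_add()
 *   sync:	spa_sync() -> brt_pending_apply() -> brt_entry_addref();
 *		brt_sync() -> brt_sync_table() -> brt_vdev_sync()
 *   free:	zio_brt_free() -> brt_entry_decref()
 */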

static kmem_cache_t *brt_entry_cache;
static kmem_cache_t *brt_pending_entry_cache;

/*
 * Enable/disable prefetching of BRT entries that we are going to modify.
 */
int zfs_brt_prefetch = 1;

#ifdef ZFS_DEBUG
#define	BRT_DEBUG(...)	do {						\
	if ((zfs_flags & ZFS_DEBUG_BRT) != 0) {				\
		__dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
	}								\
} while (0)
#else
#define	BRT_DEBUG(...)	do { } while (0)
#endif

int brt_zap_leaf_blockshift = 12;
int brt_zap_indirect_blockshift = 12;

static kstat_t *brt_ksp;

typedef struct brt_stats {
	kstat_named_t brt_addref_entry_in_memory;
	kstat_named_t brt_addref_entry_not_on_disk;
	kstat_named_t brt_addref_entry_on_disk;
	kstat_named_t brt_addref_entry_read_lost_race;
	kstat_named_t brt_decref_entry_in_memory;
	kstat_named_t brt_decref_entry_loaded_from_disk;
	kstat_named_t brt_decref_entry_not_in_memory;
	kstat_named_t brt_decref_entry_not_on_disk;
	kstat_named_t brt_decref_entry_read_lost_race;
	kstat_named_t brt_decref_entry_still_referenced;
	kstat_named_t brt_decref_free_data_later;
	kstat_named_t brt_decref_free_data_now;
	kstat_named_t brt_decref_no_entry;
} brt_stats_t;

static brt_stats_t brt_stats = {
	{ "addref_entry_in_memory",		KSTAT_DATA_UINT64 },
	{ "addref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
	{ "addref_entry_on_disk",		KSTAT_DATA_UINT64 },
	{ "addref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
	{ "decref_entry_in_memory",		KSTAT_DATA_UINT64 },
	{ "decref_entry_loaded_from_disk",	KSTAT_DATA_UINT64 },
	{ "decref_entry_not_in_memory",		KSTAT_DATA_UINT64 },
	{ "decref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
	{ "decref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
	{ "decref_entry_still_referenced",	KSTAT_DATA_UINT64 },
	{ "decref_free_data_later",		KSTAT_DATA_UINT64 },
	{ "decref_free_data_now",		KSTAT_DATA_UINT64 },
	{ "decref_no_entry",			KSTAT_DATA_UINT64 }
};

struct {
	wmsum_t brt_addref_entry_in_memory;
	wmsum_t brt_addref_entry_not_on_disk;
	wmsum_t brt_addref_entry_on_disk;
	wmsum_t brt_addref_entry_read_lost_race;
	wmsum_t brt_decref_entry_in_memory;
	wmsum_t brt_decref_entry_loaded_from_disk;
	wmsum_t brt_decref_entry_not_in_memory;
	wmsum_t brt_decref_entry_not_on_disk;
	wmsum_t brt_decref_entry_read_lost_race;
	wmsum_t brt_decref_entry_still_referenced;
	wmsum_t brt_decref_free_data_later;
	wmsum_t brt_decref_free_data_now;
	wmsum_t brt_decref_no_entry;
} brt_sums;

#define	BRTSTAT_BUMP(stat)	wmsum_add(&brt_sums.stat, 1)

static int brt_entry_compare(const void *x1, const void *x2);
static int brt_pending_entry_compare(const void *x1, const void *x2);

static void
brt_rlock(brt_t *brt)
{
	rw_enter(&brt->brt_lock, RW_READER);
}

static void
brt_wlock(brt_t *brt)
{
	rw_enter(&brt->brt_lock, RW_WRITER);
}

static void
brt_unlock(brt_t *brt)
{
	rw_exit(&brt->brt_lock);
}

static uint16_t
brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
{

	ASSERT3U(idx, <, brtvd->bv_size);

	if (brtvd->bv_need_byteswap) {
		return (BSWAP_16(brtvd->bv_entcount[idx]));
	} else {
		return (brtvd->bv_entcount[idx]);
	}
}

static void
brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
{

	ASSERT3U(idx, <, brtvd->bv_size);

	if (brtvd->bv_need_byteswap) {
		brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
	} else {
		brtvd->bv_entcount[idx] = entcnt;
	}
}

static void
brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
{
	uint16_t entcnt;

	ASSERT3U(idx, <, brtvd->bv_size);

	entcnt = brt_vdev_entcount_get(brtvd, idx);
	ASSERT(entcnt < UINT16_MAX);

	brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
}

static void
brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
{
	uint16_t entcnt;

	ASSERT3U(idx, <, brtvd->bv_size);

	entcnt = brt_vdev_entcount_get(brtvd, idx);
	ASSERT(entcnt > 0);

	brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
}

#ifdef ZFS_DEBUG
static void
brt_vdev_dump(brt_t *brt)
{
	brt_vdev_t *brtvd;
	uint64_t vdevid;

	if ((zfs_flags & ZFS_DEBUG_BRT) == 0) {
		return;
	}

	if (brt->brt_nvdevs == 0) {
		zfs_dbgmsg("BRT empty");
		return;
	}

	zfs_dbgmsg("BRT vdev dump:");
	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
		uint64_t idx;

		brtvd = &brt->brt_vdevs[vdevid];
		zfs_dbgmsg("  vdevid=%llu/%llu meta_dirty=%d entcount_dirty=%d "
		    "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
		    (u_longlong_t)vdevid, (u_longlong_t)brtvd->bv_vdevid,
		    brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
		    (u_longlong_t)brtvd->bv_size,
		    (u_longlong_t)brtvd->bv_totalcount,
		    (u_longlong_t)brtvd->bv_nblocks,
		    (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
		if (brtvd->bv_totalcount > 0) {
			zfs_dbgmsg("    entcounts:");
			for (idx = 0; idx < brtvd->bv_size; idx++) {
				if (brt_vdev_entcount_get(brtvd, idx) > 0) {
					zfs_dbgmsg("      [%04llu] %hu",
					    (u_longlong_t)idx,
					    brt_vdev_entcount_get(brtvd, idx));
				}
			}
		}
		if (brtvd->bv_entcount_dirty) {
			char *bitmap;

			bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
			for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
				bitmap[idx] =
				    BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
			}
			bitmap[idx] = '\0';
			zfs_dbgmsg("  bitmap: %s", bitmap);
			kmem_free(bitmap, brtvd->bv_nblocks + 1);
		}
	}
}
#endif

static brt_vdev_t *
brt_vdev(brt_t *brt, uint64_t vdevid)
{
	brt_vdev_t *brtvd;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));

	if (vdevid < brt->brt_nvdevs) {
		brtvd = &brt->brt_vdevs[vdevid];
	} else {
		brtvd = NULL;
	}

	return (brtvd);
}

static void
brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
	char name[64];

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT0(brtvd->bv_mos_brtvdev);
	ASSERT0(brtvd->bv_mos_entries);
	ASSERT(brtvd->bv_entcount != NULL);
	ASSERT(brtvd->bv_size > 0);
	ASSERT(brtvd->bv_bitmap != NULL);
	ASSERT(brtvd->bv_nblocks > 0);

	brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0,
	    ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
	    brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE,
	    0, tx);
	VERIFY(brtvd->bv_mos_entries != 0);
	BRT_DEBUG("MOS entries created, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_entries);

	/*
	 * We allocate a DMU buffer to store the bv_entcount[] array.
	 * We will keep the array size (bv_size) and the cumulative count for
	 * all bv_entcount[]s (bv_totalcount) in the bonus buffer.
	 */
	brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos,
	    DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
	    DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
	VERIFY(brtvd->bv_mos_brtvdev != 0);
	BRT_DEBUG("MOS BRT VDEV created, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_brtvdev);

	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
	    (u_longlong_t)brtvd->bv_vdevid);
	VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
	BRT_DEBUG("Pool directory object created, object=%s", name);

	spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
}

static void
brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd)
{
	vdev_t *vd;
	uint16_t *entcount;
	ulong_t *bitmap;
	uint64_t nblocks, size;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));

	spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid);
	size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1;
	spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG);

	entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
	nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
	bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);

	if (!brtvd->bv_initiated) {
		ASSERT0(brtvd->bv_size);
		ASSERT(brtvd->bv_entcount == NULL);
		ASSERT(brtvd->bv_bitmap == NULL);
		ASSERT0(brtvd->bv_nblocks);

		avl_create(&brtvd->bv_tree, brt_entry_compare,
		    sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
	} else {
		ASSERT(brtvd->bv_size > 0);
		ASSERT(brtvd->bv_entcount != NULL);
		ASSERT(brtvd->bv_bitmap != NULL);
		ASSERT(brtvd->bv_nblocks > 0);
		/*
		 * TODO: Allow vdev shrinking. We only need to implement
		 * shrinking the on-disk BRT VDEV object.
		 * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev,
		 *     offset, size, tx);
		 */
		ASSERT3U(brtvd->bv_size, <=, size);

		memcpy(entcount, brtvd->bv_entcount,
		    sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
		memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
		    BT_SIZEOFMAP(brtvd->bv_nblocks)));
		vmem_free(brtvd->bv_entcount,
		    sizeof (entcount[0]) * brtvd->bv_size);
		kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
	}

	brtvd->bv_size = size;
	brtvd->bv_entcount = entcount;
	brtvd->bv_bitmap = bitmap;
	brtvd->bv_nblocks = nblocks;
	if (!brtvd->bv_initiated) {
		brtvd->bv_need_byteswap = FALSE;
		brtvd->bv_initiated = TRUE;
		BRT_DEBUG("BRT VDEV %llu initiated.",
		    (u_longlong_t)brtvd->bv_vdevid);
	}
}

static void
brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd)
{
	char name[64];
	dmu_buf_t *db;
	brt_vdev_phys_t *bvphys;
	int error;

	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
	    (u_longlong_t)brtvd->bv_vdevid);
	error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev);
	if (error != 0)
		return;
	ASSERT(brtvd->bv_mos_brtvdev != 0);

	error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db);
	ASSERT0(error);
	if (error != 0)
		return;

	bvphys = db->db_data;
	if (brt->brt_rangesize == 0) {
		brt->brt_rangesize = bvphys->bvp_rangesize;
	} else {
		ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize);
	}

	ASSERT(!brtvd->bv_initiated);
	brt_vdev_realloc(brt, brtvd);

	/* TODO: We don't support VDEV shrinking. */
	ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);

	/*
	 * If the VDEV grew, we will leave new bv_entcount[] entries zeroed
	 * out.
	 */
	error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
	    MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
	    brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
	ASSERT0(error);

	brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
	ASSERT(brtvd->bv_mos_entries != 0);
	brtvd->bv_need_byteswap =
	    (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
	brtvd->bv_totalcount = bvphys->bvp_totalcount;
	brtvd->bv_usedspace = bvphys->bvp_usedspace;
	brtvd->bv_savedspace = bvphys->bvp_savedspace;
	brt->brt_usedspace += brtvd->bv_usedspace;
	brt->brt_savedspace += brtvd->bv_savedspace;

	dmu_buf_rele(db, FTAG);

	BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu",
	    name, (u_longlong_t)brtvd->bv_mos_brtvdev,
	    (u_longlong_t)brtvd->bv_mos_entries);
}

static void
brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd)
{

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_initiated);

	vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
	brtvd->bv_entcount = NULL;
	kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
	brtvd->bv_bitmap = NULL;
	ASSERT0(avl_numnodes(&brtvd->bv_tree));
	avl_destroy(&brtvd->bv_tree);

	brtvd->bv_size = 0;
	brtvd->bv_nblocks = 0;

	brtvd->bv_initiated = FALSE;
	BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
}

static void
brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
	char name[64];
	uint64_t count;
	dmu_buf_t *db;
	brt_vdev_phys_t *bvphys;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_mos_brtvdev != 0);
	ASSERT(brtvd->bv_mos_entries != 0);

	VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count));
	VERIFY0(count);
	VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx));
	BRT_DEBUG("MOS entries destroyed, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_entries);
	brtvd->bv_mos_entries = 0;

	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
	bvphys = db->db_data;
	ASSERT0(bvphys->bvp_totalcount);
	ASSERT0(bvphys->bvp_usedspace);
	ASSERT0(bvphys->bvp_savedspace);
	dmu_buf_rele(db, FTAG);

	VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx));
	BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_brtvdev);
	brtvd->bv_mos_brtvdev = 0;

	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
	    (u_longlong_t)brtvd->bv_vdevid);
	VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx));
	BRT_DEBUG("Pool directory object removed, object=%s", name);

	brt_vdev_dealloc(brt, brtvd);

	spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
}

static void
brt_vdevs_expand(brt_t *brt, uint64_t nvdevs)
{
	brt_vdev_t *brtvd, *vdevs;
	uint64_t vdevid;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT3U(nvdevs, >, brt->brt_nvdevs);

	vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP);
	if (brt->brt_nvdevs > 0) {
		ASSERT(brt->brt_vdevs != NULL);

		memcpy(vdevs, brt->brt_vdevs,
		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
		kmem_free(brt->brt_vdevs,
		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
	}
	for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) {
		brtvd = &vdevs[vdevid];

		brtvd->bv_vdevid = vdevid;
		brtvd->bv_initiated = FALSE;
	}

	BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
	    (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs);

	brt->brt_vdevs = vdevs;
	brt->brt_nvdevs = nvdevs;
}

static boolean_t
brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre)
{
	uint64_t idx;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));

	idx = bre->bre_offset / brt->brt_rangesize;
	if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) {
		/* VDEV wasn't expanded. */
		return (brt_vdev_entcount_get(brtvd, idx) > 0);
	}

	return (FALSE);
}

static void
brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
    uint64_t dsize)
{
	uint64_t idx;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
	ASSERT(brtvd != NULL);
	ASSERT(brtvd->bv_entcount != NULL);

	brt->brt_savedspace += dsize;
	brtvd->bv_savedspace += dsize;
	brtvd->bv_meta_dirty = TRUE;

	if (bre->bre_refcount > 1) {
		return;
	}

	brt->brt_usedspace += dsize;
	brtvd->bv_usedspace += dsize;

	idx = bre->bre_offset / brt->brt_rangesize;
	if (idx >= brtvd->bv_size) {
		/* VDEV has been expanded. */
		brt_vdev_realloc(brt, brtvd);
	}

	ASSERT3U(idx, <, brtvd->bv_size);

	brtvd->bv_totalcount++;
	brt_vdev_entcount_inc(brtvd, idx);
	brtvd->bv_entcount_dirty = TRUE;
	idx = idx / BRT_BLOCKSIZE / 8;
	BT_SET(brtvd->bv_bitmap, idx);

#ifdef ZFS_DEBUG
	brt_vdev_dump(brt);
#endif
}

static void
brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
    uint64_t dsize)
{
	uint64_t idx;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd != NULL);
	ASSERT(brtvd->bv_entcount != NULL);

	brt->brt_savedspace -= dsize;
	brtvd->bv_savedspace -= dsize;
	brtvd->bv_meta_dirty = TRUE;

	if (bre->bre_refcount > 0) {
		return;
	}

	brt->brt_usedspace -= dsize;
	brtvd->bv_usedspace -= dsize;

	idx = bre->bre_offset / brt->brt_rangesize;
	ASSERT3U(idx, <, brtvd->bv_size);

	ASSERT(brtvd->bv_totalcount > 0);
	brtvd->bv_totalcount--;
	brt_vdev_entcount_dec(brtvd, idx);
	brtvd->bv_entcount_dirty = TRUE;
	idx = idx / BRT_BLOCKSIZE / 8;
	BT_SET(brtvd->bv_bitmap, idx);

#ifdef ZFS_DEBUG
	brt_vdev_dump(brt);
#endif
}

static void
brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
	dmu_buf_t *db;
	brt_vdev_phys_t *bvphys;

	ASSERT(brtvd->bv_meta_dirty);
	ASSERT(brtvd->bv_mos_brtvdev != 0);
	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));

	if (brtvd->bv_entcount_dirty) {
		/*
		 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
		 */
		dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
		    brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
		    brtvd->bv_entcount, tx);
		memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks));
		brtvd->bv_entcount_dirty = FALSE;
	}

	dmu_buf_will_dirty(db, tx);
	bvphys = db->db_data;
	bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
	bvphys->bvp_size = brtvd->bv_size;
	if (brtvd->bv_need_byteswap) {
		bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
	} else {
		bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
	}
	bvphys->bvp_totalcount = brtvd->bv_totalcount;
	bvphys->bvp_rangesize = brt->brt_rangesize;
	bvphys->bvp_usedspace = brtvd->bv_usedspace;
	bvphys->bvp_savedspace = brtvd->bv_savedspace;
	dmu_buf_rele(db, FTAG);

	brtvd->bv_meta_dirty = FALSE;
}

static void
brt_vdevs_alloc(brt_t *brt, boolean_t load)
{
	brt_vdev_t *brtvd;
	uint64_t vdevid;

	brt_wlock(brt);

	brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children);

	if (load) {
		for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
			brtvd = &brt->brt_vdevs[vdevid];
			ASSERT(brtvd->bv_entcount == NULL);

			brt_vdev_load(brt, brtvd);
		}
	}

	if (brt->brt_rangesize == 0) {
		brt->brt_rangesize = BRT_RANGESIZE;
	}

	brt_unlock(brt);
}

static void
brt_vdevs_free(brt_t *brt)
{
	brt_vdev_t *brtvd;
	uint64_t vdevid;

	brt_wlock(brt);

	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
		brtvd = &brt->brt_vdevs[vdevid];
		if (brtvd->bv_initiated)
			brt_vdev_dealloc(brt, brtvd);
	}
	kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs);

	brt_unlock(brt);
}

static void
brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
{

	bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]);
	bre->bre_refcount = 0;

	*vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
}

static int
brt_entry_compare(const void *x1, const void *x2)
{
	const brt_entry_t *bre1 = x1;
	const brt_entry_t *bre2 = x2;

	return (TREE_CMP(bre1->bre_offset, bre2->bre_offset));
}

static int
brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre)
{
	uint64_t mos_entries;
	uint64_t one, physsize;
	int error;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));

	if (!brt_vdev_lookup(brt, brtvd, bre))
		return (SET_ERROR(ENOENT));

	/*
	 * Remember the mos_entries object number. After we reacquire the
	 * BRT lock, the brtvd pointer may be invalid.
	 */
	mos_entries = brtvd->bv_mos_entries;
	if (mos_entries == 0)
		return (SET_ERROR(ENOENT));

	brt_unlock(brt);

	error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset,
	    BRT_KEY_WORDS, &one, &physsize);
	if (error == 0) {
		ASSERT3U(one, ==, 1);
		ASSERT3U(physsize, ==, sizeof (bre->bre_refcount));

		error = zap_lookup_uint64(brt->brt_mos, mos_entries,
		    &bre->bre_offset, BRT_KEY_WORDS, 1,
		    sizeof (bre->bre_refcount), &bre->bre_refcount);
		BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu "
		    "count=%llu error=%d", (u_longlong_t)mos_entries,
		    (u_longlong_t)brtvd->bv_vdevid,
		    (u_longlong_t)bre->bre_offset,
		    error == 0 ? (u_longlong_t)bre->bre_refcount : 0, error);
	}

	brt_wlock(brt);

	return (error);
}

static void
brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
{
	brt_vdev_t *brtvd;
	uint64_t mos_entries = 0;

	brt_rlock(brt);
	brtvd = brt_vdev(brt, vdevid);
	if (brtvd != NULL)
		mos_entries = brtvd->bv_mos_entries;
	brt_unlock(brt);

	if (mos_entries == 0)
		return;

	BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu",
	    (u_longlong_t)mos_entries, (u_longlong_t)vdevid,
	    (u_longlong_t)bre->bre_offset);
	(void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
}

static int
brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
{
	int error;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_mos_entries != 0);
	ASSERT(bre->bre_refcount > 0);

	error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries,
	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1,
	    sizeof (bre->bre_refcount), &bre->bre_refcount, tx);
	BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu "
	    "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
	    (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
	    (u_longlong_t)bre->bre_refcount, error);

	return (error);
}

static int
brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
{
	int error;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_mos_entries != 0);
	ASSERT0(bre->bre_refcount);

	error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries,
	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx);
	BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu "
	    "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
	    (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
	    (u_longlong_t)bre->bre_refcount, error);

	return (error);
}

/*
 * Return TRUE if we _can_ have a BRT entry for this bp. It might be a false
 * positive, but gives us a quick answer if we should look into the BRT,
 * which may require reads and thus will be more expensive.
 */
boolean_t
brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
{
	brt_t *brt = spa->spa_brt;
	brt_vdev_t *brtvd;
	brt_entry_t bre_search;
	boolean_t mayexists = FALSE;
	uint64_t vdevid;

	brt_entry_fill(bp, &bre_search, &vdevid);

	brt_rlock(brt);

	brtvd = brt_vdev(brt, vdevid);
	if (brtvd != NULL && brtvd->bv_initiated) {
		if (!avl_is_empty(&brtvd->bv_tree) ||
		    brt_vdev_lookup(brt, brtvd, &bre_search)) {
			mayexists = TRUE;
		}
	}

	brt_unlock(brt);

	return (mayexists);
}

uint64_t
brt_get_dspace(spa_t *spa)
{
	brt_t *brt = spa->spa_brt;

	if (brt == NULL)
		return (0);

	return (brt->brt_savedspace);
}

uint64_t
brt_get_used(spa_t *spa)
{
	brt_t *brt = spa->spa_brt;

	if (brt == NULL)
		return (0);

	return (brt->brt_usedspace);
}

uint64_t
brt_get_saved(spa_t *spa)
{
	brt_t *brt = spa->spa_brt;

	if (brt == NULL)
		return (0);

	return (brt->brt_savedspace);
}

uint64_t
brt_get_ratio(spa_t *spa)
{
	brt_t *brt = spa->spa_brt;

	if (brt->brt_usedspace == 0)
		return (100);

	return ((brt->brt_usedspace + brt->brt_savedspace) * 100 /
	    brt->brt_usedspace);
}
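
/*
 * An illustrative example of the ratio above: if 1GB of unique data is
 * cloned once (every block referenced twice), both brt_usedspace and
 * brt_savedspace are 1GB, so brt_get_ratio() reports
 * (1 + 1) * 100 / 1 = 200, i.e. a 2.00x cloning ratio.
 */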

static int
brt_kstats_update(kstat_t *ksp, int rw)
{
	brt_stats_t *bs = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	bs->brt_addref_entry_in_memory.value.ui64 =
	    wmsum_value(&brt_sums.brt_addref_entry_in_memory);
	bs->brt_addref_entry_not_on_disk.value.ui64 =
	    wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
	bs->brt_addref_entry_on_disk.value.ui64 =
	    wmsum_value(&brt_sums.brt_addref_entry_on_disk);
	bs->brt_addref_entry_read_lost_race.value.ui64 =
	    wmsum_value(&brt_sums.brt_addref_entry_read_lost_race);
	bs->brt_decref_entry_in_memory.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_in_memory);
	bs->brt_decref_entry_loaded_from_disk.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
	bs->brt_decref_entry_not_in_memory.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
	bs->brt_decref_entry_not_on_disk.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_not_on_disk);
	bs->brt_decref_entry_read_lost_race.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
	bs->brt_decref_entry_still_referenced.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
	bs->brt_decref_free_data_later.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_free_data_later);
	bs->brt_decref_free_data_now.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_free_data_now);
	bs->brt_decref_no_entry.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_no_entry);

	return (0);
}

static void
brt_stat_init(void)
{

	wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0);
	wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
	wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
	wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0);
	wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
	wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
	wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
	wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0);
	wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
	wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
	wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
	wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
	wmsum_init(&brt_sums.brt_decref_no_entry, 0);

	brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (brt_ksp != NULL) {
		brt_ksp->ks_data = &brt_stats;
		brt_ksp->ks_update = brt_kstats_update;
		kstat_install(brt_ksp);
	}
}

static void
brt_stat_fini(void)
{
	if (brt_ksp != NULL) {
		kstat_delete(brt_ksp);
		brt_ksp = NULL;
	}

	wmsum_fini(&brt_sums.brt_addref_entry_in_memory);
	wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
	wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
	wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race);
	wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
	wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
	wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
	wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk);
	wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
	wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
	wmsum_fini(&brt_sums.brt_decref_free_data_later);
	wmsum_fini(&brt_sums.brt_decref_free_data_now);
	wmsum_fini(&brt_sums.brt_decref_no_entry);
}

void
brt_init(void)
{
	brt_entry_cache = kmem_cache_create("brt_entry_cache",
	    sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache",
	    sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	brt_stat_init();
}

void
brt_fini(void)
{
	brt_stat_fini();

	kmem_cache_destroy(brt_entry_cache);
	kmem_cache_destroy(brt_pending_entry_cache);
}

static brt_entry_t *
brt_entry_alloc(const brt_entry_t *bre_init)
{
	brt_entry_t *bre;

	bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
	bre->bre_offset = bre_init->bre_offset;
	bre->bre_refcount = bre_init->bre_refcount;

	return (bre);
}

static void
brt_entry_free(brt_entry_t *bre)
{

	kmem_cache_free(brt_entry_cache, bre);
}

static void
brt_entry_addref(brt_t *brt, const blkptr_t *bp)
{
	brt_vdev_t *brtvd;
	brt_entry_t *bre, *racebre;
	brt_entry_t bre_search;
	avl_index_t where;
	uint64_t vdevid;
	int error;

	ASSERT(!RW_WRITE_HELD(&brt->brt_lock));

	brt_entry_fill(bp, &bre_search, &vdevid);

	brt_wlock(brt);

	brtvd = brt_vdev(brt, vdevid);
	if (brtvd == NULL) {
		ASSERT3U(vdevid, >=, brt->brt_nvdevs);

		/* New VDEV was added. */
		brt_vdevs_expand(brt, vdevid + 1);
		brtvd = brt_vdev(brt, vdevid);
	}
	ASSERT(brtvd != NULL);
	if (!brtvd->bv_initiated)
		brt_vdev_realloc(brt, brtvd);

	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
	if (bre != NULL) {
		BRTSTAT_BUMP(brt_addref_entry_in_memory);
	} else {
		/*
		 * brt_entry_lookup() may drop the BRT (read) lock and
		 * reacquire it (write).
		 */
		error = brt_entry_lookup(brt, brtvd, &bre_search);
		/* bre_search now contains the correct bre_refcount */
		ASSERT(error == 0 || error == ENOENT);
		if (error == 0)
			BRTSTAT_BUMP(brt_addref_entry_on_disk);
		else
			BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
		/*
		 * When the BRT lock was dropped, brt_vdevs[] may have been
		 * expanded and reallocated, we need to update brtvd's pointer.
		 */
		brtvd = brt_vdev(brt, vdevid);
		ASSERT(brtvd != NULL);

		racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
		if (racebre == NULL) {
			bre = brt_entry_alloc(&bre_search);
			ASSERT(RW_WRITE_HELD(&brt->brt_lock));
			avl_insert(&brtvd->bv_tree, bre, where);
			brt->brt_nentries++;
		} else {
			/*
			 * The entry was added when the BRT lock was dropped in
			 * brt_entry_lookup().
			 */
			BRTSTAT_BUMP(brt_addref_entry_read_lost_race);
			bre = racebre;
		}
	}
	bre->bre_refcount++;
	brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));

	brt_unlock(brt);
}

/* Return TRUE if block should be freed immediately. */
boolean_t
brt_entry_decref(spa_t *spa, const blkptr_t *bp)
{
	brt_t *brt = spa->spa_brt;
	brt_vdev_t *brtvd;
	brt_entry_t *bre, *racebre;
	brt_entry_t bre_search;
	avl_index_t where;
	uint64_t vdevid;
	int error;

	brt_entry_fill(bp, &bre_search, &vdevid);

	brt_wlock(brt);

	brtvd = brt_vdev(brt, vdevid);
	ASSERT(brtvd != NULL);

	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
	if (bre != NULL) {
		BRTSTAT_BUMP(brt_decref_entry_in_memory);
		goto out;
	} else {
		BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
	}

	/*
	 * brt_entry_lookup() may drop the BRT lock and reacquire it.
	 */
	error = brt_entry_lookup(brt, brtvd, &bre_search);
	/* bre_search now contains the correct bre_refcount */
	ASSERT(error == 0 || error == ENOENT);
	/*
	 * When the BRT lock was dropped, brt_vdevs[] may have been expanded
	 * and reallocated, we need to update brtvd's pointer.
	 */
	brtvd = brt_vdev(brt, vdevid);
	ASSERT(brtvd != NULL);

	if (error == ENOENT) {
		BRTSTAT_BUMP(brt_decref_entry_not_on_disk);
		bre = NULL;
		goto out;
	}

	racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
	if (racebre != NULL) {
		/*
		 * The entry was added when the BRT lock was dropped in
		 * brt_entry_lookup().
		 */
		BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
		bre = racebre;
		goto out;
	}

	BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
	bre = brt_entry_alloc(&bre_search);
	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	avl_insert(&brtvd->bv_tree, bre, where);
	brt->brt_nentries++;

out:
	if (bre == NULL) {
		/*
		 * This is a free of a regular (not cloned) block.
		 */
		brt_unlock(brt);
		BRTSTAT_BUMP(brt_decref_no_entry);
		return (B_TRUE);
	}
	if (bre->bre_refcount == 0) {
		brt_unlock(brt);
		BRTSTAT_BUMP(brt_decref_free_data_now);
		return (B_TRUE);
	}

	ASSERT(bre->bre_refcount > 0);
	bre->bre_refcount--;
	if (bre->bre_refcount == 0)
		BRTSTAT_BUMP(brt_decref_free_data_later);
	else
		BRTSTAT_BUMP(brt_decref_entry_still_referenced);
	brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));

	brt_unlock(brt);

	return (B_FALSE);
}

uint64_t
brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
{
	brt_t *brt = spa->spa_brt;
	brt_vdev_t *brtvd;
	brt_entry_t bre_search, *bre;
	uint64_t vdevid, refcnt;
	int error;

	brt_entry_fill(bp, &bre_search, &vdevid);

	brt_rlock(brt);

	brtvd = brt_vdev(brt, vdevid);
	ASSERT(brtvd != NULL);

	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
	if (bre == NULL) {
		error = brt_entry_lookup(brt, brtvd, &bre_search);
		ASSERT(error == 0 || error == ENOENT);
		if (error == ENOENT)
			refcnt = 0;
		else
			refcnt = bre_search.bre_refcount;
	} else
		refcnt = bre->bre_refcount;

	brt_unlock(brt);
	return (refcnt);
}

static void
brt_prefetch(brt_t *brt, const blkptr_t *bp)
{
	brt_entry_t bre;
	uint64_t vdevid;

	ASSERT(bp != NULL);

	if (!zfs_brt_prefetch)
		return;

	brt_entry_fill(bp, &bre, &vdevid);

	brt_entry_prefetch(brt, vdevid, &bre);
}

static int
brt_pending_entry_compare(const void *x1, const void *x2)
{
	const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2;
	const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp;
	int cmp;

	cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2));
	if (cmp == 0) {
		cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]),
		    DVA_GET_VDEV(&bp2->blk_dva[0]));
		if (cmp == 0) {
			cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
			    DVA_GET_OFFSET(&bp2->blk_dva[0]));
		}
	}

	return (cmp);
}

void
brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
{
	brt_t *brt;
	avl_tree_t *pending_tree;
	kmutex_t *pending_lock;
	brt_pending_entry_t *bpe, *newbpe;
	avl_index_t where;
	uint64_t txg;

	brt = spa->spa_brt;
	txg = dmu_tx_get_txg(tx);
	ASSERT3U(txg, !=, 0);
	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];

	newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP);
	newbpe->bpe_bp = *bp;
	newbpe->bpe_count = 1;

	mutex_enter(pending_lock);

	bpe = avl_find(pending_tree, newbpe, &where);
	if (bpe == NULL) {
		avl_insert(pending_tree, newbpe, where);
		newbpe = NULL;
	} else {
		bpe->bpe_count++;
	}

	mutex_exit(pending_lock);

	if (newbpe != NULL) {
		ASSERT(bpe != NULL);
		ASSERT(bpe != newbpe);
		kmem_cache_free(brt_pending_entry_cache, newbpe);
	} else {
		ASSERT(bpe == NULL);
	}

	/* Prefetch BRT entry, as we will need it in the syncing context. */
	brt_prefetch(brt, bp);
}

void
brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
{
	brt_t *brt;
	avl_tree_t *pending_tree;
	kmutex_t *pending_lock;
	brt_pending_entry_t *bpe, bpe_search;
	uint64_t txg;

	brt = spa->spa_brt;
	txg = dmu_tx_get_txg(tx);
	ASSERT3U(txg, !=, 0);
	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];

	bpe_search.bpe_bp = *bp;

	mutex_enter(pending_lock);

	bpe = avl_find(pending_tree, &bpe_search, NULL);
	/* I believe we should always find bpe when this function is called. */
	if (bpe != NULL) {
		ASSERT(bpe->bpe_count > 0);

		bpe->bpe_count--;
		if (bpe->bpe_count == 0) {
			avl_remove(pending_tree, bpe);
			kmem_cache_free(brt_pending_entry_cache, bpe);
		}
	}

	mutex_exit(pending_lock);
}

void
brt_pending_apply(spa_t *spa, uint64_t txg)
{
	brt_t *brt;
	brt_pending_entry_t *bpe;
	avl_tree_t *pending_tree;
	kmutex_t *pending_lock;
	void *c;

	ASSERT3U(txg, !=, 0);

	brt = spa->spa_brt;
	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];

	mutex_enter(pending_lock);

	c = NULL;
	while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) {
		boolean_t added_to_ddt;

		mutex_exit(pending_lock);

		for (int i = 0; i < bpe->bpe_count; i++) {
			/*
			 * If the block has the DEDUP bit set, it means that it
			 * already exists in the DEDUP table, so we can just
			 * use that instead of creating a new entry in
			 * the BRT table.
			 */
			if (BP_GET_DEDUP(&bpe->bpe_bp)) {
				added_to_ddt = ddt_addref(spa, &bpe->bpe_bp);
			} else {
				added_to_ddt = B_FALSE;
			}
			if (!added_to_ddt)
				brt_entry_addref(brt, &bpe->bpe_bp);
		}

		kmem_cache_free(brt_pending_entry_cache, bpe);
		mutex_enter(pending_lock);
	}

	mutex_exit(pending_lock);
}

static void
brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
{

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_mos_entries != 0);

	if (bre->bre_refcount == 0) {
		int error;

		error = brt_entry_remove(brt, brtvd, bre, tx);
		ASSERT(error == 0 || error == ENOENT);
		/*
		 * If error == ENOENT then zfs_clone_range() was done from a
		 * removed (but opened) file (open(), unlink()).
		 */
		ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT);
	} else {
		VERIFY0(brt_entry_update(brt, brtvd, bre, tx));
	}
}

static void
brt_sync_table(brt_t *brt, dmu_tx_t *tx)
{
	brt_vdev_t *brtvd;
	brt_entry_t *bre;
	uint64_t vdevid;
	void *c;

	brt_wlock(brt);

	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
		brtvd = &brt->brt_vdevs[vdevid];

		if (!brtvd->bv_initiated)
			continue;

		if (!brtvd->bv_meta_dirty) {
			ASSERT(!brtvd->bv_entcount_dirty);
			ASSERT0(avl_numnodes(&brtvd->bv_tree));
			continue;
		}

		ASSERT(!brtvd->bv_entcount_dirty ||
		    avl_numnodes(&brtvd->bv_tree) != 0);

		if (brtvd->bv_mos_brtvdev == 0)
			brt_vdev_create(brt, brtvd, tx);

		c = NULL;
		while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
			brt_sync_entry(brt, brtvd, bre, tx);
			brt_entry_free(bre);
			ASSERT(brt->brt_nentries > 0);
			brt->brt_nentries--;
		}

		brt_vdev_sync(brt, brtvd, tx);

		if (brtvd->bv_totalcount == 0)
			brt_vdev_destroy(brt, brtvd, tx);
	}

	ASSERT0(brt->brt_nentries);

	brt_unlock(brt);
}

void
brt_sync(spa_t *spa, uint64_t txg)
{
	dmu_tx_t *tx;
	brt_t *brt;

	ASSERT(spa_syncing_txg(spa) == txg);

	brt = spa->spa_brt;
	brt_rlock(brt);
	if (brt->brt_nentries == 0) {
		/* No changes. */
		brt_unlock(brt);
		return;
	}
	brt_unlock(brt);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	brt_sync_table(brt, tx);

	dmu_tx_commit(tx);
}

static void
brt_table_alloc(brt_t *brt)
{

	for (int i = 0; i < TXG_SIZE; i++) {
		avl_create(&brt->brt_pending_tree[i],
		    brt_pending_entry_compare,
		    sizeof (brt_pending_entry_t),
		    offsetof(brt_pending_entry_t, bpe_node));
		mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT,
		    NULL);
	}
}

static void
brt_table_free(brt_t *brt)
{

	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT(avl_is_empty(&brt->brt_pending_tree[i]));

		avl_destroy(&brt->brt_pending_tree[i]);
		mutex_destroy(&brt->brt_pending_lock[i]);
	}
}

static void
brt_alloc(spa_t *spa)
{
	brt_t *brt;

	ASSERT(spa->spa_brt == NULL);

	brt = kmem_zalloc(sizeof (*brt), KM_SLEEP);
	rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL);
	brt->brt_spa = spa;
	brt->brt_rangesize = 0;
	brt->brt_nentries = 0;
	brt->brt_vdevs = NULL;
	brt->brt_nvdevs = 0;
	brt_table_alloc(brt);

	spa->spa_brt = brt;
}

void
brt_create(spa_t *spa)
{

	brt_alloc(spa);
	brt_vdevs_alloc(spa->spa_brt, B_FALSE);
}

int
brt_load(spa_t *spa)
{

	brt_alloc(spa);
	brt_vdevs_alloc(spa->spa_brt, B_TRUE);

	return (0);
}

void
brt_unload(spa_t *spa)
{
	brt_t *brt = spa->spa_brt;

	if (brt == NULL)
		return;

	brt_vdevs_free(brt);
	brt_table_free(brt);
	rw_destroy(&brt->brt_lock);
	kmem_free(brt, sizeof (*brt));
	spa->spa_brt = NULL;
}

/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW,
	"Enable prefetching of BRT entries");
#ifdef ZFS_BRT_DEBUG
ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug");
#endif
/* END CSTYLED */