1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek 24 */ 25 26 #include <sys/zfs_context.h> 27 #include <sys/spa.h> 28 #include <sys/spa_impl.h> 29 #include <sys/zio.h> 30 #include <sys/brt.h> 31 #include <sys/brt_impl.h> 32 #include <sys/ddt.h> 33 #include <sys/bitmap.h> 34 #include <sys/zap.h> 35 #include <sys/dmu_tx.h> 36 #include <sys/arc.h> 37 #include <sys/dsl_pool.h> 38 #include <sys/dsl_scan.h> 39 #include <sys/vdev_impl.h> 40 #include <sys/kstat.h> 41 #include <sys/wmsum.h> 42 43 /* 44 * Block Cloning design. 45 * 46 * Block Cloning allows to manually clone a file (or a subset of its blocks) 47 * into another (or the same) file by just creating additional references to 48 * the data blocks without copying the data itself. Those references are kept 49 * in the Block Reference Tables (BRTs). 50 * 51 * In many ways this is similar to the existing deduplication, but there are 52 * some important differences: 53 * 54 * - Deduplication is automatic and Block Cloning is not - one has to use a 55 * dedicated system call(s) to clone the given file/blocks. 56 * - Deduplication keeps all data blocks in its table, even those referenced 57 * just once. Block Cloning creates an entry in its tables only when there 58 * are at least two references to the given data block. If the block was 59 * never explicitly cloned or the second to last reference was dropped, 60 * there will be neither space nor performance overhead. 61 * - Deduplication needs data to work - one needs to pass real data to the 62 * write(2) syscall, so hash can be calculated. Block Cloning doesn't require 63 * data, just block pointers to the data, so it is extremely fast, as we pay 64 * neither the cost of reading the data, nor the cost of writing the data - 65 * we operate exclusively on metadata. 66 * - If the D (dedup) bit is not set in the block pointer, it means that 67 * the block is not in the dedup table (DDT) and we won't consult the DDT 68 * when we need to free the block. Block Cloning must be consulted on every 69 * free, because we cannot modify the source BP (eg. by setting something 70 * similar to the D bit), thus we have no hint if the block is in the 71 * Block Reference Table (BRT), so we need to look into the BRT. There is 72 * an optimization in place that allows us to eliminate the majority of BRT 73 * lookups which is described below in the "Minimizing free penalty" section. 74 * - The BRT entry is much smaller than the DDT entry - for BRT we only store 75 * 64bit offset and 64bit reference counter. 76 * - Dedup keys are cryptographic hashes, so two blocks that are close to each 77 * other on disk are most likely in totally different parts of the DDT. 78 * The BRT entry keys are offsets into a single top-level VDEV, so data blocks 79 * from one file should have BRT entries close to each other. 80 * - Scrub will only do a single pass over a block that is referenced multiple 81 * times in the DDT. Unfortunately it is not currently (if at all) possible 82 * with Block Cloning and block referenced multiple times will be scrubbed 83 * multiple times. The new, sorted scrub should be able to eliminate 84 * duplicated reads given enough memory. 85 * - Deduplication requires cryptographically strong hash as a checksum or 86 * additional data verification. Block Cloning works with any checksum 87 * algorithm or even with checksumming disabled. 88 * 89 * As mentioned above, the BRT entries are much smaller than the DDT entries. 90 * To uniquely identify a block we just need its vdev id and offset. We also 91 * need to maintain a reference counter. The vdev id will often repeat, as there 92 * is a small number of top-level VDEVs and a large number of blocks stored in 93 * each VDEV. We take advantage of that to reduce the BRT entry size further by 94 * maintaining one BRT for each top-level VDEV, so we can then have only offset 95 * and counter as the BRT entry. 96 * 97 * Minimizing free penalty. 98 * 99 * Block Cloning allows creating additional references to any existing block. 100 * When we free a block there is no hint in the block pointer whether the block 101 * was cloned or not, so on each free we have to check if there is a 102 * corresponding entry in the BRT or not. If there is, we need to decrease 103 * the reference counter. Doing BRT lookup on every free can potentially be 104 * expensive by requiring additional I/Os if the BRT doesn't fit into memory. 105 * This is the main problem with deduplication, so we've learned our lesson and 106 * try not to repeat the same mistake here. How do we do that? We divide each 107 * top-level VDEV into 16MB regions. For each region we maintain a counter that 108 * is a sum of all the BRT entries that have offsets within the region. This 109 * creates the entries count array of 16bit numbers for each top-level VDEV. 110 * The entries count array is always kept in memory and updated on disk in the 111 * same transaction group as the BRT updates to keep everything in-sync. We can 112 * keep the array in memory, because it is very small. With 16MB regions and 113 * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease 114 * the region size even further in the future). Now, when we want to free 115 * a block, we first consult the array. If the counter for the whole region is 116 * zero, there is no need to look for the BRT entry, as there isn't one for 117 * sure. If the counter for the region is greater than zero, only then we will 118 * do a BRT lookup and if an entry is found we will decrease the reference 119 * counter in the BRT entry and in the entry counters array. 120 * 121 * The entry counters array is small, but can potentially be larger for very 122 * large VDEVs or smaller regions. In this case we don't want to rewrite entire 123 * array on every change. We then divide the array into 32kB block and keep 124 * a bitmap of dirty blocks within a transaction group. When we sync the 125 * transaction group we can only update the parts of the entry counters array 126 * that were modified. Note: Keeping track of the dirty parts of the entry 127 * counters array is implemented, but updating only parts of the array on disk 128 * is not yet implemented - for now we will update entire array if there was 129 * any change. 130 * 131 * The implementation tries to be economic: if BRT is not used, or no longer 132 * used, there will be no entries in the MOS and no additional memory used (eg. 133 * the entry counters array is only allocated if needed). 134 * 135 * Interaction between Deduplication and Block Cloning. 136 * 137 * If both functionalities are in use, we could end up with a block that is 138 * referenced multiple times in both DDT and BRT. When we free one of the 139 * references we couldn't tell where it belongs, so we would have to decide 140 * what table takes the precedence: do we first clear DDT references or BRT 141 * references? To avoid this dilemma BRT cooperates with DDT - if a given block 142 * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will 143 * lookup DDT entry instead and increase the counter there. No BRT entry 144 * will be created for a block which has the D (dedup) bit set. 145 * BRT may be more efficient for manual deduplication, but if the block is 146 * already in the DDT, then creating additional BRT entry would be less 147 * efficient. This clever idea was proposed by Allan Jude. 148 * 149 * Block Cloning across datasets. 150 * 151 * Block Cloning is not limited to cloning blocks within the same dataset. 152 * It is possible (and very useful) to clone blocks between different datasets. 153 * One use case is recovering files from snapshots. By cloning the files into 154 * dataset we need no additional storage. Without Block Cloning we would need 155 * additional space for those files. 156 * Another interesting use case is moving the files between datasets 157 * (copying the file content to the new dataset and removing the source file). 158 * In that case Block Cloning will only be used briefly, because the BRT entries 159 * will be removed when the source is removed. 160 * Block Cloning across encrypted datasets is supported as long as both 161 * datasets share the same master key (e.g. snapshots and clones) 162 * 163 * Block Cloning flow through ZFS layers. 164 * 165 * Note: Block Cloning can be used both for cloning file system blocks and ZVOL 166 * blocks. As of this writing no interface is implemented that allows for block 167 * cloning within a ZVOL. 168 * FreeBSD and Linux provides copy_file_range(2) system call and we will use it 169 * for blocking cloning. 170 * 171 * ssize_t 172 * copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp, 173 * size_t len, unsigned int flags); 174 * 175 * Even though offsets and length represent bytes, they have to be 176 * block-aligned or we will return an error so the upper layer can 177 * fallback to the generic mechanism that will just copy the data. 178 * Using copy_file_range(2) will call OS-independent zfs_clone_range() function. 179 * This function was implemented based on zfs_write(), but instead of writing 180 * the given data we first read block pointers using the new dmu_read_l0_bps() 181 * function from the source file. Once we have BPs from the source file we call 182 * the dmu_brt_clone() function on the destination file. This function 183 * allocates BPs for us. We iterate over all source BPs. If the given BP is 184 * a hole or an embedded block, we just copy BP as-is. If it points to a real 185 * data we place this BP on a BRT pending list using the brt_pending_add() 186 * function. 187 * 188 * We use this pending list to keep track of all BPs that got new references 189 * within this transaction group. 190 * 191 * Some special cases to consider and how we address them: 192 * - The block we want to clone may have been created within the same 193 * transaction group that we are trying to clone. Such block has no BP 194 * allocated yet, so cannot be immediately cloned. We return EAGAIN. 195 * - The block we want to clone may have been modified within the same 196 * transaction group. We return EAGAIN. 197 * - A block may be cloned multiple times during one transaction group (that's 198 * why pending list is actually a tree and not an append-only list - this 199 * way we can figure out faster if this block is cloned for the first time 200 * in this txg or consecutive time). 201 * - A block may be cloned and freed within the same transaction group 202 * (see dbuf_undirty()). 203 * - A block may be cloned and within the same transaction group the clone 204 * can be cloned again (see dmu_read_l0_bps()). 205 * - A file might have been deleted, but the caller still has a file descriptor 206 * open to this file and clones it. 207 * 208 * When we free a block we have an additional step in the ZIO pipeline where we 209 * call the zio_brt_free() function. We then call the brt_entry_decref() 210 * that loads the corresponding BRT entry (if one exists) and decreases 211 * reference counter. If this is not the last reference we will stop ZIO 212 * pipeline here. If this is the last reference or the block is not in the 213 * BRT, we continue the pipeline and free the block as usual. 214 * 215 * At the beginning of spa_sync() where there can be no more block cloning, 216 * but before issuing frees we call brt_pending_apply(). This function applies 217 * all the new clones to the BRT table - we load BRT entries and update 218 * reference counters. To sync new BRT entries to disk, we use brt_sync() 219 * function. This function will sync all dirty per-top-level-vdev BRTs, 220 * the entry counters arrays, etc. 221 * 222 * Block Cloning and ZIL. 223 * 224 * Every clone operation is divided into chunks (similar to write) and each 225 * chunk is cloned in a separate transaction. The chunk size is determined by 226 * how many BPs we can fit into a single ZIL entry. 227 * Replaying clone operation is different from the regular clone operation, 228 * as when we log clone operations we cannot use the source object - it may 229 * reside on a different dataset, so we log BPs we want to clone. 230 * The ZIL is replayed when we mount the given dataset, not when the pool is 231 * imported. Taking this into account it is possible that the pool is imported 232 * without mounting datasets and the source dataset is destroyed before the 233 * destination dataset is mounted and its ZIL replayed. 234 * To address this situation we leverage zil_claim() mechanism where ZFS will 235 * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE 236 * entries, we will bump reference counters for their BPs in the BRT. Then 237 * on mount and ZIL replay we bump the reference counters once more, while the 238 * first references are dropped during ZIL destroy by zil_free_clone_range(). 239 * It is possible that after zil_claim() we never mount the destination, so 240 * we never replay its ZIL and just destroy it. In this case the only taken 241 * references will be dropped by zil_free_clone_range(), since the cloning is 242 * not going to ever take place. 243 */ 244 245 static kmem_cache_t *brt_entry_cache; 246 static kmem_cache_t *brt_pending_entry_cache; 247 248 /* 249 * Enable/disable prefetching of BRT entries that we are going to modify. 250 */ 251 int zfs_brt_prefetch = 1; 252 253 #ifdef ZFS_DEBUG 254 #define BRT_DEBUG(...) do { \ 255 if ((zfs_flags & ZFS_DEBUG_BRT) != 0) { \ 256 __dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \ 257 } \ 258 } while (0) 259 #else 260 #define BRT_DEBUG(...) do { } while (0) 261 #endif 262 263 int brt_zap_leaf_blockshift = 12; 264 int brt_zap_indirect_blockshift = 12; 265 266 static kstat_t *brt_ksp; 267 268 typedef struct brt_stats { 269 kstat_named_t brt_addref_entry_in_memory; 270 kstat_named_t brt_addref_entry_not_on_disk; 271 kstat_named_t brt_addref_entry_on_disk; 272 kstat_named_t brt_addref_entry_read_lost_race; 273 kstat_named_t brt_decref_entry_in_memory; 274 kstat_named_t brt_decref_entry_loaded_from_disk; 275 kstat_named_t brt_decref_entry_not_in_memory; 276 kstat_named_t brt_decref_entry_not_on_disk; 277 kstat_named_t brt_decref_entry_read_lost_race; 278 kstat_named_t brt_decref_entry_still_referenced; 279 kstat_named_t brt_decref_free_data_later; 280 kstat_named_t brt_decref_free_data_now; 281 kstat_named_t brt_decref_no_entry; 282 } brt_stats_t; 283 284 static brt_stats_t brt_stats = { 285 { "addref_entry_in_memory", KSTAT_DATA_UINT64 }, 286 { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 }, 287 { "addref_entry_on_disk", KSTAT_DATA_UINT64 }, 288 { "addref_entry_read_lost_race", KSTAT_DATA_UINT64 }, 289 { "decref_entry_in_memory", KSTAT_DATA_UINT64 }, 290 { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 }, 291 { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 }, 292 { "decref_entry_not_on_disk", KSTAT_DATA_UINT64 }, 293 { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 }, 294 { "decref_entry_still_referenced", KSTAT_DATA_UINT64 }, 295 { "decref_free_data_later", KSTAT_DATA_UINT64 }, 296 { "decref_free_data_now", KSTAT_DATA_UINT64 }, 297 { "decref_no_entry", KSTAT_DATA_UINT64 } 298 }; 299 300 struct { 301 wmsum_t brt_addref_entry_in_memory; 302 wmsum_t brt_addref_entry_not_on_disk; 303 wmsum_t brt_addref_entry_on_disk; 304 wmsum_t brt_addref_entry_read_lost_race; 305 wmsum_t brt_decref_entry_in_memory; 306 wmsum_t brt_decref_entry_loaded_from_disk; 307 wmsum_t brt_decref_entry_not_in_memory; 308 wmsum_t brt_decref_entry_not_on_disk; 309 wmsum_t brt_decref_entry_read_lost_race; 310 wmsum_t brt_decref_entry_still_referenced; 311 wmsum_t brt_decref_free_data_later; 312 wmsum_t brt_decref_free_data_now; 313 wmsum_t brt_decref_no_entry; 314 } brt_sums; 315 316 #define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1) 317 318 static int brt_entry_compare(const void *x1, const void *x2); 319 static int brt_pending_entry_compare(const void *x1, const void *x2); 320 321 static void 322 brt_rlock(brt_t *brt) 323 { 324 rw_enter(&brt->brt_lock, RW_READER); 325 } 326 327 static void 328 brt_wlock(brt_t *brt) 329 { 330 rw_enter(&brt->brt_lock, RW_WRITER); 331 } 332 333 static void 334 brt_unlock(brt_t *brt) 335 { 336 rw_exit(&brt->brt_lock); 337 } 338 339 static uint16_t 340 brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx) 341 { 342 343 ASSERT3U(idx, <, brtvd->bv_size); 344 345 if (unlikely(brtvd->bv_need_byteswap)) { 346 return (BSWAP_16(brtvd->bv_entcount[idx])); 347 } else { 348 return (brtvd->bv_entcount[idx]); 349 } 350 } 351 352 static void 353 brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt) 354 { 355 356 ASSERT3U(idx, <, brtvd->bv_size); 357 358 if (unlikely(brtvd->bv_need_byteswap)) { 359 brtvd->bv_entcount[idx] = BSWAP_16(entcnt); 360 } else { 361 brtvd->bv_entcount[idx] = entcnt; 362 } 363 } 364 365 static void 366 brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx) 367 { 368 uint16_t entcnt; 369 370 ASSERT3U(idx, <, brtvd->bv_size); 371 372 entcnt = brt_vdev_entcount_get(brtvd, idx); 373 ASSERT(entcnt < UINT16_MAX); 374 375 brt_vdev_entcount_set(brtvd, idx, entcnt + 1); 376 } 377 378 static void 379 brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx) 380 { 381 uint16_t entcnt; 382 383 ASSERT3U(idx, <, brtvd->bv_size); 384 385 entcnt = brt_vdev_entcount_get(brtvd, idx); 386 ASSERT(entcnt > 0); 387 388 brt_vdev_entcount_set(brtvd, idx, entcnt - 1); 389 } 390 391 #ifdef ZFS_DEBUG 392 static void 393 brt_vdev_dump(brt_vdev_t *brtvd) 394 { 395 uint64_t idx; 396 397 zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d " 398 "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n", 399 (u_longlong_t)brtvd->bv_vdevid, 400 brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty, 401 (u_longlong_t)brtvd->bv_size, 402 (u_longlong_t)brtvd->bv_totalcount, 403 (u_longlong_t)brtvd->bv_nblocks, 404 (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks)); 405 if (brtvd->bv_totalcount > 0) { 406 zfs_dbgmsg(" entcounts:"); 407 for (idx = 0; idx < brtvd->bv_size; idx++) { 408 uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx); 409 if (entcnt > 0) { 410 zfs_dbgmsg(" [%04llu] %hu", 411 (u_longlong_t)idx, entcnt); 412 } 413 } 414 } 415 if (brtvd->bv_entcount_dirty) { 416 char *bitmap; 417 418 bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP); 419 for (idx = 0; idx < brtvd->bv_nblocks; idx++) { 420 bitmap[idx] = 421 BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.'; 422 } 423 bitmap[idx] = '\0'; 424 zfs_dbgmsg(" dirty: %s", bitmap); 425 kmem_free(bitmap, brtvd->bv_nblocks + 1); 426 } 427 } 428 #endif 429 430 static brt_vdev_t * 431 brt_vdev(brt_t *brt, uint64_t vdevid) 432 { 433 brt_vdev_t *brtvd; 434 435 ASSERT(RW_LOCK_HELD(&brt->brt_lock)); 436 437 if (vdevid < brt->brt_nvdevs) { 438 brtvd = &brt->brt_vdevs[vdevid]; 439 } else { 440 brtvd = NULL; 441 } 442 443 return (brtvd); 444 } 445 446 static void 447 brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) 448 { 449 char name[64]; 450 451 ASSERT(RW_WRITE_HELD(&brt->brt_lock)); 452 ASSERT0(brtvd->bv_mos_brtvdev); 453 ASSERT0(brtvd->bv_mos_entries); 454 ASSERT(brtvd->bv_entcount != NULL); 455 ASSERT(brtvd->bv_size > 0); 456 ASSERT(brtvd->bv_bitmap != NULL); 457 ASSERT(brtvd->bv_nblocks > 0); 458 459 brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0, 460 ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA, 461 brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE, 462 0, tx); 463 VERIFY(brtvd->bv_mos_entries != 0); 464 BRT_DEBUG("MOS entries created, object=%llu", 465 (u_longlong_t)brtvd->bv_mos_entries); 466 467 /* 468 * We allocate DMU buffer to store the bv_entcount[] array. 469 * We will keep array size (bv_size) and cummulative count for all 470 * bv_entcount[]s (bv_totalcount) in the bonus buffer. 471 */ 472 brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos, 473 DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE, 474 DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx); 475 VERIFY(brtvd->bv_mos_brtvdev != 0); 476 BRT_DEBUG("MOS BRT VDEV created, object=%llu", 477 (u_longlong_t)brtvd->bv_mos_brtvdev); 478 479 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, 480 (u_longlong_t)brtvd->bv_vdevid); 481 VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, 482 sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx)); 483 BRT_DEBUG("Pool directory object created, object=%s", name); 484 485 spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx); 486 } 487 488 static void 489 brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd) 490 { 491 vdev_t *vd; 492 uint16_t *entcount; 493 ulong_t *bitmap; 494 uint64_t nblocks, size; 495 496 ASSERT(RW_WRITE_HELD(&brt->brt_lock)); 497 498 spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER); 499 vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid); 500 size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1; 501 spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG); 502 503 entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP); 504 nblocks = BRT_RANGESIZE_TO_NBLOCKS(size); 505 bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP); 506 507 if (!brtvd->bv_initiated) { 508 ASSERT0(brtvd->bv_size); 509 ASSERT(brtvd->bv_entcount == NULL); 510 ASSERT(brtvd->bv_bitmap == NULL); 511 ASSERT0(brtvd->bv_nblocks); 512 513 avl_create(&brtvd->bv_tree, brt_entry_compare, 514 sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node)); 515 } else { 516 ASSERT(brtvd->bv_size > 0); 517 ASSERT(brtvd->bv_entcount != NULL); 518 ASSERT(brtvd->bv_bitmap != NULL); 519 ASSERT(brtvd->bv_nblocks > 0); 520 /* 521 * TODO: Allow vdev shrinking. We only need to implement 522 * shrinking the on-disk BRT VDEV object. 523 * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset, 524 * size, tx); 525 */ 526 ASSERT3U(brtvd->bv_size, <=, size); 527 528 memcpy(entcount, brtvd->bv_entcount, 529 sizeof (entcount[0]) * MIN(size, brtvd->bv_size)); 530 memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks), 531 BT_SIZEOFMAP(brtvd->bv_nblocks))); 532 vmem_free(brtvd->bv_entcount, 533 sizeof (entcount[0]) * brtvd->bv_size); 534 kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); 535 } 536 537 brtvd->bv_size = size; 538 brtvd->bv_entcount = entcount; 539 brtvd->bv_bitmap = bitmap; 540 brtvd->bv_nblocks = nblocks; 541 if (!brtvd->bv_initiated) { 542 brtvd->bv_need_byteswap = FALSE; 543 brtvd->bv_initiated = TRUE; 544 BRT_DEBUG("BRT VDEV %llu initiated.", 545 (u_longlong_t)brtvd->bv_vdevid); 546 } 547 } 548 549 static void 550 brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd) 551 { 552 char name[64]; 553 dmu_buf_t *db; 554 brt_vdev_phys_t *bvphys; 555 int error; 556 557 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, 558 (u_longlong_t)brtvd->bv_vdevid); 559 error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, 560 sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev); 561 if (error != 0) 562 return; 563 ASSERT(brtvd->bv_mos_brtvdev != 0); 564 565 error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db); 566 ASSERT0(error); 567 if (error != 0) 568 return; 569 570 bvphys = db->db_data; 571 if (brt->brt_rangesize == 0) { 572 brt->brt_rangesize = bvphys->bvp_rangesize; 573 } else { 574 ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize); 575 } 576 577 ASSERT(!brtvd->bv_initiated); 578 brt_vdev_realloc(brt, brtvd); 579 580 /* TODO: We don't support VDEV shrinking. */ 581 ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size); 582 583 /* 584 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out. 585 */ 586 error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0, 587 MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t), 588 brtvd->bv_entcount, DMU_READ_NO_PREFETCH); 589 ASSERT0(error); 590 591 brtvd->bv_mos_entries = bvphys->bvp_mos_entries; 592 ASSERT(brtvd->bv_mos_entries != 0); 593 brtvd->bv_need_byteswap = 594 (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER); 595 brtvd->bv_totalcount = bvphys->bvp_totalcount; 596 brtvd->bv_usedspace = bvphys->bvp_usedspace; 597 brtvd->bv_savedspace = bvphys->bvp_savedspace; 598 brt->brt_usedspace += brtvd->bv_usedspace; 599 brt->brt_savedspace += brtvd->bv_savedspace; 600 601 dmu_buf_rele(db, FTAG); 602 603 BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu", 604 name, (u_longlong_t)brtvd->bv_mos_brtvdev, 605 (u_longlong_t)brtvd->bv_mos_entries); 606 } 607 608 static void 609 brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd) 610 { 611 612 ASSERT(RW_WRITE_HELD(&brt->brt_lock)); 613 ASSERT(brtvd->bv_initiated); 614 615 vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size); 616 brtvd->bv_entcount = NULL; 617 kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); 618 brtvd->bv_bitmap = NULL; 619 ASSERT0(avl_numnodes(&brtvd->bv_tree)); 620 avl_destroy(&brtvd->bv_tree); 621 622 brtvd->bv_size = 0; 623 brtvd->bv_nblocks = 0; 624 625 brtvd->bv_initiated = FALSE; 626 BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid); 627 } 628 629 static void 630 brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) 631 { 632 char name[64]; 633 uint64_t count; 634 dmu_buf_t *db; 635 brt_vdev_phys_t *bvphys; 636 637 ASSERT(RW_WRITE_HELD(&brt->brt_lock)); 638 ASSERT(brtvd->bv_mos_brtvdev != 0); 639 ASSERT(brtvd->bv_mos_entries != 0); 640 641 VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count)); 642 VERIFY0(count); 643 VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx)); 644 BRT_DEBUG("MOS entries destroyed, object=%llu", 645 (u_longlong_t)brtvd->bv_mos_entries); 646 brtvd->bv_mos_entries = 0; 647 648 VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db)); 649 bvphys = db->db_data; 650 ASSERT0(bvphys->bvp_totalcount); 651 ASSERT0(bvphys->bvp_usedspace); 652 ASSERT0(bvphys->bvp_savedspace); 653 dmu_buf_rele(db, FTAG); 654 655 VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx)); 656 BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu", 657 (u_longlong_t)brtvd->bv_mos_brtvdev); 658 brtvd->bv_mos_brtvdev = 0; 659 660 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, 661 (u_longlong_t)brtvd->bv_vdevid); 662 VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx)); 663 BRT_DEBUG("Pool directory object removed, object=%s", name); 664 665 brt_vdev_dealloc(brt, brtvd); 666 667 spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx); 668 } 669 670 static void 671 brt_vdevs_expand(brt_t *brt, uint64_t nvdevs) 672 { 673 brt_vdev_t *brtvd, *vdevs; 674 uint64_t vdevid; 675 676 ASSERT(RW_WRITE_HELD(&brt->brt_lock)); 677 ASSERT3U(nvdevs, >, brt->brt_nvdevs); 678 679 vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP); 680 if (brt->brt_nvdevs > 0) { 681 ASSERT(brt->brt_vdevs != NULL); 682 683 memcpy(vdevs, brt->brt_vdevs, 684 sizeof (brt_vdev_t) * brt->brt_nvdevs); 685 kmem_free(brt->brt_vdevs, 686 sizeof (brt_vdev_t) * brt->brt_nvdevs); 687 } 688 for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) { 689 brtvd = &vdevs[vdevid]; 690 691 brtvd->bv_vdevid = vdevid; 692 brtvd->bv_initiated = FALSE; 693 } 694 695 BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.", 696 (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs); 697 698 brt->brt_vdevs = vdevs; 699 brt->brt_nvdevs = nvdevs; 700 } 701 702 static boolean_t 703 brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre) 704 { 705 uint64_t idx; 706 707 ASSERT(RW_LOCK_HELD(&brt->brt_lock)); 708 709 idx = bre->bre_offset / brt->brt_rangesize; 710 if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) { 711 /* VDEV wasn't expanded. */ 712 return (brt_vdev_entcount_get(brtvd, idx) > 0); 713 } 714 715 return (FALSE); 716 } 717 718 static void 719 brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, 720 uint64_t dsize) 721 { 722 uint64_t idx; 723 724 ASSERT(RW_LOCK_HELD(&brt->brt_lock)); 725 ASSERT(brtvd != NULL); 726 ASSERT(brtvd->bv_entcount != NULL); 727 728 brt->brt_savedspace += dsize; 729 brtvd->bv_savedspace += dsize; 730 brtvd->bv_meta_dirty = TRUE; 731 732 if (bre->bre_refcount > 1) { 733 return; 734 } 735 736 brt->brt_usedspace += dsize; 737 brtvd->bv_usedspace += dsize; 738 739 idx = bre->bre_offset / brt->brt_rangesize; 740 if (idx >= brtvd->bv_size) { 741 /* VDEV has been expanded. */ 742 brt_vdev_realloc(brt, brtvd); 743 } 744 745 ASSERT3U(idx, <, brtvd->bv_size); 746 747 brtvd->bv_totalcount++; 748 brt_vdev_entcount_inc(brtvd, idx); 749 brtvd->bv_entcount_dirty = TRUE; 750 idx = idx / BRT_BLOCKSIZE / 8; 751 BT_SET(brtvd->bv_bitmap, idx); 752 753 #ifdef ZFS_DEBUG 754 if (zfs_flags & ZFS_DEBUG_BRT) 755 brt_vdev_dump(brtvd); 756 #endif 757 } 758 759 static void 760 brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, 761 uint64_t dsize) 762 { 763 uint64_t idx; 764 765 ASSERT(RW_WRITE_HELD(&brt->brt_lock)); 766 ASSERT(brtvd != NULL); 767 ASSERT(brtvd->bv_entcount != NULL); 768 769 brt->brt_savedspace -= dsize; 770 brtvd->bv_savedspace -= dsize; 771 brtvd->bv_meta_dirty = TRUE; 772 773 if (bre->bre_refcount > 0) { 774 return; 775 } 776 777 brt->brt_usedspace -= dsize; 778 brtvd->bv_usedspace -= dsize; 779 780 idx = bre->bre_offset / brt->brt_rangesize; 781 ASSERT3U(idx, <, brtvd->bv_size); 782 783 ASSERT(brtvd->bv_totalcount > 0); 784 brtvd->bv_totalcount--; 785 brt_vdev_entcount_dec(brtvd, idx); 786 brtvd->bv_entcount_dirty = TRUE; 787 idx = idx / BRT_BLOCKSIZE / 8; 788 BT_SET(brtvd->bv_bitmap, idx); 789 790 #ifdef ZFS_DEBUG 791 if (zfs_flags & ZFS_DEBUG_BRT) 792 brt_vdev_dump(brtvd); 793 #endif 794 } 795 796 static void 797 brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) 798 { 799 dmu_buf_t *db; 800 brt_vdev_phys_t *bvphys; 801 802 ASSERT(brtvd->bv_meta_dirty); 803 ASSERT(brtvd->bv_mos_brtvdev != 0); 804 ASSERT(dmu_tx_is_syncing(tx)); 805 806 VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db)); 807 808 if (brtvd->bv_entcount_dirty) { 809 /* 810 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks. 811 */ 812 dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0, 813 brtvd->bv_size * sizeof (brtvd->bv_entcount[0]), 814 brtvd->bv_entcount, tx); 815 memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks)); 816 brtvd->bv_entcount_dirty = FALSE; 817 } 818 819 dmu_buf_will_dirty(db, tx); 820 bvphys = db->db_data; 821 bvphys->bvp_mos_entries = brtvd->bv_mos_entries; 822 bvphys->bvp_size = brtvd->bv_size; 823 if (brtvd->bv_need_byteswap) { 824 bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER; 825 } else { 826 bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER; 827 } 828 bvphys->bvp_totalcount = brtvd->bv_totalcount; 829 bvphys->bvp_rangesize = brt->brt_rangesize; 830 bvphys->bvp_usedspace = brtvd->bv_usedspace; 831 bvphys->bvp_savedspace = brtvd->bv_savedspace; 832 dmu_buf_rele(db, FTAG); 833 834 brtvd->bv_meta_dirty = FALSE; 835 } 836 837 static void 838 brt_vdevs_alloc(brt_t *brt, boolean_t load) 839 { 840 brt_vdev_t *brtvd; 841 uint64_t vdevid; 842 843 brt_wlock(brt); 844 845 brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children); 846 847 if (load) { 848 for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { 849 brtvd = &brt->brt_vdevs[vdevid]; 850 ASSERT(brtvd->bv_entcount == NULL); 851 852 brt_vdev_load(brt, brtvd); 853 } 854 } 855 856 if (brt->brt_rangesize == 0) { 857 brt->brt_rangesize = BRT_RANGESIZE; 858 } 859 860 brt_unlock(brt); 861 } 862 863 static void 864 brt_vdevs_free(brt_t *brt) 865 { 866 brt_vdev_t *brtvd; 867 uint64_t vdevid; 868 869 brt_wlock(brt); 870 871 for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { 872 brtvd = &brt->brt_vdevs[vdevid]; 873 if (brtvd->bv_initiated) 874 brt_vdev_dealloc(brt, brtvd); 875 } 876 kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs); 877 878 brt_unlock(brt); 879 } 880 881 static void 882 brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp) 883 { 884 885 bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]); 886 bre->bre_refcount = 0; 887 888 *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]); 889 } 890 891 static int 892 brt_entry_compare(const void *x1, const void *x2) 893 { 894 const brt_entry_t *bre1 = x1; 895 const brt_entry_t *bre2 = x2; 896 897 return (TREE_CMP(bre1->bre_offset, bre2->bre_offset)); 898 } 899 900 static int 901 brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre) 902 { 903 uint64_t mos_entries; 904 uint64_t one, physsize; 905 int error; 906 907 ASSERT(RW_LOCK_HELD(&brt->brt_lock)); 908 909 if (!brt_vdev_lookup(brt, brtvd, bre)) 910 return (SET_ERROR(ENOENT)); 911 912 /* 913 * Remember mos_entries object number. After we reacquire the BRT lock, 914 * the brtvd pointer may be invalid. 915 */ 916 mos_entries = brtvd->bv_mos_entries; 917 if (mos_entries == 0) 918 return (SET_ERROR(ENOENT)); 919 920 brt_unlock(brt); 921 922 error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, 923 BRT_KEY_WORDS, &one, &physsize); 924 if (error == 0) { 925 ASSERT3U(one, ==, 1); 926 ASSERT3U(physsize, ==, sizeof (bre->bre_refcount)); 927 928 error = zap_lookup_uint64(brt->brt_mos, mos_entries, 929 &bre->bre_offset, BRT_KEY_WORDS, 1, 930 sizeof (bre->bre_refcount), &bre->bre_refcount); 931 BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu " 932 "count=%llu error=%d", (u_longlong_t)mos_entries, 933 (u_longlong_t)brtvd->bv_vdevid, 934 (u_longlong_t)bre->bre_offset, 935 error == 0 ? (u_longlong_t)bre->bre_refcount : 0, error); 936 } 937 938 brt_wlock(brt); 939 940 return (error); 941 } 942 943 static void 944 brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre) 945 { 946 brt_vdev_t *brtvd; 947 uint64_t mos_entries = 0; 948 949 brt_rlock(brt); 950 brtvd = brt_vdev(brt, vdevid); 951 if (brtvd != NULL) 952 mos_entries = brtvd->bv_mos_entries; 953 brt_unlock(brt); 954 955 if (mos_entries == 0) 956 return; 957 958 BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu", 959 (u_longlong_t)mos_entries, (u_longlong_t)vdevid, 960 (u_longlong_t)bre->bre_offset); 961 (void) zap_prefetch_uint64(brt->brt_mos, mos_entries, 962 (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS); 963 } 964 965 static int 966 brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) 967 { 968 int error; 969 970 ASSERT(RW_LOCK_HELD(&brt->brt_lock)); 971 ASSERT(brtvd->bv_mos_entries != 0); 972 ASSERT(bre->bre_refcount > 0); 973 974 error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries, 975 (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1, 976 sizeof (bre->bre_refcount), &bre->bre_refcount, tx); 977 BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu " 978 "error=%d", (u_longlong_t)brtvd->bv_mos_entries, 979 (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, 980 (u_longlong_t)bre->bre_refcount, error); 981 982 return (error); 983 } 984 985 static int 986 brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) 987 { 988 int error; 989 990 ASSERT(RW_LOCK_HELD(&brt->brt_lock)); 991 ASSERT(brtvd->bv_mos_entries != 0); 992 ASSERT0(bre->bre_refcount); 993 994 error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries, 995 (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx); 996 BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu " 997 "error=%d", (u_longlong_t)brtvd->bv_mos_entries, 998 (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, 999 (u_longlong_t)bre->bre_refcount, error); 1000 1001 return (error); 1002 } 1003 1004 /* 1005 * Return TRUE if we _can_ have BRT entry for this bp. It might be false 1006 * positive, but gives us quick answer if we should look into BRT, which 1007 * may require reads and thus will be more expensive. 1008 */ 1009 boolean_t 1010 brt_maybe_exists(spa_t *spa, const blkptr_t *bp) 1011 { 1012 brt_t *brt = spa->spa_brt; 1013 brt_vdev_t *brtvd; 1014 brt_entry_t bre_search; 1015 boolean_t mayexists = FALSE; 1016 uint64_t vdevid; 1017 1018 brt_entry_fill(bp, &bre_search, &vdevid); 1019 1020 brt_rlock(brt); 1021 1022 brtvd = brt_vdev(brt, vdevid); 1023 if (brtvd != NULL && brtvd->bv_initiated) { 1024 if (!avl_is_empty(&brtvd->bv_tree) || 1025 brt_vdev_lookup(brt, brtvd, &bre_search)) { 1026 mayexists = TRUE; 1027 } 1028 } 1029 1030 brt_unlock(brt); 1031 1032 return (mayexists); 1033 } 1034 1035 uint64_t 1036 brt_get_dspace(spa_t *spa) 1037 { 1038 brt_t *brt = spa->spa_brt; 1039 1040 if (brt == NULL) 1041 return (0); 1042 1043 return (brt->brt_savedspace); 1044 } 1045 1046 uint64_t 1047 brt_get_used(spa_t *spa) 1048 { 1049 brt_t *brt = spa->spa_brt; 1050 1051 if (brt == NULL) 1052 return (0); 1053 1054 return (brt->brt_usedspace); 1055 } 1056 1057 uint64_t 1058 brt_get_saved(spa_t *spa) 1059 { 1060 brt_t *brt = spa->spa_brt; 1061 1062 if (brt == NULL) 1063 return (0); 1064 1065 return (brt->brt_savedspace); 1066 } 1067 1068 uint64_t 1069 brt_get_ratio(spa_t *spa) 1070 { 1071 brt_t *brt = spa->spa_brt; 1072 1073 if (brt->brt_usedspace == 0) 1074 return (100); 1075 1076 return ((brt->brt_usedspace + brt->brt_savedspace) * 100 / 1077 brt->brt_usedspace); 1078 } 1079 1080 static int 1081 brt_kstats_update(kstat_t *ksp, int rw) 1082 { 1083 brt_stats_t *bs = ksp->ks_data; 1084 1085 if (rw == KSTAT_WRITE) 1086 return (EACCES); 1087 1088 bs->brt_addref_entry_in_memory.value.ui64 = 1089 wmsum_value(&brt_sums.brt_addref_entry_in_memory); 1090 bs->brt_addref_entry_not_on_disk.value.ui64 = 1091 wmsum_value(&brt_sums.brt_addref_entry_not_on_disk); 1092 bs->brt_addref_entry_on_disk.value.ui64 = 1093 wmsum_value(&brt_sums.brt_addref_entry_on_disk); 1094 bs->brt_addref_entry_read_lost_race.value.ui64 = 1095 wmsum_value(&brt_sums.brt_addref_entry_read_lost_race); 1096 bs->brt_decref_entry_in_memory.value.ui64 = 1097 wmsum_value(&brt_sums.brt_decref_entry_in_memory); 1098 bs->brt_decref_entry_loaded_from_disk.value.ui64 = 1099 wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk); 1100 bs->brt_decref_entry_not_in_memory.value.ui64 = 1101 wmsum_value(&brt_sums.brt_decref_entry_not_in_memory); 1102 bs->brt_decref_entry_not_on_disk.value.ui64 = 1103 wmsum_value(&brt_sums.brt_decref_entry_not_on_disk); 1104 bs->brt_decref_entry_read_lost_race.value.ui64 = 1105 wmsum_value(&brt_sums.brt_decref_entry_read_lost_race); 1106 bs->brt_decref_entry_still_referenced.value.ui64 = 1107 wmsum_value(&brt_sums.brt_decref_entry_still_referenced); 1108 bs->brt_decref_free_data_later.value.ui64 = 1109 wmsum_value(&brt_sums.brt_decref_free_data_later); 1110 bs->brt_decref_free_data_now.value.ui64 = 1111 wmsum_value(&brt_sums.brt_decref_free_data_now); 1112 bs->brt_decref_no_entry.value.ui64 = 1113 wmsum_value(&brt_sums.brt_decref_no_entry); 1114 1115 return (0); 1116 } 1117 1118 static void 1119 brt_stat_init(void) 1120 { 1121 1122 wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0); 1123 wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0); 1124 wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0); 1125 wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0); 1126 wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0); 1127 wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0); 1128 wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0); 1129 wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0); 1130 wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0); 1131 wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0); 1132 wmsum_init(&brt_sums.brt_decref_free_data_later, 0); 1133 wmsum_init(&brt_sums.brt_decref_free_data_now, 0); 1134 wmsum_init(&brt_sums.brt_decref_no_entry, 0); 1135 1136 brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED, 1137 sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 1138 if (brt_ksp != NULL) { 1139 brt_ksp->ks_data = &brt_stats; 1140 brt_ksp->ks_update = brt_kstats_update; 1141 kstat_install(brt_ksp); 1142 } 1143 } 1144 1145 static void 1146 brt_stat_fini(void) 1147 { 1148 if (brt_ksp != NULL) { 1149 kstat_delete(brt_ksp); 1150 brt_ksp = NULL; 1151 } 1152 1153 wmsum_fini(&brt_sums.brt_addref_entry_in_memory); 1154 wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk); 1155 wmsum_fini(&brt_sums.brt_addref_entry_on_disk); 1156 wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race); 1157 wmsum_fini(&brt_sums.brt_decref_entry_in_memory); 1158 wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk); 1159 wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory); 1160 wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk); 1161 wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race); 1162 wmsum_fini(&brt_sums.brt_decref_entry_still_referenced); 1163 wmsum_fini(&brt_sums.brt_decref_free_data_later); 1164 wmsum_fini(&brt_sums.brt_decref_free_data_now); 1165 wmsum_fini(&brt_sums.brt_decref_no_entry); 1166 } 1167 1168 void 1169 brt_init(void) 1170 { 1171 brt_entry_cache = kmem_cache_create("brt_entry_cache", 1172 sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 1173 brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache", 1174 sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 1175 1176 brt_stat_init(); 1177 } 1178 1179 void 1180 brt_fini(void) 1181 { 1182 brt_stat_fini(); 1183 1184 kmem_cache_destroy(brt_entry_cache); 1185 kmem_cache_destroy(brt_pending_entry_cache); 1186 } 1187 1188 static brt_entry_t * 1189 brt_entry_alloc(const brt_entry_t *bre_init) 1190 { 1191 brt_entry_t *bre; 1192 1193 bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP); 1194 bre->bre_offset = bre_init->bre_offset; 1195 bre->bre_refcount = bre_init->bre_refcount; 1196 1197 return (bre); 1198 } 1199 1200 static void 1201 brt_entry_free(brt_entry_t *bre) 1202 { 1203 1204 kmem_cache_free(brt_entry_cache, bre); 1205 } 1206 1207 static void 1208 brt_entry_addref(brt_t *brt, const blkptr_t *bp) 1209 { 1210 brt_vdev_t *brtvd; 1211 brt_entry_t *bre, *racebre; 1212 brt_entry_t bre_search; 1213 avl_index_t where; 1214 uint64_t vdevid; 1215 int error; 1216 1217 ASSERT(!RW_WRITE_HELD(&brt->brt_lock)); 1218 1219 brt_entry_fill(bp, &bre_search, &vdevid); 1220 1221 brt_wlock(brt); 1222 1223 brtvd = brt_vdev(brt, vdevid); 1224 if (brtvd == NULL) { 1225 ASSERT3U(vdevid, >=, brt->brt_nvdevs); 1226 1227 /* New VDEV was added. */ 1228 brt_vdevs_expand(brt, vdevid + 1); 1229 brtvd = brt_vdev(brt, vdevid); 1230 } 1231 ASSERT(brtvd != NULL); 1232 if (!brtvd->bv_initiated) 1233 brt_vdev_realloc(brt, brtvd); 1234 1235 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); 1236 if (bre != NULL) { 1237 BRTSTAT_BUMP(brt_addref_entry_in_memory); 1238 } else { 1239 /* 1240 * brt_entry_lookup() may drop the BRT (read) lock and 1241 * reacquire it (write). 1242 */ 1243 error = brt_entry_lookup(brt, brtvd, &bre_search); 1244 /* bre_search now contains correct bre_refcount */ 1245 ASSERT(error == 0 || error == ENOENT); 1246 if (error == 0) 1247 BRTSTAT_BUMP(brt_addref_entry_on_disk); 1248 else 1249 BRTSTAT_BUMP(brt_addref_entry_not_on_disk); 1250 /* 1251 * When the BRT lock was dropped, brt_vdevs[] may have been 1252 * expanded and reallocated, we need to update brtvd's pointer. 1253 */ 1254 brtvd = brt_vdev(brt, vdevid); 1255 ASSERT(brtvd != NULL); 1256 1257 racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); 1258 if (racebre == NULL) { 1259 bre = brt_entry_alloc(&bre_search); 1260 ASSERT(RW_WRITE_HELD(&brt->brt_lock)); 1261 avl_insert(&brtvd->bv_tree, bre, where); 1262 brt->brt_nentries++; 1263 } else { 1264 /* 1265 * The entry was added when the BRT lock was dropped in 1266 * brt_entry_lookup(). 1267 */ 1268 BRTSTAT_BUMP(brt_addref_entry_read_lost_race); 1269 bre = racebre; 1270 } 1271 } 1272 bre->bre_refcount++; 1273 brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp)); 1274 1275 brt_unlock(brt); 1276 } 1277 1278 /* Return TRUE if block should be freed immediately. */ 1279 boolean_t 1280 brt_entry_decref(spa_t *spa, const blkptr_t *bp) 1281 { 1282 brt_t *brt = spa->spa_brt; 1283 brt_vdev_t *brtvd; 1284 brt_entry_t *bre, *racebre; 1285 brt_entry_t bre_search; 1286 avl_index_t where; 1287 uint64_t vdevid; 1288 int error; 1289 1290 brt_entry_fill(bp, &bre_search, &vdevid); 1291 1292 brt_wlock(brt); 1293 1294 brtvd = brt_vdev(brt, vdevid); 1295 ASSERT(brtvd != NULL); 1296 1297 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); 1298 if (bre != NULL) { 1299 BRTSTAT_BUMP(brt_decref_entry_in_memory); 1300 goto out; 1301 } else { 1302 BRTSTAT_BUMP(brt_decref_entry_not_in_memory); 1303 } 1304 1305 /* 1306 * brt_entry_lookup() may drop the BRT lock and reacquire it. 1307 */ 1308 error = brt_entry_lookup(brt, brtvd, &bre_search); 1309 /* bre_search now contains correct bre_refcount */ 1310 ASSERT(error == 0 || error == ENOENT); 1311 /* 1312 * When the BRT lock was dropped, brt_vdevs[] may have been expanded 1313 * and reallocated, we need to update brtvd's pointer. 1314 */ 1315 brtvd = brt_vdev(brt, vdevid); 1316 ASSERT(brtvd != NULL); 1317 1318 if (error == ENOENT) { 1319 BRTSTAT_BUMP(brt_decref_entry_not_on_disk); 1320 bre = NULL; 1321 goto out; 1322 } 1323 1324 racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); 1325 if (racebre != NULL) { 1326 /* 1327 * The entry was added when the BRT lock was dropped in 1328 * brt_entry_lookup(). 1329 */ 1330 BRTSTAT_BUMP(brt_decref_entry_read_lost_race); 1331 bre = racebre; 1332 goto out; 1333 } 1334 1335 BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk); 1336 bre = brt_entry_alloc(&bre_search); 1337 ASSERT(RW_WRITE_HELD(&brt->brt_lock)); 1338 avl_insert(&brtvd->bv_tree, bre, where); 1339 brt->brt_nentries++; 1340 1341 out: 1342 if (bre == NULL) { 1343 /* 1344 * This is a free of a regular (not cloned) block. 1345 */ 1346 brt_unlock(brt); 1347 BRTSTAT_BUMP(brt_decref_no_entry); 1348 return (B_TRUE); 1349 } 1350 if (bre->bre_refcount == 0) { 1351 brt_unlock(brt); 1352 BRTSTAT_BUMP(brt_decref_free_data_now); 1353 return (B_TRUE); 1354 } 1355 1356 ASSERT(bre->bre_refcount > 0); 1357 bre->bre_refcount--; 1358 if (bre->bre_refcount == 0) 1359 BRTSTAT_BUMP(brt_decref_free_data_later); 1360 else 1361 BRTSTAT_BUMP(brt_decref_entry_still_referenced); 1362 brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp)); 1363 1364 brt_unlock(brt); 1365 1366 return (B_FALSE); 1367 } 1368 1369 uint64_t 1370 brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp) 1371 { 1372 brt_t *brt = spa->spa_brt; 1373 brt_vdev_t *brtvd; 1374 brt_entry_t bre_search, *bre; 1375 uint64_t vdevid, refcnt; 1376 int error; 1377 1378 brt_entry_fill(bp, &bre_search, &vdevid); 1379 1380 brt_rlock(brt); 1381 1382 brtvd = brt_vdev(brt, vdevid); 1383 ASSERT(brtvd != NULL); 1384 1385 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); 1386 if (bre == NULL) { 1387 error = brt_entry_lookup(brt, brtvd, &bre_search); 1388 ASSERT(error == 0 || error == ENOENT); 1389 if (error == ENOENT) 1390 refcnt = 0; 1391 else 1392 refcnt = bre_search.bre_refcount; 1393 } else 1394 refcnt = bre->bre_refcount; 1395 1396 brt_unlock(brt); 1397 return (refcnt); 1398 } 1399 1400 static void 1401 brt_prefetch(brt_t *brt, const blkptr_t *bp) 1402 { 1403 brt_entry_t bre; 1404 uint64_t vdevid; 1405 1406 ASSERT(bp != NULL); 1407 1408 if (!zfs_brt_prefetch) 1409 return; 1410 1411 brt_entry_fill(bp, &bre, &vdevid); 1412 1413 brt_entry_prefetch(brt, vdevid, &bre); 1414 } 1415 1416 static int 1417 brt_pending_entry_compare(const void *x1, const void *x2) 1418 { 1419 const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2; 1420 const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp; 1421 int cmp; 1422 1423 cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2)); 1424 if (cmp == 0) { 1425 cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]), 1426 DVA_GET_VDEV(&bp2->blk_dva[0])); 1427 if (cmp == 0) { 1428 cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), 1429 DVA_GET_OFFSET(&bp2->blk_dva[0])); 1430 } 1431 } 1432 1433 return (cmp); 1434 } 1435 1436 void 1437 brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) 1438 { 1439 brt_t *brt; 1440 avl_tree_t *pending_tree; 1441 kmutex_t *pending_lock; 1442 brt_pending_entry_t *bpe, *newbpe; 1443 avl_index_t where; 1444 uint64_t txg; 1445 1446 brt = spa->spa_brt; 1447 txg = dmu_tx_get_txg(tx); 1448 ASSERT3U(txg, !=, 0); 1449 pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; 1450 pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; 1451 1452 newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP); 1453 newbpe->bpe_bp = *bp; 1454 newbpe->bpe_count = 1; 1455 1456 mutex_enter(pending_lock); 1457 1458 bpe = avl_find(pending_tree, newbpe, &where); 1459 if (bpe == NULL) { 1460 avl_insert(pending_tree, newbpe, where); 1461 newbpe = NULL; 1462 } else { 1463 bpe->bpe_count++; 1464 } 1465 1466 mutex_exit(pending_lock); 1467 1468 if (newbpe != NULL) { 1469 ASSERT(bpe != NULL); 1470 ASSERT(bpe != newbpe); 1471 kmem_cache_free(brt_pending_entry_cache, newbpe); 1472 } else { 1473 ASSERT(bpe == NULL); 1474 } 1475 1476 /* Prefetch BRT entry, as we will need it in the syncing context. */ 1477 brt_prefetch(brt, bp); 1478 } 1479 1480 void 1481 brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) 1482 { 1483 brt_t *brt; 1484 avl_tree_t *pending_tree; 1485 kmutex_t *pending_lock; 1486 brt_pending_entry_t *bpe, bpe_search; 1487 uint64_t txg; 1488 1489 brt = spa->spa_brt; 1490 txg = dmu_tx_get_txg(tx); 1491 ASSERT3U(txg, !=, 0); 1492 pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; 1493 pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; 1494 1495 bpe_search.bpe_bp = *bp; 1496 1497 mutex_enter(pending_lock); 1498 1499 bpe = avl_find(pending_tree, &bpe_search, NULL); 1500 /* I believe we should always find bpe when this function is called. */ 1501 if (bpe != NULL) { 1502 ASSERT(bpe->bpe_count > 0); 1503 1504 bpe->bpe_count--; 1505 if (bpe->bpe_count == 0) { 1506 avl_remove(pending_tree, bpe); 1507 kmem_cache_free(brt_pending_entry_cache, bpe); 1508 } 1509 } 1510 1511 mutex_exit(pending_lock); 1512 } 1513 1514 void 1515 brt_pending_apply(spa_t *spa, uint64_t txg) 1516 { 1517 brt_t *brt; 1518 brt_pending_entry_t *bpe; 1519 avl_tree_t *pending_tree; 1520 kmutex_t *pending_lock; 1521 void *c; 1522 1523 ASSERT3U(txg, !=, 0); 1524 1525 brt = spa->spa_brt; 1526 pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; 1527 pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; 1528 1529 mutex_enter(pending_lock); 1530 1531 c = NULL; 1532 while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) { 1533 boolean_t added_to_ddt; 1534 1535 mutex_exit(pending_lock); 1536 1537 for (int i = 0; i < bpe->bpe_count; i++) { 1538 /* 1539 * If the block has DEDUP bit set, it means that it 1540 * already exists in the DEDUP table, so we can just 1541 * use that instead of creating new entry in 1542 * the BRT table. 1543 */ 1544 if (BP_GET_DEDUP(&bpe->bpe_bp)) { 1545 added_to_ddt = ddt_addref(spa, &bpe->bpe_bp); 1546 } else { 1547 added_to_ddt = B_FALSE; 1548 } 1549 if (!added_to_ddt) 1550 brt_entry_addref(brt, &bpe->bpe_bp); 1551 } 1552 1553 kmem_cache_free(brt_pending_entry_cache, bpe); 1554 mutex_enter(pending_lock); 1555 } 1556 1557 mutex_exit(pending_lock); 1558 } 1559 1560 static void 1561 brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) 1562 { 1563 1564 ASSERT(RW_WRITE_HELD(&brt->brt_lock)); 1565 ASSERT(brtvd->bv_mos_entries != 0); 1566 1567 if (bre->bre_refcount == 0) { 1568 int error; 1569 1570 error = brt_entry_remove(brt, brtvd, bre, tx); 1571 ASSERT(error == 0 || error == ENOENT); 1572 /* 1573 * If error == ENOENT then zfs_clone_range() was done from a 1574 * removed (but opened) file (open(), unlink()). 1575 */ 1576 ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT); 1577 } else { 1578 VERIFY0(brt_entry_update(brt, brtvd, bre, tx)); 1579 } 1580 } 1581 1582 static void 1583 brt_sync_table(brt_t *brt, dmu_tx_t *tx) 1584 { 1585 brt_vdev_t *brtvd; 1586 brt_entry_t *bre; 1587 uint64_t vdevid; 1588 void *c; 1589 1590 brt_wlock(brt); 1591 1592 for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { 1593 brtvd = &brt->brt_vdevs[vdevid]; 1594 1595 if (!brtvd->bv_initiated) 1596 continue; 1597 1598 if (!brtvd->bv_meta_dirty) { 1599 ASSERT(!brtvd->bv_entcount_dirty); 1600 ASSERT0(avl_numnodes(&brtvd->bv_tree)); 1601 continue; 1602 } 1603 1604 ASSERT(!brtvd->bv_entcount_dirty || 1605 avl_numnodes(&brtvd->bv_tree) != 0); 1606 1607 if (brtvd->bv_mos_brtvdev == 0) 1608 brt_vdev_create(brt, brtvd, tx); 1609 1610 c = NULL; 1611 while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) { 1612 brt_sync_entry(brt, brtvd, bre, tx); 1613 brt_entry_free(bre); 1614 ASSERT(brt->brt_nentries > 0); 1615 brt->brt_nentries--; 1616 } 1617 1618 brt_vdev_sync(brt, brtvd, tx); 1619 1620 if (brtvd->bv_totalcount == 0) 1621 brt_vdev_destroy(brt, brtvd, tx); 1622 } 1623 1624 ASSERT0(brt->brt_nentries); 1625 1626 brt_unlock(brt); 1627 } 1628 1629 void 1630 brt_sync(spa_t *spa, uint64_t txg) 1631 { 1632 dmu_tx_t *tx; 1633 brt_t *brt; 1634 1635 ASSERT(spa_syncing_txg(spa) == txg); 1636 1637 brt = spa->spa_brt; 1638 brt_rlock(brt); 1639 if (brt->brt_nentries == 0) { 1640 /* No changes. */ 1641 brt_unlock(brt); 1642 return; 1643 } 1644 brt_unlock(brt); 1645 1646 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1647 1648 brt_sync_table(brt, tx); 1649 1650 dmu_tx_commit(tx); 1651 } 1652 1653 static void 1654 brt_table_alloc(brt_t *brt) 1655 { 1656 1657 for (int i = 0; i < TXG_SIZE; i++) { 1658 avl_create(&brt->brt_pending_tree[i], 1659 brt_pending_entry_compare, 1660 sizeof (brt_pending_entry_t), 1661 offsetof(brt_pending_entry_t, bpe_node)); 1662 mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT, 1663 NULL); 1664 } 1665 } 1666 1667 static void 1668 brt_table_free(brt_t *brt) 1669 { 1670 1671 for (int i = 0; i < TXG_SIZE; i++) { 1672 ASSERT(avl_is_empty(&brt->brt_pending_tree[i])); 1673 1674 avl_destroy(&brt->brt_pending_tree[i]); 1675 mutex_destroy(&brt->brt_pending_lock[i]); 1676 } 1677 } 1678 1679 static void 1680 brt_alloc(spa_t *spa) 1681 { 1682 brt_t *brt; 1683 1684 ASSERT(spa->spa_brt == NULL); 1685 1686 brt = kmem_zalloc(sizeof (*brt), KM_SLEEP); 1687 rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL); 1688 brt->brt_spa = spa; 1689 brt->brt_rangesize = 0; 1690 brt->brt_nentries = 0; 1691 brt->brt_vdevs = NULL; 1692 brt->brt_nvdevs = 0; 1693 brt_table_alloc(brt); 1694 1695 spa->spa_brt = brt; 1696 } 1697 1698 void 1699 brt_create(spa_t *spa) 1700 { 1701 1702 brt_alloc(spa); 1703 brt_vdevs_alloc(spa->spa_brt, B_FALSE); 1704 } 1705 1706 int 1707 brt_load(spa_t *spa) 1708 { 1709 1710 brt_alloc(spa); 1711 brt_vdevs_alloc(spa->spa_brt, B_TRUE); 1712 1713 return (0); 1714 } 1715 1716 void 1717 brt_unload(spa_t *spa) 1718 { 1719 brt_t *brt = spa->spa_brt; 1720 1721 if (brt == NULL) 1722 return; 1723 1724 brt_vdevs_free(brt); 1725 brt_table_free(brt); 1726 rw_destroy(&brt->brt_lock); 1727 kmem_free(brt, sizeof (*brt)); 1728 spa->spa_brt = NULL; 1729 } 1730 1731 /* BEGIN CSTYLED */ 1732 ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW, 1733 "Enable prefetching of BRT entries"); 1734 #ifdef ZFS_BRT_DEBUG 1735 ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug"); 1736 #endif 1737 /* END CSTYLED */ 1738