/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/brt.h>
#include <sys/brt_impl.h>
#include <sys/ddt.h>
#include <sys/bitmap.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/vdev_impl.h>
#include <sys/kstat.h>
#include <sys/wmsum.h>

/*
 * Block Cloning design.
 *
 * Block Cloning allows one to manually clone a file (or a subset of its
 * blocks) into another (or the same) file by just creating additional
 * references to the data blocks without copying the data itself. Those
 * references are kept in the Block Reference Tables (BRTs).
 *
 * In many ways this is similar to the existing deduplication, but there are
 * some important differences:
 *
 * - Deduplication is automatic and Block Cloning is not - one has to use a
 *   dedicated system call(s) to clone the given file/blocks.
 * - Deduplication keeps all data blocks in its table, even those referenced
 *   just once. Block Cloning creates an entry in its tables only when there
 *   are at least two references to the given data block. If the block was
 *   never explicitly cloned or the second to last reference was dropped,
 *   there will be neither space nor performance overhead.
 * - Deduplication needs data to work - one needs to pass real data to the
 *   write(2) syscall, so a hash can be calculated. Block Cloning doesn't
 *   require data, just block pointers to the data, so it is extremely fast,
 *   as we pay neither the cost of reading the data, nor the cost of writing
 *   the data - we operate exclusively on metadata.
 * - If the D (dedup) bit is not set in the block pointer, it means that
 *   the block is not in the dedup table (DDT) and we won't consult the DDT
 *   when we need to free the block. Block Cloning must be consulted on every
 *   free, because we cannot modify the source BP (e.g. by setting something
 *   similar to the D bit), thus we have no hint if the block is in the
 *   Block Reference Table (BRT), so we need to look into the BRT. There is
 *   an optimization in place that allows us to eliminate the majority of BRT
 *   lookups, which is described below in the "Minimizing free penalty"
 *   section.
 * - The BRT entry is much smaller than the DDT entry - for BRT we only store
 *   a 64-bit offset and a 64-bit reference counter.
 * - Dedup keys are cryptographic hashes, so two blocks that are close to
 *   each other on disk are most likely in totally different parts of the
 *   DDT. The BRT entry keys are offsets into a single top-level VDEV, so
 *   data blocks from one file should have BRT entries close to each other.
 * - Scrub will only do a single pass over a block that is referenced
 *   multiple times in the DDT. Unfortunately it is not currently (if at all)
 *   possible with Block Cloning and a block referenced multiple times will
 *   be scrubbed multiple times. The new, sorted scrub should be able to
 *   eliminate duplicated reads given enough memory.
 * - Deduplication requires a cryptographically strong hash as a checksum or
 *   additional data verification. Block Cloning works with any checksum
 *   algorithm or even with checksumming disabled.
 *
 * As mentioned above, the BRT entries are much smaller than the DDT entries.
 * To uniquely identify a block we just need its vdev id and offset. We also
 * need to maintain a reference counter. The vdev id will often repeat, as
 * there is a small number of top-level VDEVs and a large number of blocks
 * stored in each VDEV. We take advantage of that to reduce the BRT entry
 * size further by maintaining one BRT for each top-level VDEV, so we can
 * then have only offset and counter as the BRT entry.
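 *
 * Conceptually, each per-VDEV BRT is thus a ZAP mapping a 64-bit offset to
 * a 64-bit reference counter (a simplified sketch; the actual access goes
 * through brt_entry_lookup() below):
 *
 *	BRT[vdevid]: ZAP { offset (uint64_t) -> refcount (uint64_t) }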
 *
 * Minimizing free penalty.
 *
 * Block Cloning allows creating additional references to any existing
 * block. When we free a block there is no hint in the block pointer whether
 * the block was cloned or not, so on each free we have to check if there is
 * a corresponding entry in the BRT or not. If there is, we need to decrease
 * the reference counter. Doing a BRT lookup on every free can potentially
 * be expensive by requiring additional I/Os if the BRT doesn't fit into
 * memory. This is the main problem with deduplication, so we've learned our
 * lesson and try not to repeat the same mistake here. How do we do that? We
 * divide each top-level VDEV into 16MB regions. For each region we maintain
 * a counter that is a sum of all the BRT entries that have offsets within
 * the region. This creates an array of 16-bit entry counters for each
 * top-level VDEV. The entry counters array is always kept in memory and
 * updated on disk in the same transaction group as the BRT updates to keep
 * everything in sync. We can keep the array in memory, because it is very
 * small. With 16MB regions and a 1TB VDEV the array requires only 128kB of
 * memory (we may decide to decrease the region size even further in the
 * future). Now, when we want to free a block, we first consult the array.
 * If the counter for the whole region is zero, there is no need to look for
 * the BRT entry, as there isn't one for sure. If the counter for the region
 * is greater than zero, only then will we do a BRT lookup and if an entry
 * is found we will decrease the reference counter in the BRT entry and in
 * the entry counters array.
 *
 * The entry counters array is small, but can potentially be larger for very
 * large VDEVs or smaller regions. In this case we don't want to rewrite the
 * entire array on every change. We then divide the array into 32kB blocks
 * and keep a bitmap of dirty blocks within a transaction group. When we
 * sync the transaction group we can update only the parts of the entry
 * counters array that were modified. Note: Keeping track of the dirty parts
 * of the entry counters array is implemented, but updating only parts of
 * the array on disk is not yet implemented - for now we will update the
 * entire array if there was any change.
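 *
 * In pseudocode, the fast check on the free path looks as follows (a
 * simplified sketch; cf. brt_vdev_lookup() below):
 *
 *	idx = offset / spa_brt_rangesize;
 *	if (idx >= brtvd->bv_size || brt_vdev_entcount_get(brtvd, idx) == 0)
 *		there is certainly no BRT entry - free the block as usual;
 *	else
 *		look the block up in the BRT and decref it if found;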
 *
 * The implementation tries to be economic: if BRT is not used, or no longer
 * used, there will be no entries in the MOS and no additional memory used
 * (e.g. the entry counters array is only allocated if needed).
 *
 * Interaction between Deduplication and Block Cloning.
 *
 * If both functionalities are in use, we could end up with a block that is
 * referenced multiple times in both DDT and BRT. When we free one of the
 * references we couldn't tell where it belongs, so we would have to decide
 * which table takes precedence: do we first clear DDT references or BRT
 * references? To avoid this dilemma BRT cooperates with DDT - if a given
 * block is being cloned using BRT and the BP has the D (dedup) bit set, BRT
 * will look up the DDT entry instead and increase the counter there. No BRT
 * entry will be created for a block which has the D (dedup) bit set.
 * BRT may be more efficient for manual deduplication, but if the block is
 * already in the DDT, then creating an additional BRT entry would be less
 * efficient. This clever idea was proposed by Allan Jude.
 *
 * Block Cloning across datasets.
 *
 * Block Cloning is not limited to cloning blocks within the same dataset.
 * It is possible (and very useful) to clone blocks between different
 * datasets. One use case is recovering files from snapshots. By cloning the
 * files into the dataset we need no additional storage. Without Block
 * Cloning we would need additional space for those files.
 * Another interesting use case is moving the files between datasets
 * (copying the file content to the new dataset and removing the source
 * file). In that case Block Cloning will only be used briefly, because the
 * BRT entries will be removed when the source is removed.
 * Block Cloning across encrypted datasets is supported as long as both
 * datasets share the same master key (e.g. snapshots and clones).
 *
 * Block Cloning flow through ZFS layers.
 *
 * Note: Block Cloning can be used both for cloning file system blocks and
 * ZVOL blocks. As of this writing no interface is implemented that allows
 * for block cloning within a ZVOL.
 * FreeBSD and Linux provide the copy_file_range(2) system call and we will
 * use it for block cloning.
 *
 *	ssize_t
 *	copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
 *	    size_t len, unsigned int flags);
 *
 * Even though offsets and length represent bytes, they have to be
 * block-aligned or we will return an error so the upper layer can fall back
 * to the generic mechanism that will just copy the data.
 * Using copy_file_range(2) will call the OS-independent zfs_clone_range()
 * function. This function was implemented based on zfs_write(), but instead
 * of writing the given data we first read block pointers using the new
 * dmu_read_l0_bps() function from the source file. Once we have BPs from
 * the source file we call the dmu_brt_clone() function on the destination
 * file. This function allocates BPs for us. We iterate over all source BPs.
 * If the given BP is a hole or an embedded block, we just copy BP as-is.
 * If it points to real data we place this BP on a BRT pending list using
 * the brt_pending_add() function.
 *
 * We use this pending list to keep track of all BPs that got new references
 * within this transaction group.
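 *
 * The resulting call flow is roughly the following (a simplified sketch of
 * the functions named above):
 *
 *	zfs_clone_range()
 *	    dmu_read_l0_bps()		<- read BPs from the source file
 *	    dmu_brt_clone()		<- install BPs in the destination file
 *		brt_pending_add()	<- record new references for this txg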
 *
 * Some special cases to consider and how we address them:
 * - The block we want to clone may have been created within the same
 *   transaction group that we are trying to clone. Such a block has no BP
 *   allocated yet, so it cannot be immediately cloned. We return EAGAIN.
 * - The block we want to clone may have been modified within the same
 *   transaction group. We return EAGAIN.
 * - A block may be cloned multiple times during one transaction group
 *   (that's why the pending list is actually a tree and not an append-only
 *   list - this way we can figure out faster if this block is cloned for
 *   the first time in this txg or a consecutive time).
 * - A block may be cloned and freed within the same transaction group
 *   (see dbuf_undirty()).
 * - A block may be cloned and within the same transaction group the clone
 *   can be cloned again (see dmu_read_l0_bps()).
 * - A file might have been deleted, but the caller still has a file
 *   descriptor open to this file and clones it.
 *
 * When we free a block we have an additional step in the ZIO pipeline where
 * we call the zio_brt_free() function. We then call brt_entry_decref(),
 * which loads the corresponding BRT entry (if one exists) and decreases the
 * reference counter. If this is not the last reference we will stop the ZIO
 * pipeline here. If this is the last reference or the block is not in the
 * BRT, we continue the pipeline and free the block as usual.
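 *
 * The decision on the free path can be summarized as follows (a simplified
 * sketch; see brt_entry_decref() below):
 *
 *	if (brt_entry_decref(spa, bp))
 *		continue the pipeline and free the block as usual;
 *	else
 *		stop the pipeline - the data is still referenced;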
 *
 * At the beginning of spa_sync() where there can be no more block cloning,
 * but before issuing frees we call brt_pending_apply(). This function
 * applies all the new clones to the BRT table - we load BRT entries and
 * update reference counters. To sync new BRT entries to disk, we use the
 * brt_sync() function. This function will sync all dirty
 * per-top-level-vdev BRTs, the entry counters arrays, etc.
 *
 * Block Cloning and ZIL.
 *
 * Every clone operation is divided into chunks (similar to write) and each
 * chunk is cloned in a separate transaction. The chunk size is determined
 * by how many BPs we can fit into a single ZIL entry.
 * Replaying a clone operation is different from the regular clone
 * operation, as when we log clone operations we cannot use the source
 * object - it may reside on a different dataset, so we log BPs we want to
 * clone.
 * The ZIL is replayed when we mount the given dataset, not when the pool is
 * imported. Taking this into account it is possible that the pool is
 * imported without mounting datasets and the source dataset is destroyed
 * before the destination dataset is mounted and its ZIL replayed.
 * To address this situation we leverage the zil_claim() mechanism where ZFS
 * will parse all the ZILs on pool import. When we come across
 * TX_CLONE_RANGE entries, we will bump reference counters for their BPs in
 * the BRT. Then on mount and ZIL replay we bump the reference counters once
 * more, while the first references are dropped during ZIL destroy by
 * zil_free_clone_range(). It is possible that after zil_claim() we never
 * mount the destination, so we never replay its ZIL and just destroy it.
 * In this case the only taken references will be dropped by
 * zil_free_clone_range(), since the cloning is not going to ever take
 * place.
 */

static kmem_cache_t *brt_entry_cache;

/*
 * Enable/disable prefetching of BRT entries that we are going to modify.
 */
static int brt_zap_prefetch = 1;

#ifdef ZFS_DEBUG
#define	BRT_DEBUG(...)	do {						\
	if ((zfs_flags & ZFS_DEBUG_BRT) != 0) {				\
		__dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
	}								\
} while (0)
#else
#define	BRT_DEBUG(...)	do { } while (0)
#endif

static int brt_zap_default_bs = 12;
static int brt_zap_default_ibs = 12;

static kstat_t	*brt_ksp;

typedef struct brt_stats {
	kstat_named_t brt_addref_entry_not_on_disk;
	kstat_named_t brt_addref_entry_on_disk;
	kstat_named_t brt_decref_entry_in_memory;
	kstat_named_t brt_decref_entry_loaded_from_disk;
	kstat_named_t brt_decref_entry_not_in_memory;
	kstat_named_t brt_decref_entry_read_lost_race;
	kstat_named_t brt_decref_entry_still_referenced;
	kstat_named_t brt_decref_free_data_later;
	kstat_named_t brt_decref_free_data_now;
	kstat_named_t brt_decref_no_entry;
} brt_stats_t;

static brt_stats_t brt_stats = {
	{ "addref_entry_not_on_disk",	KSTAT_DATA_UINT64 },
	{ "addref_entry_on_disk",	KSTAT_DATA_UINT64 },
	{ "decref_entry_in_memory",	KSTAT_DATA_UINT64 },
	{ "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 },
	{ "decref_entry_not_in_memory",	KSTAT_DATA_UINT64 },
	{ "decref_entry_read_lost_race", KSTAT_DATA_UINT64 },
	{ "decref_entry_still_referenced", KSTAT_DATA_UINT64 },
	{ "decref_free_data_later",	KSTAT_DATA_UINT64 },
	{ "decref_free_data_now",	KSTAT_DATA_UINT64 },
	{ "decref_no_entry",		KSTAT_DATA_UINT64 }
};

struct {
	wmsum_t brt_addref_entry_not_on_disk;
	wmsum_t brt_addref_entry_on_disk;
	wmsum_t brt_decref_entry_in_memory;
	wmsum_t brt_decref_entry_loaded_from_disk;
	wmsum_t brt_decref_entry_not_in_memory;
	wmsum_t brt_decref_entry_read_lost_race;
	wmsum_t brt_decref_entry_still_referenced;
	wmsum_t brt_decref_free_data_later;
	wmsum_t brt_decref_free_data_now;
	wmsum_t brt_decref_no_entry;
} brt_sums;

#define	BRTSTAT_BUMP(stat)	wmsum_add(&brt_sums.stat, 1)

static int brt_entry_compare(const void *x1, const void *x2);
static void brt_vdevs_expand(spa_t *spa, uint64_t nvdevs);

static void
brt_rlock(spa_t *spa)
{
	rw_enter(&spa->spa_brt_lock, RW_READER);
}

static void
brt_wlock(spa_t *spa)
{
	rw_enter(&spa->spa_brt_lock, RW_WRITER);
}

static void
brt_unlock(spa_t *spa)
{
	rw_exit(&spa->spa_brt_lock);
}

static uint16_t
brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
{

	ASSERT3U(idx, <, brtvd->bv_size);

	if (unlikely(brtvd->bv_need_byteswap)) {
		return (BSWAP_16(brtvd->bv_entcount[idx]));
	} else {
		return (brtvd->bv_entcount[idx]);
	}
}

static void
brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
{

	ASSERT3U(idx, <, brtvd->bv_size);

	if (unlikely(brtvd->bv_need_byteswap)) {
		brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
	} else {
		brtvd->bv_entcount[idx] = entcnt;
	}
}

static void
brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
{
	uint16_t entcnt;

	ASSERT3U(idx, <, brtvd->bv_size);

	entcnt = brt_vdev_entcount_get(brtvd, idx);
	ASSERT(entcnt < UINT16_MAX);

	brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
}

static void
brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
{
	uint16_t entcnt;

	ASSERT3U(idx, <, brtvd->bv_size);

	entcnt = brt_vdev_entcount_get(brtvd, idx);
	ASSERT(entcnt > 0);

	brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
}

#ifdef ZFS_DEBUG
static void
brt_vdev_dump(brt_vdev_t *brtvd)
{
	uint64_t idx;

	uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
	zfs_dbgmsg("  BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d "
	    "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu",
	    (u_longlong_t)brtvd->bv_vdevid,
	    brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
	    (u_longlong_t)brtvd->bv_size,
	    (u_longlong_t)brtvd->bv_totalcount,
	    (u_longlong_t)nblocks,
	    (size_t)BT_SIZEOFMAP(nblocks));
	if (brtvd->bv_totalcount > 0) {
		zfs_dbgmsg("    entcounts:");
		for (idx = 0; idx < brtvd->bv_size; idx++) {
			uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx);
			if (entcnt > 0) {
				zfs_dbgmsg("      [%04llu] %hu",
				    (u_longlong_t)idx, entcnt);
			}
		}
	}
	if (brtvd->bv_entcount_dirty) {
		char *bitmap;

		bitmap = kmem_alloc(nblocks + 1, KM_SLEEP);
		for (idx = 0; idx < nblocks; idx++) {
			bitmap[idx] =
			    BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
		}
		bitmap[idx] = '\0';
		zfs_dbgmsg("  dirty: %s", bitmap);
		kmem_free(bitmap, nblocks + 1);
	}
}
#endif
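
/*
 * Return the brt_vdev_t for the given vdev id. If the id is beyond the
 * current spa_brt_vdevs[] array and alloc is TRUE (a new top-level VDEV
 * was added), expand the array under the write lock first.
 */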
static brt_vdev_t *
brt_vdev(spa_t *spa, uint64_t vdevid, boolean_t alloc)
{
	brt_vdev_t *brtvd = NULL;

	brt_rlock(spa);
	if (vdevid < spa->spa_brt_nvdevs) {
		brtvd = spa->spa_brt_vdevs[vdevid];
	} else if (alloc) {
		/* New VDEV was added. */
		brt_unlock(spa);
		brt_wlock(spa);
		if (vdevid >= spa->spa_brt_nvdevs)
			brt_vdevs_expand(spa, vdevid + 1);
		brtvd = spa->spa_brt_vdevs[vdevid];
	}
	brt_unlock(spa);
	return (brtvd);
}

static void
brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
	char name[64];

	ASSERT(brtvd->bv_initiated);
	ASSERT0(brtvd->bv_mos_brtvdev);
	ASSERT0(brtvd->bv_mos_entries);

	uint64_t mos_entries = zap_create_flags(spa->spa_meta_objset, 0,
	    ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
	    brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx);
	VERIFY(mos_entries != 0);
	VERIFY0(dnode_hold(spa->spa_meta_objset, mos_entries, brtvd,
	    &brtvd->bv_mos_entries_dnode));
	rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
	brtvd->bv_mos_entries = mos_entries;
	rw_exit(&brtvd->bv_mos_entries_lock);
	BRT_DEBUG("MOS entries created, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_entries);

	/*
	 * We allocate a DMU buffer to store the bv_entcount[] array.
	 * We will keep the array size (bv_size) and the cumulative count for
	 * all bv_entcount[]s (bv_totalcount) in the bonus buffer.
	 */
	brtvd->bv_mos_brtvdev = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
	    DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
	VERIFY(brtvd->bv_mos_brtvdev != 0);
	BRT_DEBUG("MOS BRT VDEV created, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_brtvdev);

	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
	    (u_longlong_t)brtvd->bv_vdevid);
	VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
	BRT_DEBUG("Pool directory object created, object=%s", name);

	spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
}
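
/*
 * Allocate, or grow after a vdev expansion, the in-memory bv_entcount[]
 * array and the dirty-block bitmap for the given vdev, preserving any
 * existing contents.
 */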
static void
brt_vdev_realloc(spa_t *spa, brt_vdev_t *brtvd)
{
	vdev_t *vd;
	uint16_t *entcount;
	ulong_t *bitmap;
	uint64_t nblocks, onblocks, size;

	ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(spa, brtvd->bv_vdevid);
	size = (vdev_get_min_asize(vd) - 1) / spa->spa_brt_rangesize + 1;
	spa_config_exit(spa, SCL_VDEV, FTAG);

	entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
	nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
	bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);

	if (!brtvd->bv_initiated) {
		ASSERT0(brtvd->bv_size);
		ASSERT0P(brtvd->bv_entcount);
		ASSERT0P(brtvd->bv_bitmap);
	} else {
		ASSERT(brtvd->bv_size > 0);
		ASSERT(brtvd->bv_entcount != NULL);
		ASSERT(brtvd->bv_bitmap != NULL);
		/*
		 * TODO: Allow vdev shrinking. We only need to implement
		 * shrinking the on-disk BRT VDEV object.
		 * dmu_free_range(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
		 *     offset, size, tx);
		 */
		ASSERT3U(brtvd->bv_size, <=, size);

		memcpy(entcount, brtvd->bv_entcount,
		    sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
		vmem_free(brtvd->bv_entcount,
		    sizeof (entcount[0]) * brtvd->bv_size);
		onblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
		memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
		    BT_SIZEOFMAP(onblocks)));
		kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(onblocks));
	}

	brtvd->bv_size = size;
	brtvd->bv_entcount = entcount;
	brtvd->bv_bitmap = bitmap;
	if (!brtvd->bv_initiated) {
		brtvd->bv_need_byteswap = FALSE;
		brtvd->bv_initiated = TRUE;
		BRT_DEBUG("BRT VDEV %llu initiated.",
		    (u_longlong_t)brtvd->bv_vdevid);
	}
}

static int
brt_vdev_load(spa_t *spa, brt_vdev_t *brtvd)
{
	dmu_buf_t *db;
	brt_vdev_phys_t *bvphys;
	int error;

	ASSERT(!brtvd->bv_initiated);
	ASSERT(brtvd->bv_mos_brtvdev != 0);

	error = dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
	    FTAG, &db);
	if (error != 0)
		return (error);

	bvphys = db->db_data;
	if (spa->spa_brt_rangesize == 0) {
		spa->spa_brt_rangesize = bvphys->bvp_rangesize;
	} else {
		ASSERT3U(spa->spa_brt_rangesize, ==, bvphys->bvp_rangesize);
	}

	brt_vdev_realloc(spa, brtvd);

	/* TODO: We don't support VDEV shrinking. */
	ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);

	/*
	 * If the VDEV grew, we will leave new bv_entcount[] entries zeroed
	 * out.
	 */
	error = dmu_read(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
	    MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
	    brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
	if (error != 0)
		return (error);

	ASSERT(bvphys->bvp_mos_entries != 0);
	VERIFY0(dnode_hold(spa->spa_meta_objset, bvphys->bvp_mos_entries, brtvd,
	    &brtvd->bv_mos_entries_dnode));
	rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
	brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
	rw_exit(&brtvd->bv_mos_entries_lock);
	brtvd->bv_need_byteswap =
	    (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
	brtvd->bv_totalcount = bvphys->bvp_totalcount;
	brtvd->bv_usedspace = bvphys->bvp_usedspace;
	brtvd->bv_savedspace = bvphys->bvp_savedspace;

	dmu_buf_rele(db, FTAG);

	BRT_DEBUG("BRT VDEV %llu loaded: mos_brtvdev=%llu, mos_entries=%llu",
	    (u_longlong_t)brtvd->bv_vdevid,
	    (u_longlong_t)brtvd->bv_mos_brtvdev,
	    (u_longlong_t)brtvd->bv_mos_entries);
	return (0);
}

static void
brt_vdev_dealloc(brt_vdev_t *brtvd)
{
	ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
	ASSERT(brtvd->bv_initiated);
	ASSERT0(avl_numnodes(&brtvd->bv_tree));

	vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
	brtvd->bv_entcount = NULL;
	uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
	kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(nblocks));
	brtvd->bv_bitmap = NULL;

	brtvd->bv_size = 0;

	brtvd->bv_initiated = FALSE;
	BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
}

static void
brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
	char name[64];
	uint64_t count;

	ASSERT(brtvd->bv_initiated);
	ASSERT(brtvd->bv_mos_brtvdev != 0);
	ASSERT(brtvd->bv_mos_entries != 0);
	ASSERT0(brtvd->bv_totalcount);
	ASSERT0(brtvd->bv_usedspace);
	ASSERT0(brtvd->bv_savedspace);

	uint64_t mos_entries = brtvd->bv_mos_entries;
	rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
	brtvd->bv_mos_entries = 0;
	rw_exit(&brtvd->bv_mos_entries_lock);
	dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
	brtvd->bv_mos_entries_dnode = NULL;
	ASSERT0(zap_count(spa->spa_meta_objset, mos_entries, &count));
	ASSERT0(count);
	VERIFY0(zap_destroy(spa->spa_meta_objset, mos_entries, tx));
	BRT_DEBUG("MOS entries destroyed, object=%llu",
	    (u_longlong_t)mos_entries);

	VERIFY0(dmu_object_free(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
	    tx));
	BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_brtvdev);
	brtvd->bv_mos_brtvdev = 0;
	brtvd->bv_entcount_dirty = FALSE;

	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
	    (u_longlong_t)brtvd->bv_vdevid);
	VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    name, tx));
	BRT_DEBUG("Pool directory object removed, object=%s", name);

	brtvd->bv_meta_dirty = FALSE;

	rw_enter(&brtvd->bv_lock, RW_WRITER);
	brt_vdev_dealloc(brtvd);
	rw_exit(&brtvd->bv_lock);

	spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
}
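
/*
 * Grow the spa_brt_vdevs[] array to hold nvdevs entries, allocating and
 * initializing a new brt_vdev_t for every added top-level VDEV.
 */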
static void
brt_vdevs_expand(spa_t *spa, uint64_t nvdevs)
{
	brt_vdev_t **vdevs;

	ASSERT(RW_WRITE_HELD(&spa->spa_brt_lock));
	ASSERT3U(nvdevs, >=, spa->spa_brt_nvdevs);

	if (nvdevs == spa->spa_brt_nvdevs)
		return;

	vdevs = kmem_zalloc(sizeof (*spa->spa_brt_vdevs) * nvdevs, KM_SLEEP);
	if (spa->spa_brt_nvdevs > 0) {
		ASSERT(spa->spa_brt_vdevs != NULL);

		memcpy(vdevs, spa->spa_brt_vdevs,
		    sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
		kmem_free(spa->spa_brt_vdevs,
		    sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
	}
	spa->spa_brt_vdevs = vdevs;

	for (uint64_t vdevid = spa->spa_brt_nvdevs; vdevid < nvdevs; vdevid++) {
		brt_vdev_t *brtvd = kmem_zalloc(sizeof (*brtvd), KM_SLEEP);
		rw_init(&brtvd->bv_lock, NULL, RW_DEFAULT, NULL);
		brtvd->bv_vdevid = vdevid;
		brtvd->bv_initiated = FALSE;
		rw_init(&brtvd->bv_mos_entries_lock, NULL, RW_DEFAULT, NULL);
		avl_create(&brtvd->bv_tree, brt_entry_compare,
		    sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
		for (int i = 0; i < TXG_SIZE; i++) {
			avl_create(&brtvd->bv_pending_tree[i],
			    brt_entry_compare, sizeof (brt_entry_t),
			    offsetof(brt_entry_t, bre_node));
		}
		mutex_init(&brtvd->bv_pending_lock, NULL, MUTEX_DEFAULT, NULL);
		spa->spa_brt_vdevs[vdevid] = brtvd;
	}

	BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
	    (u_longlong_t)spa->spa_brt_nvdevs, (u_longlong_t)nvdevs);
	spa->spa_brt_nvdevs = nvdevs;
}

static boolean_t
brt_vdev_lookup(spa_t *spa, brt_vdev_t *brtvd, uint64_t offset)
{
	uint64_t idx = offset / spa->spa_brt_rangesize;
	if (idx < brtvd->bv_size) {
		/* VDEV wasn't expanded. */
		return (brt_vdev_entcount_get(brtvd, idx) > 0);
	}
	return (FALSE);
}

static void
brt_vdev_addref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
    uint64_t dsize, uint64_t count)
{
	uint64_t idx;

	ASSERT(brtvd->bv_initiated);

	brtvd->bv_savedspace += dsize * count;
	brtvd->bv_meta_dirty = TRUE;

	if (bre->bre_count > 0)
		return;

	brtvd->bv_usedspace += dsize;

	idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;
	if (idx >= brtvd->bv_size) {
		/* VDEV has been expanded. */
		rw_enter(&brtvd->bv_lock, RW_WRITER);
		brt_vdev_realloc(spa, brtvd);
		rw_exit(&brtvd->bv_lock);
	}

	ASSERT3U(idx, <, brtvd->bv_size);

	brtvd->bv_totalcount++;
	brt_vdev_entcount_inc(brtvd, idx);
	brtvd->bv_entcount_dirty = TRUE;
	idx = idx / BRT_BLOCKSIZE / 8;
	BT_SET(brtvd->bv_bitmap, idx);
}

static void
brt_vdev_decref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
    uint64_t dsize)
{
	uint64_t idx;

	ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
	ASSERT(brtvd->bv_initiated);

	brtvd->bv_savedspace -= dsize;
	brtvd->bv_meta_dirty = TRUE;

	if (bre->bre_count > 0)
		return;

	brtvd->bv_usedspace -= dsize;

	idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;
	ASSERT3U(idx, <, brtvd->bv_size);

	ASSERT(brtvd->bv_totalcount > 0);
	brtvd->bv_totalcount--;
	brt_vdev_entcount_dec(brtvd, idx);
	brtvd->bv_entcount_dirty = TRUE;
	idx = idx / BRT_BLOCKSIZE / 8;
	BT_SET(brtvd->bv_bitmap, idx);
}
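
/*
 * Write out the vdev's dirty BRT state in syncing context: the
 * bv_entcount[] array (currently rewritten in full, see the TODO below)
 * and the brt_vdev_phys_t metadata kept in the bonus buffer.
 */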
static void
brt_vdev_sync(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
	dmu_buf_t *db;
	brt_vdev_phys_t *bvphys;

	ASSERT(brtvd->bv_meta_dirty);
	ASSERT(brtvd->bv_mos_brtvdev != 0);
	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
	    FTAG, &db));

	if (brtvd->bv_entcount_dirty) {
		/*
		 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
		 */
		dmu_write(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
		    brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
		    brtvd->bv_entcount, tx);
		uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
		memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(nblocks));
		brtvd->bv_entcount_dirty = FALSE;
	}

	dmu_buf_will_dirty(db, tx);
	bvphys = db->db_data;
	bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
	bvphys->bvp_size = brtvd->bv_size;
	if (brtvd->bv_need_byteswap) {
		bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
	} else {
		bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
	}
	bvphys->bvp_totalcount = brtvd->bv_totalcount;
	bvphys->bvp_rangesize = spa->spa_brt_rangesize;
	bvphys->bvp_usedspace = brtvd->bv_usedspace;
	bvphys->bvp_savedspace = brtvd->bv_savedspace;
	dmu_buf_rele(db, FTAG);

	brtvd->bv_meta_dirty = FALSE;
}

static void
brt_vdevs_free(spa_t *spa)
{
	if (spa->spa_brt_vdevs == NULL)
		return;
	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
		rw_enter(&brtvd->bv_lock, RW_WRITER);
		if (brtvd->bv_initiated)
			brt_vdev_dealloc(brtvd);
		rw_exit(&brtvd->bv_lock);
		rw_destroy(&brtvd->bv_lock);
		if (brtvd->bv_mos_entries != 0)
			dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
		rw_destroy(&brtvd->bv_mos_entries_lock);
		avl_destroy(&brtvd->bv_tree);
		for (int i = 0; i < TXG_SIZE; i++)
			avl_destroy(&brtvd->bv_pending_tree[i]);
		mutex_destroy(&brtvd->bv_pending_lock);
		kmem_free(brtvd, sizeof (*brtvd));
	}
	kmem_free(spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) *
	    spa->spa_brt_nvdevs);
}

static void
brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
{

	bre->bre_bp = *bp;
	bre->bre_count = 0;
	bre->bre_pcount = 0;

	*vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
}

static int
brt_entry_lookup(brt_vdev_t *brtvd, brt_entry_t *bre)
{
	uint64_t off = BRE_OFFSET(bre);

	return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
	    &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), &bre->bre_count));
}

/*
 * Return TRUE if we _can_ have a BRT entry for this bp. It might be a false
 * positive, but gives us a quick answer if we should look into the BRT,
 * which may require reads and thus will be more expensive.
 */
boolean_t
brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
{

	if (spa->spa_brt_nvdevs == 0)
		return (B_FALSE);

	uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
	if (brtvd == NULL || !brtvd->bv_initiated)
		return (B_FALSE);

	/*
	 * We don't need locks here, since the bv_entcount pointer must be
	 * stable at this point, and we don't care about false positive
	 * races here, while a false negative should be impossible, since
	 * all brt_vdev_addref() calls have already completed by this point.
	 */
	uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]);
	return (brt_vdev_lookup(spa, brtvd, off));
}
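
/*
 * A minimal usage sketch for the free path described at the top of this
 * file; the actual caller lives in the ZIO pipeline, outside of this file:
 *
 *	if (brt_maybe_exists(spa, bp))
 *		let zio_brt_free() call brt_entry_decref(spa, bp);
 *	else
 *		free the block without consulting the BRT;
 */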

uint64_t
brt_get_dspace(spa_t *spa)
{
	if (spa->spa_brt_nvdevs == 0)
		return (0);

	brt_rlock(spa);
	uint64_t s = 0;
	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)
		s += spa->spa_brt_vdevs[vdevid]->bv_savedspace;
	brt_unlock(spa);
	return (s);
}

uint64_t
brt_get_used(spa_t *spa)
{
	if (spa->spa_brt_nvdevs == 0)
		return (0);

	brt_rlock(spa);
	uint64_t s = 0;
	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)
		s += spa->spa_brt_vdevs[vdevid]->bv_usedspace;
	brt_unlock(spa);
	return (s);
}

uint64_t
brt_get_saved(spa_t *spa)
{
	return (brt_get_dspace(spa));
}

uint64_t
brt_get_ratio(spa_t *spa)
{
	uint64_t used = brt_get_used(spa);
	if (used == 0)
		return (100);
	return ((used + brt_get_saved(spa)) * 100 / used);
}

static int
brt_kstats_update(kstat_t *ksp, int rw)
{
	brt_stats_t *bs = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	bs->brt_addref_entry_not_on_disk.value.ui64 =
	    wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
	bs->brt_addref_entry_on_disk.value.ui64 =
	    wmsum_value(&brt_sums.brt_addref_entry_on_disk);
	bs->brt_decref_entry_in_memory.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_in_memory);
	bs->brt_decref_entry_loaded_from_disk.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
	bs->brt_decref_entry_not_in_memory.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
	bs->brt_decref_entry_read_lost_race.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
	bs->brt_decref_entry_still_referenced.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
	bs->brt_decref_free_data_later.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_free_data_later);
	bs->brt_decref_free_data_now.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_free_data_now);
	bs->brt_decref_no_entry.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_no_entry);

	return (0);
}

static void
brt_stat_init(void)
{

	wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
	wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
	wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
	wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
	wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
	wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
	wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
	wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
	wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
	wmsum_init(&brt_sums.brt_decref_no_entry, 0);

	brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (brt_ksp != NULL) {
		brt_ksp->ks_data = &brt_stats;
		brt_ksp->ks_update = brt_kstats_update;
		kstat_install(brt_ksp);
	}
}

static void
brt_stat_fini(void)
{
	if (brt_ksp != NULL) {
		kstat_delete(brt_ksp);
		brt_ksp = NULL;
	}

	wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
	wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
	wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
	wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
	wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
	wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
	wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
	wmsum_fini(&brt_sums.brt_decref_free_data_later);
	wmsum_fini(&brt_sums.brt_decref_free_data_now);
	wmsum_fini(&brt_sums.brt_decref_no_entry);
}

void
brt_init(void)
{
	brt_entry_cache = kmem_cache_create("brt_entry_cache",
	    sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	brt_stat_init();
}

void
brt_fini(void)
{
	brt_stat_fini();

	kmem_cache_destroy(brt_entry_cache);
}

/* Return TRUE if block should be freed immediately. */
boolean_t
brt_entry_decref(spa_t *spa, const blkptr_t *bp)
{
	brt_entry_t *bre, *racebre;
	brt_entry_t bre_search;
	avl_index_t where;
	uint64_t vdevid;
	int error;

	brt_entry_fill(bp, &bre_search, &vdevid);

	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
	ASSERT(brtvd != NULL);

	rw_enter(&brtvd->bv_lock, RW_WRITER);
	ASSERT(brtvd->bv_initiated);
	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
	if (bre != NULL) {
		BRTSTAT_BUMP(brt_decref_entry_in_memory);
		goto out;
	} else {
		BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
	}
	rw_exit(&brtvd->bv_lock);

	error = brt_entry_lookup(brtvd, &bre_search);
	/* bre_search now contains the correct bre_count */
	if (error == ENOENT) {
		BRTSTAT_BUMP(brt_decref_no_entry);
		return (B_TRUE);
	}
	ASSERT0(error);

	rw_enter(&brtvd->bv_lock, RW_WRITER);
	racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
	if (racebre != NULL) {
		/* The entry was added when the lock was dropped. */
		BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
		bre = racebre;
		goto out;
	}

	BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
	bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
	bre->bre_bp = bre_search.bre_bp;
	bre->bre_count = bre_search.bre_count;
	bre->bre_pcount = 0;
	avl_insert(&brtvd->bv_tree, bre, where);

out:
	if (bre->bre_count == 0) {
		rw_exit(&brtvd->bv_lock);
		BRTSTAT_BUMP(brt_decref_free_data_now);
		return (B_TRUE);
	}

	bre->bre_pcount--;
	ASSERT(bre->bre_count > 0);
	bre->bre_count--;
	if (bre->bre_count == 0)
		BRTSTAT_BUMP(brt_decref_free_data_later);
	else
		BRTSTAT_BUMP(brt_decref_entry_still_referenced);
	brt_vdev_decref(spa, brtvd, bre, bp_get_dsize_sync(spa, bp));

	rw_exit(&brtvd->bv_lock);

	return (B_FALSE);
}

uint64_t
brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
{
	brt_entry_t bre_search, *bre;
	uint64_t vdevid, refcnt;
	int error;

	brt_entry_fill(bp, &bre_search, &vdevid);

	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
	ASSERT(brtvd != NULL);

	rw_enter(&brtvd->bv_lock, RW_READER);
	ASSERT(brtvd->bv_initiated);
	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
	if (bre == NULL) {
		rw_exit(&brtvd->bv_lock);
		error = brt_entry_lookup(brtvd, &bre_search);
		if (error == ENOENT) {
			refcnt = 0;
		} else {
			ASSERT0(error);
			refcnt = bre_search.bre_count;
		}
	} else {
		refcnt = bre->bre_count;
		rw_exit(&brtvd->bv_lock);
	}

	return (refcnt);
}
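
/*
 * Prefetch the ZAP entry for this block's offset, so that the update done
 * later in syncing context is less likely to stall on reads.
 */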
static void
brt_prefetch(brt_vdev_t *brtvd, const blkptr_t *bp)
{
	if (!brt_zap_prefetch || brtvd->bv_mos_entries == 0)
		return;

	uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]);
	rw_enter(&brtvd->bv_mos_entries_lock, RW_READER);
	if (brtvd->bv_mos_entries != 0) {
		(void) zap_prefetch_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
		    &off, BRT_KEY_WORDS);
	}
	rw_exit(&brtvd->bv_mos_entries_lock);
}

static int
brt_entry_compare(const void *x1, const void *x2)
{
	const brt_entry_t *bre1 = x1, *bre2 = x2;
	const blkptr_t *bp1 = &bre1->bre_bp, *bp2 = &bre2->bre_bp;

	return (TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
	    DVA_GET_OFFSET(&bp2->blk_dva[0])));
}
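
/*
 * Record one new reference to bp taken in this open txg. The reference is
 * applied to the BRT only in syncing context, by brt_pending_apply().
 */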
void
brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
{
	brt_entry_t *bre, *newbre;
	avl_index_t where;
	uint64_t txg;

	txg = dmu_tx_get_txg(tx);
	ASSERT3U(txg, !=, 0);

	uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_TRUE);
	avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];

	newbre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
	newbre->bre_bp = *bp;
	newbre->bre_count = 0;
	newbre->bre_pcount = 1;

	mutex_enter(&brtvd->bv_pending_lock);
	bre = avl_find(pending_tree, newbre, &where);
	if (bre == NULL) {
		avl_insert(pending_tree, newbre, where);
		newbre = NULL;
	} else {
		bre->bre_pcount++;
	}
	mutex_exit(&brtvd->bv_pending_lock);

	if (newbre != NULL) {
		ASSERT(bre != NULL);
		ASSERT(bre != newbre);
		kmem_cache_free(brt_entry_cache, newbre);
	} else {
		ASSERT0P(bre);

		/* Prefetch BRT entry for the syncing context. */
		brt_prefetch(brtvd, bp);
	}
}
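
/*
 * Drop one pending reference to bp, e.g. when a cloned block is freed
 * again within the same transaction group (see dbuf_undirty()).
 */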
void
brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
{
	brt_entry_t *bre, bre_search;
	uint64_t txg;

	txg = dmu_tx_get_txg(tx);
	ASSERT3U(txg, !=, 0);

	uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
	ASSERT(brtvd != NULL);
	avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];

	bre_search.bre_bp = *bp;

	mutex_enter(&brtvd->bv_pending_lock);
	bre = avl_find(pending_tree, &bre_search, NULL);
	ASSERT(bre != NULL);
	ASSERT(bre->bre_pcount > 0);
	bre->bre_pcount--;
	if (bre->bre_pcount == 0)
		avl_remove(pending_tree, bre);
	else
		bre = NULL;
	mutex_exit(&brtvd->bv_pending_lock);

	if (bre)
		kmem_cache_free(brt_entry_cache, bre);
}

static void
brt_pending_apply_vdev(spa_t *spa, brt_vdev_t *brtvd, uint64_t txg)
{
	brt_entry_t *bre, *nbre;

	/*
	 * We are in syncing context, so no other bv_pending_tree accesses
	 * are possible for the TXG. So we don't need bv_pending_lock.
	 */
	ASSERT(avl_is_empty(&brtvd->bv_tree));
	avl_swap(&brtvd->bv_tree, &brtvd->bv_pending_tree[txg & TXG_MASK]);

	for (bre = avl_first(&brtvd->bv_tree); bre; bre = nbre) {
		nbre = AVL_NEXT(&brtvd->bv_tree, bre);

		/*
		 * If the block has the DEDUP bit set, it means that it
		 * already exists in the DEDUP table, so we can just use
		 * that instead of creating a new entry in the BRT.
		 */
		if (BP_GET_DEDUP(&bre->bre_bp)) {
			while (bre->bre_pcount > 0) {
				if (!ddt_addref(spa, &bre->bre_bp))
					break;
				bre->bre_pcount--;
			}
			if (bre->bre_pcount == 0) {
				avl_remove(&brtvd->bv_tree, bre);
				kmem_cache_free(brt_entry_cache, bre);
				continue;
			}
		}

		/*
		 * Unless we know that the block is definitely not in the
		 * ZAP, try to get its reference count from there.
		 */
		uint64_t off = BRE_OFFSET(bre);
		if (brtvd->bv_mos_entries != 0 &&
		    brt_vdev_lookup(spa, brtvd, off)) {
			int error = zap_lookup_uint64_by_dnode(
			    brtvd->bv_mos_entries_dnode, &off,
			    BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
			    &bre->bre_count);
			if (error == 0) {
				BRTSTAT_BUMP(brt_addref_entry_on_disk);
			} else {
				ASSERT3U(error, ==, ENOENT);
				BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
			}
		}
	}

	/*
	 * If all the cloned blocks we had were handled by the DDT, we don't
	 * need to initiate the vdev.
	 */
	if (avl_is_empty(&brtvd->bv_tree))
		return;

	if (!brtvd->bv_initiated) {
		rw_enter(&brtvd->bv_lock, RW_WRITER);
		brt_vdev_realloc(spa, brtvd);
		rw_exit(&brtvd->bv_lock);
	}

	/*
	 * Convert pending references into proper ones. This has to be a
	 * separate loop, since entcount modifications would cause false
	 * positives for brt_vdev_lookup() on following iterations.
	 */
	for (bre = avl_first(&brtvd->bv_tree); bre;
	    bre = AVL_NEXT(&brtvd->bv_tree, bre)) {
		brt_vdev_addref(spa, brtvd, bre,
		    bp_get_dsize(spa, &bre->bre_bp), bre->bre_pcount);
		bre->bre_count += bre->bre_pcount;
	}
}

void
brt_pending_apply(spa_t *spa, uint64_t txg)
{

	brt_rlock(spa);
	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
		brt_unlock(spa);

		brt_pending_apply_vdev(spa, brtvd, txg);

		brt_rlock(spa);
	}
	brt_unlock(spa);
}
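
/*
 * Push a single entry's final reference count to the ZAP: remove the entry
 * if the count dropped to zero, otherwise write the updated count.
 */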
static void
brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
{
	uint64_t off = BRE_OFFSET(bre);

	if (bre->bre_pcount == 0) {
		/* The net change is zero, nothing to do in ZAP. */
	} else if (bre->bre_count == 0) {
		int error = zap_remove_uint64_by_dnode(dn, &off,
		    BRT_KEY_WORDS, tx);
		VERIFY(error == 0 || error == ENOENT);
	} else {
		VERIFY0(zap_update_uint64_by_dnode(dn, &off,
		    BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
		    &bre->bre_count, tx));
	}
}

static void
brt_sync_table(spa_t *spa, dmu_tx_t *tx)
{
	brt_entry_t *bre;

	brt_rlock(spa);
	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
		brt_unlock(spa);

		if (!brtvd->bv_meta_dirty) {
			ASSERT(!brtvd->bv_entcount_dirty);
			ASSERT0(avl_numnodes(&brtvd->bv_tree));
			brt_rlock(spa);
			continue;
		}

		ASSERT(!brtvd->bv_entcount_dirty ||
		    avl_numnodes(&brtvd->bv_tree) != 0);

		if (brtvd->bv_mos_brtvdev == 0)
			brt_vdev_create(spa, brtvd, tx);

		void *c = NULL;
		while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
			brt_sync_entry(brtvd->bv_mos_entries_dnode, bre, tx);
			kmem_cache_free(brt_entry_cache, bre);
		}

#ifdef ZFS_DEBUG
		if (zfs_flags & ZFS_DEBUG_BRT)
			brt_vdev_dump(brtvd);
#endif
		if (brtvd->bv_totalcount == 0)
			brt_vdev_destroy(spa, brtvd, tx);
		else
			brt_vdev_sync(spa, brtvd, tx);
		brt_rlock(spa);
	}
	brt_unlock(spa);
}
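
/*
 * Sync all dirty per-vdev BRTs to disk for the given txg (see the "Block
 * Cloning flow through ZFS layers" comment at the top of this file).
 */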
void
brt_sync(spa_t *spa, uint64_t txg)
{
	dmu_tx_t *tx;
	uint64_t vdevid;

	ASSERT3U(spa_syncing_txg(spa), ==, txg);

	brt_rlock(spa);
	for (vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
		if (spa->spa_brt_vdevs[vdevid]->bv_meta_dirty)
			break;
	}
	if (vdevid >= spa->spa_brt_nvdevs) {
		brt_unlock(spa);
		return;
	}
	brt_unlock(spa);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	brt_sync_table(spa, tx);
	dmu_tx_commit(tx);
}

static void
brt_alloc(spa_t *spa)
{
	rw_init(&spa->spa_brt_lock, NULL, RW_DEFAULT, NULL);
	spa->spa_brt_vdevs = NULL;
	spa->spa_brt_nvdevs = 0;
	spa->spa_brt_rangesize = 0;
}

void
brt_create(spa_t *spa)
{
	brt_alloc(spa);
	spa->spa_brt_rangesize = BRT_RANGESIZE;
}
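
/*
 * On pool import, look up the per-vdev BRT objects in the MOS pool
 * directory and load any that exist.
 */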
int
brt_load(spa_t *spa)
{
	int error = 0;

	brt_alloc(spa);
	brt_wlock(spa);
	for (uint64_t vdevid = 0; vdevid < spa->spa_root_vdev->vdev_children;
	    vdevid++) {
		char name[64];
		uint64_t mos_brtvdev;

		/* Look if this vdev had active block cloning. */
		snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
		    (u_longlong_t)vdevid);
		error = zap_lookup(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1,
		    &mos_brtvdev);
		if (error == ENOENT) {
			error = 0;
			continue;
		}
		if (error != 0)
			break;

		/* If it did, then allocate them all and load this one. */
		brt_vdevs_expand(spa, spa->spa_root_vdev->vdev_children);
		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
		rw_enter(&brtvd->bv_lock, RW_WRITER);
		brtvd->bv_mos_brtvdev = mos_brtvdev;
		error = brt_vdev_load(spa, brtvd);
		rw_exit(&brtvd->bv_lock);
		if (error != 0)
			break;
	}

	if (spa->spa_brt_rangesize == 0)
		spa->spa_brt_rangesize = BRT_RANGESIZE;
	brt_unlock(spa);
	return (error);
}

void
brt_unload(spa_t *spa)
{
	if (spa->spa_brt_rangesize == 0)
		return;
	brt_vdevs_free(spa);
	rw_destroy(&spa->spa_brt_lock);
	spa->spa_brt_rangesize = 0;
}

ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW,
	"Enable prefetching of BRT ZAP entries");
ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW,
	"BRT ZAP leaf blockshift");
ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW,
	"BRT ZAP indirect blockshift");