1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 23 /* 24 * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek 25 */ 26 27 #include <sys/zfs_context.h> 28 #include <sys/spa.h> 29 #include <sys/spa_impl.h> 30 #include <sys/zio.h> 31 #include <sys/brt.h> 32 #include <sys/brt_impl.h> 33 #include <sys/ddt.h> 34 #include <sys/bitmap.h> 35 #include <sys/zap.h> 36 #include <sys/dmu_tx.h> 37 #include <sys/arc.h> 38 #include <sys/dsl_pool.h> 39 #include <sys/dsl_scan.h> 40 #include <sys/vdev_impl.h> 41 #include <sys/kstat.h> 42 #include <sys/wmsum.h> 43 44 /* 45 * Block Cloning design. 46 * 47 * Block Cloning allows to manually clone a file (or a subset of its blocks) 48 * into another (or the same) file by just creating additional references to 49 * the data blocks without copying the data itself. Those references are kept 50 * in the Block Reference Tables (BRTs). 51 * 52 * In many ways this is similar to the existing deduplication, but there are 53 * some important differences: 54 * 55 * - Deduplication is automatic and Block Cloning is not - one has to use a 56 * dedicated system call(s) to clone the given file/blocks. 
57 * - Deduplication keeps all data blocks in its table, even those referenced 58 * just once. Block Cloning creates an entry in its tables only when there 59 * are at least two references to the given data block. If the block was 60 * never explicitly cloned or the second to last reference was dropped, 61 * there will be neither space nor performance overhead. 62 * - Deduplication needs data to work - one needs to pass real data to the 63 * write(2) syscall, so hash can be calculated. Block Cloning doesn't require 64 * data, just block pointers to the data, so it is extremely fast, as we pay 65 * neither the cost of reading the data, nor the cost of writing the data - 66 * we operate exclusively on metadata. 67 * - If the D (dedup) bit is not set in the block pointer, it means that 68 * the block is not in the dedup table (DDT) and we won't consult the DDT 69 * when we need to free the block. Block Cloning must be consulted on every 70 * free, because we cannot modify the source BP (eg. by setting something 71 * similar to the D bit), thus we have no hint if the block is in the 72 * Block Reference Table (BRT), so we need to look into the BRT. There is 73 * an optimization in place that allows us to eliminate the majority of BRT 74 * lookups which is described below in the "Minimizing free penalty" section. 75 * - The BRT entry is much smaller than the DDT entry - for BRT we only store 76 * 64bit offset and 64bit reference counter. 77 * - Dedup keys are cryptographic hashes, so two blocks that are close to each 78 * other on disk are most likely in totally different parts of the DDT. 79 * The BRT entry keys are offsets into a single top-level VDEV, so data blocks 80 * from one file should have BRT entries close to each other. 81 * - Scrub will only do a single pass over a block that is referenced multiple 82 * times in the DDT. 
Unfortunately it is not currently (if at all) possible
 *   with Block Cloning, and a block referenced multiple times will be scrubbed
 *   multiple times. The new, sorted scrub should be able to eliminate
 *   duplicated reads given enough memory.
 * - Deduplication requires cryptographically strong hash as a checksum or
 *   additional data verification. Block Cloning works with any checksum
 *   algorithm or even with checksumming disabled.
 *
 * As mentioned above, the BRT entries are much smaller than the DDT entries.
 * To uniquely identify a block we just need its vdev id and offset. We also
 * need to maintain a reference counter. The vdev id will often repeat, as there
 * is a small number of top-level VDEVs and a large number of blocks stored in
 * each VDEV. We take advantage of that to reduce the BRT entry size further by
 * maintaining one BRT for each top-level VDEV, so we can then have only offset
 * and counter as the BRT entry.
 *
 * Minimizing free penalty.
 *
 * Block Cloning allows creating additional references to any existing block.
 * When we free a block there is no hint in the block pointer whether the block
 * was cloned or not, so on each free we have to check if there is a
 * corresponding entry in the BRT or not. If there is, we need to decrease
 * the reference counter. Doing BRT lookup on every free can potentially be
 * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
 * This is the main problem with deduplication, so we've learned our lesson and
 * try not to repeat the same mistake here. How do we do that? We divide each
 * top-level VDEV into 16MB regions. For each region we maintain a counter that
 * is a sum of all the BRT entries that have offsets within the region. This
 * creates the entries count array of 16bit numbers for each top-level VDEV.
 * The entries count array is always kept in memory and updated on disk in the
 * same transaction group as the BRT updates to keep everything in-sync. We can
 * keep the array in memory, because it is very small. With 16MB regions and
 * a 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
 * the region size even further in the future). Now, when we want to free
 * a block, we first consult the array. If the counter for the whole region is
 * zero, there is no need to look for the BRT entry, as there isn't one for
 * sure. If the counter for the region is greater than zero, only then we will
 * do a BRT lookup and if an entry is found we will decrease the reference
 * counter in the BRT entry and in the entry counters array.
 *
 * The entry counters array is small, but can potentially be larger for very
 * large VDEVs or smaller regions. In this case we don't want to rewrite the
 * entire array on every change. We then divide the array into 32kB blocks and
 * keep a bitmap of dirty blocks within a transaction group. When we sync the
 * transaction group we need to update only the parts of the entry counters
 * array that were modified. Note: Keeping track of the dirty parts of the
 * entry counters array is implemented, but updating only parts of the array
 * on disk is not yet implemented - for now we will update the entire array if
 * there was any change.
 *
 * The implementation tries to be economical: if BRT is not used, or no longer
 * used, there will be no entries in the MOS and no additional memory used (eg.
 * the entry counters array is only allocated if needed).
 *
 * Interaction between Deduplication and Block Cloning.
 *
 * If both functionalities are in use, we could end up with a block that is
 * referenced multiple times in both DDT and BRT.
When we free one of the
 * references we couldn't tell where it belongs, so we would have to decide
 * what table takes the precedence: do we first clear DDT references or BRT
 * references? To avoid this dilemma BRT cooperates with DDT - if a given block
 * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
 * look up the DDT entry instead and increase the counter there. No BRT entry
 * will be created for a block which has the D (dedup) bit set.
 * BRT may be more efficient for manual deduplication, but if the block is
 * already in the DDT, then creating additional BRT entry would be less
 * efficient. This clever idea was proposed by Allan Jude.
 *
 * Block Cloning across datasets.
 *
 * Block Cloning is not limited to cloning blocks within the same dataset.
 * It is possible (and very useful) to clone blocks between different datasets.
 * One use case is recovering files from snapshots. By cloning the files into
 * the dataset we need no additional storage. Without Block Cloning we would
 * need additional space for those files.
 * Another interesting use case is moving the files between datasets
 * (copying the file content to the new dataset and removing the source file).
 * In that case Block Cloning will only be used briefly, because the BRT entries
 * will be removed when the source is removed.
 * Block Cloning across encrypted datasets is supported as long as both
 * datasets share the same master key (e.g. snapshots and clones).
 *
 * Block Cloning flow through ZFS layers.
 *
 * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
 * blocks. As of this writing no interface is implemented that allows for block
 * cloning within a ZVOL.
 * FreeBSD and Linux provide the copy_file_range(2) system call and we will use
 * it for block cloning.
171 * 172 * ssize_t 173 * copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp, 174 * size_t len, unsigned int flags); 175 * 176 * Even though offsets and length represent bytes, they have to be 177 * block-aligned or we will return an error so the upper layer can 178 * fallback to the generic mechanism that will just copy the data. 179 * Using copy_file_range(2) will call OS-independent zfs_clone_range() function. 180 * This function was implemented based on zfs_write(), but instead of writing 181 * the given data we first read block pointers using the new dmu_read_l0_bps() 182 * function from the source file. Once we have BPs from the source file we call 183 * the dmu_brt_clone() function on the destination file. This function 184 * allocates BPs for us. We iterate over all source BPs. If the given BP is 185 * a hole or an embedded block, we just copy BP as-is. If it points to a real 186 * data we place this BP on a BRT pending list using the brt_pending_add() 187 * function. 188 * 189 * We use this pending list to keep track of all BPs that got new references 190 * within this transaction group. 191 * 192 * Some special cases to consider and how we address them: 193 * - The block we want to clone may have been created within the same 194 * transaction group that we are trying to clone. Such block has no BP 195 * allocated yet, so cannot be immediately cloned. We return EAGAIN. 196 * - The block we want to clone may have been modified within the same 197 * transaction group. We return EAGAIN. 198 * - A block may be cloned multiple times during one transaction group (that's 199 * why pending list is actually a tree and not an append-only list - this 200 * way we can figure out faster if this block is cloned for the first time 201 * in this txg or consecutive time). 202 * - A block may be cloned and freed within the same transaction group 203 * (see dbuf_undirty()). 
204 * - A block may be cloned and within the same transaction group the clone 205 * can be cloned again (see dmu_read_l0_bps()). 206 * - A file might have been deleted, but the caller still has a file descriptor 207 * open to this file and clones it. 208 * 209 * When we free a block we have an additional step in the ZIO pipeline where we 210 * call the zio_brt_free() function. We then call the brt_entry_decref() 211 * that loads the corresponding BRT entry (if one exists) and decreases 212 * reference counter. If this is not the last reference we will stop ZIO 213 * pipeline here. If this is the last reference or the block is not in the 214 * BRT, we continue the pipeline and free the block as usual. 215 * 216 * At the beginning of spa_sync() where there can be no more block cloning, 217 * but before issuing frees we call brt_pending_apply(). This function applies 218 * all the new clones to the BRT table - we load BRT entries and update 219 * reference counters. To sync new BRT entries to disk, we use brt_sync() 220 * function. This function will sync all dirty per-top-level-vdev BRTs, 221 * the entry counters arrays, etc. 222 * 223 * Block Cloning and ZIL. 224 * 225 * Every clone operation is divided into chunks (similar to write) and each 226 * chunk is cloned in a separate transaction. The chunk size is determined by 227 * how many BPs we can fit into a single ZIL entry. 228 * Replaying clone operation is different from the regular clone operation, 229 * as when we log clone operations we cannot use the source object - it may 230 * reside on a different dataset, so we log BPs we want to clone. 231 * The ZIL is replayed when we mount the given dataset, not when the pool is 232 * imported. Taking this into account it is possible that the pool is imported 233 * without mounting datasets and the source dataset is destroyed before the 234 * destination dataset is mounted and its ZIL replayed. 
235 * To address this situation we leverage zil_claim() mechanism where ZFS will 236 * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE 237 * entries, we will bump reference counters for their BPs in the BRT. Then 238 * on mount and ZIL replay we bump the reference counters once more, while the 239 * first references are dropped during ZIL destroy by zil_free_clone_range(). 240 * It is possible that after zil_claim() we never mount the destination, so 241 * we never replay its ZIL and just destroy it. In this case the only taken 242 * references will be dropped by zil_free_clone_range(), since the cloning is 243 * not going to ever take place. 244 */ 245 246 static kmem_cache_t *brt_entry_cache; 247 248 /* 249 * Enable/disable prefetching of BRT entries that we are going to modify. 250 */ 251 static int brt_zap_prefetch = 1; 252 253 #ifdef ZFS_DEBUG 254 #define BRT_DEBUG(...) do { \ 255 if ((zfs_flags & ZFS_DEBUG_BRT) != 0) { \ 256 __dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \ 257 } \ 258 } while (0) 259 #else 260 #define BRT_DEBUG(...) 
do { } while (0) 261 #endif 262 263 static int brt_zap_default_bs = 13; 264 static int brt_zap_default_ibs = 13; 265 266 static kstat_t *brt_ksp; 267 268 typedef struct brt_stats { 269 kstat_named_t brt_addref_entry_not_on_disk; 270 kstat_named_t brt_addref_entry_on_disk; 271 kstat_named_t brt_decref_entry_in_memory; 272 kstat_named_t brt_decref_entry_loaded_from_disk; 273 kstat_named_t brt_decref_entry_not_in_memory; 274 kstat_named_t brt_decref_entry_read_lost_race; 275 kstat_named_t brt_decref_entry_still_referenced; 276 kstat_named_t brt_decref_free_data_later; 277 kstat_named_t brt_decref_free_data_now; 278 kstat_named_t brt_decref_no_entry; 279 } brt_stats_t; 280 281 static brt_stats_t brt_stats = { 282 { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 }, 283 { "addref_entry_on_disk", KSTAT_DATA_UINT64 }, 284 { "decref_entry_in_memory", KSTAT_DATA_UINT64 }, 285 { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 }, 286 { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 }, 287 { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 }, 288 { "decref_entry_still_referenced", KSTAT_DATA_UINT64 }, 289 { "decref_free_data_later", KSTAT_DATA_UINT64 }, 290 { "decref_free_data_now", KSTAT_DATA_UINT64 }, 291 { "decref_no_entry", KSTAT_DATA_UINT64 } 292 }; 293 294 struct { 295 wmsum_t brt_addref_entry_not_on_disk; 296 wmsum_t brt_addref_entry_on_disk; 297 wmsum_t brt_decref_entry_in_memory; 298 wmsum_t brt_decref_entry_loaded_from_disk; 299 wmsum_t brt_decref_entry_not_in_memory; 300 wmsum_t brt_decref_entry_read_lost_race; 301 wmsum_t brt_decref_entry_still_referenced; 302 wmsum_t brt_decref_free_data_later; 303 wmsum_t brt_decref_free_data_now; 304 wmsum_t brt_decref_no_entry; 305 } brt_sums; 306 307 #define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1) 308 309 static int brt_entry_compare(const void *x1, const void *x2); 310 static void brt_vdevs_expand(spa_t *spa, uint64_t nvdevs); 311 312 static void 313 brt_rlock(spa_t *spa) 314 { 315 rw_enter(&spa->spa_brt_lock, 
RW_READER); 316 } 317 318 static void 319 brt_wlock(spa_t *spa) 320 { 321 rw_enter(&spa->spa_brt_lock, RW_WRITER); 322 } 323 324 static void 325 brt_unlock(spa_t *spa) 326 { 327 rw_exit(&spa->spa_brt_lock); 328 } 329 330 static uint16_t 331 brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx) 332 { 333 334 ASSERT3U(idx, <, brtvd->bv_size); 335 336 if (unlikely(brtvd->bv_need_byteswap)) { 337 return (BSWAP_16(brtvd->bv_entcount[idx])); 338 } else { 339 return (brtvd->bv_entcount[idx]); 340 } 341 } 342 343 static void 344 brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt) 345 { 346 347 ASSERT3U(idx, <, brtvd->bv_size); 348 349 if (unlikely(brtvd->bv_need_byteswap)) { 350 brtvd->bv_entcount[idx] = BSWAP_16(entcnt); 351 } else { 352 brtvd->bv_entcount[idx] = entcnt; 353 } 354 } 355 356 static void 357 brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx) 358 { 359 uint16_t entcnt; 360 361 ASSERT3U(idx, <, brtvd->bv_size); 362 363 entcnt = brt_vdev_entcount_get(brtvd, idx); 364 ASSERT(entcnt < UINT16_MAX); 365 366 brt_vdev_entcount_set(brtvd, idx, entcnt + 1); 367 } 368 369 static void 370 brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx) 371 { 372 uint16_t entcnt; 373 374 ASSERT3U(idx, <, brtvd->bv_size); 375 376 entcnt = brt_vdev_entcount_get(brtvd, idx); 377 ASSERT(entcnt > 0); 378 379 brt_vdev_entcount_set(brtvd, idx, entcnt - 1); 380 } 381 382 #ifdef ZFS_DEBUG 383 static void 384 brt_vdev_dump(brt_vdev_t *brtvd) 385 { 386 uint64_t idx; 387 388 uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); 389 zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d " 390 "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu", 391 (u_longlong_t)brtvd->bv_vdevid, 392 brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty, 393 (u_longlong_t)brtvd->bv_size, 394 (u_longlong_t)brtvd->bv_totalcount, 395 (u_longlong_t)nblocks, 396 (size_t)BT_SIZEOFMAP(nblocks)); 397 if (brtvd->bv_totalcount > 0) { 398 zfs_dbgmsg(" entcounts:"); 399 for (idx = 
0; idx < brtvd->bv_size; idx++) { 400 uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx); 401 if (entcnt > 0) { 402 zfs_dbgmsg(" [%04llu] %hu", 403 (u_longlong_t)idx, entcnt); 404 } 405 } 406 } 407 if (brtvd->bv_entcount_dirty) { 408 char *bitmap; 409 410 bitmap = kmem_alloc(nblocks + 1, KM_SLEEP); 411 for (idx = 0; idx < nblocks; idx++) { 412 bitmap[idx] = 413 BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.'; 414 } 415 bitmap[idx] = '\0'; 416 zfs_dbgmsg(" dirty: %s", bitmap); 417 kmem_free(bitmap, nblocks + 1); 418 } 419 } 420 #endif 421 422 static brt_vdev_t * 423 brt_vdev(spa_t *spa, uint64_t vdevid, boolean_t alloc) 424 { 425 brt_vdev_t *brtvd = NULL; 426 427 brt_rlock(spa); 428 if (vdevid < spa->spa_brt_nvdevs) { 429 brtvd = spa->spa_brt_vdevs[vdevid]; 430 } else if (alloc) { 431 /* New VDEV was added. */ 432 brt_unlock(spa); 433 brt_wlock(spa); 434 if (vdevid >= spa->spa_brt_nvdevs) 435 brt_vdevs_expand(spa, vdevid + 1); 436 brtvd = spa->spa_brt_vdevs[vdevid]; 437 } 438 brt_unlock(spa); 439 return (brtvd); 440 } 441 442 static void 443 brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) 444 { 445 char name[64]; 446 447 ASSERT(brtvd->bv_initiated); 448 ASSERT0(brtvd->bv_mos_brtvdev); 449 ASSERT0(brtvd->bv_mos_entries); 450 451 uint64_t mos_entries = zap_create_flags(spa->spa_meta_objset, 0, 452 ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA, 453 brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx); 454 VERIFY(mos_entries != 0); 455 VERIFY0(dnode_hold(spa->spa_meta_objset, mos_entries, brtvd, 456 &brtvd->bv_mos_entries_dnode)); 457 dnode_set_storage_type(brtvd->bv_mos_entries_dnode, DMU_OT_DDT_ZAP); 458 rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); 459 brtvd->bv_mos_entries = mos_entries; 460 rw_exit(&brtvd->bv_mos_entries_lock); 461 BRT_DEBUG("MOS entries created, object=%llu", 462 (u_longlong_t)brtvd->bv_mos_entries); 463 464 /* 465 * We allocate DMU buffer to store the bv_entcount[] array. 
466 * We will keep array size (bv_size) and cummulative count for all 467 * bv_entcount[]s (bv_totalcount) in the bonus buffer. 468 */ 469 brtvd->bv_mos_brtvdev = dmu_object_alloc(spa->spa_meta_objset, 470 DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE, 471 DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx); 472 VERIFY(brtvd->bv_mos_brtvdev != 0); 473 BRT_DEBUG("MOS BRT VDEV created, object=%llu", 474 (u_longlong_t)brtvd->bv_mos_brtvdev); 475 476 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, 477 (u_longlong_t)brtvd->bv_vdevid); 478 VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, 479 sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx)); 480 BRT_DEBUG("Pool directory object created, object=%s", name); 481 482 /* 483 * Activate the endian-fixed feature if this is the first BRT ZAP 484 * (i.e., BLOCK_CLONING is not yet active) and the feature is enabled. 485 */ 486 if (spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN) && 487 !spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { 488 spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx); 489 } else if (spa_feature_is_active(spa, 490 SPA_FEATURE_BLOCK_CLONING_ENDIAN)) { 491 spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx); 492 } 493 494 spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx); 495 } 496 497 static void 498 brt_vdev_realloc(spa_t *spa, brt_vdev_t *brtvd) 499 { 500 vdev_t *vd; 501 uint16_t *entcount; 502 ulong_t *bitmap; 503 uint64_t nblocks, onblocks, size; 504 505 ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); 506 507 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 508 vd = vdev_lookup_top(spa, brtvd->bv_vdevid); 509 size = (vdev_get_min_asize(vd) - 1) / spa->spa_brt_rangesize + 1; 510 spa_config_exit(spa, SCL_VDEV, FTAG); 511 512 nblocks = BRT_RANGESIZE_TO_NBLOCKS(size); 513 entcount = vmem_zalloc(nblocks * BRT_BLOCKSIZE, KM_SLEEP); 514 bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP); 515 516 if (!brtvd->bv_initiated) { 517 
ASSERT0(brtvd->bv_size); 518 ASSERT0P(brtvd->bv_entcount); 519 ASSERT0P(brtvd->bv_bitmap); 520 } else { 521 ASSERT(brtvd->bv_size > 0); 522 ASSERT(brtvd->bv_entcount != NULL); 523 ASSERT(brtvd->bv_bitmap != NULL); 524 /* 525 * TODO: Allow vdev shrinking. We only need to implement 526 * shrinking the on-disk BRT VDEV object. 527 * dmu_free_range(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 528 * offset, size, tx); 529 */ 530 ASSERT3U(brtvd->bv_size, <=, size); 531 532 memcpy(entcount, brtvd->bv_entcount, 533 sizeof (entcount[0]) * MIN(size, brtvd->bv_size)); 534 onblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); 535 vmem_free(brtvd->bv_entcount, onblocks * BRT_BLOCKSIZE); 536 memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks), 537 BT_SIZEOFMAP(onblocks))); 538 kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(onblocks)); 539 } 540 541 brtvd->bv_size = size; 542 brtvd->bv_entcount = entcount; 543 brtvd->bv_bitmap = bitmap; 544 if (!brtvd->bv_initiated) { 545 brtvd->bv_need_byteswap = FALSE; 546 brtvd->bv_initiated = TRUE; 547 BRT_DEBUG("BRT VDEV %llu initiated.", 548 (u_longlong_t)brtvd->bv_vdevid); 549 } 550 } 551 552 static int 553 brt_vdev_load(spa_t *spa, brt_vdev_t *brtvd) 554 { 555 dmu_buf_t *db; 556 brt_vdev_phys_t *bvphys; 557 int error; 558 559 ASSERT(!brtvd->bv_initiated); 560 ASSERT(brtvd->bv_mos_brtvdev != 0); 561 562 error = dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 563 FTAG, &db); 564 if (error != 0) 565 return (error); 566 567 bvphys = db->db_data; 568 if (spa->spa_brt_rangesize == 0) { 569 spa->spa_brt_rangesize = bvphys->bvp_rangesize; 570 } else { 571 ASSERT3U(spa->spa_brt_rangesize, ==, bvphys->bvp_rangesize); 572 } 573 574 brt_vdev_realloc(spa, brtvd); 575 576 /* TODO: We don't support VDEV shrinking. */ 577 ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size); 578 579 /* 580 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out. 
581 */ 582 error = dmu_read(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0, 583 MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t), 584 brtvd->bv_entcount, DMU_READ_NO_PREFETCH | DMU_UNCACHEDIO); 585 if (error != 0) 586 return (error); 587 588 ASSERT(bvphys->bvp_mos_entries != 0); 589 VERIFY0(dnode_hold(spa->spa_meta_objset, bvphys->bvp_mos_entries, brtvd, 590 &brtvd->bv_mos_entries_dnode)); 591 dnode_set_storage_type(brtvd->bv_mos_entries_dnode, DMU_OT_DDT_ZAP); 592 rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); 593 brtvd->bv_mos_entries = bvphys->bvp_mos_entries; 594 rw_exit(&brtvd->bv_mos_entries_lock); 595 brtvd->bv_need_byteswap = 596 (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER); 597 brtvd->bv_totalcount = bvphys->bvp_totalcount; 598 brtvd->bv_usedspace = bvphys->bvp_usedspace; 599 brtvd->bv_savedspace = bvphys->bvp_savedspace; 600 601 dmu_buf_rele(db, FTAG); 602 603 BRT_DEBUG("BRT VDEV %llu loaded: mos_brtvdev=%llu, mos_entries=%llu", 604 (u_longlong_t)brtvd->bv_vdevid, 605 (u_longlong_t)brtvd->bv_mos_brtvdev, 606 (u_longlong_t)brtvd->bv_mos_entries); 607 return (0); 608 } 609 610 static void 611 brt_vdev_dealloc(brt_vdev_t *brtvd) 612 { 613 ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); 614 ASSERT(brtvd->bv_initiated); 615 ASSERT0(avl_numnodes(&brtvd->bv_tree)); 616 617 uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); 618 vmem_free(brtvd->bv_entcount, nblocks * BRT_BLOCKSIZE); 619 brtvd->bv_entcount = NULL; 620 kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(nblocks)); 621 brtvd->bv_bitmap = NULL; 622 623 brtvd->bv_size = 0; 624 625 brtvd->bv_initiated = FALSE; 626 BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid); 627 } 628 629 static void 630 brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) 631 { 632 char name[64]; 633 uint64_t count; 634 635 ASSERT(brtvd->bv_initiated); 636 ASSERT(brtvd->bv_mos_brtvdev != 0); 637 ASSERT(brtvd->bv_mos_entries != 0); 638 ASSERT0(brtvd->bv_totalcount); 639 
ASSERT0(brtvd->bv_usedspace); 640 ASSERT0(brtvd->bv_savedspace); 641 642 uint64_t mos_entries = brtvd->bv_mos_entries; 643 rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); 644 brtvd->bv_mos_entries = 0; 645 rw_exit(&brtvd->bv_mos_entries_lock); 646 dnode_rele(brtvd->bv_mos_entries_dnode, brtvd); 647 brtvd->bv_mos_entries_dnode = NULL; 648 ASSERT0(zap_count(spa->spa_meta_objset, mos_entries, &count)); 649 ASSERT0(count); 650 VERIFY0(zap_destroy(spa->spa_meta_objset, mos_entries, tx)); 651 BRT_DEBUG("MOS entries destroyed, object=%llu", 652 (u_longlong_t)mos_entries); 653 654 VERIFY0(dmu_object_free(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 655 tx)); 656 BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu", 657 (u_longlong_t)brtvd->bv_mos_brtvdev); 658 brtvd->bv_mos_brtvdev = 0; 659 brtvd->bv_entcount_dirty = FALSE; 660 661 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, 662 (u_longlong_t)brtvd->bv_vdevid); 663 VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 664 name, tx)); 665 BRT_DEBUG("Pool directory object removed, object=%s", name); 666 667 brtvd->bv_meta_dirty = FALSE; 668 669 rw_enter(&brtvd->bv_lock, RW_WRITER); 670 brt_vdev_dealloc(brtvd); 671 rw_exit(&brtvd->bv_lock); 672 673 spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx); 674 if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN)) 675 spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx); 676 } 677 678 static void 679 brt_vdevs_expand(spa_t *spa, uint64_t nvdevs) 680 { 681 brt_vdev_t **vdevs; 682 683 ASSERT(RW_WRITE_HELD(&spa->spa_brt_lock)); 684 ASSERT3U(nvdevs, >=, spa->spa_brt_nvdevs); 685 686 if (nvdevs == spa->spa_brt_nvdevs) 687 return; 688 689 vdevs = kmem_zalloc(sizeof (*spa->spa_brt_vdevs) * nvdevs, KM_SLEEP); 690 if (spa->spa_brt_nvdevs > 0) { 691 ASSERT(spa->spa_brt_vdevs != NULL); 692 693 memcpy(vdevs, spa->spa_brt_vdevs, 694 sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs); 695 kmem_free(spa->spa_brt_vdevs, 696 sizeof 
(*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs); 697 } 698 spa->spa_brt_vdevs = vdevs; 699 700 for (uint64_t vdevid = spa->spa_brt_nvdevs; vdevid < nvdevs; vdevid++) { 701 brt_vdev_t *brtvd = kmem_zalloc(sizeof (*brtvd), KM_SLEEP); 702 rw_init(&brtvd->bv_lock, NULL, RW_DEFAULT, NULL); 703 brtvd->bv_vdevid = vdevid; 704 brtvd->bv_initiated = FALSE; 705 rw_init(&brtvd->bv_mos_entries_lock, NULL, RW_DEFAULT, NULL); 706 avl_create(&brtvd->bv_tree, brt_entry_compare, 707 sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node)); 708 for (int i = 0; i < TXG_SIZE; i++) { 709 avl_create(&brtvd->bv_pending_tree[i], 710 brt_entry_compare, sizeof (brt_entry_t), 711 offsetof(brt_entry_t, bre_node)); 712 } 713 mutex_init(&brtvd->bv_pending_lock, NULL, MUTEX_DEFAULT, NULL); 714 spa->spa_brt_vdevs[vdevid] = brtvd; 715 } 716 717 BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.", 718 (u_longlong_t)spa->spa_brt_nvdevs, (u_longlong_t)nvdevs); 719 spa->spa_brt_nvdevs = nvdevs; 720 } 721 722 static boolean_t 723 brt_vdev_lookup(spa_t *spa, brt_vdev_t *brtvd, uint64_t offset) 724 { 725 uint64_t idx = offset / spa->spa_brt_rangesize; 726 if (idx < brtvd->bv_size) { 727 /* VDEV wasn't expanded. */ 728 return (brt_vdev_entcount_get(brtvd, idx) > 0); 729 } 730 return (FALSE); 731 } 732 733 static void 734 brt_vdev_addref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre, 735 uint64_t dsize, uint64_t count) 736 { 737 uint64_t idx; 738 739 ASSERT(brtvd->bv_initiated); 740 741 brtvd->bv_savedspace += dsize * count; 742 brtvd->bv_meta_dirty = TRUE; 743 744 if (bre->bre_count > 0) 745 return; 746 747 brtvd->bv_usedspace += dsize; 748 749 idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize; 750 if (idx >= brtvd->bv_size) { 751 /* VDEV has been expanded. 
*/ 752 rw_enter(&brtvd->bv_lock, RW_WRITER); 753 brt_vdev_realloc(spa, brtvd); 754 rw_exit(&brtvd->bv_lock); 755 } 756 757 ASSERT3U(idx, <, brtvd->bv_size); 758 759 brtvd->bv_totalcount++; 760 brt_vdev_entcount_inc(brtvd, idx); 761 brtvd->bv_entcount_dirty = TRUE; 762 idx = idx / BRT_BLOCKSIZE / 8; 763 BT_SET(brtvd->bv_bitmap, idx); 764 } 765 766 static void 767 brt_vdev_decref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre, 768 uint64_t dsize) 769 { 770 uint64_t idx; 771 772 ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); 773 ASSERT(brtvd->bv_initiated); 774 775 brtvd->bv_savedspace -= dsize; 776 brtvd->bv_meta_dirty = TRUE; 777 778 if (bre->bre_count > 0) 779 return; 780 781 brtvd->bv_usedspace -= dsize; 782 783 idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize; 784 ASSERT3U(idx, <, brtvd->bv_size); 785 786 ASSERT(brtvd->bv_totalcount > 0); 787 brtvd->bv_totalcount--; 788 brt_vdev_entcount_dec(brtvd, idx); 789 brtvd->bv_entcount_dirty = TRUE; 790 idx = idx / BRT_BLOCKSIZE / 8; 791 BT_SET(brtvd->bv_bitmap, idx); 792 } 793 794 static void 795 brt_vdev_sync(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) 796 { 797 dmu_buf_t *db; 798 brt_vdev_phys_t *bvphys; 799 800 ASSERT(brtvd->bv_meta_dirty); 801 ASSERT(brtvd->bv_mos_brtvdev != 0); 802 ASSERT(dmu_tx_is_syncing(tx)); 803 804 VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 805 FTAG, &db)); 806 807 if (brtvd->bv_entcount_dirty) { 808 /* 809 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks. 
810 */ 811 uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); 812 dmu_write(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0, 813 nblocks * BRT_BLOCKSIZE, brtvd->bv_entcount, tx, 814 DMU_READ_NO_PREFETCH | DMU_UNCACHEDIO); 815 memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(nblocks)); 816 brtvd->bv_entcount_dirty = FALSE; 817 } 818 819 dmu_buf_will_dirty(db, tx); 820 bvphys = db->db_data; 821 bvphys->bvp_mos_entries = brtvd->bv_mos_entries; 822 bvphys->bvp_size = brtvd->bv_size; 823 if (brtvd->bv_need_byteswap) { 824 bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER; 825 } else { 826 bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER; 827 } 828 bvphys->bvp_totalcount = brtvd->bv_totalcount; 829 bvphys->bvp_rangesize = spa->spa_brt_rangesize; 830 bvphys->bvp_usedspace = brtvd->bv_usedspace; 831 bvphys->bvp_savedspace = brtvd->bv_savedspace; 832 dmu_buf_rele(db, FTAG); 833 834 brtvd->bv_meta_dirty = FALSE; 835 } 836 837 static void 838 brt_vdevs_free(spa_t *spa) 839 { 840 if (spa->spa_brt_vdevs == 0) 841 return; 842 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { 843 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; 844 rw_enter(&brtvd->bv_lock, RW_WRITER); 845 if (brtvd->bv_initiated) 846 brt_vdev_dealloc(brtvd); 847 rw_exit(&brtvd->bv_lock); 848 rw_destroy(&brtvd->bv_lock); 849 if (brtvd->bv_mos_entries != 0) 850 dnode_rele(brtvd->bv_mos_entries_dnode, brtvd); 851 rw_destroy(&brtvd->bv_mos_entries_lock); 852 avl_destroy(&brtvd->bv_tree); 853 for (int i = 0; i < TXG_SIZE; i++) 854 avl_destroy(&brtvd->bv_pending_tree[i]); 855 mutex_destroy(&brtvd->bv_pending_lock); 856 kmem_free(brtvd, sizeof (*brtvd)); 857 } 858 kmem_free(spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) * 859 spa->spa_brt_nvdevs); 860 } 861 862 static void 863 brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp) 864 { 865 866 bre->bre_bp = *bp; 867 bre->bre_count = 0; 868 bre->bre_pcount = 0; 869 870 *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]); 871 } 872 873 static 
boolean_t 874 brt_has_endian_fixed(spa_t *spa) 875 { 876 return (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN)); 877 } 878 879 static int 880 brt_entry_lookup(spa_t *spa, brt_vdev_t *brtvd, brt_entry_t *bre) 881 { 882 uint64_t off = BRE_OFFSET(bre); 883 884 if (brtvd->bv_mos_entries == 0) 885 return (SET_ERROR(ENOENT)); 886 887 if (brt_has_endian_fixed(spa)) { 888 return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode, 889 &off, BRT_KEY_WORDS, sizeof (bre->bre_count), 1, 890 &bre->bre_count)); 891 } else { 892 return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode, 893 &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), 894 &bre->bre_count)); 895 } 896 } 897 898 /* 899 * Return TRUE if we _can_ have BRT entry for this bp. It might be false 900 * positive, but gives us quick answer if we should look into BRT, which 901 * may require reads and thus will be more expensive. 902 */ 903 boolean_t 904 brt_maybe_exists(spa_t *spa, const blkptr_t *bp) 905 { 906 907 if (spa->spa_brt_nvdevs == 0) 908 return (B_FALSE); 909 910 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]); 911 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); 912 if (brtvd == NULL || !brtvd->bv_initiated) 913 return (FALSE); 914 915 /* 916 * We don't need locks here, since bv_entcount pointer must be 917 * stable at this point, and we don't care about false positive 918 * races here, while false negative should be impossible, since 919 * all brt_vdev_addref() have already completed by this point. 
920 */ 921 uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]); 922 return (brt_vdev_lookup(spa, brtvd, off)); 923 } 924 925 uint64_t 926 brt_get_dspace(spa_t *spa) 927 { 928 if (spa->spa_brt_nvdevs == 0) 929 return (0); 930 931 brt_rlock(spa); 932 uint64_t s = 0; 933 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) 934 s += spa->spa_brt_vdevs[vdevid]->bv_savedspace; 935 brt_unlock(spa); 936 return (s); 937 } 938 939 uint64_t 940 brt_get_used(spa_t *spa) 941 { 942 if (spa->spa_brt_nvdevs == 0) 943 return (0); 944 945 brt_rlock(spa); 946 uint64_t s = 0; 947 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) 948 s += spa->spa_brt_vdevs[vdevid]->bv_usedspace; 949 brt_unlock(spa); 950 return (s); 951 } 952 953 uint64_t 954 brt_get_saved(spa_t *spa) 955 { 956 return (brt_get_dspace(spa)); 957 } 958 959 uint64_t 960 brt_get_ratio(spa_t *spa) 961 { 962 uint64_t used = brt_get_used(spa); 963 if (used == 0) 964 return (100); 965 return ((used + brt_get_saved(spa)) * 100 / used); 966 } 967 968 static int 969 brt_kstats_update(kstat_t *ksp, int rw) 970 { 971 brt_stats_t *bs = ksp->ks_data; 972 973 if (rw == KSTAT_WRITE) 974 return (EACCES); 975 976 bs->brt_addref_entry_not_on_disk.value.ui64 = 977 wmsum_value(&brt_sums.brt_addref_entry_not_on_disk); 978 bs->brt_addref_entry_on_disk.value.ui64 = 979 wmsum_value(&brt_sums.brt_addref_entry_on_disk); 980 bs->brt_decref_entry_in_memory.value.ui64 = 981 wmsum_value(&brt_sums.brt_decref_entry_in_memory); 982 bs->brt_decref_entry_loaded_from_disk.value.ui64 = 983 wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk); 984 bs->brt_decref_entry_not_in_memory.value.ui64 = 985 wmsum_value(&brt_sums.brt_decref_entry_not_in_memory); 986 bs->brt_decref_entry_read_lost_race.value.ui64 = 987 wmsum_value(&brt_sums.brt_decref_entry_read_lost_race); 988 bs->brt_decref_entry_still_referenced.value.ui64 = 989 wmsum_value(&brt_sums.brt_decref_entry_still_referenced); 990 bs->brt_decref_free_data_later.value.ui64 = 991 
wmsum_value(&brt_sums.brt_decref_free_data_later); 992 bs->brt_decref_free_data_now.value.ui64 = 993 wmsum_value(&brt_sums.brt_decref_free_data_now); 994 bs->brt_decref_no_entry.value.ui64 = 995 wmsum_value(&brt_sums.brt_decref_no_entry); 996 997 return (0); 998 } 999 1000 static void 1001 brt_stat_init(void) 1002 { 1003 1004 wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0); 1005 wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0); 1006 wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0); 1007 wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0); 1008 wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0); 1009 wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0); 1010 wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0); 1011 wmsum_init(&brt_sums.brt_decref_free_data_later, 0); 1012 wmsum_init(&brt_sums.brt_decref_free_data_now, 0); 1013 wmsum_init(&brt_sums.brt_decref_no_entry, 0); 1014 1015 brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED, 1016 sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 1017 if (brt_ksp != NULL) { 1018 brt_ksp->ks_data = &brt_stats; 1019 brt_ksp->ks_update = brt_kstats_update; 1020 kstat_install(brt_ksp); 1021 } 1022 } 1023 1024 static void 1025 brt_stat_fini(void) 1026 { 1027 if (brt_ksp != NULL) { 1028 kstat_delete(brt_ksp); 1029 brt_ksp = NULL; 1030 } 1031 1032 wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk); 1033 wmsum_fini(&brt_sums.brt_addref_entry_on_disk); 1034 wmsum_fini(&brt_sums.brt_decref_entry_in_memory); 1035 wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk); 1036 wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory); 1037 wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race); 1038 wmsum_fini(&brt_sums.brt_decref_entry_still_referenced); 1039 wmsum_fini(&brt_sums.brt_decref_free_data_later); 1040 wmsum_fini(&brt_sums.brt_decref_free_data_now); 1041 wmsum_fini(&brt_sums.brt_decref_no_entry); 1042 } 1043 1044 void 1045 brt_init(void) 1046 { 1047 brt_entry_cache = 
kmem_cache_create("brt_entry_cache", 1048 sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 1049 1050 brt_stat_init(); 1051 } 1052 1053 void 1054 brt_fini(void) 1055 { 1056 brt_stat_fini(); 1057 1058 kmem_cache_destroy(brt_entry_cache); 1059 } 1060 1061 /* Return TRUE if block should be freed immediately. */ 1062 boolean_t 1063 brt_entry_decref(spa_t *spa, const blkptr_t *bp) 1064 { 1065 brt_entry_t *bre, *racebre; 1066 brt_entry_t bre_search; 1067 avl_index_t where; 1068 uint64_t vdevid; 1069 int error; 1070 1071 brt_entry_fill(bp, &bre_search, &vdevid); 1072 1073 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); 1074 ASSERT(brtvd != NULL); 1075 1076 rw_enter(&brtvd->bv_lock, RW_WRITER); 1077 ASSERT(brtvd->bv_initiated); 1078 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); 1079 if (bre != NULL) { 1080 BRTSTAT_BUMP(brt_decref_entry_in_memory); 1081 goto out; 1082 } else { 1083 BRTSTAT_BUMP(brt_decref_entry_not_in_memory); 1084 } 1085 rw_exit(&brtvd->bv_lock); 1086 1087 error = brt_entry_lookup(spa, brtvd, &bre_search); 1088 /* bre_search now contains correct bre_count */ 1089 if (error == ENOENT) { 1090 BRTSTAT_BUMP(brt_decref_no_entry); 1091 return (B_TRUE); 1092 } 1093 ASSERT0(error); 1094 1095 rw_enter(&brtvd->bv_lock, RW_WRITER); 1096 racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); 1097 if (racebre != NULL) { 1098 /* The entry was added when the lock was dropped. 
*/ 1099 BRTSTAT_BUMP(brt_decref_entry_read_lost_race); 1100 bre = racebre; 1101 goto out; 1102 } 1103 1104 BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk); 1105 bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP); 1106 bre->bre_bp = bre_search.bre_bp; 1107 bre->bre_count = bre_search.bre_count; 1108 bre->bre_pcount = 0; 1109 avl_insert(&brtvd->bv_tree, bre, where); 1110 1111 out: 1112 if (bre->bre_count == 0) { 1113 rw_exit(&brtvd->bv_lock); 1114 BRTSTAT_BUMP(brt_decref_free_data_now); 1115 return (B_TRUE); 1116 } 1117 1118 bre->bre_pcount--; 1119 ASSERT(bre->bre_count > 0); 1120 bre->bre_count--; 1121 if (bre->bre_count == 0) 1122 BRTSTAT_BUMP(brt_decref_free_data_later); 1123 else 1124 BRTSTAT_BUMP(brt_decref_entry_still_referenced); 1125 brt_vdev_decref(spa, brtvd, bre, bp_get_dsize_sync(spa, bp)); 1126 1127 rw_exit(&brtvd->bv_lock); 1128 1129 return (B_FALSE); 1130 } 1131 1132 uint64_t 1133 brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp) 1134 { 1135 brt_entry_t bre_search, *bre; 1136 uint64_t vdevid, refcnt; 1137 int error; 1138 1139 brt_entry_fill(bp, &bre_search, &vdevid); 1140 1141 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); 1142 ASSERT(brtvd != NULL); 1143 1144 rw_enter(&brtvd->bv_lock, RW_READER); 1145 ASSERT(brtvd->bv_initiated); 1146 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); 1147 if (bre == NULL) { 1148 rw_exit(&brtvd->bv_lock); 1149 error = brt_entry_lookup(spa, brtvd, &bre_search); 1150 if (error == ENOENT) { 1151 refcnt = 0; 1152 } else { 1153 ASSERT0(error); 1154 refcnt = bre_search.bre_count; 1155 } 1156 } else { 1157 refcnt = bre->bre_count; 1158 rw_exit(&brtvd->bv_lock); 1159 } 1160 1161 return (refcnt); 1162 } 1163 1164 static void 1165 brt_prefetch(brt_vdev_t *brtvd, const blkptr_t *bp) 1166 { 1167 if (!brt_zap_prefetch || brtvd->bv_mos_entries == 0) 1168 return; 1169 1170 uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]); 1171 rw_enter(&brtvd->bv_mos_entries_lock, RW_READER); 1172 if (brtvd->bv_mos_entries != 0) { 1173 
(void) zap_prefetch_uint64_by_dnode(brtvd->bv_mos_entries_dnode, 1174 &off, BRT_KEY_WORDS); 1175 } 1176 rw_exit(&brtvd->bv_mos_entries_lock); 1177 } 1178 1179 static int 1180 brt_entry_compare(const void *x1, const void *x2) 1181 { 1182 const brt_entry_t *bre1 = x1, *bre2 = x2; 1183 const blkptr_t *bp1 = &bre1->bre_bp, *bp2 = &bre2->bre_bp; 1184 1185 return (TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), 1186 DVA_GET_OFFSET(&bp2->blk_dva[0]))); 1187 } 1188 1189 void 1190 brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) 1191 { 1192 brt_entry_t *bre, *newbre; 1193 avl_index_t where; 1194 uint64_t txg; 1195 1196 txg = dmu_tx_get_txg(tx); 1197 ASSERT3U(txg, !=, 0); 1198 1199 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]); 1200 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_TRUE); 1201 avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK]; 1202 1203 newbre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP); 1204 newbre->bre_bp = *bp; 1205 newbre->bre_count = 0; 1206 newbre->bre_pcount = 1; 1207 1208 mutex_enter(&brtvd->bv_pending_lock); 1209 bre = avl_find(pending_tree, newbre, &where); 1210 if (bre == NULL) { 1211 avl_insert(pending_tree, newbre, where); 1212 newbre = NULL; 1213 } else { 1214 bre->bre_pcount++; 1215 } 1216 mutex_exit(&brtvd->bv_pending_lock); 1217 1218 if (newbre != NULL) { 1219 ASSERT(bre != NULL); 1220 ASSERT(bre != newbre); 1221 kmem_cache_free(brt_entry_cache, newbre); 1222 } else { 1223 ASSERT0P(bre); 1224 1225 /* Prefetch BRT entry for the syncing context. 
*/ 1226 brt_prefetch(brtvd, bp); 1227 } 1228 } 1229 1230 void 1231 brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) 1232 { 1233 brt_entry_t *bre, bre_search; 1234 uint64_t txg; 1235 1236 txg = dmu_tx_get_txg(tx); 1237 ASSERT3U(txg, !=, 0); 1238 1239 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]); 1240 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); 1241 ASSERT(brtvd != NULL); 1242 avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK]; 1243 1244 bre_search.bre_bp = *bp; 1245 1246 mutex_enter(&brtvd->bv_pending_lock); 1247 bre = avl_find(pending_tree, &bre_search, NULL); 1248 ASSERT(bre != NULL); 1249 ASSERT(bre->bre_pcount > 0); 1250 bre->bre_pcount--; 1251 if (bre->bre_pcount == 0) 1252 avl_remove(pending_tree, bre); 1253 else 1254 bre = NULL; 1255 mutex_exit(&brtvd->bv_pending_lock); 1256 1257 if (bre) 1258 kmem_cache_free(brt_entry_cache, bre); 1259 } 1260 1261 static void 1262 brt_pending_apply_vdev(spa_t *spa, brt_vdev_t *brtvd, uint64_t txg) 1263 { 1264 brt_entry_t *bre, *nbre; 1265 1266 /* 1267 * We are in syncing context, so no other bv_pending_tree accesses 1268 * are possible for the TXG. So we don't need bv_pending_lock. 1269 */ 1270 ASSERT(avl_is_empty(&brtvd->bv_tree)); 1271 avl_swap(&brtvd->bv_tree, &brtvd->bv_pending_tree[txg & TXG_MASK]); 1272 1273 for (bre = avl_first(&brtvd->bv_tree); bre; bre = nbre) { 1274 nbre = AVL_NEXT(&brtvd->bv_tree, bre); 1275 1276 /* 1277 * If the block has DEDUP bit set, it means that it 1278 * already exists in the DEDUP table, so we can just 1279 * use that instead of creating new entry in the BRT. 
1280 */ 1281 if (BP_GET_DEDUP(&bre->bre_bp)) { 1282 while (bre->bre_pcount > 0) { 1283 if (!ddt_addref(spa, &bre->bre_bp)) 1284 break; 1285 bre->bre_pcount--; 1286 } 1287 if (bre->bre_pcount == 0) { 1288 avl_remove(&brtvd->bv_tree, bre); 1289 kmem_cache_free(brt_entry_cache, bre); 1290 continue; 1291 } 1292 } 1293 1294 /* 1295 * Unless we know that the block is definitely not in ZAP, 1296 * try to get its reference count from there. 1297 */ 1298 uint64_t off = BRE_OFFSET(bre); 1299 if (brtvd->bv_mos_entries != 0 && 1300 brt_vdev_lookup(spa, brtvd, off)) { 1301 int error; 1302 if (brt_has_endian_fixed(spa)) { 1303 error = zap_lookup_uint64_by_dnode( 1304 brtvd->bv_mos_entries_dnode, &off, 1305 BRT_KEY_WORDS, sizeof (bre->bre_count), 1, 1306 &bre->bre_count); 1307 } else { 1308 error = zap_lookup_uint64_by_dnode( 1309 brtvd->bv_mos_entries_dnode, &off, 1310 BRT_KEY_WORDS, 1, sizeof (bre->bre_count), 1311 &bre->bre_count); 1312 } 1313 if (error == 0) { 1314 BRTSTAT_BUMP(brt_addref_entry_on_disk); 1315 } else { 1316 ASSERT3U(error, ==, ENOENT); 1317 BRTSTAT_BUMP(brt_addref_entry_not_on_disk); 1318 } 1319 } 1320 } 1321 1322 /* 1323 * If all the cloned blocks we had were handled by DDT, we don't need 1324 * to initiate the vdev. 1325 */ 1326 if (avl_is_empty(&brtvd->bv_tree)) 1327 return; 1328 1329 if (!brtvd->bv_initiated) { 1330 rw_enter(&brtvd->bv_lock, RW_WRITER); 1331 brt_vdev_realloc(spa, brtvd); 1332 rw_exit(&brtvd->bv_lock); 1333 } 1334 1335 /* 1336 * Convert pending references into proper ones. This has to be a 1337 * separate loop, since entcount modifications would cause false 1338 * positives for brt_vdev_lookup() on following iterations. 
1339 */ 1340 for (bre = avl_first(&brtvd->bv_tree); bre; 1341 bre = AVL_NEXT(&brtvd->bv_tree, bre)) { 1342 brt_vdev_addref(spa, brtvd, bre, 1343 bp_get_dsize(spa, &bre->bre_bp), bre->bre_pcount); 1344 bre->bre_count += bre->bre_pcount; 1345 } 1346 } 1347 1348 void 1349 brt_pending_apply(spa_t *spa, uint64_t txg) 1350 { 1351 1352 brt_rlock(spa); 1353 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { 1354 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; 1355 brt_unlock(spa); 1356 1357 brt_pending_apply_vdev(spa, brtvd, txg); 1358 1359 brt_rlock(spa); 1360 } 1361 brt_unlock(spa); 1362 } 1363 1364 static void 1365 brt_sync_entry(spa_t *spa, dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx) 1366 { 1367 uint64_t off = BRE_OFFSET(bre); 1368 1369 if (bre->bre_pcount == 0) { 1370 /* The net change is zero, nothing to do in ZAP. */ 1371 } else if (bre->bre_count == 0) { 1372 int error = zap_remove_uint64_by_dnode(dn, &off, 1373 BRT_KEY_WORDS, tx); 1374 VERIFY(error == 0 || error == ENOENT); 1375 } else { 1376 if (brt_has_endian_fixed(spa)) { 1377 VERIFY0(zap_update_uint64_by_dnode(dn, &off, 1378 BRT_KEY_WORDS, sizeof (bre->bre_count), 1, 1379 &bre->bre_count, tx)); 1380 } else { 1381 VERIFY0(zap_update_uint64_by_dnode(dn, &off, 1382 BRT_KEY_WORDS, 1, sizeof (bre->bre_count), 1383 &bre->bre_count, tx)); 1384 } 1385 } 1386 } 1387 1388 static void 1389 brt_sync_table(spa_t *spa, dmu_tx_t *tx) 1390 { 1391 brt_entry_t *bre; 1392 1393 brt_rlock(spa); 1394 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { 1395 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; 1396 brt_unlock(spa); 1397 1398 if (!brtvd->bv_meta_dirty) { 1399 ASSERT(!brtvd->bv_entcount_dirty); 1400 ASSERT0(avl_numnodes(&brtvd->bv_tree)); 1401 brt_rlock(spa); 1402 continue; 1403 } 1404 1405 ASSERT(!brtvd->bv_entcount_dirty || 1406 avl_numnodes(&brtvd->bv_tree) != 0); 1407 1408 if (brtvd->bv_mos_brtvdev == 0) 1409 brt_vdev_create(spa, brtvd, tx); 1410 1411 void *c = NULL; 1412 while ((bre = 
avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) { 1413 brt_sync_entry(spa, brtvd->bv_mos_entries_dnode, bre, 1414 tx); 1415 kmem_cache_free(brt_entry_cache, bre); 1416 } 1417 1418 #ifdef ZFS_DEBUG 1419 if (zfs_flags & ZFS_DEBUG_BRT) 1420 brt_vdev_dump(brtvd); 1421 #endif 1422 if (brtvd->bv_totalcount == 0) 1423 brt_vdev_destroy(spa, brtvd, tx); 1424 else 1425 brt_vdev_sync(spa, brtvd, tx); 1426 brt_rlock(spa); 1427 } 1428 brt_unlock(spa); 1429 } 1430 1431 void 1432 brt_sync(spa_t *spa, uint64_t txg) 1433 { 1434 dmu_tx_t *tx; 1435 uint64_t vdevid; 1436 1437 ASSERT3U(spa_syncing_txg(spa), ==, txg); 1438 1439 brt_rlock(spa); 1440 for (vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { 1441 if (spa->spa_brt_vdevs[vdevid]->bv_meta_dirty) 1442 break; 1443 } 1444 if (vdevid >= spa->spa_brt_nvdevs) { 1445 brt_unlock(spa); 1446 return; 1447 } 1448 brt_unlock(spa); 1449 1450 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1451 brt_sync_table(spa, tx); 1452 dmu_tx_commit(tx); 1453 } 1454 1455 static void 1456 brt_alloc(spa_t *spa) 1457 { 1458 rw_init(&spa->spa_brt_lock, NULL, RW_DEFAULT, NULL); 1459 spa->spa_brt_vdevs = NULL; 1460 spa->spa_brt_nvdevs = 0; 1461 spa->spa_brt_rangesize = 0; 1462 } 1463 1464 void 1465 brt_create(spa_t *spa) 1466 { 1467 brt_alloc(spa); 1468 spa->spa_brt_rangesize = BRT_RANGESIZE; 1469 } 1470 1471 int 1472 brt_load(spa_t *spa) 1473 { 1474 int error = 0; 1475 1476 brt_alloc(spa); 1477 brt_wlock(spa); 1478 for (uint64_t vdevid = 0; vdevid < spa->spa_root_vdev->vdev_children; 1479 vdevid++) { 1480 char name[64]; 1481 uint64_t mos_brtvdev; 1482 1483 /* Look if this vdev had active block cloning. 
*/ 1484 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, 1485 (u_longlong_t)vdevid); 1486 error = zap_lookup(spa->spa_meta_objset, 1487 DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, 1488 &mos_brtvdev); 1489 if (error == ENOENT) { 1490 error = 0; 1491 continue; 1492 } 1493 if (error != 0) 1494 break; 1495 1496 /* If it did, then allocate them all and load this one. */ 1497 brt_vdevs_expand(spa, spa->spa_root_vdev->vdev_children); 1498 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; 1499 rw_enter(&brtvd->bv_lock, RW_WRITER); 1500 brtvd->bv_mos_brtvdev = mos_brtvdev; 1501 error = brt_vdev_load(spa, brtvd); 1502 rw_exit(&brtvd->bv_lock); 1503 if (error != 0) 1504 break; 1505 } 1506 1507 if (spa->spa_brt_rangesize == 0) 1508 spa->spa_brt_rangesize = BRT_RANGESIZE; 1509 brt_unlock(spa); 1510 return (error); 1511 } 1512 1513 void 1514 brt_prefetch_all(spa_t *spa) 1515 { 1516 /* 1517 * Load all BRT entries for each vdev. This is intended to perform 1518 * a prefetch on all such blocks. For the same reason that brt_prefetch 1519 * (called from brt_pending_add) isn't locked, this is also not locked. 
1520 */ 1521 brt_rlock(spa); 1522 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { 1523 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; 1524 brt_unlock(spa); 1525 1526 rw_enter(&brtvd->bv_mos_entries_lock, RW_READER); 1527 if (brtvd->bv_mos_entries != 0) { 1528 (void) zap_prefetch_object(spa->spa_meta_objset, 1529 brtvd->bv_mos_entries); 1530 } 1531 rw_exit(&brtvd->bv_mos_entries_lock); 1532 1533 brt_rlock(spa); 1534 } 1535 brt_unlock(spa); 1536 } 1537 1538 void 1539 brt_unload(spa_t *spa) 1540 { 1541 if (spa->spa_brt_rangesize == 0) 1542 return; 1543 brt_vdevs_free(spa); 1544 rw_destroy(&spa->spa_brt_lock); 1545 spa->spa_brt_rangesize = 0; 1546 } 1547 1548 ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW, 1549 "Enable prefetching of BRT ZAP entries"); 1550 ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW, 1551 "BRT ZAP leaf blockshift"); 1552 ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW, 1553 "BRT ZAP indirect blockshift"); 1554