/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/brt.h>
#include <sys/brt_impl.h>
#include <sys/ddt.h>
#include <sys/bitmap.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/vdev_impl.h>
#include <sys/kstat.h>
#include <sys/wmsum.h>

/*
 * Block Cloning design.
 *
 * Block Cloning allows one to manually clone a file (or a subset of its
 * blocks) into another (or the same) file by just creating additional
 * references to the data blocks without copying the data itself.
 * Those references are kept in the Block Reference Tables (BRTs).
 *
 * In many ways this is similar to the existing deduplication, but there are
 * some important differences:
 *
 * - Deduplication is automatic and Block Cloning is not - one has to use
 *   dedicated system call(s) to clone the given file/blocks.
 * - Deduplication keeps all data blocks in its table, even those referenced
 *   just once. Block Cloning creates an entry in its tables only when there
 *   are at least two references to the given data block. If the block was
 *   never explicitly cloned, or the second to last reference was dropped,
 *   there will be neither space nor performance overhead.
 * - Deduplication needs data to work - one needs to pass real data to the
 *   write(2) syscall, so the hash can be calculated. Block Cloning doesn't
 *   require data, just block pointers to the data, so it is extremely fast,
 *   as we pay neither the cost of reading the data nor the cost of writing
 *   the data - we operate exclusively on metadata.
 * - If the D (dedup) bit is not set in the block pointer, it means that
 *   the block is not in the dedup table (DDT) and we won't consult the DDT
 *   when we need to free the block. Block Cloning must be consulted on every
 *   free, because we cannot modify the source BP (e.g. by setting something
 *   similar to the D bit), thus we have no hint whether the block is in the
 *   Block Reference Table (BRT), so we need to look into the BRT. There is
 *   an optimization in place that allows us to eliminate the majority of BRT
 *   lookups, described below in the "Minimizing free penalty" section.
 * - The BRT entry is much smaller than the DDT entry - for BRT we only store
 *   a 64bit offset and a 64bit reference counter.
 * - Dedup keys are cryptographic hashes, so two blocks that are close to each
 *   other on disk are most likely in totally different parts of the DDT.
 *   The BRT entry keys are offsets into a single top-level VDEV, so data
 *   blocks from one file should have BRT entries close to each other.
 * - Scrub will only do a single pass over a block that is referenced multiple
 *   times in the DDT. Unfortunately this is not currently (if at all)
 *   possible with Block Cloning, so a block referenced multiple times will be
 *   scrubbed multiple times. The new, sorted scrub should be able to
 *   eliminate duplicated reads given enough memory.
 * - Deduplication requires a cryptographically strong hash as a checksum or
 *   additional data verification. Block Cloning works with any checksum
 *   algorithm or even with checksumming disabled.
 *
 * As mentioned above, the BRT entries are much smaller than the DDT entries.
 * To uniquely identify a block we just need its vdev id and offset. We also
 * need to maintain a reference counter. The vdev id will often repeat, as
 * there is a small number of top-level VDEVs and a large number of blocks
 * stored in each VDEV. We take advantage of that to reduce the BRT entry
 * size further by maintaining one BRT for each top-level VDEV, so we can
 * then have only the offset and counter as the BRT entry.
 *
 * Minimizing free penalty.
 *
 * Block Cloning allows creating additional references to any existing block.
 * When we free a block there is no hint in the block pointer whether the
 * block was cloned or not, so on each free we have to check if there is a
 * corresponding entry in the BRT or not. If there is, we need to decrease
 * the reference counter. Doing a BRT lookup on every free can potentially be
 * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
 * This is the main problem with deduplication, so we've learned our lesson
 * and try not to repeat the same mistake here. How do we do that? We divide
 * each top-level VDEV into 16MB regions. For each region we maintain a
 * counter that is a sum of all the BRT entries that have offsets within the
 * region. This creates the entry counters array of 16bit numbers for each
 * top-level VDEV. The entry counters array is always kept in memory and
 * updated on disk in the same transaction group as the BRT updates to keep
 * everything in sync. We can keep the array in memory, because it is very
 * small. With 16MB regions and a 1TB VDEV the array requires only 128kB of
 * memory (we may decide to decrease the region size even further in the
 * future). Now, when we want to free a block, we first consult the array.
 * If the counter for the whole region is zero, there is no need to look for
 * the BRT entry, as there isn't one for sure. If the counter for the region
 * is greater than zero, only then will we do a BRT lookup, and if an entry
 * is found we will decrease the reference counter in the BRT entry and in
 * the entry counters array.
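 *
 * For illustration, the fast path on free then reduces to an in-memory
 * array check before any on-disk lookup. A simplified sketch (the real
 * logic lives in brt_vdev_lookup() below):
 *
 *	idx = bre_offset / brt_rangesize;
 *	if (brt_vdev_entcount_get(brtvd, idx) == 0)
 *		return (FALSE);	(no BRT entry for sure, free as usual)
 *	(only now pay for the BRT ZAP lookup)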
 *
 * The entry counters array is small, but can potentially be larger for very
 * large VDEVs or smaller regions. In this case we don't want to rewrite the
 * entire array on every change. We then divide the array into 32kB blocks
 * and keep a bitmap of dirty blocks within a transaction group. When we
 * sync the transaction group we only update the parts of the entry counters
 * array that were modified. Note: Keeping track of the dirty parts of the
 * entry counters array is implemented, but updating only parts of the array
 * on disk is not yet implemented - for now we will update the entire array
 * if there was any change.
 *
 * The implementation tries to be economical: if the BRT is not used, or no
 * longer used, there will be no entries in the MOS and no additional memory
 * used (e.g. the entry counters array is only allocated if needed).
 *
 * Interaction between Deduplication and Block Cloning.
 *
 * If both functionalities are in use, we could end up with a block that is
 * referenced multiple times in both DDT and BRT. When we free one of the
 * references we couldn't tell where it belongs, so we would have to decide
 * which table takes precedence: do we first clear DDT references or BRT
 * references? To avoid this dilemma BRT cooperates with DDT - if a given
 * block is being cloned using BRT and the BP has the D (dedup) bit set, BRT
 * will look up the DDT entry instead and increase the counter there. No BRT
 * entry will be created for a block which has the D (dedup) bit set.
 * BRT may be more efficient for manual deduplication, but if the block is
 * already in the DDT, then creating an additional BRT entry would be less
 * efficient. This clever idea was proposed by Allan Jude.
 *
 * Block Cloning across datasets.
 *
 * Block Cloning is not limited to cloning blocks within the same dataset.
 * It is possible (and very useful) to clone blocks between different
 * datasets. One use case is recovering files from snapshots. By cloning the
 * files into the dataset we need no additional storage. Without Block
 * Cloning we would need additional space for those files.
 * Another interesting use case is moving files between datasets (copying
 * the file content to the new dataset and removing the source file). In
 * that case Block Cloning will only be used briefly, because the BRT
 * entries will be removed when the source is removed.
 * Block Cloning across encrypted datasets is supported as long as both
 * datasets share the same master key (e.g. snapshots and clones).
 *
 * Block Cloning flow through ZFS layers.
 *
 * Note: Block Cloning can be used both for cloning file system blocks and
 * ZVOL blocks. As of this writing no interface is implemented that allows
 * for block cloning within a ZVOL.
 * FreeBSD and Linux provide the copy_file_range(2) system call and we will
 * use it for block cloning.
 *
 *	ssize_t
 *	copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
 *	    size_t len, unsigned int flags);
 *
 * Even though offsets and length represent bytes, they have to be
 * block-aligned or we will return an error so the upper layer can
 * fall back to the generic mechanism that will just copy the data.
 * Using copy_file_range(2) will call the OS-independent zfs_clone_range()
 * function. This function was implemented based on zfs_write(), but instead
 * of writing the given data we first read block pointers using the new
 * dmu_read_l0_bps() function from the source file. Once we have BPs from
 * the source file we call the dmu_brt_clone() function on the destination
 * file. This function allocates BPs for us. We iterate over all source BPs.
 * If the given BP is a hole or an embedded block, we just copy the BP
 * as-is. If it points to real data, we place this BP on a BRT pending list
 * using the brt_pending_add() function.
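 *
 * A rough sketch of that call chain, with signatures abbreviated (see
 * zfs_clone_range(), dmu_read_l0_bps() and dmu_brt_clone() for the real
 * code):
 *
 *	error = dmu_read_l0_bps(src_os, src_obj, off, len, bps, &nbps);
 *	if (error == 0)
 *		error = dmu_brt_clone(dst_os, dst_obj, off, len, tx,
 *		    bps, nbps);	(calls brt_pending_add() for each BP
 *				that points to real data)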
 *
 * We use this pending list to keep track of all BPs that got new references
 * within this transaction group.
 *
 * Some special cases to consider and how we address them:
 * - The block we want to clone may have been created within the same
 *   transaction group that we are trying to clone. Such a block has no BP
 *   allocated yet, so it cannot be immediately cloned. We return EAGAIN.
 * - The block we want to clone may have been modified within the same
 *   transaction group. We return EAGAIN.
 * - A block may be cloned multiple times during one transaction group
 *   (that's why the pending list is actually a tree and not an append-only
 *   list - this way we can figure out faster whether this block is cloned
 *   for the first time in this txg or a consecutive time).
 * - A block may be cloned and freed within the same transaction group
 *   (see dbuf_undirty()).
 * - A block may be cloned and within the same transaction group the clone
 *   can be cloned again (see dmu_read_l0_bps()).
 * - A file might have been deleted, but the caller still has a file
 *   descriptor open to this file and clones it.
 *
 * When we free a block we have an additional step in the ZIO pipeline where
 * we call the zio_brt_free() function. We then call brt_entry_decref(),
 * which loads the corresponding BRT entry (if one exists) and decreases its
 * reference counter. If this is not the last reference we will stop the ZIO
 * pipeline here. If this is the last reference, or the block is not in the
 * BRT, we continue the pipeline and free the block as usual.
 *
 * At the beginning of spa_sync(), when there can be no more block cloning
 * but before issuing frees, we call brt_pending_apply(). This function
 * applies all the new clones to the BRT table - we load BRT entries and
 * update reference counters. To sync new BRT entries to disk, we use the
 * brt_sync() function. This function will sync all dirty per-top-level-vdev
 * BRTs, the entry counters arrays, etc.
 *
 * Block Cloning and ZIL.
 *
 * Every clone operation is divided into chunks (similar to write) and each
 * chunk is cloned in a separate transaction. The chunk size is determined by
 * how many BPs we can fit into a single ZIL entry.
 * Replaying a clone operation is different from a regular clone operation,
 * as when we log clone operations we cannot use the source object - it may
 * reside on a different dataset, so we log the BPs we want to clone.
 * The ZIL is replayed when we mount the given dataset, not when the pool is
 * imported. Taking this into account it is possible that the pool is
 * imported without mounting datasets and the source dataset is destroyed
 * before the destination dataset is mounted and its ZIL replayed.
 * To address this situation we leverage the zil_claim() mechanism where ZFS
 * will parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
 * entries, we will bump reference counters for their BPs in the BRT. Then
 * on mount and ZIL replay we bump the reference counters once more, while
 * the first references are dropped during ZIL destroy by
 * zil_free_clone_range(). It is possible that after zil_claim() we never
 * mount the destination, so we never replay its ZIL and just destroy it.
 * In this case the only taken references will be dropped by
 * zil_free_clone_range(), since the cloning is not going to ever take place.
 */
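
/*
 * For illustration, an assumed timeline for a single logged TX_CLONE_RANGE
 * record (a sketch of the design described above, not additional mechanism):
 *
 *  1. Pool import: zil_claim() parses the ZIL and bumps the BRT reference
 *     counters for the record's BPs.
 *  2a. The destination dataset is mounted: ZIL replay performs the clone,
 *      bumping the counters once more; zil_free_clone_range() later drops
 *      the claim-time references when the ZIL is destroyed.
 *  2b. The destination is never mounted: the ZIL is destroyed without
 *      replay and zil_free_clone_range() drops the only taken references.
 */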

static kmem_cache_t *brt_entry_cache;
static kmem_cache_t *brt_pending_entry_cache;

/*
 * Enable/disable prefetching of BRT entries that we are going to modify.
 */
static int brt_zap_prefetch = 1;

#ifdef ZFS_DEBUG
#define	BRT_DEBUG(...)	do { \
	if ((zfs_flags & ZFS_DEBUG_BRT) != 0) { \
		__dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
	} \
} while (0)
#else
#define	BRT_DEBUG(...)	do { } while (0)
#endif

static int brt_zap_default_bs = 12;
static int brt_zap_default_ibs = 12;

static kstat_t *brt_ksp;

typedef struct brt_stats {
	kstat_named_t brt_addref_entry_in_memory;
	kstat_named_t brt_addref_entry_not_on_disk;
	kstat_named_t brt_addref_entry_on_disk;
	kstat_named_t brt_addref_entry_read_lost_race;
	kstat_named_t brt_decref_entry_in_memory;
	kstat_named_t brt_decref_entry_loaded_from_disk;
	kstat_named_t brt_decref_entry_not_in_memory;
	kstat_named_t brt_decref_entry_not_on_disk;
	kstat_named_t brt_decref_entry_read_lost_race;
	kstat_named_t brt_decref_entry_still_referenced;
	kstat_named_t brt_decref_free_data_later;
	kstat_named_t brt_decref_free_data_now;
	kstat_named_t brt_decref_no_entry;
} brt_stats_t;

static brt_stats_t brt_stats = {
	{ "addref_entry_in_memory",		KSTAT_DATA_UINT64 },
	{ "addref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
	{ "addref_entry_on_disk",		KSTAT_DATA_UINT64 },
	{ "addref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
	{ "decref_entry_in_memory",		KSTAT_DATA_UINT64 },
	{ "decref_entry_loaded_from_disk",	KSTAT_DATA_UINT64 },
	{ "decref_entry_not_in_memory",		KSTAT_DATA_UINT64 },
	{ "decref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
	{ "decref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
	{ "decref_entry_still_referenced",	KSTAT_DATA_UINT64 },
	{ "decref_free_data_later",		KSTAT_DATA_UINT64 },
	{ "decref_free_data_now",		KSTAT_DATA_UINT64 },
	{ "decref_no_entry",			KSTAT_DATA_UINT64 }
};

struct {
	wmsum_t brt_addref_entry_in_memory;
	wmsum_t brt_addref_entry_not_on_disk;
	wmsum_t brt_addref_entry_on_disk;
	wmsum_t brt_addref_entry_read_lost_race;
	wmsum_t brt_decref_entry_in_memory;
	wmsum_t brt_decref_entry_loaded_from_disk;
	wmsum_t brt_decref_entry_not_in_memory;
	wmsum_t brt_decref_entry_not_on_disk;
	wmsum_t brt_decref_entry_read_lost_race;
	wmsum_t brt_decref_entry_still_referenced;
	wmsum_t brt_decref_free_data_later;
	wmsum_t brt_decref_free_data_now;
	wmsum_t brt_decref_no_entry;
} brt_sums;

#define	BRTSTAT_BUMP(stat)	wmsum_add(&brt_sums.stat, 1)

static int brt_entry_compare(const void *x1, const void *x2);
static int brt_pending_entry_compare(const void *x1, const void *x2);

static void
brt_rlock(brt_t *brt)
{
	rw_enter(&brt->brt_lock, RW_READER);
}

static void
brt_wlock(brt_t *brt)
{
	rw_enter(&brt->brt_lock, RW_WRITER);
}

static void
brt_unlock(brt_t *brt)
{
	rw_exit(&brt->brt_lock);
}

static uint16_t
brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
{

	ASSERT3U(idx, <, brtvd->bv_size);

	if (unlikely(brtvd->bv_need_byteswap)) {
		return (BSWAP_16(brtvd->bv_entcount[idx]));
	} else {
		return (brtvd->bv_entcount[idx]);
	}
}
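
/*
 * Note that bv_entcount[] is kept in the byte order it had on disk; if the
 * pool was created on a different-endian host, bv_need_byteswap is set (see
 * brt_vdev_load()) and entries are byteswapped on every access, here and in
 * brt_vdev_entcount_get() above, instead of being converted up front.
 */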

static void
brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
{

	ASSERT3U(idx, <, brtvd->bv_size);

	if (unlikely(brtvd->bv_need_byteswap)) {
		brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
	} else {
		brtvd->bv_entcount[idx] = entcnt;
	}
}

static void
brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
{
	uint16_t entcnt;

	ASSERT3U(idx, <, brtvd->bv_size);

	entcnt = brt_vdev_entcount_get(brtvd, idx);
	ASSERT(entcnt < UINT16_MAX);

	brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
}

static void
brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
{
	uint16_t entcnt;

	ASSERT3U(idx, <, brtvd->bv_size);

	entcnt = brt_vdev_entcount_get(brtvd, idx);
	ASSERT(entcnt > 0);

	brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
}

#ifdef ZFS_DEBUG
static void
brt_vdev_dump(brt_vdev_t *brtvd)
{
	uint64_t idx;

	zfs_dbgmsg("  BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d "
	    "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
	    (u_longlong_t)brtvd->bv_vdevid,
	    brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
	    (u_longlong_t)brtvd->bv_size,
	    (u_longlong_t)brtvd->bv_totalcount,
	    (u_longlong_t)brtvd->bv_nblocks,
	    (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
	if (brtvd->bv_totalcount > 0) {
		zfs_dbgmsg("    entcounts:");
		for (idx = 0; idx < brtvd->bv_size; idx++) {
			uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx);
			if (entcnt > 0) {
				zfs_dbgmsg("      [%04llu] %hu",
				    (u_longlong_t)idx, entcnt);
			}
		}
	}
	if (brtvd->bv_entcount_dirty) {
		char *bitmap;

		bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
		for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
			bitmap[idx] =
			    BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
		}
		bitmap[idx] = '\0';
		zfs_dbgmsg("    dirty: %s", bitmap);
		kmem_free(bitmap, brtvd->bv_nblocks + 1);
	}
}
#endif

static brt_vdev_t *
brt_vdev(brt_t *brt, uint64_t vdevid)
{
	brt_vdev_t *brtvd;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));

	if (vdevid < brt->brt_nvdevs) {
		brtvd = &brt->brt_vdevs[vdevid];
	} else {
		brtvd = NULL;
	}

	return (brtvd);
}
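
/*
 * Create the MOS objects backing this vdev's BRT: a ZAP with uint64 keys
 * for the entries, and a uint64 array object (with a brt_vdev_phys_t bonus
 * buffer) for the bv_entcount[] array; this also bumps the refcount on
 * SPA_FEATURE_BLOCK_CLONING.
 */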
static void
brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
	char name[64];

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT0(brtvd->bv_mos_brtvdev);
	ASSERT0(brtvd->bv_mos_entries);
	ASSERT(brtvd->bv_entcount != NULL);
	ASSERT(brtvd->bv_size > 0);
	ASSERT(brtvd->bv_bitmap != NULL);
	ASSERT(brtvd->bv_nblocks > 0);

	brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0,
	    ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
	    brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx);
	VERIFY(brtvd->bv_mos_entries != 0);
	BRT_DEBUG("MOS entries created, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_entries);

	/*
	 * We allocate a DMU buffer to store the bv_entcount[] array.
	 * We will keep the array size (bv_size) and the cumulative count
	 * for all bv_entcount[]s (bv_totalcount) in the bonus buffer.
	 */
	brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos,
	    DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
	    DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
	VERIFY(brtvd->bv_mos_brtvdev != 0);
	BRT_DEBUG("MOS BRT VDEV created, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_brtvdev);

	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
	    (u_longlong_t)brtvd->bv_vdevid);
	VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
	BRT_DEBUG("Pool directory object created, object=%s", name);

	spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
}
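
/*
 * Allocate or grow the in-memory state for this vdev: the bv_entcount[]
 * array sized to the vdev's current asize, the dirty-block bitmap, and (on
 * first use) the AVL tree of in-memory BRT entries. Called with the BRT
 * write lock held.
 */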
static void
brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd)
{
	vdev_t *vd;
	uint16_t *entcount;
	ulong_t *bitmap;
	uint64_t nblocks, size;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));

	spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid);
	size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1;
	spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG);

	entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
	nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
	bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);

	if (!brtvd->bv_initiated) {
		ASSERT0(brtvd->bv_size);
		ASSERT(brtvd->bv_entcount == NULL);
		ASSERT(brtvd->bv_bitmap == NULL);
		ASSERT0(brtvd->bv_nblocks);

		avl_create(&brtvd->bv_tree, brt_entry_compare,
		    sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
	} else {
		ASSERT(brtvd->bv_size > 0);
		ASSERT(brtvd->bv_entcount != NULL);
		ASSERT(brtvd->bv_bitmap != NULL);
		ASSERT(brtvd->bv_nblocks > 0);
		/*
		 * TODO: Allow vdev shrinking. We only need to implement
		 * shrinking the on-disk BRT VDEV object.
		 * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev,
		 * offset, size, tx);
		 */
		ASSERT3U(brtvd->bv_size, <=, size);

		memcpy(entcount, brtvd->bv_entcount,
		    sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
		memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
		    BT_SIZEOFMAP(brtvd->bv_nblocks)));
		vmem_free(brtvd->bv_entcount,
		    sizeof (entcount[0]) * brtvd->bv_size);
		kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
	}

	brtvd->bv_size = size;
	brtvd->bv_entcount = entcount;
	brtvd->bv_bitmap = bitmap;
	brtvd->bv_nblocks = nblocks;
	if (!brtvd->bv_initiated) {
		brtvd->bv_need_byteswap = FALSE;
		brtvd->bv_initiated = TRUE;
		BRT_DEBUG("BRT VDEV %llu initiated.",
		    (u_longlong_t)brtvd->bv_vdevid);
	}
}
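
/*
 * Load this vdev's BRT from the MOS, if it exists: find the per-vdev
 * object through the pool directory, read in bv_entcount[] and restore the
 * counters cached in the brt_vdev_t and brt_t.
 */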
static void
brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd)
{
	char name[64];
	dmu_buf_t *db;
	brt_vdev_phys_t *bvphys;
	int error;

	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
	    (u_longlong_t)brtvd->bv_vdevid);
	error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev);
	if (error != 0)
		return;
	ASSERT(brtvd->bv_mos_brtvdev != 0);

	error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db);
	ASSERT0(error);
	if (error != 0)
		return;

	bvphys = db->db_data;
	if (brt->brt_rangesize == 0) {
		brt->brt_rangesize = bvphys->bvp_rangesize;
	} else {
		ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize);
	}

	ASSERT(!brtvd->bv_initiated);
	brt_vdev_realloc(brt, brtvd);

	/* TODO: We don't support VDEV shrinking. */
	ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);

	/*
	 * If the VDEV grew, we will leave new bv_entcount[] entries
	 * zeroed out.
	 */
	error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
	    MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
	    brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
	ASSERT0(error);

	brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
	ASSERT(brtvd->bv_mos_entries != 0);
	brtvd->bv_need_byteswap =
	    (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
	brtvd->bv_totalcount = bvphys->bvp_totalcount;
	brtvd->bv_usedspace = bvphys->bvp_usedspace;
	brtvd->bv_savedspace = bvphys->bvp_savedspace;
	brt->brt_usedspace += brtvd->bv_usedspace;
	brt->brt_savedspace += brtvd->bv_savedspace;

	dmu_buf_rele(db, FTAG);

	BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu",
	    name, (u_longlong_t)brtvd->bv_mos_brtvdev,
	    (u_longlong_t)brtvd->bv_mos_entries);
}

static void
brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd)
{

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_initiated);

	vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
	brtvd->bv_entcount = NULL;
	kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
	brtvd->bv_bitmap = NULL;
	ASSERT0(avl_numnodes(&brtvd->bv_tree));
	avl_destroy(&brtvd->bv_tree);

	brtvd->bv_size = 0;
	brtvd->bv_nblocks = 0;

	brtvd->bv_initiated = FALSE;
	BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
}

static void
brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
	char name[64];
	uint64_t count;
	dmu_buf_t *db;
	brt_vdev_phys_t *bvphys;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_mos_brtvdev != 0);
	ASSERT(brtvd->bv_mos_entries != 0);

	VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count));
	VERIFY0(count);
	VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx));
	BRT_DEBUG("MOS entries destroyed, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_entries);
	brtvd->bv_mos_entries = 0;

	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
	bvphys = db->db_data;
	ASSERT0(bvphys->bvp_totalcount);
	ASSERT0(bvphys->bvp_usedspace);
	ASSERT0(bvphys->bvp_savedspace);
	dmu_buf_rele(db, FTAG);

	VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx));
	BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_brtvdev);
	brtvd->bv_mos_brtvdev = 0;

	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
	    (u_longlong_t)brtvd->bv_vdevid);
	VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx));
	BRT_DEBUG("Pool directory object removed, object=%s", name);

	brt_vdev_dealloc(brt, brtvd);

	spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
}

static void
brt_vdevs_expand(brt_t *brt, uint64_t nvdevs)
{
	brt_vdev_t *brtvd, *vdevs;
	uint64_t vdevid;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT3U(nvdevs, >, brt->brt_nvdevs);

	vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP);
	if (brt->brt_nvdevs > 0) {
		ASSERT(brt->brt_vdevs != NULL);

		memcpy(vdevs, brt->brt_vdevs,
		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
		kmem_free(brt->brt_vdevs,
		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
	}
	for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) {
		brtvd = &vdevs[vdevid];

		brtvd->bv_vdevid = vdevid;
		brtvd->bv_initiated = FALSE;
	}

	BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
	    (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs);

	brt->brt_vdevs = vdevs;
	brt->brt_nvdevs = nvdevs;
}
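
/*
 * The in-memory fast path described in "Minimizing free penalty" above:
 * consult the entcount array for the entry's region and only report that
 * an entry may exist if the region's counter is non-zero.
 */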
static boolean_t
brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre)
{
	uint64_t idx;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));

	idx = bre->bre_offset / brt->brt_rangesize;
	if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) {
		/* VDEV wasn't expanded. */
		return (brt_vdev_entcount_get(brtvd, idx) > 0);
	}

	return (FALSE);
}

static void
brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
    uint64_t dsize)
{
	uint64_t idx;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
	ASSERT(brtvd != NULL);
	ASSERT(brtvd->bv_entcount != NULL);

	brt->brt_savedspace += dsize;
	brtvd->bv_savedspace += dsize;
	brtvd->bv_meta_dirty = TRUE;

	if (bre->bre_refcount > 1) {
		return;
	}

	brt->brt_usedspace += dsize;
	brtvd->bv_usedspace += dsize;

	idx = bre->bre_offset / brt->brt_rangesize;
	if (idx >= brtvd->bv_size) {
		/* VDEV has been expanded. */
		brt_vdev_realloc(brt, brtvd);
	}

	ASSERT3U(idx, <, brtvd->bv_size);

	brtvd->bv_totalcount++;
	brt_vdev_entcount_inc(brtvd, idx);
	brtvd->bv_entcount_dirty = TRUE;
	idx = idx / BRT_BLOCKSIZE / 8;
	BT_SET(brtvd->bv_bitmap, idx);

#ifdef ZFS_DEBUG
	if (zfs_flags & ZFS_DEBUG_BRT)
		brt_vdev_dump(brtvd);
#endif
}

static void
brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
    uint64_t dsize)
{
	uint64_t idx;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd != NULL);
	ASSERT(brtvd->bv_entcount != NULL);

	brt->brt_savedspace -= dsize;
	brtvd->bv_savedspace -= dsize;
	brtvd->bv_meta_dirty = TRUE;

	if (bre->bre_refcount > 0) {
		return;
	}

	brt->brt_usedspace -= dsize;
	brtvd->bv_usedspace -= dsize;

	idx = bre->bre_offset / brt->brt_rangesize;
	ASSERT3U(idx, <, brtvd->bv_size);

	ASSERT(brtvd->bv_totalcount > 0);
	brtvd->bv_totalcount--;
	brt_vdev_entcount_dec(brtvd, idx);
	brtvd->bv_entcount_dirty = TRUE;
	idx = idx / BRT_BLOCKSIZE / 8;
	BT_SET(brtvd->bv_bitmap, idx);

#ifdef ZFS_DEBUG
	if (zfs_flags & ZFS_DEBUG_BRT)
		brt_vdev_dump(brtvd);
#endif
}
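
/*
 * Write this vdev's dirty BRT state out in syncing context: the
 * bv_entcount[] array (currently rewritten in full, see the TODO below)
 * and the brt_vdev_phys_t bookkeeping in the bonus buffer.
 */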
static void
brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
	dmu_buf_t *db;
	brt_vdev_phys_t *bvphys;

	ASSERT(brtvd->bv_meta_dirty);
	ASSERT(brtvd->bv_mos_brtvdev != 0);
	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));

	if (brtvd->bv_entcount_dirty) {
		/*
		 * TODO: Walk brtvd->bv_bitmap and write only the dirty
		 * blocks.
		 */
		dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
		    brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
		    brtvd->bv_entcount, tx);
		memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks));
		brtvd->bv_entcount_dirty = FALSE;
	}

	dmu_buf_will_dirty(db, tx);
	bvphys = db->db_data;
	bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
	bvphys->bvp_size = brtvd->bv_size;
	if (brtvd->bv_need_byteswap) {
		bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
	} else {
		bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
	}
	bvphys->bvp_totalcount = brtvd->bv_totalcount;
	bvphys->bvp_rangesize = brt->brt_rangesize;
	bvphys->bvp_usedspace = brtvd->bv_usedspace;
	bvphys->bvp_savedspace = brtvd->bv_savedspace;
	dmu_buf_rele(db, FTAG);

	brtvd->bv_meta_dirty = FALSE;
}

static void
brt_vdevs_alloc(brt_t *brt, boolean_t load)
{
	brt_vdev_t *brtvd;
	uint64_t vdevid;

	brt_wlock(brt);

	brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children);

	if (load) {
		for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
			brtvd = &brt->brt_vdevs[vdevid];
			ASSERT(brtvd->bv_entcount == NULL);

			brt_vdev_load(brt, brtvd);
		}
	}

	if (brt->brt_rangesize == 0) {
		brt->brt_rangesize = BRT_RANGESIZE;
	}

	brt_unlock(brt);
}

static void
brt_vdevs_free(brt_t *brt)
{
	brt_vdev_t *brtvd;
	uint64_t vdevid;

	brt_wlock(brt);

	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
		brtvd = &brt->brt_vdevs[vdevid];
		if (brtvd->bv_initiated)
			brt_vdev_dealloc(brt, brtvd);
	}
	kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs);

	brt_unlock(brt);
}

static void
brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
{

	bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]);
	bre->bre_refcount = 0;

	*vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
}

static int
brt_entry_compare(const void *x1, const void *x2)
{
	const brt_entry_t *bre1 = x1;
	const brt_entry_t *bre2 = x2;

	return (TREE_CMP(bre1->bre_offset, bre2->bre_offset));
}

static int
brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre)
{
	uint64_t mos_entries;
	int error;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));

	if (!brt_vdev_lookup(brt, brtvd, bre))
		return (SET_ERROR(ENOENT));

	/*
	 * Remember the mos_entries object number. After we reacquire the
	 * BRT lock, the brtvd pointer may be invalid.
	 */
	mos_entries = brtvd->bv_mos_entries;
	if (mos_entries == 0)
		return (SET_ERROR(ENOENT));

	brt_unlock(brt);

	error = zap_lookup_uint64(brt->brt_mos, mos_entries, &bre->bre_offset,
	    BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), &bre->bre_refcount);

	brt_wlock(brt);

	return (error);
}

static void
brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
{
	brt_vdev_t *brtvd;
	uint64_t mos_entries = 0;

	brt_rlock(brt);
	brtvd = brt_vdev(brt, vdevid);
	if (brtvd != NULL)
		mos_entries = brtvd->bv_mos_entries;
	brt_unlock(brt);

	if (mos_entries == 0)
		return;

	(void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
}

/*
 * Return TRUE if we _can_ have a BRT entry for this bp. It might be a false
 * positive, but gives us a quick answer whether we should look into the
 * BRT, which may require reads and thus will be more expensive.
 */
boolean_t
brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
{
	brt_t *brt = spa->spa_brt;
	brt_vdev_t *brtvd;
	brt_entry_t bre_search;
	boolean_t mayexists = FALSE;
	uint64_t vdevid;

	brt_entry_fill(bp, &bre_search, &vdevid);

	brt_rlock(brt);

	brtvd = brt_vdev(brt, vdevid);
	if (brtvd != NULL && brtvd->bv_initiated) {
		if (!avl_is_empty(&brtvd->bv_tree) ||
		    brt_vdev_lookup(brt, brtvd, &bre_search)) {
			mayexists = TRUE;
		}
	}

	brt_unlock(brt);

	return (mayexists);
}

uint64_t
brt_get_dspace(spa_t *spa)
{
	brt_t *brt = spa->spa_brt;

	if (brt == NULL)
		return (0);

	return (brt->brt_savedspace);
}

uint64_t
brt_get_used(spa_t *spa)
{
	brt_t *brt = spa->spa_brt;

	if (brt == NULL)
		return (0);

	return (brt->brt_usedspace);
}

uint64_t
brt_get_saved(spa_t *spa)
{
	brt_t *brt = spa->spa_brt;

	if (brt == NULL)
		return (0);

	return (brt->brt_savedspace);
}

uint64_t
brt_get_ratio(spa_t *spa)
{
	brt_t *brt = spa->spa_brt;

	if (brt->brt_usedspace == 0)
		return (100);

	return ((brt->brt_usedspace + brt->brt_savedspace) * 100 /
	    brt->brt_usedspace);
}
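
/*
 * For example, 1GB of cloned data that is referenced twice uses 1GB
 * (brt_usedspace) and saves 1GB (brt_savedspace), so brt_get_ratio()
 * returns (1 + 1) * 100 / 1 = 200, i.e. a 2.00x clone ratio.
 */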

static int
brt_kstats_update(kstat_t *ksp, int rw)
{
	brt_stats_t *bs = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	bs->brt_addref_entry_in_memory.value.ui64 =
	    wmsum_value(&brt_sums.brt_addref_entry_in_memory);
	bs->brt_addref_entry_not_on_disk.value.ui64 =
	    wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
	bs->brt_addref_entry_on_disk.value.ui64 =
	    wmsum_value(&brt_sums.brt_addref_entry_on_disk);
	bs->brt_addref_entry_read_lost_race.value.ui64 =
	    wmsum_value(&brt_sums.brt_addref_entry_read_lost_race);
	bs->brt_decref_entry_in_memory.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_in_memory);
	bs->brt_decref_entry_loaded_from_disk.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
	bs->brt_decref_entry_not_in_memory.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
	bs->brt_decref_entry_not_on_disk.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_not_on_disk);
	bs->brt_decref_entry_read_lost_race.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
	bs->brt_decref_entry_still_referenced.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
	bs->brt_decref_free_data_later.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_free_data_later);
	bs->brt_decref_free_data_now.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_free_data_now);
	bs->brt_decref_no_entry.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_no_entry);

	return (0);
}

static void
brt_stat_init(void)
{

	wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0);
	wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
	wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
	wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0);
	wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
	wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
	wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
	wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0);
	wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
	wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
	wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
	wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
	wmsum_init(&brt_sums.brt_decref_no_entry, 0);

	brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (brt_ksp != NULL) {
		brt_ksp->ks_data = &brt_stats;
		brt_ksp->ks_update = brt_kstats_update;
		kstat_install(brt_ksp);
	}
}

static void
brt_stat_fini(void)
{
	if (brt_ksp != NULL) {
		kstat_delete(brt_ksp);
		brt_ksp = NULL;
	}

	wmsum_fini(&brt_sums.brt_addref_entry_in_memory);
	wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
	wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
	wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race);
	wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
	wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
	wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
	wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk);
	wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
	wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
	wmsum_fini(&brt_sums.brt_decref_free_data_later);
	wmsum_fini(&brt_sums.brt_decref_free_data_now);
	wmsum_fini(&brt_sums.brt_decref_no_entry);
}

void
brt_init(void)
{
	brt_entry_cache = kmem_cache_create("brt_entry_cache",
	    sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache",
	    sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	brt_stat_init();
}

void
brt_fini(void)
{
	brt_stat_fini();

	kmem_cache_destroy(brt_entry_cache);
	kmem_cache_destroy(brt_pending_entry_cache);
}

static brt_entry_t *
brt_entry_alloc(const brt_entry_t *bre_init)
{
	brt_entry_t *bre;

	bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
	bre->bre_offset = bre_init->bre_offset;
	bre->bre_refcount = bre_init->bre_refcount;

	return (bre);
}

static void
brt_entry_free(brt_entry_t *bre)
{

	kmem_cache_free(brt_entry_cache, bre);
}
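
/*
 * A note on locking in brt_entry_addref() and brt_entry_decref() below:
 * brt_entry_lookup() drops the BRT lock for the ZAP lookup and reacquires
 * it as a writer, so both the brtvd pointer and the in-memory AVL tree may
 * have changed by the time it returns; the callers therefore re-lookup
 * both, and count lost races via the *_read_lost_race kstats.
 */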
static void
brt_entry_addref(brt_t *brt, const blkptr_t *bp)
{
	brt_vdev_t *brtvd;
	brt_entry_t *bre, *racebre;
	brt_entry_t bre_search;
	avl_index_t where;
	uint64_t vdevid;
	int error;

	ASSERT(!RW_WRITE_HELD(&brt->brt_lock));

	brt_entry_fill(bp, &bre_search, &vdevid);

	brt_wlock(brt);

	brtvd = brt_vdev(brt, vdevid);
	if (brtvd == NULL) {
		ASSERT3U(vdevid, >=, brt->brt_nvdevs);

		/* A new VDEV was added. */
		brt_vdevs_expand(brt, vdevid + 1);
		brtvd = brt_vdev(brt, vdevid);
	}
	ASSERT(brtvd != NULL);
	if (!brtvd->bv_initiated)
		brt_vdev_realloc(brt, brtvd);

	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
	if (bre != NULL) {
		BRTSTAT_BUMP(brt_addref_entry_in_memory);
	} else {
		/*
		 * brt_entry_lookup() may drop the BRT (read) lock and
		 * reacquire it (write).
		 */
		error = brt_entry_lookup(brt, brtvd, &bre_search);
		/* bre_search now contains the correct bre_refcount */
		ASSERT(error == 0 || error == ENOENT);
		if (error == 0)
			BRTSTAT_BUMP(brt_addref_entry_on_disk);
		else
			BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
		/*
		 * While the BRT lock was dropped, brt_vdevs[] may have been
		 * expanded and reallocated; we need to update the brtvd
		 * pointer.
		 */
		brtvd = brt_vdev(brt, vdevid);
		ASSERT(brtvd != NULL);

		racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
		if (racebre == NULL) {
			bre = brt_entry_alloc(&bre_search);
			ASSERT(RW_WRITE_HELD(&brt->brt_lock));
			avl_insert(&brtvd->bv_tree, bre, where);
			brt->brt_nentries++;
		} else {
			/*
			 * The entry was added when the BRT lock was dropped
			 * in brt_entry_lookup().
			 */
			BRTSTAT_BUMP(brt_addref_entry_read_lost_race);
			bre = racebre;
		}
	}
	bre->bre_refcount++;
	brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));

	brt_unlock(brt);
}

/* Return TRUE if block should be freed immediately. */
boolean_t
brt_entry_decref(spa_t *spa, const blkptr_t *bp)
{
	brt_t *brt = spa->spa_brt;
	brt_vdev_t *brtvd;
	brt_entry_t *bre, *racebre;
	brt_entry_t bre_search;
	avl_index_t where;
	uint64_t vdevid;
	int error;

	brt_entry_fill(bp, &bre_search, &vdevid);

	brt_wlock(brt);

	brtvd = brt_vdev(brt, vdevid);
	ASSERT(brtvd != NULL);

	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
	if (bre != NULL) {
		BRTSTAT_BUMP(brt_decref_entry_in_memory);
		goto out;
	} else {
		BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
	}

	/*
	 * brt_entry_lookup() may drop the BRT lock and reacquire it.
	 */
	error = brt_entry_lookup(brt, brtvd, &bre_search);
	/* bre_search now contains the correct bre_refcount */
	ASSERT(error == 0 || error == ENOENT);
	/*
	 * While the BRT lock was dropped, brt_vdevs[] may have been expanded
	 * and reallocated; we need to update the brtvd pointer.
	 */
	brtvd = brt_vdev(brt, vdevid);
	ASSERT(brtvd != NULL);

	if (error == ENOENT) {
		BRTSTAT_BUMP(brt_decref_entry_not_on_disk);
		bre = NULL;
		goto out;
	}

	racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
	if (racebre != NULL) {
		/*
		 * The entry was added when the BRT lock was dropped in
		 * brt_entry_lookup().
		 */
		BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
		bre = racebre;
		goto out;
	}

	BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
	bre = brt_entry_alloc(&bre_search);
	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	avl_insert(&brtvd->bv_tree, bre, where);
	brt->brt_nentries++;

out:
	if (bre == NULL) {
		/*
		 * This is a free of a regular (not cloned) block.
		 */
		brt_unlock(brt);
		BRTSTAT_BUMP(brt_decref_no_entry);
		return (B_TRUE);
	}
	if (bre->bre_refcount == 0) {
		brt_unlock(brt);
		BRTSTAT_BUMP(brt_decref_free_data_now);
		return (B_TRUE);
	}

	ASSERT(bre->bre_refcount > 0);
	bre->bre_refcount--;
	if (bre->bre_refcount == 0)
		BRTSTAT_BUMP(brt_decref_free_data_later);
	else
		BRTSTAT_BUMP(brt_decref_entry_still_referenced);
	brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));

	brt_unlock(brt);

	return (B_FALSE);
}

uint64_t
brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
{
	brt_t *brt = spa->spa_brt;
	brt_vdev_t *brtvd;
	brt_entry_t bre_search, *bre;
	uint64_t vdevid, refcnt;
	int error;

	brt_entry_fill(bp, &bre_search, &vdevid);

	brt_rlock(brt);

	brtvd = brt_vdev(brt, vdevid);
	ASSERT(brtvd != NULL);

	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
	if (bre == NULL) {
		error = brt_entry_lookup(brt, brtvd, &bre_search);
		ASSERT(error == 0 || error == ENOENT);
		if (error == ENOENT)
			refcnt = 0;
		else
			refcnt = bre_search.bre_refcount;
	} else
		refcnt = bre->bre_refcount;

	brt_unlock(brt);
	return (refcnt);
}

static void
brt_prefetch(brt_t *brt, const blkptr_t *bp)
{
	brt_entry_t bre;
	uint64_t vdevid;

	ASSERT(bp != NULL);

	if (!brt_zap_prefetch)
		return;

	brt_entry_fill(bp, &bre, &vdevid);

	brt_entry_prefetch(brt, vdevid, &bre);
}

static int
brt_pending_entry_compare(const void *x1, const void *x2)
{
	const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2;
	const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp;
	int cmp;

	cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]),
	    DVA_GET_VDEV(&bp2->blk_dva[0]));
	if (cmp == 0) {
		cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
		    DVA_GET_OFFSET(&bp2->blk_dva[0]));
		if (unlikely(cmp == 0)) {
			cmp = TREE_CMP(BP_GET_BIRTH(bp1), BP_GET_BIRTH(bp2));
		}
	}

	return (cmp);
}
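
/*
 * Usage sketch, per the design notes above: dmu_brt_clone() is expected to
 * call brt_pending_add() once for every cloned BP that points to real data;
 * the references are applied to the BRT only later, in syncing context, by
 * brt_pending_apply().
 */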
void
brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
{
	brt_t *brt;
	avl_tree_t *pending_tree;
	kmutex_t *pending_lock;
	brt_pending_entry_t *bpe, *newbpe;
	avl_index_t where;
	uint64_t txg;

	brt = spa->spa_brt;
	txg = dmu_tx_get_txg(tx);
	ASSERT3U(txg, !=, 0);
	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];

	newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP);
	newbpe->bpe_bp = *bp;
	newbpe->bpe_count = 1;

	mutex_enter(pending_lock);

	bpe = avl_find(pending_tree, newbpe, &where);
	if (bpe == NULL) {
		avl_insert(pending_tree, newbpe, where);
		newbpe = NULL;
	} else {
		bpe->bpe_count++;
	}

	mutex_exit(pending_lock);

	if (newbpe != NULL) {
		ASSERT(bpe != NULL);
		ASSERT(bpe != newbpe);
		kmem_cache_free(brt_pending_entry_cache, newbpe);
	} else {
		ASSERT(bpe == NULL);

		/* Prefetch the BRT entry for the syncing context. */
		brt_prefetch(brt, bp);
	}
}

void
brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
{
	brt_t *brt;
	avl_tree_t *pending_tree;
	kmutex_t *pending_lock;
	brt_pending_entry_t *bpe, bpe_search;
	uint64_t txg;

	brt = spa->spa_brt;
	txg = dmu_tx_get_txg(tx);
	ASSERT3U(txg, !=, 0);
	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];

	bpe_search.bpe_bp = *bp;

	mutex_enter(pending_lock);

	bpe = avl_find(pending_tree, &bpe_search, NULL);
	/* I believe we should always find bpe when this function is called. */
	if (bpe != NULL) {
		ASSERT(bpe->bpe_count > 0);

		bpe->bpe_count--;
		if (bpe->bpe_count == 0) {
			avl_remove(pending_tree, bpe);
			kmem_cache_free(brt_pending_entry_cache, bpe);
		}
	}

	mutex_exit(pending_lock);
}

void
brt_pending_apply(spa_t *spa, uint64_t txg)
{
	brt_t *brt = spa->spa_brt;
	brt_pending_entry_t *bpe;
	avl_tree_t *pending_tree;
	void *c;

	ASSERT3U(txg, !=, 0);

	/*
	 * We are in syncing context, so no other brt_pending_tree accesses
	 * are possible for this TXG. We don't need to acquire
	 * brt_pending_lock.
	 */
	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];

	c = NULL;
	while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) {
		boolean_t added_to_ddt;

		for (int i = 0; i < bpe->bpe_count; i++) {
			/*
			 * If the block has the DEDUP bit set, it means that
			 * it already exists in the DEDUP table, so we can
			 * just use that instead of creating a new entry in
			 * the BRT table.
			 */
			if (BP_GET_DEDUP(&bpe->bpe_bp)) {
				added_to_ddt = ddt_addref(spa, &bpe->bpe_bp);
			} else {
				added_to_ddt = B_FALSE;
			}
			if (!added_to_ddt)
				brt_entry_addref(brt, &bpe->bpe_bp);
		}

		kmem_cache_free(brt_pending_entry_cache, bpe);
	}
}

static void
brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
{
	if (bre->bre_refcount == 0) {
		int error = zap_remove_uint64_by_dnode(dn, &bre->bre_offset,
		    BRT_KEY_WORDS, tx);
		VERIFY(error == 0 || error == ENOENT);
	} else {
		VERIFY0(zap_update_uint64_by_dnode(dn, &bre->bre_offset,
		    BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount),
		    &bre->bre_refcount, tx));
	}
}
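
/*
 * Flush all in-memory BRT changes for this txg to the MOS: create per-vdev
 * objects on first use, push each vdev's AVL tree of entries into its
 * entries ZAP, sync the per-vdev metadata, and destroy per-vdev objects
 * that became empty.
 */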
static void
brt_sync_table(brt_t *brt, dmu_tx_t *tx)
{
	brt_vdev_t *brtvd;
	brt_entry_t *bre;
	dnode_t *dn;
	uint64_t vdevid;
	void *c;

	brt_wlock(brt);

	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
		brtvd = &brt->brt_vdevs[vdevid];

		if (!brtvd->bv_initiated)
			continue;

		if (!brtvd->bv_meta_dirty) {
			ASSERT(!brtvd->bv_entcount_dirty);
			ASSERT0(avl_numnodes(&brtvd->bv_tree));
			continue;
		}

		ASSERT(!brtvd->bv_entcount_dirty ||
		    avl_numnodes(&brtvd->bv_tree) != 0);

		if (brtvd->bv_mos_brtvdev == 0)
			brt_vdev_create(brt, brtvd, tx);

		VERIFY0(dnode_hold(brt->brt_mos, brtvd->bv_mos_entries,
		    FTAG, &dn));

		c = NULL;
		while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
			brt_sync_entry(dn, bre, tx);
			brt_entry_free(bre);
			ASSERT(brt->brt_nentries > 0);
			brt->brt_nentries--;
		}

		dnode_rele(dn, FTAG);

		brt_vdev_sync(brt, brtvd, tx);

		if (brtvd->bv_totalcount == 0)
			brt_vdev_destroy(brt, brtvd, tx);
	}

	ASSERT0(brt->brt_nentries);

	brt_unlock(brt);
}

void
brt_sync(spa_t *spa, uint64_t txg)
{
	dmu_tx_t *tx;
	brt_t *brt;

	ASSERT(spa_syncing_txg(spa) == txg);

	brt = spa->spa_brt;
	brt_rlock(brt);
	if (brt->brt_nentries == 0) {
		/* No changes. */
		brt_unlock(brt);
		return;
	}
	brt_unlock(brt);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	brt_sync_table(brt, tx);

	dmu_tx_commit(tx);
}

static void
brt_table_alloc(brt_t *brt)
{

	for (int i = 0; i < TXG_SIZE; i++) {
		avl_create(&brt->brt_pending_tree[i],
		    brt_pending_entry_compare,
		    sizeof (brt_pending_entry_t),
		    offsetof(brt_pending_entry_t, bpe_node));
		mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT,
		    NULL);
	}
}

static void
brt_table_free(brt_t *brt)
{

	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT(avl_is_empty(&brt->brt_pending_tree[i]));

		avl_destroy(&brt->brt_pending_tree[i]);
		mutex_destroy(&brt->brt_pending_lock[i]);
	}
}

static void
brt_alloc(spa_t *spa)
{
	brt_t *brt;

	ASSERT(spa->spa_brt == NULL);

	brt = kmem_zalloc(sizeof (*brt), KM_SLEEP);
	rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL);
	brt->brt_spa = spa;
	brt->brt_rangesize = 0;
	brt->brt_nentries = 0;
	brt->brt_vdevs = NULL;
	brt->brt_nvdevs = 0;
	brt_table_alloc(brt);

	spa->spa_brt = brt;
}

void
brt_create(spa_t *spa)
{

	brt_alloc(spa);
	brt_vdevs_alloc(spa->spa_brt, B_FALSE);
}

int
brt_load(spa_t *spa)
{

	brt_alloc(spa);
	brt_vdevs_alloc(spa->spa_brt, B_TRUE);

	return (0);
}

void
brt_unload(spa_t *spa)
{
	brt_t *brt = spa->spa_brt;

	if (brt == NULL)
		return;

	brt_vdevs_free(brt);
	brt_table_free(brt);
	rw_destroy(&brt->brt_lock);
	kmem_free(brt, sizeof (*brt));
	spa->spa_brt = NULL;
}

/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW,
	"Enable prefetching of BRT ZAP entries");
ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW,
	"BRT ZAP leaf blockshift");
ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW,
	"BRT ZAP indirect blockshift");
/* END CSTYLED */