brt.c (783d3ff6d7fae619db8a7990b8a6387de0c677b5) | brt.c (718519f4efc71096422fc71dab90b2a3369871ff) |
---|---|
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE --- 229 unchanged lines hidden (view full) --- 238 * first references are dropped during ZIL destroy by zil_free_clone_range(). 239 * It is possible that after zil_claim() we never mount the destination, so 240 * we never replay its ZIL and just destroy it. In this case the only taken 241 * references will be dropped by zil_free_clone_range(), since the cloning is 242 * not going to ever take place. 243 */ 244 245static kmem_cache_t *brt_entry_cache; | 1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE --- 229 unchanged lines hidden (view full) --- 238 * first references are dropped during ZIL destroy by zil_free_clone_range(). 239 * It is possible that after zil_claim() we never mount the destination, so 240 * we never replay its ZIL and just destroy it. In this case the only taken 241 * references will be dropped by zil_free_clone_range(), since the cloning is 242 * not going to ever take place. 243 */ 244 245static kmem_cache_t *brt_entry_cache; |
246static kmem_cache_t *brt_pending_entry_cache; | |
247 248/* 249 * Enable/disable prefetching of BRT entries that we are going to modify. 250 */ 251static int brt_zap_prefetch = 1; 252 253#ifdef ZFS_DEBUG 254#define BRT_DEBUG(...) do { \ --- 6 unchanged lines hidden (view full) --- 261#endif 262 263static int brt_zap_default_bs = 12; 264static int brt_zap_default_ibs = 12; 265 266static kstat_t *brt_ksp; 267 268typedef struct brt_stats { | 246 247/* 248 * Enable/disable prefetching of BRT entries that we are going to modify. 249 */ 250static int brt_zap_prefetch = 1; 251 252#ifdef ZFS_DEBUG 253#define BRT_DEBUG(...) do { \ --- 6 unchanged lines hidden (view full) --- 260#endif 261 262static int brt_zap_default_bs = 12; 263static int brt_zap_default_ibs = 12; 264 265static kstat_t *brt_ksp; 266 267typedef struct brt_stats { |
269 kstat_named_t brt_addref_entry_in_memory; | |
270 kstat_named_t brt_addref_entry_not_on_disk; 271 kstat_named_t brt_addref_entry_on_disk; | 268 kstat_named_t brt_addref_entry_not_on_disk; 269 kstat_named_t brt_addref_entry_on_disk; |
272 kstat_named_t brt_addref_entry_read_lost_race; | |
273 kstat_named_t brt_decref_entry_in_memory; 274 kstat_named_t brt_decref_entry_loaded_from_disk; 275 kstat_named_t brt_decref_entry_not_in_memory; | 270 kstat_named_t brt_decref_entry_in_memory; 271 kstat_named_t brt_decref_entry_loaded_from_disk; 272 kstat_named_t brt_decref_entry_not_in_memory; |
276 kstat_named_t brt_decref_entry_not_on_disk; | |
277 kstat_named_t brt_decref_entry_read_lost_race; 278 kstat_named_t brt_decref_entry_still_referenced; 279 kstat_named_t brt_decref_free_data_later; 280 kstat_named_t brt_decref_free_data_now; 281 kstat_named_t brt_decref_no_entry; 282} brt_stats_t; 283 284static brt_stats_t brt_stats = { | 273 kstat_named_t brt_decref_entry_read_lost_race; 274 kstat_named_t brt_decref_entry_still_referenced; 275 kstat_named_t brt_decref_free_data_later; 276 kstat_named_t brt_decref_free_data_now; 277 kstat_named_t brt_decref_no_entry; 278} brt_stats_t; 279 280static brt_stats_t brt_stats = { |
285 { "addref_entry_in_memory", KSTAT_DATA_UINT64 }, | |
286 { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 }, 287 { "addref_entry_on_disk", KSTAT_DATA_UINT64 }, | 281 { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 }, 282 { "addref_entry_on_disk", KSTAT_DATA_UINT64 }, |
288 { "addref_entry_read_lost_race", KSTAT_DATA_UINT64 }, | |
289 { "decref_entry_in_memory", KSTAT_DATA_UINT64 }, 290 { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 }, 291 { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 }, | 283 { "decref_entry_in_memory", KSTAT_DATA_UINT64 }, 284 { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 }, 285 { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 }, |
292 { "decref_entry_not_on_disk", KSTAT_DATA_UINT64 }, | |
293 { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 }, 294 { "decref_entry_still_referenced", KSTAT_DATA_UINT64 }, 295 { "decref_free_data_later", KSTAT_DATA_UINT64 }, 296 { "decref_free_data_now", KSTAT_DATA_UINT64 }, 297 { "decref_no_entry", KSTAT_DATA_UINT64 } 298}; 299 300struct { | 286 { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 }, 287 { "decref_entry_still_referenced", KSTAT_DATA_UINT64 }, 288 { "decref_free_data_later", KSTAT_DATA_UINT64 }, 289 { "decref_free_data_now", KSTAT_DATA_UINT64 }, 290 { "decref_no_entry", KSTAT_DATA_UINT64 } 291}; 292 293struct { |
301 wmsum_t brt_addref_entry_in_memory; | |
302 wmsum_t brt_addref_entry_not_on_disk; 303 wmsum_t brt_addref_entry_on_disk; | 294 wmsum_t brt_addref_entry_not_on_disk; 295 wmsum_t brt_addref_entry_on_disk; |
304 wmsum_t brt_addref_entry_read_lost_race; | |
305 wmsum_t brt_decref_entry_in_memory; 306 wmsum_t brt_decref_entry_loaded_from_disk; 307 wmsum_t brt_decref_entry_not_in_memory; | 296 wmsum_t brt_decref_entry_in_memory; 297 wmsum_t brt_decref_entry_loaded_from_disk; 298 wmsum_t brt_decref_entry_not_in_memory; |
308 wmsum_t brt_decref_entry_not_on_disk; | |
309 wmsum_t brt_decref_entry_read_lost_race; 310 wmsum_t brt_decref_entry_still_referenced; 311 wmsum_t brt_decref_free_data_later; 312 wmsum_t brt_decref_free_data_now; 313 wmsum_t brt_decref_no_entry; 314} brt_sums; 315 316#define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1) 317 318static int brt_entry_compare(const void *x1, const void *x2); | 299 wmsum_t brt_decref_entry_read_lost_race; 300 wmsum_t brt_decref_entry_still_referenced; 301 wmsum_t brt_decref_free_data_later; 302 wmsum_t brt_decref_free_data_now; 303 wmsum_t brt_decref_no_entry; 304} brt_sums; 305 306#define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1) 307 308static int brt_entry_compare(const void *x1, const void *x2); |
319static int brt_pending_entry_compare(const void *x1, const void *x2); | 309static void brt_vdevs_expand(spa_t *spa, uint64_t nvdevs); |
320 321static void | 310 311static void |
322brt_rlock(brt_t *brt) | 312brt_rlock(spa_t *spa) |
323{ | 313{ |
324 rw_enter(&brt->brt_lock, RW_READER); | 314 rw_enter(&spa->spa_brt_lock, RW_READER); |
325} 326 327static void | 315} 316 317static void |
328brt_wlock(brt_t *brt) | 318brt_wlock(spa_t *spa) |
329{ | 319{ |
330 rw_enter(&brt->brt_lock, RW_WRITER); | 320 rw_enter(&spa->spa_brt_lock, RW_WRITER); |
331} 332 333static void | 321} 322 323static void |
334brt_unlock(brt_t *brt) | 324brt_unlock(spa_t *spa) |
335{ | 325{ |
336 rw_exit(&brt->brt_lock); | 326 rw_exit(&spa->spa_brt_lock); |
337} 338 339static uint16_t 340brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx) 341{ 342 343 ASSERT3U(idx, <, brtvd->bv_size); 344 --- 44 unchanged lines hidden (view full) --- 389} 390 391#ifdef ZFS_DEBUG 392static void 393brt_vdev_dump(brt_vdev_t *brtvd) 394{ 395 uint64_t idx; 396 | 327} 328 329static uint16_t 330brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx) 331{ 332 333 ASSERT3U(idx, <, brtvd->bv_size); 334 --- 44 unchanged lines hidden (view full) --- 379} 380 381#ifdef ZFS_DEBUG 382static void 383brt_vdev_dump(brt_vdev_t *brtvd) 384{ 385 uint64_t idx; 386 |
387 uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); |
|
397 zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d " | 388 zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d " |
398 "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n", | 389 "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu", |
399 (u_longlong_t)brtvd->bv_vdevid, 400 brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty, 401 (u_longlong_t)brtvd->bv_size, 402 (u_longlong_t)brtvd->bv_totalcount, | 390 (u_longlong_t)brtvd->bv_vdevid, 391 brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty, 392 (u_longlong_t)brtvd->bv_size, 393 (u_longlong_t)brtvd->bv_totalcount, |
403 (u_longlong_t)brtvd->bv_nblocks, 404 (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks)); | 394 (u_longlong_t)nblocks, 395 (size_t)BT_SIZEOFMAP(nblocks)); |
405 if (brtvd->bv_totalcount > 0) { 406 zfs_dbgmsg(" entcounts:"); 407 for (idx = 0; idx < brtvd->bv_size; idx++) { 408 uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx); 409 if (entcnt > 0) { 410 zfs_dbgmsg(" [%04llu] %hu", 411 (u_longlong_t)idx, entcnt); 412 } 413 } 414 } 415 if (brtvd->bv_entcount_dirty) { 416 char *bitmap; 417 | 396 if (brtvd->bv_totalcount > 0) { 397 zfs_dbgmsg(" entcounts:"); 398 for (idx = 0; idx < brtvd->bv_size; idx++) { 399 uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx); 400 if (entcnt > 0) { 401 zfs_dbgmsg(" [%04llu] %hu", 402 (u_longlong_t)idx, entcnt); 403 } 404 } 405 } 406 if (brtvd->bv_entcount_dirty) { 407 char *bitmap; 408 |
418 bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP); 419 for (idx = 0; idx < brtvd->bv_nblocks; idx++) { | 409 bitmap = kmem_alloc(nblocks + 1, KM_SLEEP); 410 for (idx = 0; idx < nblocks; idx++) { |
420 bitmap[idx] = 421 BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.'; 422 } 423 bitmap[idx] = '\0'; 424 zfs_dbgmsg(" dirty: %s", bitmap); | 411 bitmap[idx] = 412 BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.'; 413 } 414 bitmap[idx] = '\0'; 415 zfs_dbgmsg(" dirty: %s", bitmap); |
425 kmem_free(bitmap, brtvd->bv_nblocks + 1); | 416 kmem_free(bitmap, nblocks + 1); |
426 } 427} 428#endif 429 430static brt_vdev_t * | 417 } 418} 419#endif 420 421static brt_vdev_t * |
431brt_vdev(brt_t *brt, uint64_t vdevid) | 422brt_vdev(spa_t *spa, uint64_t vdevid, boolean_t alloc) |
432{ | 423{ |
433 brt_vdev_t *brtvd; | 424 brt_vdev_t *brtvd = NULL; |
434 | 425 |
435 ASSERT(RW_LOCK_HELD(&brt->brt_lock)); 436 437 if (vdevid < brt->brt_nvdevs) { 438 brtvd = &brt->brt_vdevs[vdevid]; 439 } else { 440 brtvd = NULL; | 426 brt_rlock(spa); 427 if (vdevid < spa->spa_brt_nvdevs) { 428 brtvd = spa->spa_brt_vdevs[vdevid]; 429 } else if (alloc) { 430 /* New VDEV was added. */ 431 brt_unlock(spa); 432 brt_wlock(spa); 433 if (vdevid >= spa->spa_brt_nvdevs) 434 brt_vdevs_expand(spa, vdevid + 1); 435 brtvd = spa->spa_brt_vdevs[vdevid]; |
441 } | 436 } |
442 | 437 brt_unlock(spa); |
443 return (brtvd); 444} 445 446static void | 438 return (brtvd); 439} 440 441static void |
447brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) | 442brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) |
448{ 449 char name[64]; 450 | 443{ 444 char name[64]; 445 |
451 ASSERT(RW_WRITE_HELD(&brt->brt_lock)); | 446 ASSERT(brtvd->bv_initiated); |
452 ASSERT0(brtvd->bv_mos_brtvdev); 453 ASSERT0(brtvd->bv_mos_entries); | 447 ASSERT0(brtvd->bv_mos_brtvdev); 448 ASSERT0(brtvd->bv_mos_entries); |
454 ASSERT(brtvd->bv_entcount != NULL); 455 ASSERT(brtvd->bv_size > 0); 456 ASSERT(brtvd->bv_bitmap != NULL); 457 ASSERT(brtvd->bv_nblocks > 0); | |
458 | 449 |
459 brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0, | 450 uint64_t mos_entries = zap_create_flags(spa->spa_meta_objset, 0, |
460 ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA, 461 brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx); | 451 ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA, 452 brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx); |
462 VERIFY(brtvd->bv_mos_entries != 0); | 453 VERIFY(mos_entries != 0); 454 VERIFY0(dnode_hold(spa->spa_meta_objset, mos_entries, brtvd, 455 &brtvd->bv_mos_entries_dnode)); 456 rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); 457 brtvd->bv_mos_entries = mos_entries; 458 rw_exit(&brtvd->bv_mos_entries_lock); |
463 BRT_DEBUG("MOS entries created, object=%llu", 464 (u_longlong_t)brtvd->bv_mos_entries); 465 466 /* 467 * We allocate DMU buffer to store the bv_entcount[] array. 468 * We will keep array size (bv_size) and cumulative count for all 469 * bv_entcount[]s (bv_totalcount) in the bonus buffer. 470 */ | 459 BRT_DEBUG("MOS entries created, object=%llu", 460 (u_longlong_t)brtvd->bv_mos_entries); 461 462 /* 463 * We allocate DMU buffer to store the bv_entcount[] array. 464 * We will keep array size (bv_size) and cumulative count for all 465 * bv_entcount[]s (bv_totalcount) in the bonus buffer. 466 */ |
471 brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos, | 467 brtvd->bv_mos_brtvdev = dmu_object_alloc(spa->spa_meta_objset, |
472 DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE, 473 DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx); 474 VERIFY(brtvd->bv_mos_brtvdev != 0); 475 BRT_DEBUG("MOS BRT VDEV created, object=%llu", 476 (u_longlong_t)brtvd->bv_mos_brtvdev); 477 478 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, 479 (u_longlong_t)brtvd->bv_vdevid); | 468 DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE, 469 DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx); 470 VERIFY(brtvd->bv_mos_brtvdev != 0); 471 BRT_DEBUG("MOS BRT VDEV created, object=%llu", 472 (u_longlong_t)brtvd->bv_mos_brtvdev); 473 474 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, 475 (u_longlong_t)brtvd->bv_vdevid); |
480 VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, | 476 VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, |
481 sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx)); 482 BRT_DEBUG("Pool directory object created, object=%s", name); 483 | 477 sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx)); 478 BRT_DEBUG("Pool directory object created, object=%s", name); 479 |
484 spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx); | 480 spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx); |
485} 486 487static void | 481} 482 483static void |
488brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd) | 484brt_vdev_realloc(spa_t *spa, brt_vdev_t *brtvd) |
489{ 490 vdev_t *vd; 491 uint16_t *entcount; 492 ulong_t *bitmap; | 485{ 486 vdev_t *vd; 487 uint16_t *entcount; 488 ulong_t *bitmap; |
493 uint64_t nblocks, size; | 489 uint64_t nblocks, onblocks, size; |
494 | 490 |
495 ASSERT(RW_WRITE_HELD(&brt->brt_lock)); | 491 ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); |
496 | 492 |
497 spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER); 498 vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid); 499 size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1; 500 spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG); | 493 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 494 vd = vdev_lookup_top(spa, brtvd->bv_vdevid); 495 size = (vdev_get_min_asize(vd) - 1) / spa->spa_brt_rangesize + 1; 496 spa_config_exit(spa, SCL_VDEV, FTAG); |
501 502 entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP); 503 nblocks = BRT_RANGESIZE_TO_NBLOCKS(size); 504 bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP); 505 506 if (!brtvd->bv_initiated) { 507 ASSERT0(brtvd->bv_size); | 497 498 entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP); 499 nblocks = BRT_RANGESIZE_TO_NBLOCKS(size); 500 bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP); 501 502 if (!brtvd->bv_initiated) { 503 ASSERT0(brtvd->bv_size); |
508 ASSERT(brtvd->bv_entcount == NULL); 509 ASSERT(brtvd->bv_bitmap == NULL); 510 ASSERT0(brtvd->bv_nblocks); 511 512 avl_create(&brtvd->bv_tree, brt_entry_compare, 513 sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node)); | 504 ASSERT0P(brtvd->bv_entcount); 505 ASSERT0P(brtvd->bv_bitmap); |
514 } else { 515 ASSERT(brtvd->bv_size > 0); 516 ASSERT(brtvd->bv_entcount != NULL); 517 ASSERT(brtvd->bv_bitmap != NULL); | 506 } else { 507 ASSERT(brtvd->bv_size > 0); 508 ASSERT(brtvd->bv_entcount != NULL); 509 ASSERT(brtvd->bv_bitmap != NULL); |
518 ASSERT(brtvd->bv_nblocks > 0); | |
519 /* 520 * TODO: Allow vdev shrinking. We only need to implement 521 * shrinking the on-disk BRT VDEV object. | 510 /* 511 * TODO: Allow vdev shrinking. We only need to implement 512 * shrinking the on-disk BRT VDEV object. |
522 * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset, 523 * size, tx); | 513 * dmu_free_range(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 514 * offset, size, tx); |
524 */ 525 ASSERT3U(brtvd->bv_size, <=, size); 526 527 memcpy(entcount, brtvd->bv_entcount, 528 sizeof (entcount[0]) * MIN(size, brtvd->bv_size)); | 515 */ 516 ASSERT3U(brtvd->bv_size, <=, size); 517 518 memcpy(entcount, brtvd->bv_entcount, 519 sizeof (entcount[0]) * MIN(size, brtvd->bv_size)); |
529 memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks), 530 BT_SIZEOFMAP(brtvd->bv_nblocks))); | |
531 vmem_free(brtvd->bv_entcount, 532 sizeof (entcount[0]) * brtvd->bv_size); | 520 vmem_free(brtvd->bv_entcount, 521 sizeof (entcount[0]) * brtvd->bv_size); |
533 kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); | 522 onblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); 523 memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks), 524 BT_SIZEOFMAP(onblocks))); 525 kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(onblocks)); |
534 } 535 536 brtvd->bv_size = size; 537 brtvd->bv_entcount = entcount; 538 brtvd->bv_bitmap = bitmap; | 526 } 527 528 brtvd->bv_size = size; 529 brtvd->bv_entcount = entcount; 530 brtvd->bv_bitmap = bitmap; |
539 brtvd->bv_nblocks = nblocks; | |
540 if (!brtvd->bv_initiated) { 541 brtvd->bv_need_byteswap = FALSE; 542 brtvd->bv_initiated = TRUE; 543 BRT_DEBUG("BRT VDEV %llu initiated.", 544 (u_longlong_t)brtvd->bv_vdevid); 545 } 546} 547 | 531 if (!brtvd->bv_initiated) { 532 brtvd->bv_need_byteswap = FALSE; 533 brtvd->bv_initiated = TRUE; 534 BRT_DEBUG("BRT VDEV %llu initiated.", 535 (u_longlong_t)brtvd->bv_vdevid); 536 } 537} 538 |
548static void 549brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd) | 539static int 540brt_vdev_load(spa_t *spa, brt_vdev_t *brtvd) |
550{ | 541{ |
551 char name[64]; | |
552 dmu_buf_t *db; 553 brt_vdev_phys_t *bvphys; 554 int error; 555 | 542 dmu_buf_t *db; 543 brt_vdev_phys_t *bvphys; 544 int error; 545 |
556 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, 557 (u_longlong_t)brtvd->bv_vdevid); 558 error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, 559 sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev); 560 if (error != 0) 561 return; | 546 ASSERT(!brtvd->bv_initiated); |
562 ASSERT(brtvd->bv_mos_brtvdev != 0); 563 | 547 ASSERT(brtvd->bv_mos_brtvdev != 0); 548 |
564 error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db); 565 ASSERT0(error); | 549 error = dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 550 FTAG, &db); |
566 if (error != 0) | 551 if (error != 0) |
567 return; | 552 return (error); |
568 569 bvphys = db->db_data; | 553 554 bvphys = db->db_data; |
570 if (brt->brt_rangesize == 0) { 571 brt->brt_rangesize = bvphys->bvp_rangesize; | 555 if (spa->spa_brt_rangesize == 0) { 556 spa->spa_brt_rangesize = bvphys->bvp_rangesize; |
572 } else { | 557 } else { |
573 ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize); | 558 ASSERT3U(spa->spa_brt_rangesize, ==, bvphys->bvp_rangesize); |
574 } 575 | 559 } 560 |
576 ASSERT(!brtvd->bv_initiated); 577 brt_vdev_realloc(brt, brtvd); | 561 brt_vdev_realloc(spa, brtvd); |
578 579 /* TODO: We don't support VDEV shrinking. */ 580 ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size); 581 582 /* 583 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out. 584 */ | 562 563 /* TODO: We don't support VDEV shrinking. */ 564 ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size); 565 566 /* 567 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out. 568 */ |
585 error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0, | 569 error = dmu_read(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0, |
586 MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t), 587 brtvd->bv_entcount, DMU_READ_NO_PREFETCH); | 570 MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t), 571 brtvd->bv_entcount, DMU_READ_NO_PREFETCH); |
588 ASSERT0(error); | 572 if (error != 0) 573 return (error); |
589 | 574 |
575 ASSERT(bvphys->bvp_mos_entries != 0); 576 VERIFY0(dnode_hold(spa->spa_meta_objset, bvphys->bvp_mos_entries, brtvd, 577 &brtvd->bv_mos_entries_dnode)); 578 rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); |
|
590 brtvd->bv_mos_entries = bvphys->bvp_mos_entries; | 579 brtvd->bv_mos_entries = bvphys->bvp_mos_entries; |
591 ASSERT(brtvd->bv_mos_entries != 0); | 580 rw_exit(&brtvd->bv_mos_entries_lock); |
592 brtvd->bv_need_byteswap = 593 (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER); 594 brtvd->bv_totalcount = bvphys->bvp_totalcount; 595 brtvd->bv_usedspace = bvphys->bvp_usedspace; 596 brtvd->bv_savedspace = bvphys->bvp_savedspace; | 581 brtvd->bv_need_byteswap = 582 (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER); 583 brtvd->bv_totalcount = bvphys->bvp_totalcount; 584 brtvd->bv_usedspace = bvphys->bvp_usedspace; 585 brtvd->bv_savedspace = bvphys->bvp_savedspace; |
597 brt->brt_usedspace += brtvd->bv_usedspace; 598 brt->brt_savedspace += brtvd->bv_savedspace; | |
599 600 dmu_buf_rele(db, FTAG); 601 | 586 587 dmu_buf_rele(db, FTAG); 588 |
602 BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu", 603 name, (u_longlong_t)brtvd->bv_mos_brtvdev, | 589 BRT_DEBUG("BRT VDEV %llu loaded: mos_brtvdev=%llu, mos_entries=%llu", 590 (u_longlong_t)brtvd->bv_vdevid, 591 (u_longlong_t)brtvd->bv_mos_brtvdev, |
604 (u_longlong_t)brtvd->bv_mos_entries); | 592 (u_longlong_t)brtvd->bv_mos_entries); |
593 return (0); |
|
605} 606 607static void | 594} 595 596static void |
608brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd) | 597brt_vdev_dealloc(brt_vdev_t *brtvd) |
609{ | 598{ |
610 611 ASSERT(RW_WRITE_HELD(&brt->brt_lock)); | 599 ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); |
612 ASSERT(brtvd->bv_initiated); | 600 ASSERT(brtvd->bv_initiated); |
601 ASSERT0(avl_numnodes(&brtvd->bv_tree)); |
|
613 614 vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size); 615 brtvd->bv_entcount = NULL; | 602 603 vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size); 604 brtvd->bv_entcount = NULL; |
616 kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); | 605 uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); 606 kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(nblocks)); |
617 brtvd->bv_bitmap = NULL; | 607 brtvd->bv_bitmap = NULL; |
618 ASSERT0(avl_numnodes(&brtvd->bv_tree)); 619 avl_destroy(&brtvd->bv_tree); | |
620 621 brtvd->bv_size = 0; | 608 609 brtvd->bv_size = 0; |
622 brtvd->bv_nblocks = 0; | |
623 624 brtvd->bv_initiated = FALSE; 625 BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid); 626} 627 628static void | 610 611 brtvd->bv_initiated = FALSE; 612 BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid); 613} 614 615static void |
629brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) | 616brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) |
630{ 631 char name[64]; 632 uint64_t count; | 617{ 618 char name[64]; 619 uint64_t count; |
633 dmu_buf_t *db; 634 brt_vdev_phys_t *bvphys; | |
635 | 620 |
636 ASSERT(RW_WRITE_HELD(&brt->brt_lock)); | 621 ASSERT(brtvd->bv_initiated); |
637 ASSERT(brtvd->bv_mos_brtvdev != 0); 638 ASSERT(brtvd->bv_mos_entries != 0); | 622 ASSERT(brtvd->bv_mos_brtvdev != 0); 623 ASSERT(brtvd->bv_mos_entries != 0); |
624 ASSERT0(brtvd->bv_totalcount); 625 ASSERT0(brtvd->bv_usedspace); 626 ASSERT0(brtvd->bv_savedspace); |
|
639 | 627 |
640 VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count)); 641 VERIFY0(count); 642 VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx)); 643 BRT_DEBUG("MOS entries destroyed, object=%llu", 644 (u_longlong_t)brtvd->bv_mos_entries); | 628 uint64_t mos_entries = brtvd->bv_mos_entries; 629 rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); |
645 brtvd->bv_mos_entries = 0; | 630 brtvd->bv_mos_entries = 0; |
631 rw_exit(&brtvd->bv_mos_entries_lock); 632 dnode_rele(brtvd->bv_mos_entries_dnode, brtvd); 633 brtvd->bv_mos_entries_dnode = NULL; 634 ASSERT0(zap_count(spa->spa_meta_objset, mos_entries, &count)); 635 ASSERT0(count); 636 VERIFY0(zap_destroy(spa->spa_meta_objset, mos_entries, tx)); 637 BRT_DEBUG("MOS entries destroyed, object=%llu", 638 (u_longlong_t)mos_entries); |
|
646 | 639 |
647 VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db)); 648 bvphys = db->db_data; 649 ASSERT0(bvphys->bvp_totalcount); 650 ASSERT0(bvphys->bvp_usedspace); 651 ASSERT0(bvphys->bvp_savedspace); 652 dmu_buf_rele(db, FTAG); 653 654 VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx)); | 640 VERIFY0(dmu_object_free(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 641 tx)); |
655 BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu", 656 (u_longlong_t)brtvd->bv_mos_brtvdev); 657 brtvd->bv_mos_brtvdev = 0; | 642 BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu", 643 (u_longlong_t)brtvd->bv_mos_brtvdev); 644 brtvd->bv_mos_brtvdev = 0; |
645 brtvd->bv_entcount_dirty = FALSE; |
|
658 659 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, 660 (u_longlong_t)brtvd->bv_vdevid); | 646 647 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, 648 (u_longlong_t)brtvd->bv_vdevid); |
661 VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx)); | 649 VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 650 name, tx)); |
662 BRT_DEBUG("Pool directory object removed, object=%s", name); 663 | 651 BRT_DEBUG("Pool directory object removed, object=%s", name); 652 |
664 brt_vdev_dealloc(brt, brtvd); | 653 brtvd->bv_meta_dirty = FALSE; |
665 | 654 |
666 spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx); | 655 rw_enter(&brtvd->bv_lock, RW_WRITER); 656 brt_vdev_dealloc(brtvd); 657 rw_exit(&brtvd->bv_lock); 658 659 spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx); |
667} 668 669static void | 660} 661 662static void |
670brt_vdevs_expand(brt_t *brt, uint64_t nvdevs) | 663brt_vdevs_expand(spa_t *spa, uint64_t nvdevs) |
671{ | 664{ |
672 brt_vdev_t *brtvd, *vdevs; 673 uint64_t vdevid; | 665 brt_vdev_t **vdevs; |
674 | 666 |
675 ASSERT(RW_WRITE_HELD(&brt->brt_lock)); 676 ASSERT3U(nvdevs, >, brt->brt_nvdevs); | 667 ASSERT(RW_WRITE_HELD(&spa->spa_brt_lock)); 668 ASSERT3U(nvdevs, >=, spa->spa_brt_nvdevs); |
677 | 669 |
678 vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP); 679 if (brt->brt_nvdevs > 0) { 680 ASSERT(brt->brt_vdevs != NULL); | 670 if (nvdevs == spa->spa_brt_nvdevs) 671 return; |
681 | 672 |
682 memcpy(vdevs, brt->brt_vdevs, 683 sizeof (brt_vdev_t) * brt->brt_nvdevs); 684 kmem_free(brt->brt_vdevs, 685 sizeof (brt_vdev_t) * brt->brt_nvdevs); | 673 vdevs = kmem_zalloc(sizeof (*spa->spa_brt_vdevs) * nvdevs, KM_SLEEP); 674 if (spa->spa_brt_nvdevs > 0) { 675 ASSERT(spa->spa_brt_vdevs != NULL); 676 677 memcpy(vdevs, spa->spa_brt_vdevs, 678 sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs); 679 kmem_free(spa->spa_brt_vdevs, 680 sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs); |
686 } | 681 } |
687 for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) { 688 brtvd = &vdevs[vdevid]; | 682 spa->spa_brt_vdevs = vdevs; |
689 | 683 |
684 for (uint64_t vdevid = spa->spa_brt_nvdevs; vdevid < nvdevs; vdevid++) { 685 brt_vdev_t *brtvd = kmem_zalloc(sizeof (*brtvd), KM_SLEEP); 686 rw_init(&brtvd->bv_lock, NULL, RW_DEFAULT, NULL); |
|
690 brtvd->bv_vdevid = vdevid; 691 brtvd->bv_initiated = FALSE; | 687 brtvd->bv_vdevid = vdevid; 688 brtvd->bv_initiated = FALSE; |
689 rw_init(&brtvd->bv_mos_entries_lock, NULL, RW_DEFAULT, NULL); 690 avl_create(&brtvd->bv_tree, brt_entry_compare, 691 sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node)); 692 for (int i = 0; i < TXG_SIZE; i++) { 693 avl_create(&brtvd->bv_pending_tree[i], 694 brt_entry_compare, sizeof (brt_entry_t), 695 offsetof(brt_entry_t, bre_node)); 696 } 697 mutex_init(&brtvd->bv_pending_lock, NULL, MUTEX_DEFAULT, NULL); 698 spa->spa_brt_vdevs[vdevid] = brtvd; |
|
692 } 693 694 BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.", | 699 } 700 701 BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.", |
695 (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs); 696 697 brt->brt_vdevs = vdevs; 698 brt->brt_nvdevs = nvdevs; | 702 (u_longlong_t)spa->spa_brt_nvdevs, (u_longlong_t)nvdevs); 703 spa->spa_brt_nvdevs = nvdevs; |
699} 700 701static boolean_t | 704} 705 706static boolean_t |
702brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre) | 707brt_vdev_lookup(spa_t *spa, brt_vdev_t *brtvd, uint64_t offset) |
703{ | 708{ |
704 uint64_t idx; 705 706 ASSERT(RW_LOCK_HELD(&brt->brt_lock)); 707 708 idx = bre->bre_offset / brt->brt_rangesize; 709 if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) { | 709 uint64_t idx = offset / spa->spa_brt_rangesize; 710 if (idx < brtvd->bv_size) { |
710 /* VDEV wasn't expanded. */ 711 return (brt_vdev_entcount_get(brtvd, idx) > 0); 712 } | 711 /* VDEV wasn't expanded. */ 712 return (brt_vdev_entcount_get(brtvd, idx) > 0); 713 } |
713 | |
714 return (FALSE); 715} 716 717static void | 714 return (FALSE); 715} 716 717static void |
718brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, 719 uint64_t dsize) | 718brt_vdev_addref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre, 719 uint64_t dsize, uint64_t count) |
720{ 721 uint64_t idx; 722 | 720{ 721 uint64_t idx; 722 |
723 ASSERT(RW_LOCK_HELD(&brt->brt_lock)); 724 ASSERT(brtvd != NULL); 725 ASSERT(brtvd->bv_entcount != NULL); | 723 ASSERT(brtvd->bv_initiated); |
726 | 724 |
727 brt->brt_savedspace += dsize; 728 brtvd->bv_savedspace += dsize; | 725 brtvd->bv_savedspace += dsize * count; |
729 brtvd->bv_meta_dirty = TRUE; 730 | 726 brtvd->bv_meta_dirty = TRUE; 727 |
731 if (bre->bre_refcount > 1) { | 728 if (bre->bre_count > 0) |
732 return; | 729 return; |
733 } | |
734 | 730 |
735 brt->brt_usedspace += dsize; | |
736 brtvd->bv_usedspace += dsize; 737 | 731 brtvd->bv_usedspace += dsize; 732 |
738 idx = bre->bre_offset / brt->brt_rangesize; | 733 idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize; |
739 if (idx >= brtvd->bv_size) { 740 /* VDEV has been expanded. */ | 734 if (idx >= brtvd->bv_size) { 735 /* VDEV has been expanded. */ |
741 brt_vdev_realloc(brt, brtvd); | 736 rw_enter(&brtvd->bv_lock, RW_WRITER); 737 brt_vdev_realloc(spa, brtvd); 738 rw_exit(&brtvd->bv_lock); |
742 } 743 744 ASSERT3U(idx, <, brtvd->bv_size); 745 746 brtvd->bv_totalcount++; 747 brt_vdev_entcount_inc(brtvd, idx); 748 brtvd->bv_entcount_dirty = TRUE; 749 idx = idx / BRT_BLOCKSIZE / 8; 750 BT_SET(brtvd->bv_bitmap, idx); | 739 } 740 741 ASSERT3U(idx, <, brtvd->bv_size); 742 743 brtvd->bv_totalcount++; 744 brt_vdev_entcount_inc(brtvd, idx); 745 brtvd->bv_entcount_dirty = TRUE; 746 idx = idx / BRT_BLOCKSIZE / 8; 747 BT_SET(brtvd->bv_bitmap, idx); |
751 752#ifdef ZFS_DEBUG 753 if (zfs_flags & ZFS_DEBUG_BRT) 754 brt_vdev_dump(brtvd); 755#endif | |
756} 757 758static void | 748} 749 750static void |
759brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, | 751brt_vdev_decref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre, |
760 uint64_t dsize) 761{ 762 uint64_t idx; 763 | 752 uint64_t dsize) 753{ 754 uint64_t idx; 755 |
764 ASSERT(RW_WRITE_HELD(&brt->brt_lock)); 765 ASSERT(brtvd != NULL); 766 ASSERT(brtvd->bv_entcount != NULL); | 756 ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); 757 ASSERT(brtvd->bv_initiated); |
767 | 758 |
768 brt->brt_savedspace -= dsize; | |
769 brtvd->bv_savedspace -= dsize; 770 brtvd->bv_meta_dirty = TRUE; 771 | 759 brtvd->bv_savedspace -= dsize; 760 brtvd->bv_meta_dirty = TRUE; 761 |
772 if (bre->bre_refcount > 0) { | 762 if (bre->bre_count > 0) |
773 return; | 763 return; |
774 } | |
775 | 764 |
776 brt->brt_usedspace -= dsize; | |
777 brtvd->bv_usedspace -= dsize; 778 | 765 brtvd->bv_usedspace -= dsize; 766 |
779 idx = bre->bre_offset / brt->brt_rangesize; | 767 idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize; |
780 ASSERT3U(idx, <, brtvd->bv_size); 781 782 ASSERT(brtvd->bv_totalcount > 0); 783 brtvd->bv_totalcount--; 784 brt_vdev_entcount_dec(brtvd, idx); 785 brtvd->bv_entcount_dirty = TRUE; 786 idx = idx / BRT_BLOCKSIZE / 8; 787 BT_SET(brtvd->bv_bitmap, idx); | 768 ASSERT3U(idx, <, brtvd->bv_size); 769 770 ASSERT(brtvd->bv_totalcount > 0); 771 brtvd->bv_totalcount--; 772 brt_vdev_entcount_dec(brtvd, idx); 773 brtvd->bv_entcount_dirty = TRUE; 774 idx = idx / BRT_BLOCKSIZE / 8; 775 BT_SET(brtvd->bv_bitmap, idx); |
788 789#ifdef ZFS_DEBUG 790 if (zfs_flags & ZFS_DEBUG_BRT) 791 brt_vdev_dump(brtvd); 792#endif | |
793} 794 795static void | 776} 777 778static void |
796brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) | 779brt_vdev_sync(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) |
797{ 798 dmu_buf_t *db; 799 brt_vdev_phys_t *bvphys; 800 801 ASSERT(brtvd->bv_meta_dirty); 802 ASSERT(brtvd->bv_mos_brtvdev != 0); 803 ASSERT(dmu_tx_is_syncing(tx)); 804 | 780{ 781 dmu_buf_t *db; 782 brt_vdev_phys_t *bvphys; 783 784 ASSERT(brtvd->bv_meta_dirty); 785 ASSERT(brtvd->bv_mos_brtvdev != 0); 786 ASSERT(dmu_tx_is_syncing(tx)); 787 |
805 VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db)); | 788 VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 789 FTAG, &db)); |
806 807 if (brtvd->bv_entcount_dirty) { 808 /* 809 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks. 810 */ | 790 791 if (brtvd->bv_entcount_dirty) { 792 /* 793 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks. 794 */ |
811 dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0, | 795 dmu_write(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0, |
812 brtvd->bv_size * sizeof (brtvd->bv_entcount[0]), 813 brtvd->bv_entcount, tx); | 796 brtvd->bv_size * sizeof (brtvd->bv_entcount[0]), 797 brtvd->bv_entcount, tx); |
814 memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks)); | 798 uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); 799 memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(nblocks)); |
815 brtvd->bv_entcount_dirty = FALSE; 816 } 817 818 dmu_buf_will_dirty(db, tx); 819 bvphys = db->db_data; 820 bvphys->bvp_mos_entries = brtvd->bv_mos_entries; 821 bvphys->bvp_size = brtvd->bv_size; 822 if (brtvd->bv_need_byteswap) { 823 bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER; 824 } else { 825 bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER; 826 } 827 bvphys->bvp_totalcount = brtvd->bv_totalcount; | 800 brtvd->bv_entcount_dirty = FALSE; 801 } 802 803 dmu_buf_will_dirty(db, tx); 804 bvphys = db->db_data; 805 bvphys->bvp_mos_entries = brtvd->bv_mos_entries; 806 bvphys->bvp_size = brtvd->bv_size; 807 if (brtvd->bv_need_byteswap) { 808 bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER; 809 } else { 810 bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER; 811 } 812 bvphys->bvp_totalcount = brtvd->bv_totalcount; |
828 bvphys->bvp_rangesize = brt->brt_rangesize; | 813 bvphys->bvp_rangesize = spa->spa_brt_rangesize; |
829 bvphys->bvp_usedspace = brtvd->bv_usedspace; 830 bvphys->bvp_savedspace = brtvd->bv_savedspace; 831 dmu_buf_rele(db, FTAG); 832 833 brtvd->bv_meta_dirty = FALSE; 834} 835 836static void | 814 bvphys->bvp_usedspace = brtvd->bv_usedspace; 815 bvphys->bvp_savedspace = brtvd->bv_savedspace; 816 dmu_buf_rele(db, FTAG); 817 818 brtvd->bv_meta_dirty = FALSE; 819} 820 821static void |
837brt_vdevs_alloc(brt_t *brt, boolean_t load) | 822brt_vdevs_free(spa_t *spa) |
838{ | 823{ |
839 brt_vdev_t *brtvd; 840 uint64_t vdevid; 841 842 brt_wlock(brt); 843 844 brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children); 845 846 if (load) { 847 for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { 848 brtvd = &brt->brt_vdevs[vdevid]; 849 ASSERT(brtvd->bv_entcount == NULL); 850 851 brt_vdev_load(brt, brtvd); 852 } 853 } 854 855 if (brt->brt_rangesize == 0) { 856 brt->brt_rangesize = BRT_RANGESIZE; 857 } 858 859 brt_unlock(brt); 860} 861 862static void 863brt_vdevs_free(brt_t *brt) 864{ 865 brt_vdev_t *brtvd; 866 uint64_t vdevid; 867 868 brt_wlock(brt); 869 870 for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { 871 brtvd = &brt->brt_vdevs[vdevid]; | 824 if (spa->spa_brt_vdevs == 0) 825 return; 826 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { 827 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; 828 rw_enter(&brtvd->bv_lock, RW_WRITER); |
872 if (brtvd->bv_initiated) | 829 if (brtvd->bv_initiated) |
873 brt_vdev_dealloc(brt, brtvd); | 830 brt_vdev_dealloc(brtvd); 831 rw_exit(&brtvd->bv_lock); 832 rw_destroy(&brtvd->bv_lock); 833 if (brtvd->bv_mos_entries != 0) 834 dnode_rele(brtvd->bv_mos_entries_dnode, brtvd); 835 rw_destroy(&brtvd->bv_mos_entries_lock); 836 avl_destroy(&brtvd->bv_tree); 837 for (int i = 0; i < TXG_SIZE; i++) 838 avl_destroy(&brtvd->bv_pending_tree[i]); 839 mutex_destroy(&brtvd->bv_pending_lock); 840 kmem_free(brtvd, sizeof (*brtvd)); |
874 } | 841 } |
875 kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs); 876 877 brt_unlock(brt); | 842 kmem_free(spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) * 843 spa->spa_brt_nvdevs); |
878} 879 880static void 881brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp) 882{ 883 | 844} 845 846static void 847brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp) 848{ 849 |
884 bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]); 885 bre->bre_refcount = 0; | 850 bre->bre_bp = *bp; 851 bre->bre_count = 0; 852 bre->bre_pcount = 0; |
886 887 *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]); 888} 889 890static int | 853 854 *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]); 855} 856 857static int |
891brt_entry_compare(const void *x1, const void *x2) | 858brt_entry_lookup(brt_vdev_t *brtvd, brt_entry_t *bre) |
892{ | 859{ |
893 const brt_entry_t *bre1 = x1; 894 const brt_entry_t *bre2 = x2; | 860 uint64_t off = BRE_OFFSET(bre); |
895 | 861 |
896 return (TREE_CMP(bre1->bre_offset, bre2->bre_offset)); | 862 return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode, 863 &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), &bre->bre_count)); |
897} 898 | 864} 865 |
899static int 900brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre) 901{ 902 uint64_t mos_entries; 903 int error; 904 905 ASSERT(RW_LOCK_HELD(&brt->brt_lock)); 906 907 if (!brt_vdev_lookup(brt, brtvd, bre)) 908 return (SET_ERROR(ENOENT)); 909 910 /* 911 * Remember mos_entries object number. After we reacquire the BRT lock, 912 * the brtvd pointer may be invalid. 913 */ 914 mos_entries = brtvd->bv_mos_entries; 915 if (mos_entries == 0) 916 return (SET_ERROR(ENOENT)); 917 918 brt_unlock(brt); 919 920 error = zap_lookup_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, 921 BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), &bre->bre_refcount); 922 923 brt_wlock(brt); 924 925 return (error); 926} 927 928static void 929brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre) 930{ 931 brt_vdev_t *brtvd; 932 uint64_t mos_entries = 0; 933 934 brt_rlock(brt); 935 brtvd = brt_vdev(brt, vdevid); 936 if (brtvd != NULL) 937 mos_entries = brtvd->bv_mos_entries; 938 brt_unlock(brt); 939 940 if (mos_entries == 0) 941 return; 942 943 (void) zap_prefetch_uint64(brt->brt_mos, mos_entries, 944 (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS); 945} 946 | |
947/* 948 * Return TRUE if we _can_ have BRT entry for this bp. It might be false 949 * positive, but gives us quick answer if we should look into BRT, which 950 * may require reads and thus will be more expensive. 951 */ 952boolean_t 953brt_maybe_exists(spa_t *spa, const blkptr_t *bp) 954{ | 866/* 867 * Return TRUE if we _can_ have BRT entry for this bp. It might be false 868 * positive, but gives us quick answer if we should look into BRT, which 869 * may require reads and thus will be more expensive. 870 */ 871boolean_t 872brt_maybe_exists(spa_t *spa, const blkptr_t *bp) 873{ |
955 brt_t *brt = spa->spa_brt; 956 brt_vdev_t *brtvd; 957 brt_entry_t bre_search; 958 boolean_t mayexists = FALSE; 959 uint64_t vdevid; | |
960 | 874 |
961 brt_entry_fill(bp, &bre_search, &vdevid); | 875 if (spa->spa_brt_nvdevs == 0) 876 return (B_FALSE); |
962 | 877 |
963 brt_rlock(brt); | 878 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]); 879 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); 880 if (brtvd == NULL || !brtvd->bv_initiated) 881 return (FALSE); |
964 | 882 |
965 brtvd = brt_vdev(brt, vdevid); 966 if (brtvd != NULL && brtvd->bv_initiated) { 967 if (!avl_is_empty(&brtvd->bv_tree) || 968 brt_vdev_lookup(brt, brtvd, &bre_search)) { 969 mayexists = TRUE; 970 } 971 } 972 973 brt_unlock(brt); 974 975 return (mayexists); | 883 /* 884 * We don't need locks here, since bv_entcount pointer must be 885 * stable at this point, and we don't care about false positive 886 * races here, while false negative should be impossible, since 887 * all brt_vdev_addref() have already completed by this point. 888 */ 889 uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]); 890 return (brt_vdev_lookup(spa, brtvd, off)); |
976} 977 978uint64_t 979brt_get_dspace(spa_t *spa) 980{ | 891} 892 893uint64_t 894brt_get_dspace(spa_t *spa) 895{ |
981 brt_t *brt = spa->spa_brt; 982 983 if (brt == NULL) | 896 if (spa->spa_brt_nvdevs == 0) |
984 return (0); 985 | 897 return (0); 898 |
986 return (brt->brt_savedspace); | 899 brt_rlock(spa); 900 uint64_t s = 0; 901 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) 902 s += spa->spa_brt_vdevs[vdevid]->bv_savedspace; 903 brt_unlock(spa); 904 return (s); |
987} 988 989uint64_t 990brt_get_used(spa_t *spa) 991{ | 905} 906 907uint64_t 908brt_get_used(spa_t *spa) 909{ |
992 brt_t *brt = spa->spa_brt; 993 994 if (brt == NULL) | 910 if (spa->spa_brt_nvdevs == 0) |
995 return (0); 996 | 911 return (0); 912 |
997 return (brt->brt_usedspace); | 913 brt_rlock(spa); 914 uint64_t s = 0; 915 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) 916 s += spa->spa_brt_vdevs[vdevid]->bv_usedspace; 917 brt_unlock(spa); 918 return (s); |
998} 999 1000uint64_t 1001brt_get_saved(spa_t *spa) 1002{ | 919} 920 921uint64_t 922brt_get_saved(spa_t *spa) 923{ |
1003 brt_t *brt = spa->spa_brt; 1004 1005 if (brt == NULL) 1006 return (0); 1007 1008 return (brt->brt_savedspace); | 924 return (brt_get_dspace(spa)); |
1009} 1010 1011uint64_t 1012brt_get_ratio(spa_t *spa) 1013{ | 925} 926 927uint64_t 928brt_get_ratio(spa_t *spa) 929{ |
1014 brt_t *brt = spa->spa_brt; 1015 1016 if (brt->brt_usedspace == 0) | 930 uint64_t used = brt_get_used(spa); 931 if (used == 0) |
1017 return (100); | 932 return (100); |
1018 1019 return ((brt->brt_usedspace + brt->brt_savedspace) * 100 / 1020 brt->brt_usedspace); | 933 return ((used + brt_get_saved(spa)) * 100 / used); |
1021} 1022 1023static int 1024brt_kstats_update(kstat_t *ksp, int rw) 1025{ 1026 brt_stats_t *bs = ksp->ks_data; 1027 1028 if (rw == KSTAT_WRITE) 1029 return (EACCES); 1030 | 934} 935 936static int 937brt_kstats_update(kstat_t *ksp, int rw) 938{ 939 brt_stats_t *bs = ksp->ks_data; 940 941 if (rw == KSTAT_WRITE) 942 return (EACCES); 943 |
1031 bs->brt_addref_entry_in_memory.value.ui64 = 1032 wmsum_value(&brt_sums.brt_addref_entry_in_memory); | |
1033 bs->brt_addref_entry_not_on_disk.value.ui64 = 1034 wmsum_value(&brt_sums.brt_addref_entry_not_on_disk); 1035 bs->brt_addref_entry_on_disk.value.ui64 = 1036 wmsum_value(&brt_sums.brt_addref_entry_on_disk); | 944 bs->brt_addref_entry_not_on_disk.value.ui64 = 945 wmsum_value(&brt_sums.brt_addref_entry_not_on_disk); 946 bs->brt_addref_entry_on_disk.value.ui64 = 947 wmsum_value(&brt_sums.brt_addref_entry_on_disk); |
1037 bs->brt_addref_entry_read_lost_race.value.ui64 = 1038 wmsum_value(&brt_sums.brt_addref_entry_read_lost_race); | |
1039 bs->brt_decref_entry_in_memory.value.ui64 = 1040 wmsum_value(&brt_sums.brt_decref_entry_in_memory); 1041 bs->brt_decref_entry_loaded_from_disk.value.ui64 = 1042 wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk); 1043 bs->brt_decref_entry_not_in_memory.value.ui64 = 1044 wmsum_value(&brt_sums.brt_decref_entry_not_in_memory); | 948 bs->brt_decref_entry_in_memory.value.ui64 = 949 wmsum_value(&brt_sums.brt_decref_entry_in_memory); 950 bs->brt_decref_entry_loaded_from_disk.value.ui64 = 951 wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk); 952 bs->brt_decref_entry_not_in_memory.value.ui64 = 953 wmsum_value(&brt_sums.brt_decref_entry_not_in_memory); |
1045 bs->brt_decref_entry_not_on_disk.value.ui64 = 1046 wmsum_value(&brt_sums.brt_decref_entry_not_on_disk); | |
1047 bs->brt_decref_entry_read_lost_race.value.ui64 = 1048 wmsum_value(&brt_sums.brt_decref_entry_read_lost_race); 1049 bs->brt_decref_entry_still_referenced.value.ui64 = 1050 wmsum_value(&brt_sums.brt_decref_entry_still_referenced); 1051 bs->brt_decref_free_data_later.value.ui64 = 1052 wmsum_value(&brt_sums.brt_decref_free_data_later); 1053 bs->brt_decref_free_data_now.value.ui64 = 1054 wmsum_value(&brt_sums.brt_decref_free_data_now); 1055 bs->brt_decref_no_entry.value.ui64 = 1056 wmsum_value(&brt_sums.brt_decref_no_entry); 1057 1058 return (0); 1059} 1060 1061static void 1062brt_stat_init(void) 1063{ 1064 | 954 bs->brt_decref_entry_read_lost_race.value.ui64 = 955 wmsum_value(&brt_sums.brt_decref_entry_read_lost_race); 956 bs->brt_decref_entry_still_referenced.value.ui64 = 957 wmsum_value(&brt_sums.brt_decref_entry_still_referenced); 958 bs->brt_decref_free_data_later.value.ui64 = 959 wmsum_value(&brt_sums.brt_decref_free_data_later); 960 bs->brt_decref_free_data_now.value.ui64 = 961 wmsum_value(&brt_sums.brt_decref_free_data_now); 962 bs->brt_decref_no_entry.value.ui64 = 963 wmsum_value(&brt_sums.brt_decref_no_entry); 964 965 return (0); 966} 967 968static void 969brt_stat_init(void) 970{ 971 |
1065 wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0); | |
1066 wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0); 1067 wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0); | 972 wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0); 973 wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0); |
1068 wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0); | |
1069 wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0); 1070 wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0); 1071 wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0); | 974 wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0); 975 wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0); 976 wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0); |
1072 wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0); | |
1073 wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0); 1074 wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0); 1075 wmsum_init(&brt_sums.brt_decref_free_data_later, 0); 1076 wmsum_init(&brt_sums.brt_decref_free_data_now, 0); 1077 wmsum_init(&brt_sums.brt_decref_no_entry, 0); 1078 1079 brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED, 1080 sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); --- 7 unchanged lines hidden (view full) --- 1088static void 1089brt_stat_fini(void) 1090{ 1091 if (brt_ksp != NULL) { 1092 kstat_delete(brt_ksp); 1093 brt_ksp = NULL; 1094 } 1095 | 977 wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0); 978 wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0); 979 wmsum_init(&brt_sums.brt_decref_free_data_later, 0); 980 wmsum_init(&brt_sums.brt_decref_free_data_now, 0); 981 wmsum_init(&brt_sums.brt_decref_no_entry, 0); 982 983 brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED, 984 sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); --- 7 unchanged lines hidden (view full) --- 992static void 993brt_stat_fini(void) 994{ 995 if (brt_ksp != NULL) { 996 kstat_delete(brt_ksp); 997 brt_ksp = NULL; 998 } 999 |
1096 wmsum_fini(&brt_sums.brt_addref_entry_in_memory); | |
1097 wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk); 1098 wmsum_fini(&brt_sums.brt_addref_entry_on_disk); | 1000 wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk); 1001 wmsum_fini(&brt_sums.brt_addref_entry_on_disk); |
1099 wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race); | |
1100 wmsum_fini(&brt_sums.brt_decref_entry_in_memory); 1101 wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk); 1102 wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory); | 1002 wmsum_fini(&brt_sums.brt_decref_entry_in_memory); 1003 wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk); 1004 wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory); |
1103 wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk); | |
1104 wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race); 1105 wmsum_fini(&brt_sums.brt_decref_entry_still_referenced); 1106 wmsum_fini(&brt_sums.brt_decref_free_data_later); 1107 wmsum_fini(&brt_sums.brt_decref_free_data_now); 1108 wmsum_fini(&brt_sums.brt_decref_no_entry); 1109} 1110 1111void 1112brt_init(void) 1113{ 1114 brt_entry_cache = kmem_cache_create("brt_entry_cache", 1115 sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); | 1005 wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race); 1006 wmsum_fini(&brt_sums.brt_decref_entry_still_referenced); 1007 wmsum_fini(&brt_sums.brt_decref_free_data_later); 1008 wmsum_fini(&brt_sums.brt_decref_free_data_now); 1009 wmsum_fini(&brt_sums.brt_decref_no_entry); 1010} 1011 1012void 1013brt_init(void) 1014{ 1015 brt_entry_cache = kmem_cache_create("brt_entry_cache", 1016 sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); |
1116 brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache", 1117 sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); | |
1118 1119 brt_stat_init(); 1120} 1121 1122void 1123brt_fini(void) 1124{ 1125 brt_stat_fini(); 1126 1127 kmem_cache_destroy(brt_entry_cache); | 1017 1018 brt_stat_init(); 1019} 1020 1021void 1022brt_fini(void) 1023{ 1024 brt_stat_fini(); 1025 1026 kmem_cache_destroy(brt_entry_cache); |
1128 kmem_cache_destroy(brt_pending_entry_cache); | |
1129} 1130 | 1027} 1028 |
1131static brt_entry_t * 1132brt_entry_alloc(const brt_entry_t *bre_init) 1133{ 1134 brt_entry_t *bre; 1135 1136 bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP); 1137 bre->bre_offset = bre_init->bre_offset; 1138 bre->bre_refcount = bre_init->bre_refcount; 1139 1140 return (bre); 1141} 1142 1143static void 1144brt_entry_free(brt_entry_t *bre) 1145{ 1146 1147 kmem_cache_free(brt_entry_cache, bre); 1148} 1149 1150static void 1151brt_entry_addref(brt_t *brt, const blkptr_t *bp) 1152{ 1153 brt_vdev_t *brtvd; 1154 brt_entry_t *bre, *racebre; 1155 brt_entry_t bre_search; 1156 avl_index_t where; 1157 uint64_t vdevid; 1158 int error; 1159 1160 ASSERT(!RW_WRITE_HELD(&brt->brt_lock)); 1161 1162 brt_entry_fill(bp, &bre_search, &vdevid); 1163 1164 brt_wlock(brt); 1165 1166 brtvd = brt_vdev(brt, vdevid); 1167 if (brtvd == NULL) { 1168 ASSERT3U(vdevid, >=, brt->brt_nvdevs); 1169 1170 /* New VDEV was added. */ 1171 brt_vdevs_expand(brt, vdevid + 1); 1172 brtvd = brt_vdev(brt, vdevid); 1173 } 1174 ASSERT(brtvd != NULL); 1175 if (!brtvd->bv_initiated) 1176 brt_vdev_realloc(brt, brtvd); 1177 1178 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); 1179 if (bre != NULL) { 1180 BRTSTAT_BUMP(brt_addref_entry_in_memory); 1181 } else { 1182 /* 1183 * brt_entry_lookup() may drop the BRT (read) lock and 1184 * reacquire it (write). 1185 */ 1186 error = brt_entry_lookup(brt, brtvd, &bre_search); 1187 /* bre_search now contains correct bre_refcount */ 1188 ASSERT(error == 0 || error == ENOENT); 1189 if (error == 0) 1190 BRTSTAT_BUMP(brt_addref_entry_on_disk); 1191 else 1192 BRTSTAT_BUMP(brt_addref_entry_not_on_disk); 1193 /* 1194 * When the BRT lock was dropped, brt_vdevs[] may have been 1195 * expanded and reallocated, we need to update brtvd's pointer. 
1196 */ 1197 brtvd = brt_vdev(brt, vdevid); 1198 ASSERT(brtvd != NULL); 1199 1200 racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); 1201 if (racebre == NULL) { 1202 bre = brt_entry_alloc(&bre_search); 1203 ASSERT(RW_WRITE_HELD(&brt->brt_lock)); 1204 avl_insert(&brtvd->bv_tree, bre, where); 1205 brt->brt_nentries++; 1206 } else { 1207 /* 1208 * The entry was added when the BRT lock was dropped in 1209 * brt_entry_lookup(). 1210 */ 1211 BRTSTAT_BUMP(brt_addref_entry_read_lost_race); 1212 bre = racebre; 1213 } 1214 } 1215 bre->bre_refcount++; 1216 brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp)); 1217 1218 brt_unlock(brt); 1219} 1220 | |
1221/* Return TRUE if block should be freed immediately. */ 1222boolean_t 1223brt_entry_decref(spa_t *spa, const blkptr_t *bp) 1224{ | 1029/* Return TRUE if block should be freed immediately. */ 1030boolean_t 1031brt_entry_decref(spa_t *spa, const blkptr_t *bp) 1032{ |
1225 brt_t *brt = spa->spa_brt; 1226 brt_vdev_t *brtvd; | |
1227 brt_entry_t *bre, *racebre; 1228 brt_entry_t bre_search; 1229 avl_index_t where; 1230 uint64_t vdevid; 1231 int error; 1232 1233 brt_entry_fill(bp, &bre_search, &vdevid); 1234 | 1033 brt_entry_t *bre, *racebre; 1034 brt_entry_t bre_search; 1035 avl_index_t where; 1036 uint64_t vdevid; 1037 int error; 1038 1039 brt_entry_fill(bp, &bre_search, &vdevid); 1040 |
1235 brt_wlock(brt); 1236 1237 brtvd = brt_vdev(brt, vdevid); | 1041 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); |
1238 ASSERT(brtvd != NULL); 1239 | 1042 ASSERT(brtvd != NULL); 1043 |
1044 rw_enter(&brtvd->bv_lock, RW_WRITER); 1045 ASSERT(brtvd->bv_initiated); |
|
1240 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); 1241 if (bre != NULL) { 1242 BRTSTAT_BUMP(brt_decref_entry_in_memory); 1243 goto out; 1244 } else { 1245 BRTSTAT_BUMP(brt_decref_entry_not_in_memory); 1246 } | 1046 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); 1047 if (bre != NULL) { 1048 BRTSTAT_BUMP(brt_decref_entry_in_memory); 1049 goto out; 1050 } else { 1051 BRTSTAT_BUMP(brt_decref_entry_not_in_memory); 1052 } |
1053 rw_exit(&brtvd->bv_lock); |
|
1247 | 1054 |
1248 /* 1249 * brt_entry_lookup() may drop the BRT lock and reacquire it. 1250 */ 1251 error = brt_entry_lookup(brt, brtvd, &bre_search); 1252 /* bre_search now contains correct bre_refcount */ 1253 ASSERT(error == 0 || error == ENOENT); 1254 /* 1255 * When the BRT lock was dropped, brt_vdevs[] may have been expanded 1256 * and reallocated, we need to update brtvd's pointer. 1257 */ 1258 brtvd = brt_vdev(brt, vdevid); 1259 ASSERT(brtvd != NULL); 1260 | 1055 error = brt_entry_lookup(brtvd, &bre_search); 1056 /* bre_search now contains correct bre_count */ |
1261 if (error == ENOENT) { | 1057 if (error == ENOENT) { |
1262 BRTSTAT_BUMP(brt_decref_entry_not_on_disk); 1263 bre = NULL; 1264 goto out; | 1058 BRTSTAT_BUMP(brt_decref_no_entry); 1059 return (B_TRUE); |
1265 } | 1060 } |
1061 ASSERT0(error); |
|
1266 | 1062 |
1063 rw_enter(&brtvd->bv_lock, RW_WRITER); |
|
1267 racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); 1268 if (racebre != NULL) { | 1064 racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); 1065 if (racebre != NULL) { |
1269 /* 1270 * The entry was added when the BRT lock was dropped in 1271 * brt_entry_lookup(). 1272 */ | 1066 /* The entry was added when the lock was dropped. */ |
1273 BRTSTAT_BUMP(brt_decref_entry_read_lost_race); 1274 bre = racebre; 1275 goto out; 1276 } 1277 1278 BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk); | 1067 BRTSTAT_BUMP(brt_decref_entry_read_lost_race); 1068 bre = racebre; 1069 goto out; 1070 } 1071 1072 BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk); |
1279 bre = brt_entry_alloc(&bre_search); 1280 ASSERT(RW_WRITE_HELD(&brt->brt_lock)); | 1073 bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP); 1074 bre->bre_bp = bre_search.bre_bp; 1075 bre->bre_count = bre_search.bre_count; 1076 bre->bre_pcount = 0; |
1281 avl_insert(&brtvd->bv_tree, bre, where); | 1077 avl_insert(&brtvd->bv_tree, bre, where); |
1282 brt->brt_nentries++; | |
1283 1284out: | 1078 1079out: |
1285 if (bre == NULL) { 1286 /* 1287 * This is a free of a regular (not cloned) block. 1288 */ 1289 brt_unlock(brt); 1290 BRTSTAT_BUMP(brt_decref_no_entry); 1291 return (B_TRUE); 1292 } 1293 if (bre->bre_refcount == 0) { 1294 brt_unlock(brt); | 1080 if (bre->bre_count == 0) { 1081 rw_exit(&brtvd->bv_lock); |
1295 BRTSTAT_BUMP(brt_decref_free_data_now); 1296 return (B_TRUE); 1297 } 1298 | 1082 BRTSTAT_BUMP(brt_decref_free_data_now); 1083 return (B_TRUE); 1084 } 1085 |
1299 ASSERT(bre->bre_refcount > 0); 1300 bre->bre_refcount--; 1301 if (bre->bre_refcount == 0) | 1086 bre->bre_pcount--; 1087 ASSERT(bre->bre_count > 0); 1088 bre->bre_count--; 1089 if (bre->bre_count == 0) |
1302 BRTSTAT_BUMP(brt_decref_free_data_later); 1303 else 1304 BRTSTAT_BUMP(brt_decref_entry_still_referenced); | 1090 BRTSTAT_BUMP(brt_decref_free_data_later); 1091 else 1092 BRTSTAT_BUMP(brt_decref_entry_still_referenced); |
1305 brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp)); | 1093 brt_vdev_decref(spa, brtvd, bre, bp_get_dsize_sync(spa, bp)); |
1306 | 1094 |
1307 brt_unlock(brt); | 1095 rw_exit(&brtvd->bv_lock); |
1308 1309 return (B_FALSE); 1310} 1311 1312uint64_t 1313brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp) 1314{ | 1096 1097 return (B_FALSE); 1098} 1099 1100uint64_t 1101brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp) 1102{ |
1315 brt_t *brt = spa->spa_brt; 1316 brt_vdev_t *brtvd; | |
1317 brt_entry_t bre_search, *bre; 1318 uint64_t vdevid, refcnt; 1319 int error; 1320 1321 brt_entry_fill(bp, &bre_search, &vdevid); 1322 | 1103 brt_entry_t bre_search, *bre; 1104 uint64_t vdevid, refcnt; 1105 int error; 1106 1107 brt_entry_fill(bp, &bre_search, &vdevid); 1108 |
1323 brt_rlock(brt); 1324 1325 brtvd = brt_vdev(brt, vdevid); | 1109 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); |
1326 ASSERT(brtvd != NULL); 1327 | 1110 ASSERT(brtvd != NULL); 1111 |
1112 rw_enter(&brtvd->bv_lock, RW_READER); 1113 ASSERT(brtvd->bv_initiated); |
|
1328 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); 1329 if (bre == NULL) { | 1114 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); 1115 if (bre == NULL) { |
1330 error = brt_entry_lookup(brt, brtvd, &bre_search); 1331 ASSERT(error == 0 || error == ENOENT); 1332 if (error == ENOENT) | 1116 rw_exit(&brtvd->bv_lock); 1117 error = brt_entry_lookup(brtvd, &bre_search); 1118 if (error == ENOENT) { |
1333 refcnt = 0; | 1119 refcnt = 0; |
1334 else 1335 refcnt = bre_search.bre_refcount; 1336 } else 1337 refcnt = bre->bre_refcount; | 1120 } else { 1121 ASSERT0(error); 1122 refcnt = bre_search.bre_count; 1123 } 1124 } else { 1125 refcnt = bre->bre_count; 1126 rw_exit(&brtvd->bv_lock); 1127 } |
1338 | 1128 |
1339 brt_unlock(brt); | |
1340 return (refcnt); 1341} 1342 1343static void | 1129 return (refcnt); 1130} 1131 1132static void |
1344brt_prefetch(brt_t *brt, const blkptr_t *bp) | 1133brt_prefetch(brt_vdev_t *brtvd, const blkptr_t *bp) |
1345{ | 1134{ |
1346 brt_entry_t bre; 1347 uint64_t vdevid; 1348 1349 ASSERT(bp != NULL); 1350 1351 if (!brt_zap_prefetch) | 1135 if (!brt_zap_prefetch || brtvd->bv_mos_entries == 0) |
1352 return; 1353 | 1136 return; 1137 |
1354 brt_entry_fill(bp, &bre, &vdevid); 1355 1356 brt_entry_prefetch(brt, vdevid, &bre); | 1138 uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]); 1139 rw_enter(&brtvd->bv_mos_entries_lock, RW_READER); 1140 if (brtvd->bv_mos_entries != 0) { 1141 (void) zap_prefetch_uint64_by_dnode(brtvd->bv_mos_entries_dnode, 1142 &off, BRT_KEY_WORDS); 1143 } 1144 rw_exit(&brtvd->bv_mos_entries_lock); |
1357} 1358 1359static int | 1145} 1146 1147static int |
1360brt_pending_entry_compare(const void *x1, const void *x2) | 1148brt_entry_compare(const void *x1, const void *x2) |
1361{ | 1149{ |
1362 const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2; 1363 const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp; 1364 int cmp; | 1150 const brt_entry_t *bre1 = x1, *bre2 = x2; 1151 const blkptr_t *bp1 = &bre1->bre_bp, *bp2 = &bre2->bre_bp; |
1365 | 1152 |
1366 cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]), 1367 DVA_GET_VDEV(&bp2->blk_dva[0])); 1368 if (cmp == 0) { 1369 cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), 1370 DVA_GET_OFFSET(&bp2->blk_dva[0])); 1371 if (unlikely(cmp == 0)) { 1372 cmp = TREE_CMP(BP_GET_BIRTH(bp1), BP_GET_BIRTH(bp2)); 1373 } 1374 } 1375 1376 return (cmp); | 1153 return (TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), 1154 DVA_GET_OFFSET(&bp2->blk_dva[0]))); |
1377} 1378 1379void 1380brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) 1381{ | 1155} 1156 1157void 1158brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) 1159{ |
1382 brt_t *brt; 1383 avl_tree_t *pending_tree; 1384 kmutex_t *pending_lock; 1385 brt_pending_entry_t *bpe, *newbpe; | 1160 brt_entry_t *bre, *newbre; |
1386 avl_index_t where; 1387 uint64_t txg; 1388 | 1161 avl_index_t where; 1162 uint64_t txg; 1163 |
1389 brt = spa->spa_brt; | |
1390 txg = dmu_tx_get_txg(tx); 1391 ASSERT3U(txg, !=, 0); | 1164 txg = dmu_tx_get_txg(tx); 1165 ASSERT3U(txg, !=, 0); |
1392 pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; 1393 pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; | |
1394 | 1166 |
1395 newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP); 1396 newbpe->bpe_bp = *bp; 1397 newbpe->bpe_count = 1; | 1167 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]); 1168 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_TRUE); 1169 avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK]; |
1398 | 1170 |
1399 mutex_enter(pending_lock); | 1171 newbre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP); 1172 newbre->bre_bp = *bp; 1173 newbre->bre_count = 0; 1174 newbre->bre_pcount = 1; |
1400 | 1175 |
1401 bpe = avl_find(pending_tree, newbpe, &where); 1402 if (bpe == NULL) { 1403 avl_insert(pending_tree, newbpe, where); 1404 newbpe = NULL; | 1176 mutex_enter(&brtvd->bv_pending_lock); 1177 bre = avl_find(pending_tree, newbre, &where); 1178 if (bre == NULL) { 1179 avl_insert(pending_tree, newbre, where); 1180 newbre = NULL; |
1405 } else { | 1181 } else { |
1406 bpe->bpe_count++; | 1182 bre->bre_pcount++; |
1407 } | 1183 } |
1184 mutex_exit(&brtvd->bv_pending_lock); |
|
1408 | 1185 |
1409 mutex_exit(pending_lock); 1410 1411 if (newbpe != NULL) { 1412 ASSERT(bpe != NULL); 1413 ASSERT(bpe != newbpe); 1414 kmem_cache_free(brt_pending_entry_cache, newbpe); | 1186 if (newbre != NULL) { 1187 ASSERT(bre != NULL); 1188 ASSERT(bre != newbre); 1189 kmem_cache_free(brt_entry_cache, newbre); |
1415 } else { | 1190 } else { |
1416 ASSERT(bpe == NULL); | 1191 ASSERT0P(bre); |
1417 1418 /* Prefetch BRT entry for the syncing context. */ | 1192 1193 /* Prefetch BRT entry for the syncing context. */ |
1419 brt_prefetch(brt, bp); | 1194 brt_prefetch(brtvd, bp); |
1420 } 1421} 1422 1423void 1424brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) 1425{ | 1195 } 1196} 1197 1198void 1199brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) 1200{ |
1426 brt_t *brt; 1427 avl_tree_t *pending_tree; 1428 kmutex_t *pending_lock; 1429 brt_pending_entry_t *bpe, bpe_search; | 1201 brt_entry_t *bre, bre_search; |
1430 uint64_t txg; 1431 | 1202 uint64_t txg; 1203 |
1432 brt = spa->spa_brt; | |
1433 txg = dmu_tx_get_txg(tx); 1434 ASSERT3U(txg, !=, 0); | 1204 txg = dmu_tx_get_txg(tx); 1205 ASSERT3U(txg, !=, 0); |
1435 pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; 1436 pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; | |
1437 | 1206 |
1438 bpe_search.bpe_bp = *bp; | 1207 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]); 1208 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); 1209 ASSERT(brtvd != NULL); 1210 avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK]; |
1439 | 1211 |
1440 mutex_enter(pending_lock); | 1212 bre_search.bre_bp = *bp; |
1441 | 1213 |
1442 bpe = avl_find(pending_tree, &bpe_search, NULL); 1443 /* I believe we should always find bpe when this function is called. */ 1444 if (bpe != NULL) { 1445 ASSERT(bpe->bpe_count > 0); | 1214 mutex_enter(&brtvd->bv_pending_lock); 1215 bre = avl_find(pending_tree, &bre_search, NULL); 1216 ASSERT(bre != NULL); 1217 ASSERT(bre->bre_pcount > 0); 1218 bre->bre_pcount--; 1219 if (bre->bre_pcount == 0) 1220 avl_remove(pending_tree, bre); 1221 else 1222 bre = NULL; 1223 mutex_exit(&brtvd->bv_pending_lock); |
1446 | 1224 |
1447 bpe->bpe_count--; 1448 if (bpe->bpe_count == 0) { 1449 avl_remove(pending_tree, bpe); 1450 kmem_cache_free(brt_pending_entry_cache, bpe); 1451 } 1452 } 1453 1454 mutex_exit(pending_lock); | 1225 if (bre) 1226 kmem_cache_free(brt_entry_cache, bre); |
1455} 1456 | 1227} 1228 |
1457void 1458brt_pending_apply(spa_t *spa, uint64_t txg) | 1229static void 1230brt_pending_apply_vdev(spa_t *spa, brt_vdev_t *brtvd, uint64_t txg) |
1459{ | 1231{ |
1460 brt_t *brt = spa->spa_brt; 1461 brt_pending_entry_t *bpe; 1462 avl_tree_t *pending_tree; 1463 void *c; | 1232 brt_entry_t *bre, *nbre; |
1464 | 1233 |
1465 ASSERT3U(txg, !=, 0); 1466 | |
1467 /* | 1234 /* |
1468 * We are in syncing context, so no other brt_pending_tree accesses 1469 * are possible for the TXG. Don't need to acquire brt_pending_lock. | 1235 * We are in syncing context, so no other bv_pending_tree accesses 1236 * are possible for the TXG. So we don't need bv_pending_lock. |
1470 */ | 1237 */ |
1471 pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; | 1238 ASSERT(avl_is_empty(&brtvd->bv_tree)); 1239 avl_swap(&brtvd->bv_tree, &brtvd->bv_pending_tree[txg & TXG_MASK]); |
1472 | 1240 |
1473 c = NULL; 1474 while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) { 1475 boolean_t added_to_ddt; | 1241 for (bre = avl_first(&brtvd->bv_tree); bre; bre = nbre) { 1242 nbre = AVL_NEXT(&brtvd->bv_tree, bre); |
1476 | 1243 |
1477 for (int i = 0; i < bpe->bpe_count; i++) { 1478 /* 1479 * If the block has DEDUP bit set, it means that it 1480 * already exists in the DEDUP table, so we can just 1481 * use that instead of creating new entry in 1482 * the BRT table. 1483 */ 1484 if (BP_GET_DEDUP(&bpe->bpe_bp)) { 1485 added_to_ddt = ddt_addref(spa, &bpe->bpe_bp); | 1244 /* 1245 * If the block has DEDUP bit set, it means that it 1246 * already exists in the DEDUP table, so we can just 1247 * use that instead of creating new entry in the BRT. 1248 */ 1249 if (BP_GET_DEDUP(&bre->bre_bp)) { 1250 while (bre->bre_pcount > 0) { 1251 if (!ddt_addref(spa, &bre->bre_bp)) 1252 break; 1253 bre->bre_pcount--; 1254 } 1255 if (bre->bre_pcount == 0) { 1256 avl_remove(&brtvd->bv_tree, bre); 1257 kmem_cache_free(brt_entry_cache, bre); 1258 continue; 1259 } 1260 } 1261 1262 /* 1263 * Unless we know that the block is definitely not in ZAP, 1264 * try to get its reference count from there. 1265 */ 1266 uint64_t off = BRE_OFFSET(bre); 1267 if (brtvd->bv_mos_entries != 0 && 1268 brt_vdev_lookup(spa, brtvd, off)) { 1269 int error = zap_lookup_uint64_by_dnode( 1270 brtvd->bv_mos_entries_dnode, &off, 1271 BRT_KEY_WORDS, 1, sizeof (bre->bre_count), 1272 &bre->bre_count); 1273 if (error == 0) { 1274 BRTSTAT_BUMP(brt_addref_entry_on_disk); |
1486 } else { | 1275 } else { |
1487 added_to_ddt = B_FALSE; | 1276 ASSERT3U(error, ==, ENOENT); 1277 BRTSTAT_BUMP(brt_addref_entry_not_on_disk); |
1488 } | 1278 } |
1489 if (!added_to_ddt) 1490 brt_entry_addref(brt, &bpe->bpe_bp); | |
1491 } | 1279 } |
1280 } |
|
1492 | 1281 |
1493 kmem_cache_free(brt_pending_entry_cache, bpe); | 1282 /* 1283 * If all the cloned blocks we had were handled by DDT, we don't need 1284 * to initiate the vdev. 1285 */ 1286 if (avl_is_empty(&brtvd->bv_tree)) 1287 return; 1288 1289 if (!brtvd->bv_initiated) { 1290 rw_enter(&brtvd->bv_lock, RW_WRITER); 1291 brt_vdev_realloc(spa, brtvd); 1292 rw_exit(&brtvd->bv_lock); |
1494 } | 1293 } |
1294 1295 /* 1296 * Convert pending references into proper ones. This has to be a 1297 * separate loop, since entcount modifications would cause false 1298 * positives for brt_vdev_lookup() on following iterations. 1299 */ 1300 for (bre = avl_first(&brtvd->bv_tree); bre; 1301 bre = AVL_NEXT(&brtvd->bv_tree, bre)) { 1302 brt_vdev_addref(spa, brtvd, bre, 1303 bp_get_dsize(spa, &bre->bre_bp), bre->bre_pcount); 1304 bre->bre_count += bre->bre_pcount; 1305 } |
|
1495} 1496 | 1306} 1307 |
1308void 1309brt_pending_apply(spa_t *spa, uint64_t txg) 1310{ 1311 1312 brt_rlock(spa); 1313 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { 1314 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; 1315 brt_unlock(spa); 1316 1317 brt_pending_apply_vdev(spa, brtvd, txg); 1318 1319 brt_rlock(spa); 1320 } 1321 brt_unlock(spa); 1322} 1323 |
|
1497static void 1498brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx) 1499{ | 1324static void 1325brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx) 1326{ |
1500 if (bre->bre_refcount == 0) { 1501 int error = zap_remove_uint64_by_dnode(dn, &bre->bre_offset, | 1327 uint64_t off = BRE_OFFSET(bre); 1328 1329 if (bre->bre_pcount == 0) { 1330 /* The net change is zero, nothing to do in ZAP. */ 1331 } else if (bre->bre_count == 0) { 1332 int error = zap_remove_uint64_by_dnode(dn, &off, |
1502 BRT_KEY_WORDS, tx); 1503 VERIFY(error == 0 || error == ENOENT); 1504 } else { | 1333 BRT_KEY_WORDS, tx); 1334 VERIFY(error == 0 || error == ENOENT); 1335 } else { |
1505 VERIFY0(zap_update_uint64_by_dnode(dn, &bre->bre_offset, 1506 BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), 1507 &bre->bre_refcount, tx)); | 1336 VERIFY0(zap_update_uint64_by_dnode(dn, &off, 1337 BRT_KEY_WORDS, 1, sizeof (bre->bre_count), 1338 &bre->bre_count, tx)); |
1508 } 1509} 1510 1511static void | 1339 } 1340} 1341 1342static void |
1512brt_sync_table(brt_t *brt, dmu_tx_t *tx) | 1343brt_sync_table(spa_t *spa, dmu_tx_t *tx) |
1513{ | 1344{ |
1514 brt_vdev_t *brtvd; | |
1515 brt_entry_t *bre; | 1345 brt_entry_t *bre; |
1516 dnode_t *dn; 1517 uint64_t vdevid; 1518 void *c; | |
1519 | 1346 |
1520 brt_wlock(brt); | 1347 brt_rlock(spa); 1348 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { 1349 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; 1350 brt_unlock(spa); |
1521 | 1351 |
1522 for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { 1523 brtvd = &brt->brt_vdevs[vdevid]; 1524 1525 if (!brtvd->bv_initiated) 1526 continue; 1527 | |
1528 if (!brtvd->bv_meta_dirty) { 1529 ASSERT(!brtvd->bv_entcount_dirty); 1530 ASSERT0(avl_numnodes(&brtvd->bv_tree)); | 1352 if (!brtvd->bv_meta_dirty) { 1353 ASSERT(!brtvd->bv_entcount_dirty); 1354 ASSERT0(avl_numnodes(&brtvd->bv_tree)); |
1355 brt_rlock(spa); |
|
1531 continue; 1532 } 1533 1534 ASSERT(!brtvd->bv_entcount_dirty || 1535 avl_numnodes(&brtvd->bv_tree) != 0); 1536 1537 if (brtvd->bv_mos_brtvdev == 0) | 1356 continue; 1357 } 1358 1359 ASSERT(!brtvd->bv_entcount_dirty || 1360 avl_numnodes(&brtvd->bv_tree) != 0); 1361 1362 if (brtvd->bv_mos_brtvdev == 0) |
1538 brt_vdev_create(brt, brtvd, tx); | 1363 brt_vdev_create(spa, brtvd, tx); |
1539 | 1364 |
1540 VERIFY0(dnode_hold(brt->brt_mos, brtvd->bv_mos_entries, 1541 FTAG, &dn)); 1542 1543 c = NULL; | 1365 void *c = NULL; |
1544 while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) { | 1366 while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) { |
1545 brt_sync_entry(dn, bre, tx); 1546 brt_entry_free(bre); 1547 ASSERT(brt->brt_nentries > 0); 1548 brt->brt_nentries--; | 1367 brt_sync_entry(brtvd->bv_mos_entries_dnode, bre, tx); 1368 kmem_cache_free(brt_entry_cache, bre); |
1549 } 1550 | 1369 } 1370 |
1551 dnode_rele(dn, FTAG); 1552 1553 brt_vdev_sync(brt, brtvd, tx); 1554 | 1371#ifdef ZFS_DEBUG 1372 if (zfs_flags & ZFS_DEBUG_BRT) 1373 brt_vdev_dump(brtvd); 1374#endif |
1555 if (brtvd->bv_totalcount == 0) | 1375 if (brtvd->bv_totalcount == 0) |
1556 brt_vdev_destroy(brt, brtvd, tx); | 1376 brt_vdev_destroy(spa, brtvd, tx); 1377 else 1378 brt_vdev_sync(spa, brtvd, tx); 1379 brt_rlock(spa); |
1557 } | 1380 } |
1558 1559 ASSERT0(brt->brt_nentries); 1560 1561 brt_unlock(brt); | 1381 brt_unlock(spa); |
1562} 1563 1564void 1565brt_sync(spa_t *spa, uint64_t txg) 1566{ 1567 dmu_tx_t *tx; | 1382} 1383 1384void 1385brt_sync(spa_t *spa, uint64_t txg) 1386{ 1387 dmu_tx_t *tx; |
1568 brt_t *brt; | 1388 uint64_t vdevid; |
1569 | 1389 |
1570 ASSERT(spa_syncing_txg(spa) == txg); | 1390 ASSERT3U(spa_syncing_txg(spa), ==, txg); |
1571 | 1391 |
1572 brt = spa->spa_brt; 1573 brt_rlock(brt); 1574 if (brt->brt_nentries == 0) { 1575 /* No changes. */ 1576 brt_unlock(brt); | 1392 brt_rlock(spa); 1393 for (vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { 1394 if (spa->spa_brt_vdevs[vdevid]->bv_meta_dirty) 1395 break; 1396 } 1397 if (vdevid >= spa->spa_brt_nvdevs) { 1398 brt_unlock(spa); |
1577 return; 1578 } | 1399 return; 1400 } |
1579 brt_unlock(brt); | 1401 brt_unlock(spa); |
1580 1581 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); | 1402 1403 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); |
1582 1583 brt_sync_table(brt, tx); 1584 | 1404 brt_sync_table(spa, tx); |
1585 dmu_tx_commit(tx); 1586} 1587 1588static void | 1405 dmu_tx_commit(tx); 1406} 1407 1408static void |
1589brt_table_alloc(brt_t *brt) 1590{ 1591 1592 for (int i = 0; i < TXG_SIZE; i++) { 1593 avl_create(&brt->brt_pending_tree[i], 1594 brt_pending_entry_compare, 1595 sizeof (brt_pending_entry_t), 1596 offsetof(brt_pending_entry_t, bpe_node)); 1597 mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT, 1598 NULL); 1599 } 1600} 1601 1602static void 1603brt_table_free(brt_t *brt) 1604{ 1605 1606 for (int i = 0; i < TXG_SIZE; i++) { 1607 ASSERT(avl_is_empty(&brt->brt_pending_tree[i])); 1608 1609 avl_destroy(&brt->brt_pending_tree[i]); 1610 mutex_destroy(&brt->brt_pending_lock[i]); 1611 } 1612} 1613 1614static void | |
1615brt_alloc(spa_t *spa) 1616{ | 1409brt_alloc(spa_t *spa) 1410{ |
1617 brt_t *brt; 1618 1619 ASSERT(spa->spa_brt == NULL); 1620 1621 brt = kmem_zalloc(sizeof (*brt), KM_SLEEP); 1622 rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL); 1623 brt->brt_spa = spa; 1624 brt->brt_rangesize = 0; 1625 brt->brt_nentries = 0; 1626 brt->brt_vdevs = NULL; 1627 brt->brt_nvdevs = 0; 1628 brt_table_alloc(brt); 1629 1630 spa->spa_brt = brt; | 1411 rw_init(&spa->spa_brt_lock, NULL, RW_DEFAULT, NULL); 1412 spa->spa_brt_vdevs = NULL; 1413 spa->spa_brt_nvdevs = 0; 1414 spa->spa_brt_rangesize = 0; |
1631} 1632 1633void 1634brt_create(spa_t *spa) 1635{ | 1415} 1416 1417void 1418brt_create(spa_t *spa) 1419{ |
1636 | |
1637 brt_alloc(spa); | 1420 brt_alloc(spa); |
1638 brt_vdevs_alloc(spa->spa_brt, B_FALSE); | 1421 spa->spa_brt_rangesize = BRT_RANGESIZE; |
1639} 1640 1641int 1642brt_load(spa_t *spa) 1643{ | 1422} 1423 1424int 1425brt_load(spa_t *spa) 1426{ |
1427 int error = 0; |
|
1644 1645 brt_alloc(spa); | 1428 1429 brt_alloc(spa); |
1646 brt_vdevs_alloc(spa->spa_brt, B_TRUE); | 1430 brt_wlock(spa); 1431 for (uint64_t vdevid = 0; vdevid < spa->spa_root_vdev->vdev_children; 1432 vdevid++) { 1433 char name[64]; 1434 uint64_t mos_brtvdev; |
1647 | 1435 |
1648 return (0); | 1436 /* Look if this vdev had active block cloning. */ 1437 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, 1438 (u_longlong_t)vdevid); 1439 error = zap_lookup(spa->spa_meta_objset, 1440 DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, 1441 &mos_brtvdev); 1442 if (error == ENOENT) { 1443 error = 0; 1444 continue; 1445 } 1446 if (error != 0) 1447 break; 1448 1449 /* If it did, then allocate them all and load this one. */ 1450 brt_vdevs_expand(spa, spa->spa_root_vdev->vdev_children); 1451 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; 1452 rw_enter(&brtvd->bv_lock, RW_WRITER); 1453 brtvd->bv_mos_brtvdev = mos_brtvdev; 1454 error = brt_vdev_load(spa, brtvd); 1455 rw_exit(&brtvd->bv_lock); 1456 if (error != 0) 1457 break; 1458 } 1459 1460 if (spa->spa_brt_rangesize == 0) 1461 spa->spa_brt_rangesize = BRT_RANGESIZE; 1462 brt_unlock(spa); 1463 return (error); |
1649} 1650 1651void 1652brt_unload(spa_t *spa) 1653{ | 1464} 1465 1466void 1467brt_unload(spa_t *spa) 1468{ |
1654 brt_t *brt = spa->spa_brt; 1655 1656 if (brt == NULL) | 1469 if (spa->spa_brt_rangesize == 0) |
1657 return; | 1470 return; |
1658 1659 brt_vdevs_free(brt); 1660 brt_table_free(brt); 1661 rw_destroy(&brt->brt_lock); 1662 kmem_free(brt, sizeof (*brt)); 1663 spa->spa_brt = NULL; | 1471 brt_vdevs_free(spa); 1472 rw_destroy(&spa->spa_brt_lock); 1473 spa->spa_brt_rangesize = 0; |
1664} 1665 1666/* BEGIN CSTYLED */ 1667ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW, 1668 "Enable prefetching of BRT ZAP entries"); 1669ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW, 1670 "BRT ZAP leaf blockshift"); 1671ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW, 1672 "BRT ZAP indirect blockshift"); 1673/* END CSTYLED */ | 1474} 1475 1476/* BEGIN CSTYLED */ 1477ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW, 1478 "Enable prefetching of BRT ZAP entries"); 1479ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW, 1480 "BRT ZAP leaf blockshift"); 1481ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW, 1482 "BRT ZAP indirect blockshift"); 1483/* END CSTYLED */ |