brt.c (783d3ff6d7fae619db8a7990b8a6387de0c677b5) brt.c (718519f4efc71096422fc71dab90b2a3369871ff)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 229 unchanged lines hidden (view full) ---

238 * first references are dropped during ZIL destroy by zil_free_clone_range().
239 * It is possible that after zil_claim() we never mount the destination, so
240 * we never replay its ZIL and just destroy it. In this case the only taken
241 * references will be dropped by zil_free_clone_range(), since the cloning is
242 * not going to ever take place.
243 */
244
245static kmem_cache_t *brt_entry_cache;
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 229 unchanged lines hidden (view full) ---

238 * first references are dropped during ZIL destroy by zil_free_clone_range().
239 * It is possible that after zil_claim() we never mount the destination, so
240 * we never replay its ZIL and just destroy it. In this case the only taken
241 * references will be dropped by zil_free_clone_range(), since the cloning is
242 * not going to ever take place.
243 */
244
245static kmem_cache_t *brt_entry_cache;
246static kmem_cache_t *brt_pending_entry_cache;
247
248/*
249 * Enable/disable prefetching of BRT entries that we are going to modify.
250 */
251static int brt_zap_prefetch = 1;
252
253#ifdef ZFS_DEBUG
254#define BRT_DEBUG(...) do { \

--- 6 unchanged lines hidden (view full) ---

261#endif
262
263static int brt_zap_default_bs = 12;
264static int brt_zap_default_ibs = 12;
265
266static kstat_t *brt_ksp;
267
268typedef struct brt_stats {
246
247/*
248 * Enable/disable prefetching of BRT entries that we are going to modify.
249 */
250static int brt_zap_prefetch = 1;
251
252#ifdef ZFS_DEBUG
253#define BRT_DEBUG(...) do { \

--- 6 unchanged lines hidden (view full) ---

260#endif
261
262static int brt_zap_default_bs = 12;
263static int brt_zap_default_ibs = 12;
264
265static kstat_t *brt_ksp;
266
267typedef struct brt_stats {
269 kstat_named_t brt_addref_entry_in_memory;
270 kstat_named_t brt_addref_entry_not_on_disk;
271 kstat_named_t brt_addref_entry_on_disk;
268 kstat_named_t brt_addref_entry_not_on_disk;
269 kstat_named_t brt_addref_entry_on_disk;
272 kstat_named_t brt_addref_entry_read_lost_race;
273 kstat_named_t brt_decref_entry_in_memory;
274 kstat_named_t brt_decref_entry_loaded_from_disk;
275 kstat_named_t brt_decref_entry_not_in_memory;
270 kstat_named_t brt_decref_entry_in_memory;
271 kstat_named_t brt_decref_entry_loaded_from_disk;
272 kstat_named_t brt_decref_entry_not_in_memory;
276 kstat_named_t brt_decref_entry_not_on_disk;
277 kstat_named_t brt_decref_entry_read_lost_race;
278 kstat_named_t brt_decref_entry_still_referenced;
279 kstat_named_t brt_decref_free_data_later;
280 kstat_named_t brt_decref_free_data_now;
281 kstat_named_t brt_decref_no_entry;
282} brt_stats_t;
283
284static brt_stats_t brt_stats = {
273 kstat_named_t brt_decref_entry_read_lost_race;
274 kstat_named_t brt_decref_entry_still_referenced;
275 kstat_named_t brt_decref_free_data_later;
276 kstat_named_t brt_decref_free_data_now;
277 kstat_named_t brt_decref_no_entry;
278} brt_stats_t;
279
280static brt_stats_t brt_stats = {
285 { "addref_entry_in_memory", KSTAT_DATA_UINT64 },
286 { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 },
287 { "addref_entry_on_disk", KSTAT_DATA_UINT64 },
281 { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 },
282 { "addref_entry_on_disk", KSTAT_DATA_UINT64 },
288 { "addref_entry_read_lost_race", KSTAT_DATA_UINT64 },
289 { "decref_entry_in_memory", KSTAT_DATA_UINT64 },
290 { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 },
291 { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 },
283 { "decref_entry_in_memory", KSTAT_DATA_UINT64 },
284 { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 },
285 { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 },
292 { "decref_entry_not_on_disk", KSTAT_DATA_UINT64 },
293 { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 },
294 { "decref_entry_still_referenced", KSTAT_DATA_UINT64 },
295 { "decref_free_data_later", KSTAT_DATA_UINT64 },
296 { "decref_free_data_now", KSTAT_DATA_UINT64 },
297 { "decref_no_entry", KSTAT_DATA_UINT64 }
298};
299
300struct {
286 { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 },
287 { "decref_entry_still_referenced", KSTAT_DATA_UINT64 },
288 { "decref_free_data_later", KSTAT_DATA_UINT64 },
289 { "decref_free_data_now", KSTAT_DATA_UINT64 },
290 { "decref_no_entry", KSTAT_DATA_UINT64 }
291};
292
293struct {
301 wmsum_t brt_addref_entry_in_memory;
302 wmsum_t brt_addref_entry_not_on_disk;
303 wmsum_t brt_addref_entry_on_disk;
294 wmsum_t brt_addref_entry_not_on_disk;
295 wmsum_t brt_addref_entry_on_disk;
304 wmsum_t brt_addref_entry_read_lost_race;
305 wmsum_t brt_decref_entry_in_memory;
306 wmsum_t brt_decref_entry_loaded_from_disk;
307 wmsum_t brt_decref_entry_not_in_memory;
296 wmsum_t brt_decref_entry_in_memory;
297 wmsum_t brt_decref_entry_loaded_from_disk;
298 wmsum_t brt_decref_entry_not_in_memory;
308 wmsum_t brt_decref_entry_not_on_disk;
309 wmsum_t brt_decref_entry_read_lost_race;
310 wmsum_t brt_decref_entry_still_referenced;
311 wmsum_t brt_decref_free_data_later;
312 wmsum_t brt_decref_free_data_now;
313 wmsum_t brt_decref_no_entry;
314} brt_sums;
315
316#define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1)
317
318static int brt_entry_compare(const void *x1, const void *x2);
299 wmsum_t brt_decref_entry_read_lost_race;
300 wmsum_t brt_decref_entry_still_referenced;
301 wmsum_t brt_decref_free_data_later;
302 wmsum_t brt_decref_free_data_now;
303 wmsum_t brt_decref_no_entry;
304} brt_sums;
305
306#define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1)
307
308static int brt_entry_compare(const void *x1, const void *x2);
319static int brt_pending_entry_compare(const void *x1, const void *x2);
309static void brt_vdevs_expand(spa_t *spa, uint64_t nvdevs);
320
321static void
310
311static void
322brt_rlock(brt_t *brt)
312brt_rlock(spa_t *spa)
323{
313{
324 rw_enter(&brt->brt_lock, RW_READER);
314 rw_enter(&spa->spa_brt_lock, RW_READER);
325}
326
327static void
315}
316
317static void
328brt_wlock(brt_t *brt)
318brt_wlock(spa_t *spa)
329{
319{
330 rw_enter(&brt->brt_lock, RW_WRITER);
320 rw_enter(&spa->spa_brt_lock, RW_WRITER);
331}
332
333static void
321}
322
323static void
334brt_unlock(brt_t *brt)
324brt_unlock(spa_t *spa)
335{
325{
336 rw_exit(&brt->brt_lock);
326 rw_exit(&spa->spa_brt_lock);
337}
338
339static uint16_t
340brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
341{
342
343 ASSERT3U(idx, <, brtvd->bv_size);
344

--- 44 unchanged lines hidden (view full) ---

389}
390
391#ifdef ZFS_DEBUG
392static void
393brt_vdev_dump(brt_vdev_t *brtvd)
394{
395 uint64_t idx;
396
327}
328
329static uint16_t
330brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
331{
332
333 ASSERT3U(idx, <, brtvd->bv_size);
334

--- 44 unchanged lines hidden (view full) ---

379}
380
381#ifdef ZFS_DEBUG
382static void
383brt_vdev_dump(brt_vdev_t *brtvd)
384{
385 uint64_t idx;
386
387 uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
397 zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d "
388 zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d "
398 "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
389 "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu",
399 (u_longlong_t)brtvd->bv_vdevid,
400 brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
401 (u_longlong_t)brtvd->bv_size,
402 (u_longlong_t)brtvd->bv_totalcount,
390 (u_longlong_t)brtvd->bv_vdevid,
391 brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
392 (u_longlong_t)brtvd->bv_size,
393 (u_longlong_t)brtvd->bv_totalcount,
403 (u_longlong_t)brtvd->bv_nblocks,
404 (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
394 (u_longlong_t)nblocks,
395 (size_t)BT_SIZEOFMAP(nblocks));
405 if (brtvd->bv_totalcount > 0) {
406 zfs_dbgmsg(" entcounts:");
407 for (idx = 0; idx < brtvd->bv_size; idx++) {
408 uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx);
409 if (entcnt > 0) {
410 zfs_dbgmsg(" [%04llu] %hu",
411 (u_longlong_t)idx, entcnt);
412 }
413 }
414 }
415 if (brtvd->bv_entcount_dirty) {
416 char *bitmap;
417
396 if (brtvd->bv_totalcount > 0) {
397 zfs_dbgmsg(" entcounts:");
398 for (idx = 0; idx < brtvd->bv_size; idx++) {
399 uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx);
400 if (entcnt > 0) {
401 zfs_dbgmsg(" [%04llu] %hu",
402 (u_longlong_t)idx, entcnt);
403 }
404 }
405 }
406 if (brtvd->bv_entcount_dirty) {
407 char *bitmap;
408
418 bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
419 for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
409 bitmap = kmem_alloc(nblocks + 1, KM_SLEEP);
410 for (idx = 0; idx < nblocks; idx++) {
420 bitmap[idx] =
421 BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
422 }
423 bitmap[idx] = '\0';
424 zfs_dbgmsg(" dirty: %s", bitmap);
411 bitmap[idx] =
412 BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
413 }
414 bitmap[idx] = '\0';
415 zfs_dbgmsg(" dirty: %s", bitmap);
425 kmem_free(bitmap, brtvd->bv_nblocks + 1);
416 kmem_free(bitmap, nblocks + 1);
426 }
427}
428#endif
429
430static brt_vdev_t *
417 }
418}
419#endif
420
421static brt_vdev_t *
431brt_vdev(brt_t *brt, uint64_t vdevid)
422brt_vdev(spa_t *spa, uint64_t vdevid, boolean_t alloc)
432{
423{
433 brt_vdev_t *brtvd;
424 brt_vdev_t *brtvd = NULL;
434
425
435 ASSERT(RW_LOCK_HELD(&brt->brt_lock));
436
437 if (vdevid < brt->brt_nvdevs) {
438 brtvd = &brt->brt_vdevs[vdevid];
439 } else {
440 brtvd = NULL;
426 brt_rlock(spa);
427 if (vdevid < spa->spa_brt_nvdevs) {
428 brtvd = spa->spa_brt_vdevs[vdevid];
429 } else if (alloc) {
430 /* New VDEV was added. */
431 brt_unlock(spa);
432 brt_wlock(spa);
433 if (vdevid >= spa->spa_brt_nvdevs)
434 brt_vdevs_expand(spa, vdevid + 1);
435 brtvd = spa->spa_brt_vdevs[vdevid];
441 }
436 }
442
437 brt_unlock(spa);
443 return (brtvd);
444}
445
446static void
438 return (brtvd);
439}
440
441static void
447brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
442brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
448{
449 char name[64];
450
443{
444 char name[64];
445
451 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
446 ASSERT(brtvd->bv_initiated);
452 ASSERT0(brtvd->bv_mos_brtvdev);
453 ASSERT0(brtvd->bv_mos_entries);
447 ASSERT0(brtvd->bv_mos_brtvdev);
448 ASSERT0(brtvd->bv_mos_entries);
454 ASSERT(brtvd->bv_entcount != NULL);
455 ASSERT(brtvd->bv_size > 0);
456 ASSERT(brtvd->bv_bitmap != NULL);
457 ASSERT(brtvd->bv_nblocks > 0);
458
449
459 brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0,
450 uint64_t mos_entries = zap_create_flags(spa->spa_meta_objset, 0,
460 ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
461 brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx);
451 ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
452 brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx);
462 VERIFY(brtvd->bv_mos_entries != 0);
453 VERIFY(mos_entries != 0);
454 VERIFY0(dnode_hold(spa->spa_meta_objset, mos_entries, brtvd,
455 &brtvd->bv_mos_entries_dnode));
456 rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
457 brtvd->bv_mos_entries = mos_entries;
458 rw_exit(&brtvd->bv_mos_entries_lock);
463 BRT_DEBUG("MOS entries created, object=%llu",
464 (u_longlong_t)brtvd->bv_mos_entries);
465
466 /*
467 * We allocate a DMU buffer to store the bv_entcount[] array.
468 * We will keep the array size (bv_size) and cumulative count for all
469 * bv_entcount[]s (bv_totalcount) in the bonus buffer.
470 */
459 BRT_DEBUG("MOS entries created, object=%llu",
460 (u_longlong_t)brtvd->bv_mos_entries);
461
462 /*
463 * We allocate a DMU buffer to store the bv_entcount[] array.
464 * We will keep the array size (bv_size) and cumulative count for all
465 * bv_entcount[]s (bv_totalcount) in the bonus buffer.
466 */
471 brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos,
467 brtvd->bv_mos_brtvdev = dmu_object_alloc(spa->spa_meta_objset,
472 DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
473 DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
474 VERIFY(brtvd->bv_mos_brtvdev != 0);
475 BRT_DEBUG("MOS BRT VDEV created, object=%llu",
476 (u_longlong_t)brtvd->bv_mos_brtvdev);
477
478 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
479 (u_longlong_t)brtvd->bv_vdevid);
468 DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
469 DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
470 VERIFY(brtvd->bv_mos_brtvdev != 0);
471 BRT_DEBUG("MOS BRT VDEV created, object=%llu",
472 (u_longlong_t)brtvd->bv_mos_brtvdev);
473
474 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
475 (u_longlong_t)brtvd->bv_vdevid);
480 VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
476 VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name,
481 sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
482 BRT_DEBUG("Pool directory object created, object=%s", name);
483
477 sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
478 BRT_DEBUG("Pool directory object created, object=%s", name);
479
484 spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
480 spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
485}
486
487static void
481}
482
483static void
488brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd)
484brt_vdev_realloc(spa_t *spa, brt_vdev_t *brtvd)
489{
490 vdev_t *vd;
491 uint16_t *entcount;
492 ulong_t *bitmap;
485{
486 vdev_t *vd;
487 uint16_t *entcount;
488 ulong_t *bitmap;
493 uint64_t nblocks, size;
489 uint64_t nblocks, onblocks, size;
494
490
495 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
491 ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
496
492
497 spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER);
498 vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid);
499 size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1;
500 spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG);
493 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
494 vd = vdev_lookup_top(spa, brtvd->bv_vdevid);
495 size = (vdev_get_min_asize(vd) - 1) / spa->spa_brt_rangesize + 1;
496 spa_config_exit(spa, SCL_VDEV, FTAG);
501
502 entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
503 nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
504 bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
505
506 if (!brtvd->bv_initiated) {
507 ASSERT0(brtvd->bv_size);
497
498 entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
499 nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
500 bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
501
502 if (!brtvd->bv_initiated) {
503 ASSERT0(brtvd->bv_size);
508 ASSERT(brtvd->bv_entcount == NULL);
509 ASSERT(brtvd->bv_bitmap == NULL);
510 ASSERT0(brtvd->bv_nblocks);
511
512 avl_create(&brtvd->bv_tree, brt_entry_compare,
513 sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
504 ASSERT0P(brtvd->bv_entcount);
505 ASSERT0P(brtvd->bv_bitmap);
514 } else {
515 ASSERT(brtvd->bv_size > 0);
516 ASSERT(brtvd->bv_entcount != NULL);
517 ASSERT(brtvd->bv_bitmap != NULL);
506 } else {
507 ASSERT(brtvd->bv_size > 0);
508 ASSERT(brtvd->bv_entcount != NULL);
509 ASSERT(brtvd->bv_bitmap != NULL);
518 ASSERT(brtvd->bv_nblocks > 0);
519 /*
520 * TODO: Allow vdev shrinking. We only need to implement
521 * shrinking the on-disk BRT VDEV object.
510 /*
511 * TODO: Allow vdev shrinking. We only need to implement
512 * shrinking the on-disk BRT VDEV object.
522 * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset,
523 * size, tx);
513 * dmu_free_range(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
514 * offset, size, tx);
524 */
525 ASSERT3U(brtvd->bv_size, <=, size);
526
527 memcpy(entcount, brtvd->bv_entcount,
528 sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
515 */
516 ASSERT3U(brtvd->bv_size, <=, size);
517
518 memcpy(entcount, brtvd->bv_entcount,
519 sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
529 memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
530 BT_SIZEOFMAP(brtvd->bv_nblocks)));
531 vmem_free(brtvd->bv_entcount,
532 sizeof (entcount[0]) * brtvd->bv_size);
520 vmem_free(brtvd->bv_entcount,
521 sizeof (entcount[0]) * brtvd->bv_size);
533 kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
522 onblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
523 memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
524 BT_SIZEOFMAP(onblocks)));
525 kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(onblocks));
534 }
535
536 brtvd->bv_size = size;
537 brtvd->bv_entcount = entcount;
538 brtvd->bv_bitmap = bitmap;
526 }
527
528 brtvd->bv_size = size;
529 brtvd->bv_entcount = entcount;
530 brtvd->bv_bitmap = bitmap;
539 brtvd->bv_nblocks = nblocks;
540 if (!brtvd->bv_initiated) {
541 brtvd->bv_need_byteswap = FALSE;
542 brtvd->bv_initiated = TRUE;
543 BRT_DEBUG("BRT VDEV %llu initiated.",
544 (u_longlong_t)brtvd->bv_vdevid);
545 }
546}
547
531 if (!brtvd->bv_initiated) {
532 brtvd->bv_need_byteswap = FALSE;
533 brtvd->bv_initiated = TRUE;
534 BRT_DEBUG("BRT VDEV %llu initiated.",
535 (u_longlong_t)brtvd->bv_vdevid);
536 }
537}
538
548static void
549brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd)
539static int
540brt_vdev_load(spa_t *spa, brt_vdev_t *brtvd)
550{
541{
551 char name[64];
552 dmu_buf_t *db;
553 brt_vdev_phys_t *bvphys;
554 int error;
555
542 dmu_buf_t *db;
543 brt_vdev_phys_t *bvphys;
544 int error;
545
556 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
557 (u_longlong_t)brtvd->bv_vdevid);
558 error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
559 sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev);
560 if (error != 0)
561 return;
546 ASSERT(!brtvd->bv_initiated);
562 ASSERT(brtvd->bv_mos_brtvdev != 0);
563
547 ASSERT(brtvd->bv_mos_brtvdev != 0);
548
564 error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db);
565 ASSERT0(error);
549 error = dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
550 FTAG, &db);
566 if (error != 0)
551 if (error != 0)
567 return;
552 return (error);
568
569 bvphys = db->db_data;
553
554 bvphys = db->db_data;
570 if (brt->brt_rangesize == 0) {
571 brt->brt_rangesize = bvphys->bvp_rangesize;
555 if (spa->spa_brt_rangesize == 0) {
556 spa->spa_brt_rangesize = bvphys->bvp_rangesize;
572 } else {
557 } else {
573 ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize);
558 ASSERT3U(spa->spa_brt_rangesize, ==, bvphys->bvp_rangesize);
574 }
575
559 }
560
576 ASSERT(!brtvd->bv_initiated);
577 brt_vdev_realloc(brt, brtvd);
561 brt_vdev_realloc(spa, brtvd);
578
579 /* TODO: We don't support VDEV shrinking. */
580 ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);
581
582 /*
583 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
584 */
562
563 /* TODO: We don't support VDEV shrinking. */
564 ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);
565
566 /*
567 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
568 */
585 error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
569 error = dmu_read(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
586 MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
587 brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
570 MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
571 brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
588 ASSERT0(error);
572 if (error != 0)
573 return (error);
589
574
575 ASSERT(bvphys->bvp_mos_entries != 0);
576 VERIFY0(dnode_hold(spa->spa_meta_objset, bvphys->bvp_mos_entries, brtvd,
577 &brtvd->bv_mos_entries_dnode));
578 rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
590 brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
579 brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
591 ASSERT(brtvd->bv_mos_entries != 0);
580 rw_exit(&brtvd->bv_mos_entries_lock);
592 brtvd->bv_need_byteswap =
593 (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
594 brtvd->bv_totalcount = bvphys->bvp_totalcount;
595 brtvd->bv_usedspace = bvphys->bvp_usedspace;
596 brtvd->bv_savedspace = bvphys->bvp_savedspace;
581 brtvd->bv_need_byteswap =
582 (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
583 brtvd->bv_totalcount = bvphys->bvp_totalcount;
584 brtvd->bv_usedspace = bvphys->bvp_usedspace;
585 brtvd->bv_savedspace = bvphys->bvp_savedspace;
597 brt->brt_usedspace += brtvd->bv_usedspace;
598 brt->brt_savedspace += brtvd->bv_savedspace;
599
600 dmu_buf_rele(db, FTAG);
601
586
587 dmu_buf_rele(db, FTAG);
588
602 BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu",
603 name, (u_longlong_t)brtvd->bv_mos_brtvdev,
589 BRT_DEBUG("BRT VDEV %llu loaded: mos_brtvdev=%llu, mos_entries=%llu",
590 (u_longlong_t)brtvd->bv_vdevid,
591 (u_longlong_t)brtvd->bv_mos_brtvdev,
604 (u_longlong_t)brtvd->bv_mos_entries);
592 (u_longlong_t)brtvd->bv_mos_entries);
593 return (0);
605}
606
607static void
594}
595
596static void
608brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd)
597brt_vdev_dealloc(brt_vdev_t *brtvd)
609{
598{
610
611 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
599 ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
612 ASSERT(brtvd->bv_initiated);
600 ASSERT(brtvd->bv_initiated);
601 ASSERT0(avl_numnodes(&brtvd->bv_tree));
613
614 vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
615 brtvd->bv_entcount = NULL;
602
603 vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
604 brtvd->bv_entcount = NULL;
616 kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
605 uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
606 kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(nblocks));
617 brtvd->bv_bitmap = NULL;
607 brtvd->bv_bitmap = NULL;
618 ASSERT0(avl_numnodes(&brtvd->bv_tree));
619 avl_destroy(&brtvd->bv_tree);
620
621 brtvd->bv_size = 0;
608
609 brtvd->bv_size = 0;
622 brtvd->bv_nblocks = 0;
623
624 brtvd->bv_initiated = FALSE;
625 BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
626}
627
628static void
610
611 brtvd->bv_initiated = FALSE;
612 BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
613}
614
615static void
629brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
616brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
630{
631 char name[64];
632 uint64_t count;
617{
618 char name[64];
619 uint64_t count;
633 dmu_buf_t *db;
634 brt_vdev_phys_t *bvphys;
635
620
636 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
621 ASSERT(brtvd->bv_initiated);
637 ASSERT(brtvd->bv_mos_brtvdev != 0);
638 ASSERT(brtvd->bv_mos_entries != 0);
622 ASSERT(brtvd->bv_mos_brtvdev != 0);
623 ASSERT(brtvd->bv_mos_entries != 0);
624 ASSERT0(brtvd->bv_totalcount);
625 ASSERT0(brtvd->bv_usedspace);
626 ASSERT0(brtvd->bv_savedspace);
639
627
640 VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count));
641 VERIFY0(count);
642 VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx));
643 BRT_DEBUG("MOS entries destroyed, object=%llu",
644 (u_longlong_t)brtvd->bv_mos_entries);
628 uint64_t mos_entries = brtvd->bv_mos_entries;
629 rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
645 brtvd->bv_mos_entries = 0;
630 brtvd->bv_mos_entries = 0;
631 rw_exit(&brtvd->bv_mos_entries_lock);
632 dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
633 brtvd->bv_mos_entries_dnode = NULL;
634 ASSERT0(zap_count(spa->spa_meta_objset, mos_entries, &count));
635 ASSERT0(count);
636 VERIFY0(zap_destroy(spa->spa_meta_objset, mos_entries, tx));
637 BRT_DEBUG("MOS entries destroyed, object=%llu",
638 (u_longlong_t)mos_entries);
646
639
647 VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
648 bvphys = db->db_data;
649 ASSERT0(bvphys->bvp_totalcount);
650 ASSERT0(bvphys->bvp_usedspace);
651 ASSERT0(bvphys->bvp_savedspace);
652 dmu_buf_rele(db, FTAG);
653
654 VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx));
640 VERIFY0(dmu_object_free(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
641 tx));
655 BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
656 (u_longlong_t)brtvd->bv_mos_brtvdev);
657 brtvd->bv_mos_brtvdev = 0;
642 BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
643 (u_longlong_t)brtvd->bv_mos_brtvdev);
644 brtvd->bv_mos_brtvdev = 0;
645 brtvd->bv_entcount_dirty = FALSE;
658
659 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
660 (u_longlong_t)brtvd->bv_vdevid);
646
647 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
648 (u_longlong_t)brtvd->bv_vdevid);
661 VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx));
649 VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
650 name, tx));
662 BRT_DEBUG("Pool directory object removed, object=%s", name);
663
651 BRT_DEBUG("Pool directory object removed, object=%s", name);
652
664 brt_vdev_dealloc(brt, brtvd);
653 brtvd->bv_meta_dirty = FALSE;
665
654
666 spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
655 rw_enter(&brtvd->bv_lock, RW_WRITER);
656 brt_vdev_dealloc(brtvd);
657 rw_exit(&brtvd->bv_lock);
658
659 spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
667}
668
669static void
660}
661
662static void
670brt_vdevs_expand(brt_t *brt, uint64_t nvdevs)
663brt_vdevs_expand(spa_t *spa, uint64_t nvdevs)
671{
664{
672 brt_vdev_t *brtvd, *vdevs;
673 uint64_t vdevid;
665 brt_vdev_t **vdevs;
674
666
675 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
676 ASSERT3U(nvdevs, >, brt->brt_nvdevs);
667 ASSERT(RW_WRITE_HELD(&spa->spa_brt_lock));
668 ASSERT3U(nvdevs, >=, spa->spa_brt_nvdevs);
677
669
678 vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP);
679 if (brt->brt_nvdevs > 0) {
680 ASSERT(brt->brt_vdevs != NULL);
670 if (nvdevs == spa->spa_brt_nvdevs)
671 return;
681
672
682 memcpy(vdevs, brt->brt_vdevs,
683 sizeof (brt_vdev_t) * brt->brt_nvdevs);
684 kmem_free(brt->brt_vdevs,
685 sizeof (brt_vdev_t) * brt->brt_nvdevs);
673 vdevs = kmem_zalloc(sizeof (*spa->spa_brt_vdevs) * nvdevs, KM_SLEEP);
674 if (spa->spa_brt_nvdevs > 0) {
675 ASSERT(spa->spa_brt_vdevs != NULL);
676
677 memcpy(vdevs, spa->spa_brt_vdevs,
678 sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
679 kmem_free(spa->spa_brt_vdevs,
680 sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
686 }
681 }
687 for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) {
688 brtvd = &vdevs[vdevid];
682 spa->spa_brt_vdevs = vdevs;
689
683
684 for (uint64_t vdevid = spa->spa_brt_nvdevs; vdevid < nvdevs; vdevid++) {
685 brt_vdev_t *brtvd = kmem_zalloc(sizeof (*brtvd), KM_SLEEP);
686 rw_init(&brtvd->bv_lock, NULL, RW_DEFAULT, NULL);
690 brtvd->bv_vdevid = vdevid;
691 brtvd->bv_initiated = FALSE;
687 brtvd->bv_vdevid = vdevid;
688 brtvd->bv_initiated = FALSE;
689 rw_init(&brtvd->bv_mos_entries_lock, NULL, RW_DEFAULT, NULL);
690 avl_create(&brtvd->bv_tree, brt_entry_compare,
691 sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
692 for (int i = 0; i < TXG_SIZE; i++) {
693 avl_create(&brtvd->bv_pending_tree[i],
694 brt_entry_compare, sizeof (brt_entry_t),
695 offsetof(brt_entry_t, bre_node));
696 }
697 mutex_init(&brtvd->bv_pending_lock, NULL, MUTEX_DEFAULT, NULL);
698 spa->spa_brt_vdevs[vdevid] = brtvd;
692 }
693
694 BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
699 }
700
701 BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
695 (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs);
696
697 brt->brt_vdevs = vdevs;
698 brt->brt_nvdevs = nvdevs;
702 (u_longlong_t)spa->spa_brt_nvdevs, (u_longlong_t)nvdevs);
703 spa->spa_brt_nvdevs = nvdevs;
699}
700
701static boolean_t
704}
705
706static boolean_t
702brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre)
707brt_vdev_lookup(spa_t *spa, brt_vdev_t *brtvd, uint64_t offset)
703{
708{
704 uint64_t idx;
705
706 ASSERT(RW_LOCK_HELD(&brt->brt_lock));
707
708 idx = bre->bre_offset / brt->brt_rangesize;
709 if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) {
709 uint64_t idx = offset / spa->spa_brt_rangesize;
710 if (idx < brtvd->bv_size) {
710 /* VDEV wasn't expanded. */
711 return (brt_vdev_entcount_get(brtvd, idx) > 0);
712 }
711 /* VDEV wasn't expanded. */
712 return (brt_vdev_entcount_get(brtvd, idx) > 0);
713 }
713
714 return (FALSE);
715}
716
717static void
714 return (FALSE);
715}
716
717static void
718brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
719 uint64_t dsize)
718brt_vdev_addref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
719 uint64_t dsize, uint64_t count)
720{
721 uint64_t idx;
722
720{
721 uint64_t idx;
722
723 ASSERT(RW_LOCK_HELD(&brt->brt_lock));
724 ASSERT(brtvd != NULL);
725 ASSERT(brtvd->bv_entcount != NULL);
723 ASSERT(brtvd->bv_initiated);
726
724
727 brt->brt_savedspace += dsize;
728 brtvd->bv_savedspace += dsize;
725 brtvd->bv_savedspace += dsize * count;
729 brtvd->bv_meta_dirty = TRUE;
730
726 brtvd->bv_meta_dirty = TRUE;
727
731 if (bre->bre_refcount > 1) {
728 if (bre->bre_count > 0)
732 return;
729 return;
733 }
734
730
735 brt->brt_usedspace += dsize;
736 brtvd->bv_usedspace += dsize;
737
731 brtvd->bv_usedspace += dsize;
732
738 idx = bre->bre_offset / brt->brt_rangesize;
733 idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;
739 if (idx >= brtvd->bv_size) {
740 /* VDEV has been expanded. */
734 if (idx >= brtvd->bv_size) {
735 /* VDEV has been expanded. */
741 brt_vdev_realloc(brt, brtvd);
736 rw_enter(&brtvd->bv_lock, RW_WRITER);
737 brt_vdev_realloc(spa, brtvd);
738 rw_exit(&brtvd->bv_lock);
742 }
743
744 ASSERT3U(idx, <, brtvd->bv_size);
745
746 brtvd->bv_totalcount++;
747 brt_vdev_entcount_inc(brtvd, idx);
748 brtvd->bv_entcount_dirty = TRUE;
749 idx = idx / BRT_BLOCKSIZE / 8;
750 BT_SET(brtvd->bv_bitmap, idx);
739 }
740
741 ASSERT3U(idx, <, brtvd->bv_size);
742
743 brtvd->bv_totalcount++;
744 brt_vdev_entcount_inc(brtvd, idx);
745 brtvd->bv_entcount_dirty = TRUE;
746 idx = idx / BRT_BLOCKSIZE / 8;
747 BT_SET(brtvd->bv_bitmap, idx);
751
752#ifdef ZFS_DEBUG
753 if (zfs_flags & ZFS_DEBUG_BRT)
754 brt_vdev_dump(brtvd);
755#endif
756}
757
758static void
748}
749
750static void
759brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
751brt_vdev_decref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
760 uint64_t dsize)
761{
762 uint64_t idx;
763
752 uint64_t dsize)
753{
754 uint64_t idx;
755
764 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
765 ASSERT(brtvd != NULL);
766 ASSERT(brtvd->bv_entcount != NULL);
756 ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
757 ASSERT(brtvd->bv_initiated);
767
758
768 brt->brt_savedspace -= dsize;
769 brtvd->bv_savedspace -= dsize;
770 brtvd->bv_meta_dirty = TRUE;
771
759 brtvd->bv_savedspace -= dsize;
760 brtvd->bv_meta_dirty = TRUE;
761
772 if (bre->bre_refcount > 0) {
762 if (bre->bre_count > 0)
773 return;
763 return;
774 }
775
764
776 brt->brt_usedspace -= dsize;
777 brtvd->bv_usedspace -= dsize;
778
765 brtvd->bv_usedspace -= dsize;
766
779 idx = bre->bre_offset / brt->brt_rangesize;
767 idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;
780 ASSERT3U(idx, <, brtvd->bv_size);
781
782 ASSERT(brtvd->bv_totalcount > 0);
783 brtvd->bv_totalcount--;
784 brt_vdev_entcount_dec(brtvd, idx);
785 brtvd->bv_entcount_dirty = TRUE;
786 idx = idx / BRT_BLOCKSIZE / 8;
787 BT_SET(brtvd->bv_bitmap, idx);
768 ASSERT3U(idx, <, brtvd->bv_size);
769
770 ASSERT(brtvd->bv_totalcount > 0);
771 brtvd->bv_totalcount--;
772 brt_vdev_entcount_dec(brtvd, idx);
773 brtvd->bv_entcount_dirty = TRUE;
774 idx = idx / BRT_BLOCKSIZE / 8;
775 BT_SET(brtvd->bv_bitmap, idx);
788
789#ifdef ZFS_DEBUG
790 if (zfs_flags & ZFS_DEBUG_BRT)
791 brt_vdev_dump(brtvd);
792#endif
793}
794
795static void
776}
777
778static void
796brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
779brt_vdev_sync(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
797{
798 dmu_buf_t *db;
799 brt_vdev_phys_t *bvphys;
800
801 ASSERT(brtvd->bv_meta_dirty);
802 ASSERT(brtvd->bv_mos_brtvdev != 0);
803 ASSERT(dmu_tx_is_syncing(tx));
804
780{
781 dmu_buf_t *db;
782 brt_vdev_phys_t *bvphys;
783
784 ASSERT(brtvd->bv_meta_dirty);
785 ASSERT(brtvd->bv_mos_brtvdev != 0);
786 ASSERT(dmu_tx_is_syncing(tx));
787
805 VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
788 VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
789 FTAG, &db));
806
807 if (brtvd->bv_entcount_dirty) {
808 /*
809 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
810 */
790
791 if (brtvd->bv_entcount_dirty) {
792 /*
793 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
794 */
811 dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
795 dmu_write(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
812 brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
813 brtvd->bv_entcount, tx);
796 brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
797 brtvd->bv_entcount, tx);
814 memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks));
798 uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
799 memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(nblocks));
815 brtvd->bv_entcount_dirty = FALSE;
816 }
817
818 dmu_buf_will_dirty(db, tx);
819 bvphys = db->db_data;
820 bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
821 bvphys->bvp_size = brtvd->bv_size;
822 if (brtvd->bv_need_byteswap) {
823 bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
824 } else {
825 bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
826 }
827 bvphys->bvp_totalcount = brtvd->bv_totalcount;
800 brtvd->bv_entcount_dirty = FALSE;
801 }
802
803 dmu_buf_will_dirty(db, tx);
804 bvphys = db->db_data;
805 bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
806 bvphys->bvp_size = brtvd->bv_size;
807 if (brtvd->bv_need_byteswap) {
808 bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
809 } else {
810 bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
811 }
812 bvphys->bvp_totalcount = brtvd->bv_totalcount;
828 bvphys->bvp_rangesize = brt->brt_rangesize;
813 bvphys->bvp_rangesize = spa->spa_brt_rangesize;
829 bvphys->bvp_usedspace = brtvd->bv_usedspace;
830 bvphys->bvp_savedspace = brtvd->bv_savedspace;
831 dmu_buf_rele(db, FTAG);
832
833 brtvd->bv_meta_dirty = FALSE;
834}
835
836static void
814 bvphys->bvp_usedspace = brtvd->bv_usedspace;
815 bvphys->bvp_savedspace = brtvd->bv_savedspace;
816 dmu_buf_rele(db, FTAG);
817
818 brtvd->bv_meta_dirty = FALSE;
819}
820
821static void
837brt_vdevs_alloc(brt_t *brt, boolean_t load)
822brt_vdevs_free(spa_t *spa)
838{
823{
839 brt_vdev_t *brtvd;
840 uint64_t vdevid;
841
842 brt_wlock(brt);
843
844 brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children);
845
846 if (load) {
847 for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
848 brtvd = &brt->brt_vdevs[vdevid];
849 ASSERT(brtvd->bv_entcount == NULL);
850
851 brt_vdev_load(brt, brtvd);
852 }
853 }
854
855 if (brt->brt_rangesize == 0) {
856 brt->brt_rangesize = BRT_RANGESIZE;
857 }
858
859 brt_unlock(brt);
860}
861
862static void
863brt_vdevs_free(brt_t *brt)
864{
865 brt_vdev_t *brtvd;
866 uint64_t vdevid;
867
868 brt_wlock(brt);
869
870 for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
871 brtvd = &brt->brt_vdevs[vdevid];
824 if (spa->spa_brt_vdevs == 0)
825 return;
826 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
827 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
828 rw_enter(&brtvd->bv_lock, RW_WRITER);
872 if (brtvd->bv_initiated)
829 if (brtvd->bv_initiated)
873 brt_vdev_dealloc(brt, brtvd);
830 brt_vdev_dealloc(brtvd);
831 rw_exit(&brtvd->bv_lock);
832 rw_destroy(&brtvd->bv_lock);
833 if (brtvd->bv_mos_entries != 0)
834 dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
835 rw_destroy(&brtvd->bv_mos_entries_lock);
836 avl_destroy(&brtvd->bv_tree);
837 for (int i = 0; i < TXG_SIZE; i++)
838 avl_destroy(&brtvd->bv_pending_tree[i]);
839 mutex_destroy(&brtvd->bv_pending_lock);
840 kmem_free(brtvd, sizeof (*brtvd));
874 }
841 }
875 kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs);
876
877 brt_unlock(brt);
842 kmem_free(spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) *
843 spa->spa_brt_nvdevs);
878}
879
880static void
881brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
882{
883
844}
845
846static void
847brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
848{
849
884 bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]);
885 bre->bre_refcount = 0;
850 bre->bre_bp = *bp;
851 bre->bre_count = 0;
852 bre->bre_pcount = 0;
886
887 *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
888}
889
890static int
853
854 *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
855}
856
857static int
891brt_entry_compare(const void *x1, const void *x2)
858brt_entry_lookup(brt_vdev_t *brtvd, brt_entry_t *bre)
892{
859{
893 const brt_entry_t *bre1 = x1;
894 const brt_entry_t *bre2 = x2;
860 uint64_t off = BRE_OFFSET(bre);
895
861
896 return (TREE_CMP(bre1->bre_offset, bre2->bre_offset));
862 return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
863 &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), &bre->bre_count));
897}
898
864}
865
899static int
900brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre)
901{
902 uint64_t mos_entries;
903 int error;
904
905 ASSERT(RW_LOCK_HELD(&brt->brt_lock));
906
907 if (!brt_vdev_lookup(brt, brtvd, bre))
908 return (SET_ERROR(ENOENT));
909
910 /*
911 * Remember mos_entries object number. After we reacquire the BRT lock,
912 * the brtvd pointer may be invalid.
913 */
914 mos_entries = brtvd->bv_mos_entries;
915 if (mos_entries == 0)
916 return (SET_ERROR(ENOENT));
917
918 brt_unlock(brt);
919
920 error = zap_lookup_uint64(brt->brt_mos, mos_entries, &bre->bre_offset,
921 BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), &bre->bre_refcount);
922
923 brt_wlock(brt);
924
925 return (error);
926}
927
928static void
929brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
930{
931 brt_vdev_t *brtvd;
932 uint64_t mos_entries = 0;
933
934 brt_rlock(brt);
935 brtvd = brt_vdev(brt, vdevid);
936 if (brtvd != NULL)
937 mos_entries = brtvd->bv_mos_entries;
938 brt_unlock(brt);
939
940 if (mos_entries == 0)
941 return;
942
943 (void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
944 (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
945}
946
947/*
948 * Return TRUE if we _can_ have BRT entry for this bp. It might be false
949 * positive, but gives us quick answer if we should look into BRT, which
950 * may require reads and thus will be more expensive.
951 */
952boolean_t
953brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
954{
866/*
867 * Return TRUE if we _can_ have BRT entry for this bp. It might be false
868 * positive, but gives us quick answer if we should look into BRT, which
869 * may require reads and thus will be more expensive.
870 */
871boolean_t
872brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
873{
955 brt_t *brt = spa->spa_brt;
956 brt_vdev_t *brtvd;
957 brt_entry_t bre_search;
958 boolean_t mayexists = FALSE;
959 uint64_t vdevid;
960
874
961 brt_entry_fill(bp, &bre_search, &vdevid);
875 if (spa->spa_brt_nvdevs == 0)
876 return (B_FALSE);
962
877
963 brt_rlock(brt);
878 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
879 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
880 if (brtvd == NULL || !brtvd->bv_initiated)
881 return (FALSE);
964
882
965 brtvd = brt_vdev(brt, vdevid);
966 if (brtvd != NULL && brtvd->bv_initiated) {
967 if (!avl_is_empty(&brtvd->bv_tree) ||
968 brt_vdev_lookup(brt, brtvd, &bre_search)) {
969 mayexists = TRUE;
970 }
971 }
972
973 brt_unlock(brt);
974
975 return (mayexists);
883 /*
884 * We don't need locks here, since bv_entcount pointer must be
885 * stable at this point, and we don't care about false positive
886 * races here, while false negative should be impossible, since
887 * all brt_vdev_addref() have already completed by this point.
888 */
889 uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]);
890 return (brt_vdev_lookup(spa, brtvd, off));
976}
977
978uint64_t
979brt_get_dspace(spa_t *spa)
980{
891}
892
893uint64_t
894brt_get_dspace(spa_t *spa)
895{
981 brt_t *brt = spa->spa_brt;
982
983 if (brt == NULL)
896 if (spa->spa_brt_nvdevs == 0)
984 return (0);
985
897 return (0);
898
986 return (brt->brt_savedspace);
899 brt_rlock(spa);
900 uint64_t s = 0;
901 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)
902 s += spa->spa_brt_vdevs[vdevid]->bv_savedspace;
903 brt_unlock(spa);
904 return (s);
987}
988
989uint64_t
990brt_get_used(spa_t *spa)
991{
905}
906
907uint64_t
908brt_get_used(spa_t *spa)
909{
992 brt_t *brt = spa->spa_brt;
993
994 if (brt == NULL)
910 if (spa->spa_brt_nvdevs == 0)
995 return (0);
996
911 return (0);
912
997 return (brt->brt_usedspace);
913 brt_rlock(spa);
914 uint64_t s = 0;
915 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)
916 s += spa->spa_brt_vdevs[vdevid]->bv_usedspace;
917 brt_unlock(spa);
918 return (s);
998}
999
1000uint64_t
1001brt_get_saved(spa_t *spa)
1002{
919}
920
921uint64_t
922brt_get_saved(spa_t *spa)
923{
1003 brt_t *brt = spa->spa_brt;
1004
1005 if (brt == NULL)
1006 return (0);
1007
1008 return (brt->brt_savedspace);
924 return (brt_get_dspace(spa));
1009}
1010
1011uint64_t
1012brt_get_ratio(spa_t *spa)
1013{
925}
926
927uint64_t
928brt_get_ratio(spa_t *spa)
929{
1014 brt_t *brt = spa->spa_brt;
1015
1016 if (brt->brt_usedspace == 0)
930 uint64_t used = brt_get_used(spa);
931 if (used == 0)
1017 return (100);
932 return (100);
1018
1019 return ((brt->brt_usedspace + brt->brt_savedspace) * 100 /
1020 brt->brt_usedspace);
933 return ((used + brt_get_saved(spa)) * 100 / used);
1021}
1022
1023static int
1024brt_kstats_update(kstat_t *ksp, int rw)
1025{
1026 brt_stats_t *bs = ksp->ks_data;
1027
1028 if (rw == KSTAT_WRITE)
1029 return (EACCES);
1030
934}
935
936static int
937brt_kstats_update(kstat_t *ksp, int rw)
938{
939 brt_stats_t *bs = ksp->ks_data;
940
941 if (rw == KSTAT_WRITE)
942 return (EACCES);
943
1031 bs->brt_addref_entry_in_memory.value.ui64 =
1032 wmsum_value(&brt_sums.brt_addref_entry_in_memory);
1033 bs->brt_addref_entry_not_on_disk.value.ui64 =
1034 wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
1035 bs->brt_addref_entry_on_disk.value.ui64 =
1036 wmsum_value(&brt_sums.brt_addref_entry_on_disk);
944 bs->brt_addref_entry_not_on_disk.value.ui64 =
945 wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
946 bs->brt_addref_entry_on_disk.value.ui64 =
947 wmsum_value(&brt_sums.brt_addref_entry_on_disk);
1037 bs->brt_addref_entry_read_lost_race.value.ui64 =
1038 wmsum_value(&brt_sums.brt_addref_entry_read_lost_race);
1039 bs->brt_decref_entry_in_memory.value.ui64 =
1040 wmsum_value(&brt_sums.brt_decref_entry_in_memory);
1041 bs->brt_decref_entry_loaded_from_disk.value.ui64 =
1042 wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
1043 bs->brt_decref_entry_not_in_memory.value.ui64 =
1044 wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
948 bs->brt_decref_entry_in_memory.value.ui64 =
949 wmsum_value(&brt_sums.brt_decref_entry_in_memory);
950 bs->brt_decref_entry_loaded_from_disk.value.ui64 =
951 wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
952 bs->brt_decref_entry_not_in_memory.value.ui64 =
953 wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
1045 bs->brt_decref_entry_not_on_disk.value.ui64 =
1046 wmsum_value(&brt_sums.brt_decref_entry_not_on_disk);
1047 bs->brt_decref_entry_read_lost_race.value.ui64 =
1048 wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
1049 bs->brt_decref_entry_still_referenced.value.ui64 =
1050 wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
1051 bs->brt_decref_free_data_later.value.ui64 =
1052 wmsum_value(&brt_sums.brt_decref_free_data_later);
1053 bs->brt_decref_free_data_now.value.ui64 =
1054 wmsum_value(&brt_sums.brt_decref_free_data_now);
1055 bs->brt_decref_no_entry.value.ui64 =
1056 wmsum_value(&brt_sums.brt_decref_no_entry);
1057
1058 return (0);
1059}
1060
1061static void
1062brt_stat_init(void)
1063{
1064
954 bs->brt_decref_entry_read_lost_race.value.ui64 =
955 wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
956 bs->brt_decref_entry_still_referenced.value.ui64 =
957 wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
958 bs->brt_decref_free_data_later.value.ui64 =
959 wmsum_value(&brt_sums.brt_decref_free_data_later);
960 bs->brt_decref_free_data_now.value.ui64 =
961 wmsum_value(&brt_sums.brt_decref_free_data_now);
962 bs->brt_decref_no_entry.value.ui64 =
963 wmsum_value(&brt_sums.brt_decref_no_entry);
964
965 return (0);
966}
967
968static void
969brt_stat_init(void)
970{
971
1065 wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0);
1066 wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
1067 wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
972 wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
973 wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
1068 wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0);
1069 wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
1070 wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
1071 wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
974 wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
975 wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
976 wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
1072 wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0);
1073 wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
1074 wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
1075 wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
1076 wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
1077 wmsum_init(&brt_sums.brt_decref_no_entry, 0);
1078
1079 brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
1080 sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);

--- 7 unchanged lines hidden (view full) ---

1088static void
1089brt_stat_fini(void)
1090{
1091 if (brt_ksp != NULL) {
1092 kstat_delete(brt_ksp);
1093 brt_ksp = NULL;
1094 }
1095
977 wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
978 wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
979 wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
980 wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
981 wmsum_init(&brt_sums.brt_decref_no_entry, 0);
982
983 brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
984 sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);

--- 7 unchanged lines hidden (view full) ---

992static void
993brt_stat_fini(void)
994{
995 if (brt_ksp != NULL) {
996 kstat_delete(brt_ksp);
997 brt_ksp = NULL;
998 }
999
1096 wmsum_fini(&brt_sums.brt_addref_entry_in_memory);
1097 wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
1098 wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
1000 wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
1001 wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
1099 wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race);
1100 wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
1101 wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
1102 wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
1002 wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
1003 wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
1004 wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
1103 wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk);
1104 wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
1105 wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
1106 wmsum_fini(&brt_sums.brt_decref_free_data_later);
1107 wmsum_fini(&brt_sums.brt_decref_free_data_now);
1108 wmsum_fini(&brt_sums.brt_decref_no_entry);
1109}
1110
1111void
1112brt_init(void)
1113{
1114 brt_entry_cache = kmem_cache_create("brt_entry_cache",
1115 sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1005 wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
1006 wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
1007 wmsum_fini(&brt_sums.brt_decref_free_data_later);
1008 wmsum_fini(&brt_sums.brt_decref_free_data_now);
1009 wmsum_fini(&brt_sums.brt_decref_no_entry);
1010}
1011
1012void
1013brt_init(void)
1014{
1015 brt_entry_cache = kmem_cache_create("brt_entry_cache",
1016 sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1116 brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache",
1117 sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1118
1119 brt_stat_init();
1120}
1121
1122void
1123brt_fini(void)
1124{
1125 brt_stat_fini();
1126
1127 kmem_cache_destroy(brt_entry_cache);
1017
1018 brt_stat_init();
1019}
1020
1021void
1022brt_fini(void)
1023{
1024 brt_stat_fini();
1025
1026 kmem_cache_destroy(brt_entry_cache);
1128 kmem_cache_destroy(brt_pending_entry_cache);
1129}
1130
1027}
1028
1131static brt_entry_t *
1132brt_entry_alloc(const brt_entry_t *bre_init)
1133{
1134 brt_entry_t *bre;
1135
1136 bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1137 bre->bre_offset = bre_init->bre_offset;
1138 bre->bre_refcount = bre_init->bre_refcount;
1139
1140 return (bre);
1141}
1142
1143static void
1144brt_entry_free(brt_entry_t *bre)
1145{
1146
1147 kmem_cache_free(brt_entry_cache, bre);
1148}
1149
1150static void
1151brt_entry_addref(brt_t *brt, const blkptr_t *bp)
1152{
1153 brt_vdev_t *brtvd;
1154 brt_entry_t *bre, *racebre;
1155 brt_entry_t bre_search;
1156 avl_index_t where;
1157 uint64_t vdevid;
1158 int error;
1159
1160 ASSERT(!RW_WRITE_HELD(&brt->brt_lock));
1161
1162 brt_entry_fill(bp, &bre_search, &vdevid);
1163
1164 brt_wlock(brt);
1165
1166 brtvd = brt_vdev(brt, vdevid);
1167 if (brtvd == NULL) {
1168 ASSERT3U(vdevid, >=, brt->brt_nvdevs);
1169
1170 /* New VDEV was added. */
1171 brt_vdevs_expand(brt, vdevid + 1);
1172 brtvd = brt_vdev(brt, vdevid);
1173 }
1174 ASSERT(brtvd != NULL);
1175 if (!brtvd->bv_initiated)
1176 brt_vdev_realloc(brt, brtvd);
1177
1178 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1179 if (bre != NULL) {
1180 BRTSTAT_BUMP(brt_addref_entry_in_memory);
1181 } else {
1182 /*
1183 * brt_entry_lookup() may drop the BRT (read) lock and
1184 * reacquire it (write).
1185 */
1186 error = brt_entry_lookup(brt, brtvd, &bre_search);
1187 /* bre_search now contains correct bre_refcount */
1188 ASSERT(error == 0 || error == ENOENT);
1189 if (error == 0)
1190 BRTSTAT_BUMP(brt_addref_entry_on_disk);
1191 else
1192 BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
1193 /*
1194 * When the BRT lock was dropped, brt_vdevs[] may have been
1195 * expanded and reallocated, we need to update brtvd's pointer.
1196 */
1197 brtvd = brt_vdev(brt, vdevid);
1198 ASSERT(brtvd != NULL);
1199
1200 racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
1201 if (racebre == NULL) {
1202 bre = brt_entry_alloc(&bre_search);
1203 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
1204 avl_insert(&brtvd->bv_tree, bre, where);
1205 brt->brt_nentries++;
1206 } else {
1207 /*
1208 * The entry was added when the BRT lock was dropped in
1209 * brt_entry_lookup().
1210 */
1211 BRTSTAT_BUMP(brt_addref_entry_read_lost_race);
1212 bre = racebre;
1213 }
1214 }
1215 bre->bre_refcount++;
1216 brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
1217
1218 brt_unlock(brt);
1219}
1220
1221/* Return TRUE if block should be freed immediately. */
1222boolean_t
1223brt_entry_decref(spa_t *spa, const blkptr_t *bp)
1224{
1029/* Return TRUE if block should be freed immediately. */
1030boolean_t
1031brt_entry_decref(spa_t *spa, const blkptr_t *bp)
1032{
1225 brt_t *brt = spa->spa_brt;
1226 brt_vdev_t *brtvd;
1227 brt_entry_t *bre, *racebre;
1228 brt_entry_t bre_search;
1229 avl_index_t where;
1230 uint64_t vdevid;
1231 int error;
1232
1233 brt_entry_fill(bp, &bre_search, &vdevid);
1234
1033 brt_entry_t *bre, *racebre;
1034 brt_entry_t bre_search;
1035 avl_index_t where;
1036 uint64_t vdevid;
1037 int error;
1038
1039 brt_entry_fill(bp, &bre_search, &vdevid);
1040
1235 brt_wlock(brt);
1236
1237 brtvd = brt_vdev(brt, vdevid);
1041 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1238 ASSERT(brtvd != NULL);
1239
1042 ASSERT(brtvd != NULL);
1043
1044 rw_enter(&brtvd->bv_lock, RW_WRITER);
1045 ASSERT(brtvd->bv_initiated);
1240 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1241 if (bre != NULL) {
1242 BRTSTAT_BUMP(brt_decref_entry_in_memory);
1243 goto out;
1244 } else {
1245 BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
1246 }
1046 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1047 if (bre != NULL) {
1048 BRTSTAT_BUMP(brt_decref_entry_in_memory);
1049 goto out;
1050 } else {
1051 BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
1052 }
1053 rw_exit(&brtvd->bv_lock);
1247
1054
1248 /*
1249 * brt_entry_lookup() may drop the BRT lock and reacquire it.
1250 */
1251 error = brt_entry_lookup(brt, brtvd, &bre_search);
1252 /* bre_search now contains correct bre_refcount */
1253 ASSERT(error == 0 || error == ENOENT);
1254 /*
1255 * When the BRT lock was dropped, brt_vdevs[] may have been expanded
1256 * and reallocated, we need to update brtvd's pointer.
1257 */
1258 brtvd = brt_vdev(brt, vdevid);
1259 ASSERT(brtvd != NULL);
1260
1055 error = brt_entry_lookup(brtvd, &bre_search);
1056 /* bre_search now contains correct bre_count */
1261 if (error == ENOENT) {
1057 if (error == ENOENT) {
1262 BRTSTAT_BUMP(brt_decref_entry_not_on_disk);
1263 bre = NULL;
1264 goto out;
1058 BRTSTAT_BUMP(brt_decref_no_entry);
1059 return (B_TRUE);
1265 }
1060 }
1061 ASSERT0(error);
1266
1062
1063 rw_enter(&brtvd->bv_lock, RW_WRITER);
1267 racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
1268 if (racebre != NULL) {
1064 racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
1065 if (racebre != NULL) {
1269 /*
1270 * The entry was added when the BRT lock was dropped in
1271 * brt_entry_lookup().
1272 */
1066 /* The entry was added when the lock was dropped. */
1273 BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
1274 bre = racebre;
1275 goto out;
1276 }
1277
1278 BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
1067 BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
1068 bre = racebre;
1069 goto out;
1070 }
1071
1072 BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
1279 bre = brt_entry_alloc(&bre_search);
1280 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
1073 bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1074 bre->bre_bp = bre_search.bre_bp;
1075 bre->bre_count = bre_search.bre_count;
1076 bre->bre_pcount = 0;
1281 avl_insert(&brtvd->bv_tree, bre, where);
1077 avl_insert(&brtvd->bv_tree, bre, where);
1282 brt->brt_nentries++;
1283
1284out:
1078
1079out:
1285 if (bre == NULL) {
1286 /*
1287 * This is a free of a regular (not cloned) block.
1288 */
1289 brt_unlock(brt);
1290 BRTSTAT_BUMP(brt_decref_no_entry);
1291 return (B_TRUE);
1292 }
1293 if (bre->bre_refcount == 0) {
1294 brt_unlock(brt);
1080 if (bre->bre_count == 0) {
1081 rw_exit(&brtvd->bv_lock);
1295 BRTSTAT_BUMP(brt_decref_free_data_now);
1296 return (B_TRUE);
1297 }
1298
1082 BRTSTAT_BUMP(brt_decref_free_data_now);
1083 return (B_TRUE);
1084 }
1085
1299 ASSERT(bre->bre_refcount > 0);
1300 bre->bre_refcount--;
1301 if (bre->bre_refcount == 0)
1086 bre->bre_pcount--;
1087 ASSERT(bre->bre_count > 0);
1088 bre->bre_count--;
1089 if (bre->bre_count == 0)
1302 BRTSTAT_BUMP(brt_decref_free_data_later);
1303 else
1304 BRTSTAT_BUMP(brt_decref_entry_still_referenced);
1090 BRTSTAT_BUMP(brt_decref_free_data_later);
1091 else
1092 BRTSTAT_BUMP(brt_decref_entry_still_referenced);
1305 brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
1093 brt_vdev_decref(spa, brtvd, bre, bp_get_dsize_sync(spa, bp));
1306
1094
1307 brt_unlock(brt);
1095 rw_exit(&brtvd->bv_lock);
1308
1309 return (B_FALSE);
1310}
1311
1312uint64_t
1313brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
1314{
1096
1097 return (B_FALSE);
1098}
1099
1100uint64_t
1101brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
1102{
1315 brt_t *brt = spa->spa_brt;
1316 brt_vdev_t *brtvd;
1317 brt_entry_t bre_search, *bre;
1318 uint64_t vdevid, refcnt;
1319 int error;
1320
1321 brt_entry_fill(bp, &bre_search, &vdevid);
1322
1103 brt_entry_t bre_search, *bre;
1104 uint64_t vdevid, refcnt;
1105 int error;
1106
1107 brt_entry_fill(bp, &bre_search, &vdevid);
1108
1323 brt_rlock(brt);
1324
1325 brtvd = brt_vdev(brt, vdevid);
1109 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1326 ASSERT(brtvd != NULL);
1327
1110 ASSERT(brtvd != NULL);
1111
1112 rw_enter(&brtvd->bv_lock, RW_READER);
1113 ASSERT(brtvd->bv_initiated);
1328 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1329 if (bre == NULL) {
1114 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1115 if (bre == NULL) {
1330 error = brt_entry_lookup(brt, brtvd, &bre_search);
1331 ASSERT(error == 0 || error == ENOENT);
1332 if (error == ENOENT)
1116 rw_exit(&brtvd->bv_lock);
1117 error = brt_entry_lookup(brtvd, &bre_search);
1118 if (error == ENOENT) {
1333 refcnt = 0;
1119 refcnt = 0;
1334 else
1335 refcnt = bre_search.bre_refcount;
1336 } else
1337 refcnt = bre->bre_refcount;
1120 } else {
1121 ASSERT0(error);
1122 refcnt = bre_search.bre_count;
1123 }
1124 } else {
1125 refcnt = bre->bre_count;
1126 rw_exit(&brtvd->bv_lock);
1127 }
1338
1128
1339 brt_unlock(brt);
1340 return (refcnt);
1341}
1342
1343static void
1129 return (refcnt);
1130}
1131
1132static void
1344brt_prefetch(brt_t *brt, const blkptr_t *bp)
1133brt_prefetch(brt_vdev_t *brtvd, const blkptr_t *bp)
1345{
1134{
1346 brt_entry_t bre;
1347 uint64_t vdevid;
1348
1349 ASSERT(bp != NULL);
1350
1351 if (!brt_zap_prefetch)
1135 if (!brt_zap_prefetch || brtvd->bv_mos_entries == 0)
1352 return;
1353
1136 return;
1137
1354 brt_entry_fill(bp, &bre, &vdevid);
1355
1356 brt_entry_prefetch(brt, vdevid, &bre);
1138 uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]);
1139 rw_enter(&brtvd->bv_mos_entries_lock, RW_READER);
1140 if (brtvd->bv_mos_entries != 0) {
1141 (void) zap_prefetch_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
1142 &off, BRT_KEY_WORDS);
1143 }
1144 rw_exit(&brtvd->bv_mos_entries_lock);
1357}
1358
1359static int
1145}
1146
1147static int
1360brt_pending_entry_compare(const void *x1, const void *x2)
1148brt_entry_compare(const void *x1, const void *x2)
1361{
1149{
1362 const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2;
1363 const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp;
1364 int cmp;
1150 const brt_entry_t *bre1 = x1, *bre2 = x2;
1151 const blkptr_t *bp1 = &bre1->bre_bp, *bp2 = &bre2->bre_bp;
1365
1152
1366 cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]),
1367 DVA_GET_VDEV(&bp2->blk_dva[0]));
1368 if (cmp == 0) {
1369 cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
1370 DVA_GET_OFFSET(&bp2->blk_dva[0]));
1371 if (unlikely(cmp == 0)) {
1372 cmp = TREE_CMP(BP_GET_BIRTH(bp1), BP_GET_BIRTH(bp2));
1373 }
1374 }
1375
1376 return (cmp);
1153 return (TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
1154 DVA_GET_OFFSET(&bp2->blk_dva[0])));
1377}
1378
1379void
1380brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1381{
1155}
1156
1157void
1158brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1159{
1382 brt_t *brt;
1383 avl_tree_t *pending_tree;
1384 kmutex_t *pending_lock;
1385 brt_pending_entry_t *bpe, *newbpe;
1160 brt_entry_t *bre, *newbre;
1386 avl_index_t where;
1387 uint64_t txg;
1388
1161 avl_index_t where;
1162 uint64_t txg;
1163
1389 brt = spa->spa_brt;
1390 txg = dmu_tx_get_txg(tx);
1391 ASSERT3U(txg, !=, 0);
1164 txg = dmu_tx_get_txg(tx);
1165 ASSERT3U(txg, !=, 0);
1392 pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1393 pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
1394
1166
1395 newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP);
1396 newbpe->bpe_bp = *bp;
1397 newbpe->bpe_count = 1;
1167 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
1168 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_TRUE);
1169 avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];
1398
1170
1399 mutex_enter(pending_lock);
1171 newbre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1172 newbre->bre_bp = *bp;
1173 newbre->bre_count = 0;
1174 newbre->bre_pcount = 1;
1400
1175
1401 bpe = avl_find(pending_tree, newbpe, &where);
1402 if (bpe == NULL) {
1403 avl_insert(pending_tree, newbpe, where);
1404 newbpe = NULL;
1176 mutex_enter(&brtvd->bv_pending_lock);
1177 bre = avl_find(pending_tree, newbre, &where);
1178 if (bre == NULL) {
1179 avl_insert(pending_tree, newbre, where);
1180 newbre = NULL;
1405 } else {
1181 } else {
1406 bpe->bpe_count++;
1182 bre->bre_pcount++;
1407 }
1183 }
1184 mutex_exit(&brtvd->bv_pending_lock);
1408
1185
1409 mutex_exit(pending_lock);
1410
1411 if (newbpe != NULL) {
1412 ASSERT(bpe != NULL);
1413 ASSERT(bpe != newbpe);
1414 kmem_cache_free(brt_pending_entry_cache, newbpe);
1186 if (newbre != NULL) {
1187 ASSERT(bre != NULL);
1188 ASSERT(bre != newbre);
1189 kmem_cache_free(brt_entry_cache, newbre);
1415 } else {
1190 } else {
1416 ASSERT(bpe == NULL);
1191 ASSERT0P(bre);
1417
1418 /* Prefetch BRT entry for the syncing context. */
1192
1193 /* Prefetch BRT entry for the syncing context. */
1419 brt_prefetch(brt, bp);
1194 brt_prefetch(brtvd, bp);
1420 }
1421}
1422
1423void
1424brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1425{
1195 }
1196}
1197
1198void
1199brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1200{
1426 brt_t *brt;
1427 avl_tree_t *pending_tree;
1428 kmutex_t *pending_lock;
1429 brt_pending_entry_t *bpe, bpe_search;
1201 brt_entry_t *bre, bre_search;
1430 uint64_t txg;
1431
1202 uint64_t txg;
1203
1432 brt = spa->spa_brt;
1433 txg = dmu_tx_get_txg(tx);
1434 ASSERT3U(txg, !=, 0);
1204 txg = dmu_tx_get_txg(tx);
1205 ASSERT3U(txg, !=, 0);
1435 pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1436 pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
1437
1206
1438 bpe_search.bpe_bp = *bp;
1207 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
1208 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1209 ASSERT(brtvd != NULL);
1210 avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];
1439
1211
1440 mutex_enter(pending_lock);
1212 bre_search.bre_bp = *bp;
1441
1213
1442 bpe = avl_find(pending_tree, &bpe_search, NULL);
1443 /* I believe we should always find bpe when this function is called. */
1444 if (bpe != NULL) {
1445 ASSERT(bpe->bpe_count > 0);
1214 mutex_enter(&brtvd->bv_pending_lock);
1215 bre = avl_find(pending_tree, &bre_search, NULL);
1216 ASSERT(bre != NULL);
1217 ASSERT(bre->bre_pcount > 0);
1218 bre->bre_pcount--;
1219 if (bre->bre_pcount == 0)
1220 avl_remove(pending_tree, bre);
1221 else
1222 bre = NULL;
1223 mutex_exit(&brtvd->bv_pending_lock);
1446
1224
1447 bpe->bpe_count--;
1448 if (bpe->bpe_count == 0) {
1449 avl_remove(pending_tree, bpe);
1450 kmem_cache_free(brt_pending_entry_cache, bpe);
1451 }
1452 }
1453
1454 mutex_exit(pending_lock);
1225 if (bre)
1226 kmem_cache_free(brt_entry_cache, bre);
1455}
1456
1227}
1228
1457void
1458brt_pending_apply(spa_t *spa, uint64_t txg)
1229static void
1230brt_pending_apply_vdev(spa_t *spa, brt_vdev_t *brtvd, uint64_t txg)
1459{
1231{
1460 brt_t *brt = spa->spa_brt;
1461 brt_pending_entry_t *bpe;
1462 avl_tree_t *pending_tree;
1463 void *c;
1232 brt_entry_t *bre, *nbre;
1464
1233
1465 ASSERT3U(txg, !=, 0);
1466
1467 /*
1234 /*
1468 * We are in syncing context, so no other brt_pending_tree accesses
1469 * are possible for the TXG. Don't need to acquire brt_pending_lock.
1235 * We are in syncing context, so no other bv_pending_tree accesses
1236 * are possible for the TXG. So we don't need bv_pending_lock.
1470 */
1237 */
1471 pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1238 ASSERT(avl_is_empty(&brtvd->bv_tree));
1239 avl_swap(&brtvd->bv_tree, &brtvd->bv_pending_tree[txg & TXG_MASK]);
1472
1240
1473 c = NULL;
1474 while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) {
1475 boolean_t added_to_ddt;
1241 for (bre = avl_first(&brtvd->bv_tree); bre; bre = nbre) {
1242 nbre = AVL_NEXT(&brtvd->bv_tree, bre);
1476
1243
1477 for (int i = 0; i < bpe->bpe_count; i++) {
1478 /*
1479 * If the block has DEDUP bit set, it means that it
1480 * already exists in the DEDUP table, so we can just
1481 * use that instead of creating new entry in
1482 * the BRT table.
1483 */
1484 if (BP_GET_DEDUP(&bpe->bpe_bp)) {
1485 added_to_ddt = ddt_addref(spa, &bpe->bpe_bp);
1244 /*
1245 * If the block has DEDUP bit set, it means that it
1246 * already exists in the DEDUP table, so we can just
1247 * use that instead of creating new entry in the BRT.
1248 */
1249 if (BP_GET_DEDUP(&bre->bre_bp)) {
1250 while (bre->bre_pcount > 0) {
1251 if (!ddt_addref(spa, &bre->bre_bp))
1252 break;
1253 bre->bre_pcount--;
1254 }
1255 if (bre->bre_pcount == 0) {
1256 avl_remove(&brtvd->bv_tree, bre);
1257 kmem_cache_free(brt_entry_cache, bre);
1258 continue;
1259 }
1260 }
1261
1262 /*
1263 * Unless we know that the block is definitely not in ZAP,
1264 * try to get its reference count from there.
1265 */
1266 uint64_t off = BRE_OFFSET(bre);
1267 if (brtvd->bv_mos_entries != 0 &&
1268 brt_vdev_lookup(spa, brtvd, off)) {
1269 int error = zap_lookup_uint64_by_dnode(
1270 brtvd->bv_mos_entries_dnode, &off,
1271 BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
1272 &bre->bre_count);
1273 if (error == 0) {
1274 BRTSTAT_BUMP(brt_addref_entry_on_disk);
1486 } else {
1275 } else {
1487 added_to_ddt = B_FALSE;
1276 ASSERT3U(error, ==, ENOENT);
1277 BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
1488 }
1278 }
1489 if (!added_to_ddt)
1490 brt_entry_addref(brt, &bpe->bpe_bp);
1491 }
1279 }
1280 }
1492
1281
1493 kmem_cache_free(brt_pending_entry_cache, bpe);
1282 /*
1283 * If all the cloned blocks we had were handled by DDT, we don't need
1284 * to initiate the vdev.
1285 */
1286 if (avl_is_empty(&brtvd->bv_tree))
1287 return;
1288
1289 if (!brtvd->bv_initiated) {
1290 rw_enter(&brtvd->bv_lock, RW_WRITER);
1291 brt_vdev_realloc(spa, brtvd);
1292 rw_exit(&brtvd->bv_lock);
1494 }
1293 }
1294
1295 /*
1296 * Convert pending references into proper ones. This has to be a
1297 * separate loop, since entcount modifications would cause false
1298 * positives for brt_vdev_lookup() on following iterations.
1299 */
1300 for (bre = avl_first(&brtvd->bv_tree); bre;
1301 bre = AVL_NEXT(&brtvd->bv_tree, bre)) {
1302 brt_vdev_addref(spa, brtvd, bre,
1303 bp_get_dsize(spa, &bre->bre_bp), bre->bre_pcount);
1304 bre->bre_count += bre->bre_pcount;
1305 }
1495}
1496
1306}
1307
1308void
1309brt_pending_apply(spa_t *spa, uint64_t txg)
1310{
1311
1312 brt_rlock(spa);
1313 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1314 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1315 brt_unlock(spa);
1316
1317 brt_pending_apply_vdev(spa, brtvd, txg);
1318
1319 brt_rlock(spa);
1320 }
1321 brt_unlock(spa);
1322}
1323
1497static void
1498brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
1499{
1324static void
1325brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
1326{
1500 if (bre->bre_refcount == 0) {
1501 int error = zap_remove_uint64_by_dnode(dn, &bre->bre_offset,
1327 uint64_t off = BRE_OFFSET(bre);
1328
1329 if (bre->bre_pcount == 0) {
1330 /* The net change is zero, nothing to do in ZAP. */
1331 } else if (bre->bre_count == 0) {
1332 int error = zap_remove_uint64_by_dnode(dn, &off,
1502 BRT_KEY_WORDS, tx);
1503 VERIFY(error == 0 || error == ENOENT);
1504 } else {
1333 BRT_KEY_WORDS, tx);
1334 VERIFY(error == 0 || error == ENOENT);
1335 } else {
1505 VERIFY0(zap_update_uint64_by_dnode(dn, &bre->bre_offset,
1506 BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount),
1507 &bre->bre_refcount, tx));
1336 VERIFY0(zap_update_uint64_by_dnode(dn, &off,
1337 BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
1338 &bre->bre_count, tx));
1508 }
1509}
1510
1511static void
1339 }
1340}
1341
1342static void
1512brt_sync_table(brt_t *brt, dmu_tx_t *tx)
1343brt_sync_table(spa_t *spa, dmu_tx_t *tx)
1513{
1344{
1514 brt_vdev_t *brtvd;
1515 brt_entry_t *bre;
1345 brt_entry_t *bre;
1516 dnode_t *dn;
1517 uint64_t vdevid;
1518 void *c;
1519
1346
1520 brt_wlock(brt);
1347 brt_rlock(spa);
1348 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1349 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1350 brt_unlock(spa);
1521
1351
1522 for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
1523 brtvd = &brt->brt_vdevs[vdevid];
1524
1525 if (!brtvd->bv_initiated)
1526 continue;
1527
1528 if (!brtvd->bv_meta_dirty) {
1529 ASSERT(!brtvd->bv_entcount_dirty);
1530 ASSERT0(avl_numnodes(&brtvd->bv_tree));
1352 if (!brtvd->bv_meta_dirty) {
1353 ASSERT(!brtvd->bv_entcount_dirty);
1354 ASSERT0(avl_numnodes(&brtvd->bv_tree));
1355 brt_rlock(spa);
1531 continue;
1532 }
1533
1534 ASSERT(!brtvd->bv_entcount_dirty ||
1535 avl_numnodes(&brtvd->bv_tree) != 0);
1536
1537 if (brtvd->bv_mos_brtvdev == 0)
1356 continue;
1357 }
1358
1359 ASSERT(!brtvd->bv_entcount_dirty ||
1360 avl_numnodes(&brtvd->bv_tree) != 0);
1361
1362 if (brtvd->bv_mos_brtvdev == 0)
1538 brt_vdev_create(brt, brtvd, tx);
1363 brt_vdev_create(spa, brtvd, tx);
1539
1364
1540 VERIFY0(dnode_hold(brt->brt_mos, brtvd->bv_mos_entries,
1541 FTAG, &dn));
1542
1543 c = NULL;
1365 void *c = NULL;
1544 while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
1366 while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
1545 brt_sync_entry(dn, bre, tx);
1546 brt_entry_free(bre);
1547 ASSERT(brt->brt_nentries > 0);
1548 brt->brt_nentries--;
1367 brt_sync_entry(brtvd->bv_mos_entries_dnode, bre, tx);
1368 kmem_cache_free(brt_entry_cache, bre);
1549 }
1550
1369 }
1370
1551 dnode_rele(dn, FTAG);
1552
1553 brt_vdev_sync(brt, brtvd, tx);
1554
1371#ifdef ZFS_DEBUG
1372 if (zfs_flags & ZFS_DEBUG_BRT)
1373 brt_vdev_dump(brtvd);
1374#endif
1555 if (brtvd->bv_totalcount == 0)
1375 if (brtvd->bv_totalcount == 0)
1556 brt_vdev_destroy(brt, brtvd, tx);
1376 brt_vdev_destroy(spa, brtvd, tx);
1377 else
1378 brt_vdev_sync(spa, brtvd, tx);
1379 brt_rlock(spa);
1557 }
1380 }
1558
1559 ASSERT0(brt->brt_nentries);
1560
1561 brt_unlock(brt);
1381 brt_unlock(spa);
1562}
1563
1564void
1565brt_sync(spa_t *spa, uint64_t txg)
1566{
1567 dmu_tx_t *tx;
1382}
1383
1384void
1385brt_sync(spa_t *spa, uint64_t txg)
1386{
1387 dmu_tx_t *tx;
1568 brt_t *brt;
1388 uint64_t vdevid;
1569
1389
1570 ASSERT(spa_syncing_txg(spa) == txg);
1390 ASSERT3U(spa_syncing_txg(spa), ==, txg);
1571
1391
1572 brt = spa->spa_brt;
1573 brt_rlock(brt);
1574 if (brt->brt_nentries == 0) {
1575 /* No changes. */
1576 brt_unlock(brt);
1392 brt_rlock(spa);
1393 for (vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1394 if (spa->spa_brt_vdevs[vdevid]->bv_meta_dirty)
1395 break;
1396 }
1397 if (vdevid >= spa->spa_brt_nvdevs) {
1398 brt_unlock(spa);
1577 return;
1578 }
1399 return;
1400 }
1579 brt_unlock(brt);
1401 brt_unlock(spa);
1580
1581 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1402
1403 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1582
1583 brt_sync_table(brt, tx);
1584
1404 brt_sync_table(spa, tx);
1585 dmu_tx_commit(tx);
1586}
1587
1588static void
1405 dmu_tx_commit(tx);
1406}
1407
1408static void
1589brt_table_alloc(brt_t *brt)
1590{
1591
1592 for (int i = 0; i < TXG_SIZE; i++) {
1593 avl_create(&brt->brt_pending_tree[i],
1594 brt_pending_entry_compare,
1595 sizeof (brt_pending_entry_t),
1596 offsetof(brt_pending_entry_t, bpe_node));
1597 mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT,
1598 NULL);
1599 }
1600}
1601
1602static void
1603brt_table_free(brt_t *brt)
1604{
1605
1606 for (int i = 0; i < TXG_SIZE; i++) {
1607 ASSERT(avl_is_empty(&brt->brt_pending_tree[i]));
1608
1609 avl_destroy(&brt->brt_pending_tree[i]);
1610 mutex_destroy(&brt->brt_pending_lock[i]);
1611 }
1612}
1613
1614static void
1615brt_alloc(spa_t *spa)
1616{
1409brt_alloc(spa_t *spa)
1410{
1617 brt_t *brt;
1618
1619 ASSERT(spa->spa_brt == NULL);
1620
1621 brt = kmem_zalloc(sizeof (*brt), KM_SLEEP);
1622 rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL);
1623 brt->brt_spa = spa;
1624 brt->brt_rangesize = 0;
1625 brt->brt_nentries = 0;
1626 brt->brt_vdevs = NULL;
1627 brt->brt_nvdevs = 0;
1628 brt_table_alloc(brt);
1629
1630 spa->spa_brt = brt;
1411 rw_init(&spa->spa_brt_lock, NULL, RW_DEFAULT, NULL);
1412 spa->spa_brt_vdevs = NULL;
1413 spa->spa_brt_nvdevs = 0;
1414 spa->spa_brt_rangesize = 0;
1631}
1632
1633void
1634brt_create(spa_t *spa)
1635{
1415}
1416
1417void
1418brt_create(spa_t *spa)
1419{
1636
1637 brt_alloc(spa);
1420 brt_alloc(spa);
1638 brt_vdevs_alloc(spa->spa_brt, B_FALSE);
1421 spa->spa_brt_rangesize = BRT_RANGESIZE;
1639}
1640
1641int
1642brt_load(spa_t *spa)
1643{
1422}
1423
1424int
1425brt_load(spa_t *spa)
1426{
1427 int error = 0;
1644
1645 brt_alloc(spa);
1428
1429 brt_alloc(spa);
1646 brt_vdevs_alloc(spa->spa_brt, B_TRUE);
1430 brt_wlock(spa);
1431 for (uint64_t vdevid = 0; vdevid < spa->spa_root_vdev->vdev_children;
1432 vdevid++) {
1433 char name[64];
1434 uint64_t mos_brtvdev;
1647
1435
1648 return (0);
1436 /* Look if this vdev had active block cloning. */
1437 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
1438 (u_longlong_t)vdevid);
1439 error = zap_lookup(spa->spa_meta_objset,
1440 DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1,
1441 &mos_brtvdev);
1442 if (error == ENOENT) {
1443 error = 0;
1444 continue;
1445 }
1446 if (error != 0)
1447 break;
1448
1449 /* If it did, then allocate them all and load this one. */
1450 brt_vdevs_expand(spa, spa->spa_root_vdev->vdev_children);
1451 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1452 rw_enter(&brtvd->bv_lock, RW_WRITER);
1453 brtvd->bv_mos_brtvdev = mos_brtvdev;
1454 error = brt_vdev_load(spa, brtvd);
1455 rw_exit(&brtvd->bv_lock);
1456 if (error != 0)
1457 break;
1458 }
1459
1460 if (spa->spa_brt_rangesize == 0)
1461 spa->spa_brt_rangesize = BRT_RANGESIZE;
1462 brt_unlock(spa);
1463 return (error);
1649}
1650
1651void
1652brt_unload(spa_t *spa)
1653{
1464}
1465
1466void
1467brt_unload(spa_t *spa)
1468{
1654 brt_t *brt = spa->spa_brt;
1655
1656 if (brt == NULL)
1469 if (spa->spa_brt_rangesize == 0)
1657 return;
1470 return;
1658
1659 brt_vdevs_free(brt);
1660 brt_table_free(brt);
1661 rw_destroy(&brt->brt_lock);
1662 kmem_free(brt, sizeof (*brt));
1663 spa->spa_brt = NULL;
1471 brt_vdevs_free(spa);
1472 rw_destroy(&spa->spa_brt_lock);
1473 spa->spa_brt_rangesize = 0;
1664}
1665
1666/* BEGIN CSTYLED */
1667ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW,
1668 "Enable prefetching of BRT ZAP entries");
1669ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW,
1670 "BRT ZAP leaf blockshift");
1671ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW,
1672 "BRT ZAP indirect blockshift");
1673/* END CSTYLED */
1474}
1475
1476/* BEGIN CSTYLED */
1477ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW,
1478 "Enable prefetching of BRT ZAP entries");
1479ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW,
1480 "BRT ZAP leaf blockshift");
1481ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW,
1482 "BRT ZAP indirect blockshift");
1483/* END CSTYLED */