Lines Matching +full:data +full:- +full:mapping

1 // SPDX-License-Identifier: CDDL-1.0
43 * mapping from old location on the removed device to the new location
44 * on another device in the pool and use this mapping whenever we need
45 * to access the DVA. Unfortunately, this mapping did not respect
53 * - I/Os to this vdev use the callback to determine where the
54 * data is now located, and issue child I/Os for each segment's new
57 * - frees and claims to this vdev use the callback to free or claim
68 * that vdev's mapping may no longer be referenced (aka "obsolete"). We
69 * keep track of how much of each mapping entry is obsolete. When
71 * the memory used by the mapping. The complete picture of obsolescence
72 * is given by the following data structures, described below:
73 * - the entry-specific obsolete count
74 * - the vdev-specific obsolete spacemap
75 * - the pool-specific obsolete bpobj
77 * == On disk data structures used ==
84 * - Each vic_mapping_object (associated with an indirect vdev) can
87 * the mapping is condensed, entries from the vic_obsolete_sm_object
90 * corresponding mapping entry that were not referenced when the
91 * mapping was last condensed.
93 * - Each indirect or removing vdev can have a vic_obsolete_sm_object.
104 * - Each dataset can have a ds_remap_deadlist object. This is a
111 * - The pool can have a dp_obsolete_bpobj. This is a list of blocks
120 * - When freeing a block: if any DVA is on an indirect vdev, append to
122 * - When remapping a block, add dva to ds_remap_deadlist (if prev snap
124 * - When freeing a snapshot: move parts of ds_remap_deadlist to
126 * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
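The routing just summarized can be modeled standalone. The sketch below is not the ZFS code (blkptr_t, the obsolete bpobj, and the remap deadlists are reduced to stand-in types and a print); it only illustrates the decision of queuing frees that touch an indirect vdev for syncing-context processing instead of freeing them directly.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in types; the real blkptr_t/dva_t carry much more state. */
typedef struct dva { uint64_t d_vdev; uint64_t d_offset; } dva_t;
typedef struct blk { dva_t b_dva[3]; int b_ndvas; } blk_t;

/* Hypothetical predicate; ZFS checks vdev_ops == &vdev_indirect_ops. */
static bool vdev_is_indirect(uint64_t vdev_id) { return (vdev_id == 1); }

/*
 * Model of the free path described above: if any DVA lands on an
 * indirect vdev, the block is queued on the pool-wide obsolete bpobj
 * (or a dataset remap deadlist, not modeled here) and processed in
 * syncing context; otherwise it is freed directly.
 */
static void free_block(const blk_t *bp)
{
	for (int d = 0; d < bp->b_ndvas; d++) {
		if (vdev_is_indirect(bp->b_dva[d].d_vdev)) {
			printf("queue on dp_obsolete_bpobj: vdev %llu offset %llu\n",
			    (unsigned long long)bp->b_dva[d].d_vdev,
			    (unsigned long long)bp->b_dva[d].d_offset);
			return;
		}
	}
	printf("free directly from the concrete vdevs\n");
}

int main(void)
{
	blk_t bp = { .b_dva = { { .d_vdev = 1, .d_offset = 8192 } }, .b_ndvas = 1 };
	free_block(&bp);
	return (0);
}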
133 * Condensing an indirect vdev's mapping is the process of determining
134 * the precise counts of obsolete space for each mapping entry (by
136 * writing out a new mapping that contains only referenced entries.
138 * We condense a vdev when we expect the mapping to shrink (see
141 * open-context thread (spa_condense_indirect_thread) to incrementally
142 * create the new mapping object in a way that minimizes the impact on
145 * == Generating a new mapping ==
147 * To generate a new mapping, we follow these steps:
149 * 1. Save the old obsolete space map and create a new mapping object
157 * mapping entry, by incorporating the obsolete space map into the
160 * 3. Iterate through each mapping entry, writing to the new mapping any
162 * obsolete count == mapping length). (See
165 * 4. Destroy the old mapping object and switch over to the new one
173 * iterating where we left off: at vimp_max_offset of the new mapping
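As a standalone illustration of steps 3-4 and the restart logic above (simplified stand-in types, not the real vdev_indirect_mapping_* interfaces), the loop below copies only entries that are not fully obsolete and skips anything already written before a restart.

#include <stdint.h>

/* Simplified stand-ins for a mapping entry and a mapping object. */
typedef struct mapping_entry {
	uint64_t me_src_offset;	/* offset on the removed vdev */
	uint64_t me_asize;	/* length of the mapped segment */
	uint64_t me_obsolete;	/* bytes of this entry known to be obsolete */
} mapping_entry_t;

typedef struct mapping {
	mapping_entry_t *m_entries;
	uint64_t m_nentries;
	uint64_t m_max_offset;	/* analogous to vimp_max_offset */
} mapping_t;

static void
mapping_append(mapping_t *newm, const mapping_entry_t *e)
{
	/* In ZFS this goes through the per-txg lists and a sync task. */
	newm->m_entries[newm->m_nentries++] = *e;
	newm->m_max_offset = e->me_src_offset + e->me_asize;
}

/*
 * Copy every old entry that is still (partially) referenced into the
 * new mapping, resuming past what was already written before a crash.
 */
static void
condense_copy_entries(const mapping_t *old, mapping_t *newm)
{
	for (uint64_t i = 0; i < old->m_nentries; i++) {
		const mapping_entry_t *e = &old->m_entries[i];
		if (e->me_src_offset < newm->m_max_offset)
			continue;		/* written before restart */
		if (e->me_obsolete == e->me_asize)
			continue;		/* fully obsolete: drop */
		mapping_append(newm, e);
	}
}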
180 * Condense if at least this percent of the bytes in the mapping is
184 * i/o); lower values will reduce the mapping size more quickly.
197 * Don't bother condensing if the mapping uses less than this amount of
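A sketch of how tunables like these are typically combined (parameter names here are placeholders; the actual comparison lives in vdev_indirect_should_condense(), shown in fragments further down): condense only when enough of the mapped bytes are obsolete and the in-memory mapping is big enough for condensing to be worth the I/O.

#include <stdbool.h>
#include <stdint.h>

static bool
should_condense(uint64_t bytes_obsolete, uint64_t bytes_mapped,
    uint64_t mapping_bytes, uint64_t obsolete_pct, uint64_t min_mapping_bytes)
{
	if (bytes_mapped == 0)
		return (false);
	/* Percent-obsolete threshold AND a floor on the mapping size. */
	return (bytes_obsolete * 100 / bytes_mapped >= obsolete_pct &&
	    mapping_bytes > min_mapping_bytes);
}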
214 * expensive to check them all. Instead, try at most 100 randomly-selected
229 * need to read all copies of the data (e.g. for scrub or reconstruction).
230 * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
231 * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs,
244 int ic_error; /* set when a child does not contain the data */
249 * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
262 vdev_t *is_vdev; /* top-level vdev */
280 * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
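Reduced to their essentials, the three in-core structures nest roughly as below (simplified stand-ins, not the real declarations, which carry list links, abd_t buffers, and additional fields).

#include <stdint.h>

typedef struct vdev vdev_t;	/* opaque stand-in */
typedef struct abd abd_t;	/* opaque stand-in for a data buffer */

typedef struct indirect_child {
	abd_t	*ic_data;		/* this copy of the segment's data */
	struct indirect_child *ic_duplicate; /* earlier child with same data */
	vdev_t	*ic_vdev;		/* vdev this copy is read from */
	int	ic_error;		/* set when the child lacks the data */
} indirect_child_t;

typedef struct indirect_split {
	uint64_t is_split_offset;	/* offset of the segment in the zio */
	uint64_t is_target_offset;	/* offset of the segment on is_vdev */
	uint64_t is_size;
	vdev_t	*is_vdev;		/* concrete top-level vdev */
	int	is_children;		/* 1, or the mirror's child count */
	indirect_child_t is_child[1];	/* allocated with is_children slots */
} indirect_split_t;

typedef struct indirect_vsd {
	int iv_split_block;		/* block maps to more than one segment */
	int iv_reconstruct;		/* all copies read; must reconstruct */
	/* list of indirect_split_t hangs off iv_splits (omitted here) */
} indirect_vsd_t;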
295 indirect_vsd_t *iv = zio->io_vsd; in vdev_indirect_map_free()
298 while ((is = list_remove_head(&iv->iv_splits)) != NULL) { in vdev_indirect_map_free()
299 for (int c = 0; c < is->is_children; c++) { in vdev_indirect_map_free()
300 indirect_child_t *ic = &is->is_child[c]; in vdev_indirect_map_free()
301 if (ic->ic_data != NULL) in vdev_indirect_map_free()
302 abd_free(ic->ic_data); in vdev_indirect_map_free()
306 while ((ic = list_remove_head(&is->is_unique_child)) != NULL) in vdev_indirect_map_free()
309 list_destroy(&is->is_unique_child); in vdev_indirect_map_free()
312 offsetof(indirect_split_t, is_child[is->is_children])); in vdev_indirect_map_free()
327 spa_t *spa = vd->vdev_spa; in vdev_indirect_mark_obsolete()
329 ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0); in vdev_indirect_mark_obsolete()
330 ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); in vdev_indirect_mark_obsolete()
333 vd->vdev_indirect_mapping, offset) != NULL); in vdev_indirect_mark_obsolete()
336 mutex_enter(&vd->vdev_obsolete_lock); in vdev_indirect_mark_obsolete()
337 zfs_range_tree_add(vd->vdev_obsolete_segments, offset, size); in vdev_indirect_mark_obsolete()
338 mutex_exit(&vd->vdev_obsolete_lock); in vdev_indirect_mark_obsolete()
356 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); in spa_vdev_indirect_mark_obsolete()
364 &spa->spa_condensing_indirect_phys; in spa_condensing_indirect_create()
366 objset_t *mos = spa->spa_meta_objset; in spa_condensing_indirect_create()
369 list_create(&sci->sci_new_mapping_entries[i], in spa_condensing_indirect_create()
374 sci->sci_new_mapping = in spa_condensing_indirect_create()
375 vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object); in spa_condensing_indirect_create()
384 list_destroy(&sci->sci_new_mapping_entries[i]); in spa_condensing_indirect_destroy()
386 if (sci->sci_new_mapping != NULL) in spa_condensing_indirect_destroy()
387 vdev_indirect_mapping_close(sci->sci_new_mapping); in spa_condensing_indirect_destroy()
395 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; in vdev_indirect_should_condense()
396 spa_t *spa = vd->vdev_spa; in vdev_indirect_should_condense()
398 ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool)); in vdev_indirect_should_condense()
406 if (spa->spa_condensing_indirect != NULL) in vdev_indirect_should_condense()
413 * The mapping object size must not change while we are in vdev_indirect_should_condense()
417 if (vd->vdev_ops != &vdev_indirect_ops) in vdev_indirect_should_condense()
426 if (vd->vdev_obsolete_sm == NULL) { in vdev_indirect_should_condense()
431 ASSERT(vd->vdev_obsolete_sm != NULL); in vdev_indirect_should_condense()
433 ASSERT3U(obsolete_sm_obj, ==, space_map_object(vd->vdev_obsolete_sm)); in vdev_indirect_should_condense()
436 uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm); in vdev_indirect_should_condense()
438 uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm); in vdev_indirect_should_condense()
444 * obsolete, condense (unless the mapping is already small enough). in vdev_indirect_should_condense()
446 * by the mapping. in vdev_indirect_should_condense()
452 "spacemap covers %d%% of %lluMB mapping", in vdev_indirect_should_condense()
453 (u_longlong_t)vd->vdev_id, in vdev_indirect_should_condense()
466 (u_longlong_t)vd->vdev_id, in vdev_indirect_should_condense()
478 * mapping and replacing it with the new one.
484 spa_t *spa = dmu_tx_pool(tx)->dp_spa; in spa_condense_indirect_complete_sync()
486 &spa->spa_condensing_indirect_phys; in spa_condense_indirect_complete_sync()
487 vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev); in spa_condense_indirect_complete_sync()
488 vdev_indirect_config_t *vic = &vd->vdev_indirect_config; in spa_condense_indirect_complete_sync()
489 objset_t *mos = spa->spa_meta_objset; in spa_condense_indirect_complete_sync()
490 vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; in spa_condense_indirect_complete_sync()
493 vdev_indirect_mapping_num_entries(sci->sci_new_mapping); in spa_condense_indirect_complete_sync()
496 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); in spa_condense_indirect_complete_sync()
497 ASSERT3P(sci, ==, spa->spa_condensing_indirect); in spa_condense_indirect_complete_sync()
499 ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); in spa_condense_indirect_complete_sync()
501 ASSERT(vic->vic_mapping_object != 0); in spa_condense_indirect_complete_sync()
502 ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); in spa_condense_indirect_complete_sync()
503 ASSERT(scip->scip_next_mapping_object != 0); in spa_condense_indirect_complete_sync()
504 ASSERT(scip->scip_prev_obsolete_sm_object != 0); in spa_condense_indirect_complete_sync()
509 rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER); in spa_condense_indirect_complete_sync()
510 vdev_indirect_mapping_close(vd->vdev_indirect_mapping); in spa_condense_indirect_complete_sync()
511 vd->vdev_indirect_mapping = sci->sci_new_mapping; in spa_condense_indirect_complete_sync()
512 rw_exit(&vd->vdev_indirect_rwlock); in spa_condense_indirect_complete_sync()
514 sci->sci_new_mapping = NULL; in spa_condense_indirect_complete_sync()
515 vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); in spa_condense_indirect_complete_sync()
516 vic->vic_mapping_object = scip->scip_next_mapping_object; in spa_condense_indirect_complete_sync()
517 scip->scip_next_mapping_object = 0; in spa_condense_indirect_complete_sync()
519 space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx); in spa_condense_indirect_complete_sync()
521 scip->scip_prev_obsolete_sm_object = 0; in spa_condense_indirect_complete_sync()
523 scip->scip_vdev = 0; in spa_condense_indirect_complete_sync()
527 spa_condensing_indirect_destroy(spa->spa_condensing_indirect); in spa_condense_indirect_complete_sync()
528 spa->spa_condensing_indirect = NULL; in spa_condense_indirect_complete_sync()
531 "new mapping object %llu has %llu entries " in spa_condense_indirect_complete_sync()
533 (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx), in spa_condense_indirect_complete_sync()
534 (u_longlong_t)vic->vic_mapping_object, in spa_condense_indirect_complete_sync()
537 vdev_config_dirty(spa->spa_root_vdev); in spa_condense_indirect_complete_sync()
541 * This sync task appends entries to the new mapping object.
548 spa_t *spa __maybe_unused = dmu_tx_pool(tx)->dp_spa; in spa_condense_indirect_commit_sync()
551 ASSERT3P(sci, ==, spa->spa_condensing_indirect); in spa_condense_indirect_commit_sync()
553 vdev_indirect_mapping_add_entries(sci->sci_new_mapping, in spa_condense_indirect_commit_sync()
554 &sci->sci_new_mapping_entries[txg & TXG_MASK], tx); in spa_condense_indirect_commit_sync()
555 ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK])); in spa_condense_indirect_commit_sync()
559 * Open-context function to add one entry to the new mapping. The new
566 spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; in spa_condense_indirect_commit_entry()
568 ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst)); in spa_condense_indirect_commit_entry()
570 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); in spa_condense_indirect_commit_entry()
579 if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) { in spa_condense_indirect_commit_entry()
586 vime->vime_mapping = *vimep; in spa_condense_indirect_commit_entry()
587 vime->vime_obsolete_count = count; in spa_condense_indirect_commit_entry()
588 list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime); in spa_condense_indirect_commit_entry()
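The pattern used by these two functions — open context appends an entry to the list for the txg its tx was assigned to, and the sync task for that txg drains exactly that list — can be modeled standalone. TXG_SIZE and the list handling are simplified here; the real code uses list_t and inserts at the tail.

#include <stdint.h>
#include <stddef.h>

#define	TXG_SIZE	4		/* stand-in for the real TXG_SIZE */
#define	TXG_MASK	(TXG_SIZE - 1)

typedef struct entry { struct entry *next; uint64_t value; } entry_t;

static entry_t *pending[TXG_SIZE];

static void
commit_entry(uint64_t txg, entry_t *e)		/* open context */
{
	e->next = pending[txg & TXG_MASK];
	pending[txg & TXG_MASK] = e;
}

static entry_t *
commit_sync(uint64_t txg)			/* syncing context for txg */
{
	entry_t *list = pending[txg & TXG_MASK];
	pending[txg & TXG_MASK] = NULL;		/* list is drained each txg */
	return (list);
}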
597 spa_t *spa = vd->vdev_spa; in spa_condense_indirect_generate_new_mapping()
599 vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; in spa_condense_indirect_generate_new_mapping()
603 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); in spa_condense_indirect_generate_new_mapping()
604 ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev); in spa_condense_indirect_generate_new_mapping()
607 (u_longlong_t)vd->vdev_id, in spa_condense_indirect_generate_new_mapping()
614 "at index %llu", (u_longlong_t)vd->vdev_id, in spa_condense_indirect_generate_new_mapping()
620 &old_mapping->vim_entries[mapi]; in spa_condense_indirect_generate_new_mapping()
621 uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst); in spa_condense_indirect_generate_new_mapping()
647 return (spa->spa_condensing_indirect != NULL); in spa_condense_indirect_thread_check()
656 ASSERT3P(spa->spa_condensing_indirect, !=, NULL); in spa_condense_indirect_thread()
658 vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev); in spa_condense_indirect_thread()
662 spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; in spa_condense_indirect_thread()
664 &spa->spa_condensing_indirect_phys; in spa_condense_indirect_thread()
667 vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; in spa_condense_indirect_thread()
670 ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); in spa_condense_indirect_thread()
671 ASSERT(scip->scip_next_mapping_object != 0); in spa_condense_indirect_thread()
672 ASSERT(scip->scip_prev_obsolete_sm_object != 0); in spa_condense_indirect_thread()
673 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); in spa_condense_indirect_thread()
683 ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); in spa_condense_indirect_thread()
686 VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, in spa_condense_indirect_thread()
687 scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); in spa_condense_indirect_thread()
696 * Generate new mapping. Determine what index to continue from in spa_condense_indirect_thread()
698 * new mapping. in spa_condense_indirect_thread()
701 vdev_indirect_mapping_max_offset(sci->sci_new_mapping); in spa_condense_indirect_thread()
703 /* We haven't written anything to the new mapping yet. */ in spa_condense_indirect_thread()
721 * We've already written the whole new mapping. in spa_condense_indirect_thread()
728 start_index = entry - old_mapping->vim_entries; in spa_condense_indirect_thread()
759 spa_t *spa = vd->vdev_spa; in spa_condense_indirect_start_sync()
761 &spa->spa_condensing_indirect_phys; in spa_condense_indirect_start_sync()
763 ASSERT0(scip->scip_next_mapping_object); in spa_condense_indirect_start_sync()
764 ASSERT0(scip->scip_prev_obsolete_sm_object); in spa_condense_indirect_start_sync()
765 ASSERT0(scip->scip_vdev); in spa_condense_indirect_start_sync()
767 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); in spa_condense_indirect_start_sync()
769 ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping)); in spa_condense_indirect_start_sync()
775 scip->scip_vdev = vd->vdev_id; in spa_condense_indirect_start_sync()
776 scip->scip_next_mapping_object = in spa_condense_indirect_start_sync()
777 vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx); in spa_condense_indirect_start_sync()
779 scip->scip_prev_obsolete_sm_object = obsolete_sm_obj; in spa_condense_indirect_start_sync()
785 space_map_close(vd->vdev_obsolete_sm); in spa_condense_indirect_start_sync()
786 vd->vdev_obsolete_sm = NULL; in spa_condense_indirect_start_sync()
787 VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, in spa_condense_indirect_start_sync()
790 VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, in spa_condense_indirect_start_sync()
795 ASSERT3P(spa->spa_condensing_indirect, ==, NULL); in spa_condense_indirect_start_sync()
796 spa->spa_condensing_indirect = spa_condensing_indirect_create(spa); in spa_condense_indirect_start_sync()
800 (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx), in spa_condense_indirect_start_sync()
801 (u_longlong_t)scip->scip_prev_obsolete_sm_object, in spa_condense_indirect_start_sync()
802 (u_longlong_t)scip->scip_next_mapping_object); in spa_condense_indirect_start_sync()
804 zthr_wakeup(spa->spa_condense_zthr); in spa_condense_indirect_start_sync()
816 spa_t *spa = vd->vdev_spa; in vdev_indirect_sync_obsolete()
817 vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config; in vdev_indirect_sync_obsolete()
819 ASSERT3U(vic->vic_mapping_object, !=, 0); in vdev_indirect_sync_obsolete()
820 ASSERT(zfs_range_tree_space(vd->vdev_obsolete_segments) > 0); in vdev_indirect_sync_obsolete()
821 ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); in vdev_indirect_sync_obsolete()
827 obsolete_sm_object = space_map_alloc(spa->spa_meta_objset, in vdev_indirect_sync_obsolete()
830 ASSERT(vd->vdev_top_zap != 0); in vdev_indirect_sync_obsolete()
831 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, in vdev_indirect_sync_obsolete()
838 VERIFY0(space_map_open(&vd->vdev_obsolete_sm, in vdev_indirect_sync_obsolete()
839 spa->spa_meta_objset, obsolete_sm_object, in vdev_indirect_sync_obsolete()
840 0, vd->vdev_asize, 0)); in vdev_indirect_sync_obsolete()
843 ASSERT(vd->vdev_obsolete_sm != NULL); in vdev_indirect_sync_obsolete()
845 space_map_object(vd->vdev_obsolete_sm)); in vdev_indirect_sync_obsolete()
847 space_map_write(vd->vdev_obsolete_sm, in vdev_indirect_sync_obsolete()
848 vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx); in vdev_indirect_sync_obsolete()
849 zfs_range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); in vdev_indirect_sync_obsolete()
855 int error = zap_lookup(spa->spa_meta_objset, in spa_condense_init()
858 sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t), in spa_condense_init()
859 &spa->spa_condensing_indirect_phys); in spa_condense_init()
862 spa->spa_condensing_indirect = in spa_condense_init()
876 if (spa->spa_condensing_indirect != NULL) { in spa_condense_fini()
877 spa_condensing_indirect_destroy(spa->spa_condensing_indirect); in spa_condense_fini()
878 spa->spa_condensing_indirect = NULL; in spa_condense_fini()
885 ASSERT3P(spa->spa_condense_zthr, ==, NULL); in spa_start_indirect_condensing_thread()
886 spa->spa_condense_zthr = zthr_create("z_indirect_condense", in spa_start_indirect_condensing_thread()
899 ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); in vdev_obsolete_sm_object()
901 if (vd->vdev_top_zap == 0) { in vdev_obsolete_sm_object()
906 int error = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, in vdev_obsolete_sm_object()
924 ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); in vdev_obsolete_counts_are_precise()
926 if (vd->vdev_top_zap == 0) { in vdev_obsolete_counts_are_precise()
932 int error = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, in vdev_obsolete_counts_are_precise()
954 *psize = *max_psize = vd->vdev_asize + in vdev_indirect_open()
956 *logical_ashift = vd->vdev_ashift; in vdev_indirect_open()
957 *physical_ashift = vd->vdev_physical_ashift; in vdev_indirect_open()
973 rs->rs_vd = vd; in rs_alloc()
974 rs->rs_offset = offset; in rs_alloc()
975 rs->rs_asize = asize; in rs_alloc()
976 rs->rs_split_offset = split_offset; in rs_alloc()
982 * physical entries of the indirect mapping that correspond to the extent
984 * is populated with the number of mapping entries that were duplicated.
987 * This ensures that the mapping won't change due to condensing as we
998 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; in vdev_indirect_mapping_duplicate_adjacent_entries()
1001 ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock)); in vdev_indirect_mapping_duplicate_adjacent_entries()
1009 uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); in vdev_indirect_mapping_duplicate_adjacent_entries()
1014 uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m); in vdev_indirect_mapping_duplicate_adjacent_entries()
1015 uint64_t inner_size = MIN(asize, size - inner_offset); in vdev_indirect_mapping_duplicate_adjacent_entries()
1018 asize -= inner_size; in vdev_indirect_mapping_duplicate_adjacent_entries()
1046 * has been split into multiple sections in our mapping, we keep track
1050 * this scenario the callbacks in each split block won't occur in-order in
1063 spa_t *spa = vd->vdev_spa; in vdev_indirect_remap()
1070 vdev_t *v = rs->rs_vd; in vdev_indirect_remap()
1074 ASSERT(rs->rs_asize > 0); in vdev_indirect_remap()
1079 * prevent the mapping from being changed by condensing. in vdev_indirect_remap()
1084 * our copy of the mapping. Once we are done with the in vdev_indirect_remap()
1086 * of the indirect mapping entries that are relevant to it. in vdev_indirect_remap()
1095 rw_enter(&v->vdev_indirect_rwlock, RW_READER); in vdev_indirect_remap()
1096 ASSERT3P(v->vdev_indirect_mapping, !=, NULL); in vdev_indirect_remap()
1098 vdev_indirect_mapping_entry_phys_t *mapping = in vdev_indirect_remap()
1100 rs->rs_offset, rs->rs_asize, &num_entries); in vdev_indirect_remap()
1101 ASSERT3P(mapping, !=, NULL); in vdev_indirect_remap()
1103 rw_exit(&v->vdev_indirect_rwlock); in vdev_indirect_remap()
1114 vdev_indirect_mapping_entry_phys_t *m = &mapping[i]; in vdev_indirect_remap()
1117 ASSERT3U(rs->rs_asize, >, 0); in vdev_indirect_remap()
1119 uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); in vdev_indirect_remap()
1120 uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst); in vdev_indirect_remap()
1121 uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst); in vdev_indirect_remap()
1123 ASSERT3U(rs->rs_offset, >=, in vdev_indirect_remap()
1125 ASSERT3U(rs->rs_offset, <, in vdev_indirect_remap()
1127 ASSERT3U(dst_vdev, !=, v->vdev_id); in vdev_indirect_remap()
1129 uint64_t inner_offset = rs->rs_offset - in vdev_indirect_remap()
1132 MIN(rs->rs_asize, size - inner_offset); in vdev_indirect_remap()
1137 if (dst_v->vdev_ops == &vdev_indirect_ops) { in vdev_indirect_remap()
1140 inner_size, rs->rs_split_offset)); in vdev_indirect_remap()
1155 func(rs->rs_split_offset + inner_half, dst_v, in vdev_indirect_remap()
1159 func(rs->rs_split_offset, dst_v, in vdev_indirect_remap()
1163 func(rs->rs_split_offset, dst_v, in vdev_indirect_remap()
1168 rs->rs_offset += inner_size; in vdev_indirect_remap()
1169 rs->rs_asize -= inner_size; in vdev_indirect_remap()
1170 rs->rs_split_offset += inner_size; in vdev_indirect_remap()
1172 VERIFY0(rs->rs_asize); in vdev_indirect_remap()
1174 kmem_free(mapping, num_entries * sizeof (*mapping)); in vdev_indirect_remap()
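The walk above hands each contiguous segment of the original extent to a caller-supplied callback; a minimal standalone sketch of that callback shape follows (vdev_t reduced to an opaque type; in this file the real callers are the free/claim paths and vdev_indirect_gather_splits()).

#include <stdint.h>
#include <stdio.h>

typedef struct vdev vdev_t;	/* opaque stand-in for the real vdev_t */

/*
 * Shape of a vdev_indirect_remap() callback as used here: split_offset
 * is the segment's offset within the original extent, and vd/offset/size
 * describe where that segment now lives.
 */
static void
print_segment(uint64_t split_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	(void) vd; (void) arg;
	printf("segment at +%llu -> offset %llu, %llu bytes\n",
	    (unsigned long long)split_offset,
	    (unsigned long long)offset,
	    (unsigned long long)size);
}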
1183 zio_t *pio = zio->io_private; in vdev_indirect_child_io_done()
1185 mutex_enter(&pio->io_lock); in vdev_indirect_child_io_done()
1186 pio->io_error = zio_worst_error(pio->io_error, zio->io_error); in vdev_indirect_child_io_done()
1187 mutex_exit(&pio->io_lock); in vdev_indirect_child_io_done()
1189 abd_free(zio->io_abd); in vdev_indirect_child_io_done()
1201 indirect_vsd_t *iv = zio->io_vsd; in vdev_indirect_gather_splits()
1205 if (vd->vdev_ops == &vdev_indirect_ops) in vdev_indirect_gather_splits()
1209 if (vd->vdev_ops == &vdev_mirror_ops) in vdev_indirect_gather_splits()
1210 n = vd->vdev_children; in vdev_indirect_gather_splits()
1215 is->is_children = n; in vdev_indirect_gather_splits()
1216 is->is_size = size; in vdev_indirect_gather_splits()
1217 is->is_split_offset = split_offset; in vdev_indirect_gather_splits()
1218 is->is_target_offset = offset; in vdev_indirect_gather_splits()
1219 is->is_vdev = vd; in vdev_indirect_gather_splits()
1220 list_create(&is->is_unique_child, sizeof (indirect_child_t), in vdev_indirect_gather_splits()
1224 * Note that we only consider multiple copies of the data for in vdev_indirect_gather_splits()
1229 if (vd->vdev_ops == &vdev_mirror_ops) { in vdev_indirect_gather_splits()
1231 is->is_child[i].ic_vdev = vd->vdev_child[i]; in vdev_indirect_gather_splits()
1232 list_link_init(&is->is_child[i].ic_node); in vdev_indirect_gather_splits()
1235 is->is_child[0].ic_vdev = vd; in vdev_indirect_gather_splits()
1238 list_insert_tail(&iv->iv_splits, is); in vdev_indirect_gather_splits()
1244 indirect_child_t *ic = zio->io_private; in vdev_indirect_read_split_done()
1246 if (zio->io_error != 0) { in vdev_indirect_read_split_done()
1248 * Clear ic_data to indicate that we do not have data for this in vdev_indirect_read_split_done()
1251 abd_free(ic->ic_data); in vdev_indirect_read_split_done()
1252 ic->ic_data = NULL; in vdev_indirect_read_split_done()
1262 indirect_vsd_t *iv = zio->io_vsd; in vdev_indirect_read_all()
1264 ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); in vdev_indirect_read_all()
1266 for (indirect_split_t *is = list_head(&iv->iv_splits); in vdev_indirect_read_all()
1267 is != NULL; is = list_next(&iv->iv_splits, is)) { in vdev_indirect_read_all()
1268 for (int i = 0; i < is->is_children; i++) { in vdev_indirect_read_all()
1269 indirect_child_t *ic = &is->is_child[i]; in vdev_indirect_read_all()
1271 if (!vdev_readable(ic->ic_vdev)) in vdev_indirect_read_all()
1275 * If a child is missing the data, set ic_error. Used in vdev_indirect_read_all()
1280 if (vdev_dtl_contains(ic->ic_vdev, DTL_MISSING, in vdev_indirect_read_all()
1281 zio->io_txg, 1)) in vdev_indirect_read_all()
1282 ic->ic_error = SET_ERROR(ESTALE); in vdev_indirect_read_all()
1284 ic->ic_data = abd_alloc_sametype(zio->io_abd, in vdev_indirect_read_all()
1285 is->is_size); in vdev_indirect_read_all()
1286 ic->ic_duplicate = NULL; in vdev_indirect_read_all()
1289 ic->ic_vdev, is->is_target_offset, ic->ic_data, in vdev_indirect_read_all()
1290 is->is_size, zio->io_type, zio->io_priority, 0, in vdev_indirect_read_all()
1294 iv->iv_reconstruct = B_TRUE; in vdev_indirect_read_all()
1300 spa_t *spa __maybe_unused = zio->io_spa; in vdev_indirect_io_start()
1302 list_create(&iv->iv_splits, in vdev_indirect_io_start()
1305 zio->io_vsd = iv; in vdev_indirect_io_start()
1306 zio->io_vsd_ops = &vdev_indirect_vsd_ops; in vdev_indirect_io_start()
1309 if (zio->io_type != ZIO_TYPE_READ) { in vdev_indirect_io_start()
1310 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); in vdev_indirect_io_start()
1315 ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL | in vdev_indirect_io_start()
1319 vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size, in vdev_indirect_io_start()
1322 indirect_split_t *first = list_head(&iv->iv_splits); in vdev_indirect_io_start()
1324 if (first->is_size == zio->io_size) { in vdev_indirect_io_start()
1327 * data, which will checksum the same as the original data. in vdev_indirect_io_start()
1336 * on non-indirect vdevs. This allows us to be less strict in vdev_indirect_io_start()
1339 ASSERT0(first->is_split_offset); in vdev_indirect_io_start()
1340 ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL); in vdev_indirect_io_start()
1341 zio_nowait(zio_vdev_child_io(zio, zio->io_bp, in vdev_indirect_io_start()
1342 first->is_vdev, first->is_target_offset, in vdev_indirect_io_start()
1343 abd_get_offset(zio->io_abd, 0), in vdev_indirect_io_start()
1344 zio->io_size, zio->io_type, zio->io_priority, 0, in vdev_indirect_io_start()
1347 iv->iv_split_block = B_TRUE; in vdev_indirect_io_start()
1348 if (zio->io_type == ZIO_TYPE_READ && in vdev_indirect_io_start()
1349 zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) { in vdev_indirect_io_start()
1359 * split segment, from the top-level vdev. Since in vdev_indirect_io_start()
1362 * we get the right data. E.g. if it's a mirror, in vdev_indirect_io_start()
1370 for (indirect_split_t *is = list_head(&iv->iv_splits); in vdev_indirect_io_start()
1371 is != NULL; is = list_next(&iv->iv_splits, is)) { in vdev_indirect_io_start()
1373 is->is_vdev, is->is_target_offset, in vdev_indirect_io_start()
1374 abd_get_offset_size(zio->io_abd, in vdev_indirect_io_start()
1375 is->is_split_offset, is->is_size), in vdev_indirect_io_start()
1376 is->is_size, zio->io_type, in vdev_indirect_io_start()
1377 zio->io_priority, 0, in vdev_indirect_io_start()
1394 vdev_t *vd = ic->ic_vdev; in vdev_indirect_checksum_error()
1396 if (zio->io_flags & ZIO_FLAG_SPECULATIVE) in vdev_indirect_checksum_error()
1399 mutex_enter(&vd->vdev_stat_lock); in vdev_indirect_checksum_error()
1400 vd->vdev_stat.vs_checksum_errors++; in vdev_indirect_checksum_error()
1401 mutex_exit(&vd->vdev_stat_lock); in vdev_indirect_checksum_error()
1404 abd_t *bad_abd = ic->ic_data; in vdev_indirect_checksum_error()
1405 abd_t *good_abd = is->is_good_child->ic_data; in vdev_indirect_checksum_error()
1406 (void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, in vdev_indirect_checksum_error()
1407 is->is_target_offset, is->is_size, good_abd, bad_abd, &zbc); in vdev_indirect_checksum_error()
1412 * each split segment's correct data (is_good_child's ic_data) with each
1413 * other copy of the data. If they differ, then we overwrite the bad data
1415 * if a vdev is missing a copy of the data we set ic_error and the read is
1420 * (based on which copies actually read bad data, as opposed to which we
1427 indirect_vsd_t *iv = zio->io_vsd; in vdev_indirect_repair()
1429 if (!spa_writeable(zio->io_spa)) in vdev_indirect_repair()
1432 for (indirect_split_t *is = list_head(&iv->iv_splits); in vdev_indirect_repair()
1433 is != NULL; is = list_next(&iv->iv_splits, is)) { in vdev_indirect_repair()
1434 for (int c = 0; c < is->is_children; c++) { in vdev_indirect_repair()
1435 indirect_child_t *ic = &is->is_child[c]; in vdev_indirect_repair()
1436 if (ic == is->is_good_child) in vdev_indirect_repair()
1438 if (ic->ic_data == NULL) in vdev_indirect_repair()
1440 if (ic->ic_duplicate == is->is_good_child) in vdev_indirect_repair()
1444 ic->ic_vdev, is->is_target_offset, in vdev_indirect_repair()
1445 is->is_good_child->ic_data, is->is_size, in vdev_indirect_repair()
1452 * a copy of the data, so suppress incrementing the in vdev_indirect_repair()
1455 if (ic->ic_error == ESTALE) in vdev_indirect_repair()
1469 indirect_vsd_t *iv = zio->io_vsd; in vdev_indirect_all_checksum_errors()
1471 if (zio->io_flags & ZIO_FLAG_SPECULATIVE) in vdev_indirect_all_checksum_errors()
1474 for (indirect_split_t *is = list_head(&iv->iv_splits); in vdev_indirect_all_checksum_errors()
1475 is != NULL; is = list_next(&iv->iv_splits, is)) { in vdev_indirect_all_checksum_errors()
1476 for (int c = 0; c < is->is_children; c++) { in vdev_indirect_all_checksum_errors()
1477 indirect_child_t *ic = &is->is_child[c]; in vdev_indirect_all_checksum_errors()
1479 if (ic->ic_data == NULL) in vdev_indirect_all_checksum_errors()
1482 vdev_t *vd = ic->ic_vdev; in vdev_indirect_all_checksum_errors()
1484 mutex_enter(&vd->vdev_stat_lock); in vdev_indirect_all_checksum_errors()
1485 vd->vdev_stat.vs_checksum_errors++; in vdev_indirect_all_checksum_errors()
1486 mutex_exit(&vd->vdev_stat_lock); in vdev_indirect_all_checksum_errors()
1487 (void) zfs_ereport_post_checksum(zio->io_spa, vd, in vdev_indirect_all_checksum_errors()
1488 NULL, zio, is->is_target_offset, is->is_size, in vdev_indirect_all_checksum_errors()
1495 * Copy data from all the splits to a main zio then validate the checksum.
1503 for (indirect_split_t *is = list_head(&iv->iv_splits); in vdev_indirect_splits_checksum_validate()
1504 is != NULL; is = list_next(&iv->iv_splits, is)) { in vdev_indirect_splits_checksum_validate()
1506 ASSERT3P(is->is_good_child->ic_data, !=, NULL); in vdev_indirect_splits_checksum_validate()
1507 ASSERT3P(is->is_good_child->ic_duplicate, ==, NULL); in vdev_indirect_splits_checksum_validate()
1509 abd_copy_off(zio->io_abd, is->is_good_child->ic_data, in vdev_indirect_splits_checksum_validate()
1510 is->is_split_offset, 0, is->is_size); in vdev_indirect_splits_checksum_validate()
1528 iv->iv_attempts = 0; in vdev_indirect_splits_enumerate_all()
1530 for (indirect_split_t *is = list_head(&iv->iv_splits); in vdev_indirect_splits_enumerate_all()
1531 is != NULL; is = list_next(&iv->iv_splits, is)) in vdev_indirect_splits_enumerate_all()
1532 is->is_good_child = list_head(&is->is_unique_child); in vdev_indirect_splits_enumerate_all()
1535 iv->iv_attempts++; in vdev_indirect_splits_enumerate_all()
1541 for (indirect_split_t *is = list_head(&iv->iv_splits); in vdev_indirect_splits_enumerate_all()
1542 is != NULL; is = list_next(&iv->iv_splits, is)) { in vdev_indirect_splits_enumerate_all()
1543 is->is_good_child = list_next(&is->is_unique_child, in vdev_indirect_splits_enumerate_all()
1544 is->is_good_child); in vdev_indirect_splits_enumerate_all()
1545 if (is->is_good_child != NULL) { in vdev_indirect_splits_enumerate_all()
1550 is->is_good_child = list_head(&is->is_unique_child); in vdev_indirect_splits_enumerate_all()
1554 ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations); in vdev_indirect_splits_enumerate_all()
1567 iv->iv_attempts = 0; in vdev_indirect_splits_enumerate_randomly()
1569 while (iv->iv_attempts < iv->iv_attempts_max) { in vdev_indirect_splits_enumerate_randomly()
1570 iv->iv_attempts++; in vdev_indirect_splits_enumerate_randomly()
1572 for (indirect_split_t *is = list_head(&iv->iv_splits); in vdev_indirect_splits_enumerate_randomly()
1573 is != NULL; is = list_next(&iv->iv_splits, is)) { in vdev_indirect_splits_enumerate_randomly()
1574 indirect_child_t *ic = list_head(&is->is_unique_child); in vdev_indirect_splits_enumerate_randomly()
1575 int children = is->is_unique_children; in vdev_indirect_splits_enumerate_randomly()
1577 for (int i = random_in_range(children); i > 0; i--) in vdev_indirect_splits_enumerate_randomly()
1578 ic = list_next(&is->is_unique_child, ic); in vdev_indirect_splits_enumerate_randomly()
1581 is->is_good_child = ic; in vdev_indirect_splits_enumerate_randomly()
1603 for (indirect_split_t *is = list_head(&iv->iv_splits); in vdev_indirect_splits_damage()
1604 is != NULL; is = list_next(&iv->iv_splits, is)) { in vdev_indirect_splits_damage()
1605 is->is_unique_children = 0; in vdev_indirect_splits_damage()
1607 for (int i = 0; i < is->is_children; i++) { in vdev_indirect_splits_damage()
1608 indirect_child_t *ic = &is->is_child[i]; in vdev_indirect_splits_damage()
1609 if (ic->ic_data != NULL) { in vdev_indirect_splits_damage()
1610 is->is_unique_children++; in vdev_indirect_splits_damage()
1611 list_insert_tail(&is->is_unique_child, ic); in vdev_indirect_splits_damage()
1615 if (list_is_empty(&is->is_unique_child)) { in vdev_indirect_splits_damage()
1622 * Set each is_good_child to a randomly-selected child which in vdev_indirect_splits_damage()
1623 * is known to contain validated data. in vdev_indirect_splits_damage()
1633 * Set iv->iv_attempts_max such that all unique combinations will in vdev_indirect_splits_damage()
1636 iv->iv_attempts_max = 1; in vdev_indirect_splits_damage()
1638 for (indirect_split_t *is = list_head(&iv->iv_splits); in vdev_indirect_splits_damage()
1639 is != NULL; is = list_next(&iv->iv_splits, is)) { in vdev_indirect_splits_damage()
1640 for (int c = 0; c < is->is_children; c++) { in vdev_indirect_splits_damage()
1641 indirect_child_t *ic = &is->is_child[c]; in vdev_indirect_splits_damage()
1643 if (ic == is->is_good_child) in vdev_indirect_splits_damage()
1645 if (ic->ic_data == NULL) in vdev_indirect_splits_damage()
1648 abd_zero(ic->ic_data, abd_get_size(ic->ic_data)); in vdev_indirect_splits_damage()
1651 iv->iv_attempts_max *= 2; in vdev_indirect_splits_damage()
1652 if (iv->iv_attempts_max >= (1ULL << 12)) { in vdev_indirect_splits_damage()
1653 iv->iv_attempts_max = UINT64_MAX; in vdev_indirect_splits_damage()
1660 for (indirect_split_t *is = list_head(&iv->iv_splits); in vdev_indirect_splits_damage()
1661 is != NULL; is = list_next(&iv->iv_splits, is)) { in vdev_indirect_splits_damage()
1663 while ((ic = list_remove_head(&is->is_unique_child)) != NULL) in vdev_indirect_splits_damage()
1666 is->is_unique_children = 0; in vdev_indirect_splits_damage()
1673 * This function is called when we have read all copies of the data and need
1688 * 2-way mirror with unique copies, we will have the following pieces of data:
1698 * combinations, which is similar to bitwise-little-endian counting in
1714 * Note that the split segments may be on the same or different top-level
1718 * the correct data, as long as those errors are at sufficiently-separated
1719 * offsets (specifically, separated by the largest block size - default of
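The "bitwise-little-endian counting" described above can be modeled standalone: each split segment is a digit whose base is its number of unique children, and every combination is visited like an odometer.

#include <stdbool.h>
#include <stdio.h>

/* Advance to the next combination; returns false once all are visited. */
static bool
next_combination(int *choice, const int *nunique, int nsplits)
{
	for (int s = 0; s < nsplits; s++) {
		if (++choice[s] < nunique[s])
			return (true);	/* advanced this digit */
		choice[s] = 0;		/* digit wrapped; carry to next one */
	}
	return (false);			/* wrapped past the last digit */
}

int
main(void)
{
	int nunique[] = { 2, 2 };	/* two segments, two unique copies each */
	int choice[] = { 0, 0 };
	do {
		printf("try copies {%d, %d}\n", choice[0], choice[1]);
		/* ...assemble the candidate block and verify its checksum... */
	} while (next_combination(choice, nunique, 2));
	return (0);
}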
1725 indirect_vsd_t *iv = zio->io_vsd; in vdev_indirect_reconstruct_io_done()
1729 iv->iv_unique_combinations = 1; in vdev_indirect_reconstruct_io_done()
1730 iv->iv_attempts_max = UINT64_MAX; in vdev_indirect_reconstruct_io_done()
1733 iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max; in vdev_indirect_reconstruct_io_done()
1751 for (indirect_split_t *is = list_head(&iv->iv_splits); in vdev_indirect_reconstruct_io_done()
1752 is != NULL; is = list_next(&iv->iv_splits, is)) { in vdev_indirect_reconstruct_io_done()
1753 is->is_unique_children = 0; in vdev_indirect_reconstruct_io_done()
1755 for (int i = 0; i < is->is_children; i++) { in vdev_indirect_reconstruct_io_done()
1756 indirect_child_t *ic_i = &is->is_child[i]; in vdev_indirect_reconstruct_io_done()
1758 if (ic_i->ic_data == NULL || in vdev_indirect_reconstruct_io_done()
1759 ic_i->ic_duplicate != NULL) in vdev_indirect_reconstruct_io_done()
1762 for (int j = i + 1; j < is->is_children; j++) { in vdev_indirect_reconstruct_io_done()
1763 indirect_child_t *ic_j = &is->is_child[j]; in vdev_indirect_reconstruct_io_done()
1765 if (ic_j->ic_data == NULL || in vdev_indirect_reconstruct_io_done()
1766 ic_j->ic_duplicate != NULL) in vdev_indirect_reconstruct_io_done()
1769 if (abd_cmp(ic_i->ic_data, ic_j->ic_data) == 0) in vdev_indirect_reconstruct_io_done()
1770 ic_j->ic_duplicate = ic_i; in vdev_indirect_reconstruct_io_done()
1773 is->is_unique_children++; in vdev_indirect_reconstruct_io_done()
1774 list_insert_tail(&is->is_unique_child, ic_i); in vdev_indirect_reconstruct_io_done()
1778 EQUIV(list_is_empty(&is->is_unique_child), in vdev_indirect_reconstruct_io_done()
1779 is->is_unique_children == 0); in vdev_indirect_reconstruct_io_done()
1780 if (list_is_empty(&is->is_unique_child)) { in vdev_indirect_reconstruct_io_done()
1781 zio->io_error = EIO; in vdev_indirect_reconstruct_io_done()
1787 iv->iv_unique_combinations *= is->is_unique_children; in vdev_indirect_reconstruct_io_done()
1790 if (iv->iv_unique_combinations <= iv->iv_attempts_max) in vdev_indirect_reconstruct_io_done()
1798 zio->io_error = error; in vdev_indirect_reconstruct_io_done()
1815 indirect_vsd_t *iv = zio->io_vsd; in vdev_indirect_io_done()
1817 if (iv->iv_reconstruct) { in vdev_indirect_io_done()
1819 * We have read all copies of the data (e.g. from mirrors), in vdev_indirect_io_done()
1821 * one-copy read didn't checksum correctly. in vdev_indirect_io_done()
1827 if (!iv->iv_split_block) { in vdev_indirect_io_done()
1841 * will be reported to the top-level VDEV. in vdev_indirect_io_done()
1843 if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) { in vdev_indirect_io_done()
1844 zio->io_error = ret; in vdev_indirect_io_done()
1845 zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; in vdev_indirect_io_done()
1906 "Minimum obsolete percent of bytes in the mapping "
1910 "Don't bother condensing if the mapping uses less than this amount of "