161145dc2SMartin Matuska // SPDX-License-Identifier: CDDL-1.0
2eda14cbcSMatt Macy /*
3eda14cbcSMatt Macy * CDDL HEADER START
4eda14cbcSMatt Macy *
5eda14cbcSMatt Macy * The contents of this file are subject to the terms of the
6eda14cbcSMatt Macy * Common Development and Distribution License (the "License").
7eda14cbcSMatt Macy * You may not use this file except in compliance with the License.
8eda14cbcSMatt Macy *
9eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10271171e0SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0.
11eda14cbcSMatt Macy * See the License for the specific language governing permissions
12eda14cbcSMatt Macy * and limitations under the License.
13eda14cbcSMatt Macy *
14eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each
15eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the
17eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying
18eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner]
19eda14cbcSMatt Macy *
20eda14cbcSMatt Macy * CDDL HEADER END
21eda14cbcSMatt Macy */
22eda14cbcSMatt Macy
23eda14cbcSMatt Macy /*
24eda14cbcSMatt Macy * Copyright (c) 2017 by Delphix. All rights reserved.
25eda14cbcSMatt Macy */
26eda14cbcSMatt Macy
27eda14cbcSMatt Macy /*
28eda14cbcSMatt Macy * Storage Pool Checkpoint
29eda14cbcSMatt Macy *
30eda14cbcSMatt Macy * A storage pool checkpoint can be thought of as a pool-wide snapshot or
31eda14cbcSMatt Macy * a stable version of extreme rewind that guarantees no blocks from the
32eda14cbcSMatt Macy * checkpointed state will have been overwritten. It remembers the entire
 * state of the storage pool (e.g. snapshots, dataset names, etc.) from the
34eda14cbcSMatt Macy * point that it was taken and the user can rewind back to that point even if
35eda14cbcSMatt Macy * they applied destructive operations on their datasets or even enabled new
36eda14cbcSMatt Macy * zpool on-disk features. If a pool has a checkpoint that is no longer
37eda14cbcSMatt Macy * needed, the user can discard it.
38eda14cbcSMatt Macy *
39eda14cbcSMatt Macy * == On disk data structures used ==
40eda14cbcSMatt Macy *
41eda14cbcSMatt Macy * - The pool has a new feature flag and a new entry in the MOS. The feature
42eda14cbcSMatt Macy * flag is set to active when we create the checkpoint and remains active
43eda14cbcSMatt Macy * until the checkpoint is fully discarded. The entry in the MOS config
44eda14cbcSMatt Macy * (DMU_POOL_ZPOOL_CHECKPOINT) is populated with the uberblock that
45eda14cbcSMatt Macy * references the state of the pool when we take the checkpoint. The entry
46eda14cbcSMatt Macy * remains populated until we start discarding the checkpoint or we rewind
47eda14cbcSMatt Macy * back to it.
48eda14cbcSMatt Macy *
49eda14cbcSMatt Macy * - Each vdev contains a vdev-wide space map while the pool has a checkpoint,
50eda14cbcSMatt Macy * which persists until the checkpoint is fully discarded. The space map
51eda14cbcSMatt Macy * contains entries that have been freed in the current state of the pool
52eda14cbcSMatt Macy * but we want to keep around in case we decide to rewind to the checkpoint.
53eda14cbcSMatt Macy * [see vdev_checkpoint_sm]
54eda14cbcSMatt Macy *
55eda14cbcSMatt Macy * - Each metaslab's ms_sm space map behaves the same as without the
56eda14cbcSMatt Macy * checkpoint, with the only exception being the scenario when we free
57eda14cbcSMatt Macy * blocks that belong to the checkpoint. In this case, these blocks remain
58eda14cbcSMatt Macy * ALLOCATED in the metaslab's space map and they are added as FREE in the
59eda14cbcSMatt Macy * vdev's checkpoint space map.
60eda14cbcSMatt Macy *
 * - Each uberblock has a field (ub_checkpoint_txg) which holds the txg in
 *   which the uberblock was checkpointed. For normal uberblocks this field
 *   is 0.
63eda14cbcSMatt Macy *
64eda14cbcSMatt Macy * == Overview of operations ==
65eda14cbcSMatt Macy *
66eda14cbcSMatt Macy * - To create a checkpoint, we first wait for the current TXG to be synced,
67eda14cbcSMatt Macy * so we can use the most recently synced uberblock (spa_ubsync) as the
68eda14cbcSMatt Macy * checkpointed uberblock. Then we use an early synctask to place that
69eda14cbcSMatt Macy * uberblock in MOS config, increment the feature flag for the checkpoint
70eda14cbcSMatt Macy * (marking it active), and setting spa_checkpoint_txg (see its use below)
 *   to the TXG of the checkpointed uberblock. We use an early synctask for
 *   the aforementioned operations to ensure that no blocks were dirtied
 *   between the current TXG and the TXG of the checkpointed uberblock
 *   (i.e. the previous txg).
75eda14cbcSMatt Macy *
76eda14cbcSMatt Macy * - When a checkpoint exists, we need to ensure that the blocks that
77eda14cbcSMatt Macy * belong to the checkpoint are freed but never reused. This means that
78eda14cbcSMatt Macy * these blocks should never end up in the ms_allocatable or the ms_freeing
79eda14cbcSMatt Macy * trees of a metaslab. Therefore, whenever there is a checkpoint the new
80eda14cbcSMatt Macy * ms_checkpointing tree is used in addition to the aforementioned ones.
81eda14cbcSMatt Macy *
 *   Whenever a block is freed and we find out that it is referenced by the
 *   checkpoint (we find out by comparing its birth to spa_checkpoint_txg),
 *   we place it in the ms_checkpointing tree instead of the ms_freeing tree.
 *   This way, we divide the blocks that are being freed into checkpointed
 *   and not-checkpointed blocks.
87eda14cbcSMatt Macy *
 *   In order to persist these frees, we write the extents from the
 *   ms_freeing tree to the ms_sm as usual, and the extents from the
90eda14cbcSMatt Macy * ms_checkpointing tree to the vdev_checkpoint_sm. This way, these
91eda14cbcSMatt Macy * checkpointed extents will remain allocated in the metaslab's ms_sm space
92eda14cbcSMatt Macy * map, and therefore won't be reused [see metaslab_sync()]. In addition,
93eda14cbcSMatt Macy * when we discard the checkpoint, we can find the entries that have
94eda14cbcSMatt Macy * actually been freed in vdev_checkpoint_sm.
95eda14cbcSMatt Macy * [see spa_checkpoint_discard_thread_sync()]
96eda14cbcSMatt Macy *
97eda14cbcSMatt Macy * - To discard the checkpoint we use an early synctask to delete the
98eda14cbcSMatt Macy * checkpointed uberblock from the MOS config, set spa_checkpoint_txg to 0,
99eda14cbcSMatt Macy * and wakeup the discarding zthr thread (an open-context async thread).
100eda14cbcSMatt Macy * We use an early synctask to ensure that the operation happens before any
101eda14cbcSMatt Macy * new data end up in the checkpoint's data structures.
102eda14cbcSMatt Macy *
103eda14cbcSMatt Macy * Once the synctask is done and the discarding zthr is awake, we discard
104eda14cbcSMatt Macy * the checkpointed data over multiple TXGs by having the zthr prefetching
105eda14cbcSMatt Macy * entries from vdev_checkpoint_sm and then starting a synctask that places
106eda14cbcSMatt Macy * them as free blocks into their respective ms_allocatable and ms_sm
107eda14cbcSMatt Macy * structures.
108eda14cbcSMatt Macy * [see spa_checkpoint_discard_thread()]
109eda14cbcSMatt Macy *
110eda14cbcSMatt Macy * When there are no entries left in the vdev_checkpoint_sm of all
111eda14cbcSMatt Macy * top-level vdevs, a final synctask runs that decrements the feature flag.
112eda14cbcSMatt Macy *
113eda14cbcSMatt Macy * - To rewind to the checkpoint, we first use the current uberblock and
114eda14cbcSMatt Macy * open the MOS so we can access the checkpointed uberblock from the MOS
115eda14cbcSMatt Macy * config. After we retrieve the checkpointed uberblock, we use it as the
116eda14cbcSMatt Macy * current uberblock for the pool by writing it to disk with an updated
117eda14cbcSMatt Macy * TXG, opening its version of the MOS, and moving on as usual from there.
118eda14cbcSMatt Macy * [see spa_ld_checkpoint_rewind()]
119eda14cbcSMatt Macy *
120eda14cbcSMatt Macy * An important note on rewinding to the checkpoint has to do with how we
121eda14cbcSMatt Macy * handle ZIL blocks. In the scenario of a rewind, we clear out any ZIL
122eda14cbcSMatt Macy * blocks that have not been claimed by the time we took the checkpoint
123eda14cbcSMatt Macy * as they should no longer be valid.
124eda14cbcSMatt Macy * [see comment in zil_claim()]
125eda14cbcSMatt Macy *
126eda14cbcSMatt Macy * == Miscellaneous information ==
127eda14cbcSMatt Macy *
128eda14cbcSMatt Macy * - In the hypothetical event that we take a checkpoint, remove a vdev,
129eda14cbcSMatt Macy * and attempt to rewind, the rewind would fail as the checkpointed
130eda14cbcSMatt Macy * uberblock would reference data in the removed device. For this reason
131eda14cbcSMatt Macy * and others of similar nature, we disallow the following operations that
132eda14cbcSMatt Macy * can change the config:
133eda14cbcSMatt Macy * vdev removal and attach/detach, mirror splitting, and pool reguid.
134eda14cbcSMatt Macy *
135eda14cbcSMatt Macy * - As most of the checkpoint logic is implemented in the SPA and doesn't
136eda14cbcSMatt Macy * distinguish datasets when it comes to space accounting, having a
137eda14cbcSMatt Macy * checkpoint can potentially break the boundaries set by dataset
138eda14cbcSMatt Macy * reservations.
139eda14cbcSMatt Macy */
140eda14cbcSMatt Macy
141eda14cbcSMatt Macy #include <sys/dmu_tx.h>
142eda14cbcSMatt Macy #include <sys/dsl_dir.h>
143eda14cbcSMatt Macy #include <sys/dsl_synctask.h>
144eda14cbcSMatt Macy #include <sys/metaslab_impl.h>
145eda14cbcSMatt Macy #include <sys/spa.h>
146eda14cbcSMatt Macy #include <sys/spa_impl.h>
147eda14cbcSMatt Macy #include <sys/spa_checkpoint.h>
148eda14cbcSMatt Macy #include <sys/vdev_impl.h>
149eda14cbcSMatt Macy #include <sys/zap.h>
150eda14cbcSMatt Macy #include <sys/zfeature.h>
151eda14cbcSMatt Macy
/*
 * Tunable: upper bound, in bytes, on how much of a top-level vdev's
 * checkpoint space map is prefetched for a single discard pass. The
 * per-pass entry budget of the discard synctask is derived from the
 * same value.
 *
 * A cap is needed because a top-level vdev whose checkpointed data was
 * heavily freed while the checkpoint existed can accumulate a very long
 * checkpoint space map, and holding all of it in memory at once could
 * be expensive. The default is 16 MiB.
 */
static uint64_t zfs_spa_discard_memory_limit = 16ULL << 20;
163eda14cbcSMatt Macy
164eda14cbcSMatt Macy int
spa_checkpoint_get_stats(spa_t * spa,pool_checkpoint_stat_t * pcs)165eda14cbcSMatt Macy spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs)
166eda14cbcSMatt Macy {
167eda14cbcSMatt Macy if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
168eda14cbcSMatt Macy return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));
169eda14cbcSMatt Macy
170da5137abSMartin Matuska memset(pcs, 0, sizeof (pool_checkpoint_stat_t));
171eda14cbcSMatt Macy
172eda14cbcSMatt Macy int error = zap_contains(spa_meta_objset(spa),
173eda14cbcSMatt Macy DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT);
174eda14cbcSMatt Macy ASSERT(error == 0 || error == ENOENT);
175eda14cbcSMatt Macy
176eda14cbcSMatt Macy if (error == ENOENT)
177eda14cbcSMatt Macy pcs->pcs_state = CS_CHECKPOINT_DISCARDING;
178eda14cbcSMatt Macy else
179eda14cbcSMatt Macy pcs->pcs_state = CS_CHECKPOINT_EXISTS;
180eda14cbcSMatt Macy
181eda14cbcSMatt Macy pcs->pcs_space = spa->spa_checkpoint_info.sci_dspace;
182eda14cbcSMatt Macy pcs->pcs_start_time = spa->spa_checkpoint_info.sci_timestamp;
183eda14cbcSMatt Macy
184eda14cbcSMatt Macy return (0);
185eda14cbcSMatt Macy }
186eda14cbcSMatt Macy
187eda14cbcSMatt Macy static void
spa_checkpoint_discard_complete_sync(void * arg,dmu_tx_t * tx)188eda14cbcSMatt Macy spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx)
189eda14cbcSMatt Macy {
190eda14cbcSMatt Macy spa_t *spa = arg;
191eda14cbcSMatt Macy
192eda14cbcSMatt Macy spa->spa_checkpoint_info.sci_timestamp = 0;
193eda14cbcSMatt Macy
194eda14cbcSMatt Macy spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
195eda14cbcSMatt Macy spa_notify_waiters(spa);
196eda14cbcSMatt Macy
197eda14cbcSMatt Macy spa_history_log_internal(spa, "spa discard checkpoint", tx,
198eda14cbcSMatt Macy "finished discarding checkpointed state from the pool");
199eda14cbcSMatt Macy }
200eda14cbcSMatt Macy
201eda14cbcSMatt Macy typedef struct spa_checkpoint_discard_sync_callback_arg {
202eda14cbcSMatt Macy vdev_t *sdc_vd;
203eda14cbcSMatt Macy uint64_t sdc_txg;
204eda14cbcSMatt Macy uint64_t sdc_entry_limit;
205eda14cbcSMatt Macy } spa_checkpoint_discard_sync_callback_arg_t;
206eda14cbcSMatt Macy
207eda14cbcSMatt Macy static int
spa_checkpoint_discard_sync_callback(space_map_entry_t * sme,void * arg)208eda14cbcSMatt Macy spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg)
209eda14cbcSMatt Macy {
210eda14cbcSMatt Macy spa_checkpoint_discard_sync_callback_arg_t *sdc = arg;
211eda14cbcSMatt Macy vdev_t *vd = sdc->sdc_vd;
212eda14cbcSMatt Macy metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
213eda14cbcSMatt Macy uint64_t end = sme->sme_offset + sme->sme_run;
214eda14cbcSMatt Macy
215eda14cbcSMatt Macy if (sdc->sdc_entry_limit == 0)
216eda14cbcSMatt Macy return (SET_ERROR(EINTR));
217eda14cbcSMatt Macy
218eda14cbcSMatt Macy /*
219eda14cbcSMatt Macy * Since the space map is not condensed, we know that
220eda14cbcSMatt Macy * none of its entries is crossing the boundaries of
221eda14cbcSMatt Macy * its respective metaslab.
222eda14cbcSMatt Macy *
223eda14cbcSMatt Macy * That said, there is no fundamental requirement that
224eda14cbcSMatt Macy * the checkpoint's space map entries should not cross
225eda14cbcSMatt Macy * metaslab boundaries. So if needed we could add code
226eda14cbcSMatt Macy * that handles metaslab-crossing segments in the future.
227eda14cbcSMatt Macy */
228eda14cbcSMatt Macy VERIFY3U(sme->sme_type, ==, SM_FREE);
229eda14cbcSMatt Macy VERIFY3U(sme->sme_offset, >=, ms->ms_start);
230eda14cbcSMatt Macy VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
231eda14cbcSMatt Macy
232eda14cbcSMatt Macy /*
233eda14cbcSMatt Macy * At this point we should not be processing any
234eda14cbcSMatt Macy * other frees concurrently, so the lock is technically
235eda14cbcSMatt Macy * unnecessary. We use the lock anyway though to
236eda14cbcSMatt Macy * potentially save ourselves from future headaches.
237eda14cbcSMatt Macy */
238eda14cbcSMatt Macy mutex_enter(&ms->ms_lock);
239b59a0cdeSMartin Matuska if (zfs_range_tree_is_empty(ms->ms_freeing))
240eda14cbcSMatt Macy vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg);
241b59a0cdeSMartin Matuska zfs_range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run);
242eda14cbcSMatt Macy mutex_exit(&ms->ms_lock);
243eda14cbcSMatt Macy
244eda14cbcSMatt Macy ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=,
245eda14cbcSMatt Macy sme->sme_run);
246eda14cbcSMatt Macy ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run);
247eda14cbcSMatt Macy
248eda14cbcSMatt Macy vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run;
249eda14cbcSMatt Macy vd->vdev_stat.vs_checkpoint_space -= sme->sme_run;
250eda14cbcSMatt Macy sdc->sdc_entry_limit--;
251eda14cbcSMatt Macy
252eda14cbcSMatt Macy return (0);
253eda14cbcSMatt Macy }
254eda14cbcSMatt Macy
#ifdef ZFS_DEBUG
/*
 * Debug-only consistency check of the checkpoint space accounting.
 *
 * For every child of the root vdev, the running total of space recorded
 * in the checkpoint space maps must equal the running total of
 * vs_checkpoint_space, and a vdev without a checkpoint space map must
 * report zero checkpoint space. The grand total must match the pool-wide
 * sci_dspace counter.
 */
static void
spa_checkpoint_accounting_verify(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t sm_total = 0;
	uint64_t stat_total = 0;

	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
		vdev_t *tvd = rvd->vdev_child[i];

		if (tvd->vdev_checkpoint_sm == NULL) {
			ASSERT0(tvd->vdev_stat.vs_checkpoint_space);
			continue;
		}

		/*
		 * The allocated count is negated before being summed;
		 * presumably it is non-positive because the checkpoint
		 * space map only records frees.
		 */
		sm_total += -space_map_allocated(tvd->vdev_checkpoint_sm);
		stat_total += tvd->vdev_stat.vs_checkpoint_space;
		ASSERT3U(sm_total, ==, stat_total);
	}

	ASSERT3U(spa->spa_checkpoint_info.sci_dspace, ==, sm_total);
}
#endif
280eda14cbcSMatt Macy
281eda14cbcSMatt Macy static void
spa_checkpoint_discard_thread_sync(void * arg,dmu_tx_t * tx)282eda14cbcSMatt Macy spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
283eda14cbcSMatt Macy {
284eda14cbcSMatt Macy vdev_t *vd = arg;
285eda14cbcSMatt Macy int error;
286eda14cbcSMatt Macy
287eda14cbcSMatt Macy /*
288eda14cbcSMatt Macy * The space map callback is applied only to non-debug entries.
289eda14cbcSMatt Macy * Because the number of debug entries is less or equal to the
290eda14cbcSMatt Macy * number of non-debug entries, we want to ensure that we only
291eda14cbcSMatt Macy * read what we prefetched from open-context.
292eda14cbcSMatt Macy *
293eda14cbcSMatt Macy * Thus, we set the maximum entries that the space map callback
294eda14cbcSMatt Macy * will be applied to be half the entries that could fit in the
295eda14cbcSMatt Macy * imposed memory limit.
296eda14cbcSMatt Macy *
297eda14cbcSMatt Macy * Note that since this is a conservative estimate we also
298eda14cbcSMatt Macy * assume the worst case scenario in our computation where each
299eda14cbcSMatt Macy * entry is two-word.
300eda14cbcSMatt Macy */
301eda14cbcSMatt Macy uint64_t max_entry_limit =
302eda14cbcSMatt Macy (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1;
303eda14cbcSMatt Macy
304eda14cbcSMatt Macy /*
305eda14cbcSMatt Macy * Iterate from the end of the space map towards the beginning,
306eda14cbcSMatt Macy * placing its entries on ms_freeing and removing them from the
307eda14cbcSMatt Macy * space map. The iteration stops if one of the following
308eda14cbcSMatt Macy * conditions is true:
309eda14cbcSMatt Macy *
310eda14cbcSMatt Macy * 1] We reached the beginning of the space map. At this point
311eda14cbcSMatt Macy * the space map should be completely empty and
312eda14cbcSMatt Macy * space_map_incremental_destroy should have returned 0.
313eda14cbcSMatt Macy * The next step would be to free and close the space map
314eda14cbcSMatt Macy * and remove its entry from its vdev's top zap. This allows
315eda14cbcSMatt Macy * spa_checkpoint_discard_thread() to move on to the next vdev.
316eda14cbcSMatt Macy *
317eda14cbcSMatt Macy * 2] We reached the memory limit (amount of memory used to hold
318eda14cbcSMatt Macy * space map entries in memory) and space_map_incremental_destroy
319eda14cbcSMatt Macy * returned EINTR. This means that there are entries remaining
320eda14cbcSMatt Macy * in the space map that will be cleared in a future invocation
321eda14cbcSMatt Macy * of this function by spa_checkpoint_discard_thread().
322eda14cbcSMatt Macy */
323eda14cbcSMatt Macy spa_checkpoint_discard_sync_callback_arg_t sdc;
324eda14cbcSMatt Macy sdc.sdc_vd = vd;
325eda14cbcSMatt Macy sdc.sdc_txg = tx->tx_txg;
326eda14cbcSMatt Macy sdc.sdc_entry_limit = max_entry_limit;
327eda14cbcSMatt Macy
328eda14cbcSMatt Macy uint64_t words_before =
329eda14cbcSMatt Macy space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
330eda14cbcSMatt Macy
331eda14cbcSMatt Macy error = space_map_incremental_destroy(vd->vdev_checkpoint_sm,
332eda14cbcSMatt Macy spa_checkpoint_discard_sync_callback, &sdc, tx);
333eda14cbcSMatt Macy
334eda14cbcSMatt Macy uint64_t words_after =
335eda14cbcSMatt Macy space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
336eda14cbcSMatt Macy
337eda14cbcSMatt Macy #ifdef ZFS_DEBUG
338eda14cbcSMatt Macy spa_checkpoint_accounting_verify(vd->vdev_spa);
339eda14cbcSMatt Macy #endif
340eda14cbcSMatt Macy
34133b8c039SMartin Matuska zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %lld, "
342eda14cbcSMatt Macy "deleted %llu words - %llu words are left",
34333b8c039SMartin Matuska (u_longlong_t)tx->tx_txg, (longlong_t)vd->vdev_id,
34433b8c039SMartin Matuska (u_longlong_t)(words_before - words_after),
34533b8c039SMartin Matuska (u_longlong_t)words_after);
346eda14cbcSMatt Macy
347eda14cbcSMatt Macy if (error != EINTR) {
348eda14cbcSMatt Macy if (error != 0) {
34933b8c039SMartin Matuska zfs_panic_recover("zfs: error %lld was returned "
350eda14cbcSMatt Macy "while incrementally destroying the checkpoint "
351c7046f76SMartin Matuska "space map of vdev %llu\n",
35233b8c039SMartin Matuska (longlong_t)error, vd->vdev_id);
353eda14cbcSMatt Macy }
354eda14cbcSMatt Macy ASSERT0(words_after);
355eda14cbcSMatt Macy ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm));
356eda14cbcSMatt Macy ASSERT0(space_map_length(vd->vdev_checkpoint_sm));
357eda14cbcSMatt Macy
358eda14cbcSMatt Macy space_map_free(vd->vdev_checkpoint_sm, tx);
359eda14cbcSMatt Macy space_map_close(vd->vdev_checkpoint_sm);
360eda14cbcSMatt Macy vd->vdev_checkpoint_sm = NULL;
361eda14cbcSMatt Macy
362eda14cbcSMatt Macy VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa),
363eda14cbcSMatt Macy vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx));
364eda14cbcSMatt Macy }
365eda14cbcSMatt Macy }
366eda14cbcSMatt Macy
367eda14cbcSMatt Macy static boolean_t
spa_checkpoint_discard_is_done(spa_t * spa)368eda14cbcSMatt Macy spa_checkpoint_discard_is_done(spa_t *spa)
369eda14cbcSMatt Macy {
370eda14cbcSMatt Macy vdev_t *rvd = spa->spa_root_vdev;
371eda14cbcSMatt Macy
372eda14cbcSMatt Macy ASSERT(!spa_has_checkpoint(spa));
373eda14cbcSMatt Macy ASSERT(spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT));
374eda14cbcSMatt Macy
375eda14cbcSMatt Macy for (uint64_t c = 0; c < rvd->vdev_children; c++) {
376eda14cbcSMatt Macy if (rvd->vdev_child[c]->vdev_checkpoint_sm != NULL)
377eda14cbcSMatt Macy return (B_FALSE);
378eda14cbcSMatt Macy ASSERT0(rvd->vdev_child[c]->vdev_stat.vs_checkpoint_space);
379eda14cbcSMatt Macy }
380eda14cbcSMatt Macy
381eda14cbcSMatt Macy return (B_TRUE);
382eda14cbcSMatt Macy }
383eda14cbcSMatt Macy
384eda14cbcSMatt Macy boolean_t
spa_checkpoint_discard_thread_check(void * arg,zthr_t * zthr)385eda14cbcSMatt Macy spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr)
386eda14cbcSMatt Macy {
387e92ffd9bSMartin Matuska (void) zthr;
388eda14cbcSMatt Macy spa_t *spa = arg;
389eda14cbcSMatt Macy
390eda14cbcSMatt Macy if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
391eda14cbcSMatt Macy return (B_FALSE);
392eda14cbcSMatt Macy
393eda14cbcSMatt Macy if (spa_has_checkpoint(spa))
394eda14cbcSMatt Macy return (B_FALSE);
395eda14cbcSMatt Macy
396eda14cbcSMatt Macy return (B_TRUE);
397eda14cbcSMatt Macy }
398eda14cbcSMatt Macy
399eda14cbcSMatt Macy void
spa_checkpoint_discard_thread(void * arg,zthr_t * zthr)400eda14cbcSMatt Macy spa_checkpoint_discard_thread(void *arg, zthr_t *zthr)
401eda14cbcSMatt Macy {
402eda14cbcSMatt Macy spa_t *spa = arg;
403eda14cbcSMatt Macy vdev_t *rvd = spa->spa_root_vdev;
404eda14cbcSMatt Macy
405eda14cbcSMatt Macy for (uint64_t c = 0; c < rvd->vdev_children; c++) {
406eda14cbcSMatt Macy vdev_t *vd = rvd->vdev_child[c];
407eda14cbcSMatt Macy
408eda14cbcSMatt Macy while (vd->vdev_checkpoint_sm != NULL) {
409eda14cbcSMatt Macy space_map_t *checkpoint_sm = vd->vdev_checkpoint_sm;
410eda14cbcSMatt Macy int numbufs;
411eda14cbcSMatt Macy dmu_buf_t **dbp;
412eda14cbcSMatt Macy
413eda14cbcSMatt Macy if (zthr_iscancelled(zthr))
414eda14cbcSMatt Macy return;
415eda14cbcSMatt Macy
416eda14cbcSMatt Macy ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
417eda14cbcSMatt Macy
418eda14cbcSMatt Macy uint64_t size = MIN(space_map_length(checkpoint_sm),
419eda14cbcSMatt Macy zfs_spa_discard_memory_limit);
420eda14cbcSMatt Macy uint64_t offset =
421eda14cbcSMatt Macy space_map_length(checkpoint_sm) - size;
422eda14cbcSMatt Macy
423eda14cbcSMatt Macy /*
424eda14cbcSMatt Macy * Ensure that the part of the space map that will
425eda14cbcSMatt Macy * be destroyed by the synctask, is prefetched in
426eda14cbcSMatt Macy * memory before the synctask runs.
427eda14cbcSMatt Macy */
428eda14cbcSMatt Macy int error = dmu_buf_hold_array_by_bonus(
429eda14cbcSMatt Macy checkpoint_sm->sm_dbuf, offset, size,
430*8ac904ceSMartin Matuska B_TRUE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
431eda14cbcSMatt Macy if (error != 0) {
432eda14cbcSMatt Macy zfs_panic_recover("zfs: error %d was returned "
433eda14cbcSMatt Macy "while prefetching checkpoint space map "
434eda14cbcSMatt Macy "entries of vdev %llu\n",
435eda14cbcSMatt Macy error, vd->vdev_id);
436eda14cbcSMatt Macy }
437eda14cbcSMatt Macy
438eda14cbcSMatt Macy VERIFY0(dsl_sync_task(spa->spa_name, NULL,
439eda14cbcSMatt Macy spa_checkpoint_discard_thread_sync, vd,
440eda14cbcSMatt Macy 0, ZFS_SPACE_CHECK_NONE));
441eda14cbcSMatt Macy
442eda14cbcSMatt Macy dmu_buf_rele_array(dbp, numbufs, FTAG);
443eda14cbcSMatt Macy }
444eda14cbcSMatt Macy }
445eda14cbcSMatt Macy
446eda14cbcSMatt Macy VERIFY(spa_checkpoint_discard_is_done(spa));
447eda14cbcSMatt Macy VERIFY0(spa->spa_checkpoint_info.sci_dspace);
448eda14cbcSMatt Macy VERIFY0(dsl_sync_task(spa->spa_name, NULL,
449eda14cbcSMatt Macy spa_checkpoint_discard_complete_sync, spa,
450eda14cbcSMatt Macy 0, ZFS_SPACE_CHECK_NONE));
451eda14cbcSMatt Macy }
452eda14cbcSMatt Macy
453eda14cbcSMatt Macy
454eda14cbcSMatt Macy static int
spa_checkpoint_check(void * arg,dmu_tx_t * tx)455eda14cbcSMatt Macy spa_checkpoint_check(void *arg, dmu_tx_t *tx)
456eda14cbcSMatt Macy {
457e92ffd9bSMartin Matuska (void) arg;
458eda14cbcSMatt Macy spa_t *spa = dmu_tx_pool(tx)->dp_spa;
459eda14cbcSMatt Macy
460eda14cbcSMatt Macy if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT))
461eda14cbcSMatt Macy return (SET_ERROR(ENOTSUP));
462eda14cbcSMatt Macy
463eda14cbcSMatt Macy if (!spa_top_vdevs_spacemap_addressable(spa))
464eda14cbcSMatt Macy return (SET_ERROR(ZFS_ERR_VDEV_TOO_BIG));
465eda14cbcSMatt Macy
466eda14cbcSMatt Macy if (spa->spa_removing_phys.sr_state == DSS_SCANNING)
467eda14cbcSMatt Macy return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS));
468eda14cbcSMatt Macy
469e716630dSMartin Matuska if (spa->spa_raidz_expand != NULL)
470e716630dSMartin Matuska return (SET_ERROR(ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS));
471e716630dSMartin Matuska
472eda14cbcSMatt Macy if (spa->spa_checkpoint_txg != 0)
473eda14cbcSMatt Macy return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS));
474eda14cbcSMatt Macy
475eda14cbcSMatt Macy if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
476eda14cbcSMatt Macy return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));
477eda14cbcSMatt Macy
478eda14cbcSMatt Macy return (0);
479eda14cbcSMatt Macy }
480eda14cbcSMatt Macy
481eda14cbcSMatt Macy static void
spa_checkpoint_sync(void * arg,dmu_tx_t * tx)482eda14cbcSMatt Macy spa_checkpoint_sync(void *arg, dmu_tx_t *tx)
483eda14cbcSMatt Macy {
484e92ffd9bSMartin Matuska (void) arg;
485eda14cbcSMatt Macy dsl_pool_t *dp = dmu_tx_pool(tx);
486eda14cbcSMatt Macy spa_t *spa = dp->dp_spa;
487eda14cbcSMatt Macy uberblock_t checkpoint = spa->spa_ubsync;
488eda14cbcSMatt Macy
489eda14cbcSMatt Macy /*
490eda14cbcSMatt Macy * At this point, there should not be a checkpoint in the MOS.
491eda14cbcSMatt Macy */
492eda14cbcSMatt Macy ASSERT3U(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
493eda14cbcSMatt Macy DMU_POOL_ZPOOL_CHECKPOINT), ==, ENOENT);
494eda14cbcSMatt Macy
495eda14cbcSMatt Macy ASSERT0(spa->spa_checkpoint_info.sci_timestamp);
496eda14cbcSMatt Macy ASSERT0(spa->spa_checkpoint_info.sci_dspace);
497eda14cbcSMatt Macy
498eda14cbcSMatt Macy /*
499eda14cbcSMatt Macy * Since the checkpointed uberblock is the one that just got synced
500eda14cbcSMatt Macy * (we use spa_ubsync), its txg must be equal to the txg number of
501eda14cbcSMatt Macy * the txg we are syncing, minus 1.
502eda14cbcSMatt Macy */
503eda14cbcSMatt Macy ASSERT3U(checkpoint.ub_txg, ==, spa->spa_syncing_txg - 1);
504eda14cbcSMatt Macy
505eda14cbcSMatt Macy /*
506eda14cbcSMatt Macy * Once the checkpoint is in place, we need to ensure that none of
507eda14cbcSMatt Macy * its blocks will be marked for reuse after it has been freed.
508eda14cbcSMatt Macy * When there is a checkpoint and a block is freed, we compare its
509eda14cbcSMatt Macy * birth txg to the txg of the checkpointed uberblock to see if the
510eda14cbcSMatt Macy * block is part of the checkpoint or not. Therefore, we have to set
511eda14cbcSMatt Macy * spa_checkpoint_txg before any frees happen in this txg (which is
512eda14cbcSMatt Macy * why this is done as an early_synctask as explained in the comment
513eda14cbcSMatt Macy * in spa_checkpoint()).
514eda14cbcSMatt Macy */
515eda14cbcSMatt Macy spa->spa_checkpoint_txg = checkpoint.ub_txg;
516eda14cbcSMatt Macy spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
517eda14cbcSMatt Macy
518eda14cbcSMatt Macy checkpoint.ub_checkpoint_txg = checkpoint.ub_txg;
519eda14cbcSMatt Macy VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
520eda14cbcSMatt Macy DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT,
521eda14cbcSMatt Macy sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t),
522eda14cbcSMatt Macy &checkpoint, tx));
523eda14cbcSMatt Macy
524eda14cbcSMatt Macy /*
525eda14cbcSMatt Macy * Increment the feature refcount and thus activate the feature.
526eda14cbcSMatt Macy * Note that the feature will be deactivated when we've
527eda14cbcSMatt Macy * completely discarded all checkpointed state (both vdev
528eda14cbcSMatt Macy * space maps and uberblock).
529eda14cbcSMatt Macy */
530eda14cbcSMatt Macy spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
531eda14cbcSMatt Macy
532eda14cbcSMatt Macy spa_history_log_internal(spa, "spa checkpoint", tx,
533eda14cbcSMatt Macy "checkpointed uberblock txg=%llu", (u_longlong_t)checkpoint.ub_txg);
534eda14cbcSMatt Macy }
535eda14cbcSMatt Macy
536eda14cbcSMatt Macy /*
537eda14cbcSMatt Macy * Create a checkpoint for the pool.
538eda14cbcSMatt Macy */
539eda14cbcSMatt Macy int
spa_checkpoint(const char * pool)540eda14cbcSMatt Macy spa_checkpoint(const char *pool)
541eda14cbcSMatt Macy {
542eda14cbcSMatt Macy int error;
543eda14cbcSMatt Macy spa_t *spa;
544eda14cbcSMatt Macy
545eda14cbcSMatt Macy error = spa_open(pool, &spa, FTAG);
546eda14cbcSMatt Macy if (error != 0)
547eda14cbcSMatt Macy return (error);
548eda14cbcSMatt Macy
549eda14cbcSMatt Macy mutex_enter(&spa->spa_vdev_top_lock);
550eda14cbcSMatt Macy
551eda14cbcSMatt Macy /*
552eda14cbcSMatt Macy * Wait for current syncing txg to finish so the latest synced
553eda14cbcSMatt Macy * uberblock (spa_ubsync) has all the changes that we expect
554eda14cbcSMatt Macy * to see if we were to revert later to the checkpoint. In other
555eda14cbcSMatt Macy * words we want the checkpointed uberblock to include/reference
556eda14cbcSMatt Macy * all the changes that were pending at the time that we issued
557eda14cbcSMatt Macy * the checkpoint command.
558eda14cbcSMatt Macy */
559eda14cbcSMatt Macy txg_wait_synced(spa_get_dsl(spa), 0);
560eda14cbcSMatt Macy
561eda14cbcSMatt Macy /*
562eda14cbcSMatt Macy * As the checkpointed uberblock references blocks from the previous
563eda14cbcSMatt Macy * txg (spa_ubsync) we want to ensure that are not freeing any of
564eda14cbcSMatt Macy * these blocks in the same txg that the following synctask will
565eda14cbcSMatt Macy * run. Thus, we run it as an early synctask, so the dirty changes
566eda14cbcSMatt Macy * that are synced to disk afterwards during zios and other synctasks
567eda14cbcSMatt Macy * do not reuse checkpointed blocks.
568eda14cbcSMatt Macy */
569eda14cbcSMatt Macy error = dsl_early_sync_task(pool, spa_checkpoint_check,
570eda14cbcSMatt Macy spa_checkpoint_sync, NULL, 0, ZFS_SPACE_CHECK_NORMAL);
571eda14cbcSMatt Macy
572eda14cbcSMatt Macy mutex_exit(&spa->spa_vdev_top_lock);
573eda14cbcSMatt Macy
574eda14cbcSMatt Macy spa_close(spa, FTAG);
575eda14cbcSMatt Macy return (error);
576eda14cbcSMatt Macy }
577eda14cbcSMatt Macy
578eda14cbcSMatt Macy static int
spa_checkpoint_discard_check(void * arg,dmu_tx_t * tx)579eda14cbcSMatt Macy spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx)
580eda14cbcSMatt Macy {
581e92ffd9bSMartin Matuska (void) arg;
582eda14cbcSMatt Macy spa_t *spa = dmu_tx_pool(tx)->dp_spa;
583eda14cbcSMatt Macy
584eda14cbcSMatt Macy if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
585eda14cbcSMatt Macy return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));
586eda14cbcSMatt Macy
587eda14cbcSMatt Macy if (spa->spa_checkpoint_txg == 0)
588eda14cbcSMatt Macy return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));
589eda14cbcSMatt Macy
590eda14cbcSMatt Macy VERIFY0(zap_contains(spa_meta_objset(spa),
591eda14cbcSMatt Macy DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT));
592eda14cbcSMatt Macy
593eda14cbcSMatt Macy return (0);
594eda14cbcSMatt Macy }
595eda14cbcSMatt Macy
596eda14cbcSMatt Macy static void
spa_checkpoint_discard_sync(void * arg,dmu_tx_t * tx)597eda14cbcSMatt Macy spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx)
598eda14cbcSMatt Macy {
599e92ffd9bSMartin Matuska (void) arg;
600eda14cbcSMatt Macy spa_t *spa = dmu_tx_pool(tx)->dp_spa;
601eda14cbcSMatt Macy
602eda14cbcSMatt Macy VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
603eda14cbcSMatt Macy DMU_POOL_ZPOOL_CHECKPOINT, tx));
604eda14cbcSMatt Macy
605eda14cbcSMatt Macy spa->spa_checkpoint_txg = 0;
606eda14cbcSMatt Macy
607eda14cbcSMatt Macy zthr_wakeup(spa->spa_checkpoint_discard_zthr);
608eda14cbcSMatt Macy
609eda14cbcSMatt Macy spa_history_log_internal(spa, "spa discard checkpoint", tx,
610eda14cbcSMatt Macy "started discarding checkpointed state from the pool");
611eda14cbcSMatt Macy }
612eda14cbcSMatt Macy
613eda14cbcSMatt Macy /*
614eda14cbcSMatt Macy * Discard the checkpoint from a pool.
615eda14cbcSMatt Macy */
616eda14cbcSMatt Macy int
spa_checkpoint_discard(const char * pool)617eda14cbcSMatt Macy spa_checkpoint_discard(const char *pool)
618eda14cbcSMatt Macy {
619eda14cbcSMatt Macy /*
620eda14cbcSMatt Macy * Similarly to spa_checkpoint(), we want our synctask to run
621eda14cbcSMatt Macy * before any pending dirty data are written to disk so they
622eda14cbcSMatt Macy * won't end up in the checkpoint's data structures (e.g.
623eda14cbcSMatt Macy * ms_checkpointing and vdev_checkpoint_sm) and re-create any
624eda14cbcSMatt Macy * space maps that the discarding open-context thread has
625eda14cbcSMatt Macy * deleted.
626eda14cbcSMatt Macy * [see spa_discard_checkpoint_sync and spa_discard_checkpoint_thread]
627eda14cbcSMatt Macy */
628eda14cbcSMatt Macy return (dsl_early_sync_task(pool, spa_checkpoint_discard_check,
629eda14cbcSMatt Macy spa_checkpoint_discard_sync, NULL, 0,
630eda14cbcSMatt Macy ZFS_SPACE_CHECK_DISCARD_CHECKPOINT));
631eda14cbcSMatt Macy }
632eda14cbcSMatt Macy
633eda14cbcSMatt Macy EXPORT_SYMBOL(spa_checkpoint_get_stats);
634eda14cbcSMatt Macy EXPORT_SYMBOL(spa_checkpoint_discard_thread);
635eda14cbcSMatt Macy EXPORT_SYMBOL(spa_checkpoint_discard_thread_check);
636eda14cbcSMatt Macy
637dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs_spa, zfs_spa_, discard_memory_limit, U64, ZMOD_RW,
638eda14cbcSMatt Macy "Limit for memory used in prefetching the checkpoint space map done "
639eda14cbcSMatt Macy "on each vdev while discarding the checkpoint");
640