// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2017 by Delphix. All rights reserved.
 */

/*
 * Storage Pool Checkpoint
 *
 * A storage pool checkpoint can be thought of as a pool-wide snapshot or
 * a stable version of extreme rewind that guarantees no blocks from the
 * checkpointed state will have been overwritten. It remembers the entire
 * state of the storage pool (e.g. snapshots, dataset names, etc.) from the
 * point that it was taken, and the user can rewind back to that point even
 * if they applied destructive operations on their datasets or even enabled
 * new zpool on-disk features. If a pool has a checkpoint that is no longer
 * needed, the user can discard it.
 *
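 * A typical lifecycle from the command line (a hedged sketch for
 * orientation only; see zpool(8) for the authoritative syntax, and
 * "tank" is just a placeholder pool name):
 *
 *      zpool checkpoint tank                      # take a checkpoint
 *      zpool checkpoint -d tank                   # start discarding it
 *      zpool export tank
 *      zpool import --rewind-to-checkpoint tank   # rewind back to it
 *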
 * == On disk data structures used ==
 *
 * - The pool has a new feature flag and a new entry in the MOS. The feature
 *   flag is set to active when we create the checkpoint and remains active
 *   until the checkpoint is fully discarded. The entry in the MOS config
 *   (DMU_POOL_ZPOOL_CHECKPOINT) is populated with the uberblock that
 *   references the state of the pool when we take the checkpoint. The entry
 *   remains populated until we start discarding the checkpoint or we rewind
 *   back to it.
 *
 * - Each vdev contains a vdev-wide space map while the pool has a checkpoint,
 *   which persists until the checkpoint is fully discarded. The space map
 *   contains entries that have been freed in the current state of the pool
 *   but we want to keep around in case we decide to rewind to the checkpoint.
 *   [see vdev_checkpoint_sm]
 *
 * - Each metaslab's ms_sm space map behaves the same as without the
 *   checkpoint, with the only exception being the scenario when we free
 *   blocks that belong to the checkpoint. In this case, these blocks remain
 *   ALLOCATED in the metaslab's space map and they are added as FREE in the
 *   vdev's checkpoint space map.
 *
 * - Each uberblock has a field (ub_checkpoint_txg) which holds the txg at
 *   which the uberblock was checkpointed. For normal uberblocks this field
 *   is 0.
 *
 * == Overview of operations ==
 *
 * - To create a checkpoint, we first wait for the current TXG to be synced,
 *   so we can use the most recently synced uberblock (spa_ubsync) as the
 *   checkpointed uberblock. Then we use an early synctask to place that
 *   uberblock in the MOS config, increment the feature flag for the
 *   checkpoint (marking it active), and set spa_checkpoint_txg (see its use
 *   below) to the TXG of the checkpointed uberblock. We use an early
 *   synctask for the aforementioned operations to ensure that no blocks
 *   were dirtied between the current TXG and the TXG of the checkpointed
 *   uberblock (e.g. the previous txg).
 *
 * - When a checkpoint exists, we need to ensure that the blocks that
 *   belong to the checkpoint are freed but never reused. This means that
 *   these blocks should never end up in the ms_allocatable or the ms_freeing
 *   trees of a metaslab. Therefore, whenever there is a checkpoint the new
 *   ms_checkpointing tree is used in addition to the aforementioned ones.
 *
 *   Whenever a block is freed and we find out that it is referenced by the
 *   checkpoint (we find out by comparing its birth to spa_checkpoint_txg),
 *   we place it in the ms_checkpointing tree instead of the ms_freeing
 *   tree. This way, we divide the blocks that are being freed into
 *   checkpointed and not-checkpointed blocks.
 *
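 *   A minimal sketch of that dispatch decision (hedged pseudocode; the
 *   actual check lives in the metaslab free path and also verifies that
 *   the pool has synced past the checkpoint txg):
 *
 *      if (spa_checkpoint_txg != 0 && bp birth txg <= spa_checkpoint_txg)
 *              zfs_range_tree_add(ms->ms_checkpointing, offset, size);
 *      else
 *              zfs_range_tree_add(ms->ms_freeing, offset, size);
 *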
 *   In order to persist these frees, we write the extents from the
 *   ms_freeing tree to the ms_sm as usual, and the extents from the
 *   ms_checkpointing tree to the vdev_checkpoint_sm. This way, these
 *   checkpointed extents will remain allocated in the metaslab's ms_sm space
 *   map, and therefore won't be reused [see metaslab_sync()]. In addition,
 *   when we discard the checkpoint, we can find the entries that have
 *   actually been freed in vdev_checkpoint_sm.
 *   [see spa_checkpoint_discard_thread_sync()]
 *
 * - To discard the checkpoint we use an early synctask to delete the
 *   checkpointed uberblock from the MOS config, set spa_checkpoint_txg to 0,
 *   and wake up the discarding zthr thread (an open-context async thread).
 *   We use an early synctask to ensure that the operation happens before any
 *   new data ends up in the checkpoint's data structures.
 *
 *   Once the synctask is done and the discarding zthr is awake, we discard
 *   the checkpointed data over multiple TXGs by having the zthr prefetch
 *   entries from vdev_checkpoint_sm and then start a synctask that places
 *   them as free blocks into their respective ms_allocatable and ms_sm
 *   structures.
 *   [see spa_checkpoint_discard_thread()]
 *
 *   When there are no entries left in the vdev_checkpoint_sm of all
 *   top-level vdevs, a final synctask runs that decrements the feature flag.
 *
 * - To rewind to the checkpoint, we first use the current uberblock and
 *   open the MOS so we can access the checkpointed uberblock from the MOS
 *   config. After we retrieve the checkpointed uberblock, we use it as the
 *   current uberblock for the pool by writing it to disk with an updated
 *   TXG, opening its version of the MOS, and moving on as usual from there.
 *   [see spa_ld_checkpoint_rewind()]
 *
 *   An important note on rewinding to the checkpoint has to do with how we
 *   handle ZIL blocks. In the scenario of a rewind, we clear out any ZIL
 *   blocks that have not been claimed by the time we took the checkpoint
 *   as they should no longer be valid.
 *   [see comment in zil_claim()]
 *
 * == Miscellaneous information ==
 *
 * - In the hypothetical event that we take a checkpoint, remove a vdev,
 *   and attempt to rewind, the rewind would fail as the checkpointed
 *   uberblock would reference data in the removed device. For this reason
 *   and others of similar nature, we disallow the following operations that
 *   can change the config:
 *   vdev removal and attach/detach, mirror splitting, and pool reguid.
 *
 * - As most of the checkpoint logic is implemented in the SPA and doesn't
 *   distinguish datasets when it comes to space accounting, having a
 *   checkpoint can potentially break the boundaries set by dataset
 *   reservations.
 */

#include <sys/dmu_tx.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/metaslab_impl.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/spa_checkpoint.h>
#include <sys/vdev_impl.h>
#include <sys/zap.h>
#include <sys/zfeature.h>

/*
 * The following parameter limits the amount of memory to be used for the
 * prefetching of the checkpoint space map done on each vdev while
 * discarding the checkpoint.
 *
 * The reason it exists is because top-level vdevs with long checkpoint
 * space maps can potentially take up a lot of memory depending on the
 * amount of checkpointed data that has been freed within them while
 * the pool had a checkpoint.
 */
static uint64_t zfs_spa_discard_memory_limit = 16 * 1024 * 1024;
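
/*
 * On Linux builds this tunable is exposed as the module parameter
 * zfs_spa_discard_memory_limit (see the ZFS_MODULE_PARAM declaration at
 * the bottom of this file), so it can be adjusted at runtime, e.g. (the
 * 32 MiB value below is only an illustrative choice):
 *
 *      echo 33554432 > /sys/module/zfs/parameters/zfs_spa_discard_memory_limit
 */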

int
spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs)
{
	if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
		return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));

	memset(pcs, 0, sizeof (pool_checkpoint_stat_t));

	int error = zap_contains(spa_meta_objset(spa),
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT);
	ASSERT(error == 0 || error == ENOENT);

	if (error == ENOENT)
		pcs->pcs_state = CS_CHECKPOINT_DISCARDING;
	else
		pcs->pcs_state = CS_CHECKPOINT_EXISTS;

	pcs->pcs_space = spa->spa_checkpoint_info.sci_dspace;
	pcs->pcs_start_time = spa->spa_checkpoint_info.sci_timestamp;

	return (0);
}
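
/*
 * The stats filled in above back the user-visible checkpoint accounting
 * (e.g. the checkpoint line of zpool status). A hedged sketch of how a
 * caller might consume them:
 *
 *	pool_checkpoint_stat_t pcs;
 *	if (spa_checkpoint_get_stats(spa, &pcs) == 0 &&
 *	    pcs.pcs_state == CS_CHECKPOINT_EXISTS)
 *		zfs_dbgmsg("checkpoint uses %llu bytes",
 *		    (u_longlong_t)pcs.pcs_space);
 */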

static void
spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx)
{
	spa_t *spa = arg;

	spa->spa_checkpoint_info.sci_timestamp = 0;

	spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
	spa_notify_waiters(spa);

	spa_history_log_internal(spa, "spa discard checkpoint", tx,
	    "finished discarding checkpointed state from the pool");
}

typedef struct spa_checkpoint_discard_sync_callback_arg {
	vdev_t *sdc_vd;
	uint64_t sdc_txg;
	uint64_t sdc_entry_limit;
} spa_checkpoint_discard_sync_callback_arg_t;

static int
spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg)
{
	spa_checkpoint_discard_sync_callback_arg_t *sdc = arg;
	vdev_t *vd = sdc->sdc_vd;
	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
	uint64_t end = sme->sme_offset + sme->sme_run;

	if (sdc->sdc_entry_limit == 0)
		return (SET_ERROR(EINTR));

	/*
	 * Since the space map is not condensed, we know that
	 * none of its entries crosses the boundaries of its
	 * respective metaslab.
	 *
	 * That said, there is no fundamental requirement that
	 * the checkpoint's space map entries should not cross
	 * metaslab boundaries. So if needed we could add code
	 * that handles metaslab-crossing segments in the future.
	 */
	VERIFY3U(sme->sme_type, ==, SM_FREE);
	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);

	/*
	 * At this point we should not be processing any
	 * other frees concurrently, so the lock is technically
	 * unnecessary. We use the lock anyway though to
	 * potentially save ourselves from future headaches.
	 */
	mutex_enter(&ms->ms_lock);
	if (zfs_range_tree_is_empty(ms->ms_freeing))
		vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg);
	zfs_range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run);
	mutex_exit(&ms->ms_lock);

	ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=,
	    sme->sme_run);
	ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run);

	vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run;
	vd->vdev_stat.vs_checkpoint_space -= sme->sme_run;
	sdc->sdc_entry_limit--;

	return (0);
}

#ifdef ZFS_DEBUG
static void
spa_checkpoint_accounting_verify(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t ckpoint_sm_space_sum = 0;
	uint64_t vs_ckpoint_space_sum = 0;

	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];

		if (vd->vdev_checkpoint_sm != NULL) {
			/*
			 * The checkpoint space map contains only frees,
			 * so the space allocated in it is negative;
			 * negate it to get the checkpointed bytes.
			 */
			ckpoint_sm_space_sum +=
			    -space_map_allocated(vd->vdev_checkpoint_sm);
			vs_ckpoint_space_sum +=
			    vd->vdev_stat.vs_checkpoint_space;
			ASSERT3U(ckpoint_sm_space_sum, ==,
			    vs_ckpoint_space_sum);
		} else {
			ASSERT0(vd->vdev_stat.vs_checkpoint_space);
		}
	}
	ASSERT3U(spa->spa_checkpoint_info.sci_dspace, ==, ckpoint_sm_space_sum);
}
#endif

static void
spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
{
	vdev_t *vd = arg;
	int error;

	/*
	 * The space map callback is applied only to non-debug entries.
	 * Because the number of debug entries is less than or equal to the
	 * number of non-debug entries, we want to ensure that we only
	 * read what we prefetched from open context.
	 *
	 * Thus, we set the maximum entries that the space map callback
	 * will be applied to be half the entries that could fit in the
	 * imposed memory limit.
	 *
	 * Note that since this is a conservative estimate we also
	 * assume the worst case scenario in our computation where each
	 * entry is two words.
	 */
	uint64_t max_entry_limit =
	    (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1;
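
	/*
	 * For example, with the default zfs_spa_discard_memory_limit of
	 * 16 MiB: 16777216 bytes / 16 bytes per two-word entry = 1048576
	 * entries, halved to a limit of 524288 entries per invocation.
	 */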

	/*
	 * Iterate from the end of the space map towards the beginning,
	 * placing its entries on ms_freeing and removing them from the
	 * space map. The iteration stops if one of the following
	 * conditions is true:
	 *
	 * 1] We reached the beginning of the space map. At this point
	 *    the space map should be completely empty and
	 *    space_map_incremental_destroy should have returned 0.
	 *    The next step would be to free and close the space map
	 *    and remove its entry from its vdev's top zap. This allows
	 *    spa_checkpoint_discard_thread() to move on to the next vdev.
	 *
	 * 2] We reached the memory limit (amount of memory used to hold
	 *    space map entries in memory) and space_map_incremental_destroy
	 *    returned EINTR. This means that there are entries remaining
	 *    in the space map that will be cleared in a future invocation
	 *    of this function by spa_checkpoint_discard_thread().
	 */
	spa_checkpoint_discard_sync_callback_arg_t sdc;
	sdc.sdc_vd = vd;
	sdc.sdc_txg = tx->tx_txg;
	sdc.sdc_entry_limit = max_entry_limit;

	uint64_t words_before =
	    space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);

	error = space_map_incremental_destroy(vd->vdev_checkpoint_sm,
	    spa_checkpoint_discard_sync_callback, &sdc, tx);

	uint64_t words_after =
	    space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);

#ifdef ZFS_DEBUG
	spa_checkpoint_accounting_verify(vd->vdev_spa);
#endif

	zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %lld, "
	    "deleted %llu words - %llu words are left",
	    (u_longlong_t)tx->tx_txg, (longlong_t)vd->vdev_id,
	    (u_longlong_t)(words_before - words_after),
	    (u_longlong_t)words_after);

	if (error != EINTR) {
		if (error != 0) {
			zfs_panic_recover("zfs: error %lld was returned "
			    "while incrementally destroying the checkpoint "
			    "space map of vdev %llu\n",
			    (longlong_t)error, vd->vdev_id);
		}
		ASSERT0(words_after);
		ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm));
		ASSERT0(space_map_length(vd->vdev_checkpoint_sm));

		space_map_free(vd->vdev_checkpoint_sm, tx);
		space_map_close(vd->vdev_checkpoint_sm);
		vd->vdev_checkpoint_sm = NULL;

		VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa),
		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx));
	}
}

static boolean_t
spa_checkpoint_discard_is_done(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(!spa_has_checkpoint(spa));
	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT));

	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
		if (rvd->vdev_child[c]->vdev_checkpoint_sm != NULL)
			return (B_FALSE);
		ASSERT0(rvd->vdev_child[c]->vdev_stat.vs_checkpoint_space);
	}

	return (B_TRUE);
}

boolean_t
spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr)
{
	(void) zthr;
	spa_t *spa = arg;

	if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
		return (B_FALSE);

	if (spa_has_checkpoint(spa))
		return (B_FALSE);

	return (B_TRUE);
}

void
spa_checkpoint_discard_thread(void *arg, zthr_t *zthr)
{
	spa_t *spa = arg;
	vdev_t *rvd = spa->spa_root_vdev;

	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];

		while (vd->vdev_checkpoint_sm != NULL) {
			space_map_t *checkpoint_sm = vd->vdev_checkpoint_sm;
			int numbufs;
			dmu_buf_t **dbp;

			if (zthr_iscancelled(zthr))
				return;

			ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);

			uint64_t size = MIN(space_map_length(checkpoint_sm),
			    zfs_spa_discard_memory_limit);
			uint64_t offset =
			    space_map_length(checkpoint_sm) - size;
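
			/*
			 * The incremental destroy in the synctask works
			 * backwards from the end of the space map, which
			 * is why the offset above points at the space
			 * map's last `size` bytes.
			 */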

			/*
			 * Ensure that the part of the space map that will
			 * be destroyed by the synctask is prefetched in
			 * memory before the synctask runs.
			 */
			int error = dmu_buf_hold_array_by_bonus(
			    checkpoint_sm->sm_dbuf, offset, size,
			    B_TRUE, FTAG, &numbufs, &dbp);
			if (error != 0) {
				zfs_panic_recover("zfs: error %d was returned "
				    "while prefetching checkpoint space map "
				    "entries of vdev %llu\n",
				    error, vd->vdev_id);
			}

			VERIFY0(dsl_sync_task(spa->spa_name, NULL,
			    spa_checkpoint_discard_thread_sync, vd,
			    0, ZFS_SPACE_CHECK_NONE));

			dmu_buf_rele_array(dbp, numbufs, FTAG);
		}
	}

	VERIFY(spa_checkpoint_discard_is_done(spa));
	VERIFY0(spa->spa_checkpoint_info.sci_dspace);
	VERIFY0(dsl_sync_task(spa->spa_name, NULL,
	    spa_checkpoint_discard_complete_sync, spa,
	    0, ZFS_SPACE_CHECK_NONE));
}


static int
spa_checkpoint_check(void *arg, dmu_tx_t *tx)
{
	(void) arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT))
		return (SET_ERROR(ENOTSUP));

	if (!spa_top_vdevs_spacemap_addressable(spa))
		return (SET_ERROR(ZFS_ERR_VDEV_TOO_BIG));

	if (spa->spa_removing_phys.sr_state == DSS_SCANNING)
		return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS));

	if (spa->spa_raidz_expand != NULL)
		return (SET_ERROR(ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS));

	if (spa->spa_checkpoint_txg != 0)
		return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS));

	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
		return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));

	return (0);
}

static void
spa_checkpoint_sync(void *arg, dmu_tx_t *tx)
{
	(void) arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	spa_t *spa = dp->dp_spa;
	uberblock_t checkpoint = spa->spa_ubsync;

	/*
	 * At this point, there should not be a checkpoint in the MOS.
	 */
	ASSERT3U(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ZPOOL_CHECKPOINT), ==, ENOENT);

	ASSERT0(spa->spa_checkpoint_info.sci_timestamp);
	ASSERT0(spa->spa_checkpoint_info.sci_dspace);

	/*
	 * Since the checkpointed uberblock is the one that just got synced
	 * (we use spa_ubsync), its txg must be equal to the txg that we are
	 * currently syncing, minus 1.
	 */
	ASSERT3U(checkpoint.ub_txg, ==, spa->spa_syncing_txg - 1);

	/*
	 * Once the checkpoint is in place, we need to ensure that none of
	 * its blocks will be marked for reuse after it has been freed.
	 * When there is a checkpoint and a block is freed, we compare its
	 * birth txg to the txg of the checkpointed uberblock to see if the
	 * block is part of the checkpoint or not. Therefore, we have to set
	 * spa_checkpoint_txg before any frees happen in this txg (which is
	 * why this is done as an early_synctask as explained in the comment
	 * in spa_checkpoint()).
	 */
	spa->spa_checkpoint_txg = checkpoint.ub_txg;
	spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;

	checkpoint.ub_checkpoint_txg = checkpoint.ub_txg;
	VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT,
	    sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t),
	    &checkpoint, tx));

	/*
	 * Increment the feature refcount and thus activate the feature.
	 * Note that the feature will be deactivated when we've
	 * completely discarded all checkpointed state (both vdev
	 * space maps and uberblock).
	 */
	spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);

	spa_history_log_internal(spa, "spa checkpoint", tx,
	    "checkpointed uberblock txg=%llu", (u_longlong_t)checkpoint.ub_txg);
}

/*
 * Create a checkpoint for the pool.
 */
int
spa_checkpoint(const char *pool)
{
	int error;
	spa_t *spa;

	error = spa_open(pool, &spa, FTAG);
	if (error != 0)
		return (error);

	mutex_enter(&spa->spa_vdev_top_lock);

	/*
	 * Wait for the current syncing txg to finish so the latest synced
	 * uberblock (spa_ubsync) has all the changes that we expect
	 * to see if we were to revert later to the checkpoint. In other
	 * words we want the checkpointed uberblock to include/reference
	 * all the changes that were pending at the time that we issued
	 * the checkpoint command.
	 */
	txg_wait_synced(spa_get_dsl(spa), 0);

	/*
	 * As the checkpointed uberblock references blocks from the previous
	 * txg (spa_ubsync) we want to ensure that we are not freeing any of
	 * these blocks in the same txg that the following synctask will
	 * run. Thus, we run it as an early synctask, so the dirty changes
	 * that are synced to disk afterwards during zios and other synctasks
	 * do not reuse checkpointed blocks.
	 */
	error = dsl_early_sync_task(pool, spa_checkpoint_check,
	    spa_checkpoint_sync, NULL, 0, ZFS_SPACE_CHECK_NORMAL);

	mutex_exit(&spa->spa_vdev_top_lock);

	spa_close(spa, FTAG);
	return (error);
}

static int
spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx)
{
	(void) arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
		return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));

	if (spa->spa_checkpoint_txg == 0)
		return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));

	VERIFY0(zap_contains(spa_meta_objset(spa),
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT));

	return (0);
}

static void
spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx)
{
	(void) arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ZPOOL_CHECKPOINT, tx));

	spa->spa_checkpoint_txg = 0;

	zthr_wakeup(spa->spa_checkpoint_discard_zthr);

	spa_history_log_internal(spa, "spa discard checkpoint", tx,
	    "started discarding checkpointed state from the pool");
}

/*
 * Discard the checkpoint from a pool.
 */
int
spa_checkpoint_discard(const char *pool)
{
	/*
	 * Similarly to spa_checkpoint(), we want our synctask to run
	 * before any pending dirty data are written to disk so they
	 * won't end up in the checkpoint's data structures (e.g.
	 * ms_checkpointing and vdev_checkpoint_sm) and re-create any
	 * space maps that the discarding open-context thread has
	 * deleted.
	 * [see spa_checkpoint_discard_sync and spa_checkpoint_discard_thread]
	 */
	return (dsl_early_sync_task(pool, spa_checkpoint_discard_check,
	    spa_checkpoint_discard_sync, NULL, 0,
	    ZFS_SPACE_CHECK_DISCARD_CHECKPOINT));
}

EXPORT_SYMBOL(spa_checkpoint_get_stats);
EXPORT_SYMBOL(spa_checkpoint_discard_thread);
EXPORT_SYMBOL(spa_checkpoint_discard_thread_check);

ZFS_MODULE_PARAM(zfs_spa, zfs_spa_, discard_memory_limit, U64, ZMOD_RW,
	"Limit for memory used in prefetching the checkpoint space map done "
	"on each vdev while discarding the checkpoint");