// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2017 by Delphix. All rights reserved.
 */

/*
 * Storage Pool Checkpoint
 *
 * A storage pool checkpoint can be thought of as a pool-wide snapshot or
 * a stable version of extreme rewind that guarantees no blocks from the
 * checkpointed state will have been overwritten. It remembers the entire
 * state of the storage pool (e.g. snapshots, dataset names, etc.) from the
 * point that it was taken and the user can rewind back to that point even if
 * they applied destructive operations on their datasets or even enabled new
 * zpool on-disk features. If a pool has a checkpoint that is no longer
 * needed, the user can discard it.
 *
 * == On disk data structures used ==
 *
 * - The pool has a new feature flag and a new entry in the MOS. The feature
 *   flag is set to active when we create the checkpoint and remains active
 *   until the checkpoint is fully discarded. The entry in the MOS config
 *   (DMU_POOL_ZPOOL_CHECKPOINT) is populated with the uberblock that
 *   references the state of the pool when we take the checkpoint. The entry
 *   remains populated until we start discarding the checkpoint or we rewind
 *   back to it.
 *
 * - Each vdev contains a vdev-wide space map while the pool has a checkpoint,
 *   which persists until the checkpoint is fully discarded. The space map
 *   contains entries that have been freed in the current state of the pool
 *   but we want to keep around in case we decide to rewind to the checkpoint.
 *   [see vdev_checkpoint_sm]
 *
 * - Each metaslab's ms_sm space map behaves the same as without the
 *   checkpoint, with the only exception being the scenario when we free
 *   blocks that belong to the checkpoint. In this case, these blocks remain
 *   ALLOCATED in the metaslab's space map and they are added as FREE in the
 *   vdev's checkpoint space map.
 *
 * - Each uberblock has a field (ub_checkpoint_txg) which holds the txg in
 *   which the uberblock was checkpointed. For normal uberblocks this field
 *   is 0.
 *
 * == Overview of operations ==
 *
 * - To create a checkpoint, we first wait for the current TXG to be synced,
 *   so we can use the most recently synced uberblock (spa_ubsync) as the
 *   checkpointed uberblock. Then we use an early synctask to place that
 *   uberblock in MOS config, increment the feature flag for the checkpoint
 *   (marking it active), and set spa_checkpoint_txg (see its use below)
 *   to the TXG of the checkpointed uberblock. We use an early synctask for
 *   the aforementioned operations to ensure that no blocks were dirtied
 *   between the current TXG and the TXG of the checkpointed uberblock
 *   (i.e. the previous txg).
 *
 * - When a checkpoint exists, we need to ensure that the blocks that
 *   belong to the checkpoint are freed but never reused. This means that
 *   these blocks should never end up in the ms_allocatable or the ms_freeing
 *   trees of a metaslab. Therefore, whenever there is a checkpoint the new
 *   ms_checkpointing tree is used in addition to the aforementioned ones.
 *
 *   Whenever a block is freed and we find out that it is referenced by the
 *   checkpoint (which we determine by comparing its birth txg to
 *   spa_checkpoint_txg), we place it in the ms_checkpointing tree instead
 *   of the ms_freeing tree. This way, we divide the blocks that are being
 *   freed into checkpointed and not-checkpointed blocks.
 *
 *   In order to persist these frees, we write the extents from the
 *   ms_freeing tree to the ms_sm as usual, and the extents from the
 *   ms_checkpointing tree to the vdev_checkpoint_sm. This way, these
 *   checkpointed extents will remain allocated in the metaslab's ms_sm space
 *   map, and therefore won't be reused [see metaslab_sync()]. In addition,
 *   when we discard the checkpoint, we can find the entries that have
 *   actually been freed in vdev_checkpoint_sm.
 *   [see spa_checkpoint_discard_thread_sync()]
 *
 * - To discard the checkpoint we use an early synctask to delete the
 *   checkpointed uberblock from the MOS config, set spa_checkpoint_txg to 0,
 *   and wake up the discarding zthr (an open-context async thread).
 *   We use an early synctask to ensure that the operation happens before any
 *   new data end up in the checkpoint's data structures.
 *
 *   Once the synctask is done and the discarding zthr is awake, we discard
 *   the checkpointed data over multiple TXGs by having the zthr prefetch
 *   entries from vdev_checkpoint_sm and then start a synctask that places
 *   them as free blocks into their respective ms_allocatable and ms_sm
 *   structures.
 *   [see spa_checkpoint_discard_thread()]
 *
 *   When there are no entries left in the vdev_checkpoint_sm of all
 *   top-level vdevs, a final synctask runs that decrements the feature flag.
 *
 * - To rewind to the checkpoint, we first use the current uberblock and
 *   open the MOS so we can access the checkpointed uberblock from the MOS
 *   config. After we retrieve the checkpointed uberblock, we use it as the
 *   current uberblock for the pool by writing it to disk with an updated
 *   TXG, opening its version of the MOS, and moving on as usual from there.
 *   [see spa_ld_checkpoint_rewind()]
 *
 *   An important note on rewinding to the checkpoint has to do with how we
 *   handle ZIL blocks. In the scenario of a rewind, we clear out any ZIL
 *   blocks that have not been claimed by the time we took the checkpoint
 *   as they should no longer be valid.
 *   [see comment in zil_claim()]
 *
 * == Miscellaneous information ==
 *
 * - In the hypothetical event that we take a checkpoint, remove a vdev,
 *   and attempt to rewind, the rewind would fail as the checkpointed
 *   uberblock would reference data in the removed device. For this reason
 *   and others of similar nature, we disallow the following operations that
 *   can change the config:
 *   vdev removal and attach/detach, mirror splitting, and pool reguid.
 *
 * - As most of the checkpoint logic is implemented in the SPA and doesn't
 *   distinguish datasets when it comes to space accounting, having a
 *   checkpoint can potentially break the boundaries set by dataset
 *   reservations.
 */
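
/*
 * Illustrative sketch (not part of this file): the routing of a freed
 * block to either ms_checkpointing or ms_freeing described above is
 * implemented in metaslab.c. Paraphrased, and with details omitted, the
 * decision looks roughly like this:
 *
 *	boolean_t checkpoint = B_FALSE;
 *	if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg &&
 *	    spa_syncing_txg(spa) > spa->spa_checkpoint_txg)
 *		checkpoint = B_TRUE;
 *	...
 *	if (checkpoint)
 *		zfs_range_tree_add(msp->ms_checkpointing, offset, asize);
 *	else
 *		zfs_range_tree_add(msp->ms_freeing, offset, asize);
 *
 * See metaslab_free() and metaslab_free_concrete() for the actual code.
 */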

#include <sys/dmu_tx.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/metaslab_impl.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/spa_checkpoint.h>
#include <sys/vdev_impl.h>
#include <sys/zap.h>
#include <sys/zfeature.h>

/*
 * The following parameter limits the amount of memory used for prefetching
 * the checkpoint space map of each vdev while discarding the checkpoint.
 *
 * It exists because top-level vdevs with long checkpoint space maps can
 * take up a lot of memory, depending on the amount of checkpointed data
 * that has been freed within them while the pool had a checkpoint.
 */
static uint64_t zfs_spa_discard_memory_limit = 16 * 1024 * 1024;
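
/*
 * Note: the limit above is registered at the bottom of this file via
 * ZFS_MODULE_PARAM() as "zfs_spa_discard_memory_limit"; on Linux it can
 * be inspected and tuned at runtime through
 * /sys/module/zfs/parameters/zfs_spa_discard_memory_limit.
 */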

int
spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs)
{
	if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
		return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));

	memset(pcs, 0, sizeof (pool_checkpoint_stat_t));

	int error = zap_contains(spa_meta_objset(spa),
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT);
	ASSERT(error == 0 || error == ENOENT);

	if (error == ENOENT)
		pcs->pcs_state = CS_CHECKPOINT_DISCARDING;
	else
		pcs->pcs_state = CS_CHECKPOINT_EXISTS;

	pcs->pcs_space = spa->spa_checkpoint_info.sci_dspace;
	pcs->pcs_start_time = spa->spa_checkpoint_info.sci_timestamp;

	return (0);
}

static void
spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx)
{
	spa_t *spa = arg;

	spa->spa_checkpoint_info.sci_timestamp = 0;

	spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
	spa_notify_waiters(spa);

	spa_history_log_internal(spa, "spa discard checkpoint", tx,
	    "finished discarding checkpointed state from the pool");
}

typedef struct spa_checkpoint_discard_sync_callback_arg {
	vdev_t *sdc_vd;
	uint64_t sdc_txg;
	uint64_t sdc_entry_limit;
} spa_checkpoint_discard_sync_callback_arg_t;

static int
spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg)
{
	spa_checkpoint_discard_sync_callback_arg_t *sdc = arg;
	vdev_t *vd = sdc->sdc_vd;
	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
	uint64_t end = sme->sme_offset + sme->sme_run;

	if (sdc->sdc_entry_limit == 0)
		return (SET_ERROR(EINTR));

	/*
	 * Since the space map is not condensed, we know that
	 * none of its entries crosses the boundaries of its
	 * respective metaslab.
	 *
	 * That said, there is no fundamental requirement that
	 * the checkpoint's space map entries should not cross
	 * metaslab boundaries. So if needed we could add code
	 * that handles metaslab-crossing segments in the future.
	 */
	VERIFY3U(sme->sme_type, ==, SM_FREE);
	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);

	/*
	 * At this point we should not be processing any
	 * other frees concurrently, so the lock is technically
	 * unnecessary. We take the lock anyway to potentially
	 * save ourselves from future headaches.
	 */
	mutex_enter(&ms->ms_lock);
	if (zfs_range_tree_is_empty(ms->ms_freeing))
		vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg);
	zfs_range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run);
	mutex_exit(&ms->ms_lock);

	ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=,
	    sme->sme_run);
	ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run);

	vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run;
	vd->vdev_stat.vs_checkpoint_space -= sme->sme_run;
	sdc->sdc_entry_limit--;

	return (0);
}

#ifdef ZFS_DEBUG
static void
spa_checkpoint_accounting_verify(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t ckpoint_sm_space_sum = 0;
	uint64_t vs_ckpoint_space_sum = 0;

	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];

		if (vd->vdev_checkpoint_sm != NULL) {
			ckpoint_sm_space_sum +=
			    -space_map_allocated(vd->vdev_checkpoint_sm);
			vs_ckpoint_space_sum +=
			    vd->vdev_stat.vs_checkpoint_space;
			ASSERT3U(ckpoint_sm_space_sum, ==,
			    vs_ckpoint_space_sum);
		} else {
			ASSERT0(vd->vdev_stat.vs_checkpoint_space);
		}
	}
	ASSERT3U(spa->spa_checkpoint_info.sci_dspace, ==, ckpoint_sm_space_sum);
}
#endif

static void
spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
{
	vdev_t *vd = arg;
	int error;

	/*
	 * The space map callback is applied only to non-debug entries.
	 * Because the number of debug entries is less than or equal to
	 * the number of non-debug entries, we want to ensure that we
	 * only read what we prefetched in open context.
	 *
	 * Thus, we set the maximum entries that the space map callback
	 * will be applied to be half the entries that could fit in the
	 * imposed memory limit.
	 *
	 * Note that since this is a conservative estimate we also
	 * assume the worst case scenario in our computation, where each
	 * entry is two words.
	 */
	uint64_t max_entry_limit =
	    (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1;
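
	/*
	 * For example, with the default zfs_spa_discard_memory_limit of
	 * 16MB and worst-case two-word (16-byte) entries, at most
	 * 16777216 / 16 = 1048576 entries fit in the prefetched region,
	 * so max_entry_limit works out to 524288 entries per pass.
	 */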

	/*
	 * Iterate from the end of the space map towards the beginning,
	 * placing its entries on ms_freeing and removing them from the
	 * space map. The iteration stops if one of the following
	 * conditions is true:
	 *
	 * 1] We reached the beginning of the space map. At this point
	 *    the space map should be completely empty and
	 *    space_map_incremental_destroy should have returned 0.
	 *    The next step would be to free and close the space map
	 *    and remove its entry from its vdev's top zap. This allows
	 *    spa_checkpoint_discard_thread() to move on to the next vdev.
	 *
	 * 2] We reached the memory limit (amount of memory used to hold
	 *    space map entries in memory) and space_map_incremental_destroy
	 *    returned EINTR. This means that there are entries remaining
	 *    in the space map that will be cleared in a future invocation
	 *    of this function by spa_checkpoint_discard_thread().
	 */
	spa_checkpoint_discard_sync_callback_arg_t sdc;
	sdc.sdc_vd = vd;
	sdc.sdc_txg = tx->tx_txg;
	sdc.sdc_entry_limit = max_entry_limit;

	uint64_t words_before =
	    space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);

	error = space_map_incremental_destroy(vd->vdev_checkpoint_sm,
	    spa_checkpoint_discard_sync_callback, &sdc, tx);

	uint64_t words_after =
	    space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);

#ifdef ZFS_DEBUG
	spa_checkpoint_accounting_verify(vd->vdev_spa);
#endif

	zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %lld, "
	    "deleted %llu words - %llu words are left",
	    (u_longlong_t)tx->tx_txg, (longlong_t)vd->vdev_id,
	    (u_longlong_t)(words_before - words_after),
	    (u_longlong_t)words_after);

	if (error != EINTR) {
		if (error != 0) {
			zfs_panic_recover("zfs: error %lld was returned "
			    "while incrementally destroying the checkpoint "
			    "space map of vdev %llu\n",
			    (longlong_t)error, vd->vdev_id);
		}
		ASSERT0(words_after);
		ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm));
		ASSERT0(space_map_length(vd->vdev_checkpoint_sm));

		space_map_free(vd->vdev_checkpoint_sm, tx);
		space_map_close(vd->vdev_checkpoint_sm);
		vd->vdev_checkpoint_sm = NULL;

		VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa),
		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx));
	}
}

static boolean_t
spa_checkpoint_discard_is_done(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(!spa_has_checkpoint(spa));
	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT));

	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
		if (rvd->vdev_child[c]->vdev_checkpoint_sm != NULL)
			return (B_FALSE);
		ASSERT0(rvd->vdev_child[c]->vdev_stat.vs_checkpoint_space);
	}

	return (B_TRUE);
}

boolean_t
spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr)
{
	(void) zthr;
	spa_t *spa = arg;

	if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
		return (B_FALSE);

	if (spa_has_checkpoint(spa))
		return (B_FALSE);

	return (B_TRUE);
}

void
spa_checkpoint_discard_thread(void *arg, zthr_t *zthr)
{
	spa_t *spa = arg;
	vdev_t *rvd = spa->spa_root_vdev;

	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];

		while (vd->vdev_checkpoint_sm != NULL) {
			space_map_t *checkpoint_sm = vd->vdev_checkpoint_sm;
			int numbufs;
			dmu_buf_t **dbp;

			if (zthr_iscancelled(zthr))
				return;

			ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);

			uint64_t size = MIN(space_map_length(checkpoint_sm),
			    zfs_spa_discard_memory_limit);
			uint64_t offset =
			    space_map_length(checkpoint_sm) - size;

			/*
			 * Ensure that the part of the space map that will
			 * be destroyed by the synctask is prefetched in
			 * memory before the synctask runs.
			 */
			int error = dmu_buf_hold_array_by_bonus(
			    checkpoint_sm->sm_dbuf, offset, size,
			    B_TRUE, FTAG, &numbufs, &dbp);
			if (error != 0) {
				zfs_panic_recover("zfs: error %d was returned "
				    "while prefetching checkpoint space map "
				    "entries of vdev %llu\n",
				    error, vd->vdev_id);
			}

			VERIFY0(dsl_sync_task(spa->spa_name, NULL,
			    spa_checkpoint_discard_thread_sync, vd,
			    0, ZFS_SPACE_CHECK_NONE));

			dmu_buf_rele_array(dbp, numbufs, FTAG);
		}
	}

	VERIFY(spa_checkpoint_discard_is_done(spa));
	VERIFY0(spa->spa_checkpoint_info.sci_dspace);
	VERIFY0(dsl_sync_task(spa->spa_name, NULL,
	    spa_checkpoint_discard_complete_sync, spa,
	    0, ZFS_SPACE_CHECK_NONE));
}

static int
spa_checkpoint_check(void *arg, dmu_tx_t *tx)
{
	(void) arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT))
		return (SET_ERROR(ENOTSUP));

	if (!spa_top_vdevs_spacemap_addressable(spa))
		return (SET_ERROR(ZFS_ERR_VDEV_TOO_BIG));

	if (spa->spa_removing_phys.sr_state == DSS_SCANNING)
		return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS));

	if (spa->spa_raidz_expand != NULL)
		return (SET_ERROR(ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS));

	if (spa->spa_checkpoint_txg != 0)
		return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS));

	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
		return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));

	return (0);
}

static void
spa_checkpoint_sync(void *arg, dmu_tx_t *tx)
{
	(void) arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	spa_t *spa = dp->dp_spa;
	uberblock_t checkpoint = spa->spa_ubsync;

	/*
	 * At this point, there should not be a checkpoint in the MOS.
	 */
	ASSERT3U(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ZPOOL_CHECKPOINT), ==, ENOENT);

	ASSERT0(spa->spa_checkpoint_info.sci_timestamp);
	ASSERT0(spa->spa_checkpoint_info.sci_dspace);

	/*
	 * Since the checkpointed uberblock is the one that just got
	 * synced (we use spa_ubsync), its txg must be the txg that we
	 * are currently syncing, minus 1.
	 */
	ASSERT3U(checkpoint.ub_txg, ==, spa->spa_syncing_txg - 1);
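
	/*
	 * A concrete example of the above (illustrative numbers): if this
	 * early synctask runs while txg 105 is syncing, spa_ubsync is the
	 * uberblock that was written out when txg 104 synced, so the
	 * checkpointed uberblock has ub_txg == 104 and we record
	 * spa_checkpoint_txg = 104 below.
	 */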

	/*
	 * Once the checkpoint is in place, we need to ensure that none of
	 * its blocks will be marked for reuse after they have been freed.
	 * When there is a checkpoint and a block is freed, we compare its
	 * birth txg to the txg of the checkpointed uberblock to see if the
	 * block is part of the checkpoint or not. Therefore, we have to set
	 * spa_checkpoint_txg before any frees happen in this txg (which is
	 * why this is done as an early_synctask as explained in the comment
	 * in spa_checkpoint()).
	 */
	spa->spa_checkpoint_txg = checkpoint.ub_txg;
	spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;

	checkpoint.ub_checkpoint_txg = checkpoint.ub_txg;
	VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT,
	    sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t),
	    &checkpoint, tx));

	/*
	 * Increment the feature refcount and thus activate the feature.
	 * Note that the feature will be deactivated when we've
	 * completely discarded all checkpointed state (both vdev
	 * space maps and uberblock).
	 */
	spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);

	spa_history_log_internal(spa, "spa checkpoint", tx,
	    "checkpointed uberblock txg=%llu", (u_longlong_t)checkpoint.ub_txg);
}

/*
 * Create a checkpoint for the pool.
 */
int
spa_checkpoint(const char *pool)
{
	int error;
	spa_t *spa;

	error = spa_open(pool, &spa, FTAG);
	if (error != 0)
		return (error);

	mutex_enter(&spa->spa_vdev_top_lock);

	/*
	 * Wait for the current syncing txg to finish so the latest synced
	 * uberblock (spa_ubsync) has all the changes that we expect
	 * to see if we were to revert later to the checkpoint. In other
	 * words we want the checkpointed uberblock to include/reference
	 * all the changes that were pending at the time that we issued
	 * the checkpoint command.
	 */
	txg_wait_synced(spa_get_dsl(spa), 0);

	/*
	 * As the checkpointed uberblock references blocks from the previous
	 * txg (spa_ubsync), we want to ensure that we are not freeing any
	 * of these blocks in the same txg that the following synctask will
	 * run. Thus, we run it as an early synctask, so the dirty changes
	 * that are synced to disk afterwards during zios and other synctasks
	 * do not reuse checkpointed blocks.
	 */
	error = dsl_early_sync_task(pool, spa_checkpoint_check,
	    spa_checkpoint_sync, NULL, 0, ZFS_SPACE_CHECK_NORMAL);

	mutex_exit(&spa->spa_vdev_top_lock);

	spa_close(spa, FTAG);
	return (error);
}
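
/*
 * Illustrative sketch, not part of this file: a userland consumer would
 * normally reach spa_checkpoint() and spa_checkpoint_discard() through
 * libzfs_core (as the zpool command does), roughly as follows;
 * lzc_pool_checkpoint() maps to spa_checkpoint() and
 * lzc_pool_checkpoint_discard() maps to spa_checkpoint_discard(). The
 * function name checkpoint_example is made up for illustration.
 *
 *	#include <libzfs_core.h>
 *
 *	static int
 *	checkpoint_example(const char *pool)
 *	{
 *		int error = libzfs_core_init();
 *		if (error != 0)
 *			return (error);
 *		error = lzc_pool_checkpoint(pool);
 *		if (error == 0)
 *			error = lzc_pool_checkpoint_discard(pool);
 *		libzfs_core_fini();
 *		return (error);
 *	}
 */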

static int
spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx)
{
	(void) arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
		return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));

	if (spa->spa_checkpoint_txg == 0)
		return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));

	VERIFY0(zap_contains(spa_meta_objset(spa),
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT));

	return (0);
}

static void
spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx)
{
	(void) arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ZPOOL_CHECKPOINT, tx));

	spa->spa_checkpoint_txg = 0;

	zthr_wakeup(spa->spa_checkpoint_discard_zthr);

	spa_history_log_internal(spa, "spa discard checkpoint", tx,
	    "started discarding checkpointed state from the pool");
}

/*
 * Discard the checkpoint from a pool.
 */
int
spa_checkpoint_discard(const char *pool)
{
	/*
	 * Similarly to spa_checkpoint(), we want our synctask to run
	 * before any pending dirty data are written to disk so they
	 * won't end up in the checkpoint's data structures (e.g.
	 * ms_checkpointing and vdev_checkpoint_sm) and re-create any
	 * space maps that the discarding open-context thread has
	 * deleted.
	 * [see spa_checkpoint_discard_sync() and
	 * spa_checkpoint_discard_thread()]
	 */
	return (dsl_early_sync_task(pool, spa_checkpoint_discard_check,
	    spa_checkpoint_discard_sync, NULL, 0,
	    ZFS_SPACE_CHECK_DISCARD_CHECKPOINT));
}

EXPORT_SYMBOL(spa_checkpoint_get_stats);
EXPORT_SYMBOL(spa_checkpoint_discard_thread);
EXPORT_SYMBOL(spa_checkpoint_discard_thread_check);

ZFS_MODULE_PARAM(zfs_spa, zfs_spa_, discard_memory_limit, U64, ZMOD_RW,
	"Limit for memory used in prefetching the checkpoint space map done "
	"on each vdev while discarding the checkpoint");