1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2017 by Delphix. All rights reserved. 24 */ 25 26 /* 27 * Storage Pool Checkpoint 28 * 29 * A storage pool checkpoint can be thought of as a pool-wide snapshot or 30 * a stable version of extreme rewind that guarantees no blocks from the 31 * checkpointed state will have been overwritten. It remembers the entire 32 * state of the storage pool (e.g. snapshots, dataset names, etc..) from the 33 * point that it was taken and the user can rewind back to that point even if 34 * they applied destructive operations on their datasets or even enabled new 35 * zpool on-disk features. If a pool has a checkpoint that is no longer 36 * needed, the user can discard it. 37 * 38 * == On disk data structures used == 39 * 40 * - The pool has a new feature flag and a new entry in the MOS. The feature 41 * flag is set to active when we create the checkpoint and remains active 42 * until the checkpoint is fully discarded. The entry in the MOS config 43 * (DMU_POOL_ZPOOL_CHECKPOINT) is populated with the uberblock that 44 * references the state of the pool when we take the checkpoint. The entry 45 * remains populated until we start discarding the checkpoint or we rewind 46 * back to it. 47 * 48 * - Each vdev contains a vdev-wide space map while the pool has a checkpoint, 49 * which persists until the checkpoint is fully discarded. The space map 50 * contains entries that have been freed in the current state of the pool 51 * but we want to keep around in case we decide to rewind to the checkpoint. 52 * [see vdev_checkpoint_sm] 53 * 54 * - Each metaslab's ms_sm space map behaves the same as without the 55 * checkpoint, with the only exception being the scenario when we free 56 * blocks that belong to the checkpoint. In this case, these blocks remain 57 * ALLOCATED in the metaslab's space map and they are added as FREE in the 58 * vdev's checkpoint space map. 59 * 60 * - Each uberblock has a field (ub_checkpoint_txg) which holds the txg that 61 * the uberblock was checkpointed. For normal uberblocks this field is 0. 62 * 63 * == Overview of operations == 64 * 65 * - To create a checkpoint, we first wait for the current TXG to be synced, 66 * so we can use the most recently synced uberblock (spa_ubsync) as the 67 * checkpointed uberblock. Then we use an early synctask to place that 68 * uberblock in MOS config, increment the feature flag for the checkpoint 69 * (marking it active), and setting spa_checkpoint_txg (see its use below) 70 * to the TXG of the checkpointed uberblock. We use an early synctask for 71 * the aforementioned operations to ensure that no blocks were dirtied 72 * between the current TXG and the TXG of the checkpointed uberblock 73 * (e.g the previous txg). 74 * 75 * - When a checkpoint exists, we need to ensure that the blocks that 76 * belong to the checkpoint are freed but never reused. This means that 77 * these blocks should never end up in the ms_allocatable or the ms_freeing 78 * trees of a metaslab. Therefore, whenever there is a checkpoint the new 79 * ms_checkpointing tree is used in addition to the aforementioned ones. 80 * 81 * Whenever a block is freed and we find out that it is referenced by the 82 * checkpoint (we find out by comparing its birth to spa_checkpoint_txg), 83 * we place it in the ms_checkpointing tree instead of the ms_freeingtree. 84 * This way, we divide the blocks that are being freed into checkpointed 85 * and not-checkpointed blocks. 86 * 87 * In order to persist these frees, we write the extents from the 88 * ms_freeingtree to the ms_sm as usual, and the extents from the 89 * ms_checkpointing tree to the vdev_checkpoint_sm. This way, these 90 * checkpointed extents will remain allocated in the metaslab's ms_sm space 91 * map, and therefore won't be reused [see metaslab_sync()]. In addition, 92 * when we discard the checkpoint, we can find the entries that have 93 * actually been freed in vdev_checkpoint_sm. 94 * [see spa_checkpoint_discard_thread_sync()] 95 * 96 * - To discard the checkpoint we use an early synctask to delete the 97 * checkpointed uberblock from the MOS config, set spa_checkpoint_txg to 0, 98 * and wakeup the discarding zthr thread (an open-context async thread). 99 * We use an early synctask to ensure that the operation happens before any 100 * new data end up in the checkpoint's data structures. 101 * 102 * Once the synctask is done and the discarding zthr is awake, we discard 103 * the checkpointed data over multiple TXGs by having the zthr prefetching 104 * entries from vdev_checkpoint_sm and then starting a synctask that places 105 * them as free blocks into their respective ms_allocatable and ms_sm 106 * structures. 107 * [see spa_checkpoint_discard_thread()] 108 * 109 * When there are no entries left in the vdev_checkpoint_sm of all 110 * top-level vdevs, a final synctask runs that decrements the feature flag. 111 * 112 * - To rewind to the checkpoint, we first use the current uberblock and 113 * open the MOS so we can access the checkpointed uberblock from the MOS 114 * config. After we retrieve the checkpointed uberblock, we use it as the 115 * current uberblock for the pool by writing it to disk with an updated 116 * TXG, opening its version of the MOS, and moving on as usual from there. 117 * [see spa_ld_checkpoint_rewind()] 118 * 119 * An important note on rewinding to the checkpoint has to do with how we 120 * handle ZIL blocks. In the scenario of a rewind, we clear out any ZIL 121 * blocks that have not been claimed by the time we took the checkpoint 122 * as they should no longer be valid. 123 * [see comment in zil_claim()] 124 * 125 * == Miscellaneous information == 126 * 127 * - In the hypothetical event that we take a checkpoint, remove a vdev, 128 * and attempt to rewind, the rewind would fail as the checkpointed 129 * uberblock would reference data in the removed device. For this reason 130 * and others of similar nature, we disallow the following operations that 131 * can change the config: 132 * vdev removal and attach/detach, mirror splitting, and pool reguid. 133 * 134 * - As most of the checkpoint logic is implemented in the SPA and doesn't 135 * distinguish datasets when it comes to space accounting, having a 136 * checkpoint can potentially break the boundaries set by dataset 137 * reservations. 138 */ 139 140 #include <sys/dmu_tx.h> 141 #include <sys/dsl_dir.h> 142 #include <sys/dsl_synctask.h> 143 #include <sys/metaslab_impl.h> 144 #include <sys/spa.h> 145 #include <sys/spa_impl.h> 146 #include <sys/spa_checkpoint.h> 147 #include <sys/vdev_impl.h> 148 #include <sys/zap.h> 149 #include <sys/zfeature.h> 150 151 /* 152 * The following parameter limits the amount of memory to be used for the 153 * prefetching of the checkpoint space map done on each vdev while 154 * discarding the checkpoint. 155 * 156 * The reason it exists is because top-level vdevs with long checkpoint 157 * space maps can potentially take up a lot of memory depending on the 158 * amount of checkpointed data that has been freed within them while 159 * the pool had a checkpoint. 160 */ 161 unsigned long zfs_spa_discard_memory_limit = 16 * 1024 * 1024; 162 163 int 164 spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs) 165 { 166 if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) 167 return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT)); 168 169 bzero(pcs, sizeof (pool_checkpoint_stat_t)); 170 171 int error = zap_contains(spa_meta_objset(spa), 172 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT); 173 ASSERT(error == 0 || error == ENOENT); 174 175 if (error == ENOENT) 176 pcs->pcs_state = CS_CHECKPOINT_DISCARDING; 177 else 178 pcs->pcs_state = CS_CHECKPOINT_EXISTS; 179 180 pcs->pcs_space = spa->spa_checkpoint_info.sci_dspace; 181 pcs->pcs_start_time = spa->spa_checkpoint_info.sci_timestamp; 182 183 return (0); 184 } 185 186 static void 187 spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx) 188 { 189 spa_t *spa = arg; 190 191 spa->spa_checkpoint_info.sci_timestamp = 0; 192 193 spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx); 194 spa_notify_waiters(spa); 195 196 spa_history_log_internal(spa, "spa discard checkpoint", tx, 197 "finished discarding checkpointed state from the pool"); 198 } 199 200 typedef struct spa_checkpoint_discard_sync_callback_arg { 201 vdev_t *sdc_vd; 202 uint64_t sdc_txg; 203 uint64_t sdc_entry_limit; 204 } spa_checkpoint_discard_sync_callback_arg_t; 205 206 static int 207 spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg) 208 { 209 spa_checkpoint_discard_sync_callback_arg_t *sdc = arg; 210 vdev_t *vd = sdc->sdc_vd; 211 metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; 212 uint64_t end = sme->sme_offset + sme->sme_run; 213 214 if (sdc->sdc_entry_limit == 0) 215 return (SET_ERROR(EINTR)); 216 217 /* 218 * Since the space map is not condensed, we know that 219 * none of its entries is crossing the boundaries of 220 * its respective metaslab. 221 * 222 * That said, there is no fundamental requirement that 223 * the checkpoint's space map entries should not cross 224 * metaslab boundaries. So if needed we could add code 225 * that handles metaslab-crossing segments in the future. 226 */ 227 VERIFY3U(sme->sme_type, ==, SM_FREE); 228 VERIFY3U(sme->sme_offset, >=, ms->ms_start); 229 VERIFY3U(end, <=, ms->ms_start + ms->ms_size); 230 231 /* 232 * At this point we should not be processing any 233 * other frees concurrently, so the lock is technically 234 * unnecessary. We use the lock anyway though to 235 * potentially save ourselves from future headaches. 236 */ 237 mutex_enter(&ms->ms_lock); 238 if (range_tree_is_empty(ms->ms_freeing)) 239 vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg); 240 range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run); 241 mutex_exit(&ms->ms_lock); 242 243 ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, 244 sme->sme_run); 245 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run); 246 247 vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run; 248 vd->vdev_stat.vs_checkpoint_space -= sme->sme_run; 249 sdc->sdc_entry_limit--; 250 251 return (0); 252 } 253 254 #ifdef ZFS_DEBUG 255 static void 256 spa_checkpoint_accounting_verify(spa_t *spa) 257 { 258 vdev_t *rvd = spa->spa_root_vdev; 259 uint64_t ckpoint_sm_space_sum = 0; 260 uint64_t vs_ckpoint_space_sum = 0; 261 262 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 263 vdev_t *vd = rvd->vdev_child[c]; 264 265 if (vd->vdev_checkpoint_sm != NULL) { 266 ckpoint_sm_space_sum += 267 -space_map_allocated(vd->vdev_checkpoint_sm); 268 vs_ckpoint_space_sum += 269 vd->vdev_stat.vs_checkpoint_space; 270 ASSERT3U(ckpoint_sm_space_sum, ==, 271 vs_ckpoint_space_sum); 272 } else { 273 ASSERT0(vd->vdev_stat.vs_checkpoint_space); 274 } 275 } 276 ASSERT3U(spa->spa_checkpoint_info.sci_dspace, ==, ckpoint_sm_space_sum); 277 } 278 #endif 279 280 static void 281 spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) 282 { 283 vdev_t *vd = arg; 284 int error; 285 286 /* 287 * The space map callback is applied only to non-debug entries. 288 * Because the number of debug entries is less or equal to the 289 * number of non-debug entries, we want to ensure that we only 290 * read what we prefetched from open-context. 291 * 292 * Thus, we set the maximum entries that the space map callback 293 * will be applied to be half the entries that could fit in the 294 * imposed memory limit. 295 * 296 * Note that since this is a conservative estimate we also 297 * assume the worst case scenario in our computation where each 298 * entry is two-word. 299 */ 300 uint64_t max_entry_limit = 301 (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1; 302 303 /* 304 * Iterate from the end of the space map towards the beginning, 305 * placing its entries on ms_freeing and removing them from the 306 * space map. The iteration stops if one of the following 307 * conditions is true: 308 * 309 * 1] We reached the beginning of the space map. At this point 310 * the space map should be completely empty and 311 * space_map_incremental_destroy should have returned 0. 312 * The next step would be to free and close the space map 313 * and remove its entry from its vdev's top zap. This allows 314 * spa_checkpoint_discard_thread() to move on to the next vdev. 315 * 316 * 2] We reached the memory limit (amount of memory used to hold 317 * space map entries in memory) and space_map_incremental_destroy 318 * returned EINTR. This means that there are entries remaining 319 * in the space map that will be cleared in a future invocation 320 * of this function by spa_checkpoint_discard_thread(). 321 */ 322 spa_checkpoint_discard_sync_callback_arg_t sdc; 323 sdc.sdc_vd = vd; 324 sdc.sdc_txg = tx->tx_txg; 325 sdc.sdc_entry_limit = max_entry_limit; 326 327 uint64_t words_before = 328 space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); 329 330 error = space_map_incremental_destroy(vd->vdev_checkpoint_sm, 331 spa_checkpoint_discard_sync_callback, &sdc, tx); 332 333 uint64_t words_after = 334 space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); 335 336 #ifdef ZFS_DEBUG 337 spa_checkpoint_accounting_verify(vd->vdev_spa); 338 #endif 339 340 zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, " 341 "deleted %llu words - %llu words are left", 342 tx->tx_txg, vd->vdev_id, (words_before - words_after), 343 words_after); 344 345 if (error != EINTR) { 346 if (error != 0) { 347 zfs_panic_recover("zfs: error %d was returned " 348 "while incrementally destroying the checkpoint " 349 "space map of vdev %llu\n", 350 error, vd->vdev_id); 351 } 352 ASSERT0(words_after); 353 ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm)); 354 ASSERT0(space_map_length(vd->vdev_checkpoint_sm)); 355 356 space_map_free(vd->vdev_checkpoint_sm, tx); 357 space_map_close(vd->vdev_checkpoint_sm); 358 vd->vdev_checkpoint_sm = NULL; 359 360 VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa), 361 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx)); 362 } 363 } 364 365 static boolean_t 366 spa_checkpoint_discard_is_done(spa_t *spa) 367 { 368 vdev_t *rvd = spa->spa_root_vdev; 369 370 ASSERT(!spa_has_checkpoint(spa)); 371 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)); 372 373 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 374 if (rvd->vdev_child[c]->vdev_checkpoint_sm != NULL) 375 return (B_FALSE); 376 ASSERT0(rvd->vdev_child[c]->vdev_stat.vs_checkpoint_space); 377 } 378 379 return (B_TRUE); 380 } 381 382 /* ARGSUSED */ 383 boolean_t 384 spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr) 385 { 386 spa_t *spa = arg; 387 388 if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) 389 return (B_FALSE); 390 391 if (spa_has_checkpoint(spa)) 392 return (B_FALSE); 393 394 return (B_TRUE); 395 } 396 397 void 398 spa_checkpoint_discard_thread(void *arg, zthr_t *zthr) 399 { 400 spa_t *spa = arg; 401 vdev_t *rvd = spa->spa_root_vdev; 402 403 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 404 vdev_t *vd = rvd->vdev_child[c]; 405 406 while (vd->vdev_checkpoint_sm != NULL) { 407 space_map_t *checkpoint_sm = vd->vdev_checkpoint_sm; 408 int numbufs; 409 dmu_buf_t **dbp; 410 411 if (zthr_iscancelled(zthr)) 412 return; 413 414 ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops); 415 416 uint64_t size = MIN(space_map_length(checkpoint_sm), 417 zfs_spa_discard_memory_limit); 418 uint64_t offset = 419 space_map_length(checkpoint_sm) - size; 420 421 /* 422 * Ensure that the part of the space map that will 423 * be destroyed by the synctask, is prefetched in 424 * memory before the synctask runs. 425 */ 426 int error = dmu_buf_hold_array_by_bonus( 427 checkpoint_sm->sm_dbuf, offset, size, 428 B_TRUE, FTAG, &numbufs, &dbp); 429 if (error != 0) { 430 zfs_panic_recover("zfs: error %d was returned " 431 "while prefetching checkpoint space map " 432 "entries of vdev %llu\n", 433 error, vd->vdev_id); 434 } 435 436 VERIFY0(dsl_sync_task(spa->spa_name, NULL, 437 spa_checkpoint_discard_thread_sync, vd, 438 0, ZFS_SPACE_CHECK_NONE)); 439 440 dmu_buf_rele_array(dbp, numbufs, FTAG); 441 } 442 } 443 444 VERIFY(spa_checkpoint_discard_is_done(spa)); 445 VERIFY0(spa->spa_checkpoint_info.sci_dspace); 446 VERIFY0(dsl_sync_task(spa->spa_name, NULL, 447 spa_checkpoint_discard_complete_sync, spa, 448 0, ZFS_SPACE_CHECK_NONE)); 449 } 450 451 452 /* ARGSUSED */ 453 static int 454 spa_checkpoint_check(void *arg, dmu_tx_t *tx) 455 { 456 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 457 458 if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT)) 459 return (SET_ERROR(ENOTSUP)); 460 461 if (!spa_top_vdevs_spacemap_addressable(spa)) 462 return (SET_ERROR(ZFS_ERR_VDEV_TOO_BIG)); 463 464 if (spa->spa_removing_phys.sr_state == DSS_SCANNING) 465 return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS)); 466 467 if (spa->spa_checkpoint_txg != 0) 468 return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS)); 469 470 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) 471 return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT)); 472 473 return (0); 474 } 475 476 /* ARGSUSED */ 477 static void 478 spa_checkpoint_sync(void *arg, dmu_tx_t *tx) 479 { 480 dsl_pool_t *dp = dmu_tx_pool(tx); 481 spa_t *spa = dp->dp_spa; 482 uberblock_t checkpoint = spa->spa_ubsync; 483 484 /* 485 * At this point, there should not be a checkpoint in the MOS. 486 */ 487 ASSERT3U(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, 488 DMU_POOL_ZPOOL_CHECKPOINT), ==, ENOENT); 489 490 ASSERT0(spa->spa_checkpoint_info.sci_timestamp); 491 ASSERT0(spa->spa_checkpoint_info.sci_dspace); 492 493 /* 494 * Since the checkpointed uberblock is the one that just got synced 495 * (we use spa_ubsync), its txg must be equal to the txg number of 496 * the txg we are syncing, minus 1. 497 */ 498 ASSERT3U(checkpoint.ub_txg, ==, spa->spa_syncing_txg - 1); 499 500 /* 501 * Once the checkpoint is in place, we need to ensure that none of 502 * its blocks will be marked for reuse after it has been freed. 503 * When there is a checkpoint and a block is freed, we compare its 504 * birth txg to the txg of the checkpointed uberblock to see if the 505 * block is part of the checkpoint or not. Therefore, we have to set 506 * spa_checkpoint_txg before any frees happen in this txg (which is 507 * why this is done as an early_synctask as explained in the comment 508 * in spa_checkpoint()). 509 */ 510 spa->spa_checkpoint_txg = checkpoint.ub_txg; 511 spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; 512 513 checkpoint.ub_checkpoint_txg = checkpoint.ub_txg; 514 VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, 515 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, 516 sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), 517 &checkpoint, tx)); 518 519 /* 520 * Increment the feature refcount and thus activate the feature. 521 * Note that the feature will be deactivated when we've 522 * completely discarded all checkpointed state (both vdev 523 * space maps and uberblock). 524 */ 525 spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx); 526 527 spa_history_log_internal(spa, "spa checkpoint", tx, 528 "checkpointed uberblock txg=%llu", (u_longlong_t)checkpoint.ub_txg); 529 } 530 531 /* 532 * Create a checkpoint for the pool. 533 */ 534 int 535 spa_checkpoint(const char *pool) 536 { 537 int error; 538 spa_t *spa; 539 540 error = spa_open(pool, &spa, FTAG); 541 if (error != 0) 542 return (error); 543 544 mutex_enter(&spa->spa_vdev_top_lock); 545 546 /* 547 * Wait for current syncing txg to finish so the latest synced 548 * uberblock (spa_ubsync) has all the changes that we expect 549 * to see if we were to revert later to the checkpoint. In other 550 * words we want the checkpointed uberblock to include/reference 551 * all the changes that were pending at the time that we issued 552 * the checkpoint command. 553 */ 554 txg_wait_synced(spa_get_dsl(spa), 0); 555 556 /* 557 * As the checkpointed uberblock references blocks from the previous 558 * txg (spa_ubsync) we want to ensure that are not freeing any of 559 * these blocks in the same txg that the following synctask will 560 * run. Thus, we run it as an early synctask, so the dirty changes 561 * that are synced to disk afterwards during zios and other synctasks 562 * do not reuse checkpointed blocks. 563 */ 564 error = dsl_early_sync_task(pool, spa_checkpoint_check, 565 spa_checkpoint_sync, NULL, 0, ZFS_SPACE_CHECK_NORMAL); 566 567 mutex_exit(&spa->spa_vdev_top_lock); 568 569 spa_close(spa, FTAG); 570 return (error); 571 } 572 573 /* ARGSUSED */ 574 static int 575 spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx) 576 { 577 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 578 579 if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) 580 return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT)); 581 582 if (spa->spa_checkpoint_txg == 0) 583 return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT)); 584 585 VERIFY0(zap_contains(spa_meta_objset(spa), 586 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT)); 587 588 return (0); 589 } 590 591 /* ARGSUSED */ 592 static void 593 spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx) 594 { 595 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 596 597 VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, 598 DMU_POOL_ZPOOL_CHECKPOINT, tx)); 599 600 spa->spa_checkpoint_txg = 0; 601 602 zthr_wakeup(spa->spa_checkpoint_discard_zthr); 603 604 spa_history_log_internal(spa, "spa discard checkpoint", tx, 605 "started discarding checkpointed state from the pool"); 606 } 607 608 /* 609 * Discard the checkpoint from a pool. 610 */ 611 int 612 spa_checkpoint_discard(const char *pool) 613 { 614 /* 615 * Similarly to spa_checkpoint(), we want our synctask to run 616 * before any pending dirty data are written to disk so they 617 * won't end up in the checkpoint's data structures (e.g. 618 * ms_checkpointing and vdev_checkpoint_sm) and re-create any 619 * space maps that the discarding open-context thread has 620 * deleted. 621 * [see spa_discard_checkpoint_sync and spa_discard_checkpoint_thread] 622 */ 623 return (dsl_early_sync_task(pool, spa_checkpoint_discard_check, 624 spa_checkpoint_discard_sync, NULL, 0, 625 ZFS_SPACE_CHECK_DISCARD_CHECKPOINT)); 626 } 627 628 EXPORT_SYMBOL(spa_checkpoint_get_stats); 629 EXPORT_SYMBOL(spa_checkpoint_discard_thread); 630 EXPORT_SYMBOL(spa_checkpoint_discard_thread_check); 631 632 /* BEGIN CSTYLED */ 633 ZFS_MODULE_PARAM(zfs_spa, zfs_spa_, discard_memory_limit, ULONG, ZMOD_RW, 634 "Limit for memory used in prefetching the checkpoint space map done " 635 "on each vdev while discarding the checkpoint"); 636 /* END CSTYLED */ 637