/*
 * CDDL HEADER START
 *
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/metaslab.h>
#include <sys/refcount.h>
#include <sys/dmu.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_synctask.h>
#include <sys/zap.h>
#include <sys/abd.h>
#include <sys/zthr.h>

/*
 * An indirect vdev corresponds to a vdev that has been removed. Since
 * we cannot rewrite block pointers of snapshots, etc., we keep a
 * mapping from old location on the removed device to the new location
 * on another device in the pool and use this mapping whenever we need
 * to access the DVA. Unfortunately, this mapping did not respect
 * logical block boundaries when it was first created, and so a DVA on
 * this indirect vdev may be "split" into multiple sections that each
 * map to a different location. As a consequence, not all DVAs can be
 * translated to an equivalent new DVA. Instead we must provide a
 * "vdev_remap" operation that executes a callback on each contiguous
 * segment of the new location. This function is used in multiple ways:
 *
 *  - reads and repair writes to this device use the callback to create
 *    a child io for each mapped segment.
 *
 *  - frees and claims to this device use the callback to free or claim
 *    each mapped segment. (Note that we don't actually need to claim
 *    log blocks on indirect vdevs, because we don't allocate to
 *    removing vdevs. However, zdb uses zio_claim() for its leak
 *    detection.)
 */

/*
 * "Big theory statement" for how we mark blocks obsolete.
 *
 * When a block on an indirect vdev is freed or remapped, a section of
 * that vdev's mapping may no longer be referenced (aka "obsolete"). We
 * keep track of how much of each mapping entry is obsolete. When
 * an entry becomes completely obsolete, we can remove it, thus reducing
 * the memory used by the mapping. The complete picture of obsolescence
 * is given by the following data structures, described below:
 *  - the entry-specific obsolete count
 *  - the vdev-specific obsolete spacemap
 *  - the pool-specific obsolete bpobj
 *
 * == On disk data structures used ==
 *
 * We track the obsolete space for the pool using several objects. Each
 * of these objects is created on demand and freed when no longer
 * needed, and is assumed to be empty if it does not exist.
 * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
 *
 *  - Each vic_mapping_object (associated with an indirect vdev) can
 *    have a vimp_counts_object. This is an array of uint32_t's
 *    with the same number of entries as the vic_mapping_object. When
 *    the mapping is condensed, entries from the vic_obsolete_sm_object
 *    (see below) are folded into the counts. Therefore, each
 *    obsolete_counts entry tells us the number of bytes in the
 *    corresponding mapping entry that were not referenced when the
 *    mapping was last condensed.
 *
 *  - Each indirect or removing vdev can have a vic_obsolete_sm_object.
 *    This is a space map containing an alloc entry for every DVA that
 *    has been obsoleted since the last time this indirect vdev was
 *    condensed. We use this object in order to improve performance
 *    when marking a DVA as obsolete. Instead of modifying an arbitrary
 *    offset of the vimp_counts_object, we only need to append an entry
 *    to the end of this object. When a DVA becomes obsolete, it is
 *    added to the obsolete space map. This happens when the DVA is
 *    freed, remapped and not referenced by a snapshot, or the last
 *    snapshot referencing it is destroyed.
 *
 *  - Each dataset can have a ds_remap_deadlist object. This is a
 *    deadlist object containing all blocks that were remapped in this
 *    dataset but referenced in a previous snapshot. Blocks can *only*
 *    appear on this list if they were remapped (dsl_dataset_block_remapped);
 *    blocks that were killed in a head dataset are put on the normal
 *    ds_deadlist and marked obsolete when they are freed.
 *
 *  - The pool can have a dp_obsolete_bpobj. This is a list of blocks
 *    in the pool that need to be marked obsolete. When a snapshot is
 *    destroyed, we move some of the ds_remap_deadlist to the obsolete
 *    bpobj (see dsl_destroy_snapshot_handle_remaps()). We then
 *    asynchronously process the obsolete bpobj, moving its entries to
 *    the specific vdevs' obsolete space maps.
 *
 * == Summary of how we mark blocks as obsolete ==
 *
 * - When freeing a block: if any DVA is on an indirect vdev, append to
 *   vic_obsolete_sm_object.
 * - When remapping a block, add dva to ds_remap_deadlist (if prev snap
 *   references; otherwise append to vic_obsolete_sm_object).
 * - When freeing a snapshot: move parts of ds_remap_deadlist to
 *   dp_obsolete_bpobj (same algorithm as ds_deadlist).
 * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
 *   individual vdev's vic_obsolete_sm_object.
 */

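/*
 * For example, consider a block that is remapped by a device removal
 * while it is still referenced by an earlier snapshot. The remapped DVA
 * is first recorded in the dataset's ds_remap_deadlist. When the last
 * snapshot referencing it is destroyed, it moves to the pool's
 * dp_obsolete_bpobj, and a later spa sync appends it to the indirect
 * vdev's vic_obsolete_sm_object. The next condense of that vdev folds
 * it into the vimp_counts_object, at which point a completely-obsolete
 * mapping entry can be dropped.
 */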

/*
 * "Big theory statement" for how we condense indirect vdevs.
 *
 * Condensing an indirect vdev's mapping is the process of determining
 * the precise counts of obsolete space for each mapping entry (by
 * integrating the obsolete spacemap into the obsolete counts) and
 * writing out a new mapping that contains only referenced entries.
 *
 * We condense a vdev when we expect the mapping to shrink (see
 * vdev_indirect_should_condense()), but only perform one condense at a
 * time to limit the memory usage. In addition, we use a separate
 * open-context thread (spa_condense_indirect_thread) to incrementally
 * create the new mapping object in a way that minimizes the impact on
 * the rest of the system.
 *
 * == Generating a new mapping ==
 *
 * To generate a new mapping, we follow these steps:
 *
 * 1. Save the old obsolete space map and create a new mapping object
 *    (see spa_condense_indirect_start_sync()). This initializes the
 *    spa_condensing_indirect_phys with the "previous obsolete space map",
 *    which is now read only. Newly obsolete DVAs will be added to a
 *    new (initially empty) obsolete space map, and will not be
 *    considered as part of this condense operation.
 *
 * 2. Construct in memory the precise counts of obsolete space for each
 *    mapping entry, by incorporating the obsolete space map into the
 *    counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
 *
 * 3. Iterate through each mapping entry, writing to the new mapping any
 *    entries that are not completely obsolete (i.e. which don't have
 *    obsolete count == mapping length). (See
 *    spa_condense_indirect_generate_new_mapping().)
 *
 * 4. Destroy the old mapping object and switch over to the new one
 *    (spa_condense_indirect_complete_sync).
 *
 * == Restarting from failure ==
 *
 * To restart the condense when we import/open the pool, we must start
 * at the 2nd step above: reconstruct the precise counts in memory,
 * based on the space map + counts. Then in the 3rd step, we start
 * iterating where we left off: at vimp_max_offset of the new mapping
 * object.
 */

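/*
 * Condensing of indirect vdev mappings can be disabled entirely. When
 * this is B_FALSE, vdev_indirect_should_condense() always returns
 * B_FALSE and existing mappings are left as they are.
 */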
boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE;

/*
 * Condense if at least this percent of the bytes in the mapping is
 * obsolete. With the default of 25%, the amount of space mapped
 * will be reduced to 1% of its original size after at most 16
 * condenses. Higher values will condense less often (causing less
 * i/o); lower values will reduce the mapping size more quickly.
 */
int zfs_indirect_condense_obsolete_pct = 25;

/*
 * Condense if the obsolete space map takes up more than this amount of
 * space on disk (logically). This limits the amount of disk space
 * consumed by the obsolete space map; the default of 1GB is small enough
 * that we typically don't mind "wasting" it.
 */
uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;

/*
 * Don't bother condensing if the mapping uses less than this amount of
 * memory. The default of 128KB is considered a "trivial" amount of
 * memory and not worth reducing.
 */
uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;

/*
 * This is used by the test suite so that it can ensure that certain
 * actions happen while in the middle of a condense (which might otherwise
 * complete too quickly). If used to reduce the performance impact of
 * condensing in production, a maximum value of 1 should be sufficient.
 */
int zfs_condense_indirect_commit_entry_delay_ticks = 0;

/*
 * Mark the given offset and size as being obsolete in the given txg.
 */
void
vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size,
    uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	ASSERT3U(spa_syncing_txg(spa), ==, txg);
	ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
	ASSERT(size > 0);
	VERIFY(vdev_indirect_mapping_entry_for_offset(
	    vd->vdev_indirect_mapping, offset) != NULL);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
		mutex_enter(&vd->vdev_obsolete_lock);
		range_tree_add(vd->vdev_obsolete_segments, offset, size);
		mutex_exit(&vd->vdev_obsolete_lock);
		vdev_dirty(vd, 0, NULL, txg);
	}
}

/*
 * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This
 * wrapper is provided because the DMU does not know about vdev_t's and
 * cannot directly call vdev_indirect_mark_obsolete.
 */
void
spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
    uint64_t size, dmu_tx_t *tx)
{
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	ASSERT(dmu_tx_is_syncing(tx));

	/* The DMU can only remap indirect vdevs. */
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	vdev_indirect_mark_obsolete(vd, offset, size, dmu_tx_get_txg(tx));
}

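/*
 * Allocate the in-core state used to track an in-progress condense, and
 * open the new mapping object (whose object number is recorded in
 * spa_condensing_indirect_phys) that we will be appending to.
 */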
static spa_condensing_indirect_t *
spa_condensing_indirect_create(spa_t *spa)
{
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP);
	objset_t *mos = spa->spa_meta_objset;

	for (int i = 0; i < TXG_SIZE; i++) {
		list_create(&sci->sci_new_mapping_entries[i],
		    sizeof (vdev_indirect_mapping_entry_t),
		    offsetof(vdev_indirect_mapping_entry_t, vime_node));
	}

	sci->sci_new_mapping =
	    vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object);

	return (sci);
}

static void
spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci)
{
	for (int i = 0; i < TXG_SIZE; i++)
		list_destroy(&sci->sci_new_mapping_entries[i]);

	if (sci->sci_new_mapping != NULL)
		vdev_indirect_mapping_close(sci->sci_new_mapping);

	kmem_free(sci, sizeof (*sci));
}

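/*
 * Decide whether this vdev's mapping is worth condensing now. The policy
 * is controlled by the zfs_condense_* and zfs_indirect_condense_*
 * tunables above; in addition, we never condense while another condense
 * is in progress, while the pool is shutting down, or while the vdev is
 * still being removed (its mapping can still change in that case).
 */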
boolean_t
vdev_indirect_should_condense(vdev_t *vd)
{
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	spa_t *spa = vd->vdev_spa;

	ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));

	if (!zfs_condense_indirect_vdevs_enable)
		return (B_FALSE);

	/*
	 * We can only condense one indirect vdev at a time.
	 */
	if (spa->spa_condensing_indirect != NULL)
		return (B_FALSE);

	if (spa_shutting_down(spa))
		return (B_FALSE);

	/*
	 * The mapping object size must not change while we are
	 * condensing, so we can only condense indirect vdevs
	 * (not vdevs that are still in the middle of being removed).
	 */
	if (vd->vdev_ops != &vdev_indirect_ops)
		return (B_FALSE);

	/*
	 * If nothing new has been marked obsolete, there is no
	 * point in condensing.
	 */
	if (vd->vdev_obsolete_sm == NULL) {
		ASSERT0(vdev_obsolete_sm_object(vd));
		return (B_FALSE);
	}

	ASSERT(vd->vdev_obsolete_sm != NULL);

	ASSERT3U(vdev_obsolete_sm_object(vd), ==,
	    space_map_object(vd->vdev_obsolete_sm));

	uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim);
	uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
	uint64_t mapping_size = vdev_indirect_mapping_size(vim);
	uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);

	ASSERT3U(bytes_obsolete, <=, bytes_mapped);

	/*
	 * If a high percentage of the bytes that are mapped have become
	 * obsolete, condense (unless the mapping is already small enough).
	 * This has a good chance of reducing the amount of memory used
	 * by the mapping.
	 */
	if (bytes_obsolete * 100 / bytes_mapped >=
	    zfs_indirect_condense_obsolete_pct &&
	    mapping_size > zfs_condense_min_mapping_bytes) {
		zfs_dbgmsg("should condense vdev %llu because obsolete "
		    "spacemap covers %d%% of %lluMB mapping",
		    (u_longlong_t)vd->vdev_id,
		    (int)(bytes_obsolete * 100 / bytes_mapped),
		    (u_longlong_t)bytes_mapped / 1024 / 1024);
		return (B_TRUE);
	}

	/*
	 * If the obsolete space map takes up too much space on disk,
	 * condense in order to free up this disk space.
	 */
	if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) {
		zfs_dbgmsg("should condense vdev %llu because obsolete sm "
		    "length %lluMB >= max size %lluMB",
		    (u_longlong_t)vd->vdev_id,
		    (u_longlong_t)obsolete_sm_size / 1024 / 1024,
		    (u_longlong_t)zfs_condense_max_obsolete_bytes /
		    1024 / 1024);
		return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * This sync task completes (finishes) a condense, deleting the old
 * mapping and replacing it with the new one.
 */
static void
spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
{
	spa_condensing_indirect_t *sci = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	objset_t *mos = spa->spa_meta_objset;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
	uint64_t new_count =
	    vdev_indirect_mapping_num_entries(sci->sci_new_mapping);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT3P(sci, ==, spa->spa_condensing_indirect);
	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	}
	ASSERT(vic->vic_mapping_object != 0);
	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	ASSERT(scip->scip_next_mapping_object != 0);
	ASSERT(scip->scip_prev_obsolete_sm_object != 0);

	/*
	 * Reset vdev_indirect_mapping to refer to the new object.
	 */
	rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
	vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
	vd->vdev_indirect_mapping = sci->sci_new_mapping;
	rw_exit(&vd->vdev_indirect_rwlock);

	sci->sci_new_mapping = NULL;
	vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
	vic->vic_mapping_object = scip->scip_next_mapping_object;
	scip->scip_next_mapping_object = 0;

	space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
	spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	scip->scip_prev_obsolete_sm_object = 0;

	scip->scip_vdev = 0;

	VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, tx));
	spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
	spa->spa_condensing_indirect = NULL;

	zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
	    "new mapping object %llu has %llu entries "
	    "(was %llu entries)",
	    vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object,
	    new_count, old_count);

	vdev_config_dirty(spa->spa_root_vdev);
}

/*
 * This sync task appends entries to the new mapping object.
 */
static void
spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
{
	spa_condensing_indirect_t *sci = arg;
	uint64_t txg = dmu_tx_get_txg(tx);
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(sci, ==, spa->spa_condensing_indirect);

	vdev_indirect_mapping_add_entries(sci->sci_new_mapping,
	    &sci->sci_new_mapping_entries[txg & TXG_MASK], tx);
	ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK]));
}

/*
 * Open-context function to add one entry to the new mapping. The new
 * entry will be remembered and written from syncing context.
 */
static void
spa_condense_indirect_commit_entry(spa_t *spa,
    vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
{
	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;

	ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;

	/*
	 * If we are the first entry committed this txg, kick off the sync
	 * task to write to the MOS on our behalf.
	 */
	if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
		dsl_sync_task_nowait(dmu_tx_pool(tx),
		    spa_condense_indirect_commit_sync, sci,
		    0, ZFS_SPACE_CHECK_NONE, tx);
	}

	vdev_indirect_mapping_entry_t *vime =
	    kmem_alloc(sizeof (*vime), KM_SLEEP);
	vime->vime_mapping = *vimep;
	vime->vime_obsolete_count = count;
	list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);

	dmu_tx_commit(tx);
}

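/*
 * Walk the old mapping starting at start_index, committing to the new
 * mapping every entry that is not yet completely obsolete according to
 * the obsolete_counts computed by the caller. Stops early (leaving the
 * condense to be resumed later) if the zthr is cancelled.
 */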
static void
spa_condense_indirect_generate_new_mapping(vdev_t *vd,
    uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
{
	spa_t *spa = vd->vdev_spa;
	uint64_t mapi = start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	uint64_t old_num_entries =
	    vdev_indirect_mapping_num_entries(old_mapping);

	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);

	zfs_dbgmsg("starting condense of vdev %llu from index %llu",
	    (u_longlong_t)vd->vdev_id,
	    (u_longlong_t)mapi);

	while (mapi < old_num_entries) {

		if (zthr_iscancelled(zthr)) {
			zfs_dbgmsg("pausing condense of vdev %llu "
			    "at index %llu", (u_longlong_t)vd->vdev_id,
			    (u_longlong_t)mapi);
			break;
		}

		vdev_indirect_mapping_entry_phys_t *entry =
		    &old_mapping->vim_entries[mapi];
		uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
		ASSERT3U(obsolete_counts[mapi], <=, entry_size);
		if (obsolete_counts[mapi] < entry_size) {
			spa_condense_indirect_commit_entry(spa, entry,
			    obsolete_counts[mapi]);

			/*
			 * This delay may be requested for testing, debugging,
			 * or performance reasons.
			 */
			delay(zfs_condense_indirect_commit_entry_delay_ticks);
		}

		mapi++;
	}
}

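/*
 * zthr callbacks for the condensing thread: _thread_check() tells the
 * zthr framework whether there is work to do (i.e. a condense is in
 * progress), and _thread() performs the open-context portion of the
 * condense described in the "big theory statement" above.
 */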
/* ARGSUSED */
static boolean_t
spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
{
	spa_t *spa = arg;

	return (spa->spa_condensing_indirect != NULL);
}

/* ARGSUSED */
static int
spa_condense_indirect_thread(void *arg, zthr_t *zthr)
{
	spa_t *spa = arg;
	vdev_t *vd;

	ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
	ASSERT3P(vd, !=, NULL);
	spa_config_exit(spa, SCL_VDEV, FTAG);

	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	uint32_t *counts;
	uint64_t start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	space_map_t *prev_obsolete_sm = NULL;

	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	ASSERT(scip->scip_next_mapping_object != 0);
	ASSERT(scip->scip_prev_obsolete_sm_object != 0);
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);

	for (int i = 0; i < TXG_SIZE; i++) {
		/*
		 * The list must start out empty in order for the
		 * _commit_sync() sync task to be properly registered
		 * on the first call to _commit_entry(); so it's wise
		 * to double check and ensure we actually are starting
		 * with empty lists.
		 */
		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	}

	VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
	    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
	space_map_update(prev_obsolete_sm);
	counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
	if (prev_obsolete_sm != NULL) {
		vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
		    counts, prev_obsolete_sm);
	}
	space_map_close(prev_obsolete_sm);

	/*
	 * Generate new mapping. Determine what index to continue from
	 * based on the max offset that we've already written in the
	 * new mapping.
	 */
	uint64_t max_offset =
	    vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
	if (max_offset == 0) {
		/* We haven't written anything to the new mapping yet. */
		start_index = 0;
	} else {
		/*
		 * Pick up from where we left off. _entry_for_offset()
		 * returns a pointer into the vim_entries array. If
		 * max_offset is greater than any of the mappings
		 * contained in the table NULL will be returned and
		 * that indicates we've exhausted our iteration of the
		 * old_mapping.
		 */

		vdev_indirect_mapping_entry_phys_t *entry =
		    vdev_indirect_mapping_entry_for_offset_or_next(old_mapping,
		    max_offset);

		if (entry == NULL) {
			/*
			 * We've already written the whole new mapping.
			 * This special value will cause us to skip the
			 * generate_new_mapping step and just do the sync
			 * task to complete the condense.
			 */
			start_index = UINT64_MAX;
		} else {
			start_index = entry - old_mapping->vim_entries;
			ASSERT3U(start_index, <,
			    vdev_indirect_mapping_num_entries(old_mapping));
		}
	}

	spa_condense_indirect_generate_new_mapping(vd, counts,
	    start_index, zthr);

	vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);

	/*
	 * If the zthr has received a cancellation signal while running
	 * in generate_new_mapping() or at any point after that, then bail
	 * early. We don't want to complete the condense if the spa is
	 * shutting down.
	 */
	if (zthr_iscancelled(zthr))
		return (0);

	VERIFY0(dsl_sync_task(spa_name(spa), NULL,
	    spa_condense_indirect_complete_sync, sci, 0, ZFS_SPACE_CHECK_NONE));

	return (0);
}

/*
 * Sync task to begin the condensing process.
 */
void
spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;

	ASSERT0(scip->scip_next_mapping_object);
	ASSERT0(scip->scip_prev_obsolete_sm_object);
	ASSERT0(scip->scip_vdev);
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS));
	ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping));

	uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd);
	ASSERT(obsolete_sm_obj != 0);

	scip->scip_vdev = vd->vdev_id;
	scip->scip_next_mapping_object =
	    vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx);

	scip->scip_prev_obsolete_sm_object = obsolete_sm_obj;

	/*
	 * We don't need to allocate a new space map object, since
	 * vdev_indirect_sync_obsolete will allocate one when needed.
	 */
	space_map_close(vd->vdev_obsolete_sm);
	vd->vdev_obsolete_sm = NULL;
	VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));

	VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
	    sizeof (*scip) / sizeof (uint64_t), scip, tx));

	ASSERT3P(spa->spa_condensing_indirect, ==, NULL);
	spa->spa_condensing_indirect = spa_condensing_indirect_create(spa);

	zfs_dbgmsg("starting condense of vdev %llu in txg %llu: "
	    "posm=%llu nm=%llu",
	    vd->vdev_id, dmu_tx_get_txg(tx),
	    (u_longlong_t)scip->scip_prev_obsolete_sm_object,
	    (u_longlong_t)scip->scip_next_mapping_object);

	zthr_wakeup(spa->spa_condense_zthr);
}

/*
 * Sync to the given vdev's obsolete space map any segments that are no longer
 * referenced as of the given txg.
 *
 * If the obsolete space map doesn't exist yet, create and open it.
 */
void
vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;

	ASSERT3U(vic->vic_mapping_object, !=, 0);
	ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0);
	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
	ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS));

	if (vdev_obsolete_sm_object(vd) == 0) {
		uint64_t obsolete_sm_object =
		    space_map_alloc(spa->spa_meta_objset, tx);

		ASSERT(vd->vdev_top_zap != 0);
		VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
		    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
		    sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx));
		ASSERT3U(vdev_obsolete_sm_object(vd), !=, 0);

		spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
		VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
		    spa->spa_meta_objset, obsolete_sm_object,
		    0, vd->vdev_asize, 0));
		space_map_update(vd->vdev_obsolete_sm);
	}

	ASSERT(vd->vdev_obsolete_sm != NULL);
	ASSERT3U(vdev_obsolete_sm_object(vd), ==,
	    space_map_object(vd->vdev_obsolete_sm));

	space_map_write(vd->vdev_obsolete_sm,
	    vd->vdev_obsolete_segments, SM_ALLOC, tx);
	space_map_update(vd->vdev_obsolete_sm);
	range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
}

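/*
 * Called when the pool is imported/opened to load any condense that was
 * in progress (the DMU_POOL_CONDENSING_INDIRECT entry in the MOS
 * directory), so that the condensing zthr can resume it.
 * spa_condense_fini() releases the in-core state when the pool is
 * unloaded.
 */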
int
spa_condense_init(spa_t *spa)
{
	int error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
	    sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t),
	    &spa->spa_condensing_indirect_phys);
	if (error == 0) {
		if (spa_writeable(spa)) {
			spa->spa_condensing_indirect =
			    spa_condensing_indirect_create(spa);
		}
		return (0);
	} else if (error == ENOENT) {
		return (0);
	} else {
		return (error);
	}
}

void
spa_condense_fini(spa_t *spa)
{
	if (spa->spa_condensing_indirect != NULL) {
		spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
		spa->spa_condensing_indirect = NULL;
	}
}

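/*
 * Create the zthr that drives condensing. The thread only has work to do
 * while spa_condensing_indirect is non-NULL (see
 * spa_condense_indirect_thread_check()).
 */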
void
spa_start_indirect_condensing_thread(spa_t *spa)
{
	ASSERT3P(spa->spa_condense_zthr, ==, NULL);
	spa->spa_condense_zthr = zthr_create(spa_condense_indirect_thread_check,
	    spa_condense_indirect_thread, spa);
}

/*
 * Gets the obsolete spacemap object from the vdev's ZAP.
 * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't
 * exist yet.
 */
uint64_t
vdev_obsolete_sm_object(vdev_t *vd)
{
	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
	if (vd->vdev_top_zap == 0) {
		return (0);
	}

	uint64_t sm_obj = 0;
	int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (sm_obj), 1, &sm_obj);

	ASSERT(err == 0 || err == ENOENT);

	return (sm_obj);
}

boolean_t
vdev_obsolete_counts_are_precise(vdev_t *vd)
{
	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
	if (vd->vdev_top_zap == 0) {
		return (B_FALSE);
	}

	uint64_t val = 0;
	int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val);

	ASSERT(err == 0 || err == ENOENT);

	return (val != 0);
}

/* ARGSUSED */
static void
vdev_indirect_close(vdev_t *vd)
{
}

/* ARGSUSED */
static void
vdev_indirect_io_done(zio_t *zio)
{
}

/* ARGSUSED */
static int
vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	*psize = *max_psize = vd->vdev_asize +
	    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
	*ashift = vd->vdev_ashift;
	return (0);
}

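/*
 * A remap_segment_t describes a portion of the original extent that
 * still needs to be remapped: the vdev it currently maps to (which may
 * itself be indirect), the offset and size on that vdev, and the offset
 * of this portion within the original extent (rs_split_offset).
 * vdev_indirect_remap() keeps a stack of these while walking through
 * layered mappings; rs_alloc() is its constructor.
 */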
typedef struct remap_segment {
	vdev_t *rs_vd;
	uint64_t rs_offset;
	uint64_t rs_asize;
	uint64_t rs_split_offset;
	list_node_t rs_node;
} remap_segment_t;

remap_segment_t *
rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
{
	remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP);
	rs->rs_vd = vd;
	rs->rs_offset = offset;
	rs->rs_asize = asize;
	rs->rs_split_offset = split_offset;
	return (rs);
}

/*
 * Given an indirect vdev and an extent on that vdev, it duplicates the
 * physical entries of the indirect mapping that correspond to the extent
 * to a new array and returns a pointer to it. In addition, copied_entries
 * is populated with the number of mapping entries that were duplicated.
 *
 * Note that the function assumes that the caller holds vdev_indirect_rwlock.
 * This ensures that the mapping won't change due to condensing as we
 * copy over its contents.
 *
 * Finally, since we are doing an allocation, it is up to the caller to
 * free the array allocated in this function.
 */
vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
    uint64_t asize, uint64_t *copied_entries)
{
	vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	uint64_t entries = 0;

	ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock));

	vdev_indirect_mapping_entry_phys_t *first_mapping =
	    vdev_indirect_mapping_entry_for_offset(vim, offset);
	ASSERT3P(first_mapping, !=, NULL);

	vdev_indirect_mapping_entry_phys_t *m = first_mapping;
	while (asize > 0) {
		uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);

		ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m));
		ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size);

		uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
		uint64_t inner_size = MIN(asize, size - inner_offset);

		offset += inner_size;
		asize -= inner_size;
		entries++;
		m++;
	}

	size_t copy_length = entries * sizeof (*first_mapping);
	duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP);
	bcopy(first_mapping, duplicate_mappings, copy_length);
	*copied_entries = entries;

	return (duplicate_mappings);
}

/*
 * Goes through the relevant indirect mappings until it hits a concrete vdev
 * and issues the callback. On the way to the concrete vdev, if any other
 * indirect vdevs are encountered, then the callback will also be called on
 * each of those indirect vdevs. For example, if the segment is mapped to
 * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is
 * mapped to segment B on concrete vdev 2, then the callback will be called on
 * both vdev 1 and vdev 2.
 *
 * While the callback passed to vdev_indirect_remap() is called on every vdev
 * the function encounters, certain callbacks only care about concrete vdevs.
 * These types of callbacks should return immediately and explicitly when they
 * are called on an indirect vdev.
 *
 * Because there is a possibility that a DVA section in the indirect device
 * has been split into multiple sections in our mapping, we keep track
 * of the relevant contiguous segments of the new location (remap_segment_t)
 * in a stack. This way we can call the callback for each of the new sections
 * created by a single section of the indirect device. Note though, that in
 * this scenario the callbacks in each split block won't occur in-order in
 * terms of offset, so callers should not make any assumptions about that.
 *
 * For callbacks that don't handle split blocks and immediately return when
 * they encounter them (as is the case for remap_blkptr_cb), the caller can
 * assume that its callback will be applied from the first indirect vdev
 * encountered to the last one and then the concrete vdev, in that order.
 */
static void
vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
    void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg)
{
	list_t stack;
	spa_t *spa = vd->vdev_spa;

	list_create(&stack, sizeof (remap_segment_t),
	    offsetof(remap_segment_t, rs_node));

	for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
	    rs != NULL; rs = list_remove_head(&stack)) {
		vdev_t *v = rs->rs_vd;
		uint64_t num_entries = 0;

		ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
		ASSERT(rs->rs_asize > 0);

		/*
		 * Note: As this function can be called from open context
		 * (e.g. zio_read()), we need the following rwlock to
		 * prevent the mapping from being changed by condensing.
		 *
		 * So we grab the lock and we make a copy of the entries
		 * that are relevant to the extent that we are working on.
		 * Once that is done, we drop the lock and iterate over
		 * our copy of the mapping. Once we are done with the
		 * remap segment and we free it, we also free our copy
		 * of the indirect mapping entries that are relevant to it.
		 *
		 * This way we don't need to wait until the function is
		 * finished with a segment, to condense it. In addition, we
		 * don't need a recursive rwlock for the case that a call to
		 * vdev_indirect_remap() needs to call itself (through the
		 * codepath of its callback) for the same vdev in the middle
		 * of its execution.
		 */
		rw_enter(&v->vdev_indirect_rwlock, RW_READER);
		vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping;
		ASSERT3P(vim, !=, NULL);

		vdev_indirect_mapping_entry_phys_t *mapping =
		    vdev_indirect_mapping_duplicate_adjacent_entries(v,
		    rs->rs_offset, rs->rs_asize, &num_entries);
		ASSERT3P(mapping, !=, NULL);
		ASSERT3U(num_entries, >, 0);
		rw_exit(&v->vdev_indirect_rwlock);

		for (uint64_t i = 0; i < num_entries; i++) {
			/*
			 * Note: the vdev_indirect_mapping can not change
			 * while we are running. It only changes while the
			 * removal is in progress, and then only from syncing
			 * context. While a removal is in progress, this
			 * function is only called for frees, which also only
			 * happen from syncing context.
			 */
			vdev_indirect_mapping_entry_phys_t *m = &mapping[i];

			ASSERT3P(m, !=, NULL);
			ASSERT3U(rs->rs_asize, >, 0);

			uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
			uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
			uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);

			ASSERT3U(rs->rs_offset, >=,
			    DVA_MAPPING_GET_SRC_OFFSET(m));
			ASSERT3U(rs->rs_offset, <,
			    DVA_MAPPING_GET_SRC_OFFSET(m) + size);
			ASSERT3U(dst_vdev, !=, v->vdev_id);

			uint64_t inner_offset = rs->rs_offset -
			    DVA_MAPPING_GET_SRC_OFFSET(m);
			uint64_t inner_size =
			    MIN(rs->rs_asize, size - inner_offset);

			vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
			ASSERT3P(dst_v, !=, NULL);

			if (dst_v->vdev_ops == &vdev_indirect_ops) {
				list_insert_head(&stack,
				    rs_alloc(dst_v, dst_offset + inner_offset,
				    inner_size, rs->rs_split_offset));

			}

			if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) &&
			    IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) {
				/*
				 * Note: This clause exists solely for
				 * testing purposes. We use it to ensure that
				 * split blocks work and that the callbacks
				 * using them yield the same result if issued
				 * in reverse order.
				 */
				uint64_t inner_half = inner_size / 2;

				func(rs->rs_split_offset + inner_half, dst_v,
				    dst_offset + inner_offset + inner_half,
				    inner_half, arg);

				func(rs->rs_split_offset, dst_v,
				    dst_offset + inner_offset,
				    inner_half, arg);
			} else {
				func(rs->rs_split_offset, dst_v,
				    dst_offset + inner_offset,
				    inner_size, arg);
			}

			rs->rs_offset += inner_size;
			rs->rs_asize -= inner_size;
			rs->rs_split_offset += inner_size;
		}
		VERIFY0(rs->rs_asize);

		kmem_free(mapping, num_entries * sizeof (*mapping));
		kmem_free(rs, sizeof (remap_segment_t));
	}
	list_destroy(&stack);
}

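/*
 * I/O path for an indirect vdev: vdev_indirect_io_start() remaps the
 * parent zio, and for each concrete segment vdev_indirect_io_start_cb()
 * issues a child I/O against the destination vdev. Each child's done
 * callback folds its error into the parent zio and releases the abd it
 * borrowed from the parent.
 */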
static void
vdev_indirect_child_io_done(zio_t *zio)
{
	zio_t *pio = zio->io_private;

	mutex_enter(&pio->io_lock);
	pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
	mutex_exit(&pio->io_lock);

	abd_put(zio->io_abd);
}

static void
vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	zio_t *zio = arg;

	ASSERT3P(vd, !=, NULL);

	if (vd->vdev_ops == &vdev_indirect_ops)
		return;

	zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset,
	    abd_get_offset(zio->io_abd, split_offset),
	    size, zio->io_type, zio->io_priority,
	    0, vdev_indirect_child_io_done, zio));
}

static void
vdev_indirect_io_start(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
	if (zio->io_type != ZIO_TYPE_READ) {
		ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
		ASSERT((zio->io_flags &
		    (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
	}

	vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
	    vdev_indirect_io_start_cb, zio);

	zio_execute(zio);
}

vdev_ops_t vdev_indirect_ops = {
	vdev_indirect_open,
	vdev_indirect_close,
	vdev_default_asize,
	vdev_indirect_io_start,
	vdev_indirect_io_done,
	NULL,
	NULL,
	NULL,
	vdev_indirect_remap,
	VDEV_TYPE_INDIRECT,	/* name of this vdev type */
	B_FALSE			/* leaf vdev */
};