/*
 * CDDL HEADER START
 *
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/metaslab.h>
#include <sys/refcount.h>
#include <sys/dmu.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_synctask.h>
#include <sys/zap.h>

/*
 * An indirect vdev corresponds to a vdev that has been removed. Since
 * we cannot rewrite block pointers of snapshots, etc., we keep a
 * mapping from old location on the removed device to the new location
 * on another device in the pool and use this mapping whenever we need
 * to access the DVA. Unfortunately, this mapping did not respect
 * logical block boundaries when it was first created, and so a DVA on
 * this indirect vdev may be "split" into multiple sections that each
 * map to a different location. As a consequence, not all DVAs can be
 * translated to an equivalent new DVA. Instead we must provide a
 * "vdev_remap" operation that executes a callback on each contiguous
 * segment of the new location. This function is used in multiple ways:
 *
 * - reads and repair writes to this device use the callback to create
 *   a child io for each mapped segment.
 *
 * - frees and claims to this device use the callback to free or claim
 *   each mapped segment. (Note that we don't actually need to claim
 *   log blocks on indirect vdevs, because we don't allocate to
 *   removing vdevs. However, zdb uses zio_claim() for its leak
 *   detection.)
 */
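
/*
 * Editorial sketch, not part of the original source: a hypothetical
 * consumer of the "vdev_remap" operation described above. It assumes
 * only the vdev_indirect_remap() signature defined later in this file;
 * the callback name and the surrounding code are made up, and the
 * callback skips indirect vdevs so that only concrete segments are
 * counted.
 *
 *	static void
 *	count_segments_cb(uint64_t split_offset, vdev_t *vd,
 *	    uint64_t offset, uint64_t size, void *arg)
 *	{
 *		if (vd->vdev_ops != &vdev_indirect_ops)
 *			(*(uint64_t *)arg)++;
 *	}
 *
 *	uint64_t nsegs = 0;
 *	vdev_indirect_remap(vd, DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva),
 *	    count_segments_cb, &nsegs);
 *
 * A DVA that was "split" by the mapping yields nsegs > 1.
 */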

/*
 * "Big theory statement" for how we mark blocks obsolete.
 *
 * When a block on an indirect vdev is freed or remapped, a section of
 * that vdev's mapping may no longer be referenced (aka "obsolete"). We
 * keep track of how much of each mapping entry is obsolete. When
 * an entry becomes completely obsolete, we can remove it, thus reducing
 * the memory used by the mapping. The complete picture of obsolescence
 * is given by the following data structures, described below:
 * - the entry-specific obsolete count
 * - the vdev-specific obsolete spacemap
 * - the pool-specific obsolete bpobj
 *
 * == On disk data structures used ==
 *
 * We track the obsolete space for the pool using several objects. Each
 * of these objects is created on demand and freed when no longer
 * needed, and is assumed to be empty if it does not exist.
 * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
 *
 * - Each vic_mapping_object (associated with an indirect vdev) can
 *   have a vimp_counts_object. This is an array of uint32_t's
 *   with the same number of entries as the vic_mapping_object. When
 *   the mapping is condensed, entries from the vic_obsolete_sm_object
 *   (see below) are folded into the counts. Therefore, each
 *   obsolete_counts entry tells us the number of bytes in the
 *   corresponding mapping entry that were not referenced when the
 *   mapping was last condensed.
 *
 * - Each indirect or removing vdev can have a vic_obsolete_sm_object.
 *   This is a space map containing an alloc entry for every DVA that
 *   has been obsoleted since the last time this indirect vdev was
 *   condensed. We use this object in order to improve performance
 *   when marking a DVA as obsolete. Instead of modifying an arbitrary
 *   offset of the vimp_counts_object, we only need to append an entry
 *   to the end of this object. When a DVA becomes obsolete, it is
 *   added to the obsolete space map. This happens when the DVA is
 *   freed, remapped and not referenced by a snapshot, or the last
 *   snapshot referencing it is destroyed.
 *
 * - Each dataset can have a ds_remap_deadlist object. This is a
 *   deadlist object containing all blocks that were remapped in this
 *   dataset but referenced in a previous snapshot. Blocks can *only*
 *   appear on this list if they were remapped (dsl_dataset_block_remapped);
 *   blocks that were killed in a head dataset are put on the normal
 *   ds_deadlist and marked obsolete when they are freed.
 *
 * - The pool can have a dp_obsolete_bpobj. This is a list of blocks
 *   in the pool that need to be marked obsolete. When a snapshot is
 *   destroyed, we move some of the ds_remap_deadlist to the obsolete
 *   bpobj (see dsl_destroy_snapshot_handle_remaps()). We then
 *   asynchronously process the obsolete bpobj, moving its entries to
 *   the specific vdevs' obsolete space maps.
 *
 * == Summary of how we mark blocks as obsolete ==
 *
 * - When freeing a block: if any DVA is on an indirect vdev, append to
 *   vic_obsolete_sm_object.
 * - When remapping a block, add dva to ds_remap_deadlist (if the prev
 *   snap references it; otherwise append to vic_obsolete_sm_object).
 * - When freeing a snapshot: move parts of ds_remap_deadlist to
 *   dp_obsolete_bpobj (same algorithm as ds_deadlist).
 * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
 *   individual vdev's vic_obsolete_sm_object.
 */

/*
 * "Big theory statement" for how we condense indirect vdevs.
 *
 * Condensing an indirect vdev's mapping is the process of determining
 * the precise counts of obsolete space for each mapping entry (by
 * integrating the obsolete spacemap into the obsolete counts) and
 * writing out a new mapping that contains only referenced entries.
 *
 * We condense a vdev when we expect the mapping to shrink (see
 * vdev_indirect_should_condense()), but only perform one condense at a
 * time to limit the memory usage. In addition, we use a separate
 * open-context thread (spa_condense_indirect_thread) to incrementally
 * create the new mapping object in a way that minimizes the impact on
 * the rest of the system.
 *
 * == Generating a new mapping ==
 *
 * To generate a new mapping, we follow these steps:
 *
 * 1. Save the old obsolete space map and create a new mapping object
 *    (see spa_condense_indirect_start_sync()). This initializes the
 *    spa_condensing_indirect_phys with the "previous obsolete space map",
 *    which is now read only. Newly obsolete DVAs will be added to a
 *    new (initially empty) obsolete space map, and will not be
 *    considered as part of this condense operation.
 *
 * 2. Construct in memory the precise counts of obsolete space for each
 *    mapping entry, by incorporating the obsolete space map into the
 *    counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
 *
 * 3. Iterate through each mapping entry, writing to the new mapping any
 *    entries that are not completely obsolete (i.e. which don't have
 *    obsolete count == mapping length). (See
 *    spa_condense_indirect_generate_new_mapping().)
 *
 * 4. Destroy the old mapping object and switch over to the new one
 *    (spa_condense_indirect_complete_sync).
 *
 * == Restarting from failure ==
 *
 * To restart the condense when we import/open the pool, we must start
 * at the 2nd step above: reconstruct the precise counts in memory,
 * based on the space map + counts. Then in the 3rd step, we start
 * iterating where we left off: at vimp_max_offset of the new mapping
 * object.
 */

boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE;

/*
 * Condense if at least this percent of the bytes in the mapping is
 * obsolete. With the default of 25%, the amount of space mapped
 * will be reduced to 1% of its original size after at most 16
 * condenses. Higher values will condense less often (causing less
 * i/o); lower values will reduce the mapping size more quickly.
 */
int zfs_indirect_condense_obsolete_pct = 25;

/*
 * Condense if the obsolete space map takes up more than this amount of
 * space on disk (logically). This limits the amount of disk space
 * consumed by the obsolete space map; the default of 1GB is small enough
 * that we typically don't mind "wasting" it.
 */
uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;

/*
 * Don't bother condensing if the mapping uses less than this amount of
 * memory. The default of 128KB is considered a "trivial" amount of
 * memory and not worth reducing.
 */
uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;

/*
 * This is used by the test suite so that it can ensure that certain
 * actions happen while in the middle of a condense (which might otherwise
 * complete too quickly). If used to reduce the performance impact of
 * condensing in production, a maximum value of 1 should be sufficient.
 */
int zfs_condense_indirect_commit_entry_delay_ticks = 0;
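
/*
 * Editorial note, not part of the original source: the "1% after at
 * most 16 condenses" figure in the comment above follows from the
 * default zfs_indirect_condense_obsolete_pct of 25. Each condense
 * removes at least 25% of the mapped bytes, so after n condenses at
 * most 0.75^n of the original mapping remains, and 0.75^16 is roughly
 * 0.01, i.e. about 1%.
 */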

/*
 * Mark the given offset and size as being obsolete in the given txg.
 */
void
vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size,
    uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	ASSERT3U(spa_syncing_txg(spa), ==, txg);
	ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
	ASSERT(size > 0);
	VERIFY(vdev_indirect_mapping_entry_for_offset(
	    vd->vdev_indirect_mapping, offset) != NULL);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
		mutex_enter(&vd->vdev_obsolete_lock);
		range_tree_add(vd->vdev_obsolete_segments, offset, size);
		mutex_exit(&vd->vdev_obsolete_lock);
		vdev_dirty(vd, 0, NULL, txg);
	}
}

/*
 * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This
 * wrapper is provided because the DMU does not know about vdev_t's and
 * cannot directly call vdev_indirect_mark_obsolete.
 */
void
spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
    uint64_t size, dmu_tx_t *tx)
{
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	ASSERT(dmu_tx_is_syncing(tx));

	/* The DMU can only remap indirect vdevs. */
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	vdev_indirect_mark_obsolete(vd, offset, size, dmu_tx_get_txg(tx));
}

static spa_condensing_indirect_t *
spa_condensing_indirect_create(spa_t *spa)
{
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP);
	objset_t *mos = spa->spa_meta_objset;

	for (int i = 0; i < TXG_SIZE; i++) {
		list_create(&sci->sci_new_mapping_entries[i],
		    sizeof (vdev_indirect_mapping_entry_t),
		    offsetof(vdev_indirect_mapping_entry_t, vime_node));
	}

	sci->sci_new_mapping =
	    vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object);

	return (sci);
}

static void
spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci)
{
	for (int i = 0; i < TXG_SIZE; i++)
		list_destroy(&sci->sci_new_mapping_entries[i]);

	if (sci->sci_new_mapping != NULL)
		vdev_indirect_mapping_close(sci->sci_new_mapping);

	kmem_free(sci, sizeof (*sci));
}

boolean_t
vdev_indirect_should_condense(vdev_t *vd)
{
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	spa_t *spa = vd->vdev_spa;

	ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));

	if (!zfs_condense_indirect_vdevs_enable)
		return (B_FALSE);

	/*
	 * We can only condense one indirect vdev at a time.
	 */
	if (spa->spa_condensing_indirect != NULL)
		return (B_FALSE);

	if (spa_shutting_down(spa))
		return (B_FALSE);

	/*
	 * The mapping object size must not change while we are
	 * condensing, so we can only condense indirect vdevs
	 * (not vdevs that are still in the middle of being removed).
	 */
	if (vd->vdev_ops != &vdev_indirect_ops)
		return (B_FALSE);

	/*
	 * If nothing new has been marked obsolete, there is no
	 * point in condensing.
	 */
	if (vd->vdev_obsolete_sm == NULL) {
		ASSERT0(vdev_obsolete_sm_object(vd));
		return (B_FALSE);
	}

	ASSERT(vd->vdev_obsolete_sm != NULL);

	ASSERT3U(vdev_obsolete_sm_object(vd), ==,
	    space_map_object(vd->vdev_obsolete_sm));

	uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim);
	uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
	uint64_t mapping_size = vdev_indirect_mapping_size(vim);
	uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);

	ASSERT3U(bytes_obsolete, <=, bytes_mapped);

	/*
	 * If a high percentage of the bytes that are mapped have become
	 * obsolete, condense (unless the mapping is already small enough).
	 * This has a good chance of reducing the amount of memory used
	 * by the mapping.
	 */
	if (bytes_obsolete * 100 / bytes_mapped >=
	    zfs_indirect_condense_obsolete_pct &&
	    mapping_size > zfs_condense_min_mapping_bytes) {
		zfs_dbgmsg("should condense vdev %llu because obsolete "
		    "spacemap covers %d%% of %lluMB mapping",
		    (u_longlong_t)vd->vdev_id,
		    (int)(bytes_obsolete * 100 / bytes_mapped),
		    (u_longlong_t)bytes_mapped / 1024 / 1024);
		return (B_TRUE);
	}

	/*
	 * If the obsolete space map takes up too much space on disk,
	 * condense in order to free up this disk space.
	 */
	if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) {
		zfs_dbgmsg("should condense vdev %llu because obsolete sm "
		    "length %lluMB >= max size %lluMB",
		    (u_longlong_t)vd->vdev_id,
		    (u_longlong_t)obsolete_sm_size / 1024 / 1024,
		    (u_longlong_t)zfs_condense_max_obsolete_bytes /
		    1024 / 1024);
		return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * This sync task completes (finishes) a condense, deleting the old
 * mapping and replacing it with the new one.
 */
static void
spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
{
	spa_condensing_indirect_t *sci = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	objset_t *mos = spa->spa_meta_objset;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
	uint64_t new_count =
	    vdev_indirect_mapping_num_entries(sci->sci_new_mapping);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT3P(sci, ==, spa->spa_condensing_indirect);
	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	}
	ASSERT(vic->vic_mapping_object != 0);
	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	ASSERT(scip->scip_next_mapping_object != 0);
	ASSERT(scip->scip_prev_obsolete_sm_object != 0);

	/*
	 * Reset vdev_indirect_mapping to refer to the new object.
	 */
	rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
	vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
	vd->vdev_indirect_mapping = sci->sci_new_mapping;
	rw_exit(&vd->vdev_indirect_rwlock);

	sci->sci_new_mapping = NULL;
	vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
	vic->vic_mapping_object = scip->scip_next_mapping_object;
	scip->scip_next_mapping_object = 0;

	space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
	spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	scip->scip_prev_obsolete_sm_object = 0;

	scip->scip_vdev = 0;

	VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, tx));
	spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
	spa->spa_condensing_indirect = NULL;

	zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
	    "new mapping object %llu has %llu entries "
	    "(was %llu entries)",
	    vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object,
	    new_count, old_count);

	vdev_config_dirty(spa->spa_root_vdev);
}
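
/*
 * Editorial note, not part of the original source: the two functions
 * below implement the open-context/syncing-context handoff for new
 * mapping entries. spa_condense_indirect_commit_entry() runs in open
 * context, assigns its own tx against the MOS, and appends the entry
 * to the per-txg list sci_new_mapping_entries[txg & TXG_MASK]; the
 * first entry committed in a txg registers
 * spa_condense_indirect_commit_sync(), which later flushes that txg's
 * list to the new mapping object from syncing context.
 */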

/*
 * This sync task appends entries to the new mapping object.
 */
static void
spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
{
	spa_condensing_indirect_t *sci = arg;
	uint64_t txg = dmu_tx_get_txg(tx);
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(sci, ==, spa->spa_condensing_indirect);

	vdev_indirect_mapping_add_entries(sci->sci_new_mapping,
	    &sci->sci_new_mapping_entries[txg & TXG_MASK], tx);
	ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK]));
}

/*
 * Open-context function to add one entry to the new mapping. The new
 * entry will be remembered and written from syncing context.
 */
static void
spa_condense_indirect_commit_entry(spa_t *spa,
    vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
{
	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;

	ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;

	/*
	 * If this is the first entry committed this txg, kick off the sync
	 * task to write to the MOS on our behalf.
	 */
	if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
		dsl_sync_task_nowait(dmu_tx_pool(tx),
		    spa_condense_indirect_commit_sync, sci,
		    0, ZFS_SPACE_CHECK_NONE, tx);
	}

	vdev_indirect_mapping_entry_t *vime =
	    kmem_alloc(sizeof (*vime), KM_SLEEP);
	vime->vime_mapping = *vimep;
	vime->vime_obsolete_count = count;
	list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);

	dmu_tx_commit(tx);
}

static void
spa_condense_indirect_generate_new_mapping(vdev_t *vd,
    uint32_t *obsolete_counts, uint64_t start_index)
{
	spa_t *spa = vd->vdev_spa;
	uint64_t mapi = start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	uint64_t old_num_entries =
	    vdev_indirect_mapping_num_entries(old_mapping);

	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);

	zfs_dbgmsg("starting condense of vdev %llu from index %llu",
	    (u_longlong_t)vd->vdev_id,
	    (u_longlong_t)mapi);

	while (mapi < old_num_entries && !spa_shutting_down(spa)) {
		vdev_indirect_mapping_entry_phys_t *entry =
		    &old_mapping->vim_entries[mapi];
		uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
		ASSERT3U(obsolete_counts[mapi], <=, entry_size);
		if (obsolete_counts[mapi] < entry_size) {
			spa_condense_indirect_commit_entry(spa, entry,
			    obsolete_counts[mapi]);

			/*
			 * This delay may be requested for testing, debugging,
			 * or performance reasons.
			 */
			delay(zfs_condense_indirect_commit_entry_delay_ticks);
		}

		mapi++;
	}
	if (spa_shutting_down(spa)) {
		zfs_dbgmsg("pausing condense of vdev %llu at index %llu",
		    (u_longlong_t)vd->vdev_id,
		    (u_longlong_t)mapi);
	}
}

static void
spa_condense_indirect_thread(void *arg)
{
	vdev_t *vd = arg;
	spa_t *spa = vd->vdev_spa;
	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	uint32_t *counts;
	uint64_t start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	space_map_t *prev_obsolete_sm = NULL;

	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	ASSERT(scip->scip_next_mapping_object != 0);
	ASSERT(scip->scip_prev_obsolete_sm_object != 0);
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);

	for (int i = 0; i < TXG_SIZE; i++) {
		/*
		 * The list must start out empty in order for the
		 * _commit_sync() sync task to be properly registered
		 * on the first call to _commit_entry(); so it's wise
		 * to double check and ensure we actually are starting
		 * with empty lists.
		 */
		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	}

	VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
	    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
	space_map_update(prev_obsolete_sm);
	counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
	if (prev_obsolete_sm != NULL) {
		vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
		    counts, prev_obsolete_sm);
	}
	space_map_close(prev_obsolete_sm);

	/*
	 * Generate new mapping. Determine what index to continue from
	 * based on the max offset that we've already written in the
	 * new mapping.
	 */
	uint64_t max_offset =
	    vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
	if (max_offset == 0) {
		/* We haven't written anything to the new mapping yet. */
		start_index = 0;
	} else {
		/*
		 * Pick up from where we left off. _entry_for_offset()
		 * returns a pointer into the vim_entries array. If
		 * max_offset is greater than any of the mappings
		 * contained in the table, NULL will be returned and
		 * that indicates we've exhausted our iteration of the
		 * old_mapping.
		 */

		vdev_indirect_mapping_entry_phys_t *entry =
		    vdev_indirect_mapping_entry_for_offset_or_next(old_mapping,
		    max_offset);

		if (entry == NULL) {
			/*
			 * We've already written the whole new mapping.
			 * This special value will cause us to skip the
			 * generate_new_mapping step and just do the sync
			 * task to complete the condense.
			 */
			start_index = UINT64_MAX;
		} else {
			start_index = entry - old_mapping->vim_entries;
			ASSERT3U(start_index, <,
			    vdev_indirect_mapping_num_entries(old_mapping));
		}
	}

	spa_condense_indirect_generate_new_mapping(vd, counts, start_index);

	vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);

	/*
	 * We may have bailed early from generate_new_mapping(), if
	 * the spa is shutting down. In this case, do not complete
	 * the condense.
	 */
	if (!spa_shutting_down(spa)) {
		VERIFY0(dsl_sync_task(spa_name(spa), NULL,
		    spa_condense_indirect_complete_sync, sci, 0,
		    ZFS_SPACE_CHECK_NONE));
	}

	mutex_enter(&spa->spa_async_lock);
	spa->spa_condense_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
}

/*
 * Sync task to begin the condensing process.
 */
void
spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;

	ASSERT0(scip->scip_next_mapping_object);
	ASSERT0(scip->scip_prev_obsolete_sm_object);
	ASSERT0(scip->scip_vdev);
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS));
	ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping));

	uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd);
	ASSERT(obsolete_sm_obj != 0);

	scip->scip_vdev = vd->vdev_id;
	scip->scip_next_mapping_object =
	    vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx);

	scip->scip_prev_obsolete_sm_object = obsolete_sm_obj;

	/*
	 * We don't need to allocate a new space map object, since
	 * vdev_indirect_sync_obsolete will allocate one when needed.
	 */
	space_map_close(vd->vdev_obsolete_sm);
	vd->vdev_obsolete_sm = NULL;
	VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));

	VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
	    sizeof (*scip) / sizeof (uint64_t), scip, tx));

	ASSERT3P(spa->spa_condensing_indirect, ==, NULL);
	spa->spa_condensing_indirect = spa_condensing_indirect_create(spa);

	zfs_dbgmsg("starting condense of vdev %llu in txg %llu: "
	    "posm=%llu nm=%llu",
	    vd->vdev_id, dmu_tx_get_txg(tx),
	    (u_longlong_t)scip->scip_prev_obsolete_sm_object,
	    (u_longlong_t)scip->scip_next_mapping_object);

	ASSERT3P(spa->spa_condense_thread, ==, NULL);
	spa->spa_condense_thread = thread_create(NULL, 0,
	    spa_condense_indirect_thread, vd, 0, &p0, TS_RUN, minclsyspri);
}
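
/*
 * Editorial sketch, not part of the original source: a hypothetical
 * call site for spa_condense_indirect_start_sync(), from syncing
 * context. The real caller lives outside this file; this only
 * illustrates how vdev_indirect_should_condense() and the start
 * function are intended to be used together.
 *
 *	if (vdev_indirect_should_condense(vd))
 *		spa_condense_indirect_start_sync(vd, tx);
 */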

/*
 * Sync to the given vdev's obsolete space map any segments that are no longer
 * referenced as of the given txg.
 *
 * If the obsolete space map doesn't exist yet, create and open it.
 */
void
vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;

	ASSERT3U(vic->vic_mapping_object, !=, 0);
	ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0);
	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
	ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS));

	if (vdev_obsolete_sm_object(vd) == 0) {
		uint64_t obsolete_sm_object =
		    space_map_alloc(spa->spa_meta_objset, tx);

		ASSERT(vd->vdev_top_zap != 0);
		VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
		    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
		    sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx));
		ASSERT3U(vdev_obsolete_sm_object(vd), !=, 0);

		spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
		VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
		    spa->spa_meta_objset, obsolete_sm_object,
		    0, vd->vdev_asize, 0));
		space_map_update(vd->vdev_obsolete_sm);
	}

	ASSERT(vd->vdev_obsolete_sm != NULL);
	ASSERT3U(vdev_obsolete_sm_object(vd), ==,
	    space_map_object(vd->vdev_obsolete_sm));

	space_map_write(vd->vdev_obsolete_sm,
	    vd->vdev_obsolete_segments, SM_ALLOC, tx);
	space_map_update(vd->vdev_obsolete_sm);
	range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
}

int
spa_condense_init(spa_t *spa)
{
	int error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
	    sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t),
	    &spa->spa_condensing_indirect_phys);
	if (error == 0) {
		if (spa_writeable(spa)) {
			spa->spa_condensing_indirect =
			    spa_condensing_indirect_create(spa);
		}
		return (0);
	} else if (error == ENOENT) {
		return (0);
	} else {
		return (error);
	}
}

void
spa_condense_fini(spa_t *spa)
{
	if (spa->spa_condensing_indirect != NULL) {
		spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
		spa->spa_condensing_indirect = NULL;
	}
}

/*
 * Restart the condense - called when the pool is opened.
 */
void
spa_condense_indirect_restart(spa_t *spa)
{
	vdev_t *vd;
	ASSERT(spa->spa_condensing_indirect != NULL);
	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(spa,
	    spa->spa_condensing_indirect_phys.scip_vdev);
	ASSERT(vd != NULL);
	spa_config_exit(spa, SCL_VDEV, FTAG);

	ASSERT3P(spa->spa_condense_thread, ==, NULL);
	spa->spa_condense_thread = thread_create(NULL, 0,
	    spa_condense_indirect_thread, vd, 0, &p0, TS_RUN,
	    minclsyspri);
}
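
/*
 * Editorial note, not part of the original source: at import/open time
 * spa_condense_init() loads spa_condensing_indirect_phys from the MOS
 * directory, and spa_condense_indirect_restart() then relaunches the
 * condense thread, which resumes at step 2 of the "Generating a new
 * mapping" procedure described at the top of this file.
 */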

/*
 * Gets the obsolete spacemap object from the vdev's ZAP.
 * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't
 * exist yet.
 */
int
vdev_obsolete_sm_object(vdev_t *vd)
{
	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
	if (vd->vdev_top_zap == 0) {
		return (0);
	}

	uint64_t sm_obj = 0;
	int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (sm_obj), 1, &sm_obj);

	ASSERT(err == 0 || err == ENOENT);

	return (sm_obj);
}

boolean_t
vdev_obsolete_counts_are_precise(vdev_t *vd)
{
	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
	if (vd->vdev_top_zap == 0) {
		return (B_FALSE);
	}

	uint64_t val = 0;
	int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val);

	ASSERT(err == 0 || err == ENOENT);

	return (val != 0);
}

/* ARGSUSED */
static void
vdev_indirect_close(vdev_t *vd)
{
}

/* ARGSUSED */
static void
vdev_indirect_io_done(zio_t *zio)
{
}

/* ARGSUSED */
static int
vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	*psize = *max_psize = vd->vdev_asize +
	    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
	*ashift = vd->vdev_ashift;
	return (0);
}

typedef struct remap_segment {
	vdev_t *rs_vd;
	uint64_t rs_offset;
	uint64_t rs_asize;
	uint64_t rs_split_offset;
	list_node_t rs_node;
} remap_segment_t;

remap_segment_t *
rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
{
	remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP);
	rs->rs_vd = vd;
	rs->rs_offset = offset;
	rs->rs_asize = asize;
	rs->rs_split_offset = split_offset;
	return (rs);
}

/*
 * Goes through the relevant indirect mappings until it hits a concrete vdev
 * and issues the callback. On the way to the concrete vdev, if any other
 * indirect vdevs are encountered, then the callback will also be called on
 * each of those indirect vdevs. For example, if the segment is mapped to
 * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is
 * mapped to segment B on concrete vdev 2, then the callback will be called on
 * both vdev 1 and vdev 2.
 *
 * While the callback passed to vdev_indirect_remap() is called on every vdev
 * the function encounters, certain callbacks only care about concrete vdevs.
 * These types of callbacks should return immediately and explicitly when they
 * are called on an indirect vdev.
 *
 * Because there is a possibility that a DVA section in the indirect device
 * has been split into multiple sections in our mapping, we keep track
 * of the relevant contiguous segments of the new location (remap_segment_t)
 * in a stack. This way we can call the callback for each of the new sections
 * created by a single section of the indirect device. Note, though, that in
 * this scenario the callbacks in each split block won't occur in-order in
 * terms of offset, so callers should not make any assumptions about that.
 *
 * For callbacks that don't handle split blocks and immediately return when
 * they encounter them (as is the case for remap_blkptr_cb), the caller can
 * assume that its callback will be applied from the first indirect vdev
 * encountered to the last one and then the concrete vdev, in that order.
 */
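
/*
 * Editorial example, not part of the original source, using made-up
 * numbers to illustrate the traversal described above: suppose offsets
 * [0x1000, 0x3000) on indirect vdev 1 are covered by two mapping
 * entries, the first sending [0x1000, 0x2000) to concrete vdev 2 and
 * the second sending [0x2000, 0x3000) to indirect vdev 3, which in
 * turn maps that range to concrete vdev 4. A remap of offset 0x1000,
 * asize 0x2000 then invokes the callback three times: for the segment
 * on vdev 2, for the (indirect) segment on vdev 3, and for the final
 * segment on vdev 4. The split_offset argument tells the callback
 * where each piece falls within the original request.
 */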
static void
vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
    void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg)
{
	list_t stack;
	spa_t *spa = vd->vdev_spa;

	list_create(&stack, sizeof (remap_segment_t),
	    offsetof(remap_segment_t, rs_node));

	for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
	    rs != NULL; rs = list_remove_head(&stack)) {
		vdev_t *v = rs->rs_vd;

		/*
		 * Note: this can be called from open context
		 * (eg. zio_read()), so we need the rwlock to prevent
		 * the mapping from being changed by condensing.
		 */
		rw_enter(&v->vdev_indirect_rwlock, RW_READER);
		vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping;
		ASSERT3P(vim, !=, NULL);

		ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
		ASSERT(rs->rs_asize > 0);

		vdev_indirect_mapping_entry_phys_t *mapping =
		    vdev_indirect_mapping_entry_for_offset(vim, rs->rs_offset);
		ASSERT3P(mapping, !=, NULL);

		while (rs->rs_asize > 0) {
			/*
			 * Note: the vdev_indirect_mapping cannot change
			 * while we are running. It only changes while the
			 * removal is in progress, and then only from syncing
			 * context. While a removal is in progress, this
			 * function is only called for frees, which also only
			 * happen from syncing context.
			 */

			uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst);
			uint64_t dst_offset =
			    DVA_GET_OFFSET(&mapping->vimep_dst);
			uint64_t dst_vdev = DVA_GET_VDEV(&mapping->vimep_dst);

			ASSERT3U(rs->rs_offset, >=,
			    DVA_MAPPING_GET_SRC_OFFSET(mapping));
			ASSERT3U(rs->rs_offset, <,
			    DVA_MAPPING_GET_SRC_OFFSET(mapping) + size);
			ASSERT3U(dst_vdev, !=, v->vdev_id);

			uint64_t inner_offset = rs->rs_offset -
			    DVA_MAPPING_GET_SRC_OFFSET(mapping);
			uint64_t inner_size =
			    MIN(rs->rs_asize, size - inner_offset);

			vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
			ASSERT3P(dst_v, !=, NULL);

			if (dst_v->vdev_ops == &vdev_indirect_ops) {
				list_insert_head(&stack,
				    rs_alloc(dst_v, dst_offset + inner_offset,
				    inner_size, rs->rs_split_offset));
			}

			if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) &&
			    IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) {
				/*
				 * Note: This clause exists solely for
				 * testing purposes. We use it to ensure that
				 * split blocks work and that the callbacks
				 * using them yield the same result if issued
				 * in reverse order.
				 */
				uint64_t inner_half = inner_size / 2;

				func(rs->rs_split_offset + inner_half, dst_v,
				    dst_offset + inner_offset + inner_half,
				    inner_half, arg);

				func(rs->rs_split_offset, dst_v,
				    dst_offset + inner_offset,
				    inner_half, arg);
			} else {
				func(rs->rs_split_offset, dst_v,
				    dst_offset + inner_offset,
				    inner_size, arg);
			}

			rs->rs_offset += inner_size;
			rs->rs_asize -= inner_size;
			rs->rs_split_offset += inner_size;
			mapping++;
		}

		rw_exit(&v->vdev_indirect_rwlock);
		kmem_free(rs, sizeof (remap_segment_t));
	}
	list_destroy(&stack);
}

static void
vdev_indirect_child_io_done(zio_t *zio)
{
	zio_t *pio = zio->io_private;

	mutex_enter(&pio->io_lock);
	pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
	mutex_exit(&pio->io_lock);

	abd_put(zio->io_abd);
}

static void
vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	zio_t *zio = arg;

	ASSERT3P(vd, !=, NULL);

	if (vd->vdev_ops == &vdev_indirect_ops)
		return;

	zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset,
	    abd_get_offset(zio->io_abd, split_offset),
	    size, zio->io_type, zio->io_priority,
	    0, vdev_indirect_child_io_done, zio));
}

static void
vdev_indirect_io_start(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
	if (zio->io_type != ZIO_TYPE_READ) {
		ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
		ASSERT((zio->io_flags &
		    (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
	}

	vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
	    vdev_indirect_io_start_cb, zio);

	zio_execute(zio);
}

vdev_ops_t vdev_indirect_ops = {
	vdev_indirect_open,
	vdev_indirect_close,
	vdev_default_asize,
	vdev_indirect_io_start,
	vdev_indirect_io_done,
	NULL,
	NULL,
	NULL,
	vdev_indirect_remap,
	VDEV_TYPE_INDIRECT,	/* name of this vdev type */
	B_FALSE			/* leaf vdev */
};