/*
 * CDDL HEADER START
 *
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/metaslab.h>
#include <sys/refcount.h>
#include <sys/dmu.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_synctask.h>
#include <sys/zap.h>
#include <sys/abd.h>
#include <sys/zthr.h>

/*
 * An indirect vdev corresponds to a vdev that has been removed.  Since
 * we cannot rewrite block pointers of snapshots, etc., we keep a
 * mapping from old location on the removed device to the new location
 * on another device in the pool and use this mapping whenever we need
 * to access the DVA.  Unfortunately, this mapping did not respect
 * logical block boundaries when it was first created, and so a DVA on
 * this indirect vdev may be "split" into multiple sections that each
 * map to a different location.  As a consequence, not all DVAs can be
 * translated to an equivalent new DVA.  Instead we must provide a
 * "vdev_remap" operation that executes a callback on each contiguous
 * segment of the new location.  This function is used in multiple ways:
 *
 *  - reads and repair writes to this device use the callback to create
 *    a child io for each mapped segment.
 *
 *  - frees and claims to this device use the callback to free or claim
 *    each mapped segment.  (Note that we don't actually need to claim
 *    log blocks on indirect vdevs, because we don't allocate to
 *    removing vdevs.  However, zdb uses zio_claim() for its leak
 *    detection.)
 */
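
/*
 * As a hypothetical illustration of a split mapping: a single 48KB DVA
 * on the removed device might map to 32KB on vdev 3 followed by 16KB
 * on vdev 5.  Such a DVA cannot be rewritten as one new DVA, so
 * vdev_indirect_remap() invokes its callback twice, once per
 * contiguous segment; for a read, each invocation issues a child zio
 * that fills the corresponding slice of the parent zio's abd (see
 * vdev_indirect_io_start_cb() below).
 */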

/*
 * "Big theory statement" for how we mark blocks obsolete.
 *
 * When a block on an indirect vdev is freed or remapped, a section of
 * that vdev's mapping may no longer be referenced (aka "obsolete").  We
 * keep track of how much of each mapping entry is obsolete.  When
 * an entry becomes completely obsolete, we can remove it, thus reducing
 * the memory used by the mapping.  The complete picture of obsolescence
 * is given by the following data structures, described below:
 *  - the entry-specific obsolete count
 *  - the vdev-specific obsolete spacemap
 *  - the pool-specific obsolete bpobj
 *
 * == On disk data structures used ==
 *
 * We track the obsolete space for the pool using several objects.  Each
 * of these objects is created on demand and freed when no longer
 * needed, and is assumed to be empty if it does not exist.
 * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
 *
 *  - Each vic_mapping_object (associated with an indirect vdev) can
 *    have a vimp_counts_object.  This is an array of uint32_t's
 *    with the same number of entries as the vic_mapping_object.  When
 *    the mapping is condensed, entries from the vic_obsolete_sm_object
 *    (see below) are folded into the counts.  Therefore, each
 *    obsolete_counts entry tells us the number of bytes in the
 *    corresponding mapping entry that were not referenced when the
 *    mapping was last condensed.
 *
 *  - Each indirect or removing vdev can have a vic_obsolete_sm_object.
 *    This is a space map containing an alloc entry for every DVA that
 *    has been obsoleted since the last time this indirect vdev was
 *    condensed.  We use this object in order to improve performance
 *    when marking a DVA as obsolete.  Instead of modifying an arbitrary
 *    offset of the vimp_counts_object, we only need to append an entry
 *    to the end of this object.  When a DVA becomes obsolete, it is
 *    added to the obsolete space map.  This happens when the DVA is
 *    freed, remapped and not referenced by a snapshot, or the last
 *    snapshot referencing it is destroyed.
 *
 *  - Each dataset can have a ds_remap_deadlist object.  This is a
 *    deadlist object containing all blocks that were remapped in this
 *    dataset but referenced in a previous snapshot.  Blocks can *only*
 *    appear on this list if they were remapped (dsl_dataset_block_remapped);
 *    blocks that were killed in a head dataset are put on the normal
 *    ds_deadlist and marked obsolete when they are freed.
 *
 *  - The pool can have a dp_obsolete_bpobj.  This is a list of blocks
 *    in the pool that need to be marked obsolete.  When a snapshot is
 *    destroyed, we move some of the ds_remap_deadlist to the obsolete
 *    bpobj (see dsl_destroy_snapshot_handle_remaps()).  We then
 *    asynchronously process the obsolete bpobj, moving its entries to
 *    the specific vdevs' obsolete space maps.
 *
 * == Summary of how we mark blocks as obsolete ==
 *
 *  - When freeing a block: if any DVA is on an indirect vdev, append to
 *    vic_obsolete_sm_object.
 *  - When remapping a block, add dva to ds_remap_deadlist (if prev snap
 *    references; otherwise append to vic_obsolete_sm_object).
 *  - When freeing a snapshot: move parts of ds_remap_deadlist to
 *    dp_obsolete_bpobj (same algorithm as ds_deadlist).
 *  - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
 *    individual vdev's vic_obsolete_sm_object.
 */
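
/*
 * To walk one hypothetical block B through these structures: B is
 * remapped while snapshot S still references it, so B goes on the
 * dataset's ds_remap_deadlist.  When S is destroyed, B's entry moves
 * to the pool's dp_obsolete_bpobj.  When the spa syncs that bpobj,
 * B's range is appended to the vic_obsolete_sm_object of the indirect
 * vdev holding its DVA, and the next condense of that vdev folds the
 * range into the vimp_counts_object.
 */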

/*
 * "Big theory statement" for how we condense indirect vdevs.
 *
 * Condensing an indirect vdev's mapping is the process of determining
 * the precise counts of obsolete space for each mapping entry (by
 * integrating the obsolete spacemap into the obsolete counts) and
 * writing out a new mapping that contains only referenced entries.
 *
 * We condense a vdev when we expect the mapping to shrink (see
 * vdev_indirect_should_condense()), but only perform one condense at a
 * time to limit the memory usage.  In addition, we use a separate
 * open-context thread (spa_condense_indirect_thread) to incrementally
 * create the new mapping object in a way that minimizes the impact on
 * the rest of the system.
 *
 * == Generating a new mapping ==
 *
 * To generate a new mapping, we follow these steps:
 *
 * 1. Save the old obsolete space map and create a new mapping object
 *    (see spa_condense_indirect_start_sync()).  This initializes the
 *    spa_condensing_indirect_phys with the "previous obsolete space map",
 *    which is now read only.  Newly obsolete DVAs will be added to a
 *    new (initially empty) obsolete space map, and will not be
 *    considered as part of this condense operation.
 *
 * 2. Construct in memory the precise counts of obsolete space for each
 *    mapping entry, by incorporating the obsolete space map into the
 *    counts.  (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
 *
 * 3. Iterate through each mapping entry, writing to the new mapping any
 *    entries that are not completely obsolete (i.e. which don't have
 *    obsolete count == mapping length).  (See
 *    spa_condense_indirect_generate_new_mapping().)
 *
 * 4. Destroy the old mapping object and switch over to the new one
 *    (spa_condense_indirect_complete_sync).
 *
 * == Restarting from failure ==
 *
 * To restart the condense when we import/open the pool, we must start
 * at the 2nd step above: reconstruct the precise counts in memory,
 * based on the space map + counts.  Then in the 3rd step, we start
 * iterating where we left off: at vimp_max_offset of the new mapping
 * object.
 */
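
/*
 * For example (hypothetical): if the pool is exported while step 3 has
 * written new-mapping entries covering source offsets [0, 10GB), then
 * on import we redo step 2 from the saved (now read-only) obsolete
 * space map, and resume step 3 at the old-mapping entry containing or
 * following offset 10GB, which is vdev_indirect_mapping_max_offset()
 * of the partially-written new mapping.
 */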

boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE;

/*
 * Condense if at least this percent of the bytes in the mapping is
 * obsolete.  With the default of 25%, the amount of space mapped
 * will be reduced to 1% of its original size after at most 16
 * condenses (since 0.75^16 is about 0.01).  Higher values will
 * condense less often (causing less i/o); lower values will reduce
 * the mapping size more quickly.
 */
int zfs_indirect_condense_obsolete_pct = 25;

/*
 * Condense if the obsolete space map takes up more than this amount of
 * space on disk (logically).  This limits the amount of disk space
 * consumed by the obsolete space map; the default of 1GB is small enough
 * that we typically don't mind "wasting" it.
 */
uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;

/*
 * Don't bother condensing if the mapping uses less than this amount of
 * memory.  The default of 128KB is considered a "trivial" amount of
 * memory and not worth reducing.
 */
uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;

/*
 * This is used by the test suite so that it can ensure that certain
 * actions happen while in the middle of a condense (which might otherwise
 * complete too quickly).  If used to reduce the performance impact of
 * condensing in production, a maximum value of 1 should be sufficient.
 */
int zfs_condense_indirect_commit_entry_delay_ticks = 0;

/*
 * Mark the given offset and size as being obsolete.
 */
void
vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size)
{
        spa_t *spa = vd->vdev_spa;

        ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
        ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
        ASSERT(size > 0);
        VERIFY(vdev_indirect_mapping_entry_for_offset(
            vd->vdev_indirect_mapping, offset) != NULL);

        if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
                mutex_enter(&vd->vdev_obsolete_lock);
                range_tree_add(vd->vdev_obsolete_segments, offset, size);
                mutex_exit(&vd->vdev_obsolete_lock);
                vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa));
        }
}

/*
 * Mark the DVA vdev_id:offset:size as being obsolete in the given tx.  This
 * wrapper is provided because the DMU does not know about vdev_t's and
 * cannot directly call vdev_indirect_mark_obsolete.
 */
void
spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
    uint64_t size, dmu_tx_t *tx)
{
        vdev_t *vd = vdev_lookup_top(spa, vdev_id);
        ASSERT(dmu_tx_is_syncing(tx));

        /* The DMU can only remap indirect vdevs. */
        ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
        vdev_indirect_mark_obsolete(vd, offset, size);
}

static spa_condensing_indirect_t *
spa_condensing_indirect_create(spa_t *spa)
{
        spa_condensing_indirect_phys_t *scip =
            &spa->spa_condensing_indirect_phys;
        spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP);
        objset_t *mos = spa->spa_meta_objset;

        for (int i = 0; i < TXG_SIZE; i++) {
                list_create(&sci->sci_new_mapping_entries[i],
                    sizeof (vdev_indirect_mapping_entry_t),
                    offsetof(vdev_indirect_mapping_entry_t, vime_node));
        }

        sci->sci_new_mapping =
            vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object);

        return (sci);
}

static void
spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci)
{
        for (int i = 0; i < TXG_SIZE; i++)
                list_destroy(&sci->sci_new_mapping_entries[i]);

        if (sci->sci_new_mapping != NULL)
                vdev_indirect_mapping_close(sci->sci_new_mapping);

        kmem_free(sci, sizeof (*sci));
}

boolean_t
vdev_indirect_should_condense(vdev_t *vd)
{
        vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
        spa_t *spa = vd->vdev_spa;

        ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));

        if (!zfs_condense_indirect_vdevs_enable)
                return (B_FALSE);

        /*
         * We can only condense one indirect vdev at a time.
         */
        if (spa->spa_condensing_indirect != NULL)
                return (B_FALSE);

        if (spa_shutting_down(spa))
                return (B_FALSE);

        /*
         * The mapping object size must not change while we are
         * condensing, so we can only condense indirect vdevs
         * (not vdevs that are still in the middle of being removed).
         */
        if (vd->vdev_ops != &vdev_indirect_ops)
                return (B_FALSE);

        /*
         * If nothing new has been marked obsolete, there is no
         * point in condensing.
         */
        if (vd->vdev_obsolete_sm == NULL) {
                ASSERT0(vdev_obsolete_sm_object(vd));
                return (B_FALSE);
        }

        ASSERT(vd->vdev_obsolete_sm != NULL);

        ASSERT3U(vdev_obsolete_sm_object(vd), ==,
            space_map_object(vd->vdev_obsolete_sm));

        uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim);
        uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
        uint64_t mapping_size = vdev_indirect_mapping_size(vim);
        uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);

        ASSERT3U(bytes_obsolete, <=, bytes_mapped);

        /*
         * If a high percentage of the bytes that are mapped have become
         * obsolete, condense (unless the mapping is already small enough).
         * This has a good chance of reducing the amount of memory used
         * by the mapping.
         */
        if (bytes_obsolete * 100 / bytes_mapped >=
            zfs_indirect_condense_obsolete_pct &&
            mapping_size > zfs_condense_min_mapping_bytes) {
                zfs_dbgmsg("should condense vdev %llu because obsolete "
                    "spacemap covers %d%% of %lluMB mapping",
                    (u_longlong_t)vd->vdev_id,
                    (int)(bytes_obsolete * 100 / bytes_mapped),
                    (u_longlong_t)bytes_mapped / 1024 / 1024);
                return (B_TRUE);
        }

        /*
         * If the obsolete space map takes up too much space on disk,
         * condense in order to free up this disk space.
         */
        if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) {
                zfs_dbgmsg("should condense vdev %llu because obsolete sm "
                    "length %lluMB >= max size %lluMB",
                    (u_longlong_t)vd->vdev_id,
                    (u_longlong_t)obsolete_sm_size / 1024 / 1024,
                    (u_longlong_t)zfs_condense_max_obsolete_bytes /
                    1024 / 1024);
                return (B_TRUE);
        }

        return (B_FALSE);
}
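
/*
 * To make the first check above concrete (hypothetical numbers): with
 * bytes_mapped = 800MB, bytes_obsolete = 250MB, and mapping_size = 4MB,
 * we get 250 * 100 / 800 = 31, which meets the default
 * zfs_indirect_condense_obsolete_pct of 25, and 4MB exceeds
 * zfs_condense_min_mapping_bytes (128KB), so we would condense.
 */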

/*
 * This sync task completes a condense, deleting the old
 * mapping and replacing it with the new one.
 */
static void
spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
{
        spa_condensing_indirect_t *sci = arg;
        spa_t *spa = dmu_tx_pool(tx)->dp_spa;
        spa_condensing_indirect_phys_t *scip =
            &spa->spa_condensing_indirect_phys;
        vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
        vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
        objset_t *mos = spa->spa_meta_objset;
        vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
        uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
        uint64_t new_count =
            vdev_indirect_mapping_num_entries(sci->sci_new_mapping);

        ASSERT(dmu_tx_is_syncing(tx));
        ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
        ASSERT3P(sci, ==, spa->spa_condensing_indirect);
        for (int i = 0; i < TXG_SIZE; i++) {
                ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
        }
        ASSERT(vic->vic_mapping_object != 0);
        ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
        ASSERT(scip->scip_next_mapping_object != 0);
        ASSERT(scip->scip_prev_obsolete_sm_object != 0);

        /*
         * Reset vdev_indirect_mapping to refer to the new object.
         */
        rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
        vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
        vd->vdev_indirect_mapping = sci->sci_new_mapping;
        rw_exit(&vd->vdev_indirect_rwlock);

        sci->sci_new_mapping = NULL;
        vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
        vic->vic_mapping_object = scip->scip_next_mapping_object;
        scip->scip_next_mapping_object = 0;

        space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
        spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
        scip->scip_prev_obsolete_sm_object = 0;

        scip->scip_vdev = 0;

        VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
            DMU_POOL_CONDENSING_INDIRECT, tx));
        spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
        spa->spa_condensing_indirect = NULL;

        zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
            "new mapping object %llu has %llu entries "
            "(was %llu entries)",
            vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object,
            new_count, old_count);

        vdev_config_dirty(spa->spa_root_vdev);
}
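
/*
 * Note that the zap_remove() of DMU_POOL_CONDENSING_INDIRECT above is
 * what makes completion durable: once this sync task's txg is on disk,
 * spa_condense_init() will no longer find the ZAP entry on import, so
 * the condense will not be restarted.
 */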

/*
 * This sync task appends entries to the new mapping object.
 */
static void
spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
{
        spa_condensing_indirect_t *sci = arg;
        uint64_t txg = dmu_tx_get_txg(tx);
        spa_t *spa = dmu_tx_pool(tx)->dp_spa;

        ASSERT(dmu_tx_is_syncing(tx));
        ASSERT3P(sci, ==, spa->spa_condensing_indirect);

        vdev_indirect_mapping_add_entries(sci->sci_new_mapping,
            &sci->sci_new_mapping_entries[txg & TXG_MASK], tx);
        ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK]));
}

/*
 * Open-context function to add one entry to the new mapping.  The new
 * entry will be remembered and written from syncing context.
 */
static void
spa_condense_indirect_commit_entry(spa_t *spa,
    vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
{
        spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;

        ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));

        dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
        dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
        VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
        int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;

        /*
         * If we are the first entry committed this txg, kick off the sync
         * task to write to the MOS on our behalf.
         */
        if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
                dsl_sync_task_nowait(dmu_tx_pool(tx),
                    spa_condense_indirect_commit_sync, sci,
                    0, ZFS_SPACE_CHECK_NONE, tx);
        }

        vdev_indirect_mapping_entry_t *vime =
            kmem_alloc(sizeof (*vime), KM_SLEEP);
        vime->vime_mapping = *vimep;
        vime->vime_obsolete_count = count;
        list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);

        dmu_tx_commit(tx);
}
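
/*
 * Entries committed above are thus batched per txg: the first entry
 * assigned to a given txg registers the _commit_sync() sync task, and
 * every entry assigned to that txg is queued on the corresponding
 * sci_new_mapping_entries[] list, which the sync task then drains into
 * the new mapping object in one call.
 */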

static void
spa_condense_indirect_generate_new_mapping(vdev_t *vd,
    uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
{
        spa_t *spa = vd->vdev_spa;
        uint64_t mapi = start_index;
        vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
        uint64_t old_num_entries =
            vdev_indirect_mapping_num_entries(old_mapping);

        ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
        ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);

        zfs_dbgmsg("starting condense of vdev %llu from index %llu",
            (u_longlong_t)vd->vdev_id,
            (u_longlong_t)mapi);

        while (mapi < old_num_entries) {

                if (zthr_iscancelled(zthr)) {
                        zfs_dbgmsg("pausing condense of vdev %llu "
                            "at index %llu", (u_longlong_t)vd->vdev_id,
                            (u_longlong_t)mapi);
                        break;
                }

                vdev_indirect_mapping_entry_phys_t *entry =
                    &old_mapping->vim_entries[mapi];
                uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
                ASSERT3U(obsolete_counts[mapi], <=, entry_size);
                if (obsolete_counts[mapi] < entry_size) {
                        spa_condense_indirect_commit_entry(spa, entry,
                            obsolete_counts[mapi]);

                        /*
                         * This delay may be requested for testing, debugging,
                         * or performance reasons.
                         */
                        delay(zfs_condense_indirect_commit_entry_delay_ticks);
                }

                mapi++;
        }
}
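
/*
 * Note that when the loop above is cancelled it simply stops; no
 * progress record is kept beyond the entries already committed to the
 * new mapping.  That is sufficient, because the resume logic in
 * spa_condense_indirect_thread() derives the restart index from
 * vdev_indirect_mapping_max_offset() of the new mapping.
 */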

/* ARGSUSED */
static boolean_t
spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
{
        spa_t *spa = arg;

        return (spa->spa_condensing_indirect != NULL);
}

/* ARGSUSED */
static int
spa_condense_indirect_thread(void *arg, zthr_t *zthr)
{
        spa_t *spa = arg;
        vdev_t *vd;

        ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
        spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
        vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
        ASSERT3P(vd, !=, NULL);
        spa_config_exit(spa, SCL_VDEV, FTAG);

        spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
        spa_condensing_indirect_phys_t *scip =
            &spa->spa_condensing_indirect_phys;
        uint32_t *counts;
        uint64_t start_index;
        vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
        space_map_t *prev_obsolete_sm = NULL;

        ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
        ASSERT(scip->scip_next_mapping_object != 0);
        ASSERT(scip->scip_prev_obsolete_sm_object != 0);
        ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);

        for (int i = 0; i < TXG_SIZE; i++) {
                /*
                 * The list must start out empty in order for the
                 * _commit_sync() sync task to be properly registered
                 * on the first call to _commit_entry(); so it's wise
                 * to double check and ensure we actually are starting
                 * with empty lists.
                 */
                ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
        }

        VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
            scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
        space_map_update(prev_obsolete_sm);
        counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
        if (prev_obsolete_sm != NULL) {
                vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
                    counts, prev_obsolete_sm);
        }
        space_map_close(prev_obsolete_sm);

        /*
         * Generate new mapping.  Determine what index to continue from
         * based on the max offset that we've already written in the
         * new mapping.
         */
        uint64_t max_offset =
            vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
        if (max_offset == 0) {
                /* We haven't written anything to the new mapping yet. */
                start_index = 0;
        } else {
                /*
                 * Pick up from where we left off.
                 * _entry_for_offset_or_next() returns a pointer into
                 * the vim_entries array.  If max_offset is greater than
                 * any of the mappings contained in the table, NULL will
                 * be returned, indicating we've exhausted our iteration
                 * of the old_mapping.
                 */

                vdev_indirect_mapping_entry_phys_t *entry =
                    vdev_indirect_mapping_entry_for_offset_or_next(old_mapping,
                    max_offset);

                if (entry == NULL) {
                        /*
                         * We've already written the whole new mapping.
                         * This special value will cause us to skip the
                         * generate_new_mapping step and just do the sync
                         * task to complete the condense.
                         */
                        start_index = UINT64_MAX;
                } else {
                        start_index = entry - old_mapping->vim_entries;
                        ASSERT3U(start_index, <,
                            vdev_indirect_mapping_num_entries(old_mapping));
                }
        }

        spa_condense_indirect_generate_new_mapping(vd, counts,
            start_index, zthr);

        vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);

        /*
         * If the zthr has received a cancellation signal while running
         * in generate_new_mapping() or at any point after that, then bail
         * early.  We don't want to complete the condense if the spa is
         * shutting down.
         */
        if (zthr_iscancelled(zthr))
                return (0);

        VERIFY0(dsl_sync_task(spa_name(spa), NULL,
            spa_condense_indirect_complete_sync, sci, 0,
            ZFS_SPACE_CHECK_EXTRA_RESERVED));

        return (0);
}
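
/*
 * Completion runs with ZFS_SPACE_CHECK_EXTRA_RESERVED, presumably
 * because it frees space on net (destroying the old mapping object and
 * the previous obsolete space map) and so should be allowed to proceed
 * even when the pool is nearly full.
 */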

/*
 * Sync task to begin the condensing process.
 */
void
spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
{
        spa_t *spa = vd->vdev_spa;
        spa_condensing_indirect_phys_t *scip =
            &spa->spa_condensing_indirect_phys;

        ASSERT0(scip->scip_next_mapping_object);
        ASSERT0(scip->scip_prev_obsolete_sm_object);
        ASSERT0(scip->scip_vdev);
        ASSERT(dmu_tx_is_syncing(tx));
        ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
        ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS));
        ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping));

        uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd);
        ASSERT(obsolete_sm_obj != 0);

        scip->scip_vdev = vd->vdev_id;
        scip->scip_next_mapping_object =
            vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx);

        scip->scip_prev_obsolete_sm_object = obsolete_sm_obj;

        /*
         * We don't need to allocate a new space map object, since
         * vdev_indirect_sync_obsolete will allocate one when needed.
         */
        space_map_close(vd->vdev_obsolete_sm);
        vd->vdev_obsolete_sm = NULL;
        VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
            VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));

        VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT,
            DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
            sizeof (*scip) / sizeof (uint64_t), scip, tx));

        ASSERT3P(spa->spa_condensing_indirect, ==, NULL);
        spa->spa_condensing_indirect = spa_condensing_indirect_create(spa);

        zfs_dbgmsg("starting condense of vdev %llu in txg %llu: "
            "posm=%llu nm=%llu",
            vd->vdev_id, dmu_tx_get_txg(tx),
            (u_longlong_t)scip->scip_prev_obsolete_sm_object,
            (u_longlong_t)scip->scip_next_mapping_object);

        zthr_wakeup(spa->spa_condense_zthr);
}

/*
 * Sync to the given vdev's obsolete space map any segments that are no longer
 * referenced as of the given txg.
 *
 * If the obsolete space map doesn't exist yet, create and open it.
 */
void
vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
{
        spa_t *spa = vd->vdev_spa;
        vdev_indirect_config_t *vic = &vd->vdev_indirect_config;

        ASSERT3U(vic->vic_mapping_object, !=, 0);
        ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0);
        ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
        ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS));

        if (vdev_obsolete_sm_object(vd) == 0) {
                uint64_t obsolete_sm_object =
                    space_map_alloc(spa->spa_meta_objset,
                    vdev_standard_sm_blksz, tx);

                ASSERT(vd->vdev_top_zap != 0);
                VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
                    vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
                    sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx));
                ASSERT3U(vdev_obsolete_sm_object(vd), !=, 0);

                spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
                VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
                    spa->spa_meta_objset, obsolete_sm_object,
                    0, vd->vdev_asize, 0));
                space_map_update(vd->vdev_obsolete_sm);
        }

        ASSERT(vd->vdev_obsolete_sm != NULL);
        ASSERT3U(vdev_obsolete_sm_object(vd), ==,
            space_map_object(vd->vdev_obsolete_sm));

        space_map_write(vd->vdev_obsolete_sm,
            vd->vdev_obsolete_segments, SM_ALLOC, tx);
        space_map_update(vd->vdev_obsolete_sm);
        range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
}
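
/*
 * The space map created above is one of the objects counted by
 * SPA_FEATURE_OBSOLETE_COUNTS (hence the spa_feature_incr()); the
 * matching spa_feature_decr() happens when a condense completes and the
 * previous obsolete space map is destroyed in
 * spa_condense_indirect_complete_sync().
 */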

int
spa_condense_init(spa_t *spa)
{
        int error = zap_lookup(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT,
            DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
            sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t),
            &spa->spa_condensing_indirect_phys);
        if (error == 0) {
                if (spa_writeable(spa)) {
                        spa->spa_condensing_indirect =
                            spa_condensing_indirect_create(spa);
                }
                return (0);
        } else if (error == ENOENT) {
                return (0);
        } else {
                return (error);
        }
}

void
spa_condense_fini(spa_t *spa)
{
        if (spa->spa_condensing_indirect != NULL) {
                spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
                spa->spa_condensing_indirect = NULL;
        }
}

void
spa_start_indirect_condensing_thread(spa_t *spa)
{
        ASSERT3P(spa->spa_condense_zthr, ==, NULL);
        spa->spa_condense_zthr = zthr_create(spa_condense_indirect_thread_check,
            spa_condense_indirect_thread, spa);
}

/*
 * Gets the obsolete spacemap object from the vdev's ZAP.
 * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't
 * exist yet.
 */
uint64_t
vdev_obsolete_sm_object(vdev_t *vd)
{
        ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
        if (vd->vdev_top_zap == 0) {
                return (0);
        }

        uint64_t sm_obj = 0;
        int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
            VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (sm_obj), 1, &sm_obj);

        ASSERT(err == 0 || err == ENOENT);

        return (sm_obj);
}

boolean_t
vdev_obsolete_counts_are_precise(vdev_t *vd)
{
        ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
        if (vd->vdev_top_zap == 0) {
                return (B_FALSE);
        }

        uint64_t val = 0;
        int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
            VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val);

        ASSERT(err == 0 || err == ENOENT);

        return (val != 0);
}

/* ARGSUSED */
static void
vdev_indirect_close(vdev_t *vd)
{
}

/* ARGSUSED */
static void
vdev_indirect_io_done(zio_t *zio)
{
}

/* ARGSUSED */
static int
vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
        *psize = *max_psize = vd->vdev_asize +
            VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
        *ashift = vd->vdev_ashift;
        return (0);
}

typedef struct remap_segment {
        vdev_t *rs_vd;
        uint64_t rs_offset;
        uint64_t rs_asize;
        uint64_t rs_split_offset;
        list_node_t rs_node;
} remap_segment_t;

remap_segment_t *
rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
{
        remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP);
        rs->rs_vd = vd;
        rs->rs_offset = offset;
        rs->rs_asize = asize;
        rs->rs_split_offset = split_offset;
        return (rs);
}

/*
 * Given an indirect vdev and an extent on that vdev, this function
 * duplicates the physical entries of the indirect mapping that
 * correspond to the extent to a new array and returns a pointer to it.
 * In addition, copied_entries is populated with the number of mapping
 * entries that were duplicated.
 *
 * Note that the function assumes that the caller holds vdev_indirect_rwlock.
 * This ensures that the mapping won't change due to condensing as we
 * copy over its contents.
 *
 * Finally, since we are doing an allocation, it is up to the caller to
 * free the array allocated in this function.
 */
vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
    uint64_t asize, uint64_t *copied_entries)
{
        vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
        vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
        uint64_t entries = 0;

        ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock));

        vdev_indirect_mapping_entry_phys_t *first_mapping =
            vdev_indirect_mapping_entry_for_offset(vim, offset);
        ASSERT3P(first_mapping, !=, NULL);

        vdev_indirect_mapping_entry_phys_t *m = first_mapping;
        while (asize > 0) {
                uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);

                ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m));
                ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size);

                uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
                uint64_t inner_size = MIN(asize, size - inner_offset);

                offset += inner_size;
                asize -= inner_size;
                entries++;
                m++;
        }

        size_t copy_length = entries * sizeof (*first_mapping);
        duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP);
        bcopy(first_mapping, duplicate_mappings, copy_length);
        *copied_entries = entries;

        return (duplicate_mappings);
}
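
/*
 * For example (hypothetical): if the mapping contains entries for
 * source offsets [0, 16KB) and [16KB, 48KB), a call with offset = 8KB
 * and asize = 16KB spans both entries, so a two-entry copy is returned
 * even though only part of each entry overlaps the requested extent.
 */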

/*
 * Goes through the relevant indirect mappings until it hits a concrete vdev
 * and issues the callback.  On the way to the concrete vdev, if any other
 * indirect vdevs are encountered, then the callback will also be called on
 * each of those indirect vdevs.  For example, if the segment is mapped to
 * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is
 * mapped to segment B on concrete vdev 2, then the callback will be called on
 * both vdev 1 and vdev 2.
 *
 * While the callback passed to vdev_indirect_remap() is called on every vdev
 * the function encounters, certain callbacks only care about concrete vdevs.
 * These types of callbacks should return immediately and explicitly when they
 * are called on an indirect vdev.
 *
 * Because there is a possibility that a DVA section in the indirect device
 * has been split into multiple sections in our mapping, we keep track
 * of the relevant contiguous segments of the new location (remap_segment_t)
 * in a stack.  This way we can call the callback for each of the new sections
 * created by a single section of the indirect device.  Note though, that in
 * this scenario the callbacks in each split block won't occur in-order in
 * terms of offset, so callers should not make any assumptions about that.
 *
 * For callbacks that don't handle split blocks and immediately return when
 * they encounter them (as is the case for remap_blkptr_cb), the caller can
 * assume that its callback will be applied from the first indirect vdev
 * encountered to the last one and then the concrete vdev, in that order.
 */
static void
vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
    void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg)
{
        list_t stack;
        spa_t *spa = vd->vdev_spa;

        list_create(&stack, sizeof (remap_segment_t),
            offsetof(remap_segment_t, rs_node));

        for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
            rs != NULL; rs = list_remove_head(&stack)) {
                vdev_t *v = rs->rs_vd;
                uint64_t num_entries = 0;

                ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
                ASSERT(rs->rs_asize > 0);

                /*
                 * Note: As this function can be called from open context
                 * (e.g. zio_read()), we need the following rwlock to
                 * prevent the mapping from being changed by condensing.
                 *
                 * So we grab the lock and we make a copy of the entries
                 * that are relevant to the extent that we are working on.
                 * Once that is done, we drop the lock and iterate over
                 * our copy of the mapping.  Once we are done with the
                 * remap segment and we free it, we also free our copy
                 * of the indirect mapping entries that are relevant to it.
                 *
                 * This way we don't need to wait until the function is
                 * finished with a segment, to condense it.  In addition, we
                 * don't need a recursive rwlock for the case that a call to
                 * vdev_indirect_remap() needs to call itself (through the
                 * codepath of its callback) for the same vdev in the middle
                 * of its execution.
                 */
                rw_enter(&v->vdev_indirect_rwlock, RW_READER);
                vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping;
                ASSERT3P(vim, !=, NULL);

                vdev_indirect_mapping_entry_phys_t *mapping =
                    vdev_indirect_mapping_duplicate_adjacent_entries(v,
                    rs->rs_offset, rs->rs_asize, &num_entries);
                ASSERT3P(mapping, !=, NULL);
                ASSERT3U(num_entries, >, 0);
                rw_exit(&v->vdev_indirect_rwlock);

                for (uint64_t i = 0; i < num_entries; i++) {
                        /*
                         * Note: the vdev_indirect_mapping can not change
                         * while we are running.  It only changes while the
                         * removal is in progress, and then only from syncing
                         * context.  While a removal is in progress, this
                         * function is only called for frees, which also only
                         * happen from syncing context.
                         */
                        vdev_indirect_mapping_entry_phys_t *m = &mapping[i];

                        ASSERT3P(m, !=, NULL);
                        ASSERT3U(rs->rs_asize, >, 0);

                        uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
                        uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
                        uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);

                        ASSERT3U(rs->rs_offset, >=,
                            DVA_MAPPING_GET_SRC_OFFSET(m));
                        ASSERT3U(rs->rs_offset, <,
                            DVA_MAPPING_GET_SRC_OFFSET(m) + size);
                        ASSERT3U(dst_vdev, !=, v->vdev_id);

                        uint64_t inner_offset = rs->rs_offset -
                            DVA_MAPPING_GET_SRC_OFFSET(m);
                        uint64_t inner_size =
                            MIN(rs->rs_asize, size - inner_offset);

                        vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
                        ASSERT3P(dst_v, !=, NULL);

                        if (dst_v->vdev_ops == &vdev_indirect_ops) {
                                list_insert_head(&stack,
                                    rs_alloc(dst_v, dst_offset + inner_offset,
                                    inner_size, rs->rs_split_offset));
                        }

                        if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) &&
                            IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) {
                                /*
                                 * Note: This clause exists solely for
                                 * testing purposes.  We use it to ensure that
                                 * split blocks work and that the callbacks
                                 * using them yield the same result if issued
                                 * in reverse order.
                                 */
                                uint64_t inner_half = inner_size / 2;

                                func(rs->rs_split_offset + inner_half, dst_v,
                                    dst_offset + inner_offset + inner_half,
                                    inner_half, arg);

                                func(rs->rs_split_offset, dst_v,
                                    dst_offset + inner_offset,
                                    inner_half, arg);
                        } else {
                                func(rs->rs_split_offset, dst_v,
                                    dst_offset + inner_offset,
                                    inner_size, arg);
                        }

                        rs->rs_offset += inner_size;
                        rs->rs_asize -= inner_size;
                        rs->rs_split_offset += inner_size;
                }
                VERIFY0(rs->rs_asize);

                kmem_free(mapping, num_entries * sizeof (*mapping));
                kmem_free(rs, sizeof (remap_segment_t));
        }
        list_destroy(&stack);
}
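
/*
 * Note that because segments are both pushed onto and popped from the
 * head of the stack, the traversal above is effectively depth-first:
 * once the entries of the current segment have been processed, the most
 * recently pushed segment is resolved all the way down to concrete
 * vdevs before earlier-pushed segments are processed.
 */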

static void
vdev_indirect_child_io_done(zio_t *zio)
{
        zio_t *pio = zio->io_private;

        mutex_enter(&pio->io_lock);
        pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
        mutex_exit(&pio->io_lock);

        abd_put(zio->io_abd);
}

static void
vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
        zio_t *zio = arg;

        ASSERT3P(vd, !=, NULL);

        if (vd->vdev_ops == &vdev_indirect_ops)
                return;

        zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset,
            abd_get_offset(zio->io_abd, split_offset),
            size, zio->io_type, zio->io_priority,
            0, vdev_indirect_child_io_done, zio));
}

static void
vdev_indirect_io_start(zio_t *zio)
{
        spa_t *spa = zio->io_spa;

        ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
        if (zio->io_type != ZIO_TYPE_READ) {
                ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
                ASSERT((zio->io_flags &
                    (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
        }

        vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
            vdev_indirect_io_start_cb, zio);

        zio_execute(zio);
}

vdev_ops_t vdev_indirect_ops = {
        vdev_indirect_open,
        vdev_indirect_close,
        vdev_default_asize,
        vdev_indirect_io_start,
        vdev_indirect_io_done,
        NULL,
        NULL,
        NULL,
        vdev_indirect_remap,
        VDEV_TYPE_INDIRECT,     /* name of this vdev type */
        B_FALSE                 /* leaf vdev */
};