1 /* 2 * CDDL HEADER START 3 * 4 * This file and its contents are supplied under the terms of the 5 * Common Development and Distribution License ("CDDL"), version 1.0. 6 * You may only use this file in accordance with the terms of version 7 * 1.0 of the CDDL. 8 * 9 * A full copy of the text of the CDDL should have accompanied this 10 * source. A copy of the CDDL is also available via the Internet at 11 * http://www.illumos.org/license/CDDL. 12 * 13 * CDDL HEADER END 14 */ 15 16 /* 17 * Copyright (c) 2015, 2017 by Delphix. All rights reserved. 18 */ 19 20 #include <sys/dmu_tx.h> 21 #include <sys/dsl_pool.h> 22 #include <sys/spa.h> 23 #include <sys/vdev_impl.h> 24 #include <sys/vdev_indirect_mapping.h> 25 #include <sys/zfeature.h> 26 #include <sys/dmu_objset.h> 27 28 #ifdef ZFS_DEBUG 29 static boolean_t 30 vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim) 31 { 32 ASSERT(vim != NULL); 33 34 ASSERT(vim->vim_object != 0); 35 ASSERT(vim->vim_objset != NULL); 36 ASSERT(vim->vim_phys != NULL); 37 ASSERT(vim->vim_dbuf != NULL); 38 39 EQUIV(vim->vim_phys->vimp_num_entries > 0, 40 vim->vim_entries != NULL); 41 if (vim->vim_phys->vimp_num_entries > 0) { 42 vdev_indirect_mapping_entry_phys_t *last_entry __maybe_unused = 43 &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1]; 44 uint64_t offset __maybe_unused = 45 DVA_MAPPING_GET_SRC_OFFSET(last_entry); 46 uint64_t size __maybe_unused = 47 DVA_GET_ASIZE(&last_entry->vimep_dst); 48 49 ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size); 50 } 51 if (vim->vim_havecounts) { 52 ASSERT(vim->vim_phys->vimp_counts_object != 0); 53 } 54 55 return (B_TRUE); 56 } 57 #else 58 #define vdev_indirect_mapping_verify(vim) ((void) sizeof (vim), B_TRUE) 59 #endif 60 61 uint64_t 62 vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim) 63 { 64 ASSERT(vdev_indirect_mapping_verify(vim)); 65 66 return (vim->vim_phys->vimp_num_entries); 67 } 68 69 uint64_t 70 vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim) 71 { 72 ASSERT(vdev_indirect_mapping_verify(vim)); 73 74 return (vim->vim_phys->vimp_max_offset); 75 } 76 77 uint64_t 78 vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim) 79 { 80 ASSERT(vdev_indirect_mapping_verify(vim)); 81 82 return (vim->vim_object); 83 } 84 85 uint64_t 86 vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t *vim) 87 { 88 ASSERT(vdev_indirect_mapping_verify(vim)); 89 90 return (vim->vim_phys->vimp_bytes_mapped); 91 } 92 93 /* 94 * The length (in bytes) of the mapping object array in memory and 95 * (logically) on disk. 96 * 97 * Note that unlike most of our accessor functions, 98 * we don't assert that the struct is consistent; therefore it can be 99 * called while there may be concurrent changes, if we don't care about 100 * the value being immediately stale (e.g. from spa_removal_get_stats()). 101 */ 102 uint64_t 103 vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim) 104 { 105 return (vim->vim_phys->vimp_num_entries * sizeof (*vim->vim_entries)); 106 } 107 108 /* 109 * Compare an offset with an indirect mapping entry; there are three 110 * possible scenarios: 111 * 112 * 1. The offset is "less than" the mapping entry; meaning the 113 * offset is less than the source offset of the mapping entry. In 114 * this case, there is no overlap between the offset and the 115 * mapping entry and -1 will be returned. 116 * 117 * 2. The offset is "greater than" the mapping entry; meaning the 118 * offset is greater than the mapping entry's source offset plus 119 * the entry's size. In this case, there is no overlap between 120 * the offset and the mapping entry and 1 will be returned. 121 * 122 * NOTE: If the offset is actually equal to the entry's offset 123 * plus size, this is considered to be "greater" than the entry, 124 * and this case applies (i.e. 1 will be returned). Thus, the 125 * entry's "range" can be considered to be inclusive at its 126 * start, but exclusive at its end: e.g. [src, src + size). 127 * 128 * 3. The last case to consider is if the offset actually falls 129 * within the mapping entry's range. If this is the case, the 130 * offset is considered to be "equal to" the mapping entry and 131 * 0 will be returned. 132 * 133 * NOTE: If the offset is equal to the entry's source offset, 134 * this case applies and 0 will be returned. If the offset is 135 * equal to the entry's source plus its size, this case does 136 * *not* apply (see "NOTE" above for scenario 2), and 1 will be 137 * returned. 138 */ 139 static int 140 dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem) 141 { 142 const uint64_t * const key = v_key; 143 const vdev_indirect_mapping_entry_phys_t * const array_elem = 144 v_array_elem; 145 uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem); 146 147 if (*key < src_offset) { 148 return (-1); 149 } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) { 150 return (0); 151 } else { 152 return (1); 153 } 154 } 155 156 /* 157 * Returns the mapping entry for the given offset. 158 * 159 * It's possible that the given offset will not be in the mapping table 160 * (i.e. no mapping entries contain this offset), in which case, the 161 * return value value depends on the "next_if_missing" parameter. 162 * 163 * If the offset is not found in the table and "next_if_missing" is 164 * B_FALSE, then NULL will always be returned. The behavior is intended 165 * to allow consumers to get the entry corresponding to the offset 166 * parameter, iff the offset overlaps with an entry in the table. 167 * 168 * If the offset is not found in the table and "next_if_missing" is 169 * B_TRUE, then the entry nearest to the given offset will be returned, 170 * such that the entry's source offset is greater than the offset 171 * passed in (i.e. the "next" mapping entry in the table is returned, if 172 * the offset is missing from the table). If there are no entries whose 173 * source offset is greater than the passed in offset, NULL is returned. 174 */ 175 static vdev_indirect_mapping_entry_phys_t * 176 vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t *vim, 177 uint64_t offset, boolean_t next_if_missing) 178 { 179 ASSERT(vdev_indirect_mapping_verify(vim)); 180 ASSERT(vim->vim_phys->vimp_num_entries > 0); 181 182 vdev_indirect_mapping_entry_phys_t *entry = NULL; 183 184 uint64_t last = vim->vim_phys->vimp_num_entries - 1; 185 uint64_t base = 0; 186 187 /* 188 * We don't define these inside of the while loop because we use 189 * their value in the case that offset isn't in the mapping. 190 */ 191 uint64_t mid; 192 int result; 193 194 while (last >= base) { 195 mid = base + ((last - base) >> 1); 196 197 result = dva_mapping_overlap_compare(&offset, 198 &vim->vim_entries[mid]); 199 200 if (result == 0) { 201 entry = &vim->vim_entries[mid]; 202 break; 203 } else if (result < 0) { 204 last = mid - 1; 205 } else { 206 base = mid + 1; 207 } 208 } 209 210 if (entry == NULL && next_if_missing) { 211 ASSERT3U(base, ==, last + 1); 212 ASSERT(mid == base || mid == last); 213 ASSERT3S(result, !=, 0); 214 215 /* 216 * The offset we're looking for isn't actually contained 217 * in the mapping table, thus we need to return the 218 * closest mapping entry that is greater than the 219 * offset. We reuse the result of the last comparison, 220 * comparing the mapping entry at index "mid" and the 221 * offset. The offset is guaranteed to lie between 222 * indices one less than "mid", and one greater than 223 * "mid"; we just need to determine if offset is greater 224 * than, or less than the mapping entry contained at 225 * index "mid". 226 */ 227 228 uint64_t index; 229 if (result < 0) 230 index = mid; 231 else 232 index = mid + 1; 233 234 ASSERT3U(index, <=, vim->vim_phys->vimp_num_entries); 235 236 if (index == vim->vim_phys->vimp_num_entries) { 237 /* 238 * If "index" is past the end of the entries 239 * array, then not only is the offset not in the 240 * mapping table, but it's actually greater than 241 * all entries in the table. In this case, we 242 * can't return a mapping entry greater than the 243 * offset (since none exist), so we return NULL. 244 */ 245 246 ASSERT3S(dva_mapping_overlap_compare(&offset, 247 &vim->vim_entries[index - 1]), >, 0); 248 249 return (NULL); 250 } else { 251 /* 252 * Just to be safe, we verify the offset falls 253 * in between the mapping entries at index and 254 * one less than index. Since we know the offset 255 * doesn't overlap an entry, and we're supposed 256 * to return the entry just greater than the 257 * offset, both of the following tests must be 258 * true. 259 */ 260 ASSERT3S(dva_mapping_overlap_compare(&offset, 261 &vim->vim_entries[index]), <, 0); 262 IMPLY(index >= 1, dva_mapping_overlap_compare(&offset, 263 &vim->vim_entries[index - 1]) > 0); 264 265 return (&vim->vim_entries[index]); 266 } 267 } else { 268 return (entry); 269 } 270 } 271 272 vdev_indirect_mapping_entry_phys_t * 273 vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim, 274 uint64_t offset) 275 { 276 return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset, 277 B_FALSE)); 278 } 279 280 vdev_indirect_mapping_entry_phys_t * 281 vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim, 282 uint64_t offset) 283 { 284 return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset, 285 B_TRUE)); 286 } 287 288 void 289 vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim) 290 { 291 ASSERT(vdev_indirect_mapping_verify(vim)); 292 293 if (vim->vim_phys->vimp_num_entries > 0) { 294 uint64_t map_size = vdev_indirect_mapping_size(vim); 295 vmem_free(vim->vim_entries, map_size); 296 vim->vim_entries = NULL; 297 } 298 299 dmu_buf_rele(vim->vim_dbuf, vim); 300 301 vim->vim_objset = NULL; 302 vim->vim_object = 0; 303 vim->vim_dbuf = NULL; 304 vim->vim_phys = NULL; 305 306 kmem_free(vim, sizeof (*vim)); 307 } 308 309 uint64_t 310 vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx) 311 { 312 uint64_t object; 313 ASSERT(dmu_tx_is_syncing(tx)); 314 uint64_t bonus_size = VDEV_INDIRECT_MAPPING_SIZE_V0; 315 316 if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { 317 bonus_size = sizeof (vdev_indirect_mapping_phys_t); 318 } 319 320 object = dmu_object_alloc(os, 321 DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, 322 DMU_OTN_UINT64_METADATA, bonus_size, 323 tx); 324 325 if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { 326 dmu_buf_t *dbuf; 327 vdev_indirect_mapping_phys_t *vimp; 328 329 VERIFY0(dmu_bonus_hold(os, object, FTAG, &dbuf)); 330 dmu_buf_will_dirty(dbuf, tx); 331 vimp = dbuf->db_data; 332 vimp->vimp_counts_object = dmu_object_alloc(os, 333 DMU_OTN_UINT32_METADATA, SPA_OLD_MAXBLOCKSIZE, 334 DMU_OT_NONE, 0, tx); 335 spa_feature_incr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); 336 dmu_buf_rele(dbuf, FTAG); 337 } 338 339 return (object); 340 } 341 342 343 vdev_indirect_mapping_t * 344 vdev_indirect_mapping_open(objset_t *os, uint64_t mapping_object) 345 { 346 vdev_indirect_mapping_t *vim = kmem_zalloc(sizeof (*vim), KM_SLEEP); 347 dmu_object_info_t doi; 348 VERIFY0(dmu_object_info(os, mapping_object, &doi)); 349 350 vim->vim_objset = os; 351 vim->vim_object = mapping_object; 352 353 VERIFY0(dmu_bonus_hold(os, vim->vim_object, vim, 354 &vim->vim_dbuf)); 355 vim->vim_phys = vim->vim_dbuf->db_data; 356 357 vim->vim_havecounts = 358 (doi.doi_bonus_size > VDEV_INDIRECT_MAPPING_SIZE_V0); 359 360 if (vim->vim_phys->vimp_num_entries > 0) { 361 uint64_t map_size = vdev_indirect_mapping_size(vim); 362 vim->vim_entries = vmem_alloc(map_size, KM_SLEEP); 363 VERIFY0(dmu_read(os, vim->vim_object, 0, map_size, 364 vim->vim_entries, DMU_READ_PREFETCH)); 365 } 366 367 ASSERT(vdev_indirect_mapping_verify(vim)); 368 369 return (vim); 370 } 371 372 void 373 vdev_indirect_mapping_free(objset_t *os, uint64_t object, dmu_tx_t *tx) 374 { 375 vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, object); 376 if (vim->vim_havecounts) { 377 VERIFY0(dmu_object_free(os, vim->vim_phys->vimp_counts_object, 378 tx)); 379 spa_feature_decr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); 380 } 381 vdev_indirect_mapping_close(vim); 382 383 VERIFY0(dmu_object_free(os, object, tx)); 384 } 385 386 /* 387 * Append the list of vdev_indirect_mapping_entry_t's to the on-disk 388 * mapping object. Also remove the entries from the list and free them. 389 * This also implicitly extends the max_offset of the mapping (to the end 390 * of the last entry). 391 */ 392 void 393 vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim, 394 list_t *list, dmu_tx_t *tx) 395 { 396 vdev_indirect_mapping_entry_phys_t *mapbuf; 397 uint64_t old_size; 398 uint32_t *countbuf = NULL; 399 vdev_indirect_mapping_entry_phys_t *old_entries; 400 uint64_t old_count; 401 uint64_t entries_written = 0; 402 403 ASSERT(vdev_indirect_mapping_verify(vim)); 404 ASSERT(dmu_tx_is_syncing(tx)); 405 ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx))); 406 ASSERT(!list_is_empty(list)); 407 408 old_size = vdev_indirect_mapping_size(vim); 409 old_entries = vim->vim_entries; 410 old_count = vim->vim_phys->vimp_num_entries; 411 412 dmu_buf_will_dirty(vim->vim_dbuf, tx); 413 414 mapbuf = vmem_alloc(SPA_OLD_MAXBLOCKSIZE, KM_SLEEP); 415 if (vim->vim_havecounts) { 416 countbuf = vmem_alloc(SPA_OLD_MAXBLOCKSIZE, KM_SLEEP); 417 ASSERT(spa_feature_is_active(vim->vim_objset->os_spa, 418 SPA_FEATURE_OBSOLETE_COUNTS)); 419 } 420 while (!list_is_empty(list)) { 421 uint64_t i; 422 /* 423 * Write entries from the list to the 424 * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE. 425 */ 426 for (i = 0; i < SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf); i++) { 427 vdev_indirect_mapping_entry_t *entry = 428 list_remove_head(list); 429 if (entry == NULL) 430 break; 431 432 uint64_t size = 433 DVA_GET_ASIZE(&entry->vime_mapping.vimep_dst); 434 uint64_t src_offset = 435 DVA_MAPPING_GET_SRC_OFFSET(&entry->vime_mapping); 436 437 /* 438 * We shouldn't be adding an entry which is fully 439 * obsolete. 440 */ 441 ASSERT3U(entry->vime_obsolete_count, <, size); 442 IMPLY(entry->vime_obsolete_count != 0, 443 vim->vim_havecounts); 444 445 mapbuf[i] = entry->vime_mapping; 446 if (vim->vim_havecounts) 447 countbuf[i] = entry->vime_obsolete_count; 448 449 vim->vim_phys->vimp_bytes_mapped += size; 450 ASSERT3U(src_offset, >=, 451 vim->vim_phys->vimp_max_offset); 452 vim->vim_phys->vimp_max_offset = src_offset + size; 453 454 entries_written++; 455 456 vmem_free(entry, sizeof (*entry)); 457 } 458 dmu_write(vim->vim_objset, vim->vim_object, 459 vim->vim_phys->vimp_num_entries * sizeof (*mapbuf), 460 i * sizeof (*mapbuf), 461 mapbuf, tx); 462 if (vim->vim_havecounts) { 463 dmu_write(vim->vim_objset, 464 vim->vim_phys->vimp_counts_object, 465 vim->vim_phys->vimp_num_entries * 466 sizeof (*countbuf), 467 i * sizeof (*countbuf), countbuf, tx); 468 } 469 vim->vim_phys->vimp_num_entries += i; 470 } 471 vmem_free(mapbuf, SPA_OLD_MAXBLOCKSIZE); 472 if (vim->vim_havecounts) 473 vmem_free(countbuf, SPA_OLD_MAXBLOCKSIZE); 474 475 /* 476 * Update the entry array to reflect the new entries. First, copy 477 * over any old entries then read back the new entries we just wrote. 478 */ 479 uint64_t new_size = vdev_indirect_mapping_size(vim); 480 ASSERT3U(new_size, >, old_size); 481 ASSERT3U(new_size - old_size, ==, 482 entries_written * sizeof (vdev_indirect_mapping_entry_phys_t)); 483 vim->vim_entries = vmem_alloc(new_size, KM_SLEEP); 484 if (old_size > 0) { 485 memcpy(vim->vim_entries, old_entries, old_size); 486 vmem_free(old_entries, old_size); 487 } 488 VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size, 489 new_size - old_size, &vim->vim_entries[old_count], 490 DMU_READ_PREFETCH)); 491 492 zfs_dbgmsg("txg %llu: wrote %llu entries to " 493 "indirect mapping obj %llu; max offset=0x%llx", 494 (u_longlong_t)dmu_tx_get_txg(tx), 495 (u_longlong_t)entries_written, 496 (u_longlong_t)vim->vim_object, 497 (u_longlong_t)vim->vim_phys->vimp_max_offset); 498 } 499 500 /* 501 * Increment the relevant counts for the specified offset and length. 502 * The counts array must be obtained from 503 * vdev_indirect_mapping_load_obsolete_counts(). 504 */ 505 void 506 vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t *vim, 507 uint64_t offset, uint64_t length, uint32_t *counts) 508 { 509 vdev_indirect_mapping_entry_phys_t *mapping; 510 uint64_t index; 511 512 mapping = vdev_indirect_mapping_entry_for_offset(vim, offset); 513 514 ASSERT(length > 0); 515 ASSERT3P(mapping, !=, NULL); 516 517 index = mapping - vim->vim_entries; 518 519 while (length > 0) { 520 ASSERT3U(index, <, vdev_indirect_mapping_num_entries(vim)); 521 522 uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst); 523 uint64_t inner_offset = offset - 524 DVA_MAPPING_GET_SRC_OFFSET(mapping); 525 VERIFY3U(inner_offset, <, size); 526 uint64_t inner_size = MIN(length, size - inner_offset); 527 528 VERIFY3U(counts[index] + inner_size, <=, size); 529 counts[index] += inner_size; 530 531 offset += inner_size; 532 length -= inner_size; 533 mapping++; 534 index++; 535 } 536 } 537 538 typedef struct load_obsolete_space_map_arg { 539 vdev_indirect_mapping_t *losma_vim; 540 uint32_t *losma_counts; 541 } load_obsolete_space_map_arg_t; 542 543 static int 544 load_obsolete_sm_callback(space_map_entry_t *sme, void *arg) 545 { 546 load_obsolete_space_map_arg_t *losma = arg; 547 ASSERT3S(sme->sme_type, ==, SM_ALLOC); 548 549 vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim, 550 sme->sme_offset, sme->sme_run, losma->losma_counts); 551 552 return (0); 553 } 554 555 /* 556 * Modify the counts (increment them) based on the spacemap. 557 */ 558 void 559 vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim, 560 uint32_t *counts, space_map_t *obsolete_space_sm) 561 { 562 load_obsolete_space_map_arg_t losma; 563 losma.losma_counts = counts; 564 losma.losma_vim = vim; 565 VERIFY0(space_map_iterate(obsolete_space_sm, 566 space_map_length(obsolete_space_sm), 567 load_obsolete_sm_callback, &losma)); 568 } 569 570 /* 571 * Read the obsolete counts from disk, returning them in an array. 572 */ 573 uint32_t * 574 vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim) 575 { 576 ASSERT(vdev_indirect_mapping_verify(vim)); 577 578 uint64_t counts_size = 579 vim->vim_phys->vimp_num_entries * sizeof (uint32_t); 580 uint32_t *counts = vmem_alloc(counts_size, KM_SLEEP); 581 if (vim->vim_havecounts) { 582 VERIFY0(dmu_read(vim->vim_objset, 583 vim->vim_phys->vimp_counts_object, 584 0, counts_size, 585 counts, DMU_READ_PREFETCH)); 586 } else { 587 memset(counts, 0, counts_size); 588 } 589 return (counts); 590 } 591 592 extern void 593 vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t *vim, 594 uint32_t *counts) 595 { 596 ASSERT(vdev_indirect_mapping_verify(vim)); 597 598 vmem_free(counts, vim->vim_phys->vimp_num_entries * sizeof (uint32_t)); 599 } 600 601 #if defined(_KERNEL) 602 EXPORT_SYMBOL(vdev_indirect_mapping_add_entries); 603 EXPORT_SYMBOL(vdev_indirect_mapping_alloc); 604 EXPORT_SYMBOL(vdev_indirect_mapping_bytes_mapped); 605 EXPORT_SYMBOL(vdev_indirect_mapping_close); 606 EXPORT_SYMBOL(vdev_indirect_mapping_entry_for_offset); 607 EXPORT_SYMBOL(vdev_indirect_mapping_entry_for_offset_or_next); 608 EXPORT_SYMBOL(vdev_indirect_mapping_free); 609 EXPORT_SYMBOL(vdev_indirect_mapping_free_obsolete_counts); 610 EXPORT_SYMBOL(vdev_indirect_mapping_increment_obsolete_count); 611 EXPORT_SYMBOL(vdev_indirect_mapping_load_obsolete_counts); 612 EXPORT_SYMBOL(vdev_indirect_mapping_load_obsolete_spacemap); 613 EXPORT_SYMBOL(vdev_indirect_mapping_max_offset); 614 EXPORT_SYMBOL(vdev_indirect_mapping_num_entries); 615 EXPORT_SYMBOL(vdev_indirect_mapping_object); 616 EXPORT_SYMBOL(vdev_indirect_mapping_open); 617 EXPORT_SYMBOL(vdev_indirect_mapping_size); 618 #endif 619