1 /* 2 * CDDL HEADER START 3 * 4 * This file and its contents are supplied under the terms of the 5 * Common Development and Distribution License ("CDDL"), version 1.0. 6 * You may only use this file in accordance with the terms of version 7 * 1.0 of the CDDL. 8 * 9 * A full copy of the text of the CDDL should have accompanied this 10 * source. A copy of the CDDL is also available via the Internet at 11 * http://www.illumos.org/license/CDDL. 12 * 13 * CDDL HEADER END 14 */ 15 16 /* 17 * Copyright (c) 2015, 2017 by Delphix. All rights reserved. 18 */ 19 20 #include <sys/dmu_tx.h> 21 #include <sys/dsl_pool.h> 22 #include <sys/spa.h> 23 #include <sys/vdev_impl.h> 24 #include <sys/vdev_indirect_mapping.h> 25 #include <sys/zfeature.h> 26 #include <sys/dmu_objset.h> 27 28 #ifdef ZFS_DEBUG 29 static boolean_t 30 vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim) 31 { 32 ASSERT(vim != NULL); 33 34 ASSERT(vim->vim_object != 0); 35 ASSERT(vim->vim_objset != NULL); 36 ASSERT(vim->vim_phys != NULL); 37 ASSERT(vim->vim_dbuf != NULL); 38 39 EQUIV(vim->vim_phys->vimp_num_entries > 0, 40 vim->vim_entries != NULL); 41 if (vim->vim_phys->vimp_num_entries > 0) { 42 vdev_indirect_mapping_entry_phys_t *last_entry __maybe_unused = 43 &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1]; 44 uint64_t offset __maybe_unused = 45 DVA_MAPPING_GET_SRC_OFFSET(last_entry); 46 uint64_t size __maybe_unused = 47 DVA_GET_ASIZE(&last_entry->vimep_dst); 48 49 ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size); 50 } 51 if (vim->vim_havecounts) { 52 ASSERT(vim->vim_phys->vimp_counts_object != 0); 53 } 54 55 return (B_TRUE); 56 } 57 #endif 58 59 uint64_t 60 vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim) 61 { 62 ASSERT(vdev_indirect_mapping_verify(vim)); 63 64 return (vim->vim_phys->vimp_num_entries); 65 } 66 67 uint64_t 68 vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim) 69 { 70 ASSERT(vdev_indirect_mapping_verify(vim)); 71 72 return (vim->vim_phys->vimp_max_offset); 73 } 74 75 uint64_t 76 vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim) 77 { 78 ASSERT(vdev_indirect_mapping_verify(vim)); 79 80 return (vim->vim_object); 81 } 82 83 uint64_t 84 vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t *vim) 85 { 86 ASSERT(vdev_indirect_mapping_verify(vim)); 87 88 return (vim->vim_phys->vimp_bytes_mapped); 89 } 90 91 /* 92 * The length (in bytes) of the mapping object array in memory and 93 * (logically) on disk. 94 * 95 * Note that unlike most of our accessor functions, 96 * we don't assert that the struct is consistent; therefore it can be 97 * called while there may be concurrent changes, if we don't care about 98 * the value being immediately stale (e.g. from spa_removal_get_stats()). 99 */ 100 uint64_t 101 vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim) 102 { 103 return (vim->vim_phys->vimp_num_entries * sizeof (*vim->vim_entries)); 104 } 105 106 /* 107 * Compare an offset with an indirect mapping entry; there are three 108 * possible scenarios: 109 * 110 * 1. The offset is "less than" the mapping entry; meaning the 111 * offset is less than the source offset of the mapping entry. In 112 * this case, there is no overlap between the offset and the 113 * mapping entry and -1 will be returned. 114 * 115 * 2. The offset is "greater than" the mapping entry; meaning the 116 * offset is greater than the mapping entry's source offset plus 117 * the entry's size. In this case, there is no overlap between 118 * the offset and the mapping entry and 1 will be returned. 119 * 120 * NOTE: If the offset is actually equal to the entry's offset 121 * plus size, this is considered to be "greater" than the entry, 122 * and this case applies (i.e. 1 will be returned). Thus, the 123 * entry's "range" can be considered to be inclusive at its 124 * start, but exclusive at its end: e.g. [src, src + size). 125 * 126 * 3. The last case to consider is if the offset actually falls 127 * within the mapping entry's range. If this is the case, the 128 * offset is considered to be "equal to" the mapping entry and 129 * 0 will be returned. 130 * 131 * NOTE: If the offset is equal to the entry's source offset, 132 * this case applies and 0 will be returned. If the offset is 133 * equal to the entry's source plus its size, this case does 134 * *not* apply (see "NOTE" above for scenario 2), and 1 will be 135 * returned. 136 */ 137 static int 138 dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem) 139 { 140 const uint64_t * const key = v_key; 141 const vdev_indirect_mapping_entry_phys_t * const array_elem = 142 v_array_elem; 143 uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem); 144 145 if (*key < src_offset) { 146 return (-1); 147 } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) { 148 return (0); 149 } else { 150 return (1); 151 } 152 } 153 154 /* 155 * Returns the mapping entry for the given offset. 156 * 157 * It's possible that the given offset will not be in the mapping table 158 * (i.e. no mapping entries contain this offset), in which case, the 159 * return value value depends on the "next_if_missing" parameter. 160 * 161 * If the offset is not found in the table and "next_if_missing" is 162 * B_FALSE, then NULL will always be returned. The behavior is intended 163 * to allow consumers to get the entry corresponding to the offset 164 * parameter, iff the offset overlaps with an entry in the table. 165 * 166 * If the offset is not found in the table and "next_if_missing" is 167 * B_TRUE, then the entry nearest to the given offset will be returned, 168 * such that the entry's source offset is greater than the offset 169 * passed in (i.e. the "next" mapping entry in the table is returned, if 170 * the offset is missing from the table). If there are no entries whose 171 * source offset is greater than the passed in offset, NULL is returned. 172 */ 173 static vdev_indirect_mapping_entry_phys_t * 174 vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t *vim, 175 uint64_t offset, boolean_t next_if_missing) 176 { 177 ASSERT(vdev_indirect_mapping_verify(vim)); 178 ASSERT(vim->vim_phys->vimp_num_entries > 0); 179 180 vdev_indirect_mapping_entry_phys_t *entry = NULL; 181 182 uint64_t last = vim->vim_phys->vimp_num_entries - 1; 183 uint64_t base = 0; 184 185 /* 186 * We don't define these inside of the while loop because we use 187 * their value in the case that offset isn't in the mapping. 188 */ 189 uint64_t mid; 190 int result; 191 192 while (last >= base) { 193 mid = base + ((last - base) >> 1); 194 195 result = dva_mapping_overlap_compare(&offset, 196 &vim->vim_entries[mid]); 197 198 if (result == 0) { 199 entry = &vim->vim_entries[mid]; 200 break; 201 } else if (result < 0) { 202 last = mid - 1; 203 } else { 204 base = mid + 1; 205 } 206 } 207 208 if (entry == NULL && next_if_missing) { 209 ASSERT3U(base, ==, last + 1); 210 ASSERT(mid == base || mid == last); 211 ASSERT3S(result, !=, 0); 212 213 /* 214 * The offset we're looking for isn't actually contained 215 * in the mapping table, thus we need to return the 216 * closest mapping entry that is greater than the 217 * offset. We reuse the result of the last comparison, 218 * comparing the mapping entry at index "mid" and the 219 * offset. The offset is guaranteed to lie between 220 * indices one less than "mid", and one greater than 221 * "mid"; we just need to determine if offset is greater 222 * than, or less than the mapping entry contained at 223 * index "mid". 224 */ 225 226 uint64_t index; 227 if (result < 0) 228 index = mid; 229 else 230 index = mid + 1; 231 232 ASSERT3U(index, <=, vim->vim_phys->vimp_num_entries); 233 234 if (index == vim->vim_phys->vimp_num_entries) { 235 /* 236 * If "index" is past the end of the entries 237 * array, then not only is the offset not in the 238 * mapping table, but it's actually greater than 239 * all entries in the table. In this case, we 240 * can't return a mapping entry greater than the 241 * offset (since none exist), so we return NULL. 242 */ 243 244 ASSERT3S(dva_mapping_overlap_compare(&offset, 245 &vim->vim_entries[index - 1]), >, 0); 246 247 return (NULL); 248 } else { 249 /* 250 * Just to be safe, we verify the offset falls 251 * in between the mapping entries at index and 252 * one less than index. Since we know the offset 253 * doesn't overlap an entry, and we're supposed 254 * to return the entry just greater than the 255 * offset, both of the following tests must be 256 * true. 257 */ 258 ASSERT3S(dva_mapping_overlap_compare(&offset, 259 &vim->vim_entries[index]), <, 0); 260 IMPLY(index >= 1, dva_mapping_overlap_compare(&offset, 261 &vim->vim_entries[index - 1]) > 0); 262 263 return (&vim->vim_entries[index]); 264 } 265 } else { 266 return (entry); 267 } 268 } 269 270 vdev_indirect_mapping_entry_phys_t * 271 vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim, 272 uint64_t offset) 273 { 274 return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset, 275 B_FALSE)); 276 } 277 278 vdev_indirect_mapping_entry_phys_t * 279 vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim, 280 uint64_t offset) 281 { 282 return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset, 283 B_TRUE)); 284 } 285 286 void 287 vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim) 288 { 289 ASSERT(vdev_indirect_mapping_verify(vim)); 290 291 if (vim->vim_phys->vimp_num_entries > 0) { 292 uint64_t map_size = vdev_indirect_mapping_size(vim); 293 vmem_free(vim->vim_entries, map_size); 294 vim->vim_entries = NULL; 295 } 296 297 dmu_buf_rele(vim->vim_dbuf, vim); 298 299 vim->vim_objset = NULL; 300 vim->vim_object = 0; 301 vim->vim_dbuf = NULL; 302 vim->vim_phys = NULL; 303 304 kmem_free(vim, sizeof (*vim)); 305 } 306 307 uint64_t 308 vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx) 309 { 310 uint64_t object; 311 ASSERT(dmu_tx_is_syncing(tx)); 312 uint64_t bonus_size = VDEV_INDIRECT_MAPPING_SIZE_V0; 313 314 if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { 315 bonus_size = sizeof (vdev_indirect_mapping_phys_t); 316 } 317 318 object = dmu_object_alloc(os, 319 DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, 320 DMU_OTN_UINT64_METADATA, bonus_size, 321 tx); 322 323 if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { 324 dmu_buf_t *dbuf; 325 vdev_indirect_mapping_phys_t *vimp; 326 327 VERIFY0(dmu_bonus_hold(os, object, FTAG, &dbuf)); 328 dmu_buf_will_dirty(dbuf, tx); 329 vimp = dbuf->db_data; 330 vimp->vimp_counts_object = dmu_object_alloc(os, 331 DMU_OTN_UINT32_METADATA, SPA_OLD_MAXBLOCKSIZE, 332 DMU_OT_NONE, 0, tx); 333 spa_feature_incr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); 334 dmu_buf_rele(dbuf, FTAG); 335 } 336 337 return (object); 338 } 339 340 341 vdev_indirect_mapping_t * 342 vdev_indirect_mapping_open(objset_t *os, uint64_t mapping_object) 343 { 344 vdev_indirect_mapping_t *vim = kmem_zalloc(sizeof (*vim), KM_SLEEP); 345 dmu_object_info_t doi; 346 VERIFY0(dmu_object_info(os, mapping_object, &doi)); 347 348 vim->vim_objset = os; 349 vim->vim_object = mapping_object; 350 351 VERIFY0(dmu_bonus_hold(os, vim->vim_object, vim, 352 &vim->vim_dbuf)); 353 vim->vim_phys = vim->vim_dbuf->db_data; 354 355 vim->vim_havecounts = 356 (doi.doi_bonus_size > VDEV_INDIRECT_MAPPING_SIZE_V0); 357 358 if (vim->vim_phys->vimp_num_entries > 0) { 359 uint64_t map_size = vdev_indirect_mapping_size(vim); 360 vim->vim_entries = vmem_alloc(map_size, KM_SLEEP); 361 VERIFY0(dmu_read(os, vim->vim_object, 0, map_size, 362 vim->vim_entries, DMU_READ_PREFETCH)); 363 } 364 365 ASSERT(vdev_indirect_mapping_verify(vim)); 366 367 return (vim); 368 } 369 370 void 371 vdev_indirect_mapping_free(objset_t *os, uint64_t object, dmu_tx_t *tx) 372 { 373 vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, object); 374 if (vim->vim_havecounts) { 375 VERIFY0(dmu_object_free(os, vim->vim_phys->vimp_counts_object, 376 tx)); 377 spa_feature_decr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); 378 } 379 vdev_indirect_mapping_close(vim); 380 381 VERIFY0(dmu_object_free(os, object, tx)); 382 } 383 384 /* 385 * Append the list of vdev_indirect_mapping_entry_t's to the on-disk 386 * mapping object. Also remove the entries from the list and free them. 387 * This also implicitly extends the max_offset of the mapping (to the end 388 * of the last entry). 389 */ 390 void 391 vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim, 392 list_t *list, dmu_tx_t *tx) 393 { 394 vdev_indirect_mapping_entry_phys_t *mapbuf; 395 uint64_t old_size; 396 uint32_t *countbuf = NULL; 397 vdev_indirect_mapping_entry_phys_t *old_entries; 398 uint64_t old_count; 399 uint64_t entries_written = 0; 400 401 ASSERT(vdev_indirect_mapping_verify(vim)); 402 ASSERT(dmu_tx_is_syncing(tx)); 403 ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx))); 404 ASSERT(!list_is_empty(list)); 405 406 old_size = vdev_indirect_mapping_size(vim); 407 old_entries = vim->vim_entries; 408 old_count = vim->vim_phys->vimp_num_entries; 409 410 dmu_buf_will_dirty(vim->vim_dbuf, tx); 411 412 mapbuf = vmem_alloc(SPA_OLD_MAXBLOCKSIZE, KM_SLEEP); 413 if (vim->vim_havecounts) { 414 countbuf = vmem_alloc(SPA_OLD_MAXBLOCKSIZE, KM_SLEEP); 415 ASSERT(spa_feature_is_active(vim->vim_objset->os_spa, 416 SPA_FEATURE_OBSOLETE_COUNTS)); 417 } 418 while (!list_is_empty(list)) { 419 uint64_t i; 420 /* 421 * Write entries from the list to the 422 * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE. 423 */ 424 for (i = 0; i < SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf); i++) { 425 vdev_indirect_mapping_entry_t *entry = 426 list_remove_head(list); 427 if (entry == NULL) 428 break; 429 430 uint64_t size = 431 DVA_GET_ASIZE(&entry->vime_mapping.vimep_dst); 432 uint64_t src_offset = 433 DVA_MAPPING_GET_SRC_OFFSET(&entry->vime_mapping); 434 435 /* 436 * We shouldn't be adding an entry which is fully 437 * obsolete. 438 */ 439 ASSERT3U(entry->vime_obsolete_count, <, size); 440 IMPLY(entry->vime_obsolete_count != 0, 441 vim->vim_havecounts); 442 443 mapbuf[i] = entry->vime_mapping; 444 if (vim->vim_havecounts) 445 countbuf[i] = entry->vime_obsolete_count; 446 447 vim->vim_phys->vimp_bytes_mapped += size; 448 ASSERT3U(src_offset, >=, 449 vim->vim_phys->vimp_max_offset); 450 vim->vim_phys->vimp_max_offset = src_offset + size; 451 452 entries_written++; 453 454 vmem_free(entry, sizeof (*entry)); 455 } 456 dmu_write(vim->vim_objset, vim->vim_object, 457 vim->vim_phys->vimp_num_entries * sizeof (*mapbuf), 458 i * sizeof (*mapbuf), 459 mapbuf, tx); 460 if (vim->vim_havecounts) { 461 dmu_write(vim->vim_objset, 462 vim->vim_phys->vimp_counts_object, 463 vim->vim_phys->vimp_num_entries * 464 sizeof (*countbuf), 465 i * sizeof (*countbuf), countbuf, tx); 466 } 467 vim->vim_phys->vimp_num_entries += i; 468 } 469 vmem_free(mapbuf, SPA_OLD_MAXBLOCKSIZE); 470 if (vim->vim_havecounts) 471 vmem_free(countbuf, SPA_OLD_MAXBLOCKSIZE); 472 473 /* 474 * Update the entry array to reflect the new entries. First, copy 475 * over any old entries then read back the new entries we just wrote. 476 */ 477 uint64_t new_size = vdev_indirect_mapping_size(vim); 478 ASSERT3U(new_size, >, old_size); 479 ASSERT3U(new_size - old_size, ==, 480 entries_written * sizeof (vdev_indirect_mapping_entry_phys_t)); 481 vim->vim_entries = vmem_alloc(new_size, KM_SLEEP); 482 if (old_size > 0) { 483 bcopy(old_entries, vim->vim_entries, old_size); 484 vmem_free(old_entries, old_size); 485 } 486 VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size, 487 new_size - old_size, &vim->vim_entries[old_count], 488 DMU_READ_PREFETCH)); 489 490 zfs_dbgmsg("txg %llu: wrote %llu entries to " 491 "indirect mapping obj %llu; max offset=0x%llx", 492 (u_longlong_t)dmu_tx_get_txg(tx), 493 (u_longlong_t)entries_written, 494 (u_longlong_t)vim->vim_object, 495 (u_longlong_t)vim->vim_phys->vimp_max_offset); 496 } 497 498 /* 499 * Increment the relevant counts for the specified offset and length. 500 * The counts array must be obtained from 501 * vdev_indirect_mapping_load_obsolete_counts(). 502 */ 503 void 504 vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t *vim, 505 uint64_t offset, uint64_t length, uint32_t *counts) 506 { 507 vdev_indirect_mapping_entry_phys_t *mapping; 508 uint64_t index; 509 510 mapping = vdev_indirect_mapping_entry_for_offset(vim, offset); 511 512 ASSERT(length > 0); 513 ASSERT3P(mapping, !=, NULL); 514 515 index = mapping - vim->vim_entries; 516 517 while (length > 0) { 518 ASSERT3U(index, <, vdev_indirect_mapping_num_entries(vim)); 519 520 uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst); 521 uint64_t inner_offset = offset - 522 DVA_MAPPING_GET_SRC_OFFSET(mapping); 523 VERIFY3U(inner_offset, <, size); 524 uint64_t inner_size = MIN(length, size - inner_offset); 525 526 VERIFY3U(counts[index] + inner_size, <=, size); 527 counts[index] += inner_size; 528 529 offset += inner_size; 530 length -= inner_size; 531 mapping++; 532 index++; 533 } 534 } 535 536 typedef struct load_obsolete_space_map_arg { 537 vdev_indirect_mapping_t *losma_vim; 538 uint32_t *losma_counts; 539 } load_obsolete_space_map_arg_t; 540 541 static int 542 load_obsolete_sm_callback(space_map_entry_t *sme, void *arg) 543 { 544 load_obsolete_space_map_arg_t *losma = arg; 545 ASSERT3S(sme->sme_type, ==, SM_ALLOC); 546 547 vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim, 548 sme->sme_offset, sme->sme_run, losma->losma_counts); 549 550 return (0); 551 } 552 553 /* 554 * Modify the counts (increment them) based on the spacemap. 555 */ 556 void 557 vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim, 558 uint32_t *counts, space_map_t *obsolete_space_sm) 559 { 560 load_obsolete_space_map_arg_t losma; 561 losma.losma_counts = counts; 562 losma.losma_vim = vim; 563 VERIFY0(space_map_iterate(obsolete_space_sm, 564 space_map_length(obsolete_space_sm), 565 load_obsolete_sm_callback, &losma)); 566 } 567 568 /* 569 * Read the obsolete counts from disk, returning them in an array. 570 */ 571 uint32_t * 572 vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim) 573 { 574 ASSERT(vdev_indirect_mapping_verify(vim)); 575 576 uint64_t counts_size = 577 vim->vim_phys->vimp_num_entries * sizeof (uint32_t); 578 uint32_t *counts = vmem_alloc(counts_size, KM_SLEEP); 579 if (vim->vim_havecounts) { 580 VERIFY0(dmu_read(vim->vim_objset, 581 vim->vim_phys->vimp_counts_object, 582 0, counts_size, 583 counts, DMU_READ_PREFETCH)); 584 } else { 585 bzero(counts, counts_size); 586 } 587 return (counts); 588 } 589 590 extern void 591 vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t *vim, 592 uint32_t *counts) 593 { 594 ASSERT(vdev_indirect_mapping_verify(vim)); 595 596 vmem_free(counts, vim->vim_phys->vimp_num_entries * sizeof (uint32_t)); 597 } 598 599 #if defined(_KERNEL) 600 EXPORT_SYMBOL(vdev_indirect_mapping_add_entries); 601 EXPORT_SYMBOL(vdev_indirect_mapping_alloc); 602 EXPORT_SYMBOL(vdev_indirect_mapping_bytes_mapped); 603 EXPORT_SYMBOL(vdev_indirect_mapping_close); 604 EXPORT_SYMBOL(vdev_indirect_mapping_entry_for_offset); 605 EXPORT_SYMBOL(vdev_indirect_mapping_entry_for_offset_or_next); 606 EXPORT_SYMBOL(vdev_indirect_mapping_free); 607 EXPORT_SYMBOL(vdev_indirect_mapping_free_obsolete_counts); 608 EXPORT_SYMBOL(vdev_indirect_mapping_increment_obsolete_count); 609 EXPORT_SYMBOL(vdev_indirect_mapping_load_obsolete_counts); 610 EXPORT_SYMBOL(vdev_indirect_mapping_load_obsolete_spacemap); 611 EXPORT_SYMBOL(vdev_indirect_mapping_max_offset); 612 EXPORT_SYMBOL(vdev_indirect_mapping_num_entries); 613 EXPORT_SYMBOL(vdev_indirect_mapping_object); 614 EXPORT_SYMBOL(vdev_indirect_mapping_open); 615 EXPORT_SYMBOL(vdev_indirect_mapping_size); 616 #endif 617