1 /* 2 * CDDL HEADER START 3 * 4 * This file and its contents are supplied under the terms of the 5 * Common Development and Distribution License ("CDDL"), version 1.0. 6 * You may only use this file in accordance with the terms of version 7 * 1.0 of the CDDL. 8 * 9 * A full copy of the text of the CDDL should have accompanied this 10 * source. A copy of the CDDL is also available via the Internet at 11 * http://www.illumos.org/license/CDDL. 12 * 13 * CDDL HEADER END 14 */ 15 16 /* 17 * Copyright (c) 2015, 2017 by Delphix. All rights reserved. 18 */ 19 20 #include <sys/dmu_tx.h> 21 #include <sys/dsl_pool.h> 22 #include <sys/spa.h> 23 #include <sys/vdev_impl.h> 24 #include <sys/vdev_indirect_mapping.h> 25 #include <sys/zfeature.h> 26 #include <sys/dmu_objset.h> 27 28 static boolean_t 29 vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim) 30 { 31 ASSERT(vim != NULL); 32 33 ASSERT(vim->vim_object != 0); 34 ASSERT(vim->vim_objset != NULL); 35 ASSERT(vim->vim_phys != NULL); 36 ASSERT(vim->vim_dbuf != NULL); 37 38 EQUIV(vim->vim_phys->vimp_num_entries > 0, 39 vim->vim_entries != NULL); 40 if (vim->vim_phys->vimp_num_entries > 0) { 41 vdev_indirect_mapping_entry_phys_t *last_entry = 42 &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1]; 43 uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(last_entry); 44 uint64_t size = DVA_GET_ASIZE(&last_entry->vimep_dst); 45 46 ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size); 47 } 48 if (vim->vim_havecounts) { 49 ASSERT(vim->vim_phys->vimp_counts_object != 0); 50 } 51 52 return (B_TRUE); 53 } 54 55 uint64_t 56 vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim) 57 { 58 ASSERT(vdev_indirect_mapping_verify(vim)); 59 60 return (vim->vim_phys->vimp_num_entries); 61 } 62 63 uint64_t 64 vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim) 65 { 66 ASSERT(vdev_indirect_mapping_verify(vim)); 67 68 return (vim->vim_phys->vimp_max_offset); 69 } 70 71 uint64_t 72 vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim) 73 { 74 ASSERT(vdev_indirect_mapping_verify(vim)); 75 76 return (vim->vim_object); 77 } 78 79 uint64_t 80 vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t *vim) 81 { 82 ASSERT(vdev_indirect_mapping_verify(vim)); 83 84 return (vim->vim_phys->vimp_bytes_mapped); 85 } 86 87 /* 88 * The length (in bytes) of the mapping object array in memory and 89 * (logically) on disk. 90 * 91 * Note that unlike most of our accessor functions, 92 * we don't assert that the struct is consistent; therefore it can be 93 * called while there may be concurrent changes, if we don't care about 94 * the value being immediately stale (e.g. from spa_removal_get_stats()). 95 */ 96 uint64_t 97 vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim) 98 { 99 return (vim->vim_phys->vimp_num_entries * sizeof (*vim->vim_entries)); 100 } 101 102 /* 103 * Compare an offset with an indirect mapping entry; there are three 104 * possible scenarios: 105 * 106 * 1. The offset is "less than" the mapping entry; meaning the 107 * offset is less than the source offset of the mapping entry. In 108 * this case, there is no overlap between the offset and the 109 * mapping entry and -1 will be returned. 110 * 111 * 2. The offset is "greater than" the mapping entry; meaning the 112 * offset is greater than the mapping entry's source offset plus 113 * the entry's size. In this case, there is no overlap between 114 * the offset and the mapping entry and 1 will be returned. 115 * 116 * NOTE: If the offset is actually equal to the entry's offset 117 * plus size, this is considered to be "greater" than the entry, 118 * and this case applies (i.e. 1 will be returned). Thus, the 119 * entry's "range" can be considered to be inclusive at its 120 * start, but exclusive at its end: e.g. [src, src + size). 121 * 122 * 3. The last case to consider is if the offset actually falls 123 * within the mapping entry's range. If this is the case, the 124 * offset is considered to be "equal to" the mapping entry and 125 * 0 will be returned. 126 * 127 * NOTE: If the offset is equal to the entry's source offset, 128 * this case applies and 0 will be returned. If the offset is 129 * equal to the entry's source plus its size, this case does 130 * *not* apply (see "NOTE" above for scenario 2), and 1 will be 131 * returned. 132 */ 133 static int 134 dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem) 135 { 136 const uint64_t *key = v_key; 137 const vdev_indirect_mapping_entry_phys_t *array_elem = 138 v_array_elem; 139 uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem); 140 141 if (*key < src_offset) { 142 return (-1); 143 } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) { 144 return (0); 145 } else { 146 return (1); 147 } 148 } 149 150 /* 151 * Returns the mapping entry for the given offset. 152 * 153 * It's possible that the given offset will not be in the mapping table 154 * (i.e. no mapping entries contain this offset), in which case, the 155 * return value value depends on the "next_if_missing" parameter. 156 * 157 * If the offset is not found in the table and "next_if_missing" is 158 * B_FALSE, then NULL will always be returned. The behavior is intended 159 * to allow consumers to get the entry corresponding to the offset 160 * parameter, iff the offset overlaps with an entry in the table. 161 * 162 * If the offset is not found in the table and "next_if_missing" is 163 * B_TRUE, then the entry nearest to the given offset will be returned, 164 * such that the entry's source offset is greater than the offset 165 * passed in (i.e. the "next" mapping entry in the table is returned, if 166 * the offset is missing from the table). If there are no entries whose 167 * source offset is greater than the passed in offset, NULL is returned. 168 */ 169 static vdev_indirect_mapping_entry_phys_t * 170 vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t *vim, 171 uint64_t offset, boolean_t next_if_missing) 172 { 173 ASSERT(vdev_indirect_mapping_verify(vim)); 174 ASSERT(vim->vim_phys->vimp_num_entries > 0); 175 176 vdev_indirect_mapping_entry_phys_t *entry = NULL; 177 178 uint64_t last = vim->vim_phys->vimp_num_entries - 1; 179 uint64_t base = 0; 180 181 /* 182 * We don't define these inside of the while loop because we use 183 * their value in the case that offset isn't in the mapping. 184 */ 185 uint64_t mid; 186 int result; 187 188 while (last >= base) { 189 mid = base + ((last - base) >> 1); 190 191 result = dva_mapping_overlap_compare(&offset, 192 &vim->vim_entries[mid]); 193 194 if (result == 0) { 195 entry = &vim->vim_entries[mid]; 196 break; 197 } else if (result < 0) { 198 last = mid - 1; 199 } else { 200 base = mid + 1; 201 } 202 } 203 204 if (entry == NULL && next_if_missing) { 205 ASSERT3U(base, ==, last + 1); 206 ASSERT(mid == base || mid == last); 207 ASSERT3S(result, !=, 0); 208 209 /* 210 * The offset we're looking for isn't actually contained 211 * in the mapping table, thus we need to return the 212 * closest mapping entry that is greater than the 213 * offset. We reuse the result of the last comparison, 214 * comparing the mapping entry at index "mid" and the 215 * offset. The offset is guaranteed to lie between 216 * indices one less than "mid", and one greater than 217 * "mid"; we just need to determine if offset is greater 218 * than, or less than the mapping entry contained at 219 * index "mid". 220 */ 221 222 uint64_t index; 223 if (result < 0) 224 index = mid; 225 else 226 index = mid + 1; 227 228 ASSERT3U(index, <=, vim->vim_phys->vimp_num_entries); 229 230 if (index == vim->vim_phys->vimp_num_entries) { 231 /* 232 * If "index" is past the end of the entries 233 * array, then not only is the offset not in the 234 * mapping table, but it's actually greater than 235 * all entries in the table. In this case, we 236 * can't return a mapping entry greater than the 237 * offset (since none exist), so we return NULL. 238 */ 239 240 ASSERT3S(dva_mapping_overlap_compare(&offset, 241 &vim->vim_entries[index - 1]), >, 0); 242 243 return (NULL); 244 } else { 245 /* 246 * Just to be safe, we verify the offset falls 247 * in between the mapping entries at index and 248 * one less than index. Since we know the offset 249 * doesn't overlap an entry, and we're supposed 250 * to return the entry just greater than the 251 * offset, both of the following tests must be 252 * true. 253 */ 254 ASSERT3S(dva_mapping_overlap_compare(&offset, 255 &vim->vim_entries[index]), <, 0); 256 IMPLY(index >= 1, dva_mapping_overlap_compare(&offset, 257 &vim->vim_entries[index - 1]) > 0); 258 259 return (&vim->vim_entries[index]); 260 } 261 } else { 262 return (entry); 263 } 264 } 265 266 vdev_indirect_mapping_entry_phys_t * 267 vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim, 268 uint64_t offset) 269 { 270 return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset, 271 B_FALSE)); 272 } 273 274 vdev_indirect_mapping_entry_phys_t * 275 vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim, 276 uint64_t offset) 277 { 278 return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset, 279 B_TRUE)); 280 } 281 282 283 void 284 vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim) 285 { 286 ASSERT(vdev_indirect_mapping_verify(vim)); 287 288 if (vim->vim_phys->vimp_num_entries > 0) { 289 uint64_t map_size = vdev_indirect_mapping_size(vim); 290 kmem_free(vim->vim_entries, map_size); 291 vim->vim_entries = NULL; 292 } 293 294 dmu_buf_rele(vim->vim_dbuf, vim); 295 296 vim->vim_objset = NULL; 297 vim->vim_object = 0; 298 vim->vim_dbuf = NULL; 299 vim->vim_phys = NULL; 300 301 kmem_free(vim, sizeof (*vim)); 302 } 303 304 uint64_t 305 vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx) 306 { 307 uint64_t object; 308 ASSERT(dmu_tx_is_syncing(tx)); 309 uint64_t bonus_size = VDEV_INDIRECT_MAPPING_SIZE_V0; 310 311 if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { 312 bonus_size = sizeof (vdev_indirect_mapping_phys_t); 313 } 314 315 object = dmu_object_alloc(os, 316 DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, 317 DMU_OTN_UINT64_METADATA, bonus_size, 318 tx); 319 320 if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { 321 dmu_buf_t *dbuf; 322 vdev_indirect_mapping_phys_t *vimp; 323 324 VERIFY0(dmu_bonus_hold(os, object, FTAG, &dbuf)); 325 dmu_buf_will_dirty(dbuf, tx); 326 vimp = dbuf->db_data; 327 vimp->vimp_counts_object = dmu_object_alloc(os, 328 DMU_OTN_UINT32_METADATA, SPA_OLD_MAXBLOCKSIZE, 329 DMU_OT_NONE, 0, tx); 330 spa_feature_incr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); 331 dmu_buf_rele(dbuf, FTAG); 332 } 333 334 return (object); 335 } 336 337 338 vdev_indirect_mapping_t * 339 vdev_indirect_mapping_open(objset_t *os, uint64_t mapping_object) 340 { 341 vdev_indirect_mapping_t *vim = kmem_zalloc(sizeof (*vim), KM_SLEEP); 342 dmu_object_info_t doi; 343 VERIFY0(dmu_object_info(os, mapping_object, &doi)); 344 345 vim->vim_objset = os; 346 vim->vim_object = mapping_object; 347 348 VERIFY0(dmu_bonus_hold(os, vim->vim_object, vim, 349 &vim->vim_dbuf)); 350 vim->vim_phys = vim->vim_dbuf->db_data; 351 352 vim->vim_havecounts = 353 (doi.doi_bonus_size > VDEV_INDIRECT_MAPPING_SIZE_V0); 354 355 if (vim->vim_phys->vimp_num_entries > 0) { 356 uint64_t map_size = vdev_indirect_mapping_size(vim); 357 vim->vim_entries = kmem_alloc(map_size, KM_SLEEP); 358 VERIFY0(dmu_read(os, vim->vim_object, 0, map_size, 359 vim->vim_entries, DMU_READ_PREFETCH)); 360 } 361 362 ASSERT(vdev_indirect_mapping_verify(vim)); 363 364 return (vim); 365 } 366 367 void 368 vdev_indirect_mapping_free(objset_t *os, uint64_t object, dmu_tx_t *tx) 369 { 370 vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, object); 371 if (vim->vim_havecounts) { 372 VERIFY0(dmu_object_free(os, vim->vim_phys->vimp_counts_object, 373 tx)); 374 spa_feature_decr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); 375 } 376 vdev_indirect_mapping_close(vim); 377 378 VERIFY0(dmu_object_free(os, object, tx)); 379 } 380 381 /* 382 * Append the list of vdev_indirect_mapping_entry_t's to the on-disk 383 * mapping object. Also remove the entries from the list and free them. 384 * This also implicitly extends the max_offset of the mapping (to the end 385 * of the last entry). 386 */ 387 void 388 vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim, 389 list_t *list, dmu_tx_t *tx) 390 { 391 vdev_indirect_mapping_entry_phys_t *mapbuf; 392 uint64_t old_size; 393 uint32_t *countbuf = NULL; 394 vdev_indirect_mapping_entry_phys_t *old_entries; 395 uint64_t old_count; 396 uint64_t entries_written = 0; 397 398 ASSERT(vdev_indirect_mapping_verify(vim)); 399 ASSERT(dmu_tx_is_syncing(tx)); 400 ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx))); 401 ASSERT(!list_is_empty(list)); 402 403 old_size = vdev_indirect_mapping_size(vim); 404 old_entries = vim->vim_entries; 405 old_count = vim->vim_phys->vimp_num_entries; 406 407 dmu_buf_will_dirty(vim->vim_dbuf, tx); 408 409 mapbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); 410 if (vim->vim_havecounts) { 411 countbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); 412 ASSERT(spa_feature_is_active(vim->vim_objset->os_spa, 413 SPA_FEATURE_OBSOLETE_COUNTS)); 414 } 415 while (!list_is_empty(list)) { 416 uint64_t i; 417 /* 418 * Write entries from the list to the 419 * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE. 420 */ 421 for (i = 0; i < SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf); i++) { 422 vdev_indirect_mapping_entry_t *entry = 423 list_remove_head(list); 424 if (entry == NULL) 425 break; 426 427 uint64_t size = 428 DVA_GET_ASIZE(&entry->vime_mapping.vimep_dst); 429 uint64_t src_offset = 430 DVA_MAPPING_GET_SRC_OFFSET(&entry->vime_mapping); 431 432 /* 433 * We shouldn't be adding an entry which is fully 434 * obsolete. 435 */ 436 ASSERT3U(entry->vime_obsolete_count, <, size); 437 IMPLY(entry->vime_obsolete_count != 0, 438 vim->vim_havecounts); 439 440 mapbuf[i] = entry->vime_mapping; 441 if (vim->vim_havecounts) 442 countbuf[i] = entry->vime_obsolete_count; 443 444 vim->vim_phys->vimp_bytes_mapped += size; 445 ASSERT3U(src_offset, >=, 446 vim->vim_phys->vimp_max_offset); 447 vim->vim_phys->vimp_max_offset = src_offset + size; 448 449 entries_written++; 450 451 kmem_free(entry, sizeof (*entry)); 452 } 453 dmu_write(vim->vim_objset, vim->vim_object, 454 vim->vim_phys->vimp_num_entries * sizeof (*mapbuf), 455 i * sizeof (*mapbuf), 456 mapbuf, tx); 457 if (vim->vim_havecounts) { 458 dmu_write(vim->vim_objset, 459 vim->vim_phys->vimp_counts_object, 460 vim->vim_phys->vimp_num_entries * 461 sizeof (*countbuf), 462 i * sizeof (*countbuf), countbuf, tx); 463 } 464 vim->vim_phys->vimp_num_entries += i; 465 } 466 zio_buf_free(mapbuf, SPA_OLD_MAXBLOCKSIZE); 467 if (vim->vim_havecounts) 468 zio_buf_free(countbuf, SPA_OLD_MAXBLOCKSIZE); 469 470 /* 471 * Update the entry array to reflect the new entries. First, copy 472 * over any old entries then read back the new entries we just wrote. 473 */ 474 uint64_t new_size = vdev_indirect_mapping_size(vim); 475 ASSERT3U(new_size, >, old_size); 476 ASSERT3U(new_size - old_size, ==, 477 entries_written * sizeof (vdev_indirect_mapping_entry_phys_t)); 478 vim->vim_entries = kmem_alloc(new_size, KM_SLEEP); 479 if (old_size > 0) { 480 bcopy(old_entries, vim->vim_entries, old_size); 481 kmem_free(old_entries, old_size); 482 } 483 VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size, 484 new_size - old_size, &vim->vim_entries[old_count], 485 DMU_READ_PREFETCH)); 486 487 zfs_dbgmsg("txg %llu: wrote %llu entries to " 488 "indirect mapping obj %llu; max offset=0x%llx", 489 (u_longlong_t)dmu_tx_get_txg(tx), 490 (u_longlong_t)entries_written, 491 (u_longlong_t)vim->vim_object, 492 (u_longlong_t)vim->vim_phys->vimp_max_offset); 493 } 494 495 /* 496 * Increment the relevant counts for the specified offset and length. 497 * The counts array must be obtained from 498 * vdev_indirect_mapping_load_obsolete_counts(). 499 */ 500 void 501 vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t *vim, 502 uint64_t offset, uint64_t length, uint32_t *counts) 503 { 504 vdev_indirect_mapping_entry_phys_t *mapping; 505 uint64_t index; 506 507 mapping = vdev_indirect_mapping_entry_for_offset(vim, offset); 508 509 ASSERT(length > 0); 510 ASSERT3P(mapping, !=, NULL); 511 512 index = mapping - vim->vim_entries; 513 514 while (length > 0) { 515 ASSERT3U(index, <, vdev_indirect_mapping_num_entries(vim)); 516 517 uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst); 518 uint64_t inner_offset = offset - 519 DVA_MAPPING_GET_SRC_OFFSET(mapping); 520 VERIFY3U(inner_offset, <, size); 521 uint64_t inner_size = MIN(length, size - inner_offset); 522 523 VERIFY3U(counts[index] + inner_size, <=, size); 524 counts[index] += inner_size; 525 526 offset += inner_size; 527 length -= inner_size; 528 mapping++; 529 index++; 530 } 531 } 532 533 typedef struct load_obsolete_space_map_arg { 534 vdev_indirect_mapping_t *losma_vim; 535 uint32_t *losma_counts; 536 } load_obsolete_space_map_arg_t; 537 538 static int 539 load_obsolete_sm_callback(space_map_entry_t *sme, void *arg) 540 { 541 load_obsolete_space_map_arg_t *losma = arg; 542 ASSERT3S(sme->sme_type, ==, SM_ALLOC); 543 544 vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim, 545 sme->sme_offset, sme->sme_run, losma->losma_counts); 546 547 return (0); 548 } 549 550 /* 551 * Modify the counts (increment them) based on the spacemap. 552 */ 553 void 554 vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim, 555 uint32_t *counts, space_map_t *obsolete_space_sm) 556 { 557 load_obsolete_space_map_arg_t losma; 558 losma.losma_counts = counts; 559 losma.losma_vim = vim; 560 VERIFY0(space_map_iterate(obsolete_space_sm, 561 load_obsolete_sm_callback, &losma)); 562 } 563 564 /* 565 * Read the obsolete counts from disk, returning them in an array. 566 */ 567 uint32_t * 568 vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim) 569 { 570 ASSERT(vdev_indirect_mapping_verify(vim)); 571 572 uint64_t counts_size = 573 vim->vim_phys->vimp_num_entries * sizeof (uint32_t); 574 uint32_t *counts = kmem_alloc(counts_size, KM_SLEEP); 575 if (vim->vim_havecounts) { 576 VERIFY0(dmu_read(vim->vim_objset, 577 vim->vim_phys->vimp_counts_object, 578 0, counts_size, 579 counts, DMU_READ_PREFETCH)); 580 } else { 581 bzero(counts, counts_size); 582 } 583 return (counts); 584 } 585 586 extern void 587 vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t *vim, 588 uint32_t *counts) 589 { 590 ASSERT(vdev_indirect_mapping_verify(vim)); 591 592 kmem_free(counts, vim->vim_phys->vimp_num_entries * sizeof (uint32_t)); 593 } 594