1 /* 2 * CDDL HEADER START 3 * 4 * This file and its contents are supplied under the terms of the 5 * Common Development and Distribution License ("CDDL"), version 1.0. 6 * You may only use this file in accordance with the terms of version 7 * 1.0 of the CDDL. 8 * 9 * A full copy of the text of the CDDL should have accompanied this 10 * source. A copy of the CDDL is also available via the Internet at 11 * http://www.illumos.org/license/CDDL. 12 * 13 * CDDL HEADER END 14 */ 15 16 /* 17 * Copyright (c) 2015, 2017 by Delphix. All rights reserved. 18 */ 19 20 #include <sys/dmu_tx.h> 21 #include <sys/dsl_pool.h> 22 #include <sys/spa.h> 23 #include <sys/vdev_impl.h> 24 #include <sys/vdev_indirect_mapping.h> 25 #include <sys/zfeature.h> 26 #include <sys/dmu_objset.h> 27 28 static boolean_t 29 vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim) 30 { 31 ASSERT(vim != NULL); 32 33 ASSERT(vim->vim_object != 0); 34 ASSERT(vim->vim_objset != NULL); 35 ASSERT(vim->vim_phys != NULL); 36 ASSERT(vim->vim_dbuf != NULL); 37 38 EQUIV(vim->vim_phys->vimp_num_entries > 0, 39 vim->vim_entries != NULL); 40 if (vim->vim_phys->vimp_num_entries > 0) { 41 vdev_indirect_mapping_entry_phys_t *last_entry = 42 &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1]; 43 uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(last_entry); 44 uint64_t size = DVA_GET_ASIZE(&last_entry->vimep_dst); 45 46 ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size); 47 } 48 if (vim->vim_havecounts) { 49 ASSERT(vim->vim_phys->vimp_counts_object != 0); 50 } 51 52 return (B_TRUE); 53 } 54 55 uint64_t 56 vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim) 57 { 58 ASSERT(vdev_indirect_mapping_verify(vim)); 59 60 return (vim->vim_phys->vimp_num_entries); 61 } 62 63 uint64_t 64 vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim) 65 { 66 ASSERT(vdev_indirect_mapping_verify(vim)); 67 68 return (vim->vim_phys->vimp_max_offset); 69 } 70 71 uint64_t 72 vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim) 73 { 74 ASSERT(vdev_indirect_mapping_verify(vim)); 75 76 return (vim->vim_object); 77 } 78 79 uint64_t 80 vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t *vim) 81 { 82 ASSERT(vdev_indirect_mapping_verify(vim)); 83 84 return (vim->vim_phys->vimp_bytes_mapped); 85 } 86 87 /* 88 * The length (in bytes) of the mapping object array in memory and 89 * (logically) on disk. 90 * 91 * Note that unlike most of our accessor functions, 92 * we don't assert that the struct is consistent; therefore it can be 93 * called while there may be concurrent changes, if we don't care about 94 * the value being immediately stale (e.g. from spa_removal_get_stats()). 95 */ 96 uint64_t 97 vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim) 98 { 99 return (vim->vim_phys->vimp_num_entries * sizeof (*vim->vim_entries)); 100 } 101 102 /* 103 * Compare an offset with an indirect mapping entry; there are three 104 * possible scenarios: 105 * 106 * 1. The offset is "less than" the mapping entry; meaning the 107 * offset is less than the source offset of the mapping entry. In 108 * this case, there is no overlap between the offset and the 109 * mapping entry and -1 will be returned. 110 * 111 * 2. The offset is "greater than" the mapping entry; meaning the 112 * offset is greater than the mapping entry's source offset plus 113 * the entry's size. In this case, there is no overlap between 114 * the offset and the mapping entry and 1 will be returned. 115 * 116 * NOTE: If the offset is actually equal to the entry's offset 117 * plus size, this is considered to be "greater" than the entry, 118 * and this case applies (i.e. 1 will be returned). Thus, the 119 * entry's "range" can be considered to be inclusive at its 120 * start, but exclusive at its end: e.g. [src, src + size). 121 * 122 * 3. The last case to consider is if the offset actually falls 123 * within the mapping entry's range. If this is the case, the 124 * offset is considered to be "equal to" the mapping entry and 125 * 0 will be returned. 126 * 127 * NOTE: If the offset is equal to the entry's source offset, 128 * this case applies and 0 will be returned. If the offset is 129 * equal to the entry's source plus its size, this case does 130 * *not* apply (see "NOTE" above for scenario 2), and 1 will be 131 * returned. 132 */ 133 static int 134 dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem) 135 { 136 const uint64_t *key = v_key; 137 const vdev_indirect_mapping_entry_phys_t *array_elem = 138 v_array_elem; 139 uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem); 140 141 if (*key < src_offset) { 142 return (-1); 143 } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) { 144 return (0); 145 } else { 146 return (1); 147 } 148 } 149 150 /* 151 * Returns the mapping entry for the given offset. 152 * 153 * It's possible that the given offset will not be in the mapping table 154 * (i.e. no mapping entries contain this offset), in which case, the 155 * return value value depends on the "next_if_missing" parameter. 156 * 157 * If the offset is not found in the table and "next_if_missing" is 158 * B_FALSE, then NULL will always be returned. The behavior is intended 159 * to allow consumers to get the entry corresponding to the offset 160 * parameter, iff the offset overlaps with an entry in the table. 161 * 162 * If the offset is not found in the table and "next_if_missing" is 163 * B_TRUE, then the entry nearest to the given offset will be returned, 164 * such that the entry's source offset is greater than the offset 165 * passed in (i.e. the "next" mapping entry in the table is returned, if 166 * the offset is missing from the table). If there are no entries whose 167 * source offset is greater than the passed in offset, NULL is returned. 168 */ 169 static vdev_indirect_mapping_entry_phys_t * 170 vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t *vim, 171 uint64_t offset, boolean_t next_if_missing) 172 { 173 ASSERT(vdev_indirect_mapping_verify(vim)); 174 ASSERT(vim->vim_phys->vimp_num_entries > 0); 175 176 vdev_indirect_mapping_entry_phys_t *entry = NULL; 177 178 uint64_t last = vim->vim_phys->vimp_num_entries - 1; 179 uint64_t base = 0; 180 181 /* 182 * We don't define these inside of the while loop because we use 183 * their value in the case that offset isn't in the mapping. 184 */ 185 uint64_t mid; 186 int result; 187 188 while (last >= base) { 189 mid = base + ((last - base) >> 1); 190 191 result = dva_mapping_overlap_compare(&offset, 192 &vim->vim_entries[mid]); 193 194 if (result == 0) { 195 entry = &vim->vim_entries[mid]; 196 break; 197 } else if (result < 0) { 198 last = mid - 1; 199 } else { 200 base = mid + 1; 201 } 202 } 203 204 if (entry == NULL && next_if_missing) { 205 ASSERT3U(base, ==, last + 1); 206 ASSERT(mid == base || mid == last); 207 ASSERT3S(result, !=, 0); 208 209 /* 210 * The offset we're looking for isn't actually contained 211 * in the mapping table, thus we need to return the 212 * closest mapping entry that is greater than the 213 * offset. We reuse the result of the last comparison, 214 * comparing the mapping entry at index "mid" and the 215 * offset. The offset is guaranteed to lie between 216 * indices one less than "mid", and one greater than 217 * "mid"; we just need to determine if offset is greater 218 * than, or less than the mapping entry contained at 219 * index "mid". 220 */ 221 222 uint64_t index; 223 if (result < 0) 224 index = mid; 225 else 226 index = mid + 1; 227 228 ASSERT3U(index, <=, vim->vim_phys->vimp_num_entries); 229 230 if (index == vim->vim_phys->vimp_num_entries) { 231 /* 232 * If "index" is past the end of the entries 233 * array, then not only is the offset not in the 234 * mapping table, but it's actually greater than 235 * all entries in the table. In this case, we 236 * can't return a mapping entry greater than the 237 * offset (since none exist), so we return NULL. 238 */ 239 240 ASSERT3S(dva_mapping_overlap_compare(&offset, 241 &vim->vim_entries[index - 1]), >, 0); 242 243 return (NULL); 244 } else { 245 /* 246 * Just to be safe, we verify the offset falls 247 * in between the mapping entries at index and 248 * one less than index. Since we know the offset 249 * doesn't overlap an entry, and we're supposed 250 * to return the entry just greater than the 251 * offset, both of the following tests must be 252 * true. 253 */ 254 ASSERT3S(dva_mapping_overlap_compare(&offset, 255 &vim->vim_entries[index]), <, 0); 256 IMPLY(index >= 1, dva_mapping_overlap_compare(&offset, 257 &vim->vim_entries[index - 1]) > 0); 258 259 return (&vim->vim_entries[index]); 260 } 261 } else { 262 return (entry); 263 } 264 } 265 266 vdev_indirect_mapping_entry_phys_t * 267 vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim, 268 uint64_t offset) 269 { 270 return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset, 271 B_FALSE)); 272 } 273 274 vdev_indirect_mapping_entry_phys_t * 275 vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim, 276 uint64_t offset) 277 { 278 return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset, 279 B_TRUE)); 280 } 281 282 void 283 vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim) 284 { 285 ASSERT(vdev_indirect_mapping_verify(vim)); 286 287 if (vim->vim_phys->vimp_num_entries > 0) { 288 uint64_t map_size = vdev_indirect_mapping_size(vim); 289 kmem_free(vim->vim_entries, map_size); 290 vim->vim_entries = NULL; 291 } 292 293 dmu_buf_rele(vim->vim_dbuf, vim); 294 295 vim->vim_objset = NULL; 296 vim->vim_object = 0; 297 vim->vim_dbuf = NULL; 298 vim->vim_phys = NULL; 299 300 kmem_free(vim, sizeof (*vim)); 301 } 302 303 uint64_t 304 vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx) 305 { 306 uint64_t object; 307 ASSERT(dmu_tx_is_syncing(tx)); 308 uint64_t bonus_size = VDEV_INDIRECT_MAPPING_SIZE_V0; 309 310 if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { 311 bonus_size = sizeof (vdev_indirect_mapping_phys_t); 312 } 313 314 object = dmu_object_alloc(os, 315 DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, 316 DMU_OTN_UINT64_METADATA, bonus_size, 317 tx); 318 319 if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { 320 dmu_buf_t *dbuf; 321 vdev_indirect_mapping_phys_t *vimp; 322 323 VERIFY0(dmu_bonus_hold(os, object, FTAG, &dbuf)); 324 dmu_buf_will_dirty(dbuf, tx); 325 vimp = dbuf->db_data; 326 vimp->vimp_counts_object = dmu_object_alloc(os, 327 DMU_OTN_UINT32_METADATA, SPA_OLD_MAXBLOCKSIZE, 328 DMU_OT_NONE, 0, tx); 329 spa_feature_incr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); 330 dmu_buf_rele(dbuf, FTAG); 331 } 332 333 return (object); 334 } 335 336 337 vdev_indirect_mapping_t * 338 vdev_indirect_mapping_open(objset_t *os, uint64_t mapping_object) 339 { 340 vdev_indirect_mapping_t *vim = kmem_zalloc(sizeof (*vim), KM_SLEEP); 341 dmu_object_info_t doi; 342 VERIFY0(dmu_object_info(os, mapping_object, &doi)); 343 344 vim->vim_objset = os; 345 vim->vim_object = mapping_object; 346 347 VERIFY0(dmu_bonus_hold(os, vim->vim_object, vim, 348 &vim->vim_dbuf)); 349 vim->vim_phys = vim->vim_dbuf->db_data; 350 351 vim->vim_havecounts = 352 (doi.doi_bonus_size > VDEV_INDIRECT_MAPPING_SIZE_V0); 353 354 if (vim->vim_phys->vimp_num_entries > 0) { 355 uint64_t map_size = vdev_indirect_mapping_size(vim); 356 vim->vim_entries = kmem_alloc(map_size, KM_SLEEP); 357 VERIFY0(dmu_read(os, vim->vim_object, 0, map_size, 358 vim->vim_entries, DMU_READ_PREFETCH)); 359 } 360 361 ASSERT(vdev_indirect_mapping_verify(vim)); 362 363 return (vim); 364 } 365 366 void 367 vdev_indirect_mapping_free(objset_t *os, uint64_t object, dmu_tx_t *tx) 368 { 369 vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, object); 370 if (vim->vim_havecounts) { 371 VERIFY0(dmu_object_free(os, vim->vim_phys->vimp_counts_object, 372 tx)); 373 spa_feature_decr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); 374 } 375 vdev_indirect_mapping_close(vim); 376 377 VERIFY0(dmu_object_free(os, object, tx)); 378 } 379 380 /* 381 * Append the list of vdev_indirect_mapping_entry_t's to the on-disk 382 * mapping object. Also remove the entries from the list and free them. 383 * This also implicitly extends the max_offset of the mapping (to the end 384 * of the last entry). 385 */ 386 void 387 vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim, 388 list_t *list, dmu_tx_t *tx) 389 { 390 vdev_indirect_mapping_entry_phys_t *mapbuf; 391 uint64_t old_size; 392 uint32_t *countbuf = NULL; 393 vdev_indirect_mapping_entry_phys_t *old_entries; 394 uint64_t old_count; 395 uint64_t entries_written = 0; 396 397 ASSERT(vdev_indirect_mapping_verify(vim)); 398 ASSERT(dmu_tx_is_syncing(tx)); 399 ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx))); 400 ASSERT(!list_is_empty(list)); 401 402 old_size = vdev_indirect_mapping_size(vim); 403 old_entries = vim->vim_entries; 404 old_count = vim->vim_phys->vimp_num_entries; 405 406 dmu_buf_will_dirty(vim->vim_dbuf, tx); 407 408 mapbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); 409 if (vim->vim_havecounts) { 410 countbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); 411 ASSERT(spa_feature_is_active(vim->vim_objset->os_spa, 412 SPA_FEATURE_OBSOLETE_COUNTS)); 413 } 414 while (!list_is_empty(list)) { 415 uint64_t i; 416 /* 417 * Write entries from the list to the 418 * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE. 419 */ 420 for (i = 0; i < SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf); i++) { 421 vdev_indirect_mapping_entry_t *entry = 422 list_remove_head(list); 423 if (entry == NULL) 424 break; 425 426 uint64_t size = 427 DVA_GET_ASIZE(&entry->vime_mapping.vimep_dst); 428 uint64_t src_offset = 429 DVA_MAPPING_GET_SRC_OFFSET(&entry->vime_mapping); 430 431 /* 432 * We shouldn't be adding an entry which is fully 433 * obsolete. 434 */ 435 ASSERT3U(entry->vime_obsolete_count, <, size); 436 IMPLY(entry->vime_obsolete_count != 0, 437 vim->vim_havecounts); 438 439 mapbuf[i] = entry->vime_mapping; 440 if (vim->vim_havecounts) 441 countbuf[i] = entry->vime_obsolete_count; 442 443 vim->vim_phys->vimp_bytes_mapped += size; 444 ASSERT3U(src_offset, >=, 445 vim->vim_phys->vimp_max_offset); 446 vim->vim_phys->vimp_max_offset = src_offset + size; 447 448 entries_written++; 449 450 kmem_free(entry, sizeof (*entry)); 451 } 452 dmu_write(vim->vim_objset, vim->vim_object, 453 vim->vim_phys->vimp_num_entries * sizeof (*mapbuf), 454 i * sizeof (*mapbuf), 455 mapbuf, tx); 456 if (vim->vim_havecounts) { 457 dmu_write(vim->vim_objset, 458 vim->vim_phys->vimp_counts_object, 459 vim->vim_phys->vimp_num_entries * 460 sizeof (*countbuf), 461 i * sizeof (*countbuf), countbuf, tx); 462 } 463 vim->vim_phys->vimp_num_entries += i; 464 } 465 zio_buf_free(mapbuf, SPA_OLD_MAXBLOCKSIZE); 466 if (vim->vim_havecounts) 467 zio_buf_free(countbuf, SPA_OLD_MAXBLOCKSIZE); 468 469 /* 470 * Update the entry array to reflect the new entries. First, copy 471 * over any old entries then read back the new entries we just wrote. 472 */ 473 uint64_t new_size = vdev_indirect_mapping_size(vim); 474 ASSERT3U(new_size, >, old_size); 475 ASSERT3U(new_size - old_size, ==, 476 entries_written * sizeof (vdev_indirect_mapping_entry_phys_t)); 477 vim->vim_entries = kmem_alloc(new_size, KM_SLEEP); 478 if (old_size > 0) { 479 bcopy(old_entries, vim->vim_entries, old_size); 480 kmem_free(old_entries, old_size); 481 } 482 VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size, 483 new_size - old_size, &vim->vim_entries[old_count], 484 DMU_READ_PREFETCH)); 485 486 zfs_dbgmsg("txg %llu: wrote %llu entries to " 487 "indirect mapping obj %llu; max offset=0x%llx", 488 (u_longlong_t)dmu_tx_get_txg(tx), 489 (u_longlong_t)entries_written, 490 (u_longlong_t)vim->vim_object, 491 (u_longlong_t)vim->vim_phys->vimp_max_offset); 492 } 493 494 /* 495 * Increment the relevant counts for the specified offset and length. 496 * The counts array must be obtained from 497 * vdev_indirect_mapping_load_obsolete_counts(). 498 */ 499 void 500 vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t *vim, 501 uint64_t offset, uint64_t length, uint32_t *counts) 502 { 503 vdev_indirect_mapping_entry_phys_t *mapping; 504 uint64_t index; 505 506 mapping = vdev_indirect_mapping_entry_for_offset(vim, offset); 507 508 ASSERT(length > 0); 509 ASSERT3P(mapping, !=, NULL); 510 511 index = mapping - vim->vim_entries; 512 513 while (length > 0) { 514 ASSERT3U(index, <, vdev_indirect_mapping_num_entries(vim)); 515 516 uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst); 517 uint64_t inner_offset = offset - 518 DVA_MAPPING_GET_SRC_OFFSET(mapping); 519 VERIFY3U(inner_offset, <, size); 520 uint64_t inner_size = MIN(length, size - inner_offset); 521 522 VERIFY3U(counts[index] + inner_size, <=, size); 523 counts[index] += inner_size; 524 525 offset += inner_size; 526 length -= inner_size; 527 mapping++; 528 index++; 529 } 530 } 531 532 typedef struct load_obsolete_space_map_arg { 533 vdev_indirect_mapping_t *losma_vim; 534 uint32_t *losma_counts; 535 } load_obsolete_space_map_arg_t; 536 537 static int 538 load_obsolete_sm_callback(space_map_entry_t *sme, void *arg) 539 { 540 load_obsolete_space_map_arg_t *losma = arg; 541 ASSERT3S(sme->sme_type, ==, SM_ALLOC); 542 543 vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim, 544 sme->sme_offset, sme->sme_run, losma->losma_counts); 545 546 return (0); 547 } 548 549 /* 550 * Modify the counts (increment them) based on the spacemap. 551 */ 552 void 553 vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim, 554 uint32_t *counts, space_map_t *obsolete_space_sm) 555 { 556 load_obsolete_space_map_arg_t losma; 557 losma.losma_counts = counts; 558 losma.losma_vim = vim; 559 VERIFY0(space_map_iterate(obsolete_space_sm, 560 load_obsolete_sm_callback, &losma)); 561 } 562 563 /* 564 * Read the obsolete counts from disk, returning them in an array. 565 */ 566 uint32_t * 567 vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim) 568 { 569 ASSERT(vdev_indirect_mapping_verify(vim)); 570 571 uint64_t counts_size = 572 vim->vim_phys->vimp_num_entries * sizeof (uint32_t); 573 uint32_t *counts = kmem_alloc(counts_size, KM_SLEEP); 574 if (vim->vim_havecounts) { 575 VERIFY0(dmu_read(vim->vim_objset, 576 vim->vim_phys->vimp_counts_object, 577 0, counts_size, 578 counts, DMU_READ_PREFETCH)); 579 } else { 580 bzero(counts, counts_size); 581 } 582 return (counts); 583 } 584 585 extern void 586 vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t *vim, 587 uint32_t *counts) 588 { 589 ASSERT(vdev_indirect_mapping_verify(vim)); 590 591 kmem_free(counts, vim->vim_phys->vimp_num_entries * sizeof (uint32_t)); 592 } 593