// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/dsl_pool.h>
#include <sys/zio.h>
#include <sys/space_map.h>
#include <sys/zfeature.h>

/*
 * Note on space map block size:
 *
 * The data for a given space map can be kept on blocks of any size.
 * Larger blocks entail fewer I/O operations, but they also cause the
 * DMU to keep more data in-core, and also to waste more I/O bandwidth
 * when only a few blocks have changed since the last transaction group.
 */

/*
 * Enabled whenever we want to stress test the use of double-word
 * space map entries.
 */
boolean_t zfs_force_some_double_word_sm_entries = B_FALSE;

/*
 * Override the default indirect block size of 128K, instead use 16K for
 * spacemaps (2^14 bytes). This dramatically reduces write inflation since
 * appending to a spacemap typically has to write one data block (4KB) and one
 * or two indirect blocks (16K-32K, rather than 128K).
 */
int space_map_ibs = 14;

boolean_t
sm_entry_is_debug(uint64_t e)
{
	return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX);
}

boolean_t
sm_entry_is_single_word(uint64_t e)
{
	uint8_t prefix = SM_PREFIX_DECODE(e);
	return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX);
}

boolean_t
sm_entry_is_double_word(uint64_t e)
{
	return (SM_PREFIX_DECODE(e) == SM2_PREFIX);
}

/*
 * Iterate through the space map, invoking the callback on each (non-debug)
 * space map entry. Stop after reading 'end' bytes of the space map.
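 * Entries are decoded into a space_map_entry_t before the callback is
 * invoked; debug entries are never passed to the callback, they only
 * update the TXG and sync-pass values reported with subsequent entries.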
 */
int
space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
{
	uint64_t blksz = sm->sm_blksz;

	ASSERT3U(blksz, !=, 0);
	ASSERT3U(end, <=, space_map_length(sm));
	ASSERT0(P2PHASE(end, sizeof (uint64_t)));

	dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end,
	    ZIO_PRIORITY_SYNC_READ);

	int error = 0;
	uint64_t txg = 0, sync_pass = 0;
	for (uint64_t block_base = 0; block_base < end && error == 0;
	    block_base += blksz) {
		dmu_buf_t *db;
		error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
		    block_base, FTAG, &db, DMU_READ_PREFETCH);
		if (error != 0)
			return (error);

		uint64_t *block_start = db->db_data;
		uint64_t block_length = MIN(end - block_base, blksz);
		uint64_t *block_end = block_start +
		    (block_length / sizeof (uint64_t));

		VERIFY0(P2PHASE(block_length, sizeof (uint64_t)));
		VERIFY3U(block_length, !=, 0);
		ASSERT3U(blksz, ==, db->db_size);

		for (uint64_t *block_cursor = block_start;
		    block_cursor < block_end && error == 0; block_cursor++) {
			uint64_t e = *block_cursor;

			if (sm_entry_is_debug(e)) {
				/*
				 * Debug entries are only needed to record the
				 * current TXG and sync pass if available.
				 *
				 * Note though that sometimes there can be
				 * debug entries that are used as padding
				 * at the end of space map blocks in order
				 * to not split a double-word entry across
				 * two blocks. These entries have their
				 * TXG field set to 0 and we skip them
				 * without recording the TXG.
				 * [see comment in space_map_write_seg()]
				 */
				uint64_t e_txg = SM_DEBUG_TXG_DECODE(e);
				if (e_txg != 0) {
					txg = e_txg;
					sync_pass = SM_DEBUG_SYNCPASS_DECODE(e);
				} else {
					ASSERT0(SM_DEBUG_SYNCPASS_DECODE(e));
				}
				continue;
			}

			uint64_t raw_offset, raw_run, vdev_id;
			maptype_t type;
			if (sm_entry_is_single_word(e)) {
				type = SM_TYPE_DECODE(e);
				vdev_id = SM_NO_VDEVID;
				raw_offset = SM_OFFSET_DECODE(e);
				raw_run = SM_RUN_DECODE(e);
			} else {
				/* it is a two-word entry */
				ASSERT(sm_entry_is_double_word(e));
				raw_run = SM2_RUN_DECODE(e);
				vdev_id = SM2_VDEV_DECODE(e);

				/* move on to the second word */
				block_cursor++;
				e = *block_cursor;
				VERIFY3P(block_cursor, <=, block_end);

				type = SM2_TYPE_DECODE(e);
				raw_offset = SM2_OFFSET_DECODE(e);
			}

			uint64_t entry_offset = (raw_offset << sm->sm_shift) +
			    sm->sm_start;
			uint64_t entry_run = raw_run << sm->sm_shift;

			VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
			VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
			ASSERT3U(entry_offset, >=, sm->sm_start);
			ASSERT3U(entry_offset, <, sm->sm_start + sm->sm_size);
			ASSERT3U(entry_run, <=, sm->sm_size);
			ASSERT3U(entry_offset + entry_run, <=,
			    sm->sm_start + sm->sm_size);

			space_map_entry_t sme = {
			    .sme_type = type,
			    .sme_vdev = vdev_id,
			    .sme_offset = entry_offset,
			    .sme_run = entry_run,
			    .sme_txg = txg,
			    .sme_sync_pass = sync_pass
			};
			error = callback(&sme, arg);
		}
		dmu_buf_rele(db, FTAG);
	}
	return (error);
}

/*
 * Reads the entries from the last block of the space map into
 * buf in reverse order. Populates nwords with the number of words
 * in the last block.
 *
 * Refer to block comment within space_map_incremental_destroy()
 * to understand why this function is needed.
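 * The caller's buffer must be at least as large as the space map's
 * block size so that the last block always fits in it.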
 */
static int
space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
    uint64_t bufsz, uint64_t *nwords)
{
	int error = 0;
	dmu_buf_t *db;

	/*
	 * Find the offset of the last word in the space map and use
	 * that to read the last block of the space map with
	 * dmu_buf_hold().
	 */
	uint64_t last_word_offset =
	    sm->sm_phys->smp_length - sizeof (uint64_t);
	error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
	    FTAG, &db, DMU_READ_NO_PREFETCH);
	if (error != 0)
		return (error);

	ASSERT3U(sm->sm_object, ==, db->db_object);
	ASSERT3U(sm->sm_blksz, ==, db->db_size);
	ASSERT3U(bufsz, >=, db->db_size);
	ASSERT(nwords != NULL);

	uint64_t *words = db->db_data;
	*nwords =
	    (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);

	ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));

	uint64_t n = *nwords;
	uint64_t j = n - 1;
	for (uint64_t i = 0; i < n; i++) {
		uint64_t entry = words[i];
		if (sm_entry_is_double_word(entry)) {
			/*
			 * Since we are populating the buffer backwards
			 * we have to be extra careful and add the two
			 * words of the double-word entry in the right
			 * order.
			 */
			ASSERT3U(j, >, 0);
			buf[j - 1] = entry;

			i++;
			ASSERT3U(i, <, n);
			entry = words[i];
			buf[j] = entry;
			j -= 2;
		} else {
			ASSERT(sm_entry_is_debug(entry) ||
			    sm_entry_is_single_word(entry));
			buf[j] = entry;
			j--;
		}
	}

	/*
	 * Assert that we wrote backwards all the
	 * way to the beginning of the buffer.
	 */
	ASSERT3S(j, ==, -1);

	dmu_buf_rele(db, FTAG);
	return (error);
}

/*
 * Note: This function performs destructive actions - specifically
 * it deletes entries from the end of the space map. Thus, callers
 * should ensure that they are holding the appropriate locks for
 * the space map that they provide.
 */
int
space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
    dmu_tx_t *tx)
{
	uint64_t bufsz = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
	uint64_t *buf = zio_buf_alloc(bufsz);

	dmu_buf_will_dirty(sm->sm_dbuf, tx);

	/*
	 * Ideally we would want to iterate from the beginning of the
	 * space map to the end in incremental steps. The issue with this
	 * approach is that we don't have any field on-disk that points
	 * us where to start between each step. We could try zeroing out
	 * entries that we've destroyed, but this doesn't work either as
	 * an entry that is 0 is a valid one (ALLOC for range [0x0:0x200]).
	 *
	 * As a result, we destroy the space map's entries incrementally
	 * starting from the end, after applying the callback to each of them.
	 *
	 * The problem with this approach is that we cannot literally
	 * iterate through the words in the space map backwards as we
	 * can't distinguish two-word space map entries from their second
	 * word. Thus we do the following:
	 *
	 * 1] We get all the entries from the last block of the space map
	 *    and put them into a buffer in reverse order. This way the
	 *    last entry comes first in the buffer, the second to last is
	 *    second, etc.
	 * 2] We iterate through the entries in the buffer and we apply
	 *    the callback to each one. As we move from entry to entry we
	 *    decrease the size of the space map, effectively deleting
	 *    each entry.
	 * 3] If there are no more entries in the space map or the callback
	 *    returns a value other than 0, we stop iterating over the
	 *    space map. If there are entries remaining and the callback
	 *    returned 0, we go back to step [1].
	 */
	int error = 0;
	while (space_map_length(sm) > 0 && error == 0) {
		uint64_t nwords = 0;
		error = space_map_reversed_last_block_entries(sm, buf, bufsz,
		    &nwords);
		if (error != 0)
			break;

		ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t));

		for (uint64_t i = 0; i < nwords; i++) {
			uint64_t e = buf[i];

			if (sm_entry_is_debug(e)) {
				sm->sm_phys->smp_length -= sizeof (uint64_t);
				continue;
			}

			int words = 1;
			uint64_t raw_offset, raw_run, vdev_id;
			maptype_t type;
			if (sm_entry_is_single_word(e)) {
				type = SM_TYPE_DECODE(e);
				vdev_id = SM_NO_VDEVID;
				raw_offset = SM_OFFSET_DECODE(e);
				raw_run = SM_RUN_DECODE(e);
			} else {
				ASSERT(sm_entry_is_double_word(e));
				words = 2;

				raw_run = SM2_RUN_DECODE(e);
				vdev_id = SM2_VDEV_DECODE(e);

				/* move to the second word */
				i++;
				e = buf[i];

				ASSERT3P(i, <=, nwords);

				type = SM2_TYPE_DECODE(e);
				raw_offset = SM2_OFFSET_DECODE(e);
			}

			uint64_t entry_offset =
			    (raw_offset << sm->sm_shift) + sm->sm_start;
			uint64_t entry_run = raw_run << sm->sm_shift;

			VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
			VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
			VERIFY3U(entry_offset, >=, sm->sm_start);
			VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size);
			VERIFY3U(entry_run, <=, sm->sm_size);
			VERIFY3U(entry_offset + entry_run, <=,
			    sm->sm_start + sm->sm_size);

			space_map_entry_t sme = {
			    .sme_type = type,
			    .sme_vdev = vdev_id,
			    .sme_offset = entry_offset,
			    .sme_run = entry_run
			};
			error = callback(&sme, arg);
			if (error != 0)
				break;

			if (type == SM_ALLOC)
				sm->sm_phys->smp_alloc -= entry_run;
			else
				sm->sm_phys->smp_alloc += entry_run;
			sm->sm_phys->smp_length -= words * sizeof (uint64_t);
		}
	}

	if (space_map_length(sm) == 0) {
		ASSERT0(error);
		ASSERT0(space_map_allocated(sm));
	}

	zio_buf_free(buf, bufsz);
	return (error);
}

typedef struct space_map_load_arg {
	space_map_t *smla_sm;
	zfs_range_tree_t *smla_rt;
	maptype_t smla_type;
} space_map_load_arg_t;

static int
space_map_load_callback(space_map_entry_t *sme, void *arg)
{
	space_map_load_arg_t *smla = arg;
	if (sme->sme_type == smla->smla_type) {
		VERIFY3U(zfs_range_tree_space(smla->smla_rt) + sme->sme_run,
		    <=, smla->smla_sm->sm_size);
		zfs_range_tree_add(smla->smla_rt, sme->sme_offset,
		    sme->sme_run);
	} else {
		zfs_range_tree_remove(smla->smla_rt, sme->sme_offset,
		    sme->sme_run);
	}

	return (0);
}

/*
 * Load the spacemap into the rangetree, like space_map_load. But only
 * read the first 'length' bytes of the spacemap.
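 * If an error occurs part-way through, the range tree is vacated so the
 * caller never observes a partially-loaded tree.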
 */
int
space_map_load_length(space_map_t *sm, zfs_range_tree_t *rt, maptype_t maptype,
    uint64_t length)
{
	space_map_load_arg_t smla;

	VERIFY0(zfs_range_tree_space(rt));

	if (maptype == SM_FREE)
		zfs_range_tree_add(rt, sm->sm_start, sm->sm_size);

	smla.smla_rt = rt;
	smla.smla_sm = sm;
	smla.smla_type = maptype;
	int err = space_map_iterate(sm, length,
	    space_map_load_callback, &smla);

	if (err != 0)
		zfs_range_tree_vacate(rt, NULL, NULL);

	return (err);
}

/*
 * Load the space map from disk into the specified range tree. Segments of
 * maptype are added to the range tree, other segment types are removed.
 */
int
space_map_load(space_map_t *sm, zfs_range_tree_t *rt, maptype_t maptype)
{
	return (space_map_load_length(sm, rt, maptype, space_map_length(sm)));
}

void
space_map_histogram_clear(space_map_t *sm)
{
	if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
		return;

	memset(sm->sm_phys->smp_histogram, 0,
	    sizeof (sm->sm_phys->smp_histogram));
}

boolean_t
space_map_histogram_verify(space_map_t *sm, zfs_range_tree_t *rt)
{
	/*
	 * Verify that the in-core range tree does not have any
	 * ranges smaller than our sm_shift size.
	 */
	for (int i = 0; i < sm->sm_shift; i++) {
		if (rt->rt_histogram[i] != 0)
			return (B_FALSE);
	}
	return (B_TRUE);
}

void
space_map_histogram_add(space_map_t *sm, zfs_range_tree_t *rt, dmu_tx_t *tx)
{
	int idx = 0;

	ASSERT(dmu_tx_is_syncing(tx));
	VERIFY3U(space_map_object(sm), !=, 0);

	if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
		return;

	dmu_buf_will_dirty(sm->sm_dbuf, tx);

	ASSERT(space_map_histogram_verify(sm, rt));
	/*
	 * Transfer the content of the range tree histogram to the space
	 * map histogram. The space map histogram contains 32 buckets ranging
	 * from 2^sm_shift to 2^(32+sm_shift-1). The range tree,
	 * however, can represent ranges from 2^0 to 2^63. Since the space
	 * map only cares about allocatable blocks (minimum of sm_shift) we
	 * can safely ignore all ranges in the range tree smaller than sm_shift.
	 */
	for (int i = sm->sm_shift; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) {

		/*
		 * Since the largest histogram bucket in the space map is
		 * 2^(32+sm_shift-1), we need to normalize the values in
		 * the range tree for any bucket larger than that size. For
		 * example, given an sm_shift of 9, ranges larger than 2^40
		 * would get normalized as if they were 1TB ranges. If the
		 * range tree had a count of 5 in the 2^44 (16TB) bucket,
		 * the calculation below would normalize this to 5 * 2^4 (16).
		 */
		ASSERT3U(i, >=, idx + sm->sm_shift);
		sm->sm_phys->smp_histogram[idx] +=
		    rt->rt_histogram[i] << (i - idx - sm->sm_shift);

		/*
		 * Increment the space map's index as long as we haven't
		 * reached the maximum bucket size. Accumulate all ranges
		 * larger than the max bucket size into the last bucket.
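		 * For example, with an sm_shift of 9 the last space map
		 * bucket absorbs every range tree bucket of 2^40 bytes
		 * and up.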
		 */
		if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
			ASSERT3U(idx + sm->sm_shift, ==, i);
			idx++;
			ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
		}
	}
}

static void
space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx)
{
	dmu_buf_will_dirty(sm->sm_dbuf, tx);

	uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
	    SM_DEBUG_ACTION_ENCODE(maptype) |
	    SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) |
	    SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));

	dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length,
	    sizeof (dentry), &dentry, tx);

	sm->sm_phys->smp_length += sizeof (dentry);
}

/*
 * Writes one or more entries given a segment.
 *
 * Note: The function may release the dbuf from the pointer initially
 * passed to it, and return a different dbuf. Also, the space map's
 * dbuf must be dirty for the changes in sm_phys to take effect.
 */
static void
space_map_write_seg(space_map_t *sm, uint64_t rstart, uint64_t rend,
    maptype_t maptype, uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp,
    const void *tag, dmu_tx_t *tx)
{
	ASSERT3U(words, !=, 0);
	ASSERT3U(words, <=, 2);

	/* ensure the vdev_id can be represented by the space map */
	ASSERT3U(vdev_id, <=, SM_NO_VDEVID);

	/*
	 * if this is a single word entry, ensure that no vdev was
	 * specified.
	 */
	IMPLY(words == 1, vdev_id == SM_NO_VDEVID);

	dmu_buf_t *db = *dbp;
	ASSERT3U(db->db_size, ==, sm->sm_blksz);

	uint64_t *block_base = db->db_data;
	uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
	uint64_t *block_cursor = block_base +
	    (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);

	ASSERT3P(block_cursor, <=, block_end);

	uint64_t size = (rend - rstart) >> sm->sm_shift;
	uint64_t start = (rstart - sm->sm_start) >> sm->sm_shift;
	uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX;

	ASSERT3U(rstart, >=, sm->sm_start);
	ASSERT3U(rstart, <, sm->sm_start + sm->sm_size);
	ASSERT3U(rend - rstart, <=, sm->sm_size);
	ASSERT3U(rend, <=, sm->sm_start + sm->sm_size);

	while (size != 0) {
		ASSERT3P(block_cursor, <=, block_end);

		/*
		 * If we are at the end of this block, flush it and start
		 * writing again from the beginning.
		 */
		if (block_cursor == block_end) {
			dmu_buf_rele(db, tag);

			uint64_t next_word_offset = sm->sm_phys->smp_length;
			VERIFY0(dmu_buf_hold(sm->sm_os,
			    space_map_object(sm), next_word_offset,
			    tag, &db, DMU_READ_PREFETCH));
			dmu_buf_will_dirty(db, tx);

			/* update caller's dbuf */
			*dbp = db;

			ASSERT3U(db->db_size, ==, sm->sm_blksz);

			block_base = db->db_data;
			block_cursor = block_base;
			block_end = block_base +
			    (db->db_size / sizeof (uint64_t));
		}

		/*
		 * If we are writing a two-word entry and we only have one
		 * word left on this block, just pad it with an empty debug
		 * entry and write the two-word entry in the next block.
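		 * space_map_iterate() recognizes such padding by its zero
		 * TXG field and skips it without recording a TXG.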
		 */
		uint64_t *next_entry = block_cursor + 1;
		if (next_entry == block_end && words > 1) {
			ASSERT3U(words, ==, 2);
			*block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
			    SM_DEBUG_ACTION_ENCODE(0) |
			    SM_DEBUG_SYNCPASS_ENCODE(0) |
			    SM_DEBUG_TXG_ENCODE(0);
			block_cursor++;
			sm->sm_phys->smp_length += sizeof (uint64_t);
			ASSERT3P(block_cursor, ==, block_end);
			continue;
		}

		uint64_t run_len = MIN(size, run_max);
		switch (words) {
		case 1:
			*block_cursor = SM_OFFSET_ENCODE(start) |
			    SM_TYPE_ENCODE(maptype) |
			    SM_RUN_ENCODE(run_len);
			block_cursor++;
			break;
		case 2:
			/* write the first word of the entry */
			*block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) |
			    SM2_RUN_ENCODE(run_len) |
			    SM2_VDEV_ENCODE(vdev_id);
			block_cursor++;

			/* move on to the second word of the entry */
			ASSERT3P(block_cursor, <, block_end);
			*block_cursor = SM2_TYPE_ENCODE(maptype) |
			    SM2_OFFSET_ENCODE(start);
			block_cursor++;
			break;
		default:
			panic("%d-word space map entries are not supported",
			    words);
			break;
		}
		sm->sm_phys->smp_length += words * sizeof (uint64_t);

		start += run_len;
		size -= run_len;
	}
	ASSERT0(size);
}

/*
 * Note: The space map's dbuf must be dirty for the changes in sm_phys to
 * take effect.
 */
static void
space_map_write_impl(space_map_t *sm, zfs_range_tree_t *rt, maptype_t maptype,
    uint64_t vdev_id, dmu_tx_t *tx)
{
	spa_t *spa = tx->tx_pool->dp_spa;
	dmu_buf_t *db;

	space_map_write_intro_debug(sm, maptype, tx);

#ifdef ZFS_DEBUG
	/*
	 * We do this right after we write the intro debug entry
	 * because the estimate does not take it into account.
	 */
	uint64_t initial_objsize = sm->sm_phys->smp_length;
	uint64_t estimated_growth =
	    space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
	uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
#endif

	/*
	 * Find the offset right after the last word in the space map
	 * and use that to get a hold of the last block, so we can
	 * start appending to it.
	 */
	uint64_t next_word_offset = sm->sm_phys->smp_length;
	VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
	    next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
	ASSERT3U(db->db_size, ==, sm->sm_blksz);

	dmu_buf_will_dirty(db, tx);

	zfs_btree_t *t = &rt->rt_root;
	zfs_btree_index_t where;
	for (zfs_range_seg_t *rs = zfs_btree_first(t, &where); rs != NULL;
	    rs = zfs_btree_next(t, &where, &where)) {
		uint64_t offset = (zfs_rs_get_start(rs, rt) - sm->sm_start) >>
		    sm->sm_shift;
		uint64_t length = (zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt)) >> sm->sm_shift;
		uint8_t words = 1;

		/*
		 * We only write two-word entries when both of the following
		 * are true:
		 *
		 * [1] The feature is enabled.
		 * [2] The offset or run is too big for a single-word entry,
		 *     or the vdev_id is set (meaning not equal to
		 *     SM_NO_VDEVID).
		 *
		 * Note that for purposes of testing we've added the case that
		 * we write two-word entries occasionally when the feature is
		 * enabled and zfs_force_some_double_word_sm_entries has been
		 * set.
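		 * This keeps the two-word encoding exercised even on pools
		 * whose segments would otherwise always fit in single-word
		 * entries.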
		 */
		if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) &&
		    (offset >= (1ULL << SM_OFFSET_BITS) ||
		    length > SM_RUN_MAX ||
		    vdev_id != SM_NO_VDEVID ||
		    (zfs_force_some_double_word_sm_entries &&
		    random_in_range(100) == 0)))
			words = 2;

		space_map_write_seg(sm, zfs_rs_get_start(rs, rt),
		    zfs_rs_get_end(rs, rt), maptype, vdev_id, words, &db,
		    FTAG, tx);
	}

	dmu_buf_rele(db, FTAG);

#ifdef ZFS_DEBUG
	/*
	 * We expect our estimation to be based on the worst case
	 * scenario [see comment in space_map_estimate_optimal_size()].
	 * Therefore we expect the actual objsize to be equal to or less
	 * than whatever we estimated it to be.
	 */
	ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length);
#endif
}

/*
 * Note: This function manipulates the state of the given space map but
 * does not hold any locks implicitly. Thus the caller is responsible
 * for synchronizing writes to the space map.
 */
void
space_map_write(space_map_t *sm, zfs_range_tree_t *rt, maptype_t maptype,
    uint64_t vdev_id, dmu_tx_t *tx)
{
	ASSERT(dsl_pool_sync_context(dmu_objset_pool(sm->sm_os)));
	VERIFY3U(space_map_object(sm), !=, 0);

	dmu_buf_will_dirty(sm->sm_dbuf, tx);

	/*
	 * This field is no longer necessary since the in-core space map
	 * now contains the object number but is maintained for backwards
	 * compatibility.
	 */
	sm->sm_phys->smp_object = sm->sm_object;

	if (zfs_range_tree_is_empty(rt)) {
		VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object);
		return;
	}

	if (maptype == SM_ALLOC)
		sm->sm_phys->smp_alloc += zfs_range_tree_space(rt);
	else
		sm->sm_phys->smp_alloc -= zfs_range_tree_space(rt);

	uint64_t nodes = zfs_btree_numnodes(&rt->rt_root);
	uint64_t rt_space = zfs_range_tree_space(rt);

	space_map_write_impl(sm, rt, maptype, vdev_id, tx);

	/*
	 * Ensure that the space_map's accounting wasn't changed
	 * while we were in the middle of writing it out.
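	 * space_map_write_impl() only reads the range tree, so both the
	 * node count and the total space should be unchanged.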
	 */
	VERIFY3U(nodes, ==, zfs_btree_numnodes(&rt->rt_root));
	VERIFY3U(zfs_range_tree_space(rt), ==, rt_space);
}

static int
space_map_open_impl(space_map_t *sm)
{
	int error;
	u_longlong_t blocks;

	error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf);
	if (error)
		return (error);

	dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks);
	sm->sm_phys = sm->sm_dbuf->db_data;
	return (0);
}

int
space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
    uint64_t start, uint64_t size, uint8_t shift)
{
	space_map_t *sm;
	int error;

	ASSERT(*smp == NULL);
	ASSERT(os != NULL);
	ASSERT(object != 0);

	sm = kmem_alloc(sizeof (space_map_t), KM_SLEEP);

	sm->sm_start = start;
	sm->sm_size = size;
	sm->sm_shift = shift;
	sm->sm_os = os;
	sm->sm_object = object;
	sm->sm_blksz = 0;
	sm->sm_dbuf = NULL;
	sm->sm_phys = NULL;

	error = space_map_open_impl(sm);
	if (error != 0) {
		space_map_close(sm);
		return (error);
	}
	*smp = sm;

	return (0);
}

void
space_map_close(space_map_t *sm)
{
	if (sm == NULL)
		return;

	if (sm->sm_dbuf != NULL)
		dmu_buf_rele(sm->sm_dbuf, sm);
	sm->sm_dbuf = NULL;
	sm->sm_phys = NULL;

	kmem_free(sm, sizeof (*sm));
}

void
space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx)
{
	objset_t *os = sm->sm_os;
	spa_t *spa = dmu_objset_spa(os);
	dmu_object_info_t doi;

	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
	ASSERT(dmu_tx_is_syncing(tx));
	VERIFY3U(dmu_tx_get_txg(tx), <=, spa_final_dirty_txg(spa));

	dmu_object_info_from_db(sm->sm_dbuf, &doi);

	/*
	 * If the space map has the wrong bonus size (because
	 * SPA_FEATURE_SPACEMAP_HISTOGRAM has recently been enabled), or
	 * the wrong block size (because space_map_blksz has changed),
	 * free and re-allocate its object with the updated sizes.
	 *
	 * Otherwise, just truncate the current object.
	 */
	if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
	    doi.doi_bonus_size != sizeof (space_map_phys_t)) ||
	    doi.doi_data_block_size != blocksize ||
	    doi.doi_metadata_block_size != 1 << space_map_ibs) {
		zfs_dbgmsg("txg %llu, spa %s, sm %px, reallocating "
		    "object[%llu]: old bonus %llu, old blocksz %u",
		    (u_longlong_t)dmu_tx_get_txg(tx), spa_name(spa), sm,
		    (u_longlong_t)sm->sm_object,
		    (u_longlong_t)doi.doi_bonus_size,
		    doi.doi_data_block_size);

		space_map_free(sm, tx);
		dmu_buf_rele(sm->sm_dbuf, sm);

		sm->sm_object = space_map_alloc(sm->sm_os, blocksize, tx);
		VERIFY0(space_map_open_impl(sm));
	} else {
		VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx));

		/*
		 * If the spacemap is reallocated, its histogram
		 * will be reset. Do the same in the common case so that
		 * bugs related to the uncommon case do not go unnoticed.
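		 * The histogram is cleared here in-core; the dbuf is dirtied
		 * right after this block, before smp_length and smp_alloc
		 * are reset.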
		 */
		memset(sm->sm_phys->smp_histogram, 0,
		    sizeof (sm->sm_phys->smp_histogram));
	}

	dmu_buf_will_dirty(sm->sm_dbuf, tx);
	sm->sm_phys->smp_length = 0;
	sm->sm_phys->smp_alloc = 0;
}

uint64_t
space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	spa_t *spa = dmu_objset_spa(os);
	uint64_t object;
	int bonuslen;

	if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
		spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
		bonuslen = sizeof (space_map_phys_t);
		ASSERT3U(bonuslen, <=, dmu_bonus_max());
	} else {
		bonuslen = SPACE_MAP_SIZE_V0;
	}

	object = dmu_object_alloc_ibs(os, DMU_OT_SPACE_MAP, blocksize,
	    space_map_ibs, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);

	return (object);
}

void
space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx)
{
	spa_t *spa = dmu_objset_spa(os);
	if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
		dmu_object_info_t doi;

		VERIFY0(dmu_object_info(os, smobj, &doi));
		if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) {
			spa_feature_decr(spa,
			    SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
		}
	}

	VERIFY0(dmu_object_free(os, smobj, tx));
}

void
space_map_free(space_map_t *sm, dmu_tx_t *tx)
{
	if (sm == NULL)
		return;

	space_map_free_obj(sm->sm_os, space_map_object(sm), tx);
	sm->sm_object = 0;
}

/*
 * Given a range tree, it makes a worst-case estimate of how much space
 * the tree's segments would take if they were written to the given
 * space map.
 */
uint64_t
space_map_estimate_optimal_size(space_map_t *sm, zfs_range_tree_t *rt,
    uint64_t vdev_id)
{
	spa_t *spa = dmu_objset_spa(sm->sm_os);
	uint64_t shift = sm->sm_shift;
	uint64_t *histogram = rt->rt_histogram;
	uint64_t entries_for_seg = 0;

	/*
	 * In order to get a quick estimate of the optimal size that this
	 * range tree would have on-disk as a space map, we iterate through
	 * its histogram buckets instead of iterating through its nodes.
	 *
	 * Note that this is a highest-bound/worst-case estimate for the
	 * following reasons:
	 *
	 * 1] We assume that we always add a debug padding for each block
	 *    we write and we also assume that we start at the last word
	 *    of a block attempting to write a two-word entry.
	 * 2] Rounding up errors due to the way segments are distributed
	 *    in the buckets of the range tree's histogram.
	 * 3] The activation of zfs_force_some_double_word_sm_entries
	 *    (tunable) when testing.
	 *
	 * = Math and Rounding Errors =
	 *
	 * The rt_histogram[i] bucket of a range tree represents the number
	 * of entries in [2^i, (2^(i+1))-1] of that range_tree. Given
	 * that, we want to divide the buckets into groups: buckets that
	 * can be represented using a single-word entry, ones that can
	 * be represented with a double-word entry, and ones that can
	 * only be represented with multiple two-word entries.
	 *
	 * [Note that if the new encoding feature is not enabled there
	 * are only two groups: single-word entry buckets and multiple
	 * single-word entry buckets. The information below assumes
	 * two-word entries enabled, but it can easily be applied when
	 * the feature is not enabled]
	 *
	 * To find the highest bucket that can be represented with a
	 * single-word entry we look at the maximum run that such entry
	 * can have, which is 2^(SM_RUN_BITS + sm_shift) [remember that
	 * the run of a space map entry is shifted by sm_shift, thus we
	 * add it to the exponent]. This way, excluding the value of the
	 * maximum run that can be represented by a single-word entry,
	 * all runs that are smaller exist in buckets 0 to
	 * SM_RUN_BITS + shift - 1.
	 *
	 * To find the highest bucket that can be represented with a
	 * double-word entry, we follow the same approach. Finally, any
	 * bucket higher than that is represented with multiple two-word
	 * entries. To be more specific, if the highest bucket whose
	 * segments can be represented with a single two-word entry is X,
	 * then bucket X+1 will need 2 two-word entries for each of its
	 * segments, X+2 will need 4, X+3 will need 8, ...etc.
	 *
	 * With all of the above we make our estimation based on bucket
	 * groups. There is a rounding error though. As we mentioned in
	 * the example with the one-word entry, the maximum run that can
	 * be represented in a one-word entry 2^(SM_RUN_BITS + shift) is
	 * not part of bucket SM_RUN_BITS + shift - 1. Thus, segments of
	 * that length fall into the next bucket (and bucket group) where
	 * we start counting two-word entries and this is one more reason
	 * why the estimated size may end up being bigger than the actual
	 * size written.
	 */
	uint64_t size = 0;
	uint64_t idx = 0;

	if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) ||
	    (vdev_id == SM_NO_VDEVID && sm->sm_size < SM_OFFSET_MAX)) {

		/*
		 * If we are trying to force some double word entries just
		 * assume the worst-case of every single word entry being
		 * written as a double word entry.
		 */
		uint64_t entry_size =
		    (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) &&
		    zfs_force_some_double_word_sm_entries) ?
		    (2 * sizeof (uint64_t)) : sizeof (uint64_t);

		uint64_t single_entry_max_bucket = SM_RUN_BITS + shift - 1;
		for (; idx <= single_entry_max_bucket; idx++)
			size += histogram[idx] * entry_size;

		if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)) {
			for (; idx < ZFS_RANGE_TREE_HISTOGRAM_SIZE; idx++) {
				ASSERT3U(idx, >=, single_entry_max_bucket);
				entries_for_seg =
				    1ULL << (idx - single_entry_max_bucket);
				size += histogram[idx] *
				    entries_for_seg * entry_size;
			}
			return (size);
		}
	}

	ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2));

	uint64_t double_entry_max_bucket = SM2_RUN_BITS + shift - 1;
	for (; idx <= double_entry_max_bucket; idx++)
		size += histogram[idx] * 2 * sizeof (uint64_t);

	for (; idx < ZFS_RANGE_TREE_HISTOGRAM_SIZE; idx++) {
		ASSERT3U(idx, >=, double_entry_max_bucket);
		entries_for_seg = 1ULL << (idx - double_entry_max_bucket);
		size += histogram[idx] *
		    entries_for_seg * 2 * sizeof (uint64_t);
	}

	/*
	 * Assume the worst case where we start with the padding at the end
	 * of the current block and we add an extra padding entry at the end
	 * of all subsequent blocks.
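	 * That is one padding word for each block's worth of estimated
	 * entries, plus one for the block we are currently appending to.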
	 */
	size += ((size / sm->sm_blksz) + 1) * sizeof (uint64_t);

	return (size);
}

uint64_t
space_map_object(space_map_t *sm)
{
	return (sm != NULL ? sm->sm_object : 0);
}

int64_t
space_map_allocated(space_map_t *sm)
{
	return (sm != NULL ? sm->sm_phys->smp_alloc : 0);
}

uint64_t
space_map_length(space_map_t *sm)
{
	return (sm != NULL ? sm->sm_phys->smp_length : 0);
}

uint64_t
space_map_nblocks(space_map_t *sm)
{
	if (sm == NULL)
		return (0);
	return (DIV_ROUND_UP(space_map_length(sm), sm->sm_blksz));
}
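
/*
 * Illustrative sketch (not compiled here): a typical consumer opens a
 * space map object, loads its SM_ALLOC segments into a range tree, and
 * closes it again. The object number, offsets, and the range tree setup
 * below are placeholders, not values defined by this file.
 *
 *	space_map_t *sm = NULL;
 *	VERIFY0(space_map_open(&sm, os, smobj, start, size, shift));
 *
 *	zfs_range_tree_t *rt = ...;	(an empty range tree)
 *	VERIFY0(space_map_load(sm, rt, SM_ALLOC));
 *
 *	... use the segments in rt ...
 *
 *	zfs_range_tree_vacate(rt, NULL, NULL);
 *	space_map_close(sm);
 */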