/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/dsl_pool.h>
#include <sys/zio.h>
#include <sys/space_map.h>
#include <sys/zfeature.h>

/*
 * Note on space map block size:
 *
 * The data for a given space map can be kept on blocks of any size.
 * Larger blocks entail fewer I/O operations, but they also cause the
 * DMU to keep more data in-core, and also to waste more I/O bandwidth
 * when only a few blocks have changed since the last transaction group.
 */

/*
 * Enabled whenever we want to stress test the use of double-word
 * space map entries.
 */
boolean_t zfs_force_some_double_word_sm_entries = B_FALSE;

/*
 * Override the default indirect block size of 128K, instead use 16K for
 * spacemaps (2^14 bytes). This dramatically reduces write inflation since
 * appending to a spacemap typically has to write one data block (4KB) and one
 * or two indirect blocks (16K-32K, rather than 128K).
 */
int space_map_ibs = 14;
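
/*
 * Helpers that classify a raw space map word by its decoded prefix:
 * a debug entry, the first word of a two-word (SM2) entry, or a
 * single-word entry.
 */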
boolean_t
sm_entry_is_debug(uint64_t e)
{
	return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX);
}

boolean_t
sm_entry_is_single_word(uint64_t e)
{
	uint8_t prefix = SM_PREFIX_DECODE(e);
	return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX);
}

boolean_t
sm_entry_is_double_word(uint64_t e)
{
	return (SM_PREFIX_DECODE(e) == SM2_PREFIX);
}

/*
 * Iterate through the space map, invoking the callback on each (non-debug)
 * space map entry. Stop after reading 'end' bytes of the space map.
 */
int
space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
{
	uint64_t blksz = sm->sm_blksz;

	ASSERT3U(blksz, !=, 0);
	ASSERT3U(end, <=, space_map_length(sm));
	ASSERT0(P2PHASE(end, sizeof (uint64_t)));

	dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end,
	    ZIO_PRIORITY_SYNC_READ);

	int error = 0;
	uint64_t txg = 0, sync_pass = 0;
	for (uint64_t block_base = 0; block_base < end && error == 0;
	    block_base += blksz) {
		dmu_buf_t *db;
		error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
		    block_base, FTAG, &db, DMU_READ_PREFETCH);
		if (error != 0)
			return (error);

		uint64_t *block_start = db->db_data;
		uint64_t block_length = MIN(end - block_base, blksz);
		uint64_t *block_end = block_start +
		    (block_length / sizeof (uint64_t));

		VERIFY0(P2PHASE(block_length, sizeof (uint64_t)));
		VERIFY3U(block_length, !=, 0);
		ASSERT3U(blksz, ==, db->db_size);

		for (uint64_t *block_cursor = block_start;
		    block_cursor < block_end && error == 0; block_cursor++) {
			uint64_t e = *block_cursor;

			if (sm_entry_is_debug(e)) {
				/*
				 * Debug entries are only needed to record the
				 * current TXG and sync pass if available.
				 *
				 * Note though that sometimes there can be
				 * debug entries that are used as padding
				 * at the end of space map blocks in order
				 * to not split a double-word entry in the
				 * middle between two blocks. These entries
				 * have their TXG field set to 0 and we
				 * skip them without recording the TXG.
				 * [see comment in space_map_write_seg()]
				 */
				uint64_t e_txg = SM_DEBUG_TXG_DECODE(e);
				if (e_txg != 0) {
					txg = e_txg;
					sync_pass = SM_DEBUG_SYNCPASS_DECODE(e);
				} else {
					ASSERT0(SM_DEBUG_SYNCPASS_DECODE(e));
				}
				continue;
			}

			uint64_t raw_offset, raw_run, vdev_id;
			maptype_t type;
			if (sm_entry_is_single_word(e)) {
				type = SM_TYPE_DECODE(e);
				vdev_id = SM_NO_VDEVID;
				raw_offset = SM_OFFSET_DECODE(e);
				raw_run = SM_RUN_DECODE(e);
			} else {
				/* it is a two-word entry */
				ASSERT(sm_entry_is_double_word(e));
				raw_run = SM2_RUN_DECODE(e);
				vdev_id = SM2_VDEV_DECODE(e);

				/* move on to the second word */
				block_cursor++;
				e = *block_cursor;
				VERIFY3P(block_cursor, <=, block_end);

				type = SM2_TYPE_DECODE(e);
				raw_offset = SM2_OFFSET_DECODE(e);
			}

			uint64_t entry_offset = (raw_offset << sm->sm_shift) +
			    sm->sm_start;
			uint64_t entry_run = raw_run << sm->sm_shift;

			VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
			VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
			ASSERT3U(entry_offset, >=, sm->sm_start);
			ASSERT3U(entry_offset, <, sm->sm_start + sm->sm_size);
			ASSERT3U(entry_run, <=, sm->sm_size);
			ASSERT3U(entry_offset + entry_run, <=,
			    sm->sm_start + sm->sm_size);

			space_map_entry_t sme = {
			    .sme_type = type,
			    .sme_vdev = vdev_id,
			    .sme_offset = entry_offset,
			    .sme_run = entry_run,
			    .sme_txg = txg,
			    .sme_sync_pass = sync_pass
			};
			error = callback(&sme, arg);
		}
		dmu_buf_rele(db, FTAG);
	}
	return (error);
}
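
/*
 * Example (illustrative, not part of the original source): a minimal
 * sm_cb_t callback that tallies allocated space, e.g. invoked as
 * space_map_iterate(sm, space_map_length(sm), count_alloc_cb, &total):
 *
 *	static int
 *	count_alloc_cb(space_map_entry_t *sme, void *arg)
 *	{
 *		uint64_t *total = arg;
 *
 *		if (sme->sme_type == SM_ALLOC)
 *			*total += sme->sme_run;
 *		return (0);
 *	}
 */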

/*
 * Reads the entries from the last block of the space map into
 * buf in reverse order. Populates nwords with the number of words
 * in the last block.
 *
 * Refer to block comment within space_map_incremental_destroy()
 * to understand why this function is needed.
 */
static int
space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
    uint64_t bufsz, uint64_t *nwords)
{
	int error = 0;
	dmu_buf_t *db;

	/*
	 * Find the offset of the last word in the space map and use
	 * that to read the last block of the space map with
	 * dmu_buf_hold().
	 */
	uint64_t last_word_offset =
	    sm->sm_phys->smp_length - sizeof (uint64_t);
	error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
	    FTAG, &db, DMU_READ_NO_PREFETCH);
	if (error != 0)
		return (error);

	ASSERT3U(sm->sm_object, ==, db->db_object);
	ASSERT3U(sm->sm_blksz, ==, db->db_size);
	ASSERT3U(bufsz, >=, db->db_size);
	ASSERT(nwords != NULL);

	uint64_t *words = db->db_data;
	*nwords =
	    (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);

	ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));

	uint64_t n = *nwords;
	uint64_t j = n - 1;
	for (uint64_t i = 0; i < n; i++) {
		uint64_t entry = words[i];
		if (sm_entry_is_double_word(entry)) {
			/*
			 * Since we are populating the buffer backwards
			 * we have to be extra careful and add the two
			 * words of the double-word entry in the right
			 * order.
			 */
			ASSERT3U(j, >, 0);
			buf[j - 1] = entry;

			i++;
			ASSERT3U(i, <, n);
			entry = words[i];
			buf[j] = entry;
			j -= 2;
		} else {
			ASSERT(sm_entry_is_debug(entry) ||
			    sm_entry_is_single_word(entry));
			buf[j] = entry;
			j--;
		}
	}

	/*
	 * Assert that we wrote backwards all the
	 * way to the beginning of the buffer.
	 */
	ASSERT3S(j, ==, -1);

	dmu_buf_rele(db, FTAG);
	return (error);
}
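
/*
 * Illustrative example (not from the original source): if the last
 * block holds the words [S1, D1a, D1b, S2], where S* are single-word
 * entries and D1a/D1b are the two words of one double-word entry, the
 * reversed buffer becomes [S2, D1a, D1b, S1] -- the entries are
 * reversed, but the two words of a double-word entry keep their
 * relative order so they can still be decoded front to back.
 */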

/*
 * Note: This function performs destructive actions - specifically
 * it deletes entries from the end of the space map. Thus, callers
 * should ensure that they are holding the appropriate locks for
 * the space map that they provide.
 */
int
space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
    dmu_tx_t *tx)
{
	uint64_t bufsz = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
	uint64_t *buf = zio_buf_alloc(bufsz);

	dmu_buf_will_dirty(sm->sm_dbuf, tx);

	/*
	 * Ideally we would want to iterate from the beginning of the
	 * space map to the end in incremental steps. The issue with this
	 * approach is that we don't have any field on-disk that points
	 * us where to start between each step. We could try zeroing out
	 * entries that we've destroyed, but this doesn't work either as
	 * an entry that is 0 is a valid one (ALLOC for range [0x0:0x200]).
	 *
	 * As a result, we destroy its entries incrementally starting from
	 * the end after applying the callback to each of them.
	 *
	 * The problem with this approach is that we cannot literally
	 * iterate through the words in the space map backwards as we
	 * can't distinguish two-word space map entries from their second
	 * word. Thus we do the following:
	 *
	 * 1] We get all the entries from the last block of the space map
	 *    and put them into a buffer in reverse order. This way the
	 *    last entry comes first in the buffer, the second to last is
	 *    second, etc.
	 * 2] We iterate through the entries in the buffer and we apply
	 *    the callback to each one. As we move from entry to entry we
	 *    decrease the size of the space map, effectively deleting
	 *    each entry.
	 * 3] If there are no more entries in the space map or the callback
	 *    returns a value other than 0, we stop iterating over the
	 *    space map. If there are entries remaining and the callback
	 *    returned 0, we go back to step [1].
	 */
	int error = 0;
	while (space_map_length(sm) > 0 && error == 0) {
		uint64_t nwords = 0;
		error = space_map_reversed_last_block_entries(sm, buf, bufsz,
		    &nwords);
		if (error != 0)
			break;

		ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t));

		for (uint64_t i = 0; i < nwords; i++) {
			uint64_t e = buf[i];

			if (sm_entry_is_debug(e)) {
				sm->sm_phys->smp_length -= sizeof (uint64_t);
				continue;
			}

			int words = 1;
			uint64_t raw_offset, raw_run, vdev_id;
			maptype_t type;
			if (sm_entry_is_single_word(e)) {
				type = SM_TYPE_DECODE(e);
				vdev_id = SM_NO_VDEVID;
				raw_offset = SM_OFFSET_DECODE(e);
				raw_run = SM_RUN_DECODE(e);
			} else {
				ASSERT(sm_entry_is_double_word(e));
				words = 2;

				raw_run = SM2_RUN_DECODE(e);
				vdev_id = SM2_VDEV_DECODE(e);

				/* move to the second word */
				i++;
				e = buf[i];

				ASSERT3P(i, <=, nwords);

				type = SM2_TYPE_DECODE(e);
				raw_offset = SM2_OFFSET_DECODE(e);
			}

			uint64_t entry_offset =
			    (raw_offset << sm->sm_shift) + sm->sm_start;
			uint64_t entry_run = raw_run << sm->sm_shift;

			VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
			VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
			VERIFY3U(entry_offset, >=, sm->sm_start);
			VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size);
			VERIFY3U(entry_run, <=, sm->sm_size);
			VERIFY3U(entry_offset + entry_run, <=,
			    sm->sm_start + sm->sm_size);

			space_map_entry_t sme = {
			    .sme_type = type,
			    .sme_vdev = vdev_id,
			    .sme_offset = entry_offset,
			    .sme_run = entry_run
			};
			error = callback(&sme, arg);
			if (error != 0)
				break;

			if (type == SM_ALLOC)
				sm->sm_phys->smp_alloc -= entry_run;
			else
				sm->sm_phys->smp_alloc += entry_run;
			sm->sm_phys->smp_length -= words * sizeof (uint64_t);
		}
	}

	if (space_map_length(sm) == 0) {
		ASSERT0(error);
		ASSERT0(space_map_allocated(sm));
	}

	zio_buf_free(buf, bufsz);
	return (error);
}
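
/*
 * Example (illustrative, not from the original source): a destroy
 * callback might hand each entry back to some in-core structure before
 * the entry is deleted from the space map, e.g.:
 *
 *	static int
 *	destroy_cb(space_map_entry_t *sme, void *arg)
 *	{
 *		range_tree_t *rt = arg;
 *
 *		if (sme->sme_type == SM_FREE)
 *			range_tree_add(rt, sme->sme_offset, sme->sme_run);
 *		return (0);
 *	}
 */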

typedef struct space_map_load_arg {
	space_map_t	*smla_sm;
	range_tree_t	*smla_rt;
	maptype_t	smla_type;
} space_map_load_arg_t;

static int
space_map_load_callback(space_map_entry_t *sme, void *arg)
{
	space_map_load_arg_t *smla = arg;
	if (sme->sme_type == smla->smla_type) {
		VERIFY3U(range_tree_space(smla->smla_rt) + sme->sme_run, <=,
		    smla->smla_sm->sm_size);
		range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run);
	} else {
		range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run);
	}

	return (0);
}

/*
 * Load the space map into the range tree, like space_map_load(), but
 * only read the first 'length' bytes of the space map.
 */
int
space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
    uint64_t length)
{
	space_map_load_arg_t smla;

	VERIFY0(range_tree_space(rt));

	if (maptype == SM_FREE)
		range_tree_add(rt, sm->sm_start, sm->sm_size);

	smla.smla_rt = rt;
	smla.smla_sm = sm;
	smla.smla_type = maptype;
	int err = space_map_iterate(sm, length,
	    space_map_load_callback, &smla);

	if (err != 0)
		range_tree_vacate(rt, NULL, NULL);

	return (err);
}

/*
 * Load the on-disk space map into the specified range tree. Segments of
 * maptype are added to the range tree, other segment types are removed.
 */
int
space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
{
	return (space_map_load_length(sm, rt, maptype, space_map_length(sm)));
}
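
/*
 * Example (illustrative, not from the original source; 'rt' is assumed
 * to be an empty range tree created by the caller):
 *
 *	int error = space_map_load(sm, rt, SM_FREE);
 *	if (error == 0)
 *		... rt now describes every free segment tracked by sm ...
 */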

void
space_map_histogram_clear(space_map_t *sm)
{
	if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
		return;

	bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram));
}

boolean_t
space_map_histogram_verify(space_map_t *sm, range_tree_t *rt)
{
	/*
	 * Verify that the in-core range tree does not have any
	 * ranges smaller than our sm_shift size.
	 */
	for (int i = 0; i < sm->sm_shift; i++) {
		if (rt->rt_histogram[i] != 0)
			return (B_FALSE);
	}
	return (B_TRUE);
}

void
space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx)
{
	int idx = 0;

	ASSERT(dmu_tx_is_syncing(tx));
	VERIFY3U(space_map_object(sm), !=, 0);

	if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
		return;

	dmu_buf_will_dirty(sm->sm_dbuf, tx);

	ASSERT(space_map_histogram_verify(sm, rt));
	/*
	 * Transfer the content of the range tree histogram to the space
	 * map histogram. The space map histogram contains 32 buckets ranging
	 * from 2^sm_shift to 2^(32+sm_shift-1). The range tree,
	 * however, can represent ranges from 2^0 to 2^63. Since the space
	 * map only cares about allocatable blocks (minimum of sm_shift) we
	 * can safely ignore all ranges in the range tree smaller than sm_shift.
	 */
	for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {

		/*
		 * Since the largest histogram bucket in the space map is
		 * 2^(32+sm_shift-1), we need to normalize the values in
		 * the range tree for any bucket larger than that size. For
		 * example given an sm_shift of 9, ranges larger than 2^40
		 * would get normalized as if they were 1TB ranges. Assume
		 * the range tree had a count of 5 in the 2^44 (16TB) bucket,
		 * the calculation below would normalize this to 5 * 2^4 (16).
		 */
		ASSERT3U(i, >=, idx + sm->sm_shift);
		sm->sm_phys->smp_histogram[idx] +=
		    rt->rt_histogram[i] << (i - idx - sm->sm_shift);

		/*
		 * Increment the space map's index as long as we haven't
		 * reached the maximum bucket size. Accumulate all ranges
		 * larger than the max bucket size into the last bucket.
		 */
		if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
			ASSERT3U(idx + sm->sm_shift, ==, i);
			idx++;
			ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
		}
	}
}

static void
space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx)
{
	dmu_buf_will_dirty(sm->sm_dbuf, tx);

	uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
	    SM_DEBUG_ACTION_ENCODE(maptype) |
	    SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) |
	    SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));

	dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length,
	    sizeof (dentry), &dentry, tx);

	sm->sm_phys->smp_length += sizeof (dentry);
}
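
/*
 * Each batch of entries appended to a space map is thus preceded by a
 * single debug word recording the maptype, the current sync pass, and
 * the TXG; space_map_iterate() reads these back to stamp sme_txg and
 * sme_sync_pass on the entries that follow.
 */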

/*
 * Writes one or more entries given a segment.
 *
 * Note: The function may release the dbuf from the pointer initially
 * passed to it, and return a different dbuf. Also, the space map's
 * dbuf must be dirty for the changes in sm_phys to take effect.
 */
static void
space_map_write_seg(space_map_t *sm, uint64_t rstart, uint64_t rend,
    maptype_t maptype, uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp,
    void *tag, dmu_tx_t *tx)
{
	ASSERT3U(words, !=, 0);
	ASSERT3U(words, <=, 2);

	/* ensure the vdev_id can be represented by the space map */
	ASSERT3U(vdev_id, <=, SM_NO_VDEVID);

	/*
	 * if this is a single word entry, ensure that no vdev was
	 * specified.
	 */
	IMPLY(words == 1, vdev_id == SM_NO_VDEVID);

	dmu_buf_t *db = *dbp;
	ASSERT3U(db->db_size, ==, sm->sm_blksz);

	uint64_t *block_base = db->db_data;
	uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
	uint64_t *block_cursor = block_base +
	    (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);

	ASSERT3P(block_cursor, <=, block_end);

	uint64_t size = (rend - rstart) >> sm->sm_shift;
	uint64_t start = (rstart - sm->sm_start) >> sm->sm_shift;
	uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX;

	ASSERT3U(rstart, >=, sm->sm_start);
	ASSERT3U(rstart, <, sm->sm_start + sm->sm_size);
	ASSERT3U(rend - rstart, <=, sm->sm_size);
	ASSERT3U(rend, <=, sm->sm_start + sm->sm_size);

	while (size != 0) {
		ASSERT3P(block_cursor, <=, block_end);

		/*
		 * If we are at the end of this block, flush it and start
		 * writing again from the beginning.
		 */
		if (block_cursor == block_end) {
			dmu_buf_rele(db, tag);

			uint64_t next_word_offset = sm->sm_phys->smp_length;
			VERIFY0(dmu_buf_hold(sm->sm_os,
			    space_map_object(sm), next_word_offset,
			    tag, &db, DMU_READ_PREFETCH));
			dmu_buf_will_dirty(db, tx);

			/* update caller's dbuf */
			*dbp = db;

			ASSERT3U(db->db_size, ==, sm->sm_blksz);

			block_base = db->db_data;
			block_cursor = block_base;
			block_end = block_base +
			    (db->db_size / sizeof (uint64_t));
		}

		/*
		 * If we are writing a two-word entry and we only have one
		 * word left on this block, just pad it with an empty debug
		 * entry and write the two-word entry in the next block.
		 */
		uint64_t *next_entry = block_cursor + 1;
		if (next_entry == block_end && words > 1) {
			ASSERT3U(words, ==, 2);
			*block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
			    SM_DEBUG_ACTION_ENCODE(0) |
			    SM_DEBUG_SYNCPASS_ENCODE(0) |
			    SM_DEBUG_TXG_ENCODE(0);
			block_cursor++;
			sm->sm_phys->smp_length += sizeof (uint64_t);
			ASSERT3P(block_cursor, ==, block_end);
			continue;
		}

		uint64_t run_len = MIN(size, run_max);
		switch (words) {
		case 1:
			*block_cursor = SM_OFFSET_ENCODE(start) |
			    SM_TYPE_ENCODE(maptype) |
			    SM_RUN_ENCODE(run_len);
			block_cursor++;
			break;
		case 2:
			/* write the first word of the entry */
			*block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) |
			    SM2_RUN_ENCODE(run_len) |
			    SM2_VDEV_ENCODE(vdev_id);
			block_cursor++;

			/* move on to the second word of the entry */
			ASSERT3P(block_cursor, <, block_end);
			*block_cursor = SM2_TYPE_ENCODE(maptype) |
			    SM2_OFFSET_ENCODE(start);
			block_cursor++;
			break;
		default:
			panic("%d-word space map entries are not supported",
			    words);
			break;
		}
		sm->sm_phys->smp_length += words * sizeof (uint64_t);

		start += run_len;
		size -= run_len;
	}
	ASSERT0(size);
}
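
/*
 * Worked example (illustrative, not from the original source): with
 * sm_start == 0 and sm_shift == 9, a free segment covering
 * [0x2000, 0x3000) is written as the single word
 * SM_OFFSET_ENCODE(0x10) | SM_TYPE_ENCODE(SM_FREE) | SM_RUN_ENCODE(8),
 * since both the offset and the run are expressed in 512-byte units.
 */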

/*
 * Note: The space map's dbuf must be dirty for the changes in sm_phys to
 * take effect.
 */
static void
space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
    uint64_t vdev_id, dmu_tx_t *tx)
{
	spa_t *spa = tx->tx_pool->dp_spa;
	dmu_buf_t *db;

	space_map_write_intro_debug(sm, maptype, tx);

#ifdef ZFS_DEBUG
	/*
	 * We do this right after we write the intro debug entry
	 * because the estimate does not take it into account.
	 */
	uint64_t initial_objsize = sm->sm_phys->smp_length;
	uint64_t estimated_growth =
	    space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
	uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
#endif

	/*
	 * Find the offset right after the last word in the space map
	 * and use that to get a hold of the last block, so we can
	 * start appending to it.
	 */
	uint64_t next_word_offset = sm->sm_phys->smp_length;
	VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
	    next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
	ASSERT3U(db->db_size, ==, sm->sm_blksz);

	dmu_buf_will_dirty(db, tx);

	zfs_btree_t *t = &rt->rt_root;
	zfs_btree_index_t where;
	for (range_seg_t *rs = zfs_btree_first(t, &where); rs != NULL;
	    rs = zfs_btree_next(t, &where, &where)) {
		uint64_t offset = (rs_get_start(rs, rt) - sm->sm_start) >>
		    sm->sm_shift;
		uint64_t length = (rs_get_end(rs, rt) - rs_get_start(rs, rt)) >>
		    sm->sm_shift;
		uint8_t words = 1;

		/*
		 * We only write two-word entries when both of the following
		 * are true:
		 *
		 * [1] The feature is enabled.
		 * [2] The offset or run is too big for a single-word entry,
		 *     or the vdev_id is set (meaning not equal to
		 *     SM_NO_VDEVID).
		 *
		 * Note that for purposes of testing we've added the case that
		 * we write two-word entries occasionally when the feature is
		 * enabled and zfs_force_some_double_word_sm_entries has been
		 * set.
		 */
		if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) &&
		    (offset >= (1ULL << SM_OFFSET_BITS) ||
		    length > SM_RUN_MAX ||
		    vdev_id != SM_NO_VDEVID ||
		    (zfs_force_some_double_word_sm_entries &&
		    spa_get_random(100) == 0)))
			words = 2;

		space_map_write_seg(sm, rs_get_start(rs, rt), rs_get_end(rs,
		    rt), maptype, vdev_id, words, &db, FTAG, tx);
	}

	dmu_buf_rele(db, FTAG);

#ifdef ZFS_DEBUG
	/*
	 * We expect our estimation to be based on the worst case
	 * scenario [see comment in space_map_estimate_optimal_size()].
	 * Therefore we expect the actual objsize to be equal to or less
	 * than whatever we estimated it to be.
	 */
	ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length);
#endif
}

/*
 * Note: This function manipulates the state of the given space map but
 * does not hold any locks implicitly. Thus the caller is responsible
 * for synchronizing writes to the space map.
 */
void
space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
    uint64_t vdev_id, dmu_tx_t *tx)
{
	ASSERT(dsl_pool_sync_context(dmu_objset_pool(sm->sm_os)));
	VERIFY3U(space_map_object(sm), !=, 0);

	dmu_buf_will_dirty(sm->sm_dbuf, tx);

	/*
	 * This field is no longer necessary since the in-core space map
	 * now contains the object number but is maintained for backwards
	 * compatibility.
	 */
	sm->sm_phys->smp_object = sm->sm_object;

	if (range_tree_is_empty(rt)) {
		VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object);
		return;
	}

	if (maptype == SM_ALLOC)
		sm->sm_phys->smp_alloc += range_tree_space(rt);
	else
		sm->sm_phys->smp_alloc -= range_tree_space(rt);

	uint64_t nodes = zfs_btree_numnodes(&rt->rt_root);
	uint64_t rt_space = range_tree_space(rt);

	space_map_write_impl(sm, rt, maptype, vdev_id, tx);

	/*
	 * Ensure that the space_map's accounting wasn't changed
	 * while we were in the middle of writing it out.
	 */
	VERIFY3U(nodes, ==, zfs_btree_numnodes(&rt->rt_root));
	VERIFY3U(range_tree_space(rt), ==, rt_space);
}
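
/*
 * Example (illustrative, not from the original source): in syncing
 * context a caller might flush both of the trees it has accumulated
 * for this txg, e.g.:
 *
 *	space_map_write(sm, allocs, SM_ALLOC, SM_NO_VDEVID, tx);
 *	space_map_write(sm, frees, SM_FREE, SM_NO_VDEVID, tx);
 */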

static int
space_map_open_impl(space_map_t *sm)
{
	int error;
	u_longlong_t blocks;

	error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf);
	if (error)
		return (error);

	dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks);
	sm->sm_phys = sm->sm_dbuf->db_data;
	return (0);
}

int
space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
    uint64_t start, uint64_t size, uint8_t shift)
{
	space_map_t *sm;
	int error;

	ASSERT(*smp == NULL);
	ASSERT(os != NULL);
	ASSERT(object != 0);

	sm = kmem_alloc(sizeof (space_map_t), KM_SLEEP);

	sm->sm_start = start;
	sm->sm_size = size;
	sm->sm_shift = shift;
	sm->sm_os = os;
	sm->sm_object = object;
	sm->sm_blksz = 0;
	sm->sm_dbuf = NULL;
	sm->sm_phys = NULL;

	error = space_map_open_impl(sm);
	if (error != 0) {
		space_map_close(sm);
		return (error);
	}
	*smp = sm;

	return (0);
}

void
space_map_close(space_map_t *sm)
{
	if (sm == NULL)
		return;

	if (sm->sm_dbuf != NULL)
		dmu_buf_rele(sm->sm_dbuf, sm);
	sm->sm_dbuf = NULL;
	sm->sm_phys = NULL;

	kmem_free(sm, sizeof (*sm));
}
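
/*
 * Example (illustrative, not from the original source; the object
 * number and geometry are hypothetical):
 *
 *	space_map_t *sm = NULL;
 *	int error = space_map_open(&sm, os, smobj, 0, size, shift);
 *	if (error == 0) {
 *		... use sm ...
 *		space_map_close(sm);
 *	}
 */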

void
space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx)
{
	objset_t *os = sm->sm_os;
	spa_t *spa = dmu_objset_spa(os);
	dmu_object_info_t doi;

	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
	ASSERT(dmu_tx_is_syncing(tx));
	VERIFY3U(dmu_tx_get_txg(tx), <=, spa_final_dirty_txg(spa));

	dmu_object_info_from_db(sm->sm_dbuf, &doi);

	/*
	 * If the space map has the wrong bonus size (because
	 * SPA_FEATURE_SPACEMAP_HISTOGRAM has recently been enabled), or
	 * the wrong block size (because space_map_blksz has changed),
	 * free and re-allocate its object with the updated sizes.
	 *
	 * Otherwise, just truncate the current object.
	 */
	if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
	    doi.doi_bonus_size != sizeof (space_map_phys_t)) ||
	    doi.doi_data_block_size != blocksize ||
	    doi.doi_metadata_block_size != 1 << space_map_ibs) {
		zfs_dbgmsg("txg %llu, spa %s, sm %px, reallocating "
		    "object[%llu]: old bonus %u, old blocksz %u",
		    dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object,
		    doi.doi_bonus_size, doi.doi_data_block_size);

		space_map_free(sm, tx);
		dmu_buf_rele(sm->sm_dbuf, sm);

		sm->sm_object = space_map_alloc(sm->sm_os, blocksize, tx);
		VERIFY0(space_map_open_impl(sm));
	} else {
		VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx));

		/*
		 * If the spacemap is reallocated, its histogram
		 * will be reset. Do the same in the common case so that
		 * bugs related to the uncommon case do not go unnoticed.
		 */
		bzero(sm->sm_phys->smp_histogram,
		    sizeof (sm->sm_phys->smp_histogram));
	}

	dmu_buf_will_dirty(sm->sm_dbuf, tx);
	sm->sm_phys->smp_length = 0;
	sm->sm_phys->smp_alloc = 0;
}
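
/*
 * Allocate a new space map object in 'os' and return its object number.
 * When SPA_FEATURE_SPACEMAP_HISTOGRAM is enabled, the object gets a
 * histogram-sized bonus buffer and the feature refcount is bumped.
 */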
uint64_t
space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	spa_t *spa = dmu_objset_spa(os);
	uint64_t object;
	int bonuslen;

	if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
		spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
		bonuslen = sizeof (space_map_phys_t);
		ASSERT3U(bonuslen, <=, dmu_bonus_max());
	} else {
		bonuslen = SPACE_MAP_SIZE_V0;
	}

	object = dmu_object_alloc_ibs(os, DMU_OT_SPACE_MAP, blocksize,
	    space_map_ibs, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);

	return (object);
}

void
space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx)
{
	spa_t *spa = dmu_objset_spa(os);
	if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
		dmu_object_info_t doi;

		VERIFY0(dmu_object_info(os, smobj, &doi));
		if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) {
			spa_feature_decr(spa,
			    SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
		}
	}

	VERIFY0(dmu_object_free(os, smobj, tx));
}

void
space_map_free(space_map_t *sm, dmu_tx_t *tx)
{
	if (sm == NULL)
		return;

	space_map_free_obj(sm->sm_os, space_map_object(sm), tx);
	sm->sm_object = 0;
}

/*
 * Given a range tree, it makes a worst-case estimate of how much
 * space the tree's segments would take if they were written to
 * the given space map.
 */
uint64_t
space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt,
    uint64_t vdev_id)
{
	spa_t *spa = dmu_objset_spa(sm->sm_os);
	uint64_t shift = sm->sm_shift;
	uint64_t *histogram = rt->rt_histogram;
	uint64_t entries_for_seg = 0;

	/*
	 * In order to get a quick estimate of the optimal size that this
	 * range tree would have on-disk as a space map, we iterate through
	 * its histogram buckets instead of iterating through its nodes.
	 *
	 * Note that this is a highest-bound/worst-case estimate for the
	 * following reasons:
	 *
	 * 1] We assume that we always add a debug padding for each block
	 *    we write and we also assume that we start at the last word
	 *    of a block attempting to write a two-word entry.
	 * 2] Rounding up errors due to the way segments are distributed
	 *    in the buckets of the range tree's histogram.
	 * 3] The activation of zfs_force_some_double_word_sm_entries
	 *    (tunable) when testing.
	 *
	 * = Math and Rounding Errors =
	 *
	 * rt_histogram[i] bucket of a range tree represents the number
	 * of entries in [2^i, (2^(i+1))-1] of that range_tree. Given
	 * that, we want to divide the buckets into groups: Buckets that
	 * can be represented using a single-word entry, ones that can
	 * be represented with a double-word entry, and ones that can
	 * only be represented with multiple two-word entries.
	 *
	 * [Note that if the new encoding feature is not enabled there
	 * are only two groups: single-word entry buckets and multiple
	 * single-word entry buckets. The information below assumes
	 * two-word entries are enabled, but it can easily be applied
	 * when the feature is not enabled]
	 *
	 * To find the highest bucket that can be represented with a
	 * single-word entry we look at the maximum run that such entry
	 * can have, which is 2^(SM_RUN_BITS + sm_shift) [remember that
	 * the run of a space map entry is shifted by sm_shift, thus we
	 * add it to the exponent]. This way, excluding the value of the
	 * maximum run that can be represented by a single-word entry,
	 * all runs that are smaller exist in buckets 0 to
	 * SM_RUN_BITS + shift - 1.
	 *
	 * To find the highest bucket that can be represented with a
	 * double-word entry, we follow the same approach. Finally, any
	 * buckets higher than that are represented with multiple two-word
	 * entries. To be more specific, if the highest bucket whose
	 * segments can be represented with a single two-word entry is X,
	 * then bucket X+1 will need 2 two-word entries for each of its
	 * segments, X+2 will need 4, X+3 will need 8, ...etc.
	 *
	 * With all of the above we make our estimation based on bucket
	 * groups. There is a rounding error though. As we mentioned in
	 * the example with the one-word entry, the maximum run that can
	 * be represented in a one-word entry 2^(SM_RUN_BITS + shift) is
	 * not part of bucket SM_RUN_BITS + shift - 1. Thus, segments of
	 * that length fall into the next bucket (and bucket group) where
	 * we start counting two-word entries and this is one more reason
	 * why the estimated size may end up being bigger than the actual
	 * size written.
	 */
	uint64_t size = 0;
	uint64_t idx = 0;

	if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) ||
	    (vdev_id == SM_NO_VDEVID && sm->sm_size < SM_OFFSET_MAX)) {

		/*
		 * If we are trying to force some double word entries just
		 * assume the worst-case of every single word entry being
		 * written as a double word entry.
		 */
		uint64_t entry_size =
		    (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) &&
		    zfs_force_some_double_word_sm_entries) ?
		    (2 * sizeof (uint64_t)) : sizeof (uint64_t);

		uint64_t single_entry_max_bucket = SM_RUN_BITS + shift - 1;
		for (; idx <= single_entry_max_bucket; idx++)
			size += histogram[idx] * entry_size;

		if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)) {
			for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) {
				ASSERT3U(idx, >=, single_entry_max_bucket);
				entries_for_seg =
				    1ULL << (idx - single_entry_max_bucket);
				size += histogram[idx] *
				    entries_for_seg * entry_size;
			}
			return (size);
		}
	}

	ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2));

	uint64_t double_entry_max_bucket = SM2_RUN_BITS + shift - 1;
	for (; idx <= double_entry_max_bucket; idx++)
		size += histogram[idx] * 2 * sizeof (uint64_t);

	for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) {
		ASSERT3U(idx, >=, double_entry_max_bucket);
		entries_for_seg = 1ULL << (idx - double_entry_max_bucket);
		size += histogram[idx] *
		    entries_for_seg * 2 * sizeof (uint64_t);
	}

	/*
	 * Assume the worst case where we start with the padding at the end
	 * of the current block and we add an extra padding entry at the end
	 * of all subsequent blocks.
	 */
	size += ((size / sm->sm_blksz) + 1) * sizeof (uint64_t);

	return (size);
}
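
/*
 * Trivial accessors; each tolerates a NULL space map and returns 0.
 */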
uint64_t
space_map_object(space_map_t *sm)
{
	return (sm != NULL ? sm->sm_object : 0);
}

int64_t
space_map_allocated(space_map_t *sm)
{
	return (sm != NULL ? sm->sm_phys->smp_alloc : 0);
}

uint64_t
space_map_length(space_map_t *sm)
{
	return (sm != NULL ? sm->sm_phys->smp_length : 0);
}

uint64_t
space_map_nblocks(space_map_t *sm)
{
	if (sm == NULL)
		return (0);
	return (DIV_ROUND_UP(space_map_length(sm), sm->sm_blksz));
}