/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.
 * Copyright (c) 2024, Klara, Inc.
 */

#include <sys/zio.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zfs_context.h>
#include <sys/zap.h>
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
#include <sys/btree.h>
#include <sys/arc.h>
#include <sys/dmu_objset.h>
#include <sys/spa_impl.h>

#ifdef _KERNEL
#include <sys/sunddi.h>
#endif

/*
 * The maximum size (in bytes) of a microzap before it is converted to a
 * fatzap. It will be rounded up to the next multiple of 512
 * (SPA_MINBLOCKSIZE).
 *
 * By definition, a microzap must fit into a single block, so this has
 * traditionally been SPA_OLD_MAXBLOCKSIZE, and is set to that by default.
 * Setting this higher requires both the large_blocks feature (to even create
 * blocks that large) and the large_microzap feature (to enable the stream
 * machinery to understand not to try to split a microzap block).
 *
 * If large_microzap is enabled, this value will be clamped to
 * spa_maxblocksize(), up to 1M. If not, it will be clamped to
 * SPA_OLD_MAXBLOCKSIZE.
 */
static int zap_micro_max_size = SPA_OLD_MAXBLOCKSIZE;

/*
 * The 1M upper limit is necessary because the count of chunks in a microzap
 * block is stored as a uint16_t (mze_chunkid). Each chunk is 64 bytes, and the
 * first is used to store a header, so there are 32767 usable chunks, which is
 * just under 2M. 1M is the largest power-of-two-rounded block size under 2M,
 * so we must set the limit there.
 */
#define	MZAP_MAX_SIZE	(1048576)

uint64_t
zap_get_micro_max_size(spa_t *spa)
{
	uint64_t maxsz = MIN(MZAP_MAX_SIZE,
	    P2ROUNDUP(zap_micro_max_size, SPA_MINBLOCKSIZE));
	if (maxsz <= SPA_OLD_MAXBLOCKSIZE)
		return (maxsz);
	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_MICROZAP))
		return (MIN(maxsz, spa_maxblocksize(spa)));
	return (SPA_OLD_MAXBLOCKSIZE);
}

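/*
 * Worked arithmetic for the limit above (illustrative only): 32767 usable
 * 64-byte chunks hold 32767 * 64 = 2,097,088 bytes, just under 2M, and the
 * largest power-of-two block size below that is 1M (1,048,576), hence
 * MZAP_MAX_SIZE. So, for example, setting zap_micro_max_size to 4M still
 * yields MIN(1M, P2ROUNDUP(4M, 512)) = 1M from zap_get_micro_max_size()
 * when large_microzap is enabled (assuming spa_maxblocksize() is at least
 * 1M), and SPA_OLD_MAXBLOCKSIZE (128K) when it is not.
 */
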
static int mzap_upgrade(zap_t **zapp,
    const void *tag, dmu_tx_t *tx, zap_flags_t flags);

uint64_t
zap_getflags(zap_t *zap)
{
	if (zap->zap_ismicro)
		return (0);
	return (zap_f_phys(zap)->zap_flags);
}

int
zap_hashbits(zap_t *zap)
{
	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
		return (48);
	else
		return (28);
}

uint32_t
zap_maxcd(zap_t *zap)
{
	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
		return ((1<<16)-1);
	else
		return (-1U);
}

static uint64_t
zap_hash(zap_name_t *zn)
{
	zap_t *zap = zn->zn_zap;
	uint64_t h = 0;

	if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
		ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
		h = *(uint64_t *)zn->zn_key_orig;
	} else {
		h = zap->zap_salt;
		ASSERT(h != 0);
		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

		if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
			const uint64_t *wp = zn->zn_key_norm;

			ASSERT(zn->zn_key_intlen == 8);
			for (int i = 0; i < zn->zn_key_norm_numints;
			    wp++, i++) {
				uint64_t word = *wp;

				for (int j = 0; j < 8; j++) {
					h = (h >> 8) ^
					    zfs_crc64_table[(h ^ word) & 0xFF];
					word >>= NBBY;
				}
			}
		} else {
			const uint8_t *cp = zn->zn_key_norm;

			/*
			 * We previously stored the terminating null on
			 * disk, but didn't hash it, so we need to
			 * continue to not hash it.  (The
			 * zn_key_*_numints includes the terminating
			 * null for non-binary keys.)
			 */
			int len = zn->zn_key_norm_numints - 1;

			ASSERT(zn->zn_key_intlen == 1);
			for (int i = 0; i < len; cp++, i++) {
				h = (h >> 8) ^
				    zfs_crc64_table[(h ^ *cp) & 0xFF];
			}
		}
	}
	/*
	 * Don't use all 64 bits, since we need some in the cookie for
	 * the collision differentiator.  We MUST use the high bits,
	 * since those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);

	return (h);
}

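/*
 * To illustrate the masking above (not normative): with the default 28 hash
 * bits, 64 - 28 = 36, so the mask is ~((1ULL << 36) - 1) =
 * 0xFFFFFFF000000000. Only the top 28 bits of the CRC survive; the cleared
 * low bits leave room for the collision differentiator in cursor cookies.
 * With ZAP_FLAG_HASH64, the top 48 bits are kept instead.
 */
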
static int
zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags,
    size_t outlen)
{
	ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));

	size_t inlen = strlen(name) + 1;

	int err = 0;
	(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
	    normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
	    U8_UNICODE_LATEST, &err);

	return (err);
}

boolean_t
zap_match(zap_name_t *zn, const char *matchname)
{
	boolean_t res = B_FALSE;
	ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));

	if (zn->zn_matchtype & MT_NORMALIZE) {
		size_t namelen = zn->zn_normbuf_len;
		char normbuf[ZAP_MAXNAMELEN];
		char *norm = normbuf;

		/*
		 * Cannot allocate this on-stack as it exceeds the
		 * stack limit of 1024 bytes.
		 */
		if (namelen > ZAP_MAXNAMELEN)
			norm = kmem_alloc(namelen, KM_SLEEP);

		if (zap_normalize(zn->zn_zap, matchname, norm,
		    zn->zn_normflags, namelen) != 0) {
			res = B_FALSE;
		} else {
			res = (strcmp(zn->zn_key_norm, norm) == 0);
		}
		if (norm != normbuf)
			kmem_free(norm, namelen);
	} else {
		res = (strcmp(zn->zn_key_orig, matchname) == 0);
	}
	return (res);
}

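/*
 * Illustrative example (hypothetical names): on a dataset created with
 * casesensitivity=insensitive, zn_normflags includes U8_TEXTPREP_TOUPPER, so
 * a zap_name_t built from "ReadMe" with MT_NORMALIZE would match an existing
 * entry stored as "README" or "readme". An exact-case lookup would instead
 * pass MT_MATCH_CASE, which strips the TOUPPER fold (see
 * zap_name_init_str() below).
 */
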
static kmem_cache_t *zap_name_cache;
static kmem_cache_t *zap_attr_cache;
static kmem_cache_t *zap_name_long_cache;
static kmem_cache_t *zap_attr_long_cache;

void
zap_init(void)
{
	zap_name_cache = kmem_cache_create("zap_name",
	    sizeof (zap_name_t) + ZAP_MAXNAMELEN, 0, NULL, NULL,
	    NULL, NULL, NULL, 0);

	zap_attr_cache = kmem_cache_create("zap_attr_cache",
	    sizeof (zap_attribute_t) + ZAP_MAXNAMELEN, 0, NULL,
	    NULL, NULL, NULL, NULL, 0);

	zap_name_long_cache = kmem_cache_create("zap_name_long",
	    sizeof (zap_name_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, NULL,
	    NULL, NULL, NULL, 0);

	zap_attr_long_cache = kmem_cache_create("zap_attr_long_cache",
	    sizeof (zap_attribute_t) + ZAP_MAXNAMELEN_NEW, 0, NULL,
	    NULL, NULL, NULL, NULL, 0);
}

void
zap_fini(void)
{
	kmem_cache_destroy(zap_name_cache);
	kmem_cache_destroy(zap_attr_cache);
	kmem_cache_destroy(zap_name_long_cache);
	kmem_cache_destroy(zap_attr_long_cache);
}

static zap_name_t *
zap_name_alloc(zap_t *zap, boolean_t longname)
{
	kmem_cache_t *cache = longname ? zap_name_long_cache : zap_name_cache;
	zap_name_t *zn = kmem_cache_alloc(cache, KM_SLEEP);

	zn->zn_zap = zap;
	zn->zn_normbuf_len = longname ? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN;
	return (zn);
}

void
zap_name_free(zap_name_t *zn)
{
	if (zn->zn_normbuf_len == ZAP_MAXNAMELEN) {
		kmem_cache_free(zap_name_cache, zn);
	} else {
		ASSERT3U(zn->zn_normbuf_len, ==, ZAP_MAXNAMELEN_NEW);
		kmem_cache_free(zap_name_long_cache, zn);
	}
}

static int
zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt)
{
	zap_t *zap = zn->zn_zap;
	size_t key_len = strlen(key) + 1;

	/* Make sure zn is allocated for longname if key is long */
	IMPLY(key_len > ZAP_MAXNAMELEN,
	    zn->zn_normbuf_len == ZAP_MAXNAMELEN_NEW);

	zn->zn_key_intlen = sizeof (*key);
	zn->zn_key_orig = key;
	zn->zn_key_orig_numints = key_len;
	zn->zn_matchtype = mt;
	zn->zn_normflags = zap->zap_normflags;

	/*
	 * If we're dealing with a case-sensitive lookup on a mixed or
	 * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
	 * will fold case to all caps, overriding the lookup request.
	 */
	if (mt & MT_MATCH_CASE)
		zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;

	if (zap->zap_normflags) {
		/*
		 * We *must* use zap_normflags because this normalization is
		 * what the hash is computed from.
		 */
		if (zap_normalize(zap, key, zn->zn_normbuf,
		    zap->zap_normflags, zn->zn_normbuf_len) != 0)
			return (SET_ERROR(ENOTSUP));
		zn->zn_key_norm = zn->zn_normbuf;
		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
	} else {
		if (mt != 0)
			return (SET_ERROR(ENOTSUP));
		zn->zn_key_norm = zn->zn_key_orig;
		zn->zn_key_norm_numints = zn->zn_key_orig_numints;
	}

	zn->zn_hash = zap_hash(zn);

	if (zap->zap_normflags != zn->zn_normflags) {
		/*
		 * We *must* use zn_normflags because this normalization is
		 * what the matching is based on.  (Not the hash!)
		 */
		if (zap_normalize(zap, key, zn->zn_normbuf,
		    zn->zn_normflags, zn->zn_normbuf_len) != 0)
			return (SET_ERROR(ENOTSUP));
		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
	}

	return (0);
}

zap_name_t *
zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt)
{
	size_t key_len = strlen(key) + 1;
	zap_name_t *zn = zap_name_alloc(zap, (key_len > ZAP_MAXNAMELEN));
	if (zap_name_init_str(zn, key, mt) != 0) {
		zap_name_free(zn);
		return (NULL);
	}
	return (zn);
}

static zap_name_t *
zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
{
	zap_name_t *zn = kmem_cache_alloc(zap_name_cache, KM_SLEEP);

	ASSERT(zap->zap_normflags == 0);
	zn->zn_zap = zap;
	zn->zn_key_intlen = sizeof (*key);
	zn->zn_key_orig = zn->zn_key_norm = key;
	zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
	zn->zn_matchtype = 0;
	zn->zn_normbuf_len = ZAP_MAXNAMELEN;

	zn->zn_hash = zap_hash(zn);
	return (zn);
}

static void
mzap_byteswap(mzap_phys_t *buf, size_t size)
{
	buf->mz_block_type = BSWAP_64(buf->mz_block_type);
	buf->mz_salt = BSWAP_64(buf->mz_salt);
	buf->mz_normflags = BSWAP_64(buf->mz_normflags);
	int max = (size / MZAP_ENT_LEN) - 1;
	for (int i = 0; i < max; i++) {
		buf->mz_chunk[i].mze_value =
		    BSWAP_64(buf->mz_chunk[i].mze_value);
		buf->mz_chunk[i].mze_cd =
		    BSWAP_32(buf->mz_chunk[i].mze_cd);
	}
}

void
zap_byteswap(void *buf, size_t size)
{
	uint64_t block_type = *(uint64_t *)buf;

	if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
		/* ASSERT(magic == ZAP_LEAF_MAGIC); */
		mzap_byteswap(buf, size);
	} else {
		fzap_byteswap(buf, size);
	}
}

__attribute__((always_inline)) inline
static int
mze_compare(const void *arg1, const void *arg2)
{
	const mzap_ent_t *mze1 = arg1;
	const mzap_ent_t *mze2 = arg2;

	return (TREE_CMP((uint64_t)(mze1->mze_hash) << 32 | mze1->mze_cd,
	    (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd));
}

ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t,
    mze_compare)

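/*
 * For illustration: mze_compare() orders entries by the packed 64-bit key
 * (mze_hash << 32) | mze_cd, so all colliding entries (same top-32 hash
 * bits) sort adjacently, ordered by collision differentiator, e.g.
 * (0x1234abcd, cd 0) < (0x1234abcd, cd 1) < (0x1234abce, cd 0). This is
 * what lets mze_find() and mze_find_unused_cd() below walk a collision run
 * with zfs_btree_next().
 */
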
static void
mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash)
{
	mzap_ent_t mze;

	ASSERT(zap->zap_ismicro);
	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	mze.mze_chunkid = chunkid;
	ASSERT0(hash & 0xffffffff);
	mze.mze_hash = hash >> 32;
	ASSERT3U(MZE_PHYS(zap, &mze)->mze_cd, <=, 0xffff);
	mze.mze_cd = (uint16_t)MZE_PHYS(zap, &mze)->mze_cd;
	ASSERT(MZE_PHYS(zap, &mze)->mze_name[0] != 0);
	zfs_btree_add(&zap->zap_m.zap_tree, &mze);
}

static mzap_ent_t *
mze_find(zap_name_t *zn, zfs_btree_index_t *idx)
{
	mzap_ent_t mze_tofind;
	mzap_ent_t *mze;
	zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree;

	ASSERT(zn->zn_zap->zap_ismicro);
	ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));

	ASSERT0(zn->zn_hash & 0xffffffff);
	mze_tofind.mze_hash = zn->zn_hash >> 32;
	mze_tofind.mze_cd = 0;

	mze = zfs_btree_find(tree, &mze_tofind, idx);
	if (mze == NULL)
		mze = zfs_btree_next(tree, idx, idx);
	for (; mze && mze->mze_hash == mze_tofind.mze_hash;
	    mze = zfs_btree_next(tree, idx, idx)) {
		ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
		if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
			return (mze);
	}

	return (NULL);
}

static uint32_t
mze_find_unused_cd(zap_t *zap, uint64_t hash)
{
	mzap_ent_t mze_tofind;
	zfs_btree_index_t idx;
	zfs_btree_t *tree = &zap->zap_m.zap_tree;

	ASSERT(zap->zap_ismicro);
	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));

	ASSERT0(hash & 0xffffffff);
	hash >>= 32;
	mze_tofind.mze_hash = hash;
	mze_tofind.mze_cd = 0;

	uint32_t cd = 0;
	for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
	    mze && mze->mze_hash == hash;
	    mze = zfs_btree_next(tree, &idx, &idx)) {
		if (mze->mze_cd != cd)
			break;
		cd++;
	}

	return (cd);
}

/*
 * Each mzap entry requires at most 4 array chunks: 3 chunks for the name
 * plus 1 chunk for the value, in addition to the entry chunk itself.
 */
#define	MZAP_ENT_CHUNKS	(1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \
	ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t)))

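/*
 * Worked out with the current header constants (illustrative, not
 * normative): MZAP_NAME_LEN is 50 bytes and ZAP_LEAF_ARRAY_BYTES is 21, so
 * the name needs ceil(50 / 21) = 3 array chunks and the 8-byte value needs
 * 1, giving 1 + 3 + 1 = 5 leaf chunks per upgraded entry. This is the
 * figure that mze_canfit_fzap_leaf() below multiplies by.
 */
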
/*
 * Check if the current entry keeps the colliding entries under the fatzap leaf
 * size.
 */
static boolean_t
mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
{
	zap_t *zap = zn->zn_zap;
	mzap_ent_t mze_tofind;
	zfs_btree_index_t idx;
	zfs_btree_t *tree = &zap->zap_m.zap_tree;
	uint32_t mzap_ents = 0;

	ASSERT0(hash & 0xffffffff);
	hash >>= 32;
	mze_tofind.mze_hash = hash;
	mze_tofind.mze_cd = 0;

	for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
	    mze && mze->mze_hash == hash;
	    mze = zfs_btree_next(tree, &idx, &idx)) {
		mzap_ents++;
	}

	/* Include the new entry being added */
	mzap_ents++;

	return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS));
}

static void
mze_destroy(zap_t *zap)
{
	zfs_btree_clear(&zap->zap_m.zap_tree);
	zfs_btree_destroy(&zap->zap_m.zap_tree);
}

static zap_t *
mzap_open(dmu_buf_t *db)
{
	zap_t *winner;
	uint64_t *zap_hdr = (uint64_t *)db->db_data;
	uint64_t zap_block_type = zap_hdr[0];
	uint64_t zap_magic = zap_hdr[1];

	ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));

	zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
	rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL);
	rw_enter(&zap->zap_rwlock, RW_WRITER);
	zap->zap_objset = dmu_buf_get_objset(db);
	zap->zap_object = db->db_object;
	zap->zap_dbuf = db;

	if (zap_block_type != ZBT_MICRO) {
		mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT,
		    0);
		zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
		if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) {
			winner = NULL;	/* No actual winner here... */
			goto handle_winner;
		}
	} else {
		zap->zap_ismicro = TRUE;
	}

	/*
	 * Make sure that zap_ismicro is set before we let others see
	 * it, because zap_lockdir() checks zap_ismicro without the lock
	 * held.
	 */
	dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
	winner = dmu_buf_set_user(db, &zap->zap_dbu);

	if (winner != NULL)
		goto handle_winner;

	if (zap->zap_ismicro) {
		zap->zap_salt = zap_m_phys(zap)->mz_salt;
		zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;

		/*
		 * Reduce the B-tree leaf from 4KB to 512 bytes to reduce
		 * memmove() overhead on massive inserts below.  It still
		 * allows storing 62 entries before we have to add a 2KB
		 * B-tree core node.
		 */
		zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare,
		    mze_find_in_buf, sizeof (mzap_ent_t), 512);

		zap_name_t *zn = zap_name_alloc(zap, B_FALSE);
		for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) {
			mzap_ent_phys_t *mze =
			    &zap_m_phys(zap)->mz_chunk[i];
			if (mze->mze_name[0]) {
				zap->zap_m.zap_num_entries++;
				zap_name_init_str(zn, mze->mze_name, 0);
				mze_insert(zap, i, zn->zn_hash);
			}
		}
		zap_name_free(zn);
	} else {
		zap->zap_salt = zap_f_phys(zap)->zap_salt;
		zap->zap_normflags = zap_f_phys(zap)->zap_normflags;

		ASSERT3U(sizeof (struct zap_leaf_header), ==,
		    2*ZAP_LEAF_CHUNKSIZE);

		/*
		 * The embedded pointer table should not overlap the
		 * other members.
		 */
		ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
		    &zap_f_phys(zap)->zap_salt);

		/*
		 * The embedded pointer table should end at the end of
		 * the block.
		 */
		ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
		    1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
		    (uintptr_t)zap_f_phys(zap), ==,
		    zap->zap_dbuf->db_size);
	}
	rw_exit(&zap->zap_rwlock);
	return (zap);

handle_winner:
	rw_exit(&zap->zap_rwlock);
	rw_destroy(&zap->zap_rwlock);
	if (!zap->zap_ismicro)
		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
	kmem_free(zap, sizeof (zap_t));
	return (winner);
}

/*
 * This routine "consumes" the caller's hold on the dbuf, which must
 * have the specified tag.
 */
static int
zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
{
	ASSERT0(db->db_offset);
	objset_t *os = dmu_buf_get_objset(db);
	uint64_t obj = db->db_object;
	dmu_object_info_t doi;

	*zapp = NULL;

	dmu_object_info_from_dnode(dn, &doi);
	if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
		return (SET_ERROR(EINVAL));

	zap_t *zap = dmu_buf_get_user(db);
	if (zap == NULL) {
		zap = mzap_open(db);
		if (zap == NULL) {
			/*
			 * mzap_open() didn't like what it saw on-disk.
			 * Check for corruption!
			 */
			return (SET_ERROR(EIO));
		}
	}

	/*
	 * We're checking zap_ismicro without the lock held, in order to
	 * tell what type of lock we want.  Once we have some sort of
	 * lock, see if it really is the right type.  In practice this
	 * can only be different if it was upgraded from micro to fat,
	 * and micro wanted WRITER but fat only needs READER.
	 */
	krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
	rw_enter(&zap->zap_rwlock, lt);
	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
		/* it was upgraded, now we only need reader */
		ASSERT(lt == RW_WRITER);
		ASSERT(RW_READER ==
		    ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
		rw_downgrade(&zap->zap_rwlock);
		lt = RW_READER;
	}

	zap->zap_objset = os;
	zap->zap_dnode = dn;

	if (lt == RW_WRITER)
		dmu_buf_will_dirty(db, tx);

	ASSERT3P(zap->zap_dbuf, ==, db);

	ASSERT(!zap->zap_ismicro ||
	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
	if (zap->zap_ismicro && tx && adding &&
	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
		if (newsz > zap_get_micro_max_size(dmu_objset_spa(os))) {
			dprintf("upgrading obj %llu: num_entries=%u\n",
			    (u_longlong_t)obj, zap->zap_m.zap_num_entries);
			*zapp = zap;
			int err = mzap_upgrade(zapp, tag, tx, 0);
			if (err != 0)
				rw_exit(&zap->zap_rwlock);
			return (err);
		}
		VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
		zap->zap_m.zap_num_chunks =
		    db->db_size / MZAP_ENT_LEN - 1;

		if (newsz > SPA_OLD_MAXBLOCKSIZE) {
			dsl_dataset_t *ds = dmu_objset_ds(os);
			if (!dsl_dataset_feature_is_active(ds,
			    SPA_FEATURE_LARGE_MICROZAP)) {
				/*
				 * A microzap just grew beyond the old limit
				 * for the first time, so we have to ensure the
				 * feature flag is activated.
				 * zap_get_micro_max_size() won't let us get
				 * here if the feature is not enabled, so we
				 * don't need any other checks beforehand.
				 *
				 * Since we're in open context, we can't
				 * activate the feature directly, so we instead
				 * flag it on the dataset for the next sync.
				 */
				dsl_dataset_dirty(ds, tx);
				mutex_enter(&ds->ds_lock);
				ds->ds_feature_activation
				    [SPA_FEATURE_LARGE_MICROZAP] =
				    (void *)B_TRUE;
				mutex_exit(&ds->ds_lock);
			}
		}
	}

	*zapp = zap;
	return (0);
}

static int
zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
    zap_t **zapp)
{
	dmu_buf_t *db;
	int err;

	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
	if (err != 0)
		return (err);
	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
	if (err != 0)
		dmu_buf_rele(db, tag);
	else
		VERIFY(dnode_add_ref(dn, tag));
	return (err);
}

int
zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
    zap_t **zapp)
{
	dnode_t *dn;
	dmu_buf_t *db;
	int err;

	err = dnode_hold(os, obj, tag, &dn);
	if (err != 0)
		return (err);
	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
	if (err != 0) {
		dnode_rele(dn, tag);
		return (err);
	}
	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
	if (err != 0) {
		dmu_buf_rele(db, tag);
		dnode_rele(dn, tag);
	}
	return (err);
}

void
zap_unlockdir(zap_t *zap, const void *tag)
{
	rw_exit(&zap->zap_rwlock);
	dnode_rele(zap->zap_dnode, tag);
	dmu_buf_rele(zap->zap_dbuf, tag);
}

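/*
 * Typical usage pattern (a sketch, mirroring zap_count() below): callers
 * bracket every operation with these two calls, e.g.
 *
 *	err = zap_lockdir(os, obj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 *	if (err == 0) {
 *		... operate on zap ...
 *		zap_unlockdir(zap, FTAG);
 *	}
 *
 * Note that several *_impl() helpers in this file instead consume the lock
 * and call zap_unlockdir() themselves on behalf of the caller.
 */
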
static int
mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags)
{
	int err = 0;
	zap_t *zap = *zapp;

	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	int sz = zap->zap_dbuf->db_size;
	mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP);
	memcpy(mzp, zap->zap_dbuf->db_data, sz);
	int nchunks = zap->zap_m.zap_num_chunks;

	if (!flags) {
		err = dmu_object_set_blocksize(zap->zap_objset,
		    zap->zap_object, 1ULL << fzap_default_block_shift, 0, tx);
		if (err != 0) {
			vmem_free(mzp, sz);
			return (err);
		}
	}

	dprintf("upgrading obj=%llu with %u chunks\n",
	    (u_longlong_t)zap->zap_object, nchunks);
	/* XXX destroy the tree later, so we can use the stored hash value */
	mze_destroy(zap);

	fzap_upgrade(zap, tx, flags);

	zap_name_t *zn = zap_name_alloc(zap, B_FALSE);
	for (int i = 0; i < nchunks; i++) {
		mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
		if (mze->mze_name[0] == 0)
			continue;
		dprintf("adding %s=%llu\n",
		    mze->mze_name, (u_longlong_t)mze->mze_value);
		zap_name_init_str(zn, mze->mze_name, 0);
		/* If we fail here, we would end up losing entries */
		VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
		    tag, tx));
		zap = zn->zn_zap;	/* fzap_add_cd() may change zap */
	}
	zap_name_free(zn);
	vmem_free(mzp, sz);
	*zapp = zap;
	return (0);
}

/*
 * The "normflags" determine the behavior of the matchtype_t which is
 * passed to zap_lookup_norm().  Names which have the same normalized
 * version will be stored with the same hash value, and therefore we can
 * perform normalization-insensitive lookups.  We can be Unicode form-
 * insensitive and/or case-insensitive.  The following flags are valid for
 * "normflags":
 *
 * U8_TEXTPREP_NFC
 * U8_TEXTPREP_NFD
 * U8_TEXTPREP_NFKC
 * U8_TEXTPREP_NFKD
 * U8_TEXTPREP_TOUPPER
 *
 * The *_NF* (Normalization Form) flags are mutually exclusive; at most one
 * of them may be supplied.
 */
void
mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
{
	dmu_buf_t *db;

	VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH));

	dmu_buf_will_dirty(db, tx);
	mzap_phys_t *zp = db->db_data;
	zp->mz_block_type = ZBT_MICRO;
	zp->mz_salt =
	    ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL;
	zp->mz_normflags = normflags;

	if (flags != 0) {
		zap_t *zap;
		/* Only fat zap supports flags; upgrade immediately. */
		VERIFY(dnode_add_ref(dn, FTAG));
		VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER,
		    B_FALSE, B_FALSE, &zap));
		VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
		zap_unlockdir(zap, FTAG);
	} else {
		dmu_buf_rele(db, FTAG);
	}
}

static uint64_t
zap_create_impl(objset_t *os, int normflags, zap_flags_t flags,
    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
{
	uint64_t obj;

	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);

	if (allocated_dnode == NULL) {
		dnode_t *dn;
		obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
		    indirect_blockshift, bonustype, bonuslen, dnodesize,
		    &dn, FTAG, tx);
		mzap_create_impl(dn, normflags, flags, tx);
		dnode_rele(dn, FTAG);
	} else {
		obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
		    indirect_blockshift, bonustype, bonuslen, dnodesize,
		    allocated_dnode, tag, tx);
		mzap_create_impl(*allocated_dnode, normflags, flags, tx);
	}

	return (obj);
}

int
zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen,
	    0, tx));
}

int
zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (zap_create_claim_norm_dnsize(os, obj,
	    0, ot, bonustype, bonuslen, dnodesize, tx));
}

int
zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
    dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype,
	    bonuslen, 0, tx));
}

int
zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
    dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int error;

	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
	error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
	    dnodesize, tx);
	if (error != 0)
		return (error);

	error = dnode_hold(os, obj, FTAG, &dn);
	if (error != 0)
		return (error);

	mzap_create_impl(dn, normflags, 0, tx);

	dnode_rele(dn, FTAG);

	return (0);
}

uint64_t
zap_create(objset_t *os, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
}

uint64_t
zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen,
	    dnodesize, tx));
}

uint64_t
zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen,
	    0, tx));
}

uint64_t
zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (zap_create_impl(os, normflags, 0, ot, 0, 0,
	    bonustype, bonuslen, dnodesize, NULL, NULL, tx));
}

uint64_t
zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_flags_dnsize(os, normflags, flags, ot,
	    leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
}

uint64_t
zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
	    indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL,
	    tx));
}

/*
 * Create a zap object and return a pointer to the newly allocated dnode via
 * the allocated_dnode argument.  The returned dnode will be held and the
 * caller is responsible for releasing the hold by calling dnode_rele().
 */
uint64_t
zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
{
	return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
	    indirect_blockshift, bonustype, bonuslen, dnodesize,
	    allocated_dnode, tag, tx));
}

int
zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
{
	/*
	 * dmu_object_free will free the object number and free the
	 * data.  Freeing the data will cause our pageout function to be
	 * called, which will destroy our data (zap_leaf_t's and zap_t).
	 */

	return (dmu_object_free(os, zapobj, tx));
}

void
zap_evict_sync(void *dbu)
{
	zap_t *zap = dbu;

	rw_destroy(&zap->zap_rwlock);

	if (zap->zap_ismicro)
		mze_destroy(zap);
	else
		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);

	kmem_free(zap, sizeof (zap_t));
}

int
zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	if (!zap->zap_ismicro) {
		err = fzap_count(zap, count);
	} else {
		*count = zap->zap_m.zap_num_entries;
	}
	zap_unlockdir(zap, FTAG);
	return (err);
}

/*
 * zn may be NULL; if not specified, it will be computed if needed.
 * See also the comment above zap_entry_normalization_conflict().
 */
static boolean_t
mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze,
    zfs_btree_index_t *idx)
{
	boolean_t allocdzn = B_FALSE;
	mzap_ent_t *other;
	zfs_btree_index_t oidx;

	if (zap->zap_normflags == 0)
		return (B_FALSE);

	for (other = zfs_btree_prev(&zap->zap_m.zap_tree, idx, &oidx);
	    other && other->mze_hash == mze->mze_hash;
	    other = zfs_btree_prev(&zap->zap_m.zap_tree, &oidx, &oidx)) {

		if (zn == NULL) {
			zn = zap_name_alloc_str(zap,
			    MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
			allocdzn = B_TRUE;
		}
		if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
			if (allocdzn)
				zap_name_free(zn);
			return (B_TRUE);
		}
	}

	for (other = zfs_btree_next(&zap->zap_m.zap_tree, idx, &oidx);
	    other && other->mze_hash == mze->mze_hash;
	    other = zfs_btree_next(&zap->zap_m.zap_tree, &oidx, &oidx)) {

		if (zn == NULL) {
			zn = zap_name_alloc_str(zap,
			    MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
			allocdzn = B_TRUE;
		}
		if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
			if (allocdzn)
				zap_name_free(zn);
			return (B_TRUE);
		}
	}

	if (allocdzn)
		zap_name_free(zn);
	return (B_FALSE);
}

/*
 * Routines for manipulating attributes.
 */

int
zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf)
{
	return (zap_lookup_norm(os, zapobj, name, integer_size,
	    num_integers, buf, 0, NULL, 0, NULL));
}

static int
zap_lookup_impl(zap_t *zap, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf,
    matchtype_t mt, char *realname, int rn_len,
    boolean_t *ncp)
{
	int err = 0;

	zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
	if (zn == NULL)
		return (SET_ERROR(ENOTSUP));

	if (!zap->zap_ismicro) {
		err = fzap_lookup(zn, integer_size, num_integers, buf,
		    realname, rn_len, ncp);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t *mze = mze_find(zn, &idx);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			if (num_integers < 1) {
				err = SET_ERROR(EOVERFLOW);
			} else if (integer_size != 8) {
				err = SET_ERROR(EINVAL);
			} else {
				*(uint64_t *)buf =
				    MZE_PHYS(zap, mze)->mze_value;
				if (realname != NULL)
					(void) strlcpy(realname,
					    MZE_PHYS(zap, mze)->mze_name,
					    rn_len);
				if (ncp) {
					*ncp = mzap_normalization_conflict(zap,
					    zn, mze, &idx);
				}
			}
		}
	}
	zap_name_free(zn);
	return (err);
}

int
zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf,
    matchtype_t mt, char *realname, int rn_len,
    boolean_t *ncp)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_lookup_impl(zap, name, integer_size,
	    num_integers, buf, mt, realname, rn_len, ncp);
	zap_unlockdir(zap, FTAG);
	return (err);
}

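/*
 * Example (hypothetical object and name): reading a single 64-bit value
 * stored under "refcount" in ZAP object obj:
 *
 *	uint64_t val;
 *	int err = zap_lookup(os, obj, "refcount", sizeof (uint64_t), 1, &val);
 *
 * integer_size is the width of each integer in the value array and
 * num_integers is how many of them the caller's buffer can hold; a microzap
 * entry is always a single uint64_t, hence the EINVAL/EOVERFLOW checks in
 * zap_lookup_impl() above.
 */
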
int
zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
{
	zap_t *zap;
	int err;
	zap_name_t *zn;

	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err)
		return (err);
	zn = zap_name_alloc_str(zap, name, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	fzap_prefetch(zn);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_prefetch_object(objset_t *os, uint64_t zapobj)
{
	int error;
	dmu_object_info_t doi;

	error = dmu_object_info(os, zapobj, &doi);
	if (error == 0 && DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
		error = SET_ERROR(EINVAL);
	if (error == 0)
		dmu_prefetch_wait(os, zapobj, 0, doi.doi_max_offset);

	return (error);
}

int
zap_lookup_by_dnode(dnode_t *dn, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf)
{
	return (zap_lookup_norm_by_dnode(dn, name, integer_size,
	    num_integers, buf, 0, NULL, 0, NULL));
}

int
zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf,
    matchtype_t mt, char *realname, int rn_len,
    boolean_t *ncp)
{
	zap_t *zap;

	int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
	    FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_lookup_impl(zap, name, integer_size,
	    num_integers, buf, mt, realname, rn_len, ncp);
	zap_unlockdir(zap, FTAG);
	return (err);
}

static int
zap_prefetch_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints)
{
	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	fzap_prefetch(zn);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (0);
}

int
zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_prefetch_uint64_impl(zap, key, key_numints);
	/* zap_prefetch_uint64_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints)
{
	zap_t *zap;

	int err =
	    zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_prefetch_uint64_impl(zap, key, key_numints);
	/* zap_prefetch_uint64_impl() calls zap_unlockdir() */
	return (err);
}

static int
zap_lookup_uint64_impl(zap_t *zap, const uint64_t *key,
    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
{
	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	int err = fzap_lookup(zn, integer_size, num_integers, buf,
	    NULL, 0, NULL);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size,
	    num_integers, buf);
	/* zap_lookup_uint64_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
{
	zap_t *zap;

	int err =
	    zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size,
	    num_integers, buf);
	/* zap_lookup_uint64_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_contains(objset_t *os, uint64_t zapobj, const char *name)
{
	int err = zap_lookup_norm(os, zapobj, name, 0,
	    0, NULL, 0, NULL, 0, NULL);
	if (err == EOVERFLOW || err == EINVAL)
		err = 0; /* found, but skipped reading the value */
	return (err);
}

int
zap_length(objset_t *os, uint64_t zapobj, const char *name,
    uint64_t *integer_size, uint64_t *num_integers)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_length(zn, integer_size, num_integers);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t *mze = mze_find(zn, &idx);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			if (integer_size)
				*integer_size = 8;
			if (num_integers)
				*num_integers = 1;
		}
	}
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, uint64_t *integer_size, uint64_t *num_integers)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_length(zn, integer_size, num_integers);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

static void
mzap_addent(zap_name_t *zn, uint64_t value)
{
	zap_t *zap = zn->zn_zap;
	uint16_t start = zap->zap_m.zap_alloc_next;

	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

#ifdef ZFS_DEBUG
	for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
		ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
	}
#endif

	uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash);
	/* given the limited size of the microzap, this can't happen */
	ASSERT(cd < zap_maxcd(zap));

again:
	for (uint16_t i = start; i < zap->zap_m.zap_num_chunks; i++) {
		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
		if (mze->mze_name[0] == 0) {
			mze->mze_value = value;
			mze->mze_cd = cd;
			(void) strlcpy(mze->mze_name, zn->zn_key_orig,
			    sizeof (mze->mze_name));
			zap->zap_m.zap_num_entries++;
			zap->zap_m.zap_alloc_next = i+1;
			if (zap->zap_m.zap_alloc_next ==
			    zap->zap_m.zap_num_chunks)
				zap->zap_m.zap_alloc_next = 0;
			mze_insert(zap, i, zn->zn_hash);
			return;
		}
	}
	if (start != 0) {
		start = 0;
		goto again;
	}
	cmn_err(CE_PANIC, "out of entries!");
}

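/*
 * Collision-differentiator example (hypothetical values): if "foo" and
 * "bar" both hash to mze_hash 0x1234abcd, the first one added gets cd 0 and
 * the second gets cd 1 from mze_find_unused_cd(), so both can coexist in
 * the microzap and keep distinct (hash, cd) cursor cookies.
 */
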
static int
zap_add_impl(zap_t *zap, const char *key,
    int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx, const void *tag)
{
	const uint64_t *intval = val;
	int err = 0;

	zap_name_t *zn = zap_name_alloc_str(zap, key, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
		zap = zn->zn_zap;	/* fzap_add() may change zap */
	} else if (integer_size != 8 || num_integers != 1 ||
	    strlen(key) >= MZAP_NAME_LEN ||
	    !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
		err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
		if (err == 0) {
			err = fzap_add(zn, integer_size, num_integers, val,
			    tag, tx);
		}
		zap = zn->zn_zap;	/* fzap_add() may change zap */
	} else {
		zfs_btree_index_t idx;
		if (mze_find(zn, &idx) != NULL) {
			err = SET_ERROR(EEXIST);
		} else {
			mzap_addent(zn, *intval);
		}
	}
	ASSERT(zap == zn->zn_zap);
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_add() failed */
		zap_unlockdir(zap, tag);
	return (err);
}

int
zap_add(objset_t *os, uint64_t zapobj, const char *key,
    int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;

	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
	/* zap_add_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_add_by_dnode(dnode_t *dn, const char *key,
    int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;

	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
	/* zap_add_impl() calls zap_unlockdir() */
	return (err);
}

static int
zap_add_uint64_impl(zap_t *zap, const uint64_t *key,
    int key_numints, int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx, const void *tag)
{
	int err;

	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
	zap = zn->zn_zap;	/* fzap_add() may change zap */
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_add() failed */
		zap_unlockdir(zap, tag);
	return (err);
}

int
zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_add_uint64_impl(zap, key, key_numints,
	    integer_size, num_integers, val, tx, FTAG);
	/* zap_add_uint64_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
    int key_numints, int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_add_uint64_impl(zap, key, key_numints,
	    integer_size, num_integers, val, tx, FTAG);
	/* zap_add_uint64_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_update(objset_t *os, uint64_t zapobj, const char *name,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	const uint64_t *intval = val;

	int err =
	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_update(zn, integer_size, num_integers, val,
		    FTAG, tx);
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else if (integer_size != 8 || num_integers != 1 ||
	    strlen(name) >= MZAP_NAME_LEN) {
		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
		    (u_longlong_t)zapobj, integer_size,
		    (u_longlong_t)num_integers, name);
		err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
		if (err == 0) {
			err = fzap_update(zn, integer_size, num_integers,
			    val, FTAG, tx);
		}
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t *mze = mze_find(zn, &idx);
		if (mze != NULL) {
			MZE_PHYS(zap, mze)->mze_value = *intval;
		} else {
			mzap_addent(zn, *intval);
		}
	}
	ASSERT(zap == zn->zn_zap);
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
		zap_unlockdir(zap, FTAG);
	return (err);
}

static int
zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx,
    const void *tag)
{
	int err;

	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_update(zn, integer_size, num_integers, val, tag, tx);
	zap = zn->zn_zap;	/* fzap_update() may change zap */
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
		zap_unlockdir(zap, tag);
	return (err);
}

int
zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, int integer_size, uint64_t num_integers, const void *val,
    dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_update_uint64_impl(zap, key, key_numints,
	    integer_size, num_integers, val, tx, FTAG);
	/* zap_update_uint64_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_update_uint64_impl(zap, key, key_numints,
	    integer_size, num_integers, val, tx, FTAG);
	/* zap_update_uint64_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
{
	return (zap_remove_norm(os, zapobj, name, 0, tx));
}

static int
zap_remove_impl(zap_t *zap, const char *name,
    matchtype_t mt, dmu_tx_t *tx)
{
	int err = 0;

	zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
	if (zn == NULL)
		return (SET_ERROR(ENOTSUP));
	if (!zap->zap_ismicro) {
		err = fzap_remove(zn, tx);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t *mze = mze_find(zn, &idx);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			zap->zap_m.zap_num_entries--;
			memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t));
			zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx);
		}
	}
	zap_name_free(zn);
	return (err);
}

int
zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
    matchtype_t mt, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;

	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
	if (err)
		return (err);
	err = zap_remove_impl(zap, name, mt, tx);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;

	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
	if (err)
		return (err);
	err = zap_remove_impl(zap, name, 0, tx);
	zap_unlockdir(zap, FTAG);
	return (err);
}

static int
zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
    dmu_tx_t *tx, const void *tag)
{
	int err;

	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_remove(zn, tx);
	zap_name_free(zn);
	zap_unlockdir(zap, tag);
	return (err);
}

int
zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
	/* zap_remove_uint64_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
    dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
	/* zap_remove_uint64_impl() calls zap_unlockdir() */
	return (err);
}

static zap_attribute_t *
zap_attribute_alloc_impl(boolean_t longname)
{
	zap_attribute_t *za;

	za = kmem_cache_alloc((longname)? zap_attr_long_cache : zap_attr_cache,
	    KM_SLEEP);
	za->za_name_len = (longname)? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN;
	return (za);
}

zap_attribute_t *
zap_attribute_alloc(void)
{
	return (zap_attribute_alloc_impl(B_FALSE));
}

zap_attribute_t *
zap_attribute_long_alloc(void)
{
	return (zap_attribute_alloc_impl(B_TRUE));
}

void
zap_attribute_free(zap_attribute_t *za)
{
	if (za->za_name_len == ZAP_MAXNAMELEN) {
		kmem_cache_free(zap_attr_cache, za);
	} else {
		ASSERT3U(za->za_name_len, ==, ZAP_MAXNAMELEN_NEW);
		kmem_cache_free(zap_attr_long_cache, za);
	}
}

/*
 * Routines for iterating over the attributes.
 */

static void
zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
    uint64_t serialized, boolean_t prefetch)
{
	zc->zc_objset = os;
	zc->zc_zap = NULL;
	zc->zc_leaf = NULL;
	zc->zc_zapobj = zapobj;
	zc->zc_serialized = serialized;
	zc->zc_hash = 0;
	zc->zc_cd = 0;
	zc->zc_prefetch = prefetch;
}

void
zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
    uint64_t serialized)
{
	zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
}

/*
 * Initialize a cursor at the beginning of the ZAP object.  The entire
 * ZAP object will be prefetched.
 */
void
zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
{
	zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
}

/*
 * Initialize a cursor at the beginning, but request that we not prefetch
 * the entire ZAP object.
 */
void
zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
{
	zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
}

void
zap_cursor_fini(zap_cursor_t *zc)
{
	if (zc->zc_zap) {
		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
		zap_unlockdir(zc->zc_zap, NULL);
		zc->zc_zap = NULL;
	}
	if (zc->zc_leaf) {
		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
		zap_put_leaf(zc->zc_leaf);
		zc->zc_leaf = NULL;
	}
	zc->zc_objset = NULL;
}

uint64_t
zap_cursor_serialize(zap_cursor_t *zc)
{
	if (zc->zc_hash == -1ULL)
		return (-1ULL);
	if (zc->zc_zap == NULL)
		return (zc->zc_serialized);
	ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
	ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));

	/*
	 * We want to keep the high 32 bits of the cursor zero if we can, so
	 * that 32-bit programs can access this.  So usually use a small
	 * (28-bit) hash value so we can fit 4 bits of cd into the low 32 bits
	 * of the cursor.
	 *
	 * [ collision differentiator | zap_hashbits()-bit hash value ]
	 */
	return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
	    ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
}

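/*
 * Worked round trip (illustrative, default 28-bit hashes): a cursor at
 * zc_hash 0xFEDCBA9000000000 with zc_cd 5 serializes to
 * (0xFEDCBA9000000000 >> 36) | (5 << 28) = 0x5FEDCBA9, which still fits in
 * 32 bits.  zap_cursor_retrieve() below inverts this: 0x5FEDCBA9 << 36
 * discards the cd bits and restores the hash, and 0x5FEDCBA9 >> 28 recovers
 * cd = 5.
 */
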
int
zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
{
	int err;

	if (zc->zc_hash == -1ULL)
		return (SET_ERROR(ENOENT));

	if (zc->zc_zap == NULL) {
		int hb;
		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
		    RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
		if (err != 0)
			return (err);

		/*
		 * To support zap_cursor_init_serialized, advance, retrieve,
		 * we must add to the existing zc_cd, which may already
		 * be 1 due to the zap_cursor_advance.
		 */
		ASSERT(zc->zc_hash == 0);
		hb = zap_hashbits(zc->zc_zap);
		zc->zc_hash = zc->zc_serialized << (64 - hb);
		zc->zc_cd += zc->zc_serialized >> hb;
		if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
			zc->zc_cd = 0;
	} else {
		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
	}
	if (!zc->zc_zap->zap_ismicro) {
		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t mze_tofind;

		mze_tofind.mze_hash = zc->zc_hash >> 32;
		mze_tofind.mze_cd = zc->zc_cd;

		mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree,
		    &mze_tofind, &idx);
		if (mze == NULL) {
			mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree,
			    &idx, &idx);
		}
		if (mze) {
			mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
			ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
			za->za_normalization_conflict =
			    mzap_normalization_conflict(zc->zc_zap, NULL,
			    mze, &idx);
			za->za_integer_length = 8;
			za->za_num_integers = 1;
			za->za_first_integer = mzep->mze_value;
			(void) strlcpy(za->za_name, mzep->mze_name,
			    za->za_name_len);
			zc->zc_hash = (uint64_t)mze->mze_hash << 32;
			zc->zc_cd = mze->mze_cd;
			err = 0;
		} else {
			zc->zc_hash = -1ULL;
			err = SET_ERROR(ENOENT);
		}
	}
	rw_exit(&zc->zc_zap->zap_rwlock);
	return (err);
}

void
zap_cursor_advance(zap_cursor_t *zc)
{
	if (zc->zc_hash == -1ULL)
		return;
	zc->zc_cd++;
}

int
zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);

	memset(zs, 0, sizeof (zap_stats_t));

	if (zap->zap_ismicro) {
		zs->zs_blocksize = zap->zap_dbuf->db_size;
		zs->zs_num_entries = zap->zap_m.zap_num_entries;
		zs->zs_num_blocks = 1;
	} else {
		fzap_get_stats(zap, zs);
	}
	zap_unlockdir(zap, FTAG);
	return (0);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(zap_create);
EXPORT_SYMBOL(zap_create_dnsize);
EXPORT_SYMBOL(zap_create_norm);
EXPORT_SYMBOL(zap_create_norm_dnsize);
EXPORT_SYMBOL(zap_create_flags);
EXPORT_SYMBOL(zap_create_flags_dnsize);
EXPORT_SYMBOL(zap_create_claim);
EXPORT_SYMBOL(zap_create_claim_norm);
EXPORT_SYMBOL(zap_create_claim_norm_dnsize);
EXPORT_SYMBOL(zap_create_hold);
EXPORT_SYMBOL(zap_destroy);
EXPORT_SYMBOL(zap_lookup);
EXPORT_SYMBOL(zap_lookup_by_dnode);
EXPORT_SYMBOL(zap_lookup_norm);
EXPORT_SYMBOL(zap_lookup_uint64);
EXPORT_SYMBOL(zap_contains);
EXPORT_SYMBOL(zap_prefetch);
EXPORT_SYMBOL(zap_prefetch_uint64);
EXPORT_SYMBOL(zap_prefetch_object);
EXPORT_SYMBOL(zap_add);
EXPORT_SYMBOL(zap_add_by_dnode);
EXPORT_SYMBOL(zap_add_uint64);
EXPORT_SYMBOL(zap_add_uint64_by_dnode);
EXPORT_SYMBOL(zap_update);
EXPORT_SYMBOL(zap_update_uint64);
EXPORT_SYMBOL(zap_update_uint64_by_dnode);
EXPORT_SYMBOL(zap_length);
EXPORT_SYMBOL(zap_length_uint64);
EXPORT_SYMBOL(zap_remove);
EXPORT_SYMBOL(zap_remove_by_dnode);
EXPORT_SYMBOL(zap_remove_norm);
EXPORT_SYMBOL(zap_remove_uint64);
EXPORT_SYMBOL(zap_remove_uint64_by_dnode);
EXPORT_SYMBOL(zap_count);
EXPORT_SYMBOL(zap_value_search);
EXPORT_SYMBOL(zap_join);
EXPORT_SYMBOL(zap_join_increment);
EXPORT_SYMBOL(zap_add_int);
EXPORT_SYMBOL(zap_remove_int);
EXPORT_SYMBOL(zap_lookup_int);
EXPORT_SYMBOL(zap_increment_int);
EXPORT_SYMBOL(zap_add_int_key);
EXPORT_SYMBOL(zap_lookup_int_key);
EXPORT_SYMBOL(zap_increment);
EXPORT_SYMBOL(zap_cursor_init);
EXPORT_SYMBOL(zap_cursor_fini);
EXPORT_SYMBOL(zap_cursor_retrieve);
EXPORT_SYMBOL(zap_cursor_advance);
EXPORT_SYMBOL(zap_cursor_serialize);
EXPORT_SYMBOL(zap_cursor_init_serialized);
EXPORT_SYMBOL(zap_get_stats);

ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW,
	"Maximum micro ZAP size before converting to a fat ZAP, "
	"in bytes (max 1M)");
#endif