1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 25 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 26 * Copyright 2017 Nexenta Systems, Inc. 27 */ 28 29 #include <sys/zio.h> 30 #include <sys/spa.h> 31 #include <sys/dmu.h> 32 #include <sys/zfs_context.h> 33 #include <sys/zap.h> 34 #include <sys/zap_impl.h> 35 #include <sys/zap_leaf.h> 36 #include <sys/btree.h> 37 #include <sys/arc.h> 38 #include <sys/dmu_objset.h> 39 40 #ifdef _KERNEL 41 #include <sys/sunddi.h> 42 #endif 43 44 int zap_micro_max_size = MZAP_MAX_BLKSZ; 45 46 static int mzap_upgrade(zap_t **zapp, 47 const void *tag, dmu_tx_t *tx, zap_flags_t flags); 48 49 uint64_t 50 zap_getflags(zap_t *zap) 51 { 52 if (zap->zap_ismicro) 53 return (0); 54 return (zap_f_phys(zap)->zap_flags); 55 } 56 57 int 58 zap_hashbits(zap_t *zap) 59 { 60 if (zap_getflags(zap) & ZAP_FLAG_HASH64) 61 return (48); 62 else 63 return (28); 64 } 65 66 uint32_t 67 zap_maxcd(zap_t *zap) 68 { 69 if (zap_getflags(zap) & ZAP_FLAG_HASH64) 70 return ((1<<16)-1); 71 else 72 return (-1U); 73 } 74 75 static uint64_t 76 zap_hash(zap_name_t *zn) 77 { 78 zap_t *zap = zn->zn_zap; 79 uint64_t h = 0; 80 81 if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) { 82 ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY); 83 h = *(uint64_t *)zn->zn_key_orig; 84 } else { 85 h = zap->zap_salt; 86 ASSERT(h != 0); 87 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 88 89 if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { 90 const uint64_t *wp = zn->zn_key_norm; 91 92 ASSERT(zn->zn_key_intlen == 8); 93 for (int i = 0; i < zn->zn_key_norm_numints; 94 wp++, i++) { 95 uint64_t word = *wp; 96 97 for (int j = 0; j < 8; j++) { 98 h = (h >> 8) ^ 99 zfs_crc64_table[(h ^ word) & 0xFF]; 100 word >>= NBBY; 101 } 102 } 103 } else { 104 const uint8_t *cp = zn->zn_key_norm; 105 106 /* 107 * We previously stored the terminating null on 108 * disk, but didn't hash it, so we need to 109 * continue to not hash it. (The 110 * zn_key_*_numints includes the terminating 111 * null for non-binary keys.) 112 */ 113 int len = zn->zn_key_norm_numints - 1; 114 115 ASSERT(zn->zn_key_intlen == 1); 116 for (int i = 0; i < len; cp++, i++) { 117 h = (h >> 8) ^ 118 zfs_crc64_table[(h ^ *cp) & 0xFF]; 119 } 120 } 121 } 122 /* 123 * Don't use all 64 bits, since we need some in the cookie for 124 * the collision differentiator. We MUST use the high bits, 125 * since those are the ones that we first pay attention to when 126 * choosing the bucket. 127 */ 128 h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1); 129 130 return (h); 131 } 132 133 static int 134 zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags) 135 { 136 ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY)); 137 138 size_t inlen = strlen(name) + 1; 139 size_t outlen = ZAP_MAXNAMELEN; 140 141 int err = 0; 142 (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, 143 normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID, 144 U8_UNICODE_LATEST, &err); 145 146 return (err); 147 } 148 149 boolean_t 150 zap_match(zap_name_t *zn, const char *matchname) 151 { 152 ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY)); 153 154 if (zn->zn_matchtype & MT_NORMALIZE) { 155 char norm[ZAP_MAXNAMELEN]; 156 157 if (zap_normalize(zn->zn_zap, matchname, norm, 158 zn->zn_normflags) != 0) 159 return (B_FALSE); 160 161 return (strcmp(zn->zn_key_norm, norm) == 0); 162 } else { 163 return (strcmp(zn->zn_key_orig, matchname) == 0); 164 } 165 } 166 167 static zap_name_t * 168 zap_name_alloc(zap_t *zap) 169 { 170 zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); 171 zn->zn_zap = zap; 172 return (zn); 173 } 174 175 void 176 zap_name_free(zap_name_t *zn) 177 { 178 kmem_free(zn, sizeof (zap_name_t)); 179 } 180 181 static int 182 zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt) 183 { 184 zap_t *zap = zn->zn_zap; 185 186 zn->zn_key_intlen = sizeof (*key); 187 zn->zn_key_orig = key; 188 zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1; 189 zn->zn_matchtype = mt; 190 zn->zn_normflags = zap->zap_normflags; 191 192 /* 193 * If we're dealing with a case sensitive lookup on a mixed or 194 * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup 195 * will fold case to all caps overriding the lookup request. 196 */ 197 if (mt & MT_MATCH_CASE) 198 zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER; 199 200 if (zap->zap_normflags) { 201 /* 202 * We *must* use zap_normflags because this normalization is 203 * what the hash is computed from. 204 */ 205 if (zap_normalize(zap, key, zn->zn_normbuf, 206 zap->zap_normflags) != 0) 207 return (SET_ERROR(ENOTSUP)); 208 zn->zn_key_norm = zn->zn_normbuf; 209 zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; 210 } else { 211 if (mt != 0) 212 return (SET_ERROR(ENOTSUP)); 213 zn->zn_key_norm = zn->zn_key_orig; 214 zn->zn_key_norm_numints = zn->zn_key_orig_numints; 215 } 216 217 zn->zn_hash = zap_hash(zn); 218 219 if (zap->zap_normflags != zn->zn_normflags) { 220 /* 221 * We *must* use zn_normflags because this normalization is 222 * what the matching is based on. (Not the hash!) 223 */ 224 if (zap_normalize(zap, key, zn->zn_normbuf, 225 zn->zn_normflags) != 0) 226 return (SET_ERROR(ENOTSUP)); 227 zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; 228 } 229 230 return (0); 231 } 232 233 zap_name_t * 234 zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt) 235 { 236 zap_name_t *zn = zap_name_alloc(zap); 237 if (zap_name_init_str(zn, key, mt) != 0) { 238 zap_name_free(zn); 239 return (NULL); 240 } 241 return (zn); 242 } 243 244 static zap_name_t * 245 zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) 246 { 247 zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); 248 249 ASSERT(zap->zap_normflags == 0); 250 zn->zn_zap = zap; 251 zn->zn_key_intlen = sizeof (*key); 252 zn->zn_key_orig = zn->zn_key_norm = key; 253 zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints; 254 zn->zn_matchtype = 0; 255 256 zn->zn_hash = zap_hash(zn); 257 return (zn); 258 } 259 260 static void 261 mzap_byteswap(mzap_phys_t *buf, size_t size) 262 { 263 buf->mz_block_type = BSWAP_64(buf->mz_block_type); 264 buf->mz_salt = BSWAP_64(buf->mz_salt); 265 buf->mz_normflags = BSWAP_64(buf->mz_normflags); 266 int max = (size / MZAP_ENT_LEN) - 1; 267 for (int i = 0; i < max; i++) { 268 buf->mz_chunk[i].mze_value = 269 BSWAP_64(buf->mz_chunk[i].mze_value); 270 buf->mz_chunk[i].mze_cd = 271 BSWAP_32(buf->mz_chunk[i].mze_cd); 272 } 273 } 274 275 void 276 zap_byteswap(void *buf, size_t size) 277 { 278 uint64_t block_type = *(uint64_t *)buf; 279 280 if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { 281 /* ASSERT(magic == ZAP_LEAF_MAGIC); */ 282 mzap_byteswap(buf, size); 283 } else { 284 fzap_byteswap(buf, size); 285 } 286 } 287 288 __attribute__((always_inline)) inline 289 static int 290 mze_compare(const void *arg1, const void *arg2) 291 { 292 const mzap_ent_t *mze1 = arg1; 293 const mzap_ent_t *mze2 = arg2; 294 295 return (TREE_CMP((uint64_t)(mze1->mze_hash) << 32 | mze1->mze_cd, 296 (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd)); 297 } 298 299 ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t, 300 mze_compare) 301 302 static void 303 mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash) 304 { 305 mzap_ent_t mze; 306 307 ASSERT(zap->zap_ismicro); 308 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); 309 310 mze.mze_chunkid = chunkid; 311 ASSERT0(hash & 0xffffffff); 312 mze.mze_hash = hash >> 32; 313 ASSERT3U(MZE_PHYS(zap, &mze)->mze_cd, <=, 0xffff); 314 mze.mze_cd = (uint16_t)MZE_PHYS(zap, &mze)->mze_cd; 315 ASSERT(MZE_PHYS(zap, &mze)->mze_name[0] != 0); 316 zfs_btree_add(&zap->zap_m.zap_tree, &mze); 317 } 318 319 static mzap_ent_t * 320 mze_find(zap_name_t *zn, zfs_btree_index_t *idx) 321 { 322 mzap_ent_t mze_tofind; 323 mzap_ent_t *mze; 324 zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree; 325 326 ASSERT(zn->zn_zap->zap_ismicro); 327 ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); 328 329 ASSERT0(zn->zn_hash & 0xffffffff); 330 mze_tofind.mze_hash = zn->zn_hash >> 32; 331 mze_tofind.mze_cd = 0; 332 333 mze = zfs_btree_find(tree, &mze_tofind, idx); 334 if (mze == NULL) 335 mze = zfs_btree_next(tree, idx, idx); 336 for (; mze && mze->mze_hash == mze_tofind.mze_hash; 337 mze = zfs_btree_next(tree, idx, idx)) { 338 ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); 339 if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) 340 return (mze); 341 } 342 343 return (NULL); 344 } 345 346 static uint32_t 347 mze_find_unused_cd(zap_t *zap, uint64_t hash) 348 { 349 mzap_ent_t mze_tofind; 350 zfs_btree_index_t idx; 351 zfs_btree_t *tree = &zap->zap_m.zap_tree; 352 353 ASSERT(zap->zap_ismicro); 354 ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); 355 356 ASSERT0(hash & 0xffffffff); 357 hash >>= 32; 358 mze_tofind.mze_hash = hash; 359 mze_tofind.mze_cd = 0; 360 361 uint32_t cd = 0; 362 for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx); 363 mze && mze->mze_hash == hash; 364 mze = zfs_btree_next(tree, &idx, &idx)) { 365 if (mze->mze_cd != cd) 366 break; 367 cd++; 368 } 369 370 return (cd); 371 } 372 373 /* 374 * Each mzap entry requires at max : 4 chunks 375 * 3 chunks for names + 1 chunk for value. 376 */ 377 #define MZAP_ENT_CHUNKS (1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \ 378 ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t))) 379 380 /* 381 * Check if the current entry keeps the colliding entries under the fatzap leaf 382 * size. 383 */ 384 static boolean_t 385 mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash) 386 { 387 zap_t *zap = zn->zn_zap; 388 mzap_ent_t mze_tofind; 389 zfs_btree_index_t idx; 390 zfs_btree_t *tree = &zap->zap_m.zap_tree; 391 uint32_t mzap_ents = 0; 392 393 ASSERT0(hash & 0xffffffff); 394 hash >>= 32; 395 mze_tofind.mze_hash = hash; 396 mze_tofind.mze_cd = 0; 397 398 for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx); 399 mze && mze->mze_hash == hash; 400 mze = zfs_btree_next(tree, &idx, &idx)) { 401 mzap_ents++; 402 } 403 404 /* Include the new entry being added */ 405 mzap_ents++; 406 407 return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS)); 408 } 409 410 static void 411 mze_destroy(zap_t *zap) 412 { 413 zfs_btree_clear(&zap->zap_m.zap_tree); 414 zfs_btree_destroy(&zap->zap_m.zap_tree); 415 } 416 417 static zap_t * 418 mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) 419 { 420 zap_t *winner; 421 uint64_t *zap_hdr = (uint64_t *)db->db_data; 422 uint64_t zap_block_type = zap_hdr[0]; 423 uint64_t zap_magic = zap_hdr[1]; 424 425 ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); 426 427 zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); 428 rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL); 429 rw_enter(&zap->zap_rwlock, RW_WRITER); 430 zap->zap_objset = os; 431 zap->zap_object = obj; 432 zap->zap_dbuf = db; 433 434 if (zap_block_type != ZBT_MICRO) { 435 mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT, 436 0); 437 zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1; 438 if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) { 439 winner = NULL; /* No actual winner here... */ 440 goto handle_winner; 441 } 442 } else { 443 zap->zap_ismicro = TRUE; 444 } 445 446 /* 447 * Make sure that zap_ismicro is set before we let others see 448 * it, because zap_lockdir() checks zap_ismicro without the lock 449 * held. 450 */ 451 dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf); 452 winner = dmu_buf_set_user(db, &zap->zap_dbu); 453 454 if (winner != NULL) 455 goto handle_winner; 456 457 if (zap->zap_ismicro) { 458 zap->zap_salt = zap_m_phys(zap)->mz_salt; 459 zap->zap_normflags = zap_m_phys(zap)->mz_normflags; 460 zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; 461 462 /* 463 * Reduce B-tree leaf from 4KB to 512 bytes to reduce memmove() 464 * overhead on massive inserts below. It still allows to store 465 * 62 entries before we have to add 2KB B-tree core node. 466 */ 467 zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare, 468 mze_find_in_buf, sizeof (mzap_ent_t), 512); 469 470 zap_name_t *zn = zap_name_alloc(zap); 471 for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) { 472 mzap_ent_phys_t *mze = 473 &zap_m_phys(zap)->mz_chunk[i]; 474 if (mze->mze_name[0]) { 475 zap->zap_m.zap_num_entries++; 476 zap_name_init_str(zn, mze->mze_name, 0); 477 mze_insert(zap, i, zn->zn_hash); 478 } 479 } 480 zap_name_free(zn); 481 } else { 482 zap->zap_salt = zap_f_phys(zap)->zap_salt; 483 zap->zap_normflags = zap_f_phys(zap)->zap_normflags; 484 485 ASSERT3U(sizeof (struct zap_leaf_header), ==, 486 2*ZAP_LEAF_CHUNKSIZE); 487 488 /* 489 * The embedded pointer table should not overlap the 490 * other members. 491 */ 492 ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >, 493 &zap_f_phys(zap)->zap_salt); 494 495 /* 496 * The embedded pointer table should end at the end of 497 * the block 498 */ 499 ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap, 500 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) - 501 (uintptr_t)zap_f_phys(zap), ==, 502 zap->zap_dbuf->db_size); 503 } 504 rw_exit(&zap->zap_rwlock); 505 return (zap); 506 507 handle_winner: 508 rw_exit(&zap->zap_rwlock); 509 rw_destroy(&zap->zap_rwlock); 510 if (!zap->zap_ismicro) 511 mutex_destroy(&zap->zap_f.zap_num_entries_mtx); 512 kmem_free(zap, sizeof (zap_t)); 513 return (winner); 514 } 515 516 /* 517 * This routine "consumes" the caller's hold on the dbuf, which must 518 * have the specified tag. 519 */ 520 static int 521 zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx, 522 krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) 523 { 524 ASSERT0(db->db_offset); 525 objset_t *os = dmu_buf_get_objset(db); 526 uint64_t obj = db->db_object; 527 dmu_object_info_t doi; 528 529 *zapp = NULL; 530 531 dmu_object_info_from_db(db, &doi); 532 if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP) 533 return (SET_ERROR(EINVAL)); 534 535 zap_t *zap = dmu_buf_get_user(db); 536 if (zap == NULL) { 537 zap = mzap_open(os, obj, db); 538 if (zap == NULL) { 539 /* 540 * mzap_open() didn't like what it saw on-disk. 541 * Check for corruption! 542 */ 543 return (SET_ERROR(EIO)); 544 } 545 } 546 547 /* 548 * We're checking zap_ismicro without the lock held, in order to 549 * tell what type of lock we want. Once we have some sort of 550 * lock, see if it really is the right type. In practice this 551 * can only be different if it was upgraded from micro to fat, 552 * and micro wanted WRITER but fat only needs READER. 553 */ 554 krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; 555 rw_enter(&zap->zap_rwlock, lt); 556 if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { 557 /* it was upgraded, now we only need reader */ 558 ASSERT(lt == RW_WRITER); 559 ASSERT(RW_READER == 560 ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)); 561 rw_downgrade(&zap->zap_rwlock); 562 lt = RW_READER; 563 } 564 565 zap->zap_objset = os; 566 567 if (lt == RW_WRITER) 568 dmu_buf_will_dirty(db, tx); 569 570 ASSERT3P(zap->zap_dbuf, ==, db); 571 572 ASSERT(!zap->zap_ismicro || 573 zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); 574 if (zap->zap_ismicro && tx && adding && 575 zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { 576 uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; 577 if (newsz > zap_micro_max_size) { 578 dprintf("upgrading obj %llu: num_entries=%u\n", 579 (u_longlong_t)obj, zap->zap_m.zap_num_entries); 580 *zapp = zap; 581 int err = mzap_upgrade(zapp, tag, tx, 0); 582 if (err != 0) 583 rw_exit(&zap->zap_rwlock); 584 return (err); 585 } 586 VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx)); 587 zap->zap_m.zap_num_chunks = 588 db->db_size / MZAP_ENT_LEN - 1; 589 } 590 591 *zapp = zap; 592 return (0); 593 } 594 595 static int 596 zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, 597 krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, 598 zap_t **zapp) 599 { 600 dmu_buf_t *db; 601 602 int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); 603 if (err != 0) { 604 return (err); 605 } 606 #ifdef ZFS_DEBUG 607 { 608 dmu_object_info_t doi; 609 dmu_object_info_from_db(db, &doi); 610 ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); 611 } 612 #endif 613 614 err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); 615 if (err != 0) { 616 dmu_buf_rele(db, tag); 617 } 618 return (err); 619 } 620 621 int 622 zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, 623 krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, 624 zap_t **zapp) 625 { 626 dmu_buf_t *db; 627 628 int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); 629 if (err != 0) 630 return (err); 631 #ifdef ZFS_DEBUG 632 { 633 dmu_object_info_t doi; 634 dmu_object_info_from_db(db, &doi); 635 ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); 636 } 637 #endif 638 err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); 639 if (err != 0) 640 dmu_buf_rele(db, tag); 641 return (err); 642 } 643 644 void 645 zap_unlockdir(zap_t *zap, const void *tag) 646 { 647 rw_exit(&zap->zap_rwlock); 648 dmu_buf_rele(zap->zap_dbuf, tag); 649 } 650 651 static int 652 mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags) 653 { 654 int err = 0; 655 zap_t *zap = *zapp; 656 657 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); 658 659 int sz = zap->zap_dbuf->db_size; 660 mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP); 661 memcpy(mzp, zap->zap_dbuf->db_data, sz); 662 int nchunks = zap->zap_m.zap_num_chunks; 663 664 if (!flags) { 665 err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, 666 1ULL << fzap_default_block_shift, 0, tx); 667 if (err != 0) { 668 vmem_free(mzp, sz); 669 return (err); 670 } 671 } 672 673 dprintf("upgrading obj=%llu with %u chunks\n", 674 (u_longlong_t)zap->zap_object, nchunks); 675 /* XXX destroy the tree later, so we can use the stored hash value */ 676 mze_destroy(zap); 677 678 fzap_upgrade(zap, tx, flags); 679 680 zap_name_t *zn = zap_name_alloc(zap); 681 for (int i = 0; i < nchunks; i++) { 682 mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; 683 if (mze->mze_name[0] == 0) 684 continue; 685 dprintf("adding %s=%llu\n", 686 mze->mze_name, (u_longlong_t)mze->mze_value); 687 zap_name_init_str(zn, mze->mze_name, 0); 688 /* If we fail here, we would end up losing entries */ 689 VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, 690 tag, tx)); 691 zap = zn->zn_zap; /* fzap_add_cd() may change zap */ 692 } 693 zap_name_free(zn); 694 vmem_free(mzp, sz); 695 *zapp = zap; 696 return (0); 697 } 698 699 /* 700 * The "normflags" determine the behavior of the matchtype_t which is 701 * passed to zap_lookup_norm(). Names which have the same normalized 702 * version will be stored with the same hash value, and therefore we can 703 * perform normalization-insensitive lookups. We can be Unicode form- 704 * insensitive and/or case-insensitive. The following flags are valid for 705 * "normflags": 706 * 707 * U8_TEXTPREP_NFC 708 * U8_TEXTPREP_NFD 709 * U8_TEXTPREP_NFKC 710 * U8_TEXTPREP_NFKD 711 * U8_TEXTPREP_TOUPPER 712 * 713 * The *_NF* (Normalization Form) flags are mutually exclusive; at most one 714 * of them may be supplied. 715 */ 716 void 717 mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx) 718 { 719 dmu_buf_t *db; 720 721 VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); 722 723 dmu_buf_will_dirty(db, tx); 724 mzap_phys_t *zp = db->db_data; 725 zp->mz_block_type = ZBT_MICRO; 726 zp->mz_salt = 727 ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL; 728 zp->mz_normflags = normflags; 729 730 if (flags != 0) { 731 zap_t *zap; 732 /* Only fat zap supports flags; upgrade immediately. */ 733 VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER, 734 B_FALSE, B_FALSE, &zap)); 735 VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags)); 736 zap_unlockdir(zap, FTAG); 737 } else { 738 dmu_buf_rele(db, FTAG); 739 } 740 } 741 742 static uint64_t 743 zap_create_impl(objset_t *os, int normflags, zap_flags_t flags, 744 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, 745 dmu_object_type_t bonustype, int bonuslen, int dnodesize, 746 dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) 747 { 748 uint64_t obj; 749 750 ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); 751 752 if (allocated_dnode == NULL) { 753 dnode_t *dn; 754 obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift, 755 indirect_blockshift, bonustype, bonuslen, dnodesize, 756 &dn, FTAG, tx); 757 mzap_create_impl(dn, normflags, flags, tx); 758 dnode_rele(dn, FTAG); 759 } else { 760 obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift, 761 indirect_blockshift, bonustype, bonuslen, dnodesize, 762 allocated_dnode, tag, tx); 763 mzap_create_impl(*allocated_dnode, normflags, flags, tx); 764 } 765 766 return (obj); 767 } 768 769 int 770 zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, 771 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 772 { 773 return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen, 774 0, tx)); 775 } 776 777 int 778 zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot, 779 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) 780 { 781 return (zap_create_claim_norm_dnsize(os, obj, 782 0, ot, bonustype, bonuslen, dnodesize, tx)); 783 } 784 785 int 786 zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, 787 dmu_object_type_t ot, 788 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 789 { 790 return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype, 791 bonuslen, 0, tx)); 792 } 793 794 int 795 zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags, 796 dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, 797 int dnodesize, dmu_tx_t *tx) 798 { 799 dnode_t *dn; 800 int error; 801 802 ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); 803 error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen, 804 dnodesize, tx); 805 if (error != 0) 806 return (error); 807 808 error = dnode_hold(os, obj, FTAG, &dn); 809 if (error != 0) 810 return (error); 811 812 mzap_create_impl(dn, normflags, 0, tx); 813 814 dnode_rele(dn, FTAG); 815 816 return (0); 817 } 818 819 uint64_t 820 zap_create(objset_t *os, dmu_object_type_t ot, 821 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 822 { 823 return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); 824 } 825 826 uint64_t 827 zap_create_dnsize(objset_t *os, dmu_object_type_t ot, 828 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) 829 { 830 return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen, 831 dnodesize, tx)); 832 } 833 834 uint64_t 835 zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, 836 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 837 { 838 return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen, 839 0, tx)); 840 } 841 842 uint64_t 843 zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot, 844 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) 845 { 846 return (zap_create_impl(os, normflags, 0, ot, 0, 0, 847 bonustype, bonuslen, dnodesize, NULL, NULL, tx)); 848 } 849 850 uint64_t 851 zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, 852 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, 853 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 854 { 855 return (zap_create_flags_dnsize(os, normflags, flags, ot, 856 leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx)); 857 } 858 859 uint64_t 860 zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags, 861 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, 862 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) 863 { 864 return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift, 865 indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL, 866 tx)); 867 } 868 869 /* 870 * Create a zap object and return a pointer to the newly allocated dnode via 871 * the allocated_dnode argument. The returned dnode will be held and the 872 * caller is responsible for releasing the hold by calling dnode_rele(). 873 */ 874 uint64_t 875 zap_create_hold(objset_t *os, int normflags, zap_flags_t flags, 876 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, 877 dmu_object_type_t bonustype, int bonuslen, int dnodesize, 878 dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) 879 { 880 return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift, 881 indirect_blockshift, bonustype, bonuslen, dnodesize, 882 allocated_dnode, tag, tx)); 883 } 884 885 int 886 zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) 887 { 888 /* 889 * dmu_object_free will free the object number and free the 890 * data. Freeing the data will cause our pageout function to be 891 * called, which will destroy our data (zap_leaf_t's and zap_t). 892 */ 893 894 return (dmu_object_free(os, zapobj, tx)); 895 } 896 897 void 898 zap_evict_sync(void *dbu) 899 { 900 zap_t *zap = dbu; 901 902 rw_destroy(&zap->zap_rwlock); 903 904 if (zap->zap_ismicro) 905 mze_destroy(zap); 906 else 907 mutex_destroy(&zap->zap_f.zap_num_entries_mtx); 908 909 kmem_free(zap, sizeof (zap_t)); 910 } 911 912 int 913 zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) 914 { 915 zap_t *zap; 916 917 int err = 918 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); 919 if (err != 0) 920 return (err); 921 if (!zap->zap_ismicro) { 922 err = fzap_count(zap, count); 923 } else { 924 *count = zap->zap_m.zap_num_entries; 925 } 926 zap_unlockdir(zap, FTAG); 927 return (err); 928 } 929 930 /* 931 * zn may be NULL; if not specified, it will be computed if needed. 932 * See also the comment above zap_entry_normalization_conflict(). 933 */ 934 static boolean_t 935 mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze, 936 zfs_btree_index_t *idx) 937 { 938 boolean_t allocdzn = B_FALSE; 939 mzap_ent_t *other; 940 zfs_btree_index_t oidx; 941 942 if (zap->zap_normflags == 0) 943 return (B_FALSE); 944 945 for (other = zfs_btree_prev(&zap->zap_m.zap_tree, idx, &oidx); 946 other && other->mze_hash == mze->mze_hash; 947 other = zfs_btree_prev(&zap->zap_m.zap_tree, &oidx, &oidx)) { 948 949 if (zn == NULL) { 950 zn = zap_name_alloc_str(zap, 951 MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE); 952 allocdzn = B_TRUE; 953 } 954 if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { 955 if (allocdzn) 956 zap_name_free(zn); 957 return (B_TRUE); 958 } 959 } 960 961 for (other = zfs_btree_next(&zap->zap_m.zap_tree, idx, &oidx); 962 other && other->mze_hash == mze->mze_hash; 963 other = zfs_btree_next(&zap->zap_m.zap_tree, &oidx, &oidx)) { 964 965 if (zn == NULL) { 966 zn = zap_name_alloc_str(zap, 967 MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE); 968 allocdzn = B_TRUE; 969 } 970 if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { 971 if (allocdzn) 972 zap_name_free(zn); 973 return (B_TRUE); 974 } 975 } 976 977 if (allocdzn) 978 zap_name_free(zn); 979 return (B_FALSE); 980 } 981 982 /* 983 * Routines for manipulating attributes. 984 */ 985 986 int 987 zap_lookup(objset_t *os, uint64_t zapobj, const char *name, 988 uint64_t integer_size, uint64_t num_integers, void *buf) 989 { 990 return (zap_lookup_norm(os, zapobj, name, integer_size, 991 num_integers, buf, 0, NULL, 0, NULL)); 992 } 993 994 static int 995 zap_lookup_impl(zap_t *zap, const char *name, 996 uint64_t integer_size, uint64_t num_integers, void *buf, 997 matchtype_t mt, char *realname, int rn_len, 998 boolean_t *ncp) 999 { 1000 int err = 0; 1001 1002 zap_name_t *zn = zap_name_alloc_str(zap, name, mt); 1003 if (zn == NULL) 1004 return (SET_ERROR(ENOTSUP)); 1005 1006 if (!zap->zap_ismicro) { 1007 err = fzap_lookup(zn, integer_size, num_integers, buf, 1008 realname, rn_len, ncp); 1009 } else { 1010 zfs_btree_index_t idx; 1011 mzap_ent_t *mze = mze_find(zn, &idx); 1012 if (mze == NULL) { 1013 err = SET_ERROR(ENOENT); 1014 } else { 1015 if (num_integers < 1) { 1016 err = SET_ERROR(EOVERFLOW); 1017 } else if (integer_size != 8) { 1018 err = SET_ERROR(EINVAL); 1019 } else { 1020 *(uint64_t *)buf = 1021 MZE_PHYS(zap, mze)->mze_value; 1022 if (realname != NULL) 1023 (void) strlcpy(realname, 1024 MZE_PHYS(zap, mze)->mze_name, 1025 rn_len); 1026 if (ncp) { 1027 *ncp = mzap_normalization_conflict(zap, 1028 zn, mze, &idx); 1029 } 1030 } 1031 } 1032 } 1033 zap_name_free(zn); 1034 return (err); 1035 } 1036 1037 int 1038 zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, 1039 uint64_t integer_size, uint64_t num_integers, void *buf, 1040 matchtype_t mt, char *realname, int rn_len, 1041 boolean_t *ncp) 1042 { 1043 zap_t *zap; 1044 1045 int err = 1046 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); 1047 if (err != 0) 1048 return (err); 1049 err = zap_lookup_impl(zap, name, integer_size, 1050 num_integers, buf, mt, realname, rn_len, ncp); 1051 zap_unlockdir(zap, FTAG); 1052 return (err); 1053 } 1054 1055 int 1056 zap_prefetch(objset_t *os, uint64_t zapobj, const char *name) 1057 { 1058 zap_t *zap; 1059 int err; 1060 zap_name_t *zn; 1061 1062 err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); 1063 if (err) 1064 return (err); 1065 zn = zap_name_alloc_str(zap, name, 0); 1066 if (zn == NULL) { 1067 zap_unlockdir(zap, FTAG); 1068 return (SET_ERROR(ENOTSUP)); 1069 } 1070 1071 fzap_prefetch(zn); 1072 zap_name_free(zn); 1073 zap_unlockdir(zap, FTAG); 1074 return (err); 1075 } 1076 1077 int 1078 zap_lookup_by_dnode(dnode_t *dn, const char *name, 1079 uint64_t integer_size, uint64_t num_integers, void *buf) 1080 { 1081 return (zap_lookup_norm_by_dnode(dn, name, integer_size, 1082 num_integers, buf, 0, NULL, 0, NULL)); 1083 } 1084 1085 int 1086 zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, 1087 uint64_t integer_size, uint64_t num_integers, void *buf, 1088 matchtype_t mt, char *realname, int rn_len, 1089 boolean_t *ncp) 1090 { 1091 zap_t *zap; 1092 1093 int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, 1094 FTAG, &zap); 1095 if (err != 0) 1096 return (err); 1097 err = zap_lookup_impl(zap, name, integer_size, 1098 num_integers, buf, mt, realname, rn_len, ncp); 1099 zap_unlockdir(zap, FTAG); 1100 return (err); 1101 } 1102 1103 int 1104 zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, 1105 int key_numints) 1106 { 1107 zap_t *zap; 1108 1109 int err = 1110 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); 1111 if (err != 0) 1112 return (err); 1113 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); 1114 if (zn == NULL) { 1115 zap_unlockdir(zap, FTAG); 1116 return (SET_ERROR(ENOTSUP)); 1117 } 1118 1119 fzap_prefetch(zn); 1120 zap_name_free(zn); 1121 zap_unlockdir(zap, FTAG); 1122 return (err); 1123 } 1124 1125 int 1126 zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, 1127 int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) 1128 { 1129 zap_t *zap; 1130 1131 int err = 1132 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); 1133 if (err != 0) 1134 return (err); 1135 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); 1136 if (zn == NULL) { 1137 zap_unlockdir(zap, FTAG); 1138 return (SET_ERROR(ENOTSUP)); 1139 } 1140 1141 err = fzap_lookup(zn, integer_size, num_integers, buf, 1142 NULL, 0, NULL); 1143 zap_name_free(zn); 1144 zap_unlockdir(zap, FTAG); 1145 return (err); 1146 } 1147 1148 int 1149 zap_contains(objset_t *os, uint64_t zapobj, const char *name) 1150 { 1151 int err = zap_lookup_norm(os, zapobj, name, 0, 1152 0, NULL, 0, NULL, 0, NULL); 1153 if (err == EOVERFLOW || err == EINVAL) 1154 err = 0; /* found, but skipped reading the value */ 1155 return (err); 1156 } 1157 1158 int 1159 zap_length(objset_t *os, uint64_t zapobj, const char *name, 1160 uint64_t *integer_size, uint64_t *num_integers) 1161 { 1162 zap_t *zap; 1163 1164 int err = 1165 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); 1166 if (err != 0) 1167 return (err); 1168 zap_name_t *zn = zap_name_alloc_str(zap, name, 0); 1169 if (zn == NULL) { 1170 zap_unlockdir(zap, FTAG); 1171 return (SET_ERROR(ENOTSUP)); 1172 } 1173 if (!zap->zap_ismicro) { 1174 err = fzap_length(zn, integer_size, num_integers); 1175 } else { 1176 zfs_btree_index_t idx; 1177 mzap_ent_t *mze = mze_find(zn, &idx); 1178 if (mze == NULL) { 1179 err = SET_ERROR(ENOENT); 1180 } else { 1181 if (integer_size) 1182 *integer_size = 8; 1183 if (num_integers) 1184 *num_integers = 1; 1185 } 1186 } 1187 zap_name_free(zn); 1188 zap_unlockdir(zap, FTAG); 1189 return (err); 1190 } 1191 1192 int 1193 zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, 1194 int key_numints, uint64_t *integer_size, uint64_t *num_integers) 1195 { 1196 zap_t *zap; 1197 1198 int err = 1199 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); 1200 if (err != 0) 1201 return (err); 1202 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); 1203 if (zn == NULL) { 1204 zap_unlockdir(zap, FTAG); 1205 return (SET_ERROR(ENOTSUP)); 1206 } 1207 err = fzap_length(zn, integer_size, num_integers); 1208 zap_name_free(zn); 1209 zap_unlockdir(zap, FTAG); 1210 return (err); 1211 } 1212 1213 static void 1214 mzap_addent(zap_name_t *zn, uint64_t value) 1215 { 1216 zap_t *zap = zn->zn_zap; 1217 uint16_t start = zap->zap_m.zap_alloc_next; 1218 1219 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); 1220 1221 #ifdef ZFS_DEBUG 1222 for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { 1223 mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; 1224 ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0); 1225 } 1226 #endif 1227 1228 uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash); 1229 /* given the limited size of the microzap, this can't happen */ 1230 ASSERT(cd < zap_maxcd(zap)); 1231 1232 again: 1233 for (uint16_t i = start; i < zap->zap_m.zap_num_chunks; i++) { 1234 mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; 1235 if (mze->mze_name[0] == 0) { 1236 mze->mze_value = value; 1237 mze->mze_cd = cd; 1238 (void) strlcpy(mze->mze_name, zn->zn_key_orig, 1239 sizeof (mze->mze_name)); 1240 zap->zap_m.zap_num_entries++; 1241 zap->zap_m.zap_alloc_next = i+1; 1242 if (zap->zap_m.zap_alloc_next == 1243 zap->zap_m.zap_num_chunks) 1244 zap->zap_m.zap_alloc_next = 0; 1245 mze_insert(zap, i, zn->zn_hash); 1246 return; 1247 } 1248 } 1249 if (start != 0) { 1250 start = 0; 1251 goto again; 1252 } 1253 cmn_err(CE_PANIC, "out of entries!"); 1254 } 1255 1256 static int 1257 zap_add_impl(zap_t *zap, const char *key, 1258 int integer_size, uint64_t num_integers, 1259 const void *val, dmu_tx_t *tx, const void *tag) 1260 { 1261 const uint64_t *intval = val; 1262 int err = 0; 1263 1264 zap_name_t *zn = zap_name_alloc_str(zap, key, 0); 1265 if (zn == NULL) { 1266 zap_unlockdir(zap, tag); 1267 return (SET_ERROR(ENOTSUP)); 1268 } 1269 if (!zap->zap_ismicro) { 1270 err = fzap_add(zn, integer_size, num_integers, val, tag, tx); 1271 zap = zn->zn_zap; /* fzap_add() may change zap */ 1272 } else if (integer_size != 8 || num_integers != 1 || 1273 strlen(key) >= MZAP_NAME_LEN || 1274 !mze_canfit_fzap_leaf(zn, zn->zn_hash)) { 1275 err = mzap_upgrade(&zn->zn_zap, tag, tx, 0); 1276 if (err == 0) { 1277 err = fzap_add(zn, integer_size, num_integers, val, 1278 tag, tx); 1279 } 1280 zap = zn->zn_zap; /* fzap_add() may change zap */ 1281 } else { 1282 zfs_btree_index_t idx; 1283 if (mze_find(zn, &idx) != NULL) { 1284 err = SET_ERROR(EEXIST); 1285 } else { 1286 mzap_addent(zn, *intval); 1287 } 1288 } 1289 ASSERT(zap == zn->zn_zap); 1290 zap_name_free(zn); 1291 if (zap != NULL) /* may be NULL if fzap_add() failed */ 1292 zap_unlockdir(zap, tag); 1293 return (err); 1294 } 1295 1296 int 1297 zap_add(objset_t *os, uint64_t zapobj, const char *key, 1298 int integer_size, uint64_t num_integers, 1299 const void *val, dmu_tx_t *tx) 1300 { 1301 zap_t *zap; 1302 int err; 1303 1304 err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); 1305 if (err != 0) 1306 return (err); 1307 err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); 1308 /* zap_add_impl() calls zap_unlockdir() */ 1309 return (err); 1310 } 1311 1312 int 1313 zap_add_by_dnode(dnode_t *dn, const char *key, 1314 int integer_size, uint64_t num_integers, 1315 const void *val, dmu_tx_t *tx) 1316 { 1317 zap_t *zap; 1318 int err; 1319 1320 err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); 1321 if (err != 0) 1322 return (err); 1323 err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); 1324 /* zap_add_impl() calls zap_unlockdir() */ 1325 return (err); 1326 } 1327 1328 int 1329 zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, 1330 int key_numints, int integer_size, uint64_t num_integers, 1331 const void *val, dmu_tx_t *tx) 1332 { 1333 zap_t *zap; 1334 1335 int err = 1336 zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); 1337 if (err != 0) 1338 return (err); 1339 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); 1340 if (zn == NULL) { 1341 zap_unlockdir(zap, FTAG); 1342 return (SET_ERROR(ENOTSUP)); 1343 } 1344 err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx); 1345 zap = zn->zn_zap; /* fzap_add() may change zap */ 1346 zap_name_free(zn); 1347 if (zap != NULL) /* may be NULL if fzap_add() failed */ 1348 zap_unlockdir(zap, FTAG); 1349 return (err); 1350 } 1351 1352 int 1353 zap_update(objset_t *os, uint64_t zapobj, const char *name, 1354 int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) 1355 { 1356 zap_t *zap; 1357 const uint64_t *intval = val; 1358 1359 int err = 1360 zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); 1361 if (err != 0) 1362 return (err); 1363 zap_name_t *zn = zap_name_alloc_str(zap, name, 0); 1364 if (zn == NULL) { 1365 zap_unlockdir(zap, FTAG); 1366 return (SET_ERROR(ENOTSUP)); 1367 } 1368 if (!zap->zap_ismicro) { 1369 err = fzap_update(zn, integer_size, num_integers, val, 1370 FTAG, tx); 1371 zap = zn->zn_zap; /* fzap_update() may change zap */ 1372 } else if (integer_size != 8 || num_integers != 1 || 1373 strlen(name) >= MZAP_NAME_LEN) { 1374 dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", 1375 (u_longlong_t)zapobj, integer_size, 1376 (u_longlong_t)num_integers, name); 1377 err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0); 1378 if (err == 0) { 1379 err = fzap_update(zn, integer_size, num_integers, 1380 val, FTAG, tx); 1381 } 1382 zap = zn->zn_zap; /* fzap_update() may change zap */ 1383 } else { 1384 zfs_btree_index_t idx; 1385 mzap_ent_t *mze = mze_find(zn, &idx); 1386 if (mze != NULL) { 1387 MZE_PHYS(zap, mze)->mze_value = *intval; 1388 } else { 1389 mzap_addent(zn, *intval); 1390 } 1391 } 1392 ASSERT(zap == zn->zn_zap); 1393 zap_name_free(zn); 1394 if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ 1395 zap_unlockdir(zap, FTAG); 1396 return (err); 1397 } 1398 1399 int 1400 zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, 1401 int key_numints, 1402 int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) 1403 { 1404 zap_t *zap; 1405 1406 int err = 1407 zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); 1408 if (err != 0) 1409 return (err); 1410 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); 1411 if (zn == NULL) { 1412 zap_unlockdir(zap, FTAG); 1413 return (SET_ERROR(ENOTSUP)); 1414 } 1415 err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); 1416 zap = zn->zn_zap; /* fzap_update() may change zap */ 1417 zap_name_free(zn); 1418 if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ 1419 zap_unlockdir(zap, FTAG); 1420 return (err); 1421 } 1422 1423 int 1424 zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) 1425 { 1426 return (zap_remove_norm(os, zapobj, name, 0, tx)); 1427 } 1428 1429 static int 1430 zap_remove_impl(zap_t *zap, const char *name, 1431 matchtype_t mt, dmu_tx_t *tx) 1432 { 1433 int err = 0; 1434 1435 zap_name_t *zn = zap_name_alloc_str(zap, name, mt); 1436 if (zn == NULL) 1437 return (SET_ERROR(ENOTSUP)); 1438 if (!zap->zap_ismicro) { 1439 err = fzap_remove(zn, tx); 1440 } else { 1441 zfs_btree_index_t idx; 1442 mzap_ent_t *mze = mze_find(zn, &idx); 1443 if (mze == NULL) { 1444 err = SET_ERROR(ENOENT); 1445 } else { 1446 zap->zap_m.zap_num_entries--; 1447 memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t)); 1448 zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx); 1449 } 1450 } 1451 zap_name_free(zn); 1452 return (err); 1453 } 1454 1455 int 1456 zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, 1457 matchtype_t mt, dmu_tx_t *tx) 1458 { 1459 zap_t *zap; 1460 int err; 1461 1462 err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); 1463 if (err) 1464 return (err); 1465 err = zap_remove_impl(zap, name, mt, tx); 1466 zap_unlockdir(zap, FTAG); 1467 return (err); 1468 } 1469 1470 int 1471 zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx) 1472 { 1473 zap_t *zap; 1474 int err; 1475 1476 err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); 1477 if (err) 1478 return (err); 1479 err = zap_remove_impl(zap, name, 0, tx); 1480 zap_unlockdir(zap, FTAG); 1481 return (err); 1482 } 1483 1484 int 1485 zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, 1486 int key_numints, dmu_tx_t *tx) 1487 { 1488 zap_t *zap; 1489 1490 int err = 1491 zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); 1492 if (err != 0) 1493 return (err); 1494 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); 1495 if (zn == NULL) { 1496 zap_unlockdir(zap, FTAG); 1497 return (SET_ERROR(ENOTSUP)); 1498 } 1499 err = fzap_remove(zn, tx); 1500 zap_name_free(zn); 1501 zap_unlockdir(zap, FTAG); 1502 return (err); 1503 } 1504 1505 /* 1506 * Routines for iterating over the attributes. 1507 */ 1508 1509 static void 1510 zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, 1511 uint64_t serialized, boolean_t prefetch) 1512 { 1513 zc->zc_objset = os; 1514 zc->zc_zap = NULL; 1515 zc->zc_leaf = NULL; 1516 zc->zc_zapobj = zapobj; 1517 zc->zc_serialized = serialized; 1518 zc->zc_hash = 0; 1519 zc->zc_cd = 0; 1520 zc->zc_prefetch = prefetch; 1521 } 1522 void 1523 zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, 1524 uint64_t serialized) 1525 { 1526 zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE); 1527 } 1528 1529 /* 1530 * Initialize a cursor at the beginning of the ZAP object. The entire 1531 * ZAP object will be prefetched. 1532 */ 1533 void 1534 zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) 1535 { 1536 zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE); 1537 } 1538 1539 /* 1540 * Initialize a cursor at the beginning, but request that we not prefetch 1541 * the entire ZAP object. 1542 */ 1543 void 1544 zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) 1545 { 1546 zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE); 1547 } 1548 1549 void 1550 zap_cursor_fini(zap_cursor_t *zc) 1551 { 1552 if (zc->zc_zap) { 1553 rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); 1554 zap_unlockdir(zc->zc_zap, NULL); 1555 zc->zc_zap = NULL; 1556 } 1557 if (zc->zc_leaf) { 1558 rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); 1559 zap_put_leaf(zc->zc_leaf); 1560 zc->zc_leaf = NULL; 1561 } 1562 zc->zc_objset = NULL; 1563 } 1564 1565 uint64_t 1566 zap_cursor_serialize(zap_cursor_t *zc) 1567 { 1568 if (zc->zc_hash == -1ULL) 1569 return (-1ULL); 1570 if (zc->zc_zap == NULL) 1571 return (zc->zc_serialized); 1572 ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0); 1573 ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap)); 1574 1575 /* 1576 * We want to keep the high 32 bits of the cursor zero if we can, so 1577 * that 32-bit programs can access this. So usually use a small 1578 * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits 1579 * of the cursor. 1580 * 1581 * [ collision differentiator | zap_hashbits()-bit hash value ] 1582 */ 1583 return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) | 1584 ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap))); 1585 } 1586 1587 int 1588 zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) 1589 { 1590 int err; 1591 1592 if (zc->zc_hash == -1ULL) 1593 return (SET_ERROR(ENOENT)); 1594 1595 if (zc->zc_zap == NULL) { 1596 int hb; 1597 err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, 1598 RW_READER, TRUE, FALSE, NULL, &zc->zc_zap); 1599 if (err != 0) 1600 return (err); 1601 1602 /* 1603 * To support zap_cursor_init_serialized, advance, retrieve, 1604 * we must add to the existing zc_cd, which may already 1605 * be 1 due to the zap_cursor_advance. 1606 */ 1607 ASSERT(zc->zc_hash == 0); 1608 hb = zap_hashbits(zc->zc_zap); 1609 zc->zc_hash = zc->zc_serialized << (64 - hb); 1610 zc->zc_cd += zc->zc_serialized >> hb; 1611 if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */ 1612 zc->zc_cd = 0; 1613 } else { 1614 rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); 1615 } 1616 if (!zc->zc_zap->zap_ismicro) { 1617 err = fzap_cursor_retrieve(zc->zc_zap, zc, za); 1618 } else { 1619 zfs_btree_index_t idx; 1620 mzap_ent_t mze_tofind; 1621 1622 mze_tofind.mze_hash = zc->zc_hash >> 32; 1623 mze_tofind.mze_cd = zc->zc_cd; 1624 1625 mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree, 1626 &mze_tofind, &idx); 1627 if (mze == NULL) { 1628 mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree, 1629 &idx, &idx); 1630 } 1631 if (mze) { 1632 mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); 1633 ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); 1634 za->za_normalization_conflict = 1635 mzap_normalization_conflict(zc->zc_zap, NULL, 1636 mze, &idx); 1637 za->za_integer_length = 8; 1638 za->za_num_integers = 1; 1639 za->za_first_integer = mzep->mze_value; 1640 (void) strlcpy(za->za_name, mzep->mze_name, 1641 sizeof (za->za_name)); 1642 zc->zc_hash = (uint64_t)mze->mze_hash << 32; 1643 zc->zc_cd = mze->mze_cd; 1644 err = 0; 1645 } else { 1646 zc->zc_hash = -1ULL; 1647 err = SET_ERROR(ENOENT); 1648 } 1649 } 1650 rw_exit(&zc->zc_zap->zap_rwlock); 1651 return (err); 1652 } 1653 1654 void 1655 zap_cursor_advance(zap_cursor_t *zc) 1656 { 1657 if (zc->zc_hash == -1ULL) 1658 return; 1659 zc->zc_cd++; 1660 } 1661 1662 int 1663 zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) 1664 { 1665 zap_t *zap; 1666 1667 int err = 1668 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); 1669 if (err != 0) 1670 return (err); 1671 1672 memset(zs, 0, sizeof (zap_stats_t)); 1673 1674 if (zap->zap_ismicro) { 1675 zs->zs_blocksize = zap->zap_dbuf->db_size; 1676 zs->zs_num_entries = zap->zap_m.zap_num_entries; 1677 zs->zs_num_blocks = 1; 1678 } else { 1679 fzap_get_stats(zap, zs); 1680 } 1681 zap_unlockdir(zap, FTAG); 1682 return (0); 1683 } 1684 1685 #if defined(_KERNEL) 1686 EXPORT_SYMBOL(zap_create); 1687 EXPORT_SYMBOL(zap_create_dnsize); 1688 EXPORT_SYMBOL(zap_create_norm); 1689 EXPORT_SYMBOL(zap_create_norm_dnsize); 1690 EXPORT_SYMBOL(zap_create_flags); 1691 EXPORT_SYMBOL(zap_create_flags_dnsize); 1692 EXPORT_SYMBOL(zap_create_claim); 1693 EXPORT_SYMBOL(zap_create_claim_norm); 1694 EXPORT_SYMBOL(zap_create_claim_norm_dnsize); 1695 EXPORT_SYMBOL(zap_create_hold); 1696 EXPORT_SYMBOL(zap_destroy); 1697 EXPORT_SYMBOL(zap_lookup); 1698 EXPORT_SYMBOL(zap_lookup_by_dnode); 1699 EXPORT_SYMBOL(zap_lookup_norm); 1700 EXPORT_SYMBOL(zap_lookup_uint64); 1701 EXPORT_SYMBOL(zap_contains); 1702 EXPORT_SYMBOL(zap_prefetch); 1703 EXPORT_SYMBOL(zap_prefetch_uint64); 1704 EXPORT_SYMBOL(zap_add); 1705 EXPORT_SYMBOL(zap_add_by_dnode); 1706 EXPORT_SYMBOL(zap_add_uint64); 1707 EXPORT_SYMBOL(zap_update); 1708 EXPORT_SYMBOL(zap_update_uint64); 1709 EXPORT_SYMBOL(zap_length); 1710 EXPORT_SYMBOL(zap_length_uint64); 1711 EXPORT_SYMBOL(zap_remove); 1712 EXPORT_SYMBOL(zap_remove_by_dnode); 1713 EXPORT_SYMBOL(zap_remove_norm); 1714 EXPORT_SYMBOL(zap_remove_uint64); 1715 EXPORT_SYMBOL(zap_count); 1716 EXPORT_SYMBOL(zap_value_search); 1717 EXPORT_SYMBOL(zap_join); 1718 EXPORT_SYMBOL(zap_join_increment); 1719 EXPORT_SYMBOL(zap_add_int); 1720 EXPORT_SYMBOL(zap_remove_int); 1721 EXPORT_SYMBOL(zap_lookup_int); 1722 EXPORT_SYMBOL(zap_increment_int); 1723 EXPORT_SYMBOL(zap_add_int_key); 1724 EXPORT_SYMBOL(zap_lookup_int_key); 1725 EXPORT_SYMBOL(zap_increment); 1726 EXPORT_SYMBOL(zap_cursor_init); 1727 EXPORT_SYMBOL(zap_cursor_fini); 1728 EXPORT_SYMBOL(zap_cursor_retrieve); 1729 EXPORT_SYMBOL(zap_cursor_advance); 1730 EXPORT_SYMBOL(zap_cursor_serialize); 1731 EXPORT_SYMBOL(zap_cursor_init_serialized); 1732 EXPORT_SYMBOL(zap_get_stats); 1733 1734 /* CSTYLED */ 1735 ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW, 1736 "Maximum micro ZAP size, before converting to a fat ZAP, in bytes"); 1737 #endif 1738