/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.
 */

#include <sys/zio.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zfs_context.h>
#include <sys/zap.h>
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
#include <sys/btree.h>
#include <sys/arc.h>
#include <sys/dmu_objset.h>

#ifdef _KERNEL
#include <sys/sunddi.h>
#endif

int zap_micro_max_size = MZAP_MAX_BLKSZ;

static int mzap_upgrade(zap_t **zapp,
    const void *tag, dmu_tx_t *tx, zap_flags_t flags);

uint64_t
zap_getflags(zap_t *zap)
{
	if (zap->zap_ismicro)
		return (0);
	return (zap_f_phys(zap)->zap_flags);
}

int
zap_hashbits(zap_t *zap)
{
	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
		return (48);
	else
		return (28);
}

uint32_t
zap_maxcd(zap_t *zap)
{
	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
		return ((1<<16)-1);
	else
		return (-1U);
}

static uint64_t
zap_hash(zap_name_t *zn)
{
	zap_t *zap = zn->zn_zap;
	uint64_t h = 0;

	if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
		ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
		h = *(uint64_t *)zn->zn_key_orig;
	} else {
		h = zap->zap_salt;
		ASSERT(h != 0);
		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

		if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
			const uint64_t *wp = zn->zn_key_norm;

			ASSERT(zn->zn_key_intlen == 8);
			for (int i = 0; i < zn->zn_key_norm_numints;
			    wp++, i++) {
				uint64_t word = *wp;

				for (int j = 0; j < 8; j++) {
					h = (h >> 8) ^
					    zfs_crc64_table[(h ^ word) & 0xFF];
					word >>= NBBY;
				}
			}
		} else {
			const uint8_t *cp = zn->zn_key_norm;

			/*
			 * We previously stored the terminating null on
			 * disk, but didn't hash it, so we need to
			 * continue to not hash it.  (The
			 * zn_key_*_numints includes the terminating
			 * null for non-binary keys.)
			 */
			int len = zn->zn_key_norm_numints - 1;

			ASSERT(zn->zn_key_intlen == 1);
			for (int i = 0; i < len; cp++, i++) {
				h = (h >> 8) ^
				    zfs_crc64_table[(h ^ *cp) & 0xFF];
			}
		}
	}
	/*
	 * Don't use all 64 bits, since we need some in the cookie for
	 * the collision differentiator.  We MUST use the high bits,
	 * since those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);

	return (h);
}
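/*
 * Editorial sketch (not part of the original source): for a plain
 * string key the loop above is a byte-at-a-time CRC-64 seeded with the
 * per-object salt.  A hypothetical user-space equivalent, assuming a
 * crc64_table[] generated from ZFS_CRC64_POLY, would be:
 *
 *	static uint64_t
 *	example_zap_hash_str(uint64_t salt, const char *name)
 *	{
 *		uint64_t h = salt;
 *		for (const uint8_t *cp = (const uint8_t *)name; *cp; cp++)
 *			h = (h >> 8) ^ crc64_table[(h ^ *cp) & 0xFF];
 *		return (h & ~((1ULL << (64 - 28)) - 1));
 *	}
 *
 * The final mask keeps only the high 28 bits (zap_hashbits() for a
 * default zap), leaving the low bits of the cookie free for the
 * collision differentiator.
 */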
static int
zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags)
{
	ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));

	size_t inlen = strlen(name) + 1;
	size_t outlen = ZAP_MAXNAMELEN;

	int err = 0;
	(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
	    normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
	    U8_UNICODE_LATEST, &err);

	return (err);
}

boolean_t
zap_match(zap_name_t *zn, const char *matchname)
{
	ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));

	if (zn->zn_matchtype & MT_NORMALIZE) {
		char norm[ZAP_MAXNAMELEN];

		if (zap_normalize(zn->zn_zap, matchname, norm,
		    zn->zn_normflags) != 0)
			return (B_FALSE);

		return (strcmp(zn->zn_key_norm, norm) == 0);
	} else {
		return (strcmp(zn->zn_key_orig, matchname) == 0);
	}
}

static zap_name_t *
zap_name_alloc(zap_t *zap)
{
	zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
	zn->zn_zap = zap;
	return (zn);
}

void
zap_name_free(zap_name_t *zn)
{
	kmem_free(zn, sizeof (zap_name_t));
}

static int
zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt)
{
	zap_t *zap = zn->zn_zap;

	zn->zn_key_intlen = sizeof (*key);
	zn->zn_key_orig = key;
	zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1;
	zn->zn_matchtype = mt;
	zn->zn_normflags = zap->zap_normflags;

	/*
	 * If we're dealing with a case-sensitive lookup on a mixed or
	 * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
	 * will fold case to all caps, overriding the lookup request.
	 */
	if (mt & MT_MATCH_CASE)
		zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;

	if (zap->zap_normflags) {
		/*
		 * We *must* use zap_normflags because this normalization is
		 * what the hash is computed from.
		 */
		if (zap_normalize(zap, key, zn->zn_normbuf,
		    zap->zap_normflags) != 0)
			return (SET_ERROR(ENOTSUP));
		zn->zn_key_norm = zn->zn_normbuf;
		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
	} else {
		if (mt != 0)
			return (SET_ERROR(ENOTSUP));
		zn->zn_key_norm = zn->zn_key_orig;
		zn->zn_key_norm_numints = zn->zn_key_orig_numints;
	}

	zn->zn_hash = zap_hash(zn);

	if (zap->zap_normflags != zn->zn_normflags) {
		/*
		 * We *must* use zn_normflags because this normalization is
		 * what the matching is based on.  (Not the hash!)
		 */
		if (zap_normalize(zap, key, zn->zn_normbuf,
		    zn->zn_normflags) != 0)
			return (SET_ERROR(ENOTSUP));
		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
	}

	return (0);
}
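/*
 * Editorial note: zap_normalize() is a thin wrapper around
 * u8_textprep_str().  For example, with normflags containing
 * U8_TEXTPREP_TOUPPER, "Readme", "readme" and "README" all normalize
 * to "README", so on a case-insensitive dataset they hash identically
 * and zap_match() with MT_NORMALIZE treats them as the same name;
 * OR-ing in MT_MATCH_CASE (which clears U8_TEXTPREP_TOUPPER above)
 * additionally requires the case to match.
 */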
zap_name_t *
zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt)
{
	zap_name_t *zn = zap_name_alloc(zap);
	if (zap_name_init_str(zn, key, mt) != 0) {
		zap_name_free(zn);
		return (NULL);
	}
	return (zn);
}

static zap_name_t *
zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
{
	zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);

	ASSERT(zap->zap_normflags == 0);
	zn->zn_zap = zap;
	zn->zn_key_intlen = sizeof (*key);
	zn->zn_key_orig = zn->zn_key_norm = key;
	zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
	zn->zn_matchtype = 0;

	zn->zn_hash = zap_hash(zn);
	return (zn);
}

static void
mzap_byteswap(mzap_phys_t *buf, size_t size)
{
	buf->mz_block_type = BSWAP_64(buf->mz_block_type);
	buf->mz_salt = BSWAP_64(buf->mz_salt);
	buf->mz_normflags = BSWAP_64(buf->mz_normflags);
	int max = (size / MZAP_ENT_LEN) - 1;
	for (int i = 0; i < max; i++) {
		buf->mz_chunk[i].mze_value =
		    BSWAP_64(buf->mz_chunk[i].mze_value);
		buf->mz_chunk[i].mze_cd =
		    BSWAP_32(buf->mz_chunk[i].mze_cd);
	}
}

void
zap_byteswap(void *buf, size_t size)
{
	uint64_t block_type = *(uint64_t *)buf;

	if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
		/* ASSERT(magic == ZAP_LEAF_MAGIC); */
		mzap_byteswap(buf, size);
	} else {
		fzap_byteswap(buf, size);
	}
}

__attribute__((always_inline)) inline
static int
mze_compare(const void *arg1, const void *arg2)
{
	const mzap_ent_t *mze1 = arg1;
	const mzap_ent_t *mze2 = arg2;

	return (TREE_CMP((uint64_t)(mze1->mze_hash) << 32 | mze1->mze_cd,
	    (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd));
}

ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t,
    mze_compare)

static void
mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash)
{
	mzap_ent_t mze;

	ASSERT(zap->zap_ismicro);
	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	mze.mze_chunkid = chunkid;
	ASSERT0(hash & 0xffffffff);
	mze.mze_hash = hash >> 32;
	ASSERT3U(MZE_PHYS(zap, &mze)->mze_cd, <=, 0xffff);
	mze.mze_cd = (uint16_t)MZE_PHYS(zap, &mze)->mze_cd;
	ASSERT(MZE_PHYS(zap, &mze)->mze_name[0] != 0);
	zfs_btree_add(&zap->zap_m.zap_tree, &mze);
}

static mzap_ent_t *
mze_find(zap_name_t *zn, zfs_btree_index_t *idx)
{
	mzap_ent_t mze_tofind;
	mzap_ent_t *mze;
	zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree;

	ASSERT(zn->zn_zap->zap_ismicro);
	ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));

	ASSERT0(zn->zn_hash & 0xffffffff);
	mze_tofind.mze_hash = zn->zn_hash >> 32;
	mze_tofind.mze_cd = 0;

	mze = zfs_btree_find(tree, &mze_tofind, idx);
	if (mze == NULL)
		mze = zfs_btree_next(tree, idx, idx);
	for (; mze && mze->mze_hash == mze_tofind.mze_hash;
	    mze = zfs_btree_next(tree, idx, idx)) {
		ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
		if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
			return (mze);
	}

	return (NULL);
}

static uint32_t
mze_find_unused_cd(zap_t *zap, uint64_t hash)
{
	mzap_ent_t mze_tofind;
	zfs_btree_index_t idx;
	zfs_btree_t *tree = &zap->zap_m.zap_tree;

	ASSERT(zap->zap_ismicro);
	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));

	ASSERT0(hash & 0xffffffff);
	hash >>= 32;
	mze_tofind.mze_hash = hash;
	mze_tofind.mze_cd = 0;

	uint32_t cd = 0;
	for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
	    mze && mze->mze_hash == hash;
	    mze = zfs_btree_next(tree, &idx, &idx)) {
		if (mze->mze_cd != cd)
			break;
		cd++;
	}

	return (cd);
}

/*
 * Each mzap entry requires at most 4 array chunks in a fatzap leaf
 * (3 chunks for the name + 1 chunk for the value), plus the leaf
 * entry chunk itself.
 */
#define	MZAP_ENT_CHUNKS	(1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \
	ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t)))
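/*
 * Editorial worked example: with the default leaf layout an array chunk
 * carries ZAP_LEAF_ARRAY_BYTES (21) bytes of payload, so a maximal
 * microzap name (MZAP_NAME_LEN = 50 bytes, including the terminating
 * null) needs ceil(50 / 21) = 3 array chunks and the 8-byte value needs
 * ceil(8 / 21) = 1; together with the entry chunk itself,
 * MZAP_ENT_CHUNKS evaluates to 5.
 */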
/*
 * Check whether, including the entry being added, all entries that
 * collide on this hash still fit within a single fatzap leaf.
 */
static boolean_t
mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
{
	zap_t *zap = zn->zn_zap;
	mzap_ent_t mze_tofind;
	zfs_btree_index_t idx;
	zfs_btree_t *tree = &zap->zap_m.zap_tree;
	uint32_t mzap_ents = 0;

	ASSERT0(hash & 0xffffffff);
	hash >>= 32;
	mze_tofind.mze_hash = hash;
	mze_tofind.mze_cd = 0;

	for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
	    mze && mze->mze_hash == hash;
	    mze = zfs_btree_next(tree, &idx, &idx)) {
		mzap_ents++;
	}

	/* Include the new entry being added */
	mzap_ents++;

	return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS));
}

static void
mze_destroy(zap_t *zap)
{
	zfs_btree_clear(&zap->zap_m.zap_tree);
	zfs_btree_destroy(&zap->zap_m.zap_tree);
}

static zap_t *
mzap_open(dmu_buf_t *db)
{
	zap_t *winner;
	uint64_t *zap_hdr = (uint64_t *)db->db_data;
	uint64_t zap_block_type = zap_hdr[0];
	uint64_t zap_magic = zap_hdr[1];

	ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));

	zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
	rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL);
	rw_enter(&zap->zap_rwlock, RW_WRITER);
	zap->zap_objset = dmu_buf_get_objset(db);
	zap->zap_object = db->db_object;
	zap->zap_dbuf = db;

	if (zap_block_type != ZBT_MICRO) {
		mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT,
		    0);
		zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
		if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) {
			winner = NULL;	/* No actual winner here... */
			goto handle_winner;
		}
	} else {
		zap->zap_ismicro = TRUE;
	}

	/*
	 * Make sure that zap_ismicro is set before we let others see
	 * it, because zap_lockdir() checks zap_ismicro without the lock
	 * held.
	 */
	dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
	winner = dmu_buf_set_user(db, &zap->zap_dbu);

	if (winner != NULL)
		goto handle_winner;

	if (zap->zap_ismicro) {
		zap->zap_salt = zap_m_phys(zap)->mz_salt;
		zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;

		/*
		 * Reduce B-tree leaf from 4KB to 512 bytes to reduce memmove()
		 * overhead on massive inserts below.  It still allows storing
		 * 62 entries before we have to add a 2KB B-tree core node.
		 */
		zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare,
		    mze_find_in_buf, sizeof (mzap_ent_t), 512);

		zap_name_t *zn = zap_name_alloc(zap);
		for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) {
			mzap_ent_phys_t *mze =
			    &zap_m_phys(zap)->mz_chunk[i];
			if (mze->mze_name[0]) {
				zap->zap_m.zap_num_entries++;
				zap_name_init_str(zn, mze->mze_name, 0);
				mze_insert(zap, i, zn->zn_hash);
			}
		}
		zap_name_free(zn);
	} else {
		zap->zap_salt = zap_f_phys(zap)->zap_salt;
		zap->zap_normflags = zap_f_phys(zap)->zap_normflags;

		ASSERT3U(sizeof (struct zap_leaf_header), ==,
		    2*ZAP_LEAF_CHUNKSIZE);

		/*
		 * The embedded pointer table should not overlap the
		 * other members.
		 */
		ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
		    &zap_f_phys(zap)->zap_salt);

		/*
		 * The embedded pointer table should end at the end of
		 * the block.
		 */
		ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
		    1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
		    (uintptr_t)zap_f_phys(zap), ==,
		    zap->zap_dbuf->db_size);
	}
	rw_exit(&zap->zap_rwlock);
	return (zap);

handle_winner:
	rw_exit(&zap->zap_rwlock);
	rw_destroy(&zap->zap_rwlock);
	if (!zap->zap_ismicro)
		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
	kmem_free(zap, sizeof (zap_t));
	return (winner);
}
/*
 * This routine "consumes" the caller's hold on the dbuf, which must
 * have the specified tag.
 */
static int
zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
{
	ASSERT0(db->db_offset);
	objset_t *os = dmu_buf_get_objset(db);
	uint64_t obj = db->db_object;
	dmu_object_info_t doi;

	*zapp = NULL;

	dmu_object_info_from_dnode(dn, &doi);
	if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
		return (SET_ERROR(EINVAL));

	zap_t *zap = dmu_buf_get_user(db);
	if (zap == NULL) {
		zap = mzap_open(db);
		if (zap == NULL) {
			/*
			 * mzap_open() didn't like what it saw on-disk.
			 * Check for corruption!
			 */
			return (SET_ERROR(EIO));
		}
	}

	/*
	 * We're checking zap_ismicro without the lock held, in order to
	 * tell what type of lock we want.  Once we have some sort of
	 * lock, see if it really is the right type.  In practice this
	 * can only be different if it was upgraded from micro to fat,
	 * and micro wanted WRITER but fat only needs READER.
	 */
	krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
	rw_enter(&zap->zap_rwlock, lt);
	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
		/* it was upgraded, now we only need reader */
		ASSERT(lt == RW_WRITER);
		ASSERT(RW_READER ==
		    ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
		rw_downgrade(&zap->zap_rwlock);
		lt = RW_READER;
	}

	zap->zap_objset = os;
	zap->zap_dnode = dn;

	if (lt == RW_WRITER)
		dmu_buf_will_dirty(db, tx);

	ASSERT3P(zap->zap_dbuf, ==, db);

	ASSERT(!zap->zap_ismicro ||
	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
	if (zap->zap_ismicro && tx && adding &&
	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
		if (newsz > zap_micro_max_size) {
			dprintf("upgrading obj %llu: num_entries=%u\n",
			    (u_longlong_t)obj, zap->zap_m.zap_num_entries);
			*zapp = zap;
			int err = mzap_upgrade(zapp, tag, tx, 0);
			if (err != 0)
				rw_exit(&zap->zap_rwlock);
			return (err);
		}
		VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
		zap->zap_m.zap_num_chunks =
		    db->db_size / MZAP_ENT_LEN - 1;
	}

	*zapp = zap;
	return (0);
}

static int
zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
    zap_t **zapp)
{
	dmu_buf_t *db;
	int err;

	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
	if (err != 0)
		return (err);
	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
	if (err != 0)
		dmu_buf_rele(db, tag);
	else
		VERIFY(dnode_add_ref(dn, tag));
	return (err);
}

int
zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
    zap_t **zapp)
{
	dnode_t *dn;
	dmu_buf_t *db;
	int err;

	err = dnode_hold(os, obj, tag, &dn);
	if (err != 0)
		return (err);
	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
	if (err != 0) {
		dnode_rele(dn, tag);
		return (err);
	}
	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
	if (err != 0) {
		dmu_buf_rele(db, tag);
		dnode_rele(dn, tag);
	}
	return (err);
}

void
zap_unlockdir(zap_t *zap, const void *tag)
{
	rw_exit(&zap->zap_rwlock);
	dnode_rele(zap->zap_dnode, tag);
	dmu_buf_rele(zap->zap_dbuf, tag);
}
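/*
 * Editorial usage sketch: zap_lockdir()/zap_unlockdir() bracket every
 * operation below; a minimal read-only caller (cf. zap_count()) looks
 * like:
 *
 *	zap_t *zap;
 *	int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE,
 *	    FTAG, &zap);
 *	if (err != 0)
 *		return (err);
 *	... operate on the locked zap ...
 *	zap_unlockdir(zap, FTAG);
 *
 * Note that the add/update/remove *_impl() helpers below instead
 * consume the lock and call zap_unlockdir() themselves.
 */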
static int
mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags)
{
	int err = 0;
	zap_t *zap = *zapp;

	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	int sz = zap->zap_dbuf->db_size;
	mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP);
	memcpy(mzp, zap->zap_dbuf->db_data, sz);
	int nchunks = zap->zap_m.zap_num_chunks;

	if (!flags) {
		err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
		    1ULL << fzap_default_block_shift, 0, tx);
		if (err != 0) {
			vmem_free(mzp, sz);
			return (err);
		}
	}

	dprintf("upgrading obj=%llu with %u chunks\n",
	    (u_longlong_t)zap->zap_object, nchunks);
	/* XXX destroy the tree later, so we can use the stored hash value */
	mze_destroy(zap);

	fzap_upgrade(zap, tx, flags);

	zap_name_t *zn = zap_name_alloc(zap);
	for (int i = 0; i < nchunks; i++) {
		mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
		if (mze->mze_name[0] == 0)
			continue;
		dprintf("adding %s=%llu\n",
		    mze->mze_name, (u_longlong_t)mze->mze_value);
		zap_name_init_str(zn, mze->mze_name, 0);
		/* If we fail here, we would end up losing entries */
		VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
		    tag, tx));
		zap = zn->zn_zap;	/* fzap_add_cd() may change zap */
	}
	zap_name_free(zn);
	vmem_free(mzp, sz);
	*zapp = zap;
	return (0);
}

/*
 * The "normflags" determine the behavior of the matchtype_t which is
 * passed to zap_lookup_norm().  Names which have the same normalized
 * version will be stored with the same hash value, and therefore we can
 * perform normalization-insensitive lookups.  We can be Unicode form-
 * insensitive and/or case-insensitive.  The following flags are valid for
 * "normflags":
 *
 * U8_TEXTPREP_NFC
 * U8_TEXTPREP_NFD
 * U8_TEXTPREP_NFKC
 * U8_TEXTPREP_NFKD
 * U8_TEXTPREP_TOUPPER
 *
 * The *_NF* (Normalization Form) flags are mutually exclusive; at most one
 * of them may be supplied.
 */
void
mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
{
	dmu_buf_t *db;

	VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH));

	dmu_buf_will_dirty(db, tx);
	mzap_phys_t *zp = db->db_data;
	zp->mz_block_type = ZBT_MICRO;
	zp->mz_salt =
	    ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL;
	zp->mz_normflags = normflags;

	if (flags != 0) {
		zap_t *zap;
		/* Only fat zap supports flags; upgrade immediately. */
		VERIFY(dnode_add_ref(dn, FTAG));
		VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER,
		    B_FALSE, B_FALSE, &zap));
		VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
		zap_unlockdir(zap, FTAG);
	} else {
		dmu_buf_rele(db, FTAG);
	}
}
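/*
 * Editorial example: to create a case-insensitive, NFC-normalizing
 * directory ZAP, a caller could pass the desired normflags at create
 * time; they are fixed for the life of the object because the stored
 * hash depends on them:
 *
 *	uint64_t obj = zap_create_norm(os,
 *	    U8_TEXTPREP_TOUPPER | U8_TEXTPREP_NFC,
 *	    DMU_OT_DIRECTORY_CONTENTS, DMU_OT_NONE, 0, tx);
 */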
static uint64_t
zap_create_impl(objset_t *os, int normflags, zap_flags_t flags,
    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
{
	uint64_t obj;

	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);

	if (allocated_dnode == NULL) {
		dnode_t *dn;
		obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
		    indirect_blockshift, bonustype, bonuslen, dnodesize,
		    &dn, FTAG, tx);
		mzap_create_impl(dn, normflags, flags, tx);
		dnode_rele(dn, FTAG);
	} else {
		obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
		    indirect_blockshift, bonustype, bonuslen, dnodesize,
		    allocated_dnode, tag, tx);
		mzap_create_impl(*allocated_dnode, normflags, flags, tx);
	}

	return (obj);
}

int
zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen,
	    0, tx));
}

int
zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (zap_create_claim_norm_dnsize(os, obj,
	    0, ot, bonustype, bonuslen, dnodesize, tx));
}

int
zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
    dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype,
	    bonuslen, 0, tx));
}

int
zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
    dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int error;

	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
	error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
	    dnodesize, tx);
	if (error != 0)
		return (error);

	error = dnode_hold(os, obj, FTAG, &dn);
	if (error != 0)
		return (error);

	mzap_create_impl(dn, normflags, 0, tx);

	dnode_rele(dn, FTAG);

	return (0);
}

uint64_t
zap_create(objset_t *os, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
}
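/*
 * Editorial round-trip sketch, assuming an assigned tx and single
 * uint64_t values:
 *
 *	uint64_t obj = zap_create(os, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx);
 *	uint64_t v = 42, out;
 *	VERIFY0(zap_add(os, obj, "answer", sizeof (uint64_t), 1, &v, tx));
 *	VERIFY0(zap_lookup(os, obj, "answer", sizeof (uint64_t), 1, &out));
 *	VERIFY0(zap_remove(os, obj, "answer", tx));
 */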
uint64_t
zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen,
	    dnodesize, tx));
}

uint64_t
zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen,
	    0, tx));
}

uint64_t
zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (zap_create_impl(os, normflags, 0, ot, 0, 0,
	    bonustype, bonuslen, dnodesize, NULL, NULL, tx));
}

uint64_t
zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_flags_dnsize(os, normflags, flags, ot,
	    leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
}

uint64_t
zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
	    indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL,
	    tx));
}

/*
 * Create a zap object and return a pointer to the newly allocated dnode via
 * the allocated_dnode argument.  The returned dnode will be held and the
 * caller is responsible for releasing the hold by calling dnode_rele().
 */
uint64_t
zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
{
	return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
	    indirect_blockshift, bonustype, bonuslen, dnodesize,
	    allocated_dnode, tag, tx));
}

int
zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
{
	/*
	 * dmu_object_free will free the object number and free the
	 * data.  Freeing the data will cause our pageout function to be
	 * called, which will destroy our data (zap_leaf_t's and zap_t).
	 */

	return (dmu_object_free(os, zapobj, tx));
}
void
zap_evict_sync(void *dbu)
{
	zap_t *zap = dbu;

	rw_destroy(&zap->zap_rwlock);

	if (zap->zap_ismicro)
		mze_destroy(zap);
	else
		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);

	kmem_free(zap, sizeof (zap_t));
}

int
zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	if (!zap->zap_ismicro) {
		err = fzap_count(zap, count);
	} else {
		*count = zap->zap_m.zap_num_entries;
	}
	zap_unlockdir(zap, FTAG);
	return (err);
}

/*
 * zn may be NULL; if not specified, it will be computed if needed.
 * See also the comment above zap_entry_normalization_conflict().
 */
static boolean_t
mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze,
    zfs_btree_index_t *idx)
{
	boolean_t allocdzn = B_FALSE;
	mzap_ent_t *other;
	zfs_btree_index_t oidx;

	if (zap->zap_normflags == 0)
		return (B_FALSE);

	for (other = zfs_btree_prev(&zap->zap_m.zap_tree, idx, &oidx);
	    other && other->mze_hash == mze->mze_hash;
	    other = zfs_btree_prev(&zap->zap_m.zap_tree, &oidx, &oidx)) {

		if (zn == NULL) {
			zn = zap_name_alloc_str(zap,
			    MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
			allocdzn = B_TRUE;
		}
		if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
			if (allocdzn)
				zap_name_free(zn);
			return (B_TRUE);
		}
	}

	for (other = zfs_btree_next(&zap->zap_m.zap_tree, idx, &oidx);
	    other && other->mze_hash == mze->mze_hash;
	    other = zfs_btree_next(&zap->zap_m.zap_tree, &oidx, &oidx)) {

		if (zn == NULL) {
			zn = zap_name_alloc_str(zap,
			    MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
			allocdzn = B_TRUE;
		}
		if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
			if (allocdzn)
				zap_name_free(zn);
			return (B_TRUE);
		}
	}

	if (allocdzn)
		zap_name_free(zn);
	return (B_FALSE);
}
/*
 * Routines for manipulating attributes.
 */

int
zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf)
{
	return (zap_lookup_norm(os, zapobj, name, integer_size,
	    num_integers, buf, 0, NULL, 0, NULL));
}

static int
zap_lookup_impl(zap_t *zap, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf,
    matchtype_t mt, char *realname, int rn_len,
    boolean_t *ncp)
{
	int err = 0;

	zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
	if (zn == NULL)
		return (SET_ERROR(ENOTSUP));

	if (!zap->zap_ismicro) {
		err = fzap_lookup(zn, integer_size, num_integers, buf,
		    realname, rn_len, ncp);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t *mze = mze_find(zn, &idx);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			if (num_integers < 1) {
				err = SET_ERROR(EOVERFLOW);
			} else if (integer_size != 8) {
				err = SET_ERROR(EINVAL);
			} else {
				*(uint64_t *)buf =
				    MZE_PHYS(zap, mze)->mze_value;
				if (realname != NULL)
					(void) strlcpy(realname,
					    MZE_PHYS(zap, mze)->mze_name,
					    rn_len);
				if (ncp) {
					*ncp = mzap_normalization_conflict(zap,
					    zn, mze, &idx);
				}
			}
		}
	}
	zap_name_free(zn);
	return (err);
}

int
zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf,
    matchtype_t mt, char *realname, int rn_len,
    boolean_t *ncp)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_lookup_impl(zap, name, integer_size,
	    num_integers, buf, mt, realname, rn_len, ncp);
	zap_unlockdir(zap, FTAG);
	return (err);
}
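/*
 * Editorial example: a normalization-insensitive lookup that also
 * retrieves the name as actually stored, and whether it conflicts with
 * another name under normalization (buffer names are hypothetical):
 *
 *	char real[ZAP_MAXNAMELEN];
 *	boolean_t conflict;
 *	err = zap_lookup_norm(os, obj, "readme", 8, 1, &val,
 *	    MT_NORMALIZE, real, sizeof (real), &conflict);
 */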
int
zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
{
	zap_t *zap;
	int err;
	zap_name_t *zn;

	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err)
		return (err);
	zn = zap_name_alloc_str(zap, name, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	fzap_prefetch(zn);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_lookup_by_dnode(dnode_t *dn, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf)
{
	return (zap_lookup_norm_by_dnode(dn, name, integer_size,
	    num_integers, buf, 0, NULL, 0, NULL));
}

int
zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf,
    matchtype_t mt, char *realname, int rn_len,
    boolean_t *ncp)
{
	zap_t *zap;

	int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
	    FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_lookup_impl(zap, name, integer_size,
	    num_integers, buf, mt, realname, rn_len, ncp);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	fzap_prefetch(zn);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	err = fzap_lookup(zn, integer_size, num_integers, buf,
	    NULL, 0, NULL);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_contains(objset_t *os, uint64_t zapobj, const char *name)
{
	int err = zap_lookup_norm(os, zapobj, name, 0,
	    0, NULL, 0, NULL, 0, NULL);
	if (err == EOVERFLOW || err == EINVAL)
		err = 0;	/* found, but skipped reading the value */
	return (err);
}
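/*
 * Editorial example: uint64-keyed ZAPs (created with
 * ZAP_FLAG_UINT64_KEY, always fatzap) key entries by an array of
 * integers rather than a string.  A hypothetical two-word key:
 *
 *	uint64_t key[2] = { word0, word1 };
 *	err = zap_lookup_uint64(os, obj, key, 2, sizeof (uint64_t), 1, &val);
 */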
int
zap_length(objset_t *os, uint64_t zapobj, const char *name,
    uint64_t *integer_size, uint64_t *num_integers)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_length(zn, integer_size, num_integers);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t *mze = mze_find(zn, &idx);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			if (integer_size)
				*integer_size = 8;
			if (num_integers)
				*num_integers = 1;
		}
	}
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, uint64_t *integer_size, uint64_t *num_integers)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_length(zn, integer_size, num_integers);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

static void
mzap_addent(zap_name_t *zn, uint64_t value)
{
	zap_t *zap = zn->zn_zap;
	uint16_t start = zap->zap_m.zap_alloc_next;

	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

#ifdef ZFS_DEBUG
	for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
		ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
	}
#endif

	uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash);
	/* given the limited size of the microzap, this can't happen */
	ASSERT(cd < zap_maxcd(zap));

again:
	for (uint16_t i = start; i < zap->zap_m.zap_num_chunks; i++) {
		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
		if (mze->mze_name[0] == 0) {
			mze->mze_value = value;
			mze->mze_cd = cd;
			(void) strlcpy(mze->mze_name, zn->zn_key_orig,
			    sizeof (mze->mze_name));
			zap->zap_m.zap_num_entries++;
			zap->zap_m.zap_alloc_next = i+1;
			if (zap->zap_m.zap_alloc_next ==
			    zap->zap_m.zap_num_chunks)
				zap->zap_m.zap_alloc_next = 0;
			mze_insert(zap, i, zn->zn_hash);
			return;
		}
	}
	if (start != 0) {
		start = 0;
		goto again;
	}
	cmn_err(CE_PANIC, "out of entries!");
}
static int
zap_add_impl(zap_t *zap, const char *key,
    int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx, const void *tag)
{
	const uint64_t *intval = val;
	int err = 0;

	zap_name_t *zn = zap_name_alloc_str(zap, key, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
		zap = zn->zn_zap;	/* fzap_add() may change zap */
	} else if (integer_size != 8 || num_integers != 1 ||
	    strlen(key) >= MZAP_NAME_LEN ||
	    !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
		err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
		if (err == 0) {
			err = fzap_add(zn, integer_size, num_integers, val,
			    tag, tx);
		}
		zap = zn->zn_zap;	/* fzap_add() may change zap */
	} else {
		zfs_btree_index_t idx;
		if (mze_find(zn, &idx) != NULL) {
			err = SET_ERROR(EEXIST);
		} else {
			mzap_addent(zn, *intval);
		}
	}
	ASSERT(zap == zn->zn_zap);
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_add() failed */
		zap_unlockdir(zap, tag);
	return (err);
}

int
zap_add(objset_t *os, uint64_t zapobj, const char *key,
    int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;

	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
	/* zap_add_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_add_by_dnode(dnode_t *dn, const char *key,
    int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;

	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
	/* zap_add_impl() calls zap_unlockdir() */
	return (err);
}
static int
zap_add_uint64_impl(zap_t *zap, const uint64_t *key,
    int key_numints, int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx, const void *tag)
{
	int err;

	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
	zap = zn->zn_zap;	/* fzap_add() may change zap */
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_add() failed */
		zap_unlockdir(zap, tag);
	return (err);
}

int
zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_add_uint64_impl(zap, key, key_numints,
	    integer_size, num_integers, val, tx, FTAG);
	/* zap_add_uint64_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
    int key_numints, int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_add_uint64_impl(zap, key, key_numints,
	    integer_size, num_integers, val, tx, FTAG);
	/* zap_add_uint64_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_update(objset_t *os, uint64_t zapobj, const char *name,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	const uint64_t *intval = val;

	int err =
	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_update(zn, integer_size, num_integers, val,
		    FTAG, tx);
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else if (integer_size != 8 || num_integers != 1 ||
	    strlen(name) >= MZAP_NAME_LEN) {
		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
		    (u_longlong_t)zapobj, integer_size,
		    (u_longlong_t)num_integers, name);
		err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
		if (err == 0) {
			err = fzap_update(zn, integer_size, num_integers,
			    val, FTAG, tx);
		}
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t *mze = mze_find(zn, &idx);
		if (mze != NULL) {
			MZE_PHYS(zap, mze)->mze_value = *intval;
		} else {
			mzap_addent(zn, *intval);
		}
	}
	ASSERT(zap == zn->zn_zap);
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
		zap_unlockdir(zap, FTAG);
	return (err);
}

static int
zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx,
    const void *tag)
{
	int err;

	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_update(zn, integer_size, num_integers, val, tag, tx);
	zap = zn->zn_zap;	/* fzap_update() may change zap */
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
		zap_unlockdir(zap, tag);
	return (err);
}

int
zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, int integer_size, uint64_t num_integers, const void *val,
    dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_update_uint64_impl(zap, key, key_numints,
	    integer_size, num_integers, val, tx, FTAG);
	/* zap_update_uint64_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_update_uint64_impl(zap, key, key_numints,
	    integer_size, num_integers, val, tx, FTAG);
	/* zap_update_uint64_impl() calls zap_unlockdir() */
	return (err);
}
int
zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
{
	return (zap_remove_norm(os, zapobj, name, 0, tx));
}

static int
zap_remove_impl(zap_t *zap, const char *name,
    matchtype_t mt, dmu_tx_t *tx)
{
	int err = 0;

	zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
	if (zn == NULL)
		return (SET_ERROR(ENOTSUP));
	if (!zap->zap_ismicro) {
		err = fzap_remove(zn, tx);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t *mze = mze_find(zn, &idx);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			zap->zap_m.zap_num_entries--;
			memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t));
			zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx);
		}
	}
	zap_name_free(zn);
	return (err);
}

int
zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
    matchtype_t mt, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;

	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
	if (err)
		return (err);
	err = zap_remove_impl(zap, name, mt, tx);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;

	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
	if (err)
		return (err);
	err = zap_remove_impl(zap, name, 0, tx);
	zap_unlockdir(zap, FTAG);
	return (err);
}

static int
zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
    dmu_tx_t *tx, const void *tag)
{
	int err;

	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_remove(zn, tx);
	zap_name_free(zn);
	zap_unlockdir(zap, tag);
	return (err);
}

int
zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
	/* zap_remove_uint64_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
    dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
	/* zap_remove_uint64_impl() calls zap_unlockdir() */
	return (err);
}

/*
 * Routines for iterating over the attributes.
 */

static void
zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
    uint64_t serialized, boolean_t prefetch)
{
	zc->zc_objset = os;
	zc->zc_zap = NULL;
	zc->zc_leaf = NULL;
	zc->zc_zapobj = zapobj;
	zc->zc_serialized = serialized;
	zc->zc_hash = 0;
	zc->zc_cd = 0;
	zc->zc_prefetch = prefetch;
}

void
zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
    uint64_t serialized)
{
	zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
}

/*
 * Initialize a cursor at the beginning of the ZAP object.  The entire
 * ZAP object will be prefetched.
 */
void
zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
{
	zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
}

/*
 * Initialize a cursor at the beginning, but request that we not prefetch
 * the entire ZAP object.
 */
void
zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
{
	zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
}
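/*
 * Editorial note: the canonical iteration pattern over these cursor
 * routines, as used throughout the code base, is:
 *
 *	zap_cursor_t zc;
 *	zap_attribute_t za;
 *	for (zap_cursor_init(&zc, os, zapobj);
 *	    zap_cursor_retrieve(&zc, &za) == 0;
 *	    zap_cursor_advance(&zc)) {
 *		... use za.za_name, za.za_first_integer, etc. ...
 *	}
 *	zap_cursor_fini(&zc);
 */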
void
zap_cursor_fini(zap_cursor_t *zc)
{
	if (zc->zc_zap) {
		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
		zap_unlockdir(zc->zc_zap, NULL);
		zc->zc_zap = NULL;
	}
	if (zc->zc_leaf) {
		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
		zap_put_leaf(zc->zc_leaf);
		zc->zc_leaf = NULL;
	}
	zc->zc_objset = NULL;
}

uint64_t
zap_cursor_serialize(zap_cursor_t *zc)
{
	if (zc->zc_hash == -1ULL)
		return (-1ULL);
	if (zc->zc_zap == NULL)
		return (zc->zc_serialized);
	ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
	ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));

	/*
	 * We want to keep the high 32 bits of the cursor zero if we can, so
	 * that 32-bit programs can access this.  So usually use a small
	 * (28-bit) hash value so we can fit 4 bits of cd into the low 32 bits
	 * of the cursor.
	 *
	 * [ collision differentiator | zap_hashbits()-bit hash value ]
	 */
	return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
	    ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
}

int
zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
{
	int err;

	if (zc->zc_hash == -1ULL)
		return (SET_ERROR(ENOENT));

	if (zc->zc_zap == NULL) {
		int hb;
		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
		    RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
		if (err != 0)
			return (err);

		/*
		 * To support zap_cursor_init_serialized, advance, retrieve,
		 * we must add to the existing zc_cd, which may already
		 * be 1 due to the zap_cursor_advance.
		 */
		ASSERT(zc->zc_hash == 0);
		hb = zap_hashbits(zc->zc_zap);
		zc->zc_hash = zc->zc_serialized << (64 - hb);
		zc->zc_cd += zc->zc_serialized >> hb;
		if (zc->zc_cd >= zap_maxcd(zc->zc_zap))	/* corrupt serialized */
			zc->zc_cd = 0;
	} else {
		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
	}
	if (!zc->zc_zap->zap_ismicro) {
		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t mze_tofind;

		mze_tofind.mze_hash = zc->zc_hash >> 32;
		mze_tofind.mze_cd = zc->zc_cd;

		mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree,
		    &mze_tofind, &idx);
		if (mze == NULL) {
			mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree,
			    &idx, &idx);
		}
		if (mze) {
			mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
			ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
			za->za_normalization_conflict =
			    mzap_normalization_conflict(zc->zc_zap, NULL,
			    mze, &idx);
			za->za_integer_length = 8;
			za->za_num_integers = 1;
			za->za_first_integer = mzep->mze_value;
			(void) strlcpy(za->za_name, mzep->mze_name,
			    sizeof (za->za_name));
			zc->zc_hash = (uint64_t)mze->mze_hash << 32;
			zc->zc_cd = mze->mze_cd;
			err = 0;
		} else {
			zc->zc_hash = -1ULL;
			err = SET_ERROR(ENOENT);
		}
	}
	rw_exit(&zc->zc_zap->zap_rwlock);
	return (err);
}

void
zap_cursor_advance(zap_cursor_t *zc)
{
	if (zc->zc_hash == -1ULL)
		return;
	zc->zc_cd++;
}
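/*
 * Editorial worked example: with the default 28 hash bits,
 * zap_cursor_serialize() packs the cookie as
 *
 *	cookie = (zc_hash >> 36) | (zc_cd << 28)
 *
 * so zc_hash = 0xABCDEF0000000000 with zc_cd = 2 serializes to
 * 0x2ABCDEF0.  zap_cursor_retrieve() reverses this: the hash comes back
 * via cookie << 36 (the cd bits shift out the top) and the cd via
 * cookie >> 28.
 */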
int
zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);

	memset(zs, 0, sizeof (zap_stats_t));

	if (zap->zap_ismicro) {
		zs->zs_blocksize = zap->zap_dbuf->db_size;
		zs->zs_num_entries = zap->zap_m.zap_num_entries;
		zs->zs_num_blocks = 1;
	} else {
		fzap_get_stats(zap, zs);
	}
	zap_unlockdir(zap, FTAG);
	return (0);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(zap_create);
EXPORT_SYMBOL(zap_create_dnsize);
EXPORT_SYMBOL(zap_create_norm);
EXPORT_SYMBOL(zap_create_norm_dnsize);
EXPORT_SYMBOL(zap_create_flags);
EXPORT_SYMBOL(zap_create_flags_dnsize);
EXPORT_SYMBOL(zap_create_claim);
EXPORT_SYMBOL(zap_create_claim_norm);
EXPORT_SYMBOL(zap_create_claim_norm_dnsize);
EXPORT_SYMBOL(zap_create_hold);
EXPORT_SYMBOL(zap_destroy);
EXPORT_SYMBOL(zap_lookup);
EXPORT_SYMBOL(zap_lookup_by_dnode);
EXPORT_SYMBOL(zap_lookup_norm);
EXPORT_SYMBOL(zap_lookup_uint64);
EXPORT_SYMBOL(zap_contains);
EXPORT_SYMBOL(zap_prefetch);
EXPORT_SYMBOL(zap_prefetch_uint64);
EXPORT_SYMBOL(zap_add);
EXPORT_SYMBOL(zap_add_by_dnode);
EXPORT_SYMBOL(zap_add_uint64);
EXPORT_SYMBOL(zap_add_uint64_by_dnode);
EXPORT_SYMBOL(zap_update);
EXPORT_SYMBOL(zap_update_uint64);
EXPORT_SYMBOL(zap_update_uint64_by_dnode);
EXPORT_SYMBOL(zap_length);
EXPORT_SYMBOL(zap_length_uint64);
EXPORT_SYMBOL(zap_remove);
EXPORT_SYMBOL(zap_remove_by_dnode);
EXPORT_SYMBOL(zap_remove_norm);
EXPORT_SYMBOL(zap_remove_uint64);
EXPORT_SYMBOL(zap_remove_uint64_by_dnode);
EXPORT_SYMBOL(zap_count);
EXPORT_SYMBOL(zap_value_search);
EXPORT_SYMBOL(zap_join);
EXPORT_SYMBOL(zap_join_increment);
EXPORT_SYMBOL(zap_add_int);
EXPORT_SYMBOL(zap_remove_int);
EXPORT_SYMBOL(zap_lookup_int);
EXPORT_SYMBOL(zap_increment_int);
EXPORT_SYMBOL(zap_add_int_key);
EXPORT_SYMBOL(zap_lookup_int_key);
EXPORT_SYMBOL(zap_increment);
EXPORT_SYMBOL(zap_cursor_init);
EXPORT_SYMBOL(zap_cursor_fini);
EXPORT_SYMBOL(zap_cursor_retrieve);
EXPORT_SYMBOL(zap_cursor_advance);
EXPORT_SYMBOL(zap_cursor_serialize);
EXPORT_SYMBOL(zap_cursor_init_serialized);
EXPORT_SYMBOL(zap_get_stats);

/* CSTYLED */
ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW,
	"Maximum micro ZAP size, before converting to a fat ZAP, in bytes");
#endif