/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.
 */

#include <sys/zio.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zfs_context.h>
#include <sys/zap.h>
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
#include <sys/btree.h>
#include <sys/arc.h>
#include <sys/dmu_objset.h>

#ifdef _KERNEL
#include <sys/sunddi.h>
#endif

int zap_micro_max_size = MZAP_MAX_BLKSZ;

static int mzap_upgrade(zap_t **zapp,
    const void *tag, dmu_tx_t *tx, zap_flags_t flags);

uint64_t
zap_getflags(zap_t *zap)
{
	if (zap->zap_ismicro)
		return (0);
	return (zap_f_phys(zap)->zap_flags);
}

int
zap_hashbits(zap_t *zap)
{
	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
		return (48);
	else
		return (28);
}

uint32_t
zap_maxcd(zap_t *zap)
{
	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
		return ((1<<16)-1);
	else
		return (-1U);
}

static uint64_t
zap_hash(zap_name_t *zn)
{
	zap_t *zap = zn->zn_zap;
	uint64_t h = 0;

	if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
		ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
		h = *(uint64_t *)zn->zn_key_orig;
	} else {
		h = zap->zap_salt;
		ASSERT(h != 0);
		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

		if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
			const uint64_t *wp = zn->zn_key_norm;

			ASSERT(zn->zn_key_intlen == 8);
			for (int i = 0; i < zn->zn_key_norm_numints;
			    wp++, i++) {
				uint64_t word = *wp;

				for (int j = 0; j < 8; j++) {
					h = (h >> 8) ^
					    zfs_crc64_table[(h ^ word) & 0xFF];
					word >>= NBBY;
				}
			}
		} else {
			const uint8_t *cp = zn->zn_key_norm;

			/*
			 * We previously stored the terminating null on
			 * disk, but didn't hash it, so we need to
			 * continue to not hash it.  (The
			 * zn_key_*_numints includes the terminating
			 * null for non-binary keys.)
			 */
			int len = zn->zn_key_norm_numints - 1;

			ASSERT(zn->zn_key_intlen == 1);
			for (int i = 0; i < len; cp++, i++) {
				h = (h >> 8) ^
				    zfs_crc64_table[(h ^ *cp) & 0xFF];
			}
		}
	}
	/*
	 * Don't use all 64 bits, since we need some in the cookie for
	 * the collision differentiator.  We MUST use the high bits,
	 * since those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);

	return (h);
}
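
/*
 * Worked example (illustrative, not from the original source): in the
 * default case zap_hashbits() == 28, so the mask above keeps only the
 * top 28 bits of the CRC and zeroes the low 36 bits, e.g.
 * h = 0x123456789abcdef0 becomes 0x1234567000000000.  Those zeroed low
 * bits are what leaves room for the collision differentiator (cd) in
 * the cursor cookie.
 */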

static int
zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags)
{
	ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));

	size_t inlen = strlen(name) + 1;
	size_t outlen = ZAP_MAXNAMELEN;

	int err = 0;
	(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
	    normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
	    U8_UNICODE_LATEST, &err);

	return (err);
}

boolean_t
zap_match(zap_name_t *zn, const char *matchname)
{
	ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));

	if (zn->zn_matchtype & MT_NORMALIZE) {
		char norm[ZAP_MAXNAMELEN];

		if (zap_normalize(zn->zn_zap, matchname, norm,
		    zn->zn_normflags) != 0)
			return (B_FALSE);

		return (strcmp(zn->zn_key_norm, norm) == 0);
	} else {
		return (strcmp(zn->zn_key_orig, matchname) == 0);
	}
}
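
/*
 * Illustrative example (assumes zn_normflags includes
 * U8_TEXTPREP_TOUPPER, as on a case-insensitive file system): a lookup
 * of "readme.txt" with MT_NORMALIZE normalizes both the stored key and
 * the match name to "README.TXT", so the two compare equal even though
 * the on-disk (zn_key_orig) spelling may differ.
 */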

static zap_name_t *
zap_name_alloc(zap_t *zap)
{
	zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
	zn->zn_zap = zap;
	return (zn);
}

void
zap_name_free(zap_name_t *zn)
{
	kmem_free(zn, sizeof (zap_name_t));
}

static int
zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt)
{
	zap_t *zap = zn->zn_zap;

	zn->zn_key_intlen = sizeof (*key);
	zn->zn_key_orig = key;
	zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1;
	zn->zn_matchtype = mt;
	zn->zn_normflags = zap->zap_normflags;

	/*
	 * If we're dealing with a case-sensitive lookup on a mixed or
	 * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
	 * will fold case to all caps, overriding the lookup request.
	 */
	if (mt & MT_MATCH_CASE)
		zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;

	if (zap->zap_normflags) {
		/*
		 * We *must* use zap_normflags because this normalization is
		 * what the hash is computed from.
		 */
		if (zap_normalize(zap, key, zn->zn_normbuf,
		    zap->zap_normflags) != 0)
			return (SET_ERROR(ENOTSUP));
		zn->zn_key_norm = zn->zn_normbuf;
		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
	} else {
		if (mt != 0)
			return (SET_ERROR(ENOTSUP));
		zn->zn_key_norm = zn->zn_key_orig;
		zn->zn_key_norm_numints = zn->zn_key_orig_numints;
	}

	zn->zn_hash = zap_hash(zn);

	if (zap->zap_normflags != zn->zn_normflags) {
		/*
		 * We *must* use zn_normflags because this normalization is
		 * what the matching is based on.  (Not the hash!)
		 */
		if (zap_normalize(zap, key, zn->zn_normbuf,
		    zn->zn_normflags) != 0)
			return (SET_ERROR(ENOTSUP));
		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
	}

	return (0);
}

zap_name_t *
zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt)
{
	zap_name_t *zn = zap_name_alloc(zap);
	if (zap_name_init_str(zn, key, mt) != 0) {
		zap_name_free(zn);
		return (NULL);
	}
	return (zn);
}

static zap_name_t *
zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
{
	zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);

	ASSERT(zap->zap_normflags == 0);
	zn->zn_zap = zap;
	zn->zn_key_intlen = sizeof (*key);
	zn->zn_key_orig = zn->zn_key_norm = key;
	zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
	zn->zn_matchtype = 0;

	zn->zn_hash = zap_hash(zn);
	return (zn);
}

static void
mzap_byteswap(mzap_phys_t *buf, size_t size)
{
	buf->mz_block_type = BSWAP_64(buf->mz_block_type);
	buf->mz_salt = BSWAP_64(buf->mz_salt);
	buf->mz_normflags = BSWAP_64(buf->mz_normflags);
	int max = (size / MZAP_ENT_LEN) - 1;
	for (int i = 0; i < max; i++) {
		buf->mz_chunk[i].mze_value =
		    BSWAP_64(buf->mz_chunk[i].mze_value);
		buf->mz_chunk[i].mze_cd =
		    BSWAP_32(buf->mz_chunk[i].mze_cd);
	}
}

void
zap_byteswap(void *buf, size_t size)
{
	uint64_t block_type = *(uint64_t *)buf;

	if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
		/* ASSERT(magic == ZAP_LEAF_MAGIC); */
		mzap_byteswap(buf, size);
	} else {
		fzap_byteswap(buf, size);
	}
}

__attribute__((always_inline)) inline
static int
mze_compare(const void *arg1, const void *arg2)
{
	const mzap_ent_t *mze1 = arg1;
	const mzap_ent_t *mze2 = arg2;

	return (TREE_CMP((uint64_t)(mze1->mze_hash) << 32 | mze1->mze_cd,
	    (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd));
}

ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t,
    mze_compare)

static void
mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash)
{
	mzap_ent_t mze;

	ASSERT(zap->zap_ismicro);
	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	mze.mze_chunkid = chunkid;
	ASSERT0(hash & 0xffffffff);
	mze.mze_hash = hash >> 32;
	ASSERT3U(MZE_PHYS(zap, &mze)->mze_cd, <=, 0xffff);
	mze.mze_cd = (uint16_t)MZE_PHYS(zap, &mze)->mze_cd;
	ASSERT(MZE_PHYS(zap, &mze)->mze_name[0] != 0);
	zfs_btree_add(&zap->zap_m.zap_tree, &mze);
}
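
/*
 * Look up an entry in the in-memory B-tree.  Entries are sorted by
 * (hash, cd), so we locate the first entry with a matching 32-bit hash
 * and then walk forward through any hash collisions, comparing names
 * with zap_match() until we find the one we want (or run out).
 */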

static mzap_ent_t *
mze_find(zap_name_t *zn, zfs_btree_index_t *idx)
{
	mzap_ent_t mze_tofind;
	mzap_ent_t *mze;
	zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree;

	ASSERT(zn->zn_zap->zap_ismicro);
	ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));

	ASSERT0(zn->zn_hash & 0xffffffff);
	mze_tofind.mze_hash = zn->zn_hash >> 32;
	mze_tofind.mze_cd = 0;

	mze = zfs_btree_find(tree, &mze_tofind, idx);
	if (mze == NULL)
		mze = zfs_btree_next(tree, idx, idx);
	for (; mze && mze->mze_hash == mze_tofind.mze_hash;
	    mze = zfs_btree_next(tree, idx, idx)) {
		ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
		if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
			return (mze);
	}

	return (NULL);
}

static uint32_t
mze_find_unused_cd(zap_t *zap, uint64_t hash)
{
	mzap_ent_t mze_tofind;
	zfs_btree_index_t idx;
	zfs_btree_t *tree = &zap->zap_m.zap_tree;

	ASSERT(zap->zap_ismicro);
	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));

	ASSERT0(hash & 0xffffffff);
	hash >>= 32;
	mze_tofind.mze_hash = hash;
	mze_tofind.mze_cd = 0;

	uint32_t cd = 0;
	for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
	    mze && mze->mze_hash == hash;
	    mze = zfs_btree_next(tree, &idx, &idx)) {
		if (mze->mze_cd != cd)
			break;
		cd++;
	}

	return (cd);
}

/*
 * Each mzap entry requires at most 4 chunks:
 * 3 chunks for the name + 1 chunk for the value.
 */
#define	MZAP_ENT_CHUNKS	(1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \
	ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t)))

/*
 * Check if the current entry keeps the colliding entries under the fatzap leaf
 * size.
 */
static boolean_t
mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
{
	zap_t *zap = zn->zn_zap;
	mzap_ent_t mze_tofind;
	zfs_btree_index_t idx;
	zfs_btree_t *tree = &zap->zap_m.zap_tree;
	uint32_t mzap_ents = 0;

	ASSERT0(hash & 0xffffffff);
	hash >>= 32;
	mze_tofind.mze_hash = hash;
	mze_tofind.mze_cd = 0;

	for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
	    mze && mze->mze_hash == hash;
	    mze = zfs_btree_next(tree, &idx, &idx)) {
		mzap_ents++;
	}

	/* Include the new entry being added */
	mzap_ents++;

	return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS));
}

static void
mze_destroy(zap_t *zap)
{
	zfs_btree_clear(&zap->zap_m.zap_tree);
	zfs_btree_destroy(&zap->zap_m.zap_tree);
}
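
/*
 * Layout note (a summary consistent with the code below): a microzap
 * object is a single block consisting of one MZAP_ENT_LEN-sized header
 * chunk holding mz_block_type (ZBT_MICRO), mz_salt, and mz_normflags,
 * followed by an array of fixed-size mzap_ent_phys_t chunks, each
 * holding a name, a single uint64_t value, and a collision
 * differentiator (cd).  Hence zap_num_chunks below is
 * db_size / MZAP_ENT_LEN - 1.
 */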

static zap_t *
mzap_open(dmu_buf_t *db)
{
	zap_t *winner;
	uint64_t *zap_hdr = (uint64_t *)db->db_data;
	uint64_t zap_block_type = zap_hdr[0];
	uint64_t zap_magic = zap_hdr[1];

	ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));

	zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
	rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL);
	rw_enter(&zap->zap_rwlock, RW_WRITER);
	zap->zap_objset = dmu_buf_get_objset(db);
	zap->zap_object = db->db_object;
	zap->zap_dbuf = db;

	if (zap_block_type != ZBT_MICRO) {
		mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT,
		    0);
		zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
		if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) {
			winner = NULL;	/* No actual winner here... */
			goto handle_winner;
		}
	} else {
		zap->zap_ismicro = TRUE;
	}

	/*
	 * Make sure that zap_ismicro is set before we let others see
	 * it, because zap_lockdir() checks zap_ismicro without the lock
	 * held.
	 */
	dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
	winner = dmu_buf_set_user(db, &zap->zap_dbu);

	if (winner != NULL)
		goto handle_winner;

	if (zap->zap_ismicro) {
		zap->zap_salt = zap_m_phys(zap)->mz_salt;
		zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;

		/*
		 * Reduce the B-tree leaf from 4KB to 512 bytes to reduce
		 * memmove() overhead on massive inserts below.  It still
		 * allows storing 62 entries before we have to add a 2KB
		 * B-tree core node.
		 */
		zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare,
		    mze_find_in_buf, sizeof (mzap_ent_t), 512);

		zap_name_t *zn = zap_name_alloc(zap);
		for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) {
			mzap_ent_phys_t *mze =
			    &zap_m_phys(zap)->mz_chunk[i];
			if (mze->mze_name[0]) {
				zap->zap_m.zap_num_entries++;
				zap_name_init_str(zn, mze->mze_name, 0);
				mze_insert(zap, i, zn->zn_hash);
			}
		}
		zap_name_free(zn);
	} else {
		zap->zap_salt = zap_f_phys(zap)->zap_salt;
		zap->zap_normflags = zap_f_phys(zap)->zap_normflags;

		ASSERT3U(sizeof (struct zap_leaf_header), ==,
		    2*ZAP_LEAF_CHUNKSIZE);

		/*
		 * The embedded pointer table should not overlap the
		 * other members.
		 */
		ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
		    &zap_f_phys(zap)->zap_salt);

		/*
		 * The embedded pointer table should end at the end of
		 * the block.
		 */
		ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
		    1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
		    (uintptr_t)zap_f_phys(zap), ==,
		    zap->zap_dbuf->db_size);
	}
	rw_exit(&zap->zap_rwlock);
	return (zap);

handle_winner:
	rw_exit(&zap->zap_rwlock);
	rw_destroy(&zap->zap_rwlock);
	if (!zap->zap_ismicro)
		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
	kmem_free(zap, sizeof (zap_t));
	return (winner);
}

/*
 * This routine "consumes" the caller's hold on the dbuf, which must
 * have the specified tag.
 */
static int
zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
{
	ASSERT0(db->db_offset);
	objset_t *os = dmu_buf_get_objset(db);
	uint64_t obj = db->db_object;
	dmu_object_info_t doi;

	*zapp = NULL;

	dmu_object_info_from_dnode(dn, &doi);
	if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
		return (SET_ERROR(EINVAL));

	zap_t *zap = dmu_buf_get_user(db);
	if (zap == NULL) {
		zap = mzap_open(db);
		if (zap == NULL) {
			/*
			 * mzap_open() didn't like what it saw on-disk.
			 * Check for corruption!
			 */
			return (SET_ERROR(EIO));
		}
	}

	/*
	 * We're checking zap_ismicro without the lock held, in order to
	 * tell what type of lock we want.  Once we have some sort of
	 * lock, see if it really is the right type.  In practice this
	 * can only be different if it was upgraded from micro to fat,
	 * and micro wanted WRITER but fat only needs READER.
	 */
	krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
	rw_enter(&zap->zap_rwlock, lt);
	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
		/* it was upgraded, now we only need reader */
		ASSERT(lt == RW_WRITER);
		ASSERT(RW_READER ==
		    ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
		rw_downgrade(&zap->zap_rwlock);
		lt = RW_READER;
	}

	zap->zap_objset = os;
	zap->zap_dnode = dn;

	if (lt == RW_WRITER)
		dmu_buf_will_dirty(db, tx);

	ASSERT3P(zap->zap_dbuf, ==, db);

	ASSERT(!zap->zap_ismicro ||
	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
	if (zap->zap_ismicro && tx && adding &&
	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
		if (newsz > zap_micro_max_size) {
			dprintf("upgrading obj %llu: num_entries=%u\n",
			    (u_longlong_t)obj, zap->zap_m.zap_num_entries);
			*zapp = zap;
			int err = mzap_upgrade(zapp, tag, tx, 0);
			if (err != 0)
				rw_exit(&zap->zap_rwlock);
			return (err);
		}
		VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
		zap->zap_m.zap_num_chunks =
		    db->db_size / MZAP_ENT_LEN - 1;
	}

	*zapp = zap;
	return (0);
}

static int
zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
    zap_t **zapp)
{
	dmu_buf_t *db;
	int err;

	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
	if (err != 0)
		return (err);
	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
	if (err != 0)
		dmu_buf_rele(db, tag);
	else
		VERIFY(dnode_add_ref(dn, tag));
	return (err);
}

int
zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
    zap_t **zapp)
{
	dnode_t *dn;
	dmu_buf_t *db;
	int err;

	err = dnode_hold(os, obj, tag, &dn);
	if (err != 0)
		return (err);
	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
	if (err != 0) {
		dnode_rele(dn, tag);
		return (err);
	}
	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
	if (err != 0) {
		dmu_buf_rele(db, tag);
		dnode_rele(dn, tag);
	}
	return (err);
}

void
zap_unlockdir(zap_t *zap, const void *tag)
{
	rw_exit(&zap->zap_rwlock);
	dnode_rele(zap->zap_dnode, tag);
	dmu_buf_rele(zap->zap_dbuf, tag);
}
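
/*
 * Convert a microzap to a fatzap.  Summary of the steps below: copy the
 * current micro block aside, grow the object's block size to the fatzap
 * default (skipped when flags are passed), tear down the in-memory
 * B-tree, write a fatzap header into the block via fzap_upgrade(), and
 * finally re-add every surviving entry through fzap_add_cd() so the
 * original collision differentiators are preserved.
 */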

static int
mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags)
{
	int err = 0;
	zap_t *zap = *zapp;

	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	int sz = zap->zap_dbuf->db_size;
	mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP);
	memcpy(mzp, zap->zap_dbuf->db_data, sz);
	int nchunks = zap->zap_m.zap_num_chunks;

	if (!flags) {
		err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
		    1ULL << fzap_default_block_shift, 0, tx);
		if (err != 0) {
			vmem_free(mzp, sz);
			return (err);
		}
	}

	dprintf("upgrading obj=%llu with %u chunks\n",
	    (u_longlong_t)zap->zap_object, nchunks);
	/* XXX destroy the tree later, so we can use the stored hash value */
	mze_destroy(zap);

	fzap_upgrade(zap, tx, flags);

	zap_name_t *zn = zap_name_alloc(zap);
	for (int i = 0; i < nchunks; i++) {
		mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
		if (mze->mze_name[0] == 0)
			continue;
		dprintf("adding %s=%llu\n",
		    mze->mze_name, (u_longlong_t)mze->mze_value);
		zap_name_init_str(zn, mze->mze_name, 0);
		/* If we fail here, we would end up losing entries */
		VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
		    tag, tx));
		zap = zn->zn_zap;	/* fzap_add_cd() may change zap */
	}
	zap_name_free(zn);
	vmem_free(mzp, sz);
	*zapp = zap;
	return (0);
}

/*
 * The "normflags" determine the behavior of the matchtype_t which is
 * passed to zap_lookup_norm().  Names which have the same normalized
 * version will be stored with the same hash value, and therefore we can
 * perform normalization-insensitive lookups.  We can be Unicode form-
 * insensitive and/or case-insensitive.  The following flags are valid for
 * "normflags":
 *
 * U8_TEXTPREP_NFC
 * U8_TEXTPREP_NFD
 * U8_TEXTPREP_NFKC
 * U8_TEXTPREP_NFKD
 * U8_TEXTPREP_TOUPPER
 *
 * The *_NF* (Normalization Form) flags are mutually exclusive; at most one
 * of them may be supplied.
 */
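
/*
 * Illustrative example (a sketch of caller behavior, not taken from
 * this file): the ZPL creates directory ZAPs with U8_TEXTPREP_TOUPPER
 * in normflags when a file system has casesensitivity=insensitive, so
 * that zap_lookup_norm() with MT_NORMALIZE can match "Makefile"
 * against a stored "makefile".
 */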

void
mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
{
	dmu_buf_t *db;

	VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH));

	dmu_buf_will_dirty(db, tx);
	mzap_phys_t *zp = db->db_data;
	zp->mz_block_type = ZBT_MICRO;
	zp->mz_salt =
	    ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL;
	zp->mz_normflags = normflags;

	if (flags != 0) {
		zap_t *zap;
		/* Only fat zap supports flags; upgrade immediately. */
		VERIFY(dnode_add_ref(dn, FTAG));
		VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER,
		    B_FALSE, B_FALSE, &zap));
		VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
		zap_unlockdir(zap, FTAG);
	} else {
		dmu_buf_rele(db, FTAG);
	}
}

static uint64_t
zap_create_impl(objset_t *os, int normflags, zap_flags_t flags,
    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
{
	uint64_t obj;

	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);

	if (allocated_dnode == NULL) {
		dnode_t *dn;
		obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
		    indirect_blockshift, bonustype, bonuslen, dnodesize,
		    &dn, FTAG, tx);
		mzap_create_impl(dn, normflags, flags, tx);
		dnode_rele(dn, FTAG);
	} else {
		obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
		    indirect_blockshift, bonustype, bonuslen, dnodesize,
		    allocated_dnode, tag, tx);
		mzap_create_impl(*allocated_dnode, normflags, flags, tx);
	}

	return (obj);
}

int
zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen,
	    0, tx));
}

int
zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (zap_create_claim_norm_dnsize(os, obj,
	    0, ot, bonustype, bonuslen, dnodesize, tx));
}

int
zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
    dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype,
	    bonuslen, 0, tx));
}

int
zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
    dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int error;

	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
	error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
	    dnodesize, tx);
	if (error != 0)
		return (error);

	error = dnode_hold(os, obj, FTAG, &dn);
	if (error != 0)
		return (error);

	mzap_create_impl(dn, normflags, 0, tx);

	dnode_rele(dn, FTAG);

	return (0);
}

uint64_t
zap_create(objset_t *os, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
}

uint64_t
zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen,
	    dnodesize, tx));
}

uint64_t
zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen,
	    0, tx));
}

uint64_t
zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (zap_create_impl(os, normflags, 0, ot, 0, 0,
	    bonustype, bonuslen, dnodesize, NULL, NULL, tx));
}

uint64_t
zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_flags_dnsize(os, normflags, flags, ot,
	    leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
}

uint64_t
zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
	    indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL,
	    tx));
}

/*
 * Create a zap object and return a pointer to the newly allocated dnode via
 * the allocated_dnode argument.  The returned dnode will be held and the
 * caller is responsible for releasing the hold by calling dnode_rele().
 */
uint64_t
zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
{
	return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
	    indirect_blockshift, bonustype, bonuslen, dnodesize,
	    allocated_dnode, tag, tx));
}
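
/*
 * Usage sketch for the create family above (object type chosen for
 * illustration): most callers want the simplest form, e.g.
 *
 *	uint64_t obj = zap_create(os, DMU_OT_DIRECTORY_CONTENTS,
 *	    DMU_OT_NONE, 0, tx);
 *
 * The _norm variants add normalization flags, the _dnsize variants a
 * dnode size, the _flags variants fatzap-only flags and block shifts,
 * and the _claim variants install the ZAP at a caller-chosen object
 * number instead of allocating one.
 */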

int
zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
{
	/*
	 * dmu_object_free will free the object number and free the
	 * data.  Freeing the data will cause our pageout function to be
	 * called, which will destroy our data (zap_leaf_t's and zap_t).
	 */

	return (dmu_object_free(os, zapobj, tx));
}

void
zap_evict_sync(void *dbu)
{
	zap_t *zap = dbu;

	rw_destroy(&zap->zap_rwlock);

	if (zap->zap_ismicro)
		mze_destroy(zap);
	else
		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);

	kmem_free(zap, sizeof (zap_t));
}

int
zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	if (!zap->zap_ismicro) {
		err = fzap_count(zap, count);
	} else {
		*count = zap->zap_m.zap_num_entries;
	}
	zap_unlockdir(zap, FTAG);
	return (err);
}

/*
 * zn may be NULL; if not specified, it will be computed if needed.
 * See also the comment above zap_entry_normalization_conflict().
 */
static boolean_t
mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze,
    zfs_btree_index_t *idx)
{
	boolean_t allocdzn = B_FALSE;
	mzap_ent_t *other;
	zfs_btree_index_t oidx;

	if (zap->zap_normflags == 0)
		return (B_FALSE);

	for (other = zfs_btree_prev(&zap->zap_m.zap_tree, idx, &oidx);
	    other && other->mze_hash == mze->mze_hash;
	    other = zfs_btree_prev(&zap->zap_m.zap_tree, &oidx, &oidx)) {

		if (zn == NULL) {
			zn = zap_name_alloc_str(zap,
			    MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
			allocdzn = B_TRUE;
		}
		if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
			if (allocdzn)
				zap_name_free(zn);
			return (B_TRUE);
		}
	}

	for (other = zfs_btree_next(&zap->zap_m.zap_tree, idx, &oidx);
	    other && other->mze_hash == mze->mze_hash;
	    other = zfs_btree_next(&zap->zap_m.zap_tree, &oidx, &oidx)) {

		if (zn == NULL) {
			zn = zap_name_alloc_str(zap,
			    MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
			allocdzn = B_TRUE;
		}
		if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
			if (allocdzn)
				zap_name_free(zn);
			return (B_TRUE);
		}
	}

	if (allocdzn)
		zap_name_free(zn);
	return (B_FALSE);
}

/*
 * Routines for manipulating attributes.
 */

int
zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf)
{
	return (zap_lookup_norm(os, zapobj, name, integer_size,
	    num_integers, buf, 0, NULL, 0, NULL));
}
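
/*
 * Usage sketch for zap_lookup() (names and values are illustrative):
 * to read a single 64-bit value stored under "refcount":
 *
 *	uint64_t val;
 *	int err = zap_lookup(os, zapobj, "refcount",
 *	    sizeof (uint64_t), 1, &val);
 *
 * integer_size and num_integers describe the caller's buffer; a
 * microzap entry is always a single uint64_t, so other shapes fail
 * with EINVAL/EOVERFLOW (see zap_lookup_impl below) unless the object
 * is a fatzap.
 */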

static int
zap_lookup_impl(zap_t *zap, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf,
    matchtype_t mt, char *realname, int rn_len,
    boolean_t *ncp)
{
	int err = 0;

	zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
	if (zn == NULL)
		return (SET_ERROR(ENOTSUP));

	if (!zap->zap_ismicro) {
		err = fzap_lookup(zn, integer_size, num_integers, buf,
		    realname, rn_len, ncp);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t *mze = mze_find(zn, &idx);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			if (num_integers < 1) {
				err = SET_ERROR(EOVERFLOW);
			} else if (integer_size != 8) {
				err = SET_ERROR(EINVAL);
			} else {
				*(uint64_t *)buf =
				    MZE_PHYS(zap, mze)->mze_value;
				if (realname != NULL)
					(void) strlcpy(realname,
					    MZE_PHYS(zap, mze)->mze_name,
					    rn_len);
				if (ncp) {
					*ncp = mzap_normalization_conflict(zap,
					    zn, mze, &idx);
				}
			}
		}
	}
	zap_name_free(zn);
	return (err);
}

int
zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf,
    matchtype_t mt, char *realname, int rn_len,
    boolean_t *ncp)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_lookup_impl(zap, name, integer_size,
	    num_integers, buf, mt, realname, rn_len, ncp);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
{
	zap_t *zap;
	int err;
	zap_name_t *zn;

	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err)
		return (err);
	zn = zap_name_alloc_str(zap, name, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	fzap_prefetch(zn);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_prefetch_object(objset_t *os, uint64_t zapobj)
{
	int error;
	dmu_object_info_t doi;

	error = dmu_object_info(os, zapobj, &doi);
	if (error == 0 && DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
		error = SET_ERROR(EINVAL);
	if (error == 0)
		dmu_prefetch_wait(os, zapobj, 0, doi.doi_max_offset);

	return (error);
}

int
zap_lookup_by_dnode(dnode_t *dn, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf)
{
	return (zap_lookup_norm_by_dnode(dn, name, integer_size,
	    num_integers, buf, 0, NULL, 0, NULL));
}

int
zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf,
    matchtype_t mt, char *realname, int rn_len,
    boolean_t *ncp)
{
	zap_t *zap;

	int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
	    FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_lookup_impl(zap, name, integer_size,
	    num_integers, buf, mt, realname, rn_len, ncp);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	fzap_prefetch(zn);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	err = fzap_lookup(zn, integer_size, num_integers, buf,
	    NULL, 0, NULL);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_contains(objset_t *os, uint64_t zapobj, const char *name)
{
	int err = zap_lookup_norm(os, zapobj, name, 0,
	    0, NULL, 0, NULL, 0, NULL);
	if (err == EOVERFLOW || err == EINVAL)
		err = 0; /* found, but skipped reading the value */
	return (err);
}

int
zap_length(objset_t *os, uint64_t zapobj, const char *name,
    uint64_t *integer_size, uint64_t *num_integers)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_length(zn, integer_size, num_integers);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t *mze = mze_find(zn, &idx);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			if (integer_size)
				*integer_size = 8;
			if (num_integers)
				*num_integers = 1;
		}
	}
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, uint64_t *integer_size, uint64_t *num_integers)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_length(zn, integer_size, num_integers);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

static void
mzap_addent(zap_name_t *zn, uint64_t value)
{
	zap_t *zap = zn->zn_zap;
	uint16_t start = zap->zap_m.zap_alloc_next;

	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

#ifdef ZFS_DEBUG
	for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
		ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
	}
#endif

	uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash);
	/* given the limited size of the microzap, this can't happen */
	ASSERT(cd < zap_maxcd(zap));
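
	/*
	 * Find a free chunk for the new entry.  Start scanning at the
	 * allocation rotor (zap_alloc_next) so that repeated inserts
	 * spread across the block, and wrap around to chunk 0 once if
	 * the tail of the block is full.
	 */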
again:
	for (uint16_t i = start; i < zap->zap_m.zap_num_chunks; i++) {
		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
		if (mze->mze_name[0] == 0) {
			mze->mze_value = value;
			mze->mze_cd = cd;
			(void) strlcpy(mze->mze_name, zn->zn_key_orig,
			    sizeof (mze->mze_name));
			zap->zap_m.zap_num_entries++;
			zap->zap_m.zap_alloc_next = i+1;
			if (zap->zap_m.zap_alloc_next ==
			    zap->zap_m.zap_num_chunks)
				zap->zap_m.zap_alloc_next = 0;
			mze_insert(zap, i, zn->zn_hash);
			return;
		}
	}
	if (start != 0) {
		start = 0;
		goto again;
	}
	cmn_err(CE_PANIC, "out of entries!");
}
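
/*
 * Add an entry under an already-locked ZAP.  A microzap entry can only
 * hold a single uint64_t under a name shorter than MZAP_NAME_LEN, so a
 * microzap is upgraded to a fatzap before the add when the value is not
 * exactly one 8-byte integer, the name is too long, or the colliding
 * entries would no longer fit in one fatzap leaf (mze_canfit_fzap_leaf).
 * On return the ZAP has been unlocked.
 */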
static int
zap_add_impl(zap_t *zap, const char *key,
    int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx, const void *tag)
{
	const uint64_t *intval = val;
	int err = 0;

	zap_name_t *zn = zap_name_alloc_str(zap, key, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
		zap = zn->zn_zap;	/* fzap_add() may change zap */
	} else if (integer_size != 8 || num_integers != 1 ||
	    strlen(key) >= MZAP_NAME_LEN ||
	    !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
		err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
		if (err == 0) {
			err = fzap_add(zn, integer_size, num_integers, val,
			    tag, tx);
		}
		zap = zn->zn_zap;	/* fzap_add() may change zap */
	} else {
		zfs_btree_index_t idx;
		if (mze_find(zn, &idx) != NULL) {
			err = SET_ERROR(EEXIST);
		} else {
			mzap_addent(zn, *intval);
		}
	}
	ASSERT(zap == zn->zn_zap);
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_add() failed */
		zap_unlockdir(zap, tag);
	return (err);
}

int
zap_add(objset_t *os, uint64_t zapobj, const char *key,
    int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;

	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
	/* zap_add_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_add_by_dnode(dnode_t *dn, const char *key,
    int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;

	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
	/* zap_add_impl() calls zap_unlockdir() */
	return (err);
}

static int
zap_add_uint64_impl(zap_t *zap, const uint64_t *key,
    int key_numints, int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx, const void *tag)
{
	int err;

	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
	zap = zn->zn_zap;	/* fzap_add() may change zap */
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_add() failed */
		zap_unlockdir(zap, tag);
	return (err);
}

int
zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_add_uint64_impl(zap, key, key_numints,
	    integer_size, num_integers, val, tx, FTAG);
	/* zap_add_uint64_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
    int key_numints, int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_add_uint64_impl(zap, key, key_numints,
	    integer_size, num_integers, val, tx, FTAG);
	/* zap_add_uint64_impl() calls zap_unlockdir() */
	return (err);
}
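
/*
 * Note that the *_uint64 variants above call straight into fzap_add():
 * binary (uint64-keyed) ZAPs carry ZAP_FLAG_UINT64_KEY, and since flags
 * are only supported by the fatzap they are upgraded at creation time
 * (see mzap_create_impl), so there is never a microzap case here.
 */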

int
zap_update(objset_t *os, uint64_t zapobj, const char *name,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	const uint64_t *intval = val;

	int err =
	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_update(zn, integer_size, num_integers, val,
		    FTAG, tx);
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else if (integer_size != 8 || num_integers != 1 ||
	    strlen(name) >= MZAP_NAME_LEN) {
		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
		    (u_longlong_t)zapobj, integer_size,
		    (u_longlong_t)num_integers, name);
		err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
		if (err == 0) {
			err = fzap_update(zn, integer_size, num_integers,
			    val, FTAG, tx);
		}
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t *mze = mze_find(zn, &idx);
		if (mze != NULL) {
			MZE_PHYS(zap, mze)->mze_value = *intval;
		} else {
			mzap_addent(zn, *intval);
		}
	}
	ASSERT(zap == zn->zn_zap);
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
		zap_unlockdir(zap, FTAG);
	return (err);
}

static int
zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx,
    const void *tag)
{
	int err;

	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_update(zn, integer_size, num_integers, val, tag, tx);
	zap = zn->zn_zap;	/* fzap_update() may change zap */
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
		zap_unlockdir(zap, tag);
	return (err);
}

int
zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, int integer_size, uint64_t num_integers, const void *val,
    dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_update_uint64_impl(zap, key, key_numints,
	    integer_size, num_integers, val, tx, FTAG);
	/* zap_update_uint64_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_update_uint64_impl(zap, key, key_numints,
	    integer_size, num_integers, val, tx, FTAG);
	/* zap_update_uint64_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
{
	return (zap_remove_norm(os, zapobj, name, 0, tx));
}

static int
zap_remove_impl(zap_t *zap, const char *name,
    matchtype_t mt, dmu_tx_t *tx)
{
	int err = 0;

	zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
	if (zn == NULL)
		return (SET_ERROR(ENOTSUP));
	if (!zap->zap_ismicro) {
		err = fzap_remove(zn, tx);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t *mze = mze_find(zn, &idx);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			zap->zap_m.zap_num_entries--;
			memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t));
			zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx);
		}
	}
	zap_name_free(zn);
	return (err);
}

int
zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
    matchtype_t mt, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;

	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
	if (err)
		return (err);
	err = zap_remove_impl(zap, name, mt, tx);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;

	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
	if (err)
		return (err);
	err = zap_remove_impl(zap, name, 0, tx);
	zap_unlockdir(zap, FTAG);
	return (err);
}

static int
zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
    dmu_tx_t *tx, const void *tag)
{
	int err;

	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_remove(zn, tx);
	zap_name_free(zn);
	zap_unlockdir(zap, tag);
	return (err);
}

int
zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
	/* zap_remove_uint64_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
    dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
	/* zap_remove_uint64_impl() calls zap_unlockdir() */
	return (err);
}

/*
 * Routines for iterating over the attributes.
 */

static void
zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
    uint64_t serialized, boolean_t prefetch)
{
	zc->zc_objset = os;
	zc->zc_zap = NULL;
	zc->zc_leaf = NULL;
	zc->zc_zapobj = zapobj;
	zc->zc_serialized = serialized;
	zc->zc_hash = 0;
	zc->zc_cd = 0;
	zc->zc_prefetch = prefetch;
}

void
zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
    uint64_t serialized)
{
	zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
}

/*
 * Initialize a cursor at the beginning of the ZAP object.  The entire
 * ZAP object will be prefetched.
 */
void
zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
{
	zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
}

/*
 * Initialize a cursor at the beginning, but request that we not prefetch
 * the entire ZAP object.
 */
void
zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
{
	zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
}

void
zap_cursor_fini(zap_cursor_t *zc)
{
	if (zc->zc_zap) {
		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
		zap_unlockdir(zc->zc_zap, NULL);
		zc->zc_zap = NULL;
	}
	if (zc->zc_leaf) {
		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
		zap_put_leaf(zc->zc_leaf);
		zc->zc_leaf = NULL;
	}
	zc->zc_objset = NULL;
}

uint64_t
zap_cursor_serialize(zap_cursor_t *zc)
{
	if (zc->zc_hash == -1ULL)
		return (-1ULL);
	if (zc->zc_zap == NULL)
		return (zc->zc_serialized);
	ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
	ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));

	/*
	 * We want to keep the high 32 bits of the cursor zero if we can, so
	 * that 32-bit programs can access this.  So usually use a small
	 * (28-bit) hash value so we can fit 4 bits of cd into the low 32
	 * bits of the cursor.
	 *
	 * [ collision differentiator | zap_hashbits()-bit hash value ]
	 */
	return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
	    ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
}
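
/*
 * Worked example (default 28-bit hashes, so zap_hashbits() == 28): a
 * cursor at zc_hash = 0xabcdef0000000000 with zc_cd = 3 serializes to
 * (0xabcdef0000000000 >> 36) | (3 << 28) = 0x3abcdef0, which still fits
 * in 32 bits.  zap_cursor_retrieve() below reverses this split when it
 * rehydrates zc_serialized back into zc_hash and zc_cd.
 */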

int
zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
{
	int err;

	if (zc->zc_hash == -1ULL)
		return (SET_ERROR(ENOENT));

	if (zc->zc_zap == NULL) {
		int hb;
		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
		    RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
		if (err != 0)
			return (err);

		/*
		 * To support zap_cursor_init_serialized, advance, retrieve,
		 * we must add to the existing zc_cd, which may already
		 * be 1 due to the zap_cursor_advance.
		 */
		ASSERT(zc->zc_hash == 0);
		hb = zap_hashbits(zc->zc_zap);
		zc->zc_hash = zc->zc_serialized << (64 - hb);
		zc->zc_cd += zc->zc_serialized >> hb;
		if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
			zc->zc_cd = 0;
	} else {
		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
	}
	if (!zc->zc_zap->zap_ismicro) {
		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t mze_tofind;

		mze_tofind.mze_hash = zc->zc_hash >> 32;
		mze_tofind.mze_cd = zc->zc_cd;

		mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree,
		    &mze_tofind, &idx);
		if (mze == NULL) {
			mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree,
			    &idx, &idx);
		}
		if (mze) {
			mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
			ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
			za->za_normalization_conflict =
			    mzap_normalization_conflict(zc->zc_zap, NULL,
			    mze, &idx);
			za->za_integer_length = 8;
			za->za_num_integers = 1;
			za->za_first_integer = mzep->mze_value;
			(void) strlcpy(za->za_name, mzep->mze_name,
			    sizeof (za->za_name));
			zc->zc_hash = (uint64_t)mze->mze_hash << 32;
			zc->zc_cd = mze->mze_cd;
			err = 0;
		} else {
			zc->zc_hash = -1ULL;
			err = SET_ERROR(ENOENT);
		}
	}
	rw_exit(&zc->zc_zap->zap_rwlock);
	return (err);
}

void
zap_cursor_advance(zap_cursor_t *zc)
{
	if (zc->zc_hash == -1ULL)
		return;
	zc->zc_cd++;
}
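
/*
 * Canonical iteration pattern (a sketch; za is typically allocated by
 * the caller):
 *
 *	zap_cursor_t zc;
 *	zap_attribute_t za;
 *
 *	for (zap_cursor_init(&zc, os, zapobj);
 *	    zap_cursor_retrieve(&zc, &za) == 0;
 *	    zap_cursor_advance(&zc)) {
 *		... use za.za_name, za.za_first_integer, etc. ...
 *	}
 *	zap_cursor_fini(&zc);
 */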

int
zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);

	memset(zs, 0, sizeof (zap_stats_t));

	if (zap->zap_ismicro) {
		zs->zs_blocksize = zap->zap_dbuf->db_size;
		zs->zs_num_entries = zap->zap_m.zap_num_entries;
		zs->zs_num_blocks = 1;
	} else {
		fzap_get_stats(zap, zs);
	}
	zap_unlockdir(zap, FTAG);
	return (0);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(zap_create);
EXPORT_SYMBOL(zap_create_dnsize);
EXPORT_SYMBOL(zap_create_norm);
EXPORT_SYMBOL(zap_create_norm_dnsize);
EXPORT_SYMBOL(zap_create_flags);
EXPORT_SYMBOL(zap_create_flags_dnsize);
EXPORT_SYMBOL(zap_create_claim);
EXPORT_SYMBOL(zap_create_claim_norm);
EXPORT_SYMBOL(zap_create_claim_norm_dnsize);
EXPORT_SYMBOL(zap_create_hold);
EXPORT_SYMBOL(zap_destroy);
EXPORT_SYMBOL(zap_lookup);
EXPORT_SYMBOL(zap_lookup_by_dnode);
EXPORT_SYMBOL(zap_lookup_norm);
EXPORT_SYMBOL(zap_lookup_uint64);
EXPORT_SYMBOL(zap_contains);
EXPORT_SYMBOL(zap_prefetch);
EXPORT_SYMBOL(zap_prefetch_uint64);
EXPORT_SYMBOL(zap_prefetch_object);
EXPORT_SYMBOL(zap_add);
EXPORT_SYMBOL(zap_add_by_dnode);
EXPORT_SYMBOL(zap_add_uint64);
EXPORT_SYMBOL(zap_add_uint64_by_dnode);
EXPORT_SYMBOL(zap_update);
EXPORT_SYMBOL(zap_update_uint64);
EXPORT_SYMBOL(zap_update_uint64_by_dnode);
EXPORT_SYMBOL(zap_length);
EXPORT_SYMBOL(zap_length_uint64);
EXPORT_SYMBOL(zap_remove);
EXPORT_SYMBOL(zap_remove_by_dnode);
EXPORT_SYMBOL(zap_remove_norm);
EXPORT_SYMBOL(zap_remove_uint64);
EXPORT_SYMBOL(zap_remove_uint64_by_dnode);
EXPORT_SYMBOL(zap_count);
EXPORT_SYMBOL(zap_value_search);
EXPORT_SYMBOL(zap_join);
EXPORT_SYMBOL(zap_join_increment);
EXPORT_SYMBOL(zap_add_int);
EXPORT_SYMBOL(zap_remove_int);
EXPORT_SYMBOL(zap_lookup_int);
EXPORT_SYMBOL(zap_increment_int);
EXPORT_SYMBOL(zap_add_int_key);
EXPORT_SYMBOL(zap_lookup_int_key);
EXPORT_SYMBOL(zap_increment);
EXPORT_SYMBOL(zap_cursor_init);
EXPORT_SYMBOL(zap_cursor_fini);
EXPORT_SYMBOL(zap_cursor_retrieve);
EXPORT_SYMBOL(zap_cursor_advance);
EXPORT_SYMBOL(zap_cursor_serialize);
EXPORT_SYMBOL(zap_cursor_init_serialized);
EXPORT_SYMBOL(zap_get_stats);

/* CSTYLED */
ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW,
	"Maximum micro ZAP size, before converting to a fat ZAP, in bytes");
#endif