1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 25 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 26 * Copyright 2017 Nexenta Systems, Inc. 
27 */ 28 29 #include <sys/zio.h> 30 #include <sys/spa.h> 31 #include <sys/dmu.h> 32 #include <sys/zfs_context.h> 33 #include <sys/zap.h> 34 #include <sys/zap_impl.h> 35 #include <sys/zap_leaf.h> 36 #include <sys/btree.h> 37 #include <sys/arc.h> 38 #include <sys/dmu_objset.h> 39 40 #ifdef _KERNEL 41 #include <sys/sunddi.h> 42 #endif 43 44 static int mzap_upgrade(zap_t **zapp, 45 const void *tag, dmu_tx_t *tx, zap_flags_t flags); 46 47 uint64_t 48 zap_getflags(zap_t *zap) 49 { 50 if (zap->zap_ismicro) 51 return (0); 52 return (zap_f_phys(zap)->zap_flags); 53 } 54 55 int 56 zap_hashbits(zap_t *zap) 57 { 58 if (zap_getflags(zap) & ZAP_FLAG_HASH64) 59 return (48); 60 else 61 return (28); 62 } 63 64 uint32_t 65 zap_maxcd(zap_t *zap) 66 { 67 if (zap_getflags(zap) & ZAP_FLAG_HASH64) 68 return ((1<<16)-1); 69 else 70 return (-1U); 71 } 72 73 static uint64_t 74 zap_hash(zap_name_t *zn) 75 { 76 zap_t *zap = zn->zn_zap; 77 uint64_t h = 0; 78 79 if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) { 80 ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY); 81 h = *(uint64_t *)zn->zn_key_orig; 82 } else { 83 h = zap->zap_salt; 84 ASSERT(h != 0); 85 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 86 87 if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { 88 const uint64_t *wp = zn->zn_key_norm; 89 90 ASSERT(zn->zn_key_intlen == 8); 91 for (int i = 0; i < zn->zn_key_norm_numints; 92 wp++, i++) { 93 uint64_t word = *wp; 94 95 for (int j = 0; j < 8; j++) { 96 h = (h >> 8) ^ 97 zfs_crc64_table[(h ^ word) & 0xFF]; 98 word >>= NBBY; 99 } 100 } 101 } else { 102 const uint8_t *cp = zn->zn_key_norm; 103 104 /* 105 * We previously stored the terminating null on 106 * disk, but didn't hash it, so we need to 107 * continue to not hash it. (The 108 * zn_key_*_numints includes the terminating 109 * null for non-binary keys.) 
110 */ 111 int len = zn->zn_key_norm_numints - 1; 112 113 ASSERT(zn->zn_key_intlen == 1); 114 for (int i = 0; i < len; cp++, i++) { 115 h = (h >> 8) ^ 116 zfs_crc64_table[(h ^ *cp) & 0xFF]; 117 } 118 } 119 } 120 /* 121 * Don't use all 64 bits, since we need some in the cookie for 122 * the collision differentiator. We MUST use the high bits, 123 * since those are the ones that we first pay attention to when 124 * choosing the bucket. 125 */ 126 h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1); 127 128 return (h); 129 } 130 131 static int 132 zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags) 133 { 134 ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY)); 135 136 size_t inlen = strlen(name) + 1; 137 size_t outlen = ZAP_MAXNAMELEN; 138 139 int err = 0; 140 (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, 141 normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID, 142 U8_UNICODE_LATEST, &err); 143 144 return (err); 145 } 146 147 boolean_t 148 zap_match(zap_name_t *zn, const char *matchname) 149 { 150 ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY)); 151 152 if (zn->zn_matchtype & MT_NORMALIZE) { 153 char norm[ZAP_MAXNAMELEN]; 154 155 if (zap_normalize(zn->zn_zap, matchname, norm, 156 zn->zn_normflags) != 0) 157 return (B_FALSE); 158 159 return (strcmp(zn->zn_key_norm, norm) == 0); 160 } else { 161 return (strcmp(zn->zn_key_orig, matchname) == 0); 162 } 163 } 164 165 static zap_name_t * 166 zap_name_alloc(zap_t *zap) 167 { 168 zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); 169 zn->zn_zap = zap; 170 return (zn); 171 } 172 173 void 174 zap_name_free(zap_name_t *zn) 175 { 176 kmem_free(zn, sizeof (zap_name_t)); 177 } 178 179 static int 180 zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt) 181 { 182 zap_t *zap = zn->zn_zap; 183 184 zn->zn_key_intlen = sizeof (*key); 185 zn->zn_key_orig = key; 186 zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1; 187 zn->zn_matchtype = mt; 188 
zn->zn_normflags = zap->zap_normflags; 189 190 /* 191 * If we're dealing with a case sensitive lookup on a mixed or 192 * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup 193 * will fold case to all caps overriding the lookup request. 194 */ 195 if (mt & MT_MATCH_CASE) 196 zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER; 197 198 if (zap->zap_normflags) { 199 /* 200 * We *must* use zap_normflags because this normalization is 201 * what the hash is computed from. 202 */ 203 if (zap_normalize(zap, key, zn->zn_normbuf, 204 zap->zap_normflags) != 0) 205 return (SET_ERROR(ENOTSUP)); 206 zn->zn_key_norm = zn->zn_normbuf; 207 zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; 208 } else { 209 if (mt != 0) 210 return (SET_ERROR(ENOTSUP)); 211 zn->zn_key_norm = zn->zn_key_orig; 212 zn->zn_key_norm_numints = zn->zn_key_orig_numints; 213 } 214 215 zn->zn_hash = zap_hash(zn); 216 217 if (zap->zap_normflags != zn->zn_normflags) { 218 /* 219 * We *must* use zn_normflags because this normalization is 220 * what the matching is based on. (Not the hash!) 
221 */ 222 if (zap_normalize(zap, key, zn->zn_normbuf, 223 zn->zn_normflags) != 0) 224 return (SET_ERROR(ENOTSUP)); 225 zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; 226 } 227 228 return (0); 229 } 230 231 zap_name_t * 232 zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt) 233 { 234 zap_name_t *zn = zap_name_alloc(zap); 235 if (zap_name_init_str(zn, key, mt) != 0) { 236 zap_name_free(zn); 237 return (NULL); 238 } 239 return (zn); 240 } 241 242 static zap_name_t * 243 zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) 244 { 245 zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); 246 247 ASSERT(zap->zap_normflags == 0); 248 zn->zn_zap = zap; 249 zn->zn_key_intlen = sizeof (*key); 250 zn->zn_key_orig = zn->zn_key_norm = key; 251 zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints; 252 zn->zn_matchtype = 0; 253 254 zn->zn_hash = zap_hash(zn); 255 return (zn); 256 } 257 258 static void 259 mzap_byteswap(mzap_phys_t *buf, size_t size) 260 { 261 buf->mz_block_type = BSWAP_64(buf->mz_block_type); 262 buf->mz_salt = BSWAP_64(buf->mz_salt); 263 buf->mz_normflags = BSWAP_64(buf->mz_normflags); 264 int max = (size / MZAP_ENT_LEN) - 1; 265 for (int i = 0; i < max; i++) { 266 buf->mz_chunk[i].mze_value = 267 BSWAP_64(buf->mz_chunk[i].mze_value); 268 buf->mz_chunk[i].mze_cd = 269 BSWAP_32(buf->mz_chunk[i].mze_cd); 270 } 271 } 272 273 void 274 zap_byteswap(void *buf, size_t size) 275 { 276 uint64_t block_type = *(uint64_t *)buf; 277 278 if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { 279 /* ASSERT(magic == ZAP_LEAF_MAGIC); */ 280 mzap_byteswap(buf, size); 281 } else { 282 fzap_byteswap(buf, size); 283 } 284 } 285 286 static int 287 mze_compare(const void *arg1, const void *arg2) 288 { 289 const mzap_ent_t *mze1 = arg1; 290 const mzap_ent_t *mze2 = arg2; 291 292 return (TREE_CMP((uint64_t)(mze1->mze_hash) << 32 | mze1->mze_cd, 293 (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd)); 294 } 295 296 static void 
297 mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash) 298 { 299 mzap_ent_t mze; 300 301 ASSERT(zap->zap_ismicro); 302 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); 303 304 mze.mze_chunkid = chunkid; 305 ASSERT0(hash & 0xffffffff); 306 mze.mze_hash = hash >> 32; 307 ASSERT3U(MZE_PHYS(zap, &mze)->mze_cd, <=, 0xffff); 308 mze.mze_cd = (uint16_t)MZE_PHYS(zap, &mze)->mze_cd; 309 ASSERT(MZE_PHYS(zap, &mze)->mze_name[0] != 0); 310 zfs_btree_add(&zap->zap_m.zap_tree, &mze); 311 } 312 313 static mzap_ent_t * 314 mze_find(zap_name_t *zn, zfs_btree_index_t *idx) 315 { 316 mzap_ent_t mze_tofind; 317 mzap_ent_t *mze; 318 zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree; 319 320 ASSERT(zn->zn_zap->zap_ismicro); 321 ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); 322 323 ASSERT0(zn->zn_hash & 0xffffffff); 324 mze_tofind.mze_hash = zn->zn_hash >> 32; 325 mze_tofind.mze_cd = 0; 326 327 mze = zfs_btree_find(tree, &mze_tofind, idx); 328 if (mze == NULL) 329 mze = zfs_btree_next(tree, idx, idx); 330 for (; mze && mze->mze_hash == mze_tofind.mze_hash; 331 mze = zfs_btree_next(tree, idx, idx)) { 332 ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); 333 if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) 334 return (mze); 335 } 336 337 return (NULL); 338 } 339 340 static uint32_t 341 mze_find_unused_cd(zap_t *zap, uint64_t hash) 342 { 343 mzap_ent_t mze_tofind; 344 zfs_btree_index_t idx; 345 zfs_btree_t *tree = &zap->zap_m.zap_tree; 346 347 ASSERT(zap->zap_ismicro); 348 ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); 349 350 ASSERT0(hash & 0xffffffff); 351 hash >>= 32; 352 mze_tofind.mze_hash = hash; 353 mze_tofind.mze_cd = 0; 354 355 uint32_t cd = 0; 356 for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx); 357 mze && mze->mze_hash == hash; 358 mze = zfs_btree_next(tree, &idx, &idx)) { 359 if (mze->mze_cd != cd) 360 break; 361 cd++; 362 } 363 364 return (cd); 365 } 366 367 /* 368 * Each mzap entry requires at max : 4 chunks 369 * 3 chunks for names + 1 chunk for value. 
370 */ 371 #define MZAP_ENT_CHUNKS (1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \ 372 ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t))) 373 374 /* 375 * Check if the current entry keeps the colliding entries under the fatzap leaf 376 * size. 377 */ 378 static boolean_t 379 mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash) 380 { 381 zap_t *zap = zn->zn_zap; 382 mzap_ent_t mze_tofind; 383 zfs_btree_index_t idx; 384 zfs_btree_t *tree = &zap->zap_m.zap_tree; 385 uint32_t mzap_ents = 0; 386 387 ASSERT0(hash & 0xffffffff); 388 hash >>= 32; 389 mze_tofind.mze_hash = hash; 390 mze_tofind.mze_cd = 0; 391 392 for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx); 393 mze && mze->mze_hash == hash; 394 mze = zfs_btree_next(tree, &idx, &idx)) { 395 mzap_ents++; 396 } 397 398 /* Include the new entry being added */ 399 mzap_ents++; 400 401 return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS)); 402 } 403 404 static void 405 mze_destroy(zap_t *zap) 406 { 407 zfs_btree_clear(&zap->zap_m.zap_tree); 408 zfs_btree_destroy(&zap->zap_m.zap_tree); 409 } 410 411 static zap_t * 412 mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) 413 { 414 zap_t *winner; 415 uint64_t *zap_hdr = (uint64_t *)db->db_data; 416 uint64_t zap_block_type = zap_hdr[0]; 417 uint64_t zap_magic = zap_hdr[1]; 418 419 ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); 420 421 zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); 422 rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL); 423 rw_enter(&zap->zap_rwlock, RW_WRITER); 424 zap->zap_objset = os; 425 zap->zap_object = obj; 426 zap->zap_dbuf = db; 427 428 if (zap_block_type != ZBT_MICRO) { 429 mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT, 430 0); 431 zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1; 432 if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) { 433 winner = NULL; /* No actual winner here... 
*/ 434 goto handle_winner; 435 } 436 } else { 437 zap->zap_ismicro = TRUE; 438 } 439 440 /* 441 * Make sure that zap_ismicro is set before we let others see 442 * it, because zap_lockdir() checks zap_ismicro without the lock 443 * held. 444 */ 445 dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf); 446 winner = dmu_buf_set_user(db, &zap->zap_dbu); 447 448 if (winner != NULL) 449 goto handle_winner; 450 451 if (zap->zap_ismicro) { 452 zap->zap_salt = zap_m_phys(zap)->mz_salt; 453 zap->zap_normflags = zap_m_phys(zap)->mz_normflags; 454 zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; 455 456 /* 457 * Reduce B-tree leaf from 4KB to 512 bytes to reduce memmove() 458 * overhead on massive inserts below. It still allows to store 459 * 62 entries before we have to add 2KB B-tree core node. 460 */ 461 zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare, 462 sizeof (mzap_ent_t), 512); 463 464 zap_name_t *zn = zap_name_alloc(zap); 465 for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) { 466 mzap_ent_phys_t *mze = 467 &zap_m_phys(zap)->mz_chunk[i]; 468 if (mze->mze_name[0]) { 469 zap->zap_m.zap_num_entries++; 470 zap_name_init_str(zn, mze->mze_name, 0); 471 mze_insert(zap, i, zn->zn_hash); 472 } 473 } 474 zap_name_free(zn); 475 } else { 476 zap->zap_salt = zap_f_phys(zap)->zap_salt; 477 zap->zap_normflags = zap_f_phys(zap)->zap_normflags; 478 479 ASSERT3U(sizeof (struct zap_leaf_header), ==, 480 2*ZAP_LEAF_CHUNKSIZE); 481 482 /* 483 * The embedded pointer table should not overlap the 484 * other members. 
485 */ 486 ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >, 487 &zap_f_phys(zap)->zap_salt); 488 489 /* 490 * The embedded pointer table should end at the end of 491 * the block 492 */ 493 ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap, 494 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) - 495 (uintptr_t)zap_f_phys(zap), ==, 496 zap->zap_dbuf->db_size); 497 } 498 rw_exit(&zap->zap_rwlock); 499 return (zap); 500 501 handle_winner: 502 rw_exit(&zap->zap_rwlock); 503 rw_destroy(&zap->zap_rwlock); 504 if (!zap->zap_ismicro) 505 mutex_destroy(&zap->zap_f.zap_num_entries_mtx); 506 kmem_free(zap, sizeof (zap_t)); 507 return (winner); 508 } 509 510 /* 511 * This routine "consumes" the caller's hold on the dbuf, which must 512 * have the specified tag. 513 */ 514 static int 515 zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx, 516 krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) 517 { 518 ASSERT0(db->db_offset); 519 objset_t *os = dmu_buf_get_objset(db); 520 uint64_t obj = db->db_object; 521 dmu_object_info_t doi; 522 523 *zapp = NULL; 524 525 dmu_object_info_from_db(db, &doi); 526 if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP) 527 return (SET_ERROR(EINVAL)); 528 529 zap_t *zap = dmu_buf_get_user(db); 530 if (zap == NULL) { 531 zap = mzap_open(os, obj, db); 532 if (zap == NULL) { 533 /* 534 * mzap_open() didn't like what it saw on-disk. 535 * Check for corruption! 536 */ 537 return (SET_ERROR(EIO)); 538 } 539 } 540 541 /* 542 * We're checking zap_ismicro without the lock held, in order to 543 * tell what type of lock we want. Once we have some sort of 544 * lock, see if it really is the right type. In practice this 545 * can only be different if it was upgraded from micro to fat, 546 * and micro wanted WRITER but fat only needs READER. 547 */ 548 krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; 549 rw_enter(&zap->zap_rwlock, lt); 550 if (lt != ((!zap->zap_ismicro && fatreader) ? 
RW_READER : lti)) { 551 /* it was upgraded, now we only need reader */ 552 ASSERT(lt == RW_WRITER); 553 ASSERT(RW_READER == 554 ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)); 555 rw_downgrade(&zap->zap_rwlock); 556 lt = RW_READER; 557 } 558 559 zap->zap_objset = os; 560 561 if (lt == RW_WRITER) 562 dmu_buf_will_dirty(db, tx); 563 564 ASSERT3P(zap->zap_dbuf, ==, db); 565 566 ASSERT(!zap->zap_ismicro || 567 zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); 568 if (zap->zap_ismicro && tx && adding && 569 zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { 570 uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; 571 if (newsz > MZAP_MAX_BLKSZ) { 572 dprintf("upgrading obj %llu: num_entries=%u\n", 573 (u_longlong_t)obj, zap->zap_m.zap_num_entries); 574 *zapp = zap; 575 int err = mzap_upgrade(zapp, tag, tx, 0); 576 if (err != 0) 577 rw_exit(&zap->zap_rwlock); 578 return (err); 579 } 580 VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx)); 581 zap->zap_m.zap_num_chunks = 582 db->db_size / MZAP_ENT_LEN - 1; 583 } 584 585 *zapp = zap; 586 return (0); 587 } 588 589 static int 590 zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, 591 krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, 592 zap_t **zapp) 593 { 594 dmu_buf_t *db; 595 596 int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); 597 if (err != 0) { 598 return (err); 599 } 600 #ifdef ZFS_DEBUG 601 { 602 dmu_object_info_t doi; 603 dmu_object_info_from_db(db, &doi); 604 ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); 605 } 606 #endif 607 608 err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); 609 if (err != 0) { 610 dmu_buf_rele(db, tag); 611 } 612 return (err); 613 } 614 615 int 616 zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, 617 krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, 618 zap_t **zapp) 619 { 620 dmu_buf_t *db; 621 622 int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); 623 if (err != 
0) 624 return (err); 625 #ifdef ZFS_DEBUG 626 { 627 dmu_object_info_t doi; 628 dmu_object_info_from_db(db, &doi); 629 ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); 630 } 631 #endif 632 err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); 633 if (err != 0) 634 dmu_buf_rele(db, tag); 635 return (err); 636 } 637 638 void 639 zap_unlockdir(zap_t *zap, const void *tag) 640 { 641 rw_exit(&zap->zap_rwlock); 642 dmu_buf_rele(zap->zap_dbuf, tag); 643 } 644 645 static int 646 mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags) 647 { 648 int err = 0; 649 zap_t *zap = *zapp; 650 651 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); 652 653 int sz = zap->zap_dbuf->db_size; 654 mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP); 655 memcpy(mzp, zap->zap_dbuf->db_data, sz); 656 int nchunks = zap->zap_m.zap_num_chunks; 657 658 if (!flags) { 659 err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, 660 1ULL << fzap_default_block_shift, 0, tx); 661 if (err != 0) { 662 vmem_free(mzp, sz); 663 return (err); 664 } 665 } 666 667 dprintf("upgrading obj=%llu with %u chunks\n", 668 (u_longlong_t)zap->zap_object, nchunks); 669 /* XXX destroy the tree later, so we can use the stored hash value */ 670 mze_destroy(zap); 671 672 fzap_upgrade(zap, tx, flags); 673 674 zap_name_t *zn = zap_name_alloc(zap); 675 for (int i = 0; i < nchunks; i++) { 676 mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; 677 if (mze->mze_name[0] == 0) 678 continue; 679 dprintf("adding %s=%llu\n", 680 mze->mze_name, (u_longlong_t)mze->mze_value); 681 zap_name_init_str(zn, mze->mze_name, 0); 682 /* If we fail here, we would end up losing entries */ 683 VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, 684 tag, tx)); 685 zap = zn->zn_zap; /* fzap_add_cd() may change zap */ 686 } 687 zap_name_free(zn); 688 vmem_free(mzp, sz); 689 *zapp = zap; 690 return (0); 691 } 692 693 /* 694 * The "normflags" determine the behavior of the matchtype_t which is 695 * passed to 
zap_lookup_norm(). Names which have the same normalized 696 * version will be stored with the same hash value, and therefore we can 697 * perform normalization-insensitive lookups. We can be Unicode form- 698 * insensitive and/or case-insensitive. The following flags are valid for 699 * "normflags": 700 * 701 * U8_TEXTPREP_NFC 702 * U8_TEXTPREP_NFD 703 * U8_TEXTPREP_NFKC 704 * U8_TEXTPREP_NFKD 705 * U8_TEXTPREP_TOUPPER 706 * 707 * The *_NF* (Normalization Form) flags are mutually exclusive; at most one 708 * of them may be supplied. 709 */ 710 void 711 mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx) 712 { 713 dmu_buf_t *db; 714 715 VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); 716 717 dmu_buf_will_dirty(db, tx); 718 mzap_phys_t *zp = db->db_data; 719 zp->mz_block_type = ZBT_MICRO; 720 zp->mz_salt = 721 ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL; 722 zp->mz_normflags = normflags; 723 724 if (flags != 0) { 725 zap_t *zap; 726 /* Only fat zap supports flags; upgrade immediately. 
*/ 727 VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER, 728 B_FALSE, B_FALSE, &zap)); 729 VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags)); 730 zap_unlockdir(zap, FTAG); 731 } else { 732 dmu_buf_rele(db, FTAG); 733 } 734 } 735 736 static uint64_t 737 zap_create_impl(objset_t *os, int normflags, zap_flags_t flags, 738 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, 739 dmu_object_type_t bonustype, int bonuslen, int dnodesize, 740 dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) 741 { 742 uint64_t obj; 743 744 ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); 745 746 if (allocated_dnode == NULL) { 747 dnode_t *dn; 748 obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift, 749 indirect_blockshift, bonustype, bonuslen, dnodesize, 750 &dn, FTAG, tx); 751 mzap_create_impl(dn, normflags, flags, tx); 752 dnode_rele(dn, FTAG); 753 } else { 754 obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift, 755 indirect_blockshift, bonustype, bonuslen, dnodesize, 756 allocated_dnode, tag, tx); 757 mzap_create_impl(*allocated_dnode, normflags, flags, tx); 758 } 759 760 return (obj); 761 } 762 763 int 764 zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, 765 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 766 { 767 return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen, 768 0, tx)); 769 } 770 771 int 772 zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot, 773 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) 774 { 775 return (zap_create_claim_norm_dnsize(os, obj, 776 0, ot, bonustype, bonuslen, dnodesize, tx)); 777 } 778 779 int 780 zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, 781 dmu_object_type_t ot, 782 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 783 { 784 return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype, 785 bonuslen, 0, tx)); 786 } 787 788 int 789 zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int 
normflags, 790 dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, 791 int dnodesize, dmu_tx_t *tx) 792 { 793 dnode_t *dn; 794 int error; 795 796 ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); 797 error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen, 798 dnodesize, tx); 799 if (error != 0) 800 return (error); 801 802 error = dnode_hold(os, obj, FTAG, &dn); 803 if (error != 0) 804 return (error); 805 806 mzap_create_impl(dn, normflags, 0, tx); 807 808 dnode_rele(dn, FTAG); 809 810 return (0); 811 } 812 813 uint64_t 814 zap_create(objset_t *os, dmu_object_type_t ot, 815 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 816 { 817 return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); 818 } 819 820 uint64_t 821 zap_create_dnsize(objset_t *os, dmu_object_type_t ot, 822 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) 823 { 824 return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen, 825 dnodesize, tx)); 826 } 827 828 uint64_t 829 zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, 830 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 831 { 832 return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen, 833 0, tx)); 834 } 835 836 uint64_t 837 zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot, 838 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) 839 { 840 return (zap_create_impl(os, normflags, 0, ot, 0, 0, 841 bonustype, bonuslen, dnodesize, NULL, NULL, tx)); 842 } 843 844 uint64_t 845 zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, 846 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, 847 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 848 { 849 return (zap_create_flags_dnsize(os, normflags, flags, ot, 850 leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx)); 851 } 852 853 uint64_t 854 zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags, 855 
dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, 856 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) 857 { 858 return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift, 859 indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL, 860 tx)); 861 } 862 863 /* 864 * Create a zap object and return a pointer to the newly allocated dnode via 865 * the allocated_dnode argument. The returned dnode will be held and the 866 * caller is responsible for releasing the hold by calling dnode_rele(). 867 */ 868 uint64_t 869 zap_create_hold(objset_t *os, int normflags, zap_flags_t flags, 870 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, 871 dmu_object_type_t bonustype, int bonuslen, int dnodesize, 872 dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) 873 { 874 return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift, 875 indirect_blockshift, bonustype, bonuslen, dnodesize, 876 allocated_dnode, tag, tx)); 877 } 878 879 int 880 zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) 881 { 882 /* 883 * dmu_object_free will free the object number and free the 884 * data. Freeing the data will cause our pageout function to be 885 * called, which will destroy our data (zap_leaf_t's and zap_t). 
886 */ 887 888 return (dmu_object_free(os, zapobj, tx)); 889 } 890 891 void 892 zap_evict_sync(void *dbu) 893 { 894 zap_t *zap = dbu; 895 896 rw_destroy(&zap->zap_rwlock); 897 898 if (zap->zap_ismicro) 899 mze_destroy(zap); 900 else 901 mutex_destroy(&zap->zap_f.zap_num_entries_mtx); 902 903 kmem_free(zap, sizeof (zap_t)); 904 } 905 906 int 907 zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) 908 { 909 zap_t *zap; 910 911 int err = 912 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); 913 if (err != 0) 914 return (err); 915 if (!zap->zap_ismicro) { 916 err = fzap_count(zap, count); 917 } else { 918 *count = zap->zap_m.zap_num_entries; 919 } 920 zap_unlockdir(zap, FTAG); 921 return (err); 922 } 923 924 /* 925 * zn may be NULL; if not specified, it will be computed if needed. 926 * See also the comment above zap_entry_normalization_conflict(). 927 */ 928 static boolean_t 929 mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze, 930 zfs_btree_index_t *idx) 931 { 932 boolean_t allocdzn = B_FALSE; 933 mzap_ent_t *other; 934 zfs_btree_index_t oidx; 935 936 if (zap->zap_normflags == 0) 937 return (B_FALSE); 938 939 for (other = zfs_btree_prev(&zap->zap_m.zap_tree, idx, &oidx); 940 other && other->mze_hash == mze->mze_hash; 941 other = zfs_btree_prev(&zap->zap_m.zap_tree, &oidx, &oidx)) { 942 943 if (zn == NULL) { 944 zn = zap_name_alloc_str(zap, 945 MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE); 946 allocdzn = B_TRUE; 947 } 948 if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { 949 if (allocdzn) 950 zap_name_free(zn); 951 return (B_TRUE); 952 } 953 } 954 955 for (other = zfs_btree_next(&zap->zap_m.zap_tree, idx, &oidx); 956 other && other->mze_hash == mze->mze_hash; 957 other = zfs_btree_next(&zap->zap_m.zap_tree, &oidx, &oidx)) { 958 959 if (zn == NULL) { 960 zn = zap_name_alloc_str(zap, 961 MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE); 962 allocdzn = B_TRUE; 963 } 964 if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { 965 if 
(allocdzn) 966 zap_name_free(zn); 967 return (B_TRUE); 968 } 969 } 970 971 if (allocdzn) 972 zap_name_free(zn); 973 return (B_FALSE); 974 } 975 976 /* 977 * Routines for manipulating attributes. 978 */ 979 980 int 981 zap_lookup(objset_t *os, uint64_t zapobj, const char *name, 982 uint64_t integer_size, uint64_t num_integers, void *buf) 983 { 984 return (zap_lookup_norm(os, zapobj, name, integer_size, 985 num_integers, buf, 0, NULL, 0, NULL)); 986 } 987 988 static int 989 zap_lookup_impl(zap_t *zap, const char *name, 990 uint64_t integer_size, uint64_t num_integers, void *buf, 991 matchtype_t mt, char *realname, int rn_len, 992 boolean_t *ncp) 993 { 994 int err = 0; 995 996 zap_name_t *zn = zap_name_alloc_str(zap, name, mt); 997 if (zn == NULL) 998 return (SET_ERROR(ENOTSUP)); 999 1000 if (!zap->zap_ismicro) { 1001 err = fzap_lookup(zn, integer_size, num_integers, buf, 1002 realname, rn_len, ncp); 1003 } else { 1004 zfs_btree_index_t idx; 1005 mzap_ent_t *mze = mze_find(zn, &idx); 1006 if (mze == NULL) { 1007 err = SET_ERROR(ENOENT); 1008 } else { 1009 if (num_integers < 1) { 1010 err = SET_ERROR(EOVERFLOW); 1011 } else if (integer_size != 8) { 1012 err = SET_ERROR(EINVAL); 1013 } else { 1014 *(uint64_t *)buf = 1015 MZE_PHYS(zap, mze)->mze_value; 1016 if (realname != NULL) 1017 (void) strlcpy(realname, 1018 MZE_PHYS(zap, mze)->mze_name, 1019 rn_len); 1020 if (ncp) { 1021 *ncp = mzap_normalization_conflict(zap, 1022 zn, mze, &idx); 1023 } 1024 } 1025 } 1026 } 1027 zap_name_free(zn); 1028 return (err); 1029 } 1030 1031 int 1032 zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, 1033 uint64_t integer_size, uint64_t num_integers, void *buf, 1034 matchtype_t mt, char *realname, int rn_len, 1035 boolean_t *ncp) 1036 { 1037 zap_t *zap; 1038 1039 int err = 1040 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); 1041 if (err != 0) 1042 return (err); 1043 err = zap_lookup_impl(zap, name, integer_size, 1044 num_integers, buf, mt, realname, 
rn_len, ncp); 1045 zap_unlockdir(zap, FTAG); 1046 return (err); 1047 } 1048 1049 int 1050 zap_prefetch(objset_t *os, uint64_t zapobj, const char *name) 1051 { 1052 zap_t *zap; 1053 int err; 1054 zap_name_t *zn; 1055 1056 err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); 1057 if (err) 1058 return (err); 1059 zn = zap_name_alloc_str(zap, name, 0); 1060 if (zn == NULL) { 1061 zap_unlockdir(zap, FTAG); 1062 return (SET_ERROR(ENOTSUP)); 1063 } 1064 1065 fzap_prefetch(zn); 1066 zap_name_free(zn); 1067 zap_unlockdir(zap, FTAG); 1068 return (err); 1069 } 1070 1071 int 1072 zap_lookup_by_dnode(dnode_t *dn, const char *name, 1073 uint64_t integer_size, uint64_t num_integers, void *buf) 1074 { 1075 return (zap_lookup_norm_by_dnode(dn, name, integer_size, 1076 num_integers, buf, 0, NULL, 0, NULL)); 1077 } 1078 1079 int 1080 zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, 1081 uint64_t integer_size, uint64_t num_integers, void *buf, 1082 matchtype_t mt, char *realname, int rn_len, 1083 boolean_t *ncp) 1084 { 1085 zap_t *zap; 1086 1087 int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, 1088 FTAG, &zap); 1089 if (err != 0) 1090 return (err); 1091 err = zap_lookup_impl(zap, name, integer_size, 1092 num_integers, buf, mt, realname, rn_len, ncp); 1093 zap_unlockdir(zap, FTAG); 1094 return (err); 1095 } 1096 1097 int 1098 zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, 1099 int key_numints) 1100 { 1101 zap_t *zap; 1102 1103 int err = 1104 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); 1105 if (err != 0) 1106 return (err); 1107 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); 1108 if (zn == NULL) { 1109 zap_unlockdir(zap, FTAG); 1110 return (SET_ERROR(ENOTSUP)); 1111 } 1112 1113 fzap_prefetch(zn); 1114 zap_name_free(zn); 1115 zap_unlockdir(zap, FTAG); 1116 return (err); 1117 } 1118 1119 int 1120 zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, 1121 int 
/*
 * NOTE(review): the two lines below complete zap_lookup_uint64(), whose
 * signature begins above this chunk.  Looks up a uint64-keyed (fat) ZAP
 * entry; uint64 keys are never stored in a microzap, hence the direct
 * fzap_lookup() with no zap_ismicro branch.
 */
key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
{
    zap_t *zap;

    int err =
        zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
    if (err != 0)
        return (err);
    zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
    if (zn == NULL) {
        /* object is not a uint64-keyed ZAP (or key is unusable) */
        zap_unlockdir(zap, FTAG);
        return (SET_ERROR(ENOTSUP));
    }

    err = fzap_lookup(zn, integer_size, num_integers, buf,
        NULL, 0, NULL);
    zap_name_free(zn);
    zap_unlockdir(zap, FTAG);
    return (err);
}

/*
 * Return 0 if the named entry exists, ENOENT if it does not, or another
 * error from the lookup.  A zero-length value buffer is passed, so a hit
 * normally comes back as EOVERFLOW/EINVAL ("found, but value didn't fit");
 * both are translated to success since the caller only asked for existence.
 */
int
zap_contains(objset_t *os, uint64_t zapobj, const char *name)
{
    int err = zap_lookup_norm(os, zapobj, name, 0,
        0, NULL, 0, NULL, 0, NULL);
    if (err == EOVERFLOW || err == EINVAL)
        err = 0; /* found, but skipped reading the value */
    return (err);
}

/*
 * Report the integer size and count of the named entry's value without
 * reading the value itself.  Either out-pointer may be NULL.  Microzap
 * entries are by definition a single uint64, so those are answered
 * directly; fat ZAPs defer to fzap_length().
 */
int
zap_length(objset_t *os, uint64_t zapobj, const char *name,
    uint64_t *integer_size, uint64_t *num_integers)
{
    zap_t *zap;

    int err =
        zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
    if (err != 0)
        return (err);
    zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
    if (zn == NULL) {
        zap_unlockdir(zap, FTAG);
        return (SET_ERROR(ENOTSUP));
    }
    if (!zap->zap_ismicro) {
        err = fzap_length(zn, integer_size, num_integers);
    } else {
        zfs_btree_index_t idx;
        mzap_ent_t *mze = mze_find(zn, &idx);
        if (mze == NULL) {
            err = SET_ERROR(ENOENT);
        } else {
            /* microzap values are always exactly one uint64 */
            if (integer_size)
                *integer_size = 8;
            if (num_integers)
                *num_integers = 1;
        }
    }
    zap_name_free(zn);
    zap_unlockdir(zap, FTAG);
    return (err);
}

/*
 * zap_length() counterpart for uint64-keyed ZAPs (always fat ZAPs).
 */
int
zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, uint64_t *integer_size, uint64_t *num_integers)
{
    zap_t *zap;

    int err =
        zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
    if (err != 0)
        return (err);
    zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
    if (zn == NULL) {
        zap_unlockdir(zap, FTAG);
        return (SET_ERROR(ENOTSUP));
    }
    err = fzap_length(zn, integer_size, num_integers);
    zap_name_free(zn);
    zap_unlockdir(zap, FTAG);
    return (err);
}

/*
 * Insert a new entry into a microzap.  The caller must hold the zap rwlock
 * as writer and must already have verified that the entry fits (8-byte
 * value, short name, free chunk available): running out of chunks here is
 * treated as a programming error and panics.
 *
 * Chunk scanning starts at zap_alloc_next (a round-robin hint) and wraps
 * to 0 once, so the scan covers every chunk exactly once.
 */
static void
mzap_addent(zap_name_t *zn, uint64_t value)
{
    zap_t *zap = zn->zn_zap;
    uint16_t start = zap->zap_m.zap_alloc_next;

    ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

#ifdef ZFS_DEBUG
    /* debug builds verify the key is not already present */
    for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
        mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
        ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
    }
#endif

    uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash);
    /* given the limited size of the microzap, this can't happen */
    ASSERT(cd < zap_maxcd(zap));

again:
    for (uint16_t i = start; i < zap->zap_m.zap_num_chunks; i++) {
        mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
        /* an empty name marks a free chunk */
        if (mze->mze_name[0] == 0) {
            mze->mze_value = value;
            mze->mze_cd = cd;
            (void) strlcpy(mze->mze_name, zn->zn_key_orig,
                sizeof (mze->mze_name));
            zap->zap_m.zap_num_entries++;
            zap->zap_m.zap_alloc_next = i+1;
            if (zap->zap_m.zap_alloc_next ==
                zap->zap_m.zap_num_chunks)
                zap->zap_m.zap_alloc_next = 0;
            /* mirror the on-disk entry into the in-core avl/btree */
            mze_insert(zap, i, zn->zn_hash);
            return;
        }
    }
    if (start != 0) {
        /* wrap around and scan the chunks before the hint */
        start = 0;
        goto again;
    }
    cmn_err(CE_PANIC, "out of entries!");
}

/*
 * Common add path shared by zap_add() and zap_add_by_dnode().
 *
 * Lock ownership: the caller passes in a locked zap; this function ALWAYS
 * consumes the lock — it calls zap_unlockdir(tag) on every path (unless
 * fzap_add() failed and left zn->zn_zap NULL).  A microzap that cannot
 * hold the new entry (value not a single uint64, name too long, or the
 * destination fat-ZAP leaf would overflow) is first upgraded to a fat ZAP.
 * Note that fzap_add()/mzap_upgrade() may replace the zap_t, so it is
 * re-read from zn->zn_zap afterwards.  Returns EEXIST if the key is
 * already present.
 */
static int
zap_add_impl(zap_t *zap, const char *key,
    int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx, const void *tag)
{
    const uint64_t *intval = val;
    int err = 0;

    zap_name_t *zn = zap_name_alloc_str(zap, key, 0);
    if (zn == NULL) {
        zap_unlockdir(zap, tag);
        return (SET_ERROR(ENOTSUP));
    }
    if (!zap->zap_ismicro) {
        err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
        zap = zn->zn_zap;   /* fzap_add() may change zap */
    } else if (integer_size != 8 || num_integers != 1 ||
        strlen(key) >= MZAP_NAME_LEN ||
        !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
        /* entry doesn't fit in a microzap: upgrade, then add */
        err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
        if (err == 0) {
            err = fzap_add(zn, integer_size, num_integers, val,
                tag, tx);
        }
        zap = zn->zn_zap;   /* fzap_add() may change zap */
    } else {
        zfs_btree_index_t idx;
        if (mze_find(zn, &idx) != NULL) {
            err = SET_ERROR(EEXIST);
        } else {
            mzap_addent(zn, *intval);
        }
    }
    ASSERT(zap == zn->zn_zap);
    zap_name_free(zn);
    if (zap != NULL)    /* may be NULL if fzap_add() failed */
        zap_unlockdir(zap, tag);
    return (err);
}

/*
 * Add an entry by object number.  Fails with EEXIST if the name is
 * already present.
 */
int
zap_add(objset_t *os, uint64_t zapobj, const char *key,
    int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
    zap_t *zap;
    int err;

    err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
    if (err != 0)
        return (err);
    err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
    /* zap_add_impl() calls zap_unlockdir() */
    return (err);
}

/*
 * Like zap_add(), but takes an already-held dnode instead of an
 * (objset, object) pair, avoiding a dnode lookup.
 */
int
zap_add_by_dnode(dnode_t *dn, const char *key,
    int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
    zap_t *zap;
    int err;

    err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
    if (err != 0)
        return (err);
    err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
    /* zap_add_impl() calls zap_unlockdir() */
    return (err);
}

/*
 * Add an entry to a uint64-keyed (fat) ZAP.
 */
int
zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
    zap_t *zap;

    int err =
        zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
    if (err != 0)
        return (err);
    zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
    if (zn == NULL) {
        zap_unlockdir(zap, FTAG);
        return (SET_ERROR(ENOTSUP));
    }
    err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
    zap = zn->zn_zap;   /* fzap_add() may change zap */
    zap_name_free(zn);
    if (zap != NULL)    /* may be NULL if fzap_add() failed */
        zap_unlockdir(zap, FTAG);
    return (err);
}

/*
 * Set the named entry's value, creating the entry if it does not exist.
 * As in zap_add_impl(), a value/name that cannot live in a microzap
 * forces an upgrade to a fat ZAP first, and zn->zn_zap is re-read because
 * the upgrade/fzap paths may swap the zap_t.
 */
int
zap_update(objset_t *os, uint64_t zapobj, const char *name,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
    zap_t *zap;
    const uint64_t *intval = val;

    int err =
        zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
    if (err != 0)
        return (err);
    zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
    if (zn == NULL) {
        zap_unlockdir(zap, FTAG);
        return (SET_ERROR(ENOTSUP));
    }
    if (!zap->zap_ismicro) {
        err = fzap_update(zn, integer_size, num_integers, val,
            FTAG, tx);
        zap = zn->zn_zap;   /* fzap_update() may change zap */
    } else if (integer_size != 8 || num_integers != 1 ||
        strlen(name) >= MZAP_NAME_LEN) {
        dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
            (u_longlong_t)zapobj, integer_size,
            (u_longlong_t)num_integers, name);
        err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
        if (err == 0) {
            err = fzap_update(zn, integer_size, num_integers,
                val, FTAG, tx);
        }
        zap = zn->zn_zap;   /* fzap_update() may change zap */
    } else {
        zfs_btree_index_t idx;
        mzap_ent_t *mze = mze_find(zn, &idx);
        if (mze != NULL) {
            /* existing microzap entry: overwrite in place */
            MZE_PHYS(zap, mze)->mze_value = *intval;
        } else {
            mzap_addent(zn, *intval);
        }
    }
    ASSERT(zap == zn->zn_zap);
    zap_name_free(zn);
    if (zap != NULL)    /* may be NULL if fzap_upgrade() failed */
        zap_unlockdir(zap, FTAG);
    return (err);
}

/*
 * zap_update() counterpart for uint64-keyed (fat) ZAPs.
 */
int
zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
    zap_t *zap;

    int err =
        zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
    if (err != 0)
        return (err);
    zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
    if (zn == NULL) {
        zap_unlockdir(zap, FTAG);
        return (SET_ERROR(ENOTSUP));
    }
    err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx);
    zap = zn->zn_zap;   /* fzap_update() may change zap */
    zap_name_free(zn);
    if (zap != NULL)    /* may be NULL if fzap_upgrade() failed */
        zap_unlockdir(zap, FTAG);
    return (err);
}

/*
 * Remove an entry with exact name matching (no normalization).
 */
int
zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
{
    return (zap_remove_norm(os, zapobj, name, 0, tx));
}

/*
 * Common removal path.  Unlike zap_add_impl(), this does NOT drop the
 * zap lock — the caller unlocks.  For a microzap, the on-disk chunk is
 * zeroed and the in-core btree entry removed; returns ENOENT if absent.
 */
static int
zap_remove_impl(zap_t *zap, const char *name,
    matchtype_t mt, dmu_tx_t *tx)
{
    int err = 0;

    zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
    if (zn == NULL)
        return (SET_ERROR(ENOTSUP));
    if (!zap->zap_ismicro) {
        err = fzap_remove(zn, tx);
    } else {
        zfs_btree_index_t idx;
        mzap_ent_t *mze = mze_find(zn, &idx);
        if (mze == NULL) {
            err = SET_ERROR(ENOENT);
        } else {
            zap->zap_m.zap_num_entries--;
            memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t));
            zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx);
        }
    }
    zap_name_free(zn);
    return (err);
}

/*
 * Remove an entry, optionally using normalized (e.g. case-insensitive)
 * name matching per mt.
 */
int
zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
    matchtype_t mt, dmu_tx_t *tx)
{
    zap_t *zap;
    int err;

    err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
    if (err)
        return (err);
    err = zap_remove_impl(zap, name, mt, tx);
    zap_unlockdir(zap, FTAG);
    return (err);
}

/*
 * Like zap_remove(), but takes an already-held dnode.
 */
int
zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
{
    zap_t *zap;
    int err;

    err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
    if (err)
        return (err);
    err = zap_remove_impl(zap, name, 0, tx);
    zap_unlockdir(zap, FTAG);
    return (err);
}

/*
 * Remove an entry from a uint64-keyed (fat) ZAP.
 */
int
zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, dmu_tx_t *tx)
{
    zap_t *zap;

    int err =
        zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
    if (err != 0)
        return (err);
    zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
    if (zn == NULL) {
        zap_unlockdir(zap, FTAG);
        return (SET_ERROR(ENOTSUP));
    }
    err = fzap_remove(zn, tx);
    zap_name_free(zn);
    zap_unlockdir(zap, FTAG);
    return (err);
}

/*
 * Routines for iterating over the attributes.
 */

/*
 * Shared cursor setup.  The zap is attached lazily (zc_zap stays NULL
 * until the first retrieve); "serialized" restores a position produced by
 * zap_cursor_serialize(), and "prefetch" controls whole-object prefetch.
 */
static void
zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
    uint64_t serialized, boolean_t prefetch)
{
    zc->zc_objset = os;
    zc->zc_zap = NULL;
    zc->zc_leaf = NULL;
    zc->zc_zapobj = zapobj;
    zc->zc_serialized = serialized;
    zc->zc_hash = 0;
    zc->zc_cd = 0;
    zc->zc_prefetch = prefetch;
}

/*
 * Resume iteration from a position previously saved with
 * zap_cursor_serialize().
 */
void
zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
    uint64_t serialized)
{
    zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
}

/*
 * Initialize a cursor at the beginning of the ZAP object. The entire
 * ZAP object will be prefetched.
 */
void
zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
{
    zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
}

/*
 * Initialize a cursor at the beginning, but request that we not prefetch
 * the entire ZAP object.
 */
void
zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
{
    zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
}

/*
 * Release everything a cursor holds: the zap dir lock reference and any
 * cached fat-ZAP leaf.  The locks are re-entered as reader because
 * zap_unlockdir()/zap_put_leaf() expect to drop a held lock.
 */
void
zap_cursor_fini(zap_cursor_t *zc)
{
    if (zc->zc_zap) {
        rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
        zap_unlockdir(zc->zc_zap, NULL);
        zc->zc_zap = NULL;
    }
    if (zc->zc_leaf) {
        rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
        zap_put_leaf(zc->zc_leaf);
        zc->zc_leaf = NULL;
    }
    zc->zc_objset = NULL;
}

/*
 * Pack the cursor position into a single uint64 that can later be handed
 * to zap_cursor_init_serialized().  -1ULL means end-of-iteration.
 */
uint64_t
zap_cursor_serialize(zap_cursor_t *zc)
{
    if (zc->zc_hash == -1ULL)
        return (-1ULL);
    if (zc->zc_zap == NULL)
        return (zc->zc_serialized);
    ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
    ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));

    /*
     * We want to keep the high 32 bits of the cursor zero if we can, so
     * that 32-bit programs can access this. So usually use a small
     * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
     * of the cursor.
     *
     * [ collision differentiator | zap_hashbits()-bit hash value ]
     */
    return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
        ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
}

/*
 * Fetch the attribute at the current cursor position into *za.  On first
 * use the zap is locked and the serialized position (if any) is unpacked
 * into (zc_hash, zc_cd).  Returns ENOENT when iteration is exhausted, in
 * which case zc_hash is set to -1ULL so later calls short-circuit.
 */
int
zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
{
    int err;

    if (zc->zc_hash == -1ULL)
        return (SET_ERROR(ENOENT));

    if (zc->zc_zap == NULL) {
        int hb;
        err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
            RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
        if (err != 0)
            return (err);

        /*
         * To support zap_cursor_init_serialized, advance, retrieve,
         * we must add to the existing zc_cd, which may already
         * be 1 due to the zap_cursor_advance.
         */
        ASSERT(zc->zc_hash == 0);
        hb = zap_hashbits(zc->zc_zap);
        zc->zc_hash = zc->zc_serialized << (64 - hb);
        zc->zc_cd += zc->zc_serialized >> hb;
        if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
            zc->zc_cd = 0;
    } else {
        rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
    }
    if (!zc->zc_zap->zap_ismicro) {
        err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
    } else {
        zfs_btree_index_t idx;
        mzap_ent_t mze_tofind;

        /* in-core microzap entries are keyed on (hash>>32, cd) */
        mze_tofind.mze_hash = zc->zc_hash >> 32;
        mze_tofind.mze_cd = zc->zc_cd;

        /* find the entry at, or the next one after, the cursor */
        mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree,
            &mze_tofind, &idx);
        if (mze == NULL) {
            mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree,
                &idx, &idx);
        }
        if (mze) {
            mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
            ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
            za->za_normalization_conflict =
                mzap_normalization_conflict(zc->zc_zap, NULL,
                mze, &idx);
            za->za_integer_length = 8;
            za->za_num_integers = 1;
            za->za_first_integer = mzep->mze_value;
            (void) strlcpy(za->za_name, mzep->mze_name,
                sizeof (za->za_name));
            /* leave the cursor on this entry; advance() moves past it */
            zc->zc_hash = (uint64_t)mze->mze_hash << 32;
            zc->zc_cd = mze->mze_cd;
            err = 0;
        } else {
            zc->zc_hash = -1ULL;
            err = SET_ERROR(ENOENT);
        }
    }
    rw_exit(&zc->zc_zap->zap_rwlock);
    return (err);
}

/*
 * Step the cursor past the entry last returned by zap_cursor_retrieve().
 * Bumping zc_cd is sufficient: retrieve() searches for the next
 * (hash, cd) position at or after the cursor.
 */
void
zap_cursor_advance(zap_cursor_t *zc)
{
    if (zc->zc_hash == -1ULL)
        return;
    zc->zc_cd++;
}

/*
 * Fill *zs with statistics about the ZAP object.  A microzap is a single
 * block, reported directly; fat ZAP stats come from fzap_get_stats().
 */
int
zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
{
    zap_t *zap;

    int err =
        zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
    if (err != 0)
        return (err);

    memset(zs, 0, sizeof (zap_stats_t));

    if (zap->zap_ismicro) {
        zs->zs_blocksize = zap->zap_dbuf->db_size;
        zs->zs_num_entries = zap->zap_m.zap_num_entries;
        zs->zs_num_blocks = 1;
    } else {
        fzap_get_stats(zap, zs);
    }
    zap_unlockdir(zap, FTAG);
    return (0);
}

/* Symbols exported to the rest of the kernel (Linux module interface). */
#if defined(_KERNEL)
EXPORT_SYMBOL(zap_create);
EXPORT_SYMBOL(zap_create_dnsize);
EXPORT_SYMBOL(zap_create_norm);
EXPORT_SYMBOL(zap_create_norm_dnsize);
EXPORT_SYMBOL(zap_create_flags);
EXPORT_SYMBOL(zap_create_flags_dnsize);
EXPORT_SYMBOL(zap_create_claim);
EXPORT_SYMBOL(zap_create_claim_norm);
EXPORT_SYMBOL(zap_create_claim_norm_dnsize);
EXPORT_SYMBOL(zap_create_hold);
EXPORT_SYMBOL(zap_destroy);
EXPORT_SYMBOL(zap_lookup);
EXPORT_SYMBOL(zap_lookup_by_dnode);
EXPORT_SYMBOL(zap_lookup_norm);
EXPORT_SYMBOL(zap_lookup_uint64);
EXPORT_SYMBOL(zap_contains);
EXPORT_SYMBOL(zap_prefetch);
EXPORT_SYMBOL(zap_prefetch_uint64);
EXPORT_SYMBOL(zap_add);
EXPORT_SYMBOL(zap_add_by_dnode);
EXPORT_SYMBOL(zap_add_uint64);
EXPORT_SYMBOL(zap_update);
EXPORT_SYMBOL(zap_update_uint64);
EXPORT_SYMBOL(zap_length);
EXPORT_SYMBOL(zap_length_uint64);
EXPORT_SYMBOL(zap_remove);
EXPORT_SYMBOL(zap_remove_by_dnode);
EXPORT_SYMBOL(zap_remove_norm);
EXPORT_SYMBOL(zap_remove_uint64);
EXPORT_SYMBOL(zap_count);
EXPORT_SYMBOL(zap_value_search);
EXPORT_SYMBOL(zap_join);
EXPORT_SYMBOL(zap_join_increment);
EXPORT_SYMBOL(zap_add_int);
EXPORT_SYMBOL(zap_remove_int);
EXPORT_SYMBOL(zap_lookup_int);
EXPORT_SYMBOL(zap_increment_int);
EXPORT_SYMBOL(zap_add_int_key);
EXPORT_SYMBOL(zap_lookup_int_key);
EXPORT_SYMBOL(zap_increment);
EXPORT_SYMBOL(zap_cursor_init);
EXPORT_SYMBOL(zap_cursor_fini);
EXPORT_SYMBOL(zap_cursor_retrieve);
EXPORT_SYMBOL(zap_cursor_advance);
EXPORT_SYMBOL(zap_cursor_serialize);
EXPORT_SYMBOL(zap_cursor_init_serialized);
EXPORT_SYMBOL(zap_get_stats);
#endif