1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/spa.h> 27 #include <sys/dmu.h> 28 #include <sys/zfs_context.h> 29 #include <sys/zap.h> 30 #include <sys/refcount.h> 31 #include <sys/zap_impl.h> 32 #include <sys/zap_leaf.h> 33 #include <sys/avl.h> 34 35 #ifdef _KERNEL 36 #include <sys/sunddi.h> 37 #endif 38 39 static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx); 40 41 42 static uint64_t 43 zap_hash(zap_t *zap, const char *normname) 44 { 45 const uint8_t *cp; 46 uint8_t c; 47 uint64_t crc = zap->zap_salt; 48 49 /* NB: name must already be normalized, if necessary */ 50 51 ASSERT(crc != 0); 52 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 53 for (cp = (const uint8_t *)normname; (c = *cp) != '\0'; cp++) { 54 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; 55 } 56 57 /* 58 * Only use 28 bits, since we need 4 bits in the cookie for the 59 * collision differentiator. We MUST use the high bits, since 60 * those are the ones that we first pay attention to when 61 * chosing the bucket. 62 */ 63 crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); 64 65 return (crc); 66 } 67 68 static int 69 zap_normalize(zap_t *zap, const char *name, char *namenorm) 70 { 71 size_t inlen, outlen; 72 int err; 73 74 inlen = strlen(name) + 1; 75 outlen = ZAP_MAXNAMELEN; 76 77 err = 0; 78 (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, 79 zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL | 80 U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &err); 81 82 return (err); 83 } 84 85 boolean_t 86 zap_match(zap_name_t *zn, const char *matchname) 87 { 88 if (zn->zn_matchtype == MT_FIRST) { 89 char norm[ZAP_MAXNAMELEN]; 90 91 if (zap_normalize(zn->zn_zap, matchname, norm) != 0) 92 return (B_FALSE); 93 94 return (strcmp(zn->zn_name_norm, norm) == 0); 95 } else { 96 /* MT_BEST or MT_EXACT */ 97 return (strcmp(zn->zn_name_orij, matchname) == 0); 98 } 99 } 100 101 void 102 zap_name_free(zap_name_t *zn) 103 { 104 kmem_free(zn, sizeof (zap_name_t)); 105 } 106 107 /* XXX combine this with zap_lockdir()? */ 108 zap_name_t * 109 zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt) 110 { 111 zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); 112 113 zn->zn_zap = zap; 114 zn->zn_name_orij = name; 115 zn->zn_matchtype = mt; 116 if (zap->zap_normflags) { 117 if (zap_normalize(zap, name, zn->zn_normbuf) != 0) { 118 zap_name_free(zn); 119 return (NULL); 120 } 121 zn->zn_name_norm = zn->zn_normbuf; 122 } else { 123 if (mt != MT_EXACT) { 124 zap_name_free(zn); 125 return (NULL); 126 } 127 zn->zn_name_norm = zn->zn_name_orij; 128 } 129 130 zn->zn_hash = zap_hash(zap, zn->zn_name_norm); 131 return (zn); 132 } 133 134 static void 135 mzap_byteswap(mzap_phys_t *buf, size_t size) 136 { 137 int i, max; 138 buf->mz_block_type = BSWAP_64(buf->mz_block_type); 139 buf->mz_salt = BSWAP_64(buf->mz_salt); 140 buf->mz_normflags = BSWAP_64(buf->mz_normflags); 141 max = (size / MZAP_ENT_LEN) - 1; 142 for (i = 0; i < max; i++) { 143 buf->mz_chunk[i].mze_value = 144 BSWAP_64(buf->mz_chunk[i].mze_value); 145 buf->mz_chunk[i].mze_cd = 146 BSWAP_32(buf->mz_chunk[i].mze_cd); 147 } 148 } 149 150 void 151 zap_byteswap(void *buf, size_t size) 152 { 153 uint64_t block_type; 154 155 block_type = *(uint64_t *)buf; 156 157 if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { 158 /* ASSERT(magic == ZAP_LEAF_MAGIC); */ 159 mzap_byteswap(buf, size); 160 } else { 161 fzap_byteswap(buf, size); 162 } 163 } 164 165 static int 166 mze_compare(const void *arg1, const void *arg2) 167 { 168 const mzap_ent_t *mze1 = arg1; 169 const mzap_ent_t *mze2 = arg2; 170 171 if (mze1->mze_hash > mze2->mze_hash) 172 return (+1); 173 if (mze1->mze_hash < mze2->mze_hash) 174 return (-1); 175 if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd) 176 return (+1); 177 if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd) 178 return (-1); 179 return (0); 180 } 181 182 static void 183 mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep) 184 { 185 mzap_ent_t *mze; 186 187 ASSERT(zap->zap_ismicro); 188 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); 189 ASSERT(mzep->mze_cd < ZAP_MAXCD); 190 191 mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); 192 mze->mze_chunkid = chunkid; 193 mze->mze_hash = hash; 194 mze->mze_phys = *mzep; 195 avl_add(&zap->zap_m.zap_avl, mze); 196 } 197 198 static mzap_ent_t * 199 mze_find(zap_name_t *zn) 200 { 201 mzap_ent_t mze_tofind; 202 mzap_ent_t *mze; 203 avl_index_t idx; 204 avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl; 205 206 ASSERT(zn->zn_zap->zap_ismicro); 207 ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); 208 209 if (strlen(zn->zn_name_norm) >= sizeof (mze_tofind.mze_phys.mze_name)) 210 return (NULL); 211 212 mze_tofind.mze_hash = zn->zn_hash; 213 mze_tofind.mze_phys.mze_cd = 0; 214 215 again: 216 mze = avl_find(avl, &mze_tofind, &idx); 217 if (mze == NULL) 218 mze = avl_nearest(avl, idx, AVL_AFTER); 219 for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { 220 if (zap_match(zn, mze->mze_phys.mze_name)) 221 return (mze); 222 } 223 if (zn->zn_matchtype == MT_BEST) { 224 zn->zn_matchtype = MT_FIRST; 225 goto again; 226 } 227 return (NULL); 228 } 229 230 static uint32_t 231 mze_find_unused_cd(zap_t *zap, uint64_t hash) 232 { 233 mzap_ent_t mze_tofind; 234 mzap_ent_t *mze; 235 avl_index_t idx; 236 avl_tree_t *avl = &zap->zap_m.zap_avl; 237 uint32_t cd; 238 239 ASSERT(zap->zap_ismicro); 240 ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); 241 242 mze_tofind.mze_hash = hash; 243 mze_tofind.mze_phys.mze_cd = 0; 244 245 cd = 0; 246 for (mze = avl_find(avl, &mze_tofind, &idx); 247 mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { 248 if (mze->mze_phys.mze_cd != cd) 249 break; 250 cd++; 251 } 252 253 return (cd); 254 } 255 256 static void 257 mze_remove(zap_t *zap, mzap_ent_t *mze) 258 { 259 ASSERT(zap->zap_ismicro); 260 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); 261 262 avl_remove(&zap->zap_m.zap_avl, mze); 263 kmem_free(mze, sizeof (mzap_ent_t)); 264 } 265 266 static void 267 mze_destroy(zap_t *zap) 268 { 269 mzap_ent_t *mze; 270 void *avlcookie = NULL; 271 272 while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)) 273 kmem_free(mze, sizeof (mzap_ent_t)); 274 avl_destroy(&zap->zap_m.zap_avl); 275 } 276 277 static zap_t * 278 mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) 279 { 280 zap_t *winner; 281 zap_t *zap; 282 int i; 283 284 ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); 285 286 zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); 287 rw_init(&zap->zap_rwlock, 0, 0, 0); 288 rw_enter(&zap->zap_rwlock, RW_WRITER); 289 zap->zap_objset = os; 290 zap->zap_object = obj; 291 zap->zap_dbuf = db; 292 293 if (*(uint64_t *)db->db_data != ZBT_MICRO) { 294 mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); 295 zap->zap_f.zap_block_shift = highbit(db->db_size) - 1; 296 } else { 297 zap->zap_ismicro = TRUE; 298 } 299 300 /* 301 * Make sure that zap_ismicro is set before we let others see 302 * it, because zap_lockdir() checks zap_ismicro without the lock 303 * held. 304 */ 305 winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict); 306 307 if (winner != NULL) { 308 rw_exit(&zap->zap_rwlock); 309 rw_destroy(&zap->zap_rwlock); 310 if (!zap->zap_ismicro) 311 mutex_destroy(&zap->zap_f.zap_num_entries_mtx); 312 kmem_free(zap, sizeof (zap_t)); 313 return (winner); 314 } 315 316 if (zap->zap_ismicro) { 317 zap->zap_salt = zap->zap_m.zap_phys->mz_salt; 318 zap->zap_normflags = zap->zap_m.zap_phys->mz_normflags; 319 zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; 320 avl_create(&zap->zap_m.zap_avl, mze_compare, 321 sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); 322 323 for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { 324 mzap_ent_phys_t *mze = 325 &zap->zap_m.zap_phys->mz_chunk[i]; 326 if (mze->mze_name[0]) { 327 zap_name_t *zn; 328 329 zap->zap_m.zap_num_entries++; 330 zn = zap_name_alloc(zap, mze->mze_name, 331 MT_EXACT); 332 mze_insert(zap, i, zn->zn_hash, mze); 333 zap_name_free(zn); 334 } 335 } 336 } else { 337 zap->zap_salt = zap->zap_f.zap_phys->zap_salt; 338 zap->zap_normflags = zap->zap_f.zap_phys->zap_normflags; 339 340 ASSERT3U(sizeof (struct zap_leaf_header), ==, 341 2*ZAP_LEAF_CHUNKSIZE); 342 343 /* 344 * The embedded pointer table should not overlap the 345 * other members. 346 */ 347 ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >, 348 &zap->zap_f.zap_phys->zap_salt); 349 350 /* 351 * The embedded pointer table should end at the end of 352 * the block 353 */ 354 ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap, 355 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) - 356 (uintptr_t)zap->zap_f.zap_phys, ==, 357 zap->zap_dbuf->db_size); 358 } 359 rw_exit(&zap->zap_rwlock); 360 return (zap); 361 } 362 363 int 364 zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, 365 krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) 366 { 367 zap_t *zap; 368 dmu_buf_t *db; 369 krw_t lt; 370 int err; 371 372 *zapp = NULL; 373 374 err = dmu_buf_hold(os, obj, 0, NULL, &db); 375 if (err) 376 return (err); 377 378 #ifdef ZFS_DEBUG 379 { 380 dmu_object_info_t doi; 381 dmu_object_info_from_db(db, &doi); 382 ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); 383 } 384 #endif 385 386 zap = dmu_buf_get_user(db); 387 if (zap == NULL) 388 zap = mzap_open(os, obj, db); 389 390 /* 391 * We're checking zap_ismicro without the lock held, in order to 392 * tell what type of lock we want. Once we have some sort of 393 * lock, see if it really is the right type. In practice this 394 * can only be different if it was upgraded from micro to fat, 395 * and micro wanted WRITER but fat only needs READER. 396 */ 397 lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; 398 rw_enter(&zap->zap_rwlock, lt); 399 if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { 400 /* it was upgraded, now we only need reader */ 401 ASSERT(lt == RW_WRITER); 402 ASSERT(RW_READER == 403 (!zap->zap_ismicro && fatreader) ? RW_READER : lti); 404 rw_downgrade(&zap->zap_rwlock); 405 lt = RW_READER; 406 } 407 408 zap->zap_objset = os; 409 410 if (lt == RW_WRITER) 411 dmu_buf_will_dirty(db, tx); 412 413 ASSERT3P(zap->zap_dbuf, ==, db); 414 415 ASSERT(!zap->zap_ismicro || 416 zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); 417 if (zap->zap_ismicro && tx && adding && 418 zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { 419 uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; 420 if (newsz > MZAP_MAX_BLKSZ) { 421 dprintf("upgrading obj %llu: num_entries=%u\n", 422 obj, zap->zap_m.zap_num_entries); 423 *zapp = zap; 424 return (mzap_upgrade(zapp, tx)); 425 } 426 err = dmu_object_set_blocksize(os, obj, newsz, 0, tx); 427 ASSERT3U(err, ==, 0); 428 zap->zap_m.zap_num_chunks = 429 db->db_size / MZAP_ENT_LEN - 1; 430 } 431 432 *zapp = zap; 433 return (0); 434 } 435 436 void 437 zap_unlockdir(zap_t *zap) 438 { 439 rw_exit(&zap->zap_rwlock); 440 dmu_buf_rele(zap->zap_dbuf, NULL); 441 } 442 443 static int 444 mzap_upgrade(zap_t **zapp, dmu_tx_t *tx) 445 { 446 mzap_phys_t *mzp; 447 int i, sz, nchunks, err; 448 zap_t *zap = *zapp; 449 450 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); 451 452 sz = zap->zap_dbuf->db_size; 453 mzp = kmem_alloc(sz, KM_SLEEP); 454 bcopy(zap->zap_dbuf->db_data, mzp, sz); 455 nchunks = zap->zap_m.zap_num_chunks; 456 457 err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, 458 1ULL << fzap_default_block_shift, 0, tx); 459 if (err) { 460 kmem_free(mzp, sz); 461 return (err); 462 } 463 464 dprintf("upgrading obj=%llu with %u chunks\n", 465 zap->zap_object, nchunks); 466 /* XXX destroy the avl later, so we can use the stored hash value */ 467 mze_destroy(zap); 468 469 fzap_upgrade(zap, tx); 470 471 for (i = 0; i < nchunks; i++) { 472 int err; 473 mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; 474 zap_name_t *zn; 475 if (mze->mze_name[0] == 0) 476 continue; 477 dprintf("adding %s=%llu\n", 478 mze->mze_name, mze->mze_value); 479 zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT); 480 err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tx); 481 zap = zn->zn_zap; /* fzap_add_cd() may change zap */ 482 zap_name_free(zn); 483 if (err) 484 break; 485 } 486 kmem_free(mzp, sz); 487 *zapp = zap; 488 return (err); 489 } 490 491 static void 492 mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx) 493 { 494 dmu_buf_t *db; 495 mzap_phys_t *zp; 496 497 VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db)); 498 499 #ifdef ZFS_DEBUG 500 { 501 dmu_object_info_t doi; 502 dmu_object_info_from_db(db, &doi); 503 ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); 504 } 505 #endif 506 507 dmu_buf_will_dirty(db, tx); 508 zp = db->db_data; 509 zp->mz_block_type = ZBT_MICRO; 510 zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; 511 zp->mz_normflags = normflags; 512 dmu_buf_rele(db, FTAG); 513 } 514 515 int 516 zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, 517 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 518 { 519 return (zap_create_claim_norm(os, obj, 520 0, ot, bonustype, bonuslen, tx)); 521 } 522 523 int 524 zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, 525 dmu_object_type_t ot, 526 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 527 { 528 int err; 529 530 err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx); 531 if (err != 0) 532 return (err); 533 mzap_create_impl(os, obj, normflags, tx); 534 return (0); 535 } 536 537 uint64_t 538 zap_create(objset_t *os, dmu_object_type_t ot, 539 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 540 { 541 return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); 542 } 543 544 uint64_t 545 zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, 546 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 547 { 548 uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); 549 550 mzap_create_impl(os, obj, normflags, tx); 551 return (obj); 552 } 553 554 int 555 zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) 556 { 557 /* 558 * dmu_object_free will free the object number and free the 559 * data. Freeing the data will cause our pageout function to be 560 * called, which will destroy our data (zap_leaf_t's and zap_t). 561 */ 562 563 return (dmu_object_free(os, zapobj, tx)); 564 } 565 566 _NOTE(ARGSUSED(0)) 567 void 568 zap_evict(dmu_buf_t *db, void *vzap) 569 { 570 zap_t *zap = vzap; 571 572 rw_destroy(&zap->zap_rwlock); 573 574 if (zap->zap_ismicro) 575 mze_destroy(zap); 576 else 577 mutex_destroy(&zap->zap_f.zap_num_entries_mtx); 578 579 kmem_free(zap, sizeof (zap_t)); 580 } 581 582 int 583 zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) 584 { 585 zap_t *zap; 586 int err; 587 588 err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); 589 if (err) 590 return (err); 591 if (!zap->zap_ismicro) { 592 err = fzap_count(zap, count); 593 } else { 594 *count = zap->zap_m.zap_num_entries; 595 } 596 zap_unlockdir(zap); 597 return (err); 598 } 599 600 /* 601 * zn may be NULL; if not specified, it will be computed if needed. 602 * See also the comment above zap_entry_normalization_conflict(). 603 */ 604 static boolean_t 605 mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze) 606 { 607 mzap_ent_t *other; 608 int direction = AVL_BEFORE; 609 boolean_t allocdzn = B_FALSE; 610 611 if (zap->zap_normflags == 0) 612 return (B_FALSE); 613 614 again: 615 for (other = avl_walk(&zap->zap_m.zap_avl, mze, direction); 616 other && other->mze_hash == mze->mze_hash; 617 other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { 618 619 if (zn == NULL) { 620 zn = zap_name_alloc(zap, mze->mze_phys.mze_name, 621 MT_FIRST); 622 allocdzn = B_TRUE; 623 } 624 if (zap_match(zn, other->mze_phys.mze_name)) { 625 if (allocdzn) 626 zap_name_free(zn); 627 return (B_TRUE); 628 } 629 } 630 631 if (direction == AVL_BEFORE) { 632 direction = AVL_AFTER; 633 goto again; 634 } 635 636 if (allocdzn) 637 zap_name_free(zn); 638 return (B_FALSE); 639 } 640 641 /* 642 * Routines for manipulating attributes. 643 */ 644 645 int 646 zap_lookup(objset_t *os, uint64_t zapobj, const char *name, 647 uint64_t integer_size, uint64_t num_integers, void *buf) 648 { 649 return (zap_lookup_norm(os, zapobj, name, integer_size, 650 num_integers, buf, MT_EXACT, NULL, 0, NULL)); 651 } 652 653 int 654 zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, 655 uint64_t integer_size, uint64_t num_integers, void *buf, 656 matchtype_t mt, char *realname, int rn_len, 657 boolean_t *ncp) 658 { 659 zap_t *zap; 660 int err; 661 mzap_ent_t *mze; 662 zap_name_t *zn; 663 664 err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); 665 if (err) 666 return (err); 667 zn = zap_name_alloc(zap, name, mt); 668 if (zn == NULL) { 669 zap_unlockdir(zap); 670 return (ENOTSUP); 671 } 672 673 if (!zap->zap_ismicro) { 674 err = fzap_lookup(zn, integer_size, num_integers, buf, 675 realname, rn_len, ncp); 676 } else { 677 mze = mze_find(zn); 678 if (mze == NULL) { 679 err = ENOENT; 680 } else { 681 if (num_integers < 1) { 682 err = EOVERFLOW; 683 } else if (integer_size != 8) { 684 err = EINVAL; 685 } else { 686 *(uint64_t *)buf = mze->mze_phys.mze_value; 687 (void) strlcpy(realname, 688 mze->mze_phys.mze_name, rn_len); 689 if (ncp) { 690 *ncp = mzap_normalization_conflict(zap, 691 zn, mze); 692 } 693 } 694 } 695 } 696 zap_name_free(zn); 697 zap_unlockdir(zap); 698 return (err); 699 } 700 701 int 702 zap_length(objset_t *os, uint64_t zapobj, const char *name, 703 uint64_t *integer_size, uint64_t *num_integers) 704 { 705 zap_t *zap; 706 int err; 707 mzap_ent_t *mze; 708 zap_name_t *zn; 709 710 err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); 711 if (err) 712 return (err); 713 zn = zap_name_alloc(zap, name, MT_EXACT); 714 if (zn == NULL) { 715 zap_unlockdir(zap); 716 return (ENOTSUP); 717 } 718 if (!zap->zap_ismicro) { 719 err = fzap_length(zn, integer_size, num_integers); 720 } else { 721 mze = mze_find(zn); 722 if (mze == NULL) { 723 err = ENOENT; 724 } else { 725 if (integer_size) 726 *integer_size = 8; 727 if (num_integers) 728 *num_integers = 1; 729 } 730 } 731 zap_name_free(zn); 732 zap_unlockdir(zap); 733 return (err); 734 } 735 736 static void 737 mzap_addent(zap_name_t *zn, uint64_t value) 738 { 739 int i; 740 zap_t *zap = zn->zn_zap; 741 int start = zap->zap_m.zap_alloc_next; 742 uint32_t cd; 743 744 dprintf("obj=%llu %s=%llu\n", zap->zap_object, 745 zn->zn_name_orij, value); 746 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); 747 748 #ifdef ZFS_DEBUG 749 for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { 750 mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; 751 ASSERT(strcmp(zn->zn_name_orij, mze->mze_name) != 0); 752 } 753 #endif 754 755 cd = mze_find_unused_cd(zap, zn->zn_hash); 756 /* given the limited size of the microzap, this can't happen */ 757 ASSERT(cd != ZAP_MAXCD); 758 759 again: 760 for (i = start; i < zap->zap_m.zap_num_chunks; i++) { 761 mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; 762 if (mze->mze_name[0] == 0) { 763 mze->mze_value = value; 764 mze->mze_cd = cd; 765 (void) strcpy(mze->mze_name, zn->zn_name_orij); 766 zap->zap_m.zap_num_entries++; 767 zap->zap_m.zap_alloc_next = i+1; 768 if (zap->zap_m.zap_alloc_next == 769 zap->zap_m.zap_num_chunks) 770 zap->zap_m.zap_alloc_next = 0; 771 mze_insert(zap, i, zn->zn_hash, mze); 772 return; 773 } 774 } 775 if (start != 0) { 776 start = 0; 777 goto again; 778 } 779 ASSERT(!"out of entries!"); 780 } 781 782 int 783 zap_add(objset_t *os, uint64_t zapobj, const char *name, 784 int integer_size, uint64_t num_integers, 785 const void *val, dmu_tx_t *tx) 786 { 787 zap_t *zap; 788 int err; 789 mzap_ent_t *mze; 790 const uint64_t *intval = val; 791 zap_name_t *zn; 792 793 err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); 794 if (err) 795 return (err); 796 zn = zap_name_alloc(zap, name, MT_EXACT); 797 if (zn == NULL) { 798 zap_unlockdir(zap); 799 return (ENOTSUP); 800 } 801 if (!zap->zap_ismicro) { 802 err = fzap_add(zn, integer_size, num_integers, val, tx); 803 zap = zn->zn_zap; /* fzap_add() may change zap */ 804 } else if (integer_size != 8 || num_integers != 1 || 805 strlen(name) >= MZAP_NAME_LEN) { 806 dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", 807 zapobj, integer_size, num_integers, name); 808 err = mzap_upgrade(&zn->zn_zap, tx); 809 if (err == 0) 810 err = fzap_add(zn, integer_size, num_integers, val, tx); 811 zap = zn->zn_zap; /* fzap_add() may change zap */ 812 } else { 813 mze = mze_find(zn); 814 if (mze != NULL) { 815 err = EEXIST; 816 } else { 817 mzap_addent(zn, *intval); 818 } 819 } 820 ASSERT(zap == zn->zn_zap); 821 zap_name_free(zn); 822 if (zap != NULL) /* may be NULL if fzap_add() failed */ 823 zap_unlockdir(zap); 824 return (err); 825 } 826 827 int 828 zap_update(objset_t *os, uint64_t zapobj, const char *name, 829 int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) 830 { 831 zap_t *zap; 832 mzap_ent_t *mze; 833 const uint64_t *intval = val; 834 zap_name_t *zn; 835 int err; 836 837 err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); 838 if (err) 839 return (err); 840 zn = zap_name_alloc(zap, name, MT_EXACT); 841 if (zn == NULL) { 842 zap_unlockdir(zap); 843 return (ENOTSUP); 844 } 845 if (!zap->zap_ismicro) { 846 err = fzap_update(zn, integer_size, num_integers, val, tx); 847 zap = zn->zn_zap; /* fzap_update() may change zap */ 848 } else if (integer_size != 8 || num_integers != 1 || 849 strlen(name) >= MZAP_NAME_LEN) { 850 dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", 851 zapobj, integer_size, num_integers, name); 852 err = mzap_upgrade(&zn->zn_zap, tx); 853 if (err == 0) 854 err = fzap_update(zn, integer_size, num_integers, 855 val, tx); 856 zap = zn->zn_zap; /* fzap_update() may change zap */ 857 } else { 858 mze = mze_find(zn); 859 if (mze != NULL) { 860 mze->mze_phys.mze_value = *intval; 861 zap->zap_m.zap_phys->mz_chunk 862 [mze->mze_chunkid].mze_value = *intval; 863 } else { 864 mzap_addent(zn, *intval); 865 } 866 } 867 ASSERT(zap == zn->zn_zap); 868 zap_name_free(zn); 869 if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ 870 zap_unlockdir(zap); 871 return (err); 872 } 873 874 int 875 zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) 876 { 877 return (zap_remove_norm(os, zapobj, name, MT_EXACT, tx)); 878 } 879 880 int 881 zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, 882 matchtype_t mt, dmu_tx_t *tx) 883 { 884 zap_t *zap; 885 int err; 886 mzap_ent_t *mze; 887 zap_name_t *zn; 888 889 err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap); 890 if (err) 891 return (err); 892 zn = zap_name_alloc(zap, name, mt); 893 if (zn == NULL) { 894 zap_unlockdir(zap); 895 return (ENOTSUP); 896 } 897 if (!zap->zap_ismicro) { 898 err = fzap_remove(zn, tx); 899 } else { 900 mze = mze_find(zn); 901 if (mze == NULL) { 902 err = ENOENT; 903 } else { 904 zap->zap_m.zap_num_entries--; 905 bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid], 906 sizeof (mzap_ent_phys_t)); 907 mze_remove(zap, mze); 908 } 909 } 910 zap_name_free(zn); 911 zap_unlockdir(zap); 912 return (err); 913 } 914 915 /* 916 * Routines for iterating over the attributes. 917 */ 918 919 /* 920 * We want to keep the high 32 bits of the cursor zero if we can, so 921 * that 32-bit programs can access this. So use a small hash value so 922 * we can fit 4 bits of cd into the 32-bit cursor. 923 * 924 * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ] 925 */ 926 void 927 zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, 928 uint64_t serialized) 929 { 930 zc->zc_objset = os; 931 zc->zc_zap = NULL; 932 zc->zc_leaf = NULL; 933 zc->zc_zapobj = zapobj; 934 if (serialized == -1ULL) { 935 zc->zc_hash = -1ULL; 936 zc->zc_cd = 0; 937 } else { 938 zc->zc_hash = serialized << (64-ZAP_HASHBITS); 939 zc->zc_cd = serialized >> ZAP_HASHBITS; 940 if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */ 941 zc->zc_cd = 0; 942 } 943 } 944 945 void 946 zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) 947 { 948 zap_cursor_init_serialized(zc, os, zapobj, 0); 949 } 950 951 void 952 zap_cursor_fini(zap_cursor_t *zc) 953 { 954 if (zc->zc_zap) { 955 rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); 956 zap_unlockdir(zc->zc_zap); 957 zc->zc_zap = NULL; 958 } 959 if (zc->zc_leaf) { 960 rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); 961 zap_put_leaf(zc->zc_leaf); 962 zc->zc_leaf = NULL; 963 } 964 zc->zc_objset = NULL; 965 } 966 967 uint64_t 968 zap_cursor_serialize(zap_cursor_t *zc) 969 { 970 if (zc->zc_hash == -1ULL) 971 return (-1ULL); 972 ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0); 973 ASSERT(zc->zc_cd < ZAP_MAXCD); 974 return ((zc->zc_hash >> (64-ZAP_HASHBITS)) | 975 ((uint64_t)zc->zc_cd << ZAP_HASHBITS)); 976 } 977 978 int 979 zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) 980 { 981 int err; 982 avl_index_t idx; 983 mzap_ent_t mze_tofind; 984 mzap_ent_t *mze; 985 986 if (zc->zc_hash == -1ULL) 987 return (ENOENT); 988 989 if (zc->zc_zap == NULL) { 990 err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, 991 RW_READER, TRUE, FALSE, &zc->zc_zap); 992 if (err) 993 return (err); 994 } else { 995 rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); 996 } 997 if (!zc->zc_zap->zap_ismicro) { 998 err = fzap_cursor_retrieve(zc->zc_zap, zc, za); 999 } else { 1000 err = ENOENT; 1001 1002 mze_tofind.mze_hash = zc->zc_hash; 1003 mze_tofind.mze_phys.mze_cd = zc->zc_cd; 1004 1005 mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); 1006 if (mze == NULL) { 1007 mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, 1008 idx, AVL_AFTER); 1009 } 1010 if (mze) { 1011 ASSERT(0 == bcmp(&mze->mze_phys, 1012 &zc->zc_zap->zap_m.zap_phys->mz_chunk 1013 [mze->mze_chunkid], sizeof (mze->mze_phys))); 1014 1015 za->za_normalization_conflict = 1016 mzap_normalization_conflict(zc->zc_zap, NULL, mze); 1017 za->za_integer_length = 8; 1018 za->za_num_integers = 1; 1019 za->za_first_integer = mze->mze_phys.mze_value; 1020 (void) strcpy(za->za_name, mze->mze_phys.mze_name); 1021 zc->zc_hash = mze->mze_hash; 1022 zc->zc_cd = mze->mze_phys.mze_cd; 1023 err = 0; 1024 } else { 1025 zc->zc_hash = -1ULL; 1026 } 1027 } 1028 rw_exit(&zc->zc_zap->zap_rwlock); 1029 return (err); 1030 } 1031 1032 void 1033 zap_cursor_advance(zap_cursor_t *zc) 1034 { 1035 if (zc->zc_hash == -1ULL) 1036 return; 1037 zc->zc_cd++; 1038 if (zc->zc_cd >= ZAP_MAXCD) { 1039 zc->zc_cd = 0; 1040 zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS); 1041 if (zc->zc_hash == 0) /* EOF */ 1042 zc->zc_hash = -1ULL; 1043 } 1044 } 1045 1046 int 1047 zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt) 1048 { 1049 int err = 0; 1050 mzap_ent_t *mze; 1051 zap_name_t *zn; 1052 1053 if (zc->zc_zap == NULL) { 1054 err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, 1055 RW_READER, TRUE, FALSE, &zc->zc_zap); 1056 if (err) 1057 return (err); 1058 } else { 1059 rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); 1060 } 1061 1062 zn = zap_name_alloc(zc->zc_zap, name, mt); 1063 if (zn == NULL) { 1064 rw_exit(&zc->zc_zap->zap_rwlock); 1065 return (ENOTSUP); 1066 } 1067 1068 if (!zc->zc_zap->zap_ismicro) { 1069 err = fzap_cursor_move_to_key(zc, zn); 1070 } else { 1071 mze = mze_find(zn); 1072 if (mze == NULL) { 1073 err = ENOENT; 1074 goto out; 1075 } 1076 zc->zc_hash = mze->mze_hash; 1077 zc->zc_cd = mze->mze_phys.mze_cd; 1078 } 1079 1080 out: 1081 zap_name_free(zn); 1082 rw_exit(&zc->zc_zap->zap_rwlock); 1083 return (err); 1084 } 1085 1086 int 1087 zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) 1088 { 1089 int err; 1090 zap_t *zap; 1091 1092 err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); 1093 if (err) 1094 return (err); 1095 1096 bzero(zs, sizeof (zap_stats_t)); 1097 1098 if (zap->zap_ismicro) { 1099 zs->zs_blocksize = zap->zap_dbuf->db_size; 1100 zs->zs_num_entries = zap->zap_m.zap_num_entries; 1101 zs->zs_num_blocks = 1; 1102 } else { 1103 fzap_get_stats(zap, zs); 1104 } 1105 zap_unlockdir(zap); 1106 return (0); 1107 } 1108 1109 int 1110 zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, 1111 uint64_t *towrite, uint64_t *tooverwrite) 1112 { 1113 zap_t *zap; 1114 int err = 0; 1115 1116 1117 /* 1118 * Since, we don't have a name, we cannot figure out which blocks will 1119 * be affected in this operation. So, account for the worst case : 1120 * - 3 blocks overwritten: target leaf, ptrtbl block, header block 1121 * - 4 new blocks written if adding: 1122 * - 2 blocks for possibly split leaves, 1123 * - 2 grown ptrtbl blocks 1124 * 1125 * This also accomodates the case where an add operation to a fairly 1126 * large microzap results in a promotion to fatzap. 1127 */ 1128 if (name == NULL) { 1129 *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE; 1130 return (err); 1131 } 1132 1133 /* 1134 * We lock the zap with adding == FALSE. Because, if we pass 1135 * the actual value of add, it could trigger a mzap_upgrade(). 1136 * At present we are just evaluating the possibility of this operation 1137 * and hence we donot want to trigger an upgrade. 1138 */ 1139 err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); 1140 if (err) 1141 return (err); 1142 1143 if (!zap->zap_ismicro) { 1144 zap_name_t *zn = zap_name_alloc(zap, name, MT_EXACT); 1145 if (zn) { 1146 err = fzap_count_write(zn, add, towrite, 1147 tooverwrite); 1148 zap_name_free(zn); 1149 } else { 1150 /* 1151 * We treat this case as similar to (name == NULL) 1152 */ 1153 *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE; 1154 } 1155 } else { 1156 /* 1157 * We are here if (name != NULL) and this is a micro-zap. 1158 * We account for the header block depending on whether it 1159 * is freeable. 1160 * 1161 * Incase of an add-operation it is hard to find out 1162 * if this add will promote this microzap to fatzap. 1163 * Hence, we consider the worst case and account for the 1164 * blocks assuming this microzap would be promoted to a 1165 * fatzap. 1166 * 1167 * 1 block overwritten : header block 1168 * 4 new blocks written : 2 new split leaf, 2 grown 1169 * ptrtbl blocks 1170 */ 1171 if (dmu_buf_freeable(zap->zap_dbuf)) 1172 *tooverwrite += SPA_MAXBLOCKSIZE; 1173 else 1174 *towrite += SPA_MAXBLOCKSIZE; 1175 1176 if (add) { 1177 *towrite += 4 * SPA_MAXBLOCKSIZE; 1178 } 1179 } 1180 1181 zap_unlockdir(zap); 1182 return (err); 1183 } 1184