xref: /titanic_51/usr/src/uts/common/fs/zfs/zap_micro.c (revision 75d01c9ab5ef6f1bbac9f9d4eb379d5c38583d82)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/spa.h>
30 #include <sys/dmu.h>
31 #include <sys/zfs_context.h>
32 #include <sys/zap.h>
33 #include <sys/zap_impl.h>
34 #include <sys/zap_leaf.h>
35 #include <sys/avl.h>
36 
37 
38 static uint64_t mzap_write_cookie(zap_t *zap, uint64_t cookie,
39     uint64_t entptr);
40 static void mzap_upgrade(zap_t *zap, dmu_tx_t *tx);
41 
42 
43 static void
44 mzap_byteswap(mzap_phys_t *buf, size_t size)
45 {
46 	int i, max;
47 	buf->mz_block_type = BSWAP_64(buf->mz_block_type);
48 	buf->mz_salt = BSWAP_64(buf->mz_salt);
49 	max = (size / MZAP_ENT_LEN) - 1;
50 	for (i = 0; i < max; i++) {
51 		buf->mz_chunk[i].mze_value =
52 		    BSWAP_64(buf->mz_chunk[i].mze_value);
53 		buf->mz_chunk[i].mze_cd =
54 		    BSWAP_32(buf->mz_chunk[i].mze_cd);
55 	}
56 }
57 
58 void
59 zap_byteswap(void *buf, size_t size)
60 {
61 	uint64_t block_type;
62 
63 	block_type = *(uint64_t *)buf;
64 
65 	switch (block_type) {
66 	case ZBT_MICRO:
67 	case BSWAP_64(ZBT_MICRO):
68 		/* ASSERT(magic == ZAP_LEAF_MAGIC); */
69 		mzap_byteswap(buf, size);
70 		return;
71 	default:
72 		ASSERT(size == (1<<ZAP_BLOCK_SHIFT));
73 		fzap_byteswap(buf, size);
74 		return;
75 	}
76 }
77 
78 static int
79 mze_compare(const void *arg1, const void *arg2)
80 {
81 	const mzap_ent_t *mze1 = arg1;
82 	const mzap_ent_t *mze2 = arg2;
83 
84 	if (mze1->mze_hash > mze2->mze_hash)
85 		return (+1);
86 	if (mze1->mze_hash < mze2->mze_hash)
87 		return (-1);
88 	if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd)
89 		return (+1);
90 	if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd)
91 		return (-1);
92 	return (0);
93 }
94 
95 static void
96 mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep)
97 {
98 	mzap_ent_t *mze;
99 
100 	ASSERT(zap->zap_ismicro);
101 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
102 	ASSERT(mzep->mze_cd < ZAP_MAXCD);
103 	ASSERT3U(zap_hash(zap, mzep->mze_name), ==, hash);
104 
105 	mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
106 	mze->mze_chunkid = chunkid;
107 	mze->mze_hash = hash;
108 	mze->mze_phys = *mzep;
109 	avl_add(&zap->zap_m.zap_avl, mze);
110 }
111 
112 static mzap_ent_t *
113 mze_find(zap_t *zap, const char *name, uint64_t hash)
114 {
115 	mzap_ent_t mze_tofind;
116 	mzap_ent_t *mze;
117 	avl_index_t idx;
118 	avl_tree_t *avl = &zap->zap_m.zap_avl;
119 
120 	ASSERT(zap->zap_ismicro);
121 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
122 	ASSERT3U(zap_hash(zap, name), ==, hash);
123 
124 	if (strlen(name) >= sizeof (mze_tofind.mze_phys.mze_name))
125 		return (NULL);
126 
127 	mze_tofind.mze_hash = hash;
128 	mze_tofind.mze_phys.mze_cd = 0;
129 
130 	mze = avl_find(avl, &mze_tofind, &idx);
131 	if (mze == NULL)
132 		mze = avl_nearest(avl, idx, AVL_AFTER);
133 	for (; mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
134 		if (strcmp(name, mze->mze_phys.mze_name) == 0)
135 			return (mze);
136 	}
137 	return (NULL);
138 }
139 
140 static uint32_t
141 mze_find_unused_cd(zap_t *zap, uint64_t hash)
142 {
143 	mzap_ent_t mze_tofind;
144 	mzap_ent_t *mze;
145 	avl_index_t idx;
146 	avl_tree_t *avl = &zap->zap_m.zap_avl;
147 	uint32_t cd;
148 
149 	ASSERT(zap->zap_ismicro);
150 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
151 
152 	mze_tofind.mze_hash = hash;
153 	mze_tofind.mze_phys.mze_cd = 0;
154 
155 	cd = 0;
156 	for (mze = avl_find(avl, &mze_tofind, &idx);
157 	    mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
158 		if (mze->mze_phys.mze_cd != cd)
159 			break;
160 		cd++;
161 	}
162 
163 	return (cd);
164 }
165 
166 static void
167 mze_remove(zap_t *zap, mzap_ent_t *mze)
168 {
169 	ASSERT(zap->zap_ismicro);
170 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
171 
172 	avl_remove(&zap->zap_m.zap_avl, mze);
173 	kmem_free(mze, sizeof (mzap_ent_t));
174 }
175 
176 static void
177 mze_destroy(zap_t *zap)
178 {
179 	mzap_ent_t *mze;
180 	void *avlcookie = NULL;
181 
182 	while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie))
183 		kmem_free(mze, sizeof (mzap_ent_t));
184 	avl_destroy(&zap->zap_m.zap_avl);
185 }
186 
187 static zap_t *
188 mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
189 {
190 	zap_t *winner;
191 	zap_t *zap;
192 	int i;
193 
194 	ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
195 
196 	zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
197 	rw_init(&zap->zap_rwlock, 0, 0, 0);
198 	rw_enter(&zap->zap_rwlock, RW_WRITER);
199 	zap->zap_objset = os;
200 	zap->zap_object = obj;
201 	zap->zap_dbuf = db;
202 
203 	if (((uint64_t *)db->db_data)[0] != ZBT_MICRO) {
204 		mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
205 	} else {
206 		zap->zap_ismicro = TRUE;
207 	}
208 
209 	/*
210 	 * Make sure that zap_ismicro is set before we let others see
211 	 * it, because zap_lockdir() checks zap_ismicro without the lock
212 	 * held.
213 	 */
214 	winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_pageout);
215 
216 	if (winner != NULL) {
217 		kmem_free(zap, sizeof (zap_t));
218 		return (winner);
219 	}
220 
221 	if (zap->zap_ismicro) {
222 		zap->zap_salt = zap->zap_m.zap_phys->mz_salt;
223 		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
224 		avl_create(&zap->zap_m.zap_avl, mze_compare,
225 		    sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
226 
227 		for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
228 			mzap_ent_phys_t *mze =
229 			    &zap->zap_m.zap_phys->mz_chunk[i];
230 			if (mze->mze_name[0]) {
231 				zap->zap_m.zap_num_entries++;
232 				mze_insert(zap, i,
233 				    zap_hash(zap, mze->mze_name), mze);
234 			}
235 		}
236 	} else {
237 		zap->zap_salt = zap->zap_f.zap_phys->zap_salt;
238 	}
239 	rw_exit(&zap->zap_rwlock);
240 	return (zap);
241 }
242 
243 int
244 zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
245     krw_t lti, int fatreader, zap_t **zapp)
246 {
247 	zap_t *zap;
248 	dmu_buf_t *db;
249 	krw_t lt;
250 	int err;
251 
252 	*zapp = NULL;
253 
254 	db = dmu_buf_hold(os, obj, 0);
255 
256 #ifdef ZFS_DEBUG
257 	{
258 		dmu_object_info_t doi;
259 		dmu_object_info_from_db(db, &doi);
260 		ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
261 	}
262 #endif
263 
264 	/*
265 	 * The zap can deal with EIO here, but its callers don't yet, so
266 	 * spare them by doing a mustsucceed read.
267 	 */
268 	dmu_buf_read(db);
269 
270 	zap = dmu_buf_get_user(db);
271 	if (zap == NULL)
272 		zap = mzap_open(os, obj, db);
273 
274 	/*
275 	 * We're checking zap_ismicro without the lock held, in order to
276 	 * tell what type of lock we want.  Once we have some sort of
277 	 * lock, see if it really is the right type.  In practice this
278 	 * can only be different if it was upgraded from micro to fat,
279 	 * and micro wanted WRITER but fat only needs READER.
280 	 */
281 	lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
282 	rw_enter(&zap->zap_rwlock, lt);
283 	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
284 		/* it was upgraded, now we only need reader */
285 		ASSERT(lt == RW_WRITER);
286 		ASSERT(RW_READER ==
287 		    (!zap->zap_ismicro && fatreader) ? RW_READER : lti);
288 		rw_downgrade(&zap->zap_rwlock);
289 		lt = RW_READER;
290 	}
291 
292 	zap->zap_objset = os;
293 
294 	if (lt == RW_WRITER)
295 		dmu_buf_will_dirty(db, tx);
296 
297 	ASSERT3P(zap->zap_dbuf, ==, db);
298 
299 	ASSERT(!zap->zap_ismicro ||
300 	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
301 	if (zap->zap_ismicro && tx &&
302 	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
303 		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
304 		if (newsz > MZAP_MAX_BLKSZ) {
305 			dprintf("upgrading obj %llu: num_entries=%u\n",
306 			    obj, zap->zap_m.zap_num_entries);
307 			mzap_upgrade(zap, tx);
308 			*zapp = zap;
309 			return (0);
310 		}
311 		err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
312 		ASSERT3U(err, ==, 0);
313 		zap->zap_m.zap_num_chunks =
314 		    db->db_size / MZAP_ENT_LEN - 1;
315 	}
316 
317 	*zapp = zap;
318 	return (0);
319 }
320 
321 void
322 zap_unlockdir(zap_t *zap)
323 {
324 	rw_exit(&zap->zap_rwlock);
325 	dmu_buf_rele(zap->zap_dbuf);
326 }
327 
328 static void
329 mzap_upgrade(zap_t *zap, dmu_tx_t *tx)
330 {
331 	mzap_phys_t *mzp;
332 	int i, sz, nchunks, err;
333 
334 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
335 
336 	sz = zap->zap_dbuf->db_size;
337 	mzp = kmem_alloc(sz, KM_SLEEP);
338 	bcopy(zap->zap_dbuf->db_data, mzp, sz);
339 	nchunks = zap->zap_m.zap_num_chunks;
340 
341 	err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
342 	    1ULL << ZAP_BLOCK_SHIFT, 0, tx);
343 	ASSERT(err == 0);
344 
345 	dprintf("upgrading obj=%llu with %u chunks\n",
346 	    zap->zap_object, nchunks);
347 	mze_destroy(zap);
348 
349 	fzap_upgrade(zap, tx);
350 
351 	for (i = 0; i < nchunks; i++) {
352 		int err;
353 		mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
354 		if (mze->mze_name[0] == 0)
355 			continue;
356 		dprintf("adding %s=%llu\n",
357 		    mze->mze_name, mze->mze_value);
358 		err = fzap_add_cd(zap,
359 		    mze->mze_name, 8, 1, &mze->mze_value,
360 		    mze->mze_cd, tx, NULL);
361 		ASSERT3U(err, ==, 0);
362 	}
363 	kmem_free(mzp, sz);
364 }
365 
366 uint64_t
367 zap_hash(zap_t *zap, const char *name)
368 {
369 	const uint8_t *cp;
370 	uint8_t c;
371 	uint64_t crc = zap->zap_salt;
372 
373 	ASSERT(crc != 0);
374 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
375 	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
376 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
377 
378 	/*
379 	 * Only use 28 bits, since we need 4 bits in the cookie for the
380 	 * collision differentiator.  We MUST use the high bits, since
381 	 * those are the onces that we first pay attention to when
382 	 * chosing the bucket.
383 	 */
384 	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
385 
386 	return (crc);
387 }
388 
389 
390 static void
391 mzap_create_impl(objset_t *os, uint64_t obj, dmu_tx_t *tx)
392 {
393 	dmu_buf_t *db;
394 	mzap_phys_t *zp;
395 
396 	db = dmu_buf_hold(os, obj, 0);
397 
398 #ifdef ZFS_DEBUG
399 	{
400 		dmu_object_info_t doi;
401 		dmu_object_info_from_db(db, &doi);
402 		ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
403 	}
404 #endif
405 
406 	dmu_buf_will_dirty(db, tx);
407 	zp = db->db_data;
408 	zp->mz_block_type = ZBT_MICRO;
409 	zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
410 	ASSERT(zp->mz_salt != 0);
411 	dmu_buf_rele(db);
412 }
413 
414 int
415 zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
416     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
417 {
418 	int err;
419 
420 	err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
421 	if (err != 0)
422 		return (err);
423 	mzap_create_impl(os, obj, tx);
424 	return (0);
425 }
426 
427 uint64_t
428 zap_create(objset_t *os, dmu_object_type_t ot,
429     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
430 {
431 	uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
432 
433 	mzap_create_impl(os, obj, tx);
434 	return (obj);
435 }
436 
437 int
438 zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
439 {
440 	/*
441 	 * dmu_object_free will free the object number and free the
442 	 * data.  Freeing the data will cause our pageout function to be
443 	 * called, which will destroy our data (zap_leaf_t's and zap_t).
444 	 */
445 
446 	return (dmu_object_free(os, zapobj, tx));
447 }
448 
449 _NOTE(ARGSUSED(0))
450 void
451 zap_pageout(dmu_buf_t *db, void *vmzap)
452 {
453 	zap_t *zap = vmzap;
454 
455 	rw_destroy(&zap->zap_rwlock);
456 
457 	if (zap->zap_ismicro) {
458 		mze_destroy(zap);
459 	}
460 
461 	kmem_free(zap, sizeof (zap_t));
462 }
463 
464 
465 int
466 zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
467 {
468 	zap_t *zap;
469 	int err;
470 
471 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
472 	if (err)
473 		return (err);
474 	if (!zap->zap_ismicro) {
475 		err = fzap_count(zap, count);
476 	} else {
477 		*count = zap->zap_m.zap_num_entries;
478 	}
479 	zap_unlockdir(zap);
480 	return (err);
481 }
482 
/*
 * Routines for manipulating attributes.
 */
486 
487 int
488 zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
489     uint64_t integer_size, uint64_t num_integers, void *buf)
490 {
491 	zap_t *zap;
492 	int err;
493 	mzap_ent_t *mze;
494 
495 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
496 	if (err)
497 		return (err);
498 	if (!zap->zap_ismicro) {
499 		err = fzap_lookup(zap, name,
500 		    integer_size, num_integers, buf);
501 	} else {
502 		mze = mze_find(zap, name, zap_hash(zap, name));
503 		if (mze == NULL) {
504 			err = ENOENT;
505 		} else {
506 			if (num_integers < 1)
507 				err = EOVERFLOW;
508 			else if (integer_size != 8)
509 				err = EINVAL;
510 			else
511 				*(uint64_t *)buf = mze->mze_phys.mze_value;
512 		}
513 	}
514 	zap_unlockdir(zap);
515 	return (err);
516 }
517 
518 int
519 zap_length(objset_t *os, uint64_t zapobj, const char *name,
520     uint64_t *integer_size, uint64_t *num_integers)
521 {
522 	zap_t *zap;
523 	int err;
524 	mzap_ent_t *mze;
525 
526 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
527 	if (err)
528 		return (err);
529 	if (!zap->zap_ismicro) {
530 		err = fzap_length(zap, name, integer_size, num_integers);
531 	} else {
532 		mze = mze_find(zap, name, zap_hash(zap, name));
533 		if (mze == NULL) {
534 			err = ENOENT;
535 		} else {
536 			if (integer_size)
537 				*integer_size = 8;
538 			if (num_integers)
539 				*num_integers = 1;
540 		}
541 	}
542 	zap_unlockdir(zap);
543 	return (err);
544 }
545 
546 static void
547 mzap_addent(zap_t *zap, const char *name, uint64_t hash, uint64_t value)
548 {
549 	int i;
550 	int start = zap->zap_m.zap_alloc_next;
551 	uint32_t cd;
552 
553 	dprintf("obj=%llu %s=%llu\n", zap->zap_object, name, value);
554 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
555 
556 #ifdef ZFS_DEBUG
557 	for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
558 		mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
559 		ASSERT(strcmp(name, mze->mze_name) != 0);
560 	}
561 #endif
562 
563 	cd = mze_find_unused_cd(zap, hash);
564 	/* given the limited size of the microzap, this can't happen */
565 	ASSERT(cd != ZAP_MAXCD);
566 
567 again:
568 	for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
569 		mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
570 		if (mze->mze_name[0] == 0) {
571 			mze->mze_value = value;
572 			mze->mze_cd = cd;
573 			(void) strcpy(mze->mze_name, name);
574 			zap->zap_m.zap_num_entries++;
575 			zap->zap_m.zap_alloc_next = i+1;
576 			if (zap->zap_m.zap_alloc_next ==
577 			    zap->zap_m.zap_num_chunks)
578 				zap->zap_m.zap_alloc_next = 0;
579 			mze_insert(zap, i, hash, mze);
580 			return;
581 		}
582 	}
583 	if (start != 0) {
584 		start = 0;
585 		goto again;
586 	}
587 	ASSERT(!"out of entries!");
588 }
589 
590 int
591 zap_add(objset_t *os, uint64_t zapobj, const char *name,
592     int integer_size, uint64_t num_integers,
593     const void *val, dmu_tx_t *tx)
594 {
595 	zap_t *zap;
596 	int err;
597 	mzap_ent_t *mze;
598 	const uint64_t *intval = val;
599 	uint64_t hash;
600 
601 	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
602 	if (err)
603 		return (err);
604 	if (!zap->zap_ismicro) {
605 		err = fzap_add(zap, name, integer_size, num_integers, val, tx);
606 	} else if (integer_size != 8 || num_integers != 1 ||
607 	    strlen(name) >= MZAP_NAME_LEN) {
608 		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
609 		    zapobj, integer_size, num_integers, name);
610 		mzap_upgrade(zap, tx);
611 		err = fzap_add(zap, name, integer_size, num_integers, val, tx);
612 	} else {
613 		hash = zap_hash(zap, name);
614 		mze = mze_find(zap, name, hash);
615 		if (mze != NULL) {
616 			err = EEXIST;
617 		} else {
618 			mzap_addent(zap, name, hash, *intval);
619 		}
620 	}
621 	zap_unlockdir(zap);
622 	return (err);
623 }
624 
625 int
626 zap_update(objset_t *os, uint64_t zapobj, const char *name,
627     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
628 {
629 	zap_t *zap;
630 	mzap_ent_t *mze;
631 	const uint64_t *intval = val;
632 	uint64_t hash;
633 	int err;
634 
635 	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
636 	if (err)
637 		return (err);
638 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
639 	if (!zap->zap_ismicro) {
640 		err = fzap_update(zap, name,
641 		    integer_size, num_integers, val, tx);
642 	} else if (integer_size != 8 || num_integers != 1 ||
643 	    strlen(name) >= MZAP_NAME_LEN) {
644 		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
645 		    zapobj, integer_size, num_integers, name);
646 		mzap_upgrade(zap, tx);
647 		err = fzap_update(zap, name,
648 		    integer_size, num_integers, val, tx);
649 	} else {
650 		hash = zap_hash(zap, name);
651 		mze = mze_find(zap, name, hash);
652 		if (mze != NULL) {
653 			mze->mze_phys.mze_value = *intval;
654 			zap->zap_m.zap_phys->mz_chunk
655 			    [mze->mze_chunkid].mze_value = *intval;
656 		} else {
657 			mzap_addent(zap, name, hash, *intval);
658 		}
659 	}
660 	zap_unlockdir(zap);
661 	return (0);
662 }
663 
664 int
665 zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
666 {
667 	zap_t *zap;
668 	int err;
669 	mzap_ent_t *mze;
670 
671 	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
672 	if (err)
673 		return (err);
674 	if (!zap->zap_ismicro) {
675 		err = fzap_remove(zap, name, tx);
676 	} else {
677 		mze = mze_find(zap, name, zap_hash(zap, name));
678 		if (mze == NULL) {
679 			dprintf("fail: %s\n", name);
680 			err = ENOENT;
681 		} else {
682 			dprintf("success: %s\n", name);
683 			zap->zap_m.zap_num_entries--;
684 			bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
685 			    sizeof (mzap_ent_phys_t));
686 			mze_remove(zap, mze);
687 		}
688 	}
689 	zap_unlockdir(zap);
690 	return (err);
691 }
692 
693 
694 /*
695  * Routines for iterating over the attributes.
696  */
697 
698 /*
699  * We want to keep the high 32 bits of the cursor zero if we can, so
700  * that 32-bit programs can access this.  So use a small hash value so
701  * we can fit 4 bits of cd into the 32-bit cursor.
702  *
703  * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ]
704  */
705 void
706 zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
707     uint64_t serialized)
708 {
709 	zc->zc_objset = os;
710 	zc->zc_zap = NULL;
711 	zc->zc_leaf = NULL;
712 	zc->zc_zapobj = zapobj;
713 	if (serialized == -1ULL) {
714 		zc->zc_hash = -1ULL;
715 		zc->zc_cd = 0;
716 	} else {
717 		zc->zc_hash = serialized << (64-ZAP_HASHBITS);
718 		zc->zc_cd = serialized >> ZAP_HASHBITS;
719 		if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */
720 			zc->zc_cd = 0;
721 	}
722 }
723 
724 void
725 zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
726 {
727 	zap_cursor_init_serialized(zc, os, zapobj, 0);
728 }
729 
730 void
731 zap_cursor_fini(zap_cursor_t *zc)
732 {
733 	if (zc->zc_zap) {
734 		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
735 		zap_unlockdir(zc->zc_zap);
736 		zc->zc_zap = NULL;
737 	}
738 	if (zc->zc_leaf) {
739 		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
740 		zap_put_leaf(zc->zc_leaf);
741 		zc->zc_leaf = NULL;
742 	}
743 	zc->zc_objset = NULL;
744 }
745 
746 uint64_t
747 zap_cursor_serialize(zap_cursor_t *zc)
748 {
749 	if (zc->zc_hash == -1ULL)
750 		return (-1ULL);
751 	ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0);
752 	ASSERT(zc->zc_cd < ZAP_MAXCD);
753 	return ((zc->zc_hash >> (64-ZAP_HASHBITS)) |
754 	    ((uint64_t)zc->zc_cd << ZAP_HASHBITS));
755 }
756 
757 int
758 zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
759 {
760 	int err;
761 	avl_index_t idx;
762 	mzap_ent_t mze_tofind;
763 	mzap_ent_t *mze;
764 
765 	if (zc->zc_hash == -1ULL)
766 		return (ENOENT);
767 
768 	if (zc->zc_zap == NULL) {
769 		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
770 		    RW_READER, TRUE, &zc->zc_zap);
771 		if (err)
772 			return (err);
773 	} else {
774 		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
775 	}
776 	if (!zc->zc_zap->zap_ismicro) {
777 		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
778 	} else {
779 		err = ENOENT;
780 
781 		mze_tofind.mze_hash = zc->zc_hash;
782 		mze_tofind.mze_phys.mze_cd = zc->zc_cd;
783 
784 		mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
785 		ASSERT(mze == NULL || 0 == bcmp(&mze->mze_phys,
786 		    &zc->zc_zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
787 		    sizeof (mze->mze_phys)));
788 		if (mze == NULL) {
789 			mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
790 			    idx, AVL_AFTER);
791 		}
792 		if (mze) {
793 			za->za_integer_length = 8;
794 			za->za_num_integers = 1;
795 			za->za_first_integer = mze->mze_phys.mze_value;
796 			(void) strcpy(za->za_name, mze->mze_phys.mze_name);
797 			zc->zc_hash = mze->mze_hash;
798 			zc->zc_cd = mze->mze_phys.mze_cd;
799 			err = 0;
800 		} else {
801 			zc->zc_hash = -1ULL;
802 		}
803 	}
804 	rw_exit(&zc->zc_zap->zap_rwlock);
805 	return (err);
806 }
807 
808 void
809 zap_cursor_advance(zap_cursor_t *zc)
810 {
811 	if (zc->zc_hash == -1ULL)
812 		return;
813 	zc->zc_cd++;
814 	if (zc->zc_cd >= ZAP_MAXCD) {
815 		zc->zc_cd = 0;
816 		zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS);
817 		if (zc->zc_hash == 0) /* EOF */
818 			zc->zc_hash = -1ULL;
819 	}
820 }
821 
822 int
823 zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
824 {
825 	int err;
826 	zap_t *zap;
827 
828 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
829 	if (err)
830 		return (err);
831 
832 	bzero(zs, sizeof (zap_stats_t));
833 
834 	if (zap->zap_ismicro) {
835 		zs->zs_blocksize = zap->zap_dbuf->db_size;
836 		zs->zs_num_entries = zap->zap_m.zap_num_entries;
837 		zs->zs_num_blocks = 1;
838 	} else {
839 		fzap_get_stats(zap, zs);
840 	}
841 	zap_unlockdir(zap);
842 	return (0);
843 }
844