xref: /titanic_52/usr/src/uts/common/fs/zfs/zap_micro.c (revision d5508a7fb37e6b070e142ee081bec69a3d20bd6c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/spa.h>
27 #include <sys/dmu.h>
28 #include <sys/zfs_context.h>
29 #include <sys/zap.h>
30 #include <sys/refcount.h>
31 #include <sys/zap_impl.h>
32 #include <sys/zap_leaf.h>
33 #include <sys/avl.h>
34 
35 #ifdef _KERNEL
36 #include <sys/sunddi.h>
37 #endif
38 
39 static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx);
40 
41 
42 static uint64_t
43 zap_hash(zap_t *zap, const char *normname)
44 {
45 	const uint8_t *cp;
46 	uint8_t c;
47 	uint64_t crc = zap->zap_salt;
48 
49 	/* NB: name must already be normalized, if necessary */
50 
51 	ASSERT(crc != 0);
52 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
53 	for (cp = (const uint8_t *)normname; (c = *cp) != '\0'; cp++) {
54 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
55 	}
56 
57 	/*
58 	 * Only use 28 bits, since we need 4 bits in the cookie for the
59 	 * collision differentiator.  We MUST use the high bits, since
60 	 * those are the ones that we first pay attention to when
61 	 * chosing the bucket.
62 	 */
63 	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
64 
65 	return (crc);
66 }
67 
68 static int
69 zap_normalize(zap_t *zap, const char *name, char *namenorm)
70 {
71 	size_t inlen, outlen;
72 	int err;
73 
74 	inlen = strlen(name) + 1;
75 	outlen = ZAP_MAXNAMELEN;
76 
77 	err = 0;
78 	(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
79 	    zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL |
80 	    U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &err);
81 
82 	return (err);
83 }
84 
85 boolean_t
86 zap_match(zap_name_t *zn, const char *matchname)
87 {
88 	if (zn->zn_matchtype == MT_FIRST) {
89 		char norm[ZAP_MAXNAMELEN];
90 
91 		if (zap_normalize(zn->zn_zap, matchname, norm) != 0)
92 			return (B_FALSE);
93 
94 		return (strcmp(zn->zn_name_norm, norm) == 0);
95 	} else {
96 		/* MT_BEST or MT_EXACT */
97 		return (strcmp(zn->zn_name_orij, matchname) == 0);
98 	}
99 }
100 
101 void
102 zap_name_free(zap_name_t *zn)
103 {
104 	kmem_free(zn, sizeof (zap_name_t));
105 }
106 
107 /* XXX combine this with zap_lockdir()? */
108 zap_name_t *
109 zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt)
110 {
111 	zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
112 
113 	zn->zn_zap = zap;
114 	zn->zn_name_orij = name;
115 	zn->zn_matchtype = mt;
116 	if (zap->zap_normflags) {
117 		if (zap_normalize(zap, name, zn->zn_normbuf) != 0) {
118 			zap_name_free(zn);
119 			return (NULL);
120 		}
121 		zn->zn_name_norm = zn->zn_normbuf;
122 	} else {
123 		if (mt != MT_EXACT) {
124 			zap_name_free(zn);
125 			return (NULL);
126 		}
127 		zn->zn_name_norm = zn->zn_name_orij;
128 	}
129 
130 	zn->zn_hash = zap_hash(zap, zn->zn_name_norm);
131 	return (zn);
132 }
133 
134 static void
135 mzap_byteswap(mzap_phys_t *buf, size_t size)
136 {
137 	int i, max;
138 	buf->mz_block_type = BSWAP_64(buf->mz_block_type);
139 	buf->mz_salt = BSWAP_64(buf->mz_salt);
140 	buf->mz_normflags = BSWAP_64(buf->mz_normflags);
141 	max = (size / MZAP_ENT_LEN) - 1;
142 	for (i = 0; i < max; i++) {
143 		buf->mz_chunk[i].mze_value =
144 		    BSWAP_64(buf->mz_chunk[i].mze_value);
145 		buf->mz_chunk[i].mze_cd =
146 		    BSWAP_32(buf->mz_chunk[i].mze_cd);
147 	}
148 }
149 
150 void
151 zap_byteswap(void *buf, size_t size)
152 {
153 	uint64_t block_type;
154 
155 	block_type = *(uint64_t *)buf;
156 
157 	if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
158 		/* ASSERT(magic == ZAP_LEAF_MAGIC); */
159 		mzap_byteswap(buf, size);
160 	} else {
161 		fzap_byteswap(buf, size);
162 	}
163 }
164 
165 static int
166 mze_compare(const void *arg1, const void *arg2)
167 {
168 	const mzap_ent_t *mze1 = arg1;
169 	const mzap_ent_t *mze2 = arg2;
170 
171 	if (mze1->mze_hash > mze2->mze_hash)
172 		return (+1);
173 	if (mze1->mze_hash < mze2->mze_hash)
174 		return (-1);
175 	if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd)
176 		return (+1);
177 	if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd)
178 		return (-1);
179 	return (0);
180 }
181 
182 static void
183 mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep)
184 {
185 	mzap_ent_t *mze;
186 
187 	ASSERT(zap->zap_ismicro);
188 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
189 	ASSERT(mzep->mze_cd < ZAP_MAXCD);
190 
191 	mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
192 	mze->mze_chunkid = chunkid;
193 	mze->mze_hash = hash;
194 	mze->mze_phys = *mzep;
195 	avl_add(&zap->zap_m.zap_avl, mze);
196 }
197 
198 static mzap_ent_t *
199 mze_find(zap_name_t *zn)
200 {
201 	mzap_ent_t mze_tofind;
202 	mzap_ent_t *mze;
203 	avl_index_t idx;
204 	avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl;
205 
206 	ASSERT(zn->zn_zap->zap_ismicro);
207 	ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
208 
209 	if (strlen(zn->zn_name_norm) >= sizeof (mze_tofind.mze_phys.mze_name))
210 		return (NULL);
211 
212 	mze_tofind.mze_hash = zn->zn_hash;
213 	mze_tofind.mze_phys.mze_cd = 0;
214 
215 again:
216 	mze = avl_find(avl, &mze_tofind, &idx);
217 	if (mze == NULL)
218 		mze = avl_nearest(avl, idx, AVL_AFTER);
219 	for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) {
220 		if (zap_match(zn, mze->mze_phys.mze_name))
221 			return (mze);
222 	}
223 	if (zn->zn_matchtype == MT_BEST) {
224 		zn->zn_matchtype = MT_FIRST;
225 		goto again;
226 	}
227 	return (NULL);
228 }
229 
230 static uint32_t
231 mze_find_unused_cd(zap_t *zap, uint64_t hash)
232 {
233 	mzap_ent_t mze_tofind;
234 	mzap_ent_t *mze;
235 	avl_index_t idx;
236 	avl_tree_t *avl = &zap->zap_m.zap_avl;
237 	uint32_t cd;
238 
239 	ASSERT(zap->zap_ismicro);
240 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
241 
242 	mze_tofind.mze_hash = hash;
243 	mze_tofind.mze_phys.mze_cd = 0;
244 
245 	cd = 0;
246 	for (mze = avl_find(avl, &mze_tofind, &idx);
247 	    mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
248 		if (mze->mze_phys.mze_cd != cd)
249 			break;
250 		cd++;
251 	}
252 
253 	return (cd);
254 }
255 
256 static void
257 mze_remove(zap_t *zap, mzap_ent_t *mze)
258 {
259 	ASSERT(zap->zap_ismicro);
260 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
261 
262 	avl_remove(&zap->zap_m.zap_avl, mze);
263 	kmem_free(mze, sizeof (mzap_ent_t));
264 }
265 
266 static void
267 mze_destroy(zap_t *zap)
268 {
269 	mzap_ent_t *mze;
270 	void *avlcookie = NULL;
271 
272 	while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie))
273 		kmem_free(mze, sizeof (mzap_ent_t));
274 	avl_destroy(&zap->zap_m.zap_avl);
275 }
276 
/*
 * Build the in-core zap_t for the ZAP object whose first block is held in
 * 'db' and register it as that dbuf's user data.  If dmu_buf_set_user()
 * reports that another zap_t was registered first, the one built here is
 * torn down and the winner is returned instead.  For a microzap, every
 * in-use chunk is loaded into the in-core AVL tree.  The zap_t is returned
 * with zap_rwlock not held by this function.
 */
static zap_t *
mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
{
	zap_t *winner;
	zap_t *zap;
	int i;

	ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));

	/* Hold the lock as writer while the new zap_t is being filled in. */
	zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
	rw_init(&zap->zap_rwlock, 0, 0, 0);
	rw_enter(&zap->zap_rwlock, RW_WRITER);
	zap->zap_objset = os;
	zap->zap_object = obj;
	zap->zap_dbuf = db;

	/* The first 64-bit word of the block tells micro from fat. */
	if (*(uint64_t *)db->db_data != ZBT_MICRO) {
		mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
		zap->zap_f.zap_block_shift = highbit(db->db_size) - 1;
	} else {
		zap->zap_ismicro = TRUE;
	}

	/*
	 * Make sure that zap_ismicro is set before we let others see
	 * it, because zap_lockdir() checks zap_ismicro without the lock
	 * held.
	 */
	winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict);

	if (winner != NULL) {
		/* Lost the race: undo everything set up above. */
		rw_exit(&zap->zap_rwlock);
		rw_destroy(&zap->zap_rwlock);
		if (!zap->zap_ismicro)
			mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
		kmem_free(zap, sizeof (zap_t));
		return (winner);
	}

	if (zap->zap_ismicro) {
		zap->zap_salt = zap->zap_m.zap_phys->mz_salt;
		zap->zap_normflags = zap->zap_m.zap_phys->mz_normflags;
		/* One MZAP_ENT_LEN-sized slot is not counted as a chunk. */
		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
		avl_create(&zap->zap_m.zap_avl, mze_compare,
		    sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));

		/* A chunk whose name starts with NUL is unused. */
		for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
			mzap_ent_phys_t *mze =
			    &zap->zap_m.zap_phys->mz_chunk[i];
			if (mze->mze_name[0]) {
				zap_name_t *zn;

				zap->zap_m.zap_num_entries++;
				/* zn is only needed to compute the hash. */
				zn = zap_name_alloc(zap, mze->mze_name,
				    MT_EXACT);
				mze_insert(zap, i, zn->zn_hash, mze);
				zap_name_free(zn);
			}
		}
	} else {
		zap->zap_salt = zap->zap_f.zap_phys->zap_salt;
		zap->zap_normflags = zap->zap_f.zap_phys->zap_normflags;

		ASSERT3U(sizeof (struct zap_leaf_header), ==,
		    2*ZAP_LEAF_CHUNKSIZE);

		/*
		 * The embedded pointer table should not overlap the
		 * other members.
		 */
		ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
		    &zap->zap_f.zap_phys->zap_salt);

		/*
		 * The embedded pointer table should end at the end of
		 * the block
		 */
		ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
		    1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
		    (uintptr_t)zap->zap_f.zap_phys, ==,
		    zap->zap_dbuf->db_size);
	}
	rw_exit(&zap->zap_rwlock);
	return (zap);
}
362 
363 int
364 zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
365     krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
366 {
367 	zap_t *zap;
368 	dmu_buf_t *db;
369 	krw_t lt;
370 	int err;
371 
372 	*zapp = NULL;
373 
374 	err = dmu_buf_hold(os, obj, 0, NULL, &db);
375 	if (err)
376 		return (err);
377 
378 #ifdef ZFS_DEBUG
379 	{
380 		dmu_object_info_t doi;
381 		dmu_object_info_from_db(db, &doi);
382 		ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
383 	}
384 #endif
385 
386 	zap = dmu_buf_get_user(db);
387 	if (zap == NULL)
388 		zap = mzap_open(os, obj, db);
389 
390 	/*
391 	 * We're checking zap_ismicro without the lock held, in order to
392 	 * tell what type of lock we want.  Once we have some sort of
393 	 * lock, see if it really is the right type.  In practice this
394 	 * can only be different if it was upgraded from micro to fat,
395 	 * and micro wanted WRITER but fat only needs READER.
396 	 */
397 	lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
398 	rw_enter(&zap->zap_rwlock, lt);
399 	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
400 		/* it was upgraded, now we only need reader */
401 		ASSERT(lt == RW_WRITER);
402 		ASSERT(RW_READER ==
403 		    (!zap->zap_ismicro && fatreader) ? RW_READER : lti);
404 		rw_downgrade(&zap->zap_rwlock);
405 		lt = RW_READER;
406 	}
407 
408 	zap->zap_objset = os;
409 
410 	if (lt == RW_WRITER)
411 		dmu_buf_will_dirty(db, tx);
412 
413 	ASSERT3P(zap->zap_dbuf, ==, db);
414 
415 	ASSERT(!zap->zap_ismicro ||
416 	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
417 	if (zap->zap_ismicro && tx && adding &&
418 	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
419 		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
420 		if (newsz > MZAP_MAX_BLKSZ) {
421 			dprintf("upgrading obj %llu: num_entries=%u\n",
422 			    obj, zap->zap_m.zap_num_entries);
423 			*zapp = zap;
424 			return (mzap_upgrade(zapp, tx));
425 		}
426 		err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
427 		ASSERT3U(err, ==, 0);
428 		zap->zap_m.zap_num_chunks =
429 		    db->db_size / MZAP_ENT_LEN - 1;
430 	}
431 
432 	*zapp = zap;
433 	return (0);
434 }
435 
436 void
437 zap_unlockdir(zap_t *zap)
438 {
439 	rw_exit(&zap->zap_rwlock);
440 	dmu_buf_rele(zap->zap_dbuf, NULL);
441 }
442 
443 static int
444 mzap_upgrade(zap_t **zapp, dmu_tx_t *tx)
445 {
446 	mzap_phys_t *mzp;
447 	int i, sz, nchunks, err;
448 	zap_t *zap = *zapp;
449 
450 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
451 
452 	sz = zap->zap_dbuf->db_size;
453 	mzp = kmem_alloc(sz, KM_SLEEP);
454 	bcopy(zap->zap_dbuf->db_data, mzp, sz);
455 	nchunks = zap->zap_m.zap_num_chunks;
456 
457 	err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
458 	    1ULL << fzap_default_block_shift, 0, tx);
459 	if (err) {
460 		kmem_free(mzp, sz);
461 		return (err);
462 	}
463 
464 	dprintf("upgrading obj=%llu with %u chunks\n",
465 	    zap->zap_object, nchunks);
466 	/* XXX destroy the avl later, so we can use the stored hash value */
467 	mze_destroy(zap);
468 
469 	fzap_upgrade(zap, tx);
470 
471 	for (i = 0; i < nchunks; i++) {
472 		int err;
473 		mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
474 		zap_name_t *zn;
475 		if (mze->mze_name[0] == 0)
476 			continue;
477 		dprintf("adding %s=%llu\n",
478 		    mze->mze_name, mze->mze_value);
479 		zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT);
480 		err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tx);
481 		zap = zn->zn_zap;	/* fzap_add_cd() may change zap */
482 		zap_name_free(zn);
483 		if (err)
484 			break;
485 	}
486 	kmem_free(mzp, sz);
487 	*zapp = zap;
488 	return (err);
489 }
490 
491 static void
492 mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx)
493 {
494 	dmu_buf_t *db;
495 	mzap_phys_t *zp;
496 
497 	VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db));
498 
499 #ifdef ZFS_DEBUG
500 	{
501 		dmu_object_info_t doi;
502 		dmu_object_info_from_db(db, &doi);
503 		ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
504 	}
505 #endif
506 
507 	dmu_buf_will_dirty(db, tx);
508 	zp = db->db_data;
509 	zp->mz_block_type = ZBT_MICRO;
510 	zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
511 	zp->mz_normflags = normflags;
512 	dmu_buf_rele(db, FTAG);
513 }
514 
515 int
516 zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
517     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
518 {
519 	return (zap_create_claim_norm(os, obj,
520 	    0, ot, bonustype, bonuslen, tx));
521 }
522 
523 int
524 zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
525     dmu_object_type_t ot,
526     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
527 {
528 	int err;
529 
530 	err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
531 	if (err != 0)
532 		return (err);
533 	mzap_create_impl(os, obj, normflags, tx);
534 	return (0);
535 }
536 
537 uint64_t
538 zap_create(objset_t *os, dmu_object_type_t ot,
539     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
540 {
541 	return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
542 }
543 
544 uint64_t
545 zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
546     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
547 {
548 	uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
549 
550 	mzap_create_impl(os, obj, normflags, tx);
551 	return (obj);
552 }
553 
554 int
555 zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
556 {
557 	/*
558 	 * dmu_object_free will free the object number and free the
559 	 * data.  Freeing the data will cause our pageout function to be
560 	 * called, which will destroy our data (zap_leaf_t's and zap_t).
561 	 */
562 
563 	return (dmu_object_free(os, zapobj, tx));
564 }
565 
566 _NOTE(ARGSUSED(0))
567 void
568 zap_evict(dmu_buf_t *db, void *vzap)
569 {
570 	zap_t *zap = vzap;
571 
572 	rw_destroy(&zap->zap_rwlock);
573 
574 	if (zap->zap_ismicro)
575 		mze_destroy(zap);
576 	else
577 		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
578 
579 	kmem_free(zap, sizeof (zap_t));
580 }
581 
582 int
583 zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
584 {
585 	zap_t *zap;
586 	int err;
587 
588 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
589 	if (err)
590 		return (err);
591 	if (!zap->zap_ismicro) {
592 		err = fzap_count(zap, count);
593 	} else {
594 		*count = zap->zap_m.zap_num_entries;
595 	}
596 	zap_unlockdir(zap);
597 	return (err);
598 }
599 
600 /*
601  * zn may be NULL; if not specified, it will be computed if needed.
602  * See also the comment above zap_entry_normalization_conflict().
603  */
604 static boolean_t
605 mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze)
606 {
607 	mzap_ent_t *other;
608 	int direction = AVL_BEFORE;
609 	boolean_t allocdzn = B_FALSE;
610 
611 	if (zap->zap_normflags == 0)
612 		return (B_FALSE);
613 
614 again:
615 	for (other = avl_walk(&zap->zap_m.zap_avl, mze, direction);
616 	    other && other->mze_hash == mze->mze_hash;
617 	    other = avl_walk(&zap->zap_m.zap_avl, other, direction)) {
618 
619 		if (zn == NULL) {
620 			zn = zap_name_alloc(zap, mze->mze_phys.mze_name,
621 			    MT_FIRST);
622 			allocdzn = B_TRUE;
623 		}
624 		if (zap_match(zn, other->mze_phys.mze_name)) {
625 			if (allocdzn)
626 				zap_name_free(zn);
627 			return (B_TRUE);
628 		}
629 	}
630 
631 	if (direction == AVL_BEFORE) {
632 		direction = AVL_AFTER;
633 		goto again;
634 	}
635 
636 	if (allocdzn)
637 		zap_name_free(zn);
638 	return (B_FALSE);
639 }
640 
641 /*
642  * Routines for manipulating attributes.
643  */
644 
645 int
646 zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
647     uint64_t integer_size, uint64_t num_integers, void *buf)
648 {
649 	return (zap_lookup_norm(os, zapobj, name, integer_size,
650 	    num_integers, buf, MT_EXACT, NULL, 0, NULL));
651 }
652 
653 int
654 zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
655     uint64_t integer_size, uint64_t num_integers, void *buf,
656     matchtype_t mt, char *realname, int rn_len,
657     boolean_t *ncp)
658 {
659 	zap_t *zap;
660 	int err;
661 	mzap_ent_t *mze;
662 	zap_name_t *zn;
663 
664 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
665 	if (err)
666 		return (err);
667 	zn = zap_name_alloc(zap, name, mt);
668 	if (zn == NULL) {
669 		zap_unlockdir(zap);
670 		return (ENOTSUP);
671 	}
672 
673 	if (!zap->zap_ismicro) {
674 		err = fzap_lookup(zn, integer_size, num_integers, buf,
675 		    realname, rn_len, ncp);
676 	} else {
677 		mze = mze_find(zn);
678 		if (mze == NULL) {
679 			err = ENOENT;
680 		} else {
681 			if (num_integers < 1) {
682 				err = EOVERFLOW;
683 			} else if (integer_size != 8) {
684 				err = EINVAL;
685 			} else {
686 				*(uint64_t *)buf = mze->mze_phys.mze_value;
687 				(void) strlcpy(realname,
688 				    mze->mze_phys.mze_name, rn_len);
689 				if (ncp) {
690 					*ncp = mzap_normalization_conflict(zap,
691 					    zn, mze);
692 				}
693 			}
694 		}
695 	}
696 	zap_name_free(zn);
697 	zap_unlockdir(zap);
698 	return (err);
699 }
700 
701 int
702 zap_length(objset_t *os, uint64_t zapobj, const char *name,
703     uint64_t *integer_size, uint64_t *num_integers)
704 {
705 	zap_t *zap;
706 	int err;
707 	mzap_ent_t *mze;
708 	zap_name_t *zn;
709 
710 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
711 	if (err)
712 		return (err);
713 	zn = zap_name_alloc(zap, name, MT_EXACT);
714 	if (zn == NULL) {
715 		zap_unlockdir(zap);
716 		return (ENOTSUP);
717 	}
718 	if (!zap->zap_ismicro) {
719 		err = fzap_length(zn, integer_size, num_integers);
720 	} else {
721 		mze = mze_find(zn);
722 		if (mze == NULL) {
723 			err = ENOENT;
724 		} else {
725 			if (integer_size)
726 				*integer_size = 8;
727 			if (num_integers)
728 				*num_integers = 1;
729 		}
730 	}
731 	zap_name_free(zn);
732 	zap_unlockdir(zap);
733 	return (err);
734 }
735 
736 static void
737 mzap_addent(zap_name_t *zn, uint64_t value)
738 {
739 	int i;
740 	zap_t *zap = zn->zn_zap;
741 	int start = zap->zap_m.zap_alloc_next;
742 	uint32_t cd;
743 
744 	dprintf("obj=%llu %s=%llu\n", zap->zap_object,
745 	    zn->zn_name_orij, value);
746 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
747 
748 #ifdef ZFS_DEBUG
749 	for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
750 		mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
751 		ASSERT(strcmp(zn->zn_name_orij, mze->mze_name) != 0);
752 	}
753 #endif
754 
755 	cd = mze_find_unused_cd(zap, zn->zn_hash);
756 	/* given the limited size of the microzap, this can't happen */
757 	ASSERT(cd != ZAP_MAXCD);
758 
759 again:
760 	for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
761 		mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
762 		if (mze->mze_name[0] == 0) {
763 			mze->mze_value = value;
764 			mze->mze_cd = cd;
765 			(void) strcpy(mze->mze_name, zn->zn_name_orij);
766 			zap->zap_m.zap_num_entries++;
767 			zap->zap_m.zap_alloc_next = i+1;
768 			if (zap->zap_m.zap_alloc_next ==
769 			    zap->zap_m.zap_num_chunks)
770 				zap->zap_m.zap_alloc_next = 0;
771 			mze_insert(zap, i, zn->zn_hash, mze);
772 			return;
773 		}
774 	}
775 	if (start != 0) {
776 		start = 0;
777 		goto again;
778 	}
779 	ASSERT(!"out of entries!");
780 }
781 
/*
 * Add a new attribute 'name' to a ZAP object.
 *
 * A fatzap is handled by fzap_add().  A microzap is first upgraded with
 * mzap_upgrade() if the value is not a single 8-byte integer or the name
 * is MZAP_NAME_LEN or longer; otherwise the entry is added in place, or
 * EEXIST is returned if the name is already present.
 *
 * Returns 0, an error from zap_lockdir(), ENOTSUP if zap_name_alloc()
 * fails, EEXIST as above, or an error from mzap_upgrade()/fzap_add().
 */
int
zap_add(objset_t *os, uint64_t zapobj, const char *name,
    int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;
	mzap_ent_t *mze;
	const uint64_t *intval = val;
	zap_name_t *zn;

	/* adding == TRUE: zap_lockdir() may grow or upgrade a full microzap */
	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
	if (err)
		return (err);
	zn = zap_name_alloc(zap, name, MT_EXACT);
	if (zn == NULL) {
		zap_unlockdir(zap);
		return (ENOTSUP);
	}
	if (!zap->zap_ismicro) {
		err = fzap_add(zn, integer_size, num_integers, val, tx);
		zap = zn->zn_zap;	/* fzap_add() may change zap */
	} else if (integer_size != 8 || num_integers != 1 ||
	    strlen(name) >= MZAP_NAME_LEN) {
		/* The entry cannot be represented in a microzap. */
		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
		    zapobj, integer_size, num_integers, name);
		err = mzap_upgrade(&zn->zn_zap, tx);
		if (err == 0)
			err = fzap_add(zn, integer_size, num_integers, val, tx);
		zap = zn->zn_zap;	/* fzap_add() may change zap */
	} else {
		mze = mze_find(zn);
		if (mze != NULL) {
			err = EEXIST;
		} else {
			mzap_addent(zn, *intval);
		}
	}
	ASSERT(zap == zn->zn_zap);
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_add() failed */
		zap_unlockdir(zap);
	return (err);
}
826 
/*
 * Set attribute 'name' in a ZAP object, creating it if it does not exist.
 *
 * A fatzap is handled by fzap_update().  A microzap is first upgraded with
 * mzap_upgrade() if the value is not a single 8-byte integer or the name
 * is MZAP_NAME_LEN or longer; otherwise an existing entry's value is
 * overwritten (both the in-core copy and the on-disk chunk), or a new
 * entry is added.
 *
 * Returns 0, an error from zap_lockdir(), ENOTSUP if zap_name_alloc()
 * fails, or an error from mzap_upgrade()/fzap_update().
 */
int
zap_update(objset_t *os, uint64_t zapobj, const char *name,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	mzap_ent_t *mze;
	const uint64_t *intval = val;
	zap_name_t *zn;
	int err;

	/* adding == TRUE: zap_lockdir() may grow or upgrade a full microzap */
	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
	if (err)
		return (err);
	zn = zap_name_alloc(zap, name, MT_EXACT);
	if (zn == NULL) {
		zap_unlockdir(zap);
		return (ENOTSUP);
	}
	if (!zap->zap_ismicro) {
		err = fzap_update(zn, integer_size, num_integers, val, tx);
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else if (integer_size != 8 || num_integers != 1 ||
	    strlen(name) >= MZAP_NAME_LEN) {
		/* The entry cannot be represented in a microzap. */
		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
		    zapobj, integer_size, num_integers, name);
		err = mzap_upgrade(&zn->zn_zap, tx);
		if (err == 0)
			err = fzap_update(zn, integer_size, num_integers,
			    val, tx);
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else {
		mze = mze_find(zn);
		if (mze != NULL) {
			/* Keep the in-core copy and the chunk in sync. */
			mze->mze_phys.mze_value = *intval;
			zap->zap_m.zap_phys->mz_chunk
			    [mze->mze_chunkid].mze_value = *intval;
		} else {
			mzap_addent(zn, *intval);
		}
	}
	ASSERT(zap == zn->zn_zap);
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
		zap_unlockdir(zap);
	return (err);
}
873 
874 int
875 zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
876 {
877 	return (zap_remove_norm(os, zapobj, name, MT_EXACT, tx));
878 }
879 
880 int
881 zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
882     matchtype_t mt, dmu_tx_t *tx)
883 {
884 	zap_t *zap;
885 	int err;
886 	mzap_ent_t *mze;
887 	zap_name_t *zn;
888 
889 	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
890 	if (err)
891 		return (err);
892 	zn = zap_name_alloc(zap, name, mt);
893 	if (zn == NULL) {
894 		zap_unlockdir(zap);
895 		return (ENOTSUP);
896 	}
897 	if (!zap->zap_ismicro) {
898 		err = fzap_remove(zn, tx);
899 	} else {
900 		mze = mze_find(zn);
901 		if (mze == NULL) {
902 			err = ENOENT;
903 		} else {
904 			zap->zap_m.zap_num_entries--;
905 			bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
906 			    sizeof (mzap_ent_phys_t));
907 			mze_remove(zap, mze);
908 		}
909 	}
910 	zap_name_free(zn);
911 	zap_unlockdir(zap);
912 	return (err);
913 }
914 
915 /*
916  * Routines for iterating over the attributes.
917  */
918 
919 /*
920  * We want to keep the high 32 bits of the cursor zero if we can, so
921  * that 32-bit programs can access this.  So use a small hash value so
922  * we can fit 4 bits of cd into the 32-bit cursor.
923  *
924  * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ]
925  */
926 void
927 zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
928     uint64_t serialized)
929 {
930 	zc->zc_objset = os;
931 	zc->zc_zap = NULL;
932 	zc->zc_leaf = NULL;
933 	zc->zc_zapobj = zapobj;
934 	if (serialized == -1ULL) {
935 		zc->zc_hash = -1ULL;
936 		zc->zc_cd = 0;
937 	} else {
938 		zc->zc_hash = serialized << (64-ZAP_HASHBITS);
939 		zc->zc_cd = serialized >> ZAP_HASHBITS;
940 		if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */
941 			zc->zc_cd = 0;
942 	}
943 }
944 
945 void
946 zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
947 {
948 	zap_cursor_init_serialized(zc, os, zapobj, 0);
949 }
950 
951 void
952 zap_cursor_fini(zap_cursor_t *zc)
953 {
954 	if (zc->zc_zap) {
955 		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
956 		zap_unlockdir(zc->zc_zap);
957 		zc->zc_zap = NULL;
958 	}
959 	if (zc->zc_leaf) {
960 		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
961 		zap_put_leaf(zc->zc_leaf);
962 		zc->zc_leaf = NULL;
963 	}
964 	zc->zc_objset = NULL;
965 }
966 
967 uint64_t
968 zap_cursor_serialize(zap_cursor_t *zc)
969 {
970 	if (zc->zc_hash == -1ULL)
971 		return (-1ULL);
972 	ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0);
973 	ASSERT(zc->zc_cd < ZAP_MAXCD);
974 	return ((zc->zc_hash >> (64-ZAP_HASHBITS)) |
975 	    ((uint64_t)zc->zc_cd << ZAP_HASHBITS));
976 }
977 
/*
 * Fill in 'za' with the entry at, or next after, the cursor position, and
 * move the cursor onto that entry.  Returns 0 on success, ENOENT when the
 * cursor is exhausted (for a microzap the cursor is then marked exhausted),
 * an error from zap_lockdir(), or the result of fzap_cursor_retrieve() for
 * a fatzap.
 *
 * The first call locks the zap via zap_lockdir() and stores it in the
 * cursor; later calls only re-take the rwlock.  The rwlock is dropped
 * before returning, while zc_zap stays set until zap_cursor_fini().
 */
int
zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
{
	int err;
	avl_index_t idx;
	mzap_ent_t mze_tofind;
	mzap_ent_t *mze;

	/* -1ULL marks an exhausted cursor. */
	if (zc->zc_hash == -1ULL)
		return (ENOENT);

	if (zc->zc_zap == NULL) {
		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
		    RW_READER, TRUE, FALSE, &zc->zc_zap);
		if (err)
			return (err);
	} else {
		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
	}
	if (!zc->zc_zap->zap_ismicro) {
		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
	} else {
		err = ENOENT;

		mze_tofind.mze_hash = zc->zc_hash;
		mze_tofind.mze_phys.mze_cd = zc->zc_cd;

		/* Exact (hash, cd) hit, or else the next entry in order. */
		mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
		if (mze == NULL) {
			mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
			    idx, AVL_AFTER);
		}
		if (mze) {
			/* The in-core copy must mirror the on-disk chunk. */
			ASSERT(0 == bcmp(&mze->mze_phys,
			    &zc->zc_zap->zap_m.zap_phys->mz_chunk
			    [mze->mze_chunkid], sizeof (mze->mze_phys)));

			za->za_normalization_conflict =
			    mzap_normalization_conflict(zc->zc_zap, NULL, mze);
			za->za_integer_length = 8;
			za->za_num_integers = 1;
			za->za_first_integer = mze->mze_phys.mze_value;
			(void) strcpy(za->za_name, mze->mze_phys.mze_name);
			zc->zc_hash = mze->mze_hash;
			zc->zc_cd = mze->mze_phys.mze_cd;
			err = 0;
		} else {
			zc->zc_hash = -1ULL;
		}
	}
	rw_exit(&zc->zc_zap->zap_rwlock);
	return (err);
}
1031 
1032 void
1033 zap_cursor_advance(zap_cursor_t *zc)
1034 {
1035 	if (zc->zc_hash == -1ULL)
1036 		return;
1037 	zc->zc_cd++;
1038 	if (zc->zc_cd >= ZAP_MAXCD) {
1039 		zc->zc_cd = 0;
1040 		zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS);
1041 		if (zc->zc_hash == 0) /* EOF */
1042 			zc->zc_hash = -1ULL;
1043 	}
1044 }
1045 
1046 int
1047 zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt)
1048 {
1049 	int err = 0;
1050 	mzap_ent_t *mze;
1051 	zap_name_t *zn;
1052 
1053 	if (zc->zc_zap == NULL) {
1054 		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
1055 		    RW_READER, TRUE, FALSE, &zc->zc_zap);
1056 		if (err)
1057 			return (err);
1058 	} else {
1059 		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
1060 	}
1061 
1062 	zn = zap_name_alloc(zc->zc_zap, name, mt);
1063 	if (zn == NULL) {
1064 		rw_exit(&zc->zc_zap->zap_rwlock);
1065 		return (ENOTSUP);
1066 	}
1067 
1068 	if (!zc->zc_zap->zap_ismicro) {
1069 		err = fzap_cursor_move_to_key(zc, zn);
1070 	} else {
1071 		mze = mze_find(zn);
1072 		if (mze == NULL) {
1073 			err = ENOENT;
1074 			goto out;
1075 		}
1076 		zc->zc_hash = mze->mze_hash;
1077 		zc->zc_cd = mze->mze_phys.mze_cd;
1078 	}
1079 
1080 out:
1081 	zap_name_free(zn);
1082 	rw_exit(&zc->zc_zap->zap_rwlock);
1083 	return (err);
1084 }
1085 
1086 int
1087 zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
1088 {
1089 	int err;
1090 	zap_t *zap;
1091 
1092 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
1093 	if (err)
1094 		return (err);
1095 
1096 	bzero(zs, sizeof (zap_stats_t));
1097 
1098 	if (zap->zap_ismicro) {
1099 		zs->zs_blocksize = zap->zap_dbuf->db_size;
1100 		zs->zs_num_entries = zap->zap_m.zap_num_entries;
1101 		zs->zs_num_blocks = 1;
1102 	} else {
1103 		fzap_get_stats(zap, zs);
1104 	}
1105 	zap_unlockdir(zap);
1106 	return (0);
1107 }
1108 
1109 int
1110 zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
1111     uint64_t *towrite, uint64_t *tooverwrite)
1112 {
1113 	zap_t *zap;
1114 	int err = 0;
1115 
1116 
1117 	/*
1118 	 * Since, we don't have a name, we cannot figure out which blocks will
1119 	 * be affected in this operation. So, account for the worst case :
1120 	 * - 3 blocks overwritten: target leaf, ptrtbl block, header block
1121 	 * - 4 new blocks written if adding:
1122 	 * 	- 2 blocks for possibly split leaves,
1123 	 * 	- 2 grown ptrtbl blocks
1124 	 *
1125 	 * This also accomodates the case where an add operation to a fairly
1126 	 * large microzap results in a promotion to fatzap.
1127 	 */
1128 	if (name == NULL) {
1129 		*towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
1130 		return (err);
1131 	}
1132 
1133 	/*
1134 	 * We lock the zap with adding ==  FALSE. Because, if we pass
1135 	 * the actual value of add, it could trigger a mzap_upgrade().
1136 	 * At present we are just evaluating the possibility of this operation
1137 	 * and hence we donot want to trigger an upgrade.
1138 	 */
1139 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
1140 	if (err)
1141 		return (err);
1142 
1143 	if (!zap->zap_ismicro) {
1144 		zap_name_t *zn = zap_name_alloc(zap, name, MT_EXACT);
1145 		if (zn) {
1146 			err = fzap_count_write(zn, add, towrite,
1147 			    tooverwrite);
1148 			zap_name_free(zn);
1149 		} else {
1150 			/*
1151 			 * We treat this case as similar to (name == NULL)
1152 			 */
1153 			*towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
1154 		}
1155 	} else {
1156 		/*
1157 		 * We are here if (name != NULL) and this is a micro-zap.
1158 		 * We account for the header block depending on whether it
1159 		 * is freeable.
1160 		 *
1161 		 * Incase of an add-operation it is hard to find out
1162 		 * if this add will promote this microzap to fatzap.
1163 		 * Hence, we consider the worst case and account for the
1164 		 * blocks assuming this microzap would be promoted to a
1165 		 * fatzap.
1166 		 *
1167 		 * 1 block overwritten  : header block
1168 		 * 4 new blocks written : 2 new split leaf, 2 grown
1169 		 *			ptrtbl blocks
1170 		 */
1171 		if (dmu_buf_freeable(zap->zap_dbuf))
1172 			*tooverwrite += SPA_MAXBLOCKSIZE;
1173 		else
1174 			*towrite += SPA_MAXBLOCKSIZE;
1175 
1176 		if (add) {
1177 			*towrite += 4 * SPA_MAXBLOCKSIZE;
1178 		}
1179 	}
1180 
1181 	zap_unlockdir(zap);
1182 	return (err);
1183 }
1184