xref: /freebsd/sys/contrib/openzfs/module/zfs/zap_micro.c (revision 61145dc2b94f12f6a47344fb9aac702321880e43)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
26  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
27  * Copyright 2017 Nexenta Systems, Inc.
28  * Copyright (c) 2024, Klara, Inc.
29  */
30 
31 #include <sys/zio.h>
32 #include <sys/spa.h>
33 #include <sys/dmu.h>
34 #include <sys/zfs_context.h>
35 #include <sys/zap.h>
36 #include <sys/zap_impl.h>
37 #include <sys/zap_leaf.h>
38 #include <sys/btree.h>
39 #include <sys/arc.h>
40 #include <sys/dmu_objset.h>
41 #include <sys/spa_impl.h>
42 
43 #ifdef _KERNEL
44 #include <sys/sunddi.h>
45 #endif
46 
47 /*
48  * The maximum size (in bytes) of a microzap before it is converted to a
49  * fatzap. It will be rounded up to next multiple of 512 (SPA_MINBLOCKSIZE).
50  *
51  * By definition, a microzap must fit into a single block, so this has
52  * traditionally been SPA_OLD_MAXBLOCKSIZE, and is set to that by default.
53  * Setting this higher requires both the large_blocks feature (to even create
54  * blocks that large) and the large_microzap feature (to enable the stream
55  * machinery to understand not to try to split a microzap block).
56  *
57  * If large_microzap is enabled, this value will be clamped to
58  * spa_maxblocksize(), up to 1M. If not, it will be clamped to
59  * SPA_OLD_MAXBLOCKSIZE.
60  */
61 static int zap_micro_max_size = SPA_OLD_MAXBLOCKSIZE;
62 
63 /*
64  * The 1M upper limit is necessary because the count of chunks in a microzap
65  * block is stored as a uint16_t (mze_chunkid). Each chunk is 64 bytes, and the
66  * first is used to store a header, so there are 32767 usable chunks, which is
67  * just under 2M. 1M is the largest power-2-rounded block size under 2M, so we
68  * must set the limit there.
69  */
70 #define	MZAP_MAX_SIZE	(1048576)
71 
72 uint64_t
zap_get_micro_max_size(spa_t * spa)73 zap_get_micro_max_size(spa_t *spa)
74 {
75 	uint64_t maxsz = MIN(MZAP_MAX_SIZE,
76 	    P2ROUNDUP(zap_micro_max_size, SPA_MINBLOCKSIZE));
77 	if (maxsz <= SPA_OLD_MAXBLOCKSIZE)
78 		return (maxsz);
79 	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_MICROZAP))
80 		return (MIN(maxsz, spa_maxblocksize(spa)));
81 	return (SPA_OLD_MAXBLOCKSIZE);
82 }
83 
84 static int mzap_upgrade(zap_t **zapp,
85     const void *tag, dmu_tx_t *tx, zap_flags_t flags);
86 
87 uint64_t
zap_getflags(zap_t * zap)88 zap_getflags(zap_t *zap)
89 {
90 	if (zap->zap_ismicro)
91 		return (0);
92 	return (zap_f_phys(zap)->zap_flags);
93 }
94 
95 int
zap_hashbits(zap_t * zap)96 zap_hashbits(zap_t *zap)
97 {
98 	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
99 		return (48);
100 	else
101 		return (28);
102 }
103 
104 uint32_t
zap_maxcd(zap_t * zap)105 zap_maxcd(zap_t *zap)
106 {
107 	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
108 		return ((1<<16)-1);
109 	else
110 		return (-1U);
111 }
112 
/*
 * Compute the 64-bit hash for the key described by zn.  Only the top
 * zap_hashbits() bits are significant; the low bits are cleared so they
 * can later carry the collision differentiator (cd).
 */
static uint64_t
zap_hash(zap_name_t *zn)
{
	zap_t *zap = zn->zn_zap;
	uint64_t h = 0;

	if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
		/* Caller supplied the hash as the first 64-bit key word. */
		ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
		h = *(uint64_t *)zn->zn_key_orig;
	} else {
		/* Seed the CRC with the per-object salt (never zero). */
		h = zap->zap_salt;
		ASSERT(h != 0);
		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

		if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
			const uint64_t *wp = zn->zn_key_norm;

			/* CRC64 over each byte of each 64-bit key word. */
			ASSERT(zn->zn_key_intlen == 8);
			for (int i = 0; i < zn->zn_key_norm_numints;
			    wp++, i++) {
				uint64_t word = *wp;

				for (int j = 0; j < 8; j++) {
					h = (h >> 8) ^
					    zfs_crc64_table[(h ^ word) & 0xFF];
					word >>= NBBY;
				}
			}
		} else {
			const uint8_t *cp = zn->zn_key_norm;

			/*
			 * We previously stored the terminating null on
			 * disk, but didn't hash it, so we need to
			 * continue to not hash it.  (The
			 * zn_key_*_numints includes the terminating
			 * null for non-binary keys.)
			 */
			int len = zn->zn_key_norm_numints - 1;

			ASSERT(zn->zn_key_intlen == 1);
			for (int i = 0; i < len; cp++, i++) {
				h = (h >> 8) ^
				    zfs_crc64_table[(h ^ *cp) & 0xFF];
			}
		}
	}
	/*
	 * Don't use all 64 bits, since we need some in the cookie for
	 * the collision differentiator.  We MUST use the high bits,
	 * since those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);

	return (h);
}
170 
/*
 * Unicode-normalize `name` into `namenorm` (at most `outlen` bytes)
 * according to `normflags`.  Returns 0 on success, or the error reported
 * by u8_textprep_str() for invalid or unconvertible input.
 */
static int
zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags,
    size_t outlen)
{
	/* Normalization only applies to string keys. */
	ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));

	/* Include the terminating NUL in the input length. */
	size_t inlen = strlen(name) + 1;

	int err = 0;
	(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
	    normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
	    U8_UNICODE_LATEST, &err);

	return (err);
}
186 
187 boolean_t
zap_match(zap_name_t * zn,const char * matchname)188 zap_match(zap_name_t *zn, const char *matchname)
189 {
190 	boolean_t res = B_FALSE;
191 	ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
192 
193 	if (zn->zn_matchtype & MT_NORMALIZE) {
194 		size_t namelen = zn->zn_normbuf_len;
195 		char normbuf[ZAP_MAXNAMELEN];
196 		char *norm = normbuf;
197 
198 		/*
199 		 * Cannot allocate this on-stack as it exceed the stack-limit of
200 		 * 1024.
201 		 */
202 		if (namelen > ZAP_MAXNAMELEN)
203 			norm = kmem_alloc(namelen, KM_SLEEP);
204 
205 		if (zap_normalize(zn->zn_zap, matchname, norm,
206 		    zn->zn_normflags, namelen) != 0) {
207 			res = B_FALSE;
208 		} else {
209 			res = (strcmp(zn->zn_key_norm, norm) == 0);
210 		}
211 		if (norm != normbuf)
212 			kmem_free(norm, namelen);
213 	} else {
214 		res = (strcmp(zn->zn_key_orig, matchname) == 0);
215 	}
216 	return (res);
217 }
218 
219 static kmem_cache_t *zap_name_cache;
220 static kmem_cache_t *zap_attr_cache;
221 static kmem_cache_t *zap_name_long_cache;
222 static kmem_cache_t *zap_attr_long_cache;
223 
224 void
zap_init(void)225 zap_init(void)
226 {
227 	zap_name_cache = kmem_cache_create("zap_name",
228 	    sizeof (zap_name_t) + ZAP_MAXNAMELEN, 0, NULL, NULL,
229 	    NULL, NULL, NULL, 0);
230 
231 	zap_attr_cache = kmem_cache_create("zap_attr_cache",
232 	    sizeof (zap_attribute_t) + ZAP_MAXNAMELEN,  0, NULL,
233 	    NULL, NULL, NULL, NULL, 0);
234 
235 	zap_name_long_cache = kmem_cache_create("zap_name_long",
236 	    sizeof (zap_name_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, NULL,
237 	    NULL, NULL, NULL, 0);
238 
239 	zap_attr_long_cache = kmem_cache_create("zap_attr_long_cache",
240 	    sizeof (zap_attribute_t) + ZAP_MAXNAMELEN_NEW,  0, NULL,
241 	    NULL, NULL, NULL, NULL, 0);
242 }
243 
/*
 * Destroy the caches created by zap_init().
 */
void
zap_fini(void)
{
	kmem_cache_destroy(zap_name_cache);
	kmem_cache_destroy(zap_attr_cache);
	kmem_cache_destroy(zap_name_long_cache);
	kmem_cache_destroy(zap_attr_long_cache);
}
252 
253 static zap_name_t *
zap_name_alloc(zap_t * zap,boolean_t longname)254 zap_name_alloc(zap_t *zap, boolean_t longname)
255 {
256 	kmem_cache_t *cache = longname ? zap_name_long_cache : zap_name_cache;
257 	zap_name_t *zn = kmem_cache_alloc(cache, KM_SLEEP);
258 
259 	zn->zn_zap = zap;
260 	zn->zn_normbuf_len = longname ? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN;
261 	return (zn);
262 }
263 
264 void
zap_name_free(zap_name_t * zn)265 zap_name_free(zap_name_t *zn)
266 {
267 	if (zn->zn_normbuf_len == ZAP_MAXNAMELEN) {
268 		kmem_cache_free(zap_name_cache, zn);
269 	} else {
270 		ASSERT3U(zn->zn_normbuf_len, ==, ZAP_MAXNAMELEN_NEW);
271 		kmem_cache_free(zap_name_long_cache, zn);
272 	}
273 }
274 
/*
 * Initialize zn for the string key `key` with match type `mt`: record
 * the original key, compute the normalized form (when the zap stores
 * normalized names) and the hash, and if the lookup's normalization
 * flags differ from the zap's, re-normalize for matching.  Returns 0,
 * or ENOTSUP if normalization fails or a match type is requested on a
 * non-normalizing zap.
 */
static int
zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt)
{
	zap_t *zap = zn->zn_zap;
	size_t key_len = strlen(key) + 1;

	/* Make sure zn is allocated for longname if key is long */
	IMPLY(key_len > ZAP_MAXNAMELEN,
	    zn->zn_normbuf_len == ZAP_MAXNAMELEN_NEW);

	zn->zn_key_intlen = sizeof (*key);
	zn->zn_key_orig = key;
	zn->zn_key_orig_numints = key_len;
	zn->zn_matchtype = mt;
	zn->zn_normflags = zap->zap_normflags;

	/*
	 * If we're dealing with a case sensitive lookup on a mixed or
	 * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
	 * will fold case to all caps overriding the lookup request.
	 */
	if (mt & MT_MATCH_CASE)
		zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;

	if (zap->zap_normflags) {
		/*
		 * We *must* use zap_normflags because this normalization is
		 * what the hash is computed from.
		 */
		if (zap_normalize(zap, key, zn->zn_normbuf,
		    zap->zap_normflags, zn->zn_normbuf_len) != 0)
			return (SET_ERROR(ENOTSUP));
		zn->zn_key_norm = zn->zn_normbuf;
		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
	} else {
		/* Match types require a normalizing zap. */
		if (mt != 0)
			return (SET_ERROR(ENOTSUP));
		zn->zn_key_norm = zn->zn_key_orig;
		zn->zn_key_norm_numints = zn->zn_key_orig_numints;
	}

	/* Hash is computed from the zap_normflags-normalized form above. */
	zn->zn_hash = zap_hash(zn);

	if (zap->zap_normflags != zn->zn_normflags) {
		/*
		 * We *must* use zn_normflags because this normalization is
		 * what the matching is based on.  (Not the hash!)
		 */
		if (zap_normalize(zap, key, zn->zn_normbuf,
		    zn->zn_normflags, zn->zn_normbuf_len) != 0)
			return (SET_ERROR(ENOTSUP));
		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
	}

	return (0);
}
331 
332 zap_name_t *
zap_name_alloc_str(zap_t * zap,const char * key,matchtype_t mt)333 zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt)
334 {
335 	size_t key_len = strlen(key) + 1;
336 	zap_name_t *zn = zap_name_alloc(zap, (key_len > ZAP_MAXNAMELEN));
337 	if (zap_name_init_str(zn, key, mt) != 0) {
338 		zap_name_free(zn);
339 		return (NULL);
340 	}
341 	return (zn);
342 }
343 
344 static zap_name_t *
zap_name_alloc_uint64(zap_t * zap,const uint64_t * key,int numints)345 zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
346 {
347 	zap_name_t *zn = kmem_cache_alloc(zap_name_cache, KM_SLEEP);
348 
349 	ASSERT(zap->zap_normflags == 0);
350 	zn->zn_zap = zap;
351 	zn->zn_key_intlen = sizeof (*key);
352 	zn->zn_key_orig = zn->zn_key_norm = key;
353 	zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
354 	zn->zn_matchtype = 0;
355 	zn->zn_normbuf_len = ZAP_MAXNAMELEN;
356 
357 	zn->zn_hash = zap_hash(zn);
358 	return (zn);
359 }
360 
361 static void
mzap_byteswap(mzap_phys_t * buf,size_t size)362 mzap_byteswap(mzap_phys_t *buf, size_t size)
363 {
364 	buf->mz_block_type = BSWAP_64(buf->mz_block_type);
365 	buf->mz_salt = BSWAP_64(buf->mz_salt);
366 	buf->mz_normflags = BSWAP_64(buf->mz_normflags);
367 	int max = (size / MZAP_ENT_LEN) - 1;
368 	for (int i = 0; i < max; i++) {
369 		buf->mz_chunk[i].mze_value =
370 		    BSWAP_64(buf->mz_chunk[i].mze_value);
371 		buf->mz_chunk[i].mze_cd =
372 		    BSWAP_32(buf->mz_chunk[i].mze_cd);
373 	}
374 }
375 
376 void
zap_byteswap(void * buf,size_t size)377 zap_byteswap(void *buf, size_t size)
378 {
379 	uint64_t block_type = *(uint64_t *)buf;
380 
381 	if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
382 		/* ASSERT(magic == ZAP_LEAF_MAGIC); */
383 		mzap_byteswap(buf, size);
384 	} else {
385 		fzap_byteswap(buf, size);
386 	}
387 }
388 
389 __attribute__((always_inline)) inline
390 static int
mze_compare(const void * arg1,const void * arg2)391 mze_compare(const void *arg1, const void *arg2)
392 {
393 	const mzap_ent_t *mze1 = arg1;
394 	const mzap_ent_t *mze2 = arg2;
395 
396 	return (TREE_CMP((uint64_t)(mze1->mze_hash) << 32 | mze1->mze_cd,
397 	    (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd));
398 }
399 
ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf,mzap_ent_t,mze_compare)400 ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t,
401     mze_compare)
402 
/*
 * Insert an in-memory index entry for microzap chunk `chunkid` into the
 * btree, keyed by the top 32 bits of `hash` and the on-disk cd.
 */
static void
mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash)
{
	mzap_ent_t mze;

	ASSERT(zap->zap_ismicro);
	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	mze.mze_chunkid = chunkid;
	/* Microzap hashes never have low bits set (see zap_hash()). */
	ASSERT0(hash & 0xffffffff);
	mze.mze_hash = hash >> 32;
	/* The in-memory cd is 16 bits; verify the on-disk value fits. */
	ASSERT3U(MZE_PHYS(zap, &mze)->mze_cd, <=, 0xffff);
	mze.mze_cd = (uint16_t)MZE_PHYS(zap, &mze)->mze_cd;
	/* Only occupied chunks (non-empty name) may be indexed. */
	ASSERT(MZE_PHYS(zap, &mze)->mze_name[0] != 0);
	zfs_btree_add(&zap->zap_m.zap_tree, &mze);
}
419 
/*
 * Find the in-memory entry whose hash and name match zn, scanning every
 * collision-differentiated entry sharing the hash.  Returns NULL if no
 * name matches; `idx` is the btree cursor used for the scan.
 */
static mzap_ent_t *
mze_find(zap_name_t *zn, zfs_btree_index_t *idx)
{
	mzap_ent_t mze_tofind;
	mzap_ent_t *mze;
	zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree;

	ASSERT(zn->zn_zap->zap_ismicro);
	ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));

	/* Microzap hashes keep only the upper 32 bits (see zap_hash()). */
	ASSERT0(zn->zn_hash & 0xffffffff);
	mze_tofind.mze_hash = zn->zn_hash >> 32;
	mze_tofind.mze_cd = 0;

	/* Start at cd 0 (or the next entry, if cd 0 is absent). */
	mze = zfs_btree_find(tree, &mze_tofind, idx);
	if (mze == NULL)
		mze = zfs_btree_next(tree, idx, idx);
	for (; mze && mze->mze_hash == mze_tofind.mze_hash;
	    mze = zfs_btree_next(tree, idx, idx)) {
		ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
		if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
			return (mze);
	}

	return (NULL);
}
446 
/*
 * Return the lowest collision differentiator (cd) not in use by any
 * entry with the given hash.  Entries sharing a hash are stored in cd
 * order, so the first gap in the sequence 0, 1, 2, ... is the answer.
 */
static uint32_t
mze_find_unused_cd(zap_t *zap, uint64_t hash)
{
	mzap_ent_t mze_tofind;
	zfs_btree_index_t idx;
	zfs_btree_t *tree = &zap->zap_m.zap_tree;

	ASSERT(zap->zap_ismicro);
	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));

	/* Work with the upper 32 bits, as stored in the tree. */
	ASSERT0(hash & 0xffffffff);
	hash >>= 32;
	mze_tofind.mze_hash = hash;
	mze_tofind.mze_cd = 0;

	uint32_t cd = 0;
	for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
	    mze && mze->mze_hash == hash;
	    mze = zfs_btree_next(tree, &idx, &idx)) {
		/* Stop at the first gap in the cd sequence. */
		if (mze->mze_cd != cd)
			break;
		cd++;
	}

	return (cd);
}
473 
/*
 * Each mzap entry requires at most 4 chunks:
 * 3 chunks for the name plus 1 chunk for the value.
 */
478 #define	MZAP_ENT_CHUNKS	(1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \
479 	ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t)))
480 
481 /*
482  * Check if the current entry keeps the colliding entries under the fatzap leaf
483  * size.
484  */
485 static boolean_t
mze_canfit_fzap_leaf(zap_name_t * zn,uint64_t hash)486 mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
487 {
488 	zap_t *zap = zn->zn_zap;
489 	mzap_ent_t mze_tofind;
490 	zfs_btree_index_t idx;
491 	zfs_btree_t *tree = &zap->zap_m.zap_tree;
492 	uint32_t mzap_ents = 0;
493 
494 	ASSERT0(hash & 0xffffffff);
495 	hash >>= 32;
496 	mze_tofind.mze_hash = hash;
497 	mze_tofind.mze_cd = 0;
498 
499 	for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
500 	    mze && mze->mze_hash == hash;
501 	    mze = zfs_btree_next(tree, &idx, &idx)) {
502 		mzap_ents++;
503 	}
504 
505 	/* Include the new entry being added */
506 	mzap_ents++;
507 
508 	return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS));
509 }
510 
/*
 * Tear down the in-memory microzap entry tree.  The on-disk microzap is
 * not touched.
 */
static void
mze_destroy(zap_t *zap)
{
	zfs_btree_clear(&zap->zap_m.zap_tree);
	zfs_btree_destroy(&zap->zap_m.zap_tree);
}
517 
/*
 * Build the in-memory zap_t for the zap object held in `db` and attach
 * it to the dbuf as user data.  If another thread attached one first,
 * the existing zap_t ("winner") is returned and ours is discarded.
 * Returns NULL if the on-disk block is neither a valid microzap nor a
 * valid fatzap header (possible corruption).  The zap is returned with
 * zap_rwlock released.
 */
static zap_t *
mzap_open(dmu_buf_t *db)
{
	zap_t *winner;
	uint64_t *zap_hdr = (uint64_t *)db->db_data;
	uint64_t zap_block_type = zap_hdr[0];
	uint64_t zap_magic = zap_hdr[1];

	ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));

	zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
	rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL);
	rw_enter(&zap->zap_rwlock, RW_WRITER);
	zap->zap_objset = dmu_buf_get_objset(db);
	zap->zap_object = db->db_object;
	zap->zap_dbuf = db;

	if (zap_block_type != ZBT_MICRO) {
		mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT,
		    0);
		zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
		/* A non-micro block must carry the fatzap header magic. */
		if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) {
			winner = NULL;	/* No actual winner here... */
			goto handle_winner;
		}
	} else {
		zap->zap_ismicro = TRUE;
	}

	/*
	 * Make sure that zap_ismicro is set before we let others see
	 * it, because zap_lockdir() checks zap_ismicro without the lock
	 * held.
	 */
	dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
	winner = dmu_buf_set_user(db, &zap->zap_dbu);

	if (winner != NULL)
		goto handle_winner;

	if (zap->zap_ismicro) {
		zap->zap_salt = zap_m_phys(zap)->mz_salt;
		zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;

		/*
		 * Reduce B-tree leaf from 4KB to 512 bytes to reduce memmove()
		 * overhead on massive inserts below.  It still allows to store
		 * 62 entries before we have to add 2KB B-tree core node.
		 */
		zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare,
		    mze_find_in_buf, sizeof (mzap_ent_t), 512);

		/* Index every occupied chunk in the in-memory tree. */
		zap_name_t *zn = zap_name_alloc(zap, B_FALSE);
		for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) {
			mzap_ent_phys_t *mze =
			    &zap_m_phys(zap)->mz_chunk[i];
			if (mze->mze_name[0]) {
				zap->zap_m.zap_num_entries++;
				zap_name_init_str(zn, mze->mze_name, 0);
				mze_insert(zap, i, zn->zn_hash);
			}
		}
		zap_name_free(zn);
	} else {
		zap->zap_salt = zap_f_phys(zap)->zap_salt;
		zap->zap_normflags = zap_f_phys(zap)->zap_normflags;

		ASSERT3U(sizeof (struct zap_leaf_header), ==,
		    2*ZAP_LEAF_CHUNKSIZE);

		/*
		 * The embedded pointer table should not overlap the
		 * other members.
		 */
		ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
		    &zap_f_phys(zap)->zap_salt);

		/*
		 * The embedded pointer table should end at the end of
		 * the block
		 */
		ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
		    1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
		    (uintptr_t)zap_f_phys(zap), ==,
		    zap->zap_dbuf->db_size);
	}
	rw_exit(&zap->zap_rwlock);
	return (zap);

handle_winner:
	/* Lost the race (or saw corruption): dismantle our zap_t. */
	rw_exit(&zap->zap_rwlock);
	rw_destroy(&zap->zap_rwlock);
	if (!zap->zap_ismicro)
		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
	kmem_free(zap, sizeof (zap_t));
	return (winner);
}
616 
617 /*
618  * This routine "consumes" the caller's hold on the dbuf, which must
619  * have the specified tag.
620  */
static int
zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
{
	ASSERT0(db->db_offset);
	objset_t *os = dmu_buf_get_objset(db);
	uint64_t obj = db->db_object;
	dmu_object_info_t doi;

	*zapp = NULL;

	/* The object must be of a ZAP byteswap class. */
	dmu_object_info_from_dnode(dn, &doi);
	if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
		return (SET_ERROR(EINVAL));

	zap_t *zap = dmu_buf_get_user(db);
	if (zap == NULL) {
		zap = mzap_open(db);
		if (zap == NULL) {
			/*
			 * mzap_open() didn't like what it saw on-disk.
			 * Check for corruption!
			 */
			return (SET_ERROR(EIO));
		}
	}

	/*
	 * We're checking zap_ismicro without the lock held, in order to
	 * tell what type of lock we want.  Once we have some sort of
	 * lock, see if it really is the right type.  In practice this
	 * can only be different if it was upgraded from micro to fat,
	 * and micro wanted WRITER but fat only needs READER.
	 */
	krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
	rw_enter(&zap->zap_rwlock, lt);
	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
		/* it was upgraded, now we only need reader */
		ASSERT(lt == RW_WRITER);
		ASSERT(RW_READER ==
		    ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
		rw_downgrade(&zap->zap_rwlock);
		lt = RW_READER;
	}

	zap->zap_objset = os;
	zap->zap_dnode = dn;

	if (lt == RW_WRITER)
		dmu_buf_will_dirty(db, tx);

	ASSERT3P(zap->zap_dbuf, ==, db);

	ASSERT(!zap->zap_ismicro ||
	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
	if (zap->zap_ismicro && tx && adding &&
	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
		/*
		 * The microzap is full: grow it by one block, or upgrade
		 * it to a fatzap once the microzap size limit is reached.
		 */
		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
		if (newsz > zap_get_micro_max_size(dmu_objset_spa(os))) {
			dprintf("upgrading obj %llu: num_entries=%u\n",
			    (u_longlong_t)obj, zap->zap_m.zap_num_entries);
			*zapp = zap;
			int err = mzap_upgrade(zapp, tag, tx, 0);
			if (err != 0)
				rw_exit(&zap->zap_rwlock);
			return (err);
		}
		VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
		zap->zap_m.zap_num_chunks =
		    db->db_size / MZAP_ENT_LEN - 1;

		if (newsz > SPA_OLD_MAXBLOCKSIZE) {
			dsl_dataset_t *ds = dmu_objset_ds(os);
			if (!dsl_dataset_feature_is_active(ds,
			    SPA_FEATURE_LARGE_MICROZAP)) {
				/*
				 * A microzap just grew beyond the old limit
				 * for the first time, so we have to ensure the
				 * feature flag is activated.
				 * zap_get_micro_max_size() won't let us get
				 * here if the feature is not enabled, so we
				 * don't need any other checks beforehand.
				 *
				 * Since we're in open context, we can't
				 * activate the feature directly, so we instead
				 * flag it on the dataset for next sync.
				 */
				dsl_dataset_dirty(ds, tx);
				mutex_enter(&ds->ds_lock);
				ds->ds_feature_activation
				    [SPA_FEATURE_LARGE_MICROZAP] =
				    (void *)B_TRUE;
				mutex_exit(&ds->ds_lock);
			}
		}
	}

	*zapp = zap;
	return (0);
}
721 
722 static int
zap_lockdir_by_dnode(dnode_t * dn,dmu_tx_t * tx,krw_t lti,boolean_t fatreader,boolean_t adding,const void * tag,zap_t ** zapp)723 zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
724     krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
725     zap_t **zapp)
726 {
727 	dmu_buf_t *db;
728 	int err;
729 
730 	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
731 	if (err != 0)
732 		return (err);
733 	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
734 	if (err != 0)
735 		dmu_buf_rele(db, tag);
736 	else
737 		VERIFY(dnode_add_ref(dn, tag));
738 	return (err);
739 }
740 
741 int
zap_lockdir(objset_t * os,uint64_t obj,dmu_tx_t * tx,krw_t lti,boolean_t fatreader,boolean_t adding,const void * tag,zap_t ** zapp)742 zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
743     krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
744     zap_t **zapp)
745 {
746 	dnode_t *dn;
747 	dmu_buf_t *db;
748 	int err;
749 
750 	err = dnode_hold(os, obj, tag, &dn);
751 	if (err != 0)
752 		return (err);
753 	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
754 	if (err != 0) {
755 		dnode_rele(dn, tag);
756 		return (err);
757 	}
758 	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
759 	if (err != 0) {
760 		dmu_buf_rele(db, tag);
761 		dnode_rele(dn, tag);
762 	}
763 	return (err);
764 }
765 
/*
 * Release the lock and the dnode/dbuf holds taken by zap_lockdir*().
 */
void
zap_unlockdir(zap_t *zap, const void *tag)
{
	rw_exit(&zap->zap_rwlock);
	dnode_rele(zap->zap_dnode, tag);
	dmu_buf_rele(zap->zap_dbuf, tag);
}
773 
/*
 * Convert the microzap in *zapp into a fatzap, re-inserting every
 * existing entry.  Called with zap_rwlock held as writer.  *zapp may be
 * updated on return, since fzap_add_cd() can replace the zap.  If
 * `flags` is 0, the object's block size is first reset to the fatzap
 * default.
 */
static int
mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags)
{
	int err = 0;
	zap_t *zap = *zapp;

	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	/* Snapshot the microzap contents before the block is rewritten. */
	int sz = zap->zap_dbuf->db_size;
	mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP);
	memcpy(mzp, zap->zap_dbuf->db_data, sz);
	int nchunks = zap->zap_m.zap_num_chunks;

	if (!flags) {
		err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
		    1ULL << fzap_default_block_shift, 0, tx);
		if (err != 0) {
			vmem_free(mzp, sz);
			return (err);
		}
	}

	dprintf("upgrading obj=%llu with %u chunks\n",
	    (u_longlong_t)zap->zap_object, nchunks);
	/* XXX destroy the tree later, so we can use the stored hash value */
	mze_destroy(zap);

	fzap_upgrade(zap, tx, flags);

	/* Re-add every occupied chunk from the snapshot into the fatzap. */
	zap_name_t *zn = zap_name_alloc(zap, B_FALSE);
	for (int i = 0; i < nchunks; i++) {
		mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
		if (mze->mze_name[0] == 0)
			continue;
		dprintf("adding %s=%llu\n",
		    mze->mze_name, (u_longlong_t)mze->mze_value);
		zap_name_init_str(zn, mze->mze_name, 0);
		/* If we fail here, we would end up losing entries */
		VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
		    tag, tx));
		zap = zn->zn_zap;	/* fzap_add_cd() may change zap */
	}
	zap_name_free(zn);
	vmem_free(mzp, sz);
	*zapp = zap;
	return (0);
}
821 
822 /*
823  * The "normflags" determine the behavior of the matchtype_t which is
824  * passed to zap_lookup_norm().  Names which have the same normalized
825  * version will be stored with the same hash value, and therefore we can
826  * perform normalization-insensitive lookups.  We can be Unicode form-
827  * insensitive and/or case-insensitive.  The following flags are valid for
828  * "normflags":
829  *
830  * U8_TEXTPREP_NFC
831  * U8_TEXTPREP_NFD
832  * U8_TEXTPREP_NFKC
833  * U8_TEXTPREP_NFKD
834  * U8_TEXTPREP_TOUPPER
835  *
836  * The *_NF* (Normalization Form) flags are mutually exclusive; at most one
837  * of them may be supplied.
838  */
/*
 * Initialize the block at offset 0 of `dn` as an empty microzap with the
 * given normalization flags.  If any fatzap-only `flags` are supplied,
 * the new zap is immediately upgraded to a fatzap.
 */
void
mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
{
	dmu_buf_t *db;

	VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH));

	dmu_buf_will_dirty(db, tx);
	mzap_phys_t *zp = db->db_data;
	zp->mz_block_type = ZBT_MICRO;
	/* Salt must be non-zero (zap_hash() asserts it); force the low bit. */
	zp->mz_salt =
	    ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL;
	zp->mz_normflags = normflags;

	if (flags != 0) {
		zap_t *zap;
		/* Only fat zap supports flags; upgrade immediately. */
		VERIFY(dnode_add_ref(dn, FTAG));
		VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER,
		    B_FALSE, B_FALSE, &zap));
		VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
		zap_unlockdir(zap, FTAG);
	} else {
		dmu_buf_rele(db, FTAG);
	}
}
865 
866 static uint64_t
zap_create_impl(objset_t * os,int normflags,zap_flags_t flags,dmu_object_type_t ot,int leaf_blockshift,int indirect_blockshift,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dnode_t ** allocated_dnode,const void * tag,dmu_tx_t * tx)867 zap_create_impl(objset_t *os, int normflags, zap_flags_t flags,
868     dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
869     dmu_object_type_t bonustype, int bonuslen, int dnodesize,
870     dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
871 {
872 	uint64_t obj;
873 
874 	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
875 
876 	if (allocated_dnode == NULL) {
877 		dnode_t *dn;
878 		obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
879 		    indirect_blockshift, bonustype, bonuslen, dnodesize,
880 		    &dn, FTAG, tx);
881 		mzap_create_impl(dn, normflags, flags, tx);
882 		dnode_rele(dn, FTAG);
883 	} else {
884 		obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
885 		    indirect_blockshift, bonustype, bonuslen, dnodesize,
886 		    allocated_dnode, tag, tx);
887 		mzap_create_impl(*allocated_dnode, normflags, flags, tx);
888 	}
889 
890 	return (obj);
891 }
892 
893 int
zap_create_claim(objset_t * os,uint64_t obj,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)894 zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
895     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
896 {
897 	return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen,
898 	    0, tx));
899 }
900 
901 int
zap_create_claim_dnsize(objset_t * os,uint64_t obj,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)902 zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
903     dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
904 {
905 	return (zap_create_claim_norm_dnsize(os, obj,
906 	    0, ot, bonustype, bonuslen, dnodesize, tx));
907 }
908 
909 int
zap_create_claim_norm(objset_t * os,uint64_t obj,int normflags,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)910 zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
911     dmu_object_type_t ot,
912     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
913 {
914 	return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype,
915 	    bonuslen, 0, tx));
916 }
917 
918 int
zap_create_claim_norm_dnsize(objset_t * os,uint64_t obj,int normflags,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)919 zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
920     dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
921     int dnodesize, dmu_tx_t *tx)
922 {
923 	dnode_t *dn;
924 	int error;
925 
926 	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
927 	error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
928 	    dnodesize, tx);
929 	if (error != 0)
930 		return (error);
931 
932 	error = dnode_hold(os, obj, FTAG, &dn);
933 	if (error != 0)
934 		return (error);
935 
936 	mzap_create_impl(dn, normflags, 0, tx);
937 
938 	dnode_rele(dn, FTAG);
939 
940 	return (0);
941 }
942 
943 uint64_t
zap_create(objset_t * os,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)944 zap_create(objset_t *os, dmu_object_type_t ot,
945     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
946 {
947 	return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
948 }
949 
950 uint64_t
zap_create_dnsize(objset_t * os,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)951 zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
952     dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
953 {
954 	return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen,
955 	    dnodesize, tx));
956 }
957 
958 uint64_t
zap_create_norm(objset_t * os,int normflags,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)959 zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
960     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
961 {
962 	return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen,
963 	    0, tx));
964 }
965 
966 uint64_t
zap_create_norm_dnsize(objset_t * os,int normflags,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)967 zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
968     dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
969 {
970 	return (zap_create_impl(os, normflags, 0, ot, 0, 0,
971 	    bonustype, bonuslen, dnodesize, NULL, NULL, tx));
972 }
973 
974 uint64_t
zap_create_flags(objset_t * os,int normflags,zap_flags_t flags,dmu_object_type_t ot,int leaf_blockshift,int indirect_blockshift,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)975 zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
976     dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
977     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
978 {
979 	return (zap_create_flags_dnsize(os, normflags, flags, ot,
980 	    leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
981 }
982 
983 uint64_t
zap_create_flags_dnsize(objset_t * os,int normflags,zap_flags_t flags,dmu_object_type_t ot,int leaf_blockshift,int indirect_blockshift,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)984 zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
985     dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
986     dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
987 {
988 	return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
989 	    indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL,
990 	    tx));
991 }
992 
993 /*
994  * Create a zap object and return a pointer to the newly allocated dnode via
995  * the allocated_dnode argument.  The returned dnode will be held and the
996  * caller is responsible for releasing the hold by calling dnode_rele().
997  */
998 uint64_t
zap_create_hold(objset_t * os,int normflags,zap_flags_t flags,dmu_object_type_t ot,int leaf_blockshift,int indirect_blockshift,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dnode_t ** allocated_dnode,const void * tag,dmu_tx_t * tx)999 zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
1000     dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
1001     dmu_object_type_t bonustype, int bonuslen, int dnodesize,
1002     dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
1003 {
1004 	return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
1005 	    indirect_blockshift, bonustype, bonuslen, dnodesize,
1006 	    allocated_dnode, tag, tx));
1007 }
1008 
1009 int
zap_destroy(objset_t * os,uint64_t zapobj,dmu_tx_t * tx)1010 zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
1011 {
1012 	/*
1013 	 * dmu_object_free will free the object number and free the
1014 	 * data.  Freeing the data will cause our pageout function to be
1015 	 * called, which will destroy our data (zap_leaf_t's and zap_t).
1016 	 */
1017 
1018 	return (dmu_object_free(os, zapobj, tx));
1019 }
1020 
1021 void
zap_evict_sync(void * dbu)1022 zap_evict_sync(void *dbu)
1023 {
1024 	zap_t *zap = dbu;
1025 
1026 	rw_destroy(&zap->zap_rwlock);
1027 
1028 	if (zap->zap_ismicro)
1029 		mze_destroy(zap);
1030 	else
1031 		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
1032 
1033 	kmem_free(zap, sizeof (zap_t));
1034 }
1035 
1036 int
zap_count(objset_t * os,uint64_t zapobj,uint64_t * count)1037 zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
1038 {
1039 	zap_t *zap;
1040 
1041 	int err =
1042 	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1043 	if (err != 0)
1044 		return (err);
1045 	if (!zap->zap_ismicro) {
1046 		err = fzap_count(zap, count);
1047 	} else {
1048 		*count = zap->zap_m.zap_num_entries;
1049 	}
1050 	zap_unlockdir(zap, FTAG);
1051 	return (err);
1052 }
1053 
1054 /*
1055  * zn may be NULL; if not specified, it will be computed if needed.
1056  * See also the comment above zap_entry_normalization_conflict().
1057  */
static boolean_t
mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze,
    zfs_btree_index_t *idx)
{
	boolean_t allocdzn = B_FALSE;	/* did we allocate zn ourselves? */
	mzap_ent_t *other;
	zfs_btree_index_t oidx;

	/* Without normalization flags, no conflict is possible. */
	if (zap->zap_normflags == 0)
		return (B_FALSE);

	/*
	 * Entries with equal hashes are adjacent in the btree, so walk
	 * backwards from `idx` while the hash still matches, checking each
	 * neighbor for a normalized-name match against mze's name.
	 */
	for (other = zfs_btree_prev(&zap->zap_m.zap_tree, idx, &oidx);
	    other && other->mze_hash == mze->mze_hash;
	    other = zfs_btree_prev(&zap->zap_m.zap_tree, &oidx, &oidx)) {

		/* Lazily build zn (with MT_NORMALIZE) from mze's name. */
		if (zn == NULL) {
			zn = zap_name_alloc_str(zap,
			    MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
			allocdzn = B_TRUE;
		}
		if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
			if (allocdzn)
				zap_name_free(zn);
			return (B_TRUE);
		}
	}

	/* Same scan, forwards from `idx`. */
	for (other = zfs_btree_next(&zap->zap_m.zap_tree, idx, &oidx);
	    other && other->mze_hash == mze->mze_hash;
	    other = zfs_btree_next(&zap->zap_m.zap_tree, &oidx, &oidx)) {

		if (zn == NULL) {
			zn = zap_name_alloc_str(zap,
			    MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
			allocdzn = B_TRUE;
		}
		if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
			if (allocdzn)
				zap_name_free(zn);
			return (B_TRUE);
		}
	}

	if (allocdzn)
		zap_name_free(zn);
	return (B_FALSE);
}
1105 
1106 /*
1107  * Routines for manipulating attributes.
1108  */
1109 
1110 int
zap_lookup(objset_t * os,uint64_t zapobj,const char * name,uint64_t integer_size,uint64_t num_integers,void * buf)1111 zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
1112     uint64_t integer_size, uint64_t num_integers, void *buf)
1113 {
1114 	return (zap_lookup_norm(os, zapobj, name, integer_size,
1115 	    num_integers, buf, 0, NULL, 0, NULL));
1116 }
1117 
/*
 * Common lookup path; the caller holds the zap lock.  Fatzap lookups are
 * delegated to fzap_lookup(); a microzap hit copies the single uint64_t
 * value out directly.  Optionally returns the on-disk ("real") name via
 * realname/rn_len and a normalization-conflict flag via ncp.
 */
static int
zap_lookup_impl(zap_t *zap, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf,
    matchtype_t mt, char *realname, int rn_len,
    boolean_t *ncp)
{
	int err = 0;

	zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
	if (zn == NULL)
		return (SET_ERROR(ENOTSUP));

	if (!zap->zap_ismicro) {
		err = fzap_lookup(zn, integer_size, num_integers, buf,
		    realname, rn_len, ncp);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t *mze = mze_find(zn, &idx);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			/* Microzap values are exactly one 8-byte integer. */
			if (num_integers < 1) {
				err = SET_ERROR(EOVERFLOW);
			} else if (integer_size != 8) {
				err = SET_ERROR(EINVAL);
			} else {
				*(uint64_t *)buf =
				    MZE_PHYS(zap, mze)->mze_value;
				if (realname != NULL)
					(void) strlcpy(realname,
					    MZE_PHYS(zap, mze)->mze_name,
					    rn_len);
				if (ncp) {
					*ncp = mzap_normalization_conflict(zap,
					    zn, mze, &idx);
				}
			}
		}
	}
	zap_name_free(zn);
	return (err);
}
1160 
1161 int
zap_lookup_norm(objset_t * os,uint64_t zapobj,const char * name,uint64_t integer_size,uint64_t num_integers,void * buf,matchtype_t mt,char * realname,int rn_len,boolean_t * ncp)1162 zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
1163     uint64_t integer_size, uint64_t num_integers, void *buf,
1164     matchtype_t mt, char *realname, int rn_len,
1165     boolean_t *ncp)
1166 {
1167 	zap_t *zap;
1168 
1169 	int err =
1170 	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1171 	if (err != 0)
1172 		return (err);
1173 	err = zap_lookup_impl(zap, name, integer_size,
1174 	    num_integers, buf, mt, realname, rn_len, ncp);
1175 	zap_unlockdir(zap, FTAG);
1176 	return (err);
1177 }
1178 
1179 int
zap_prefetch(objset_t * os,uint64_t zapobj,const char * name)1180 zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
1181 {
1182 	zap_t *zap;
1183 	int err;
1184 	zap_name_t *zn;
1185 
1186 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1187 	if (err)
1188 		return (err);
1189 	zn = zap_name_alloc_str(zap, name, 0);
1190 	if (zn == NULL) {
1191 		zap_unlockdir(zap, FTAG);
1192 		return (SET_ERROR(ENOTSUP));
1193 	}
1194 
1195 	fzap_prefetch(zn);
1196 	zap_name_free(zn);
1197 	zap_unlockdir(zap, FTAG);
1198 	return (err);
1199 }
1200 
1201 int
zap_prefetch_object(objset_t * os,uint64_t zapobj)1202 zap_prefetch_object(objset_t *os, uint64_t zapobj)
1203 {
1204 	int error;
1205 	dmu_object_info_t doi;
1206 
1207 	error = dmu_object_info(os, zapobj, &doi);
1208 	if (error == 0 && DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
1209 		error = SET_ERROR(EINVAL);
1210 	if (error == 0)
1211 		dmu_prefetch_wait(os, zapobj, 0, doi.doi_max_offset);
1212 
1213 	return (error);
1214 }
1215 
1216 int
zap_lookup_by_dnode(dnode_t * dn,const char * name,uint64_t integer_size,uint64_t num_integers,void * buf)1217 zap_lookup_by_dnode(dnode_t *dn, const char *name,
1218     uint64_t integer_size, uint64_t num_integers, void *buf)
1219 {
1220 	return (zap_lookup_norm_by_dnode(dn, name, integer_size,
1221 	    num_integers, buf, 0, NULL, 0, NULL));
1222 }
1223 
1224 int
zap_lookup_norm_by_dnode(dnode_t * dn,const char * name,uint64_t integer_size,uint64_t num_integers,void * buf,matchtype_t mt,char * realname,int rn_len,boolean_t * ncp)1225 zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
1226     uint64_t integer_size, uint64_t num_integers, void *buf,
1227     matchtype_t mt, char *realname, int rn_len,
1228     boolean_t *ncp)
1229 {
1230 	zap_t *zap;
1231 
1232 	int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
1233 	    FTAG, &zap);
1234 	if (err != 0)
1235 		return (err);
1236 	err = zap_lookup_impl(zap, name, integer_size,
1237 	    num_integers, buf, mt, realname, rn_len, ncp);
1238 	zap_unlockdir(zap, FTAG);
1239 	return (err);
1240 }
1241 
1242 static int
zap_prefetch_uint64_impl(zap_t * zap,const uint64_t * key,int key_numints)1243 zap_prefetch_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints)
1244 {
1245 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1246 	if (zn == NULL) {
1247 		zap_unlockdir(zap, FTAG);
1248 		return (SET_ERROR(ENOTSUP));
1249 	}
1250 
1251 	fzap_prefetch(zn);
1252 	zap_name_free(zn);
1253 	zap_unlockdir(zap, FTAG);
1254 	return (0);
1255 }
1256 
1257 int
zap_prefetch_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints)1258 zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1259     int key_numints)
1260 {
1261 	zap_t *zap;
1262 
1263 	int err =
1264 	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1265 	if (err != 0)
1266 		return (err);
1267 	err = zap_prefetch_uint64_impl(zap, key, key_numints);
1268 	/* zap_prefetch_uint64_impl() calls zap_unlockdir() */
1269 	return (err);
1270 }
1271 
1272 int
zap_prefetch_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints)1273 zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints)
1274 {
1275 	zap_t *zap;
1276 
1277 	int err =
1278 	    zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1279 	if (err != 0)
1280 		return (err);
1281 	err = zap_prefetch_uint64_impl(zap, key, key_numints);
1282 	/* zap_prefetch_uint64_impl() calls zap_unlockdir() */
1283 	return (err);
1284 }
1285 
1286 static int
zap_lookup_uint64_impl(zap_t * zap,const uint64_t * key,int key_numints,uint64_t integer_size,uint64_t num_integers,void * buf)1287 zap_lookup_uint64_impl(zap_t *zap, const uint64_t *key,
1288     int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
1289 {
1290 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1291 	if (zn == NULL) {
1292 		zap_unlockdir(zap, FTAG);
1293 		return (SET_ERROR(ENOTSUP));
1294 	}
1295 
1296 	int err = fzap_lookup(zn, integer_size, num_integers, buf,
1297 	    NULL, 0, NULL);
1298 	zap_name_free(zn);
1299 	zap_unlockdir(zap, FTAG);
1300 	return (err);
1301 }
1302 
1303 int
zap_lookup_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,uint64_t integer_size,uint64_t num_integers,void * buf)1304 zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1305     int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
1306 {
1307 	zap_t *zap;
1308 
1309 	int err =
1310 	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1311 	if (err != 0)
1312 		return (err);
1313 	err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size,
1314 	    num_integers, buf);
1315 	/* zap_lookup_uint64_impl() calls zap_unlockdir() */
1316 	return (err);
1317 }
1318 
1319 int
zap_lookup_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints,uint64_t integer_size,uint64_t num_integers,void * buf)1320 zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
1321     int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
1322 {
1323 	zap_t *zap;
1324 
1325 	int err =
1326 	    zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1327 	if (err != 0)
1328 		return (err);
1329 	err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size,
1330 	    num_integers, buf);
1331 	/* zap_lookup_uint64_impl() calls zap_unlockdir() */
1332 	return (err);
1333 }
1334 
1335 int
zap_contains(objset_t * os,uint64_t zapobj,const char * name)1336 zap_contains(objset_t *os, uint64_t zapobj, const char *name)
1337 {
1338 	int err = zap_lookup_norm(os, zapobj, name, 0,
1339 	    0, NULL, 0, NULL, 0, NULL);
1340 	if (err == EOVERFLOW || err == EINVAL)
1341 		err = 0; /* found, but skipped reading the value */
1342 	return (err);
1343 }
1344 
/*
 * Return the value geometry of entry `name` via *integer_size and
 * *num_integers (either output pointer may be NULL).  Microzap values
 * are always a single 8-byte integer.
 */
int
zap_length(objset_t *os, uint64_t zapobj, const char *name,
    uint64_t *integer_size, uint64_t *num_integers)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_length(zn, integer_size, num_integers);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t *mze = mze_find(zn, &idx);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			/* microzap entries are always one uint64_t */
			if (integer_size)
				*integer_size = 8;
			if (num_integers)
				*num_integers = 1;
		}
	}
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}
1378 
1379 int
zap_length_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,uint64_t * integer_size,uint64_t * num_integers)1380 zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1381     int key_numints, uint64_t *integer_size, uint64_t *num_integers)
1382 {
1383 	zap_t *zap;
1384 
1385 	int err =
1386 	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1387 	if (err != 0)
1388 		return (err);
1389 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1390 	if (zn == NULL) {
1391 		zap_unlockdir(zap, FTAG);
1392 		return (SET_ERROR(ENOTSUP));
1393 	}
1394 	err = fzap_length(zn, integer_size, num_integers);
1395 	zap_name_free(zn);
1396 	zap_unlockdir(zap, FTAG);
1397 	return (err);
1398 }
1399 
/*
 * Insert a new entry into the microzap.  The caller must hold the zap
 * write lock and must have already verified both that the name is absent
 * (asserted below under ZFS_DEBUG) and that a free chunk exists -- if
 * none does, we panic.
 */
static void
mzap_addent(zap_name_t *zn, uint64_t value)
{
	zap_t *zap = zn->zn_zap;
	/* Rotor: resume scanning where the last allocation left off. */
	uint16_t start = zap->zap_m.zap_alloc_next;

	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

#ifdef ZFS_DEBUG
	/* Debug-only: verify the name is not already present. */
	for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
		ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
	}
#endif

	uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash);
	/* given the limited size of the microzap, this can't happen */
	ASSERT(cd < zap_maxcd(zap));

again:
	/*
	 * Scan for a free chunk (empty name) from `start` to the end; if
	 * that fails, wrap once and scan from the beginning.
	 */
	for (uint16_t i = start; i < zap->zap_m.zap_num_chunks; i++) {
		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
		if (mze->mze_name[0] == 0) {
			mze->mze_value = value;
			mze->mze_cd = cd;
			(void) strlcpy(mze->mze_name, zn->zn_key_orig,
			    sizeof (mze->mze_name));
			zap->zap_m.zap_num_entries++;
			zap->zap_m.zap_alloc_next = i+1;
			if (zap->zap_m.zap_alloc_next ==
			    zap->zap_m.zap_num_chunks)
				zap->zap_m.zap_alloc_next = 0;
			/* Mirror the new entry into the in-core btree. */
			mze_insert(zap, i, zn->zn_hash);
			return;
		}
	}
	if (start != 0) {
		start = 0;
		goto again;
	}
	cmn_err(CE_PANIC, "out of entries!");
}
1442 
/*
 * Common add path.  The caller holds the zap write lock; this function
 * consumes it -- zap_unlockdir(tag) is called on every path (unless
 * fzap_add() failed and left us with no zap at all).
 *
 * If the new entry cannot live in the microzap (value is not a single
 * uint64_t, name too long, or the target fatzap leaf would overflow),
 * the zap is upgraded to a fatzap first.
 */
static int
zap_add_impl(zap_t *zap, const char *key,
    int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx, const void *tag)
{
	const uint64_t *intval = val;
	int err = 0;

	zap_name_t *zn = zap_name_alloc_str(zap, key, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
		zap = zn->zn_zap;	/* fzap_add() may change zap */
	} else if (integer_size != 8 || num_integers != 1 ||
	    strlen(key) >= MZAP_NAME_LEN ||
	    !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
		/* Entry doesn't fit a microzap: upgrade, then fzap_add(). */
		err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
		if (err == 0) {
			err = fzap_add(zn, integer_size, num_integers, val,
			    tag, tx);
		}
		zap = zn->zn_zap;	/* fzap_add() may change zap */
	} else {
		zfs_btree_index_t idx;
		if (mze_find(zn, &idx) != NULL) {
			err = SET_ERROR(EEXIST);
		} else {
			mzap_addent(zn, *intval);
		}
	}
	ASSERT(zap == zn->zn_zap);
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_add() failed */
		zap_unlockdir(zap, tag);
	return (err);
}
1482 
1483 int
zap_add(objset_t * os,uint64_t zapobj,const char * key,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1484 zap_add(objset_t *os, uint64_t zapobj, const char *key,
1485     int integer_size, uint64_t num_integers,
1486     const void *val, dmu_tx_t *tx)
1487 {
1488 	zap_t *zap;
1489 	int err;
1490 
1491 	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1492 	if (err != 0)
1493 		return (err);
1494 	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
1495 	/* zap_add_impl() calls zap_unlockdir() */
1496 	return (err);
1497 }
1498 
1499 int
zap_add_by_dnode(dnode_t * dn,const char * key,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1500 zap_add_by_dnode(dnode_t *dn, const char *key,
1501     int integer_size, uint64_t num_integers,
1502     const void *val, dmu_tx_t *tx)
1503 {
1504 	zap_t *zap;
1505 	int err;
1506 
1507 	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1508 	if (err != 0)
1509 		return (err);
1510 	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
1511 	/* zap_add_impl() calls zap_unlockdir() */
1512 	return (err);
1513 }
1514 
1515 static int
zap_add_uint64_impl(zap_t * zap,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx,const void * tag)1516 zap_add_uint64_impl(zap_t *zap, const uint64_t *key,
1517     int key_numints, int integer_size, uint64_t num_integers,
1518     const void *val, dmu_tx_t *tx, const void *tag)
1519 {
1520 	int err;
1521 
1522 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1523 	if (zn == NULL) {
1524 		zap_unlockdir(zap, tag);
1525 		return (SET_ERROR(ENOTSUP));
1526 	}
1527 	err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
1528 	zap = zn->zn_zap;	/* fzap_add() may change zap */
1529 	zap_name_free(zn);
1530 	if (zap != NULL)	/* may be NULL if fzap_add() failed */
1531 		zap_unlockdir(zap, tag);
1532 	return (err);
1533 }
1534 
1535 int
zap_add_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1536 zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1537     int key_numints, int integer_size, uint64_t num_integers,
1538     const void *val, dmu_tx_t *tx)
1539 {
1540 	zap_t *zap;
1541 
1542 	int err =
1543 	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1544 	if (err != 0)
1545 		return (err);
1546 	err = zap_add_uint64_impl(zap, key, key_numints,
1547 	    integer_size, num_integers, val, tx, FTAG);
1548 	/* zap_add_uint64_impl() calls zap_unlockdir() */
1549 	return (err);
1550 }
1551 
1552 int
zap_add_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1553 zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
1554     int key_numints, int integer_size, uint64_t num_integers,
1555     const void *val, dmu_tx_t *tx)
1556 {
1557 	zap_t *zap;
1558 
1559 	int err =
1560 	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1561 	if (err != 0)
1562 		return (err);
1563 	err = zap_add_uint64_impl(zap, key, key_numints,
1564 	    integer_size, num_integers, val, tx, FTAG);
1565 	/* zap_add_uint64_impl() calls zap_unlockdir() */
1566 	return (err);
1567 }
1568 
/*
 * Set `name` to the given value, creating the entry if it does not
 * exist.  A microzap that cannot represent the value (not a single
 * uint64_t, or the name is too long) is upgraded to a fatzap first.
 */
int
zap_update(objset_t *os, uint64_t zapobj, const char *name,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	const uint64_t *intval = val;

	int err =
	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_update(zn, integer_size, num_integers, val,
		    FTAG, tx);
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else if (integer_size != 8 || num_integers != 1 ||
	    strlen(name) >= MZAP_NAME_LEN) {
		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
		    (u_longlong_t)zapobj, integer_size,
		    (u_longlong_t)num_integers, name);
		err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
		if (err == 0) {
			err = fzap_update(zn, integer_size, num_integers,
			    val, FTAG, tx);
		}
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else {
		/* Microzap: overwrite in place, or insert a fresh entry. */
		zfs_btree_index_t idx;
		mzap_ent_t *mze = mze_find(zn, &idx);
		if (mze != NULL) {
			MZE_PHYS(zap, mze)->mze_value = *intval;
		} else {
			mzap_addent(zn, *intval);
		}
	}
	ASSERT(zap == zn->zn_zap);
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if mzap_upgrade()/fzap_update() failed */
		zap_unlockdir(zap, FTAG);
	return (err);
}
1615 
1616 static int
zap_update_uint64_impl(zap_t * zap,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx,const void * tag)1617 zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
1618     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx,
1619     const void *tag)
1620 {
1621 	int err;
1622 
1623 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1624 	if (zn == NULL) {
1625 		zap_unlockdir(zap, tag);
1626 		return (SET_ERROR(ENOTSUP));
1627 	}
1628 	err = fzap_update(zn, integer_size, num_integers, val, tag, tx);
1629 	zap = zn->zn_zap;	/* fzap_update() may change zap */
1630 	zap_name_free(zn);
1631 	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
1632 		zap_unlockdir(zap, tag);
1633 	return (err);
1634 }
1635 
1636 int
zap_update_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1637 zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1638     int key_numints, int integer_size, uint64_t num_integers, const void *val,
1639     dmu_tx_t *tx)
1640 {
1641 	zap_t *zap;
1642 
1643 	int err =
1644 	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1645 	if (err != 0)
1646 		return (err);
1647 	err = zap_update_uint64_impl(zap, key, key_numints,
1648 	    integer_size, num_integers, val, tx, FTAG);
1649 	/* zap_update_uint64_impl() calls zap_unlockdir() */
1650 	return (err);
1651 }
1652 
1653 int
zap_update_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1654 zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
1655     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
1656 {
1657 	zap_t *zap;
1658 
1659 	int err =
1660 	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1661 	if (err != 0)
1662 		return (err);
1663 	err = zap_update_uint64_impl(zap, key, key_numints,
1664 	    integer_size, num_integers, val, tx, FTAG);
1665 	/* zap_update_uint64_impl() calls zap_unlockdir() */
1666 	return (err);
1667 }
1668 
1669 int
zap_remove(objset_t * os,uint64_t zapobj,const char * name,dmu_tx_t * tx)1670 zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
1671 {
1672 	return (zap_remove_norm(os, zapobj, name, 0, tx));
1673 }
1674 
/*
 * Common removal path; the caller holds the zap write lock and is
 * responsible for dropping it.  A microzap entry is erased in place
 * (its chunk zeroed) and removed from the in-core btree; fatzap
 * removal is delegated to fzap_remove().
 */
static int
zap_remove_impl(zap_t *zap, const char *name,
    matchtype_t mt, dmu_tx_t *tx)
{
	int err = 0;

	zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
	if (zn == NULL)
		return (SET_ERROR(ENOTSUP));
	if (!zap->zap_ismicro) {
		err = fzap_remove(zn, tx);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t *mze = mze_find(zn, &idx);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			zap->zap_m.zap_num_entries--;
			/* Zero the on-disk chunk, then drop the btree node. */
			memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t));
			zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx);
		}
	}
	zap_name_free(zn);
	return (err);
}
1700 
1701 int
zap_remove_norm(objset_t * os,uint64_t zapobj,const char * name,matchtype_t mt,dmu_tx_t * tx)1702 zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
1703     matchtype_t mt, dmu_tx_t *tx)
1704 {
1705 	zap_t *zap;
1706 	int err;
1707 
1708 	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1709 	if (err)
1710 		return (err);
1711 	err = zap_remove_impl(zap, name, mt, tx);
1712 	zap_unlockdir(zap, FTAG);
1713 	return (err);
1714 }
1715 
1716 int
zap_remove_by_dnode(dnode_t * dn,const char * name,dmu_tx_t * tx)1717 zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
1718 {
1719 	zap_t *zap;
1720 	int err;
1721 
1722 	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1723 	if (err)
1724 		return (err);
1725 	err = zap_remove_impl(zap, name, 0, tx);
1726 	zap_unlockdir(zap, FTAG);
1727 	return (err);
1728 }
1729 
1730 static int
zap_remove_uint64_impl(zap_t * zap,const uint64_t * key,int key_numints,dmu_tx_t * tx,const void * tag)1731 zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
1732     dmu_tx_t *tx, const void *tag)
1733 {
1734 	int err;
1735 
1736 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1737 	if (zn == NULL) {
1738 		zap_unlockdir(zap, tag);
1739 		return (SET_ERROR(ENOTSUP));
1740 	}
1741 	err = fzap_remove(zn, tx);
1742 	zap_name_free(zn);
1743 	zap_unlockdir(zap, tag);
1744 	return (err);
1745 }
1746 
1747 int
zap_remove_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,dmu_tx_t * tx)1748 zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1749     int key_numints, dmu_tx_t *tx)
1750 {
1751 	zap_t *zap;
1752 
1753 	int err =
1754 	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1755 	if (err != 0)
1756 		return (err);
1757 	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
1758 	/* zap_remove_uint64_impl() calls zap_unlockdir() */
1759 	return (err);
1760 }
1761 
1762 int
zap_remove_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints,dmu_tx_t * tx)1763 zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
1764     dmu_tx_t *tx)
1765 {
1766 	zap_t *zap;
1767 
1768 	int err =
1769 	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1770 	if (err != 0)
1771 		return (err);
1772 	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
1773 	/* zap_remove_uint64_impl() calls zap_unlockdir() */
1774 	return (err);
1775 }
1776 
1777 
1778 static zap_attribute_t *
zap_attribute_alloc_impl(boolean_t longname)1779 zap_attribute_alloc_impl(boolean_t longname)
1780 {
1781 	zap_attribute_t *za;
1782 
1783 	za = kmem_cache_alloc((longname)? zap_attr_long_cache : zap_attr_cache,
1784 	    KM_SLEEP);
1785 	za->za_name_len = (longname)? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN;
1786 	return (za);
1787 }
1788 
1789 zap_attribute_t *
zap_attribute_alloc(void)1790 zap_attribute_alloc(void)
1791 {
1792 	return (zap_attribute_alloc_impl(B_FALSE));
1793 }
1794 
1795 zap_attribute_t *
zap_attribute_long_alloc(void)1796 zap_attribute_long_alloc(void)
1797 {
1798 	return (zap_attribute_alloc_impl(B_TRUE));
1799 }
1800 
/*
 * Return an attribute to the cache it was allocated from; za_name_len
 * identifies the cache (set in zap_attribute_alloc_impl()).
 */
void
zap_attribute_free(zap_attribute_t *za)
{
	if (za->za_name_len == ZAP_MAXNAMELEN) {
		kmem_cache_free(zap_attr_cache, za);
	} else {
		/* anything else must be a long-name attribute */
		ASSERT3U(za->za_name_len, ==, ZAP_MAXNAMELEN_NEW);
		kmem_cache_free(zap_attr_long_cache, za);
	}
}
1811 
1812 /*
1813  * Routines for iterating over the attributes.
1814  */
1815 
/*
 * Common cursor initialization: record the target object and starting
 * position.  No zap or leaf is held yet (zc_zap/zc_leaf start NULL);
 * `serialized` is a position previously produced by
 * zap_cursor_serialize() (0 to start at the beginning).
 */
static void
zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
    uint64_t serialized, boolean_t prefetch)
{
	zc->zc_objset = os;
	zc->zc_zap = NULL;
	zc->zc_leaf = NULL;
	zc->zc_zapobj = zapobj;
	zc->zc_serialized = serialized;
	zc->zc_hash = 0;
	zc->zc_cd = 0;
	zc->zc_prefetch = prefetch;
}
1829 void
zap_cursor_init_serialized(zap_cursor_t * zc,objset_t * os,uint64_t zapobj,uint64_t serialized)1830 zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
1831     uint64_t serialized)
1832 {
1833 	zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
1834 }
1835 
1836 /*
1837  * Initialize a cursor at the beginning of the ZAP object.  The entire
1838  * ZAP object will be prefetched.
1839  */
1840 void
zap_cursor_init(zap_cursor_t * zc,objset_t * os,uint64_t zapobj)1841 zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
1842 {
1843 	zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
1844 }
1845 
1846 /*
1847  * Initialize a cursor at the beginning, but request that we not prefetch
1848  * the entire ZAP object.
1849  */
1850 void
zap_cursor_init_noprefetch(zap_cursor_t * zc,objset_t * os,uint64_t zapobj)1851 zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
1852 {
1853 	zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
1854 }
1855 
/*
 * Tear down a cursor: release the zap directory hold and any held leaf,
 * then clear zc_objset so the cursor cannot be used again without
 * re-initialization.  Safe to call on a cursor that never attached a zap
 * (both pointers are NULL-checked).
 */
void
zap_cursor_fini(zap_cursor_t *zc)
{
	if (zc->zc_zap) {
		/*
		 * NOTE(review): zap_unlockdir() appears to expect
		 * zap_rwlock held on entry and to drop it — confirm
		 * against its definition earlier in this file.
		 */
		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
		zap_unlockdir(zc->zc_zap, NULL);
		zc->zc_zap = NULL;
	}
	if (zc->zc_leaf) {
		/* Presumably zap_put_leaf() likewise consumes l_rwlock. */
		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
		zap_put_leaf(zc->zc_leaf);
		zc->zc_leaf = NULL;
	}
	zc->zc_objset = NULL;
}
1871 
1872 uint64_t
zap_cursor_serialize(zap_cursor_t * zc)1873 zap_cursor_serialize(zap_cursor_t *zc)
1874 {
1875 	if (zc->zc_hash == -1ULL)
1876 		return (-1ULL);
1877 	if (zc->zc_zap == NULL)
1878 		return (zc->zc_serialized);
1879 	ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
1880 	ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
1881 
1882 	/*
1883 	 * We want to keep the high 32 bits of the cursor zero if we can, so
1884 	 * that 32-bit programs can access this.  So usually use a small
1885 	 * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
1886 	 * of the cursor.
1887 	 *
1888 	 * [ collision differentiator | zap_hashbits()-bit hash value ]
1889 	 */
1890 	return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
1891 	    ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
1892 }
1893 
/*
 * Fill in *za with the entry at the cursor's current position, without
 * advancing the cursor.  Returns ENOENT when the cursor is at (or has
 * reached) the end, or an error from zap_lockdir().  On first call after
 * initialization this attaches and locks the zap; the lock is dropped
 * again before returning.
 */
int
zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
{
	int err;

	if (zc->zc_hash == -1ULL)
		return (SET_ERROR(ENOENT));

	if (zc->zc_zap == NULL) {
		int hb;
		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
		    RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
		if (err != 0)
			return (err);

		/*
		 * To support zap_cursor_init_serialized, advance, retrieve,
		 * we must add to the existing zc_cd, which may already
		 * be 1 due to the zap_cursor_advance.
		 */
		ASSERT(zc->zc_hash == 0);
		hb = zap_hashbits(zc->zc_zap);
		/* Unpack the serialized cursor (see zap_cursor_serialize). */
		zc->zc_hash = zc->zc_serialized << (64 - hb);
		zc->zc_cd += zc->zc_serialized >> hb;
		if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
			zc->zc_cd = 0;
	} else {
		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
	}
	if (!zc->zc_zap->zap_ismicro) {
		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t mze_tofind;

		/* Microzap: search the in-core btree by (hash, cd). */
		mze_tofind.mze_hash = zc->zc_hash >> 32;
		mze_tofind.mze_cd = zc->zc_cd;

		mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree,
		    &mze_tofind, &idx);
		if (mze == NULL) {
			/* No exact match; take the next entry in order. */
			mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree,
			    &idx, &idx);
		}
		if (mze) {
			mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
			ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
			za->za_normalization_conflict =
			    mzap_normalization_conflict(zc->zc_zap, NULL,
			    mze, &idx);
			/* Microzap values are always a single uint64_t. */
			za->za_integer_length = 8;
			za->za_num_integers = 1;
			za->za_first_integer = mzep->mze_value;
			(void) strlcpy(za->za_name, mzep->mze_name,
			    za->za_name_len);
			/* Sync the cursor to the entry actually found. */
			zc->zc_hash = (uint64_t)mze->mze_hash << 32;
			zc->zc_cd = mze->mze_cd;
			err = 0;
		} else {
			/* Ran off the end; mark the cursor exhausted. */
			zc->zc_hash = -1ULL;
			err = SET_ERROR(ENOENT);
		}
	}
	rw_exit(&zc->zc_zap->zap_rwlock);
	return (err);
}
1960 
1961 void
zap_cursor_advance(zap_cursor_t * zc)1962 zap_cursor_advance(zap_cursor_t *zc)
1963 {
1964 	if (zc->zc_hash == -1ULL)
1965 		return;
1966 	zc->zc_cd++;
1967 }
1968 
1969 int
zap_get_stats(objset_t * os,uint64_t zapobj,zap_stats_t * zs)1970 zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
1971 {
1972 	zap_t *zap;
1973 
1974 	int err =
1975 	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1976 	if (err != 0)
1977 		return (err);
1978 
1979 	memset(zs, 0, sizeof (zap_stats_t));
1980 
1981 	if (zap->zap_ismicro) {
1982 		zs->zs_blocksize = zap->zap_dbuf->db_size;
1983 		zs->zs_num_entries = zap->zap_m.zap_num_entries;
1984 		zs->zs_num_blocks = 1;
1985 	} else {
1986 		fzap_get_stats(zap, zs);
1987 	}
1988 	zap_unlockdir(zap, FTAG);
1989 	return (0);
1990 }
1991 
#if defined(_KERNEL)
/* ZAP API symbols exported for use by other kernel modules. */
EXPORT_SYMBOL(zap_create);
EXPORT_SYMBOL(zap_create_dnsize);
EXPORT_SYMBOL(zap_create_norm);
EXPORT_SYMBOL(zap_create_norm_dnsize);
EXPORT_SYMBOL(zap_create_flags);
EXPORT_SYMBOL(zap_create_flags_dnsize);
EXPORT_SYMBOL(zap_create_claim);
EXPORT_SYMBOL(zap_create_claim_norm);
EXPORT_SYMBOL(zap_create_claim_norm_dnsize);
EXPORT_SYMBOL(zap_create_hold);
EXPORT_SYMBOL(zap_destroy);
EXPORT_SYMBOL(zap_lookup);
EXPORT_SYMBOL(zap_lookup_by_dnode);
EXPORT_SYMBOL(zap_lookup_norm);
EXPORT_SYMBOL(zap_lookup_uint64);
EXPORT_SYMBOL(zap_contains);
EXPORT_SYMBOL(zap_prefetch);
EXPORT_SYMBOL(zap_prefetch_uint64);
EXPORT_SYMBOL(zap_prefetch_object);
EXPORT_SYMBOL(zap_add);
EXPORT_SYMBOL(zap_add_by_dnode);
EXPORT_SYMBOL(zap_add_uint64);
EXPORT_SYMBOL(zap_add_uint64_by_dnode);
EXPORT_SYMBOL(zap_update);
EXPORT_SYMBOL(zap_update_uint64);
EXPORT_SYMBOL(zap_update_uint64_by_dnode);
EXPORT_SYMBOL(zap_length);
EXPORT_SYMBOL(zap_length_uint64);
EXPORT_SYMBOL(zap_remove);
EXPORT_SYMBOL(zap_remove_by_dnode);
EXPORT_SYMBOL(zap_remove_norm);
EXPORT_SYMBOL(zap_remove_uint64);
EXPORT_SYMBOL(zap_remove_uint64_by_dnode);
EXPORT_SYMBOL(zap_count);
EXPORT_SYMBOL(zap_value_search);
EXPORT_SYMBOL(zap_join);
EXPORT_SYMBOL(zap_join_increment);
EXPORT_SYMBOL(zap_add_int);
EXPORT_SYMBOL(zap_remove_int);
EXPORT_SYMBOL(zap_lookup_int);
EXPORT_SYMBOL(zap_increment_int);
EXPORT_SYMBOL(zap_add_int_key);
EXPORT_SYMBOL(zap_lookup_int_key);
EXPORT_SYMBOL(zap_increment);
/* Cursor (iteration) interface. */
EXPORT_SYMBOL(zap_cursor_init);
EXPORT_SYMBOL(zap_cursor_fini);
EXPORT_SYMBOL(zap_cursor_retrieve);
EXPORT_SYMBOL(zap_cursor_advance);
EXPORT_SYMBOL(zap_cursor_serialize);
EXPORT_SYMBOL(zap_cursor_init_serialized);
EXPORT_SYMBOL(zap_get_stats);

/* Tunable: see the zap_micro_max_size discussion at the top of this file. */
ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW,
	"Maximum micro ZAP size before converting to a fat ZAP, "
	    "in bytes (max 1M)");
#endif
2049