xref: /freebsd/sys/contrib/openzfs/module/zfs/zap_micro.c (revision 8ac904ce090b1c2e355da8aa122ca2252183f4e1)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
26  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
27  * Copyright 2017 Nexenta Systems, Inc.
28  * Copyright (c) 2024, Klara, Inc.
29  */
30 
31 #include <sys/zio.h>
32 #include <sys/spa.h>
33 #include <sys/dmu.h>
34 #include <sys/zfs_context.h>
35 #include <sys/zap.h>
36 #include <sys/zap_impl.h>
37 #include <sys/zap_leaf.h>
38 #include <sys/btree.h>
39 #include <sys/arc.h>
40 #include <sys/dmu_objset.h>
41 #include <sys/spa_impl.h>
42 
43 #ifdef _KERNEL
44 #include <sys/sunddi.h>
45 #endif
46 
/*
 * The maximum size (in bytes) of a microzap before it is converted to a
 * fatzap. It will be rounded up to the next multiple of 512
 * (SPA_MINBLOCKSIZE).
 *
 * By definition, a microzap must fit into a single block, so this has
 * traditionally been SPA_OLD_MAXBLOCKSIZE, and is set to that by default.
 * Setting this higher requires both the large_blocks feature (to even create
 * blocks that large) and the large_microzap feature (to enable the stream
 * machinery to understand not to try to split a microzap block).
 *
 * If large_microzap is enabled, this value will be clamped to
 * spa_maxblocksize(), up to 1M. If not, it will be clamped to
 * SPA_OLD_MAXBLOCKSIZE.
 *
 * This variable holds the raw, unclamped value; the rounding and clamping
 * described above are applied by zap_get_micro_max_size().
 */
static int zap_micro_max_size = SPA_OLD_MAXBLOCKSIZE;
62 
/*
 * Hard upper bound (in bytes) on the size of a microzap block.
 *
 * The 1M upper limit is necessary because the count of chunks in a microzap
 * block is stored as a uint16_t (mze_chunkid). Each chunk is 64 bytes, and the
 * first is used to store a header, so there are 32767 usable chunks, which is
 * just under 2M. 1M is the largest power-of-two block size under 2M, so we
 * must set the limit there.
 */
#define	MZAP_MAX_SIZE	(1048576)
71 
72 uint64_t
zap_get_micro_max_size(spa_t * spa)73 zap_get_micro_max_size(spa_t *spa)
74 {
75 	uint64_t maxsz = MIN(MZAP_MAX_SIZE,
76 	    P2ROUNDUP(zap_micro_max_size, SPA_MINBLOCKSIZE));
77 	if (maxsz <= SPA_OLD_MAXBLOCKSIZE)
78 		return (maxsz);
79 	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_MICROZAP))
80 		return (MIN(maxsz, spa_maxblocksize(spa)));
81 	return (SPA_OLD_MAXBLOCKSIZE);
82 }
83 
/*
 * Forward declaration: mzap_upgrade() is defined further down in this file
 * but is called earlier, from zap_lockdir_impl().
 */
static int mzap_upgrade(zap_t **zapp,
    const void *tag, dmu_tx_t *tx, zap_flags_t flags);
86 
87 uint64_t
zap_getflags(zap_t * zap)88 zap_getflags(zap_t *zap)
89 {
90 	if (zap->zap_ismicro)
91 		return (0);
92 	return (zap_f_phys(zap)->zap_flags);
93 }
94 
95 int
zap_hashbits(zap_t * zap)96 zap_hashbits(zap_t *zap)
97 {
98 	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
99 		return (48);
100 	else
101 		return (28);
102 }
103 
104 uint32_t
zap_maxcd(zap_t * zap)105 zap_maxcd(zap_t *zap)
106 {
107 	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
108 		return ((1<<16)-1);
109 	else
110 		return (-1U);
111 }
112 
113 static uint64_t
zap_hash(zap_name_t * zn)114 zap_hash(zap_name_t *zn)
115 {
116 	zap_t *zap = zn->zn_zap;
117 	uint64_t h = 0;
118 
119 	if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
120 		ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
121 		h = *(uint64_t *)zn->zn_key_orig;
122 	} else {
123 		h = zap->zap_salt;
124 		ASSERT(h != 0);
125 		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
126 
127 		if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
128 			const uint64_t *wp = zn->zn_key_norm;
129 
130 			ASSERT(zn->zn_key_intlen == 8);
131 			for (int i = 0; i < zn->zn_key_norm_numints;
132 			    wp++, i++) {
133 				uint64_t word = *wp;
134 
135 				for (int j = 0; j < 8; j++) {
136 					h = (h >> 8) ^
137 					    zfs_crc64_table[(h ^ word) & 0xFF];
138 					word >>= NBBY;
139 				}
140 			}
141 		} else {
142 			const uint8_t *cp = zn->zn_key_norm;
143 
144 			/*
145 			 * We previously stored the terminating null on
146 			 * disk, but didn't hash it, so we need to
147 			 * continue to not hash it.  (The
148 			 * zn_key_*_numints includes the terminating
149 			 * null for non-binary keys.)
150 			 */
151 			int len = zn->zn_key_norm_numints - 1;
152 
153 			ASSERT(zn->zn_key_intlen == 1);
154 			for (int i = 0; i < len; cp++, i++) {
155 				h = (h >> 8) ^
156 				    zfs_crc64_table[(h ^ *cp) & 0xFF];
157 			}
158 		}
159 	}
160 	/*
161 	 * Don't use all 64 bits, since we need some in the cookie for
162 	 * the collision differentiator.  We MUST use the high bits,
163 	 * since those are the ones that we first pay attention to when
164 	 * choosing the bucket.
165 	 */
166 	h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
167 
168 	return (h);
169 }
170 
171 static int
zap_normalize(zap_t * zap,const char * name,char * namenorm,int normflags,size_t outlen)172 zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags,
173     size_t outlen)
174 {
175 	ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
176 
177 	size_t inlen = strlen(name) + 1;
178 
179 	int err = 0;
180 	(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
181 	    normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
182 	    U8_UNICODE_LATEST, &err);
183 
184 	return (err);
185 }
186 
187 boolean_t
zap_match(zap_name_t * zn,const char * matchname)188 zap_match(zap_name_t *zn, const char *matchname)
189 {
190 	boolean_t res = B_FALSE;
191 	ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
192 
193 	if (zn->zn_matchtype & MT_NORMALIZE) {
194 		size_t namelen = zn->zn_normbuf_len;
195 		char normbuf[ZAP_MAXNAMELEN];
196 		char *norm = normbuf;
197 
198 		/*
199 		 * Cannot allocate this on-stack as it exceed the stack-limit of
200 		 * 1024.
201 		 */
202 		if (namelen > ZAP_MAXNAMELEN)
203 			norm = kmem_alloc(namelen, KM_SLEEP);
204 
205 		if (zap_normalize(zn->zn_zap, matchname, norm,
206 		    zn->zn_normflags, namelen) != 0) {
207 			res = B_FALSE;
208 		} else {
209 			res = (strcmp(zn->zn_key_norm, norm) == 0);
210 		}
211 		if (norm != normbuf)
212 			kmem_free(norm, namelen);
213 	} else {
214 		res = (strcmp(zn->zn_key_orig, matchname) == 0);
215 	}
216 	return (res);
217 }
218 
/*
 * kmem caches created by zap_init() and destroyed by zap_fini().  The
 * "long" variants are sized with ZAP_MAXNAMELEN_NEW instead of
 * ZAP_MAXNAMELEN.
 */
static kmem_cache_t *zap_name_cache;
static kmem_cache_t *zap_attr_cache;
static kmem_cache_t *zap_name_long_cache;
static kmem_cache_t *zap_attr_long_cache;
223 
224 void
zap_init(void)225 zap_init(void)
226 {
227 	zap_name_cache = kmem_cache_create("zap_name",
228 	    sizeof (zap_name_t) + ZAP_MAXNAMELEN, 0, NULL, NULL,
229 	    NULL, NULL, NULL, 0);
230 
231 	zap_attr_cache = kmem_cache_create("zap_attr_cache",
232 	    sizeof (zap_attribute_t) + ZAP_MAXNAMELEN,  0, NULL,
233 	    NULL, NULL, NULL, NULL, 0);
234 
235 	zap_name_long_cache = kmem_cache_create("zap_name_long",
236 	    sizeof (zap_name_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, NULL,
237 	    NULL, NULL, NULL, 0);
238 
239 	zap_attr_long_cache = kmem_cache_create("zap_attr_long_cache",
240 	    sizeof (zap_attribute_t) + ZAP_MAXNAMELEN_NEW,  0, NULL,
241 	    NULL, NULL, NULL, NULL, 0);
242 }
243 
244 void
zap_fini(void)245 zap_fini(void)
246 {
247 	kmem_cache_destroy(zap_name_cache);
248 	kmem_cache_destroy(zap_attr_cache);
249 	kmem_cache_destroy(zap_name_long_cache);
250 	kmem_cache_destroy(zap_attr_long_cache);
251 }
252 
253 static zap_name_t *
zap_name_alloc(zap_t * zap,boolean_t longname)254 zap_name_alloc(zap_t *zap, boolean_t longname)
255 {
256 	kmem_cache_t *cache = longname ? zap_name_long_cache : zap_name_cache;
257 	zap_name_t *zn = kmem_cache_alloc(cache, KM_SLEEP);
258 
259 	zn->zn_zap = zap;
260 	zn->zn_normbuf_len = longname ? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN;
261 	return (zn);
262 }
263 
264 void
zap_name_free(zap_name_t * zn)265 zap_name_free(zap_name_t *zn)
266 {
267 	if (zn->zn_normbuf_len == ZAP_MAXNAMELEN) {
268 		kmem_cache_free(zap_name_cache, zn);
269 	} else {
270 		ASSERT3U(zn->zn_normbuf_len, ==, ZAP_MAXNAMELEN_NEW);
271 		kmem_cache_free(zap_name_long_cache, zn);
272 	}
273 }
274 
275 static int
zap_name_init_str(zap_name_t * zn,const char * key,matchtype_t mt)276 zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt)
277 {
278 	zap_t *zap = zn->zn_zap;
279 	size_t key_len = strlen(key) + 1;
280 
281 	/* Make sure zn is allocated for longname if key is long */
282 	IMPLY(key_len > ZAP_MAXNAMELEN,
283 	    zn->zn_normbuf_len == ZAP_MAXNAMELEN_NEW);
284 
285 	zn->zn_key_intlen = sizeof (*key);
286 	zn->zn_key_orig = key;
287 	zn->zn_key_orig_numints = key_len;
288 	zn->zn_matchtype = mt;
289 	zn->zn_normflags = zap->zap_normflags;
290 
291 	/*
292 	 * If we're dealing with a case sensitive lookup on a mixed or
293 	 * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
294 	 * will fold case to all caps overriding the lookup request.
295 	 */
296 	if (mt & MT_MATCH_CASE)
297 		zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;
298 
299 	if (zap->zap_normflags) {
300 		/*
301 		 * We *must* use zap_normflags because this normalization is
302 		 * what the hash is computed from.
303 		 */
304 		if (zap_normalize(zap, key, zn->zn_normbuf,
305 		    zap->zap_normflags, zn->zn_normbuf_len) != 0)
306 			return (SET_ERROR(ENOTSUP));
307 		zn->zn_key_norm = zn->zn_normbuf;
308 		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
309 	} else {
310 		if (mt != 0)
311 			return (SET_ERROR(ENOTSUP));
312 		zn->zn_key_norm = zn->zn_key_orig;
313 		zn->zn_key_norm_numints = zn->zn_key_orig_numints;
314 	}
315 
316 	zn->zn_hash = zap_hash(zn);
317 
318 	if (zap->zap_normflags != zn->zn_normflags) {
319 		/*
320 		 * We *must* use zn_normflags because this normalization is
321 		 * what the matching is based on.  (Not the hash!)
322 		 */
323 		if (zap_normalize(zap, key, zn->zn_normbuf,
324 		    zn->zn_normflags, zn->zn_normbuf_len) != 0)
325 			return (SET_ERROR(ENOTSUP));
326 		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
327 	}
328 
329 	return (0);
330 }
331 
332 zap_name_t *
zap_name_alloc_str(zap_t * zap,const char * key,matchtype_t mt)333 zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt)
334 {
335 	size_t key_len = strlen(key) + 1;
336 	zap_name_t *zn = zap_name_alloc(zap, (key_len > ZAP_MAXNAMELEN));
337 	if (zap_name_init_str(zn, key, mt) != 0) {
338 		zap_name_free(zn);
339 		return (NULL);
340 	}
341 	return (zn);
342 }
343 
344 static zap_name_t *
zap_name_alloc_uint64(zap_t * zap,const uint64_t * key,int numints)345 zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
346 {
347 	zap_name_t *zn = kmem_cache_alloc(zap_name_cache, KM_SLEEP);
348 
349 	ASSERT0(zap->zap_normflags);
350 	zn->zn_zap = zap;
351 	zn->zn_key_intlen = sizeof (*key);
352 	zn->zn_key_orig = zn->zn_key_norm = key;
353 	zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
354 	zn->zn_matchtype = 0;
355 	zn->zn_normbuf_len = ZAP_MAXNAMELEN;
356 
357 	zn->zn_hash = zap_hash(zn);
358 	return (zn);
359 }
360 
361 static void
mzap_byteswap(mzap_phys_t * buf,size_t size)362 mzap_byteswap(mzap_phys_t *buf, size_t size)
363 {
364 	buf->mz_block_type = BSWAP_64(buf->mz_block_type);
365 	buf->mz_salt = BSWAP_64(buf->mz_salt);
366 	buf->mz_normflags = BSWAP_64(buf->mz_normflags);
367 	int max = (size / MZAP_ENT_LEN) - 1;
368 	for (int i = 0; i < max; i++) {
369 		buf->mz_chunk[i].mze_value =
370 		    BSWAP_64(buf->mz_chunk[i].mze_value);
371 		buf->mz_chunk[i].mze_cd =
372 		    BSWAP_32(buf->mz_chunk[i].mze_cd);
373 	}
374 }
375 
376 void
zap_byteswap(void * buf,size_t size)377 zap_byteswap(void *buf, size_t size)
378 {
379 	uint64_t block_type = *(uint64_t *)buf;
380 
381 	if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
382 		/* ASSERT(magic == ZAP_LEAF_MAGIC); */
383 		mzap_byteswap(buf, size);
384 	} else {
385 		fzap_byteswap(buf, size);
386 	}
387 }
388 
389 __attribute__((always_inline)) inline
390 static int
mze_compare(const void * arg1,const void * arg2)391 mze_compare(const void *arg1, const void *arg2)
392 {
393 	const mzap_ent_t *mze1 = arg1;
394 	const mzap_ent_t *mze2 = arg2;
395 
396 	return (TREE_CMP((uint64_t)(mze1->mze_hash) << 32 | mze1->mze_cd,
397 	    (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd));
398 }
399 
/*
 * Instantiate mze_find_in_buf for mzap_ent_t elements compared with
 * mze_compare(); it is handed to zfs_btree_create_custom() in mzap_open().
 */
ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t,
    mze_compare)
402 
403 static void
404 mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash)
405 {
406 	mzap_ent_t mze;
407 
408 	ASSERT(zap->zap_ismicro);
409 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
410 
411 	mze.mze_chunkid = chunkid;
412 	ASSERT0(hash & 0xffffffff);
413 	mze.mze_hash = hash >> 32;
414 	ASSERT3U(MZE_PHYS(zap, &mze)->mze_cd, <=, 0xffff);
415 	mze.mze_cd = (uint16_t)MZE_PHYS(zap, &mze)->mze_cd;
416 	ASSERT(MZE_PHYS(zap, &mze)->mze_name[0] != 0);
417 	zfs_btree_add(&zap->zap_m.zap_tree, &mze);
418 }
419 
420 static mzap_ent_t *
mze_find(zap_name_t * zn,zfs_btree_index_t * idx)421 mze_find(zap_name_t *zn, zfs_btree_index_t *idx)
422 {
423 	mzap_ent_t mze_tofind;
424 	mzap_ent_t *mze;
425 	zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree;
426 
427 	ASSERT(zn->zn_zap->zap_ismicro);
428 	ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
429 
430 	ASSERT0(zn->zn_hash & 0xffffffff);
431 	mze_tofind.mze_hash = zn->zn_hash >> 32;
432 	mze_tofind.mze_cd = 0;
433 
434 	mze = zfs_btree_find(tree, &mze_tofind, idx);
435 	if (mze == NULL)
436 		mze = zfs_btree_next(tree, idx, idx);
437 	for (; mze && mze->mze_hash == mze_tofind.mze_hash;
438 	    mze = zfs_btree_next(tree, idx, idx)) {
439 		ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
440 		if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
441 			return (mze);
442 	}
443 
444 	return (NULL);
445 }
446 
447 static uint32_t
mze_find_unused_cd(zap_t * zap,uint64_t hash)448 mze_find_unused_cd(zap_t *zap, uint64_t hash)
449 {
450 	mzap_ent_t mze_tofind;
451 	zfs_btree_index_t idx;
452 	zfs_btree_t *tree = &zap->zap_m.zap_tree;
453 
454 	ASSERT(zap->zap_ismicro);
455 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
456 
457 	ASSERT0(hash & 0xffffffff);
458 	hash >>= 32;
459 	mze_tofind.mze_hash = hash;
460 	mze_tofind.mze_cd = 0;
461 
462 	uint32_t cd = 0;
463 	for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
464 	    mze && mze->mze_hash == hash;
465 	    mze = zfs_btree_next(tree, &idx, &idx)) {
466 		if (mze->mze_cd != cd)
467 			break;
468 		cd++;
469 	}
470 
471 	return (cd);
472 }
473 
/*
 * Number of chunks budgeted per microzap entry when checking whether
 * colliding entries fit in a fatzap leaf (see mze_canfit_fzap_leaf()):
 * one chunk, plus the array chunks for a name of MZAP_NAME_LEN bytes, plus
 * the array chunks for a single uint64_t value.
 *
 * Each mzap entry requires at max : 4 chunks
 * 3 chunks for names + 1 chunk for value.
 * NOTE(review): the macro also adds a leading 1 on top of the name and
 * value chunks -- confirm whether the "4" above accounts for it.
 */
#define	MZAP_ENT_CHUNKS	(1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \
	ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t)))
480 
481 /*
482  * Check if the current entry keeps the colliding entries under the fatzap leaf
483  * size.
484  */
485 static boolean_t
mze_canfit_fzap_leaf(zap_name_t * zn,uint64_t hash)486 mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
487 {
488 	zap_t *zap = zn->zn_zap;
489 	mzap_ent_t mze_tofind;
490 	zfs_btree_index_t idx;
491 	zfs_btree_t *tree = &zap->zap_m.zap_tree;
492 	uint32_t mzap_ents = 0;
493 
494 	ASSERT0(hash & 0xffffffff);
495 	hash >>= 32;
496 	mze_tofind.mze_hash = hash;
497 	mze_tofind.mze_cd = 0;
498 
499 	for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
500 	    mze && mze->mze_hash == hash;
501 	    mze = zfs_btree_next(tree, &idx, &idx)) {
502 		mzap_ents++;
503 	}
504 
505 	/* Include the new entry being added */
506 	mzap_ents++;
507 
508 	return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS));
509 }
510 
511 static void
mze_destroy(zap_t * zap)512 mze_destroy(zap_t *zap)
513 {
514 	zfs_btree_clear(&zap->zap_m.zap_tree);
515 	zfs_btree_destroy(&zap->zap_m.zap_tree);
516 }
517 
518 static zap_t *
mzap_open(dmu_buf_t * db)519 mzap_open(dmu_buf_t *db)
520 {
521 	zap_t *winner;
522 	uint64_t *zap_hdr = (uint64_t *)db->db_data;
523 	uint64_t zap_block_type = zap_hdr[0];
524 	uint64_t zap_magic = zap_hdr[1];
525 
526 	ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
527 
528 	zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
529 	rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL);
530 	rw_enter(&zap->zap_rwlock, RW_WRITER);
531 	zap->zap_objset = dmu_buf_get_objset(db);
532 	zap->zap_object = db->db_object;
533 	zap->zap_dbuf = db;
534 
535 	if (zap_block_type != ZBT_MICRO) {
536 		mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT,
537 		    0);
538 		zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
539 		if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) {
540 			winner = NULL;	/* No actual winner here... */
541 			goto handle_winner;
542 		}
543 	} else {
544 		zap->zap_ismicro = TRUE;
545 	}
546 
547 	/*
548 	 * Make sure that zap_ismicro is set before we let others see
549 	 * it, because zap_lockdir() checks zap_ismicro without the lock
550 	 * held.
551 	 */
552 	dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
553 	winner = dmu_buf_set_user(db, &zap->zap_dbu);
554 
555 	if (winner != NULL)
556 		goto handle_winner;
557 
558 	if (zap->zap_ismicro) {
559 		zap->zap_salt = zap_m_phys(zap)->mz_salt;
560 		zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
561 		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
562 
563 		/*
564 		 * Reduce B-tree leaf from 4KB to 512 bytes to reduce memmove()
565 		 * overhead on massive inserts below.  It still allows to store
566 		 * 62 entries before we have to add 2KB B-tree core node.
567 		 */
568 		zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare,
569 		    mze_find_in_buf, sizeof (mzap_ent_t), 512);
570 
571 		zap_name_t *zn = zap_name_alloc(zap, B_FALSE);
572 		for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) {
573 			mzap_ent_phys_t *mze =
574 			    &zap_m_phys(zap)->mz_chunk[i];
575 			if (mze->mze_name[0]) {
576 				zap->zap_m.zap_num_entries++;
577 				zap_name_init_str(zn, mze->mze_name, 0);
578 				mze_insert(zap, i, zn->zn_hash);
579 			}
580 		}
581 		zap_name_free(zn);
582 	} else {
583 		zap->zap_salt = zap_f_phys(zap)->zap_salt;
584 		zap->zap_normflags = zap_f_phys(zap)->zap_normflags;
585 
586 		ASSERT3U(sizeof (struct zap_leaf_header), ==,
587 		    2*ZAP_LEAF_CHUNKSIZE);
588 
589 		/*
590 		 * The embedded pointer table should not overlap the
591 		 * other members.
592 		 */
593 		ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
594 		    &zap_f_phys(zap)->zap_salt);
595 
596 		/*
597 		 * The embedded pointer table should end at the end of
598 		 * the block
599 		 */
600 		ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
601 		    1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
602 		    (uintptr_t)zap_f_phys(zap), ==,
603 		    zap->zap_dbuf->db_size);
604 	}
605 	rw_exit(&zap->zap_rwlock);
606 	return (zap);
607 
608 handle_winner:
609 	rw_exit(&zap->zap_rwlock);
610 	rw_destroy(&zap->zap_rwlock);
611 	if (!zap->zap_ismicro)
612 		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
613 	kmem_free(zap, sizeof (zap_t));
614 	return (winner);
615 }
616 
617 /*
618  * This routine "consumes" the caller's hold on the dbuf, which must
619  * have the specified tag.
620  */
621 static int
zap_lockdir_impl(dnode_t * dn,dmu_buf_t * db,const void * tag,dmu_tx_t * tx,krw_t lti,boolean_t fatreader,boolean_t adding,zap_t ** zapp)622 zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
623     krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
624 {
625 	ASSERT0(db->db_offset);
626 	objset_t *os = dmu_buf_get_objset(db);
627 	uint64_t obj = db->db_object;
628 
629 	*zapp = NULL;
630 
631 	if (DMU_OT_BYTESWAP(dn->dn_type) != DMU_BSWAP_ZAP)
632 		return (SET_ERROR(EINVAL));
633 
634 	zap_t *zap = dmu_buf_get_user(db);
635 	if (zap == NULL) {
636 		zap = mzap_open(db);
637 		if (zap == NULL) {
638 			/*
639 			 * mzap_open() didn't like what it saw on-disk.
640 			 * Check for corruption!
641 			 */
642 			return (SET_ERROR(EIO));
643 		}
644 	}
645 
646 	/*
647 	 * We're checking zap_ismicro without the lock held, in order to
648 	 * tell what type of lock we want.  Once we have some sort of
649 	 * lock, see if it really is the right type.  In practice this
650 	 * can only be different if it was upgraded from micro to fat,
651 	 * and micro wanted WRITER but fat only needs READER.
652 	 */
653 	krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
654 	rw_enter(&zap->zap_rwlock, lt);
655 	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
656 		/* it was upgraded, now we only need reader */
657 		ASSERT(lt == RW_WRITER);
658 		ASSERT(RW_READER ==
659 		    ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
660 		rw_downgrade(&zap->zap_rwlock);
661 		lt = RW_READER;
662 	}
663 
664 	zap->zap_objset = os;
665 	zap->zap_dnode = dn;
666 
667 	if (lt == RW_WRITER)
668 		dmu_buf_will_dirty(db, tx);
669 
670 	ASSERT3P(zap->zap_dbuf, ==, db);
671 
672 	ASSERT(!zap->zap_ismicro ||
673 	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
674 	if (zap->zap_ismicro && tx && adding &&
675 	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
676 		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
677 		if (newsz > zap_get_micro_max_size(dmu_objset_spa(os))) {
678 			dprintf("upgrading obj %llu: num_entries=%u\n",
679 			    (u_longlong_t)obj, zap->zap_m.zap_num_entries);
680 			*zapp = zap;
681 			int err = mzap_upgrade(zapp, tag, tx, 0);
682 			if (err != 0)
683 				rw_exit(&zap->zap_rwlock);
684 			return (err);
685 		}
686 		VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
687 		zap->zap_m.zap_num_chunks =
688 		    db->db_size / MZAP_ENT_LEN - 1;
689 
690 		if (newsz > SPA_OLD_MAXBLOCKSIZE) {
691 			dsl_dataset_t *ds = dmu_objset_ds(os);
692 			if (!dsl_dataset_feature_is_active(ds,
693 			    SPA_FEATURE_LARGE_MICROZAP)) {
694 				/*
695 				 * A microzap just grew beyond the old limit
696 				 * for the first time, so we have to ensure the
697 				 * feature flag is activated.
698 				 * zap_get_micro_max_size() won't let us get
699 				 * here if the feature is not enabled, so we
700 				 * don't need any other checks beforehand.
701 				 *
702 				 * Since we're in open context, we can't
703 				 * activate the feature directly, so we instead
704 				 * flag it on the dataset for next sync.
705 				 */
706 				dsl_dataset_dirty(ds, tx);
707 				mutex_enter(&ds->ds_lock);
708 				ds->ds_feature_activation
709 				    [SPA_FEATURE_LARGE_MICROZAP] =
710 				    (void *)B_TRUE;
711 				mutex_exit(&ds->ds_lock);
712 			}
713 		}
714 	}
715 
716 	*zapp = zap;
717 	return (0);
718 }
719 
720 static int
zap_lockdir_by_dnode(dnode_t * dn,dmu_tx_t * tx,krw_t lti,boolean_t fatreader,boolean_t adding,const void * tag,zap_t ** zapp)721 zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
722     krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
723     zap_t **zapp)
724 {
725 	dmu_buf_t *db;
726 	int err;
727 
728 	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
729 	if (err != 0)
730 		return (err);
731 	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
732 	if (err != 0)
733 		dmu_buf_rele(db, tag);
734 	else
735 		VERIFY(dnode_add_ref(dn, tag));
736 	return (err);
737 }
738 
739 int
zap_lockdir(objset_t * os,uint64_t obj,dmu_tx_t * tx,krw_t lti,boolean_t fatreader,boolean_t adding,const void * tag,zap_t ** zapp)740 zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
741     krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
742     zap_t **zapp)
743 {
744 	dnode_t *dn;
745 	dmu_buf_t *db;
746 	int err;
747 
748 	err = dnode_hold(os, obj, tag, &dn);
749 	if (err != 0)
750 		return (err);
751 	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
752 	if (err != 0) {
753 		dnode_rele(dn, tag);
754 		return (err);
755 	}
756 	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
757 	if (err != 0) {
758 		dmu_buf_rele(db, tag);
759 		dnode_rele(dn, tag);
760 	}
761 	return (err);
762 }
763 
764 void
zap_unlockdir(zap_t * zap,const void * tag)765 zap_unlockdir(zap_t *zap, const void *tag)
766 {
767 	rw_exit(&zap->zap_rwlock);
768 	dnode_rele(zap->zap_dnode, tag);
769 	dmu_buf_rele(zap->zap_dbuf, tag);
770 }
771 
772 static int
mzap_upgrade(zap_t ** zapp,const void * tag,dmu_tx_t * tx,zap_flags_t flags)773 mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags)
774 {
775 	int err = 0;
776 	zap_t *zap = *zapp;
777 
778 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
779 
780 	int sz = zap->zap_dbuf->db_size;
781 	mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP);
782 	memcpy(mzp, zap->zap_dbuf->db_data, sz);
783 	int nchunks = zap->zap_m.zap_num_chunks;
784 
785 	if (!flags) {
786 		err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
787 		    1ULL << fzap_default_block_shift, 0, tx);
788 		if (err != 0) {
789 			vmem_free(mzp, sz);
790 			return (err);
791 		}
792 	}
793 
794 	dprintf("upgrading obj=%llu with %u chunks\n",
795 	    (u_longlong_t)zap->zap_object, nchunks);
796 	/* XXX destroy the tree later, so we can use the stored hash value */
797 	mze_destroy(zap);
798 
799 	fzap_upgrade(zap, tx, flags);
800 
801 	zap_name_t *zn = zap_name_alloc(zap, B_FALSE);
802 	for (int i = 0; i < nchunks; i++) {
803 		mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
804 		if (mze->mze_name[0] == 0)
805 			continue;
806 		dprintf("adding %s=%llu\n",
807 		    mze->mze_name, (u_longlong_t)mze->mze_value);
808 		zap_name_init_str(zn, mze->mze_name, 0);
809 		/* If we fail here, we would end up losing entries */
810 		VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
811 		    tag, tx));
812 		zap = zn->zn_zap;	/* fzap_add_cd() may change zap */
813 	}
814 	zap_name_free(zn);
815 	vmem_free(mzp, sz);
816 	*zapp = zap;
817 	return (0);
818 }
819 
820 /*
821  * The "normflags" determine the behavior of the matchtype_t which is
822  * passed to zap_lookup_norm().  Names which have the same normalized
823  * version will be stored with the same hash value, and therefore we can
824  * perform normalization-insensitive lookups.  We can be Unicode form-
825  * insensitive and/or case-insensitive.  The following flags are valid for
826  * "normflags":
827  *
828  * U8_TEXTPREP_NFC
829  * U8_TEXTPREP_NFD
830  * U8_TEXTPREP_NFKC
831  * U8_TEXTPREP_NFKD
832  * U8_TEXTPREP_TOUPPER
833  *
834  * The *_NF* (Normalization Form) flags are mutually exclusive; at most one
835  * of them may be supplied.
836  */
837 void
mzap_create_impl(dnode_t * dn,int normflags,zap_flags_t flags,dmu_tx_t * tx)838 mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
839 {
840 	dmu_buf_t *db;
841 
842 	VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
843 
844 	dmu_buf_will_dirty(db, tx);
845 	mzap_phys_t *zp = db->db_data;
846 	zp->mz_block_type = ZBT_MICRO;
847 	zp->mz_salt =
848 	    ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL;
849 	zp->mz_normflags = normflags;
850 
851 	if (flags != 0) {
852 		zap_t *zap;
853 		/* Only fat zap supports flags; upgrade immediately. */
854 		VERIFY(dnode_add_ref(dn, FTAG));
855 		VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER,
856 		    B_FALSE, B_FALSE, &zap));
857 		VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
858 		zap_unlockdir(zap, FTAG);
859 	} else {
860 		dmu_buf_rele(db, FTAG);
861 	}
862 }
863 
864 static uint64_t
zap_create_impl(objset_t * os,int normflags,zap_flags_t flags,dmu_object_type_t ot,int leaf_blockshift,int indirect_blockshift,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dnode_t ** allocated_dnode,const void * tag,dmu_tx_t * tx)865 zap_create_impl(objset_t *os, int normflags, zap_flags_t flags,
866     dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
867     dmu_object_type_t bonustype, int bonuslen, int dnodesize,
868     dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
869 {
870 	uint64_t obj;
871 
872 	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
873 
874 	if (allocated_dnode == NULL) {
875 		dnode_t *dn;
876 		obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
877 		    indirect_blockshift, bonustype, bonuslen, dnodesize,
878 		    &dn, FTAG, tx);
879 		mzap_create_impl(dn, normflags, flags, tx);
880 		dnode_rele(dn, FTAG);
881 	} else {
882 		obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
883 		    indirect_blockshift, bonustype, bonuslen, dnodesize,
884 		    allocated_dnode, tag, tx);
885 		mzap_create_impl(*allocated_dnode, normflags, flags, tx);
886 	}
887 
888 	return (obj);
889 }
890 
891 int
zap_create_claim(objset_t * os,uint64_t obj,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)892 zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
893     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
894 {
895 	return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen,
896 	    0, tx));
897 }
898 
899 int
zap_create_claim_dnsize(objset_t * os,uint64_t obj,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)900 zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
901     dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
902 {
903 	return (zap_create_claim_norm_dnsize(os, obj,
904 	    0, ot, bonustype, bonuslen, dnodesize, tx));
905 }
906 
907 int
zap_create_claim_norm(objset_t * os,uint64_t obj,int normflags,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)908 zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
909     dmu_object_type_t ot,
910     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
911 {
912 	return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype,
913 	    bonuslen, 0, tx));
914 }
915 
916 int
zap_create_claim_norm_dnsize(objset_t * os,uint64_t obj,int normflags,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)917 zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
918     dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
919     int dnodesize, dmu_tx_t *tx)
920 {
921 	dnode_t *dn;
922 	int error;
923 
924 	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
925 	error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
926 	    dnodesize, tx);
927 	if (error != 0)
928 		return (error);
929 
930 	error = dnode_hold(os, obj, FTAG, &dn);
931 	if (error != 0)
932 		return (error);
933 
934 	mzap_create_impl(dn, normflags, 0, tx);
935 
936 	dnode_rele(dn, FTAG);
937 
938 	return (0);
939 }
940 
941 uint64_t
zap_create(objset_t * os,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)942 zap_create(objset_t *os, dmu_object_type_t ot,
943     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
944 {
945 	return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
946 }
947 
948 uint64_t
zap_create_dnsize(objset_t * os,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)949 zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
950     dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
951 {
952 	return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen,
953 	    dnodesize, tx));
954 }
955 
956 uint64_t
zap_create_norm(objset_t * os,int normflags,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)957 zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
958     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
959 {
960 	return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen,
961 	    0, tx));
962 }
963 
964 uint64_t
zap_create_norm_dnsize(objset_t * os,int normflags,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)965 zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
966     dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
967 {
968 	return (zap_create_impl(os, normflags, 0, ot, 0, 0,
969 	    bonustype, bonuslen, dnodesize, NULL, NULL, tx));
970 }
971 
972 uint64_t
zap_create_flags(objset_t * os,int normflags,zap_flags_t flags,dmu_object_type_t ot,int leaf_blockshift,int indirect_blockshift,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)973 zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
974     dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
975     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
976 {
977 	return (zap_create_flags_dnsize(os, normflags, flags, ot,
978 	    leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
979 }
980 
981 uint64_t
zap_create_flags_dnsize(objset_t * os,int normflags,zap_flags_t flags,dmu_object_type_t ot,int leaf_blockshift,int indirect_blockshift,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)982 zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
983     dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
984     dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
985 {
986 	return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
987 	    indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL,
988 	    tx));
989 }
990 
991 /*
992  * Create a zap object and return a pointer to the newly allocated dnode via
993  * the allocated_dnode argument.  The returned dnode will be held and the
994  * caller is responsible for releasing the hold by calling dnode_rele().
995  */
996 uint64_t
zap_create_hold(objset_t * os,int normflags,zap_flags_t flags,dmu_object_type_t ot,int leaf_blockshift,int indirect_blockshift,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dnode_t ** allocated_dnode,const void * tag,dmu_tx_t * tx)997 zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
998     dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
999     dmu_object_type_t bonustype, int bonuslen, int dnodesize,
1000     dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
1001 {
1002 	return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
1003 	    indirect_blockshift, bonustype, bonuslen, dnodesize,
1004 	    allocated_dnode, tag, tx));
1005 }
1006 
1007 int
zap_destroy(objset_t * os,uint64_t zapobj,dmu_tx_t * tx)1008 zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
1009 {
1010 	/*
1011 	 * dmu_object_free will free the object number and free the
1012 	 * data.  Freeing the data will cause our pageout function to be
1013 	 * called, which will destroy our data (zap_leaf_t's and zap_t).
1014 	 */
1015 
1016 	return (dmu_object_free(os, zapobj, tx));
1017 }
1018 
1019 void
zap_evict_sync(void * dbu)1020 zap_evict_sync(void *dbu)
1021 {
1022 	zap_t *zap = dbu;
1023 
1024 	rw_destroy(&zap->zap_rwlock);
1025 
1026 	if (zap->zap_ismicro)
1027 		mze_destroy(zap);
1028 	else
1029 		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
1030 
1031 	kmem_free(zap, sizeof (zap_t));
1032 }
1033 
1034 int
zap_count(objset_t * os,uint64_t zapobj,uint64_t * count)1035 zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
1036 {
1037 	zap_t *zap;
1038 
1039 	int err =
1040 	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1041 	if (err != 0)
1042 		return (err);
1043 	if (!zap->zap_ismicro) {
1044 		err = fzap_count(zap, count);
1045 	} else {
1046 		*count = zap->zap_m.zap_num_entries;
1047 	}
1048 	zap_unlockdir(zap, FTAG);
1049 	return (err);
1050 }
1051 
1052 /*
1053  * zn may be NULL; if not specified, it will be computed if needed.
1054  * See also the comment above zap_entry_normalization_conflict().
1055  */
1056 static boolean_t
mzap_normalization_conflict(zap_t * zap,zap_name_t * zn,mzap_ent_t * mze,zfs_btree_index_t * idx)1057 mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze,
1058     zfs_btree_index_t *idx)
1059 {
1060 	boolean_t allocdzn = B_FALSE;
1061 	mzap_ent_t *other;
1062 	zfs_btree_index_t oidx;
1063 
1064 	if (zap->zap_normflags == 0)
1065 		return (B_FALSE);
1066 
1067 	for (other = zfs_btree_prev(&zap->zap_m.zap_tree, idx, &oidx);
1068 	    other && other->mze_hash == mze->mze_hash;
1069 	    other = zfs_btree_prev(&zap->zap_m.zap_tree, &oidx, &oidx)) {
1070 
1071 		if (zn == NULL) {
1072 			zn = zap_name_alloc_str(zap,
1073 			    MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
1074 			allocdzn = B_TRUE;
1075 		}
1076 		if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
1077 			if (allocdzn)
1078 				zap_name_free(zn);
1079 			return (B_TRUE);
1080 		}
1081 	}
1082 
1083 	for (other = zfs_btree_next(&zap->zap_m.zap_tree, idx, &oidx);
1084 	    other && other->mze_hash == mze->mze_hash;
1085 	    other = zfs_btree_next(&zap->zap_m.zap_tree, &oidx, &oidx)) {
1086 
1087 		if (zn == NULL) {
1088 			zn = zap_name_alloc_str(zap,
1089 			    MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
1090 			allocdzn = B_TRUE;
1091 		}
1092 		if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
1093 			if (allocdzn)
1094 				zap_name_free(zn);
1095 			return (B_TRUE);
1096 		}
1097 	}
1098 
1099 	if (allocdzn)
1100 		zap_name_free(zn);
1101 	return (B_FALSE);
1102 }
1103 
1104 /*
1105  * Routines for manipulating attributes.
1106  */
1107 
1108 int
zap_lookup(objset_t * os,uint64_t zapobj,const char * name,uint64_t integer_size,uint64_t num_integers,void * buf)1109 zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
1110     uint64_t integer_size, uint64_t num_integers, void *buf)
1111 {
1112 	return (zap_lookup_norm(os, zapobj, name, integer_size,
1113 	    num_integers, buf, 0, NULL, 0, NULL));
1114 }
1115 
1116 static int
zap_lookup_impl(zap_t * zap,const char * name,uint64_t integer_size,uint64_t num_integers,void * buf,matchtype_t mt,char * realname,int rn_len,boolean_t * ncp)1117 zap_lookup_impl(zap_t *zap, const char *name,
1118     uint64_t integer_size, uint64_t num_integers, void *buf,
1119     matchtype_t mt, char *realname, int rn_len,
1120     boolean_t *ncp)
1121 {
1122 	int err = 0;
1123 
1124 	zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
1125 	if (zn == NULL)
1126 		return (SET_ERROR(ENOTSUP));
1127 
1128 	if (!zap->zap_ismicro) {
1129 		err = fzap_lookup(zn, integer_size, num_integers, buf,
1130 		    realname, rn_len, ncp);
1131 	} else {
1132 		zfs_btree_index_t idx;
1133 		mzap_ent_t *mze = mze_find(zn, &idx);
1134 		if (mze == NULL) {
1135 			err = SET_ERROR(ENOENT);
1136 		} else {
1137 			if (num_integers < 1) {
1138 				err = SET_ERROR(EOVERFLOW);
1139 			} else if (integer_size != 8) {
1140 				err = SET_ERROR(EINVAL);
1141 			} else {
1142 				*(uint64_t *)buf =
1143 				    MZE_PHYS(zap, mze)->mze_value;
1144 				if (realname != NULL)
1145 					(void) strlcpy(realname,
1146 					    MZE_PHYS(zap, mze)->mze_name,
1147 					    rn_len);
1148 				if (ncp) {
1149 					*ncp = mzap_normalization_conflict(zap,
1150 					    zn, mze, &idx);
1151 				}
1152 			}
1153 		}
1154 	}
1155 	zap_name_free(zn);
1156 	return (err);
1157 }
1158 
1159 int
zap_lookup_norm(objset_t * os,uint64_t zapobj,const char * name,uint64_t integer_size,uint64_t num_integers,void * buf,matchtype_t mt,char * realname,int rn_len,boolean_t * ncp)1160 zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
1161     uint64_t integer_size, uint64_t num_integers, void *buf,
1162     matchtype_t mt, char *realname, int rn_len,
1163     boolean_t *ncp)
1164 {
1165 	zap_t *zap;
1166 
1167 	int err =
1168 	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1169 	if (err != 0)
1170 		return (err);
1171 	err = zap_lookup_impl(zap, name, integer_size,
1172 	    num_integers, buf, mt, realname, rn_len, ncp);
1173 	zap_unlockdir(zap, FTAG);
1174 	return (err);
1175 }
1176 
1177 int
zap_prefetch(objset_t * os,uint64_t zapobj,const char * name)1178 zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
1179 {
1180 	zap_t *zap;
1181 	int err;
1182 	zap_name_t *zn;
1183 
1184 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1185 	if (err)
1186 		return (err);
1187 	zn = zap_name_alloc_str(zap, name, 0);
1188 	if (zn == NULL) {
1189 		zap_unlockdir(zap, FTAG);
1190 		return (SET_ERROR(ENOTSUP));
1191 	}
1192 
1193 	fzap_prefetch(zn);
1194 	zap_name_free(zn);
1195 	zap_unlockdir(zap, FTAG);
1196 	return (err);
1197 }
1198 
1199 int
zap_prefetch_object(objset_t * os,uint64_t zapobj)1200 zap_prefetch_object(objset_t *os, uint64_t zapobj)
1201 {
1202 	int error;
1203 	dmu_object_info_t doi;
1204 
1205 	error = dmu_object_info(os, zapobj, &doi);
1206 	if (error == 0 && DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
1207 		error = SET_ERROR(EINVAL);
1208 	if (error == 0)
1209 		dmu_prefetch_wait(os, zapobj, 0, doi.doi_max_offset);
1210 
1211 	return (error);
1212 }
1213 
1214 int
zap_lookup_by_dnode(dnode_t * dn,const char * name,uint64_t integer_size,uint64_t num_integers,void * buf)1215 zap_lookup_by_dnode(dnode_t *dn, const char *name,
1216     uint64_t integer_size, uint64_t num_integers, void *buf)
1217 {
1218 	return (zap_lookup_norm_by_dnode(dn, name, integer_size,
1219 	    num_integers, buf, 0, NULL, 0, NULL));
1220 }
1221 
1222 int
zap_lookup_norm_by_dnode(dnode_t * dn,const char * name,uint64_t integer_size,uint64_t num_integers,void * buf,matchtype_t mt,char * realname,int rn_len,boolean_t * ncp)1223 zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
1224     uint64_t integer_size, uint64_t num_integers, void *buf,
1225     matchtype_t mt, char *realname, int rn_len,
1226     boolean_t *ncp)
1227 {
1228 	zap_t *zap;
1229 
1230 	int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
1231 	    FTAG, &zap);
1232 	if (err != 0)
1233 		return (err);
1234 	err = zap_lookup_impl(zap, name, integer_size,
1235 	    num_integers, buf, mt, realname, rn_len, ncp);
1236 	zap_unlockdir(zap, FTAG);
1237 	return (err);
1238 }
1239 
1240 static int
zap_prefetch_uint64_impl(zap_t * zap,const uint64_t * key,int key_numints)1241 zap_prefetch_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints)
1242 {
1243 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1244 	if (zn == NULL) {
1245 		zap_unlockdir(zap, FTAG);
1246 		return (SET_ERROR(ENOTSUP));
1247 	}
1248 
1249 	fzap_prefetch(zn);
1250 	zap_name_free(zn);
1251 	zap_unlockdir(zap, FTAG);
1252 	return (0);
1253 }
1254 
1255 int
zap_prefetch_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints)1256 zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1257     int key_numints)
1258 {
1259 	zap_t *zap;
1260 
1261 	int err =
1262 	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1263 	if (err != 0)
1264 		return (err);
1265 	err = zap_prefetch_uint64_impl(zap, key, key_numints);
1266 	/* zap_prefetch_uint64_impl() calls zap_unlockdir() */
1267 	return (err);
1268 }
1269 
1270 int
zap_prefetch_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints)1271 zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints)
1272 {
1273 	zap_t *zap;
1274 
1275 	int err =
1276 	    zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1277 	if (err != 0)
1278 		return (err);
1279 	err = zap_prefetch_uint64_impl(zap, key, key_numints);
1280 	/* zap_prefetch_uint64_impl() calls zap_unlockdir() */
1281 	return (err);
1282 }
1283 
1284 static int
zap_lookup_uint64_impl(zap_t * zap,const uint64_t * key,int key_numints,uint64_t integer_size,uint64_t num_integers,void * buf)1285 zap_lookup_uint64_impl(zap_t *zap, const uint64_t *key,
1286     int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
1287 {
1288 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1289 	if (zn == NULL) {
1290 		zap_unlockdir(zap, FTAG);
1291 		return (SET_ERROR(ENOTSUP));
1292 	}
1293 
1294 	int err = fzap_lookup(zn, integer_size, num_integers, buf,
1295 	    NULL, 0, NULL);
1296 	zap_name_free(zn);
1297 	zap_unlockdir(zap, FTAG);
1298 	return (err);
1299 }
1300 
1301 int
zap_lookup_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,uint64_t integer_size,uint64_t num_integers,void * buf)1302 zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1303     int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
1304 {
1305 	zap_t *zap;
1306 
1307 	int err =
1308 	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1309 	if (err != 0)
1310 		return (err);
1311 	err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size,
1312 	    num_integers, buf);
1313 	/* zap_lookup_uint64_impl() calls zap_unlockdir() */
1314 	return (err);
1315 }
1316 
1317 int
zap_lookup_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints,uint64_t integer_size,uint64_t num_integers,void * buf)1318 zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
1319     int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
1320 {
1321 	zap_t *zap;
1322 
1323 	int err =
1324 	    zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1325 	if (err != 0)
1326 		return (err);
1327 	err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size,
1328 	    num_integers, buf);
1329 	/* zap_lookup_uint64_impl() calls zap_unlockdir() */
1330 	return (err);
1331 }
1332 
1333 int
zap_contains(objset_t * os,uint64_t zapobj,const char * name)1334 zap_contains(objset_t *os, uint64_t zapobj, const char *name)
1335 {
1336 	int err = zap_lookup_norm(os, zapobj, name, 0,
1337 	    0, NULL, 0, NULL, 0, NULL);
1338 	if (err == EOVERFLOW || err == EINVAL)
1339 		err = 0; /* found, but skipped reading the value */
1340 	return (err);
1341 }
1342 
1343 int
zap_length(objset_t * os,uint64_t zapobj,const char * name,uint64_t * integer_size,uint64_t * num_integers)1344 zap_length(objset_t *os, uint64_t zapobj, const char *name,
1345     uint64_t *integer_size, uint64_t *num_integers)
1346 {
1347 	zap_t *zap;
1348 
1349 	int err =
1350 	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1351 	if (err != 0)
1352 		return (err);
1353 	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
1354 	if (zn == NULL) {
1355 		zap_unlockdir(zap, FTAG);
1356 		return (SET_ERROR(ENOTSUP));
1357 	}
1358 	if (!zap->zap_ismicro) {
1359 		err = fzap_length(zn, integer_size, num_integers);
1360 	} else {
1361 		zfs_btree_index_t idx;
1362 		mzap_ent_t *mze = mze_find(zn, &idx);
1363 		if (mze == NULL) {
1364 			err = SET_ERROR(ENOENT);
1365 		} else {
1366 			if (integer_size)
1367 				*integer_size = 8;
1368 			if (num_integers)
1369 				*num_integers = 1;
1370 		}
1371 	}
1372 	zap_name_free(zn);
1373 	zap_unlockdir(zap, FTAG);
1374 	return (err);
1375 }
1376 
1377 int
zap_length_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,uint64_t * integer_size,uint64_t * num_integers)1378 zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1379     int key_numints, uint64_t *integer_size, uint64_t *num_integers)
1380 {
1381 	zap_t *zap;
1382 
1383 	int err =
1384 	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1385 	if (err != 0)
1386 		return (err);
1387 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1388 	if (zn == NULL) {
1389 		zap_unlockdir(zap, FTAG);
1390 		return (SET_ERROR(ENOTSUP));
1391 	}
1392 	err = fzap_length(zn, integer_size, num_integers);
1393 	zap_name_free(zn);
1394 	zap_unlockdir(zap, FTAG);
1395 	return (err);
1396 }
1397 
1398 static void
mzap_addent(zap_name_t * zn,uint64_t value)1399 mzap_addent(zap_name_t *zn, uint64_t value)
1400 {
1401 	zap_t *zap = zn->zn_zap;
1402 	uint16_t start = zap->zap_m.zap_alloc_next;
1403 
1404 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
1405 
1406 #ifdef ZFS_DEBUG
1407 	for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
1408 		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
1409 		ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
1410 	}
1411 #endif
1412 
1413 	uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash);
1414 	/* given the limited size of the microzap, this can't happen */
1415 	ASSERT(cd < zap_maxcd(zap));
1416 
1417 again:
1418 	for (uint16_t i = start; i < zap->zap_m.zap_num_chunks; i++) {
1419 		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
1420 		if (mze->mze_name[0] == 0) {
1421 			mze->mze_value = value;
1422 			mze->mze_cd = cd;
1423 			(void) strlcpy(mze->mze_name, zn->zn_key_orig,
1424 			    sizeof (mze->mze_name));
1425 			zap->zap_m.zap_num_entries++;
1426 			zap->zap_m.zap_alloc_next = i+1;
1427 			if (zap->zap_m.zap_alloc_next ==
1428 			    zap->zap_m.zap_num_chunks)
1429 				zap->zap_m.zap_alloc_next = 0;
1430 			mze_insert(zap, i, zn->zn_hash);
1431 			return;
1432 		}
1433 	}
1434 	if (start != 0) {
1435 		start = 0;
1436 		goto again;
1437 	}
1438 	cmn_err(CE_PANIC, "out of entries!");
1439 }
1440 
1441 static int
zap_add_impl(zap_t * zap,const char * key,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx,const void * tag)1442 zap_add_impl(zap_t *zap, const char *key,
1443     int integer_size, uint64_t num_integers,
1444     const void *val, dmu_tx_t *tx, const void *tag)
1445 {
1446 	const uint64_t *intval = val;
1447 	int err = 0;
1448 
1449 	zap_name_t *zn = zap_name_alloc_str(zap, key, 0);
1450 	if (zn == NULL) {
1451 		zap_unlockdir(zap, tag);
1452 		return (SET_ERROR(ENOTSUP));
1453 	}
1454 	if (!zap->zap_ismicro) {
1455 		err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
1456 		zap = zn->zn_zap;	/* fzap_add() may change zap */
1457 	} else if (integer_size != 8 || num_integers != 1 ||
1458 	    strlen(key) >= MZAP_NAME_LEN ||
1459 	    !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
1460 		err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
1461 		if (err == 0) {
1462 			err = fzap_add(zn, integer_size, num_integers, val,
1463 			    tag, tx);
1464 		}
1465 		zap = zn->zn_zap;	/* fzap_add() may change zap */
1466 	} else {
1467 		zfs_btree_index_t idx;
1468 		if (mze_find(zn, &idx) != NULL) {
1469 			err = SET_ERROR(EEXIST);
1470 		} else {
1471 			mzap_addent(zn, *intval);
1472 		}
1473 	}
1474 	ASSERT(zap == zn->zn_zap);
1475 	zap_name_free(zn);
1476 	if (zap != NULL)	/* may be NULL if fzap_add() failed */
1477 		zap_unlockdir(zap, tag);
1478 	return (err);
1479 }
1480 
1481 int
zap_add(objset_t * os,uint64_t zapobj,const char * key,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1482 zap_add(objset_t *os, uint64_t zapobj, const char *key,
1483     int integer_size, uint64_t num_integers,
1484     const void *val, dmu_tx_t *tx)
1485 {
1486 	zap_t *zap;
1487 	int err;
1488 
1489 	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1490 	if (err != 0)
1491 		return (err);
1492 	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
1493 	/* zap_add_impl() calls zap_unlockdir() */
1494 	return (err);
1495 }
1496 
1497 int
zap_add_by_dnode(dnode_t * dn,const char * key,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1498 zap_add_by_dnode(dnode_t *dn, const char *key,
1499     int integer_size, uint64_t num_integers,
1500     const void *val, dmu_tx_t *tx)
1501 {
1502 	zap_t *zap;
1503 	int err;
1504 
1505 	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1506 	if (err != 0)
1507 		return (err);
1508 	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
1509 	/* zap_add_impl() calls zap_unlockdir() */
1510 	return (err);
1511 }
1512 
1513 static int
zap_add_uint64_impl(zap_t * zap,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx,const void * tag)1514 zap_add_uint64_impl(zap_t *zap, const uint64_t *key,
1515     int key_numints, int integer_size, uint64_t num_integers,
1516     const void *val, dmu_tx_t *tx, const void *tag)
1517 {
1518 	int err;
1519 
1520 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1521 	if (zn == NULL) {
1522 		zap_unlockdir(zap, tag);
1523 		return (SET_ERROR(ENOTSUP));
1524 	}
1525 	err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
1526 	zap = zn->zn_zap;	/* fzap_add() may change zap */
1527 	zap_name_free(zn);
1528 	if (zap != NULL)	/* may be NULL if fzap_add() failed */
1529 		zap_unlockdir(zap, tag);
1530 	return (err);
1531 }
1532 
1533 int
zap_add_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1534 zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1535     int key_numints, int integer_size, uint64_t num_integers,
1536     const void *val, dmu_tx_t *tx)
1537 {
1538 	zap_t *zap;
1539 
1540 	int err =
1541 	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1542 	if (err != 0)
1543 		return (err);
1544 	err = zap_add_uint64_impl(zap, key, key_numints,
1545 	    integer_size, num_integers, val, tx, FTAG);
1546 	/* zap_add_uint64_impl() calls zap_unlockdir() */
1547 	return (err);
1548 }
1549 
1550 int
zap_add_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1551 zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
1552     int key_numints, int integer_size, uint64_t num_integers,
1553     const void *val, dmu_tx_t *tx)
1554 {
1555 	zap_t *zap;
1556 
1557 	int err =
1558 	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1559 	if (err != 0)
1560 		return (err);
1561 	err = zap_add_uint64_impl(zap, key, key_numints,
1562 	    integer_size, num_integers, val, tx, FTAG);
1563 	/* zap_add_uint64_impl() calls zap_unlockdir() */
1564 	return (err);
1565 }
1566 
1567 int
zap_update(objset_t * os,uint64_t zapobj,const char * name,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1568 zap_update(objset_t *os, uint64_t zapobj, const char *name,
1569     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
1570 {
1571 	zap_t *zap;
1572 	const uint64_t *intval = val;
1573 
1574 	int err =
1575 	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1576 	if (err != 0)
1577 		return (err);
1578 	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
1579 	if (zn == NULL) {
1580 		zap_unlockdir(zap, FTAG);
1581 		return (SET_ERROR(ENOTSUP));
1582 	}
1583 	if (!zap->zap_ismicro) {
1584 		err = fzap_update(zn, integer_size, num_integers, val,
1585 		    FTAG, tx);
1586 		zap = zn->zn_zap;	/* fzap_update() may change zap */
1587 	} else if (integer_size != 8 || num_integers != 1 ||
1588 	    strlen(name) >= MZAP_NAME_LEN) {
1589 		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
1590 		    (u_longlong_t)zapobj, integer_size,
1591 		    (u_longlong_t)num_integers, name);
1592 		err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
1593 		if (err == 0) {
1594 			err = fzap_update(zn, integer_size, num_integers,
1595 			    val, FTAG, tx);
1596 		}
1597 		zap = zn->zn_zap;	/* fzap_update() may change zap */
1598 	} else {
1599 		zfs_btree_index_t idx;
1600 		mzap_ent_t *mze = mze_find(zn, &idx);
1601 		if (mze != NULL) {
1602 			MZE_PHYS(zap, mze)->mze_value = *intval;
1603 		} else {
1604 			mzap_addent(zn, *intval);
1605 		}
1606 	}
1607 	ASSERT(zap == zn->zn_zap);
1608 	zap_name_free(zn);
1609 	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
1610 		zap_unlockdir(zap, FTAG);
1611 	return (err);
1612 }
1613 
1614 static int
zap_update_uint64_impl(zap_t * zap,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx,const void * tag)1615 zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
1616     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx,
1617     const void *tag)
1618 {
1619 	int err;
1620 
1621 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1622 	if (zn == NULL) {
1623 		zap_unlockdir(zap, tag);
1624 		return (SET_ERROR(ENOTSUP));
1625 	}
1626 	err = fzap_update(zn, integer_size, num_integers, val, tag, tx);
1627 	zap = zn->zn_zap;	/* fzap_update() may change zap */
1628 	zap_name_free(zn);
1629 	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
1630 		zap_unlockdir(zap, tag);
1631 	return (err);
1632 }
1633 
1634 int
zap_update_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1635 zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1636     int key_numints, int integer_size, uint64_t num_integers, const void *val,
1637     dmu_tx_t *tx)
1638 {
1639 	zap_t *zap;
1640 
1641 	int err =
1642 	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1643 	if (err != 0)
1644 		return (err);
1645 	err = zap_update_uint64_impl(zap, key, key_numints,
1646 	    integer_size, num_integers, val, tx, FTAG);
1647 	/* zap_update_uint64_impl() calls zap_unlockdir() */
1648 	return (err);
1649 }
1650 
1651 int
zap_update_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1652 zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
1653     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
1654 {
1655 	zap_t *zap;
1656 
1657 	int err =
1658 	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1659 	if (err != 0)
1660 		return (err);
1661 	err = zap_update_uint64_impl(zap, key, key_numints,
1662 	    integer_size, num_integers, val, tx, FTAG);
1663 	/* zap_update_uint64_impl() calls zap_unlockdir() */
1664 	return (err);
1665 }
1666 
1667 int
zap_remove(objset_t * os,uint64_t zapobj,const char * name,dmu_tx_t * tx)1668 zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
1669 {
1670 	return (zap_remove_norm(os, zapobj, name, 0, tx));
1671 }
1672 
1673 static int
zap_remove_impl(zap_t * zap,const char * name,matchtype_t mt,dmu_tx_t * tx)1674 zap_remove_impl(zap_t *zap, const char *name,
1675     matchtype_t mt, dmu_tx_t *tx)
1676 {
1677 	int err = 0;
1678 
1679 	zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
1680 	if (zn == NULL)
1681 		return (SET_ERROR(ENOTSUP));
1682 	if (!zap->zap_ismicro) {
1683 		err = fzap_remove(zn, tx);
1684 	} else {
1685 		zfs_btree_index_t idx;
1686 		mzap_ent_t *mze = mze_find(zn, &idx);
1687 		if (mze == NULL) {
1688 			err = SET_ERROR(ENOENT);
1689 		} else {
1690 			zap->zap_m.zap_num_entries--;
1691 			memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t));
1692 			zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx);
1693 		}
1694 	}
1695 	zap_name_free(zn);
1696 	return (err);
1697 }
1698 
1699 int
zap_remove_norm(objset_t * os,uint64_t zapobj,const char * name,matchtype_t mt,dmu_tx_t * tx)1700 zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
1701     matchtype_t mt, dmu_tx_t *tx)
1702 {
1703 	zap_t *zap;
1704 	int err;
1705 
1706 	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1707 	if (err)
1708 		return (err);
1709 	err = zap_remove_impl(zap, name, mt, tx);
1710 	zap_unlockdir(zap, FTAG);
1711 	return (err);
1712 }
1713 
1714 int
zap_remove_by_dnode(dnode_t * dn,const char * name,dmu_tx_t * tx)1715 zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
1716 {
1717 	zap_t *zap;
1718 	int err;
1719 
1720 	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1721 	if (err)
1722 		return (err);
1723 	err = zap_remove_impl(zap, name, 0, tx);
1724 	zap_unlockdir(zap, FTAG);
1725 	return (err);
1726 }
1727 
1728 static int
zap_remove_uint64_impl(zap_t * zap,const uint64_t * key,int key_numints,dmu_tx_t * tx,const void * tag)1729 zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
1730     dmu_tx_t *tx, const void *tag)
1731 {
1732 	int err;
1733 
1734 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1735 	if (zn == NULL) {
1736 		zap_unlockdir(zap, tag);
1737 		return (SET_ERROR(ENOTSUP));
1738 	}
1739 	err = fzap_remove(zn, tx);
1740 	zap_name_free(zn);
1741 	zap_unlockdir(zap, tag);
1742 	return (err);
1743 }
1744 
1745 int
zap_remove_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,dmu_tx_t * tx)1746 zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1747     int key_numints, dmu_tx_t *tx)
1748 {
1749 	zap_t *zap;
1750 
1751 	int err =
1752 	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1753 	if (err != 0)
1754 		return (err);
1755 	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
1756 	/* zap_remove_uint64_impl() calls zap_unlockdir() */
1757 	return (err);
1758 }
1759 
1760 int
zap_remove_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints,dmu_tx_t * tx)1761 zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
1762     dmu_tx_t *tx)
1763 {
1764 	zap_t *zap;
1765 
1766 	int err =
1767 	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1768 	if (err != 0)
1769 		return (err);
1770 	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
1771 	/* zap_remove_uint64_impl() calls zap_unlockdir() */
1772 	return (err);
1773 }
1774 
1775 
1776 static zap_attribute_t *
zap_attribute_alloc_impl(boolean_t longname)1777 zap_attribute_alloc_impl(boolean_t longname)
1778 {
1779 	zap_attribute_t *za;
1780 
1781 	za = kmem_cache_alloc((longname)? zap_attr_long_cache : zap_attr_cache,
1782 	    KM_SLEEP);
1783 	za->za_name_len = (longname)? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN;
1784 	return (za);
1785 }
1786 
1787 zap_attribute_t *
zap_attribute_alloc(void)1788 zap_attribute_alloc(void)
1789 {
1790 	return (zap_attribute_alloc_impl(B_FALSE));
1791 }
1792 
1793 zap_attribute_t *
zap_attribute_long_alloc(void)1794 zap_attribute_long_alloc(void)
1795 {
1796 	return (zap_attribute_alloc_impl(B_TRUE));
1797 }
1798 
1799 void
zap_attribute_free(zap_attribute_t * za)1800 zap_attribute_free(zap_attribute_t *za)
1801 {
1802 	if (za->za_name_len == ZAP_MAXNAMELEN) {
1803 		kmem_cache_free(zap_attr_cache, za);
1804 	} else {
1805 		ASSERT3U(za->za_name_len, ==, ZAP_MAXNAMELEN_NEW);
1806 		kmem_cache_free(zap_attr_long_cache, za);
1807 	}
1808 }
1809 
1810 /*
1811  * Routines for iterating over the attributes.
1812  */
1813 
1814 static void
zap_cursor_init_impl(zap_cursor_t * zc,objset_t * os,uint64_t zapobj,uint64_t serialized,boolean_t prefetch)1815 zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
1816     uint64_t serialized, boolean_t prefetch)
1817 {
1818 	zc->zc_objset = os;
1819 	zc->zc_zap = NULL;
1820 	zc->zc_leaf = NULL;
1821 	zc->zc_zapobj = zapobj;
1822 	zc->zc_serialized = serialized;
1823 	zc->zc_hash = 0;
1824 	zc->zc_cd = 0;
1825 	zc->zc_prefetch = prefetch;
1826 }
1827 void
zap_cursor_init_serialized(zap_cursor_t * zc,objset_t * os,uint64_t zapobj,uint64_t serialized)1828 zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
1829     uint64_t serialized)
1830 {
1831 	zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
1832 }
1833 
1834 /*
1835  * Initialize a cursor at the beginning of the ZAP object.  The entire
1836  * ZAP object will be prefetched.
1837  */
1838 void
zap_cursor_init(zap_cursor_t * zc,objset_t * os,uint64_t zapobj)1839 zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
1840 {
1841 	zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
1842 }
1843 
1844 /*
1845  * Initialize a cursor at the beginning, but request that we not prefetch
1846  * the entire ZAP object.
1847  */
1848 void
zap_cursor_init_noprefetch(zap_cursor_t * zc,objset_t * os,uint64_t zapobj)1849 zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
1850 {
1851 	zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
1852 }
1853 
1854 void
zap_cursor_fini(zap_cursor_t * zc)1855 zap_cursor_fini(zap_cursor_t *zc)
1856 {
1857 	if (zc->zc_zap) {
1858 		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
1859 		zap_unlockdir(zc->zc_zap, NULL);
1860 		zc->zc_zap = NULL;
1861 	}
1862 	if (zc->zc_leaf) {
1863 		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
1864 		zap_put_leaf(zc->zc_leaf);
1865 		zc->zc_leaf = NULL;
1866 	}
1867 	zc->zc_objset = NULL;
1868 }
1869 
1870 uint64_t
zap_cursor_serialize(zap_cursor_t * zc)1871 zap_cursor_serialize(zap_cursor_t *zc)
1872 {
1873 	if (zc->zc_hash == -1ULL)
1874 		return (-1ULL);
1875 	if (zc->zc_zap == NULL)
1876 		return (zc->zc_serialized);
1877 	ASSERT0((zc->zc_hash & zap_maxcd(zc->zc_zap)));
1878 	ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
1879 
1880 	/*
1881 	 * We want to keep the high 32 bits of the cursor zero if we can, so
1882 	 * that 32-bit programs can access this.  So usually use a small
1883 	 * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
1884 	 * of the cursor.
1885 	 *
1886 	 * [ collision differentiator | zap_hashbits()-bit hash value ]
1887 	 */
1888 	return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
1889 	    ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
1890 }
1891 
1892 int
zap_cursor_retrieve(zap_cursor_t * zc,zap_attribute_t * za)1893 zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
1894 {
1895 	int err;
1896 
1897 	if (zc->zc_hash == -1ULL)
1898 		return (SET_ERROR(ENOENT));
1899 
1900 	if (zc->zc_zap == NULL) {
1901 		int hb;
1902 		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
1903 		    RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
1904 		if (err != 0)
1905 			return (err);
1906 
1907 		/*
1908 		 * To support zap_cursor_init_serialized, advance, retrieve,
1909 		 * we must add to the existing zc_cd, which may already
1910 		 * be 1 due to the zap_cursor_advance.
1911 		 */
1912 		ASSERT0(zc->zc_hash);
1913 		hb = zap_hashbits(zc->zc_zap);
1914 		zc->zc_hash = zc->zc_serialized << (64 - hb);
1915 		zc->zc_cd += zc->zc_serialized >> hb;
1916 		if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
1917 			zc->zc_cd = 0;
1918 	} else {
1919 		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
1920 	}
1921 	if (!zc->zc_zap->zap_ismicro) {
1922 		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
1923 	} else {
1924 		zfs_btree_index_t idx;
1925 		mzap_ent_t mze_tofind;
1926 
1927 		mze_tofind.mze_hash = zc->zc_hash >> 32;
1928 		mze_tofind.mze_cd = zc->zc_cd;
1929 
1930 		mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree,
1931 		    &mze_tofind, &idx);
1932 		if (mze == NULL) {
1933 			mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree,
1934 			    &idx, &idx);
1935 		}
1936 		if (mze) {
1937 			mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
1938 			ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
1939 			za->za_normalization_conflict =
1940 			    mzap_normalization_conflict(zc->zc_zap, NULL,
1941 			    mze, &idx);
1942 			za->za_integer_length = 8;
1943 			za->za_num_integers = 1;
1944 			za->za_first_integer = mzep->mze_value;
1945 			(void) strlcpy(za->za_name, mzep->mze_name,
1946 			    za->za_name_len);
1947 			zc->zc_hash = (uint64_t)mze->mze_hash << 32;
1948 			zc->zc_cd = mze->mze_cd;
1949 			err = 0;
1950 		} else {
1951 			zc->zc_hash = -1ULL;
1952 			err = SET_ERROR(ENOENT);
1953 		}
1954 	}
1955 	rw_exit(&zc->zc_zap->zap_rwlock);
1956 	return (err);
1957 }
1958 
1959 void
zap_cursor_advance(zap_cursor_t * zc)1960 zap_cursor_advance(zap_cursor_t *zc)
1961 {
1962 	if (zc->zc_hash == -1ULL)
1963 		return;
1964 	zc->zc_cd++;
1965 }
1966 
1967 int
zap_get_stats(objset_t * os,uint64_t zapobj,zap_stats_t * zs)1968 zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
1969 {
1970 	zap_t *zap;
1971 
1972 	int err =
1973 	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1974 	if (err != 0)
1975 		return (err);
1976 
1977 	memset(zs, 0, sizeof (zap_stats_t));
1978 
1979 	if (zap->zap_ismicro) {
1980 		zs->zs_blocksize = zap->zap_dbuf->db_size;
1981 		zs->zs_num_entries = zap->zap_m.zap_num_entries;
1982 		zs->zs_num_blocks = 1;
1983 	} else {
1984 		fzap_get_stats(zap, zs);
1985 	}
1986 	zap_unlockdir(zap, FTAG);
1987 	return (0);
1988 }
1989 
#ifdef _KERNEL
/* Create / destroy. */
EXPORT_SYMBOL(zap_create);
EXPORT_SYMBOL(zap_create_dnsize);
EXPORT_SYMBOL(zap_create_norm);
EXPORT_SYMBOL(zap_create_norm_dnsize);
EXPORT_SYMBOL(zap_create_flags);
EXPORT_SYMBOL(zap_create_flags_dnsize);
EXPORT_SYMBOL(zap_create_claim);
EXPORT_SYMBOL(zap_create_claim_norm);
EXPORT_SYMBOL(zap_create_claim_norm_dnsize);
EXPORT_SYMBOL(zap_create_hold);
EXPORT_SYMBOL(zap_destroy);

/* Lookup / prefetch. */
EXPORT_SYMBOL(zap_lookup);
EXPORT_SYMBOL(zap_lookup_by_dnode);
EXPORT_SYMBOL(zap_lookup_norm);
EXPORT_SYMBOL(zap_lookup_uint64);
EXPORT_SYMBOL(zap_contains);
EXPORT_SYMBOL(zap_prefetch);
EXPORT_SYMBOL(zap_prefetch_uint64);
EXPORT_SYMBOL(zap_prefetch_object);

/* Add / update / length / remove. */
EXPORT_SYMBOL(zap_add);
EXPORT_SYMBOL(zap_add_by_dnode);
EXPORT_SYMBOL(zap_add_uint64);
EXPORT_SYMBOL(zap_add_uint64_by_dnode);
EXPORT_SYMBOL(zap_update);
EXPORT_SYMBOL(zap_update_uint64);
EXPORT_SYMBOL(zap_update_uint64_by_dnode);
EXPORT_SYMBOL(zap_length);
EXPORT_SYMBOL(zap_length_uint64);
EXPORT_SYMBOL(zap_remove);
EXPORT_SYMBOL(zap_remove_by_dnode);
EXPORT_SYMBOL(zap_remove_norm);
EXPORT_SYMBOL(zap_remove_uint64);
EXPORT_SYMBOL(zap_remove_uint64_by_dnode);

/* Counting, searching, joining and integer-keyed helpers. */
EXPORT_SYMBOL(zap_count);
EXPORT_SYMBOL(zap_value_search);
EXPORT_SYMBOL(zap_join);
EXPORT_SYMBOL(zap_join_increment);
EXPORT_SYMBOL(zap_add_int);
EXPORT_SYMBOL(zap_remove_int);
EXPORT_SYMBOL(zap_lookup_int);
EXPORT_SYMBOL(zap_increment_int);
EXPORT_SYMBOL(zap_add_int_key);
EXPORT_SYMBOL(zap_lookup_int_key);
EXPORT_SYMBOL(zap_increment);

/* Cursor iteration and statistics. */
EXPORT_SYMBOL(zap_cursor_init);
EXPORT_SYMBOL(zap_cursor_fini);
EXPORT_SYMBOL(zap_cursor_retrieve);
EXPORT_SYMBOL(zap_cursor_advance);
EXPORT_SYMBOL(zap_cursor_serialize);
EXPORT_SYMBOL(zap_cursor_init_serialized);
EXPORT_SYMBOL(zap_get_stats);

/* Module parameter for the micro ZAP size limit. */
ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW,
	"Maximum micro ZAP size before converting to a fat ZAP, "
	    "in bytes (max 1M)");
#endif
2047