xref: /illumos-gate/usr/src/uts/common/fs/zfs/zap_leaf.c (revision ea4ea50f064c5468142b24627acad09a41f060cb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
25  * Copyright 2017 Nexenta Systems, Inc.
26  */
27 
28 /*
29  * The 512-byte leaf is broken into 32 16-byte chunks.
30  * chunk number n means l_chunk[n], even though the header precedes it.
31  * the names are stored null-terminated.
32  */
33 
34 #include <sys/zio.h>
35 #include <sys/spa.h>
36 #include <sys/dmu.h>
37 #include <sys/zfs_context.h>
38 #include <sys/fs/zfs.h>
39 #include <sys/zap.h>
40 #include <sys/zap_impl.h>
41 #include <sys/zap_leaf.h>
42 #include <sys/arc.h>
43 
44 static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
45 
46 #define	CHAIN_END 0xffff /* end of the chunk chain */
47 
48 /* half the (current) minimum block size */
49 #define	MAX_ARRAY_BYTES (8<<10)
50 
51 #define	LEAF_HASH(l, h) \
52 	((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
53 	((h) >> \
54 	(64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len)))
55 
56 #define	LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
57 
58 extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l);
59 
60 static void
61 zap_memset(void *a, int c, size_t n)
62 {
63 	char *cp = a;
64 	char *cpend = cp + n;
65 
66 	while (cp < cpend)
67 		*cp++ = c;
68 }
69 
70 static void
71 stv(int len, void *addr, uint64_t value)
72 {
73 	switch (len) {
74 	case 1:
75 		*(uint8_t *)addr = value;
76 		return;
77 	case 2:
78 		*(uint16_t *)addr = value;
79 		return;
80 	case 4:
81 		*(uint32_t *)addr = value;
82 		return;
83 	case 8:
84 		*(uint64_t *)addr = value;
85 		return;
86 	}
87 	ASSERT(!"bad int len");
88 }
89 
90 static uint64_t
91 ldv(int len, const void *addr)
92 {
93 	switch (len) {
94 	case 1:
95 		return (*(uint8_t *)addr);
96 	case 2:
97 		return (*(uint16_t *)addr);
98 	case 4:
99 		return (*(uint32_t *)addr);
100 	case 8:
101 		return (*(uint64_t *)addr);
102 	}
103 	ASSERT(!"bad int len");
104 	return (0xFEEDFACEDEADBEEFULL);
105 }
106 
107 void
108 zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
109 {
110 	zap_leaf_t l;
111 	dmu_buf_t l_dbuf;
112 
113 	l_dbuf.db_data = buf;
114 	l.l_bs = highbit64(size) - 1;
115 	l.l_dbuf = &l_dbuf;
116 
117 	buf->l_hdr.lh_block_type =	BSWAP_64(buf->l_hdr.lh_block_type);
118 	buf->l_hdr.lh_prefix =		BSWAP_64(buf->l_hdr.lh_prefix);
119 	buf->l_hdr.lh_magic =		BSWAP_32(buf->l_hdr.lh_magic);
120 	buf->l_hdr.lh_nfree =		BSWAP_16(buf->l_hdr.lh_nfree);
121 	buf->l_hdr.lh_nentries =	BSWAP_16(buf->l_hdr.lh_nentries);
122 	buf->l_hdr.lh_prefix_len =	BSWAP_16(buf->l_hdr.lh_prefix_len);
123 	buf->l_hdr.lh_freelist =	BSWAP_16(buf->l_hdr.lh_freelist);
124 
125 	for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
126 		buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
127 
128 	for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
129 		zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
130 		struct zap_leaf_entry *le;
131 
132 		switch (lc->l_free.lf_type) {
133 		case ZAP_CHUNK_ENTRY:
134 			le = &lc->l_entry;
135 
136 			le->le_type =		BSWAP_8(le->le_type);
137 			le->le_value_intlen =	BSWAP_8(le->le_value_intlen);
138 			le->le_next =		BSWAP_16(le->le_next);
139 			le->le_name_chunk =	BSWAP_16(le->le_name_chunk);
140 			le->le_name_numints =	BSWAP_16(le->le_name_numints);
141 			le->le_value_chunk =	BSWAP_16(le->le_value_chunk);
142 			le->le_value_numints =	BSWAP_16(le->le_value_numints);
143 			le->le_cd =		BSWAP_32(le->le_cd);
144 			le->le_hash =		BSWAP_64(le->le_hash);
145 			break;
146 		case ZAP_CHUNK_FREE:
147 			lc->l_free.lf_type =	BSWAP_8(lc->l_free.lf_type);
148 			lc->l_free.lf_next =	BSWAP_16(lc->l_free.lf_next);
149 			break;
150 		case ZAP_CHUNK_ARRAY:
151 			lc->l_array.la_type =	BSWAP_8(lc->l_array.la_type);
152 			lc->l_array.la_next =	BSWAP_16(lc->l_array.la_next);
153 			/* la_array doesn't need swapping */
154 			break;
155 		default:
156 			ASSERT(!"bad leaf type");
157 		}
158 	}
159 }
160 
161 void
162 zap_leaf_init(zap_leaf_t *l, boolean_t sort)
163 {
164 	l->l_bs = highbit64(l->l_dbuf->db_size) - 1;
165 	zap_memset(&zap_leaf_phys(l)->l_hdr, 0,
166 	    sizeof (struct zap_leaf_header));
167 	zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
168 	    2*ZAP_LEAF_HASH_NUMENTRIES(l));
169 	for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
170 		ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
171 		ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
172 	}
173 	ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
174 	zap_leaf_phys(l)->l_hdr.lh_block_type = ZBT_LEAF;
175 	zap_leaf_phys(l)->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
176 	zap_leaf_phys(l)->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
177 	if (sort)
178 		zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
179 }
180 
181 /*
182  * Routines which manipulate leaf chunks (l_chunk[]).
183  */
184 
185 static uint16_t
186 zap_leaf_chunk_alloc(zap_leaf_t *l)
187 {
188 	ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0);
189 
190 	int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist;
191 	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
192 	ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
193 
194 	zap_leaf_phys(l)->l_hdr.lh_freelist =
195 	    ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
196 
197 	zap_leaf_phys(l)->l_hdr.lh_nfree--;
198 
199 	return (chunk);
200 }
201 
202 static void
203 zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
204 {
205 	struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
206 	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
207 	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
208 	ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);
209 
210 	zlf->lf_type = ZAP_CHUNK_FREE;
211 	zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist;
212 	bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
213 	zap_leaf_phys(l)->l_hdr.lh_freelist = chunk;
214 
215 	zap_leaf_phys(l)->l_hdr.lh_nfree++;
216 }
217 
218 /*
219  * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
220  */
221 
222 static uint16_t
223 zap_leaf_array_create(zap_leaf_t *l, const char *buf,
224     int integer_size, int num_integers)
225 {
226 	uint16_t chunk_head;
227 	uint16_t *chunkp = &chunk_head;
228 	int byten = 0;
229 	uint64_t value = 0;
230 	int shift = (integer_size - 1) * 8;
231 	int len = num_integers;
232 
233 	ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);
234 
235 	while (len > 0) {
236 		uint16_t chunk = zap_leaf_chunk_alloc(l);
237 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
238 
239 		la->la_type = ZAP_CHUNK_ARRAY;
240 		for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
241 			if (byten == 0)
242 				value = ldv(integer_size, buf);
243 			la->la_array[i] = value >> shift;
244 			value <<= 8;
245 			if (++byten == integer_size) {
246 				byten = 0;
247 				buf += integer_size;
248 				if (--len == 0)
249 					break;
250 			}
251 		}
252 
253 		*chunkp = chunk;
254 		chunkp = &la->la_next;
255 	}
256 	*chunkp = CHAIN_END;
257 
258 	return (chunk_head);
259 }
260 
261 static void
262 zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
263 {
264 	uint16_t chunk = *chunkp;
265 
266 	*chunkp = CHAIN_END;
267 
268 	while (chunk != CHAIN_END) {
269 		int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
270 		ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==,
271 		    ZAP_CHUNK_ARRAY);
272 		zap_leaf_chunk_free(l, chunk);
273 		chunk = nextchunk;
274 	}
275 }
276 
277 /* array_len and buf_len are in integers, not bytes */
278 static void
279 zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
280     int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
281     void *buf)
282 {
283 	int len = MIN(array_len, buf_len);
284 	int byten = 0;
285 	uint64_t value = 0;
286 	char *p = buf;
287 
288 	ASSERT3U(array_int_len, <=, buf_int_len);
289 
290 	/* Fast path for one 8-byte integer */
291 	if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
292 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
293 		uint8_t *ip = la->la_array;
294 		uint64_t *buf64 = buf;
295 
296 		*buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
297 		    (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
298 		    (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
299 		    (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
300 		return;
301 	}
302 
303 	/* Fast path for an array of 1-byte integers (eg. the entry name) */
304 	if (array_int_len == 1 && buf_int_len == 1 &&
305 	    buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
306 		while (chunk != CHAIN_END) {
307 			struct zap_leaf_array *la =
308 			    &ZAP_LEAF_CHUNK(l, chunk).l_array;
309 			bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES);
310 			p += ZAP_LEAF_ARRAY_BYTES;
311 			chunk = la->la_next;
312 		}
313 		return;
314 	}
315 
316 	while (len > 0) {
317 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
318 
319 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
320 		for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
321 			value = (value << 8) | la->la_array[i];
322 			byten++;
323 			if (byten == array_int_len) {
324 				stv(buf_int_len, p, value);
325 				byten = 0;
326 				len--;
327 				if (len == 0)
328 					return;
329 				p += buf_int_len;
330 			}
331 		}
332 		chunk = la->la_next;
333 	}
334 }
335 
336 static boolean_t
337 zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
338     int chunk, int array_numints)
339 {
340 	int bseen = 0;
341 
342 	if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) {
343 		uint64_t *thiskey =
344 		    kmem_alloc(array_numints * sizeof (*thiskey), KM_SLEEP);
345 		ASSERT(zn->zn_key_intlen == sizeof (*thiskey));
346 
347 		zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints,
348 		    sizeof (*thiskey), array_numints, thiskey);
349 		boolean_t match = bcmp(thiskey, zn->zn_key_orig,
350 		    array_numints * sizeof (*thiskey)) == 0;
351 		kmem_free(thiskey, array_numints * sizeof (*thiskey));
352 		return (match);
353 	}
354 
355 	ASSERT(zn->zn_key_intlen == 1);
356 	if (zn->zn_matchtype & MT_NORMALIZE) {
357 		char *thisname = kmem_alloc(array_numints, KM_SLEEP);
358 
359 		zap_leaf_array_read(l, chunk, sizeof (char), array_numints,
360 		    sizeof (char), array_numints, thisname);
361 		boolean_t match = zap_match(zn, thisname);
362 		kmem_free(thisname, array_numints);
363 		return (match);
364 	}
365 
366 	/*
367 	 * Fast path for exact matching.
368 	 * First check that the lengths match, so that we don't read
369 	 * past the end of the zn_key_orig array.
370 	 */
371 	if (array_numints != zn->zn_key_orig_numints)
372 		return (B_FALSE);
373 	while (bseen < array_numints) {
374 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
375 		int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES);
376 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
377 		if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread))
378 			break;
379 		chunk = la->la_next;
380 		bseen += toread;
381 	}
382 	return (bseen == array_numints);
383 }
384 
385 /*
386  * Routines which manipulate leaf entries.
387  */
388 
389 int
390 zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh)
391 {
392 	struct zap_leaf_entry *le;
393 
394 	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
395 
396 	for (uint16_t *chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash);
397 	    *chunkp != CHAIN_END; chunkp = &le->le_next) {
398 		uint16_t chunk = *chunkp;
399 		le = ZAP_LEAF_ENTRY(l, chunk);
400 
401 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
402 		ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
403 
404 		if (le->le_hash != zn->zn_hash)
405 			continue;
406 
407 		/*
408 		 * NB: the entry chain is always sorted by cd on
409 		 * normalized zap objects, so this will find the
410 		 * lowest-cd match for MT_NORMALIZE.
411 		 */
412 		ASSERT((zn->zn_matchtype == 0) ||
413 		    (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
414 		if (zap_leaf_array_match(l, zn, le->le_name_chunk,
415 		    le->le_name_numints)) {
416 			zeh->zeh_num_integers = le->le_value_numints;
417 			zeh->zeh_integer_size = le->le_value_intlen;
418 			zeh->zeh_cd = le->le_cd;
419 			zeh->zeh_hash = le->le_hash;
420 			zeh->zeh_chunkp = chunkp;
421 			zeh->zeh_leaf = l;
422 			return (0);
423 		}
424 	}
425 
426 	return (SET_ERROR(ENOENT));
427 }
428 
429 /* Return (h1,cd1 >= h2,cd2) */
430 #define	HCD_GTEQ(h1, cd1, h2, cd2) \
431 	((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE))
432 
433 int
434 zap_leaf_lookup_closest(zap_leaf_t *l,
435     uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
436 {
437 	uint64_t besth = -1ULL;
438 	uint32_t bestcd = -1U;
439 	uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
440 	struct zap_leaf_entry *le;
441 
442 	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
443 
444 	for (uint16_t lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
445 		for (uint16_t chunk = zap_leaf_phys(l)->l_hash[lh];
446 		    chunk != CHAIN_END; chunk = le->le_next) {
447 			le = ZAP_LEAF_ENTRY(l, chunk);
448 
449 			ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
450 			ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
451 
452 			if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
453 			    HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
454 				ASSERT3U(bestlh, >=, lh);
455 				bestlh = lh;
456 				besth = le->le_hash;
457 				bestcd = le->le_cd;
458 
459 				zeh->zeh_num_integers = le->le_value_numints;
460 				zeh->zeh_integer_size = le->le_value_intlen;
461 				zeh->zeh_cd = le->le_cd;
462 				zeh->zeh_hash = le->le_hash;
463 				zeh->zeh_fakechunk = chunk;
464 				zeh->zeh_chunkp = &zeh->zeh_fakechunk;
465 				zeh->zeh_leaf = l;
466 			}
467 		}
468 	}
469 
470 	return (bestcd == -1U ? ENOENT : 0);
471 }
472 
473 int
474 zap_entry_read(const zap_entry_handle_t *zeh,
475     uint8_t integer_size, uint64_t num_integers, void *buf)
476 {
477 	struct zap_leaf_entry *le =
478 	    ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
479 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
480 
481 	if (le->le_value_intlen > integer_size)
482 		return (SET_ERROR(EINVAL));
483 
484 	zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk,
485 	    le->le_value_intlen, le->le_value_numints,
486 	    integer_size, num_integers, buf);
487 
488 	if (zeh->zeh_num_integers > num_integers)
489 		return (SET_ERROR(EOVERFLOW));
490 	return (0);
491 
492 }
493 
494 int
495 zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen,
496     char *buf)
497 {
498 	struct zap_leaf_entry *le =
499 	    ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
500 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
501 
502 	if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
503 		zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8,
504 		    le->le_name_numints, 8, buflen / 8, buf);
505 	} else {
506 		zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
507 		    le->le_name_numints, 1, buflen, buf);
508 	}
509 	if (le->le_name_numints > buflen)
510 		return (SET_ERROR(EOVERFLOW));
511 	return (0);
512 }
513 
514 int
515 zap_entry_update(zap_entry_handle_t *zeh,
516     uint8_t integer_size, uint64_t num_integers, const void *buf)
517 {
518 	zap_leaf_t *l = zeh->zeh_leaf;
519 	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
520 
521 	int delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
522 	    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen);
523 
524 	if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks)
525 		return (SET_ERROR(EAGAIN));
526 
527 	zap_leaf_array_free(l, &le->le_value_chunk);
528 	le->le_value_chunk =
529 	    zap_leaf_array_create(l, buf, integer_size, num_integers);
530 	le->le_value_numints = num_integers;
531 	le->le_value_intlen = integer_size;
532 	return (0);
533 }
534 
535 void
536 zap_entry_remove(zap_entry_handle_t *zeh)
537 {
538 	zap_leaf_t *l = zeh->zeh_leaf;
539 
540 	ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
541 
542 	uint16_t entry_chunk = *zeh->zeh_chunkp;
543 	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry_chunk);
544 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
545 
546 	zap_leaf_array_free(l, &le->le_name_chunk);
547 	zap_leaf_array_free(l, &le->le_value_chunk);
548 
549 	*zeh->zeh_chunkp = le->le_next;
550 	zap_leaf_chunk_free(l, entry_chunk);
551 
552 	zap_leaf_phys(l)->l_hdr.lh_nentries--;
553 }
554 
555 int
556 zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
557     uint8_t integer_size, uint64_t num_integers, const void *buf,
558     zap_entry_handle_t *zeh)
559 {
560 	uint16_t chunk;
561 	struct zap_leaf_entry *le;
562 	uint64_t h = zn->zn_hash;
563 
564 	uint64_t valuelen = integer_size * num_integers;
565 
566 	int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
567 	    zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
568 	if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
569 		return (E2BIG);
570 
571 	if (cd == ZAP_NEED_CD) {
572 		/* find the lowest unused cd */
573 		if (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
574 			cd = 0;
575 
576 			for (chunk = *LEAF_HASH_ENTPTR(l, h);
577 			    chunk != CHAIN_END; chunk = le->le_next) {
578 				le = ZAP_LEAF_ENTRY(l, chunk);
579 				if (le->le_cd > cd)
580 					break;
581 				if (le->le_hash == h) {
582 					ASSERT3U(cd, ==, le->le_cd);
583 					cd++;
584 				}
585 			}
586 		} else {
587 			/* old unsorted format; do it the O(n^2) way */
588 			for (cd = 0; ; cd++) {
589 				for (chunk = *LEAF_HASH_ENTPTR(l, h);
590 				    chunk != CHAIN_END; chunk = le->le_next) {
591 					le = ZAP_LEAF_ENTRY(l, chunk);
592 					if (le->le_hash == h &&
593 					    le->le_cd == cd) {
594 						break;
595 					}
596 				}
597 				/* If this cd is not in use, we are good. */
598 				if (chunk == CHAIN_END)
599 					break;
600 			}
601 		}
602 		/*
603 		 * We would run out of space in a block before we could
604 		 * store enough entries to run out of CD values.
605 		 */
606 		ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
607 	}
608 
609 	if (zap_leaf_phys(l)->l_hdr.lh_nfree < numchunks)
610 		return (SET_ERROR(EAGAIN));
611 
612 	/* make the entry */
613 	chunk = zap_leaf_chunk_alloc(l);
614 	le = ZAP_LEAF_ENTRY(l, chunk);
615 	le->le_type = ZAP_CHUNK_ENTRY;
616 	le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig,
617 	    zn->zn_key_intlen, zn->zn_key_orig_numints);
618 	le->le_name_numints = zn->zn_key_orig_numints;
619 	le->le_value_chunk =
620 	    zap_leaf_array_create(l, buf, integer_size, num_integers);
621 	le->le_value_numints = num_integers;
622 	le->le_value_intlen = integer_size;
623 	le->le_hash = h;
624 	le->le_cd = cd;
625 
626 	/* link it into the hash chain */
627 	/* XXX if we did the search above, we could just use that */
628 	uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk);
629 
630 	zap_leaf_phys(l)->l_hdr.lh_nentries++;
631 
632 	zeh->zeh_leaf = l;
633 	zeh->zeh_num_integers = num_integers;
634 	zeh->zeh_integer_size = le->le_value_intlen;
635 	zeh->zeh_cd = le->le_cd;
636 	zeh->zeh_hash = le->le_hash;
637 	zeh->zeh_chunkp = chunkp;
638 
639 	return (0);
640 }
641 
642 /*
643  * Determine if there is another entry with the same normalized form.
644  * For performance purposes, either zn or name must be provided (the
645  * other can be NULL).  Note, there usually won't be any hash
646  * conflicts, in which case we don't need the concatenated/normalized
647  * form of the name.  But all callers have one of these on hand anyway,
648  * so might as well take advantage.  A cleaner but slower interface
649  * would accept neither argument, and compute the normalized name as
650  * needed (using zap_name_alloc(zap_entry_read_name(zeh))).
651  */
652 boolean_t
653 zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
654     const char *name, zap_t *zap)
655 {
656 	struct zap_leaf_entry *le;
657 	boolean_t allocdzn = B_FALSE;
658 
659 	if (zap->zap_normflags == 0)
660 		return (B_FALSE);
661 
662 	for (uint16_t chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash);
663 	    chunk != CHAIN_END; chunk = le->le_next) {
664 		le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk);
665 		if (le->le_hash != zeh->zeh_hash)
666 			continue;
667 		if (le->le_cd == zeh->zeh_cd)
668 			continue;
669 
670 		if (zn == NULL) {
671 			zn = zap_name_alloc(zap, name, MT_NORMALIZE);
672 			allocdzn = B_TRUE;
673 		}
674 		if (zap_leaf_array_match(zeh->zeh_leaf, zn,
675 		    le->le_name_chunk, le->le_name_numints)) {
676 			if (allocdzn)
677 				zap_name_free(zn);
678 			return (B_TRUE);
679 		}
680 	}
681 	if (allocdzn)
682 		zap_name_free(zn);
683 	return (B_FALSE);
684 }
685 
686 /*
687  * Routines for transferring entries between leafs.
688  */
689 
690 static uint16_t *
691 zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
692 {
693 	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
694 	struct zap_leaf_entry *le2;
695 	uint16_t *chunkp;
696 
697 	/*
698 	 * keep the entry chain sorted by cd
699 	 * NB: this will not cause problems for unsorted leafs, though
700 	 * it is unnecessary there.
701 	 */
702 	for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash);
703 	    *chunkp != CHAIN_END; chunkp = &le2->le_next) {
704 		le2 = ZAP_LEAF_ENTRY(l, *chunkp);
705 		if (le2->le_cd > le->le_cd)
706 			break;
707 	}
708 
709 	le->le_next = *chunkp;
710 	*chunkp = entry;
711 	return (chunkp);
712 }
713 
714 static uint16_t
715 zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
716 {
717 	uint16_t new_chunk;
718 	uint16_t *nchunkp = &new_chunk;
719 
720 	while (chunk != CHAIN_END) {
721 		uint16_t nchunk = zap_leaf_chunk_alloc(nl);
722 		struct zap_leaf_array *nla =
723 		    &ZAP_LEAF_CHUNK(nl, nchunk).l_array;
724 		struct zap_leaf_array *la =
725 		    &ZAP_LEAF_CHUNK(l, chunk).l_array;
726 		int nextchunk = la->la_next;
727 
728 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
729 		ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));
730 
731 		*nla = *la; /* structure assignment */
732 
733 		zap_leaf_chunk_free(l, chunk);
734 		chunk = nextchunk;
735 		*nchunkp = nchunk;
736 		nchunkp = &nla->la_next;
737 	}
738 	*nchunkp = CHAIN_END;
739 	return (new_chunk);
740 }
741 
742 static void
743 zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
744 {
745 	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
746 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
747 
748 	uint16_t chunk = zap_leaf_chunk_alloc(nl);
749 	struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk);
750 	*nle = *le; /* structure assignment */
751 
752 	(void) zap_leaf_rehash_entry(nl, chunk);
753 
754 	nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
755 	nle->le_value_chunk =
756 	    zap_leaf_transfer_array(l, le->le_value_chunk, nl);
757 
758 	zap_leaf_chunk_free(l, entry);
759 
760 	zap_leaf_phys(l)->l_hdr.lh_nentries--;
761 	zap_leaf_phys(nl)->l_hdr.lh_nentries++;
762 }
763 
764 /*
765  * Transfer the entries whose hash prefix ends in 1 to the new leaf.
766  */
767 void
768 zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
769 {
770 	int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len;
771 
772 	/* set new prefix and prefix_len */
773 	zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1;
774 	zap_leaf_phys(l)->l_hdr.lh_prefix_len++;
775 	zap_leaf_phys(nl)->l_hdr.lh_prefix =
776 	    zap_leaf_phys(l)->l_hdr.lh_prefix | 1;
777 	zap_leaf_phys(nl)->l_hdr.lh_prefix_len =
778 	    zap_leaf_phys(l)->l_hdr.lh_prefix_len;
779 
780 	/* break existing hash chains */
781 	zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
782 	    2*ZAP_LEAF_HASH_NUMENTRIES(l));
783 
784 	if (sort)
785 		zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
786 
787 	/*
788 	 * Transfer entries whose hash bit 'bit' is set to nl; rehash
789 	 * the remaining entries
790 	 *
791 	 * NB: We could find entries via the hashtable instead. That
792 	 * would be O(hashents+numents) rather than O(numblks+numents),
793 	 * but this accesses memory more sequentially, and when we're
794 	 * called, the block is usually pretty full.
795 	 */
796 	for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
797 		struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
798 		if (le->le_type != ZAP_CHUNK_ENTRY)
799 			continue;
800 
801 		if (le->le_hash & (1ULL << bit))
802 			zap_leaf_transfer_entry(l, i, nl);
803 		else
804 			(void) zap_leaf_rehash_entry(l, i);
805 	}
806 }
807 
808 void
809 zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
810 {
811 	int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
812 	    zap_leaf_phys(l)->l_hdr.lh_prefix_len;
813 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
814 	zs->zs_leafs_with_2n_pointers[n]++;
815 
816 
817 	n = zap_leaf_phys(l)->l_hdr.lh_nentries/5;
818 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
819 	zs->zs_blocks_with_n5_entries[n]++;
820 
821 	n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
822 	    zap_leaf_phys(l)->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
823 	    (1<<FZAP_BLOCK_SHIFT(zap));
824 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
825 	zs->zs_blocks_n_tenths_full[n]++;
826 
827 	for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
828 		int nentries = 0;
829 		int chunk = zap_leaf_phys(l)->l_hash[i];
830 
831 		while (chunk != CHAIN_END) {
832 			struct zap_leaf_entry *le =
833 			    ZAP_LEAF_ENTRY(l, chunk);
834 
835 			n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) +
836 			    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints *
837 			    le->le_value_intlen);
838 			n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
839 			zs->zs_entries_using_n_chunks[n]++;
840 
841 			chunk = le->le_next;
842 			nentries++;
843 		}
844 
845 		n = nentries;
846 		n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
847 		zs->zs_buckets_with_n_entries[n]++;
848 	}
849 }
850