xref: /illumos-gate/usr/src/uts/common/fs/zfs/zap_leaf.c (revision 3e6960d70408b9f4e09714ed3341173673ed28b2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
25  * Copyright 2017 Nexenta Systems, Inc.
26  */
27 
28 /*
29  * The 512-byte leaf is broken into 32 16-byte chunks.
30  * chunk number n means l_chunk[n], even though the header precedes it.
31  * the names are stored null-terminated.
32  */
33 
34 #include <sys/zio.h>
35 #include <sys/spa.h>
36 #include <sys/dmu.h>
37 #include <sys/zfs_context.h>
38 #include <sys/fs/zfs.h>
39 #include <sys/zap.h>
40 #include <sys/zap_impl.h>
41 #include <sys/zap_leaf.h>
42 #include <sys/arc.h>
43 
44 static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
45 
46 #define	CHAIN_END 0xffff /* end of the chunk chain */
47 
48 /* half the (current) minimum block size */
49 #define	MAX_ARRAY_BYTES (8<<10)
50 
51 #define	LEAF_HASH(l, h) \
52 	((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
53 	((h) >> \
54 	(64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len)))
55 
56 #define	LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
57 
58 extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l);
59 
60 static void
61 zap_memset(void *a, int c, size_t n)
62 {
63 	char *cp = a;
64 	char *cpend = cp + n;
65 
66 	while (cp < cpend)
67 		*cp++ = c;
68 }
69 
70 static void
71 stv(int len, void *addr, uint64_t value)
72 {
73 	switch (len) {
74 	case 1:
75 		*(uint8_t *)addr = value;
76 		return;
77 	case 2:
78 		*(uint16_t *)addr = value;
79 		return;
80 	case 4:
81 		*(uint32_t *)addr = value;
82 		return;
83 	case 8:
84 		*(uint64_t *)addr = value;
85 		return;
86 	}
87 	ASSERT(!"bad int len");
88 }
89 
90 static uint64_t
91 ldv(int len, const void *addr)
92 {
93 	switch (len) {
94 	case 1:
95 		return (*(uint8_t *)addr);
96 	case 2:
97 		return (*(uint16_t *)addr);
98 	case 4:
99 		return (*(uint32_t *)addr);
100 	case 8:
101 		return (*(uint64_t *)addr);
102 	}
103 	ASSERT(!"bad int len");
104 	return (0xFEEDFACEDEADBEEFULL);
105 }
106 
107 void
108 zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
109 {
110 	int i;
111 	zap_leaf_t l;
112 	dmu_buf_t l_dbuf;
113 
114 	l_dbuf.db_data = buf;
115 	l.l_bs = highbit64(size) - 1;
116 	l.l_dbuf = &l_dbuf;
117 
118 	buf->l_hdr.lh_block_type =	BSWAP_64(buf->l_hdr.lh_block_type);
119 	buf->l_hdr.lh_prefix =		BSWAP_64(buf->l_hdr.lh_prefix);
120 	buf->l_hdr.lh_magic =		BSWAP_32(buf->l_hdr.lh_magic);
121 	buf->l_hdr.lh_nfree =		BSWAP_16(buf->l_hdr.lh_nfree);
122 	buf->l_hdr.lh_nentries =	BSWAP_16(buf->l_hdr.lh_nentries);
123 	buf->l_hdr.lh_prefix_len =	BSWAP_16(buf->l_hdr.lh_prefix_len);
124 	buf->l_hdr.lh_freelist =	BSWAP_16(buf->l_hdr.lh_freelist);
125 
126 	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
127 		buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
128 
129 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
130 		zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
131 		struct zap_leaf_entry *le;
132 
133 		switch (lc->l_free.lf_type) {
134 		case ZAP_CHUNK_ENTRY:
135 			le = &lc->l_entry;
136 
137 			le->le_type =		BSWAP_8(le->le_type);
138 			le->le_value_intlen =	BSWAP_8(le->le_value_intlen);
139 			le->le_next =		BSWAP_16(le->le_next);
140 			le->le_name_chunk =	BSWAP_16(le->le_name_chunk);
141 			le->le_name_numints =	BSWAP_16(le->le_name_numints);
142 			le->le_value_chunk =	BSWAP_16(le->le_value_chunk);
143 			le->le_value_numints =	BSWAP_16(le->le_value_numints);
144 			le->le_cd =		BSWAP_32(le->le_cd);
145 			le->le_hash =		BSWAP_64(le->le_hash);
146 			break;
147 		case ZAP_CHUNK_FREE:
148 			lc->l_free.lf_type =	BSWAP_8(lc->l_free.lf_type);
149 			lc->l_free.lf_next =	BSWAP_16(lc->l_free.lf_next);
150 			break;
151 		case ZAP_CHUNK_ARRAY:
152 			lc->l_array.la_type =	BSWAP_8(lc->l_array.la_type);
153 			lc->l_array.la_next =	BSWAP_16(lc->l_array.la_next);
154 			/* la_array doesn't need swapping */
155 			break;
156 		default:
157 			ASSERT(!"bad leaf type");
158 		}
159 	}
160 }
161 
162 void
163 zap_leaf_init(zap_leaf_t *l, boolean_t sort)
164 {
165 	int i;
166 
167 	l->l_bs = highbit64(l->l_dbuf->db_size) - 1;
168 	zap_memset(&zap_leaf_phys(l)->l_hdr, 0,
169 	    sizeof (struct zap_leaf_header));
170 	zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
171 	    2*ZAP_LEAF_HASH_NUMENTRIES(l));
172 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
173 		ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
174 		ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
175 	}
176 	ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
177 	zap_leaf_phys(l)->l_hdr.lh_block_type = ZBT_LEAF;
178 	zap_leaf_phys(l)->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
179 	zap_leaf_phys(l)->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
180 	if (sort)
181 		zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
182 }
183 
184 /*
185  * Routines which manipulate leaf chunks (l_chunk[]).
186  */
187 
188 static uint16_t
189 zap_leaf_chunk_alloc(zap_leaf_t *l)
190 {
191 	int chunk;
192 
193 	ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0);
194 
195 	chunk = zap_leaf_phys(l)->l_hdr.lh_freelist;
196 	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
197 	ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
198 
199 	zap_leaf_phys(l)->l_hdr.lh_freelist =
200 	    ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
201 
202 	zap_leaf_phys(l)->l_hdr.lh_nfree--;
203 
204 	return (chunk);
205 }
206 
207 static void
208 zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
209 {
210 	struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
211 	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
212 	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
213 	ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);
214 
215 	zlf->lf_type = ZAP_CHUNK_FREE;
216 	zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist;
217 	bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
218 	zap_leaf_phys(l)->l_hdr.lh_freelist = chunk;
219 
220 	zap_leaf_phys(l)->l_hdr.lh_nfree++;
221 }
222 
223 /*
224  * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
225  */
226 
227 static uint16_t
228 zap_leaf_array_create(zap_leaf_t *l, const char *buf,
229     int integer_size, int num_integers)
230 {
231 	uint16_t chunk_head;
232 	uint16_t *chunkp = &chunk_head;
233 	int byten = 0;
234 	uint64_t value = 0;
235 	int shift = (integer_size-1)*8;
236 	int len = num_integers;
237 
238 	ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);
239 
240 	while (len > 0) {
241 		uint16_t chunk = zap_leaf_chunk_alloc(l);
242 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
243 		int i;
244 
245 		la->la_type = ZAP_CHUNK_ARRAY;
246 		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
247 			if (byten == 0)
248 				value = ldv(integer_size, buf);
249 			la->la_array[i] = value >> shift;
250 			value <<= 8;
251 			if (++byten == integer_size) {
252 				byten = 0;
253 				buf += integer_size;
254 				if (--len == 0)
255 					break;
256 			}
257 		}
258 
259 		*chunkp = chunk;
260 		chunkp = &la->la_next;
261 	}
262 	*chunkp = CHAIN_END;
263 
264 	return (chunk_head);
265 }
266 
267 static void
268 zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
269 {
270 	uint16_t chunk = *chunkp;
271 
272 	*chunkp = CHAIN_END;
273 
274 	while (chunk != CHAIN_END) {
275 		int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
276 		ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==,
277 		    ZAP_CHUNK_ARRAY);
278 		zap_leaf_chunk_free(l, chunk);
279 		chunk = nextchunk;
280 	}
281 }
282 
283 /* array_len and buf_len are in integers, not bytes */
284 static void
285 zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
286     int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
287     void *buf)
288 {
289 	int len = MIN(array_len, buf_len);
290 	int byten = 0;
291 	uint64_t value = 0;
292 	char *p = buf;
293 
294 	ASSERT3U(array_int_len, <=, buf_int_len);
295 
296 	/* Fast path for one 8-byte integer */
297 	if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
298 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
299 		uint8_t *ip = la->la_array;
300 		uint64_t *buf64 = buf;
301 
302 		*buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
303 		    (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
304 		    (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
305 		    (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
306 		return;
307 	}
308 
309 	/* Fast path for an array of 1-byte integers (eg. the entry name) */
310 	if (array_int_len == 1 && buf_int_len == 1 &&
311 	    buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
312 		while (chunk != CHAIN_END) {
313 			struct zap_leaf_array *la =
314 			    &ZAP_LEAF_CHUNK(l, chunk).l_array;
315 			bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES);
316 			p += ZAP_LEAF_ARRAY_BYTES;
317 			chunk = la->la_next;
318 		}
319 		return;
320 	}
321 
322 	while (len > 0) {
323 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
324 		int i;
325 
326 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
327 		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
328 			value = (value << 8) | la->la_array[i];
329 			byten++;
330 			if (byten == array_int_len) {
331 				stv(buf_int_len, p, value);
332 				byten = 0;
333 				len--;
334 				if (len == 0)
335 					return;
336 				p += buf_int_len;
337 			}
338 		}
339 		chunk = la->la_next;
340 	}
341 }
342 
343 static boolean_t
344 zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
345     int chunk, int array_numints)
346 {
347 	int bseen = 0;
348 
349 	if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) {
350 		uint64_t *thiskey;
351 		boolean_t match;
352 
353 		ASSERT(zn->zn_key_intlen == sizeof (*thiskey));
354 		thiskey = kmem_alloc(array_numints * sizeof (*thiskey),
355 		    KM_SLEEP);
356 
357 		zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints,
358 		    sizeof (*thiskey), array_numints, thiskey);
359 		match = bcmp(thiskey, zn->zn_key_orig,
360 		    array_numints * sizeof (*thiskey)) == 0;
361 		kmem_free(thiskey, array_numints * sizeof (*thiskey));
362 		return (match);
363 	}
364 
365 	ASSERT(zn->zn_key_intlen == 1);
366 	if (zn->zn_matchtype & MT_NORMALIZE) {
367 		char *thisname = kmem_alloc(array_numints, KM_SLEEP);
368 		boolean_t match;
369 
370 		zap_leaf_array_read(l, chunk, sizeof (char), array_numints,
371 		    sizeof (char), array_numints, thisname);
372 		match = zap_match(zn, thisname);
373 		kmem_free(thisname, array_numints);
374 		return (match);
375 	}
376 
377 	/*
378 	 * Fast path for exact matching.
379 	 * First check that the lengths match, so that we don't read
380 	 * past the end of the zn_key_orig array.
381 	 */
382 	if (array_numints != zn->zn_key_orig_numints)
383 		return (B_FALSE);
384 	while (bseen < array_numints) {
385 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
386 		int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES);
387 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
388 		if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread))
389 			break;
390 		chunk = la->la_next;
391 		bseen += toread;
392 	}
393 	return (bseen == array_numints);
394 }
395 
396 /*
397  * Routines which manipulate leaf entries.
398  */
399 
400 int
401 zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh)
402 {
403 	uint16_t *chunkp;
404 	struct zap_leaf_entry *le;
405 
406 	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
407 
408 	for (chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash);
409 	    *chunkp != CHAIN_END; chunkp = &le->le_next) {
410 		uint16_t chunk = *chunkp;
411 		le = ZAP_LEAF_ENTRY(l, chunk);
412 
413 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
414 		ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
415 
416 		if (le->le_hash != zn->zn_hash)
417 			continue;
418 
419 		/*
420 		 * NB: the entry chain is always sorted by cd on
421 		 * normalized zap objects, so this will find the
422 		 * lowest-cd match for MT_NORMALIZE.
423 		 */
424 		ASSERT((zn->zn_matchtype == 0) ||
425 		    (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
426 		if (zap_leaf_array_match(l, zn, le->le_name_chunk,
427 		    le->le_name_numints)) {
428 			zeh->zeh_num_integers = le->le_value_numints;
429 			zeh->zeh_integer_size = le->le_value_intlen;
430 			zeh->zeh_cd = le->le_cd;
431 			zeh->zeh_hash = le->le_hash;
432 			zeh->zeh_chunkp = chunkp;
433 			zeh->zeh_leaf = l;
434 			return (0);
435 		}
436 	}
437 
438 	return (SET_ERROR(ENOENT));
439 }
440 
441 /* Return (h1,cd1 >= h2,cd2) */
442 #define	HCD_GTEQ(h1, cd1, h2, cd2) \
443 	((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE))
444 
445 int
446 zap_leaf_lookup_closest(zap_leaf_t *l,
447     uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
448 {
449 	uint16_t chunk;
450 	uint64_t besth = -1ULL;
451 	uint32_t bestcd = -1U;
452 	uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
453 	uint16_t lh;
454 	struct zap_leaf_entry *le;
455 
456 	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
457 
458 	for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
459 		for (chunk = zap_leaf_phys(l)->l_hash[lh];
460 		    chunk != CHAIN_END; chunk = le->le_next) {
461 			le = ZAP_LEAF_ENTRY(l, chunk);
462 
463 			ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
464 			ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
465 
466 			if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
467 			    HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
468 				ASSERT3U(bestlh, >=, lh);
469 				bestlh = lh;
470 				besth = le->le_hash;
471 				bestcd = le->le_cd;
472 
473 				zeh->zeh_num_integers = le->le_value_numints;
474 				zeh->zeh_integer_size = le->le_value_intlen;
475 				zeh->zeh_cd = le->le_cd;
476 				zeh->zeh_hash = le->le_hash;
477 				zeh->zeh_fakechunk = chunk;
478 				zeh->zeh_chunkp = &zeh->zeh_fakechunk;
479 				zeh->zeh_leaf = l;
480 			}
481 		}
482 	}
483 
484 	return (bestcd == -1U ? ENOENT : 0);
485 }
486 
487 int
488 zap_entry_read(const zap_entry_handle_t *zeh,
489     uint8_t integer_size, uint64_t num_integers, void *buf)
490 {
491 	struct zap_leaf_entry *le =
492 	    ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
493 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
494 
495 	if (le->le_value_intlen > integer_size)
496 		return (SET_ERROR(EINVAL));
497 
498 	zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk,
499 	    le->le_value_intlen, le->le_value_numints,
500 	    integer_size, num_integers, buf);
501 
502 	if (zeh->zeh_num_integers > num_integers)
503 		return (SET_ERROR(EOVERFLOW));
504 	return (0);
505 
506 }
507 
508 int
509 zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen,
510     char *buf)
511 {
512 	struct zap_leaf_entry *le =
513 	    ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
514 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
515 
516 	if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
517 		zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8,
518 		    le->le_name_numints, 8, buflen / 8, buf);
519 	} else {
520 		zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
521 		    le->le_name_numints, 1, buflen, buf);
522 	}
523 	if (le->le_name_numints > buflen)
524 		return (SET_ERROR(EOVERFLOW));
525 	return (0);
526 }
527 
528 int
529 zap_entry_update(zap_entry_handle_t *zeh,
530     uint8_t integer_size, uint64_t num_integers, const void *buf)
531 {
532 	int delta_chunks;
533 	zap_leaf_t *l = zeh->zeh_leaf;
534 	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
535 
536 	delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
537 	    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen);
538 
539 	if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks)
540 		return (SET_ERROR(EAGAIN));
541 
542 	zap_leaf_array_free(l, &le->le_value_chunk);
543 	le->le_value_chunk =
544 	    zap_leaf_array_create(l, buf, integer_size, num_integers);
545 	le->le_value_numints = num_integers;
546 	le->le_value_intlen = integer_size;
547 	return (0);
548 }
549 
550 void
551 zap_entry_remove(zap_entry_handle_t *zeh)
552 {
553 	uint16_t entry_chunk;
554 	struct zap_leaf_entry *le;
555 	zap_leaf_t *l = zeh->zeh_leaf;
556 
557 	ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
558 
559 	entry_chunk = *zeh->zeh_chunkp;
560 	le = ZAP_LEAF_ENTRY(l, entry_chunk);
561 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
562 
563 	zap_leaf_array_free(l, &le->le_name_chunk);
564 	zap_leaf_array_free(l, &le->le_value_chunk);
565 
566 	*zeh->zeh_chunkp = le->le_next;
567 	zap_leaf_chunk_free(l, entry_chunk);
568 
569 	zap_leaf_phys(l)->l_hdr.lh_nentries--;
570 }
571 
572 int
573 zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
574     uint8_t integer_size, uint64_t num_integers, const void *buf,
575     zap_entry_handle_t *zeh)
576 {
577 	uint16_t chunk;
578 	uint16_t *chunkp;
579 	struct zap_leaf_entry *le;
580 	uint64_t valuelen;
581 	int numchunks;
582 	uint64_t h = zn->zn_hash;
583 
584 	valuelen = integer_size * num_integers;
585 
586 	numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
587 	    zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
588 	if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
589 		return (E2BIG);
590 
591 	if (cd == ZAP_NEED_CD) {
592 		/* find the lowest unused cd */
593 		if (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
594 			cd = 0;
595 
596 			for (chunk = *LEAF_HASH_ENTPTR(l, h);
597 			    chunk != CHAIN_END; chunk = le->le_next) {
598 				le = ZAP_LEAF_ENTRY(l, chunk);
599 				if (le->le_cd > cd)
600 					break;
601 				if (le->le_hash == h) {
602 					ASSERT3U(cd, ==, le->le_cd);
603 					cd++;
604 				}
605 			}
606 		} else {
607 			/* old unsorted format; do it the O(n^2) way */
608 			for (cd = 0; ; cd++) {
609 				for (chunk = *LEAF_HASH_ENTPTR(l, h);
610 				    chunk != CHAIN_END; chunk = le->le_next) {
611 					le = ZAP_LEAF_ENTRY(l, chunk);
612 					if (le->le_hash == h &&
613 					    le->le_cd == cd) {
614 						break;
615 					}
616 				}
617 				/* If this cd is not in use, we are good. */
618 				if (chunk == CHAIN_END)
619 					break;
620 			}
621 		}
622 		/*
623 		 * We would run out of space in a block before we could
624 		 * store enough entries to run out of CD values.
625 		 */
626 		ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
627 	}
628 
629 	if (zap_leaf_phys(l)->l_hdr.lh_nfree < numchunks)
630 		return (SET_ERROR(EAGAIN));
631 
632 	/* make the entry */
633 	chunk = zap_leaf_chunk_alloc(l);
634 	le = ZAP_LEAF_ENTRY(l, chunk);
635 	le->le_type = ZAP_CHUNK_ENTRY;
636 	le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig,
637 	    zn->zn_key_intlen, zn->zn_key_orig_numints);
638 	le->le_name_numints = zn->zn_key_orig_numints;
639 	le->le_value_chunk =
640 	    zap_leaf_array_create(l, buf, integer_size, num_integers);
641 	le->le_value_numints = num_integers;
642 	le->le_value_intlen = integer_size;
643 	le->le_hash = h;
644 	le->le_cd = cd;
645 
646 	/* link it into the hash chain */
647 	/* XXX if we did the search above, we could just use that */
648 	chunkp = zap_leaf_rehash_entry(l, chunk);
649 
650 	zap_leaf_phys(l)->l_hdr.lh_nentries++;
651 
652 	zeh->zeh_leaf = l;
653 	zeh->zeh_num_integers = num_integers;
654 	zeh->zeh_integer_size = le->le_value_intlen;
655 	zeh->zeh_cd = le->le_cd;
656 	zeh->zeh_hash = le->le_hash;
657 	zeh->zeh_chunkp = chunkp;
658 
659 	return (0);
660 }
661 
662 /*
663  * Determine if there is another entry with the same normalized form.
664  * For performance purposes, either zn or name must be provided (the
665  * other can be NULL).  Note, there usually won't be any hash
666  * conflicts, in which case we don't need the concatenated/normalized
667  * form of the name.  But all callers have one of these on hand anyway,
668  * so might as well take advantage.  A cleaner but slower interface
669  * would accept neither argument, and compute the normalized name as
670  * needed (using zap_name_alloc(zap_entry_read_name(zeh))).
671  */
672 boolean_t
673 zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
674     const char *name, zap_t *zap)
675 {
676 	uint64_t chunk;
677 	struct zap_leaf_entry *le;
678 	boolean_t allocdzn = B_FALSE;
679 
680 	if (zap->zap_normflags == 0)
681 		return (B_FALSE);
682 
683 	for (chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash);
684 	    chunk != CHAIN_END; chunk = le->le_next) {
685 		le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk);
686 		if (le->le_hash != zeh->zeh_hash)
687 			continue;
688 		if (le->le_cd == zeh->zeh_cd)
689 			continue;
690 
691 		if (zn == NULL) {
692 			zn = zap_name_alloc(zap, name, MT_NORMALIZE);
693 			allocdzn = B_TRUE;
694 		}
695 		if (zap_leaf_array_match(zeh->zeh_leaf, zn,
696 		    le->le_name_chunk, le->le_name_numints)) {
697 			if (allocdzn)
698 				zap_name_free(zn);
699 			return (B_TRUE);
700 		}
701 	}
702 	if (allocdzn)
703 		zap_name_free(zn);
704 	return (B_FALSE);
705 }
706 
707 /*
708  * Routines for transferring entries between leafs.
709  */
710 
711 static uint16_t *
712 zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
713 {
714 	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
715 	struct zap_leaf_entry *le2;
716 	uint16_t *chunkp;
717 
718 	/*
719 	 * keep the entry chain sorted by cd
720 	 * NB: this will not cause problems for unsorted leafs, though
721 	 * it is unnecessary there.
722 	 */
723 	for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash);
724 	    *chunkp != CHAIN_END; chunkp = &le2->le_next) {
725 		le2 = ZAP_LEAF_ENTRY(l, *chunkp);
726 		if (le2->le_cd > le->le_cd)
727 			break;
728 	}
729 
730 	le->le_next = *chunkp;
731 	*chunkp = entry;
732 	return (chunkp);
733 }
734 
735 static uint16_t
736 zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
737 {
738 	uint16_t new_chunk;
739 	uint16_t *nchunkp = &new_chunk;
740 
741 	while (chunk != CHAIN_END) {
742 		uint16_t nchunk = zap_leaf_chunk_alloc(nl);
743 		struct zap_leaf_array *nla =
744 		    &ZAP_LEAF_CHUNK(nl, nchunk).l_array;
745 		struct zap_leaf_array *la =
746 		    &ZAP_LEAF_CHUNK(l, chunk).l_array;
747 		int nextchunk = la->la_next;
748 
749 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
750 		ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));
751 
752 		*nla = *la; /* structure assignment */
753 
754 		zap_leaf_chunk_free(l, chunk);
755 		chunk = nextchunk;
756 		*nchunkp = nchunk;
757 		nchunkp = &nla->la_next;
758 	}
759 	*nchunkp = CHAIN_END;
760 	return (new_chunk);
761 }
762 
763 static void
764 zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
765 {
766 	struct zap_leaf_entry *le, *nle;
767 	uint16_t chunk;
768 
769 	le = ZAP_LEAF_ENTRY(l, entry);
770 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
771 
772 	chunk = zap_leaf_chunk_alloc(nl);
773 	nle = ZAP_LEAF_ENTRY(nl, chunk);
774 	*nle = *le; /* structure assignment */
775 
776 	(void) zap_leaf_rehash_entry(nl, chunk);
777 
778 	nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
779 	nle->le_value_chunk =
780 	    zap_leaf_transfer_array(l, le->le_value_chunk, nl);
781 
782 	zap_leaf_chunk_free(l, entry);
783 
784 	zap_leaf_phys(l)->l_hdr.lh_nentries--;
785 	zap_leaf_phys(nl)->l_hdr.lh_nentries++;
786 }
787 
788 /*
789  * Transfer the entries whose hash prefix ends in 1 to the new leaf.
790  */
791 void
792 zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
793 {
794 	int i;
795 	int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len;
796 
797 	/* set new prefix and prefix_len */
798 	zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1;
799 	zap_leaf_phys(l)->l_hdr.lh_prefix_len++;
800 	zap_leaf_phys(nl)->l_hdr.lh_prefix =
801 	    zap_leaf_phys(l)->l_hdr.lh_prefix | 1;
802 	zap_leaf_phys(nl)->l_hdr.lh_prefix_len =
803 	    zap_leaf_phys(l)->l_hdr.lh_prefix_len;
804 
805 	/* break existing hash chains */
806 	zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
807 	    2*ZAP_LEAF_HASH_NUMENTRIES(l));
808 
809 	if (sort)
810 		zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
811 
812 	/*
813 	 * Transfer entries whose hash bit 'bit' is set to nl; rehash
814 	 * the remaining entries
815 	 *
816 	 * NB: We could find entries via the hashtable instead. That
817 	 * would be O(hashents+numents) rather than O(numblks+numents),
818 	 * but this accesses memory more sequentially, and when we're
819 	 * called, the block is usually pretty full.
820 	 */
821 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
822 		struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
823 		if (le->le_type != ZAP_CHUNK_ENTRY)
824 			continue;
825 
826 		if (le->le_hash & (1ULL << bit))
827 			zap_leaf_transfer_entry(l, i, nl);
828 		else
829 			(void) zap_leaf_rehash_entry(l, i);
830 	}
831 }
832 
833 void
834 zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
835 {
836 	int i, n;
837 
838 	n = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
839 	    zap_leaf_phys(l)->l_hdr.lh_prefix_len;
840 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
841 	zs->zs_leafs_with_2n_pointers[n]++;
842 
843 
844 	n = zap_leaf_phys(l)->l_hdr.lh_nentries/5;
845 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
846 	zs->zs_blocks_with_n5_entries[n]++;
847 
848 	n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
849 	    zap_leaf_phys(l)->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
850 	    (1<<FZAP_BLOCK_SHIFT(zap));
851 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
852 	zs->zs_blocks_n_tenths_full[n]++;
853 
854 	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
855 		int nentries = 0;
856 		int chunk = zap_leaf_phys(l)->l_hash[i];
857 
858 		while (chunk != CHAIN_END) {
859 			struct zap_leaf_entry *le =
860 			    ZAP_LEAF_ENTRY(l, chunk);
861 
862 			n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) +
863 			    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints *
864 			    le->le_value_intlen);
865 			n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
866 			zs->zs_entries_using_n_chunks[n]++;
867 
868 			chunk = le->le_next;
869 			nentries++;
870 		}
871 
872 		n = nentries;
873 		n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
874 		zs->zs_buckets_with_n_entries[n]++;
875 	}
876 }
877