xref: /titanic_51/usr/src/uts/common/fs/zfs/zap_leaf.c (revision 30da143285931291f495cc20b5a1b8869f0618a6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * The 512-byte leaf is broken into 32 16-byte chunks.
31  * chunk number n means l_chunk[n], even though the header precedes it.
32  * the names are stored null-terminated.
33  */
34 
35 #include <sys/zfs_context.h>
36 #include <sys/zap.h>
37 #include <sys/zap_impl.h>
38 #include <sys/zap_leaf.h>
39 #include <sys/spa.h>
40 #include <sys/dmu.h>
41 
42 #define	CHAIN_END 0xffff /* end of the chunk chain */
43 
44 /* somewhat arbitrary, could go up to around 100k ... */
45 #define	MAX_ARRAY_BYTES (8<<10)
46 
47 #define	NCHUNKS(bytes) (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES)
48 
49 /*
50  * XXX This will >> by a negative number when
51  * lh_prefix_len > 64-ZAP_LEAF_HASH_SHIFT.
52  */
53 #define	LEAF_HASH(l, h) \
54 	((ZAP_LEAF_HASH_NUMENTRIES-1) & \
55 		((h) >> (64 - ZAP_LEAF_HASH_SHIFT-(l)->lh_prefix_len)))
56 
57 #define	LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
58 
59 /* #define	MEMCHECK */
60 
61 
62 static void
63 zap_memset(void *a, int c, size_t n)
64 {
65 	char *cp = a;
66 	char *cpend = cp + n;
67 
68 	while (cp < cpend)
69 		*cp++ = c;
70 }
71 
72 static void
73 stv(int len, void *addr, uint64_t value)
74 {
75 	switch (len) {
76 	case 1:
77 		*(uint8_t *)addr = value;
78 		return;
79 	case 2:
80 		*(uint16_t *)addr = value;
81 		return;
82 	case 4:
83 		*(uint32_t *)addr = value;
84 		return;
85 	case 8:
86 		*(uint64_t *)addr = value;
87 		return;
88 	}
89 	ASSERT(!"bad int len");
90 }
91 
92 static uint64_t
93 ldv(int len, const void *addr)
94 {
95 	switch (len) {
96 	case 1:
97 		return (*(uint8_t *)addr);
98 	case 2:
99 		return (*(uint16_t *)addr);
100 	case 4:
101 		return (*(uint32_t *)addr);
102 	case 8:
103 		return (*(uint64_t *)addr);
104 	}
105 	ASSERT(!"bad int len");
106 	return (0xFEEDFACEDEADBEEF);
107 }
108 
109 void
110 zap_leaf_byteswap(zap_leaf_phys_t *buf)
111 {
112 	int i;
113 
114 	buf->l_hdr.lhr_block_type = 	BSWAP_64(buf->l_hdr.lhr_block_type);
115 	buf->l_hdr.lhr_next = 		BSWAP_64(buf->l_hdr.lhr_next);
116 	buf->l_hdr.lhr_prefix = 	BSWAP_64(buf->l_hdr.lhr_prefix);
117 	buf->l_hdr.lhr_magic = 		BSWAP_32(buf->l_hdr.lhr_magic);
118 	buf->l_hdr.lhr_nfree = 		BSWAP_16(buf->l_hdr.lhr_nfree);
119 	buf->l_hdr.lhr_nentries = 	BSWAP_16(buf->l_hdr.lhr_nentries);
120 	buf->l_hdr.lhr_prefix_len = 	BSWAP_16(buf->l_hdr.lhr_prefix_len);
121 	buf->l_hdr.lh_freelist = 	BSWAP_16(buf->l_hdr.lh_freelist);
122 
123 	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES; i++)
124 		buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
125 
126 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
127 		struct zap_leaf_entry *le;
128 
129 		switch (buf->l_chunk[i].l_free.lf_type) {
130 		case ZAP_LEAF_ENTRY:
131 			le = &buf->l_chunk[i].l_entry;
132 
133 			le->le_type = BSWAP_8(le->le_type);
134 			le->le_int_size = BSWAP_8(le->le_int_size);
135 			le->le_next = BSWAP_16(le->le_next);
136 			le->le_name_chunk = BSWAP_16(le->le_name_chunk);
137 			le->le_name_length = BSWAP_16(le->le_name_length);
138 			le->le_value_chunk = BSWAP_16(le->le_value_chunk);
139 			le->le_value_length = BSWAP_16(le->le_value_length);
140 			le->le_cd = BSWAP_32(le->le_cd);
141 			le->le_hash = BSWAP_64(le->le_hash);
142 			break;
143 		case ZAP_LEAF_FREE:
144 			buf->l_chunk[i].l_free.lf_type =
145 			    BSWAP_8(buf->l_chunk[i].l_free.lf_type);
146 			buf->l_chunk[i].l_free.lf_next =
147 			    BSWAP_16(buf->l_chunk[i].l_free.lf_next);
148 			break;
149 		case ZAP_LEAF_ARRAY:
150 			/* zap_leaf_array */
151 			buf->l_chunk[i].l_array.la_type =
152 			    BSWAP_8(buf->l_chunk[i].l_array.la_type);
153 			buf->l_chunk[i].l_array.la_next =
154 			    BSWAP_16(buf->l_chunk[i].l_array.la_next);
155 			/* la_array doesn't need swapping */
156 			break;
157 		default:
158 			ASSERT(!"bad leaf type");
159 		}
160 	}
161 }
162 
163 void
164 zap_leaf_init(zap_leaf_t *l)
165 {
166 	int i;
167 
168 	ASSERT3U(sizeof (zap_leaf_phys_t), ==, l->l_dbuf->db_size);
169 	zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header));
170 	zap_memset(&l->l_phys->l_hash, CHAIN_END, sizeof (l->l_phys->l_hash));
171 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
172 		l->l_phys->l_chunk[i].l_free.lf_type = ZAP_LEAF_FREE;
173 		l->l_phys->l_chunk[i].l_free.lf_next = i+1;
174 	}
175 	l->l_phys->l_chunk[ZAP_LEAF_NUMCHUNKS-1].l_free.lf_next = CHAIN_END;
176 	l->lh_block_type = ZBT_LEAF;
177 	l->lh_magic = ZAP_LEAF_MAGIC;
178 	l->lh_nfree = ZAP_LEAF_NUMCHUNKS;
179 }
180 
181 zap_leaf_t *
182 zap_leaf_chainmore(zap_leaf_t *l, zap_leaf_t *nl)
183 {
184 	nl->lh_prefix = l->lh_prefix;
185 	nl->lh_prefix_len = l->lh_prefix_len;
186 	nl->l_next = l->l_next;
187 	l->l_next = nl;
188 	nl->lh_next = l->lh_next;
189 	l->lh_next = nl->l_blkid;
190 	return (nl);
191 }
192 
193 /*
194  * Routines which manipulate leaf chunks (l_chunk[]).
195  */
196 
197 static uint16_t
198 zap_leaf_chunk_alloc(zap_leaf_t *l)
199 {
200 	int chunk;
201 
202 	ASSERT(l->lh_nfree > 0);
203 
204 	chunk = l->l_phys->l_hdr.lh_freelist;
205 	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
206 	ASSERT3U(l->l_phys->l_chunk[chunk].l_free.lf_type, ==, ZAP_LEAF_FREE);
207 
208 	l->l_phys->l_hdr.lh_freelist = l->l_phys->l_chunk[chunk].l_free.lf_next;
209 
210 #ifdef MEMCHECK
211 	zap_memset(&l->l_phys->l_chunk[chunk], 0xa1,
212 	    sizeof (l->l_phys->l_chunk[chunk]));
213 #endif
214 
215 	l->lh_nfree--;
216 
217 	return (chunk);
218 }
219 
220 static void
221 zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
222 {
223 	struct zap_leaf_free *zlf = &l->l_phys->l_chunk[chunk].l_free;
224 	ASSERT3U(l->lh_nfree, <, ZAP_LEAF_NUMCHUNKS);
225 	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
226 	ASSERT(zlf->lf_type != ZAP_LEAF_FREE);
227 
228 #ifdef MEMCHECK
229 	zap_memset(&l->l_phys->l_chunk[chunk], 0xf4,
230 	    sizeof (l->l_phys->l_chunk[chunk]));
231 #endif
232 
233 	zlf->lf_type = ZAP_LEAF_FREE;
234 	zlf->lf_next = l->l_phys->l_hdr.lh_freelist;
235 	bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
236 	l->l_phys->l_hdr.lh_freelist = chunk;
237 
238 	l->lh_nfree++;
239 }
240 
241 
242 /*
243  * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
244  */
245 
246 static uint16_t
247 zap_leaf_array_create(const zap_entry_handle_t *zeh, const char *buf,
248 	int integer_size, int num_integers)
249 {
250 	uint16_t chunk_head;
251 	uint16_t *chunkp = &chunk_head;
252 	int byten = 0;
253 	uint64_t value;
254 	int shift = (integer_size-1)*8;
255 	int len = num_integers;
256 	zap_leaf_t *l = zeh->zeh_found_leaf;
257 
258 	ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);
259 
260 	while (len > 0) {
261 		uint16_t chunk = zap_leaf_chunk_alloc(l);
262 		struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array;
263 		int i;
264 
265 		la->la_type = ZAP_LEAF_ARRAY;
266 		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
267 			if (byten == 0)
268 				value = ldv(integer_size, buf);
269 			la->la_array[i] = (value & (0xff << shift)) >> shift;
270 			value <<= 8;
271 			if (++byten == integer_size) {
272 				byten = 0;
273 				buf += integer_size;
274 				if (--len == 0)
275 					break;
276 			}
277 		}
278 
279 		*chunkp = chunk;
280 		chunkp = &la->la_next;
281 	}
282 	*chunkp = CHAIN_END;
283 
284 	return (chunk_head);
285 }
286 
287 static void
288 zap_leaf_array_free(zap_entry_handle_t *zeh, uint16_t *chunkp)
289 {
290 	uint16_t chunk = *chunkp;
291 	zap_leaf_t *l = zeh->zeh_found_leaf;
292 
293 	*chunkp = CHAIN_END;
294 
295 	while (chunk != CHAIN_END) {
296 		int nextchunk = l->l_phys->l_chunk[chunk].l_array.la_next;
297 		ASSERT3U(l->l_phys->l_chunk[chunk].l_array.la_type, ==,
298 		    ZAP_LEAF_ARRAY);
299 		zap_leaf_chunk_free(l, chunk);
300 		chunk = nextchunk;
301 	}
302 }
303 
304 /* array_len and buf_len are in integers, not bytes */
305 static void
306 zap_leaf_array_read(const zap_entry_handle_t *zeh, uint16_t chunk,
307     int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
308     char *buf)
309 {
310 	int len = MIN(array_len, buf_len);
311 	int byten = 0;
312 	uint64_t value = 0;
313 	zap_leaf_t *l = zeh->zeh_found_leaf;
314 
315 	ASSERT3U(array_int_len, <=, buf_int_len);
316 
317 	/* Fast path for one 8-byte integer */
318 	if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
319 		struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array;
320 		uint8_t *ip = la->la_array;
321 		uint64_t *buf64 = (uint64_t *)buf;
322 
323 		*buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
324 		    (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
325 		    (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
326 		    (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
327 		return;
328 	}
329 
330 	/* Fast path for an array of 1-byte integers (eg. the entry name) */
331 	if (array_int_len == 1 && buf_int_len == 1 &&
332 	    buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
333 		while (chunk != CHAIN_END) {
334 			struct zap_leaf_array *la =
335 			    &l->l_phys->l_chunk[chunk].l_array;
336 			bcopy(la->la_array, buf, ZAP_LEAF_ARRAY_BYTES);
337 			buf += ZAP_LEAF_ARRAY_BYTES;
338 			chunk = la->la_next;
339 		}
340 		return;
341 	}
342 
343 	while (len > 0) {
344 		struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array;
345 		int i;
346 
347 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
348 		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
349 			value = (value << 8) | la->la_array[i];
350 			byten++;
351 			if (byten == array_int_len) {
352 				stv(buf_int_len, buf, value);
353 				byten = 0;
354 				len--;
355 				if (len == 0)
356 					return;
357 				buf += buf_int_len;
358 			}
359 		}
360 		chunk = la->la_next;
361 	}
362 }
363 
364 /*
365  * Only to be used on 8-bit arrays.
366  * array_len is actual len in bytes (not encoded le_value_length).
367  * buf is null-terminated.
368  */
369 static int
370 zap_leaf_array_equal(const zap_entry_handle_t *zeh, int chunk,
371     int array_len, const char *buf)
372 {
373 	int bseen = 0;
374 	zap_leaf_t *l = zeh->zeh_found_leaf;
375 
376 	while (bseen < array_len) {
377 		struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array;
378 		int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
379 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
380 		if (bcmp(la->la_array, buf + bseen, toread))
381 			break;
382 		chunk = la->la_next;
383 		bseen += toread;
384 	}
385 	return (bseen == array_len);
386 }
387 
388 /*
389  * Routines which manipulate leaf entries.
390  */
391 
392 int
393 zap_leaf_lookup(zap_leaf_t *l,
394     const char *name, uint64_t h, zap_entry_handle_t *zeh)
395 {
396 	uint16_t *chunkp;
397 	struct zap_leaf_entry *le;
398 
399 	zeh->zeh_head_leaf = l;
400 
401 again:
402 	ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC);
403 
404 	for (chunkp = LEAF_HASH_ENTPTR(l, h);
405 	    *chunkp != CHAIN_END; chunkp = &le->le_next) {
406 		uint16_t chunk = *chunkp;
407 		le = &l->l_phys->l_chunk[chunk].l_entry;
408 
409 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
410 		ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
411 
412 		if (le->le_hash != h)
413 			continue;
414 
415 		zeh->zeh_found_leaf = l;
416 		if (zap_leaf_array_equal(zeh, le->le_name_chunk,
417 		    le->le_name_length, name)) {
418 			zeh->zeh_num_integers = le->le_value_length;
419 			zeh->zeh_integer_size = le->le_int_size;
420 			zeh->zeh_cd = le->le_cd;
421 			zeh->zeh_hash = le->le_hash;
422 			zeh->zeh_chunkp = chunkp;
423 			zeh->zeh_found_leaf = l;
424 			return (0);
425 		}
426 	}
427 
428 	if (l->l_next) {
429 		l = l->l_next;
430 		goto again;
431 	}
432 
433 	return (ENOENT);
434 }
435 
436 /* Return (h1,cd1 >= h2,cd2) */
437 #define	HCD_GTEQ(h1, cd1, h2, cd2) \
438 	((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE))
439 
440 int
441 zap_leaf_lookup_closest(zap_leaf_t *l,
442     uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
443 {
444 	uint16_t chunk;
445 	uint64_t besth = -1ULL;
446 	uint32_t bestcd = ZAP_MAXCD;
447 	uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES-1;
448 	uint16_t lh;
449 	struct zap_leaf_entry *le;
450 
451 	zeh->zeh_head_leaf = l;
452 
453 again:
454 	ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC);
455 
456 	for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
457 		for (chunk = l->l_phys->l_hash[lh];
458 		    chunk != CHAIN_END; chunk = le->le_next) {
459 			le = &l->l_phys->l_chunk[chunk].l_entry;
460 
461 			ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
462 			ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
463 
464 			if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
465 			    HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
466 				ASSERT3U(bestlh, >=, lh);
467 				bestlh = lh;
468 				besth = le->le_hash;
469 				bestcd = le->le_cd;
470 
471 				zeh->zeh_num_integers = le->le_value_length;
472 				zeh->zeh_integer_size = le->le_int_size;
473 				zeh->zeh_cd = le->le_cd;
474 				zeh->zeh_hash = le->le_hash;
475 				zeh->zeh_fakechunk = chunk;
476 				zeh->zeh_chunkp = &zeh->zeh_fakechunk;
477 				zeh->zeh_found_leaf = l;
478 			}
479 		}
480 	}
481 
482 	if (l->l_next) {
483 		l = l->l_next;
484 		goto again;
485 	}
486 
487 	return (bestcd == ZAP_MAXCD ? ENOENT : 0);
488 }
489 
490 int
491 zap_entry_read(const zap_entry_handle_t *zeh,
492     uint8_t integer_size, uint64_t num_integers, void *buf)
493 {
494 	struct zap_leaf_entry *le;
495 
496 	le = &zeh->zeh_found_leaf->l_phys->l_chunk[*zeh->zeh_chunkp].l_entry;
497 	ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
498 
499 	if (le->le_int_size > integer_size)
500 		return (EINVAL);
501 
502 	zap_leaf_array_read(zeh, le->le_value_chunk, le->le_int_size,
503 	    le->le_value_length, integer_size, num_integers, buf);
504 
505 	if (zeh->zeh_num_integers > num_integers)
506 		return (EOVERFLOW);
507 	return (0);
508 
509 }
510 
511 int
512 zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf)
513 {
514 	struct zap_leaf_entry *le;
515 
516 	le = &zeh->zeh_found_leaf->l_phys->l_chunk[*zeh->zeh_chunkp].l_entry;
517 	ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
518 
519 	zap_leaf_array_read(zeh, le->le_name_chunk, 1,
520 	    le->le_name_length, 1, buflen, buf);
521 	if (le->le_name_length > buflen)
522 		return (EOVERFLOW);
523 	return (0);
524 }
525 
526 int
527 zap_entry_update(zap_entry_handle_t *zeh,
528 	uint8_t integer_size, uint64_t num_integers, const void *buf)
529 {
530 	int delta_chunks;
531 	struct zap_leaf_entry *le;
532 	le = &zeh->zeh_found_leaf->l_phys->l_chunk[*zeh->zeh_chunkp].l_entry;
533 
534 	delta_chunks = NCHUNKS(num_integers * integer_size) -
535 	    NCHUNKS(le->le_value_length * le->le_int_size);
536 
537 	if (zeh->zeh_found_leaf->lh_nfree < delta_chunks)
538 		return (EAGAIN);
539 
540 	/*
541 	 * We should search other chained leaves (via
542 	 * zap_entry_remove,create?) otherwise returning EAGAIN will
543 	 * just send us into an infinite loop if we have to chain
544 	 * another leaf block, rather than being able to split this
545 	 * block.
546 	 */
547 
548 	zap_leaf_array_free(zeh, &le->le_value_chunk);
549 	le->le_value_chunk =
550 	    zap_leaf_array_create(zeh, buf, integer_size, num_integers);
551 	le->le_value_length = (num_integers*integer_size > MAX_ARRAY_BYTES) ?
552 	    (MAX_ARRAY_BYTES + 1) : (num_integers);
553 	le->le_int_size = integer_size;
554 	return (0);
555 }
556 
557 void
558 zap_entry_remove(zap_entry_handle_t *zeh)
559 {
560 	uint16_t entry_chunk;
561 	struct zap_leaf_entry *le;
562 	zap_leaf_t *l = zeh->zeh_found_leaf;
563 
564 	ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
565 
566 	entry_chunk = *zeh->zeh_chunkp;
567 	le = &l->l_phys->l_chunk[entry_chunk].l_entry;
568 	ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
569 
570 	zap_leaf_array_free(zeh, &le->le_name_chunk);
571 	zap_leaf_array_free(zeh, &le->le_value_chunk);
572 
573 	*zeh->zeh_chunkp = le->le_next;
574 	zap_leaf_chunk_free(l, entry_chunk);
575 
576 	l->lh_nentries--;
577 }
578 
579 int
580 zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
581     uint8_t integer_size, uint64_t num_integers, const void *buf,
582     zap_entry_handle_t *zeh)
583 {
584 	uint16_t chunk;
585 	uint16_t *chunkp;
586 	struct zap_leaf_entry *le;
587 	uint64_t namelen, valuelen;
588 	int numchunks;
589 
590 	valuelen = integer_size * num_integers;
591 	namelen = strlen(name) + 1;
592 	ASSERT(namelen >= 2);
593 
594 	zeh->zeh_head_leaf = l;
595 
596 	if (namelen > MAXNAMELEN)
597 		return (ENAMETOOLONG);
598 	/* find the first leaf in the chain that has sufficient free space */
599 	numchunks = 1 + NCHUNKS(namelen) + NCHUNKS(valuelen);
600 	if (numchunks > ZAP_LEAF_NUMCHUNKS)
601 		return (E2BIG);
602 
603 	if (cd == ZAP_MAXCD) {
604 		for (cd = 0; cd < ZAP_MAXCD; cd++) {
605 			zap_leaf_t *ll;
606 			for (ll = l; ll; ll = ll->l_next) {
607 				for (chunk = *LEAF_HASH_ENTPTR(ll, h);
608 				    chunk != CHAIN_END; chunk = le->le_next) {
609 					le = &ll->l_phys->l_chunk
610 					    [chunk].l_entry;
611 					if (le->le_hash == h &&
612 					    le->le_cd == cd) {
613 						break;
614 					}
615 				}
616 				/*
617 				 * if this cd is in use, no need to
618 				 * check more chained leafs
619 				 */
620 				if (chunk != CHAIN_END)
621 					break;
622 			}
623 			/* If this cd is not in use, we are good. */
624 			if (chunk == CHAIN_END)
625 				break;
626 		}
627 		/* If we tried all the cd's, we lose. */
628 		if (cd == ZAP_MAXCD)
629 			return (ENOSPC);
630 	}
631 
632 	for (; l; l = l->l_next)
633 		if (l->lh_nfree >= numchunks)
634 			break;
635 	if (l == NULL)
636 		return (EAGAIN);
637 
638 	zeh->zeh_found_leaf = l;
639 
640 	/* make the entry */
641 	chunk = zap_leaf_chunk_alloc(l);
642 	le = &l->l_phys->l_chunk[chunk].l_entry;
643 	le->le_type = ZAP_LEAF_ENTRY;
644 	le->le_name_chunk = zap_leaf_array_create(zeh, name, 1, namelen);
645 	le->le_name_length = namelen;
646 	le->le_value_chunk =
647 	    zap_leaf_array_create(zeh, buf, integer_size, num_integers);
648 	le->le_value_length = (num_integers*integer_size > MAX_ARRAY_BYTES) ?
649 	    (MAX_ARRAY_BYTES + 1) : (num_integers);
650 	le->le_int_size = integer_size;
651 	le->le_hash = h;
652 	le->le_cd = cd;
653 
654 	/* link it into the hash chain */
655 	chunkp = LEAF_HASH_ENTPTR(l, h);
656 	le->le_next = *chunkp;
657 	*chunkp = chunk;
658 
659 	l->lh_nentries++;
660 
661 	zeh->zeh_num_integers = num_integers;
662 	zeh->zeh_integer_size = le->le_int_size;
663 	zeh->zeh_cd = le->le_cd;
664 	zeh->zeh_hash = le->le_hash;
665 	zeh->zeh_chunkp = chunkp;
666 
667 	return (0);
668 }
669 
670 /*
671  * Routines for transferring entries between leafs.
672  */
673 
674 static void
675 zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
676 {
677 	struct zap_leaf_entry *le = &l->l_phys->l_chunk[entry].l_entry;
678 	uint16_t *ptr = LEAF_HASH_ENTPTR(l, le->le_hash);
679 	le->le_next = *ptr;
680 	*ptr = entry;
681 }
682 
683 static void
684 zap_leaf_rehash_entries(zap_leaf_t *l)
685 {
686 	int i;
687 
688 	if (l->lh_nentries == 0)
689 		return;
690 
691 	/* break existing hash chains */
692 	zap_memset(l->l_phys->l_hash, CHAIN_END, sizeof (l->l_phys->l_hash));
693 
694 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
695 		struct zap_leaf_entry *le = &l->l_phys->l_chunk[i].l_entry;
696 		if (le->le_type != ZAP_LEAF_ENTRY)
697 			continue;
698 		zap_leaf_rehash_entry(l, i);
699 	}
700 }
701 
702 static uint16_t
703 zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
704 {
705 	uint16_t new_chunk;
706 	uint16_t *nchunkp = &new_chunk;
707 
708 	while (chunk != CHAIN_END) {
709 		uint16_t nchunk = zap_leaf_chunk_alloc(nl);
710 		struct zap_leaf_array *nla =
711 		    &nl->l_phys->l_chunk[nchunk].l_array;
712 		struct zap_leaf_array *la =
713 		    &l->l_phys->l_chunk[chunk].l_array;
714 		int nextchunk = la->la_next;
715 
716 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
717 		ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS);
718 
719 		*nla = *la;
720 
721 		zap_leaf_chunk_free(l, chunk);
722 		chunk = nextchunk;
723 		*nchunkp = nchunk;
724 		nchunkp = &nla->la_next;
725 	}
726 	*nchunkp = CHAIN_END;
727 	return (new_chunk);
728 }
729 
730 static void
731 zap_leaf_transfer_entry(zap_t *zap, zap_leaf_t *l, int entry, zap_leaf_t *nhl,
732     dmu_tx_t *tx)
733 {
734 	zap_leaf_t *nl;
735 	struct zap_leaf_entry *le, *nle;
736 	uint16_t chunk, nchunks;
737 
738 	le = &l->l_phys->l_chunk[entry].l_entry;
739 	ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
740 
741 	/* find a leaf in the destination leaf chain with enough free space */
742 	nchunks = 1 + NCHUNKS(le->le_name_length) +
743 	    NCHUNKS(le->le_value_length * le->le_int_size);
744 	for (nl = nhl; nl; nl = nl->l_next)
745 		if (nl->lh_nfree >= nchunks)
746 			break;
747 	if (nl == NULL) {
748 		nl = zap_leaf_chainmore(nhl, zap_create_leaf(zap, tx));
749 		dprintf("transfer_entry: chaining leaf %x/%d\n",
750 		    nl->lh_prefix, nl->lh_prefix_len);
751 	}
752 
753 	chunk = zap_leaf_chunk_alloc(nl);
754 	nle = &nl->l_phys->l_chunk[chunk].l_entry;
755 	*nle = *le;
756 
757 	zap_leaf_rehash_entry(nl, chunk);
758 
759 	nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
760 	nle->le_value_chunk =
761 	    zap_leaf_transfer_array(l, le->le_value_chunk, nl);
762 
763 	zap_leaf_chunk_free(l, entry);
764 
765 	l->lh_nentries--;
766 	nl->lh_nentries++;
767 }
768 
769 /*
770  * Transfer entries whose hash bit 'bit' is 1 to nl1, and 0 to nl0.
771  * Ignore leaf chaining in source (l), but chain in destinations.
772  * We'll re-chain all the entries in l as we go along.
773  */
774 static void
775 zap_leaf_transfer_entries(zap_t *zap, zap_leaf_t *l,
776     zap_leaf_t *nl0, zap_leaf_t *nl1, int bit, dmu_tx_t *tx)
777 {
778 	int i;
779 
780 	ASSERT(bit < 64 && bit >= 0);
781 	/* break existing hash chains */
782 	zap_memset(l->l_phys->l_hash, CHAIN_END, sizeof (l->l_phys->l_hash));
783 
784 	if (nl0 != l)
785 		zap_leaf_rehash_entries(nl0);
786 	if (nl1 != nl0)
787 		zap_leaf_rehash_entries(nl1);
788 
789 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
790 		struct zap_leaf_entry *le = &l->l_phys->l_chunk[i].l_entry;
791 		if (le->le_type != ZAP_LEAF_ENTRY)
792 			continue;
793 
794 		/*
795 		 * We could find entries via hashtable instead. That
796 		 * would be O(hashents+numents) rather than
797 		 * O(numblks+numents), but this accesses memory more
798 		 * sequentially, and when we're called, the block is
799 		 * usually pretty full.
800 		 */
801 
802 		if (le->le_hash & (1ULL << bit)) {
803 			zap_leaf_transfer_entry(zap, l, i, nl1, tx);
804 		} else {
805 			if (nl0 == l)
806 				zap_leaf_rehash_entry(l, i);
807 			else
808 				zap_leaf_transfer_entry(zap, l, i, nl0, tx);
809 		}
810 	}
811 
812 }
813 
814 /*
815  * nl will contain the entries whose hash prefix ends in 1
816  * handles leaf chaining
817  */
818 zap_leaf_t *
819 zap_leaf_split(zap_t *zap, zap_leaf_t *hl, dmu_tx_t *tx)
820 {
821 	zap_leaf_t *l = hl;
822 	int bit = 64 - 1 - hl->lh_prefix_len;
823 	zap_leaf_t *nl = zap_create_leaf(zap, tx);
824 
825 	/* set new prefix and prefix_len */
826 	hl->lh_prefix <<= 1;
827 	hl->lh_prefix_len++;
828 	nl->lh_prefix = hl->lh_prefix | 1;
829 	nl->lh_prefix_len = hl->lh_prefix_len;
830 
831 	/* transfer odd entries from first leaf in hl chain to nl */
832 	zap_leaf_transfer_entries(zap, hl, hl, nl, bit, tx);
833 
834 	/* take rest of chain off hl */
835 	l = hl->l_next;
836 	hl->l_next = NULL;
837 	hl->lh_next = 0;
838 
839 	/* transfer even entries from hl chain back to hl, odd entries to nl */
840 	while (l) {
841 		zap_leaf_t *next = l->l_next;
842 		zap_leaf_transfer_entries(zap, l, hl, nl, bit, tx);
843 		zap_destroy_leaf(zap, l, tx);
844 		l = next;
845 	}
846 
847 	return (nl);
848 }
849 
850 void
851 zap_stats_leaf(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
852 {
853 	int n, nchained = 0;
854 
855 	n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - l->lh_prefix_len;
856 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
857 	zs->zs_leafs_with_2n_pointers[n]++;
858 
859 	do {
860 		int i;
861 
862 		n = l->lh_nentries/5;
863 		n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
864 		zs->zs_blocks_with_n5_entries[n]++;
865 
866 		n = ((1<<ZAP_BLOCK_SHIFT) -
867 		    l->lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
868 		    (1<<ZAP_BLOCK_SHIFT);
869 		n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
870 		zs->zs_blocks_n_tenths_full[n]++;
871 
872 		for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES; i++) {
873 			int nentries = 0;
874 			int chunk = l->l_phys->l_hash[i];
875 
876 			while (chunk != CHAIN_END) {
877 				struct zap_leaf_entry *le =
878 				    &l->l_phys->l_chunk[chunk].l_entry;
879 
880 				n = 1 + NCHUNKS(le->le_name_length) +
881 				    NCHUNKS(le->le_value_length *
882 					le->le_int_size);
883 				n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
884 				zs->zs_entries_using_n_chunks[n]++;
885 
886 				chunk = le->le_next;
887 				nentries++;
888 			}
889 
890 			n = nentries;
891 			n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
892 			zs->zs_buckets_with_n_entries[n]++;
893 		}
894 
895 		nchained++;
896 		l = l->l_next;
897 	} while (l);
898 
899 	n = nchained-1;
900 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
901 	zs->zs_leafs_with_n_chained[n]++;
902 }
903