xref: /titanic_50/usr/src/uts/common/fs/zfs/zap_leaf.c (revision 4610e4a00999c6d2291b3fc263926b890ec500a5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * The 512-byte leaf is broken into 32 16-byte chunks.
31  * chunk number n means l_chunk[n], even though the header precedes it.
32  * the names are stored null-terminated.
33  */
34 
35 #include <sys/zfs_context.h>
36 #include <sys/zap.h>
37 #include <sys/zap_impl.h>
38 #include <sys/zap_leaf.h>
39 #include <sys/spa.h>
40 #include <sys/dmu.h>
41 
42 #define	CHAIN_END 0xffff /* end of the chunk chain */
43 
44 /* somewhat arbitrary, could go up to around 100k ... */
45 #define	MAX_ARRAY_BYTES (8<<10)
46 
47 #define	NCHUNKS(bytes) (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES)
48 
49 /*
50  * XXX This will >> by a negative number when
51  * lh_prefix_len > 64-ZAP_LEAF_HASH_SHIFT.
52  */
53 #define	LEAF_HASH(l, h) \
54 	((ZAP_LEAF_HASH_NUMENTRIES-1) & \
55 		((h) >> (64 - ZAP_LEAF_HASH_SHIFT-(l)->lh_prefix_len)))
56 
57 #define	LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
58 
59 /* #define	MEMCHECK */
60 
61 
62 static void
63 zap_memset(void *a, int c, size_t n)
64 {
65 	char *cp = a;
66 	char *cpend = cp + n;
67 
68 	while (cp < cpend)
69 		*cp++ = c;
70 }
71 
72 static void
73 stv(int len, void *addr, uint64_t value)
74 {
75 	switch (len) {
76 	case 1:
77 		*(uint8_t *)addr = value;
78 		return;
79 	case 2:
80 		*(uint16_t *)addr = value;
81 		return;
82 	case 4:
83 		*(uint32_t *)addr = value;
84 		return;
85 	case 8:
86 		*(uint64_t *)addr = value;
87 		return;
88 	}
89 	ASSERT(!"bad int len");
90 }
91 
92 static uint64_t
93 ldv(int len, const void *addr)
94 {
95 	switch (len) {
96 	case 1:
97 		return (*(uint8_t *)addr);
98 	case 2:
99 		return (*(uint16_t *)addr);
100 	case 4:
101 		return (*(uint32_t *)addr);
102 	case 8:
103 		return (*(uint64_t *)addr);
104 	}
105 	ASSERT(!"bad int len");
106 	return (0xFEEDFACEDEADBEEF);
107 }
108 
109 void
110 zap_leaf_byteswap(zap_leaf_phys_t *buf)
111 {
112 	int i;
113 
114 	buf->l_hdr.lhr_block_type = 	BSWAP_64(buf->l_hdr.lhr_block_type);
115 	buf->l_hdr.lhr_next = 		BSWAP_64(buf->l_hdr.lhr_next);
116 	buf->l_hdr.lhr_prefix = 	BSWAP_64(buf->l_hdr.lhr_prefix);
117 	buf->l_hdr.lhr_magic = 		BSWAP_32(buf->l_hdr.lhr_magic);
118 	buf->l_hdr.lhr_nfree = 		BSWAP_16(buf->l_hdr.lhr_nfree);
119 	buf->l_hdr.lhr_nentries = 	BSWAP_16(buf->l_hdr.lhr_nentries);
120 	buf->l_hdr.lhr_prefix_len = 	BSWAP_16(buf->l_hdr.lhr_prefix_len);
121 	buf->l_hdr.lh_freelist = 	BSWAP_16(buf->l_hdr.lh_freelist);
122 
123 	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES; i++)
124 		buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
125 
126 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
127 		struct zap_leaf_entry *le;
128 
129 		switch (buf->l_chunk[i].l_free.lf_type) {
130 		case ZAP_LEAF_ENTRY:
131 			le = &buf->l_chunk[i].l_entry;
132 
133 			le->le_type = BSWAP_8(le->le_type);
134 			le->le_int_size = BSWAP_8(le->le_int_size);
135 			le->le_next = BSWAP_16(le->le_next);
136 			le->le_name_chunk = BSWAP_16(le->le_name_chunk);
137 			le->le_name_length = BSWAP_16(le->le_name_length);
138 			le->le_value_chunk = BSWAP_16(le->le_value_chunk);
139 			le->le_value_length = BSWAP_16(le->le_value_length);
140 			le->le_cd = BSWAP_32(le->le_cd);
141 			le->le_hash = BSWAP_64(le->le_hash);
142 			break;
143 		case ZAP_LEAF_FREE:
144 			buf->l_chunk[i].l_free.lf_type =
145 			    BSWAP_8(buf->l_chunk[i].l_free.lf_type);
146 			buf->l_chunk[i].l_free.lf_next =
147 			    BSWAP_16(buf->l_chunk[i].l_free.lf_next);
148 			break;
149 		case ZAP_LEAF_ARRAY:
150 			/* zap_leaf_array */
151 			buf->l_chunk[i].l_array.la_type =
152 			    BSWAP_8(buf->l_chunk[i].l_array.la_type);
153 			buf->l_chunk[i].l_array.la_next =
154 			    BSWAP_16(buf->l_chunk[i].l_array.la_next);
155 			/* la_array doesn't need swapping */
156 			break;
157 		default:
158 			ASSERT(!"bad leaf type");
159 		}
160 	}
161 }
162 
163 void
164 zap_leaf_init(zap_leaf_t *l)
165 {
166 	int i;
167 
168 	ASSERT3U(sizeof (zap_leaf_phys_t), ==, l->l_dbuf->db_size);
169 	zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header));
170 	zap_memset(&l->l_phys->l_hash, CHAIN_END, sizeof (l->l_phys->l_hash));
171 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
172 		l->l_phys->l_chunk[i].l_free.lf_type = ZAP_LEAF_FREE;
173 		l->l_phys->l_chunk[i].l_free.lf_next = i+1;
174 	}
175 	l->l_phys->l_chunk[ZAP_LEAF_NUMCHUNKS-1].l_free.lf_next = CHAIN_END;
176 	l->lh_block_type = ZBT_LEAF;
177 	l->lh_magic = ZAP_LEAF_MAGIC;
178 	l->lh_nfree = ZAP_LEAF_NUMCHUNKS;
179 }
180 
181 zap_leaf_t *
182 zap_leaf_chainmore(zap_leaf_t *l, zap_leaf_t *nl)
183 {
184 	nl->lh_prefix = l->lh_prefix;
185 	nl->lh_prefix_len = l->lh_prefix_len;
186 	nl->l_next = l->l_next;
187 	l->l_next = nl;
188 	nl->lh_next = l->lh_next;
189 	l->lh_next = nl->l_blkid;
190 	return (nl);
191 }
192 
193 /*
194  * Routines which manipulate leaf chunks (l_chunk[]).
195  */
196 
197 static uint16_t
198 zap_leaf_chunk_alloc(zap_leaf_t *l)
199 {
200 	int chunk;
201 
202 	ASSERT(l->lh_nfree > 0);
203 
204 	chunk = l->l_phys->l_hdr.lh_freelist;
205 	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
206 	ASSERT3U(l->l_phys->l_chunk[chunk].l_free.lf_type, ==, ZAP_LEAF_FREE);
207 
208 	l->l_phys->l_hdr.lh_freelist = l->l_phys->l_chunk[chunk].l_free.lf_next;
209 
210 #ifdef MEMCHECK
211 	zap_memset(&l->l_phys->l_chunk[chunk], 0xa1,
212 	    sizeof (l->l_phys->l_chunk[chunk]));
213 #endif
214 
215 	l->lh_nfree--;
216 
217 	return (chunk);
218 }
219 
220 static void
221 zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
222 {
223 	struct zap_leaf_free *zlf = &l->l_phys->l_chunk[chunk].l_free;
224 	ASSERT3U(l->lh_nfree, <, ZAP_LEAF_NUMCHUNKS);
225 	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
226 	ASSERT(zlf->lf_type != ZAP_LEAF_FREE);
227 
228 #ifdef MEMCHECK
229 	zap_memset(&l->l_phys->l_chunk[chunk], 0xf4,
230 	    sizeof (l->l_phys->l_chunk[chunk]));
231 #endif
232 
233 	zlf->lf_type = ZAP_LEAF_FREE;
234 	zlf->lf_next = l->l_phys->l_hdr.lh_freelist;
235 	bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
236 	l->l_phys->l_hdr.lh_freelist = chunk;
237 
238 	l->lh_nfree++;
239 }
240 
241 
242 /*
243  * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
244  */
245 
246 static uint16_t
247 zap_leaf_array_create(const zap_entry_handle_t *zeh, const char *buf,
248 	int integer_size, int num_integers)
249 {
250 	uint16_t chunk_head;
251 	uint16_t *chunkp = &chunk_head;
252 	int byten = 0;
253 	uint64_t value;
254 	int shift = (integer_size-1)*8;
255 	int len = num_integers;
256 	zap_leaf_t *l = zeh->zeh_found_leaf;
257 
258 	ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);
259 
260 	while (len > 0) {
261 		uint16_t chunk = zap_leaf_chunk_alloc(l);
262 		struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array;
263 		int i;
264 
265 		la->la_type = ZAP_LEAF_ARRAY;
266 		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
267 			if (byten == 0)
268 				value = ldv(integer_size, buf);
269 			la->la_array[i] = (value & (0xff << shift)) >> shift;
270 			value <<= 8;
271 			if (++byten == integer_size) {
272 				byten = 0;
273 				buf += integer_size;
274 				if (--len == 0)
275 					break;
276 			}
277 		}
278 
279 		*chunkp = chunk;
280 		chunkp = &la->la_next;
281 	}
282 	*chunkp = CHAIN_END;
283 
284 	return (chunk_head);
285 }
286 
287 static void
288 zap_leaf_array_free(zap_entry_handle_t *zeh, uint16_t *chunkp)
289 {
290 	uint16_t chunk = *chunkp;
291 	zap_leaf_t *l = zeh->zeh_found_leaf;
292 
293 	*chunkp = CHAIN_END;
294 
295 	while (chunk != CHAIN_END) {
296 		int nextchunk = l->l_phys->l_chunk[chunk].l_array.la_next;
297 		ASSERT3U(l->l_phys->l_chunk[chunk].l_array.la_type, ==,
298 		    ZAP_LEAF_ARRAY);
299 		zap_leaf_chunk_free(l, chunk);
300 		chunk = nextchunk;
301 	}
302 }
303 
304 /* array_len and buf_len are in integers, not bytes */
305 static void
306 zap_leaf_array_read(const zap_entry_handle_t *zeh, uint16_t chunk,
307     int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
308     char *buf)
309 {
310 	int len = MIN(array_len, buf_len);
311 	int byten = 0;
312 	uint64_t value = 0;
313 	zap_leaf_t *l = zeh->zeh_found_leaf;
314 
315 	ASSERT3U(array_int_len, <=, buf_int_len);
316 
317 	while (len > 0) {
318 		struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array;
319 		int i;
320 
321 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
322 		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
323 			value = (value << 8) | la->la_array[i];
324 			byten++;
325 			if (byten == array_int_len) {
326 				stv(buf_int_len, buf, value);
327 				byten = 0;
328 				len--;
329 				if (len == 0)
330 					return;
331 				buf += buf_int_len;
332 			}
333 		}
334 		chunk = la->la_next;
335 	}
336 }
337 
338 /*
339  * Only to be used on 8-bit arrays.
340  * array_len is actual len in bytes (not encoded le_value_length).
341  * buf is null-terminated.
342  */
343 static int
344 zap_leaf_array_equal(const zap_entry_handle_t *zeh, int chunk,
345     int array_len, const char *buf)
346 {
347 	int bseen = 0;
348 	zap_leaf_t *l = zeh->zeh_found_leaf;
349 
350 	while (bseen < array_len) {
351 		struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array;
352 		int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
353 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
354 		if (bcmp(la->la_array, buf + bseen, toread))
355 			break;
356 		chunk = la->la_next;
357 		bseen += toread;
358 	}
359 	return (bseen == array_len);
360 }
361 
362 /*
363  * Routines which manipulate leaf entries.
364  */
365 
366 int
367 zap_leaf_lookup(zap_leaf_t *l,
368     const char *name, uint64_t h, zap_entry_handle_t *zeh)
369 {
370 	uint16_t *chunkp;
371 	struct zap_leaf_entry *le;
372 
373 	zeh->zeh_head_leaf = l;
374 
375 again:
376 	ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC);
377 
378 	for (chunkp = LEAF_HASH_ENTPTR(l, h);
379 	    *chunkp != CHAIN_END; chunkp = &le->le_next) {
380 		uint16_t chunk = *chunkp;
381 		le = &l->l_phys->l_chunk[chunk].l_entry;
382 
383 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
384 		ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
385 
386 		if (le->le_hash != h)
387 			continue;
388 
389 		zeh->zeh_found_leaf = l;
390 		if (zap_leaf_array_equal(zeh, le->le_name_chunk,
391 		    le->le_name_length, name)) {
392 			zeh->zeh_num_integers = le->le_value_length;
393 			zeh->zeh_integer_size = le->le_int_size;
394 			zeh->zeh_cd = le->le_cd;
395 			zeh->zeh_hash = le->le_hash;
396 			zeh->zeh_chunkp = chunkp;
397 			zeh->zeh_found_leaf = l;
398 			return (0);
399 		}
400 	}
401 
402 	if (l->l_next) {
403 		l = l->l_next;
404 		goto again;
405 	}
406 
407 	return (ENOENT);
408 }
409 
410 /* Return (h1,cd1 >= h2,cd2) */
411 static int
412 hcd_gteq(uint64_t h1, uint32_t cd1, uint64_t h2, uint32_t cd2)
413 {
414 	if (h1 > h2)
415 		return (TRUE);
416 	if (h1 == h2 && cd1 >= cd2)
417 		return (TRUE);
418 	return (FALSE);
419 }
420 
421 int
422 zap_leaf_lookup_closest(zap_leaf_t *l,
423     uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
424 {
425 	uint16_t chunk;
426 	uint64_t besth = -1ULL;
427 	uint32_t bestcd = ZAP_MAXCD;
428 	uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES-1;
429 	uint16_t lh;
430 	struct zap_leaf_entry *le;
431 
432 	zeh->zeh_head_leaf = l;
433 
434 again:
435 	ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC);
436 
437 	for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
438 		for (chunk = l->l_phys->l_hash[lh];
439 		    chunk != CHAIN_END; chunk = le->le_next) {
440 			le = &l->l_phys->l_chunk[chunk].l_entry;
441 
442 			ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
443 			ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
444 
445 			if (hcd_gteq(le->le_hash, le->le_cd, h, cd) &&
446 			    hcd_gteq(besth, bestcd, le->le_hash, le->le_cd)) {
447 				ASSERT3U(bestlh, >=, lh);
448 				bestlh = lh;
449 				besth = le->le_hash;
450 				bestcd = le->le_cd;
451 
452 				zeh->zeh_num_integers = le->le_value_length;
453 				zeh->zeh_integer_size = le->le_int_size;
454 				zeh->zeh_cd = le->le_cd;
455 				zeh->zeh_hash = le->le_hash;
456 				zeh->zeh_fakechunk = chunk;
457 				zeh->zeh_chunkp = &zeh->zeh_fakechunk;
458 				zeh->zeh_found_leaf = l;
459 			}
460 		}
461 	}
462 
463 	if (l->l_next) {
464 		l = l->l_next;
465 		goto again;
466 	}
467 
468 	return (bestcd == ZAP_MAXCD ? ENOENT : 0);
469 }
470 
471 int
472 zap_entry_read(const zap_entry_handle_t *zeh,
473     uint8_t integer_size, uint64_t num_integers, void *buf)
474 {
475 	struct zap_leaf_entry *le;
476 
477 	le = &zeh->zeh_found_leaf->l_phys->l_chunk[*zeh->zeh_chunkp].l_entry;
478 	ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
479 
480 	if (le->le_int_size > integer_size)
481 		return (EINVAL);
482 
483 	zap_leaf_array_read(zeh, le->le_value_chunk, le->le_int_size,
484 	    le->le_value_length, integer_size, num_integers, buf);
485 
486 	if (zeh->zeh_num_integers > num_integers)
487 		return (EOVERFLOW);
488 	return (0);
489 
490 }
491 
492 int
493 zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf)
494 {
495 	struct zap_leaf_entry *le;
496 
497 	le = &zeh->zeh_found_leaf->l_phys->l_chunk[*zeh->zeh_chunkp].l_entry;
498 	ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
499 
500 	zap_leaf_array_read(zeh, le->le_name_chunk, 1,
501 	    le->le_name_length, 1, buflen, buf);
502 	if (le->le_name_length > buflen)
503 		return (EOVERFLOW);
504 	return (0);
505 }
506 
507 int
508 zap_entry_update(zap_entry_handle_t *zeh,
509 	uint8_t integer_size, uint64_t num_integers, const void *buf)
510 {
511 	int delta_chunks;
512 	struct zap_leaf_entry *le;
513 	le = &zeh->zeh_found_leaf->l_phys->l_chunk[*zeh->zeh_chunkp].l_entry;
514 
515 	delta_chunks = NCHUNKS(num_integers * integer_size) -
516 	    NCHUNKS(le->le_value_length * le->le_int_size);
517 
518 	if (zeh->zeh_found_leaf->lh_nfree < delta_chunks)
519 		return (EAGAIN);
520 
521 	/*
522 	 * We should search other chained leaves (via
523 	 * zap_entry_remove,create?) otherwise returning EAGAIN will
524 	 * just send us into an infinite loop if we have to chain
525 	 * another leaf block, rather than being able to split this
526 	 * block.
527 	 */
528 
529 	zap_leaf_array_free(zeh, &le->le_value_chunk);
530 	le->le_value_chunk =
531 	    zap_leaf_array_create(zeh, buf, integer_size, num_integers);
532 	le->le_value_length = (num_integers*integer_size > MAX_ARRAY_BYTES) ?
533 	    (MAX_ARRAY_BYTES + 1) : (num_integers);
534 	le->le_int_size = integer_size;
535 	return (0);
536 }
537 
538 void
539 zap_entry_remove(zap_entry_handle_t *zeh)
540 {
541 	uint16_t entry_chunk;
542 	struct zap_leaf_entry *le;
543 	zap_leaf_t *l = zeh->zeh_found_leaf;
544 
545 	ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
546 
547 	entry_chunk = *zeh->zeh_chunkp;
548 	le = &l->l_phys->l_chunk[entry_chunk].l_entry;
549 	ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
550 
551 	zap_leaf_array_free(zeh, &le->le_name_chunk);
552 	zap_leaf_array_free(zeh, &le->le_value_chunk);
553 
554 	*zeh->zeh_chunkp = le->le_next;
555 	zap_leaf_chunk_free(l, entry_chunk);
556 
557 	l->lh_nentries--;
558 }
559 
560 int
561 zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
562     uint8_t integer_size, uint64_t num_integers, const void *buf,
563     zap_entry_handle_t *zeh)
564 {
565 	uint16_t chunk;
566 	uint16_t *chunkp;
567 	struct zap_leaf_entry *le;
568 	uint64_t namelen, valuelen;
569 	int numchunks;
570 
571 	valuelen = integer_size * num_integers;
572 	namelen = strlen(name) + 1;
573 	ASSERT(namelen >= 2);
574 
575 	zeh->zeh_head_leaf = l;
576 
577 	if (namelen > MAXNAMELEN)
578 		return (ENAMETOOLONG);
579 	/* find the first leaf in the chain that has sufficient free space */
580 	numchunks = 1 + NCHUNKS(namelen) + NCHUNKS(valuelen);
581 	if (numchunks > ZAP_LEAF_NUMCHUNKS)
582 		return (E2BIG);
583 
584 	if (cd == ZAP_MAXCD) {
585 		for (cd = 0; cd < ZAP_MAXCD; cd++) {
586 			zap_leaf_t *ll;
587 			for (ll = l; ll; ll = ll->l_next) {
588 				for (chunk = *LEAF_HASH_ENTPTR(ll, h);
589 				    chunk != CHAIN_END; chunk = le->le_next) {
590 					le = &ll->l_phys->l_chunk
591 					    [chunk].l_entry;
592 					if (le->le_hash == h &&
593 					    le->le_cd == cd) {
594 						break;
595 					}
596 				}
597 				/*
598 				 * if this cd is in use, no need to
599 				 * check more chained leafs
600 				 */
601 				if (chunk != CHAIN_END)
602 					break;
603 			}
604 			/* If this cd is not in use, we are good. */
605 			if (chunk == CHAIN_END)
606 				break;
607 		}
608 		/* If we tried all the cd's, we lose. */
609 		if (cd == ZAP_MAXCD)
610 			return (ENOSPC);
611 	}
612 
613 	for (; l; l = l->l_next)
614 		if (l->lh_nfree >= numchunks)
615 			break;
616 	if (l == NULL)
617 		return (EAGAIN);
618 
619 	zeh->zeh_found_leaf = l;
620 
621 	/* make the entry */
622 	chunk = zap_leaf_chunk_alloc(l);
623 	le = &l->l_phys->l_chunk[chunk].l_entry;
624 	le->le_type = ZAP_LEAF_ENTRY;
625 	le->le_name_chunk = zap_leaf_array_create(zeh, name, 1, namelen);
626 	le->le_name_length = namelen;
627 	le->le_value_chunk =
628 	    zap_leaf_array_create(zeh, buf, integer_size, num_integers);
629 	le->le_value_length = (num_integers*integer_size > MAX_ARRAY_BYTES) ?
630 	    (MAX_ARRAY_BYTES + 1) : (num_integers);
631 	le->le_int_size = integer_size;
632 	le->le_hash = h;
633 	le->le_cd = cd;
634 
635 	/* link it into the hash chain */
636 	chunkp = LEAF_HASH_ENTPTR(l, h);
637 	le->le_next = *chunkp;
638 	*chunkp = chunk;
639 
640 	l->lh_nentries++;
641 
642 	zeh->zeh_num_integers = num_integers;
643 	zeh->zeh_integer_size = le->le_int_size;
644 	zeh->zeh_cd = le->le_cd;
645 	zeh->zeh_hash = le->le_hash;
646 	zeh->zeh_chunkp = chunkp;
647 
648 	return (0);
649 }
650 
651 /*
652  * Routines for transferring entries between leafs.
653  */
654 
655 static void
656 zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
657 {
658 	struct zap_leaf_entry *le = &l->l_phys->l_chunk[entry].l_entry;
659 	uint16_t *ptr = LEAF_HASH_ENTPTR(l, le->le_hash);
660 	le->le_next = *ptr;
661 	*ptr = entry;
662 }
663 
664 static void
665 zap_leaf_rehash_entries(zap_leaf_t *l)
666 {
667 	int i;
668 
669 	if (l->lh_nentries == 0)
670 		return;
671 
672 	/* break existing hash chains */
673 	zap_memset(l->l_phys->l_hash, CHAIN_END, sizeof (l->l_phys->l_hash));
674 
675 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
676 		struct zap_leaf_entry *le = &l->l_phys->l_chunk[i].l_entry;
677 		if (le->le_type != ZAP_LEAF_ENTRY)
678 			continue;
679 		zap_leaf_rehash_entry(l, i);
680 	}
681 }
682 
683 static uint16_t
684 zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
685 {
686 	uint16_t new_chunk;
687 	uint16_t *nchunkp = &new_chunk;
688 
689 	while (chunk != CHAIN_END) {
690 		uint16_t nchunk = zap_leaf_chunk_alloc(nl);
691 		struct zap_leaf_array *nla =
692 		    &nl->l_phys->l_chunk[nchunk].l_array;
693 		struct zap_leaf_array *la =
694 		    &l->l_phys->l_chunk[chunk].l_array;
695 		int nextchunk = la->la_next;
696 
697 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
698 		ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS);
699 
700 		*nla = *la;
701 
702 		zap_leaf_chunk_free(l, chunk);
703 		chunk = nextchunk;
704 		*nchunkp = nchunk;
705 		nchunkp = &nla->la_next;
706 	}
707 	*nchunkp = CHAIN_END;
708 	return (new_chunk);
709 }
710 
711 static void
712 zap_leaf_transfer_entry(zap_t *zap, zap_leaf_t *l, int entry, zap_leaf_t *nhl,
713     dmu_tx_t *tx)
714 {
715 	zap_leaf_t *nl;
716 	struct zap_leaf_entry *le, *nle;
717 	uint16_t chunk, nchunks;
718 
719 	le = &l->l_phys->l_chunk[entry].l_entry;
720 	ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
721 
722 	/* find a leaf in the destination leaf chain with enough free space */
723 	nchunks = 1 + NCHUNKS(le->le_name_length) +
724 	    NCHUNKS(le->le_value_length * le->le_int_size);
725 	for (nl = nhl; nl; nl = nl->l_next)
726 		if (nl->lh_nfree >= nchunks)
727 			break;
728 	if (nl == NULL) {
729 		nl = zap_leaf_chainmore(nhl, zap_create_leaf(zap, tx));
730 		dprintf("transfer_entry: chaining leaf %x/%d\n",
731 		    nl->lh_prefix, nl->lh_prefix_len);
732 	}
733 
734 	chunk = zap_leaf_chunk_alloc(nl);
735 	nle = &nl->l_phys->l_chunk[chunk].l_entry;
736 	*nle = *le;
737 
738 	zap_leaf_rehash_entry(nl, chunk);
739 
740 	nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
741 	nle->le_value_chunk =
742 	    zap_leaf_transfer_array(l, le->le_value_chunk, nl);
743 
744 	zap_leaf_chunk_free(l, entry);
745 
746 	l->lh_nentries--;
747 	nl->lh_nentries++;
748 }
749 
750 /*
751  * Transfer entries whose hash bit 'bit' is 1 to nl1, and 0 to nl0.
752  * Ignore leaf chaining in source (l), but chain in destinations.
753  * We'll re-chain all the entries in l as we go along.
754  */
755 static void
756 zap_leaf_transfer_entries(zap_t *zap, zap_leaf_t *l,
757     zap_leaf_t *nl0, zap_leaf_t *nl1, int bit, dmu_tx_t *tx)
758 {
759 	int i;
760 
761 	ASSERT(bit < 64 && bit >= 0);
762 	/* break existing hash chains */
763 	zap_memset(l->l_phys->l_hash, CHAIN_END, sizeof (l->l_phys->l_hash));
764 
765 	if (nl0 != l)
766 		zap_leaf_rehash_entries(nl0);
767 	if (nl1 != nl0)
768 		zap_leaf_rehash_entries(nl1);
769 
770 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
771 		struct zap_leaf_entry *le = &l->l_phys->l_chunk[i].l_entry;
772 		if (le->le_type != ZAP_LEAF_ENTRY)
773 			continue;
774 
775 		/*
776 		 * We could find entries via hashtable instead. That
777 		 * would be O(hashents+numents) rather than
778 		 * O(numblks+numents), but this accesses memory more
779 		 * sequentially, and when we're called, the block is
780 		 * usually pretty full.
781 		 */
782 
783 		if (le->le_hash & (1ULL << bit)) {
784 			zap_leaf_transfer_entry(zap, l, i, nl1, tx);
785 		} else {
786 			if (nl0 == l)
787 				zap_leaf_rehash_entry(l, i);
788 			else
789 				zap_leaf_transfer_entry(zap, l, i, nl0, tx);
790 		}
791 	}
792 
793 }
794 
795 /*
796  * nl will contain the entries whose hash prefix ends in 1
797  * handles leaf chaining
798  */
799 zap_leaf_t *
800 zap_leaf_split(zap_t *zap, zap_leaf_t *hl, dmu_tx_t *tx)
801 {
802 	zap_leaf_t *l = hl;
803 	int bit = 64 - 1 - hl->lh_prefix_len;
804 	zap_leaf_t *nl = zap_create_leaf(zap, tx);
805 
806 	/* set new prefix and prefix_len */
807 	hl->lh_prefix <<= 1;
808 	hl->lh_prefix_len++;
809 	nl->lh_prefix = hl->lh_prefix | 1;
810 	nl->lh_prefix_len = hl->lh_prefix_len;
811 
812 	/* transfer odd entries from first leaf in hl chain to nl */
813 	zap_leaf_transfer_entries(zap, hl, hl, nl, bit, tx);
814 
815 	/* take rest of chain off hl */
816 	l = hl->l_next;
817 	hl->l_next = NULL;
818 	hl->lh_next = 0;
819 
820 	/* transfer even entries from hl chain back to hl, odd entries to nl */
821 	while (l) {
822 		zap_leaf_t *next = l->l_next;
823 		zap_leaf_transfer_entries(zap, l, hl, nl, bit, tx);
824 		zap_destroy_leaf(zap, l, tx);
825 		l = next;
826 	}
827 
828 	return (nl);
829 }
830 
831 void
832 zap_stats_leaf(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
833 {
834 	int n, nchained = 0;
835 
836 	n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - l->lh_prefix_len;
837 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
838 	zs->zs_leafs_with_2n_pointers[n]++;
839 
840 	do {
841 		int i;
842 
843 		n = l->lh_nentries/5;
844 		n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
845 		zs->zs_blocks_with_n5_entries[n]++;
846 
847 		n = ((1<<ZAP_BLOCK_SHIFT) -
848 		    l->lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
849 		    (1<<ZAP_BLOCK_SHIFT);
850 		n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
851 		zs->zs_blocks_n_tenths_full[n]++;
852 
853 		for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES; i++) {
854 			int nentries = 0;
855 			int chunk = l->l_phys->l_hash[i];
856 
857 			while (chunk != CHAIN_END) {
858 				struct zap_leaf_entry *le =
859 				    &l->l_phys->l_chunk[chunk].l_entry;
860 
861 				n = 1 + NCHUNKS(le->le_name_length) +
862 				    NCHUNKS(le->le_value_length *
863 					le->le_int_size);
864 				n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
865 				zs->zs_entries_using_n_chunks[n]++;
866 
867 				chunk = le->le_next;
868 				nentries++;
869 			}
870 
871 			n = nentries;
872 			n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
873 			zs->zs_buckets_with_n_entries[n]++;
874 		}
875 
876 		nchained++;
877 		l = l->l_next;
878 	} while (l);
879 
880 	n = nchained-1;
881 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
882 	zs->zs_leafs_with_n_chained[n]++;
883 }
884