xref: /titanic_52/usr/src/uts/common/fs/zfs/zap_leaf.c (revision 14d56903b5e712ea59206c206e57d3731107fcf1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * The 512-byte leaf is broken into 32 16-byte chunks.
30  * chunk number n means l_chunk[n], even though the header precedes it.
31  * the names are stored null-terminated.
32  */
33 
34 #include <sys/zfs_context.h>
35 #include <sys/zap.h>
36 #include <sys/zap_impl.h>
37 #include <sys/zap_leaf.h>
38 #include <sys/spa.h>
39 #include <sys/dmu.h>
40 
41 #define	CHAIN_END 0xffff /* end of the chunk chain */
42 
43 /* half the (current) minimum block size */
44 #define	MAX_ARRAY_BYTES (8<<10)
45 
46 #define	NCHUNKS(bytes) (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES)
47 
48 /*
49  * XXX This will >> by a negative number when
50  * lh_prefix_len > 64-ZAP_LEAF_HASH_SHIFT.
51  */
52 #define	LEAF_HASH(l, h) \
53 	((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
54 		((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->lh_prefix_len)))
55 
56 #define	LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
57 
58 /* #define	MEMCHECK */
59 
60 
61 static void
62 zap_memset(void *a, int c, size_t n)
63 {
64 	char *cp = a;
65 	char *cpend = cp + n;
66 
67 	while (cp < cpend)
68 		*cp++ = c;
69 }
70 
71 static void
72 stv(int len, void *addr, uint64_t value)
73 {
74 	switch (len) {
75 	case 1:
76 		*(uint8_t *)addr = value;
77 		return;
78 	case 2:
79 		*(uint16_t *)addr = value;
80 		return;
81 	case 4:
82 		*(uint32_t *)addr = value;
83 		return;
84 	case 8:
85 		*(uint64_t *)addr = value;
86 		return;
87 	}
88 	ASSERT(!"bad int len");
89 }
90 
91 static uint64_t
92 ldv(int len, const void *addr)
93 {
94 	switch (len) {
95 	case 1:
96 		return (*(uint8_t *)addr);
97 	case 2:
98 		return (*(uint16_t *)addr);
99 	case 4:
100 		return (*(uint32_t *)addr);
101 	case 8:
102 		return (*(uint64_t *)addr);
103 	}
104 	ASSERT(!"bad int len");
105 	return (0xFEEDFACEDEADBEEF);
106 }
107 
108 void
109 zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
110 {
111 	int i;
112 	zap_leaf_t l;
113 	l.l_bs = highbit(size)-1;
114 	l.l_phys = buf;
115 
116 	buf->l_hdr.lhr_block_type = 	BSWAP_64(buf->l_hdr.lhr_block_type);
117 	buf->l_hdr.lhr_next = 		BSWAP_64(buf->l_hdr.lhr_next);
118 	buf->l_hdr.lhr_prefix = 	BSWAP_64(buf->l_hdr.lhr_prefix);
119 	buf->l_hdr.lhr_magic = 		BSWAP_32(buf->l_hdr.lhr_magic);
120 	buf->l_hdr.lhr_nfree = 		BSWAP_16(buf->l_hdr.lhr_nfree);
121 	buf->l_hdr.lhr_nentries = 	BSWAP_16(buf->l_hdr.lhr_nentries);
122 	buf->l_hdr.lhr_prefix_len = 	BSWAP_16(buf->l_hdr.lhr_prefix_len);
123 	buf->l_hdr.lh_freelist = 	BSWAP_16(buf->l_hdr.lh_freelist);
124 
125 	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
126 		buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
127 
128 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
129 		zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
130 		struct zap_leaf_entry *le;
131 
132 		switch (lc->l_free.lf_type) {
133 		case ZAP_CHUNK_ENTRY:
134 			le = &lc->l_entry;
135 
136 			le->le_type = BSWAP_8(le->le_type);
137 			le->le_int_size = BSWAP_8(le->le_int_size);
138 			le->le_next = BSWAP_16(le->le_next);
139 			le->le_name_chunk = BSWAP_16(le->le_name_chunk);
140 			le->le_name_length = BSWAP_16(le->le_name_length);
141 			le->le_value_chunk = BSWAP_16(le->le_value_chunk);
142 			le->le_value_length = BSWAP_16(le->le_value_length);
143 			le->le_cd = BSWAP_32(le->le_cd);
144 			le->le_hash = BSWAP_64(le->le_hash);
145 			break;
146 		case ZAP_CHUNK_FREE:
147 			lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type);
148 			lc->l_free.lf_next = BSWAP_16(lc->l_free.lf_next);
149 			break;
150 		case ZAP_CHUNK_ARRAY:
151 			lc->l_array.la_type = BSWAP_8(lc->l_array.la_type);
152 			lc->l_array.la_next = BSWAP_16(lc->l_array.la_next);
153 			/* la_array doesn't need swapping */
154 			break;
155 		default:
156 			ASSERT(!"bad leaf type");
157 		}
158 	}
159 }
160 
161 void
162 zap_leaf_init(zap_leaf_t *l)
163 {
164 	int i;
165 
166 	l->l_bs = highbit(l->l_dbuf->db_size)-1;
167 	zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header));
168 	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
169 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
170 		ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
171 		ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
172 	}
173 	ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
174 	l->lh_block_type = ZBT_LEAF;
175 	l->lh_magic = ZAP_LEAF_MAGIC;
176 	l->lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
177 }
178 
179 zap_leaf_t *
180 zap_leaf_chainmore(zap_leaf_t *l, zap_leaf_t *nl)
181 {
182 	nl->lh_prefix = l->lh_prefix;
183 	nl->lh_prefix_len = l->lh_prefix_len;
184 	nl->l_next = l->l_next;
185 	l->l_next = nl;
186 	nl->lh_next = l->lh_next;
187 	l->lh_next = nl->l_blkid;
188 	return (nl);
189 }
190 
191 /*
192  * Routines which manipulate leaf chunks (l_chunk[]).
193  */
194 
195 static uint16_t
196 zap_leaf_chunk_alloc(zap_leaf_t *l)
197 {
198 	int chunk;
199 
200 	ASSERT(l->lh_nfree > 0);
201 
202 	chunk = l->l_phys->l_hdr.lh_freelist;
203 	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
204 	ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
205 
206 	l->l_phys->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
207 
208 #ifdef MEMCHECK
209 	zap_memset(&ZAP_LEAF_CHUNK(l, chunk), 0xa1, sizeof (zap_leaf_chunk_t));
210 #endif
211 
212 	l->lh_nfree--;
213 
214 	return (chunk);
215 }
216 
217 static void
218 zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
219 {
220 	struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
221 	ASSERT3U(l->lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
222 	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
223 	ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);
224 
225 #ifdef MEMCHECK
226 	zap_memset(zlf, 0xf4, sizeof (zap_leaf_chunk_t));
227 #endif
228 
229 	zlf->lf_type = ZAP_CHUNK_FREE;
230 	zlf->lf_next = l->l_phys->l_hdr.lh_freelist;
231 	bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
232 	l->l_phys->l_hdr.lh_freelist = chunk;
233 
234 	l->lh_nfree++;
235 }
236 
237 
238 /*
239  * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
240  */
241 
242 static uint16_t
243 zap_leaf_array_create(const zap_entry_handle_t *zeh, const char *buf,
244 	int integer_size, int num_integers)
245 {
246 	uint16_t chunk_head;
247 	uint16_t *chunkp = &chunk_head;
248 	int byten = 0;
249 	uint64_t value;
250 	int shift = (integer_size-1)*8;
251 	int len = num_integers;
252 	zap_leaf_t *l = zeh->zeh_found_leaf;
253 
254 	ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);
255 
256 	while (len > 0) {
257 		uint16_t chunk = zap_leaf_chunk_alloc(l);
258 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
259 		int i;
260 
261 		la->la_type = ZAP_CHUNK_ARRAY;
262 		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
263 			if (byten == 0)
264 				value = ldv(integer_size, buf);
265 			la->la_array[i] = (value & (0xff << shift)) >> shift;
266 			value <<= 8;
267 			if (++byten == integer_size) {
268 				byten = 0;
269 				buf += integer_size;
270 				if (--len == 0)
271 					break;
272 			}
273 		}
274 
275 		*chunkp = chunk;
276 		chunkp = &la->la_next;
277 	}
278 	*chunkp = CHAIN_END;
279 
280 	return (chunk_head);
281 }
282 
283 static void
284 zap_leaf_array_free(zap_entry_handle_t *zeh, uint16_t *chunkp)
285 {
286 	uint16_t chunk = *chunkp;
287 	zap_leaf_t *l = zeh->zeh_found_leaf;
288 
289 	*chunkp = CHAIN_END;
290 
291 	while (chunk != CHAIN_END) {
292 		int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
293 		ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==,
294 		    ZAP_CHUNK_ARRAY);
295 		zap_leaf_chunk_free(l, chunk);
296 		chunk = nextchunk;
297 	}
298 }
299 
300 /* array_len and buf_len are in integers, not bytes */
301 static void
302 zap_leaf_array_read(const zap_entry_handle_t *zeh, uint16_t chunk,
303     int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
304     char *buf)
305 {
306 	int len = MIN(array_len, buf_len);
307 	int byten = 0;
308 	uint64_t value = 0;
309 	zap_leaf_t *l = zeh->zeh_found_leaf;
310 
311 	ASSERT3U(array_int_len, <=, buf_int_len);
312 
313 	/* Fast path for one 8-byte integer */
314 	if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
315 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
316 		uint8_t *ip = la->la_array;
317 		uint64_t *buf64 = (uint64_t *)buf;
318 
319 		*buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
320 		    (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
321 		    (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
322 		    (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
323 		return;
324 	}
325 
326 	/* Fast path for an array of 1-byte integers (eg. the entry name) */
327 	if (array_int_len == 1 && buf_int_len == 1 &&
328 	    buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
329 		while (chunk != CHAIN_END) {
330 			struct zap_leaf_array *la =
331 			    &ZAP_LEAF_CHUNK(l, chunk).l_array;
332 			bcopy(la->la_array, buf, ZAP_LEAF_ARRAY_BYTES);
333 			buf += ZAP_LEAF_ARRAY_BYTES;
334 			chunk = la->la_next;
335 		}
336 		return;
337 	}
338 
339 	while (len > 0) {
340 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
341 		int i;
342 
343 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
344 		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
345 			value = (value << 8) | la->la_array[i];
346 			byten++;
347 			if (byten == array_int_len) {
348 				stv(buf_int_len, buf, value);
349 				byten = 0;
350 				len--;
351 				if (len == 0)
352 					return;
353 				buf += buf_int_len;
354 			}
355 		}
356 		chunk = la->la_next;
357 	}
358 }
359 
360 /*
361  * Only to be used on 8-bit arrays.
362  * array_len is actual len in bytes (not encoded le_value_length).
363  * buf is null-terminated.
364  */
365 static int
366 zap_leaf_array_equal(const zap_entry_handle_t *zeh, int chunk,
367     int array_len, const char *buf)
368 {
369 	int bseen = 0;
370 	zap_leaf_t *l = zeh->zeh_found_leaf;
371 
372 	while (bseen < array_len) {
373 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
374 		int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
375 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
376 		if (bcmp(la->la_array, buf + bseen, toread))
377 			break;
378 		chunk = la->la_next;
379 		bseen += toread;
380 	}
381 	return (bseen == array_len);
382 }
383 
384 /*
385  * Routines which manipulate leaf entries.
386  */
387 
388 int
389 zap_leaf_lookup(zap_leaf_t *l,
390     const char *name, uint64_t h, zap_entry_handle_t *zeh)
391 {
392 	uint16_t *chunkp;
393 	struct zap_leaf_entry *le;
394 
395 	zeh->zeh_head_leaf = l;
396 
397 again:
398 	ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC);
399 
400 	for (chunkp = LEAF_HASH_ENTPTR(l, h);
401 	    *chunkp != CHAIN_END; chunkp = &le->le_next) {
402 		uint16_t chunk = *chunkp;
403 		le = ZAP_LEAF_ENTRY(l, chunk);
404 
405 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
406 		ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
407 
408 		if (le->le_hash != h)
409 			continue;
410 
411 		zeh->zeh_found_leaf = l;
412 		if (zap_leaf_array_equal(zeh, le->le_name_chunk,
413 		    le->le_name_length, name)) {
414 			zeh->zeh_num_integers = le->le_value_length;
415 			zeh->zeh_integer_size = le->le_int_size;
416 			zeh->zeh_cd = le->le_cd;
417 			zeh->zeh_hash = le->le_hash;
418 			zeh->zeh_chunkp = chunkp;
419 			zeh->zeh_found_leaf = l;
420 			return (0);
421 		}
422 	}
423 
424 	if (l->l_next) {
425 		l = l->l_next;
426 		goto again;
427 	}
428 
429 	return (ENOENT);
430 }
431 
432 /* Return (h1,cd1 >= h2,cd2) */
433 #define	HCD_GTEQ(h1, cd1, h2, cd2) \
434 	((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE))
435 
436 int
437 zap_leaf_lookup_closest(zap_leaf_t *l,
438     uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
439 {
440 	uint16_t chunk;
441 	uint64_t besth = -1ULL;
442 	uint32_t bestcd = ZAP_MAXCD;
443 	uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
444 	uint16_t lh;
445 	struct zap_leaf_entry *le;
446 
447 	zeh->zeh_head_leaf = l;
448 
449 again:
450 	ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC);
451 
452 	for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
453 		for (chunk = l->l_phys->l_hash[lh];
454 		    chunk != CHAIN_END; chunk = le->le_next) {
455 			le = ZAP_LEAF_ENTRY(l, chunk);
456 
457 			ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
458 			ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
459 
460 			if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
461 			    HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
462 				ASSERT3U(bestlh, >=, lh);
463 				bestlh = lh;
464 				besth = le->le_hash;
465 				bestcd = le->le_cd;
466 
467 				zeh->zeh_num_integers = le->le_value_length;
468 				zeh->zeh_integer_size = le->le_int_size;
469 				zeh->zeh_cd = le->le_cd;
470 				zeh->zeh_hash = le->le_hash;
471 				zeh->zeh_fakechunk = chunk;
472 				zeh->zeh_chunkp = &zeh->zeh_fakechunk;
473 				zeh->zeh_found_leaf = l;
474 			}
475 		}
476 	}
477 
478 	if (l->l_next) {
479 		l = l->l_next;
480 		goto again;
481 	}
482 
483 	return (bestcd == ZAP_MAXCD ? ENOENT : 0);
484 }
485 
486 int
487 zap_entry_read(const zap_entry_handle_t *zeh,
488     uint8_t integer_size, uint64_t num_integers, void *buf)
489 {
490 	struct zap_leaf_entry *le =
491 	    ZAP_LEAF_ENTRY(zeh->zeh_found_leaf, *zeh->zeh_chunkp);
492 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
493 
494 	if (le->le_int_size > integer_size)
495 		return (EINVAL);
496 
497 	zap_leaf_array_read(zeh, le->le_value_chunk, le->le_int_size,
498 	    le->le_value_length, integer_size, num_integers, buf);
499 
500 	if (zeh->zeh_num_integers > num_integers)
501 		return (EOVERFLOW);
502 	return (0);
503 
504 }
505 
506 int
507 zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf)
508 {
509 	struct zap_leaf_entry *le =
510 	    ZAP_LEAF_ENTRY(zeh->zeh_found_leaf, *zeh->zeh_chunkp);
511 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
512 
513 	zap_leaf_array_read(zeh, le->le_name_chunk, 1,
514 	    le->le_name_length, 1, buflen, buf);
515 	if (le->le_name_length > buflen)
516 		return (EOVERFLOW);
517 	return (0);
518 }
519 
520 int
521 zap_entry_update(zap_entry_handle_t *zeh,
522 	uint8_t integer_size, uint64_t num_integers, const void *buf)
523 {
524 	int delta_chunks;
525 	struct zap_leaf_entry *le =
526 	    ZAP_LEAF_ENTRY(zeh->zeh_found_leaf, *zeh->zeh_chunkp);
527 
528 	delta_chunks = NCHUNKS(num_integers * integer_size) -
529 	    NCHUNKS(le->le_value_length * le->le_int_size);
530 
531 	if (zeh->zeh_found_leaf->lh_nfree < delta_chunks)
532 		return (EAGAIN);
533 
534 	/*
535 	 * We should search other chained leaves (via
536 	 * zap_entry_remove,create?) otherwise returning EAGAIN will
537 	 * just send us into an infinite loop if we have to chain
538 	 * another leaf block, rather than being able to split this
539 	 * block.
540 	 */
541 
542 	zap_leaf_array_free(zeh, &le->le_value_chunk);
543 	le->le_value_chunk =
544 	    zap_leaf_array_create(zeh, buf, integer_size, num_integers);
545 	le->le_value_length = num_integers;
546 	le->le_int_size = integer_size;
547 	return (0);
548 }
549 
550 void
551 zap_entry_remove(zap_entry_handle_t *zeh)
552 {
553 	uint16_t entry_chunk;
554 	struct zap_leaf_entry *le;
555 	zap_leaf_t *l = zeh->zeh_found_leaf;
556 
557 	ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
558 
559 	entry_chunk = *zeh->zeh_chunkp;
560 	le = ZAP_LEAF_ENTRY(l, entry_chunk);
561 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
562 
563 	zap_leaf_array_free(zeh, &le->le_name_chunk);
564 	zap_leaf_array_free(zeh, &le->le_value_chunk);
565 
566 	*zeh->zeh_chunkp = le->le_next;
567 	zap_leaf_chunk_free(l, entry_chunk);
568 
569 	l->lh_nentries--;
570 }
571 
572 int
573 zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
574     uint8_t integer_size, uint64_t num_integers, const void *buf,
575     zap_entry_handle_t *zeh)
576 {
577 	uint16_t chunk;
578 	uint16_t *chunkp;
579 	struct zap_leaf_entry *le;
580 	uint64_t namelen, valuelen;
581 	int numchunks;
582 
583 	valuelen = integer_size * num_integers;
584 	namelen = strlen(name) + 1;
585 	ASSERT(namelen >= 2);
586 
587 	zeh->zeh_head_leaf = l;
588 
589 	if (namelen > MAXNAMELEN)
590 		return (ENAMETOOLONG);
591 	/* find the first leaf in the chain that has sufficient free space */
592 	numchunks = 1 + NCHUNKS(namelen) + NCHUNKS(valuelen);
593 	if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
594 		return (E2BIG);
595 
596 	if (cd == ZAP_MAXCD) {
597 		for (cd = 0; cd < ZAP_MAXCD; cd++) {
598 			zap_leaf_t *ll;
599 			for (ll = l; ll; ll = ll->l_next) {
600 				for (chunk = *LEAF_HASH_ENTPTR(ll, h);
601 				    chunk != CHAIN_END; chunk = le->le_next) {
602 					le = ZAP_LEAF_ENTRY(ll, chunk);
603 					if (le->le_hash == h &&
604 					    le->le_cd == cd) {
605 						break;
606 					}
607 				}
608 				/*
609 				 * if this cd is in use, no need to
610 				 * check more chained leafs
611 				 */
612 				if (chunk != CHAIN_END)
613 					break;
614 			}
615 			/* If this cd is not in use, we are good. */
616 			if (chunk == CHAIN_END)
617 				break;
618 		}
619 		/* If we tried all the cd's, we lose. */
620 		if (cd == ZAP_MAXCD)
621 			return (ENOSPC);
622 	}
623 
624 	for (; l; l = l->l_next)
625 		if (l->lh_nfree >= numchunks)
626 			break;
627 	if (l == NULL)
628 		return (EAGAIN);
629 
630 	zeh->zeh_found_leaf = l;
631 
632 	/* make the entry */
633 	chunk = zap_leaf_chunk_alloc(l);
634 	le = ZAP_LEAF_ENTRY(l, chunk);
635 	le->le_type = ZAP_CHUNK_ENTRY;
636 	le->le_name_chunk = zap_leaf_array_create(zeh, name, 1, namelen);
637 	le->le_name_length = namelen;
638 	le->le_value_chunk =
639 	    zap_leaf_array_create(zeh, buf, integer_size, num_integers);
640 	le->le_value_length = num_integers;
641 	le->le_int_size = integer_size;
642 	le->le_hash = h;
643 	le->le_cd = cd;
644 
645 	/* link it into the hash chain */
646 	chunkp = LEAF_HASH_ENTPTR(l, h);
647 	le->le_next = *chunkp;
648 	*chunkp = chunk;
649 
650 	l->lh_nentries++;
651 
652 	zeh->zeh_num_integers = num_integers;
653 	zeh->zeh_integer_size = le->le_int_size;
654 	zeh->zeh_cd = le->le_cd;
655 	zeh->zeh_hash = le->le_hash;
656 	zeh->zeh_chunkp = chunkp;
657 
658 	return (0);
659 }
660 
661 /*
662  * Routines for transferring entries between leafs.
663  */
664 
665 static void
666 zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
667 {
668 	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
669 	uint16_t *ptr = LEAF_HASH_ENTPTR(l, le->le_hash);
670 	le->le_next = *ptr;
671 	*ptr = entry;
672 }
673 
674 static void
675 zap_leaf_rehash_entries(zap_leaf_t *l)
676 {
677 	int i;
678 
679 	if (l->lh_nentries == 0)
680 		return;
681 
682 	/* break existing hash chains */
683 	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
684 
685 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
686 		struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
687 		if (le->le_type != ZAP_CHUNK_ENTRY)
688 			continue;
689 		zap_leaf_rehash_entry(l, i);
690 	}
691 }
692 
693 static uint16_t
694 zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
695 {
696 	uint16_t new_chunk;
697 	uint16_t *nchunkp = &new_chunk;
698 
699 	while (chunk != CHAIN_END) {
700 		uint16_t nchunk = zap_leaf_chunk_alloc(nl);
701 		struct zap_leaf_array *nla =
702 		    &ZAP_LEAF_CHUNK(nl, nchunk).l_array;
703 		struct zap_leaf_array *la =
704 		    &ZAP_LEAF_CHUNK(l, chunk).l_array;
705 		int nextchunk = la->la_next;
706 
707 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
708 		ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));
709 
710 		*nla = *la;
711 
712 		zap_leaf_chunk_free(l, chunk);
713 		chunk = nextchunk;
714 		*nchunkp = nchunk;
715 		nchunkp = &nla->la_next;
716 	}
717 	*nchunkp = CHAIN_END;
718 	return (new_chunk);
719 }
720 
721 static void
722 zap_leaf_transfer_entry(zap_t *zap, zap_leaf_t *l, int entry, zap_leaf_t *nhl,
723     dmu_tx_t *tx)
724 {
725 	zap_leaf_t *nl;
726 	struct zap_leaf_entry *le, *nle;
727 	uint16_t chunk, nchunks;
728 
729 	le = ZAP_LEAF_ENTRY(l, entry);
730 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
731 
732 	/* find a leaf in the destination leaf chain with enough free space */
733 	nchunks = 1 + NCHUNKS(le->le_name_length) +
734 	    NCHUNKS(le->le_value_length * le->le_int_size);
735 	for (nl = nhl; nl; nl = nl->l_next)
736 		if (nl->lh_nfree >= nchunks)
737 			break;
738 	if (nl == NULL) {
739 		nl = zap_leaf_chainmore(nhl, zap_create_leaf(zap, tx));
740 		dprintf("transfer_entry: chaining leaf %x/%d\n",
741 		    nl->lh_prefix, nl->lh_prefix_len);
742 	}
743 
744 	chunk = zap_leaf_chunk_alloc(nl);
745 	nle = ZAP_LEAF_ENTRY(nl, chunk);
746 	*nle = *le;
747 
748 	zap_leaf_rehash_entry(nl, chunk);
749 
750 	nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
751 	nle->le_value_chunk =
752 	    zap_leaf_transfer_array(l, le->le_value_chunk, nl);
753 
754 	zap_leaf_chunk_free(l, entry);
755 
756 	l->lh_nentries--;
757 	nl->lh_nentries++;
758 }
759 
760 /*
761  * Transfer entries whose hash bit 'bit' is 1 to nl1, and 0 to nl0.
762  * Ignore leaf chaining in source (l), but chain in destinations.
763  * We'll re-chain all the entries in l as we go along.
764  */
765 static void
766 zap_leaf_transfer_entries(zap_t *zap, zap_leaf_t *l,
767     zap_leaf_t *nl0, zap_leaf_t *nl1, int bit, dmu_tx_t *tx)
768 {
769 	int i;
770 
771 	ASSERT(bit < 64 && bit >= 0);
772 	/* break existing hash chains */
773 	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
774 
775 	if (nl0 != l)
776 		zap_leaf_rehash_entries(nl0);
777 	if (nl1 != nl0)
778 		zap_leaf_rehash_entries(nl1);
779 
780 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
781 		struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
782 		if (le->le_type != ZAP_CHUNK_ENTRY)
783 			continue;
784 
785 		/*
786 		 * We could find entries via hashtable instead. That
787 		 * would be O(hashents+numents) rather than
788 		 * O(numblks+numents), but this accesses memory more
789 		 * sequentially, and when we're called, the block is
790 		 * usually pretty full.
791 		 */
792 
793 		if (le->le_hash & (1ULL << bit)) {
794 			zap_leaf_transfer_entry(zap, l, i, nl1, tx);
795 		} else {
796 			if (nl0 == l)
797 				zap_leaf_rehash_entry(l, i);
798 			else
799 				zap_leaf_transfer_entry(zap, l, i, nl0, tx);
800 		}
801 	}
802 
803 }
804 
805 /*
806  * nl will contain the entries whose hash prefix ends in 1
807  * handles leaf chaining
808  */
809 zap_leaf_t *
810 zap_leaf_split(zap_t *zap, zap_leaf_t *hl, dmu_tx_t *tx)
811 {
812 	zap_leaf_t *l = hl;
813 	int bit = 64 - 1 - hl->lh_prefix_len;
814 	zap_leaf_t *nl = zap_create_leaf(zap, tx);
815 
816 	/* set new prefix and prefix_len */
817 	hl->lh_prefix <<= 1;
818 	hl->lh_prefix_len++;
819 	nl->lh_prefix = hl->lh_prefix | 1;
820 	nl->lh_prefix_len = hl->lh_prefix_len;
821 
822 	/* transfer odd entries from first leaf in hl chain to nl */
823 	zap_leaf_transfer_entries(zap, hl, hl, nl, bit, tx);
824 
825 	/* take rest of chain off hl */
826 	l = hl->l_next;
827 	hl->l_next = NULL;
828 	hl->lh_next = 0;
829 
830 	/* transfer even entries from hl chain back to hl, odd entries to nl */
831 	while (l) {
832 		zap_leaf_t *next = l->l_next;
833 		zap_leaf_transfer_entries(zap, l, hl, nl, bit, tx);
834 		zap_destroy_leaf(zap, l, tx);
835 		l = next;
836 	}
837 
838 	return (nl);
839 }
840 
841 void
842 zap_stats_leaf(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
843 {
844 	int n, nchained = 0;
845 
846 	n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - l->lh_prefix_len;
847 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
848 	zs->zs_leafs_with_2n_pointers[n]++;
849 
850 	do {
851 		int i;
852 
853 		n = l->lh_nentries/5;
854 		n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
855 		zs->zs_blocks_with_n5_entries[n]++;
856 
857 		n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
858 		    l->lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
859 		    (1<<FZAP_BLOCK_SHIFT(zap));
860 		n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
861 		zs->zs_blocks_n_tenths_full[n]++;
862 
863 		for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
864 			int nentries = 0;
865 			int chunk = l->l_phys->l_hash[i];
866 
867 			while (chunk != CHAIN_END) {
868 				struct zap_leaf_entry *le =
869 				    ZAP_LEAF_ENTRY(l, chunk);
870 
871 				n = 1 + NCHUNKS(le->le_name_length) +
872 				    NCHUNKS(le->le_value_length *
873 					le->le_int_size);
874 				n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
875 				zs->zs_entries_using_n_chunks[n]++;
876 
877 				chunk = le->le_next;
878 				nentries++;
879 			}
880 
881 			n = nentries;
882 			n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
883 			zs->zs_buckets_with_n_entries[n]++;
884 		}
885 
886 		nchained++;
887 		l = l->l_next;
888 	} while (l);
889 
890 	n = nchained-1;
891 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
892 	zs->zs_leafs_with_n_chained[n]++;
893 }
894