xref: /titanic_50/usr/src/uts/common/fs/zfs/zap_leaf.c (revision a65cd518c5d0f30c53594a7022eb0f7d04c98cef)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
24  */
25 
26 /*
27  * The 512-byte leaf is broken into 32 16-byte chunks.
28  * chunk number n means l_chunk[n], even though the header precedes it.
29  * the names are stored null-terminated.
30  */
31 
32 #include <sys/zio.h>
33 #include <sys/spa.h>
34 #include <sys/dmu.h>
35 #include <sys/zfs_context.h>
36 #include <sys/fs/zfs.h>
37 #include <sys/zap.h>
38 #include <sys/zap_impl.h>
39 #include <sys/zap_leaf.h>
40 #include <sys/arc.h>
41 
42 static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
43 
44 #define	CHAIN_END 0xffff /* end of the chunk chain */
45 
46 /* half the (current) minimum block size */
47 #define	MAX_ARRAY_BYTES (8<<10)
48 
49 #define	LEAF_HASH(l, h) \
50 	((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
51 	((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->l_phys->l_hdr.lh_prefix_len)))
52 
53 #define	LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
54 
55 
56 static void
57 zap_memset(void *a, int c, size_t n)
58 {
59 	char *cp = a;
60 	char *cpend = cp + n;
61 
62 	while (cp < cpend)
63 		*cp++ = c;
64 }
65 
66 static void
67 stv(int len, void *addr, uint64_t value)
68 {
69 	switch (len) {
70 	case 1:
71 		*(uint8_t *)addr = value;
72 		return;
73 	case 2:
74 		*(uint16_t *)addr = value;
75 		return;
76 	case 4:
77 		*(uint32_t *)addr = value;
78 		return;
79 	case 8:
80 		*(uint64_t *)addr = value;
81 		return;
82 	}
83 	ASSERT(!"bad int len");
84 }
85 
86 static uint64_t
87 ldv(int len, const void *addr)
88 {
89 	switch (len) {
90 	case 1:
91 		return (*(uint8_t *)addr);
92 	case 2:
93 		return (*(uint16_t *)addr);
94 	case 4:
95 		return (*(uint32_t *)addr);
96 	case 8:
97 		return (*(uint64_t *)addr);
98 	}
99 	ASSERT(!"bad int len");
100 	return (0xFEEDFACEDEADBEEFULL);
101 }
102 
103 void
104 zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
105 {
106 	int i;
107 	zap_leaf_t l;
108 	l.l_bs = highbit64(size) - 1;
109 	l.l_phys = buf;
110 
111 	buf->l_hdr.lh_block_type =	BSWAP_64(buf->l_hdr.lh_block_type);
112 	buf->l_hdr.lh_prefix =		BSWAP_64(buf->l_hdr.lh_prefix);
113 	buf->l_hdr.lh_magic =		BSWAP_32(buf->l_hdr.lh_magic);
114 	buf->l_hdr.lh_nfree =		BSWAP_16(buf->l_hdr.lh_nfree);
115 	buf->l_hdr.lh_nentries =	BSWAP_16(buf->l_hdr.lh_nentries);
116 	buf->l_hdr.lh_prefix_len =	BSWAP_16(buf->l_hdr.lh_prefix_len);
117 	buf->l_hdr.lh_freelist =	BSWAP_16(buf->l_hdr.lh_freelist);
118 
119 	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
120 		buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
121 
122 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
123 		zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
124 		struct zap_leaf_entry *le;
125 
126 		switch (lc->l_free.lf_type) {
127 		case ZAP_CHUNK_ENTRY:
128 			le = &lc->l_entry;
129 
130 			le->le_type =		BSWAP_8(le->le_type);
131 			le->le_value_intlen =	BSWAP_8(le->le_value_intlen);
132 			le->le_next =		BSWAP_16(le->le_next);
133 			le->le_name_chunk =	BSWAP_16(le->le_name_chunk);
134 			le->le_name_numints =	BSWAP_16(le->le_name_numints);
135 			le->le_value_chunk =	BSWAP_16(le->le_value_chunk);
136 			le->le_value_numints =	BSWAP_16(le->le_value_numints);
137 			le->le_cd =		BSWAP_32(le->le_cd);
138 			le->le_hash =		BSWAP_64(le->le_hash);
139 			break;
140 		case ZAP_CHUNK_FREE:
141 			lc->l_free.lf_type =	BSWAP_8(lc->l_free.lf_type);
142 			lc->l_free.lf_next =	BSWAP_16(lc->l_free.lf_next);
143 			break;
144 		case ZAP_CHUNK_ARRAY:
145 			lc->l_array.la_type =	BSWAP_8(lc->l_array.la_type);
146 			lc->l_array.la_next =	BSWAP_16(lc->l_array.la_next);
147 			/* la_array doesn't need swapping */
148 			break;
149 		default:
150 			ASSERT(!"bad leaf type");
151 		}
152 	}
153 }
154 
155 void
156 zap_leaf_init(zap_leaf_t *l, boolean_t sort)
157 {
158 	int i;
159 
160 	l->l_bs = highbit64(l->l_dbuf->db_size) - 1;
161 	zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header));
162 	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
163 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
164 		ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
165 		ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
166 	}
167 	ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
168 	l->l_phys->l_hdr.lh_block_type = ZBT_LEAF;
169 	l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
170 	l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
171 	if (sort)
172 		l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
173 }
174 
175 /*
176  * Routines which manipulate leaf chunks (l_chunk[]).
177  */
178 
179 static uint16_t
180 zap_leaf_chunk_alloc(zap_leaf_t *l)
181 {
182 	int chunk;
183 
184 	ASSERT(l->l_phys->l_hdr.lh_nfree > 0);
185 
186 	chunk = l->l_phys->l_hdr.lh_freelist;
187 	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
188 	ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
189 
190 	l->l_phys->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
191 
192 	l->l_phys->l_hdr.lh_nfree--;
193 
194 	return (chunk);
195 }
196 
197 static void
198 zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
199 {
200 	struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
201 	ASSERT3U(l->l_phys->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
202 	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
203 	ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);
204 
205 	zlf->lf_type = ZAP_CHUNK_FREE;
206 	zlf->lf_next = l->l_phys->l_hdr.lh_freelist;
207 	bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
208 	l->l_phys->l_hdr.lh_freelist = chunk;
209 
210 	l->l_phys->l_hdr.lh_nfree++;
211 }
212 
213 /*
214  * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
215  */
216 
217 static uint16_t
218 zap_leaf_array_create(zap_leaf_t *l, const char *buf,
219     int integer_size, int num_integers)
220 {
221 	uint16_t chunk_head;
222 	uint16_t *chunkp = &chunk_head;
223 	int byten = 0;
224 	uint64_t value = 0;
225 	int shift = (integer_size-1)*8;
226 	int len = num_integers;
227 
228 	ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);
229 
230 	while (len > 0) {
231 		uint16_t chunk = zap_leaf_chunk_alloc(l);
232 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
233 		int i;
234 
235 		la->la_type = ZAP_CHUNK_ARRAY;
236 		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
237 			if (byten == 0)
238 				value = ldv(integer_size, buf);
239 			la->la_array[i] = value >> shift;
240 			value <<= 8;
241 			if (++byten == integer_size) {
242 				byten = 0;
243 				buf += integer_size;
244 				if (--len == 0)
245 					break;
246 			}
247 		}
248 
249 		*chunkp = chunk;
250 		chunkp = &la->la_next;
251 	}
252 	*chunkp = CHAIN_END;
253 
254 	return (chunk_head);
255 }
256 
257 static void
258 zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
259 {
260 	uint16_t chunk = *chunkp;
261 
262 	*chunkp = CHAIN_END;
263 
264 	while (chunk != CHAIN_END) {
265 		int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
266 		ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==,
267 		    ZAP_CHUNK_ARRAY);
268 		zap_leaf_chunk_free(l, chunk);
269 		chunk = nextchunk;
270 	}
271 }
272 
273 /* array_len and buf_len are in integers, not bytes */
274 static void
275 zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
276     int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
277     void *buf)
278 {
279 	int len = MIN(array_len, buf_len);
280 	int byten = 0;
281 	uint64_t value = 0;
282 	char *p = buf;
283 
284 	ASSERT3U(array_int_len, <=, buf_int_len);
285 
286 	/* Fast path for one 8-byte integer */
287 	if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
288 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
289 		uint8_t *ip = la->la_array;
290 		uint64_t *buf64 = buf;
291 
292 		*buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
293 		    (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
294 		    (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
295 		    (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
296 		return;
297 	}
298 
299 	/* Fast path for an array of 1-byte integers (eg. the entry name) */
300 	if (array_int_len == 1 && buf_int_len == 1 &&
301 	    buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
302 		while (chunk != CHAIN_END) {
303 			struct zap_leaf_array *la =
304 			    &ZAP_LEAF_CHUNK(l, chunk).l_array;
305 			bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES);
306 			p += ZAP_LEAF_ARRAY_BYTES;
307 			chunk = la->la_next;
308 		}
309 		return;
310 	}
311 
312 	while (len > 0) {
313 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
314 		int i;
315 
316 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
317 		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
318 			value = (value << 8) | la->la_array[i];
319 			byten++;
320 			if (byten == array_int_len) {
321 				stv(buf_int_len, p, value);
322 				byten = 0;
323 				len--;
324 				if (len == 0)
325 					return;
326 				p += buf_int_len;
327 			}
328 		}
329 		chunk = la->la_next;
330 	}
331 }
332 
333 static boolean_t
334 zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
335     int chunk, int array_numints)
336 {
337 	int bseen = 0;
338 
339 	if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) {
340 		uint64_t *thiskey;
341 		boolean_t match;
342 
343 		ASSERT(zn->zn_key_intlen == sizeof (*thiskey));
344 		thiskey = kmem_alloc(array_numints * sizeof (*thiskey),
345 		    KM_SLEEP);
346 
347 		zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints,
348 		    sizeof (*thiskey), array_numints, thiskey);
349 		match = bcmp(thiskey, zn->zn_key_orig,
350 		    array_numints * sizeof (*thiskey)) == 0;
351 		kmem_free(thiskey, array_numints * sizeof (*thiskey));
352 		return (match);
353 	}
354 
355 	ASSERT(zn->zn_key_intlen == 1);
356 	if (zn->zn_matchtype == MT_FIRST) {
357 		char *thisname = kmem_alloc(array_numints, KM_SLEEP);
358 		boolean_t match;
359 
360 		zap_leaf_array_read(l, chunk, sizeof (char), array_numints,
361 		    sizeof (char), array_numints, thisname);
362 		match = zap_match(zn, thisname);
363 		kmem_free(thisname, array_numints);
364 		return (match);
365 	}
366 
367 	/*
368 	 * Fast path for exact matching.
369 	 * First check that the lengths match, so that we don't read
370 	 * past the end of the zn_key_orig array.
371 	 */
372 	if (array_numints != zn->zn_key_orig_numints)
373 		return (B_FALSE);
374 	while (bseen < array_numints) {
375 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
376 		int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES);
377 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
378 		if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread))
379 			break;
380 		chunk = la->la_next;
381 		bseen += toread;
382 	}
383 	return (bseen == array_numints);
384 }
385 
386 /*
387  * Routines which manipulate leaf entries.
388  */
389 
390 int
391 zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh)
392 {
393 	uint16_t *chunkp;
394 	struct zap_leaf_entry *le;
395 
396 	ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
397 
398 again:
399 	for (chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash);
400 	    *chunkp != CHAIN_END; chunkp = &le->le_next) {
401 		uint16_t chunk = *chunkp;
402 		le = ZAP_LEAF_ENTRY(l, chunk);
403 
404 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
405 		ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
406 
407 		if (le->le_hash != zn->zn_hash)
408 			continue;
409 
410 		/*
411 		 * NB: the entry chain is always sorted by cd on
412 		 * normalized zap objects, so this will find the
413 		 * lowest-cd match for MT_FIRST.
414 		 */
415 		ASSERT(zn->zn_matchtype == MT_EXACT ||
416 		    (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
417 		if (zap_leaf_array_match(l, zn, le->le_name_chunk,
418 		    le->le_name_numints)) {
419 			zeh->zeh_num_integers = le->le_value_numints;
420 			zeh->zeh_integer_size = le->le_value_intlen;
421 			zeh->zeh_cd = le->le_cd;
422 			zeh->zeh_hash = le->le_hash;
423 			zeh->zeh_chunkp = chunkp;
424 			zeh->zeh_leaf = l;
425 			return (0);
426 		}
427 	}
428 
429 	/*
430 	 * NB: we could of course do this in one pass, but that would be
431 	 * a pain.  We'll see if MT_BEST is even used much.
432 	 */
433 	if (zn->zn_matchtype == MT_BEST) {
434 		zn->zn_matchtype = MT_FIRST;
435 		goto again;
436 	}
437 
438 	return (SET_ERROR(ENOENT));
439 }
440 
441 /* Return (h1,cd1 >= h2,cd2) */
442 #define	HCD_GTEQ(h1, cd1, h2, cd2) \
443 	((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE))
444 
445 int
446 zap_leaf_lookup_closest(zap_leaf_t *l,
447     uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
448 {
449 	uint16_t chunk;
450 	uint64_t besth = -1ULL;
451 	uint32_t bestcd = -1U;
452 	uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
453 	uint16_t lh;
454 	struct zap_leaf_entry *le;
455 
456 	ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
457 
458 	for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
459 		for (chunk = l->l_phys->l_hash[lh];
460 		    chunk != CHAIN_END; chunk = le->le_next) {
461 			le = ZAP_LEAF_ENTRY(l, chunk);
462 
463 			ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
464 			ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
465 
466 			if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
467 			    HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
468 				ASSERT3U(bestlh, >=, lh);
469 				bestlh = lh;
470 				besth = le->le_hash;
471 				bestcd = le->le_cd;
472 
473 				zeh->zeh_num_integers = le->le_value_numints;
474 				zeh->zeh_integer_size = le->le_value_intlen;
475 				zeh->zeh_cd = le->le_cd;
476 				zeh->zeh_hash = le->le_hash;
477 				zeh->zeh_fakechunk = chunk;
478 				zeh->zeh_chunkp = &zeh->zeh_fakechunk;
479 				zeh->zeh_leaf = l;
480 			}
481 		}
482 	}
483 
484 	return (bestcd == -1U ? ENOENT : 0);
485 }
486 
487 int
488 zap_entry_read(const zap_entry_handle_t *zeh,
489     uint8_t integer_size, uint64_t num_integers, void *buf)
490 {
491 	struct zap_leaf_entry *le =
492 	    ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
493 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
494 
495 	if (le->le_value_intlen > integer_size)
496 		return (SET_ERROR(EINVAL));
497 
498 	zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk,
499 	    le->le_value_intlen, le->le_value_numints,
500 	    integer_size, num_integers, buf);
501 
502 	if (zeh->zeh_num_integers > num_integers)
503 		return (SET_ERROR(EOVERFLOW));
504 	return (0);
505 
506 }
507 
508 int
509 zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen,
510     char *buf)
511 {
512 	struct zap_leaf_entry *le =
513 	    ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
514 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
515 
516 	if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
517 		zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8,
518 		    le->le_name_numints, 8, buflen / 8, buf);
519 	} else {
520 		zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
521 		    le->le_name_numints, 1, buflen, buf);
522 	}
523 	if (le->le_name_numints > buflen)
524 		return (SET_ERROR(EOVERFLOW));
525 	return (0);
526 }
527 
528 int
529 zap_entry_update(zap_entry_handle_t *zeh,
530 	uint8_t integer_size, uint64_t num_integers, const void *buf)
531 {
532 	int delta_chunks;
533 	zap_leaf_t *l = zeh->zeh_leaf;
534 	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
535 
536 	delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
537 	    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen);
538 
539 	if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks)
540 		return (SET_ERROR(EAGAIN));
541 
542 	zap_leaf_array_free(l, &le->le_value_chunk);
543 	le->le_value_chunk =
544 	    zap_leaf_array_create(l, buf, integer_size, num_integers);
545 	le->le_value_numints = num_integers;
546 	le->le_value_intlen = integer_size;
547 	return (0);
548 }
549 
550 void
551 zap_entry_remove(zap_entry_handle_t *zeh)
552 {
553 	uint16_t entry_chunk;
554 	struct zap_leaf_entry *le;
555 	zap_leaf_t *l = zeh->zeh_leaf;
556 
557 	ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
558 
559 	entry_chunk = *zeh->zeh_chunkp;
560 	le = ZAP_LEAF_ENTRY(l, entry_chunk);
561 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
562 
563 	zap_leaf_array_free(l, &le->le_name_chunk);
564 	zap_leaf_array_free(l, &le->le_value_chunk);
565 
566 	*zeh->zeh_chunkp = le->le_next;
567 	zap_leaf_chunk_free(l, entry_chunk);
568 
569 	l->l_phys->l_hdr.lh_nentries--;
570 }
571 
572 int
573 zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
574     uint8_t integer_size, uint64_t num_integers, const void *buf,
575     zap_entry_handle_t *zeh)
576 {
577 	uint16_t chunk;
578 	uint16_t *chunkp;
579 	struct zap_leaf_entry *le;
580 	uint64_t valuelen;
581 	int numchunks;
582 	uint64_t h = zn->zn_hash;
583 
584 	valuelen = integer_size * num_integers;
585 
586 	numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
587 	    zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
588 	if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
589 		return (E2BIG);
590 
591 	if (cd == ZAP_NEED_CD) {
592 		/* find the lowest unused cd */
593 		if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
594 			cd = 0;
595 
596 			for (chunk = *LEAF_HASH_ENTPTR(l, h);
597 			    chunk != CHAIN_END; chunk = le->le_next) {
598 				le = ZAP_LEAF_ENTRY(l, chunk);
599 				if (le->le_cd > cd)
600 					break;
601 				if (le->le_hash == h) {
602 					ASSERT3U(cd, ==, le->le_cd);
603 					cd++;
604 				}
605 			}
606 		} else {
607 			/* old unsorted format; do it the O(n^2) way */
608 			for (cd = 0; ; cd++) {
609 				for (chunk = *LEAF_HASH_ENTPTR(l, h);
610 				    chunk != CHAIN_END; chunk = le->le_next) {
611 					le = ZAP_LEAF_ENTRY(l, chunk);
612 					if (le->le_hash == h &&
613 					    le->le_cd == cd) {
614 						break;
615 					}
616 				}
617 				/* If this cd is not in use, we are good. */
618 				if (chunk == CHAIN_END)
619 					break;
620 			}
621 		}
622 		/*
623 		 * We would run out of space in a block before we could
624 		 * store enough entries to run out of CD values.
625 		 */
626 		ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
627 	}
628 
629 	if (l->l_phys->l_hdr.lh_nfree < numchunks)
630 		return (SET_ERROR(EAGAIN));
631 
632 	/* make the entry */
633 	chunk = zap_leaf_chunk_alloc(l);
634 	le = ZAP_LEAF_ENTRY(l, chunk);
635 	le->le_type = ZAP_CHUNK_ENTRY;
636 	le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig,
637 	    zn->zn_key_intlen, zn->zn_key_orig_numints);
638 	le->le_name_numints = zn->zn_key_orig_numints;
639 	le->le_value_chunk =
640 	    zap_leaf_array_create(l, buf, integer_size, num_integers);
641 	le->le_value_numints = num_integers;
642 	le->le_value_intlen = integer_size;
643 	le->le_hash = h;
644 	le->le_cd = cd;
645 
646 	/* link it into the hash chain */
647 	/* XXX if we did the search above, we could just use that */
648 	chunkp = zap_leaf_rehash_entry(l, chunk);
649 
650 	l->l_phys->l_hdr.lh_nentries++;
651 
652 	zeh->zeh_leaf = l;
653 	zeh->zeh_num_integers = num_integers;
654 	zeh->zeh_integer_size = le->le_value_intlen;
655 	zeh->zeh_cd = le->le_cd;
656 	zeh->zeh_hash = le->le_hash;
657 	zeh->zeh_chunkp = chunkp;
658 
659 	return (0);
660 }
661 
662 /*
663  * Determine if there is another entry with the same normalized form.
664  * For performance purposes, either zn or name must be provided (the
665  * other can be NULL).  Note, there usually won't be any hash
666  * conflicts, in which case we don't need the concatenated/normalized
667  * form of the name.  But all callers have one of these on hand anyway,
668  * so might as well take advantage.  A cleaner but slower interface
669  * would accept neither argument, and compute the normalized name as
670  * needed (using zap_name_alloc(zap_entry_read_name(zeh))).
671  */
672 boolean_t
673 zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
674     const char *name, zap_t *zap)
675 {
676 	uint64_t chunk;
677 	struct zap_leaf_entry *le;
678 	boolean_t allocdzn = B_FALSE;
679 
680 	if (zap->zap_normflags == 0)
681 		return (B_FALSE);
682 
683 	for (chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash);
684 	    chunk != CHAIN_END; chunk = le->le_next) {
685 		le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk);
686 		if (le->le_hash != zeh->zeh_hash)
687 			continue;
688 		if (le->le_cd == zeh->zeh_cd)
689 			continue;
690 
691 		if (zn == NULL) {
692 			zn = zap_name_alloc(zap, name, MT_FIRST);
693 			allocdzn = B_TRUE;
694 		}
695 		if (zap_leaf_array_match(zeh->zeh_leaf, zn,
696 		    le->le_name_chunk, le->le_name_numints)) {
697 			if (allocdzn)
698 				zap_name_free(zn);
699 			return (B_TRUE);
700 		}
701 	}
702 	if (allocdzn)
703 		zap_name_free(zn);
704 	return (B_FALSE);
705 }
706 
707 /*
708  * Routines for transferring entries between leafs.
709  */
710 
711 static uint16_t *
712 zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
713 {
714 	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
715 	struct zap_leaf_entry *le2;
716 	uint16_t *chunkp;
717 
718 	/*
719 	 * keep the entry chain sorted by cd
720 	 * NB: this will not cause problems for unsorted leafs, though
721 	 * it is unnecessary there.
722 	 */
723 	for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash);
724 	    *chunkp != CHAIN_END; chunkp = &le2->le_next) {
725 		le2 = ZAP_LEAF_ENTRY(l, *chunkp);
726 		if (le2->le_cd > le->le_cd)
727 			break;
728 	}
729 
730 	le->le_next = *chunkp;
731 	*chunkp = entry;
732 	return (chunkp);
733 }
734 
735 static uint16_t
736 zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
737 {
738 	uint16_t new_chunk;
739 	uint16_t *nchunkp = &new_chunk;
740 
741 	while (chunk != CHAIN_END) {
742 		uint16_t nchunk = zap_leaf_chunk_alloc(nl);
743 		struct zap_leaf_array *nla =
744 		    &ZAP_LEAF_CHUNK(nl, nchunk).l_array;
745 		struct zap_leaf_array *la =
746 		    &ZAP_LEAF_CHUNK(l, chunk).l_array;
747 		int nextchunk = la->la_next;
748 
749 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
750 		ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));
751 
752 		*nla = *la; /* structure assignment */
753 
754 		zap_leaf_chunk_free(l, chunk);
755 		chunk = nextchunk;
756 		*nchunkp = nchunk;
757 		nchunkp = &nla->la_next;
758 	}
759 	*nchunkp = CHAIN_END;
760 	return (new_chunk);
761 }
762 
763 static void
764 zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
765 {
766 	struct zap_leaf_entry *le, *nle;
767 	uint16_t chunk;
768 
769 	le = ZAP_LEAF_ENTRY(l, entry);
770 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
771 
772 	chunk = zap_leaf_chunk_alloc(nl);
773 	nle = ZAP_LEAF_ENTRY(nl, chunk);
774 	*nle = *le; /* structure assignment */
775 
776 	(void) zap_leaf_rehash_entry(nl, chunk);
777 
778 	nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
779 	nle->le_value_chunk =
780 	    zap_leaf_transfer_array(l, le->le_value_chunk, nl);
781 
782 	zap_leaf_chunk_free(l, entry);
783 
784 	l->l_phys->l_hdr.lh_nentries--;
785 	nl->l_phys->l_hdr.lh_nentries++;
786 }
787 
788 /*
789  * Transfer the entries whose hash prefix ends in 1 to the new leaf.
790  */
791 void
792 zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
793 {
794 	int i;
795 	int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len;
796 
797 	/* set new prefix and prefix_len */
798 	l->l_phys->l_hdr.lh_prefix <<= 1;
799 	l->l_phys->l_hdr.lh_prefix_len++;
800 	nl->l_phys->l_hdr.lh_prefix = l->l_phys->l_hdr.lh_prefix | 1;
801 	nl->l_phys->l_hdr.lh_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
802 
803 	/* break existing hash chains */
804 	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
805 
806 	if (sort)
807 		l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
808 
809 	/*
810 	 * Transfer entries whose hash bit 'bit' is set to nl; rehash
811 	 * the remaining entries
812 	 *
813 	 * NB: We could find entries via the hashtable instead. That
814 	 * would be O(hashents+numents) rather than O(numblks+numents),
815 	 * but this accesses memory more sequentially, and when we're
816 	 * called, the block is usually pretty full.
817 	 */
818 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
819 		struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
820 		if (le->le_type != ZAP_CHUNK_ENTRY)
821 			continue;
822 
823 		if (le->le_hash & (1ULL << bit))
824 			zap_leaf_transfer_entry(l, i, nl);
825 		else
826 			(void) zap_leaf_rehash_entry(l, i);
827 	}
828 }
829 
830 void
831 zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
832 {
833 	int i, n;
834 
835 	n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
836 	    l->l_phys->l_hdr.lh_prefix_len;
837 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
838 	zs->zs_leafs_with_2n_pointers[n]++;
839 
840 
841 	n = l->l_phys->l_hdr.lh_nentries/5;
842 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
843 	zs->zs_blocks_with_n5_entries[n]++;
844 
845 	n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
846 	    l->l_phys->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
847 	    (1<<FZAP_BLOCK_SHIFT(zap));
848 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
849 	zs->zs_blocks_n_tenths_full[n]++;
850 
851 	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
852 		int nentries = 0;
853 		int chunk = l->l_phys->l_hash[i];
854 
855 		while (chunk != CHAIN_END) {
856 			struct zap_leaf_entry *le =
857 			    ZAP_LEAF_ENTRY(l, chunk);
858 
859 			n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) +
860 			    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints *
861 			    le->le_value_intlen);
862 			n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
863 			zs->zs_entries_using_n_chunks[n]++;
864 
865 			chunk = le->le_next;
866 			nentries++;
867 		}
868 
869 		n = nentries;
870 		n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
871 		zs->zs_buckets_with_n_entries[n]++;
872 	}
873 }
874