1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23 /*
24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
27 * Copyright 2017 Nexenta Systems, Inc.
28 * Copyright (c) 2024, Klara, Inc.
29 */
30
31 #include <sys/zio.h>
32 #include <sys/spa.h>
33 #include <sys/dmu.h>
34 #include <sys/zfs_context.h>
35 #include <sys/zap.h>
36 #include <sys/zap_impl.h>
37 #include <sys/zap_leaf.h>
38 #include <sys/btree.h>
39 #include <sys/arc.h>
40 #include <sys/dmu_objset.h>
41 #include <sys/spa_impl.h>
42
43 #ifdef _KERNEL
44 #include <sys/sunddi.h>
45 #endif
46
47 /*
48 * The maximum size (in bytes) of a microzap before it is converted to a
49 * fatzap. It will be rounded up to next multiple of 512 (SPA_MINBLOCKSIZE).
50 *
51 * By definition, a microzap must fit into a single block, so this has
52 * traditionally been SPA_OLD_MAXBLOCKSIZE, and is set to that by default.
53 * Setting this higher requires both the large_blocks feature (to even create
54 * blocks that large) and the large_microzap feature (to enable the stream
55 * machinery to understand not to try to split a microzap block).
56 *
57 * If large_microzap is enabled, this value will be clamped to
58 * spa_maxblocksize(), up to 1M. If not, it will be clamped to
59 * SPA_OLD_MAXBLOCKSIZE.
60 */
61 static int zap_micro_max_size = SPA_OLD_MAXBLOCKSIZE;
62
63 /*
64 * The 1M upper limit is necessary because the count of chunks in a microzap
65 * block is stored as a uint16_t (mze_chunkid). Each chunk is 64 bytes, and the
66 * first is used to store a header, so there are 32767 usable chunks, which is
67 * just under 2M. 1M is the largest power-2-rounded block size under 2M, so we
68 * must set the limit there.
69 */
70 #define MZAP_MAX_SIZE (1048576)
71
72 uint64_t
zap_get_micro_max_size(spa_t * spa)73 zap_get_micro_max_size(spa_t *spa)
74 {
75 uint64_t maxsz = MIN(MZAP_MAX_SIZE,
76 P2ROUNDUP(zap_micro_max_size, SPA_MINBLOCKSIZE));
77 if (maxsz <= SPA_OLD_MAXBLOCKSIZE)
78 return (maxsz);
79 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_MICROZAP))
80 return (MIN(maxsz, spa_maxblocksize(spa)));
81 return (SPA_OLD_MAXBLOCKSIZE);
82 }
83
84 static int mzap_upgrade(zap_t **zapp,
85 const void *tag, dmu_tx_t *tx, zap_flags_t flags);
86
87 uint64_t
zap_getflags(zap_t * zap)88 zap_getflags(zap_t *zap)
89 {
90 if (zap->zap_ismicro)
91 return (0);
92 return (zap_f_phys(zap)->zap_flags);
93 }
94
95 int
zap_hashbits(zap_t * zap)96 zap_hashbits(zap_t *zap)
97 {
98 if (zap_getflags(zap) & ZAP_FLAG_HASH64)
99 return (48);
100 else
101 return (28);
102 }
103
104 uint32_t
zap_maxcd(zap_t * zap)105 zap_maxcd(zap_t *zap)
106 {
107 if (zap_getflags(zap) & ZAP_FLAG_HASH64)
108 return ((1<<16)-1);
109 else
110 return (-1U);
111 }
112
113 static uint64_t
zap_hash(zap_name_t * zn)114 zap_hash(zap_name_t *zn)
115 {
116 zap_t *zap = zn->zn_zap;
117 uint64_t h = 0;
118
119 if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
120 ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
121 h = *(uint64_t *)zn->zn_key_orig;
122 } else {
123 h = zap->zap_salt;
124 ASSERT(h != 0);
125 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
126
127 if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
128 const uint64_t *wp = zn->zn_key_norm;
129
130 ASSERT(zn->zn_key_intlen == 8);
131 for (int i = 0; i < zn->zn_key_norm_numints;
132 wp++, i++) {
133 uint64_t word = *wp;
134
135 for (int j = 0; j < 8; j++) {
136 h = (h >> 8) ^
137 zfs_crc64_table[(h ^ word) & 0xFF];
138 word >>= NBBY;
139 }
140 }
141 } else {
142 const uint8_t *cp = zn->zn_key_norm;
143
144 /*
145 * We previously stored the terminating null on
146 * disk, but didn't hash it, so we need to
147 * continue to not hash it. (The
148 * zn_key_*_numints includes the terminating
149 * null for non-binary keys.)
150 */
151 int len = zn->zn_key_norm_numints - 1;
152
153 ASSERT(zn->zn_key_intlen == 1);
154 for (int i = 0; i < len; cp++, i++) {
155 h = (h >> 8) ^
156 zfs_crc64_table[(h ^ *cp) & 0xFF];
157 }
158 }
159 }
160 /*
161 * Don't use all 64 bits, since we need some in the cookie for
162 * the collision differentiator. We MUST use the high bits,
163 * since those are the ones that we first pay attention to when
164 * choosing the bucket.
165 */
166 h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
167
168 return (h);
169 }
170
171 static int
zap_normalize(zap_t * zap,const char * name,char * namenorm,int normflags,size_t outlen)172 zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags,
173 size_t outlen)
174 {
175 ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
176
177 size_t inlen = strlen(name) + 1;
178
179 int err = 0;
180 (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
181 normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
182 U8_UNICODE_LATEST, &err);
183
184 return (err);
185 }
186
187 boolean_t
zap_match(zap_name_t * zn,const char * matchname)188 zap_match(zap_name_t *zn, const char *matchname)
189 {
190 boolean_t res = B_FALSE;
191 ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
192
193 if (zn->zn_matchtype & MT_NORMALIZE) {
194 size_t namelen = zn->zn_normbuf_len;
195 char normbuf[ZAP_MAXNAMELEN];
196 char *norm = normbuf;
197
198 /*
199 * Cannot allocate this on-stack as it exceed the stack-limit of
200 * 1024.
201 */
202 if (namelen > ZAP_MAXNAMELEN)
203 norm = kmem_alloc(namelen, KM_SLEEP);
204
205 if (zap_normalize(zn->zn_zap, matchname, norm,
206 zn->zn_normflags, namelen) != 0) {
207 res = B_FALSE;
208 } else {
209 res = (strcmp(zn->zn_key_norm, norm) == 0);
210 }
211 if (norm != normbuf)
212 kmem_free(norm, namelen);
213 } else {
214 res = (strcmp(zn->zn_key_orig, matchname) == 0);
215 }
216 return (res);
217 }
218
219 static kmem_cache_t *zap_name_cache;
220 static kmem_cache_t *zap_attr_cache;
221 static kmem_cache_t *zap_name_long_cache;
222 static kmem_cache_t *zap_attr_long_cache;
223
224 void
zap_init(void)225 zap_init(void)
226 {
227 zap_name_cache = kmem_cache_create("zap_name",
228 sizeof (zap_name_t) + ZAP_MAXNAMELEN, 0, NULL, NULL,
229 NULL, NULL, NULL, 0);
230
231 zap_attr_cache = kmem_cache_create("zap_attr_cache",
232 sizeof (zap_attribute_t) + ZAP_MAXNAMELEN, 0, NULL,
233 NULL, NULL, NULL, NULL, 0);
234
235 zap_name_long_cache = kmem_cache_create("zap_name_long",
236 sizeof (zap_name_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, NULL,
237 NULL, NULL, NULL, 0);
238
239 zap_attr_long_cache = kmem_cache_create("zap_attr_long_cache",
240 sizeof (zap_attribute_t) + ZAP_MAXNAMELEN_NEW, 0, NULL,
241 NULL, NULL, NULL, NULL, 0);
242 }
243
244 void
zap_fini(void)245 zap_fini(void)
246 {
247 kmem_cache_destroy(zap_name_cache);
248 kmem_cache_destroy(zap_attr_cache);
249 kmem_cache_destroy(zap_name_long_cache);
250 kmem_cache_destroy(zap_attr_long_cache);
251 }
252
253 static zap_name_t *
zap_name_alloc(zap_t * zap,boolean_t longname)254 zap_name_alloc(zap_t *zap, boolean_t longname)
255 {
256 kmem_cache_t *cache = longname ? zap_name_long_cache : zap_name_cache;
257 zap_name_t *zn = kmem_cache_alloc(cache, KM_SLEEP);
258
259 zn->zn_zap = zap;
260 zn->zn_normbuf_len = longname ? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN;
261 return (zn);
262 }
263
264 void
zap_name_free(zap_name_t * zn)265 zap_name_free(zap_name_t *zn)
266 {
267 if (zn->zn_normbuf_len == ZAP_MAXNAMELEN) {
268 kmem_cache_free(zap_name_cache, zn);
269 } else {
270 ASSERT3U(zn->zn_normbuf_len, ==, ZAP_MAXNAMELEN_NEW);
271 kmem_cache_free(zap_name_long_cache, zn);
272 }
273 }
274
275 static int
zap_name_init_str(zap_name_t * zn,const char * key,matchtype_t mt)276 zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt)
277 {
278 zap_t *zap = zn->zn_zap;
279 size_t key_len = strlen(key) + 1;
280
281 /* Make sure zn is allocated for longname if key is long */
282 IMPLY(key_len > ZAP_MAXNAMELEN,
283 zn->zn_normbuf_len == ZAP_MAXNAMELEN_NEW);
284
285 zn->zn_key_intlen = sizeof (*key);
286 zn->zn_key_orig = key;
287 zn->zn_key_orig_numints = key_len;
288 zn->zn_matchtype = mt;
289 zn->zn_normflags = zap->zap_normflags;
290
291 /*
292 * If we're dealing with a case sensitive lookup on a mixed or
293 * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
294 * will fold case to all caps overriding the lookup request.
295 */
296 if (mt & MT_MATCH_CASE)
297 zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;
298
299 if (zap->zap_normflags) {
300 /*
301 * We *must* use zap_normflags because this normalization is
302 * what the hash is computed from.
303 */
304 if (zap_normalize(zap, key, zn->zn_normbuf,
305 zap->zap_normflags, zn->zn_normbuf_len) != 0)
306 return (SET_ERROR(ENOTSUP));
307 zn->zn_key_norm = zn->zn_normbuf;
308 zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
309 } else {
310 if (mt != 0)
311 return (SET_ERROR(ENOTSUP));
312 zn->zn_key_norm = zn->zn_key_orig;
313 zn->zn_key_norm_numints = zn->zn_key_orig_numints;
314 }
315
316 zn->zn_hash = zap_hash(zn);
317
318 if (zap->zap_normflags != zn->zn_normflags) {
319 /*
320 * We *must* use zn_normflags because this normalization is
321 * what the matching is based on. (Not the hash!)
322 */
323 if (zap_normalize(zap, key, zn->zn_normbuf,
324 zn->zn_normflags, zn->zn_normbuf_len) != 0)
325 return (SET_ERROR(ENOTSUP));
326 zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
327 }
328
329 return (0);
330 }
331
332 zap_name_t *
zap_name_alloc_str(zap_t * zap,const char * key,matchtype_t mt)333 zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt)
334 {
335 size_t key_len = strlen(key) + 1;
336 zap_name_t *zn = zap_name_alloc(zap, (key_len > ZAP_MAXNAMELEN));
337 if (zap_name_init_str(zn, key, mt) != 0) {
338 zap_name_free(zn);
339 return (NULL);
340 }
341 return (zn);
342 }
343
344 static zap_name_t *
zap_name_alloc_uint64(zap_t * zap,const uint64_t * key,int numints)345 zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
346 {
347 zap_name_t *zn = kmem_cache_alloc(zap_name_cache, KM_SLEEP);
348
349 ASSERT(zap->zap_normflags == 0);
350 zn->zn_zap = zap;
351 zn->zn_key_intlen = sizeof (*key);
352 zn->zn_key_orig = zn->zn_key_norm = key;
353 zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
354 zn->zn_matchtype = 0;
355 zn->zn_normbuf_len = ZAP_MAXNAMELEN;
356
357 zn->zn_hash = zap_hash(zn);
358 return (zn);
359 }
360
361 static void
mzap_byteswap(mzap_phys_t * buf,size_t size)362 mzap_byteswap(mzap_phys_t *buf, size_t size)
363 {
364 buf->mz_block_type = BSWAP_64(buf->mz_block_type);
365 buf->mz_salt = BSWAP_64(buf->mz_salt);
366 buf->mz_normflags = BSWAP_64(buf->mz_normflags);
367 int max = (size / MZAP_ENT_LEN) - 1;
368 for (int i = 0; i < max; i++) {
369 buf->mz_chunk[i].mze_value =
370 BSWAP_64(buf->mz_chunk[i].mze_value);
371 buf->mz_chunk[i].mze_cd =
372 BSWAP_32(buf->mz_chunk[i].mze_cd);
373 }
374 }
375
376 void
zap_byteswap(void * buf,size_t size)377 zap_byteswap(void *buf, size_t size)
378 {
379 uint64_t block_type = *(uint64_t *)buf;
380
381 if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
382 /* ASSERT(magic == ZAP_LEAF_MAGIC); */
383 mzap_byteswap(buf, size);
384 } else {
385 fzap_byteswap(buf, size);
386 }
387 }
388
389 __attribute__((always_inline)) inline
390 static int
mze_compare(const void * arg1,const void * arg2)391 mze_compare(const void *arg1, const void *arg2)
392 {
393 const mzap_ent_t *mze1 = arg1;
394 const mzap_ent_t *mze2 = arg2;
395
396 return (TREE_CMP((uint64_t)(mze1->mze_hash) << 32 | mze1->mze_cd,
397 (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd));
398 }
399
ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf,mzap_ent_t,mze_compare)400 ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t,
401 mze_compare)
402
403 static void
404 mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash)
405 {
406 mzap_ent_t mze;
407
408 ASSERT(zap->zap_ismicro);
409 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
410
411 mze.mze_chunkid = chunkid;
412 ASSERT0(hash & 0xffffffff);
413 mze.mze_hash = hash >> 32;
414 ASSERT3U(MZE_PHYS(zap, &mze)->mze_cd, <=, 0xffff);
415 mze.mze_cd = (uint16_t)MZE_PHYS(zap, &mze)->mze_cd;
416 ASSERT(MZE_PHYS(zap, &mze)->mze_name[0] != 0);
417 zfs_btree_add(&zap->zap_m.zap_tree, &mze);
418 }
419
420 static mzap_ent_t *
mze_find(zap_name_t * zn,zfs_btree_index_t * idx)421 mze_find(zap_name_t *zn, zfs_btree_index_t *idx)
422 {
423 mzap_ent_t mze_tofind;
424 mzap_ent_t *mze;
425 zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree;
426
427 ASSERT(zn->zn_zap->zap_ismicro);
428 ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
429
430 ASSERT0(zn->zn_hash & 0xffffffff);
431 mze_tofind.mze_hash = zn->zn_hash >> 32;
432 mze_tofind.mze_cd = 0;
433
434 mze = zfs_btree_find(tree, &mze_tofind, idx);
435 if (mze == NULL)
436 mze = zfs_btree_next(tree, idx, idx);
437 for (; mze && mze->mze_hash == mze_tofind.mze_hash;
438 mze = zfs_btree_next(tree, idx, idx)) {
439 ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
440 if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
441 return (mze);
442 }
443
444 return (NULL);
445 }
446
447 static uint32_t
mze_find_unused_cd(zap_t * zap,uint64_t hash)448 mze_find_unused_cd(zap_t *zap, uint64_t hash)
449 {
450 mzap_ent_t mze_tofind;
451 zfs_btree_index_t idx;
452 zfs_btree_t *tree = &zap->zap_m.zap_tree;
453
454 ASSERT(zap->zap_ismicro);
455 ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
456
457 ASSERT0(hash & 0xffffffff);
458 hash >>= 32;
459 mze_tofind.mze_hash = hash;
460 mze_tofind.mze_cd = 0;
461
462 uint32_t cd = 0;
463 for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
464 mze && mze->mze_hash == hash;
465 mze = zfs_btree_next(tree, &idx, &idx)) {
466 if (mze->mze_cd != cd)
467 break;
468 cd++;
469 }
470
471 return (cd);
472 }
473
474 /*
475 * Each mzap entry requires at max : 4 chunks
476 * 3 chunks for names + 1 chunk for value.
477 */
478 #define MZAP_ENT_CHUNKS (1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \
479 ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t)))
480
481 /*
482 * Check if the current entry keeps the colliding entries under the fatzap leaf
483 * size.
484 */
485 static boolean_t
mze_canfit_fzap_leaf(zap_name_t * zn,uint64_t hash)486 mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
487 {
488 zap_t *zap = zn->zn_zap;
489 mzap_ent_t mze_tofind;
490 zfs_btree_index_t idx;
491 zfs_btree_t *tree = &zap->zap_m.zap_tree;
492 uint32_t mzap_ents = 0;
493
494 ASSERT0(hash & 0xffffffff);
495 hash >>= 32;
496 mze_tofind.mze_hash = hash;
497 mze_tofind.mze_cd = 0;
498
499 for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
500 mze && mze->mze_hash == hash;
501 mze = zfs_btree_next(tree, &idx, &idx)) {
502 mzap_ents++;
503 }
504
505 /* Include the new entry being added */
506 mzap_ents++;
507
508 return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS));
509 }
510
511 static void
mze_destroy(zap_t * zap)512 mze_destroy(zap_t *zap)
513 {
514 zfs_btree_clear(&zap->zap_m.zap_tree);
515 zfs_btree_destroy(&zap->zap_m.zap_tree);
516 }
517
518 static zap_t *
mzap_open(dmu_buf_t * db)519 mzap_open(dmu_buf_t *db)
520 {
521 zap_t *winner;
522 uint64_t *zap_hdr = (uint64_t *)db->db_data;
523 uint64_t zap_block_type = zap_hdr[0];
524 uint64_t zap_magic = zap_hdr[1];
525
526 ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
527
528 zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
529 rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL);
530 rw_enter(&zap->zap_rwlock, RW_WRITER);
531 zap->zap_objset = dmu_buf_get_objset(db);
532 zap->zap_object = db->db_object;
533 zap->zap_dbuf = db;
534
535 if (zap_block_type != ZBT_MICRO) {
536 mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT,
537 0);
538 zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
539 if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) {
540 winner = NULL; /* No actual winner here... */
541 goto handle_winner;
542 }
543 } else {
544 zap->zap_ismicro = TRUE;
545 }
546
547 /*
548 * Make sure that zap_ismicro is set before we let others see
549 * it, because zap_lockdir() checks zap_ismicro without the lock
550 * held.
551 */
552 dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
553 winner = dmu_buf_set_user(db, &zap->zap_dbu);
554
555 if (winner != NULL)
556 goto handle_winner;
557
558 if (zap->zap_ismicro) {
559 zap->zap_salt = zap_m_phys(zap)->mz_salt;
560 zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
561 zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
562
563 /*
564 * Reduce B-tree leaf from 4KB to 512 bytes to reduce memmove()
565 * overhead on massive inserts below. It still allows to store
566 * 62 entries before we have to add 2KB B-tree core node.
567 */
568 zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare,
569 mze_find_in_buf, sizeof (mzap_ent_t), 512);
570
571 zap_name_t *zn = zap_name_alloc(zap, B_FALSE);
572 for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) {
573 mzap_ent_phys_t *mze =
574 &zap_m_phys(zap)->mz_chunk[i];
575 if (mze->mze_name[0]) {
576 zap->zap_m.zap_num_entries++;
577 zap_name_init_str(zn, mze->mze_name, 0);
578 mze_insert(zap, i, zn->zn_hash);
579 }
580 }
581 zap_name_free(zn);
582 } else {
583 zap->zap_salt = zap_f_phys(zap)->zap_salt;
584 zap->zap_normflags = zap_f_phys(zap)->zap_normflags;
585
586 ASSERT3U(sizeof (struct zap_leaf_header), ==,
587 2*ZAP_LEAF_CHUNKSIZE);
588
589 /*
590 * The embedded pointer table should not overlap the
591 * other members.
592 */
593 ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
594 &zap_f_phys(zap)->zap_salt);
595
596 /*
597 * The embedded pointer table should end at the end of
598 * the block
599 */
600 ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
601 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
602 (uintptr_t)zap_f_phys(zap), ==,
603 zap->zap_dbuf->db_size);
604 }
605 rw_exit(&zap->zap_rwlock);
606 return (zap);
607
608 handle_winner:
609 rw_exit(&zap->zap_rwlock);
610 rw_destroy(&zap->zap_rwlock);
611 if (!zap->zap_ismicro)
612 mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
613 kmem_free(zap, sizeof (zap_t));
614 return (winner);
615 }
616
617 /*
618 * This routine "consumes" the caller's hold on the dbuf, which must
619 * have the specified tag.
620 */
621 static int
zap_lockdir_impl(dnode_t * dn,dmu_buf_t * db,const void * tag,dmu_tx_t * tx,krw_t lti,boolean_t fatreader,boolean_t adding,zap_t ** zapp)622 zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
623 krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
624 {
625 ASSERT0(db->db_offset);
626 objset_t *os = dmu_buf_get_objset(db);
627 uint64_t obj = db->db_object;
628 dmu_object_info_t doi;
629
630 *zapp = NULL;
631
632 dmu_object_info_from_dnode(dn, &doi);
633 if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
634 return (SET_ERROR(EINVAL));
635
636 zap_t *zap = dmu_buf_get_user(db);
637 if (zap == NULL) {
638 zap = mzap_open(db);
639 if (zap == NULL) {
640 /*
641 * mzap_open() didn't like what it saw on-disk.
642 * Check for corruption!
643 */
644 return (SET_ERROR(EIO));
645 }
646 }
647
648 /*
649 * We're checking zap_ismicro without the lock held, in order to
650 * tell what type of lock we want. Once we have some sort of
651 * lock, see if it really is the right type. In practice this
652 * can only be different if it was upgraded from micro to fat,
653 * and micro wanted WRITER but fat only needs READER.
654 */
655 krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
656 rw_enter(&zap->zap_rwlock, lt);
657 if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
658 /* it was upgraded, now we only need reader */
659 ASSERT(lt == RW_WRITER);
660 ASSERT(RW_READER ==
661 ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
662 rw_downgrade(&zap->zap_rwlock);
663 lt = RW_READER;
664 }
665
666 zap->zap_objset = os;
667 zap->zap_dnode = dn;
668
669 if (lt == RW_WRITER)
670 dmu_buf_will_dirty(db, tx);
671
672 ASSERT3P(zap->zap_dbuf, ==, db);
673
674 ASSERT(!zap->zap_ismicro ||
675 zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
676 if (zap->zap_ismicro && tx && adding &&
677 zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
678 uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
679 if (newsz > zap_get_micro_max_size(dmu_objset_spa(os))) {
680 dprintf("upgrading obj %llu: num_entries=%u\n",
681 (u_longlong_t)obj, zap->zap_m.zap_num_entries);
682 *zapp = zap;
683 int err = mzap_upgrade(zapp, tag, tx, 0);
684 if (err != 0)
685 rw_exit(&zap->zap_rwlock);
686 return (err);
687 }
688 VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
689 zap->zap_m.zap_num_chunks =
690 db->db_size / MZAP_ENT_LEN - 1;
691
692 if (newsz > SPA_OLD_MAXBLOCKSIZE) {
693 dsl_dataset_t *ds = dmu_objset_ds(os);
694 if (!dsl_dataset_feature_is_active(ds,
695 SPA_FEATURE_LARGE_MICROZAP)) {
696 /*
697 * A microzap just grew beyond the old limit
698 * for the first time, so we have to ensure the
699 * feature flag is activated.
700 * zap_get_micro_max_size() won't let us get
701 * here if the feature is not enabled, so we
702 * don't need any other checks beforehand.
703 *
704 * Since we're in open context, we can't
705 * activate the feature directly, so we instead
706 * flag it on the dataset for next sync.
707 */
708 dsl_dataset_dirty(ds, tx);
709 mutex_enter(&ds->ds_lock);
710 ds->ds_feature_activation
711 [SPA_FEATURE_LARGE_MICROZAP] =
712 (void *)B_TRUE;
713 mutex_exit(&ds->ds_lock);
714 }
715 }
716 }
717
718 *zapp = zap;
719 return (0);
720 }
721
722 static int
zap_lockdir_by_dnode(dnode_t * dn,dmu_tx_t * tx,krw_t lti,boolean_t fatreader,boolean_t adding,const void * tag,zap_t ** zapp)723 zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
724 krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
725 zap_t **zapp)
726 {
727 dmu_buf_t *db;
728 int err;
729
730 err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
731 if (err != 0)
732 return (err);
733 err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
734 if (err != 0)
735 dmu_buf_rele(db, tag);
736 else
737 VERIFY(dnode_add_ref(dn, tag));
738 return (err);
739 }
740
741 int
zap_lockdir(objset_t * os,uint64_t obj,dmu_tx_t * tx,krw_t lti,boolean_t fatreader,boolean_t adding,const void * tag,zap_t ** zapp)742 zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
743 krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
744 zap_t **zapp)
745 {
746 dnode_t *dn;
747 dmu_buf_t *db;
748 int err;
749
750 err = dnode_hold(os, obj, tag, &dn);
751 if (err != 0)
752 return (err);
753 err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
754 if (err != 0) {
755 dnode_rele(dn, tag);
756 return (err);
757 }
758 err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
759 if (err != 0) {
760 dmu_buf_rele(db, tag);
761 dnode_rele(dn, tag);
762 }
763 return (err);
764 }
765
766 void
zap_unlockdir(zap_t * zap,const void * tag)767 zap_unlockdir(zap_t *zap, const void *tag)
768 {
769 rw_exit(&zap->zap_rwlock);
770 dnode_rele(zap->zap_dnode, tag);
771 dmu_buf_rele(zap->zap_dbuf, tag);
772 }
773
774 static int
mzap_upgrade(zap_t ** zapp,const void * tag,dmu_tx_t * tx,zap_flags_t flags)775 mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags)
776 {
777 int err = 0;
778 zap_t *zap = *zapp;
779
780 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
781
782 int sz = zap->zap_dbuf->db_size;
783 mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP);
784 memcpy(mzp, zap->zap_dbuf->db_data, sz);
785 int nchunks = zap->zap_m.zap_num_chunks;
786
787 if (!flags) {
788 err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
789 1ULL << fzap_default_block_shift, 0, tx);
790 if (err != 0) {
791 vmem_free(mzp, sz);
792 return (err);
793 }
794 }
795
796 dprintf("upgrading obj=%llu with %u chunks\n",
797 (u_longlong_t)zap->zap_object, nchunks);
798 /* XXX destroy the tree later, so we can use the stored hash value */
799 mze_destroy(zap);
800
801 fzap_upgrade(zap, tx, flags);
802
803 zap_name_t *zn = zap_name_alloc(zap, B_FALSE);
804 for (int i = 0; i < nchunks; i++) {
805 mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
806 if (mze->mze_name[0] == 0)
807 continue;
808 dprintf("adding %s=%llu\n",
809 mze->mze_name, (u_longlong_t)mze->mze_value);
810 zap_name_init_str(zn, mze->mze_name, 0);
811 /* If we fail here, we would end up losing entries */
812 VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
813 tag, tx));
814 zap = zn->zn_zap; /* fzap_add_cd() may change zap */
815 }
816 zap_name_free(zn);
817 vmem_free(mzp, sz);
818 *zapp = zap;
819 return (0);
820 }
821
822 /*
823 * The "normflags" determine the behavior of the matchtype_t which is
824 * passed to zap_lookup_norm(). Names which have the same normalized
825 * version will be stored with the same hash value, and therefore we can
826 * perform normalization-insensitive lookups. We can be Unicode form-
827 * insensitive and/or case-insensitive. The following flags are valid for
828 * "normflags":
829 *
830 * U8_TEXTPREP_NFC
831 * U8_TEXTPREP_NFD
832 * U8_TEXTPREP_NFKC
833 * U8_TEXTPREP_NFKD
834 * U8_TEXTPREP_TOUPPER
835 *
836 * The *_NF* (Normalization Form) flags are mutually exclusive; at most one
837 * of them may be supplied.
838 */
839 void
mzap_create_impl(dnode_t * dn,int normflags,zap_flags_t flags,dmu_tx_t * tx)840 mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
841 {
842 dmu_buf_t *db;
843
844 VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
845
846 dmu_buf_will_dirty(db, tx);
847 mzap_phys_t *zp = db->db_data;
848 zp->mz_block_type = ZBT_MICRO;
849 zp->mz_salt =
850 ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL;
851 zp->mz_normflags = normflags;
852
853 if (flags != 0) {
854 zap_t *zap;
855 /* Only fat zap supports flags; upgrade immediately. */
856 VERIFY(dnode_add_ref(dn, FTAG));
857 VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER,
858 B_FALSE, B_FALSE, &zap));
859 VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
860 zap_unlockdir(zap, FTAG);
861 } else {
862 dmu_buf_rele(db, FTAG);
863 }
864 }
865
866 static uint64_t
zap_create_impl(objset_t * os,int normflags,zap_flags_t flags,dmu_object_type_t ot,int leaf_blockshift,int indirect_blockshift,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dnode_t ** allocated_dnode,const void * tag,dmu_tx_t * tx)867 zap_create_impl(objset_t *os, int normflags, zap_flags_t flags,
868 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
869 dmu_object_type_t bonustype, int bonuslen, int dnodesize,
870 dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
871 {
872 uint64_t obj;
873
874 ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
875
876 if (allocated_dnode == NULL) {
877 dnode_t *dn;
878 obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
879 indirect_blockshift, bonustype, bonuslen, dnodesize,
880 &dn, FTAG, tx);
881 mzap_create_impl(dn, normflags, flags, tx);
882 dnode_rele(dn, FTAG);
883 } else {
884 obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
885 indirect_blockshift, bonustype, bonuslen, dnodesize,
886 allocated_dnode, tag, tx);
887 mzap_create_impl(*allocated_dnode, normflags, flags, tx);
888 }
889
890 return (obj);
891 }
892
893 int
zap_create_claim(objset_t * os,uint64_t obj,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)894 zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
895 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
896 {
897 return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen,
898 0, tx));
899 }
900
901 int
zap_create_claim_dnsize(objset_t * os,uint64_t obj,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)902 zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
903 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
904 {
905 return (zap_create_claim_norm_dnsize(os, obj,
906 0, ot, bonustype, bonuslen, dnodesize, tx));
907 }
908
909 int
zap_create_claim_norm(objset_t * os,uint64_t obj,int normflags,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)910 zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
911 dmu_object_type_t ot,
912 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
913 {
914 return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype,
915 bonuslen, 0, tx));
916 }
917
918 int
zap_create_claim_norm_dnsize(objset_t * os,uint64_t obj,int normflags,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)919 zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
920 dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
921 int dnodesize, dmu_tx_t *tx)
922 {
923 dnode_t *dn;
924 int error;
925
926 ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
927 error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
928 dnodesize, tx);
929 if (error != 0)
930 return (error);
931
932 error = dnode_hold(os, obj, FTAG, &dn);
933 if (error != 0)
934 return (error);
935
936 mzap_create_impl(dn, normflags, 0, tx);
937
938 dnode_rele(dn, FTAG);
939
940 return (0);
941 }
942
943 uint64_t
zap_create(objset_t * os,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)944 zap_create(objset_t *os, dmu_object_type_t ot,
945 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
946 {
947 return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
948 }
949
950 uint64_t
zap_create_dnsize(objset_t * os,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)951 zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
952 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
953 {
954 return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen,
955 dnodesize, tx));
956 }
957
958 uint64_t
zap_create_norm(objset_t * os,int normflags,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)959 zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
960 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
961 {
962 return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen,
963 0, tx));
964 }
965
966 uint64_t
zap_create_norm_dnsize(objset_t * os,int normflags,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)967 zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
968 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
969 {
970 return (zap_create_impl(os, normflags, 0, ot, 0, 0,
971 bonustype, bonuslen, dnodesize, NULL, NULL, tx));
972 }
973
974 uint64_t
zap_create_flags(objset_t * os,int normflags,zap_flags_t flags,dmu_object_type_t ot,int leaf_blockshift,int indirect_blockshift,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)975 zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
976 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
977 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
978 {
979 return (zap_create_flags_dnsize(os, normflags, flags, ot,
980 leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
981 }
982
983 uint64_t
zap_create_flags_dnsize(objset_t * os,int normflags,zap_flags_t flags,dmu_object_type_t ot,int leaf_blockshift,int indirect_blockshift,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)984 zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
985 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
986 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
987 {
988 return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
989 indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL,
990 tx));
991 }
992
993 /*
994 * Create a zap object and return a pointer to the newly allocated dnode via
995 * the allocated_dnode argument. The returned dnode will be held and the
996 * caller is responsible for releasing the hold by calling dnode_rele().
997 */
998 uint64_t
zap_create_hold(objset_t * os,int normflags,zap_flags_t flags,dmu_object_type_t ot,int leaf_blockshift,int indirect_blockshift,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dnode_t ** allocated_dnode,const void * tag,dmu_tx_t * tx)999 zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
1000 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
1001 dmu_object_type_t bonustype, int bonuslen, int dnodesize,
1002 dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
1003 {
1004 return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
1005 indirect_blockshift, bonustype, bonuslen, dnodesize,
1006 allocated_dnode, tag, tx));
1007 }
1008
1009 int
zap_destroy(objset_t * os,uint64_t zapobj,dmu_tx_t * tx)1010 zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
1011 {
1012 /*
1013 * dmu_object_free will free the object number and free the
1014 * data. Freeing the data will cause our pageout function to be
1015 * called, which will destroy our data (zap_leaf_t's and zap_t).
1016 */
1017
1018 return (dmu_object_free(os, zapobj, tx));
1019 }
1020
1021 void
zap_evict_sync(void * dbu)1022 zap_evict_sync(void *dbu)
1023 {
1024 zap_t *zap = dbu;
1025
1026 rw_destroy(&zap->zap_rwlock);
1027
1028 if (zap->zap_ismicro)
1029 mze_destroy(zap);
1030 else
1031 mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
1032
1033 kmem_free(zap, sizeof (zap_t));
1034 }
1035
1036 int
zap_count(objset_t * os,uint64_t zapobj,uint64_t * count)1037 zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
1038 {
1039 zap_t *zap;
1040
1041 int err =
1042 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1043 if (err != 0)
1044 return (err);
1045 if (!zap->zap_ismicro) {
1046 err = fzap_count(zap, count);
1047 } else {
1048 *count = zap->zap_m.zap_num_entries;
1049 }
1050 zap_unlockdir(zap, FTAG);
1051 return (err);
1052 }
1053
1054 /*
1055 * zn may be NULL; if not specified, it will be computed if needed.
1056 * See also the comment above zap_entry_normalization_conflict().
1057 */
1058 static boolean_t
mzap_normalization_conflict(zap_t * zap,zap_name_t * zn,mzap_ent_t * mze,zfs_btree_index_t * idx)1059 mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze,
1060 zfs_btree_index_t *idx)
1061 {
1062 boolean_t allocdzn = B_FALSE;
1063 mzap_ent_t *other;
1064 zfs_btree_index_t oidx;
1065
1066 if (zap->zap_normflags == 0)
1067 return (B_FALSE);
1068
1069 for (other = zfs_btree_prev(&zap->zap_m.zap_tree, idx, &oidx);
1070 other && other->mze_hash == mze->mze_hash;
1071 other = zfs_btree_prev(&zap->zap_m.zap_tree, &oidx, &oidx)) {
1072
1073 if (zn == NULL) {
1074 zn = zap_name_alloc_str(zap,
1075 MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
1076 allocdzn = B_TRUE;
1077 }
1078 if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
1079 if (allocdzn)
1080 zap_name_free(zn);
1081 return (B_TRUE);
1082 }
1083 }
1084
1085 for (other = zfs_btree_next(&zap->zap_m.zap_tree, idx, &oidx);
1086 other && other->mze_hash == mze->mze_hash;
1087 other = zfs_btree_next(&zap->zap_m.zap_tree, &oidx, &oidx)) {
1088
1089 if (zn == NULL) {
1090 zn = zap_name_alloc_str(zap,
1091 MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
1092 allocdzn = B_TRUE;
1093 }
1094 if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
1095 if (allocdzn)
1096 zap_name_free(zn);
1097 return (B_TRUE);
1098 }
1099 }
1100
1101 if (allocdzn)
1102 zap_name_free(zn);
1103 return (B_FALSE);
1104 }
1105
1106 /*
1107 * Routines for manipulating attributes.
1108 */
1109
1110 int
zap_lookup(objset_t * os,uint64_t zapobj,const char * name,uint64_t integer_size,uint64_t num_integers,void * buf)1111 zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
1112 uint64_t integer_size, uint64_t num_integers, void *buf)
1113 {
1114 return (zap_lookup_norm(os, zapobj, name, integer_size,
1115 num_integers, buf, 0, NULL, 0, NULL));
1116 }
1117
1118 static int
zap_lookup_impl(zap_t * zap,const char * name,uint64_t integer_size,uint64_t num_integers,void * buf,matchtype_t mt,char * realname,int rn_len,boolean_t * ncp)1119 zap_lookup_impl(zap_t *zap, const char *name,
1120 uint64_t integer_size, uint64_t num_integers, void *buf,
1121 matchtype_t mt, char *realname, int rn_len,
1122 boolean_t *ncp)
1123 {
1124 int err = 0;
1125
1126 zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
1127 if (zn == NULL)
1128 return (SET_ERROR(ENOTSUP));
1129
1130 if (!zap->zap_ismicro) {
1131 err = fzap_lookup(zn, integer_size, num_integers, buf,
1132 realname, rn_len, ncp);
1133 } else {
1134 zfs_btree_index_t idx;
1135 mzap_ent_t *mze = mze_find(zn, &idx);
1136 if (mze == NULL) {
1137 err = SET_ERROR(ENOENT);
1138 } else {
1139 if (num_integers < 1) {
1140 err = SET_ERROR(EOVERFLOW);
1141 } else if (integer_size != 8) {
1142 err = SET_ERROR(EINVAL);
1143 } else {
1144 *(uint64_t *)buf =
1145 MZE_PHYS(zap, mze)->mze_value;
1146 if (realname != NULL)
1147 (void) strlcpy(realname,
1148 MZE_PHYS(zap, mze)->mze_name,
1149 rn_len);
1150 if (ncp) {
1151 *ncp = mzap_normalization_conflict(zap,
1152 zn, mze, &idx);
1153 }
1154 }
1155 }
1156 }
1157 zap_name_free(zn);
1158 return (err);
1159 }
1160
1161 int
zap_lookup_norm(objset_t * os,uint64_t zapobj,const char * name,uint64_t integer_size,uint64_t num_integers,void * buf,matchtype_t mt,char * realname,int rn_len,boolean_t * ncp)1162 zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
1163 uint64_t integer_size, uint64_t num_integers, void *buf,
1164 matchtype_t mt, char *realname, int rn_len,
1165 boolean_t *ncp)
1166 {
1167 zap_t *zap;
1168
1169 int err =
1170 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1171 if (err != 0)
1172 return (err);
1173 err = zap_lookup_impl(zap, name, integer_size,
1174 num_integers, buf, mt, realname, rn_len, ncp);
1175 zap_unlockdir(zap, FTAG);
1176 return (err);
1177 }
1178
1179 int
zap_prefetch(objset_t * os,uint64_t zapobj,const char * name)1180 zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
1181 {
1182 zap_t *zap;
1183 int err;
1184 zap_name_t *zn;
1185
1186 err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1187 if (err)
1188 return (err);
1189 zn = zap_name_alloc_str(zap, name, 0);
1190 if (zn == NULL) {
1191 zap_unlockdir(zap, FTAG);
1192 return (SET_ERROR(ENOTSUP));
1193 }
1194
1195 fzap_prefetch(zn);
1196 zap_name_free(zn);
1197 zap_unlockdir(zap, FTAG);
1198 return (err);
1199 }
1200
1201 int
zap_prefetch_object(objset_t * os,uint64_t zapobj)1202 zap_prefetch_object(objset_t *os, uint64_t zapobj)
1203 {
1204 int error;
1205 dmu_object_info_t doi;
1206
1207 error = dmu_object_info(os, zapobj, &doi);
1208 if (error == 0 && DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
1209 error = SET_ERROR(EINVAL);
1210 if (error == 0)
1211 dmu_prefetch_wait(os, zapobj, 0, doi.doi_max_offset);
1212
1213 return (error);
1214 }
1215
1216 int
zap_lookup_by_dnode(dnode_t * dn,const char * name,uint64_t integer_size,uint64_t num_integers,void * buf)1217 zap_lookup_by_dnode(dnode_t *dn, const char *name,
1218 uint64_t integer_size, uint64_t num_integers, void *buf)
1219 {
1220 return (zap_lookup_norm_by_dnode(dn, name, integer_size,
1221 num_integers, buf, 0, NULL, 0, NULL));
1222 }
1223
1224 int
zap_lookup_norm_by_dnode(dnode_t * dn,const char * name,uint64_t integer_size,uint64_t num_integers,void * buf,matchtype_t mt,char * realname,int rn_len,boolean_t * ncp)1225 zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
1226 uint64_t integer_size, uint64_t num_integers, void *buf,
1227 matchtype_t mt, char *realname, int rn_len,
1228 boolean_t *ncp)
1229 {
1230 zap_t *zap;
1231
1232 int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
1233 FTAG, &zap);
1234 if (err != 0)
1235 return (err);
1236 err = zap_lookup_impl(zap, name, integer_size,
1237 num_integers, buf, mt, realname, rn_len, ncp);
1238 zap_unlockdir(zap, FTAG);
1239 return (err);
1240 }
1241
1242 static int
zap_prefetch_uint64_impl(zap_t * zap,const uint64_t * key,int key_numints)1243 zap_prefetch_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints)
1244 {
1245 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1246 if (zn == NULL) {
1247 zap_unlockdir(zap, FTAG);
1248 return (SET_ERROR(ENOTSUP));
1249 }
1250
1251 fzap_prefetch(zn);
1252 zap_name_free(zn);
1253 zap_unlockdir(zap, FTAG);
1254 return (0);
1255 }
1256
1257 int
zap_prefetch_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints)1258 zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1259 int key_numints)
1260 {
1261 zap_t *zap;
1262
1263 int err =
1264 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1265 if (err != 0)
1266 return (err);
1267 err = zap_prefetch_uint64_impl(zap, key, key_numints);
1268 /* zap_prefetch_uint64_impl() calls zap_unlockdir() */
1269 return (err);
1270 }
1271
1272 int
zap_prefetch_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints)1273 zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints)
1274 {
1275 zap_t *zap;
1276
1277 int err =
1278 zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1279 if (err != 0)
1280 return (err);
1281 err = zap_prefetch_uint64_impl(zap, key, key_numints);
1282 /* zap_prefetch_uint64_impl() calls zap_unlockdir() */
1283 return (err);
1284 }
1285
1286 static int
zap_lookup_uint64_impl(zap_t * zap,const uint64_t * key,int key_numints,uint64_t integer_size,uint64_t num_integers,void * buf)1287 zap_lookup_uint64_impl(zap_t *zap, const uint64_t *key,
1288 int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
1289 {
1290 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1291 if (zn == NULL) {
1292 zap_unlockdir(zap, FTAG);
1293 return (SET_ERROR(ENOTSUP));
1294 }
1295
1296 int err = fzap_lookup(zn, integer_size, num_integers, buf,
1297 NULL, 0, NULL);
1298 zap_name_free(zn);
1299 zap_unlockdir(zap, FTAG);
1300 return (err);
1301 }
1302
1303 int
zap_lookup_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,uint64_t integer_size,uint64_t num_integers,void * buf)1304 zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1305 int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
1306 {
1307 zap_t *zap;
1308
1309 int err =
1310 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1311 if (err != 0)
1312 return (err);
1313 err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size,
1314 num_integers, buf);
1315 /* zap_lookup_uint64_impl() calls zap_unlockdir() */
1316 return (err);
1317 }
1318
1319 int
zap_lookup_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints,uint64_t integer_size,uint64_t num_integers,void * buf)1320 zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
1321 int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
1322 {
1323 zap_t *zap;
1324
1325 int err =
1326 zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1327 if (err != 0)
1328 return (err);
1329 err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size,
1330 num_integers, buf);
1331 /* zap_lookup_uint64_impl() calls zap_unlockdir() */
1332 return (err);
1333 }
1334
1335 int
zap_contains(objset_t * os,uint64_t zapobj,const char * name)1336 zap_contains(objset_t *os, uint64_t zapobj, const char *name)
1337 {
1338 int err = zap_lookup_norm(os, zapobj, name, 0,
1339 0, NULL, 0, NULL, 0, NULL);
1340 if (err == EOVERFLOW || err == EINVAL)
1341 err = 0; /* found, but skipped reading the value */
1342 return (err);
1343 }
1344
1345 int
zap_length(objset_t * os,uint64_t zapobj,const char * name,uint64_t * integer_size,uint64_t * num_integers)1346 zap_length(objset_t *os, uint64_t zapobj, const char *name,
1347 uint64_t *integer_size, uint64_t *num_integers)
1348 {
1349 zap_t *zap;
1350
1351 int err =
1352 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1353 if (err != 0)
1354 return (err);
1355 zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
1356 if (zn == NULL) {
1357 zap_unlockdir(zap, FTAG);
1358 return (SET_ERROR(ENOTSUP));
1359 }
1360 if (!zap->zap_ismicro) {
1361 err = fzap_length(zn, integer_size, num_integers);
1362 } else {
1363 zfs_btree_index_t idx;
1364 mzap_ent_t *mze = mze_find(zn, &idx);
1365 if (mze == NULL) {
1366 err = SET_ERROR(ENOENT);
1367 } else {
1368 if (integer_size)
1369 *integer_size = 8;
1370 if (num_integers)
1371 *num_integers = 1;
1372 }
1373 }
1374 zap_name_free(zn);
1375 zap_unlockdir(zap, FTAG);
1376 return (err);
1377 }
1378
1379 int
zap_length_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,uint64_t * integer_size,uint64_t * num_integers)1380 zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1381 int key_numints, uint64_t *integer_size, uint64_t *num_integers)
1382 {
1383 zap_t *zap;
1384
1385 int err =
1386 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1387 if (err != 0)
1388 return (err);
1389 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1390 if (zn == NULL) {
1391 zap_unlockdir(zap, FTAG);
1392 return (SET_ERROR(ENOTSUP));
1393 }
1394 err = fzap_length(zn, integer_size, num_integers);
1395 zap_name_free(zn);
1396 zap_unlockdir(zap, FTAG);
1397 return (err);
1398 }
1399
1400 static void
mzap_addent(zap_name_t * zn,uint64_t value)1401 mzap_addent(zap_name_t *zn, uint64_t value)
1402 {
1403 zap_t *zap = zn->zn_zap;
1404 uint16_t start = zap->zap_m.zap_alloc_next;
1405
1406 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
1407
1408 #ifdef ZFS_DEBUG
1409 for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
1410 mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
1411 ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
1412 }
1413 #endif
1414
1415 uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash);
1416 /* given the limited size of the microzap, this can't happen */
1417 ASSERT(cd < zap_maxcd(zap));
1418
1419 again:
1420 for (uint16_t i = start; i < zap->zap_m.zap_num_chunks; i++) {
1421 mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
1422 if (mze->mze_name[0] == 0) {
1423 mze->mze_value = value;
1424 mze->mze_cd = cd;
1425 (void) strlcpy(mze->mze_name, zn->zn_key_orig,
1426 sizeof (mze->mze_name));
1427 zap->zap_m.zap_num_entries++;
1428 zap->zap_m.zap_alloc_next = i+1;
1429 if (zap->zap_m.zap_alloc_next ==
1430 zap->zap_m.zap_num_chunks)
1431 zap->zap_m.zap_alloc_next = 0;
1432 mze_insert(zap, i, zn->zn_hash);
1433 return;
1434 }
1435 }
1436 if (start != 0) {
1437 start = 0;
1438 goto again;
1439 }
1440 cmn_err(CE_PANIC, "out of entries!");
1441 }
1442
1443 static int
zap_add_impl(zap_t * zap,const char * key,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx,const void * tag)1444 zap_add_impl(zap_t *zap, const char *key,
1445 int integer_size, uint64_t num_integers,
1446 const void *val, dmu_tx_t *tx, const void *tag)
1447 {
1448 const uint64_t *intval = val;
1449 int err = 0;
1450
1451 zap_name_t *zn = zap_name_alloc_str(zap, key, 0);
1452 if (zn == NULL) {
1453 zap_unlockdir(zap, tag);
1454 return (SET_ERROR(ENOTSUP));
1455 }
1456 if (!zap->zap_ismicro) {
1457 err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
1458 zap = zn->zn_zap; /* fzap_add() may change zap */
1459 } else if (integer_size != 8 || num_integers != 1 ||
1460 strlen(key) >= MZAP_NAME_LEN ||
1461 !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
1462 err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
1463 if (err == 0) {
1464 err = fzap_add(zn, integer_size, num_integers, val,
1465 tag, tx);
1466 }
1467 zap = zn->zn_zap; /* fzap_add() may change zap */
1468 } else {
1469 zfs_btree_index_t idx;
1470 if (mze_find(zn, &idx) != NULL) {
1471 err = SET_ERROR(EEXIST);
1472 } else {
1473 mzap_addent(zn, *intval);
1474 }
1475 }
1476 ASSERT(zap == zn->zn_zap);
1477 zap_name_free(zn);
1478 if (zap != NULL) /* may be NULL if fzap_add() failed */
1479 zap_unlockdir(zap, tag);
1480 return (err);
1481 }
1482
1483 int
zap_add(objset_t * os,uint64_t zapobj,const char * key,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1484 zap_add(objset_t *os, uint64_t zapobj, const char *key,
1485 int integer_size, uint64_t num_integers,
1486 const void *val, dmu_tx_t *tx)
1487 {
1488 zap_t *zap;
1489 int err;
1490
1491 err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1492 if (err != 0)
1493 return (err);
1494 err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
1495 /* zap_add_impl() calls zap_unlockdir() */
1496 return (err);
1497 }
1498
1499 int
zap_add_by_dnode(dnode_t * dn,const char * key,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1500 zap_add_by_dnode(dnode_t *dn, const char *key,
1501 int integer_size, uint64_t num_integers,
1502 const void *val, dmu_tx_t *tx)
1503 {
1504 zap_t *zap;
1505 int err;
1506
1507 err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1508 if (err != 0)
1509 return (err);
1510 err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
1511 /* zap_add_impl() calls zap_unlockdir() */
1512 return (err);
1513 }
1514
1515 static int
zap_add_uint64_impl(zap_t * zap,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx,const void * tag)1516 zap_add_uint64_impl(zap_t *zap, const uint64_t *key,
1517 int key_numints, int integer_size, uint64_t num_integers,
1518 const void *val, dmu_tx_t *tx, const void *tag)
1519 {
1520 int err;
1521
1522 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1523 if (zn == NULL) {
1524 zap_unlockdir(zap, tag);
1525 return (SET_ERROR(ENOTSUP));
1526 }
1527 err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
1528 zap = zn->zn_zap; /* fzap_add() may change zap */
1529 zap_name_free(zn);
1530 if (zap != NULL) /* may be NULL if fzap_add() failed */
1531 zap_unlockdir(zap, tag);
1532 return (err);
1533 }
1534
1535 int
zap_add_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1536 zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1537 int key_numints, int integer_size, uint64_t num_integers,
1538 const void *val, dmu_tx_t *tx)
1539 {
1540 zap_t *zap;
1541
1542 int err =
1543 zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1544 if (err != 0)
1545 return (err);
1546 err = zap_add_uint64_impl(zap, key, key_numints,
1547 integer_size, num_integers, val, tx, FTAG);
1548 /* zap_add_uint64_impl() calls zap_unlockdir() */
1549 return (err);
1550 }
1551
1552 int
zap_add_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1553 zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
1554 int key_numints, int integer_size, uint64_t num_integers,
1555 const void *val, dmu_tx_t *tx)
1556 {
1557 zap_t *zap;
1558
1559 int err =
1560 zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1561 if (err != 0)
1562 return (err);
1563 err = zap_add_uint64_impl(zap, key, key_numints,
1564 integer_size, num_integers, val, tx, FTAG);
1565 /* zap_add_uint64_impl() calls zap_unlockdir() */
1566 return (err);
1567 }
1568
1569 int
zap_update(objset_t * os,uint64_t zapobj,const char * name,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1570 zap_update(objset_t *os, uint64_t zapobj, const char *name,
1571 int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
1572 {
1573 zap_t *zap;
1574 const uint64_t *intval = val;
1575
1576 int err =
1577 zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1578 if (err != 0)
1579 return (err);
1580 zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
1581 if (zn == NULL) {
1582 zap_unlockdir(zap, FTAG);
1583 return (SET_ERROR(ENOTSUP));
1584 }
1585 if (!zap->zap_ismicro) {
1586 err = fzap_update(zn, integer_size, num_integers, val,
1587 FTAG, tx);
1588 zap = zn->zn_zap; /* fzap_update() may change zap */
1589 } else if (integer_size != 8 || num_integers != 1 ||
1590 strlen(name) >= MZAP_NAME_LEN) {
1591 dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
1592 (u_longlong_t)zapobj, integer_size,
1593 (u_longlong_t)num_integers, name);
1594 err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
1595 if (err == 0) {
1596 err = fzap_update(zn, integer_size, num_integers,
1597 val, FTAG, tx);
1598 }
1599 zap = zn->zn_zap; /* fzap_update() may change zap */
1600 } else {
1601 zfs_btree_index_t idx;
1602 mzap_ent_t *mze = mze_find(zn, &idx);
1603 if (mze != NULL) {
1604 MZE_PHYS(zap, mze)->mze_value = *intval;
1605 } else {
1606 mzap_addent(zn, *intval);
1607 }
1608 }
1609 ASSERT(zap == zn->zn_zap);
1610 zap_name_free(zn);
1611 if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
1612 zap_unlockdir(zap, FTAG);
1613 return (err);
1614 }
1615
1616 static int
zap_update_uint64_impl(zap_t * zap,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx,const void * tag)1617 zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
1618 int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx,
1619 const void *tag)
1620 {
1621 int err;
1622
1623 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1624 if (zn == NULL) {
1625 zap_unlockdir(zap, tag);
1626 return (SET_ERROR(ENOTSUP));
1627 }
1628 err = fzap_update(zn, integer_size, num_integers, val, tag, tx);
1629 zap = zn->zn_zap; /* fzap_update() may change zap */
1630 zap_name_free(zn);
1631 if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
1632 zap_unlockdir(zap, tag);
1633 return (err);
1634 }
1635
1636 int
zap_update_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1637 zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1638 int key_numints, int integer_size, uint64_t num_integers, const void *val,
1639 dmu_tx_t *tx)
1640 {
1641 zap_t *zap;
1642
1643 int err =
1644 zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1645 if (err != 0)
1646 return (err);
1647 err = zap_update_uint64_impl(zap, key, key_numints,
1648 integer_size, num_integers, val, tx, FTAG);
1649 /* zap_update_uint64_impl() calls zap_unlockdir() */
1650 return (err);
1651 }
1652
1653 int
zap_update_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1654 zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
1655 int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
1656 {
1657 zap_t *zap;
1658
1659 int err =
1660 zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1661 if (err != 0)
1662 return (err);
1663 err = zap_update_uint64_impl(zap, key, key_numints,
1664 integer_size, num_integers, val, tx, FTAG);
1665 /* zap_update_uint64_impl() calls zap_unlockdir() */
1666 return (err);
1667 }
1668
1669 int
zap_remove(objset_t * os,uint64_t zapobj,const char * name,dmu_tx_t * tx)1670 zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
1671 {
1672 return (zap_remove_norm(os, zapobj, name, 0, tx));
1673 }
1674
1675 static int
zap_remove_impl(zap_t * zap,const char * name,matchtype_t mt,dmu_tx_t * tx)1676 zap_remove_impl(zap_t *zap, const char *name,
1677 matchtype_t mt, dmu_tx_t *tx)
1678 {
1679 int err = 0;
1680
1681 zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
1682 if (zn == NULL)
1683 return (SET_ERROR(ENOTSUP));
1684 if (!zap->zap_ismicro) {
1685 err = fzap_remove(zn, tx);
1686 } else {
1687 zfs_btree_index_t idx;
1688 mzap_ent_t *mze = mze_find(zn, &idx);
1689 if (mze == NULL) {
1690 err = SET_ERROR(ENOENT);
1691 } else {
1692 zap->zap_m.zap_num_entries--;
1693 memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t));
1694 zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx);
1695 }
1696 }
1697 zap_name_free(zn);
1698 return (err);
1699 }
1700
1701 int
zap_remove_norm(objset_t * os,uint64_t zapobj,const char * name,matchtype_t mt,dmu_tx_t * tx)1702 zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
1703 matchtype_t mt, dmu_tx_t *tx)
1704 {
1705 zap_t *zap;
1706 int err;
1707
1708 err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1709 if (err)
1710 return (err);
1711 err = zap_remove_impl(zap, name, mt, tx);
1712 zap_unlockdir(zap, FTAG);
1713 return (err);
1714 }
1715
1716 int
zap_remove_by_dnode(dnode_t * dn,const char * name,dmu_tx_t * tx)1717 zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
1718 {
1719 zap_t *zap;
1720 int err;
1721
1722 err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1723 if (err)
1724 return (err);
1725 err = zap_remove_impl(zap, name, 0, tx);
1726 zap_unlockdir(zap, FTAG);
1727 return (err);
1728 }
1729
1730 static int
zap_remove_uint64_impl(zap_t * zap,const uint64_t * key,int key_numints,dmu_tx_t * tx,const void * tag)1731 zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
1732 dmu_tx_t *tx, const void *tag)
1733 {
1734 int err;
1735
1736 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1737 if (zn == NULL) {
1738 zap_unlockdir(zap, tag);
1739 return (SET_ERROR(ENOTSUP));
1740 }
1741 err = fzap_remove(zn, tx);
1742 zap_name_free(zn);
1743 zap_unlockdir(zap, tag);
1744 return (err);
1745 }
1746
1747 int
zap_remove_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,dmu_tx_t * tx)1748 zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1749 int key_numints, dmu_tx_t *tx)
1750 {
1751 zap_t *zap;
1752
1753 int err =
1754 zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1755 if (err != 0)
1756 return (err);
1757 err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
1758 /* zap_remove_uint64_impl() calls zap_unlockdir() */
1759 return (err);
1760 }
1761
1762 int
zap_remove_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints,dmu_tx_t * tx)1763 zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
1764 dmu_tx_t *tx)
1765 {
1766 zap_t *zap;
1767
1768 int err =
1769 zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1770 if (err != 0)
1771 return (err);
1772 err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
1773 /* zap_remove_uint64_impl() calls zap_unlockdir() */
1774 return (err);
1775 }
1776
1777
1778 static zap_attribute_t *
zap_attribute_alloc_impl(boolean_t longname)1779 zap_attribute_alloc_impl(boolean_t longname)
1780 {
1781 zap_attribute_t *za;
1782
1783 za = kmem_cache_alloc((longname)? zap_attr_long_cache : zap_attr_cache,
1784 KM_SLEEP);
1785 za->za_name_len = (longname)? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN;
1786 return (za);
1787 }
1788
1789 zap_attribute_t *
zap_attribute_alloc(void)1790 zap_attribute_alloc(void)
1791 {
1792 return (zap_attribute_alloc_impl(B_FALSE));
1793 }
1794
1795 zap_attribute_t *
zap_attribute_long_alloc(void)1796 zap_attribute_long_alloc(void)
1797 {
1798 return (zap_attribute_alloc_impl(B_TRUE));
1799 }
1800
1801 void
zap_attribute_free(zap_attribute_t * za)1802 zap_attribute_free(zap_attribute_t *za)
1803 {
1804 if (za->za_name_len == ZAP_MAXNAMELEN) {
1805 kmem_cache_free(zap_attr_cache, za);
1806 } else {
1807 ASSERT3U(za->za_name_len, ==, ZAP_MAXNAMELEN_NEW);
1808 kmem_cache_free(zap_attr_long_cache, za);
1809 }
1810 }
1811
1812 /*
1813 * Routines for iterating over the attributes.
1814 */
1815
1816 static void
zap_cursor_init_impl(zap_cursor_t * zc,objset_t * os,uint64_t zapobj,uint64_t serialized,boolean_t prefetch)1817 zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
1818 uint64_t serialized, boolean_t prefetch)
1819 {
1820 zc->zc_objset = os;
1821 zc->zc_zap = NULL;
1822 zc->zc_leaf = NULL;
1823 zc->zc_zapobj = zapobj;
1824 zc->zc_serialized = serialized;
1825 zc->zc_hash = 0;
1826 zc->zc_cd = 0;
1827 zc->zc_prefetch = prefetch;
1828 }
1829 void
zap_cursor_init_serialized(zap_cursor_t * zc,objset_t * os,uint64_t zapobj,uint64_t serialized)1830 zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
1831 uint64_t serialized)
1832 {
1833 zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
1834 }
1835
1836 /*
1837 * Initialize a cursor at the beginning of the ZAP object. The entire
1838 * ZAP object will be prefetched.
1839 */
1840 void
zap_cursor_init(zap_cursor_t * zc,objset_t * os,uint64_t zapobj)1841 zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
1842 {
1843 zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
1844 }
1845
1846 /*
1847 * Initialize a cursor at the beginning, but request that we not prefetch
1848 * the entire ZAP object.
1849 */
1850 void
zap_cursor_init_noprefetch(zap_cursor_t * zc,objset_t * os,uint64_t zapobj)1851 zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
1852 {
1853 zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
1854 }
1855
1856 void
zap_cursor_fini(zap_cursor_t * zc)1857 zap_cursor_fini(zap_cursor_t *zc)
1858 {
1859 if (zc->zc_zap) {
1860 rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
1861 zap_unlockdir(zc->zc_zap, NULL);
1862 zc->zc_zap = NULL;
1863 }
1864 if (zc->zc_leaf) {
1865 rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
1866 zap_put_leaf(zc->zc_leaf);
1867 zc->zc_leaf = NULL;
1868 }
1869 zc->zc_objset = NULL;
1870 }
1871
1872 uint64_t
zap_cursor_serialize(zap_cursor_t * zc)1873 zap_cursor_serialize(zap_cursor_t *zc)
1874 {
1875 if (zc->zc_hash == -1ULL)
1876 return (-1ULL);
1877 if (zc->zc_zap == NULL)
1878 return (zc->zc_serialized);
1879 ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
1880 ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
1881
1882 /*
1883 * We want to keep the high 32 bits of the cursor zero if we can, so
1884 * that 32-bit programs can access this. So usually use a small
1885 * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
1886 * of the cursor.
1887 *
1888 * [ collision differentiator | zap_hashbits()-bit hash value ]
1889 */
1890 return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
1891 ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
1892 }
1893
1894 int
zap_cursor_retrieve(zap_cursor_t * zc,zap_attribute_t * za)1895 zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
1896 {
1897 int err;
1898
1899 if (zc->zc_hash == -1ULL)
1900 return (SET_ERROR(ENOENT));
1901
1902 if (zc->zc_zap == NULL) {
1903 int hb;
1904 err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
1905 RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
1906 if (err != 0)
1907 return (err);
1908
1909 /*
1910 * To support zap_cursor_init_serialized, advance, retrieve,
1911 * we must add to the existing zc_cd, which may already
1912 * be 1 due to the zap_cursor_advance.
1913 */
1914 ASSERT(zc->zc_hash == 0);
1915 hb = zap_hashbits(zc->zc_zap);
1916 zc->zc_hash = zc->zc_serialized << (64 - hb);
1917 zc->zc_cd += zc->zc_serialized >> hb;
1918 if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
1919 zc->zc_cd = 0;
1920 } else {
1921 rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
1922 }
1923 if (!zc->zc_zap->zap_ismicro) {
1924 err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
1925 } else {
1926 zfs_btree_index_t idx;
1927 mzap_ent_t mze_tofind;
1928
1929 mze_tofind.mze_hash = zc->zc_hash >> 32;
1930 mze_tofind.mze_cd = zc->zc_cd;
1931
1932 mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree,
1933 &mze_tofind, &idx);
1934 if (mze == NULL) {
1935 mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree,
1936 &idx, &idx);
1937 }
1938 if (mze) {
1939 mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
1940 ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
1941 za->za_normalization_conflict =
1942 mzap_normalization_conflict(zc->zc_zap, NULL,
1943 mze, &idx);
1944 za->za_integer_length = 8;
1945 za->za_num_integers = 1;
1946 za->za_first_integer = mzep->mze_value;
1947 (void) strlcpy(za->za_name, mzep->mze_name,
1948 za->za_name_len);
1949 zc->zc_hash = (uint64_t)mze->mze_hash << 32;
1950 zc->zc_cd = mze->mze_cd;
1951 err = 0;
1952 } else {
1953 zc->zc_hash = -1ULL;
1954 err = SET_ERROR(ENOENT);
1955 }
1956 }
1957 rw_exit(&zc->zc_zap->zap_rwlock);
1958 return (err);
1959 }
1960
1961 void
zap_cursor_advance(zap_cursor_t * zc)1962 zap_cursor_advance(zap_cursor_t *zc)
1963 {
1964 if (zc->zc_hash == -1ULL)
1965 return;
1966 zc->zc_cd++;
1967 }
1968
1969 int
zap_get_stats(objset_t * os,uint64_t zapobj,zap_stats_t * zs)1970 zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
1971 {
1972 zap_t *zap;
1973
1974 int err =
1975 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1976 if (err != 0)
1977 return (err);
1978
1979 memset(zs, 0, sizeof (zap_stats_t));
1980
1981 if (zap->zap_ismicro) {
1982 zs->zs_blocksize = zap->zap_dbuf->db_size;
1983 zs->zs_num_entries = zap->zap_m.zap_num_entries;
1984 zs->zs_num_blocks = 1;
1985 } else {
1986 fzap_get_stats(zap, zs);
1987 }
1988 zap_unlockdir(zap, FTAG);
1989 return (0);
1990 }
1991
#if defined(_KERNEL)
/*
 * ZAP entry points exported via EXPORT_SYMBOL.  This list is only compiled
 * when _KERNEL is defined.
 */
EXPORT_SYMBOL(zap_create);
EXPORT_SYMBOL(zap_create_dnsize);
EXPORT_SYMBOL(zap_create_norm);
EXPORT_SYMBOL(zap_create_norm_dnsize);
EXPORT_SYMBOL(zap_create_flags);
EXPORT_SYMBOL(zap_create_flags_dnsize);
EXPORT_SYMBOL(zap_create_claim);
EXPORT_SYMBOL(zap_create_claim_norm);
EXPORT_SYMBOL(zap_create_claim_norm_dnsize);
EXPORT_SYMBOL(zap_create_hold);
EXPORT_SYMBOL(zap_destroy);
EXPORT_SYMBOL(zap_lookup);
EXPORT_SYMBOL(zap_lookup_by_dnode);
EXPORT_SYMBOL(zap_lookup_norm);
EXPORT_SYMBOL(zap_lookup_uint64);
EXPORT_SYMBOL(zap_contains);
EXPORT_SYMBOL(zap_prefetch);
EXPORT_SYMBOL(zap_prefetch_uint64);
EXPORT_SYMBOL(zap_prefetch_object);
EXPORT_SYMBOL(zap_add);
EXPORT_SYMBOL(zap_add_by_dnode);
EXPORT_SYMBOL(zap_add_uint64);
EXPORT_SYMBOL(zap_add_uint64_by_dnode);
EXPORT_SYMBOL(zap_update);
EXPORT_SYMBOL(zap_update_uint64);
EXPORT_SYMBOL(zap_update_uint64_by_dnode);
EXPORT_SYMBOL(zap_length);
EXPORT_SYMBOL(zap_length_uint64);
EXPORT_SYMBOL(zap_remove);
EXPORT_SYMBOL(zap_remove_by_dnode);
EXPORT_SYMBOL(zap_remove_norm);
EXPORT_SYMBOL(zap_remove_uint64);
EXPORT_SYMBOL(zap_remove_uint64_by_dnode);
EXPORT_SYMBOL(zap_count);
EXPORT_SYMBOL(zap_value_search);
EXPORT_SYMBOL(zap_join);
EXPORT_SYMBOL(zap_join_increment);
EXPORT_SYMBOL(zap_add_int);
EXPORT_SYMBOL(zap_remove_int);
EXPORT_SYMBOL(zap_lookup_int);
EXPORT_SYMBOL(zap_increment_int);
EXPORT_SYMBOL(zap_add_int_key);
EXPORT_SYMBOL(zap_lookup_int_key);
EXPORT_SYMBOL(zap_increment);
EXPORT_SYMBOL(zap_cursor_init);
EXPORT_SYMBOL(zap_cursor_fini);
EXPORT_SYMBOL(zap_cursor_retrieve);
EXPORT_SYMBOL(zap_cursor_advance);
EXPORT_SYMBOL(zap_cursor_serialize);
EXPORT_SYMBOL(zap_cursor_init_serialized);
EXPORT_SYMBOL(zap_get_stats);

/*
 * Register zap_micro_max_size (the microzap-to-fatzap conversion threshold,
 * in bytes) as an INT module parameter with ZMOD_RW access.
 */
ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW,
	"Maximum micro ZAP size before converting to a fat ZAP, "
	"in bytes (max 1M)");
#endif
2049