1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23 /*
24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
27 * Copyright 2017 Nexenta Systems, Inc.
28 * Copyright (c) 2024, Klara, Inc.
29 */
30
31 #include <sys/zio.h>
32 #include <sys/spa.h>
33 #include <sys/dmu.h>
34 #include <sys/zfs_context.h>
35 #include <sys/zap.h>
36 #include <sys/zap_impl.h>
37 #include <sys/zap_leaf.h>
38 #include <sys/btree.h>
39 #include <sys/arc.h>
40 #include <sys/dmu_objset.h>
41 #include <sys/spa_impl.h>
42
43 #ifdef _KERNEL
44 #include <sys/sunddi.h>
45 #endif
46
47 /*
48 * The maximum size (in bytes) of a microzap before it is converted to a
49 * fatzap. It will be rounded up to next multiple of 512 (SPA_MINBLOCKSIZE).
50 *
51 * By definition, a microzap must fit into a single block, so this has
52 * traditionally been SPA_OLD_MAXBLOCKSIZE, and is set to that by default.
53 * Setting this higher requires both the large_blocks feature (to even create
54 * blocks that large) and the large_microzap feature (to enable the stream
55 * machinery to understand not to try to split a microzap block).
56 *
57 * If large_microzap is enabled, this value will be clamped to
58 * spa_maxblocksize(), up to 1M. If not, it will be clamped to
59 * SPA_OLD_MAXBLOCKSIZE.
60 */
61 static int zap_micro_max_size = SPA_OLD_MAXBLOCKSIZE;
62
63 /*
64 * The 1M upper limit is necessary because the count of chunks in a microzap
65 * block is stored as a uint16_t (mze_chunkid). Each chunk is 64 bytes, and the
66 * first is used to store a header, so there are 32767 usable chunks, which is
67 * just under 2M. 1M is the largest power-2-rounded block size under 2M, so we
68 * must set the limit there.
69 */
70 #define MZAP_MAX_SIZE (1048576)
71
/*
 * Return the effective microzap size limit (bytes) for this pool: the
 * zap_micro_max_size tunable rounded up to SPA_MINBLOCKSIZE and capped at
 * MZAP_MAX_SIZE; values above SPA_OLD_MAXBLOCKSIZE are only honored (up to
 * spa_maxblocksize()) when the large_microzap feature is enabled.
 */
uint64_t
zap_get_micro_max_size(spa_t *spa)
{
	uint64_t maxsz = MIN(MZAP_MAX_SIZE,
	    P2ROUNDUP(zap_micro_max_size, SPA_MINBLOCKSIZE));
	if (maxsz <= SPA_OLD_MAXBLOCKSIZE)
		return (maxsz);
	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_MICROZAP))
		return (MIN(maxsz, spa_maxblocksize(spa)));
	return (SPA_OLD_MAXBLOCKSIZE);
}
83
84 static int mzap_upgrade(zap_t **zapp,
85 const void *tag, dmu_tx_t *tx, zap_flags_t flags);
86
87 uint64_t
zap_getflags(zap_t * zap)88 zap_getflags(zap_t *zap)
89 {
90 if (zap->zap_ismicro)
91 return (0);
92 return (zap_f_phys(zap)->zap_flags);
93 }
94
95 int
zap_hashbits(zap_t * zap)96 zap_hashbits(zap_t *zap)
97 {
98 if (zap_getflags(zap) & ZAP_FLAG_HASH64)
99 return (48);
100 else
101 return (28);
102 }
103
104 uint32_t
zap_maxcd(zap_t * zap)105 zap_maxcd(zap_t *zap)
106 {
107 if (zap_getflags(zap) & ZAP_FLAG_HASH64)
108 return ((1<<16)-1);
109 else
110 return (-1U);
111 }
112
/*
 * Compute the hash for a zap name.  Pre-hashed keys are used verbatim;
 * otherwise a CRC64 seeded with the zap's per-object salt is computed
 * over the normalized key.  The low (64 - zap_hashbits()) bits are
 * cleared, since they are reserved for the collision differentiator.
 */
static uint64_t
zap_hash(zap_name_t *zn)
{
	zap_t *zap = zn->zn_zap;
	uint64_t h = 0;

	if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
		/* The caller supplied the hash value as the key itself. */
		ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
		h = *(uint64_t *)zn->zn_key_orig;
	} else {
		/* Seed with the salt so equal names hash differently per zap. */
		h = zap->zap_salt;
		ASSERT(h != 0);
		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

		if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
			const uint64_t *wp = zn->zn_key_norm;

			/* CRC64 over each byte of each 64-bit key word. */
			ASSERT(zn->zn_key_intlen == 8);
			for (int i = 0; i < zn->zn_key_norm_numints;
			    wp++, i++) {
				uint64_t word = *wp;

				for (int j = 0; j < 8; j++) {
					h = (h >> 8) ^
					    zfs_crc64_table[(h ^ word) & 0xFF];
					word >>= NBBY;
				}
			}
		} else {
			const uint8_t *cp = zn->zn_key_norm;

			/*
			 * We previously stored the terminating null on
			 * disk, but didn't hash it, so we need to
			 * continue to not hash it.  (The
			 * zn_key_*_numints includes the terminating
			 * null for non-binary keys.)
			 */
			int len = zn->zn_key_norm_numints - 1;

			ASSERT(zn->zn_key_intlen == 1);
			for (int i = 0; i < len; cp++, i++) {
				h = (h >> 8) ^
				    zfs_crc64_table[(h ^ *cp) & 0xFF];
			}
		}
	}
	/*
	 * Don't use all 64 bits, since we need some in the cookie for
	 * the collision differentiator.  We MUST use the high bits,
	 * since those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);

	return (h);
}
170
/*
 * Normalize name into namenorm (a buffer of outlen bytes) according to
 * normflags (Unicode form and/or case folding).  Returns 0 on success,
 * or the nonzero error reported by u8_textprep_str() on failure.
 * Only valid for string-keyed zaps.
 */
static int
zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags,
    size_t outlen)
{
	ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));

	/* Include the terminating NUL in the input length. */
	size_t inlen = strlen(name) + 1;

	int err = 0;
	(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
	    normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
	    U8_UNICODE_LATEST, &err);

	return (err);
}
186
/*
 * Test whether matchname refers to the same name as zn, honoring
 * zn_matchtype: for MT_NORMALIZE matches the candidate is normalized
 * first and compared against zn_key_norm; otherwise a plain strcmp()
 * against the original key is used.
 */
boolean_t
zap_match(zap_name_t *zn, const char *matchname)
{
	boolean_t res = B_FALSE;
	ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));

	if (zn->zn_matchtype & MT_NORMALIZE) {
		size_t namelen = zn->zn_normbuf_len;
		char normbuf[ZAP_MAXNAMELEN];
		char *norm = normbuf;

		/*
		 * Long names cannot be normalized into the on-stack
		 * buffer (doing so would exceed the 1024-byte stack
		 * frame limit), so allocate a temporary buffer instead.
		 */
		if (namelen > ZAP_MAXNAMELEN)
			norm = kmem_alloc(namelen, KM_SLEEP);

		if (zap_normalize(zn->zn_zap, matchname, norm,
		    zn->zn_normflags, namelen) != 0) {
			/* A name that cannot be normalized cannot match. */
			res = B_FALSE;
		} else {
			res = (strcmp(zn->zn_key_norm, norm) == 0);
		}
		if (norm != normbuf)
			kmem_free(norm, namelen);
	} else {
		res = (strcmp(zn->zn_key_orig, matchname) == 0);
	}
	return (res);
}
218
/*
 * kmem caches for zap_name_t and zap_attribute_t allocations.  The
 * "long" variants include a larger trailing name buffer
 * (ZAP_MAXNAMELEN_NEW rather than ZAP_MAXNAMELEN) for long names.
 */
static kmem_cache_t *zap_name_cache;
static kmem_cache_t *zap_attr_cache;
static kmem_cache_t *zap_name_long_cache;
static kmem_cache_t *zap_attr_long_cache;
223
/*
 * Create the kmem caches used by the zap code.  Each object is sized
 * to hold the structure plus its trailing name buffer.
 */
void
zap_init(void)
{
	zap_name_cache = kmem_cache_create("zap_name",
	    sizeof (zap_name_t) + ZAP_MAXNAMELEN, 0, NULL, NULL,
	    NULL, NULL, NULL, 0);

	zap_attr_cache = kmem_cache_create("zap_attr_cache",
	    sizeof (zap_attribute_t) + ZAP_MAXNAMELEN, 0, NULL,
	    NULL, NULL, NULL, NULL, 0);

	zap_name_long_cache = kmem_cache_create("zap_name_long",
	    sizeof (zap_name_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, NULL,
	    NULL, NULL, NULL, 0);

	zap_attr_long_cache = kmem_cache_create("zap_attr_long_cache",
	    sizeof (zap_attribute_t) + ZAP_MAXNAMELEN_NEW, 0, NULL,
	    NULL, NULL, NULL, NULL, 0);
}
243
/*
 * Destroy the kmem caches created by zap_init().
 */
void
zap_fini(void)
{
	kmem_cache_destroy(zap_name_cache);
	kmem_cache_destroy(zap_attr_cache);
	kmem_cache_destroy(zap_name_long_cache);
	kmem_cache_destroy(zap_attr_long_cache);
}
252
253 static zap_name_t *
zap_name_alloc(zap_t * zap,boolean_t longname)254 zap_name_alloc(zap_t *zap, boolean_t longname)
255 {
256 kmem_cache_t *cache = longname ? zap_name_long_cache : zap_name_cache;
257 zap_name_t *zn = kmem_cache_alloc(cache, KM_SLEEP);
258
259 zn->zn_zap = zap;
260 zn->zn_normbuf_len = longname ? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN;
261 return (zn);
262 }
263
264 void
zap_name_free(zap_name_t * zn)265 zap_name_free(zap_name_t *zn)
266 {
267 if (zn->zn_normbuf_len == ZAP_MAXNAMELEN) {
268 kmem_cache_free(zap_name_cache, zn);
269 } else {
270 ASSERT3U(zn->zn_normbuf_len, ==, ZAP_MAXNAMELEN_NEW);
271 kmem_cache_free(zap_name_long_cache, zn);
272 }
273 }
274
/*
 * Initialize an already-allocated zap_name_t from a string key: record
 * the original key, compute the normalized form (when the zap uses
 * normalization), and compute the hash.  Returns 0 on success, or
 * ENOTSUP if the key cannot be normalized, or if a matchtype was
 * requested on a zap without normalization flags.
 */
static int
zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt)
{
	zap_t *zap = zn->zn_zap;
	size_t key_len = strlen(key) + 1;

	/* Make sure zn is allocated for longname if key is long */
	IMPLY(key_len > ZAP_MAXNAMELEN,
	    zn->zn_normbuf_len == ZAP_MAXNAMELEN_NEW);

	zn->zn_key_intlen = sizeof (*key);
	zn->zn_key_orig = key;
	zn->zn_key_orig_numints = key_len;
	zn->zn_matchtype = mt;
	zn->zn_normflags = zap->zap_normflags;

	/*
	 * If we're dealing with a case sensitive lookup on a mixed or
	 * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
	 * will fold case to all caps overriding the lookup request.
	 */
	if (mt & MT_MATCH_CASE)
		zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;

	if (zap->zap_normflags) {
		/*
		 * We *must* use zap_normflags because this normalization is
		 * what the hash is computed from.
		 */
		if (zap_normalize(zap, key, zn->zn_normbuf,
		    zap->zap_normflags, zn->zn_normbuf_len) != 0)
			return (SET_ERROR(ENOTSUP));
		zn->zn_key_norm = zn->zn_normbuf;
		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
	} else {
		/* Matchtypes are meaningless without normalization. */
		if (mt != 0)
			return (SET_ERROR(ENOTSUP));
		zn->zn_key_norm = zn->zn_key_orig;
		zn->zn_key_norm_numints = zn->zn_key_orig_numints;
	}

	/* The hash is always computed over the zap_normflags form. */
	zn->zn_hash = zap_hash(zn);

	if (zap->zap_normflags != zn->zn_normflags) {
		/*
		 * We *must* use zn_normflags because this normalization is
		 * what the matching is based on.  (Not the hash!)
		 */
		if (zap_normalize(zap, key, zn->zn_normbuf,
		    zn->zn_normflags, zn->zn_normbuf_len) != 0)
			return (SET_ERROR(ENOTSUP));
		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
	}

	return (0);
}
331
332 zap_name_t *
zap_name_alloc_str(zap_t * zap,const char * key,matchtype_t mt)333 zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt)
334 {
335 size_t key_len = strlen(key) + 1;
336 zap_name_t *zn = zap_name_alloc(zap, (key_len > ZAP_MAXNAMELEN));
337 if (zap_name_init_str(zn, key, mt) != 0) {
338 zap_name_free(zn);
339 return (NULL);
340 }
341 return (zn);
342 }
343
/*
 * Allocate and initialize a zap_name_t for a binary (uint64 array)
 * key.  Binary-keyed zaps never normalize, so the normalized key
 * simply aliases the caller's key.
 */
static zap_name_t *
zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
{
	zap_name_t *zn = kmem_cache_alloc(zap_name_cache, KM_SLEEP);

	ASSERT0(zap->zap_normflags);
	zn->zn_zap = zap;
	zn->zn_key_intlen = sizeof (*key);
	zn->zn_key_orig = zn->zn_key_norm = key;
	zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
	zn->zn_matchtype = 0;
	zn->zn_normbuf_len = ZAP_MAXNAMELEN;

	zn->zn_hash = zap_hash(zn);
	return (zn);
}
360
361 static void
mzap_byteswap(mzap_phys_t * buf,size_t size)362 mzap_byteswap(mzap_phys_t *buf, size_t size)
363 {
364 buf->mz_block_type = BSWAP_64(buf->mz_block_type);
365 buf->mz_salt = BSWAP_64(buf->mz_salt);
366 buf->mz_normflags = BSWAP_64(buf->mz_normflags);
367 int max = (size / MZAP_ENT_LEN) - 1;
368 for (int i = 0; i < max; i++) {
369 buf->mz_chunk[i].mze_value =
370 BSWAP_64(buf->mz_chunk[i].mze_value);
371 buf->mz_chunk[i].mze_cd =
372 BSWAP_32(buf->mz_chunk[i].mze_cd);
373 }
374 }
375
376 void
zap_byteswap(void * buf,size_t size)377 zap_byteswap(void *buf, size_t size)
378 {
379 uint64_t block_type = *(uint64_t *)buf;
380
381 if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
382 /* ASSERT(magic == ZAP_LEAF_MAGIC); */
383 mzap_byteswap(buf, size);
384 } else {
385 fzap_byteswap(buf, size);
386 }
387 }
388
389 __attribute__((always_inline)) inline
390 static int
mze_compare(const void * arg1,const void * arg2)391 mze_compare(const void *arg1, const void *arg2)
392 {
393 const mzap_ent_t *mze1 = arg1;
394 const mzap_ent_t *mze2 = arg2;
395
396 return (TREE_CMP((uint64_t)(mze1->mze_hash) << 32 | mze1->mze_cd,
397 (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd));
398 }
399
ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf,mzap_ent_t,mze_compare)400 ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t,
401 mze_compare)
402
/*
 * Add an in-memory index entry for the on-disk microzap chunk at
 * chunkid.  The btree entry caches the upper 32 bits of the hash and
 * the collision differentiator (cd) from the phys chunk.  Caller must
 * hold the zap write lock.
 */
static void
mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash)
{
	mzap_ent_t mze;

	ASSERT(zap->zap_ismicro);
	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	mze.mze_chunkid = chunkid;
	/* Microzap hashes never set the low 32 bits; keep only the top. */
	ASSERT0(hash & 0xffffffff);
	mze.mze_hash = hash >> 32;
	ASSERT3U(MZE_PHYS(zap, &mze)->mze_cd, <=, 0xffff);
	mze.mze_cd = (uint16_t)MZE_PHYS(zap, &mze)->mze_cd;
	ASSERT(MZE_PHYS(zap, &mze)->mze_name[0] != 0);
	zfs_btree_add(&zap->zap_m.zap_tree, &mze);
}
419
/*
 * Look up zn in the microzap's in-memory btree.  Scan every entry with
 * the same 32-bit hash prefix and return the first whose on-disk name
 * matches (per zap_match()), or NULL if none does.  *idx is left at
 * the btree position of the scan.
 */
static mzap_ent_t *
mze_find(zap_name_t *zn, zfs_btree_index_t *idx)
{
	mzap_ent_t mze_tofind;
	mzap_ent_t *mze;
	zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree;

	ASSERT(zn->zn_zap->zap_ismicro);
	ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));

	ASSERT0(zn->zn_hash & 0xffffffff);
	mze_tofind.mze_hash = zn->zn_hash >> 32;
	mze_tofind.mze_cd = 0;

	/* cd 0 is the lowest possible key, so find() lands at the start. */
	mze = zfs_btree_find(tree, &mze_tofind, idx);
	if (mze == NULL)
		mze = zfs_btree_next(tree, idx, idx);
	for (; mze && mze->mze_hash == mze_tofind.mze_hash;
	    mze = zfs_btree_next(tree, idx, idx)) {
		ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
		if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
			return (mze);
	}

	return (NULL);
}
446
/*
 * Return the lowest collision differentiator (cd) not yet in use for
 * the given hash.  Entries are sorted by (hash, cd), so we walk the
 * colliding entries in order until we find a gap.
 */
static uint32_t
mze_find_unused_cd(zap_t *zap, uint64_t hash)
{
	mzap_ent_t mze_tofind;
	zfs_btree_index_t idx;
	zfs_btree_t *tree = &zap->zap_m.zap_tree;

	ASSERT(zap->zap_ismicro);
	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));

	ASSERT0(hash & 0xffffffff);
	hash >>= 32;
	mze_tofind.mze_hash = hash;
	mze_tofind.mze_cd = 0;

	uint32_t cd = 0;
	for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
	    mze && mze->mze_hash == hash;
	    mze = zfs_btree_next(tree, &idx, &idx)) {
		if (mze->mze_cd != cd)
			break;
		cd++;
	}

	return (cd);
}
473
/*
 * Each mzap entry requires at max : 4 chunks
 * 3 chunks for names + 1 chunk for value.
 */
#define	MZAP_ENT_CHUNKS	(1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \
	ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t)))

/*
 * Check if the current entry keeps the colliding entries under the fatzap leaf
 * size.  Counts the existing entries sharing this hash (plus the one being
 * added) and verifies their worst-case chunk usage fits in a default leaf.
 */
static boolean_t
mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
{
	zap_t *zap = zn->zn_zap;
	mzap_ent_t mze_tofind;
	zfs_btree_index_t idx;
	zfs_btree_t *tree = &zap->zap_m.zap_tree;
	uint32_t mzap_ents = 0;

	ASSERT0(hash & 0xffffffff);
	hash >>= 32;
	mze_tofind.mze_hash = hash;
	mze_tofind.mze_cd = 0;

	for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
	    mze && mze->mze_hash == hash;
	    mze = zfs_btree_next(tree, &idx, &idx)) {
		mzap_ents++;
	}

	/* Include the new entry being added */
	mzap_ents++;

	return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS));
}
510
/*
 * Release the in-memory btree that mirrors the microzap's entries.
 */
static void
mze_destroy(zap_t *zap)
{
	zfs_btree_clear(&zap->zap_m.zap_tree);
	zfs_btree_destroy(&zap->zap_m.zap_tree);
}
517
/*
 * Construct the in-memory zap_t for a held zap header dbuf and attach
 * it to the dbuf as user data.  For microzaps, also build the
 * in-memory btree of entries.  Returns the new zap_t on success; if
 * another thread attached one first, returns that existing "winner"
 * instead; returns NULL if the on-disk header is not a valid micro or
 * fat zap (corruption).
 */
static zap_t *
mzap_open(dmu_buf_t *db)
{
	zap_t *winner;
	uint64_t *zap_hdr = (uint64_t *)db->db_data;
	uint64_t zap_block_type = zap_hdr[0];
	uint64_t zap_magic = zap_hdr[1];

	ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));

	zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
	rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL);
	rw_enter(&zap->zap_rwlock, RW_WRITER);
	zap->zap_objset = dmu_buf_get_objset(db);
	zap->zap_object = db->db_object;
	zap->zap_dbuf = db;

	if (zap_block_type != ZBT_MICRO) {
		mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT,
		    0);
		zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
		if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) {
			winner = NULL;	/* No actual winner here... */
			goto handle_winner;
		}
	} else {
		zap->zap_ismicro = TRUE;
	}

	/*
	 * Make sure that zap_ismicro is set before we let others see
	 * it, because zap_lockdir() checks zap_ismicro without the lock
	 * held.
	 */
	dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
	winner = dmu_buf_set_user(db, &zap->zap_dbu);

	if (winner != NULL)
		goto handle_winner;

	if (zap->zap_ismicro) {
		zap->zap_salt = zap_m_phys(zap)->mz_salt;
		zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;

		/*
		 * Reduce B-tree leaf from 4KB to 512 bytes to reduce memmove()
		 * overhead on massive inserts below.  It still allows to store
		 * 62 entries before we have to add 2KB B-tree core node.
		 */
		zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare,
		    mze_find_in_buf, sizeof (mzap_ent_t), 512);

		/* Mirror every in-use chunk into the in-memory tree. */
		zap_name_t *zn = zap_name_alloc(zap, B_FALSE);
		for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) {
			mzap_ent_phys_t *mze =
			    &zap_m_phys(zap)->mz_chunk[i];
			if (mze->mze_name[0]) {
				zap->zap_m.zap_num_entries++;
				zap_name_init_str(zn, mze->mze_name, 0);
				mze_insert(zap, i, zn->zn_hash);
			}
		}
		zap_name_free(zn);
	} else {
		zap->zap_salt = zap_f_phys(zap)->zap_salt;
		zap->zap_normflags = zap_f_phys(zap)->zap_normflags;

		ASSERT3U(sizeof (struct zap_leaf_header), ==,
		    2*ZAP_LEAF_CHUNKSIZE);

		/*
		 * The embedded pointer table should not overlap the
		 * other members.
		 */
		ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
		    &zap_f_phys(zap)->zap_salt);

		/*
		 * The embedded pointer table should end at the end of
		 * the block
		 */
		ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
		    1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
		    (uintptr_t)zap_f_phys(zap), ==,
		    zap->zap_dbuf->db_size);
	}
	rw_exit(&zap->zap_rwlock);
	return (zap);

handle_winner:
	/* Lost the race (or found corruption): discard our zap_t. */
	rw_exit(&zap->zap_rwlock);
	rw_destroy(&zap->zap_rwlock);
	if (!zap->zap_ismicro)
		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
	kmem_free(zap, sizeof (zap_t));
	return (winner);
}
616
/*
 * This routine "consumes" the caller's hold on the dbuf, which must
 * have the specified tag.
 *
 * Finds or constructs the zap_t for the held dbuf, locks it (lti, or a
 * reader lock if fatreader and the zap is fat), and returns it in
 * *zapp.  If adding and the microzap is full, grows the block — or
 * upgrades to a fatzap once the microzap size limit is reached.
 */
static int
zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
{
	ASSERT0(db->db_offset);
	objset_t *os = dmu_buf_get_objset(db);
	uint64_t obj = db->db_object;

	*zapp = NULL;

	if (DMU_OT_BYTESWAP(dn->dn_type) != DMU_BSWAP_ZAP)
		return (SET_ERROR(EINVAL));

	zap_t *zap = dmu_buf_get_user(db);
	if (zap == NULL) {
		zap = mzap_open(db);
		if (zap == NULL) {
			/*
			 * mzap_open() didn't like what it saw on-disk.
			 * Check for corruption!
			 */
			return (SET_ERROR(EIO));
		}
	}

	/*
	 * We're checking zap_ismicro without the lock held, in order to
	 * tell what type of lock we want.  Once we have some sort of
	 * lock, see if it really is the right type.  In practice this
	 * can only be different if it was upgraded from micro to fat,
	 * and micro wanted WRITER but fat only needs READER.
	 */
	krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
	rw_enter(&zap->zap_rwlock, lt);
	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
		/* it was upgraded, now we only need reader */
		ASSERT(lt == RW_WRITER);
		ASSERT(RW_READER ==
		    ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
		rw_downgrade(&zap->zap_rwlock);
		lt = RW_READER;
	}

	zap->zap_objset = os;
	zap->zap_dnode = dn;

	if (lt == RW_WRITER)
		dmu_buf_will_dirty(db, tx);

	ASSERT3P(zap->zap_dbuf, ==, db);

	ASSERT(!zap->zap_ismicro ||
	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
	if (zap->zap_ismicro && tx && adding &&
	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
		/* Microzap is full: grow one block, or upgrade to fatzap. */
		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
		if (newsz > zap_get_micro_max_size(dmu_objset_spa(os))) {
			dprintf("upgrading obj %llu: num_entries=%u\n",
			    (u_longlong_t)obj, zap->zap_m.zap_num_entries);
			*zapp = zap;
			int err = mzap_upgrade(zapp, tag, tx, 0);
			if (err != 0)
				rw_exit(&zap->zap_rwlock);
			return (err);
		}
		VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
		zap->zap_m.zap_num_chunks =
		    db->db_size / MZAP_ENT_LEN - 1;

		if (newsz > SPA_OLD_MAXBLOCKSIZE) {
			dsl_dataset_t *ds = dmu_objset_ds(os);
			if (!dsl_dataset_feature_is_active(ds,
			    SPA_FEATURE_LARGE_MICROZAP)) {
				/*
				 * A microzap just grew beyond the old limit
				 * for the first time, so we have to ensure the
				 * feature flag is activated.
				 * zap_get_micro_max_size() won't let us get
				 * here if the feature is not enabled, so we
				 * don't need any other checks beforehand.
				 *
				 * Since we're in open context, we can't
				 * activate the feature directly, so we instead
				 * flag it on the dataset for next sync.
				 */
				dsl_dataset_dirty(ds, tx);
				mutex_enter(&ds->ds_lock);
				ds->ds_feature_activation
				    [SPA_FEATURE_LARGE_MICROZAP] =
				    (void *)B_TRUE;
				mutex_exit(&ds->ds_lock);
			}
		}
	}

	*zapp = zap;
	return (0);
}
719
/*
 * Like zap_lockdir(), but starting from an already-held dnode.  On
 * success an extra dnode reference is taken (released later by
 * zap_unlockdir()); on failure the dbuf hold is dropped here.
 */
static int
zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
    zap_t **zapp)
{
	dmu_buf_t *db;
	int err;

	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
	if (err != 0)
		return (err);
	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
	if (err != 0)
		dmu_buf_rele(db, tag);
	else
		VERIFY(dnode_add_ref(dn, tag));
	return (err);
}
738
/*
 * Find (or construct) and lock the zap_t for object obj, returning it
 * in *zapp.  fatreader means a reader lock suffices if the zap turns
 * out to be fat; adding indicates an entry may be added (possibly
 * growing/upgrading a microzap).  The dnode and dbuf holds taken here
 * are released by zap_unlockdir() on success, or here on failure.
 */
int
zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
    zap_t **zapp)
{
	dnode_t *dn;
	dmu_buf_t *db;
	int err;

	err = dnode_hold(os, obj, tag, &dn);
	if (err != 0)
		return (err);
	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
	if (err != 0) {
		dnode_rele(dn, tag);
		return (err);
	}
	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
	if (err != 0) {
		dmu_buf_rele(db, tag);
		dnode_rele(dn, tag);
	}
	return (err);
}
763
/*
 * Drop the lock and the dnode/dbuf holds taken by zap_lockdir() or
 * zap_lockdir_by_dnode().
 */
void
zap_unlockdir(zap_t *zap, const void *tag)
{
	rw_exit(&zap->zap_rwlock);
	dnode_rele(zap->zap_dnode, tag);
	dmu_buf_rele(zap->zap_dbuf, tag);
}
771
/*
 * Convert a microzap into a fatzap: snapshot the microzap contents,
 * resize the block to the fatzap default (only when flags == 0),
 * initialize the fatzap in place, then re-add every entry preserving
 * its original cd.  *zapp may be updated on return, since
 * fzap_add_cd() may change the zap.  Caller must hold the zap write
 * lock.
 */
static int
mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags)
{
	int err = 0;
	zap_t *zap = *zapp;

	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	/* Snapshot the microzap so we can rebuild entries from it below. */
	int sz = zap->zap_dbuf->db_size;
	mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP);
	memcpy(mzp, zap->zap_dbuf->db_data, sz);
	int nchunks = zap->zap_m.zap_num_chunks;

	if (!flags) {
		err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
		    1ULL << fzap_default_block_shift, 0, tx);
		if (err != 0) {
			vmem_free(mzp, sz);
			return (err);
		}
	}

	dprintf("upgrading obj=%llu with %u chunks\n",
	    (u_longlong_t)zap->zap_object, nchunks);
	/* XXX destroy the tree later, so we can use the stored hash value */
	mze_destroy(zap);

	fzap_upgrade(zap, tx, flags);

	zap_name_t *zn = zap_name_alloc(zap, B_FALSE);
	for (int i = 0; i < nchunks; i++) {
		mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
		if (mze->mze_name[0] == 0)
			continue;
		dprintf("adding %s=%llu\n",
		    mze->mze_name, (u_longlong_t)mze->mze_value);
		zap_name_init_str(zn, mze->mze_name, 0);
		/* If we fail here, we would end up losing entries */
		VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
		    tag, tx));
		zap = zn->zn_zap;	/* fzap_add_cd() may change zap */
	}
	zap_name_free(zn);
	vmem_free(mzp, sz);
	*zapp = zap;
	return (0);
}
819
/*
 * The "normflags" determine the behavior of the matchtype_t which is
 * passed to zap_lookup_norm().  Names which have the same normalized
 * version will be stored with the same hash value, and therefore we can
 * perform normalization-insensitive lookups.  We can be Unicode form-
 * insensitive and/or case-insensitive.  The following flags are valid for
 * "normflags":
 *
 * U8_TEXTPREP_NFC
 * U8_TEXTPREP_NFD
 * U8_TEXTPREP_NFKC
 * U8_TEXTPREP_NFKD
 * U8_TEXTPREP_TOUPPER
 *
 * The *_NF* (Normalization Form) flags are mutually exclusive; at most one
 * of them may be supplied.
 */

/*
 * Initialize the (already-allocated) object as an empty microzap:
 * dirty the first block and fill in the microzap header (block type,
 * salt, normflags).  If any zap flags are given, immediately upgrade
 * to a fatzap, since only fatzaps support flags.
 */
void
mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
{
	dmu_buf_t *db;

	VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH));

	dmu_buf_will_dirty(db, tx);
	mzap_phys_t *zp = db->db_data;
	zp->mz_block_type = ZBT_MICRO;
	/* Salt must be nonzero (zap_hash() asserts this); force the low bit. */
	zp->mz_salt =
	    ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL;
	zp->mz_normflags = normflags;

	if (flags != 0) {
		zap_t *zap;
		/* Only fat zap supports flags; upgrade immediately. */
		VERIFY(dnode_add_ref(dn, FTAG));
		VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER,
		    B_FALSE, B_FALSE, &zap));
		VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
		zap_unlockdir(zap, FTAG);
	} else {
		dmu_buf_rele(db, FTAG);
	}
}
863
/*
 * Common guts of zap object creation: allocate the object (optionally
 * returning the held dnode via allocated_dnode/tag) and initialize it
 * as a microzap.  Returns the new object number.
 */
static uint64_t
zap_create_impl(objset_t *os, int normflags, zap_flags_t flags,
    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
{
	uint64_t obj;

	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);

	if (allocated_dnode == NULL) {
		/* Caller doesn't want the dnode; hold it only transiently. */
		dnode_t *dn;
		obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
		    indirect_blockshift, bonustype, bonuslen, dnodesize,
		    &dn, FTAG, tx);
		mzap_create_impl(dn, normflags, flags, tx);
		dnode_rele(dn, FTAG);
	} else {
		obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
		    indirect_blockshift, bonustype, bonuslen, dnodesize,
		    allocated_dnode, tag, tx);
		mzap_create_impl(*allocated_dnode, normflags, flags, tx);
	}

	return (obj);
}
890
891 int
zap_create_claim(objset_t * os,uint64_t obj,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)892 zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
893 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
894 {
895 return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen,
896 0, tx));
897 }
898
899 int
zap_create_claim_dnsize(objset_t * os,uint64_t obj,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)900 zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
901 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
902 {
903 return (zap_create_claim_norm_dnsize(os, obj,
904 0, ot, bonustype, bonuslen, dnodesize, tx));
905 }
906
907 int
zap_create_claim_norm(objset_t * os,uint64_t obj,int normflags,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)908 zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
909 dmu_object_type_t ot,
910 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
911 {
912 return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype,
913 bonuslen, 0, tx));
914 }
915
/*
 * Claim the given object number for a new zap with the given
 * normalization flags and initialize it as an empty microzap.
 * Returns 0 on success, or the error from the object claim or
 * dnode hold.
 */
int
zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
    dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int error;

	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
	error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
	    dnodesize, tx);
	if (error != 0)
		return (error);

	error = dnode_hold(os, obj, FTAG, &dn);
	if (error != 0)
		return (error);

	mzap_create_impl(dn, normflags, 0, tx);

	dnode_rele(dn, FTAG);

	return (0);
}
940
941 uint64_t
zap_create(objset_t * os,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)942 zap_create(objset_t *os, dmu_object_type_t ot,
943 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
944 {
945 return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
946 }
947
948 uint64_t
zap_create_dnsize(objset_t * os,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)949 zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
950 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
951 {
952 return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen,
953 dnodesize, tx));
954 }
955
956 uint64_t
zap_create_norm(objset_t * os,int normflags,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)957 zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
958 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
959 {
960 return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen,
961 0, tx));
962 }
963
964 uint64_t
zap_create_norm_dnsize(objset_t * os,int normflags,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)965 zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
966 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
967 {
968 return (zap_create_impl(os, normflags, 0, ot, 0, 0,
969 bonustype, bonuslen, dnodesize, NULL, NULL, tx));
970 }
971
972 uint64_t
zap_create_flags(objset_t * os,int normflags,zap_flags_t flags,dmu_object_type_t ot,int leaf_blockshift,int indirect_blockshift,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)973 zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
974 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
975 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
976 {
977 return (zap_create_flags_dnsize(os, normflags, flags, ot,
978 leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
979 }
980
981 uint64_t
zap_create_flags_dnsize(objset_t * os,int normflags,zap_flags_t flags,dmu_object_type_t ot,int leaf_blockshift,int indirect_blockshift,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)982 zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
983 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
984 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
985 {
986 return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
987 indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL,
988 tx));
989 }
990
991 /*
992 * Create a zap object and return a pointer to the newly allocated dnode via
993 * the allocated_dnode argument. The returned dnode will be held and the
994 * caller is responsible for releasing the hold by calling dnode_rele().
995 */
996 uint64_t
zap_create_hold(objset_t * os,int normflags,zap_flags_t flags,dmu_object_type_t ot,int leaf_blockshift,int indirect_blockshift,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dnode_t ** allocated_dnode,const void * tag,dmu_tx_t * tx)997 zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
998 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
999 dmu_object_type_t bonustype, int bonuslen, int dnodesize,
1000 dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
1001 {
1002 return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
1003 indirect_blockshift, bonustype, bonuslen, dnodesize,
1004 allocated_dnode, tag, tx));
1005 }
1006
1007 int
zap_destroy(objset_t * os,uint64_t zapobj,dmu_tx_t * tx)1008 zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
1009 {
1010 /*
1011 * dmu_object_free will free the object number and free the
1012 * data. Freeing the data will cause our pageout function to be
1013 * called, which will destroy our data (zap_leaf_t's and zap_t).
1014 */
1015
1016 return (dmu_object_free(os, zapobj, tx));
1017 }
1018
/*
 * Dbuf user-eviction callback: tear down the in-core zap_t attached to
 * the dbuf.  Destroys the rwlock plus whichever per-flavor state was
 * initialized (the microzap's btree, or the fatzap's entry-count mutex),
 * then frees the zap_t itself.
 */
void
zap_evict_sync(void *dbu)
{
	zap_t *zap = dbu;

	rw_destroy(&zap->zap_rwlock);

	if (zap->zap_ismicro)
		mze_destroy(zap);
	else
		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);

	kmem_free(zap, sizeof (zap_t));
}
1033
1034 int
zap_count(objset_t * os,uint64_t zapobj,uint64_t * count)1035 zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
1036 {
1037 zap_t *zap;
1038
1039 int err =
1040 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1041 if (err != 0)
1042 return (err);
1043 if (!zap->zap_ismicro) {
1044 err = fzap_count(zap, count);
1045 } else {
1046 *count = zap->zap_m.zap_num_entries;
1047 }
1048 zap_unlockdir(zap, FTAG);
1049 return (err);
1050 }
1051
/*
 * Return B_TRUE if some other microzap entry normalizes to the same
 * string as mze (a "normalization conflict").
 *
 * zn may be NULL; if not specified, it will be computed if needed.
 * See also the comment above zap_entry_normalization_conflict().
 */
static boolean_t
mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze,
    zfs_btree_index_t *idx)
{
	boolean_t allocdzn = B_FALSE;
	mzap_ent_t *other;
	zfs_btree_index_t oidx;

	/* No normalization configured: conflicts are impossible. */
	if (zap->zap_normflags == 0)
		return (B_FALSE);

	/*
	 * Candidates share mze's hash, and equal-hash entries are adjacent
	 * in the btree; walk backwards from idx while the hash matches...
	 */
	for (other = zfs_btree_prev(&zap->zap_m.zap_tree, idx, &oidx);
	    other && other->mze_hash == mze->mze_hash;
	    other = zfs_btree_prev(&zap->zap_m.zap_tree, &oidx, &oidx)) {

		/* Lazily build the normalized name, at most once. */
		if (zn == NULL) {
			zn = zap_name_alloc_str(zap,
			    MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
			allocdzn = B_TRUE;
		}
		if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
			if (allocdzn)
				zap_name_free(zn);
			return (B_TRUE);
		}
	}

	/* ...then forwards from idx, applying the same test. */
	for (other = zfs_btree_next(&zap->zap_m.zap_tree, idx, &oidx);
	    other && other->mze_hash == mze->mze_hash;
	    other = zfs_btree_next(&zap->zap_m.zap_tree, &oidx, &oidx)) {

		if (zn == NULL) {
			zn = zap_name_alloc_str(zap,
			    MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
			allocdzn = B_TRUE;
		}
		if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
			if (allocdzn)
				zap_name_free(zn);
			return (B_TRUE);
		}
	}

	/* Only free zn if we allocated it here; callers own theirs. */
	if (allocdzn)
		zap_name_free(zn);
	return (B_FALSE);
}
1103
1104 /*
1105 * Routines for manipulating attributes.
1106 */
1107
1108 int
zap_lookup(objset_t * os,uint64_t zapobj,const char * name,uint64_t integer_size,uint64_t num_integers,void * buf)1109 zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
1110 uint64_t integer_size, uint64_t num_integers, void *buf)
1111 {
1112 return (zap_lookup_norm(os, zapobj, name, integer_size,
1113 num_integers, buf, 0, NULL, 0, NULL));
1114 }
1115
/*
 * Common lookup body, called with the zap already locked.  Dispatches to
 * fzap_lookup() for a fatzap; for a microzap the value is always a
 * single uint64_t, so integer_size must be 8 and num_integers >= 1.
 * realname (if non-NULL) receives the entry's on-disk name; ncp (if
 * non-NULL) receives whether a normalization conflict exists.
 */
static int
zap_lookup_impl(zap_t *zap, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf,
    matchtype_t mt, char *realname, int rn_len,
    boolean_t *ncp)
{
	int err = 0;

	zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
	if (zn == NULL)
		return (SET_ERROR(ENOTSUP));

	if (!zap->zap_ismicro) {
		err = fzap_lookup(zn, integer_size, num_integers, buf,
		    realname, rn_len, ncp);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t *mze = mze_find(zn, &idx);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			/* Microzap values are exactly one 8-byte integer. */
			if (num_integers < 1) {
				err = SET_ERROR(EOVERFLOW);
			} else if (integer_size != 8) {
				err = SET_ERROR(EINVAL);
			} else {
				*(uint64_t *)buf =
				    MZE_PHYS(zap, mze)->mze_value;
				if (realname != NULL)
					(void) strlcpy(realname,
					    MZE_PHYS(zap, mze)->mze_name,
					    rn_len);
				if (ncp) {
					*ncp = mzap_normalization_conflict(zap,
					    zn, mze, &idx);
				}
			}
		}
	}
	zap_name_free(zn);
	return (err);
}
1158
1159 int
zap_lookup_norm(objset_t * os,uint64_t zapobj,const char * name,uint64_t integer_size,uint64_t num_integers,void * buf,matchtype_t mt,char * realname,int rn_len,boolean_t * ncp)1160 zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
1161 uint64_t integer_size, uint64_t num_integers, void *buf,
1162 matchtype_t mt, char *realname, int rn_len,
1163 boolean_t *ncp)
1164 {
1165 zap_t *zap;
1166
1167 int err =
1168 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1169 if (err != 0)
1170 return (err);
1171 err = zap_lookup_impl(zap, name, integer_size,
1172 num_integers, buf, mt, realname, rn_len, ncp);
1173 zap_unlockdir(zap, FTAG);
1174 return (err);
1175 }
1176
/*
 * Issue a prefetch for the named entry's blocks.  On success, err is 0
 * here (lockdir succeeded), which is what gets returned.
 */
int
zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
{
	zap_t *zap;
	int err;
	zap_name_t *zn;

	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err)
		return (err);
	zn = zap_name_alloc_str(zap, name, 0);
	if (zn == NULL) {
		/* Name not representable in this zap's encoding. */
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	fzap_prefetch(zn);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}
1198
1199 int
zap_prefetch_object(objset_t * os,uint64_t zapobj)1200 zap_prefetch_object(objset_t *os, uint64_t zapobj)
1201 {
1202 int error;
1203 dmu_object_info_t doi;
1204
1205 error = dmu_object_info(os, zapobj, &doi);
1206 if (error == 0 && DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
1207 error = SET_ERROR(EINVAL);
1208 if (error == 0)
1209 dmu_prefetch_wait(os, zapobj, 0, doi.doi_max_offset);
1210
1211 return (error);
1212 }
1213
1214 int
zap_lookup_by_dnode(dnode_t * dn,const char * name,uint64_t integer_size,uint64_t num_integers,void * buf)1215 zap_lookup_by_dnode(dnode_t *dn, const char *name,
1216 uint64_t integer_size, uint64_t num_integers, void *buf)
1217 {
1218 return (zap_lookup_norm_by_dnode(dn, name, integer_size,
1219 num_integers, buf, 0, NULL, 0, NULL));
1220 }
1221
1222 int
zap_lookup_norm_by_dnode(dnode_t * dn,const char * name,uint64_t integer_size,uint64_t num_integers,void * buf,matchtype_t mt,char * realname,int rn_len,boolean_t * ncp)1223 zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
1224 uint64_t integer_size, uint64_t num_integers, void *buf,
1225 matchtype_t mt, char *realname, int rn_len,
1226 boolean_t *ncp)
1227 {
1228 zap_t *zap;
1229
1230 int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
1231 FTAG, &zap);
1232 if (err != 0)
1233 return (err);
1234 err = zap_lookup_impl(zap, name, integer_size,
1235 num_integers, buf, mt, realname, rn_len, ncp);
1236 zap_unlockdir(zap, FTAG);
1237 return (err);
1238 }
1239
/*
 * Prefetch helper for uint64-keyed (fatzap-only) objects.  Note the
 * contract: this function always calls zap_unlockdir() before returning,
 * on both the error and success paths, so callers must not unlock.
 */
static int
zap_prefetch_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints)
{
	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	fzap_prefetch(zn);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (0);
}
1254
1255 int
zap_prefetch_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints)1256 zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1257 int key_numints)
1258 {
1259 zap_t *zap;
1260
1261 int err =
1262 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1263 if (err != 0)
1264 return (err);
1265 err = zap_prefetch_uint64_impl(zap, key, key_numints);
1266 /* zap_prefetch_uint64_impl() calls zap_unlockdir() */
1267 return (err);
1268 }
1269
1270 int
zap_prefetch_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints)1271 zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints)
1272 {
1273 zap_t *zap;
1274
1275 int err =
1276 zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1277 if (err != 0)
1278 return (err);
1279 err = zap_prefetch_uint64_impl(zap, key, key_numints);
1280 /* zap_prefetch_uint64_impl() calls zap_unlockdir() */
1281 return (err);
1282 }
1283
/*
 * Lookup helper for uint64-keyed (fatzap-only) objects.  Like the other
 * *_uint64_impl helpers, this always calls zap_unlockdir() before
 * returning; callers must not unlock again.
 */
static int
zap_lookup_uint64_impl(zap_t *zap, const uint64_t *key,
    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
{
	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	int err = fzap_lookup(zn, integer_size, num_integers, buf,
	    NULL, 0, NULL);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}
1300
1301 int
zap_lookup_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,uint64_t integer_size,uint64_t num_integers,void * buf)1302 zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1303 int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
1304 {
1305 zap_t *zap;
1306
1307 int err =
1308 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1309 if (err != 0)
1310 return (err);
1311 err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size,
1312 num_integers, buf);
1313 /* zap_lookup_uint64_impl() calls zap_unlockdir() */
1314 return (err);
1315 }
1316
1317 int
zap_lookup_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints,uint64_t integer_size,uint64_t num_integers,void * buf)1318 zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
1319 int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
1320 {
1321 zap_t *zap;
1322
1323 int err =
1324 zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1325 if (err != 0)
1326 return (err);
1327 err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size,
1328 num_integers, buf);
1329 /* zap_lookup_uint64_impl() calls zap_unlockdir() */
1330 return (err);
1331 }
1332
/*
 * Test for the existence of an entry without reading its value.  The
 * zero-size lookup makes any existing entry fail with EOVERFLOW (or
 * EINVAL on a microzap, whose integer size is fixed at 8); both are
 * translated to success.  ENOENT passes through to mean "not found".
 */
int
zap_contains(objset_t *os, uint64_t zapobj, const char *name)
{
	int err = zap_lookup_norm(os, zapobj, name, 0,
	    0, NULL, 0, NULL, 0, NULL);
	if (err == EOVERFLOW || err == EINVAL)
		err = 0; /* found, but skipped reading the value */
	return (err);
}
1342
/*
 * Report the value geometry (integer size and count) of the named
 * entry.  Either out-parameter may be NULL.  Microzap entries are
 * always exactly one 8-byte integer.
 */
int
zap_length(objset_t *os, uint64_t zapobj, const char *name,
    uint64_t *integer_size, uint64_t *num_integers)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_length(zn, integer_size, num_integers);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t *mze = mze_find(zn, &idx);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			/* Microzap values are one uint64_t by definition. */
			if (integer_size)
				*integer_size = 8;
			if (num_integers)
				*num_integers = 1;
		}
	}
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}
1376
/*
 * zap_length() counterpart for uint64-keyed (fatzap-only) objects.
 */
int
zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, uint64_t *integer_size, uint64_t *num_integers)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_length(zn, integer_size, num_integers);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}
1397
/*
 * Insert an entry into a microzap.  The caller has already verified the
 * entry fits (value is one uint64_t, name < MZAP_NAME_LEN) and does not
 * already exist.  Scans the chunk array for a free slot starting at the
 * rotor (zap_alloc_next), wrapping once; the caller's space accounting
 * guarantees a slot exists, so exhausting both passes is a panic.
 */
static void
mzap_addent(zap_name_t *zn, uint64_t value)
{
	zap_t *zap = zn->zn_zap;
	uint16_t start = zap->zap_m.zap_alloc_next;

	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

#ifdef ZFS_DEBUG
	/* Debug builds verify the no-duplicate precondition exhaustively. */
	for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
		ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
	}
#endif

	uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash);
	/* given the limited size of the microzap, this can't happen */
	ASSERT(cd < zap_maxcd(zap));

again:
	for (uint16_t i = start; i < zap->zap_m.zap_num_chunks; i++) {
		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
		/* An empty name marks a free chunk. */
		if (mze->mze_name[0] == 0) {
			mze->mze_value = value;
			mze->mze_cd = cd;
			(void) strlcpy(mze->mze_name, zn->zn_key_orig,
			    sizeof (mze->mze_name));
			zap->zap_m.zap_num_entries++;
			/* Advance the rotor, wrapping at the end. */
			zap->zap_m.zap_alloc_next = i+1;
			if (zap->zap_m.zap_alloc_next ==
			    zap->zap_m.zap_num_chunks)
				zap->zap_m.zap_alloc_next = 0;
			mze_insert(zap, i, zn->zn_hash);
			return;
		}
	}
	/* Nothing above the rotor; retry once from the beginning. */
	if (start != 0) {
		start = 0;
		goto again;
	}
	cmn_err(CE_PANIC, "out of entries!");
}
1440
/*
 * Common add body, called with the zap write-locked.  A microzap entry
 * that cannot be represented (value not a single uint64, name too long,
 * or the would-be fatzap leaf can't hold it) forces an upgrade to
 * fatzap first.  Always calls zap_unlockdir(tag) before returning --
 * including via the fzap paths, which may replace the zap_t (see the
 * zn->zn_zap reloads below).
 */
static int
zap_add_impl(zap_t *zap, const char *key,
    int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx, const void *tag)
{
	const uint64_t *intval = val;
	int err = 0;

	zap_name_t *zn = zap_name_alloc_str(zap, key, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
		zap = zn->zn_zap;	/* fzap_add() may change zap */
	} else if (integer_size != 8 || num_integers != 1 ||
	    strlen(key) >= MZAP_NAME_LEN ||
	    !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
		/* Doesn't fit microzap constraints: upgrade, then add. */
		err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
		if (err == 0) {
			err = fzap_add(zn, integer_size, num_integers, val,
			    tag, tx);
		}
		zap = zn->zn_zap;	/* fzap_add() may change zap */
	} else {
		zfs_btree_index_t idx;
		if (mze_find(zn, &idx) != NULL) {
			err = SET_ERROR(EEXIST);
		} else {
			mzap_addent(zn, *intval);
		}
	}
	ASSERT(zap == zn->zn_zap);
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_add() failed */
		zap_unlockdir(zap, tag);
	return (err);
}
1480
1481 int
zap_add(objset_t * os,uint64_t zapobj,const char * key,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1482 zap_add(objset_t *os, uint64_t zapobj, const char *key,
1483 int integer_size, uint64_t num_integers,
1484 const void *val, dmu_tx_t *tx)
1485 {
1486 zap_t *zap;
1487 int err;
1488
1489 err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1490 if (err != 0)
1491 return (err);
1492 err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
1493 /* zap_add_impl() calls zap_unlockdir() */
1494 return (err);
1495 }
1496
1497 int
zap_add_by_dnode(dnode_t * dn,const char * key,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1498 zap_add_by_dnode(dnode_t *dn, const char *key,
1499 int integer_size, uint64_t num_integers,
1500 const void *val, dmu_tx_t *tx)
1501 {
1502 zap_t *zap;
1503 int err;
1504
1505 err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1506 if (err != 0)
1507 return (err);
1508 err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
1509 /* zap_add_impl() calls zap_unlockdir() */
1510 return (err);
1511 }
1512
/*
 * Add helper for uint64-keyed (fatzap-only) objects.  Always calls
 * zap_unlockdir(tag) before returning; note fzap_add() may replace the
 * zap_t, hence the reload from zn->zn_zap.
 */
static int
zap_add_uint64_impl(zap_t *zap, const uint64_t *key,
    int key_numints, int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx, const void *tag)
{
	int err;

	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
	zap = zn->zn_zap;	/* fzap_add() may change zap */
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_add() failed */
		zap_unlockdir(zap, tag);
	return (err);
}
1532
1533 int
zap_add_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1534 zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1535 int key_numints, int integer_size, uint64_t num_integers,
1536 const void *val, dmu_tx_t *tx)
1537 {
1538 zap_t *zap;
1539
1540 int err =
1541 zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1542 if (err != 0)
1543 return (err);
1544 err = zap_add_uint64_impl(zap, key, key_numints,
1545 integer_size, num_integers, val, tx, FTAG);
1546 /* zap_add_uint64_impl() calls zap_unlockdir() */
1547 return (err);
1548 }
1549
1550 int
zap_add_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1551 zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
1552 int key_numints, int integer_size, uint64_t num_integers,
1553 const void *val, dmu_tx_t *tx)
1554 {
1555 zap_t *zap;
1556
1557 int err =
1558 zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1559 if (err != 0)
1560 return (err);
1561 err = zap_add_uint64_impl(zap, key, key_numints,
1562 integer_size, num_integers, val, tx, FTAG);
1563 /* zap_add_uint64_impl() calls zap_unlockdir() */
1564 return (err);
1565 }
1566
/*
 * Set an entry's value, creating the entry if it does not exist.  A
 * microzap that cannot represent the entry (value not a single uint64,
 * or name too long) is upgraded to a fatzap first.
 */
int
zap_update(objset_t *os, uint64_t zapobj, const char *name,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	const uint64_t *intval = val;

	int err =
	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_update(zn, integer_size, num_integers, val,
		    FTAG, tx);
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else if (integer_size != 8 || num_integers != 1 ||
	    strlen(name) >= MZAP_NAME_LEN) {
		/* Entry can't live in a microzap; upgrade, then update. */
		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
		    (u_longlong_t)zapobj, integer_size,
		    (u_longlong_t)num_integers, name);
		err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
		if (err == 0) {
			err = fzap_update(zn, integer_size, num_integers,
			    val, FTAG, tx);
		}
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t *mze = mze_find(zn, &idx);
		if (mze != NULL) {
			/* Existing microzap entry: overwrite in place. */
			MZE_PHYS(zap, mze)->mze_value = *intval;
		} else {
			mzap_addent(zn, *intval);
		}
	}
	ASSERT(zap == zn->zn_zap);
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
		zap_unlockdir(zap, FTAG);
	return (err);
}
1613
/*
 * Update helper for uint64-keyed (fatzap-only) objects.  Always calls
 * zap_unlockdir(tag) before returning; fzap_update() may replace the
 * zap_t, hence the reload from zn->zn_zap.
 */
static int
zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx,
    const void *tag)
{
	int err;

	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_update(zn, integer_size, num_integers, val, tag, tx);
	zap = zn->zn_zap;	/* fzap_update() may change zap */
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
		zap_unlockdir(zap, tag);
	return (err);
}
1633
1634 int
zap_update_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1635 zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1636 int key_numints, int integer_size, uint64_t num_integers, const void *val,
1637 dmu_tx_t *tx)
1638 {
1639 zap_t *zap;
1640
1641 int err =
1642 zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1643 if (err != 0)
1644 return (err);
1645 err = zap_update_uint64_impl(zap, key, key_numints,
1646 integer_size, num_integers, val, tx, FTAG);
1647 /* zap_update_uint64_impl() calls zap_unlockdir() */
1648 return (err);
1649 }
1650
1651 int
zap_update_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1652 zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
1653 int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
1654 {
1655 zap_t *zap;
1656
1657 int err =
1658 zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1659 if (err != 0)
1660 return (err);
1661 err = zap_update_uint64_impl(zap, key, key_numints,
1662 integer_size, num_integers, val, tx, FTAG);
1663 /* zap_update_uint64_impl() calls zap_unlockdir() */
1664 return (err);
1665 }
1666
1667 int
zap_remove(objset_t * os,uint64_t zapobj,const char * name,dmu_tx_t * tx)1668 zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
1669 {
1670 return (zap_remove_norm(os, zapobj, name, 0, tx));
1671 }
1672
/*
 * Common removal body, called with the zap write-locked.  For a
 * microzap, clears the phys chunk and drops the in-core btree entry;
 * fatzap removal is delegated to fzap_remove().  The caller unlocks.
 */
static int
zap_remove_impl(zap_t *zap, const char *name,
    matchtype_t mt, dmu_tx_t *tx)
{
	int err = 0;

	zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
	if (zn == NULL)
		return (SET_ERROR(ENOTSUP));
	if (!zap->zap_ismicro) {
		err = fzap_remove(zn, tx);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t *mze = mze_find(zn, &idx);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			zap->zap_m.zap_num_entries--;
			/* Zeroed name marks the phys chunk free again. */
			memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t));
			zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx);
		}
	}
	zap_name_free(zn);
	return (err);
}
1698
1699 int
zap_remove_norm(objset_t * os,uint64_t zapobj,const char * name,matchtype_t mt,dmu_tx_t * tx)1700 zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
1701 matchtype_t mt, dmu_tx_t *tx)
1702 {
1703 zap_t *zap;
1704 int err;
1705
1706 err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1707 if (err)
1708 return (err);
1709 err = zap_remove_impl(zap, name, mt, tx);
1710 zap_unlockdir(zap, FTAG);
1711 return (err);
1712 }
1713
1714 int
zap_remove_by_dnode(dnode_t * dn,const char * name,dmu_tx_t * tx)1715 zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
1716 {
1717 zap_t *zap;
1718 int err;
1719
1720 err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1721 if (err)
1722 return (err);
1723 err = zap_remove_impl(zap, name, 0, tx);
1724 zap_unlockdir(zap, FTAG);
1725 return (err);
1726 }
1727
/*
 * Removal helper for uint64-keyed (fatzap-only) objects.  Always calls
 * zap_unlockdir(tag) before returning; callers must not unlock.
 */
static int
zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
    dmu_tx_t *tx, const void *tag)
{
	int err;

	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_remove(zn, tx);
	zap_name_free(zn);
	zap_unlockdir(zap, tag);
	return (err);
}
1744
1745 int
zap_remove_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,dmu_tx_t * tx)1746 zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1747 int key_numints, dmu_tx_t *tx)
1748 {
1749 zap_t *zap;
1750
1751 int err =
1752 zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1753 if (err != 0)
1754 return (err);
1755 err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
1756 /* zap_remove_uint64_impl() calls zap_unlockdir() */
1757 return (err);
1758 }
1759
1760 int
zap_remove_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints,dmu_tx_t * tx)1761 zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
1762 dmu_tx_t *tx)
1763 {
1764 zap_t *zap;
1765
1766 int err =
1767 zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1768 if (err != 0)
1769 return (err);
1770 err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
1771 /* zap_remove_uint64_impl() calls zap_unlockdir() */
1772 return (err);
1773 }
1774
1775
1776 static zap_attribute_t *
zap_attribute_alloc_impl(boolean_t longname)1777 zap_attribute_alloc_impl(boolean_t longname)
1778 {
1779 zap_attribute_t *za;
1780
1781 za = kmem_cache_alloc((longname)? zap_attr_long_cache : zap_attr_cache,
1782 KM_SLEEP);
1783 za->za_name_len = (longname)? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN;
1784 return (za);
1785 }
1786
1787 zap_attribute_t *
zap_attribute_alloc(void)1788 zap_attribute_alloc(void)
1789 {
1790 return (zap_attribute_alloc_impl(B_FALSE));
1791 }
1792
1793 zap_attribute_t *
zap_attribute_long_alloc(void)1794 zap_attribute_long_alloc(void)
1795 {
1796 return (zap_attribute_alloc_impl(B_TRUE));
1797 }
1798
/*
 * Return a zap_attribute_t to the cache it was allocated from, inferred
 * from za_name_len (set in zap_attribute_alloc_impl()).
 */
void
zap_attribute_free(zap_attribute_t *za)
{
	if (za->za_name_len == ZAP_MAXNAMELEN) {
		kmem_cache_free(zap_attr_cache, za);
	} else {
		/* Anything else must be a long-name allocation. */
		ASSERT3U(za->za_name_len, ==, ZAP_MAXNAMELEN_NEW);
		kmem_cache_free(zap_attr_long_cache, za);
	}
}
1809
/*
 * Routines for iterating over the attributes.
 */

/*
 * Common cursor initialization.  The zap is not locked or even looked
 * up here; zc_zap/zc_leaf stay NULL until the first retrieve.
 */
static void
zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
    uint64_t serialized, boolean_t prefetch)
{
	zc->zc_objset = os;
	zc->zc_zap = NULL;
	zc->zc_leaf = NULL;
	zc->zc_zapobj = zapobj;
	zc->zc_serialized = serialized;
	zc->zc_hash = 0;
	zc->zc_cd = 0;
	zc->zc_prefetch = prefetch;
}
1827 void
zap_cursor_init_serialized(zap_cursor_t * zc,objset_t * os,uint64_t zapobj,uint64_t serialized)1828 zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
1829 uint64_t serialized)
1830 {
1831 zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
1832 }
1833
1834 /*
1835 * Initialize a cursor at the beginning of the ZAP object. The entire
1836 * ZAP object will be prefetched.
1837 */
1838 void
zap_cursor_init(zap_cursor_t * zc,objset_t * os,uint64_t zapobj)1839 zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
1840 {
1841 zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
1842 }
1843
1844 /*
1845 * Initialize a cursor at the beginning, but request that we not prefetch
1846 * the entire ZAP object.
1847 */
1848 void
zap_cursor_init_noprefetch(zap_cursor_t * zc,objset_t * os,uint64_t zapobj)1849 zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
1850 {
1851 zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
1852 }
1853
/*
 * Release everything the cursor holds.  The rwlocks are re-entered
 * before releasing because zap_unlockdir()/zap_put_leaf() drop them;
 * zap_cursor_retrieve() leaves them unlocked between calls.
 */
void
zap_cursor_fini(zap_cursor_t *zc)
{
	if (zc->zc_zap) {
		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
		zap_unlockdir(zc->zc_zap, NULL);
		zc->zc_zap = NULL;
	}
	if (zc->zc_leaf) {
		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
		zap_put_leaf(zc->zc_leaf);
		zc->zc_leaf = NULL;
	}
	zc->zc_objset = NULL;
}
1869
/*
 * Encode the cursor's position as a single integer that can later be
 * handed to zap_cursor_init_serialized() (e.g. as a directory offset).
 * -1ULL means the cursor is exhausted; before the first retrieve we
 * just echo back the value we were initialized with.
 */
uint64_t
zap_cursor_serialize(zap_cursor_t *zc)
{
	if (zc->zc_hash == -1ULL)
		return (-1ULL);
	if (zc->zc_zap == NULL)
		return (zc->zc_serialized);
	ASSERT0((zc->zc_hash & zap_maxcd(zc->zc_zap)));
	ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));

	/*
	 * We want to keep the high 32 bits of the cursor zero if we can, so
	 * that 32-bit programs can access this. So usually use a small
	 * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
	 * of the cursor.
	 *
	 * [ collision differentiator | zap_hashbits()-bit hash value ]
	 */
	return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
	    ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
}
1891
/*
 * Fetch the attribute at the cursor's current position into *za.
 * Returns ENOENT when iteration is complete.  On the first call after
 * (de)serialization this also locks the zap and reconstructs
 * zc_hash/zc_cd from the serialized value.  The zap rwlock is dropped
 * before returning; zap_cursor_fini() re-takes it to release.
 */
int
zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
{
	int err;

	if (zc->zc_hash == -1ULL)
		return (SET_ERROR(ENOENT));

	if (zc->zc_zap == NULL) {
		int hb;
		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
		    RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
		if (err != 0)
			return (err);

		/*
		 * To support zap_cursor_init_serialized, advance, retrieve,
		 * we must add to the existing zc_cd, which may already
		 * be 1 due to the zap_cursor_advance.
		 */
		ASSERT0(zc->zc_hash);
		hb = zap_hashbits(zc->zc_zap);
		/* Undo the packing done by zap_cursor_serialize(). */
		zc->zc_hash = zc->zc_serialized << (64 - hb);
		zc->zc_cd += zc->zc_serialized >> hb;
		if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
			zc->zc_cd = 0;
	} else {
		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
	}
	if (!zc->zc_zap->zap_ismicro) {
		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t mze_tofind;

		/* Seek to the first entry at or after (hash, cd). */
		mze_tofind.mze_hash = zc->zc_hash >> 32;
		mze_tofind.mze_cd = zc->zc_cd;

		mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree,
		    &mze_tofind, &idx);
		if (mze == NULL) {
			mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree,
			    &idx, &idx);
		}
		if (mze) {
			mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
			ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
			za->za_normalization_conflict =
			    mzap_normalization_conflict(zc->zc_zap, NULL,
			    mze, &idx);
			za->za_integer_length = 8;
			za->za_num_integers = 1;
			za->za_first_integer = mzep->mze_value;
			(void) strlcpy(za->za_name, mzep->mze_name,
			    za->za_name_len);
			/* Remember where we are for the next advance. */
			zc->zc_hash = (uint64_t)mze->mze_hash << 32;
			zc->zc_cd = mze->mze_cd;
			err = 0;
		} else {
			/* Ran off the end: mark the cursor exhausted. */
			zc->zc_hash = -1ULL;
			err = SET_ERROR(ENOENT);
		}
	}
	rw_exit(&zc->zc_zap->zap_rwlock);
	return (err);
}
1958
1959 void
zap_cursor_advance(zap_cursor_t * zc)1960 zap_cursor_advance(zap_cursor_t *zc)
1961 {
1962 if (zc->zc_hash == -1ULL)
1963 return;
1964 zc->zc_cd++;
1965 }
1966
1967 int
zap_get_stats(objset_t * os,uint64_t zapobj,zap_stats_t * zs)1968 zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
1969 {
1970 zap_t *zap;
1971
1972 int err =
1973 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1974 if (err != 0)
1975 return (err);
1976
1977 memset(zs, 0, sizeof (zap_stats_t));
1978
1979 if (zap->zap_ismicro) {
1980 zs->zs_blocksize = zap->zap_dbuf->db_size;
1981 zs->zs_num_entries = zap->zap_m.zap_num_entries;
1982 zs->zs_num_blocks = 1;
1983 } else {
1984 fzap_get_stats(zap, zs);
1985 }
1986 zap_unlockdir(zap, FTAG);
1987 return (0);
1988 }
1989
#if defined(_KERNEL)
/* Object lifecycle. */
EXPORT_SYMBOL(zap_create);
EXPORT_SYMBOL(zap_create_dnsize);
EXPORT_SYMBOL(zap_create_norm);
EXPORT_SYMBOL(zap_create_norm_dnsize);
EXPORT_SYMBOL(zap_create_flags);
EXPORT_SYMBOL(zap_create_flags_dnsize);
EXPORT_SYMBOL(zap_create_claim);
EXPORT_SYMBOL(zap_create_claim_norm);
EXPORT_SYMBOL(zap_create_claim_norm_dnsize);
EXPORT_SYMBOL(zap_create_hold);
EXPORT_SYMBOL(zap_destroy);
/* Lookup and prefetch. */
EXPORT_SYMBOL(zap_lookup);
EXPORT_SYMBOL(zap_lookup_by_dnode);
EXPORT_SYMBOL(zap_lookup_norm);
EXPORT_SYMBOL(zap_lookup_uint64);
EXPORT_SYMBOL(zap_contains);
EXPORT_SYMBOL(zap_prefetch);
EXPORT_SYMBOL(zap_prefetch_uint64);
EXPORT_SYMBOL(zap_prefetch_object);
/* Entry add/update/remove. */
EXPORT_SYMBOL(zap_add);
EXPORT_SYMBOL(zap_add_by_dnode);
EXPORT_SYMBOL(zap_add_uint64);
EXPORT_SYMBOL(zap_add_uint64_by_dnode);
EXPORT_SYMBOL(zap_update);
EXPORT_SYMBOL(zap_update_uint64);
EXPORT_SYMBOL(zap_update_uint64_by_dnode);
EXPORT_SYMBOL(zap_length);
EXPORT_SYMBOL(zap_length_uint64);
EXPORT_SYMBOL(zap_remove);
EXPORT_SYMBOL(zap_remove_by_dnode);
EXPORT_SYMBOL(zap_remove_norm);
EXPORT_SYMBOL(zap_remove_uint64);
EXPORT_SYMBOL(zap_remove_uint64_by_dnode);
EXPORT_SYMBOL(zap_count);
EXPORT_SYMBOL(zap_value_search);
EXPORT_SYMBOL(zap_join);
EXPORT_SYMBOL(zap_join_increment);
/* Integer-keyed convenience wrappers. */
EXPORT_SYMBOL(zap_add_int);
EXPORT_SYMBOL(zap_remove_int);
EXPORT_SYMBOL(zap_lookup_int);
EXPORT_SYMBOL(zap_increment_int);
EXPORT_SYMBOL(zap_add_int_key);
EXPORT_SYMBOL(zap_lookup_int_key);
EXPORT_SYMBOL(zap_increment);
/* Cursor iteration. */
EXPORT_SYMBOL(zap_cursor_init);
EXPORT_SYMBOL(zap_cursor_fini);
EXPORT_SYMBOL(zap_cursor_retrieve);
EXPORT_SYMBOL(zap_cursor_advance);
EXPORT_SYMBOL(zap_cursor_serialize);
EXPORT_SYMBOL(zap_cursor_init_serialized);
EXPORT_SYMBOL(zap_get_stats);

ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW,
	"Maximum micro ZAP size before converting to a fat ZAP, "
	"in bytes (max 1M)");
#endif
2047