1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23 /*
24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
27 * Copyright 2017 Nexenta Systems, Inc.
28 * Copyright (c) 2024, Klara, Inc.
29 */
30
31 #include <sys/zio.h>
32 #include <sys/spa.h>
33 #include <sys/dmu.h>
34 #include <sys/zfs_context.h>
35 #include <sys/zap.h>
36 #include <sys/zap_impl.h>
37 #include <sys/zap_leaf.h>
38 #include <sys/btree.h>
39 #include <sys/arc.h>
40 #include <sys/dmu_objset.h>
41 #include <sys/spa_impl.h>
42
43 #ifdef _KERNEL
44 #include <sys/sunddi.h>
45 #endif
46
47 /*
48 * The maximum size (in bytes) of a microzap before it is converted to a
49 * fatzap. It will be rounded up to next multiple of 512 (SPA_MINBLOCKSIZE).
50 *
51 * By definition, a microzap must fit into a single block, so this has
52 * traditionally been SPA_OLD_MAXBLOCKSIZE, and is set to that by default.
53 * Setting this higher requires both the large_blocks feature (to even create
54 * blocks that large) and the large_microzap feature (to enable the stream
55 * machinery to understand not to try to split a microzap block).
56 *
57 * If large_microzap is enabled, this value will be clamped to
58 * spa_maxblocksize(), up to 1M. If not, it will be clamped to
59 * SPA_OLD_MAXBLOCKSIZE.
60 */
61 static int zap_micro_max_size = SPA_OLD_MAXBLOCKSIZE;
62
63 /*
64 * The 1M upper limit is necessary because the count of chunks in a microzap
65 * block is stored as a uint16_t (mze_chunkid). Each chunk is 64 bytes, and the
66 * first is used to store a header, so there are 32767 usable chunks, which is
67 * just under 2M. 1M is the largest power-2-rounded block size under 2M, so we
68 * must set the limit there.
69 */
70 #define MZAP_MAX_SIZE (1048576)
71
72 uint64_t
zap_get_micro_max_size(spa_t * spa)73 zap_get_micro_max_size(spa_t *spa)
74 {
75 uint64_t maxsz = MIN(MZAP_MAX_SIZE,
76 P2ROUNDUP(zap_micro_max_size, SPA_MINBLOCKSIZE));
77 if (maxsz <= SPA_OLD_MAXBLOCKSIZE)
78 return (maxsz);
79 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_MICROZAP))
80 return (MIN(maxsz, spa_maxblocksize(spa)));
81 return (SPA_OLD_MAXBLOCKSIZE);
82 }
83
84 static int mzap_upgrade(zap_t **zapp,
85 const void *tag, dmu_tx_t *tx, zap_flags_t flags);
86
87 uint64_t
zap_getflags(zap_t * zap)88 zap_getflags(zap_t *zap)
89 {
90 if (zap->zap_ismicro)
91 return (0);
92 return (zap_f_phys(zap)->zap_flags);
93 }
94
95 int
zap_hashbits(zap_t * zap)96 zap_hashbits(zap_t *zap)
97 {
98 if (zap_getflags(zap) & ZAP_FLAG_HASH64)
99 return (48);
100 else
101 return (28);
102 }
103
104 uint32_t
zap_maxcd(zap_t * zap)105 zap_maxcd(zap_t *zap)
106 {
107 if (zap_getflags(zap) & ZAP_FLAG_HASH64)
108 return ((1<<16)-1);
109 else
110 return (-1U);
111 }
112
/*
 * Compute the 64-bit hash for the key in zn. For pre-hashed keys the
 * first word of the original key is used verbatim; otherwise a CRC-64
 * over the normalized key bytes, seeded with the zap's salt. Only the
 * top zap_hashbits() bits are kept; the low bits are zeroed so they can
 * carry the collision differentiator.
 */
static uint64_t
zap_hash(zap_name_t *zn)
{
	zap_t *zap = zn->zn_zap;
	uint64_t h = 0;

	if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
		/* The caller supplied the hash as the first key word. */
		ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
		h = *(uint64_t *)zn->zn_key_orig;
	} else {
		h = zap->zap_salt;
		ASSERT(h != 0);
		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

		if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
			const uint64_t *wp = zn->zn_key_norm;

			ASSERT(zn->zn_key_intlen == 8);
			/* CRC each 64-bit word one byte at a time. */
			for (int i = 0; i < zn->zn_key_norm_numints;
			    wp++, i++) {
				uint64_t word = *wp;

				for (int j = 0; j < 8; j++) {
					h = (h >> 8) ^
					    zfs_crc64_table[(h ^ word) & 0xFF];
					word >>= NBBY;
				}
			}
		} else {
			const uint8_t *cp = zn->zn_key_norm;

			/*
			 * We previously stored the terminating null on
			 * disk, but didn't hash it, so we need to
			 * continue to not hash it.  (The
			 * zn_key_*_numints includes the terminating
			 * null for non-binary keys.)
			 */
			int len = zn->zn_key_norm_numints - 1;

			ASSERT(zn->zn_key_intlen == 1);
			for (int i = 0; i < len; cp++, i++) {
				h = (h >> 8) ^
				    zfs_crc64_table[(h ^ *cp) & 0xFF];
			}
		}
	}
	/*
	 * Don't use all 64 bits, since we need some in the cookie for
	 * the collision differentiator.  We MUST use the high bits,
	 * since those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);

	return (h);
}
170
/*
 * Normalize "name" into "namenorm" (buffer of size outlen) according to
 * normflags via u8_textprep_str(). Returns 0 on success, or the error
 * reported by u8_textprep_str() through "err". Only valid for
 * string-keyed zaps.
 */
static int
zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags,
    size_t outlen)
{
	ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));

	/* Input length includes the terminating NUL. */
	size_t inlen = strlen(name) + 1;

	int err = 0;
	(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
	    normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
	    U8_UNICODE_LATEST, &err);

	return (err);
}
186
/*
 * Compare the (already-initialized) zap_name_t against "matchname".
 * For MT_NORMALIZE lookups, matchname is normalized with the match
 * normflags first and compared to the normalized key; otherwise a plain
 * byte-wise comparison of the original key is used. Returns B_TRUE on
 * a match.
 */
boolean_t
zap_match(zap_name_t *zn, const char *matchname)
{
	boolean_t res = B_FALSE;
	ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));

	if (zn->zn_matchtype & MT_NORMALIZE) {
		size_t namelen = zn->zn_normbuf_len;
		char normbuf[ZAP_MAXNAMELEN];
		char *norm = normbuf;

		/*
		 * Cannot allocate this on-stack as it exceed the stack-limit of
		 * 1024.
		 */
		if (namelen > ZAP_MAXNAMELEN)
			norm = kmem_alloc(namelen, KM_SLEEP);

		if (zap_normalize(zn->zn_zap, matchname, norm,
		    zn->zn_normflags, namelen) != 0) {
			/* matchname cannot be normalized: no match. */
			res = B_FALSE;
		} else {
			res = (strcmp(zn->zn_key_norm, norm) == 0);
		}
		/* Free only if we fell back to a heap buffer above. */
		if (norm != normbuf)
			kmem_free(norm, namelen);
	} else {
		res = (strcmp(zn->zn_key_orig, matchname) == 0);
	}
	return (res);
}
218
219 static kmem_cache_t *zap_name_cache;
220 static kmem_cache_t *zap_attr_cache;
221 static kmem_cache_t *zap_name_long_cache;
222 static kmem_cache_t *zap_attr_long_cache;
223
/*
 * Create the kmem caches for zap_name_t and zap_attribute_t. The
 * "long" variants carry a ZAP_MAXNAMELEN_NEW name buffer (longname
 * support); the others use the traditional ZAP_MAXNAMELEN buffer.
 */
void
zap_init(void)
{
	zap_name_cache = kmem_cache_create("zap_name",
	    sizeof (zap_name_t) + ZAP_MAXNAMELEN, 0, NULL, NULL,
	    NULL, NULL, NULL, 0);

	zap_attr_cache = kmem_cache_create("zap_attr_cache",
	    sizeof (zap_attribute_t) + ZAP_MAXNAMELEN, 0, NULL,
	    NULL, NULL, NULL, NULL, 0);

	zap_name_long_cache = kmem_cache_create("zap_name_long",
	    sizeof (zap_name_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, NULL,
	    NULL, NULL, NULL, 0);

	zap_attr_long_cache = kmem_cache_create("zap_attr_long_cache",
	    sizeof (zap_attribute_t) + ZAP_MAXNAMELEN_NEW, 0, NULL,
	    NULL, NULL, NULL, NULL, 0);
}
243
/*
 * Destroy the kmem caches created by zap_init().
 */
void
zap_fini(void)
{
	kmem_cache_destroy(zap_name_cache);
	kmem_cache_destroy(zap_attr_cache);
	kmem_cache_destroy(zap_name_long_cache);
	kmem_cache_destroy(zap_attr_long_cache);
}
252
253 static zap_name_t *
zap_name_alloc(zap_t * zap,boolean_t longname)254 zap_name_alloc(zap_t *zap, boolean_t longname)
255 {
256 kmem_cache_t *cache = longname ? zap_name_long_cache : zap_name_cache;
257 zap_name_t *zn = kmem_cache_alloc(cache, KM_SLEEP);
258
259 zn->zn_zap = zap;
260 zn->zn_normbuf_len = longname ? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN;
261 return (zn);
262 }
263
264 void
zap_name_free(zap_name_t * zn)265 zap_name_free(zap_name_t *zn)
266 {
267 if (zn->zn_normbuf_len == ZAP_MAXNAMELEN) {
268 kmem_cache_free(zap_name_cache, zn);
269 } else {
270 ASSERT3U(zn->zn_normbuf_len, ==, ZAP_MAXNAMELEN_NEW);
271 kmem_cache_free(zap_name_long_cache, zn);
272 }
273 }
274
/*
 * Initialize zn for the string key "key" with match type "mt": record
 * the original key, compute the normalized form (when the zap has
 * normflags) and the hash. Returns ENOTSUP if the key cannot be
 * normalized, or if a match type is requested on a non-normalizing zap.
 */
static int
zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt)
{
	zap_t *zap = zn->zn_zap;
	size_t key_len = strlen(key) + 1;

	/* Make sure zn is allocated for longname if key is long */
	IMPLY(key_len > ZAP_MAXNAMELEN,
	    zn->zn_normbuf_len == ZAP_MAXNAMELEN_NEW);

	zn->zn_key_intlen = sizeof (*key);
	zn->zn_key_orig = key;
	zn->zn_key_orig_numints = key_len;
	zn->zn_matchtype = mt;
	zn->zn_normflags = zap->zap_normflags;

	/*
	 * If we're dealing with a case sensitive lookup on a mixed or
	 * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
	 * will fold case to all caps overriding the lookup request.
	 */
	if (mt & MT_MATCH_CASE)
		zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;

	if (zap->zap_normflags) {
		/*
		 * We *must* use zap_normflags because this normalization is
		 * what the hash is computed from.
		 */
		if (zap_normalize(zap, key, zn->zn_normbuf,
		    zap->zap_normflags, zn->zn_normbuf_len) != 0)
			return (SET_ERROR(ENOTSUP));
		zn->zn_key_norm = zn->zn_normbuf;
		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
	} else {
		/* No normalization: any match type is unsupported. */
		if (mt != 0)
			return (SET_ERROR(ENOTSUP));
		zn->zn_key_norm = zn->zn_key_orig;
		zn->zn_key_norm_numints = zn->zn_key_orig_numints;
	}

	/* Hash is computed over the zap_normflags-normalized form above. */
	zn->zn_hash = zap_hash(zn);

	if (zap->zap_normflags != zn->zn_normflags) {
		/*
		 * We *must* use zn_normflags because this normalization is
		 * what the matching is based on.  (Not the hash!)
		 */
		if (zap_normalize(zap, key, zn->zn_normbuf,
		    zn->zn_normflags, zn->zn_normbuf_len) != 0)
			return (SET_ERROR(ENOTSUP));
		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
	}

	return (0);
}
331
332 zap_name_t *
zap_name_alloc_str(zap_t * zap,const char * key,matchtype_t mt)333 zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt)
334 {
335 size_t key_len = strlen(key) + 1;
336 zap_name_t *zn = zap_name_alloc(zap, (key_len > ZAP_MAXNAMELEN));
337 if (zap_name_init_str(zn, key, mt) != 0) {
338 zap_name_free(zn);
339 return (NULL);
340 }
341 return (zn);
342 }
343
/*
 * Allocate a zap_name_t for a binary (uint64 array) key of "numints"
 * words. Binary keys are never normalized, so the zap must have no
 * normflags and the regular-size cache always suffices.
 */
static zap_name_t *
zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
{
	zap_name_t *zn = kmem_cache_alloc(zap_name_cache, KM_SLEEP);

	ASSERT0(zap->zap_normflags);
	zn->zn_zap = zap;
	zn->zn_key_intlen = sizeof (*key);
	zn->zn_key_orig = zn->zn_key_norm = key;
	zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
	zn->zn_matchtype = 0;
	zn->zn_normbuf_len = ZAP_MAXNAMELEN;

	zn->zn_hash = zap_hash(zn);
	return (zn);
}
360
361 static void
mzap_byteswap(mzap_phys_t * buf,size_t size)362 mzap_byteswap(mzap_phys_t *buf, size_t size)
363 {
364 buf->mz_block_type = BSWAP_64(buf->mz_block_type);
365 buf->mz_salt = BSWAP_64(buf->mz_salt);
366 buf->mz_normflags = BSWAP_64(buf->mz_normflags);
367 int max = (size / MZAP_ENT_LEN) - 1;
368 for (int i = 0; i < max; i++) {
369 buf->mz_chunk[i].mze_value =
370 BSWAP_64(buf->mz_chunk[i].mze_value);
371 buf->mz_chunk[i].mze_cd =
372 BSWAP_32(buf->mz_chunk[i].mze_cd);
373 }
374 }
375
376 void
zap_byteswap(void * buf,size_t size)377 zap_byteswap(void *buf, size_t size)
378 {
379 uint64_t block_type = *(uint64_t *)buf;
380
381 if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
382 /* ASSERT(magic == ZAP_LEAF_MAGIC); */
383 mzap_byteswap(buf, size);
384 } else {
385 fzap_byteswap(buf, size);
386 }
387 }
388
389 __attribute__((always_inline)) inline
390 static int
mze_compare(const void * arg1,const void * arg2)391 mze_compare(const void *arg1, const void *arg2)
392 {
393 const mzap_ent_t *mze1 = arg1;
394 const mzap_ent_t *mze2 = arg2;
395
396 return (TREE_CMP((uint64_t)(mze1->mze_hash) << 32 | mze1->mze_cd,
397 (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd));
398 }
399
/*
 * Instantiate a specialized in-buffer binary search for microzap
 * entries, passed to zfs_btree_create_custom() in mzap_open().
 */
ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t,
    mze_compare)
402
/*
 * Add an in-memory index entry for microzap chunk "chunkid", whose name
 * hashes to "hash". The cd is read back from the on-disk chunk. The
 * caller must hold zap_rwlock as WRITER.
 */
static void
mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash)
{
	mzap_ent_t mze;

	ASSERT(zap->zap_ismicro);
	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	mze.mze_chunkid = chunkid;
	/* Only the top 32 hash bits are kept in the in-memory entry. */
	ASSERT0(hash & 0xffffffff);
	mze.mze_hash = hash >> 32;
	ASSERT3U(MZE_PHYS(zap, &mze)->mze_cd, <=, 0xffff);
	mze.mze_cd = (uint16_t)MZE_PHYS(zap, &mze)->mze_cd;
	/* Empty names mark free chunks; never index one. */
	ASSERT(MZE_PHYS(zap, &mze)->mze_name[0] != 0);
	zfs_btree_add(&zap->zap_m.zap_tree, &mze);
}
419
/*
 * Look up zn in the microzap's in-memory B-tree. All entries sharing
 * the 32-bit hash are scanned (starting at cd 0) until one whose
 * on-disk name matches via zap_match(). Returns the entry, or NULL if
 * none matches; *idx is left at the search position for the caller.
 */
static mzap_ent_t *
mze_find(zap_name_t *zn, zfs_btree_index_t *idx)
{
	mzap_ent_t mze_tofind;
	mzap_ent_t *mze;
	zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree;

	ASSERT(zn->zn_zap->zap_ismicro);
	ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));

	ASSERT0(zn->zn_hash & 0xffffffff);
	mze_tofind.mze_hash = zn->zn_hash >> 32;
	mze_tofind.mze_cd = 0;

	/* An exact miss still positions idx at the next entry. */
	mze = zfs_btree_find(tree, &mze_tofind, idx);
	if (mze == NULL)
		mze = zfs_btree_next(tree, idx, idx);
	for (; mze && mze->mze_hash == mze_tofind.mze_hash;
	    mze = zfs_btree_next(tree, idx, idx)) {
		ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
		if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
			return (mze);
	}

	return (NULL);
}
446
/*
 * Return the lowest collision differentiator not yet used by any entry
 * with this hash. Entries are sorted by (hash, cd), so we walk the
 * entries in cd order until the sequence has a gap.
 */
static uint32_t
mze_find_unused_cd(zap_t *zap, uint64_t hash)
{
	mzap_ent_t mze_tofind;
	zfs_btree_index_t idx;
	zfs_btree_t *tree = &zap->zap_m.zap_tree;

	ASSERT(zap->zap_ismicro);
	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));

	/* In-memory entries keep only the top 32 hash bits. */
	ASSERT0(hash & 0xffffffff);
	hash >>= 32;
	mze_tofind.mze_hash = hash;
	mze_tofind.mze_cd = 0;

	uint32_t cd = 0;
	for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
	    mze && mze->mze_hash == hash;
	    mze = zfs_btree_next(tree, &idx, &idx)) {
		if (mze->mze_cd != cd)
			break;
		cd++;
	}

	return (cd);
}
473
474 /*
475 * Each mzap entry requires at max : 4 chunks
476 * 3 chunks for names + 1 chunk for value.
477 */
478 #define MZAP_ENT_CHUNKS (1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \
479 ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t)))
480
481 /*
482 * Check if the current entry keeps the colliding entries under the fatzap leaf
483 * size.
484 */
485 static boolean_t
mze_canfit_fzap_leaf(zap_name_t * zn,uint64_t hash)486 mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
487 {
488 zap_t *zap = zn->zn_zap;
489 mzap_ent_t mze_tofind;
490 zfs_btree_index_t idx;
491 zfs_btree_t *tree = &zap->zap_m.zap_tree;
492 uint32_t mzap_ents = 0;
493
494 ASSERT0(hash & 0xffffffff);
495 hash >>= 32;
496 mze_tofind.mze_hash = hash;
497 mze_tofind.mze_cd = 0;
498
499 for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
500 mze && mze->mze_hash == hash;
501 mze = zfs_btree_next(tree, &idx, &idx)) {
502 mzap_ents++;
503 }
504
505 /* Include the new entry being added */
506 mzap_ents++;
507
508 return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS));
509 }
510
/*
 * Tear down the in-memory B-tree index of a microzap.
 */
static void
mze_destroy(zap_t *zap)
{
	zfs_btree_clear(&zap->zap_m.zap_tree);
	zfs_btree_destroy(&zap->zap_m.zap_tree);
}
517
/*
 * Construct the in-memory zap_t for the zap object backed by "db" and
 * attach it to the dbuf as user data. Recognizes both microzap
 * (ZBT_MICRO) and fatzap (ZBT_HEADER) blocks; for a microzap, builds
 * the in-memory B-tree index of its entries. Returns the new zap_t,
 * the already-attached zap_t if another thread won the attach race, or
 * NULL if the block is not a valid zap header (on-disk corruption).
 */
static zap_t *
mzap_open(dmu_buf_t *db)
{
	zap_t *winner;
	uint64_t *zap_hdr = (uint64_t *)db->db_data;
	uint64_t zap_block_type = zap_hdr[0];
	uint64_t zap_magic = zap_hdr[1];

	ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));

	zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
	rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL);
	rw_enter(&zap->zap_rwlock, RW_WRITER);
	zap->zap_objset = dmu_buf_get_objset(db);
	zap->zap_object = db->db_object;
	zap->zap_dbuf = db;

	if (zap_block_type != ZBT_MICRO) {
		mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT,
		    0);
		zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
		if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) {
			/* Neither micro nor fatzap header: corruption. */
			winner = NULL;	/* No actual winner here... */
			goto handle_winner;
		}
	} else {
		zap->zap_ismicro = TRUE;
	}

	/*
	 * Make sure that zap_ismicro is set before we let others see
	 * it, because zap_lockdir() checks zap_ismicro without the lock
	 * held.
	 */
	dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
	winner = dmu_buf_set_user(db, &zap->zap_dbu);

	if (winner != NULL)
		goto handle_winner;

	if (zap->zap_ismicro) {
		zap->zap_salt = zap_m_phys(zap)->mz_salt;
		zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;

		/*
		 * Reduce B-tree leaf from 4KB to 512 bytes to reduce memmove()
		 * overhead on massive inserts below.  It still allows to store
		 * 62 entries before we have to add 2KB B-tree core node.
		 */
		zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare,
		    mze_find_in_buf, sizeof (mzap_ent_t), 512);

		/* Index every in-use chunk (non-empty name) in the tree. */
		zap_name_t *zn = zap_name_alloc(zap, B_FALSE);
		for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) {
			mzap_ent_phys_t *mze =
			    &zap_m_phys(zap)->mz_chunk[i];
			if (mze->mze_name[0]) {
				zap->zap_m.zap_num_entries++;
				zap_name_init_str(zn, mze->mze_name, 0);
				mze_insert(zap, i, zn->zn_hash);
			}
		}
		zap_name_free(zn);
	} else {
		zap->zap_salt = zap_f_phys(zap)->zap_salt;
		zap->zap_normflags = zap_f_phys(zap)->zap_normflags;

		ASSERT3U(sizeof (struct zap_leaf_header), ==,
		    2*ZAP_LEAF_CHUNKSIZE);

		/*
		 * The embedded pointer table should not overlap the
		 * other members.
		 */
		ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
		    &zap_f_phys(zap)->zap_salt);

		/*
		 * The embedded pointer table should end at the end of
		 * the block
		 */
		ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
		    1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
		    (uintptr_t)zap_f_phys(zap), ==,
		    zap->zap_dbuf->db_size);
	}
	rw_exit(&zap->zap_rwlock);
	return (zap);

handle_winner:
	/* Undo our partial setup; the winner (or NULL) prevails. */
	rw_exit(&zap->zap_rwlock);
	rw_destroy(&zap->zap_rwlock);
	if (!zap->zap_ismicro)
		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
	kmem_free(zap, sizeof (zap_t));
	return (winner);
}
616
/*
 * This routine "consumes" the caller's hold on the dbuf, which must
 * have the specified tag.
 *
 * Finds (or attaches) the zap_t for "db", takes zap_rwlock in the
 * appropriate mode, and returns it in *zapp. When "adding" to a full
 * microzap, either grows the block by SPA_MINBLOCKSIZE or upgrades it
 * to a fatzap once it would exceed zap_get_micro_max_size(). On error
 * the lock is not held, and the caller must still release its dbuf
 * hold.
 */
static int
zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
{
	ASSERT0(db->db_offset);
	objset_t *os = dmu_buf_get_objset(db);
	uint64_t obj = db->db_object;

	*zapp = NULL;

	/* The object's type must byteswap as a zap to be a zap. */
	if (DMU_OT_BYTESWAP(dn->dn_type) != DMU_BSWAP_ZAP)
		return (SET_ERROR(EINVAL));

	zap_t *zap = dmu_buf_get_user(db);
	if (zap == NULL) {
		zap = mzap_open(db);
		if (zap == NULL) {
			/*
			 * mzap_open() didn't like what it saw on-disk.
			 * Check for corruption!
			 */
			return (SET_ERROR(EIO));
		}
	}

	/*
	 * We're checking zap_ismicro without the lock held, in order to
	 * tell what type of lock we want.  Once we have some sort of
	 * lock, see if it really is the right type.  In practice this
	 * can only be different if it was upgraded from micro to fat,
	 * and micro wanted WRITER but fat only needs READER.
	 */
	krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
	rw_enter(&zap->zap_rwlock, lt);
	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
		/* it was upgraded, now we only need reader */
		ASSERT(lt == RW_WRITER);
		ASSERT(RW_READER ==
		    ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
		rw_downgrade(&zap->zap_rwlock);
		lt = RW_READER;
	}

	zap->zap_objset = os;
	zap->zap_dnode = dn;

	if (lt == RW_WRITER)
		dmu_buf_will_dirty(db, tx);

	ASSERT3P(zap->zap_dbuf, ==, db);

	ASSERT(!zap->zap_ismicro ||
	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
	if (zap->zap_ismicro && tx && adding &&
	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
		/* Microzap is full: grow the block, or upgrade to fatzap. */
		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
		if (newsz > zap_get_micro_max_size(dmu_objset_spa(os))) {
			dprintf("upgrading obj %llu: num_entries=%u\n",
			    (u_longlong_t)obj, zap->zap_m.zap_num_entries);
			*zapp = zap;
			int err = mzap_upgrade(zapp, tag, tx, 0);
			if (err != 0)
				rw_exit(&zap->zap_rwlock);
			return (err);
		}
		VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
		zap->zap_m.zap_num_chunks =
		    db->db_size / MZAP_ENT_LEN - 1;

		if (newsz > SPA_OLD_MAXBLOCKSIZE) {
			dsl_dataset_t *ds = dmu_objset_ds(os);
			if (!dsl_dataset_feature_is_active(ds,
			    SPA_FEATURE_LARGE_MICROZAP)) {
				/*
				 * A microzap just grew beyond the old limit
				 * for the first time, so we have to ensure the
				 * feature flag is activated.
				 * zap_get_micro_max_size() won't let us get
				 * here if the feature is not enabled, so we
				 * don't need any other checks beforehand.
				 *
				 * Since we're in open context, we can't
				 * activate the feature directly, so we instead
				 * flag it on the dataset for next sync.
				 */
				dsl_dataset_dirty(ds, tx);
				mutex_enter(&ds->ds_lock);
				ds->ds_feature_activation
				    [SPA_FEATURE_LARGE_MICROZAP] =
				    (void *)B_TRUE;
				mutex_exit(&ds->ds_lock);
			}
		}
	}

	*zapp = zap;
	return (0);
}
719
720 static int
zap_lockdir_by_dnode(dnode_t * dn,dmu_tx_t * tx,krw_t lti,boolean_t fatreader,boolean_t adding,const void * tag,zap_t ** zapp)721 zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
722 krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
723 zap_t **zapp)
724 {
725 dmu_buf_t *db;
726 int err;
727
728 err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
729 if (err != 0)
730 return (err);
731 err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
732 if (err != 0)
733 dmu_buf_rele(db, tag);
734 else
735 VERIFY(dnode_add_ref(dn, tag));
736 return (err);
737 }
738
739 int
zap_lockdir(objset_t * os,uint64_t obj,dmu_tx_t * tx,krw_t lti,boolean_t fatreader,boolean_t adding,const void * tag,zap_t ** zapp)740 zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
741 krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
742 zap_t **zapp)
743 {
744 dnode_t *dn;
745 dmu_buf_t *db;
746 int err;
747
748 err = dnode_hold(os, obj, tag, &dn);
749 if (err != 0)
750 return (err);
751 err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
752 if (err != 0) {
753 dnode_rele(dn, tag);
754 return (err);
755 }
756 err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
757 if (err != 0) {
758 dmu_buf_rele(db, tag);
759 dnode_rele(dn, tag);
760 }
761 return (err);
762 }
763
/*
 * Drop the zap lock and the dnode/dbuf holds taken under "tag" by
 * zap_lockdir() or zap_lockdir_by_dnode().
 */
void
zap_unlockdir(zap_t *zap, const void *tag)
{
	rw_exit(&zap->zap_rwlock);
	dnode_rele(zap->zap_dnode, tag);
	dmu_buf_rele(zap->zap_dbuf, tag);
}
771
/*
 * Convert a microzap into a fatzap, re-inserting every existing entry.
 * Called with zap_rwlock held as WRITER; *zapp may be replaced. "flags"
 * are the ZAP_FLAG_* bits for the new fatzap; when zero, the block size
 * is also reset to the fatzap default first.
 */
static int
mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags)
{
	int err = 0;
	zap_t *zap = *zapp;

	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	/* Snapshot the microzap contents before the block is rewritten. */
	int sz = zap->zap_dbuf->db_size;
	mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP);
	memcpy(mzp, zap->zap_dbuf->db_data, sz);
	int nchunks = zap->zap_m.zap_num_chunks;

	if (!flags) {
		err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
		    1ULL << fzap_default_block_shift, 0, tx);
		if (err != 0) {
			vmem_free(mzp, sz);
			return (err);
		}
	}

	dprintf("upgrading obj=%llu with %u chunks\n",
	    (u_longlong_t)zap->zap_object, nchunks);
	/* XXX destroy the tree later, so we can use the stored hash value */
	mze_destroy(zap);

	fzap_upgrade(zap, tx, flags);

	/* Re-add each entry, preserving its collision differentiator. */
	zap_name_t *zn = zap_name_alloc(zap, B_FALSE);
	for (int i = 0; i < nchunks; i++) {
		mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
		if (mze->mze_name[0] == 0)
			continue;
		dprintf("adding %s=%llu\n",
		    mze->mze_name, (u_longlong_t)mze->mze_value);
		zap_name_init_str(zn, mze->mze_name, 0);
		/* If we fail here, we would end up losing entries */
		VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
		    tag, tx));
		zap = zn->zn_zap;	/* fzap_add_cd() may change zap */
	}
	zap_name_free(zn);
	vmem_free(mzp, sz);
	*zapp = zap;
	return (0);
}
819
820 /*
821 * The "normflags" determine the behavior of the matchtype_t which is
822 * passed to zap_lookup_norm(). Names which have the same normalized
823 * version will be stored with the same hash value, and therefore we can
824 * perform normalization-insensitive lookups. We can be Unicode form-
825 * insensitive and/or case-insensitive. The following flags are valid for
826 * "normflags":
827 *
828 * U8_TEXTPREP_NFC
829 * U8_TEXTPREP_NFD
830 * U8_TEXTPREP_NFKC
831 * U8_TEXTPREP_NFKD
832 * U8_TEXTPREP_TOUPPER
833 *
834 * The *_NF* (Normalization Form) flags are mutually exclusive; at most one
835 * of them may be supplied.
836 */
/*
 * Initialize the block held by "dn" as an empty microzap. If any
 * ZAP_FLAG_* bits are given, immediately upgrade it to a fatzap, since
 * only fatzaps can store flags.
 */
void
mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
{
	dmu_buf_t *db;

	VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH));

	dmu_buf_will_dirty(db, tx);
	mzap_phys_t *zp = db->db_data;
	zp->mz_block_type = ZBT_MICRO;
	/*
	 * The salt seeds zap_hash(); it only needs to be unique-ish and
	 * nonzero (zap_hash() asserts nonzero), hence the "| 1".
	 */
	zp->mz_salt =
	    ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL;
	zp->mz_normflags = normflags;

	if (flags != 0) {
		zap_t *zap;
		/* Only fat zap supports flags; upgrade immediately. */
		VERIFY(dnode_add_ref(dn, FTAG));
		VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER,
		    B_FALSE, B_FALSE, &zap));
		VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
		zap_unlockdir(zap, FTAG);
	} else {
		dmu_buf_rele(db, FTAG);
	}
}
863
864 static uint64_t
zap_create_impl(objset_t * os,int normflags,zap_flags_t flags,dmu_object_type_t ot,int leaf_blockshift,int indirect_blockshift,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dnode_t ** allocated_dnode,const void * tag,dmu_tx_t * tx)865 zap_create_impl(objset_t *os, int normflags, zap_flags_t flags,
866 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
867 dmu_object_type_t bonustype, int bonuslen, int dnodesize,
868 dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
869 {
870 uint64_t obj;
871
872 ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
873
874 if (allocated_dnode == NULL) {
875 dnode_t *dn;
876 obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
877 indirect_blockshift, bonustype, bonuslen, dnodesize,
878 &dn, FTAG, tx);
879 mzap_create_impl(dn, normflags, flags, tx);
880 dnode_rele(dn, FTAG);
881 } else {
882 obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
883 indirect_blockshift, bonustype, bonuslen, dnodesize,
884 allocated_dnode, tag, tx);
885 mzap_create_impl(*allocated_dnode, normflags, flags, tx);
886 }
887
888 return (obj);
889 }
890
891 int
zap_create_claim(objset_t * os,uint64_t obj,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)892 zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
893 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
894 {
895 return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen,
896 0, tx));
897 }
898
899 int
zap_create_claim_dnsize(objset_t * os,uint64_t obj,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)900 zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
901 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
902 {
903 return (zap_create_claim_norm_dnsize(os, obj,
904 0, ot, bonustype, bonuslen, dnodesize, tx));
905 }
906
907 int
zap_create_claim_norm(objset_t * os,uint64_t obj,int normflags,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)908 zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
909 dmu_object_type_t ot,
910 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
911 {
912 return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype,
913 bonuslen, 0, tx));
914 }
915
916 int
zap_create_claim_norm_dnsize(objset_t * os,uint64_t obj,int normflags,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)917 zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
918 dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
919 int dnodesize, dmu_tx_t *tx)
920 {
921 dnode_t *dn;
922 int error;
923
924 ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
925 error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
926 dnodesize, tx);
927 if (error != 0)
928 return (error);
929
930 error = dnode_hold(os, obj, FTAG, &dn);
931 if (error != 0)
932 return (error);
933
934 mzap_create_impl(dn, normflags, 0, tx);
935
936 dnode_rele(dn, FTAG);
937
938 return (0);
939 }
940
941 uint64_t
zap_create(objset_t * os,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)942 zap_create(objset_t *os, dmu_object_type_t ot,
943 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
944 {
945 return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
946 }
947
948 uint64_t
zap_create_dnsize(objset_t * os,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)949 zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
950 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
951 {
952 return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen,
953 dnodesize, tx));
954 }
955
956 uint64_t
zap_create_norm(objset_t * os,int normflags,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)957 zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
958 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
959 {
960 return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen,
961 0, tx));
962 }
963
964 uint64_t
zap_create_norm_dnsize(objset_t * os,int normflags,dmu_object_type_t ot,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)965 zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
966 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
967 {
968 return (zap_create_impl(os, normflags, 0, ot, 0, 0,
969 bonustype, bonuslen, dnodesize, NULL, NULL, tx));
970 }
971
972 uint64_t
zap_create_flags(objset_t * os,int normflags,zap_flags_t flags,dmu_object_type_t ot,int leaf_blockshift,int indirect_blockshift,dmu_object_type_t bonustype,int bonuslen,dmu_tx_t * tx)973 zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
974 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
975 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
976 {
977 return (zap_create_flags_dnsize(os, normflags, flags, ot,
978 leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
979 }
980
981 uint64_t
zap_create_flags_dnsize(objset_t * os,int normflags,zap_flags_t flags,dmu_object_type_t ot,int leaf_blockshift,int indirect_blockshift,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dmu_tx_t * tx)982 zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
983 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
984 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
985 {
986 return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
987 indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL,
988 tx));
989 }
990
991 /*
992 * Create a zap object and return a pointer to the newly allocated dnode via
993 * the allocated_dnode argument. The returned dnode will be held and the
994 * caller is responsible for releasing the hold by calling dnode_rele().
995 */
996 uint64_t
zap_create_hold(objset_t * os,int normflags,zap_flags_t flags,dmu_object_type_t ot,int leaf_blockshift,int indirect_blockshift,dmu_object_type_t bonustype,int bonuslen,int dnodesize,dnode_t ** allocated_dnode,const void * tag,dmu_tx_t * tx)997 zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
998 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
999 dmu_object_type_t bonustype, int bonuslen, int dnodesize,
1000 dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
1001 {
1002 return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
1003 indirect_blockshift, bonustype, bonuslen, dnodesize,
1004 allocated_dnode, tag, tx));
1005 }
1006
1007 int
zap_destroy(objset_t * os,uint64_t zapobj,dmu_tx_t * tx)1008 zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
1009 {
1010 /*
1011 * dmu_object_free will free the object number and free the
1012 * data. Freeing the data will cause our pageout function to be
1013 * called, which will destroy our data (zap_leaf_t's and zap_t).
1014 */
1015
1016 return (dmu_object_free(os, zapobj, tx));
1017 }
1018
1019 void
zap_evict_sync(void * dbu)1020 zap_evict_sync(void *dbu)
1021 {
1022 zap_t *zap = dbu;
1023
1024 rw_destroy(&zap->zap_rwlock);
1025
1026 if (zap->zap_ismicro)
1027 mze_destroy(zap);
1028 else
1029 mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
1030
1031 kmem_free(zap, sizeof (zap_t));
1032 }
1033
1034 int
zap_count(objset_t * os,uint64_t zapobj,uint64_t * count)1035 zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
1036 {
1037 zap_t *zap;
1038
1039 int err =
1040 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1041 if (err != 0)
1042 return (err);
1043 if (!zap->zap_ismicro) {
1044 err = fzap_count(zap, count);
1045 } else {
1046 *count = zap->zap_m.zap_num_entries;
1047 }
1048 zap_unlockdir(zap, FTAG);
1049 return (err);
1050 }
1051
1052 int
zap_count_by_dnode(dnode_t * dn,uint64_t * count)1053 zap_count_by_dnode(dnode_t *dn, uint64_t *count)
1054 {
1055 zap_t *zap;
1056
1057 int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
1058 FTAG, &zap);
1059 if (err != 0)
1060 return (err);
1061 if (!zap->zap_ismicro) {
1062 err = fzap_count(zap, count);
1063 } else {
1064 *count = zap->zap_m.zap_num_entries;
1065 }
1066 zap_unlockdir(zap, FTAG);
1067 return (err);
1068 }
1069
1070 /*
1071 * zn may be NULL; if not specified, it will be computed if needed.
1072 * See also the comment above zap_entry_normalization_conflict().
1073 */
1074 static boolean_t
mzap_normalization_conflict(zap_t * zap,zap_name_t * zn,mzap_ent_t * mze,zfs_btree_index_t * idx)1075 mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze,
1076 zfs_btree_index_t *idx)
1077 {
1078 boolean_t allocdzn = B_FALSE;
1079 mzap_ent_t *other;
1080 zfs_btree_index_t oidx;
1081
1082 if (zap->zap_normflags == 0)
1083 return (B_FALSE);
1084
1085 for (other = zfs_btree_prev(&zap->zap_m.zap_tree, idx, &oidx);
1086 other && other->mze_hash == mze->mze_hash;
1087 other = zfs_btree_prev(&zap->zap_m.zap_tree, &oidx, &oidx)) {
1088
1089 if (zn == NULL) {
1090 zn = zap_name_alloc_str(zap,
1091 MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
1092 allocdzn = B_TRUE;
1093 }
1094 if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
1095 if (allocdzn)
1096 zap_name_free(zn);
1097 return (B_TRUE);
1098 }
1099 }
1100
1101 for (other = zfs_btree_next(&zap->zap_m.zap_tree, idx, &oidx);
1102 other && other->mze_hash == mze->mze_hash;
1103 other = zfs_btree_next(&zap->zap_m.zap_tree, &oidx, &oidx)) {
1104
1105 if (zn == NULL) {
1106 zn = zap_name_alloc_str(zap,
1107 MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
1108 allocdzn = B_TRUE;
1109 }
1110 if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
1111 if (allocdzn)
1112 zap_name_free(zn);
1113 return (B_TRUE);
1114 }
1115 }
1116
1117 if (allocdzn)
1118 zap_name_free(zn);
1119 return (B_FALSE);
1120 }
1121
1122 /*
1123 * Routines for manipulating attributes.
1124 */
1125
1126 int
zap_lookup(objset_t * os,uint64_t zapobj,const char * name,uint64_t integer_size,uint64_t num_integers,void * buf)1127 zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
1128 uint64_t integer_size, uint64_t num_integers, void *buf)
1129 {
1130 return (zap_lookup_norm(os, zapobj, name, integer_size,
1131 num_integers, buf, 0, NULL, 0, NULL));
1132 }
1133
1134 static int
zap_lookup_impl(zap_t * zap,const char * name,uint64_t integer_size,uint64_t num_integers,void * buf,matchtype_t mt,char * realname,int rn_len,boolean_t * ncp)1135 zap_lookup_impl(zap_t *zap, const char *name,
1136 uint64_t integer_size, uint64_t num_integers, void *buf,
1137 matchtype_t mt, char *realname, int rn_len,
1138 boolean_t *ncp)
1139 {
1140 int err = 0;
1141
1142 zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
1143 if (zn == NULL)
1144 return (SET_ERROR(ENOTSUP));
1145
1146 if (!zap->zap_ismicro) {
1147 err = fzap_lookup(zn, integer_size, num_integers, buf,
1148 realname, rn_len, ncp, NULL);
1149 } else {
1150 zfs_btree_index_t idx;
1151 mzap_ent_t *mze = mze_find(zn, &idx);
1152 if (mze == NULL) {
1153 err = SET_ERROR(ENOENT);
1154 } else {
1155 if (num_integers < 1) {
1156 err = SET_ERROR(EOVERFLOW);
1157 } else if (integer_size != 8) {
1158 err = SET_ERROR(EINVAL);
1159 } else {
1160 *(uint64_t *)buf =
1161 MZE_PHYS(zap, mze)->mze_value;
1162 if (realname != NULL)
1163 (void) strlcpy(realname,
1164 MZE_PHYS(zap, mze)->mze_name,
1165 rn_len);
1166 if (ncp) {
1167 *ncp = mzap_normalization_conflict(zap,
1168 zn, mze, &idx);
1169 }
1170 }
1171 }
1172 }
1173 zap_name_free(zn);
1174 return (err);
1175 }
1176
1177 int
zap_lookup_norm(objset_t * os,uint64_t zapobj,const char * name,uint64_t integer_size,uint64_t num_integers,void * buf,matchtype_t mt,char * realname,int rn_len,boolean_t * ncp)1178 zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
1179 uint64_t integer_size, uint64_t num_integers, void *buf,
1180 matchtype_t mt, char *realname, int rn_len,
1181 boolean_t *ncp)
1182 {
1183 zap_t *zap;
1184
1185 int err =
1186 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1187 if (err != 0)
1188 return (err);
1189 err = zap_lookup_impl(zap, name, integer_size,
1190 num_integers, buf, mt, realname, rn_len, ncp);
1191 zap_unlockdir(zap, FTAG);
1192 return (err);
1193 }
1194
1195 int
zap_prefetch(objset_t * os,uint64_t zapobj,const char * name)1196 zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
1197 {
1198 zap_t *zap;
1199 int err;
1200 zap_name_t *zn;
1201
1202 err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1203 if (err)
1204 return (err);
1205 zn = zap_name_alloc_str(zap, name, 0);
1206 if (zn == NULL) {
1207 zap_unlockdir(zap, FTAG);
1208 return (SET_ERROR(ENOTSUP));
1209 }
1210
1211 fzap_prefetch(zn);
1212 zap_name_free(zn);
1213 zap_unlockdir(zap, FTAG);
1214 return (err);
1215 }
1216
1217 int
zap_prefetch_object(objset_t * os,uint64_t zapobj)1218 zap_prefetch_object(objset_t *os, uint64_t zapobj)
1219 {
1220 int error;
1221 dmu_object_info_t doi;
1222
1223 error = dmu_object_info(os, zapobj, &doi);
1224 if (error == 0 && DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
1225 error = SET_ERROR(EINVAL);
1226 if (error == 0)
1227 dmu_prefetch_wait(os, zapobj, 0, doi.doi_max_offset);
1228
1229 return (error);
1230 }
1231
1232 int
zap_lookup_by_dnode(dnode_t * dn,const char * name,uint64_t integer_size,uint64_t num_integers,void * buf)1233 zap_lookup_by_dnode(dnode_t *dn, const char *name,
1234 uint64_t integer_size, uint64_t num_integers, void *buf)
1235 {
1236 return (zap_lookup_norm_by_dnode(dn, name, integer_size,
1237 num_integers, buf, 0, NULL, 0, NULL));
1238 }
1239
1240 int
zap_lookup_norm_by_dnode(dnode_t * dn,const char * name,uint64_t integer_size,uint64_t num_integers,void * buf,matchtype_t mt,char * realname,int rn_len,boolean_t * ncp)1241 zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
1242 uint64_t integer_size, uint64_t num_integers, void *buf,
1243 matchtype_t mt, char *realname, int rn_len,
1244 boolean_t *ncp)
1245 {
1246 zap_t *zap;
1247
1248 int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
1249 FTAG, &zap);
1250 if (err != 0)
1251 return (err);
1252 err = zap_lookup_impl(zap, name, integer_size,
1253 num_integers, buf, mt, realname, rn_len, ncp);
1254 zap_unlockdir(zap, FTAG);
1255 return (err);
1256 }
1257
1258 static int
zap_prefetch_uint64_impl(zap_t * zap,const uint64_t * key,int key_numints)1259 zap_prefetch_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints)
1260 {
1261 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1262 if (zn == NULL) {
1263 zap_unlockdir(zap, FTAG);
1264 return (SET_ERROR(ENOTSUP));
1265 }
1266
1267 fzap_prefetch(zn);
1268 zap_name_free(zn);
1269 zap_unlockdir(zap, FTAG);
1270 return (0);
1271 }
1272
1273 int
zap_prefetch_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints)1274 zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1275 int key_numints)
1276 {
1277 zap_t *zap;
1278
1279 int err =
1280 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1281 if (err != 0)
1282 return (err);
1283 err = zap_prefetch_uint64_impl(zap, key, key_numints);
1284 /* zap_prefetch_uint64_impl() calls zap_unlockdir() */
1285 return (err);
1286 }
1287
1288 int
zap_prefetch_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints)1289 zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints)
1290 {
1291 zap_t *zap;
1292
1293 int err =
1294 zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1295 if (err != 0)
1296 return (err);
1297 err = zap_prefetch_uint64_impl(zap, key, key_numints);
1298 /* zap_prefetch_uint64_impl() calls zap_unlockdir() */
1299 return (err);
1300 }
1301
1302 static int
zap_lookup_length_uint64_impl(zap_t * zap,const uint64_t * key,int key_numints,uint64_t integer_size,uint64_t num_integers,void * buf,uint64_t * actual_num_integers)1303 zap_lookup_length_uint64_impl(zap_t *zap, const uint64_t *key,
1304 int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf,
1305 uint64_t *actual_num_integers)
1306 {
1307 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1308 if (zn == NULL) {
1309 zap_unlockdir(zap, FTAG);
1310 return (SET_ERROR(ENOTSUP));
1311 }
1312
1313 int err = fzap_lookup(zn, integer_size, num_integers, buf,
1314 NULL, 0, NULL, actual_num_integers);
1315 zap_name_free(zn);
1316 zap_unlockdir(zap, FTAG);
1317 return (err);
1318 }
1319
1320 int
zap_lookup_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,uint64_t integer_size,uint64_t num_integers,void * buf)1321 zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1322 int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
1323 {
1324 zap_t *zap;
1325
1326 int err =
1327 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1328 if (err != 0)
1329 return (err);
1330 err = zap_lookup_length_uint64_impl(zap, key, key_numints,
1331 integer_size, num_integers, buf, NULL);
1332 /* zap_lookup_length_uint64_impl() calls zap_unlockdir() */
1333 return (err);
1334 }
1335
1336 int
zap_lookup_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints,uint64_t integer_size,uint64_t num_integers,void * buf)1337 zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
1338 int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
1339 {
1340 zap_t *zap;
1341
1342 int err =
1343 zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1344 if (err != 0)
1345 return (err);
1346 err = zap_lookup_length_uint64_impl(zap, key, key_numints,
1347 integer_size, num_integers, buf, NULL);
1348 /* zap_lookup_length_uint64_impl() calls zap_unlockdir() */
1349 return (err);
1350 }
1351
1352 int
zap_lookup_length_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints,uint64_t integer_size,uint64_t num_integers,void * buf,uint64_t * actual_num_integers)1353 zap_lookup_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
1354 int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf,
1355 uint64_t *actual_num_integers)
1356 {
1357 zap_t *zap;
1358
1359 int err =
1360 zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1361 if (err != 0)
1362 return (err);
1363 err = zap_lookup_length_uint64_impl(zap, key, key_numints,
1364 integer_size, num_integers, buf, actual_num_integers);
1365 /* zap_lookup_length_uint64_impl() calls zap_unlockdir() */
1366 return (err);
1367 }
1368
1369 int
zap_contains(objset_t * os,uint64_t zapobj,const char * name)1370 zap_contains(objset_t *os, uint64_t zapobj, const char *name)
1371 {
1372 int err = zap_lookup_norm(os, zapobj, name, 0,
1373 0, NULL, 0, NULL, 0, NULL);
1374 if (err == EOVERFLOW || err == EINVAL)
1375 err = 0; /* found, but skipped reading the value */
1376 return (err);
1377 }
1378
1379 int
zap_length(objset_t * os,uint64_t zapobj,const char * name,uint64_t * integer_size,uint64_t * num_integers)1380 zap_length(objset_t *os, uint64_t zapobj, const char *name,
1381 uint64_t *integer_size, uint64_t *num_integers)
1382 {
1383 zap_t *zap;
1384
1385 int err =
1386 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1387 if (err != 0)
1388 return (err);
1389 zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
1390 if (zn == NULL) {
1391 zap_unlockdir(zap, FTAG);
1392 return (SET_ERROR(ENOTSUP));
1393 }
1394 if (!zap->zap_ismicro) {
1395 err = fzap_length(zn, integer_size, num_integers);
1396 } else {
1397 zfs_btree_index_t idx;
1398 mzap_ent_t *mze = mze_find(zn, &idx);
1399 if (mze == NULL) {
1400 err = SET_ERROR(ENOENT);
1401 } else {
1402 if (integer_size)
1403 *integer_size = 8;
1404 if (num_integers)
1405 *num_integers = 1;
1406 }
1407 }
1408 zap_name_free(zn);
1409 zap_unlockdir(zap, FTAG);
1410 return (err);
1411 }
1412
1413 int
zap_length_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,uint64_t * integer_size,uint64_t * num_integers)1414 zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1415 int key_numints, uint64_t *integer_size, uint64_t *num_integers)
1416 {
1417 zap_t *zap;
1418
1419 int err =
1420 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1421 if (err != 0)
1422 return (err);
1423 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1424 if (zn == NULL) {
1425 zap_unlockdir(zap, FTAG);
1426 return (SET_ERROR(ENOTSUP));
1427 }
1428 err = fzap_length(zn, integer_size, num_integers);
1429 zap_name_free(zn);
1430 zap_unlockdir(zap, FTAG);
1431 return (err);
1432 }
1433
1434 int
zap_length_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints,uint64_t * integer_size,uint64_t * num_integers)1435 zap_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
1436 int key_numints, uint64_t *integer_size, uint64_t *num_integers)
1437 {
1438 zap_t *zap;
1439
1440 int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
1441 FTAG, &zap);
1442 if (err != 0)
1443 return (err);
1444 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1445 if (zn == NULL) {
1446 zap_unlockdir(zap, FTAG);
1447 return (SET_ERROR(ENOTSUP));
1448 }
1449 err = fzap_length(zn, integer_size, num_integers);
1450 zap_name_free(zn);
1451 zap_unlockdir(zap, FTAG);
1452 return (err);
1453 }
1454
1455 static void
mzap_addent(zap_name_t * zn,uint64_t value)1456 mzap_addent(zap_name_t *zn, uint64_t value)
1457 {
1458 zap_t *zap = zn->zn_zap;
1459 uint16_t start = zap->zap_m.zap_alloc_next;
1460
1461 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
1462
1463 #ifdef ZFS_DEBUG
1464 for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
1465 mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
1466 ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
1467 }
1468 #endif
1469
1470 uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash);
1471 /* given the limited size of the microzap, this can't happen */
1472 ASSERT(cd < zap_maxcd(zap));
1473
1474 again:
1475 for (uint16_t i = start; i < zap->zap_m.zap_num_chunks; i++) {
1476 mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
1477 if (mze->mze_name[0] == 0) {
1478 mze->mze_value = value;
1479 mze->mze_cd = cd;
1480 (void) strlcpy(mze->mze_name, zn->zn_key_orig,
1481 sizeof (mze->mze_name));
1482 zap->zap_m.zap_num_entries++;
1483 zap->zap_m.zap_alloc_next = i+1;
1484 if (zap->zap_m.zap_alloc_next ==
1485 zap->zap_m.zap_num_chunks)
1486 zap->zap_m.zap_alloc_next = 0;
1487 mze_insert(zap, i, zn->zn_hash);
1488 return;
1489 }
1490 }
1491 if (start != 0) {
1492 start = 0;
1493 goto again;
1494 }
1495 cmn_err(CE_PANIC, "out of entries!");
1496 }
1497
1498 static int
zap_add_impl(zap_t * zap,const char * key,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx,const void * tag)1499 zap_add_impl(zap_t *zap, const char *key,
1500 int integer_size, uint64_t num_integers,
1501 const void *val, dmu_tx_t *tx, const void *tag)
1502 {
1503 const uint64_t *intval = val;
1504 int err = 0;
1505
1506 zap_name_t *zn = zap_name_alloc_str(zap, key, 0);
1507 if (zn == NULL) {
1508 zap_unlockdir(zap, tag);
1509 return (SET_ERROR(ENOTSUP));
1510 }
1511 if (!zap->zap_ismicro) {
1512 err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
1513 zap = zn->zn_zap; /* fzap_add() may change zap */
1514 } else if (integer_size != 8 || num_integers != 1 ||
1515 strlen(key) >= MZAP_NAME_LEN ||
1516 !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
1517 err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
1518 if (err == 0) {
1519 err = fzap_add(zn, integer_size, num_integers, val,
1520 tag, tx);
1521 }
1522 zap = zn->zn_zap; /* fzap_add() may change zap */
1523 } else {
1524 zfs_btree_index_t idx;
1525 if (mze_find(zn, &idx) != NULL) {
1526 err = SET_ERROR(EEXIST);
1527 } else {
1528 mzap_addent(zn, *intval);
1529 }
1530 }
1531 ASSERT(zap == zn->zn_zap);
1532 zap_name_free(zn);
1533 if (zap != NULL) /* may be NULL if fzap_add() failed */
1534 zap_unlockdir(zap, tag);
1535 return (err);
1536 }
1537
1538 int
zap_add(objset_t * os,uint64_t zapobj,const char * key,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1539 zap_add(objset_t *os, uint64_t zapobj, const char *key,
1540 int integer_size, uint64_t num_integers,
1541 const void *val, dmu_tx_t *tx)
1542 {
1543 zap_t *zap;
1544 int err;
1545
1546 err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1547 if (err != 0)
1548 return (err);
1549 err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
1550 /* zap_add_impl() calls zap_unlockdir() */
1551 return (err);
1552 }
1553
1554 int
zap_add_by_dnode(dnode_t * dn,const char * key,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1555 zap_add_by_dnode(dnode_t *dn, const char *key,
1556 int integer_size, uint64_t num_integers,
1557 const void *val, dmu_tx_t *tx)
1558 {
1559 zap_t *zap;
1560 int err;
1561
1562 err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1563 if (err != 0)
1564 return (err);
1565 err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
1566 /* zap_add_impl() calls zap_unlockdir() */
1567 return (err);
1568 }
1569
1570 static int
zap_add_uint64_impl(zap_t * zap,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx,const void * tag)1571 zap_add_uint64_impl(zap_t *zap, const uint64_t *key,
1572 int key_numints, int integer_size, uint64_t num_integers,
1573 const void *val, dmu_tx_t *tx, const void *tag)
1574 {
1575 int err;
1576
1577 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1578 if (zn == NULL) {
1579 zap_unlockdir(zap, tag);
1580 return (SET_ERROR(ENOTSUP));
1581 }
1582 err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
1583 zap = zn->zn_zap; /* fzap_add() may change zap */
1584 zap_name_free(zn);
1585 if (zap != NULL) /* may be NULL if fzap_add() failed */
1586 zap_unlockdir(zap, tag);
1587 return (err);
1588 }
1589
1590 int
zap_add_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1591 zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1592 int key_numints, int integer_size, uint64_t num_integers,
1593 const void *val, dmu_tx_t *tx)
1594 {
1595 zap_t *zap;
1596
1597 int err =
1598 zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1599 if (err != 0)
1600 return (err);
1601 err = zap_add_uint64_impl(zap, key, key_numints,
1602 integer_size, num_integers, val, tx, FTAG);
1603 /* zap_add_uint64_impl() calls zap_unlockdir() */
1604 return (err);
1605 }
1606
1607 int
zap_add_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1608 zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
1609 int key_numints, int integer_size, uint64_t num_integers,
1610 const void *val, dmu_tx_t *tx)
1611 {
1612 zap_t *zap;
1613
1614 int err =
1615 zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1616 if (err != 0)
1617 return (err);
1618 err = zap_add_uint64_impl(zap, key, key_numints,
1619 integer_size, num_integers, val, tx, FTAG);
1620 /* zap_add_uint64_impl() calls zap_unlockdir() */
1621 return (err);
1622 }
1623
1624 int
zap_update(objset_t * os,uint64_t zapobj,const char * name,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1625 zap_update(objset_t *os, uint64_t zapobj, const char *name,
1626 int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
1627 {
1628 zap_t *zap;
1629 const uint64_t *intval = val;
1630
1631 int err =
1632 zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1633 if (err != 0)
1634 return (err);
1635 zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
1636 if (zn == NULL) {
1637 zap_unlockdir(zap, FTAG);
1638 return (SET_ERROR(ENOTSUP));
1639 }
1640 if (!zap->zap_ismicro) {
1641 err = fzap_update(zn, integer_size, num_integers, val,
1642 FTAG, tx);
1643 zap = zn->zn_zap; /* fzap_update() may change zap */
1644 } else if (integer_size != 8 || num_integers != 1 ||
1645 strlen(name) >= MZAP_NAME_LEN) {
1646 dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
1647 (u_longlong_t)zapobj, integer_size,
1648 (u_longlong_t)num_integers, name);
1649 err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
1650 if (err == 0) {
1651 err = fzap_update(zn, integer_size, num_integers,
1652 val, FTAG, tx);
1653 }
1654 zap = zn->zn_zap; /* fzap_update() may change zap */
1655 } else {
1656 zfs_btree_index_t idx;
1657 mzap_ent_t *mze = mze_find(zn, &idx);
1658 if (mze != NULL) {
1659 MZE_PHYS(zap, mze)->mze_value = *intval;
1660 } else {
1661 mzap_addent(zn, *intval);
1662 }
1663 }
1664 ASSERT(zap == zn->zn_zap);
1665 zap_name_free(zn);
1666 if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
1667 zap_unlockdir(zap, FTAG);
1668 return (err);
1669 }
1670
1671 static int
zap_update_uint64_impl(zap_t * zap,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx,const void * tag)1672 zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
1673 int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx,
1674 const void *tag)
1675 {
1676 int err;
1677
1678 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1679 if (zn == NULL) {
1680 zap_unlockdir(zap, tag);
1681 return (SET_ERROR(ENOTSUP));
1682 }
1683 err = fzap_update(zn, integer_size, num_integers, val, tag, tx);
1684 zap = zn->zn_zap; /* fzap_update() may change zap */
1685 zap_name_free(zn);
1686 if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
1687 zap_unlockdir(zap, tag);
1688 return (err);
1689 }
1690
1691 int
zap_update_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1692 zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1693 int key_numints, int integer_size, uint64_t num_integers, const void *val,
1694 dmu_tx_t *tx)
1695 {
1696 zap_t *zap;
1697
1698 int err =
1699 zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1700 if (err != 0)
1701 return (err);
1702 err = zap_update_uint64_impl(zap, key, key_numints,
1703 integer_size, num_integers, val, tx, FTAG);
1704 /* zap_update_uint64_impl() calls zap_unlockdir() */
1705 return (err);
1706 }
1707
1708 int
zap_update_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints,int integer_size,uint64_t num_integers,const void * val,dmu_tx_t * tx)1709 zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
1710 int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
1711 {
1712 zap_t *zap;
1713
1714 int err =
1715 zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1716 if (err != 0)
1717 return (err);
1718 err = zap_update_uint64_impl(zap, key, key_numints,
1719 integer_size, num_integers, val, tx, FTAG);
1720 /* zap_update_uint64_impl() calls zap_unlockdir() */
1721 return (err);
1722 }
1723
1724 int
zap_remove(objset_t * os,uint64_t zapobj,const char * name,dmu_tx_t * tx)1725 zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
1726 {
1727 return (zap_remove_norm(os, zapobj, name, 0, tx));
1728 }
1729
1730 static int
zap_remove_impl(zap_t * zap,const char * name,matchtype_t mt,dmu_tx_t * tx)1731 zap_remove_impl(zap_t *zap, const char *name,
1732 matchtype_t mt, dmu_tx_t *tx)
1733 {
1734 int err = 0;
1735
1736 zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
1737 if (zn == NULL)
1738 return (SET_ERROR(ENOTSUP));
1739 if (!zap->zap_ismicro) {
1740 err = fzap_remove(zn, tx);
1741 } else {
1742 zfs_btree_index_t idx;
1743 mzap_ent_t *mze = mze_find(zn, &idx);
1744 if (mze == NULL) {
1745 err = SET_ERROR(ENOENT);
1746 } else {
1747 zap->zap_m.zap_num_entries--;
1748 memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t));
1749 zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx);
1750 }
1751 }
1752 zap_name_free(zn);
1753 return (err);
1754 }
1755
1756 int
zap_remove_norm(objset_t * os,uint64_t zapobj,const char * name,matchtype_t mt,dmu_tx_t * tx)1757 zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
1758 matchtype_t mt, dmu_tx_t *tx)
1759 {
1760 zap_t *zap;
1761 int err;
1762
1763 err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1764 if (err)
1765 return (err);
1766 err = zap_remove_impl(zap, name, mt, tx);
1767 zap_unlockdir(zap, FTAG);
1768 return (err);
1769 }
1770
1771 int
zap_remove_by_dnode(dnode_t * dn,const char * name,dmu_tx_t * tx)1772 zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
1773 {
1774 zap_t *zap;
1775 int err;
1776
1777 err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1778 if (err)
1779 return (err);
1780 err = zap_remove_impl(zap, name, 0, tx);
1781 zap_unlockdir(zap, FTAG);
1782 return (err);
1783 }
1784
1785 static int
zap_remove_uint64_impl(zap_t * zap,const uint64_t * key,int key_numints,dmu_tx_t * tx,const void * tag)1786 zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
1787 dmu_tx_t *tx, const void *tag)
1788 {
1789 int err;
1790
1791 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1792 if (zn == NULL) {
1793 zap_unlockdir(zap, tag);
1794 return (SET_ERROR(ENOTSUP));
1795 }
1796 err = fzap_remove(zn, tx);
1797 zap_name_free(zn);
1798 zap_unlockdir(zap, tag);
1799 return (err);
1800 }
1801
1802 int
zap_remove_uint64(objset_t * os,uint64_t zapobj,const uint64_t * key,int key_numints,dmu_tx_t * tx)1803 zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1804 int key_numints, dmu_tx_t *tx)
1805 {
1806 zap_t *zap;
1807
1808 int err =
1809 zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1810 if (err != 0)
1811 return (err);
1812 err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
1813 /* zap_remove_uint64_impl() calls zap_unlockdir() */
1814 return (err);
1815 }
1816
1817 int
zap_remove_uint64_by_dnode(dnode_t * dn,const uint64_t * key,int key_numints,dmu_tx_t * tx)1818 zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
1819 dmu_tx_t *tx)
1820 {
1821 zap_t *zap;
1822
1823 int err =
1824 zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1825 if (err != 0)
1826 return (err);
1827 err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
1828 /* zap_remove_uint64_impl() calls zap_unlockdir() */
1829 return (err);
1830 }
1831
1832
1833 static zap_attribute_t *
zap_attribute_alloc_impl(boolean_t longname)1834 zap_attribute_alloc_impl(boolean_t longname)
1835 {
1836 zap_attribute_t *za;
1837
1838 za = kmem_cache_alloc((longname)? zap_attr_long_cache : zap_attr_cache,
1839 KM_SLEEP);
1840 za->za_name_len = (longname)? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN;
1841 return (za);
1842 }
1843
1844 zap_attribute_t *
zap_attribute_alloc(void)1845 zap_attribute_alloc(void)
1846 {
1847 return (zap_attribute_alloc_impl(B_FALSE));
1848 }
1849
1850 zap_attribute_t *
zap_attribute_long_alloc(void)1851 zap_attribute_long_alloc(void)
1852 {
1853 return (zap_attribute_alloc_impl(B_TRUE));
1854 }
1855
1856 void
zap_attribute_free(zap_attribute_t * za)1857 zap_attribute_free(zap_attribute_t *za)
1858 {
1859 if (za->za_name_len == ZAP_MAXNAMELEN) {
1860 kmem_cache_free(zap_attr_cache, za);
1861 } else {
1862 ASSERT3U(za->za_name_len, ==, ZAP_MAXNAMELEN_NEW);
1863 kmem_cache_free(zap_attr_long_cache, za);
1864 }
1865 }
1866
1867 /*
1868 * Routines for iterating over the attributes.
1869 */
1870
1871 static void
zap_cursor_init_impl(zap_cursor_t * zc,objset_t * os,uint64_t zapobj,uint64_t serialized,boolean_t prefetch)1872 zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
1873 uint64_t serialized, boolean_t prefetch)
1874 {
1875 zc->zc_objset = os;
1876 zc->zc_zap = NULL;
1877 zc->zc_leaf = NULL;
1878 zc->zc_zapobj = zapobj;
1879 zc->zc_serialized = serialized;
1880 zc->zc_hash = 0;
1881 zc->zc_cd = 0;
1882 zc->zc_prefetch = prefetch;
1883 }
1884 void
zap_cursor_init_serialized(zap_cursor_t * zc,objset_t * os,uint64_t zapobj,uint64_t serialized)1885 zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
1886 uint64_t serialized)
1887 {
1888 zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
1889 }
1890
1891 /*
1892 * Initialize a cursor at the beginning of the ZAP object. The entire
1893 * ZAP object will be prefetched.
1894 */
1895 void
zap_cursor_init(zap_cursor_t * zc,objset_t * os,uint64_t zapobj)1896 zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
1897 {
1898 zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
1899 }
1900
1901 /*
1902 * Initialize a cursor at the beginning, but request that we not prefetch
1903 * the entire ZAP object.
1904 */
1905 void
zap_cursor_init_noprefetch(zap_cursor_t * zc,objset_t * os,uint64_t zapobj)1906 zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
1907 {
1908 zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
1909 }
1910
1911 void
zap_cursor_fini(zap_cursor_t * zc)1912 zap_cursor_fini(zap_cursor_t *zc)
1913 {
1914 if (zc->zc_zap) {
1915 rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
1916 zap_unlockdir(zc->zc_zap, NULL);
1917 zc->zc_zap = NULL;
1918 }
1919 if (zc->zc_leaf) {
1920 rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
1921 zap_put_leaf(zc->zc_leaf);
1922 zc->zc_leaf = NULL;
1923 }
1924 zc->zc_objset = NULL;
1925 }
1926
1927 uint64_t
zap_cursor_serialize(zap_cursor_t * zc)1928 zap_cursor_serialize(zap_cursor_t *zc)
1929 {
1930 if (zc->zc_hash == -1ULL)
1931 return (-1ULL);
1932 if (zc->zc_zap == NULL)
1933 return (zc->zc_serialized);
1934 ASSERT0((zc->zc_hash & zap_maxcd(zc->zc_zap)));
1935 ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
1936
1937 /*
1938 * We want to keep the high 32 bits of the cursor zero if we can, so
1939 * that 32-bit programs can access this. So usually use a small
1940 * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
1941 * of the cursor.
1942 *
1943 * [ collision differentiator | zap_hashbits()-bit hash value ]
1944 */
1945 return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
1946 ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
1947 }
1948
/*
 * Fetch the attribute at the cursor's current position into *za.
 * Returns 0 on success, ENOENT once the cursor is exhausted, or an
 * error from zap_lockdir().  Does not advance the cursor.
 */
int
zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
{
	int err;

	/* zc_hash == -1ULL marks a cursor that has run off the end. */
	if (zc->zc_hash == -1ULL)
		return (SET_ERROR(ENOENT));

	if (zc->zc_zap == NULL) {
		int hb;
		/* First retrieve since init: take a hold on the ZAP. */
		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
		    RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
		if (err != 0)
			return (err);

		/*
		 * To support zap_cursor_init_serialized, advance, retrieve,
		 * we must add to the existing zc_cd, which may already
		 * be 1 due to the zap_cursor_advance.
		 */
		ASSERT0(zc->zc_hash);
		/* Unpack the serialized cookie into its hash and cd parts. */
		hb = zap_hashbits(zc->zc_zap);
		zc->zc_hash = zc->zc_serialized << (64 - hb);
		zc->zc_cd += zc->zc_serialized >> hb;
		if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
			zc->zc_cd = 0;
	} else {
		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
	}
	if (!zc->zc_zap->zap_ismicro) {
		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t mze_tofind;

		/* Microzap entries are keyed on the high 32 hash bits. */
		mze_tofind.mze_hash = zc->zc_hash >> 32;
		mze_tofind.mze_cd = zc->zc_cd;

		mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree,
		    &mze_tofind, &idx);
		if (mze == NULL) {
			/* No exact match; take the next entry in tree order. */
			mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree,
			    &idx, &idx);
		}
		if (mze) {
			mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
			ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
			za->za_normalization_conflict =
			    mzap_normalization_conflict(zc->zc_zap, NULL,
			    mze, &idx);
			/* A microzap value is always a single uint64. */
			za->za_integer_length = 8;
			za->za_num_integers = 1;
			za->za_first_integer = mzep->mze_value;
			(void) strlcpy(za->za_name, mzep->mze_name,
			    za->za_name_len);
			/* Record the position so advance/retrieve can resume. */
			zc->zc_hash = (uint64_t)mze->mze_hash << 32;
			zc->zc_cd = mze->mze_cd;
			err = 0;
		} else {
			/* Tree exhausted: park the cursor at the end. */
			zc->zc_hash = -1ULL;
			err = SET_ERROR(ENOENT);
		}
	}
	rw_exit(&zc->zc_zap->zap_rwlock);
	return (err);
}
2015
2016 void
zap_cursor_advance(zap_cursor_t * zc)2017 zap_cursor_advance(zap_cursor_t *zc)
2018 {
2019 if (zc->zc_hash == -1ULL)
2020 return;
2021 zc->zc_cd++;
2022 }
2023
2024 int
zap_get_stats(objset_t * os,uint64_t zapobj,zap_stats_t * zs)2025 zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
2026 {
2027 zap_t *zap;
2028
2029 int err =
2030 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
2031 if (err != 0)
2032 return (err);
2033
2034 memset(zs, 0, sizeof (zap_stats_t));
2035
2036 if (zap->zap_ismicro) {
2037 zs->zs_blocksize = zap->zap_dbuf->db_size;
2038 zs->zs_num_entries = zap->zap_m.zap_num_entries;
2039 zs->zs_num_blocks = 1;
2040 } else {
2041 fzap_get_stats(zap, zs);
2042 }
2043 zap_unlockdir(zap, FTAG);
2044 return (0);
2045 }
2046
#if defined(_KERNEL)
/* Symbols exported for use by other kernel modules. */
EXPORT_SYMBOL(zap_create);
EXPORT_SYMBOL(zap_create_dnsize);
EXPORT_SYMBOL(zap_create_norm);
EXPORT_SYMBOL(zap_create_norm_dnsize);
EXPORT_SYMBOL(zap_create_flags);
EXPORT_SYMBOL(zap_create_flags_dnsize);
EXPORT_SYMBOL(zap_create_claim);
EXPORT_SYMBOL(zap_create_claim_norm);
EXPORT_SYMBOL(zap_create_claim_norm_dnsize);
EXPORT_SYMBOL(zap_create_hold);
EXPORT_SYMBOL(zap_destroy);
EXPORT_SYMBOL(zap_lookup);
EXPORT_SYMBOL(zap_lookup_by_dnode);
EXPORT_SYMBOL(zap_lookup_norm);
EXPORT_SYMBOL(zap_lookup_uint64);
EXPORT_SYMBOL(zap_lookup_length_uint64_by_dnode);
EXPORT_SYMBOL(zap_contains);
EXPORT_SYMBOL(zap_prefetch);
EXPORT_SYMBOL(zap_prefetch_uint64);
EXPORT_SYMBOL(zap_prefetch_object);
EXPORT_SYMBOL(zap_add);
EXPORT_SYMBOL(zap_add_by_dnode);
EXPORT_SYMBOL(zap_add_uint64);
EXPORT_SYMBOL(zap_add_uint64_by_dnode);
EXPORT_SYMBOL(zap_update);
EXPORT_SYMBOL(zap_update_uint64);
EXPORT_SYMBOL(zap_update_uint64_by_dnode);
EXPORT_SYMBOL(zap_length);
EXPORT_SYMBOL(zap_length_uint64);
EXPORT_SYMBOL(zap_length_uint64_by_dnode);
EXPORT_SYMBOL(zap_remove);
EXPORT_SYMBOL(zap_remove_by_dnode);
EXPORT_SYMBOL(zap_remove_norm);
EXPORT_SYMBOL(zap_remove_uint64);
EXPORT_SYMBOL(zap_remove_uint64_by_dnode);
EXPORT_SYMBOL(zap_count);
EXPORT_SYMBOL(zap_count_by_dnode);
EXPORT_SYMBOL(zap_value_search);
EXPORT_SYMBOL(zap_join);
EXPORT_SYMBOL(zap_join_increment);
EXPORT_SYMBOL(zap_add_int);
EXPORT_SYMBOL(zap_remove_int);
EXPORT_SYMBOL(zap_lookup_int);
EXPORT_SYMBOL(zap_increment_int);
EXPORT_SYMBOL(zap_add_int_key);
EXPORT_SYMBOL(zap_lookup_int_key);
EXPORT_SYMBOL(zap_increment);
EXPORT_SYMBOL(zap_cursor_init);
EXPORT_SYMBOL(zap_cursor_fini);
EXPORT_SYMBOL(zap_cursor_retrieve);
EXPORT_SYMBOL(zap_cursor_advance);
EXPORT_SYMBOL(zap_cursor_serialize);
EXPORT_SYMBOL(zap_cursor_init_serialized);
EXPORT_SYMBOL(zap_get_stats);

/* See the block comment at the top of this file for this tunable's limits. */
ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW,
	"Maximum micro ZAP size before converting to a fat ZAP, "
	"in bytes (max 1M)");
#endif
2107