1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23 /*
24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
27 * Copyright 2017 Nexenta Systems, Inc.
28 * Copyright (c) 2024, Klara, Inc.
29 */
30
31 #include <sys/zio.h>
32 #include <sys/spa.h>
33 #include <sys/dmu.h>
34 #include <sys/zfs_context.h>
35 #include <sys/zap.h>
36 #include <sys/zap_impl.h>
37 #include <sys/zap_leaf.h>
38 #include <sys/btree.h>
39 #include <sys/arc.h>
40 #include <sys/dmu_objset.h>
41 #include <sys/spa_impl.h>
42
43 #ifdef _KERNEL
44 #include <sys/sunddi.h>
45 #endif
46
47 /*
48 * The maximum size (in bytes) of a microzap before it is converted to a
49 * fatzap. It will be rounded up to next multiple of 512 (SPA_MINBLOCKSIZE).
50 *
51 * By definition, a microzap must fit into a single block, so this has
52 * traditionally been SPA_OLD_MAXBLOCKSIZE, and is set to that by default.
53 * Setting this higher requires both the large_blocks feature (to even create
54 * blocks that large) and the large_microzap feature (to enable the stream
55 * machinery to understand not to try to split a microzap block).
56 *
57 * If large_microzap is enabled, this value will be clamped to
58 * spa_maxblocksize(), up to 1M. If not, it will be clamped to
59 * SPA_OLD_MAXBLOCKSIZE.
60 */
61 static int zap_micro_max_size = SPA_OLD_MAXBLOCKSIZE;
62
63 /*
64 * The 1M upper limit is necessary because the count of chunks in a microzap
65 * block is stored as a uint16_t (mze_chunkid). Each chunk is 64 bytes, and the
66 * first is used to store a header, so there are 32767 usable chunks, which is
67 * just under 2M. 1M is the largest power-2-rounded block size under 2M, so we
68 * must set the limit there.
69 */
70 #define MZAP_MAX_SIZE (1048576)
71
72 uint64_t
zap_get_micro_max_size(spa_t * spa)73 zap_get_micro_max_size(spa_t *spa)
74 {
75 uint64_t maxsz = MIN(MZAP_MAX_SIZE,
76 P2ROUNDUP(zap_micro_max_size, SPA_MINBLOCKSIZE));
77 if (maxsz <= SPA_OLD_MAXBLOCKSIZE)
78 return (maxsz);
79 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_MICROZAP))
80 return (MIN(maxsz, spa_maxblocksize(spa)));
81 return (SPA_OLD_MAXBLOCKSIZE);
82 }
83
84 void
mzap_byteswap(mzap_phys_t * buf,size_t size)85 mzap_byteswap(mzap_phys_t *buf, size_t size)
86 {
87 buf->mz_block_type = BSWAP_64(buf->mz_block_type);
88 buf->mz_salt = BSWAP_64(buf->mz_salt);
89 buf->mz_normflags = BSWAP_64(buf->mz_normflags);
90 int max = (size / MZAP_ENT_LEN) - 1;
91 for (int i = 0; i < max; i++) {
92 buf->mz_chunk[i].mze_value =
93 BSWAP_64(buf->mz_chunk[i].mze_value);
94 buf->mz_chunk[i].mze_cd =
95 BSWAP_32(buf->mz_chunk[i].mze_cd);
96 }
97 }
98
99 __attribute__((always_inline)) inline
100 static int
mze_compare(const void * arg1,const void * arg2)101 mze_compare(const void *arg1, const void *arg2)
102 {
103 const mzap_ent_t *mze1 = arg1;
104 const mzap_ent_t *mze2 = arg2;
105
106 return (TREE_CMP((uint64_t)(mze1->mze_hash) << 32 | mze1->mze_cd,
107 (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd));
108 }
109
ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf,mzap_ent_t,mze_compare)110 ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t,
111 mze_compare)
112
113 static void
114 mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash)
115 {
116 mzap_ent_t mze;
117
118 ASSERT(zap->zap_ismicro);
119 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
120
121 mze.mze_chunkid = chunkid;
122 ASSERT0(hash & 0xffffffff);
123 mze.mze_hash = hash >> 32;
124 ASSERT3U(MZE_PHYS(zap, &mze)->mze_cd, <=, 0xffff);
125 mze.mze_cd = (uint16_t)MZE_PHYS(zap, &mze)->mze_cd;
126 ASSERT(MZE_PHYS(zap, &mze)->mze_name[0] != 0);
127 zfs_btree_add(&zap->zap_m.zap_tree, &mze);
128 }
129
130 mzap_ent_t *
mze_find(zap_name_t * zn,zfs_btree_index_t * idx)131 mze_find(zap_name_t *zn, zfs_btree_index_t *idx)
132 {
133 mzap_ent_t mze_tofind;
134 mzap_ent_t *mze;
135 zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree;
136
137 ASSERT(zn->zn_zap->zap_ismicro);
138 ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
139
140 ASSERT0(zn->zn_hash & 0xffffffff);
141 mze_tofind.mze_hash = zn->zn_hash >> 32;
142 mze_tofind.mze_cd = 0;
143
144 mze = zfs_btree_find(tree, &mze_tofind, idx);
145 if (mze == NULL)
146 mze = zfs_btree_next(tree, idx, idx);
147 for (; mze && mze->mze_hash == mze_tofind.mze_hash;
148 mze = zfs_btree_next(tree, idx, idx)) {
149 ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
150 if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
151 return (mze);
152 }
153
154 return (NULL);
155 }
156
157 static uint32_t
mze_find_unused_cd(zap_t * zap,uint64_t hash)158 mze_find_unused_cd(zap_t *zap, uint64_t hash)
159 {
160 mzap_ent_t mze_tofind;
161 zfs_btree_index_t idx;
162 zfs_btree_t *tree = &zap->zap_m.zap_tree;
163
164 ASSERT(zap->zap_ismicro);
165 ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
166
167 ASSERT0(hash & 0xffffffff);
168 hash >>= 32;
169 mze_tofind.mze_hash = hash;
170 mze_tofind.mze_cd = 0;
171
172 uint32_t cd = 0;
173 for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
174 mze && mze->mze_hash == hash;
175 mze = zfs_btree_next(tree, &idx, &idx)) {
176 if (mze->mze_cd != cd)
177 break;
178 cd++;
179 }
180
181 return (cd);
182 }
183
184 /*
185 * Each mzap entry requires at max : 4 chunks
186 * 3 chunks for names + 1 chunk for value.
187 */
188 #define MZAP_ENT_CHUNKS (1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \
189 ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t)))
190
191 /*
192 * Check if the current entry keeps the colliding entries under the fatzap leaf
193 * size.
194 */
195 boolean_t
mze_canfit_fzap_leaf(zap_name_t * zn,uint64_t hash)196 mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
197 {
198 zap_t *zap = zn->zn_zap;
199 mzap_ent_t mze_tofind;
200 zfs_btree_index_t idx;
201 zfs_btree_t *tree = &zap->zap_m.zap_tree;
202 uint32_t mzap_ents = 0;
203
204 ASSERT0(hash & 0xffffffff);
205 hash >>= 32;
206 mze_tofind.mze_hash = hash;
207 mze_tofind.mze_cd = 0;
208
209 for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
210 mze && mze->mze_hash == hash;
211 mze = zfs_btree_next(tree, &idx, &idx)) {
212 mzap_ents++;
213 }
214
215 /* Include the new entry being added */
216 mzap_ents++;
217
218 return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS));
219 }
220
221 void
mze_destroy(zap_t * zap)222 mze_destroy(zap_t *zap)
223 {
224 zfs_btree_clear(&zap->zap_m.zap_tree);
225 zfs_btree_destroy(&zap->zap_m.zap_tree);
226 }
227
228 zap_t *
mzap_open(dmu_buf_t * db)229 mzap_open(dmu_buf_t *db)
230 {
231 zap_t *winner;
232 uint64_t *zap_hdr = (uint64_t *)db->db_data;
233 uint64_t zap_block_type = zap_hdr[0];
234 uint64_t zap_magic = zap_hdr[1];
235
236 ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
237
238 zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
239 rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL);
240 rw_enter(&zap->zap_rwlock, RW_WRITER);
241 zap->zap_objset = dmu_buf_get_objset(db);
242 zap->zap_object = db->db_object;
243 zap->zap_dbuf = db;
244
245 if (zap_block_type != ZBT_MICRO) {
246 mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT,
247 0);
248 zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
249 if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) {
250 winner = NULL; /* No actual winner here... */
251 goto handle_winner;
252 }
253 } else {
254 zap->zap_ismicro = TRUE;
255 }
256
257 /*
258 * Make sure that zap_ismicro is set before we let others see it,
259 * because zap_lock() checks zap_ismicro without the lock held.
260 */
261 dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
262 winner = dmu_buf_set_user(db, &zap->zap_dbu);
263
264 if (winner != NULL)
265 goto handle_winner;
266
267 if (zap->zap_ismicro) {
268 zap->zap_salt = zap_m_phys(zap)->mz_salt;
269 zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
270 zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
271
272 /*
273 * Reduce B-tree leaf from 4KB to 512 bytes to reduce memmove()
274 * overhead on massive inserts below. It still allows to store
275 * 62 entries before we have to add 2KB B-tree core node.
276 */
277 zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare,
278 mze_find_in_buf, sizeof (mzap_ent_t), 512);
279
280 zap_name_t *zn = zap_name_alloc(zap, B_FALSE);
281 for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) {
282 mzap_ent_phys_t *mze =
283 &zap_m_phys(zap)->mz_chunk[i];
284 if (mze->mze_name[0]) {
285 zap->zap_m.zap_num_entries++;
286 zap_name_init_str(zn, mze->mze_name, 0);
287 mze_insert(zap, i, zn->zn_hash);
288 }
289 }
290 zap_name_free(zn);
291 } else {
292 zap->zap_salt = zap_f_phys(zap)->zap_salt;
293 zap->zap_normflags = zap_f_phys(zap)->zap_normflags;
294
295 ASSERT3U(sizeof (struct zap_leaf_header), ==,
296 2*ZAP_LEAF_CHUNKSIZE);
297
298 /*
299 * The embedded pointer table should not overlap the
300 * other members.
301 */
302 ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
303 &zap_f_phys(zap)->zap_salt);
304
305 /*
306 * The embedded pointer table should end at the end of
307 * the block
308 */
309 ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
310 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
311 (uintptr_t)zap_f_phys(zap), ==,
312 zap->zap_dbuf->db_size);
313 }
314 rw_exit(&zap->zap_rwlock);
315 return (zap);
316
317 handle_winner:
318 rw_exit(&zap->zap_rwlock);
319 rw_destroy(&zap->zap_rwlock);
320 if (!zap->zap_ismicro)
321 mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
322 kmem_free(zap, sizeof (zap_t));
323 return (winner);
324 }
325
326 int
mzap_upgrade(zap_t ** zapp,dmu_tx_t * tx,zap_flags_t flags)327 mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
328 {
329 int err = 0;
330 zap_t *zap = *zapp;
331
332 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
333
334 int sz = zap->zap_dbuf->db_size;
335 mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP);
336 memcpy(mzp, zap->zap_dbuf->db_data, sz);
337 int nchunks = zap->zap_m.zap_num_chunks;
338
339 if (!flags) {
340 err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
341 1ULL << fzap_default_block_shift, 0, tx);
342 if (err != 0) {
343 vmem_free(mzp, sz);
344 return (err);
345 }
346 }
347
348 dprintf("upgrading obj=%llu with %u chunks\n",
349 (u_longlong_t)zap->zap_object, nchunks);
350 /* XXX destroy the tree later, so we can use the stored hash value */
351 mze_destroy(zap);
352
353 fzap_upgrade(zap, tx, flags);
354
355 zap_name_t *zn = zap_name_alloc(zap, B_FALSE);
356 for (int i = 0; i < nchunks; i++) {
357 mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
358 if (mze->mze_name[0] == 0)
359 continue;
360 dprintf("adding %s=%llu\n",
361 mze->mze_name, (u_longlong_t)mze->mze_value);
362 zap_name_init_str(zn, mze->mze_name, 0);
363 /* If we fail here, we would end up losing entries */
364 VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
365 tx));
366 }
367 zap_name_free(zn);
368 vmem_free(mzp, sz);
369 *zapp = zap;
370 return (0);
371 }
372
373 /*
374 * The "normflags" determine the behavior of the matchtype_t which is
375 * passed to zap_lookup_norm(). Names which have the same normalized
376 * version will be stored with the same hash value, and therefore we can
377 * perform normalization-insensitive lookups. We can be Unicode form-
378 * insensitive and/or case-insensitive. The following flags are valid for
379 * "normflags":
380 *
381 * U8_TEXTPREP_NFC
382 * U8_TEXTPREP_NFD
383 * U8_TEXTPREP_NFKC
384 * U8_TEXTPREP_NFKD
385 * U8_TEXTPREP_TOUPPER
386 *
387 * The *_NF* (Normalization Form) flags are mutually exclusive; at most one
388 * of them may be supplied.
389 */
390 void
mzap_create_impl(dnode_t * dn,int normflags,zap_flags_t flags,dmu_tx_t * tx)391 mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
392 {
393 dmu_buf_t *db;
394
395 VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
396
397 dmu_buf_will_dirty(db, tx);
398 mzap_phys_t *zp = db->db_data;
399 zp->mz_block_type = ZBT_MICRO;
400 zp->mz_salt =
401 ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL;
402 zp->mz_normflags = normflags;
403
404 if (flags != 0) {
405 zap_t *zap;
406 /* Only fat zap supports flags; upgrade immediately. */
407 VERIFY0(zap_lock_by_dnode(dn, tx,
408 RW_WRITER, B_FALSE, B_FALSE, FTAG, &zap));
409 VERIFY0(mzap_upgrade(&zap, tx, flags));
410 zap_unlock(zap, FTAG);
411 }
412
413 dmu_buf_rele(db, FTAG);
414 }
415
416 /*
417 * zn may be NULL; if not specified, it will be computed if needed.
418 * See also the comment above zap_entry_normalization_conflict().
419 */
420 boolean_t
mzap_normalization_conflict(zap_t * zap,zap_name_t * zn,mzap_ent_t * mze,zfs_btree_index_t * idx)421 mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze,
422 zfs_btree_index_t *idx)
423 {
424 boolean_t allocdzn = B_FALSE;
425 mzap_ent_t *other;
426 zfs_btree_index_t oidx;
427
428 if (zap->zap_normflags == 0)
429 return (B_FALSE);
430
431 for (other = zfs_btree_prev(&zap->zap_m.zap_tree, idx, &oidx);
432 other && other->mze_hash == mze->mze_hash;
433 other = zfs_btree_prev(&zap->zap_m.zap_tree, &oidx, &oidx)) {
434
435 if (zn == NULL) {
436 zn = zap_name_alloc_str(zap,
437 MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
438 allocdzn = B_TRUE;
439 }
440 if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
441 if (allocdzn)
442 zap_name_free(zn);
443 return (B_TRUE);
444 }
445 }
446
447 for (other = zfs_btree_next(&zap->zap_m.zap_tree, idx, &oidx);
448 other && other->mze_hash == mze->mze_hash;
449 other = zfs_btree_next(&zap->zap_m.zap_tree, &oidx, &oidx)) {
450
451 if (zn == NULL) {
452 zn = zap_name_alloc_str(zap,
453 MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
454 allocdzn = B_TRUE;
455 }
456 if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
457 if (allocdzn)
458 zap_name_free(zn);
459 return (B_TRUE);
460 }
461 }
462
463 if (allocdzn)
464 zap_name_free(zn);
465 return (B_FALSE);
466 }
467
468 void
mzap_addent(zap_name_t * zn,uint64_t value)469 mzap_addent(zap_name_t *zn, uint64_t value)
470 {
471 zap_t *zap = zn->zn_zap;
472 uint16_t start = zap->zap_m.zap_alloc_next;
473
474 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
475
476 #ifdef ZFS_DEBUG
477 for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
478 mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
479 ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
480 }
481 #endif
482
483 uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash);
484 /* given the limited size of the microzap, this can't happen */
485 ASSERT(cd < zap_maxcd(zap));
486
487 again:
488 for (uint16_t i = start; i < zap->zap_m.zap_num_chunks; i++) {
489 mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
490 if (mze->mze_name[0] == 0) {
491 mze->mze_value = value;
492 mze->mze_cd = cd;
493 (void) strlcpy(mze->mze_name, zn->zn_key_orig,
494 sizeof (mze->mze_name));
495 zap->zap_m.zap_num_entries++;
496 zap->zap_m.zap_alloc_next = i+1;
497 if (zap->zap_m.zap_alloc_next ==
498 zap->zap_m.zap_num_chunks)
499 zap->zap_m.zap_alloc_next = 0;
500 mze_insert(zap, i, zn->zn_hash);
501 return;
502 }
503 }
504 if (start != 0) {
505 start = 0;
506 goto again;
507 }
508 cmn_err(CE_PANIC, "out of entries!");
509 }
510
511 ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW,
512 "Maximum micro ZAP size before converting to a fat ZAP, "
513 "in bytes (max 1M)");
514