xref: /freebsd/sys/contrib/openzfs/module/zfs/zap_micro.c (revision d9497217456002b0ddad3cd319570d0b098daa29)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
26  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
27  * Copyright 2017 Nexenta Systems, Inc.
28  * Copyright (c) 2024, Klara, Inc.
29  */
30 
31 #include <sys/zio.h>
32 #include <sys/spa.h>
33 #include <sys/dmu.h>
34 #include <sys/zfs_context.h>
35 #include <sys/zap.h>
36 #include <sys/zap_impl.h>
37 #include <sys/zap_leaf.h>
38 #include <sys/btree.h>
39 #include <sys/arc.h>
40 #include <sys/dmu_objset.h>
41 #include <sys/spa_impl.h>
42 
43 #ifdef _KERNEL
44 #include <sys/sunddi.h>
45 #endif
46 
47 /*
48  * The maximum size (in bytes) of a microzap before it is converted to a
49  * fatzap. It will be rounded up to next multiple of 512 (SPA_MINBLOCKSIZE).
50  *
51  * By definition, a microzap must fit into a single block, so this has
52  * traditionally been SPA_OLD_MAXBLOCKSIZE, and is set to that by default.
53  * Setting this higher requires both the large_blocks feature (to even create
54  * blocks that large) and the large_microzap feature (to enable the stream
55  * machinery to understand not to try to split a microzap block).
56  *
57  * If large_microzap is enabled, this value will be clamped to
58  * spa_maxblocksize(), up to 1M. If not, it will be clamped to
59  * SPA_OLD_MAXBLOCKSIZE.
60  */
61 static int zap_micro_max_size = SPA_OLD_MAXBLOCKSIZE;
62 
63 /*
64  * The 1M upper limit is necessary because the count of chunks in a microzap
65  * block is stored as a uint16_t (mze_chunkid). Each chunk is 64 bytes, and the
66  * first is used to store a header, so there are 32767 usable chunks, which is
67  * just under 2M. 1M is the largest power-2-rounded block size under 2M, so we
68  * must set the limit there.
69  */
70 #define	MZAP_MAX_SIZE	(1048576)
71 
72 uint64_t
zap_get_micro_max_size(spa_t * spa)73 zap_get_micro_max_size(spa_t *spa)
74 {
75 	uint64_t maxsz = MIN(MZAP_MAX_SIZE,
76 	    P2ROUNDUP(zap_micro_max_size, SPA_MINBLOCKSIZE));
77 	if (maxsz <= SPA_OLD_MAXBLOCKSIZE)
78 		return (maxsz);
79 	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_MICROZAP))
80 		return (MIN(maxsz, spa_maxblocksize(spa)));
81 	return (SPA_OLD_MAXBLOCKSIZE);
82 }
83 
84 void
mzap_byteswap(mzap_phys_t * buf,size_t size)85 mzap_byteswap(mzap_phys_t *buf, size_t size)
86 {
87 	buf->mz_block_type = BSWAP_64(buf->mz_block_type);
88 	buf->mz_salt = BSWAP_64(buf->mz_salt);
89 	buf->mz_normflags = BSWAP_64(buf->mz_normflags);
90 	int max = (size / MZAP_ENT_LEN) - 1;
91 	for (int i = 0; i < max; i++) {
92 		buf->mz_chunk[i].mze_value =
93 		    BSWAP_64(buf->mz_chunk[i].mze_value);
94 		buf->mz_chunk[i].mze_cd =
95 		    BSWAP_32(buf->mz_chunk[i].mze_cd);
96 	}
97 }
98 
99 __attribute__((always_inline)) inline
100 static int
mze_compare(const void * arg1,const void * arg2)101 mze_compare(const void *arg1, const void *arg2)
102 {
103 	const mzap_ent_t *mze1 = arg1;
104 	const mzap_ent_t *mze2 = arg2;
105 
106 	return (TREE_CMP((uint64_t)(mze1->mze_hash) << 32 | mze1->mze_cd,
107 	    (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd));
108 }
109 
ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf,mzap_ent_t,mze_compare)110 ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t,
111     mze_compare)
112 
113 static void
114 mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash)
115 {
116 	mzap_ent_t mze;
117 
118 	ASSERT(zap->zap_ismicro);
119 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
120 
121 	mze.mze_chunkid = chunkid;
122 	ASSERT0(hash & 0xffffffff);
123 	mze.mze_hash = hash >> 32;
124 	ASSERT3U(MZE_PHYS(zap, &mze)->mze_cd, <=, 0xffff);
125 	mze.mze_cd = (uint16_t)MZE_PHYS(zap, &mze)->mze_cd;
126 	ASSERT(MZE_PHYS(zap, &mze)->mze_name[0] != 0);
127 	zfs_btree_add(&zap->zap_m.zap_tree, &mze);
128 }
129 
130 mzap_ent_t *
mze_find(zap_name_t * zn,zfs_btree_index_t * idx)131 mze_find(zap_name_t *zn, zfs_btree_index_t *idx)
132 {
133 	mzap_ent_t mze_tofind;
134 	mzap_ent_t *mze;
135 	zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree;
136 
137 	ASSERT(zn->zn_zap->zap_ismicro);
138 	ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
139 
140 	ASSERT0(zn->zn_hash & 0xffffffff);
141 	mze_tofind.mze_hash = zn->zn_hash >> 32;
142 	mze_tofind.mze_cd = 0;
143 
144 	mze = zfs_btree_find(tree, &mze_tofind, idx);
145 	if (mze == NULL)
146 		mze = zfs_btree_next(tree, idx, idx);
147 	for (; mze && mze->mze_hash == mze_tofind.mze_hash;
148 	    mze = zfs_btree_next(tree, idx, idx)) {
149 		ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
150 		if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
151 			return (mze);
152 	}
153 
154 	return (NULL);
155 }
156 
157 static uint32_t
mze_find_unused_cd(zap_t * zap,uint64_t hash)158 mze_find_unused_cd(zap_t *zap, uint64_t hash)
159 {
160 	mzap_ent_t mze_tofind;
161 	zfs_btree_index_t idx;
162 	zfs_btree_t *tree = &zap->zap_m.zap_tree;
163 
164 	ASSERT(zap->zap_ismicro);
165 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
166 
167 	ASSERT0(hash & 0xffffffff);
168 	hash >>= 32;
169 	mze_tofind.mze_hash = hash;
170 	mze_tofind.mze_cd = 0;
171 
172 	uint32_t cd = 0;
173 	for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
174 	    mze && mze->mze_hash == hash;
175 	    mze = zfs_btree_next(tree, &idx, &idx)) {
176 		if (mze->mze_cd != cd)
177 			break;
178 		cd++;
179 	}
180 
181 	return (cd);
182 }
183 
184 /*
185  * Each mzap entry requires at max : 4 chunks
186  * 3 chunks for names + 1 chunk for value.
187  */
188 #define	MZAP_ENT_CHUNKS	(1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \
189 	ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t)))
190 
191 /*
192  * Check if the current entry keeps the colliding entries under the fatzap leaf
193  * size.
194  */
195 boolean_t
mze_canfit_fzap_leaf(zap_name_t * zn,uint64_t hash)196 mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
197 {
198 	zap_t *zap = zn->zn_zap;
199 	mzap_ent_t mze_tofind;
200 	zfs_btree_index_t idx;
201 	zfs_btree_t *tree = &zap->zap_m.zap_tree;
202 	uint32_t mzap_ents = 0;
203 
204 	ASSERT0(hash & 0xffffffff);
205 	hash >>= 32;
206 	mze_tofind.mze_hash = hash;
207 	mze_tofind.mze_cd = 0;
208 
209 	for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
210 	    mze && mze->mze_hash == hash;
211 	    mze = zfs_btree_next(tree, &idx, &idx)) {
212 		mzap_ents++;
213 	}
214 
215 	/* Include the new entry being added */
216 	mzap_ents++;
217 
218 	return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS));
219 }
220 
221 void
mze_destroy(zap_t * zap)222 mze_destroy(zap_t *zap)
223 {
224 	zfs_btree_clear(&zap->zap_m.zap_tree);
225 	zfs_btree_destroy(&zap->zap_m.zap_tree);
226 }
227 
228 zap_t *
mzap_open(dmu_buf_t * db)229 mzap_open(dmu_buf_t *db)
230 {
231 	zap_t *winner;
232 	uint64_t *zap_hdr = (uint64_t *)db->db_data;
233 	uint64_t zap_block_type = zap_hdr[0];
234 	uint64_t zap_magic = zap_hdr[1];
235 
236 	ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
237 
238 	zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
239 	rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL);
240 	rw_enter(&zap->zap_rwlock, RW_WRITER);
241 	zap->zap_objset = dmu_buf_get_objset(db);
242 	zap->zap_object = db->db_object;
243 	zap->zap_dbuf = db;
244 
245 	if (zap_block_type != ZBT_MICRO) {
246 		mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT,
247 		    0);
248 		zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
249 		if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) {
250 			winner = NULL;	/* No actual winner here... */
251 			goto handle_winner;
252 		}
253 	} else {
254 		zap->zap_ismicro = TRUE;
255 	}
256 
257 	/*
258 	 * Make sure that zap_ismicro is set before we let others see it,
259 	 * because zap_lock() checks zap_ismicro without the lock held.
260 	 */
261 	dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
262 	winner = dmu_buf_set_user(db, &zap->zap_dbu);
263 
264 	if (winner != NULL)
265 		goto handle_winner;
266 
267 	if (zap->zap_ismicro) {
268 		zap->zap_salt = zap_m_phys(zap)->mz_salt;
269 		zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
270 		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
271 
272 		/*
273 		 * Reduce B-tree leaf from 4KB to 512 bytes to reduce memmove()
274 		 * overhead on massive inserts below.  It still allows to store
275 		 * 62 entries before we have to add 2KB B-tree core node.
276 		 */
277 		zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare,
278 		    mze_find_in_buf, sizeof (mzap_ent_t), 512);
279 
280 		zap_name_t *zn = zap_name_alloc(zap, B_FALSE);
281 		for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) {
282 			mzap_ent_phys_t *mze =
283 			    &zap_m_phys(zap)->mz_chunk[i];
284 			if (mze->mze_name[0]) {
285 				zap->zap_m.zap_num_entries++;
286 				zap_name_init_str(zn, mze->mze_name, 0);
287 				mze_insert(zap, i, zn->zn_hash);
288 			}
289 		}
290 		zap_name_free(zn);
291 	} else {
292 		zap->zap_salt = zap_f_phys(zap)->zap_salt;
293 		zap->zap_normflags = zap_f_phys(zap)->zap_normflags;
294 
295 		ASSERT3U(sizeof (struct zap_leaf_header), ==,
296 		    2*ZAP_LEAF_CHUNKSIZE);
297 
298 		/*
299 		 * The embedded pointer table should not overlap the
300 		 * other members.
301 		 */
302 		ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
303 		    &zap_f_phys(zap)->zap_salt);
304 
305 		/*
306 		 * The embedded pointer table should end at the end of
307 		 * the block
308 		 */
309 		ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
310 		    1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
311 		    (uintptr_t)zap_f_phys(zap), ==,
312 		    zap->zap_dbuf->db_size);
313 	}
314 	rw_exit(&zap->zap_rwlock);
315 	return (zap);
316 
317 handle_winner:
318 	rw_exit(&zap->zap_rwlock);
319 	rw_destroy(&zap->zap_rwlock);
320 	if (!zap->zap_ismicro)
321 		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
322 	kmem_free(zap, sizeof (zap_t));
323 	return (winner);
324 }
325 
326 int
mzap_upgrade(zap_t ** zapp,dmu_tx_t * tx,zap_flags_t flags)327 mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
328 {
329 	int err = 0;
330 	zap_t *zap = *zapp;
331 
332 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
333 
334 	int sz = zap->zap_dbuf->db_size;
335 	mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP);
336 	memcpy(mzp, zap->zap_dbuf->db_data, sz);
337 	int nchunks = zap->zap_m.zap_num_chunks;
338 
339 	if (!flags) {
340 		err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
341 		    1ULL << fzap_default_block_shift, 0, tx);
342 		if (err != 0) {
343 			vmem_free(mzp, sz);
344 			return (err);
345 		}
346 	}
347 
348 	dprintf("upgrading obj=%llu with %u chunks\n",
349 	    (u_longlong_t)zap->zap_object, nchunks);
350 	/* XXX destroy the tree later, so we can use the stored hash value */
351 	mze_destroy(zap);
352 
353 	fzap_upgrade(zap, tx, flags);
354 
355 	zap_name_t *zn = zap_name_alloc(zap, B_FALSE);
356 	for (int i = 0; i < nchunks; i++) {
357 		mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
358 		if (mze->mze_name[0] == 0)
359 			continue;
360 		dprintf("adding %s=%llu\n",
361 		    mze->mze_name, (u_longlong_t)mze->mze_value);
362 		zap_name_init_str(zn, mze->mze_name, 0);
363 		/* If we fail here, we would end up losing entries */
364 		VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
365 		    tx));
366 	}
367 	zap_name_free(zn);
368 	vmem_free(mzp, sz);
369 	*zapp = zap;
370 	return (0);
371 }
372 
373 /*
374  * The "normflags" determine the behavior of the matchtype_t which is
375  * passed to zap_lookup_norm().  Names which have the same normalized
376  * version will be stored with the same hash value, and therefore we can
377  * perform normalization-insensitive lookups.  We can be Unicode form-
378  * insensitive and/or case-insensitive.  The following flags are valid for
379  * "normflags":
380  *
381  * U8_TEXTPREP_NFC
382  * U8_TEXTPREP_NFD
383  * U8_TEXTPREP_NFKC
384  * U8_TEXTPREP_NFKD
385  * U8_TEXTPREP_TOUPPER
386  *
387  * The *_NF* (Normalization Form) flags are mutually exclusive; at most one
388  * of them may be supplied.
389  */
390 void
mzap_create_impl(dnode_t * dn,int normflags,zap_flags_t flags,dmu_tx_t * tx)391 mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
392 {
393 	dmu_buf_t *db;
394 
395 	VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
396 
397 	dmu_buf_will_dirty(db, tx);
398 	mzap_phys_t *zp = db->db_data;
399 	zp->mz_block_type = ZBT_MICRO;
400 	zp->mz_salt =
401 	    ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL;
402 	zp->mz_normflags = normflags;
403 
404 	if (flags != 0) {
405 		zap_t *zap;
406 		/* Only fat zap supports flags; upgrade immediately. */
407 		VERIFY0(zap_lock_by_dnode(dn, tx,
408 		    RW_WRITER, B_FALSE, B_FALSE, FTAG, &zap));
409 		VERIFY0(mzap_upgrade(&zap, tx, flags));
410 		zap_unlock(zap, FTAG);
411 	}
412 
413 	dmu_buf_rele(db, FTAG);
414 }
415 
416 /*
417  * zn may be NULL; if not specified, it will be computed if needed.
418  * See also the comment above zap_entry_normalization_conflict().
419  */
420 boolean_t
mzap_normalization_conflict(zap_t * zap,zap_name_t * zn,mzap_ent_t * mze,zfs_btree_index_t * idx)421 mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze,
422     zfs_btree_index_t *idx)
423 {
424 	boolean_t allocdzn = B_FALSE;
425 	mzap_ent_t *other;
426 	zfs_btree_index_t oidx;
427 
428 	if (zap->zap_normflags == 0)
429 		return (B_FALSE);
430 
431 	for (other = zfs_btree_prev(&zap->zap_m.zap_tree, idx, &oidx);
432 	    other && other->mze_hash == mze->mze_hash;
433 	    other = zfs_btree_prev(&zap->zap_m.zap_tree, &oidx, &oidx)) {
434 
435 		if (zn == NULL) {
436 			zn = zap_name_alloc_str(zap,
437 			    MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
438 			allocdzn = B_TRUE;
439 		}
440 		if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
441 			if (allocdzn)
442 				zap_name_free(zn);
443 			return (B_TRUE);
444 		}
445 	}
446 
447 	for (other = zfs_btree_next(&zap->zap_m.zap_tree, idx, &oidx);
448 	    other && other->mze_hash == mze->mze_hash;
449 	    other = zfs_btree_next(&zap->zap_m.zap_tree, &oidx, &oidx)) {
450 
451 		if (zn == NULL) {
452 			zn = zap_name_alloc_str(zap,
453 			    MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
454 			allocdzn = B_TRUE;
455 		}
456 		if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
457 			if (allocdzn)
458 				zap_name_free(zn);
459 			return (B_TRUE);
460 		}
461 	}
462 
463 	if (allocdzn)
464 		zap_name_free(zn);
465 	return (B_FALSE);
466 }
467 
468 void
mzap_addent(zap_name_t * zn,uint64_t value)469 mzap_addent(zap_name_t *zn, uint64_t value)
470 {
471 	zap_t *zap = zn->zn_zap;
472 	uint16_t start = zap->zap_m.zap_alloc_next;
473 
474 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
475 
476 #ifdef ZFS_DEBUG
477 	for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
478 		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
479 		ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
480 	}
481 #endif
482 
483 	uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash);
484 	/* given the limited size of the microzap, this can't happen */
485 	ASSERT(cd < zap_maxcd(zap));
486 
487 again:
488 	for (uint16_t i = start; i < zap->zap_m.zap_num_chunks; i++) {
489 		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
490 		if (mze->mze_name[0] == 0) {
491 			mze->mze_value = value;
492 			mze->mze_cd = cd;
493 			(void) strlcpy(mze->mze_name, zn->zn_key_orig,
494 			    sizeof (mze->mze_name));
495 			zap->zap_m.zap_num_entries++;
496 			zap->zap_m.zap_alloc_next = i+1;
497 			if (zap->zap_m.zap_alloc_next ==
498 			    zap->zap_m.zap_num_chunks)
499 				zap->zap_m.zap_alloc_next = 0;
500 			mze_insert(zap, i, zn->zn_hash);
501 			return;
502 		}
503 	}
504 	if (start != 0) {
505 		start = 0;
506 		goto again;
507 	}
508 	cmn_err(CE_PANIC, "out of entries!");
509 }
510 
511 ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW,
512 	"Maximum micro ZAP size before converting to a fat ZAP, "
513 	    "in bytes (max 1M)");
514