/*
 * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2016-2018, Klara Inc.
 * Copyright (c) 2016-2018, Allan Jude
 * Copyright (c) 2018-2020, Sebastian Gottschall
 * Copyright (c) 2019-2020, Michael Niewöhner
 * Copyright (c) 2020, The FreeBSD Foundation [1]
 *
 * [1] Portions of this software were developed by Allan Jude
 *     under sponsorship from the FreeBSD Foundation.
 */

#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/zio_compress.h>
#include <sys/spa.h>
#include <sys/zstd/zstd.h>

#define	ZSTD_STATIC_LINKING_ONLY
#include "lib/zstd.h"
#include "lib/zstd_errors.h"

kstat_t *zstd_ksp = NULL;

typedef struct zstd_stats {
	kstat_named_t	zstd_stat_alloc_fail;
	kstat_named_t	zstd_stat_alloc_fallback;
	kstat_named_t	zstd_stat_com_alloc_fail;
	kstat_named_t	zstd_stat_dec_alloc_fail;
	kstat_named_t	zstd_stat_com_inval;
	kstat_named_t	zstd_stat_dec_inval;
	kstat_named_t	zstd_stat_dec_header_inval;
	kstat_named_t	zstd_stat_com_fail;
	kstat_named_t	zstd_stat_dec_fail;
} zstd_stats_t;

static zstd_stats_t zstd_stats = {
	{ "alloc_fail",			KSTAT_DATA_UINT64 },
	{ "alloc_fallback",		KSTAT_DATA_UINT64 },
	{ "compress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "decompress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "compress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_header_invalid",	KSTAT_DATA_UINT64 },
	{ "compress_failed",		KSTAT_DATA_UINT64 },
	{ "decompress_failed",		KSTAT_DATA_UINT64 },
};

/* Enum describing the allocator type set in kmem_type of struct zstd_kmem */
enum zstd_kmem_type {
	ZSTD_KMEM_UNKNOWN = 0,
	/* Allocation obtained directly via vmem_alloc, outside the pool */
	ZSTD_KMEM_DEFAULT,
	/* Pool-based allocation via zstd_mempool_alloc */
	ZSTD_KMEM_POOL,
	/* Reserved fallback memory for decompression only */
	ZSTD_KMEM_DCTX,
	ZSTD_KMEM_COUNT,
};

/* Structure for pooled memory objects */
struct zstd_pool {
	void *mem;		/* cached buffer, NULL if the slot is empty */
	size_t size;		/* size of the cached buffer */
	kmutex_t barrier;	/* held while the slot is in use */
	hrtime_t timeout;	/* reclaim deadline, in seconds */
};

/* Global structure for handling memory allocations */
struct zstd_kmem {
	enum zstd_kmem_type kmem_type;
	size_t kmem_size;
	struct zstd_pool *pool;
};

/* Fallback memory structure used for decompression only if memory runs out */
struct zstd_fallback_mem {
	size_t mem_size;
	void *mem;
	kmutex_t barrier;
};

struct zstd_levelmap {
	int16_t zstd_level;
	enum zio_zstd_levels level;
};

/*
 * ZSTD memory handlers
 *
 * For decompression we use a different handler which also provides fallback
 * memory allocation in case memory runs out.
 *
 * The handlers were split up to keep the implementation as simple as possible.
 */
static void *zstd_alloc(void *opaque, size_t size);
static void *zstd_dctx_alloc(void *opaque, size_t size);
static void zstd_free(void *opaque, void *ptr);

/* Compression memory handler */
static const ZSTD_customMem zstd_malloc = {
	zstd_alloc,
	zstd_free,
	NULL,
};

/* Decompression memory handler */
static const ZSTD_customMem zstd_dctx_malloc = {
	zstd_dctx_alloc,
	zstd_free,
	NULL,
};
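
/*
 * The third ZSTD_customMem member is the opaque pointer that zstd hands back
 * to the alloc/free callbacks; it is unused here, hence NULL above.
 */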

/* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
static struct zstd_levelmap zstd_levels[] = {
	{ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
	{ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
	{ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
	{ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
	{ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
	{ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
	{ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
	{ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
	{ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
	{ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
	{ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
	{ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
	{ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
	{ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
	{ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
	{ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
	{ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
	{ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
	{ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
	{-1, ZIO_ZSTD_LEVEL_FAST_1},
	{-2, ZIO_ZSTD_LEVEL_FAST_2},
	{-3, ZIO_ZSTD_LEVEL_FAST_3},
	{-4, ZIO_ZSTD_LEVEL_FAST_4},
	{-5, ZIO_ZSTD_LEVEL_FAST_5},
	{-6, ZIO_ZSTD_LEVEL_FAST_6},
	{-7, ZIO_ZSTD_LEVEL_FAST_7},
	{-8, ZIO_ZSTD_LEVEL_FAST_8},
	{-9, ZIO_ZSTD_LEVEL_FAST_9},
	{-10, ZIO_ZSTD_LEVEL_FAST_10},
	{-20, ZIO_ZSTD_LEVEL_FAST_20},
	{-30, ZIO_ZSTD_LEVEL_FAST_30},
	{-40, ZIO_ZSTD_LEVEL_FAST_40},
	{-50, ZIO_ZSTD_LEVEL_FAST_50},
	{-60, ZIO_ZSTD_LEVEL_FAST_60},
	{-70, ZIO_ZSTD_LEVEL_FAST_70},
	{-80, ZIO_ZSTD_LEVEL_FAST_80},
	{-90, ZIO_ZSTD_LEVEL_FAST_90},
	{-100, ZIO_ZSTD_LEVEL_FAST_100},
	{-500, ZIO_ZSTD_LEVEL_FAST_500},
	{-1000, ZIO_ZSTD_LEVEL_FAST_1000},
};
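
/*
 * zstd_enum_to_level() below indexes this table arithmetically, so the
 * entries must stay in exactly this order: levels 1-19 first, followed by
 * the fast levels in ascending ZIO_ZSTD_LEVEL_FAST_* enum order.
 */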

/*
 * This variable specifies the maximum number of pool slots, derived from the
 * number of CPUs plus some headroom. It defaults to cpu count * 4; see
 * zstd_init.
 */
static int pool_count = 16;

#define	ZSTD_POOL_MAX		pool_count
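/* Idle pool objects are reclaimed after this many seconds (gethrestime_sec) */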
#define	ZSTD_POOL_TIMEOUT	60 * 2

static struct zstd_fallback_mem zstd_dctx_fallback;
static struct zstd_pool *zstd_mempool_cctx;
static struct zstd_pool *zstd_mempool_dctx;

/*
 * Try to get a cached, allocated buffer from the memory pool, or allocate a
 * new one if necessary. If an object is older than 2 minutes and does not fit
 * the requested size, it is released and a new cached entry is allocated.
 * Other pooled objects that have gone unused for 2 minutes are released, too.
 *
 * The rationale is that high-frequency allocations of bigger objects are
 * expensive. So if a lot of work is going on, allocations are kept for a
 * while and can be reused in that time frame.
 *
 * The scheduled release is pushed back every time an object is reused.
 */
static void *
zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
{
	struct zstd_pool *pool;
	struct zstd_kmem *mem = NULL;

	if (!zstd_mempool) {
		return (NULL);
	}

	/* Scan for a preallocated memory slot and free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		/*
		 * This lock is simply a marker for a pool object being in
		 * use. If it is already held, the slot is skipped.
		 *
		 * The lock must be taken before inspecting the slot to avoid
		 * race conditions caused by running in a threaded context.
		 *
		 * The lock is later released by zstd_mempool_free.
		 */
		if (mutex_tryenter(&pool->barrier)) {
			/*
			 * Check if the object fits the requested size; if so,
			 * take it and update the timestamp.
			 */
			if (!mem && pool->mem && size <= pool->size) {
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;
				mem = pool->mem;
				continue;
			}

			/* Free memory if unused object older than 2 minutes */
			if (pool->mem && gethrestime_sec() > pool->timeout) {
				vmem_free(pool->mem, pool->size);
				pool->mem = NULL;
				pool->size = 0;
				pool->timeout = 0;
			}

			mutex_exit(&pool->barrier);
		}
	}

	if (mem) {
		return (mem);
	}

	/*
	 * If no preallocated slot was found, try to fill in a new one.
	 *
	 * We run a similar algorithm twice here to avoid pool fragmentation.
	 * The first pass may generate holes in the list if objects get
	 * released. We always make sure that these holes get filled instead
	 * of adding new allocations constantly at the end.
	 */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (mutex_tryenter(&pool->barrier)) {
			/* Object is free, try to allocate new one */
			if (!pool->mem) {
				mem = vmem_alloc(size, KM_SLEEP);
				pool->mem = mem;

				if (pool->mem) {
					/* Keep track for later release */
					mem->pool = pool;
					pool->size = size;
					mem->kmem_type = ZSTD_KMEM_POOL;
					mem->kmem_size = size;
				}
			}

			if (size <= pool->size) {
				/* Update timestamp */
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;

				return (pool->mem);
			}

			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If the pool is full or the allocation failed, try lazy allocation
	 * instead.
	 */
	if (!mem) {
		mem = vmem_alloc(size, KM_NOSLEEP);
		if (mem) {
			mem->pool = NULL;
			mem->kmem_type = ZSTD_KMEM_DEFAULT;
			mem->kmem_size = size;
		}
	}

	return (mem);
}

/* Mark object as released by releasing the barrier mutex */
static void
zstd_mempool_free(struct zstd_kmem *z)
{
	mutex_exit(&z->pool->barrier);
}

/* Convert ZFS internal enum to ZSTD level */
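/*
 * Worked example of the index arithmetic below: ZIO_ZSTD_LEVEL_3 maps to
 * zstd_levels[2], i.e. zstd level 3, and ZIO_ZSTD_LEVEL_FAST_5 maps to
 * zstd_levels[ZIO_ZSTD_LEVEL_FAST_5 - ZIO_ZSTD_LEVEL_FAST_1 +
 * ZIO_ZSTD_LEVEL_19] = zstd_levels[23], i.e. zstd level -5.
 */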
static int
zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
{
	if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
		*zstd_level = zstd_levels[level - 1].zstd_level;
		return (0);
	}
	if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
	    level <= ZIO_ZSTD_LEVEL_FAST_1000) {
		*zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
		    + ZIO_ZSTD_LEVEL_19].zstd_level;
		return (0);
	}

	/* Invalid/unknown zfs compression enum - this should never happen. */
	return (1);
}

/* Compress block using zstd */
size_t
zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	size_t c_len;
	int16_t zstd_level;
	zfs_zstdhdr_t *hdr;
	ZSTD_CCtx *cctx;

	hdr = (zfs_zstdhdr_t *)d_start;

	/* Skip compression if the specified level is invalid */
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}

	ASSERT3U(d_len, >=, sizeof (*hdr));
	ASSERT3U(d_len, <=, s_len);
	ASSERT3U(zstd_level, !=, 0);

	cctx = ZSTD_createCCtx_advanced(zstd_malloc);

	/*
	 * Out of kernel memory, gently fall through - this will disable
	 * compression in zio_compress_data
	 */
	if (!cctx) {
		ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
		return (s_len);
	}

	/* Set the compression level */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);

	/* Use the "magicless" zstd header which saves us 4 header bytes */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);

	/*
	 * Disable redundant checksum calculation and content size storage
	 * since this is already done by ZFS itself.
	 */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);

	c_len = ZSTD_compress2(cctx,
	    hdr->data,
	    d_len - sizeof (*hdr),
	    s_start, s_len);

	ZSTD_freeCCtx(cctx);

	/* Error in the compression routine, disable compression. */
	if (ZSTD_isError(c_len)) {
		/*
		 * If we are aborting the compression because the savings are
		 * too small, that is not a failure. Everything else is a
		 * failure, so increment the compression failure counter.
		 */
		if (ZSTD_getErrorCode(c_len) != ZSTD_error_dstSize_tooSmall) {
			ZSTDSTAT_BUMP(zstd_stat_com_fail);
		}
		return (s_len);
	}

	/*
	 * Encode the compressed buffer size at the start. We'll need this in
	 * decompression to counter the effects of padding which might be
	 * added to the compressed buffer and which, if unhandled, would
	 * confuse the hell out of our decompression function.
	 */
	hdr->c_len = BE_32(c_len);

	/*
	 * Check version for overflow.
	 * The limit of 24 bits must not be exceeded. This allows a maximum
	 * version 1677.72.15 which we don't expect to be ever reached.
	 */
	ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);

	/*
	 * Encode the compression level as well. We may need to know the
	 * original compression level if compressed_arc is disabled, to match
	 * the compression settings to write this block to the L2ARC.
	 *
	 * Encode the actual level, so if the enum changes in the future, we
	 * will be compatible.
	 *
	 * The upper 24 bits store the ZSTD version to be able to provide
	 * future compatibility, since new versions might enhance the
	 * compression algorithm in a way, where the compressed data will
	 * change.
	 *
	 * As soon as such incompatibility occurs, handling code needs to be
	 * added, differentiating between the versions.
	 */
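	/*
	 * version and level overlay raw_version_level in zfs_zstdhdr_t (see
	 * sys/zstd/zstd.h), so the single byteswap below converts both
	 * fields to big-endian at once.
	 */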
	hdr->version = ZSTD_VERSION_NUMBER;
	hdr->level = level;
	hdr->raw_version_level = BE_32(hdr->raw_version_level);

	return (c_len + sizeof (*hdr));
}
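
/*
 * Resulting on-disk block layout, as written by zfs_zstd_compress() above
 * (both 32-bit words stored big-endian):
 *
 *	c_len             (32 bit): size of the compressed payload
 *	raw_version_level (32 bit): 24-bit ZSTD version, 8-bit level
 *	data              (c_len bytes): magicless zstd frame
 */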

/* Decompress block using zstd and return its stored level */
int
zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
    size_t d_len, uint8_t *level)
{
	ZSTD_DCtx *dctx;
	size_t result;
	int16_t zstd_level;
	uint32_t c_len;
	const zfs_zstdhdr_t *hdr;
	zfs_zstdhdr_t hdr_copy;

	hdr = (const zfs_zstdhdr_t *)s_start;
	c_len = BE_32(hdr->c_len);

	/*
	 * Make a copy instead of directly converting the header, since we
	 * must not modify the original data that may be used again later.
	 */
	hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);

	/*
	 * NOTE: We ignore the ZSTD version for now. As soon as any
	 * incompatibility occurs, it has to be handled accordingly.
	 * The version can be accessed via `hdr_copy.version`.
	 */

	/*
	 * Convert and check the level.
	 * An invalid level is a strong indicator of data corruption! In such
	 * a case, return an error so the upper layers can try to fix it.
	 */
	if (zstd_enum_to_level(hdr_copy.level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_inval);
		return (1);
	}

	ASSERT3U(d_len, >=, s_len);
	ASSERT3U(hdr_copy.level, !=, ZIO_COMPLEVEL_INHERIT);

	/* Invalid compressed buffer size encoded at start */
	if (c_len + sizeof (*hdr) > s_len) {
		ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
		return (1);
	}

	dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
	if (!dctx) {
		ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
		return (1);
	}

	/* Set header type to "magicless" */
	ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);

	/* Decompress the data and release the context */
	result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
	ZSTD_freeDCtx(dctx);

	/*
	 * Return 0 on success (ZSTD_decompressDCtx() succeeded) and non-zero
	 * on failure (ZSTD_isError() reports an error code).
	 */
	if (ZSTD_isError(result)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_fail);
		return (1);
	}

	if (level) {
		*level = hdr_copy.level;
	}

	return (0);
}

/* Decompress datablock using zstd */
int
zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level __maybe_unused)
{
	return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len,
	    NULL));
}

/* Allocator for the zstd compression context, backed by the memory pool */
static void *
zstd_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);

	if (!z) {
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
		return (NULL);
	}

	return ((void*)z + (sizeof (struct zstd_kmem)));
}

/*
 * Allocator for the zstd decompression context, backed by the memory pool,
 * with fallback to reserved memory if allocation fails.
 */
static void *
zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;
	enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
	if (!z) {
		/* Try harder, decompression shall not fail */
		z = vmem_alloc(nbytes, KM_SLEEP);
		if (z) {
			z->pool = NULL;
		}
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
	} else {
		return ((void*)z + (sizeof (struct zstd_kmem)));
	}

	/* Fallback if everything fails */
	if (!z) {
		/*
		 * Barrier since we only can handle it in a single thread. All
		 * other following threads need to wait here until
		 * decompression is completed. zstd_free will release this
		 * barrier later.
		 */
		mutex_enter(&zstd_dctx_fallback.barrier);

		z = zstd_dctx_fallback.mem;
		type = ZSTD_KMEM_DCTX;
		ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
	}

	/* Allocation should always be successful */
	if (!z) {
		return (NULL);
	}

	z->kmem_type = type;
	z->kmem_size = nbytes;

	return ((void*)z + (sizeof (struct zstd_kmem)));
}

/* Free allocated memory by its specific type */
static void
zstd_free(void *opaque __maybe_unused, void *ptr)
{
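	/* Recover the zstd_kmem header the allocators prepended to ptr */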
	struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
	enum zstd_kmem_type type;

	ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
	ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);

	type = z->kmem_type;
	switch (type) {
	case ZSTD_KMEM_DEFAULT:
		vmem_free(z, z->kmem_size);
		break;
	case ZSTD_KMEM_POOL:
		zstd_mempool_free(z);
		break;
	case ZSTD_KMEM_DCTX:
		mutex_exit(&zstd_dctx_fallback.barrier);
		break;
	default:
		break;
	}
}

/* Allocate fallback memory to ensure safe decompression */
static void __init
create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
{
	mem->mem_size = size;
	mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
	mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
}

/* Initialize memory pool barrier mutexes */
static void __init
zstd_mempool_init(void)
{
	zstd_mempool_cctx = (struct zstd_pool *)
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
	zstd_mempool_dctx = (struct zstd_pool *)
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);

	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
		mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
	}
}

/* Initialize zstd-related memory handling */
static int __init
zstd_meminit(void)
{
	zstd_mempool_init();

	/*
	 * Estimate the size of the fallback decompression context.
	 * The expected size on x64 with current ZSTD should be about 160 KB.
	 */
	create_fallback_mem(&zstd_dctx_fallback,
	    P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
	    PAGESIZE));

	return (0);
}

/* Release object from pool and free memory */
static void __exit
release_pool(struct zstd_pool *pool)
{
	mutex_destroy(&pool->barrier);
	vmem_free(pool->mem, pool->size);
	pool->mem = NULL;
	pool->size = 0;
}

/* Release memory pool objects */
static void __exit
zstd_mempool_deinit(void)
{
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		release_pool(&zstd_mempool_cctx[i]);
		release_pool(&zstd_mempool_dctx[i]);
	}

	kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	zstd_mempool_dctx = NULL;
	zstd_mempool_cctx = NULL;
}

extern int __init
zstd_init(void)
{
	/* Size the pool for a sane maximum thread count: cpu count * 4 */
	pool_count = (boot_ncpus * 4);
	zstd_meminit();

	/* Initialize kstat */
	zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
	    KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (zstd_ksp != NULL) {
		zstd_ksp->ks_data = &zstd_stats;
		kstat_install(zstd_ksp);
	}

	return (0);
}

extern void __exit
zstd_fini(void)
{
	/* Deinitialize kstat */
	if (zstd_ksp != NULL) {
		kstat_delete(zstd_ksp);
		zstd_ksp = NULL;
	}

	/* Release fallback memory */
	vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
	mutex_destroy(&zstd_dctx_fallback.barrier);

	/* Deinit memory pool */
	zstd_mempool_deinit();
}
726 
727 #if defined(_KERNEL)
728 module_init(zstd_init);
729 module_exit(zstd_fini);
730 
731 ZFS_MODULE_DESCRIPTION("ZSTD Compression for ZFS");
732 ZFS_MODULE_LICENSE("BSD");
733 ZFS_MODULE_VERSION(ZSTD_VERSION_STRING);
734 
735 EXPORT_SYMBOL(zfs_zstd_compress);
736 EXPORT_SYMBOL(zfs_zstd_decompress_level);
737 EXPORT_SYMBOL(zfs_zstd_decompress);
738 #endif
739