xref: /freebsd/sys/contrib/openzfs/module/zstd/zfs_zstd.c (revision 78adacd4eab39a3508bd8c65f0aba94fc6b907ce)
1 /*
2  * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  * 1. Redistributions of source code must retain the above copyright notice,
8  * this list of conditions and the following disclaimer.
9  *
10  * 2. Redistributions in binary form must reproduce the above copyright notice,
11  * this list of conditions and the following disclaimer in the documentation
12  * and/or other materials provided with the distribution.
13  *
14  * 3. Neither the name of the copyright holder nor the names of its
15  * contributors may be used to endorse or promote products derived from this
16  * software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 /*
32  * Copyright (c) 2016-2018, Klara Inc.
33  * Copyright (c) 2016-2018, Allan Jude
34  * Copyright (c) 2018-2020, Sebastian Gottschall
35  * Copyright (c) 2019-2020, Michael Niewöhner
36  * Copyright (c) 2020, The FreeBSD Foundation [1]
37  *
38  * [1] Portions of this software were developed by Allan Jude
39  *     under sponsorship from the FreeBSD Foundation.
40  */
41 
42 #include <sys/param.h>
43 #include <sys/sysmacros.h>
44 #include <sys/zfs_context.h>
45 #include <sys/zio_compress.h>
46 #include <sys/spa.h>
47 #include <sys/zstd/zstd.h>
48 
49 #define	ZSTD_STATIC_LINKING_ONLY
50 #include "lib/zstd.h"
51 #include "lib/zstd_errors.h"
52 
53 kstat_t *zstd_ksp = NULL;
54 
55 typedef struct zstd_stats {
56 	kstat_named_t	zstd_stat_alloc_fail;
57 	kstat_named_t	zstd_stat_alloc_fallback;
58 	kstat_named_t	zstd_stat_com_alloc_fail;
59 	kstat_named_t	zstd_stat_dec_alloc_fail;
60 	kstat_named_t	zstd_stat_com_inval;
61 	kstat_named_t	zstd_stat_dec_inval;
62 	kstat_named_t	zstd_stat_dec_header_inval;
63 	kstat_named_t	zstd_stat_com_fail;
64 	kstat_named_t	zstd_stat_dec_fail;
65 	kstat_named_t	zstd_stat_buffers;
66 	kstat_named_t	zstd_stat_size;
67 } zstd_stats_t;
68 
69 static zstd_stats_t zstd_stats = {
70 	{ "alloc_fail",			KSTAT_DATA_UINT64 },
71 	{ "alloc_fallback",		KSTAT_DATA_UINT64 },
72 	{ "compress_alloc_fail",	KSTAT_DATA_UINT64 },
73 	{ "decompress_alloc_fail",	KSTAT_DATA_UINT64 },
74 	{ "compress_level_invalid",	KSTAT_DATA_UINT64 },
75 	{ "decompress_level_invalid",	KSTAT_DATA_UINT64 },
76 	{ "decompress_header_invalid",	KSTAT_DATA_UINT64 },
77 	{ "compress_failed",		KSTAT_DATA_UINT64 },
78 	{ "decompress_failed",		KSTAT_DATA_UINT64 },
79 	{ "buffers",			KSTAT_DATA_UINT64 },
80 	{ "size",			KSTAT_DATA_UINT64 },
81 };
82 
/*
 * Allocator type recorded in the kmem_type field of struct zstd_kmem.
 * zstd_free() uses it to decide how a buffer has to be released.
 */
enum zstd_kmem_type {
	/* Never valid for a live allocation */
	ZSTD_KMEM_UNKNOWN = 0,
	/* Stand-alone vmem allocation, released with vmem_free */
	ZSTD_KMEM_DEFAULT = 1,
	/* Buffer owned by a memory pool slot */
	ZSTD_KMEM_POOL = 2,
	/* Reserved fallback memory for decompression only */
	ZSTD_KMEM_DCTX = 3,
	/* Number of enumerators; used for range assertions */
	ZSTD_KMEM_COUNT = 4,
};
94 
95 /* Structure for pooled memory objects */
96 struct zstd_pool {
97 	void *mem;
98 	size_t size;
99 	kmutex_t barrier;
100 	hrtime_t timeout;
101 };
102 
103 /* Global structure for handling memory allocations */
104 struct zstd_kmem {
105 	enum zstd_kmem_type kmem_type;
106 	size_t kmem_size;
107 	struct zstd_pool *pool;
108 };
109 
110 /* Fallback memory structure used for decompression only if memory runs out */
111 struct zstd_fallback_mem {
112 	size_t mem_size;
113 	void *mem;
114 	kmutex_t barrier;
115 };
116 
117 struct zstd_levelmap {
118 	int16_t zstd_level;
119 	enum zio_zstd_levels level;
120 };
121 
/*
 * ZSTD memory handlers
 *
 * Compression and decompression use separate allocation callbacks: the
 * decompression callback additionally falls back to reserved memory when
 * memory runs out. Both share the same free callback.
 */
static void *zstd_alloc(void *opaque, size_t size);
static void *zstd_dctx_alloc(void *opaque, size_t size);
static void zstd_free(void *opaque, void *ptr);
133 
134 /* Compression memory handler */
135 static const ZSTD_customMem zstd_malloc = {
136 	zstd_alloc,
137 	zstd_free,
138 	NULL,
139 };
140 
141 /* Decompression memory handler */
142 static const ZSTD_customMem zstd_dctx_malloc = {
143 	zstd_dctx_alloc,
144 	zstd_free,
145 	NULL,
146 };
147 
148 /* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
149 static struct zstd_levelmap zstd_levels[] = {
150 	{ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
151 	{ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
152 	{ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
153 	{ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
154 	{ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
155 	{ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
156 	{ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
157 	{ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
158 	{ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
159 	{ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
160 	{ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
161 	{ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
162 	{ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
163 	{ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
164 	{ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
165 	{ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
166 	{ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
167 	{ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
168 	{ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
169 	{-1, ZIO_ZSTD_LEVEL_FAST_1},
170 	{-2, ZIO_ZSTD_LEVEL_FAST_2},
171 	{-3, ZIO_ZSTD_LEVEL_FAST_3},
172 	{-4, ZIO_ZSTD_LEVEL_FAST_4},
173 	{-5, ZIO_ZSTD_LEVEL_FAST_5},
174 	{-6, ZIO_ZSTD_LEVEL_FAST_6},
175 	{-7, ZIO_ZSTD_LEVEL_FAST_7},
176 	{-8, ZIO_ZSTD_LEVEL_FAST_8},
177 	{-9, ZIO_ZSTD_LEVEL_FAST_9},
178 	{-10, ZIO_ZSTD_LEVEL_FAST_10},
179 	{-20, ZIO_ZSTD_LEVEL_FAST_20},
180 	{-30, ZIO_ZSTD_LEVEL_FAST_30},
181 	{-40, ZIO_ZSTD_LEVEL_FAST_40},
182 	{-50, ZIO_ZSTD_LEVEL_FAST_50},
183 	{-60, ZIO_ZSTD_LEVEL_FAST_60},
184 	{-70, ZIO_ZSTD_LEVEL_FAST_70},
185 	{-80, ZIO_ZSTD_LEVEL_FAST_80},
186 	{-90, ZIO_ZSTD_LEVEL_FAST_90},
187 	{-100, ZIO_ZSTD_LEVEL_FAST_100},
188 	{-500, ZIO_ZSTD_LEVEL_FAST_500},
189 	{-1000, ZIO_ZSTD_LEVEL_FAST_1000},
190 };
191 
/*
 * This variable represents the maximum count of the pool based on the number
 * of CPUs plus some buffer. The value below is only the initial default; it
 * is overwritten with cpu count * 4 in zstd_init.
 */
static int pool_count = 16;

/* Number of slots in each memory pool */
#define	ZSTD_POOL_MAX		pool_count
/*
 * Seconds an unused pooled buffer is kept before it may be released.
 * Parenthesized so that the macro expands correctly inside any expression.
 */
#define	ZSTD_POOL_TIMEOUT	(60 * 2)
200 
201 static struct zstd_fallback_mem zstd_dctx_fallback;
202 static struct zstd_pool *zstd_mempool_cctx;
203 static struct zstd_pool *zstd_mempool_dctx;
204 
205 /*
206  * Try to get a cached allocated buffer from memory pool or allocate a new one
207  * if necessary. If a object is older than 2 minutes and does not fit the
208  * requested size, it will be released and a new cached entry will be allocated.
209  * If other pooled objects are detected without being used for 2 minutes, they
210  * will be released, too.
211  *
212  * The concept is that high frequency memory allocations of bigger objects are
213  * expensive. So if a lot of work is going on, allocations will be kept for a
214  * while and can be reused in that time frame.
215  *
216  * The scheduled release will be updated every time a object is reused.
217  */
218 static void *
219 zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
220 {
221 	struct zstd_pool *pool;
222 	struct zstd_kmem *mem = NULL;
223 
224 	if (!zstd_mempool) {
225 		return (NULL);
226 	}
227 
228 	/* Seek for preallocated memory slot and free obsolete slots */
229 	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
230 		pool = &zstd_mempool[i];
231 		/*
232 		 * This lock is simply a marker for a pool object beeing in use.
233 		 * If it's already hold, it will be skipped.
234 		 *
235 		 * We need to create it before checking it to avoid race
236 		 * conditions caused by running in a threaded context.
237 		 *
238 		 * The lock is later released by zstd_mempool_free.
239 		 */
240 		if (mutex_tryenter(&pool->barrier)) {
241 			/*
242 			 * Check if objects fits the size, if so we take it and
243 			 * update the timestamp.
244 			 */
245 			if (size && !mem && pool->mem && size <= pool->size) {
246 				pool->timeout = gethrestime_sec() +
247 				    ZSTD_POOL_TIMEOUT;
248 				mem = pool->mem;
249 				continue;
250 			}
251 
252 			/* Free memory if unused object older than 2 minutes */
253 			if (pool->mem && gethrestime_sec() > pool->timeout) {
254 				vmem_free(pool->mem, pool->size);
255 				ZSTDSTAT_SUB(zstd_stat_buffers, 1);
256 				ZSTDSTAT_SUB(zstd_stat_size, pool->size);
257 				pool->mem = NULL;
258 				pool->size = 0;
259 				pool->timeout = 0;
260 			}
261 
262 			mutex_exit(&pool->barrier);
263 		}
264 	}
265 
266 	if (!size || mem) {
267 		return (mem);
268 	}
269 
270 	/*
271 	 * If no preallocated slot was found, try to fill in a new one.
272 	 *
273 	 * We run a similar algorithm twice here to avoid pool fragmentation.
274 	 * The first one may generate holes in the list if objects get released.
275 	 * We always make sure that these holes get filled instead of adding new
276 	 * allocations constantly at the end.
277 	 */
278 	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
279 		pool = &zstd_mempool[i];
280 		if (mutex_tryenter(&pool->barrier)) {
281 			/* Object is free, try to allocate new one */
282 			if (!pool->mem) {
283 				mem = vmem_alloc(size, KM_SLEEP);
284 				if (mem) {
285 					ZSTDSTAT_ADD(zstd_stat_buffers, 1);
286 					ZSTDSTAT_ADD(zstd_stat_size, size);
287 					pool->mem = mem;
288 					pool->size = size;
289 					/* Keep track for later release */
290 					mem->pool = pool;
291 					mem->kmem_type = ZSTD_KMEM_POOL;
292 					mem->kmem_size = size;
293 				}
294 			}
295 
296 			if (size <= pool->size) {
297 				/* Update timestamp */
298 				pool->timeout = gethrestime_sec() +
299 				    ZSTD_POOL_TIMEOUT;
300 
301 				return (pool->mem);
302 			}
303 
304 			mutex_exit(&pool->barrier);
305 		}
306 	}
307 
308 	/*
309 	 * If the pool is full or the allocation failed, try lazy allocation
310 	 * instead.
311 	 */
312 	if (!mem) {
313 		mem = vmem_alloc(size, KM_NOSLEEP);
314 		if (mem) {
315 			mem->pool = NULL;
316 			mem->kmem_type = ZSTD_KMEM_DEFAULT;
317 			mem->kmem_size = size;
318 		}
319 	}
320 
321 	return (mem);
322 }
323 
324 /* Mark object as released by releasing the barrier mutex */
325 static void
326 zstd_mempool_free(struct zstd_kmem *z)
327 {
328 	mutex_exit(&z->pool->barrier);
329 }
330 
331 /* Convert ZFS internal enum to ZSTD level */
332 static int
333 zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
334 {
335 	if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
336 		*zstd_level = zstd_levels[level - 1].zstd_level;
337 		return (0);
338 	}
339 	if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
340 	    level <= ZIO_ZSTD_LEVEL_FAST_1000) {
341 		*zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
342 		    + ZIO_ZSTD_LEVEL_19].zstd_level;
343 		return (0);
344 	}
345 
346 	/* Invalid/unknown zfs compression enum - this should never happen. */
347 	return (1);
348 }
349 
350 /* Compress block using zstd */
351 size_t
352 zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
353     int level)
354 {
355 	size_t c_len;
356 	int16_t zstd_level;
357 	zfs_zstdhdr_t *hdr;
358 	ZSTD_CCtx *cctx;
359 
360 	hdr = (zfs_zstdhdr_t *)d_start;
361 
362 	/* Skip compression if the specified level is invalid */
363 	if (zstd_enum_to_level(level, &zstd_level)) {
364 		ZSTDSTAT_BUMP(zstd_stat_com_inval);
365 		return (s_len);
366 	}
367 
368 	ASSERT3U(d_len, >=, sizeof (*hdr));
369 	ASSERT3U(d_len, <=, s_len);
370 	ASSERT3U(zstd_level, !=, 0);
371 
372 	cctx = ZSTD_createCCtx_advanced(zstd_malloc);
373 
374 	/*
375 	 * Out of kernel memory, gently fall through - this will disable
376 	 * compression in zio_compress_data
377 	 */
378 	if (!cctx) {
379 		ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
380 		return (s_len);
381 	}
382 
383 	/* Set the compression level */
384 	ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);
385 
386 	/* Use the "magicless" zstd header which saves us 4 header bytes */
387 	ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);
388 
389 	/*
390 	 * Disable redundant checksum calculation and content size storage since
391 	 * this is already done by ZFS itself.
392 	 */
393 	ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
394 	ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);
395 
396 	c_len = ZSTD_compress2(cctx,
397 	    hdr->data,
398 	    d_len - sizeof (*hdr),
399 	    s_start, s_len);
400 
401 	ZSTD_freeCCtx(cctx);
402 
403 	/* Error in the compression routine, disable compression. */
404 	if (ZSTD_isError(c_len)) {
405 		/*
406 		 * If we are aborting the compression because the saves are
407 		 * too small, that is not a failure. Everything else is a
408 		 * failure, so increment the compression failure counter.
409 		 */
410 		if (ZSTD_getErrorCode(c_len) != ZSTD_error_dstSize_tooSmall) {
411 			ZSTDSTAT_BUMP(zstd_stat_com_fail);
412 		}
413 		return (s_len);
414 	}
415 
416 	/*
417 	 * Encode the compressed buffer size at the start. We'll need this in
418 	 * decompression to counter the effects of padding which might be added
419 	 * to the compressed buffer and which, if unhandled, would confuse the
420 	 * hell out of our decompression function.
421 	 */
422 	hdr->c_len = BE_32(c_len);
423 
424 	/*
425 	 * Check version for overflow.
426 	 * The limit of 24 bits must not be exceeded. This allows a maximum
427 	 * version 1677.72.15 which we don't expect to be ever reached.
428 	 */
429 	ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);
430 
431 	/*
432 	 * Encode the compression level as well. We may need to know the
433 	 * original compression level if compressed_arc is disabled, to match
434 	 * the compression settings to write this block to the L2ARC.
435 	 *
436 	 * Encode the actual level, so if the enum changes in the future, we
437 	 * will be compatible.
438 	 *
439 	 * The upper 24 bits store the ZSTD version to be able to provide
440 	 * future compatibility, since new versions might enhance the
441 	 * compression algorithm in a way, where the compressed data will
442 	 * change.
443 	 *
444 	 * As soon as such incompatibility occurs, handling code needs to be
445 	 * added, differentiating between the versions.
446 	 */
447 	hdr->version = ZSTD_VERSION_NUMBER;
448 	hdr->level = level;
449 	hdr->raw_version_level = BE_32(hdr->raw_version_level);
450 
451 	return (c_len + sizeof (*hdr));
452 }
453 
454 /* Decompress block using zstd and return its stored level */
455 int
456 zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
457     size_t d_len, uint8_t *level)
458 {
459 	ZSTD_DCtx *dctx;
460 	size_t result;
461 	int16_t zstd_level;
462 	uint32_t c_len;
463 	const zfs_zstdhdr_t *hdr;
464 	zfs_zstdhdr_t hdr_copy;
465 
466 	hdr = (const zfs_zstdhdr_t *)s_start;
467 	c_len = BE_32(hdr->c_len);
468 
469 	/*
470 	 * Make a copy instead of directly converting the header, since we must
471 	 * not modify the original data that may be used again later.
472 	 */
473 	hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
474 
475 	/*
476 	 * NOTE: We ignore the ZSTD version for now. As soon as any
477 	 * incompatibility occurrs, it has to be handled accordingly.
478 	 * The version can be accessed via `hdr_copy.version`.
479 	 */
480 
481 	/*
482 	 * Convert and check the level
483 	 * An invalid level is a strong indicator for data corruption! In such
484 	 * case return an error so the upper layers can try to fix it.
485 	 */
486 	if (zstd_enum_to_level(hdr_copy.level, &zstd_level)) {
487 		ZSTDSTAT_BUMP(zstd_stat_dec_inval);
488 		return (1);
489 	}
490 
491 	ASSERT3U(d_len, >=, s_len);
492 	ASSERT3U(hdr_copy.level, !=, ZIO_COMPLEVEL_INHERIT);
493 
494 	/* Invalid compressed buffer size encoded at start */
495 	if (c_len + sizeof (*hdr) > s_len) {
496 		ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
497 		return (1);
498 	}
499 
500 	dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
501 	if (!dctx) {
502 		ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
503 		return (1);
504 	}
505 
506 	/* Set header type to "magicless" */
507 	ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);
508 
509 	/* Decompress the data and release the context */
510 	result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
511 	ZSTD_freeDCtx(dctx);
512 
513 	/*
514 	 * Returns 0 on success (decompression function returned non-negative)
515 	 * and non-zero on failure (decompression function returned negative.
516 	 */
517 	if (ZSTD_isError(result)) {
518 		ZSTDSTAT_BUMP(zstd_stat_dec_fail);
519 		return (1);
520 	}
521 
522 	if (level) {
523 		*level = hdr_copy.level;
524 	}
525 
526 	return (0);
527 }
528 
529 /* Decompress datablock using zstd */
530 int
531 zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len,
532     int level __maybe_unused)
533 {
534 
535 	return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len,
536 	    NULL));
537 }
538 
539 /* Allocator for zstd compression context using mempool_allocator */
540 static void *
541 zstd_alloc(void *opaque __maybe_unused, size_t size)
542 {
543 	size_t nbytes = sizeof (struct zstd_kmem) + size;
544 	struct zstd_kmem *z = NULL;
545 
546 	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);
547 
548 	if (!z) {
549 		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
550 		return (NULL);
551 	}
552 
553 	return ((void*)z + (sizeof (struct zstd_kmem)));
554 }
555 
556 /*
557  * Allocator for zstd decompression context using mempool_allocator with
558  * fallback to reserved memory if allocation fails
559  */
560 static void *
561 zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
562 {
563 	size_t nbytes = sizeof (struct zstd_kmem) + size;
564 	struct zstd_kmem *z = NULL;
565 	enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;
566 
567 	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
568 	if (!z) {
569 		/* Try harder, decompression shall not fail */
570 		z = vmem_alloc(nbytes, KM_SLEEP);
571 		if (z) {
572 			z->pool = NULL;
573 		}
574 		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
575 	} else {
576 		return ((void*)z + (sizeof (struct zstd_kmem)));
577 	}
578 
579 	/* Fallback if everything fails */
580 	if (!z) {
581 		/*
582 		 * Barrier since we only can handle it in a single thread. All
583 		 * other following threads need to wait here until decompression
584 		 * is completed. zstd_free will release this barrier later.
585 		 */
586 		mutex_enter(&zstd_dctx_fallback.barrier);
587 
588 		z = zstd_dctx_fallback.mem;
589 		type = ZSTD_KMEM_DCTX;
590 		ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
591 	}
592 
593 	/* Allocation should always be successful */
594 	if (!z) {
595 		return (NULL);
596 	}
597 
598 	z->kmem_type = type;
599 	z->kmem_size = nbytes;
600 
601 	return ((void*)z + (sizeof (struct zstd_kmem)));
602 }
603 
604 /* Free allocated memory by its specific type */
605 static void
606 zstd_free(void *opaque __maybe_unused, void *ptr)
607 {
608 	struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
609 	enum zstd_kmem_type type;
610 
611 	ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
612 	ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);
613 
614 	type = z->kmem_type;
615 	switch (type) {
616 	case ZSTD_KMEM_DEFAULT:
617 		vmem_free(z, z->kmem_size);
618 		break;
619 	case ZSTD_KMEM_POOL:
620 		zstd_mempool_free(z);
621 		break;
622 	case ZSTD_KMEM_DCTX:
623 		mutex_exit(&zstd_dctx_fallback.barrier);
624 		break;
625 	default:
626 		break;
627 	}
628 }
629 
630 /* Allocate fallback memory to ensure safe decompression */
631 static void __init
632 create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
633 {
634 	mem->mem_size = size;
635 	mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
636 	mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
637 }
638 
639 /* Initialize memory pool barrier mutexes */
640 static void __init
641 zstd_mempool_init(void)
642 {
643 	zstd_mempool_cctx = (struct zstd_pool *)
644 	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
645 	zstd_mempool_dctx = (struct zstd_pool *)
646 	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
647 
648 	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
649 		mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
650 		    MUTEX_DEFAULT, NULL);
651 		mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
652 		    MUTEX_DEFAULT, NULL);
653 	}
654 }
655 
656 /* Initialize zstd-related memory handling */
657 static int __init
658 zstd_meminit(void)
659 {
660 	zstd_mempool_init();
661 
662 	/*
663 	 * Estimate the size of the fallback decompression context.
664 	 * The expected size on x64 with current ZSTD should be about 160 KB.
665 	 */
666 	create_fallback_mem(&zstd_dctx_fallback,
667 	    P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
668 	    PAGESIZE));
669 
670 	return (0);
671 }
672 
673 /* Release object from pool and free memory */
674 static void __exit
675 release_pool(struct zstd_pool *pool)
676 {
677 	mutex_destroy(&pool->barrier);
678 	vmem_free(pool->mem, pool->size);
679 	pool->mem = NULL;
680 	pool->size = 0;
681 }
682 
683 /* Release memory pool objects */
684 static void __exit
685 zstd_mempool_deinit(void)
686 {
687 	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
688 		release_pool(&zstd_mempool_cctx[i]);
689 		release_pool(&zstd_mempool_dctx[i]);
690 	}
691 
692 	kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
693 	kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
694 	zstd_mempool_dctx = NULL;
695 	zstd_mempool_cctx = NULL;
696 }
697 
698 /* release unused memory from pool */
699 
700 void
701 zfs_zstd_cache_reap_now(void)
702 {
703 
704 	/*
705 	 * Short-circuit if there are no buffers to begin with.
706 	 */
707 	if (ZSTDSTAT(zstd_stat_buffers) == 0)
708 		return;
709 
710 	/*
711 	 * calling alloc with zero size seeks
712 	 * and releases old unused objects
713 	 */
714 	zstd_mempool_alloc(zstd_mempool_cctx, 0);
715 	zstd_mempool_alloc(zstd_mempool_dctx, 0);
716 }
717 
718 extern int __init
719 zstd_init(void)
720 {
721 	/* Set pool size by using maximum sane thread count * 4 */
722 	pool_count = (boot_ncpus * 4);
723 	zstd_meminit();
724 
725 	/* Initialize kstat */
726 	zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
727 	    KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
728 	    KSTAT_FLAG_VIRTUAL);
729 	if (zstd_ksp != NULL) {
730 		zstd_ksp->ks_data = &zstd_stats;
731 		kstat_install(zstd_ksp);
732 	}
733 
734 	return (0);
735 }
736 
737 extern void __exit
738 zstd_fini(void)
739 {
740 	/* Deinitialize kstat */
741 	if (zstd_ksp != NULL) {
742 		kstat_delete(zstd_ksp);
743 		zstd_ksp = NULL;
744 	}
745 
746 	/* Release fallback memory */
747 	vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
748 	mutex_destroy(&zstd_dctx_fallback.barrier);
749 
750 	/* Deinit memory pool */
751 	zstd_mempool_deinit();
752 }
753 
/* Kernel module registration and exported symbols (kernel builds only) */
#ifdef _KERNEL
module_init(zstd_init);
module_exit(zstd_fini);

ZFS_MODULE_DESCRIPTION("ZSTD Compression for ZFS");
ZFS_MODULE_LICENSE("Dual BSD/GPL");
ZFS_MODULE_VERSION(ZSTD_VERSION_STRING);

EXPORT_SYMBOL(zfs_zstd_compress);
EXPORT_SYMBOL(zfs_zstd_decompress_level);
EXPORT_SYMBOL(zfs_zstd_decompress);
EXPORT_SYMBOL(zfs_zstd_cache_reap_now);
#endif /* _KERNEL */
767