/*
 * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2016-2018, Klara Inc.
 * Copyright (c) 2016-2018, Allan Jude
 * Copyright (c) 2018-2020, Sebastian Gottschall
 * Copyright (c) 2019-2020, Michael Niewöhner
 * Copyright (c) 2020, The FreeBSD Foundation [1]
 *
 * [1] Portions of this software were developed by Allan Jude
 *     under sponsorship from the FreeBSD Foundation.
 */

#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/zio_compress.h>
#include <sys/spa.h>
#include <sys/zstd/zstd.h>

#define	ZSTD_STATIC_LINKING_ONLY
#include "lib/zstd.h"
#include "lib/common/zstd_errors.h"

kstat_t *zstd_ksp = NULL;

typedef struct zstd_stats {
	kstat_named_t	zstd_stat_alloc_fail;
	kstat_named_t	zstd_stat_alloc_fallback;
	kstat_named_t	zstd_stat_com_alloc_fail;
	kstat_named_t	zstd_stat_dec_alloc_fail;
	kstat_named_t	zstd_stat_com_inval;
	kstat_named_t	zstd_stat_dec_inval;
	kstat_named_t	zstd_stat_dec_header_inval;
	kstat_named_t	zstd_stat_com_fail;
	kstat_named_t	zstd_stat_dec_fail;
	kstat_named_t	zstd_stat_buffers;
	kstat_named_t	zstd_stat_size;
} zstd_stats_t;

static zstd_stats_t zstd_stats = {
	{ "alloc_fail",			KSTAT_DATA_UINT64 },
	{ "alloc_fallback",		KSTAT_DATA_UINT64 },
	{ "compress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "decompress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "compress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_header_invalid",	KSTAT_DATA_UINT64 },
	{ "compress_failed",		KSTAT_DATA_UINT64 },
	{ "decompress_failed",		KSTAT_DATA_UINT64 },
	{ "buffers",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 },
};
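
/*
 * These counters are exported through the "zstd" kstat created in
 * zstd_init() below. As an illustration (exact paths depend on the
 * platform's kstat support): on Linux they appear under
 * /proc/spl/kstat/zfs/zstd, on FreeBSD under the kstat.zfs.misc.zstd
 * sysctl tree.
 */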

/* Enums describing the allocator type specified by kmem_type in zstd_kmem */
enum zstd_kmem_type {
	ZSTD_KMEM_UNKNOWN = 0,
	/* Allocation via vmem_alloc */
	ZSTD_KMEM_DEFAULT,
	/* Pool-based allocation via zstd_mempool_alloc */
	ZSTD_KMEM_POOL,
	/* Reserved fallback memory for decompression only */
	ZSTD_KMEM_DCTX,
	ZSTD_KMEM_COUNT,
};

/* Structure for pooled memory objects */
struct zstd_pool {
	void *mem;
	size_t size;
	kmutex_t barrier;
	hrtime_t timeout;
};

/* Global structure for handling memory allocations */
struct zstd_kmem {
	enum zstd_kmem_type kmem_type;
	size_t kmem_size;
	struct zstd_pool *pool;
};

/* Fallback memory structure used for decompression only if memory runs out */
struct zstd_fallback_mem {
	size_t mem_size;
	void *mem;
	kmutex_t barrier;
};

struct zstd_levelmap {
	int16_t zstd_level;
	enum zio_zstd_levels level;
};

/*
 * ZSTD memory handlers
 *
 * For decompression we use a different handler which also provides fallback
 * memory allocation in case memory runs out.
 *
 * The compression and decompression handlers are kept separate to keep the
 * implementation as simple as possible.
 */
static void *zstd_alloc(void *opaque, size_t size);
static void *zstd_dctx_alloc(void *opaque, size_t size);
static void zstd_free(void *opaque, void *ptr);

/* Compression memory handler */
static const ZSTD_customMem zstd_malloc = {
	zstd_alloc,
	zstd_free,
	NULL,
};

/* Decompression memory handler */
static const ZSTD_customMem zstd_dctx_malloc = {
	zstd_dctx_alloc,
	zstd_free,
	NULL,
};

/* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
static struct zstd_levelmap zstd_levels[] = {
	{ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
	{ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
	{ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
	{ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
	{ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
	{ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
	{ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
	{ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
	{ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
	{ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
	{ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
	{ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
	{ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
	{ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
	{ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
	{ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
	{ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
	{ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
	{ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
	{-1, ZIO_ZSTD_LEVEL_FAST_1},
	{-2, ZIO_ZSTD_LEVEL_FAST_2},
	{-3, ZIO_ZSTD_LEVEL_FAST_3},
	{-4, ZIO_ZSTD_LEVEL_FAST_4},
	{-5, ZIO_ZSTD_LEVEL_FAST_5},
	{-6, ZIO_ZSTD_LEVEL_FAST_6},
	{-7, ZIO_ZSTD_LEVEL_FAST_7},
	{-8, ZIO_ZSTD_LEVEL_FAST_8},
	{-9, ZIO_ZSTD_LEVEL_FAST_9},
	{-10, ZIO_ZSTD_LEVEL_FAST_10},
	{-20, ZIO_ZSTD_LEVEL_FAST_20},
	{-30, ZIO_ZSTD_LEVEL_FAST_30},
	{-40, ZIO_ZSTD_LEVEL_FAST_40},
	{-50, ZIO_ZSTD_LEVEL_FAST_50},
	{-60, ZIO_ZSTD_LEVEL_FAST_60},
	{-70, ZIO_ZSTD_LEVEL_FAST_70},
	{-80, ZIO_ZSTD_LEVEL_FAST_80},
	{-90, ZIO_ZSTD_LEVEL_FAST_90},
	{-100, ZIO_ZSTD_LEVEL_FAST_100},
	{-500, ZIO_ZSTD_LEVEL_FAST_500},
	{-1000, ZIO_ZSTD_LEVEL_FAST_1000},
};
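
/*
 * The first 19 entries map the standard levels 1:1; the entries that follow
 * map the ZIO_ZSTD_LEVEL_FAST_* enums to zstd's negative "fast" levels.
 * zstd_enum_to_level() relies on this ordering when indexing the table.
 */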

/*
 * This variable represents the maximum count of the pool based on the number
 * of CPUs plus some buffer. We default to cpu count * 4, see zstd_init().
 */
static int pool_count = 16;

#define	ZSTD_POOL_MAX		pool_count
#define	ZSTD_POOL_TIMEOUT	60 * 2
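/*
 * Note: ZSTD_POOL_TIMEOUT is expressed in seconds (it is added to
 * gethrestime_sec() in the pool code below), so idle pooled buffers are
 * reclaimed after two minutes.
 */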

static struct zstd_fallback_mem zstd_dctx_fallback;
static struct zstd_pool *zstd_mempool_cctx;
static struct zstd_pool *zstd_mempool_dctx;

/*
 * The zstd library code expects these functions if ADDRESS_SANITIZER is
 * defined. Userland ASAN provides them, but KASAN defines ADDRESS_SANITIZER
 * without providing them, so we provide no-op stubs here rather than changing
 * the external code.
 */
#if defined(ZFS_ASAN_ENABLED)
#define	ADDRESS_SANITIZER 1
#endif
#if defined(_KERNEL) && defined(ADDRESS_SANITIZER)
void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
void __asan_poison_memory_region(void const volatile *addr, size_t size);
void __asan_unpoison_memory_region(void const volatile *addr, size_t size) {};
void __asan_poison_memory_region(void const volatile *addr, size_t size) {};
#endif
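/* Reap pooled buffers whose idle timeout has expired */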
static void
zstd_mempool_reap(struct zstd_pool *zstd_mempool)
{
	struct zstd_pool *pool;

	if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
		return;
	}

	/* free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (pool->mem && mutex_tryenter(&pool->barrier)) {
			/* Free memory if unused object older than 2 minutes */
			if (pool->mem && gethrestime_sec() > pool->timeout) {
				vmem_free(pool->mem, pool->size);
				ZSTDSTAT_SUB(zstd_stat_buffers, 1);
				ZSTDSTAT_SUB(zstd_stat_size, pool->size);
				pool->mem = NULL;
				pool->size = 0;
				pool->timeout = 0;
			}
			mutex_exit(&pool->barrier);
		}
	}
}

/*
 * Try to get a cached, allocated buffer from the memory pool or allocate a new
 * one if necessary. If an object is older than 2 minutes and does not fit the
 * requested size, it will be released and a new cached entry will be allocated.
 * If other pooled objects are detected without being used for 2 minutes, they
 * will be released, too.
 *
 * The concept is that high-frequency memory allocations of bigger objects are
 * expensive. So if a lot of work is going on, allocations will be kept for a
 * while and can be reused in that time frame.
 *
 * The scheduled release will be updated every time an object is reused.
 */

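/*
 * Layout note: zstd_alloc() and zstd_dctx_alloc() request
 * sizeof (struct zstd_kmem) + size bytes here, hand the region following the
 * embedded struct zstd_kmem to the zstd library, and zstd_free() later
 * recovers that header by subtracting the same offset.
 */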
static void *
zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
{
	struct zstd_pool *pool;
	struct zstd_kmem *mem = NULL;

	if (!zstd_mempool) {
		return (NULL);
	}

	/* Seek a preallocated memory slot that fits the request */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		/*
		 * This lock is simply a marker for a pool object being in use.
		 * If it is already held, the slot will be skipped.
		 *
		 * We need to acquire it before checking the slot to avoid race
		 * conditions caused by running in a threaded context.
		 *
		 * The lock is later released by zstd_mempool_free.
		 */
		if (mutex_tryenter(&pool->barrier)) {
			/*
			 * Check if the object fits the requested size; if so,
			 * take it and update the timestamp.
			 */
			if (pool->mem && size <= pool->size) {
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;
				mem = pool->mem;
				return (mem);
			}
			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If no preallocated slot was found, try to fill in a new one.
	 *
	 * We run a similar algorithm twice here to avoid pool fragmentation.
	 * The first one may generate holes in the list if objects get released.
	 * We always make sure that these holes get filled instead of adding new
	 * allocations constantly at the end.
	 */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (mutex_tryenter(&pool->barrier)) {
			/* Object is free, try to allocate a new one */
			if (!pool->mem) {
				mem = vmem_alloc(size, KM_SLEEP);
				if (mem) {
					ZSTDSTAT_ADD(zstd_stat_buffers, 1);
					ZSTDSTAT_ADD(zstd_stat_size, size);
					pool->mem = mem;
					pool->size = size;
					/* Keep track for later release */
					mem->pool = pool;
					mem->kmem_type = ZSTD_KMEM_POOL;
					mem->kmem_size = size;
				}
			}

			if (size <= pool->size) {
				/* Update timestamp */
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;

				return (pool->mem);
			}

			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If the pool is full or the allocation failed, try lazy allocation
	 * instead.
	 */
	if (!mem) {
		mem = vmem_alloc(size, KM_NOSLEEP);
		if (mem) {
			mem->pool = NULL;
			mem->kmem_type = ZSTD_KMEM_DEFAULT;
			mem->kmem_size = size;
		}
	}

	return (mem);
}

/* Mark object as released by releasing the barrier mutex */
static void
zstd_mempool_free(struct zstd_kmem *z)
{
	mutex_exit(&z->pool->barrier);
}

/* Convert ZFS internal enum to ZSTD level */
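/*
 * For example, ZIO_ZSTD_LEVEL_3 (the default "zstd" level) maps to zstd
 * level 3, and ZIO_ZSTD_LEVEL_FAST_5 maps to zstd's negative level -5.
 */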
static int
zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
{
	if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
		*zstd_level = zstd_levels[level - 1].zstd_level;
		return (0);
	}
	if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
	    level <= ZIO_ZSTD_LEVEL_FAST_1000) {
		*zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
		    + ZIO_ZSTD_LEVEL_19].zstd_level;
		return (0);
	}

	/* Invalid/unknown zfs compression enum - this should never happen. */
	return (1);
}

/* Compress block using zstd */
size_t
zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	size_t c_len;
	int16_t zstd_level;
	zfs_zstdhdr_t *hdr;
	ZSTD_CCtx *cctx;

	hdr = (zfs_zstdhdr_t *)d_start;

	/* Skip compression if the specified level is invalid */
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}

	ASSERT3U(d_len, >=, sizeof (*hdr));
	ASSERT3U(d_len, <=, s_len);
	ASSERT3U(zstd_level, !=, 0);

	cctx = ZSTD_createCCtx_advanced(zstd_malloc);

	/*
	 * Out of kernel memory, gently fall through - this will disable
	 * compression in zio_compress_data
	 */
	if (!cctx) {
		ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
		return (s_len);
	}

	/* Set the compression level */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);

	/* Use the "magicless" zstd header which saves us 4 header bytes */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);

	/*
	 * Disable redundant checksum calculation and content size storage since
	 * this is already done by ZFS itself.
	 */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);

	c_len = ZSTD_compress2(cctx,
	    hdr->data,
	    d_len - sizeof (*hdr),
	    s_start, s_len);

	ZSTD_freeCCtx(cctx);

	/* Error in the compression routine, disable compression. */
	if (ZSTD_isError(c_len)) {
		/*
		 * If we are aborting the compression because the space savings
		 * are too small, that is not a failure. Everything else is a
		 * failure, so increment the compression failure counter.
		 */
		if (ZSTD_getErrorCode(c_len) != ZSTD_error_dstSize_tooSmall) {
			ZSTDSTAT_BUMP(zstd_stat_com_fail);
		}
		return (s_len);
	}

	/*
	 * Encode the compressed buffer size at the start. We'll need this in
	 * decompression to counter the effects of padding which might be added
	 * to the compressed buffer and which, if unhandled, would confuse the
	 * hell out of our decompression function.
	 */
	hdr->c_len = BE_32(c_len);

	/*
	 * Check version for overflow.
	 * The limit of 24 bits must not be exceeded. This allows a maximum
	 * version of 1677.72.15, which we never expect to be reached.
	 */
	ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);

	/*
	 * Encode the compression level as well. We may need to know the
	 * original compression level if compressed_arc is disabled, to match
	 * the compression settings to write this block to the L2ARC.
	 *
	 * Encode the actual level, so if the enum changes in the future, we
	 * will be compatible.
	 *
	 * The upper 24 bits store the ZSTD version to be able to provide
	 * future compatibility, since new versions might enhance the
	 * compression algorithm in a way where the compressed data will
	 * change.
	 *
	 * As soon as such incompatibility occurs, handling code needs to be
	 * added, differentiating between the versions.
	 */
	zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER);
	zfs_set_hdrlevel(hdr, level);
	hdr->raw_version_level = BE_32(hdr->raw_version_level);

	return (c_len + sizeof (*hdr));
}
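
/*
 * Note for callers (a sketch of the zio_compress_data() contract, not part
 * of this file): a return value equal to s_len tells the caller that the
 * block could not be usefully compressed and should be stored uncompressed;
 * any smaller value is the total size of the zfs_zstdhdr_t header plus the
 * magicless zstd frame written after it.
 */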

/* Decompress block using zstd and return its stored level */
int
zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
    size_t d_len, uint8_t *level)
{
	ZSTD_DCtx *dctx;
	size_t result;
	int16_t zstd_level;
	uint32_t c_len;
	const zfs_zstdhdr_t *hdr;
	zfs_zstdhdr_t hdr_copy;

	hdr = (const zfs_zstdhdr_t *)s_start;
	c_len = BE_32(hdr->c_len);

	/*
	 * Make a copy instead of directly converting the header, since we must
	 * not modify the original data that may be used again later.
	 */
	hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
	uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy);

	/*
	 * NOTE: We ignore the ZSTD version for now. As soon as any
	 * incompatibility occurs, it has to be handled accordingly.
	 * The version can be accessed via `hdr_copy.version`.
	 */

	/*
	 * Convert and check the level.
	 * An invalid level is a strong indicator of data corruption! In such
	 * a case, return an error so the upper layers can try to fix it.
	 */
	if (zstd_enum_to_level(curlevel, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_inval);
		return (1);
	}

	ASSERT3U(d_len, >=, s_len);
	ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT);

	/* Invalid compressed buffer size encoded at start */
	if (c_len + sizeof (*hdr) > s_len) {
		ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
		return (1);
	}

	dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
	if (!dctx) {
		ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
		return (1);
	}

	/* Set header type to "magicless" */
	ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);

	/* Decompress the data and release the context */
	result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
	ZSTD_freeDCtx(dctx);

	/*
	 * Returns 0 on success (decompression function returned non-negative)
	 * and non-zero on failure (decompression function returned negative).
	 */
	if (ZSTD_isError(result)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_fail);
		return (1);
	}

	if (level) {
		*level = curlevel;
	}

	return (0);
}

/* Decompress data block using zstd */
int
zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level __maybe_unused)
{
	return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len,
	    NULL));
}

/* Allocator for zstd compression context using the mempool allocator */
static void *
zstd_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);

	if (!z) {
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
		return (NULL);
	}

	return ((void*)z + (sizeof (struct zstd_kmem)));
}

/*
 * Allocator for zstd decompression context using the mempool allocator, with
 * fallback to reserved memory if allocation fails
 */
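/*
 * Allocation order (as implemented below): first the dctx mempool, then a
 * direct vmem_alloc(KM_SLEEP), and finally the preallocated
 * zstd_dctx_fallback buffer, which is serialized by its barrier mutex.
 */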
static void *
zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;
	enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
	if (!z) {
		/* Try harder, decompression shall not fail */
		z = vmem_alloc(nbytes, KM_SLEEP);
		if (z) {
			z->pool = NULL;
		}
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
	} else {
		return ((void*)z + (sizeof (struct zstd_kmem)));
	}

	/* Fallback if everything fails */
	if (!z) {
		/*
		 * Barrier since we can only handle it in a single thread. All
		 * other threads need to wait here until the fallback
		 * decompression is completed. zstd_free will release this
		 * barrier later.
		 */
		mutex_enter(&zstd_dctx_fallback.barrier);

		z = zstd_dctx_fallback.mem;
		type = ZSTD_KMEM_DCTX;
		ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
	}

	/* Allocation should always be successful */
	if (!z) {
		return (NULL);
	}

	z->kmem_type = type;
	z->kmem_size = nbytes;

	return ((void*)z + (sizeof (struct zstd_kmem)));
}

/* Free allocated memory by its specific type */
static void
zstd_free(void *opaque __maybe_unused, void *ptr)
{
	struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
	enum zstd_kmem_type type;

	ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
	ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);

	type = z->kmem_type;
	switch (type) {
	case ZSTD_KMEM_DEFAULT:
		vmem_free(z, z->kmem_size);
		break;
	case ZSTD_KMEM_POOL:
		zstd_mempool_free(z);
		break;
	case ZSTD_KMEM_DCTX:
		mutex_exit(&zstd_dctx_fallback.barrier);
		break;
	default:
		break;
	}
}

/* Allocate fallback memory to ensure safe decompression */
static void __init
create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
{
	mem->mem_size = size;
	mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
	mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
}

/* Initialize memory pool barrier mutexes */
static void __init
zstd_mempool_init(void)
{
	zstd_mempool_cctx = (struct zstd_pool *)
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
	zstd_mempool_dctx = (struct zstd_pool *)
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);

	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
		mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
	}
}

/* Initialize zstd-related memory handling */
static int __init
zstd_meminit(void)
{
	zstd_mempool_init();

	/*
	 * Estimate the size of the fallback decompression context.
	 * The expected size on x64 with current ZSTD should be about 160 KB.
	 */
	create_fallback_mem(&zstd_dctx_fallback,
	    P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
	    PAGESIZE));

	return (0);
}

/* Release object from pool and free memory */
static void __exit
release_pool(struct zstd_pool *pool)
{
	mutex_destroy(&pool->barrier);
	vmem_free(pool->mem, pool->size);
	pool->mem = NULL;
	pool->size = 0;
}

/* Release memory pool objects */
static void __exit
zstd_mempool_deinit(void)
{
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		release_pool(&zstd_mempool_cctx[i]);
		release_pool(&zstd_mempool_dctx[i]);
	}

	kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	zstd_mempool_dctx = NULL;
	zstd_mempool_cctx = NULL;
}

/* Release unused memory from the pools */
void
zfs_zstd_cache_reap_now(void)
{
	/*
	 * Short-circuit if there are no buffers to begin with.
	 */
	if (ZSTDSTAT(zstd_stat_buffers) == 0)
		return;

	/*
	 * Walk both pools and release old, unused objects whose timeout
	 * has expired.
	 */
	zstd_mempool_reap(zstd_mempool_cctx);
	zstd_mempool_reap(zstd_mempool_dctx);
}

extern int __init
zstd_init(void)
{
	/* Set the pool size to CPU count * 4 as an upper bound on threads */
	pool_count = (boot_ncpus * 4);
	zstd_meminit();

	/* Initialize kstat */
	zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
	    KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (zstd_ksp != NULL) {
		zstd_ksp->ks_data = &zstd_stats;
		kstat_install(zstd_ksp);
	}

	return (0);
}

extern void __exit
zstd_fini(void)
{
	/* Deinitialize kstat */
	if (zstd_ksp != NULL) {
		kstat_delete(zstd_ksp);
		zstd_ksp = NULL;
	}

	/* Release fallback memory */
	vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
	mutex_destroy(&zstd_dctx_fallback.barrier);

	/* Deinit memory pool */
	zstd_mempool_deinit();
}

#if defined(_KERNEL)
module_init(zstd_init);
module_exit(zstd_fini);

ZFS_MODULE_DESCRIPTION("ZSTD Compression for ZFS");
ZFS_MODULE_LICENSE("Dual BSD/GPL");
ZFS_MODULE_VERSION(ZSTD_VERSION_STRING "a");

EXPORT_SYMBOL(zfs_zstd_compress);
EXPORT_SYMBOL(zfs_zstd_decompress_level);
EXPORT_SYMBOL(zfs_zstd_decompress);
EXPORT_SYMBOL(zfs_zstd_cache_reap_now);
#endif