xref: /freebsd/sys/contrib/openzfs/module/zstd/zfs_zstd.c (revision 2617128a21bfb1b1e3065e54d46f9982d37c0bf8)
1 /*
2  * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  * 1. Redistributions of source code must retain the above copyright notice,
8  * this list of conditions and the following disclaimer.
9  *
10  * 2. Redistributions in binary form must reproduce the above copyright notice,
11  * this list of conditions and the following disclaimer in the documentation
12  * and/or other materials provided with the distribution.
13  *
14  * 3. Neither the name of the copyright holder nor the names of its
15  * contributors may be used to endorse or promote products derived from this
16  * software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 /*
32  * Copyright (c) 2016-2018, Klara Inc.
33  * Copyright (c) 2016-2018, Allan Jude
34  * Copyright (c) 2018-2020, Sebastian Gottschall
35  * Copyright (c) 2019-2020, Michael Niewöhner
36  * Copyright (c) 2020, The FreeBSD Foundation [1]
37  *
38  * [1] Portions of this software were developed by Allan Jude
39  *     under sponsorship from the FreeBSD Foundation.
40  */
41 
42 #include <sys/param.h>
43 #include <sys/sysmacros.h>
44 #include <sys/zfs_context.h>
45 #include <sys/zio_compress.h>
46 #include <sys/spa.h>
47 #include <sys/zstd/zstd.h>
48 
49 #define	ZSTD_STATIC_LINKING_ONLY
50 #include "lib/zstd.h"
51 #include "lib/zstd_errors.h"
52 
53 kstat_t *zstd_ksp = NULL;
54 
55 typedef struct zstd_stats {
56 	kstat_named_t	zstd_stat_alloc_fail;
57 	kstat_named_t	zstd_stat_alloc_fallback;
58 	kstat_named_t	zstd_stat_com_alloc_fail;
59 	kstat_named_t	zstd_stat_dec_alloc_fail;
60 	kstat_named_t	zstd_stat_com_inval;
61 	kstat_named_t	zstd_stat_dec_inval;
62 	kstat_named_t	zstd_stat_dec_header_inval;
63 	kstat_named_t	zstd_stat_com_fail;
64 	kstat_named_t	zstd_stat_dec_fail;
65 	kstat_named_t	zstd_stat_buffers;
66 	kstat_named_t	zstd_stat_size;
67 } zstd_stats_t;
68 
69 static zstd_stats_t zstd_stats = {
70 	{ "alloc_fail",			KSTAT_DATA_UINT64 },
71 	{ "alloc_fallback",		KSTAT_DATA_UINT64 },
72 	{ "compress_alloc_fail",	KSTAT_DATA_UINT64 },
73 	{ "decompress_alloc_fail",	KSTAT_DATA_UINT64 },
74 	{ "compress_level_invalid",	KSTAT_DATA_UINT64 },
75 	{ "decompress_level_invalid",	KSTAT_DATA_UINT64 },
76 	{ "decompress_header_invalid",	KSTAT_DATA_UINT64 },
77 	{ "compress_failed",		KSTAT_DATA_UINT64 },
78 	{ "decompress_failed",		KSTAT_DATA_UINT64 },
79 	{ "buffers",			KSTAT_DATA_UINT64 },
80 	{ "size",			KSTAT_DATA_UINT64 },
81 };
82 
/*
 * Origin of an allocation, recorded in zstd_kmem.kmem_type so that zstd_free
 * knows how to release it.
 */
enum zstd_kmem_type {
	/* Not a valid type; zstd_free asserts it never sees this */
	ZSTD_KMEM_UNKNOWN = 0,
	/* Plain vmem_alloc allocation, released with vmem_free */
	ZSTD_KMEM_DEFAULT,
	/* Buffer owned by a pool slot, handed out by zstd_mempool_alloc */
	ZSTD_KMEM_POOL,
	/* Reserved fallback memory for decompression only */
	ZSTD_KMEM_DCTX,
	/* Number of types; upper bound for sanity checks */
	ZSTD_KMEM_COUNT,
};
94 
95 /* Structure for pooled memory objects */
96 struct zstd_pool {
97 	void *mem;
98 	size_t size;
99 	kmutex_t barrier;
100 	hrtime_t timeout;
101 };
102 
103 /* Global structure for handling memory allocations */
104 struct zstd_kmem {
105 	enum zstd_kmem_type kmem_type;
106 	size_t kmem_size;
107 	struct zstd_pool *pool;
108 };
109 
110 /* Fallback memory structure used for decompression only if memory runs out */
111 struct zstd_fallback_mem {
112 	size_t mem_size;
113 	void *mem;
114 	kmutex_t barrier;
115 };
116 
117 struct zstd_levelmap {
118 	int16_t zstd_level;
119 	enum zio_zstd_levels level;
120 };
121 
/*
 * ZSTD memory handlers
 *
 * Compression and decompression use separate allocation callbacks: the
 * decompression one additionally falls back to reserved memory in case
 * memory runs out. Both share the same free callback.
 *
 * They are declared here because the ZSTD_customMem tables below refer to
 * them before their definitions.
 */
static void zstd_free(void *opaque, void *ptr);
static void *zstd_alloc(void *opaque, size_t size);
static void *zstd_dctx_alloc(void *opaque, size_t size);
133 
134 /* Compression memory handler */
135 static const ZSTD_customMem zstd_malloc = {
136 	zstd_alloc,
137 	zstd_free,
138 	NULL,
139 };
140 
141 /* Decompression memory handler */
142 static const ZSTD_customMem zstd_dctx_malloc = {
143 	zstd_dctx_alloc,
144 	zstd_free,
145 	NULL,
146 };
147 
148 /* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
149 static struct zstd_levelmap zstd_levels[] = {
150 	{ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
151 	{ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
152 	{ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
153 	{ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
154 	{ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
155 	{ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
156 	{ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
157 	{ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
158 	{ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
159 	{ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
160 	{ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
161 	{ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
162 	{ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
163 	{ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
164 	{ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
165 	{ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
166 	{ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
167 	{ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
168 	{ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
169 	{-1, ZIO_ZSTD_LEVEL_FAST_1},
170 	{-2, ZIO_ZSTD_LEVEL_FAST_2},
171 	{-3, ZIO_ZSTD_LEVEL_FAST_3},
172 	{-4, ZIO_ZSTD_LEVEL_FAST_4},
173 	{-5, ZIO_ZSTD_LEVEL_FAST_5},
174 	{-6, ZIO_ZSTD_LEVEL_FAST_6},
175 	{-7, ZIO_ZSTD_LEVEL_FAST_7},
176 	{-8, ZIO_ZSTD_LEVEL_FAST_8},
177 	{-9, ZIO_ZSTD_LEVEL_FAST_9},
178 	{-10, ZIO_ZSTD_LEVEL_FAST_10},
179 	{-20, ZIO_ZSTD_LEVEL_FAST_20},
180 	{-30, ZIO_ZSTD_LEVEL_FAST_30},
181 	{-40, ZIO_ZSTD_LEVEL_FAST_40},
182 	{-50, ZIO_ZSTD_LEVEL_FAST_50},
183 	{-60, ZIO_ZSTD_LEVEL_FAST_60},
184 	{-70, ZIO_ZSTD_LEVEL_FAST_70},
185 	{-80, ZIO_ZSTD_LEVEL_FAST_80},
186 	{-90, ZIO_ZSTD_LEVEL_FAST_90},
187 	{-100, ZIO_ZSTD_LEVEL_FAST_100},
188 	{-500, ZIO_ZSTD_LEVEL_FAST_500},
189 	{-1000, ZIO_ZSTD_LEVEL_FAST_1000},
190 };
191 
/*
 * This variable represents the maximum count of the pool based on the number
 * of CPUs plus some buffer. The value below is only a placeholder; zstd_init
 * replaces it with cpu count * 4 before the pools are created.
 */
static int pool_count = 16;

/* Number of slots in each memory pool */
#define	ZSTD_POOL_MAX		pool_count
/*
 * Seconds an idle pooled buffer is kept before it may be freed (2 minutes).
 * Parenthesized so the expansion stays one operand in any expression.
 */
#define	ZSTD_POOL_TIMEOUT	(60 * 2)
200 
201 static struct zstd_fallback_mem zstd_dctx_fallback;
202 static struct zstd_pool *zstd_mempool_cctx;
203 static struct zstd_pool *zstd_mempool_dctx;
204 
/*
 * The library zstd code expects these if ADDRESS_SANITIZER gets defined,
 * and while ASAN does this, KASAN defines that and does not. So to avoid
 * changing the external code, we do this.
 *
 * In kernel builds with the sanitizer enabled, both hooks are provided here
 * as empty functions.
 */
#if defined(__has_feature)
#if __has_feature(address_sanitizer)
#define	ADDRESS_SANITIZER 1
#endif
#elif defined(__SANITIZE_ADDRESS__)
#define	ADDRESS_SANITIZER 1
#endif
#if defined(_KERNEL) && defined(ADDRESS_SANITIZER)
void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
void __asan_poison_memory_region(void const volatile *addr, size_t size);
void __asan_unpoison_memory_region(void const volatile *addr, size_t size) {}
void __asan_poison_memory_region(void const volatile *addr, size_t size) {}
#endif
223 
224 
225 static void
226 zstd_mempool_reap(struct zstd_pool *zstd_mempool)
227 {
228 	struct zstd_pool *pool;
229 
230 	if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
231 		return;
232 	}
233 
234 	/* free obsolete slots */
235 	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
236 		pool = &zstd_mempool[i];
237 		if (pool->mem && mutex_tryenter(&pool->barrier)) {
238 			/* Free memory if unused object older than 2 minutes */
239 			if (pool->mem && gethrestime_sec() > pool->timeout) {
240 				vmem_free(pool->mem, pool->size);
241 				ZSTDSTAT_SUB(zstd_stat_buffers, 1);
242 				ZSTDSTAT_SUB(zstd_stat_size, pool->size);
243 				pool->mem = NULL;
244 				pool->size = 0;
245 				pool->timeout = 0;
246 			}
247 			mutex_exit(&pool->barrier);
248 		}
249 	}
250 }
251 
252 /*
253  * Try to get a cached allocated buffer from memory pool or allocate a new one
254  * if necessary. If a object is older than 2 minutes and does not fit the
255  * requested size, it will be released and a new cached entry will be allocated.
256  * If other pooled objects are detected without being used for 2 minutes, they
257  * will be released, too.
258  *
259  * The concept is that high frequency memory allocations of bigger objects are
260  * expensive. So if a lot of work is going on, allocations will be kept for a
261  * while and can be reused in that time frame.
262  *
263  * The scheduled release will be updated every time a object is reused.
264  */
265 
266 static void *
267 zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
268 {
269 	struct zstd_pool *pool;
270 	struct zstd_kmem *mem = NULL;
271 
272 	if (!zstd_mempool) {
273 		return (NULL);
274 	}
275 
276 	/* Seek for preallocated memory slot and free obsolete slots */
277 	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
278 		pool = &zstd_mempool[i];
279 		/*
280 		 * This lock is simply a marker for a pool object being in use.
281 		 * If it's already hold, it will be skipped.
282 		 *
283 		 * We need to create it before checking it to avoid race
284 		 * conditions caused by running in a threaded context.
285 		 *
286 		 * The lock is later released by zstd_mempool_free.
287 		 */
288 		if (mutex_tryenter(&pool->barrier)) {
289 			/*
290 			 * Check if objects fits the size, if so we take it and
291 			 * update the timestamp.
292 			 */
293 			if (pool->mem && size <= pool->size) {
294 				pool->timeout = gethrestime_sec() +
295 				    ZSTD_POOL_TIMEOUT;
296 				mem = pool->mem;
297 				return (mem);
298 			}
299 			mutex_exit(&pool->barrier);
300 		}
301 	}
302 
303 	/*
304 	 * If no preallocated slot was found, try to fill in a new one.
305 	 *
306 	 * We run a similar algorithm twice here to avoid pool fragmentation.
307 	 * The first one may generate holes in the list if objects get released.
308 	 * We always make sure that these holes get filled instead of adding new
309 	 * allocations constantly at the end.
310 	 */
311 	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
312 		pool = &zstd_mempool[i];
313 		if (mutex_tryenter(&pool->barrier)) {
314 			/* Object is free, try to allocate new one */
315 			if (!pool->mem) {
316 				mem = vmem_alloc(size, KM_SLEEP);
317 				if (mem) {
318 					ZSTDSTAT_ADD(zstd_stat_buffers, 1);
319 					ZSTDSTAT_ADD(zstd_stat_size, size);
320 					pool->mem = mem;
321 					pool->size = size;
322 					/* Keep track for later release */
323 					mem->pool = pool;
324 					mem->kmem_type = ZSTD_KMEM_POOL;
325 					mem->kmem_size = size;
326 				}
327 			}
328 
329 			if (size <= pool->size) {
330 				/* Update timestamp */
331 				pool->timeout = gethrestime_sec() +
332 				    ZSTD_POOL_TIMEOUT;
333 
334 				return (pool->mem);
335 			}
336 
337 			mutex_exit(&pool->barrier);
338 		}
339 	}
340 
341 	/*
342 	 * If the pool is full or the allocation failed, try lazy allocation
343 	 * instead.
344 	 */
345 	if (!mem) {
346 		mem = vmem_alloc(size, KM_NOSLEEP);
347 		if (mem) {
348 			mem->pool = NULL;
349 			mem->kmem_type = ZSTD_KMEM_DEFAULT;
350 			mem->kmem_size = size;
351 		}
352 	}
353 
354 	return (mem);
355 }
356 
357 /* Mark object as released by releasing the barrier mutex */
358 static void
359 zstd_mempool_free(struct zstd_kmem *z)
360 {
361 	mutex_exit(&z->pool->barrier);
362 }
363 
364 /* Convert ZFS internal enum to ZSTD level */
365 static int
366 zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
367 {
368 	if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
369 		*zstd_level = zstd_levels[level - 1].zstd_level;
370 		return (0);
371 	}
372 	if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
373 	    level <= ZIO_ZSTD_LEVEL_FAST_1000) {
374 		*zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
375 		    + ZIO_ZSTD_LEVEL_19].zstd_level;
376 		return (0);
377 	}
378 
379 	/* Invalid/unknown zfs compression enum - this should never happen. */
380 	return (1);
381 }
382 
383 /* Compress block using zstd */
384 size_t
385 zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
386     int level)
387 {
388 	size_t c_len;
389 	int16_t zstd_level;
390 	zfs_zstdhdr_t *hdr;
391 	ZSTD_CCtx *cctx;
392 
393 	hdr = (zfs_zstdhdr_t *)d_start;
394 
395 	/* Skip compression if the specified level is invalid */
396 	if (zstd_enum_to_level(level, &zstd_level)) {
397 		ZSTDSTAT_BUMP(zstd_stat_com_inval);
398 		return (s_len);
399 	}
400 
401 	ASSERT3U(d_len, >=, sizeof (*hdr));
402 	ASSERT3U(d_len, <=, s_len);
403 	ASSERT3U(zstd_level, !=, 0);
404 
405 	cctx = ZSTD_createCCtx_advanced(zstd_malloc);
406 
407 	/*
408 	 * Out of kernel memory, gently fall through - this will disable
409 	 * compression in zio_compress_data
410 	 */
411 	if (!cctx) {
412 		ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
413 		return (s_len);
414 	}
415 
416 	/* Set the compression level */
417 	ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);
418 
419 	/* Use the "magicless" zstd header which saves us 4 header bytes */
420 	ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);
421 
422 	/*
423 	 * Disable redundant checksum calculation and content size storage since
424 	 * this is already done by ZFS itself.
425 	 */
426 	ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
427 	ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);
428 
429 	c_len = ZSTD_compress2(cctx,
430 	    hdr->data,
431 	    d_len - sizeof (*hdr),
432 	    s_start, s_len);
433 
434 	ZSTD_freeCCtx(cctx);
435 
436 	/* Error in the compression routine, disable compression. */
437 	if (ZSTD_isError(c_len)) {
438 		/*
439 		 * If we are aborting the compression because the saves are
440 		 * too small, that is not a failure. Everything else is a
441 		 * failure, so increment the compression failure counter.
442 		 */
443 		if (ZSTD_getErrorCode(c_len) != ZSTD_error_dstSize_tooSmall) {
444 			ZSTDSTAT_BUMP(zstd_stat_com_fail);
445 		}
446 		return (s_len);
447 	}
448 
449 	/*
450 	 * Encode the compressed buffer size at the start. We'll need this in
451 	 * decompression to counter the effects of padding which might be added
452 	 * to the compressed buffer and which, if unhandled, would confuse the
453 	 * hell out of our decompression function.
454 	 */
455 	hdr->c_len = BE_32(c_len);
456 
457 	/*
458 	 * Check version for overflow.
459 	 * The limit of 24 bits must not be exceeded. This allows a maximum
460 	 * version 1677.72.15 which we don't expect to be ever reached.
461 	 */
462 	ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);
463 
464 	/*
465 	 * Encode the compression level as well. We may need to know the
466 	 * original compression level if compressed_arc is disabled, to match
467 	 * the compression settings to write this block to the L2ARC.
468 	 *
469 	 * Encode the actual level, so if the enum changes in the future, we
470 	 * will be compatible.
471 	 *
472 	 * The upper 24 bits store the ZSTD version to be able to provide
473 	 * future compatibility, since new versions might enhance the
474 	 * compression algorithm in a way, where the compressed data will
475 	 * change.
476 	 *
477 	 * As soon as such incompatibility occurs, handling code needs to be
478 	 * added, differentiating between the versions.
479 	 */
480 	hdr->version = ZSTD_VERSION_NUMBER;
481 	hdr->level = level;
482 	hdr->raw_version_level = BE_32(hdr->raw_version_level);
483 
484 	return (c_len + sizeof (*hdr));
485 }
486 
487 /* Decompress block using zstd and return its stored level */
488 int
489 zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
490     size_t d_len, uint8_t *level)
491 {
492 	ZSTD_DCtx *dctx;
493 	size_t result;
494 	int16_t zstd_level;
495 	uint32_t c_len;
496 	const zfs_zstdhdr_t *hdr;
497 	zfs_zstdhdr_t hdr_copy;
498 
499 	hdr = (const zfs_zstdhdr_t *)s_start;
500 	c_len = BE_32(hdr->c_len);
501 
502 	/*
503 	 * Make a copy instead of directly converting the header, since we must
504 	 * not modify the original data that may be used again later.
505 	 */
506 	hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
507 
508 	/*
509 	 * NOTE: We ignore the ZSTD version for now. As soon as any
510 	 * incompatibility occurs, it has to be handled accordingly.
511 	 * The version can be accessed via `hdr_copy.version`.
512 	 */
513 
514 	/*
515 	 * Convert and check the level
516 	 * An invalid level is a strong indicator for data corruption! In such
517 	 * case return an error so the upper layers can try to fix it.
518 	 */
519 	if (zstd_enum_to_level(hdr_copy.level, &zstd_level)) {
520 		ZSTDSTAT_BUMP(zstd_stat_dec_inval);
521 		return (1);
522 	}
523 
524 	ASSERT3U(d_len, >=, s_len);
525 	ASSERT3U(hdr_copy.level, !=, ZIO_COMPLEVEL_INHERIT);
526 
527 	/* Invalid compressed buffer size encoded at start */
528 	if (c_len + sizeof (*hdr) > s_len) {
529 		ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
530 		return (1);
531 	}
532 
533 	dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
534 	if (!dctx) {
535 		ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
536 		return (1);
537 	}
538 
539 	/* Set header type to "magicless" */
540 	ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);
541 
542 	/* Decompress the data and release the context */
543 	result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
544 	ZSTD_freeDCtx(dctx);
545 
546 	/*
547 	 * Returns 0 on success (decompression function returned non-negative)
548 	 * and non-zero on failure (decompression function returned negative.
549 	 */
550 	if (ZSTD_isError(result)) {
551 		ZSTDSTAT_BUMP(zstd_stat_dec_fail);
552 		return (1);
553 	}
554 
555 	if (level) {
556 		*level = hdr_copy.level;
557 	}
558 
559 	return (0);
560 }
561 
562 /* Decompress datablock using zstd */
563 int
564 zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len,
565     int level __maybe_unused)
566 {
567 
568 	return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len,
569 	    NULL));
570 }
571 
572 /* Allocator for zstd compression context using mempool_allocator */
573 static void *
574 zstd_alloc(void *opaque __maybe_unused, size_t size)
575 {
576 	size_t nbytes = sizeof (struct zstd_kmem) + size;
577 	struct zstd_kmem *z = NULL;
578 
579 	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);
580 
581 	if (!z) {
582 		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
583 		return (NULL);
584 	}
585 
586 	return ((void*)z + (sizeof (struct zstd_kmem)));
587 }
588 
589 /*
590  * Allocator for zstd decompression context using mempool_allocator with
591  * fallback to reserved memory if allocation fails
592  */
593 static void *
594 zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
595 {
596 	size_t nbytes = sizeof (struct zstd_kmem) + size;
597 	struct zstd_kmem *z = NULL;
598 	enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;
599 
600 	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
601 	if (!z) {
602 		/* Try harder, decompression shall not fail */
603 		z = vmem_alloc(nbytes, KM_SLEEP);
604 		if (z) {
605 			z->pool = NULL;
606 		}
607 		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
608 	} else {
609 		return ((void*)z + (sizeof (struct zstd_kmem)));
610 	}
611 
612 	/* Fallback if everything fails */
613 	if (!z) {
614 		/*
615 		 * Barrier since we only can handle it in a single thread. All
616 		 * other following threads need to wait here until decompression
617 		 * is completed. zstd_free will release this barrier later.
618 		 */
619 		mutex_enter(&zstd_dctx_fallback.barrier);
620 
621 		z = zstd_dctx_fallback.mem;
622 		type = ZSTD_KMEM_DCTX;
623 		ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
624 	}
625 
626 	/* Allocation should always be successful */
627 	if (!z) {
628 		return (NULL);
629 	}
630 
631 	z->kmem_type = type;
632 	z->kmem_size = nbytes;
633 
634 	return ((void*)z + (sizeof (struct zstd_kmem)));
635 }
636 
637 /* Free allocated memory by its specific type */
638 static void
639 zstd_free(void *opaque __maybe_unused, void *ptr)
640 {
641 	struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
642 	enum zstd_kmem_type type;
643 
644 	ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
645 	ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);
646 
647 	type = z->kmem_type;
648 	switch (type) {
649 	case ZSTD_KMEM_DEFAULT:
650 		vmem_free(z, z->kmem_size);
651 		break;
652 	case ZSTD_KMEM_POOL:
653 		zstd_mempool_free(z);
654 		break;
655 	case ZSTD_KMEM_DCTX:
656 		mutex_exit(&zstd_dctx_fallback.barrier);
657 		break;
658 	default:
659 		break;
660 	}
661 }
662 
663 /* Allocate fallback memory to ensure safe decompression */
664 static void __init
665 create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
666 {
667 	mem->mem_size = size;
668 	mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
669 	mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
670 }
671 
672 /* Initialize memory pool barrier mutexes */
673 static void __init
674 zstd_mempool_init(void)
675 {
676 	zstd_mempool_cctx = (struct zstd_pool *)
677 	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
678 	zstd_mempool_dctx = (struct zstd_pool *)
679 	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
680 
681 	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
682 		mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
683 		    MUTEX_DEFAULT, NULL);
684 		mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
685 		    MUTEX_DEFAULT, NULL);
686 	}
687 }
688 
689 /* Initialize zstd-related memory handling */
690 static int __init
691 zstd_meminit(void)
692 {
693 	zstd_mempool_init();
694 
695 	/*
696 	 * Estimate the size of the fallback decompression context.
697 	 * The expected size on x64 with current ZSTD should be about 160 KB.
698 	 */
699 	create_fallback_mem(&zstd_dctx_fallback,
700 	    P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
701 	    PAGESIZE));
702 
703 	return (0);
704 }
705 
706 /* Release object from pool and free memory */
707 static void __exit
708 release_pool(struct zstd_pool *pool)
709 {
710 	mutex_destroy(&pool->barrier);
711 	vmem_free(pool->mem, pool->size);
712 	pool->mem = NULL;
713 	pool->size = 0;
714 }
715 
716 /* Release memory pool objects */
717 static void __exit
718 zstd_mempool_deinit(void)
719 {
720 	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
721 		release_pool(&zstd_mempool_cctx[i]);
722 		release_pool(&zstd_mempool_dctx[i]);
723 	}
724 
725 	kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
726 	kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
727 	zstd_mempool_dctx = NULL;
728 	zstd_mempool_cctx = NULL;
729 }
730 
731 /* release unused memory from pool */
732 
733 void
734 zfs_zstd_cache_reap_now(void)
735 {
736 
737 	/*
738 	 * Short-circuit if there are no buffers to begin with.
739 	 */
740 	if (ZSTDSTAT(zstd_stat_buffers) == 0)
741 		return;
742 
743 	/*
744 	 * calling alloc with zero size seeks
745 	 * and releases old unused objects
746 	 */
747 	zstd_mempool_reap(zstd_mempool_cctx);
748 	zstd_mempool_reap(zstd_mempool_dctx);
749 }
750 
751 extern int __init
752 zstd_init(void)
753 {
754 	/* Set pool size by using maximum sane thread count * 4 */
755 	pool_count = (boot_ncpus * 4);
756 	zstd_meminit();
757 
758 	/* Initialize kstat */
759 	zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
760 	    KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
761 	    KSTAT_FLAG_VIRTUAL);
762 	if (zstd_ksp != NULL) {
763 		zstd_ksp->ks_data = &zstd_stats;
764 		kstat_install(zstd_ksp);
765 	}
766 
767 	return (0);
768 }
769 
770 extern void __exit
771 zstd_fini(void)
772 {
773 	/* Deinitialize kstat */
774 	if (zstd_ksp != NULL) {
775 		kstat_delete(zstd_ksp);
776 		zstd_ksp = NULL;
777 	}
778 
779 	/* Release fallback memory */
780 	vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
781 	mutex_destroy(&zstd_dctx_fallback.barrier);
782 
783 	/* Deinit memory pool */
784 	zstd_mempool_deinit();
785 }
786 
#if defined(_KERNEL)
/* Kernel builds only: register the module entry and exit points */
module_init(zstd_init);
module_exit(zstd_fini);

/* Module metadata */
ZFS_MODULE_DESCRIPTION("ZSTD Compression for ZFS");
ZFS_MODULE_LICENSE("Dual BSD/GPL");
ZFS_MODULE_VERSION(ZSTD_VERSION_STRING);

/* Functions exported for use outside this module */
EXPORT_SYMBOL(zfs_zstd_compress);
EXPORT_SYMBOL(zfs_zstd_decompress_level);
EXPORT_SYMBOL(zfs_zstd_decompress);
EXPORT_SYMBOL(zfs_zstd_cache_reap_now);
#endif
800