xref: /freebsd/sys/contrib/openzfs/lib/libzpool/kernel.c (revision 1bb8b1d7e190d75597d31ab87364a058e8ef8c5b)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
25  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
26  */
27 
28 #include <assert.h>
29 #include <fcntl.h>
30 #include <libgen.h>
31 #include <poll.h>
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <string.h>
35 #include <limits.h>
36 #include <libzutil.h>
37 #include <sys/crypto/icp.h>
38 #include <sys/processor.h>
39 #include <sys/rrwlock.h>
40 #include <sys/spa.h>
41 #include <sys/stat.h>
42 #include <sys/systeminfo.h>
43 #include <sys/time.h>
44 #include <sys/utsname.h>
45 #include <sys/zfs_context.h>
46 #include <sys/zfs_onexit.h>
47 #include <sys/zfs_vfsops.h>
48 #include <sys/zstd/zstd.h>
49 #include <sys/zvol.h>
50 #include <zfs_fletcher.h>
51 #include <zlib.h>
52 
53 /*
54  * Emulation of kernel services in userland.
55  */
56 
/* Physical memory in pages; set from sysconf(_SC_PHYS_PAGES) by kernel_init(). */
uint64_t physmem;
/* Emulated hostid; assigned in kernel_init() and returned by zone_get_hostid(). */
uint32_t hostid;
/* Host identification, filled in by uname() during kernel_init(). */
struct utsname hw_utsname;

/* If set, all blocks read will be copied to the specified directory. */
char *vn_dumpdir = NULL;

/* this only exists to have its address taken */
struct proc p0;
66 
67 /*
68  * =========================================================================
69  * threads
70  * =========================================================================
71  *
72  * TS_STACK_MIN is dictated by the minimum allowed pthread stack size.  While
73  * TS_STACK_MAX is somewhat arbitrary, it was selected to be large enough for
74  * the expected stack depth while small enough to avoid exhausting address
75  * space with high thread counts.
76  */
/* Lower bound for any requested stack: the pthread minimum or 32 KiB. */
#define	TS_STACK_MIN	MAX(PTHREAD_STACK_MIN, 32768)
/* Default stack size when neither the caller nor ZFS_STACK_SIZE gives one. */
#define	TS_STACK_MAX	(256 * 1024)
79 
/*
 * Start record handed to a new pthread: the function to run and its
 * single argument.  Allocated by zk_thread_create() and freed by the
 * thread entry point.
 */
struct zk_thread_wrapper {
	void (*func)(void *);
	void *arg;
};
84 
85 static void *
86 zk_thread_wrapper(void *arg)
87 {
88 	struct zk_thread_wrapper ztw;
89 	memcpy(&ztw, arg, sizeof (ztw));
90 	free(arg);
91 	ztw.func(ztw.arg);
92 	return (NULL);
93 }
94 
/*
 * Create a userland thread that runs func(arg).
 *
 * name    - name applied to the new thread via pthread_setname_np()
 * func    - function the thread executes
 * arg     - argument passed to func
 * stksize - requested stack size in bytes, or 0 to use the default
 *           (ZFS_STACK_SIZE from the environment, else TS_STACK_MAX)
 * state   - flags; TS_JOINABLE creates a joinable thread, otherwise the
 *           thread is created detached
 *
 * Returns the pthread id cast to a kthread_t pointer.  Any failure in the
 * pthread or allocation calls trips a VERIFY.
 */
kthread_t *
zk_thread_create(const char *name, void (*func)(void *), void *arg,
    size_t stksize, int state)
{
	pthread_attr_t attr;
	pthread_t tid;
	char *stkstr;
	struct zk_thread_wrapper *ztw;
	int detachstate = PTHREAD_CREATE_DETACHED;

	VERIFY0(pthread_attr_init(&attr));

	if (state & TS_JOINABLE)
		detachstate = PTHREAD_CREATE_JOINABLE;

	VERIFY0(pthread_attr_setdetachstate(&attr, detachstate));

	/*
	 * We allow the default stack size in user space to be specified by
	 * setting the ZFS_STACK_SIZE environment variable.  This allows us
	 * the convenience of observing and debugging stack overruns in
	 * user space.  Explicitly specified stack sizes will be honored.
	 * The usage of ZFS_STACK_SIZE is discussed further in the
	 * ENVIRONMENT VARIABLES sections of the ztest(1) man page.
	 */
	if (stksize == 0) {
		stkstr = getenv("ZFS_STACK_SIZE");

		if (stkstr == NULL)
			stksize = TS_STACK_MAX;
		else
			stksize = MAX(atoi(stkstr), TS_STACK_MIN);
	}

	VERIFY3S(stksize, >, 0);
	/* Clamp to the minimum and round up to a whole number of pages. */
	stksize = P2ROUNDUP(MAX(stksize, TS_STACK_MIN), PAGESIZE);

	/*
	 * If this ever fails, it may be because the stack size is not a
	 * multiple of system page size.
	 */
	VERIFY0(pthread_attr_setstacksize(&attr, stksize));
	VERIFY0(pthread_attr_setguardsize(&attr, PAGESIZE));

	/* The start record is freed by zk_thread_wrapper() in the new thread. */
	VERIFY(ztw = malloc(sizeof (*ztw)));
	ztw->func = func;
	ztw->arg = arg;
	VERIFY0(pthread_create(&tid, &attr, zk_thread_wrapper, ztw));
	VERIFY0(pthread_attr_destroy(&attr));

	/*
	 * Naming is best-effort: the return value is not checked.
	 * NOTE(review): the thread is already running here, so a detached
	 * thread could in principle have exited before this call -- confirm.
	 */
	pthread_setname_np(tid, name);

	return ((void *)(uintptr_t)tid);
}
149 
150 /*
151  * =========================================================================
152  * kstats
153  * =========================================================================
154  */
155 kstat_t *
156 kstat_create(const char *module, int instance, const char *name,
157     const char *class, uchar_t type, ulong_t ndata, uchar_t ks_flag)
158 {
159 	(void) module, (void) instance, (void) name, (void) class, (void) type,
160 	    (void) ndata, (void) ks_flag;
161 	return (NULL);
162 }
163 
/* Userland stub: installing a kstat does nothing. */
void
kstat_install(kstat_t *ksp)
{
	(void) ksp;
}
169 
/* Userland stub: deleting a kstat does nothing. */
void
kstat_delete(kstat_t *ksp)
{
	(void) ksp;
}
175 
176 void
177 kstat_set_raw_ops(kstat_t *ksp,
178     int (*headers)(char *buf, size_t size),
179     int (*data)(char *buf, size_t size, void *data),
180     void *(*addr)(kstat_t *ksp, loff_t index))
181 {
182 	(void) ksp, (void) headers, (void) data, (void) addr;
183 }
184 
185 /*
186  * =========================================================================
187  * mutexes
188  * =========================================================================
189  */
190 
191 void
192 mutex_init(kmutex_t *mp, char *name, int type, void *cookie)
193 {
194 	(void) name, (void) type, (void) cookie;
195 	VERIFY0(pthread_mutex_init(&mp->m_lock, NULL));
196 	memset(&mp->m_owner, 0, sizeof (pthread_t));
197 }
198 
/* Destroy the underlying pthread mutex; failure trips a VERIFY. */
void
mutex_destroy(kmutex_t *mp)
{
	VERIFY0(pthread_mutex_destroy(&mp->m_lock));
}
204 
/*
 * Acquire the mutex (blocking) and record the calling thread as owner.
 * A locking failure trips a VERIFY.
 */
void
mutex_enter(kmutex_t *mp)
{
	VERIFY0(pthread_mutex_lock(&mp->m_lock));
	/* Ownership is recorded only after the lock is actually held. */
	mp->m_owner = pthread_self();
}
211 
212 int
213 mutex_enter_check_return(kmutex_t *mp)
214 {
215 	int error = pthread_mutex_lock(&mp->m_lock);
216 	if (error == 0)
217 		mp->m_owner = pthread_self();
218 	return (error);
219 }
220 
221 int
222 mutex_tryenter(kmutex_t *mp)
223 {
224 	int error = pthread_mutex_trylock(&mp->m_lock);
225 	if (error == 0) {
226 		mp->m_owner = pthread_self();
227 		return (1);
228 	} else {
229 		VERIFY3S(error, ==, EBUSY);
230 		return (0);
231 	}
232 }
233 
/*
 * Release the mutex.  The owner field is cleared while the lock is still
 * held, before the pthread mutex is unlocked.
 */
void
mutex_exit(kmutex_t *mp)
{
	memset(&mp->m_owner, 0, sizeof (pthread_t));
	VERIFY0(pthread_mutex_unlock(&mp->m_lock));
}
240 
241 /*
242  * =========================================================================
243  * rwlocks
244  * =========================================================================
245  */
246 
247 void
248 rw_init(krwlock_t *rwlp, char *name, int type, void *arg)
249 {
250 	(void) name, (void) type, (void) arg;
251 	VERIFY0(pthread_rwlock_init(&rwlp->rw_lock, NULL));
252 	rwlp->rw_readers = 0;
253 	rwlp->rw_owner = 0;
254 }
255 
/* Destroy the underlying pthread rwlock; failure trips a VERIFY. */
void
rw_destroy(krwlock_t *rwlp)
{
	VERIFY0(pthread_rwlock_destroy(&rwlp->rw_lock));
}
261 
262 void
263 rw_enter(krwlock_t *rwlp, krw_t rw)
264 {
265 	if (rw == RW_READER) {
266 		VERIFY0(pthread_rwlock_rdlock(&rwlp->rw_lock));
267 		atomic_inc_uint(&rwlp->rw_readers);
268 	} else {
269 		VERIFY0(pthread_rwlock_wrlock(&rwlp->rw_lock));
270 		rwlp->rw_owner = pthread_self();
271 	}
272 }
273 
274 void
275 rw_exit(krwlock_t *rwlp)
276 {
277 	if (RW_READ_HELD(rwlp))
278 		atomic_dec_uint(&rwlp->rw_readers);
279 	else
280 		rwlp->rw_owner = 0;
281 
282 	VERIFY0(pthread_rwlock_unlock(&rwlp->rw_lock));
283 }
284 
285 int
286 rw_tryenter(krwlock_t *rwlp, krw_t rw)
287 {
288 	int error;
289 
290 	if (rw == RW_READER)
291 		error = pthread_rwlock_tryrdlock(&rwlp->rw_lock);
292 	else
293 		error = pthread_rwlock_trywrlock(&rwlp->rw_lock);
294 
295 	if (error == 0) {
296 		if (rw == RW_READER)
297 			atomic_inc_uint(&rwlp->rw_readers);
298 		else
299 			rwlp->rw_owner = pthread_self();
300 
301 		return (1);
302 	}
303 
304 	VERIFY3S(error, ==, EBUSY);
305 
306 	return (0);
307 }
308 
/*
 * Return the emulated hostid.  The zone argument is ignored; the value is
 * the file-scope hostid variable.
 */
uint32_t
zone_get_hostid(void *zonep)
{
	/*
	 * We're emulating the system's hostid in userland.
	 */
	(void) zonep;
	return (hostid);
}
318 
/*
 * Reader-to-writer upgrade is not implemented in userland: the lock is
 * left untouched and 0 is always returned.
 */
int
rw_tryupgrade(krwlock_t *rwlp)
{
	(void) rwlp;
	return (0);
}
325 
326 /*
327  * =========================================================================
328  * condition variables
329  * =========================================================================
330  */
331 
332 void
333 cv_init(kcondvar_t *cv, char *name, int type, void *arg)
334 {
335 	(void) name, (void) type, (void) arg;
336 	VERIFY0(pthread_cond_init(cv, NULL));
337 }
338 
/* Destroy the condition variable; failure trips a VERIFY. */
void
cv_destroy(kcondvar_t *cv)
{
	VERIFY0(pthread_cond_destroy(cv));
}
344 
/*
 * Wait on cv with mp held.  The owner field is cleared before the wait
 * and set back to the calling thread once pthread_cond_wait() returns.
 */
void
cv_wait(kcondvar_t *cv, kmutex_t *mp)
{
	memset(&mp->m_owner, 0, sizeof (pthread_t));
	VERIFY0(pthread_cond_wait(cv, &mp->m_lock));
	mp->m_owner = pthread_self();
}
352 
/*
 * Signal-interruptible wait as seen by callers; here it is a plain
 * cv_wait() and always returns 1.
 */
int
cv_wait_sig(kcondvar_t *cv, kmutex_t *mp)
{
	cv_wait(cv, mp);
	return (1);
}
359 
360 int
361 cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
362 {
363 	int error;
364 	struct timeval tv;
365 	struct timespec ts;
366 	clock_t delta;
367 
368 	delta = abstime - ddi_get_lbolt();
369 	if (delta <= 0)
370 		return (-1);
371 
372 	VERIFY(gettimeofday(&tv, NULL) == 0);
373 
374 	ts.tv_sec = tv.tv_sec + delta / hz;
375 	ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC + (delta % hz) * (NANOSEC / hz);
376 	if (ts.tv_nsec >= NANOSEC) {
377 		ts.tv_sec++;
378 		ts.tv_nsec -= NANOSEC;
379 	}
380 
381 	memset(&mp->m_owner, 0, sizeof (pthread_t));
382 	error = pthread_cond_timedwait(cv, &mp->m_lock, &ts);
383 	mp->m_owner = pthread_self();
384 
385 	if (error == ETIMEDOUT)
386 		return (-1);
387 
388 	VERIFY0(error);
389 
390 	return (1);
391 }
392 
393 int
394 cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res,
395     int flag)
396 {
397 	(void) res;
398 	int error;
399 	struct timeval tv;
400 	struct timespec ts;
401 	hrtime_t delta;
402 
403 	ASSERT(flag == 0 || flag == CALLOUT_FLAG_ABSOLUTE);
404 
405 	delta = tim;
406 	if (flag & CALLOUT_FLAG_ABSOLUTE)
407 		delta -= gethrtime();
408 
409 	if (delta <= 0)
410 		return (-1);
411 
412 	VERIFY0(gettimeofday(&tv, NULL));
413 
414 	ts.tv_sec = tv.tv_sec + delta / NANOSEC;
415 	ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC + (delta % NANOSEC);
416 	if (ts.tv_nsec >= NANOSEC) {
417 		ts.tv_sec++;
418 		ts.tv_nsec -= NANOSEC;
419 	}
420 
421 	memset(&mp->m_owner, 0, sizeof (pthread_t));
422 	error = pthread_cond_timedwait(cv, &mp->m_lock, &ts);
423 	mp->m_owner = pthread_self();
424 
425 	if (error == ETIMEDOUT)
426 		return (-1);
427 
428 	VERIFY0(error);
429 
430 	return (1);
431 }
432 
/* Wake one waiter on cv; failure trips a VERIFY. */
void
cv_signal(kcondvar_t *cv)
{
	VERIFY0(pthread_cond_signal(cv));
}
438 
/* Wake all waiters on cv; failure trips a VERIFY. */
void
cv_broadcast(kcondvar_t *cv)
{
	VERIFY0(pthread_cond_broadcast(cv));
}
444 
445 /*
446  * =========================================================================
447  * procfs list
448  * =========================================================================
449  */
450 
/*
 * Userland stub: the seq_file, the format and any variadic arguments are
 * discarded and nothing is printed.
 */
void
seq_printf(struct seq_file *m, const char *fmt, ...)
{
	(void) m;
	(void) fmt;
}
456 
457 void
458 procfs_list_install(const char *module,
459     const char *submodule,
460     const char *name,
461     mode_t mode,
462     procfs_list_t *procfs_list,
463     int (*show)(struct seq_file *f, void *p),
464     int (*show_header)(struct seq_file *f),
465     int (*clear)(procfs_list_t *procfs_list),
466     size_t procfs_list_node_off)
467 {
468 	(void) module, (void) submodule, (void) name, (void) mode, (void) show,
469 	    (void) show_header, (void) clear;
470 	mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL);
471 	list_create(&procfs_list->pl_list,
472 	    procfs_list_node_off + sizeof (procfs_list_node_t),
473 	    procfs_list_node_off + offsetof(procfs_list_node_t, pln_link));
474 	procfs_list->pl_next_id = 1;
475 	procfs_list->pl_node_offset = procfs_list_node_off;
476 }
477 
/* Userland stub: there is nothing to unregister. */
void
procfs_list_uninstall(procfs_list_t *procfs_list)
{
	(void) procfs_list;
}
483 
/*
 * Tear down what procfs_list_install() set up: the backing list and the
 * lock.  The list is asserted to be empty first.
 */
void
procfs_list_destroy(procfs_list_t *procfs_list)
{
	ASSERT(list_is_empty(&procfs_list->pl_list));
	list_destroy(&procfs_list->pl_list);
	mutex_destroy(&procfs_list->pl_lock);
}
491 
/*
 * Lvalue for the pln_id of the procfs_list_node_t embedded in obj, found
 * pl_node_offset bytes from the start of the object.
 */
#define	NODE_ID(procfs_list, obj) \
		(((procfs_list_node_t *)(((char *)obj) + \
		(procfs_list)->pl_node_offset))->pln_id)
495 
496 void
497 procfs_list_add(procfs_list_t *procfs_list, void *p)
498 {
499 	ASSERT(MUTEX_HELD(&procfs_list->pl_lock));
500 	NODE_ID(procfs_list, p) = procfs_list->pl_next_id++;
501 	list_insert_tail(&procfs_list->pl_list, p);
502 }
503 
504 /*
505  * =========================================================================
506  * vnode operations
507  * =========================================================================
508  */
509 
510 /*
511  * =========================================================================
512  * Figure out which debugging statements to print
513  * =========================================================================
514  */
515 
/* Comma-separated debug selector list from "debug=..." or ZFS_DEBUG. */
static char *dprintf_string;
/* Nonzero when the selector list contains "on": print every dprintf. */
static int dprintf_print_all;
518 
519 int
520 dprintf_find_string(const char *string)
521 {
522 	char *tmp_str = dprintf_string;
523 	int len = strlen(string);
524 
525 	/*
526 	 * Find out if this is a string we want to print.
527 	 * String format: file1.c,function_name1,file2.c,file3.c
528 	 */
529 
530 	while (tmp_str != NULL) {
531 		if (strncmp(tmp_str, string, len) == 0 &&
532 		    (tmp_str[len] == ',' || tmp_str[len] == '\0'))
533 			return (1);
534 		tmp_str = strchr(tmp_str, ',');
535 		if (tmp_str != NULL)
536 			tmp_str++; /* Get rid of , */
537 	}
538 	return (0);
539 }
540 
541 void
542 dprintf_setup(int *argc, char **argv)
543 {
544 	int i, j;
545 
546 	/*
547 	 * Debugging can be specified two ways: by setting the
548 	 * environment variable ZFS_DEBUG, or by including a
549 	 * "debug=..."  argument on the command line.  The command
550 	 * line setting overrides the environment variable.
551 	 */
552 
553 	for (i = 1; i < *argc; i++) {
554 		int len = strlen("debug=");
555 		/* First look for a command line argument */
556 		if (strncmp("debug=", argv[i], len) == 0) {
557 			dprintf_string = argv[i] + len;
558 			/* Remove from args */
559 			for (j = i; j < *argc; j++)
560 				argv[j] = argv[j+1];
561 			argv[j] = NULL;
562 			(*argc)--;
563 		}
564 	}
565 
566 	if (dprintf_string == NULL) {
567 		/* Look for ZFS_DEBUG environment variable */
568 		dprintf_string = getenv("ZFS_DEBUG");
569 	}
570 
571 	/*
572 	 * Are we just turning on all debugging?
573 	 */
574 	if (dprintf_find_string("on"))
575 		dprintf_print_all = 1;
576 
577 	if (dprintf_string != NULL)
578 		zfs_flags |= ZFS_DEBUG_DPRINTF;
579 }
580 
581 /*
582  * =========================================================================
583  * debug printfs
584  * =========================================================================
585  */
586 void
587 __dprintf(boolean_t dprint, const char *file, const char *func,
588     int line, const char *fmt, ...)
589 {
590 	/* Get rid of annoying "../common/" prefix to filename. */
591 	const char *newfile = zfs_basename(file);
592 
593 	va_list adx;
594 	if (dprint) {
595 		/* dprintf messages are printed immediately */
596 
597 		if (!dprintf_print_all &&
598 		    !dprintf_find_string(newfile) &&
599 		    !dprintf_find_string(func))
600 			return;
601 
602 		/* Print out just the function name if requested */
603 		flockfile(stdout);
604 		if (dprintf_find_string("pid"))
605 			(void) printf("%d ", getpid());
606 		if (dprintf_find_string("tid"))
607 			(void) printf("%ju ",
608 			    (uintmax_t)(uintptr_t)pthread_self());
609 		if (dprintf_find_string("cpu"))
610 			(void) printf("%u ", getcpuid());
611 		if (dprintf_find_string("time"))
612 			(void) printf("%llu ", gethrtime());
613 		if (dprintf_find_string("long"))
614 			(void) printf("%s, line %d: ", newfile, line);
615 		(void) printf("dprintf: %s: ", func);
616 		va_start(adx, fmt);
617 		(void) vprintf(fmt, adx);
618 		va_end(adx);
619 		funlockfile(stdout);
620 	} else {
621 		/* zfs_dbgmsg is logged for dumping later */
622 		size_t size;
623 		char *buf;
624 		int i;
625 
626 		size = 1024;
627 		buf = umem_alloc(size, UMEM_NOFAIL);
628 		i = snprintf(buf, size, "%s:%d:%s(): ", newfile, line, func);
629 
630 		if (i < size) {
631 			va_start(adx, fmt);
632 			(void) vsnprintf(buf + i, size - i, fmt, adx);
633 			va_end(adx);
634 		}
635 
636 		__zfs_dbgmsg(buf);
637 
638 		umem_free(buf, size);
639 	}
640 }
641 
642 /*
643  * =========================================================================
644  * cmn_err() and panic()
645  * =========================================================================
646  */
/*
 * Per-level text written before and after a cmn_err() message; both
 * tables are indexed by the ce level and have CE_IGNORE rows.
 */
static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" };
static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" };
649 
/*
 * Print "error: " followed by the formatted message and a newline to
 * stderr, then abort the process.  Never returns.
 */
__attribute__((noreturn)) void
vpanic(const char *fmt, va_list adx)
{
	(void) fprintf(stderr, "error: ");
	(void) vfprintf(stderr, fmt, adx);
	(void) fprintf(stderr, "\n");

	abort();	/* think of it as a "user-level crash dump" */
}
659 
/*
 * Variadic front end to vpanic().  Never returns.
 */
__attribute__((noreturn)) void
panic(const char *fmt, ...)
{
	va_list adx;

	va_start(adx, fmt);
	vpanic(fmt, adx);
	/* Not reached: vpanic() is declared noreturn. */
	va_end(adx);
}
669 
670 void
671 vcmn_err(int ce, const char *fmt, va_list adx)
672 {
673 	if (ce == CE_PANIC)
674 		vpanic(fmt, adx);
675 	if (ce != CE_NOTE) {	/* suppress noise in userland stress testing */
676 		(void) fprintf(stderr, "%s", ce_prefix[ce]);
677 		(void) vfprintf(stderr, fmt, adx);
678 		(void) fprintf(stderr, "%s", ce_suffix[ce]);
679 	}
680 }
681 
/* Variadic front end to vcmn_err(). */
void
cmn_err(int ce, const char *fmt, ...)
{
	va_list adx;

	va_start(adx, fmt);
	vcmn_err(ce, fmt, adx);
	va_end(adx);
}
691 
692 /*
693  * =========================================================================
694  * misc routines
695  * =========================================================================
696  */
697 
698 void
699 delay(clock_t ticks)
700 {
701 	(void) poll(0, 0, ticks * (1000 / hz));
702 }
703 
704 /*
705  * Find highest one bit set.
706  * Returns bit number + 1 of highest bit that is set, otherwise returns 0.
707  * The __builtin_clzll() function is supported by both GCC and Clang.
708  */
int
highbit64(uint64_t i)
{
	int leading_zeros;

	/* __builtin_clzll() is undefined for 0, so handle it up front. */
	if (i == 0)
		return (0);

	leading_zeros = __builtin_clzll(i);

	return (NBBY * sizeof (uint64_t) - leading_zeros);
}
717 
718 /*
719  * Find lowest one bit set.
720  * Returns bit number + 1 of lowest bit that is set, otherwise returns 0.
721  * The __builtin_ffsll() function is supported by both GCC and Clang.
722  */
int
lowbit64(uint64_t i)
{
	return (i == 0 ? 0 : __builtin_ffsll(i));
}
731 
/* Device paths opened by random_init(). */
const char *random_path = "/dev/random";
const char *urandom_path = "/dev/urandom";
/* Descriptors for the two devices; -1 while not open. */
static int random_fd = -1, urandom_fd = -1;
735 
/*
 * Open both random devices read-only with close-on-exec.  Failure to
 * open either one trips a VERIFY.
 */
void
random_init(void)
{
	VERIFY((random_fd = open(random_path, O_RDONLY | O_CLOEXEC)) != -1);
	VERIFY((urandom_fd = open(urandom_path, O_RDONLY | O_CLOEXEC)) != -1);
}
742 
743 void
744 random_fini(void)
745 {
746 	close(random_fd);
747 	close(urandom_fd);
748 
749 	random_fd = -1;
750 	urandom_fd = -1;
751 }
752 
/*
 * Fill ptr[0..len) with bytes read from fd, looping over short reads.
 *
 * A read interrupted by a signal (EINTR) is retried.  Any other read
 * failure is returned to the caller rather than being checked only by a
 * debug-build assertion; an unexpected end-of-file is reported as EIO so
 * the loop cannot spin forever.
 *
 * Returns 0 on success, errno (or EIO on EOF) on failure.
 */
static int
random_get_bytes_common(uint8_t *ptr, size_t len, int fd)
{
	size_t resid = len;

	ASSERT(fd != -1);

	while (resid != 0) {
		ssize_t bytes = read(fd, ptr, resid);

		if (bytes < 0) {
			if (errno == EINTR)
				continue;
			return (errno);
		}

		/* EOF would otherwise make no progress and loop forever. */
		if (bytes == 0)
			return (EIO);

		ptr += bytes;
		resid -= bytes;
	}

	return (0);
}
770 
/* Fill ptr with len bytes read from the random_path device. */
int
random_get_bytes(uint8_t *ptr, size_t len)
{
	return (random_get_bytes_common(ptr, len, random_fd));
}
776 
/* Fill ptr with len bytes read from the urandom_path device. */
int
random_get_pseudo_bytes(uint8_t *ptr, size_t len)
{
	return (random_get_bytes_common(ptr, len, urandom_fd));
}
782 
/*
 * Convert str to an unsigned long long with strtoull().
 *
 * str    - string to convert
 * nptr   - if non-NULL, receives a pointer to the first unconverted
 *          character
 * base   - conversion base, as for strtoull()
 * result - receives the converted value
 *
 * Returns 0 on success, otherwise the errno set by strtoull().  errno is
 * cleared before the call and reported whatever the converted value is,
 * so an out-of-range input (where strtoull() returns ULLONG_MAX) yields
 * ERANGE instead of being reported as success.
 */
int
ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result)
{
	errno = 0;
	*result = strtoull(str, nptr, base);

	return (errno);
}
792 
/* Return the host identification captured in hw_utsname. */
utsname_t *
utsname(void)
{
	return (&hw_utsname);
}
798 
799 /*
800  * =========================================================================
801  * kernel emulation setup & teardown
802  * =========================================================================
803  */
/*
 * umem no-fail callback: report the allocation failure on stderr and
 * abort.  The return statement is never reached.
 */
static int
umem_out_of_memory(void)
{
	static const char errmsg[] = "out of memory -- generating core dump\n";

	(void) fputs(errmsg, stderr);
	abort();
	return (0);
}
813 
/*
 * Bring up the userland emulation of the kernel environment.
 *
 * mode - spa mode flags.  The system hostid is fetched only when
 *        SPA_MODE_WRITE is set; otherwise hostid is 0.
 *
 * Subsystems are initialized in the order below; kernel_fini() tears
 * them down in reverse.
 */
void
kernel_init(int mode)
{
	extern uint_t rrw_tsd_key;

	/* Abort with a message instead of returning NULL on no-fail allocs. */
	umem_nofail_callback(umem_out_of_memory);

	physmem = sysconf(_SC_PHYS_PAGES);

	dprintf("physmem = %llu pages (%.2f GB)\n", (u_longlong_t)physmem,
	    (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));

	hostid = (mode & SPA_MODE_WRITE) ? get_system_hostid() : 0;

	random_init();

	VERIFY0(uname(&hw_utsname));

	system_taskq_init();
	icp_init();

	zstd_init();

	spa_init((spa_mode_t)mode);

	fletcher_4_init();

	tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
}
843 
/*
 * Tear down the subsystems brought up by kernel_init(), in the reverse
 * of their initialization order.
 */
void
kernel_fini(void)
{
	fletcher_4_fini();
	spa_fini();

	zstd_fini();

	icp_fini();
	system_taskq_fini();

	random_fini();
}
857 
/* Userland stub: the credential is ignored and uid 0 is reported. */
uid_t
crgetuid(cred_t *cr)
{
	(void) cr;
	return (0);
}
864 
/* Userland stub: the credential is ignored and real uid 0 is reported. */
uid_t
crgetruid(cred_t *cr)
{
	(void) cr;
	return (0);
}
871 
/* Userland stub: the credential is ignored and gid 0 is reported. */
gid_t
crgetgid(cred_t *cr)
{
	(void) cr;
	return (0);
}
878 
/* Userland stub: the credential is ignored and zero groups are reported. */
int
crgetngroups(cred_t *cr)
{
	(void) cr;
	return (0);
}
885 
/* Userland stub: the credential is ignored and no group list is returned. */
gid_t *
crgetgroups(cred_t *cr)
{
	(void) cr;
	return (NULL);
}
892 
893 int
894 zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
895 {
896 	(void) name, (void) cr;
897 	return (0);
898 }
899 
900 int
901 zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
902 {
903 	(void) from, (void) to, (void) cr;
904 	return (0);
905 }
906 
907 int
908 zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
909 {
910 	(void) name, (void) cr;
911 	return (0);
912 }
913 
/* Userland stub: the credential is ignored; always returns 0. */
int
secpolicy_zfs(const cred_t *cr)
{
	(void) cr;
	return (0);
}
920 
921 int
922 secpolicy_zfs_proc(const cred_t *cr, proc_t *proc)
923 {
924 	(void) cr, (void) proc;
925 	return (0);
926 }
927 
928 ksiddomain_t *
929 ksid_lookupdomain(const char *dom)
930 {
931 	ksiddomain_t *kd;
932 
933 	kd = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL);
934 	kd->kd_name = spa_strdup(dom);
935 	return (kd);
936 }
937 
938 void
939 ksiddomain_rele(ksiddomain_t *ksid)
940 {
941 	spa_strfree(ksid->kd_name);
942 	umem_free(ksid, sizeof (ksiddomain_t));
943 }
944 
/*
 * Format into a newly allocated string using vasprintf().  The caller's
 * va_list is left untouched; a copy is consumed instead.  Allocation or
 * formatting failure trips a VERIFY.
 */
char *
kmem_vasprintf(const char *fmt, va_list adx)
{
	char *buf = NULL;
	va_list adx_copy;

	va_copy(adx_copy, adx);
	VERIFY(vasprintf(&buf, fmt, adx_copy) != -1);
	va_end(adx_copy);

	return (buf);
}
957 
/*
 * Variadic counterpart of kmem_vasprintf(): format into a newly
 * allocated string.  Allocation or formatting failure trips a VERIFY.
 */
char *
kmem_asprintf(const char *fmt, ...)
{
	char *buf = NULL;
	va_list adx;

	va_start(adx, fmt);
	VERIFY(vasprintf(&buf, fmt, adx) != -1);
	va_end(adx);

	return (buf);
}
970 
/*
 * snprintf() will return the number of characters that it would have
 * printed whenever it is limited by value of the size variable, rather than
 * the number of characters that it did print. This can cause misbehavior on
 * subsequent uses of the return value, so we define a safe version that will
 * return the number of characters actually printed, minus the terminating
 * NUL character.  Subsequent use of this by the safe string functions is
 * safe whether it is snprintf(), strlcat() or strlcpy().
 *
 * If the underlying vsnprintf() reports an error, str is set to the empty
 * string and 0 is returned, so the result is always a valid length for
 * what str holds.
 */
int
kmem_scnprintf(char *restrict str, size_t size, const char *restrict fmt, ...)
{
	int n;
	va_list ap;

	/* Make the 0 case a no-op so that we do not return -1 */
	if (size == 0)
		return (0);

	va_start(ap, fmt);
	n = vsnprintf(str, size, fmt, ap);
	va_end(ap);

	/*
	 * A negative result must not reach the unsigned comparison below,
	 * where it would be treated as a huge length and become size - 1.
	 */
	if (n < 0) {
		str[0] = '\0';
		return (0);
	}

	if ((size_t)n >= size)
		n = size - 1;

	return (n);
}
998 }
999 
/*
 * Userland stub: the descriptor is ignored, *minorp is set to 0 and no
 * file is returned.
 */
zfs_file_t *
zfs_onexit_fd_hold(int fd, minor_t *minorp)
{
	(void) fd;
	*minorp = 0;
	return (NULL);
}
1007 
/* Userland stub: nothing was held, so there is nothing to release. */
void
zfs_onexit_fd_rele(zfs_file_t *fp)
{
	(void) fp;
}
1013 
1014 int
1015 zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
1016     uintptr_t *action_handle)
1017 {
1018 	(void) minor, (void) func, (void) data, (void) action_handle;
1019 	return (0);
1020 }
1021 
/* Userland stub: returns a zero cookie. */
fstrans_cookie_t
spl_fstrans_mark(void)
{
	return ((fstrans_cookie_t)0);
}
1027 
/* Userland stub: the cookie is ignored. */
void
spl_fstrans_unmark(fstrans_cookie_t cookie)
{
	(void) cookie;
}
1033 
/* Userland stub: always returns 0. */
int
__spl_pf_fstrans_check(void)
{
	return (0);
}
1039 
/* Userland stub: always returns 0. */
int
kmem_cache_reap_active(void)
{
	return (0);
}
1045 
/* Userland stub: no minor node is created. */
void
zvol_create_minor(const char *name)
{
	(void) name;
}
1051 
/* Userland stub: no minor nodes are created. */
void
zvol_create_minors_recursive(const char *name)
{
	(void) name;
}
1057 
1058 void
1059 zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
1060 {
1061 	(void) spa, (void) name, (void) async;
1062 }
1063 
1064 void
1065 zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname,
1066     boolean_t async)
1067 {
1068 	(void) spa, (void) oldname, (void) newname, (void) async;
1069 }
1070 
1071 /*
1072  * Open file
1073  *
1074  * path - fully qualified path to file
1075  * flags - file attributes O_READ / O_WRITE / O_EXCL
1076  * fpp - pointer to return file pointer
1077  *
1078  * Returns 0 on success underlying error on failure.
1079  */
1080 int
1081 zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp)
1082 {
1083 	int fd = -1;
1084 	int dump_fd = -1;
1085 	int err;
1086 	int old_umask = 0;
1087 	zfs_file_t *fp;
1088 	struct stat64 st;
1089 
1090 	if (!(flags & O_CREAT) && stat64(path, &st) == -1)
1091 		return (errno);
1092 
1093 	if (!(flags & O_CREAT) && S_ISBLK(st.st_mode))
1094 		flags |= O_DIRECT;
1095 
1096 	if (flags & O_CREAT)
1097 		old_umask = umask(0);
1098 
1099 	fd = open64(path, flags, mode);
1100 	if (fd == -1)
1101 		return (errno);
1102 
1103 	if (flags & O_CREAT)
1104 		(void) umask(old_umask);
1105 
1106 	if (vn_dumpdir != NULL) {
1107 		char *dumppath = umem_zalloc(MAXPATHLEN, UMEM_NOFAIL);
1108 		const char *inpath = zfs_basename(path);
1109 
1110 		(void) snprintf(dumppath, MAXPATHLEN,
1111 		    "%s/%s", vn_dumpdir, inpath);
1112 		dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666);
1113 		umem_free(dumppath, MAXPATHLEN);
1114 		if (dump_fd == -1) {
1115 			err = errno;
1116 			close(fd);
1117 			return (err);
1118 		}
1119 	} else {
1120 		dump_fd = -1;
1121 	}
1122 
1123 	(void) fcntl(fd, F_SETFD, FD_CLOEXEC);
1124 
1125 	fp = umem_zalloc(sizeof (zfs_file_t), UMEM_NOFAIL);
1126 	fp->f_fd = fd;
1127 	fp->f_dump_fd = dump_fd;
1128 	*fpp = fp;
1129 
1130 	return (0);
1131 }
1132 
/*
 * Close the file and, if one was opened, its dump file, then free the
 * zfs_file_t.  fp must not be used afterwards.
 */
void
zfs_file_close(zfs_file_t *fp)
{
	close(fp->f_fd);
	if (fp->f_dump_fd != -1)
		close(fp->f_dump_fd);

	umem_free(fp, sizeof (zfs_file_t));
}
1142 
1143 /*
1144  * Stateful write - use os internal file pointer to determine where to
1145  * write and update on successful completion.
1146  *
1147  * fp -  pointer to file (pipe, socket, etc) to write to
1148  * buf - buffer to write
1149  * count - # of bytes to write
1150  * resid -  pointer to count of unwritten bytes  (if short write)
1151  *
1152  * Returns 0 on success errno on failure.
1153  */
1154 int
1155 zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
1156 {
1157 	ssize_t rc;
1158 
1159 	rc = write(fp->f_fd, buf, count);
1160 	if (rc < 0)
1161 		return (errno);
1162 
1163 	if (resid) {
1164 		*resid = count - rc;
1165 	} else if (rc != count) {
1166 		return (EIO);
1167 	}
1168 
1169 	return (0);
1170 }
1171 
/*
 * Stateless write - os internal file pointer is not updated.
 *
 * fp -  pointer to file (pipe, socket, etc) to write to
 * buf - buffer to write
 * count - # of bytes to write
 * pos - file offset to write to (only valid for seekable types)
 * resid -  pointer to count of unwritten bytes
 *
 * Returns 0 on success errno on failure.  When resid is NULL, a short
 * write is reported as EIO.
 */
int
zfs_file_pwrite(zfs_file_t *fp, const void *buf,
    size_t count, loff_t pos, ssize_t *resid)
{
	ssize_t rc, split, done;
	int sectors;

	/*
	 * To simulate partial disk writes, we split writes into two
	 * system calls so that the process can be killed in between.
	 * This is used by ztest to simulate realistic failure modes.
	 */
	sectors = count >> SPA_MINBLOCKSHIFT;
	/* Random split point, in whole sectors; 0 if under one sector. */
	split = (sectors > 0 ? rand() % sectors : 0) << SPA_MINBLOCKSHIFT;
	rc = pwrite64(fp->f_fd, buf, split, pos);
	if (rc != -1) {
		/*
		 * done is assigned only on this path; if the first write
		 * failed, the rc < 0 check below returns before done is read.
		 */
		done = rc;
		rc = pwrite64(fp->f_fd, (char *)buf + split,
		    count - split, pos + split);
	}
#ifdef __linux__
	if (rc == -1 && errno == EINVAL) {
		/*
		 * Under Linux, this most likely means an alignment issue
		 * (memory or disk) due to O_DIRECT, so we abort() in order
		 * to catch the offender.
		 */
		abort();
	}
#endif

	if (rc < 0)
		return (errno);

	/* Total bytes written across both halves. */
	done += rc;

	if (resid) {
		*resid = count - done;
	} else if (done != count) {
		return (EIO);
	}

	return (0);
}
1227 
1228 /*
1229  * Stateful read - use os internal file pointer to determine where to
1230  * read and update on successful completion.
1231  *
1232  * fp -  pointer to file (pipe, socket, etc) to read from
1233  * buf - buffer to write
1234  * count - # of bytes to read
1235  * resid -  pointer to count of unread bytes (if short read)
1236  *
1237  * Returns 0 on success errno on failure.
1238  */
1239 int
1240 zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid)
1241 {
1242 	int rc;
1243 
1244 	rc = read(fp->f_fd, buf, count);
1245 	if (rc < 0)
1246 		return (errno);
1247 
1248 	if (resid) {
1249 		*resid = count - rc;
1250 	} else if (rc != count) {
1251 		return (EIO);
1252 	}
1253 
1254 	return (0);
1255 }
1256 
/*
 * Stateless read - os internal file pointer is not updated.
 *
 * fp -  pointer to file (pipe, socket, etc) to read from
 * buf - buffer to read into
 * count - # of bytes to read
 * off - file offset to read from (only valid for seekable types)
 * resid -  pointer to count of unread bytes (if short read)
 *
 * If the file has a dump descriptor, the bytes read are also written to
 * it at the same offset.
 *
 * Returns 0 on success errno on failure.  When resid is NULL, a short
 * read is reported as EIO.
 */
int
zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off,
    ssize_t *resid)
{
	ssize_t rc;

	rc = pread64(fp->f_fd, buf, count, off);
	if (rc < 0) {
#ifdef __linux__
		/*
		 * Under Linux, this most likely means an alignment issue
		 * (memory or disk) due to O_DIRECT, so we abort() in order to
		 * catch the offender.
		 */
		if (errno == EINVAL)
			abort();
#endif
		return (errno);
	}

	if (fp->f_dump_fd != -1) {
		int status;

		/* Mirror exactly the bytes that were read into the dump file. */
		status = pwrite64(fp->f_dump_fd, buf, rc, off);
		ASSERT(status != -1);
	}

	if (resid) {
		*resid = count - rc;
	} else if (rc != count) {
		return (EIO);
	}

	return (0);
}
1303 
1304 /*
1305  * lseek - set / get file pointer
1306  *
1307  * fp -  pointer to file (pipe, socket, etc) to read from
1308  * offp - value to seek to, returns current value plus passed offset
1309  * whence - see man pages for standard lseek whence values
1310  *
1311  * Returns 0 on success errno on failure (ESPIPE for non seekable types)
1312  */
1313 int
1314 zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence)
1315 {
1316 	loff_t rc;
1317 
1318 	rc = lseek(fp->f_fd, *offp, whence);
1319 	if (rc < 0)
1320 		return (errno);
1321 
1322 	*offp = rc;
1323 
1324 	return (0);
1325 }
1326 
1327 /*
1328  * Get file attributes
1329  *
1330  * filp - file pointer
1331  * zfattr - pointer to file attr structure
1332  *
1333  * Currently only used for fetching size and file mode
1334  *
1335  * Returns 0 on success or error code of underlying getattr call on failure.
1336  */
1337 int
1338 zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr)
1339 {
1340 	struct stat64 st;
1341 
1342 	if (fstat64_blk(fp->f_fd, &st) == -1)
1343 		return (errno);
1344 
1345 	zfattr->zfa_size = st.st_size;
1346 	zfattr->zfa_mode = st.st_mode;
1347 
1348 	return (0);
1349 }
1350 
1351 /*
1352  * Sync file to disk
1353  *
1354  * filp - file pointer
1355  * flags - O_SYNC and or O_DSYNC
1356  *
1357  * Returns 0 on success or error code of underlying sync call on failure.
1358  */
1359 int
1360 zfs_file_fsync(zfs_file_t *fp, int flags)
1361 {
1362 	(void) flags;
1363 
1364 	if (fsync(fp->f_fd) < 0)
1365 		return (errno);
1366 
1367 	return (0);
1368 }
1369 
1370 /*
1371  * deallocate - zero and/or deallocate file storage
1372  *
1373  * fp - file pointer
1374  * offset - offset to start zeroing or deallocating
1375  * len - length to zero or deallocate
1376  */
1377 int
1378 zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len)
1379 {
1380 	int rc;
1381 #if defined(__linux__)
1382 	rc = fallocate(fp->f_fd,
1383 	    FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len);
1384 #elif defined(__FreeBSD__) && (__FreeBSD_version >= 1400029)
1385 	struct spacectl_range rqsr = {
1386 		.r_offset = offset,
1387 		.r_len = len,
1388 	};
1389 	rc = fspacectl(fp->f_fd, SPACECTL_DEALLOC, &rqsr, 0, &rqsr);
1390 #else
1391 	(void) fp, (void) offset, (void) len;
1392 	rc = EOPNOTSUPP;
1393 #endif
1394 	if (rc)
1395 		return (SET_ERROR(rc));
1396 	return (0);
1397 }
1398 
1399 /*
1400  * Request current file pointer offset
1401  *
1402  * fp - pointer to file
1403  *
1404  * Returns current file offset.
1405  */
1406 loff_t
1407 zfs_file_off(zfs_file_t *fp)
1408 {
1409 	return (lseek(fp->f_fd, SEEK_CUR, 0));
1410 }
1411 
/*
 * unlink file
 *
 * path - fully qualified file path
 *
 * Returns 0 on success errno on failure.
 *
 * OPTIONAL
 */
int
zfs_file_unlink(const char *path)
{
	/*
	 * remove() only reports that it failed; hand back errno so callers
	 * get an error number like the other zfs_file_* functions return.
	 */
	if (remove(path) != 0)
		return (errno);

	return (0);
}
1426 
1427 /*
1428  * Get reference to file pointer
1429  *
1430  * fd - input file descriptor
1431  *
1432  * Returns pointer to file struct or NULL.
1433  * Unsupported in user space.
1434  */
1435 zfs_file_t *
1436 zfs_file_get(int fd)
1437 {
1438 	(void) fd;
1439 	abort();
1440 	return (NULL);
1441 }
1442 /*
1443  * Drop reference to file pointer
1444  *
1445  * fp - pointer to file struct
1446  *
1447  * Unsupported in user space.
1448  */
1449 void
1450 zfs_file_put(zfs_file_t *fp)
1451 {
1452 	abort();
1453 	(void) fp;
1454 }
1455 
/*
 * Userland stub: both names are accepted and ignored.
 */
void
zfsvfs_update_fromname(const char *oldname, const char *newname)
{
	(void) oldname;
	(void) newname;
}
1461 
1462 void
1463 spa_import_os(spa_t *spa)
1464 {
1465 	(void) spa;
1466 }
1467 
1468 void
1469 spa_export_os(spa_t *spa)
1470 {
1471 	(void) spa;
1472 }
1473 
1474 void
1475 spa_activate_os(spa_t *spa)
1476 {
1477 	(void) spa;
1478 }
1479 
1480 void
1481 spa_deactivate_os(spa_t *spa)
1482 {
1483 	(void) spa;
1484 }
1485