1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
24 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
25 */
26
27 #include <assert.h>
28 #include <fcntl.h>
29 #include <libgen.h>
30 #include <poll.h>
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <limits.h>
35 #include <libzutil.h>
36 #include <sys/crypto/icp.h>
37 #include <sys/processor.h>
38 #include <sys/rrwlock.h>
39 #include <sys/spa.h>
40 #include <sys/stat.h>
41 #include <sys/systeminfo.h>
42 #include <sys/time.h>
43 #include <sys/utsname.h>
44 #include <sys/zfs_context.h>
45 #include <sys/zfs_onexit.h>
46 #include <sys/zfs_vfsops.h>
47 #include <sys/zstd/zstd.h>
48 #include <sys/zvol.h>
49 #include <zfs_fletcher.h>
50 #include <zlib.h>
51
52 /*
53 * Emulation of kernel services in userland.
54 */
55
/* Number of physical memory pages; assigned in kernel_init(). */
uint64_t physmem;
/* Emulated hostid; assigned in kernel_init(), returned by zone_get_hostid(). */
uint32_t hostid;
/* System identification; filled in by uname() in kernel_init(). */
struct utsname hw_utsname;

/* If set, all blocks read will be copied to the specified directory. */
char *vn_dumpdir = NULL;

/* this only exists to have its address taken */
struct proc p0;
65
66 /*
67 * =========================================================================
68 * threads
69 * =========================================================================
70 *
71 * TS_STACK_MIN is dictated by the minimum allowed pthread stack size. While
72 * TS_STACK_MAX is somewhat arbitrary, it was selected to be large enough for
73 * the expected stack depth while small enough to avoid exhausting address
74 * space with high thread counts.
75 */
/* Smallest stack size ever requested: PTHREAD_STACK_MIN, but at least 32K. */
#define	TS_STACK_MIN	MAX(PTHREAD_STACK_MIN, 32768)
/* Stack size used when neither the caller nor ZFS_STACK_SIZE supplies one. */
#define	TS_STACK_MAX	(256 * 1024)
78
/*
 * Record handed to a newly created pthread: the function to run and its
 * argument.  Allocated by zk_thread_create() and freed by the
 * zk_thread_wrapper() start routine.
 */
struct zk_thread_wrapper {
	void (*func)(void *);	/* function the new thread runs */
	void *arg;		/* argument passed to func */
};
83
84 static void *
zk_thread_wrapper(void * arg)85 zk_thread_wrapper(void *arg)
86 {
87 struct zk_thread_wrapper ztw;
88 memcpy(&ztw, arg, sizeof (ztw));
89 free(arg);
90 ztw.func(ztw.arg);
91 return (NULL);
92 }
93
94 kthread_t *
zk_thread_create(const char * name,void (* func)(void *),void * arg,size_t stksize,int state)95 zk_thread_create(const char *name, void (*func)(void *), void *arg,
96 size_t stksize, int state)
97 {
98 pthread_attr_t attr;
99 pthread_t tid;
100 char *stkstr;
101 struct zk_thread_wrapper *ztw;
102 int detachstate = PTHREAD_CREATE_DETACHED;
103
104 VERIFY0(pthread_attr_init(&attr));
105
106 if (state & TS_JOINABLE)
107 detachstate = PTHREAD_CREATE_JOINABLE;
108
109 VERIFY0(pthread_attr_setdetachstate(&attr, detachstate));
110
111 /*
112 * We allow the default stack size in user space to be specified by
113 * setting the ZFS_STACK_SIZE environment variable. This allows us
114 * the convenience of observing and debugging stack overruns in
115 * user space. Explicitly specified stack sizes will be honored.
116 * The usage of ZFS_STACK_SIZE is discussed further in the
117 * ENVIRONMENT VARIABLES sections of the ztest(1) man page.
118 */
119 if (stksize == 0) {
120 stkstr = getenv("ZFS_STACK_SIZE");
121
122 if (stkstr == NULL)
123 stksize = TS_STACK_MAX;
124 else
125 stksize = MAX(atoi(stkstr), TS_STACK_MIN);
126 }
127
128 VERIFY3S(stksize, >, 0);
129 stksize = P2ROUNDUP(MAX(stksize, TS_STACK_MIN), PAGESIZE);
130
131 /*
132 * If this ever fails, it may be because the stack size is not a
133 * multiple of system page size.
134 */
135 VERIFY0(pthread_attr_setstacksize(&attr, stksize));
136 VERIFY0(pthread_attr_setguardsize(&attr, PAGESIZE));
137
138 VERIFY(ztw = malloc(sizeof (*ztw)));
139 ztw->func = func;
140 ztw->arg = arg;
141 VERIFY0(pthread_create(&tid, &attr, zk_thread_wrapper, ztw));
142 VERIFY0(pthread_attr_destroy(&attr));
143
144 pthread_setname_np(tid, name);
145
146 return ((void *)(uintptr_t)tid);
147 }
148
149 /*
150 * =========================================================================
151 * kstats
152 * =========================================================================
153 */
154 kstat_t *
kstat_create(const char * module,int instance,const char * name,const char * class,uchar_t type,ulong_t ndata,uchar_t ks_flag)155 kstat_create(const char *module, int instance, const char *name,
156 const char *class, uchar_t type, ulong_t ndata, uchar_t ks_flag)
157 {
158 (void) module, (void) instance, (void) name, (void) class, (void) type,
159 (void) ndata, (void) ks_flag;
160 return (NULL);
161 }
162
163 void
kstat_install(kstat_t * ksp)164 kstat_install(kstat_t *ksp)
165 {
166 (void) ksp;
167 }
168
169 void
kstat_delete(kstat_t * ksp)170 kstat_delete(kstat_t *ksp)
171 {
172 (void) ksp;
173 }
174
175 void
kstat_set_raw_ops(kstat_t * ksp,int (* headers)(char * buf,size_t size),int (* data)(char * buf,size_t size,void * data),void * (* addr)(kstat_t * ksp,loff_t index))176 kstat_set_raw_ops(kstat_t *ksp,
177 int (*headers)(char *buf, size_t size),
178 int (*data)(char *buf, size_t size, void *data),
179 void *(*addr)(kstat_t *ksp, loff_t index))
180 {
181 (void) ksp, (void) headers, (void) data, (void) addr;
182 }
183
184 /*
185 * =========================================================================
186 * mutexes
187 * =========================================================================
188 */
189
190 void
mutex_init(kmutex_t * mp,char * name,int type,void * cookie)191 mutex_init(kmutex_t *mp, char *name, int type, void *cookie)
192 {
193 (void) name, (void) type, (void) cookie;
194 VERIFY0(pthread_mutex_init(&mp->m_lock, NULL));
195 memset(&mp->m_owner, 0, sizeof (pthread_t));
196 }
197
198 void
mutex_destroy(kmutex_t * mp)199 mutex_destroy(kmutex_t *mp)
200 {
201 VERIFY0(pthread_mutex_destroy(&mp->m_lock));
202 }
203
204 void
mutex_enter(kmutex_t * mp)205 mutex_enter(kmutex_t *mp)
206 {
207 VERIFY0(pthread_mutex_lock(&mp->m_lock));
208 mp->m_owner = pthread_self();
209 }
210
211 int
mutex_enter_check_return(kmutex_t * mp)212 mutex_enter_check_return(kmutex_t *mp)
213 {
214 int error = pthread_mutex_lock(&mp->m_lock);
215 if (error == 0)
216 mp->m_owner = pthread_self();
217 return (error);
218 }
219
220 int
mutex_tryenter(kmutex_t * mp)221 mutex_tryenter(kmutex_t *mp)
222 {
223 int error = pthread_mutex_trylock(&mp->m_lock);
224 if (error == 0) {
225 mp->m_owner = pthread_self();
226 return (1);
227 } else {
228 VERIFY3S(error, ==, EBUSY);
229 return (0);
230 }
231 }
232
233 void
mutex_exit(kmutex_t * mp)234 mutex_exit(kmutex_t *mp)
235 {
236 memset(&mp->m_owner, 0, sizeof (pthread_t));
237 VERIFY0(pthread_mutex_unlock(&mp->m_lock));
238 }
239
240 /*
241 * =========================================================================
242 * rwlocks
243 * =========================================================================
244 */
245
246 void
rw_init(krwlock_t * rwlp,char * name,int type,void * arg)247 rw_init(krwlock_t *rwlp, char *name, int type, void *arg)
248 {
249 (void) name, (void) type, (void) arg;
250 VERIFY0(pthread_rwlock_init(&rwlp->rw_lock, NULL));
251 rwlp->rw_readers = 0;
252 rwlp->rw_owner = 0;
253 }
254
255 void
rw_destroy(krwlock_t * rwlp)256 rw_destroy(krwlock_t *rwlp)
257 {
258 VERIFY0(pthread_rwlock_destroy(&rwlp->rw_lock));
259 }
260
261 void
rw_enter(krwlock_t * rwlp,krw_t rw)262 rw_enter(krwlock_t *rwlp, krw_t rw)
263 {
264 if (rw == RW_READER) {
265 VERIFY0(pthread_rwlock_rdlock(&rwlp->rw_lock));
266 atomic_inc_uint(&rwlp->rw_readers);
267 } else {
268 VERIFY0(pthread_rwlock_wrlock(&rwlp->rw_lock));
269 rwlp->rw_owner = pthread_self();
270 }
271 }
272
273 void
rw_exit(krwlock_t * rwlp)274 rw_exit(krwlock_t *rwlp)
275 {
276 if (RW_READ_HELD(rwlp))
277 atomic_dec_uint(&rwlp->rw_readers);
278 else
279 rwlp->rw_owner = 0;
280
281 VERIFY0(pthread_rwlock_unlock(&rwlp->rw_lock));
282 }
283
284 int
rw_tryenter(krwlock_t * rwlp,krw_t rw)285 rw_tryenter(krwlock_t *rwlp, krw_t rw)
286 {
287 int error;
288
289 if (rw == RW_READER)
290 error = pthread_rwlock_tryrdlock(&rwlp->rw_lock);
291 else
292 error = pthread_rwlock_trywrlock(&rwlp->rw_lock);
293
294 if (error == 0) {
295 if (rw == RW_READER)
296 atomic_inc_uint(&rwlp->rw_readers);
297 else
298 rwlp->rw_owner = pthread_self();
299
300 return (1);
301 }
302
303 VERIFY3S(error, ==, EBUSY);
304
305 return (0);
306 }
307
308 uint32_t
zone_get_hostid(void * zonep)309 zone_get_hostid(void *zonep)
310 {
311 /*
312 * We're emulating the system's hostid in userland.
313 */
314 (void) zonep;
315 return (hostid);
316 }
317
318 int
rw_tryupgrade(krwlock_t * rwlp)319 rw_tryupgrade(krwlock_t *rwlp)
320 {
321 (void) rwlp;
322 return (0);
323 }
324
325 /*
326 * =========================================================================
327 * condition variables
328 * =========================================================================
329 */
330
331 void
cv_init(kcondvar_t * cv,char * name,int type,void * arg)332 cv_init(kcondvar_t *cv, char *name, int type, void *arg)
333 {
334 (void) name, (void) type, (void) arg;
335 VERIFY0(pthread_cond_init(cv, NULL));
336 }
337
338 void
cv_destroy(kcondvar_t * cv)339 cv_destroy(kcondvar_t *cv)
340 {
341 VERIFY0(pthread_cond_destroy(cv));
342 }
343
344 void
cv_wait(kcondvar_t * cv,kmutex_t * mp)345 cv_wait(kcondvar_t *cv, kmutex_t *mp)
346 {
347 memset(&mp->m_owner, 0, sizeof (pthread_t));
348 VERIFY0(pthread_cond_wait(cv, &mp->m_lock));
349 mp->m_owner = pthread_self();
350 }
351
352 int
cv_wait_sig(kcondvar_t * cv,kmutex_t * mp)353 cv_wait_sig(kcondvar_t *cv, kmutex_t *mp)
354 {
355 cv_wait(cv, mp);
356 return (1);
357 }
358
359 int
cv_timedwait(kcondvar_t * cv,kmutex_t * mp,clock_t abstime)360 cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
361 {
362 int error;
363 struct timeval tv;
364 struct timespec ts;
365 clock_t delta;
366
367 delta = abstime - ddi_get_lbolt();
368 if (delta <= 0)
369 return (-1);
370
371 VERIFY(gettimeofday(&tv, NULL) == 0);
372
373 ts.tv_sec = tv.tv_sec + delta / hz;
374 ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC + (delta % hz) * (NANOSEC / hz);
375 if (ts.tv_nsec >= NANOSEC) {
376 ts.tv_sec++;
377 ts.tv_nsec -= NANOSEC;
378 }
379
380 memset(&mp->m_owner, 0, sizeof (pthread_t));
381 error = pthread_cond_timedwait(cv, &mp->m_lock, &ts);
382 mp->m_owner = pthread_self();
383
384 if (error == ETIMEDOUT)
385 return (-1);
386
387 VERIFY0(error);
388
389 return (1);
390 }
391
392 int
cv_timedwait_hires(kcondvar_t * cv,kmutex_t * mp,hrtime_t tim,hrtime_t res,int flag)393 cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res,
394 int flag)
395 {
396 (void) res;
397 int error;
398 struct timeval tv;
399 struct timespec ts;
400 hrtime_t delta;
401
402 ASSERT(flag == 0 || flag == CALLOUT_FLAG_ABSOLUTE);
403
404 delta = tim;
405 if (flag & CALLOUT_FLAG_ABSOLUTE)
406 delta -= gethrtime();
407
408 if (delta <= 0)
409 return (-1);
410
411 VERIFY0(gettimeofday(&tv, NULL));
412
413 ts.tv_sec = tv.tv_sec + delta / NANOSEC;
414 ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC + (delta % NANOSEC);
415 if (ts.tv_nsec >= NANOSEC) {
416 ts.tv_sec++;
417 ts.tv_nsec -= NANOSEC;
418 }
419
420 memset(&mp->m_owner, 0, sizeof (pthread_t));
421 error = pthread_cond_timedwait(cv, &mp->m_lock, &ts);
422 mp->m_owner = pthread_self();
423
424 if (error == ETIMEDOUT)
425 return (-1);
426
427 VERIFY0(error);
428
429 return (1);
430 }
431
432 void
cv_signal(kcondvar_t * cv)433 cv_signal(kcondvar_t *cv)
434 {
435 VERIFY0(pthread_cond_signal(cv));
436 }
437
438 void
cv_broadcast(kcondvar_t * cv)439 cv_broadcast(kcondvar_t *cv)
440 {
441 VERIFY0(pthread_cond_broadcast(cv));
442 }
443
444 /*
445 * =========================================================================
446 * procfs list
447 * =========================================================================
448 */
449
/* seq_file stub: the output is discarded. */
void
seq_printf(struct seq_file *m, const char *fmt, ...)
{
	(void) m;
	(void) fmt;
}
455
456 void
procfs_list_install(const char * module,const char * submodule,const char * name,mode_t mode,procfs_list_t * procfs_list,int (* show)(struct seq_file * f,void * p),int (* show_header)(struct seq_file * f),int (* clear)(procfs_list_t * procfs_list),size_t procfs_list_node_off)457 procfs_list_install(const char *module,
458 const char *submodule,
459 const char *name,
460 mode_t mode,
461 procfs_list_t *procfs_list,
462 int (*show)(struct seq_file *f, void *p),
463 int (*show_header)(struct seq_file *f),
464 int (*clear)(procfs_list_t *procfs_list),
465 size_t procfs_list_node_off)
466 {
467 (void) module, (void) submodule, (void) name, (void) mode, (void) show,
468 (void) show_header, (void) clear;
469 mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL);
470 list_create(&procfs_list->pl_list,
471 procfs_list_node_off + sizeof (procfs_list_node_t),
472 procfs_list_node_off + offsetof(procfs_list_node_t, pln_link));
473 procfs_list->pl_next_id = 1;
474 procfs_list->pl_node_offset = procfs_list_node_off;
475 }
476
477 void
procfs_list_uninstall(procfs_list_t * procfs_list)478 procfs_list_uninstall(procfs_list_t *procfs_list)
479 {
480 (void) procfs_list;
481 }
482
483 void
procfs_list_destroy(procfs_list_t * procfs_list)484 procfs_list_destroy(procfs_list_t *procfs_list)
485 {
486 ASSERT(list_is_empty(&procfs_list->pl_list));
487 list_destroy(&procfs_list->pl_list);
488 mutex_destroy(&procfs_list->pl_lock);
489 }
490
/*
 * Lvalue for the pln_id field of the procfs_list_node_t embedded in obj,
 * located pl_node_offset bytes from the start of obj.
 */
#define	NODE_ID(procfs_list, obj) \
		(((procfs_list_node_t *)(((char *)obj) + \
		(procfs_list)->pl_node_offset))->pln_id)
494
495 void
procfs_list_add(procfs_list_t * procfs_list,void * p)496 procfs_list_add(procfs_list_t *procfs_list, void *p)
497 {
498 ASSERT(MUTEX_HELD(&procfs_list->pl_lock));
499 NODE_ID(procfs_list, p) = procfs_list->pl_next_id++;
500 list_insert_tail(&procfs_list->pl_list, p);
501 }
502
503 /*
504 * =========================================================================
505 * vnode operations
506 * =========================================================================
507 */
508
509 /*
510 * =========================================================================
511 * Figure out which debugging statements to print
512 * =========================================================================
513 */
514
/* Comma-separated debug selector list; set by dprintf_setup(). */
static char *dprintf_string;
/* Nonzero when the selector list contains "on" (print every dprintf). */
static int dprintf_print_all;
517
518 int
dprintf_find_string(const char * string)519 dprintf_find_string(const char *string)
520 {
521 char *tmp_str = dprintf_string;
522 int len = strlen(string);
523
524 /*
525 * Find out if this is a string we want to print.
526 * String format: file1.c,function_name1,file2.c,file3.c
527 */
528
529 while (tmp_str != NULL) {
530 if (strncmp(tmp_str, string, len) == 0 &&
531 (tmp_str[len] == ',' || tmp_str[len] == '\0'))
532 return (1);
533 tmp_str = strchr(tmp_str, ',');
534 if (tmp_str != NULL)
535 tmp_str++; /* Get rid of , */
536 }
537 return (0);
538 }
539
540 void
dprintf_setup(int * argc,char ** argv)541 dprintf_setup(int *argc, char **argv)
542 {
543 int i, j;
544
545 /*
546 * Debugging can be specified two ways: by setting the
547 * environment variable ZFS_DEBUG, or by including a
548 * "debug=..." argument on the command line. The command
549 * line setting overrides the environment variable.
550 */
551
552 for (i = 1; i < *argc; i++) {
553 int len = strlen("debug=");
554 /* First look for a command line argument */
555 if (strncmp("debug=", argv[i], len) == 0) {
556 dprintf_string = argv[i] + len;
557 /* Remove from args */
558 for (j = i; j < *argc; j++)
559 argv[j] = argv[j+1];
560 argv[j] = NULL;
561 (*argc)--;
562 }
563 }
564
565 if (dprintf_string == NULL) {
566 /* Look for ZFS_DEBUG environment variable */
567 dprintf_string = getenv("ZFS_DEBUG");
568 }
569
570 /*
571 * Are we just turning on all debugging?
572 */
573 if (dprintf_find_string("on"))
574 dprintf_print_all = 1;
575
576 if (dprintf_string != NULL)
577 zfs_flags |= ZFS_DEBUG_DPRINTF;
578 }
579
580 /*
581 * =========================================================================
582 * debug printfs
583 * =========================================================================
584 */
585 void
__dprintf(boolean_t dprint,const char * file,const char * func,int line,const char * fmt,...)586 __dprintf(boolean_t dprint, const char *file, const char *func,
587 int line, const char *fmt, ...)
588 {
589 /* Get rid of annoying "../common/" prefix to filename. */
590 const char *newfile = zfs_basename(file);
591
592 va_list adx;
593 if (dprint) {
594 /* dprintf messages are printed immediately */
595
596 if (!dprintf_print_all &&
597 !dprintf_find_string(newfile) &&
598 !dprintf_find_string(func))
599 return;
600
601 /* Print out just the function name if requested */
602 flockfile(stdout);
603 if (dprintf_find_string("pid"))
604 (void) printf("%d ", getpid());
605 if (dprintf_find_string("tid"))
606 (void) printf("%ju ",
607 (uintmax_t)(uintptr_t)pthread_self());
608 if (dprintf_find_string("cpu"))
609 (void) printf("%u ", getcpuid());
610 if (dprintf_find_string("time"))
611 (void) printf("%llu ", gethrtime());
612 if (dprintf_find_string("long"))
613 (void) printf("%s, line %d: ", newfile, line);
614 (void) printf("dprintf: %s: ", func);
615 va_start(adx, fmt);
616 (void) vprintf(fmt, adx);
617 va_end(adx);
618 funlockfile(stdout);
619 } else {
620 /* zfs_dbgmsg is logged for dumping later */
621 size_t size;
622 char *buf;
623 int i;
624
625 size = 1024;
626 buf = umem_alloc(size, UMEM_NOFAIL);
627 i = snprintf(buf, size, "%s:%d:%s(): ", newfile, line, func);
628
629 if (i < size) {
630 va_start(adx, fmt);
631 (void) vsnprintf(buf + i, size - i, fmt, adx);
632 va_end(adx);
633 }
634
635 __zfs_dbgmsg(buf);
636
637 umem_free(buf, size);
638 }
639 }
640
641 /*
642 * =========================================================================
643 * cmn_err() and panic()
644 * =========================================================================
645 */
/*
 * Text printed before and after a cmn_err() message, indexed by the ce
 * level.  Both tables have CE_IGNORE rows, so valid indexes are
 * 0 .. CE_IGNORE - 1.
 */
static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" };
static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" };
648
/*
 * Print "error: " followed by the formatted message and a newline on
 * stderr, then abort().  Never returns.
 */
__attribute__((noreturn)) void
vpanic(const char *fmt, va_list adx)
{
	(void) fputs("error: ", stderr);
	(void) vfprintf(stderr, fmt, adx);
	(void) fputc('\n', stderr);

	abort();	/* think of it as a "user-level crash dump" */
}
658
/* Variadic front end to vpanic().  Never returns. */
__attribute__((noreturn)) void
panic(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	vpanic(fmt, ap);
	va_end(ap);
}
668
669 void
vcmn_err(int ce,const char * fmt,va_list adx)670 vcmn_err(int ce, const char *fmt, va_list adx)
671 {
672 if (ce == CE_PANIC)
673 vpanic(fmt, adx);
674 if (ce != CE_NOTE) { /* suppress noise in userland stress testing */
675 (void) fprintf(stderr, "%s", ce_prefix[ce]);
676 (void) vfprintf(stderr, fmt, adx);
677 (void) fprintf(stderr, "%s", ce_suffix[ce]);
678 }
679 }
680
/* Variadic front end to vcmn_err(). */
void
cmn_err(int ce, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	vcmn_err(ce, fmt, ap);
	va_end(ap);
}
690
691 /*
692 * =========================================================================
693 * misc routines
694 * =========================================================================
695 */
696
697 void
delay(clock_t ticks)698 delay(clock_t ticks)
699 {
700 (void) poll(0, 0, ticks * (1000 / hz));
701 }
702
/*
 * Find the highest set bit.
 * Returns the bit number + 1 of the most significant set bit, or 0 when no
 * bit is set.  __builtin_clzll() is provided by both GCC and Clang; it is
 * not called for 0.
 */
int
highbit64(uint64_t i)
{
	return (i != 0 ?
	    NBBY * sizeof (uint64_t) - __builtin_clzll(i) : 0);
}
716
/*
 * Find the lowest set bit.
 * Returns the bit number + 1 of the least significant set bit, or 0 when no
 * bit is set.  __builtin_ffsll() is provided by both GCC and Clang.
 */
int
lowbit64(uint64_t i)
{
	return (i != 0 ? __builtin_ffsll(i) : 0);
}
730
/* Device nodes opened by random_init(). */
const char *random_path = "/dev/random";
const char *urandom_path = "/dev/urandom";
/* Descriptors for the two devices above; -1 while not open. */
static int random_fd = -1, urandom_fd = -1;
734
735 void
random_init(void)736 random_init(void)
737 {
738 VERIFY((random_fd = open(random_path, O_RDONLY | O_CLOEXEC)) != -1);
739 VERIFY((urandom_fd = open(urandom_path, O_RDONLY | O_CLOEXEC)) != -1);
740 }
741
742 void
random_fini(void)743 random_fini(void)
744 {
745 close(random_fd);
746 close(urandom_fd);
747
748 random_fd = -1;
749 urandom_fd = -1;
750 }
751
752 static int
random_get_bytes_common(uint8_t * ptr,size_t len,int fd)753 random_get_bytes_common(uint8_t *ptr, size_t len, int fd)
754 {
755 size_t resid = len;
756 ssize_t bytes;
757
758 ASSERT(fd != -1);
759
760 while (resid != 0) {
761 bytes = read(fd, ptr, resid);
762 ASSERT3S(bytes, >=, 0);
763 ptr += bytes;
764 resid -= bytes;
765 }
766
767 return (0);
768 }
769
770 int
random_get_bytes(uint8_t * ptr,size_t len)771 random_get_bytes(uint8_t *ptr, size_t len)
772 {
773 return (random_get_bytes_common(ptr, len, random_fd));
774 }
775
776 int
random_get_pseudo_bytes(uint8_t * ptr,size_t len)777 random_get_pseudo_bytes(uint8_t *ptr, size_t len)
778 {
779 return (random_get_bytes_common(ptr, len, urandom_fd));
780 }
781
782 int
ddi_strtoull(const char * str,char ** nptr,int base,u_longlong_t * result)783 ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result)
784 {
785 errno = 0;
786 *result = strtoull(str, nptr, base);
787 if (*result == 0)
788 return (errno);
789 return (0);
790 }
791
792 utsname_t *
utsname(void)793 utsname(void)
794 {
795 return (&hw_utsname);
796 }
797
798 /*
799 * =========================================================================
800 * kernel emulation setup & teardown
801 * =========================================================================
802 */
/*
 * Callback registered with umem_nofail_callback() by kernel_init(): print
 * a message on stderr and abort().  The return statement is never reached.
 */
static int
umem_out_of_memory(void)
{
	const char errmsg[] = "out of memory -- generating core dump\n";

	(void) fputs(errmsg, stderr);
	abort();

	return (0);
}
812
/*
 * Bring up the userland emulation of the kernel environment.
 *
 * mode - SPA mode flags, passed on to spa_init().  The system hostid is
 *        only used when SPA_MODE_WRITE is set; otherwise hostid is 0.
 */
void
kernel_init(int mode)
{
	extern uint_t rrw_tsd_key;

	/* Register umem_out_of_memory() as the umem no-fail callback. */
	umem_nofail_callback(umem_out_of_memory);

	physmem = sysconf(_SC_PHYS_PAGES);

	dprintf("physmem = %llu pages (%.2f GB)\n", (u_longlong_t)physmem,
	    (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));

	hostid = (mode & SPA_MODE_WRITE) ? get_system_hostid() : 0;

	random_init();

	VERIFY0(uname(&hw_utsname));

	/* Subsystems are torn down in the reverse order by kernel_fini(). */
	system_taskq_init();
	icp_init();

	zstd_init();

	spa_init((spa_mode_t)mode);

	fletcher_4_init();

	tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
}
842
/*
 * Tear down the subsystems started by kernel_init(), in the reverse of the
 * order in which they were initialized.
 */
void
kernel_fini(void)
{
	fletcher_4_fini();
	spa_fini();

	zstd_fini();

	icp_fini();
	system_taskq_fini();

	random_fini();
}
856
857 uid_t
crgetuid(cred_t * cr)858 crgetuid(cred_t *cr)
859 {
860 (void) cr;
861 return (0);
862 }
863
864 uid_t
crgetruid(cred_t * cr)865 crgetruid(cred_t *cr)
866 {
867 (void) cr;
868 return (0);
869 }
870
871 gid_t
crgetgid(cred_t * cr)872 crgetgid(cred_t *cr)
873 {
874 (void) cr;
875 return (0);
876 }
877
878 int
crgetngroups(cred_t * cr)879 crgetngroups(cred_t *cr)
880 {
881 (void) cr;
882 return (0);
883 }
884
885 gid_t *
crgetgroups(cred_t * cr)886 crgetgroups(cred_t *cr)
887 {
888 (void) cr;
889 return (NULL);
890 }
891
892 int
zfs_secpolicy_snapshot_perms(const char * name,cred_t * cr)893 zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
894 {
895 (void) name, (void) cr;
896 return (0);
897 }
898
899 int
zfs_secpolicy_rename_perms(const char * from,const char * to,cred_t * cr)900 zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
901 {
902 (void) from, (void) to, (void) cr;
903 return (0);
904 }
905
906 int
zfs_secpolicy_destroy_perms(const char * name,cred_t * cr)907 zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
908 {
909 (void) name, (void) cr;
910 return (0);
911 }
912
913 int
secpolicy_zfs(const cred_t * cr)914 secpolicy_zfs(const cred_t *cr)
915 {
916 (void) cr;
917 return (0);
918 }
919
920 int
secpolicy_zfs_proc(const cred_t * cr,proc_t * proc)921 secpolicy_zfs_proc(const cred_t *cr, proc_t *proc)
922 {
923 (void) cr, (void) proc;
924 return (0);
925 }
926
927 ksiddomain_t *
ksid_lookupdomain(const char * dom)928 ksid_lookupdomain(const char *dom)
929 {
930 ksiddomain_t *kd;
931
932 kd = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL);
933 kd->kd_name = spa_strdup(dom);
934 return (kd);
935 }
936
937 void
ksiddomain_rele(ksiddomain_t * ksid)938 ksiddomain_rele(ksiddomain_t *ksid)
939 {
940 spa_strfree(ksid->kd_name);
941 umem_free(ksid, sizeof (ksiddomain_t));
942 }
943
/*
 * Format into a newly allocated string using vasprintf().  The formatting
 * is done on a copy of adx, so the caller's va_list is not consumed.
 * An allocation or formatting failure trips a VERIFY.
 */
char *
kmem_vasprintf(const char *fmt, va_list adx)
{
	va_list ap;
	char *str = NULL;

	va_copy(ap, adx);
	VERIFY(vasprintf(&str, fmt, ap) != -1);
	va_end(ap);

	return (str);
}
956
/*
 * Format into a newly allocated string using vasprintf().  An allocation
 * or formatting failure trips a VERIFY.
 */
char *
kmem_asprintf(const char *fmt, ...)
{
	va_list ap;
	char *str = NULL;

	va_start(ap, fmt);
	VERIFY(vasprintf(&str, fmt, ap) != -1);
	va_end(ap);

	return (str);
}
969
/*
 * snprintf() returns the number of characters that it would have printed
 * whenever it is limited by the value of the size variable, rather than the
 * number of characters that it did print.  This can cause misbehavior on
 * subsequent uses of the return value, so we define a safe version that
 * returns the number of characters actually stored, not counting the
 * terminating NUL character.  Subsequent use of this by the safe string
 * functions is safe whether it is snprintf(), strlcat() or strlcpy().
 *
 * str  - destination buffer
 * size - size of the destination buffer in bytes
 * fmt  - printf-style format, followed by its arguments
 *
 * Returns the number of characters stored in str (at most size - 1).
 * Returns 0 when size is 0 (str is not touched) and when vsnprintf()
 * reports an error (str is set to the empty string).
 */
int
kmem_scnprintf(char *restrict str, size_t size, const char *restrict fmt, ...)
{
	va_list ap;
	int n;

	/* Make the 0 case a no-op so that we do not return -1 */
	if (size == 0)
		return (0);

	va_start(ap, fmt);
	n = vsnprintf(str, size, fmt, ap);
	va_end(ap);

	/*
	 * A negative result must not be compared against size: it would be
	 * converted to a huge unsigned value and reported as size - 1.
	 */
	if (n < 0) {
		str[0] = '\0';
		return (0);
	}

	if ((size_t)n >= size)
		n = (int)(size - 1);

	return (n);
}
998
999 zfs_file_t *
zfs_onexit_fd_hold(int fd,minor_t * minorp)1000 zfs_onexit_fd_hold(int fd, minor_t *minorp)
1001 {
1002 (void) fd;
1003 *minorp = 0;
1004 return (NULL);
1005 }
1006
1007 void
zfs_onexit_fd_rele(zfs_file_t * fp)1008 zfs_onexit_fd_rele(zfs_file_t *fp)
1009 {
1010 (void) fp;
1011 }
1012
1013 int
zfs_onexit_add_cb(minor_t minor,void (* func)(void *),void * data,uintptr_t * action_handle)1014 zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
1015 uintptr_t *action_handle)
1016 {
1017 (void) minor, (void) func, (void) data, (void) action_handle;
1018 return (0);
1019 }
1020
1021 fstrans_cookie_t
spl_fstrans_mark(void)1022 spl_fstrans_mark(void)
1023 {
1024 return ((fstrans_cookie_t)0);
1025 }
1026
1027 void
spl_fstrans_unmark(fstrans_cookie_t cookie)1028 spl_fstrans_unmark(fstrans_cookie_t cookie)
1029 {
1030 (void) cookie;
1031 }
1032
/* fstrans stub: always returns 0. */
int
__spl_pf_fstrans_check(void)
{
	return (0);
}
1038
/* kmem cache stub: a reap is never reported as active; always returns 0. */
int
kmem_cache_reap_active(void)
{
	return (0);
}
1044
/* zvol stub: no minor is created; name is ignored. */
void
zvol_create_minor(const char *name)
{
	(void) name;
}
1050
/* zvol stub: no minors are created; name is ignored. */
void
zvol_create_minors_recursive(const char *name)
{
	(void) name;
}
1056
1057 void
zvol_remove_minors(spa_t * spa,const char * name,boolean_t async)1058 zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
1059 {
1060 (void) spa, (void) name, (void) async;
1061 }
1062
1063 void
zvol_rename_minors(spa_t * spa,const char * oldname,const char * newname,boolean_t async)1064 zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname,
1065 boolean_t async)
1066 {
1067 (void) spa, (void) oldname, (void) newname, (void) async;
1068 }
1069
1070 /*
1071 * Open file
1072 *
1073 * path - fully qualified path to file
1074 * flags - file attributes O_READ / O_WRITE / O_EXCL
1075 * fpp - pointer to return file pointer
1076 *
1077 * Returns 0 on success underlying error on failure.
1078 */
1079 int
zfs_file_open(const char * path,int flags,int mode,zfs_file_t ** fpp)1080 zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp)
1081 {
1082 int fd = -1;
1083 int dump_fd = -1;
1084 int err;
1085 int old_umask = 0;
1086 zfs_file_t *fp;
1087 struct stat64 st;
1088
1089 if (!(flags & O_CREAT) && stat64(path, &st) == -1)
1090 return (errno);
1091
1092 if (!(flags & O_CREAT) && S_ISBLK(st.st_mode))
1093 flags |= O_DIRECT;
1094
1095 if (flags & O_CREAT)
1096 old_umask = umask(0);
1097
1098 fd = open64(path, flags, mode);
1099 if (fd == -1)
1100 return (errno);
1101
1102 if (flags & O_CREAT)
1103 (void) umask(old_umask);
1104
1105 if (vn_dumpdir != NULL) {
1106 char *dumppath = umem_zalloc(MAXPATHLEN, UMEM_NOFAIL);
1107 const char *inpath = zfs_basename(path);
1108
1109 (void) snprintf(dumppath, MAXPATHLEN,
1110 "%s/%s", vn_dumpdir, inpath);
1111 dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666);
1112 umem_free(dumppath, MAXPATHLEN);
1113 if (dump_fd == -1) {
1114 err = errno;
1115 close(fd);
1116 return (err);
1117 }
1118 } else {
1119 dump_fd = -1;
1120 }
1121
1122 (void) fcntl(fd, F_SETFD, FD_CLOEXEC);
1123
1124 fp = umem_zalloc(sizeof (zfs_file_t), UMEM_NOFAIL);
1125 fp->f_fd = fd;
1126 fp->f_dump_fd = dump_fd;
1127 *fpp = fp;
1128
1129 return (0);
1130 }
1131
1132 void
zfs_file_close(zfs_file_t * fp)1133 zfs_file_close(zfs_file_t *fp)
1134 {
1135 close(fp->f_fd);
1136 if (fp->f_dump_fd != -1)
1137 close(fp->f_dump_fd);
1138
1139 umem_free(fp, sizeof (zfs_file_t));
1140 }
1141
1142 /*
1143 * Stateful write - use os internal file pointer to determine where to
1144 * write and update on successful completion.
1145 *
1146 * fp - pointer to file (pipe, socket, etc) to write to
1147 * buf - buffer to write
1148 * count - # of bytes to write
1149 * resid - pointer to count of unwritten bytes (if short write)
1150 *
1151 * Returns 0 on success errno on failure.
1152 */
1153 int
zfs_file_write(zfs_file_t * fp,const void * buf,size_t count,ssize_t * resid)1154 zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
1155 {
1156 ssize_t rc;
1157
1158 rc = write(fp->f_fd, buf, count);
1159 if (rc < 0)
1160 return (errno);
1161
1162 if (resid) {
1163 *resid = count - rc;
1164 } else if (rc != count) {
1165 return (EIO);
1166 }
1167
1168 return (0);
1169 }
1170
/*
 * Stateless write - os internal file pointer is not updated.
 *
 * fp - pointer to file (pipe, socket, etc) to write to
 * buf - buffer to write
 * count - # of bytes to write
 * pos - file offset to write to (only valid for seekable types)
 * resid - pointer to count of unwritten bytes; when NULL, a short write is
 *         reported as EIO
 *
 * Returns 0 on success errno on failure.
 */
int
zfs_file_pwrite(zfs_file_t *fp, const void *buf,
    size_t count, loff_t pos, ssize_t *resid)
{
	ssize_t rc, split, done;
	int sectors;

	/*
	 * To simulate partial disk writes, we split writes into two
	 * system calls so that the process can be killed in between.
	 * This is used by ztest to simulate realistic failure modes.
	 */
	sectors = count >> SPA_MINBLOCKSHIFT;
	/* Random split point, a whole number of minimum-size blocks. */
	split = (sectors > 0 ? rand() % sectors : 0) << SPA_MINBLOCKSHIFT;
	rc = pwrite64(fp->f_fd, buf, split, pos);
	if (rc != -1) {
		/*
		 * done is only assigned here; if the first write failed we
		 * return below before done is read.
		 */
		done = rc;
		rc = pwrite64(fp->f_fd, (char *)buf + split,
		    count - split, pos + split);
	}
#ifdef __linux__
	if (rc == -1 && errno == EINVAL) {
		/*
		 * Under Linux, this most likely means an alignment issue
		 * (memory or disk) due to O_DIRECT, so we abort() in order
		 * to catch the offender.
		 */
		abort();
	}
#endif

	if (rc < 0)
		return (errno);

	/* Total bytes written by both halves. */
	done += rc;

	if (resid) {
		*resid = count - done;
	} else if (done != count) {
		return (EIO);
	}

	return (0);
}
1226
1227 /*
1228 * Stateful read - use os internal file pointer to determine where to
1229 * read and update on successful completion.
1230 *
1231 * fp - pointer to file (pipe, socket, etc) to read from
1232 * buf - buffer to write
1233 * count - # of bytes to read
1234 * resid - pointer to count of unread bytes (if short read)
1235 *
1236 * Returns 0 on success errno on failure.
1237 */
1238 int
zfs_file_read(zfs_file_t * fp,void * buf,size_t count,ssize_t * resid)1239 zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid)
1240 {
1241 int rc;
1242
1243 rc = read(fp->f_fd, buf, count);
1244 if (rc < 0)
1245 return (errno);
1246
1247 if (resid) {
1248 *resid = count - rc;
1249 } else if (rc != count) {
1250 return (EIO);
1251 }
1252
1253 return (0);
1254 }
1255
1256 /*
1257 * Stateless read - os internal file pointer is not updated.
1258 *
1259 * fp - pointer to file (pipe, socket, etc) to read from
1260 * buf - buffer to write
1261 * count - # of bytes to write
1262 * off - file offset to read from (only valid for seekable types)
1263 * resid - pointer to count of unwritten bytes (if short write)
1264 *
1265 * Returns 0 on success errno on failure.
1266 */
1267 int
zfs_file_pread(zfs_file_t * fp,void * buf,size_t count,loff_t off,ssize_t * resid)1268 zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off,
1269 ssize_t *resid)
1270 {
1271 ssize_t rc;
1272
1273 rc = pread64(fp->f_fd, buf, count, off);
1274 if (rc < 0) {
1275 #ifdef __linux__
1276 /*
1277 * Under Linux, this most likely means an alignment issue
1278 * (memory or disk) due to O_DIRECT, so we abort() in order to
1279 * catch the offender.
1280 */
1281 if (errno == EINVAL)
1282 abort();
1283 #endif
1284 return (errno);
1285 }
1286
1287 if (fp->f_dump_fd != -1) {
1288 int status;
1289
1290 status = pwrite64(fp->f_dump_fd, buf, rc, off);
1291 ASSERT(status != -1);
1292 }
1293
1294 if (resid) {
1295 *resid = count - rc;
1296 } else if (rc != count) {
1297 return (EIO);
1298 }
1299
1300 return (0);
1301 }
1302
1303 /*
1304 * lseek - set / get file pointer
1305 *
1306 * fp - pointer to file (pipe, socket, etc) to read from
1307 * offp - value to seek to, returns current value plus passed offset
1308 * whence - see man pages for standard lseek whence values
1309 *
1310 * Returns 0 on success errno on failure (ESPIPE for non seekable types)
1311 */
1312 int
zfs_file_seek(zfs_file_t * fp,loff_t * offp,int whence)1313 zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence)
1314 {
1315 loff_t rc;
1316
1317 rc = lseek(fp->f_fd, *offp, whence);
1318 if (rc < 0)
1319 return (errno);
1320
1321 *offp = rc;
1322
1323 return (0);
1324 }
1325
1326 /*
1327 * Get file attributes
1328 *
1329 * filp - file pointer
1330 * zfattr - pointer to file attr structure
1331 *
1332 * Currently only used for fetching size and file mode
1333 *
1334 * Returns 0 on success or error code of underlying getattr call on failure.
1335 */
1336 int
zfs_file_getattr(zfs_file_t * fp,zfs_file_attr_t * zfattr)1337 zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr)
1338 {
1339 struct stat64 st;
1340
1341 if (fstat64_blk(fp->f_fd, &st) == -1)
1342 return (errno);
1343
1344 zfattr->zfa_size = st.st_size;
1345 zfattr->zfa_mode = st.st_mode;
1346
1347 return (0);
1348 }
1349
1350 /*
1351 * Sync file to disk
1352 *
1353 * filp - file pointer
1354 * flags - O_SYNC and or O_DSYNC
1355 *
1356 * Returns 0 on success or error code of underlying sync call on failure.
1357 */
1358 int
zfs_file_fsync(zfs_file_t * fp,int flags)1359 zfs_file_fsync(zfs_file_t *fp, int flags)
1360 {
1361 (void) flags;
1362
1363 if (fsync(fp->f_fd) < 0)
1364 return (errno);
1365
1366 return (0);
1367 }
1368
1369 /*
1370 * deallocate - zero and/or deallocate file storage
1371 *
1372 * fp - file pointer
1373 * offset - offset to start zeroing or deallocating
1374 * len - length to zero or deallocate
1375 */
1376 int
zfs_file_deallocate(zfs_file_t * fp,loff_t offset,loff_t len)1377 zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len)
1378 {
1379 int rc;
1380 #if defined(__linux__)
1381 rc = fallocate(fp->f_fd,
1382 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len);
1383 #elif defined(__FreeBSD__) && (__FreeBSD_version >= 1400029)
1384 struct spacectl_range rqsr = {
1385 .r_offset = offset,
1386 .r_len = len,
1387 };
1388 rc = fspacectl(fp->f_fd, SPACECTL_DEALLOC, &rqsr, 0, &rqsr);
1389 #else
1390 (void) fp, (void) offset, (void) len;
1391 rc = EOPNOTSUPP;
1392 #endif
1393 if (rc)
1394 return (SET_ERROR(rc));
1395 return (0);
1396 }
1397
1398 /*
1399 * Request current file pointer offset
1400 *
1401 * fp - pointer to file
1402 *
1403 * Returns current file offset.
1404 */
1405 loff_t
zfs_file_off(zfs_file_t * fp)1406 zfs_file_off(zfs_file_t *fp)
1407 {
1408 return (lseek(fp->f_fd, SEEK_CUR, 0));
1409 }
1410
/*
 * unlink file
 *
 * path - fully qualified file path
 *
 * Returns 0 on success, otherwise the non-zero result of remove().
 *
 * OPTIONAL
 */
int
zfs_file_unlink(const char *path)
{
	int rc = remove(path);

	return (rc);
}
1425
1426 /*
1427 * Get reference to file pointer
1428 *
1429 * fd - input file descriptor
1430 *
1431 * Returns pointer to file struct or NULL.
1432 * Unsupported in user space.
1433 */
1434 zfs_file_t *
zfs_file_get(int fd)1435 zfs_file_get(int fd)
1436 {
1437 (void) fd;
1438 abort();
1439 return (NULL);
1440 }
1441 /*
1442 * Drop reference to file pointer
1443 *
1444 * fp - pointer to file struct
1445 *
1446 * Unsupported in user space.
1447 */
1448 void
zfs_file_put(zfs_file_t * fp)1449 zfs_file_put(zfs_file_t *fp)
1450 {
1451 abort();
1452 (void) fp;
1453 }
1454
/*
 * Userland stub: both names are accepted and deliberately ignored.
 */
void
zfsvfs_update_fromname(const char *oldname, const char *newname)
{
	(void) oldname;
	(void) newname;
}
1460
1461 void
spa_import_os(spa_t * spa)1462 spa_import_os(spa_t *spa)
1463 {
1464 (void) spa;
1465 }
1466
1467 void
spa_export_os(spa_t * spa)1468 spa_export_os(spa_t *spa)
1469 {
1470 (void) spa;
1471 }
1472
1473 void
spa_activate_os(spa_t * spa)1474 spa_activate_os(spa_t *spa)
1475 {
1476 (void) spa;
1477 }
1478
1479 void
spa_deactivate_os(spa_t * spa)1480 spa_deactivate_os(spa_t *spa)
1481 {
1482 (void) spa;
1483 }
1484