xref: /illumos-gate/usr/src/lib/libzpool/common/kernel.c (revision f3af49816e370d667d566ab703e94b81305a536e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <assert.h>
29 #include <fcntl.h>
30 #include <poll.h>
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <zlib.h>
35 #include <sys/spa.h>
36 #include <sys/stat.h>
37 #include <sys/processor.h>
38 #include <sys/zfs_context.h>
39 #include <sys/zmod.h>
40 #include <sys/utsname.h>
41 
42 /*
43  * Emulation of kernel services in userland.
44  */
45 
/* Number of physical memory pages; filled in by kernel_init(). */
uint64_t physmem;
/*
 * Stand-in for the root directory vnode.  It is only an arbitrary
 * sentinel address that vn_openat() compares its startvp against.
 */
vnode_t *rootdir = (vnode_t *)0xabcd1234;
/* Host ID formatted as a decimal string by kernel_init(). */
char hw_serial[11];

/* Fixed identification strings exported by the userland emulation. */
struct utsname utsname = {
	"userland", "libzpool", "1", "1", "na"
};
53 
54 /*
55  * =========================================================================
56  * threads
57  * =========================================================================
58  */
59 /*ARGSUSED*/
60 kthread_t *
61 zk_thread_create(void (*func)(), void *arg)
62 {
63 	thread_t tid;
64 
65 	VERIFY(thr_create(0, 0, (void *(*)(void *))func, arg, THR_DETACHED,
66 	    &tid) == 0);
67 
68 	return ((void *)(uintptr_t)tid);
69 }
70 
71 /*
72  * =========================================================================
73  * kstats
74  * =========================================================================
75  */
76 /*ARGSUSED*/
77 kstat_t *
78 kstat_create(char *module, int instance, char *name, char *class,
79     uchar_t type, ulong_t ndata, uchar_t ks_flag)
80 {
81 	return (NULL);
82 }
83 
84 /*ARGSUSED*/
85 void
86 kstat_install(kstat_t *ksp)
87 {}
88 
89 /*ARGSUSED*/
90 void
91 kstat_delete(kstat_t *ksp)
92 {}
93 
94 /*
95  * =========================================================================
96  * mutexes
97  * =========================================================================
98  */
99 void
100 zmutex_init(kmutex_t *mp)
101 {
102 	mp->m_owner = NULL;
103 	(void) _mutex_init(&mp->m_lock, USYNC_THREAD, NULL);
104 }
105 
106 void
107 zmutex_destroy(kmutex_t *mp)
108 {
109 	ASSERT(mp->m_owner == NULL);
110 	(void) _mutex_destroy(&(mp)->m_lock);
111 	mp->m_owner = (void *)-1UL;
112 }
113 
/*
 * Acquire mp, blocking until it is available, and record the calling
 * thread as its owner.
 */
void
mutex_enter(kmutex_t *mp)
{
	/* (void *)-1UL is the marker zmutex_destroy() leaves behind. */
	ASSERT(mp->m_owner != (void *)-1UL);
	/* Catch recursive acquisition by the current thread. */
	ASSERT(mp->m_owner != curthread);
	VERIFY(mutex_lock(&mp->m_lock) == 0);
	/* The previous holder clears m_owner before unlocking. */
	ASSERT(mp->m_owner == NULL);
	mp->m_owner = curthread;
}
123 
124 int
125 mutex_tryenter(kmutex_t *mp)
126 {
127 	ASSERT(mp->m_owner != (void *)-1UL);
128 	if (0 == mutex_trylock(&mp->m_lock)) {
129 		ASSERT(mp->m_owner == NULL);
130 		mp->m_owner = curthread;
131 		return (1);
132 	} else {
133 		return (0);
134 	}
135 }
136 
/*
 * Release mp.  The caller must be the recorded owner.
 */
void
mutex_exit(kmutex_t *mp)
{
	ASSERT(mutex_owner(mp) == curthread);
	/* Clear ownership while the lock is still held, then drop it. */
	mp->m_owner = NULL;
	VERIFY(mutex_unlock(&mp->m_lock) == 0);
}
144 
145 void *
146 mutex_owner(kmutex_t *mp)
147 {
148 	return (mp->m_owner);
149 }
150 
151 /*
152  * =========================================================================
153  * rwlocks
154  * =========================================================================
155  */
156 /*ARGSUSED*/
157 void
158 rw_init(krwlock_t *rwlp, char *name, int type, void *arg)
159 {
160 	rwlock_init(&rwlp->rw_lock, USYNC_THREAD, NULL);
161 	rwlp->rw_owner = NULL;
162 }
163 
164 void
165 rw_destroy(krwlock_t *rwlp)
166 {
167 	rwlock_destroy(&rwlp->rw_lock);
168 	rwlp->rw_owner = (void *)-1UL;
169 }
170 
171 void
172 rw_enter(krwlock_t *rwlp, krw_t rw)
173 {
174 	ASSERT(!RW_LOCK_HELD(rwlp));
175 	ASSERT(rwlp->rw_owner != (void *)-1UL);
176 	ASSERT(rwlp->rw_owner != curthread);
177 
178 	if (rw == RW_READER)
179 		(void) rw_rdlock(&rwlp->rw_lock);
180 	else
181 		(void) rw_wrlock(&rwlp->rw_lock);
182 
183 	rwlp->rw_owner = curthread;
184 }
185 
/*
 * Release rwlp, whether it was held as a reader or a writer.
 */
void
rw_exit(krwlock_t *rwlp)
{
	/* (void *)-1UL is the marker rw_destroy() leaves behind. */
	ASSERT(rwlp->rw_owner != (void *)-1UL);

	/* Clear the recorded owner before the lock is actually dropped. */
	rwlp->rw_owner = NULL;
	(void) rw_unlock(&rwlp->rw_lock);
}
194 
195 int
196 rw_tryenter(krwlock_t *rwlp, krw_t rw)
197 {
198 	int rv;
199 
200 	ASSERT(rwlp->rw_owner != (void *)-1UL);
201 
202 	if (rw == RW_READER)
203 		rv = rw_tryrdlock(&rwlp->rw_lock);
204 	else
205 		rv = rw_trywrlock(&rwlp->rw_lock);
206 
207 	if (rv == 0) {
208 		rwlp->rw_owner = curthread;
209 		return (1);
210 	}
211 
212 	return (0);
213 }
214 
215 /*ARGSUSED*/
216 int
217 rw_tryupgrade(krwlock_t *rwlp)
218 {
219 	ASSERT(rwlp->rw_owner != (void *)-1UL);
220 
221 	return (0);
222 }
223 
224 /*
225  * =========================================================================
226  * condition variables
227  * =========================================================================
228  */
/*
 * Initialize a condition variable.  The type argument is handed
 * straight to cond_init(); name and arg are not used.  Failure is fatal.
 */
/*ARGSUSED*/
void
cv_init(kcondvar_t *cv, char *name, int type, void *arg)
{
	VERIFY(cond_init(cv, type, NULL) == 0);
}
235 
/*
 * Destroy a condition variable.  Failure is fatal.
 */
void
cv_destroy(kcondvar_t *cv)
{
	VERIFY(cond_destroy(cv) == 0);
}
241 
/*
 * Wait on cv.  The caller must own mp; ownership is recorded as
 * released for the duration of the wait and restored afterwards.
 * A return of EINTR from cond_wait() is tolerated, so callers can
 * observe a wakeup without a matching signal.
 */
void
cv_wait(kcondvar_t *cv, kmutex_t *mp)
{
	ASSERT(mutex_owner(mp) == curthread);
	/* cond_wait() is given the underlying lock, so drop our ownership record. */
	mp->m_owner = NULL;
	int ret = cond_wait(cv, &mp->m_lock);
	VERIFY(ret == 0 || ret == EINTR);
	mp->m_owner = curthread;
}
251 
252 clock_t
253 cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
254 {
255 	int error;
256 	timestruc_t ts;
257 	clock_t delta;
258 
259 top:
260 	delta = abstime - lbolt;
261 	if (delta <= 0)
262 		return (-1);
263 
264 	ts.tv_sec = delta / hz;
265 	ts.tv_nsec = (delta % hz) * (NANOSEC / hz);
266 
267 	ASSERT(mutex_owner(mp) == curthread);
268 	mp->m_owner = NULL;
269 	error = cond_reltimedwait(cv, &mp->m_lock, &ts);
270 	mp->m_owner = curthread;
271 
272 	if (error == ETIME)
273 		return (-1);
274 
275 	if (error == EINTR)
276 		goto top;
277 
278 	ASSERT(error == 0);
279 
280 	return (1);
281 }
282 
/*
 * Signal cv via cond_signal().  Failure is fatal.
 */
void
cv_signal(kcondvar_t *cv)
{
	VERIFY(cond_signal(cv) == 0);
}
288 
/*
 * Broadcast on cv via cond_broadcast().  Failure is fatal.
 */
void
cv_broadcast(kcondvar_t *cv)
{
	VERIFY(cond_broadcast(cv) == 0);
}
294 
295 /*
296  * =========================================================================
297  * vnode operations
298  * =========================================================================
299  */
300 /*
301  * Note: for the xxxat() versions of these functions, we assume that the
302  * starting vp is always rootdir (which is true for spa_directory.c, the only
303  * ZFS consumer of these interfaces).  We assert this is true, and then emulate
304  * them by adding '/' in front of the path.
305  */
306 
307 /*ARGSUSED*/
308 int
309 vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3)
310 {
311 	int fd;
312 	vnode_t *vp;
313 	int old_umask;
314 	char realpath[MAXPATHLEN];
315 	struct stat64 st;
316 
317 	/*
318 	 * If we're accessing a real disk from userland, we need to use
319 	 * the character interface to avoid caching.  This is particularly
320 	 * important if we're trying to look at a real in-kernel storage
321 	 * pool from userland, e.g. via zdb, because otherwise we won't
322 	 * see the changes occurring under the segmap cache.
323 	 * On the other hand, the stupid character device returns zero
324 	 * for its size.  So -- gag -- we open the block device to get
325 	 * its size, and remember it for subsequent VOP_GETATTR().
326 	 */
327 	if (strncmp(path, "/dev/", 5) == 0) {
328 		char *dsk;
329 		fd = open64(path, O_RDONLY);
330 		if (fd == -1)
331 			return (errno);
332 		if (fstat64(fd, &st) == -1) {
333 			close(fd);
334 			return (errno);
335 		}
336 		close(fd);
337 		(void) sprintf(realpath, "%s", path);
338 		dsk = strstr(path, "/dsk/");
339 		if (dsk != NULL)
340 			(void) sprintf(realpath + (dsk - path) + 1, "r%s",
341 			    dsk + 1);
342 	} else {
343 		(void) sprintf(realpath, "%s", path);
344 		if (!(flags & FCREAT) && stat64(realpath, &st) == -1)
345 			return (errno);
346 	}
347 
348 	if (flags & FCREAT)
349 		old_umask = umask(0);
350 
351 	/*
352 	 * The construct 'flags - FREAD' conveniently maps combinations of
353 	 * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR.
354 	 */
355 	fd = open64(realpath, flags - FREAD, mode);
356 
357 	if (flags & FCREAT)
358 		(void) umask(old_umask);
359 
360 	if (fd == -1)
361 		return (errno);
362 
363 	if (fstat64(fd, &st) == -1) {
364 		close(fd);
365 		return (errno);
366 	}
367 
368 	(void) fcntl(fd, F_SETFD, FD_CLOEXEC);
369 
370 	*vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL);
371 
372 	vp->v_fd = fd;
373 	vp->v_size = st.st_size;
374 	vp->v_path = spa_strdup(path);
375 
376 	return (0);
377 }
378 
379 int
380 vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2,
381     int x3, vnode_t *startvp)
382 {
383 	char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL);
384 	int ret;
385 
386 	ASSERT(startvp == rootdir);
387 	(void) sprintf(realpath, "/%s", path);
388 
389 	ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3);
390 
391 	umem_free(realpath, strlen(path) + 2);
392 
393 	return (ret);
394 }
395 
396 /*ARGSUSED*/
397 int
398 vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset,
399 	int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp)
400 {
401 	ssize_t iolen, split;
402 
403 	if (uio == UIO_READ) {
404 		iolen = pread64(vp->v_fd, addr, len, offset);
405 	} else {
406 		/*
407 		 * To simulate partial disk writes, we split writes into two
408 		 * system calls so that the process can be killed in between.
409 		 */
410 		split = (len > 0 ? rand() % len : 0);
411 		iolen = pwrite64(vp->v_fd, addr, split, offset);
412 		iolen += pwrite64(vp->v_fd, (char *)addr + split,
413 		    len - split, offset + split);
414 	}
415 
416 	if (iolen == -1)
417 		return (errno);
418 	if (residp)
419 		*residp = len - iolen;
420 	else if (iolen != len)
421 		return (EIO);
422 	return (0);
423 }
424 
425 void
426 vn_close(vnode_t *vp)
427 {
428 	close(vp->v_fd);
429 	spa_strfree(vp->v_path);
430 	umem_free(vp, sizeof (vnode_t));
431 }
432 
433 #ifdef ZFS_DEBUG
434 
435 /*
436  * =========================================================================
437  * Figure out which debugging statements to print
438  * =========================================================================
439  */
440 
441 static char *dprintf_string;
442 static int dprintf_print_all;
443 
444 int
445 dprintf_find_string(const char *string)
446 {
447 	char *tmp_str = dprintf_string;
448 	int len = strlen(string);
449 
450 	/*
451 	 * Find out if this is a string we want to print.
452 	 * String format: file1.c,function_name1,file2.c,file3.c
453 	 */
454 
455 	while (tmp_str != NULL) {
456 		if (strncmp(tmp_str, string, len) == 0 &&
457 		    (tmp_str[len] == ',' || tmp_str[len] == '\0'))
458 			return (1);
459 		tmp_str = strchr(tmp_str, ',');
460 		if (tmp_str != NULL)
461 			tmp_str++; /* Get rid of , */
462 	}
463 	return (0);
464 }
465 
466 void
467 dprintf_setup(int *argc, char **argv)
468 {
469 	int i, j;
470 
471 	/*
472 	 * Debugging can be specified two ways: by setting the
473 	 * environment variable ZFS_DEBUG, or by including a
474 	 * "debug=..."  argument on the command line.  The command
475 	 * line setting overrides the environment variable.
476 	 */
477 
478 	for (i = 1; i < *argc; i++) {
479 		int len = strlen("debug=");
480 		/* First look for a command line argument */
481 		if (strncmp("debug=", argv[i], len) == 0) {
482 			dprintf_string = argv[i] + len;
483 			/* Remove from args */
484 			for (j = i; j < *argc; j++)
485 				argv[j] = argv[j+1];
486 			argv[j] = NULL;
487 			(*argc)--;
488 		}
489 	}
490 
491 	if (dprintf_string == NULL) {
492 		/* Look for ZFS_DEBUG environment variable */
493 		dprintf_string = getenv("ZFS_DEBUG");
494 	}
495 
496 	/*
497 	 * Are we just turning on all debugging?
498 	 */
499 	if (dprintf_find_string("on"))
500 		dprintf_print_all = 1;
501 }
502 
503 /*
504  * =========================================================================
505  * debug printfs
506  * =========================================================================
507  */
508 void
509 __dprintf(const char *file, const char *func, int line, const char *fmt, ...)
510 {
511 	const char *newfile;
512 	va_list adx;
513 
514 	/*
515 	 * Get rid of annoying "../common/" prefix to filename.
516 	 */
517 	newfile = strrchr(file, '/');
518 	if (newfile != NULL) {
519 		newfile = newfile + 1; /* Get rid of leading / */
520 	} else {
521 		newfile = file;
522 	}
523 
524 	if (dprintf_print_all ||
525 	    dprintf_find_string(newfile) ||
526 	    dprintf_find_string(func)) {
527 		/* Print out just the function name if requested */
528 		flockfile(stdout);
529 		if (dprintf_find_string("pid"))
530 			(void) printf("%d ", getpid());
531 		if (dprintf_find_string("tid"))
532 			(void) printf("%u ", thr_self());
533 		if (dprintf_find_string("cpu"))
534 			(void) printf("%u ", getcpuid());
535 		if (dprintf_find_string("time"))
536 			(void) printf("%llu ", gethrtime());
537 		if (dprintf_find_string("long"))
538 			(void) printf("%s, line %d: ", newfile, line);
539 		(void) printf("%s: ", func);
540 		va_start(adx, fmt);
541 		(void) vprintf(fmt, adx);
542 		va_end(adx);
543 		funlockfile(stdout);
544 	}
545 }
546 
547 #endif /* ZFS_DEBUG */
548 
549 /*
550  * =========================================================================
551  * cmn_err() and panic()
552  * =========================================================================
553  */
554 static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" };
555 static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" };
556 
/*
 * Print "error: " followed by the formatted message and a newline on
 * stderr, then abort the process.  Does not return.
 */
void
vpanic(const char *fmt, va_list adx)
{
	(void) fputs("error: ", stderr);
	(void) vfprintf(stderr, fmt, adx);
	(void) fputc('\n', stderr);

	abort();	/* think of it as a "user-level crash dump" */
}
566 
/*
 * Variadic front end to vpanic(): formats the message and aborts.
 */
void
panic(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vpanic(fmt, args);
	va_end(args);
}
576 
577 void
578 vcmn_err(int ce, const char *fmt, va_list adx)
579 {
580 	if (ce == CE_PANIC)
581 		vpanic(fmt, adx);
582 	if (ce != CE_NOTE) {	/* suppress noise in userland stress testing */
583 		(void) fprintf(stderr, "%s", ce_prefix[ce]);
584 		(void) vfprintf(stderr, fmt, adx);
585 		(void) fprintf(stderr, "%s", ce_suffix[ce]);
586 	}
587 }
588 
/*
 * Variadic front end to vcmn_err(): report a printf-style message at
 * severity 'ce'.
 */
/*PRINTFLIKE2*/
void
cmn_err(int ce, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vcmn_err(ce, fmt, args);
	va_end(args);
}
599 
600 /*
601  * =========================================================================
602  * kobj interfaces
603  * =========================================================================
604  */
605 struct _buf *
606 kobj_open_file(char *name)
607 {
608 	struct _buf *file;
609 	vnode_t *vp;
610 
611 	/* set vp as the _fd field of the file */
612 	if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir) != 0)
613 		return ((void *)-1UL);
614 
615 	file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL);
616 	file->_fd = (intptr_t)vp;
617 	return (file);
618 }
619 
620 int
621 kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off)
622 {
623 	ssize_t resid;
624 
625 	vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off,
626 	    UIO_SYSSPACE, 0, 0, 0, &resid);
627 
628 	return (size - resid);
629 }
630 
631 void
632 kobj_close_file(struct _buf *file)
633 {
634 	vn_close((vnode_t *)file->_fd);
635 	umem_free(file, sizeof (struct _buf));
636 }
637 
638 int
639 kobj_get_filesize(struct _buf *file, uint64_t *size)
640 {
641 	struct stat64 st;
642 	vnode_t *vp = (vnode_t *)file->_fd;
643 
644 	if (fstat64(vp->v_fd, &st) == -1) {
645 		vn_close(vp);
646 		return (errno);
647 	}
648 	*size = st.st_size;
649 	return (0);
650 }
651 
652 /*
653  * =========================================================================
654  * misc routines
655  * =========================================================================
656  */
657 
658 void
659 delay(clock_t ticks)
660 {
661 	poll(0, 0, ticks * (1000 / hz));
662 }
663 
664 /*
665  * Find highest one bit set.
666  *	Returns bit number + 1 of highest bit that is set, otherwise returns 0.
667  * High order bit is 31 (or 63 in _LP64 kernel).
668  */
669 int
670 highbit(ulong_t i)
671 {
672 	register int h = 1;
673 
674 	if (i == 0)
675 		return (0);
676 #ifdef _LP64
677 	if (i & 0xffffffff00000000ul) {
678 		h += 32; i >>= 32;
679 	}
680 #endif
681 	if (i & 0xffff0000) {
682 		h += 16; i >>= 16;
683 	}
684 	if (i & 0xff00) {
685 		h += 8; i >>= 8;
686 	}
687 	if (i & 0xf0) {
688 		h += 4; i >>= 4;
689 	}
690 	if (i & 0xc) {
691 		h += 2; i >>= 2;
692 	}
693 	if (i & 0x2) {
694 		h += 1;
695 	}
696 	return (h);
697 }
698 
/*
 * Fill ptr[0 .. len-1] with bytes read from the device 'devname'.
 * Always returns 0; failing to open or read the device is fatal.
 *
 * The checks use VERIFY rather than ASSERT so that they hold in every
 * build: continuing after a failed read would move ptr backwards and
 * grow the remaining count, and a zero-length read would spin forever.
 */
static int
random_get_bytes_common(uint8_t *ptr, size_t len, char *devname)
{
	int fd = open(devname, O_RDONLY);
	size_t resid = len;
	ssize_t bytes;

	VERIFY(fd != -1);

	while (resid != 0) {
		bytes = read(fd, ptr, resid);
		/* An interrupted read transferred nothing; just retry. */
		if (bytes == -1 && errno == EINTR)
			continue;
		VERIFY(bytes > 0);
		ptr += bytes;
		resid -= bytes;
	}

	(void) close(fd);

	return (0);
}
719 
/*
 * Fill ptr[0 .. len-1] with bytes read from /dev/random.
 */
int
random_get_bytes(uint8_t *ptr, size_t len)
{
	int rc = random_get_bytes_common(ptr, len, "/dev/random");

	return (rc);
}
725 
/*
 * Fill ptr[0 .. len-1] with bytes read from /dev/urandom.
 */
int
random_get_pseudo_bytes(uint8_t *ptr, size_t len)
{
	int rc = random_get_bytes_common(ptr, len, "/dev/urandom");

	return (rc);
}
731 
/*
 * Convert the leading digits of hw_serial, in the given base, to an
 * unsigned long.
 *
 * The converted value is always stored in *result (0 when nothing
 * could be converted).  When nptr is non-NULL it receives a pointer to
 * the first character that was not consumed.
 *
 * Returns 0 on success, EINVAL when no digits could be converted, or
 * the errno set by strtoul() (for example ERANGE on overflow).
 */
int
ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result)
{
	/* Pre-set in case strtoul() rejects the base without touching it. */
	char *end = (char *)hw_serial;
	unsigned long val;
	int err;

	/* strtoul() only sets errno on failure, so start from a clean slate. */
	errno = 0;
	val = strtoul(hw_serial, &end, base);
	err = errno;

	if (nptr != NULL)
		*nptr = end;
	*result = val;

	if (err != 0)
		return (err);
	if (end == hw_serial)
		return (EINVAL);
	return (0);
}
742 
743 /*
744  * =========================================================================
745  * kernel emulation setup & teardown
746  * =========================================================================
747  */
/*
 * Allocation-failure callback registered with umem_nofail_callback():
 * writes a short message straight to stderr's descriptor and aborts.
 * The return statement is never reached.
 */
static int
umem_out_of_memory(void)
{
	static const char errmsg[] = "out of memory -- generating core dump\n";

	/* sizeof counts the terminating NUL, which must not be written. */
	(void) write(fileno(stderr), errmsg, sizeof (errmsg) - 1);
	abort();
	return (0);
}
757 
758 void
759 kernel_init(int mode)
760 {
761 	umem_nofail_callback(umem_out_of_memory);
762 
763 	physmem = sysconf(_SC_PHYS_PAGES);
764 
765 	dprintf("physmem = %llu pages (%.2f GB)\n", physmem,
766 	    (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));
767 
768 	snprintf(hw_serial, sizeof (hw_serial), "%ld", gethostid());
769 
770 	spa_init(mode);
771 }
772 
/*
 * Shut down the userland kernel emulation by finalizing the SPA.
 */
void
kernel_fini(void)
{
	spa_fini();
}
778 
779 int
780 z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen)
781 {
782 	int ret;
783 	uLongf len = *dstlen;
784 
785 	if ((ret = uncompress(dst, &len, src, srclen)) == Z_OK)
786 		*dstlen = (size_t)len;
787 
788 	return (ret);
789 }
790 
791 int
792 z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen,
793     int level)
794 {
795 	int ret;
796 	uLongf len = *dstlen;
797 
798 	if ((ret = compress2(dst, &len, src, srclen, level)) == Z_OK)
799 		*dstlen = (size_t)len;
800 
801 	return (ret);
802 }
803