xref: /titanic_50/usr/src/lib/libzpool/common/kernel.c (revision e127a3e717f822eb855235fa3bd08235b2cf533d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <assert.h>
29 #include <sys/zfs_context.h>
30 #include <poll.h>
31 #include <string.h>
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <fcntl.h>
35 #include <sys/stat.h>
36 #include <sys/spa.h>
37 #include <sys/processor.h>
38 
39 
40 /*
41  * Emulation of kernel services in userland.
42  */
43 
44 uint64_t physmem;
45 vnode_t *rootdir = (vnode_t *)0xabcd1234;
46 
47 /*
48  * =========================================================================
49  * threads
50  * =========================================================================
51  */
52 /*ARGSUSED*/
53 kthread_t *
54 zk_thread_create(void (*func)(), void *arg)
55 {
56 	thread_t tid;
57 
58 	VERIFY(thr_create(0, 0, (void *(*)(void *))func, arg, THR_DETACHED,
59 	    &tid) == 0);
60 
61 	return ((void *)(uintptr_t)tid);
62 }
63 
64 /*
65  * =========================================================================
66  * kstats
67  * =========================================================================
68  */
69 /*ARGSUSED*/
70 kstat_t *
71 kstat_create(char *module, int instance, char *name, char *class,
72     uchar_t type, ulong_t ndata, uchar_t ks_flag)
73 {
74 	return (NULL);
75 }
76 
77 /*ARGSUSED*/
78 void
79 kstat_install(kstat_t *ksp)
80 {}
81 
82 /*ARGSUSED*/
83 void
84 kstat_delete(kstat_t *ksp)
85 {}
86 
87 /*
88  * =========================================================================
89  * mutexes
90  * =========================================================================
91  */
92 void
93 zmutex_init(kmutex_t *mp)
94 {
95 	mp->m_owner = NULL;
96 	(void) _mutex_init(&mp->m_lock, USYNC_THREAD, NULL);
97 }
98 
99 void
100 zmutex_destroy(kmutex_t *mp)
101 {
102 	ASSERT(mp->m_owner == NULL);
103 	(void) _mutex_destroy(&(mp)->m_lock);
104 	mp->m_owner = (void *)-1UL;
105 }
106 
107 void
108 mutex_enter(kmutex_t *mp)
109 {
110 	ASSERT(mp->m_owner != (void *)-1UL);
111 	ASSERT(mp->m_owner != curthread);
112 	VERIFY(mutex_lock(&mp->m_lock) == 0);
113 	ASSERT(mp->m_owner == NULL);
114 	mp->m_owner = curthread;
115 }
116 
117 int
118 mutex_tryenter(kmutex_t *mp)
119 {
120 	ASSERT(mp->m_owner != (void *)-1UL);
121 	if (0 == mutex_trylock(&mp->m_lock)) {
122 		ASSERT(mp->m_owner == NULL);
123 		mp->m_owner = curthread;
124 		return (1);
125 	} else {
126 		return (0);
127 	}
128 }
129 
130 void
131 mutex_exit(kmutex_t *mp)
132 {
133 	ASSERT(mutex_owner(mp) == curthread);
134 	mp->m_owner = NULL;
135 	VERIFY(mutex_unlock(&mp->m_lock) == 0);
136 }
137 
138 void *
139 mutex_owner(kmutex_t *mp)
140 {
141 	return (mp->m_owner);
142 }
143 
144 /*
145  * =========================================================================
146  * rwlocks
147  * =========================================================================
148  */
149 /*ARGSUSED*/
150 void
151 rw_init(krwlock_t *rwlp, char *name, int type, void *arg)
152 {
153 	rwlock_init(&rwlp->rw_lock, USYNC_THREAD, NULL);
154 	rwlp->rw_owner = NULL;
155 }
156 
157 void
158 rw_destroy(krwlock_t *rwlp)
159 {
160 	rwlock_destroy(&rwlp->rw_lock);
161 	rwlp->rw_owner = (void *)-1UL;
162 }
163 
164 void
165 rw_enter(krwlock_t *rwlp, krw_t rw)
166 {
167 	ASSERT(!RW_LOCK_HELD(rwlp));
168 	ASSERT(rwlp->rw_owner != (void *)-1UL);
169 	ASSERT(rwlp->rw_owner != curthread);
170 
171 	if (rw == RW_READER)
172 		(void) rw_rdlock(&rwlp->rw_lock);
173 	else
174 		(void) rw_wrlock(&rwlp->rw_lock);
175 
176 	rwlp->rw_owner = curthread;
177 }
178 
179 void
180 rw_exit(krwlock_t *rwlp)
181 {
182 	ASSERT(rwlp->rw_owner != (void *)-1UL);
183 
184 	rwlp->rw_owner = NULL;
185 	(void) rw_unlock(&rwlp->rw_lock);
186 }
187 
188 int
189 rw_tryenter(krwlock_t *rwlp, krw_t rw)
190 {
191 	int rv;
192 
193 	ASSERT(rwlp->rw_owner != (void *)-1UL);
194 
195 	if (rw == RW_READER)
196 		rv = rw_tryrdlock(&rwlp->rw_lock);
197 	else
198 		rv = rw_trywrlock(&rwlp->rw_lock);
199 
200 	if (rv == 0) {
201 		rwlp->rw_owner = curthread;
202 		return (1);
203 	}
204 
205 	return (0);
206 }
207 
208 /*ARGSUSED*/
209 int
210 rw_tryupgrade(krwlock_t *rwlp)
211 {
212 	ASSERT(rwlp->rw_owner != (void *)-1UL);
213 
214 	return (0);
215 }
216 
217 /*
218  * =========================================================================
219  * condition variables
220  * =========================================================================
221  */
222 /*ARGSUSED*/
223 void
224 cv_init(kcondvar_t *cv, char *name, int type, void *arg)
225 {
226 	VERIFY(cond_init(cv, type, NULL) == 0);
227 }
228 
229 void
230 cv_destroy(kcondvar_t *cv)
231 {
232 	VERIFY(cond_destroy(cv) == 0);
233 }
234 
235 void
236 cv_wait(kcondvar_t *cv, kmutex_t *mp)
237 {
238 	ASSERT(mutex_owner(mp) == curthread);
239 	mp->m_owner = NULL;
240 	int ret = cond_wait(cv, &mp->m_lock);
241 	VERIFY(ret == 0 || ret == EINTR);
242 	mp->m_owner = curthread;
243 }
244 
245 clock_t
246 cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
247 {
248 	int error;
249 	timestruc_t ts;
250 	clock_t delta;
251 
252 top:
253 	delta = abstime - lbolt;
254 	if (delta <= 0)
255 		return (-1);
256 
257 	ts.tv_sec = delta / hz;
258 	ts.tv_nsec = (delta % hz) * (NANOSEC / hz);
259 
260 	ASSERT(mutex_owner(mp) == curthread);
261 	mp->m_owner = NULL;
262 	error = cond_reltimedwait(cv, &mp->m_lock, &ts);
263 	mp->m_owner = curthread;
264 
265 	if (error == ETIME)
266 		return (-1);
267 
268 	if (error == EINTR)
269 		goto top;
270 
271 	ASSERT(error == 0);
272 
273 	return (1);
274 }
275 
276 void
277 cv_signal(kcondvar_t *cv)
278 {
279 	VERIFY(cond_signal(cv) == 0);
280 }
281 
282 void
283 cv_broadcast(kcondvar_t *cv)
284 {
285 	VERIFY(cond_broadcast(cv) == 0);
286 }
287 
288 /*
289  * =========================================================================
290  * vnode operations
291  * =========================================================================
292  */
293 /*
294  * Note: for the xxxat() versions of these functions, we assume that the
295  * starting vp is always rootdir (which is true for spa_directory.c, the only
296  * ZFS consumer of these interfaces).  We assert this is true, and then emulate
297  * them by adding '/' in front of the path.
298  */
299 
300 /*ARGSUSED*/
301 int
302 vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3)
303 {
304 	int fd;
305 	vnode_t *vp;
306 	int old_umask;
307 	char realpath[MAXPATHLEN];
308 	struct stat64 st;
309 
310 	/*
311 	 * If we're accessing a real disk from userland, we need to use
312 	 * the character interface to avoid caching.  This is particularly
313 	 * important if we're trying to look at a real in-kernel storage
314 	 * pool from userland, e.g. via zdb, because otherwise we won't
315 	 * see the changes occurring under the segmap cache.
316 	 * On the other hand, the stupid character device returns zero
317 	 * for its size.  So -- gag -- we open the block device to get
318 	 * its size, and remember it for subsequent VOP_GETATTR().
319 	 */
320 	if (strncmp(path, "/dev/", 5) == 0) {
321 		char *dsk;
322 		fd = open64(path, O_RDONLY);
323 		if (fd == -1)
324 			return (errno);
325 		if (fstat64(fd, &st) == -1) {
326 			close(fd);
327 			return (errno);
328 		}
329 		close(fd);
330 		(void) sprintf(realpath, "%s", path);
331 		dsk = strstr(path, "/dsk/");
332 		if (dsk != NULL)
333 			(void) sprintf(realpath + (dsk - path) + 1, "r%s",
334 			    dsk + 1);
335 	} else {
336 		(void) sprintf(realpath, "%s", path);
337 		if (!(flags & FCREAT) && stat64(realpath, &st) == -1)
338 			return (errno);
339 	}
340 
341 	if (flags & FCREAT)
342 		old_umask = umask(0);
343 
344 	/*
345 	 * The construct 'flags - FREAD' conveniently maps combinations of
346 	 * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR.
347 	 */
348 	fd = open64(realpath, flags - FREAD, mode);
349 
350 	if (flags & FCREAT)
351 		(void) umask(old_umask);
352 
353 	if (fd == -1)
354 		return (errno);
355 
356 	if (fstat64(fd, &st) == -1) {
357 		close(fd);
358 		return (errno);
359 	}
360 
361 	(void) fcntl(fd, F_SETFD, FD_CLOEXEC);
362 
363 	*vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL);
364 
365 	vp->v_fd = fd;
366 	vp->v_size = st.st_size;
367 	vp->v_path = spa_strdup(path);
368 
369 	return (0);
370 }
371 
372 int
373 vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2,
374     int x3, vnode_t *startvp)
375 {
376 	char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL);
377 	int ret;
378 
379 	ASSERT(startvp == rootdir);
380 	(void) sprintf(realpath, "/%s", path);
381 
382 	ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3);
383 
384 	umem_free(realpath, strlen(path) + 2);
385 
386 	return (ret);
387 }
388 
389 /*ARGSUSED*/
390 int
391 vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset,
392 	int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp)
393 {
394 	ssize_t iolen, split;
395 
396 	if (uio == UIO_READ) {
397 		iolen = pread64(vp->v_fd, addr, len, offset);
398 	} else {
399 		/*
400 		 * To simulate partial disk writes, we split writes into two
401 		 * system calls so that the process can be killed in between.
402 		 */
403 		split = (len > 0 ? rand() % len : 0);
404 		iolen = pwrite64(vp->v_fd, addr, split, offset);
405 		iolen += pwrite64(vp->v_fd, (char *)addr + split,
406 		    len - split, offset + split);
407 	}
408 
409 	if (iolen == -1)
410 		return (errno);
411 	if (residp)
412 		*residp = len - iolen;
413 	else if (iolen != len)
414 		return (EIO);
415 	return (0);
416 }
417 
418 void
419 vn_close(vnode_t *vp)
420 {
421 	close(vp->v_fd);
422 	spa_strfree(vp->v_path);
423 	umem_free(vp, sizeof (vnode_t));
424 }
425 
426 #ifdef ZFS_DEBUG
427 
428 /*
429  * =========================================================================
430  * Figure out which debugging statements to print
431  * =========================================================================
432  */
433 
/* Comma-separated debug selection list; NULL when debugging is not set up. */
static char *dprintf_string;
/* Non-zero once the selection list contains "on". */
static int dprintf_print_all;

/*
 * Report whether 'string' appears as a complete entry of the selection
 * list.  List format: file1.c,function_name1,file2.c,file3.c
 * Returns 1 on a match, 0 otherwise (including when no list is set).
 */
int
dprintf_find_string(const char *string)
{
	int len = strlen(string);
	char *entry;

	for (entry = dprintf_string; entry != NULL; ) {
		char *comma;

		/* A hit must span the whole entry, not merely a prefix. */
		if (strncmp(entry, string, len) == 0 &&
		    (entry[len] == ',' || entry[len] == '\0'))
			return (1);

		/* Step to the entry after the next comma, if any. */
		comma = strchr(entry, ',');
		entry = (comma != NULL) ? comma + 1 : NULL;
	}
	return (0);
}
458 
459 void
460 dprintf_setup(int *argc, char **argv)
461 {
462 	int i, j;
463 
464 	/*
465 	 * Debugging can be specified two ways: by setting the
466 	 * environment variable ZFS_DEBUG, or by including a
467 	 * "debug=..."  argument on the command line.  The command
468 	 * line setting overrides the environment variable.
469 	 */
470 
471 	for (i = 1; i < *argc; i++) {
472 		int len = strlen("debug=");
473 		/* First look for a command line argument */
474 		if (strncmp("debug=", argv[i], len) == 0) {
475 			dprintf_string = argv[i] + len;
476 			/* Remove from args */
477 			for (j = i; j < *argc; j++)
478 				argv[j] = argv[j+1];
479 			argv[j] = NULL;
480 			(*argc)--;
481 		}
482 	}
483 
484 	if (dprintf_string == NULL) {
485 		/* Look for ZFS_DEBUG environment variable */
486 		dprintf_string = getenv("ZFS_DEBUG");
487 	}
488 
489 	/*
490 	 * Are we just turning on all debugging?
491 	 */
492 	if (dprintf_find_string("on"))
493 		dprintf_print_all = 1;
494 }
495 
496 /*
497  * =========================================================================
498  * debug printfs
499  * =========================================================================
500  */
501 void
502 __dprintf(const char *file, const char *func, int line, const char *fmt, ...)
503 {
504 	const char *newfile;
505 	va_list adx;
506 
507 	/*
508 	 * Get rid of annoying "../common/" prefix to filename.
509 	 */
510 	newfile = strrchr(file, '/');
511 	if (newfile != NULL) {
512 		newfile = newfile + 1; /* Get rid of leading / */
513 	} else {
514 		newfile = file;
515 	}
516 
517 	if (dprintf_print_all ||
518 	    dprintf_find_string(newfile) ||
519 	    dprintf_find_string(func)) {
520 		/* Print out just the function name if requested */
521 		flockfile(stdout);
522 		if (dprintf_find_string("pid"))
523 			(void) printf("%d ", getpid());
524 		if (dprintf_find_string("tid"))
525 			(void) printf("%u ", thr_self());
526 		if (dprintf_find_string("cpu"))
527 			(void) printf("%u ", getcpuid());
528 		if (dprintf_find_string("time"))
529 			(void) printf("%llu ", gethrtime());
530 		if (dprintf_find_string("long"))
531 			(void) printf("%s, line %d: ", newfile, line);
532 		(void) printf("%s: ", func);
533 		va_start(adx, fmt);
534 		(void) vprintf(fmt, adx);
535 		va_end(adx);
536 		funlockfile(stdout);
537 	}
538 }
539 
540 #endif /* ZFS_DEBUG */
541 
542 /*
543  * =========================================================================
544  * cmn_err() and panic()
545  * =========================================================================
546  */
547 static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" };
548 static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" };
549 
/*
 * Print "error: " followed by the formatted message and a newline on
 * stderr, then abort the process.  Does not return.
 */
void
vpanic(const char *fmt, va_list adx)
{
	(void) fprintf(stderr, "error: ");
	(void) vfprintf(stderr, fmt, adx);
	(void) fprintf(stderr, "\n");

	/* think of it as a "user-level crash dump" */
	abort();
}
559 
/*
 * Variadic front end to vpanic(): report the formatted message and abort.
 */
void
panic(const char *fmt, ...)
{
	va_list adx;

	va_start(adx, fmt);
	vpanic(fmt, adx);
	/* vpanic() aborts; va_end() is kept for form. */
	va_end(adx);
}
569 
570 void
571 vcmn_err(int ce, const char *fmt, va_list adx)
572 {
573 	if (ce == CE_PANIC)
574 		vpanic(fmt, adx);
575 	if (ce != CE_NOTE) {	/* suppress noise in userland stress testing */
576 		(void) fprintf(stderr, "%s", ce_prefix[ce]);
577 		(void) vfprintf(stderr, fmt, adx);
578 		(void) fprintf(stderr, "%s", ce_suffix[ce]);
579 	}
580 }
581 
/*
 * Variadic front end to vcmn_err(): report a message at severity 'ce'.
 */
/*PRINTFLIKE2*/
void
cmn_err(int ce, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vcmn_err(ce, fmt, args);
	va_end(args);
}
592 
593 /*
594  * =========================================================================
595  * kobj interfaces
596  * =========================================================================
597  */
598 struct _buf *
599 kobj_open_file(char *name)
600 {
601 	struct _buf *file;
602 	vnode_t *vp;
603 
604 	/* set vp as the _fd field of the file */
605 	if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir) != 0)
606 		return ((void *)-1UL);
607 
608 	file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL);
609 	file->_fd = (intptr_t)vp;
610 	return (file);
611 }
612 
613 int
614 kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off)
615 {
616 	ssize_t resid;
617 
618 	vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off,
619 	    UIO_SYSSPACE, 0, 0, 0, &resid);
620 
621 	return (0);
622 }
623 
624 void
625 kobj_close_file(struct _buf *file)
626 {
627 	vn_close((vnode_t *)file->_fd);
628 	umem_free(file, sizeof (struct _buf));
629 }
630 
631 int
632 kobj_fstat(intptr_t fd, struct bootstat *bst)
633 {
634 	struct stat64 st;
635 	vnode_t *vp = (vnode_t *)fd;
636 	if (fstat64(vp->v_fd, &st) == -1) {
637 		vn_close(vp);
638 		return (errno);
639 	}
640 	bst->st_size = (uint64_t)st.st_size;
641 	return (0);
642 }
643 
644 /*
645  * =========================================================================
646  * misc routines
647  * =========================================================================
648  */
649 
650 void
651 delay(clock_t ticks)
652 {
653 	poll(0, 0, ticks * (1000 / hz));
654 }
655 
/*
 * Find highest one bit set.
 *	Returns bit number + 1 of highest bit that is set, otherwise returns 0.
 * High order bit is 31 (or 63 in _LP64 kernel).
 *
 * Binary search: at each step, if anything remains above the lower half of
 * the current window, account for that half and shift it away.
 */
int
highbit(ulong_t i)
{
	int bit = 1;

	if (i == 0)
		return (0);
#ifdef _LP64
	if ((i >> 32) != 0) {
		bit += 32;
		i >>= 32;
	}
#endif
	if ((i >> 16) != 0) {
		bit += 16;
		i >>= 16;
	}
	if ((i >> 8) != 0) {
		bit += 8;
		i >>= 8;
	}
	if ((i >> 4) != 0) {
		bit += 4;
		i >>= 4;
	}
	if ((i >> 2) != 0) {
		bit += 2;
		i >>= 2;
	}
	if ((i >> 1) != 0)
		bit += 1;

	return (bit);
}
690 
/*
 * Fill ptr[0..len) with bytes read from the device 'devname'.
 *
 * Failure to open the device or to read from it is fatal in all builds;
 * reads interrupted by a signal are retried.  Always returns 0.
 */
static int
random_get_bytes_common(uint8_t *ptr, size_t len, char *devname)
{
	int fd = open(devname, O_RDONLY);
	size_t resid = len;
	ssize_t bytes;

	/*
	 * VERIFY rather than ASSERT: continuing with a bad descriptor would
	 * make every read() return -1, walking ptr backwards and growing
	 * resid without bound.
	 */
	VERIFY(fd != -1);

	while (resid != 0) {
		bytes = read(fd, ptr, resid);
		if (bytes == -1 && errno == EINTR)
			continue;
		/* A zero-length read could never make progress either. */
		VERIFY(bytes > 0);
		ptr += bytes;
		resid -= bytes;
	}

	(void) close(fd);

	return (0);
}
711 
/* Fill ptr[0..len) with bytes read from /dev/random. */
int
random_get_bytes(uint8_t *ptr, size_t len)
{
	int rc = random_get_bytes_common(ptr, len, "/dev/random");

	return (rc);
}
717 
/* Fill ptr[0..len) with bytes read from /dev/urandom. */
int
random_get_pseudo_bytes(uint8_t *ptr, size_t len)
{
	int rc = random_get_bytes_common(ptr, len, "/dev/urandom");

	return (rc);
}
723 
724 /*
725  * =========================================================================
726  * kernel emulation setup & teardown
727  * =========================================================================
728  */
/*
 * Callback registered with umem_nofail_callback(): report the condition on
 * stderr with a raw write() and abort to produce a core dump.
 */
static int
umem_out_of_memory(void)
{
	char errmsg[] = "out of memory -- generating core dump\n";

	/* sizeof counts the terminating NUL; don't send that to stderr. */
	(void) write(fileno(stderr), errmsg, sizeof (errmsg) - 1);
	abort();
	return (0);
}
738 
739 void
740 kernel_init(int mode)
741 {
742 	umem_nofail_callback(umem_out_of_memory);
743 
744 	physmem = sysconf(_SC_PHYS_PAGES);
745 
746 	dprintf("physmem = %llu pages (%.2f GB)\n", physmem,
747 	    (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));
748 
749 	spa_init(mode);
750 }
751 
/* Tear down the userland kernel emulation by shutting down the SPA. */
void
kernel_fini(void)
{
	spa_fini();
}
757