xref: /titanic_44/usr/src/lib/libzpool/common/kernel.c (revision 1979231e1e29c981e5d1e6cee60f2db46d052b00)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <assert.h>
29 #include <sys/zfs_context.h>
30 #include <poll.h>
31 #include <string.h>
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <fcntl.h>
35 #include <sys/stat.h>
36 #include <sys/spa.h>
37 #include <sys/processor.h>
38 
39 /*
40  * Emulation of kernel services in userland.
41  */
42 
43 uint64_t physmem;
44 vnode_t *rootdir = (vnode_t *)0xabcd1234;
45 
46 /*
47  * =========================================================================
48  * threads
49  * =========================================================================
50  */
51 /*ARGSUSED*/
52 kthread_t *
53 zk_thread_create(void (*func)(), void *arg)
54 {
55 	thread_t tid;
56 
57 	VERIFY(thr_create(0, 0, (void *(*)(void *))func, arg, THR_DETACHED,
58 	    &tid) == 0);
59 
60 	return ((void *)(uintptr_t)tid);
61 }
62 
63 /*
64  * =========================================================================
65  * mutexes
66  * =========================================================================
67  */
68 void
69 zmutex_init(kmutex_t *mp)
70 {
71 	mp->m_owner = NULL;
72 	(void) _mutex_init(&mp->m_lock, USYNC_THREAD, NULL);
73 }
74 
75 void
76 zmutex_destroy(kmutex_t *mp)
77 {
78 	ASSERT(mp->m_owner == NULL);
79 	(void) _mutex_destroy(&(mp)->m_lock);
80 	mp->m_owner = (void *)-1UL;
81 }
82 
83 void
84 mutex_enter(kmutex_t *mp)
85 {
86 	ASSERT(mp->m_owner != (void *)-1UL);
87 	ASSERT(mp->m_owner != curthread);
88 	(void) mutex_lock(&mp->m_lock);
89 	ASSERT(mp->m_owner == NULL);
90 	mp->m_owner = curthread;
91 }
92 
93 int
94 mutex_tryenter(kmutex_t *mp)
95 {
96 	ASSERT(mp->m_owner != (void *)-1UL);
97 	if (0 == mutex_trylock(&mp->m_lock)) {
98 		ASSERT(mp->m_owner == NULL);
99 		mp->m_owner = curthread;
100 		return (1);
101 	} else {
102 		return (0);
103 	}
104 }
105 
106 void
107 mutex_exit(kmutex_t *mp)
108 {
109 	ASSERT(mutex_owner(mp) == curthread);
110 	mp->m_owner = NULL;
111 	(void) mutex_unlock(&mp->m_lock);
112 }
113 
114 void *
115 mutex_owner(kmutex_t *mp)
116 {
117 	return (mp->m_owner);
118 }
119 
120 /*
121  * =========================================================================
122  * rwlocks
123  * =========================================================================
124  */
125 /*ARGSUSED*/
126 void
127 rw_init(krwlock_t *rwlp, char *name, int type, void *arg)
128 {
129 	rwlock_init(&rwlp->rw_lock, USYNC_THREAD, NULL);
130 	rwlp->rw_owner = NULL;
131 }
132 
133 void
134 rw_destroy(krwlock_t *rwlp)
135 {
136 	rwlock_destroy(&rwlp->rw_lock);
137 	rwlp->rw_owner = (void *)-1UL;
138 }
139 
140 void
141 rw_enter(krwlock_t *rwlp, krw_t rw)
142 {
143 	ASSERT(!RW_LOCK_HELD(rwlp));
144 	ASSERT(rwlp->rw_owner != (void *)-1UL);
145 	ASSERT(rwlp->rw_owner != curthread);
146 
147 	if (rw == RW_READER)
148 		(void) rw_rdlock(&rwlp->rw_lock);
149 	else
150 		(void) rw_wrlock(&rwlp->rw_lock);
151 
152 	rwlp->rw_owner = curthread;
153 }
154 
155 void
156 rw_exit(krwlock_t *rwlp)
157 {
158 	ASSERT(rwlp->rw_owner != (void *)-1UL);
159 
160 	rwlp->rw_owner = NULL;
161 	(void) rw_unlock(&rwlp->rw_lock);
162 }
163 
164 int
165 rw_tryenter(krwlock_t *rwlp, krw_t rw)
166 {
167 	int rv;
168 
169 	ASSERT(rwlp->rw_owner != (void *)-1UL);
170 
171 	if (rw == RW_READER)
172 		rv = rw_tryrdlock(&rwlp->rw_lock);
173 	else
174 		rv = rw_trywrlock(&rwlp->rw_lock);
175 
176 	if (rv == 0) {
177 		rwlp->rw_owner = curthread;
178 		return (1);
179 	}
180 
181 	return (0);
182 }
183 
184 /*ARGSUSED*/
185 int
186 rw_tryupgrade(krwlock_t *rwlp)
187 {
188 	ASSERT(rwlp->rw_owner != (void *)-1UL);
189 
190 	return (0);
191 }
192 
193 /*
194  * =========================================================================
195  * condition variables
196  * =========================================================================
197  */
198 /*ARGSUSED*/
199 void
200 cv_init(kcondvar_t *cv, char *name, int type, void *arg)
201 {
202 	(void) cond_init(cv, type, NULL);
203 }
204 
205 void
206 cv_destroy(kcondvar_t *cv)
207 {
208 	(void) cond_destroy(cv);
209 }
210 
211 void
212 cv_wait(kcondvar_t *cv, kmutex_t *mp)
213 {
214 	ASSERT(mutex_owner(mp) == curthread);
215 	mp->m_owner = NULL;
216 	(void) cond_wait(cv, &mp->m_lock);
217 	mp->m_owner = curthread;
218 }
219 
220 clock_t
221 cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
222 {
223 	int error;
224 	timestruc_t ts;
225 	clock_t delta;
226 
227 top:
228 	delta = abstime - lbolt;
229 	if (delta <= 0)
230 		return (-1);
231 
232 	ts.tv_sec = delta / hz;
233 	ts.tv_nsec = (delta % hz) * (NANOSEC / hz);
234 
235 	ASSERT(mutex_owner(mp) == curthread);
236 	mp->m_owner = NULL;
237 	error = cond_reltimedwait(cv, &mp->m_lock, &ts);
238 	mp->m_owner = curthread;
239 
240 	if (error == ETIME)
241 		return (-1);
242 
243 	if (error == EINTR)
244 		goto top;
245 
246 	ASSERT(error == 0);
247 
248 	return (1);
249 }
250 
251 void
252 cv_signal(kcondvar_t *cv)
253 {
254 	(void) cond_signal(cv);
255 }
256 
257 void
258 cv_broadcast(kcondvar_t *cv)
259 {
260 	(void) cond_broadcast(cv);
261 }
262 
263 /*
264  * =========================================================================
265  * vnode operations
266  * =========================================================================
267  */
268 /*
269  * Note: for the xxxat() versions of these functions, we assume that the
270  * starting vp is always rootdir (which is true for spa_directory.c, the only
271  * ZFS consumer of these interfaces).  We assert this is true, and then emulate
272  * them by adding '/' in front of the path.
273  */
274 
275 /*ARGSUSED*/
276 int
277 vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3)
278 {
279 	int fd;
280 	vnode_t *vp;
281 	int old_umask;
282 	char realpath[MAXPATHLEN];
283 	struct stat64 st;
284 
285 	/*
286 	 * If we're accessing a real disk from userland, we need to use
287 	 * the character interface to avoid caching.  This is particularly
288 	 * important if we're trying to look at a real in-kernel storage
289 	 * pool from userland, e.g. via zdb, because otherwise we won't
290 	 * see the changes occurring under the segmap cache.
291 	 * On the other hand, the stupid character device returns zero
292 	 * for its size.  So -- gag -- we open the block device to get
293 	 * its size, and remember it for subsequent VOP_GETATTR().
294 	 */
295 	if (strncmp(path, "/dev/", 5) == 0) {
296 		char *dsk;
297 		fd = open64(path, O_RDONLY);
298 		if (fd == -1)
299 			return (errno);
300 		if (fstat64(fd, &st) == -1) {
301 			close(fd);
302 			return (errno);
303 		}
304 		close(fd);
305 		(void) sprintf(realpath, "%s", path);
306 		dsk = strstr(path, "/dsk/");
307 		if (dsk != NULL)
308 			(void) sprintf(realpath + (dsk - path) + 1, "r%s",
309 			    dsk + 1);
310 	} else {
311 		(void) sprintf(realpath, "%s", path);
312 		if (!(flags & FCREAT) && stat64(realpath, &st) == -1)
313 			return (errno);
314 	}
315 
316 	if (flags & FCREAT)
317 		old_umask = umask(0);
318 
319 	/*
320 	 * The construct 'flags - FREAD' conveniently maps combinations of
321 	 * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR.
322 	 */
323 	fd = open64(realpath, flags - FREAD, mode);
324 
325 	if (flags & FCREAT)
326 		(void) umask(old_umask);
327 
328 	if (fd == -1)
329 		return (errno);
330 
331 	if (fstat64(fd, &st) == -1) {
332 		close(fd);
333 		return (errno);
334 	}
335 
336 	(void) fcntl(fd, F_SETFD, FD_CLOEXEC);
337 
338 	*vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL);
339 
340 	vp->v_fd = fd;
341 	vp->v_size = st.st_size;
342 	vp->v_path = spa_strdup(path);
343 
344 	return (0);
345 }
346 
347 int
348 vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2,
349     int x3, vnode_t *startvp)
350 {
351 	char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL);
352 	int ret;
353 
354 	ASSERT(startvp == rootdir);
355 	(void) sprintf(realpath, "/%s", path);
356 
357 	ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3);
358 
359 	umem_free(realpath, strlen(path) + 2);
360 
361 	return (ret);
362 }
363 
364 /*ARGSUSED*/
365 int
366 vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset,
367 	int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp)
368 {
369 	ssize_t iolen, split;
370 
371 	if (uio == UIO_READ) {
372 		iolen = pread64(vp->v_fd, addr, len, offset);
373 	} else {
374 		/*
375 		 * To simulate partial disk writes, we split writes into two
376 		 * system calls so that the process can be killed in between.
377 		 */
378 		split = (len > 0 ? rand() % len : 0);
379 		iolen = pwrite64(vp->v_fd, addr, split, offset);
380 		iolen += pwrite64(vp->v_fd, (char *)addr + split,
381 		    len - split, offset + split);
382 	}
383 
384 	if (iolen == -1)
385 		return (errno);
386 	if (residp)
387 		*residp = len - iolen;
388 	else if (iolen != len)
389 		return (EIO);
390 	return (0);
391 }
392 
393 void
394 vn_close(vnode_t *vp)
395 {
396 	close(vp->v_fd);
397 	spa_strfree(vp->v_path);
398 	umem_free(vp, sizeof (vnode_t));
399 }
400 
401 #ifdef ZFS_DEBUG
402 
403 /*
404  * =========================================================================
405  * Figure out which debugging statements to print
406  * =========================================================================
407  */
408 
/* Comma-separated list of debug selectors; NULL when none was given. */
static char *dprintf_string;
/* Non-zero once dprintf_setup() has found "on" in the selector list. */
static int dprintf_print_all;

/*
 * Return 1 if 'string' is a complete element of the comma-separated
 * selector list in dprintf_string, 0 otherwise (including when no list
 * has been set).
 */
int
dprintf_find_string(const char *string)
{
	const char *tmp_str = dprintf_string;
	size_t len = strlen(string);

	/*
	 * Find out if this is a string we want to print.
	 * String format: file1.c,function_name1,file2.c,file3.c
	 */

	while (tmp_str != NULL) {
		/* Only a whole element counts, not a prefix of one. */
		if (strncmp(tmp_str, string, len) == 0 &&
		    (tmp_str[len] == ',' || tmp_str[len] == '\0'))
			return (1);
		tmp_str = strchr(tmp_str, ',');
		if (tmp_str != NULL)
			tmp_str++; /* Get rid of , */
	}
	return (0);
}

/*
 * Pick up the debug selector list and strip it from the command line.
 *
 * Every "debug=..." argument in argv[1 .. *argc - 1] is removed, *argc is
 * decremented accordingly and argv[*argc] is left NULL.  If several are
 * present the last one wins.  The selector string points into the
 * original argument storage; nothing is copied.
 */
void
dprintf_setup(int *argc, char **argv)
{
	const size_t len = strlen("debug=");
	int i, j;

	/*
	 * Debugging can be specified two ways: by setting the
	 * environment variable ZFS_DEBUG, or by including a
	 * "debug=..."  argument on the command line.  The command
	 * line setting overrides the environment variable.
	 */

	i = 1;
	while (i < *argc) {
		/* First look for a command line argument */
		if (strncmp("debug=", argv[i], len) != 0) {
			i++;
			continue;
		}

		dprintf_string = argv[i] + len;

		/*
		 * Remove from args.  i is deliberately not advanced
		 * afterwards: the argument that slides into slot i still
		 * has to be examined.
		 */
		for (j = i; j < *argc - 1; j++)
			argv[j] = argv[j + 1];
		(*argc)--;
		argv[*argc] = NULL;
	}

	if (dprintf_string == NULL) {
		/* Look for ZFS_DEBUG environment variable */
		dprintf_string = getenv("ZFS_DEBUG");
	}

	/*
	 * Are we just turning on all debugging?
	 */
	if (dprintf_find_string("on"))
		dprintf_print_all = 1;
}
470 
471 /*
472  * =========================================================================
473  * debug printfs
474  * =========================================================================
475  */
476 void
477 __dprintf(const char *file, const char *func, int line, const char *fmt, ...)
478 {
479 	const char *newfile;
480 	va_list adx;
481 
482 	/*
483 	 * Get rid of annoying "../common/" prefix to filename.
484 	 */
485 	newfile = strrchr(file, '/');
486 	if (newfile != NULL) {
487 		newfile = newfile + 1; /* Get rid of leading / */
488 	} else {
489 		newfile = file;
490 	}
491 
492 	if (dprintf_print_all ||
493 	    dprintf_find_string(newfile) ||
494 	    dprintf_find_string(func)) {
495 		/* Print out just the function name if requested */
496 		flockfile(stdout);
497 		if (dprintf_find_string("pid"))
498 			(void) printf("%d ", getpid());
499 		if (dprintf_find_string("tid"))
500 			(void) printf("%u ", thr_self());
501 		if (dprintf_find_string("cpu"))
502 			(void) printf("%u ", getcpuid());
503 		if (dprintf_find_string("time"))
504 			(void) printf("%llu ", gethrtime());
505 		if (dprintf_find_string("long"))
506 			(void) printf("%s, line %d: ", newfile, line);
507 		(void) printf("%s: ", func);
508 		va_start(adx, fmt);
509 		(void) vprintf(fmt, adx);
510 		va_end(adx);
511 		funlockfile(stdout);
512 	}
513 }
514 
515 #endif /* ZFS_DEBUG */
516 
517 /*
518  * =========================================================================
519  * cmn_err() and panic()
520  * =========================================================================
521  */
522 static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" };
523 static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" };
524 
/*
 * Print "error: " followed by the formatted message and a newline on
 * stderr, then abort() the process.  Does not return.
 */
void
vpanic(const char *fmt, va_list adx)
{
	(void) fputs("error: ", stderr);
	(void) vfprintf(stderr, fmt, adx);
	(void) fputc('\n', stderr);

	abort();	/* think of it as a "user-level crash dump" */
}
534 
/*
 * Variadic front end to vpanic(): formats the message and aborts.
 */
void
panic(const char *fmt, ...)
{
	va_list adx;

	va_start(adx, fmt);
	vpanic(fmt, adx);
	/* vpanic() ends in abort(), so control does not get back here. */
	va_end(adx);
}
544 
545 /*PRINTFLIKE2*/
546 void
547 cmn_err(int ce, const char *fmt, ...)
548 {
549 	va_list adx;
550 
551 	va_start(adx, fmt);
552 	if (ce == CE_PANIC)
553 		vpanic(fmt, adx);
554 	if (ce != CE_NOTE) {	/* suppress noise in userland stress testing */
555 		(void) fprintf(stderr, "%s", ce_prefix[ce]);
556 		(void) vfprintf(stderr, fmt, adx);
557 		(void) fprintf(stderr, "%s", ce_suffix[ce]);
558 	}
559 	va_end(adx);
560 }
561 
562 /*
563  * =========================================================================
564  * kobj interfaces
565  * =========================================================================
566  */
567 struct _buf *
568 kobj_open_file(char *name)
569 {
570 	struct _buf *file;
571 	vnode_t *vp;
572 
573 	/* set vp as the _fd field of the file */
574 	if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir) != 0)
575 		return ((void *)-1UL);
576 
577 	file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL);
578 	file->_fd = (intptr_t)vp;
579 	return (file);
580 }
581 
582 int
583 kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off)
584 {
585 	ssize_t resid;
586 
587 	vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off,
588 	    UIO_SYSSPACE, 0, 0, 0, &resid);
589 
590 	return (0);
591 }
592 
593 void
594 kobj_close_file(struct _buf *file)
595 {
596 	vn_close((vnode_t *)file->_fd);
597 	umem_free(file, sizeof (struct _buf));
598 }
599 
600 int
601 kobj_fstat(intptr_t fd, struct bootstat *bst)
602 {
603 	struct stat64 st;
604 	vnode_t *vp = (vnode_t *)fd;
605 	if (fstat64(vp->v_fd, &st) == -1) {
606 		vn_close(vp);
607 		return (errno);
608 	}
609 	bst->st_size = (uint64_t)st.st_size;
610 	return (0);
611 }
612 
613 /*
614  * =========================================================================
615  * misc routines
616  * =========================================================================
617  */
618 
619 void
620 delay(clock_t ticks)
621 {
622 	poll(0, 0, ticks * (1000 / hz));
623 }
624 
625 /*
626  * Find highest one bit set.
627  *	Returns bit number + 1 of highest bit that is set, otherwise returns 0.
628  * High order bit is 31 (or 63 in _LP64 kernel).
629  */
int
highbit(ulong_t i)
{
	/*
	 * Binary search for the top set bit.  Each mask selects the upper
	 * half of the window still under consideration; the matching shift
	 * is half that window's width and halves at every step.
	 */
	static const ulong_t upper_half[] = {
#ifdef _LP64
		0xffffffff00000000ul,
#endif
		0xffff0000, 0xff00, 0xf0, 0xc, 0x2
	};
	const int nsteps = sizeof (upper_half) / sizeof (upper_half[0]);
	int shift = 1 << (nsteps - 1);
	int pos = 1;
	int k;

	if (i == 0)
		return (0);

	for (k = 0; k < nsteps; k++, shift >>= 1) {
		if (i & upper_half[k]) {
			pos += shift;
			i >>= shift;
		}
	}
	return (pos);
}
659 
/*
 * Fill ptr[0 .. len - 1] with bytes read from the device 'devname'.
 *
 * Always returns 0.  Failure to open the device, a read error other than
 * EINTR, or an unexpected end-of-file is fatal (VERIFY), so the checks
 * stay active in non-debug builds; otherwise a failed read would move
 * ptr backwards and grow resid without bound.
 */
static int
random_get_bytes_common(uint8_t *ptr, size_t len, char *devname)
{
	int fd = open(devname, O_RDONLY);
	size_t resid = len;
	ssize_t bytes;

	VERIFY(fd != -1);

	while (resid != 0) {
		bytes = read(fd, ptr, resid);
		if (bytes == -1 && errno == EINTR)
			continue;	/* interrupted: just retry */
		VERIFY(bytes > 0);
		ptr += bytes;
		resid -= bytes;
	}

	(void) close(fd);

	return (0);
}
680 
/*
 * Fill ptr[0 .. len - 1] with bytes read from /dev/random.
 * Returns the result of random_get_bytes_common().
 */
int
random_get_bytes(uint8_t *ptr, size_t len)
{
	int rv;

	rv = random_get_bytes_common(ptr, len, "/dev/random");
	return (rv);
}
686 
/*
 * Fill ptr[0 .. len - 1] with bytes read from /dev/urandom.
 * Returns the result of random_get_bytes_common().
 */
int
random_get_pseudo_bytes(uint8_t *ptr, size_t len)
{
	int rv;

	rv = random_get_bytes_common(ptr, len, "/dev/urandom");
	return (rv);
}
692 
693 /*
694  * =========================================================================
695  * kernel emulation setup & teardown
696  * =========================================================================
697  */
/*
 * Callback registered with umem_nofail_callback() in kernel_init():
 * report the failure on stderr with a raw write() and abort().
 */
static int
umem_out_of_memory(void)
{
	char errmsg[] = "out of memory -- generating core dump\n";

	/* sizeof counts the terminating NUL; do not send that to stderr. */
	(void) write(fileno(stderr), errmsg, sizeof (errmsg) - 1);
	abort();
	return (0);	/* not reached; satisfies the int return type */
}
707 
708 void
709 kernel_init(int mode)
710 {
711 	umem_nofail_callback(umem_out_of_memory);
712 
713 	physmem = sysconf(_SC_PHYS_PAGES);
714 
715 	dprintf("physmem = %llu pages (%.2f GB)\n", physmem,
716 	    (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));
717 
718 	spa_init(mode);
719 }
720 
/*
 * Shut down the userland kernel emulation by calling spa_fini().
 */
void
kernel_fini(void)
{
	spa_fini();
}
726