xref: /illumos-gate/usr/src/lib/libzpool/common/kernel.c (revision 2833423dc59f4c35fe4713dbb942950c82df0437)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24  * Copyright 2020 Joyent, Inc.
25  * Copyright 2017 RackTop Systems.
26  */
27 
28 #include <assert.h>
29 #include <fcntl.h>
30 #include <poll.h>
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <zlib.h>
35 #include <libgen.h>
36 #include <sys/spa.h>
37 #include <sys/stat.h>
38 #include <sys/processor.h>
39 #include <sys/zfs_context.h>
40 #include <zfs_fletcher.h>
41 #include <sys/rrwlock.h>
42 #include <sys/zmod.h>
43 #include <sys/utsname.h>
44 #include <sys/systeminfo.h>
45 #include <libzutil.h>
46 #include <sys/crypto/common.h>
47 #include <sys/crypto/impl.h>
48 #include <sys/crypto/api.h>
49 #include <sys/sha2.h>
50 #include <crypto/aes/aes_impl.h>
51 
52 extern void system_taskq_init(void);
53 extern void system_taskq_fini(void);
54 
55 /*
56  * Emulation of kernel services in userland.
57  */
58 
/* Page count; assigned in kernel_init() from sysconf(_SC_PHYS_PAGES). */
pgcnt_t physmem;
/*
 * Stand-in root vnode.  Within this file the value is only compared against
 * (see the ASSERT in vn_openat()) and passed along as the starting vnode.
 */
vnode_t *rootdir = (vnode_t *)0xabcd1234;
/*
 * Host ID as a decimal string; written by kernel_init() and parsed by
 * zone_get_hostid().
 */
char hw_serial[HW_HOSTID_LEN];
/* Initialized by kernel_init(). */
kmutex_t cpu_lock;
vmem_t *zio_arena = NULL;

/* If set, all blocks read will be copied to the specified directory. */
char *vn_dumpdir = NULL;

/* Fixed identification strings reported for the userland environment. */
struct utsname utsname = {
	"userland", "libzpool", "1", "1", "na"
};
71 
72 /*
73  * =========================================================================
74  * vnode operations
75  * =========================================================================
76  */
77 /*
78  * Note: for the xxxat() versions of these functions, we assume that the
79  * starting vp is always rootdir (which is true for spa_directory.c, the only
80  * ZFS consumer of these interfaces).  We assert this is true, and then emulate
81  * them by adding '/' in front of the path.
82  */
83 
84 /*ARGSUSED*/
85 int
86 vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3)
87 {
88 	int fd;
89 	int dump_fd;
90 	vnode_t *vp;
91 	int old_umask;
92 	char realpath[MAXPATHLEN];
93 	struct stat64 st;
94 
95 	/*
96 	 * If we're accessing a real disk from userland, we need to use
97 	 * the character interface to avoid caching.  This is particularly
98 	 * important if we're trying to look at a real in-kernel storage
99 	 * pool from userland, e.g. via zdb, because otherwise we won't
100 	 * see the changes occurring under the segmap cache.
101 	 * On the other hand, the stupid character device returns zero
102 	 * for its size.  So -- gag -- we open the block device to get
103 	 * its size, and remember it for subsequent VOP_GETATTR().
104 	 */
105 	if (strncmp(path, "/dev/", 5) == 0) {
106 		char *dsk;
107 		fd = open64(path, O_RDONLY);
108 		if (fd == -1)
109 			return (errno);
110 		if (fstat64(fd, &st) == -1) {
111 			close(fd);
112 			return (errno);
113 		}
114 		close(fd);
115 		(void) sprintf(realpath, "%s", path);
116 		dsk = strstr(path, "/dsk/");
117 		if (dsk != NULL)
118 			(void) sprintf(realpath + (dsk - path) + 1, "r%s",
119 			    dsk + 1);
120 	} else {
121 		(void) sprintf(realpath, "%s", path);
122 		if (!(flags & FCREAT) && stat64(realpath, &st) == -1)
123 			return (errno);
124 	}
125 
126 	if (flags & FCREAT)
127 		old_umask = umask(0);
128 
129 	/*
130 	 * The construct 'flags - FREAD' conveniently maps combinations of
131 	 * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR.
132 	 */
133 	fd = open64(realpath, flags - FREAD, mode);
134 
135 	if (flags & FCREAT)
136 		(void) umask(old_umask);
137 
138 	if (vn_dumpdir != NULL) {
139 		char dumppath[MAXPATHLEN];
140 		(void) snprintf(dumppath, sizeof (dumppath),
141 		    "%s/%s", vn_dumpdir, basename(realpath));
142 		dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666);
143 		if (dump_fd == -1)
144 			return (errno);
145 	} else {
146 		dump_fd = -1;
147 	}
148 
149 	if (fd == -1)
150 		return (errno);
151 
152 	if (fstat64(fd, &st) == -1) {
153 		close(fd);
154 		return (errno);
155 	}
156 
157 	(void) fcntl(fd, F_SETFD, FD_CLOEXEC);
158 
159 	*vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL);
160 
161 	vp->v_fd = fd;
162 	vp->v_size = st.st_size;
163 	vp->v_path = spa_strdup(path);
164 	vp->v_dump_fd = dump_fd;
165 
166 	return (0);
167 }
168 
169 /*ARGSUSED*/
170 int
171 vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2,
172     int x3, vnode_t *startvp, int fd)
173 {
174 	char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL);
175 	int ret;
176 
177 	ASSERT(startvp == rootdir);
178 	(void) sprintf(realpath, "/%s", path);
179 
180 	/* fd ignored for now, need if want to simulate nbmand support */
181 	ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3);
182 
183 	umem_free(realpath, strlen(path) + 2);
184 
185 	return (ret);
186 }
187 
188 /*ARGSUSED*/
189 int
190 vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset,
191     int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp)
192 {
193 	ssize_t iolen, split;
194 
195 	if (uio == UIO_READ) {
196 		iolen = pread64(vp->v_fd, addr, len, offset);
197 		if (vp->v_dump_fd != -1) {
198 			int status =
199 			    pwrite64(vp->v_dump_fd, addr, iolen, offset);
200 			ASSERT(status != -1);
201 		}
202 	} else {
203 		/*
204 		 * To simulate partial disk writes, we split writes into two
205 		 * system calls so that the process can be killed in between.
206 		 */
207 		int sectors = len >> SPA_MINBLOCKSHIFT;
208 		split = (sectors > 0 ? rand() % sectors : 0) <<
209 		    SPA_MINBLOCKSHIFT;
210 		iolen = pwrite64(vp->v_fd, addr, split, offset);
211 		iolen += pwrite64(vp->v_fd, (char *)addr + split,
212 		    len - split, offset + split);
213 	}
214 
215 	if (iolen == -1)
216 		return (errno);
217 	if (residp)
218 		*residp = len - iolen;
219 	else if (iolen != len)
220 		return (EIO);
221 	return (0);
222 }
223 
224 void
225 vn_close(vnode_t *vp)
226 {
227 	close(vp->v_fd);
228 	if (vp->v_dump_fd != -1)
229 		close(vp->v_dump_fd);
230 	spa_strfree(vp->v_path);
231 	umem_free(vp, sizeof (vnode_t));
232 }
233 
234 /*
235  * At a minimum we need to update the size since vdev_reopen()
236  * will no longer call vn_openat().
237  */
238 int
239 fop_getattr(vnode_t *vp, vattr_t *vap)
240 {
241 	struct stat64 st;
242 
243 	if (fstat64(vp->v_fd, &st) == -1) {
244 		close(vp->v_fd);
245 		return (errno);
246 	}
247 
248 	vap->va_size = st.st_size;
249 	return (0);
250 }
251 
252 #ifdef ZFS_DEBUG
253 
254 /*
255  * =========================================================================
256  * Figure out which debugging statements to print
257  * =========================================================================
258  */
259 
/* Comma-separated list of debug selectors, or NULL when none was given. */
static char *dprintf_string;
/* Non-zero when every debug statement should be printed. */
static int dprintf_print_all;

/*
 * Report whether 'string' is one of the entries in dprintf_string.
 * List format: file1.c,function_name1,file2.c,file3.c
 *
 * An entry matches only when it equals 'string' in full, i.e. the text that
 * follows the match is a ',' or the end of the list.
 * Returns 1 on a match and 0 otherwise (including when the list is NULL).
 */
int
dprintf_find_string(const char *string)
{
	int len = strlen(string);
	char *entry;

	for (entry = dprintf_string; entry != NULL; ) {
		char *comma;

		if (strncmp(entry, string, len) == 0 &&
		    (entry[len] == ',' || entry[len] == '\0'))
			return (1);

		/* Step to the entry after the next comma, if there is one. */
		comma = strchr(entry, ',');
		entry = (comma != NULL) ? comma + 1 : NULL;
	}
	return (0);
}
284 
285 void
286 dprintf_setup(int *argc, char **argv)
287 {
288 	int i, j;
289 
290 	/*
291 	 * Debugging can be specified two ways: by setting the
292 	 * environment variable ZFS_DEBUG, or by including a
293 	 * "debug=..."  argument on the command line.  The command
294 	 * line setting overrides the environment variable.
295 	 */
296 
297 	for (i = 1; i < *argc; i++) {
298 		int len = strlen("debug=");
299 		/* First look for a command line argument */
300 		if (strncmp("debug=", argv[i], len) == 0) {
301 			dprintf_string = argv[i] + len;
302 			/* Remove from args */
303 			for (j = i; j < *argc; j++)
304 				argv[j] = argv[j+1];
305 			argv[j] = NULL;
306 			(*argc)--;
307 		}
308 	}
309 
310 	if (dprintf_string == NULL) {
311 		/* Look for ZFS_DEBUG environment variable */
312 		dprintf_string = getenv("ZFS_DEBUG");
313 	}
314 
315 	/*
316 	 * Are we just turning on all debugging?
317 	 */
318 	if (dprintf_find_string("on"))
319 		dprintf_print_all = 1;
320 
321 	if (dprintf_string != NULL)
322 		zfs_flags |= ZFS_DEBUG_DPRINTF;
323 }
324 
325 /*
326  * =========================================================================
327  * debug printfs
328  * =========================================================================
329  */
330 void
331 __dprintf(const char *file, const char *func, int line, const char *fmt, ...)
332 {
333 	const char *newfile;
334 	va_list adx;
335 
336 	/*
337 	 * Get rid of annoying "../common/" prefix to filename.
338 	 */
339 	newfile = strrchr(file, '/');
340 	if (newfile != NULL) {
341 		newfile = newfile + 1; /* Get rid of leading / */
342 	} else {
343 		newfile = file;
344 	}
345 
346 	if (dprintf_print_all ||
347 	    dprintf_find_string(newfile) ||
348 	    dprintf_find_string(func)) {
349 		/* Print out just the function name if requested */
350 		flockfile(stdout);
351 		if (dprintf_find_string("pid"))
352 			(void) printf("%d ", getpid());
353 		if (dprintf_find_string("tid"))
354 			(void) printf("%u ", thr_self());
355 		if (dprintf_find_string("cpu"))
356 			(void) printf("%u ", getcpuid());
357 		if (dprintf_find_string("time"))
358 			(void) printf("%llu ", gethrtime());
359 		if (dprintf_find_string("long"))
360 			(void) printf("%s, line %d: ", newfile, line);
361 		(void) printf("%s: ", func);
362 		va_start(adx, fmt);
363 		(void) vprintf(fmt, adx);
364 		va_end(adx);
365 		funlockfile(stdout);
366 	}
367 }
368 
369 #endif /* ZFS_DEBUG */
370 
371 /*
372  * =========================================================================
373  * kobj interfaces
374  * =========================================================================
375  */
376 struct _buf *
377 kobj_open_file(char *name)
378 {
379 	struct _buf *file;
380 	vnode_t *vp;
381 
382 	/* set vp as the _fd field of the file */
383 	if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir,
384 	    -1) != 0)
385 		return ((void *)-1UL);
386 
387 	file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL);
388 	file->_fd = (intptr_t)vp;
389 	return (file);
390 }
391 
392 int
393 kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off)
394 {
395 	ssize_t resid;
396 
397 	vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off,
398 	    UIO_SYSSPACE, 0, 0, 0, &resid);
399 
400 	return (size - resid);
401 }
402 
403 void
404 kobj_close_file(struct _buf *file)
405 {
406 	vn_close((vnode_t *)file->_fd);
407 	umem_free(file, sizeof (struct _buf));
408 }
409 
410 int
411 kobj_get_filesize(struct _buf *file, uint64_t *size)
412 {
413 	struct stat64 st;
414 	vnode_t *vp = (vnode_t *)file->_fd;
415 
416 	if (fstat64(vp->v_fd, &st) == -1) {
417 		vn_close(vp);
418 		return (errno);
419 	}
420 	*size = st.st_size;
421 	return (0);
422 }
423 
424 /*
425  * =========================================================================
426  * misc routines
427  * =========================================================================
428  */
429 
/*
 * Find lowest one bit set.
 * Returns bit number + 1 of lowest bit that is set, otherwise returns 0.
 * This is basically a reimplementation of ffsll(), which is GNU specific.
 */
int
lowbit64(uint64_t i)
{
	int bit = 1;
	int width;

	if (i == 0)
		return (0);

	/*
	 * Halve the search window each round (32, 16, 8, 4, 2, 1 bits).
	 * When the low 'width' bits are all clear, the lowest set bit sits
	 * above them: drop those bits and count them.
	 */
	for (width = 32; width > 0; width >>= 1) {
		if ((i & ((1ULL << width) - 1)) == 0) {
			i >>= width;
			bit += width;
		}
	}

	return (bit);
}
472 
/*
 * Find highest one bit set.
 * Returns bit number + 1 of highest bit that is set, otherwise returns 0.
 */
int
highbit64(uint64_t i)
{
	int bit = 1;
	int width;

	if (i == 0)
		return (0);

	/*
	 * Halve the search window each round (32, 16, 8, 4, 2, 1 bits).
	 * When anything is set above the low 'width' bits, the highest set
	 * bit is up there: shift it down and count the bits skipped.
	 */
	for (width = 32; width > 0; width >>= 1) {
		if ((i >> width) != 0) {
			i >>= width;
			bit += width;
		}
	}

	return (bit);
}
500 
501 /*
502  * =========================================================================
503  * kernel emulation setup & teardown
504  * =========================================================================
505  */
/*
 * Callback registered through umem_nofail_callback() in kernel_init().
 * Writes a short message to stderr and aborts the process; it does not
 * return to its caller.
 */
static int
umem_out_of_memory(void)
{
	static const char errmsg[] = "out of memory -- generating core dump\n";

	/* sizeof counts the terminating NUL; do not write that byte. */
	(void) write(fileno(stderr), errmsg, sizeof (errmsg) - 1);
	abort();
	return (0);
}
515 
516 void
517 kernel_init(int mode)
518 {
519 	extern uint_t rrw_tsd_key;
520 
521 	umem_nofail_callback(umem_out_of_memory);
522 
523 	physmem = sysconf(_SC_PHYS_PAGES);
524 
525 	dprintf("physmem = %llu pages (%.2f GB)\n", physmem,
526 	    (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));
527 
528 	(void) snprintf(hw_serial, sizeof (hw_serial), "%ld",
529 	    (mode & FWRITE) ? get_system_hostid() : 0);
530 
531 	system_taskq_init();
532 
533 	mutex_init(&cpu_lock, NULL, MUTEX_DEFAULT, NULL);
534 
535 	spa_init(mode);
536 
537 	fletcher_4_init();
538 
539 	tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
540 }
541 
/*
 * Tear down the emulated kernel environment: finalize the fletcher-4 code,
 * the SPA layer and the system taskqs, in that order.
 */
void
kernel_fini(void)
{
	fletcher_4_fini();

	spa_fini();

	system_taskq_fini();
}
551 
552 /* ARGSUSED */
553 uint32_t
554 zone_get_hostid(void *zonep)
555 {
556 	/*
557 	 * We're emulating the system's hostid in userland.
558 	 */
559 	return (strtoul(hw_serial, NULL, 10));
560 }
561 
562 int
563 z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen)
564 {
565 	int ret;
566 	uLongf len = *dstlen;
567 
568 	if ((ret = uncompress(dst, &len, src, srclen)) == Z_OK)
569 		*dstlen = (size_t)len;
570 
571 	return (ret);
572 }
573 
574 int
575 z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen,
576     int level)
577 {
578 	int ret;
579 	uLongf len = *dstlen;
580 
581 	if ((ret = compress2(dst, &len, src, srclen, level)) == Z_OK)
582 		*dstlen = (size_t)len;
583 
584 	return (ret);
585 }
586 
/*
 * Userland stub: no permission check is made; always returns 0.
 * Both arguments are ignored.
 */
int
zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
{
	return (0);
}
592 
/*
 * Userland stub: no permission check is made; always returns 0.
 * All arguments are ignored.
 */
int
zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
{
	return (0);
}
598 
/*
 * Userland stub: no permission check is made; always returns 0.
 * Both arguments are ignored.
 */
int
zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
{
	return (0);
}
604 
/*
 * Userland stub: ignores 'fd', stores 0 in *minorp and returns 0.
 */
/* ARGSUSED */
int
zfs_onexit_fd_hold(int fd, minor_t *minorp)
{
	*minorp = 0;
	return (0);
}
612 
/*
 * Userland stub: does nothing.
 */
/* ARGSUSED */
void
zfs_onexit_fd_rele(int fd)
{
}
618 
/*
 * Userland stub: no callback is recorded and *action_handle is not written.
 * Always returns 0.
 */
/* ARGSUSED */
int
zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
    uint64_t *action_handle)
{
	return (0);
}
626 
/*
 * Userland stub: nothing is removed or fired.  Always returns 0.
 */
/* ARGSUSED */
int
zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire)
{
	return (0);
}
633 
/*
 * Userland stub: *data is not written.  Always returns 0.
 */
/* ARGSUSED */
int
zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data)
{
	return (0);
}
640 
641 void
642 bioinit(buf_t *bp)
643 {
644 	bzero(bp, sizeof (buf_t));
645 }
646 
647 void
648 biodone(buf_t *bp)
649 {
650 	if (bp->b_iodone != NULL) {
651 		(*(bp->b_iodone))(bp);
652 		return;
653 	}
654 	ASSERT((bp->b_flags & B_DONE) == 0);
655 	bp->b_flags |= B_DONE;
656 }
657 
658 void
659 bioerror(buf_t *bp, int error)
660 {
661 	ASSERT(bp != NULL);
662 	ASSERT(error >= 0);
663 
664 	if (error != 0) {
665 		bp->b_flags |= B_ERROR;
666 	} else {
667 		bp->b_flags &= ~B_ERROR;
668 	}
669 	bp->b_error = error;
670 }
671 
672 
673 int
674 geterror(struct buf *bp)
675 {
676 	int error = 0;
677 
678 	if (bp->b_flags & B_ERROR) {
679 		error = bp->b_error;
680 		if (!error)
681 			error = EIO;
682 	}
683 	return (error);
684 }
685 
/*
 * Userland stub: no template is created and *tmpl is not written.
 * Always returns 0.
 */
int
crypto_create_ctx_template(crypto_mechanism_t *mech,
    crypto_key_t *key, crypto_ctx_template_t *tmpl, int kmflag)
{
	return (0);
}
692 
/*
 * Userland stub: 'name' is ignored and every lookup yields
 * CRYPTO_MECH_INVALID.
 */
crypto_mech_type_t
crypto_mech2id(const char *name)
{
	return (CRYPTO_MECH_INVALID);
}
698 
/*
 * Userland stub: no MAC is computed and 'mac' is not written.
 * Always returns 0.
 */
int
crypto_mac(crypto_mechanism_t *mech, crypto_data_t *data,
    crypto_key_t *key, crypto_ctx_template_t impl,
    crypto_data_t *mac, crypto_call_req_t *cr)
{
	return (0);
}
706 
/*
 * Userland stub: nothing is encrypted and 'ciphertext' is not written.
 * Always returns 0.
 */
int
crypto_encrypt(crypto_mechanism_t *mech, crypto_data_t *plaintext,
    crypto_key_t *key, crypto_ctx_template_t tmpl,
    crypto_data_t *ciphertext, crypto_call_req_t *cr)
{
	return (0);
}
714 
/*
 * Userland stub: nothing is decrypted and neither data argument is written.
 * Always returns 0.
 *
 * NOTE(review): the parameter names are the same as crypto_encrypt()'s; the
 * real interface presumably takes the ciphertext as input and the plaintext
 * as output -- confirm against sys/crypto/api.h.
 */
/* This could probably be a weak reference */
int
crypto_decrypt(crypto_mechanism_t *mech, crypto_data_t *plaintext,
    crypto_key_t *key, crypto_ctx_template_t tmpl,
    crypto_data_t *ciphertext, crypto_call_req_t *cr)
{
	return (0);
}
723 
724 
/*
 * Userland stub: no digest is produced and 'digest' is not written.
 * Always returns 0.
 */
int
crypto_digest_final(crypto_context_t context, crypto_data_t *digest,
    crypto_call_req_t *cr)
{
	return (0);
}
731 
/*
 * Userland stub: 'data' is not consumed.  Always returns 0.
 */
int
crypto_digest_update(crypto_context_t context, crypto_data_t *data,
    crypto_call_req_t *cr)
{
	return (0);
}
738 
/*
 * Userland stub: no context is created and *ctxp is not written.
 * Always returns 0.
 */
int
crypto_digest_init(crypto_mechanism_t *mech, crypto_context_t *ctxp,
    crypto_call_req_t  *crq)
{
	return (0);
}
745 
/*
 * Userland stub: does nothing.
 */
void
crypto_destroy_ctx_template(crypto_ctx_template_t tmpl)
{
}
750 
751 extern int crypto_mac_init(crypto_mechanism_t *mech, crypto_key_t *key,
752 	crypto_ctx_template_t tmpl, crypto_context_t *ctxp,
753     crypto_call_req_t *cr)
754 {
755 	return (0);
756 }
757 
758 extern int crypto_mac_update(crypto_context_t ctx, crypto_data_t *data,
759 	crypto_call_req_t *cr)
760 {
761 	return (0);
762 }
763 
764 extern int crypto_mac_final(crypto_context_t ctx, crypto_data_t *data,
765 	crypto_call_req_t *cr)
766 {
767 	return (0);
768 }
769