xref: /illumos-gate/usr/src/lib/libzpool/common/kernel.c (revision f20211217f12ce291fd518e61065cd273f23e4ea)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24  * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
25  * Copyright 2017 RackTop Systems.
26  */
27 
28 #include <assert.h>
29 #include <fcntl.h>
30 #include <poll.h>
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <zlib.h>
35 #include <libgen.h>
36 #include <sys/spa.h>
37 #include <sys/stat.h>
38 #include <sys/processor.h>
39 #include <sys/zfs_context.h>
40 #include <sys/rrwlock.h>
41 #include <sys/zmod.h>
42 #include <sys/utsname.h>
43 #include <sys/systeminfo.h>
44 
45 extern void system_taskq_init(void);
46 extern void system_taskq_fini(void);
47 
48 /*
49  * Emulation of kernel services in userland.
50  */
51 
52 pgcnt_t physmem;
53 vnode_t *rootdir = (vnode_t *)0xabcd1234;
54 char hw_serial[HW_HOSTID_LEN];
55 kmutex_t cpu_lock;
56 vmem_t *zio_arena = NULL;
57 
58 /* If set, all blocks read will be copied to the specified directory. */
59 char *vn_dumpdir = NULL;
60 
61 struct utsname utsname = {
62 	"userland", "libzpool", "1", "1", "na"
63 };
64 
65 /*
66  * =========================================================================
67  * vnode operations
68  * =========================================================================
69  */
70 /*
71  * Note: for the xxxat() versions of these functions, we assume that the
72  * starting vp is always rootdir (which is true for spa_directory.c, the only
73  * ZFS consumer of these interfaces).  We assert this is true, and then emulate
74  * them by adding '/' in front of the path.
75  */
76 
77 /*ARGSUSED*/
78 int
79 vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3)
80 {
81 	int fd;
82 	int dump_fd;
83 	vnode_t *vp;
84 	int old_umask;
85 	char realpath[MAXPATHLEN];
86 	struct stat64 st;
87 
88 	/*
89 	 * If we're accessing a real disk from userland, we need to use
90 	 * the character interface to avoid caching.  This is particularly
91 	 * important if we're trying to look at a real in-kernel storage
92 	 * pool from userland, e.g. via zdb, because otherwise we won't
93 	 * see the changes occurring under the segmap cache.
94 	 * On the other hand, the stupid character device returns zero
95 	 * for its size.  So -- gag -- we open the block device to get
96 	 * its size, and remember it for subsequent VOP_GETATTR().
97 	 */
98 	if (strncmp(path, "/dev/", 5) == 0) {
99 		char *dsk;
100 		fd = open64(path, O_RDONLY);
101 		if (fd == -1)
102 			return (errno);
103 		if (fstat64(fd, &st) == -1) {
104 			close(fd);
105 			return (errno);
106 		}
107 		close(fd);
108 		(void) sprintf(realpath, "%s", path);
109 		dsk = strstr(path, "/dsk/");
110 		if (dsk != NULL)
111 			(void) sprintf(realpath + (dsk - path) + 1, "r%s",
112 			    dsk + 1);
113 	} else {
114 		(void) sprintf(realpath, "%s", path);
115 		if (!(flags & FCREAT) && stat64(realpath, &st) == -1)
116 			return (errno);
117 	}
118 
119 	if (flags & FCREAT)
120 		old_umask = umask(0);
121 
122 	/*
123 	 * The construct 'flags - FREAD' conveniently maps combinations of
124 	 * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR.
125 	 */
126 	fd = open64(realpath, flags - FREAD, mode);
127 
128 	if (flags & FCREAT)
129 		(void) umask(old_umask);
130 
131 	if (vn_dumpdir != NULL) {
132 		char dumppath[MAXPATHLEN];
133 		(void) snprintf(dumppath, sizeof (dumppath),
134 		    "%s/%s", vn_dumpdir, basename(realpath));
135 		dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666);
136 		if (dump_fd == -1)
137 			return (errno);
138 	} else {
139 		dump_fd = -1;
140 	}
141 
142 	if (fd == -1)
143 		return (errno);
144 
145 	if (fstat64(fd, &st) == -1) {
146 		close(fd);
147 		return (errno);
148 	}
149 
150 	(void) fcntl(fd, F_SETFD, FD_CLOEXEC);
151 
152 	*vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL);
153 
154 	vp->v_fd = fd;
155 	vp->v_size = st.st_size;
156 	vp->v_path = spa_strdup(path);
157 	vp->v_dump_fd = dump_fd;
158 
159 	return (0);
160 }
161 
162 /*ARGSUSED*/
163 int
164 vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2,
165     int x3, vnode_t *startvp, int fd)
166 {
167 	char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL);
168 	int ret;
169 
170 	ASSERT(startvp == rootdir);
171 	(void) sprintf(realpath, "/%s", path);
172 
173 	/* fd ignored for now, need if want to simulate nbmand support */
174 	ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3);
175 
176 	umem_free(realpath, strlen(path) + 2);
177 
178 	return (ret);
179 }
180 
181 /*ARGSUSED*/
182 int
183 vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset,
184     int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp)
185 {
186 	ssize_t iolen, split;
187 
188 	if (uio == UIO_READ) {
189 		iolen = pread64(vp->v_fd, addr, len, offset);
190 		if (vp->v_dump_fd != -1) {
191 			int status =
192 			    pwrite64(vp->v_dump_fd, addr, iolen, offset);
193 			ASSERT(status != -1);
194 		}
195 	} else {
196 		/*
197 		 * To simulate partial disk writes, we split writes into two
198 		 * system calls so that the process can be killed in between.
199 		 */
200 		int sectors = len >> SPA_MINBLOCKSHIFT;
201 		split = (sectors > 0 ? rand() % sectors : 0) <<
202 		    SPA_MINBLOCKSHIFT;
203 		iolen = pwrite64(vp->v_fd, addr, split, offset);
204 		iolen += pwrite64(vp->v_fd, (char *)addr + split,
205 		    len - split, offset + split);
206 	}
207 
208 	if (iolen == -1)
209 		return (errno);
210 	if (residp)
211 		*residp = len - iolen;
212 	else if (iolen != len)
213 		return (EIO);
214 	return (0);
215 }
216 
217 void
218 vn_close(vnode_t *vp)
219 {
220 	close(vp->v_fd);
221 	if (vp->v_dump_fd != -1)
222 		close(vp->v_dump_fd);
223 	spa_strfree(vp->v_path);
224 	umem_free(vp, sizeof (vnode_t));
225 }
226 
227 /*
228  * At a minimum we need to update the size since vdev_reopen()
229  * will no longer call vn_openat().
230  */
231 int
232 fop_getattr(vnode_t *vp, vattr_t *vap)
233 {
234 	struct stat64 st;
235 
236 	if (fstat64(vp->v_fd, &st) == -1) {
237 		close(vp->v_fd);
238 		return (errno);
239 	}
240 
241 	vap->va_size = st.st_size;
242 	return (0);
243 }
244 
245 #ifdef ZFS_DEBUG
246 
247 /*
248  * =========================================================================
249  * Figure out which debugging statements to print
250  * =========================================================================
251  */
252 
/* Comma-separated list of debug selectors; NULL when debugging is off. */
static char *dprintf_string;
/* Non-zero once the selector list contains "on". */
static int dprintf_print_all;

/*
 * Report whether 'string' appears as a complete element of the selector
 * list (format: file1.c,function_name1,file2.c,file3.c).
 *
 * Returns 1 on an exact element match, 0 otherwise (including when no
 * selector list has been set).
 */
int
dprintf_find_string(const char *string)
{
	const char *entry;
	int len = strlen(string);

	for (entry = dprintf_string; entry != NULL; ) {
		if (strncmp(entry, string, len) == 0) {
			/* Only a whole element counts, not a mere prefix. */
			char term = entry[len];

			if (term == ',' || term == '\0')
				return (1);
		}

		/* Step to the element after the next comma, if there is one. */
		entry = strchr(entry, ',');
		if (entry != NULL)
			entry++;
	}

	return (0);
}
277 
278 void
279 dprintf_setup(int *argc, char **argv)
280 {
281 	int i, j;
282 
283 	/*
284 	 * Debugging can be specified two ways: by setting the
285 	 * environment variable ZFS_DEBUG, or by including a
286 	 * "debug=..."  argument on the command line.  The command
287 	 * line setting overrides the environment variable.
288 	 */
289 
290 	for (i = 1; i < *argc; i++) {
291 		int len = strlen("debug=");
292 		/* First look for a command line argument */
293 		if (strncmp("debug=", argv[i], len) == 0) {
294 			dprintf_string = argv[i] + len;
295 			/* Remove from args */
296 			for (j = i; j < *argc; j++)
297 				argv[j] = argv[j+1];
298 			argv[j] = NULL;
299 			(*argc)--;
300 		}
301 	}
302 
303 	if (dprintf_string == NULL) {
304 		/* Look for ZFS_DEBUG environment variable */
305 		dprintf_string = getenv("ZFS_DEBUG");
306 	}
307 
308 	/*
309 	 * Are we just turning on all debugging?
310 	 */
311 	if (dprintf_find_string("on"))
312 		dprintf_print_all = 1;
313 
314 	if (dprintf_string != NULL)
315 		zfs_flags |= ZFS_DEBUG_DPRINTF;
316 }
317 
318 /*
319  * =========================================================================
320  * debug printfs
321  * =========================================================================
322  */
323 void
324 __dprintf(const char *file, const char *func, int line, const char *fmt, ...)
325 {
326 	const char *newfile;
327 	va_list adx;
328 
329 	/*
330 	 * Get rid of annoying "../common/" prefix to filename.
331 	 */
332 	newfile = strrchr(file, '/');
333 	if (newfile != NULL) {
334 		newfile = newfile + 1; /* Get rid of leading / */
335 	} else {
336 		newfile = file;
337 	}
338 
339 	if (dprintf_print_all ||
340 	    dprintf_find_string(newfile) ||
341 	    dprintf_find_string(func)) {
342 		/* Print out just the function name if requested */
343 		flockfile(stdout);
344 		if (dprintf_find_string("pid"))
345 			(void) printf("%d ", getpid());
346 		if (dprintf_find_string("tid"))
347 			(void) printf("%u ", thr_self());
348 		if (dprintf_find_string("cpu"))
349 			(void) printf("%u ", getcpuid());
350 		if (dprintf_find_string("time"))
351 			(void) printf("%llu ", gethrtime());
352 		if (dprintf_find_string("long"))
353 			(void) printf("%s, line %d: ", newfile, line);
354 		(void) printf("%s: ", func);
355 		va_start(adx, fmt);
356 		(void) vprintf(fmt, adx);
357 		va_end(adx);
358 		funlockfile(stdout);
359 	}
360 }
361 
362 #endif /* ZFS_DEBUG */
363 
364 /*
365  * =========================================================================
366  * kobj interfaces
367  * =========================================================================
368  */
369 struct _buf *
370 kobj_open_file(char *name)
371 {
372 	struct _buf *file;
373 	vnode_t *vp;
374 
375 	/* set vp as the _fd field of the file */
376 	if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir,
377 	    -1) != 0)
378 		return ((void *)-1UL);
379 
380 	file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL);
381 	file->_fd = (intptr_t)vp;
382 	return (file);
383 }
384 
385 int
386 kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off)
387 {
388 	ssize_t resid;
389 
390 	vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off,
391 	    UIO_SYSSPACE, 0, 0, 0, &resid);
392 
393 	return (size - resid);
394 }
395 
396 void
397 kobj_close_file(struct _buf *file)
398 {
399 	vn_close((vnode_t *)file->_fd);
400 	umem_free(file, sizeof (struct _buf));
401 }
402 
403 int
404 kobj_get_filesize(struct _buf *file, uint64_t *size)
405 {
406 	struct stat64 st;
407 	vnode_t *vp = (vnode_t *)file->_fd;
408 
409 	if (fstat64(vp->v_fd, &st) == -1) {
410 		vn_close(vp);
411 		return (errno);
412 	}
413 	*size = st.st_size;
414 	return (0);
415 }
416 
417 /*
418  * =========================================================================
419  * kernel emulation setup & teardown
420  * =========================================================================
421  */
/*
 * Allocation-failure callback registered by kernel_init(): print a
 * message on stderr and abort() so that a core dump is produced.
 * It never returns; the return statement only satisfies the signature.
 */
static int
umem_out_of_memory(void)
{
	static const char errmsg[] = "out of memory -- generating core dump\n";

	/* sizeof - 1: do not send the terminating NUL to stderr. */
	(void) write(fileno(stderr), errmsg, sizeof (errmsg) - 1);
	abort();
	return (0);
}
431 
432 void
433 kernel_init(int mode)
434 {
435 	extern uint_t rrw_tsd_key;
436 
437 	umem_nofail_callback(umem_out_of_memory);
438 
439 	physmem = sysconf(_SC_PHYS_PAGES);
440 
441 	dprintf("physmem = %llu pages (%.2f GB)\n", physmem,
442 	    (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));
443 
444 	(void) snprintf(hw_serial, sizeof (hw_serial), "%ld",
445 	    (mode & FWRITE) ? gethostid() : 0);
446 
447 	system_taskq_init();
448 
449 	mutex_init(&cpu_lock, NULL, MUTEX_DEFAULT, NULL);
450 
451 	spa_init(mode);
452 
453 	tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
454 }
455 
/*
 * Tear down the userland kernel emulation: shut down the SPA layer, then
 * the system taskq.
 */
void
kernel_fini(void)
{
	spa_fini();
	system_taskq_fini();
}
463 
464 /* ARGSUSED */
465 uint32_t
466 zone_get_hostid(void *zonep)
467 {
468 	/*
469 	 * We're emulating the system's hostid in userland.
470 	 */
471 	return (strtoul(hw_serial, NULL, 10));
472 }
473 
474 int
475 z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen)
476 {
477 	int ret;
478 	uLongf len = *dstlen;
479 
480 	if ((ret = uncompress(dst, &len, src, srclen)) == Z_OK)
481 		*dstlen = (size_t)len;
482 
483 	return (ret);
484 }
485 
486 int
487 z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen,
488     int level)
489 {
490 	int ret;
491 	uLongf len = *dstlen;
492 
493 	if ((ret = compress2(dst, &len, src, srclen, level)) == Z_OK)
494 		*dstlen = (size_t)len;
495 
496 	return (ret);
497 }
498 
499 int
500 zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
501 {
502 	return (0);
503 }
504 
505 int
506 zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
507 {
508 	return (0);
509 }
510 
511 int
512 zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
513 {
514 	return (0);
515 }
516 
517 /* ARGSUSED */
518 int
519 zfs_onexit_fd_hold(int fd, minor_t *minorp)
520 {
521 	*minorp = 0;
522 	return (0);
523 }
524 
/*
 * Userland stub: does nothing.
 */
/* ARGSUSED */
void
zfs_onexit_fd_rele(int fd)
{
	(void) fd;
}
530 
531 /* ARGSUSED */
532 int
533 zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
534     uint64_t *action_handle)
535 {
536 	return (0);
537 }
538 
539 /* ARGSUSED */
540 int
541 zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire)
542 {
543 	return (0);
544 }
545 
546 /* ARGSUSED */
547 int
548 zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data)
549 {
550 	return (0);
551 }
552 
553 void
554 bioinit(buf_t *bp)
555 {
556 	bzero(bp, sizeof (buf_t));
557 }
558 
559 void
560 biodone(buf_t *bp)
561 {
562 	if (bp->b_iodone != NULL) {
563 		(*(bp->b_iodone))(bp);
564 		return;
565 	}
566 	ASSERT((bp->b_flags & B_DONE) == 0);
567 	bp->b_flags |= B_DONE;
568 }
569 
570 void
571 bioerror(buf_t *bp, int error)
572 {
573 	ASSERT(bp != NULL);
574 	ASSERT(error >= 0);
575 
576 	if (error != 0) {
577 		bp->b_flags |= B_ERROR;
578 	} else {
579 		bp->b_flags &= ~B_ERROR;
580 	}
581 	bp->b_error = error;
582 }
583 
584 
585 int
586 geterror(struct buf *bp)
587 {
588 	int error = 0;
589 
590 	if (bp->b_flags & B_ERROR) {
591 		error = bp->b_error;
592 		if (!error)
593 			error = EIO;
594 	}
595 	return (error);
596 }
597