1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 24 * Copyright 2020 Joyent, Inc. 25 * Copyright 2017 RackTop Systems. 26 */ 27 28 #include <assert.h> 29 #include <fcntl.h> 30 #include <poll.h> 31 #include <stdio.h> 32 #include <stdlib.h> 33 #include <string.h> 34 #include <zlib.h> 35 #include <libgen.h> 36 #include <sys/spa.h> 37 #include <sys/stat.h> 38 #include <sys/processor.h> 39 #include <sys/zfs_context.h> 40 #include <sys/rrwlock.h> 41 #include <sys/zmod.h> 42 #include <sys/utsname.h> 43 #include <sys/systeminfo.h> 44 #include <libzutil.h> 45 #include <sys/crypto/common.h> 46 #include <sys/crypto/impl.h> 47 #include <sys/crypto/api.h> 48 #include <sys/sha2.h> 49 #include <crypto/aes/aes_impl.h> 50 51 extern void system_taskq_init(void); 52 extern void system_taskq_fini(void); 53 54 /* 55 * Emulation of kernel services in userland. 56 */ 57 58 pgcnt_t physmem; 59 vnode_t *rootdir = (vnode_t *)0xabcd1234; 60 char hw_serial[HW_HOSTID_LEN]; 61 kmutex_t cpu_lock; 62 vmem_t *zio_arena = NULL; 63 64 /* If set, all blocks read will be copied to the specified directory. */ 65 char *vn_dumpdir = NULL; 66 67 struct utsname utsname = { 68 "userland", "libzpool", "1", "1", "na" 69 }; 70 71 /* 72 * ========================================================================= 73 * vnode operations 74 * ========================================================================= 75 */ 76 /* 77 * Note: for the xxxat() versions of these functions, we assume that the 78 * starting vp is always rootdir (which is true for spa_directory.c, the only 79 * ZFS consumer of these interfaces). We assert this is true, and then emulate 80 * them by adding '/' in front of the path. 81 */ 82 83 /*ARGSUSED*/ 84 int 85 vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3) 86 { 87 int fd; 88 int dump_fd; 89 vnode_t *vp; 90 int old_umask; 91 char realpath[MAXPATHLEN]; 92 struct stat64 st; 93 94 /* 95 * If we're accessing a real disk from userland, we need to use 96 * the character interface to avoid caching. This is particularly 97 * important if we're trying to look at a real in-kernel storage 98 * pool from userland, e.g. via zdb, because otherwise we won't 99 * see the changes occurring under the segmap cache. 100 * On the other hand, the stupid character device returns zero 101 * for its size. So -- gag -- we open the block device to get 102 * its size, and remember it for subsequent VOP_GETATTR(). 103 */ 104 if (strncmp(path, "/dev/", 5) == 0) { 105 char *dsk; 106 fd = open64(path, O_RDONLY); 107 if (fd == -1) 108 return (errno); 109 if (fstat64(fd, &st) == -1) { 110 close(fd); 111 return (errno); 112 } 113 close(fd); 114 (void) sprintf(realpath, "%s", path); 115 dsk = strstr(path, "/dsk/"); 116 if (dsk != NULL) 117 (void) sprintf(realpath + (dsk - path) + 1, "r%s", 118 dsk + 1); 119 } else { 120 (void) sprintf(realpath, "%s", path); 121 if (!(flags & FCREAT) && stat64(realpath, &st) == -1) 122 return (errno); 123 } 124 125 if (flags & FCREAT) 126 old_umask = umask(0); 127 128 /* 129 * The construct 'flags - FREAD' conveniently maps combinations of 130 * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR. 131 */ 132 fd = open64(realpath, flags - FREAD, mode); 133 134 if (flags & FCREAT) 135 (void) umask(old_umask); 136 137 if (vn_dumpdir != NULL) { 138 char dumppath[MAXPATHLEN]; 139 (void) snprintf(dumppath, sizeof (dumppath), 140 "%s/%s", vn_dumpdir, basename(realpath)); 141 dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666); 142 if (dump_fd == -1) 143 return (errno); 144 } else { 145 dump_fd = -1; 146 } 147 148 if (fd == -1) 149 return (errno); 150 151 if (fstat64(fd, &st) == -1) { 152 close(fd); 153 return (errno); 154 } 155 156 (void) fcntl(fd, F_SETFD, FD_CLOEXEC); 157 158 *vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL); 159 160 vp->v_fd = fd; 161 vp->v_size = st.st_size; 162 vp->v_path = spa_strdup(path); 163 vp->v_dump_fd = dump_fd; 164 165 return (0); 166 } 167 168 /*ARGSUSED*/ 169 int 170 vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, 171 int x3, vnode_t *startvp, int fd) 172 { 173 char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL); 174 int ret; 175 176 ASSERT(startvp == rootdir); 177 (void) sprintf(realpath, "/%s", path); 178 179 /* fd ignored for now, need if want to simulate nbmand support */ 180 ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3); 181 182 umem_free(realpath, strlen(path) + 2); 183 184 return (ret); 185 } 186 187 /*ARGSUSED*/ 188 int 189 vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset, 190 int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp) 191 { 192 ssize_t iolen, split; 193 194 if (uio == UIO_READ) { 195 iolen = pread64(vp->v_fd, addr, len, offset); 196 if (vp->v_dump_fd != -1) { 197 int status = 198 pwrite64(vp->v_dump_fd, addr, iolen, offset); 199 ASSERT(status != -1); 200 } 201 } else { 202 /* 203 * To simulate partial disk writes, we split writes into two 204 * system calls so that the process can be killed in between. 205 */ 206 int sectors = len >> SPA_MINBLOCKSHIFT; 207 split = (sectors > 0 ? rand() % sectors : 0) << 208 SPA_MINBLOCKSHIFT; 209 iolen = pwrite64(vp->v_fd, addr, split, offset); 210 iolen += pwrite64(vp->v_fd, (char *)addr + split, 211 len - split, offset + split); 212 } 213 214 if (iolen == -1) 215 return (errno); 216 if (residp) 217 *residp = len - iolen; 218 else if (iolen != len) 219 return (EIO); 220 return (0); 221 } 222 223 void 224 vn_close(vnode_t *vp) 225 { 226 close(vp->v_fd); 227 if (vp->v_dump_fd != -1) 228 close(vp->v_dump_fd); 229 spa_strfree(vp->v_path); 230 umem_free(vp, sizeof (vnode_t)); 231 } 232 233 /* 234 * At a minimum we need to update the size since vdev_reopen() 235 * will no longer call vn_openat(). 236 */ 237 int 238 fop_getattr(vnode_t *vp, vattr_t *vap) 239 { 240 struct stat64 st; 241 242 if (fstat64(vp->v_fd, &st) == -1) { 243 close(vp->v_fd); 244 return (errno); 245 } 246 247 vap->va_size = st.st_size; 248 return (0); 249 } 250 251 #ifdef ZFS_DEBUG 252 253 /* 254 * ========================================================================= 255 * Figure out which debugging statements to print 256 * ========================================================================= 257 */ 258 259 static char *dprintf_string; 260 static int dprintf_print_all; 261 262 int 263 dprintf_find_string(const char *string) 264 { 265 char *tmp_str = dprintf_string; 266 int len = strlen(string); 267 268 /* 269 * Find out if this is a string we want to print. 270 * String format: file1.c,function_name1,file2.c,file3.c 271 */ 272 273 while (tmp_str != NULL) { 274 if (strncmp(tmp_str, string, len) == 0 && 275 (tmp_str[len] == ',' || tmp_str[len] == '\0')) 276 return (1); 277 tmp_str = strchr(tmp_str, ','); 278 if (tmp_str != NULL) 279 tmp_str++; /* Get rid of , */ 280 } 281 return (0); 282 } 283 284 void 285 dprintf_setup(int *argc, char **argv) 286 { 287 int i, j; 288 289 /* 290 * Debugging can be specified two ways: by setting the 291 * environment variable ZFS_DEBUG, or by including a 292 * "debug=..." argument on the command line. The command 293 * line setting overrides the environment variable. 294 */ 295 296 for (i = 1; i < *argc; i++) { 297 int len = strlen("debug="); 298 /* First look for a command line argument */ 299 if (strncmp("debug=", argv[i], len) == 0) { 300 dprintf_string = argv[i] + len; 301 /* Remove from args */ 302 for (j = i; j < *argc; j++) 303 argv[j] = argv[j+1]; 304 argv[j] = NULL; 305 (*argc)--; 306 } 307 } 308 309 if (dprintf_string == NULL) { 310 /* Look for ZFS_DEBUG environment variable */ 311 dprintf_string = getenv("ZFS_DEBUG"); 312 } 313 314 /* 315 * Are we just turning on all debugging? 316 */ 317 if (dprintf_find_string("on")) 318 dprintf_print_all = 1; 319 320 if (dprintf_string != NULL) 321 zfs_flags |= ZFS_DEBUG_DPRINTF; 322 } 323 324 /* 325 * ========================================================================= 326 * debug printfs 327 * ========================================================================= 328 */ 329 void 330 __dprintf(const char *file, const char *func, int line, const char *fmt, ...) 331 { 332 const char *newfile; 333 va_list adx; 334 335 /* 336 * Get rid of annoying "../common/" prefix to filename. 337 */ 338 newfile = strrchr(file, '/'); 339 if (newfile != NULL) { 340 newfile = newfile + 1; /* Get rid of leading / */ 341 } else { 342 newfile = file; 343 } 344 345 if (dprintf_print_all || 346 dprintf_find_string(newfile) || 347 dprintf_find_string(func)) { 348 /* Print out just the function name if requested */ 349 flockfile(stdout); 350 if (dprintf_find_string("pid")) 351 (void) printf("%d ", getpid()); 352 if (dprintf_find_string("tid")) 353 (void) printf("%u ", thr_self()); 354 if (dprintf_find_string("cpu")) 355 (void) printf("%u ", getcpuid()); 356 if (dprintf_find_string("time")) 357 (void) printf("%llu ", gethrtime()); 358 if (dprintf_find_string("long")) 359 (void) printf("%s, line %d: ", newfile, line); 360 (void) printf("%s: ", func); 361 va_start(adx, fmt); 362 (void) vprintf(fmt, adx); 363 va_end(adx); 364 funlockfile(stdout); 365 } 366 } 367 368 #endif /* ZFS_DEBUG */ 369 370 /* 371 * ========================================================================= 372 * kobj interfaces 373 * ========================================================================= 374 */ 375 struct _buf * 376 kobj_open_file(char *name) 377 { 378 struct _buf *file; 379 vnode_t *vp; 380 381 /* set vp as the _fd field of the file */ 382 if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir, 383 -1) != 0) 384 return ((void *)-1UL); 385 386 file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL); 387 file->_fd = (intptr_t)vp; 388 return (file); 389 } 390 391 int 392 kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off) 393 { 394 ssize_t resid; 395 396 vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off, 397 UIO_SYSSPACE, 0, 0, 0, &resid); 398 399 return (size - resid); 400 } 401 402 void 403 kobj_close_file(struct _buf *file) 404 { 405 vn_close((vnode_t *)file->_fd); 406 umem_free(file, sizeof (struct _buf)); 407 } 408 409 int 410 kobj_get_filesize(struct _buf *file, uint64_t *size) 411 { 412 struct stat64 st; 413 vnode_t *vp = (vnode_t *)file->_fd; 414 415 if (fstat64(vp->v_fd, &st) == -1) { 416 vn_close(vp); 417 return (errno); 418 } 419 *size = st.st_size; 420 return (0); 421 } 422 423 /* 424 * ========================================================================= 425 * misc routines 426 * ========================================================================= 427 */ 428 429 /* 430 * Find lowest one bit set. 431 * Returns bit number + 1 of lowest bit that is set, otherwise returns 0. 432 * This is basically a reimplementation of ffsll(), which is GNU specific. 433 */ 434 int 435 lowbit64(uint64_t i) 436 { 437 register int h = 64; 438 if (i == 0) 439 return (0); 440 441 if (i & 0x00000000ffffffffULL) 442 h -= 32; 443 else 444 i >>= 32; 445 446 if (i & 0x0000ffff) 447 h -= 16; 448 else 449 i >>= 16; 450 451 if (i & 0x00ff) 452 h -= 8; 453 else 454 i >>= 8; 455 456 if (i & 0x0f) 457 h -= 4; 458 else 459 i >>= 4; 460 461 if (i & 0x3) 462 h -= 2; 463 else 464 i >>= 2; 465 466 if (i & 0x1) 467 h -= 1; 468 469 return (h); 470 } 471 472 int 473 highbit64(uint64_t i) 474 { 475 int h = 1; 476 477 if (i == 0) 478 return (0); 479 if (i & 0xffffffff00000000ULL) { 480 h += 32; i >>= 32; 481 } 482 if (i & 0xffff0000) { 483 h += 16; i >>= 16; 484 } 485 if (i & 0xff00) { 486 h += 8; i >>= 8; 487 } 488 if (i & 0xf0) { 489 h += 4; i >>= 4; 490 } 491 if (i & 0xc) { 492 h += 2; i >>= 2; 493 } 494 if (i & 0x2) { 495 h += 1; 496 } 497 return (h); 498 } 499 500 /* 501 * ========================================================================= 502 * kernel emulation setup & teardown 503 * ========================================================================= 504 */ 505 static int 506 umem_out_of_memory(void) 507 { 508 char errmsg[] = "out of memory -- generating core dump\n"; 509 510 write(fileno(stderr), errmsg, sizeof (errmsg)); 511 abort(); 512 return (0); 513 } 514 515 void 516 kernel_init(int mode) 517 { 518 extern uint_t rrw_tsd_key; 519 520 umem_nofail_callback(umem_out_of_memory); 521 522 physmem = sysconf(_SC_PHYS_PAGES); 523 524 dprintf("physmem = %llu pages (%.2f GB)\n", physmem, 525 (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30)); 526 527 (void) snprintf(hw_serial, sizeof (hw_serial), "%ld", 528 (mode & FWRITE) ? get_system_hostid() : 0); 529 530 system_taskq_init(); 531 532 mutex_init(&cpu_lock, NULL, MUTEX_DEFAULT, NULL); 533 534 spa_init(mode); 535 536 tsd_create(&rrw_tsd_key, rrw_tsd_destroy); 537 } 538 539 void 540 kernel_fini(void) 541 { 542 spa_fini(); 543 544 system_taskq_fini(); 545 } 546 547 /* ARGSUSED */ 548 uint32_t 549 zone_get_hostid(void *zonep) 550 { 551 /* 552 * We're emulating the system's hostid in userland. 553 */ 554 return (strtoul(hw_serial, NULL, 10)); 555 } 556 557 int 558 z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen) 559 { 560 int ret; 561 uLongf len = *dstlen; 562 563 if ((ret = uncompress(dst, &len, src, srclen)) == Z_OK) 564 *dstlen = (size_t)len; 565 566 return (ret); 567 } 568 569 int 570 z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen, 571 int level) 572 { 573 int ret; 574 uLongf len = *dstlen; 575 576 if ((ret = compress2(dst, &len, src, srclen, level)) == Z_OK) 577 *dstlen = (size_t)len; 578 579 return (ret); 580 } 581 582 int 583 zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) 584 { 585 return (0); 586 } 587 588 int 589 zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) 590 { 591 return (0); 592 } 593 594 int 595 zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) 596 { 597 return (0); 598 } 599 600 /* ARGSUSED */ 601 int 602 zfs_onexit_fd_hold(int fd, minor_t *minorp) 603 { 604 *minorp = 0; 605 return (0); 606 } 607 608 /* ARGSUSED */ 609 void 610 zfs_onexit_fd_rele(int fd) 611 { 612 } 613 614 /* ARGSUSED */ 615 int 616 zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, 617 uint64_t *action_handle) 618 { 619 return (0); 620 } 621 622 /* ARGSUSED */ 623 int 624 zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire) 625 { 626 return (0); 627 } 628 629 /* ARGSUSED */ 630 int 631 zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data) 632 { 633 return (0); 634 } 635 636 void 637 bioinit(buf_t *bp) 638 { 639 bzero(bp, sizeof (buf_t)); 640 } 641 642 void 643 biodone(buf_t *bp) 644 { 645 if (bp->b_iodone != NULL) { 646 (*(bp->b_iodone))(bp); 647 return; 648 } 649 ASSERT((bp->b_flags & B_DONE) == 0); 650 bp->b_flags |= B_DONE; 651 } 652 653 void 654 bioerror(buf_t *bp, int error) 655 { 656 ASSERT(bp != NULL); 657 ASSERT(error >= 0); 658 659 if (error != 0) { 660 bp->b_flags |= B_ERROR; 661 } else { 662 bp->b_flags &= ~B_ERROR; 663 } 664 bp->b_error = error; 665 } 666 667 668 int 669 geterror(struct buf *bp) 670 { 671 int error = 0; 672 673 if (bp->b_flags & B_ERROR) { 674 error = bp->b_error; 675 if (!error) 676 error = EIO; 677 } 678 return (error); 679 } 680 681 int 682 crypto_create_ctx_template(crypto_mechanism_t *mech, 683 crypto_key_t *key, crypto_ctx_template_t *tmpl, int kmflag) 684 { 685 return (0); 686 } 687 688 crypto_mech_type_t 689 crypto_mech2id(const char *name) 690 { 691 return (CRYPTO_MECH_INVALID); 692 } 693 694 int 695 crypto_mac(crypto_mechanism_t *mech, crypto_data_t *data, 696 crypto_key_t *key, crypto_ctx_template_t impl, 697 crypto_data_t *mac, crypto_call_req_t *cr) 698 { 699 return (0); 700 } 701 702 int 703 crypto_encrypt(crypto_mechanism_t *mech, crypto_data_t *plaintext, 704 crypto_key_t *key, crypto_ctx_template_t tmpl, 705 crypto_data_t *ciphertext, crypto_call_req_t *cr) 706 { 707 return (0); 708 } 709 710 /* This could probably be a weak reference */ 711 int 712 crypto_decrypt(crypto_mechanism_t *mech, crypto_data_t *plaintext, 713 crypto_key_t *key, crypto_ctx_template_t tmpl, 714 crypto_data_t *ciphertext, crypto_call_req_t *cr) 715 { 716 return (0); 717 } 718 719 720 int 721 crypto_digest_final(crypto_context_t context, crypto_data_t *digest, 722 crypto_call_req_t *cr) 723 { 724 return (0); 725 } 726 727 int 728 crypto_digest_update(crypto_context_t context, crypto_data_t *data, 729 crypto_call_req_t *cr) 730 { 731 return (0); 732 } 733 734 int 735 crypto_digest_init(crypto_mechanism_t *mech, crypto_context_t *ctxp, 736 crypto_call_req_t *crq) 737 { 738 return (0); 739 } 740 741 void 742 crypto_destroy_ctx_template(crypto_ctx_template_t tmpl) 743 { 744 } 745 746 extern int crypto_mac_init(crypto_mechanism_t *mech, crypto_key_t *key, 747 crypto_ctx_template_t tmpl, crypto_context_t *ctxp, 748 crypto_call_req_t *cr) 749 { 750 return (0); 751 } 752 753 extern int crypto_mac_update(crypto_context_t ctx, crypto_data_t *data, 754 crypto_call_req_t *cr) 755 { 756 return (0); 757 } 758 759 extern int crypto_mac_final(crypto_context_t ctx, crypto_data_t *data, 760 crypto_call_req_t *cr) 761 { 762 return (0); 763 } 764