1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 25 * Copyright (c) 2016 Actifio, Inc. All rights reserved. 26 * Copyright (c) 2025, Klara, Inc. 27 */ 28 29 #include <assert.h> 30 #include <fcntl.h> 31 #include <libgen.h> 32 #include <poll.h> 33 #include <stdio.h> 34 #include <stdlib.h> 35 #include <string.h> 36 #include <limits.h> 37 #include <libzutil.h> 38 #include <sys/crypto/icp.h> 39 #include <sys/processor.h> 40 #include <sys/rrwlock.h> 41 #include <sys/spa.h> 42 #include <sys/spa_impl.h> 43 #include <sys/stat.h> 44 #include <sys/systeminfo.h> 45 #include <sys/time.h> 46 #include <sys/utsname.h> 47 #include <sys/zfs_context.h> 48 #include <sys/zfs_onexit.h> 49 #include <sys/zfs_vfsops.h> 50 #include <sys/zstd/zstd.h> 51 #include <sys/zvol.h> 52 #include <zfs_fletcher.h> 53 #include <zlib.h> 54 55 /* 56 * Emulation of kernel services in userland. 57 */ 58 59 uint64_t physmem; 60 uint32_t hostid; 61 struct utsname hw_utsname; 62 63 /* If set, all blocks read will be copied to the specified directory. */ 64 char *vn_dumpdir = NULL; 65 66 /* this only exists to have its address taken */ 67 struct proc p0; 68 69 /* 70 * ========================================================================= 71 * threads 72 * ========================================================================= 73 * 74 * TS_STACK_MIN is dictated by the minimum allowed pthread stack size. While 75 * TS_STACK_MAX is somewhat arbitrary, it was selected to be large enough for 76 * the expected stack depth while small enough to avoid exhausting address 77 * space with high thread counts. 78 */ 79 #define TS_STACK_MIN MAX(PTHREAD_STACK_MIN, 32768) 80 #define TS_STACK_MAX (256 * 1024) 81 82 struct zk_thread_wrapper { 83 void (*func)(void *); 84 void *arg; 85 }; 86 87 static void * 88 zk_thread_wrapper(void *arg) 89 { 90 struct zk_thread_wrapper ztw; 91 memcpy(&ztw, arg, sizeof (ztw)); 92 free(arg); 93 ztw.func(ztw.arg); 94 return (NULL); 95 } 96 97 kthread_t * 98 zk_thread_create(const char *name, void (*func)(void *), void *arg, 99 size_t stksize, int state) 100 { 101 pthread_attr_t attr; 102 pthread_t tid; 103 char *stkstr; 104 struct zk_thread_wrapper *ztw; 105 int detachstate = PTHREAD_CREATE_DETACHED; 106 107 VERIFY0(pthread_attr_init(&attr)); 108 109 if (state & TS_JOINABLE) 110 detachstate = PTHREAD_CREATE_JOINABLE; 111 112 VERIFY0(pthread_attr_setdetachstate(&attr, detachstate)); 113 114 /* 115 * We allow the default stack size in user space to be specified by 116 * setting the ZFS_STACK_SIZE environment variable. This allows us 117 * the convenience of observing and debugging stack overruns in 118 * user space. Explicitly specified stack sizes will be honored. 119 * The usage of ZFS_STACK_SIZE is discussed further in the 120 * ENVIRONMENT VARIABLES sections of the ztest(1) man page. 121 */ 122 if (stksize == 0) { 123 stkstr = getenv("ZFS_STACK_SIZE"); 124 125 if (stkstr == NULL) 126 stksize = TS_STACK_MAX; 127 else 128 stksize = MAX(atoi(stkstr), TS_STACK_MIN); 129 } 130 131 VERIFY3S(stksize, >, 0); 132 stksize = P2ROUNDUP(MAX(stksize, TS_STACK_MIN), PAGESIZE); 133 134 /* 135 * If this ever fails, it may be because the stack size is not a 136 * multiple of system page size. 137 */ 138 VERIFY0(pthread_attr_setstacksize(&attr, stksize)); 139 VERIFY0(pthread_attr_setguardsize(&attr, PAGESIZE)); 140 141 VERIFY(ztw = malloc(sizeof (*ztw))); 142 ztw->func = func; 143 ztw->arg = arg; 144 VERIFY0(pthread_create(&tid, &attr, zk_thread_wrapper, ztw)); 145 VERIFY0(pthread_attr_destroy(&attr)); 146 147 pthread_setname_np(tid, name); 148 149 return ((void *)(uintptr_t)tid); 150 } 151 152 /* 153 * ========================================================================= 154 * kstats 155 * ========================================================================= 156 */ 157 kstat_t * 158 kstat_create(const char *module, int instance, const char *name, 159 const char *class, uchar_t type, ulong_t ndata, uchar_t ks_flag) 160 { 161 (void) module, (void) instance, (void) name, (void) class, (void) type, 162 (void) ndata, (void) ks_flag; 163 return (NULL); 164 } 165 166 void 167 kstat_install(kstat_t *ksp) 168 { 169 (void) ksp; 170 } 171 172 void 173 kstat_delete(kstat_t *ksp) 174 { 175 (void) ksp; 176 } 177 178 void 179 kstat_set_raw_ops(kstat_t *ksp, 180 int (*headers)(char *buf, size_t size), 181 int (*data)(char *buf, size_t size, void *data), 182 void *(*addr)(kstat_t *ksp, loff_t index)) 183 { 184 (void) ksp, (void) headers, (void) data, (void) addr; 185 } 186 187 /* 188 * ========================================================================= 189 * mutexes 190 * ========================================================================= 191 */ 192 193 void 194 mutex_init(kmutex_t *mp, char *name, int type, void *cookie) 195 { 196 (void) name, (void) type, (void) cookie; 197 VERIFY0(pthread_mutex_init(&mp->m_lock, NULL)); 198 memset(&mp->m_owner, 0, sizeof (pthread_t)); 199 } 200 201 void 202 mutex_destroy(kmutex_t *mp) 203 { 204 VERIFY0(pthread_mutex_destroy(&mp->m_lock)); 205 } 206 207 void 208 mutex_enter(kmutex_t *mp) 209 { 210 VERIFY0(pthread_mutex_lock(&mp->m_lock)); 211 mp->m_owner = pthread_self(); 212 } 213 214 int 215 mutex_enter_check_return(kmutex_t *mp) 216 { 217 int error = pthread_mutex_lock(&mp->m_lock); 218 if (error == 0) 219 mp->m_owner = pthread_self(); 220 return (error); 221 } 222 223 int 224 mutex_tryenter(kmutex_t *mp) 225 { 226 int error = pthread_mutex_trylock(&mp->m_lock); 227 if (error == 0) { 228 mp->m_owner = pthread_self(); 229 return (1); 230 } else { 231 VERIFY3S(error, ==, EBUSY); 232 return (0); 233 } 234 } 235 236 void 237 mutex_exit(kmutex_t *mp) 238 { 239 memset(&mp->m_owner, 0, sizeof (pthread_t)); 240 VERIFY0(pthread_mutex_unlock(&mp->m_lock)); 241 } 242 243 /* 244 * ========================================================================= 245 * rwlocks 246 * ========================================================================= 247 */ 248 249 void 250 rw_init(krwlock_t *rwlp, char *name, int type, void *arg) 251 { 252 (void) name, (void) type, (void) arg; 253 VERIFY0(pthread_rwlock_init(&rwlp->rw_lock, NULL)); 254 rwlp->rw_readers = 0; 255 rwlp->rw_owner = 0; 256 } 257 258 void 259 rw_destroy(krwlock_t *rwlp) 260 { 261 VERIFY0(pthread_rwlock_destroy(&rwlp->rw_lock)); 262 } 263 264 void 265 rw_enter(krwlock_t *rwlp, krw_t rw) 266 { 267 if (rw == RW_READER) { 268 VERIFY0(pthread_rwlock_rdlock(&rwlp->rw_lock)); 269 atomic_inc_uint(&rwlp->rw_readers); 270 } else { 271 VERIFY0(pthread_rwlock_wrlock(&rwlp->rw_lock)); 272 rwlp->rw_owner = pthread_self(); 273 } 274 } 275 276 void 277 rw_exit(krwlock_t *rwlp) 278 { 279 if (RW_READ_HELD(rwlp)) 280 atomic_dec_uint(&rwlp->rw_readers); 281 else 282 rwlp->rw_owner = 0; 283 284 VERIFY0(pthread_rwlock_unlock(&rwlp->rw_lock)); 285 } 286 287 int 288 rw_tryenter(krwlock_t *rwlp, krw_t rw) 289 { 290 int error; 291 292 if (rw == RW_READER) 293 error = pthread_rwlock_tryrdlock(&rwlp->rw_lock); 294 else 295 error = pthread_rwlock_trywrlock(&rwlp->rw_lock); 296 297 if (error == 0) { 298 if (rw == RW_READER) 299 atomic_inc_uint(&rwlp->rw_readers); 300 else 301 rwlp->rw_owner = pthread_self(); 302 303 return (1); 304 } 305 306 VERIFY3S(error, ==, EBUSY); 307 308 return (0); 309 } 310 311 uint32_t 312 zone_get_hostid(void *zonep) 313 { 314 /* 315 * We're emulating the system's hostid in userland. 316 */ 317 (void) zonep; 318 return (hostid); 319 } 320 321 int 322 rw_tryupgrade(krwlock_t *rwlp) 323 { 324 (void) rwlp; 325 return (0); 326 } 327 328 /* 329 * ========================================================================= 330 * condition variables 331 * ========================================================================= 332 */ 333 334 void 335 cv_init(kcondvar_t *cv, char *name, int type, void *arg) 336 { 337 (void) name, (void) type, (void) arg; 338 VERIFY0(pthread_cond_init(cv, NULL)); 339 } 340 341 void 342 cv_destroy(kcondvar_t *cv) 343 { 344 VERIFY0(pthread_cond_destroy(cv)); 345 } 346 347 void 348 cv_wait(kcondvar_t *cv, kmutex_t *mp) 349 { 350 memset(&mp->m_owner, 0, sizeof (pthread_t)); 351 VERIFY0(pthread_cond_wait(cv, &mp->m_lock)); 352 mp->m_owner = pthread_self(); 353 } 354 355 int 356 cv_wait_sig(kcondvar_t *cv, kmutex_t *mp) 357 { 358 cv_wait(cv, mp); 359 return (1); 360 } 361 362 int 363 cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime) 364 { 365 int error; 366 struct timeval tv; 367 struct timespec ts; 368 clock_t delta; 369 370 delta = abstime - ddi_get_lbolt(); 371 if (delta <= 0) 372 return (-1); 373 374 VERIFY0(gettimeofday(&tv, NULL)); 375 376 ts.tv_sec = tv.tv_sec + delta / hz; 377 ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC + (delta % hz) * (NANOSEC / hz); 378 if (ts.tv_nsec >= NANOSEC) { 379 ts.tv_sec++; 380 ts.tv_nsec -= NANOSEC; 381 } 382 383 memset(&mp->m_owner, 0, sizeof (pthread_t)); 384 error = pthread_cond_timedwait(cv, &mp->m_lock, &ts); 385 mp->m_owner = pthread_self(); 386 387 if (error == ETIMEDOUT) 388 return (-1); 389 390 VERIFY0(error); 391 392 return (1); 393 } 394 395 int 396 cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res, 397 int flag) 398 { 399 (void) res; 400 int error; 401 struct timeval tv; 402 struct timespec ts; 403 hrtime_t delta; 404 405 ASSERT(flag == 0 || flag == CALLOUT_FLAG_ABSOLUTE); 406 407 delta = tim; 408 if (flag & CALLOUT_FLAG_ABSOLUTE) 409 delta -= gethrtime(); 410 411 if (delta <= 0) 412 return (-1); 413 414 VERIFY0(gettimeofday(&tv, NULL)); 415 416 ts.tv_sec = tv.tv_sec + delta / NANOSEC; 417 ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC + (delta % NANOSEC); 418 if (ts.tv_nsec >= NANOSEC) { 419 ts.tv_sec++; 420 ts.tv_nsec -= NANOSEC; 421 } 422 423 memset(&mp->m_owner, 0, sizeof (pthread_t)); 424 error = pthread_cond_timedwait(cv, &mp->m_lock, &ts); 425 mp->m_owner = pthread_self(); 426 427 if (error == ETIMEDOUT) 428 return (-1); 429 430 VERIFY0(error); 431 432 return (1); 433 } 434 435 void 436 cv_signal(kcondvar_t *cv) 437 { 438 VERIFY0(pthread_cond_signal(cv)); 439 } 440 441 void 442 cv_broadcast(kcondvar_t *cv) 443 { 444 VERIFY0(pthread_cond_broadcast(cv)); 445 } 446 447 /* 448 * ========================================================================= 449 * procfs list 450 * ========================================================================= 451 */ 452 453 void 454 seq_printf(struct seq_file *m, const char *fmt, ...) 455 { 456 (void) m, (void) fmt; 457 } 458 459 void 460 procfs_list_install(const char *module, 461 const char *submodule, 462 const char *name, 463 mode_t mode, 464 procfs_list_t *procfs_list, 465 int (*show)(struct seq_file *f, void *p), 466 int (*show_header)(struct seq_file *f), 467 int (*clear)(procfs_list_t *procfs_list), 468 size_t procfs_list_node_off) 469 { 470 (void) module, (void) submodule, (void) name, (void) mode, (void) show, 471 (void) show_header, (void) clear; 472 mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL); 473 list_create(&procfs_list->pl_list, 474 procfs_list_node_off + sizeof (procfs_list_node_t), 475 procfs_list_node_off + offsetof(procfs_list_node_t, pln_link)); 476 procfs_list->pl_next_id = 1; 477 procfs_list->pl_node_offset = procfs_list_node_off; 478 } 479 480 void 481 procfs_list_uninstall(procfs_list_t *procfs_list) 482 { 483 (void) procfs_list; 484 } 485 486 void 487 procfs_list_destroy(procfs_list_t *procfs_list) 488 { 489 ASSERT(list_is_empty(&procfs_list->pl_list)); 490 list_destroy(&procfs_list->pl_list); 491 mutex_destroy(&procfs_list->pl_lock); 492 } 493 494 #define NODE_ID(procfs_list, obj) \ 495 (((procfs_list_node_t *)(((char *)obj) + \ 496 (procfs_list)->pl_node_offset))->pln_id) 497 498 void 499 procfs_list_add(procfs_list_t *procfs_list, void *p) 500 { 501 ASSERT(MUTEX_HELD(&procfs_list->pl_lock)); 502 NODE_ID(procfs_list, p) = procfs_list->pl_next_id++; 503 list_insert_tail(&procfs_list->pl_list, p); 504 } 505 506 /* 507 * ========================================================================= 508 * vnode operations 509 * ========================================================================= 510 */ 511 512 /* 513 * ========================================================================= 514 * Figure out which debugging statements to print 515 * ========================================================================= 516 */ 517 518 static char *dprintf_string; 519 static int dprintf_print_all; 520 521 int 522 dprintf_find_string(const char *string) 523 { 524 char *tmp_str = dprintf_string; 525 int len = strlen(string); 526 527 /* 528 * Find out if this is a string we want to print. 529 * String format: file1.c,function_name1,file2.c,file3.c 530 */ 531 532 while (tmp_str != NULL) { 533 if (strncmp(tmp_str, string, len) == 0 && 534 (tmp_str[len] == ',' || tmp_str[len] == '\0')) 535 return (1); 536 tmp_str = strchr(tmp_str, ','); 537 if (tmp_str != NULL) 538 tmp_str++; /* Get rid of , */ 539 } 540 return (0); 541 } 542 543 void 544 dprintf_setup(int *argc, char **argv) 545 { 546 int i, j; 547 548 /* 549 * Debugging can be specified two ways: by setting the 550 * environment variable ZFS_DEBUG, or by including a 551 * "debug=..." argument on the command line. The command 552 * line setting overrides the environment variable. 553 */ 554 555 for (i = 1; i < *argc; i++) { 556 int len = strlen("debug="); 557 /* First look for a command line argument */ 558 if (strncmp("debug=", argv[i], len) == 0) { 559 dprintf_string = argv[i] + len; 560 /* Remove from args */ 561 for (j = i; j < *argc; j++) 562 argv[j] = argv[j+1]; 563 argv[j] = NULL; 564 (*argc)--; 565 } 566 } 567 568 if (dprintf_string == NULL) { 569 /* Look for ZFS_DEBUG environment variable */ 570 dprintf_string = getenv("ZFS_DEBUG"); 571 } 572 573 /* 574 * Are we just turning on all debugging? 575 */ 576 if (dprintf_find_string("on")) 577 dprintf_print_all = 1; 578 579 if (dprintf_string != NULL) 580 zfs_flags |= ZFS_DEBUG_DPRINTF; 581 } 582 583 /* 584 * ========================================================================= 585 * debug printfs 586 * ========================================================================= 587 */ 588 void 589 __dprintf(boolean_t dprint, const char *file, const char *func, 590 int line, const char *fmt, ...) 591 { 592 /* Get rid of annoying "../common/" prefix to filename. */ 593 const char *newfile = zfs_basename(file); 594 595 va_list adx; 596 if (dprint) { 597 /* dprintf messages are printed immediately */ 598 599 if (!dprintf_print_all && 600 !dprintf_find_string(newfile) && 601 !dprintf_find_string(func)) 602 return; 603 604 /* Print out just the function name if requested */ 605 flockfile(stdout); 606 if (dprintf_find_string("pid")) 607 (void) printf("%d ", getpid()); 608 if (dprintf_find_string("tid")) 609 (void) printf("%ju ", 610 (uintmax_t)(uintptr_t)pthread_self()); 611 if (dprintf_find_string("cpu")) 612 (void) printf("%u ", getcpuid()); 613 if (dprintf_find_string("time")) 614 (void) printf("%llu ", gethrtime()); 615 if (dprintf_find_string("long")) 616 (void) printf("%s, line %d: ", newfile, line); 617 (void) printf("dprintf: %s: ", func); 618 va_start(adx, fmt); 619 (void) vprintf(fmt, adx); 620 va_end(adx); 621 funlockfile(stdout); 622 } else { 623 /* zfs_dbgmsg is logged for dumping later */ 624 size_t size; 625 char *buf; 626 int i; 627 628 size = 1024; 629 buf = umem_alloc(size, UMEM_NOFAIL); 630 i = snprintf(buf, size, "%s:%d:%s(): ", newfile, line, func); 631 632 if (i < size) { 633 va_start(adx, fmt); 634 (void) vsnprintf(buf + i, size - i, fmt, adx); 635 va_end(adx); 636 } 637 638 __zfs_dbgmsg(buf); 639 640 umem_free(buf, size); 641 } 642 } 643 644 /* 645 * ========================================================================= 646 * cmn_err() and panic() 647 * ========================================================================= 648 */ 649 650 static __attribute__((noreturn)) void 651 panic_stop_or_abort(void) 652 { 653 const char *stopenv = getenv("LIBZPOOL_PANIC_STOP"); 654 if (stopenv != NULL && atoi(stopenv)) { 655 fputs("libzpool: LIBZPOOL_PANIC_STOP is set, sending " 656 "SIGSTOP to process group\n", stderr); 657 fflush(stderr); 658 659 kill(0, SIGSTOP); 660 661 fputs("libzpool: continued after panic stop, " 662 "aborting\n", stderr); 663 } 664 665 abort(); /* think of it as a "user-level crash dump" */ 666 } 667 668 static void 669 vcmn_msg(int ce, const char *fmt, va_list adx) 670 { 671 switch (ce) { 672 case CE_IGNORE: 673 return; 674 case CE_CONT: 675 break; 676 case CE_NOTE: 677 fputs("libzpool: NOTICE: ", stderr); 678 break; 679 case CE_WARN: 680 fputs("libzpool: WARNING: ", stderr); 681 break; 682 case CE_PANIC: 683 fputs("libzpool: PANIC: ", stderr); 684 break; 685 default: 686 fputs("libzpool: [unknown severity %d]: ", stderr); 687 break; 688 } 689 690 vfprintf(stderr, fmt, adx); 691 if (ce != CE_CONT) 692 fputc('\n', stderr); 693 fflush(stderr); 694 } 695 696 void 697 vcmn_err(int ce, const char *fmt, va_list adx) 698 { 699 vcmn_msg(ce, fmt, adx); 700 701 if (ce == CE_PANIC) 702 panic_stop_or_abort(); 703 } 704 705 void 706 cmn_err(int ce, const char *fmt, ...) 707 { 708 va_list adx; 709 710 va_start(adx, fmt); 711 vcmn_err(ce, fmt, adx); 712 va_end(adx); 713 } 714 715 __attribute__((noreturn)) void 716 panic(const char *fmt, ...) 717 { 718 va_list adx; 719 720 va_start(adx, fmt); 721 vcmn_msg(CE_PANIC, fmt, adx); 722 va_end(adx); 723 724 panic_stop_or_abort(); 725 } 726 727 __attribute__((noreturn)) void 728 vpanic(const char *fmt, va_list adx) 729 { 730 vcmn_msg(CE_PANIC, fmt, adx); 731 panic_stop_or_abort(); 732 } 733 734 /* 735 * ========================================================================= 736 * misc routines 737 * ========================================================================= 738 */ 739 740 void 741 delay(clock_t ticks) 742 { 743 (void) poll(0, 0, ticks * (1000 / hz)); 744 } 745 746 /* 747 * Find highest one bit set. 748 * Returns bit number + 1 of highest bit that is set, otherwise returns 0. 749 * The __builtin_clzll() function is supported by both GCC and Clang. 750 */ 751 int 752 highbit64(uint64_t i) 753 { 754 if (i == 0) 755 return (0); 756 757 return (NBBY * sizeof (uint64_t) - __builtin_clzll(i)); 758 } 759 760 /* 761 * Find lowest one bit set. 762 * Returns bit number + 1 of lowest bit that is set, otherwise returns 0. 763 * The __builtin_ffsll() function is supported by both GCC and Clang. 764 */ 765 int 766 lowbit64(uint64_t i) 767 { 768 if (i == 0) 769 return (0); 770 771 return (__builtin_ffsll(i)); 772 } 773 774 const char *random_path = "/dev/random"; 775 const char *urandom_path = "/dev/urandom"; 776 static int random_fd = -1, urandom_fd = -1; 777 778 void 779 random_init(void) 780 { 781 VERIFY((random_fd = open(random_path, O_RDONLY | O_CLOEXEC)) != -1); 782 VERIFY((urandom_fd = open(urandom_path, O_RDONLY | O_CLOEXEC)) != -1); 783 } 784 785 void 786 random_fini(void) 787 { 788 close(random_fd); 789 close(urandom_fd); 790 791 random_fd = -1; 792 urandom_fd = -1; 793 } 794 795 static int 796 random_get_bytes_common(uint8_t *ptr, size_t len, int fd) 797 { 798 size_t resid = len; 799 ssize_t bytes; 800 801 ASSERT(fd != -1); 802 803 while (resid != 0) { 804 bytes = read(fd, ptr, resid); 805 ASSERT3S(bytes, >=, 0); 806 ptr += bytes; 807 resid -= bytes; 808 } 809 810 return (0); 811 } 812 813 int 814 random_get_bytes(uint8_t *ptr, size_t len) 815 { 816 return (random_get_bytes_common(ptr, len, random_fd)); 817 } 818 819 int 820 random_get_pseudo_bytes(uint8_t *ptr, size_t len) 821 { 822 return (random_get_bytes_common(ptr, len, urandom_fd)); 823 } 824 825 int 826 ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result) 827 { 828 errno = 0; 829 *result = strtoull(str, nptr, base); 830 if (*result == 0) 831 return (errno); 832 return (0); 833 } 834 835 utsname_t * 836 utsname(void) 837 { 838 return (&hw_utsname); 839 } 840 841 /* 842 * ========================================================================= 843 * kernel emulation setup & teardown 844 * ========================================================================= 845 */ 846 static int 847 umem_out_of_memory(void) 848 { 849 char errmsg[] = "out of memory -- generating core dump\n"; 850 851 (void) fprintf(stderr, "%s", errmsg); 852 abort(); 853 return (0); 854 } 855 856 static void 857 spa_config_load(void) 858 { 859 void *buf = NULL; 860 nvlist_t *nvlist, *child; 861 nvpair_t *nvpair; 862 char *pathname; 863 zfs_file_t *fp; 864 zfs_file_attr_t zfa; 865 uint64_t fsize; 866 int err; 867 868 /* 869 * Open the configuration file. 870 */ 871 pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 872 873 (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path); 874 875 err = zfs_file_open(pathname, O_RDONLY, 0, &fp); 876 if (err) 877 err = zfs_file_open(ZPOOL_CACHE_BOOT, O_RDONLY, 0, &fp); 878 879 kmem_free(pathname, MAXPATHLEN); 880 881 if (err) 882 return; 883 884 if (zfs_file_getattr(fp, &zfa)) 885 goto out; 886 887 fsize = zfa.zfa_size; 888 buf = kmem_alloc(fsize, KM_SLEEP); 889 890 /* 891 * Read the nvlist from the file. 892 */ 893 if (zfs_file_read(fp, buf, fsize, NULL) < 0) 894 goto out; 895 896 /* 897 * Unpack the nvlist. 898 */ 899 if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0) 900 goto out; 901 902 /* 903 * Iterate over all elements in the nvlist, creating a new spa_t for 904 * each one with the specified configuration. 905 */ 906 mutex_enter(&spa_namespace_lock); 907 nvpair = NULL; 908 while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { 909 if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) 910 continue; 911 912 child = fnvpair_value_nvlist(nvpair); 913 914 if (spa_lookup(nvpair_name(nvpair)) != NULL) 915 continue; 916 (void) spa_add(nvpair_name(nvpair), child, NULL); 917 } 918 mutex_exit(&spa_namespace_lock); 919 920 nvlist_free(nvlist); 921 922 out: 923 if (buf != NULL) 924 kmem_free(buf, fsize); 925 926 zfs_file_close(fp); 927 } 928 929 void 930 kernel_init(int mode) 931 { 932 extern uint_t rrw_tsd_key; 933 934 umem_nofail_callback(umem_out_of_memory); 935 936 physmem = sysconf(_SC_PHYS_PAGES); 937 938 dprintf("physmem = %llu pages (%.2f GB)\n", (u_longlong_t)physmem, 939 (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30)); 940 941 hostid = (mode & SPA_MODE_WRITE) ? get_system_hostid() : 0; 942 943 random_init(); 944 945 VERIFY0(uname(&hw_utsname)); 946 947 system_taskq_init(); 948 icp_init(); 949 950 zstd_init(); 951 952 spa_init((spa_mode_t)mode); 953 spa_config_load(); 954 955 fletcher_4_init(); 956 957 tsd_create(&rrw_tsd_key, rrw_tsd_destroy); 958 } 959 960 void 961 kernel_fini(void) 962 { 963 fletcher_4_fini(); 964 spa_fini(); 965 966 zstd_fini(); 967 968 icp_fini(); 969 system_taskq_fini(); 970 971 random_fini(); 972 } 973 974 uid_t 975 crgetuid(cred_t *cr) 976 { 977 (void) cr; 978 return (0); 979 } 980 981 uid_t 982 crgetruid(cred_t *cr) 983 { 984 (void) cr; 985 return (0); 986 } 987 988 gid_t 989 crgetgid(cred_t *cr) 990 { 991 (void) cr; 992 return (0); 993 } 994 995 int 996 crgetngroups(cred_t *cr) 997 { 998 (void) cr; 999 return (0); 1000 } 1001 1002 gid_t * 1003 crgetgroups(cred_t *cr) 1004 { 1005 (void) cr; 1006 return (NULL); 1007 } 1008 1009 int 1010 zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) 1011 { 1012 (void) name, (void) cr; 1013 return (0); 1014 } 1015 1016 int 1017 zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) 1018 { 1019 (void) from, (void) to, (void) cr; 1020 return (0); 1021 } 1022 1023 int 1024 zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) 1025 { 1026 (void) name, (void) cr; 1027 return (0); 1028 } 1029 1030 int 1031 secpolicy_zfs(const cred_t *cr) 1032 { 1033 (void) cr; 1034 return (0); 1035 } 1036 1037 ksiddomain_t * 1038 ksid_lookupdomain(const char *dom) 1039 { 1040 ksiddomain_t *kd; 1041 1042 kd = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL); 1043 kd->kd_name = spa_strdup(dom); 1044 return (kd); 1045 } 1046 1047 void 1048 ksiddomain_rele(ksiddomain_t *ksid) 1049 { 1050 spa_strfree(ksid->kd_name); 1051 umem_free(ksid, sizeof (ksiddomain_t)); 1052 } 1053 1054 char * 1055 kmem_vasprintf(const char *fmt, va_list adx) 1056 { 1057 char *buf = NULL; 1058 va_list adx_copy; 1059 1060 va_copy(adx_copy, adx); 1061 VERIFY(vasprintf(&buf, fmt, adx_copy) != -1); 1062 va_end(adx_copy); 1063 1064 return (buf); 1065 } 1066 1067 char * 1068 kmem_asprintf(const char *fmt, ...) 1069 { 1070 char *buf = NULL; 1071 va_list adx; 1072 1073 va_start(adx, fmt); 1074 VERIFY(vasprintf(&buf, fmt, adx) != -1); 1075 va_end(adx); 1076 1077 return (buf); 1078 } 1079 1080 /* 1081 * kmem_scnprintf() will return the number of characters that it would have 1082 * printed whenever it is limited by value of the size variable, rather than 1083 * the number of characters that it did print. This can cause misbehavior on 1084 * subsequent uses of the return value, so we define a safe version that will 1085 * return the number of characters actually printed, minus the NULL format 1086 * character. Subsequent use of this by the safe string functions is safe 1087 * whether it is snprintf(), strlcat() or strlcpy(). 1088 */ 1089 int 1090 kmem_scnprintf(char *restrict str, size_t size, const char *restrict fmt, ...) 1091 { 1092 int n; 1093 va_list ap; 1094 1095 /* Make the 0 case a no-op so that we do not return -1 */ 1096 if (size == 0) 1097 return (0); 1098 1099 va_start(ap, fmt); 1100 n = vsnprintf(str, size, fmt, ap); 1101 va_end(ap); 1102 1103 if (n >= size) 1104 n = size - 1; 1105 1106 return (n); 1107 } 1108 1109 zfs_file_t * 1110 zfs_onexit_fd_hold(int fd, minor_t *minorp) 1111 { 1112 (void) fd; 1113 *minorp = 0; 1114 return (NULL); 1115 } 1116 1117 void 1118 zfs_onexit_fd_rele(zfs_file_t *fp) 1119 { 1120 (void) fp; 1121 } 1122 1123 int 1124 zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, 1125 uintptr_t *action_handle) 1126 { 1127 (void) minor, (void) func, (void) data, (void) action_handle; 1128 return (0); 1129 } 1130 1131 fstrans_cookie_t 1132 spl_fstrans_mark(void) 1133 { 1134 return ((fstrans_cookie_t)0); 1135 } 1136 1137 void 1138 spl_fstrans_unmark(fstrans_cookie_t cookie) 1139 { 1140 (void) cookie; 1141 } 1142 1143 int 1144 kmem_cache_reap_active(void) 1145 { 1146 return (0); 1147 } 1148 1149 void 1150 zvol_create_minors(const char *name) 1151 { 1152 (void) name; 1153 } 1154 1155 void 1156 zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) 1157 { 1158 (void) spa, (void) name, (void) async; 1159 } 1160 1161 void 1162 zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname, 1163 boolean_t async) 1164 { 1165 (void) spa, (void) oldname, (void) newname, (void) async; 1166 } 1167 1168 /* 1169 * Open file 1170 * 1171 * path - fully qualified path to file 1172 * flags - file attributes O_READ / O_WRITE / O_EXCL 1173 * fpp - pointer to return file pointer 1174 * 1175 * Returns 0 on success underlying error on failure. 1176 */ 1177 int 1178 zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp) 1179 { 1180 int fd; 1181 int dump_fd; 1182 int err; 1183 int old_umask = 0; 1184 zfs_file_t *fp; 1185 struct stat64 st; 1186 1187 if (!(flags & O_CREAT) && stat64(path, &st) == -1) 1188 return (errno); 1189 1190 if (!(flags & O_CREAT) && S_ISBLK(st.st_mode)) 1191 flags |= O_DIRECT; 1192 1193 if (flags & O_CREAT) 1194 old_umask = umask(0); 1195 1196 fd = open64(path, flags, mode); 1197 if (fd == -1) 1198 return (errno); 1199 1200 if (flags & O_CREAT) 1201 (void) umask(old_umask); 1202 1203 if (vn_dumpdir != NULL) { 1204 char *dumppath = umem_zalloc(MAXPATHLEN, UMEM_NOFAIL); 1205 const char *inpath = zfs_basename(path); 1206 1207 (void) snprintf(dumppath, MAXPATHLEN, 1208 "%s/%s", vn_dumpdir, inpath); 1209 dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666); 1210 umem_free(dumppath, MAXPATHLEN); 1211 if (dump_fd == -1) { 1212 err = errno; 1213 close(fd); 1214 return (err); 1215 } 1216 } else { 1217 dump_fd = -1; 1218 } 1219 1220 (void) fcntl(fd, F_SETFD, FD_CLOEXEC); 1221 1222 fp = umem_zalloc(sizeof (zfs_file_t), UMEM_NOFAIL); 1223 fp->f_fd = fd; 1224 fp->f_dump_fd = dump_fd; 1225 *fpp = fp; 1226 1227 return (0); 1228 } 1229 1230 void 1231 zfs_file_close(zfs_file_t *fp) 1232 { 1233 close(fp->f_fd); 1234 if (fp->f_dump_fd != -1) 1235 close(fp->f_dump_fd); 1236 1237 umem_free(fp, sizeof (zfs_file_t)); 1238 } 1239 1240 /* 1241 * Stateful write - use os internal file pointer to determine where to 1242 * write and update on successful completion. 1243 * 1244 * fp - pointer to file (pipe, socket, etc) to write to 1245 * buf - buffer to write 1246 * count - # of bytes to write 1247 * resid - pointer to count of unwritten bytes (if short write) 1248 * 1249 * Returns 0 on success errno on failure. 1250 */ 1251 int 1252 zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) 1253 { 1254 ssize_t rc; 1255 1256 rc = write(fp->f_fd, buf, count); 1257 if (rc < 0) 1258 return (errno); 1259 1260 if (resid) { 1261 *resid = count - rc; 1262 } else if (rc != count) { 1263 return (EIO); 1264 } 1265 1266 return (0); 1267 } 1268 1269 /* 1270 * Stateless write - os internal file pointer is not updated. 1271 * 1272 * fp - pointer to file (pipe, socket, etc) to write to 1273 * buf - buffer to write 1274 * count - # of bytes to write 1275 * off - file offset to write to (only valid for seekable types) 1276 * resid - pointer to count of unwritten bytes 1277 * 1278 * Returns 0 on success errno on failure. 1279 */ 1280 int 1281 zfs_file_pwrite(zfs_file_t *fp, const void *buf, 1282 size_t count, loff_t pos, uint8_t ashift, ssize_t *resid) 1283 { 1284 ssize_t rc, split, done; 1285 int sectors; 1286 1287 /* 1288 * To simulate partial disk writes, we split writes into two 1289 * system calls so that the process can be killed in between. 1290 * This is used by ztest to simulate realistic failure modes. 1291 */ 1292 sectors = count >> ashift; 1293 split = (sectors > 0 ? rand() % sectors : 0) << ashift; 1294 rc = pwrite64(fp->f_fd, buf, split, pos); 1295 if (rc != -1) { 1296 done = rc; 1297 rc = pwrite64(fp->f_fd, (char *)buf + split, 1298 count - split, pos + split); 1299 } 1300 #ifdef __linux__ 1301 if (rc == -1 && errno == EINVAL) { 1302 /* 1303 * Under Linux, this most likely means an alignment issue 1304 * (memory or disk) due to O_DIRECT, so we abort() in order 1305 * to catch the offender. 1306 */ 1307 abort(); 1308 } 1309 #endif 1310 1311 if (rc < 0) 1312 return (errno); 1313 1314 done += rc; 1315 1316 if (resid) { 1317 *resid = count - done; 1318 } else if (done != count) { 1319 return (EIO); 1320 } 1321 1322 return (0); 1323 } 1324 1325 /* 1326 * Stateful read - use os internal file pointer to determine where to 1327 * read and update on successful completion. 1328 * 1329 * fp - pointer to file (pipe, socket, etc) to read from 1330 * buf - buffer to write 1331 * count - # of bytes to read 1332 * resid - pointer to count of unread bytes (if short read) 1333 * 1334 * Returns 0 on success errno on failure. 1335 */ 1336 int 1337 zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid) 1338 { 1339 int rc; 1340 1341 rc = read(fp->f_fd, buf, count); 1342 if (rc < 0) 1343 return (errno); 1344 1345 if (resid) { 1346 *resid = count - rc; 1347 } else if (rc != count) { 1348 return (EIO); 1349 } 1350 1351 return (0); 1352 } 1353 1354 /* 1355 * Stateless read - os internal file pointer is not updated. 1356 * 1357 * fp - pointer to file (pipe, socket, etc) to read from 1358 * buf - buffer to write 1359 * count - # of bytes to write 1360 * off - file offset to read from (only valid for seekable types) 1361 * resid - pointer to count of unwritten bytes (if short write) 1362 * 1363 * Returns 0 on success errno on failure. 1364 */ 1365 int 1366 zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off, 1367 ssize_t *resid) 1368 { 1369 ssize_t rc; 1370 1371 rc = pread64(fp->f_fd, buf, count, off); 1372 if (rc < 0) { 1373 #ifdef __linux__ 1374 /* 1375 * Under Linux, this most likely means an alignment issue 1376 * (memory or disk) due to O_DIRECT, so we abort() in order to 1377 * catch the offender. 1378 */ 1379 if (errno == EINVAL) 1380 abort(); 1381 #endif 1382 return (errno); 1383 } 1384 1385 if (fp->f_dump_fd != -1) { 1386 int status; 1387 1388 status = pwrite64(fp->f_dump_fd, buf, rc, off); 1389 ASSERT(status != -1); 1390 } 1391 1392 if (resid) { 1393 *resid = count - rc; 1394 } else if (rc != count) { 1395 return (EIO); 1396 } 1397 1398 return (0); 1399 } 1400 1401 /* 1402 * lseek - set / get file pointer 1403 * 1404 * fp - pointer to file (pipe, socket, etc) to read from 1405 * offp - value to seek to, returns current value plus passed offset 1406 * whence - see man pages for standard lseek whence values 1407 * 1408 * Returns 0 on success errno on failure (ESPIPE for non seekable types) 1409 */ 1410 int 1411 zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence) 1412 { 1413 loff_t rc; 1414 1415 rc = lseek(fp->f_fd, *offp, whence); 1416 if (rc < 0) 1417 return (errno); 1418 1419 *offp = rc; 1420 1421 return (0); 1422 } 1423 1424 /* 1425 * Get file attributes 1426 * 1427 * filp - file pointer 1428 * zfattr - pointer to file attr structure 1429 * 1430 * Currently only used for fetching size and file mode 1431 * 1432 * Returns 0 on success or error code of underlying getattr call on failure. 1433 */ 1434 int 1435 zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr) 1436 { 1437 struct stat64 st; 1438 1439 if (fstat64_blk(fp->f_fd, &st) == -1) 1440 return (errno); 1441 1442 zfattr->zfa_size = st.st_size; 1443 zfattr->zfa_mode = st.st_mode; 1444 1445 return (0); 1446 } 1447 1448 /* 1449 * Sync file to disk 1450 * 1451 * filp - file pointer 1452 * flags - O_SYNC and or O_DSYNC 1453 * 1454 * Returns 0 on success or error code of underlying sync call on failure. 1455 */ 1456 int 1457 zfs_file_fsync(zfs_file_t *fp, int flags) 1458 { 1459 (void) flags; 1460 1461 if (fsync(fp->f_fd) < 0) 1462 return (errno); 1463 1464 return (0); 1465 } 1466 1467 /* 1468 * deallocate - zero and/or deallocate file storage 1469 * 1470 * fp - file pointer 1471 * offset - offset to start zeroing or deallocating 1472 * len - length to zero or deallocate 1473 */ 1474 int 1475 zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len) 1476 { 1477 int rc; 1478 #if defined(__linux__) 1479 rc = fallocate(fp->f_fd, 1480 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len); 1481 #elif defined(__FreeBSD__) && (__FreeBSD_version >= 1400029) 1482 struct spacectl_range rqsr = { 1483 .r_offset = offset, 1484 .r_len = len, 1485 }; 1486 rc = fspacectl(fp->f_fd, SPACECTL_DEALLOC, &rqsr, 0, &rqsr); 1487 #else 1488 (void) fp, (void) offset, (void) len; 1489 rc = EOPNOTSUPP; 1490 #endif 1491 if (rc) 1492 return (SET_ERROR(rc)); 1493 return (0); 1494 } 1495 1496 /* 1497 * Request current file pointer offset 1498 * 1499 * fp - pointer to file 1500 * 1501 * Returns current file offset. 1502 */ 1503 loff_t 1504 zfs_file_off(zfs_file_t *fp) 1505 { 1506 return (lseek(fp->f_fd, SEEK_CUR, 0)); 1507 } 1508 1509 /* 1510 * unlink file 1511 * 1512 * path - fully qualified file path 1513 * 1514 * Returns 0 on success. 1515 * 1516 * OPTIONAL 1517 */ 1518 int 1519 zfs_file_unlink(const char *path) 1520 { 1521 return (remove(path)); 1522 } 1523 1524 /* 1525 * Get reference to file pointer 1526 * 1527 * fd - input file descriptor 1528 * 1529 * Returns pointer to file struct or NULL. 1530 * Unsupported in user space. 1531 */ 1532 zfs_file_t * 1533 zfs_file_get(int fd) 1534 { 1535 (void) fd; 1536 abort(); 1537 return (NULL); 1538 } 1539 /* 1540 * Drop reference to file pointer 1541 * 1542 * fp - pointer to file struct 1543 * 1544 * Unsupported in user space. 1545 */ 1546 void 1547 zfs_file_put(zfs_file_t *fp) 1548 { 1549 abort(); 1550 (void) fp; 1551 } 1552 1553 void 1554 zfsvfs_update_fromname(const char *oldname, const char *newname) 1555 { 1556 (void) oldname, (void) newname; 1557 } 1558 1559 void 1560 spa_import_os(spa_t *spa) 1561 { 1562 (void) spa; 1563 } 1564 1565 void 1566 spa_export_os(spa_t *spa) 1567 { 1568 (void) spa; 1569 } 1570 1571 void 1572 spa_activate_os(spa_t *spa) 1573 { 1574 (void) spa; 1575 } 1576 1577 void 1578 spa_deactivate_os(spa_t *spa) 1579 { 1580 (void) spa; 1581 } 1582