1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 24 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 25 * Copyright 2017 RackTop Systems. 26 */ 27 28 #include <assert.h> 29 #include <fcntl.h> 30 #include <poll.h> 31 #include <stdio.h> 32 #include <stdlib.h> 33 #include <string.h> 34 #include <zlib.h> 35 #include <libgen.h> 36 #include <sys/spa.h> 37 #include <sys/stat.h> 38 #include <sys/processor.h> 39 #include <sys/zfs_context.h> 40 #include <sys/rrwlock.h> 41 #include <sys/zmod.h> 42 #include <sys/utsname.h> 43 #include <sys/systeminfo.h> 44 45 extern void system_taskq_init(void); 46 extern void system_taskq_fini(void); 47 48 /* 49 * Emulation of kernel services in userland. 50 */ 51 52 pgcnt_t physmem; 53 vnode_t *rootdir = (vnode_t *)0xabcd1234; 54 char hw_serial[HW_HOSTID_LEN]; 55 kmutex_t cpu_lock; 56 vmem_t *zio_arena = NULL; 57 58 /* If set, all blocks read will be copied to the specified directory. */ 59 char *vn_dumpdir = NULL; 60 61 struct utsname utsname = { 62 "userland", "libzpool", "1", "1", "na" 63 }; 64 65 /* 66 * ========================================================================= 67 * vnode operations 68 * ========================================================================= 69 */ 70 /* 71 * Note: for the xxxat() versions of these functions, we assume that the 72 * starting vp is always rootdir (which is true for spa_directory.c, the only 73 * ZFS consumer of these interfaces). We assert this is true, and then emulate 74 * them by adding '/' in front of the path. 75 */ 76 77 /*ARGSUSED*/ 78 int 79 vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3) 80 { 81 int fd; 82 int dump_fd; 83 vnode_t *vp; 84 int old_umask; 85 char realpath[MAXPATHLEN]; 86 struct stat64 st; 87 88 /* 89 * If we're accessing a real disk from userland, we need to use 90 * the character interface to avoid caching. This is particularly 91 * important if we're trying to look at a real in-kernel storage 92 * pool from userland, e.g. via zdb, because otherwise we won't 93 * see the changes occurring under the segmap cache. 94 * On the other hand, the stupid character device returns zero 95 * for its size. So -- gag -- we open the block device to get 96 * its size, and remember it for subsequent VOP_GETATTR(). 97 */ 98 if (strncmp(path, "/dev/", 5) == 0) { 99 char *dsk; 100 fd = open64(path, O_RDONLY); 101 if (fd == -1) 102 return (errno); 103 if (fstat64(fd, &st) == -1) { 104 close(fd); 105 return (errno); 106 } 107 close(fd); 108 (void) sprintf(realpath, "%s", path); 109 dsk = strstr(path, "/dsk/"); 110 if (dsk != NULL) 111 (void) sprintf(realpath + (dsk - path) + 1, "r%s", 112 dsk + 1); 113 } else { 114 (void) sprintf(realpath, "%s", path); 115 if (!(flags & FCREAT) && stat64(realpath, &st) == -1) 116 return (errno); 117 } 118 119 if (flags & FCREAT) 120 old_umask = umask(0); 121 122 /* 123 * The construct 'flags - FREAD' conveniently maps combinations of 124 * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR. 125 */ 126 fd = open64(realpath, flags - FREAD, mode); 127 128 if (flags & FCREAT) 129 (void) umask(old_umask); 130 131 if (vn_dumpdir != NULL) { 132 char dumppath[MAXPATHLEN]; 133 (void) snprintf(dumppath, sizeof (dumppath), 134 "%s/%s", vn_dumpdir, basename(realpath)); 135 dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666); 136 if (dump_fd == -1) 137 return (errno); 138 } else { 139 dump_fd = -1; 140 } 141 142 if (fd == -1) 143 return (errno); 144 145 if (fstat64(fd, &st) == -1) { 146 close(fd); 147 return (errno); 148 } 149 150 (void) fcntl(fd, F_SETFD, FD_CLOEXEC); 151 152 *vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL); 153 154 vp->v_fd = fd; 155 vp->v_size = st.st_size; 156 vp->v_path = spa_strdup(path); 157 vp->v_dump_fd = dump_fd; 158 159 return (0); 160 } 161 162 /*ARGSUSED*/ 163 int 164 vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, 165 int x3, vnode_t *startvp, int fd) 166 { 167 char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL); 168 int ret; 169 170 ASSERT(startvp == rootdir); 171 (void) sprintf(realpath, "/%s", path); 172 173 /* fd ignored for now, need if want to simulate nbmand support */ 174 ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3); 175 176 umem_free(realpath, strlen(path) + 2); 177 178 return (ret); 179 } 180 181 /*ARGSUSED*/ 182 int 183 vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset, 184 int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp) 185 { 186 ssize_t iolen, split; 187 188 if (uio == UIO_READ) { 189 iolen = pread64(vp->v_fd, addr, len, offset); 190 if (vp->v_dump_fd != -1) { 191 int status = 192 pwrite64(vp->v_dump_fd, addr, iolen, offset); 193 ASSERT(status != -1); 194 } 195 } else { 196 /* 197 * To simulate partial disk writes, we split writes into two 198 * system calls so that the process can be killed in between. 199 */ 200 int sectors = len >> SPA_MINBLOCKSHIFT; 201 split = (sectors > 0 ? rand() % sectors : 0) << 202 SPA_MINBLOCKSHIFT; 203 iolen = pwrite64(vp->v_fd, addr, split, offset); 204 iolen += pwrite64(vp->v_fd, (char *)addr + split, 205 len - split, offset + split); 206 } 207 208 if (iolen == -1) 209 return (errno); 210 if (residp) 211 *residp = len - iolen; 212 else if (iolen != len) 213 return (EIO); 214 return (0); 215 } 216 217 void 218 vn_close(vnode_t *vp) 219 { 220 close(vp->v_fd); 221 if (vp->v_dump_fd != -1) 222 close(vp->v_dump_fd); 223 spa_strfree(vp->v_path); 224 umem_free(vp, sizeof (vnode_t)); 225 } 226 227 /* 228 * At a minimum we need to update the size since vdev_reopen() 229 * will no longer call vn_openat(). 230 */ 231 int 232 fop_getattr(vnode_t *vp, vattr_t *vap) 233 { 234 struct stat64 st; 235 236 if (fstat64(vp->v_fd, &st) == -1) { 237 close(vp->v_fd); 238 return (errno); 239 } 240 241 vap->va_size = st.st_size; 242 return (0); 243 } 244 245 #ifdef ZFS_DEBUG 246 247 /* 248 * ========================================================================= 249 * Figure out which debugging statements to print 250 * ========================================================================= 251 */ 252 253 static char *dprintf_string; 254 static int dprintf_print_all; 255 256 int 257 dprintf_find_string(const char *string) 258 { 259 char *tmp_str = dprintf_string; 260 int len = strlen(string); 261 262 /* 263 * Find out if this is a string we want to print. 264 * String format: file1.c,function_name1,file2.c,file3.c 265 */ 266 267 while (tmp_str != NULL) { 268 if (strncmp(tmp_str, string, len) == 0 && 269 (tmp_str[len] == ',' || tmp_str[len] == '\0')) 270 return (1); 271 tmp_str = strchr(tmp_str, ','); 272 if (tmp_str != NULL) 273 tmp_str++; /* Get rid of , */ 274 } 275 return (0); 276 } 277 278 void 279 dprintf_setup(int *argc, char **argv) 280 { 281 int i, j; 282 283 /* 284 * Debugging can be specified two ways: by setting the 285 * environment variable ZFS_DEBUG, or by including a 286 * "debug=..." argument on the command line. The command 287 * line setting overrides the environment variable. 288 */ 289 290 for (i = 1; i < *argc; i++) { 291 int len = strlen("debug="); 292 /* First look for a command line argument */ 293 if (strncmp("debug=", argv[i], len) == 0) { 294 dprintf_string = argv[i] + len; 295 /* Remove from args */ 296 for (j = i; j < *argc; j++) 297 argv[j] = argv[j+1]; 298 argv[j] = NULL; 299 (*argc)--; 300 } 301 } 302 303 if (dprintf_string == NULL) { 304 /* Look for ZFS_DEBUG environment variable */ 305 dprintf_string = getenv("ZFS_DEBUG"); 306 } 307 308 /* 309 * Are we just turning on all debugging? 310 */ 311 if (dprintf_find_string("on")) 312 dprintf_print_all = 1; 313 314 if (dprintf_string != NULL) 315 zfs_flags |= ZFS_DEBUG_DPRINTF; 316 } 317 318 /* 319 * ========================================================================= 320 * debug printfs 321 * ========================================================================= 322 */ 323 void 324 __dprintf(const char *file, const char *func, int line, const char *fmt, ...) 325 { 326 const char *newfile; 327 va_list adx; 328 329 /* 330 * Get rid of annoying "../common/" prefix to filename. 331 */ 332 newfile = strrchr(file, '/'); 333 if (newfile != NULL) { 334 newfile = newfile + 1; /* Get rid of leading / */ 335 } else { 336 newfile = file; 337 } 338 339 if (dprintf_print_all || 340 dprintf_find_string(newfile) || 341 dprintf_find_string(func)) { 342 /* Print out just the function name if requested */ 343 flockfile(stdout); 344 if (dprintf_find_string("pid")) 345 (void) printf("%d ", getpid()); 346 if (dprintf_find_string("tid")) 347 (void) printf("%u ", thr_self()); 348 if (dprintf_find_string("cpu")) 349 (void) printf("%u ", getcpuid()); 350 if (dprintf_find_string("time")) 351 (void) printf("%llu ", gethrtime()); 352 if (dprintf_find_string("long")) 353 (void) printf("%s, line %d: ", newfile, line); 354 (void) printf("%s: ", func); 355 va_start(adx, fmt); 356 (void) vprintf(fmt, adx); 357 va_end(adx); 358 funlockfile(stdout); 359 } 360 } 361 362 #endif /* ZFS_DEBUG */ 363 364 /* 365 * ========================================================================= 366 * kobj interfaces 367 * ========================================================================= 368 */ 369 struct _buf * 370 kobj_open_file(char *name) 371 { 372 struct _buf *file; 373 vnode_t *vp; 374 375 /* set vp as the _fd field of the file */ 376 if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir, 377 -1) != 0) 378 return ((void *)-1UL); 379 380 file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL); 381 file->_fd = (intptr_t)vp; 382 return (file); 383 } 384 385 int 386 kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off) 387 { 388 ssize_t resid; 389 390 vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off, 391 UIO_SYSSPACE, 0, 0, 0, &resid); 392 393 return (size - resid); 394 } 395 396 void 397 kobj_close_file(struct _buf *file) 398 { 399 vn_close((vnode_t *)file->_fd); 400 umem_free(file, sizeof (struct _buf)); 401 } 402 403 int 404 kobj_get_filesize(struct _buf *file, uint64_t *size) 405 { 406 struct stat64 st; 407 vnode_t *vp = (vnode_t *)file->_fd; 408 409 if (fstat64(vp->v_fd, &st) == -1) { 410 vn_close(vp); 411 return (errno); 412 } 413 *size = st.st_size; 414 return (0); 415 } 416 417 /* 418 * ========================================================================= 419 * kernel emulation setup & teardown 420 * ========================================================================= 421 */ 422 static int 423 umem_out_of_memory(void) 424 { 425 char errmsg[] = "out of memory -- generating core dump\n"; 426 427 write(fileno(stderr), errmsg, sizeof (errmsg)); 428 abort(); 429 return (0); 430 } 431 432 void 433 kernel_init(int mode) 434 { 435 extern uint_t rrw_tsd_key; 436 437 umem_nofail_callback(umem_out_of_memory); 438 439 physmem = sysconf(_SC_PHYS_PAGES); 440 441 dprintf("physmem = %llu pages (%.2f GB)\n", physmem, 442 (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30)); 443 444 (void) snprintf(hw_serial, sizeof (hw_serial), "%ld", 445 (mode & FWRITE) ? gethostid() : 0); 446 447 system_taskq_init(); 448 449 mutex_init(&cpu_lock, NULL, MUTEX_DEFAULT, NULL); 450 451 spa_init(mode); 452 453 tsd_create(&rrw_tsd_key, rrw_tsd_destroy); 454 } 455 456 void 457 kernel_fini(void) 458 { 459 spa_fini(); 460 461 system_taskq_fini(); 462 } 463 464 /* ARGSUSED */ 465 uint32_t 466 zone_get_hostid(void *zonep) 467 { 468 /* 469 * We're emulating the system's hostid in userland. 470 */ 471 return (strtoul(hw_serial, NULL, 10)); 472 } 473 474 int 475 z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen) 476 { 477 int ret; 478 uLongf len = *dstlen; 479 480 if ((ret = uncompress(dst, &len, src, srclen)) == Z_OK) 481 *dstlen = (size_t)len; 482 483 return (ret); 484 } 485 486 int 487 z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen, 488 int level) 489 { 490 int ret; 491 uLongf len = *dstlen; 492 493 if ((ret = compress2(dst, &len, src, srclen, level)) == Z_OK) 494 *dstlen = (size_t)len; 495 496 return (ret); 497 } 498 499 int 500 zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) 501 { 502 return (0); 503 } 504 505 int 506 zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) 507 { 508 return (0); 509 } 510 511 int 512 zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) 513 { 514 return (0); 515 } 516 517 /* ARGSUSED */ 518 int 519 zfs_onexit_fd_hold(int fd, minor_t *minorp) 520 { 521 *minorp = 0; 522 return (0); 523 } 524 525 /* ARGSUSED */ 526 void 527 zfs_onexit_fd_rele(int fd) 528 { 529 } 530 531 /* ARGSUSED */ 532 int 533 zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, 534 uint64_t *action_handle) 535 { 536 return (0); 537 } 538 539 /* ARGSUSED */ 540 int 541 zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire) 542 { 543 return (0); 544 } 545 546 /* ARGSUSED */ 547 int 548 zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data) 549 { 550 return (0); 551 } 552 553 void 554 bioinit(buf_t *bp) 555 { 556 bzero(bp, sizeof (buf_t)); 557 } 558 559 void 560 biodone(buf_t *bp) 561 { 562 if (bp->b_iodone != NULL) { 563 (*(bp->b_iodone))(bp); 564 return; 565 } 566 ASSERT((bp->b_flags & B_DONE) == 0); 567 bp->b_flags |= B_DONE; 568 } 569 570 void 571 bioerror(buf_t *bp, int error) 572 { 573 ASSERT(bp != NULL); 574 ASSERT(error >= 0); 575 576 if (error != 0) { 577 bp->b_flags |= B_ERROR; 578 } else { 579 bp->b_flags &= ~B_ERROR; 580 } 581 bp->b_error = error; 582 } 583 584 585 int 586 geterror(struct buf *bp) 587 { 588 int error = 0; 589 590 if (bp->b_flags & B_ERROR) { 591 error = bp->b_error; 592 if (!error) 593 error = EIO; 594 } 595 return (error); 596 } 597