1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <errno.h> 28 #include <fcntl.h> 29 #include <dirent.h> 30 #include <stddef.h> 31 #include <stdio.h> 32 #include <stdlib.h> 33 #include <strings.h> 34 #include <unistd.h> 35 #include <thread.h> 36 #include <sys/auxv.h> 37 #include <sys/brand.h> 38 #include <sys/inttypes.h> 39 #include <sys/lwp.h> 40 #include <sys/syscall.h> 41 #include <sys/systm.h> 42 #include <sys/utsname.h> 43 #include <sys/sysconfig.h> 44 #include <sys/systeminfo.h> 45 #include <sys/zone.h> 46 #include <sys/stat.h> 47 #include <sys/mntent.h> 48 #include <sys/ctfs.h> 49 #include <sys/priv.h> 50 #include <sys/acctctl.h> 51 #include <libgen.h> 52 #include <bsm/audit.h> 53 #include <sys/crypto/ioctl.h> 54 #include <sys/fs/zfs.h> 55 #include <sys/zfs_ioctl.h> 56 #include <sys/ucontext.h> 57 #include <sys/mntio.h> 58 #include <sys/mnttab.h> 59 #include <sys/attr.h> 60 #include <atomic.h> 61 62 #include <s10_brand.h> 63 #include <s10_misc.h> 64 #include <s10_signal.h> 65 66 /* 67 * Principles of emulation 101. 
 *
 *
 * *** Setting errno
 *
 * Just don't do it.  This emulation library is loaded onto a
 * separate link map from the application whose address space we're
 * running in.  We have our own private copy of libc, and therefore
 * the errno value accessible from here is also private and changing
 * it will not affect any errno value that the process whose address
 * space we are running in will see.  To return an error condition we
 * should return the negated errno value we'd like the system to return.
 * For more information about this see the comment in s10_handler().
 * Basically, when we return to the caller that initiated the system
 * call it's their responsibility to set errno.
 *
 *
 * *** Recursion Considerations
 *
 * When emulating system calls we need to be very careful about what
 * library calls we invoke.  Library calls should be kept to a minimum.
 * One issue is that library calls can invoke system calls, so if we're
 * emulating a system call and we invoke a library call that depends on
 * that system call we will probably enter a recursive loop, which would
 * be bad.
 *
 *
 * *** Return Values.
 *
 * When declaring new syscall emulation functions, it is very important
 * to set the proper RV_* flags in the s10_sysent_table.  Upon failure,
 * syscall emulation functions should return an errno value.  Upon success
 * syscall emulation functions should return 0 and set the sysret_t return
 * value parameters accordingly.
 *
 * There are five possible syscall macro wrappers used in the kernel's system
 * call sysent table.
 * These turn into the following return values:
 *	SYSENT_CL	-> SYSENT_C or SYSENT_CI
 *	SYSENT_C	SE_64RVAL		RV_DEFAULT
 *	SYSENT_CI	SE_32RVAL1		RV_DEFAULT
 *	SYSENT_2CI	SE_32RVAL1|SE_32RVAL2	RV_32RVAL2
 *	SYSENT_AP	SE_64RVAL		RV_64RVAL
 *
 *
 * *** Agent lwp considerations
 *
 * It is currently impossible to do any emulation for these system calls
 * when they are being invoked on behalf of an agent lwp.  To understand why
 * it's impossible you have to understand how agent lwp syscalls work.
 *
 * The agent lwp syscall process works as follows:
 *   1  The controlling process stops the target.
 *   2  The controlling process injects an agent lwp which is also stopped.
 *      This agent lwp assumes the userland stack and register values
 *      of another stopped lwp in the current process.
 *   3  The controlling process configures the agent lwp to start
 *      executing the requested system call.
 *   4  The controlling process configures /proc to stop the agent lwp when
 *      it enters the requested system call.
 *   5  The controlling process allows the agent lwp to start executing.
 *   6  The agent lwp traps into the kernel to perform the requested system
 *      call and immediately stops.
 *   7  The controlling process copies all the arguments for the requested
 *      system call onto the agent lwp's stack.
 *   8  The controlling process configures /proc to stop the agent lwp
 *      when it completes the requested system call.
 *   9  The controlling process allows the agent lwp to start executing.
 *  10  The agent lwp executes the system call and then stops before
 *      returning to userland.
 *  11  The controlling process copies the return value and return arguments
 *      back from the agent lwp's stack.
 *  12  The controlling process destroys the agent lwp and restarts
 *      the target process.
 *
 * The fundamental problem is that when the agent executes the requested
 * system call in step 5, if we're emulating that system call then the
 * lwp is redirected back to our emulation layer without blocking
 * in the kernel.  But our emulation layer can't access the arguments
 * for the system call because they haven't been copied to the stack
 * yet and they still only exist in the controlling process's address
 * space.  This prevents us from being able to do any emulation of
 * agent lwp system calls.  Hence, currently our brand trap interposition
 * callback (s10_brand_syscall_callback_common) will detect if a system
 * call is being made by an agent lwp, and if this is the case it will
 * never redirect the system call to this emulation library.
 *
 * In the future, if this proves to be a problem the easiest solution
 * would probably be to replace the branded versions of these applications
 * with their native counterparts.  Ie, truss, plimit, and pfiles could be
 * replaced with wrapper scripts that execute the native versions of these
 * applications.  In the case of plimit and pfiles this should be pretty
 * straightforward.  Truss would probably be more tricky since it can
 * execute applications which would be branded applications, so in that
 * case it might be necessary to create a loadable library which could
 * be LD_PRELOADed into truss and this library would interpose on the
 * exec() system call to allow truss to correctly execute branded
 * processes.  It should be pointed out that this solution could work
 * because "native agent lwps" (ie, agent lwps created by native
 * processes) can be treated differently from "branded agent lwps" (ie,
 * agent lwps created by branded processes), since native agent lwps
 * would presumably be making native system calls and hence not need
 * any interposition.
169 * 170 */ 171 172 static zoneid_t zoneid; 173 static boolean_t emul_global_zone = B_FALSE; 174 static s10_emul_bitmap_t emul_bitmap; 175 pid_t zone_init_pid; 176 177 /* 178 * S10_FEATURE_IS_PRESENT is a macro that helps facilitate conditional 179 * emulation. For each constant N defined in the s10_emulated_features 180 * enumeration in usr/src/uts/common/brand/solaris10/s10_brand.h, 181 * S10_FEATURE_IS_PRESENT(N) is true iff the feature/backport represented by N 182 * is present in the Solaris 10 image hosted within the zone. In other words, 183 * S10_FEATURE_IS_PRESENT(N) is true iff the file /usr/lib/brand/solaris10/M, 184 * where M is the enum value of N, was present in the zone when the zone booted. 185 * 186 * 187 * *** Sample Usage 188 * 189 * Suppose that you need to backport a fix to Solaris 10 and there is 190 * emulation in place for the fix. Suppose further that the emulation won't be 191 * needed if the fix is backported (i.e., if the fix is present in the hosted 192 * Solaris 10 environment, then the brand won't need the emulation). Then if 193 * you add a constant named "S10_FEATURE_X" to the end of the 194 * s10_emulated_features enumeration that represents the backported fix and 195 * S10_FEATURE_X evaluates to four, then you should create a file named 196 * /usr/lib/brand/solaris10/4 as part of your backport. Additionally, you 197 * should retain the aforementioned emulation but modify it so that it's 198 * performed only when S10_FEATURE_IS_PRESENT(S10_FEATURE_X) is false. Thus the 199 * emulation function should look something like the following: 200 * 201 * static int 202 * my_emul_function(sysret_t *rv, ...) 203 * { 204 * if (S10_FEATURE_IS_PRESENT(S10_FEATURE_X)) { 205 * // Don't emulate 206 * return (__systemcall(rv, ...)); 207 * } else { 208 * // Emulate whatever needs to be emulated when the 209 * // backport isn't present in the Solaris 10 image. 
210 * } 211 * } 212 */ 213 #define S10_FEATURE_IS_PRESENT(s10_emulated_features_constant) \ 214 ((emul_bitmap[(s10_emulated_features_constant) >> 3] & \ 215 (1 << ((s10_emulated_features_constant) & 0x7))) != 0) 216 217 #define EMULATE(cb, args) { (sysent_cb_t)(cb), (args) } 218 #define NOSYS EMULATE(s10_unimpl, (0 | RV_DEFAULT)) 219 220 typedef long (*sysent_cb_t)(); 221 typedef struct s10_sysent_table { 222 sysent_cb_t st_callc; 223 uintptr_t st_args; 224 } s10_sysent_table_t; 225 s10_sysent_table_t s10_sysent_table[]; 226 227 #define S10_UTS_RELEASE "5.10" 228 #define S10_UTS_VERSION "Generic_Virtual" 229 230 /*LINTED: static unused*/ 231 static volatile int s10_abort_err; 232 /*LINTED: static unused*/ 233 static volatile const char *s10_abort_msg; 234 /*LINTED: static unused*/ 235 static volatile const char *s10_abort_file; 236 /*LINTED: static unused*/ 237 static volatile int s10_abort_line; 238 239 extern int errno; 240 241 /*ARGSUSED*/ 242 void 243 _s10_abort(int err, const char *msg, const char *file, int line) 244 { 245 sysret_t rval; 246 247 /* Save the error message into convenient globals */ 248 s10_abort_err = err; 249 s10_abort_msg = msg; 250 s10_abort_file = file; 251 s10_abort_line = line; 252 253 /* kill ourselves */ 254 abort(); 255 256 /* If abort() didn't work, try something stronger. */ 257 (void) __systemcall(&rval, SYS_lwp_kill + 1024, _lwp_self(), SIGKILL); 258 } 259 260 int 261 s10_uucopy(const void *from, void *to, size_t size) 262 { 263 sysret_t rval; 264 265 if (__systemcall(&rval, SYS_uucopy + 1024, from, to, size) != 0) 266 return (EFAULT); 267 return (0); 268 } 269 270 /* 271 * ATTENTION: uucopystr() does NOT ensure that string are null terminated! 272 */ 273 int 274 s10_uucopystr(const void *from, void *to, size_t size) 275 { 276 sysret_t rval; 277 278 if (__systemcall(&rval, SYS_uucopystr + 1024, from, to, size) != 0) 279 return (EFAULT); 280 return (0); 281 } 282 283 /* 284 * Figures out the PID of init for the zone. 
Also returns a boolean 285 * indicating whether this process currently has that pid: if so, 286 * then at this moment, we are init. 287 */ 288 static boolean_t 289 get_initpid_info(void) 290 { 291 pid_t pid; 292 sysret_t rval; 293 int err; 294 295 /* 296 * Determine the current process PID and the PID of the zone's init. 297 * We use care not to call getpid() here, because we're not supposed 298 * to call getpid() until after the program is fully linked-- the 299 * first call to getpid() is a signal from the linker to debuggers 300 * that linking has been completed. 301 */ 302 if ((err = __systemcall(&rval, SYS_brand, 303 B_S10_PIDINFO, &pid, &zone_init_pid)) != 0) { 304 s10_abort(err, "Failed to get init's pid"); 305 } 306 307 /* 308 * Note that we need to be cautious with the pid we get back-- 309 * it should not be stashed and used in place of getpid(), since 310 * we might fork(2). So we keep zone_init_pid and toss the pid 311 * we otherwise got. 312 */ 313 if (pid == zone_init_pid) 314 return (B_TRUE); 315 316 return (B_FALSE); 317 } 318 319 /* 320 * This function is defined to be NOSYS but it won't be called from the 321 * the kernel since the NOSYS system calls are not enabled in the kernel. 322 * Thus, the only time this function is called is directly from within the 323 * indirect system call path. 324 */ 325 /*ARGSUSED*/ 326 static long 327 s10_unimpl(sysret_t *rv, uintptr_t p1) 328 { 329 sysret_t rval; 330 331 /* 332 * We'd like to print out some kind of error message here like 333 * "unsupported syscall", but we can't because it's not safe to 334 * assume that stderr or STDERR_FILENO actually points to something 335 * that is a terminal, and if we wrote to those files we could 336 * inadvertantly write to some applications open files, which would 337 * be bad. 338 * 339 * Normally, if an application calls an invalid system call 340 * it get a SIGSYS sent to it. So we'll just go ahead and send 341 * ourselves a signal here. 
Note that this is far from ideal since 342 * if the application has registered a signal handler, that signal 343 * handler may recieve a ucontext_t as the third parameter to 344 * indicate the context of the process when the signal was 345 * generated, and in this case that context will not be what the 346 * application is expecting. Hence, we should probably create a 347 * brandsys() kernel function that can deliver the signal to us 348 * with the correct ucontext_t. 349 */ 350 (void) __systemcall(&rval, SYS_lwp_kill + 1024, _lwp_self(), SIGSYS); 351 return (ENOSYS); 352 } 353 354 #if defined(__sparc) && !defined(__sparcv9) 355 /* 356 * Yuck. For 32-bit sparc applications, handle indirect system calls. 357 * Note that we declare this interface to use the maximum number of 358 * system call arguments. If we recieve a system call that uses less 359 * arguments, then the additional arguments will be garbage, but they 360 * will also be ignored so that should be ok. 361 */ 362 static long 363 s10_indir(sysret_t *rv, int code, 364 uintptr_t a0, uintptr_t a1, uintptr_t a2, uintptr_t a3, uintptr_t a4, 365 uintptr_t a5, uintptr_t a6, uintptr_t a7) 366 { 367 s10_sysent_table_t *sst = &(s10_sysent_table[code]); 368 369 s10_assert(code < NSYSCALL); 370 switch (sst->st_args & NARGS_MASK) { 371 case 0: 372 return ((sst->st_callc)(rv)); 373 case 1: 374 return ((sst->st_callc)(rv, a0)); 375 case 2: 376 return ((sst->st_callc)(rv, a0, a1)); 377 case 3: 378 return ((sst->st_callc)(rv, a0, a1, a2)); 379 case 4: 380 return ((sst->st_callc)(rv, a0, a1, a2, a3)); 381 case 5: 382 return ((sst->st_callc)(rv, a0, a1, a2, a3, a4)); 383 case 6: 384 return ((sst->st_callc)(rv, rv, a0, a1, a2, a3, a4, a5)); 385 case 7: 386 return ((sst->st_callc)(rv, a0, a1, a2, a3, a4, a5, a6)); 387 case 8: 388 return ((sst->st_callc)(rv, a0, a1, a2, a3, a4, a5, a6, a7)); 389 } 390 s10_abort(0, "invalid entry in s10_sysent_table"); 391 return (EINVAL); 392 } 393 #endif /* __sparc && !__sparcv9 */ 394 395 /* 
Free the thread-local storage provided by mntfs_get_mntentbuf(). */ 396 static void 397 mntfs_free_mntentbuf(void *arg) 398 { 399 struct mntentbuf *embufp = arg; 400 401 if (embufp == NULL) 402 return; 403 if (embufp->mbuf_emp) 404 free(embufp->mbuf_emp); 405 if (embufp->mbuf_buf) 406 free(embufp->mbuf_buf); 407 bzero(embufp, sizeof (struct mntentbuf)); 408 free(embufp); 409 } 410 411 /* Provide the thread-local storage required by mntfs_ioctl(). */ 412 static struct mntentbuf * 413 mntfs_get_mntentbuf(size_t size) 414 { 415 static mutex_t keylock; 416 static thread_key_t key; 417 static int once_per_keyname = 0; 418 void *tsd = NULL; 419 struct mntentbuf *embufp; 420 421 /* Create the key. */ 422 if (!once_per_keyname) { 423 (void) mutex_lock(&keylock); 424 if (!once_per_keyname) { 425 if (thr_keycreate(&key, mntfs_free_mntentbuf)) { 426 (void) mutex_unlock(&keylock); 427 return (NULL); 428 } else { 429 once_per_keyname++; 430 } 431 } 432 (void) mutex_unlock(&keylock); 433 } 434 435 /* 436 * The thread-specific datum for this key is the address of a struct 437 * mntentbuf. If this is the first time here then we allocate the struct 438 * and its contents, and associate its address with the thread; if there 439 * are any problems then we abort. 440 */ 441 if (thr_getspecific(key, &tsd)) 442 return (NULL); 443 if (tsd == NULL) { 444 if (!(embufp = calloc(1, sizeof (struct mntentbuf))) || 445 !(embufp->mbuf_emp = malloc(sizeof (struct extmnttab))) || 446 thr_setspecific(key, embufp)) { 447 mntfs_free_mntentbuf(embufp); 448 return (NULL); 449 } 450 } else { 451 embufp = tsd; 452 } 453 454 /* Return the buffer, resizing it if necessary. 
*/ 455 if (size > embufp->mbuf_bufsize) { 456 if (embufp->mbuf_buf) 457 free(embufp->mbuf_buf); 458 if ((embufp->mbuf_buf = malloc(size)) == NULL) { 459 embufp->mbuf_bufsize = 0; 460 return (NULL); 461 } else { 462 embufp->mbuf_bufsize = size; 463 } 464 } 465 return (embufp); 466 } 467 468 /* 469 * The MNTIOC_GETMNTENT command in this release differs from that in early 470 * versions of Solaris 10. 471 * 472 * Previously, the command would copy a pointer to a struct extmnttab to an 473 * address provided as an argument. The pointer would be somewhere within a 474 * mapping already present within the user's address space. In addition, the 475 * text to which the struct's members pointed would also be within a 476 * pre-existing mapping. Now, the user is required to allocate memory for both 477 * the struct and the text buffer, and to pass the address of each within a 478 * struct mntentbuf. In order to conceal these details from a Solaris 10 client 479 * we allocate some thread-local storage in which to create the necessary data 480 * structures; this is static, thread-safe memory that will be cleaned up 481 * without the caller's intervention. 482 * 483 * MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY are new in this release; they should 484 * not work for older clients. 485 */ 486 int 487 mntfs_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg) 488 { 489 int err; 490 struct stat statbuf; 491 struct mntentbuf *embufp; 492 static size_t bufsize = MNT_LINE_MAX; 493 494 /* Do not emulate mntfs commands from up-to-date clients. */ 495 if (S10_FEATURE_IS_PRESENT(S10_FEATURE_ALTERED_MNTFS_IOCTL)) 496 return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg)); 497 498 /* Do not emulate mntfs commands directed at other file systems. 
*/ 499 if ((err = __systemcall(rval, SYS_fstatat + 1024, 500 fdes, NULL, &statbuf, 0)) != 0) 501 return (err); 502 if (strcmp(statbuf.st_fstype, MNTTYPE_MNTFS) != 0) 503 return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg)); 504 505 if (cmd == MNTIOC_GETEXTMNTENT || cmd == MNTIOC_GETMNTANY) 506 return (EINVAL); 507 508 if ((embufp = mntfs_get_mntentbuf(bufsize)) == NULL) 509 return (ENOMEM); 510 511 /* 512 * MNTIOC_GETEXTMNTENT advances the file pointer once it has 513 * successfully copied out the result to the address provided. We 514 * therefore need to check the user-supplied address now since the 515 * one we'll be providing is guaranteed to work. 516 */ 517 if (s10_uucopy(&embufp->mbuf_emp, (void *)arg, sizeof (void *)) != 0) 518 return (EFAULT); 519 520 /* 521 * Keep retrying for as long as we fail for want of a large enough 522 * buffer. 523 */ 524 for (;;) { 525 if ((err = __systemcall(rval, SYS_ioctl + 1024, fdes, 526 MNTIOC_GETEXTMNTENT, embufp)) != 0) 527 return (err); 528 529 if (rval->sys_rval1 == MNTFS_TOOLONG) { 530 /* The buffer wasn't large enough. */ 531 (void) atomic_swap_ulong((unsigned long *)&bufsize, 532 2 * embufp->mbuf_bufsize); 533 if ((embufp = mntfs_get_mntentbuf(bufsize)) == NULL) 534 return (ENOMEM); 535 } else { 536 break; 537 } 538 } 539 540 if (s10_uucopy(&embufp->mbuf_emp, (void *)arg, sizeof (void *)) != 0) 541 return (EFAULT); 542 543 return (0); 544 } 545 546 /* 547 * Assign the structure member value from the s (source) structure to the 548 * d (dest) structure. 549 */ 550 #define struct_assign(d, s, val) (((d).val) = ((s).val)) 551 552 /* 553 * The CRYPTO_GET_FUNCTION_LIST parameter structure crypto_function_list_t 554 * changed between S10 and Nevada, so we have to emulate the old S10 555 * crypto_function_list_t structure when interposing on the ioctl syscall. 
556 */ 557 typedef struct s10_crypto_function_list { 558 boolean_t fl_digest_init; 559 boolean_t fl_digest; 560 boolean_t fl_digest_update; 561 boolean_t fl_digest_key; 562 boolean_t fl_digest_final; 563 564 boolean_t fl_encrypt_init; 565 boolean_t fl_encrypt; 566 boolean_t fl_encrypt_update; 567 boolean_t fl_encrypt_final; 568 569 boolean_t fl_decrypt_init; 570 boolean_t fl_decrypt; 571 boolean_t fl_decrypt_update; 572 boolean_t fl_decrypt_final; 573 574 boolean_t fl_mac_init; 575 boolean_t fl_mac; 576 boolean_t fl_mac_update; 577 boolean_t fl_mac_final; 578 579 boolean_t fl_sign_init; 580 boolean_t fl_sign; 581 boolean_t fl_sign_update; 582 boolean_t fl_sign_final; 583 boolean_t fl_sign_recover_init; 584 boolean_t fl_sign_recover; 585 586 boolean_t fl_verify_init; 587 boolean_t fl_verify; 588 boolean_t fl_verify_update; 589 boolean_t fl_verify_final; 590 boolean_t fl_verify_recover_init; 591 boolean_t fl_verify_recover; 592 593 boolean_t fl_digest_encrypt_update; 594 boolean_t fl_decrypt_digest_update; 595 boolean_t fl_sign_encrypt_update; 596 boolean_t fl_decrypt_verify_update; 597 598 boolean_t fl_seed_random; 599 boolean_t fl_generate_random; 600 601 boolean_t fl_session_open; 602 boolean_t fl_session_close; 603 boolean_t fl_session_login; 604 boolean_t fl_session_logout; 605 606 boolean_t fl_object_create; 607 boolean_t fl_object_copy; 608 boolean_t fl_object_destroy; 609 boolean_t fl_object_get_size; 610 boolean_t fl_object_get_attribute_value; 611 boolean_t fl_object_set_attribute_value; 612 boolean_t fl_object_find_init; 613 boolean_t fl_object_find; 614 boolean_t fl_object_find_final; 615 616 boolean_t fl_key_generate; 617 boolean_t fl_key_generate_pair; 618 boolean_t fl_key_wrap; 619 boolean_t fl_key_unwrap; 620 boolean_t fl_key_derive; 621 622 boolean_t fl_init_token; 623 boolean_t fl_init_pin; 624 boolean_t fl_set_pin; 625 626 boolean_t prov_is_hash_limited; 627 uint32_t prov_hash_threshold; 628 uint32_t prov_hash_limit; 629 } 
s10_crypto_function_list_t; 630 631 typedef struct s10_crypto_get_function_list { 632 uint_t fl_return_value; 633 crypto_provider_id_t fl_provider_id; 634 s10_crypto_function_list_t fl_list; 635 } s10_crypto_get_function_list_t; 636 637 /* 638 * The structure returned by the CRYPTO_GET_FUNCTION_LIST ioctl on /dev/crypto 639 * increased in size due to: 640 * 6482533 Threshold for HW offload via PKCS11 interface 641 * between S10 and Nevada. This is a relatively simple process of filling 642 * in the S10 structure fields with the Nevada data. 643 * 644 * We stat the device to make sure that the ioctl is meant for /dev/crypto. 645 * 646 */ 647 static int 648 crypto_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg) 649 { 650 int err; 651 s10_crypto_get_function_list_t s10_param; 652 crypto_get_function_list_t native_param; 653 static dev_t crypto_dev = (dev_t)-1; 654 struct stat sbuf; 655 656 if (crypto_dev == (dev_t)-1) { 657 if ((err = __systemcall(rval, SYS_fstatat + 1024, 658 AT_FDCWD, "/dev/crypto", &sbuf, 0)) != 0) 659 goto nonemuioctl; 660 crypto_dev = major(sbuf.st_rdev); 661 } 662 if ((err = __systemcall(rval, SYS_fstatat + 1024, 663 fdes, NULL, &sbuf, 0)) != 0) 664 return (err); 665 /* Each open fd of /dev/crypto gets a new minor device. 
*/ 666 if (major(sbuf.st_rdev) != crypto_dev) 667 goto nonemuioctl; 668 669 if (s10_uucopy((const void *)arg, &s10_param, sizeof (s10_param)) != 0) 670 return (EFAULT); 671 struct_assign(native_param, s10_param, fl_provider_id); 672 if ((err = __systemcall(rval, SYS_ioctl + 1024, fdes, cmd, 673 &native_param)) != 0) 674 return (err); 675 676 struct_assign(s10_param, native_param, fl_return_value); 677 struct_assign(s10_param, native_param, fl_provider_id); 678 679 struct_assign(s10_param, native_param, fl_list.fl_digest_init); 680 struct_assign(s10_param, native_param, fl_list.fl_digest); 681 struct_assign(s10_param, native_param, fl_list.fl_digest_update); 682 struct_assign(s10_param, native_param, fl_list.fl_digest_key); 683 struct_assign(s10_param, native_param, fl_list.fl_digest_final); 684 685 struct_assign(s10_param, native_param, fl_list.fl_encrypt_init); 686 struct_assign(s10_param, native_param, fl_list.fl_encrypt); 687 struct_assign(s10_param, native_param, fl_list.fl_encrypt_update); 688 struct_assign(s10_param, native_param, fl_list.fl_encrypt_final); 689 690 struct_assign(s10_param, native_param, fl_list.fl_decrypt_init); 691 struct_assign(s10_param, native_param, fl_list.fl_decrypt); 692 struct_assign(s10_param, native_param, fl_list.fl_decrypt_update); 693 struct_assign(s10_param, native_param, fl_list.fl_decrypt_final); 694 695 struct_assign(s10_param, native_param, fl_list.fl_mac_init); 696 struct_assign(s10_param, native_param, fl_list.fl_mac); 697 struct_assign(s10_param, native_param, fl_list.fl_mac_update); 698 struct_assign(s10_param, native_param, fl_list.fl_mac_final); 699 700 struct_assign(s10_param, native_param, fl_list.fl_sign_init); 701 struct_assign(s10_param, native_param, fl_list.fl_sign); 702 struct_assign(s10_param, native_param, fl_list.fl_sign_update); 703 struct_assign(s10_param, native_param, fl_list.fl_sign_final); 704 struct_assign(s10_param, native_param, fl_list.fl_sign_recover_init); 705 struct_assign(s10_param, 
native_param, fl_list.fl_sign_recover); 706 707 struct_assign(s10_param, native_param, fl_list.fl_verify_init); 708 struct_assign(s10_param, native_param, fl_list.fl_verify); 709 struct_assign(s10_param, native_param, fl_list.fl_verify_update); 710 struct_assign(s10_param, native_param, fl_list.fl_verify_final); 711 struct_assign(s10_param, native_param, fl_list.fl_verify_recover_init); 712 struct_assign(s10_param, native_param, fl_list.fl_verify_recover); 713 714 struct_assign(s10_param, native_param, 715 fl_list.fl_digest_encrypt_update); 716 struct_assign(s10_param, native_param, 717 fl_list.fl_decrypt_digest_update); 718 struct_assign(s10_param, native_param, fl_list.fl_sign_encrypt_update); 719 struct_assign(s10_param, native_param, 720 fl_list.fl_decrypt_verify_update); 721 722 struct_assign(s10_param, native_param, fl_list.fl_seed_random); 723 struct_assign(s10_param, native_param, fl_list.fl_generate_random); 724 725 struct_assign(s10_param, native_param, fl_list.fl_session_open); 726 struct_assign(s10_param, native_param, fl_list.fl_session_close); 727 struct_assign(s10_param, native_param, fl_list.fl_session_login); 728 struct_assign(s10_param, native_param, fl_list.fl_session_logout); 729 730 struct_assign(s10_param, native_param, fl_list.fl_object_create); 731 struct_assign(s10_param, native_param, fl_list.fl_object_copy); 732 struct_assign(s10_param, native_param, fl_list.fl_object_destroy); 733 struct_assign(s10_param, native_param, fl_list.fl_object_get_size); 734 struct_assign(s10_param, native_param, 735 fl_list.fl_object_get_attribute_value); 736 struct_assign(s10_param, native_param, 737 fl_list.fl_object_set_attribute_value); 738 struct_assign(s10_param, native_param, fl_list.fl_object_find_init); 739 struct_assign(s10_param, native_param, fl_list.fl_object_find); 740 struct_assign(s10_param, native_param, fl_list.fl_object_find_final); 741 742 struct_assign(s10_param, native_param, fl_list.fl_key_generate); 743 struct_assign(s10_param, 
native_param, fl_list.fl_key_generate_pair); 744 struct_assign(s10_param, native_param, fl_list.fl_key_wrap); 745 struct_assign(s10_param, native_param, fl_list.fl_key_unwrap); 746 struct_assign(s10_param, native_param, fl_list.fl_key_derive); 747 748 struct_assign(s10_param, native_param, fl_list.fl_init_token); 749 struct_assign(s10_param, native_param, fl_list.fl_init_pin); 750 struct_assign(s10_param, native_param, fl_list.fl_set_pin); 751 752 struct_assign(s10_param, native_param, fl_list.prov_is_hash_limited); 753 struct_assign(s10_param, native_param, fl_list.prov_hash_threshold); 754 struct_assign(s10_param, native_param, fl_list.prov_hash_limit); 755 756 return (s10_uucopy(&s10_param, (void *)arg, sizeof (s10_param))); 757 758 nonemuioctl: 759 return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg)); 760 } 761 762 /* 763 * The process contract CT_TGET and CT_TSET parameter structure ct_param_t 764 * changed between S10 and Nevada, so we have to emulate the old S10 765 * ct_param_t structure when interposing on the ioctl syscall. 766 */ 767 typedef struct s10_ct_param { 768 uint32_t ctpm_id; 769 uint32_t ctpm_pad; 770 uint64_t ctpm_value; 771 } s10_ct_param_t; 772 773 /* 774 * We have to emulate process contract ioctls for init(1M) because the 775 * ioctl parameter structure changed between S10 and Nevada. This is 776 * a relatively simple process of filling Nevada structure fields, 777 * shuffling values, and initiating a native system call. 778 * 779 * For now, we'll assume that all consumers of CT_TGET and CT_TSET will 780 * need emulation. We'll issue a stat to make sure that the ioctl 781 * is meant for the contract file system. 
782 * 783 */ 784 static int 785 ctfs_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg) 786 { 787 int err; 788 s10_ct_param_t s10param; 789 ct_param_t param; 790 struct stat statbuf; 791 792 if ((err = __systemcall(rval, SYS_fstatat + 1024, 793 fdes, NULL, &statbuf, 0)) != 0) 794 return (err); 795 if (strcmp(statbuf.st_fstype, MNTTYPE_CTFS) != 0) 796 return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg)); 797 798 if (s10_uucopy((const void *)arg, &s10param, sizeof (s10param)) != 0) 799 return (EFAULT); 800 param.ctpm_id = s10param.ctpm_id; 801 param.ctpm_size = sizeof (uint64_t); 802 param.ctpm_value = &s10param.ctpm_value; 803 if ((err = __systemcall(rval, SYS_ioctl + 1024, fdes, cmd, ¶m)) 804 != 0) 805 return (err); 806 807 if (cmd == CT_TGET) 808 return (s10_uucopy(&s10param, (void *)arg, sizeof (s10param))); 809 810 return (0); 811 } 812 813 typedef struct s10_zfs_cmd { 814 char zc_name[MAXPATHLEN]; 815 char zc_value[MAXPATHLEN * 2]; 816 char zc_string[MAXNAMELEN]; 817 uint64_t zc_guid; 818 uint64_t zc_nvlist_conf; /* really (char *) */ 819 uint64_t zc_nvlist_conf_size; 820 uint64_t zc_nvlist_src; /* really (char *) */ 821 uint64_t zc_nvlist_src_size; 822 uint64_t zc_nvlist_dst; /* really (char *) */ 823 uint64_t zc_nvlist_dst_size; 824 uint64_t zc_cookie; 825 uint64_t zc_objset_type; 826 uint64_t zc_perm_action; 827 uint64_t zc_history; /* really (char *) */ 828 uint64_t zc_history_len; 829 uint64_t zc_history_offset; 830 uint64_t zc_obj; 831 /* Solaris Next added zc_iflags member here */ 832 zfs_share_t zc_share; 833 dmu_objset_stats_t zc_objset_stats; 834 struct drr_begin zc_begin_record; 835 zinject_record_t zc_inject_record; 836 } s10_zfs_cmd_t; 837 838 /* 839 * There is a difference in the zfs_cmd_t ioctl parameter between S10 and 840 * Solaris Next so we need to translate between the two structures when 841 * making ZFS ioctls. 
842 */ 843 static int 844 zfs_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg) 845 { 846 int err; 847 s10_zfs_cmd_t s10_param; 848 zfs_cmd_t native_param; 849 static dev_t zfs_dev = (dev_t)-1; 850 struct stat sbuf; 851 852 if (zfs_dev == (dev_t)-1) { 853 if ((err = __systemcall(rval, SYS_fstatat + 1024, 854 AT_FDCWD, "/dev/zfs", &sbuf, 0) != 0) != 0) 855 goto nonemuioctl; 856 zfs_dev = major(sbuf.st_rdev); 857 } 858 if ((err = __systemcall(rval, SYS_fstatat + 1024, 859 fdes, NULL, &sbuf, 0)) != 0) 860 return (err); 861 if (major(sbuf.st_rdev) != zfs_dev) 862 goto nonemuioctl; 863 864 if (s10_uucopy((const void *)arg, &s10_param, sizeof (s10_param)) != 0) 865 return (EFAULT); 866 867 bcopy((const void *)s10_param.zc_name, (void *)native_param.zc_name, 868 sizeof (s10_param.zc_name)); 869 bcopy((const void *)s10_param.zc_value, (void *)native_param.zc_value, 870 sizeof (s10_param.zc_value)); 871 bcopy((const void *)s10_param.zc_string, (void *)native_param.zc_string, 872 sizeof (s10_param.zc_string)); 873 struct_assign(native_param, s10_param, zc_guid); 874 struct_assign(native_param, s10_param, zc_nvlist_conf); 875 struct_assign(native_param, s10_param, zc_nvlist_conf_size); 876 struct_assign(native_param, s10_param, zc_nvlist_src); 877 struct_assign(native_param, s10_param, zc_nvlist_src_size); 878 struct_assign(native_param, s10_param, zc_nvlist_dst); 879 struct_assign(native_param, s10_param, zc_nvlist_dst_size); 880 struct_assign(native_param, s10_param, zc_cookie); 881 struct_assign(native_param, s10_param, zc_objset_type); 882 struct_assign(native_param, s10_param, zc_perm_action); 883 struct_assign(native_param, s10_param, zc_history); 884 struct_assign(native_param, s10_param, zc_history_len); 885 struct_assign(native_param, s10_param, zc_history_offset); 886 struct_assign(native_param, s10_param, zc_obj); 887 native_param.zc_iflags = 0; 888 struct_assign(native_param, s10_param, zc_share); 889 struct_assign(native_param, s10_param, zc_objset_stats); 
890 struct_assign(native_param, s10_param, zc_begin_record); 891 struct_assign(native_param, s10_param, zc_inject_record); 892 893 err = __systemcall(rval, SYS_ioctl + 1024, fdes, cmd, &native_param); 894 895 bcopy((const void *)native_param.zc_name, (void *)s10_param.zc_name, 896 sizeof (s10_param.zc_name)); 897 bcopy((const void *)native_param.zc_value, (void *)s10_param.zc_value, 898 sizeof (s10_param.zc_value)); 899 bcopy((const void *)native_param.zc_string, (void *)s10_param.zc_string, 900 sizeof (s10_param.zc_string)); 901 struct_assign(s10_param, native_param, zc_guid); 902 struct_assign(s10_param, native_param, zc_nvlist_conf); 903 struct_assign(s10_param, native_param, zc_nvlist_conf_size); 904 struct_assign(s10_param, native_param, zc_nvlist_src); 905 struct_assign(s10_param, native_param, zc_nvlist_src_size); 906 struct_assign(s10_param, native_param, zc_nvlist_dst); 907 struct_assign(s10_param, native_param, zc_nvlist_dst_size); 908 struct_assign(s10_param, native_param, zc_cookie); 909 struct_assign(s10_param, native_param, zc_objset_type); 910 struct_assign(s10_param, native_param, zc_perm_action); 911 struct_assign(s10_param, native_param, zc_history); 912 struct_assign(s10_param, native_param, zc_history_len); 913 struct_assign(s10_param, native_param, zc_history_offset); 914 struct_assign(s10_param, native_param, zc_obj); 915 struct_assign(s10_param, native_param, zc_share); 916 struct_assign(s10_param, native_param, zc_objset_stats); 917 struct_assign(s10_param, native_param, zc_begin_record); 918 struct_assign(s10_param, native_param, zc_inject_record); 919 920 (void) s10_uucopy(&s10_param, (void *)arg, sizeof (s10_param)); 921 return (err); 922 923 nonemuioctl: 924 return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg)); 925 } 926 927 int 928 s10_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg) 929 { 930 switch (cmd) { 931 case CRYPTO_GET_FUNCTION_LIST: 932 return (crypto_ioctl(rval, fdes, cmd, arg)); 933 case CT_TGET: 934 
/*FALLTHRU*/ 935 case CT_TSET: 936 return (ctfs_ioctl(rval, fdes, cmd, arg)); 937 case MNTIOC_GETMNTENT: 938 /*FALLTHRU*/ 939 case MNTIOC_GETEXTMNTENT: 940 /*FALLTHRU*/ 941 case MNTIOC_GETMNTANY: 942 return (mntfs_ioctl(rval, fdes, cmd, arg)); 943 } 944 945 if ((cmd & 0xff00) == ZFS_IOC) 946 return (zfs_ioctl(rval, fdes, cmd, arg)); 947 948 return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg)); 949 } 950 951 /* 952 * Unfortunately, pwrite()'s behavior differs between S10 and Nevada when 953 * applied to files opened with O_APPEND. The offset argument is ignored and 954 * the buffer is appended to the target file in S10, whereas the current file 955 * position is ignored in Nevada (i.e., pwrite() acts as though the target file 956 * wasn't opened with O_APPEND). This is a result of the fix for CR 6655660 957 * (pwrite() must ignore the O_APPEND/FAPPEND flag). 958 * 959 * We emulate the old S10 pwrite() behavior by checking whether the target file 960 * was opened with O_APPEND. If it was, then invoke the write() system call 961 * instead of pwrite(); otherwise, invoke the pwrite() system call as usual. 962 */ 963 static int 964 s10_pwrite(sysret_t *rval, int fd, const void *bufferp, size_t num_bytes, 965 off_t offset) 966 { 967 int err; 968 969 if ((err = __systemcall(rval, SYS_fcntl + 1024, fd, F_GETFL)) != 0) 970 return (err); 971 if (rval->sys_rval1 & O_APPEND) 972 return (__systemcall(rval, SYS_write + 1024, fd, bufferp, 973 num_bytes)); 974 return (__systemcall(rval, SYS_pwrite + 1024, fd, bufferp, num_bytes, 975 offset)); 976 } 977 978 #if !defined(_LP64) 979 /* 980 * This is the large file version of the pwrite() system call for 32-bit 981 * processes. This exists for the same reason that s10_pwrite() exists; see 982 * the comment above s10_pwrite(). 
983 */ 984 static int 985 s10_pwrite64(sysret_t *rval, int fd, const void *bufferp, size32_t num_bytes, 986 uint32_t offset_1, uint32_t offset_2) 987 { 988 int err; 989 990 if ((err = __systemcall(rval, SYS_fcntl + 1024, fd, F_GETFL)) != 0) 991 return (err); 992 if (rval->sys_rval1 & O_APPEND) 993 return (__systemcall(rval, SYS_write + 1024, fd, bufferp, 994 num_bytes)); 995 return (__systemcall(rval, SYS_pwrite64 + 1024, fd, bufferp, 996 num_bytes, offset_1, offset_2)); 997 } 998 #endif /* !_LP64 */ 999 1000 /* 1001 * These are convenience macros that s10_getdents_common() uses. Both treat 1002 * their arguments, which should be character pointers, as dirent pointers or 1003 * dirent64 pointers and yield their d_name and d_reclen fields. These 1004 * macros shouldn't be used outside of s10_getdents_common(). 1005 */ 1006 #define dirent_name(charptr) ((charptr) + name_offset) 1007 #define dirent_reclen(charptr) \ 1008 (*(unsigned short *)(uintptr_t)((charptr) + reclen_offset)) 1009 1010 /* 1011 * This function contains code that is common to both s10_getdents() and 1012 * s10_getdents64(). See the comment above s10_getdents() for details. 1013 * 1014 * rval, fd, buf, and nbyte should be passed unmodified from s10_getdents() 1015 * and s10_getdents64(). getdents_syscall_id should be either SYS_getdents 1016 * or SYS_getdents64. name_offset should be the the byte offset of 1017 * the d_name field in the dirent structures passed to the kernel via the 1018 * syscall represented by getdents_syscall_id. reclen_offset should be 1019 * the byte offset of the d_reclen field in the aforementioned dirent 1020 * structures. 
1021 */ 1022 static int 1023 s10_getdents_common(sysret_t *rval, int fd, char *buf, size_t nbyte, 1024 int getdents_syscall_id, size_t name_offset, size_t reclen_offset) 1025 { 1026 int err; 1027 size_t buf_size; 1028 char *local_buf; 1029 char *buf_current; 1030 1031 /* 1032 * Use a special brand operation, B_S10_ISFDXATTRDIR, to determine 1033 * whether the specified file descriptor refers to an extended file 1034 * attribute directory. If it doesn't, then SYS_getdents won't 1035 * reveal extended file attributes, in which case we can simply 1036 * hand the syscall to the native kernel. 1037 */ 1038 if ((err = __systemcall(rval, SYS_brand + 1024, B_S10_ISFDXATTRDIR, 1039 fd)) != 0) 1040 return (err); 1041 if (rval->sys_rval1 == 0) 1042 return (__systemcall(rval, getdents_syscall_id + 1024, fd, buf, 1043 nbyte)); 1044 1045 /* 1046 * The file descriptor refers to an extended file attributes directory. 1047 * We need to create a dirent buffer that's as large as buf into which 1048 * the native SYS_getdents will store the special extended file 1049 * attribute directory's entries. We can't dereference buf because 1050 * it might be an invalid pointer! 1051 */ 1052 if (nbyte > MAXGETDENTS_SIZE) 1053 nbyte = MAXGETDENTS_SIZE; 1054 local_buf = (char *)malloc(nbyte); 1055 if (local_buf == NULL) { 1056 /* 1057 * getdents(2) doesn't return an error code indicating a memory 1058 * allocation error and it doesn't make sense to return any of 1059 * its documented error codes for a malloc(3C) failure. We'll 1060 * use ENOMEM even though getdents(2) doesn't use it because it 1061 * best describes the failure. 1062 */ 1063 (void) S10_TRUSS_POINT_3(rval, getdents_syscall_id, ENOMEM, fd, 1064 buf, nbyte); 1065 rval->sys_rval1 = -1; 1066 rval->sys_rval2 = 0; 1067 return (EIO); 1068 } 1069 1070 /* 1071 * Issue a native SYS_getdents syscall but use our local dirent buffer 1072 * instead of buf. 
This will allow us to examine the returned dirent 1073 * structures immediately and copy them to buf later. That way the 1074 * calling process won't be able to see the dirent structures until 1075 * we finish examining them. 1076 */ 1077 if ((err = __systemcall(rval, getdents_syscall_id + 1024, fd, local_buf, 1078 nbyte)) != 0) { 1079 free(local_buf); 1080 return (err); 1081 } 1082 buf_size = rval->sys_rval1; 1083 if (buf_size == 0) { 1084 free(local_buf); 1085 return (0); 1086 } 1087 1088 /* 1089 * Look for SUNWattr_ro (VIEW_READONLY) and SUNWattr_rw 1090 * (VIEW_READWRITE) in the directory entries and remove them 1091 * from the dirent buffer. 1092 */ 1093 for (buf_current = local_buf; 1094 (size_t)(buf_current - local_buf) < buf_size; /* cstyle */) { 1095 if (strcmp(dirent_name(buf_current), VIEW_READONLY) != 0 && 1096 strcmp(dirent_name(buf_current), VIEW_READWRITE) != 0) { 1097 /* 1098 * The dirent refers to an attribute that should 1099 * be visible to Solaris 10 processes. Keep it 1100 * and examine the next entry in the buffer. 1101 */ 1102 buf_current += dirent_reclen(buf_current); 1103 } else { 1104 /* 1105 * We found either SUNWattr_ro (VIEW_READONLY) 1106 * or SUNWattr_rw (VIEW_READWRITE). Remove it 1107 * from the dirent buffer by decrementing 1108 * buf_size by the size of the entry and 1109 * overwriting the entry with the remaining 1110 * entries. 1111 */ 1112 buf_size -= dirent_reclen(buf_current); 1113 (void) memmove(buf_current, buf_current + 1114 dirent_reclen(buf_current), buf_size - 1115 (size_t)(buf_current - local_buf)); 1116 } 1117 } 1118 1119 /* 1120 * Copy local_buf into buf so that the calling process can see 1121 * the results. 
1122 */ 1123 if ((err = s10_uucopy(local_buf, buf, buf_size)) != 0) { 1124 free(local_buf); 1125 rval->sys_rval1 = -1; 1126 rval->sys_rval2 = 0; 1127 return (err); 1128 } 1129 rval->sys_rval1 = buf_size; 1130 free(local_buf); 1131 return (0); 1132 } 1133 1134 /* 1135 * Solaris Next added two special extended file attributes, SUNWattr_ro and 1136 * SUNWattr_rw, which are called "extended system attributes". They have 1137 * special semantics (e.g., a process cannot unlink SUNWattr_ro) and should 1138 * not appear in solaris10-branded zones because no Solaris 10 applications, 1139 * including system commands such as tar(1), are coded to correctly handle these 1140 * special attributes. 1141 * 1142 * This emulation function solves the aforementioned problem by emulating 1143 * the getdents(2) syscall and filtering both system attributes out of resulting 1144 * directory entry lists. The emulation function only filters results when 1145 * the given file descriptor refers to an extended file attribute directory. 1146 * Filtering getdents(2) results is expensive because it requires dynamic 1147 * memory allocation; however, the performance cost is tolerable because 1148 * we don't expect Solaris 10 processes to frequently examine extended file 1149 * attribute directories. 1150 * 1151 * The brand's emulation library needs two getdents(2) emulation functions 1152 * because getdents(2) comes in two flavors: non-largefile-aware getdents(2) 1153 * and largefile-aware getdents64(2). s10_getdents() handles the non-largefile- 1154 * aware case for 32-bit processes and all getdents(2) syscalls for 64-bit 1155 * processes (64-bit processes use largefile-aware interfaces by default). 1156 * See s10_getdents64() below for the largefile-aware getdents64(2) emulation 1157 * function for 32-bit processes. 
1158 */ 1159 static int 1160 s10_getdents(sysret_t *rval, int fd, struct dirent *buf, size_t nbyte) 1161 { 1162 return (s10_getdents_common(rval, fd, (char *)buf, nbyte, SYS_getdents, 1163 offsetof(struct dirent, d_name), 1164 offsetof(struct dirent, d_reclen))); 1165 } 1166 1167 #ifndef _LP64 1168 /* 1169 * This is the largefile-aware version of getdents(2) for 32-bit processes. 1170 * This exists for the same reason that s10_getdents() exists. See the comment 1171 * above s10_getdents(). 1172 */ 1173 static int 1174 s10_getdents64(sysret_t *rval, int fd, struct dirent64 *buf, size_t nbyte) 1175 { 1176 return (s10_getdents_common(rval, fd, (char *)buf, nbyte, 1177 SYS_getdents64, offsetof(struct dirent64, d_name), 1178 offsetof(struct dirent64, d_reclen))); 1179 } 1180 #endif /* !_LP64 */ 1181 1182 #define S10_AC_PROC (0x1 << 28) 1183 #define S10_AC_TASK (0x2 << 28) 1184 #define S10_AC_FLOW (0x4 << 28) 1185 #define S10_AC_MODE(x) ((x) & 0xf0000000) 1186 #define S10_AC_OPTION(x) ((x) & 0x0fffffff) 1187 1188 /* 1189 * The mode shift, mode mask and option mask for acctctl have changed. The 1190 * mode is currently the top full byte and the option is the lower 3 full bytes. 1191 */ 1192 int 1193 s10_acctctl(sysret_t *rval, int cmd, void *buf, size_t bufsz) 1194 { 1195 int mode = S10_AC_MODE(cmd); 1196 int option = S10_AC_OPTION(cmd); 1197 1198 switch (mode) { 1199 case S10_AC_PROC: 1200 mode = AC_PROC; 1201 break; 1202 case S10_AC_TASK: 1203 mode = AC_TASK; 1204 break; 1205 case S10_AC_FLOW: 1206 mode = AC_FLOW; 1207 break; 1208 default: 1209 return (S10_TRUSS_POINT_3(rval, SYS_acctctl, EINVAL, cmd, buf, 1210 bufsz)); 1211 } 1212 1213 return (__systemcall(rval, SYS_acctctl + 1024, mode | option, buf, 1214 bufsz)); 1215 } 1216 1217 /* 1218 * The Audit Policy parameters have changed due to: 1219 * 6466722 audituser and AUDIT_USER are defined, unused, undocumented and 1220 * should be removed. 
1221 * 1222 * In S10 we had the following flag: 1223 * #define AUDIT_USER 0x0040 1224 * which doesn't exist in Solaris Next where the subsequent flags are shifted 1225 * down. For example, in S10 we had: 1226 * #define AUDIT_GROUP 0x0080 1227 * but on Solaris Next we have: 1228 * #define AUDIT_GROUP 0x0040 1229 * AUDIT_GROUP has the value AUDIT_USER had in S10 and all of the subsequent 1230 * bits are also shifted one place. 1231 * 1232 * When we're getting or setting the Audit Policy parameters we need to 1233 * shift the outgoing or incoming bits into their proper positions. Since 1234 * S10_AUDIT_USER was always unused, we always clear that bit on A_GETPOLICY. 1235 * 1236 * The command we care about, BSM_AUDITCTL, passes the most parameters (3), 1237 * so declare this function to take up to 4 args and just pass them on. 1238 * The number of parameters for s10_auditsys needs to be equal to the BSM_* 1239 * subcommand that has the most parameters, since we want to pass all 1240 * parameters through, regardless of which subcommands we interpose on. 1241 * 1242 * Note that the auditsys system call uses the SYSENT_AP macro wrapper instead 1243 * of the more common SYSENT_CI macro. This means the return value is a 1244 * SE_64RVAL so the syscall table uses RV_64RVAL. 
1245 */ 1246 1247 #define S10_AUDIT_HMASK 0xffffffc0 1248 #define S10_AUDIT_LMASK 0x3f 1249 #define S10_AUC_NOSPACE 0x3 1250 1251 int 1252 s10_auditsys(sysret_t *rval, int bsmcmd, intptr_t a0, intptr_t a1, intptr_t a2) 1253 { 1254 int err; 1255 uint32_t m; 1256 1257 if (bsmcmd != BSM_AUDITCTL) 1258 return (__systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, a1, 1259 a2)); 1260 1261 if ((int)a0 == A_GETPOLICY) { 1262 if ((err = __systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, 1263 &m, a2)) != 0) 1264 return (err); 1265 m = ((m & S10_AUDIT_HMASK) << 1) | (m & S10_AUDIT_LMASK); 1266 if (s10_uucopy(&m, (void *)a1, sizeof (m)) != 0) 1267 return (EFAULT); 1268 return (0); 1269 1270 } else if ((int)a0 == A_SETPOLICY) { 1271 if (s10_uucopy((const void *)a1, &m, sizeof (m)) != 0) 1272 return (EFAULT); 1273 m = ((m >> 1) & S10_AUDIT_HMASK) | (m & S10_AUDIT_LMASK); 1274 return (__systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, &m, 1275 a2)); 1276 } else if ((int)a0 == A_GETCOND) { 1277 if ((err = __systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, 1278 &m, a2)) != 0) 1279 return (err); 1280 if (m == AUC_NOSPACE) 1281 m = S10_AUC_NOSPACE; 1282 if (s10_uucopy(&m, (void *)a1, sizeof (m)) != 0) 1283 return (EFAULT); 1284 return (0); 1285 } else if ((int)a0 == A_SETCOND) { 1286 if (s10_uucopy((const void *)a1, &m, sizeof (m)) != 0) 1287 return (EFAULT); 1288 if (m == S10_AUC_NOSPACE) 1289 m = AUC_NOSPACE; 1290 return (__systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, &m, 1291 a2)); 1292 } 1293 1294 return (__systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, a1, a2)); 1295 } 1296 1297 /* 1298 * Determine whether the executable passed to SYS_exec or SYS_execve is a 1299 * native executable. The s10_npreload.so invokes the B_S10_NATIVE brand 1300 * operation which patches up the processes exec info to eliminate any trace 1301 * of the wrapper. That will make pgrep and other commands that examine 1302 * process' executable names and command-line parameters work properly. 
1303 */ 1304 static int 1305 s10_exec_native(sysret_t *rval, const char *fname, const char **argp, 1306 const char **envp) 1307 { 1308 const char *filename = fname; 1309 char path[64]; 1310 int err; 1311 1312 /* Get a copy of the executable we're trying to run */ 1313 path[0] = '\0'; 1314 (void) s10_uucopystr(filename, path, sizeof (path)); 1315 1316 /* Check if we're trying to run a native binary */ 1317 if (strncmp(path, "/.SUNWnative/usr/lib/brand/solaris10/s10_native", 1318 sizeof (path)) != 0) 1319 return (0); 1320 1321 /* Skip the first element in the argv array */ 1322 argp++; 1323 1324 /* 1325 * The the path of the dynamic linker is the second parameter 1326 * of s10_native_exec(). 1327 */ 1328 if (s10_uucopy(argp, &filename, sizeof (char *)) != 0) 1329 return (EFAULT); 1330 1331 /* If an exec call succeeds, it never returns */ 1332 err = __systemcall(rval, SYS_brand + 1024, B_EXEC_NATIVE, filename, 1333 argp, envp, NULL, NULL, NULL); 1334 s10_assert(err != 0); 1335 return (err); 1336 } 1337 1338 /* 1339 * Interpose on the SYS_exec syscall to detect native wrappers. 1340 */ 1341 int 1342 s10_exec(sysret_t *rval, const char *fname, const char **argp) 1343 { 1344 int err; 1345 1346 if ((err = s10_exec_native(rval, fname, argp, NULL)) != 0) 1347 return (err); 1348 1349 /* If an exec call succeeds, it never returns */ 1350 err = __systemcall(rval, SYS_execve + 1024, fname, argp, NULL); 1351 s10_assert(err != 0); 1352 return (err); 1353 } 1354 1355 /* 1356 * Interpose on the SYS_execve syscall to detect native wrappers. 
1357 */ 1358 int 1359 s10_execve(sysret_t *rval, const char *fname, const char **argp, 1360 const char **envp) 1361 { 1362 int err; 1363 1364 if ((err = s10_exec_native(rval, fname, argp, envp)) != 0) 1365 return (err); 1366 1367 /* If an exec call succeeds, it never returns */ 1368 err = __systemcall(rval, SYS_execve + 1024, fname, argp, envp); 1369 s10_assert(err != 0); 1370 return (err); 1371 } 1372 1373 /* 1374 * S10's issetugid() syscall is now a subcode to privsys(). 1375 */ 1376 static int 1377 s10_issetugid(sysret_t *rval) 1378 { 1379 return (__systemcall(rval, SYS_privsys + 1024, PRIVSYS_ISSETUGID, 1380 0, 0, 0, 0, 0)); 1381 } 1382 1383 static long 1384 s10_uname(sysret_t *rv, uintptr_t p1) 1385 { 1386 struct utsname un, *unp = (struct utsname *)p1; 1387 int rev, err; 1388 1389 if ((err = __systemcall(rv, SYS_uname + 1024, &un)) != 0) 1390 return (err); 1391 1392 rev = atoi(&un.release[2]); 1393 s10_assert(rev >= 11); 1394 bzero(un.release, _SYS_NMLN); 1395 (void) strlcpy(un.release, S10_UTS_RELEASE, _SYS_NMLN); 1396 bzero(un.version, _SYS_NMLN); 1397 (void) strlcpy(un.version, S10_UTS_VERSION, _SYS_NMLN); 1398 1399 /* copy out the modified uname info */ 1400 return (s10_uucopy(&un, unp, sizeof (un))); 1401 } 1402 1403 int 1404 s10_sysconfig(sysret_t *rv, int which) 1405 { 1406 long value; 1407 1408 /* 1409 * We must interpose on the sysconfig(2) requests 1410 * that deal with the realtime signal number range. 1411 * All others get passed to the native sysconfig(2). 
1412 */ 1413 switch (which) { 1414 case _CONFIG_RTSIG_MAX: 1415 value = S10_SIGRTMAX - S10_SIGRTMIN + 1; 1416 break; 1417 case _CONFIG_SIGRT_MIN: 1418 value = S10_SIGRTMIN; 1419 break; 1420 case _CONFIG_SIGRT_MAX: 1421 value = S10_SIGRTMAX; 1422 break; 1423 default: 1424 return (__systemcall(rv, SYS_sysconfig + 1024, which)); 1425 } 1426 1427 (void) S10_TRUSS_POINT_1(rv, SYS_sysconfig, 0, which); 1428 rv->sys_rval1 = value; 1429 rv->sys_rval2 = 0; 1430 1431 return (0); 1432 } 1433 1434 int 1435 s10_sysinfo(sysret_t *rv, int command, char *buf, long count) 1436 { 1437 char *value; 1438 int len; 1439 1440 /* 1441 * We must interpose on the sysinfo(2) commands SI_RELEASE and 1442 * SI_VERSION; all others get passed to the native sysinfo(2) 1443 * command. 1444 */ 1445 switch (command) { 1446 case SI_RELEASE: 1447 value = S10_UTS_RELEASE; 1448 break; 1449 1450 case SI_VERSION: 1451 value = S10_UTS_VERSION; 1452 break; 1453 1454 default: 1455 /* 1456 * The default action is to pass the command to the 1457 * native sysinfo(2) syscall. 1458 */ 1459 return (__systemcall(rv, SYS_systeminfo + 1024, 1460 command, buf, count)); 1461 } 1462 1463 len = strlen(value) + 1; 1464 if (count > 0) { 1465 if (s10_uucopystr(value, buf, count) != 0) 1466 return (EFAULT); 1467 1468 /* Assure NULL termination of buf as s10_uucopystr() doesn't. */ 1469 if (len > count && s10_uucopy("\0", buf + (count - 1), 1) != 0) 1470 return (EFAULT); 1471 } 1472 1473 /* 1474 * On success, sysinfo(2) returns the size of buffer required to hold 1475 * the complete value plus its terminating NULL byte. 1476 */ 1477 (void) S10_TRUSS_POINT_3(rv, SYS_systeminfo, 0, command, buf, count); 1478 rv->sys_rval1 = len; 1479 rv->sys_rval2 = 0; 1480 return (0); 1481 } 1482 1483 #if defined(__x86) 1484 #if defined(__amd64) 1485 /* 1486 * 64-bit x86 LWPs created by SYS_lwp_create start here if they need to set 1487 * their %fs registers to the legacy Solaris 10 selector value. 
 *
 * This function does three things:
 *
 *	1.  Trap to the kernel so that it can set %fs to the legacy Solaris 10
 *	    selector value.
 *	2.  Read the LWP's true entry point (the entry point supplied by libc
 *	    when SYS_lwp_create was invoked) from %r14.
 *	3.  Eliminate this function's stack frame and pass control to the LWP's
 *	    true entry point.
 *
 * See the comment above s10_lwp_create_correct_fs() (see below) for the reason
 * why this function exists.
 */
/*ARGSUSED*/
static void
s10_lwp_create_entry_point(void *ulwp_structp)
{
	sysret_t rval;

	/*
	 * The new LWP's %fs register is initially zero, but libc won't
	 * function correctly when %fs is zero.  Change the LWP's %fs register
	 * via SYS_brand.
	 */
	(void) __systemcall(&rval, SYS_brand + 1024, B_S10_FSREGCORRECTION);

	/*
	 * Jump to the true entry point, which is stored in %r14.
	 * Remove our stack frame before jumping so that
	 * s10_lwp_create_entry_point() won't be seen in stack traces.
	 *
	 * NOTE: s10_lwp_create_entry_point() pushes %r12 onto its stack frame
	 * so that it can use it as a temporary register.  We don't restore %r12
	 * in this assembly block because we don't care about its value (and
	 * neither does _lwp_start()).  Besides, the System V ABI AMD64
	 * Architecture Processor Supplement doesn't specify that %r12 should
	 * have a special value when LWPs start, so we can ignore its value when
	 * we jump to the true entry point.  Furthermore, %r12 is a callee-saved
	 * register, so the true entry point should push %r12 onto its stack
	 * before using the register.  We ignore %r14 after we read it for
	 * similar reasons.
	 *
	 * NOTE: The compiler will generate a function epilogue for this
	 * function despite the fact that the LWP will never execute it.
	 * We could hand-code this entire function in assembly to eliminate
	 * the epilogue, but the epilogue is only three or four instructions,
	 * so we wouldn't save much space.  Besides, why would we want
	 * to create yet another ugly, hard-to-maintain assembly function when
	 * we could write most of it in C?
	 */
	__asm__ __volatile__(
	    "movq %0, %%rdi\n\t"	/* pass ulwp_structp as arg1 */
	    "movq %%rbp, %%rsp\n\t"	/* eliminate the stack frame */
	    "popq %%rbp\n\t"
	    "jmp *%%r14\n\t"		/* jump to the true entry point */
	    : : "r" (ulwp_structp));
	/*NOTREACHED*/
}

/*
 * The S10 libc expects that %fs will be nonzero for new 64-bit x86 LWPs but the
 * Nevada kernel clears %fs for such LWPs.  Unfortunately, new LWPs do not issue
 * SYS_lwp_private (see s10_lwp_private() below) after they are created, so
 * we must ensure that new LWPs invoke a brand operation that sets %fs to a
 * nonzero value immediately after their creation.
 *
 * The easiest way to do this is to make new LWPs start at a special function,
 * s10_lwp_create_entry_point() (see its definition above), that invokes the
 * brand operation that corrects %fs.  We'll store the entry points of new LWPs
 * in their %r14 registers so that s10_lwp_create_entry_point() can find and
 * call them after invoking the special brand operation.  %r14 is a callee-saved
 * register; therefore, any functions invoked by s10_lwp_create_entry_point()
 * and all functions dealing with signals (e.g., sigacthandler()) will preserve
 * %r14 for s10_lwp_create_entry_point().
 *
 * The Nevada kernel can safely work with nonzero %fs values because the kernel
 * configures per-thread %fs segment descriptors so that the legacy %fs selector
 * value will still work.  See the comment in lwp_load() regarding %fs and
 * %fsbase in 64-bit x86 processes.
1567 * 1568 * This emulation exists thanks to CRs 6467491 and 6501650. 1569 */ 1570 static int 1571 s10_lwp_create_correct_fs(sysret_t *rval, ucontext_t *ucp, int flags, 1572 id_t *new_lwp) 1573 { 1574 ucontext_t s10_uc; 1575 1576 /* 1577 * Copy the supplied ucontext_t structure to the local stack 1578 * frame and store the new LWP's entry point (the value of %rip 1579 * stored in the ucontext_t) in the new LWP's %r14 register. 1580 * Then make s10_lwp_create_entry_point() the new LWP's entry 1581 * point. 1582 */ 1583 if (s10_uucopy(ucp, &s10_uc, sizeof (s10_uc)) != 0) 1584 return (EFAULT); 1585 1586 s10_uc.uc_mcontext.gregs[REG_R14] = s10_uc.uc_mcontext.gregs[REG_RIP]; 1587 s10_uc.uc_mcontext.gregs[REG_RIP] = (greg_t)s10_lwp_create_entry_point; 1588 1589 /* fix up the signal mask */ 1590 if (s10_uc.uc_flags & UC_SIGMASK) 1591 (void) s10sigset_to_native(&s10_uc.uc_sigmask, 1592 &s10_uc.uc_sigmask); 1593 1594 /* 1595 * Issue SYS_lwp_create to create the new LWP. We pass the 1596 * modified ucontext_t to make sure that the new LWP starts at 1597 * s10_lwp_create_entry_point(). 1598 */ 1599 return (__systemcall(rval, SYS_lwp_create + 1024, &s10_uc, 1600 flags, new_lwp)); 1601 } 1602 #endif /* __amd64 */ 1603 1604 /* 1605 * SYS_lwp_private is issued by libc_init() to set %fsbase in 64-bit x86 1606 * processes. The Nevada kernel sets %fs to zero but the S10 libc expects 1607 * %fs to be nonzero. We'll pass the issued system call to the kernel untouched 1608 * and invoke a brand operation to set %fs to the legacy S10 selector value. 1609 * 1610 * This emulation exists thanks to CRs 6467491 and 6501650. 1611 */ 1612 static int 1613 s10_lwp_private(sysret_t *rval, int cmd, int which, uintptr_t base) 1614 { 1615 #if defined(__amd64) 1616 int err; 1617 1618 /* 1619 * The current LWP's %fs register should be zero. 
Determine whether the 1620 * Solaris 10 libc with which we're working functions correctly when %fs 1621 * is zero by calling thr_main() after issuing the SYS_lwp_private 1622 * syscall. If thr_main() barfs (returns -1), then change the LWP's %fs 1623 * register via SYS_brand and patch s10_sysent_table so that issuing 1624 * SYS_lwp_create executes s10_lwp_create_correct_fs() rather than the 1625 * default s10_lwp_create(). s10_lwp_create_correct_fs() will 1626 * guarantee that new LWPs will have correct %fs values. 1627 */ 1628 if ((err = __systemcall(rval, SYS_lwp_private + 1024, cmd, which, 1629 base)) != 0) 1630 return (err); 1631 if (thr_main() == -1) { 1632 /* 1633 * SYS_lwp_private is only issued by libc_init(), which is 1634 * executed when libc is first loaded by ld.so.1. Thus we 1635 * are guaranteed to be single-threaded at this point. Even 1636 * if we were multithreaded at this point, writing a 64-bit 1637 * value to the st_callc field of a s10_sysent_table 1638 * entry is guaranteed to be atomic on 64-bit x86 chips 1639 * as long as the field is not split across cache lines 1640 * (It shouldn't be.). See chapter 8, section 1.1 of 1641 * "The Intel 64 and IA32 Architectures Software Developer's 1642 * Manual," Volume 3A for more details. 1643 */ 1644 s10_sysent_table[SYS_lwp_create].st_callc = 1645 (sysent_cb_t)s10_lwp_create_correct_fs; 1646 return (__systemcall(rval, SYS_brand + 1024, 1647 B_S10_FSREGCORRECTION)); 1648 } 1649 return (0); 1650 #else /* !__amd64 */ 1651 return (__systemcall(rval, SYS_lwp_private + 1024, cmd, which, base)); 1652 #endif /* !__amd64 */ 1653 } 1654 #endif /* __x86 */ 1655 1656 /* 1657 * The Opensolaris versions of lwp_mutex_timedlock() and lwp_mutex_trylock() 1658 * add an extra argument to the interfaces, a uintptr_t value for the mutex's 1659 * mutex_owner field. The Solaris 10 libc assigns the mutex_owner field at 1660 * user-level, so we just make the extra argument be zero in both syscalls. 
1661 */ 1662 1663 static int 1664 s10_lwp_mutex_timedlock(sysret_t *rval, lwp_mutex_t *lp, timespec_t *tsp) 1665 { 1666 return (__systemcall(rval, SYS_lwp_mutex_timedlock + 1024, lp, tsp, 0)); 1667 } 1668 1669 static int 1670 s10_lwp_mutex_trylock(sysret_t *rval, lwp_mutex_t *lp) 1671 { 1672 return (__systemcall(rval, SYS_lwp_mutex_trylock + 1024, lp, 0)); 1673 } 1674 1675 /* 1676 * If the emul_global_zone flag is set then emulate some aspects of the 1677 * zone system call. In particular, emulate the global zone ID on the 1678 * ZONE_LOOKUP subcommand and emulate some of the global zone attributes 1679 * on the ZONE_GETATTR subcommand. If the flag is not set or we're performing 1680 * some other operation, simply pass the calls through. 1681 */ 1682 int 1683 s10_zone(sysret_t *rval, int cmd, void *arg1, void *arg2, void *arg3, 1684 void *arg4) 1685 { 1686 char *aval; 1687 int len; 1688 zoneid_t zid; 1689 int attr; 1690 char *buf; 1691 size_t bufsize; 1692 1693 /* 1694 * We only emulate the zone syscall for a subset of specific commands, 1695 * otherwise we just pass the call through. 1696 */ 1697 if (!emul_global_zone) 1698 return (__systemcall(rval, SYS_zone + 1024, cmd, arg1, arg2, 1699 arg3, arg4)); 1700 1701 switch (cmd) { 1702 case ZONE_LOOKUP: 1703 (void) S10_TRUSS_POINT_1(rval, SYS_zone, 0, cmd); 1704 rval->sys_rval1 = GLOBAL_ZONEID; 1705 rval->sys_rval2 = 0; 1706 return (0); 1707 1708 case ZONE_GETATTR: 1709 zid = (zoneid_t)(uintptr_t)arg1; 1710 attr = (int)(uintptr_t)arg2; 1711 buf = (char *)arg3; 1712 bufsize = (size_t)arg4; 1713 1714 /* 1715 * If the request is for the global zone then we're emulating 1716 * that, otherwise pass this thru. 
1717 */ 1718 if (zid != GLOBAL_ZONEID) 1719 goto passthru; 1720 1721 switch (attr) { 1722 case ZONE_ATTR_NAME: 1723 aval = GLOBAL_ZONENAME; 1724 break; 1725 1726 case ZONE_ATTR_BRAND: 1727 aval = NATIVE_BRAND_NAME; 1728 break; 1729 default: 1730 /* 1731 * We only emulate a subset of the attrs, use the 1732 * real zone id to pass thru the rest. 1733 */ 1734 arg1 = (void *)(uintptr_t)zoneid; 1735 goto passthru; 1736 } 1737 1738 (void) S10_TRUSS_POINT_5(rval, SYS_zone, 0, cmd, zid, attr, 1739 buf, bufsize); 1740 1741 len = strlen(aval) + 1; 1742 if (len > bufsize) 1743 return (ENAMETOOLONG); 1744 1745 if (buf != NULL) { 1746 if (len == 1) { 1747 if (s10_uucopy("\0", buf, 1) != 0) 1748 return (EFAULT); 1749 } else { 1750 if (s10_uucopystr(aval, buf, len) != 0) 1751 return (EFAULT); 1752 1753 /* 1754 * Assure NULL termination of "buf" as 1755 * s10_uucopystr() does NOT. 1756 */ 1757 if (s10_uucopy("\0", buf + (len - 1), 1) != 0) 1758 return (EFAULT); 1759 } 1760 } 1761 1762 rval->sys_rval1 = len; 1763 rval->sys_rval2 = 0; 1764 return (0); 1765 1766 default: 1767 break; 1768 } 1769 1770 passthru: 1771 return (__systemcall(rval, SYS_zone + 1024, cmd, arg1, arg2, arg3, 1772 arg4)); 1773 } 1774 1775 /* 1776 * Close a libc file handle, but don't actually close the underlying 1777 * file descriptor. 1778 */ 1779 static void 1780 s10_close_fh(FILE *file) 1781 { 1782 int fd, fd_new; 1783 1784 if (file == NULL) 1785 return; 1786 1787 if ((fd = fileno(file)) < 0) 1788 return; 1789 1790 /* 1791 * We're a branded process but our handler isn't installed yet. We 1792 * can't use the dup() syscall since it no longer exists. 
1793 */ 1794 fd_new = fcntl(fd, F_DUPFD, 0); 1795 if (fd_new == -1) 1796 return; 1797 1798 (void) fclose(file); 1799 (void) dup2(fd_new, fd); 1800 (void) close(fd_new); 1801 } 1802 1803 /*ARGSUSED*/ 1804 int 1805 s10_init(int argc, char *argv[], char *envp[]) 1806 { 1807 sysret_t rval; 1808 s10_brand_reg_t reg; 1809 s10_elf_data_t sed; 1810 auxv_t *ap; 1811 uintptr_t *p; 1812 int i, err; 1813 char *bname; 1814 1815 /* Sanity check our translation table return value codes */ 1816 for (i = 0; i < NSYSCALL; i++) { 1817 s10_sysent_table_t *est = &(s10_sysent_table[i]); 1818 s10_assert(BIT_ONLYONESET(est->st_args & RV_MASK)); 1819 } 1820 1821 /* 1822 * We need to shutdown all libc stdio. libc stdio normally goes to 1823 * file descriptors, but since we're actually part of a another 1824 * process we don't own these file descriptors and we can't make 1825 * any assumptions about their state. 1826 */ 1827 s10_close_fh(stdin); 1828 s10_close_fh(stdout); 1829 s10_close_fh(stderr); 1830 1831 /* 1832 * Cache the pid of the zone's init process and determine if 1833 * we're init(1m) for the zone. Remember: we might be init 1834 * now, but as soon as we fork(2) we won't be. 1835 */ 1836 (void) get_initpid_info(); 1837 1838 /* get the current zoneid */ 1839 err = __systemcall(&rval, SYS_zone, ZONE_LOOKUP, NULL); 1840 s10_assert(err == 0); 1841 zoneid = (zoneid_t)rval.sys_rval1; 1842 1843 /* Get the zone's emulation bitmap. */ 1844 if ((err = __systemcall(&rval, SYS_zone, ZONE_GETATTR, zoneid, 1845 S10_EMUL_BITMAP, emul_bitmap, sizeof (emul_bitmap))) != 0) { 1846 s10_abort(err, "The zone's patch level is unsupported"); 1847 /*NOTREACHED*/ 1848 } 1849 1850 bname = basename(argv[0]); 1851 1852 /* 1853 * In general we want the S10 commands that are zone-aware to continue 1854 * to behave as they normally do within a zone. Since these commands 1855 * are zone-aware, they should continue to "do the right thing". 
1856 * However, some zone-aware commands aren't going to work the way 1857 * we expect them to inside the branded zone. In particular, the pkg 1858 * and patch commands will not properly manage all pkgs/patches 1859 * unless the commands think they are running in the global zone. For 1860 * these commands we want to emulate the global zone. 1861 * 1862 * We don't do any emulation for pkgcond since it is typically used 1863 * in pkg/patch postinstall scripts and we want those scripts to do 1864 * the right thing inside a zone. 1865 * 1866 * One issue is the handling of hollow pkgs. Since the pkgs are 1867 * hollow, they won't use pkgcond in their postinstall scripts. These 1868 * pkgs typically are installing drivers so we handle that by 1869 * replacing add_drv and rem_drv in the s10_boot script. 1870 */ 1871 if (strcmp("pkgadd", bname) == 0 || strcmp("pkgrm", bname) == 0 || 1872 strcmp("patchadd", bname) == 0 || strcmp("patchrm", bname) == 0) 1873 emul_global_zone = B_TRUE; 1874 1875 /* 1876 * Register our syscall emulation table with the kernel. 1877 * Note that we don't have to do invoke (syscall_number + 1024) 1878 * until we've actually establised a syscall emulation callback 1879 * handler address, which is what we're doing with this brand 1880 * syscall. 1881 */ 1882 reg.sbr_version = S10_VERSION; 1883 #ifdef __x86 1884 reg.sbr_handler = (caddr_t)s10_handler_table; 1885 #else /* !__x86 */ 1886 reg.sbr_handler = (caddr_t)s10_handler; 1887 #endif /* !__x86 */ 1888 1889 if ((err = __systemcall(&rval, SYS_brand, B_REGISTER, ®)) != 0) { 1890 s10_abort(err, "Failed to brand current process"); 1891 /*NOTREACHED*/ 1892 } 1893 1894 /* Get data about the executable we're running from the kernel. */ 1895 if ((err = __systemcall(&rval, SYS_brand + 1024, 1896 B_ELFDATA, (void *)&sed)) != 0) { 1897 s10_abort(err, 1898 "Failed to get required brand ELF data from the kernel"); 1899 /*NOTREACHED*/ 1900 } 1901 1902 /* 1903 * Find the aux vector on the stack. 
1904 */ 1905 p = (uintptr_t *)envp; 1906 while (*p != NULL) 1907 p++; 1908 1909 /* 1910 * p is now pointing at the 0 word after the environ pointers. 1911 * After that is the aux vectors. 1912 * 1913 * The aux vectors are currently pointing to the brand emulation 1914 * library and associated linker. We're going to change them to 1915 * point to the brand executable and associated linker (or to no 1916 * linker for static binaries). This matches the process data 1917 * stored within the kernel and visible from /proc, which was 1918 * all setup in s10_elfexec(). We do this so that when a debugger 1919 * attaches to the process it sees the process as a normal solaris 1920 * process, this brand emulation library and everything on it's 1921 * link map will not be visible, unless our librtld_db plugin 1922 * is used. Note that this is very different from how Linux 1923 * branded processes are implemented within lx branded zones. 1924 * In that situation, the primary linkmap of the process is the 1925 * brand emulation libraries linkmap, not the Linux applications 1926 * linkmap. 1927 * 1928 * We also need to clear the AF_SUN_NOPLM flag from the AT_SUN_AUXFLAGS 1929 * aux vector. This flag told our linker that we don't have a 1930 * primary link map. Now that our linker is done initializing, we 1931 * want to clear this flag before we transfer control to the 1932 * applications copy of the linker, since we want that linker to have 1933 * a primary link map which will be the link map for the application 1934 * we're running. 
1935 */ 1936 p++; 1937 for (ap = (auxv_t *)p; ap->a_type != AT_NULL; ap++) { 1938 switch (ap->a_type) { 1939 case AT_BASE: 1940 /* Hide AT_BASE if static binary */ 1941 if (sed.sed_base == NULL) { 1942 ap->a_type = AT_IGNORE; 1943 ap->a_un.a_val = NULL; 1944 } else { 1945 ap->a_un.a_val = sed.sed_base; 1946 } 1947 break; 1948 case AT_ENTRY: 1949 ap->a_un.a_val = sed.sed_entry; 1950 break; 1951 case AT_PHDR: 1952 ap->a_un.a_val = sed.sed_phdr; 1953 break; 1954 case AT_PHENT: 1955 ap->a_un.a_val = sed.sed_phent; 1956 break; 1957 case AT_PHNUM: 1958 ap->a_un.a_val = sed.sed_phnum; 1959 break; 1960 case AT_SUN_AUXFLAGS: 1961 ap->a_un.a_val &= ~AF_SUN_NOPLM; 1962 break; 1963 case AT_SUN_EMULATOR: 1964 /* 1965 * ld.so.1 inspects AT_SUN_EMULATOR to see if 1966 * if it is the linker for the brand emulation 1967 * library. Hide AT_SUN_EMULATOR, as the 1968 * linker we are about to jump to is the linker 1969 * for the binary. 1970 */ 1971 ap->a_type = AT_IGNORE; 1972 ap->a_un.a_val = NULL; 1973 break; 1974 case AT_SUN_LDDATA: 1975 /* Hide AT_SUN_LDDATA if static binary */ 1976 if (sed.sed_lddata == NULL) { 1977 ap->a_type = AT_IGNORE; 1978 ap->a_un.a_val = NULL; 1979 } else { 1980 ap->a_un.a_val = sed.sed_lddata; 1981 } 1982 break; 1983 default: 1984 break; 1985 } 1986 } 1987 1988 s10_runexe(argv, sed.sed_ldentry); 1989 /*NOTREACHED*/ 1990 s10_abort(0, "s10_runexe() returned"); 1991 return (-1); 1992 } 1993 1994 /* 1995 * This table must have at least NSYSCALL entries in it. 1996 * 1997 * The second parameter of each entry in the s10_sysent_table 1998 * contains the number of parameters and flags that describe the 1999 * syscall return value encoding. See the block comments at the 2000 * top of this file for more information about the syscall return 2001 * value flags and when they should be used. 
2002 */ 2003 s10_sysent_table_t s10_sysent_table[] = { 2004 #if defined(__sparc) && !defined(__sparcv9) 2005 EMULATE(s10_indir, 9 | RV_64RVAL), /* 0 */ 2006 #else 2007 NOSYS, /* 0 */ 2008 #endif 2009 NOSYS, /* 1 */ 2010 EMULATE(s10_forkall, 0 | RV_32RVAL2), /* 2 */ 2011 NOSYS, /* 3 */ 2012 NOSYS, /* 4 */ 2013 EMULATE(s10_open, 3 | RV_DEFAULT), /* 5 */ 2014 NOSYS, /* 6 */ 2015 EMULATE(s10_wait, 0 | RV_32RVAL2), /* 7 */ 2016 EMULATE(s10_creat, 2 | RV_DEFAULT), /* 8 */ 2017 NOSYS, /* 9 */ 2018 EMULATE(s10_unlink, 1 | RV_DEFAULT), /* 10 */ 2019 EMULATE(s10_exec, 2 | RV_DEFAULT), /* 11 */ 2020 NOSYS, /* 12 */ 2021 NOSYS, /* 13 */ 2022 NOSYS, /* 14 */ 2023 NOSYS, /* 15 */ 2024 EMULATE(s10_chown, 3 | RV_DEFAULT), /* 16 */ 2025 NOSYS, /* 17 */ 2026 EMULATE(s10_stat, 2 | RV_DEFAULT), /* 18 */ 2027 NOSYS, /* 19 */ 2028 NOSYS, /* 20 */ 2029 NOSYS, /* 21 */ 2030 EMULATE(s10_umount, 1 | RV_DEFAULT), /* 22 */ 2031 NOSYS, /* 23 */ 2032 NOSYS, /* 24 */ 2033 NOSYS, /* 25 */ 2034 NOSYS, /* 26 */ 2035 NOSYS, /* 27 */ 2036 EMULATE(s10_fstat, 2 | RV_DEFAULT), /* 28 */ 2037 NOSYS, /* 29 */ 2038 EMULATE(s10_utime, 2 | RV_DEFAULT), /* 30 */ 2039 NOSYS, /* 31 */ 2040 NOSYS, /* 32 */ 2041 EMULATE(s10_access, 2 | RV_DEFAULT), /* 33 */ 2042 NOSYS, /* 34 */ 2043 NOSYS, /* 35 */ 2044 NOSYS, /* 36 */ 2045 EMULATE(s10_kill, 2 | RV_DEFAULT), /* 37 */ 2046 NOSYS, /* 38 */ 2047 NOSYS, /* 39 */ 2048 NOSYS, /* 40 */ 2049 EMULATE(s10_dup, 1 | RV_DEFAULT), /* 41 */ 2050 NOSYS, /* 42 */ 2051 NOSYS, /* 43 */ 2052 NOSYS, /* 44 */ 2053 NOSYS, /* 45 */ 2054 NOSYS, /* 46 */ 2055 NOSYS, /* 47 */ 2056 NOSYS, /* 48 */ 2057 NOSYS, /* 49 */ 2058 NOSYS, /* 50 */ 2059 NOSYS, /* 51 */ 2060 NOSYS, /* 52 */ 2061 NOSYS, /* 53 */ 2062 EMULATE(s10_ioctl, 3 | RV_DEFAULT), /* 54 */ 2063 NOSYS, /* 55 */ 2064 NOSYS, /* 56 */ 2065 NOSYS, /* 57 */ 2066 NOSYS, /* 58 */ 2067 EMULATE(s10_execve, 3 | RV_DEFAULT), /* 59 */ 2068 NOSYS, /* 60 */ 2069 NOSYS, /* 61 */ 2070 NOSYS, /* 62 */ 2071 NOSYS, /* 63 */ 2072 NOSYS, /* 64 */ 2073 
NOSYS, /* 65 */ 2074 NOSYS, /* 66 */ 2075 NOSYS, /* 67 */ 2076 NOSYS, /* 68 */ 2077 NOSYS, /* 69 */ 2078 NOSYS, /* 70 */ 2079 EMULATE(s10_acctctl, 3 | RV_DEFAULT), /* 71 */ 2080 NOSYS, /* 72 */ 2081 NOSYS, /* 73 */ 2082 NOSYS, /* 74 */ 2083 EMULATE(s10_issetugid, 0 | RV_DEFAULT), /* 75 */ 2084 EMULATE(s10_fsat, 6 | RV_DEFAULT), /* 76 */ 2085 NOSYS, /* 77 */ 2086 NOSYS, /* 78 */ 2087 EMULATE(s10_rmdir, 1 | RV_DEFAULT), /* 79 */ 2088 NOSYS, /* 80 */ 2089 EMULATE(s10_getdents, 3 | RV_DEFAULT), /* 81 */ 2090 NOSYS, /* 82 */ 2091 NOSYS, /* 83 */ 2092 NOSYS, /* 84 */ 2093 NOSYS, /* 85 */ 2094 NOSYS, /* 86 */ 2095 EMULATE(s10_poll, 3 | RV_DEFAULT), /* 87 */ 2096 EMULATE(s10_lstat, 2 | RV_DEFAULT), /* 88 */ 2097 NOSYS, /* 89 */ 2098 NOSYS, /* 90 */ 2099 NOSYS, /* 91 */ 2100 NOSYS, /* 92 */ 2101 NOSYS, /* 93 */ 2102 EMULATE(s10_fchown, 3 | RV_DEFAULT), /* 94 */ 2103 EMULATE(s10_sigprocmask, 3 | RV_DEFAULT), /* 95 */ 2104 EMULATE(s10_sigsuspend, 1 | RV_DEFAULT), /* 96 */ 2105 NOSYS, /* 97 */ 2106 EMULATE(s10_sigaction, 3 | RV_DEFAULT), /* 98 */ 2107 EMULATE(s10_sigpending, 2 | RV_DEFAULT), /* 99 */ 2108 NOSYS, /* 100 */ 2109 NOSYS, /* 101 */ 2110 NOSYS, /* 102 */ 2111 NOSYS, /* 103 */ 2112 NOSYS, /* 104 */ 2113 NOSYS, /* 105 */ 2114 NOSYS, /* 106 */ 2115 EMULATE(s10_waitid, 4 | RV_DEFAULT), /* 107 */ 2116 EMULATE(s10_sigsendsys, 2 | RV_DEFAULT), /* 108 */ 2117 NOSYS, /* 109 */ 2118 NOSYS, /* 110 */ 2119 NOSYS, /* 111 */ 2120 NOSYS, /* 112 */ 2121 NOSYS, /* 113 */ 2122 NOSYS, /* 114 */ 2123 NOSYS, /* 115 */ 2124 NOSYS, /* 116 */ 2125 NOSYS, /* 117 */ 2126 NOSYS, /* 118 */ 2127 NOSYS, /* 119 */ 2128 NOSYS, /* 120 */ 2129 NOSYS, /* 121 */ 2130 NOSYS, /* 122 */ 2131 #if defined(__x86) 2132 EMULATE(s10_xstat, 3 | RV_DEFAULT), /* 123 */ 2133 EMULATE(s10_lxstat, 3 | RV_DEFAULT), /* 124 */ 2134 EMULATE(s10_fxstat, 3 | RV_DEFAULT), /* 125 */ 2135 EMULATE(s10_xmknod, 4 | RV_DEFAULT), /* 126 */ 2136 #else 2137 NOSYS, /* 123 */ 2138 NOSYS, /* 124 */ 2139 NOSYS, /* 125 */ 2140 NOSYS, /* 
126 */ 2141 #endif 2142 NOSYS, /* 127 */ 2143 NOSYS, /* 128 */ 2144 NOSYS, /* 129 */ 2145 EMULATE(s10_lchown, 3 | RV_DEFAULT), /* 130 */ 2146 NOSYS, /* 131 */ 2147 NOSYS, /* 132 */ 2148 NOSYS, /* 133 */ 2149 EMULATE(s10_rename, 2 | RV_DEFAULT), /* 134 */ 2150 EMULATE(s10_uname, 1 | RV_DEFAULT), /* 135 */ 2151 NOSYS, /* 136 */ 2152 EMULATE(s10_sysconfig, 1 | RV_DEFAULT), /* 137 */ 2153 NOSYS, /* 138 */ 2154 EMULATE(s10_sysinfo, 3 | RV_DEFAULT), /* 139 */ 2155 NOSYS, /* 140 */ 2156 NOSYS, /* 141 */ 2157 NOSYS, /* 142 */ 2158 EMULATE(s10_fork1, 0 | RV_32RVAL2), /* 143 */ 2159 EMULATE(s10_sigtimedwait, 3 | RV_DEFAULT), /* 144 */ 2160 NOSYS, /* 145 */ 2161 NOSYS, /* 146 */ 2162 EMULATE(s10_lwp_sema_wait, 1 | RV_DEFAULT), /* 147 */ 2163 NOSYS, /* 148 */ 2164 NOSYS, /* 149 */ 2165 NOSYS, /* 150 */ 2166 NOSYS, /* 151 */ 2167 NOSYS, /* 152 */ 2168 NOSYS, /* 153 */ 2169 EMULATE(s10_utimes, 2 | RV_DEFAULT), /* 154 */ 2170 NOSYS, /* 155 */ 2171 NOSYS, /* 156 */ 2172 NOSYS, /* 157 */ 2173 NOSYS, /* 158 */ 2174 EMULATE(s10_lwp_create, 3 | RV_DEFAULT), /* 159 */ 2175 NOSYS, /* 160 */ 2176 NOSYS, /* 161 */ 2177 NOSYS, /* 162 */ 2178 EMULATE(s10_lwp_kill, 2 | RV_DEFAULT), /* 163 */ 2179 NOSYS, /* 164 */ 2180 EMULATE(s10_lwp_sigmask, 3 | RV_32RVAL2), /* 165 */ 2181 #if defined(__x86) 2182 EMULATE(s10_lwp_private, 3 | RV_DEFAULT), /* 166 */ 2183 #else 2184 NOSYS, /* 166 */ 2185 #endif 2186 NOSYS, /* 167 */ 2187 NOSYS, /* 168 */ 2188 EMULATE(s10_lwp_mutex_lock, 1 | RV_DEFAULT), /* 169 */ 2189 NOSYS, /* 170 */ 2190 NOSYS, /* 171 */ 2191 NOSYS, /* 172 */ 2192 NOSYS, /* 173 */ 2193 EMULATE(s10_pwrite, 4 | RV_DEFAULT), /* 174 */ 2194 NOSYS, /* 175 */ 2195 NOSYS, /* 176 */ 2196 NOSYS, /* 177 */ 2197 NOSYS, /* 178 */ 2198 NOSYS, /* 179 */ 2199 NOSYS, /* 180 */ 2200 NOSYS, /* 181 */ 2201 NOSYS, /* 182 */ 2202 NOSYS, /* 183 */ 2203 NOSYS, /* 184 */ 2204 NOSYS, /* 185 */ 2205 EMULATE(s10_auditsys, 4 | RV_64RVAL), /* 186 */ 2206 NOSYS, /* 187 */ 2207 NOSYS, /* 188 */ 2208 NOSYS, /* 189 */ 2209 
EMULATE(s10_sigqueue, 4 | RV_DEFAULT), /* 190 */ 2210 NOSYS, /* 191 */ 2211 NOSYS, /* 192 */ 2212 NOSYS, /* 193 */ 2213 NOSYS, /* 194 */ 2214 NOSYS, /* 195 */ 2215 NOSYS, /* 196 */ 2216 NOSYS, /* 197 */ 2217 NOSYS, /* 198 */ 2218 NOSYS, /* 199 */ 2219 NOSYS, /* 200 */ 2220 NOSYS, /* 201 */ 2221 NOSYS, /* 202 */ 2222 NOSYS, /* 203 */ 2223 NOSYS, /* 204 */ 2224 EMULATE(s10_signotify, 3 | RV_DEFAULT), /* 205 */ 2225 NOSYS, /* 206 */ 2226 NOSYS, /* 207 */ 2227 NOSYS, /* 208 */ 2228 NOSYS, /* 209 */ 2229 EMULATE(s10_lwp_mutex_timedlock, 2 | RV_DEFAULT), /* 210 */ 2230 NOSYS, /* 211 */ 2231 NOSYS, /* 212 */ 2232 #if defined(_LP64) 2233 NOSYS, /* 213 */ 2234 #else 2235 EMULATE(s10_getdents64, 3 | RV_DEFAULT), /* 213 */ 2236 #endif 2237 NOSYS, /* 214 */ 2238 #if defined(_LP64) 2239 NOSYS, /* 215 */ 2240 NOSYS, /* 216 */ 2241 NOSYS, /* 217 */ 2242 #else 2243 EMULATE(s10_stat64, 2 | RV_DEFAULT), /* 215 */ 2244 EMULATE(s10_lstat64, 2 | RV_DEFAULT), /* 216 */ 2245 EMULATE(s10_fstat64, 2 | RV_DEFAULT), /* 217 */ 2246 #endif 2247 NOSYS, /* 218 */ 2248 NOSYS, /* 219 */ 2249 NOSYS, /* 220 */ 2250 NOSYS, /* 221 */ 2251 NOSYS, /* 222 */ 2252 #if defined(_LP64) 2253 NOSYS, /* 223 */ 2254 NOSYS, /* 224 */ 2255 NOSYS, /* 225 */ 2256 #else 2257 EMULATE(s10_pwrite64, 5 | RV_DEFAULT), /* 223 */ 2258 EMULATE(s10_creat64, 2 | RV_DEFAULT), /* 224 */ 2259 EMULATE(s10_open64, 3 | RV_DEFAULT), /* 225 */ 2260 #endif 2261 NOSYS, /* 226 */ 2262 EMULATE(s10_zone, 5 | RV_DEFAULT), /* 227 */ 2263 NOSYS, /* 228 */ 2264 NOSYS, /* 229 */ 2265 NOSYS, /* 230 */ 2266 NOSYS, /* 231 */ 2267 NOSYS, /* 232 */ 2268 NOSYS, /* 233 */ 2269 NOSYS, /* 234 */ 2270 NOSYS, /* 235 */ 2271 NOSYS, /* 236 */ 2272 NOSYS, /* 237 */ 2273 NOSYS, /* 238 */ 2274 NOSYS, /* 239 */ 2275 NOSYS, /* 240 */ 2276 NOSYS, /* 241 */ 2277 NOSYS, /* 242 */ 2278 NOSYS, /* 243 */ 2279 NOSYS, /* 244 */ 2280 NOSYS, /* 245 */ 2281 NOSYS, /* 246 */ 2282 NOSYS, /* 247 */ 2283 NOSYS, /* 248 */ 2284 NOSYS, /* 249 */ 2285 NOSYS, /* 250 */ 2286 
EMULATE(s10_lwp_mutex_trylock, 1 | RV_DEFAULT), /* 251 */ 2287 NOSYS, /* 252 */ 2288 NOSYS, /* 253 */ 2289 NOSYS, /* 254 */ 2290 NOSYS /* 255 */ 2291 }; 2292