1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. 24 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 25 * Copyright 2018, Joyent, Inc. 26 */ 27 28 #include <sys/errno.h> 29 #include <sys/exec.h> 30 #include <sys/file.h> 31 #include <sys/kmem.h> 32 #include <sys/modctl.h> 33 #include <sys/model.h> 34 #include <sys/proc.h> 35 #include <sys/syscall.h> 36 #include <sys/systm.h> 37 #include <sys/thread.h> 38 #include <sys/cmn_err.h> 39 #include <sys/archsystm.h> 40 #include <sys/pathname.h> 41 #include <sys/sunddi.h> 42 43 #include <sys/machbrand.h> 44 #include <sys/brand.h> 45 #include "s10_brand.h" 46 47 char *s10_emulation_table = NULL; 48 49 void s10_init_brand_data(zone_t *); 50 void s10_free_brand_data(zone_t *); 51 void s10_setbrand(proc_t *); 52 int s10_getattr(zone_t *, int, void *, size_t *); 53 int s10_setattr(zone_t *, int, void *, size_t); 54 int s10_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, 55 uintptr_t, uintptr_t, uintptr_t); 56 void s10_copy_procdata(proc_t *, proc_t *); 57 void s10_proc_exit(struct proc *, klwp_t *); 58 void s10_exec(); 59 int s10_initlwp(klwp_t *); 60 void s10_forklwp(klwp_t *, klwp_t *); 61 void s10_freelwp(klwp_t *); 62 void s10_lwpexit(klwp_t *); 63 int s10_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, 64 long *, int, caddr_t, cred_t *, int); 65 void s10_sigset_native_to_s10(sigset_t *); 66 void s10_sigset_s10_to_native(sigset_t *); 67 68 /* s10 brand */ 69 struct brand_ops s10_brops = { 70 s10_init_brand_data, 71 s10_free_brand_data, 72 s10_brandsys, 73 s10_setbrand, 74 s10_getattr, 75 s10_setattr, 76 s10_copy_procdata, 77 s10_proc_exit, 78 s10_exec, 79 lwp_setrval, 80 s10_initlwp, 81 s10_forklwp, 82 s10_freelwp, 83 s10_lwpexit, 84 s10_elfexec, 85 s10_sigset_native_to_s10, 86 s10_sigset_s10_to_native, 87 S10_NSIG, 88 }; 89 90 #ifdef sparc 91 92 struct brand_mach_ops s10_mops = { 93 s10_brand_syscall_callback, 94 s10_brand_syscall32_callback 95 }; 96 97 #else /* sparc */ 98 99 #ifdef __amd64 100 101 struct brand_mach_ops s10_mops = { 102 s10_brand_sysenter_callback, 103 s10_brand_int91_callback, 104 s10_brand_syscall_callback, 105 s10_brand_syscall32_callback 106 }; 107 108 #else /* ! __amd64 */ 109 110 struct brand_mach_ops s10_mops = { 111 s10_brand_sysenter_callback, 112 NULL, 113 s10_brand_syscall_callback, 114 NULL 115 }; 116 #endif /* __amd64 */ 117 118 #endif /* _sparc */ 119 120 struct brand s10_brand = { 121 BRAND_VER_1, 122 "solaris10", 123 &s10_brops, 124 &s10_mops 125 }; 126 127 static struct modlbrand modlbrand = { 128 &mod_brandops, /* type of module */ 129 "Solaris 10 Brand", /* description of module */ 130 &s10_brand /* driver ops */ 131 }; 132 133 static struct modlinkage modlinkage = { 134 MODREV_1, (void *)&modlbrand, NULL 135 }; 136 137 void 138 s10_setbrand(proc_t *p) 139 { 140 brand_solaris_setbrand(p, &s10_brand); 141 } 142 143 /*ARGSUSED*/ 144 int 145 s10_getattr(zone_t *zone, int attr, void *buf, size_t *bufsize) 146 { 147 ASSERT(zone->zone_brand == &s10_brand); 148 if (attr == S10_EMUL_BITMAP) { 149 if (buf == NULL || *bufsize != sizeof (s10_emul_bitmap_t)) 150 return (EINVAL); 151 if (copyout(((s10_zone_data_t *)zone->zone_brand_data)-> 152 emul_bitmap, buf, sizeof (s10_emul_bitmap_t)) != 0) 153 return (EFAULT); 154 return (0); 155 } 156 157 return (EINVAL); 158 } 159 160 int 161 s10_setattr(zone_t *zone, int attr, void *buf, size_t bufsize) 162 { 163 ASSERT(zone->zone_brand == &s10_brand); 164 if (attr == S10_EMUL_BITMAP) { 165 if (buf == NULL || bufsize != sizeof (s10_emul_bitmap_t)) 166 return (EINVAL); 167 if (copyin(buf, ((s10_zone_data_t *)zone->zone_brand_data)-> 168 emul_bitmap, sizeof (s10_emul_bitmap_t)) != 0) 169 return (EFAULT); 170 return (0); 171 } 172 173 return (EINVAL); 174 } 175 176 #ifdef __amd64 177 /* 178 * The Nevada kernel clears %fs for threads in 64-bit x86 processes but S10's 179 * libc expects %fs to be nonzero. This causes some committed 180 * libc/libthread interfaces (e.g., thr_main()) to fail, which impacts several 181 * libraries, including libdoor. This function sets the specified LWP's %fs 182 * register to the legacy S10 selector value (LWPFS_SEL). 183 * 184 * The best solution to the aforementioned problem is backporting CRs 185 * 6467491 to Solaris 10 so that 64-bit x86 Solaris 10 processes 186 * would accept zero for %fs. Backporting the CRs is a requirement for running 187 * S10 Containers in PV domUs because 64-bit Xen clears %fsbase when %fs is 188 * nonzero. Such behavior breaks 64-bit processes because Xen has to fetch the 189 * FS segments' base addresses from the LWPs' GDTs, which are only capable of 190 * 32-bit addressing. 191 */ 192 /*ARGSUSED*/ 193 static void 194 s10_amd64_correct_fsreg(klwp_t *l) 195 { 196 if (lwp_getdatamodel(l) == DATAMODEL_NATIVE) { 197 kpreempt_disable(); 198 l->lwp_pcb.pcb_fs = LWPFS_SEL; 199 PCB_SET_UPDATE_SEGS(&l->lwp_pcb); 200 lwptot(l)->t_post_sys = 1; /* Guarantee update_sregs() */ 201 kpreempt_enable(); 202 } 203 } 204 #endif /* __amd64 */ 205 206 /* 207 * Native processes are started with the native ld.so.1 as the command. This 208 * brand op is invoked by s10_npreload to fix up the command and arguments 209 * so that apps like pgrep or ps see the expected command strings. 210 */ 211 int 212 s10_native(void *cmd, void *args) 213 { 214 struct user *up = PTOU(curproc); 215 char cmd_buf[MAXCOMLEN + 1]; 216 char arg_buf[PSARGSZ]; 217 218 if (copyin(cmd, &cmd_buf, sizeof (cmd_buf)) != 0) 219 return (EFAULT); 220 if (copyin(args, &arg_buf, sizeof (arg_buf)) != 0) 221 return (EFAULT); 222 223 /* 224 * Make sure that the process' interpreter is the native dynamic linker. 225 * Convention dictates that native processes executing within solaris10- 226 * branded zones are interpreted by the native dynamic linker (the 227 * process and its arguments are specified as arguments to the dynamic 228 * linker). If this convention is violated (i.e., 229 * brandsys(B_S10_NATIVE, ...) is invoked by a process that shouldn't be 230 * native), then do nothing and silently indicate success. 231 */ 232 if (strcmp(up->u_comm, S10_LINKER_NAME) != 0) 233 return (0); 234 235 /* 236 * The sizeof has an extra value for the trailing '\0' so this covers 237 * the appended " " in the following strcmps. 238 */ 239 if (strncmp(up->u_psargs, BRAND_NATIVE_LINKER64 " ", 240 sizeof (BRAND_NATIVE_LINKER64)) != 0 && 241 strncmp(up->u_psargs, BRAND_NATIVE_LINKER32 " ", 242 sizeof (BRAND_NATIVE_LINKER32)) != 0) 243 return (0); 244 245 mutex_enter(&curproc->p_lock); 246 (void) strlcpy(up->u_comm, cmd_buf, sizeof (up->u_comm)); 247 (void) strlcpy(up->u_psargs, arg_buf, sizeof (up->u_psargs)); 248 mutex_exit(&curproc->p_lock); 249 250 return (0); 251 } 252 253 /*ARGSUSED*/ 254 int 255 s10_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, 256 uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6) 257 { 258 proc_t *p = curproc; 259 int res; 260 261 *rval = 0; 262 263 if (cmd == B_S10_NATIVE) 264 return (s10_native((void *)arg1, (void *)arg2)); 265 266 res = brand_solaris_cmd(cmd, arg1, arg2, arg3, &s10_brand, S10_VERSION); 267 if (res >= 0) 268 return (res); 269 270 switch ((cmd)) { 271 case B_S10_PIDINFO: 272 /* 273 * The s10 brand needs to be able to get the pid of the 274 * current process and the pid of the zone's init, and it 275 * needs to do this on every process startup. Early in 276 * brand startup, we can't call getpid() because calls to 277 * getpid() represent a magical signal to some old-skool 278 * debuggers. By merging all of this into one call, we 279 * make this quite a bit cheaper and easier to handle in 280 * the brand module. 281 */ 282 if (copyout(&p->p_pid, (void *)arg1, sizeof (pid_t)) != 0) 283 return (EFAULT); 284 if (copyout(&p->p_zone->zone_proc_initpid, (void *)arg2, 285 sizeof (pid_t)) != 0) 286 return (EFAULT); 287 return (0); 288 289 case B_S10_ISFDXATTRDIR: { 290 /* 291 * This subcommand enables the userland brand emulation library 292 * to determine whether a file descriptor refers to an extended 293 * file attributes directory. There is no standard syscall or 294 * libc function that can make such a determination. 295 */ 296 file_t *dir_filep; 297 298 dir_filep = getf((int)arg1); 299 if (dir_filep == NULL) 300 return (EBADF); 301 ASSERT(dir_filep->f_vnode != NULL); 302 *rval = IS_XATTRDIR(dir_filep->f_vnode); 303 releasef((int)arg1); 304 return (0); 305 } 306 307 #ifdef __amd64 308 case B_S10_FSREGCORRECTION: 309 /* 310 * This subcommand exists so that the SYS_lwp_private and 311 * SYS_lwp_create syscalls can manually set the current thread's 312 * %fs register to the legacy S10 selector value for 64-bit x86 313 * processes. 314 */ 315 s10_amd64_correct_fsreg(ttolwp(curthread)); 316 return (0); 317 #endif /* __amd64 */ 318 } 319 320 return (EINVAL); 321 } 322 323 void 324 s10_copy_procdata(proc_t *child, proc_t *parent) 325 { 326 brand_solaris_copy_procdata(child, parent, &s10_brand); 327 } 328 329 void 330 s10_proc_exit(struct proc *p, klwp_t *l) 331 { 332 brand_solaris_proc_exit(p, l, &s10_brand); 333 } 334 335 void 336 s10_exec() 337 { 338 brand_solaris_exec(&s10_brand); 339 } 340 341 int 342 s10_initlwp(klwp_t *l) 343 { 344 return (brand_solaris_initlwp(l, &s10_brand)); 345 } 346 347 void 348 s10_forklwp(klwp_t *p, klwp_t *c) 349 { 350 brand_solaris_forklwp(p, c, &s10_brand); 351 352 #ifdef __amd64 353 /* 354 * Only correct the child's %fs register if the parent's %fs register 355 * is LWPFS_SEL. If the parent's %fs register is zero, then the Solaris 356 * 10 environment that we're emulating uses a version of libc that 357 * works when %fs is zero (i.e., it contains backports of CRs 6467491 358 * and 6501650). 359 */ 360 if (p->lwp_pcb.pcb_fs == LWPFS_SEL) 361 s10_amd64_correct_fsreg(c); 362 #endif /* __amd64 */ 363 } 364 365 void 366 s10_freelwp(klwp_t *l) 367 { 368 brand_solaris_freelwp(l, &s10_brand); 369 } 370 371 void 372 s10_lwpexit(klwp_t *l) 373 { 374 brand_solaris_lwpexit(l, &s10_brand); 375 } 376 377 void 378 s10_free_brand_data(zone_t *zone) 379 { 380 kmem_free(zone->zone_brand_data, sizeof (s10_zone_data_t)); 381 } 382 383 void 384 s10_init_brand_data(zone_t *zone) 385 { 386 ASSERT(zone->zone_brand == &s10_brand); 387 ASSERT(zone->zone_brand_data == NULL); 388 zone->zone_brand_data = kmem_zalloc(sizeof (s10_zone_data_t), KM_SLEEP); 389 } 390 391 int 392 s10_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, 393 int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred, 394 int brand_action) 395 { 396 return (brand_solaris_elfexec(vp, uap, args, idatap, level, execsz, 397 setid, exec_file, cred, brand_action, &s10_brand, S10_BRANDNAME, 398 S10_LIB, S10_LIB32, S10_LINKER, S10_LINKER32)); 399 } 400 401 void 402 s10_sigset_native_to_s10(sigset_t *set) 403 { 404 int nativesig; 405 int s10sig; 406 sigset_t s10set; 407 408 /* 409 * Shortcut: we know the first 32 signals are the same in both 410 * s10 and native Solaris. Just assign the first word. 411 */ 412 s10set.__sigbits[0] = set->__sigbits[0]; 413 s10set.__sigbits[1] = 0; 414 s10set.__sigbits[2] = 0; 415 s10set.__sigbits[3] = 0; 416 417 /* 418 * Copy the remainder of the initial set of common signals. 419 */ 420 for (nativesig = 33; nativesig < S10_SIGRTMIN; nativesig++) 421 if (sigismember(set, nativesig)) 422 sigaddset(&s10set, nativesig); 423 424 /* 425 * Convert any native RT signals to their S10 values. 426 */ 427 for (nativesig = _SIGRTMIN, s10sig = S10_SIGRTMIN; 428 nativesig <= _SIGRTMAX && s10sig <= S10_SIGRTMAX; 429 nativesig++, s10sig++) { 430 if (sigismember(set, nativesig)) 431 sigaddset(&s10set, s10sig); 432 } 433 434 *set = s10set; 435 } 436 437 void 438 s10_sigset_s10_to_native(sigset_t *set) 439 { 440 int s10sig; 441 int nativesig; 442 sigset_t nativeset; 443 444 /* 445 * Shortcut: we know the first 32 signals are the same in both 446 * s10 and native Solaris. Just assign the first word. 447 */ 448 nativeset.__sigbits[0] = set->__sigbits[0]; 449 nativeset.__sigbits[1] = 0; 450 nativeset.__sigbits[2] = 0; 451 nativeset.__sigbits[3] = 0; 452 453 /* 454 * Copy the remainder of the initial set of common signals. 455 */ 456 for (s10sig = 33; s10sig < S10_SIGRTMIN; s10sig++) 457 if (sigismember(set, s10sig)) 458 sigaddset(&nativeset, s10sig); 459 460 /* 461 * Convert any S10 RT signals to their native values. 462 */ 463 for (s10sig = S10_SIGRTMIN, nativesig = _SIGRTMIN; 464 s10sig <= S10_SIGRTMAX && nativesig <= _SIGRTMAX; 465 s10sig++, nativesig++) { 466 if (sigismember(set, s10sig)) 467 sigaddset(&nativeset, nativesig); 468 } 469 470 *set = nativeset; 471 } 472 473 int 474 _init(void) 475 { 476 int err; 477 478 /* 479 * Set up the table indicating which system calls we want to 480 * interpose on. We should probably build this automatically from 481 * a list of system calls that is shared with the user-space 482 * library. 483 */ 484 s10_emulation_table = kmem_zalloc(NSYSCALL, KM_SLEEP); 485 s10_emulation_table[S10_SYS_forkall] = 1; /* 2 */ 486 s10_emulation_table[S10_SYS_open] = 1; /* 5 */ 487 s10_emulation_table[S10_SYS_wait] = 1; /* 7 */ 488 s10_emulation_table[S10_SYS_creat] = 1; /* 8 */ 489 s10_emulation_table[S10_SYS_link] = 1; /* 9 */ 490 s10_emulation_table[S10_SYS_unlink] = 1; /* 10 */ 491 s10_emulation_table[S10_SYS_exec] = 1; /* 11 */ 492 s10_emulation_table[S10_SYS_mknod] = 1; /* 14 */ 493 s10_emulation_table[S10_SYS_chmod] = 1; /* 15 */ 494 s10_emulation_table[S10_SYS_chown] = 1; /* 16 */ 495 s10_emulation_table[S10_SYS_stat] = 1; /* 18 */ 496 s10_emulation_table[S10_SYS_umount] = 1; /* 22 */ 497 s10_emulation_table[S10_SYS_fstat] = 1; /* 28 */ 498 s10_emulation_table[S10_SYS_utime] = 1; /* 30 */ 499 s10_emulation_table[S10_SYS_access] = 1; /* 33 */ 500 s10_emulation_table[SYS_kill] = 1; /* 37 */ 501 s10_emulation_table[S10_SYS_dup] = 1; /* 41 */ 502 s10_emulation_table[S10_SYS_pipe] = 1; /* 42 */ 503 s10_emulation_table[SYS_ioctl] = 1; /* 54 */ 504 s10_emulation_table[SYS_execve] = 1; /* 59 */ 505 s10_emulation_table[SYS_acctctl] = 1; /* 71 */ 506 s10_emulation_table[S10_SYS_issetugid] = 1; /* 75 */ 507 s10_emulation_table[S10_SYS_fsat] = 1; /* 76 */ 508 s10_emulation_table[S10_SYS_rmdir] = 1; /* 79 */ 509 s10_emulation_table[S10_SYS_mkdir] = 1; /* 80 */ 510 s10_emulation_table[SYS_getdents] = 1; /* 81 */ 511 s10_emulation_table[S10_SYS_poll] = 1; /* 87 */ 512 s10_emulation_table[S10_SYS_lstat] = 1; /* 88 */ 513 s10_emulation_table[S10_SYS_symlink] = 1; /* 89 */ 514 s10_emulation_table[S10_SYS_readlink] = 1; /* 90 */ 515 s10_emulation_table[S10_SYS_fchmod] = 1; /* 93 */ 516 s10_emulation_table[S10_SYS_fchown] = 1; /* 94 */ 517 s10_emulation_table[SYS_sigprocmask] = 1; /* 95 */ 518 s10_emulation_table[SYS_sigsuspend] = 1; /* 96 */ 519 s10_emulation_table[SYS_sigaction] = 1; /* 98 */ 520 s10_emulation_table[SYS_sigpending] = 1; /* 99 */ 521 s10_emulation_table[SYS_waitid] = 1; /* 107 */ 522 s10_emulation_table[SYS_sigsendsys] = 1; /* 108 */ 523 #if defined(__x86) 524 s10_emulation_table[S10_SYS_xstat] = 1; /* 123 */ 525 s10_emulation_table[S10_SYS_lxstat] = 1; /* 124 */ 526 s10_emulation_table[S10_SYS_fxstat] = 1; /* 125 */ 527 s10_emulation_table[S10_SYS_xmknod] = 1; /* 126 */ 528 #endif 529 s10_emulation_table[S10_SYS_lchown] = 1; /* 130 */ 530 s10_emulation_table[S10_SYS_rename] = 1; /* 134 */ 531 s10_emulation_table[SYS_uname] = 1; /* 135 */ 532 s10_emulation_table[SYS_sysconfig] = 1; /* 137 */ 533 s10_emulation_table[SYS_systeminfo] = 1; /* 139 */ 534 s10_emulation_table[S10_SYS_fork1] = 1; /* 143 */ 535 s10_emulation_table[SYS_sigtimedwait] = 1; /* 144 */ 536 s10_emulation_table[S10_SYS_lwp_sema_wait] = 1; /* 147 */ 537 s10_emulation_table[S10_SYS_utimes] = 1; /* 154 */ 538 s10_emulation_table[SYS_lwp_create] = 1; /* 159 */ 539 s10_emulation_table[SYS_lwp_kill] = 1; /* 163 */ 540 s10_emulation_table[SYS_lwp_sigmask] = 1; /* 165 */ 541 #if defined(__amd64) 542 s10_emulation_table[SYS_lwp_private] = 1; /* 166 */ 543 #endif /* __amd64 */ 544 s10_emulation_table[S10_SYS_lwp_mutex_lock] = 1; /* 169 */ 545 s10_emulation_table[SYS_pwrite] = 1; /* 174 */ 546 s10_emulation_table[SYS_acl] = 1; /* 185 */ 547 s10_emulation_table[SYS_auditsys] = 1; /* 186 */ 548 s10_emulation_table[SYS_sigqueue] = 1; /* 190 */ 549 s10_emulation_table[SYS_facl] = 1; /* 200 */ 550 s10_emulation_table[SYS_signotify] = 1; /* 205 */ 551 s10_emulation_table[SYS_lwp_mutex_timedlock] = 1; /* 210 */ 552 s10_emulation_table[SYS_getdents64] = 1; /* 213 */ 553 s10_emulation_table[S10_SYS_stat64] = 1; /* 215 */ 554 s10_emulation_table[S10_SYS_lstat64] = 1; /* 216 */ 555 s10_emulation_table[S10_SYS_fstat64] = 1; /* 217 */ 556 s10_emulation_table[SYS_pwrite64] = 1; /* 223 */ 557 s10_emulation_table[S10_SYS_creat64] = 1; /* 224 */ 558 s10_emulation_table[S10_SYS_open64] = 1; /* 225 */ 559 s10_emulation_table[SYS_zone] = 1; /* 227 */ 560 s10_emulation_table[S10_SYS_so_socket] = 1; /* 230 */ 561 s10_emulation_table[S10_SYS_accept] = 1; /* 234 */ 562 s10_emulation_table[SYS_lwp_mutex_trylock] = 1; /* 251 */ 563 564 err = mod_install(&modlinkage); 565 if (err) { 566 cmn_err(CE_WARN, "Couldn't install brand module"); 567 kmem_free(s10_emulation_table, NSYSCALL); 568 } 569 570 return (err); 571 } 572 573 int 574 _info(struct modinfo *modinfop) 575 { 576 return (mod_info(&modlinkage, modinfop)); 577 } 578 579 int 580 _fini(void) 581 { 582 return (brand_solaris_fini(&s10_emulation_table, &modlinkage, 583 &s10_brand)); 584 } 585