1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2019 Joyent, Inc. 25 */ 26 27 #include <sys/param.h> 28 #include <sys/vmparam.h> 29 #include <sys/types.h> 30 #include <sys/sysmacros.h> 31 #include <sys/systm.h> 32 #include <sys/cmn_err.h> 33 #include <sys/signal.h> 34 #include <sys/stack.h> 35 #include <sys/cred.h> 36 #include <sys/user.h> 37 #include <sys/debug.h> 38 #include <sys/errno.h> 39 #include <sys/proc.h> 40 #include <sys/var.h> 41 #include <sys/inline.h> 42 #include <sys/syscall.h> 43 #include <sys/ucontext.h> 44 #include <sys/cpuvar.h> 45 #include <sys/siginfo.h> 46 #include <sys/trap.h> 47 #include <sys/machtrap.h> 48 #include <sys/sysinfo.h> 49 #include <sys/procfs.h> 50 #include <sys/prsystm.h> 51 #include <sys/fpu/fpusystm.h> 52 #include <sys/modctl.h> 53 #include <sys/aio_impl.h> 54 #include <c2/audit.h> 55 #include <sys/machpcb.h> 56 #include <sys/privregs.h> 57 #include <sys/copyops.h> 58 #include <sys/timer.h> 59 #include <sys/priv.h> 60 #include <sys/msacct.h> 61 62 int syscalltrace = 0; 63 #ifdef SYSCALLTRACE 64 static kmutex_t systrace_lock; /* syscall tracing lock */ 65 #endif /* SYSCALLTRACE */ 66 67 static krwlock_t *lock_syscall(struct sysent *, uint_t); 68 69 #ifdef _SYSCALL32_IMPL 70 static struct sysent * 71 lwp_getsysent(klwp_t *lwp) 72 { 73 if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) 74 return (sysent); 75 return (sysent32); 76 } 77 #define LWP_GETSYSENT(lwp) (lwp_getsysent(lwp)) 78 #else 79 #define LWP_GETSYSENT(lwp) (sysent) 80 #endif 81 82 /* 83 * Called to restore the lwp's register window just before 84 * returning to user level (only if the registers have been 85 * fetched or modified through /proc). 86 */ 87 /*ARGSUSED1*/ 88 void 89 xregrestore(klwp_t *lwp, int shared) 90 { 91 /* 92 * If locals+ins were modified by /proc copy them out. 93 * Also copy to the shared window, if necessary. 94 */ 95 if (lwp->lwp_pcb.pcb_xregstat == XREGMODIFIED) { 96 struct machpcb *mpcb = lwptompcb(lwp); 97 caddr_t sp = (caddr_t)lwptoregs(lwp)->r_sp; 98 99 size_t rwinsize; 100 caddr_t rwp; 101 int is64; 102 103 if (lwp_getdatamodel(lwp) == DATAMODEL_LP64) { 104 rwinsize = sizeof (struct rwindow); 105 rwp = sp + STACK_BIAS; 106 is64 = 1; 107 } else { 108 rwinsize = sizeof (struct rwindow32); 109 sp = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)sp; 110 rwp = sp; 111 is64 = 0; 112 } 113 114 if (is64) 115 (void) copyout_nowatch(&lwp->lwp_pcb.pcb_xregs, 116 rwp, rwinsize); 117 else { 118 struct rwindow32 rwindow32; 119 int watched; 120 121 watched = watch_disable_addr(rwp, rwinsize, S_WRITE); 122 rwindow_nto32(&lwp->lwp_pcb.pcb_xregs, &rwindow32); 123 (void) copyout(&rwindow32, rwp, rwinsize); 124 if (watched) 125 watch_enable_addr(rwp, rwinsize, S_WRITE); 126 } 127 128 /* also copy to the user return window */ 129 mpcb->mpcb_rsp[0] = sp; 130 mpcb->mpcb_rsp[1] = NULL; 131 bcopy(&lwp->lwp_pcb.pcb_xregs, &mpcb->mpcb_rwin[0], 132 sizeof (lwp->lwp_pcb.pcb_xregs)); 133 } 134 lwp->lwp_pcb.pcb_xregstat = XREGNONE; 135 } 136 137 138 /* 139 * Get the arguments to the current system call. 140 * lwp->lwp_ap normally points to the out regs in the reg structure. 141 * If the user is going to change the out registers and might want to 142 * get the args (for /proc tracing), it must copy the args elsewhere 143 * via save_syscall_args(). 144 */ 145 uint_t 146 get_syscall_args(klwp_t *lwp, long *argp, int *nargsp) 147 { 148 kthread_t *t = lwptot(lwp); 149 uint_t code = t->t_sysnum; 150 long mask; 151 long *ap; 152 int nargs; 153 154 if (lwptoproc(lwp)->p_model == DATAMODEL_ILP32) 155 mask = (uint32_t)0xffffffffU; 156 else 157 mask = 0xffffffffffffffff; 158 159 if (code != 0 && code < NSYSCALL) { 160 161 nargs = LWP_GETSYSENT(lwp)[code].sy_narg; 162 163 ASSERT(nargs <= MAXSYSARGS); 164 165 *nargsp = nargs; 166 ap = lwp->lwp_ap; 167 while (nargs-- > 0) 168 *argp++ = *ap++ & mask; 169 } else { 170 *nargsp = 0; 171 } 172 return (code); 173 } 174 175 #ifdef _SYSCALL32_IMPL 176 /* 177 * Get the arguments to the current 32-bit system call. 178 */ 179 uint_t 180 get_syscall32_args(klwp_t *lwp, int *argp, int *nargsp) 181 { 182 long args[MAXSYSARGS]; 183 uint_t i, code; 184 185 code = get_syscall_args(lwp, args, nargsp); 186 for (i = 0; i != *nargsp; i++) 187 *argp++ = (int)args[i]; 188 return (code); 189 } 190 #endif 191 192 /* 193 * Save the system call arguments in a safe place. 194 * lwp->lwp_ap normally points to the out regs in the reg structure. 195 * If the user is going to change the out registers, g1, or the stack, 196 * and might want to get the args (for /proc tracing), it must copy 197 * the args elsewhere via save_syscall_args(). 198 * 199 * This may be called from stop() even when we're not in a system call. 200 * Since there's no easy way to tell, this must be safe (not panic). 201 * If the copyins get data faults, return non-zero. 202 */ 203 int 204 save_syscall_args() 205 { 206 kthread_t *t = curthread; 207 klwp_t *lwp = ttolwp(t); 208 struct regs *rp = lwptoregs(lwp); 209 uint_t code = t->t_sysnum; 210 uint_t nargs; 211 int i; 212 caddr_t ua; 213 model_t datamodel; 214 215 if (lwp->lwp_argsaved || code == 0) 216 return (0); /* args already saved or not needed */ 217 218 if (code >= NSYSCALL) { 219 nargs = 0; /* illegal syscall */ 220 } else { 221 struct sysent *se = LWP_GETSYSENT(lwp); 222 struct sysent *callp = se + code; 223 224 nargs = callp->sy_narg; 225 if (LOADABLE_SYSCALL(callp) && nargs == 0) { 226 krwlock_t *module_lock; 227 228 /* 229 * Find out how many arguments the system 230 * call uses. 231 * 232 * We have the property that loaded syscalls 233 * never change the number of arguments they 234 * use after they've been loaded once. This 235 * allows us to stop for /proc tracing without 236 * holding the module lock. 237 * /proc is assured that sy_narg is valid. 238 */ 239 module_lock = lock_syscall(se, code); 240 nargs = callp->sy_narg; 241 rw_exit(module_lock); 242 } 243 } 244 245 /* 246 * Fetch the system call arguments. 247 */ 248 if (nargs == 0) 249 goto out; 250 251 252 ASSERT(nargs <= MAXSYSARGS); 253 254 if ((datamodel = lwp_getdatamodel(lwp)) == DATAMODEL_ILP32) { 255 256 if (rp->r_g1 == 0) { /* indirect syscall */ 257 258 lwp->lwp_arg[0] = (uint32_t)rp->r_o1; 259 lwp->lwp_arg[1] = (uint32_t)rp->r_o2; 260 lwp->lwp_arg[2] = (uint32_t)rp->r_o3; 261 lwp->lwp_arg[3] = (uint32_t)rp->r_o4; 262 lwp->lwp_arg[4] = (uint32_t)rp->r_o5; 263 if (nargs > 5) { 264 ua = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t) 265 (rp->r_sp + MINFRAME32); 266 for (i = 5; i < nargs; i++) { 267 uint32_t a; 268 if (fuword32(ua, &a) != 0) 269 return (-1); 270 lwp->lwp_arg[i] = a; 271 ua += sizeof (a); 272 } 273 } 274 } else { 275 lwp->lwp_arg[0] = (uint32_t)rp->r_o0; 276 lwp->lwp_arg[1] = (uint32_t)rp->r_o1; 277 lwp->lwp_arg[2] = (uint32_t)rp->r_o2; 278 lwp->lwp_arg[3] = (uint32_t)rp->r_o3; 279 lwp->lwp_arg[4] = (uint32_t)rp->r_o4; 280 lwp->lwp_arg[5] = (uint32_t)rp->r_o5; 281 if (nargs > 6) { 282 ua = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t) 283 (rp->r_sp + MINFRAME32); 284 for (i = 6; i < nargs; i++) { 285 uint32_t a; 286 if (fuword32(ua, &a) != 0) 287 return (-1); 288 lwp->lwp_arg[i] = a; 289 ua += sizeof (a); 290 } 291 } 292 } 293 } else { 294 ASSERT(datamodel == DATAMODEL_LP64); 295 lwp->lwp_arg[0] = rp->r_o0; 296 lwp->lwp_arg[1] = rp->r_o1; 297 lwp->lwp_arg[2] = rp->r_o2; 298 lwp->lwp_arg[3] = rp->r_o3; 299 lwp->lwp_arg[4] = rp->r_o4; 300 lwp->lwp_arg[5] = rp->r_o5; 301 if (nargs > 6) { 302 ua = (caddr_t)rp->r_sp + MINFRAME + STACK_BIAS; 303 for (i = 6; i < nargs; i++) { 304 unsigned long a; 305 if (fulword(ua, &a) != 0) 306 return (-1); 307 lwp->lwp_arg[i] = a; 308 ua += sizeof (a); 309 } 310 } 311 } 312 313 out: 314 lwp->lwp_ap = lwp->lwp_arg; 315 lwp->lwp_argsaved = 1; 316 t->t_post_sys = 1; /* so lwp_ap will be reset */ 317 return (0); 318 } 319 320 void 321 reset_syscall_args(void) 322 { 323 klwp_t *lwp = ttolwp(curthread); 324 325 lwp->lwp_ap = (long *)&lwptoregs(lwp)->r_o0; 326 lwp->lwp_argsaved = 0; 327 } 328 329 /* 330 * nonexistent system call-- signal lwp (may want to handle it) 331 * flag error if lwp won't see signal immediately 332 * This works for old or new calling sequence. 333 */ 334 int64_t 335 nosys(void) 336 { 337 tsignal(curthread, SIGSYS); 338 return ((int64_t)set_errno(ENOSYS)); 339 } 340 341 int 342 nosys32(void) 343 { 344 return (nosys()); 345 } 346 347 /* 348 * Perform pre-system-call processing, including stopping for tracing, 349 * auditing, microstate-accounting, etc. 350 * 351 * This routine is called only if the t_pre_sys flag is set. Any condition 352 * requiring pre-syscall handling must set the t_pre_sys flag. If the 353 * condition is persistent, this routine will repost t_pre_sys. 354 */ 355 int 356 pre_syscall(int arg0) 357 { 358 unsigned int code; 359 kthread_t *t = curthread; 360 proc_t *p = ttoproc(t); 361 klwp_t *lwp = ttolwp(t); 362 struct regs *rp = lwptoregs(lwp); 363 int repost; 364 365 t->t_pre_sys = repost = 0; /* clear pre-syscall processing flag */ 366 367 ASSERT(t->t_schedflag & TS_DONT_SWAP); 368 369 syscall_mstate(LMS_USER, LMS_SYSTEM); 370 371 /* 372 * The syscall arguments in the out registers should be pointed to 373 * by lwp_ap. If the args need to be copied so that the outs can 374 * be changed without losing the ability to get the args for /proc, 375 * they can be saved by save_syscall_args(), and lwp_ap will be 376 * restored by post_syscall(). 377 */ 378 ASSERT(lwp->lwp_ap == (long *)&rp->r_o0); 379 380 /* 381 * Make sure the thread is holding the latest credentials for the 382 * process. The credentials in the process right now apply to this 383 * thread for the entire system call. 384 */ 385 if (t->t_cred != p->p_cred) { 386 cred_t *oldcred = t->t_cred; 387 /* 388 * DTrace accesses t_cred in probe context. t_cred must 389 * always be either NULL, or point to a valid, allocated cred 390 * structure. 391 */ 392 t->t_cred = crgetcred(); 393 crfree(oldcred); 394 } 395 396 /* 397 * Undo special arrangements to single-step the lwp 398 * so that a debugger will see valid register contents. 399 * Also so that the pc is valid for syncfpu(). 400 * Also so that a syscall like exec() can be stepped. 401 */ 402 if (lwp->lwp_pcb.pcb_step != STEP_NONE) { 403 (void) prundostep(); 404 repost = 1; 405 } 406 407 /* 408 * Check for indirect system call in case we stop for tracing. 409 * Don't allow multiple indirection. 410 */ 411 code = t->t_sysnum; 412 if (code == 0 && arg0 != 0) { /* indirect syscall */ 413 code = arg0; 414 t->t_sysnum = arg0; 415 } 416 417 /* 418 * From the proc(5) manual page: 419 * When entry to a system call is being traced, the traced process 420 * stops after having begun the call to the system but before the 421 * system call arguments have been fetched from the process. 422 * If proc changes the args we must refetch them after starting. 423 */ 424 if (PTOU(p)->u_systrap) { 425 if (prismember(&PTOU(p)->u_entrymask, code)) { 426 /* 427 * Recheck stop condition, now that lock is held. 428 */ 429 mutex_enter(&p->p_lock); 430 if (PTOU(p)->u_systrap && 431 prismember(&PTOU(p)->u_entrymask, code)) { 432 stop(PR_SYSENTRY, code); 433 /* 434 * Must refetch args since they were 435 * possibly modified by /proc. Indicate 436 * that the valid copy is in the 437 * registers. 438 */ 439 lwp->lwp_argsaved = 0; 440 lwp->lwp_ap = (long *)&rp->r_o0; 441 } 442 mutex_exit(&p->p_lock); 443 } 444 repost = 1; 445 } 446 447 if (lwp->lwp_sysabort) { 448 /* 449 * lwp_sysabort may have been set via /proc while the process 450 * was stopped on PR_SYSENTRY. If so, abort the system call. 451 * Override any error from the copyin() of the arguments. 452 */ 453 lwp->lwp_sysabort = 0; 454 (void) set_errno(EINTR); /* sets post-sys processing */ 455 t->t_pre_sys = 1; /* repost anyway */ 456 return (1); /* don't do system call, return EINTR */ 457 } 458 459 /* begin auditing for this syscall */ 460 if (audit_active == C2AUDIT_LOADED) { 461 uint32_t auditing = au_zone_getstate(NULL); 462 463 if (auditing & AU_AUDIT_MASK) { 464 int error; 465 if (error = audit_start(T_SYSCALL, code, auditing, \ 466 0, lwp)) { 467 t->t_pre_sys = 1; /* repost anyway */ 468 lwp->lwp_error = 0; /* for old drivers */ 469 return (error); 470 } 471 repost = 1; 472 } 473 } 474 475 #ifdef SYSCALLTRACE 476 if (syscalltrace) { 477 int i; 478 long *ap; 479 char *cp; 480 char *sysname; 481 struct sysent *callp; 482 483 if (code >= NSYSCALL) 484 callp = &nosys_ent; /* nosys has no args */ 485 else 486 callp = LWP_GETSYSENT(lwp) + code; 487 (void) save_syscall_args(); 488 mutex_enter(&systrace_lock); 489 printf("%d: ", p->p_pid); 490 if (code >= NSYSCALL) 491 printf("0x%x", code); 492 else { 493 sysname = mod_getsysname(code); 494 printf("%s[0x%x]", sysname == NULL ? "NULL" : 495 sysname, code); 496 } 497 cp = "("; 498 for (i = 0, ap = lwp->lwp_ap; i < callp->sy_narg; i++, ap++) { 499 printf("%s%lx", cp, *ap); 500 cp = ", "; 501 } 502 if (i) 503 printf(")"); 504 printf(" %s id=0x%p\n", PTOU(p)->u_comm, curthread); 505 mutex_exit(&systrace_lock); 506 } 507 #endif /* SYSCALLTRACE */ 508 509 /* 510 * If there was a continuing reason for pre-syscall processing, 511 * set the t_pre_sys flag for the next system call. 512 */ 513 if (repost) 514 t->t_pre_sys = 1; 515 lwp->lwp_error = 0; /* for old drivers */ 516 lwp->lwp_badpriv = PRIV_NONE; /* for privilege tracing */ 517 return (0); 518 } 519 520 /* 521 * Post-syscall processing. Perform abnormal system call completion 522 * actions such as /proc tracing, profiling, signals, preemption, etc. 523 * 524 * This routine is called only if t_post_sys, t_sig_check, or t_astflag is set. 525 * Any condition requiring pre-syscall handling must set one of these. 526 * If the condition is persistent, this routine will repost t_post_sys. 527 */ 528 void 529 post_syscall(long rval1, long rval2) 530 { 531 kthread_t *t = curthread; 532 proc_t *p = curproc; 533 klwp_t *lwp = ttolwp(t); 534 struct regs *rp = lwptoregs(lwp); 535 uint_t error; 536 int code = t->t_sysnum; 537 int repost = 0; 538 int proc_stop = 0; /* non-zero if stopping for /proc */ 539 int sigprof = 0; /* non-zero if sending SIGPROF */ 540 541 t->t_post_sys = 0; 542 543 error = lwp->lwp_errno; 544 545 /* 546 * Code can be zero if this is a new LWP returning after a forkall(), 547 * other than the one which matches the one in the parent which called 548 * forkall(). In these LWPs, skip most of post-syscall activity. 549 */ 550 if (code == 0) 551 goto sig_check; 552 553 /* put out audit record for this syscall */ 554 if (AU_AUDITING()) { 555 rval_t rval; /* fix audit_finish() someday */ 556 557 /* XX64 -- truncation of 64-bit return values? */ 558 rval.r_val1 = (int)rval1; 559 rval.r_val2 = (int)rval2; 560 audit_finish(T_SYSCALL, code, error, &rval); 561 repost = 1; 562 } 563 564 if (curthread->t_pdmsg != NULL) { 565 char *m = curthread->t_pdmsg; 566 567 uprintf("%s", m); 568 kmem_free(m, strlen(m) + 1); 569 curthread->t_pdmsg = NULL; 570 } 571 572 /* 573 * If we're going to stop for /proc tracing, set the flag and 574 * save the arguments so that the return values don't smash them. 575 */ 576 if (PTOU(p)->u_systrap) { 577 if (prismember(&PTOU(p)->u_exitmask, code)) { 578 proc_stop = 1; 579 (void) save_syscall_args(); 580 } 581 repost = 1; 582 } 583 584 /* 585 * Similarly check to see if SIGPROF might be sent. 586 */ 587 if (curthread->t_rprof != NULL && 588 curthread->t_rprof->rp_anystate != 0) { 589 (void) save_syscall_args(); 590 sigprof = 1; 591 } 592 593 if (lwp->lwp_eosys == NORMALRETURN) { 594 if (error == 0) { 595 #ifdef SYSCALLTRACE 596 if (syscalltrace) { 597 mutex_enter(&systrace_lock); 598 printf( 599 "%d: r_val1=0x%lx, r_val2=0x%lx, id 0x%p\n", 600 p->p_pid, rval1, rval2, curthread); 601 mutex_exit(&systrace_lock); 602 } 603 #endif /* SYSCALLTRACE */ 604 rp->r_tstate &= ~TSTATE_IC; 605 rp->r_o0 = rval1; 606 rp->r_o1 = rval2; 607 } else { 608 int sig; 609 610 #ifdef SYSCALLTRACE 611 if (syscalltrace) { 612 mutex_enter(&systrace_lock); 613 printf("%d: error=%d, id 0x%p\n", 614 p->p_pid, error, curthread); 615 mutex_exit(&systrace_lock); 616 } 617 #endif /* SYSCALLTRACE */ 618 if (error == EINTR && t->t_activefd.a_stale) 619 error = EBADF; 620 if (error == EINTR && 621 (sig = lwp->lwp_cursig) != 0 && 622 sigismember(&PTOU(p)->u_sigrestart, sig) && 623 PTOU(p)->u_signal[sig - 1] != SIG_DFL && 624 PTOU(p)->u_signal[sig - 1] != SIG_IGN) 625 error = ERESTART; 626 rp->r_o0 = error; 627 rp->r_tstate |= TSTATE_IC; 628 } 629 /* 630 * The default action is to redo the trap instruction. 631 * We increment the pc and npc past it for NORMALRETURN. 632 * JUSTRETURN has set up a new pc and npc already. 633 * If we are a cloned thread of forkall(), don't 634 * adjust here because we have already inherited 635 * the adjusted values from our clone. 636 */ 637 if (!(t->t_flag & T_FORKALL)) { 638 rp->r_pc = rp->r_npc; 639 rp->r_npc += 4; 640 } 641 } 642 643 /* 644 * From the proc(5) manual page: 645 * When exit from a system call is being traced, the traced process 646 * stops on completion of the system call just prior to checking for 647 * signals and returning to user level. At this point all return 648 * values have been stored into the traced process's saved registers. 649 */ 650 if (proc_stop) { 651 mutex_enter(&p->p_lock); 652 if (PTOU(p)->u_systrap && 653 prismember(&PTOU(p)->u_exitmask, code)) 654 stop(PR_SYSEXIT, code); 655 mutex_exit(&p->p_lock); 656 } 657 658 /* 659 * If we are the parent returning from a successful 660 * vfork, wait for the child to exec or exit. 661 * This code must be here and not in the bowels of the system 662 * so that /proc can intercept exit from vfork in a timely way. 663 */ 664 if (t->t_flag & T_VFPARENT) { 665 ASSERT(code == SYS_vfork || code == SYS_forksys); 666 ASSERT(rp->r_o1 == 0 && error == 0); 667 vfwait((pid_t)rval1); 668 t->t_flag &= ~T_VFPARENT; 669 } 670 671 /* 672 * If profiling is active, bill the current PC in user-land 673 * and keep reposting until profiling is disabled. 674 */ 675 if (p->p_prof.pr_scale) { 676 if (lwp->lwp_oweupc) 677 profil_tick(rp->r_pc); 678 repost = 1; 679 } 680 681 sig_check: 682 /* 683 * Reset flag for next time. 684 * We must do this after stopping on PR_SYSEXIT 685 * because /proc uses the information in lwp_eosys. 686 */ 687 lwp->lwp_eosys = NORMALRETURN; 688 clear_stale_fd(); 689 t->t_flag &= ~T_FORKALL; 690 691 if (t->t_astflag | t->t_sig_check) { 692 /* 693 * Turn off the AST flag before checking all the conditions that 694 * may have caused an AST. This flag is on whenever a signal or 695 * unusual condition should be handled after the next trap or 696 * syscall. 697 */ 698 astoff(t); 699 t->t_sig_check = 0; 700 701 /* 702 * The following check is legal for the following reasons: 703 * 1) The thread we are checking, is ourselves, so there is 704 * no way the proc can go away. 705 * 2) The only time we need to be protected by the 706 * lock is if the binding is changed. 707 * 708 * Note we will still take the lock and check the binding 709 * if the condition was true without the lock held. This 710 * prevents lock contention among threads owned by the 711 * same proc. 712 */ 713 714 if (curthread->t_proc_flag & TP_CHANGEBIND) { 715 mutex_enter(&p->p_lock); 716 if (curthread->t_proc_flag & TP_CHANGEBIND) { 717 timer_lwpbind(); 718 curthread->t_proc_flag &= ~TP_CHANGEBIND; 719 } 720 mutex_exit(&p->p_lock); 721 } 722 723 /* 724 * for kaio requests on the special kaio poll queue, 725 * copyout their results to user memory. 726 */ 727 if (p->p_aio) 728 aio_cleanup(0); 729 730 /* 731 * If this LWP was asked to hold, call holdlwp(), which will 732 * stop. holdlwps() sets this up and calls pokelwps() which 733 * sets the AST flag. 734 * 735 * Also check TP_EXITLWP, since this is used by fresh new LWPs 736 * through lwp_rtt(). That flag is set if the lwp_create(2) 737 * syscall failed after creating the LWP. 738 */ 739 if (ISHOLD(p) || (t->t_proc_flag & TP_EXITLWP)) 740 holdlwp(); 741 742 /* 743 * All code that sets signals and makes ISSIG_PENDING 744 * evaluate true must set t_sig_check afterwards. 745 */ 746 if (ISSIG_PENDING(t, lwp, p)) { 747 if (issig(FORREAL)) 748 psig(); 749 t->t_sig_check = 1; /* recheck next time */ 750 } 751 752 if (sigprof) { 753 int nargs = (code > 0 && code < NSYSCALL)? 754 LWP_GETSYSENT(lwp)[code].sy_narg : 0; 755 realsigprof(code, nargs, error); 756 t->t_sig_check = 1; /* recheck next time */ 757 } 758 759 /* 760 * If a performance counter overflow interrupt was 761 * delivered *during* the syscall, then re-enable the 762 * AST so that we take a trip through trap() to cause 763 * the SIGEMT to be delivered. 764 */ 765 if (lwp->lwp_pcb.pcb_flags & CPC_OVERFLOW) 766 aston(t); 767 768 /* 769 * If an asynchronous hardware error is pending, turn AST flag 770 * back on. AST will be checked again before we return to user 771 * mode and we'll come back through trap() to handle the error. 772 */ 773 if (lwp->lwp_pcb.pcb_flags & ASYNC_HWERR) 774 aston(t); 775 } 776 777 /* 778 * Restore register window if a debugger modified it. 779 * Set up to perform a single-step if a debugger requested it. 780 */ 781 if (lwp->lwp_pcb.pcb_xregstat != XREGNONE) 782 xregrestore(lwp, 1); 783 784 lwp->lwp_errno = 0; /* clear error for next time */ 785 786 /* 787 * Set state to LWP_USER here so preempt won't give us a kernel 788 * priority if it occurs after this point. Call CL_TRAPRET() to 789 * restore the user-level priority. 790 * 791 * It is important that no locks (other than spinlocks) be entered 792 * after this point before returning to user mode (unless lwp_state 793 * is set back to LWP_SYS). 794 * 795 * Sampled times past this point are charged to the user. 796 */ 797 lwp->lwp_state = LWP_USER; 798 799 if (t->t_trapret) { 800 t->t_trapret = 0; 801 thread_lock(t); 802 CL_TRAPRET(t); 803 thread_unlock(t); 804 } 805 if (CPU->cpu_runrun || t->t_schedflag & TS_ANYWAITQ) 806 preempt(); 807 prunstop(); 808 809 /* 810 * t_post_sys will be set if pcb_step is active. 811 */ 812 if (lwp->lwp_pcb.pcb_step != STEP_NONE) { 813 prdostep(); 814 repost = 1; 815 } 816 817 t->t_sysnum = 0; /* no longer in a system call */ 818 819 /* 820 * In case the args were copied to the lwp, reset the 821 * pointer so the next syscall will have the right lwp_ap pointer. 822 */ 823 lwp->lwp_ap = (long *)&rp->r_o0; 824 lwp->lwp_argsaved = 0; 825 826 /* 827 * If there was a continuing reason for post-syscall processing, 828 * set the t_post_sys flag for the next system call. 829 */ 830 if (repost) 831 t->t_post_sys = 1; 832 833 /* 834 * If there is a ustack registered for this lwp, and the stack rlimit 835 * has been altered, read in the ustack. If the saved stack rlimit 836 * matches the bounds of the ustack, update the ustack to reflect 837 * the new rlimit. If the new stack rlimit is RLIM_INFINITY, disable 838 * stack checking by setting the size to 0. 839 */ 840 if (lwp->lwp_ustack != 0 && lwp->lwp_old_stk_ctl != 0) { 841 rlim64_t new_size; 842 model_t model; 843 caddr_t top; 844 struct rlimit64 rl; 845 846 mutex_enter(&p->p_lock); 847 new_size = p->p_stk_ctl; 848 model = p->p_model; 849 top = p->p_usrstack; 850 (void) rctl_rlimit_get(rctlproc_legacy[RLIMIT_STACK], p, &rl); 851 mutex_exit(&p->p_lock); 852 853 if (rl.rlim_cur == RLIM64_INFINITY) 854 new_size = 0; 855 856 if (model == DATAMODEL_NATIVE) { 857 stack_t stk; 858 859 if (copyin((stack_t *)lwp->lwp_ustack, &stk, 860 sizeof (stack_t)) == 0 && 861 (stk.ss_size == lwp->lwp_old_stk_ctl || 862 stk.ss_size == 0) && 863 stk.ss_sp == top - stk.ss_size) { 864 stk.ss_sp = (void *)((uintptr_t)stk.ss_sp + 865 stk.ss_size - new_size); 866 stk.ss_size = new_size; 867 868 (void) copyout(&stk, 869 (stack_t *)lwp->lwp_ustack, 870 sizeof (stack_t)); 871 } 872 } else { 873 stack32_t stk32; 874 875 if (copyin((stack32_t *)lwp->lwp_ustack, &stk32, 876 sizeof (stack32_t)) == 0 && 877 (stk32.ss_size == lwp->lwp_old_stk_ctl || 878 stk32.ss_size == 0) && 879 stk32.ss_sp == 880 (caddr32_t)(uintptr_t)(top - stk32.ss_size)) { 881 stk32.ss_sp += stk32.ss_size - new_size; 882 stk32.ss_size = new_size; 883 884 (void) copyout(&stk32, 885 (stack32_t *)lwp->lwp_ustack, 886 sizeof (stack32_t)); 887 } 888 } 889 890 lwp->lwp_old_stk_ctl = 0; 891 } 892 893 syscall_mstate(LMS_SYSTEM, LMS_USER); 894 } 895 896 /* 897 * Call a system call which takes a pointer to the user args struct and 898 * a pointer to the return values. This is a bit slower than the standard 899 * C arg-passing method in some cases. 900 */ 901 int64_t 902 syscall_ap() 903 { 904 uint_t error; 905 struct sysent *callp; 906 rval_t rval; 907 klwp_t *lwp = ttolwp(curthread); 908 struct regs *rp = lwptoregs(lwp); 909 910 callp = LWP_GETSYSENT(lwp) + curthread->t_sysnum; 911 912 /* 913 * If the arguments don't fit in registers %o0 - o5, make sure they 914 * have been copied to the lwp_arg array. 915 */ 916 if (callp->sy_narg > 6 && save_syscall_args()) 917 return ((int64_t)set_errno(EFAULT)); 918 919 rval.r_val1 = 0; 920 rval.r_val2 = (int)rp->r_o1; 921 lwp->lwp_error = 0; /* for old drivers */ 922 error = (*(callp->sy_call))(lwp->lwp_ap, &rval); 923 if (error) 924 return ((int64_t)set_errno(error)); 925 return (rval.r_vals); 926 } 927 928 /* 929 * Load system call module. 930 * Returns with pointer to held read lock for module. 931 */ 932 static krwlock_t * 933 lock_syscall(struct sysent *table, uint_t code) 934 { 935 krwlock_t *module_lock; 936 struct modctl *modp; 937 int id; 938 struct sysent *callp; 939 940 module_lock = table[code].sy_lock; 941 callp = &table[code]; 942 943 /* 944 * Optimization to only call modload if we don't have a loaded 945 * syscall. 946 */ 947 rw_enter(module_lock, RW_READER); 948 if (LOADED_SYSCALL(callp)) 949 return (module_lock); 950 rw_exit(module_lock); 951 952 for (;;) { 953 if ((id = modload("sys", syscallnames[code])) == -1) 954 break; 955 956 /* 957 * If we loaded successfully at least once, the modctl 958 * will still be valid, so we try to grab it by filename. 959 * If this call fails, it's because the mod_filename 960 * was changed after the call to modload() (mod_hold_by_name() 961 * is the likely culprit). We can safely just take 962 * another lap if this is the case; the modload() will 963 * change the mod_filename back to one by which we can 964 * find the modctl. 965 */ 966 modp = mod_find_by_filename("sys", syscallnames[code]); 967 968 if (modp == NULL) 969 continue; 970 971 mutex_enter(&mod_lock); 972 973 if (!modp->mod_installed) { 974 mutex_exit(&mod_lock); 975 continue; 976 } 977 break; 978 } 979 980 rw_enter(module_lock, RW_READER); 981 982 if (id != -1) 983 mutex_exit(&mod_lock); 984 985 return (module_lock); 986 } 987 988 /* 989 * Loadable syscall support. 990 * If needed, load the module, then reserve it by holding a read 991 * lock for the duration of the call. 992 * Later, if the syscall is not unloadable, it could patch the vector. 993 */ 994 /*ARGSUSED*/ 995 int64_t 996 loadable_syscall( 997 long a0, long a1, long a2, long a3, 998 long a4, long a5, long a6, long a7) 999 { 1000 int64_t rval; 1001 struct sysent *callp; 1002 struct sysent *se = LWP_GETSYSENT(ttolwp(curthread)); 1003 krwlock_t *module_lock; 1004 int code; 1005 1006 code = curthread->t_sysnum; 1007 callp = se + code; 1008 1009 /* 1010 * Try to autoload the system call if necessary. 1011 */ 1012 module_lock = lock_syscall(se, code); 1013 1014 /* 1015 * we've locked either the loaded syscall or nosys 1016 */ 1017 if (callp->sy_flags & SE_ARGC) { 1018 int64_t (*sy_call)(); 1019 1020 sy_call = (int64_t (*)())callp->sy_call; 1021 rval = (*sy_call)(a0, a1, a2, a3, a4, a5); 1022 } else { 1023 rval = syscall_ap(); 1024 } 1025 1026 rw_exit(module_lock); 1027 return (rval); 1028 } 1029 1030 /* 1031 * Handle indirect system calls. 1032 * This interface should be deprecated. The library can handle 1033 * this more efficiently, but keep this implementation for old binaries. 1034 * 1035 * XX64 Needs some work. 1036 */ 1037 int64_t 1038 indir(int code, long a0, long a1, long a2, long a3, long a4) 1039 { 1040 klwp_t *lwp = ttolwp(curthread); 1041 struct sysent *callp; 1042 1043 if (code <= 0 || code >= NSYSCALL) 1044 return (nosys()); 1045 1046 ASSERT(lwp->lwp_ap != NULL); 1047 1048 curthread->t_sysnum = code; 1049 callp = LWP_GETSYSENT(lwp) + code; 1050 1051 /* 1052 * Handle argument setup, unless already done in pre_syscall(). 1053 */ 1054 if (callp->sy_narg > 5) { 1055 if (save_syscall_args()) /* move args to LWP array */ 1056 return ((int64_t)set_errno(EFAULT)); 1057 } else if (!lwp->lwp_argsaved) { 1058 long *ap; 1059 1060 ap = lwp->lwp_ap; /* args haven't been saved */ 1061 lwp->lwp_ap = ap + 1; /* advance arg pointer */ 1062 curthread->t_post_sys = 1; /* so lwp_ap will be reset */ 1063 } 1064 return ((*callp->sy_callc)(a0, a1, a2, a3, a4, lwp->lwp_arg[5])); 1065 } 1066 1067 /* 1068 * set_errno - set an error return from the current system call. 1069 * This could be a macro. 1070 * This returns the value it is passed, so that the caller can 1071 * use tail-recursion-elimination and do return (set_errno(ERRNO)); 1072 */ 1073 uint_t 1074 set_errno(uint_t error) 1075 { 1076 ASSERT(error != 0); /* must not be used to clear errno */ 1077 1078 curthread->t_post_sys = 1; /* have post_syscall do error return */ 1079 return (ttolwp(curthread)->lwp_errno = error); 1080 } 1081 1082 /* 1083 * set_proc_pre_sys - Set pre-syscall processing for entire process. 1084 */ 1085 void 1086 set_proc_pre_sys(proc_t *p) 1087 { 1088 kthread_t *t; 1089 kthread_t *first; 1090 1091 ASSERT(MUTEX_HELD(&p->p_lock)); 1092 1093 t = first = p->p_tlist; 1094 do { 1095 t->t_pre_sys = 1; 1096 } while ((t = t->t_forw) != first); 1097 } 1098 1099 /* 1100 * set_proc_post_sys - Set post-syscall processing for entire process. 1101 */ 1102 void 1103 set_proc_post_sys(proc_t *p) 1104 { 1105 kthread_t *t; 1106 kthread_t *first; 1107 1108 ASSERT(MUTEX_HELD(&p->p_lock)); 1109 1110 t = first = p->p_tlist; 1111 do { 1112 t->t_post_sys = 1; 1113 } while ((t = t->t_forw) != first); 1114 } 1115 1116 /* 1117 * set_proc_sys - Set pre- and post-syscall processing for entire process. 1118 */ 1119 void 1120 set_proc_sys(proc_t *p) 1121 { 1122 kthread_t *t; 1123 kthread_t *first; 1124 1125 ASSERT(MUTEX_HELD(&p->p_lock)); 1126 1127 t = first = p->p_tlist; 1128 do { 1129 t->t_pre_sys = 1; 1130 t->t_post_sys = 1; 1131 } while ((t = t->t_forw) != first); 1132 } 1133 1134 /* 1135 * set_all_proc_sys - set pre- and post-syscall processing flags for all 1136 * user processes. 1137 * 1138 * This is needed when auditing, tracing, or other facilities which affect 1139 * all processes are turned on. 1140 */ 1141 void 1142 set_all_proc_sys() 1143 { 1144 kthread_t *t; 1145 kthread_t *first; 1146 1147 mutex_enter(&pidlock); 1148 t = first = curthread; 1149 do { 1150 t->t_pre_sys = 1; 1151 t->t_post_sys = 1; 1152 } while ((t = t->t_next) != first); 1153 mutex_exit(&pidlock); 1154 } 1155 1156 /* 1157 * set_all_zone_usr_proc_sys - set pre- and post-syscall processing flags for 1158 * all user processes running in the zone of the current process 1159 * 1160 * This is needed when auditing is turned on. 1161 */ 1162 void 1163 set_all_zone_usr_proc_sys(zoneid_t zoneid) 1164 { 1165 proc_t *p; 1166 kthread_t *t; 1167 1168 mutex_enter(&pidlock); 1169 for (p = practive; p != NULL; p = p->p_next) { 1170 /* skip kernel processes */ 1171 if (p->p_exec == NULLVP || p->p_as == &kas || 1172 p->p_stat == SIDL || p->p_stat == SZOMB || 1173 (p->p_flag & (SSYS | SEXITING | SEXITLWPS))) 1174 continue; 1175 /* 1176 * Only processes in the given zone (eventually in 1177 * all zones) are taken into account 1178 */ 1179 if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) { 1180 mutex_enter(&p->p_lock); 1181 if ((t = p->p_tlist) == NULL) { 1182 mutex_exit(&p->p_lock); 1183 continue; 1184 } 1185 /* 1186 * Set pre- and post-syscall processing flags 1187 * for all threads of the process 1188 */ 1189 do { 1190 t->t_pre_sys = 1; 1191 t->t_post_sys = 1; 1192 } while (p->p_tlist != (t = t->t_forw)); 1193 mutex_exit(&p->p_lock); 1194 } 1195 } 1196 mutex_exit(&pidlock); 1197 } 1198 1199 /* 1200 * set_proc_ast - Set asynchronous service trap (AST) flag for all 1201 * threads in process. 1202 */ 1203 void 1204 set_proc_ast(proc_t *p) 1205 { 1206 kthread_t *t; 1207 kthread_t *first; 1208 1209 ASSERT(MUTEX_HELD(&p->p_lock)); 1210 1211 t = first = p->p_tlist; 1212 do { 1213 aston(t); 1214 } while ((t = t->t_forw) != first); 1215 } 1216