1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 #include <sys/param.h>
27 #include <sys/vmparam.h>
28 #include <sys/types.h>
29 #include <sys/sysmacros.h>
30 #include <sys/systm.h>
31 #include <sys/cmn_err.h>
32 #include <sys/signal.h>
33 #include <sys/stack.h>
34 #include <sys/cred.h>
35 #include <sys/user.h>
36 #include <sys/debug.h>
37 #include <sys/errno.h>
38 #include <sys/proc.h>
39 #include <sys/var.h>
40 #include <sys/inline.h>
41 #include <sys/syscall.h>
42 #include <sys/ucontext.h>
43 #include <sys/cpuvar.h>
44 #include <sys/siginfo.h>
45 #include <sys/trap.h>
46 #include <sys/machtrap.h>
47 #include <sys/sysinfo.h>
48 #include <sys/procfs.h>
49 #include <sys/prsystm.h>
50 #include <sys/fpu/fpusystm.h>
51 #include <sys/modctl.h>
52 #include <sys/aio_impl.h>
53 #include <c2/audit.h>
54 #include <sys/tnf.h>
55 #include <sys/tnf_probe.h>
56 #include <sys/machpcb.h>
57 #include <sys/privregs.h>
58 #include <sys/copyops.h>
59 #include <sys/timer.h>
60 #include <sys/priv.h>
61 #include <sys/msacct.h>
62
/*
 * Non-zero turns on the printf-based system call tracing in pre_syscall()
 * and post_syscall().  That tracing code is only compiled in when
 * SYSCALLTRACE is defined.
 */
int syscalltrace = 0;
#ifdef SYSCALLTRACE
/* Held around each group of trace printf()s emitted for one event. */
static kmutex_t	systrace_lock;		/* syscall tracing lock */
#endif /* SYSCALLTRACE */

/* Forward declaration; the definition appears later in this file. */
static krwlock_t *lock_syscall(struct sysent *, uint_t);
69
#ifdef _SYSCALL32_IMPL
/*
 * Select the system call table matching the lwp's data model:
 * sysent for DATAMODEL_NATIVE lwps, sysent32 for all others.
 */
static struct sysent *
lwp_getsysent(klwp_t *lwp)
{
	return ((lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) ?
	    sysent : sysent32);
}
#define	LWP_GETSYSENT(lwp)	(lwp_getsysent(lwp))
#else
/* Only one table exists when there is no 32-bit syscall implementation. */
#define	LWP_GETSYSENT(lwp)	(sysent)
#endif
82
83 /*
84 * Called to restore the lwp's register window just before
85 * returning to user level (only if the registers have been
86 * fetched or modified through /proc).
87 */
88 /*ARGSUSED1*/
89 void
xregrestore(klwp_t * lwp,int shared)90 xregrestore(klwp_t *lwp, int shared)
91 {
92 /*
93 * If locals+ins were modified by /proc copy them out.
94 * Also copy to the shared window, if necessary.
95 */
96 if (lwp->lwp_pcb.pcb_xregstat == XREGMODIFIED) {
97 struct machpcb *mpcb = lwptompcb(lwp);
98 caddr_t sp = (caddr_t)lwptoregs(lwp)->r_sp;
99
100 size_t rwinsize;
101 caddr_t rwp;
102 int is64;
103
104 if (lwp_getdatamodel(lwp) == DATAMODEL_LP64) {
105 rwinsize = sizeof (struct rwindow);
106 rwp = sp + STACK_BIAS;
107 is64 = 1;
108 } else {
109 rwinsize = sizeof (struct rwindow32);
110 sp = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)sp;
111 rwp = sp;
112 is64 = 0;
113 }
114
115 if (is64)
116 (void) copyout_nowatch(&lwp->lwp_pcb.pcb_xregs,
117 rwp, rwinsize);
118 else {
119 struct rwindow32 rwindow32;
120 int watched;
121
122 watched = watch_disable_addr(rwp, rwinsize, S_WRITE);
123 rwindow_nto32(&lwp->lwp_pcb.pcb_xregs, &rwindow32);
124 (void) copyout(&rwindow32, rwp, rwinsize);
125 if (watched)
126 watch_enable_addr(rwp, rwinsize, S_WRITE);
127 }
128
129 /* also copy to the user return window */
130 mpcb->mpcb_rsp[0] = sp;
131 mpcb->mpcb_rsp[1] = NULL;
132 bcopy(&lwp->lwp_pcb.pcb_xregs, &mpcb->mpcb_rwin[0],
133 sizeof (lwp->lwp_pcb.pcb_xregs));
134 }
135 lwp->lwp_pcb.pcb_xregstat = XREGNONE;
136 }
137
138
139 /*
140 * Get the arguments to the current system call.
141 * lwp->lwp_ap normally points to the out regs in the reg structure.
142 * If the user is going to change the out registers and might want to
143 * get the args (for /proc tracing), it must copy the args elsewhere
144 * via save_syscall_args().
145 */
146 uint_t
get_syscall_args(klwp_t * lwp,long * argp,int * nargsp)147 get_syscall_args(klwp_t *lwp, long *argp, int *nargsp)
148 {
149 kthread_t *t = lwptot(lwp);
150 uint_t code = t->t_sysnum;
151 long mask;
152 long *ap;
153 int nargs;
154
155 if (lwptoproc(lwp)->p_model == DATAMODEL_ILP32)
156 mask = (uint32_t)0xffffffffU;
157 else
158 mask = 0xffffffffffffffff;
159
160 if (code != 0 && code < NSYSCALL) {
161
162 nargs = LWP_GETSYSENT(lwp)[code].sy_narg;
163
164 ASSERT(nargs <= MAXSYSARGS);
165
166 *nargsp = nargs;
167 ap = lwp->lwp_ap;
168 while (nargs-- > 0)
169 *argp++ = *ap++ & mask;
170 } else {
171 *nargsp = 0;
172 }
173 return (code);
174 }
175
#ifdef _SYSCALL32_IMPL
/*
 * 32-bit flavour of get_syscall_args(): fetch the arguments as longs and
 * hand each one back narrowed to an int.
 *
 *	lwp	- lwp to inspect
 *	argp	- receives the arguments as ints
 *	nargsp	- receives the argument count
 *
 * Returns the system call number.
 */
uint_t
get_syscall32_args(klwp_t *lwp, int *argp, int *nargsp)
{
	long wide[MAXSYSARGS];
	uint_t sysnum;
	uint_t idx;

	sysnum = get_syscall_args(lwp, wide, nargsp);

	idx = 0;
	while (idx != *nargsp) {
		argp[idx] = (int)wide[idx];
		idx++;
	}

	return (sysnum);
}
#endif
192
193 /*
194 * Save the system call arguments in a safe place.
195 * lwp->lwp_ap normally points to the out regs in the reg structure.
196 * If the user is going to change the out registers, g1, or the stack,
197 * and might want to get the args (for /proc tracing), it must copy
198 * the args elsewhere via save_syscall_args().
199 *
200 * This may be called from stop() even when we're not in a system call.
201 * Since there's no easy way to tell, this must be safe (not panic).
202 * If the copyins get data faults, return non-zero.
203 */
204 int
save_syscall_args()205 save_syscall_args()
206 {
207 kthread_t *t = curthread;
208 klwp_t *lwp = ttolwp(t);
209 struct regs *rp = lwptoregs(lwp);
210 uint_t code = t->t_sysnum;
211 uint_t nargs;
212 int i;
213 caddr_t ua;
214 model_t datamodel;
215
216 if (lwp->lwp_argsaved || code == 0)
217 return (0); /* args already saved or not needed */
218
219 if (code >= NSYSCALL) {
220 nargs = 0; /* illegal syscall */
221 } else {
222 struct sysent *se = LWP_GETSYSENT(lwp);
223 struct sysent *callp = se + code;
224
225 nargs = callp->sy_narg;
226 if (LOADABLE_SYSCALL(callp) && nargs == 0) {
227 krwlock_t *module_lock;
228
229 /*
230 * Find out how many arguments the system
231 * call uses.
232 *
233 * We have the property that loaded syscalls
234 * never change the number of arguments they
235 * use after they've been loaded once. This
236 * allows us to stop for /proc tracing without
237 * holding the module lock.
238 * /proc is assured that sy_narg is valid.
239 */
240 module_lock = lock_syscall(se, code);
241 nargs = callp->sy_narg;
242 rw_exit(module_lock);
243 }
244 }
245
246 /*
247 * Fetch the system call arguments.
248 */
249 if (nargs == 0)
250 goto out;
251
252
253 ASSERT(nargs <= MAXSYSARGS);
254
255 if ((datamodel = lwp_getdatamodel(lwp)) == DATAMODEL_ILP32) {
256
257 if (rp->r_g1 == 0) { /* indirect syscall */
258
259 lwp->lwp_arg[0] = (uint32_t)rp->r_o1;
260 lwp->lwp_arg[1] = (uint32_t)rp->r_o2;
261 lwp->lwp_arg[2] = (uint32_t)rp->r_o3;
262 lwp->lwp_arg[3] = (uint32_t)rp->r_o4;
263 lwp->lwp_arg[4] = (uint32_t)rp->r_o5;
264 if (nargs > 5) {
265 ua = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)
266 (rp->r_sp + MINFRAME32);
267 for (i = 5; i < nargs; i++) {
268 uint32_t a;
269 if (fuword32(ua, &a) != 0)
270 return (-1);
271 lwp->lwp_arg[i] = a;
272 ua += sizeof (a);
273 }
274 }
275 } else {
276 lwp->lwp_arg[0] = (uint32_t)rp->r_o0;
277 lwp->lwp_arg[1] = (uint32_t)rp->r_o1;
278 lwp->lwp_arg[2] = (uint32_t)rp->r_o2;
279 lwp->lwp_arg[3] = (uint32_t)rp->r_o3;
280 lwp->lwp_arg[4] = (uint32_t)rp->r_o4;
281 lwp->lwp_arg[5] = (uint32_t)rp->r_o5;
282 if (nargs > 6) {
283 ua = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)
284 (rp->r_sp + MINFRAME32);
285 for (i = 6; i < nargs; i++) {
286 uint32_t a;
287 if (fuword32(ua, &a) != 0)
288 return (-1);
289 lwp->lwp_arg[i] = a;
290 ua += sizeof (a);
291 }
292 }
293 }
294 } else {
295 ASSERT(datamodel == DATAMODEL_LP64);
296 lwp->lwp_arg[0] = rp->r_o0;
297 lwp->lwp_arg[1] = rp->r_o1;
298 lwp->lwp_arg[2] = rp->r_o2;
299 lwp->lwp_arg[3] = rp->r_o3;
300 lwp->lwp_arg[4] = rp->r_o4;
301 lwp->lwp_arg[5] = rp->r_o5;
302 if (nargs > 6) {
303 ua = (caddr_t)rp->r_sp + MINFRAME + STACK_BIAS;
304 for (i = 6; i < nargs; i++) {
305 unsigned long a;
306 if (fulword(ua, &a) != 0)
307 return (-1);
308 lwp->lwp_arg[i] = a;
309 ua += sizeof (a);
310 }
311 }
312 }
313
314 out:
315 lwp->lwp_ap = lwp->lwp_arg;
316 lwp->lwp_argsaved = 1;
317 t->t_post_sys = 1; /* so lwp_ap will be reset */
318 return (0);
319 }
320
321 void
reset_syscall_args(void)322 reset_syscall_args(void)
323 {
324 klwp_t *lwp = ttolwp(curthread);
325
326 lwp->lwp_ap = (long *)&lwptoregs(lwp)->r_o0;
327 lwp->lwp_argsaved = 0;
328 }
329
330 /*
331 * nonexistent system call-- signal lwp (may want to handle it)
332 * flag error if lwp won't see signal immediately
333 * This works for old or new calling sequence.
334 */
335 int64_t
nosys()336 nosys()
337 {
338 tsignal(curthread, SIGSYS);
339 return ((int64_t)set_errno(ENOSYS));
340 }
341
/*
 * Perform pre-system-call processing, including stopping for tracing,
 * auditing, microstate-accounting, etc.
 *
 * This routine is called only if the t_pre_sys flag is set.  Any condition
 * requiring pre-syscall handling must set the t_pre_sys flag.  If the
 * condition is persistent, this routine will repost t_pre_sys.
 *
 *	arg0	- first argument of the system call; when t_sysnum is 0
 *		  and arg0 is non-zero the call is treated as an indirect
 *		  system call and arg0 becomes the system call number.
 *
 * Returns 0 if the system call should proceed.  Returns 1 (with EINTR
 * recorded through set_errno()) if the call was aborted via lwp_sysabort,
 * or the non-zero value returned by audit_start() if auditing refused the
 * call.
 */
int
pre_syscall(int arg0)
{
	unsigned int code;
	kthread_t *t = curthread;
	proc_t *p = ttoproc(t);
	klwp_t *lwp = ttolwp(t);
	struct regs *rp = lwptoregs(lwp);
	int	repost;

	t->t_pre_sys = repost = 0;	/* clear pre-syscall processing flag */

	ASSERT(t->t_schedflag & TS_DONT_SWAP);

	/* switch microstate accounting from user to system */
	syscall_mstate(LMS_USER, LMS_SYSTEM);

	/*
	 * The syscall arguments in the out registers should be pointed to
	 * by lwp_ap.  If the args need to be copied so that the outs can
	 * be changed without losing the ability to get the args for /proc,
	 * they can be saved by save_syscall_args(), and lwp_ap will be
	 * restored by post_syscall().
	 */
	ASSERT(lwp->lwp_ap == (long *)&rp->r_o0);

	/*
	 * Make sure the thread is holding the latest credentials for the
	 * process.  The credentials in the process right now apply to this
	 * thread for the entire system call.
	 */
	if (t->t_cred != p->p_cred) {
		cred_t *oldcred = t->t_cred;
		/*
		 * DTrace accesses t_cred in probe context.  t_cred must
		 * always be either NULL, or point to a valid, allocated cred
		 * structure.
		 */
		t->t_cred = crgetcred();
		crfree(oldcred);
	}

	/*
	 * Undo special arrangements to single-step the lwp
	 * so that a debugger will see valid register contents.
	 * Also so that the pc is valid for syncfpu().
	 * Also so that a syscall like exec() can be stepped.
	 */
	if (lwp->lwp_pcb.pcb_step != STEP_NONE) {
		(void) prundostep();
		repost = 1;
	}

	/*
	 * Check for indirect system call in case we stop for tracing.
	 * Don't allow multiple indirection.
	 */
	code = t->t_sysnum;
	if (code == 0 && arg0 != 0) {		/* indirect syscall */
		code = arg0;
		t->t_sysnum = arg0;
	}

	/*
	 * From the proc(4) manual page:
	 * When entry to a system call is being traced, the traced process
	 * stops after having begun the call to the system but before the
	 * system call arguments have been fetched from the process.
	 * If proc changes the args we must refetch them after starting.
	 */
	if (PTOU(p)->u_systrap) {
		if (prismember(&PTOU(p)->u_entrymask, code)) {
			/*
			 * Recheck stop condition, now that lock is held.
			 */
			mutex_enter(&p->p_lock);
			if (PTOU(p)->u_systrap &&
			    prismember(&PTOU(p)->u_entrymask, code)) {
				stop(PR_SYSENTRY, code);
				/*
				 * Must refetch args since they were
				 * possibly modified by /proc.  Indicate
				 * that the valid copy is in the
				 * registers.
				 */
				lwp->lwp_argsaved = 0;
				lwp->lwp_ap = (long *)&rp->r_o0;
			}
			mutex_exit(&p->p_lock);
		}
		/* tracing is still enabled: come back on the next syscall */
		repost = 1;
	}

	if (lwp->lwp_sysabort) {
		/*
		 * lwp_sysabort may have been set via /proc while the process
		 * was stopped on PR_SYSENTRY.  If so, abort the system call.
		 * Override any error from the copyin() of the arguments.
		 */
		lwp->lwp_sysabort = 0;
		(void) set_errno(EINTR);	/* sets post-sys processing */
		t->t_pre_sys = 1;	/* repost anyway */
		return (1);		/* don't do system call, return EINTR */
	}

	/* begin auditing for this syscall */
	if (audit_active == C2AUDIT_LOADED) {
		uint32_t auditing = au_zone_getstate(NULL);

		if (auditing & AU_AUDIT_MASK) {
			int error;
			if (error = audit_start(T_SYSCALL, code, auditing, \
			    0, lwp)) {
				t->t_pre_sys = 1;	/* repost anyway */
				lwp->lwp_error = 0;	/* for old drivers */
				return (error);
			}
			repost = 1;
		}
	}

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active) {
		TNF_PROBE_1(syscall_start, "syscall thread", /* CSTYLED */,
			tnf_sysnum,	sysnum,		t->t_sysnum);
		t->t_post_sys = 1;	/* make sure post_syscall runs */
		repost = 1;
	}
#endif /* NPROBE */

#ifdef SYSCALLTRACE
	/*
	 * Debug tracing: print "pid: name[code](args) comm id=thread".
	 * The arguments are saved first so they can be read via lwp_ap.
	 */
	if (syscalltrace) {
		int i;
		long *ap;
		char *cp;
		char *sysname;
		struct sysent *callp;

		if (code >= NSYSCALL)
			callp = &nosys_ent;	/* nosys has no args */
		else
			callp = LWP_GETSYSENT(lwp) + code;
		(void) save_syscall_args();
		mutex_enter(&systrace_lock);
		printf("%d: ", p->p_pid);
		if (code >= NSYSCALL)
			printf("0x%x", code);
		else {
			sysname = mod_getsysname(code);
			printf("%s[0x%x]", sysname == NULL ? "NULL" :
			    sysname, code);
		}
		cp = "(";
		for (i = 0, ap = lwp->lwp_ap; i < callp->sy_narg; i++, ap++) {
			printf("%s%lx", cp, *ap);
			cp = ", ";
		}
		if (i)
			printf(")");
		printf(" %s id=0x%p\n", PTOU(p)->u_comm, curthread);
		mutex_exit(&systrace_lock);
	}
#endif /* SYSCALLTRACE */

	/*
	 * If there was a continuing reason for pre-syscall processing,
	 * set the t_pre_sys flag for the next system call.
	 */
	if (repost)
		t->t_pre_sys = 1;
	lwp->lwp_error = 0;	/* for old drivers */
	lwp->lwp_badpriv = PRIV_NONE;	/* for privilege tracing */
	return (0);
}
524
/*
 * Post-syscall processing.  Perform abnormal system call completion
 * actions such as /proc tracing, profiling, signals, preemption, etc.
 *
 * This routine is called only if t_post_sys, t_sig_check, or t_astflag is set.
 * Any condition requiring post-syscall handling must set one of these.
 * If the condition is persistent, this routine will repost t_post_sys.
 *
 *	rval1, rval2	- the system call's return values; on a normal,
 *			  error-free return they are stored into the saved
 *			  %o0 and %o1 registers.
 */
void
post_syscall(long rval1, long rval2)
{
	kthread_t	*t = curthread;
	proc_t	*p = curproc;
	klwp_t	*lwp = ttolwp(t);
	struct regs *rp = lwptoregs(lwp);
	uint_t	error;
	int	code = t->t_sysnum;
	int	repost = 0;
	int	proc_stop = 0;		/* non-zero if stopping for /proc */
	int	sigprof = 0;		/* non-zero if sending SIGPROF */

	t->t_post_sys = 0;

	/* error recorded for this call by set_errno(), 0 if none */
	error = lwp->lwp_errno;

	/*
	 * Code can be zero if this is a new LWP returning after a forkall(),
	 * other than the one which matches the one in the parent which called
	 * forkall().  In these LWPs, skip most of post-syscall activity.
	 */
	if (code == 0)
		goto sig_check;

	/* put out audit record for this syscall */
	if (AU_AUDITING()) {
		rval_t	rval;	/* fix audit_finish() someday */

		/* XX64 -- truncation of 64-bit return values? */
		rval.r_val1 = (int)rval1;
		rval.r_val2 = (int)rval2;
		audit_finish(T_SYSCALL, code, error, &rval);
		repost = 1;
	}

	/* print and free any message left pending in t_pdmsg */
	if (curthread->t_pdmsg != NULL) {
		char *m = curthread->t_pdmsg;

		uprintf("%s", m);
		kmem_free(m, strlen(m) + 1);
		curthread->t_pdmsg = NULL;
	}

	/*
	 * If we're going to stop for /proc tracing, set the flag and
	 * save the arguments so that the return values don't smash them.
	 */
	if (PTOU(p)->u_systrap) {
		if (prismember(&PTOU(p)->u_exitmask, code)) {
			proc_stop = 1;
			(void) save_syscall_args();
		}
		repost = 1;
	}

	/*
	 * Similarly check to see if SIGPROF might be sent.
	 */
	if (curthread->t_rprof != NULL &&
	    curthread->t_rprof->rp_anystate != 0) {
		(void) save_syscall_args();
		sigprof = 1;
	}

	if (lwp->lwp_eosys == NORMALRETURN) {
		if (error == 0) {
#ifdef SYSCALLTRACE
			if (syscalltrace) {
				mutex_enter(&systrace_lock);
				printf(
				    "%d: r_val1=0x%lx, r_val2=0x%lx, id 0x%p\n",
				    p->p_pid, rval1, rval2, curthread);
				mutex_exit(&systrace_lock);
			}
#endif /* SYSCALLTRACE */
			/* success: clear TSTATE_IC, return both values */
			rp->r_tstate &= ~TSTATE_IC;
			rp->r_o0 = rval1;
			rp->r_o1 = rval2;
		} else {
			int sig;

#ifdef SYSCALLTRACE
			if (syscalltrace) {
				mutex_enter(&systrace_lock);
				printf("%d: error=%d, id 0x%p\n",
				    p->p_pid, error, curthread);
				mutex_exit(&systrace_lock);
			}
#endif /* SYSCALLTRACE */
			/* EINTR with a stale active fd is reported as EBADF */
			if (error == EINTR && t->t_activefd.a_stale)
				error = EBADF;
			/*
			 * EINTR becomes ERESTART when the current signal is
			 * in u_sigrestart and its disposition is neither
			 * SIG_DFL nor SIG_IGN.
			 */
			if (error == EINTR &&
			    (sig = lwp->lwp_cursig) != 0 &&
			    sigismember(&PTOU(p)->u_sigrestart, sig) &&
			    PTOU(p)->u_signal[sig - 1] != SIG_DFL &&
			    PTOU(p)->u_signal[sig - 1] != SIG_IGN)
				error = ERESTART;
			/* failure: error number in %o0, TSTATE_IC set */
			rp->r_o0 = error;
			rp->r_tstate |= TSTATE_IC;
		}
		/*
		 * The default action is to redo the trap instruction.
		 * We increment the pc and npc past it for NORMALRETURN.
		 * JUSTRETURN has set up a new pc and npc already.
		 * If we are a cloned thread of forkall(), don't
		 * adjust here because we have already inherited
		 * the adjusted values from our clone.
		 */
		if (!(t->t_flag & T_FORKALL)) {
			rp->r_pc = rp->r_npc;
			rp->r_npc += 4;
		}
	}

	/*
	 * From the proc(4) manual page:
	 * When exit from a system call is being traced, the traced process
	 * stops on completion of the system call just prior to checking for
	 * signals and returning to user level.  At this point all return
	 * values have been stored into the traced process's saved registers.
	 */
	if (proc_stop) {
		/* recheck the stop condition now that p_lock is held */
		mutex_enter(&p->p_lock);
		if (PTOU(p)->u_systrap &&
		    prismember(&PTOU(p)->u_exitmask, code))
			stop(PR_SYSEXIT, code);
		mutex_exit(&p->p_lock);
	}

	/*
	 * If we are the parent returning from a successful
	 * vfork, wait for the child to exec or exit.
	 * This code must be here and not in the bowels of the system
	 * so that /proc can intercept exit from vfork in a timely way.
	 */
	if (t->t_flag & T_VFPARENT) {
		ASSERT(code == SYS_vfork || code == SYS_forksys);
		ASSERT(rp->r_o1 == 0 && error == 0);
		vfwait((pid_t)rval1);
		t->t_flag &= ~T_VFPARENT;
	}

	/*
	 * If profiling is active, bill the current PC in user-land
	 * and keep reposting until profiling is disabled.
	 */
	if (p->p_prof.pr_scale) {
		if (lwp->lwp_oweupc)
			profil_tick(rp->r_pc);
		repost = 1;
	}

sig_check:
	/*
	 * Reset flag for next time.
	 * We must do this after stopping on PR_SYSEXIT
	 * because /proc uses the information in lwp_eosys.
	 */
	lwp->lwp_eosys = NORMALRETURN;
	clear_stale_fd();
	t->t_flag &= ~T_FORKALL;

	if (t->t_astflag | t->t_sig_check) {
		/*
		 * Turn off the AST flag before checking all the conditions that
		 * may have caused an AST.  This flag is on whenever a signal or
		 * unusual condition should be handled after the next trap or
		 * syscall.
		 */
		astoff(t);
		t->t_sig_check = 0;

		/*
		 * The following check is legal for the following reasons:
		 *	1) The thread we are checking, is ourselves, so there is
		 *	   no way the proc can go away.
		 *	2) The only time we need to be protected by the
		 *	   lock is if the binding is changed.
		 *
		 *	Note we will still take the lock and check the binding
		 *	if the condition was true without the lock held.  This
		 *	prevents lock contention among threads owned by the
		 *	same proc.
		 */

		if (curthread->t_proc_flag & TP_CHANGEBIND) {
			mutex_enter(&p->p_lock);
			if (curthread->t_proc_flag & TP_CHANGEBIND) {
				timer_lwpbind();
				curthread->t_proc_flag &= ~TP_CHANGEBIND;
			}
			mutex_exit(&p->p_lock);
		}

		/*
		 * for kaio requests on the special kaio poll queue,
		 * copyout their results to user memory.
		 */
		if (p->p_aio)
			aio_cleanup(0);

		/*
		 * If this LWP was asked to hold, call holdlwp(), which will
		 * stop.  holdlwps() sets this up and calls pokelwps() which
		 * sets the AST flag.
		 *
		 * Also check TP_EXITLWP, since this is used by fresh new LWPs
		 * through lwp_rtt().  That flag is set if the lwp_create(2)
		 * syscall failed after creating the LWP.
		 */
		if (ISHOLD(p) || (t->t_proc_flag & TP_EXITLWP))
			holdlwp();

		/*
		 * All code that sets signals and makes ISSIG_PENDING
		 * evaluate true must set t_sig_check afterwards.
		 */
		if (ISSIG_PENDING(t, lwp, p)) {
			if (issig(FORREAL))
				psig();
			t->t_sig_check = 1;	/* recheck next time */
		}

		if (sigprof) {
			/* argument count is 0 for out-of-range codes */
			int nargs = (code > 0 && code < NSYSCALL)?
			    LWP_GETSYSENT(lwp)[code].sy_narg : 0;
			realsigprof(code, nargs, error);
			t->t_sig_check = 1;	/* recheck next time */
		}

		/*
		 * If a performance counter overflow interrupt was
		 * delivered *during* the syscall, then re-enable the
		 * AST so that we take a trip through trap() to cause
		 * the SIGEMT to be delivered.
		 */
		if (lwp->lwp_pcb.pcb_flags & CPC_OVERFLOW)
			aston(t);

		/*
		 * If an asynchronous hardware error is pending, turn AST flag
		 * back on.  AST will be checked again before we return to user
		 * mode and we'll come back through trap() to handle the error.
		 */
		if (lwp->lwp_pcb.pcb_flags & ASYNC_HWERR)
			aston(t);
	}

	/*
	 * Restore register window if a debugger modified it.
	 * Set up to perform a single-step if a debugger requested it.
	 */
	if (lwp->lwp_pcb.pcb_xregstat != XREGNONE)
		xregrestore(lwp, 1);

	lwp->lwp_errno = 0;		/* clear error for next time */

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active) {
		TNF_PROBE_3(syscall_end, "syscall thread", /* CSTYLED */,
		    tnf_long,	rval1,		rval1,
		    tnf_long,	rval2,		rval2,
		    tnf_long,	errno,		(long)error);
		repost = 1;
	}
#endif /* NPROBE */

	/*
	 * Set state to LWP_USER here so preempt won't give us a kernel
	 * priority if it occurs after this point.  Call CL_TRAPRET() to
	 * restore the user-level priority.
	 *
	 * It is important that no locks (other than spinlocks) be entered
	 * after this point before returning to user mode (unless lwp_state
	 * is set back to LWP_SYS).
	 *
	 * Sampled times past this point are charged to the user.
	 */
	lwp->lwp_state = LWP_USER;

	if (t->t_trapret) {
		t->t_trapret = 0;
		thread_lock(t);
		CL_TRAPRET(t);
		thread_unlock(t);
	}
	if (CPU->cpu_runrun || t->t_schedflag & TS_ANYWAITQ)
		preempt();
	prunstop();

	/*
	 * t_post_sys will be set if pcb_step is active.
	 */
	if (lwp->lwp_pcb.pcb_step != STEP_NONE) {
		prdostep();
		repost = 1;
	}

	t->t_sysnum = 0;	/* no longer in a system call */

	/*
	 * In case the args were copied to the lwp, reset the
	 * pointer so the next syscall will have the right lwp_ap pointer.
	 */
	lwp->lwp_ap = (long *)&rp->r_o0;
	lwp->lwp_argsaved = 0;

	/*
	 * If there was a continuing reason for post-syscall processing,
	 * set the t_post_sys flag for the next system call.
	 */
	if (repost)
		t->t_post_sys = 1;

	/*
	 * If there is a ustack registered for this lwp, and the stack rlimit
	 * has been altered, read in the ustack.  If the saved stack rlimit
	 * matches the bounds of the ustack, update the ustack to reflect
	 * the new rlimit.  If the new stack rlimit is RLIM_INFINITY, disable
	 * stack checking by setting the size to 0.
	 */
	if (lwp->lwp_ustack != 0 && lwp->lwp_old_stk_ctl != 0) {
		rlim64_t new_size;
		model_t model;
		caddr_t top;
		struct rlimit64 rl;

		/* snapshot the process stack parameters under p_lock */
		mutex_enter(&p->p_lock);
		new_size = p->p_stk_ctl;
		model = p->p_model;
		top = p->p_usrstack;
		(void) rctl_rlimit_get(rctlproc_legacy[RLIMIT_STACK], p, &rl);
		mutex_exit(&p->p_lock);

		if (rl.rlim_cur == RLIM64_INFINITY)
			new_size = 0;

		if (model == DATAMODEL_NATIVE) {
			stack_t stk;

			/*
			 * Only rewrite a ustack that still describes the
			 * old limit (or has size 0) and ends at the top of
			 * the user stack.
			 */
			if (copyin((stack_t *)lwp->lwp_ustack, &stk,
			    sizeof (stack_t)) == 0 &&
			    (stk.ss_size == lwp->lwp_old_stk_ctl ||
			    stk.ss_size == 0) &&
			    stk.ss_sp == top - stk.ss_size) {
				stk.ss_sp = (void *)((uintptr_t)stk.ss_sp +
				    stk.ss_size - new_size);
				stk.ss_size = new_size;

				(void) copyout(&stk,
				    (stack_t *)lwp->lwp_ustack,
				    sizeof (stack_t));
			}
		} else {
			stack32_t stk32;

			/* same check and update using the 32-bit layout */
			if (copyin((stack32_t *)lwp->lwp_ustack, &stk32,
			    sizeof (stack32_t)) == 0 &&
			    (stk32.ss_size == lwp->lwp_old_stk_ctl ||
			    stk32.ss_size == 0) &&
			    stk32.ss_sp ==
			    (caddr32_t)(uintptr_t)(top - stk32.ss_size)) {
				stk32.ss_sp += stk32.ss_size - new_size;
				stk32.ss_size = new_size;

				(void) copyout(&stk32,
				    (stack32_t *)lwp->lwp_ustack,
				    sizeof (stack32_t));
			}
		}

		lwp->lwp_old_stk_ctl = 0;
	}

	/* switch microstate accounting back from system to user */
	syscall_mstate(LMS_SYSTEM, LMS_USER);
}
911
912 /*
913 * Call a system call which takes a pointer to the user args struct and
914 * a pointer to the return values. This is a bit slower than the standard
915 * C arg-passing method in some cases.
916 */
917 int64_t
syscall_ap()918 syscall_ap()
919 {
920 uint_t error;
921 struct sysent *callp;
922 rval_t rval;
923 klwp_t *lwp = ttolwp(curthread);
924 struct regs *rp = lwptoregs(lwp);
925
926 callp = LWP_GETSYSENT(lwp) + curthread->t_sysnum;
927
928 /*
929 * If the arguments don't fit in registers %o0 - o5, make sure they
930 * have been copied to the lwp_arg array.
931 */
932 if (callp->sy_narg > 6 && save_syscall_args())
933 return ((int64_t)set_errno(EFAULT));
934
935 rval.r_val1 = 0;
936 rval.r_val2 = (int)rp->r_o1;
937 lwp->lwp_error = 0; /* for old drivers */
938 error = (*(callp->sy_call))(lwp->lwp_ap, &rval);
939 if (error)
940 return ((int64_t)set_errno(error));
941 return (rval.r_vals);
942 }
943
944 /*
945 * Load system call module.
946 * Returns with pointer to held read lock for module.
947 */
948 static krwlock_t *
lock_syscall(struct sysent * table,uint_t code)949 lock_syscall(struct sysent *table, uint_t code)
950 {
951 krwlock_t *module_lock;
952 struct modctl *modp;
953 int id;
954 struct sysent *callp;
955
956 module_lock = table[code].sy_lock;
957 callp = &table[code];
958
959 /*
960 * Optimization to only call modload if we don't have a loaded
961 * syscall.
962 */
963 rw_enter(module_lock, RW_READER);
964 if (LOADED_SYSCALL(callp))
965 return (module_lock);
966 rw_exit(module_lock);
967
968 for (;;) {
969 if ((id = modload("sys", syscallnames[code])) == -1)
970 break;
971
972 /*
973 * If we loaded successfully at least once, the modctl
974 * will still be valid, so we try to grab it by filename.
975 * If this call fails, it's because the mod_filename
976 * was changed after the call to modload() (mod_hold_by_name()
977 * is the likely culprit). We can safely just take
978 * another lap if this is the case; the modload() will
979 * change the mod_filename back to one by which we can
980 * find the modctl.
981 */
982 modp = mod_find_by_filename("sys", syscallnames[code]);
983
984 if (modp == NULL)
985 continue;
986
987 mutex_enter(&mod_lock);
988
989 if (!modp->mod_installed) {
990 mutex_exit(&mod_lock);
991 continue;
992 }
993 break;
994 }
995
996 rw_enter(module_lock, RW_READER);
997
998 if (id != -1)
999 mutex_exit(&mod_lock);
1000
1001 return (module_lock);
1002 }
1003
1004 /*
1005 * Loadable syscall support.
1006 * If needed, load the module, then reserve it by holding a read
1007 * lock for the duration of the call.
1008 * Later, if the syscall is not unloadable, it could patch the vector.
1009 */
1010 /*ARGSUSED*/
1011 int64_t
loadable_syscall(long a0,long a1,long a2,long a3,long a4,long a5,long a6,long a7)1012 loadable_syscall(
1013 long a0, long a1, long a2, long a3,
1014 long a4, long a5, long a6, long a7)
1015 {
1016 int64_t rval;
1017 struct sysent *callp;
1018 struct sysent *se = LWP_GETSYSENT(ttolwp(curthread));
1019 krwlock_t *module_lock;
1020 int code;
1021
1022 code = curthread->t_sysnum;
1023 callp = se + code;
1024
1025 /*
1026 * Try to autoload the system call if necessary.
1027 */
1028 module_lock = lock_syscall(se, code);
1029 THREAD_KPRI_RELEASE(); /* drop priority given by rw_enter */
1030
1031 /*
1032 * we've locked either the loaded syscall or nosys
1033 */
1034 if (callp->sy_flags & SE_ARGC) {
1035 int64_t (*sy_call)();
1036
1037 sy_call = (int64_t (*)())callp->sy_call;
1038 rval = (*sy_call)(a0, a1, a2, a3, a4, a5);
1039 } else {
1040 rval = syscall_ap();
1041 }
1042
1043 THREAD_KPRI_REQUEST(); /* regain priority from read lock */
1044 rw_exit(module_lock);
1045 return (rval);
1046 }
1047
1048 /*
1049 * Handle indirect system calls.
1050 * This interface should be deprecated. The library can handle
1051 * this more efficiently, but keep this implementation for old binaries.
1052 *
1053 * XX64 Needs some work.
1054 */
1055 int64_t
indir(int code,long a0,long a1,long a2,long a3,long a4)1056 indir(int code, long a0, long a1, long a2, long a3, long a4)
1057 {
1058 klwp_t *lwp = ttolwp(curthread);
1059 struct sysent *callp;
1060
1061 if (code <= 0 || code >= NSYSCALL)
1062 return (nosys());
1063
1064 ASSERT(lwp->lwp_ap != NULL);
1065
1066 curthread->t_sysnum = code;
1067 callp = LWP_GETSYSENT(lwp) + code;
1068
1069 /*
1070 * Handle argument setup, unless already done in pre_syscall().
1071 */
1072 if (callp->sy_narg > 5) {
1073 if (save_syscall_args()) /* move args to LWP array */
1074 return ((int64_t)set_errno(EFAULT));
1075 } else if (!lwp->lwp_argsaved) {
1076 long *ap;
1077
1078 ap = lwp->lwp_ap; /* args haven't been saved */
1079 lwp->lwp_ap = ap + 1; /* advance arg pointer */
1080 curthread->t_post_sys = 1; /* so lwp_ap will be reset */
1081 }
1082 return ((*callp->sy_callc)(a0, a1, a2, a3, a4, lwp->lwp_arg[5]));
1083 }
1084
1085 /*
1086 * set_errno - set an error return from the current system call.
1087 * This could be a macro.
1088 * This returns the value it is passed, so that the caller can
1089 * use tail-recursion-elimination and do return (set_errno(ERRNO));
1090 */
1091 uint_t
set_errno(uint_t error)1092 set_errno(uint_t error)
1093 {
1094 ASSERT(error != 0); /* must not be used to clear errno */
1095
1096 curthread->t_post_sys = 1; /* have post_syscall do error return */
1097 return (ttolwp(curthread)->lwp_errno = error);
1098 }
1099
1100 /*
1101 * set_proc_pre_sys - Set pre-syscall processing for entire process.
1102 */
1103 void
set_proc_pre_sys(proc_t * p)1104 set_proc_pre_sys(proc_t *p)
1105 {
1106 kthread_t *t;
1107 kthread_t *first;
1108
1109 ASSERT(MUTEX_HELD(&p->p_lock));
1110
1111 t = first = p->p_tlist;
1112 do {
1113 t->t_pre_sys = 1;
1114 } while ((t = t->t_forw) != first);
1115 }
1116
1117 /*
1118 * set_proc_post_sys - Set post-syscall processing for entire process.
1119 */
1120 void
set_proc_post_sys(proc_t * p)1121 set_proc_post_sys(proc_t *p)
1122 {
1123 kthread_t *t;
1124 kthread_t *first;
1125
1126 ASSERT(MUTEX_HELD(&p->p_lock));
1127
1128 t = first = p->p_tlist;
1129 do {
1130 t->t_post_sys = 1;
1131 } while ((t = t->t_forw) != first);
1132 }
1133
1134 /*
1135 * set_proc_sys - Set pre- and post-syscall processing for entire process.
1136 */
1137 void
set_proc_sys(proc_t * p)1138 set_proc_sys(proc_t *p)
1139 {
1140 kthread_t *t;
1141 kthread_t *first;
1142
1143 ASSERT(MUTEX_HELD(&p->p_lock));
1144
1145 t = first = p->p_tlist;
1146 do {
1147 t->t_pre_sys = 1;
1148 t->t_post_sys = 1;
1149 } while ((t = t->t_forw) != first);
1150 }
1151
1152 /*
1153 * set_all_proc_sys - set pre- and post-syscall processing flags for all
1154 * user processes.
1155 *
1156 * This is needed when auditing, tracing, or other facilities which affect
1157 * all processes are turned on.
1158 */
1159 void
set_all_proc_sys()1160 set_all_proc_sys()
1161 {
1162 kthread_t *t;
1163 kthread_t *first;
1164
1165 mutex_enter(&pidlock);
1166 t = first = curthread;
1167 do {
1168 t->t_pre_sys = 1;
1169 t->t_post_sys = 1;
1170 } while ((t = t->t_next) != first);
1171 mutex_exit(&pidlock);
1172 }
1173
1174 /*
1175 * set_all_zone_usr_proc_sys - set pre- and post-syscall processing flags for
1176 * all user processes running in the zone of the current process
1177 *
1178 * This is needed when auditing is turned on.
1179 */
1180 void
set_all_zone_usr_proc_sys(zoneid_t zoneid)1181 set_all_zone_usr_proc_sys(zoneid_t zoneid)
1182 {
1183 proc_t *p;
1184 kthread_t *t;
1185
1186 mutex_enter(&pidlock);
1187 for (p = practive; p != NULL; p = p->p_next) {
1188 /* skip kernel processes */
1189 if (p->p_exec == NULLVP || p->p_as == &kas ||
1190 p->p_stat == SIDL || p->p_stat == SZOMB ||
1191 (p->p_flag & (SSYS | SEXITING | SEXITLWPS)))
1192 continue;
1193 /*
1194 * Only processes in the given zone (eventually in
1195 * all zones) are taken into account
1196 */
1197 if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) {
1198 mutex_enter(&p->p_lock);
1199 if ((t = p->p_tlist) == NULL) {
1200 mutex_exit(&p->p_lock);
1201 continue;
1202 }
1203 /*
1204 * Set pre- and post-syscall processing flags
1205 * for all threads of the process
1206 */
1207 do {
1208 t->t_pre_sys = 1;
1209 t->t_post_sys = 1;
1210 } while (p->p_tlist != (t = t->t_forw));
1211 mutex_exit(&p->p_lock);
1212 }
1213 }
1214 mutex_exit(&pidlock);
1215 }
1216
1217 /*
1218 * set_proc_ast - Set asynchronous service trap (AST) flag for all
1219 * threads in process.
1220 */
1221 void
set_proc_ast(proc_t * p)1222 set_proc_ast(proc_t *p)
1223 {
1224 kthread_t *t;
1225 kthread_t *first;
1226
1227 ASSERT(MUTEX_HELD(&p->p_lock));
1228
1229 t = first = p->p_tlist;
1230 do {
1231 aston(t);
1232 } while ((t = t->t_forw) != first);
1233 }
1234