/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 */

27 #include <sys/param.h>
28 #include <sys/vmparam.h>
29 #include <sys/types.h>
30 #include <sys/sysmacros.h>
31 #include <sys/systm.h>
32 #include <sys/cmn_err.h>
33 #include <sys/signal.h>
34 #include <sys/stack.h>
35 #include <sys/cred.h>
36 #include <sys/user.h>
37 #include <sys/debug.h>
38 #include <sys/errno.h>
39 #include <sys/proc.h>
40 #include <sys/var.h>
41 #include <sys/inline.h>
42 #include <sys/syscall.h>
43 #include <sys/ucontext.h>
44 #include <sys/cpuvar.h>
45 #include <sys/siginfo.h>
46 #include <sys/trap.h>
47 #include <sys/machtrap.h>
48 #include <sys/sysinfo.h>
49 #include <sys/procfs.h>
50 #include <sys/prsystm.h>
51 #include <sys/fpu/fpusystm.h>
52 #include <sys/modctl.h>
53 #include <sys/aio_impl.h>
54 #include <c2/audit.h>
55 #include <sys/tnf.h>
56 #include <sys/tnf_probe.h>
57 #include <sys/machpcb.h>
58 #include <sys/privregs.h>
59 #include <sys/copyops.h>
60 #include <sys/timer.h>
61 #include <sys/priv.h>
62 #include <sys/msacct.h>
63
64 int syscalltrace = 0;
65 #ifdef SYSCALLTRACE
66 static kmutex_t systrace_lock; /* syscall tracing lock */
67 #endif /* SYSCALLTRACE */
68
69 static krwlock_t *lock_syscall(struct sysent *, uint_t);
70
/*
 * LWP_GETSYSENT(lwp) yields the system call table to use for the lwp.
 * When 32-bit system calls are supported (_SYSCALL32_IMPL) the table is
 * chosen by the lwp's data model: sysent for native lwps, sysent32
 * otherwise.  Without _SYSCALL32_IMPL there is only sysent.
 */
#ifdef _SYSCALL32_IMPL
static struct sysent *
lwp_getsysent(klwp_t *lwp)
{
	if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE)
		return (sysent);
	return (sysent32);
}
#define	LWP_GETSYSENT(lwp)	(lwp_getsysent(lwp))
#else
#define	LWP_GETSYSENT(lwp)	(sysent)
#endif

84 /*
85 * Called to restore the lwp's register window just before
86 * returning to user level (only if the registers have been
87 * fetched or modified through /proc).
88 */
89 /*ARGSUSED1*/
90 void
xregrestore(klwp_t * lwp,int shared)91 xregrestore(klwp_t *lwp, int shared)
92 {
93 /*
94 * If locals+ins were modified by /proc copy them out.
95 * Also copy to the shared window, if necessary.
96 */
97 if (lwp->lwp_pcb.pcb_xregstat == XREGMODIFIED) {
98 struct machpcb *mpcb = lwptompcb(lwp);
99 caddr_t sp = (caddr_t)lwptoregs(lwp)->r_sp;
100
101 size_t rwinsize;
102 caddr_t rwp;
103 int is64;
104
105 if (lwp_getdatamodel(lwp) == DATAMODEL_LP64) {
106 rwinsize = sizeof (struct rwindow);
107 rwp = sp + STACK_BIAS;
108 is64 = 1;
109 } else {
110 rwinsize = sizeof (struct rwindow32);
111 sp = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)sp;
112 rwp = sp;
113 is64 = 0;
114 }
115
116 if (is64)
117 (void) copyout_nowatch(&lwp->lwp_pcb.pcb_xregs,
118 rwp, rwinsize);
119 else {
120 struct rwindow32 rwindow32;
121 int watched;
122
123 watched = watch_disable_addr(rwp, rwinsize, S_WRITE);
124 rwindow_nto32(&lwp->lwp_pcb.pcb_xregs, &rwindow32);
125 (void) copyout(&rwindow32, rwp, rwinsize);
126 if (watched)
127 watch_enable_addr(rwp, rwinsize, S_WRITE);
128 }
129
130 /* also copy to the user return window */
131 mpcb->mpcb_rsp[0] = sp;
132 mpcb->mpcb_rsp[1] = NULL;
133 bcopy(&lwp->lwp_pcb.pcb_xregs, &mpcb->mpcb_rwin[0],
134 sizeof (lwp->lwp_pcb.pcb_xregs));
135 }
136 lwp->lwp_pcb.pcb_xregstat = XREGNONE;
137 }
138
139
140 /*
141 * Get the arguments to the current system call.
142 * lwp->lwp_ap normally points to the out regs in the reg structure.
143 * If the user is going to change the out registers and might want to
144 * get the args (for /proc tracing), it must copy the args elsewhere
145 * via save_syscall_args().
146 */
147 uint_t
get_syscall_args(klwp_t * lwp,long * argp,int * nargsp)148 get_syscall_args(klwp_t *lwp, long *argp, int *nargsp)
149 {
150 kthread_t *t = lwptot(lwp);
151 uint_t code = t->t_sysnum;
152 long mask;
153 long *ap;
154 int nargs;
155
156 if (lwptoproc(lwp)->p_model == DATAMODEL_ILP32)
157 mask = (uint32_t)0xffffffffU;
158 else
159 mask = 0xffffffffffffffff;
160
161 if (code != 0 && code < NSYSCALL) {
162
163 nargs = LWP_GETSYSENT(lwp)[code].sy_narg;
164
165 ASSERT(nargs <= MAXSYSARGS);
166
167 *nargsp = nargs;
168 ap = lwp->lwp_ap;
169 while (nargs-- > 0)
170 *argp++ = *ap++ & mask;
171 } else {
172 *nargsp = 0;
173 }
174 return (code);
175 }
176
#ifdef _SYSCALL32_IMPL
/*
 * Get the arguments to the current 32-bit system call.
 *
 * Fetches the arguments with get_syscall_args() and narrows each one to
 * an int in argp[].  *nargsp receives the argument count.  Returns the
 * system call number.
 */
uint_t
get_syscall32_args(klwp_t *lwp, int *argp, int *nargsp)
{
	long args[MAXSYSARGS];
	uint_t i, code;

	code = get_syscall_args(lwp, args, nargsp);
	/* bounded '<' test rather than '!=' across mixed signedness */
	for (i = 0; i < (uint_t)*nargsp; i++)
		*argp++ = (int)args[i];
	return (code);
}
#endif

194 /*
195 * Save the system call arguments in a safe place.
196 * lwp->lwp_ap normally points to the out regs in the reg structure.
197 * If the user is going to change the out registers, g1, or the stack,
198 * and might want to get the args (for /proc tracing), it must copy
199 * the args elsewhere via save_syscall_args().
200 *
201 * This may be called from stop() even when we're not in a system call.
202 * Since there's no easy way to tell, this must be safe (not panic).
203 * If the copyins get data faults, return non-zero.
204 */
205 int
save_syscall_args()206 save_syscall_args()
207 {
208 kthread_t *t = curthread;
209 klwp_t *lwp = ttolwp(t);
210 struct regs *rp = lwptoregs(lwp);
211 uint_t code = t->t_sysnum;
212 uint_t nargs;
213 int i;
214 caddr_t ua;
215 model_t datamodel;
216
217 if (lwp->lwp_argsaved || code == 0)
218 return (0); /* args already saved or not needed */
219
220 if (code >= NSYSCALL) {
221 nargs = 0; /* illegal syscall */
222 } else {
223 struct sysent *se = LWP_GETSYSENT(lwp);
224 struct sysent *callp = se + code;
225
226 nargs = callp->sy_narg;
227 if (LOADABLE_SYSCALL(callp) && nargs == 0) {
228 krwlock_t *module_lock;
229
230 /*
231 * Find out how many arguments the system
232 * call uses.
233 *
234 * We have the property that loaded syscalls
235 * never change the number of arguments they
236 * use after they've been loaded once. This
237 * allows us to stop for /proc tracing without
238 * holding the module lock.
239 * /proc is assured that sy_narg is valid.
240 */
241 module_lock = lock_syscall(se, code);
242 nargs = callp->sy_narg;
243 rw_exit(module_lock);
244 }
245 }
246
247 /*
248 * Fetch the system call arguments.
249 */
250 if (nargs == 0)
251 goto out;
252
253
254 ASSERT(nargs <= MAXSYSARGS);
255
256 if ((datamodel = lwp_getdatamodel(lwp)) == DATAMODEL_ILP32) {
257
258 if (rp->r_g1 == 0) { /* indirect syscall */
259
260 lwp->lwp_arg[0] = (uint32_t)rp->r_o1;
261 lwp->lwp_arg[1] = (uint32_t)rp->r_o2;
262 lwp->lwp_arg[2] = (uint32_t)rp->r_o3;
263 lwp->lwp_arg[3] = (uint32_t)rp->r_o4;
264 lwp->lwp_arg[4] = (uint32_t)rp->r_o5;
265 if (nargs > 5) {
266 ua = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)
267 (rp->r_sp + MINFRAME32);
268 for (i = 5; i < nargs; i++) {
269 uint32_t a;
270 if (fuword32(ua, &a) != 0)
271 return (-1);
272 lwp->lwp_arg[i] = a;
273 ua += sizeof (a);
274 }
275 }
276 } else {
277 lwp->lwp_arg[0] = (uint32_t)rp->r_o0;
278 lwp->lwp_arg[1] = (uint32_t)rp->r_o1;
279 lwp->lwp_arg[2] = (uint32_t)rp->r_o2;
280 lwp->lwp_arg[3] = (uint32_t)rp->r_o3;
281 lwp->lwp_arg[4] = (uint32_t)rp->r_o4;
282 lwp->lwp_arg[5] = (uint32_t)rp->r_o5;
283 if (nargs > 6) {
284 ua = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)
285 (rp->r_sp + MINFRAME32);
286 for (i = 6; i < nargs; i++) {
287 uint32_t a;
288 if (fuword32(ua, &a) != 0)
289 return (-1);
290 lwp->lwp_arg[i] = a;
291 ua += sizeof (a);
292 }
293 }
294 }
295 } else {
296 ASSERT(datamodel == DATAMODEL_LP64);
297 lwp->lwp_arg[0] = rp->r_o0;
298 lwp->lwp_arg[1] = rp->r_o1;
299 lwp->lwp_arg[2] = rp->r_o2;
300 lwp->lwp_arg[3] = rp->r_o3;
301 lwp->lwp_arg[4] = rp->r_o4;
302 lwp->lwp_arg[5] = rp->r_o5;
303 if (nargs > 6) {
304 ua = (caddr_t)rp->r_sp + MINFRAME + STACK_BIAS;
305 for (i = 6; i < nargs; i++) {
306 unsigned long a;
307 if (fulword(ua, &a) != 0)
308 return (-1);
309 lwp->lwp_arg[i] = a;
310 ua += sizeof (a);
311 }
312 }
313 }
314
315 out:
316 lwp->lwp_ap = lwp->lwp_arg;
317 lwp->lwp_argsaved = 1;
318 t->t_post_sys = 1; /* so lwp_ap will be reset */
319 return (0);
320 }
321
322 void
reset_syscall_args(void)323 reset_syscall_args(void)
324 {
325 klwp_t *lwp = ttolwp(curthread);
326
327 lwp->lwp_ap = (long *)&lwptoregs(lwp)->r_o0;
328 lwp->lwp_argsaved = 0;
329 }
330
331 /*
332 * nonexistent system call-- signal lwp (may want to handle it)
333 * flag error if lwp won't see signal immediately
334 * This works for old or new calling sequence.
335 */
336 int64_t
nosys()337 nosys()
338 {
339 tsignal(curthread, SIGSYS);
340 return ((int64_t)set_errno(ENOSYS));
341 }
342
343 /*
344 * Perform pre-system-call processing, including stopping for tracing,
345 * auditing, microstate-accounting, etc.
346 *
347 * This routine is called only if the t_pre_sys flag is set. Any condition
348 * requiring pre-syscall handling must set the t_pre_sys flag. If the
349 * condition is persistent, this routine will repost t_pre_sys.
350 */
351 int
pre_syscall(int arg0)352 pre_syscall(int arg0)
353 {
354 unsigned int code;
355 kthread_t *t = curthread;
356 proc_t *p = ttoproc(t);
357 klwp_t *lwp = ttolwp(t);
358 struct regs *rp = lwptoregs(lwp);
359 int repost;
360
361 t->t_pre_sys = repost = 0; /* clear pre-syscall processing flag */
362
363 ASSERT(t->t_schedflag & TS_DONT_SWAP);
364
365 syscall_mstate(LMS_USER, LMS_SYSTEM);
366
367 /*
368 * The syscall arguments in the out registers should be pointed to
369 * by lwp_ap. If the args need to be copied so that the outs can
370 * be changed without losing the ability to get the args for /proc,
371 * they can be saved by save_syscall_args(), and lwp_ap will be
372 * restored by post_syscall().
373 */
374 ASSERT(lwp->lwp_ap == (long *)&rp->r_o0);
375
376 /*
377 * Make sure the thread is holding the latest credentials for the
378 * process. The credentials in the process right now apply to this
379 * thread for the entire system call.
380 */
381 if (t->t_cred != p->p_cred) {
382 cred_t *oldcred = t->t_cred;
383 /*
384 * DTrace accesses t_cred in probe context. t_cred must
385 * always be either NULL, or point to a valid, allocated cred
386 * structure.
387 */
388 t->t_cred = crgetcred();
389 crfree(oldcred);
390 }
391
392 /*
393 * Undo special arrangements to single-step the lwp
394 * so that a debugger will see valid register contents.
395 * Also so that the pc is valid for syncfpu().
396 * Also so that a syscall like exec() can be stepped.
397 */
398 if (lwp->lwp_pcb.pcb_step != STEP_NONE) {
399 (void) prundostep();
400 repost = 1;
401 }
402
403 /*
404 * Check for indirect system call in case we stop for tracing.
405 * Don't allow multiple indirection.
406 */
407 code = t->t_sysnum;
408 if (code == 0 && arg0 != 0) { /* indirect syscall */
409 code = arg0;
410 t->t_sysnum = arg0;
411 }
412
413 /*
414 * From the proc(4) manual page:
415 * When entry to a system call is being traced, the traced process
416 * stops after having begun the call to the system but before the
417 * system call arguments have been fetched from the process.
418 * If proc changes the args we must refetch them after starting.
419 */
420 if (PTOU(p)->u_systrap) {
421 if (prismember(&PTOU(p)->u_entrymask, code)) {
422 /*
423 * Recheck stop condition, now that lock is held.
424 */
425 mutex_enter(&p->p_lock);
426 if (PTOU(p)->u_systrap &&
427 prismember(&PTOU(p)->u_entrymask, code)) {
428 stop(PR_SYSENTRY, code);
429 /*
430 * Must refetch args since they were
431 * possibly modified by /proc. Indicate
432 * that the valid copy is in the
433 * registers.
434 */
435 lwp->lwp_argsaved = 0;
436 lwp->lwp_ap = (long *)&rp->r_o0;
437 }
438 mutex_exit(&p->p_lock);
439 }
440 repost = 1;
441 }
442
443 if (lwp->lwp_sysabort) {
444 /*
445 * lwp_sysabort may have been set via /proc while the process
446 * was stopped on PR_SYSENTRY. If so, abort the system call.
447 * Override any error from the copyin() of the arguments.
448 */
449 lwp->lwp_sysabort = 0;
450 (void) set_errno(EINTR); /* sets post-sys processing */
451 t->t_pre_sys = 1; /* repost anyway */
452 return (1); /* don't do system call, return EINTR */
453 }
454
455 /* begin auditing for this syscall */
456 if (audit_active == C2AUDIT_LOADED) {
457 uint32_t auditing = au_zone_getstate(NULL);
458
459 if (auditing & AU_AUDIT_MASK) {
460 int error;
461 if (error = audit_start(T_SYSCALL, code, auditing, \
462 0, lwp)) {
463 t->t_pre_sys = 1; /* repost anyway */
464 lwp->lwp_error = 0; /* for old drivers */
465 return (error);
466 }
467 repost = 1;
468 }
469 }
470
471 #ifndef NPROBE
472 /* Kernel probe */
473 if (tnf_tracing_active) {
474 TNF_PROBE_1(syscall_start, "syscall thread", /* CSTYLED */,
475 tnf_sysnum, sysnum, t->t_sysnum);
476 t->t_post_sys = 1; /* make sure post_syscall runs */
477 repost = 1;
478 }
479 #endif /* NPROBE */
480
481 #ifdef SYSCALLTRACE
482 if (syscalltrace) {
483 int i;
484 long *ap;
485 char *cp;
486 char *sysname;
487 struct sysent *callp;
488
489 if (code >= NSYSCALL)
490 callp = &nosys_ent; /* nosys has no args */
491 else
492 callp = LWP_GETSYSENT(lwp) + code;
493 (void) save_syscall_args();
494 mutex_enter(&systrace_lock);
495 printf("%d: ", p->p_pid);
496 if (code >= NSYSCALL)
497 printf("0x%x", code);
498 else {
499 sysname = mod_getsysname(code);
500 printf("%s[0x%x]", sysname == NULL ? "NULL" :
501 sysname, code);
502 }
503 cp = "(";
504 for (i = 0, ap = lwp->lwp_ap; i < callp->sy_narg; i++, ap++) {
505 printf("%s%lx", cp, *ap);
506 cp = ", ";
507 }
508 if (i)
509 printf(")");
510 printf(" %s id=0x%p\n", PTOU(p)->u_comm, curthread);
511 mutex_exit(&systrace_lock);
512 }
513 #endif /* SYSCALLTRACE */
514
515 /*
516 * If there was a continuing reason for pre-syscall processing,
517 * set the t_pre_sys flag for the next system call.
518 */
519 if (repost)
520 t->t_pre_sys = 1;
521 lwp->lwp_error = 0; /* for old drivers */
522 lwp->lwp_badpriv = PRIV_NONE; /* for privilege tracing */
523 return (0);
524 }
525
526 /*
527 * Post-syscall processing. Perform abnormal system call completion
528 * actions such as /proc tracing, profiling, signals, preemption, etc.
529 *
530 * This routine is called only if t_post_sys, t_sig_check, or t_astflag is set.
531 * Any condition requiring pre-syscall handling must set one of these.
532 * If the condition is persistent, this routine will repost t_post_sys.
533 */
534 void
post_syscall(long rval1,long rval2)535 post_syscall(long rval1, long rval2)
536 {
537 kthread_t *t = curthread;
538 proc_t *p = curproc;
539 klwp_t *lwp = ttolwp(t);
540 struct regs *rp = lwptoregs(lwp);
541 uint_t error;
542 int code = t->t_sysnum;
543 int repost = 0;
544 int proc_stop = 0; /* non-zero if stopping for /proc */
545 int sigprof = 0; /* non-zero if sending SIGPROF */
546
547 t->t_post_sys = 0;
548
549 error = lwp->lwp_errno;
550
551 /*
552 * Code can be zero if this is a new LWP returning after a forkall(),
553 * other than the one which matches the one in the parent which called
554 * forkall(). In these LWPs, skip most of post-syscall activity.
555 */
556 if (code == 0)
557 goto sig_check;
558
559 /* put out audit record for this syscall */
560 if (AU_AUDITING()) {
561 rval_t rval; /* fix audit_finish() someday */
562
563 /* XX64 -- truncation of 64-bit return values? */
564 rval.r_val1 = (int)rval1;
565 rval.r_val2 = (int)rval2;
566 audit_finish(T_SYSCALL, code, error, &rval);
567 repost = 1;
568 }
569
570 if (curthread->t_pdmsg != NULL) {
571 char *m = curthread->t_pdmsg;
572
573 uprintf("%s", m);
574 kmem_free(m, strlen(m) + 1);
575 curthread->t_pdmsg = NULL;
576 }
577
578 /*
579 * If we're going to stop for /proc tracing, set the flag and
580 * save the arguments so that the return values don't smash them.
581 */
582 if (PTOU(p)->u_systrap) {
583 if (prismember(&PTOU(p)->u_exitmask, code)) {
584 proc_stop = 1;
585 (void) save_syscall_args();
586 }
587 repost = 1;
588 }
589
590 /*
591 * Similarly check to see if SIGPROF might be sent.
592 */
593 if (curthread->t_rprof != NULL &&
594 curthread->t_rprof->rp_anystate != 0) {
595 (void) save_syscall_args();
596 sigprof = 1;
597 }
598
599 if (lwp->lwp_eosys == NORMALRETURN) {
600 if (error == 0) {
601 #ifdef SYSCALLTRACE
602 if (syscalltrace) {
603 mutex_enter(&systrace_lock);
604 printf(
605 "%d: r_val1=0x%lx, r_val2=0x%lx, id 0x%p\n",
606 p->p_pid, rval1, rval2, curthread);
607 mutex_exit(&systrace_lock);
608 }
609 #endif /* SYSCALLTRACE */
610 rp->r_tstate &= ~TSTATE_IC;
611 rp->r_o0 = rval1;
612 rp->r_o1 = rval2;
613 } else {
614 int sig;
615
616 #ifdef SYSCALLTRACE
617 if (syscalltrace) {
618 mutex_enter(&systrace_lock);
619 printf("%d: error=%d, id 0x%p\n",
620 p->p_pid, error, curthread);
621 mutex_exit(&systrace_lock);
622 }
623 #endif /* SYSCALLTRACE */
624 if (error == EINTR && t->t_activefd.a_stale)
625 error = EBADF;
626 if (error == EINTR &&
627 (sig = lwp->lwp_cursig) != 0 &&
628 sigismember(&PTOU(p)->u_sigrestart, sig) &&
629 PTOU(p)->u_signal[sig - 1] != SIG_DFL &&
630 PTOU(p)->u_signal[sig - 1] != SIG_IGN)
631 error = ERESTART;
632 rp->r_o0 = error;
633 rp->r_tstate |= TSTATE_IC;
634 }
635 /*
636 * The default action is to redo the trap instruction.
637 * We increment the pc and npc past it for NORMALRETURN.
638 * JUSTRETURN has set up a new pc and npc already.
639 * If we are a cloned thread of forkall(), don't
640 * adjust here because we have already inherited
641 * the adjusted values from our clone.
642 */
643 if (!(t->t_flag & T_FORKALL)) {
644 rp->r_pc = rp->r_npc;
645 rp->r_npc += 4;
646 }
647 }
648
649 /*
650 * From the proc(4) manual page:
651 * When exit from a system call is being traced, the traced process
652 * stops on completion of the system call just prior to checking for
653 * signals and returning to user level. At this point all return
654 * values have been stored into the traced process's saved registers.
655 */
656 if (proc_stop) {
657 mutex_enter(&p->p_lock);
658 if (PTOU(p)->u_systrap &&
659 prismember(&PTOU(p)->u_exitmask, code))
660 stop(PR_SYSEXIT, code);
661 mutex_exit(&p->p_lock);
662 }
663
664 /*
665 * If we are the parent returning from a successful
666 * vfork, wait for the child to exec or exit.
667 * This code must be here and not in the bowels of the system
668 * so that /proc can intercept exit from vfork in a timely way.
669 */
670 if (t->t_flag & T_VFPARENT) {
671 ASSERT(code == SYS_vfork || code == SYS_forksys);
672 ASSERT(rp->r_o1 == 0 && error == 0);
673 vfwait((pid_t)rval1);
674 t->t_flag &= ~T_VFPARENT;
675 }
676
677 /*
678 * If profiling is active, bill the current PC in user-land
679 * and keep reposting until profiling is disabled.
680 */
681 if (p->p_prof.pr_scale) {
682 if (lwp->lwp_oweupc)
683 profil_tick(rp->r_pc);
684 repost = 1;
685 }
686
687 sig_check:
688 /*
689 * Reset flag for next time.
690 * We must do this after stopping on PR_SYSEXIT
691 * because /proc uses the information in lwp_eosys.
692 */
693 lwp->lwp_eosys = NORMALRETURN;
694 clear_stale_fd();
695 t->t_flag &= ~T_FORKALL;
696
697 if (t->t_astflag | t->t_sig_check) {
698 /*
699 * Turn off the AST flag before checking all the conditions that
700 * may have caused an AST. This flag is on whenever a signal or
701 * unusual condition should be handled after the next trap or
702 * syscall.
703 */
704 astoff(t);
705 t->t_sig_check = 0;
706
707 /*
708 * The following check is legal for the following reasons:
709 * 1) The thread we are checking, is ourselves, so there is
710 * no way the proc can go away.
711 * 2) The only time we need to be protected by the
712 * lock is if the binding is changed.
713 *
714 * Note we will still take the lock and check the binding
715 * if the condition was true without the lock held. This
716 * prevents lock contention among threads owned by the
717 * same proc.
718 */
719
720 if (curthread->t_proc_flag & TP_CHANGEBIND) {
721 mutex_enter(&p->p_lock);
722 if (curthread->t_proc_flag & TP_CHANGEBIND) {
723 timer_lwpbind();
724 curthread->t_proc_flag &= ~TP_CHANGEBIND;
725 }
726 mutex_exit(&p->p_lock);
727 }
728
729 /*
730 * for kaio requests on the special kaio poll queue,
731 * copyout their results to user memory.
732 */
733 if (p->p_aio)
734 aio_cleanup(0);
735
736 /*
737 * If this LWP was asked to hold, call holdlwp(), which will
738 * stop. holdlwps() sets this up and calls pokelwps() which
739 * sets the AST flag.
740 *
741 * Also check TP_EXITLWP, since this is used by fresh new LWPs
742 * through lwp_rtt(). That flag is set if the lwp_create(2)
743 * syscall failed after creating the LWP.
744 */
745 if (ISHOLD(p) || (t->t_proc_flag & TP_EXITLWP))
746 holdlwp();
747
748 /*
749 * All code that sets signals and makes ISSIG_PENDING
750 * evaluate true must set t_sig_check afterwards.
751 */
752 if (ISSIG_PENDING(t, lwp, p)) {
753 if (issig(FORREAL))
754 psig();
755 t->t_sig_check = 1; /* recheck next time */
756 }
757
758 if (sigprof) {
759 int nargs = (code > 0 && code < NSYSCALL)?
760 LWP_GETSYSENT(lwp)[code].sy_narg : 0;
761 realsigprof(code, nargs, error);
762 t->t_sig_check = 1; /* recheck next time */
763 }
764
765 /*
766 * If a performance counter overflow interrupt was
767 * delivered *during* the syscall, then re-enable the
768 * AST so that we take a trip through trap() to cause
769 * the SIGEMT to be delivered.
770 */
771 if (lwp->lwp_pcb.pcb_flags & CPC_OVERFLOW)
772 aston(t);
773
774 /*
775 * If an asynchronous hardware error is pending, turn AST flag
776 * back on. AST will be checked again before we return to user
777 * mode and we'll come back through trap() to handle the error.
778 */
779 if (lwp->lwp_pcb.pcb_flags & ASYNC_HWERR)
780 aston(t);
781 }
782
783 /*
784 * Restore register window if a debugger modified it.
785 * Set up to perform a single-step if a debugger requested it.
786 */
787 if (lwp->lwp_pcb.pcb_xregstat != XREGNONE)
788 xregrestore(lwp, 1);
789
790 lwp->lwp_errno = 0; /* clear error for next time */
791
792 #ifndef NPROBE
793 /* Kernel probe */
794 if (tnf_tracing_active) {
795 TNF_PROBE_3(syscall_end, "syscall thread", /* CSTYLED */,
796 tnf_long, rval1, rval1,
797 tnf_long, rval2, rval2,
798 tnf_long, errno, (long)error);
799 repost = 1;
800 }
801 #endif /* NPROBE */
802
803 /*
804 * Set state to LWP_USER here so preempt won't give us a kernel
805 * priority if it occurs after this point. Call CL_TRAPRET() to
806 * restore the user-level priority.
807 *
808 * It is important that no locks (other than spinlocks) be entered
809 * after this point before returning to user mode (unless lwp_state
810 * is set back to LWP_SYS).
811 *
812 * Sampled times past this point are charged to the user.
813 */
814 lwp->lwp_state = LWP_USER;
815
816 if (t->t_trapret) {
817 t->t_trapret = 0;
818 thread_lock(t);
819 CL_TRAPRET(t);
820 thread_unlock(t);
821 }
822 if (CPU->cpu_runrun || t->t_schedflag & TS_ANYWAITQ)
823 preempt();
824 prunstop();
825
826 /*
827 * t_post_sys will be set if pcb_step is active.
828 */
829 if (lwp->lwp_pcb.pcb_step != STEP_NONE) {
830 prdostep();
831 repost = 1;
832 }
833
834 t->t_sysnum = 0; /* no longer in a system call */
835
836 /*
837 * In case the args were copied to the lwp, reset the
838 * pointer so the next syscall will have the right lwp_ap pointer.
839 */
840 lwp->lwp_ap = (long *)&rp->r_o0;
841 lwp->lwp_argsaved = 0;
842
843 /*
844 * If there was a continuing reason for post-syscall processing,
845 * set the t_post_sys flag for the next system call.
846 */
847 if (repost)
848 t->t_post_sys = 1;
849
850 /*
851 * If there is a ustack registered for this lwp, and the stack rlimit
852 * has been altered, read in the ustack. If the saved stack rlimit
853 * matches the bounds of the ustack, update the ustack to reflect
854 * the new rlimit. If the new stack rlimit is RLIM_INFINITY, disable
855 * stack checking by setting the size to 0.
856 */
857 if (lwp->lwp_ustack != 0 && lwp->lwp_old_stk_ctl != 0) {
858 rlim64_t new_size;
859 model_t model;
860 caddr_t top;
861 struct rlimit64 rl;
862
863 mutex_enter(&p->p_lock);
864 new_size = p->p_stk_ctl;
865 model = p->p_model;
866 top = p->p_usrstack;
867 (void) rctl_rlimit_get(rctlproc_legacy[RLIMIT_STACK], p, &rl);
868 mutex_exit(&p->p_lock);
869
870 if (rl.rlim_cur == RLIM64_INFINITY)
871 new_size = 0;
872
873 if (model == DATAMODEL_NATIVE) {
874 stack_t stk;
875
876 if (copyin((stack_t *)lwp->lwp_ustack, &stk,
877 sizeof (stack_t)) == 0 &&
878 (stk.ss_size == lwp->lwp_old_stk_ctl ||
879 stk.ss_size == 0) &&
880 stk.ss_sp == top - stk.ss_size) {
881 stk.ss_sp = (void *)((uintptr_t)stk.ss_sp +
882 stk.ss_size - new_size);
883 stk.ss_size = new_size;
884
885 (void) copyout(&stk,
886 (stack_t *)lwp->lwp_ustack,
887 sizeof (stack_t));
888 }
889 } else {
890 stack32_t stk32;
891
892 if (copyin((stack32_t *)lwp->lwp_ustack, &stk32,
893 sizeof (stack32_t)) == 0 &&
894 (stk32.ss_size == lwp->lwp_old_stk_ctl ||
895 stk32.ss_size == 0) &&
896 stk32.ss_sp ==
897 (caddr32_t)(uintptr_t)(top - stk32.ss_size)) {
898 stk32.ss_sp += stk32.ss_size - new_size;
899 stk32.ss_size = new_size;
900
901 (void) copyout(&stk32,
902 (stack32_t *)lwp->lwp_ustack,
903 sizeof (stack32_t));
904 }
905 }
906
907 lwp->lwp_old_stk_ctl = 0;
908 }
909
910 syscall_mstate(LMS_SYSTEM, LMS_USER);
911 }
912
913 /*
914 * Call a system call which takes a pointer to the user args struct and
915 * a pointer to the return values. This is a bit slower than the standard
916 * C arg-passing method in some cases.
917 */
918 int64_t
syscall_ap()919 syscall_ap()
920 {
921 uint_t error;
922 struct sysent *callp;
923 rval_t rval;
924 klwp_t *lwp = ttolwp(curthread);
925 struct regs *rp = lwptoregs(lwp);
926
927 callp = LWP_GETSYSENT(lwp) + curthread->t_sysnum;
928
929 /*
930 * If the arguments don't fit in registers %o0 - o5, make sure they
931 * have been copied to the lwp_arg array.
932 */
933 if (callp->sy_narg > 6 && save_syscall_args())
934 return ((int64_t)set_errno(EFAULT));
935
936 rval.r_val1 = 0;
937 rval.r_val2 = (int)rp->r_o1;
938 lwp->lwp_error = 0; /* for old drivers */
939 error = (*(callp->sy_call))(lwp->lwp_ap, &rval);
940 if (error)
941 return ((int64_t)set_errno(error));
942 return (rval.r_vals);
943 }
944
945 /*
946 * Load system call module.
947 * Returns with pointer to held read lock for module.
948 */
949 static krwlock_t *
lock_syscall(struct sysent * table,uint_t code)950 lock_syscall(struct sysent *table, uint_t code)
951 {
952 krwlock_t *module_lock;
953 struct modctl *modp;
954 int id;
955 struct sysent *callp;
956
957 module_lock = table[code].sy_lock;
958 callp = &table[code];
959
960 /*
961 * Optimization to only call modload if we don't have a loaded
962 * syscall.
963 */
964 rw_enter(module_lock, RW_READER);
965 if (LOADED_SYSCALL(callp))
966 return (module_lock);
967 rw_exit(module_lock);
968
969 for (;;) {
970 if ((id = modload("sys", syscallnames[code])) == -1)
971 break;
972
973 /*
974 * If we loaded successfully at least once, the modctl
975 * will still be valid, so we try to grab it by filename.
976 * If this call fails, it's because the mod_filename
977 * was changed after the call to modload() (mod_hold_by_name()
978 * is the likely culprit). We can safely just take
979 * another lap if this is the case; the modload() will
980 * change the mod_filename back to one by which we can
981 * find the modctl.
982 */
983 modp = mod_find_by_filename("sys", syscallnames[code]);
984
985 if (modp == NULL)
986 continue;
987
988 mutex_enter(&mod_lock);
989
990 if (!modp->mod_installed) {
991 mutex_exit(&mod_lock);
992 continue;
993 }
994 break;
995 }
996
997 rw_enter(module_lock, RW_READER);
998
999 if (id != -1)
1000 mutex_exit(&mod_lock);
1001
1002 return (module_lock);
1003 }
1004
1005 /*
1006 * Loadable syscall support.
1007 * If needed, load the module, then reserve it by holding a read
1008 * lock for the duration of the call.
1009 * Later, if the syscall is not unloadable, it could patch the vector.
1010 */
1011 /*ARGSUSED*/
1012 int64_t
loadable_syscall(long a0,long a1,long a2,long a3,long a4,long a5,long a6,long a7)1013 loadable_syscall(
1014 long a0, long a1, long a2, long a3,
1015 long a4, long a5, long a6, long a7)
1016 {
1017 int64_t rval;
1018 struct sysent *callp;
1019 struct sysent *se = LWP_GETSYSENT(ttolwp(curthread));
1020 krwlock_t *module_lock;
1021 int code;
1022
1023 code = curthread->t_sysnum;
1024 callp = se + code;
1025
1026 /*
1027 * Try to autoload the system call if necessary.
1028 */
1029 module_lock = lock_syscall(se, code);
1030
1031 /*
1032 * we've locked either the loaded syscall or nosys
1033 */
1034 if (callp->sy_flags & SE_ARGC) {
1035 int64_t (*sy_call)();
1036
1037 sy_call = (int64_t (*)())callp->sy_call;
1038 rval = (*sy_call)(a0, a1, a2, a3, a4, a5);
1039 } else {
1040 rval = syscall_ap();
1041 }
1042
1043 rw_exit(module_lock);
1044 return (rval);
1045 }
1046
1047 /*
1048 * Handle indirect system calls.
1049 * This interface should be deprecated. The library can handle
1050 * this more efficiently, but keep this implementation for old binaries.
1051 *
1052 * XX64 Needs some work.
1053 */
1054 int64_t
indir(int code,long a0,long a1,long a2,long a3,long a4)1055 indir(int code, long a0, long a1, long a2, long a3, long a4)
1056 {
1057 klwp_t *lwp = ttolwp(curthread);
1058 struct sysent *callp;
1059
1060 if (code <= 0 || code >= NSYSCALL)
1061 return (nosys());
1062
1063 ASSERT(lwp->lwp_ap != NULL);
1064
1065 curthread->t_sysnum = code;
1066 callp = LWP_GETSYSENT(lwp) + code;
1067
1068 /*
1069 * Handle argument setup, unless already done in pre_syscall().
1070 */
1071 if (callp->sy_narg > 5) {
1072 if (save_syscall_args()) /* move args to LWP array */
1073 return ((int64_t)set_errno(EFAULT));
1074 } else if (!lwp->lwp_argsaved) {
1075 long *ap;
1076
1077 ap = lwp->lwp_ap; /* args haven't been saved */
1078 lwp->lwp_ap = ap + 1; /* advance arg pointer */
1079 curthread->t_post_sys = 1; /* so lwp_ap will be reset */
1080 }
1081 return ((*callp->sy_callc)(a0, a1, a2, a3, a4, lwp->lwp_arg[5]));
1082 }
1083
1084 /*
1085 * set_errno - set an error return from the current system call.
1086 * This could be a macro.
1087 * This returns the value it is passed, so that the caller can
1088 * use tail-recursion-elimination and do return (set_errno(ERRNO));
1089 */
1090 uint_t
set_errno(uint_t error)1091 set_errno(uint_t error)
1092 {
1093 ASSERT(error != 0); /* must not be used to clear errno */
1094
1095 curthread->t_post_sys = 1; /* have post_syscall do error return */
1096 return (ttolwp(curthread)->lwp_errno = error);
1097 }
1098
1099 /*
1100 * set_proc_pre_sys - Set pre-syscall processing for entire process.
1101 */
1102 void
set_proc_pre_sys(proc_t * p)1103 set_proc_pre_sys(proc_t *p)
1104 {
1105 kthread_t *t;
1106 kthread_t *first;
1107
1108 ASSERT(MUTEX_HELD(&p->p_lock));
1109
1110 t = first = p->p_tlist;
1111 do {
1112 t->t_pre_sys = 1;
1113 } while ((t = t->t_forw) != first);
1114 }
1115
1116 /*
1117 * set_proc_post_sys - Set post-syscall processing for entire process.
1118 */
1119 void
set_proc_post_sys(proc_t * p)1120 set_proc_post_sys(proc_t *p)
1121 {
1122 kthread_t *t;
1123 kthread_t *first;
1124
1125 ASSERT(MUTEX_HELD(&p->p_lock));
1126
1127 t = first = p->p_tlist;
1128 do {
1129 t->t_post_sys = 1;
1130 } while ((t = t->t_forw) != first);
1131 }
1132
1133 /*
1134 * set_proc_sys - Set pre- and post-syscall processing for entire process.
1135 */
1136 void
set_proc_sys(proc_t * p)1137 set_proc_sys(proc_t *p)
1138 {
1139 kthread_t *t;
1140 kthread_t *first;
1141
1142 ASSERT(MUTEX_HELD(&p->p_lock));
1143
1144 t = first = p->p_tlist;
1145 do {
1146 t->t_pre_sys = 1;
1147 t->t_post_sys = 1;
1148 } while ((t = t->t_forw) != first);
1149 }
1150
1151 /*
1152 * set_all_proc_sys - set pre- and post-syscall processing flags for all
1153 * user processes.
1154 *
1155 * This is needed when auditing, tracing, or other facilities which affect
1156 * all processes are turned on.
1157 */
1158 void
set_all_proc_sys()1159 set_all_proc_sys()
1160 {
1161 kthread_t *t;
1162 kthread_t *first;
1163
1164 mutex_enter(&pidlock);
1165 t = first = curthread;
1166 do {
1167 t->t_pre_sys = 1;
1168 t->t_post_sys = 1;
1169 } while ((t = t->t_next) != first);
1170 mutex_exit(&pidlock);
1171 }
1172
1173 /*
1174 * set_all_zone_usr_proc_sys - set pre- and post-syscall processing flags for
1175 * all user processes running in the zone of the current process
1176 *
1177 * This is needed when auditing is turned on.
1178 */
1179 void
set_all_zone_usr_proc_sys(zoneid_t zoneid)1180 set_all_zone_usr_proc_sys(zoneid_t zoneid)
1181 {
1182 proc_t *p;
1183 kthread_t *t;
1184
1185 mutex_enter(&pidlock);
1186 for (p = practive; p != NULL; p = p->p_next) {
1187 /* skip kernel processes */
1188 if (p->p_exec == NULLVP || p->p_as == &kas ||
1189 p->p_stat == SIDL || p->p_stat == SZOMB ||
1190 (p->p_flag & (SSYS | SEXITING | SEXITLWPS)))
1191 continue;
1192 /*
1193 * Only processes in the given zone (eventually in
1194 * all zones) are taken into account
1195 */
1196 if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) {
1197 mutex_enter(&p->p_lock);
1198 if ((t = p->p_tlist) == NULL) {
1199 mutex_exit(&p->p_lock);
1200 continue;
1201 }
1202 /*
1203 * Set pre- and post-syscall processing flags
1204 * for all threads of the process
1205 */
1206 do {
1207 t->t_pre_sys = 1;
1208 t->t_post_sys = 1;
1209 } while (p->p_tlist != (t = t->t_forw));
1210 mutex_exit(&p->p_lock);
1211 }
1212 }
1213 mutex_exit(&pidlock);
1214 }
1215
1216 /*
1217 * set_proc_ast - Set asynchronous service trap (AST) flag for all
1218 * threads in process.
1219 */
1220 void
set_proc_ast(proc_t * p)1221 set_proc_ast(proc_t *p)
1222 {
1223 kthread_t *t;
1224 kthread_t *first;
1225
1226 ASSERT(MUTEX_HELD(&p->p_lock));
1227
1228 t = first = p->p_tlist;
1229 do {
1230 aston(t);
1231 } while ((t = t->t_forw) != first);
1232 }
1233