1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2019 Joyent, Inc.
25 */
26
27 #include <sys/param.h>
28 #include <sys/vmparam.h>
29 #include <sys/types.h>
30 #include <sys/sysmacros.h>
31 #include <sys/systm.h>
32 #include <sys/cmn_err.h>
33 #include <sys/signal.h>
34 #include <sys/stack.h>
35 #include <sys/cred.h>
36 #include <sys/user.h>
37 #include <sys/debug.h>
38 #include <sys/errno.h>
39 #include <sys/proc.h>
40 #include <sys/var.h>
41 #include <sys/inline.h>
42 #include <sys/syscall.h>
43 #include <sys/ucontext.h>
44 #include <sys/cpuvar.h>
45 #include <sys/siginfo.h>
46 #include <sys/trap.h>
47 #include <sys/machtrap.h>
48 #include <sys/sysinfo.h>
49 #include <sys/procfs.h>
50 #include <sys/prsystm.h>
51 #include <sys/fpu/fpusystm.h>
52 #include <sys/modctl.h>
53 #include <sys/aio_impl.h>
54 #include <c2/audit.h>
55 #include <sys/machpcb.h>
56 #include <sys/privregs.h>
57 #include <sys/copyops.h>
58 #include <sys/timer.h>
59 #include <sys/priv.h>
60 #include <sys/msacct.h>
61
/*
 * Patchable flag: when the kernel is built with SYSCALLTRACE, a non-zero
 * value enables printf() tracing of each system call entry and exit.
 */
int syscalltrace = 0;
#ifdef SYSCALLTRACE
static kmutex_t systrace_lock;		/* syscall tracing lock */
#endif /* SYSCALLTRACE */

/* Load (if needed) the module for syscall 'code'; returns held read lock. */
static krwlock_t *lock_syscall(struct sysent *, uint_t);
68
#ifdef _SYSCALL32_IMPL
/*
 * Return the syscall table matching the lwp's data model: the native
 * (64-bit) table for DATAMODEL_NATIVE lwps, else the 32-bit table.
 */
static struct sysent *
lwp_getsysent(klwp_t *lwp)
{
	if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE)
		return (sysent);
	return (sysent32);
}
#define	LWP_GETSYSENT(lwp)	(lwp_getsysent(lwp))
#else
#define	LWP_GETSYSENT(lwp)	(sysent)
#endif
81
/*
 * Called to restore the lwp's register window just before
 * returning to user level (only if the registers have been
 * fetched or modified through /proc).
 *
 * The modified locals+ins are written back both to the register-save
 * area on the user stack and to the machpcb's cached return window,
 * so whichever copy the return path uses is up to date.
 */
/*ARGSUSED1*/
void
xregrestore(klwp_t *lwp, int shared)
{
	/*
	 * If locals+ins were modified by /proc copy them out.
	 * Also copy to the shared window, if necessary.
	 */
	if (lwp->lwp_pcb.pcb_xregstat == XREGMODIFIED) {
		struct machpcb *mpcb = lwptompcb(lwp);
		caddr_t sp = (caddr_t)lwptoregs(lwp)->r_sp;

		size_t rwinsize;
		caddr_t rwp;
		int is64;

		/*
		 * Locate the register-save area on the user stack.
		 * LP64 processes bias %sp by STACK_BIAS and use the full
		 * rwindow; ILP32 processes use the narrower rwindow32 at
		 * the raw sp, truncated to 32 bits.
		 */
		if (lwp_getdatamodel(lwp) == DATAMODEL_LP64) {
			rwinsize = sizeof (struct rwindow);
			rwp = sp + STACK_BIAS;
			is64 = 1;
		} else {
			rwinsize = sizeof (struct rwindow32);
			sp = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)sp;
			rwp = sp;
			is64 = 0;
		}

		if (is64)
			(void) copyout_nowatch(&lwp->lwp_pcb.pcb_xregs,
			    rwp, rwinsize);
		else {
			struct rwindow32 rwindow32;
			int watched;

			/*
			 * 32-bit path: convert the native-format window to
			 * the narrow layout; temporarily disable watchpoints
			 * covering the target area so the copyout succeeds.
			 */
			watched = watch_disable_addr(rwp, rwinsize, S_WRITE);
			rwindow_nto32(&lwp->lwp_pcb.pcb_xregs, &rwindow32);
			(void) copyout(&rwindow32, rwp, rwinsize);
			if (watched)
				watch_enable_addr(rwp, rwinsize, S_WRITE);
		}

		/* also copy to the user return window */
		mpcb->mpcb_rsp[0] = sp;
		mpcb->mpcb_rsp[1] = NULL;
		bcopy(&lwp->lwp_pcb.pcb_xregs, &mpcb->mpcb_rwin[0],
		    sizeof (lwp->lwp_pcb.pcb_xregs));
	}
	/* Window has been synchronized (or was never modified). */
	lwp->lwp_pcb.pcb_xregstat = XREGNONE;
}
136
137
138 /*
139 * Get the arguments to the current system call.
140 * lwp->lwp_ap normally points to the out regs in the reg structure.
141 * If the user is going to change the out registers and might want to
142 * get the args (for /proc tracing), it must copy the args elsewhere
143 * via save_syscall_args().
144 */
145 uint_t
get_syscall_args(klwp_t * lwp,long * argp,int * nargsp)146 get_syscall_args(klwp_t *lwp, long *argp, int *nargsp)
147 {
148 kthread_t *t = lwptot(lwp);
149 uint_t code = t->t_sysnum;
150 long mask;
151 long *ap;
152 int nargs;
153
154 if (lwptoproc(lwp)->p_model == DATAMODEL_ILP32)
155 mask = (uint32_t)0xffffffffU;
156 else
157 mask = 0xffffffffffffffff;
158
159 if (code != 0 && code < NSYSCALL) {
160
161 nargs = LWP_GETSYSENT(lwp)[code].sy_narg;
162
163 ASSERT(nargs <= MAXSYSARGS);
164
165 *nargsp = nargs;
166 ap = lwp->lwp_ap;
167 while (nargs-- > 0)
168 *argp++ = *ap++ & mask;
169 } else {
170 *nargsp = 0;
171 }
172 return (code);
173 }
174
#ifdef _SYSCALL32_IMPL
/*
 * Retrieve the current 32-bit system call's arguments, narrowing each
 * long-sized argument to an int for the caller.
 */
uint_t
get_syscall32_args(klwp_t *lwp, int *argp, int *nargsp)
{
	long wide[MAXSYSARGS];
	uint_t sysnum;
	int i;

	sysnum = get_syscall_args(lwp, wide, nargsp);
	for (i = 0; i < *nargsp; i++)
		argp[i] = (int)wide[i];
	return (sysnum);
}
#endif
191
/*
 * Save the system call arguments in a safe place.
 * lwp->lwp_ap normally points to the out regs in the reg structure.
 * If the user is going to change the out registers, g1, or the stack,
 * and might want to get the args (for /proc tracing), it must copy
 * the args elsewhere via save_syscall_args().
 *
 * This may be called from stop() even when we're not in a system call.
 * Since there's no easy way to tell, this must be safe (not panic).
 * If the copyins get data faults, return non-zero.
 */
int
save_syscall_args()
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	struct regs *rp = lwptoregs(lwp);
	uint_t code = t->t_sysnum;
	uint_t nargs;
	int i;
	caddr_t ua;
	model_t datamodel;

	if (lwp->lwp_argsaved || code == 0)
		return (0);		/* args already saved or not needed */

	if (code >= NSYSCALL) {
		nargs = 0;		/* illegal syscall */
	} else {
		struct sysent *se = LWP_GETSYSENT(lwp);
		struct sysent *callp = se + code;

		nargs = callp->sy_narg;
		if (LOADABLE_SYSCALL(callp) && nargs == 0) {
			krwlock_t *module_lock;

			/*
			 * Find out how many arguments the system
			 * call uses.
			 *
			 * We have the property that loaded syscalls
			 * never change the number of arguments they
			 * use after they've been loaded once.  This
			 * allows us to stop for /proc tracing without
			 * holding the module lock.
			 * /proc is assured that sy_narg is valid.
			 */
			module_lock = lock_syscall(se, code);
			nargs = callp->sy_narg;
			rw_exit(module_lock);
		}
	}

	/*
	 * Fetch the system call arguments.
	 */
	if (nargs == 0)
		goto out;


	ASSERT(nargs <= MAXSYSARGS);

	if ((datamodel = lwp_getdatamodel(lwp)) == DATAMODEL_ILP32) {

		if (rp->r_g1 == 0) {	/* indirect syscall */
			/*
			 * Indirect syscall: the real syscall number was in
			 * %o0, so the args begin at %o1 and only five fit
			 * in registers; the rest come from the user stack
			 * just above the 32-bit minimum frame.
			 */
			lwp->lwp_arg[0] = (uint32_t)rp->r_o1;
			lwp->lwp_arg[1] = (uint32_t)rp->r_o2;
			lwp->lwp_arg[2] = (uint32_t)rp->r_o3;
			lwp->lwp_arg[3] = (uint32_t)rp->r_o4;
			lwp->lwp_arg[4] = (uint32_t)rp->r_o5;
			if (nargs > 5) {
				ua = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)
				    (rp->r_sp + MINFRAME32);
				for (i = 5; i < nargs; i++) {
					uint32_t a;
					if (fuword32(ua, &a) != 0)
						return (-1);
					lwp->lwp_arg[i] = a;
					ua += sizeof (a);
				}
			}
		} else {
			/* Direct syscall: six args in %o0-%o5, rest on stack. */
			lwp->lwp_arg[0] = (uint32_t)rp->r_o0;
			lwp->lwp_arg[1] = (uint32_t)rp->r_o1;
			lwp->lwp_arg[2] = (uint32_t)rp->r_o2;
			lwp->lwp_arg[3] = (uint32_t)rp->r_o3;
			lwp->lwp_arg[4] = (uint32_t)rp->r_o4;
			lwp->lwp_arg[5] = (uint32_t)rp->r_o5;
			if (nargs > 6) {
				ua = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)
				    (rp->r_sp + MINFRAME32);
				for (i = 6; i < nargs; i++) {
					uint32_t a;
					if (fuword32(ua, &a) != 0)
						return (-1);
					lwp->lwp_arg[i] = a;
					ua += sizeof (a);
				}
			}
		}
	} else {
		/*
		 * LP64: six register args; stack args sit above the biased
		 * stack pointer plus the 64-bit minimum frame.
		 */
		ASSERT(datamodel == DATAMODEL_LP64);
		lwp->lwp_arg[0] = rp->r_o0;
		lwp->lwp_arg[1] = rp->r_o1;
		lwp->lwp_arg[2] = rp->r_o2;
		lwp->lwp_arg[3] = rp->r_o3;
		lwp->lwp_arg[4] = rp->r_o4;
		lwp->lwp_arg[5] = rp->r_o5;
		if (nargs > 6) {
			ua = (caddr_t)rp->r_sp + MINFRAME + STACK_BIAS;
			for (i = 6; i < nargs; i++) {
				unsigned long a;
				if (fulword(ua, &a) != 0)
					return (-1);
				lwp->lwp_arg[i] = a;
				ua += sizeof (a);
			}
		}
	}

out:
	/* Point lwp_ap at the saved copy; post_syscall() restores it. */
	lwp->lwp_ap = lwp->lwp_arg;
	lwp->lwp_argsaved = 1;
	t->t_post_sys = 1;	/* so lwp_ap will be reset */
	return (0);
}
319
320 void
reset_syscall_args(void)321 reset_syscall_args(void)
322 {
323 klwp_t *lwp = ttolwp(curthread);
324
325 lwp->lwp_ap = (long *)&lwptoregs(lwp)->r_o0;
326 lwp->lwp_argsaved = 0;
327 }
328
329 /*
330 * nonexistent system call-- signal lwp (may want to handle it)
331 * flag error if lwp won't see signal immediately
332 * This works for old or new calling sequence.
333 */
334 int64_t
nosys(void)335 nosys(void)
336 {
337 tsignal(curthread, SIGSYS);
338 return ((int64_t)set_errno(ENOSYS));
339 }
340
/* 32-bit entry point: identical to nosys(), narrowed to an int return. */
int
nosys32(void)
{
	return ((int)nosys());
}
346
347 /*
348 * Perform pre-system-call processing, including stopping for tracing,
349 * auditing, microstate-accounting, etc.
350 *
351 * This routine is called only if the t_pre_sys flag is set. Any condition
352 * requiring pre-syscall handling must set the t_pre_sys flag. If the
353 * condition is persistent, this routine will repost t_pre_sys.
354 */
355 int
pre_syscall(int arg0)356 pre_syscall(int arg0)
357 {
358 unsigned int code;
359 kthread_t *t = curthread;
360 proc_t *p = ttoproc(t);
361 klwp_t *lwp = ttolwp(t);
362 struct regs *rp = lwptoregs(lwp);
363 int repost;
364
365 t->t_pre_sys = repost = 0; /* clear pre-syscall processing flag */
366
367 ASSERT(t->t_schedflag & TS_DONT_SWAP);
368
369 syscall_mstate(LMS_USER, LMS_SYSTEM);
370
371 /*
372 * The syscall arguments in the out registers should be pointed to
373 * by lwp_ap. If the args need to be copied so that the outs can
374 * be changed without losing the ability to get the args for /proc,
375 * they can be saved by save_syscall_args(), and lwp_ap will be
376 * restored by post_syscall().
377 */
378 ASSERT(lwp->lwp_ap == (long *)&rp->r_o0);
379
380 /*
381 * Make sure the thread is holding the latest credentials for the
382 * process. The credentials in the process right now apply to this
383 * thread for the entire system call.
384 */
385 if (t->t_cred != p->p_cred) {
386 cred_t *oldcred = t->t_cred;
387 /*
388 * DTrace accesses t_cred in probe context. t_cred must
389 * always be either NULL, or point to a valid, allocated cred
390 * structure.
391 */
392 t->t_cred = crgetcred();
393 crfree(oldcred);
394 }
395
396 /*
397 * Undo special arrangements to single-step the lwp
398 * so that a debugger will see valid register contents.
399 * Also so that the pc is valid for syncfpu().
400 * Also so that a syscall like exec() can be stepped.
401 */
402 if (lwp->lwp_pcb.pcb_step != STEP_NONE) {
403 (void) prundostep();
404 repost = 1;
405 }
406
407 /*
408 * Check for indirect system call in case we stop for tracing.
409 * Don't allow multiple indirection.
410 */
411 code = t->t_sysnum;
412 if (code == 0 && arg0 != 0) { /* indirect syscall */
413 code = arg0;
414 t->t_sysnum = arg0;
415 }
416
417 /*
418 * From the proc(5) manual page:
419 * When entry to a system call is being traced, the traced process
420 * stops after having begun the call to the system but before the
421 * system call arguments have been fetched from the process.
422 * If proc changes the args we must refetch them after starting.
423 */
424 if (PTOU(p)->u_systrap) {
425 if (prismember(&PTOU(p)->u_entrymask, code)) {
426 /*
427 * Recheck stop condition, now that lock is held.
428 */
429 mutex_enter(&p->p_lock);
430 if (PTOU(p)->u_systrap &&
431 prismember(&PTOU(p)->u_entrymask, code)) {
432 stop(PR_SYSENTRY, code);
433 /*
434 * Must refetch args since they were
435 * possibly modified by /proc. Indicate
436 * that the valid copy is in the
437 * registers.
438 */
439 lwp->lwp_argsaved = 0;
440 lwp->lwp_ap = (long *)&rp->r_o0;
441 }
442 mutex_exit(&p->p_lock);
443 }
444 repost = 1;
445 }
446
447 if (lwp->lwp_sysabort) {
448 /*
449 * lwp_sysabort may have been set via /proc while the process
450 * was stopped on PR_SYSENTRY. If so, abort the system call.
451 * Override any error from the copyin() of the arguments.
452 */
453 lwp->lwp_sysabort = 0;
454 (void) set_errno(EINTR); /* sets post-sys processing */
455 t->t_pre_sys = 1; /* repost anyway */
456 return (1); /* don't do system call, return EINTR */
457 }
458
459 /* begin auditing for this syscall */
460 if (audit_active == C2AUDIT_LOADED) {
461 uint32_t auditing = au_zone_getstate(NULL);
462
463 if (auditing & AU_AUDIT_MASK) {
464 int error;
465 if (error = audit_start(T_SYSCALL, code, auditing, \
466 0, lwp)) {
467 t->t_pre_sys = 1; /* repost anyway */
468 lwp->lwp_error = 0; /* for old drivers */
469 return (error);
470 }
471 repost = 1;
472 }
473 }
474
475 #ifdef SYSCALLTRACE
476 if (syscalltrace) {
477 int i;
478 long *ap;
479 char *cp;
480 char *sysname;
481 struct sysent *callp;
482
483 if (code >= NSYSCALL)
484 callp = &nosys_ent; /* nosys has no args */
485 else
486 callp = LWP_GETSYSENT(lwp) + code;
487 (void) save_syscall_args();
488 mutex_enter(&systrace_lock);
489 printf("%d: ", p->p_pid);
490 if (code >= NSYSCALL)
491 printf("0x%x", code);
492 else {
493 sysname = mod_getsysname(code);
494 printf("%s[0x%x]", sysname == NULL ? "NULL" :
495 sysname, code);
496 }
497 cp = "(";
498 for (i = 0, ap = lwp->lwp_ap; i < callp->sy_narg; i++, ap++) {
499 printf("%s%lx", cp, *ap);
500 cp = ", ";
501 }
502 if (i)
503 printf(")");
504 printf(" %s id=0x%p\n", PTOU(p)->u_comm, curthread);
505 mutex_exit(&systrace_lock);
506 }
507 #endif /* SYSCALLTRACE */
508
509 /*
510 * If there was a continuing reason for pre-syscall processing,
511 * set the t_pre_sys flag for the next system call.
512 */
513 if (repost)
514 t->t_pre_sys = 1;
515 lwp->lwp_error = 0; /* for old drivers */
516 lwp->lwp_badpriv = PRIV_NONE; /* for privilege tracing */
517 return (0);
518 }
519
/*
 * Post-syscall processing.  Perform abnormal system call completion
 * actions such as /proc tracing, profiling, signals, preemption, etc.
 *
 * This routine is called only if t_post_sys, t_sig_check, or t_astflag is set.
 * Any condition requiring pre-syscall handling must set one of these.
 * If the condition is persistent, this routine will repost t_post_sys.
 */
void
post_syscall(long rval1, long rval2)
{
	kthread_t *t = curthread;
	proc_t *p = curproc;
	klwp_t *lwp = ttolwp(t);
	struct regs *rp = lwptoregs(lwp);
	uint_t error;
	int code = t->t_sysnum;
	int repost = 0;
	int proc_stop = 0;	/* non-zero if stopping for /proc */
	int sigprof = 0;	/* non-zero if sending SIGPROF */

	t->t_post_sys = 0;

	error = lwp->lwp_errno;

	/*
	 * Code can be zero if this is a new LWP returning after a forkall(),
	 * other than the one which matches the one in the parent which called
	 * forkall().  In these LWPs, skip most of post-syscall activity.
	 */
	if (code == 0)
		goto sig_check;

	/* put out audit record for this syscall */
	if (AU_AUDITING()) {
		rval_t rval;	/* fix audit_finish() someday */

		/* XX64 -- truncation of 64-bit return values? */
		rval.r_val1 = (int)rval1;
		rval.r_val2 = (int)rval2;
		audit_finish(T_SYSCALL, code, error, &rval);
		repost = 1;
	}

	/* Flush any pending driver message queued for this thread. */
	if (curthread->t_pdmsg != NULL) {
		char *m = curthread->t_pdmsg;

		uprintf("%s", m);
		kmem_free(m, strlen(m) + 1);
		curthread->t_pdmsg = NULL;
	}

	/*
	 * If we're going to stop for /proc tracing, set the flag and
	 * save the arguments so that the return values don't smash them.
	 */
	if (PTOU(p)->u_systrap) {
		if (prismember(&PTOU(p)->u_exitmask, code)) {
			proc_stop = 1;
			(void) save_syscall_args();
		}
		repost = 1;
	}

	/*
	 * Similarly check to see if SIGPROF might be sent.
	 */
	if (curthread->t_rprof != NULL &&
	    curthread->t_rprof->rp_anystate != 0) {
		(void) save_syscall_args();
		sigprof = 1;
	}

	if (lwp->lwp_eosys == NORMALRETURN) {
		if (error == 0) {
#ifdef SYSCALLTRACE
			if (syscalltrace) {
				mutex_enter(&systrace_lock);
				printf(
				    "%d: r_val1=0x%lx, r_val2=0x%lx, id 0x%p\n",
				    p->p_pid, rval1, rval2, curthread);
				mutex_exit(&systrace_lock);
			}
#endif /* SYSCALLTRACE */
			/*
			 * Success: clear the carry (icc.c) bit and place the
			 * two return values in %o0/%o1 per the SPARC syscall
			 * return convention.
			 */
			rp->r_tstate &= ~TSTATE_IC;
			rp->r_o0 = rval1;
			rp->r_o1 = rval2;
		} else {
			int sig;

#ifdef SYSCALLTRACE
			if (syscalltrace) {
				mutex_enter(&systrace_lock);
				printf("%d: error=%d, id 0x%p\n",
				    p->p_pid, error, curthread);
				mutex_exit(&systrace_lock);
			}
#endif /* SYSCALLTRACE */
			/* Interrupted I/O on a stale (forced-closed) fd. */
			if (error == EINTR && t->t_activefd.a_stale)
				error = EBADF;
			/*
			 * Convert EINTR to ERESTART when the interrupting
			 * signal is caught and has SA_RESTART set.
			 */
			if (error == EINTR &&
			    (sig = lwp->lwp_cursig) != 0 &&
			    sigismember(&PTOU(p)->u_sigrestart, sig) &&
			    PTOU(p)->u_signal[sig - 1] != SIG_DFL &&
			    PTOU(p)->u_signal[sig - 1] != SIG_IGN)
				error = ERESTART;
			/* Failure: errno in %o0 with the carry bit set. */
			rp->r_o0 = error;
			rp->r_tstate |= TSTATE_IC;
		}
		/*
		 * The default action is to redo the trap instruction.
		 * We increment the pc and npc past it for NORMALRETURN.
		 * JUSTRETURN has set up a new pc and npc already.
		 * If we are a cloned thread of forkall(), don't
		 * adjust here because we have already inherited
		 * the adjusted values from our clone.
		 */
		if (!(t->t_flag & T_FORKALL)) {
			rp->r_pc = rp->r_npc;
			rp->r_npc += 4;
		}
	}

	/*
	 * From the proc(5) manual page:
	 * When exit from a system call is being traced, the traced process
	 * stops on completion of the system call just prior to checking for
	 * signals and returning to user level.  At this point all return
	 * values have been stored into the traced process's saved registers.
	 */
	if (proc_stop) {
		mutex_enter(&p->p_lock);
		if (PTOU(p)->u_systrap &&
		    prismember(&PTOU(p)->u_exitmask, code))
			stop(PR_SYSEXIT, code);
		mutex_exit(&p->p_lock);
	}

	/*
	 * If we are the parent returning from a successful
	 * vfork, wait for the child to exec or exit.
	 * This code must be here and not in the bowels of the system
	 * so that /proc can intercept exit from vfork in a timely way.
	 */
	if (t->t_flag & T_VFPARENT) {
		ASSERT(code == SYS_vfork || code == SYS_forksys);
		ASSERT(rp->r_o1 == 0 && error == 0);
		vfwait((pid_t)rval1);
		t->t_flag &= ~T_VFPARENT;
	}

	/*
	 * If profiling is active, bill the current PC in user-land
	 * and keep reposting until profiling is disabled.
	 */
	if (p->p_prof.pr_scale) {
		if (lwp->lwp_oweupc)
			profil_tick(rp->r_pc);
		repost = 1;
	}

sig_check:
	/*
	 * Reset flag for next time.
	 * We must do this after stopping on PR_SYSEXIT
	 * because /proc uses the information in lwp_eosys.
	 */
	lwp->lwp_eosys = NORMALRETURN;
	clear_stale_fd();
	t->t_flag &= ~T_FORKALL;

	if (t->t_astflag | t->t_sig_check) {
		/*
		 * Turn off the AST flag before checking all the conditions that
		 * may have caused an AST.  This flag is on whenever a signal or
		 * unusual condition should be handled after the next trap or
		 * syscall.
		 */
		astoff(t);
		t->t_sig_check = 0;

		/*
		 * The following check is legal for the following reasons:
		 * 1) The thread we are checking, is ourselves, so there is
		 * no way the proc can go away.
		 * 2) The only time we need to be protected by the
		 * lock is if the binding is changed.
		 *
		 * Note we will still take the lock and check the binding
		 * if the condition was true without the lock held.  This
		 * prevents lock contention among threads owned by the
		 * same proc.
		 */

		if (curthread->t_proc_flag & TP_CHANGEBIND) {
			mutex_enter(&p->p_lock);
			if (curthread->t_proc_flag & TP_CHANGEBIND) {
				timer_lwpbind();
				curthread->t_proc_flag &= ~TP_CHANGEBIND;
			}
			mutex_exit(&p->p_lock);
		}

		/*
		 * for kaio requests on the special kaio poll queue,
		 * copyout their results to user memory.
		 */
		if (p->p_aio)
			aio_cleanup(0);

		/*
		 * If this LWP was asked to hold, call holdlwp(), which will
		 * stop.  holdlwps() sets this up and calls pokelwps() which
		 * sets the AST flag.
		 *
		 * Also check TP_EXITLWP, since this is used by fresh new LWPs
		 * through lwp_rtt().  That flag is set if the lwp_create(2)
		 * syscall failed after creating the LWP.
		 */
		if (ISHOLD(p) || (t->t_proc_flag & TP_EXITLWP))
			holdlwp();

		/*
		 * All code that sets signals and makes ISSIG_PENDING
		 * evaluate true must set t_sig_check afterwards.
		 */
		if (ISSIG_PENDING(t, lwp, p)) {
			if (issig(FORREAL))
				psig();
			t->t_sig_check = 1;	/* recheck next time */
		}

		if (sigprof) {
			int nargs = (code > 0 && code < NSYSCALL)?
			    LWP_GETSYSENT(lwp)[code].sy_narg : 0;
			realsigprof(code, nargs, error);
			t->t_sig_check = 1;	/* recheck next time */
		}

		/*
		 * If a performance counter overflow interrupt was
		 * delivered *during* the syscall, then re-enable the
		 * AST so that we take a trip through trap() to cause
		 * the SIGEMT to be delivered.
		 */
		if (lwp->lwp_pcb.pcb_flags & CPC_OVERFLOW)
			aston(t);

		/*
		 * If an asynchronous hardware error is pending, turn AST flag
		 * back on.  AST will be checked again before we return to user
		 * mode and we'll come back through trap() to handle the error.
		 */
		if (lwp->lwp_pcb.pcb_flags & ASYNC_HWERR)
			aston(t);
	}

	/*
	 * Restore register window if a debugger modified it.
	 * Set up to perform a single-step if a debugger requested it.
	 */
	if (lwp->lwp_pcb.pcb_xregstat != XREGNONE)
		xregrestore(lwp, 1);

	lwp->lwp_errno = 0;		/* clear error for next time */

	/*
	 * Set state to LWP_USER here so preempt won't give us a kernel
	 * priority if it occurs after this point.  Call CL_TRAPRET() to
	 * restore the user-level priority.
	 *
	 * It is important that no locks (other than spinlocks) be entered
	 * after this point before returning to user mode (unless lwp_state
	 * is set back to LWP_SYS).
	 *
	 * Sampled times past this point are charged to the user.
	 */
	lwp->lwp_state = LWP_USER;

	if (t->t_trapret) {
		t->t_trapret = 0;
		thread_lock(t);
		CL_TRAPRET(t);
		thread_unlock(t);
	}
	if (CPU->cpu_runrun || t->t_schedflag & TS_ANYWAITQ)
		preempt();
	prunstop();

	/*
	 * t_post_sys will be set if pcb_step is active.
	 */
	if (lwp->lwp_pcb.pcb_step != STEP_NONE) {
		prdostep();
		repost = 1;
	}

	t->t_sysnum = 0;		/* no longer in a system call */

	/*
	 * In case the args were copied to the lwp, reset the
	 * pointer so the next syscall will have the right lwp_ap pointer.
	 */
	lwp->lwp_ap = (long *)&rp->r_o0;
	lwp->lwp_argsaved = 0;

	/*
	 * If there was a continuing reason for post-syscall processing,
	 * set the t_post_sys flag for the next system call.
	 */
	if (repost)
		t->t_post_sys = 1;

	/*
	 * If there is a ustack registered for this lwp, and the stack rlimit
	 * has been altered, read in the ustack.  If the saved stack rlimit
	 * matches the bounds of the ustack, update the ustack to reflect
	 * the new rlimit.  If the new stack rlimit is RLIM_INFINITY, disable
	 * stack checking by setting the size to 0.
	 */
	if (lwp->lwp_ustack != 0 && lwp->lwp_old_stk_ctl != 0) {
		rlim64_t new_size;
		model_t model;
		caddr_t top;
		struct rlimit64 rl;

		mutex_enter(&p->p_lock);
		new_size = p->p_stk_ctl;
		model = p->p_model;
		top = p->p_usrstack;
		(void) rctl_rlimit_get(rctlproc_legacy[RLIMIT_STACK], p, &rl);
		mutex_exit(&p->p_lock);

		if (rl.rlim_cur == RLIM64_INFINITY)
			new_size = 0;

		if (model == DATAMODEL_NATIVE) {
			stack_t stk;

			/*
			 * Only rewrite the ustack when it still describes the
			 * old rlimit-bounded stack (or is disabled); a user-
			 * installed alternate is left untouched.
			 */
			if (copyin((stack_t *)lwp->lwp_ustack, &stk,
			    sizeof (stack_t)) == 0 &&
			    (stk.ss_size == lwp->lwp_old_stk_ctl ||
			    stk.ss_size == 0) &&
			    stk.ss_sp == top - stk.ss_size) {
				stk.ss_sp = (void *)((uintptr_t)stk.ss_sp +
				    stk.ss_size - new_size);
				stk.ss_size = new_size;

				(void) copyout(&stk,
				    (stack_t *)lwp->lwp_ustack,
				    sizeof (stack_t));
			}
		} else {
			stack32_t stk32;

			if (copyin((stack32_t *)lwp->lwp_ustack, &stk32,
			    sizeof (stack32_t)) == 0 &&
			    (stk32.ss_size == lwp->lwp_old_stk_ctl ||
			    stk32.ss_size == 0) &&
			    stk32.ss_sp ==
			    (caddr32_t)(uintptr_t)(top - stk32.ss_size)) {
				stk32.ss_sp += stk32.ss_size - new_size;
				stk32.ss_size = new_size;

				(void) copyout(&stk32,
				    (stack32_t *)lwp->lwp_ustack,
				    sizeof (stack32_t));
			}
		}

		lwp->lwp_old_stk_ctl = 0;
	}

	syscall_mstate(LMS_SYSTEM, LMS_USER);
}
895
896 /*
897 * Call a system call which takes a pointer to the user args struct and
898 * a pointer to the return values. This is a bit slower than the standard
899 * C arg-passing method in some cases.
900 */
901 int64_t
syscall_ap()902 syscall_ap()
903 {
904 uint_t error;
905 struct sysent *callp;
906 rval_t rval;
907 klwp_t *lwp = ttolwp(curthread);
908 struct regs *rp = lwptoregs(lwp);
909
910 callp = LWP_GETSYSENT(lwp) + curthread->t_sysnum;
911
912 /*
913 * If the arguments don't fit in registers %o0 - o5, make sure they
914 * have been copied to the lwp_arg array.
915 */
916 if (callp->sy_narg > 6 && save_syscall_args())
917 return ((int64_t)set_errno(EFAULT));
918
919 rval.r_val1 = 0;
920 rval.r_val2 = (int)rp->r_o1;
921 lwp->lwp_error = 0; /* for old drivers */
922 error = (*(callp->sy_call))(lwp->lwp_ap, &rval);
923 if (error)
924 return ((int64_t)set_errno(error));
925 return (rval.r_vals);
926 }
927
/*
 * Load system call module.
 * Returns with pointer to held read lock for module.
 *
 * Locking protocol: if the module is already loaded we simply return
 * holding its read lock.  Otherwise we loop loading it; on a successful
 * load the loop exits holding mod_lock (to pin mod_installed), which is
 * dropped only after the module's read lock has been re-acquired.  If
 * modload() fails (id == -1) mod_lock was never taken and the caller
 * ends up read-locking whatever the slot now points at (nosys).
 */
static krwlock_t *
lock_syscall(struct sysent *table, uint_t code)
{
	krwlock_t *module_lock;
	struct modctl *modp;
	int id;
	struct sysent *callp;

	module_lock = table[code].sy_lock;
	callp = &table[code];

	/*
	 * Optimization to only call modload if we don't have a loaded
	 * syscall.
	 */
	rw_enter(module_lock, RW_READER);
	if (LOADED_SYSCALL(callp))
		return (module_lock);
	rw_exit(module_lock);

	for (;;) {
		if ((id = modload("sys", syscallnames[code])) == -1)
			break;

		/*
		 * If we loaded successfully at least once, the modctl
		 * will still be valid, so we try to grab it by filename.
		 * If this call fails, it's because the mod_filename
		 * was changed after the call to modload() (mod_hold_by_name()
		 * is the likely culprit).  We can safely just take
		 * another lap if this is the case;  the modload() will
		 * change the mod_filename back to one by which we can
		 * find the modctl.
		 */
		modp = mod_find_by_filename("sys", syscallnames[code]);

		if (modp == NULL)
			continue;

		mutex_enter(&mod_lock);

		/* Retry if the module was unloaded out from under us. */
		if (!modp->mod_installed) {
			mutex_exit(&mod_lock);
			continue;
		}
		break;
	}

	rw_enter(module_lock, RW_READER);

	/* mod_lock is held only when the load loop succeeded (see above). */
	if (id != -1)
		mutex_exit(&mod_lock);

	return (module_lock);
}
987
988 /*
989 * Loadable syscall support.
990 * If needed, load the module, then reserve it by holding a read
991 * lock for the duration of the call.
992 * Later, if the syscall is not unloadable, it could patch the vector.
993 */
994 /*ARGSUSED*/
995 int64_t
loadable_syscall(long a0,long a1,long a2,long a3,long a4,long a5,long a6,long a7)996 loadable_syscall(
997 long a0, long a1, long a2, long a3,
998 long a4, long a5, long a6, long a7)
999 {
1000 int64_t rval;
1001 struct sysent *callp;
1002 struct sysent *se = LWP_GETSYSENT(ttolwp(curthread));
1003 krwlock_t *module_lock;
1004 int code;
1005
1006 code = curthread->t_sysnum;
1007 callp = se + code;
1008
1009 /*
1010 * Try to autoload the system call if necessary.
1011 */
1012 module_lock = lock_syscall(se, code);
1013
1014 /*
1015 * we've locked either the loaded syscall or nosys
1016 */
1017 if (callp->sy_flags & SE_ARGC) {
1018 int64_t (*sy_call)();
1019
1020 sy_call = (int64_t (*)())callp->sy_call;
1021 rval = (*sy_call)(a0, a1, a2, a3, a4, a5);
1022 } else {
1023 rval = syscall_ap();
1024 }
1025
1026 rw_exit(module_lock);
1027 return (rval);
1028 }
1029
/*
 * Handle indirect system calls.
 * This interface should be deprecated.  The library can handle
 * this more efficiently, but keep this implementation for old binaries.
 *
 * XX64	Needs some work.
 */
int64_t
indir(int code, long a0, long a1, long a2, long a3, long a4)
{
	klwp_t *lwp = ttolwp(curthread);
	struct sysent *callp;

	if (code <= 0 || code >= NSYSCALL)
		return (nosys());

	ASSERT(lwp->lwp_ap != NULL);

	curthread->t_sysnum = code;
	callp = LWP_GETSYSENT(lwp) + code;

	/*
	 * Handle argument setup, unless already done in pre_syscall().
	 */
	if (callp->sy_narg > 5) {
		if (save_syscall_args())	/* move args to LWP array */
			return ((int64_t)set_errno(EFAULT));
	} else if (!lwp->lwp_argsaved) {
		long *ap;

		/*
		 * Args are still in the out registers; skip past the
		 * syscall number in %o0 by advancing lwp_ap one slot.
		 */
		ap = lwp->lwp_ap;		/* args haven't been saved */
		lwp->lwp_ap = ap + 1;		/* advance arg pointer */
		curthread->t_post_sys = 1;	/* so lwp_ap will be reset */
	}
	/*
	 * NOTE(review): the sixth argument is read from lwp_arg[5], which
	 * is only filled in when the args were saved above (sy_narg > 5);
	 * presumably handlers with fewer args ignore it -- confirm.
	 */
	return ((*callp->sy_callc)(a0, a1, a2, a3, a4, lwp->lwp_arg[5]));
}
1066
1067 /*
1068 * set_errno - set an error return from the current system call.
1069 * This could be a macro.
1070 * This returns the value it is passed, so that the caller can
1071 * use tail-recursion-elimination and do return (set_errno(ERRNO));
1072 */
1073 uint_t
set_errno(uint_t error)1074 set_errno(uint_t error)
1075 {
1076 ASSERT(error != 0); /* must not be used to clear errno */
1077
1078 curthread->t_post_sys = 1; /* have post_syscall do error return */
1079 return (ttolwp(curthread)->lwp_errno = error);
1080 }
1081
1082 /*
1083 * set_proc_pre_sys - Set pre-syscall processing for entire process.
1084 */
1085 void
set_proc_pre_sys(proc_t * p)1086 set_proc_pre_sys(proc_t *p)
1087 {
1088 kthread_t *t;
1089 kthread_t *first;
1090
1091 ASSERT(MUTEX_HELD(&p->p_lock));
1092
1093 t = first = p->p_tlist;
1094 do {
1095 t->t_pre_sys = 1;
1096 } while ((t = t->t_forw) != first);
1097 }
1098
1099 /*
1100 * set_proc_post_sys - Set post-syscall processing for entire process.
1101 */
1102 void
set_proc_post_sys(proc_t * p)1103 set_proc_post_sys(proc_t *p)
1104 {
1105 kthread_t *t;
1106 kthread_t *first;
1107
1108 ASSERT(MUTEX_HELD(&p->p_lock));
1109
1110 t = first = p->p_tlist;
1111 do {
1112 t->t_post_sys = 1;
1113 } while ((t = t->t_forw) != first);
1114 }
1115
1116 /*
1117 * set_proc_sys - Set pre- and post-syscall processing for entire process.
1118 */
1119 void
set_proc_sys(proc_t * p)1120 set_proc_sys(proc_t *p)
1121 {
1122 kthread_t *t;
1123 kthread_t *first;
1124
1125 ASSERT(MUTEX_HELD(&p->p_lock));
1126
1127 t = first = p->p_tlist;
1128 do {
1129 t->t_pre_sys = 1;
1130 t->t_post_sys = 1;
1131 } while ((t = t->t_forw) != first);
1132 }
1133
1134 /*
1135 * set_all_proc_sys - set pre- and post-syscall processing flags for all
1136 * user processes.
1137 *
1138 * This is needed when auditing, tracing, or other facilities which affect
1139 * all processes are turned on.
1140 */
1141 void
set_all_proc_sys()1142 set_all_proc_sys()
1143 {
1144 kthread_t *t;
1145 kthread_t *first;
1146
1147 mutex_enter(&pidlock);
1148 t = first = curthread;
1149 do {
1150 t->t_pre_sys = 1;
1151 t->t_post_sys = 1;
1152 } while ((t = t->t_next) != first);
1153 mutex_exit(&pidlock);
1154 }
1155
/*
 * set_all_zone_usr_proc_sys - set pre- and post-syscall processing flags for
 * all user processes running in the zone of the current process
 *
 * This is needed when auditing is turned on.
 */
void
set_all_zone_usr_proc_sys(zoneid_t zoneid)
{
	proc_t *p;
	kthread_t *t;

	/* pidlock protects the practive list; p_lock protects each p_tlist. */
	mutex_enter(&pidlock);
	for (p = practive; p != NULL; p = p->p_next) {
		/* skip kernel processes */
		if (p->p_exec == NULLVP || p->p_as == &kas ||
		    p->p_stat == SIDL || p->p_stat == SZOMB ||
		    (p->p_flag & (SSYS | SEXITING | SEXITLWPS)))
			continue;
		/*
		 * Only processes in the given zone (eventually in
		 * all zones) are taken into account
		 */
		if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) {
			mutex_enter(&p->p_lock);
			/* A process with no threads has nothing to flag. */
			if ((t = p->p_tlist) == NULL) {
				mutex_exit(&p->p_lock);
				continue;
			}
			/*
			 * Set pre- and post-syscall processing flags
			 * for all threads of the process
			 */
			do {
				t->t_pre_sys = 1;
				t->t_post_sys = 1;
			} while (p->p_tlist != (t = t->t_forw));
			mutex_exit(&p->p_lock);
		}
	}
	mutex_exit(&pidlock);
}
1198
1199 /*
1200 * set_proc_ast - Set asynchronous service trap (AST) flag for all
1201 * threads in process.
1202 */
1203 void
set_proc_ast(proc_t * p)1204 set_proc_ast(proc_t *p)
1205 {
1206 kthread_t *t;
1207 kthread_t *first;
1208
1209 ASSERT(MUTEX_HELD(&p->p_lock));
1210
1211 t = first = p->p_tlist;
1212 do {
1213 aston(t);
1214 } while ((t = t->t_forw) != first);
1215 }
1216