xref: /illumos-gate/usr/src/uts/sparc/os/syscall.c (revision 76f19f5fdc974fe5be5c82a556e43a4df93f1de1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2019 Joyent, Inc.
25  */
26 
27 #include <sys/param.h>
28 #include <sys/vmparam.h>
29 #include <sys/types.h>
30 #include <sys/sysmacros.h>
31 #include <sys/systm.h>
32 #include <sys/cmn_err.h>
33 #include <sys/signal.h>
34 #include <sys/stack.h>
35 #include <sys/cred.h>
36 #include <sys/user.h>
37 #include <sys/debug.h>
38 #include <sys/errno.h>
39 #include <sys/proc.h>
40 #include <sys/var.h>
41 #include <sys/inline.h>
42 #include <sys/syscall.h>
43 #include <sys/ucontext.h>
44 #include <sys/cpuvar.h>
45 #include <sys/siginfo.h>
46 #include <sys/trap.h>
47 #include <sys/machtrap.h>
48 #include <sys/sysinfo.h>
49 #include <sys/procfs.h>
50 #include <sys/prsystm.h>
51 #include <sys/fpu/fpusystm.h>
52 #include <sys/modctl.h>
53 #include <sys/aio_impl.h>
54 #include <c2/audit.h>
55 #include <sys/tnf.h>
56 #include <sys/tnf_probe.h>
57 #include <sys/machpcb.h>
58 #include <sys/privregs.h>
59 #include <sys/copyops.h>
60 #include <sys/timer.h>
61 #include <sys/priv.h>
62 #include <sys/msacct.h>
63 
/*
 * Run-time switch for the debugging printf()s in pre_syscall() and
 * post_syscall().  It is only consulted when this file is compiled
 * with SYSCALLTRACE defined.
 */
int syscalltrace = 0;
#ifdef SYSCALLTRACE
static kmutex_t	systrace_lock;		/* held around the trace printf()s */
#endif /* SYSCALLTRACE */

/* Forward declaration: save_syscall_args() calls it before its definition. */
static krwlock_t *lock_syscall(struct sysent *, uint_t);
70 
#ifdef _SYSCALL32_IMPL
/*
 * Pick the system call table that matches the lwp's data model:
 * sysent for a native lwp, sysent32 for anything else.
 */
static struct sysent *
lwp_getsysent(klwp_t *lwp)
{
	return ((lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) ?
	    sysent : sysent32);
}
#define	LWP_GETSYSENT(lwp)	(lwp_getsysent(lwp))
#else
#define	LWP_GETSYSENT(lwp)	(sysent)
#endif
83 
84 /*
85  * Called to restore the lwp's register window just before
86  * returning to user level (only if the registers have been
87  * fetched or modified through /proc).
88  */
89 /*ARGSUSED1*/
90 void
91 xregrestore(klwp_t *lwp, int shared)
92 {
93 	/*
94 	 * If locals+ins were modified by /proc copy them out.
95 	 * Also copy to the shared window, if necessary.
96 	 */
97 	if (lwp->lwp_pcb.pcb_xregstat == XREGMODIFIED) {
98 		struct machpcb *mpcb = lwptompcb(lwp);
99 		caddr_t sp = (caddr_t)lwptoregs(lwp)->r_sp;
100 
101 		size_t rwinsize;
102 		caddr_t rwp;
103 		int is64;
104 
105 		if (lwp_getdatamodel(lwp) == DATAMODEL_LP64) {
106 			rwinsize = sizeof (struct rwindow);
107 			rwp = sp + STACK_BIAS;
108 			is64 = 1;
109 		} else {
110 			rwinsize = sizeof (struct rwindow32);
111 			sp = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)sp;
112 			rwp = sp;
113 			is64 = 0;
114 		}
115 
116 		if (is64)
117 			(void) copyout_nowatch(&lwp->lwp_pcb.pcb_xregs,
118 			    rwp, rwinsize);
119 		else {
120 			struct rwindow32 rwindow32;
121 			int watched;
122 
123 			watched = watch_disable_addr(rwp, rwinsize, S_WRITE);
124 			rwindow_nto32(&lwp->lwp_pcb.pcb_xregs, &rwindow32);
125 			(void) copyout(&rwindow32, rwp, rwinsize);
126 			if (watched)
127 				watch_enable_addr(rwp, rwinsize, S_WRITE);
128 		}
129 
130 		/* also copy to the user return window */
131 		mpcb->mpcb_rsp[0] = sp;
132 		mpcb->mpcb_rsp[1] = NULL;
133 		bcopy(&lwp->lwp_pcb.pcb_xregs, &mpcb->mpcb_rwin[0],
134 		    sizeof (lwp->lwp_pcb.pcb_xregs));
135 	}
136 	lwp->lwp_pcb.pcb_xregstat = XREGNONE;
137 }
138 
139 
140 /*
141  * Get the arguments to the current system call.
142  *	lwp->lwp_ap normally points to the out regs in the reg structure.
143  *	If the user is going to change the out registers and might want to
144  *	get the args (for /proc tracing), it must copy the args elsewhere
145  *	via save_syscall_args().
146  */
147 uint_t
148 get_syscall_args(klwp_t *lwp, long *argp, int *nargsp)
149 {
150 	kthread_t	*t = lwptot(lwp);
151 	uint_t	code = t->t_sysnum;
152 	long	mask;
153 	long	*ap;
154 	int	nargs;
155 
156 	if (lwptoproc(lwp)->p_model == DATAMODEL_ILP32)
157 		mask = (uint32_t)0xffffffffU;
158 	else
159 		mask = 0xffffffffffffffff;
160 
161 	if (code != 0 && code < NSYSCALL) {
162 
163 		nargs = LWP_GETSYSENT(lwp)[code].sy_narg;
164 
165 		ASSERT(nargs <= MAXSYSARGS);
166 
167 		*nargsp = nargs;
168 		ap = lwp->lwp_ap;
169 		while (nargs-- > 0)
170 			*argp++ = *ap++ & mask;
171 	} else {
172 		*nargsp = 0;
173 	}
174 	return (code);
175 }
176 
177 #ifdef _SYSCALL32_IMPL
178 /*
179  * Get the arguments to the current 32-bit system call.
180  */
181 uint_t
182 get_syscall32_args(klwp_t *lwp, int *argp, int *nargsp)
183 {
184 	long args[MAXSYSARGS];
185 	uint_t i, code;
186 
187 	code = get_syscall_args(lwp, args, nargsp);
188 	for (i = 0; i != *nargsp; i++)
189 		*argp++ = (int)args[i];
190 	return (code);
191 }
192 #endif
193 
194 /*
195  *	Save the system call arguments in a safe place.
196  *	lwp->lwp_ap normally points to the out regs in the reg structure.
197  *	If the user is going to change the out registers, g1, or the stack,
198  *	and might want to get the args (for /proc tracing), it must copy
199  *	the args elsewhere via save_syscall_args().
200  *
201  *	This may be called from stop() even when we're not in a system call.
202  *	Since there's no easy way to tell, this must be safe (not panic).
203  *	If the copyins get data faults, return non-zero.
204  */
205 int
206 save_syscall_args()
207 {
208 	kthread_t	*t = curthread;
209 	klwp_t		*lwp = ttolwp(t);
210 	struct regs	*rp = lwptoregs(lwp);
211 	uint_t		code = t->t_sysnum;
212 	uint_t		nargs;
213 	int		i;
214 	caddr_t		ua;
215 	model_t		datamodel;
216 
217 	if (lwp->lwp_argsaved || code == 0)
218 		return (0);		/* args already saved or not needed */
219 
220 	if (code >= NSYSCALL) {
221 		nargs = 0;		/* illegal syscall */
222 	} else {
223 		struct sysent *se = LWP_GETSYSENT(lwp);
224 		struct sysent *callp = se + code;
225 
226 		nargs = callp->sy_narg;
227 		if (LOADABLE_SYSCALL(callp) && nargs == 0) {
228 			krwlock_t	*module_lock;
229 
230 			/*
231 			 * Find out how many arguments the system
232 			 * call uses.
233 			 *
234 			 * We have the property that loaded syscalls
235 			 * never change the number of arguments they
236 			 * use after they've been loaded once.  This
237 			 * allows us to stop for /proc tracing without
238 			 * holding the module lock.
239 			 * /proc is assured that sy_narg is valid.
240 			 */
241 			module_lock = lock_syscall(se, code);
242 			nargs = callp->sy_narg;
243 			rw_exit(module_lock);
244 		}
245 	}
246 
247 	/*
248 	 * Fetch the system call arguments.
249 	 */
250 	if (nargs == 0)
251 		goto out;
252 
253 
254 	ASSERT(nargs <= MAXSYSARGS);
255 
256 	if ((datamodel = lwp_getdatamodel(lwp)) == DATAMODEL_ILP32) {
257 
258 		if (rp->r_g1 == 0) {	/* indirect syscall */
259 
260 			lwp->lwp_arg[0] = (uint32_t)rp->r_o1;
261 			lwp->lwp_arg[1] = (uint32_t)rp->r_o2;
262 			lwp->lwp_arg[2] = (uint32_t)rp->r_o3;
263 			lwp->lwp_arg[3] = (uint32_t)rp->r_o4;
264 			lwp->lwp_arg[4] = (uint32_t)rp->r_o5;
265 			if (nargs > 5) {
266 				ua = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)
267 				    (rp->r_sp + MINFRAME32);
268 				for (i = 5; i < nargs; i++) {
269 					uint32_t a;
270 					if (fuword32(ua, &a) != 0)
271 						return (-1);
272 					lwp->lwp_arg[i] = a;
273 					ua += sizeof (a);
274 				}
275 			}
276 		} else {
277 			lwp->lwp_arg[0] = (uint32_t)rp->r_o0;
278 			lwp->lwp_arg[1] = (uint32_t)rp->r_o1;
279 			lwp->lwp_arg[2] = (uint32_t)rp->r_o2;
280 			lwp->lwp_arg[3] = (uint32_t)rp->r_o3;
281 			lwp->lwp_arg[4] = (uint32_t)rp->r_o4;
282 			lwp->lwp_arg[5] = (uint32_t)rp->r_o5;
283 			if (nargs > 6) {
284 				ua = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)
285 				    (rp->r_sp + MINFRAME32);
286 				for (i = 6; i < nargs; i++) {
287 					uint32_t a;
288 					if (fuword32(ua, &a) != 0)
289 						return (-1);
290 					lwp->lwp_arg[i] = a;
291 					ua += sizeof (a);
292 				}
293 			}
294 		}
295 	} else {
296 		ASSERT(datamodel == DATAMODEL_LP64);
297 		lwp->lwp_arg[0] = rp->r_o0;
298 		lwp->lwp_arg[1] = rp->r_o1;
299 		lwp->lwp_arg[2] = rp->r_o2;
300 		lwp->lwp_arg[3] = rp->r_o3;
301 		lwp->lwp_arg[4] = rp->r_o4;
302 		lwp->lwp_arg[5] = rp->r_o5;
303 		if (nargs > 6) {
304 			ua = (caddr_t)rp->r_sp + MINFRAME + STACK_BIAS;
305 			for (i = 6; i < nargs; i++) {
306 				unsigned long a;
307 				if (fulword(ua, &a) != 0)
308 					return (-1);
309 				lwp->lwp_arg[i] = a;
310 				ua += sizeof (a);
311 			}
312 		}
313 	}
314 
315 out:
316 	lwp->lwp_ap = lwp->lwp_arg;
317 	lwp->lwp_argsaved = 1;
318 	t->t_post_sys = 1;	/* so lwp_ap will be reset */
319 	return (0);
320 }
321 
322 void
323 reset_syscall_args(void)
324 {
325 	klwp_t *lwp = ttolwp(curthread);
326 
327 	lwp->lwp_ap = (long *)&lwptoregs(lwp)->r_o0;
328 	lwp->lwp_argsaved = 0;
329 }
330 
331 /*
332  * nonexistent system call-- signal lwp (may want to handle it)
333  * flag error if lwp won't see signal immediately
334  * This works for old or new calling sequence.
335  */
336 int64_t
337 nosys(void)
338 {
339 	tsignal(curthread, SIGSYS);
340 	return ((int64_t)set_errno(ENOSYS));
341 }
342 
343 int
344 nosys32(void)
345 {
346 	return (nosys());
347 }
348 
349 /*
350  * Perform pre-system-call processing, including stopping for tracing,
351  * auditing, microstate-accounting, etc.
352  *
353  * This routine is called only if the t_pre_sys flag is set.  Any condition
354  * requiring pre-syscall handling must set the t_pre_sys flag.  If the
355  * condition is persistent, this routine will repost t_pre_sys.
356  */
357 int
358 pre_syscall(int arg0)
359 {
360 	unsigned int code;
361 	kthread_t *t = curthread;
362 	proc_t *p = ttoproc(t);
363 	klwp_t *lwp = ttolwp(t);
364 	struct regs *rp = lwptoregs(lwp);
365 	int	repost;
366 
367 	t->t_pre_sys = repost = 0;	/* clear pre-syscall processing flag */
368 
369 	ASSERT(t->t_schedflag & TS_DONT_SWAP);
370 
371 	syscall_mstate(LMS_USER, LMS_SYSTEM);
372 
373 	/*
374 	 * The syscall arguments in the out registers should be pointed to
375 	 * by lwp_ap.  If the args need to be copied so that the outs can
376 	 * be changed without losing the ability to get the args for /proc,
377 	 * they can be saved by save_syscall_args(), and lwp_ap will be
378 	 * restored by post_syscall().
379 	 */
380 	ASSERT(lwp->lwp_ap == (long *)&rp->r_o0);
381 
382 	/*
383 	 * Make sure the thread is holding the latest credentials for the
384 	 * process.  The credentials in the process right now apply to this
385 	 * thread for the entire system call.
386 	 */
387 	if (t->t_cred != p->p_cred) {
388 		cred_t *oldcred = t->t_cred;
389 		/*
390 		 * DTrace accesses t_cred in probe context.  t_cred must
391 		 * always be either NULL, or point to a valid, allocated cred
392 		 * structure.
393 		 */
394 		t->t_cred = crgetcred();
395 		crfree(oldcred);
396 	}
397 
398 	/*
399 	 * Undo special arrangements to single-step the lwp
400 	 * so that a debugger will see valid register contents.
401 	 * Also so that the pc is valid for syncfpu().
402 	 * Also so that a syscall like exec() can be stepped.
403 	 */
404 	if (lwp->lwp_pcb.pcb_step != STEP_NONE) {
405 		(void) prundostep();
406 		repost = 1;
407 	}
408 
409 	/*
410 	 * Check for indirect system call in case we stop for tracing.
411 	 * Don't allow multiple indirection.
412 	 */
413 	code = t->t_sysnum;
414 	if (code == 0 && arg0 != 0) {		/* indirect syscall */
415 		code = arg0;
416 		t->t_sysnum = arg0;
417 	}
418 
419 	/*
420 	 * From the proc(4) manual page:
421 	 * When entry to a system call is being traced, the traced process
422 	 * stops after having begun the call to the system but before the
423 	 * system call arguments have been fetched from the process.
424 	 * If proc changes the args we must refetch them after starting.
425 	 */
426 	if (PTOU(p)->u_systrap) {
427 		if (prismember(&PTOU(p)->u_entrymask, code)) {
428 			/*
429 			 * Recheck stop condition, now that lock is held.
430 			 */
431 			mutex_enter(&p->p_lock);
432 			if (PTOU(p)->u_systrap &&
433 			    prismember(&PTOU(p)->u_entrymask, code)) {
434 				stop(PR_SYSENTRY, code);
435 				/*
436 				 * Must refetch args since they were
437 				 * possibly modified by /proc.  Indicate
438 				 * that the valid copy is in the
439 				 * registers.
440 				 */
441 				lwp->lwp_argsaved = 0;
442 				lwp->lwp_ap = (long *)&rp->r_o0;
443 			}
444 			mutex_exit(&p->p_lock);
445 		}
446 		repost = 1;
447 	}
448 
449 	if (lwp->lwp_sysabort) {
450 		/*
451 		 * lwp_sysabort may have been set via /proc while the process
452 		 * was stopped on PR_SYSENTRY.  If so, abort the system call.
453 		 * Override any error from the copyin() of the arguments.
454 		 */
455 		lwp->lwp_sysabort = 0;
456 		(void) set_errno(EINTR); /* sets post-sys processing */
457 		t->t_pre_sys = 1;	/* repost anyway */
458 		return (1);		/* don't do system call, return EINTR */
459 	}
460 
461 	/* begin auditing for this syscall */
462 	if (audit_active == C2AUDIT_LOADED) {
463 		uint32_t auditing = au_zone_getstate(NULL);
464 
465 		if (auditing & AU_AUDIT_MASK) {
466 			int error;
467 			if (error = audit_start(T_SYSCALL, code, auditing, \
468 			    0, lwp)) {
469 				t->t_pre_sys = 1;	/* repost anyway */
470 				lwp->lwp_error = 0;	/* for old drivers */
471 				return (error);
472 			}
473 			repost = 1;
474 		}
475 	}
476 
477 #ifndef NPROBE
478 	/* Kernel probe */
479 	if (tnf_tracing_active) {
480 		TNF_PROBE_1(syscall_start, "syscall thread", /* CSTYLED */,
481 			tnf_sysnum,	sysnum,		t->t_sysnum);
482 		t->t_post_sys = 1;	/* make sure post_syscall runs */
483 		repost = 1;
484 	}
485 #endif /* NPROBE */
486 
487 #ifdef SYSCALLTRACE
488 	if (syscalltrace) {
489 		int i;
490 		long *ap;
491 		char *cp;
492 		char *sysname;
493 		struct sysent *callp;
494 
495 		if (code >= NSYSCALL)
496 			callp = &nosys_ent;	/* nosys has no args */
497 		else
498 			callp = LWP_GETSYSENT(lwp) + code;
499 		(void) save_syscall_args();
500 		mutex_enter(&systrace_lock);
501 		printf("%d: ", p->p_pid);
502 		if (code >= NSYSCALL)
503 			printf("0x%x", code);
504 		else {
505 			sysname = mod_getsysname(code);
506 			printf("%s[0x%x]", sysname == NULL ? "NULL" :
507 			    sysname, code);
508 		}
509 		cp = "(";
510 		for (i = 0, ap = lwp->lwp_ap; i < callp->sy_narg; i++, ap++) {
511 			printf("%s%lx", cp, *ap);
512 			cp = ", ";
513 		}
514 		if (i)
515 			printf(")");
516 		printf(" %s id=0x%p\n", PTOU(p)->u_comm, curthread);
517 		mutex_exit(&systrace_lock);
518 	}
519 #endif /* SYSCALLTRACE */
520 
521 	/*
522 	 * If there was a continuing reason for pre-syscall processing,
523 	 * set the t_pre_sys flag for the next system call.
524 	 */
525 	if (repost)
526 		t->t_pre_sys = 1;
527 	lwp->lwp_error = 0;	/* for old drivers */
528 	lwp->lwp_badpriv = PRIV_NONE;	/* for privilege tracing */
529 	return (0);
530 }
531 
/*
 * Post-syscall processing.  Perform abnormal system call completion
 * actions such as /proc tracing, profiling, signals, preemption, etc.
 *
 * This routine is called only if t_post_sys, t_sig_check, or t_astflag is set.
 * Any condition requiring post-syscall handling must set one of these.
 * If the condition is persistent, this routine will repost t_post_sys.
 *
 *	rval1, rval2	the system call's return values.  On a normal,
 *			error-free return they are stored into the saved
 *			%o0 and %o1 of the lwp.
 */
void
post_syscall(long rval1, long rval2)
{
	kthread_t	*t = curthread;
	proc_t	*p = curproc;
	klwp_t	*lwp = ttolwp(t);
	struct regs *rp = lwptoregs(lwp);
	uint_t	error;
	int	code = t->t_sysnum;
	int	repost = 0;		/* non-zero to set t_post_sys again */
	int	proc_stop = 0;		/* non-zero if stopping for /proc */
	int	sigprof = 0;		/* non-zero if sending SIGPROF */

	t->t_post_sys = 0;

	/* error posted through set_errno() during this call, if any */
	error = lwp->lwp_errno;

	/*
	 * Code can be zero if this is a new LWP returning after a forkall(),
	 * other than the one which matches the one in the parent which called
	 * forkall().  In these LWPs, skip most of post-syscall activity.
	 */
	if (code == 0)
		goto sig_check;

	/* put out audit record for this syscall */
	if (AU_AUDITING()) {
		rval_t	rval;	/* fix audit_finish() someday */

		/* XX64 -- truncation of 64-bit return values? */
		rval.r_val1 = (int)rval1;
		rval.r_val2 = (int)rval2;
		audit_finish(T_SYSCALL, code, error, &rval);
		repost = 1;
	}

	/*
	 * Print and free a message left in t_pdmsg, if there is one.
	 */
	if (curthread->t_pdmsg != NULL) {
		char *m = curthread->t_pdmsg;

		uprintf("%s", m);
		kmem_free(m, strlen(m) + 1);
		curthread->t_pdmsg = NULL;
	}

	/*
	 * If we're going to stop for /proc tracing, set the flag and
	 * save the arguments so that the return values don't smash them.
	 */
	if (PTOU(p)->u_systrap) {
		if (prismember(&PTOU(p)->u_exitmask, code)) {
			proc_stop = 1;
			(void) save_syscall_args();
		}
		repost = 1;
	}

	/*
	 * Similarly check to see if SIGPROF might be sent.
	 */
	if (curthread->t_rprof != NULL &&
	    curthread->t_rprof->rp_anystate != 0) {
		(void) save_syscall_args();
		sigprof = 1;
	}

	if (lwp->lwp_eosys == NORMALRETURN) {
		if (error == 0) {
#ifdef SYSCALLTRACE
			if (syscalltrace) {
				mutex_enter(&systrace_lock);
				printf(
				    "%d: r_val1=0x%lx, r_val2=0x%lx, id 0x%p\n",
				    p->p_pid, rval1, rval2, curthread);
				mutex_exit(&systrace_lock);
			}
#endif /* SYSCALLTRACE */
			/*
			 * Success: TSTATE_IC is cleared and the return
			 * values go into %o0 and %o1.
			 */
			rp->r_tstate &= ~TSTATE_IC;
			rp->r_o0 = rval1;
			rp->r_o1 = rval2;
		} else {
			int sig;

#ifdef SYSCALLTRACE
			if (syscalltrace) {
				mutex_enter(&systrace_lock);
				printf("%d: error=%d, id 0x%p\n",
				    p->p_pid, error, curthread);
				mutex_exit(&systrace_lock);
			}
#endif /* SYSCALLTRACE */
			/*
			 * An EINTR is reported as EBADF when a_stale is
			 * set for this thread.  Otherwise it becomes
			 * ERESTART when the current signal is in
			 * u_sigrestart and its disposition is neither
			 * SIG_DFL nor SIG_IGN.
			 */
			if (error == EINTR && t->t_activefd.a_stale)
				error = EBADF;
			if (error == EINTR &&
			    (sig = lwp->lwp_cursig) != 0 &&
			    sigismember(&PTOU(p)->u_sigrestart, sig) &&
			    PTOU(p)->u_signal[sig - 1] != SIG_DFL &&
			    PTOU(p)->u_signal[sig - 1] != SIG_IGN)
				error = ERESTART;
			/*
			 * Failure: the error number goes into %o0 and
			 * TSTATE_IC is set.
			 */
			rp->r_o0 = error;
			rp->r_tstate |= TSTATE_IC;
		}
		/*
		 * The default action is to redo the trap instruction.
		 * We increment the pc and npc past it for NORMALRETURN.
		 * JUSTRETURN has set up a new pc and npc already.
		 * If we are a cloned thread of forkall(), don't
		 * adjust here because we have already inherited
		 * the adjusted values from our clone.
		 */
		if (!(t->t_flag & T_FORKALL)) {
			rp->r_pc = rp->r_npc;
			rp->r_npc += 4;
		}
	}

	/*
	 * From the proc(4) manual page:
	 * When exit from a system call is being traced, the traced process
	 * stops on completion of the system call just prior to checking for
	 * signals and returning to user level.  At this point all return
	 * values have been stored into the traced process's saved registers.
	 *
	 * The stop condition is rechecked here with p_lock held.
	 */
	if (proc_stop) {
		mutex_enter(&p->p_lock);
		if (PTOU(p)->u_systrap &&
		    prismember(&PTOU(p)->u_exitmask, code))
			stop(PR_SYSEXIT, code);
		mutex_exit(&p->p_lock);
	}

	/*
	 * If we are the parent returning from a successful
	 * vfork, wait for the child to exec or exit.
	 * This code must be here and not in the bowels of the system
	 * so that /proc can intercept exit from vfork in a timely way.
	 */
	if (t->t_flag & T_VFPARENT) {
		ASSERT(code == SYS_vfork || code == SYS_forksys);
		ASSERT(rp->r_o1 == 0 && error == 0);
		vfwait((pid_t)rval1);
		t->t_flag &= ~T_VFPARENT;
	}

	/*
	 * If profiling is active, bill the current PC in user-land
	 * and keep reposting until profiling is disabled.
	 */
	if (p->p_prof.pr_scale) {
		if (lwp->lwp_oweupc)
			profil_tick(rp->r_pc);
		repost = 1;
	}

sig_check:
	/*
	 * Reset flag for next time.
	 * We must do this after stopping on PR_SYSEXIT
	 * because /proc uses the information in lwp_eosys.
	 */
	lwp->lwp_eosys = NORMALRETURN;
	clear_stale_fd();
	t->t_flag &= ~T_FORKALL;

	if (t->t_astflag | t->t_sig_check) {
		/*
		 * Turn off the AST flag before checking all the conditions that
		 * may have caused an AST.  This flag is on whenever a signal or
		 * unusual condition should be handled after the next trap or
		 * syscall.
		 */
		astoff(t);
		t->t_sig_check = 0;

		/*
		 * The following check is legal for the following reasons:
		 *	1) The thread we are checking, is ourselves, so there is
		 *	   no way the proc can go away.
		 *	2) The only time we need to be protected by the
		 *	   lock is if the binding is changed.
		 *
		 *	Note we will still take the lock and check the binding
		 *	if the condition was true without the lock held.  This
		 *	prevents lock contention among threads owned by the
		 *	same proc.
		 */

		if (curthread->t_proc_flag & TP_CHANGEBIND) {
			mutex_enter(&p->p_lock);
			if (curthread->t_proc_flag & TP_CHANGEBIND) {
				timer_lwpbind();
				curthread->t_proc_flag &= ~TP_CHANGEBIND;
			}
			mutex_exit(&p->p_lock);
		}

		/*
		 * for kaio requests on the special kaio poll queue,
		 * copyout their results to user memory.
		 */
		if (p->p_aio)
			aio_cleanup(0);

		/*
		 * If this LWP was asked to hold, call holdlwp(), which will
		 * stop.  holdlwps() sets this up and calls pokelwps() which
		 * sets the AST flag.
		 *
		 * Also check TP_EXITLWP, since this is used by fresh new LWPs
		 * through lwp_rtt().  That flag is set if the lwp_create(2)
		 * syscall failed after creating the LWP.
		 */
		if (ISHOLD(p) || (t->t_proc_flag & TP_EXITLWP))
			holdlwp();

		/*
		 * All code that sets signals and makes ISSIG_PENDING
		 * evaluate true must set t_sig_check afterwards.
		 */
		if (ISSIG_PENDING(t, lwp, p)) {
			if (issig(FORREAL))
				psig();
			t->t_sig_check = 1;	/* recheck next time */
		}

		if (sigprof) {
			/* argument count is 0 for an out-of-range code */
			int nargs = (code > 0 && code < NSYSCALL)?
			    LWP_GETSYSENT(lwp)[code].sy_narg : 0;
			realsigprof(code, nargs, error);
			t->t_sig_check = 1;	/* recheck next time */
		}

		/*
		 * If a performance counter overflow interrupt was
		 * delivered *during* the syscall, then re-enable the
		 * AST so that we take a trip through trap() to cause
		 * the SIGEMT to be delivered.
		 */
		if (lwp->lwp_pcb.pcb_flags & CPC_OVERFLOW)
			aston(t);

		/*
		 * If an asynchronous hardware error is pending, turn AST flag
		 * back on.  AST will be checked again before we return to user
		 * mode and we'll come back through trap() to handle the error.
		 */
		if (lwp->lwp_pcb.pcb_flags & ASYNC_HWERR)
			aston(t);
	}

	/*
	 * Restore register window if a debugger modified it.
	 * Set up to perform a single-step if a debugger requested it.
	 */
	if (lwp->lwp_pcb.pcb_xregstat != XREGNONE)
		xregrestore(lwp, 1);

	lwp->lwp_errno = 0;		/* clear error for next time */

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active) {
		TNF_PROBE_3(syscall_end, "syscall thread", /* CSTYLED */,
		    tnf_long,	rval1,		rval1,
		    tnf_long,	rval2,		rval2,
		    tnf_long,	errno,		(long)error);
		repost = 1;
	}
#endif /* NPROBE */

	/*
	 * Set state to LWP_USER here so preempt won't give us a kernel
	 * priority if it occurs after this point.  Call CL_TRAPRET() to
	 * restore the user-level priority.
	 *
	 * It is important that no locks (other than spinlocks) be entered
	 * after this point before returning to user mode (unless lwp_state
	 * is set back to LWP_SYS).
	 *
	 * Sampled times past this point are charged to the user.
	 */
	lwp->lwp_state = LWP_USER;

	if (t->t_trapret) {
		t->t_trapret = 0;
		thread_lock(t);
		CL_TRAPRET(t);
		thread_unlock(t);
	}
	if (CPU->cpu_runrun || t->t_schedflag & TS_ANYWAITQ)
		preempt();
	prunstop();

	/*
	 * t_post_sys will be set if pcb_step is active.
	 */
	if (lwp->lwp_pcb.pcb_step != STEP_NONE) {
		prdostep();
		repost = 1;
	}

	t->t_sysnum = 0;	/* no longer in a system call */

	/*
	 * In case the args were copied to the lwp, reset the
	 * pointer so the next syscall will have the right lwp_ap pointer.
	 */
	lwp->lwp_ap = (long *)&rp->r_o0;
	lwp->lwp_argsaved = 0;

	/*
	 * If there was a continuing reason for post-syscall processing,
	 * set the t_post_sys flag for the next system call.
	 */
	if (repost)
		t->t_post_sys = 1;

	/*
	 * If there is a ustack registered for this lwp, and the stack rlimit
	 * has been altered, read in the ustack. If the saved stack rlimit
	 * matches the bounds of the ustack, update the ustack to reflect
	 * the new rlimit. If the new stack rlimit is RLIM_INFINITY, disable
	 * stack checking by setting the size to 0.
	 */
	if (lwp->lwp_ustack != 0 && lwp->lwp_old_stk_ctl != 0) {
		rlim64_t new_size;
		model_t model;
		caddr_t top;
		struct rlimit64 rl;

		/* take a consistent snapshot of the process values */
		mutex_enter(&p->p_lock);
		new_size = p->p_stk_ctl;
		model = p->p_model;
		top = p->p_usrstack;
		(void) rctl_rlimit_get(rctlproc_legacy[RLIMIT_STACK], p, &rl);
		mutex_exit(&p->p_lock);

		if (rl.rlim_cur == RLIM64_INFINITY)
			new_size = 0;

		if (model == DATAMODEL_NATIVE) {
			stack_t stk;

			/*
			 * Only rewrite the ustack if it still ends at the
			 * top of the user stack and its size is either the
			 * old limit or 0.
			 */
			if (copyin((stack_t *)lwp->lwp_ustack, &stk,
			    sizeof (stack_t)) == 0 &&
			    (stk.ss_size == lwp->lwp_old_stk_ctl ||
			    stk.ss_size == 0) &&
			    stk.ss_sp == top - stk.ss_size) {
				stk.ss_sp = (void *)((uintptr_t)stk.ss_sp +
				    stk.ss_size - new_size);
				stk.ss_size = new_size;

				(void) copyout(&stk,
				    (stack_t *)lwp->lwp_ustack,
				    sizeof (stack_t));
			}
		} else {
			stack32_t stk32;

			/* same check and update using the 32-bit layout */
			if (copyin((stack32_t *)lwp->lwp_ustack, &stk32,
			    sizeof (stack32_t)) == 0 &&
			    (stk32.ss_size == lwp->lwp_old_stk_ctl ||
			    stk32.ss_size == 0) &&
			    stk32.ss_sp ==
			    (caddr32_t)(uintptr_t)(top - stk32.ss_size)) {
				stk32.ss_sp += stk32.ss_size - new_size;
				stk32.ss_size = new_size;

				(void) copyout(&stk32,
				    (stack32_t *)lwp->lwp_ustack,
				    sizeof (stack32_t));
			}
		}

		/* the change has been handled (or ignored) exactly once */
		lwp->lwp_old_stk_ctl = 0;
	}

	syscall_mstate(LMS_SYSTEM, LMS_USER);
}
918 
919 /*
920  * Call a system call which takes a pointer to the user args struct and
921  * a pointer to the return values.  This is a bit slower than the standard
922  * C arg-passing method in some cases.
923  */
924 int64_t
925 syscall_ap()
926 {
927 	uint_t	error;
928 	struct sysent *callp;
929 	rval_t	rval;
930 	klwp_t	*lwp = ttolwp(curthread);
931 	struct regs *rp = lwptoregs(lwp);
932 
933 	callp = LWP_GETSYSENT(lwp) + curthread->t_sysnum;
934 
935 	/*
936 	 * If the arguments don't fit in registers %o0 - o5, make sure they
937 	 * have been copied to the lwp_arg array.
938 	 */
939 	if (callp->sy_narg > 6 && save_syscall_args())
940 		return ((int64_t)set_errno(EFAULT));
941 
942 	rval.r_val1 = 0;
943 	rval.r_val2 = (int)rp->r_o1;
944 	lwp->lwp_error = 0;	/* for old drivers */
945 	error = (*(callp->sy_call))(lwp->lwp_ap, &rval);
946 	if (error)
947 		return ((int64_t)set_errno(error));
948 	return (rval.r_vals);
949 }
950 
951 /*
952  * Load system call module.
953  *	Returns with pointer to held read lock for module.
954  */
955 static krwlock_t *
956 lock_syscall(struct sysent *table, uint_t code)
957 {
958 	krwlock_t	*module_lock;
959 	struct modctl	*modp;
960 	int		id;
961 	struct sysent   *callp;
962 
963 	module_lock = table[code].sy_lock;
964 	callp = &table[code];
965 
966 	/*
967 	 * Optimization to only call modload if we don't have a loaded
968 	 * syscall.
969 	 */
970 	rw_enter(module_lock, RW_READER);
971 	if (LOADED_SYSCALL(callp))
972 		return (module_lock);
973 	rw_exit(module_lock);
974 
975 	for (;;) {
976 		if ((id = modload("sys", syscallnames[code])) == -1)
977 			break;
978 
979 		/*
980 		 * If we loaded successfully at least once, the modctl
981 		 * will still be valid, so we try to grab it by filename.
982 		 * If this call fails, it's because the mod_filename
983 		 * was changed after the call to modload() (mod_hold_by_name()
984 		 * is the likely culprit).  We can safely just take
985 		 * another lap if this is the case;  the modload() will
986 		 * change the mod_filename back to one by which we can
987 		 * find the modctl.
988 		 */
989 		modp = mod_find_by_filename("sys", syscallnames[code]);
990 
991 		if (modp == NULL)
992 			continue;
993 
994 		mutex_enter(&mod_lock);
995 
996 		if (!modp->mod_installed) {
997 			mutex_exit(&mod_lock);
998 			continue;
999 		}
1000 		break;
1001 	}
1002 
1003 	rw_enter(module_lock, RW_READER);
1004 
1005 	if (id != -1)
1006 		mutex_exit(&mod_lock);
1007 
1008 	return (module_lock);
1009 }
1010 
1011 /*
1012  * Loadable syscall support.
1013  *	If needed, load the module, then reserve it by holding a read
1014  *	lock for the duration of the call.
1015  *	Later, if the syscall is not unloadable, it could patch the vector.
1016  */
1017 /*ARGSUSED*/
1018 int64_t
1019 loadable_syscall(
1020     long a0, long a1, long a2, long a3,
1021     long a4, long a5, long a6, long a7)
1022 {
1023 	int64_t		rval;
1024 	struct sysent	*callp;
1025 	struct sysent	*se = LWP_GETSYSENT(ttolwp(curthread));
1026 	krwlock_t	*module_lock;
1027 	int		code;
1028 
1029 	code = curthread->t_sysnum;
1030 	callp = se + code;
1031 
1032 	/*
1033 	 * Try to autoload the system call if necessary.
1034 	 */
1035 	module_lock = lock_syscall(se, code);
1036 
1037 	/*
1038 	 * we've locked either the loaded syscall or nosys
1039 	 */
1040 	if (callp->sy_flags & SE_ARGC) {
1041 		int64_t (*sy_call)();
1042 
1043 		sy_call = (int64_t (*)())callp->sy_call;
1044 		rval = (*sy_call)(a0, a1, a2, a3, a4, a5);
1045 	} else {
1046 		rval = syscall_ap();
1047 	}
1048 
1049 	rw_exit(module_lock);
1050 	return (rval);
1051 }
1052 
1053 /*
1054  * Handle indirect system calls.
1055  *	This interface should be deprecated.  The library can handle
1056  *	this more efficiently, but keep this implementation for old binaries.
1057  *
1058  * XX64	Needs some work.
1059  */
1060 int64_t
1061 indir(int code, long a0, long a1, long a2, long a3, long a4)
1062 {
1063 	klwp_t		*lwp = ttolwp(curthread);
1064 	struct sysent	*callp;
1065 
1066 	if (code <= 0 || code >= NSYSCALL)
1067 		return (nosys());
1068 
1069 	ASSERT(lwp->lwp_ap != NULL);
1070 
1071 	curthread->t_sysnum = code;
1072 	callp = LWP_GETSYSENT(lwp) + code;
1073 
1074 	/*
1075 	 * Handle argument setup, unless already done in pre_syscall().
1076 	 */
1077 	if (callp->sy_narg > 5) {
1078 		if (save_syscall_args())	/* move args to LWP array */
1079 			return ((int64_t)set_errno(EFAULT));
1080 	} else if (!lwp->lwp_argsaved) {
1081 		long *ap;
1082 
1083 		ap = lwp->lwp_ap;		/* args haven't been saved */
1084 		lwp->lwp_ap = ap + 1;		/* advance arg pointer */
1085 		curthread->t_post_sys = 1;	/* so lwp_ap will be reset */
1086 	}
1087 	return ((*callp->sy_callc)(a0, a1, a2, a3, a4, lwp->lwp_arg[5]));
1088 }
1089 
1090 /*
1091  * set_errno - set an error return from the current system call.
1092  *	This could be a macro.
1093  *	This returns the value it is passed, so that the caller can
1094  *	use tail-recursion-elimination and do return (set_errno(ERRNO));
1095  */
1096 uint_t
1097 set_errno(uint_t error)
1098 {
1099 	ASSERT(error != 0);		/* must not be used to clear errno */
1100 
1101 	curthread->t_post_sys = 1;	/* have post_syscall do error return */
1102 	return (ttolwp(curthread)->lwp_errno = error);
1103 }
1104 
1105 /*
1106  * set_proc_pre_sys - Set pre-syscall processing for entire process.
1107  */
1108 void
1109 set_proc_pre_sys(proc_t *p)
1110 {
1111 	kthread_t	*t;
1112 	kthread_t	*first;
1113 
1114 	ASSERT(MUTEX_HELD(&p->p_lock));
1115 
1116 	t = first = p->p_tlist;
1117 	do {
1118 		t->t_pre_sys = 1;
1119 	} while ((t = t->t_forw) != first);
1120 }
1121 
1122 /*
1123  * set_proc_post_sys - Set post-syscall processing for entire process.
1124  */
1125 void
1126 set_proc_post_sys(proc_t *p)
1127 {
1128 	kthread_t	*t;
1129 	kthread_t	*first;
1130 
1131 	ASSERT(MUTEX_HELD(&p->p_lock));
1132 
1133 	t = first = p->p_tlist;
1134 	do {
1135 		t->t_post_sys = 1;
1136 	} while ((t = t->t_forw) != first);
1137 }
1138 
1139 /*
1140  * set_proc_sys - Set pre- and post-syscall processing for entire process.
1141  */
1142 void
1143 set_proc_sys(proc_t *p)
1144 {
1145 	kthread_t	*t;
1146 	kthread_t	*first;
1147 
1148 	ASSERT(MUTEX_HELD(&p->p_lock));
1149 
1150 	t = first = p->p_tlist;
1151 	do {
1152 		t->t_pre_sys = 1;
1153 		t->t_post_sys = 1;
1154 	} while ((t = t->t_forw) != first);
1155 }
1156 
1157 /*
1158  * set_all_proc_sys - set pre- and post-syscall processing flags for all
1159  * user processes.
1160  *
1161  * This is needed when auditing, tracing, or other facilities which affect
1162  * all processes are turned on.
1163  */
1164 void
1165 set_all_proc_sys()
1166 {
1167 	kthread_t	*t;
1168 	kthread_t	*first;
1169 
1170 	mutex_enter(&pidlock);
1171 	t = first = curthread;
1172 	do {
1173 		t->t_pre_sys = 1;
1174 		t->t_post_sys = 1;
1175 	} while ((t = t->t_next) != first);
1176 	mutex_exit(&pidlock);
1177 }
1178 
1179 /*
1180  * set_all_zone_usr_proc_sys - set pre- and post-syscall processing flags for
1181  * all user processes running in the zone of the current process
1182  *
1183  * This is needed when auditing is turned on.
1184  */
1185 void
1186 set_all_zone_usr_proc_sys(zoneid_t zoneid)
1187 {
1188 	proc_t	    *p;
1189 	kthread_t   *t;
1190 
1191 	mutex_enter(&pidlock);
1192 	for (p = practive; p != NULL; p = p->p_next) {
1193 		/* skip kernel processes */
1194 		if (p->p_exec == NULLVP || p->p_as == &kas ||
1195 		    p->p_stat == SIDL || p->p_stat == SZOMB ||
1196 		    (p->p_flag & (SSYS | SEXITING | SEXITLWPS)))
1197 			continue;
1198 		/*
1199 		 * Only processes in the given zone (eventually in
1200 		 * all zones) are taken into account
1201 		 */
1202 		if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) {
1203 			mutex_enter(&p->p_lock);
1204 			if ((t = p->p_tlist) == NULL) {
1205 				mutex_exit(&p->p_lock);
1206 				continue;
1207 			}
1208 			/*
1209 			 * Set pre- and post-syscall processing flags
1210 			 * for all threads of the process
1211 			 */
1212 			do {
1213 				t->t_pre_sys = 1;
1214 				t->t_post_sys = 1;
1215 			} while (p->p_tlist != (t = t->t_forw));
1216 			mutex_exit(&p->p_lock);
1217 		}
1218 	}
1219 	mutex_exit(&pidlock);
1220 }
1221 
1222 /*
1223  * set_proc_ast - Set asynchronous service trap (AST) flag for all
1224  * threads in process.
1225  */
1226 void
1227 set_proc_ast(proc_t *p)
1228 {
1229 	kthread_t	*t;
1230 	kthread_t	*first;
1231 
1232 	ASSERT(MUTEX_HELD(&p->p_lock));
1233 
1234 	t = first = p->p_tlist;
1235 	do {
1236 		aston(t);
1237 	} while ((t = t->t_forw) != first);
1238 }
1239