xref: /titanic_41/usr/src/uts/i86pc/ml/syscall_asm.s (revision 0b6016e6ff70af39f99c9cc28e0c2207c8f5413c)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*	Copyright (c) 1990, 1991 UNIX System Laboratories, Inc.	*/
28/*	Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T	*/
29/*	  All Rights Reserved					*/
30
31/*	Copyright (c) 1987, 1988 Microsoft Corporation		*/
32/*	  All Rights Reserved					*/
33
34#pragma ident	"%Z%%M%	%I%	%E% SMI"
35
36#include <sys/asm_linkage.h>
37#include <sys/asm_misc.h>
38#include <sys/regset.h>
39#include <sys/psw.h>
40#include <sys/x86_archext.h>
41
42#if defined(__lint)
43
44#include <sys/types.h>
45#include <sys/thread.h>
46#include <sys/systm.h>
47
48#else	/* __lint */
49
50#include <sys/segments.h>
51#include <sys/pcb.h>
52#include <sys/trap.h>
53#include <sys/ftrace.h>
54#include <sys/traptrace.h>
55#include <sys/clock.h>
56#include <sys/panic.h>
57#include "assym.h"
58
59#endif	/* __lint */
60
61/*
62 * We implement two flavours of system call entry points
63 *
64 * -	{int,lcall}/iret	(i386)
65 * -	sysenter/sysexit	(Pentium II and beyond)
66 *
67 * The basic pattern used in the handlers is to check to see if we can
68 * do a fast (simple) version of the system call; if we can't, we use various
69 * C routines that handle corner cases and debugging.
70 *
71 * To reduce the amount of assembler replication, yet keep the system call
72 * implementations vaguely comprehensible, the common code in the body
73 * of the handlers is broken up into a set of preprocessor definitions
74 * below.
75 */
76
77/*
78 * When we have SYSCALLTRACE defined, we sneak an extra
79 * predicate into a couple of tests.
 *
 * ORL_SYSCALLTRACE(r32) ORs the syscalltrace variable into the 32-bit
 * register r32, so a non-zero syscalltrace leaves r32 non-zero and the
 * flags "not equal".  Without SYSCALLTRACE it expands to nothing.
80 */
81#if defined(SYSCALLTRACE)
82#define	ORL_SYSCALLTRACE(r32)	\
83	orl	syscalltrace, r32
84#else	/* !SYSCALLTRACE */
85#define	ORL_SYSCALLTRACE(r32)
86#endif	/* SYSCALLTRACE */
87
88/*
89 * This check is false whenever we want to go fast i.e.
90 *
91 *	if (code >= NSYSCALL ||
92 *	    t->t_pre_sys || (t->t_proc_flag & TP_WATCHPT) != 0)
93 *		do full version
94 * #ifdef SYSCALLTRACE
95 *	if (syscalltrace)
96 *		do full version
97 * #endif
98 *
 * The individual predicates are OR-ed together in %edi: t_pre_sys is
 * zero-extended from a byte, t_proc_flag from a 16-bit word, and the
 * unsigned range check on code is turned into 0/1 with setae.  The
 * flags left by the last orl therefore tell the caller whether any
 * predicate was set; callers follow the macro with jne to reach the
 * full version.
 *
99 * Preconditions:
100 * -	t	curthread
101 * -	code	contains the syscall number
102 * Postconditions:
103 * -	%ecx and %edi are smashed
104 * -	condition code flag ZF is cleared if pre-sys is too complex
105 */
106#define	CHECK_PRESYS_NE(t, code)		\
107	movzbl	T_PRE_SYS(t), %edi;		\
108	movzwl	T_PROC_FLAG(t), %ecx;		\
109	andl	$TP_WATCHPT, %ecx;		\
110	orl	%ecx, %edi;			\
111	cmpl	$NSYSCALL, code;		\
112	setae	%cl;				\
113	movzbl	%cl, %ecx;			\
114	orl	%ecx, %edi;			\
115	ORL_SYSCALLTRACE(%edi)
116
/*
 * Call syscall_mstate() with two microstate constants.  The second macro
 * argument is pushed first and the first macro argument last, and both
 * stack slots are released again after the call.
 *
 * Preconditions:
 *	-none-
 * Postconditions:
 *	whatever syscall_mstate() smashes is smashed; callers in this
 *	file save %eax around the macro when they still need it
 */
117#define	MSTATE_TRANSITION(prev_ms, next_ms)	\
118	pushl	$next_ms;			\
119	pushl	$prev_ms;			\
120	call	syscall_mstate;			\
121	addl	$8, %esp
122
123/*
124 * aka CPU_STATS_ADDQ(CPU, sys.syscall, 1)
 *
 * The counter is addressed through %gs and is 64 bits wide; it is
 * bumped as two 32-bit halves, with the carry out of the low word
 * propagated into the high word by adcl.
 *
125 * This must be called with interrupts or preemption disabled.
126 */
127#define	CPU_STATS_SYS_SYSCALL_INC			\
128	addl	$1, %gs:CPU_STATS_SYS_SYSCALL;		\
129	adcl	$0, %gs:CPU_STATS_SYS_SYSCALL+4;
130
131#if !defined(__lint)
132
133/*
134 * ASSERT(lwptoregs(lwp) == rp);
135 *
136 * this may seem obvious, but very odd things happen if this
137 * assertion is false
138 *
 * t is the register holding the thread pointer; rp is the expected
 * register-frame pointer.  On a mismatch the DEBUG version calls panic()
 * with __lwptoregs_msg and, as format arguments, __LINE__, the lwp
 * pointer, the value found in LWP_REGS(lwp) and rp.  It uses local
 * label 7.  In non-DEBUG kernels the macro expands to nothing.
 *
139 * Preconditions:
140 *	-none-
141 * Postconditions (if assertion is true):
142 *	%esi and %edi are smashed
143 */
144#if defined(DEBUG)
145
146__lwptoregs_msg:
147	.string	"%M%:%d lwptoregs(%p) [%p] != rp [%p]"
148
149#define	ASSERT_LWPTOREGS(t, rp)				\
150	movl	T_LWP(t), %esi;				\
151	movl	LWP_REGS(%esi), %edi;			\
152	cmpl	rp, %edi;				\
153	je	7f;					\
154	pushl	rp;					\
155	pushl	%edi;					\
156	pushl	%esi;					\
157	pushl	$__LINE__;				\
158	pushl	$__lwptoregs_msg;			\
159	call	panic;					\
1607:
161#else	/* !DEBUG */
162#define	ASSERT_LWPTOREGS(t, rp)
163#endif	/* DEBUG */
164
165#endif	/* __lint */
166
167/*
168 * Assembler rendition of the C fragment
169 *
170 *	lwp->lwp_state = LWP_SYS;
171 *	lwp->lwp_ru.sysc++;
172 *	lwp->lwp_eosys = NORMALRETURN;
173 *	lwp->lwp_ap = argp;
174 *
 * lwp_ru.sysc is bumped as a 64-bit quantity: addl on the low word,
 * adcl carrying into the high word.  The first macro argument is the
 * register holding the lwp pointer, the second the register holding
 * the argument pointer.
 *
175 * Preconditions:
176 *	-none-
177 * Postconditions:
178 *	-none- (no registers are written; the arithmetic changes flags)
179 */
180#define	SET_LWP(lwpreg, apreg)				\
181	movb	$LWP_SYS, LWP_STATE(lwpreg);		\
182	addl	$1, LWP_RU_SYSC(lwpreg);		\
183	adcl	$0, LWP_RU_SYSC+4(lwpreg);		\
184	movb	$NORMALRETURN, LWP_EOSYS(lwpreg);	\
185	movl	apreg, LWP_AP(lwpreg)
186
187/*
188 * Set up the thread, lwp, find the handler, and copy
189 * in the arguments from userland to the kernel stack.
190 *
 * Step by step:
 * -	the low 16 bits of the syscall number are recorded in t_sysnum
 * -	SYS_DROP bytes are reserved on the stack for the arguments, and
 *	SET_LWP points lwp_ap at that area
 * -	%eax is scaled by SYSENT_SIZE_SHIFT and turned into the address
 *	of the matching sysent entry
 * -	if that entry has a non-zero argument count (SY_NARG), that many
 *	32-bit words are copied from the saved user stack pointer + 4 to
 *	the reserved area; t_lofault holds faultlabel for the duration of
 *	the copy (presumably so that a fault during the copy resumes at
 *	faultlabel) and is cleared afterwards using %ecx, which is zero
 *	once the rep has finished
 *
 * NOTE(review): the word skipped at the user %esp is presumably the
 * return address of the userland syscall stub - confirm.
 *
191 * Preconditions:
192 * -	%eax contains the syscall number
 * -	t is a register holding curthread
 * -	%esp points at the saved register frame (REGOFF_UESP is read
 *	relative to it after the drop)
193 * Postconditions:
194 * -	%eax contains a pointer to the sysent structure
195 * -	%ecx is zeroed
196 * -	%esi, %edi are smashed
197 * -	%esp is SYS_DROPped ready for the syscall
 * -	local label 4 is used
198 */
199#define	SIMPLE_SYSCALL_PRESYS(t, faultlabel)		\
200	movl	T_LWP(t), %esi;				\
201	movw	%ax, T_SYSNUM(t);			\
202	subl	$SYS_DROP, %esp;			\
203	shll	$SYSENT_SIZE_SHIFT, %eax;			\
204	SET_LWP(%esi, %esp);				\
205	leal	sysent(%eax), %eax;			\
206	movzbl	SY_NARG(%eax), %ecx;			\
207	testl	%ecx, %ecx;				\
208	jz	4f;					\
209	movl	%esp, %edi;				\
210	movl	SYS_DROP + REGOFF_UESP(%esp), %esi;	\
211	movl	$faultlabel, T_LOFAULT(t);		\
212	addl	$4, %esi;				\
213	rep;						\
214	  smovl;					\
215	movl	%ecx, T_LOFAULT(t);			\
2164:
217
218/*
219 * Check to see if a simple return is possible i.e.
220 *
221 *	if ((t->t_post_sys_ast | syscalltrace) != 0)
222 *		do full version;
223 *
 * rtmp is cleared, the predicates are OR-ed into it, and it is then
 * compared against zero.  Callers follow the macro with jne, and on the
 * fall-through path hand the 16-bit alias of rtmp to
 * SIMPLE_SYSCALL_POSTSYS as its known-zero register.
 *
224 * Preconditions:
225 * -	t is curthread
226 * Postconditions:
227 * -	condition code NE is set if post-sys is too complex
228 * -	rtmp is zeroed if it isn't (we rely on this!)
229 */
230#define	CHECK_POSTSYS_NE(t, rtmp)			\
231	xorl	rtmp, rtmp;				\
232	ORL_SYSCALLTRACE(rtmp);				\
233	orl	T_POST_SYS_AST(t), rtmp;		\
234	cmpl	$0, rtmp
235
236/*
237 * Fix up the lwp, thread, and eflags for a successful return
238 *
 * In order: the lwp pointer is fetched into %esi, the argument area is
 * popped, t_sysnum is cleared by storing zwreg, lwp_state goes back to
 * LWP_USER, and the carry bit is cleared in the saved EFLAGS image.
 *
 * The carry bit is cleared with a byte-wide andb on the low byte of the
 * saved EFLAGS, so the mask is built as a byte-sized constant
 * (0xff - PS_C) rather than relying on the assembler to truncate a
 * wider value.
 *
239 * Preconditions:
240 * -	zwreg contains zero
 * -	t is a register holding curthread
241 * Postconditions:
242 * -	%esp has been unSYS_DROPped
243 * -	%esi is smashed (points to lwp)
244 */
245#define	SIMPLE_SYSCALL_POSTSYS(t, zwreg)		\
246	movl	T_LWP(t), %esi;				\
247	addl	$SYS_DROP, %esp;			\
248	movw	zwreg, T_SYSNUM(t);			\
249	movb	$LWP_USER, LWP_STATE(%esi);		\
250	andb	$_CONST(0xff - PS_C), REGOFF_EFL(%esp)
251
252/*
253 * System call handler.  This is the destination of both the call
254 * gate (lcall 0x27) _and_ the interrupt gate (int 0x91). For our purposes,
255 * there are two significant differences between an interrupt gate and a call
256 * gate:
257 *
258 * 1) An interrupt gate runs the handler with interrupts disabled, whereas a
259 * call gate runs the handler with whatever EFLAGS settings were in effect at
260 * the time of the call.
261 *
262 * 2) An interrupt gate pushes the contents of the EFLAGS register at the time
263 * of the interrupt onto the stack, whereas a call gate does not.
264 *
265 * Because we use the following code sequence to handle system calls made from
266 * _both_ a call gate _and_ an interrupt gate, these two differences must be
267 * respected. In regards to number 1) above, the handler must ensure that a sane
268 * EFLAGS snapshot is stored on the stack so that when the kernel returns back
269 * to the user via iret (which returns to user with the EFLAGS value saved on
270 * the stack), interrupts are re-enabled.
271 *
272 * In regards to number 2) above, the handler must always put a current snapshot
273 * of EFLAGS onto the stack in the appropriate place. If we came in via an
274 * interrupt gate, we will be clobbering the EFLAGS value that was pushed by
275 * the interrupt gate. This is OK, as the only bit that was changed by the
276 * hardware was the IE (interrupt enable) bit, which for an interrupt gate is
277 * now off. If we were to do nothing, the stack would contain an EFLAGS with
278 * IE off, resulting in us eventually returning back to the user with interrupts
279 * disabled. The solution is to turn on the IE bit in the EFLAGS value saved on
280 * the stack.
281 *
282 * Another subtlety which deserves mention is the difference between the two
283 * descriptors. The call gate descriptor is set to instruct the hardware to copy
284 * one parameter from the user stack to the kernel stack, whereas the interrupt
285 * gate descriptor doesn't use the parameter passing mechanism at all. The
286 * kernel doesn't actually use the parameter that is copied by the hardware; the
287 * only reason it does this is so that there is a space on the stack large
288 * enough to hold an EFLAGS register value, which happens to be in the correct
289 * place for use by iret when we go back to userland. How convenient.
290 *
291 * Stack frame description in syscall() and callees.
292 *
293 * |------------|
294 * | regs	| +(8*4)+4	registers
295 * |------------|
296 * | 8 args	| <- %esp	MAXSYSARGS (currently 8) arguments
297 * |------------|
298 *
299 */
/*
 * Size in bytes of the argument area that sits below the saved registers
 * on the kernel stack: MAXSYSARGS slots of 4 bytes each.
 */
300#define	SYS_DROP	_CONST(_MUL(MAXSYSARGS, 4))
301
302#if defined(__lint)
303
304/*ARGSUSED*/
305void
306sys_call()
307{}
308
309void
310_allsyscalls()
311{}
312
313size_t _allsyscalls_size;
314
315#else	/* __lint */
316
	/*
	 * sys_call: entry point for the lcall and int system call gates.
	 *
	 * Register roles once the frame has been built:
	 *	%eax	syscall number, then the sysent pointer, then the
	 *		first value returned by the handler
	 *	%ebx	curthread (loaded from %gs:CPU_THREAD)
	 *	%ebp	copy of %esp taken when the frame was complete
	 *
	 * Flow: fast pre-sys check -> handler through SY_CALLC -> fast
	 * post-sys check -> return through set_user_regs.  Whatever the
	 * fast checks reject goes through syscall_entry()/syscall_exit()
	 * at _full_syscall_presys/_full_syscall_postsys.  A fault while
	 * copying in the arguments lands at _syscall_fault.
	 *
	 * _watch_do_syscall is also the jump target of watch_syscall,
	 * which reloads %esp and %eax itself before jumping here.
	 */
317	ENTRY_NP2(sys_call, _allsyscalls)
318
319	/ on entry	eax = system call number
320	/ set up the stack to look as in reg.h
321	subl    $8, %esp        / pad the stack with ERRCODE and TRAPNO
322
323	SYSCALL_PUSH
324
325#ifdef TRAPTRACE
326	TRACE_PTR(%edi, %ebx, %ebx, %ecx, $TT_SYSCALL) / Uses labels "8" and "9"
327	TRACE_REGS(%edi, %esp, %ebx, %ecx)	/ Uses label "9"
328	pushl	%eax
329	TRACE_STAMP(%edi)		/ Clobbers %eax, %edx, uses "9"
330	popl	%eax
331	movl	%eax, TTR_SYSNUM(%edi)
332#endif
333
334_watch_do_syscall:
335	movl	%esp, %ebp			/ remember where the frame is
336
337	pushl	%eax				/ preserve across mstate call
338	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
339	popl	%eax
340
341	movl	%gs:CPU_THREAD, %ebx		/ %ebx = curthread from here on
342
343	/ Interrupts are enabled here, so we must make sure this thread doesn't
344	/ migrate off the CPU while it updates the CPU stats.
345	addb	$1, T_PREEMPT(%ebx)
346	CPU_STATS_SYS_SYSCALL_INC
347	subb	$1, T_PREEMPT(%ebx)
348
349	/ Set EFLAGS to standard kernel settings.
350	ENABLE_INTR_FLAGS
351
352	ASSERT_LWPTOREGS(%ebx, %esp)
353
354	CHECK_PRESYS_NE(%ebx, %eax)
355	jne	_full_syscall_presys		/ fast path is not possible
356	SIMPLE_SYSCALL_PRESYS(%ebx, _syscall_fault)
357
358_syslcall_call:
359	call	*SY_CALLC(%eax)			/ indirect call via the sysent entry
360
361_syslcall_done:
362	CHECK_POSTSYS_NE(%ebx, %ecx)
363	jne	_full_syscall_postsys		/ post-syscall work is pending
364	SIMPLE_SYSCALL_POSTSYS(%ebx, %cx)
365	movl	%eax, REGOFF_EAX(%esp)		/ both return values go back
366	movl	%edx, REGOFF_EDX(%esp)		/ through the saved registers
367
368	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
369
370	/
371	/ get back via iret
372	/
373	cli
374	jmp	set_user_regs
375
	/ Slow entry: hand the thread and the argument area to syscall_entry.
	/ NOTE(review): presumably syscall_entry returns the sysent pointer
	/ in %eax, since _syslcall_call calls through it - confirm.
376_full_syscall_presys:
377	movl	T_LWP(%ebx), %esi
378	subl	$SYS_DROP, %esp
379	movb	$LWP_SYS, LWP_STATE(%esi)
380	pushl	%esp				/ arg 2: start of the argument area
381	pushl	%ebx				/ arg 1: curthread
382	call	syscall_entry
383	addl	$8, %esp			/ pop both arguments
384	jmp	_syslcall_call
385
	/ Slow exit: hand the thread and both return values to syscall_exit.
386_full_syscall_postsys:
387	addl	$SYS_DROP, %esp			/ release the argument area
388	pushl	%edx				/ arg 3: second return value
389	pushl	%eax				/ arg 2: first return value
390	pushl	%ebx				/ arg 1: curthread
391	call	syscall_exit
392	addl	$12, %esp			/ pop the three arguments
393	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
394	jmp	sys_rtt_syscall
395
	/ Reached as the fault label passed to the argument copy above.
396_syscall_fault:
397	push	$0xe			/ EFAULT
398	call	set_errno
399	addl	$4, %esp		/ pop the argument
400	xorl	%eax, %eax		/ fake syscall_err()
401	xorl	%edx, %edx
402	jmp	_syslcall_done
403	SET_SIZE(sys_call)
404
405#endif	/* __lint */
406
407/*
408 * System call handler via the sysenter instruction
409 *
410 * Here's how syscall entry usually works (see sys_call for details).
411 *
412 * There, the caller (lcall or int) in userland has arranged that:
413 *
414 * -	%eax contains the syscall number
415 * -	the user stack contains the args to the syscall
416 *
417 * Normally the lcall instruction into the call gate causes the processor
418 * to push %ss, %esp, <top-of-stack>, %cs, %eip onto the kernel stack.
419 * The sys_call handler then leaves space for r_trapno and r_err, and
420 * pusha's {%eax, %ecx, %edx, %ebx, %esp, %ebp, %esi, %edi}, followed
421 * by %ds, %es, %fs and %gs to capture a 'struct regs' on the stack.
422 * Then the kernel sets %ds, %es and %gs to kernel selectors, and finally
423 * extracts %efl and puts it into r_efl (which happens to live at the offset
424 * that <top-of-stack> was copied into). Note that the value in r_efl has
425 * the IF (interrupt enable) flag turned on. (The int instruction into the
426 * interrupt gate does essentially the same thing, only instead of
427 * <top-of-stack> we get eflags - see comment above.)
428 *
429 * In the sysenter case, things are a lot more primitive.
430 *
431 * The caller in userland has arranged that:
432 *
433 * -	%eax contains the syscall number
434 * -	%ecx contains the user %esp
435 * -	%edx contains the return %eip
436 * -	the user stack contains the args to the syscall
437 *
438 * e.g.
439 *	<args on the stack>
440 *	mov	$SYS_callnum, %eax
441 *	mov	$1f, %edx	/ return %eip
442 *	mov	%esp, %ecx	/ return %esp
443 *	sysenter
444 * 1:
445 *
446 * Hardware and (privileged) initialization code have arranged that by
447 * the time the sysenter instruction completes:
448 *
449 * - %eip is pointing to sys_sysenter (below).
450 * - %cs and %ss are set to kernel text and stack (data) selectors.
451 * - %esp is pointing at the lwp's stack
452 * - Interrupts have been disabled.
453 *
454 * The task for the sysenter handler is:
455 *
456 * -	recreate the same regs structure on the stack and the same
457 *	kernel state as if we'd come in on an lcall
458 * -	do the normal work of a syscall
459 * -	execute the system call epilogue, use sysexit to return to userland.
460 *
461 * Note that we are unable to return both "rvals" to userland with this
462 * call, as %edx is used by the sysexit instruction.
463 */
464#if defined(__lint)
465
466void
467sys_sysenter()
468{}
469
470#else	/* __lint */
471
	/*
	 * sys_sysenter: system call entry reached via sysenter.
	 *
	 * Only the fast path (labels _sysenter_call/_sysenter_done) is
	 * local to this handler and leaves with sysexit.  The slow paths
	 * _full_syscall_presys, _full_syscall_postsys and _syscall_fault
	 * are the ones inside sys_call; control does not come back here
	 * from them.
	 */
472	ENTRY_NP(sys_sysenter)
473	/
474	/ do what the call gate would've done to the stack ..
475	/
476	pushl	$UDS_SEL	/ (really %ss, but it's the same ..)
477	pushl	%ecx		/ userland makes this a copy of %esp
478	pushfl
479	orl	$PS_IE, (%esp)	/ turn interrupts on when we return to user
480	pushl	$UCS_SEL
481	pushl	%edx		/ userland makes this a copy of %eip
482	/
483	/ done.  finish building the stack frame
484	/
485	subl	$8, %esp	/ leave space for ERR and TRAPNO
486
487	SYSENTER_PUSH
488
489#ifdef TRAPTRACE
490	TRACE_PTR(%edi, %ebx, %ebx, %ecx, $TT_SYSENTER)	/ uses labels 8 and 9
491	TRACE_REGS(%edi, %esp, %ebx, %ecx)		/ uses label 9
492	pushl	%eax
493	TRACE_STAMP(%edi)		/ clobbers %eax, %edx, uses label 9
494	popl	%eax
495	movl	%eax, TTR_SYSNUM(%edi)
496#endif
497	movl	%esp, %ebp			/ remember where the frame is
498
	/ Interrupts are still off at this point (see the block comment
	/ above), so the statistics update needs no further protection.
499	CPU_STATS_SYS_SYSCALL_INC
500
501	ENABLE_INTR_FLAGS
502
503	pushl	%eax				/ preserve across mstate call
504	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
505	popl	%eax
506
507	movl	%gs:CPU_THREAD, %ebx		/ %ebx = curthread from here on
508
509	ASSERT_LWPTOREGS(%ebx, %esp)
510
511	CHECK_PRESYS_NE(%ebx, %eax)
512	jne	_full_syscall_presys		/ shared slow path in sys_call
513	SIMPLE_SYSCALL_PRESYS(%ebx, _syscall_fault)
514
515_sysenter_call:
516	call	*SY_CALLC(%eax)			/ indirect call via the sysent entry
517
518_sysenter_done:
519	CHECK_POSTSYS_NE(%ebx, %ecx)
520	jne	_full_syscall_postsys		/ shared slow path in sys_call
521	SIMPLE_SYSCALL_POSTSYS(%ebx, %cx)
522	/
523	/ sysexit uses %edx to restore %eip, so we can't use it
524	/ to return a value, sigh.
525	/
526	movl	%eax, REGOFF_EAX(%esp)
527	/ movl	%edx, REGOFF_EDX(%esp)
528
529	/ Interrupts will be turned on by the 'sti' executed just before
530	/ sysexit. The following ensures that restoring the user's EFLAGS
531	/ doesn't enable interrupts too soon.
532	andl	$_BITNOT(PS_IE), REGOFF_EFL(%esp)
533
534	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
535
536	cli
537
538	SYSCALL_POP
539
540	popl	%edx			/ sysexit: %edx -> %eip
541	addl	$4, %esp		/ get CS off the stack
542	popfl				/ EFL
543	popl	%ecx			/ sysexit: %ecx -> %esp
544	sti				/ takes effect after the next instruction
545	sysexit
546	SET_SIZE(sys_sysenter)
547
548/*
549 * Declare a uintptr_t which covers the entire pc range of syscall
550 * handlers for the stack walkers that need this.
 *
 * The value stored is the distance in bytes from the _allsyscalls label
 * (declared together with sys_call) to this point.
551 */
552	.align	CPTRSIZE
553	.globl	_allsyscalls_size
554	.type	_allsyscalls_size, @object
555_allsyscalls_size:
556	.NWORD	. - _allsyscalls
557	SET_SIZE(_allsyscalls_size)
558
559#endif	/* __lint */
560
561/*
562 * These are the thread context handlers for lwps using sysenter/sysexit.
563 */
564
565#if defined(__lint)
566
567/*ARGSUSED*/
568void
569sep_save(void *ksp)
570{}
571
572/*ARGSUSED*/
573void
574sep_restore(void *ksp)
575{}
576
577#else	/* __lint */
578
579	/*
580	 * sep_save: run as we switch away from this lwp.  Zeroing the
581	 * stack-pointer-on-sysenter MSR means that an lwp which
582	 * (somehow) missed its sep_restore enters the kernel on a NULL
583	 * stack rather than silently corrupting a preempted thread stack.
584	 */
585	ENTRY_NP(sep_save)
586	movl	$MSR_INTC_SEP_ESP, %ecx		/* MSR selected by %ecx */
587	xorl	%eax, %eax			/* low half of the value */
588	xorl	%edx, %edx			/* high half of the value */
589	wrmsr
590	ret
591	SET_SIZE(sep_save)
592
593	/*
594	 * sep_restore: run as we resume onto this cpu.  The single stack
	 * argument (the per-lwp kernel stack pointer) becomes the new
	 * stack-pointer-on-sysenter value.
595	 */
596	ENTRY_NP(sep_restore)
597	movl	$MSR_INTC_SEP_ESP, %ecx		/* MSR selected by %ecx */
598	xorl	%edx, %edx			/* high half is zero */
599	movl	4(%esp), %eax			/* low half: per-lwp kernel sp */
600	wrmsr
601	ret
602	SET_SIZE(sep_restore)
603
604#endif	/* __lint */
605
606/*
607 * Call syscall().  Called from trap() on watchpoint at lcall 0,7
 *
 * Loads curthread into %ebx, resets %esp from the thread's t_stack,
 * reloads the syscall number from the saved %eax slot found there, and
 * joins sys_call at _watch_do_syscall.  It never returns to its caller.
608 */
609
610#if defined(__lint)
611
612void
613watch_syscall(void)
614{}
615
616#else	/* __lint */
617
618	ENTRY_NP(watch_syscall)
619	movl	%gs:CPU_THREAD, %ebx
620	movl	T_STACK(%ebx), %esp		/ switch to the thread stack
621	movl	REGOFF_EAX(%esp), %eax		/ recover original syscall#
622	jmp	_watch_do_syscall		/ continue inside sys_call
623	SET_SIZE(watch_syscall)
624
625#endif	/* __lint */
626