/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/asm_linkage.h>
#include <sys/asm_misc.h>
#include <sys/regset.h>
#include <sys/psw.h>

#if defined(__lint)

#include <sys/types.h>
#include <sys/thread.h>
#include <sys/systm.h>

#else	/* __lint */

#include <sys/segments.h>
#include <sys/pcb.h>
#include <sys/trap.h>
#include <sys/ftrace.h>
#include <sys/traptrace.h>
#include <sys/clock.h>
#include <sys/model.h>
#include <sys/panic.h>
#include "assym.h"

#endif	/* __lint */

/*
 * We implement five flavours of system call entry points
 *
 * -	syscall/sysretq		(amd64 generic)
 * -	syscall/sysretl		(i386 plus SYSC bit)
 * -	sysenter/sysexit	(i386 plus SEP bit)
 * -	int/iret		(i386 generic)
 * -	lcall/iret		(i386 generic)
 *
 * The current libc included in Solaris uses int/iret as the base unoptimized
 * kernel entry method. Older libc implementations and legacy binaries may use
 * the lcall call gate, so it must continue to be supported.
 *
 * System calls that use an lcall call gate are processed in trap() via a
 * segment-not-present trap, i.e. lcalls are extremely slow(!).
 *
 * The basic pattern used in the 32-bit SYSC handler at this point in time is
 * to have the bare minimum of assembler, and get to the C handlers as
 * quickly as possible.
 *
 * The 64-bit handler is much closer to the sparcv9 handler; that's
 * because of passing arguments in registers.  The 32-bit world still
 * passes arguments on the stack -- that makes that handler substantially
 * more complex.
 *
 * The two handlers share a few code fragments which are broken
 * out into preprocessor macros below.
 *
 * XX64	come back and speed all this up later.  The 32-bit stuff looks
 * especially easy to speed up the argument copying part ..
 *
 *
 * Notes about segment register usage (c.f. the 32-bit kernel)
 *
 * In the 32-bit kernel, segment registers are dutifully saved and
 * restored on all mode transitions because the kernel uses them directly.
 * When the processor is running in 64-bit mode, segment registers are
 * largely ignored.
 *
 * %cs and %ss
 *	controlled by the hardware mechanisms that make mode transitions
 *
 * The remaining segment registers have to either be pointing at a valid
 * descriptor, i.e. with the 'present' bit set, or they can be NULL descriptors
 *
 * %ds and %es
 *	always ignored
 *
 * %fs and %gs
 *	fsbase and gsbase are used to control the place they really point at.
 *	The kernel only depends on %gs, and controls its own gsbase via swapgs
 *
 * Note that loading segment registers is still costly because the GDT
 * lookup still happens (this is because the hardware can't know that we're
 * not setting up these segment registers for a 32-bit program).  Thus we
 * avoid doing this in the syscall path, and defer them to lwp context switch
 * handlers, so the register values remain virtualized to the lwp.
 */

#if defined(SYSCALLTRACE)
#define	ORL_SYSCALLTRACE(r32)		\
	orl	syscalltrace(%rip), r32
#else
#define	ORL_SYSCALLTRACE(r32)
#endif

#define	MSTATE_TRANSITION(from, to)		\
	movl	$from, %edi;			\
	movl	$to, %esi;			\
	call	syscall_mstate
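
/*
 * Note that syscall_mstate() is an ordinary C call, so the usual
 * caller-saved registers (%rax, %rcx, %rdx, %rsi, %rdi, %r8-%r11) may be
 * clobbered across the transition; the handlers below reload the syscall
 * number from the saved regs afterwards for exactly this reason.
 */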

/*
 * Check to see if a simple (direct) return is possible i.e.
 *
 *	if ((t->t_post_sys_ast | syscalltrace |
 *	    (lwp->lwp_pcb.pcb_flags & RUPDATE_PENDING)) != 0)
 *		do full version	;
 *
 * Preconditions:
 * -	t is curthread
 * Postconditions:
 * -	condition code NE is set if post-sys is too complex
 * -	rtmp is zeroed if it isn't (we rely on this!)
 * -	ltmp is smashed
 */
#define	CHECK_POSTSYS_NE(t, ltmp, rtmp)			\
	movq	T_LWP(t), ltmp;				\
	movl	PCB_FLAGS(ltmp), rtmp;			\
	andl	$RUPDATE_PENDING, rtmp;			\
	ORL_SYSCALLTRACE(rtmp);				\
	orl	T_POST_SYS_AST(t), rtmp;		\
	cmpl	$0, rtmp

/*
 * Fix up the lwp, thread, and eflags for a successful return
 *
 * Preconditions:
 * -	zwreg contains zero
 */
#define	SIMPLE_SYSCALL_POSTSYS(t, lwp, zwreg)		\
	movb	$LWP_USER, LWP_STATE(lwp);		\
	movw	zwreg, T_SYSNUM(t);			\
	andb	$_CONST(0xffff - PS_C), REGOFF_RFL(%rsp)
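
/*
 * Roughly speaking, the simple post-syscall path above is equivalent to
 *
 *	lwp->lwp_state = LWP_USER;
 *	t->t_sysnum = 0;		(zwreg is known to contain zero)
 *	rp->r_rfl &= ~PS_C;		(carry clear => no error)
 *
 * Clearing the carry bit in the saved flags is how a successful return
 * is signalled to the userland syscall wrapper.
 */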

/*
 * ASSERT(lwptoregs(lwp) == rp);
 *
 * This may seem obvious, but very odd things happen if this
 * assertion is false
 *
 * Preconditions:
 *	(%rsp is ready for normal call sequence)
 * Postconditions (if assertion is true):
 *	%r11 is smashed
 *
 * ASSERT(rp->r_cs == descnum)
 *
 * The code selector is written into the regs structure when the
 * lwp stack is created.  We use this ASSERT to validate that
 * the regs structure really matches how we came in.
 *
 * Preconditions:
 *	(%rsp is ready for normal call sequence)
 * Postconditions (if assertion is true):
 *	-none-
 *
 * ASSERT((lwp->lwp_pcb.pcb_flags & RUPDATE_PENDING) == 0);
 *
 * If this is false, it means that we returned to userland without
 * updating the segment registers as we were supposed to.
 *
 * Note that we must ensure no interrupts or other traps intervene
 * between entering privileged mode and performing the assertion,
 * otherwise we may perform a context switch on the thread, which
 * will end up setting the RUPDATE_PENDING bit again.
 */
#if defined(DEBUG)

#if !defined(__lint)

__lwptoregs_msg:
	.string	"%M%:%d lwptoregs(%p) [%p] != rp [%p]"

__codesel_msg:
	.string	"%M%:%d rp->r_cs [%ld] != %ld"

__no_rupdate_msg:
	.string	"%M%:%d lwp %p, pcb_flags & RUPDATE_PENDING != 0"

#endif	/* !__lint */

#define	ASSERT_LWPTOREGS(lwp, rp)			\
	movq	LWP_REGS(lwp), %r11;			\
	cmpq	rp, %r11;				\
	je	7f;					\
	leaq	__lwptoregs_msg(%rip), %rdi;		\
	movl	$__LINE__, %esi;			\
	movq	lwp, %rdx;				\
	movq	%r11, %rcx;				\
	movq	rp, %r8;				\
	xorl	%eax, %eax;				\
	call	panic;					\
7:

#define	ASSERT_NO_RUPDATE_PENDING(lwp)			\
	testl	$RUPDATE_PENDING, PCB_FLAGS(lwp);	\
	je	8f;					\
	movq	lwp, %rdx;				\
	leaq	__no_rupdate_msg(%rip), %rdi;		\
	movl	$__LINE__, %esi;			\
	xorl	%eax, %eax;				\
	call	panic;					\
8:

#else
#define	ASSERT_LWPTOREGS(lwp, rp)
#define	ASSERT_NO_RUPDATE_PENDING(lwp)
#endif

/*
 * Do the traptrace thing and restore any registers we used
 * in situ.  Assumes that %rsp is pointing at the base of
 * the struct regs, obviously ..
 */
#ifdef TRAPTRACE
#define	SYSCALL_TRAPTRACE(ttype)				\
	TRACE_PTR(%rdi, %rbx, %ebx, %rcx, ttype);		\
	TRACE_REGS(%rdi, %rsp, %rbx, %rcx);			\
	TRACE_STAMP(%rdi);	/* rdtsc clobbers %eax, %edx */	\
	movq	REGOFF_RAX(%rsp), %rax;				\
	movq	REGOFF_RBX(%rsp), %rbx;				\
	movq	REGOFF_RCX(%rsp), %rcx;				\
	movq	REGOFF_RDX(%rsp), %rdx;				\
	movl	%eax, TTR_SYSNUM(%rdi);				\
	movq	REGOFF_RDI(%rsp), %rdi

#define	SYSCALL_TRAPTRACE32(ttype)				\
	SYSCALL_TRAPTRACE(ttype);				\
	/* paranoia: clean the top 32-bits of the registers */	\
	orl	%eax, %eax;					\
	orl	%ebx, %ebx;					\
	orl	%ecx, %ecx;					\
	orl	%edx, %edx;					\
	orl	%edi, %edi
#else	/* TRAPTRACE */
#define	SYSCALL_TRAPTRACE(ttype)
#define	SYSCALL_TRAPTRACE32(ttype)
#endif	/* TRAPTRACE */
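
/*
 * The "paranoia" orl instructions above rely on the fact that, in 64-bit
 * mode, writing a 32-bit result to a register zero-extends it into the
 * full 64-bit register, so or'ing a register with itself clears bits 63:32.
 */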

/*
 * The 64-bit libc syscall wrapper does this:
 *
 * fn(<args>)
 * {
 *	movq	%rcx, %r10	-- because syscall smashes %rcx
 *	movl	$CODE, %eax
 *	syscall
 *	<error processing>
 * }
 *
 * Thus when we come into the kernel:
 *
 *	%rdi, %rsi, %rdx, %r10, %r8, %r9 contain first six args
 *	%rax is the syscall number
 *	%r12-%r15 contain caller state
 *
 * The syscall instruction arranges that:
 *
 *	%rcx contains the return %rip
 *	%r11d contains bottom 32-bits of %rflags
 *	%rflags is masked (as determined by the SFMASK msr)
 *	%cs is set to UCS_SEL (as determined by the STAR msr)
 *	%ss is set to UDS_SEL (as determined by the STAR msr)
 *	%rip is set to sys_syscall (as determined by the LSTAR msr)
 *
 * Or in other words, we have no registers available at all.
 * Only swapgs can save us!
 */
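
/*
 * swapgs exchanges the current GS base with the value held in the
 * KernelGSBase MSR, which the kernel keeps pointing at this cpu's cpu_t;
 * that is what makes the %gs:CPU_* references below work before we have
 * any free registers to play with.
 */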

#if defined(__lint)

/*ARGSUSED*/
void
sys_syscall()
{}

void
_allsyscalls()
{}

size_t _allsyscalls_size;

#else	/* __lint */

	ENTRY_NP2(sys_syscall,_allsyscalls)

	swapgs
	movq	%rsp, %gs:CPU_RTMP_RSP
	movq	%r15, %gs:CPU_RTMP_R15
	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp

	movl	$UCS_SEL, REGOFF_CS(%rsp)
	movq	%rcx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
	movl	$UDS_SEL, REGOFF_SS(%rsp)

	movl	%eax, %eax			/* wrapper: sysc# -> %eax */
	movq	%rdi, REGOFF_RDI(%rsp)
	movq	%rsi, REGOFF_RSI(%rsp)
	movq	%rdx, REGOFF_RDX(%rsp)
	movq	%r10, REGOFF_RCX(%rsp)		/* wrapper: %rcx -> %r10 */
	movq	%r10, %rcx			/* arg[3] for direct calls */

	movq	%r8, REGOFF_R8(%rsp)
	movq	%r9, REGOFF_R9(%rsp)
	movq	%rax, REGOFF_RAX(%rsp)
	movq	%rbx, REGOFF_RBX(%rsp)

	movq	%rbp, REGOFF_RBP(%rsp)
	movq	%r10, REGOFF_R10(%rsp)
	movq	%gs:CPU_RTMP_RSP, %r11
	movq	%r11, REGOFF_RSP(%rsp)
	movq	%r12, REGOFF_R12(%rsp)

	movq	%r13, REGOFF_R13(%rsp)
	movq	%r14, REGOFF_R14(%rsp)
	movq	%gs:CPU_RTMP_R15, %r10
	movq	%r10, REGOFF_R15(%rsp)
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * and capturing them involves two serializing instructions,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * Machine state saved in the regs structure on the stack
	 * First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE($TT_SYSC64)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	movb	$LWP_SYS, LWP_STATE(%r14)
	incq	LWP_RU_SYSC(%r14)
	movb	$NORMALRETURN, LWP_EOSYS(%r14)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	movw	%ax, T_SYSNUM(%r15)
	movzbl	T_PRE_SYS(%r15), %ebx
	ORL_SYSCALLTRACE(%ebx)
	testl	%ebx, %ebx
	jne	_syscall_pre

_syscall_invoke:
	movq	REGOFF_RDI(%rbp), %rdi
	movq	REGOFF_RSI(%rbp), %rsi
	movq	REGOFF_RDX(%rbp), %rdx
	movq	REGOFF_RCX(%rbp), %rcx
	movq	REGOFF_R8(%rbp), %r8
	movq	REGOFF_R9(%rbp), %r9

	cmpl	$NSYSCALL, %eax
	jae	_syscall_ill
	shll	$SYSENT_SIZE_SHIFT, %eax
	leaq	sysent(%rax), %rbx

	call	*SY_CALLC(%rbx)

	movq	%rax, %r12
	movq	%rdx, %r13

	/*
	 * If the handler returns two ints, then we need to split the
	 * 64-bit return value into two 32-bit values.
	 */
	testw	$SE_32RVAL2, SY_FLAGS(%rbx)
	je	5f
	movq	%r12, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%r12d, %r12d	/* lower 32-bits into %eax */
5:
	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	cli
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_syscall_post
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	movq	%r12, REGOFF_RAX(%rsp)
	movq	%r13, REGOFF_RDX(%rsp)

	/*
	 * To get back to userland, we need the return %rip in %rcx and
	 * the return %rfl in %r11d.  The sysretq instruction also arranges
	 * to fix up %cs and %ss; everything else is our responsibility.
	 */
	movq	REGOFF_RDI(%rsp), %rdi
	movq	REGOFF_RSI(%rsp), %rsi
	movq	REGOFF_RDX(%rsp), %rdx
	/* %rcx used to restore %rip value */

	movq	REGOFF_R8(%rsp), %r8
	movq	REGOFF_R9(%rsp), %r9
	movq	REGOFF_RAX(%rsp), %rax
	movq	REGOFF_RBX(%rsp), %rbx

	movq	REGOFF_RBP(%rsp), %rbp
	movq	REGOFF_R10(%rsp), %r10
	/* %r11 used to restore %rfl value */
	movq	REGOFF_R12(%rsp), %r12

	movq	REGOFF_R13(%rsp), %r13
	movq	REGOFF_R14(%rsp), %r14
	movq	REGOFF_R15(%rsp), %r15

	movq	REGOFF_RIP(%rsp), %rcx
	movl	REGOFF_RFL(%rsp), %r11d
	movq	REGOFF_RSP(%rsp), %rsp
	swapgs
	sysretq

_syscall_pre:
	call	pre_syscall
	movl	%eax, %r12d
	testl	%eax, %eax
	jne	_syscall_post_call
	/*
	 * Didn't abort, so reload the syscall args and invoke the handler.
	 */
	movzwl	T_SYSNUM(%r15), %eax
	jmp	_syscall_invoke

_syscall_ill:
	call	nosys
	movq	%rax, %r12
	movq	%rdx, %r13
	jmp	_syscall_post_call

_syscall_post:
	sti
	/*
	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
	 * so that we can account for the extra work it takes us to finish.
	 */
	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
_syscall_post_call:
	movq	%r12, %rdi
	movq	%r13, %rsi
	call	post_syscall
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
	jmp	sys_rtt_syscall
	SET_SIZE(sys_syscall)

#endif	/* __lint */

#if defined(__lint)

/*ARGSUSED*/
void
sys_syscall32()
{}

#else	/* __lint */

	ENTRY_NP(sys_syscall32)
	swapgs
	movl	%esp, %r10d
	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp
	movl	%eax, %eax

	movl	$U32CS_SEL, REGOFF_CS(%rsp)
	movl	%ecx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
	movq	%r10, REGOFF_RSP(%rsp)
	movl	$UDS_SEL, REGOFF_SS(%rsp)

_syscall32_save:

	movl	%edi, REGOFF_RDI(%rsp)
	movl	%esi, REGOFF_RSI(%rsp)
	movl	%ebp, REGOFF_RBP(%rsp)
	movl	%ebx, REGOFF_RBX(%rsp)
	movl	%edx, REGOFF_RDX(%rsp)
	movl	%ecx, REGOFF_RCX(%rsp)
	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * Application state saved in the regs structure on the stack
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE32($TT_SYSC)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	/*
	 * Make some space for MAXSYSARGS (currently 8) 32-bit args placed
	 * into 64-bit (long) arg slots, maintaining 16 byte alignment.  Or
	 * more succinctly:
	 *
	 *	SA(MAXSYSARGS * sizeof (long)) == 64
	 */
#define	SYS_DROP	64			/* drop for args */
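	/*
	 * The arithmetic, spelled out: with MAXSYSARGS at 8 and
	 * sizeof (long) equal to 8 on amd64, 8 * 8 = 64 bytes, which is
	 * already a multiple of 16, so SA() doesn't round it up further.
	 */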
	subq	$SYS_DROP, %rsp
	movb	$LWP_SYS, LWP_STATE(%r14)
	movq	%r15, %rdi
	movq	%rsp, %rsi
	call	syscall_entry

	/*
	 * Fetch the arguments copied onto the kernel stack and put
	 * them in the right registers to invoke a C-style syscall handler.
	 * %rax contains the handler address.
	 *
	 * Ideas for making all this go faster of course include simply
	 * forcibly fetching 6 arguments from the user stack under lofault
	 * protection, reverting to copyin_args only when watchpoints
	 * are in effect.
	 *
	 * (If we do this, make sure that exec and libthread leave
	 * enough space at the top of the stack to ensure that we'll
	 * never do a fetch from an invalid page.)
	 *
	 * Lots of ideas here, but they won't really help with bringup B-)
	 * Correctness can't wait, performance can wait a little longer ..
	 */

	movq	%rax, %rbx
	movl	0(%rsp), %edi
	movl	8(%rsp), %esi
	movl	0x10(%rsp), %edx
	movl	0x18(%rsp), %ecx
	movl	0x20(%rsp), %r8d
	movl	0x28(%rsp), %r9d

	call	*SY_CALLC(%rbx)

	movq	%rbp, %rsp	/* pop the args */

	/*
	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
	 * On the 32-bit kernel, they always return that value in %eax:%edx
	 * as required by the 32-bit ABI.
	 *
	 * Simulate the same behaviour by unconditionally splitting the
	 * return value in the same way.
	 */
	movq	%rax, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%eax, %r12d	/* lower 32-bits into %eax */

	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	cli
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_full_syscall_postsys32
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	/*
	 * To get back to userland, we need to put the return %rip in %rcx and
	 * the return %rfl in %r11d.  The sysret instruction also arranges
	 * to fix up %cs and %ss; everything else is our responsibility.
	 */

	movl	%r12d, %eax			/* %eax: rval1 */
	movl	REGOFF_RBX(%rsp), %ebx
	/* %ecx used for return pointer */
	movl	%r13d, %edx			/* %edx: rval2 */
	movl	REGOFF_RBP(%rsp), %ebp
	movl	REGOFF_RSI(%rsp), %esi
	movl	REGOFF_RDI(%rsp), %edi

	movl	REGOFF_RFL(%rsp), %r11d		/* %r11 -> eflags */
	movl	REGOFF_RIP(%rsp), %ecx		/* %ecx -> %eip */
	movl	REGOFF_RSP(%rsp), %esp

	swapgs
	sysretl

_full_syscall_postsys32:
	sti
	/*
	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
	 * so that we can account for the extra work it takes us to finish.
	 */
	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movq	%r15, %rdi
	movq	%r12, %rsi			/* rval1 - %eax */
	movq	%r13, %rdx			/* rval2 - %edx */
	call	syscall_exit
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
	jmp	sys_rtt_syscall32
	SET_SIZE(sys_syscall32)

#endif	/* __lint */

/*
 * System call handler via the sysenter instruction
 * Used only for 32-bit system calls on the 64-bit kernel.
 *
 * The caller in userland has arranged that:
 *
 * -	%eax contains the syscall number
 * -	%ecx contains the user %esp
 * -	%edx contains the return %eip
 * -	the user stack contains the args to the syscall
 *
 * Hardware and (privileged) initialization code have arranged that by
 * the time the sysenter instruction completes:
 *
 * - %rip is pointing to sys_sysenter (below).
 * - %cs and %ss are set to kernel text and stack (data) selectors.
 * - %rsp is pointing at the lwp's stack
 * - interrupts have been disabled.
 *
 * Note that we are unable to return both "rvals" to userland with
 * this call, as %edx is used by the sysexit instruction.
 */
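
/*
 * For illustration only (the real libc wrapper is more involved), a
 * 32-bit caller might set things up roughly like this; the label,
 * register loads and syscall number below are purely hypothetical:
 *
 *	movl	$SYS_write, %eax	/ syscall number
 *	movl	%esp, %ecx		/ user stack, where the args live
 *	movl	$1f, %edx		/ return %eip
 *	sysenter
 * 1:
 */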
#if defined(__lint)

void
sys_sysenter()
{}

#else	/* __lint */

	ENTRY_NP(sys_sysenter)
	swapgs
	ALTENTRY(_sys_sysenter_post_swapgs)
	movq	%gs:CPU_THREAD, %r15

	movl	$U32CS_SEL, REGOFF_CS(%rsp)
	movl	%ecx, REGOFF_RSP(%rsp)		/* wrapper: %esp -> %ecx */
	movl	%edx, REGOFF_RIP(%rsp)		/* wrapper: %eip -> %edx */
	pushfq
	popq	%r10
	movl	$UDS_SEL, REGOFF_SS(%rsp)

	/*
	 * Set the interrupt flag before storing the flags to the
	 * flags image on the stack so we can return to user with
	 * interrupts enabled if we return via sys_rtt_syscall32
	 */
	orq	$PS_IE, %r10
	movq	%r10, REGOFF_RFL(%rsp)

	movl	%edi, REGOFF_RDI(%rsp)
	movl	%esi, REGOFF_RSI(%rsp)
	movl	%ebp, REGOFF_RBP(%rsp)
	movl	%ebx, REGOFF_RBX(%rsp)
	movl	%edx, REGOFF_RDX(%rsp)
	movl	%ecx, REGOFF_RCX(%rsp)
	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * Application state saved in the regs structure on the stack
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE($TT_SYSENTER)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	/*
	 * Catch 64-bit process trying to issue sysenter instruction
	 * on Nocona based systems.
	 */
	movq	LWP_PROCP(%r14), %rax
	cmpq	$DATAMODEL_ILP32, P_MODEL(%rax)
	je	7f

	/*
	 * For a non-32-bit process, simulate a #ud, since that's what
	 * native hardware does.  The traptrace entry (above) will
	 * let you know what really happened.
	 */
	movq	$T_ILLINST, REGOFF_TRAPNO(%rsp)
	movq	REGOFF_CS(%rsp), %rdi
	movq	%rdi, REGOFF_ERR(%rsp)
	movq	%rsp, %rdi
	movq	REGOFF_RIP(%rsp), %rsi
	movl	%gs:CPU_ID, %edx
	call	trap
	jmp	_sys_rtt
7:

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate calls) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	/*
	 * Make some space for MAXSYSARGS (currently 8) 32-bit args
	 * placed into 64-bit (long) arg slots, plus one 64-bit
	 * (long) arg count, maintaining 16 byte alignment.
	 */
	subq	$SYS_DROP, %rsp
	movb	$LWP_SYS, LWP_STATE(%r14)
	movq	%r15, %rdi
	movq	%rsp, %rsi
	call	syscall_entry

	/*
	 * Fetch the arguments copied onto the kernel stack and put
	 * them in the right registers to invoke a C-style syscall handler.
	 * %rax contains the handler address.
	 */
	movq	%rax, %rbx
	movl	0(%rsp), %edi
	movl	8(%rsp), %esi
	movl	0x10(%rsp), %edx
	movl	0x18(%rsp), %ecx
	movl	0x20(%rsp), %r8d
	movl	0x28(%rsp), %r9d

	call	*SY_CALLC(%rbx)

	movq	%rbp, %rsp	/* pop the args */

	/*
	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
	 * On the 32-bit kernel, they always return that value in %eax:%edx
	 * as required by the 32-bit ABI.
	 *
	 * Simulate the same behaviour by unconditionally splitting the
	 * return value in the same way.
	 */
	movq	%rax, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%eax, %r12d	/* lower 32-bits into %eax */

	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	cli
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_full_syscall_postsys32
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	/*
	 * To get back to userland, load up the 32-bit registers and
	 * sysexit back where we came from.
	 */

	/*
	 * Interrupts will be turned on by the 'sti' executed just before
	 * sysexit.  The following ensures that restoring the user's rflags
	 * doesn't enable interrupts too soon.
	 */
	andq	$_BITNOT(PS_IE), REGOFF_RFL(%rsp)

	/*
	 * (There's no point in loading up %edx because the sysexit
	 * mechanism smashes it.)
	 */
	movl	%r12d, %eax
	movl	REGOFF_RBX(%rsp), %ebx
	movl	REGOFF_RBP(%rsp), %ebp
	movl	REGOFF_RSI(%rsp), %esi
	movl	REGOFF_RDI(%rsp), %edi

	movl	REGOFF_RIP(%rsp), %edx	/* sysexit: %edx -> %eip */
	pushq	REGOFF_RFL(%rsp)
	popfq
	movl	REGOFF_RSP(%rsp), %ecx	/* sysexit: %ecx -> %esp */
	swapgs
	sti
	sysexit
	SET_SIZE(sys_sysenter)
	SET_SIZE(_sys_sysenter_post_swapgs)

#endif	/* __lint */


/*
 * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by
 * the generic i386 libc to do system calls. We do a small amount of setup
 * before jumping into the existing sys_syscall32 path.
 */
#if defined(__lint)

/*ARGSUSED*/
void
sys_syscall_int()
{}

#else	/* __lint */

	ENTRY_NP(sys_syscall_int)
	swapgs
	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp
	movl	%eax, %eax
	/*
	 * Set t_post_sys on this thread to force ourselves out via the slow
	 * path. It might be possible at some later date to optimize this out
	 * and use a faster return mechanism.
	 */
	movb	$1, T_POST_SYS(%r15)
	jmp	_syscall32_save
	SET_SIZE(sys_syscall_int)

#endif	/* __lint */

/*
 * Legacy 32-bit applications and old libc implementations do lcalls;
 * we should never get here because the LDT entry containing the syscall
 * segment descriptor has the "segment present" bit cleared, which means
 * we end up processing those system calls in trap() via a not-present trap.
 *
 * We do it this way because a call gate unhelpfully does -nothing- to the
 * interrupt flag bit, so an interrupt can run us just after the lcall
 * completes, but just before the swapgs takes effect.  Thus the INTR_PUSH and
 * INTR_POP paths would have to be slightly more complex to dance around
 * this problem, and end up depending explicitly on the first
 * instruction of this handler being either swapgs or cli.
 */

#if defined(__lint)

/*ARGSUSED*/
void
sys_lcall32()
{}

#else	/* __lint */

	ENTRY_NP(sys_lcall32)
	swapgs
	pushq	$0
	pushq	%rbp
	movq	%rsp, %rbp
	leaq	__lcall_panic_str(%rip), %rdi
	xorl	%eax, %eax
	call	panic
	SET_SIZE(sys_lcall32)

__lcall_panic_str:
	.string	"sys_lcall32: shouldn't be here!"

/*
 * Declare a uintptr_t which covers the entire pc range of syscall
 * handlers for the stack walkers that need this.
 */
	.align	CPTRSIZE
	.globl	_allsyscalls_size
	.type	_allsyscalls_size, @object
_allsyscalls_size:
	.NWORD	. - _allsyscalls
	SET_SIZE(_allsyscalls_size)

#endif	/* __lint */

/*
 * These are the thread context handlers for lwps using sysenter/sysexit.
 */

#if defined(__lint)

/*ARGSUSED*/
void
sep_save(void *ksp)
{}

/*ARGSUSED*/
void
sep_restore(void *ksp)
{}

#else	/* __lint */

	/*
	 * setting this value to zero as we switch away causes the
	 * stack-pointer-on-sysenter to be NULL, ensuring that we
	 * don't silently corrupt another (preempted) thread stack
	 * when running an lwp that (somehow) didn't get sep_restore'd
	 */
	ENTRY_NP(sep_save)
	xorl	%edx, %edx
	xorl	%eax, %eax
	movl	$MSR_INTC_SEP_ESP, %ecx
	wrmsr
	ret
	SET_SIZE(sep_save)

	/*
	 * Update the kernel stack pointer as we resume onto this cpu.
	 */
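	/*
	 * wrmsr takes the MSR index in %ecx and the value to write in
	 * %edx:%eax, which is why the 64-bit kernel stack pointer passed
	 * in %rdi is split into its high and low halves below.
	 */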
	ENTRY_NP(sep_restore)
	movq	%rdi, %rdx
	shrq	$32, %rdx
	movl	%edi, %eax
	movl	$MSR_INTC_SEP_ESP, %ecx
	wrmsr
	ret
	SET_SIZE(sep_restore)

#endif	/* __lint */
