1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <sys/asm_linkage.h>
27#include <sys/asm_misc.h>
28#include <sys/regset.h>
29#include <sys/privregs.h>
30#include <sys/psw.h>
31#include <sys/machbrand.h>
32
33#if defined(__lint)
34
35#include <sys/types.h>
36#include <sys/thread.h>
37#include <sys/systm.h>
38
39#else	/* __lint */
40
41#include <sys/segments.h>
42#include <sys/pcb.h>
43#include <sys/trap.h>
44#include <sys/ftrace.h>
45#include <sys/traptrace.h>
46#include <sys/clock.h>
47#include <sys/model.h>
48#include <sys/panic.h>
49
50#if defined(__xpv)
51#include <sys/hypervisor.h>
52#endif
53
54#include "assym.h"
55
56#endif	/* __lint */
57
58/*
59 * We implement five flavours of system call entry points
60 *
61 * -	syscall/sysretq		(amd64 generic)
62 * -	syscall/sysretl		(i386 plus SYSC bit)
63 * -	sysenter/sysexit	(i386 plus SEP bit)
64 * -	int/iret		(i386 generic)
65 * -	lcall/iret		(i386 generic)
66 *
67 * The current libc included in Solaris uses int/iret as the base unoptimized
68 * kernel entry method. Older libc implementations and legacy binaries may use
69 * the lcall call gate, so it must continue to be supported.
70 *
71 * System calls that use an lcall call gate are processed in trap() via a
72 * segment-not-present trap, i.e. lcalls are extremely slow(!).
73 *
74 * The basic pattern used in the 32-bit SYSC handler at this point in time is
75 * to have the bare minimum of assembler, and get to the C handlers as
76 * quickly as possible.
77 *
78 * The 64-bit handler is much closer to the sparcv9 handler; that's
79 * because of passing arguments in registers.  The 32-bit world still
80 * passes arguments on the stack -- that makes that handler substantially
81 * more complex.
82 *
83 * The two handlers share a few code fragments which are broken
84 * out into preprocessor macros below.
85 *
86 * XX64	come back and speed all this up later.  The argument copying in
87 * the 32-bit path looks especially easy to speed up ..
88 *
89 *
90 * Notes about segment register usage (c.f. the 32-bit kernel)
91 *
92 * In the 32-bit kernel, segment registers are dutifully saved and
93 * restored on all mode transitions because the kernel uses them directly.
94 * When the processor is running in 64-bit mode, segment registers are
95 * largely ignored.
96 *
97 * %cs and %ss
98 *	controlled by the hardware mechanisms that make mode transitions
99 *
100 * The remaining segment registers have to either be pointing at a valid
101 * descriptor, i.e. with the 'present' bit set, or they can be NULL descriptors
102 *
103 * %ds and %es
104 *	always ignored
105 *
106 * %fs and %gs
107 *	fsbase and gsbase are used to control the place they really point at.
108 *	The kernel only depends on %gs, and controls its own gsbase via swapgs
109 *
110 * Note that loading segment registers is still costly because the GDT
111 * lookup still happens (this is because the hardware can't know that we're
112 * not setting up these segment registers for a 32-bit program).  Thus we
113 * avoid doing this in the syscall path, and defer them to lwp context switch
114 * handlers, so the register values remain virtualized to the lwp.
115 */
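/*
 * A rough C sketch of the deferral just described; pcb_rupdate and
 * update_sregs() are the pieces the return path keys off, though the
 * call site shown here is illustrative rather than a quote of the C
 * sources:
 *
 *	if (lwp->lwp_pcb.pcb_rupdate != 0) {
 *		update_sregs(lwptoregs(lwp), lwp);   -- reload %ds/%es/%fs/%gs
 *		lwp->lwp_pcb.pcb_rupdate = 0;
 *	}
 */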
116
117#if defined(SYSCALLTRACE)
118#define	ORL_SYSCALLTRACE(r32)		\
119	orl	syscalltrace(%rip), r32
120#else
121#define	ORL_SYSCALLTRACE(r32)
122#endif
123
124/*
125 * In the 32-bit kernel, we do absolutely nothing before getting into the
126 * brand callback checks.  In 64-bit land, we do swapgs and then come here.
127 * We assume that the %rsp- and %r15-stashing fields in the CPU structure
128 * are still unused.
129 *
130 * Check if a brand_mach_ops callback is defined for the specified callback_id
131 * type.  If so, invoke it with the kernel's %gs value loaded and the following
132 * data on the stack:
133 *
134 * stack:  --------------------------------------
135 *      40 | user %gs				|
136 *      32 | callback pointer			|
137 *    | 24 | user (or interrupt) stack pointer	|
138 *    | 16 | lwp pointer			|
139 *    v  8 | userland return address		|
140 *       0 | callback wrapper return addr	|
141 *         --------------------------------------
142 *
143 * Since we're pushing the userland return address onto the kernel stack
144 * we need to get that address without accessing the user's stack (since we
145 * can't trust that data).  There are different ways to get the userland
146 * return address depending on how the syscall trap was made:
147 *
148 * a) For sys_syscall and sys_syscall32 the return address is in %rcx.
149 * b) For sys_sysenter the return address is in %rdx.
150 * c) For sys_int80 and sys_syscall_int (int91), upon entry into the macro,
151 *    the stack pointer points at the state saved when we took the interrupt:
152 *	 ------------------------
153 *    |  | user's %ss		|
154 *    |  | user's %esp		|
155 *    |  | EFLAGS register	|
156 *    v  | user's %cs		|
157 *       | user's %eip		|
158 *	 ------------------------
159 *
160 * The 2nd parameter to the BRAND_CALLBACK macro is either the
161 * BRAND_URET_FROM_REG or BRAND_URET_FROM_INTR_STACK macro.  These macros are
162 * used to generate the proper code to get the userland return address for
163 * each syscall entry point.
164 */
165#define BRAND_URET_FROM_REG(rip_reg)					\
166	pushq	rip_reg			/* push the return address	*/
167
168/*
169 * The interrupt stack pointer we saved on entry to the BRAND_CALLBACK macro
170 * is currently pointing at the user return address (%eip).
171 */
172#define BRAND_URET_FROM_INTR_STACK()					\
173	movq	%gs:CPU_RTMP_RSP, %r15	/* grab the intr. stack pointer	*/ ;\
174	pushq	(%r15)			/* push the return address	*/
175
176#define	BRAND_CALLBACK(callback_id, push_userland_ret)			    \
177	movq	%rsp, %gs:CPU_RTMP_RSP	/* save the stack pointer	*/ ;\
178	movq	%r15, %gs:CPU_RTMP_R15	/* save %r15			*/ ;\
179	movq	%gs:CPU_THREAD, %r15	/* load the thread pointer	*/ ;\
180	movq	T_STACK(%r15), %rsp	/* switch to the kernel stack	*/ ;\
181	subq	$24, %rsp		/* save space for 3 pointers	*/ ;\
182	pushq	%r14			/* save %r14			*/ ;\
183	movq	%gs:CPU_RTMP_RSP, %r14					   ;\
184	movq	%r14, 8(%rsp)		/* stash the user stack pointer	*/ ;\
185	popq	%r14			/* restore %r14			*/ ;\
186	movq	T_LWP(%r15), %r15	/* load the lwp pointer		*/ ;\
187	pushq	%r15			/* push the lwp pointer		*/ ;\
188	movq	LWP_PROCP(%r15), %r15	/* load the proc pointer	*/ ;\
189	movq	P_BRAND(%r15), %r15	/* load the brand pointer	*/ ;\
190	movq	B_MACHOPS(%r15), %r15	/* load the machops pointer	*/ ;\
191	movq	_CONST(_MUL(callback_id, CPTRSIZE))(%r15), %r15		   ;\
192	cmpq	$0, %r15						   ;\
193	je	1f							   ;\
194	movq	%r15, 16(%rsp)		/* save the callback pointer	*/ ;\
195	push_userland_ret		/* push the return address	*/ ;\
196	SWAPGS				/* user gsbase			*/ ;\
197	mov	%gs, %r15		/* get %gs			*/ ;\
198	movq	%r15, 32(%rsp)		/* save %gs on stack		*/ ;\
199	SWAPGS				/* kernel gsbase		*/ ;\
200	movq	%gs:CPU_RTMP_R15, %r15	/* restore %r15			*/ ;\
201	call	*24(%rsp)		/* call callback		*/ ;\
2021:	movq	%gs:CPU_RTMP_R15, %r15	/* restore %r15			*/ ;\
203	movq	%gs:CPU_RTMP_RSP, %rsp	/* restore the stack pointer	*/
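/*
 * For reference, the frame the brand callback sees (built by the macro
 * above and diagrammed earlier) can be pictured as the following purely
 * hypothetical C overlay -- no such struct exists in the sources:
 *
 *	struct brand_cb_frame {
 *		uintptr_t wrapper_ret;	--  0: callback wrapper return addr
 *		uintptr_t user_ret;	--  8: userland return address
 *		klwp_t	*lwp;		-- 16: lwp pointer
 *		uintptr_t user_rsp;	-- 24: user (or intr) stack pointer
 *		uintptr_t callback;	-- 32: callback pointer
 *		uintptr_t user_gs;	-- 40: user %gs
 *	};
 */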
204
205#define	MSTATE_TRANSITION(from, to)		\
206	movl	$from, %edi;			\
207	movl	$to, %esi;			\
208	call	syscall_mstate
209
210/*
211 * Check to see if a simple (direct) return is possible i.e.
212 *
213 *	if (t->t_post_sys_ast | syscalltrace |
214 *	    lwp->lwp_pcb.pcb_rupdate == 1)
215 *		do full version	;
216 *
217 * Preconditions:
218 * -	t is curthread
219 * Postconditions:
220 * -	condition code NE is set if post-sys is too complex
221 * -	rtmp is zeroed if it isn't (we rely on this!)
222 * -	ltmp is smashed
223 */
224#define	CHECK_POSTSYS_NE(t, ltmp, rtmp)			\
225	movq	T_LWP(t), ltmp;				\
226	movzbl	PCB_RUPDATE(ltmp), rtmp;		\
227	ORL_SYSCALLTRACE(rtmp);				\
228	orl	T_POST_SYS_AST(t), rtmp;		\
229	cmpl	$0, rtmp
230
231/*
232 * Fix up the lwp, thread, and eflags for a successful return
233 *
234 * Preconditions:
235 * -	zwreg contains zero
236 */
237#define	SIMPLE_SYSCALL_POSTSYS(t, lwp, zwreg)		\
238	movb	$LWP_USER, LWP_STATE(lwp);		\
239	movw	zwreg, T_SYSNUM(t);			\
240	andb	$_CONST(0xffff - PS_C), REGOFF_RFL(%rsp)
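/*
 * In C terms, SIMPLE_SYSCALL_POSTSYS amounts to the following sketch of
 * what the three instructions above do (not code that exists anywhere
 * as such):
 *
 *	lwp->lwp_state = LWP_USER;
 *	t->t_sysnum = 0;
 *	rp->r_rfl &= ~PS_C;		-- no error to report
 */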
241
242/*
243 * ASSERT(lwptoregs(lwp) == rp);
244 *
245 * This may seem obvious, but very odd things happen if this
246 * assertion is false
247 *
248 * Preconditions:
249 *	(%rsp is ready for normal call sequence)
250 * Postconditions (if assertion is true):
251 *	%r11 is smashed
252 *
253 * ASSERT(rp->r_cs == descnum)
254 *
255 * The code selector is written into the regs structure when the
256 * lwp stack is created.  We use this ASSERT to validate that
257 * the regs structure really matches how we came in.
258 *
259 * Preconditions:
260 *	(%rsp is ready for normal call sequence)
261 * Postconditions (if assertion is true):
262 *	-none-
263 *
264 * ASSERT(lwp->lwp_pcb.pcb_rupdate == 0);
265 *
266 * If this is false, it means that we returned to userland without
267 * updating the segment registers as we were supposed to.
268 *
269 * Note that we must ensure no interrupts or other traps intervene
270 * between entering privileged mode and performing the assertion,
271 * otherwise we may perform a context switch on the thread, which
272 * will end up setting pcb_rupdate to 1 again.
273 */
274#if defined(DEBUG)
275
276#if !defined(__lint)
277
278__lwptoregs_msg:
279	.string	"syscall_asm_amd64.s:%d lwptoregs(%p) [%p] != rp [%p]"
280
281__codesel_msg:
282	.string	"syscall_asm_amd64.s:%d rp->r_cs [%ld] != %ld"
283
284__no_rupdate_msg:
285	.string	"syscall_asm_amd64.s:%d lwp %p, pcb_rupdate != 0"
286
287#endif	/* !__lint */
288
289#define	ASSERT_LWPTOREGS(lwp, rp)			\
290	movq	LWP_REGS(lwp), %r11;			\
291	cmpq	rp, %r11;				\
292	je	7f;					\
293	leaq	__lwptoregs_msg(%rip), %rdi;		\
294	movl	$__LINE__, %esi;			\
295	movq	lwp, %rdx;				\
296	movq	%r11, %rcx;				\
297	movq	rp, %r8;				\
298	xorl	%eax, %eax;				\
299	call	panic;					\
3007:
301
302#define	ASSERT_NO_RUPDATE_PENDING(lwp)			\
303	testb	$0x1, PCB_RUPDATE(lwp);			\
304	je	8f;					\
305	movq	lwp, %rdx;				\
306	leaq	__no_rupdate_msg(%rip), %rdi;		\
307	movl	$__LINE__, %esi;			\
308	xorl	%eax, %eax;				\
309	call	panic;					\
3108:
311
312#else
313#define	ASSERT_LWPTOREGS(lwp, rp)
314#define	ASSERT_NO_RUPDATE_PENDING(lwp)
315#endif
316
317/*
318 * Do the traptrace thing and restore any registers we used
319 * in situ.  Assumes that %rsp is pointing at the base of
320 * the struct regs, obviously ..
321 */
322#ifdef TRAPTRACE
323#define	SYSCALL_TRAPTRACE(ttype)				\
324	TRACE_PTR(%rdi, %rbx, %ebx, %rcx, ttype);		\
325	TRACE_REGS(%rdi, %rsp, %rbx, %rcx);			\
326	TRACE_STAMP(%rdi);	/* rdtsc clobbers %eax, %edx */	\
327	movq	REGOFF_RAX(%rsp), %rax;				\
328	movq	REGOFF_RBX(%rsp), %rbx;				\
329	movq	REGOFF_RCX(%rsp), %rcx;				\
330	movq	REGOFF_RDX(%rsp), %rdx;				\
331	movl	%eax, TTR_SYSNUM(%rdi);				\
332	movq	REGOFF_RDI(%rsp), %rdi
333
334#define	SYSCALL_TRAPTRACE32(ttype)				\
335	SYSCALL_TRAPTRACE(ttype);				\
336	/* paranoia: clean the top 32-bits of the registers */	\
337	orl	%eax, %eax;					\
338	orl	%ebx, %ebx;					\
339	orl	%ecx, %ecx;					\
340	orl	%edx, %edx;					\
341	orl	%edi, %edi
342#else	/* TRAPTRACE */
343#define	SYSCALL_TRAPTRACE(ttype)
344#define	SYSCALL_TRAPTRACE32(ttype)
345#endif	/* TRAPTRACE */
346
347/*
348 * The 64-bit libc syscall wrapper does this:
349 *
350 * fn(<args>)
351 * {
352 *	movq	%rcx, %r10	-- because syscall smashes %rcx
353 *	movl	$CODE, %eax
354 *	syscall
355 *	<error processing>
356 * }
357 *
358 * Thus when we come into the kernel:
359 *
360 *	%rdi, %rsi, %rdx, %r10, %r8, %r9 contain first six args
361 *	%rax is the syscall number
362 *	%r12-%r15 contain caller state
363 *
364 * The syscall instruction arranges that:
365 *
366 *	%rcx contains the return %rip
367 *	%r11d contains bottom 32-bits of %rflags
368 *	%rflags is masked (as determined by the SFMASK msr)
369 *	%cs is set to UCS_SEL (as determined by the STAR msr)
370 *	%ss is set to UDS_SEL (as determined by the STAR msr)
371 *	%rip is set to sys_syscall (as determined by the LSTAR msr)
372 *
373 * Or in other words, we have no registers available at all.
374 * Only swapgs can save us!
375 *
376 * Under the hypervisor, the swapgs has happened already.  However, the
377 * state of the world is very different from that we're familiar with.
378 *
379 * In particular, we have a stack structure like that for interrupt
380 * gates, except that the %cs and %ss registers are modified for reasons
381 * that are not entirely clear.  Critically, the %rcx/%r11 values do
382 * *not* reflect the usage of those registers under a 'real' syscall[1];
383 * the stack, therefore, looks like this:
384 *
385 *	0x0(rsp)	potentially junk %rcx
386 *	0x8(rsp)	potentially junk %r11
387 *	0x10(rsp)	user %rip
388 *	0x18(rsp)	modified %cs
389 *	0x20(rsp)	user %rflags
390 *	0x28(rsp)	user %rsp
391 *	0x30(rsp)	modified %ss
392 *
393 *
394 * and before continuing on, we must load the %rip into %rcx and the
395 * %rflags into %r11.
396 *
397 * [1] They used to, and we relied on it, but this was broken in 3.1.1.
398 * Sigh.
399 */
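/*
 * For context, the behaviour described above is set up once per CPU at
 * startup by programming the syscall MSRs.  A minimal sketch follows;
 * the wrmsr() calls, MSR names and mask value are illustrative of that
 * startup code, not a quote of it:
 *
 *	wrmsr(MSR_AMD_STAR, ((uint64_t)(U32CS_SEL << 16 | KCS_SEL)) << 32);
 *	wrmsr(MSR_AMD_LSTAR, (uint64_t)(uintptr_t)sys_syscall);
 *	wrmsr(MSR_AMD_CSTAR, (uint64_t)(uintptr_t)sys_syscall32);
 *	wrmsr(MSR_AMD_SFMASK, PS_IE | PS_T);	-- assumed flag mask
 */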
400#if defined(__xpv)
401#define	XPV_SYSCALL_PROD						\
402	movq	0x10(%rsp), %rcx;					\
403	movq	0x20(%rsp), %r11;					\
404	movq	0x28(%rsp), %rsp
405#else
406#define	XPV_SYSCALL_PROD /* nothing */
407#endif
408
409#if defined(__lint)
410
411/*ARGSUSED*/
412void
413sys_syscall()
414{}
415
416void
417_allsyscalls()
418{}
419
420size_t _allsyscalls_size;
421
422#else	/* __lint */
423
424	ENTRY_NP2(brand_sys_syscall,_allsyscalls)
425	SWAPGS				/* kernel gsbase */
426	XPV_SYSCALL_PROD
427	BRAND_CALLBACK(BRAND_CB_SYSCALL, BRAND_URET_FROM_REG(%rcx))
428	SWAPGS				/* user gsbase */
429
430#if defined(__xpv)
431	jmp	noprod_sys_syscall
432#endif
433
434	ALTENTRY(sys_syscall)
435	SWAPGS				/* kernel gsbase */
436	XPV_SYSCALL_PROD
437
438noprod_sys_syscall:
439
440	movq	%r15, %gs:CPU_RTMP_R15
441	movq	%rsp, %gs:CPU_RTMP_RSP
442
443	movq	%gs:CPU_THREAD, %r15
444	movq	T_STACK(%r15), %rsp	/* switch from user to kernel stack */
445
446	ASSERT_UPCALL_MASK_IS_SET
447
448	movl	$UCS_SEL, REGOFF_CS(%rsp)
449	movq	%rcx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
450	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
451	movl	$UDS_SEL, REGOFF_SS(%rsp)
452
453	movl	%eax, %eax			/* wrapper: sysc# -> %eax */
454	movq	%rdi, REGOFF_RDI(%rsp)
455	movq	%rsi, REGOFF_RSI(%rsp)
456	movq	%rdx, REGOFF_RDX(%rsp)
457	movq	%r10, REGOFF_RCX(%rsp)		/* wrapper: %rcx -> %r10 */
458	movq	%r10, %rcx			/* arg[3] for direct calls */
459
460	movq	%r8, REGOFF_R8(%rsp)
461	movq	%r9, REGOFF_R9(%rsp)
462	movq	%rax, REGOFF_RAX(%rsp)
463	movq	%rbx, REGOFF_RBX(%rsp)
464
465	movq	%rbp, REGOFF_RBP(%rsp)
466	movq	%r10, REGOFF_R10(%rsp)
467	movq	%gs:CPU_RTMP_RSP, %r11
468	movq	%r11, REGOFF_RSP(%rsp)
469	movq	%r12, REGOFF_R12(%rsp)
470
471	movq	%r13, REGOFF_R13(%rsp)
472	movq	%r14, REGOFF_R14(%rsp)
473	movq	%gs:CPU_RTMP_R15, %r10
474	movq	%r10, REGOFF_R15(%rsp)
475	movq	$0, REGOFF_SAVFP(%rsp)
476	movq	$0, REGOFF_SAVPC(%rsp)
477
478	/*
479	 * Copy these registers here in case we end up stopped with
480	 * someone (like, say, /proc) messing with our register state.
481	 * We don't -restore- them unless we have to in update_sregs.
482	 *
483	 * Since userland -can't- change fsbase or gsbase directly,
484	 * and capturing them involves two serializing instructions,
485	 * we don't bother to capture them here.
486	 */
487	xorl	%ebx, %ebx
488	movw	%ds, %bx
489	movq	%rbx, REGOFF_DS(%rsp)
490	movw	%es, %bx
491	movq	%rbx, REGOFF_ES(%rsp)
492	movw	%fs, %bx
493	movq	%rbx, REGOFF_FS(%rsp)
494	movw	%gs, %bx
495	movq	%rbx, REGOFF_GS(%rsp)
496
497	/*
498	 * Machine state saved in the regs structure on the stack
499	 * First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9
500	 * %eax is the syscall number
501	 * %rsp is the thread's stack, %r15 is curthread
502	 * REG_RSP(%rsp) is the user's stack
503	 */
504
505	SYSCALL_TRAPTRACE($TT_SYSC64)
506
507	movq	%rsp, %rbp
508
509	movq	T_LWP(%r15), %r14
510	ASSERT_NO_RUPDATE_PENDING(%r14)
511	ENABLE_INTR_FLAGS
512
513	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
514	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */
515
516	ASSERT_LWPTOREGS(%r14, %rsp)
517
518	movb	$LWP_SYS, LWP_STATE(%r14)
519	incq	LWP_RU_SYSC(%r14)
520	movb	$NORMALRETURN, LWP_EOSYS(%r14)
521
522	incq	%gs:CPU_STATS_SYS_SYSCALL
523
524	movw	%ax, T_SYSNUM(%r15)
525	movzbl	T_PRE_SYS(%r15), %ebx
526	ORL_SYSCALLTRACE(%ebx)
527	testl	%ebx, %ebx
528	jne	_syscall_pre
529
530_syscall_invoke:
531	movq	REGOFF_RDI(%rbp), %rdi
532	movq	REGOFF_RSI(%rbp), %rsi
533	movq	REGOFF_RDX(%rbp), %rdx
534	movq	REGOFF_RCX(%rbp), %rcx
535	movq	REGOFF_R8(%rbp), %r8
536	movq	REGOFF_R9(%rbp), %r9
537
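	/*
	 * The dispatch below is the assembler version of, roughly:
	 *
	 *	if (code >= NSYSCALL)
	 *		rval = nosys();		-- via _syscall_ill
	 *	else
	 *		rval = (*sysent[code].sy_callc)(a0, a1, a2, a3, a4, a5);
	 *
	 * where "code" is the syscall number in %eax; the names are just
	 * the C-level view, not a literal quote of the C sources.
	 */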
538	cmpl	$NSYSCALL, %eax
539	jae	_syscall_ill
540	shll	$SYSENT_SIZE_SHIFT, %eax
541	leaq	sysent(%rax), %rbx
542
543	call	*SY_CALLC(%rbx)
544
545	movq	%rax, %r12
546	movq	%rdx, %r13
547
548	/*
549	 * If the handler returns two ints, then we need to split the
550	 * 64-bit return value into two 32-bit values.
551	 */
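	/*
	 * That is, in C terms (a sketch of the next few instructions,
	 * using the rval1/rval2 naming the C syscall code uses):
	 *
	 *	if (callp->sy_flags & SE_32RVAL2) {
	 *		rval2 = (uint32_t)(rval >> 32);
	 *		rval1 = (uint32_t)rval;
	 *	}
	 */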
552	testw	$SE_32RVAL2, SY_FLAGS(%rbx)
553	je	5f
554	movq	%r12, %r13
555	shrq	$32, %r13	/* upper 32-bits into %edx */
556	movl	%r12d, %r12d	/* lower 32-bits into %eax */
5575:
558	/*
559	 * Optimistically assume that there's no post-syscall
560	 * work to do.  (This is to avoid having to call syscall_mstate()
561	 * with interrupts disabled)
562	 */
563	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
564
565	/*
566	 * We must protect ourselves from being descheduled here;
567	 * If we were, and we ended up on another cpu, or another
568	 * lwp got in ahead of us, it could change the segment
569	 * registers without us noticing before we return to userland.
570	 */
571	CLI(%r14)
572	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
573	jne	_syscall_post
574	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)
575
576	movq	%r12, REGOFF_RAX(%rsp)
577	movq	%r13, REGOFF_RDX(%rsp)
578
579	/*
580	 * To get back to userland, we need the return %rip in %rcx and
581	 * the return %rfl in %r11d.  The sysretq instruction also arranges
582	 * to fix up %cs and %ss; everything else is our responsibility.
583	 */
584	movq	REGOFF_RDI(%rsp), %rdi
585	movq	REGOFF_RSI(%rsp), %rsi
586	movq	REGOFF_RDX(%rsp), %rdx
587	/* %rcx used to restore %rip value */
588
589	movq	REGOFF_R8(%rsp), %r8
590	movq	REGOFF_R9(%rsp), %r9
591	movq	REGOFF_RAX(%rsp), %rax
592	movq	REGOFF_RBX(%rsp), %rbx
593
594	movq	REGOFF_RBP(%rsp), %rbp
595	movq	REGOFF_R10(%rsp), %r10
596	/* %r11 used to restore %rfl value */
597	movq	REGOFF_R12(%rsp), %r12
598
599	movq	REGOFF_R13(%rsp), %r13
600	movq	REGOFF_R14(%rsp), %r14
601	movq	REGOFF_R15(%rsp), %r15
602
603	movq	REGOFF_RIP(%rsp), %rcx
604	movl	REGOFF_RFL(%rsp), %r11d
605
606#if defined(__xpv)
607	addq	$REGOFF_RIP, %rsp
608#else
609	movq	REGOFF_RSP(%rsp), %rsp
610#endif
611
612        /*
613         * There can be no instructions between the ALTENTRY below and
614	 * SYSRET or we could end up breaking brand support. See label usage
615         * in sn1_brand_syscall_callback for an example.
616         */
617	ASSERT_UPCALL_MASK_IS_SET
618#if defined(__xpv)
619	SYSRETQ
620        ALTENTRY(nopop_sys_syscall_swapgs_sysretq)
621
622	/*
623	 * We can only get here after executing a brand syscall
624	 * interposition callback handler and simply need to
625	 * "sysretq" back to userland. On the hypervisor this
626	 * involves the iret hypercall which requires us to construct
627	 * just enough of the stack needed for the hypercall.
628	 * (rip, cs, rflags, rsp, ss).
629	 */
630	movq    %rsp, %gs:CPU_RTMP_RSP		/* save user's rsp */
631	movq	%gs:CPU_THREAD, %r11
632	movq	T_STACK(%r11), %rsp
633
634	movq	%rcx, REGOFF_RIP(%rsp)
635	movl	$UCS_SEL, REGOFF_CS(%rsp)
636	movq	%gs:CPU_RTMP_RSP, %r11
637	movq	%r11, REGOFF_RSP(%rsp)
638	pushfq
639	popq	%r11				/* hypercall enables ints */
640	movq	%r11, REGOFF_RFL(%rsp)
641	movl	$UDS_SEL, REGOFF_SS(%rsp)
642	addq	$REGOFF_RIP, %rsp
643	/*
644	 * XXPV: see comment in SYSRETQ definition for future optimization
645	 *       we could take.
646	 */
647	ASSERT_UPCALL_MASK_IS_SET
648	SYSRETQ
649#else
650        ALTENTRY(nopop_sys_syscall_swapgs_sysretq)
651	SWAPGS				/* user gsbase */
652	SYSRETQ
653#endif
654        /*NOTREACHED*/
655        SET_SIZE(nopop_sys_syscall_swapgs_sysretq)
656
657_syscall_pre:
658	call	pre_syscall
659	movl	%eax, %r12d
660	testl	%eax, %eax
661	jne	_syscall_post_call
662	/*
663	 * Didn't abort, so reload the syscall args and invoke the handler.
664	 */
665	movzwl	T_SYSNUM(%r15), %eax
666	jmp	_syscall_invoke
667
668_syscall_ill:
669	call	nosys
670	movq	%rax, %r12
671	movq	%rdx, %r13
672	jmp	_syscall_post_call
673
674_syscall_post:
675	STI
676	/*
677	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
678	 * so that we can account for the extra work it takes us to finish.
679	 */
680	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
681_syscall_post_call:
682	movq	%r12, %rdi
683	movq	%r13, %rsi
684	call	post_syscall
685	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
686	jmp	_sys_rtt
687	SET_SIZE(sys_syscall)
688	SET_SIZE(brand_sys_syscall)
689
690#endif	/* __lint */
691
692#if defined(__lint)
693
694/*ARGSUSED*/
695void
696sys_syscall32()
697{}
698
699#else	/* __lint */
700
701	ENTRY_NP(brand_sys_syscall32)
702	SWAPGS				/* kernel gsbase */
703	XPV_TRAP_POP
704	BRAND_CALLBACK(BRAND_CB_SYSCALL32, BRAND_URET_FROM_REG(%rcx))
705	SWAPGS				/* user gsbase */
706
707#if defined(__xpv)
708	jmp	nopop_sys_syscall32
709#endif
710
711	ALTENTRY(sys_syscall32)
712	SWAPGS				/* kernel gsbase */
713
714#if defined(__xpv)
715	XPV_TRAP_POP
716nopop_sys_syscall32:
717#endif
718
719	movl	%esp, %r10d
720	movq	%gs:CPU_THREAD, %r15
721	movq	T_STACK(%r15), %rsp
722	movl	%eax, %eax
723
724	movl	$U32CS_SEL, REGOFF_CS(%rsp)
725	movl	%ecx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
726	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
727	movq	%r10, REGOFF_RSP(%rsp)
728	movl	$UDS_SEL, REGOFF_SS(%rsp)
729
730_syscall32_save:
731	movl	%edi, REGOFF_RDI(%rsp)
732	movl	%esi, REGOFF_RSI(%rsp)
733	movl	%ebp, REGOFF_RBP(%rsp)
734	movl	%ebx, REGOFF_RBX(%rsp)
735	movl	%edx, REGOFF_RDX(%rsp)
736	movl	%ecx, REGOFF_RCX(%rsp)
737	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
738	movq	$0, REGOFF_SAVFP(%rsp)
739	movq	$0, REGOFF_SAVPC(%rsp)
740
741	/*
742	 * Copy these registers here in case we end up stopped with
743	 * someone (like, say, /proc) messing with our register state.
744	 * We don't -restore- them unless we have to in update_sregs.
745	 *
746	 * Since userland -can't- change fsbase or gsbase directly,
747	 * we don't bother to capture them here.
748	 */
749	xorl	%ebx, %ebx
750	movw	%ds, %bx
751	movq	%rbx, REGOFF_DS(%rsp)
752	movw	%es, %bx
753	movq	%rbx, REGOFF_ES(%rsp)
754	movw	%fs, %bx
755	movq	%rbx, REGOFF_FS(%rsp)
756	movw	%gs, %bx
757	movq	%rbx, REGOFF_GS(%rsp)
758
759	/*
760	 * Application state saved in the regs structure on the stack
761	 * %eax is the syscall number
762	 * %rsp is the thread's stack, %r15 is curthread
763	 * REG_RSP(%rsp) is the user's stack
764	 */
765
766	SYSCALL_TRAPTRACE32($TT_SYSC)
767
768	movq	%rsp, %rbp
769
770	movq	T_LWP(%r15), %r14
771	ASSERT_NO_RUPDATE_PENDING(%r14)
772
773	ENABLE_INTR_FLAGS
774
775	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
776	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */
777
778	ASSERT_LWPTOREGS(%r14, %rsp)
779
780	incq	 %gs:CPU_STATS_SYS_SYSCALL
781
782	/*
783	 * Make some space for MAXSYSARGS (currently 8) 32-bit args placed
784	 * into 64-bit (long) arg slots, maintaining 16 byte alignment.  Or
785	 * more succinctly:
786	 *
787	 *	SA(MAXSYSARGS * sizeof (long)) == 64
788	 */
789#define	SYS_DROP	64			/* drop for args */
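	/*
	 * That is, 8 argument slots of 8 bytes each is 64 bytes, already
	 * a multiple of the 16-byte stack alignment, so SA() has nothing
	 * to round up.
	 */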
790	subq	$SYS_DROP, %rsp
791	movb	$LWP_SYS, LWP_STATE(%r14)
792	movq	%r15, %rdi
793	movq	%rsp, %rsi
794	call	syscall_entry
795
796	/*
797	 * Fetch the arguments copied onto the kernel stack and put
798	 * them in the right registers to invoke a C-style syscall handler.
799	 * %rax contains the handler address.
800	 *
801	 * Ideas for making all this go faster of course include simply
802	 * forcibly fetching 6 arguments from the user stack under lofault
803	 * protection, reverting to copyin_args only when watchpoints
804	 * are in effect.
805	 *
806	 * (If we do this, make sure that exec and libthread leave
807	 * enough space at the top of the stack to ensure that we'll
808	 * never do a fetch from an invalid page.)
809	 *
810	 * Lots of ideas here, but they won't really help with bringup B-)
811	 * Correctness can't wait, performance can wait a little longer ..
812	 */
813
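	/*
	 * Conceptually (just a sketch), the six longs syscall_entry()
	 * copied into the buffer above become the first six integer
	 * arguments of an ordinary C call:
	 *
	 *	(*handler)(arg[0], arg[1], arg[2], arg[3], arg[4], arg[5]);
	 */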
814	movq	%rax, %rbx
815	movl	0(%rsp), %edi
816	movl	8(%rsp), %esi
817	movl	0x10(%rsp), %edx
818	movl	0x18(%rsp), %ecx
819	movl	0x20(%rsp), %r8d
820	movl	0x28(%rsp), %r9d
821
822	call	*SY_CALLC(%rbx)
823
824	movq	%rbp, %rsp	/* pop the args */
825
826	/*
827	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
828	 * On the 32-bit kernel, they always return that value in %eax:%edx
829	 * as required by the 32-bit ABI.
830	 *
831	 * Simulate the same behaviour by unconditionally splitting the
832	 * return value in the same way.
833	 */
834	movq	%rax, %r13
835	shrq	$32, %r13	/* upper 32-bits into %edx */
836	movl	%eax, %r12d	/* lower 32-bits into %eax */
837
838	/*
839	 * Optimistically assume that there's no post-syscall
840	 * work to do.  (This is to avoid having to call syscall_mstate()
841	 * with interrupts disabled)
842	 */
843	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
844
845	/*
846	 * We must protect ourselves from being descheduled here;
847	 * If we were, and we ended up on another cpu, or another
848	 * lwp got in ahead of us, it could change the segment
849	 * registers without us noticing before we return to userland.
850	 */
851	CLI(%r14)
852	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
853	jne	_full_syscall_postsys32
854	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)
855
856	/*
857	 * To get back to userland, we need to put the return %rip in %rcx and
858	 * the return %rfl in %r11d.  The sysret instruction also arranges
859	 * to fix up %cs and %ss; everything else is our responsibility.
860	 */
861
862	movl	%r12d, %eax			/* %eax: rval1 */
863	movl	REGOFF_RBX(%rsp), %ebx
864	/* %ecx used for return pointer */
865	movl	%r13d, %edx			/* %edx: rval2 */
866	movl	REGOFF_RBP(%rsp), %ebp
867	movl	REGOFF_RSI(%rsp), %esi
868	movl	REGOFF_RDI(%rsp), %edi
869
870	movl	REGOFF_RFL(%rsp), %r11d		/* %r11 -> eflags */
871	movl	REGOFF_RIP(%rsp), %ecx		/* %ecx -> %eip */
872	movl	REGOFF_RSP(%rsp), %esp
873
874	ASSERT_UPCALL_MASK_IS_SET
875        ALTENTRY(nopop_sys_syscall32_swapgs_sysretl)
876	SWAPGS				/* user gsbase */
877	SYSRETL
878        SET_SIZE(nopop_sys_syscall32_swapgs_sysretl)
879	/*NOTREACHED*/
880
881_full_syscall_postsys32:
882	STI
883	/*
884	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
885	 * so that we can account for the extra work it takes us to finish.
886	 */
887	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
888	movq	%r15, %rdi
889	movq	%r12, %rsi			/* rval1 - %eax */
890	movq	%r13, %rdx			/* rval2 - %edx */
891	call	syscall_exit
892	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
893	jmp	_sys_rtt
894	SET_SIZE(sys_syscall32)
895	SET_SIZE(brand_sys_syscall32)
896
897#endif	/* __lint */
898
899/*
900 * System call handler via the sysenter instruction
901 * Used only for 32-bit system calls on the 64-bit kernel.
902 *
903 * The caller in userland has arranged that:
904 *
905 * -	%eax contains the syscall number
906 * -	%ecx contains the user %esp
907 * -	%edx contains the return %eip
908 * -	the user stack contains the args to the syscall
909 *
910 * Hardware and (privileged) initialization code have arranged that by
911 * the time the sysenter instruction completes:
912 *
913 * - %rip is pointing to sys_sysenter (below).
914 * - %cs and %ss are set to kernel text and stack (data) selectors.
915 * - %rsp is pointing at the lwp's stack
916 * - interrupts have been disabled.
917 *
918 * Note that we are unable to return both "rvals" to userland with
919 * this call, as %edx is used by the sysexit instruction.
920 *
921 * One final complication in this routine is its interaction with
922 * single-stepping in a debugger.  For most of the system call mechanisms,
923 * the CPU automatically clears the single-step flag before we enter the
924 * kernel.  The sysenter mechanism does not clear the flag, so a user
925 * single-stepping through a libc routine may suddenly find him/herself
926 * single-stepping through the kernel.  To detect this, kmdb compares the
927 * trap %pc to the [brand_]sys_sysenter addresses on each single-step trap.
928 * If it finds that we have single-stepped to a sysenter entry point, it
929 * explicitly clears the flag and executes the sys_sysenter routine.
930 *
931 * One additional wrinkle in this final complication is that we
932 * have two different entry points for sysenter: brand_sys_sysenter and
933 * sys_sysenter.  If we enter at brand_sys_sysenter and start single-stepping
934 * through the kernel with kmdb, we will eventually hit the instruction at
935 * sys_sysenter.  kmdb cannot distinguish between that valid single-step
936 * and the undesirable one mentioned above.  To avoid this situation, we
937 * simply add a jump over the instruction at sys_sysenter to make it
938 * impossible to single-step to it.
939 */
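/*
 * The "privileged initialization code" referred to above programs the
 * sysenter MSRs once per CPU.  A hedged sketch: MSR_INTC_SEP_ESP also
 * appears in sep_save/sep_restore at the bottom of this file; the other
 * two names, and the ksp value, are assumed for illustration:
 *
 *	wrmsr(MSR_INTC_SEP_CS, (uint64_t)KCS_SEL);
 *	wrmsr(MSR_INTC_SEP_ESP, (uint64_t)(uintptr_t)ksp);
 *	wrmsr(MSR_INTC_SEP_EIP, (uint64_t)(uintptr_t)sys_sysenter);
 */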
940#if defined(__lint)
941
942void
943sys_sysenter()
944{}
945
946#else	/* __lint */
947
948	ENTRY_NP(brand_sys_sysenter)
949	SWAPGS				/* kernel gsbase */
950	ALTENTRY(_brand_sys_sysenter_post_swapgs)
951	BRAND_CALLBACK(BRAND_CB_SYSENTER, BRAND_URET_FROM_REG(%rdx))
952	/*
953	 * Jump over sys_sysenter to allow single-stepping as described
954	 * above.
955	 */
956	jmp	_sys_sysenter_post_swapgs
957
958	ALTENTRY(sys_sysenter)
959	SWAPGS				/* kernel gsbase */
960
961	ALTENTRY(_sys_sysenter_post_swapgs)
962	movq	%gs:CPU_THREAD, %r15
963
964	movl	$U32CS_SEL, REGOFF_CS(%rsp)
965	movl	%ecx, REGOFF_RSP(%rsp)		/* wrapper: %esp -> %ecx */
966	movl	%edx, REGOFF_RIP(%rsp)		/* wrapper: %eip -> %edx */
967	pushfq
968	popq	%r10
969	movl	$UDS_SEL, REGOFF_SS(%rsp)
970
971	/*
972	 * Set the interrupt flag before storing the flags to the
973	 * flags image on the stack so we can return to user with
974	 * interrupts enabled if we return via sys_rtt_syscall32
975	 */
976	orq	$PS_IE, %r10
977	movq	%r10, REGOFF_RFL(%rsp)
978
979	movl	%edi, REGOFF_RDI(%rsp)
980	movl	%esi, REGOFF_RSI(%rsp)
981	movl	%ebp, REGOFF_RBP(%rsp)
982	movl	%ebx, REGOFF_RBX(%rsp)
983	movl	%edx, REGOFF_RDX(%rsp)
984	movl	%ecx, REGOFF_RCX(%rsp)
985	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
986	movq	$0, REGOFF_SAVFP(%rsp)
987	movq	$0, REGOFF_SAVPC(%rsp)
988
989	/*
990	 * Copy these registers here in case we end up stopped with
991	 * someone (like, say, /proc) messing with our register state.
992	 * We don't -restore- them unless we have to in update_sregs.
993	 *
994	 * Since userland -can't- change fsbase or gsbase directly,
995	 * we don't bother to capture them here.
996	 */
997	xorl	%ebx, %ebx
998	movw	%ds, %bx
999	movq	%rbx, REGOFF_DS(%rsp)
1000	movw	%es, %bx
1001	movq	%rbx, REGOFF_ES(%rsp)
1002	movw	%fs, %bx
1003	movq	%rbx, REGOFF_FS(%rsp)
1004	movw	%gs, %bx
1005	movq	%rbx, REGOFF_GS(%rsp)
1006
1007	/*
1008	 * Application state saved in the regs structure on the stack
1009	 * %eax is the syscall number
1010	 * %rsp is the thread's stack, %r15 is curthread
1011	 * REG_RSP(%rsp) is the user's stack
1012	 */
1013
1014	SYSCALL_TRAPTRACE($TT_SYSENTER)
1015
1016	movq	%rsp, %rbp
1017
1018	movq	T_LWP(%r15), %r14
1019	ASSERT_NO_RUPDATE_PENDING(%r14)
1020
1021	ENABLE_INTR_FLAGS
1022
1023	/*
1024	 * Catch a 64-bit process trying to issue the sysenter instruction
1025	 * on Nocona-based systems.
1026	 */
1027	movq	LWP_PROCP(%r14), %rax
1028	cmpq	$DATAMODEL_ILP32, P_MODEL(%rax)
1029	je	7f
1030
1031	/*
1032	 * For a non-32-bit process, simulate a #ud, since that's what
1033	 * native hardware does.  The traptrace entry (above) will
1034	 * let you know what really happened.
1035	 */
1036	movq	$T_ILLINST, REGOFF_TRAPNO(%rsp)
1037	movq	REGOFF_CS(%rsp), %rdi
1038	movq	%rdi, REGOFF_ERR(%rsp)
1039	movq	%rsp, %rdi
1040	movq	REGOFF_RIP(%rsp), %rsi
1041	movl	%gs:CPU_ID, %edx
1042	call	trap
1043	jmp	_sys_rtt
10447:
1045
1046	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
1047	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate calls) */
1048
1049	ASSERT_LWPTOREGS(%r14, %rsp)
1050
1051	incq	%gs:CPU_STATS_SYS_SYSCALL
1052
1053	/*
1054	 * Make some space for MAXSYSARGS (currently 8) 32-bit args
1055	 * placed into 64-bit (long) arg slots, maintaining 16 byte alignment
1056	 * (the same SYS_DROP == 64 reservation as the syscall32 path above).
1057	 */
1058	subq	$SYS_DROP, %rsp
1059	movb	$LWP_SYS, LWP_STATE(%r14)
1060	movq	%r15, %rdi
1061	movq	%rsp, %rsi
1062	call	syscall_entry
1063
1064	/*
1065	 * Fetch the arguments copied onto the kernel stack and put
1066	 * them in the right registers to invoke a C-style syscall handler.
1067	 * %rax contains the handler address.
1068	 */
1069	movq	%rax, %rbx
1070	movl	0(%rsp), %edi
1071	movl	8(%rsp), %esi
1072	movl	0x10(%rsp), %edx
1073	movl	0x18(%rsp), %ecx
1074	movl	0x20(%rsp), %r8d
1075	movl	0x28(%rsp), %r9d
1076
1077	call	*SY_CALLC(%rbx)
1078
1079	movq	%rbp, %rsp	/* pop the args */
1080
1081	/*
1082	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
1083	 * On the 32-bit kernel, they always return that value in %eax:%edx
1084	 * as required by the 32-bit ABI.
1085	 *
1086	 * Simulate the same behaviour by unconditionally splitting the
1087	 * return value in the same way.
1088	 */
1089	movq	%rax, %r13
1090	shrq	$32, %r13	/* upper 32-bits into %edx */
1091	movl	%eax, %r12d	/* lower 32-bits into %eax */
1092
1093	/*
1094	 * Optimistically assume that there's no post-syscall
1095	 * work to do.  (This is to avoid having to call syscall_mstate()
1096	 * with interrupts disabled)
1097	 */
1098	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
1099
1100	/*
1101	 * We must protect ourselves from being descheduled here;
1102	 * If we were, and we ended up on another cpu, or another
1103	 * lwp got in ahead of us, it could change the segment
1104	 * registers without us noticing before we return to userland.
1105	 */
1106	cli
1107	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
1108	jne	_full_syscall_postsys32
1109	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)
1110
1111	/*
1112	 * To get back to userland, load up the 32-bit registers and
1113	 * sysexit back where we came from.
1114	 */
1115
1116	/*
1117	 * Interrupts will be turned on by the 'sti' executed just before
1118	 * sysexit.  The following ensures that restoring the user's rflags
1119	 * doesn't enable interrupts too soon.
1120	 */
1121	andq	$_BITNOT(PS_IE), REGOFF_RFL(%rsp)
1122
1123	/*
1124	 * (There's no point in loading up %edx because the sysexit
1125	 * mechanism smashes it.)
1126	 */
1127	movl	%r12d, %eax
1128	movl	REGOFF_RBX(%rsp), %ebx
1129	movl	REGOFF_RBP(%rsp), %ebp
1130	movl	REGOFF_RSI(%rsp), %esi
1131	movl	REGOFF_RDI(%rsp), %edi
1132
1133	movl	REGOFF_RIP(%rsp), %edx	/* sysexit: %edx -> %eip */
1134	pushq	REGOFF_RFL(%rsp)
1135	popfq
1136	movl	REGOFF_RSP(%rsp), %ecx	/* sysexit: %ecx -> %esp */
1137        ALTENTRY(sys_sysenter_swapgs_sysexit)
1138	swapgs
1139	sti
1140	sysexit
1141	SET_SIZE(sys_sysenter_swapgs_sysexit)
1142	SET_SIZE(sys_sysenter)
1143	SET_SIZE(_sys_sysenter_post_swapgs)
1144	SET_SIZE(brand_sys_sysenter)
1145
1146#endif	/* __lint */
1147
1148#if defined(__lint)
1149/*
1150 * System call via an int80.  This entry point is only used by the Linux
1151 * application environment.  Unlike the other entry points, there is no
1152 * default action to take if no callback is registered for this process.
1153 */
1154void
1155sys_int80()
1156{}
1157
1158#else	/* __lint */
1159
1160	ENTRY_NP(brand_sys_int80)
1161	SWAPGS				/* kernel gsbase */
1162	XPV_TRAP_POP
1163	BRAND_CALLBACK(BRAND_CB_INT80, BRAND_URET_FROM_INTR_STACK())
1164	SWAPGS				/* user gsbase */
1165#if defined(__xpv)
1166	jmp	nopop_int80
1167#endif
1168
1169	ENTRY_NP(sys_int80)
1170	/*
1171	 * We hit an int80, but this process isn't of a brand with an int80
1172	 * handler.  Bad process!  Make it look as if the INT failed.
1173	 * Modify %rip to point before the INT, push the expected error
1174	 * code and fake a GP fault.  Note that on the 64-bit hypervisor we need
1175	 * to undo the XPV_TRAP_POP and push rcx and r11 back on the stack
1176	 * because gptrap will pop them again with its own XPV_TRAP_POP.
1177	 */
1178#if defined(__xpv)
1179	XPV_TRAP_POP
1180nopop_int80:
1181#endif
1182	subq	$2, (%rsp)	/* int insn 2-bytes */
1183	pushq	$_CONST(_MUL(T_INT80, GATE_DESC_SIZE) + 2)
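	/*
	 * The constant pushed above stands in for the error code a real
	 * #GP on this IDT vector would carry: T_INT80 scaled to a
	 * descriptor offset, with bit 1 set to indicate that the index
	 * refers to the IDT.
	 */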
1184#if defined(__xpv)
1185	push	%r11
1186	push	%rcx
1187#endif
1188	jmp	gptrap			/ GP fault
1189	SET_SIZE(sys_int80)
1190	SET_SIZE(brand_sys_int80)
1191#endif	/* __lint */
1192
1193
1194/*
1195 * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by
1196 * the generic i386 libc to do system calls. We do a small amount of setup
1197 * before jumping into the existing sys_syscall32 path.
1198 */
1199#if defined(__lint)
1200
1201/*ARGSUSED*/
1202void
1203sys_syscall_int()
1204{}
1205
1206#else	/* __lint */
1207
1208	ENTRY_NP(brand_sys_syscall_int)
1209	SWAPGS				/* kernel gsbase */
1210	XPV_TRAP_POP
1211	BRAND_CALLBACK(BRAND_CB_INT91, BRAND_URET_FROM_INTR_STACK())
1212	SWAPGS				/* user gsbase */
1213
1214#if defined(__xpv)
1215	jmp	nopop_syscall_int
1216#endif
1217
1218	ALTENTRY(sys_syscall_int)
1219	SWAPGS				/* kernel gsbase */
1220
1221#if defined(__xpv)
1222	XPV_TRAP_POP
1223nopop_syscall_int:
1224#endif
1225
1226	movq	%gs:CPU_THREAD, %r15
1227	movq	T_STACK(%r15), %rsp
1228	movl	%eax, %eax
1229	/*
1230	 * Set t_post_sys on this thread to force ourselves out via the slow
1231	 * path. It might be possible at some later date to optimize this out
1232	 * and use a faster return mechanism.
1233	 */
1234	movb	$1, T_POST_SYS(%r15)
1235	CLEAN_CS
1236	jmp	_syscall32_save
1237	/*
1238	 * There should be no instructions between this label and SWAPGS/IRET
1239	 * or we could end up breaking branded zone support. See the usage of
1240	 * this label in lx_brand_int80_callback and sn1_brand_int91_callback
1241	 * for examples.
1242	 */
1243        ALTENTRY(sys_sysint_swapgs_iret)
1244	SWAPGS				/* user gsbase */
1245	IRET
1246	/*NOTREACHED*/
1247	SET_SIZE(sys_sysint_swapgs_iret)
1248	SET_SIZE(sys_syscall_int)
1249	SET_SIZE(brand_sys_syscall_int)
1250
1251#endif	/* __lint */
1252
1253/*
1254 * Legacy 32-bit applications and old libc implementations do lcalls;
1255 * we should never get here because the LDT entry containing the syscall
1256 * segment descriptor has the "segment present" bit cleared, which means
1257 * we end up processing those system calls in trap() via a not-present trap.
1258 *
1259 * We do it this way because a call gate unhelpfully does -nothing- to the
1260 * interrupt flag bit, so an interrupt can run us just after the lcall
1261 * completes, but just before the swapgs takes effect.   Thus the INTR_PUSH and
1262 * INTR_POP paths would have to be slightly more complex to dance around
1263 * this problem, and end up depending explicitly on the first
1264 * instruction of this handler being either swapgs or cli.
1265 */
1266
1267#if defined(__lint)
1268
1269/*ARGSUSED*/
1270void
1271sys_lcall32()
1272{}
1273
1274#else	/* __lint */
1275
1276	ENTRY_NP(sys_lcall32)
1277	SWAPGS				/* kernel gsbase */
1278	pushq	$0
1279	pushq	%rbp
1280	movq	%rsp, %rbp
1281	leaq	__lcall_panic_str(%rip), %rdi
1282	xorl	%eax, %eax
1283	call	panic
1284	SET_SIZE(sys_lcall32)
1285
1286__lcall_panic_str:
1287	.string	"sys_lcall32: shouldn't be here!"
1288
1289/*
1290 * Declare a uintptr_t which covers the entire pc range of syscall
1291 * handlers for the stack walkers that need this.
1292 */
1293	.align	CPTRSIZE
1294	.globl	_allsyscalls_size
1295	.type	_allsyscalls_size, @object
1296_allsyscalls_size:
1297	.NWORD	. - _allsyscalls
1298	SET_SIZE(_allsyscalls_size)
1299
1300#endif	/* __lint */
1301
1302/*
1303 * These are the thread context handlers for lwps using sysenter/sysexit.
1304 */
1305
1306#if defined(__lint)
1307
1308/*ARGSUSED*/
1309void
1310sep_save(void *ksp)
1311{}
1312
1313/*ARGSUSED*/
1314void
1315sep_restore(void *ksp)
1316{}
1317
1318#else	/* __lint */
1319
1320	/*
1321	 * setting this value to zero as we switch away causes the
1322	 * stack-pointer-on-sysenter to be NULL, ensuring that we
1323	 * don't silently corrupt another (preempted) thread's stack
1324	 * when running an lwp that (somehow) didn't get sep_restore'd
1325	 */
1326	ENTRY_NP(sep_save)
1327	xorl	%edx, %edx
1328	xorl	%eax, %eax
1329	movl	$MSR_INTC_SEP_ESP, %ecx
1330	wrmsr
1331	ret
1332	SET_SIZE(sep_save)
1333
1334	/*
1335	 * Update the kernel stack pointer as we resume onto this cpu.
1336	 */
1337	ENTRY_NP(sep_restore)
1338	movq	%rdi, %rdx
1339	shrq	$32, %rdx
1340	movl	%edi, %eax
1341	movl	$MSR_INTC_SEP_ESP, %ecx
1342	wrmsr
1343	ret
1344	SET_SIZE(sep_restore)
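/*
 * In C terms the pair above is simply the following sketch; wrmsr()
 * takes the 64-bit value split across %edx:%eax, which is what the
 * register shuffling above arranges:
 *
 *	sep_save:	wrmsr(MSR_INTC_SEP_ESP, 0);
 *	sep_restore:	wrmsr(MSR_INTC_SEP_ESP, (uint64_t)(uintptr_t)ksp);
 */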
1345
1346#endif	/* __lint */
1347