xref: /freebsd/sys/amd64/amd64/exception.S (revision ddab534cd6f6557740c24ff2019642880ad8bef6)
/*-
 * Copyright (c) 1989, 1990 William F. Jolitz.
 * Copyright (c) 1990 The Regents of the University of California.
 * Copyright (c) 2007-2018 The FreeBSD Foundation
 * All rights reserved.
 *
 * Portions of this software were developed by A. Joseph Koshy under
 * sponsorship from the FreeBSD Foundation and Google, Inc.
 *
 * Portions of this software were developed by
 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_atpic.h"
#include "opt_hwpmc_hooks.h"

#include "assym.inc"

#include <machine/psl.h>
#include <machine/asmacros.h>
#include <machine/trap.h>
#include <machine/specialreg.h>
#include <machine/pmap.h>

#ifdef KDTRACE_HOOKS
	.bss
	.globl	dtrace_invop_jump_addr
	.align	8
	.type	dtrace_invop_jump_addr,@object
	.size	dtrace_invop_jump_addr,8
dtrace_invop_jump_addr:
	.zero	8
	.globl	dtrace_invop_calltrap_addr
	.align	8
	.type	dtrace_invop_calltrap_addr,@object
	.size	dtrace_invop_calltrap_addr,8
dtrace_invop_calltrap_addr:
	.zero	8
#endif
	.text
#ifdef HWPMC_HOOKS
	ENTRY(start_exceptions)
#endif

/*****************************************************************************/
/* Trap handling                                                             */
/*****************************************************************************/
/*
 * Trap and fault vector routines.
 *
 * All traps are 'interrupt gates', SDT_SYSIGT.  An interrupt gate pushes
 * state on the stack but also disables interrupts.  This is important for
 * our use of the swapgs instruction.  We cannot be interrupted
 * until the GS.base value is correct.  For most traps, we then
 * automatically enable interrupts if the interrupted context had them
 * enabled.  This is equivalent to the i386 port's use of SDT_SYS386TGT.
 *
 * The cpu will push a certain amount of state onto the kernel stack for
 * the current process.  See amd64/include/frame.h.
 * The current RFLAGS (the status register, which includes
 * the interrupt disable state prior to the trap), the code segment register,
 * and the return instruction pointer are pushed by the cpu.  The cpu
 * will also push an 'error' code for certain traps.  We push a dummy
 * error code for those traps where the cpu does not, in order to maintain
 * a consistent frame.  We also push a contrived 'trap number'.
 *
 * The CPU does not push the general registers, so we must do that, and we
 * must restore them prior to calling 'iret'.  The CPU adjusts %cs and %ss
 * but does not mess with %ds, %es, %gs or %fs.  We swap the %gs base for
 * kernel mode operation shortly, without changes to the selector
 * loaded.  Since supervisor long mode works with any selectors loaded into
 * segment registers other than %cs, which makes them mostly unused in long
 * mode, and the kernel does not reference %fs, we leave them alone.  The
 * segment registers are reloaded on return to usermode.
 */
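
/*
 * For illustration, the hardware-pushed part of the frame at the entry
 * of a vector without a CPU-supplied error code looks like this
 * (offsets per amd64/include/frame.h):
 *
 *	%rsp ->	tf_rip		return %rip pushed by the CPU
 *		tf_cs		saved %cs
 *		tf_rflags	saved %rflags
 *		tf_rsp		saved %rsp
 *		tf_ss		saved %ss
 *
 * 'subq $TF_RIP,%rsp' in the macros below then leaves %rsp pointing at
 * the base of a full struct trapframe, with the hardware-pushed words
 * already at their final offsets.
 */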

/* Traps that we leave interrupts disabled for. */
	.macro	TRAP_NOEN	l, trapno
	PTI_ENTRY	\l,\l\()_pti_k,\l\()_pti_u
\l\()_pti_k:
	subq	$TF_RIP,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	jmp	alltraps_noen_k
\l\()_pti_u:
	subq	$TF_RIP,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	jmp	alltraps_noen_u

	.globl	X\l
	.type	X\l,@function
X\l:
	subq	$TF_RIP,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jz	alltraps_noen_k
	swapgs
	lfence
	jmp	alltraps_noen_u
	.endm

	TRAP_NOEN	bpt, T_BPTFLT
#ifdef KDTRACE_HOOKS
	TRAP_NOEN	dtrace_ret, T_DTRACE_RET
#endif

/* Regular traps; the cpu does not supply tf_err for these. */
	.macro	TRAP	l, trapno
	PTI_ENTRY	\l,\l\()_pti_k,\l\()_pti_u
\l\()_pti_k:
	subq	$TF_RIP,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	jmp	alltraps_k
\l\()_pti_u:
	subq	$TF_RIP,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	jmp	alltraps_u

	.globl	X\l
	.type	X\l,@function
X\l:
	subq	$TF_RIP,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jz	alltraps_k
	swapgs
	lfence
	jmp	alltraps_u
	.endm

	TRAP	div, T_DIVIDE
	TRAP	ofl, T_OFLOW
	TRAP	bnd, T_BOUND
	TRAP	ill, T_PRIVINFLT
	TRAP	dna, T_DNA
	TRAP	fpusegm, T_FPOPFLT
	TRAP	rsvd, T_RESERVED
	TRAP	fpu, T_ARITHTRAP
	TRAP	xmm, T_XMMFLT

/* This group of traps has tf_err already pushed by the cpu. */
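/*
 * Because the CPU already pushed the error code at tf_err, these
 * entries reserve the frame with 'subq $TF_ERR,%rsp' rather than
 * 'subq $TF_RIP,%rsp', and no dummy tf_err is stored.
 */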
	.macro	TRAP_ERR	l, trapno
	PTI_ENTRY	\l,\l\()_pti_k,\l\()_pti_u,has_err=1
\l\()_pti_k:
	subq	$TF_ERR,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	jmp	alltraps_k
\l\()_pti_u:
	subq	$TF_ERR,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	jmp	alltraps_u
	.globl	X\l
	.type	X\l,@function
X\l:
	subq	$TF_ERR,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jz	alltraps_k
	swapgs
	lfence
	jmp	alltraps_u
	.endm

	TRAP_ERR	tss, T_TSSFLT
	TRAP_ERR	align, T_ALIGNFLT

	/*
	 * alltraps_u/k entry points.
	 * SWAPGS must already have been performed by the prologue
	 * if this is the first entry into the kernel from userland.
	 * Re-enable interrupts if they were enabled before the trap.
	 * This approximates SDT_SYS386TGT on the i386 port.
	 */
	SUPERALIGN_TEXT
	.globl	alltraps_u
	.type	alltraps_u,@function
alltraps_u:
	movq	%rdi,TF_RDI(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	PCPU(CURPCB),%rdi
	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
	call	handle_ibrs_entry
	jmp	alltraps_save_segs
	SUPERALIGN_TEXT
	.globl	alltraps_k
	.type	alltraps_k,@function
alltraps_k:
	lfence
	movq	%rdi,TF_RDI(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
alltraps_save_segs:
	SAVE_SEGS
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	alltraps_pushregs_no_rax
	sti
alltraps_pushregs_no_rax:
	movq	%rsi,TF_RSI(%rsp)
	movq	%r8,TF_R8(%rsp)
	movq	%r9,TF_R9(%rsp)
	movq	%rbx,TF_RBX(%rsp)
	movq	%rbp,TF_RBP(%rsp)
	movq	%r10,TF_R10(%rsp)
	movq	%r11,TF_R11(%rsp)
	movq	%r12,TF_R12(%rsp)
	movq	%r13,TF_R13(%rsp)
	movq	%r14,TF_R14(%rsp)
	movq	%r15,TF_R15(%rsp)
	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
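	/*
	 * Clear the direction flag and, on SMAP hardware, the AC flag
	 * before entering C code.
	 */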
	pushfq
	andq	$~(PSL_D | PSL_AC),(%rsp)
	popfq
#ifdef KDTRACE_HOOKS
	/*
	 * DTrace Function Boundary Trace (fbt) probes are triggered
	 * by int3 (0xcc) which causes the #BP (T_BPTFLT) breakpoint
	 * interrupt.  For all other trap types, just handle them in
	 * the usual way.
	 */
	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
	jnz	calltrap		/* ignore userland traps */
	cmpl	$T_BPTFLT,TF_TRAPNO(%rsp)
	jne	calltrap

	/* Check if there is no DTrace hook registered. */
	cmpq	$0,dtrace_invop_jump_addr
	je	calltrap

	/*
	 * Set our jump address for the jump back in the event that
	 * the breakpoint wasn't caused by DTrace at all.
	 */
	movq	$calltrap,dtrace_invop_calltrap_addr(%rip)

	/* Jump to the code hooked in by DTrace. */
	jmpq	*dtrace_invop_jump_addr
#endif
	.globl	calltrap
	.type	calltrap,@function
calltrap:
	KMSAN_ENTER
	movq	%rsp, %rdi
	call	trap_check
	KMSAN_LEAVE
	jmp	doreti			/* Handle any pending ASTs */

	/*
	 * alltraps_noen_u/k entry points.
	 * Again, SWAPGS must already have been performed by the prologue,
	 * if needed.  Unlike alltraps above, we want to leave the
	 * interrupts disabled.  This corresponds to SDT_SYS386IGT on the
	 * i386 port.
	 */
	SUPERALIGN_TEXT
	.globl	alltraps_noen_u
	.type	alltraps_noen_u,@function
alltraps_noen_u:
	movq	%rdi,TF_RDI(%rsp)
	movq	PCPU(CURPCB),%rdi
	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
	jmp	alltraps_noen_save_segs
	SUPERALIGN_TEXT
	.globl	alltraps_noen_k
	.type	alltraps_noen_k,@function
alltraps_noen_k:
	lfence
	movq	%rdi,TF_RDI(%rsp)
alltraps_noen_save_segs:
	SAVE_SEGS
	movq	%rdx,TF_RDX(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jz	alltraps_pushregs_no_rax
	call	handle_ibrs_entry
	jmp	alltraps_pushregs_no_rax

IDTVEC(dblfault)
	subq	$TF_ERR,%rsp
	movl	$T_DOUBLEFLT,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	movq	%rdi,TF_RDI(%rsp)
	movq	%rsi,TF_RSI(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	%r8,TF_R8(%rsp)
	movq	%r9,TF_R9(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rbx,TF_RBX(%rsp)
	movq	%rbp,TF_RBP(%rsp)
	movq	%r10,TF_R10(%rsp)
	movq	%r11,TF_R11(%rsp)
	movq	%r12,TF_R12(%rsp)
	movq	%r13,TF_R13(%rsp)
	movq	%r14,TF_R14(%rsp)
	movq	%r15,TF_R15(%rsp)
	SAVE_SEGS
	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
	pushfq
	andq	$~(PSL_D | PSL_AC),(%rsp)
	popfq
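	/*
	 * Load the canonical GS.base value stored just above the
	 * trapframe at the base of the IST stack, as for NMI below.
	 */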
	movq	TF_SIZE(%rsp),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	movl	$MSR_GSBASE,%ecx
	wrmsr
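	/*
	 * Switch to the kernel page table if PTI is active; a KCR3
	 * value of ~0 (PMAP_NO_CR3) means PTI is disabled.
	 */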
	movq	%cr3,%rax
	movq	%rax,PCPU(SAVED_UCR3)
	movq	PCPU(KCR3),%rax
	cmpq	$~0,%rax
	je	2f
	movq	%rax,%cr3
2:	KMSAN_ENTER
	movq	%rsp,%rdi
	call	dblfault_handler
	KMSAN_LEAVE
3:	hlt
	jmp	3b

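/*
 * Page fault entries.  The PTI vector may be entered with the user
 * page table still active: it saves the user %cr3 in PCPU(SAVED_UCR3)
 * and switches to the kernel page table before joining the common
 * code.  The common code must read %cr2 before interrupts can be
 * re-enabled, since a nested fault would overwrite it.
 */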
	ALIGN_TEXT
IDTVEC(page_pti)
	testb	$SEL_RPL_MASK,PTI_CS-PTI_ERR(%rsp)
	jz	page_k
	swapgs
	lfence
	pushq	%rax
	movq	%cr3,%rax
	movq	%rax,PCPU(SAVED_UCR3)
	cmpq	$~0,PCPU(UCR3)
	jne	1f
	popq	%rax
	jmp	page_u
1:	pushq	%rdx
	PTI_UUENTRY has_err=1
	jmp	page_u
	ALIGN_TEXT
IDTVEC(page)
	testb	$SEL_RPL_MASK,TF_CS-TF_ERR(%rsp) /* Did we come from kernel? */
	jnz	page_u_swapgs		/* already running with kernel GS.base */
page_k:
	lfence
	subq	$TF_ERR,%rsp
	movq	%rdi,TF_RDI(%rsp)	/* free up GP registers */
	movq	%rax,TF_RAX(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	jmp	page_cr2
	ALIGN_TEXT
page_u_swapgs:
	swapgs
	lfence
page_u:
	subq	$TF_ERR,%rsp
	movq	%rdi,TF_RDI(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	PCPU(CURPCB),%rdi
	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
	movq	PCPU(SAVED_UCR3),%rax
	movq	%rax,PCB_SAVED_UCR3(%rdi)
	call	handle_ibrs_entry
page_cr2:
	movq	%cr2,%rdi		/* preserve %cr2 before ..  */
	movq	%rdi,TF_ADDR(%rsp)	/* enabling interrupts. */
	SAVE_SEGS
	movl	$T_PAGEFLT,TF_TRAPNO(%rsp)
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	alltraps_pushregs_no_rax
	sti
	jmp	alltraps_pushregs_no_rax

	/*
	 * We have to special-case this one.  If we get a trap in doreti() at
	 * the iretq stage, we'll reenter with the wrong gs state.  We'll have
	 * to do a special swapgs in this case even when coming from the
	 * kernel.  XXX linux has a trap handler for their equivalent of
	 * load_gs().
	 *
	 * On the stack, we have the hardware interrupt frame to return
	 * to usermode (faulted) and another frame with error code, for the
	 * fault.  For PTI, copy both frames to the main thread stack.
	 * Handle the potential 16-byte alignment adjustment incurred
	 * during the second fault by copying both frames independently
	 * while unwinding the stack in between.
	 */
	.macro PROTF_ENTRY name,trapno
\name\()_pti_doreti:
	swapgs
	lfence
	cmpq	$~0,PCPU(UCR3)
	je	1f
	pushq	%rax
	pushq	%rdx
	movq	PCPU(KCR3),%rax
	movq	%rax,%cr3
	movq	PCPU(RSP0),%rax
	subq	$2*PTI_SIZE-3*8,%rax /* no err, %rax, %rdx in faulted frame */
	MOVE_STACKS	(PTI_SIZE / 8)
	addq	$PTI_SIZE,%rax
	movq	PTI_RSP(%rsp),%rsp
	MOVE_STACKS	(PTI_SIZE / 8 - 3)
	subq	$PTI_SIZE,%rax
	movq	%rax,%rsp
	popq	%rdx
	popq	%rax
1:	swapgs
	jmp	X\name
IDTVEC(\name\()_pti)
	cmpq	$doreti_iret,PTI_RIP-2*8(%rsp)
	je	\name\()_pti_doreti
	testb	$SEL_RPL_MASK,PTI_CS-2*8(%rsp) /* %rax, %rdx not yet pushed */
	jz	X\name		/* lfence is not needed until %gs: use */
	PTI_UENTRY has_err=1
	swapgs	/* fence provided by PTI_UENTRY */
IDTVEC(\name)
	subq	$TF_ERR,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	jmp	prot_addrf
	.endm

	PROTF_ENTRY	missing, T_SEGNPFLT
	PROTF_ENTRY	stk, T_STKFLT
	PROTF_ENTRY	prot, T_PROTFLT

prot_addrf:
	movq	$0,TF_ADDR(%rsp)
	movq	%rdi,TF_RDI(%rsp)	/* free up a GP register */
	movq	%rax,TF_RAX(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movw	%fs,TF_FS(%rsp)
	movw	%gs,TF_GS(%rsp)
	leaq	doreti_iret(%rip),%rdi
	cmpq	%rdi,TF_RIP(%rsp)
	je	5f			/* kernel but with user gsbase!! */
	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
	jz	6f			/* already running with kernel GS.base */
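	/*
	 * If FSGSBASE is available, read the user %fs and %gs bases
	 * now, before swapgs replaces GS.base with the kernel value;
	 * they are saved into the PCB below.
	 */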
	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
	jz	2f
	rdfsbase %rax
	rdgsbase %rdx
2:	swapgs
	lfence
	movq	PCPU(CURPCB),%rdi
	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
	jz	4f
	movq	%rax,PCB_FSBASE(%rdi)
	movq	%rdx,PCB_GSBASE(%rdi)
	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)	/* full iret from user #gp */
4:	call	handle_ibrs_entry
	movw	%es,TF_ES(%rsp)
	movw	%ds,TF_DS(%rsp)
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	alltraps_pushregs_no_rax
	sti
	jmp	alltraps_pushregs_no_rax

5:	swapgs
6:	lfence
	movq	PCPU(CURPCB),%rdi
	jmp	4b

/*
 * Fast syscall entry point.  We enter here with just our new %cs/%ss set,
 * and the new privilege level.  We are still running on the old user stack
 * pointer.  We have to juggle a few things around to find our stack etc.
 * swapgs gives us access to our PCPU space only.
 *
 * We do not support invoking this from custom segment registers,
 * esp. %cs, %ss, %fs, %gs, e.g. using entries from an LDT.
 */
	SUPERALIGN_TEXT
IDTVEC(fast_syscall_pti)
	swapgs
	cmpq	$~0,PCPU(UCR3)
	je	fast_syscall_common
	movq	%rax,PCPU(SCRATCH_RAX)
	movq	PCPU(KCR3),%rax
	movq	%rax,%cr3
	movq	PCPU(SCRATCH_RAX),%rax
	jmp	fast_syscall_common
	SUPERALIGN_TEXT
IDTVEC(fast_syscall)
	swapgs
fast_syscall_common:
	movq	%rsp,PCPU(SCRATCH_RSP)
	movq	PCPU(RSP0),%rsp
	/* Now emulate a trapframe. Make the 8 byte alignment odd for call. */
	subq	$TF_SIZE,%rsp
	/* defer TF_RSP till we have a spare register */
	movq	%r11,TF_RFLAGS(%rsp)
	movq	%rcx,TF_RIP(%rsp)	/* %rcx original value is in %r10 */
	movq	PCPU(SCRATCH_RSP),%r11	/* %r11 already saved */
	movq	%r11,TF_RSP(%rsp)	/* user stack pointer */
	/*
	 * Save a few arg registers early to free them for use in
	 * handle_ibrs_entry().  %r10 is especially tricky.  It is not an
	 * arg register, but it holds the arg register %rcx.  Profiling
	 * preserves %rcx, but may clobber %r10.  Profiling may also
	 * clobber %r11, but %r11 (original %rflags) has been saved.
	 */
	movq	%rax,TF_RAX(%rsp)	/* syscall number */
	movq	%rdx,TF_RDX(%rsp)	/* arg 3 */
	movq	%r10,TF_RCX(%rsp)	/* arg 4 */
	SAVE_SEGS
	call	handle_ibrs_entry
	movq	PCPU(CURPCB),%r11
	andl	$~PCB_FULL_IRET,PCB_FLAGS(%r11)
	sti
	movq	$KUDSEL,TF_SS(%rsp)
	movq	$KUCSEL,TF_CS(%rsp)
	movq	$2,TF_ERR(%rsp)
	movq	%rdi,TF_RDI(%rsp)	/* arg 1 */
	movq	%rsi,TF_RSI(%rsp)	/* arg 2 */
	movq	%r8,TF_R8(%rsp)		/* arg 5 */
	movq	%r9,TF_R9(%rsp)		/* arg 6 */
	movq	%rbx,TF_RBX(%rsp)	/* C preserved */
	movq	%rbp,TF_RBP(%rsp)	/* C preserved */
	movq	%r12,TF_R12(%rsp)	/* C preserved */
	movq	%r13,TF_R13(%rsp)	/* C preserved */
	movq	%r14,TF_R14(%rsp)	/* C preserved */
	movq	%r15,TF_R15(%rsp)	/* C preserved */
	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
	movq	PCPU(CURTHREAD),%rdi
	movq	%rsp,TD_FRAME(%rdi)
	movl	TF_RFLAGS(%rsp),%esi
	andl	$PSL_T,%esi
	call	amd64_syscall
1:	movq	PCPU(CURPCB),%rax
	/* Disable interrupts before testing PCB_FULL_IRET. */
	cli
	testl	$PCB_FULL_IRET,PCB_FLAGS(%rax)
	jnz	4f
	/* Check for and handle ASTs on return to userland. */
	movq	PCPU(CURTHREAD),%rax
	cmpl	$0,TD_AST(%rax)
	jne	3f
	call	handle_ibrs_exit
	callq	*mds_handler
	/* Restore preserved registers. */
	movq	TF_RDI(%rsp),%rdi	/* bonus: preserve arg 1 */
	movq	TF_RSI(%rsp),%rsi	/* bonus: preserve arg 2 */
	movq	TF_RDX(%rsp),%rdx	/* return value 2 */
	movq	TF_RAX(%rsp),%rax	/* return value 1 */
	movq	TF_RFLAGS(%rsp),%r11	/* original %rflags */
	movq	TF_RIP(%rsp),%rcx	/* original %rip */
	movq	TF_RSP(%rsp),%rsp	/* user stack pointer */
	xorl	%r8d,%r8d		/* zero the rest of GPRs */
	xorl	%r10d,%r10d
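	/*
	 * If PTI is active (UCR3 != ~0), switch back to the user
	 * page table before returning with sysretq.
	 */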
	cmpq	$~0,PCPU(UCR3)
	je	2f
	movq	PCPU(UCR3),%r9
	andq	PCPU(UCR3_LOAD_MASK),%r9
	movq	%r9,%cr3
2:	xorl	%r9d,%r9d
	movq	$PMAP_UCR3_NOMASK,PCPU(UCR3_LOAD_MASK)
	swapgs
	sysretq

3:	/* AST scheduled. */
	sti
	movq	%rsp,%rdi
	call	ast
	jmp	1b

4:	/* Requested full context restore, use doreti for that. */
	jmp	doreti

/*
 * Here for CYA insurance, in case a "syscall" instruction gets
 * issued from 32-bit compatibility mode.  MSR_CSTAR has to point
 * to *something* if EFER_SCE is enabled.
 */
IDTVEC(fast_syscall32)
	sysret

/*
 * The DB# handler is very similar to the NMI handler, because
 * 'mov/pop %ss' delays generation of the exception until the next
 * instruction is executed, which might be a kernel entry.  So we must
 * execute the handler on the IST stack and be ready for a non-kernel
 * GSBASE.
 */
IDTVEC(dbg)
	subq	$TF_RIP,%rsp
	movl	$(T_TRCTRAP),TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	movq	%rdi,TF_RDI(%rsp)
	movq	%rsi,TF_RSI(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	%r8,TF_R8(%rsp)
	movq	%r9,TF_R9(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rbx,TF_RBX(%rsp)
	movq	%rbp,TF_RBP(%rsp)
	movq	%r10,TF_R10(%rsp)
	movq	%r11,TF_R11(%rsp)
	movq	%r12,TF_R12(%rsp)
	movq	%r13,TF_R13(%rsp)
	movq	%r14,TF_R14(%rsp)
	movq	%r15,TF_R15(%rsp)
	SAVE_SEGS
	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
	pushfq
	andq	$~(PSL_D | PSL_AC),(%rsp)
	popfq
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jnz	dbg_fromuserspace
	lfence
	/*
	 * We've interrupted the kernel.  See the comment in the NMI
	 * handler about register use.
	 */
	movq	%cr2,%r15
	movl	$MSR_GSBASE,%ecx
	rdmsr
	movq	%rax,%r12
	shlq	$32,%rdx
	orq	%rdx,%r12
	/* Retrieve and load the canonical value for GS.base. */
	movq	TF_SIZE(%rsp),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
	movq	%cr3,%r13
	movq	PCPU(KCR3),%rax
	cmpq	$~0,%rax
	je	1f
	movq	%rax,%cr3
1:	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
	je	2f
	movl	$MSR_IA32_SPEC_CTRL,%ecx
	rdmsr
	movl	%eax,%r14d
	call	handle_ibrs_entry
2:	movq	%rsp,%rdi
	call	trap
	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
	je	3f
	movl	%r14d,%eax
	xorl	%edx,%edx
	movl	$MSR_IA32_SPEC_CTRL,%ecx
	wrmsr
	/*
	 * Put back the preserved MSR_GSBASE value.
	 */
3:	movl	$MSR_GSBASE,%ecx
	movq	%r12,%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
	movq	%r13,%cr3
	movq	%r15,%cr2
	RESTORE_REGS
	addq	$TF_RIP,%rsp
	jmp	doreti_iret
dbg_fromuserspace:
	/*
	 * Switch to kernel GSBASE and kernel page table, and copy the frame
	 * from the IST stack to the normal kernel stack, since trap()
	 * re-enables interrupts, and since we might trap on DB# while
	 * in trap().
	 */
	swapgs
	lfence
	movq	PCPU(KCR3),%rax
	cmpq	$~0,%rax
	je	1f
	movq	%rax,%cr3
1:	movq	PCPU(RSP0),%rax
	movl	$TF_SIZE,%ecx
	subq	%rcx,%rax
	movq	%rax,%rdi
	movq	%rsp,%rsi
	rep;movsb
	movq	%rax,%rsp
	call	handle_ibrs_entry
	movq	PCPU(CURPCB),%rdi
	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)
	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
	jz	3f
	rdfsbase %rax
	movq	%rax,PCB_FSBASE(%rdi)
	movl	$MSR_KGSBASE,%ecx
	rdmsr
	shlq	$32,%rdx
	orq	%rdx,%rax
	movq	%rax,PCB_GSBASE(%rdi)
3:	jmp	calltrap

/*
 * NMI handling is special.
 *
 * First, NMIs do not respect the state of the processor's RFLAGS.IF
 * bit.  The NMI handler may be entered at any time, including when
 * the processor is in a critical section with RFLAGS.IF == 0.
 * The processor's GS.base value could be invalid on entry to the
 * handler.
 *
 * Second, the processor treats NMIs specially, blocking further NMIs
 * until an 'iretq' instruction is executed.  We thus need to execute
 * the NMI handler with interrupts disabled, to prevent a nested interrupt
 * from executing an 'iretq' instruction and inadvertently taking the
 * processor out of NMI mode.
 *
 * Third, the NMI handler runs on its own stack (tss_ist2).  The canonical
 * GS.base value for the processor is stored just above the bottom of its
 * NMI stack.  For NMIs taken from kernel mode, the current value of the
 * processor's GS.base is saved at entry in the C-preserved register %r12,
 * the canonical value for GS.base is then loaded into the processor, and
 * the saved value is restored at exit time.  For NMIs taken from user mode,
 * the cheaper 'SWAPGS' instruction is used to swap GS.base.
 */

IDTVEC(nmi)
	subq	$TF_RIP,%rsp
	movl	$(T_NMI),TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	movq	%rdi,TF_RDI(%rsp)
	movq	%rsi,TF_RSI(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	%r8,TF_R8(%rsp)
	movq	%r9,TF_R9(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rbx,TF_RBX(%rsp)
	movq	%rbp,TF_RBP(%rsp)
	movq	%r10,TF_R10(%rsp)
	movq	%r11,TF_R11(%rsp)
	movq	%r12,TF_R12(%rsp)
	movq	%r13,TF_R13(%rsp)
	movq	%r14,TF_R14(%rsp)
	movq	%r15,TF_R15(%rsp)
	SAVE_SEGS
	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
	pushfq
	andq	$~(PSL_D | PSL_AC),(%rsp)
	popfq
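	/*
	 * %ebx tracks whether the NMI was taken from user mode; it is
	 * tested on the return path below.
	 */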
	xorl	%ebx,%ebx
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jnz	nmi_fromuserspace
	/*
	 * We've interrupted the kernel.  Preserve in callee-saved regs:
	 * GS.base in %r12,
	 * %cr3 in %r13,
	 * possibly lower half of MSR_IA32_SPEC_CTL in %r14d,
	 * %cr2 in %r15.
	 */
	lfence
	movq	%cr2,%r15
	movl	$MSR_GSBASE,%ecx
	rdmsr
	movq	%rax,%r12
	shlq	$32,%rdx
	orq	%rdx,%r12
	/* Retrieve and load the canonical value for GS.base. */
	movq	TF_SIZE(%rsp),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
	movq	%cr3,%r13
	movq	PCPU(KCR3),%rax
	cmpq	$~0,%rax
	je	1f
	movq	%rax,%cr3
1:	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
	je	nmi_calltrap
	movl	$MSR_IA32_SPEC_CTRL,%ecx
	rdmsr
	movl	%eax,%r14d
	call	handle_ibrs_entry
	jmp	nmi_calltrap
nmi_fromuserspace:
	incl	%ebx
	swapgs
	lfence
	movq	%cr3,%r13
	movq	PCPU(KCR3),%rax
	cmpq	$~0,%rax
	je	1f
	movq	%rax,%cr3
1:	call	handle_ibrs_entry
	movq	PCPU(CURPCB),%rdi
	testq	%rdi,%rdi
	jz	3f
	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)
	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
	jz	3f
	rdfsbase %rax
	movq	%rax,PCB_FSBASE(%rdi)
	movl	$MSR_KGSBASE,%ecx
	rdmsr
	shlq	$32,%rdx
	orq	%rdx,%rax
	movq	%rax,PCB_GSBASE(%rdi)
3:
/* Note: this label is also used by ddb and gdb: */
nmi_calltrap:
	KMSAN_ENTER
	movq	%rsp,%rdi
	call	trap
	KMSAN_LEAVE
#ifdef HWPMC_HOOKS
	/*
	 * Capture a userspace callchain if needed.
	 *
	 * - Check if the current trap was from user mode.
	 * - Check if the current thread is valid.
	 * - Check if the thread requires a user call chain to be
	 *   captured.
	 *
	 * We are still in NMI mode at this point.
	 */
	testl	%ebx,%ebx
	jz	nocallchain	/* not from userspace */
	movq	PCPU(CURTHREAD),%rax
	orq	%rax,%rax	/* curthread present? */
	jz	nocallchain
	/*
	 * Move execution to the regular kernel stack, because we
	 * are committed to returning through doreti.
	 */
	movq	%rsp,%rsi	/* source stack pointer */
	movq	$TF_SIZE,%rcx
	movq	PCPU(RSP0),%rdx
	subq	%rcx,%rdx
	movq	%rdx,%rdi	/* destination stack pointer */
	shrq	$3,%rcx		/* trap frame size in long words */
	pushfq
	andq	$~(PSL_D | PSL_AC),(%rsp)
	popfq
	rep
	movsq			/* copy trapframe */
	movq	%rdx,%rsp	/* we are on the regular kstack */

	testl	$TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */
	jz	nocallchain
	/*
	 * A user callchain is to be captured, so:
	 * - Take the processor out of "NMI" mode by faking an "iret",
	 *   to allow for nested NMI interrupts.
	 * - Enable interrupts, so that copyin() can work.
	 */
	movl	%ss,%eax
	pushq	%rax		/* tf_ss */
	pushq	%rdx		/* tf_rsp (on kernel stack) */
	pushfq			/* tf_rflags */
	movl	%cs,%eax
	pushq	%rax		/* tf_cs */
	pushq	$outofnmi	/* tf_rip */
	iretq
outofnmi:
	/*
	 * At this point the processor has exited NMI mode and is running
	 * with interrupts turned off on the normal kernel stack.
	 *
	 * If a pending NMI gets recognized at or after this point, it
	 * will cause a kernel callchain to be traced.
	 *
	 * We turn interrupts back on, and call the user callchain capture hook.
	 */
	movq	pmc_hook,%rax
	orq	%rax,%rax
	jz	nocallchain
	movq	PCPU(CURTHREAD),%rdi		/* thread */
	movq	$PMC_FN_USER_CALLCHAIN,%rsi	/* command */
	movq	%rsp,%rdx			/* frame */
	sti
	call	*%rax
	cli
nocallchain:
#endif
	testl	%ebx,%ebx	/* %ebx != 0 => return to userland */
	jnz	doreti_exit
	/*
	 * Restore speculation control MSR, if preserved.
	 */
	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
	je	1f
	movl	%r14d,%eax
	xorl	%edx,%edx
	movl	$MSR_IA32_SPEC_CTRL,%ecx
	wrmsr
	/*
	 * Put back the preserved MSR_GSBASE value.
	 */
1:	movl	$MSR_GSBASE,%ecx
	movq	%r12,%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
	cmpb	$0, nmi_flush_l1d_sw(%rip)
	je	2f
	call	flush_l1d_sw		/* bhyve L1TF assist */
2:	movq	%r13,%cr3
	movq	%r15,%cr2
	RESTORE_REGS
	addq	$TF_RIP,%rsp
	jmp	doreti_iret

/*
 * MC# handling is similar to NMI.
 *
 * As with NMIs, machine check exceptions do not respect RFLAGS.IF and
 * can occur at any time with a GS.base value that does not correspond
 * to the privilege level in CS.
 *
 * Machine checks are not unblocked by iretq, but it is best to run
 * the handler with interrupts disabled since the exception may have
 * interrupted a critical section.
 *
 * The MC# handler runs on its own stack (tss_ist3).  The canonical
 * GS.base value for the processor is stored just above the bottom of
 * its MC# stack.  For exceptions taken from kernel mode, the current
 * value of the processor's GS.base is saved at entry in the C-preserved
 * register %r12, the canonical value for GS.base is then loaded into
 * the processor, and the saved value is restored at exit time.  For
 * exceptions taken from user mode, the cheaper 'SWAPGS' instruction
 * is used to swap GS.base.
 */

IDTVEC(mchk)
	subq	$TF_RIP,%rsp
	movl	$(T_MCHK),TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	movq	%rdi,TF_RDI(%rsp)
	movq	%rsi,TF_RSI(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	%r8,TF_R8(%rsp)
	movq	%r9,TF_R9(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rbx,TF_RBX(%rsp)
	movq	%rbp,TF_RBP(%rsp)
	movq	%r10,TF_R10(%rsp)
	movq	%r11,TF_R11(%rsp)
	movq	%r12,TF_R12(%rsp)
	movq	%r13,TF_R13(%rsp)
	movq	%r14,TF_R14(%rsp)
	movq	%r15,TF_R15(%rsp)
	SAVE_SEGS
	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
	pushfq
	andq	$~(PSL_D | PSL_AC),(%rsp)
	popfq
	xorl	%ebx,%ebx
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jnz	mchk_fromuserspace
	/*
	 * We've interrupted the kernel.  See the comment in the NMI
	 * handler about register use.
	 */
	movq	%cr2,%r15
	movl	$MSR_GSBASE,%ecx
	rdmsr
	movq	%rax,%r12
	shlq	$32,%rdx
	orq	%rdx,%r12
	/* Retrieve and load the canonical value for GS.base. */
	movq	TF_SIZE(%rsp),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
	movq	%cr3,%r13
	movq	PCPU(KCR3),%rax
	cmpq	$~0,%rax
	je	1f
	movq	%rax,%cr3
1:	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
	je	mchk_calltrap
	movl	$MSR_IA32_SPEC_CTRL,%ecx
	rdmsr
	movl	%eax,%r14d
	call	handle_ibrs_entry
	jmp	mchk_calltrap
mchk_fromuserspace:
	incl	%ebx
	swapgs
	movq	%cr3,%r13
	movq	PCPU(KCR3),%rax
	cmpq	$~0,%rax
	je	1f
	movq	%rax,%cr3
1:	call	handle_ibrs_entry
/* Note: this label is also used by ddb and gdb: */
mchk_calltrap:
	KMSAN_ENTER
	movq	%rsp,%rdi
	call	mca_intr
	KMSAN_LEAVE
	testl	%ebx,%ebx	/* %ebx != 0 => return to userland */
	jnz	doreti_exit
	/*
	 * Restore speculation control MSR, if preserved.
	 */
	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
	je	1f
	movl	%r14d,%eax
	xorl	%edx,%edx
	movl	$MSR_IA32_SPEC_CTRL,%ecx
	wrmsr
	/*
	 * Put back the preserved MSR_GSBASE value.
	 */
1:	movl	$MSR_GSBASE,%ecx
	movq	%r12,%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
	movq	%r13,%cr3
	movq	%r15,%cr2
	RESTORE_REGS
	addq	$TF_RIP,%rsp
	jmp	doreti_iret

ENTRY(fork_trampoline)
	movq	%r12,%rdi		/* function */
	movq	%rbx,%rsi		/* arg1 */
	movq	%rsp,%rdx		/* trapframe pointer */
	call	fork_exit
	jmp	doreti			/* Handle any ASTs */

/*
 * To efficiently implement classification of trap and interrupt handlers
 * for profiling, there must be only trap handlers between the labels btrap
 * and bintr, and only interrupt handlers between the labels bintr and
 * eintr.  This is implemented (partly) by including files that contain
 * some of the handlers.  Before including the files, set up a normal asm
 * environment so that the included files don't need to know that they are
 * included.
 */

#ifdef COMPAT_FREEBSD32
	.data
	.p2align 4
	.text
	SUPERALIGN_TEXT

#include <amd64/ia32/ia32_exception.S>
#endif

	.data
	.p2align 4
	.text
	SUPERALIGN_TEXT
#include <amd64/amd64/apic_vector.S>

#ifdef DEV_ATPIC
	.data
	.p2align 4
	.text
	SUPERALIGN_TEXT

#include <amd64/amd64/atpic_vector.S>
#endif

/*
 * void doreti(struct trapframe)
 *
 * Handle return from interrupts, traps and syscalls.
 */
	.text
	SUPERALIGN_TEXT
	.type	doreti,@function
	.globl	doreti
doreti:
	/*
	 * Check if ASTs can be handled now.
	 */
	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* are we returning to user mode? */
	jz	doreti_exit		/* can't handle ASTs now if not */

doreti_ast:
	/*
	 * Check for ASTs atomically with returning.  Disabling CPU
	 * interrupts provides sufficient locking even in the SMP case,
	 * since we will be informed of any new ASTs by an IPI.
	 */
	cli
	movq	PCPU(CURTHREAD),%rax
	cmpl	$0,TD_AST(%rax)
	je	doreti_exit
	sti
	movq	%rsp,%rdi	/* pass a pointer to the trapframe */
	call	ast
	jmp	doreti_ast

	/*
	 * doreti_exit:	pop registers, iret.
	 *
	 *	The segment register pop is a special case, since it may
	 *	fault if (for example) a sigreturn specifies bad segment
	 *	registers.  The fault is handled in trap.c.
	 */
doreti_exit:
	movq	PCPU(CURPCB),%r8

	/*
	 * Do not reload segment registers for the kernel.
	 * Since we do not reload segment registers with sane
	 * values on kernel entry, descriptors referenced by
	 * segment registers might not be valid.  This is fatal
	 * for user mode, but is not a problem for the kernel.
	 */
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jz	ld_regs
	testl	$PCB_FULL_IRET,PCB_FLAGS(%r8)
	jz	ld_regs
	andl	$~PCB_FULL_IRET,PCB_FLAGS(%r8)
	testl	$TF_HASSEGS,TF_FLAGS(%rsp)
	je	set_segs

do_segs:
	/* Restore %fs and fsbase */
	movw	TF_FS(%rsp),%ax
	.globl	ld_fs
ld_fs:
	movw	%ax,%fs
	movl	$MSR_FSBASE,%ecx
	movl	PCB_FSBASE(%r8),%eax
	movl	PCB_FSBASE+4(%r8),%edx
	.globl	ld_fsbase
ld_fsbase:
	wrmsr
	/* Restore %gs and gsbase */
	movw	TF_GS(%rsp),%si
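	/*
	 * Loading a selector into %gs overwrites the hidden GS.base,
	 * so keep interrupts disabled from here until the kernel base
	 * has been written back below.
	 */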
	pushfq
	cli
	movl	$MSR_GSBASE,%ecx
	/* Save current kernel %gs base into %r12d:%r13d */
	rdmsr
	movl	%eax,%r12d
	movl	%edx,%r13d
	.globl	ld_gs
ld_gs:
	movw	%si,%gs
	/* Restore kernel %gs base */
	movl	%r12d,%eax
	movl	%r13d,%edx
	wrmsr
	popfq
	/*
	 * Restore user %gs base, either from PCB if used for TLS, or
	 * from the previously saved MSR read.
	 */
	movl	$MSR_KGSBASE,%ecx
	movl	PCB_GSBASE(%r8),%eax
	movl	PCB_GSBASE+4(%r8),%edx
	.globl	ld_gsbase
ld_gsbase:
	wrmsr	/* May trap if non-canonical, but only for TLS. */
	.globl	ld_es
ld_es:
	movw	TF_ES(%rsp),%es
	.globl	ld_ds
ld_ds:
	movw	TF_DS(%rsp),%ds
ld_regs:
	RESTORE_REGS
	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
	jz	2f			/* keep running with kernel GS.base */
	cli
	call	handle_ibrs_exit_rs
	callq	*mds_handler
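	/*
	 * For PTI, build the return frame on the per-CPU trampoline
	 * stack, then switch both %rsp and %cr3 before the final
	 * iretq from the trampoline.
	 */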
	cmpq	$~0,PCPU(UCR3)
	je	1f
	pushq	%rdx
	movq	PCPU(PTI_RSP0),%rdx
	subq	$PTI_SIZE,%rdx
	movq	%rax,PTI_RAX(%rdx)
	popq	%rax
	movq	%rax,PTI_RDX(%rdx)
	movq	TF_RIP(%rsp),%rax
	movq	%rax,PTI_RIP(%rdx)
	movq	TF_CS(%rsp),%rax
	movq	%rax,PTI_CS(%rdx)
	movq	TF_RFLAGS(%rsp),%rax
	movq	%rax,PTI_RFLAGS(%rdx)
	movq	TF_RSP(%rsp),%rax
	movq	%rax,PTI_RSP(%rdx)
	movq	TF_SS(%rsp),%rax
	movq	%rax,PTI_SS(%rdx)
	movq	PCPU(UCR3),%rax
	andq	PCPU(UCR3_LOAD_MASK),%rax
	movq	$PMAP_UCR3_NOMASK,PCPU(UCR3_LOAD_MASK)
	swapgs
	movq	%rdx,%rsp
	movq	%rax,%cr3
	popq	%rdx
	popq	%rax
	addq	$8,%rsp
	jmp	doreti_iret
1:	swapgs
2:	addq	$TF_RIP,%rsp
	.globl	doreti_iret
doreti_iret:
	iretq

set_segs:
	movw	$KUDSEL,%ax
	movw	%ax,TF_DS(%rsp)
	movw	%ax,TF_ES(%rsp)
	movw	$KUF32SEL,TF_FS(%rsp)
	movw	$KUG32SEL,TF_GS(%rsp)
	jmp	do_segs

	/*
	 * doreti_iret_fault.  Alternative return code for
	 * the case where we get a fault in the doreti_exit code
	 * above.  trap() (amd64/amd64/trap.c) catches this specific
	 * case, sends the process a signal and continues in the
	 * corresponding place in the code below.
	 */
	ALIGN_TEXT
	.globl	doreti_iret_fault
doreti_iret_fault:
	subq	$TF_RIP,%rsp		/* space including tf_err, tf_trapno */
	movq	%rax,TF_RAX(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	call	handle_ibrs_entry
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jz	1f
	sti
1:
	SAVE_SEGS
	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
	movq	%rdi,TF_RDI(%rsp)
	movq	%rsi,TF_RSI(%rsp)
	movq	%r8,TF_R8(%rsp)
	movq	%r9,TF_R9(%rsp)
	movq	%rbx,TF_RBX(%rsp)
	movq	%rbp,TF_RBP(%rsp)
	movq	%r10,TF_R10(%rsp)
	movq	%r11,TF_R11(%rsp)
	movq	%r12,TF_R12(%rsp)
	movq	%r13,TF_R13(%rsp)
	movq	%r14,TF_R14(%rsp)
	movq	%r15,TF_R15(%rsp)
	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	movq	$0,TF_ERR(%rsp)	/* XXX should be the error code */
	movq	$0,TF_ADDR(%rsp)
	jmp	calltrap

	ALIGN_TEXT
	.globl	ds_load_fault
ds_load_fault:
	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jz	1f
	sti
1:
	movq	%rsp,%rdi
	call	trap
	movw	$KUDSEL,TF_DS(%rsp)
	jmp	doreti

	ALIGN_TEXT
	.globl	es_load_fault
es_load_fault:
	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	1f
	sti
1:
	movq	%rsp,%rdi
	call	trap
	movw	$KUDSEL,TF_ES(%rsp)
	jmp	doreti

	ALIGN_TEXT
	.globl	fs_load_fault
fs_load_fault:
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	1f
	sti
1:
	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	movq	%rsp,%rdi
	call	trap
	movw	$KUF32SEL,TF_FS(%rsp)
	jmp	doreti

	ALIGN_TEXT
	.globl	gs_load_fault
gs_load_fault:
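	/*
	 * Pop the rflags image pushed before the faulted %gs load;
	 * %rsp then points at the outer trapframe again.
	 */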
	popfq
	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	1f
	sti
1:
	movq	%rsp,%rdi
	call	trap
	movw	$KUG32SEL,TF_GS(%rsp)
	jmp	doreti

	ALIGN_TEXT
	.globl	fsbase_load_fault
fsbase_load_fault:
	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	1f
	sti
1:
	movq	%rsp,%rdi
	call	trap
	movq	PCPU(CURTHREAD),%r8
	movq	TD_PCB(%r8),%r8
	movq	$0,PCB_FSBASE(%r8)
	jmp	doreti

	ALIGN_TEXT
	.globl	gsbase_load_fault
gsbase_load_fault:
	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	1f
	sti
1:
	movq	%rsp,%rdi
	call	trap
	movq	PCPU(CURTHREAD),%r8
	movq	TD_PCB(%r8),%r8
	movq	$0,PCB_GSBASE(%r8)
	jmp	doreti

#ifdef HWPMC_HOOKS
	ENTRY(end_exceptions)
#endif