xref: /illumos-gate/usr/src/uts/i86pc/ml/kpti_trampolines.S (revision b3783300013fa93b98278c901b855062f538f7e2)
1/*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source.  A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11/*
12 * Copyright 2019 Joyent, Inc.
13 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
14 */
15
16/*
17 * This file contains the trampolines that are used by KPTI in order to be
18 * able to take interrupts/trap/etc while on the "user" page table.
19 *
20 * We don't map the full kernel text into the user page table: instead we
21 * map this one small section of trampolines (which compiles to ~13 pages).
22 * These trampolines are set in the IDT always (so they will run no matter
23 * whether we're on the kernel or user page table), and their primary job is to
24 * pivot us to the kernel %cr3 and %rsp without ruining everything.
25 *
26 * All of these interrupts use the amd64 IST feature when we have KPTI enabled,
27 * meaning that they will execute with their %rsp set to a known location, even
28 * if we take them in the kernel.
29 *
30 * Over in desctbls.c (for cpu0) and mp_pc.c (other cpus) we set up the IST
31 * stack to point at &cpu->cpu_m.mcpu_kpti.kf_tr_rsp. You can see the mcpu_kpti
32 * (a struct kpti_frame) defined in machcpuvar.h. This struct is set up to be
33 * page-aligned, and we map the page it's on into both page tables. Using a
34 * struct attached to the cpu_t also means that we can use %rsp-relative
35 * addressing to find anything on the cpu_t, so we don't have to touch %gs or
36 * GSBASE at all on incoming interrupt trampolines (which can get pretty hairy).
37 *
38 * This little struct is where the CPU will push the actual interrupt frame.
39 * Then, in the trampoline, we change %cr3, then figure out our destination
40 * stack pointer and "pivot" to it (set %rsp and re-push the CPU's interrupt
41 * frame). Then we jump to the regular ISR in the kernel text and carry on as
42 * normal.
43 *
44 * We leave the original frame and any spilled regs behind in the kpti_frame
45 * lazily until we want to return to userland. Then, we clear any spilled
46 * regs from it, and overwrite the rest with our iret frame. When switching
47 * this cpu to a different process (in hat_switch), we bzero the whole region to
48 * make sure nothing can leak between processes.
49 *
50 * When we're returning back to the original place we took the interrupt later
51 * (especially if it was in userland), we have to jmp back to the "return
52 * trampolines" here, since when we set %cr3 back to the user value, we need to
53 * be executing from code here in these shared pages and not the main kernel
54 * text again. Even though it should be fine to iret directly from kernel text
55 * when returning to kernel code, we make things jmp to a trampoline here just
56 * for consistency.
57 *
 * Note that with IST, it is very important that we have always pivoted
 * away from the IST stack before we could possibly take any other interrupt
 * on the same IST (unless it's an end-of-the-world fault and we don't care
 * about ever coming back from it).
62 *
63 * This is particularly relevant to the dbgtrap/brktrap trampolines, as they
64 * regularly have to happen from within trampoline code (e.g. in the sysenter
65 * single-step case) and then return to the world normally. As a result, these
66 * two are IST'd to their own kpti_frame right above the normal one (in the same
67 * page), so they don't clobber their parent interrupt.
68 *
69 * To aid with debugging, we also IST the page fault (#PF/pftrap), general
70 * protection fault (#GP/gptrap) and stack fault (#SS/stktrap) interrupts to
71 * their own separate kpti_frame. This ensures that if we take one of these
72 * due to a bug in trampoline code, we preserve the original trampoline
73 * state that caused the trap.
74 *
75 * NMI, MCE and dblfault interrupts also are taken on their own dedicated IST
76 * stacks, since they can interrupt another ISR at any time. These stacks are
77 * full-sized, however, and not a little kpti_frame struct. We only set %cr3 in
78 * their trampolines (and do it unconditionally), and don't bother pivoting
79 * away. We're either going into the panic() path, or we're going to return
80 * straight away without rescheduling, so it's fine to not be on our real
81 * kthread stack (and some of the state we want to go find it with might be
82 * corrupt!)
83 *
84 * Finally, for these "special" interrupts (NMI/MCE/double fault) we use a
85 * special %cr3 value we stash here in the text (kpti_safe_cr3). We set this to
86 * point at the PML4 for kas early in boot and never touch it again. Hopefully
87 * it survives whatever corruption brings down the rest of the kernel!
88 *
89 * Syscalls are different to interrupts (at least in the SYSENTER/SYSCALL64
90 * cases) in that they do not push an interrupt frame (and also have some other
91 * effects). In the syscall trampolines, we assume that we can only be taking
92 * the call from userland and use swapgs and an unconditional overwrite of %cr3.
93 * We do not do any stack pivoting for syscalls (and we leave SYSENTER's
94 * existing %rsp pivot untouched) -- instead we spill registers into
95 * %gs:CPU_KPTI_* as we need to.
96 *
97 * Note that the normal %cr3 values do not cause invalidations with PCIDE - see
98 * hat_switch().
99 */
100
101/*
102 * The macros here mostly line up with what's in kdi_idthdl.s, too, so if you
103 * fix bugs here check to see if they should be fixed there as well.
104 */
105
106#include <sys/asm_linkage.h>
107#include <sys/asm_misc.h>
108#include <sys/regset.h>
109#include <sys/privregs.h>
110#include <sys/psw.h>
111#include <sys/machbrand.h>
112#include <sys/param.h>
113
114#include <sys/segments.h>
115#include <sys/pcb.h>
116#include <sys/trap.h>
117#include <sys/ftrace.h>
118#include <sys/traptrace.h>
119#include <sys/clock.h>
120#include <sys/model.h>
121#include <sys/panic.h>
122
123#if defined(__xpv)
124#include <sys/hypervisor.h>
125#endif
126
127#include "assym.h"
128
129	.data
130	DGDEF3(kpti_enable, 8, 8)
131	.fill	1, 8, 1
132
#if DEBUG
	/* Message handed to panic() by the DEBUG-only check in tr_iret_user. */
	.data
_bad_ts_panic_msg:
	.asciz	"kpti_trampolines.s: tr_iret_user but CR0.TS set"
#endif
138
/*
 * Start of the trampoline text.  The region begins on a page boundary and
 * is bracketed by the kpti_tramp_start and kpti_tramp_end labels.
 */
.section ".text"
.align	MMU_PAGESIZE

.globl	kpti_tramp_start
kpti_tramp_start:
	nop

/*
 * %cr3 value loaded by the tr_nmiint, tr_syserrtrap and tr_mcetrap
 * trampolines.  This will be set by mlsetup, and then double-checked later.
 */
.globl	kpti_safe_cr3
kpti_safe_cr3:
	.quad	0
	SET_SIZE(kpti_safe_cr3)

/*
 * Address the interrupt trampolines compare a trap frame's %rsp against to
 * decide whether it can be a kernel stack pointer.  startup_kmem() will
 * overwrite this.
 */
.globl	kpti_kbase
kpti_kbase:
	.quad	KERNELBASE
	SET_SIZE(kpti_kbase)
157
/*
 * SET_KERNEL_CR3(spillreg)
 *
 * Record the current %cr3 in %gs:CPU_KPTI_TR_CR3, then load
 * %gs:CPU_KPTI_KCR3 into %cr3 unless that value is zero.  Addresses the
 * per-CPU data through %gs, so the caller must already have the kernel
 * gsbase loaded.  Clobbers spillreg and the flags.
 */
#define	SET_KERNEL_CR3(spillreg)		\
	movq	%cr3, spillreg;			\
	movq	spillreg, %gs:CPU_KPTI_TR_CR3;	\
	movq	%gs:CPU_KPTI_KCR3, spillreg;	\
	cmpq	$0, spillreg;			\
	je	9f;				\
	movq	spillreg, %cr3;			\
9:
166
/*
 * SET_USER_CR3(spillreg)
 *
 * Load %gs:CPU_KPTI_UCR3 into %cr3 by way of spillreg.  DEBUG kernels
 * additionally save the outgoing %cr3 in %gs:CPU_KPTI_TR_CR3 first.
 * Clobbers spillreg; contains no local labels.
 */
#if DEBUG
#define	SET_USER_CR3(spillreg)			\
	movq	%cr3, spillreg;			\
	movq	spillreg, %gs:CPU_KPTI_TR_CR3;	\
	movq	%gs:CPU_KPTI_UCR3, spillreg;	\
	movq	spillreg, %cr3
#else
#define	SET_USER_CR3(spillreg)			\
	movq	%gs:CPU_KPTI_UCR3, spillreg;	\
	movq	spillreg, %cr3
#endif
178
/*
 * PIVOT_KPTI_STK(spillreg)
 *
 * On entry %rsp points at an iret frame.  Switch %rsp to
 * %gs:CPU_KPTI_RET_RSP and copy the five iret words (%ss, %rsp, %rflags,
 * %cs, %rip) from the old stack onto the new one, so that %rsp again points
 * at an iret frame.  Leaves the old stack pointer in spillreg.
 */
#define	PIVOT_KPTI_STK(spillreg)		\
	movq	%rsp, spillreg;			\
	movq	%gs:CPU_KPTI_RET_RSP, %rsp;	\
	pushq	T_FRAMERET_SS(spillreg);	\
	pushq	T_FRAMERET_RSP(spillreg);	\
	pushq	T_FRAMERET_RFLAGS(spillreg);	\
	pushq	T_FRAMERET_CS(spillreg);	\
	pushq	T_FRAMERET_RIP(spillreg)
187
188
/*
 * INTERRUPT_TRAMPOLINE_P(errpush)
 *
 * Common body of the ordinary interrupt entry trampolines.  On entry %rsp
 * is inside the kpti_frame, just below the interrupt frame and error-code
 * slot.  The macro:
 *
 *   - spills %r13/%r14 into the frame and rebases %rsp so the KPTI_*
 *     offsets apply to it;
 *   - records the incoming %cr3 in KPTI_TR_CR3;
 *   - if the frame's %cs is not KCS_SEL, or its %rsp is below kpti_kbase,
 *     loads KPTI_KCR3 into %cr3 (when non-zero) and picks the top of the
 *     current kthread's stack as the destination;
 *   - otherwise picks the frame's own %rsp, rounded down to 16 bytes;
 *   - switches to the destination stack, re-pushes the interrupt frame,
 *     runs errpush, and restores %r13/%r14.
 *
 * errpush is either empty or an instruction that re-pushes the error code.
 */
#define	INTERRUPT_TRAMPOLINE_P(errpush)	\
	pushq	%r13;				\
	pushq	%r14;				\
	subq	$KPTI_R14, %rsp;		\
	/* Remember the %cr3 we came in on. */	\
	movq	%cr3, %r14;			\
	movq	%r14, KPTI_TR_CR3(%rsp);	\
						\
	/* Kernel %cs?  Go vet the %rsp. */	\
	cmpw	$KCS_SEL, KPTI_CS(%rsp);	\
	je	3f;				\
1:						\
	/* Switch to the kernel %cr3 if set */	\
	movq	KPTI_KCR3(%rsp), %r14;		\
	cmpq	$0, %r14;			\
	je	2f;				\
	movq	%r14, %cr3;			\
2:						\
	/* %r13 = cpu_t holding this frame */	\
	movq	%rsp, %r13;			\
	andq	$(~(MMU_PAGESIZE - 1)), %r13;	\
	subq	$CPU_KPTI_START, %r13;		\
	/* %r14 = top of the kthread stack */	\
	movq	CPU_THREAD(%r13), %r14;		\
	movq	T_STACK(%r14), %r14;		\
	addq	$REGSIZE+MINFRAME, %r14;	\
	jmp	4f;				\
3:						\
	/* Frame %rsp below kpti_kbase? */	\
	/* Then handle it like the 1: case. */	\
	movq	kpti_kbase, %r14;		\
	cmpq	%r14, KPTI_RSP(%rsp);		\
	jb	1b;				\
	/* %r14 = interrupted %rsp, aligned */	\
	movq	KPTI_RSP(%rsp), %r14;		\
	andq	$(~0xf), %r14;			\
4:						\
	movq	%rsp, %r13;			\
	/* Pivot: %r14 is the new stack */	\
	movq	%r14, %rsp;			\
	pushq	KPTI_SS(%r13);			\
	pushq	KPTI_RSP(%r13);			\
	pushq	KPTI_RFLAGS(%r13);		\
	pushq	KPTI_CS(%r13);			\
	pushq	KPTI_RIP(%r13);			\
	errpush;				\
	movq	KPTI_R14(%r13), %r14;		\
	movq	KPTI_R13(%r13), %r13

/* Variant that does not re-push an error code on the destination stack. */
#define	INTERRUPT_TRAMPOLINE_NOERR		\
	INTERRUPT_TRAMPOLINE_P(/**/)

/* Variant that re-pushes the error code saved in the kpti_frame. */
#define	INTERRUPT_TRAMPOLINE			\
	INTERRUPT_TRAMPOLINE_P(pushq KPTI_ERR(%r13))
242
/*
 * This is used for all interrupts that can plausibly be taken inside another
 * interrupt and are using a kpti_frame stack (so #BP, #DB, #GP, #PF, #SS).
 *
 * We also use this for #NP, even though it uses the standard IST: the
 * additional %rsp checks below will catch when we get an exception doing an
 * iret to userspace with a bad %cs/%ss.  This appears as a kernel trap, and
 * only later gets redirected via kern_gpfault().
 *
 * We check for whether we took the interrupt while in another trampoline, in
 * which case we need to use the kthread stack.
 *
 * Differences from INTERRUPT_TRAMPOLINE_P:
 *
 *   - KPTI_FLAG is set to 1 for the duration of the trampoline and cleared
 *     again just before it finishes.  Finding it already non-zero on entry
 *     means this kpti_frame is being re-entered, and we bail out via int $8.
 *   - A kernel-%cs frame whose %rsp lies in the same page as this
 *     kpti_frame is treated like a user-mode interrupt (kthread stack).
 *   - A kernel-%cs frame whose %rip lies within
 *     [kpti_tramp_start, kpti_tramp_end] gets the kernel %cr3 loaded, since
 *     the interrupted trampoline may not have switched yet.
 *
 * Every load of KPTI_KCR3 into %cr3 is skipped when the saved value is zero,
 * including the "interrupted a trampoline" case, so that we never load a
 * zero %cr3.
 *
 * errpush is either empty or an instruction that re-pushes the error code.
 */
#define	DBG_INTERRUPT_TRAMPOLINE_P(errpush)	\
	pushq	%r13;				\
	pushq	%r14;				\
	subq	$KPTI_R14, %rsp;		\
	/* Check for clobbering */		\
	cmpq	$0, KPTI_FLAG(%rsp);		\
	je	1f;				\
	/* Don't worry, this totally works */	\
	int	$8;				\
1:						\
	movq	$1, KPTI_FLAG(%rsp);		\
	/* Save current %cr3. */		\
	mov	%cr3, %r14;			\
	mov	%r14, KPTI_TR_CR3(%rsp);	\
						\
	cmpw	$KCS_SEL, KPTI_CS(%rsp);	\
	je	4f;				\
2:						\
	/* Change to the "kernel" %cr3 */	\
	mov	KPTI_KCR3(%rsp), %r14;		\
	cmp	$0, %r14;			\
	je	3f;				\
	mov	%r14, %cr3;			\
3:						\
	/* Get our cpu_t in %r13 */		\
	mov	%rsp, %r13;			\
	and	$(~(MMU_PAGESIZE - 1)), %r13;	\
	subq	$CPU_KPTI_START, %r13;		\
	/* Use top of the kthread stk */	\
	mov	CPU_THREAD(%r13), %r14;		\
	mov	T_STACK(%r14), %r14;		\
	addq	$REGSIZE+MINFRAME, %r14;	\
	jmp	6f;				\
4:						\
	/* Check the %rsp in the frame. */	\
	/* Is it above kernel base? */		\
	/* If not, treat as user. */		\
	mov	kpti_kbase, %r14;		\
	cmp	%r14, KPTI_RSP(%rsp);		\
	jb	2b;				\
	/* Is it within the kpti_frame page? */	\
	/* If it is, treat as user interrupt */	\
	mov	%rsp, %r13;			\
	and	$(~(MMU_PAGESIZE - 1)), %r13;	\
	mov	KPTI_RSP(%rsp), %r14;		\
	and	$(~(MMU_PAGESIZE - 1)), %r14;	\
	cmp	%r13, %r14;			\
	je	2b;				\
	/* Were we in trampoline code? */	\
	leaq	kpti_tramp_start, %r14;		\
	cmp	%r14, KPTI_RIP(%rsp);		\
	jb	5f;				\
	leaq	kpti_tramp_end, %r14;		\
	cmp	%r14, KPTI_RIP(%rsp);		\
	ja	5f;				\
	/* If we were, change %cr3: we might */	\
	/* have interrupted before it did. */	\
	/* As above, never load a zero %cr3. */	\
	mov	KPTI_KCR3(%rsp), %r14;		\
	cmp	$0, %r14;			\
	je	5f;				\
	mov	%r14, %cr3;			\
5:						\
	/* Use the %rsp from the trap frame */	\
	mov	KPTI_RSP(%rsp), %r14;		\
	and	$(~0xf), %r14;			\
6:						\
	mov	%rsp, %r13;			\
	/* %r14 contains our destination stk */	\
	mov	%r14, %rsp;			\
	pushq	KPTI_SS(%r13);			\
	pushq	KPTI_RSP(%r13);			\
	pushq	KPTI_RFLAGS(%r13);		\
	pushq	KPTI_CS(%r13);			\
	pushq	KPTI_RIP(%r13);			\
	errpush;				\
	mov	KPTI_R14(%r13), %r14;		\
	/* Frame is free for reuse again */	\
	movq	$0, KPTI_FLAG(%r13);		\
	mov	KPTI_R13(%r13), %r13

/* Variant that does not re-push an error code on the destination stack. */
#define	DBG_INTERRUPT_TRAMPOLINE_NOERR		\
	DBG_INTERRUPT_TRAMPOLINE_P(/**/)

/* Variant that re-pushes the error code saved in the kpti_frame. */
#define	DBG_INTERRUPT_TRAMPOLINE		\
	DBG_INTERRUPT_TRAMPOLINE_P(pushq KPTI_ERR(%r13))
337
338	/*
339	 * These labels (_start and _end) are used by trap.c to determine if
340	 * we took an interrupt like an NMI during the return process.
341	 */
342.global	tr_sysc_ret_start
343tr_sysc_ret_start:
344
345	/*
346	 * Syscall return trampolines.
347	 *
348	 * These are expected to be called on the kernel %gs. tr_sysret[ql] are
349	 * called after %rsp is changed back to the user value, so we have no
350	 * stack to work with. tr_sysexit has a kernel stack (but has to
351	 * preserve rflags, soooo).
352	 */
353	ENTRY_NP(tr_sysretq)
354	cmpq	$1, kpti_enable
355	jne	1f
356
357	mov	%r13, %gs:CPU_KPTI_R13
358	SET_USER_CR3(%r13)
359	mov	%gs:CPU_KPTI_R13, %r13
360	/* Zero these to make sure they didn't leak from a kernel trap */
361	movq	$0, %gs:CPU_KPTI_R13
362	movq	$0, %gs:CPU_KPTI_R14
3631:
364	swapgs
365	sysretq
366	SET_SIZE(tr_sysretq)
367
368	ENTRY_NP(tr_sysretl)
369	cmpq	$1, kpti_enable
370	jne	1f
371
372	mov	%r13, %gs:CPU_KPTI_R13
373	SET_USER_CR3(%r13)
374	mov	%gs:CPU_KPTI_R13, %r13
375	/* Zero these to make sure they didn't leak from a kernel trap */
376	movq	$0, %gs:CPU_KPTI_R13
377	movq	$0, %gs:CPU_KPTI_R14
3781:
379	SWAPGS
380	SYSRETL
381	SET_SIZE(tr_sysretl)
382
383	ENTRY_NP(tr_sysexit)
384	/*
385	 * Note: we want to preserve RFLAGS across this branch, since sysexit
386	 * (unlike sysret above) does not restore RFLAGS for us.
387	 *
388	 * We still have the real kernel stack (sysexit does restore that), so
389	 * we can use pushfq/popfq.
390	 */
391	pushfq
392
393	cmpq	$1, kpti_enable
394	jne	1f
395
396	/* Have to pop it back off now before we change %cr3! */
397	popfq
398	mov	%r13, %gs:CPU_KPTI_R13
399	SET_USER_CR3(%r13)
400	mov	%gs:CPU_KPTI_R13, %r13
401	/* Zero these to make sure they didn't leak from a kernel trap */
402	movq	$0, %gs:CPU_KPTI_R13
403	movq	$0, %gs:CPU_KPTI_R14
404	jmp	2f
4051:
406	popfq
4072:
408	swapgs
409	sti
410	SYSEXITL
411	SET_SIZE(tr_sysexit)
412
/* End marker for the syscall return trampolines (see tr_sysc_ret_start). */
.globl	tr_sysc_ret_end
tr_sysc_ret_end:
415
416	/*
417	 * Syscall entry trampolines.
418	 */
419
420#if DEBUG
421#define	MK_SYSCALL_TRAMPOLINE(isr)		\
422	ENTRY_NP(tr_##isr);			\
423	swapgs;					\
424	mov	%r13, %gs:CPU_KPTI_R13;		\
425	mov	%cr3, %r13;			\
426	mov	%r13, %gs:CPU_KPTI_TR_CR3;	\
427	mov	%gs:CPU_KPTI_KCR3, %r13;	\
428	mov	%r13, %cr3;			\
429	mov	%gs:CPU_KPTI_R13, %r13;		\
430	swapgs;					\
431	jmp	isr;				\
432	SET_SIZE(tr_##isr)
433#else
434#define	MK_SYSCALL_TRAMPOLINE(isr)		\
435	ENTRY_NP(tr_##isr);			\
436	swapgs;					\
437	mov	%r13, %gs:CPU_KPTI_R13;		\
438	mov	%gs:CPU_KPTI_KCR3, %r13;	\
439	mov	%r13, %cr3;			\
440	mov	%gs:CPU_KPTI_R13, %r13;		\
441	swapgs;					\
442	jmp	isr;				\
443	SET_SIZE(tr_##isr)
444#endif
445
446	MK_SYSCALL_TRAMPOLINE(sys_syscall)
447	MK_SYSCALL_TRAMPOLINE(sys_syscall32)
448	MK_SYSCALL_TRAMPOLINE(brand_sys_syscall)
449	MK_SYSCALL_TRAMPOLINE(brand_sys_syscall32)
450
451	/*
452	 * SYSENTER is special. The CPU is really not very helpful when it
453	 * comes to preserving and restoring state with it, and as a result
454	 * we have to do all of it by hand. So, since we want to preserve
455	 * RFLAGS, we have to be very careful in these trampolines to not
456	 * clobber any bits in it. That means no cmpqs or branches!
457	 */
458	ENTRY_NP(tr_sys_sysenter)
459	swapgs
460	mov	%r13, %gs:CPU_KPTI_R13
461#if DEBUG
462	mov	%cr3, %r13
463	mov	%r13, %gs:CPU_KPTI_TR_CR3
464#endif
465	mov	%gs:CPU_KPTI_KCR3, %r13
466	mov	%r13, %cr3
467	mov	%gs:CPU_KPTI_R13, %r13
468	jmp	_sys_sysenter_post_swapgs
469	SET_SIZE(tr_sys_sysenter)
470
471	ENTRY_NP(tr_brand_sys_sysenter)
472	swapgs
473	mov	%r13, %gs:CPU_KPTI_R13
474#if DEBUG
475	mov	%cr3, %r13
476	mov	%r13, %gs:CPU_KPTI_TR_CR3
477#endif
478	mov	%gs:CPU_KPTI_KCR3, %r13
479	mov	%r13, %cr3
480	mov	%gs:CPU_KPTI_R13, %r13
481	jmp	_brand_sys_sysenter_post_swapgs
482	SET_SIZE(tr_brand_sys_sysenter)
483
/*
 * MK_SYSCALL_INT_TRAMPOLINE(isr) emits tr_<isr>.
 *
 * After swapping to the kernel %gs and switching to the kernel %cr3
 * (SET_KERNEL_CR3), it points %rsp at the top of the current thread's stack
 * (T_STACK + REGSIZE + MINFRAME) and rebuilds the interrupt frame there from
 * the copy held in %gs:CPU_KPTI_{SS,RSP,RFLAGS,CS,RIP}.  %r13 is scratch
 * throughout and is restored from %gs:CPU_KPTI_R13 before %gs is swapped
 * back and control passes to isr.
 */
#define	MK_SYSCALL_INT_TRAMPOLINE(isr)		\
	ENTRY_NP(tr_##isr);			\
	swapgs;					\
	movq	%r13, %gs:CPU_KPTI_R13;		\
	SET_KERNEL_CR3(%r13);			\
	/* %rsp = top of the kthread stack */	\
	movq	%gs:CPU_THREAD, %r13;		\
	movq	T_STACK(%r13), %r13;		\
	addq	$REGSIZE+MINFRAME, %r13;	\
	movq	%r13, %rsp;			\
	/* Rebuild the interrupt frame */	\
	pushq	%gs:CPU_KPTI_SS;		\
	pushq	%gs:CPU_KPTI_RSP;		\
	pushq	%gs:CPU_KPTI_RFLAGS;		\
	pushq	%gs:CPU_KPTI_CS;		\
	pushq	%gs:CPU_KPTI_RIP;		\
	movq	%gs:CPU_KPTI_R13, %r13;		\
	swapgs;					\
	jmp	isr;				\
	SET_SIZE(tr_##isr)

	MK_SYSCALL_INT_TRAMPOLINE(brand_sys_syscall_int)
	MK_SYSCALL_INT_TRAMPOLINE(sys_syscall_int)
505
506	/*
507	 * Interrupt/trap return trampolines
508	 */
509
510.global	tr_intr_ret_start
511tr_intr_ret_start:
512
513	ENTRY_NP(tr_iret_auto)
514	cmpq	$1, kpti_enable
515	jne	tr_iret_kernel
516	cmpw	$KCS_SEL, T_FRAMERET_CS(%rsp)
517	je	tr_iret_kernel
518	jmp	tr_iret_user
519	SET_SIZE(tr_iret_auto)
520
521	ENTRY_NP(tr_iret_kernel)
522	/*
523	 * Yes, this does nothing extra. But this way we know if we see iret
524	 * elsewhere, then we've failed to properly consider trampolines there.
525	 */
526	iretq
527	SET_SIZE(tr_iret_kernel)
528
529	ENTRY_NP(tr_iret_user)
530#if DEBUG
531	/*
532	 * Panic if we find CR0.TS set. We're still on the kernel stack and
533	 * %cr3, but we do need to swap back to the kernel gs. (We don't worry
534	 * about swapgs speculation here.)
535	 */
536	pushq	%rax
537	mov	%cr0, %rax
538	testq	$CR0_TS, %rax
539	jz	1f
540	swapgs
541	popq	%rax
542	leaq	_bad_ts_panic_msg(%rip), %rdi
543	xorl	%eax, %eax
544	pushq	%rbp
545	movq	%rsp, %rbp
546	call	panic
5471:
548	popq	%rax
549#endif
550
551	cmpq	$1, kpti_enable
552	jne	1f
553
554	/*
555	 * KPTI enabled: we're on the user gsbase at this point, so we
556	 * need to swap back so we can pivot stacks.
557	 *
558	 * The swapgs lfence mitigation is probably not needed here
559	 * since a mis-speculation of the above branch would imply KPTI
560	 * is disabled, but we'll do so anyway.
561	 */
562	swapgs
563	lfence
564	mov	%r13, %gs:CPU_KPTI_R13
565	PIVOT_KPTI_STK(%r13)
566	SET_USER_CR3(%r13)
567	mov	%gs:CPU_KPTI_R13, %r13
568	/* Zero these to make sure they didn't leak from a kernel trap. */
569	movq	$0, %gs:CPU_KPTI_R13
570	movq	$0, %gs:CPU_KPTI_R14
571	/* And back to user gsbase again. */
572	swapgs
5731:
574	iretq
575	SET_SIZE(tr_iret_user)
576
577	/*
578	 * This special return trampoline is for KDI's use only (with kmdb).
579	 *
580	 * KDI/kmdb do not use swapgs -- they directly write the GSBASE MSR
581	 * instead. This trampoline runs after GSBASE has already been changed
582	 * back to the userland value (so we can't use %gs).
583	 *
584	 * Instead, the caller gives us a pointer to the kpti_dbg frame in %r13.
585	 * The KPTI_R13 member in the kpti_dbg has already been set to what the
586	 * real %r13 should be before we IRET.
587	 *
588	 * Additionally, KDI keeps a copy of the incoming %cr3 value when it
589	 * took an interrupt, and has put that back in the kpti_dbg area for us
590	 * to use, so we don't do any sniffing of %cs here. This is important
591	 * so that debugging code that changes %cr3 is possible.
592	 */
593	ENTRY_NP(tr_iret_kdi)
594	movq	%r14, KPTI_R14(%r13)	/* %r14 has to be preserved by us */
595
596	movq	%rsp, %r14	/* original %rsp is pointing at IRET frame */
597	leaq	KPTI_TOP(%r13), %rsp
598	pushq	T_FRAMERET_SS(%r14)
599	pushq	T_FRAMERET_RSP(%r14)
600	pushq	T_FRAMERET_RFLAGS(%r14)
601	pushq	T_FRAMERET_CS(%r14)
602	pushq	T_FRAMERET_RIP(%r14)
603
604	movq	KPTI_TR_CR3(%r13), %r14
605	movq	%r14, %cr3
606
607	movq	KPTI_R14(%r13), %r14
608	movq	KPTI_R13(%r13), %r13	/* preserved by our caller */
609
610	iretq
611	SET_SIZE(tr_iret_kdi)
612
/* End marker for the interrupt/trap return trampolines. */
.globl	tr_intr_ret_end
tr_intr_ret_end:
615
616	/*
617	 * Interrupt/trap entry trampolines
618	 */
619
620	/* CPU pushed an error code, and ISR wants one */
621#define	MK_INTR_TRAMPOLINE(isr)			\
622	ENTRY_NP(tr_##isr);			\
623	INTERRUPT_TRAMPOLINE;			\
624	jmp	isr;				\
625	SET_SIZE(tr_##isr)
626
627	/* CPU didn't push an error code, and ISR doesn't want one */
628#define	MK_INTR_TRAMPOLINE_NOERR(isr)		\
629	ENTRY_NP(tr_##isr);			\
630	push	$0;				\
631	INTERRUPT_TRAMPOLINE_NOERR;		\
632	jmp	isr;				\
633	SET_SIZE(tr_##isr)
634
635	/* CPU pushed an error code, and ISR wants one */
636#define	MK_DBG_INTR_TRAMPOLINE(isr)	\
637	ENTRY_NP(tr_##isr);			\
638	DBG_INTERRUPT_TRAMPOLINE;		\
639	jmp	isr;				\
640	SET_SIZE(tr_##isr)
641
642	/* CPU didn't push an error code, and ISR doesn't want one */
643#define	MK_DBG_INTR_TRAMPOLINE_NOERR(isr)	\
644	ENTRY_NP(tr_##isr);			\
645	push	$0;				\
646	DBG_INTERRUPT_TRAMPOLINE_NOERR;		\
647	jmp	isr;				\
648	SET_SIZE(tr_##isr)
649
650
651	MK_INTR_TRAMPOLINE_NOERR(div0trap)
652	MK_DBG_INTR_TRAMPOLINE_NOERR(dbgtrap)
653	MK_DBG_INTR_TRAMPOLINE_NOERR(brktrap)
654	MK_INTR_TRAMPOLINE_NOERR(ovflotrap)
655	MK_INTR_TRAMPOLINE_NOERR(boundstrap)
656	MK_INTR_TRAMPOLINE_NOERR(invoptrap)
657	MK_INTR_TRAMPOLINE_NOERR(ndptrap)
658	MK_INTR_TRAMPOLINE(invtsstrap)
659	MK_DBG_INTR_TRAMPOLINE(segnptrap)
660	MK_DBG_INTR_TRAMPOLINE(stktrap)
661	MK_DBG_INTR_TRAMPOLINE(gptrap)
662	MK_DBG_INTR_TRAMPOLINE(pftrap)
663	MK_INTR_TRAMPOLINE_NOERR(resvtrap)
664	MK_INTR_TRAMPOLINE_NOERR(ndperr)
665	MK_INTR_TRAMPOLINE(achktrap)
666	MK_INTR_TRAMPOLINE_NOERR(xmtrap)
667	MK_INTR_TRAMPOLINE_NOERR(invaltrap)
668	MK_INTR_TRAMPOLINE_NOERR(fasttrap)
669	MK_INTR_TRAMPOLINE_NOERR(dtrace_ret)
670
671	/*
672	 * These are special because they can interrupt other traps, and
673	 * each other. We don't need to pivot their stacks, because they have
674	 * dedicated IST stack space, but we need to change %cr3.
675	 */
676	ENTRY_NP(tr_nmiint)
677	pushq	%r13
678	mov	kpti_safe_cr3, %r13
679	mov	%r13, %cr3
680	popq	%r13
681	jmp	nmiint
682	SET_SIZE(tr_nmiint)
683
#if !defined(__xpv)
	ENTRY_NP(tr_syserrtrap)
	/*
	 * If we got here we should always have a zero error code pushed.
	 * The INT $0x8 instr doesn't seem to push one, though, which we use
	 * as an emergency panic in the other trampolines. So adjust things
	 * here: if the word at the top of the stack is not zero, push a zero
	 * to stand in for the error code.
	 */
	cmpq	$0, (%rsp)
	je	9f
	pushq	$0
9:
	/* Load kpti_safe_cr3 into %cr3, with %r13 as scratch. */
	pushq	%r13
	movq	kpti_safe_cr3, %r13
	movq	%r13, %cr3
	popq	%r13
	jmp	syserrtrap
	SET_SIZE(tr_syserrtrap)
#endif
703
704	ENTRY_NP(tr_mcetrap)
705	pushq	%r13
706	mov	kpti_safe_cr3, %r13
707	mov	%r13, %cr3
708	popq	%r13
709	jmp	mcetrap
710	SET_SIZE(tr_mcetrap)
711
712	/*
713	 * Interrupts start at 32
714	 */
715#define MKIVCT(n)			\
716	ENTRY_NP(tr_ivct##n)		\
717	push	$0;			\
718	INTERRUPT_TRAMPOLINE;		\
719	push	$n - 0x20;		\
720	jmp	cmnint;			\
721	SET_SIZE(tr_ivct##n)
722
723	MKIVCT(32);	MKIVCT(33);	MKIVCT(34);	MKIVCT(35);
724	MKIVCT(36);	MKIVCT(37);	MKIVCT(38);	MKIVCT(39);
725	MKIVCT(40);	MKIVCT(41);	MKIVCT(42);	MKIVCT(43);
726	MKIVCT(44);	MKIVCT(45);	MKIVCT(46);	MKIVCT(47);
727	MKIVCT(48);	MKIVCT(49);	MKIVCT(50);	MKIVCT(51);
728	MKIVCT(52);	MKIVCT(53);	MKIVCT(54);	MKIVCT(55);
729	MKIVCT(56);	MKIVCT(57);	MKIVCT(58);	MKIVCT(59);
730	MKIVCT(60);	MKIVCT(61);	MKIVCT(62);	MKIVCT(63);
731	MKIVCT(64);	MKIVCT(65);	MKIVCT(66);	MKIVCT(67);
732	MKIVCT(68);	MKIVCT(69);	MKIVCT(70);	MKIVCT(71);
733	MKIVCT(72);	MKIVCT(73);	MKIVCT(74);	MKIVCT(75);
734	MKIVCT(76);	MKIVCT(77);	MKIVCT(78);	MKIVCT(79);
735	MKIVCT(80);	MKIVCT(81);	MKIVCT(82);	MKIVCT(83);
736	MKIVCT(84);	MKIVCT(85);	MKIVCT(86);	MKIVCT(87);
737	MKIVCT(88);	MKIVCT(89);	MKIVCT(90);	MKIVCT(91);
738	MKIVCT(92);	MKIVCT(93);	MKIVCT(94);	MKIVCT(95);
739	MKIVCT(96);	MKIVCT(97);	MKIVCT(98);	MKIVCT(99);
740	MKIVCT(100);	MKIVCT(101);	MKIVCT(102);	MKIVCT(103);
741	MKIVCT(104);	MKIVCT(105);	MKIVCT(106);	MKIVCT(107);
742	MKIVCT(108);	MKIVCT(109);	MKIVCT(110);	MKIVCT(111);
743	MKIVCT(112);	MKIVCT(113);	MKIVCT(114);	MKIVCT(115);
744	MKIVCT(116);	MKIVCT(117);	MKIVCT(118);	MKIVCT(119);
745	MKIVCT(120);	MKIVCT(121);	MKIVCT(122);	MKIVCT(123);
746	MKIVCT(124);	MKIVCT(125);	MKIVCT(126);	MKIVCT(127);
747	MKIVCT(128);	MKIVCT(129);	MKIVCT(130);	MKIVCT(131);
748	MKIVCT(132);	MKIVCT(133);	MKIVCT(134);	MKIVCT(135);
749	MKIVCT(136);	MKIVCT(137);	MKIVCT(138);	MKIVCT(139);
750	MKIVCT(140);	MKIVCT(141);	MKIVCT(142);	MKIVCT(143);
751	MKIVCT(144);	MKIVCT(145);	MKIVCT(146);	MKIVCT(147);
752	MKIVCT(148);	MKIVCT(149);	MKIVCT(150);	MKIVCT(151);
753	MKIVCT(152);	MKIVCT(153);	MKIVCT(154);	MKIVCT(155);
754	MKIVCT(156);	MKIVCT(157);	MKIVCT(158);	MKIVCT(159);
755	MKIVCT(160);	MKIVCT(161);	MKIVCT(162);	MKIVCT(163);
756	MKIVCT(164);	MKIVCT(165);	MKIVCT(166);	MKIVCT(167);
757	MKIVCT(168);	MKIVCT(169);	MKIVCT(170);	MKIVCT(171);
758	MKIVCT(172);	MKIVCT(173);	MKIVCT(174);	MKIVCT(175);
759	MKIVCT(176);	MKIVCT(177);	MKIVCT(178);	MKIVCT(179);
760	MKIVCT(180);	MKIVCT(181);	MKIVCT(182);	MKIVCT(183);
761	MKIVCT(184);	MKIVCT(185);	MKIVCT(186);	MKIVCT(187);
762	MKIVCT(188);	MKIVCT(189);	MKIVCT(190);	MKIVCT(191);
763	MKIVCT(192);	MKIVCT(193);	MKIVCT(194);	MKIVCT(195);
764	MKIVCT(196);	MKIVCT(197);	MKIVCT(198);	MKIVCT(199);
765	MKIVCT(200);	MKIVCT(201);	MKIVCT(202);	MKIVCT(203);
766	MKIVCT(204);	MKIVCT(205);	MKIVCT(206);	MKIVCT(207);
767	MKIVCT(208);	MKIVCT(209);	MKIVCT(210);	MKIVCT(211);
768	MKIVCT(212);	MKIVCT(213);	MKIVCT(214);	MKIVCT(215);
769	MKIVCT(216);	MKIVCT(217);	MKIVCT(218);	MKIVCT(219);
770	MKIVCT(220);	MKIVCT(221);	MKIVCT(222);	MKIVCT(223);
771	MKIVCT(224);	MKIVCT(225);	MKIVCT(226);	MKIVCT(227);
772	MKIVCT(228);	MKIVCT(229);	MKIVCT(230);	MKIVCT(231);
773	MKIVCT(232);	MKIVCT(233);	MKIVCT(234);	MKIVCT(235);
774	MKIVCT(236);	MKIVCT(237);	MKIVCT(238);	MKIVCT(239);
775	MKIVCT(240);	MKIVCT(241);	MKIVCT(242);	MKIVCT(243);
776	MKIVCT(244);	MKIVCT(245);	MKIVCT(246);	MKIVCT(247);
777	MKIVCT(248);	MKIVCT(249);	MKIVCT(250);	MKIVCT(251);
778	MKIVCT(252);	MKIVCT(253);	MKIVCT(254);	MKIVCT(255);
779
780	/*
781	 * We're PCIDE, but we don't have INVPCID.  The only way to invalidate a
782	 * PCID other than the current one, then, is to load its cr3 then
783	 * invlpg.  But loading kf_user_cr3 means we can longer access our
784	 * caller's text mapping (or indeed, its stack).  So this little helper
785	 * has to live within our trampoline text region.
786	 *
787	 * Called as tr_mmu_flush_user_range(addr, len, pgsz, cr3)
788	 */
789	ENTRY_NP(tr_mmu_flush_user_range)
790	push	%rbx
791	/* When we read cr3, it never has the NOINVL bit set. */
792	mov	%cr3, %rax
793	movq	$CR3_NOINVL_BIT, %rbx
794	orq	%rbx, %rax
795
796	mov	%rcx, %cr3
797	add	%rdi, %rsi
798.align	ASM_ENTRY_ALIGN
7991:
800	invlpg	(%rdi)
801	add	%rdx, %rdi
802	cmp	%rsi, %rdi
803	jb	1b
804	mov	%rax, %cr3
805	pop	%rbx
806	retq
807	SET_SIZE(tr_mmu_flush_user_range)
808
/* End of the trampoline text; padded out to a page boundary. */
.align	MMU_PAGESIZE
.globl	kpti_tramp_end
kpti_tramp_end:
	nop
812	nop
813