/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2020 Joyent, Inc.
 * Copyright 2024 MNX Cloud, Inc.
 */

/*
 * Process switching routines.
 */

#include <sys/asm_linkage.h>
#include <sys/asm_misc.h>
#include <sys/regset.h>
#include <sys/privregs.h>
#include <sys/stack.h>
#include <sys/segments.h>
#include <sys/psw.h>

#include "assym.h"

/*
 * resume(thread_id_t t);
 *
 * A thread can only run on one processor at a time. There
 * exists a window on MPs where the current thread on one
 * processor is capable of being dispatched by another processor.
 * Some overlap between outgoing and incoming threads can happen
 * when they are the same thread. In this case where the threads
 * are the same, resume() on one processor will spin on the incoming
 * thread until resume() on the other processor has finished with
 * the outgoing thread.
 *
 * The MMU context changes when the resuming thread resides in a different
 * process.  Kernel threads are known by resume to reside in process 0.
 * The MMU context, therefore, only changes when resuming a thread in
 * a process different from curproc.
 *
 * resume_from_intr() is called when the thread being resumed was not
 * passivated by resume (e.g. was interrupted).  This means that the
 * resume lock is already held and that a restore context is not needed.
 * Also, the MMU context is not changed on the resume in this case.
 *
 * resume_from_zombie() is the same as resume except the calling thread
 * is a zombie and must be put on the deathrow list after the CPU is
 * off the stack.
 */

#if LWP_PCB_FPU != 0
#error LWP_PCB_FPU MUST be defined as 0 for code in swtch.S to work
#endif	/* LWP_PCB_FPU != 0 */

/*
 * Save non-volatile regs other than %rsp (%rbx, %rbp, and %r12 - %r15)
 *
 * The stack frame must be created before the save of %rsp so that tracebacks
 * of swtch()ed-out processes show the process as having last called swtch().
 */
#define SAVE_REGS(thread_t, retaddr)			\
	movq	%rbp, T_RBP(thread_t);			\
	movq	%rbx, T_RBX(thread_t);			\
	movq	%r12, T_R12(thread_t);			\
	movq	%r13, T_R13(thread_t);			\
	movq	%r14, T_R14(thread_t);			\
	movq	%r15, T_R15(thread_t);			\
	pushq	%rbp;					\
	movq	%rsp, %rbp;				\
	movq	%rsp, T_SP(thread_t);			\
	movq	retaddr, T_PC(thread_t);		\
	movq	%rdi, %r12;				\
	call	__dtrace_probe___sched_off__cpu

/*
 * Restore non-volatile regs other than %rsp (%rbx, %rbp, and %r12 - %r15)
 *
 * We load up %rsp from the label_t as part of the context switch, so
 * we don't repeat that here.
 *
 * We don't do a 'leave,' because reloading %rsp/%rbp from the label_t
 * already has the effect of putting the stack back the way it was when
 * we came in.
 */
#define RESTORE_REGS(scratch_reg)			\
	movq	%gs:CPU_THREAD, scratch_reg;		\
	movq	T_RBP(scratch_reg), %rbp;		\
	movq	T_RBX(scratch_reg), %rbx;		\
	movq	T_R12(scratch_reg), %r12;		\
	movq	T_R13(scratch_reg), %r13;		\
	movq	T_R14(scratch_reg), %r14;		\
	movq	T_R15(scratch_reg), %r15

/*
 * Get pointer to a thread's hat structure
 */
#define GET_THREAD_HATP(hatp, thread_t, scratch_reg)	\
	movq	T_PROCP(thread_t), hatp;		\
	movq	P_AS(hatp), scratch_reg;		\
	movq	A_HAT(scratch_reg), hatp
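
/*
 * For illustration only (a sketch, not how the kernel expresses it): in C
 * terms the macro above amounts to roughly
 *
 *	hatp = thread_t->t_procp->p_as->a_hat;
 *
 * written with the assym-generated offsets for t_procp, p_as and a_hat.
 */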

/*
 * Read the current TSC via tsc_read() and stash the value in %r14.
 */
#define	TSC_READ()					\
	call	tsc_read;				\
	movq	%rax, %r14;

/*
 * If we are resuming an interrupt thread, store a timestamp in the thread
 * structure.  If an interrupt occurs between tsc_read() and its subsequent
 * store, the timestamp will be stale by the time it is stored.  We can detect
 * this by doing a compare-and-swap on the thread's timestamp, since any
 * interrupt occurring in this window will put a new timestamp in the thread's
 * t_intr_start field.
 */
#define	STORE_INTR_START(thread_t)			\
	testw	$T_INTR_THREAD, T_FLAGS(thread_t);	\
	jz	1f;					\
0:							\
	TSC_READ();					\
	movq	T_INTR_START(thread_t), %rax;		\
	cmpxchgq %r14, T_INTR_START(thread_t);		\
	jnz	0b;					\
1:
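
/*
 * For illustration only, the macro above corresponds roughly to the
 * following C sketch (hypothetical; the real code works on raw assym
 * offsets and keeps the fresh timestamp in %r14 across retries):
 *
 *	if (t->t_flags & T_INTR_THREAD) {
 *		uint64_t old, new;
 *		do {
 *			new = tsc_read();
 *			old = t->t_intr_start;
 *		} while (atomic_cas_64((uint64_t *)&t->t_intr_start,
 *		    old, new) != old);
 *	}
 */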

	.global	kpti_enable

	ENTRY(resume)
	movq	%gs:CPU_THREAD, %rax
	leaq	resume_return(%rip), %r11

	/*
	 * Deal with SMAP here. A thread may be switched out at any point while
	 * it is executing. The thread could be under on_fault() or it could be
	 * pre-empted in the middle of a copy operation. If this happens and
	 * we're not in the context of an interrupt which happens to handle
	 * saving and restoring rflags correctly, we may lose our SMAP related
	 * state.
	 *
	 * To handle this, as part of being switched out, we first save whether
	 * or not userland access is allowed ($PS_ACHK in rflags), store that
	 * in t_useracc on the kthread_t, and unconditionally enable SMAP to
	 * protect the system.
	 *
	 * Later, when the thread finishes resuming, we potentially disable SMAP
	 * if PS_ACHK was present in rflags. See uts/intel/ml/copy.S for
	 * more information on rflags and SMAP.
	 */
	pushfq
	popq	%rsi
	andq	$PS_ACHK, %rsi
	movq	%rsi, T_USERACC(%rax)
	call	smap_enable

	/*
	 * Take a moment to potentially clear the return stack buffer (RSB).
	 * This is done to prevent various Spectre variant 2 and SpectreRSB
	 * attacks. This may not be sufficient. Please see
	 * uts/intel/ml/retpoline.S for more information about this.
	 */
	call	x86_rsb_stuff

	/*
	 * Take another moment to potentially clear the branch history buffer
	 * (BHB). This mitigates recently disclosed attacks in which branch
	 * history can be trained to exploit certain compiler-generated
	 * instruction sequences (known as "gadgets") and leak data
	 * speculatively.  As with x86_rsb_stuff, see retpoline.S, and note
	 * that this may not be sufficient.
	 */
	call	x86_bhb_clear

	/*
	 * Save non-volatile registers, and set return address for current
	 * thread to resume_return.
	 *
	 * %r12 = t (new thread) when done
	 */
	SAVE_REGS(%rax, %r11)


	LOADCPU(%r15)				/* %r15 = CPU */
	movq	CPU_THREAD(%r15), %r13		/* %r13 = curthread */

	/*
	 * Call savectx if thread has installed context ops.
	 *
	 * Note that if we have floating point context, the save op
	 * (either fpsave_begin or fpxsave_begin) will issue the
	 * async save instruction (fnsave or fxsave respectively)
	 * that we fwait for below.
	 */
	cmpq	$0, T_CTX(%r13)		/* should current thread savectx? */
	je	.nosavectx		/* skip call when zero */

	movq	%r13, %rdi		/* arg = thread pointer */
	call	savectx			/* call ctx ops */
.nosavectx:

	/*
	 * Check that the curthread is not using the FPU while in the kernel.
	 */
	call	kernel_fpu_no_swtch

	/*
	 * Call savepctx if process has installed context ops.
	 */
	movq	T_PROCP(%r13), %r14	/* %r14 = proc */
	cmpq	$0, P_PCTX(%r14)	/* should current thread savepctx? */
	je	.nosavepctx		/* skip call when zero */

	movq	%r14, %rdi		/* arg = proc pointer */
	call	savepctx		/* call ctx ops */
.nosavepctx:

	/*
	 * Temporarily switch to the idle thread's stack
	 */
	movq	CPU_IDLE_THREAD(%r15), %rax	/* idle thread pointer */

	/*
	 * Set the idle thread as the current thread
	 */
	movq	T_SP(%rax), %rsp	/* It is safe to set rsp */
	movq	%rax, CPU_THREAD(%r15)

	/*
	 * Switch in the hat context for the new thread.
	 */
	GET_THREAD_HATP(%rdi, %r12, %r11)
	call	hat_switch

	/*
	 * Clear and unlock previous thread's t_lock
	 * to allow it to be dispatched by another processor.
	 */
	movb	$0, T_LOCK(%r13)

	/*
	 * IMPORTANT: Registers at this point must be:
	 *       %r12 = new thread
	 *
	 * Here we are in the idle thread, having dropped the old thread.
	 */
	ALTENTRY(_resume_from_idle)
	/*
	 * Spin until the dispatched thread's mutex has
	 * been unlocked. This mutex is unlocked when
	 * it becomes safe for the thread to run.
	 */
.lock_thread_mutex:
	lock
	btsl	$0, T_LOCK(%r12)	/* attempt to lock new thread's mutex */
	jnc	.thread_mutex_locked	/* got it */

.spin_thread_mutex:
	pause
	cmpb	$0, T_LOCK(%r12)	/* check mutex status */
	jz	.lock_thread_mutex	/* clear, retry lock */
	jmp	.spin_thread_mutex	/* still locked, spin... */

.thread_mutex_locked:
	/*
	 * Fix CPU structure to indicate new running thread.
	 * Set pointer in new thread to the CPU structure.
	 */
	LOADCPU(%r13)			/* load current CPU pointer */
	cmpq	%r13, T_CPU(%r12)
	je	.setup_cpu

	/* cp->cpu_stats.sys.cpumigrate++ */
	incq	CPU_STATS_SYS_CPUMIGRATE(%r13)
	movq	%r13, T_CPU(%r12)	/* set new thread's CPU pointer */

.setup_cpu:
	/*
	 * Setup rsp0 (kernel stack) in TSS to curthread's saved regs
	 * structure.  If this thread doesn't have a regs structure above
	 * the stack -- that is, if lwp_stk_init() was never called for the
	 * thread -- this will set rsp0 to the wrong value, but it's harmless
	 * as it's a kernel thread, and it won't actually attempt to implicitly
	 * use the rsp0 via a privilege change.
	 *
	 * Note that when we have KPTI enabled on amd64, we never use this
	 * value at all (since all the interrupts have an IST set).
	 */
	movq	CPU_TSS(%r13), %r14
#if !defined(__xpv)
	cmpq	$1, kpti_enable
	jne	1f
	leaq	CPU_KPTI_TR_RSP(%r13), %rax
	jmp	2f
1:
	movq	T_STACK(%r12), %rax
	addq	$REGSIZE+MINFRAME, %rax	/* to the bottom of thread stack */
2:
	movq	%rax, TSS_RSP0(%r14)
#else
	movq	T_STACK(%r12), %rax
	addq	$REGSIZE+MINFRAME, %rax	/* to the bottom of thread stack */
	movl	$KDS_SEL, %edi
	movq	%rax, %rsi
	call	HYPERVISOR_stack_switch
#endif	/* __xpv */

	movq	%r12, CPU_THREAD(%r13)	/* set CPU's thread pointer */
	mfence				/* synchronize with mutex_exit() */
	xorl	%ebp, %ebp		/* make $<threadlist behave better */
	movq	T_LWP(%r12), %rax	/* set associated lwp to  */
	movq	%rax, CPU_LWP(%r13)	/* CPU's lwp ptr */

	movq	T_SP(%r12), %rsp	/* switch to resuming thread's stack */
	movq	T_PC(%r12), %r13	/* saved return addr */

	/*
	 * Call restorectx if context ops have been installed.
	 */
	cmpq	$0, T_CTX(%r12)		/* should resumed thread restorectx? */
	jz	.norestorectx		/* skip call when zero */
	movq	%r12, %rdi		/* arg = thread pointer */
	call	restorectx		/* call ctx ops */
.norestorectx:

	/*
	 * Call restorepctx if context ops have been installed for the proc.
	 */
	movq	T_PROCP(%r12), %rcx
	cmpq	$0, P_PCTX(%rcx)
	jz	.norestorepctx
	movq	%rcx, %rdi
	call	restorepctx
.norestorepctx:

	STORE_INTR_START(%r12)

	/*
	 * If we came into swtch with the ability to access userland pages, go
	 * ahead and restore that fact by disabling SMAP.  Clear the indicator
	 * flag out of paranoia.
	 */
	movq	T_USERACC(%r12), %rax	/* should we disable SMAP? */
	cmpq	$0, %rax		/* skip call when zero */
	jz	.nosmap
	xorq	%rax, %rax
	movq	%rax, T_USERACC(%r12)
	call	smap_disable
.nosmap:

	call	smt_mark

	/*
	 * Restore non-volatile registers, then have spl0 return to the
	 * resuming thread's PC after first setting the priority as low as
	 * possible and blocking all interrupt threads that may be active.
	 */
	movq	%r13, %rax	/* save return address */
	RESTORE_REGS(%r11)
	pushq	%rax		/* push return address for spl0() */
	call	__dtrace_probe___sched_on__cpu
	jmp	spl0

resume_return:
	/*
	 * Remove stack frame created in SAVE_REGS()
	 */
	addq	$CLONGSIZE, %rsp
	ret
	SET_SIZE(_resume_from_idle)
	SET_SIZE(resume)

	ENTRY(resume_from_zombie)
	movq	%gs:CPU_THREAD, %rax
	leaq	resume_from_zombie_return(%rip), %r11

	/*
	 * Save non-volatile registers, and set return address for current
	 * thread to resume_from_zombie_return.
	 *
	 * %r12 = t (new thread) when done
	 */
	SAVE_REGS(%rax, %r11)

	movq	%gs:CPU_THREAD, %r13	/* %r13 = curthread */

	/* Clean up the FP unit; it might have been left enabled. */

#if defined(__xpv)		/* XXPV XXtclayton */
	/*
	 * Remove this after bringup.
	 * (Too many #gp's for an instrumented hypervisor.)
	 */
	STTS(%rax)
#else
	movq	%cr0, %rax
	testq	$CR0_TS, %rax
	jnz	.zfpu_disabled		/* if TS already set, nothing to do */
	fninit				/* init fpu & discard pending error */
	orq	$CR0_TS, %rax
	movq	%rax, %cr0
.zfpu_disabled:

#endif	/* __xpv */

	/*
	 * Temporarily switch to the idle thread's stack so that the zombie
	 * thread's stack can be reclaimed by the reaper.
	 */
	movq	%gs:CPU_IDLE_THREAD, %rax /* idle thread pointer */
	movq	T_SP(%rax), %rsp	/* get onto idle thread stack */

	/*
	 * Sigh. If the idle thread has never run thread_start()
	 * then t_sp is mis-aligned by thread_load().
	 */
	andq	$_BITNOT(STACK_ALIGN-1), %rsp

	/*
	 * Set the idle thread as the current thread.
	 */
	movq	%rax, %gs:CPU_THREAD

	/* switch in the hat context for the new thread */
	GET_THREAD_HATP(%rdi, %r12, %r11)
	call	hat_switch

	/*
	 * Put the zombie on death-row.
	 */
	movq	%r13, %rdi
	call	reapq_add

	jmp	_resume_from_idle	/* finish job of resume */

resume_from_zombie_return:
	RESTORE_REGS(%r11)		/* restore non-volatile registers */
	call	__dtrace_probe___sched_on__cpu

	/*
	 * Remove stack frame created in SAVE_REGS()
	 */
	addq	$CLONGSIZE, %rsp
	ret
	SET_SIZE(resume_from_zombie)

	ENTRY(resume_from_intr)
	movq	%gs:CPU_THREAD, %rax
	leaq	resume_from_intr_return(%rip), %r11

	/*
	 * Save non-volatile registers, and set return address for current
	 * thread to resume_from_intr_return.
	 *
	 * %r12 = t (new thread) when done
	 */
	SAVE_REGS(%rax, %r11)

	movq	%gs:CPU_THREAD, %r13	/* %r13 = curthread */
	movq	%r12, %gs:CPU_THREAD	/* set CPU's thread pointer */
	mfence				/* synchronize with mutex_exit() */
	movq	T_SP(%r12), %rsp	/* restore resuming thread's sp */
	xorl	%ebp, %ebp		/* make $<threadlist behave better */

	/*
	 * Unlock the outgoing thread's mutex so that it may be dispatched
	 * by another processor.
	 */
	xorl	%eax, %eax
	xchgb	%al, T_LOCK(%r13)

	STORE_INTR_START(%r12)

	call	smt_mark

	/*
	 * Restore non-volatile registers, then have spl0 return to the
	 * resuming thread's PC after first setting the priority as low as
	 * possible and blocking all interrupt threads that may be active.
	 */
	movq	T_PC(%r12), %rax	/* saved return addr */
	RESTORE_REGS(%r11)
	pushq	%rax			/* push return address for spl0() */
	call	__dtrace_probe___sched_on__cpu
	jmp	spl0

resume_from_intr_return:
	/*
	 * Remove stack frame created in SAVE_REGS()
	 */
	addq	$CLONGSIZE, %rsp
	ret
	SET_SIZE(resume_from_intr)

	ENTRY(thread_start)
	popq	%rax		/* start() */
	popq	%rdi		/* arg */
	popq	%rsi		/* len */
	movq	%rsp, %rbp
	INDIRECT_CALL_REG(rax)
	call	thread_exit	/* destroy thread if it returns. */
	/*NOTREACHED*/
	SET_SIZE(thread_start)
521