xref: /linux/arch/x86/kernel/process_64.c (revision cc69ac7a65820dd96c48fd2988255f8acc2527f2)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  Copyright (C) 1995  Linus Torvalds
4  *
5  *  Pentium III FXSR, SSE support
6  *	Gareth Hughes <gareth@valinux.com>, May 2000
7  *
8  *  X86-64 port
9  *	Andi Kleen.
10  *
11  *	CPU hotplug support - ashok.raj@intel.com
12  */
13 
14 /*
15  * This file handles the architecture-dependent parts of process handling.
16  */
17 
18 #include <linux/cpu.h>
19 #include <linux/errno.h>
20 #include <linux/sched.h>
21 #include <linux/sched/task.h>
22 #include <linux/sched/task_stack.h>
23 #include <linux/fs.h>
24 #include <linux/kernel.h>
25 #include <linux/mm.h>
26 #include <linux/elfcore.h>
27 #include <linux/smp.h>
28 #include <linux/slab.h>
29 #include <linux/user.h>
30 #include <linux/interrupt.h>
31 #include <linux/delay.h>
32 #include <linux/export.h>
33 #include <linux/ptrace.h>
34 #include <linux/notifier.h>
35 #include <linux/kprobes.h>
36 #include <linux/kdebug.h>
37 #include <linux/prctl.h>
38 #include <linux/uaccess.h>
39 #include <linux/io.h>
40 #include <linux/ftrace.h>
41 #include <linux/syscalls.h>
42 #include <linux/iommu.h>
43 
44 #include <asm/processor.h>
45 #include <asm/pkru.h>
46 #include <asm/fpu/sched.h>
47 #include <asm/mmu_context.h>
48 #include <asm/prctl.h>
49 #include <asm/desc.h>
50 #include <asm/proto.h>
51 #include <asm/ia32.h>
52 #include <asm/debugreg.h>
53 #include <asm/switch_to.h>
54 #include <asm/xen/hypervisor.h>
55 #include <asm/vdso.h>
56 #include <asm/resctrl.h>
57 #include <asm/unistd.h>
58 #include <asm/fsgsbase.h>
59 #include <asm/fred.h>
60 #include <asm/msr.h>
61 #ifdef CONFIG_IA32_EMULATION
62 /* Not included via unistd.h */
63 #include <asm/unistd_32_ia32.h>
64 #endif
65 
66 #include "process.h"
67 
68 /* Also prints some state that isn't saved in the pt_regs */
69 void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
70 		 const char *log_lvl)
71 {
72 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
73 	unsigned long d0, d1, d2, d3, d6, d7;
74 	unsigned int fsindex, gsindex;
75 	unsigned int ds, es;
76 
77 	show_iret_regs(regs, log_lvl);
78 
79 	if (regs->orig_ax != -1)
80 		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
81 	else
82 		pr_cont("\n");
83 
84 	printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n",
85 	       log_lvl, regs->ax, regs->bx, regs->cx);
86 	printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n",
87 	       log_lvl, regs->dx, regs->si, regs->di);
88 	printk("%sRBP: %016lx R08: %016lx R09: %016lx\n",
89 	       log_lvl, regs->bp, regs->r8, regs->r9);
90 	printk("%sR10: %016lx R11: %016lx R12: %016lx\n",
91 	       log_lvl, regs->r10, regs->r11, regs->r12);
92 	printk("%sR13: %016lx R14: %016lx R15: %016lx\n",
93 	       log_lvl, regs->r13, regs->r14, regs->r15);
94 
95 	if (mode == SHOW_REGS_SHORT)
96 		return;
97 
98 	if (mode == SHOW_REGS_USER) {
99 		rdmsrq(MSR_FS_BASE, fs);
100 		rdmsrq(MSR_KERNEL_GS_BASE, shadowgs);
101 		printk("%sFS:  %016lx GS:  %016lx\n",
102 		       log_lvl, fs, shadowgs);
103 		return;
104 	}
105 
106 	asm("movl %%ds,%0" : "=r" (ds));
107 	asm("movl %%es,%0" : "=r" (es));
108 	asm("movl %%fs,%0" : "=r" (fsindex));
109 	asm("movl %%gs,%0" : "=r" (gsindex));
110 
111 	rdmsrq(MSR_FS_BASE, fs);
112 	rdmsrq(MSR_GS_BASE, gs);
113 	rdmsrq(MSR_KERNEL_GS_BASE, shadowgs);
114 
115 	cr0 = read_cr0();
116 	cr2 = read_cr2();
117 	cr3 = __read_cr3();
118 	cr4 = __read_cr4();
119 
120 	printk("%sFS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
121 	       log_lvl, fs, fsindex, gs, gsindex, shadowgs);
122 	printk("%sCS:  %04x DS: %04x ES: %04x CR0: %016lx\n",
123 		log_lvl, regs->cs, ds, es, cr0);
124 	printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
125 		log_lvl, cr2, cr3, cr4);
126 
127 	get_debugreg(d0, 0);
128 	get_debugreg(d1, 1);
129 	get_debugreg(d2, 2);
130 	get_debugreg(d3, 3);
131 	get_debugreg(d6, 6);
132 	get_debugreg(d7, 7);
133 
134 	/* Only print out debug registers if they are in their non-default state. */
135 	if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
136 	    (d6 == DR6_RESERVED) && (d7 == DR7_FIXED_1))) {
137 		printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n",
138 		       log_lvl, d0, d1, d2);
139 		printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n",
140 		       log_lvl, d3, d6, d7);
141 	}
142 
143 	if (cr4 & X86_CR4_PKE)
144 		printk("%sPKRU: %08x\n", log_lvl, read_pkru());
145 }
146 
147 void release_thread(struct task_struct *dead_task)
148 {
149 	WARN_ON(dead_task->mm);
150 }
151 
152 enum which_selector {
153 	FS,
154 	GS
155 };
156 
157 /*
158  * Out of line to be protected from kprobes and tracing. If this were
159  * traced or probed, any access to a per-CPU variable would happen with
160  * the wrong GS.
161  *
162  * It is not used on Xen paravirt. When paravirt support is needed, it
163  * needs to be renamed with a native_ prefix.
164  */
165 static noinstr unsigned long __rdgsbase_inactive(void)
166 {
167 	unsigned long gsbase;
168 
169 	lockdep_assert_irqs_disabled();
170 
171 	/*
172 	 * SWAPGS is no longer needed thus NOT allowed with FRED because
173 	 * FRED transitions ensure that an operating system can _always_
174 	 * operate with its own GS base address:
175 	 * - For events that occur in ring 3, FRED event delivery swaps
176 	 *   the GS base address with the IA32_KERNEL_GS_BASE MSR.
177 	 * - ERETU (the FRED transition that returns to ring 3) also swaps
178 	 *   the GS base address with the IA32_KERNEL_GS_BASE MSR.
179 	 *
180 	 * And the operating system can still setup the GS segment for a
181 	 * user thread without the need of loading a user thread GS with:
182 	 * - Using LKGS, available with FRED, to modify other attributes
183 	 *   of the GS segment without compromising its ability always to
184 	 *   operate with its own GS base address.
185 	 * - Accessing the GS segment base address for a user thread as
186 	 *   before using RDMSR or WRMSR on the IA32_KERNEL_GS_BASE MSR.
187 	 *
188 	 * Note, LKGS loads the GS base address into the IA32_KERNEL_GS_BASE
189 	 * MSR instead of the GS segment’s descriptor cache. As such, the
190 	 * operating system never changes its runtime GS base address.
191 	 */
192 	if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
193 	    !cpu_feature_enabled(X86_FEATURE_XENPV)) {
194 		native_swapgs();
195 		gsbase = rdgsbase();
196 		native_swapgs();
197 	} else {
198 		instrumentation_begin();
199 		rdmsrq(MSR_KERNEL_GS_BASE, gsbase);
200 		instrumentation_end();
201 	}
202 
203 	return gsbase;
204 }
205 
206 /*
207  * Out of line to be protected from kprobes and tracing. If this were
208  * traced or probed, any access to a per-CPU variable would happen with
209  * the wrong GS.
210  *
211  * It is not used on Xen paravirt. When paravirt support is needed, it
212  * needs to be renamed with a native_ prefix.
213  */
214 static noinstr void __wrgsbase_inactive(unsigned long gsbase)
215 {
216 	lockdep_assert_irqs_disabled();
217 
218 	if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
219 	    !cpu_feature_enabled(X86_FEATURE_XENPV)) {
220 		native_swapgs();
221 		wrgsbase(gsbase);
222 		native_swapgs();
223 	} else {
224 		instrumentation_begin();
225 		wrmsrq(MSR_KERNEL_GS_BASE, gsbase);
226 		instrumentation_end();
227 	}
228 }
229 
230 /*
231  * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
232  * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
233  * It's forcibly inlined because it'll generate better code and this function
234  * is hot.
235  */
236 static __always_inline void save_base_legacy(struct task_struct *prev_p,
237 					     unsigned short selector,
238 					     enum which_selector which)
239 {
240 	if (likely(selector == 0)) {
241 		/*
242 		 * On Intel (without X86_BUG_NULL_SEG), the segment base could
243 		 * be the pre-existing saved base or it could be zero.  On AMD
244 		 * (with X86_BUG_NULL_SEG), the segment base could be almost
245 		 * anything.
246 		 *
247 		 * This branch is very hot (it's hit twice on almost every
248 		 * context switch between 64-bit programs), and avoiding
249 		 * the RDMSR helps a lot, so we just assume that whatever
250 		 * value is already saved is correct.  This matches historical
251 		 * Linux behavior, so it won't break existing applications.
252 		 *
253 		 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
254 		 * report that the base is zero, it needs to actually be zero:
255 		 * see the corresponding logic in load_seg_legacy.
256 		 */
257 	} else {
258 		/*
259 		 * If the selector is 1, 2, or 3, then the base is zero on
260 		 * !X86_BUG_NULL_SEG CPUs and could be anything on
261 		 * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
262 		 * has never attempted to preserve the base across context
263 		 * switches.
264 		 *
265 		 * If selector > 3, then it refers to a real segment, and
266 		 * saving the base isn't necessary.
267 		 */
268 		if (which == FS)
269 			prev_p->thread.fsbase = 0;
270 		else
271 			prev_p->thread.gsbase = 0;
272 	}
273 }
274 
275 static __always_inline void save_fsgs(struct task_struct *task)
276 {
277 	savesegment(fs, task->thread.fsindex);
278 	savesegment(gs, task->thread.gsindex);
279 	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
280 		/*
281 		 * If FSGSBASE is enabled, we can't make any useful guesses
282 		 * about the base, and user code expects us to save the current
283 		 * value.  Fortunately, reading the base directly is efficient.
284 		 */
285 		task->thread.fsbase = rdfsbase();
286 		task->thread.gsbase = __rdgsbase_inactive();
287 	} else {
288 		save_base_legacy(task, task->thread.fsindex, FS);
289 		save_base_legacy(task, task->thread.gsindex, GS);
290 	}
291 }
292 
293 /*
294  * While a process is running, current->thread.fsbase and current->thread.gsbase
295  * may not match the corresponding CPU registers (see save_base_legacy()).
296  */
297 void current_save_fsgs(void)
298 {
299 	unsigned long flags;
300 
301 	/* Interrupts need to be off for FSGSBASE */
302 	local_irq_save(flags);
303 	save_fsgs(current);
304 	local_irq_restore(flags);
305 }
306 #if IS_ENABLED(CONFIG_KVM)
307 EXPORT_SYMBOL_GPL(current_save_fsgs);
308 #endif
309 
310 static __always_inline void loadseg(enum which_selector which,
311 				    unsigned short sel)
312 {
313 	if (which == FS)
314 		loadsegment(fs, sel);
315 	else
316 		load_gs_index(sel);
317 }
318 
319 static __always_inline void load_seg_legacy(unsigned short prev_index,
320 					    unsigned long prev_base,
321 					    unsigned short next_index,
322 					    unsigned long next_base,
323 					    enum which_selector which)
324 {
325 	if (likely(next_index <= 3)) {
326 		/*
327 		 * The next task is using 64-bit TLS, is not using this
328 		 * segment at all, or is having fun with arcane CPU features.
329 		 */
330 		if (next_base == 0) {
331 			/*
332 			 * Nasty case: on AMD CPUs, we need to forcibly zero
333 			 * the base.
334 			 */
335 			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
336 				loadseg(which, __USER_DS);
337 				loadseg(which, next_index);
338 			} else {
339 				/*
340 				 * We could try to exhaustively detect cases
341 				 * under which we can skip the segment load,
342 				 * but there's really only one case that matters
343 				 * for performance: if both the previous and
344 				 * next states are fully zeroed, we can skip
345 				 * the load.
346 				 *
347 				 * (This assumes that prev_base == 0 has no
348 				 * false positives.  This is the case on
349 				 * Intel-style CPUs.)
350 				 */
351 				if (likely(prev_index | next_index | prev_base))
352 					loadseg(which, next_index);
353 			}
354 		} else {
355 			if (prev_index != next_index)
356 				loadseg(which, next_index);
357 			wrmsrq(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
358 			       next_base);
359 		}
360 	} else {
361 		/*
362 		 * The next task is using a real segment.  Loading the selector
363 		 * is sufficient.
364 		 */
365 		loadseg(which, next_index);
366 	}
367 }
368 
369 /*
370  * Store prev's PKRU value and load next's PKRU value if they differ. PKRU
371  * is not XSTATE managed on context switch because that would require a
372  * lookup in the task's FPU xsave buffer and require to keep that updated
373  * in various places.
374  */
375 static __always_inline void x86_pkru_load(struct thread_struct *prev,
376 					  struct thread_struct *next)
377 {
378 	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
379 		return;
380 
381 	/* Stash the prev task's value: */
382 	prev->pkru = rdpkru();
383 
384 	/*
385 	 * PKRU writes are slightly expensive.  Avoid them when not
386 	 * strictly necessary:
387 	 */
388 	if (prev->pkru != next->pkru)
389 		wrpkru(next->pkru);
390 }
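
/*
 * Background for the PKRU switch above: PKRU holds two bits per protection
 * key (access-disable and write-disable), and each thread can carry its own
 * value for the same mappings, which is why it is swapped on context switch
 * rather than derived from the mm.  A minimal userspace sketch, assuming
 * glibc's pkey wrappers (pkey_alloc/pkey_mprotect/pkey_set) and a
 * page-aligned buffer "page" (hypothetical name):
 *
 *	// needs _GNU_SOURCE and <sys/mman.h>
 *	int pkey = pkey_alloc(0, 0);				// allocate a key, no initial restrictions
 *	pkey_mprotect(page, 4096, PROT_READ | PROT_WRITE, pkey);	// tag the mapping with the key
 *	pkey_set(pkey, PKEY_DISABLE_WRITE);			// flips WD in this thread's PKRU only
 */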
391 
392 static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
393 					      struct thread_struct *next)
394 {
395 	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
396 		/* Update the FS and GS selectors if they could have changed. */
397 		if (unlikely(prev->fsindex || next->fsindex))
398 			loadseg(FS, next->fsindex);
399 		if (unlikely(prev->gsindex || next->gsindex))
400 			loadseg(GS, next->gsindex);
401 
402 		/* Update the bases. */
403 		wrfsbase(next->fsbase);
404 		__wrgsbase_inactive(next->gsbase);
405 	} else {
406 		load_seg_legacy(prev->fsindex, prev->fsbase,
407 				next->fsindex, next->fsbase, FS);
408 		load_seg_legacy(prev->gsindex, prev->gsbase,
409 				next->gsindex, next->gsbase, GS);
410 	}
411 }
412 
413 unsigned long x86_fsgsbase_read_task(struct task_struct *task,
414 				     unsigned short selector)
415 {
416 	unsigned short idx = selector >> 3;
417 	unsigned long base;
418 
419 	if (likely((selector & SEGMENT_TI_MASK) == 0)) {
420 		if (unlikely(idx >= GDT_ENTRIES))
421 			return 0;
422 
423 		/*
424 		 * There are no user segments in the GDT with nonzero bases
425 		 * other than the TLS segments.
426 		 */
427 		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
428 			return 0;
429 
430 		idx -= GDT_ENTRY_TLS_MIN;
431 		base = get_desc_base(&task->thread.tls_array[idx]);
432 	} else {
433 #ifdef CONFIG_MODIFY_LDT_SYSCALL
434 		struct ldt_struct *ldt;
435 
436 		/*
437 		 * If performance here mattered, we could protect the LDT
438 		 * with RCU.  This is a slow path, though, so we can just
439 		 * take the mutex.
440 		 */
441 		mutex_lock(&task->mm->context.lock);
442 		ldt = task->mm->context.ldt;
443 		if (unlikely(!ldt || idx >= ldt->nr_entries))
444 			base = 0;
445 		else
446 			base = get_desc_base(ldt->entries + idx);
447 		mutex_unlock(&task->mm->context.lock);
448 #else
449 		base = 0;
450 #endif
451 	}
452 
453 	return base;
454 }
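
/*
 * For reference, the selector decoding used above follows the architectural
 * selector layout: bits 0-1 are the RPL, bit 2 is the table indicator
 * (SEGMENT_TI_MASK, 0 = GDT, 1 = LDT) and bits 3-15 are the descriptor
 * index, hence "idx = selector >> 3".  A worked example with selector 0x63,
 * a TLS selector commonly seen for 32-bit tasks on a 64-bit kernel:
 *
 *	unsigned short sel = 0x63;
 *	unsigned short rpl = sel & 0x3;			// 3: user privilege
 *	bool ldt           = sel & SEGMENT_TI_MASK;	// false: GDT
 *	unsigned short idx = sel >> 3;			// 12: in the TLS entry range
 */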
455 
456 unsigned long x86_gsbase_read_cpu_inactive(void)
457 {
458 	unsigned long gsbase;
459 
460 	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
461 		unsigned long flags;
462 
463 		local_irq_save(flags);
464 		gsbase = __rdgsbase_inactive();
465 		local_irq_restore(flags);
466 	} else {
467 		rdmsrq(MSR_KERNEL_GS_BASE, gsbase);
468 	}
469 
470 	return gsbase;
471 }
472 
473 void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
474 {
475 	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
476 		unsigned long flags;
477 
478 		local_irq_save(flags);
479 		__wrgsbase_inactive(gsbase);
480 		local_irq_restore(flags);
481 	} else {
482 		wrmsrq(MSR_KERNEL_GS_BASE, gsbase);
483 	}
484 }
485 
486 unsigned long x86_fsbase_read_task(struct task_struct *task)
487 {
488 	unsigned long fsbase;
489 
490 	if (task == current)
491 		fsbase = x86_fsbase_read_cpu();
492 	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
493 		 (task->thread.fsindex == 0))
494 		fsbase = task->thread.fsbase;
495 	else
496 		fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
497 
498 	return fsbase;
499 }
500 
501 unsigned long x86_gsbase_read_task(struct task_struct *task)
502 {
503 	unsigned long gsbase;
504 
505 	if (task == current)
506 		gsbase = x86_gsbase_read_cpu_inactive();
507 	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
508 		 (task->thread.gsindex == 0))
509 		gsbase = task->thread.gsbase;
510 	else
511 		gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
512 
513 	return gsbase;
514 }
515 
516 void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
517 {
518 	WARN_ON_ONCE(task == current);
519 
520 	task->thread.fsbase = fsbase;
521 }
522 
523 void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
524 {
525 	WARN_ON_ONCE(task == current);
526 
527 	task->thread.gsbase = gsbase;
528 }
529 
530 static void
531 start_thread_common(struct pt_regs *regs, unsigned long new_ip,
532 		    unsigned long new_sp,
533 		    u16 _cs, u16 _ss, u16 _ds)
534 {
535 	WARN_ON_ONCE(regs != current_pt_regs());
536 
537 	if (static_cpu_has(X86_BUG_NULL_SEG)) {
538 		/* Loading zero below won't clear the base. */
539 		loadsegment(fs, __USER_DS);
540 		load_gs_index(__USER_DS);
541 	}
542 
543 	reset_thread_features();
544 
545 	loadsegment(fs, 0);
546 	loadsegment(es, _ds);
547 	loadsegment(ds, _ds);
548 	load_gs_index(0);
549 
550 	regs->ip	= new_ip;
551 	regs->sp	= new_sp;
552 	regs->csx	= _cs;
553 	regs->ssx	= _ss;
554 	/*
555 	 * Allow single-step trap and NMI when starting a new task, thus
556 	 * once the new task enters user space, single-step trap and NMI
557 	 * are both enabled immediately.
558 	 *
559 	 * Entering a new task is logically speaking a return from a
560 	 * system call (exec, fork, clone, etc.). As such, if ptrace
561 	 * enables single stepping a single step exception should be
562 	 * allowed to trigger immediately upon entering user space.
563 	 * This is not optional.
564 	 *
565 	 * NMI should *never* be disabled in user space. As such, this
566 	 * is an optional, opportunistic way to catch errors.
567 	 *
568 	 * Paranoia: High-order 48 bits above the lowest 16 bit SS are
569 	 * discarded by the legacy IRET instruction on all Intel, AMD,
570 	 * and Cyrix/Centaur/VIA CPUs, thus can be set unconditionally,
571 	 * even when FRED is not enabled. But we choose the safer side
572 	 * to use these bits only when FRED is enabled.
573 	 */
574 	if (cpu_feature_enabled(X86_FEATURE_FRED)) {
575 		regs->fred_ss.swevent	= true;
576 		regs->fred_ss.nmi	= true;
577 	}
578 
579 	regs->flags	= X86_EFLAGS_IF | X86_EFLAGS_FIXED;
580 }
581 
582 void
583 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
584 {
585 	start_thread_common(regs, new_ip, new_sp,
586 			    __USER_CS, __USER_DS, 0);
587 }
588 EXPORT_SYMBOL_GPL(start_thread);
589 
590 #ifdef CONFIG_COMPAT
591 void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32)
592 {
593 	start_thread_common(regs, new_ip, new_sp,
594 			    x32 ? __USER_CS : __USER32_CS,
595 			    __USER_DS, __USER_DS);
596 }
597 #endif
598 
599 /*
600  *	switch_to(x,y) should switch tasks from x to y.
601  *
602  * This could still be optimized:
603  * - fold all the options into a flag word and test it with a single test.
604  * - could test fs/gs bitsliced
605  *
606  * Kprobes not supported here. Set the probe on schedule instead.
607  * Function graph tracer not supported too.
608  */
609 __no_kmsan_checks
610 __visible __notrace_funcgraph struct task_struct *
611 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
612 {
613 	struct thread_struct *prev = &prev_p->thread;
614 	struct thread_struct *next = &next_p->thread;
615 	int cpu = smp_processor_id();
616 
617 	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
618 		     this_cpu_read(hardirq_stack_inuse));
619 
620 	switch_fpu(prev_p, cpu);
621 
622 	/* We must save %fs and %gs before load_TLS() because
623 	 * %fs and %gs may be cleared by load_TLS().
624 	 *
625 	 * (e.g. xen_load_tls())
626 	 */
627 	save_fsgs(prev_p);
628 
629 	/*
630 	 * Load TLS before restoring any segments so that segment loads
631 	 * reference the correct GDT entries.
632 	 */
633 	load_TLS(next, cpu);
634 
635 	/*
636 	 * Leave lazy mode, flushing any hypercalls made here.  This
637 	 * must be done after loading TLS entries in the GDT but before
638 	 * loading segments that might reference them.
639 	 */
640 	arch_end_context_switch(next_p);
641 
642 	/* Switch DS and ES.
643 	 *
644 	 * Reading them only returns the selectors, but writing them (if
645 	 * nonzero) loads the full descriptor from the GDT or LDT.  The
646 	 * LDT for next is loaded in switch_mm, and the GDT is loaded
647 	 * above.
648 	 *
649 	 * We therefore need to write new values to the segment
650 	 * registers on every context switch unless both the new and old
651 	 * values are zero.
652 	 *
653 	 * Note that we don't need to do anything for CS and SS, as
654 	 * those are saved and restored as part of pt_regs.
655 	 */
656 	savesegment(es, prev->es);
657 	if (unlikely(next->es | prev->es))
658 		loadsegment(es, next->es);
659 
660 	savesegment(ds, prev->ds);
661 	if (unlikely(next->ds | prev->ds))
662 		loadsegment(ds, next->ds);
663 
664 	x86_fsgsbase_load(prev, next);
665 
666 	x86_pkru_load(prev, next);
667 
668 	/*
669 	 * Switch the PDA and FPU contexts.
670 	 */
671 	raw_cpu_write(current_task, next_p);
672 	raw_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
673 
674 	/* Reload sp0. */
675 	update_task_stack(next_p);
676 
677 	switch_to_extra(prev_p, next_p);
678 
679 	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
680 		/*
681 		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
682 		 * does not update the cached descriptor.  As a result, if we
683 		 * do SYSRET while SS is NULL, we'll end up in user mode with
684 		 * SS apparently equal to __USER_DS but actually unusable.
685 		 *
686 		 * The straightforward workaround would be to fix it up just
687 		 * before SYSRET, but that would slow down the system call
688 		 * fast paths.  Instead, we ensure that SS is never NULL in
689 		 * system call context.  We do this by replacing NULL SS
690 		 * selectors at every context switch.  SYSCALL sets up a valid
691 		 * SS, so the only way to get NULL is to re-enter the kernel
692 		 * from CPL 3 through an interrupt.  Since that can't happen
693 		 * in the same task as a running syscall, we are guaranteed to
694 		 * context switch between every interrupt vector entry and a
695 		 * subsequent SYSRET.
696 		 *
697 		 * We read SS first because SS reads are much faster than
698 		 * writes.  Out of caution, we force SS to __KERNEL_DS even if
699 		 * it previously had a different non-NULL value.
700 		 */
701 		unsigned short ss_sel;
702 		savesegment(ss, ss_sel);
703 		if (ss_sel != __KERNEL_DS)
704 			loadsegment(ss, __KERNEL_DS);
705 	}
706 
707 	/* Load the Intel cache allocation PQR MSR. */
708 	resctrl_arch_sched_in(next_p);
709 
710 	return prev_p;
711 }
712 
713 void set_personality_64bit(void)
714 {
715 	/* inherit personality from parent */
716 
717 	/* Make sure to be in 64bit mode */
718 	clear_thread_flag(TIF_ADDR32);
719 	/* Pretend that this comes from a 64bit execve */
720 	task_pt_regs(current)->orig_ax = __NR_execve;
721 	current_thread_info()->status &= ~TS_COMPAT;
722 	if (current->mm)
723 		__set_bit(MM_CONTEXT_HAS_VSYSCALL, &current->mm->context.flags);
724 
725 	/* TBD: overwrites user setup. Should have two bits.
726 	   But 64bit processes have always behaved this way,
727 	   so it's not too bad. The main problem is just that
728 	   32bit children are affected again. */
729 	current->personality &= ~READ_IMPLIES_EXEC;
730 }
731 
732 static void __set_personality_x32(void)
733 {
734 #ifdef CONFIG_X86_X32_ABI
735 	if (current->mm)
736 		current->mm->context.flags = 0;
737 
738 	current->personality &= ~READ_IMPLIES_EXEC;
739 	/*
740 	 * in_32bit_syscall() uses the presence of the x32 syscall bit
741 	 * flag to determine compat status.  The x86 mmap() code relies on
742 	 * the syscall bitness so set x32 syscall bit right here to make
743 	 * in_32bit_syscall() work during exec().
744 	 *
745 	 * Pretend to come from a x32 execve.
746 	 */
747 	task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
748 	current_thread_info()->status &= ~TS_COMPAT;
749 #endif
750 }
751 
752 static void __set_personality_ia32(void)
753 {
754 #ifdef CONFIG_IA32_EMULATION
755 	if (current->mm) {
756 		/*
757 		 * uprobes applied to this MM need to know this and
758 		 * cannot use user_64bit_mode() at that time.
759 		 */
760 		__set_bit(MM_CONTEXT_UPROBE_IA32, &current->mm->context.flags);
761 	}
762 
763 	current->personality |= force_personality32;
764 	/* Prepare the first "return" to user space */
765 	task_pt_regs(current)->orig_ax = __NR_ia32_execve;
766 	current_thread_info()->status |= TS_COMPAT;
767 #endif
768 }
769 
770 void set_personality_ia32(bool x32)
771 {
772 	/* Make sure to be in 32bit mode */
773 	set_thread_flag(TIF_ADDR32);
774 
775 	if (x32)
776 		__set_personality_x32();
777 	else
778 		__set_personality_ia32();
779 }
780 EXPORT_SYMBOL_GPL(set_personality_ia32);
781 
782 #ifdef CONFIG_CHECKPOINT_RESTORE
783 static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
784 {
785 	int ret;
786 
787 	ret = map_vdso_once(image, addr);
788 	if (ret)
789 		return ret;
790 
791 	return (long)image->size;
792 }
793 #endif
794 
795 #ifdef CONFIG_ADDRESS_MASKING
796 
797 #define LAM_U57_BITS 6
798 
799 static void enable_lam_func(void *__mm)
800 {
801 	struct mm_struct *mm = __mm;
802 	unsigned long lam;
803 
804 	if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) {
805 		lam = mm_lam_cr3_mask(mm);
806 		write_cr3(__read_cr3() | lam);
807 		cpu_tlbstate_update_lam(lam, mm_untag_mask(mm));
808 	}
809 }
810 
811 static void mm_enable_lam(struct mm_struct *mm)
812 {
813 	mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
814 	mm->context.untag_mask =  ~GENMASK(62, 57);
815 
816 	/*
817 	 * Even though the process must still be single-threaded at this
818 	 * point, kernel threads may be using the mm.  IPI those kernel
819 	 * threads if they exist.
820 	 */
821 	on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true);
822 	set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
823 }
824 
825 static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
826 {
827 	if (!cpu_feature_enabled(X86_FEATURE_LAM))
828 		return -ENODEV;
829 
830 	/* PTRACE_ARCH_PRCTL */
831 	if (current->mm != mm)
832 		return -EINVAL;
833 
834 	if (mm_valid_pasid(mm) &&
835 	    !test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags))
836 		return -EINVAL;
837 
838 	if (mmap_write_lock_killable(mm))
839 		return -EINTR;
840 
841 	/*
842 	 * MM_CONTEXT_LOCK_LAM is set on clone.  Prevent LAM from
843 	 * being enabled unless the process is single threaded:
844 	 */
845 	if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) {
846 		mmap_write_unlock(mm);
847 		return -EBUSY;
848 	}
849 
850 	if (!nr_bits || nr_bits > LAM_U57_BITS) {
851 		mmap_write_unlock(mm);
852 		return -EINVAL;
853 	}
854 
855 	mm_enable_lam(mm);
856 
857 	mmap_write_unlock(mm);
858 
859 	return 0;
860 }
861 #endif
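
/*
 * A minimal userspace sketch of the tagged-address flow implemented above,
 * assuming the ARCH_* constants from <asm/prctl.h> and a raw
 * syscall(SYS_arch_prctl, ...) from <sys/syscall.h>; the prctl must be
 * issued while the process is still single-threaded:
 *
 *	unsigned long max_bits = 0;
 *	syscall(SYS_arch_prctl, ARCH_GET_MAX_TAG_BITS, &max_bits);	// 6 with LAM_U57
 *	syscall(SYS_arch_prctl, ARCH_ENABLE_TAGGED_ADDR, max_bits);
 *
 *	// Bits 62:57 of user pointers are now ignored on dereference.
 *	char *p = malloc(64);
 *	char *tagged = (char *)((unsigned long)p | (0x2aUL << 57));
 *	memset(tagged, 0, 64);		// touches the same memory as p
 */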
862 
863 long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
864 {
865 	int ret = 0;
866 
867 	switch (option) {
868 	case ARCH_SET_GS: {
869 		if (unlikely(arg2 >= TASK_SIZE_MAX))
870 			return -EPERM;
871 
872 		preempt_disable();
873 		/*
874 		 * ARCH_SET_GS has always overwritten the index
875 		 * and the base. Zero is the most sensible value
876 		 * to put in the index, and is the only value that
877 		 * makes any sense if FSGSBASE is unavailable.
878 		 */
879 		if (task == current) {
880 			loadseg(GS, 0);
881 			x86_gsbase_write_cpu_inactive(arg2);
882 
883 			/*
884 			 * On non-FSGSBASE systems, save_base_legacy() expects
885 			 * that we also fill in thread.gsbase.
886 			 */
887 			task->thread.gsbase = arg2;
888 
889 		} else {
890 			task->thread.gsindex = 0;
891 			x86_gsbase_write_task(task, arg2);
892 		}
893 		preempt_enable();
894 		break;
895 	}
896 	case ARCH_SET_FS: {
897 		/*
898 		 * Not strictly needed for %fs, but do it for symmetry
899 		 * with %gs
900 		 */
901 		if (unlikely(arg2 >= TASK_SIZE_MAX))
902 			return -EPERM;
903 
904 		preempt_disable();
905 		/*
906 		 * Set the selector to 0 for the same reason
907 		 * as %gs above.
908 		 */
909 		if (task == current) {
910 			loadseg(FS, 0);
911 			x86_fsbase_write_cpu(arg2);
912 
913 			/*
914 			 * On non-FSGSBASE systems, save_base_legacy() expects
915 			 * that we also fill in thread.fsbase.
916 			 */
917 			task->thread.fsbase = arg2;
918 		} else {
919 			task->thread.fsindex = 0;
920 			x86_fsbase_write_task(task, arg2);
921 		}
922 		preempt_enable();
923 		break;
924 	}
925 	case ARCH_GET_FS: {
926 		unsigned long base = x86_fsbase_read_task(task);
927 
928 		ret = put_user(base, (unsigned long __user *)arg2);
929 		break;
930 	}
931 	case ARCH_GET_GS: {
932 		unsigned long base = x86_gsbase_read_task(task);
933 
934 		ret = put_user(base, (unsigned long __user *)arg2);
935 		break;
936 	}
937 
938 #ifdef CONFIG_CHECKPOINT_RESTORE
939 # ifdef CONFIG_X86_X32_ABI
940 	case ARCH_MAP_VDSO_X32:
941 		return prctl_map_vdso(&vdso_image_x32, arg2);
942 # endif
943 # ifdef CONFIG_IA32_EMULATION
944 	case ARCH_MAP_VDSO_32:
945 		return prctl_map_vdso(&vdso_image_32, arg2);
946 # endif
947 	case ARCH_MAP_VDSO_64:
948 		return prctl_map_vdso(&vdso_image_64, arg2);
949 #endif
950 #ifdef CONFIG_ADDRESS_MASKING
951 	case ARCH_GET_UNTAG_MASK:
952 		return put_user(task->mm->context.untag_mask,
953 				(unsigned long __user *)arg2);
954 	case ARCH_ENABLE_TAGGED_ADDR:
955 		return prctl_enable_tagged_addr(task->mm, arg2);
956 	case ARCH_FORCE_TAGGED_SVA:
957 		if (current != task)
958 			return -EINVAL;
959 		set_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &task->mm->context.flags);
960 		return 0;
961 	case ARCH_GET_MAX_TAG_BITS:
962 		if (!cpu_feature_enabled(X86_FEATURE_LAM))
963 			return put_user(0, (unsigned long __user *)arg2);
964 		else
965 			return put_user(LAM_U57_BITS, (unsigned long __user *)arg2);
966 #endif
967 	case ARCH_SHSTK_ENABLE:
968 	case ARCH_SHSTK_DISABLE:
969 	case ARCH_SHSTK_LOCK:
970 	case ARCH_SHSTK_UNLOCK:
971 	case ARCH_SHSTK_STATUS:
972 		return shstk_prctl(task, option, arg2);
973 	default:
974 		ret = -EINVAL;
975 		break;
976 	}
977 
978 	return ret;
979 }
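
/*
 * For reference, ARCH_SET_GS/ARCH_GET_GS (and the FS variants) above are
 * reached from userspace through the arch_prctl() syscall.  A minimal
 * sketch, assuming the ARCH_* constants from <asm/prctl.h>, SYS_arch_prctl
 * from <sys/syscall.h> and a hypothetical per-thread buffer "tls_block":
 *
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)tls_block);	// selector forced to 0
 *
 *	unsigned long cur = 0;
 *	syscall(SYS_arch_prctl, ARCH_GET_GS, &cur);	// cur == (unsigned long)tls_block
 */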
980