1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  Copyright (C) 1995  Linus Torvalds
4  *
5  *  Pentium III FXSR, SSE support
6  *	Gareth Hughes <gareth@valinux.com>, May 2000
7  *
8  *  X86-64 port
9  *	Andi Kleen.
10  *
11  *	CPU hotplug support - ashok.raj@intel.com
12  */
13 
14 /*
15  * This file handles the architecture-dependent parts of process handling.
16  */
17 
18 #include <linux/cpu.h>
19 #include <linux/errno.h>
20 #include <linux/sched.h>
21 #include <linux/sched/task.h>
22 #include <linux/sched/task_stack.h>
23 #include <linux/fs.h>
24 #include <linux/kernel.h>
25 #include <linux/mm.h>
26 #include <linux/elfcore.h>
27 #include <linux/smp.h>
28 #include <linux/slab.h>
29 #include <linux/user.h>
30 #include <linux/interrupt.h>
31 #include <linux/delay.h>
32 #include <linux/export.h>
33 #include <linux/ptrace.h>
34 #include <linux/notifier.h>
35 #include <linux/kprobes.h>
36 #include <linux/kdebug.h>
37 #include <linux/prctl.h>
38 #include <linux/uaccess.h>
39 #include <linux/io.h>
40 #include <linux/ftrace.h>
41 #include <linux/syscalls.h>
42 #include <linux/iommu.h>
43 
44 #include <asm/processor.h>
45 #include <asm/pkru.h>
46 #include <asm/fpu/sched.h>
47 #include <asm/mmu_context.h>
48 #include <asm/prctl.h>
49 #include <asm/desc.h>
50 #include <asm/proto.h>
51 #include <asm/ia32.h>
52 #include <asm/debugreg.h>
53 #include <asm/switch_to.h>
54 #include <asm/xen/hypervisor.h>
55 #include <asm/vdso.h>
56 #include <asm/resctrl.h>
57 #include <asm/unistd.h>
58 #include <asm/fsgsbase.h>
59 #include <asm/fred.h>
60 #include <asm/msr.h>
61 #ifdef CONFIG_IA32_EMULATION
62 /* Not included via unistd.h */
63 #include <asm/unistd_32_ia32.h>
64 #endif
65 
66 #include "process.h"
67 
68 /* Also prints some state that isn't saved in pt_regs */
69 void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
70 		 const char *log_lvl)
71 {
72 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
73 	unsigned long d0, d1, d2, d3, d6, d7;
74 	unsigned int fsindex, gsindex;
75 	unsigned int ds, es;
76 
77 	show_iret_regs(regs, log_lvl);
78 
79 	if (regs->orig_ax != -1)
80 		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
81 	else
82 		pr_cont("\n");
83 
84 	printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n",
85 	       log_lvl, regs->ax, regs->bx, regs->cx);
86 	printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n",
87 	       log_lvl, regs->dx, regs->si, regs->di);
88 	printk("%sRBP: %016lx R08: %016lx R09: %016lx\n",
89 	       log_lvl, regs->bp, regs->r8, regs->r9);
90 	printk("%sR10: %016lx R11: %016lx R12: %016lx\n",
91 	       log_lvl, regs->r10, regs->r11, regs->r12);
92 	printk("%sR13: %016lx R14: %016lx R15: %016lx\n",
93 	       log_lvl, regs->r13, regs->r14, regs->r15);
94 
95 	if (mode == SHOW_REGS_SHORT)
96 		return;
97 
98 	if (mode == SHOW_REGS_USER) {
99 		rdmsrq(MSR_FS_BASE, fs);
100 		rdmsrq(MSR_KERNEL_GS_BASE, shadowgs);
101 		printk("%sFS:  %016lx GS:  %016lx\n",
102 		       log_lvl, fs, shadowgs);
103 		return;
104 	}
105 
106 	asm("movl %%ds,%0" : "=r" (ds));
107 	asm("movl %%es,%0" : "=r" (es));
108 	asm("movl %%fs,%0" : "=r" (fsindex));
109 	asm("movl %%gs,%0" : "=r" (gsindex));
110 
111 	rdmsrq(MSR_FS_BASE, fs);
112 	rdmsrq(MSR_GS_BASE, gs);
113 	rdmsrq(MSR_KERNEL_GS_BASE, shadowgs);
114 
115 	cr0 = read_cr0();
116 	cr2 = read_cr2();
117 	cr3 = __read_cr3();
118 	cr4 = __read_cr4();
119 
120 	printk("%sFS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
121 	       log_lvl, fs, fsindex, gs, gsindex, shadowgs);
122 	printk("%sCS:  %04x DS: %04x ES: %04x CR0: %016lx\n",
123 		log_lvl, regs->cs, ds, es, cr0);
124 	printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
125 		log_lvl, cr2, cr3, cr4);
126 
127 	get_debugreg(d0, 0);
128 	get_debugreg(d1, 1);
129 	get_debugreg(d2, 2);
130 	get_debugreg(d3, 3);
131 	get_debugreg(d6, 6);
132 	get_debugreg(d7, 7);
133 
134 	/* Only print out debug registers if they are in their non-default state. */
135 	if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
136 	    (d6 == DR6_RESERVED) && (d7 == DR7_FIXED_1))) {
137 		printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n",
138 		       log_lvl, d0, d1, d2);
139 		printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n",
140 		       log_lvl, d3, d6, d7);
141 	}
142 
143 	if (cr4 & X86_CR4_PKE)
144 		printk("%sPKRU: %08x\n", log_lvl, read_pkru());
145 }
146 
147 void release_thread(struct task_struct *dead_task)
148 {
149 	WARN_ON(dead_task->mm);
150 }
151 
152 enum which_selector {
153 	FS,
154 	GS
155 };
156 
157 /*
158  * Out of line so that it is protected from kprobes and tracing. If this
159  * were traced or probed, any access to a per-CPU variable would happen
160  * with the wrong GS.
161  *
162  * It is not used on Xen paravirt. When paravirt support is needed, it
163  * needs to be renamed with a native_ prefix.
164  */
165 static noinstr unsigned long __rdgsbase_inactive(void)
166 {
167 	unsigned long gsbase;
168 
169 	lockdep_assert_irqs_disabled();
170 
171 	/*
172 	 * SWAPGS is no longer needed, and thus NOT allowed, with FRED because
173 	 * FRED transitions ensure that an operating system can _always_
174 	 * operate with its own GS base address:
175 	 * - For events that occur in ring 3, FRED event delivery swaps
176 	 *   the GS base address with the IA32_KERNEL_GS_BASE MSR.
177 	 * - ERETU (the FRED transition that returns to ring 3) also swaps
178 	 *   the GS base address with the IA32_KERNEL_GS_BASE MSR.
179 	 *
180 	 * And the operating system can still set up the GS segment for a
181 	 * user thread without needing to load a user thread GS, by:
182 	 * - Using LKGS, available with FRED, to modify other attributes
183 	 *   of the GS segment without compromising its ability to always
184 	 *   operate with its own GS base address.
185 	 * - Accessing the GS segment base address for a user thread, as
186 	 *   before, using RDMSR or WRMSR on the IA32_KERNEL_GS_BASE MSR.
187 	 *
188 	 * Note, LKGS loads the GS base address into the IA32_KERNEL_GS_BASE
189 	 * MSR instead of the GS segment’s descriptor cache. As such, the
190 	 * operating system never changes its runtime GS base address.
191 	 */
192 	if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
193 	    !cpu_feature_enabled(X86_FEATURE_XENPV)) {
194 		native_swapgs();
195 		gsbase = rdgsbase();
196 		native_swapgs();
197 	} else {
198 		instrumentation_begin();
199 		rdmsrq(MSR_KERNEL_GS_BASE, gsbase);
200 		instrumentation_end();
201 	}
202 
203 	return gsbase;
204 }
205 
206 /*
207  * Out of line so that it is protected from kprobes and tracing. If this
208  * were traced or probed, any access to a per-CPU variable would happen
209  * with the wrong GS.
210  *
211  * It is not used on Xen paravirt. When paravirt support is needed, it
212  * needs to be renamed with a native_ prefix.
213  */
214 static noinstr void __wrgsbase_inactive(unsigned long gsbase)
215 {
216 	lockdep_assert_irqs_disabled();
217 
218 	if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
219 	    !cpu_feature_enabled(X86_FEATURE_XENPV)) {
220 		native_swapgs();
221 		wrgsbase(gsbase);
222 		native_swapgs();
223 	} else {
224 		instrumentation_begin();
225 		wrmsrq(MSR_KERNEL_GS_BASE, gsbase);
226 		instrumentation_end();
227 	}
228 }
229 
230 /*
231  * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
232  * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
233  * It's forcibly inlined because it'll generate better code and this function
234  * is hot.
235  */
236 static __always_inline void save_base_legacy(struct task_struct *prev_p,
237 					     unsigned short selector,
238 					     enum which_selector which)
239 {
240 	if (likely(selector == 0)) {
241 		/*
242 		 * On Intel (without X86_BUG_NULL_SEG), the segment base could
243 		 * be the pre-existing saved base or it could be zero.  On AMD
244 		 * (with X86_BUG_NULL_SEG), the segment base could be almost
245 		 * anything.
246 		 *
247 		 * This branch is very hot (it's hit twice on almost every
248 		 * context switch between 64-bit programs), and avoiding
249 		 * the RDMSR helps a lot, so we just assume that whatever
250 		 * value is already saved is correct.  This matches historical
251 		 * Linux behavior, so it won't break existing applications.
252 		 *
253 		 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
254 		 * report that the base is zero, it needs to actually be zero:
255 		 * see the corresponding logic in load_seg_legacy.
256 		 */
257 	} else {
258 		/*
259 		 * If the selector is 1, 2, or 3, then the base is zero on
260 		 * !X86_BUG_NULL_SEG CPUs and could be anything on
261 		 * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
262 		 * has never attempted to preserve the base across context
263 		 * switches.
264 		 *
265 		 * If selector > 3, then it refers to a real segment, and
266 		 * saving the base isn't necessary.
267 		 */
268 		if (which == FS)
269 			prev_p->thread.fsbase = 0;
270 		else
271 			prev_p->thread.gsbase = 0;
272 	}
273 }
274 
275 static __always_inline void save_fsgs(struct task_struct *task)
276 {
277 	savesegment(fs, task->thread.fsindex);
278 	savesegment(gs, task->thread.gsindex);
279 	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
280 		/*
281 		 * If FSGSBASE is enabled, we can't make any useful guesses
282 		 * about the base, and user code expects us to save the current
283 		 * value.  Fortunately, reading the base directly is efficient.
284 		 */
285 		task->thread.fsbase = rdfsbase();
286 		task->thread.gsbase = __rdgsbase_inactive();
287 	} else {
288 		save_base_legacy(task, task->thread.fsindex, FS);
289 		save_base_legacy(task, task->thread.gsindex, GS);
290 	}
291 }
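/*
 * Note: save_fsgs() relies on interrupts being disabled when FSGSBASE is
 * enabled, because __rdgsbase_inactive() brackets RDGSBASE with SWAPGS and
 * an interrupt taken in between would run with the wrong GS.
 * current_save_fsgs() below is the IRQ-safe wrapper for callers outside
 * the context-switch path.
 */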
292 
293 /*
294  * While a process is running,current->thread.fsbase and current->thread.gsbase
295  * may not match the corresponding CPU registers (see save_base_legacy()).
296  */
297 void current_save_fsgs(void)
298 {
299 	unsigned long flags;
300 
301 	/* Interrupts need to be off for FSGSBASE */
302 	local_irq_save(flags);
303 	save_fsgs(current);
304 	local_irq_restore(flags);
305 }
306 #if IS_ENABLED(CONFIG_KVM)
307 EXPORT_SYMBOL_GPL(current_save_fsgs);
308 #endif
309 
310 static __always_inline void loadseg(enum which_selector which,
311 				    unsigned short sel)
312 {
313 	if (which == FS)
314 		loadsegment(fs, sel);
315 	else
316 		load_gs_index(sel);
317 }
318 
319 static __always_inline void load_seg_legacy(unsigned short prev_index,
320 					    unsigned long prev_base,
321 					    unsigned short next_index,
322 					    unsigned long next_base,
323 					    enum which_selector which)
324 {
325 	if (likely(next_index <= 3)) {
326 		/*
327 		 * The next task is using 64-bit TLS, is not using this
328 		 * segment at all, or is having fun with arcane CPU features.
329 		 */
330 		if (next_base == 0) {
331 			/*
332 			 * Nasty case: on AMD CPUs, we need to forcibly zero
333 			 * the base.
334 			 */
335 			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
336 				loadseg(which, __USER_DS);
337 				loadseg(which, next_index);
338 			} else {
339 				/*
340 				 * We could try to exhaustively detect cases
341 				 * under which we can skip the segment load,
342 				 * but there's really only one case that matters
343 				 * for performance: if both the previous and
344 				 * next states are fully zeroed, we can skip
345 				 * the load.
346 				 *
347 				 * (This assumes that prev_base == 0 has no
348 				 * false positives.  This is the case on
349 				 * Intel-style CPUs.)
350 				 */
351 				if (likely(prev_index | next_index | prev_base))
352 					loadseg(which, next_index);
353 			}
354 		} else {
355 			if (prev_index != next_index)
356 				loadseg(which, next_index);
357 			wrmsrq(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
358 			       next_base);
359 		}
360 	} else {
361 		/*
362 		 * The next task is using a real segment.  Loading the selector
363 		 * is sufficient.
364 		 */
365 		loadseg(which, next_index);
366 	}
367 }
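/*
 * Summary of the legacy reload policy above: a selector above 3 names a
 * real segment and reloading the selector is sufficient; a selector of 3
 * or less with a nonzero base needs a selector reload (if it changed) plus
 * an explicit base MSR write; and a selector of 3 or less with a zero base
 * can be skipped entirely, but only on !X86_BUG_NULL_SEG CPUs and only
 * when the previous selector, previous base, and next selector are all
 * zero.
 */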
368 
369 /*
370  * Store prev's PKRU value and load next's PKRU value if they differ. PKRU
371  * is not XSTATE managed on context switch because that would require a
372  * lookup in the task's FPU xsave buffer and would require keeping it
373  * updated in various places.
374  */
375 static __always_inline void x86_pkru_load(struct thread_struct *prev,
376 					  struct thread_struct *next)
377 {
378 	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
379 		return;
380 
381 	/* Stash the prev task's value: */
382 	prev->pkru = rdpkru();
383 
384 	/*
385 	 * PKRU writes are slightly expensive.  Avoid them when not
386 	 * strictly necessary:
387 	 */
388 	if (prev->pkru != next->pkru)
389 		wrpkru(next->pkru);
390 }
391 
392 static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
393 					      struct thread_struct *next)
394 {
395 	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
396 		/* Update the FS and GS selectors if they could have changed. */
397 		if (unlikely(prev->fsindex || next->fsindex))
398 			loadseg(FS, next->fsindex);
399 		if (unlikely(prev->gsindex || next->gsindex))
400 			loadseg(GS, next->gsindex);
401 
402 		/* Update the bases. */
403 		wrfsbase(next->fsbase);
404 		__wrgsbase_inactive(next->gsbase);
405 	} else {
406 		load_seg_legacy(prev->fsindex, prev->fsbase,
407 				next->fsindex, next->fsbase, FS);
408 		load_seg_legacy(prev->gsindex, prev->gsbase,
409 				next->gsindex, next->gsbase, GS);
410 	}
411 }
412 
413 unsigned long x86_fsgsbase_read_task(struct task_struct *task,
414 				     unsigned short selector)
415 {
416 	unsigned short idx = selector >> 3;
417 	unsigned long base;
418 
419 	if (likely((selector & SEGMENT_TI_MASK) == 0)) {
420 		if (unlikely(idx >= GDT_ENTRIES))
421 			return 0;
422 
423 		/*
424 		 * There are no user segments in the GDT with nonzero bases
425 		 * other than the TLS segments.
426 		 */
427 		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
428 			return 0;
429 
430 		idx -= GDT_ENTRY_TLS_MIN;
431 		base = get_desc_base(&task->thread.tls_array[idx]);
432 	} else {
433 #ifdef CONFIG_MODIFY_LDT_SYSCALL
434 		struct ldt_struct *ldt;
435 
436 		/*
437 		 * If performance here mattered, we could protect the LDT
438 		 * with RCU.  This is a slow path, though, so we can just
439 		 * take the mutex.
440 		 */
441 		mutex_lock(&task->mm->context.lock);
442 		ldt = task->mm->context.ldt;
443 		if (unlikely(!ldt || idx >= ldt->nr_entries))
444 			base = 0;
445 		else
446 			base = get_desc_base(ldt->entries + idx);
447 		mutex_unlock(&task->mm->context.lock);
448 #else
449 		base = 0;
450 #endif
451 	}
452 
453 	return base;
454 }
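/*
 * Selector layout assumed above (architectural): bits 15:3 are the
 * descriptor table index, bit 2 (SEGMENT_TI_MASK) selects GDT (0) or
 * LDT (1), and bits 1:0 are the RPL.  Hence "selector >> 3" is the index
 * used for both the TLS slot and LDT lookups.
 */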
455 
456 unsigned long x86_gsbase_read_cpu_inactive(void)
457 {
458 	unsigned long gsbase;
459 
460 	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
461 		unsigned long flags;
462 
463 		local_irq_save(flags);
464 		gsbase = __rdgsbase_inactive();
465 		local_irq_restore(flags);
466 	} else {
467 		rdmsrq(MSR_KERNEL_GS_BASE, gsbase);
468 	}
469 
470 	return gsbase;
471 }
472 
473 void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
474 {
475 	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
476 		unsigned long flags;
477 
478 		local_irq_save(flags);
479 		__wrgsbase_inactive(gsbase);
480 		local_irq_restore(flags);
481 	} else {
482 		wrmsrq(MSR_KERNEL_GS_BASE, gsbase);
483 	}
484 }
485 
486 unsigned long x86_fsbase_read_task(struct task_struct *task)
487 {
488 	unsigned long fsbase;
489 
490 	if (task == current)
491 		fsbase = x86_fsbase_read_cpu();
492 	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
493 		 (task->thread.fsindex == 0))
494 		fsbase = task->thread.fsbase;
495 	else
496 		fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
497 
498 	return fsbase;
499 }
500 
501 unsigned long x86_gsbase_read_task(struct task_struct *task)
502 {
503 	unsigned long gsbase;
504 
505 	if (task == current)
506 		gsbase = x86_gsbase_read_cpu_inactive();
507 	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
508 		 (task->thread.gsindex == 0))
509 		gsbase = task->thread.gsbase;
510 	else
511 		gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
512 
513 	return gsbase;
514 }
515 
516 void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
517 {
518 	WARN_ON_ONCE(task == current);
519 
520 	task->thread.fsbase = fsbase;
521 }
522 
523 void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
524 {
525 	WARN_ON_ONCE(task == current);
526 
527 	task->thread.gsbase = gsbase;
528 }
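/*
 * x86_fsbase_write_task() and x86_gsbase_write_task() only update the
 * copy saved in the thread struct; they are intended for a stopped task
 * (e.g. a ptrace target), hence the WARN_ON_ONCE(task == current).  The
 * new base takes effect when the task is next scheduled in and
 * x86_fsgsbase_load() (or load_seg_legacy()) runs in __switch_to().
 */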
529 
530 static void
531 start_thread_common(struct pt_regs *regs, unsigned long new_ip,
532 		    unsigned long new_sp,
533 		    u16 _cs, u16 _ss, u16 _ds)
534 {
535 	WARN_ON_ONCE(regs != current_pt_regs());
536 
537 	if (static_cpu_has(X86_BUG_NULL_SEG)) {
538 		/* Loading zero below won't clear the base. */
539 		loadsegment(fs, __USER_DS);
540 		load_gs_index(__USER_DS);
541 	}
542 
543 	reset_thread_features();
544 
545 	loadsegment(fs, 0);
546 	loadsegment(es, _ds);
547 	loadsegment(ds, _ds);
548 	load_gs_index(0);
549 
550 	regs->ip	= new_ip;
551 	regs->sp	= new_sp;
552 	regs->csx	= _cs;
553 	regs->ssx	= _ss;
554 	/*
555 	 * Allow single-step trap and NMI when starting a new task, so that
556 	 * once the new task enters user space, single-step trap and NMI
557 	 * are both enabled immediately.
558 	 *
559 	 * Entering a new task is logically speaking a return from a
560 	 * system call (exec, fork, clone, etc.). As such, if ptrace
561 	 * enables single stepping a single step exception should be
562 	 * allowed to trigger immediately upon entering user space.
563 	 * This is not optional.
564 	 *
565 	 * NMI should *never* be disabled in user space. As such, this
566 	 * is an optional, opportunistic way to catch errors.
567 	 *
568 	 * Paranoia: the high-order 48 bits above the lowest 16 SS bits are
569 	 * discarded by the legacy IRET instruction on all Intel, AMD,
570 	 * and Cyrix/Centaur/VIA CPUs, so they could be set unconditionally,
571 	 * even when FRED is not enabled. We nevertheless err on the safe
572 	 * side and use these bits only when FRED is enabled.
573 	 */
574 	if (cpu_feature_enabled(X86_FEATURE_FRED)) {
575 		regs->fred_ss.swevent	= true;
576 		regs->fred_ss.nmi	= true;
577 	}
578 
579 	regs->flags	= X86_EFLAGS_IF | X86_EFLAGS_FIXED;
580 }
581 
582 void
583 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
584 {
585 	start_thread_common(regs, new_ip, new_sp,
586 			    __USER_CS, __USER_DS, 0);
587 }
588 EXPORT_SYMBOL_GPL(start_thread);
589 
590 #ifdef CONFIG_COMPAT
591 void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32)
592 {
593 	start_thread_common(regs, new_ip, new_sp,
594 			    x32 ? __USER_CS : __USER32_CS,
595 			    __USER_DS, __USER_DS);
596 }
597 #endif
598 
599 /*
600  *	switch_to(x,y) should switch tasks from x to y.
601  *
602  * This could still be optimized:
603  * - fold all the options into a flag word and test it with a single test.
604  * - could test fs/gs bitsliced
605  *
606  * Kprobes not supported here. Set the probe on schedule instead.
607  * Function graph tracer not supported too.
608  */
609 __no_kmsan_checks
610 __visible __notrace_funcgraph struct task_struct *
611 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
612 {
613 	struct thread_struct *prev = &prev_p->thread;
614 	struct thread_struct *next = &next_p->thread;
615 	int cpu = smp_processor_id();
616 
617 	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
618 		     this_cpu_read(hardirq_stack_inuse));
619 
620 	switch_fpu(prev_p, cpu);
621 
622 	/* We must save %fs and %gs before load_TLS() because
623 	 * %fs and %gs may be cleared by load_TLS().
624 	 *
625 	 * (e.g. xen_load_tls())
626 	 */
627 	save_fsgs(prev_p);
628 
629 	/*
630 	 * Load TLS before restoring any segments so that segment loads
631 	 * reference the correct GDT entries.
632 	 */
633 	load_TLS(next, cpu);
634 
635 	/*
636 	 * Leave lazy mode, flushing any hypercalls made here.  This
637 	 * must be done after loading TLS entries in the GDT but before
638 	 * loading segments that might reference them.
639 	 */
640 	arch_end_context_switch(next_p);
641 
642 	/* Switch DS and ES.
643 	 *
644 	 * Reading them only returns the selectors, but writing them (if
645 	 * nonzero) loads the full descriptor from the GDT or LDT.  The
646 	 * LDT for next is loaded in switch_mm, and the GDT is loaded
647 	 * above.
648 	 *
649 	 * We therefore need to write new values to the segment
650 	 * registers on every context switch unless both the new and old
651 	 * values are zero.
652 	 *
653 	 * Note that we don't need to do anything for CS and SS, as
654 	 * those are saved and restored as part of pt_regs.
655 	 */
656 	savesegment(es, prev->es);
657 	if (unlikely(next->es | prev->es))
658 		loadsegment(es, next->es);
659 
660 	savesegment(ds, prev->ds);
661 	if (unlikely(next->ds | prev->ds))
662 		loadsegment(ds, next->ds);
663 
664 	x86_fsgsbase_load(prev, next);
665 
666 	x86_pkru_load(prev, next);
667 
668 	/*
669 	 * Switch the PDA and FPU contexts.
670 	 */
671 	raw_cpu_write(current_task, next_p);
672 	raw_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
673 
674 	/* Reload sp0. */
675 	update_task_stack(next_p);
676 
677 	switch_to_extra(prev_p, next_p);
678 
679 	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
680 		/*
681 		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
682 		 * does not update the cached descriptor.  As a result, if we
683 		 * do SYSRET while SS is NULL, we'll end up in user mode with
684 		 * SS apparently equal to __USER_DS but actually unusable.
685 		 *
686 		 * The straightforward workaround would be to fix it up just
687 		 * before SYSRET, but that would slow down the system call
688 		 * fast paths.  Instead, we ensure that SS is never NULL in
689 		 * system call context.  We do this by replacing NULL SS
690 		 * selectors at every context switch.  SYSCALL sets up a valid
691 		 * SS, so the only way to get NULL is to re-enter the kernel
692 		 * from CPL 3 through an interrupt.  Since that can't happen
693 		 * in the same task as a running syscall, we are guaranteed to
694 		 * context switch between every interrupt vector entry and a
695 		 * subsequent SYSRET.
696 		 *
697 		 * We read SS first because SS reads are much faster than
698 		 * writes.  Out of caution, we force SS to __KERNEL_DS even if
699 		 * it previously had a different non-NULL value.
700 		 */
701 		unsigned short ss_sel;
702 		savesegment(ss, ss_sel);
703 		if (ss_sel != __KERNEL_DS)
704 			loadsegment(ss, __KERNEL_DS);
705 	}
706 
707 	/* Load the Intel cache allocation PQR MSR. */
708 	resctrl_arch_sched_in(next_p);
709 
710 	/* Reset hw history on AMD CPUs */
711 	if (cpu_feature_enabled(X86_FEATURE_AMD_WORKLOAD_CLASS))
712 		wrmsrl(MSR_AMD_WORKLOAD_HRST, 0x1);
713 
714 	return prev_p;
715 }
716 
717 void set_personality_64bit(void)
718 {
719 	/* inherit personality from parent */
720 
721 	/* Make sure to be in 64bit mode */
722 	clear_thread_flag(TIF_ADDR32);
723 	/* Pretend that this comes from a 64bit execve */
724 	task_pt_regs(current)->orig_ax = __NR_execve;
725 	current_thread_info()->status &= ~TS_COMPAT;
726 	if (current->mm)
727 		__set_bit(MM_CONTEXT_HAS_VSYSCALL, &current->mm->context.flags);
728 
729 	/* TBD: overwrites user setup. Should have two bits.
730 	   But 64bit processes have always behaved this way,
731 	   so it's not too bad. The main problem is just that
732 	   32bit children are affected again. */
733 	current->personality &= ~READ_IMPLIES_EXEC;
734 }
735 
736 static void __set_personality_x32(void)
737 {
738 #ifdef CONFIG_X86_X32_ABI
739 	if (current->mm)
740 		current->mm->context.flags = 0;
741 
742 	current->personality &= ~READ_IMPLIES_EXEC;
743 	/*
744 	 * in_32bit_syscall() uses the presence of the x32 syscall bit
745 	 * flag to determine compat status.  The x86 mmap() code relies on
746 	 * the syscall bitness, so set the x32 syscall bit right here to make
747 	 * in_32bit_syscall() work during exec().
748 	 *
749 	 * Pretend to come from an x32 execve.
750 	 */
751 	task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
752 	current_thread_info()->status &= ~TS_COMPAT;
753 #endif
754 }
755 
756 static void __set_personality_ia32(void)
757 {
758 #ifdef CONFIG_IA32_EMULATION
759 	if (current->mm) {
760 		/*
761 		 * uprobes applied to this MM need to know this and
762 		 * cannot use user_64bit_mode() at that time.
763 		 */
764 		__set_bit(MM_CONTEXT_UPROBE_IA32, &current->mm->context.flags);
765 	}
766 
767 	current->personality |= force_personality32;
768 	/* Prepare the first "return" to user space */
769 	task_pt_regs(current)->orig_ax = __NR_ia32_execve;
770 	current_thread_info()->status |= TS_COMPAT;
771 #endif
772 }
773 
774 void set_personality_ia32(bool x32)
775 {
776 	/* Make sure to be in 32bit mode */
777 	set_thread_flag(TIF_ADDR32);
778 
779 	if (x32)
780 		__set_personality_x32();
781 	else
782 		__set_personality_ia32();
783 }
784 EXPORT_SYMBOL_GPL(set_personality_ia32);
785 
786 #ifdef CONFIG_CHECKPOINT_RESTORE
787 static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
788 {
789 	int ret;
790 
791 	ret = map_vdso_once(image, addr);
792 	if (ret)
793 		return ret;
794 
795 	return (long)image->size;
796 }
797 #endif
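/*
 * Illustrative userspace usage (a sketch, not part of this file): a
 * checkpoint/restore tool maps the vDSO at a chosen address via
 * arch_prctl(2), e.g.
 *
 *	long sz = syscall(SYS_arch_prctl, ARCH_MAP_VDSO_64, addr);
 *
 * where a successful call returns the size of the mapped vDSO image and
 * failures from map_vdso_once() come back the usual -1/errno way.
 */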
798 
799 #ifdef CONFIG_ADDRESS_MASKING
800 
801 #define LAM_U57_BITS 6
802 
803 static void enable_lam_func(void *__mm)
804 {
805 	struct mm_struct *mm = __mm;
806 	unsigned long lam;
807 
808 	if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) {
809 		lam = mm_lam_cr3_mask(mm);
810 		write_cr3(__read_cr3() | lam);
811 		cpu_tlbstate_update_lam(lam, mm_untag_mask(mm));
812 	}
813 }
814 
815 static void mm_enable_lam(struct mm_struct *mm)
816 {
817 	mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
818 	mm->context.untag_mask =  ~GENMASK(62, 57);
819 
820 	/*
821 	 * Even though the process must still be single-threaded at this
822 	 * point, kernel threads may be using the mm.  IPI those kernel
823 	 * threads if they exist.
824 	 */
825 	on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true);
826 	set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
827 }
828 
829 static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
830 {
831 	if (!cpu_feature_enabled(X86_FEATURE_LAM))
832 		return -ENODEV;
833 
834 	/* PTRACE_ARCH_PRCTL */
835 	if (current->mm != mm)
836 		return -EINVAL;
837 
838 	if (mm_valid_pasid(mm) &&
839 	    !test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags))
840 		return -EINVAL;
841 
842 	if (mmap_write_lock_killable(mm))
843 		return -EINTR;
844 
845 	/*
846 	 * MM_CONTEXT_LOCK_LAM is set on clone.  Prevent LAM from
847 	 * being enabled unless the process is single threaded:
848 	 */
849 	if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) {
850 		mmap_write_unlock(mm);
851 		return -EBUSY;
852 	}
853 
854 	if (!nr_bits || nr_bits > LAM_U57_BITS) {
855 		mmap_write_unlock(mm);
856 		return -EINVAL;
857 	}
858 
859 	mm_enable_lam(mm);
860 
861 	mmap_write_unlock(mm);
862 
863 	return 0;
864 }
865 #endif
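/*
 * Illustrative userspace usage (a sketch, not part of this file): a still
 * single-threaded process enables LAM_U57 for its address space with
 *
 *	syscall(SYS_arch_prctl, ARCH_ENABLE_TAGGED_ADDR, 6);
 *
 * after which bits 62:57 of user pointers are treated as tag bits and
 * masked before the CPU's canonicality checks, matching the untag_mask
 * (~GENMASK(62, 57)) set in mm_enable_lam() above.  A caller that has
 * already spawned threads gets -EBUSY via the MM_CONTEXT_LOCK_LAM check.
 */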
866 
867 long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
868 {
869 	int ret = 0;
870 
871 	switch (option) {
872 	case ARCH_SET_GS: {
873 		if (unlikely(arg2 >= TASK_SIZE_MAX))
874 			return -EPERM;
875 
876 		preempt_disable();
877 		/*
878 		 * ARCH_SET_GS has always overwritten the index
879 		 * and the base. Zero is the most sensible value
880 		 * to put in the index, and is the only value that
881 		 * makes any sense if FSGSBASE is unavailable.
882 		 */
883 		if (task == current) {
884 			loadseg(GS, 0);
885 			x86_gsbase_write_cpu_inactive(arg2);
886 
887 			/*
888 			 * On non-FSGSBASE systems, save_base_legacy() expects
889 			 * that we also fill in thread.gsbase.
890 			 */
891 			task->thread.gsbase = arg2;
892 
893 		} else {
894 			task->thread.gsindex = 0;
895 			x86_gsbase_write_task(task, arg2);
896 		}
897 		preempt_enable();
898 		break;
899 	}
900 	case ARCH_SET_FS: {
901 		/*
902 		 * Not strictly needed for %fs, but do it for symmetry
903 		 * with %gs
904 		 */
905 		if (unlikely(arg2 >= TASK_SIZE_MAX))
906 			return -EPERM;
907 
908 		preempt_disable();
909 		/*
910 		 * Set the selector to 0 for the same reason
911 		 * as %gs above.
912 		 */
913 		if (task == current) {
914 			loadseg(FS, 0);
915 			x86_fsbase_write_cpu(arg2);
916 
917 			/*
918 			 * On non-FSGSBASE systems, save_base_legacy() expects
919 			 * that we also fill in thread.fsbase.
920 			 */
921 			task->thread.fsbase = arg2;
922 		} else {
923 			task->thread.fsindex = 0;
924 			x86_fsbase_write_task(task, arg2);
925 		}
926 		preempt_enable();
927 		break;
928 	}
929 	case ARCH_GET_FS: {
930 		unsigned long base = x86_fsbase_read_task(task);
931 
932 		ret = put_user(base, (unsigned long __user *)arg2);
933 		break;
934 	}
935 	case ARCH_GET_GS: {
936 		unsigned long base = x86_gsbase_read_task(task);
937 
938 		ret = put_user(base, (unsigned long __user *)arg2);
939 		break;
940 	}
941 
942 #ifdef CONFIG_CHECKPOINT_RESTORE
943 # ifdef CONFIG_X86_X32_ABI
944 	case ARCH_MAP_VDSO_X32:
945 		return prctl_map_vdso(&vdso_image_x32, arg2);
946 # endif
947 # ifdef CONFIG_IA32_EMULATION
948 	case ARCH_MAP_VDSO_32:
949 		return prctl_map_vdso(&vdso_image_32, arg2);
950 # endif
951 	case ARCH_MAP_VDSO_64:
952 		return prctl_map_vdso(&vdso_image_64, arg2);
953 #endif
954 #ifdef CONFIG_ADDRESS_MASKING
955 	case ARCH_GET_UNTAG_MASK:
956 		return put_user(task->mm->context.untag_mask,
957 				(unsigned long __user *)arg2);
958 	case ARCH_ENABLE_TAGGED_ADDR:
959 		return prctl_enable_tagged_addr(task->mm, arg2);
960 	case ARCH_FORCE_TAGGED_SVA:
961 		if (current != task)
962 			return -EINVAL;
963 		set_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &task->mm->context.flags);
964 		return 0;
965 	case ARCH_GET_MAX_TAG_BITS:
966 		if (!cpu_feature_enabled(X86_FEATURE_LAM))
967 			return put_user(0, (unsigned long __user *)arg2);
968 		else
969 			return put_user(LAM_U57_BITS, (unsigned long __user *)arg2);
970 #endif
971 	case ARCH_SHSTK_ENABLE:
972 	case ARCH_SHSTK_DISABLE:
973 	case ARCH_SHSTK_LOCK:
974 	case ARCH_SHSTK_UNLOCK:
975 	case ARCH_SHSTK_STATUS:
976 		return shstk_prctl(task, option, arg2);
977 	default:
978 		ret = -EINVAL;
979 		break;
980 	}
981 
982 	return ret;
983 }
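/*
 * Illustrative userspace usage (a sketch, not part of this file): the
 * classic way to manage the FS/GS base without FSGSBASE instructions is
 * arch_prctl(2), e.g.
 *
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)gs_area);
 *	syscall(SYS_arch_prctl, ARCH_GET_GS, &gsbase);
 *
 * ARCH_SET_FS/ARCH_SET_GS zero the selector and write the base directly,
 * which is why do_arch_prctl_64() rejects bases at or above TASK_SIZE_MAX
 * with -EPERM.
 */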
984