xref: /linux/arch/x86/kernel/process_64.c (revision b0402403e54ae9eb94ce1cbb53c7def776e97426)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  Copyright (C) 1995  Linus Torvalds
4  *
5  *  Pentium III FXSR, SSE support
6  *	Gareth Hughes <gareth@valinux.com>, May 2000
7  *
8  *  X86-64 port
9  *	Andi Kleen.
10  *
11  *	CPU hotplug support - ashok.raj@intel.com
12  */
13 
14 /*
15  * This file handles the architecture-dependent parts of process handling.
16  */
17 
18 #include <linux/cpu.h>
19 #include <linux/errno.h>
20 #include <linux/sched.h>
21 #include <linux/sched/task.h>
22 #include <linux/sched/task_stack.h>
23 #include <linux/fs.h>
24 #include <linux/kernel.h>
25 #include <linux/mm.h>
26 #include <linux/elfcore.h>
27 #include <linux/smp.h>
28 #include <linux/slab.h>
29 #include <linux/user.h>
30 #include <linux/interrupt.h>
31 #include <linux/delay.h>
32 #include <linux/export.h>
33 #include <linux/ptrace.h>
34 #include <linux/notifier.h>
35 #include <linux/kprobes.h>
36 #include <linux/kdebug.h>
37 #include <linux/prctl.h>
38 #include <linux/uaccess.h>
39 #include <linux/io.h>
40 #include <linux/ftrace.h>
41 #include <linux/syscalls.h>
42 #include <linux/iommu.h>
43 
44 #include <asm/processor.h>
45 #include <asm/pkru.h>
46 #include <asm/fpu/sched.h>
47 #include <asm/mmu_context.h>
48 #include <asm/prctl.h>
49 #include <asm/desc.h>
50 #include <asm/proto.h>
51 #include <asm/ia32.h>
52 #include <asm/debugreg.h>
53 #include <asm/switch_to.h>
54 #include <asm/xen/hypervisor.h>
55 #include <asm/vdso.h>
56 #include <asm/resctrl.h>
57 #include <asm/unistd.h>
58 #include <asm/fsgsbase.h>
59 #include <asm/fred.h>
60 #ifdef CONFIG_IA32_EMULATION
61 /* Not included via unistd.h */
62 #include <asm/unistd_32_ia32.h>
63 #endif
64 
65 #include "process.h"
66 
67 /* Prints also some state that isn't saved in the pt_regs */
68 void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
69 		 const char *log_lvl)
70 {
71 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
72 	unsigned long d0, d1, d2, d3, d6, d7;
73 	unsigned int fsindex, gsindex;
74 	unsigned int ds, es;
75 
76 	show_iret_regs(regs, log_lvl);
77 
78 	if (regs->orig_ax != -1)
79 		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
80 	else
81 		pr_cont("\n");
82 
83 	printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n",
84 	       log_lvl, regs->ax, regs->bx, regs->cx);
85 	printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n",
86 	       log_lvl, regs->dx, regs->si, regs->di);
87 	printk("%sRBP: %016lx R08: %016lx R09: %016lx\n",
88 	       log_lvl, regs->bp, regs->r8, regs->r9);
89 	printk("%sR10: %016lx R11: %016lx R12: %016lx\n",
90 	       log_lvl, regs->r10, regs->r11, regs->r12);
91 	printk("%sR13: %016lx R14: %016lx R15: %016lx\n",
92 	       log_lvl, regs->r13, regs->r14, regs->r15);
93 
94 	if (mode == SHOW_REGS_SHORT)
95 		return;
96 
97 	if (mode == SHOW_REGS_USER) {
98 		rdmsrl(MSR_FS_BASE, fs);
99 		rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
100 		printk("%sFS:  %016lx GS:  %016lx\n",
101 		       log_lvl, fs, shadowgs);
102 		return;
103 	}
104 
105 	asm("movl %%ds,%0" : "=r" (ds));
106 	asm("movl %%es,%0" : "=r" (es));
107 	asm("movl %%fs,%0" : "=r" (fsindex));
108 	asm("movl %%gs,%0" : "=r" (gsindex));
109 
110 	rdmsrl(MSR_FS_BASE, fs);
111 	rdmsrl(MSR_GS_BASE, gs);
112 	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
113 
114 	cr0 = read_cr0();
115 	cr2 = read_cr2();
116 	cr3 = __read_cr3();
117 	cr4 = __read_cr4();
118 
119 	printk("%sFS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
120 	       log_lvl, fs, fsindex, gs, gsindex, shadowgs);
121 	printk("%sCS:  %04x DS: %04x ES: %04x CR0: %016lx\n",
122 		log_lvl, regs->cs, ds, es, cr0);
123 	printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
124 		log_lvl, cr2, cr3, cr4);
125 
126 	get_debugreg(d0, 0);
127 	get_debugreg(d1, 1);
128 	get_debugreg(d2, 2);
129 	get_debugreg(d3, 3);
130 	get_debugreg(d6, 6);
131 	get_debugreg(d7, 7);
132 
133 	/* Only print out debug registers if they are in their non-default state. */
134 	if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
135 	    (d6 == DR6_RESERVED) && (d7 == 0x400))) {
136 		printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n",
137 		       log_lvl, d0, d1, d2);
138 		printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n",
139 		       log_lvl, d3, d6, d7);
140 	}
141 
142 	if (cpu_feature_enabled(X86_FEATURE_OSPKE))
143 		printk("%sPKRU: %08x\n", log_lvl, read_pkru());
144 }
145 
146 void release_thread(struct task_struct *dead_task)
147 {
148 	WARN_ON(dead_task->mm);
149 }
150 
/* Selects which of the two segment registers a helper operates on. */
enum which_selector {
	FS = 0,
	GS = 1,
};
155 
156 /*
157  * Out of line to be protected from kprobes and tracing. If this would be
158  * traced or probed than any access to a per CPU variable happens with
159  * the wrong GS.
160  *
161  * It is not used on Xen paravirt. When paravirt support is needed, it
162  * needs to be renamed with native_ prefix.
163  */
164 static noinstr unsigned long __rdgsbase_inactive(void)
165 {
166 	unsigned long gsbase;
167 
168 	lockdep_assert_irqs_disabled();
169 
170 	/*
171 	 * SWAPGS is no longer needed thus NOT allowed with FRED because
172 	 * FRED transitions ensure that an operating system can _always_
173 	 * operate with its own GS base address:
174 	 * - For events that occur in ring 3, FRED event delivery swaps
175 	 *   the GS base address with the IA32_KERNEL_GS_BASE MSR.
176 	 * - ERETU (the FRED transition that returns to ring 3) also swaps
177 	 *   the GS base address with the IA32_KERNEL_GS_BASE MSR.
178 	 *
179 	 * And the operating system can still setup the GS segment for a
180 	 * user thread without the need of loading a user thread GS with:
181 	 * - Using LKGS, available with FRED, to modify other attributes
182 	 *   of the GS segment without compromising its ability always to
183 	 *   operate with its own GS base address.
184 	 * - Accessing the GS segment base address for a user thread as
185 	 *   before using RDMSR or WRMSR on the IA32_KERNEL_GS_BASE MSR.
186 	 *
187 	 * Note, LKGS loads the GS base address into the IA32_KERNEL_GS_BASE
188 	 * MSR instead of the GS segment’s descriptor cache. As such, the
189 	 * operating system never changes its runtime GS base address.
190 	 */
191 	if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
192 	    !cpu_feature_enabled(X86_FEATURE_XENPV)) {
193 		native_swapgs();
194 		gsbase = rdgsbase();
195 		native_swapgs();
196 	} else {
197 		instrumentation_begin();
198 		rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
199 		instrumentation_end();
200 	}
201 
202 	return gsbase;
203 }
204 
205 /*
206  * Out of line to be protected from kprobes and tracing. If this would be
207  * traced or probed than any access to a per CPU variable happens with
208  * the wrong GS.
209  *
210  * It is not used on Xen paravirt. When paravirt support is needed, it
211  * needs to be renamed with native_ prefix.
212  */
213 static noinstr void __wrgsbase_inactive(unsigned long gsbase)
214 {
215 	lockdep_assert_irqs_disabled();
216 
217 	if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
218 	    !cpu_feature_enabled(X86_FEATURE_XENPV)) {
219 		native_swapgs();
220 		wrgsbase(gsbase);
221 		native_swapgs();
222 	} else {
223 		instrumentation_begin();
224 		wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
225 		instrumentation_end();
226 	}
227 }
228 
229 /*
230  * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
231  * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
232  * It's forcibly inlined because it'll generate better code and this function
233  * is hot.
234  */
235 static __always_inline void save_base_legacy(struct task_struct *prev_p,
236 					     unsigned short selector,
237 					     enum which_selector which)
238 {
239 	if (likely(selector == 0)) {
240 		/*
241 		 * On Intel (without X86_BUG_NULL_SEG), the segment base could
242 		 * be the pre-existing saved base or it could be zero.  On AMD
243 		 * (with X86_BUG_NULL_SEG), the segment base could be almost
244 		 * anything.
245 		 *
246 		 * This branch is very hot (it's hit twice on almost every
247 		 * context switch between 64-bit programs), and avoiding
248 		 * the RDMSR helps a lot, so we just assume that whatever
249 		 * value is already saved is correct.  This matches historical
250 		 * Linux behavior, so it won't break existing applications.
251 		 *
252 		 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
253 		 * report that the base is zero, it needs to actually be zero:
254 		 * see the corresponding logic in load_seg_legacy.
255 		 */
256 	} else {
257 		/*
258 		 * If the selector is 1, 2, or 3, then the base is zero on
259 		 * !X86_BUG_NULL_SEG CPUs and could be anything on
260 		 * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
261 		 * has never attempted to preserve the base across context
262 		 * switches.
263 		 *
264 		 * If selector > 3, then it refers to a real segment, and
265 		 * saving the base isn't necessary.
266 		 */
267 		if (which == FS)
268 			prev_p->thread.fsbase = 0;
269 		else
270 			prev_p->thread.gsbase = 0;
271 	}
272 }
273 
274 static __always_inline void save_fsgs(struct task_struct *task)
275 {
276 	savesegment(fs, task->thread.fsindex);
277 	savesegment(gs, task->thread.gsindex);
278 	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
279 		/*
280 		 * If FSGSBASE is enabled, we can't make any useful guesses
281 		 * about the base, and user code expects us to save the current
282 		 * value.  Fortunately, reading the base directly is efficient.
283 		 */
284 		task->thread.fsbase = rdfsbase();
285 		task->thread.gsbase = __rdgsbase_inactive();
286 	} else {
287 		save_base_legacy(task, task->thread.fsindex, FS);
288 		save_base_legacy(task, task->thread.gsindex, GS);
289 	}
290 }
291 
292 /*
293  * While a process is running,current->thread.fsbase and current->thread.gsbase
294  * may not match the corresponding CPU registers (see save_base_legacy()).
295  */
296 void current_save_fsgs(void)
297 {
298 	unsigned long flags;
299 
300 	/* Interrupts need to be off for FSGSBASE */
301 	local_irq_save(flags);
302 	save_fsgs(current);
303 	local_irq_restore(flags);
304 }
305 #if IS_ENABLED(CONFIG_KVM)
306 EXPORT_SYMBOL_GPL(current_save_fsgs);
307 #endif
308 
309 static __always_inline void loadseg(enum which_selector which,
310 				    unsigned short sel)
311 {
312 	if (which == FS)
313 		loadsegment(fs, sel);
314 	else
315 		load_gs_index(sel);
316 }
317 
318 static __always_inline void load_seg_legacy(unsigned short prev_index,
319 					    unsigned long prev_base,
320 					    unsigned short next_index,
321 					    unsigned long next_base,
322 					    enum which_selector which)
323 {
324 	if (likely(next_index <= 3)) {
325 		/*
326 		 * The next task is using 64-bit TLS, is not using this
327 		 * segment at all, or is having fun with arcane CPU features.
328 		 */
329 		if (next_base == 0) {
330 			/*
331 			 * Nasty case: on AMD CPUs, we need to forcibly zero
332 			 * the base.
333 			 */
334 			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
335 				loadseg(which, __USER_DS);
336 				loadseg(which, next_index);
337 			} else {
338 				/*
339 				 * We could try to exhaustively detect cases
340 				 * under which we can skip the segment load,
341 				 * but there's really only one case that matters
342 				 * for performance: if both the previous and
343 				 * next states are fully zeroed, we can skip
344 				 * the load.
345 				 *
346 				 * (This assumes that prev_base == 0 has no
347 				 * false positives.  This is the case on
348 				 * Intel-style CPUs.)
349 				 */
350 				if (likely(prev_index | next_index | prev_base))
351 					loadseg(which, next_index);
352 			}
353 		} else {
354 			if (prev_index != next_index)
355 				loadseg(which, next_index);
356 			wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
357 			       next_base);
358 		}
359 	} else {
360 		/*
361 		 * The next task is using a real segment.  Loading the selector
362 		 * is sufficient.
363 		 */
364 		loadseg(which, next_index);
365 	}
366 }
367 
368 /*
369  * Store prev's PKRU value and load next's PKRU value if they differ. PKRU
370  * is not XSTATE managed on context switch because that would require a
371  * lookup in the task's FPU xsave buffer and require to keep that updated
372  * in various places.
373  */
374 static __always_inline void x86_pkru_load(struct thread_struct *prev,
375 					  struct thread_struct *next)
376 {
377 	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
378 		return;
379 
380 	/* Stash the prev task's value: */
381 	prev->pkru = rdpkru();
382 
383 	/*
384 	 * PKRU writes are slightly expensive.  Avoid them when not
385 	 * strictly necessary:
386 	 */
387 	if (prev->pkru != next->pkru)
388 		wrpkru(next->pkru);
389 }
390 
391 static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
392 					      struct thread_struct *next)
393 {
394 	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
395 		/* Update the FS and GS selectors if they could have changed. */
396 		if (unlikely(prev->fsindex || next->fsindex))
397 			loadseg(FS, next->fsindex);
398 		if (unlikely(prev->gsindex || next->gsindex))
399 			loadseg(GS, next->gsindex);
400 
401 		/* Update the bases. */
402 		wrfsbase(next->fsbase);
403 		__wrgsbase_inactive(next->gsbase);
404 	} else {
405 		load_seg_legacy(prev->fsindex, prev->fsbase,
406 				next->fsindex, next->fsbase, FS);
407 		load_seg_legacy(prev->gsindex, prev->gsbase,
408 				next->gsindex, next->gsbase, GS);
409 	}
410 }
411 
412 unsigned long x86_fsgsbase_read_task(struct task_struct *task,
413 				     unsigned short selector)
414 {
415 	unsigned short idx = selector >> 3;
416 	unsigned long base;
417 
418 	if (likely((selector & SEGMENT_TI_MASK) == 0)) {
419 		if (unlikely(idx >= GDT_ENTRIES))
420 			return 0;
421 
422 		/*
423 		 * There are no user segments in the GDT with nonzero bases
424 		 * other than the TLS segments.
425 		 */
426 		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
427 			return 0;
428 
429 		idx -= GDT_ENTRY_TLS_MIN;
430 		base = get_desc_base(&task->thread.tls_array[idx]);
431 	} else {
432 #ifdef CONFIG_MODIFY_LDT_SYSCALL
433 		struct ldt_struct *ldt;
434 
435 		/*
436 		 * If performance here mattered, we could protect the LDT
437 		 * with RCU.  This is a slow path, though, so we can just
438 		 * take the mutex.
439 		 */
440 		mutex_lock(&task->mm->context.lock);
441 		ldt = task->mm->context.ldt;
442 		if (unlikely(!ldt || idx >= ldt->nr_entries))
443 			base = 0;
444 		else
445 			base = get_desc_base(ldt->entries + idx);
446 		mutex_unlock(&task->mm->context.lock);
447 #else
448 		base = 0;
449 #endif
450 	}
451 
452 	return base;
453 }
454 
455 unsigned long x86_gsbase_read_cpu_inactive(void)
456 {
457 	unsigned long gsbase;
458 
459 	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
460 		unsigned long flags;
461 
462 		local_irq_save(flags);
463 		gsbase = __rdgsbase_inactive();
464 		local_irq_restore(flags);
465 	} else {
466 		rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
467 	}
468 
469 	return gsbase;
470 }
471 
472 void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
473 {
474 	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
475 		unsigned long flags;
476 
477 		local_irq_save(flags);
478 		__wrgsbase_inactive(gsbase);
479 		local_irq_restore(flags);
480 	} else {
481 		wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
482 	}
483 }
484 
485 unsigned long x86_fsbase_read_task(struct task_struct *task)
486 {
487 	unsigned long fsbase;
488 
489 	if (task == current)
490 		fsbase = x86_fsbase_read_cpu();
491 	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
492 		 (task->thread.fsindex == 0))
493 		fsbase = task->thread.fsbase;
494 	else
495 		fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
496 
497 	return fsbase;
498 }
499 
500 unsigned long x86_gsbase_read_task(struct task_struct *task)
501 {
502 	unsigned long gsbase;
503 
504 	if (task == current)
505 		gsbase = x86_gsbase_read_cpu_inactive();
506 	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
507 		 (task->thread.gsindex == 0))
508 		gsbase = task->thread.gsbase;
509 	else
510 		gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
511 
512 	return gsbase;
513 }
514 
515 void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
516 {
517 	WARN_ON_ONCE(task == current);
518 
519 	task->thread.fsbase = fsbase;
520 }
521 
522 void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
523 {
524 	WARN_ON_ONCE(task == current);
525 
526 	task->thread.gsbase = gsbase;
527 }
528 
529 static void
530 start_thread_common(struct pt_regs *regs, unsigned long new_ip,
531 		    unsigned long new_sp,
532 		    u16 _cs, u16 _ss, u16 _ds)
533 {
534 	WARN_ON_ONCE(regs != current_pt_regs());
535 
536 	if (static_cpu_has(X86_BUG_NULL_SEG)) {
537 		/* Loading zero below won't clear the base. */
538 		loadsegment(fs, __USER_DS);
539 		load_gs_index(__USER_DS);
540 	}
541 
542 	reset_thread_features();
543 
544 	loadsegment(fs, 0);
545 	loadsegment(es, _ds);
546 	loadsegment(ds, _ds);
547 	load_gs_index(0);
548 
549 	regs->ip	= new_ip;
550 	regs->sp	= new_sp;
551 	regs->csx	= _cs;
552 	regs->ssx	= _ss;
553 	/*
554 	 * Allow single-step trap and NMI when starting a new task, thus
555 	 * once the new task enters user space, single-step trap and NMI
556 	 * are both enabled immediately.
557 	 *
558 	 * Entering a new task is logically speaking a return from a
559 	 * system call (exec, fork, clone, etc.). As such, if ptrace
560 	 * enables single stepping a single step exception should be
561 	 * allowed to trigger immediately upon entering user space.
562 	 * This is not optional.
563 	 *
564 	 * NMI should *never* be disabled in user space. As such, this
565 	 * is an optional, opportunistic way to catch errors.
566 	 *
567 	 * Paranoia: High-order 48 bits above the lowest 16 bit SS are
568 	 * discarded by the legacy IRET instruction on all Intel, AMD,
569 	 * and Cyrix/Centaur/VIA CPUs, thus can be set unconditionally,
570 	 * even when FRED is not enabled. But we choose the safer side
571 	 * to use these bits only when FRED is enabled.
572 	 */
573 	if (cpu_feature_enabled(X86_FEATURE_FRED)) {
574 		regs->fred_ss.swevent	= true;
575 		regs->fred_ss.nmi	= true;
576 	}
577 
578 	regs->flags	= X86_EFLAGS_IF | X86_EFLAGS_FIXED;
579 }
580 
581 void
582 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
583 {
584 	start_thread_common(regs, new_ip, new_sp,
585 			    __USER_CS, __USER_DS, 0);
586 }
587 EXPORT_SYMBOL_GPL(start_thread);
588 
#ifdef CONFIG_COMPAT
/*
 * Compat counterpart of start_thread().  CS is __USER_CS when @x32 is true
 * and __USER32_CS otherwise; SS, DS and ES all use __USER_DS.
 */
void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32)
{
	u16 cs = x32 ? __USER_CS : __USER32_CS;

	start_thread_common(regs, new_ip, new_sp, cs, __USER_DS, __USER_DS);
}
#endif
597 
/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * Switches the architecture-specific thread state visible in this function
 * from @prev_p to @next_p: FS/GS, TLS, DS/ES, PKRU, the per-CPU current
 * task and top-of-stack, and a few helpers for the rest.  Returns @prev_p.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer not supported too.
 */
__no_kmsan_checks
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	struct fpu *prev_fpu = &prev->fpu;
	int cpu = smp_processor_id();

	/*
	 * With CONFIG_DEBUG_ENTRY, warn once if this CPU's hardirq stack is
	 * marked as in use while switching tasks.
	 */
	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
		     this_cpu_read(pcpu_hot.hardirq_stack_inuse));

	/* Skipped when TIF_NEED_FPU_LOAD is already set for the current task. */
	if (!test_thread_flag(TIF_NEED_FPU_LOAD))
		switch_fpu_prepare(prev_fpu, cpu);

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	save_fsgs(prev_p);

	/*
	 * Load TLS before restoring any segments so that segment loads
	 * reference the correct GDT entries.
	 */
	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.  This
	 * must be done after loading TLS entries in the GDT but before
	 * loading segments that might reference them.
	 */
	arch_end_context_switch(next_p);

	/* Switch DS and ES.
	 *
	 * Reading them only returns the selectors, but writing them (if
	 * nonzero) loads the full descriptor from the GDT or LDT.  The
	 * LDT for next is loaded in switch_mm, and the GDT is loaded
	 * above.
	 *
	 * We therefore need to write new values to the segment
	 * registers on every context switch unless both the new and old
	 * values are zero.
	 *
	 * Note that we don't need to do anything for CS and SS, as
	 * those are saved and restored as part of pt_regs.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/* Install next's FS/GS selectors and bases. */
	x86_fsgsbase_load(prev, next);

	/* Save prev's PKRU and load next's if it differs. */
	x86_pkru_load(prev, next);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	raw_cpu_write(pcpu_hot.current_task, next_p);
	raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p));

	switch_fpu_finish();

	/* Reload sp0. */
	update_task_stack(next_p);

	switch_to_extra(prev_p, next_p);

	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
		/*
		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
		 * does not update the cached descriptor.  As a result, if we
		 * do SYSRET while SS is NULL, we'll end up in user mode with
		 * SS apparently equal to __USER_DS but actually unusable.
		 *
		 * The straightforward workaround would be to fix it up just
		 * before SYSRET, but that would slow down the system call
		 * fast paths.  Instead, we ensure that SS is never NULL in
		 * system call context.  We do this by replacing NULL SS
		 * selectors at every context switch.  SYSCALL sets up a valid
		 * SS, so the only way to get NULL is to re-enter the kernel
		 * from CPL 3 through an interrupt.  Since that can't happen
		 * in the same task as a running syscall, we are guaranteed to
		 * context switch between every interrupt vector entry and a
		 * subsequent SYSRET.
		 *
		 * We read SS first because SS reads are much faster than
		 * writes.  Out of caution, we force SS to __KERNEL_DS even if
		 * it previously had a different non-NULL value.
		 */
		unsigned short ss_sel;
		savesegment(ss, ss_sel);
		if (ss_sel != __KERNEL_DS)
			loadsegment(ss, __KERNEL_DS);
	}

	/* Load the Intel cache allocation PQR MSR. */
	resctrl_sched_in(next_p);

	return prev_p;
}
715 
716 void set_personality_64bit(void)
717 {
718 	/* inherit personality from parent */
719 
720 	/* Make sure to be in 64bit mode */
721 	clear_thread_flag(TIF_ADDR32);
722 	/* Pretend that this comes from a 64bit execve */
723 	task_pt_regs(current)->orig_ax = __NR_execve;
724 	current_thread_info()->status &= ~TS_COMPAT;
725 	if (current->mm)
726 		__set_bit(MM_CONTEXT_HAS_VSYSCALL, &current->mm->context.flags);
727 
728 	/* TBD: overwrites user setup. Should have two bits.
729 	   But 64bit processes have always behaved this way,
730 	   so it's not too bad. The main problem is just that
731 	   32bit children are affected again. */
732 	current->personality &= ~READ_IMPLIES_EXEC;
733 }
734 
/*
 * x32 half of set_personality_ia32().  Compiles to an empty function
 * unless CONFIG_X86_X32_ABI is set.
 */
static void __set_personality_x32(void)
{
#ifdef CONFIG_X86_X32_ABI
	struct mm_struct *mm = current->mm;

	/* Clear all mm context flags. */
	if (mm)
		mm->context.flags = 0;

	current->personality &= ~READ_IMPLIES_EXEC;

	/*
	 * in_32bit_syscall() uses the presence of the x32 syscall bit
	 * flag to determine compat status.  The x86 mmap() code relies on
	 * the syscall bitness so set x32 syscall bit right here to make
	 * in_32bit_syscall() work during exec().
	 *
	 * Pretend to come from a x32 execve.
	 */
	task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
	current_thread_info()->status &= ~TS_COMPAT;
#endif
}
754 
/*
 * ia32 half of set_personality_ia32().  Compiles to an empty function
 * unless CONFIG_IA32_EMULATION is set.
 */
static void __set_personality_ia32(void)
{
#ifdef CONFIG_IA32_EMULATION
	struct mm_struct *mm = current->mm;

	/*
	 * uprobes applied to this MM need to know this and
	 * cannot use user_64bit_mode() at that time.
	 */
	if (mm)
		__set_bit(MM_CONTEXT_UPROBE_IA32, &mm->context.flags);

	current->personality |= force_personality32;

	/* Prepare the first "return" to user space */
	task_pt_regs(current)->orig_ax = __NR_ia32_execve;
	current_thread_info()->status |= TS_COMPAT;
#endif
}
772 
773 void set_personality_ia32(bool x32)
774 {
775 	/* Make sure to be in 32bit mode */
776 	set_thread_flag(TIF_ADDR32);
777 
778 	if (x32)
779 		__set_personality_x32();
780 	else
781 		__set_personality_ia32();
782 }
783 EXPORT_SYMBOL_GPL(set_personality_ia32);
784 
#ifdef CONFIG_CHECKPOINT_RESTORE
/*
 * Map @image at @addr via map_vdso_once().  Returns the image size on
 * success, or the nonzero value map_vdso_once() returned on failure.
 */
static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
{
	int err = map_vdso_once(image, addr);

	return err ? err : (long)image->size;
}
#endif
797 
#ifdef CONFIG_ADDRESS_MASKING

#define LAM_U57_BITS 6

/*
 * Enable LAM_U57 address tagging on @mm, which must be the caller's own mm,
 * and lock the LAM configuration afterwards.
 *
 * @nr_bits is the number of tag bits requested; 1..LAM_U57_BITS is accepted.
 *
 * Returns 0 on success, or:
 *   -ENODEV  X86_FEATURE_LAM is not enabled
 *   -EINVAL  @mm is not current->mm, @mm has a valid PASID without
 *            MM_CONTEXT_FORCE_TAGGED_SVA, or @nr_bits is out of range
 *   -EINTR   taking the mmap write lock was interrupted
 *   -EBUSY   the LAM configuration is already locked
 */
static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
{
	int ret = 0;

	if (!cpu_feature_enabled(X86_FEATURE_LAM))
		return -ENODEV;

	/* PTRACE_ARCH_PRCTL */
	if (current->mm != mm)
		return -EINVAL;

	if (mm_valid_pasid(mm) &&
	    !test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags))
		return -EINVAL;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) {
		ret = -EBUSY;
		goto out_unlock;
	}

	/* Zero bits, or more bits than LAM_U57 offers, cannot be satisfied. */
	if (!nr_bits || nr_bits > LAM_U57_BITS) {
		ret = -EINVAL;
		goto out_unlock;
	}

	mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
	mm->context.untag_mask =  ~GENMASK(62, 57);

	write_cr3(__read_cr3() | mm->context.lam_cr3_mask);
	set_tlbstate_lam_mode(mm);
	set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);

out_unlock:
	mmap_write_unlock(mm);

	return ret;
}
#endif
843 
844 long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
845 {
846 	int ret = 0;
847 
848 	switch (option) {
849 	case ARCH_SET_GS: {
850 		if (unlikely(arg2 >= TASK_SIZE_MAX))
851 			return -EPERM;
852 
853 		preempt_disable();
854 		/*
855 		 * ARCH_SET_GS has always overwritten the index
856 		 * and the base. Zero is the most sensible value
857 		 * to put in the index, and is the only value that
858 		 * makes any sense if FSGSBASE is unavailable.
859 		 */
860 		if (task == current) {
861 			loadseg(GS, 0);
862 			x86_gsbase_write_cpu_inactive(arg2);
863 
864 			/*
865 			 * On non-FSGSBASE systems, save_base_legacy() expects
866 			 * that we also fill in thread.gsbase.
867 			 */
868 			task->thread.gsbase = arg2;
869 
870 		} else {
871 			task->thread.gsindex = 0;
872 			x86_gsbase_write_task(task, arg2);
873 		}
874 		preempt_enable();
875 		break;
876 	}
877 	case ARCH_SET_FS: {
878 		/*
879 		 * Not strictly needed for %fs, but do it for symmetry
880 		 * with %gs
881 		 */
882 		if (unlikely(arg2 >= TASK_SIZE_MAX))
883 			return -EPERM;
884 
885 		preempt_disable();
886 		/*
887 		 * Set the selector to 0 for the same reason
888 		 * as %gs above.
889 		 */
890 		if (task == current) {
891 			loadseg(FS, 0);
892 			x86_fsbase_write_cpu(arg2);
893 
894 			/*
895 			 * On non-FSGSBASE systems, save_base_legacy() expects
896 			 * that we also fill in thread.fsbase.
897 			 */
898 			task->thread.fsbase = arg2;
899 		} else {
900 			task->thread.fsindex = 0;
901 			x86_fsbase_write_task(task, arg2);
902 		}
903 		preempt_enable();
904 		break;
905 	}
906 	case ARCH_GET_FS: {
907 		unsigned long base = x86_fsbase_read_task(task);
908 
909 		ret = put_user(base, (unsigned long __user *)arg2);
910 		break;
911 	}
912 	case ARCH_GET_GS: {
913 		unsigned long base = x86_gsbase_read_task(task);
914 
915 		ret = put_user(base, (unsigned long __user *)arg2);
916 		break;
917 	}
918 
919 #ifdef CONFIG_CHECKPOINT_RESTORE
920 # ifdef CONFIG_X86_X32_ABI
921 	case ARCH_MAP_VDSO_X32:
922 		return prctl_map_vdso(&vdso_image_x32, arg2);
923 # endif
924 # if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
925 	case ARCH_MAP_VDSO_32:
926 		return prctl_map_vdso(&vdso_image_32, arg2);
927 # endif
928 	case ARCH_MAP_VDSO_64:
929 		return prctl_map_vdso(&vdso_image_64, arg2);
930 #endif
931 #ifdef CONFIG_ADDRESS_MASKING
932 	case ARCH_GET_UNTAG_MASK:
933 		return put_user(task->mm->context.untag_mask,
934 				(unsigned long __user *)arg2);
935 	case ARCH_ENABLE_TAGGED_ADDR:
936 		return prctl_enable_tagged_addr(task->mm, arg2);
937 	case ARCH_FORCE_TAGGED_SVA:
938 		if (current != task)
939 			return -EINVAL;
940 		set_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &task->mm->context.flags);
941 		return 0;
942 	case ARCH_GET_MAX_TAG_BITS:
943 		if (!cpu_feature_enabled(X86_FEATURE_LAM))
944 			return put_user(0, (unsigned long __user *)arg2);
945 		else
946 			return put_user(LAM_U57_BITS, (unsigned long __user *)arg2);
947 #endif
948 	case ARCH_SHSTK_ENABLE:
949 	case ARCH_SHSTK_DISABLE:
950 	case ARCH_SHSTK_LOCK:
951 	case ARCH_SHSTK_UNLOCK:
952 	case ARCH_SHSTK_STATUS:
953 		return shstk_prctl(task, option, arg2);
954 	default:
955 		ret = -EINVAL;
956 		break;
957 	}
958 
959 	return ret;
960 }
961 
962 SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
963 {
964 	long ret;
965 
966 	ret = do_arch_prctl_64(current, option, arg2);
967 	if (ret == -EINVAL)
968 		ret = do_arch_prctl_common(option, arg2);
969 
970 	return ret;
971 }
972 
#ifdef CONFIG_IA32_EMULATION
/* Compat arch_prctl() system call: only the common options are handled. */
COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
	long ret = do_arch_prctl_common(option, arg2);

	return ret;
}
#endif
979 
980 unsigned long KSTK_ESP(struct task_struct *task)
981 {
982 	return task_pt_regs(task)->sp;
983 }
984