xref: /linux/arch/x86/kernel/process_64.c (revision 7fc2cd2e4b398c57c9cf961cfea05eadbf34c05c)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  Copyright (C) 1995  Linus Torvalds
4  *
5  *  Pentium III FXSR, SSE support
6  *	Gareth Hughes <gareth@valinux.com>, May 2000
7  *
8  *  X86-64 port
9  *	Andi Kleen.
10  *
11  *	CPU hotplug support - ashok.raj@intel.com
12  */
13 
14 /*
15  * This file handles the architecture-dependent parts of process handling.
16  */
17 
18 #include <linux/cpu.h>
19 #include <linux/errno.h>
20 #include <linux/sched.h>
21 #include <linux/sched/task.h>
22 #include <linux/sched/task_stack.h>
23 #include <linux/fs.h>
24 #include <linux/kernel.h>
25 #include <linux/mm.h>
26 #include <linux/elfcore.h>
27 #include <linux/smp.h>
28 #include <linux/slab.h>
29 #include <linux/user.h>
30 #include <linux/interrupt.h>
31 #include <linux/delay.h>
32 #include <linux/export.h>
33 #include <linux/kvm_types.h>
34 #include <linux/ptrace.h>
35 #include <linux/notifier.h>
36 #include <linux/kprobes.h>
37 #include <linux/kdebug.h>
38 #include <linux/prctl.h>
39 #include <linux/uaccess.h>
40 #include <linux/io.h>
41 #include <linux/ftrace.h>
42 #include <linux/syscalls.h>
43 #include <linux/iommu.h>
44 
45 #include <asm/processor.h>
46 #include <asm/pkru.h>
47 #include <asm/fpu/sched.h>
48 #include <asm/mmu_context.h>
49 #include <asm/prctl.h>
50 #include <asm/desc.h>
51 #include <asm/proto.h>
52 #include <asm/ia32.h>
53 #include <asm/debugreg.h>
54 #include <asm/switch_to.h>
55 #include <asm/xen/hypervisor.h>
56 #include <asm/vdso.h>
57 #include <asm/resctrl.h>
58 #include <asm/unistd.h>
59 #include <asm/fsgsbase.h>
60 #include <asm/fred.h>
61 #include <asm/msr.h>
62 #ifdef CONFIG_IA32_EMULATION
63 /* Not included via unistd.h */
64 #include <asm/unistd_32_ia32.h>
65 #endif
66 
67 #include "process.h"
68 
69 /* Also prints some state that isn't saved in the pt_regs */
70 void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
71 		 const char *log_lvl)
72 {
73 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
74 	unsigned long d0, d1, d2, d3, d6, d7;
75 	unsigned int fsindex, gsindex;
76 	unsigned int ds, es;
77 
78 	show_iret_regs(regs, log_lvl);
79 
80 	if (regs->orig_ax != -1)
81 		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
82 	else
83 		pr_cont("\n");
84 
85 	printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n",
86 	       log_lvl, regs->ax, regs->bx, regs->cx);
87 	printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n",
88 	       log_lvl, regs->dx, regs->si, regs->di);
89 	printk("%sRBP: %016lx R08: %016lx R09: %016lx\n",
90 	       log_lvl, regs->bp, regs->r8, regs->r9);
91 	printk("%sR10: %016lx R11: %016lx R12: %016lx\n",
92 	       log_lvl, regs->r10, regs->r11, regs->r12);
93 	printk("%sR13: %016lx R14: %016lx R15: %016lx\n",
94 	       log_lvl, regs->r13, regs->r14, regs->r15);
95 
96 	if (mode == SHOW_REGS_SHORT)
97 		return;
98 
99 	if (mode == SHOW_REGS_USER) {
100 		rdmsrq(MSR_FS_BASE, fs);
101 		rdmsrq(MSR_KERNEL_GS_BASE, shadowgs);
102 		printk("%sFS:  %016lx GS:  %016lx\n",
103 		       log_lvl, fs, shadowgs);
104 		return;
105 	}
106 
107 	asm("movl %%ds,%0" : "=r" (ds));
108 	asm("movl %%es,%0" : "=r" (es));
109 	asm("movl %%fs,%0" : "=r" (fsindex));
110 	asm("movl %%gs,%0" : "=r" (gsindex));
111 
112 	rdmsrq(MSR_FS_BASE, fs);
113 	rdmsrq(MSR_GS_BASE, gs);
114 	rdmsrq(MSR_KERNEL_GS_BASE, shadowgs);
115 
116 	cr0 = read_cr0();
117 	cr2 = read_cr2();
118 	cr3 = __read_cr3();
119 	cr4 = __read_cr4();
120 
121 	printk("%sFS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
122 	       log_lvl, fs, fsindex, gs, gsindex, shadowgs);
123 	printk("%sCS:  %04x DS: %04x ES: %04x CR0: %016lx\n",
124 		log_lvl, regs->cs, ds, es, cr0);
125 	printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
126 		log_lvl, cr2, cr3, cr4);
127 
128 	get_debugreg(d0, 0);
129 	get_debugreg(d1, 1);
130 	get_debugreg(d2, 2);
131 	get_debugreg(d3, 3);
132 	get_debugreg(d6, 6);
133 	get_debugreg(d7, 7);
134 
135 	/* Only print out debug registers if they are in their non-default state. */
136 	if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
137 	    (d6 == DR6_RESERVED) && (d7 == DR7_FIXED_1))) {
138 		printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n",
139 		       log_lvl, d0, d1, d2);
140 		printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n",
141 		       log_lvl, d3, d6, d7);
142 	}
143 
144 	if (cr4 & X86_CR4_PKE)
145 		printk("%sPKRU: %08x\n", log_lvl, read_pkru());
146 }
147 
148 void release_thread(struct task_struct *dead_task)
149 {
150 	WARN_ON(dead_task->mm);
151 }
152 
153 enum which_selector {
154 	FS,
155 	GS
156 };
157 
158 /*
159  * Out of line to be protected from kprobes and tracing. If this were
160  * traced or probed, then any access to a per-CPU variable would happen
161  * with the wrong GS.
162  *
163  * It is not used on Xen paravirt. If paravirt support becomes necessary,
164  * it will need to be renamed with a native_ prefix.
165  */
166 static noinstr unsigned long __rdgsbase_inactive(void)
167 {
168 	unsigned long gsbase;
169 
170 	lockdep_assert_irqs_disabled();
171 
172 	/*
173 	 * SWAPGS is no longer needed and thus NOT allowed with FRED because
174 	 * FRED transitions ensure that an operating system can _always_
175 	 * operate with its own GS base address:
176 	 * - For events that occur in ring 3, FRED event delivery swaps
177 	 *   the GS base address with the IA32_KERNEL_GS_BASE MSR.
178 	 * - ERETU (the FRED transition that returns to ring 3) also swaps
179 	 *   the GS base address with the IA32_KERNEL_GS_BASE MSR.
180 	 *
181 	 * And the operating system can still set up the GS segment for a
182 	 * user thread without needing to load the user thread's GS, by:
183 	 * - Using LKGS, available with FRED, to modify other attributes
184 	 *   of the GS segment without compromising its ability to always
185 	 *   operate with its own GS base address.
186 	 * - Accessing the GS segment base address for a user thread, as
187 	 *   before, using RDMSR or WRMSR on the IA32_KERNEL_GS_BASE MSR.
188 	 *
189 	 * Note, LKGS loads the GS base address into the IA32_KERNEL_GS_BASE
190 	 * MSR instead of the GS segment's descriptor cache. As such, the
191 	 * operating system never changes its runtime GS base address.
192 	 */
193 	if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
194 	    !cpu_feature_enabled(X86_FEATURE_XENPV)) {
195 		native_swapgs();
196 		gsbase = rdgsbase();
197 		native_swapgs();
198 	} else {
199 		instrumentation_begin();
200 		rdmsrq(MSR_KERNEL_GS_BASE, gsbase);
201 		instrumentation_end();
202 	}
203 
204 	return gsbase;
205 }
206 
207 /*
208  * Out of line to be protected from kprobes and tracing. If this were
209  * traced or probed, then any access to a per-CPU variable would happen
210  * with the wrong GS.
211  *
212  * It is not used on Xen paravirt. If paravirt support becomes necessary,
213  * it will need to be renamed with a native_ prefix.
214  */
215 static noinstr void __wrgsbase_inactive(unsigned long gsbase)
216 {
217 	lockdep_assert_irqs_disabled();
218 
219 	if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
220 	    !cpu_feature_enabled(X86_FEATURE_XENPV)) {
221 		native_swapgs();
222 		wrgsbase(gsbase);
223 		native_swapgs();
224 	} else {
225 		instrumentation_begin();
226 		wrmsrq(MSR_KERNEL_GS_BASE, gsbase);
227 		instrumentation_end();
228 	}
229 }
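/*
 * To summarize the state juggled by the two helpers above: while in the
 * kernel, MSR_GS_BASE holds the kernel's per-CPU base and
 * MSR_KERNEL_GS_BASE holds the user ("inactive") GS base; SWAPGS exchanges
 * the two.  The helpers access the inactive slot either by briefly swapping
 * it in (native, non-FRED) or by reading/writing MSR_KERNEL_GS_BASE
 * directly (FRED or Xen PV).
 */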
230 
231 /*
232  * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
233  * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
234  * It's forcibly inlined because it'll generate better code and this function
235  * is hot.
236  */
237 static __always_inline void save_base_legacy(struct task_struct *prev_p,
238 					     unsigned short selector,
239 					     enum which_selector which)
240 {
241 	if (likely(selector == 0)) {
242 		/*
243 		 * On Intel (without X86_BUG_NULL_SEG), the segment base could
244 		 * be the pre-existing saved base or it could be zero.  On AMD
245 		 * (with X86_BUG_NULL_SEG), the segment base could be almost
246 		 * anything.
247 		 *
248 		 * This branch is very hot (it's hit twice on almost every
249 		 * context switch between 64-bit programs), and avoiding
250 		 * the RDMSR helps a lot, so we just assume that whatever
251 		 * value is already saved is correct.  This matches historical
252 		 * Linux behavior, so it won't break existing applications.
253 		 *
254 		 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
255 		 * report that the base is zero, it needs to actually be zero:
256 		 * see the corresponding logic in load_seg_legacy.
257 		 */
258 	} else {
259 		/*
260 		 * If the selector is 1, 2, or 3, then the base is zero on
261 		 * !X86_BUG_NULL_SEG CPUs and could be anything on
262 		 * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
263 		 * has never attempted to preserve the base across context
264 		 * switches.
265 		 *
266 		 * If selector > 3, then it refers to a real segment, and
267 		 * saving the base isn't necessary.
268 		 */
269 		if (which == FS)
270 			prev_p->thread.fsbase = 0;
271 		else
272 			prev_p->thread.gsbase = 0;
273 	}
274 }
275 
276 static __always_inline void save_fsgs(struct task_struct *task)
277 {
278 	savesegment(fs, task->thread.fsindex);
279 	savesegment(gs, task->thread.gsindex);
280 	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
281 		/*
282 		 * If FSGSBASE is enabled, we can't make any useful guesses
283 		 * about the base, and user code expects us to save the current
284 		 * value.  Fortunately, reading the base directly is efficient.
285 		 */
286 		task->thread.fsbase = rdfsbase();
287 		task->thread.gsbase = __rdgsbase_inactive();
288 	} else {
289 		save_base_legacy(task, task->thread.fsindex, FS);
290 		save_base_legacy(task, task->thread.gsindex, GS);
291 	}
292 }
293 
294 /*
295  * While a process is running, current->thread.fsbase and current->thread.gsbase
296  * may not match the corresponding CPU registers (see save_base_legacy()).
297  */
298 void current_save_fsgs(void)
299 {
300 	unsigned long flags;
301 
302 	/* Interrupts need to be off for FSGSBASE */
303 	local_irq_save(flags);
304 	save_fsgs(current);
305 	local_irq_restore(flags);
306 }
307 EXPORT_SYMBOL_FOR_KVM(current_save_fsgs);
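/*
 * For reference: with X86_FEATURE_FSGSBASE enabled, the same bases that are
 * saved above can be read and written directly from ring 3.  A minimal
 * userspace sketch, assuming a toolchain that provides the FSGSBASE
 * intrinsics (build with -mfsgsbase):
 *
 *	#include <immintrin.h>
 *	#include <sys/auxv.h>
 *	#include <asm/hwcap2.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		if (!(getauxval(AT_HWCAP2) & HWCAP2_FSGSBASE))
 *			return 1;	// kernel did not enable CR4.FSGSBASE
 *
 *		unsigned long long fs = _readfsbase_u64();	// TLS base set up by libc
 *		_writegsbase_u64(fs);	// GS base is otherwise unused by 64-bit libc
 *		printf("fsbase=%#llx gsbase=%#llx\n", fs, _readgsbase_u64());
 *		return 0;
 *	}
 */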
308 
309 static __always_inline void loadseg(enum which_selector which,
310 				    unsigned short sel)
311 {
312 	if (which == FS)
313 		loadsegment(fs, sel);
314 	else
315 		load_gs_index(sel);
316 }
317 
318 static __always_inline void load_seg_legacy(unsigned short prev_index,
319 					    unsigned long prev_base,
320 					    unsigned short next_index,
321 					    unsigned long next_base,
322 					    enum which_selector which)
323 {
324 	if (likely(next_index <= 3)) {
325 		/*
326 		 * The next task is using 64-bit TLS, is not using this
327 		 * segment at all, or is having fun with arcane CPU features.
328 		 */
329 		if (next_base == 0) {
330 			/*
331 			 * Nasty case: on AMD CPUs, we need to forcibly zero
332 			 * the base.
333 			 */
334 			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
335 				loadseg(which, __USER_DS);
336 				loadseg(which, next_index);
337 			} else {
338 				/*
339 				 * We could try to exhaustively detect cases
340 				 * under which we can skip the segment load,
341 				 * but there's really only one case that matters
342 				 * for performance: if both the previous and
343 				 * next states are fully zeroed, we can skip
344 				 * the load.
345 				 *
346 				 * (This assumes that prev_base == 0 has no
347 				 * false positives.  This is the case on
348 				 * Intel-style CPUs.)
349 				 */
350 				if (likely(prev_index | next_index | prev_base))
351 					loadseg(which, next_index);
352 			}
353 		} else {
354 			if (prev_index != next_index)
355 				loadseg(which, next_index);
356 			wrmsrq(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
357 			       next_base);
358 		}
359 	} else {
360 		/*
361 		 * The next task is using a real segment.  Loading the selector
362 		 * is sufficient.
363 		 */
364 		loadseg(which, next_index);
365 	}
366 }
367 
368 /*
369  * Store prev's PKRU value and load next's PKRU value if they differ. PKRU
370  * is not XSTATE managed on context switch because that would require a
371  * lookup in the task's FPU xsave buffer and require to keep that updated
372  * in various places.
373  */
374 static __always_inline void x86_pkru_load(struct thread_struct *prev,
375 					  struct thread_struct *next)
376 {
377 	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
378 		return;
379 
380 	/* Stash the prev task's value: */
381 	prev->pkru = rdpkru();
382 
383 	/*
384 	 * PKRU writes are slightly expensive.  Avoid them when not
385 	 * strictly necessary:
386 	 */
387 	if (prev->pkru != next->pkru)
388 		wrpkru(next->pkru);
389 }
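/*
 * The PKRU value switched above is the same register userspace manages via
 * the memory protection key API.  A minimal sketch, assuming a glibc with
 * the pkey_* wrappers (2.27+) and a CPU/kernel with OSPKE:
 *
 *	#define _GNU_SOURCE
 *	#include <sys/mman.h>
 *	#include <stddef.h>
 *
 *	int main(void)
 *	{
 *		int pkey = pkey_alloc(0, 0);
 *		if (pkey < 0)
 *			return 1;	// protection keys not available
 *
 *		char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *		pkey_mprotect(p, 4096, PROT_READ | PROT_WRITE, pkey);
 *		pkey_set(pkey, PKEY_DISABLE_WRITE);	// executes WRPKRU
 *		p[0] = 1;	// faults with SIGSEGV/SEGV_PKUERR
 *		return 0;
 *	}
 */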
390 
391 static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
392 					      struct thread_struct *next)
393 {
394 	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
395 		/* Update the FS and GS selectors if they could have changed. */
396 		if (unlikely(prev->fsindex || next->fsindex))
397 			loadseg(FS, next->fsindex);
398 		if (unlikely(prev->gsindex || next->gsindex))
399 			loadseg(GS, next->gsindex);
400 
401 		/* Update the bases. */
402 		wrfsbase(next->fsbase);
403 		__wrgsbase_inactive(next->gsbase);
404 	} else {
405 		load_seg_legacy(prev->fsindex, prev->fsbase,
406 				next->fsindex, next->fsbase, FS);
407 		load_seg_legacy(prev->gsindex, prev->gsbase,
408 				next->gsindex, next->gsbase, GS);
409 	}
410 }
411 
412 unsigned long x86_fsgsbase_read_task(struct task_struct *task,
413 				     unsigned short selector)
414 {
415 	unsigned short idx = selector >> 3;
416 	unsigned long base;
417 
418 	if (likely((selector & SEGMENT_TI_MASK) == 0)) {
419 		if (unlikely(idx >= GDT_ENTRIES))
420 			return 0;
421 
422 		/*
423 		 * There are no user segments in the GDT with nonzero bases
424 		 * other than the TLS segments.
425 		 */
426 		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
427 			return 0;
428 
429 		idx -= GDT_ENTRY_TLS_MIN;
430 		base = get_desc_base(&task->thread.tls_array[idx]);
431 	} else {
432 #ifdef CONFIG_MODIFY_LDT_SYSCALL
433 		struct ldt_struct *ldt;
434 
435 		/*
436 		 * If performance here mattered, we could protect the LDT
437 		 * with RCU.  This is a slow path, though, so we can just
438 		 * take the mutex.
439 		 */
440 		mutex_lock(&task->mm->context.lock);
441 		ldt = task->mm->context.ldt;
442 		if (unlikely(!ldt || idx >= ldt->nr_entries))
443 			base = 0;
444 		else
445 			base = get_desc_base(ldt->entries + idx);
446 		mutex_unlock(&task->mm->context.lock);
447 #else
448 		base = 0;
449 #endif
450 	}
451 
452 	return base;
453 }
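/*
 * A worked example of the decoding above: a 32-bit task's TLS selector is
 * typically 0x63, i.e. index 12 (0x63 >> 3), TI = 0 (GDT) and RPL = 3.
 * Index 12 is GDT_ENTRY_TLS_MIN, so the base comes from tls_array[0].  A
 * selector with TI = 1 instead indexes the per-mm LDT installed via
 * modify_ldt(2).
 */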
454 
455 unsigned long x86_gsbase_read_cpu_inactive(void)
456 {
457 	unsigned long gsbase;
458 
459 	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
460 		unsigned long flags;
461 
462 		local_irq_save(flags);
463 		gsbase = __rdgsbase_inactive();
464 		local_irq_restore(flags);
465 	} else {
466 		rdmsrq(MSR_KERNEL_GS_BASE, gsbase);
467 	}
468 
469 	return gsbase;
470 }
471 
472 void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
473 {
474 	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
475 		unsigned long flags;
476 
477 		local_irq_save(flags);
478 		__wrgsbase_inactive(gsbase);
479 		local_irq_restore(flags);
480 	} else {
481 		wrmsrq(MSR_KERNEL_GS_BASE, gsbase);
482 	}
483 }
484 
485 unsigned long x86_fsbase_read_task(struct task_struct *task)
486 {
487 	unsigned long fsbase;
488 
489 	if (task == current)
490 		fsbase = x86_fsbase_read_cpu();
491 	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
492 		 (task->thread.fsindex == 0))
493 		fsbase = task->thread.fsbase;
494 	else
495 		fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
496 
497 	return fsbase;
498 }
499 
500 unsigned long x86_gsbase_read_task(struct task_struct *task)
501 {
502 	unsigned long gsbase;
503 
504 	if (task == current)
505 		gsbase = x86_gsbase_read_cpu_inactive();
506 	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
507 		 (task->thread.gsindex == 0))
508 		gsbase = task->thread.gsbase;
509 	else
510 		gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
511 
512 	return gsbase;
513 }
514 
515 void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
516 {
517 	WARN_ON_ONCE(task == current);
518 
519 	task->thread.fsbase = fsbase;
520 }
521 
522 void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
523 {
524 	WARN_ON_ONCE(task == current);
525 
526 	task->thread.gsbase = gsbase;
527 }
528 
529 static void
530 start_thread_common(struct pt_regs *regs, unsigned long new_ip,
531 		    unsigned long new_sp,
532 		    u16 _cs, u16 _ss, u16 _ds)
533 {
534 	WARN_ON_ONCE(regs != current_pt_regs());
535 
536 	if (static_cpu_has(X86_BUG_NULL_SEG)) {
537 		/* Loading zero below won't clear the base. */
538 		loadsegment(fs, __USER_DS);
539 		load_gs_index(__USER_DS);
540 	}
541 
542 	reset_thread_features();
543 
544 	loadsegment(fs, 0);
545 	loadsegment(es, _ds);
546 	loadsegment(ds, _ds);
547 	load_gs_index(0);
548 
549 	regs->ip	= new_ip;
550 	regs->sp	= new_sp;
551 	regs->csx	= _cs;
552 	regs->ssx	= _ss;
553 	/*
554 	 * Allow single-step trap and NMI when starting a new task, so that
555 	 * once the new task enters user space, single-step trap and NMI
556 	 * are both enabled immediately.
557 	 *
558 	 * Entering a new task is logically speaking a return from a
559 	 * system call (exec, fork, clone, etc.). As such, if ptrace
560 	 * enables single stepping, a single-step exception should be
561 	 * allowed to trigger immediately upon entering user space.
562 	 * This is not optional.
563 	 *
564 	 * NMI should *never* be disabled in user space. As such, this
565 	 * is an optional, opportunistic way to catch errors.
566 	 *
567 	 * Paranoia: High-order 48 bits above the lowest 16 bit SS are
568 	 * discarded by the legacy IRET instruction on all Intel, AMD,
569 	 * and Cyrix/Centaur/VIA CPUs, thus can be set unconditionally,
570 	 * even when FRED is not enabled. But we err on the safe side and
571 	 * use these bits only when FRED is enabled.
572 	 */
573 	if (cpu_feature_enabled(X86_FEATURE_FRED)) {
574 		regs->fred_ss.swevent	= true;
575 		regs->fred_ss.nmi	= true;
576 	}
577 
578 	regs->flags	= X86_EFLAGS_IF | X86_EFLAGS_FIXED;
579 }
580 
581 void
582 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
583 {
584 	start_thread_common(regs, new_ip, new_sp,
585 			    __USER_CS, __USER_DS, 0);
586 }
587 EXPORT_SYMBOL_GPL(start_thread);
588 
589 #ifdef CONFIG_COMPAT
590 void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32)
591 {
592 	start_thread_common(regs, new_ip, new_sp,
593 			    x32 ? __USER_CS : __USER32_CS,
594 			    __USER_DS, __USER_DS);
595 }
596 #endif
597 
598 /*
599  *	switch_to(x,y) should switch tasks from x to y.
600  *
601  * This could still be optimized:
602  * - fold all the options into a flag word and test it with a single test.
603  * - could test fs/gs bitsliced
604  *
605  * Kprobes not supported here. Set the probe on schedule instead.
606  * Function graph tracer is not supported either.
607  */
608 __no_kmsan_checks
609 __visible __notrace_funcgraph struct task_struct *
610 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
611 {
612 	struct thread_struct *prev = &prev_p->thread;
613 	struct thread_struct *next = &next_p->thread;
614 	int cpu = smp_processor_id();
615 
616 	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
617 		     this_cpu_read(hardirq_stack_inuse));
618 
619 	switch_fpu(prev_p, cpu);
620 
621 	/* We must save %fs and %gs before load_TLS() because
622 	 * %fs and %gs may be cleared by load_TLS().
623 	 *
624 	 * (e.g. xen_load_tls())
625 	 */
626 	save_fsgs(prev_p);
627 
628 	/*
629 	 * Load TLS before restoring any segments so that segment loads
630 	 * reference the correct GDT entries.
631 	 */
632 	load_TLS(next, cpu);
633 
634 	/*
635 	 * Leave lazy mode, flushing any hypercalls made here.  This
636 	 * must be done after loading TLS entries in the GDT but before
637 	 * loading segments that might reference them.
638 	 */
639 	arch_end_context_switch(next_p);
640 
641 	/* Switch DS and ES.
642 	 *
643 	 * Reading them only returns the selectors, but writing them (if
644 	 * nonzero) loads the full descriptor from the GDT or LDT.  The
645 	 * LDT for next is loaded in switch_mm, and the GDT is loaded
646 	 * above.
647 	 *
648 	 * We therefore need to write new values to the segment
649 	 * registers on every context switch unless both the new and old
650 	 * values are zero.
651 	 *
652 	 * Note that we don't need to do anything for CS and SS, as
653 	 * those are saved and restored as part of pt_regs.
654 	 */
655 	savesegment(es, prev->es);
656 	if (unlikely(next->es | prev->es))
657 		loadsegment(es, next->es);
658 
659 	savesegment(ds, prev->ds);
660 	if (unlikely(next->ds | prev->ds))
661 		loadsegment(ds, next->ds);
662 
663 	x86_fsgsbase_load(prev, next);
664 
665 	x86_pkru_load(prev, next);
666 
667 	/*
668 	 * Switch the PDA and FPU contexts.
669 	 */
670 	raw_cpu_write(current_task, next_p);
671 	raw_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
672 
673 	/* Reload sp0. */
674 	update_task_stack(next_p);
675 
676 	switch_to_extra(prev_p, next_p);
677 
678 	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
679 		/*
680 		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
681 		 * does not update the cached descriptor.  As a result, if we
682 		 * do SYSRET while SS is NULL, we'll end up in user mode with
683 		 * SS apparently equal to __USER_DS but actually unusable.
684 		 *
685 		 * The straightforward workaround would be to fix it up just
686 		 * before SYSRET, but that would slow down the system call
687 		 * fast paths.  Instead, we ensure that SS is never NULL in
688 		 * system call context.  We do this by replacing NULL SS
689 		 * selectors at every context switch.  SYSCALL sets up a valid
690 		 * SS, so the only way to get NULL is to re-enter the kernel
691 		 * from CPL 3 through an interrupt.  Since that can't happen
692 		 * in the same task as a running syscall, we are guaranteed to
693 		 * context switch between every interrupt vector entry and a
694 		 * subsequent SYSRET.
695 		 *
696 		 * We read SS first because SS reads are much faster than
697 		 * writes.  Out of caution, we force SS to __KERNEL_DS even if
698 		 * it previously had a different non-NULL value.
699 		 */
700 		unsigned short ss_sel;
701 		savesegment(ss, ss_sel);
702 		if (ss_sel != __KERNEL_DS)
703 			loadsegment(ss, __KERNEL_DS);
704 	}
705 
706 	/* Load the Intel cache allocation PQR MSR. */
707 	resctrl_arch_sched_in(next_p);
708 
709 	/* Reset hw history on AMD CPUs */
710 	if (cpu_feature_enabled(X86_FEATURE_AMD_WORKLOAD_CLASS))
711 		wrmsrq(MSR_AMD_WORKLOAD_HRST, 0x1);
712 
713 	return prev_p;
714 }
715 
716 void set_personality_64bit(void)
717 {
718 	/* inherit personality from parent */
719 
720 	/* Make sure to be in 64bit mode */
721 	clear_thread_flag(TIF_ADDR32);
722 	/* Pretend that this comes from a 64bit execve */
723 	task_pt_regs(current)->orig_ax = __NR_execve;
724 	current_thread_info()->status &= ~TS_COMPAT;
725 	if (current->mm)
726 		__set_bit(MM_CONTEXT_HAS_VSYSCALL, &current->mm->context.flags);
727 
728 	/* TBD: overwrites user setup. Should have two bits.
729 	   But 64bit processes have always behaved this way,
730 	   so it's not too bad. The main problem is just that
731 	   32bit children are affected again. */
732 	current->personality &= ~READ_IMPLIES_EXEC;
733 }
734 
735 static void __set_personality_x32(void)
736 {
737 #ifdef CONFIG_X86_X32_ABI
738 	if (current->mm)
739 		current->mm->context.flags = 0;
740 
741 	current->personality &= ~READ_IMPLIES_EXEC;
742 	/*
743 	 * in_32bit_syscall() uses the presence of the x32 syscall bit
744 	 * flag to determine compat status.  The x86 mmap() code relies on
745 	 * the syscall bitness, so set the x32 syscall bit right here to make
746 	 * in_32bit_syscall() work during exec().
747 	 *
748 	 * Pretend to come from an x32 execve.
749 	 */
750 	task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
751 	current_thread_info()->status &= ~TS_COMPAT;
752 #endif
753 }
754 
755 static void __set_personality_ia32(void)
756 {
757 #ifdef CONFIG_IA32_EMULATION
758 	if (current->mm) {
759 		/*
760 		 * uprobes applied to this MM need to know this and
761 		 * cannot use user_64bit_mode() at that time.
762 		 */
763 		__set_bit(MM_CONTEXT_UPROBE_IA32, &current->mm->context.flags);
764 	}
765 
766 	current->personality |= force_personality32;
767 	/* Prepare the first "return" to user space */
768 	task_pt_regs(current)->orig_ax = __NR_ia32_execve;
769 	current_thread_info()->status |= TS_COMPAT;
770 #endif
771 }
772 
773 void set_personality_ia32(bool x32)
774 {
775 	/* Make sure to be in 32bit mode */
776 	set_thread_flag(TIF_ADDR32);
777 
778 	if (x32)
779 		__set_personality_x32();
780 	else
781 		__set_personality_ia32();
782 }
783 EXPORT_SYMBOL_GPL(set_personality_ia32);
784 
785 #ifdef CONFIG_CHECKPOINT_RESTORE
786 static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
787 {
788 	int ret;
789 
790 	ret = map_vdso_once(image, addr);
791 	if (ret)
792 		return ret;
793 
794 	return (long)image->size;
795 }
796 #endif
797 
798 #ifdef CONFIG_ADDRESS_MASKING
799 
800 #define LAM_U57_BITS 6
801 
802 static void enable_lam_func(void *__mm)
803 {
804 	struct mm_struct *mm = __mm;
805 	unsigned long lam;
806 
807 	if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) {
808 		lam = mm_lam_cr3_mask(mm);
809 		write_cr3(__read_cr3() | lam);
810 		cpu_tlbstate_update_lam(lam, mm_untag_mask(mm));
811 	}
812 }
813 
814 static void mm_enable_lam(struct mm_struct *mm)
815 {
816 	mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
817 	mm->context.untag_mask = ~GENMASK(62, 57);
818 
819 	/*
820 	 * Even though the process must still be single-threaded at this
821 	 * point, kernel threads may be using the mm.  IPI those kernel
822 	 * threads if they exist.
823 	 */
824 	on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true);
825 	set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
826 }
827 
828 static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
829 {
830 	if (!cpu_feature_enabled(X86_FEATURE_LAM))
831 		return -ENODEV;
832 
833 	/* PTRACE_ARCH_PRCTL */
834 	if (current->mm != mm)
835 		return -EINVAL;
836 
837 	if (mm_valid_pasid(mm) &&
838 	    !test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags))
839 		return -EINVAL;
840 
841 	if (mmap_write_lock_killable(mm))
842 		return -EINTR;
843 
844 	/*
845 	 * MM_CONTEXT_LOCK_LAM is set on clone.  Prevent LAM from
846 	 * being enabled unless the process is single threaded:
847 	 */
848 	if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) {
849 		mmap_write_unlock(mm);
850 		return -EBUSY;
851 	}
852 
853 	if (!nr_bits || nr_bits > LAM_U57_BITS) {
854 		mmap_write_unlock(mm);
855 		return -EINVAL;
856 	}
857 
858 	mm_enable_lam(mm);
859 
860 	mmap_write_unlock(mm);
861 
862 	return 0;
863 }
864 #endif
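/*
 * A minimal userspace sketch of the LAM prctls implemented above, assuming a
 * kernel and <asm/prctl.h> that expose ARCH_ENABLE_TAGGED_ADDR (glibc has no
 * wrapper, so go through syscall(2)):
 *
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <asm/prctl.h>
 *	#include <stdint.h>
 *
 *	int main(void)
 *	{
 *		unsigned long mask;
 *		int x = 42;
 *
 *		if (syscall(SYS_arch_prctl, ARCH_ENABLE_TAGGED_ADDR, 6))
 *			return 1;	// LAM_U57 not available
 *
 *		syscall(SYS_arch_prctl, ARCH_GET_UNTAG_MASK, &mask);
 *		// mask is now ~GENMASK(62, 57): bits 62:57 are ignored on
 *		// dereference, so a tag stored there is transparently masked.
 *		int *tagged = (int *)((uintptr_t)&x | (0x2aUL << 57));
 *		return *tagged != 42;
 *	}
 */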
865 
866 long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
867 {
868 	int ret = 0;
869 
870 	switch (option) {
871 	case ARCH_SET_GS: {
872 		if (unlikely(arg2 >= TASK_SIZE_MAX))
873 			return -EPERM;
874 
875 		preempt_disable();
876 		/*
877 		 * ARCH_SET_GS has always overwritten the index
878 		 * and the base. Zero is the most sensible value
879 		 * to put in the index, and is the only value that
880 		 * makes any sense if FSGSBASE is unavailable.
881 		 */
882 		if (task == current) {
883 			loadseg(GS, 0);
884 			x86_gsbase_write_cpu_inactive(arg2);
885 
886 			/*
887 			 * On non-FSGSBASE systems, save_base_legacy() expects
888 			 * that we also fill in thread.gsbase.
889 			 */
890 			task->thread.gsbase = arg2;
891 
892 		} else {
893 			task->thread.gsindex = 0;
894 			x86_gsbase_write_task(task, arg2);
895 		}
896 		preempt_enable();
897 		break;
898 	}
899 	case ARCH_SET_FS: {
900 		/*
901 		 * Not strictly needed for %fs, but do it for symmetry
902 		 * with %gs
903 		 */
904 		if (unlikely(arg2 >= TASK_SIZE_MAX))
905 			return -EPERM;
906 
907 		preempt_disable();
908 		/*
909 		 * Set the selector to 0 for the same reason
910 		 * as %gs above.
911 		 */
912 		if (task == current) {
913 			loadseg(FS, 0);
914 			x86_fsbase_write_cpu(arg2);
915 
916 			/*
917 			 * On non-FSGSBASE systems, save_base_legacy() expects
918 			 * that we also fill in thread.fsbase.
919 			 */
920 			task->thread.fsbase = arg2;
921 		} else {
922 			task->thread.fsindex = 0;
923 			x86_fsbase_write_task(task, arg2);
924 		}
925 		preempt_enable();
926 		break;
927 	}
928 	case ARCH_GET_FS: {
929 		unsigned long base = x86_fsbase_read_task(task);
930 
931 		ret = put_user(base, (unsigned long __user *)arg2);
932 		break;
933 	}
934 	case ARCH_GET_GS: {
935 		unsigned long base = x86_gsbase_read_task(task);
936 
937 		ret = put_user(base, (unsigned long __user *)arg2);
938 		break;
939 	}
940 
941 #ifdef CONFIG_CHECKPOINT_RESTORE
942 # ifdef CONFIG_X86_X32_ABI
943 	case ARCH_MAP_VDSO_X32:
944 		return prctl_map_vdso(&vdso_image_x32, arg2);
945 # endif
946 # ifdef CONFIG_IA32_EMULATION
947 	case ARCH_MAP_VDSO_32:
948 		return prctl_map_vdso(&vdso_image_32, arg2);
949 # endif
950 	case ARCH_MAP_VDSO_64:
951 		return prctl_map_vdso(&vdso_image_64, arg2);
952 #endif
953 #ifdef CONFIG_ADDRESS_MASKING
954 	case ARCH_GET_UNTAG_MASK:
955 		return put_user(task->mm->context.untag_mask,
956 				(unsigned long __user *)arg2);
957 	case ARCH_ENABLE_TAGGED_ADDR:
958 		return prctl_enable_tagged_addr(task->mm, arg2);
959 	case ARCH_FORCE_TAGGED_SVA:
960 		if (current != task)
961 			return -EINVAL;
962 		set_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &task->mm->context.flags);
963 		return 0;
964 	case ARCH_GET_MAX_TAG_BITS:
965 		if (!cpu_feature_enabled(X86_FEATURE_LAM))
966 			return put_user(0, (unsigned long __user *)arg2);
967 		else
968 			return put_user(LAM_U57_BITS, (unsigned long __user *)arg2);
969 #endif
970 	case ARCH_SHSTK_ENABLE:
971 	case ARCH_SHSTK_DISABLE:
972 	case ARCH_SHSTK_LOCK:
973 	case ARCH_SHSTK_UNLOCK:
974 	case ARCH_SHSTK_STATUS:
975 		return shstk_prctl(task, option, arg2);
976 	default:
977 		ret = -EINVAL;
978 		break;
979 	}
980 
981 	return ret;
982 }
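/*
 * The classic consumers of ARCH_SET_FS/ARCH_SET_GS above are threading
 * libraries and emulators.  A minimal userspace sketch (glibc has no
 * arch_prctl wrapper, so use syscall(2); constants come from <asm/prctl.h>):
 *
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <asm/prctl.h>
 *	#include <stdio.h>
 *
 *	static unsigned long scratch[64];
 *
 *	int main(void)
 *	{
 *		unsigned long gsbase = 0;
 *
 *		syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)scratch);
 *		syscall(SYS_arch_prctl, ARCH_GET_GS, &gsbase);
 *		printf("gsbase=%#lx scratch=%p\n", gsbase, (void *)scratch);
 *		return gsbase != (unsigned long)scratch;
 *	}
 */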
983