xref: /linux/arch/x86/kernel/process_64.c (revision 9f3926e08c26607a0dd5b1bc8a8aa1d03f72fcdc)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  Copyright (C) 1995  Linus Torvalds
4  *
5  *  Pentium III FXSR, SSE support
6  *	Gareth Hughes <gareth@valinux.com>, May 2000
7  *
8  *  X86-64 port
9  *	Andi Kleen.
10  *
11  *	CPU hotplug support - ashok.raj@intel.com
12  */
13 
14 /*
15  * This file handles the architecture-dependent parts of process handling..
16  */
17 
18 #include <linux/cpu.h>
19 #include <linux/errno.h>
20 #include <linux/sched.h>
21 #include <linux/sched/task.h>
22 #include <linux/sched/task_stack.h>
23 #include <linux/fs.h>
24 #include <linux/kernel.h>
25 #include <linux/mm.h>
26 #include <linux/elfcore.h>
27 #include <linux/smp.h>
28 #include <linux/slab.h>
29 #include <linux/user.h>
30 #include <linux/interrupt.h>
31 #include <linux/delay.h>
32 #include <linux/export.h>
33 #include <linux/ptrace.h>
34 #include <linux/notifier.h>
35 #include <linux/kprobes.h>
36 #include <linux/kdebug.h>
37 #include <linux/prctl.h>
38 #include <linux/uaccess.h>
39 #include <linux/io.h>
40 #include <linux/ftrace.h>
41 #include <linux/syscalls.h>
42 
43 #include <asm/pgtable.h>
44 #include <asm/processor.h>
45 #include <asm/fpu/internal.h>
46 #include <asm/mmu_context.h>
47 #include <asm/prctl.h>
48 #include <asm/desc.h>
49 #include <asm/proto.h>
50 #include <asm/ia32.h>
51 #include <asm/syscalls.h>
52 #include <asm/debugreg.h>
53 #include <asm/switch_to.h>
54 #include <asm/xen/hypervisor.h>
55 #include <asm/vdso.h>
56 #include <asm/resctrl_sched.h>
57 #include <asm/unistd.h>
58 #include <asm/fsgsbase.h>
59 #ifdef CONFIG_IA32_EMULATION
60 /* Not included via unistd.h */
61 #include <asm/unistd_32_ia32.h>
62 #endif
63 
64 #include "process.h"
65 
/*
 * __show_regs - dump register state to the kernel log (oops/panic/sysrq).
 * @regs: saved register state to print
 * @mode: SHOW_REGS_SHORT prints only the GPRs from @regs;
 *	  SHOW_REGS_USER additionally prints the user FS/GS bases read
 *	  from the MSRs; any other mode also dumps segment, control and
 *	  debug registers read live from the current CPU.
 *
 * Prints also some state that isn't saved in the pt_regs.
 */
void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, es;

	show_iret_regs(regs);

	/* Skip ORIG_RAX when it is -1 (no meaningful value saved). */
	if (regs->orig_ax != -1)
		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
	else
		pr_cont("\n");

	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	if (mode == SHOW_REGS_SHORT)
		return;

	if (mode == SHOW_REGS_USER) {
		/* The user FS/GS bases live in MSRs, not in pt_regs. */
		rdmsrl(MSR_FS_BASE, fs);
		rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
		printk(KERN_DEFAULT "FS:  %016lx GS:  %016lx\n",
		       fs, shadowgs);
		return;
	}

	/* Read the live segment selectors from the current CPU. */
	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = __read_cr3();
	cr4 = __read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04lx DS: %04x ES: %04x CR0: %016lx\n", regs->cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);

	/* Only print out debug registers if they are in their non-default state. */
	if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
	    (d6 == DR6_RESERVED) && (d7 == 0x400))) {
		printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n",
		       d0, d1, d2);
		printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n",
		       d3, d6, d7);
	}

	if (boot_cpu_has(X86_FEATURE_OSPKE))
		printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
}
143 
/*
 * release_thread - architecture hook called when a dead task is released.
 *
 * A dead task must not still own an LDT at this point; if one is found,
 * report it and stop, since something went wrong during teardown.
 */
void release_thread(struct task_struct *dead_task)
{
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	struct mm_struct *mm = dead_task->mm;

	if (!mm || !mm->context.ldt)
		return;

	pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
		dead_task->comm,
		mm->context.ldt->entries,
		mm->context.ldt->nr_entries);
	BUG();
#endif
}
158 
/* Selects which segment register a base/selector helper operates on. */
enum which_selector {
	FS,
	GS
};
163 
/*
 * __rdgsbase_inactive - read the inactive (user) GS base via SWAPGS.
 *
 * Swaps in the inactive GS base, reads it with RDGSBASE, and swaps back.
 * Must run with interrupts disabled so nothing observes the swapped state.
 *
 * Out of line to be protected from kprobes. It is not used on Xen
 * paravirt. When paravirt support is needed, it needs to be renamed
 * with native_ prefix.
 */
static noinline unsigned long __rdgsbase_inactive(void)
{
	unsigned long gsbase;

	lockdep_assert_irqs_disabled();

	native_swapgs();
	gsbase = rdgsbase();
	native_swapgs();

	return gsbase;
}
NOKPROBE_SYMBOL(__rdgsbase_inactive);
182 
/*
 * __wrgsbase_inactive - write the inactive (user) GS base via SWAPGS.
 * @gsbase: value to install as the inactive GS base
 *
 * Swaps in the inactive GS base, writes it with WRGSBASE, and swaps back.
 * Must run with interrupts disabled so nothing observes the swapped state.
 *
 * Out of line to be protected from kprobes. It is not used on Xen
 * paravirt. When paravirt support is needed, it needs to be renamed
 * with native_ prefix.
 */
static noinline void __wrgsbase_inactive(unsigned long gsbase)
{
	lockdep_assert_irqs_disabled();

	native_swapgs();
	wrgsbase(gsbase);
	native_swapgs();
}
NOKPROBE_SYMBOL(__wrgsbase_inactive);
197 
/*
 * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
 * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
 * It's forcibly inlined because it'll generate better code and this function
 * is hot.
 *
 * @prev_p:   outgoing task whose thread state may be updated
 * @selector: the task's saved FS or GS selector
 * @which:    FS or GS, choosing which saved base to update
 */
static __always_inline void save_base_legacy(struct task_struct *prev_p,
					     unsigned short selector,
					     enum which_selector which)
{
	if (likely(selector == 0)) {
		/*
		 * On Intel (without X86_BUG_NULL_SEG), the segment base could
		 * be the pre-existing saved base or it could be zero.  On AMD
		 * (with X86_BUG_NULL_SEG), the segment base could be almost
		 * anything.
		 *
		 * This branch is very hot (it's hit twice on almost every
		 * context switch between 64-bit programs), and avoiding
		 * the RDMSR helps a lot, so we just assume that whatever
		 * value is already saved is correct.  This matches historical
		 * Linux behavior, so it won't break existing applications.
		 *
		 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
		 * report that the base is zero, it needs to actually be zero:
		 * see the corresponding logic in load_seg_legacy.
		 */
	} else {
		/*
		 * If the selector is 1, 2, or 3, then the base is zero on
		 * !X86_BUG_NULL_SEG CPUs and could be anything on
		 * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
		 * has never attempted to preserve the base across context
		 * switches.
		 *
		 * If selector > 3, then it refers to a real segment, and
		 * saving the base isn't necessary.
		 */
		if (which == FS)
			prev_p->thread.fsbase = 0;
		else
			prev_p->thread.gsbase = 0;
	}
}
242 
/*
 * save_fsgs - save @task's FS/GS selectors and bases into its thread struct.
 *
 * With FSGSBASE, the bases are read directly from the CPU (GS via the
 * inactive-base helper, which needs interrupts off).  Without it, fall
 * back to the legacy selector-based heuristics in save_base_legacy().
 */
static __always_inline void save_fsgs(struct task_struct *task)
{
	savesegment(fs, task->thread.fsindex);
	savesegment(gs, task->thread.gsindex);
	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
		unsigned long flags;

		/*
		 * If FSGSBASE is enabled, we can't make any useful guesses
		 * about the base, and user code expects us to save the current
		 * value.  Fortunately, reading the base directly is efficient.
		 */
		task->thread.fsbase = rdfsbase();
		/* __rdgsbase_inactive() uses SWAPGS: interrupts must be off. */
		local_irq_save(flags);
		task->thread.gsbase = __rdgsbase_inactive();
		local_irq_restore(flags);
	} else {
		save_base_legacy(task, task->thread.fsindex, FS);
		save_base_legacy(task, task->thread.gsindex, GS);
	}
}
264 
#if IS_ENABLED(CONFIG_KVM)
/*
 * While a process is running,current->thread.fsbase and current->thread.gsbase
 * may not match the corresponding CPU registers (see save_base_legacy()). KVM
 * wants an efficient way to save and restore FSBASE and GSBASE.
 * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE.
 */
void save_fsgs_for_kvm(void)
{
	/* Refresh current's saved FS/GS state before KVM snapshots it. */
	save_fsgs(current);
}
EXPORT_SYMBOL_GPL(save_fsgs_for_kvm);
#endif
278 
279 static __always_inline void loadseg(enum which_selector which,
280 				    unsigned short sel)
281 {
282 	if (which == FS)
283 		loadsegment(fs, sel);
284 	else
285 		load_gs_index(sel);
286 }
287 
/*
 * load_seg_legacy - restore FS or GS for an incoming thread without FSGSBASE.
 * @prev_index: outgoing thread's saved selector
 * @prev_base:  outgoing thread's saved base
 * @next_index: incoming thread's saved selector
 * @next_base:  incoming thread's saved base
 * @which:      FS or GS
 *
 * Counterpart of save_base_legacy(); uses the previous thread's state to
 * skip redundant (and expensive) segment/MSR writes where possible.
 */
static __always_inline void load_seg_legacy(unsigned short prev_index,
					    unsigned long prev_base,
					    unsigned short next_index,
					    unsigned long next_base,
					    enum which_selector which)
{
	if (likely(next_index <= 3)) {
		/*
		 * The next task is using 64-bit TLS, is not using this
		 * segment at all, or is having fun with arcane CPU features.
		 */
		if (next_base == 0) {
			/*
			 * Nasty case: on AMD CPUs, we need to forcibly zero
			 * the base.
			 */
			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
				loadseg(which, __USER_DS);
				loadseg(which, next_index);
			} else {
				/*
				 * We could try to exhaustively detect cases
				 * under which we can skip the segment load,
				 * but there's really only one case that matters
				 * for performance: if both the previous and
				 * next states are fully zeroed, we can skip
				 * the load.
				 *
				 * (This assumes that prev_base == 0 has no
				 * false positives.  This is the case on
				 * Intel-style CPUs.)
				 */
				if (likely(prev_index | next_index | prev_base))
					loadseg(which, next_index);
			}
		} else {
			if (prev_index != next_index)
				loadseg(which, next_index);
			/* GS uses KERNEL_GS_BASE: the user base is in the shadow slot here. */
			wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
			       next_base);
		}
	} else {
		/*
		 * The next task is using a real segment.  Loading the selector
		 * is sufficient.
		 */
		loadseg(which, next_index);
	}
}
337 
/*
 * x86_fsgsbase_load - restore FS/GS selectors and bases on context switch.
 * @prev: outgoing thread state (used to skip redundant loads)
 * @next: incoming thread state to install
 *
 * With FSGSBASE, selectors and bases are independent: load selectors only
 * when they could have changed, then write the bases unconditionally.
 * Without FSGSBASE, defer to the legacy selector-driven path.
 */
static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
					      struct thread_struct *next)
{
	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
		/* Update the FS and GS selectors if they could have changed. */
		if (unlikely(prev->fsindex || next->fsindex))
			loadseg(FS, next->fsindex);
		if (unlikely(prev->gsindex || next->gsindex))
			loadseg(GS, next->gsindex);

		/* Update the bases. */
		wrfsbase(next->fsbase);
		__wrgsbase_inactive(next->gsbase);
	} else {
		load_seg_legacy(prev->fsindex, prev->fsbase,
				next->fsindex, next->fsbase, FS);
		load_seg_legacy(prev->gsindex, prev->gsbase,
				next->gsindex, next->gsbase, GS);
	}
}
358 
359 static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
360 					    unsigned short selector)
361 {
362 	unsigned short idx = selector >> 3;
363 	unsigned long base;
364 
365 	if (likely((selector & SEGMENT_TI_MASK) == 0)) {
366 		if (unlikely(idx >= GDT_ENTRIES))
367 			return 0;
368 
369 		/*
370 		 * There are no user segments in the GDT with nonzero bases
371 		 * other than the TLS segments.
372 		 */
373 		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
374 			return 0;
375 
376 		idx -= GDT_ENTRY_TLS_MIN;
377 		base = get_desc_base(&task->thread.tls_array[idx]);
378 	} else {
379 #ifdef CONFIG_MODIFY_LDT_SYSCALL
380 		struct ldt_struct *ldt;
381 
382 		/*
383 		 * If performance here mattered, we could protect the LDT
384 		 * with RCU.  This is a slow path, though, so we can just
385 		 * take the mutex.
386 		 */
387 		mutex_lock(&task->mm->context.lock);
388 		ldt = task->mm->context.ldt;
389 		if (unlikely(idx >= ldt->nr_entries))
390 			base = 0;
391 		else
392 			base = get_desc_base(ldt->entries + idx);
393 		mutex_unlock(&task->mm->context.lock);
394 #else
395 		base = 0;
396 #endif
397 	}
398 
399 	return base;
400 }
401 
/*
 * x86_gsbase_read_cpu_inactive - read the inactive (user) GS base on this CPU.
 *
 * With FSGSBASE, uses the SWAPGS-based helper, which requires interrupts
 * to be disabled around it.  Otherwise the inactive base lives in
 * MSR_KERNEL_GS_BASE.
 */
unsigned long x86_gsbase_read_cpu_inactive(void)
{
	unsigned long gsbase;

	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
		unsigned long flags;

		/* __rdgsbase_inactive() uses SWAPGS: interrupts must be off. */
		local_irq_save(flags);
		gsbase = __rdgsbase_inactive();
		local_irq_restore(flags);
	} else {
		rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
	}

	return gsbase;
}
419 
/*
 * x86_gsbase_write_cpu_inactive - set the inactive (user) GS base on this CPU.
 * @gsbase: value to install
 *
 * With FSGSBASE, uses the SWAPGS-based helper, which requires interrupts
 * to be disabled around it.  Otherwise write MSR_KERNEL_GS_BASE directly.
 */
void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
{
	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
		unsigned long flags;

		/* __wrgsbase_inactive() uses SWAPGS: interrupts must be off. */
		local_irq_save(flags);
		__wrgsbase_inactive(gsbase);
		local_irq_restore(flags);
	} else {
		wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
	}
}
433 
434 unsigned long x86_fsbase_read_task(struct task_struct *task)
435 {
436 	unsigned long fsbase;
437 
438 	if (task == current)
439 		fsbase = x86_fsbase_read_cpu();
440 	else if (static_cpu_has(X86_FEATURE_FSGSBASE) ||
441 		 (task->thread.fsindex == 0))
442 		fsbase = task->thread.fsbase;
443 	else
444 		fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
445 
446 	return fsbase;
447 }
448 
449 unsigned long x86_gsbase_read_task(struct task_struct *task)
450 {
451 	unsigned long gsbase;
452 
453 	if (task == current)
454 		gsbase = x86_gsbase_read_cpu_inactive();
455 	else if (static_cpu_has(X86_FEATURE_FSGSBASE) ||
456 		 (task->thread.gsindex == 0))
457 		gsbase = task->thread.gsbase;
458 	else
459 		gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
460 
461 	return gsbase;
462 }
463 
/*
 * x86_fsbase_write_task - set the saved FS base of a non-current @task.
 *
 * Only updates the thread struct; callers must not use this on the current
 * task, whose live CPU state would go out of sync (hence the WARN).
 */
void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
{
	WARN_ON_ONCE(task == current);

	task->thread.fsbase = fsbase;
}
470 
/*
 * x86_gsbase_write_task - set the saved GS base of a non-current @task.
 *
 * Only updates the thread struct; callers must not use this on the current
 * task, whose live CPU state would go out of sync (hence the WARN).
 */
void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
{
	WARN_ON_ONCE(task == current);

	task->thread.gsbase = gsbase;
}
477 
/*
 * copy_thread_tls - set up the arch-specific state of a new task at fork.
 * @clone_flags: clone(2) flags (CLONE_SETTLS triggers TLS setup below)
 * @sp:  new user stack pointer, or the function to run for kernel threads
 * @arg: argument for a kernel thread's function
 * @p:   the freshly created task
 * @tls: TLS descriptor pointer or base, used when CLONE_SETTLS is set
 *
 * Builds the child's fork frame (so ret_from_fork is the first thing it
 * runs), copies segment state from the parent, duplicates the I/O bitmap
 * if present, and applies CLONE_SETTLS.  Returns 0 or a negative errno.
 */
int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
		unsigned long arg, struct task_struct *p, unsigned long tls)
{
	int err;
	struct pt_regs *childregs;
	struct fork_frame *fork_frame;
	struct inactive_task_frame *frame;
	struct task_struct *me = current;

	/* The fork frame sits directly below the child's pt_regs. */
	childregs = task_pt_regs(p);
	fork_frame = container_of(childregs, struct fork_frame, regs);
	frame = &fork_frame->frame;

	frame->bp = 0;
	frame->ret_addr = (unsigned long) ret_from_fork;
	p->thread.sp = (unsigned long) fork_frame;
	p->thread.io_bitmap_ptr = NULL;

	/* Snapshot the parent's live FS/GS state before copying it. */
	save_fsgs(me);
	p->thread.fsindex = me->thread.fsindex;
	p->thread.fsbase = me->thread.fsbase;
	p->thread.gsindex = me->thread.gsindex;
	p->thread.gsbase = me->thread.gsbase;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(p->flags & PF_KTHREAD)) {
		/* kernel thread */
		memset(childregs, 0, sizeof(struct pt_regs));
		frame->bx = sp;		/* function */
		frame->r12 = arg;
		return 0;
	}
	frame->bx = 0;
	*childregs = *current_pt_regs();

	/* The child sees 0 as the fork()/clone() return value. */
	childregs->ax = 0;
	if (sp)
		childregs->sp = sp;

	err = -ENOMEM;
	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		/* The child gets its own copy of the parent's I/O bitmap. */
		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
						  IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (in_ia32_syscall())
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)tls, 0);
		else
#endif
			err = do_arch_prctl_64(p, ARCH_SET_FS, tls);
		if (err)
			goto out;
	}
	err = 0;
out:
	/* On failure, undo the I/O bitmap duplication done above. */
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}
553 
/*
 * start_thread_common - prepare current's registers to enter a new program.
 * @regs:   must be current's pt_regs (enforced by the WARN below)
 * @new_ip: entry point of the new program
 * @new_sp: initial user stack pointer
 * @_cs:    user code segment to run in
 * @_ss:    user stack segment
 * @_ds:    value loaded into DS and ES
 *
 * Clears FS/GS and loads fresh DS/ES, then rewrites the register frame so
 * the return to user space lands in the new program.
 */
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	WARN_ON_ONCE(regs != current_pt_regs());

	if (static_cpu_has(X86_BUG_NULL_SEG)) {
		/* Loading zero below won't clear the base. */
		loadsegment(fs, __USER_DS);
		load_gs_index(__USER_DS);
	}

	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);

	regs->ip		= new_ip;
	regs->sp		= new_sp;
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
	force_iret();
}
579 
/* Start a 64-bit program: standard user CS/SS, null DS/ES. */
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}
EXPORT_SYMBOL_GPL(start_thread);
587 
#ifdef CONFIG_COMPAT
/* Start a compat (x32 or ia32) program with the appropriate code segment. */
void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	unsigned int cs;

	/* x32 tasks execute in 64-bit mode; ia32 tasks need the 32-bit CS. */
	if (test_thread_flag(TIF_X32))
		cs = __USER_CS;
	else
		cs = __USER32_CS;

	start_thread_common(regs, new_ip, new_sp, cs, __USER_DS, __USER_DS);
}
#endif
597 
/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * Performs the arch-specific half of a context switch: FPU hand-off,
 * TLS/segment reload, stack and per-CPU bookkeeping, and assorted CPU-bug
 * workarounds.  Returns the previous task (for the switch_to() protocol).
 * The ordering of the steps below is load-bearing; see the inline comments.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer not supported too.
 */
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	struct fpu *prev_fpu = &prev->fpu;
	struct fpu *next_fpu = &next->fpu;
	int cpu = smp_processor_id();

	/* Debug build check: we should not be switching on the IRQ stack. */
	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
		     this_cpu_read(irq_count) != -1);

	if (!test_thread_flag(TIF_NEED_FPU_LOAD))
		switch_fpu_prepare(prev_fpu, cpu);

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	save_fsgs(prev_p);

	/*
	 * Load TLS before restoring any segments so that segment loads
	 * reference the correct GDT entries.
	 */
	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.  This
	 * must be done after loading TLS entries in the GDT but before
	 * loading segments that might reference them.
	 */
	arch_end_context_switch(next_p);

	/* Switch DS and ES.
	 *
	 * Reading them only returns the selectors, but writing them (if
	 * nonzero) loads the full descriptor from the GDT or LDT.  The
	 * LDT for next is loaded in switch_mm, and the GDT is loaded
	 * above.
	 *
	 * We therefore need to write new values to the segment
	 * registers on every context switch unless both the new and old
	 * values are zero.
	 *
	 * Note that we don't need to do anything for CS and SS, as
	 * those are saved and restored as part of pt_regs.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	x86_fsgsbase_load(prev, next);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	this_cpu_write(current_task, next_p);
	this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));

	switch_fpu_finish(next_fpu);

	/* Reload sp0. */
	update_task_stack(next_p);

	/* Handles debug registers, I/O bitmap, speculation mitigations etc. */
	switch_to_extra(prev_p, next_p);

#ifdef CONFIG_XEN_PV
	/*
	 * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
	 * current_pt_regs()->flags may not match the current task's
	 * intended IOPL.  We need to switch it manually.
	 */
	if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
		     prev->iopl != next->iopl))
		xen_set_iopl_mask(next->iopl);
#endif

	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
		/*
		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
		 * does not update the cached descriptor.  As a result, if we
		 * do SYSRET while SS is NULL, we'll end up in user mode with
		 * SS apparently equal to __USER_DS but actually unusable.
		 *
		 * The straightforward workaround would be to fix it up just
		 * before SYSRET, but that would slow down the system call
		 * fast paths.  Instead, we ensure that SS is never NULL in
		 * system call context.  We do this by replacing NULL SS
		 * selectors at every context switch.  SYSCALL sets up a valid
		 * SS, so the only way to get NULL is to re-enter the kernel
		 * from CPL 3 through an interrupt.  Since that can't happen
		 * in the same task as a running syscall, we are guaranteed to
		 * context switch between every interrupt vector entry and a
		 * subsequent SYSRET.
		 *
		 * We read SS first because SS reads are much faster than
		 * writes.  Out of caution, we force SS to __KERNEL_DS even if
		 * it previously had a different non-NULL value.
		 */
		unsigned short ss_sel;
		savesegment(ss, ss_sel);
		if (ss_sel != __KERNEL_DS)
			loadsegment(ss, __KERNEL_DS);
	}

	/* Load the Intel cache allocation PQR MSR. */
	resctrl_sched_in();

	return prev_p;
}
724 
/*
 * set_personality_64bit - configure current as a native 64-bit process.
 *
 * Called at exec time: clears all 32-bit/x32 thread flags, marks the
 * register frame as coming from a 64-bit execve, and clears the mm's
 * compat marker.
 */
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);
	clear_thread_flag(TIF_ADDR32);
	clear_thread_flag(TIF_X32);
	/* Pretend that this comes from a 64bit execve */
	task_pt_regs(current)->orig_ax = __NR_execve;
	current_thread_info()->status &= ~TS_COMPAT;

	/* Ensure the corresponding mm is not marked. */
	if (current->mm)
		current->mm->context.ia32_compat = 0;

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}
747 
/*
 * __set_personality_x32 - mark current as an x32-ABI process at exec time.
 *
 * Sets the thread and mm compat markers and fakes an x32 execve in
 * orig_ax so syscall-bitness checks work during the exec itself.
 */
static void __set_personality_x32(void)
{
#ifdef CONFIG_X86_X32
	clear_thread_flag(TIF_IA32);
	set_thread_flag(TIF_X32);
	if (current->mm)
		current->mm->context.ia32_compat = TIF_X32;
	current->personality &= ~READ_IMPLIES_EXEC;
	/*
	 * in_32bit_syscall() uses the presence of the x32 syscall bit
	 * flag to determine compat status.  The x86 mmap() code relies on
	 * the syscall bitness so set x32 syscall bit right here to make
	 * in_32bit_syscall() work during exec().
	 *
	 * Pretend to come from a x32 execve.
	 */
	task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
	current_thread_info()->status &= ~TS_COMPAT;
#endif
}
768 
/*
 * __set_personality_ia32 - mark current as an ia32-emulated process.
 *
 * Sets the thread and mm compat markers, applies the optional forced
 * 32-bit personality bits, and fakes an ia32 execve in orig_ax.
 */
static void __set_personality_ia32(void)
{
#ifdef CONFIG_IA32_EMULATION
	set_thread_flag(TIF_IA32);
	clear_thread_flag(TIF_X32);
	if (current->mm)
		current->mm->context.ia32_compat = TIF_IA32;
	current->personality |= force_personality32;
	/* Prepare the first "return" to user space */
	task_pt_regs(current)->orig_ax = __NR_ia32_execve;
	current_thread_info()->status |= TS_COMPAT;
#endif
}
782 
783 void set_personality_ia32(bool x32)
784 {
785 	/* Make sure to be in 32bit mode */
786 	set_thread_flag(TIF_ADDR32);
787 
788 	if (x32)
789 		__set_personality_x32();
790 	else
791 		__set_personality_ia32();
792 }
793 EXPORT_SYMBOL_GPL(set_personality_ia32);
794 
#ifdef CONFIG_CHECKPOINT_RESTORE
/*
 * Map @image into the current mm at @addr (once per mm).
 * Returns the image size on success or a negative errno.
 */
static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
{
	int ret = map_vdso_once(image, addr);

	return ret ? ret : (long)image->size;
}
#endif
807 
/*
 * do_arch_prctl_64 - handle the 64-bit-only arch_prctl(2) options.
 * @task:   target task (current, or a ptrace child for SET_FS/SET_GS)
 * @option: ARCH_SET_{FS,GS}, ARCH_GET_{FS,GS} or ARCH_MAP_VDSO_*
 * @arg2:   new base, user pointer for the result, or vdso address
 *
 * Returns 0 or a positive value on success, negative errno on failure;
 * -EINVAL for unknown options (the syscall wrapper then tries the
 * common handler).
 */
long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
{
	int ret = 0;

	switch (option) {
	case ARCH_SET_GS: {
		/* Bases must be canonical user addresses. */
		if (unlikely(arg2 >= TASK_SIZE_MAX))
			return -EPERM;

		/* No preemption between touching CPU state and thread state. */
		preempt_disable();
		/*
		 * ARCH_SET_GS has always overwritten the index
		 * and the base. Zero is the most sensible value
		 * to put in the index, and is the only value that
		 * makes any sense if FSGSBASE is unavailable.
		 */
		if (task == current) {
			loadseg(GS, 0);
			x86_gsbase_write_cpu_inactive(arg2);

			/*
			 * On non-FSGSBASE systems, save_base_legacy() expects
			 * that we also fill in thread.gsbase.
			 */
			task->thread.gsbase = arg2;

		} else {
			task->thread.gsindex = 0;
			x86_gsbase_write_task(task, arg2);
		}
		preempt_enable();
		break;
	}
	case ARCH_SET_FS: {
		/*
		 * Not strictly needed for %fs, but do it for symmetry
		 * with %gs
		 */
		if (unlikely(arg2 >= TASK_SIZE_MAX))
			return -EPERM;

		preempt_disable();
		/*
		 * Set the selector to 0 for the same reason
		 * as %gs above.
		 */
		if (task == current) {
			loadseg(FS, 0);
			x86_fsbase_write_cpu(arg2);

			/*
			 * On non-FSGSBASE systems, save_base_legacy() expects
			 * that we also fill in thread.fsbase.
			 */
			task->thread.fsbase = arg2;
		} else {
			task->thread.fsindex = 0;
			x86_fsbase_write_task(task, arg2);
		}
		preempt_enable();
		break;
	}
	case ARCH_GET_FS: {
		unsigned long base = x86_fsbase_read_task(task);

		ret = put_user(base, (unsigned long __user *)arg2);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base = x86_gsbase_read_task(task);

		ret = put_user(base, (unsigned long __user *)arg2);
		break;
	}

#ifdef CONFIG_CHECKPOINT_RESTORE
# ifdef CONFIG_X86_X32_ABI
	case ARCH_MAP_VDSO_X32:
		return prctl_map_vdso(&vdso_image_x32, arg2);
# endif
# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
	case ARCH_MAP_VDSO_32:
		return prctl_map_vdso(&vdso_image_32, arg2);
# endif
	case ARCH_MAP_VDSO_64:
		return prctl_map_vdso(&vdso_image_64, arg2);
#endif

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
903 
904 SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
905 {
906 	long ret;
907 
908 	ret = do_arch_prctl_64(current, option, arg2);
909 	if (ret == -EINVAL)
910 		ret = do_arch_prctl_common(current, option, arg2);
911 
912 	return ret;
913 }
914 
#ifdef CONFIG_IA32_EMULATION
/* Compat arch_prctl(2): only the common (non-64-bit) options exist. */
COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
	return do_arch_prctl_common(current, option, arg2);
}
#endif
921 
/* Return @task's saved user stack pointer from its register frame. */
unsigned long KSTK_ESP(struct task_struct *task)
{
	return task_pt_regs(task)->sp;
}
926