xref: /linux/arch/x86/entry/vsyscall/vsyscall_64.c (revision fbf5df34a4dbcd09d433dd4f0916bf9b2ddb16de)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net>
4  *
5  * Based on the original implementation which is:
6  *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
7  *  Copyright 2003 Andi Kleen, SuSE Labs.
8  *
9  *  Parts of the original code have been moved to arch/x86/vdso/vma.c
10  *
11  * This file implements vsyscall emulation.  vsyscalls are a legacy ABI:
12  * Userspace can request certain kernel services by calling fixed
13  * addresses.  This concept is problematic:
14  *
15  * - It interferes with ASLR.
16  * - It's awkward to write code that lives in kernel addresses but is
17  *   callable by userspace at fixed addresses.
18  * - The whole concept is impossible for 32-bit compat userspace.
19  * - UML cannot easily virtualize a vsyscall.
20  *
21  * As of mid-2014, I believe that there is no new userspace code that
22  * will use a vsyscall if the vDSO is present.  I hope that there will
23  * soon be no new userspace code that will ever use a vsyscall.
24  *
25  * The code in this file emulates vsyscalls when notified of a page
26  * fault or a general protection fault to a vsyscall address.
27  */
28 
29 #include <linux/kernel.h>
30 #include <linux/timer.h>
31 #include <linux/sched/signal.h>
32 #include <linux/mm_types.h>
33 #include <linux/syscalls.h>
34 #include <linux/ratelimit.h>
35 
36 #include <asm/vsyscall.h>
37 #include <asm/unistd.h>
38 #include <asm/fixmap.h>
39 #include <asm/traps.h>
40 
41 #define CREATE_TRACE_POINTS
42 #include "vsyscall_trace.h"
43 
/*
 * Current vsyscall handling mode.
 *
 * The build-time default is picked by the CONFIG_LEGACY_VSYSCALL_* option
 * tested below and can only be NONE or XONLY; any other configuration is a
 * build error.  EMULATE is reachable solely through the "vsyscall=" early
 * parameter parsed by vsyscall_setup().
 */
static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init =
#ifdef CONFIG_LEGACY_VSYSCALL_NONE
	NONE;
#elif defined(CONFIG_LEGACY_VSYSCALL_XONLY)
	XONLY;
#else
	#error VSYSCALL config is broken
#endif
52 
53 static int __init vsyscall_setup(char *str)
54 {
55 	if (str) {
56 		if (!strcmp("emulate", str))
57 			vsyscall_mode = EMULATE;
58 		else if (!strcmp("xonly", str))
59 			vsyscall_mode = XONLY;
60 		else if (!strcmp("none", str))
61 			vsyscall_mode = NONE;
62 		else
63 			return -EINVAL;
64 
65 		if (cpu_feature_enabled(X86_FEATURE_LASS) && vsyscall_mode == EMULATE) {
66 			setup_clear_cpu_cap(X86_FEATURE_LASS);
67 			pr_warn_once("x86/cpu: Disabling LASS due to vsyscall=emulate\n");
68 		}
69 
70 		return 0;
71 	}
72 
73 	return -EINVAL;
74 }
75 early_param("vsyscall", vsyscall_setup);
76 
77 static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
78 			      const char *message)
79 {
80 	if (!show_unhandled_signals)
81 		return;
82 
83 	printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n",
84 			   level, current->comm, task_pid_nr(current),
85 			   message, regs->ip, regs->cs,
86 			   regs->sp, regs->ax, regs->si, regs->di);
87 }
88 
89 static int addr_to_vsyscall_nr(unsigned long addr)
90 {
91 	int nr;
92 
93 	if ((addr & ~0xC00UL) != VSYSCALL_ADDR)
94 		return -EINVAL;
95 
96 	nr = (addr & 0xC00UL) >> 10;
97 	if (nr >= 3)
98 		return -EINVAL;
99 
100 	return nr;
101 }
102 
103 static bool write_ok_or_segv(unsigned long ptr, size_t size)
104 {
105 	if (!access_ok((void __user *)ptr, size)) {
106 		struct thread_struct *thread = &current->thread;
107 
108 		thread->error_code	= X86_PF_USER | X86_PF_WRITE;
109 		thread->cr2		= ptr;
110 		thread->trap_nr		= X86_TRAP_PF;
111 
112 		force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr);
113 		return false;
114 	} else {
115 		return true;
116 	}
117 }
118 
/*
 * Emulate the vsyscall located at @address on behalf of the current task.
 *
 * @regs:    user register state at the time of the fault
 * @address: vsyscall address being executed; both callers in this file pass
 *           a value equal to regs->ip
 *
 * Returns false when no emulation is attempted (the task is not in 64-bit
 * user mode, or vsyscall=none), leaving the fault to the caller.  Returns
 * true once the fault has been consumed here: either the call was emulated
 * and a "ret" was simulated, or a signal (SIGSEGV, or SIGSYS via the
 * seccomp path) has been forced on the task.
 */
static bool __emulate_vsyscall(struct pt_regs *regs, unsigned long address)
{
	unsigned long caller;
	int vsyscall_nr, syscall_nr, tmp;
	long ret;
	unsigned long orig_dx;

	/* Confirm that the fault happened in 64-bit user mode */
	if (!user_64bit_mode(regs))
		return false;

	if (vsyscall_mode == NONE) {
		warn_bad_vsyscall(KERN_INFO, regs,
				  "vsyscall attempted with vsyscall=none");
		return false;
	}

	vsyscall_nr = addr_to_vsyscall_nr(address);

	/*
	 * Traced before validation, so the tracepoint also sees the negative
	 * value produced for addresses that are not a valid entry point.
	 */
	trace_emulate_vsyscall(vsyscall_nr);

	if (vsyscall_nr < 0) {
		warn_bad_vsyscall(KERN_WARNING, regs,
				  "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
		goto sigsegv;
	}

	/*
	 * Read the word at the top of the user stack; it becomes the new
	 * instruction pointer when the "ret" is emulated at do_ret.
	 */
	if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
		warn_bad_vsyscall(KERN_WARNING, regs,
				  "vsyscall with bad stack (exploit attempt?)");
		goto sigsegv;
	}

	/*
	 * Check for access_ok violations and find the syscall nr.
	 *
	 * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
	 * 64-bit, so we don't need to special-case it here.  For all the
	 * vsyscalls, NULL means "don't write anything" not "write it at
	 * address 0".
	 *
	 * vsyscall_nr is 0, 1 or 2 at this point (negative values were
	 * rejected above), so one of the cases always runs and syscall_nr is
	 * always assigned unless we jump to check_fault.
	 *
	 * When write_ok_or_segv() fails it has already forced a SIGSEGV with
	 * fault details; the check_fault path then logs the event and goes
	 * through the generic sigsegv exit as well.
	 */
	switch (vsyscall_nr) {
	case 0:
		if (!write_ok_or_segv(regs->di, sizeof(struct __kernel_old_timeval)) ||
		    !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
			ret = -EFAULT;
			goto check_fault;
		}

		syscall_nr = __NR_gettimeofday;
		break;

	case 1:
		if (!write_ok_or_segv(regs->di, sizeof(__kernel_old_time_t))) {
			ret = -EFAULT;
			goto check_fault;
		}

		syscall_nr = __NR_time;
		break;

	case 2:
		if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
		    !write_ok_or_segv(regs->si, sizeof(unsigned))) {
			ret = -EFAULT;
			goto check_fault;
		}

		syscall_nr = __NR_getcpu;
		break;
	}

	/*
	 * Handle seccomp.  regs->ip must be the original value.
	 * See seccomp_send_sigsys and Documentation/userspace-api/seccomp_filter.rst.
	 *
	 * We could optimize the seccomp disabled case, but performance
	 * here doesn't matter.
	 *
	 * A non-zero result from secure_computing() means the call is to be
	 * skipped.  If the call is not skipped but the syscall number was
	 * changed, or if ip was changed in either case, the task is killed
	 * with SIGSYS instead of being emulated.
	 */
	regs->orig_ax = syscall_nr;
	regs->ax = -ENOSYS;
	tmp = secure_computing();
	if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
		warn_bad_vsyscall(KERN_DEBUG, regs,
				  "seccomp tried to change syscall nr or ip");
		force_exit_sig(SIGSYS);
		return true;
	}
	/*
	 * NOTE(review): presumably -1 marks "not inside a syscall" for later
	 * signal/restart handling -- confirm.
	 */
	regs->orig_ax = -1;
	if (tmp)
		goto do_ret;  /* skip requested */

	/*
	 * With a real vsyscall, page faults cause SIGSEGV.
	 */
	ret = -EFAULT;
	switch (vsyscall_nr) {
	case 0:
		/* this decodes regs->di and regs->si on its own */
		ret = __x64_sys_gettimeofday(regs);
		break;

	case 1:
		/* this decodes regs->di on its own */
		ret = __x64_sys_time(regs);
		break;

	case 2:
		/* while we could clobber regs->dx, we didn't in the past... */
		orig_dx = regs->dx;
		regs->dx = 0;
		/* this decodes regs->di, regs->si and regs->dx on its own */
		ret = __x64_sys_getcpu(regs);
		regs->dx = orig_dx;
		break;
	}

check_fault:
	if (ret == -EFAULT) {
		/* Bad news -- userspace fed a bad pointer to a vsyscall. */
		warn_bad_vsyscall(KERN_INFO, regs,
				  "vsyscall fault (exploit attempt?)");
		goto sigsegv;
	}

	regs->ax = ret;

do_ret:
	/*
	 * Emulate a ret instruction.
	 *
	 * On the seccomp skip path regs->ax is not written again in this
	 * function, so it keeps -ENOSYS unless secure_computing() changed it.
	 */
	regs->ip = caller;
	regs->sp += 8;
	return true;

sigsegv:
	force_sig(SIGSEGV);
	return true;
}
256 
257 bool emulate_vsyscall_pf(unsigned long error_code, struct pt_regs *regs,
258 			 unsigned long address)
259 {
260 	/* Write faults or kernel-privilege faults never get fixed up. */
261 	if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
262 		return false;
263 
264 	/*
265 	 * Assume that faults at regs->ip are because of an instruction
266 	 * fetch. Return early and avoid emulation for faults during
267 	 * data accesses:
268 	 */
269 	if (address != regs->ip) {
270 		/* Failed vsyscall read */
271 		if (vsyscall_mode == EMULATE)
272 			return false;
273 
274 		/* User code tried and failed to read the vsyscall page. */
275 		warn_bad_vsyscall(KERN_INFO, regs,
276 				  "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround");
277 		return false;
278 	}
279 
280 	/*
281 	 * X86_PF_INSTR is only set when NX is supported.  When
282 	 * available, use it to double-check that the emulation code
283 	 * is only being used for instruction fetches:
284 	 */
285 	if (cpu_feature_enabled(X86_FEATURE_NX))
286 		WARN_ON_ONCE(!(error_code & X86_PF_INSTR));
287 
288 	return __emulate_vsyscall(regs, address);
289 }
290 
291 bool emulate_vsyscall_gp(struct pt_regs *regs)
292 {
293 	/* Without LASS, vsyscall accesses are expected to generate a #PF */
294 	if (!cpu_feature_enabled(X86_FEATURE_LASS))
295 		return false;
296 
297 	/* Emulate only if the RIP points to the vsyscall address */
298 	if (!is_vsyscall_vaddr(regs->ip))
299 		return false;
300 
301 	return __emulate_vsyscall(regs, regs->ip);
302 }
303 
/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  It covers
 * only the 64-bit vsyscall page; 32-bit has a real VMA and does not need
 * special handling anymore.
 */

/* vm_ops->name hook: the fixed label of the pseudo VMA. */
static const char *gate_vma_name(struct vm_area_struct *vma)
{
	return "[vsyscall]";
}
/* The pseudo VMA only provides a name; no other operations are set. */
static const struct vm_operations_struct gate_vma_ops = {
	.name = gate_vma_name,
};
/*
 * One page at VSYSCALL_ADDR, initially readable and executable.
 * map_vsyscall() reduces vm_flags to VM_EXEC alone in XONLY mode.
 */
static struct vm_area_struct gate_vma __ro_after_init = {
	.vm_start	= VSYSCALL_ADDR,
	.vm_end		= VSYSCALL_ADDR + PAGE_SIZE,
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC,
	.vm_ops		= &gate_vma_ops,
};
323 
324 struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
325 {
326 #ifdef CONFIG_COMPAT
327 	if (!mm || !test_bit(MM_CONTEXT_HAS_VSYSCALL, &mm->context.flags))
328 		return NULL;
329 #endif
330 	if (vsyscall_mode == NONE)
331 		return NULL;
332 	return &gate_vma;
333 }
334 
335 int in_gate_area(struct mm_struct *mm, unsigned long addr)
336 {
337 	struct vm_area_struct *vma = get_gate_vma(mm);
338 
339 	if (!vma)
340 		return 0;
341 
342 	return (addr >= vma->vm_start) && (addr < vma->vm_end);
343 }
344 
345 /*
346  * Use this when you have no reliable mm, typically from interrupt
347  * context. It is less reliable than using a task's mm and may give
348  * false positives.
349  */
350 int in_gate_area_no_mm(unsigned long addr)
351 {
352 	return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
353 }
354 
355 /*
356  * The VSYSCALL page is the only user-accessible page in the kernel address
357  * range.  Normally, the kernel page tables can have _PAGE_USER clear, but
358  * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls
359  * are enabled.
360  *
361  * Some day we may create a "minimal" vsyscall mode in which we emulate
362  * vsyscalls but leave the page not present.  If so, we skip calling
363  * this.
364  */
365 void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
366 {
367 	pgd_t *pgd;
368 	p4d_t *p4d;
369 	pud_t *pud;
370 	pmd_t *pmd;
371 
372 	pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
373 	set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
374 	p4d = p4d_offset(pgd, VSYSCALL_ADDR);
375 	set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER));
376 	pud = pud_offset(p4d, VSYSCALL_ADDR);
377 	set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER));
378 	pmd = pmd_offset(pud, VSYSCALL_ADDR);
379 	set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER));
380 }
381 
382 void __init map_vsyscall(void)
383 {
384 	extern char __vsyscall_page;
385 	unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
386 
387 	/*
388 	 * For full emulation, the page needs to exist for real.  In
389 	 * execute-only mode, there is no PTE at all backing the vsyscall
390 	 * page.
391 	 */
392 	if (vsyscall_mode == EMULATE) {
393 		__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
394 			     PAGE_KERNEL_VVAR);
395 		set_vsyscall_pgtable_user_bits(swapper_pg_dir);
396 	}
397 
398 	if (vsyscall_mode == XONLY)
399 		vm_flags_init(&gate_vma, VM_EXEC);
400 
401 	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
402 		     (unsigned long)VSYSCALL_ADDR);
403 }
404