1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net> 4 * 5 * Based on the original implementation which is: 6 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE 7 * Copyright 2003 Andi Kleen, SuSE Labs. 8 * 9 * Parts of the original code have been moved to arch/x86/vdso/vma.c 10 * 11 * This file implements vsyscall emulation. vsyscalls are a legacy ABI: 12 * Userspace can request certain kernel services by calling fixed 13 * addresses. This concept is problematic: 14 * 15 * - It interferes with ASLR. 16 * - It's awkward to write code that lives in kernel addresses but is 17 * callable by userspace at fixed addresses. 18 * - The whole concept is impossible for 32-bit compat userspace. 19 * - UML cannot easily virtualize a vsyscall. 20 * 21 * As of mid-2014, I believe that there is no new userspace code that 22 * will use a vsyscall if the vDSO is present. I hope that there will 23 * soon be no new userspace code that will ever use a vsyscall. 24 * 25 * The code in this file emulates vsyscalls when notified of a page 26 * fault or a general protection fault to a vsyscall address. 27 */ 28 29 #include <linux/kernel.h> 30 #include <linux/timer.h> 31 #include <linux/sched/signal.h> 32 #include <linux/mm_types.h> 33 #include <linux/syscalls.h> 34 #include <linux/ratelimit.h> 35 36 #include <asm/vsyscall.h> 37 #include <asm/unistd.h> 38 #include <asm/fixmap.h> 39 #include <asm/traps.h> 40 41 #define CREATE_TRACE_POINTS 42 #include "vsyscall_trace.h" 43 44 static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init = 45 #ifdef CONFIG_LEGACY_VSYSCALL_NONE 46 NONE; 47 #elif defined(CONFIG_LEGACY_VSYSCALL_XONLY) 48 XONLY; 49 #else 50 #error VSYSCALL config is broken 51 #endif 52 53 static int __init vsyscall_setup(char *str) 54 { 55 if (str) { 56 if (!strcmp("emulate", str)) 57 vsyscall_mode = EMULATE; 58 else if (!strcmp("xonly", str)) 59 vsyscall_mode = XONLY; 60 else if (!strcmp("none", str)) 61 vsyscall_mode = NONE; 62 else 63 return -EINVAL; 64 65 if (cpu_feature_enabled(X86_FEATURE_LASS) && vsyscall_mode == EMULATE) { 66 setup_clear_cpu_cap(X86_FEATURE_LASS); 67 pr_warn_once("x86/cpu: Disabling LASS due to vsyscall=emulate\n"); 68 } 69 70 return 0; 71 } 72 73 return -EINVAL; 74 } 75 early_param("vsyscall", vsyscall_setup); 76 77 static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, 78 const char *message) 79 { 80 if (!show_unhandled_signals) 81 return; 82 83 printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n", 84 level, current->comm, task_pid_nr(current), 85 message, regs->ip, regs->cs, 86 regs->sp, regs->ax, regs->si, regs->di); 87 } 88 89 static int addr_to_vsyscall_nr(unsigned long addr) 90 { 91 int nr; 92 93 if ((addr & ~0xC00UL) != VSYSCALL_ADDR) 94 return -EINVAL; 95 96 nr = (addr & 0xC00UL) >> 10; 97 if (nr >= 3) 98 return -EINVAL; 99 100 return nr; 101 } 102 103 static bool write_ok_or_segv(unsigned long ptr, size_t size) 104 { 105 if (!access_ok((void __user *)ptr, size)) { 106 struct thread_struct *thread = ¤t->thread; 107 108 thread->error_code = X86_PF_USER | X86_PF_WRITE; 109 thread->cr2 = ptr; 110 thread->trap_nr = X86_TRAP_PF; 111 112 force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr); 113 return false; 114 } else { 115 return true; 116 } 117 } 118 119 static bool __emulate_vsyscall(struct pt_regs *regs, unsigned long address) 120 { 121 unsigned long caller; 122 int vsyscall_nr, syscall_nr, tmp; 123 long ret; 124 unsigned long orig_dx; 125 126 /* Confirm that the fault happened in 64-bit user mode */ 127 if (!user_64bit_mode(regs)) 128 return false; 129 130 if (vsyscall_mode == NONE) { 131 warn_bad_vsyscall(KERN_INFO, regs, 132 "vsyscall attempted with vsyscall=none"); 133 return false; 134 } 135 136 vsyscall_nr = addr_to_vsyscall_nr(address); 137 138 trace_emulate_vsyscall(vsyscall_nr); 139 140 if (vsyscall_nr < 0) { 141 warn_bad_vsyscall(KERN_WARNING, regs, 142 "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); 143 goto sigsegv; 144 } 145 146 if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { 147 warn_bad_vsyscall(KERN_WARNING, regs, 148 "vsyscall with bad stack (exploit attempt?)"); 149 goto sigsegv; 150 } 151 152 /* 153 * Check for access_ok violations and find the syscall nr. 154 * 155 * NULL is a valid user pointer (in the access_ok sense) on 32-bit and 156 * 64-bit, so we don't need to special-case it here. For all the 157 * vsyscalls, NULL means "don't write anything" not "write it at 158 * address 0". 159 */ 160 switch (vsyscall_nr) { 161 case 0: 162 if (!write_ok_or_segv(regs->di, sizeof(struct __kernel_old_timeval)) || 163 !write_ok_or_segv(regs->si, sizeof(struct timezone))) { 164 ret = -EFAULT; 165 goto check_fault; 166 } 167 168 syscall_nr = __NR_gettimeofday; 169 break; 170 171 case 1: 172 if (!write_ok_or_segv(regs->di, sizeof(__kernel_old_time_t))) { 173 ret = -EFAULT; 174 goto check_fault; 175 } 176 177 syscall_nr = __NR_time; 178 break; 179 180 case 2: 181 if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || 182 !write_ok_or_segv(regs->si, sizeof(unsigned))) { 183 ret = -EFAULT; 184 goto check_fault; 185 } 186 187 syscall_nr = __NR_getcpu; 188 break; 189 } 190 191 /* 192 * Handle seccomp. regs->ip must be the original value. 193 * See seccomp_send_sigsys and Documentation/userspace-api/seccomp_filter.rst. 194 * 195 * We could optimize the seccomp disabled case, but performance 196 * here doesn't matter. 197 */ 198 regs->orig_ax = syscall_nr; 199 regs->ax = -ENOSYS; 200 tmp = secure_computing(); 201 if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { 202 warn_bad_vsyscall(KERN_DEBUG, regs, 203 "seccomp tried to change syscall nr or ip"); 204 force_exit_sig(SIGSYS); 205 return true; 206 } 207 regs->orig_ax = -1; 208 if (tmp) 209 goto do_ret; /* skip requested */ 210 211 /* 212 * With a real vsyscall, page faults cause SIGSEGV. 213 */ 214 ret = -EFAULT; 215 switch (vsyscall_nr) { 216 case 0: 217 /* this decodes regs->di and regs->si on its own */ 218 ret = __x64_sys_gettimeofday(regs); 219 break; 220 221 case 1: 222 /* this decodes regs->di on its own */ 223 ret = __x64_sys_time(regs); 224 break; 225 226 case 2: 227 /* while we could clobber regs->dx, we didn't in the past... */ 228 orig_dx = regs->dx; 229 regs->dx = 0; 230 /* this decodes regs->di, regs->si and regs->dx on its own */ 231 ret = __x64_sys_getcpu(regs); 232 regs->dx = orig_dx; 233 break; 234 } 235 236 check_fault: 237 if (ret == -EFAULT) { 238 /* Bad news -- userspace fed a bad pointer to a vsyscall. */ 239 warn_bad_vsyscall(KERN_INFO, regs, 240 "vsyscall fault (exploit attempt?)"); 241 goto sigsegv; 242 } 243 244 regs->ax = ret; 245 246 do_ret: 247 /* Emulate a ret instruction. */ 248 regs->ip = caller; 249 regs->sp += 8; 250 return true; 251 252 sigsegv: 253 force_sig(SIGSEGV); 254 return true; 255 } 256 257 bool emulate_vsyscall_pf(unsigned long error_code, struct pt_regs *regs, 258 unsigned long address) 259 { 260 /* Write faults or kernel-privilege faults never get fixed up. */ 261 if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER) 262 return false; 263 264 /* 265 * Assume that faults at regs->ip are because of an instruction 266 * fetch. Return early and avoid emulation for faults during 267 * data accesses: 268 */ 269 if (address != regs->ip) { 270 /* Failed vsyscall read */ 271 if (vsyscall_mode == EMULATE) 272 return false; 273 274 /* User code tried and failed to read the vsyscall page. */ 275 warn_bad_vsyscall(KERN_INFO, regs, 276 "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround"); 277 return false; 278 } 279 280 /* 281 * X86_PF_INSTR is only set when NX is supported. When 282 * available, use it to double-check that the emulation code 283 * is only being used for instruction fetches: 284 */ 285 if (cpu_feature_enabled(X86_FEATURE_NX)) 286 WARN_ON_ONCE(!(error_code & X86_PF_INSTR)); 287 288 return __emulate_vsyscall(regs, address); 289 } 290 291 bool emulate_vsyscall_gp(struct pt_regs *regs) 292 { 293 /* Without LASS, vsyscall accesses are expected to generate a #PF */ 294 if (!cpu_feature_enabled(X86_FEATURE_LASS)) 295 return false; 296 297 /* Emulate only if the RIP points to the vsyscall address */ 298 if (!is_vsyscall_vaddr(regs->ip)) 299 return false; 300 301 return __emulate_vsyscall(regs, regs->ip); 302 } 303 304 /* 305 * A pseudo VMA to allow ptrace access for the vsyscall page. This only 306 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does 307 * not need special handling anymore: 308 */ 309 static const char *gate_vma_name(struct vm_area_struct *vma) 310 { 311 return "[vsyscall]"; 312 } 313 static const struct vm_operations_struct gate_vma_ops = { 314 .name = gate_vma_name, 315 }; 316 static struct vm_area_struct gate_vma __ro_after_init = { 317 .vm_start = VSYSCALL_ADDR, 318 .vm_end = VSYSCALL_ADDR + PAGE_SIZE, 319 .vm_page_prot = PAGE_READONLY_EXEC, 320 .vm_flags = VM_READ | VM_EXEC, 321 .vm_ops = &gate_vma_ops, 322 }; 323 324 struct vm_area_struct *get_gate_vma(struct mm_struct *mm) 325 { 326 #ifdef CONFIG_COMPAT 327 if (!mm || !test_bit(MM_CONTEXT_HAS_VSYSCALL, &mm->context.flags)) 328 return NULL; 329 #endif 330 if (vsyscall_mode == NONE) 331 return NULL; 332 return &gate_vma; 333 } 334 335 int in_gate_area(struct mm_struct *mm, unsigned long addr) 336 { 337 struct vm_area_struct *vma = get_gate_vma(mm); 338 339 if (!vma) 340 return 0; 341 342 return (addr >= vma->vm_start) && (addr < vma->vm_end); 343 } 344 345 /* 346 * Use this when you have no reliable mm, typically from interrupt 347 * context. It is less reliable than using a task's mm and may give 348 * false positives. 349 */ 350 int in_gate_area_no_mm(unsigned long addr) 351 { 352 return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; 353 } 354 355 /* 356 * The VSYSCALL page is the only user-accessible page in the kernel address 357 * range. Normally, the kernel page tables can have _PAGE_USER clear, but 358 * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls 359 * are enabled. 360 * 361 * Some day we may create a "minimal" vsyscall mode in which we emulate 362 * vsyscalls but leave the page not present. If so, we skip calling 363 * this. 364 */ 365 void __init set_vsyscall_pgtable_user_bits(pgd_t *root) 366 { 367 pgd_t *pgd; 368 p4d_t *p4d; 369 pud_t *pud; 370 pmd_t *pmd; 371 372 pgd = pgd_offset_pgd(root, VSYSCALL_ADDR); 373 set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); 374 p4d = p4d_offset(pgd, VSYSCALL_ADDR); 375 set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER)); 376 pud = pud_offset(p4d, VSYSCALL_ADDR); 377 set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER)); 378 pmd = pmd_offset(pud, VSYSCALL_ADDR); 379 set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER)); 380 } 381 382 void __init map_vsyscall(void) 383 { 384 extern char __vsyscall_page; 385 unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); 386 387 /* 388 * For full emulation, the page needs to exist for real. In 389 * execute-only mode, there is no PTE at all backing the vsyscall 390 * page. 391 */ 392 if (vsyscall_mode == EMULATE) { 393 __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, 394 PAGE_KERNEL_VVAR); 395 set_vsyscall_pgtable_user_bits(swapper_pg_dir); 396 } 397 398 if (vsyscall_mode == XONLY) 399 vm_flags_init(&gate_vma, VM_EXEC); 400 401 BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != 402 (unsigned long)VSYSCALL_ADDR); 403 } 404