/*
 * Compatibility mode system call entry point for x86-64.
 *
 * Copyright 2000-2002 Andi Kleen, SuSE Labs.
 *
 * Syntax: GNU as, AT&T operand order, run through the C preprocessor.
 */
#include "calling.h"
#include <asm/asm-offsets.h>
#include <asm/current.h>
#include <asm/errno.h>
#include <asm/ia32_unistd.h>
#include <asm/thread_info.h>
#include <asm/segment.h>
#include <asm/irqflags.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <linux/linkage.h>
#include <linux/err.h>

/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
#define AUDIT_ARCH_I386		(EM_386|__AUDIT_ARCH_LE)
#define __AUDIT_ARCH_LE		0x40000000

/*
 * Without syscall auditing the fast-path exits have no audit handling,
 * so their "audit" targets alias the generic slow return path.
 */
#ifndef CONFIG_AUDITSYSCALL
# define sysexit_audit		ia32_ret_from_sys_call
# define sysretl_audit		ia32_ret_from_sys_call
#endif

	.section .entry.text, "ax"

#ifdef CONFIG_PARAVIRT
/* Native implementation of the return sequence: swapgs, then sysretl. */
ENTRY(native_usergs_sysret32)
	swapgs
	sysretl
ENDPROC(native_usergs_sysret32)
#endif

/*
 * 32-bit SYSENTER instruction entry.
 *
 * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
 * IF and VM in rflags are cleared (IOW: interrupts are off).
 * SYSENTER does not save anything on the stack,
 * and does not save old rip (!!!) and rflags.
 *
 * Arguments:
 * eax  system call number
 * ebx  arg1
 * ecx  arg2
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * ebp  user stack
 * 0(%ebp) arg6
 *
 * This is purely a fast path. For anything complicated we use the int 0x80
 * path below. We set up a complete hardware stack frame to share code
 * with the int 0x80 path.
 */
ENTRY(entry_SYSENTER_compat)
	/*
	 * Interrupts are off on entry.
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	SWAPGS_UNSAFE_STACK
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
	ENABLE_INTERRUPTS(CLBR_NONE)

	/* Zero-extending 32-bit regs, do not remove */
	movl	%ebp, %ebp
	movl	%eax, %eax

	movl	ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d

	/* Construct struct pt_regs on stack */
	pushq	$__USER32_DS		/* pt_regs->ss */
	pushq	%rbp			/* pt_regs->sp */
	pushfq				/* pt_regs->flags */
	pushq	$__USER32_CS		/* pt_regs->cs */
	pushq	%r10			/* pt_regs->ip = thread_info->sysenter_return */
	pushq	%rax			/* pt_regs->orig_ax */
	pushq	%rdi			/* pt_regs->di */
	pushq	%rsi			/* pt_regs->si */
	pushq	%rdx			/* pt_regs->dx */
	pushq	%rcx			/* pt_regs->cx */
	pushq	$-ENOSYS		/* pt_regs->ax */
	cld
	sub	$(10*8), %rsp		/* pt_regs->r8-11, bp, bx, r12-15 not saved */

	/*
	 * no need to do an access_ok check here because rbp has been
	 * 32-bit zero extended
	 */
	ASM_STAC
1:	movl	(%rbp), %ebp		/* arg6, fetched from the user stack */
	_ASM_EXTABLE(1b, ia32_badarg)
	ASM_CLAC

	/*
	 * Sysenter doesn't filter flags, so we need to clear NT
	 * ourselves.  To save a few cycles, we can check whether
	 * NT was set instead of doing an unconditional popfq.
	 */
	testl	$X86_EFLAGS_NT, EFLAGS(%rsp)
	jnz	sysenter_fix_flags
sysenter_flags_fixed:

	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	sysenter_tracesys

sysenter_do_call:
	/* 32-bit syscall -> 64-bit C ABI argument conversion */
	movl	%edi, %r8d		/* arg5 */
	movl	%ebp, %r9d		/* arg6 */
	xchg	%ecx, %esi		/* rsi:arg2, rcx:arg4 */
	movl	%ebx, %edi		/* arg1 */
	movl	%edx, %edx		/* arg3 (zero extension) */
sysenter_dispatch:
	/* Out-of-range numbers skip the call; pt_regs->ax stays -ENOSYS */
	cmpq	$(IA32_NR_syscalls-1), %rax
	ja	1f
	call	*ia32_sys_call_table(, %rax, 8)
	movq	%rax, RAX(%rsp)
1:
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	sysexit_audit
sysexit_from_sys_call:
	/*
	 * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an
	 * NMI between STI and SYSEXIT has poorly specified behavior,
	 * and an NMI followed by an IRQ with usergs is fatal.  So
	 * we just pretend we're using SYSEXIT but we really use
	 * SYSRETL instead.
	 *
	 * This code path is still called 'sysexit' because it pairs
	 * with 'sysenter' and it uses the SYSENTER calling convention.
	 */
	andl	$~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
	movl	RIP(%rsp), %ecx		/* User %eip */
	movq	RAX(%rsp), %rax
	RESTORE_RSI_RDI
	xorl	%edx, %edx		/* Do not leak kernel information */
	xorq	%r8, %r8
	xorq	%r9, %r9
	xorq	%r10, %r10
	movl	EFLAGS(%rsp), %r11d	/* User eflags */
	TRACE_IRQS_ON

	/*
	 * SYSRETL works even on Intel CPUs.  Use it in preference to SYSEXIT,
	 * since it avoids a dicey window with interrupts enabled.
	 */
	movl	RSP(%rsp), %esp

	/*
	 * USERGS_SYSRET32 does:
	 *  gsbase = user's gs base
	 *  eip = ecx
	 *  rflags = r11
	 *  cs = __USER32_CS
	 *  ss = __USER_DS
	 *
	 * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does:
	 *
	 *  pop %ebp
	 *  pop %edx
	 *  pop %ecx
	 *
	 * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to
	 * avoid info leaks.  R11 ends up with VDSO32_SYSENTER_RETURN's
	 * address (already known to user code), and R12-R15 are
	 * callee-saved and therefore don't contain any interesting
	 * kernel data.
	 */
	USERGS_SYSRET32

#ifdef CONFIG_AUDITSYSCALL
	.macro auditsys_entry_common
	/*
	 * At this point, registers hold syscall args in the 32-bit syscall ABI:
	 * EAX is syscall number, the 6 args are in EBX,ECX,EDX,ESI,EDI,EBP.
	 *
	 * We want to pass them to __audit_syscall_entry(), which is a 64-bit
	 * C function with 5 parameters, so shuffle them to match what
	 * the function expects: RDI,RSI,RDX,RCX,R8.
	 */
	movl	%esi, %r8d		/* arg5 (R8 ) <= 4th syscall arg (ESI) */
	xchg	%ecx, %edx		/* arg4 (RCX) <= 3rd syscall arg (EDX) */
					/* arg3 (RDX) <= 2nd syscall arg (ECX) */
	movl	%ebx, %esi		/* arg2 (RSI) <= 1st syscall arg (EBX) */
	movl	%eax, %edi		/* arg1 (RDI) <= syscall number  (EAX) */
	call	__audit_syscall_entry

	/*
	 * We are going to jump back to the syscall dispatch code.
	 * Prepare syscall args as required by the 64-bit C ABI.
	 * Registers clobbered by __audit_syscall_entry() are
	 * loaded from pt_regs on stack:
	 */
	movl	ORIG_RAX(%rsp), %eax	/* syscall number */
	movl	%ebx, %edi		/* arg1 */
	movl	RCX(%rsp), %esi		/* arg2 */
	movl	RDX(%rsp), %edx		/* arg3 */
	movl	RSI(%rsp), %ecx		/* arg4 */
	movl	RDI(%rsp), %r8d		/* arg5 */
	.endm

	/*
	 * auditsys_exit exit
	 *
	 * Report the syscall result to __audit_syscall_exit() and then
	 * jump to \exit if no other exit work is pending.  If any
	 * _TIF_ALLWORK_MASK bit other than _TIF_SYSCALL_AUDIT is set,
	 * either before or after the audit call, the slow return path
	 * is taken instead.
	 */
	.macro auditsys_exit exit
	testl	$(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	ia32_ret_from_sys_call
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	/*
	 * Take the return value from pt_regs->ax, not from the live %eax:
	 * if the dispatch code skipped the call because the syscall number
	 * was out of range, %eax still holds that number, while pt_regs->ax
	 * holds the -ENOSYS stored by the entry prologue.
	 */
	movl	RAX(%rsp), %eax
	movl	%eax, %esi		/* second arg, syscall return value */
	cmpl	$-MAX_ERRNO, %eax	/* is it an error ? */
	jbe	1f
	movslq	%eax, %rsi		/* if error sign extend to 64 bits */
	/* Neither jbe nor movslq changes flags: setbe still sees the cmpl */
1:	setbe	%al			/* 1 if not an error, 0 if error */
	movzbl	%al, %edi		/* zero-extend that into %edi */
	call	__audit_syscall_exit
	movl	$(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	testl	%edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jz	\exit
	xorl	%eax, %eax		/* Do not leak kernel information */
	movq	%rax, R11(%rsp)
	movq	%rax, R10(%rsp)
	movq	%rax, R9(%rsp)
	movq	%rax, R8(%rsp)
	jmp	int_with_check
	.endm

sysenter_auditsys:
	auditsys_entry_common
	movl	%ebp, %r9d		/* reload 6th syscall arg */
	jmp	sysenter_dispatch

sysexit_audit:
	auditsys_exit sysexit_from_sys_call
#endif

sysenter_fix_flags:
	/* Load rflags with only IF and the always-set fixed bit (clears NT) */
	pushq	$(X86_EFLAGS_IF|X86_EFLAGS_FIXED)
	popfq
	jmp	sysenter_flags_fixed

sysenter_tracesys:
#ifdef CONFIG_AUDITSYSCALL
	/* Only the audit flag is set: take the cheaper audit-only path */
	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jz	sysenter_auditsys
#endif
	SAVE_EXTRA_REGS
	xorl	%eax, %eax		/* Do not leak kernel information */
	movq	%rax, R11(%rsp)
	movq	%rax, R10(%rsp)
	movq	%rax, R9(%rsp)
	movq	%rax, R8(%rsp)
	movq	%rsp, %rdi		/* &pt_regs -> arg1 */
	call	syscall_trace_enter

	/* Reload arg registers from stack. (see ia32_tracesys) */
	movl	RCX(%rsp), %ecx
	movl	RDX(%rsp), %edx
	movl	RSI(%rsp), %esi
	movl	RDI(%rsp), %edi
	movl	%eax, %eax		/* zero extension */

	RESTORE_EXTRA_REGS
	jmp	sysenter_do_call
ENDPROC(entry_SYSENTER_compat)

/*
 * 32-bit SYSCALL instruction entry.
 *
 * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed). SYSCALL does not save anything on the stack
 * and does not change rsp.
 *
 * Note: rflags saving+masking-with-MSR happens only in Long mode
 * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it).
 * Don't get confused: rflags saving+masking depends on Long Mode Active bit
 * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
 * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
 *
 * Arguments:
 * eax  system call number
 * ecx  return address
 * ebx  arg1
 * ebp  arg2	(note: not saved in the stack frame, should not be touched)
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * esp  user stack
 * 0(%esp) arg6
 *
 * This is purely a fast path. For anything complicated we use the int 0x80
 * path below. We set up a complete hardware stack frame to share code
 * with the int 0x80 path.
 */
ENTRY(entry_SYSCALL_compat)
	/*
	 * Interrupts are off on entry.
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	SWAPGS_UNSAFE_STACK
	movl	%esp, %r8d		/* user stack pointer, zero-extended */
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
	ENABLE_INTERRUPTS(CLBR_NONE)

	/* Zero-extending 32-bit regs, do not remove */
	movl	%eax, %eax

	/* Construct struct pt_regs on stack */
	pushq	$__USER32_DS		/* pt_regs->ss */
	pushq	%r8			/* pt_regs->sp */
	pushq	%r11			/* pt_regs->flags */
	pushq	$__USER32_CS		/* pt_regs->cs */
	pushq	%rcx			/* pt_regs->ip */
	pushq	%rax			/* pt_regs->orig_ax */
	pushq	%rdi			/* pt_regs->di */
	pushq	%rsi			/* pt_regs->si */
	pushq	%rdx			/* pt_regs->dx */
	pushq	%rbp			/* pt_regs->cx */
	movl	%ebp, %ecx		/* arg2 arrives in ebp; put it in ecx like the other entries */
	pushq	$-ENOSYS		/* pt_regs->ax */
	sub	$(10*8), %rsp		/* pt_regs->r8-11, bp, bx, r12-15 not saved */

	/*
	 * No need to do an access_ok check here because r8 has been
	 * 32-bit zero extended:
	 */
	ASM_STAC
1:	movl	(%r8), %r9d		/* arg6, fetched from the user stack */
	_ASM_EXTABLE(1b, ia32_badarg)
	ASM_CLAC
	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	cstar_tracesys

cstar_do_call:
	/* 32-bit syscall -> 64-bit C ABI argument conversion */
	movl	%edi, %r8d		/* arg5 */
	/* r9 already loaded */		/* arg6 */
	xchg	%ecx, %esi		/* rsi:arg2, rcx:arg4 */
	movl	%ebx, %edi		/* arg1 */
	movl	%edx, %edx		/* arg3 (zero extension) */

cstar_dispatch:
	/* Out-of-range numbers skip the call; pt_regs->ax stays -ENOSYS */
	cmpq	$(IA32_NR_syscalls-1), %rax
	ja	1f

	call	*ia32_sys_call_table(, %rax, 8)
	movq	%rax, RAX(%rsp)
1:
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	sysretl_audit

sysretl_from_sys_call:
	andl	$~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
	RESTORE_RSI_RDI_RDX
	movl	RIP(%rsp), %ecx
	movl	EFLAGS(%rsp), %r11d
	movq	RAX(%rsp), %rax
	xorq	%r10, %r10
	xorq	%r9, %r9
	xorq	%r8, %r8
	TRACE_IRQS_ON
	movl	RSP(%rsp), %esp
	/*
	 * 64-bit->32-bit SYSRET restores eip from ecx,
	 * eflags from r11 (but RF and VM bits are forced to 0),
	 * cs and ss are loaded from MSRs.
	 * (Note: 32-bit->32-bit SYSRET is different: since r11
	 * does not exist, it merely sets eflags.IF=1).
	 *
	 * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
	 * descriptor is not reinitialized.  This means that we must
	 * avoid SYSRET with SS == NULL, which could happen if we schedule,
	 * exit the kernel, and re-enter using an interrupt vector.  (All
	 * interrupt entries on x86_64 set SS to NULL.)  We prevent that
	 * from happening by reloading SS in __switch_to.
	 */
	USERGS_SYSRET32

#ifdef CONFIG_AUDITSYSCALL
cstar_auditsys:
	movl	%r9d, R9(%rsp)		/* register to be clobbered by call */
	auditsys_entry_common
	movl	R9(%rsp), %r9d		/* reload 6th syscall arg */
	jmp	cstar_dispatch

sysretl_audit:
	auditsys_exit sysretl_from_sys_call
#endif

cstar_tracesys:
#ifdef CONFIG_AUDITSYSCALL
	/* Only the audit flag is set: take the cheaper audit-only path */
	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jz	cstar_auditsys
#endif
	xchgl	%r9d, %ebp		/* ebp <= arg6, r9d <= original ebp */
	SAVE_EXTRA_REGS
	xorl	%eax, %eax		/* Do not leak kernel information */
	movq	%rax, R11(%rsp)
	movq	%rax, R10(%rsp)
	movq	%r9, R9(%rsp)
	movq	%rax, R8(%rsp)
	movq	%rsp, %rdi		/* &pt_regs -> arg1 */
	call	syscall_trace_enter
	movl	R9(%rsp), %r9d

	/* Reload arg registers from stack. (see ia32_tracesys) */
	movl	RCX(%rsp), %ecx
	movl	RDX(%rsp), %edx
	movl	RSI(%rsp), %esi
	movl	RDI(%rsp), %edi
	movl	%eax, %eax		/* zero extension */

	RESTORE_EXTRA_REGS
	xchgl	%ebp, %r9d		/* undo the swap done before SAVE_EXTRA_REGS */
	jmp	cstar_do_call
END(entry_SYSCALL_compat)

/*
 * Fault target for the user-stack arg6 loads above (see the _ASM_EXTABLE
 * entries): record -EFAULT as the return value, then fall through to the
 * common slow return path.
 */
ia32_badarg:
	ASM_CLAC
	movq	$-EFAULT, RAX(%rsp)
ia32_ret_from_sys_call:
	xorl	%eax, %eax		/* Do not leak kernel information */
	movq	%rax, R11(%rsp)
	movq	%rax, R10(%rsp)
	movq	%rax, R9(%rsp)
	movq	%rax, R8(%rsp)
	jmp	int_ret_from_sys_call

/*
 * Emulated IA32 system calls via int 0x80.
 *
 * Arguments:
 * eax  system call number
 * ebx  arg1
 * ecx  arg2
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * ebp  arg6	(note: not saved in the stack frame, should not be touched)
 *
 * Notes:
 * Uses the same stack frame as the x86-64 version.
 * All registers except eax must be saved (but ptrace may violate that).
 * Arguments are zero extended. For system calls that want sign extension and
 * take long arguments a wrapper is needed. Most calls can just be called
 * directly.
 * Assumes it is only called from user space and entered with interrupts off.
 */

ENTRY(entry_INT80_compat)
	/*
	 * Interrupts are off on entry.
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	SWAPGS
	ENABLE_INTERRUPTS(CLBR_NONE)

	/* Zero-extending 32-bit regs, do not remove */
	movl	%eax, %eax

	/* Construct struct pt_regs on stack (iret frame is already on stack) */
	pushq	%rax			/* pt_regs->orig_ax */
	pushq	%rdi			/* pt_regs->di */
	pushq	%rsi			/* pt_regs->si */
	pushq	%rdx			/* pt_regs->dx */
	pushq	%rcx			/* pt_regs->cx */
	pushq	$-ENOSYS		/* pt_regs->ax */
	pushq	$0			/* pt_regs->r8 */
	pushq	$0			/* pt_regs->r9 */
	pushq	$0			/* pt_regs->r10 */
	pushq	$0			/* pt_regs->r11 */
	cld
	sub	$(6*8), %rsp		/* pt_regs->bp, bx, r12-15 not saved */

	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	ia32_tracesys

ia32_do_call:
	/* 32-bit syscall -> 64-bit C ABI argument conversion */
	movl	%edi, %r8d		/* arg5 */
	movl	%ebp, %r9d		/* arg6 */
	xchg	%ecx, %esi		/* rsi:arg2, rcx:arg4 */
	movl	%ebx, %edi		/* arg1 */
	movl	%edx, %edx		/* arg3 (zero extension) */
	/* Out-of-range numbers skip the call; pt_regs->ax stays -ENOSYS */
	cmpq	$(IA32_NR_syscalls-1), %rax
	ja	1f

	call	*ia32_sys_call_table(, %rax, 8)
	movq	%rax, RAX(%rsp)
1:
	jmp	int_ret_from_sys_call

ia32_tracesys:
	SAVE_EXTRA_REGS
	movq	%rsp, %rdi		/* &pt_regs -> arg1 */
	call	syscall_trace_enter
	/*
	 * Reload arg registers from stack in case ptrace changed them.
	 * Don't reload %eax because syscall_trace_enter() returned
	 * the %rax value we should see.  But do truncate it to 32 bits.
	 * If it's -1 to make us punt the syscall, then (u32)-1 is still
	 * an appropriately invalid value.
	 */
	movl	RCX(%rsp), %ecx
	movl	RDX(%rsp), %edx
	movl	RSI(%rsp), %esi
	movl	RDI(%rsp), %edi
	movl	%eax, %eax		/* zero extension */
	RESTORE_EXTRA_REGS
	jmp	ia32_do_call
END(entry_INT80_compat)

	/*
	 * PTREGSCALL label, func
	 *
	 * Emit a global stub \label that loads the address of \func into
	 * %rax and jumps to ia32_ptregs_common, which performs the call.
	 */
	.macro PTREGSCALL label, func
	ALIGN
GLOBAL(\label)
	leaq	\func(%rip), %rax
	jmp	ia32_ptregs_common
	.endm

	PTREGSCALL stub32_rt_sigreturn,	sys32_rt_sigreturn
	PTREGSCALL stub32_sigreturn,	sys32_sigreturn
	PTREGSCALL stub32_fork,		sys_fork
	PTREGSCALL stub32_vfork,	sys_vfork

	ALIGN
GLOBAL(stub32_clone)
	leaq	sys_clone(%rip), %rax
	/*
	 * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr).
	 * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val).
	 *
	 * The native 64-bit kernel's sys_clone() implements the latter,
	 * so we need to swap arguments here before calling it:
	 */
	xchg	%r8, %rcx
	jmp	ia32_ptregs_common

	/*
	 * Common tail of the stubs above.
	 * In:  %rax = address of the function to call.
	 * The extra registers are saved around the call at offset 8
	 * (presumably to skip the stub's return address on the stack --
	 * confirm against SAVE_EXTRA_REGS in calling.h).
	 */
	ALIGN
ia32_ptregs_common:
	SAVE_EXTRA_REGS 8
	call	*%rax
	RESTORE_EXTRA_REGS 8
	ret
END(ia32_ptregs_common)