/*
 * Compatibility mode system call entry point for x86-64.
 *
 * Copyright 2000-2002 Andi Kleen, SuSE Labs.
 */
#include "calling.h"
#include <asm/asm-offsets.h>
#include <asm/current.h>
#include <asm/errno.h>
#include <asm/ia32_unistd.h>
#include <asm/thread_info.h>
#include <asm/segment.h>
#include <asm/irqflags.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <linux/linkage.h>
#include <linux/err.h>

/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
#define AUDIT_ARCH_I386		(EM_386|__AUDIT_ARCH_LE)
#define __AUDIT_ARCH_LE		0x40000000

#ifndef CONFIG_AUDITSYSCALL
# define sysexit_audit		ia32_ret_from_sys_call
# define sysretl_audit		ia32_ret_from_sys_call
#endif

	.section .entry.text, "ax"

#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret32)
	swapgs
	sysretl
ENDPROC(native_usergs_sysret32)
#endif

/*
 * 32-bit SYSENTER instruction entry.
 *
 * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
 * IF and VM in rflags are cleared (IOW: interrupts are off).
 * SYSENTER does not save anything on the stack,
 * and does not save old rip (!!!) and rflags.
 *
 * Arguments:
 * eax  system call number
 * ebx  arg1
 * ecx  arg2
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * ebp  user stack
 * 0(%ebp) arg6
 *
 * This is purely a fast path. For anything complicated we use the int 0x80
 * path below. We set up a complete hardware stack frame to share code
 * with the int 0x80 path.
 */
ENTRY(entry_SYSENTER_compat)
	/*
	 * Interrupts are off on entry.
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	SWAPGS_UNSAFE_STACK
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
	ENABLE_INTERRUPTS(CLBR_NONE)

	/* Zero-extending 32-bit regs, do not remove */
	movl	%ebp, %ebp
	movl	%eax, %eax

	movl	ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d

	/* Construct struct pt_regs on stack */
	pushq	$__USER32_DS		/* pt_regs->ss */
	pushq	%rbp			/* pt_regs->sp */
	pushfq				/* pt_regs->flags */
	pushq	$__USER32_CS		/* pt_regs->cs */
	pushq	%r10			/* pt_regs->ip = thread_info->sysenter_return */
	pushq	%rax			/* pt_regs->orig_ax */
	pushq	%rdi			/* pt_regs->di */
	pushq	%rsi			/* pt_regs->si */
	pushq	%rdx			/* pt_regs->dx */
	pushq	%rcx			/* pt_regs->cx */
	pushq	$-ENOSYS		/* pt_regs->ax */
	cld
	sub	$(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */

	/*
	 * no need to do an access_ok check here because rbp has been
	 * 32-bit zero extended
	 */
	ASM_STAC
1:	movl	(%rbp), %ebp		/* ebp = arg6, read from the user stack */
	_ASM_EXTABLE(1b, ia32_badarg)
	ASM_CLAC

	/*
	 * Sysenter doesn't filter flags, so we need to clear NT
	 * ourselves.  To save a few cycles, we can check whether
	 * NT was set instead of doing an unconditional popfq.
	 */
	testl	$X86_EFLAGS_NT, EFLAGS(%rsp)
	jnz	sysenter_fix_flags
sysenter_flags_fixed:

	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	sysenter_tracesys

sysenter_do_call:
	/* 32-bit syscall -> 64-bit C ABI argument conversion */
	movl	%edi, %r8d		/* arg5 */
	movl	%ebp, %r9d		/* arg6 */
	xchg	%ecx, %esi		/* rsi:arg2, rcx:arg4 */
	movl	%ebx, %edi		/* arg1 */
	movl	%edx, %edx		/* arg3 (zero extension) */
sysenter_dispatch:
	/*
	 * An out-of-range syscall number skips the call; pt_regs->ax then
	 * keeps the -ENOSYS stored while building the frame.
	 */
	cmpq	$(IA32_NR_syscalls-1), %rax
	ja	1f
	call	*ia32_sys_call_table(, %rax, 8)
	movq	%rax, RAX(%rsp)
1:
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	sysexit_audit
sysexit_from_sys_call:
	/*
	 * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an
	 * NMI between STI and SYSEXIT has poorly specified behavior,
	 * and an NMI followed by an IRQ with usergs is fatal.  So
	 * we just pretend we're using SYSEXIT but we really use
	 * SYSRETL instead.
	 *
	 * This code path is still called 'sysexit' because it pairs
	 * with 'sysenter' and it uses the SYSENTER calling convention.
	 */
	andl	$~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
	movl	RIP(%rsp), %ecx		/* User %eip */
	RESTORE_RSI_RDI
	xorl	%edx, %edx		/* Do not leak kernel information */
	xorq	%r8, %r8
	xorq	%r9, %r9
	xorq	%r10, %r10
	movl	EFLAGS(%rsp), %r11d	/* User eflags */
	TRACE_IRQS_ON

	/*
	 * SYSRETL works even on Intel CPUs.  Use it in preference to SYSEXIT,
	 * since it avoids a dicey window with interrupts enabled.
	 */
	movl	RSP(%rsp), %esp

	/*
	 * USERGS_SYSRET32 does:
	 *  gsbase = user's gs base
	 *  eip = ecx
	 *  rflags = r11
	 *  cs = __USER32_CS
	 *  ss = __USER_DS
	 *
	 * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does:
	 *
	 *  pop %ebp
	 *  pop %edx
	 *  pop %ecx
	 *
	 * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to
	 * avoid info leaks.  ECX holds VDSO32_SYSENTER_RETURN's address
	 * (already known to user code), R11 holds the user's own eflags,
	 * and R12-R15 are callee-saved and therefore don't contain any
	 * interesting kernel data.
	 */
	USERGS_SYSRET32

#ifdef CONFIG_AUDITSYSCALL
	.macro auditsys_entry_common
	/*
	 * At this point, registers hold syscall args in the 32-bit syscall ABI:
	 * EAX is syscall number, the 6 args are in EBX,ECX,EDX,ESI,EDI,EBP.
	 *
	 * We want to pass them to __audit_syscall_entry(), which is a 64-bit
	 * C function with 5 parameters, so shuffle them to match what
	 * the function expects: RDI,RSI,RDX,RCX,R8.
	 */
	movl	%esi, %r8d		/* arg5 (R8 ) <= 4th syscall arg (ESI) */
	xchg	%ecx, %edx		/* arg4 (RCX) <= 3rd syscall arg (EDX) */
					/* arg3 (RDX) <= 2nd syscall arg (ECX) */
	movl	%ebx, %esi		/* arg2 (RSI) <= 1st syscall arg (EBX) */
	movl	%eax, %edi		/* arg1 (RDI) <= syscall number  (EAX) */
	call	__audit_syscall_entry

	/*
	 * We are going to jump back to the syscall dispatch code.
	 * Prepare syscall args as required by the 64-bit C ABI.
	 * Registers clobbered by __audit_syscall_entry() are
	 * loaded from pt_regs on stack:
	 */
	movl	ORIG_RAX(%rsp), %eax	/* syscall number */
	movl	%ebx, %edi		/* arg1 */
	movl	RCX(%rsp), %esi		/* arg2 */
	movl	RDX(%rsp), %edx		/* arg3 */
	movl	RSI(%rsp), %ecx		/* arg4 */
	movl	RDI(%rsp), %r8d		/* arg5 */
	.endm

	/*
	 * auditsys_exit: report the syscall result to __audit_syscall_exit()
	 * and then either resume the fast exit path at \exit or, if other
	 * work flags are pending, continue at int_with_check.
	 */
	.macro auditsys_exit exit
	testl	$(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	ia32_ret_from_sys_call
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	movl	%eax, %esi		/* second arg, syscall return value */
	cmpl	$-MAX_ERRNO, %eax	/* is it an error ? */
	jbe	1f
	movslq	%eax, %rsi		/* if error sign extend to 64 bits */
	/* movslq leaves the flags from the cmpl above intact for setbe */
1:	setbe	%al			/* 1 if not an error, 0 if error */
	movzbl	%al, %edi		/* zero-extend that into %edi */
	call	__audit_syscall_exit
	movq	RAX(%rsp), %rax		/* reload syscall return value */
	movl	$(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	testl	%edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jz	\exit
	xorl	%eax, %eax		/* Do not leak kernel information */
	movq	%rax, R11(%rsp)
	movq	%rax, R10(%rsp)
	movq	%rax, R9(%rsp)
	movq	%rax, R8(%rsp)
	jmp	int_with_check
	.endm

sysenter_auditsys:
	auditsys_entry_common
	movl	%ebp, %r9d		/* reload 6th syscall arg */
	jmp	sysenter_dispatch

sysexit_audit:
	auditsys_exit sysexit_from_sys_call
#endif

sysenter_fix_flags:
	pushq	$(X86_EFLAGS_IF|X86_EFLAGS_FIXED)
	popfq
	jmp	sysenter_flags_fixed

sysenter_tracesys:
#ifdef CONFIG_AUDITSYSCALL
	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jz	sysenter_auditsys
#endif
	SAVE_EXTRA_REGS
	xorl	%eax, %eax		/* Do not leak kernel information */
	movq	%rax, R11(%rsp)
	movq	%rax, R10(%rsp)
	movq	%rax, R9(%rsp)
	movq	%rax, R8(%rsp)
	movq	%rsp, %rdi		/* &pt_regs -> arg1 */
	call	syscall_trace_enter

	/* Reload arg registers from stack. (see ia32_tracesys) */
	movl	RCX(%rsp), %ecx
	movl	RDX(%rsp), %edx
	movl	RSI(%rsp), %esi
	movl	RDI(%rsp), %edi
	movl	%eax, %eax		/* zero extension */

	RESTORE_EXTRA_REGS
	jmp	sysenter_do_call
ENDPROC(entry_SYSENTER_compat)

/*
 * 32-bit SYSCALL instruction entry.
 *
 * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed). SYSCALL does not save anything on the stack
 * and does not change rsp.
 *
 * Note: rflags saving+masking-with-MSR happens only in Long mode
 * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it).
 * Don't get confused: rflags saving+masking depends on Long Mode Active bit
 * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
 * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
 *
 * Arguments:
 * eax  system call number
 * ecx  return address
 * ebx  arg1
 * ebp  arg2	(note: not saved in the stack frame, should not be touched)
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * esp  user stack
 * 0(%esp) arg6
 *
 * This is purely a fast path. For anything complicated we use the int 0x80
 * path below. We set up a complete hardware stack frame to share code
 * with the int 0x80 path.
 */
ENTRY(entry_SYSCALL_compat)
	/*
	 * Interrupts are off on entry.
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	SWAPGS_UNSAFE_STACK
	movl	%esp, %r8d
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
	ENABLE_INTERRUPTS(CLBR_NONE)

	/* Zero-extending 32-bit regs, do not remove */
	movl	%eax, %eax

	/* Construct struct pt_regs on stack */
	pushq	$__USER32_DS		/* pt_regs->ss */
	pushq	%r8			/* pt_regs->sp */
	pushq	%r11			/* pt_regs->flags */
	pushq	$__USER32_CS		/* pt_regs->cs */
	pushq	%rcx			/* pt_regs->ip */
	pushq	%rax			/* pt_regs->orig_ax */
	pushq	%rdi			/* pt_regs->di */
	pushq	%rsi			/* pt_regs->si */
	pushq	%rdx			/* pt_regs->dx */
	pushq	%rbp			/* pt_regs->cx */
	movl	%ebp, %ecx
	pushq	$-ENOSYS		/* pt_regs->ax */
	sub	$(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */

	/*
	 * No need to do an access_ok check here because r8 has been
	 * 32-bit zero extended:
	 */
	ASM_STAC
1:	movl	(%r8), %r9d		/* r9 = arg6, read from the user stack */
	_ASM_EXTABLE(1b, ia32_badarg)
	ASM_CLAC
	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	cstar_tracesys

cstar_do_call:
	/* 32-bit syscall -> 64-bit C ABI argument conversion */
	movl	%edi, %r8d		/* arg5 */
	/* r9 already loaded */		/* arg6 */
	xchg	%ecx, %esi		/* rsi:arg2, rcx:arg4 */
	movl	%ebx, %edi		/* arg1 */
	movl	%edx, %edx		/* arg3 (zero extension) */

cstar_dispatch:
	/*
	 * An out-of-range syscall number skips the call; pt_regs->ax then
	 * keeps the -ENOSYS stored while building the frame.
	 */
	cmpq	$(IA32_NR_syscalls-1), %rax
	ja	1f

	call	*ia32_sys_call_table(, %rax, 8)
	movq	%rax, RAX(%rsp)
1:
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	sysretl_audit

sysretl_from_sys_call:
	andl	$~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
	RESTORE_RSI_RDI_RDX
	movl	RIP(%rsp), %ecx
	movl	EFLAGS(%rsp), %r11d
	xorq	%r10, %r10
	xorq	%r9, %r9
	xorq	%r8, %r8
	TRACE_IRQS_ON
	movl	RSP(%rsp), %esp
	/*
	 * 64-bit->32-bit SYSRET restores eip from ecx,
	 * eflags from r11 (but RF and VM bits are forced to 0),
	 * cs and ss are loaded from MSRs.
	 * (Note: 32-bit->32-bit SYSRET is different: since r11
	 * does not exist, it merely sets eflags.IF=1).
	 *
	 * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
	 * descriptor is not reinitialized.  This means that we must
	 * avoid SYSRET with SS == NULL, which could happen if we schedule,
	 * exit the kernel, and re-enter using an interrupt vector.  (All
	 * interrupt entries on x86_64 set SS to NULL.)  We prevent that
	 * from happening by reloading SS in __switch_to.
	 */
	USERGS_SYSRET32

#ifdef CONFIG_AUDITSYSCALL
cstar_auditsys:
	movl	%r9d, R9(%rsp)		/* register to be clobbered by call */
	auditsys_entry_common
	movl	R9(%rsp), %r9d		/* reload 6th syscall arg */
	jmp	cstar_dispatch

sysretl_audit:
	auditsys_exit sysretl_from_sys_call
#endif

cstar_tracesys:
#ifdef CONFIG_AUDITSYSCALL
	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jz	cstar_auditsys
#endif
	xchgl	%r9d, %ebp
	SAVE_EXTRA_REGS
	xorl	%eax, %eax		/* Do not leak kernel information */
	movq	%rax, R11(%rsp)
	movq	%rax, R10(%rsp)
	movq	%r9, R9(%rsp)
	movq	%rax, R8(%rsp)
	movq	%rsp, %rdi		/* &pt_regs -> arg1 */
	call	syscall_trace_enter
	movl	R9(%rsp), %r9d

	/* Reload arg registers from stack. (see ia32_tracesys) */
	movl	RCX(%rsp), %ecx
	movl	RDX(%rsp), %edx
	movl	RSI(%rsp), %esi
	movl	RDI(%rsp), %edi
	movl	%eax, %eax		/* zero extension */

	RESTORE_EXTRA_REGS
	xchgl	%ebp, %r9d
	jmp	cstar_do_call
END(entry_SYSCALL_compat)

/*
 * Fixup target for a faulting arg6 load from the user stack
 * (see the _ASM_EXTABLE entries above): fail the syscall with -EFAULT.
 */
ia32_badarg:
	ASM_CLAC
	movq	$-EFAULT, RAX(%rsp)
ia32_ret_from_sys_call:
	xorl	%eax, %eax		/* Do not leak kernel information */
	movq	%rax, R11(%rsp)
	movq	%rax, R10(%rsp)
	movq	%rax, R9(%rsp)
	movq	%rax, R8(%rsp)
	jmp	int_ret_from_sys_call

/*
 * Emulated IA32 system calls via int 0x80.
 *
 * Arguments:
 * eax  system call number
 * ebx  arg1
 * ecx  arg2
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * ebp  arg6	(note: not saved in the stack frame, should not be touched)
 *
 * Notes:
 * Uses the same stack frame as the x86-64 version.
 * All registers except eax must be saved (but ptrace may violate that).
 * Arguments are zero extended. For system calls that want sign extension and
 * take long arguments a wrapper is needed. Most calls can just be called
 * directly.
 * Assumes it is only called from user space and entered with interrupts off.
 */

ENTRY(entry_INT80_compat)
	/*
	 * Interrupts are off on entry.
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	SWAPGS
	ENABLE_INTERRUPTS(CLBR_NONE)

	/* Zero-extending 32-bit regs, do not remove */
	movl	%eax, %eax

	/* Construct struct pt_regs on stack (iret frame is already on stack) */
	pushq	%rax			/* pt_regs->orig_ax */
	pushq	%rdi			/* pt_regs->di */
	pushq	%rsi			/* pt_regs->si */
	pushq	%rdx			/* pt_regs->dx */
	pushq	%rcx			/* pt_regs->cx */
	pushq	$-ENOSYS		/* pt_regs->ax */
	pushq	$0			/* pt_regs->r8 */
	pushq	$0			/* pt_regs->r9 */
	pushq	$0			/* pt_regs->r10 */
	pushq	$0			/* pt_regs->r11 */
	cld
	sub	$(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */

	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	ia32_tracesys

ia32_do_call:
	/* 32-bit syscall -> 64-bit C ABI argument conversion */
	movl	%edi, %r8d		/* arg5 */
	movl	%ebp, %r9d		/* arg6 */
	xchg	%ecx, %esi		/* rsi:arg2, rcx:arg4 */
	movl	%ebx, %edi		/* arg1 */
	movl	%edx, %edx		/* arg3 (zero extension) */
	/*
	 * An out-of-range syscall number skips the call; pt_regs->ax then
	 * keeps the -ENOSYS stored while building the frame.
	 */
	cmpq	$(IA32_NR_syscalls-1), %rax
	ja	1f

	call	*ia32_sys_call_table(, %rax, 8)
	movq	%rax, RAX(%rsp)
1:
	jmp	int_ret_from_sys_call

ia32_tracesys:
	SAVE_EXTRA_REGS
	movq	%rsp, %rdi		/* &pt_regs -> arg1 */
	call	syscall_trace_enter
	/*
	 * Reload arg registers from stack in case ptrace changed them.
	 * Don't reload %eax because syscall_trace_enter() returned
	 * the %rax value we should see.  But do truncate it to 32 bits.
	 * If it's -1 to make us punt the syscall, then (u32)-1 is still
	 * an appropriately invalid value.
	 */
	movl	RCX(%rsp), %ecx
	movl	RDX(%rsp), %edx
	movl	RSI(%rsp), %esi
	movl	RDI(%rsp), %edi
	movl	%eax, %eax		/* zero extension */
	RESTORE_EXTRA_REGS
	jmp	ia32_do_call
END(entry_INT80_compat)

	/*
	 * PTREGSCALL label, func: emit a global stub \label that loads the
	 * address of \func into %rax and continues in ia32_ptregs_common.
	 */
	.macro PTREGSCALL label, func
	ALIGN
GLOBAL(\label)
	leaq	\func(%rip), %rax
	jmp	ia32_ptregs_common
	.endm

	PTREGSCALL stub32_rt_sigreturn,	sys32_rt_sigreturn
	PTREGSCALL stub32_sigreturn,	sys32_sigreturn
	PTREGSCALL stub32_fork,		sys_fork
	PTREGSCALL stub32_vfork,	sys_vfork

	ALIGN
GLOBAL(stub32_clone)
	leaq	sys_clone(%rip), %rax
	/*
	 * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr).
	 * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val).
	 *
	 * The native 64-bit kernel's sys_clone() implements the latter,
	 * so we need to swap arguments here before calling it:
	 */
	xchg	%r8, %rcx
	jmp	ia32_ptregs_common

	ALIGN
/*
 * Common tail of the stubs above: %rax holds the address of the C handler.
 * NOTE(review): the offset 8 passed to SAVE/RESTORE_EXTRA_REGS presumably
 * skips the return address pushed by the call that reached the stub --
 * confirm against calling.h.
 */
ia32_ptregs_common:
	SAVE_EXTRA_REGS 8
	call	*%rax
	RESTORE_EXTRA_REGS 8
	ret
END(ia32_ptregs_common)