/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/asm_linkage.h>
#include <sys/asm_misc.h>
#include <sys/regset.h>
#include <sys/privregs.h>
#include <sys/psw.h>
#include <sys/machbrand.h>

#if defined(__lint)

#include <sys/types.h>
#include <sys/thread.h>
#include <sys/systm.h>

#else	/* __lint */

#include <sys/segments.h>
#include <sys/pcb.h>
#include <sys/trap.h>
#include <sys/ftrace.h>
#include <sys/traptrace.h>
#include <sys/clock.h>
#include <sys/model.h>
#include <sys/panic.h>

#if defined(__xpv)
#include <sys/hypervisor.h>
#endif

#include "assym.h"

#endif	/* __lint */

/*
 * We implement five flavours of system call entry points
 *
 * -	syscall/sysretq		(amd64 generic)
 * -	syscall/sysretl		(i386 plus SYSC bit)
 * -	sysenter/sysexit	(i386 plus SEP bit)
 * -	int/iret		(i386 generic)
 * -	lcall/iret		(i386 generic)
 *
 * The current libc included in Solaris uses int/iret as the base unoptimized
 * kernel entry method.  Older libc implementations and legacy binaries may use
 * the lcall call gate, so it must continue to be supported.
 *
 * System calls that use an lcall call gate are processed in trap() via a
 * segment-not-present trap, i.e. lcalls are extremely slow(!).
 *
 * The basic pattern used in the 32-bit SYSC handler at this point in time is
 * to have the bare minimum of assembler, and get to the C handlers as
 * quickly as possible.
 *
 * The 64-bit handler is much closer to the sparcv9 handler; that's
 * because of passing arguments in registers.  The 32-bit world still
 * passes arguments on the stack -- that makes that handler substantially
 * more complex.
 *
 * The two handlers share a few code fragments which are broken
 * out into preprocessor macros below.
 *
 * XX64	come back and speed all this up later.  The 32-bit stuff looks
 * especially easy to speed up the argument copying part ..
 *
 *
 * Notes about segment register usage (c.f. the 32-bit kernel)
 *
 * In the 32-bit kernel, segment registers are dutifully saved and
 * restored on all mode transitions because the kernel uses them directly.
 * When the processor is running in 64-bit mode, segment registers are
 * largely ignored.
 *
 * %cs and %ss
 *	controlled by the hardware mechanisms that make mode transitions
 *
 * The remaining segment registers have to either be pointing at a valid
 * descriptor, i.e. with the 'present' bit set, or they can be NULL
 * descriptors
 *
 * %ds and %es
 *	always ignored
 *
 * %fs and %gs
 *	fsbase and gsbase are used to control the place they really point at.
 *	The kernel only depends on %gs, and controls its own gsbase via swapgs
 *
 * Note that loading segment registers is still costly because the GDT
 * lookup still happens (this is because the hardware can't know that we're
 * not setting up these segment registers for a 32-bit program).  Thus we
 * avoid doing this in the syscall path, and defer them to lwp context switch
 * handlers, so the register values remain virtualized to the lwp.
 */
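
/*
 * For comparison with the 64-bit wrapper shown further below, a 32-bit
 * libc stub using the int/iret flavour looks roughly like this (an
 * illustrative sketch only -- the exact instruction sequence is libc's
 * business, not ours):
 *
 *	fn(<args>)
 *	{
 *		movl	$CODE, %eax	-- syscall number
 *		int	$T_SYSCALLINT	-- args are already on the user stack
 *		<error processing>
 *	}
 */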

#if defined(SYSCALLTRACE)
#define	ORL_SYSCALLTRACE(r32)		\
	orl	syscalltrace(%rip), r32
#else
#define	ORL_SYSCALLTRACE(r32)
#endif

/*
 * In the 32-bit kernel, we do absolutely nothing before getting into the
 * brand callback checks.  In 64-bit land, we do swapgs and then come here.
 * We assume that the %rsp- and %r15-stashing fields in the CPU structure
 * are still unused.
 *
 * When the callback is invoked, we will be on the user's %gs and
 * the stack will look like this:
 *
 * stack:  --------------------------------------
 *         | callback pointer                    |
 *    |    | user stack pointer                  |
 *    |    | lwp brand data                      |
 *    |    | proc brand data                     |
 *    v    | userland return address             |
 *         | callback wrapper return addr        |
 *         --------------------------------------
 *
 */
#define	BRAND_CALLBACK(callback_id)					    \
	movq	%rsp, %gs:CPU_RTMP_RSP	/* save the stack pointer	*/ ;\
	movq	%r15, %gs:CPU_RTMP_R15	/* save %r15			*/ ;\
	movq	%gs:CPU_THREAD, %r15	/* load the thread pointer	*/ ;\
	movq	T_STACK(%r15), %rsp	/* switch to the kernel stack	*/ ;\
	subq	$16, %rsp		/* save space for two pointers	*/ ;\
	pushq	%r14			/* save %r14			*/ ;\
	movq	%gs:CPU_RTMP_RSP, %r14					   ;\
	movq	%r14, 8(%rsp)		/* stash the user stack pointer	*/ ;\
	popq	%r14			/* restore %r14			*/ ;\
	movq	T_LWP(%r15), %r15	/* load the lwp pointer		*/ ;\
	pushq	LWP_BRAND(%r15)		/* push the lwp's brand data	*/ ;\
	movq	LWP_PROCP(%r15), %r15	/* load the proc pointer	*/ ;\
	pushq	P_BRAND_DATA(%r15)	/* push the proc's brand data	*/ ;\
	movq	P_BRAND(%r15), %r15	/* load the brand pointer	*/ ;\
	movq	B_MACHOPS(%r15), %r15	/* load the machops pointer	*/ ;\
	movq	_CONST(_MUL(callback_id, CPTRSIZE))(%r15), %r15		   ;\
	cmpq	$0, %r15						   ;\
	je	1f							   ;\
	movq	%r15, 24(%rsp)		/* save the callback pointer	*/ ;\
	movq	%gs:CPU_RTMP_RSP, %r15	/* grab the user stack pointer	*/ ;\
	pushq	(%r15)			/* push the return address	*/ ;\
	movq	%gs:CPU_RTMP_R15, %r15	/* restore %r15			*/ ;\
	SWAPGS				/* user gsbase			*/ ;\
	call	*32(%rsp)		/* call callback		*/ ;\
	SWAPGS				/* kernel gsbase		*/ ;\
1:	movq	%gs:CPU_RTMP_R15, %r15	/* restore %r15			*/ ;\
	movq	%gs:CPU_RTMP_RSP, %rsp	/* restore the stack pointer	*/
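
/*
 * Note how the two offsets above fit together: the callback pointer is
 * stashed at 24(%rsp) before the userland return address is pushed, so by
 * the time of the indirect call it is found at 32(%rsp), which matches the
 * picture above.
 */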

#define	MSTATE_TRANSITION(from, to)		\
	movl	$from, %edi;			\
	movl	$to, %esi;			\
	call	syscall_mstate

/*
 * Check to see if a simple (direct) return is possible i.e.
 *
 *	if (t->t_post_sys_ast | syscalltrace |
 *	    lwp->lwp_pcb.pcb_rupdate == 1)
 *		do full version	;
 *
 * Preconditions:
 * -	t is curthread
 * Postconditions:
 * -	condition code NE is set if post-sys is too complex
 * -	rtmp is zeroed if it isn't (we rely on this!)
 * -	ltmp is smashed
 */
#define	CHECK_POSTSYS_NE(t, ltmp, rtmp)		\
	movq	T_LWP(t), ltmp;			\
	movzbl	PCB_RUPDATE(ltmp), rtmp;	\
	ORL_SYSCALLTRACE(rtmp);			\
	orl	T_POST_SYS_AST(t), rtmp;	\
	cmpl	$0, rtmp

/*
 * Fix up the lwp, thread, and eflags for a successful return
 *
 * Preconditions:
 * -	zwreg contains zero
 */
#define	SIMPLE_SYSCALL_POSTSYS(t, lwp, zwreg)	\
	movb	$LWP_USER, LWP_STATE(lwp);	\
	movw	zwreg, T_SYSNUM(t);		\
	andb	$_CONST(0xffff - PS_C), REGOFF_RFL(%rsp)

/*
 * ASSERT(lwptoregs(lwp) == rp);
 *
 * This may seem obvious, but very odd things happen if this
 * assertion is false
 *
 * Preconditions:
 *	(%rsp is ready for normal call sequence)
 * Postconditions (if assertion is true):
 *	%r11 is smashed
 *
 * ASSERT(rp->r_cs == descnum)
 *
 * The code selector is written into the regs structure when the
 * lwp stack is created.  We use this ASSERT to validate that
 * the regs structure really matches how we came in.
 *
 * Preconditions:
 *	(%rsp is ready for normal call sequence)
 * Postconditions (if assertion is true):
 *	-none-
 *
 * ASSERT(lwp->lwp_pcb.pcb_rupdate == 0);
 *
 * If this is false, it means that we returned to userland without
 * updating the segment registers as we were supposed to.
 *
 * Note that we must ensure no interrupts or other traps intervene
 * between entering privileged mode and performing the assertion,
 * otherwise we may perform a context switch on the thread, which
 * will end up setting pcb_rupdate to 1 again.
 */
#if defined(DEBUG)

#if !defined(__lint)

__lwptoregs_msg:
	.string	"%M%:%d lwptoregs(%p) [%p] != rp [%p]"

__codesel_msg:
	.string	"%M%:%d rp->r_cs [%ld] != %ld"

__no_rupdate_msg:
	.string	"%M%:%d lwp %p, pcb_rupdate != 0"

#endif	/* !__lint */

#define	ASSERT_LWPTOREGS(lwp, rp)		\
	movq	LWP_REGS(lwp), %r11;		\
	cmpq	rp, %r11;			\
	je	7f;				\
	leaq	__lwptoregs_msg(%rip), %rdi;	\
	movl	$__LINE__, %esi;		\
	movq	lwp, %rdx;			\
	movq	%r11, %rcx;			\
	movq	rp, %r8;			\
	xorl	%eax, %eax;			\
	call	panic;				\
7:

#define	ASSERT_NO_RUPDATE_PENDING(lwp)		\
	testb	$0x1, PCB_RUPDATE(lwp);		\
	je	8f;				\
	movq	lwp, %rdx;			\
	leaq	__no_rupdate_msg(%rip), %rdi;	\
	movl	$__LINE__, %esi;		\
	xorl	%eax, %eax;			\
	call	panic;				\
8:

#else
#define	ASSERT_LWPTOREGS(lwp, rp)
#define	ASSERT_NO_RUPDATE_PENDING(lwp)
#endif

/*
 * Do the traptrace thing and restore any registers we used
 * in situ.  Assumes that %rsp is pointing at the base of
 * the struct regs, obviously ..
 */
#ifdef TRAPTRACE
#define	SYSCALL_TRAPTRACE(ttype)				\
	TRACE_PTR(%rdi, %rbx, %ebx, %rcx, ttype);		\
	TRACE_REGS(%rdi, %rsp, %rbx, %rcx);			\
	TRACE_STAMP(%rdi);	/* rdtsc clobbers %eax, %edx */	\
	movq	REGOFF_RAX(%rsp), %rax;				\
	movq	REGOFF_RBX(%rsp), %rbx;				\
	movq	REGOFF_RCX(%rsp), %rcx;				\
	movq	REGOFF_RDX(%rsp), %rdx;				\
	movl	%eax, TTR_SYSNUM(%rdi);				\
	movq	REGOFF_RDI(%rsp), %rdi

#define	SYSCALL_TRAPTRACE32(ttype)				\
	SYSCALL_TRAPTRACE(ttype);				\
	/* paranoia: clean the top 32-bits of the registers */	\
	orl	%eax, %eax;					\
	orl	%ebx, %ebx;					\
	orl	%ecx, %ecx;					\
	orl	%edx, %edx;					\
	orl	%edi, %edi
#else	/* TRAPTRACE */
#define	SYSCALL_TRAPTRACE(ttype)
#define	SYSCALL_TRAPTRACE32(ttype)
#endif	/* TRAPTRACE */

/*
 * The 64-bit libc syscall wrapper does this:
 *
 * fn(<args>)
 * {
 *	movq	%rcx, %r10	-- because syscall smashes %rcx
 *	movl	$CODE, %eax
 *	syscall
 *	<error processing>
 * }
 *
 * Thus when we come into the kernel:
 *
 *	%rdi, %rsi, %rdx, %r10, %r8, %r9 contain first six args
 *	%rax is the syscall number
 *	%r12-%r15 contain caller state
 *
 * The syscall instruction arranges that:
 *
 *	%rcx contains the return %rip
 *	%r11d contains bottom 32-bits of %rflags
 *	%rflags is masked (as determined by the SFMASK msr)
 *	%cs is set to UCS_SEL (as determined by the STAR msr)
 *	%ss is set to UDS_SEL (as determined by the STAR msr)
 *	%rip is set to sys_syscall (as determined by the LSTAR msr)
 *
 * Or in other words, we have no registers available at all.
 * Only swapgs can save us!
 */

#if defined(__lint)

/*ARGSUSED*/
void
sys_syscall()
{}

void
_allsyscalls()
{}

size_t _allsyscalls_size;

#else	/* __lint */

	ENTRY_NP2(brand_sys_syscall,_allsyscalls)
	SWAPGS				/* kernel gsbase */
	XPV_TRAP_POP
	BRAND_CALLBACK(BRAND_CB_SYSCALL)
	SWAPGS				/* user gsbase */

#if defined(__xpv)
	/*
	 * Note that swapgs is handled for us by the hypervisor.  Here
	 * it is empty.
	 */
	jmp	nopop_sys_syscall
#endif

	ALTENTRY(sys_syscall)
	SWAPGS				/* kernel gsbase */
#if defined(__xpv)
	/*
	 * Even though we got here by a syscall instruction from user land
	 * the hypervisor constructs our stack the same way as is done
	 * for interrupt gates.  The only exception is that it pushes kernel
	 * cs and ss instead of user cs and ss for some reason.  This is all
	 * different from running native on the metal.
	 *
	 * Stack on entry:
	 *	(0x0)rsp	rcx	(user rip)
	 *	(0x8)rsp	r11	(user rflags)
	 *	(0x10)rsp	user rip
	 *	(0x18)rsp	kernel cs
	 *	(0x20)rsp	user rflags
	 *	(0x28)rsp	user rsp
	 *	(0x30)rsp	kernel ss
	 */

	XPV_TRAP_POP
nopop_sys_syscall:
	ASSERT_UPCALL_MASK_IS_SET

	movq	%r15, %gs:CPU_RTMP_R15
	movq	0x18(%rsp), %r15		/* save user stack */
	movq	%r15, %gs:CPU_RTMP_RSP
#else
	movq	%r15, %gs:CPU_RTMP_R15
	movq	%rsp, %gs:CPU_RTMP_RSP
#endif	/* __xpv */

	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp

	movl	$UCS_SEL, REGOFF_CS(%rsp)
	movq	%rcx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
	movl	$UDS_SEL, REGOFF_SS(%rsp)

	movl	%eax, %eax			/* wrapper: sysc# -> %eax */
	movq	%rdi, REGOFF_RDI(%rsp)
	movq	%rsi, REGOFF_RSI(%rsp)
	movq	%rdx, REGOFF_RDX(%rsp)
	movq	%r10, REGOFF_RCX(%rsp)		/* wrapper: %rcx -> %r10 */
	movq	%r10, %rcx			/* arg[3] for direct calls */

	movq	%r8, REGOFF_R8(%rsp)
	movq	%r9, REGOFF_R9(%rsp)
	movq	%rax, REGOFF_RAX(%rsp)
	movq	%rbx, REGOFF_RBX(%rsp)

	movq	%rbp, REGOFF_RBP(%rsp)
	movq	%r10, REGOFF_R10(%rsp)
	movq	%gs:CPU_RTMP_RSP, %r11
	movq	%r11, REGOFF_RSP(%rsp)
	movq	%r12, REGOFF_R12(%rsp)

	movq	%r13, REGOFF_R13(%rsp)
	movq	%r14, REGOFF_R14(%rsp)
	movq	%gs:CPU_RTMP_R15, %r10
	movq	%r10, REGOFF_R15(%rsp)
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * and capturing them involves two serializing instructions,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * Machine state saved in the regs structure on the stack
	 * First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE($TT_SYSC64)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)
	ENABLE_INTR_FLAGS

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	movb	$LWP_SYS, LWP_STATE(%r14)
	incq	LWP_RU_SYSC(%r14)
	movb	$NORMALRETURN, LWP_EOSYS(%r14)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	movw	%ax, T_SYSNUM(%r15)
	movzbl	T_PRE_SYS(%r15), %ebx
	ORL_SYSCALLTRACE(%ebx)
	testl	%ebx, %ebx
	jne	_syscall_pre

_syscall_invoke:
	movq	REGOFF_RDI(%rbp), %rdi
	movq	REGOFF_RSI(%rbp), %rsi
	movq	REGOFF_RDX(%rbp), %rdx
	movq	REGOFF_RCX(%rbp), %rcx
	movq	REGOFF_R8(%rbp), %r8
	movq	REGOFF_R9(%rbp), %r9

	cmpl	$NSYSCALL, %eax
	jae	_syscall_ill
	shll	$SYSENT_SIZE_SHIFT, %eax
	leaq	sysent(%rax), %rbx

	call	*SY_CALLC(%rbx)

	movq	%rax, %r12
	movq	%rdx, %r13
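
	/*
	 * %r12 and %r13 now hold the handler's raw %rax and %rdx.  In C
	 * terms, the SE_32RVAL2 splitting that follows is roughly (an
	 * illustrative sketch only):
	 *
	 *	if (callp->sy_flags & SE_32RVAL2) {
	 *		rval2 = (uint32_t)(rax >> 32);	-- returned in %edx
	 *		rval1 = (uint32_t)rax;		-- returned in %eax
	 *	}
	 */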

	/*
	 * If the handler returns two ints, then we need to split the
	 * 64-bit return value into two 32-bit values.
	 */
	testw	$SE_32RVAL2, SY_FLAGS(%rbx)
	je	5f
	movq	%r12, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%r12d, %r12d	/* lower 32-bits into %eax */
5:
	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	CLI(%r14)
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_syscall_post
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	movq	%r12, REGOFF_RAX(%rsp)
	movq	%r13, REGOFF_RDX(%rsp)

	/*
	 * To get back to userland, we need the return %rip in %rcx and
	 * the return %rfl in %r11d.  The sysretq instruction also arranges
	 * to fix up %cs and %ss; everything else is our responsibility.
	 */
	movq	REGOFF_RDI(%rsp), %rdi
	movq	REGOFF_RSI(%rsp), %rsi
	movq	REGOFF_RDX(%rsp), %rdx
	/* %rcx used to restore %rip value */

	movq	REGOFF_R8(%rsp), %r8
	movq	REGOFF_R9(%rsp), %r9
	movq	REGOFF_RAX(%rsp), %rax
	movq	REGOFF_RBX(%rsp), %rbx

	movq	REGOFF_RBP(%rsp), %rbp
	movq	REGOFF_R10(%rsp), %r10
	/* %r11 used to restore %rfl value */
	movq	REGOFF_R12(%rsp), %r12

	movq	REGOFF_R13(%rsp), %r13
	movq	REGOFF_R14(%rsp), %r14
	movq	REGOFF_R15(%rsp), %r15

	movq	REGOFF_RIP(%rsp), %rcx
	movl	REGOFF_RFL(%rsp), %r11d

#if defined(__xpv)
	addq	$REGOFF_RIP, %rsp
#else
	movq	REGOFF_RSP(%rsp), %rsp
#endif

	/*
	 * There can be no instructions between the ALTENTRY below and
	 * SYSRET or we could end up breaking brand support.  See label usage
	 * in sn1_brand_syscall_callback for an example.
	 */
	ASSERT_UPCALL_MASK_IS_SET
	SWAPGS				/* user gsbase */
	ALTENTRY(nopop_sys_syscall_sysretq)
	SYSRETQ
	/*NOTREACHED*/
	SET_SIZE(nopop_sys_syscall_sysretq)

_syscall_pre:
	call	pre_syscall
	movl	%eax, %r12d
	testl	%eax, %eax
	jne	_syscall_post_call
	/*
	 * Didn't abort, so reload the syscall args and invoke the handler.
	 */
	movzwl	T_SYSNUM(%r15), %eax
	jmp	_syscall_invoke

_syscall_ill:
	call	nosys
	movq	%rax, %r12
	movq	%rdx, %r13
	jmp	_syscall_post_call

_syscall_post:
	STI
	/*
	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
	 * so that we can account for the extra work it takes us to finish.
	 */
	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
_syscall_post_call:
	movq	%r12, %rdi
	movq	%r13, %rsi
	call	post_syscall
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
	jmp	_sys_rtt
	SET_SIZE(sys_syscall)
	SET_SIZE(brand_sys_syscall)

#endif	/* __lint */

#if defined(__lint)

/*ARGSUSED*/
void
sys_syscall32()
{}

#else	/* __lint */

	ENTRY_NP(brand_sys_syscall32)
	SWAPGS				/* kernel gsbase */
	XPV_TRAP_POP
	BRAND_CALLBACK(BRAND_CB_SYSCALL32)
	SWAPGS				/* user gsbase */

#if defined(__xpv)
	jmp	nopop_sys_syscall32
#endif

	ALTENTRY(sys_syscall32)
	SWAPGS				/* kernel gsbase */

#if defined(__xpv)
	XPV_TRAP_POP
nopop_sys_syscall32:
#endif

	movl	%esp, %r10d
	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp
	movl	%eax, %eax

	movl	$U32CS_SEL, REGOFF_CS(%rsp)
	movl	%ecx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
	movq	%r10, REGOFF_RSP(%rsp)
	movl	$UDS_SEL, REGOFF_SS(%rsp)

_syscall32_save:
	movl	%edi, REGOFF_RDI(%rsp)
	movl	%esi, REGOFF_RSI(%rsp)
	movl	%ebp, REGOFF_RBP(%rsp)
	movl	%ebx, REGOFF_RBX(%rsp)
	movl	%edx, REGOFF_RDX(%rsp)
	movl	%ecx, REGOFF_RCX(%rsp)
	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * Application state saved in the regs structure on the stack
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE32($TT_SYSC)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	/*
	 * Make some space for MAXSYSARGS (currently 8) 32-bit args placed
	 * into 64-bit (long) arg slots, maintaining 16 byte alignment.  Or
	 * more succinctly:
	 *
	 *	SA(MAXSYSARGS * sizeof (long)) == 64
	 */
#define	SYS_DROP	64	/* drop for args */
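	/*
	 * (That is, 8 args times 8 bytes each is 64 bytes, which is
	 * already a multiple of the 16 byte stack alignment, so SA()
	 * leaves it unchanged.)
	 */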
	subq	$SYS_DROP, %rsp
	movb	$LWP_SYS, LWP_STATE(%r14)
	movq	%r15, %rdi
	movq	%rsp, %rsi
	call	syscall_entry

	/*
	 * Fetch the arguments copied onto the kernel stack and put
	 * them in the right registers to invoke a C-style syscall handler.
	 * %rax contains the handler address.
	 *
	 * Ideas for making all this go faster of course include simply
	 * forcibly fetching 6 arguments from the user stack under lofault
	 * protection, reverting to copyin_args only when watchpoints
	 * are in effect.
	 *
	 * (If we do this, make sure that exec and libthread leave
	 * enough space at the top of the stack to ensure that we'll
	 * never do a fetch from an invalid page.)
	 *
	 * Lots of ideas here, but they won't really help with bringup B-)
	 * Correctness can't wait, performance can wait a little longer ..
	 */

	movq	%rax, %rbx
	movl	0(%rsp), %edi
	movl	8(%rsp), %esi
	movl	0x10(%rsp), %edx
	movl	0x18(%rsp), %ecx
	movl	0x20(%rsp), %r8d
	movl	0x28(%rsp), %r9d

	call	*SY_CALLC(%rbx)

	movq	%rbp, %rsp	/* pop the args */

	/*
	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
	 * On the 32-bit kernel, they always return that value in %eax:%edx
	 * as required by the 32-bit ABI.
	 *
	 * Simulate the same behaviour by unconditionally splitting the
	 * return value in the same way.
	 */
	movq	%rax, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%eax, %r12d	/* lower 32-bits into %eax */

	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	CLI(%r14)
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_full_syscall_postsys32
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	/*
	 * To get back to userland, we need to put the return %rip in %rcx and
	 * the return %rfl in %r11d.  The sysret instruction also arranges
	 * to fix up %cs and %ss; everything else is our responsibility.
	 */

	movl	%r12d, %eax			/* %eax: rval1 */
	movl	REGOFF_RBX(%rsp), %ebx
	/* %ecx used for return pointer */
	movl	%r13d, %edx			/* %edx: rval2 */
	movl	REGOFF_RBP(%rsp), %ebp
	movl	REGOFF_RSI(%rsp), %esi
	movl	REGOFF_RDI(%rsp), %edi

	movl	REGOFF_RFL(%rsp), %r11d		/* %r11 -> eflags */
	movl	REGOFF_RIP(%rsp), %ecx		/* %ecx -> %eip */
	movl	REGOFF_RSP(%rsp), %esp

	ASSERT_UPCALL_MASK_IS_SET
	SWAPGS				/* user gsbase */
	ALTENTRY(nopop_sys_syscall32_sysretl)
	SYSRETL
	SET_SIZE(nopop_sys_syscall32_sysretl)
	/*NOTREACHED*/

_full_syscall_postsys32:
	STI
	/*
	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
	 * so that we can account for the extra work it takes us to finish.
	 */
	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movq	%r15, %rdi
	movq	%r12, %rsi		/* rval1 - %eax */
	movq	%r13, %rdx		/* rval2 - %edx */
	call	syscall_exit
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
	jmp	_sys_rtt
	SET_SIZE(sys_syscall32)
	SET_SIZE(brand_sys_syscall32)

#endif	/* __lint */

/*
 * System call handler via the sysenter instruction
 * Used only for 32-bit system calls on the 64-bit kernel.
 *
 * The caller in userland has arranged that:
 *
 * -	%eax contains the syscall number
 * -	%ecx contains the user %esp
 * -	%edx contains the return %eip
 * -	the user stack contains the args to the syscall
 *
 * Hardware and (privileged) initialization code have arranged that by
 * the time the sysenter instruction completes:
 *
 * -	%rip is pointing to sys_sysenter (below).
 * -	%cs and %ss are set to kernel text and stack (data) selectors.
 * -	%rsp is pointing at the lwp's stack
 * -	interrupts have been disabled.
 *
 * Note that we are unable to return both "rvals" to userland with
 * this call, as %edx is used by the sysexit instruction.
 *
 * One final complication in this routine is its interaction with
 * single-stepping in a debugger.  For most of the system call mechanisms,
 * the CPU automatically clears the single-step flag before we enter the
 * kernel.  The sysenter mechanism does not clear the flag, so a user
 * single-stepping through a libc routine may suddenly find him/herself
 * single-stepping through the kernel.  To detect this, kmdb compares the
 * trap %pc to the [brand_]sys_sysenter addresses on each single-step trap.
 * If it finds that we have single-stepped to a sysenter entry point, it
 * explicitly clears the flag and executes the sys_sysenter routine.
 *
 * One final complication in this final complication is the fact that we
 * have two different entry points for sysenter: brand_sys_sysenter and
 * sys_sysenter.  If we enter at brand_sys_sysenter and start single-stepping
 * through the kernel with kmdb, we will eventually hit the instruction at
 * sys_sysenter.  kmdb cannot distinguish between that valid single-step
 * and the undesirable one mentioned above.  To avoid this situation, we
 * simply add a jump over the instruction at sys_sysenter to make it
 * impossible to single-step to it.
 */
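
/*
 * For symmetry with the wrapper sketches above, a sysenter-based libc stub
 * would look roughly like this (an illustrative sketch only; the precise
 * sequence, in particular how the return %eip is produced, is libc's
 * business):
 *
 *	fn(<args>)
 *	{
 *		movl	$CODE, %eax	-- syscall number
 *		movl	%esp, %ecx	-- user %esp, args already on the stack
 *		movl	$1f, %edx	-- return %eip
 *		sysenter
 *	1:
 *		<error processing>
 *	}
 */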
#if defined(__lint)

void
sys_sysenter()
{}

#else	/* __lint */

	ENTRY_NP(brand_sys_sysenter)
	SWAPGS				/* kernel gsbase */
	ALTENTRY(_brand_sys_sysenter_post_swapgs)
	BRAND_CALLBACK(BRAND_CB_SYSENTER)
	/*
	 * Jump over sys_sysenter to allow single-stepping as described
	 * above.
	 */
	jmp	_sys_sysenter_post_swapgs

	ALTENTRY(sys_sysenter)
	SWAPGS				/* kernel gsbase */

	ALTENTRY(_sys_sysenter_post_swapgs)
	movq	%gs:CPU_THREAD, %r15

	movl	$U32CS_SEL, REGOFF_CS(%rsp)
	movl	%ecx, REGOFF_RSP(%rsp)		/* wrapper: %esp -> %ecx */
	movl	%edx, REGOFF_RIP(%rsp)		/* wrapper: %eip -> %edx */
	pushfq
	popq	%r10
	movl	$UDS_SEL, REGOFF_SS(%rsp)

	/*
	 * Set the interrupt flag before storing the flags to the
	 * flags image on the stack so we can return to user with
	 * interrupts enabled if we return via sys_rtt_syscall32
	 */
	orq	$PS_IE, %r10
	movq	%r10, REGOFF_RFL(%rsp)

	movl	%edi, REGOFF_RDI(%rsp)
	movl	%esi, REGOFF_RSI(%rsp)
	movl	%ebp, REGOFF_RBP(%rsp)
	movl	%ebx, REGOFF_RBX(%rsp)
	movl	%edx, REGOFF_RDX(%rsp)
	movl	%ecx, REGOFF_RCX(%rsp)
	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * Application state saved in the regs structure on the stack
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE($TT_SYSENTER)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	/*
	 * Catch 64-bit process trying to issue sysenter instruction
	 * on Nocona based systems.
	 */
	movq	LWP_PROCP(%r14), %rax
	cmpq	$DATAMODEL_ILP32, P_MODEL(%rax)
	je	7f

	/*
	 * For a non-32-bit process, simulate a #ud, since that's what
	 * native hardware does.  The traptrace entry (above) will
	 * let you know what really happened.
	 */
	movq	$T_ILLINST, REGOFF_TRAPNO(%rsp)
	movq	REGOFF_CS(%rsp), %rdi
	movq	%rdi, REGOFF_ERR(%rsp)
	movq	%rsp, %rdi
	movq	REGOFF_RIP(%rsp), %rsi
	movl	%gs:CPU_ID, %edx
	call	trap
	jmp	_sys_rtt
7:

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate calls) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	/*
	 * Make some space for MAXSYSARGS (currently 8) 32-bit args
	 * placed into 64-bit (long) arg slots, plus one 64-bit
	 * (long) arg count, maintaining 16 byte alignment.
	 */
	subq	$SYS_DROP, %rsp
	movb	$LWP_SYS, LWP_STATE(%r14)
	movq	%r15, %rdi
	movq	%rsp, %rsi
	call	syscall_entry

	/*
	 * Fetch the arguments copied onto the kernel stack and put
	 * them in the right registers to invoke a C-style syscall handler.
	 * %rax contains the handler address.
	 */
	movq	%rax, %rbx
	movl	0(%rsp), %edi
	movl	8(%rsp), %esi
	movl	0x10(%rsp), %edx
	movl	0x18(%rsp), %ecx
	movl	0x20(%rsp), %r8d
	movl	0x28(%rsp), %r9d

	call	*SY_CALLC(%rbx)

	movq	%rbp, %rsp	/* pop the args */

	/*
	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
	 * On the 32-bit kernel, they always return that value in %eax:%edx
	 * as required by the 32-bit ABI.
	 *
	 * Simulate the same behaviour by unconditionally splitting the
	 * return value in the same way.
	 */
	movq	%rax, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%eax, %r12d	/* lower 32-bits into %eax */

	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	cli
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_full_syscall_postsys32
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	/*
	 * To get back to userland, load up the 32-bit registers and
	 * sysexit back where we came from.
	 */

	/*
	 * Interrupts will be turned on by the 'sti' executed just before
	 * sysexit.  The following ensures that restoring the user's rflags
	 * doesn't enable interrupts too soon.
	 */
	andq	$_BITNOT(PS_IE), REGOFF_RFL(%rsp)

	/*
	 * (There's no point in loading up %edx because the sysexit
	 * mechanism smashes it.)
	 */
	movl	%r12d, %eax
	movl	REGOFF_RBX(%rsp), %ebx
	movl	REGOFF_RBP(%rsp), %ebp
	movl	REGOFF_RSI(%rsp), %esi
	movl	REGOFF_RDI(%rsp), %edi

	movl	REGOFF_RIP(%rsp), %edx	/* sysexit: %edx -> %eip */
	pushq	REGOFF_RFL(%rsp)
	popfq
	movl	REGOFF_RSP(%rsp), %ecx	/* sysexit: %ecx -> %esp */
	swapgs
	sti
	sysexit
	SET_SIZE(sys_sysenter)
	SET_SIZE(_sys_sysenter_post_swapgs)
	SET_SIZE(brand_sys_sysenter)

#endif	/* __lint */

#if defined(__lint)
/*
 * System call via an int80.  This entry point is only used by the Linux
 * application environment.  Unlike the other entry points, there is no
 * default action to take if no callback is registered for this process.
 */
void
sys_int80()
{}

#else	/* __lint */

	ENTRY_NP(brand_sys_int80)
	SWAPGS				/* kernel gsbase */
	XPV_TRAP_POP
	BRAND_CALLBACK(BRAND_CB_INT80)
	SWAPGS				/* user gsbase */
#if defined(__xpv)
	jmp	nopop_int80
#endif

	ENTRY_NP(sys_int80)
	/*
	 * We hit an int80, but this process isn't of a brand with an int80
	 * handler.  Bad process!  Make it look as if the INT failed.
	 * Modify %rip to point before the INT, push the expected error
	 * code and fake a GP fault.  Note on 64-bit hypervisor we need
	 * to undo the XPV_TRAP_POP and push rcx and r11 back on the stack
	 * because gptrap will pop them again with its own XPV_TRAP_POP.
	 */
#if defined(__xpv)
	XPV_TRAP_POP
nopop_int80:
#endif
	subq	$2, (%rsp)	/* int insn 2-bytes */
	pushq	$_CONST(_MUL(T_INT80, GATE_DESC_SIZE) + 2)
#if defined(__xpv)
	push	%r11
	push	%rcx
#endif
	jmp	gptrap			/ GP fault
	SET_SIZE(sys_int80)
	SET_SIZE(brand_sys_int80)
#endif	/* __lint */


/*
 * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by
 * the generic i386 libc to do system calls.  We do a small amount of setup
 * before jumping into the existing sys_syscall32 path.
 */
#if defined(__lint)

/*ARGSUSED*/
void
sys_syscall_int()
{}

#else	/* __lint */

	ENTRY_NP(brand_sys_syscall_int)
	SWAPGS				/* kernel gsbase */
	XPV_TRAP_POP
	BRAND_CALLBACK(BRAND_CB_INT91)
	SWAPGS				/* user gsbase */

#if defined(__xpv)
	jmp	nopop_syscall_int
#endif

	ALTENTRY(sys_syscall_int)
	SWAPGS				/* kernel gsbase */

#if defined(__xpv)
	XPV_TRAP_POP
nopop_syscall_int:
#endif

	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp
	movl	%eax, %eax
	/*
	 * Set t_post_sys on this thread to force ourselves out via the slow
	 * path.  It might be possible at some later date to optimize this out
	 * and use a faster return mechanism.
	 */
	movb	$1, T_POST_SYS(%r15)
	CLEAN_CS
	jmp	_syscall32_save
	SET_SIZE(sys_syscall_int)
	SET_SIZE(brand_sys_syscall_int)

#endif	/* __lint */

/*
 * Legacy 32-bit applications and old libc implementations do lcalls;
 * we should never get here because the LDT entry containing the syscall
 * segment descriptor has the "segment present" bit cleared, which means
 * we end up processing those system calls in trap() via a not-present trap.
 *
 * We do it this way because a call gate unhelpfully does -nothing- to the
 * interrupt flag bit, so an interrupt can run us just after the lcall
 * completes, but just before the swapgs takes effect.  Thus the INTR_PUSH and
 * INTR_POP paths would have to be slightly more complex to dance around
 * this problem, and end up depending explicitly on the first
 * instruction of this handler being either swapgs or cli.
 */

#if defined(__lint)

/*ARGSUSED*/
void
sys_lcall32()
{}

#else	/* __lint */

	ENTRY_NP(sys_lcall32)
	SWAPGS				/* kernel gsbase */
	pushq	$0
	pushq	%rbp
	movq	%rsp, %rbp
	leaq	__lcall_panic_str(%rip), %rdi
	xorl	%eax, %eax
	call	panic
	SET_SIZE(sys_lcall32)

__lcall_panic_str:
	.string	"sys_lcall32: shouldn't be here!"

/*
 * Declare a uintptr_t which covers the entire pc range of syscall
 * handlers for the stack walkers that need this.
 */
	.align	CPTRSIZE
	.globl	_allsyscalls_size
	.type	_allsyscalls_size, @object
_allsyscalls_size:
	.NWORD	. - _allsyscalls
	SET_SIZE(_allsyscalls_size)

#endif	/* __lint */

/*
 * These are the thread context handlers for lwps using sysenter/sysexit.
 */

#if defined(__lint)

/*ARGSUSED*/
void
sep_save(void *ksp)
{}

/*ARGSUSED*/
void
sep_restore(void *ksp)
{}

#else	/* __lint */

	/*
	 * setting this value to zero as we switch away causes the
	 * stack-pointer-on-sysenter to be NULL, ensuring that we
	 * don't silently corrupt another (preempted) thread stack
	 * when running an lwp that (somehow) didn't get sep_restore'd
	 */
	ENTRY_NP(sep_save)
	xorl	%edx, %edx
	xorl	%eax, %eax
	movl	$MSR_INTC_SEP_ESP, %ecx
	wrmsr
	ret
	SET_SIZE(sep_save)

	/*
	 * Update the kernel stack pointer as we resume onto this cpu.
	 */
	ENTRY_NP(sep_restore)
	movq	%rdi, %rdx
	shrq	$32, %rdx
	movl	%edi, %eax
	movl	$MSR_INTC_SEP_ESP, %ecx
	wrmsr
	ret
	SET_SIZE(sep_restore)

#endif	/* __lint */