/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/asm_linkage.h>
#include <sys/asm_misc.h>
#include <sys/regset.h>
#include <sys/privregs.h>
#include <sys/psw.h>
#include <sys/machbrand.h>

#if defined(__lint)

#include <sys/types.h>
#include <sys/thread.h>
#include <sys/systm.h>

#else	/* __lint */

#include <sys/segments.h>
#include <sys/pcb.h>
#include <sys/trap.h>
#include <sys/ftrace.h>
#include <sys/traptrace.h>
#include <sys/clock.h>
#include <sys/model.h>
#include <sys/panic.h>

#if defined(__xpv)
#include <sys/hypervisor.h>
#endif

#include "assym.h"

#endif	/* __lint */

/*
 * We implement five flavours of system call entry points
 *
 * -	syscall/sysretq		(amd64 generic)
 * -	syscall/sysretl		(i386 plus SYSC bit)
 * -	sysenter/sysexit	(i386 plus SEP bit)
 * -	int/iret		(i386 generic)
 * -	lcall/iret		(i386 generic)
 *
 * The current libc included in Solaris uses int/iret as the base unoptimized
 * kernel entry method; a sketch of that entry sequence follows this comment.
 * Older libc implementations and legacy binaries may use the lcall call gate,
 * so it must continue to be supported.
 *
 * System calls that use an lcall call gate are processed in trap() via a
 * segment-not-present trap, i.e. lcalls are extremely slow(!).
 *
 * The basic pattern used in the 32-bit SYSC handler at this point in time is
 * to have the bare minimum of assembler, and get to the C handlers as
 * quickly as possible.
 *
 * The 64-bit handler is much closer to the sparcv9 handler; that's
 * because of passing arguments in registers.  The 32-bit world still
 * passes arguments on the stack -- that makes that handler substantially
 * more complex.
 *
 * The two handlers share a few code fragments which are broken
 * out into preprocessor macros below.
 *
 * XX64	come back and speed all this up later.  The 32-bit stuff looks
 * especially easy to speed up the argument copying part ..
 *
 *
 * Notes about segment register usage (c.f. the 32-bit kernel)
 *
 * In the 32-bit kernel, segment registers are dutifully saved and
 * restored on all mode transitions because the kernel uses them directly.
 * When the processor is running in 64-bit mode, segment registers are
 * largely ignored.
 *
 * %cs and %ss
 *	controlled by the hardware mechanisms that make mode transitions
 *
 * The remaining segment registers have to either be pointing at a valid
 * descriptor, i.e. with the 'present' bit set, or they can be NULL
 * descriptors.
 *
 * %ds and %es
 *	always ignored
 *
 * %fs and %gs
 *	fsbase and gsbase are used to control the place they really point at.
 *	The kernel only depends on %gs, and controls its own gsbase via swapgs
 *
 * Note that loading segment registers is still costly because the GDT
 * lookup still happens (this is because the hardware can't know that we're
 * not setting up these segment registers for a 32-bit program).  Thus we
 * avoid doing this in the syscall path, and defer them to lwp context switch
 * handlers, so the register values remain virtualized to the lwp.
 */
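
/*
 * As a rough illustration of the int/iret flavour mentioned above (this is
 * a sketch only, not a copy of the libc wrapper source), a 32-bit caller
 * loads the syscall number and issues the T_SYSCALLINT software interrupt,
 * with the arguments already on its own stack:
 *
 *	fn(<args>)
 *	{
 *		movl	$CODE, %eax
 *		int	$T_SYSCALLINT
 *		<error processing>
 *	}
 *
 * which lands in sys_syscall_int (via the interrupt gate) further below.
 */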

#if defined(SYSCALLTRACE)
#define	ORL_SYSCALLTRACE(r32)		\
	orl	syscalltrace(%rip), r32
#else
#define	ORL_SYSCALLTRACE(r32)
#endif

/*
 * In the 32-bit kernel, we do absolutely nothing before getting into the
 * brand callback checks.  In 64-bit land, we do swapgs and then come here.
 * We assume that the %rsp- and %r15-stashing fields in the CPU structure
 * are still unused.
 *
 * When the callback is invoked, we will be on the user's %gs and
 * the stack will look like this:
 *
 * stack:  --------------------------------------
 *         | callback pointer                   |
 *    |    | user stack pointer                 |
 *    |    | lwp pointer                        |
 *    v    | userland return address            |
 *         | callback wrapper return addr       |
 *         --------------------------------------
 *
 */
#define	BRAND_CALLBACK(callback_id)					    \
	movq	%rsp, %gs:CPU_RTMP_RSP	/* save the stack pointer */	;\
	movq	%r15, %gs:CPU_RTMP_R15	/* save %r15 */			;\
	movq	%gs:CPU_THREAD, %r15	/* load the thread pointer */	;\
	movq	T_STACK(%r15), %rsp	/* switch to the kernel stack */ ;\
	subq	$16, %rsp		/* save space for two pointers */ ;\
	pushq	%r14			/* save %r14 */			;\
	movq	%gs:CPU_RTMP_RSP, %r14					;\
	movq	%r14, 8(%rsp)		/* stash the user stack pointer */ ;\
	popq	%r14			/* restore %r14 */		;\
	movq	T_LWP(%r15), %r15	/* load the lwp pointer */	;\
	pushq	%r15			/* push the lwp pointer */	;\
	movq	LWP_PROCP(%r15), %r15	/* load the proc pointer */	;\
	movq	P_BRAND(%r15), %r15	/* load the brand pointer */	;\
	movq	B_MACHOPS(%r15), %r15	/* load the machops pointer */	;\
	movq	_CONST(_MUL(callback_id, CPTRSIZE))(%r15), %r15		;\
	cmpq	$0, %r15						;\
	je	1f							;\
	movq	%r15, 16(%rsp)		/* save the callback pointer */	;\
	movq	%gs:CPU_RTMP_RSP, %r15	/* grab the user stack pointer */ ;\
	pushq	(%r15)			/* push the return address */	;\
	movq	%gs:CPU_RTMP_R15, %r15	/* restore %r15 */		;\
	SWAPGS				/* user gsbase */		;\
	call	*24(%rsp)		/* call callback */		;\
	SWAPGS				/* kernel gsbase */		;\
1:	movq	%gs:CPU_RTMP_R15, %r15	/* restore %r15 */		;\
	movq	%gs:CPU_RTMP_RSP, %rsp	/* restore the stack pointer */

#define	MSTATE_TRANSITION(from, to)		\
	movl	$from, %edi;			\
	movl	$to, %esi;			\
	call	syscall_mstate

/*
 * Check to see if a simple (direct) return is possible i.e.
 *
 *	if (t->t_post_sys_ast | syscalltrace |
 *	    lwp->lwp_pcb.pcb_rupdate == 1)
 *		do full version	;
 *
 * Preconditions:
 * -	t is curthread
 * Postconditions:
 * -	condition code NE is set if post-sys is too complex
 * -	rtmp is zeroed if it isn't (we rely on this!)
 * -	ltmp is smashed
 */
#define	CHECK_POSTSYS_NE(t, ltmp, rtmp)			\
	movq	T_LWP(t), ltmp;				\
	movzbl	PCB_RUPDATE(ltmp), rtmp;		\
	ORL_SYSCALLTRACE(rtmp);				\
	orl	T_POST_SYS_AST(t), rtmp;		\
	cmpl	$0, rtmp

/*
 * Fix up the lwp, thread, and eflags for a successful return
 *
 * Preconditions:
 * -	zwreg contains zero
 */
#define	SIMPLE_SYSCALL_POSTSYS(t, lwp, zwreg)		\
	movb	$LWP_USER, LWP_STATE(lwp);		\
	movw	zwreg, T_SYSNUM(t);			\
	andb	$_CONST(0xffff - PS_C), REGOFF_RFL(%rsp)

/*
 * ASSERT(lwptoregs(lwp) == rp);
 *
 * This may seem obvious, but very odd things happen if this
 * assertion is false
 *
 * Preconditions:
 *	(%rsp is ready for normal call sequence)
 * Postconditions (if assertion is true):
 *	%r11 is smashed
 *
 * ASSERT(rp->r_cs == descnum)
 *
 * The code selector is written into the regs structure when the
 * lwp stack is created.  We use this ASSERT to validate that
 * the regs structure really matches how we came in.
 *
 * Preconditions:
 *	(%rsp is ready for normal call sequence)
 * Postconditions (if assertion is true):
 *	-none-
 *
 * ASSERT(lwp->lwp_pcb.pcb_rupdate == 0);
 *
 * If this is false, it means that we returned to userland without
 * updating the segment registers as we were supposed to.
 *
 * Note that we must ensure no interrupts or other traps intervene
 * between entering privileged mode and performing the assertion,
 * otherwise we may perform a context switch on the thread, which
 * will end up setting pcb_rupdate to 1 again.
 */
#if defined(DEBUG)

#if !defined(__lint)

__lwptoregs_msg:
	.string	"%M%:%d lwptoregs(%p) [%p] != rp [%p]"

__codesel_msg:
	.string	"%M%:%d rp->r_cs [%ld] != %ld"

__no_rupdate_msg:
	.string	"%M%:%d lwp %p, pcb_rupdate != 0"

#endif	/* !__lint */

#define	ASSERT_LWPTOREGS(lwp, rp)			\
	movq	LWP_REGS(lwp), %r11;			\
	cmpq	rp, %r11;				\
	je	7f;					\
	leaq	__lwptoregs_msg(%rip), %rdi;		\
	movl	$__LINE__, %esi;			\
	movq	lwp, %rdx;				\
	movq	%r11, %rcx;				\
	movq	rp, %r8;				\
	xorl	%eax, %eax;				\
	call	panic;					\
7:

#define	ASSERT_NO_RUPDATE_PENDING(lwp)			\
	testb	$0x1, PCB_RUPDATE(lwp);			\
	je	8f;					\
	movq	lwp, %rdx;				\
	leaq	__no_rupdate_msg(%rip), %rdi;		\
	movl	$__LINE__, %esi;			\
	xorl	%eax, %eax;				\
	call	panic;					\
8:

#else
#define	ASSERT_LWPTOREGS(lwp, rp)
#define	ASSERT_NO_RUPDATE_PENDING(lwp)
#endif

/*
 * Do the traptrace thing and restore any registers we used
 * in situ.  Assumes that %rsp is pointing at the base of
 * the struct regs, obviously ..
 */
#ifdef TRAPTRACE
#define	SYSCALL_TRAPTRACE(ttype)				\
	TRACE_PTR(%rdi, %rbx, %ebx, %rcx, ttype);		\
	TRACE_REGS(%rdi, %rsp, %rbx, %rcx);			\
	TRACE_STAMP(%rdi);	/* rdtsc clobbers %eax, %edx */	\
	movq	REGOFF_RAX(%rsp), %rax;				\
	movq	REGOFF_RBX(%rsp), %rbx;				\
	movq	REGOFF_RCX(%rsp), %rcx;				\
	movq	REGOFF_RDX(%rsp), %rdx;				\
	movl	%eax, TTR_SYSNUM(%rdi);				\
	movq	REGOFF_RDI(%rsp), %rdi

#define	SYSCALL_TRAPTRACE32(ttype)				\
	SYSCALL_TRAPTRACE(ttype);				\
	/* paranoia: clean the top 32-bits of the registers */	\
	orl	%eax, %eax;					\
	orl	%ebx, %ebx;					\
	orl	%ecx, %ecx;					\
	orl	%edx, %edx;					\
	orl	%edi, %edi
#else	/* TRAPTRACE */
#define	SYSCALL_TRAPTRACE(ttype)
#define	SYSCALL_TRAPTRACE32(ttype)
#endif	/* TRAPTRACE */

/*
 * The 64-bit libc syscall wrapper does this:
 *
 *	fn(<args>)
 *	{
 *		movq	%rcx, %r10	-- because syscall smashes %rcx
 *		movl	$CODE, %eax
 *		syscall
 *		<error processing>
 *	}
 *
 * Thus when we come into the kernel:
 *
 *	%rdi, %rsi, %rdx, %r10, %r8, %r9 contain first six args
 *	%rax is the syscall number
 *	%r12-%r15 contain caller state
 *
 * The syscall instruction arranges that:
 *
 *	%rcx contains the return %rip
 *	%r11d contains bottom 32-bits of %rflags
 *	%rflags is masked (as determined by the SFMASK msr)
 *	%cs is set to UCS_SEL (as determined by the STAR msr)
 *	%ss is set to UDS_SEL (as determined by the STAR msr)
 *	%rip is set to sys_syscall (as determined by the LSTAR msr)
 *
 * Or in other words, we have no registers available at all.
 * Only swapgs can save us!
 */

#if defined(__lint)

/*ARGSUSED*/
void
sys_syscall()
{}

void
_allsyscalls()
{}

size_t _allsyscalls_size;

#else	/* __lint */

	ENTRY_NP2(brand_sys_syscall,_allsyscalls)
	SWAPGS				/* kernel gsbase */
	XPV_TRAP_POP
	BRAND_CALLBACK(BRAND_CB_SYSCALL)
	SWAPGS				/* user gsbase */

#if defined(__xpv)
	/*
	 * Note that swapgs is handled for us by the hypervisor.  Here
	 * it is empty.
	 */
	jmp	nopop_sys_syscall
#endif

	ALTENTRY(sys_syscall)
	SWAPGS				/* kernel gsbase */
#if defined(__xpv)
	/*
	 * Even though we got here by a syscall instruction from user land
	 * the hypervisor constructs our stack the same way as is done
	 * for interrupt gates.  The only exception is that it pushes kernel
	 * cs and ss instead of user cs and ss for some reason.  This is all
	 * different from running native on the metal.
	 *
	 * Stack on entry:
	 *	(0x0)rsp	rcx	(user rip)
	 *	(0x8)rsp	r11	(user rflags)
	 *	(0x10)rsp	user rip
	 *	(0x18)rsp	kernel cs
	 *	(0x20)rsp	user rflags
	 *	(0x28)rsp	user rsp
	 *	(0x30)rsp	kernel ss
	 */

	XPV_TRAP_POP
nopop_sys_syscall:
	ASSERT_UPCALL_MASK_IS_SET

	movq	%r15, %gs:CPU_RTMP_R15
	movq	0x18(%rsp), %r15		/* save user stack */
	movq	%r15, %gs:CPU_RTMP_RSP
#else
	movq	%r15, %gs:CPU_RTMP_R15
	movq	%rsp, %gs:CPU_RTMP_RSP
#endif	/* __xpv */

	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp

	movl	$UCS_SEL, REGOFF_CS(%rsp)
	movq	%rcx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
	movl	$UDS_SEL, REGOFF_SS(%rsp)

	movl	%eax, %eax			/* wrapper: sysc# -> %eax */
	movq	%rdi, REGOFF_RDI(%rsp)
	movq	%rsi, REGOFF_RSI(%rsp)
	movq	%rdx, REGOFF_RDX(%rsp)
	movq	%r10, REGOFF_RCX(%rsp)		/* wrapper: %rcx -> %r10 */
	movq	%r10, %rcx			/* arg[3] for direct calls */

	movq	%r8, REGOFF_R8(%rsp)
	movq	%r9, REGOFF_R9(%rsp)
	movq	%rax, REGOFF_RAX(%rsp)
	movq	%rbx, REGOFF_RBX(%rsp)

	movq	%rbp, REGOFF_RBP(%rsp)
	movq	%r10, REGOFF_R10(%rsp)
	movq	%gs:CPU_RTMP_RSP, %r11
	movq	%r11, REGOFF_RSP(%rsp)
	movq	%r12, REGOFF_R12(%rsp)

	movq	%r13, REGOFF_R13(%rsp)
	movq	%r14, REGOFF_R14(%rsp)
	movq	%gs:CPU_RTMP_R15, %r10
	movq	%r10, REGOFF_R15(%rsp)
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * and capturing them involves two serializing instructions,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * Machine state saved in the regs structure on the stack
	 * First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE($TT_SYSC64)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)
	ENABLE_INTR_FLAGS

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	movb	$LWP_SYS, LWP_STATE(%r14)
	incq	LWP_RU_SYSC(%r14)
	movb	$NORMALRETURN, LWP_EOSYS(%r14)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	movw	%ax, T_SYSNUM(%r15)
	movzbl	T_PRE_SYS(%r15), %ebx
	ORL_SYSCALLTRACE(%ebx)
	testl	%ebx, %ebx
	jne	_syscall_pre

_syscall_invoke:
	movq	REGOFF_RDI(%rbp), %rdi
	movq	REGOFF_RSI(%rbp), %rsi
	movq	REGOFF_RDX(%rbp), %rdx
	movq	REGOFF_RCX(%rbp), %rcx
	movq	REGOFF_R8(%rbp), %r8
	movq	REGOFF_R9(%rbp), %r9

	cmpl	$NSYSCALL, %eax
	jae	_syscall_ill
	shll	$SYSENT_SIZE_SHIFT, %eax
	leaq	sysent(%rax), %rbx

	call	*SY_CALLC(%rbx)

	movq	%rax, %r12
	movq	%rdx, %r13

	/*
	 * If the handler returns two ints, then we need to split the
	 * 64-bit return value into two 32-bit values.
	 */
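	/*
	 * To make the two-int case concrete, here is an illustrative,
	 * hedged example (not taken from any real trace): a two-value
	 * syscall such as pipe(2) -- assuming its sysent entry carries
	 * SE_32RVAL2 -- whose handler returns descriptors 4 and 5 hands
	 * back 0x0000000500000004 in %rax.  The split below then leaves
	 * rval1 == 4 in %r12 (destined for %eax) and rval2 == 5 in %r13
	 * (destined for %edx).
	 */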
	testw	$SE_32RVAL2, SY_FLAGS(%rbx)
	je	5f
	movq	%r12, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%r12d, %r12d	/* lower 32-bits into %eax */
5:
	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	CLI(%r14)
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_syscall_post
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	movq	%r12, REGOFF_RAX(%rsp)
	movq	%r13, REGOFF_RDX(%rsp)

	/*
	 * To get back to userland, we need the return %rip in %rcx and
	 * the return %rfl in %r11d.  The sysretq instruction also arranges
	 * to fix up %cs and %ss; everything else is our responsibility.
	 */
	movq	REGOFF_RDI(%rsp), %rdi
	movq	REGOFF_RSI(%rsp), %rsi
	movq	REGOFF_RDX(%rsp), %rdx
	/* %rcx used to restore %rip value */

	movq	REGOFF_R8(%rsp), %r8
	movq	REGOFF_R9(%rsp), %r9
	movq	REGOFF_RAX(%rsp), %rax
	movq	REGOFF_RBX(%rsp), %rbx

	movq	REGOFF_RBP(%rsp), %rbp
	movq	REGOFF_R10(%rsp), %r10
	/* %r11 used to restore %rfl value */
	movq	REGOFF_R12(%rsp), %r12

	movq	REGOFF_R13(%rsp), %r13
	movq	REGOFF_R14(%rsp), %r14
	movq	REGOFF_R15(%rsp), %r15

	movq	REGOFF_RIP(%rsp), %rcx
	movl	REGOFF_RFL(%rsp), %r11d

#if defined(__xpv)
	addq	$REGOFF_RIP, %rsp
#else
	movq	REGOFF_RSP(%rsp), %rsp
#endif

	/*
	 * There can be no instructions between the ALTENTRY below and
	 * SYSRET or we could end up breaking brand support.  See label usage
	 * in sn1_brand_syscall_callback for an example.
	 */
	ASSERT_UPCALL_MASK_IS_SET
	SWAPGS				/* user gsbase */
	ALTENTRY(nopop_sys_syscall_sysretq)
	SYSRETQ
	/*NOTREACHED*/
	SET_SIZE(nopop_sys_syscall_sysretq)

_syscall_pre:
	call	pre_syscall
	movl	%eax, %r12d
	testl	%eax, %eax
	jne	_syscall_post_call
	/*
	 * Didn't abort, so reload the syscall args and invoke the handler.
	 */
	movzwl	T_SYSNUM(%r15), %eax
	jmp	_syscall_invoke

_syscall_ill:
	call	nosys
	movq	%rax, %r12
	movq	%rdx, %r13
	jmp	_syscall_post_call

_syscall_post:
	STI
	/*
	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
	 * so that we can account for the extra work it takes us to finish.
	 */
	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
_syscall_post_call:
	movq	%r12, %rdi
	movq	%r13, %rsi
	call	post_syscall
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
	jmp	_sys_rtt
	SET_SIZE(sys_syscall)
	SET_SIZE(brand_sys_syscall)

#endif	/* __lint */

#if defined(__lint)

/*ARGSUSED*/
void
sys_syscall32()
{}

#else	/* __lint */

	ENTRY_NP(brand_sys_syscall32)
	SWAPGS				/* kernel gsbase */
	XPV_TRAP_POP
	BRAND_CALLBACK(BRAND_CB_SYSCALL32)
	SWAPGS				/* user gsbase */

#if defined(__xpv)
	jmp	nopop_sys_syscall32
#endif

	ALTENTRY(sys_syscall32)
	SWAPGS				/* kernel gsbase */

#if defined(__xpv)
	XPV_TRAP_POP
nopop_sys_syscall32:
#endif

	movl	%esp, %r10d
	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp
	movl	%eax, %eax

	movl	$U32CS_SEL, REGOFF_CS(%rsp)
	movl	%ecx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
	movq	%r10, REGOFF_RSP(%rsp)
	movl	$UDS_SEL, REGOFF_SS(%rsp)

_syscall32_save:
	movl	%edi, REGOFF_RDI(%rsp)
	movl	%esi, REGOFF_RSI(%rsp)
	movl	%ebp, REGOFF_RBP(%rsp)
	movl	%ebx, REGOFF_RBX(%rsp)
	movl	%edx, REGOFF_RDX(%rsp)
	movl	%ecx, REGOFF_RCX(%rsp)
	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * Application state saved in the regs structure on the stack
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE32($TT_SYSC)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	/*
	 * Make some space for MAXSYSARGS (currently 8) 32-bit args placed
	 * into 64-bit (long) arg slots, maintaining 16 byte alignment.  Or
	 * more succinctly:
	 *
	 *	SA(MAXSYSARGS * sizeof (long)) == 64
	 */
#define	SYS_DROP	64		/* drop for args */
	subq	$SYS_DROP, %rsp
	movb	$LWP_SYS, LWP_STATE(%r14)
	movq	%r15, %rdi
	movq	%rsp, %rsi
	call	syscall_entry

	/*
	 * Fetch the arguments copied onto the kernel stack and put
	 * them in the right registers to invoke a C-style syscall handler.
	 * %rax contains the handler address.
	 *
	 * Ideas for making all this go faster of course include simply
	 * forcibly fetching 6 arguments from the user stack under lofault
	 * protection, reverting to copyin_args only when watchpoints
	 * are in effect.
	 *
	 * (If we do this, make sure that exec and libthread leave
	 * enough space at the top of the stack to ensure that we'll
	 * never do a fetch from an invalid page.)
	 *
	 * Lots of ideas here, but they won't really help with bringup B-)
	 * Correctness can't wait, performance can wait a little longer ..
	 */

	movq	%rax, %rbx
	movl	0(%rsp), %edi
	movl	8(%rsp), %esi
	movl	0x10(%rsp), %edx
	movl	0x18(%rsp), %ecx
	movl	0x20(%rsp), %r8d
	movl	0x28(%rsp), %r9d

	call	*SY_CALLC(%rbx)

	movq	%rbp, %rsp	/* pop the args */

	/*
	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
	 * On the 32-bit kernel, they always return that value in %eax:%edx
	 * as required by the 32-bit ABI.
	 *
	 * Simulate the same behaviour by unconditionally splitting the
	 * return value in the same way.
	 */
	movq	%rax, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%eax, %r12d	/* lower 32-bits into %eax */

	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	CLI(%r14)
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_full_syscall_postsys32
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	/*
	 * To get back to userland, we need to put the return %rip in %rcx and
	 * the return %rfl in %r11d.  The sysret instruction also arranges
	 * to fix up %cs and %ss; everything else is our responsibility.
	 */

	movl	%r12d, %eax			/* %eax: rval1 */
	movl	REGOFF_RBX(%rsp), %ebx
	/* %ecx used for return pointer */
	movl	%r13d, %edx			/* %edx: rval2 */
	movl	REGOFF_RBP(%rsp), %ebp
	movl	REGOFF_RSI(%rsp), %esi
	movl	REGOFF_RDI(%rsp), %edi

	movl	REGOFF_RFL(%rsp), %r11d		/* %r11 -> eflags */
	movl	REGOFF_RIP(%rsp), %ecx		/* %ecx -> %eip */
	movl	REGOFF_RSP(%rsp), %esp

	ASSERT_UPCALL_MASK_IS_SET
	SWAPGS				/* user gsbase */
	ALTENTRY(nopop_sys_syscall32_sysretl)
	SYSRETL
	SET_SIZE(nopop_sys_syscall32_sysretl)
	/*NOTREACHED*/

_full_syscall_postsys32:
	STI
	/*
	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
	 * so that we can account for the extra work it takes us to finish.
	 */
	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movq	%r15, %rdi
	movq	%r12, %rsi			/* rval1 - %eax */
	movq	%r13, %rdx			/* rval2 - %edx */
	call	syscall_exit
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
	jmp	_sys_rtt
	SET_SIZE(sys_syscall32)
	SET_SIZE(brand_sys_syscall32)

#endif	/* __lint */

/*
 * System call handler via the sysenter instruction
 * Used only for 32-bit system calls on the 64-bit kernel.
 *
 * The caller in userland has arranged that:
 *
 * -	%eax contains the syscall number
 * -	%ecx contains the user %esp
 * -	%edx contains the return %eip
 * -	the user stack contains the args to the syscall
 *
 * (An illustrative sketch of a caller satisfying this contract follows
 * this comment.)
 *
 * Hardware and (privileged) initialization code have arranged that by
 * the time the sysenter instruction completes:
 *
 * -	%rip is pointing to sys_sysenter (below).
 * -	%cs and %ss are set to kernel text and stack (data) selectors.
 * -	%rsp is pointing at the lwp's stack
 * -	interrupts have been disabled.
 *
 * Note that we are unable to return both "rvals" to userland with
 * this call, as %edx is used by the sysexit instruction.
 *
 * One final complication in this routine is its interaction with
 * single-stepping in a debugger.  For most of the system call mechanisms,
 * the CPU automatically clears the single-step flag before we enter the
 * kernel.  The sysenter mechanism does not clear the flag, so a user
 * single-stepping through a libc routine may suddenly find him/herself
 * single-stepping through the kernel.  To detect this, kmdb compares the
 * trap %pc to the [brand_]sys_sysenter addresses on each single-step trap.
 * If it finds that we have single-stepped to a sysenter entry point, it
 * explicitly clears the flag and executes the sys_sysenter routine.
 *
 * One final complication in this final complication is the fact that we
 * have two different entry points for sysenter: brand_sys_sysenter and
 * sys_sysenter.  If we enter at brand_sys_sysenter and start single-stepping
 * through the kernel with kmdb, we will eventually hit the instruction at
 * sys_sysenter.  kmdb cannot distinguish between that valid single-step
 * and the undesirable one mentioned above.  To avoid this situation, we
 * simply add a jump over the instruction at sys_sysenter to make it
 * impossible to single-step to it.
 */
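
/*
 * For illustration only -- this is a sketch of what a sysenter-based
 * wrapper satisfying the contract above might look like, not a copy of
 * the actual libc source (the label and instruction ordering here are
 * assumptions):
 *
 *	fn(<args>)
 *	{
 *		movl	$CODE, %eax	-- syscall number
 *		movl	$1f, %edx	-- return %eip
 *		movl	%esp, %ecx	-- user %esp; args already on stack
 *		sysenter
 *	1:
 *		<error processing>
 *	}
 */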
#if defined(__lint)

void
sys_sysenter()
{}

#else	/* __lint */

	ENTRY_NP(brand_sys_sysenter)
	SWAPGS				/* kernel gsbase */
	ALTENTRY(_brand_sys_sysenter_post_swapgs)
	BRAND_CALLBACK(BRAND_CB_SYSENTER)
	/*
	 * Jump over sys_sysenter to allow single-stepping as described
	 * above.
	 */
	jmp	_sys_sysenter_post_swapgs

	ALTENTRY(sys_sysenter)
	SWAPGS				/* kernel gsbase */

	ALTENTRY(_sys_sysenter_post_swapgs)
	movq	%gs:CPU_THREAD, %r15

	movl	$U32CS_SEL, REGOFF_CS(%rsp)
	movl	%ecx, REGOFF_RSP(%rsp)		/* wrapper: %esp -> %ecx */
	movl	%edx, REGOFF_RIP(%rsp)		/* wrapper: %eip -> %edx */
	pushfq
	popq	%r10
	movl	$UDS_SEL, REGOFF_SS(%rsp)

	/*
	 * Set the interrupt flag before storing the flags to the
	 * flags image on the stack so we can return to user with
	 * interrupts enabled if we return via sys_rtt_syscall32
	 */
	orq	$PS_IE, %r10
	movq	%r10, REGOFF_RFL(%rsp)

	movl	%edi, REGOFF_RDI(%rsp)
	movl	%esi, REGOFF_RSI(%rsp)
	movl	%ebp, REGOFF_RBP(%rsp)
	movl	%ebx, REGOFF_RBX(%rsp)
	movl	%edx, REGOFF_RDX(%rsp)
	movl	%ecx, REGOFF_RCX(%rsp)
	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * Application state saved in the regs structure on the stack
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE($TT_SYSENTER)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	/*
	 * Catch 64-bit process trying to issue sysenter instruction
	 * on Nocona based systems.
	 */
	movq	LWP_PROCP(%r14), %rax
	cmpq	$DATAMODEL_ILP32, P_MODEL(%rax)
	je	7f

	/*
	 * For a non-32-bit process, simulate a #ud, since that's what
	 * native hardware does.  The traptrace entry (above) will
	 * let you know what really happened.
	 */
	movq	$T_ILLINST, REGOFF_TRAPNO(%rsp)
	movq	REGOFF_CS(%rsp), %rdi
	movq	%rdi, REGOFF_ERR(%rsp)
	movq	%rsp, %rdi
	movq	REGOFF_RIP(%rsp), %rsi
	movl	%gs:CPU_ID, %edx
	call	trap
	jmp	_sys_rtt
7:

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate calls) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	/*
	 * Make some space for MAXSYSARGS (currently 8) 32-bit args
	 * placed into 64-bit (long) arg slots, plus one 64-bit
	 * (long) arg count, maintaining 16 byte alignment.
	 */
	subq	$SYS_DROP, %rsp
	movb	$LWP_SYS, LWP_STATE(%r14)
	movq	%r15, %rdi
	movq	%rsp, %rsi
	call	syscall_entry

	/*
	 * Fetch the arguments copied onto the kernel stack and put
	 * them in the right registers to invoke a C-style syscall handler.
	 * %rax contains the handler address.
	 */
	movq	%rax, %rbx
	movl	0(%rsp), %edi
	movl	8(%rsp), %esi
	movl	0x10(%rsp), %edx
	movl	0x18(%rsp), %ecx
	movl	0x20(%rsp), %r8d
	movl	0x28(%rsp), %r9d

	call	*SY_CALLC(%rbx)

	movq	%rbp, %rsp	/* pop the args */

	/*
	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
	 * On the 32-bit kernel, they always return that value in %eax:%edx
	 * as required by the 32-bit ABI.
	 *
	 * Simulate the same behaviour by unconditionally splitting the
	 * return value in the same way.
	 */
	movq	%rax, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%eax, %r12d	/* lower 32-bits into %eax */

	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	cli
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_full_syscall_postsys32
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	/*
	 * To get back to userland, load up the 32-bit registers and
	 * sysexit back where we came from.
	 */

	/*
	 * Interrupts will be turned on by the 'sti' executed just before
	 * sysexit.  The following ensures that restoring the user's rflags
	 * doesn't enable interrupts too soon.
	 */
	andq	$_BITNOT(PS_IE), REGOFF_RFL(%rsp)

	/*
	 * (There's no point in loading up %edx because the sysexit
	 * mechanism smashes it.)
	 */
	movl	%r12d, %eax
	movl	REGOFF_RBX(%rsp), %ebx
	movl	REGOFF_RBP(%rsp), %ebp
	movl	REGOFF_RSI(%rsp), %esi
	movl	REGOFF_RDI(%rsp), %edi

	movl	REGOFF_RIP(%rsp), %edx	/* sysexit: %edx -> %eip */
	pushq	REGOFF_RFL(%rsp)
	popfq
	movl	REGOFF_RSP(%rsp), %ecx	/* sysexit: %ecx -> %esp */
	swapgs
	sti
	sysexit
	SET_SIZE(sys_sysenter)
	SET_SIZE(_sys_sysenter_post_swapgs)
	SET_SIZE(brand_sys_sysenter)

#endif	/* __lint */

#if defined(__lint)
/*
 * System call via an int80.  This entry point is only used by the Linux
 * application environment.  Unlike the other entry points, there is no
 * default action to take if no callback is registered for this process.
 */
void
sys_int80()
{}

#else	/* __lint */

	ENTRY_NP(brand_sys_int80)
	SWAPGS				/* kernel gsbase */
	XPV_TRAP_POP
	BRAND_CALLBACK(BRAND_CB_INT80)
	SWAPGS				/* user gsbase */
#if defined(__xpv)
	jmp	nopop_int80
#endif

	ENTRY_NP(sys_int80)
	/*
	 * We hit an int80, but this process isn't of a brand with an int80
	 * handler.  Bad process!  Make it look as if the INT failed.
	 * Modify %rip to point before the INT, push the expected error
	 * code and fake a GP fault.  Note on 64-bit hypervisor we need
	 * to undo the XPV_TRAP_POP and push rcx and r11 back on the stack
	 * because gptrap will pop them again with its own XPV_TRAP_POP.
	 */
#if defined(__xpv)
	XPV_TRAP_POP
nopop_int80:
#endif
	subq	$2, (%rsp)	/* int insn 2-bytes */
	pushq	$_CONST(_MUL(T_INT80, GATE_DESC_SIZE) + 2)
#if defined(__xpv)
	push	%r11
	push	%rcx
#endif
	jmp	gptrap			/* GP fault */
	SET_SIZE(sys_int80)
	SET_SIZE(brand_sys_int80)
#endif	/* __lint */


/*
 * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by
 * the generic i386 libc to do system calls.  We do a small amount of setup
 * before jumping into the existing sys_syscall32 path.
 */
#if defined(__lint)

/*ARGSUSED*/
void
sys_syscall_int()
{}

#else	/* __lint */

	ENTRY_NP(brand_sys_syscall_int)
	SWAPGS				/* kernel gsbase */
	XPV_TRAP_POP
	BRAND_CALLBACK(BRAND_CB_INT91)
	SWAPGS				/* user gsbase */

#if defined(__xpv)
	jmp	nopop_syscall_int
#endif

	ALTENTRY(sys_syscall_int)
	SWAPGS				/* kernel gsbase */

#if defined(__xpv)
	XPV_TRAP_POP
nopop_syscall_int:
#endif

	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp
	movl	%eax, %eax
	/*
	 * Set t_post_sys on this thread to force ourselves out via the slow
	 * path.  It might be possible at some later date to optimize this out
	 * and use a faster return mechanism.
	 */
	movb	$1, T_POST_SYS(%r15)
	CLEAN_CS
	jmp	_syscall32_save
	SET_SIZE(sys_syscall_int)
	SET_SIZE(brand_sys_syscall_int)

#endif	/* __lint */

/*
 * Legacy 32-bit applications and old libc implementations do lcalls;
 * we should never get here because the LDT entry containing the syscall
 * segment descriptor has the "segment present" bit cleared, which means
 * we end up processing those system calls in trap() via a not-present trap.
 *
 * We do it this way because a call gate unhelpfully does -nothing- to the
 * interrupt flag bit, so an interrupt can run us just after the lcall
 * completes, but just before the swapgs takes effect.  Thus the INTR_PUSH and
 * INTR_POP paths would have to be slightly more complex to dance around
 * this problem, and end up depending explicitly on the first
 * instruction of this handler being either swapgs or cli.
 */

#if defined(__lint)

/*ARGSUSED*/
void
sys_lcall32()
{}

#else	/* __lint */

	ENTRY_NP(sys_lcall32)
	SWAPGS				/* kernel gsbase */
	pushq	$0
	pushq	%rbp
	movq	%rsp, %rbp
	leaq	__lcall_panic_str(%rip), %rdi
	xorl	%eax, %eax
	call	panic
	SET_SIZE(sys_lcall32)

__lcall_panic_str:
	.string	"sys_lcall32: shouldn't be here!"

/*
 * Declare a uintptr_t which covers the entire pc range of syscall
 * handlers for the stack walkers that need this.
 */
	.align	CPTRSIZE
	.globl	_allsyscalls_size
	.type	_allsyscalls_size, @object
_allsyscalls_size:
	.NWORD	. - _allsyscalls
	SET_SIZE(_allsyscalls_size)

#endif	/* __lint */

/*
 * These are the thread context handlers for lwps using sysenter/sysexit.
 */

#if defined(__lint)

/*ARGSUSED*/
void
sep_save(void *ksp)
{}

/*ARGSUSED*/
void
sep_restore(void *ksp)
{}

#else	/* __lint */

	/*
	 * setting this value to zero as we switch away causes the
	 * stack-pointer-on-sysenter to be NULL, ensuring that we
	 * don't silently corrupt another (preempted) thread stack
	 * when running an lwp that (somehow) didn't get sep_restore'd
	 */
	ENTRY_NP(sep_save)
	xorl	%edx, %edx
	xorl	%eax, %eax
	movl	$MSR_INTC_SEP_ESP, %ecx
	wrmsr
	ret
	SET_SIZE(sep_save)

	/*
	 * Update the kernel stack pointer as we resume onto this cpu.
	 */
	ENTRY_NP(sep_restore)
	movq	%rdi, %rdx
	shrq	$32, %rdx
	movl	%edi, %eax
	movl	$MSR_INTC_SEP_ESP, %ecx
	wrmsr
	ret
	SET_SIZE(sep_restore)

#endif	/* __lint */