/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/asm_linkage.h>
#include <sys/asm_misc.h>
#include <sys/regset.h>
#include <sys/psw.h>

#if defined(__lint)

#include <sys/types.h>
#include <sys/thread.h>
#include <sys/systm.h>

#else	/* __lint */

#include <sys/segments.h>
#include <sys/pcb.h>
#include <sys/trap.h>
#include <sys/ftrace.h>
#include <sys/traptrace.h>
#include <sys/clock.h>
#include <sys/model.h>
#include <sys/panic.h>
#include "assym.h"

#endif	/* __lint */

/*
 * We implement five flavours of system call entry points
 *
 * -	syscall/sysretq		(amd64 generic)
 * -	syscall/sysretl		(i386 plus SYSC bit)
 * -	sysenter/sysexit	(i386 plus SEP bit)
 * -	int/iret		(i386 generic)
 * -	lcall/iret		(i386 generic)
 *
 * The current libc included in Solaris uses int/iret as the base unoptimized
 * kernel entry method.  Older libc implementations and legacy binaries may
 * use the lcall call gate, so it must continue to be supported.
 *
 * System calls that use an lcall call gate are processed in trap() via a
 * segment-not-present trap, i.e. lcalls are extremely slow(!).
 *
 * The basic pattern used in the 32-bit SYSC handler at this point in time is
 * to have the bare minimum of assembler, and get to the C handlers as
 * quickly as possible.
 *
 * The 64-bit handler is much closer to the sparcv9 handler; that's
 * because of passing arguments in registers.  The 32-bit world still
 * passes arguments on the stack -- that makes that handler substantially
 * more complex.
 *
 * The two handlers share a few code fragments which are broken
 * out into preprocessor macros below.
 *
 * XX64	come back and speed all this up later.  The argument-copying part
 * of the 32-bit path looks especially easy to speed up ..
 *
 *
 * Notes about segment register usage (cf. the 32-bit kernel)
 *
 * In the 32-bit kernel, segment registers are dutifully saved and
 * restored on all mode transitions because the kernel uses them directly.
 * When the processor is running in 64-bit mode, segment registers are
 * largely ignored.
 *
 * %cs and %ss
 *	controlled by the hardware mechanisms that make mode transitions
 *
 * The remaining segment registers have to either be pointing at a valid
 * descriptor i.e. with the 'present' bit set, or they can be NULL
 * descriptors.
 *
 * %ds and %es
 *	always ignored
 *
 * %fs and %gs
 *	fsbase and gsbase are used to control the place they really point at.
 *	The kernel only depends on %gs, and controls its own gsbase via swapgs
 *
 * Note that loading segment registers is still costly because the GDT
 * lookup still happens (this is because the hardware can't know that we're
 * not setting up these segment registers for a 32-bit program).  Thus we
 * avoid doing this in the syscall path, and defer those loads to the lwp
 * context switch handlers, so the register values remain virtualized to
 * the lwp.
 */

#if defined(SYSCALLTRACE)
#define	ORL_SYSCALLTRACE(r32)		\
	orl	syscalltrace(%rip), r32
#else
#define	ORL_SYSCALLTRACE(r32)
#endif

#define	MSTATE_TRANSITION(from, to)	\
	movl	$from, %edi;		\
	movl	$to, %esi;		\
	call	syscall_mstate

/*
 * Check to see if a simple (direct) return is possible i.e.
 *
 *	if ((t->t_post_sys_ast | syscalltrace |
 *	    (lwp->lwp_pcb.pcb_flags & RUPDATE_PENDING)) != 0)
 *		do full version;
 *
 * Preconditions:
 * -	t is curthread
 * Postconditions:
 * -	condition code NE is set if post-sys is too complex
 * -	rtmp is zeroed if it isn't (we rely on this!)
 * -	ltmp is smashed
 */
#define	CHECK_POSTSYS_NE(t, ltmp, rtmp)		\
	movq	T_LWP(t), ltmp;			\
	movl	PCB_FLAGS(ltmp), rtmp;		\
	andl	$RUPDATE_PENDING, rtmp;		\
	ORL_SYSCALLTRACE(rtmp);			\
	orl	T_POST_SYS_AST(t), rtmp;	\
	cmpl	$0, rtmp

/*
 * Fix up the lwp, thread, and eflags for a successful return
 *
 * Preconditions:
 * -	zwreg contains zero
 */
#define	SIMPLE_SYSCALL_POSTSYS(t, lwp, zwreg)		\
	movb	$LWP_USER, LWP_STATE(lwp);		\
	movw	zwreg, T_SYSNUM(t);			\
	andb	$_CONST(0xffff - PS_C), REGOFF_RFL(%rsp)

/*
 * ASSERT(lwptoregs(lwp) == rp);
 *
 * This may seem obvious, but very odd things happen if this
 * assertion is false
 *
 * Preconditions:
 *	(%rsp is ready for normal call sequence)
 * Postconditions (if assertion is true):
 *	%r11 is smashed
 *
 * ASSERT(rp->r_cs == descnum)
 *
 * The code selector is written into the regs structure when the
 * lwp stack is created.  We use this ASSERT to validate that
 * the regs structure really matches how we came in.
 *
 * Preconditions:
 *	(%rsp is ready for normal call sequence)
 * Postconditions (if assertion is true):
 *	-none-
 *
 * ASSERT((lwp->lwp_pcb.pcb_flags & RUPDATE_PENDING) == 0);
 *
 * If this is false, it means that we returned to userland without
 * updating the segment registers as we were supposed to.
 *
 * Note that we must ensure no interrupts or other traps intervene
 * between entering privileged mode and performing the assertion,
 * otherwise we may perform a context switch on the thread, which
 * will end up setting the RUPDATE_PENDING bit again.
 */
#if defined(DEBUG)

#if !defined(__lint)

__lwptoregs_msg:
	.string	"%M%:%d lwptoregs(%p) [%p] != rp [%p]"

__codesel_msg:
	.string	"%M%:%d rp->r_cs [%ld] != %ld"

__no_rupdate_msg:
	.string	"%M%:%d lwp %p, pcb_flags & RUPDATE_PENDING != 0"

#endif	/* !__lint */

#define	ASSERT_LWPTOREGS(lwp, rp)		\
	movq	LWP_REGS(lwp), %r11;		\
	cmpq	rp, %r11;			\
	je	7f;				\
	leaq	__lwptoregs_msg(%rip), %rdi;	\
	movl	$__LINE__, %esi;		\
	movq	lwp, %rdx;			\
	movq	%r11, %rcx;			\
	movq	rp, %r8;			\
	xorl	%eax, %eax;			\
	call	panic;				\
7:

#define	ASSERT_NO_RUPDATE_PENDING(lwp)			\
	testl	$RUPDATE_PENDING, PCB_FLAGS(lwp);	\
	je	8f;					\
	movq	lwp, %rdx;				\
	leaq	__no_rupdate_msg(%rip), %rdi;		\
	movl	$__LINE__, %esi;			\
	xorl	%eax, %eax;				\
	call	panic;					\
8:

#else
#define	ASSERT_LWPTOREGS(lwp, rp)
#define	ASSERT_NO_RUPDATE_PENDING(lwp)
#endif

/*
 * Do the traptrace thing and restore any registers we used
 * in situ.  Assumes that %rsp is pointing at the base of
 * the struct regs, obviously ..
 */
#ifdef TRAPTRACE
#define	SYSCALL_TRAPTRACE(ttype)				\
	TRACE_PTR(%rdi, %rbx, %ebx, %rcx, ttype);		\
	TRACE_REGS(%rdi, %rsp, %rbx, %rcx);			\
	TRACE_STAMP(%rdi);	/* rdtsc clobbers %eax, %edx */	\
	movq	REGOFF_RAX(%rsp), %rax;				\
	movq	REGOFF_RBX(%rsp), %rbx;				\
	movq	REGOFF_RCX(%rsp), %rcx;				\
	movq	REGOFF_RDX(%rsp), %rdx;				\
	movl	%eax, TTR_SYSNUM(%rdi);				\
	movq	REGOFF_RDI(%rsp), %rdi

#define	SYSCALL_TRAPTRACE32(ttype)				\
	SYSCALL_TRAPTRACE(ttype);				\
	/* paranoia: clean the top 32-bits of the registers */	\
	orl	%eax, %eax;					\
	orl	%ebx, %ebx;					\
	orl	%ecx, %ecx;					\
	orl	%edx, %edx;					\
	orl	%edi, %edi
#else	/* TRAPTRACE */
#define	SYSCALL_TRAPTRACE(ttype)
#define	SYSCALL_TRAPTRACE32(ttype)
#endif	/* TRAPTRACE */

/*
 * The 64-bit libc syscall wrapper does this:
 *
 *	fn(<args>)
 *	{
 *		movq	%rcx, %r10	-- because syscall smashes %rcx
 *		movl	$CODE, %eax
 *		syscall
 *		<error processing>
 *	}
 *
 * Thus when we come into the kernel:
 *
 *	%rdi, %rsi, %rdx, %r10, %r8, %r9 contain first six args
 *	%rax is the syscall number
 *	%r12-%r15 contain caller state
 *
 * The syscall instruction arranges that:
 *
 *	%rcx contains the return %rip
 *	%r11d contains bottom 32-bits of %rflags
 *	%rflags is masked (as determined by the SFMASK msr)
 *	%cs is set to the kernel's KCS_SEL (as determined by the STAR msr)
 *	%ss is set to the kernel's KDS_SEL (as determined by the STAR msr)
 *	%rip is set to sys_syscall (as determined by the LSTAR msr)
 *
 * Or in other words, we have no registers available at all.
 * Only swapgs can save us!
 */
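
/*
 * For reference, a sketch of the MSR programming that routes the syscall
 * instruction to this handler.  This is not the kernel's actual
 * initialization code; only the MSR numbers (which are architectural) and
 * the resulting entry state described above should be taken at face value:
 *
 *	LSTAR  (0xc0000082)	<- sys_syscall, the 64-bit entry %rip
 *	STAR   (0xc0000081)	<- selector bases giving the kernel %cs/%ss
 *				   on entry and the user selectors on sysret
 *	SFMASK (0xc0000084)	<- %rflags bits to clear on entry, presumably
 *				   including PS_IE, since we arrive with
 *				   interrupts disabled
 */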

#if defined(__lint)

/*ARGSUSED*/
void
sys_syscall()
{}

void
_allsyscalls()
{}

size_t _allsyscalls_size;

#else	/* __lint */

	ENTRY_NP2(sys_syscall,_allsyscalls)

	swapgs
	movq	%rsp, %gs:CPU_RTMP_RSP
	movq	%r15, %gs:CPU_RTMP_R15
	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp

	movl	$UCS_SEL, REGOFF_CS(%rsp)
	movq	%rcx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
	movl	$UDS_SEL, REGOFF_SS(%rsp)

	movl	%eax, %eax			/* wrapper: sysc# -> %eax */
	movq	%rdi, REGOFF_RDI(%rsp)
	movq	%rsi, REGOFF_RSI(%rsp)
	movq	%rdx, REGOFF_RDX(%rsp)
	movq	%r10, REGOFF_RCX(%rsp)		/* wrapper: %rcx -> %r10 */
	movq	%r10, %rcx			/* arg[3] for direct calls */

	movq	%r8, REGOFF_R8(%rsp)
	movq	%r9, REGOFF_R9(%rsp)
	movq	%rax, REGOFF_RAX(%rsp)
	movq	%rbx, REGOFF_RBX(%rsp)

	movq	%rbp, REGOFF_RBP(%rsp)
	movq	%r10, REGOFF_R10(%rsp)
	movq	%gs:CPU_RTMP_RSP, %r11
	movq	%r11, REGOFF_RSP(%rsp)
	movq	%r12, REGOFF_R12(%rsp)

	movq	%r13, REGOFF_R13(%rsp)
	movq	%r14, REGOFF_R14(%rsp)
	movq	%gs:CPU_RTMP_R15, %r10
	movq	%r10, REGOFF_R15(%rsp)
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * and capturing them involves two serializing instructions,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * Machine state saved in the regs structure on the stack
	 * First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE($TT_SYSC64)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	movb	$LWP_SYS, LWP_STATE(%r14)
	incq	LWP_RU_SYSC(%r14)
	movb	$NORMALRETURN, LWP_EOSYS(%r14)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	movw	%ax, T_SYSNUM(%r15)
	movzbl	T_PRE_SYS(%r15), %ebx
	ORL_SYSCALLTRACE(%ebx)
	testl	%ebx, %ebx
	jne	_syscall_pre

_syscall_invoke:
	movq	REGOFF_RDI(%rbp), %rdi
	movq	REGOFF_RSI(%rbp), %rsi
	movq	REGOFF_RDX(%rbp), %rdx
	movq	REGOFF_RCX(%rbp), %rcx
	movq	REGOFF_R8(%rbp), %r8
	movq	REGOFF_R9(%rbp), %r9

	cmpl	$NSYSCALL, %eax
	jae	_syscall_ill
	shll	$SYSENT_SIZE_SHIFT, %eax
	leaq	sysent(%rax), %rbx

	call	*SY_CALLC(%rbx)

	movq	%rax, %r12
	movq	%rdx, %r13
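
	/*
	 * In C terms the dispatch above is, roughly speaking (a sketch for
	 * the reader, not code taken from the C side of the kernel):
	 *
	 *	rval = sysent[code].sy_callc(arg0, arg1, arg2,
	 *	    arg3, arg4, arg5);
	 *
	 * The raw %rax/%rdx return values are parked in %r12/%r13 until we
	 * know whether %rax must instead be split into two 32-bit rvals.
	 */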

	/*
	 * If the handler returns two ints, then we need to split the
	 * 64-bit return value into two 32-bit values.
	 */
	testw	$SE_32RVAL2, SY_FLAGS(%rbx)
	je	5f
	movq	%r12, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%r12d, %r12d	/* lower 32-bits into %eax */
5:
	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	cli
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_syscall_post
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	movq	%r12, REGOFF_RAX(%rsp)
	movq	%r13, REGOFF_RDX(%rsp)

	/*
	 * To get back to userland, we need the return %rip in %rcx and
	 * the return %rfl in %r11d.  The sysretq instruction also arranges
	 * to fix up %cs and %ss; everything else is our responsibility.
	 */
	movq	REGOFF_RDI(%rsp), %rdi
	movq	REGOFF_RSI(%rsp), %rsi
	movq	REGOFF_RDX(%rsp), %rdx
	/* %rcx used to restore %rip value */

	movq	REGOFF_R8(%rsp), %r8
	movq	REGOFF_R9(%rsp), %r9
	movq	REGOFF_RAX(%rsp), %rax
	movq	REGOFF_RBX(%rsp), %rbx

	movq	REGOFF_RBP(%rsp), %rbp
	movq	REGOFF_R10(%rsp), %r10
	/* %r11 used to restore %rfl value */
	movq	REGOFF_R12(%rsp), %r12

	movq	REGOFF_R13(%rsp), %r13
	movq	REGOFF_R14(%rsp), %r14
	movq	REGOFF_R15(%rsp), %r15

	movq	REGOFF_RIP(%rsp), %rcx
	movl	REGOFF_RFL(%rsp), %r11d
	movq	REGOFF_RSP(%rsp), %rsp
	swapgs
	sysretq

_syscall_pre:
	call	pre_syscall
	movl	%eax, %r12d
	testl	%eax, %eax
	jne	_syscall_post_call
	/*
	 * Didn't abort, so reload the syscall args and invoke the handler.
	 */
	movzwl	T_SYSNUM(%r15), %eax
	jmp	_syscall_invoke

_syscall_ill:
	call	nosys
	movq	%rax, %r12
	movq	%rdx, %r13
	jmp	_syscall_post_call

_syscall_post:
	sti
	/*
	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
	 * so that we can account for the extra work it takes us to finish.
	 */
	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
_syscall_post_call:
	movq	%r12, %rdi
	movq	%r13, %rsi
	call	post_syscall
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
	jmp	sys_rtt_syscall
	SET_SIZE(sys_syscall)

#endif	/* __lint */

#if defined(__lint)

/*ARGSUSED*/
void
sys_syscall32()
{}

#else	/* __lint */

	ENTRY_NP(sys_syscall32)
	swapgs
	movl	%esp, %r10d
	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp
	movl	%eax, %eax

	movl	$U32CS_SEL, REGOFF_CS(%rsp)
	movl	%ecx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
	movq	%r10, REGOFF_RSP(%rsp)
	movl	$UDS_SEL, REGOFF_SS(%rsp)

_syscall32_save:

	movl	%edi, REGOFF_RDI(%rsp)
	movl	%esi, REGOFF_RSI(%rsp)
	movl	%ebp, REGOFF_RBP(%rsp)
	movl	%ebx, REGOFF_RBX(%rsp)
	movl	%edx, REGOFF_RDX(%rsp)
	movl	%ecx, REGOFF_RCX(%rsp)
	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * Application state saved in the regs structure on the stack
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE32($TT_SYSC)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	/*
	 * Make some space for MAXSYSARGS (currently 8) 32-bit args placed
	 * into 64-bit (long) arg slots, maintaining 16 byte alignment.  Or
	 * more succinctly:
	 *
	 *	SA(MAXSYSARGS * sizeof (long)) == 64
	 */
#define	SYS_DROP	64		/* drop for args */
	subq	$SYS_DROP, %rsp
	movb	$LWP_SYS, LWP_STATE(%r14)
	movq	%r15, %rdi
	movq	%rsp, %rsi
	call	syscall_entry

	/*
	 * Fetch the arguments copied onto the kernel stack and put
	 * them in the right registers to invoke a C-style syscall handler.
	 * %rax contains the handler address.
	 *
	 * Ideas for making all this go faster of course include simply
	 * forcibly fetching 6 arguments from the user stack under lofault
	 * protection, reverting to copyin_args only when watchpoints
	 * are in effect.
	 *
	 * (If we do this, make sure that exec and libthread leave
	 * enough space at the top of the stack to ensure that we'll
	 * never do a fetch from an invalid page.)
	 *
	 * Lots of ideas here, but they won't really help with bringup B-)
	 * Correctness can't wait, performance can wait a little longer ..
	 */

	movq	%rax, %rbx
	movl	0(%rsp), %edi
	movl	8(%rsp), %esi
	movl	0x10(%rsp), %edx
	movl	0x18(%rsp), %ecx
	movl	0x20(%rsp), %r8d
	movl	0x28(%rsp), %r9d

	call	*SY_CALLC(%rbx)

	movq	%rbp, %rsp	/* pop the args */

	/*
	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
	 * On the 32-bit kernel, they always return that value in %eax:%edx
	 * as required by the 32-bit ABI.
	 *
	 * Simulate the same behaviour by unconditionally splitting the
	 * return value in the same way.
	 */
	movq	%rax, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%eax, %r12d	/* lower 32-bits into %eax */

	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	cli
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_full_syscall_postsys32
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	/*
	 * To get back to userland, we need to put the return %rip in %rcx and
	 * the return %rfl in %r11d.  The sysret instruction also arranges
	 * to fix up %cs and %ss; everything else is our responsibility.
	 */

	movl	%r12d, %eax			/* %eax: rval1 */
	movl	REGOFF_RBX(%rsp), %ebx
	/* %ecx used for return pointer */
	movl	%r13d, %edx			/* %edx: rval2 */
	movl	REGOFF_RBP(%rsp), %ebp
	movl	REGOFF_RSI(%rsp), %esi
	movl	REGOFF_RDI(%rsp), %edi

	movl	REGOFF_RFL(%rsp), %r11d		/* %r11 -> eflags */
	movl	REGOFF_RIP(%rsp), %ecx		/* %ecx -> %eip */
	movl	REGOFF_RSP(%rsp), %esp

	swapgs
	sysretl

_full_syscall_postsys32:
	sti
	/*
	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
	 * so that we can account for the extra work it takes us to finish.
	 */
	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movq	%r15, %rdi
	movq	%r12, %rsi			/* rval1 - %eax */
	movq	%r13, %rdx			/* rval2 - %edx */
	call	syscall_exit
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
	jmp	sys_rtt_syscall32
	SET_SIZE(sys_syscall32)

#endif	/* __lint */

/*
 * System call handler via the sysenter instruction
 * Used only for 32-bit system calls on the 64-bit kernel.
 *
 * The caller in userland has arranged that:
 *
 * -	%eax contains the syscall number
 * -	%ecx contains the user %esp
 * -	%edx contains the return %eip
 * -	the user stack contains the args to the syscall
 *
 * Hardware and (privileged) initialization code have arranged that by
 * the time the sysenter instruction completes:
 *
 * -	%rip is pointing to sys_sysenter (below).
 * -	%cs and %ss are set to kernel text and stack (data) selectors.
 * -	%rsp is pointing at the lwp's stack
 * -	interrupts have been disabled.
 *
 * Note that we are unable to return both "rvals" to userland with
 * this call, as %edx is used by the sysexit instruction.
 */
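
/*
 * By analogy with the 64-bit wrapper shown earlier, a sysenter-based libc
 * wrapper has to look roughly like this (a sketch for illustration only,
 * not code taken from libc; the label is invented):
 *
 *	fn(<args pushed on the user stack>)
 *	{
 *		movl	$CODE, %eax
 *		movl	%esp, %ecx	-- return %esp
 *		movl	$1f, %edx	-- return %eip
 *		sysenter
 *	1:	<error processing>
 *	}
 */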
#if defined(__lint)

void
sys_sysenter()
{}

#else	/* __lint */

	ENTRY_NP(sys_sysenter)
	swapgs
	ALTENTRY(_sys_sysenter_post_swapgs)
	movq	%gs:CPU_THREAD, %r15

	movl	$U32CS_SEL, REGOFF_CS(%rsp)
	movl	%ecx, REGOFF_RSP(%rsp)		/* wrapper: %esp -> %ecx */
	movl	%edx, REGOFF_RIP(%rsp)		/* wrapper: %eip -> %edx */
	pushfq
	popq	%r10
	movl	$UDS_SEL, REGOFF_SS(%rsp)

	/*
	 * Set the interrupt flag before storing the flags to the
	 * flags image on the stack so we can return to user with
	 * interrupts enabled if we return via sys_rtt_syscall32
	 */
	orq	$PS_IE, %r10
	movq	%r10, REGOFF_RFL(%rsp)

	movl	%edi, REGOFF_RDI(%rsp)
	movl	%esi, REGOFF_RSI(%rsp)
	movl	%ebp, REGOFF_RBP(%rsp)
	movl	%ebx, REGOFF_RBX(%rsp)
	movl	%edx, REGOFF_RDX(%rsp)
	movl	%ecx, REGOFF_RCX(%rsp)
	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * Application state saved in the regs structure on the stack
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE($TT_SYSENTER)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	/*
	 * Catch a 64-bit process trying to issue a sysenter instruction
	 * on Nocona-based systems.
	 */
	movq	LWP_PROCP(%r14), %rax
	cmpq	$DATAMODEL_ILP32, P_MODEL(%rax)
	je	7f

	/*
	 * For a non-32-bit process, simulate a #ud, since that's what
	 * native hardware does.  The traptrace entry (above) will
	 * let you know what really happened.
	 */
	movq	$T_ILLINST, REGOFF_TRAPNO(%rsp)
	movq	REGOFF_CS(%rsp), %rdi
	movq	%rdi, REGOFF_ERR(%rsp)
	movq	%rsp, %rdi
	movq	REGOFF_RIP(%rsp), %rsi
	movl	%gs:CPU_ID, %edx
	call	trap
	jmp	_sys_rtt
7:

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate calls) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	/*
	 * Make some space for MAXSYSARGS (currently 8) 32-bit args
	 * placed into 64-bit (long) arg slots, plus one 64-bit
	 * (long) arg count, maintaining 16 byte alignment.
	 */
	subq	$SYS_DROP, %rsp
	movb	$LWP_SYS, LWP_STATE(%r14)
	movq	%r15, %rdi
	movq	%rsp, %rsi
	call	syscall_entry

	/*
	 * Fetch the arguments copied onto the kernel stack and put
	 * them in the right registers to invoke a C-style syscall handler.
	 * %rax contains the handler address.
	 */
	movq	%rax, %rbx
	movl	0(%rsp), %edi
	movl	8(%rsp), %esi
	movl	0x10(%rsp), %edx
	movl	0x18(%rsp), %ecx
	movl	0x20(%rsp), %r8d
	movl	0x28(%rsp), %r9d

	call	*SY_CALLC(%rbx)

	movq	%rbp, %rsp	/* pop the args */

	/*
	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
	 * On the 32-bit kernel, they always return that value in %eax:%edx
	 * as required by the 32-bit ABI.
	 *
	 * Simulate the same behaviour by unconditionally splitting the
	 * return value in the same way.
	 */
	movq	%rax, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%eax, %r12d	/* lower 32-bits into %eax */

	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	cli
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_full_syscall_postsys32
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	/*
	 * To get back to userland, load up the 32-bit registers and
	 * sysexit back where we came from.
	 */

	/*
	 * Interrupts will be turned on by the 'sti' executed just before
	 * sysexit.  The following ensures that restoring the user's rflags
	 * doesn't enable interrupts too soon.
	 */
	andq	$_BITNOT(PS_IE), REGOFF_RFL(%rsp)

	/*
	 * (There's no point in loading up %edx because the sysexit
	 * mechanism smashes it.)
	 */
	movl	%r12d, %eax
	movl	REGOFF_RBX(%rsp), %ebx
	movl	REGOFF_RBP(%rsp), %ebp
	movl	REGOFF_RSI(%rsp), %esi
	movl	REGOFF_RDI(%rsp), %edi

	movl	REGOFF_RIP(%rsp), %edx	/* sysexit: %edx -> %eip */
	pushq	REGOFF_RFL(%rsp)
	popfq
	movl	REGOFF_RSP(%rsp), %ecx	/* sysexit: %ecx -> %esp */
	swapgs
	sti
	sysexit
	SET_SIZE(sys_sysenter)
	SET_SIZE(_sys_sysenter_post_swapgs)

#endif	/* __lint */


/*
 * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by
 * the generic i386 libc to do system calls.  We do a small amount of setup
 * before jumping into the existing sys_syscall32 path.
 */
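
/*
 * For comparison with the 64-bit wrapper shown earlier, such a stub looks
 * roughly like this (a sketch for illustration only, not code taken from
 * libc):
 *
 *	fn(<args pushed on the user stack>)
 *	{
 *		movl	$CODE, %eax
 *		int	$T_SYSCALLINT
 *		<error processing>
 *	}
 */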
#if defined(__lint)

/*ARGSUSED*/
void
sys_syscall_int()
{}

#else	/* __lint */

	ENTRY_NP(sys_syscall_int)
	swapgs
	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp
	movl	%eax, %eax
	/*
	 * Set t_post_sys on this thread to force ourselves out via the slow
	 * path.  It might be possible at some later date to optimize this
	 * out and use a faster return mechanism.
	 */
	movb	$1, T_POST_SYS(%r15)
	jmp	_syscall32_save
	SET_SIZE(sys_syscall_int)

#endif	/* __lint */

/*
 * Legacy 32-bit applications and old libc implementations do lcalls;
 * we should never get here because the LDT entry containing the syscall
 * segment descriptor has the "segment present" bit cleared, which means
 * we end up processing those system calls in trap() via a not-present trap.
 *
 * We do it this way because a call gate unhelpfully does -nothing- to the
 * interrupt flag bit, so an interrupt can run us just after the lcall
 * completes, but just before the swapgs takes effect.  Thus the INTR_PUSH
 * and INTR_POP paths would have to be slightly more complex to dance around
 * this problem, and end up depending explicitly on the first
 * instruction of this handler being either swapgs or cli.
 */

#if defined(__lint)

/*ARGSUSED*/
void
sys_lcall32()
{}

#else	/* __lint */

	ENTRY_NP(sys_lcall32)
	swapgs
	pushq	$0
	pushq	%rbp
	movq	%rsp, %rbp
	leaq	__lcall_panic_str(%rip), %rdi
	xorl	%eax, %eax
	call	panic
	SET_SIZE(sys_lcall32)

__lcall_panic_str:
	.string	"sys_lcall32: shouldn't be here!"

/*
 * Declare a uintptr_t which covers the entire pc range of syscall
 * handlers for the stack walkers that need this.
 */
	.align	CPTRSIZE
	.globl	_allsyscalls_size
	.type	_allsyscalls_size, @object
_allsyscalls_size:
	.NWORD	. - _allsyscalls
	SET_SIZE(_allsyscalls_size)

#endif	/* __lint */

/*
 * These are the thread context handlers for lwps using sysenter/sysexit.
 */

#if defined(__lint)

/*ARGSUSED*/
void
sep_save(void *ksp)
{}

/*ARGSUSED*/
void
sep_restore(void *ksp)
{}

#else	/* __lint */

	/*
	 * setting this value to zero as we switch away causes the
	 * stack-pointer-on-sysenter to be NULL, ensuring that we
	 * don't silently corrupt another (preempted) thread stack
	 * when running an lwp that (somehow) didn't get sep_restore'd
	 */
	ENTRY_NP(sep_save)
	xorl	%edx, %edx
	xorl	%eax, %eax
	movl	$MSR_INTC_SEP_ESP, %ecx
	wrmsr
	ret
	SET_SIZE(sep_save)

	/*
	 * Update the kernel stack pointer as we resume onto this cpu.
	 */
	ENTRY_NP(sep_restore)
	movq	%rdi, %rdx
	shrq	$32, %rdx
	movl	%edi, %eax
	movl	$MSR_INTC_SEP_ESP, %ecx
	wrmsr
	ret
	SET_SIZE(sep_restore)

#endif	/* __lint */