/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/asm_linkage.h>
#include <sys/asm_misc.h>
#include <sys/regset.h>
#include <sys/privregs.h>
#include <sys/psw.h>
#include <sys/machbrand.h>

#if defined(__lint)

#include <sys/types.h>
#include <sys/thread.h>
#include <sys/systm.h>

#else	/* __lint */

#include <sys/segments.h>
#include <sys/pcb.h>
#include <sys/trap.h>
#include <sys/ftrace.h>
#include <sys/traptrace.h>
#include <sys/clock.h>
#include <sys/model.h>
#include <sys/panic.h>
#include "assym.h"

#endif	/* __lint */

/*
 * We implement five flavours of system call entry points
 *
 * -	syscall/sysretq		(amd64 generic)
 * -	syscall/sysretl		(i386 plus SYSC bit)
 * -	sysenter/sysexit	(i386 plus SEP bit)
 * -	int/iret		(i386 generic)
 * -	lcall/iret		(i386 generic)
 *
 * The current libc included in Solaris uses int/iret as the base unoptimized
 * kernel entry method.  Older libc implementations and legacy binaries may
 * use the lcall call gate, so it must continue to be supported.
 *
 * System calls that use an lcall call gate are processed in trap() via a
 * segment-not-present trap, i.e. lcalls are extremely slow(!).
 *
 * The basic pattern used in the 32-bit SYSC handler at this point in time is
 * to have the bare minimum of assembler, and get to the C handlers as
 * quickly as possible.
 *
 * The 64-bit handler is much closer to the sparcv9 handler; that's
 * because of passing arguments in registers.  The 32-bit world still
 * passes arguments on the stack -- that makes that handler substantially
 * more complex.
 *
 * The two handlers share a few code fragments which are broken
 * out into preprocessor macros below.
 *
 * XX64	come back and speed all this up later.  The 32-bit stuff looks
 * especially easy to speed up the argument copying part ..
 *
 *
 * Notes about segment register usage (c.f. the 32-bit kernel)
 *
 * In the 32-bit kernel, segment registers are dutifully saved and
 * restored on all mode transitions because the kernel uses them directly.
 * When the processor is running in 64-bit mode, segment registers are
 * largely ignored.
 *
 * %cs and %ss
 *	controlled by the hardware mechanisms that make mode transitions
 *
 * The remaining segment registers have to either be pointing at a valid
 * descriptor i.e. with the 'present' bit set, or they can be NULL
 * descriptors
 *
 * %ds and %es
 *	always ignored
 *
 * %fs and %gs
 *	fsbase and gsbase are used to control the place they really point at.
 *	The kernel only depends on %gs, and controls its own gsbase via swapgs
 *
 * Note that loading segment registers is still costly because the GDT
 * lookup still happens (this is because the hardware can't know that we're
 * not setting up these segment registers for a 32-bit program).  Thus we
 * avoid doing this in the syscall path, and defer them to lwp context switch
 * handlers, so the register values remain virtualized to the lwp.
 */
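
/*
 * (Orientation: none of the entry points in this file is reached by an
 * ordinary call or jump.  CPU startup code elsewhere installs their
 * addresses into the machine -- the syscall target via the LSTAR msr
 * (described with the syscall instruction below), the sysenter target via
 * the sysenter MSRs (c.f. sep_restore at the bottom of this file), and
 * sys_syscall_int via the IDT gate for T_SYSCALLINT.  Purely as a hedged
 * C-style sketch, with illustrative names rather than the real ones:
 *
 *	wrmsr(<LSTAR>, (uint64_t)(uintptr_t)sys_syscall);
 *	wrmsr(<SYSENTER_EIP>, (uint64_t)(uintptr_t)sys_sysenter);
 *
 * The authoritative wiring lives in that initialization code, not here.)
 */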

#if defined(SYSCALLTRACE)
#define	ORL_SYSCALLTRACE(r32)			\
	orl	syscalltrace(%rip), r32
#else
#define	ORL_SYSCALLTRACE(r32)
#endif

/*
 * In the 32-bit kernel, we do absolutely nothing before getting into the
 * brand callback checks.  In 64-bit land, we do swapgs and then come here.
 * We assume that the %rsp- and %r15-stashing fields in the CPU structure
 * are still unused.
 *
 * When the callback is invoked, we will be on the user's %gs and
 * the stack will look like this:
 *
 * stack:  --------------------------------------
 *         | callback pointer                   |
 *       | | user stack pointer                 |
 *       | | lwp brand data                     |
 *       | | proc brand data                    |
 *       v | userland return address            |
 *         | callback wrapper return addr       |
 *         --------------------------------------
 *
 */
#define	BRAND_CALLBACK(callback_id)					    \
	movq	%rsp, %gs:CPU_RTMP_RSP	/* save the stack pointer	*/ ;\
	movq	%r15, %gs:CPU_RTMP_R15	/* save %r15			*/ ;\
	movq	%gs:CPU_THREAD, %r15	/* load the thread pointer	*/ ;\
	movq	T_STACK(%r15), %rsp	/* switch to the kernel stack	*/ ;\
	subq	$16, %rsp		/* save space for two pointers	*/ ;\
	pushq	%r14			/* save %r14			*/ ;\
	movq	%gs:CPU_RTMP_RSP, %r14					   ;\
	movq	%r14, 8(%rsp)		/* stash the user stack pointer	*/ ;\
	popq	%r14			/* restore %r14			*/ ;\
	movq	T_LWP(%r15), %r15	/* load the lwp pointer		*/ ;\
	pushq	LWP_BRAND(%r15)		/* push the lwp's brand data	*/ ;\
	movq	LWP_PROCP(%r15), %r15	/* load the proc pointer	*/ ;\
	pushq	P_BRAND_DATA(%r15)	/* push the proc's brand data	*/ ;\
	movq	P_BRAND(%r15), %r15	/* load the brand pointer	*/ ;\
	movq	B_MACHOPS(%r15), %r15	/* load the machops pointer	*/ ;\
	movq	_CONST(_MUL(callback_id, CPTRSIZE))(%r15), %r15		   ;\
	cmpq	$0, %r15						   ;\
	je	1f							   ;\
	movq	%r15, 24(%rsp)		/* save the callback pointer	*/ ;\
	movq	%gs:CPU_RTMP_RSP, %r15	/* grab the user stack pointer	*/ ;\
	pushq	(%r15)			/* push the return address	*/ ;\
	movq	%gs:CPU_RTMP_R15, %r15	/* restore %r15			*/ ;\
	swapgs								   ;\
	call	*32(%rsp)		/* call callback		*/ ;\
	swapgs								   ;\
1:	movq	%gs:CPU_RTMP_R15, %r15	/* restore %r15			*/ ;\
	movq	%gs:CPU_RTMP_RSP, %rsp	/* restore the stack pointer	*/

#define	MSTATE_TRANSITION(from, to)		\
	movl	$from, %edi;			\
	movl	$to, %esi;			\
	call	syscall_mstate
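
/*
 * (A note on the macro above: the amd64 C calling convention passes the
 * first two integer arguments in %rdi and %rsi, so loading %edi and %esi
 * before the call is simply the equivalent of the C statement
 *
 *	syscall_mstate(from, to);
 *
 * i.e. lwp microstate accounting at each user/system boundary crossing.)
 */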

/*
 * Check to see if a simple (direct) return is possible i.e.
 *
 *	if ((t->t_post_sys_ast | syscalltrace |
 *	    (lwp->lwp_pcb.pcb_flags & RUPDATE_PENDING)) != 0)
 *		do full version	;
 *
 * Preconditions:
 * -	t is curthread
 * Postconditions:
 * -	condition code NE is set if post-sys is too complex
 * -	rtmp is zeroed if it isn't (we rely on this!)
 * -	ltmp is smashed
 */
#define	CHECK_POSTSYS_NE(t, ltmp, rtmp)			\
	movq	T_LWP(t), ltmp;				\
	movl	PCB_FLAGS(ltmp), rtmp;			\
	andl	$RUPDATE_PENDING, rtmp;			\
	ORL_SYSCALLTRACE(rtmp);				\
	orl	T_POST_SYS_AST(t), rtmp;		\
	cmpl	$0, rtmp

/*
 * Fix up the lwp, thread, and eflags for a successful return
 *
 * Preconditions:
 * -	zwreg contains zero
 */
#define	SIMPLE_SYSCALL_POSTSYS(t, lwp, zwreg)		\
	movb	$LWP_USER, LWP_STATE(lwp);		\
	movw	zwreg, T_SYSNUM(t);			\
	andb	$_CONST(0xffff - PS_C), REGOFF_RFL(%rsp)
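
/*
 * (A note on the andb above: Solaris system calls conventionally signal
 * failure back to the libc wrapper via the carry flag, so clearing PS_C
 * in the saved %rflags is how this "simple" success path says "no error"
 * before returning to userland; an error is expected to force the full
 * post_syscall path instead, which sets the flag.)
 */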

/*
 * ASSERT(lwptoregs(lwp) == rp);
 *
 * This may seem obvious, but very odd things happen if this
 * assertion is false
 *
 * Preconditions:
 *	(%rsp is ready for normal call sequence)
 * Postconditions (if assertion is true):
 *	%r11 is smashed
 *
 * ASSERT(rp->r_cs == descnum)
 *
 * The code selector is written into the regs structure when the
 * lwp stack is created.  We use this ASSERT to validate that
 * the regs structure really matches how we came in.
 *
 * Preconditions:
 *	(%rsp is ready for normal call sequence)
 * Postconditions (if assertion is true):
 *	-none-
 *
 * ASSERT((lwp->lwp_pcb.pcb_flags & RUPDATE_PENDING) == 0);
 *
 * If this is false, it means that we returned to userland without
 * updating the segment registers as we were supposed to.
 *
 * Note that we must ensure no interrupts or other traps intervene
 * between entering privileged mode and performing the assertion,
 * otherwise we may perform a context switch on the thread, which
 * will end up setting the RUPDATE_PENDING bit again.
 */
#if defined(DEBUG)

#if !defined(__lint)

__lwptoregs_msg:
	.string	"syscall_asm_amd64.s:%d lwptoregs(%p) [%p] != rp [%p]"

__codesel_msg:
	.string	"syscall_asm_amd64.s:%d rp->r_cs [%ld] != %ld"

__no_rupdate_msg:
	.string	"syscall_asm_amd64.s:%d lwp %p, pcb_flags & RUPDATE_PENDING != 0"

#endif	/* !__lint */

#define	ASSERT_LWPTOREGS(lwp, rp)			\
	movq	LWP_REGS(lwp), %r11;			\
	cmpq	rp, %r11;				\
	je	7f;					\
	leaq	__lwptoregs_msg(%rip), %rdi;		\
	movl	$__LINE__, %esi;			\
	movq	lwp, %rdx;				\
	movq	%r11, %rcx;				\
	movq	rp, %r8;				\
	xorl	%eax, %eax;				\
	call	panic;					\
7:

#define	ASSERT_NO_RUPDATE_PENDING(lwp)			\
	testl	$RUPDATE_PENDING, PCB_FLAGS(lwp);	\
	je	8f;					\
	movq	lwp, %rdx;				\
	leaq	__no_rupdate_msg(%rip), %rdi;		\
	movl	$__LINE__, %esi;			\
	xorl	%eax, %eax;				\
	call	panic;					\
8:

#else
#define	ASSERT_LWPTOREGS(lwp, rp)
#define	ASSERT_NO_RUPDATE_PENDING(lwp)
#endif

/*
 * Do the traptrace thing and restore any registers we used
 * in situ.  Assumes that %rsp is pointing at the base of
 * the struct regs, obviously ..
 */
#ifdef TRAPTRACE
#define	SYSCALL_TRAPTRACE(ttype)				\
	TRACE_PTR(%rdi, %rbx, %ebx, %rcx, ttype);		\
	TRACE_REGS(%rdi, %rsp, %rbx, %rcx);			\
	TRACE_STAMP(%rdi);	/* rdtsc clobbers %eax, %edx */	\
	movq	REGOFF_RAX(%rsp), %rax;				\
	movq	REGOFF_RBX(%rsp), %rbx;				\
	movq	REGOFF_RCX(%rsp), %rcx;				\
	movq	REGOFF_RDX(%rsp), %rdx;				\
	movl	%eax, TTR_SYSNUM(%rdi);				\
	movq	REGOFF_RDI(%rsp), %rdi

#define	SYSCALL_TRAPTRACE32(ttype)				\
	SYSCALL_TRAPTRACE(ttype);				\
	/* paranoia: clean the top 32-bits of the registers */	\
	orl	%eax, %eax;					\
	orl	%ebx, %ebx;					\
	orl	%ecx, %ecx;					\
	orl	%edx, %edx;					\
	orl	%edi, %edi
#else	/* TRAPTRACE */
#define	SYSCALL_TRAPTRACE(ttype)
#define	SYSCALL_TRAPTRACE32(ttype)
#endif	/* TRAPTRACE */

/*
 * The 64-bit libc syscall wrapper does this:
 *
 * fn(<args>)
 * {
 *	movq	%rcx, %r10	-- because syscall smashes %rcx
 *	movl	$CODE, %eax
 *	syscall
 *	<error processing>
 * }
 *
 * Thus when we come into the kernel:
 *
 *	%rdi, %rsi, %rdx, %r10, %r8, %r9 contain first six args
 *	%rax is the syscall number
 *	%r12-%r15 contain caller state
 *
 * The syscall instruction arranges that:
 *
 *	%rcx contains the return %rip
 *	%r11d contains bottom 32-bits of %rflags
 *	%rflags is masked (as determined by the SFMASK msr)
 *	%cs is set to UCS_SEL (as determined by the STAR msr)
 *	%ss is set to UDS_SEL (as determined by the STAR msr)
 *	%rip is set to sys_syscall (as determined by the LSTAR msr)
 *
 * Or in other words, we have no registers available at all.
 * Only swapgs can save us!
 */
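
/*
 * (What swapgs buys us here: it exchanges the user's %gs base with the
 * kernel's saved %gs base from its MSR, so without touching a single
 * general purpose register we gain %gs-relative access to the per-cpu
 * structure -- which is how CPU_RTMP_RSP, CPU_RTMP_R15 and CPU_THREAD
 * are reached immediately below.)
 */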

#if defined(__lint)

/*ARGSUSED*/
void
sys_syscall()
{}

void
_allsyscalls()
{}

size_t _allsyscalls_size;

#else	/* __lint */

	ENTRY_NP2(brand_sys_syscall,_allsyscalls)
	SWAPGS
	BRAND_CALLBACK(BRAND_CB_SYSCALL)
	SWAPGS

	ALTENTRY(sys_syscall)
	SWAPGS
	movq	%rsp, %gs:CPU_RTMP_RSP
	movq	%r15, %gs:CPU_RTMP_R15
	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp

	movl	$UCS_SEL, REGOFF_CS(%rsp)
	movq	%rcx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
	movl	$UDS_SEL, REGOFF_SS(%rsp)

	movl	%eax, %eax			/* wrapper: sysc# -> %eax */
	movq	%rdi, REGOFF_RDI(%rsp)
	movq	%rsi, REGOFF_RSI(%rsp)
	movq	%rdx, REGOFF_RDX(%rsp)
	movq	%r10, REGOFF_RCX(%rsp)		/* wrapper: %rcx -> %r10 */
	movq	%r10, %rcx			/* arg[3] for direct calls */

	movq	%r8, REGOFF_R8(%rsp)
	movq	%r9, REGOFF_R9(%rsp)
	movq	%rax, REGOFF_RAX(%rsp)
	movq	%rbx, REGOFF_RBX(%rsp)

	movq	%rbp, REGOFF_RBP(%rsp)
	movq	%r10, REGOFF_R10(%rsp)
	movq	%gs:CPU_RTMP_RSP, %r11
	movq	%r11, REGOFF_RSP(%rsp)
	movq	%r12, REGOFF_R12(%rsp)

	movq	%r13, REGOFF_R13(%rsp)
	movq	%r14, REGOFF_R14(%rsp)
	movq	%gs:CPU_RTMP_R15, %r10
	movq	%r10, REGOFF_R15(%rsp)
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * and capturing them involves two serializing instructions,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * Machine state saved in the regs structure on the stack
	 * First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE($TT_SYSC64)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)
	ENABLE_INTR_FLAGS

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	movb	$LWP_SYS, LWP_STATE(%r14)
	incq	LWP_RU_SYSC(%r14)
	movb	$NORMALRETURN, LWP_EOSYS(%r14)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	movw	%ax, T_SYSNUM(%r15)
	movzbl	T_PRE_SYS(%r15), %ebx
	ORL_SYSCALLTRACE(%ebx)
	testl	%ebx, %ebx
	jne	_syscall_pre

_syscall_invoke:
	movq	REGOFF_RDI(%rbp), %rdi
	movq	REGOFF_RSI(%rbp), %rsi
	movq	REGOFF_RDX(%rbp), %rdx
	movq	REGOFF_RCX(%rbp), %rcx
	movq	REGOFF_R8(%rbp), %r8
	movq	REGOFF_R9(%rbp), %r9

	cmpl	$NSYSCALL, %eax
	jae	_syscall_ill
	shll	$SYSENT_SIZE_SHIFT, %eax
	leaq	sysent(%rax), %rbx

	call	*SY_CALLC(%rbx)

	movq	%rax, %r12
	movq	%rdx, %r13

	/*
	 * If the handler returns two ints, then we need to split the
	 * 64-bit return value into two 32-bit values.
	 */
	testw	$SE_32RVAL2, SY_FLAGS(%rbx)
	je	5f
	movq	%r12, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%r12d, %r12d	/* lower 32-bits into %eax */
5:
	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	CLI(%r14)
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_syscall_post
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	movq	%r12, REGOFF_RAX(%rsp)
	movq	%r13, REGOFF_RDX(%rsp)

	/*
	 * To get back to userland, we need the return %rip in %rcx and
	 * the return %rfl in %r11d.  The sysretq instruction also arranges
	 * to fix up %cs and %ss; everything else is our responsibility.
	 */
	movq	REGOFF_RDI(%rsp), %rdi
	movq	REGOFF_RSI(%rsp), %rsi
	movq	REGOFF_RDX(%rsp), %rdx
	/* %rcx used to restore %rip value */

	movq	REGOFF_R8(%rsp), %r8
	movq	REGOFF_R9(%rsp), %r9
	movq	REGOFF_RAX(%rsp), %rax
	movq	REGOFF_RBX(%rsp), %rbx

	movq	REGOFF_RBP(%rsp), %rbp
	movq	REGOFF_R10(%rsp), %r10
	/* %r11 used to restore %rfl value */
	movq	REGOFF_R12(%rsp), %r12

	movq	REGOFF_R13(%rsp), %r13
	movq	REGOFF_R14(%rsp), %r14
	movq	REGOFF_R15(%rsp), %r15

	movq	REGOFF_RIP(%rsp), %rcx
	movl	REGOFF_RFL(%rsp), %r11d
	movq	REGOFF_RSP(%rsp), %rsp
	SWAPGS
	sysretq

_syscall_pre:
	call	pre_syscall
	movl	%eax, %r12d
	testl	%eax, %eax
	jne	_syscall_post_call
	/*
	 * Didn't abort, so reload the syscall args and invoke the handler.
	 */
	movzwl	T_SYSNUM(%r15), %eax
	jmp	_syscall_invoke

_syscall_ill:
	call	nosys
	movq	%rax, %r12
	movq	%rdx, %r13
	jmp	_syscall_post_call

_syscall_post:
	STI
	/*
	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
	 * so that we can account for the extra work it takes us to finish.
	 */
	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
_syscall_post_call:
	movq	%r12, %rdi
	movq	%r13, %rsi
	call	post_syscall
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
	jmp	_sys_rtt
	SET_SIZE(sys_syscall)
	SET_SIZE(brand_sys_syscall)

#endif	/* __lint */
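
/*
 * (The 32-bit entry points that follow -- sys_syscall32, sys_sysenter and
 * sys_syscall_int -- all share the basic shape of sys_syscall above:
 * stash user state in a struct regs on the thread's kernel stack, account
 * the microstate transition, let the pre-syscall logic run, dispatch
 * through the sysent table, then either take the optimized register-based
 * return or fall back to _sys_rtt for the full return path.)
 */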

#if defined(__lint)

/*ARGSUSED*/
void
sys_syscall32()
{}

#else	/* __lint */

	ENTRY_NP(brand_sys_syscall32)
	SWAPGS
	BRAND_CALLBACK(BRAND_CB_SYSCALL32)
	SWAPGS

	ALTENTRY(sys_syscall32)
	SWAPGS
	movl	%esp, %r10d
	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp
	movl	%eax, %eax

	movl	$U32CS_SEL, REGOFF_CS(%rsp)
	movl	%ecx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
	movq	%r10, REGOFF_RSP(%rsp)
	movl	$UDS_SEL, REGOFF_SS(%rsp)

_syscall32_save:
	movl	%edi, REGOFF_RDI(%rsp)
	movl	%esi, REGOFF_RSI(%rsp)
	movl	%ebp, REGOFF_RBP(%rsp)
	movl	%ebx, REGOFF_RBX(%rsp)
	movl	%edx, REGOFF_RDX(%rsp)
	movl	%ecx, REGOFF_RCX(%rsp)
	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * Application state saved in the regs structure on the stack
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE32($TT_SYSC)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	/*
	 * Make some space for MAXSYSARGS (currently 8) 32-bit args placed
	 * into 64-bit (long) arg slots, maintaining 16 byte alignment.  Or
	 * more succinctly:
	 *
	 *	SA(MAXSYSARGS * sizeof (long)) == 64
	 */
#define	SYS_DROP	64		/* drop for args */
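
	/*
	 * (Checking the arithmetic: MAXSYSARGS is 8 and a long is 8 bytes
	 * on amd64, so the args area is 8 * 8 == 64 bytes, which is already
	 * a multiple of 16; the SA() stack-alignment round-up therefore
	 * leaves it at 64.)
	 */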
	subq	$SYS_DROP, %rsp
	movb	$LWP_SYS, LWP_STATE(%r14)
	movq	%r15, %rdi
	movq	%rsp, %rsi
	call	syscall_entry

	/*
	 * Fetch the arguments copied onto the kernel stack and put
	 * them in the right registers to invoke a C-style syscall handler.
	 * %rax contains the handler address.
	 *
	 * Ideas for making all this go faster of course include simply
	 * forcibly fetching 6 arguments from the user stack under lofault
	 * protection, reverting to copyin_args only when watchpoints
	 * are in effect.
	 *
	 * (If we do this, make sure that exec and libthread leave
	 * enough space at the top of the stack to ensure that we'll
	 * never do a fetch from an invalid page.)
	 *
	 * Lots of ideas here, but they won't really help with bringup B-)
	 * Correctness can't wait, performance can wait a little longer ..
	 */

	movq	%rax, %rbx
	movl	0(%rsp), %edi
	movl	8(%rsp), %esi
	movl	0x10(%rsp), %edx
	movl	0x18(%rsp), %ecx
	movl	0x20(%rsp), %r8d
	movl	0x28(%rsp), %r9d

	call	*SY_CALLC(%rbx)

	movq	%rbp, %rsp	/* pop the args */

	/*
	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
	 * On the 32-bit kernel, they always return that value in %eax:%edx
	 * as required by the 32-bit ABI.
	 *
	 * Simulate the same behaviour by unconditionally splitting the
	 * return value in the same way.
	 */
	movq	%rax, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%eax, %r12d	/* lower 32-bits into %eax */

	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	CLI(%r14)
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_full_syscall_postsys32
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	/*
	 * To get back to userland, we need to put the return %rip in %rcx and
	 * the return %rfl in %r11d.  The sysret instruction also arranges
	 * to fix up %cs and %ss; everything else is our responsibility.
	 */

	movl	%r12d, %eax			/* %eax: rval1 */
	movl	REGOFF_RBX(%rsp), %ebx
	/* %ecx used for return pointer */
	movl	%r13d, %edx			/* %edx: rval2 */
	movl	REGOFF_RBP(%rsp), %ebp
	movl	REGOFF_RSI(%rsp), %esi
	movl	REGOFF_RDI(%rsp), %edi

	movl	REGOFF_RFL(%rsp), %r11d		/* %r11 -> eflags */
	movl	REGOFF_RIP(%rsp), %ecx		/* %ecx -> %eip */
	movl	REGOFF_RSP(%rsp), %esp

	swapgs
	sysretl

_full_syscall_postsys32:
	STI
	/*
	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
	 * so that we can account for the extra work it takes us to finish.
	 */
	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movq	%r15, %rdi
	movq	%r12, %rsi		/* rval1 - %eax */
	movq	%r13, %rdx		/* rval2 - %edx */
	call	syscall_exit
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
	jmp	_sys_rtt
	SET_SIZE(sys_syscall32)
	SET_SIZE(brand_sys_syscall32)

#endif	/* __lint */

/*
 * System call handler via the sysenter instruction
 * Used only for 32-bit system calls on the 64-bit kernel.
 *
 * The caller in userland has arranged that:
 *
 * -	%eax contains the syscall number
 * -	%ecx contains the user %esp
 * -	%edx contains the return %eip
 * -	the user stack contains the args to the syscall
 *
 * Hardware and (privileged) initialization code have arranged that by
 * the time the sysenter instruction completes:
 *
 * -	%rip is pointing to sys_sysenter (below).
 * -	%cs and %ss are set to kernel text and stack (data) selectors.
 * -	%rsp is pointing at the lwp's stack
 * -	interrupts have been disabled.
 *
 * Note that we are unable to return both "rvals" to userland with
 * this call, as %edx is used by the sysexit instruction.
 *
 * One final complication in this routine is its interaction with
 * single-stepping in a debugger.  For most of the system call mechanisms,
 * the CPU automatically clears the single-step flag before we enter the
 * kernel.  The sysenter mechanism does not clear the flag, so a user
 * single-stepping through a libc routine may suddenly find him/herself
 * single-stepping through the kernel.  To detect this, kmdb compares the
 * trap %pc to the [brand_]sys_sysenter addresses on each single-step trap.
 * If it finds that we have single-stepped to a sysenter entry point, it
 * explicitly clears the flag and executes the sys_sysenter routine.
 *
 * One final complication in this final complication is the fact that we
 * have two different entry points for sysenter: brand_sys_sysenter and
 * sys_sysenter.  If we enter at brand_sys_sysenter and start single-stepping
 * through the kernel with kmdb, we will eventually hit the instruction at
 * sys_sysenter.  kmdb cannot distinguish between that valid single-step
 * and the undesirable one mentioned above.  To avoid this situation, we
 * simply add a jump over the instruction at sys_sysenter to make it
 * impossible to single-step to it.
 */
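
/*
 * (The "%rsp is pointing at the lwp's stack" part of this contract is
 * maintained by the sep_save/sep_restore context handlers at the bottom
 * of this file: sep_restore writes the resuming lwp's kernel stack
 * pointer into MSR_INTC_SEP_ESP, and sep_save zeroes that MSR as the lwp
 * switches away, so a sysenter can never land on a stale stack.)
 */
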
#if defined(__lint)

void
sys_sysenter()
{}

#else	/* __lint */

	ENTRY_NP(brand_sys_sysenter)
	SWAPGS

	ALTENTRY(_brand_sys_sysenter_post_swapgs)
	BRAND_CALLBACK(BRAND_CB_SYSENTER)
	/*
	 * Jump over sys_sysenter to allow single-stepping as described
	 * above.
	 */
	jmp	_sys_sysenter_post_swapgs

	ALTENTRY(sys_sysenter)
	SWAPGS

	ALTENTRY(_sys_sysenter_post_swapgs)
	movq	%gs:CPU_THREAD, %r15

	movl	$U32CS_SEL, REGOFF_CS(%rsp)
	movl	%ecx, REGOFF_RSP(%rsp)		/* wrapper: %esp -> %ecx */
	movl	%edx, REGOFF_RIP(%rsp)		/* wrapper: %eip -> %edx */
	pushfq
	popq	%r10
	movl	$UDS_SEL, REGOFF_SS(%rsp)

	/*
	 * Set the interrupt flag before storing the flags to the
	 * flags image on the stack so we can return to user with
	 * interrupts enabled if we return via sys_rtt_syscall32
	 */
	orq	$PS_IE, %r10
	movq	%r10, REGOFF_RFL(%rsp)

	movl	%edi, REGOFF_RDI(%rsp)
	movl	%esi, REGOFF_RSI(%rsp)
	movl	%ebp, REGOFF_RBP(%rsp)
	movl	%ebx, REGOFF_RBX(%rsp)
	movl	%edx, REGOFF_RDX(%rsp)
	movl	%ecx, REGOFF_RCX(%rsp)
	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * Application state saved in the regs structure on the stack
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE($TT_SYSENTER)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	/*
	 * Catch 64-bit process trying to issue sysenter instruction
	 * on Nocona based systems.
	 */
	movq	LWP_PROCP(%r14), %rax
	cmpq	$DATAMODEL_ILP32, P_MODEL(%rax)
	je	7f

	/*
	 * For a non-32-bit process, simulate a #ud, since that's what
	 * native hardware does.  The traptrace entry (above) will
	 * let you know what really happened.
	 */
	movq	$T_ILLINST, REGOFF_TRAPNO(%rsp)
	movq	REGOFF_CS(%rsp), %rdi
	movq	%rdi, REGOFF_ERR(%rsp)
	movq	%rsp, %rdi
	movq	REGOFF_RIP(%rsp), %rsi
	movl	%gs:CPU_ID, %edx
	call	trap
	jmp	_sys_rtt
7:

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate calls) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	/*
	 * Make some space for MAXSYSARGS (currently 8) 32-bit args
	 * placed into 64-bit (long) arg slots, plus one 64-bit
	 * (long) arg count, maintaining 16 byte alignment.
	 */
	subq	$SYS_DROP, %rsp
	movb	$LWP_SYS, LWP_STATE(%r14)
	movq	%r15, %rdi
	movq	%rsp, %rsi
	call	syscall_entry

	/*
	 * Fetch the arguments copied onto the kernel stack and put
	 * them in the right registers to invoke a C-style syscall handler.
	 * %rax contains the handler address.
	 */
	movq	%rax, %rbx
	movl	0(%rsp), %edi
	movl	8(%rsp), %esi
	movl	0x10(%rsp), %edx
	movl	0x18(%rsp), %ecx
	movl	0x20(%rsp), %r8d
	movl	0x28(%rsp), %r9d

	call	*SY_CALLC(%rbx)

	movq	%rbp, %rsp	/* pop the args */

	/*
	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
	 * On the 32-bit kernel, they always return that value in %eax:%edx
	 * as required by the 32-bit ABI.
	 *
	 * Simulate the same behaviour by unconditionally splitting the
	 * return value in the same way.
	 */
	movq	%rax, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%eax, %r12d	/* lower 32-bits into %eax */

	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	cli
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_full_syscall_postsys32
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	/*
	 * To get back to userland, load up the 32-bit registers and
	 * sysexit back where we came from.
	 */

	/*
	 * Interrupts will be turned on by the 'sti' executed just before
	 * sysexit.  The following ensures that restoring the user's rflags
	 * doesn't enable interrupts too soon.
	 */
	andq	$_BITNOT(PS_IE), REGOFF_RFL(%rsp)

	/*
	 * (There's no point in loading up %edx because the sysexit
	 * mechanism smashes it.)
	 */
	movl	%r12d, %eax
	movl	REGOFF_RBX(%rsp), %ebx
	movl	REGOFF_RBP(%rsp), %ebp
	movl	REGOFF_RSI(%rsp), %esi
	movl	REGOFF_RDI(%rsp), %edi

	movl	REGOFF_RIP(%rsp), %edx	/* sysexit: %edx -> %eip */
	pushq	REGOFF_RFL(%rsp)
	popfq
	movl	REGOFF_RSP(%rsp), %ecx	/* sysexit: %ecx -> %esp */
	swapgs
	sti
	sysexit
	SET_SIZE(sys_sysenter)
	SET_SIZE(_sys_sysenter_post_swapgs)
	SET_SIZE(brand_sys_sysenter)

#endif	/* __lint */

#if defined(__lint)
/*
 * System call via an int80.  This entry point is only used by the Linux
 * application environment.  Unlike the other entry points, there is no
 * default action to take if no callback is registered for this process.
 */
void
sys_int80()
{}

#else	/* __lint */

	ENTRY_NP(brand_sys_int80)
	swapgs
	BRAND_CALLBACK(BRAND_CB_INT80)
	swapgs

	ENTRY_NP(sys_int80)
	/*
	 * We hit an int80, but this process isn't of a brand with an int80
	 * handler.  Bad process!  Make it look as if the INT failed.
	 * Modify %eip to point before the INT, push the expected error
	 * code and fake a GP fault.
	 *
	 */
	swapgs
	subq	$2, (%rsp)	/* int insn 2-bytes */
	pushq	$_CONST(_MUL(T_INT80, GATE_DESC_SIZE) + 2)
	jmp	gptrap			/ GP fault
	SET_SIZE(sys_int80)
	SET_SIZE(brand_sys_int80)
#endif	/* __lint */


/*
 * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by
 * the generic i386 libc to do system calls.  We do a small amount of setup
 * before jumping into the existing sys_syscall32 path.
 */
#if defined(__lint)

/*ARGSUSED*/
void
sys_syscall_int()
{}

#else	/* __lint */

	ENTRY_NP(brand_sys_syscall_int)
	SWAPGS
	BRAND_CALLBACK(BRAND_CB_INT91)
	swapgs

	ALTENTRY(sys_syscall_int)
	swapgs
	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp
	movl	%eax, %eax
	/*
	 * Set t_post_sys on this thread to force ourselves out via the slow
	 * path.  It might be possible at some later date to optimize this out
	 * and use a faster return mechanism.
	 */
	movb	$1, T_POST_SYS(%r15)
	CLEAN_CS
	jmp	_syscall32_save
	SET_SIZE(sys_syscall_int)
	SET_SIZE(brand_sys_syscall_int)

#endif	/* __lint */

/*
 * Legacy 32-bit applications and old libc implementations do lcalls;
 * we should never get here because the LDT entry containing the syscall
 * segment descriptor has the "segment present" bit cleared, which means
 * we end up processing those system calls in trap() via a not-present trap.
 *
 * We do it this way because a call gate unhelpfully does -nothing- to the
 * interrupt flag bit, so an interrupt can run us just after the lcall
 * completes, but just before the swapgs takes effect.  Thus the INTR_PUSH and
 * INTR_POP paths would have to be slightly more complex to dance around
 * this problem, and end up depending explicitly on the first
 * instruction of this handler being either swapgs or cli.
 */

#if defined(__lint)

/*ARGSUSED*/
void
sys_lcall32()
{}

#else	/* __lint */

	ENTRY_NP(sys_lcall32)
	SWAPGS
	pushq	$0
	pushq	%rbp
	movq	%rsp, %rbp
	leaq	__lcall_panic_str(%rip), %rdi
	xorl	%eax, %eax
	call	panic
	SET_SIZE(sys_lcall32)

__lcall_panic_str:
	.string	"sys_lcall32: shouldn't be here!"

/*
 * Declare a uintptr_t which covers the entire pc range of syscall
 * handlers for the stack walkers that need this.
 */
	.align	CPTRSIZE
	.globl	_allsyscalls_size
	.type	_allsyscalls_size, @object
_allsyscalls_size:
	.NWORD	. - _allsyscalls
	SET_SIZE(_allsyscalls_size)

#endif	/* __lint */

/*
 * These are the thread context handlers for lwps using sysenter/sysexit.
 */

#if defined(__lint)

/*ARGSUSED*/
void
sep_save(void *ksp)
{}

/*ARGSUSED*/
void
sep_restore(void *ksp)
{}

#else	/* __lint */

	/*
	 * setting this value to zero as we switch away causes the
	 * stack-pointer-on-sysenter to be NULL, ensuring that we
	 * don't silently corrupt another (preempted) thread stack
	 * when running an lwp that (somehow) didn't get sep_restore'd
	 */
	ENTRY_NP(sep_save)
	xorl	%edx, %edx
	xorl	%eax, %eax
	movl	$MSR_INTC_SEP_ESP, %ecx
	wrmsr
	ret
	SET_SIZE(sep_save)

	/*
	 * Update the kernel stack pointer as we resume onto this cpu.
	 */
	ENTRY_NP(sep_restore)
	movq	%rdi, %rdx
	shrq	$32, %rdx
	movl	%edi, %eax
	movl	$MSR_INTC_SEP_ESP, %ecx
	wrmsr
	ret
	SET_SIZE(sep_restore)

#endif	/* __lint */