/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/asm_linkage.h>
#include <sys/asm_misc.h>
#include <sys/regset.h>
#include <sys/psw.h>
#include <sys/machbrand.h>

#if defined(__lint)

#include <sys/types.h>
#include <sys/thread.h>
#include <sys/systm.h>

#else	/* __lint */

#include <sys/segments.h>
#include <sys/pcb.h>
#include <sys/trap.h>
#include <sys/ftrace.h>
#include <sys/traptrace.h>
#include <sys/clock.h>
#include <sys/model.h>
#include <sys/panic.h>
#include "assym.h"

#endif	/* __lint */

/*
 * We implement five flavours of system call entry points
 *
 * -	syscall/sysretq		(amd64 generic)
 * -	syscall/sysretl		(i386 plus SYSC bit)
 * -	sysenter/sysexit	(i386 plus SEP bit)
 * -	int/iret		(i386 generic)
 * -	lcall/iret		(i386 generic)
 *
 * The current libc included in Solaris uses int/iret as the base unoptimized
 * kernel entry method. Older libc implementations and legacy binaries may use
 * the lcall call gate, so it must continue to be supported.
 *
 * System calls that use an lcall call gate are processed in trap() via a
 * segment-not-present trap, i.e. lcalls are extremely slow(!).
 *
 * The basic pattern used in the 32-bit SYSC handler at this point in time is
 * to have the bare minimum of assembler, and get to the C handlers as
 * quickly as possible.
 *
 * The 64-bit handler is much closer to the sparcv9 handler; that's
 * because of passing arguments in registers.  The 32-bit world still
 * passes arguments on the stack -- that makes that handler substantially
 * more complex.
 *
 * The two handlers share a few code fragments which are broken
 * out into preprocessor macros below.
 *
 * XX64	come back and speed all this up later.  The 32-bit stuff looks
 * especially easy to speed up the argument copying part ..
 *
 *
 * Notes about segment register usage (c.f. the 32-bit kernel)
 *
 * In the 32-bit kernel, segment registers are dutifully saved and
 * restored on all mode transitions because the kernel uses them directly.
 * When the processor is running in 64-bit mode, segment registers are
 * largely ignored.
 *
 * %cs and %ss
 *	controlled by the hardware mechanisms that make mode transitions
 *
 * The remaining segment registers have to either be pointing at a valid
 * descriptor i.e. with the 'present' bit set, or they can be NULL descriptors
 *
 * %ds and %es
 *	always ignored
 *
 * %fs and %gs
 *	fsbase and gsbase are used to control the place they really point at.
 *	The kernel only depends on %gs, and controls its own gsbase via swapgs
 *
 * Note that loading segment registers is still costly because the GDT
 * lookup still happens (this is because the hardware can't know that we're
 * not setting up these segment registers for a 32-bit program).  Thus we
 * avoid doing this in the syscall path, and defer them to lwp context switch
 * handlers, so the register values remain virtualized to the lwp.
 */

/*
 * ORL_SYSCALLTRACE(r32) ORs the global syscalltrace word into r32 when
 * SYSCALLTRACE is defined, and expands to nothing otherwise.
 */
#if defined(SYSCALLTRACE)
#define	ORL_SYSCALLTRACE(r32)		\
	orl	syscalltrace(%rip), r32
#else
#define	ORL_SYSCALLTRACE(r32)
#endif

/*
 * In the 32-bit kernel, we do absolutely nothing before getting into the
 * brand callback checks.  In 64-bit land, we do swapgs and then come here.
 * We assume that the %rsp- and %r15-stashing fields in the CPU structure
 * are still unused.
 *
 * When the callback is invoked, we will be on the user's %gs and
 * the stack will look like this:
 *
 * stack:  --------------------------------------
 *         | callback pointer                   |
 *    |    | user stack pointer                 |
 *    |    | lwp brand data                     |
 *    |    | proc brand data                    |
 *    v    | userland return address            |
 *         | callback wrapper return addr       |
 *         --------------------------------------
 *
 * If no callback is registered for callback_id (the machops slot is zero),
 * the macro falls through to local label 1, which restores %r15 and %rsp
 * from the per-CPU scratch fields.
 */
#define	BRAND_CALLBACK(callback_id)					    \
	movq	%rsp, %gs:CPU_RTMP_RSP	/* save the stack pointer	*/ ;\
	movq	%r15, %gs:CPU_RTMP_R15	/* save %r15			*/ ;\
	movq	%gs:CPU_THREAD, %r15	/* load the thread pointer	*/ ;\
	movq	T_STACK(%r15), %rsp	/* switch to the kernel stack	*/ ;\
	subq	$16, %rsp		/* save space for two pointers	*/ ;\
	pushq	%r14			/* save %r14			*/ ;\
	movq	%gs:CPU_RTMP_RSP, %r14					   ;\
	movq	%r14, 8(%rsp)		/* stash the user stack pointer	*/ ;\
	popq	%r14			/* restore %r14			*/ ;\
	movq	T_LWP(%r15), %r15	/* load the lwp pointer		*/ ;\
	pushq	LWP_BRAND(%r15)		/* push the lwp's brand data	*/ ;\
	movq	LWP_PROCP(%r15), %r15	/* load the proc pointer	*/ ;\
	pushq	P_BRAND_DATA(%r15)	/* push the proc's brand data	*/ ;\
	movq	P_BRAND(%r15), %r15	/* load the brand pointer	*/ ;\
	movq	B_MACHOPS(%r15), %r15	/* load the machops pointer	*/ ;\
	movq	_CONST(_MUL(callback_id, CPTRSIZE))(%r15), %r15		   ;\
	cmpq	$0, %r15						   ;\
	je	1f							   ;\
	movq	%r15, 24(%rsp)		/* save the callback pointer	*/ ;\
	movq	%gs:CPU_RTMP_RSP, %r15	/* grab the user stack pointer	*/ ;\
	pushq	(%r15)			/* push the return address	*/ ;\
	movq	%gs:CPU_RTMP_R15, %r15	/* restore %r15			*/ ;\
	swapgs								   ;\
	call	*32(%rsp)		/* call callback		*/ ;\
	swapgs								   ;\
1:	movq	%gs:CPU_RTMP_R15, %r15	/* restore %r15			*/ ;\
	movq	%gs:CPU_RTMP_RSP, %rsp	/* restore the stack pointer	*/

/*
 * Call syscall_mstate(from, to).  This is an ordinary C call, so the
 * caller-saved registers (including %rax) do not survive it.
 */
#define	MSTATE_TRANSITION(from, to)		\
	movl	$from, %edi;			\
	movl	$to, %esi;			\
	call	syscall_mstate

/*
 * Check to see if a simple (direct) return is possible i.e.
 *
 *	if ((t->t_post_sys_ast | syscalltrace |
 *	    (lwp->lwp_pcb.pcb_flags & RUPDATE_PENDING)) != 0)
 *		do full version	;
 *
 * Preconditions:
 * -	t is curthread
 * Postconditions:
 * -	condition code NE is set if post-sys is too complex
 * -	rtmp is zeroed if it isn't (we rely on this!)
 * -	ltmp is smashed
 */
#define	CHECK_POSTSYS_NE(t, ltmp, rtmp)			\
	movq	T_LWP(t), ltmp;				\
	movl	PCB_FLAGS(ltmp), rtmp;			\
	andl	$RUPDATE_PENDING, rtmp;			\
	ORL_SYSCALLTRACE(rtmp);				\
	orl	T_POST_SYS_AST(t), rtmp;		\
	cmpl	$0, rtmp

/*
 * Fix up the lwp, thread, and eflags for a successful return
 *
 * Preconditions:
 * -	zwreg contains zero
 *
 * The andb only touches the low byte of the saved flags image, so the
 * mask is written as a byte-wide constant.  (The mask used to be spelled
 * 0xffff - PS_C, which only worked because the assembler truncated it to
 * a byte; this assumes PS_C lives in the low byte, as that form did.)
 */
#define	SIMPLE_SYSCALL_POSTSYS(t, lwp, zwreg)		\
	movb	$LWP_USER, LWP_STATE(lwp);		\
	movw	zwreg, T_SYSNUM(t);			\
	andb	$_CONST(0xff - PS_C), REGOFF_RFL(%rsp)

/*
 * ASSERT(lwptoregs(lwp) == rp);
 *
 * This may seem obvious, but very odd things happen if this
 * assertion is false
 *
 * Preconditions:
 *	(%rsp is ready for normal call sequence)
 * Postconditions (if assertion is true):
 *	%r11 is smashed
 *
 * ASSERT(rp->r_cs == descnum)
 *
 * The code selector is written into the regs structure when the
 * lwp stack is created.  We use this ASSERT to validate that
 * the regs structure really matches how we came in.
 *
 * Preconditions:
 *	(%rsp is ready for normal call sequence)
 * Postconditions (if assertion is true):
 *	-none-
 *
 * ASSERT((lwp->lwp_pcb.pcb_flags & RUPDATE_PENDING) == 0);
 *
 * If this is false, it meant that we returned to userland without
 * updating the segment registers as we were supposed to.
 *
 * Note that we must ensure no interrupts or other traps intervene
 * between entering privileged mode and performing the assertion,
 * otherwise we may perform a context switch on the thread, which
 * will end up setting the RUPDATE_PENDING bit again.
 */
#if defined(DEBUG)

#if !defined(__lint)

__lwptoregs_msg:
	.string	"syscall_asm_amd64.s:%d lwptoregs(%p) [%p] != rp [%p]"

__codesel_msg:
	.string	"syscall_asm_amd64.s:%d rp->r_cs [%ld] != %ld"

__no_rupdate_msg:
	.string	"syscall_asm_amd64.s:%d lwp %p, pcb_flags & RUPDATE_PENDING != 0"

#endif	/* !__lint */

#define	ASSERT_LWPTOREGS(lwp, rp)			\
	movq	LWP_REGS(lwp), %r11;			\
	cmpq	rp, %r11;				\
	je	7f;					\
	leaq	__lwptoregs_msg(%rip), %rdi;		\
	movl	$__LINE__, %esi;			\
	movq	lwp, %rdx;				\
	movq	%r11, %rcx;				\
	movq	rp, %r8;				\
	xorl	%eax, %eax;				\
	call	panic;					\
7:

#define	ASSERT_NO_RUPDATE_PENDING(lwp)			\
	testl	$RUPDATE_PENDING, PCB_FLAGS(lwp);	\
	je	8f;					\
	movq	lwp, %rdx;				\
	leaq	__no_rupdate_msg(%rip), %rdi;		\
	movl	$__LINE__, %esi;			\
	xorl	%eax, %eax;				\
	call	panic;					\
8:

#else
#define	ASSERT_LWPTOREGS(lwp, rp)
#define	ASSERT_NO_RUPDATE_PENDING(lwp)
#endif

/*
 * Do the traptrace thing and restore any registers we used
 * in situ.  Assumes that %rsp is pointing at the base of
 * the struct regs, obviously ..
 */
#ifdef TRAPTRACE
#define	SYSCALL_TRAPTRACE(ttype)				\
	TRACE_PTR(%rdi, %rbx, %ebx, %rcx, ttype);		\
	TRACE_REGS(%rdi, %rsp, %rbx, %rcx);			\
	TRACE_STAMP(%rdi);	/* rdtsc clobbers %eax, %edx */	\
	movq	REGOFF_RAX(%rsp), %rax;				\
	movq	REGOFF_RBX(%rsp), %rbx;				\
	movq	REGOFF_RCX(%rsp), %rcx;				\
	movq	REGOFF_RDX(%rsp), %rdx;				\
	movl	%eax, TTR_SYSNUM(%rdi);				\
	movq	REGOFF_RDI(%rsp), %rdi

#define	SYSCALL_TRAPTRACE32(ttype)				\
	SYSCALL_TRAPTRACE(ttype);				\
	/* paranoia: clean the top 32-bits of the registers */	\
	orl	%eax, %eax;					\
	orl	%ebx, %ebx;					\
	orl	%ecx, %ecx;					\
	orl	%edx, %edx;					\
	orl	%edi, %edi
#else	/* TRAPTRACE */
#define	SYSCALL_TRAPTRACE(ttype)
#define	SYSCALL_TRAPTRACE32(ttype)
#endif	/* TRAPTRACE */

/*
 * The 64-bit libc syscall wrapper does this:
 *
 *	fn(<args>)
 *	{
 *		movq	%rcx, %r10	-- because syscall smashes %rcx
 *		movl	$CODE, %eax
 *		syscall
 *		<error processing>
 *	}
 *
 * Thus when we come into the kernel:
 *
 *	%rdi, %rsi, %rdx, %r10, %r8, %r9 contain first six args
 *	%rax is the syscall number
 *	%r12-%r15 contain caller state
 *
 * The syscall instruction arranges that:
 *
 *	%rcx contains the return %rip
 *	%r11d contains bottom 32-bits of %rflags
 *	%rflags is masked (as determined by the SFMASK msr)
 *	%cs is set to UCS_SEL (as determined by the STAR msr)
 *	%ss is set to UDS_SEL (as determined by the STAR msr)
 *	%rip is set to sys_syscall (as determined by the LSTAR msr)
 *
 * Or in other words, we have no registers available at all.
 * Only swapgs can save us!
 */

#if defined(__lint)

/*ARGSUSED*/
void
sys_syscall()
{}

void
_allsyscalls()
{}

size_t _allsyscalls_size;

#else	/* __lint */

	ENTRY_NP2(brand_sys_syscall,_allsyscalls)
	swapgs
	BRAND_CALLBACK(BRAND_CB_SYSCALL)
	swapgs

	ALTENTRY(sys_syscall)
	swapgs
	/*
	 * Stash the user %rsp and %r15 in the per-CPU scratch fields so
	 * that %r15 can hold curthread and %rsp the thread's kernel stack.
	 */
	movq	%rsp, %gs:CPU_RTMP_RSP
	movq	%r15, %gs:CPU_RTMP_R15
	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp

	movl	$UCS_SEL, REGOFF_CS(%rsp)
	movq	%rcx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
	movl	$UDS_SEL, REGOFF_SS(%rsp)

	movl	%eax, %eax			/* wrapper: sysc# -> %eax */
	movq	%rdi, REGOFF_RDI(%rsp)
	movq	%rsi, REGOFF_RSI(%rsp)
	movq	%rdx, REGOFF_RDX(%rsp)
	movq	%r10, REGOFF_RCX(%rsp)		/* wrapper: %rcx -> %r10 */
	movq	%r10, %rcx			/* arg[3] for direct calls */

	movq	%r8, REGOFF_R8(%rsp)
	movq	%r9, REGOFF_R9(%rsp)
	movq	%rax, REGOFF_RAX(%rsp)
	movq	%rbx, REGOFF_RBX(%rsp)

	movq	%rbp, REGOFF_RBP(%rsp)
	movq	%r10, REGOFF_R10(%rsp)
	movq	%gs:CPU_RTMP_RSP, %r11
	movq	%r11, REGOFF_RSP(%rsp)
	movq	%r12, REGOFF_R12(%rsp)

	movq	%r13, REGOFF_R13(%rsp)
	movq	%r14, REGOFF_R14(%rsp)
	movq	%gs:CPU_RTMP_R15, %r10
	movq	%r10, REGOFF_R15(%rsp)
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * and capturing them involves two serializing instructions,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * Machine state saved in the regs structure on the stack
	 * First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE($TT_SYSC64)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	movb	$LWP_SYS, LWP_STATE(%r14)
	incq	LWP_RU_SYSC(%r14)
	movb	$NORMALRETURN, LWP_EOSYS(%r14)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	movw	%ax, T_SYSNUM(%r15)
	movzbl	T_PRE_SYS(%r15), %ebx
	ORL_SYSCALLTRACE(%ebx)
	testl	%ebx, %ebx
	jne	_syscall_pre

_syscall_invoke:
	/* reload the argument registers from the saved regs (via %rbp) */
	movq	REGOFF_RDI(%rbp), %rdi
	movq	REGOFF_RSI(%rbp), %rsi
	movq	REGOFF_RDX(%rbp), %rdx
	movq	REGOFF_RCX(%rbp), %rcx
	movq	REGOFF_R8(%rbp), %r8
	movq	REGOFF_R9(%rbp), %r9

	/* unsigned range check of the syscall number, then index sysent[] */
	cmpl	$NSYSCALL, %eax
	jae	_syscall_ill
	shll	$SYSENT_SIZE_SHIFT, %eax
	leaq	sysent(%rax), %rbx

	call	*SY_CALLC(%rbx)

	movq	%rax, %r12
	movq	%rdx, %r13

	/*
	 * If the handler returns two ints, then we need to split the
	 * 64-bit return value into two 32-bit values.
	 */
	testw	$SE_32RVAL2, SY_FLAGS(%rbx)
	je	5f
	movq	%r12, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%r12d, %r12d	/* lower 32-bits into %eax */
5:
	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	cli
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_syscall_post
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	movq	%r12, REGOFF_RAX(%rsp)
	movq	%r13, REGOFF_RDX(%rsp)

	/*
	 * To get back to userland, we need the return %rip in %rcx and
	 * the return %rfl in %r11d.  The sysretq instruction also arranges
	 * to fix up %cs and %ss; everything else is our responsibility.
	 */
	movq	REGOFF_RDI(%rsp), %rdi
	movq	REGOFF_RSI(%rsp), %rsi
	movq	REGOFF_RDX(%rsp), %rdx
	/* %rcx used to restore %rip value */

	movq	REGOFF_R8(%rsp), %r8
	movq	REGOFF_R9(%rsp), %r9
	movq	REGOFF_RAX(%rsp), %rax
	movq	REGOFF_RBX(%rsp), %rbx

	movq	REGOFF_RBP(%rsp), %rbp
	movq	REGOFF_R10(%rsp), %r10
	/* %r11 used to restore %rfl value */
	movq	REGOFF_R12(%rsp), %r12

	movq	REGOFF_R13(%rsp), %r13
	movq	REGOFF_R14(%rsp), %r14
	movq	REGOFF_R15(%rsp), %r15

	movq	REGOFF_RIP(%rsp), %rcx
	movl	REGOFF_RFL(%rsp), %r11d
	movq	REGOFF_RSP(%rsp), %rsp
	swapgs
	sysretq

_syscall_pre:
	call	pre_syscall
	movl	%eax, %r12d
	testl	%eax, %eax
	jne	_syscall_post_call
	/*
	 * Didn't abort, so reload the syscall args and invoke the handler.
	 */
	movzwl	T_SYSNUM(%r15), %eax
	jmp	_syscall_invoke

_syscall_ill:
	call	nosys
	movq	%rax, %r12
	movq	%rdx, %r13
	jmp	_syscall_post_call

_syscall_post:
	sti
	/*
	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
	 * so that we can account for the extra work it takes us to finish.
	 */
	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
_syscall_post_call:
	movq	%r12, %rdi
	movq	%r13, %rsi
	call	post_syscall
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
	jmp	sys_rtt_syscall
	SET_SIZE(sys_syscall)
	SET_SIZE(brand_sys_syscall)

#endif	/* __lint */

#if defined(__lint)

/*ARGSUSED*/
void
sys_syscall32()
{}

#else	/* __lint */

	ENTRY_NP(brand_sys_syscall32)
	swapgs
	BRAND_CALLBACK(BRAND_CB_SYSCALL32)
	swapgs

	ALTENTRY(sys_syscall32)
	swapgs
	movl	%esp, %r10d
	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp
	movl	%eax, %eax

	movl	$U32CS_SEL, REGOFF_CS(%rsp)
	movl	%ecx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
	movq	%r10, REGOFF_RSP(%rsp)
	movl	$UDS_SEL, REGOFF_SS(%rsp)

	/* sys_syscall_int jumps here after its own entry setup */
_syscall32_save:

	movl	%edi, REGOFF_RDI(%rsp)
	movl	%esi, REGOFF_RSI(%rsp)
	movl	%ebp, REGOFF_RBP(%rsp)
	movl	%ebx, REGOFF_RBX(%rsp)
	movl	%edx, REGOFF_RDX(%rsp)
	movl	%ecx, REGOFF_RCX(%rsp)
	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * Application state saved in the regs structure on the stack
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE32($TT_SYSC)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	incq	 %gs:CPU_STATS_SYS_SYSCALL

	/*
	 * Make some space for MAXSYSARGS (currently 8) 32-bit args placed
	 * into 64-bit (long) arg slots, maintaining 16 byte alignment.  Or
	 * more succinctly:
	 *
	 *	SA(MAXSYSARGS * sizeof (long)) == 64
	 */
#define	SYS_DROP	64			/* drop for args */
	subq	$SYS_DROP, %rsp
	movb	$LWP_SYS, LWP_STATE(%r14)
	movq	%r15, %rdi
	movq	%rsp, %rsi
	call	syscall_entry

	/*
	 * Fetch the arguments copied onto the kernel stack and put
	 * them in the right registers to invoke a C-style syscall handler.
	 * %rax contains the handler address.
	 *
	 * Ideas for making all this go faster of course include simply
	 * forcibly fetching 6 arguments from the user stack under lofault
	 * protection, reverting to copyin_args only when watchpoints
	 * are in effect.
	 *
	 * (If we do this, make sure that exec and libthread leave
	 * enough space at the top of the stack to ensure that we'll
	 * never do a fetch from an invalid page.)
	 *
	 * Lots of ideas here, but they won't really help with bringup B-)
	 * Correctness can't wait, performance can wait a little longer ..
	 */

	movq	%rax, %rbx
	movl	0(%rsp), %edi
	movl	8(%rsp), %esi
	movl	0x10(%rsp), %edx
	movl	0x18(%rsp), %ecx
	movl	0x20(%rsp), %r8d
	movl	0x28(%rsp), %r9d

	call	*SY_CALLC(%rbx)

	movq	%rbp, %rsp	/* pop the args */

	/*
	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
	 * On the 32-bit kernel, they always return that value in %eax:%edx
	 * as required by the 32-bit ABI.
	 *
	 * Simulate the same behaviour by unconditionally splitting the
	 * return value in the same way.
	 */
	movq	%rax, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%eax, %r12d	/* lower 32-bits into %eax */

	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	cli
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_full_syscall_postsys32
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	/*
	 * To get back to userland, we need to put the return %rip in %rcx and
	 * the return %rfl in %r11d.  The sysret instruction also arranges
	 * to fix up %cs and %ss; everything else is our responsibility.
	 */

	movl	%r12d, %eax			/* %eax: rval1 */
	movl	REGOFF_RBX(%rsp), %ebx
	/* %ecx used for return pointer */
	movl	%r13d, %edx			/* %edx: rval2 */
	movl	REGOFF_RBP(%rsp), %ebp
	movl	REGOFF_RSI(%rsp), %esi
	movl	REGOFF_RDI(%rsp), %edi

	movl	REGOFF_RFL(%rsp), %r11d		/* %r11 -> eflags */
	movl	REGOFF_RIP(%rsp), %ecx		/* %ecx -> %eip */
	movl	REGOFF_RSP(%rsp), %esp

	swapgs
	sysretl

	/* also the slow-path exit for sys_sysenter */
_full_syscall_postsys32:
	sti
	/*
	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
	 * so that we can account for the extra work it takes us to finish.
	 */
	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movq	%r15, %rdi
	movq	%r12, %rsi			/* rval1 - %eax */
	movq	%r13, %rdx			/* rval2 - %edx */
	call	syscall_exit
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
	jmp	sys_rtt_syscall32
	SET_SIZE(sys_syscall32)
	SET_SIZE(brand_sys_syscall32)

#endif	/* __lint */

/*
 * System call handler via the sysenter instruction
 * Used only for 32-bit system calls on the 64-bit kernel.
 *
 * The caller in userland has arranged that:
 *
 * -	%eax contains the syscall number
 * -	%ecx contains the user %esp
 * -	%edx contains the return %eip
 * -	the user stack contains the args to the syscall
 *
 * Hardware and (privileged) initialization code have arranged that by
 * the time the sysenter instruction completes:
 *
 * - %rip is pointing to sys_sysenter (below).
 * - %cs and %ss are set to kernel text and stack (data) selectors.
 * - %rsp is pointing at the lwp's stack
 * - interrupts have been disabled.
 *
 * Note that we are unable to return both "rvals" to userland with
 * this call, as %edx is used by the sysexit instruction.
 *
 * One final complication in this routine is its interaction with
 * single-stepping in a debugger.  For most of the system call mechanisms,
 * the CPU automatically clears the single-step flag before we enter the
 * kernel.  The sysenter mechanism does not clear the flag, so a user
 * single-stepping through a libc routine may suddenly find him/herself
 * single-stepping through the kernel.  To detect this, kmdb compares the
 * trap %pc to the [brand_]sys_sysenter addresses on each single-step trap.
 * If it finds that we have single-stepped to a sysenter entry point, it
 * explicitly clears the flag and executes the sys_sysenter routine.
 *
 * One final complication in this final complication is the fact that we
 * have two different entry points for sysenter: brand_sys_sysenter and
 * sys_sysenter.  If we enter at brand_sys_sysenter and start single-stepping
 * through the kernel with kmdb, we will eventually hit the instruction at
 * sys_sysenter.  kmdb cannot distinguish between that valid single-step
 * and the undesirable one mentioned above.  To avoid this situation, we
 * simply add a jump over the instruction at sys_sysenter to make it
 * impossible to single-step to it.
 */
#if defined(__lint)

void
sys_sysenter()
{}

#else	/* __lint */

	ENTRY_NP(brand_sys_sysenter)
	swapgs

	ALTENTRY(_brand_sys_sysenter_post_swapgs)
	BRAND_CALLBACK(BRAND_CB_SYSENTER)
	/*
	 * Jump over sys_sysenter to allow single-stepping as described
	 * above.
	 */
	jmp	_sys_sysenter_post_swapgs

	ALTENTRY(sys_sysenter)
	swapgs

	ALTENTRY(_sys_sysenter_post_swapgs)
	movq	%gs:CPU_THREAD, %r15

	movl	$U32CS_SEL, REGOFF_CS(%rsp)
	movl	%ecx, REGOFF_RSP(%rsp)		/* wrapper: %esp -> %ecx */
	movl	%edx, REGOFF_RIP(%rsp)		/* wrapper: %eip -> %edx */
	pushfq
	popq	%r10
	movl	$UDS_SEL, REGOFF_SS(%rsp)

	/*
	 * Set the interrupt flag before storing the flags to the
	 * flags image on the stack so we can return to user with
	 * interrupts enabled if we return via sys_rtt_syscall32
	 */
	orq	$PS_IE, %r10
	movq	%r10, REGOFF_RFL(%rsp)

	movl	%edi, REGOFF_RDI(%rsp)
	movl	%esi, REGOFF_RSI(%rsp)
	movl	%ebp, REGOFF_RBP(%rsp)
	movl	%ebx, REGOFF_RBX(%rsp)
	movl	%edx, REGOFF_RDX(%rsp)
	movl	%ecx, REGOFF_RCX(%rsp)
	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * Application state saved in the regs structure on the stack
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE($TT_SYSENTER)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	/*
	 * Catch 64-bit process trying to issue sysenter instruction
	 * on Nocona based systems.
	 */
	movq	LWP_PROCP(%r14), %rax
	cmpq	$DATAMODEL_ILP32, P_MODEL(%rax)
	je	7f

	/*
	 * For a non-32-bit process, simulate a #ud, since that's what
	 * native hardware does.  The traptrace entry (above) will
	 * let you know what really happened.
	 */
	movq	$T_ILLINST, REGOFF_TRAPNO(%rsp)
	movq	REGOFF_CS(%rsp), %rdi
	movq	%rdi, REGOFF_ERR(%rsp)
	movq	%rsp, %rdi
	movq	REGOFF_RIP(%rsp), %rsi
	movl	%gs:CPU_ID, %edx
	call	trap
	jmp	_sys_rtt
7:

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate calls) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	/*
	 * Make some space for MAXSYSARGS (currently 8) 32-bit args
	 * placed into 64-bit (long) arg slots, plus one 64-bit
	 * (long) arg count, maintaining 16 byte alignment.
	 */
	subq	$SYS_DROP, %rsp
	movb	$LWP_SYS, LWP_STATE(%r14)
	movq	%r15, %rdi
	movq	%rsp, %rsi
	call	syscall_entry

	/*
	 * Fetch the arguments copied onto the kernel stack and put
	 * them in the right registers to invoke a C-style syscall handler.
	 * %rax contains the handler address.
	 */
	movq	%rax, %rbx
	movl	0(%rsp), %edi
	movl	8(%rsp), %esi
	movl	0x10(%rsp), %edx
	movl	0x18(%rsp), %ecx
	movl	0x20(%rsp), %r8d
	movl	0x28(%rsp), %r9d

	call	*SY_CALLC(%rbx)

	movq	%rbp, %rsp	/* pop the args */

	/*
	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
	 * On the 32-bit kernel, they always return that value in %eax:%edx
	 * as required by the 32-bit ABI.
	 *
	 * Simulate the same behaviour by unconditionally splitting the
	 * return value in the same way.
	 */
	movq	%rax, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%eax, %r12d	/* lower 32-bits into %eax */

	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	cli
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_full_syscall_postsys32
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	/*
	 * To get back to userland, load up the 32-bit registers and
	 * sysexit back where we came from.
	 */

	/*
	 * Interrupts will be turned on by the 'sti' executed just before
	 * sysexit.  The following ensures that restoring the user's rflags
	 * doesn't enable interrupts too soon.
	 */
	andq	$_BITNOT(PS_IE), REGOFF_RFL(%rsp)

	/*
	 * (There's no point in loading up %edx because the sysexit
	 * mechanism smashes it.)
	 */
	movl	%r12d, %eax
	movl	REGOFF_RBX(%rsp), %ebx
	movl	REGOFF_RBP(%rsp), %ebp
	movl	REGOFF_RSI(%rsp), %esi
	movl	REGOFF_RDI(%rsp), %edi

	movl	REGOFF_RIP(%rsp), %edx	/* sysexit: %edx -> %eip */
	pushq	REGOFF_RFL(%rsp)
	popfq
	movl	REGOFF_RSP(%rsp), %ecx	/* sysexit: %ecx -> %esp */
	swapgs
	sti
	sysexit
	SET_SIZE(sys_sysenter)
	SET_SIZE(_sys_sysenter_post_swapgs)
	SET_SIZE(brand_sys_sysenter)

#endif	/* __lint */

#if defined(__lint)
/*
 * System call via an int80.  This entry point is only used by the Linux
 * application environment.  Unlike the other entry points, there is no
 * default action to take if no callback is registered for this process.
 */
void
sys_int80()
{}

#else	/* __lint */

	ENTRY_NP(brand_sys_int80)
	swapgs
	BRAND_CALLBACK(BRAND_CB_INT80)
	swapgs

	ENTRY_NP(sys_int80)
	/*
	 * We hit an int80, but this process isn't of a brand with an int80
	 * handler.  Bad process!  Make it look as if the INT failed.
	 * Modify %eip to point before the INT, push the expected error
	 * code and fake a GP fault.
	 *
	 */
	swapgs
	subq	$2, (%rsp)	/* int insn 2-bytes */
	pushq	$_CONST(_MUL(T_INT80, GATE_DESC_SIZE) + 2)
	jmp	gptrap			/ GP fault
	SET_SIZE(sys_int80)
	SET_SIZE(brand_sys_int80)
#endif	/* __lint */


/*
 * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by
 * the generic i386 libc to do system calls. We do a small amount of setup
 * before jumping into the existing sys_syscall32 path.
 */
#if defined(__lint)

/*ARGSUSED*/
void
sys_syscall_int()
{}

#else	/* __lint */

	ENTRY_NP(brand_sys_syscall_int)
	swapgs
	BRAND_CALLBACK(BRAND_CB_INT91)
	swapgs

	ALTENTRY(sys_syscall_int)
	swapgs
	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp
	movl	%eax, %eax
	/*
	 * Set t_post_sys on this thread to force ourselves out via the slow
	 * path. It might be possible at some later date to optimize this out
	 * and use a faster return mechanism.
	 */
	movb	$1, T_POST_SYS(%r15)
	jmp	_syscall32_save
	SET_SIZE(sys_syscall_int)
	SET_SIZE(brand_sys_syscall_int)

#endif	/* __lint */

/*
 * Legacy 32-bit applications and old libc implementations do lcalls;
 * we should never get here because the LDT entry containing the syscall
 * segment descriptor has the "segment present" bit cleared, which means
 * we end up processing those system calls in trap() via a not-present trap.
 *
 * We do it this way because a call gate unhelpfully does -nothing- to the
 * interrupt flag bit, so an interrupt can run us just after the lcall
 * completes, but just before the swapgs takes effect.  Thus the INTR_PUSH and
 * INTR_POP paths would have to be slightly more complex to dance around
 * this problem, and end up depending explicitly on the first
 * instruction of this handler being either swapgs or cli.
 */

#if defined(__lint)

/*ARGSUSED*/
void
sys_lcall32()
{}

#else	/* __lint */

	ENTRY_NP(sys_lcall32)
	swapgs
	pushq	$0
	pushq	%rbp
	movq	%rsp, %rbp
	leaq	__lcall_panic_str(%rip), %rdi
	xorl	%eax, %eax
	call	panic
	SET_SIZE(sys_lcall32)

__lcall_panic_str:
	.string	"sys_lcall32: shouldn't be here!"

/*
 * Declare a uintptr_t which covers the entire pc range of syscall
 * handlers for the stack walkers that need this.
 */
	.align	CPTRSIZE
	.globl	_allsyscalls_size
	.type	_allsyscalls_size, @object
_allsyscalls_size:
	.NWORD	. - _allsyscalls
	SET_SIZE(_allsyscalls_size)

#endif	/* __lint */

/*
 * These are the thread context handlers for lwps using sysenter/sysexit.
 */

#if defined(__lint)

/*ARGSUSED*/
void
sep_save(void *ksp)
{}

/*ARGSUSED*/
void
sep_restore(void *ksp)
{}

#else	/* __lint */

	/*
	 * setting this value to zero as we switch away causes the
	 * stack-pointer-on-sysenter to be NULL, ensuring that we
	 * don't silently corrupt another (preempted) thread stack
	 * when running an lwp that (somehow) didn't get sep_restore'd
	 */
	ENTRY_NP(sep_save)
	xorl	%edx, %edx		/* wrmsr value is %edx:%eax == 0 */
	xorl	%eax, %eax
	movl	$MSR_INTC_SEP_ESP, %ecx
	wrmsr
	ret
	SET_SIZE(sep_save)

	/*
	 * Update the kernel stack pointer as we resume onto this cpu.
	 */
	ENTRY_NP(sep_restore)
	movq	%rdi, %rdx
	shrq	$32, %rdx		/* %edx = upper 32 bits of ksp */
	movl	%edi, %eax		/* %eax = lower 32 bits of ksp */
	movl	$MSR_INTC_SEP_ESP, %ecx
	wrmsr
	ret
	SET_SIZE(sep_restore)

#endif	/* __lint */