1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26#pragma ident "%Z%%M% %I% %E% SMI" 27 28#include <sys/asm_linkage.h> 29#include <sys/asm_misc.h> 30#include <sys/regset.h> 31#include <sys/privregs.h> 32#include <sys/psw.h> 33#include <sys/machbrand.h> 34 35#if defined(__lint) 36 37#include <sys/types.h> 38#include <sys/thread.h> 39#include <sys/systm.h> 40 41#else /* __lint */ 42 43#include <sys/segments.h> 44#include <sys/pcb.h> 45#include <sys/trap.h> 46#include <sys/ftrace.h> 47#include <sys/traptrace.h> 48#include <sys/clock.h> 49#include <sys/model.h> 50#include <sys/panic.h> 51#include "assym.h" 52 53#endif /* __lint */ 54 55/* 56 * We implement five flavours of system call entry points 57 * 58 * - syscall/sysretq (amd64 generic) 59 * - syscall/sysretl (i386 plus SYSC bit) 60 * - sysenter/sysexit (i386 plus SEP bit) 61 * - int/iret (i386 generic) 62 * - lcall/iret (i386 generic) 63 * 64 * The current libc included in Solaris uses int/iret as the base unoptimized 65 * kernel entry method. 
Older libc implementations and legacy binaries may use
 * the lcall call gate, so it must continue to be supported.
 *
 * System calls that use an lcall call gate are processed in trap() via a
 * segment-not-present trap, i.e. lcalls are extremely slow(!).
 *
 * The basic pattern used in the 32-bit SYSC handler at this point in time is
 * to have the bare minimum of assembler, and get to the C handlers as
 * quickly as possible.
 *
 * The 64-bit handler is much closer to the sparcv9 handler; that's
 * because of passing arguments in registers.  The 32-bit world still
 * passes arguments on the stack -- that makes that handler substantially
 * more complex.
 *
 * The two handlers share a few code fragments which are broken
 * out into preprocessor macros below.
 *
 * XX64	come back and speed all this up later.  The 32-bit stuff looks
 * especially easy to speed up the argument copying part ..
 *
 *
 * Notes about segment register usage (c.f. the 32-bit kernel)
 *
 * In the 32-bit kernel, segment registers are dutifully saved and
 * restored on all mode transitions because the kernel uses them directly.
 * When the processor is running in 64-bit mode, segment registers are
 * largely ignored.
 *
 * %cs and %ss
 *	controlled by the hardware mechanisms that make mode transitions
 *
 * The remaining segment registers have to either be pointing at a valid
 * descriptor i.e. with the 'present' bit set, or they can be NULL descriptors
 *
 * %ds and %es
 *	always ignored
 *
 * %fs and %gs
 *	fsbase and gsbase are used to control the place they really point at.
 *	The kernel only depends on %gs, and controls its own gsbase via swapgs
 *
 * Note that loading segment registers is still costly because the GDT
 * lookup still happens (this is because the hardware can't know that we're
 * not setting up these segment registers for a 32-bit program).
Thus we
 * avoid doing this in the syscall path, and defer them to lwp context switch
 * handlers, so the register values remain virtualized to the lwp.
 */

/*
 * ORL_SYSCALLTRACE(r32)
 *
 * When built with SYSCALLTRACE, OR the global 'syscalltrace' word into
 * the 32-bit register r32; otherwise expand to nothing.  Users of this
 * macro test the combined value for zero afterwards.
 */
#if defined(SYSCALLTRACE)
#define	ORL_SYSCALLTRACE(r32)		\
	orl	syscalltrace(%rip), r32
#else
#define	ORL_SYSCALLTRACE(r32)
#endif

/*
 * In the 32-bit kernel, we do absolutely nothing before getting into the
 * brand callback checks.  In 64-bit land, we do swapgs and then come here.
 * We assume that the %rsp- and %r15-stashing fields in the CPU structure
 * are still unused.
 *
 * When the callback is invoked, we will be on the user's %gs and
 * the stack will look like this:
 *
 * stack:  --------------------------------------
 *         | callback pointer                   |
 *    |    | user stack pointer                 |
 *    |    | lwp brand data                     |
 *    |    | proc brand data                    |
 *    v    | userland return address            |
 *         | callback wrapper return addr       |
 *         --------------------------------------
 *
 * If the brand has no callback registered for callback_id (the machops
 * slot is zero), the macro simply restores %r15 and %rsp from the CPU
 * scratch fields and falls through to whatever follows it.
 *
 * NOTE(review): the two gs swaps around the callback use the raw
 * 'swapgs' instruction while the entry points in this file use the
 * SWAPGS macro -- confirm that this difference is intentional.
 */
#define	BRAND_CALLBACK(callback_id)					    \
	movq	%rsp, %gs:CPU_RTMP_RSP	/* save the stack pointer	*/ ;\
	movq	%r15, %gs:CPU_RTMP_R15	/* save %r15			*/ ;\
	movq	%gs:CPU_THREAD, %r15	/* load the thread pointer	*/ ;\
	movq	T_STACK(%r15), %rsp	/* switch to the kernel stack	*/ ;\
	subq	$16, %rsp		/* save space for two pointers	*/ ;\
	pushq	%r14			/* save %r14			*/ ;\
	movq	%gs:CPU_RTMP_RSP, %r14					   ;\
	movq	%r14, 8(%rsp)		/* stash the user stack pointer	*/ ;\
	popq	%r14			/* restore %r14			*/ ;\
	movq	T_LWP(%r15), %r15	/* load the lwp pointer		*/ ;\
	pushq	LWP_BRAND(%r15)		/* push the lwp's brand data	*/ ;\
	movq	LWP_PROCP(%r15), %r15	/* load the proc pointer	*/ ;\
	pushq	P_BRAND_DATA(%r15)	/* push the proc's brand data	*/ ;\
	movq	P_BRAND(%r15), %r15	/* load the brand pointer	*/ ;\
	movq	B_MACHOPS(%r15), %r15	/* load the machops pointer	*/ ;\
	movq	_CONST(_MUL(callback_id, CPTRSIZE))(%r15), %r15		   ;\
	cmpq	$0, %r15						   ;\
	je	1f							   ;\
	movq	%r15, 24(%rsp)		/* save the callback pointer	*/ ;\
	movq	%gs:CPU_RTMP_RSP, %r15	/* grab the user stack pointer	*/ ;\
	pushq	(%r15)			/* push the return address	*/ ;\
	movq	%gs:CPU_RTMP_R15, %r15	/* restore %r15			*/ ;\
	swapgs								   ;\
	call	*32(%rsp)		/* call callback		*/ ;\
	swapgs								   ;\
1:	movq	%gs:CPU_RTMP_R15, %r15	/* restore %r15			*/ ;\
	movq	%gs:CPU_RTMP_RSP, %rsp	/* restore the stack pointer	*/

/*
 * MSTATE_TRANSITION(from, to)
 *
 * Call syscall_mstate(from, to).  As with any C call, the caller-saved
 * registers (including %rax) cannot be relied on afterwards.
 */
#define	MSTATE_TRANSITION(from, to)		\
	movl	$from, %edi;			\
	movl	$to, %esi;			\
	call	syscall_mstate

/*
 * Check to see if a simple (direct) return is possible i.e.
 *
 *	if (t->t_post_sys_ast | syscalltrace |
 *	    lwp->lwp_pcb.pcb_rupdate == 1)
 *		do full version	;
 *
 * Preconditions:
 * -	t is curthread
 * Postconditions:
 * -	condition code NE is set if post-sys is too complex
 * -	rtmp is zeroed if it isn't (we rely on this!)
 * -	ltmp is smashed
 */
#define	CHECK_POSTSYS_NE(t, ltmp, rtmp)			\
	movq	T_LWP(t), ltmp;				\
	movzbl	PCB_RUPDATE(ltmp), rtmp;		\
	ORL_SYSCALLTRACE(rtmp);				\
	orl	T_POST_SYS_AST(t), rtmp;		\
	cmpl	$0, rtmp

/*
 * Fix up the lwp, thread, and eflags for a successful return
 *
 * Preconditions:
 * -	zwreg contains zero
 *
 * The last instruction clears PS_C in the low byte of the saved flags
 * image.  Because it is a byte-wide 'andb', the mask must itself fit in
 * a byte, so it is written as (0xff - PS_C) instead of relying on the
 * assembler to truncate a wider constant.  (This assumes PS_C lies in
 * the low byte of the flags word, which the byte-wide operation has
 * always depended on.)
 */
#define	SIMPLE_SYSCALL_POSTSYS(t, lwp, zwreg)		\
	movb	$LWP_USER, LWP_STATE(lwp);		\
	movw	zwreg, T_SYSNUM(t);			\
	andb	$_CONST(0xff - PS_C), REGOFF_RFL(%rsp)

/*
 * ASSERT(lwptoregs(lwp) == rp);
 *
 * This may seem obvious, but very odd things happen if this
 * assertion is false
 *
 * Preconditions:
 *	(%rsp is ready for normal call sequence)
 * Postconditions (if assertion is true):
 *	%r11 is smashed
 *
 * ASSERT(rp->r_cs == descnum)
 *
 * The code selector is written into the regs structure when the
 * lwp stack is created.  We use this ASSERT to validate that
 * the regs structure really matches how we came in.
222 * 223 * Preconditions: 224 * (%rsp is ready for normal call sequence) 225 * Postconditions (if assertion is true): 226 * -none- 227 * 228 * ASSERT(lwp->lwp_pcb.pcb_rupdate == 0); 229 * 230 * If this is false, it meant that we returned to userland without 231 * updating the segment registers as we were supposed to. 232 * 233 * Note that we must ensure no interrupts or other traps intervene 234 * between entering privileged mode and performing the assertion, 235 * otherwise we may perform a context switch on the thread, which 236 * will end up setting pcb_rupdate to 1 again. 237 */ 238#if defined(DEBUG) 239 240#if !defined(__lint) 241 242__lwptoregs_msg: 243 .string "syscall_asm_amd64.s:%d lwptoregs(%p) [%p] != rp [%p]" 244 245__codesel_msg: 246 .string "syscall_asm_amd64.s:%d rp->r_cs [%ld] != %ld" 247 248__no_rupdate_msg: 249 .string "syscall_asm_amd64.s:%d lwp %p, pcb_rupdate != 0" 250 251#endif /* !__lint */ 252 253#define ASSERT_LWPTOREGS(lwp, rp) \ 254 movq LWP_REGS(lwp), %r11; \ 255 cmpq rp, %r11; \ 256 je 7f; \ 257 leaq __lwptoregs_msg(%rip), %rdi; \ 258 movl $__LINE__, %esi; \ 259 movq lwp, %rdx; \ 260 movq %r11, %rcx; \ 261 movq rp, %r8; \ 262 xorl %eax, %eax; \ 263 call panic; \ 2647: 265 266#define ASSERT_NO_RUPDATE_PENDING(lwp) \ 267 testb $0x1, PCB_RUPDATE(lwp); \ 268 je 8f; \ 269 movq lwp, %rdx; \ 270 leaq __no_rupdate_msg(%rip), %rdi; \ 271 movl $__LINE__, %esi; \ 272 xorl %eax, %eax; \ 273 call panic; \ 2748: 275 276#else 277#define ASSERT_LWPTOREGS(lwp, rp) 278#define ASSERT_NO_RUPDATE_PENDING(lwp) 279#endif 280 281/* 282 * Do the traptrace thing and restore any registers we used 283 * in situ. Assumes that %rsp is pointing at the base of 284 * the struct regs, obviously .. 
285 */ 286#ifdef TRAPTRACE 287#define SYSCALL_TRAPTRACE(ttype) \ 288 TRACE_PTR(%rdi, %rbx, %ebx, %rcx, ttype); \ 289 TRACE_REGS(%rdi, %rsp, %rbx, %rcx); \ 290 TRACE_STAMP(%rdi); /* rdtsc clobbers %eax, %edx */ \ 291 movq REGOFF_RAX(%rsp), %rax; \ 292 movq REGOFF_RBX(%rsp), %rbx; \ 293 movq REGOFF_RCX(%rsp), %rcx; \ 294 movq REGOFF_RDX(%rsp), %rdx; \ 295 movl %eax, TTR_SYSNUM(%rdi); \ 296 movq REGOFF_RDI(%rsp), %rdi 297 298#define SYSCALL_TRAPTRACE32(ttype) \ 299 SYSCALL_TRAPTRACE(ttype); \ 300 /* paranoia: clean the top 32-bits of the registers */ \ 301 orl %eax, %eax; \ 302 orl %ebx, %ebx; \ 303 orl %ecx, %ecx; \ 304 orl %edx, %edx; \ 305 orl %edi, %edi 306#else /* TRAPTRACE */ 307#define SYSCALL_TRAPTRACE(ttype) 308#define SYSCALL_TRAPTRACE32(ttype) 309#endif /* TRAPTRACE */ 310 311/* 312 * The 64-bit libc syscall wrapper does this: 313 * 314 * fn(<args>) 315 * { 316 * movq %rcx, %r10 -- because syscall smashes %rcx 317 * movl $CODE, %eax 318 * syscall 319 * <error processing> 320 * } 321 * 322 * Thus when we come into the kernel: 323 * 324 * %rdi, %rsi, %rdx, %r10, %r8, %r9 contain first six args 325 * %rax is the syscall number 326 * %r12-%r15 contain caller state 327 * 328 * The syscall instruction arranges that: 329 * 330 * %rcx contains the return %rip 331 * %r11d contains bottom 32-bits of %rflags 332 * %rflags is masked (as determined by the SFMASK msr) 333 * %cs is set to UCS_SEL (as determined by the STAR msr) 334 * %ss is set to UDS_SEL (as determined by the STAR msr) 335 * %rip is set to sys_syscall (as determined by the LSTAR msr) 336 * 337 * Or in other words, we have no registers available at all. 338 * Only swapgs can save us! 
339 */ 340 341#if defined(__lint) 342 343/*ARGSUSED*/ 344void 345sys_syscall() 346{} 347 348void 349_allsyscalls() 350{} 351 352size_t _allsyscalls_size; 353 354#else /* __lint */ 355 356 ENTRY_NP2(brand_sys_syscall,_allsyscalls) 357 SWAPGS 358 BRAND_CALLBACK(BRAND_CB_SYSCALL) 359 SWAPGS 360 361 ALTENTRY(sys_syscall) 362 SWAPGS 363 movq %rsp, %gs:CPU_RTMP_RSP 364 movq %r15, %gs:CPU_RTMP_R15 365 movq %gs:CPU_THREAD, %r15 366 movq T_STACK(%r15), %rsp 367 368 movl $UCS_SEL, REGOFF_CS(%rsp) 369 movq %rcx, REGOFF_RIP(%rsp) /* syscall: %rip -> %rcx */ 370 movq %r11, REGOFF_RFL(%rsp) /* syscall: %rfl -> %r11d */ 371 movl $UDS_SEL, REGOFF_SS(%rsp) 372 373 movl %eax, %eax /* wrapper: sysc# -> %eax */ 374 movq %rdi, REGOFF_RDI(%rsp) 375 movq %rsi, REGOFF_RSI(%rsp) 376 movq %rdx, REGOFF_RDX(%rsp) 377 movq %r10, REGOFF_RCX(%rsp) /* wrapper: %rcx -> %r10 */ 378 movq %r10, %rcx /* arg[3] for direct calls */ 379 380 movq %r8, REGOFF_R8(%rsp) 381 movq %r9, REGOFF_R9(%rsp) 382 movq %rax, REGOFF_RAX(%rsp) 383 movq %rbx, REGOFF_RBX(%rsp) 384 385 movq %rbp, REGOFF_RBP(%rsp) 386 movq %r10, REGOFF_R10(%rsp) 387 movq %gs:CPU_RTMP_RSP, %r11 388 movq %r11, REGOFF_RSP(%rsp) 389 movq %r12, REGOFF_R12(%rsp) 390 391 movq %r13, REGOFF_R13(%rsp) 392 movq %r14, REGOFF_R14(%rsp) 393 movq %gs:CPU_RTMP_R15, %r10 394 movq %r10, REGOFF_R15(%rsp) 395 movq $0, REGOFF_SAVFP(%rsp) 396 movq $0, REGOFF_SAVPC(%rsp) 397 398 /* 399 * Copy these registers here in case we end up stopped with 400 * someone (like, say, /proc) messing with our register state. 401 * We don't -restore- them unless we have to in update_sregs. 402 * 403 * Since userland -can't- change fsbase or gsbase directly, 404 * and capturing them involves two serializing instructions, 405 * we don't bother to capture them here. 
406 */ 407 xorl %ebx, %ebx 408 movw %ds, %bx 409 movq %rbx, REGOFF_DS(%rsp) 410 movw %es, %bx 411 movq %rbx, REGOFF_ES(%rsp) 412 movw %fs, %bx 413 movq %rbx, REGOFF_FS(%rsp) 414 movw %gs, %bx 415 movq %rbx, REGOFF_GS(%rsp) 416 417 /* 418 * Machine state saved in the regs structure on the stack 419 * First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9 420 * %eax is the syscall number 421 * %rsp is the thread's stack, %r15 is curthread 422 * REG_RSP(%rsp) is the user's stack 423 */ 424 425 SYSCALL_TRAPTRACE($TT_SYSC64) 426 427 movq %rsp, %rbp 428 429 movq T_LWP(%r15), %r14 430 ASSERT_NO_RUPDATE_PENDING(%r14) 431 ENABLE_INTR_FLAGS 432 433 MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM) 434 movl REGOFF_RAX(%rsp), %eax /* (%rax damaged by mstate call) */ 435 436 ASSERT_LWPTOREGS(%r14, %rsp) 437 438 movb $LWP_SYS, LWP_STATE(%r14) 439 incq LWP_RU_SYSC(%r14) 440 movb $NORMALRETURN, LWP_EOSYS(%r14) 441 442 incq %gs:CPU_STATS_SYS_SYSCALL 443 444 movw %ax, T_SYSNUM(%r15) 445 movzbl T_PRE_SYS(%r15), %ebx 446 ORL_SYSCALLTRACE(%ebx) 447 testl %ebx, %ebx 448 jne _syscall_pre 449 450_syscall_invoke: 451 movq REGOFF_RDI(%rbp), %rdi 452 movq REGOFF_RSI(%rbp), %rsi 453 movq REGOFF_RDX(%rbp), %rdx 454 movq REGOFF_RCX(%rbp), %rcx 455 movq REGOFF_R8(%rbp), %r8 456 movq REGOFF_R9(%rbp), %r9 457 458 cmpl $NSYSCALL, %eax 459 jae _syscall_ill 460 shll $SYSENT_SIZE_SHIFT, %eax 461 leaq sysent(%rax), %rbx 462 463 call *SY_CALLC(%rbx) 464 465 movq %rax, %r12 466 movq %rdx, %r13 467 468 /* 469 * If the handler returns two ints, then we need to split the 470 * 64-bit return value into two 32-bit values. 471 */ 472 testw $SE_32RVAL2, SY_FLAGS(%rbx) 473 je 5f 474 movq %r12, %r13 475 shrq $32, %r13 /* upper 32-bits into %edx */ 476 movl %r12d, %r12d /* lower 32-bits into %eax */ 4775: 478 /* 479 * Optimistically assume that there's no post-syscall 480 * work to do. 
(This is to avoid having to call syscall_mstate() 481 * with interrupts disabled) 482 */ 483 MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER) 484 485 /* 486 * We must protect ourselves from being descheduled here; 487 * If we were, and we ended up on another cpu, or another 488 * lwp got in ahead of us, it could change the segment 489 * registers without us noticing before we return to userland. 490 */ 491 CLI(%r14) 492 CHECK_POSTSYS_NE(%r15, %r14, %ebx) 493 jne _syscall_post 494 SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx) 495 496 movq %r12, REGOFF_RAX(%rsp) 497 movq %r13, REGOFF_RDX(%rsp) 498 499 /* 500 * To get back to userland, we need the return %rip in %rcx and 501 * the return %rfl in %r11d. The sysretq instruction also arranges 502 * to fix up %cs and %ss; everything else is our responsibility. 503 */ 504 movq REGOFF_RDI(%rsp), %rdi 505 movq REGOFF_RSI(%rsp), %rsi 506 movq REGOFF_RDX(%rsp), %rdx 507 /* %rcx used to restore %rip value */ 508 509 movq REGOFF_R8(%rsp), %r8 510 movq REGOFF_R9(%rsp), %r9 511 movq REGOFF_RAX(%rsp), %rax 512 movq REGOFF_RBX(%rsp), %rbx 513 514 movq REGOFF_RBP(%rsp), %rbp 515 movq REGOFF_R10(%rsp), %r10 516 /* %r11 used to restore %rfl value */ 517 movq REGOFF_R12(%rsp), %r12 518 519 movq REGOFF_R13(%rsp), %r13 520 movq REGOFF_R14(%rsp), %r14 521 movq REGOFF_R15(%rsp), %r15 522 523 movq REGOFF_RIP(%rsp), %rcx 524 movl REGOFF_RFL(%rsp), %r11d 525 movq REGOFF_RSP(%rsp), %rsp 526 SWAPGS 527 sysretq 528 529_syscall_pre: 530 call pre_syscall 531 movl %eax, %r12d 532 testl %eax, %eax 533 jne _syscall_post_call 534 /* 535 * Didn't abort, so reload the syscall args and invoke the handler. 536 */ 537 movzwl T_SYSNUM(%r15), %eax 538 jmp _syscall_invoke 539 540_syscall_ill: 541 call nosys 542 movq %rax, %r12 543 movq %rdx, %r13 544 jmp _syscall_post_call 545 546_syscall_post: 547 STI 548 /* 549 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM 550 * so that we can account for the extra work it takes us to finish. 
551 */ 552 MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM) 553_syscall_post_call: 554 movq %r12, %rdi 555 movq %r13, %rsi 556 call post_syscall 557 MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER) 558 jmp _sys_rtt 559 SET_SIZE(sys_syscall) 560 SET_SIZE(brand_sys_syscall) 561 562#endif /* __lint */ 563 564#if defined(__lint) 565 566/*ARGSUSED*/ 567void 568sys_syscall32() 569{} 570 571#else /* __lint */ 572 573 ENTRY_NP(brand_sys_syscall32) 574 SWAPGS 575 BRAND_CALLBACK(BRAND_CB_SYSCALL32) 576 SWAPGS 577 578 ALTENTRY(sys_syscall32) 579 SWAPGS 580 movl %esp, %r10d 581 movq %gs:CPU_THREAD, %r15 582 movq T_STACK(%r15), %rsp 583 movl %eax, %eax 584 585 movl $U32CS_SEL, REGOFF_CS(%rsp) 586 movl %ecx, REGOFF_RIP(%rsp) /* syscall: %rip -> %rcx */ 587 movq %r11, REGOFF_RFL(%rsp) /* syscall: %rfl -> %r11d */ 588 movq %r10, REGOFF_RSP(%rsp) 589 movl $UDS_SEL, REGOFF_SS(%rsp) 590 591_syscall32_save: 592 movl %edi, REGOFF_RDI(%rsp) 593 movl %esi, REGOFF_RSI(%rsp) 594 movl %ebp, REGOFF_RBP(%rsp) 595 movl %ebx, REGOFF_RBX(%rsp) 596 movl %edx, REGOFF_RDX(%rsp) 597 movl %ecx, REGOFF_RCX(%rsp) 598 movl %eax, REGOFF_RAX(%rsp) /* wrapper: sysc# -> %eax */ 599 movq $0, REGOFF_SAVFP(%rsp) 600 movq $0, REGOFF_SAVPC(%rsp) 601 602 /* 603 * Copy these registers here in case we end up stopped with 604 * someone (like, say, /proc) messing with our register state. 605 * We don't -restore- them unless we have to in update_sregs. 606 * 607 * Since userland -can't- change fsbase or gsbase directly, 608 * we don't bother to capture them here. 
609 */ 610 xorl %ebx, %ebx 611 movw %ds, %bx 612 movq %rbx, REGOFF_DS(%rsp) 613 movw %es, %bx 614 movq %rbx, REGOFF_ES(%rsp) 615 movw %fs, %bx 616 movq %rbx, REGOFF_FS(%rsp) 617 movw %gs, %bx 618 movq %rbx, REGOFF_GS(%rsp) 619 620 /* 621 * Application state saved in the regs structure on the stack 622 * %eax is the syscall number 623 * %rsp is the thread's stack, %r15 is curthread 624 * REG_RSP(%rsp) is the user's stack 625 */ 626 627 SYSCALL_TRAPTRACE32($TT_SYSC) 628 629 movq %rsp, %rbp 630 631 movq T_LWP(%r15), %r14 632 ASSERT_NO_RUPDATE_PENDING(%r14) 633 634 ENABLE_INTR_FLAGS 635 636 MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM) 637 movl REGOFF_RAX(%rsp), %eax /* (%rax damaged by mstate call) */ 638 639 ASSERT_LWPTOREGS(%r14, %rsp) 640 641 incq %gs:CPU_STATS_SYS_SYSCALL 642 643 /* 644 * Make some space for MAXSYSARGS (currently 8) 32-bit args placed 645 * into 64-bit (long) arg slots, maintaining 16 byte alignment. Or 646 * more succinctly: 647 * 648 * SA(MAXSYSARGS * sizeof (long)) == 64 649 */ 650#define SYS_DROP 64 /* drop for args */ 651 subq $SYS_DROP, %rsp 652 movb $LWP_SYS, LWP_STATE(%r14) 653 movq %r15, %rdi 654 movq %rsp, %rsi 655 call syscall_entry 656 657 /* 658 * Fetch the arguments copied onto the kernel stack and put 659 * them in the right registers to invoke a C-style syscall handler. 660 * %rax contains the handler address. 661 * 662 * Ideas for making all this go faster of course include simply 663 * forcibly fetching 6 arguments from the user stack under lofault 664 * protection, reverting to copyin_args only when watchpoints 665 * are in effect. 666 * 667 * (If we do this, make sure that exec and libthread leave 668 * enough space at the top of the stack to ensure that we'll 669 * never do a fetch from an invalid page.) 670 * 671 * Lots of ideas here, but they won't really help with bringup B-) 672 * Correctness can't wait, performance can wait a little longer .. 
673 */ 674 675 movq %rax, %rbx 676 movl 0(%rsp), %edi 677 movl 8(%rsp), %esi 678 movl 0x10(%rsp), %edx 679 movl 0x18(%rsp), %ecx 680 movl 0x20(%rsp), %r8d 681 movl 0x28(%rsp), %r9d 682 683 call *SY_CALLC(%rbx) 684 685 movq %rbp, %rsp /* pop the args */ 686 687 /* 688 * amd64 syscall handlers -always- return a 64-bit value in %rax. 689 * On the 32-bit kernel, they always return that value in %eax:%edx 690 * as required by the 32-bit ABI. 691 * 692 * Simulate the same behaviour by unconditionally splitting the 693 * return value in the same way. 694 */ 695 movq %rax, %r13 696 shrq $32, %r13 /* upper 32-bits into %edx */ 697 movl %eax, %r12d /* lower 32-bits into %eax */ 698 699 /* 700 * Optimistically assume that there's no post-syscall 701 * work to do. (This is to avoid having to call syscall_mstate() 702 * with interrupts disabled) 703 */ 704 MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER) 705 706 /* 707 * We must protect ourselves from being descheduled here; 708 * If we were, and we ended up on another cpu, or another 709 * lwp got in ahead of us, it could change the segment 710 * registers without us noticing before we return to userland. 711 */ 712 CLI(%r14) 713 CHECK_POSTSYS_NE(%r15, %r14, %ebx) 714 jne _full_syscall_postsys32 715 SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx) 716 717 /* 718 * To get back to userland, we need to put the return %rip in %rcx and 719 * the return %rfl in %r11d. The sysret instruction also arranges 720 * to fix up %cs and %ss; everything else is our responsibility. 
721 */ 722 723 movl %r12d, %eax /* %eax: rval1 */ 724 movl REGOFF_RBX(%rsp), %ebx 725 /* %ecx used for return pointer */ 726 movl %r13d, %edx /* %edx: rval2 */ 727 movl REGOFF_RBP(%rsp), %ebp 728 movl REGOFF_RSI(%rsp), %esi 729 movl REGOFF_RDI(%rsp), %edi 730 731 movl REGOFF_RFL(%rsp), %r11d /* %r11 -> eflags */ 732 movl REGOFF_RIP(%rsp), %ecx /* %ecx -> %eip */ 733 movl REGOFF_RSP(%rsp), %esp 734 735 swapgs 736 sysretl 737 738_full_syscall_postsys32: 739 STI 740 /* 741 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM 742 * so that we can account for the extra work it takes us to finish. 743 */ 744 MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM) 745 movq %r15, %rdi 746 movq %r12, %rsi /* rval1 - %eax */ 747 movq %r13, %rdx /* rval2 - %edx */ 748 call syscall_exit 749 MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER) 750 jmp _sys_rtt 751 SET_SIZE(sys_syscall32) 752 SET_SIZE(brand_sys_syscall32) 753 754#endif /* __lint */ 755 756/* 757 * System call handler via the sysenter instruction 758 * Used only for 32-bit system calls on the 64-bit kernel. 759 * 760 * The caller in userland has arranged that: 761 * 762 * - %eax contains the syscall number 763 * - %ecx contains the user %esp 764 * - %edx contains the return %eip 765 * - the user stack contains the args to the syscall 766 * 767 * Hardware and (privileged) initialization code have arranged that by 768 * the time the sysenter instructions completes: 769 * 770 * - %rip is pointing to sys_sysenter (below). 771 * - %cs and %ss are set to kernel text and stack (data) selectors. 772 * - %rsp is pointing at the lwp's stack 773 * - interrupts have been disabled. 774 * 775 * Note that we are unable to return both "rvals" to userland with 776 * this call, as %edx is used by the sysexit instruction. 777 * 778 * One final complication in this routine is its interaction with 779 * single-stepping in a debugger. 
For most of the system call mechanisms,
 * the CPU automatically clears the single-step flag before we enter the
 * kernel.  The sysenter mechanism does not clear the flag, so a user
 * single-stepping through a libc routine may suddenly find him/herself
 * single-stepping through the kernel.  To detect this, kmdb compares the
 * trap %pc to the [brand_]sys_sysenter addresses on each single-step trap.
 * If it finds that we have single-stepped to a sysenter entry point, it
 * explicitly clears the flag and executes the sys_sysenter routine.
 *
 * One final complication in this final complication is the fact that we
 * have two different entry points for sysenter: brand_sys_sysenter and
 * sys_sysenter.  If we enter at brand_sys_sysenter and start single-stepping
 * through the kernel with kmdb, we will eventually hit the instruction at
 * sys_sysenter.  kmdb cannot distinguish between that valid single-step
 * and the undesirable one mentioned above.  To avoid this situation, we
 * simply add a jump over the instruction at sys_sysenter to make it
 * impossible to single-step to it.
 */
#if defined(__lint)

void
sys_sysenter()
{}

#else	/* __lint */

	ENTRY_NP(brand_sys_sysenter)
	SWAPGS

	ALTENTRY(_brand_sys_sysenter_post_swapgs)
	BRAND_CALLBACK(BRAND_CB_SYSENTER)
	/*
	 * Jump over sys_sysenter to allow single-stepping as described
	 * above.
	 */
	jmp	_sys_sysenter_post_swapgs

	ALTENTRY(sys_sysenter)
	SWAPGS

	ALTENTRY(_sys_sysenter_post_swapgs)
	movq	%gs:CPU_THREAD, %r15

	/*
	 * %rsp already points at the lwp's stack on entry (see the block
	 * comment above), so the regs image can be filled in directly.
	 */
	movl	$U32CS_SEL, REGOFF_CS(%rsp)
	movl	%ecx, REGOFF_RSP(%rsp)		/* wrapper: %esp -> %ecx */
	movl	%edx, REGOFF_RIP(%rsp)		/* wrapper: %eip -> %edx */
	pushfq
	popq	%r10				/* %r10 = current rflags */
	movl	$UDS_SEL, REGOFF_SS(%rsp)

	/*
	 * Set the interrupt flag before storing the flags to the
	 * flags image on the stack so we can return to user with
	 * interrupts enabled if we return via sys_rtt_syscall32
	 */
	orq	$PS_IE, %r10
	movq	%r10, REGOFF_RFL(%rsp)

	movl	%edi, REGOFF_RDI(%rsp)
	movl	%esi, REGOFF_RSI(%rsp)
	movl	%ebp, REGOFF_RBP(%rsp)
	movl	%ebx, REGOFF_RBX(%rsp)
	movl	%edx, REGOFF_RDX(%rsp)
	movl	%ecx, REGOFF_RCX(%rsp)
	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * Application state saved in the regs structure on the stack
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REGOFF_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE($TT_SYSENTER)

	movq	%rsp, %rbp			/* %rbp -> struct regs */

	movq	T_LWP(%r15), %r14		/* %r14 = lwp */
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	/*
	 * Catch 64-bit process trying to issue sysenter instruction
	 * on Nocona based systems.
	 */
	movq	LWP_PROCP(%r14), %rax
	cmpq	$DATAMODEL_ILP32, P_MODEL(%rax)
	je	7f

	/*
	 * For a non-32-bit process, simulate a #ud, since that's what
	 * native hardware does.  The traptrace entry (above) will
	 * let you know what really happened.
	 */
	movq	$T_ILLINST, REGOFF_TRAPNO(%rsp)
	movq	REGOFF_CS(%rsp), %rdi
	movq	%rdi, REGOFF_ERR(%rsp)
	movq	%rsp, %rdi			/* arg0: struct regs */
	movq	REGOFF_RIP(%rsp), %rsi		/* arg1: saved %rip */
	movl	%gs:CPU_ID, %edx		/* arg2: cpu id */
	call	trap
	jmp	_sys_rtt
7:

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate calls) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	/*
	 * Make some space for MAXSYSARGS (currently 8) 32-bit args
	 * placed into 64-bit (long) arg slots, maintaining 16 byte
	 * alignment.
	 *
	 * NOTE(review): this comment used to mention an extra 64-bit
	 * (long) arg count slot as well, but SYS_DROP is 64, i.e. exactly
	 * eight 8-byte slots -- confirm no count slot is expected.
	 */
	subq	$SYS_DROP, %rsp
	movb	$LWP_SYS, LWP_STATE(%r14)
	movq	%r15, %rdi			/* arg0: curthread */
	movq	%rsp, %rsi			/* arg1: the arg slots */
	call	syscall_entry

	/*
	 * Fetch the arguments copied onto the kernel stack and put
	 * them in the right registers to invoke a C-style syscall handler.
	 * %rax contains the handler address.
	 */
	movq	%rax, %rbx
	movl	0(%rsp), %edi
	movl	8(%rsp), %esi
	movl	0x10(%rsp), %edx
	movl	0x18(%rsp), %ecx
	movl	0x20(%rsp), %r8d
	movl	0x28(%rsp), %r9d

	call	*SY_CALLC(%rbx)

	movq	%rbp, %rsp	/* pop the args */

	/*
	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
	 * On the 32-bit kernel, they always return that value in %eax:%edx
	 * as required by the 32-bit ABI.
	 *
	 * Simulate the same behaviour by unconditionally splitting the
	 * return value in the same way.
	 */
	movq	%rax, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%eax, %r12d	/* lower 32-bits into %eax */

	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	cli
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_full_syscall_postsys32
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	/*
	 * To get back to userland, load up the 32-bit registers and
	 * sysexit back where we came from.
	 */

	/*
	 * Interrupts will be turned on by the 'sti' executed just before
	 * sysexit.  The following ensures that restoring the user's rflags
	 * doesn't enable interrupts too soon.
	 */
	andq	$_BITNOT(PS_IE), REGOFF_RFL(%rsp)

	/*
	 * (There's no point in loading up %edx because the sysexit
	 * mechanism smashes it.)
	 */
	movl	%r12d, %eax
	movl	REGOFF_RBX(%rsp), %ebx
	movl	REGOFF_RBP(%rsp), %ebp
	movl	REGOFF_RSI(%rsp), %esi
	movl	REGOFF_RDI(%rsp), %edi

	movl	REGOFF_RIP(%rsp), %edx	/* sysexit: %edx -> %eip */
	pushq	REGOFF_RFL(%rsp)
	popfq
	movl	REGOFF_RSP(%rsp), %ecx	/* sysexit: %ecx -> %esp */
	swapgs
	sti
	sysexit
	SET_SIZE(sys_sysenter)
	SET_SIZE(_sys_sysenter_post_swapgs)
	SET_SIZE(brand_sys_sysenter)

#endif	/* __lint */

#if defined(__lint)
/*
 * System call via an int80.  This entry point is only used by the Linux
 * application environment.  Unlike the other entry points, there is no
 * default action to take if no callback is registered for this process.
1009 */ 1010void 1011sys_int80() 1012{} 1013 1014#else /* __lint */ 1015 1016 ENTRY_NP(brand_sys_int80) 1017 swapgs 1018 BRAND_CALLBACK(BRAND_CB_INT80) 1019 swapgs 1020 1021 ENTRY_NP(sys_int80) 1022 /* 1023 * We hit an int80, but this process isn't of a brand with an int80 1024 * handler. Bad process! Make it look as if the INT failed. 1025 * Modify %eip to point before the INT, push the expected error 1026 * code and fake a GP fault. 1027 * 1028 */ 1029 subq $2, (%rsp) /* int insn 2-bytes */ 1030 pushq $_CONST(_MUL(T_INT80, GATE_DESC_SIZE) + 2) 1031 jmp gptrap / GP fault 1032 SET_SIZE(sys_int80) 1033 SET_SIZE(brand_sys_int80) 1034#endif /* __lint */ 1035 1036 1037/* 1038 * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by 1039 * the generic i386 libc to do system calls. We do a small amount of setup 1040 * before jumping into the existing sys_syscall32 path. 1041 */ 1042#if defined(__lint) 1043 1044/*ARGSUSED*/ 1045void 1046sys_syscall_int() 1047{} 1048 1049#else /* __lint */ 1050 1051 ENTRY_NP(brand_sys_syscall_int) 1052 SWAPGS 1053 BRAND_CALLBACK(BRAND_CB_INT91) 1054 swapgs 1055 1056 ALTENTRY(sys_syscall_int) 1057 swapgs 1058 movq %gs:CPU_THREAD, %r15 1059 movq T_STACK(%r15), %rsp 1060 movl %eax, %eax 1061 /* 1062 * Set t_post_sys on this thread to force ourselves out via the slow 1063 * path. It might be possible at some later date to optimize this out 1064 * and use a faster return mechanism. 1065 */ 1066 movb $1, T_POST_SYS(%r15) 1067 CLEAN_CS 1068 jmp _syscall32_save 1069 SET_SIZE(sys_syscall_int) 1070 SET_SIZE(brand_sys_syscall_int) 1071 1072#endif /* __lint */ 1073 1074/* 1075 * Legacy 32-bit applications and old libc implementations do lcalls; 1076 * we should never get here because the LDT entry containing the syscall 1077 * segment descriptor has the "segment present" bit cleared, which means 1078 * we end up processing those system calls in trap() via a not-present trap. 
1079 * 1080 * We do it this way because a call gate unhelpfully does -nothing- to the 1081 * interrupt flag bit, so an interrupt can run us just after the lcall 1082 * completes, but just before the swapgs takes effect. Thus the INTR_PUSH and 1083 * INTR_POP paths would have to be slightly more complex to dance around 1084 * this problem, and end up depending explicitly on the first 1085 * instruction of this handler being either swapgs or cli. 1086 */ 1087 1088#if defined(__lint) 1089 1090/*ARGSUSED*/ 1091void 1092sys_lcall32() 1093{} 1094 1095#else /* __lint */ 1096 1097 ENTRY_NP(sys_lcall32) 1098 SWAPGS 1099 pushq $0 1100 pushq %rbp 1101 movq %rsp, %rbp 1102 leaq __lcall_panic_str(%rip), %rdi 1103 xorl %eax, %eax 1104 call panic 1105 SET_SIZE(sys_lcall32) 1106 1107__lcall_panic_str: 1108 .string "sys_lcall32: shouldn't be here!" 1109 1110/* 1111 * Declare a uintptr_t which covers the entire pc range of syscall 1112 * handlers for the stack walkers that need this. 1113 */ 1114 .align CPTRSIZE 1115 .globl _allsyscalls_size 1116 .type _allsyscalls_size, @object 1117_allsyscalls_size: 1118 .NWORD . - _allsyscalls 1119 SET_SIZE(_allsyscalls_size) 1120 1121#endif /* __lint */ 1122 1123/* 1124 * These are the thread context handlers for lwps using sysenter/sysexit. 1125 */ 1126 1127#if defined(__lint) 1128 1129/*ARGSUSED*/ 1130void 1131sep_save(void *ksp) 1132{} 1133 1134/*ARGSUSED*/ 1135void 1136sep_restore(void *ksp) 1137{} 1138 1139#else /* __lint */ 1140 1141 /* 1142 * setting this value to zero as we switch away causes the 1143 * stack-pointer-on-sysenter to be NULL, ensuring that we 1144 * don't silently corrupt another (preempted) thread stack 1145 * when running an lwp that (somehow) didn't get sep_restore'd 1146 */ 1147 ENTRY_NP(sep_save) 1148 xorl %edx, %edx 1149 xorl %eax, %eax 1150 movl $MSR_INTC_SEP_ESP, %ecx 1151 wrmsr 1152 ret 1153 SET_SIZE(sep_save) 1154 1155 /* 1156 * Update the kernel stack pointer as we resume onto this cpu. 
1157 */ 1158 ENTRY_NP(sep_restore) 1159 movq %rdi, %rdx 1160 shrq $32, %rdx 1161 movl %edi, %eax 1162 movl $MSR_INTC_SEP_ESP, %ecx 1163 wrmsr 1164 ret 1165 SET_SIZE(sep_restore) 1166 1167#endif /* __lint */ 1168