/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

#include <sys/asm_linkage.h>
#include <sys/asm_misc.h>
#include <sys/regset.h>
#include <sys/privregs.h>
#include <sys/psw.h>
#include <sys/machbrand.h>

#include <sys/segments.h>
#include <sys/pcb.h>
#include <sys/trap.h>
#include <sys/ftrace.h>
#include <sys/traptrace.h>
#include <sys/clock.h>
#include <sys/model.h>
#include <sys/panic.h>

#if defined(__xpv)
#include <sys/hypervisor.h>
#endif

#include "assym.h"

/*
 * We implement five flavours of system call entry points
 *
 * -	syscall/sysretq		(amd64 generic)
 * -	syscall/sysretl		(i386 plus SYSC bit)
 * -	sysenter/sysexit	(i386 plus SEP bit)
 * -	int/iret		(i386 generic)
 * -	lcall/iret		(i386 generic)
 *
 * The current libc included in Solaris uses int/iret as the base unoptimized
 * kernel entry method.  Older libc implementations and legacy binaries may use
 * the lcall call gate, so it must continue to be supported.
 *
 * System calls that use an lcall call gate are processed in trap() via a
 * segment-not-present trap, i.e. lcalls are extremely slow(!).
 *
 * The basic pattern used in the 32-bit SYSC handler at this point in time is
 * to have the bare minimum of assembler, and get to the C handlers as
 * quickly as possible.
 *
 * The 64-bit handler is much closer to the sparcv9 handler; that's
 * because of passing arguments in registers.  The 32-bit world still
 * passes arguments on the stack -- that makes that handler substantially
 * more complex.
 *
 * The two handlers share a few code fragments which are broken
 * out into preprocessor macros below.
 *
 * XX64	come back and speed all this up later.  The 32-bit stuff looks
 * especially easy to speed up the argument copying part ..
 *
 *
 * Notes about segment register usage (c.f. the 32-bit kernel)
 *
 * In the 32-bit kernel, segment registers are dutifully saved and
 * restored on all mode transitions because the kernel uses them directly.
 * When the processor is running in 64-bit mode, segment registers are
 * largely ignored.
 *
 * %cs and %ss
 *	controlled by the hardware mechanisms that make mode transitions
 *
 * The remaining segment registers have to either be pointing at a valid
 * descriptor i.e. with the 'present' bit set, or they can be NULL
 * descriptors
 *
 * %ds and %es
 *	always ignored
 *
 * %fs and %gs
 *	fsbase and gsbase are used to control the place they really point at.
 *	The kernel only depends on %gs, and controls its own gsbase via swapgs
 *
 * Note that loading segment registers is still costly because the GDT
 * lookup still happens (this is because the hardware can't know that we're
 * not setting up these segment registers for a 32-bit program).  Thus we
 * avoid doing this in the syscall path, and defer them to lwp context switch
 * handlers, so the register values remain virtualized to the lwp.
 */

#if defined(SYSCALLTRACE)
#define	ORL_SYSCALLTRACE(r32)		\
	orl	syscalltrace(%rip), r32
#else
#define	ORL_SYSCALLTRACE(r32)
#endif

/*
 * In the 32-bit kernel, we do absolutely nothing before getting into the
 * brand callback checks.  In 64-bit land, we do swapgs and then come here.
 * We assume that the %rsp- and %r15-stashing fields in the CPU structure
 * are still unused.
 *
 * Check if a brand_mach_ops callback is defined for the specified callback_id
 * type.  If so invoke it with the kernel's %gs value loaded and the following
 * data on the stack:
 *
 *	stack:  --------------------------------------
 *	     32 | callback pointer			|
 *	   | 24 | user (or interrupt) stack pointer	|
 *	   | 16 | lwp pointer				|
 *	   v  8 | userland return address		|
 *	      0 | callback wrapper return addr		|
 *	        --------------------------------------
 *
 * Since we're pushing the userland return address onto the kernel stack
 * we need to get that address without accessing the user's stack (since we
 * can't trust that data).  There are different ways to get the userland
 * return address depending on how the syscall trap was made:
 *
 * a) For sys_syscall and sys_syscall32 the return address is in %rcx.
 * b) For sys_sysenter the return address is in %rdx.
 * c) For sys_int80 and sys_syscall_int (int91), upon entry into the macro,
 *    the stack pointer points at the state saved when we took the interrupt:
 *	 ------------------------
 *	 | | user's %ss		|
 *	 | | user's %esp	|
 *	 | | EFLAGS register	|
 *	 v | user's %cs		|
 *	   | user's %eip	|
 *	 ------------------------
 *
 * The 2nd parameter to the BRAND_CALLBACK macro is either the
 * BRAND_URET_FROM_REG or BRAND_URET_FROM_INTR_STACK macro.  These macros are
 * used to generate the proper code to get the userland return address for
 * each syscall entry point.
 *
 * The interface to the brand callbacks on the 64-bit kernel assumes %r15
 * is available as a scratch register within the callback.  If the callback
 * returns within the kernel then this macro will restore %r15.  If the
 * callback is going to return directly to userland then it should restore
 * %r15 before returning to userland.
 */
#define	BRAND_URET_FROM_REG(rip_reg)					\
	pushq	rip_reg			/* push the return address */

/*
 * The interrupt stack pointer we saved on entry to the BRAND_CALLBACK macro
 * is currently pointing at the user return address (%eip).
 */
#define	BRAND_URET_FROM_INTR_STACK()					\
	movq	%gs:CPU_RTMP_RSP, %r15	/* grab the intr. stack pointer */ ;\
	pushq	(%r15)			/* push the return address */

#define	BRAND_CALLBACK(callback_id, push_userland_ret)			\
	movq	%rsp, %gs:CPU_RTMP_RSP	/* save the stack pointer */	;\
	movq	%r15, %gs:CPU_RTMP_R15	/* save %r15 */			;\
	movq	%gs:CPU_THREAD, %r15	/* load the thread pointer */	;\
	movq	T_STACK(%r15), %rsp	/* switch to the kernel stack */ ;\
	subq	$16, %rsp		/* save space for 2 pointers */	;\
	pushq	%r14			/* save %r14 */			;\
	movq	%gs:CPU_RTMP_RSP, %r14					;\
	movq	%r14, 8(%rsp)		/* stash the user stack pointer */ ;\
	popq	%r14			/* restore %r14 */		;\
	movq	T_LWP(%r15), %r15	/* load the lwp pointer */	;\
	pushq	%r15			/* push the lwp pointer */	;\
	movq	LWP_PROCP(%r15), %r15	/* load the proc pointer */	;\
	movq	P_BRAND(%r15), %r15	/* load the brand pointer */	;\
	movq	B_MACHOPS(%r15), %r15	/* load the machops pointer */	;\
	movq	_CONST(_MUL(callback_id, CPTRSIZE))(%r15), %r15		;\
	cmpq	$0, %r15						;\
	je	1f							;\
	movq	%r15, 16(%rsp)		/* save the callback pointer */	;\
	push_userland_ret		/* push the return address */	;\
	movq	24(%rsp), %r15		/* load callback pointer */	;\
	INDIRECT_CALL_REG(r15)		/* call callback */		;\
1:	movq	%gs:CPU_RTMP_R15, %r15	/* restore %r15 */		;\
	movq	%gs:CPU_RTMP_RSP, %rsp	/* restore the stack pointer */

#define	MSTATE_TRANSITION(from, to)		\
	movl	$from, %edi;			\
	movl	$to, %esi;			\
	call	syscall_mstate

/*
 * Check to see if a simple (direct) return is possible i.e.
 *
 *	if (t->t_post_sys_ast | syscalltrace |
 *	    lwp->lwp_pcb.pcb_rupdate == 1)
 *		do full version	;
 *
 * Preconditions:
 * -	t is curthread
 * Postconditions:
 * -	condition code NE is set if post-sys is too complex
 * -	rtmp is zeroed if it isn't (we rely on this!)
 * -	ltmp is smashed
 */
#define	CHECK_POSTSYS_NE(t, ltmp, rtmp)		\
	movq	T_LWP(t), ltmp;			\
	movzbl	PCB_RUPDATE(ltmp), rtmp;	\
	ORL_SYSCALLTRACE(rtmp);			\
	orl	T_POST_SYS_AST(t), rtmp;	\
	cmpl	$0, rtmp

/*
 * Fix up the lwp, thread, and eflags for a successful return
 *
 * Preconditions:
 * -	zwreg contains zero
 */
#define	SIMPLE_SYSCALL_POSTSYS(t, lwp, zwreg)		\
	movb	$LWP_USER, LWP_STATE(lwp);		\
	movw	zwreg, T_SYSNUM(t);			\
	andb	$_CONST(0xffff - PS_C), REGOFF_RFL(%rsp)

/*
 * ASSERT(lwptoregs(lwp) == rp);
 *
 * This may seem obvious, but very odd things happen if this
 * assertion is false
 *
 * Preconditions:
 *	(%rsp is ready for normal call sequence)
 * Postconditions (if assertion is true):
 *	%r11 is smashed
 *
 * ASSERT(rp->r_cs == descnum)
 *
 * The code selector is written into the regs structure when the
 * lwp stack is created.  We use this ASSERT to validate that
 * the regs structure really matches how we came in.
 *
 * Preconditions:
 *	(%rsp is ready for normal call sequence)
 * Postconditions (if assertion is true):
 *	-none-
 *
 * ASSERT(lwp->lwp_pcb.pcb_rupdate == 0);
 *
 * If this is false, it means that we returned to userland without
 * updating the segment registers as we were supposed to.
 *
 * Note that we must ensure no interrupts or other traps intervene
 * between entering privileged mode and performing the assertion,
 * otherwise we may perform a context switch on the thread, which
 * will end up setting pcb_rupdate to 1 again.
 *
 * ASSERT(%cr0 & CR0_TS == 0);
 * Preconditions:
 *	(%rsp is ready for normal call sequence)
 * Postconditions (if assertion is true):
 *	(specified register is clobbered)
 *
 * Check to make sure that we are returning to user land and that CR0.TS
 * is not set.  This is required as part of the eager FPU (see
 * uts/intel/os/fpu.c for more information).
 */

#if defined(DEBUG)

__lwptoregs_msg:
	.string	"syscall_asm_amd64.s:%d lwptoregs(%p) [%p] != rp [%p]"

__codesel_msg:
	.string	"syscall_asm_amd64.s:%d rp->r_cs [%ld] != %ld"

__no_rupdate_msg:
	.string	"syscall_asm_amd64.s:%d lwp %p, pcb_rupdate != 0"

__bad_ts_msg:
	.string	"syscall_asm_amd64.s:%d CR0.TS set on user return"

#define	ASSERT_LWPTOREGS(lwp, rp)			\
	movq	LWP_REGS(lwp), %r11;			\
	cmpq	rp, %r11;				\
	je	7f;					\
	leaq	__lwptoregs_msg(%rip), %rdi;		\
	movl	$__LINE__, %esi;			\
	movq	lwp, %rdx;				\
	movq	%r11, %rcx;				\
	movq	rp, %r8;				\
	xorl	%eax, %eax;				\
	call	panic;					\
7:

#define	ASSERT_NO_RUPDATE_PENDING(lwp)			\
	testb	$0x1, PCB_RUPDATE(lwp);			\
	je	8f;					\
	movq	lwp, %rdx;				\
	leaq	__no_rupdate_msg(%rip), %rdi;		\
	movl	$__LINE__, %esi;			\
	xorl	%eax, %eax;				\
	call	panic;					\
8:

#define	ASSERT_CR0TS_ZERO(reg)				\
	movq	%cr0, reg;				\
	testq	$CR0_TS, reg;				\
	jz	9f;					\
	leaq	__bad_ts_msg(%rip), %rdi;		\
	movl	$__LINE__, %esi;			\
	xorl	%eax, %eax;				\
	call	panic;					\
9:

#else
#define	ASSERT_LWPTOREGS(lwp, rp)
#define	ASSERT_NO_RUPDATE_PENDING(lwp)
#define	ASSERT_CR0TS_ZERO(reg)
#endif

/*
 * Do the traptrace thing and restore any registers we used
 * in situ.  Assumes that %rsp is pointing at the base of
 * the struct regs, obviously ..
 */
#ifdef TRAPTRACE
#define	SYSCALL_TRAPTRACE(ttype)				\
	TRACE_PTR(%rdi, %rbx, %ebx, %rcx, ttype);		\
	TRACE_REGS(%rdi, %rsp, %rbx, %rcx);			\
	TRACE_STAMP(%rdi);	/* rdtsc clobbers %eax, %edx */	\
	movq	REGOFF_RAX(%rsp), %rax;				\
	movq	REGOFF_RBX(%rsp), %rbx;				\
	movq	REGOFF_RCX(%rsp), %rcx;				\
	movq	REGOFF_RDX(%rsp), %rdx;				\
	movl	%eax, TTR_SYSNUM(%rdi);				\
	movq	REGOFF_RDI(%rsp), %rdi

#define	SYSCALL_TRAPTRACE32(ttype)				\
	SYSCALL_TRAPTRACE(ttype);				\
	/* paranoia: clean the top 32-bits of the registers */	\
	orl	%eax, %eax;					\
	orl	%ebx, %ebx;					\
	orl	%ecx, %ecx;					\
	orl	%edx, %edx;					\
	orl	%edi, %edi
#else	/* TRAPTRACE */
#define	SYSCALL_TRAPTRACE(ttype)
#define	SYSCALL_TRAPTRACE32(ttype)
#endif	/* TRAPTRACE */

/*
 * The 64-bit libc syscall wrapper does this:
 *
 * fn(<args>)
 * {
 *	movq	%rcx, %r10	-- because syscall smashes %rcx
 *	movl	$CODE, %eax
 *	syscall
 *	<error processing>
 * }
 *
 * Thus when we come into the kernel:
 *
 *	%rdi, %rsi, %rdx, %r10, %r8, %r9 contain first six args
 *	%rax is the syscall number
 *	%r12-%r15 contain caller state
 *
 * The syscall instruction arranges that:
 *
 *	%rcx contains the return %rip
 *	%r11d contains bottom 32-bits of %rflags
 *	%rflags is masked (as determined by the SFMASK msr)
 *	%cs is set to UCS_SEL (as determined by the STAR msr)
 *	%ss is set to UDS_SEL (as determined by the STAR msr)
 *	%rip is set to sys_syscall (as determined by the LSTAR msr)
 *
 * Or in other words, we have no registers available at all.
 * Only swapgs can save us!
 *
 * Under the hypervisor, the swapgs has happened already.  However, the
 * state of the world is very different from that we're familiar with.
 *
 * In particular, we have a stack structure like that for interrupt
 * gates, except that the %cs and %ss registers are modified for reasons
 * that are not entirely clear.  Critically, the %rcx/%r11 values do
 * *not* reflect the usage of those registers under a 'real' syscall[1];
 * the stack, therefore, looks like this:
 *
 *	0x0(rsp)	potentially junk %rcx
 *	0x8(rsp)	potentially junk %r11
 *	0x10(rsp)	user %rip
 *	0x18(rsp)	modified %cs
 *	0x20(rsp)	user %rflags
 *	0x28(rsp)	user %rsp
 *	0x30(rsp)	modified %ss
 *
 *
 * and before continuing on, we must load the %rip into %rcx and the
 * %rflags into %r11.
 *
 * [1] They used to, and we relied on it, but this was broken in 3.1.1.
 * Sigh.
 */
#if defined(__xpv)
#define	XPV_SYSCALL_PROD						\
	movq	0x10(%rsp), %rcx;					\
	movq	0x20(%rsp), %r11;					\
	movq	0x28(%rsp), %rsp
#else
#define	XPV_SYSCALL_PROD	/* nothing */
#endif

	ENTRY_NP2(brand_sys_syscall,_allsyscalls)
	SWAPGS				/* kernel gsbase */
	XPV_SYSCALL_PROD
	BRAND_CALLBACK(BRAND_CB_SYSCALL, BRAND_URET_FROM_REG(%rcx))
	jmp	noprod_sys_syscall

	ALTENTRY(sys_syscall)
	SWAPGS				/* kernel gsbase */
	XPV_SYSCALL_PROD

noprod_sys_syscall:
	movq	%r15, %gs:CPU_RTMP_R15
	movq	%rsp, %gs:CPU_RTMP_RSP

	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp	/* switch from user to kernel stack */

	ASSERT_UPCALL_MASK_IS_SET

	movl	$UCS_SEL, REGOFF_CS(%rsp)
	movq	%rcx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
	movl	$UDS_SEL, REGOFF_SS(%rsp)

	movl	%eax, %eax			/* wrapper: sysc# -> %eax */
	movq	%rdi, REGOFF_RDI(%rsp)
	movq	%rsi, REGOFF_RSI(%rsp)
	movq	%rdx, REGOFF_RDX(%rsp)
	movq	%r10, REGOFF_RCX(%rsp)		/* wrapper: %rcx -> %r10 */
	movq	%r10, %rcx			/* arg[3] for direct calls */

	movq	%r8, REGOFF_R8(%rsp)
	movq	%r9, REGOFF_R9(%rsp)
	movq	%rax, REGOFF_RAX(%rsp)
	movq	%rbx, REGOFF_RBX(%rsp)

	movq	%rbp, REGOFF_RBP(%rsp)
	movq	%r10, REGOFF_R10(%rsp)
	movq	%gs:CPU_RTMP_RSP, %r11
	movq	%r11, REGOFF_RSP(%rsp)
	movq	%r12, REGOFF_R12(%rsp)

	movq	%r13, REGOFF_R13(%rsp)
	movq	%r14, REGOFF_R14(%rsp)
	movq	%gs:CPU_RTMP_R15, %r10
	movq	%r10, REGOFF_R15(%rsp)
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * and capturing them involves two serializing instructions,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * If we're trying to use TRAPTRACE though, I take that back: we're
	 * probably debugging some problem in the SWAPGS logic and want to know
	 * what the incoming gsbase was.
	 *
	 * Since we already did SWAPGS, record the KGSBASE.
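	 *
	 * (rdmsr selects the MSR via %ecx and returns the value in
	 * %edx:%eax, which is why the 64-bit gsbase is stored below as
	 * two 32-bit halves.)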
	 */
#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
	movl	$MSR_AMD_KGSBASE, %ecx
	rdmsr
	movl	%eax, REGOFF_GSBASE(%rsp)
	movl	%edx, REGOFF_GSBASE+4(%rsp)
#endif

	/*
	 * Machine state saved in the regs structure on the stack
	 * First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE($TT_SYSC64)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)
	ENABLE_INTR_FLAGS

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	movb	$LWP_SYS, LWP_STATE(%r14)
	incq	LWP_RU_SYSC(%r14)
	movb	$NORMALRETURN, LWP_EOSYS(%r14)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	movw	%ax, T_SYSNUM(%r15)
	movzbl	T_PRE_SYS(%r15), %ebx
	ORL_SYSCALLTRACE(%ebx)
	testl	%ebx, %ebx
	jne	_syscall_pre

_syscall_invoke:
	movq	REGOFF_RDI(%rbp), %rdi
	movq	REGOFF_RSI(%rbp), %rsi
	movq	REGOFF_RDX(%rbp), %rdx
	movq	REGOFF_RCX(%rbp), %rcx
	movq	REGOFF_R8(%rbp), %r8
	movq	REGOFF_R9(%rbp), %r9

	cmpl	$NSYSCALL, %eax
	jae	_syscall_ill
	shll	$SYSENT_SIZE_SHIFT, %eax
	leaq	sysent(%rax), %rbx

	movq	SY_CALLC(%rbx), %rax
	INDIRECT_CALL_REG(rax)

	movq	%rax, %r12
	movq	%rdx, %r13

	/*
	 * If the handler returns two ints, then we need to split the
	 * 64-bit return value into two 32-bit values.
	 */
	testw	$SE_32RVAL2, SY_FLAGS(%rbx)
	je	5f
	movq	%r12, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%r12d, %r12d	/* lower 32-bits into %eax */
5:
	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	CLI(%r14)
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_syscall_post

	/*
	 * We need to protect ourselves against non-canonical return values
	 * because Intel doesn't check for them on sysret (AMD does).  Canonical
	 * addresses on current amd64 processors only use 48-bits for VAs; an
	 * address is canonical if all upper bits (47-63) are identical.  If we
	 * find a non-canonical %rip, we opt to go through the full
	 * _syscall_post path which takes us into an iretq which is not
	 * susceptible to the same problems sysret is.
	 *
	 * We're checking for a canonical address by first doing an arithmetic
	 * shift.  This will fill in the remaining bits with the value of bit 63.
	 * If the address were canonical, the register would now have either all
	 * zeroes or all ones in it.  Therefore we add one (inducing overflow)
	 * and compare against 1.  A canonical address will either be zero or one
	 * at this point, hence the use of ja.
	 *
	 * At this point, r12 and r13 have the return value so we can't use
	 * those registers.
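	 *
	 * As a rough illustration only (a C sketch of the same test under
	 * the 48-bit VA assumption above; "rip" is just an illustrative
	 * local, not a variable in this code):
	 *
	 *	int64_t rip = rp->r_rip;
	 *	rip >>= 47;			// sign-fills from bit 63
	 *	if ((uint64_t)(rip + 1) > 1)	// canonical leaves 0 or 1 here
	 *		goto _syscall_post;	// fall back to the iretq path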
	 */
	movq	REGOFF_RIP(%rsp), %rcx
	sarq	$47, %rcx
	incq	%rcx
	cmpq	$1, %rcx
	ja	_syscall_post


	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	movq	%r12, REGOFF_RAX(%rsp)
	movq	%r13, REGOFF_RDX(%rsp)

	/*
	 * Clobber %r11 as we check CR0.TS.
	 */
	ASSERT_CR0TS_ZERO(%r11)

	/*
	 * Unlike other cases, because we need to restore the user stack pointer
	 * before exiting the kernel we must clear the microarch state before
	 * getting here.  This should be safe because it means that the only
	 * values on the bus after this are based on the user's registers and
	 * potentially the addresses where we stored them.  Given the constraints
	 * of sysret, that's how it has to be.
	 */
	call	x86_md_clear

	/*
	 * To get back to userland, we need the return %rip in %rcx and
	 * the return %rfl in %r11d.  The sysretq instruction also arranges
	 * to fix up %cs and %ss; everything else is our responsibility.
	 */
	movq	REGOFF_RDI(%rsp), %rdi
	movq	REGOFF_RSI(%rsp), %rsi
	movq	REGOFF_RDX(%rsp), %rdx
	/* %rcx used to restore %rip value */

	movq	REGOFF_R8(%rsp), %r8
	movq	REGOFF_R9(%rsp), %r9
	movq	REGOFF_RAX(%rsp), %rax
	movq	REGOFF_RBX(%rsp), %rbx

	movq	REGOFF_RBP(%rsp), %rbp
	movq	REGOFF_R10(%rsp), %r10
	/* %r11 used to restore %rfl value */
	movq	REGOFF_R12(%rsp), %r12

	movq	REGOFF_R13(%rsp), %r13
	movq	REGOFF_R14(%rsp), %r14
	movq	REGOFF_R15(%rsp), %r15

	movq	REGOFF_RIP(%rsp), %rcx
	movl	REGOFF_RFL(%rsp), %r11d

#if defined(__xpv)
	addq	$REGOFF_RIP, %rsp
#else
	movq	REGOFF_RSP(%rsp), %rsp
#endif

	/*
	 * There can be no instructions between the ALTENTRY below and
	 * SYSRET or we could end up breaking brand support. See label usage
	 * in sn1_brand_syscall_callback for an example.
	 */
	ASSERT_UPCALL_MASK_IS_SET
#if defined(__xpv)
	SYSRETQ
	ALTENTRY(nopop_sys_syscall_swapgs_sysretq)

	/*
	 * We can only get here after executing a brand syscall
	 * interposition callback handler and simply need to
	 * "sysretq" back to userland.  On the hypervisor this
	 * involves the iret hypercall which requires us to construct
	 * just enough of the stack needed for the hypercall.
	 * (rip, cs, rflags, rsp, ss).
	 */
	movq	%rsp, %gs:CPU_RTMP_RSP		/* save user's rsp */
	movq	%gs:CPU_THREAD, %r11
	movq	T_STACK(%r11), %rsp

	movq	%rcx, REGOFF_RIP(%rsp)
	movl	$UCS_SEL, REGOFF_CS(%rsp)
	movq	%gs:CPU_RTMP_RSP, %r11
	movq	%r11, REGOFF_RSP(%rsp)
	pushfq
	popq	%r11				/* hypercall enables ints */
	movq	%r11, REGOFF_RFL(%rsp)
	movl	$UDS_SEL, REGOFF_SS(%rsp)
	addq	$REGOFF_RIP, %rsp
	/*
	 * XXPV: see comment in SYSRETQ definition for future optimization
	 * we could take.
	 */
	ASSERT_UPCALL_MASK_IS_SET
	SYSRETQ
#else
	ALTENTRY(nopop_sys_syscall_swapgs_sysretq)
	jmp	tr_sysretq
#endif
	/*NOTREACHED*/
	SET_SIZE(nopop_sys_syscall_swapgs_sysretq)

_syscall_pre:
	call	pre_syscall
	movl	%eax, %r12d
	testl	%eax, %eax
	jne	_syscall_post_call
	/*
	 * Didn't abort, so reload the syscall args and invoke the handler.
	 */
	movzwl	T_SYSNUM(%r15), %eax
	jmp	_syscall_invoke

_syscall_ill:
	call	nosys
	movq	%rax, %r12
	movq	%rdx, %r13
	jmp	_syscall_post_call

_syscall_post:
	STI
	/*
	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
	 * so that we can account for the extra work it takes us to finish.
	 */
	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
_syscall_post_call:
	movq	%r12, %rdi
	movq	%r13, %rsi
	call	post_syscall
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
	jmp	_sys_rtt
	SET_SIZE(sys_syscall)
	SET_SIZE(brand_sys_syscall)

	ENTRY_NP(brand_sys_syscall32)
	SWAPGS				/* kernel gsbase */
	XPV_TRAP_POP
	BRAND_CALLBACK(BRAND_CB_SYSCALL32, BRAND_URET_FROM_REG(%rcx))
	jmp	nopop_sys_syscall32

	ALTENTRY(sys_syscall32)
	SWAPGS				/* kernel gsbase */
	XPV_TRAP_POP

nopop_sys_syscall32:
	movl	%esp, %r10d
	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp
	movl	%eax, %eax

	movl	$U32CS_SEL, REGOFF_CS(%rsp)
	movl	%ecx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
	movq	%r10, REGOFF_RSP(%rsp)
	movl	$UDS_SEL, REGOFF_SS(%rsp)

_syscall32_save:
	movl	%edi, REGOFF_RDI(%rsp)
	movl	%esi, REGOFF_RSI(%rsp)
	movl	%ebp, REGOFF_RBP(%rsp)
	movl	%ebx, REGOFF_RBX(%rsp)
	movl	%edx, REGOFF_RDX(%rsp)
	movl	%ecx, REGOFF_RCX(%rsp)
	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * If we're trying to use TRAPTRACE though, I take that back: we're
	 * probably debugging some problem in the SWAPGS logic and want to know
	 * what the incoming gsbase was.
	 *
	 * Since we already did SWAPGS, record the KGSBASE.
	 */
#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
	movl	$MSR_AMD_KGSBASE, %ecx
	rdmsr
	movl	%eax, REGOFF_GSBASE(%rsp)
	movl	%edx, REGOFF_GSBASE+4(%rsp)
#endif

	/*
	 * Application state saved in the regs structure on the stack
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE32($TT_SYSC)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	/*
	 * Make some space for MAXSYSARGS (currently 8) 32-bit args placed
	 * into 64-bit (long) arg slots, maintaining 16 byte alignment.  Or
	 * more succinctly:
	 *
	 *	SA(MAXSYSARGS * sizeof (long)) == 64
	 */
#define	SYS_DROP	64		/* drop for args */
	subq	$SYS_DROP, %rsp
	movb	$LWP_SYS, LWP_STATE(%r14)
	movq	%r15, %rdi
	movq	%rsp, %rsi
	call	syscall_entry

	/*
	 * Fetch the arguments copied onto the kernel stack and put
	 * them in the right registers to invoke a C-style syscall handler.
	 * %rax contains the handler address.
	 *
	 * Ideas for making all this go faster of course include simply
	 * forcibly fetching 6 arguments from the user stack under lofault
	 * protection, reverting to copyin_args only when watchpoints
	 * are in effect.
	 *
	 * (If we do this, make sure that exec and libthread leave
	 * enough space at the top of the stack to ensure that we'll
	 * never do a fetch from an invalid page.)
	 *
	 * Lots of ideas here, but they won't really help with bringup B-)
	 * Correctness can't wait, performance can wait a little longer ..
	 */

	movq	%rax, %rbx
	movl	0(%rsp), %edi
	movl	8(%rsp), %esi
	movl	0x10(%rsp), %edx
	movl	0x18(%rsp), %ecx
	movl	0x20(%rsp), %r8d
	movl	0x28(%rsp), %r9d

	movq	SY_CALLC(%rbx), %rax
	INDIRECT_CALL_REG(rax)

	movq	%rbp, %rsp	/* pop the args */

	/*
	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
	 * On the 32-bit kernel, they always return that value in %eax:%edx
	 * as required by the 32-bit ABI.
	 *
	 * Simulate the same behaviour by unconditionally splitting the
	 * return value in the same way.
	 */
	movq	%rax, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%eax, %r12d	/* lower 32-bits into %eax */

	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	CLI(%r14)
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_full_syscall_postsys32
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	/*
	 * Clobber %r11 as we check CR0.TS.
	 */
	ASSERT_CR0TS_ZERO(%r11)

	/*
	 * Unlike other cases, because we need to restore the user stack pointer
	 * before exiting the kernel we must clear the microarch state before
	 * getting here.  This should be safe because it means that the only
	 * values on the bus after this are based on the user's registers and
	 * potentially the addresses where we stored them.  Given the constraints
	 * of sysret, that's how it has to be.
	 */
	call	x86_md_clear

	/*
	 * To get back to userland, we need to put the return %rip in %rcx and
	 * the return %rfl in %r11d.  The sysret instruction also arranges
	 * to fix up %cs and %ss; everything else is our responsibility.
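	 *
	 * (Background, not something this code controls: the 32-bit
	 * return via tr_sysretl below reloads %eip from %ecx and %eflags
	 * from %r11, and takes the user %cs/%ss selectors from the STAR
	 * msr that the kernel programmed at cpu init.)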
	 */

	movl	%r12d, %eax			/* %eax: rval1 */
	movl	REGOFF_RBX(%rsp), %ebx
	/* %ecx used for return pointer */
	movl	%r13d, %edx			/* %edx: rval2 */
	movl	REGOFF_RBP(%rsp), %ebp
	movl	REGOFF_RSI(%rsp), %esi
	movl	REGOFF_RDI(%rsp), %edi

	movl	REGOFF_RFL(%rsp), %r11d		/* %r11 -> eflags */
	movl	REGOFF_RIP(%rsp), %ecx		/* %ecx -> %eip */
	movl	REGOFF_RSP(%rsp), %esp

	ASSERT_UPCALL_MASK_IS_SET
	ALTENTRY(nopop_sys_syscall32_swapgs_sysretl)
	jmp	tr_sysretl
	SET_SIZE(nopop_sys_syscall32_swapgs_sysretl)
	/*NOTREACHED*/

_full_syscall_postsys32:
	STI
	/*
	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
	 * so that we can account for the extra work it takes us to finish.
	 */
	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movq	%r15, %rdi
	movq	%r12, %rsi			/* rval1 - %eax */
	movq	%r13, %rdx			/* rval2 - %edx */
	call	syscall_exit
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
	jmp	_sys_rtt
	SET_SIZE(sys_syscall32)
	SET_SIZE(brand_sys_syscall32)

/*
 * System call handler via the sysenter instruction
 * Used only for 32-bit system calls on the 64-bit kernel.
 *
 * The caller in userland has arranged that:
 *
 * -	%eax contains the syscall number
 * -	%ecx contains the user %esp
 * -	%edx contains the return %eip
 * -	the user stack contains the args to the syscall
 *
 * Hardware and (privileged) initialization code have arranged that by
 * the time the sysenter instruction completes:
 *
 * -	%rip is pointing to sys_sysenter (below).
 * -	%cs and %ss are set to kernel text and stack (data) selectors.
 * -	%rsp is pointing at the lwp's stack
 * -	interrupts have been disabled.
 *
 * Note that we are unable to return both "rvals" to userland with
 * this call, as %edx is used by the sysexit instruction.
 *
 * One final complication in this routine is its interaction with
 * single-stepping in a debugger.  For most of the system call mechanisms, the
 * CPU automatically clears the single-step flag before we enter the kernel.
 * The sysenter mechanism does not clear the flag, so a user single-stepping
 * through a libc routine may suddenly find themselves single-stepping through
 * the kernel.  To detect this, kmdb and trap() both compare the trap %pc to
 * the [brand_]sys_sysenter addresses on each single-step trap.  If either
 * finds that we have single-stepped to a sysenter entry point, it explicitly
 * clears the flag and executes the sys_sysenter routine.
 *
 * One final complication in this final complication is the fact that we have
 * two different entry points for sysenter: brand_sys_sysenter and sys_sysenter.
 * If we enter at brand_sys_sysenter and start single-stepping through the
 * kernel with kmdb, we will eventually hit the instruction at sys_sysenter.
 * kmdb cannot distinguish between that valid single-step and the undesirable
 * one mentioned above.  To avoid this situation, we simply add a jump over the
 * instruction at sys_sysenter to make it impossible to single-step to it.
 */

	ENTRY_NP(brand_sys_sysenter)
	SWAPGS				/* kernel gsbase */
	ALTENTRY(_brand_sys_sysenter_post_swapgs)

	BRAND_CALLBACK(BRAND_CB_SYSENTER, BRAND_URET_FROM_REG(%rdx))
	/*
	 * Jump over sys_sysenter to allow single-stepping as described
	 * above.
	 */
	jmp	_sys_sysenter_post_swapgs

	ALTENTRY(sys_sysenter)
	SWAPGS				/* kernel gsbase */
	ALTENTRY(_sys_sysenter_post_swapgs)

	movq	%gs:CPU_THREAD, %r15

	movl	$U32CS_SEL, REGOFF_CS(%rsp)
	movl	%ecx, REGOFF_RSP(%rsp)		/* wrapper: %esp -> %ecx */
	movl	%edx, REGOFF_RIP(%rsp)		/* wrapper: %eip -> %edx */
	/*
	 * NOTE: none of the instructions that run before we get here should
	 * clobber bits in (R)FLAGS! This includes the kpti trampoline.
	 */
	pushfq
	popq	%r10
	movl	$UDS_SEL, REGOFF_SS(%rsp)

	/*
	 * Set the interrupt flag before storing the flags to the
	 * flags image on the stack so we can return to user with
	 * interrupts enabled if we return via sys_rtt_syscall32
	 */
	orq	$PS_IE, %r10
	movq	%r10, REGOFF_RFL(%rsp)

	movl	%edi, REGOFF_RDI(%rsp)
	movl	%esi, REGOFF_RSI(%rsp)
	movl	%ebp, REGOFF_RBP(%rsp)
	movl	%ebx, REGOFF_RBX(%rsp)
	movl	%edx, REGOFF_RDX(%rsp)
	movl	%ecx, REGOFF_RCX(%rsp)
	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * If we're trying to use TRAPTRACE though, I take that back: we're
	 * probably debugging some problem in the SWAPGS logic and want to know
	 * what the incoming gsbase was.
	 *
	 * Since we already did SWAPGS, record the KGSBASE.
	 */
#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
	movl	$MSR_AMD_KGSBASE, %ecx
	rdmsr
	movl	%eax, REGOFF_GSBASE(%rsp)
	movl	%edx, REGOFF_GSBASE+4(%rsp)
#endif

	/*
	 * Application state saved in the regs structure on the stack
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE($TT_SYSENTER)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	/*
	 * Catch 64-bit process trying to issue sysenter instruction
	 * on Nocona based systems.
	 */
	movq	LWP_PROCP(%r14), %rax
	cmpq	$DATAMODEL_ILP32, P_MODEL(%rax)
	je	7f

	/*
	 * For a non-32-bit process, simulate a #ud, since that's what
	 * native hardware does.  The traptrace entry (above) will
	 * let you know what really happened.
	 */
	movq	$T_ILLINST, REGOFF_TRAPNO(%rsp)
	movq	REGOFF_CS(%rsp), %rdi
	movq	%rdi, REGOFF_ERR(%rsp)
	movq	%rsp, %rdi
	movq	REGOFF_RIP(%rsp), %rsi
	movl	%gs:CPU_ID, %edx
	call	trap
	jmp	_sys_rtt
7:

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate calls) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	/*
	 * Make some space for MAXSYSARGS (currently 8) 32-bit args
	 * placed into 64-bit (long) arg slots, plus one 64-bit
	 * (long) arg count, maintaining 16 byte alignment.
	 */
	subq	$SYS_DROP, %rsp
	movb	$LWP_SYS, LWP_STATE(%r14)
	movq	%r15, %rdi
	movq	%rsp, %rsi
	call	syscall_entry

	/*
	 * Fetch the arguments copied onto the kernel stack and put
	 * them in the right registers to invoke a C-style syscall handler.
	 * %rax contains the handler address.
	 */
	movq	%rax, %rbx
	movl	0(%rsp), %edi
	movl	8(%rsp), %esi
	movl	0x10(%rsp), %edx
	movl	0x18(%rsp), %ecx
	movl	0x20(%rsp), %r8d
	movl	0x28(%rsp), %r9d

	movq	SY_CALLC(%rbx), %rax
	INDIRECT_CALL_REG(rax)

	movq	%rbp, %rsp	/* pop the args */

	/*
	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
	 * On the 32-bit kernel, they always return that value in %eax:%edx
	 * as required by the 32-bit ABI.
	 *
	 * Simulate the same behaviour by unconditionally splitting the
	 * return value in the same way.
	 */
	movq	%rax, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%eax, %r12d	/* lower 32-bits into %eax */

	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 *
	 * This cli is undone in the tr_sysexit trampoline code.
	 */
	cli
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_full_syscall_postsys32
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	/*
	 * To get back to userland, load up the 32-bit registers and
	 * sysexit back where we came from.
	 */

	/*
	 * Interrupts will be turned on by the 'sti' executed just before
	 * sysexit.  The following ensures that restoring the user's rflags
	 * doesn't enable interrupts too soon.
	 */
	andq	$_BITNOT(PS_IE), REGOFF_RFL(%rsp)

	/*
	 * Clobber %r11 as we check CR0.TS.
	 */
	ASSERT_CR0TS_ZERO(%r11)

	/*
	 * (There's no point in loading up %edx because the sysexit
	 * mechanism smashes it.)
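	 *
	 * (sysexit itself reloads %eip from %edx and %esp from %ecx, and
	 * derives the user %cs/%ss selectors from the SYSENTER_CS msr set
	 * up by the kernel.)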
	 */
	movl	%r12d, %eax
	movl	REGOFF_RBX(%rsp), %ebx
	movl	REGOFF_RBP(%rsp), %ebp
	movl	REGOFF_RSI(%rsp), %esi
	movl	REGOFF_RDI(%rsp), %edi

	movl	REGOFF_RIP(%rsp), %edx	/* sysexit: %edx -> %eip */
	pushq	REGOFF_RFL(%rsp)
	popfq
	movl	REGOFF_RSP(%rsp), %ecx	/* sysexit: %ecx -> %esp */
	ALTENTRY(sys_sysenter_swapgs_sysexit)
	call	x86_md_clear
	jmp	tr_sysexit
	SET_SIZE(sys_sysenter_swapgs_sysexit)
	SET_SIZE(sys_sysenter)
	SET_SIZE(_sys_sysenter_post_swapgs)
	SET_SIZE(brand_sys_sysenter)

/*
 * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by
 * the generic i386 libc to do system calls.  We do a small amount of setup
 * before jumping into the existing sys_syscall32 path.
 */

	ENTRY_NP(brand_sys_syscall_int)
	SWAPGS				/* kernel gsbase */
	XPV_TRAP_POP
	call	smap_enable
	BRAND_CALLBACK(BRAND_CB_INT91, BRAND_URET_FROM_INTR_STACK())
	jmp	nopop_syscall_int

	ALTENTRY(sys_syscall_int)
	SWAPGS				/* kernel gsbase */
	XPV_TRAP_POP
	call	smap_enable

nopop_syscall_int:
	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp
	movl	%eax, %eax
	/*
	 * Set t_post_sys on this thread to force ourselves out via the slow
	 * path.  It might be possible at some later date to optimize this out
	 * and use a faster return mechanism.
	 */
	movb	$1, T_POST_SYS(%r15)
	CLEAN_CS
	jmp	_syscall32_save
	/*
	 * There should be no instructions between this label and SWAPGS/IRET
	 * or we could end up breaking branded zone support. See the usage of
	 * this label in lx_brand_int80_callback and sn1_brand_int91_callback
	 * for examples.
	 *
	 * We want to swapgs to maintain the invariant that all entries into
	 * tr_iret_user are done on the user gsbase.
	 */
	ALTENTRY(sys_sysint_swapgs_iret)
	call	x86_md_clear
	SWAPGS
	jmp	tr_iret_user
	/*NOTREACHED*/
	SET_SIZE(sys_sysint_swapgs_iret)
	SET_SIZE(sys_syscall_int)
	SET_SIZE(brand_sys_syscall_int)

/*
 * Legacy 32-bit applications and old libc implementations do lcalls;
 * we should never get here because the LDT entry containing the syscall
 * segment descriptor has the "segment present" bit cleared, which means
 * we end up processing those system calls in trap() via a not-present trap.
 *
 * We do it this way because a call gate unhelpfully does -nothing- to the
 * interrupt flag bit, so an interrupt can run us just after the lcall
 * completes, but just before the swapgs takes effect.  Thus the INTR_PUSH and
 * INTR_POP paths would have to be slightly more complex to dance around
 * this problem, and end up depending explicitly on the first
 * instruction of this handler being either swapgs or cli.
 */

	ENTRY_NP(sys_lcall32)
	SWAPGS				/* kernel gsbase */
	pushq	$0
	pushq	%rbp
	movq	%rsp, %rbp
	leaq	__lcall_panic_str(%rip), %rdi
	xorl	%eax, %eax
	call	panic
	SET_SIZE(sys_lcall32)

__lcall_panic_str:
	.string	"sys_lcall32: shouldn't be here!"

/*
 * Declare a uintptr_t which covers the entire pc range of syscall
 * handlers for the stack walkers that need this.
 */
	.align	CPTRSIZE
	.globl	_allsyscalls_size
	.type	_allsyscalls_size, @object
_allsyscalls_size:
	.NWORD	. - _allsyscalls
	SET_SIZE(_allsyscalls_size)

/*
 * These are the thread context handlers for lwps using sysenter/sysexit.
 */

	/*
	 * setting this value to zero as we switch away causes the
	 * stack-pointer-on-sysenter to be NULL, ensuring that we
	 * don't silently corrupt another (preempted) thread stack
	 * when running an lwp that (somehow) didn't get sep_restore'd
	 */
	ENTRY_NP(sep_save)
	xorl	%edx, %edx
	xorl	%eax, %eax
	movl	$MSR_INTC_SEP_ESP, %ecx
	wrmsr
	ret
	SET_SIZE(sep_save)

	/*
	 * Update the kernel stack pointer as we resume onto this cpu.
	 */
	ENTRY_NP(sep_restore)
	movq	%rdi, %rdx
	shrq	$32, %rdx
	movl	%edi, %eax
	movl	$MSR_INTC_SEP_ESP, %ecx
	wrmsr
	ret
	SET_SIZE(sep_restore)