/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1990, 1991 UNIX System Laboratories, Inc.	*/
/*	Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T	*/
/*	  All Rights Reserved	*/

/*	Copyright (c) 1987, 1988 Microsoft Corporation	*/
/*	  All Rights Reserved	*/

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/asm_linkage.h>
#include <sys/asm_misc.h>
#include <sys/regset.h>
#include <sys/psw.h>
#include <sys/x86_archext.h>

#if defined(__lint)

#include <sys/types.h>
#include <sys/thread.h>
#include <sys/systm.h>

#else	/* __lint */

#include <sys/segments.h>
#include <sys/pcb.h>
#include <sys/trap.h>
#include <sys/ftrace.h>
#include <sys/traptrace.h>
#include <sys/clock.h>
#include <sys/panic.h>
#include "assym.h"

#endif	/* __lint */

/*
 * We implement two flavours of system call entry points
 *
 * -	{int,lcall}/iret	(i386)
 * -	sysenter/sysexit	(Pentium II and beyond)
 *
 * The basic pattern used in the handlers is to check to see if we can
 * do fast (simple) version of the system call; if we can't we use various
 * C routines that handle corner cases and debugging.
 *
 * To reduce the amount of assembler replication, yet keep the system call
 * implementations vaguely comprehensible, the common code in the body
 * of the handlers is broken up into a set of preprocessor definitions
 * below.
 */

/*
 * When we have SYSCALLTRACE defined, we sneak an extra
 * predicate into a couple of tests.
 */
#if defined(SYSCALLTRACE)
#define	ORL_SYSCALLTRACE(r32)	\
	orl	syscalltrace, r32
#else
#define	ORL_SYSCALLTRACE(r32)
#endif

/*
 * This check is false whenever we want to go fast i.e.
 *
 *	if (code >= NSYSCALL ||
 *	    t->t_pre_sys || (t->t_proc_flag & TP_WATCHPT) != 0)
 *		do full version
 * #ifdef SYSCALLTRACE
 *	if (syscalltrace)
 *		do full version
 * #endif
 *
 * Preconditions:
 * -	t	curthread
 * -	code	contains the syscall number
 * Postconditions:
 * -	%ecx and %edi are smashed
 * -	condition code flag ZF is cleared if pre-sys is too complex
 */
#define	CHECK_PRESYS_NE(t, code)		\
	movzbl	T_PRE_SYS(t), %edi;		\
	movzwl	T_PROC_FLAG(t), %ecx;		\
	andl	$TP_WATCHPT, %ecx;		\
	orl	%ecx, %edi;			\
	cmpl	$NSYSCALL, code;		\
	setae	%cl;				\
	movzbl	%cl, %ecx;			\
	orl	%ecx, %edi;			\
	ORL_SYSCALLTRACE(%edi)

/*
 * Account an LMS microstate transition around syscall entry/exit.
 * Calls syscall_mstate(from, to) with cdecl args, then pops them.
 */
#define	MSTATE_TRANSITION(from, to)		\
	pushl	$to;				\
	pushl	$from;				\
	call	syscall_mstate;			\
	addl	$0x8, %esp

/*
 * aka CPU_STATS_ADDQ(CPU, sys.syscall, 1)
 * This must be called with interrupts or preemption disabled.
 *
 * The counter is 64 bits wide: add to the low 32 bits, then
 * propagate the carry into the high 32 bits at offset +4.
 */
#define	CPU_STATS_SYS_SYSCALL_INC			\
	addl	$1, %gs:CPU_STATS_SYS_SYSCALL;		\
	adcl	$0, %gs:CPU_STATS_SYS_SYSCALL+4;

#if !defined(__lint)

/*
 * ASSERT(lwptoregs(lwp) == rp);
 *
 * this may seem obvious, but very odd things happen if this
 * assertion is false
 *
 * Preconditions:
 *	-none-
 * Postconditions (if assertion is true):
 *	%esi and %edi are smashed
 */
#if defined(DEBUG)

__lwptoregs_msg:
	.string	"%M%:%d lwptoregs(%p) [%p] != rp [%p]"

#define	ASSERT_LWPTOREGS(t, rp)			\
	movl	T_LWP(t), %esi;			\
	movl	LWP_REGS(%esi), %edi;		\
	cmpl	rp, %edi;			\
	je	7f;				\
	pushl	rp;				\
	pushl	%edi;				\
	pushl	%esi;				\
	pushl	$__LINE__;			\
	pushl	$__lwptoregs_msg;		\
	call	panic;				\
7:
#else
#define	ASSERT_LWPTOREGS(t, rp)
#endif

#endif	/* __lint */

/*
 * This is an assembler version of this fragment:
 *
 * lwp->lwp_state = LWP_SYS;
 * lwp->lwp_ru.sysc++;
 * lwp->lwp_eosys = NORMALRETURN;
 * lwp->lwp_ap = argp;
 *
 * (lwp_ru.sysc is a 64-bit counter, hence the addl/adcl pair.)
 *
 * Preconditions:
 *	-none-
 * Postconditions:
 *	-none-
 */
#define	SET_LWP(lwp, argp)				\
	movb	$LWP_SYS, LWP_STATE(lwp);		\
	addl	$1, LWP_RU_SYSC(lwp);			\
	adcl	$0, LWP_RU_SYSC+4(lwp);			\
	movb	$NORMALRETURN, LWP_EOSYS(lwp);		\
	movl	argp, LWP_AP(lwp)

/*
 * Set up the thread, lwp, find the handler, and copy
 * in the arguments from userland to the kernel stack.
 *
 * The copy is narg 32-bit words via rep smovl, from the user stack
 * (r_uesp + 4 -- skipping what is presumably the return-address slot;
 * confirm against the lcall/int calling sequence) down to the freshly
 * SYS_DROPped kernel stack, with t_lofault set to faultlabel so a bad
 * user pointer vectors to the fault path instead of panicking.  After
 * the rep, %ecx is zero, so the final movl conveniently clears
 * t_lofault again.
 *
 * Preconditions:
 * -	%eax contains the syscall number
 * Postconditions:
 * -	%eax contains a pointer to the sysent structure
 * -	%ecx is zeroed
 * -	%esi, %edi are smashed
 * -	%esp is SYS_DROPped ready for the syscall
 */
#define	SIMPLE_SYSCALL_PRESYS(t, faultlabel)		\
	movl	T_LWP(t), %esi;				\
	movw	%ax, T_SYSNUM(t);			\
	subl	$SYS_DROP, %esp;			\
	shll	$SYSENT_SIZE_SHIFT, %eax;		\
	SET_LWP(%esi, %esp);				\
	leal	sysent(%eax), %eax;			\
	movzbl	SY_NARG(%eax), %ecx;			\
	testl	%ecx, %ecx;				\
	jz	4f;					\
	movl	%esp, %edi;				\
	movl	SYS_DROP + REGOFF_UESP(%esp), %esi;	\
	movl	$faultlabel, T_LOFAULT(t);		\
	addl	$4, %esi;				\
	rep;						\
	smovl;						\
	movl	%ecx, T_LOFAULT(t);			\
4:

/*
 * Check to see if a simple return is possible i.e.
 *
 *	if ((t->t_post_sys_ast | syscalltrace) != 0)
 *		do full version;
 *
 * Preconditions:
 * -	t is curthread
 * Postconditions:
 * -	condition code NE is set if post-sys is too complex
 * -	rtmp is zeroed if it isn't (we rely on this!)
 */
#define	CHECK_POSTSYS_NE(t, rtmp)		\
	xorl	rtmp, rtmp;			\
	ORL_SYSCALLTRACE(rtmp);			\
	orl	T_POST_SYS_AST(t), rtmp;	\
	cmpl	$0, rtmp

/*
 * Fix up the lwp, thread, and eflags for a successful return
 *
 * zwreg must be a 16-bit register holding zero; it is stored into
 * t_sysnum to clear it.  The final andb masks PS_C (carry) out of the
 * low byte of the saved user EFLAGS -- NOTE(review): presumably the
 * cleared carry signals "no error" to the userland syscall stub;
 * confirm against the libc bindings.
 *
 * Preconditions:
 * -	zwreg contains zero
 * Postconditions:
 * -	%esp has been unSYS_DROPped
 * -	%esi is smashed (points to lwp)
 */
#define	SIMPLE_SYSCALL_POSTSYS(t, zwreg)		\
	movl	T_LWP(t), %esi;				\
	addl	$SYS_DROP, %esp;			\
	movw	zwreg, T_SYSNUM(t);			\
	movb	$LWP_USER, LWP_STATE(%esi);		\
	andb	$_CONST(0xffff - PS_C), REGOFF_EFL(%esp)

/*
 * System call handler.  This is the destination of both the call
 * gate (lcall 0x27) _and_ the interrupt gate (int 0x91). For our purposes,
 * there are two significant differences between an interrupt gate and a call
 * gate:
 *
 * 1) An interrupt gate runs the handler with interrupts disabled, whereas a
 * call gate runs the handler with whatever EFLAGS settings were in effect at
 * the time of the call.
 *
 * 2) An interrupt gate pushes the contents of the EFLAGS register at the time
 * of the interrupt onto the stack, whereas a call gate does not.
 *
 * Because we use the following code sequence to handle system calls made from
 * _both_ a call gate _and_ an interrupt gate, these two differences must be
 * respected. In regards to number 1) above, the handler must ensure that a sane
 * EFLAGS snapshot is stored on the stack so that when the kernel returns back
 * to the user via iret (which returns to user with the EFLAGS value saved on
 * the stack), interrupts are re-enabled.
 *
 * In regards to number 2) above, the handler must always put a current snapshot
 * of EFLAGS onto the stack in the appropriate place. If we came in via an
 * interrupt gate, we will be clobbering the EFLAGS value that was pushed by
 * the interrupt gate. This is OK, as the only bit that was changed by the
 * hardware was the IE (interrupt enable) bit, which for an interrupt gate is
 * now off. If we were to do nothing, the stack would contain an EFLAGS with
 * IE off, resulting in us eventually returning back to the user with interrupts
 * disabled. The solution is to turn on the IE bit in the EFLAGS value saved on
 * the stack.
 *
 * Another subtlety which deserves mention is the difference between the two
 * descriptors. The call gate descriptor is set to instruct the hardware to copy
 * one parameter from the user stack to the kernel stack, whereas the interrupt
 * gate descriptor doesn't use the parameter passing mechanism at all. The
 * kernel doesn't actually use the parameter that is copied by the hardware; the
 * only reason it does this is so that there is a space on the stack large
 * enough to hold an EFLAGS register value, which happens to be in the correct
 * place for use by iret when we go back to userland. How convenient.
 *
 * Stack frame description in syscall() and callees.
 *
 *	|------------|
 *	| regs	     | +(8*4)+4	registers
 *	|------------|
 *	| 8 args     | <- %esp	MAXSYSARGS (currently 8) arguments
 *	|------------|
 *
 */
#define	SYS_DROP	_CONST(_MUL(MAXSYSARGS, 4))

#if defined(__lint)

/*ARGSUSED*/
void
sys_call()
{}

void
_allsyscalls()
{}

size_t _allsyscalls_size;

#else	/* __lint */

	ENTRY_NP2(sys_call, _allsyscalls)

	/ on entry	eax = system call number
	/ set up the stack to look as in reg.h
	subl	$8, %esp	/ pad the stack with ERRCODE and TRAPNO

	SYSCALL_PUSH

#ifdef TRAPTRACE
	TRACE_PTR(%edi, %ebx, %ebx, %ecx, $TT_SYSCALL)	/ Uses labels "8" and "9"
	TRACE_REGS(%edi, %esp, %ebx, %ecx)		/ Uses label "9"
	pushl	%eax
	TRACE_STAMP(%edi)		/ Clobbers %eax, %edx, uses "9"
	popl	%eax
	movl	%eax, TTR_SYSNUM(%edi)
#endif

_watch_do_syscall:
	movl	%esp, %ebp

	pushl	%eax			/ preserve syscall# across mstate call
	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	popl	%eax

	movl	%gs:CPU_THREAD, %ebx	/ %ebx = curthread for the rest of entry

	/ Interrupts are enabled here, so we must make sure this thread doesn't
	/ migrate off the CPU while it updates the CPU stats.
	addb	$1, T_PREEMPT(%ebx)
	CPU_STATS_SYS_SYSCALL_INC
	subb	$1, T_PREEMPT(%ebx)

	/ Set EFLAGS to standard kernel settings.
	ENABLE_INTR_FLAGS

	ASSERT_LWPTOREGS(%ebx, %esp)

	/ Fast path unless pre-syscall work (pre-sys, watchpoints,
	/ syscalltrace, or a bad syscall number) forces the full version.
	CHECK_PRESYS_NE(%ebx, %eax)
	jne	_full_syscall_presys
	SIMPLE_SYSCALL_PRESYS(%ebx, _syscall_fault)

_syslcall_call:
	call	*SY_CALLC(%eax)		/ invoke the handler from the sysent entry

_syslcall_done:
	CHECK_POSTSYS_NE(%ebx, %ecx)
	jne	_full_syscall_postsys
	SIMPLE_SYSCALL_POSTSYS(%ebx, %cx)	/ %cx is zero here (see macro comment)
	movl	%eax, REGOFF_EAX(%esp)	/ store both return values into the
	movl	%edx, REGOFF_EDX(%esp)	/ saved user registers

	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/
	/ get back via iret
	/
	cli
	jmp	set_user_regs

_full_syscall_presys:
	/ Slow path: let C do the work.  syscall_entry(t, argp) with the
	/ arg area carved out of the stack first.
	movl	T_LWP(%ebx), %esi
	subl	$SYS_DROP, %esp
	movb	$LWP_SYS, LWP_STATE(%esi)
	pushl	%esp
	pushl	%ebx
	call	syscall_entry
	addl	$8, %esp
	jmp	_syslcall_call

_full_syscall_postsys:
	/ Slow path return: syscall_exit(t, rval1, rval2), then the common
	/ kernel-to-user return code.
	addl	$SYS_DROP, %esp
	pushl	%edx
	pushl	%eax
	pushl	%ebx
	call	syscall_exit
	addl	$12, %esp
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
	jmp	sys_rtt_syscall

_syscall_fault:
	/ t_lofault vectored us here: copying the user's arguments faulted.
	push	$0xe			/ EFAULT
	call	set_errno
	addl	$4, %esp
	xorl	%eax, %eax		/ fake syscall_err()
	xorl	%edx, %edx
	jmp	_syslcall_done
	SET_SIZE(sys_call)

#endif	/* __lint */

/*
 * System call handler via the sysenter instruction
 *
 * Here's how syscall entry usually works (see sys_call for details).
 *
 * There, the caller (lcall or int) in userland has arranged that:
 *
 * -	%eax contains the syscall number
 * -	the user stack contains the args to the syscall
 *
 * Normally the lcall instruction into the call gate causes the processor
 * to push %ss, %esp, <top-of-stack>, %cs, %eip onto the kernel stack.
 * The sys_call handler then leaves space for r_trapno and r_err, and
 * pusha's {%eax, %ecx, %edx, %ebx, %esp, %ebp, %esi, %edi}, followed
 * by %ds, %es, %fs and %gs to capture a 'struct regs' on the stack.
 * Then the kernel sets %ds, %es and %gs to kernel selectors, and finally
 * extracts %efl and puts it into r_efl (which happens to live at the offset
 * that <top-of-stack> was copied into). Note that the value in r_efl has
 * the IF (interrupt enable) flag turned on. (The int instruction into the
 * interrupt gate does essentially the same thing, only instead of
 * <top-of-stack> we get eflags - see comment above.)
 *
 * In the sysenter case, things are a lot more primitive.
 *
 * The caller in userland has arranged that:
 *
 * -	%eax contains the syscall number
 * -	%ecx contains the user %esp
 * -	%edx contains the return %eip
 * -	the user stack contains the args to the syscall
 *
 * e.g.
 *	<args on the stack>
 *	mov	$SYS_callnum, %eax
 *	mov	$1f, %edx	/ return %eip
 *	mov	%esp, %ecx	/ return %esp
 *	sysenter
 * 1:
 *
 * Hardware and (privileged) initialization code have arranged that by
 * the time the sysenter instructions completes:
 *
 * - %eip is pointing to sys_sysenter (below).
 * - %cs and %ss are set to kernel text and stack (data) selectors.
 * - %esp is pointing at the lwp's stack
 * - Interrupts have been disabled.
 *
 * The task for the sysenter handler is:
 *
 * -	recreate the same regs structure on the stack and the same
 *	kernel state as if we'd come in on an lcall
 * -	do the normal work of a syscall
 * -	execute the system call epilogue, use sysexit to return to userland.
 *
 * Note that we are unable to return both "rvals" to userland with this
 * call, as %edx is used by the sysexit instruction.
 */
#if defined(__lint)

void
sys_sysenter()
{}

#else	/* __lint */

	ENTRY_NP(sys_sysenter)
	/
	/ do what the call gate would've done to the stack ..
	/
	pushl	$UDS_SEL	/ (really %ss, but it's the same ..)
	pushl	%ecx		/ userland makes this a copy of %esp
	pushfl
	orl	$PS_IE, (%esp)	/ turn interrupts on when we return to user
	pushl	$UCS_SEL
	pushl	%edx		/ userland makes this a copy of %eip
	/
	/ done.  finish building the stack frame
	/
	subl	$8, %esp	/ leave space for ERR and TRAPNO

	SYSENTER_PUSH

#ifdef TRAPTRACE
	TRACE_PTR(%edi, %ebx, %ebx, %ecx, $TT_SYSENTER)	/ uses labels 8 and 9
	TRACE_REGS(%edi, %esp, %ebx, %ecx)		/ uses label 9
	pushl	%eax
	TRACE_STAMP(%edi)		/ clobbers %eax, %edx, uses label 9
	popl	%eax
	movl	%eax, TTR_SYSNUM(%edi)
#endif
	movl	%esp, %ebp

	/ NOTE(review): unlike sys_call, the stats bump is done without the
	/ t_preempt bracket -- interrupts are still off here (sysenter
	/ disabled them; ENABLE_INTR_FLAGS follows), which satisfies the
	/ macro's "interrupts or preemption disabled" requirement.
	CPU_STATS_SYS_SYSCALL_INC

	ENABLE_INTR_FLAGS

	pushl	%eax			/ preserve syscall# across mstate call
	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	popl	%eax

	movl	%gs:CPU_THREAD, %ebx	/ %ebx = curthread for the rest of entry

	ASSERT_LWPTOREGS(%ebx, %esp)

	/ Fast path unless pre-syscall work forces the full version
	/ (shared with sys_call).
	CHECK_PRESYS_NE(%ebx, %eax)
	jne	_full_syscall_presys
	SIMPLE_SYSCALL_PRESYS(%ebx, _syscall_fault)

_sysenter_call:
	call	*SY_CALLC(%eax)		/ invoke the handler from the sysent entry

_sysenter_done:
	CHECK_POSTSYS_NE(%ebx, %ecx)
	jne	_full_syscall_postsys
	SIMPLE_SYSCALL_POSTSYS(%ebx, %cx)	/ %cx is zero here (see macro comment)
	/
	/ sysexit uses %edx to restore %eip, so we can't use it
	/ to return a value, sigh.
	/
	movl	%eax, REGOFF_EAX(%esp)
	/ movl	%edx, REGOFF_EDX(%esp)

	/ Interrupts will be turned on by the 'sti' executed just before
	/ sysexit. The following ensures that restoring the user's EFLAGS
	/ doesn't enable interrupts too soon.
	andl	$_BITNOT(PS_IE), REGOFF_EFL(%esp)

	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	cli

	SYSCALL_POP

	popl	%edx			/ sysexit: %edx -> %eip
	addl	$4, %esp		/ get CS off the stack
	popfl				/ EFL
	popl	%ecx			/ sysexit: %ecx -> %esp
	sti
	sysexit
	SET_SIZE(sys_sysenter)

/*
 * Declare a uintptr_t which covers the entire pc range of syscall
 * handlers for the stack walkers that need this.
 */
	.align	CPTRSIZE
	.globl	_allsyscalls_size
	.type	_allsyscalls_size, @object
_allsyscalls_size:
	.NWORD	. - _allsyscalls
	SET_SIZE(_allsyscalls_size)

#endif	/* __lint */

/*
 * These are the thread context handlers for lwps using sysenter/sysexit.
 */

#if defined(__lint)

/*ARGSUSED*/
void
sep_save(void *ksp)
{}

/*ARGSUSED*/
void
sep_restore(void *ksp)
{}

#else	/* __lint */

	/*
	 * setting this value to zero as we switch away causes the
	 * stack-pointer-on-sysenter to be NULL, ensuring that we
	 * don't silently corrupt another (preempted) thread stack
	 * when running an lwp that (somehow) didn't get sep_restore'd
	 *
	 * (wrmsr writes %edx:%eax into the MSR selected by %ecx,
	 * here MSR_INTC_SEP_ESP, the SYSENTER_ESP_MSR.)
	 */
	ENTRY_NP(sep_save)
	xorl	%edx, %edx
	xorl	%eax, %eax
	movl	$MSR_INTC_SEP_ESP, %ecx
	wrmsr
	ret
	SET_SIZE(sep_save)

	/*
	 * Update the kernel stack pointer as we resume onto this cpu.
	 */
	ENTRY_NP(sep_restore)
	movl	4(%esp), %eax	/* per-lwp kernel sp */
	xorl	%edx, %edx	/* high 32 bits of the MSR value are zero */
	movl	$MSR_INTC_SEP_ESP, %ecx
	wrmsr
	ret
	SET_SIZE(sep_restore)

#endif	/* __lint */

/*
 * Call syscall().  Called from trap() on watchpoint at lcall 0,7
 */

#if defined(__lint)

void
watch_syscall(void)
{}

#else	/* __lint */

	ENTRY_NP(watch_syscall)
	movl	%gs:CPU_THREAD, %ebx
	movl	T_STACK(%ebx), %esp	/ switch to the thread stack
	movl	REGOFF_EAX(%esp), %eax	/ recover original syscall#
	jmp	_watch_do_syscall
	SET_SIZE(watch_syscall)

#endif	/* __lint */