/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */ /* All Rights Reserved */ /* Copyright (c) 1987, 1988 Microsoft Corporation */ /* All Rights Reserved */ #pragma ident "%Z%%M% %I% %E% SMI" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void setup_ldt(proc_t *pp); static void *ldt_map(proc_t *pp, uint_t seli); static void ldt_free(proc_t *pp); extern void rtcsync(void); extern long ggmtl(void); extern void sgmtl(long); /* * sysi86 System Call */ /* ARGSUSED */ int sysi86(short cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3) { struct ssd ssd; int error = 0; int c; proc_t *pp = curproc; switch (cmd) { /* * The SI86V86 subsystem call of the SYSI86 system call * supports only one subcode -- V86SC_IOPL. 
*/ case SI86V86: if (arg1 == V86SC_IOPL) { struct regs *rp = lwptoregs(ttolwp(curthread)); greg_t oldpl = rp->r_ps & PS_IOPL; greg_t newpl = arg2 & PS_IOPL; /* * Must be privileged to run this system call * if giving more io privilege. */ if (newpl > oldpl && (error = secpolicy_sys_config(CRED(), B_FALSE)) != 0) return (set_errno(error)); rp->r_ps ^= oldpl ^ newpl; } else error = EINVAL; break; /* * Set a segment descriptor */ case SI86DSCR: /* * There are considerable problems here manipulating * resources shared by many running lwps. Get everyone * into a safe state before changing the LDT. */ if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK1)) { error = EINTR; break; } if (get_udatamodel() == DATAMODEL_LP64) { error = EINVAL; break; } if (copyin((caddr_t)arg1, &ssd, sizeof (ssd)) < 0) { error = EFAULT; break; } error = setdscr(&ssd); mutex_enter(&pp->p_lock); if (curthread != pp->p_agenttp) continuelwps(pp); mutex_exit(&pp->p_lock); break; case SI86FPHW: c = fp_kind & 0xff; if (suword32((void *)arg1, c) == -1) error = EFAULT; break; case SI86FPSTART: /* * arg1 is the address of _fp_hw * arg2 is the desired x87 FCW value * arg3 is the desired SSE MXCSR value * a return value of one means SSE hardware, else none. */ c = fp_kind & 0xff; if (suword32((void *)arg1, c) == -1) { error = EFAULT; break; } fpsetcw((uint16_t)arg2, (uint32_t)arg3); return (fp_kind == __FP_SSE ? 1 : 0); /* real time clock management commands */ case WTODC: if ((error = secpolicy_settime(CRED())) == 0) { timestruc_t ts; mutex_enter(&tod_lock); gethrestime(&ts); tod_set(ts); mutex_exit(&tod_lock); } break; /* Give some timezone playing room */ #define ONEWEEK (7 * 24 * 60 * 60) case SGMTL: /* * Called from 32 bit land, negative values * are not sign extended, so we do that here * by casting it to an int and back. We also * clamp the value to within reason and detect * when a 64 bit call overflows an int. 
*/ if ((error = secpolicy_settime(CRED())) == 0) { int newlag = (int)arg1; #ifdef _SYSCALL32_IMPL if (get_udatamodel() == DATAMODEL_NATIVE && (long)newlag != (long)arg1) { error = EOVERFLOW; } else #endif if (newlag >= -ONEWEEK && newlag <= ONEWEEK) sgmtl(newlag); else error = EOVERFLOW; } break; case GGMTL: if (get_udatamodel() == DATAMODEL_NATIVE) { if (sulword((void *)arg1, ggmtl()) == -1) error = EFAULT; #ifdef _SYSCALL32_IMPL } else { time_t gmtl; if ((gmtl = ggmtl()) > INT32_MAX) { /* * Since gmt_lag can at most be * +/- 12 hours, something is * *seriously* messed up here. */ error = EOVERFLOW; } else if (suword32((void *)arg1, (int32_t)gmtl) == -1) error = EFAULT; #endif } break; case RTCSYNC: if ((error = secpolicy_settime(CRED())) == 0) rtcsync(); break; /* END OF real time clock management commands */ default: error = EINVAL; break; } return (error == 0 ? 0 : set_errno(error)); } void usd_to_ssd(user_desc_t *usd, struct ssd *ssd, selector_t sel) { ssd->bo = USEGD_GETBASE(usd); ssd->ls = USEGD_GETLIMIT(usd); ssd->sel = sel; /* * set type, dpl and present bits. */ ssd->acc1 = usd->usd_type; ssd->acc1 |= usd->usd_dpl << 5; ssd->acc1 |= usd->usd_p << (5 + 2); /* * set avl, DB and granularity bits. */ ssd->acc2 = usd->usd_avl; #if defined(__amd64) ssd->acc2 |= usd->usd_long << 1; #else ssd->acc2 |= usd->usd_reserved << 1; #endif ssd->acc2 |= usd->usd_def32 << (1 + 1); ssd->acc2 |= usd->usd_gran << (1 + 1 + 1); } static void ssd_to_usd(struct ssd *ssd, user_desc_t *usd) { USEGD_SETBASE(usd, ssd->bo); USEGD_SETLIMIT(usd, ssd->ls); /* * set type, dpl and present bits. */ usd->usd_type = ssd->acc1; usd->usd_dpl = ssd->acc1 >> 5; usd->usd_p = ssd->acc1 >> (5 + 2); ASSERT(usd->usd_type >= SDT_MEMRO); ASSERT(usd->usd_dpl == SEL_UPL); /* * set avl, DB and granularity bits. 
*/ usd->usd_avl = ssd->acc2; #if defined(__amd64) usd->usd_long = ssd->acc2 >> 1; #else usd->usd_reserved = ssd->acc2 >> 1; #endif usd->usd_def32 = ssd->acc2 >> (1 + 1); usd->usd_gran = ssd->acc2 >> (1 + 1 + 1); } static void ssd_to_sgd(struct ssd *ssd, gate_desc_t *sgd) { sgd->sgd_looffset = ssd->bo; sgd->sgd_hioffset = ssd->bo >> 16; sgd->sgd_selector = ssd->ls; /* * set type, dpl and present bits. */ sgd->sgd_type = ssd->acc1; sgd->sgd_dpl = ssd->acc1 >> 5; sgd->sgd_p = ssd->acc1 >> 7; ASSERT(sgd->sgd_type == SDT_SYSCGT); ASSERT(sgd->sgd_dpl == SEL_UPL); #if defined(__i386) /* reserved, ignored in amd64 */ sgd->sgd_stkcpy = 0; #endif } /* * Load LDT register with the current process's LDT. */ void ldt_load(void) { /* */ *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = curproc->p_ldt_desc; wr_ldtr(ULDT_SEL); } /* * Store a NULL selector in the LDTR. All subsequent illegal references to * the LDT will result in a #gp. */ void ldt_unload(void) { CPU->cpu_gdt[GDT_LDT] = zero_udesc; wr_ldtr(0); } /*ARGSUSED*/ static void ldt_savectx(proc_t *p) { ASSERT(p->p_ldt != NULL); ASSERT(p == curproc); #if defined(__amd64) /* * The 64-bit kernel must be sure to clear any stale ldt * selectors when context switching away from a process that * has a private ldt. Consider the following example: * * Wine creats a ldt descriptor and points a segment register * to it. * * We then context switch away from wine lwp to kernel * thread and hit breakpoint in kernel with kmdb * * When we continue and resume from kmdb we will #gp * fault since kmdb will have saved the stale ldt selector * from wine and will try to restore it but we are no longer in * the context of the wine process and do not have our * ldtr register pointing to the private ldt. 
*/ clr_ldt_sregs(); #endif ldt_unload(); cpu_fast_syscall_enable(NULL); } static void ldt_restorectx(proc_t *p) { ASSERT(p->p_ldt != NULL); ASSERT(p == curproc); ldt_load(); cpu_fast_syscall_disable(NULL); } /* * When a process with a private LDT execs, fast syscalls must be enabled for * the new process image. */ /* ARGSUSED */ static void ldt_freectx(proc_t *p, int isexec) { ASSERT(p->p_ldt); if (isexec) { kpreempt_disable(); cpu_fast_syscall_enable(NULL); kpreempt_enable(); } /* * ldt_free() will free the memory used by the private LDT, reset the * process's descriptor, and re-program the LDTR. */ ldt_free(p); } /* * Install ctx op that ensures syscall/sysenter are disabled. * See comments below. * * When a thread with a private LDT forks, the new process * must have the LDT context ops installed. */ /* ARGSUSED */ static void ldt_installctx(proc_t *p, proc_t *cp) { proc_t *targ = p; kthread_t *t; /* * If this is a fork, operate on the child process. */ if (cp != NULL) { targ = cp; ldt_dup(p, cp); } /* * The process context ops expect the target process as their argument. */ ASSERT(removepctx(targ, targ, ldt_savectx, ldt_restorectx, ldt_installctx, ldt_savectx, ldt_freectx) == 0); installpctx(targ, targ, ldt_savectx, ldt_restorectx, ldt_installctx, ldt_savectx, ldt_freectx); /* * We've just disabled fast system call and return instructions; take * the slow path out to make sure we don't try to use one to return * back to user. We must set t_post_sys for every thread in the * process to make sure none of them escape out via fast return. */ mutex_enter(&targ->p_lock); t = targ->p_tlist; do { t->t_post_sys = 1; } while ((t = t->t_forw) != targ->p_tlist); mutex_exit(&targ->p_lock); } int setdscr(struct ssd *ssd) { ushort_t seli; /* selector index */ user_desc_t *dscrp; /* descriptor pointer */ proc_t *pp = ttoproc(curthread); /* * LDT segments: executable and data at DPL 3 only. 
*/ if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel)) return (EINVAL); /* * check the selector index. */ seli = SELTOIDX(ssd->sel); if (seli >= MAXNLDT || seli < LDT_UDBASE) return (EINVAL); mutex_enter(&pp->p_ldtlock); /* * If this is the first time for this process then setup a * private LDT for it. */ if (pp->p_ldt == NULL) { kpreempt_disable(); setup_ldt(pp); /* * Now that this process has a private LDT, the use of * the syscall/sysret and sysenter/sysexit instructions * is forbidden for this processes because they destroy * the contents of %cs and %ss segment registers. * * Explicity disable them here and add a context handler * to the process. Note that disabling * them here means we can't use sysret or sysexit on * the way out of this system call - so we force this * thread to take the slow path (which doesn't make use * of sysenter or sysexit) back out. */ ldt_installctx(pp, NULL); cpu_fast_syscall_disable(NULL); ASSERT(curthread->t_post_sys != 0); wr_ldtr(ULDT_SEL); kpreempt_enable(); } if (ldt_map(pp, seli) == NULL) { mutex_exit(&pp->p_ldtlock); return (ENOMEM); } ASSERT(seli <= pp->p_ldtlimit); dscrp = &pp->p_ldt[seli]; /* * On the 64-bit kernel, this is where things get more subtle. * Recall that in the 64-bit kernel, when we enter the kernel we * deliberately -don't- reload the segment selectors we came in on * for %ds, %es, %fs or %gs. Messing with selectors is expensive, * and the underlying descriptors are essentially ignored by the * hardware in long mode - except for the base that we override with * the gsbase MSRs. * * However, there's one unfortunate issue with this rosy picture -- * a descriptor that's not marked as 'present' will still generate * an #np when loading a segment register. * * Consider this case. An lwp creates a harmless LDT entry, points * one of it's segment registers at it, then tells the kernel (here) * to delete it. 
In the 32-bit kernel, the #np will happen on the * way back to userland where we reload the segment registers, and be * handled in kern_gpfault(). In the 64-bit kernel, the same thing * will happen in the normal case too. However, if we're trying to * use a debugger that wants to save and restore the segment registers, * and the debugger things that we have valid segment registers, we * have the problem that the debugger will try and restore the * segment register that points at the now 'not present' descriptor * and will take a #np right there. * * We should obviously fix the debugger to be paranoid about * -not- restoring segment registers that point to bad descriptors; * however we can prevent the problem here if we check to see if any * of the segment registers are still pointing at the thing we're * destroying; if they are, return an error instead. (That also seems * a lot better failure mode than SIGKILL and a core file * from kern_gpfault() too.) */ if (SI86SSD_PRES(ssd) == 0) { kthread_t *t; int bad = 0; /* * Look carefully at the segment registers of every lwp * in the process (they're all stopped by our caller). * If we're about to invalidate a descriptor that's still * being referenced by *any* of them, return an error, * rather than having them #gp on their way out of the kernel. 
*/ ASSERT(pp->p_lwprcnt == 1); mutex_enter(&pp->p_lock); t = pp->p_tlist; do { klwp_t *lwp = ttolwp(t); struct regs *rp = lwp->lwp_regs; #if defined(__amd64) pcb_t *pcb = &lwp->lwp_pcb; #endif if (ssd->sel == rp->r_cs || ssd->sel == rp->r_ss) { bad = 1; break; } #if defined(__amd64) if (pcb->pcb_flags & RUPDATE_PENDING) { if (ssd->sel == pcb->pcb_ds || ssd->sel == pcb->pcb_es || ssd->sel == pcb->pcb_fs || ssd->sel == pcb->pcb_gs) { bad = 1; break; } } else #endif { if (ssd->sel == rp->r_ds || ssd->sel == rp->r_es || ssd->sel == rp->r_fs || ssd->sel == rp->r_gs) { bad = 1; break; } } } while ((t = t->t_forw) != pp->p_tlist); mutex_exit(&pp->p_lock); if (bad) { mutex_exit(&pp->p_ldtlock); return (EBUSY); } } /* * If acc1 is zero, clear the descriptor (including the 'present' bit) */ if (ssd->acc1 == 0) { bzero(dscrp, sizeof (*dscrp)); mutex_exit(&pp->p_ldtlock); return (0); } /* * Check segment type, allow segment not present and * only user DPL (3). */ if (SI86SSD_DPL(ssd) != SEL_UPL) { mutex_exit(&pp->p_ldtlock); return (EINVAL); } #if defined(__amd64) /* * Do not allow 32-bit applications to create 64-bit mode code * segments. */ if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 && SI86SSD_ISLONG(ssd)) { mutex_exit(&pp->p_ldtlock); return (EINVAL); } #endif /* __amd64 */ /* * Set up a code or data user segment descriptor. */ if (SI86SSD_ISUSEG(ssd)) { ssd_to_usd(ssd, dscrp); mutex_exit(&pp->p_ldtlock); return (0); } /* * Allow a call gate only if the destination is in the LDT. */ if (SI86SSD_TYPE(ssd) == SDT_SYSCGT && SELISLDT(ssd->ls)) { ssd_to_sgd(ssd, (gate_desc_t *)dscrp); mutex_exit(&pp->p_ldtlock); return (0); } mutex_exit(&pp->p_ldtlock); return (EINVAL); } /* * Allocate a private LDT for this process and initialize it with the * default entries. */ static void setup_ldt(proc_t *pp) { user_desc_t *ldtp; /* descriptor pointer */ pgcnt_t npages = btopr(MAXNLDT * sizeof (user_desc_t)); /* * Allocate maximum virtual space we need for this LDT. 
*/ ldtp = vmem_alloc(heap_arena, ptob(npages), VM_SLEEP); /* * Allocate the minimum number of physical pages for LDT. */ (void) segkmem_xalloc(NULL, ldtp, MINNLDT * sizeof (user_desc_t), VM_SLEEP, 0, segkmem_page_create, NULL); bzero(ldtp, ptob(btopr(MINNLDT * sizeof (user_desc_t)))); kpreempt_disable(); /* Update proc structure. XXX - need any locks here??? */ set_syssegd(&pp->p_ldt_desc, ldtp, MINNLDT * sizeof (user_desc_t) - 1, SDT_SYSLDT, SEL_KPL); pp->p_ldtlimit = MINNLDT - 1; pp->p_ldt = ldtp; if (pp == curproc) *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = pp->p_ldt_desc; kpreempt_enable(); } /* * Map the page corresponding to the selector entry. If the page is * already mapped then it simply returns with the pointer to the entry. * Otherwise it allocates a physical page for it and returns the pointer * to the entry. Returns 0 for errors. */ static void * ldt_map(proc_t *pp, uint_t seli) { caddr_t ent0_addr = (caddr_t)&pp->p_ldt[0]; caddr_t ent_addr = (caddr_t)&pp->p_ldt[seli]; volatile caddr_t page = (caddr_t)((uintptr_t)ent0_addr & (~PAGEOFFSET)); caddr_t epage = (caddr_t)((uintptr_t)ent_addr & (~PAGEOFFSET)); on_trap_data_t otd; ASSERT(pp->p_ldt != NULL); if (seli <= pp->p_ldtlimit) return (ent_addr); /* * We are increasing the size of the process's LDT. * Make sure this and all intervening pages are mapped. */ while (page <= epage) { if (!on_trap(&otd, OT_DATA_ACCESS)) (void) *(volatile int *)page; /* peek at the page */ else { /* Allocate a physical page */ (void) segkmem_xalloc(NULL, page, PAGESIZE, VM_SLEEP, 0, segkmem_page_create, NULL); bzero(page, PAGESIZE); } no_trap(); page += PAGESIZE; } /* XXX - need any locks to update proc_t or gdt ??? */ ASSERT(curproc == pp); kpreempt_disable(); pp->p_ldtlimit = seli; SYSSEGD_SETLIMIT(&pp->p_ldt_desc, (seli+1) * sizeof (user_desc_t) -1); ldt_load(); kpreempt_enable(); return (ent_addr); } /* * Free up the kernel memory used for LDT of this process. 
*/
/*
 * ldt_free() walks the LDT one page at a time up to the page holding the
 * p_ldtlimit'th descriptor.  Each page is probed under on_trap() and, if
 * the probe does not fault, handed back with segkmem_free().  The MAXNLDT
 * worth of virtual space is then returned to heap_arena and the process's
 * LDT fields are reset.  If pp is the current process the LDT is also
 * unloaded from this CPU.  All of this runs under pp->p_ldtlock.
 */
static void
ldt_free(proc_t *pp)
{
	on_trap_data_t otd;
	caddr_t start, end;
	volatile caddr_t addr;

	ASSERT(pp->p_ldt != NULL);

	mutex_enter(&pp->p_ldtlock);
	start = (caddr_t)pp->p_ldt; /* beginning of the LDT */
	end = start + (pp->p_ldtlimit * sizeof (user_desc_t));

	/* Free the physical page(s) used for mapping LDT */
	for (addr = start; addr <= end; addr += PAGESIZE) {
		if (!on_trap(&otd, OT_DATA_ACCESS)) {
			/* peek at the address */
			(void) *(volatile int *)addr;
			segkmem_free(NULL, addr, PAGESIZE);
		}
	}
	no_trap();

	/* Free up the virtual address space used for this LDT */
	vmem_free(heap_arena, pp->p_ldt,
	    ptob(btopr(MAXNLDT * sizeof (user_desc_t))));
	kpreempt_disable();
	pp->p_ldt = NULL;
	pp->p_ldt_desc = zero_sdesc;
	pp->p_ldtlimit = 0;

	if (pp == curproc)
		ldt_unload();
	kpreempt_enable();
	mutex_exit(&pp->p_ldtlock);
}

/*
 * On fork copy new ldt for child.
 *
 * pp is the parent (which must already have a private LDT) and cp the
 * child.  The child gets a fresh LDT from setup_ldt(), inherits the
 * parent's limit, and then receives a page-by-page copy of every parent
 * LDT page that can be read without faulting.  The parent's p_ldtlock is
 * held across the copy.
 */
void
ldt_dup(proc_t *pp, proc_t *cp)
{
	on_trap_data_t otd;
	caddr_t start, end;
	volatile caddr_t addr, caddr;
	size_t minsize;

	ASSERT(pp->p_ldt);

	setup_ldt(cp);

	mutex_enter(&pp->p_ldtlock);
	cp->p_ldtlimit = pp->p_ldtlimit;
	SYSSEGD_SETLIMIT(&cp->p_ldt_desc,
	    (pp->p_ldtlimit+1) * sizeof (user_desc_t) -1);
	start = (caddr_t)pp->p_ldt; /* beginning of the LDT */
	end = start + (pp->p_ldtlimit * sizeof (user_desc_t));
	caddr = (caddr_t)cp->p_ldt; /* child LDT start */

	/*
	 * setup_ldt() has already backed exactly this many bytes of the
	 * child's LDT; only pages at or beyond this offset need a fresh
	 * allocation.  Use the same page round-up as setup_ldt() so the two
	 * always agree.  Adding PAGESIZE before masking instead would
	 * overshoot by one page whenever the minimum LDT size is an exact
	 * page multiple, leaving that child page unbacked.
	 */
	minsize = ptob(btopr(MINNLDT * sizeof (user_desc_t)));
	/* Walk thru the physical page(s) used for parent's LDT */
	for (addr = start; addr <= end; addr += PAGESIZE, caddr += PAGESIZE) {
		if (!on_trap(&otd, OT_DATA_ACCESS)) {
			(void) *(volatile int *)addr; /* peek at the address */
			/* allocate a page if necessary */
			if (caddr >= ((caddr_t)cp->p_ldt + minsize)) {
				(void) segkmem_xalloc(NULL, caddr, PAGESIZE,
				    VM_SLEEP, 0, segkmem_page_create, NULL);
			}
			bcopy(addr, caddr, PAGESIZE);
		}
	}
	no_trap();
	mutex_exit(&pp->p_ldtlock);
}