/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2021 Joyent, Inc. */ /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */ /* All Rights Reserved */ /* Copyright (c) 1987, 1988 Microsoft Corporation */ /* All Rights Reserved */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__xpv) #include #include #endif static void ldt_alloc(proc_t *, uint_t); static void ldt_free(proc_t *); static void ldt_dup(proc_t *, proc_t *); static void ldt_grow(proc_t *, uint_t); /* * sysi86 System Call */ /* ARGSUSED */ int sysi86(short cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3) { struct ssd ssd; int error = 0; int c; proc_t *pp = curproc; switch (cmd) { /* * The SI86V86 subsystem call of the SYSI86 system call * supports only one subcode -- V86SC_IOPL. */ case SI86V86: if (arg1 == V86SC_IOPL) { struct regs *rp = lwptoregs(ttolwp(curthread)); greg_t oldpl = rp->r_ps & PS_IOPL; greg_t newpl = arg2 & PS_IOPL; /* * Must be privileged to run this system call * if giving more io privilege. */ if (newpl > oldpl && (error = secpolicy_sys_config(CRED(), B_FALSE)) != 0) return (set_errno(error)); #if defined(__xpv) const struct ctxop_template xen_tpl = { .ct_rev = CTXOP_TPL_REV, .ct_save = xen_disable_user_iopl, .ct_restore = xen_enable_user_iopl, .ct_exit = xen_disable_user_iopl, }; struct ctxop *ctx; ctx = ctxop_allocate(&xen_tpl, NULL); kpreempt_disable(); ctxop_attach(curthread, ctx); xen_enable_user_iopl(NULL); kpreempt_enable(); #else rp->r_ps ^= oldpl ^ newpl; #endif } else error = EINVAL; break; /* * Set a segment descriptor */ case SI86DSCR: /* * There are considerable problems here manipulating * resources shared by many running lwps. Get everyone * into a safe state before changing the LDT. */ if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK1)) { error = EINTR; break; } if (get_udatamodel() == DATAMODEL_LP64) { error = EINVAL; break; } if (copyin((caddr_t)arg1, &ssd, sizeof (ssd)) < 0) { error = EFAULT; break; } error = setdscr(&ssd); mutex_enter(&pp->p_lock); if (curthread != pp->p_agenttp) continuelwps(pp); mutex_exit(&pp->p_lock); break; case SI86FPHW: c = fp_kind & 0xff; if (suword32((void *)arg1, c) == -1) error = EFAULT; break; case SI86FPSTART: /* * arg1 is the address of _fp_hw * arg2 is the desired x87 FCW value * arg3 is the desired SSE MXCSR value * a return value of one means SSE hardware, else none. */ c = fp_kind & 0xff; if (suword32((void *)arg1, c) == -1) { error = EFAULT; break; } fpsetcw((uint16_t)arg2, (uint32_t)arg3); return ((fp_kind & __FP_SSE) ? 1 : 0); /* real time clock management commands */ case WTODC: if ((error = secpolicy_settime(CRED())) == 0) { timestruc_t ts; mutex_enter(&tod_lock); gethrestime(&ts); tod_set(ts); mutex_exit(&tod_lock); } break; /* Give some timezone playing room */ #define ONEWEEK (7 * 24 * 60 * 60) case SGMTL: /* * Called from 32 bit land, negative values * are not sign extended, so we do that here * by casting it to an int and back. We also * clamp the value to within reason and detect * when a 64 bit call overflows an int. */ if ((error = secpolicy_settime(CRED())) == 0) { int newlag = (int)arg1; #ifdef _SYSCALL32_IMPL if (get_udatamodel() == DATAMODEL_NATIVE && (long)newlag != (long)arg1) { error = EOVERFLOW; } else #endif if (newlag >= -ONEWEEK && newlag <= ONEWEEK) sgmtl(newlag); else error = EOVERFLOW; } break; case GGMTL: if (get_udatamodel() == DATAMODEL_NATIVE) { if (sulword((void *)arg1, ggmtl()) == -1) error = EFAULT; #ifdef _SYSCALL32_IMPL } else { time_t gmtl; if ((gmtl = ggmtl()) > INT32_MAX) { /* * Since gmt_lag can at most be * +/- 12 hours, something is * *seriously* messed up here. */ error = EOVERFLOW; } else if (suword32((void *)arg1, (int32_t)gmtl) == -1) error = EFAULT; #endif } break; case RTCSYNC: if ((error = secpolicy_settime(CRED())) == 0) rtcsync(); break; /* END OF real time clock management commands */ default: error = EINVAL; break; } return (error == 0 ? 0 : set_errno(error)); } void usd_to_ssd(user_desc_t *usd, struct ssd *ssd, selector_t sel) { ssd->bo = USEGD_GETBASE(usd); ssd->ls = USEGD_GETLIMIT(usd); ssd->sel = sel; /* * set type, dpl and present bits. */ ssd->acc1 = usd->usd_type; ssd->acc1 |= usd->usd_dpl << 5; ssd->acc1 |= usd->usd_p << (5 + 2); /* * set avl, DB and granularity bits. */ ssd->acc2 = usd->usd_avl; ssd->acc2 |= usd->usd_long << 1; ssd->acc2 |= usd->usd_def32 << (1 + 1); ssd->acc2 |= usd->usd_gran << (1 + 1 + 1); } static void ssd_to_usd(struct ssd *ssd, user_desc_t *usd) { ASSERT(bcmp(usd, &null_udesc, sizeof (*usd)) == 0); USEGD_SETBASE(usd, ssd->bo); USEGD_SETLIMIT(usd, ssd->ls); /* * Set type, dpl and present bits. * * Force the "accessed" bit to on so that we don't run afoul of * KPTI. */ usd->usd_type = ssd->acc1 | SDT_A; usd->usd_dpl = ssd->acc1 >> 5; usd->usd_p = ssd->acc1 >> (5 + 2); ASSERT(usd->usd_type >= SDT_MEMRO); ASSERT(usd->usd_dpl == SEL_UPL); /* * 64-bit code selectors are never allowed in the LDT. * Reserved bit is always 0 on 32-bit systems. */ usd->usd_long = 0; /* * set avl, DB and granularity bits. */ usd->usd_avl = ssd->acc2; usd->usd_def32 = ssd->acc2 >> (1 + 1); usd->usd_gran = ssd->acc2 >> (1 + 1 + 1); } /* * Load LDT register with the current process's LDT. */ static void ldt_load(void) { #if defined(__xpv) xen_set_ldt(curproc->p_ldt, curproc->p_ldtlimit + 1); #else size_t len; system_desc_t desc; /* * Before we can use the LDT on this CPU, we must install the LDT in the * user mapping table. */ len = (curproc->p_ldtlimit + 1) * sizeof (user_desc_t); bcopy(curproc->p_ldt, CPU->cpu_m.mcpu_ldt, len); CPU->cpu_m.mcpu_ldt_len = len; set_syssegd(&desc, CPU->cpu_m.mcpu_ldt, len - 1, SDT_SYSLDT, SEL_KPL); *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = desc; wr_ldtr(ULDT_SEL); #endif } /* * Store a NULL selector in the LDTR. All subsequent illegal references to * the LDT will result in a #gp. */ void ldt_unload(void) { #if defined(__xpv) xen_set_ldt(NULL, 0); #else *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = null_sdesc; wr_ldtr(0); bzero(CPU->cpu_m.mcpu_ldt, CPU->cpu_m.mcpu_ldt_len); CPU->cpu_m.mcpu_ldt_len = 0; #endif } /*ARGSUSED*/ static void ldt_savectx(proc_t *p) { ASSERT(p->p_ldt != NULL); ASSERT(p == curproc); /* * The 64-bit kernel must be sure to clear any stale ldt * selectors when context switching away from a process that * has a private ldt. Consider the following example: * * Wine creats a ldt descriptor and points a segment register * to it. * * We then context switch away from wine lwp to kernel * thread and hit breakpoint in kernel with kmdb * * When we continue and resume from kmdb we will #gp * fault since kmdb will have saved the stale ldt selector * from wine and will try to restore it but we are no longer in * the context of the wine process and do not have our * ldtr register pointing to the private ldt. */ reset_sregs(); ldt_unload(); cpu_fast_syscall_enable(); } static void ldt_restorectx(proc_t *p) { ASSERT(p->p_ldt != NULL); ASSERT(p == curproc); ldt_load(); cpu_fast_syscall_disable(); } /* * At exec time, we need to clear up our LDT context and re-enable fast syscalls * for the new process image. * * The same is true for the other case, where we have: * * proc_exit() * ->exitpctx()->ldt_savectx() * ->freepctx()->ldt_freectx() * * Because pre-emption is not prevented between the two callbacks, we could have * come off CPU, and brought back LDT context when coming back on CPU via * ldt_restorectx(). */ /* ARGSUSED */ static void ldt_freectx(proc_t *p, int isexec) { ASSERT(p->p_ldt != NULL); ASSERT(p == curproc); kpreempt_disable(); ldt_free(p); cpu_fast_syscall_enable(); kpreempt_enable(); } /* * Install ctx op that ensures syscall/sysenter are disabled. * See comments below. * * When a thread with a private LDT forks, the new process * must have the LDT context ops installed. */ /* ARGSUSED */ static void ldt_installctx(proc_t *p, proc_t *cp) { proc_t *targ = p; kthread_t *t; /* * If this is a fork, operate on the child process. */ if (cp != NULL) { targ = cp; ldt_dup(p, cp); } /* * The process context ops expect the target process as their argument. */ ASSERT(removepctx(targ, targ, ldt_savectx, ldt_restorectx, ldt_installctx, ldt_savectx, ldt_freectx) == 0); installpctx(targ, targ, ldt_savectx, ldt_restorectx, ldt_installctx, ldt_savectx, ldt_freectx); /* * We've just disabled fast system call and return instructions; take * the slow path out to make sure we don't try to use one to return * back to user. We must set t_post_sys for every thread in the * process to make sure none of them escape out via fast return. */ mutex_enter(&targ->p_lock); t = targ->p_tlist; do { t->t_post_sys = 1; } while ((t = t->t_forw) != targ->p_tlist); mutex_exit(&targ->p_lock); } int setdscr(struct ssd *ssd) { ushort_t seli; /* selector index */ user_desc_t *ldp; /* descriptor pointer */ user_desc_t ndesc; /* new descriptor */ proc_t *pp = curproc; int rc = 0; /* * LDT segments: executable and data at DPL 3 only. */ if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel)) return (EINVAL); /* * check the selector index. */ seli = SELTOIDX(ssd->sel); if (seli >= MAXNLDT || seli < LDT_UDBASE) return (EINVAL); ndesc = null_udesc; mutex_enter(&pp->p_ldtlock); /* * If this is the first time for this process then setup a * private LDT for it. */ if (pp->p_ldt == NULL) { ldt_alloc(pp, seli); /* * Now that this process has a private LDT, the use of * the syscall/sysret and sysenter/sysexit instructions * is forbidden for this processes because they destroy * the contents of %cs and %ss segment registers. * * Explicity disable them here and add a context handler * to the process. Note that disabling * them here means we can't use sysret or sysexit on * the way out of this system call - so we force this * thread to take the slow path (which doesn't make use * of sysenter or sysexit) back out. */ kpreempt_disable(); ldt_installctx(pp, NULL); cpu_fast_syscall_disable(); ASSERT(curthread->t_post_sys != 0); kpreempt_enable(); } else if (seli > pp->p_ldtlimit) { ASSERT(pp->p_pctx != NULL); /* * Increase size of ldt to include seli. */ ldt_grow(pp, seli); } ASSERT(seli <= pp->p_ldtlimit); ldp = &pp->p_ldt[seli]; /* * On the 64-bit kernel, this is where things get more subtle. * Recall that in the 64-bit kernel, when we enter the kernel we * deliberately -don't- reload the segment selectors we came in on * for %ds, %es, %fs or %gs. Messing with selectors is expensive, * and the underlying descriptors are essentially ignored by the * hardware in long mode - except for the base that we override with * the gsbase MSRs. * * However, there's one unfortunate issue with this rosy picture -- * a descriptor that's not marked as 'present' will still generate * an #np when loading a segment register. * * Consider this case. An lwp creates a harmless LDT entry, points * one of it's segment registers at it, then tells the kernel (here) * to delete it. In the 32-bit kernel, the #np will happen on the * way back to userland where we reload the segment registers, and be * handled in kern_gpfault(). In the 64-bit kernel, the same thing * will happen in the normal case too. However, if we're trying to * use a debugger that wants to save and restore the segment registers, * and the debugger things that we have valid segment registers, we * have the problem that the debugger will try and restore the * segment register that points at the now 'not present' descriptor * and will take a #np right there. * * We should obviously fix the debugger to be paranoid about * -not- restoring segment registers that point to bad descriptors; * however we can prevent the problem here if we check to see if any * of the segment registers are still pointing at the thing we're * destroying; if they are, return an error instead. (That also seems * a lot better failure mode than SIGKILL and a core file * from kern_gpfault() too.) */ if (SI86SSD_PRES(ssd) == 0) { kthread_t *t; int bad = 0; /* * Look carefully at the segment registers of every lwp * in the process (they're all stopped by our caller). * If we're about to invalidate a descriptor that's still * being referenced by *any* of them, return an error, * rather than having them #gp on their way out of the kernel. */ ASSERT(pp->p_lwprcnt == 1); mutex_enter(&pp->p_lock); t = pp->p_tlist; do { klwp_t *lwp = ttolwp(t); struct regs *rp = lwp->lwp_regs; pcb_t *pcb = &lwp->lwp_pcb; if (ssd->sel == rp->r_cs || ssd->sel == rp->r_ss) { bad = 1; break; } if (PCB_NEED_UPDATE_SEGS(pcb)) { if (ssd->sel == pcb->pcb_ds || ssd->sel == pcb->pcb_es || ssd->sel == pcb->pcb_fs || ssd->sel == pcb->pcb_gs) { bad = 1; break; } } else { if (ssd->sel == rp->r_ds || ssd->sel == rp->r_es || ssd->sel == rp->r_fs || ssd->sel == rp->r_gs) { bad = 1; break; } } } while ((t = t->t_forw) != pp->p_tlist); mutex_exit(&pp->p_lock); if (bad) { mutex_exit(&pp->p_ldtlock); return (EBUSY); } } /* * If acc1 is zero, clear the descriptor (including the 'present' bit). * Make sure we update the CPU-private copy of the LDT. */ if (ssd->acc1 == 0) { rc = ldt_update_segd(ldp, &null_udesc); kpreempt_disable(); ldt_load(); kpreempt_enable(); mutex_exit(&pp->p_ldtlock); return (rc); } /* * Check segment type, allow segment not present and * only user DPL (3). */ if (SI86SSD_DPL(ssd) != SEL_UPL) { mutex_exit(&pp->p_ldtlock); return (EINVAL); } /* * Do not allow 32-bit applications to create 64-bit mode code * segments. */ if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 && SI86SSD_ISLONG(ssd)) { mutex_exit(&pp->p_ldtlock); return (EINVAL); } /* * Set up a code or data user segment descriptor, making sure to update * the CPU-private copy of the LDT. */ if (SI86SSD_ISUSEG(ssd)) { ssd_to_usd(ssd, &ndesc); rc = ldt_update_segd(ldp, &ndesc); kpreempt_disable(); ldt_load(); kpreempt_enable(); mutex_exit(&pp->p_ldtlock); return (rc); } mutex_exit(&pp->p_ldtlock); return (EINVAL); } /* * Allocate new LDT for process just large enough to contain seli. Note we * allocate and grow LDT in PAGESIZE chunks. We do this to simplify the * implementation and because on the hypervisor it's required, since the LDT * must live on pages that have PROT_WRITE removed and which are given to the * hypervisor. * * Note that we don't actually load the LDT into the current CPU here: it's done * later by our caller. */ static void ldt_alloc(proc_t *pp, uint_t seli) { user_desc_t *ldt; size_t ldtsz; uint_t nsels; ASSERT(MUTEX_HELD(&pp->p_ldtlock)); ASSERT(pp->p_ldt == NULL); ASSERT(pp->p_ldtlimit == 0); /* * Allocate new LDT just large enough to contain seli. The LDT must * always be allocated in units of pages for KPTI. */ ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE); nsels = ldtsz / sizeof (user_desc_t); ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT); ldt = kmem_zalloc(ldtsz, KM_SLEEP); ASSERT(IS_P2ALIGNED(ldt, PAGESIZE)); #if defined(__xpv) if (xen_ldt_setprot(ldt, ldtsz, PROT_READ)) panic("ldt_alloc:xen_ldt_setprot(PROT_READ) failed"); #endif pp->p_ldt = ldt; pp->p_ldtlimit = nsels - 1; } static void ldt_free(proc_t *pp) { user_desc_t *ldt; size_t ldtsz; ASSERT(pp->p_ldt != NULL); mutex_enter(&pp->p_ldtlock); ldt = pp->p_ldt; ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t); ASSERT(IS_P2ALIGNED(ldtsz, PAGESIZE)); pp->p_ldt = NULL; pp->p_ldtlimit = 0; mutex_exit(&pp->p_ldtlock); if (pp == curproc) { kpreempt_disable(); ldt_unload(); kpreempt_enable(); } #if defined(__xpv) /* * We are not allowed to make the ldt writable until after * we tell the hypervisor to unload it. */ if (xen_ldt_setprot(ldt, ldtsz, PROT_READ | PROT_WRITE)) panic("ldt_free:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed"); #endif kmem_free(ldt, ldtsz); } /* * On fork copy new ldt for child. */ static void ldt_dup(proc_t *pp, proc_t *cp) { size_t ldtsz; ASSERT(pp->p_ldt != NULL); ASSERT(cp != curproc); /* * I assume the parent's ldt can't increase since we're in a fork. */ mutex_enter(&pp->p_ldtlock); mutex_enter(&cp->p_ldtlock); ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t); ldt_alloc(cp, pp->p_ldtlimit); #if defined(__xpv) /* * Make child's ldt writable so it can be copied into from * parent's ldt. This works since ldt_alloc above did not load * the ldt since its for the child process. If we tried to make * an LDT writable that is loaded in hw the setprot operation * would fail. */ if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ | PROT_WRITE)) panic("ldt_dup:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed"); #endif bcopy(pp->p_ldt, cp->p_ldt, ldtsz); #if defined(__xpv) if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ)) panic("ldt_dup:xen_ldt_setprot(PROT_READ) failed"); #endif mutex_exit(&cp->p_ldtlock); mutex_exit(&pp->p_ldtlock); } /* * Note that we don't actually load the LDT into the current CPU here: it's done * later by our caller - unless we take an error. This works out because * ldt_load() does a copy of ->p_ldt instead of directly loading it into the GDT * (and therefore can't be using the freed old LDT), and by definition if the * new entry didn't pass validation, then the proc shouldn't be referencing an * entry in the extended region. */ static void ldt_grow(proc_t *pp, uint_t seli) { user_desc_t *oldt, *nldt; uint_t nsels; size_t oldtsz, nldtsz; ASSERT(MUTEX_HELD(&pp->p_ldtlock)); ASSERT(pp->p_ldt != NULL); ASSERT(pp->p_ldtlimit != 0); /* * Allocate larger LDT just large enough to contain seli. The LDT must * always be allocated in units of pages for KPTI. */ nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE); nsels = nldtsz / sizeof (user_desc_t); ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT); ASSERT(nsels > pp->p_ldtlimit); oldt = pp->p_ldt; oldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t); nldt = kmem_zalloc(nldtsz, KM_SLEEP); ASSERT(IS_P2ALIGNED(nldt, PAGESIZE)); bcopy(oldt, nldt, oldtsz); /* * unload old ldt. */ kpreempt_disable(); ldt_unload(); kpreempt_enable(); #if defined(__xpv) /* * Make old ldt writable and new ldt read only. */ if (xen_ldt_setprot(oldt, oldtsz, PROT_READ | PROT_WRITE)) panic("ldt_grow:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed"); if (xen_ldt_setprot(nldt, nldtsz, PROT_READ)) panic("ldt_grow:xen_ldt_setprot(PROT_READ) failed"); #endif pp->p_ldt = nldt; pp->p_ldtlimit = nsels - 1; kmem_free(oldt, oldtsz); }