/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2021 Joyent, Inc.
 */

/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
/* All Rights Reserved */

/* Copyright (c) 1987, 1988 Microsoft Corporation */
/* All Rights Reserved */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/errno.h>
#include <sys/fault.h>
#include <sys/syscall.h>
#include <sys/cpuvar.h>
#include <sys/sysi86.h>
#include <sys/psw.h>
#include <sys/cred.h>
#include <sys/policy.h>
#include <sys/thread.h>
#include <sys/debug.h>
#include <sys/ontrap.h>
#include <sys/privregs.h>
#include <sys/x86_archext.h>
#include <sys/vmem.h>
#include <sys/kmem.h>
#include <sys/mman.h>
#include <sys/archsystm.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/faultcode.h>
#include <sys/fp.h>
#include <sys/cmn_err.h>
#include <sys/segments.h>
#include <sys/clock.h>
#include <vm/hat_i86.h>
#if defined(__xpv)
#include <sys/hypervisor.h>
#include <sys/note.h>
#endif

static void ldt_alloc(proc_t *, uint_t);
static void ldt_free(proc_t *);
static void ldt_dup(proc_t *, proc_t *);
static void ldt_grow(proc_t *, uint_t);

/*
 * sysi86 System Call
 */

/* ARGSUSED */
int
sysi86(short cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
{
	struct ssd ssd;
	int error = 0;
	int c;
	proc_t *pp = curproc;

	switch (cmd) {

	/*
	 * The SI86V86 subsystem call of the SYSI86 system call
	 * supports only one subcode -- V86SC_IOPL.
	 */
	case SI86V86:
		if (arg1 == V86SC_IOPL) {
			struct regs *rp = lwptoregs(ttolwp(curthread));
			greg_t oldpl = rp->r_ps & PS_IOPL;
			greg_t newpl = arg2 & PS_IOPL;

			/*
			 * Must be privileged to run this system call
			 * if giving more io privilege.
			 */
			if (newpl > oldpl && (error =
			    secpolicy_sys_config(CRED(), B_FALSE)) != 0)
				return (set_errno(error));
#if defined(__xpv)
			const struct ctxop_template xen_tpl = {
				.ct_rev = CTXOP_TPL_REV,
				.ct_save = xen_disable_user_iopl,
				.ct_restore = xen_enable_user_iopl,
				.ct_exit = xen_disable_user_iopl,
			};
			struct ctxop *ctx;

			ctx = ctxop_allocate(&xen_tpl, NULL);
			kpreempt_disable();
			ctxop_attach(curthread, ctx);
			xen_enable_user_iopl(NULL);
			kpreempt_enable();
#else
			rp->r_ps ^= oldpl ^ newpl;
#endif
		} else
			error = EINVAL;
		break;

	/*
	 * Set a segment descriptor
	 */
	case SI86DSCR:
		/*
		 * There are considerable problems here manipulating
		 * resources shared by many running lwps. Get everyone
		 * into a safe state before changing the LDT.
		 */
		if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK1)) {
			error = EINTR;
			break;
		}

		if (get_udatamodel() == DATAMODEL_LP64) {
			error = EINVAL;
			break;
		}

		if (copyin((caddr_t)arg1, &ssd, sizeof (ssd)) < 0) {
			error = EFAULT;
			break;
		}

		error = setdscr(&ssd);

		mutex_enter(&pp->p_lock);
		if (curthread != pp->p_agenttp)
			continuelwps(pp);
		mutex_exit(&pp->p_lock);
		break;

	case SI86FPHW:
		c = fp_kind & 0xff;
		if (suword32((void *)arg1, c) == -1)
			error = EFAULT;
		break;

	case SI86FPSTART:
		/*
		 * arg1 is the address of _fp_hw
		 * arg2 is the desired x87 FCW value
		 * arg3 is the desired SSE MXCSR value
		 * a return value of one means SSE hardware, else none.
		 */
		c = fp_kind & 0xff;
		if (suword32((void *)arg1, c) == -1) {
			error = EFAULT;
			break;
		}
		fpsetcw((uint16_t)arg2, (uint32_t)arg3);
		return ((fp_kind & __FP_SSE) ? 1 : 0);

	/* real time clock management commands */

	case WTODC:
		if ((error = secpolicy_settime(CRED())) == 0) {
			timestruc_t ts;

			mutex_enter(&tod_lock);
			gethrestime(&ts);
			tod_set(ts);
			mutex_exit(&tod_lock);
		}
		break;

/* Give some timezone playing room */
#define	ONEWEEK	(7 * 24 * 60 * 60)

	case SGMTL:
		/*
		 * Called from 32 bit land, negative values
		 * are not sign extended, so we do that here
		 * by casting it to an int and back.  We also
		 * clamp the value to within reason and detect
		 * when a 64 bit call overflows an int.
		 */
		if ((error = secpolicy_settime(CRED())) == 0) {
			int newlag = (int)arg1;

#ifdef _SYSCALL32_IMPL
			if (get_udatamodel() == DATAMODEL_NATIVE &&
			    (long)newlag != (long)arg1) {
				error = EOVERFLOW;
			} else
#endif
			if (newlag >= -ONEWEEK && newlag <= ONEWEEK)
				sgmtl(newlag);
			else
				error = EOVERFLOW;
		}
		break;

	case GGMTL:
		if (get_udatamodel() == DATAMODEL_NATIVE) {
			if (sulword((void *)arg1, ggmtl()) == -1)
				error = EFAULT;
#ifdef _SYSCALL32_IMPL
		} else {
			time_t gmtl;

			if ((gmtl = ggmtl()) > INT32_MAX) {
				/*
				 * Since gmt_lag can at most be
				 * +/- 12 hours, something is
				 * *seriously* messed up here.
				 */
				error = EOVERFLOW;
			} else if (suword32((void *)arg1, (int32_t)gmtl) == -1)
				error = EFAULT;
#endif
		}
		break;

	case RTCSYNC:
		if ((error = secpolicy_settime(CRED())) == 0)
			rtcsync();
		break;

	/* END OF real time clock management commands */

	default:
		error = EINVAL;
		break;
	}
	return (error == 0 ? 0 : set_errno(error));
}
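
/*
 * Illustrative userland sketch (not part of this file's build): one way a
 * 32-bit process might install a data segment in its private LDT via the
 * SI86DSCR command handled above.  It assumes the sysi86(2) wrapper and the
 * struct ssd layout from <sys/sysi86.h>; the buffer, index and sizes are
 * made up for the example, and the acc1/acc2 encodings follow
 * usd_to_ssd()/ssd_to_usd() below.
 *
 *	#include <sys/sysi86.h>
 *	#include <sys/segments.h>
 *
 *	static char buf[4096];
 *	struct ssd ssd;
 *
 *	ssd.sel = (LDT_UDBASE << 3) | 0x7;	// selector: TI = LDT, RPL = 3
 *	ssd.bo = (uint32_t)(uintptr_t)buf;	// segment base
 *	ssd.ls = sizeof (buf) - 1;		// limit, byte granularity
 *	ssd.acc1 = SDT_MEMRWA | (SEL_UPL << 5) | (1 << 7); // data, DPL 3, present
 *	ssd.acc2 = 0x4;				// def32: 32-bit segment
 *	if (sysi86(SI86DSCR, &ssd) == -1)
 *		perror("SI86DSCR");
 */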

void
usd_to_ssd(user_desc_t *usd, struct ssd *ssd, selector_t sel)
{
	ssd->bo = USEGD_GETBASE(usd);
	ssd->ls = USEGD_GETLIMIT(usd);
	ssd->sel = sel;

	/*
	 * set type, dpl and present bits.
	 */
	ssd->acc1 = usd->usd_type;
	ssd->acc1 |= usd->usd_dpl << 5;
	ssd->acc1 |= usd->usd_p << (5 + 2);

	/*
	 * set avl, DB and granularity bits.
	 */
	ssd->acc2 = usd->usd_avl;

	ssd->acc2 |= usd->usd_long << 1;

	ssd->acc2 |= usd->usd_def32 << (1 + 1);
	ssd->acc2 |= usd->usd_gran << (1 + 1 + 1);
}

static void
ssd_to_usd(struct ssd *ssd, user_desc_t *usd)
{

	ASSERT(bcmp(usd, &null_udesc, sizeof (*usd)) == 0);

	USEGD_SETBASE(usd, ssd->bo);
	USEGD_SETLIMIT(usd, ssd->ls);

	/*
	 * Set type, dpl and present bits.
	 *
	 * Force the "accessed" bit to on so that we don't run afoul of
	 * KPTI.
	 */
	usd->usd_type = ssd->acc1 | SDT_A;
	usd->usd_dpl = ssd->acc1 >> 5;
	usd->usd_p = ssd->acc1 >> (5 + 2);

	ASSERT(usd->usd_type >= SDT_MEMRO);
	ASSERT(usd->usd_dpl == SEL_UPL);

	/*
	 * 64-bit code selectors are never allowed in the LDT.
	 * Reserved bit is always 0 on 32-bit systems.
	 */
	usd->usd_long = 0;

	/*
	 * set avl, DB and granularity bits.
	 */
	usd->usd_avl = ssd->acc2;
	usd->usd_def32 = ssd->acc2 >> (1 + 1);
	usd->usd_gran = ssd->acc2 >> (1 + 1 + 1);
}

/*
 * Load LDT register with the current process's LDT.
 */
static void
ldt_load(void)
{
#if defined(__xpv)
	xen_set_ldt(curproc->p_ldt, curproc->p_ldtlimit + 1);
#else
	size_t len;
	system_desc_t desc;

	/*
	 * Before we can use the LDT on this CPU, we must install the LDT in
	 * the user mapping table.
	 */
	len = (curproc->p_ldtlimit + 1) * sizeof (user_desc_t);
	bcopy(curproc->p_ldt, CPU->cpu_m.mcpu_ldt, len);
	CPU->cpu_m.mcpu_ldt_len = len;
	set_syssegd(&desc, CPU->cpu_m.mcpu_ldt, len - 1, SDT_SYSLDT, SEL_KPL);
	*((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = desc;

	wr_ldtr(ULDT_SEL);
#endif
}

/*
 * Store a NULL selector in the LDTR. All subsequent illegal references to
 * the LDT will result in a #gp.
 */
void
ldt_unload(void)
{
#if defined(__xpv)
	xen_set_ldt(NULL, 0);
#else
	*((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = null_sdesc;
	wr_ldtr(0);

	bzero(CPU->cpu_m.mcpu_ldt, CPU->cpu_m.mcpu_ldt_len);
	CPU->cpu_m.mcpu_ldt_len = 0;
#endif
}

/*ARGSUSED*/
static void
ldt_savectx(proc_t *p)
{
	ASSERT(p->p_ldt != NULL);
	ASSERT(p == curproc);

	/*
	 * The 64-bit kernel must be sure to clear any stale ldt
	 * selectors when context switching away from a process that
	 * has a private ldt. Consider the following example:
	 *
	 * Wine creates an ldt descriptor and points a segment register
	 * to it.
	 *
	 * We then context switch away from the wine lwp to a kernel
	 * thread and hit a breakpoint in the kernel with kmdb.
	 *
	 * When we continue and resume from kmdb we will #gp
	 * fault, since kmdb will have saved the stale ldt selector
	 * from wine and will try to restore it, but we are no longer in
	 * the context of the wine process and do not have our
	 * ldtr register pointing to the private ldt.
	 */
	reset_sregs();

	ldt_unload();
	cpu_fast_syscall_enable();
}

static void
ldt_restorectx(proc_t *p)
{
	ASSERT(p->p_ldt != NULL);
	ASSERT(p == curproc);

	ldt_load();
	cpu_fast_syscall_disable();
}

/*
 * At exec time, we need to clear up our LDT context and re-enable fast
 * syscalls for the new process image.
 *
 * The same is true for the other case, where we have:
 *
 * proc_exit()
 *  ->exitpctx()->ldt_savectx()
 *  ->freepctx()->ldt_freectx()
 *
 * Because pre-emption is not prevented between the two callbacks, we could
 * have come off CPU, and brought back LDT context when coming back on CPU
 * via ldt_restorectx().
 */
/* ARGSUSED */
static void
ldt_freectx(proc_t *p, int isexec)
{
	ASSERT(p->p_ldt != NULL);
	ASSERT(p == curproc);

	kpreempt_disable();
	ldt_free(p);
	cpu_fast_syscall_enable();
	kpreempt_enable();
}

/*
 * Install a ctx op that ensures syscall/sysenter are disabled.
 * See comments below.
 *
 * When a thread with a private LDT forks, the new process
 * must have the LDT context ops installed.
 */
/* ARGSUSED */
static void
ldt_installctx(proc_t *p, proc_t *cp)
{
	proc_t *targ = p;
	kthread_t *t;

	/*
	 * If this is a fork, operate on the child process.
	 */
	if (cp != NULL) {
		targ = cp;
		ldt_dup(p, cp);
	}

	/*
	 * The process context ops expect the target process as their argument.
	 */
	ASSERT(removepctx(targ, targ, ldt_savectx, ldt_restorectx,
	    ldt_installctx, ldt_savectx, ldt_freectx) == 0);

	installpctx(targ, targ, ldt_savectx, ldt_restorectx,
	    ldt_installctx, ldt_savectx, ldt_freectx);

	/*
	 * We've just disabled fast system call and return instructions; take
	 * the slow path out to make sure we don't try to use one to return
	 * back to user. We must set t_post_sys for every thread in the
	 * process to make sure none of them escape out via fast return.
	 */

	mutex_enter(&targ->p_lock);
	t = targ->p_tlist;
	do {
		t->t_post_sys = 1;
	} while ((t = t->t_forw) != targ->p_tlist);
	mutex_exit(&targ->p_lock);
}

int
setdscr(struct ssd *ssd)
{
	ushort_t seli;		/* selector index */
	user_desc_t *ldp;	/* descriptor pointer */
	user_desc_t ndesc;	/* new descriptor */
	proc_t *pp = curproc;
	int rc = 0;

	/*
	 * LDT segments: executable and data at DPL 3 only.
	 */
	if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel))
		return (EINVAL);

	/*
	 * check the selector index.
	 */
	seli = SELTOIDX(ssd->sel);
	if (seli >= MAXNLDT || seli < LDT_UDBASE)
		return (EINVAL);

	ndesc = null_udesc;
	mutex_enter(&pp->p_ldtlock);

	/*
	 * If this is the first time for this process then set up a
	 * private LDT for it.
	 */
	if (pp->p_ldt == NULL) {
		ldt_alloc(pp, seli);

		/*
		 * Now that this process has a private LDT, the use of
		 * the syscall/sysret and sysenter/sysexit instructions
		 * is forbidden for this process because they destroy
		 * the contents of %cs and %ss segment registers.
		 *
		 * Explicitly disable them here and add a context handler
		 * to the process. Note that disabling
		 * them here means we can't use sysret or sysexit on
		 * the way out of this system call - so we force this
		 * thread to take the slow path (which doesn't make use
		 * of sysenter or sysexit) back out.
		 */
		kpreempt_disable();
		ldt_installctx(pp, NULL);
		cpu_fast_syscall_disable();
		ASSERT(curthread->t_post_sys != 0);
		kpreempt_enable();

	} else if (seli > pp->p_ldtlimit) {
		ASSERT(pp->p_pctx != NULL);

		/*
		 * Increase size of ldt to include seli.
		 */
		ldt_grow(pp, seli);
	}

	ASSERT(seli <= pp->p_ldtlimit);
	ldp = &pp->p_ldt[seli];

	/*
	 * On the 64-bit kernel, this is where things get more subtle.
	 * Recall that in the 64-bit kernel, when we enter the kernel we
	 * deliberately -don't- reload the segment selectors we came in on
	 * for %ds, %es, %fs or %gs. Messing with selectors is expensive,
	 * and the underlying descriptors are essentially ignored by the
	 * hardware in long mode - except for the base that we override with
	 * the gsbase MSRs.
	 *
	 * However, there's one unfortunate issue with this rosy picture --
	 * a descriptor that's not marked as 'present' will still generate
	 * an #np when loading a segment register.
	 *
	 * Consider this case. An lwp creates a harmless LDT entry, points
	 * one of its segment registers at it, then tells the kernel (here)
	 * to delete it. In the 32-bit kernel, the #np will happen on the
	 * way back to userland where we reload the segment registers, and be
	 * handled in kern_gpfault(). In the 64-bit kernel, the same thing
	 * will happen in the normal case too. However, if we're trying to
	 * use a debugger that wants to save and restore the segment registers,
	 * and the debugger thinks that we have valid segment registers, we
	 * have the problem that the debugger will try to restore the
	 * segment register that points at the now 'not present' descriptor
	 * and will take a #np right there.
	 *
	 * We should obviously fix the debugger to be paranoid about
	 * -not- restoring segment registers that point to bad descriptors;
	 * however we can prevent the problem here if we check to see if any
	 * of the segment registers are still pointing at the thing we're
	 * destroying; if they are, return an error instead. (That also seems
	 * a much better failure mode than SIGKILL and a core file
	 * from kern_gpfault().)
	 */
	if (SI86SSD_PRES(ssd) == 0) {
		kthread_t *t;
		int bad = 0;

		/*
		 * Look carefully at the segment registers of every lwp
		 * in the process (they're all stopped by our caller).
		 * If we're about to invalidate a descriptor that's still
		 * being referenced by *any* of them, return an error,
		 * rather than having them #gp on their way out of the kernel.
		 */
		ASSERT(pp->p_lwprcnt == 1);

		mutex_enter(&pp->p_lock);
		t = pp->p_tlist;
		do {
			klwp_t *lwp = ttolwp(t);
			struct regs *rp = lwp->lwp_regs;
			pcb_t *pcb = &lwp->lwp_pcb;

			if (ssd->sel == rp->r_cs || ssd->sel == rp->r_ss) {
				bad = 1;
				break;
			}

			if (PCB_NEED_UPDATE_SEGS(pcb)) {
				if (ssd->sel == pcb->pcb_ds ||
				    ssd->sel == pcb->pcb_es ||
				    ssd->sel == pcb->pcb_fs ||
				    ssd->sel == pcb->pcb_gs) {
					bad = 1;
					break;
				}
			} else {
				if (ssd->sel == rp->r_ds ||
				    ssd->sel == rp->r_es ||
				    ssd->sel == rp->r_fs ||
				    ssd->sel == rp->r_gs) {
					bad = 1;
					break;
				}
			}

		} while ((t = t->t_forw) != pp->p_tlist);
		mutex_exit(&pp->p_lock);

		if (bad) {
			mutex_exit(&pp->p_ldtlock);
			return (EBUSY);
		}
	}

	/*
	 * If acc1 is zero, clear the descriptor (including the 'present' bit).
	 * Make sure we update the CPU-private copy of the LDT.
	 */
	if (ssd->acc1 == 0) {
		rc = ldt_update_segd(ldp, &null_udesc);
		kpreempt_disable();
		ldt_load();
		kpreempt_enable();
		mutex_exit(&pp->p_ldtlock);
		return (rc);
	}

	/*
	 * Check segment type, allow segment not present and
	 * only user DPL (3).
	 */
	if (SI86SSD_DPL(ssd) != SEL_UPL) {
		mutex_exit(&pp->p_ldtlock);
		return (EINVAL);
	}

	/*
	 * Do not allow 32-bit applications to create 64-bit mode code
	 * segments.
	 */
	if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 &&
	    SI86SSD_ISLONG(ssd)) {
		mutex_exit(&pp->p_ldtlock);
		return (EINVAL);
	}

	/*
	 * Set up a code or data user segment descriptor, making sure to update
	 * the CPU-private copy of the LDT.
	 */
	if (SI86SSD_ISUSEG(ssd)) {
		ssd_to_usd(ssd, &ndesc);
		rc = ldt_update_segd(ldp, &ndesc);
		kpreempt_disable();
		ldt_load();
		kpreempt_enable();
		mutex_exit(&pp->p_ldtlock);
		return (rc);
	}

	mutex_exit(&pp->p_ldtlock);
	return (EINVAL);
}

/*
 * Allocate new LDT for process just large enough to contain seli. Note we
 * allocate and grow LDT in PAGESIZE chunks. We do this to simplify the
 * implementation and because on the hypervisor it's required, since the LDT
 * must live on pages that have PROT_WRITE removed and which are given to the
 * hypervisor.
 *
 * Note that we don't actually load the LDT into the current CPU here: it's
 * done later by our caller.
 */
static void
ldt_alloc(proc_t *pp, uint_t seli)
{
	user_desc_t *ldt;
	size_t ldtsz;
	uint_t nsels;

	ASSERT(MUTEX_HELD(&pp->p_ldtlock));
	ASSERT(pp->p_ldt == NULL);
	ASSERT(pp->p_ldtlimit == 0);

	/*
	 * Allocate new LDT just large enough to contain seli. The LDT must
	 * always be allocated in units of pages for KPTI.
	 */
	ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
	nsels = ldtsz / sizeof (user_desc_t);
	ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);

	ldt = kmem_zalloc(ldtsz, KM_SLEEP);
	ASSERT(IS_P2ALIGNED(ldt, PAGESIZE));

#if defined(__xpv)
	if (xen_ldt_setprot(ldt, ldtsz, PROT_READ))
		panic("ldt_alloc:xen_ldt_setprot(PROT_READ) failed");
#endif

	pp->p_ldt = ldt;
	pp->p_ldtlimit = nsels - 1;
}

static void
ldt_free(proc_t *pp)
{
	user_desc_t *ldt;
	size_t ldtsz;

	ASSERT(pp->p_ldt != NULL);

	mutex_enter(&pp->p_ldtlock);
	ldt = pp->p_ldt;
	ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

	ASSERT(IS_P2ALIGNED(ldtsz, PAGESIZE));

	pp->p_ldt = NULL;
	pp->p_ldtlimit = 0;
	mutex_exit(&pp->p_ldtlock);

	if (pp == curproc) {
		kpreempt_disable();
		ldt_unload();
		kpreempt_enable();
	}

#if defined(__xpv)
	/*
	 * We are not allowed to make the ldt writable until after
	 * we tell the hypervisor to unload it.
	 */
	if (xen_ldt_setprot(ldt, ldtsz, PROT_READ | PROT_WRITE))
		panic("ldt_free:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
#endif

	kmem_free(ldt, ldtsz);
}

/*
 * On fork copy new ldt for child.
 */
static void
ldt_dup(proc_t *pp, proc_t *cp)
{
	size_t ldtsz;

	ASSERT(pp->p_ldt != NULL);
	ASSERT(cp != curproc);

	/*
	 * I assume the parent's ldt can't increase since we're in a fork.
	 */
	mutex_enter(&pp->p_ldtlock);
	mutex_enter(&cp->p_ldtlock);

	ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

	ldt_alloc(cp, pp->p_ldtlimit);

#if defined(__xpv)
	/*
	 * Make the child's ldt writable so it can be copied into from the
	 * parent's ldt. This works because ldt_alloc above did not load
	 * the ldt, since it's for the child process. If we tried to make
	 * an LDT writable while it is loaded in hardware, the setprot
	 * operation would fail.
	 */
	if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ | PROT_WRITE))
		panic("ldt_dup:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
#endif

	bcopy(pp->p_ldt, cp->p_ldt, ldtsz);

#if defined(__xpv)
	if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ))
		panic("ldt_dup:xen_ldt_setprot(PROT_READ) failed");
#endif
	mutex_exit(&cp->p_ldtlock);
	mutex_exit(&pp->p_ldtlock);
}

/*
 * Note that we don't actually load the LDT into the current CPU here: it's
 * done later by our caller - unless we take an error. This works out because
 * ldt_load() does a copy of ->p_ldt instead of directly loading it into the
 * GDT (and therefore can't be using the freed old LDT), and by definition if
 * the new entry didn't pass validation, then the proc shouldn't be
 * referencing an entry in the extended region.
 */
static void
ldt_grow(proc_t *pp, uint_t seli)
{
	user_desc_t *oldt, *nldt;
	uint_t nsels;
	size_t oldtsz, nldtsz;

	ASSERT(MUTEX_HELD(&pp->p_ldtlock));
	ASSERT(pp->p_ldt != NULL);
	ASSERT(pp->p_ldtlimit != 0);

	/*
	 * Allocate larger LDT just large enough to contain seli. The LDT must
	 * always be allocated in units of pages for KPTI.
	 */
	nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
	nsels = nldtsz / sizeof (user_desc_t);
	ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
	ASSERT(nsels > pp->p_ldtlimit);

	oldt = pp->p_ldt;
	oldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

	nldt = kmem_zalloc(nldtsz, KM_SLEEP);
	ASSERT(IS_P2ALIGNED(nldt, PAGESIZE));

	bcopy(oldt, nldt, oldtsz);

	/*
	 * unload old ldt.
	 */
	kpreempt_disable();
	ldt_unload();
	kpreempt_enable();

#if defined(__xpv)
	/*
	 * Make old ldt writable and new ldt read only.
	 */
	if (xen_ldt_setprot(oldt, oldtsz, PROT_READ | PROT_WRITE))
		panic("ldt_grow:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");

	if (xen_ldt_setprot(nldt, nldtsz, PROT_READ))
		panic("ldt_grow:xen_ldt_setprot(PROT_READ) failed");
#endif

	pp->p_ldt = nldt;
	pp->p_ldtlimit = nsels - 1;

	kmem_free(oldt, oldtsz);
}