/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2021 Joyent, Inc.
 */

/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
/*   All Rights Reserved */

/* Copyright (c) 1987, 1988 Microsoft Corporation */
/*   All Rights Reserved */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/errno.h>
#include <sys/fault.h>
#include <sys/syscall.h>
#include <sys/cpuvar.h>
#include <sys/sysi86.h>
#include <sys/psw.h>
#include <sys/cred.h>
#include <sys/policy.h>
#include <sys/thread.h>
#include <sys/debug.h>
#include <sys/ontrap.h>
#include <sys/privregs.h>
#include <sys/x86_archext.h>
#include <sys/vmem.h>
#include <sys/kmem.h>
#include <sys/mman.h>
#include <sys/archsystm.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/faultcode.h>
#include <sys/fp.h>
#include <sys/cmn_err.h>
#include <sys/segments.h>
#include <sys/clock.h>
#include <vm/hat_i86.h>
#if defined(__xpv)
#include <sys/hypervisor.h>
#include <sys/note.h>
#endif

static void ldt_alloc(proc_t *, uint_t);
static void ldt_free(proc_t *);
static void ldt_dup(proc_t *, proc_t *);
static void ldt_grow(proc_t *, uint_t);

/*
 * sysi86 System Call
 */

/* ARGSUSED */
int
sysi86(short cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
{
        struct ssd ssd;
        int error = 0;
        int c;
        proc_t *pp = curproc;

        switch (cmd) {

        /*
         * The SI86V86 subsystem call of the SYSI86 system call
         * supports only one subcode -- V86SC_IOPL.
         */
        case SI86V86:
                if (arg1 == V86SC_IOPL) {
#if defined(__xpv)
                        struct ctxop *ctx;
#endif
                        struct regs *rp = lwptoregs(ttolwp(curthread));
                        greg_t oldpl = rp->r_ps & PS_IOPL;
                        greg_t newpl = arg2 & PS_IOPL;

                        /*
                         * Must be privileged to run this system call
                         * if giving more io privilege.
                         */
                        if (newpl > oldpl && (error =
                            secpolicy_sys_config(CRED(), B_FALSE)) != 0)
                                return (set_errno(error));
#if defined(__xpv)
                        ctx = installctx_preallocate();
                        kpreempt_disable();
                        installctx(curthread, NULL, xen_disable_user_iopl,
                            xen_enable_user_iopl, NULL, NULL,
                            xen_disable_user_iopl, NULL, ctx);
                        xen_enable_user_iopl();
                        kpreempt_enable();
#else
                        rp->r_ps ^= oldpl ^ newpl;
#endif
                } else
                        error = EINVAL;
                break;

        /*
         * Set a segment descriptor
         */
        case SI86DSCR:
                /*
                 * There are considerable problems here manipulating
                 * resources shared by many running lwps. Get everyone
                 * into a safe state before changing the LDT.
                 */
                if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK1)) {
                        error = EINTR;
                        break;
                }

                if (get_udatamodel() == DATAMODEL_LP64) {
                        error = EINVAL;
                        break;
                }

                if (copyin((caddr_t)arg1, &ssd, sizeof (ssd)) < 0) {
                        error = EFAULT;
                        break;
                }

                error = setdscr(&ssd);

                mutex_enter(&pp->p_lock);
                if (curthread != pp->p_agenttp)
                        continuelwps(pp);
                mutex_exit(&pp->p_lock);
                break;

        case SI86FPHW:
                c = fp_kind & 0xff;
                if (suword32((void *)arg1, c) == -1)
                        error = EFAULT;
                break;

        case SI86FPSTART:
                /*
                 * arg1 is the address of _fp_hw
                 * arg2 is the desired x87 FCW value
                 * arg3 is the desired SSE MXCSR value
                 * a return value of one means SSE hardware, else none.
                 */
                c = fp_kind & 0xff;
                if (suword32((void *)arg1, c) == -1) {
                        error = EFAULT;
                        break;
                }
                fpsetcw((uint16_t)arg2, (uint32_t)arg3);
                return ((fp_kind & __FP_SSE) ? 1 : 0);

        /* real time clock management commands */

        case WTODC:
                if ((error = secpolicy_settime(CRED())) == 0) {
                        timestruc_t ts;
                        mutex_enter(&tod_lock);
                        gethrestime(&ts);
                        tod_set(ts);
                        mutex_exit(&tod_lock);
                }
                break;

/* Give some timezone playing room */
#define ONEWEEK (7 * 24 * 60 * 60)

        case SGMTL:
                /*
                 * Called from 32 bit land, negative values
                 * are not sign extended, so we do that here
                 * by casting it to an int and back. We also
                 * clamp the value to within reason and detect
                 * when a 64 bit call overflows an int.
                 */
                if ((error = secpolicy_settime(CRED())) == 0) {
                        int newlag = (int)arg1;

#ifdef _SYSCALL32_IMPL
                        if (get_udatamodel() == DATAMODEL_NATIVE &&
                            (long)newlag != (long)arg1) {
                                error = EOVERFLOW;
                        } else
#endif
                        if (newlag >= -ONEWEEK && newlag <= ONEWEEK)
                                sgmtl(newlag);
                        else
                                error = EOVERFLOW;
                }
                break;

        case GGMTL:
                if (get_udatamodel() == DATAMODEL_NATIVE) {
                        if (sulword((void *)arg1, ggmtl()) == -1)
                                error = EFAULT;
#ifdef _SYSCALL32_IMPL
                } else {
                        time_t gmtl;

                        if ((gmtl = ggmtl()) > INT32_MAX) {
                                /*
                                 * Since gmt_lag can at most be
                                 * +/- 12 hours, something is
                                 * *seriously* messed up here.
                                 */
                                error = EOVERFLOW;
                        } else if (suword32((void *)arg1, (int32_t)gmtl) == -1)
                                error = EFAULT;
#endif
                }
                break;

        case RTCSYNC:
                if ((error = secpolicy_settime(CRED())) == 0)
                        rtcsync();
                break;

        /* END OF real time clock management commands */

        default:
                error = EINVAL;
                break;
        }
        return (error == 0 ? 0 : set_errno(error));
}

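/*
 * Translate between the hardware layout of a user segment descriptor
 * (user_desc_t) and the flat form passed around by SI86DSCR (struct ssd).
 * In the ssd form, acc1 packs the descriptor type (bits 0-4), the DPL
 * (bits 5-6) and the present bit (bit 7); acc2 packs the AVL bit (bit 0),
 * the L bit (bit 1), the D/B bit (bit 2) and the granularity bit (bit 3),
 * mirroring the shifts used below.
 */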
void
usd_to_ssd(user_desc_t *usd, struct ssd *ssd, selector_t sel)
{
        ssd->bo = USEGD_GETBASE(usd);
        ssd->ls = USEGD_GETLIMIT(usd);
        ssd->sel = sel;

        /*
         * set type, dpl and present bits.
         */
        ssd->acc1 = usd->usd_type;
        ssd->acc1 |= usd->usd_dpl << 5;
        ssd->acc1 |= usd->usd_p << (5 + 2);

        /*
         * set avl, DB and granularity bits.
         */
        ssd->acc2 = usd->usd_avl;

        ssd->acc2 |= usd->usd_long << 1;

        ssd->acc2 |= usd->usd_def32 << (1 + 1);
        ssd->acc2 |= usd->usd_gran << (1 + 1 + 1);
}

static void
ssd_to_usd(struct ssd *ssd, user_desc_t *usd)
{

        ASSERT(bcmp(usd, &null_udesc, sizeof (*usd)) == 0);

        USEGD_SETBASE(usd, ssd->bo);
        USEGD_SETLIMIT(usd, ssd->ls);

        /*
         * Set type, dpl and present bits.
         *
         * Force the "accessed" bit to on so that we don't run afoul of
         * KPTI.
         */
        usd->usd_type = ssd->acc1 | SDT_A;
        usd->usd_dpl = ssd->acc1 >> 5;
        usd->usd_p = ssd->acc1 >> (5 + 2);

        ASSERT(usd->usd_type >= SDT_MEMRO);
        ASSERT(usd->usd_dpl == SEL_UPL);

        /*
         * 64-bit code selectors are never allowed in the LDT.
         * Reserved bit is always 0 on 32-bit systems.
         */
        usd->usd_long = 0;

        /*
         * set avl, DB and granularity bits.
         */
        usd->usd_avl = ssd->acc2;
        usd->usd_def32 = ssd->acc2 >> (1 + 1);
        usd->usd_gran = ssd->acc2 >> (1 + 1 + 1);
}

/*
 * Load LDT register with the current process's LDT.
 */
static void
ldt_load(void)
{
#if defined(__xpv)
        xen_set_ldt(curproc->p_ldt, curproc->p_ldtlimit + 1);
#else
        size_t len;
        system_desc_t desc;

        /*
         * Before we can use the LDT on this CPU, we must install the LDT in
         * the user mapping table.
         */
        len = (curproc->p_ldtlimit + 1) * sizeof (user_desc_t);
        bcopy(curproc->p_ldt, CPU->cpu_m.mcpu_ldt, len);
        CPU->cpu_m.mcpu_ldt_len = len;
        set_syssegd(&desc, CPU->cpu_m.mcpu_ldt, len - 1, SDT_SYSLDT, SEL_KPL);
        *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = desc;

        wr_ldtr(ULDT_SEL);
#endif
}

/*
 * Store a NULL selector in the LDTR. All subsequent illegal references to
 * the LDT will result in a #gp.
 */
void
ldt_unload(void)
{
#if defined(__xpv)
        xen_set_ldt(NULL, 0);
#else
        *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = null_sdesc;
        wr_ldtr(0);

        bzero(CPU->cpu_m.mcpu_ldt, CPU->cpu_m.mcpu_ldt_len);
        CPU->cpu_m.mcpu_ldt_len = 0;
#endif
}

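/*
 * Context operations for processes that own a private LDT. They are
 * registered on the process via installpctx() in ldt_installctx() below:
 * ldt_savectx() runs when a thread of the process leaves the CPU (and at
 * exit), ldt_restorectx() runs when one comes back on, ldt_installctx()
 * itself is the fork handler that propagates the LDT and these ops to the
 * child, and ldt_freectx() releases the LDT at exec/exit. While a private
 * LDT is installed, fast syscall instructions are kept disabled; they are
 * re-enabled once the LDT is gone.
 */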
/*ARGSUSED*/
static void
ldt_savectx(proc_t *p)
{
        ASSERT(p->p_ldt != NULL);
        ASSERT(p == curproc);

        /*
         * The 64-bit kernel must be sure to clear any stale ldt
         * selectors when context switching away from a process that
         * has a private ldt. Consider the following example:
         *
         * Wine creates an ldt descriptor and points a segment register
         * to it.
         *
         * We then context switch away from the wine lwp to a kernel
         * thread and hit a breakpoint in the kernel with kmdb.
         *
         * When we continue and resume from kmdb we will #gp
         * fault since kmdb will have saved the stale ldt selector
         * from wine and will try to restore it, but we are no longer in
         * the context of the wine process and do not have our
         * ldtr register pointing to the private ldt.
         */
        reset_sregs();

        ldt_unload();
        cpu_fast_syscall_enable();
}

static void
ldt_restorectx(proc_t *p)
{
        ASSERT(p->p_ldt != NULL);
        ASSERT(p == curproc);

        ldt_load();
        cpu_fast_syscall_disable();
}

/*
 * At exec time, we need to clear up our LDT context and re-enable fast
 * syscalls for the new process image.
 *
 * The same is true for the other case, where we have:
 *
 * proc_exit()
 *      ->exitpctx()->ldt_savectx()
 *      ->freepctx()->ldt_freectx()
 *
 * Because pre-emption is not prevented between the two callbacks, we could
 * have come off CPU, and brought back LDT context when coming back on CPU
 * via ldt_restorectx().
 */
/* ARGSUSED */
static void
ldt_freectx(proc_t *p, int isexec)
{
        ASSERT(p->p_ldt != NULL);
        ASSERT(p == curproc);

        kpreempt_disable();
        ldt_free(p);
        cpu_fast_syscall_enable();
        kpreempt_enable();
}

/*
 * Install a ctx op that ensures syscall/sysenter are disabled.
 * See comments below.
 *
 * When a thread with a private LDT forks, the new process
 * must have the LDT context ops installed.
 */
/* ARGSUSED */
static void
ldt_installctx(proc_t *p, proc_t *cp)
{
        proc_t *targ = p;
        kthread_t *t;

        /*
         * If this is a fork, operate on the child process.
         */
        if (cp != NULL) {
                targ = cp;
                ldt_dup(p, cp);
        }

        /*
         * The process context ops expect the target process as their
         * argument.
         */
        ASSERT(removepctx(targ, targ, ldt_savectx, ldt_restorectx,
            ldt_installctx, ldt_savectx, ldt_freectx) == 0);

        installpctx(targ, targ, ldt_savectx, ldt_restorectx,
            ldt_installctx, ldt_savectx, ldt_freectx);

        /*
         * We've just disabled fast system call and return instructions; take
         * the slow path out to make sure we don't try to use one to return
         * back to user. We must set t_post_sys for every thread in the
         * process to make sure none of them escape out via fast return.
         */

        mutex_enter(&targ->p_lock);
        t = targ->p_tlist;
        do {
                t->t_post_sys = 1;
        } while ((t = t->t_forw) != targ->p_tlist);
        mutex_exit(&targ->p_lock);
}

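/*
 * setdscr() implements the SI86DSCR command: a 32-bit process installs or
 * replaces one of its own LDT entries by filling in a struct ssd and handing
 * it to sysi86(). Purely as an illustrative sketch (the values shown are
 * hypothetical and not taken from any particular consumer), a thread library
 * building a small %gs-addressable data segment would do something along
 * the lines of:
 *
 *      struct ssd ssd;
 *
 *      ssd.sel = (seli << 3) | 7;              -- LDT table indicator, RPL 3
 *      ssd.bo = (uint32_t)(uintptr_t)base;     -- segment base
 *      ssd.ls = limit;                         -- segment limit
 *      ssd.acc1 = ...;                         -- type, DPL 3, present
 *      ssd.acc2 = ...;                         -- AVL, D/B, granularity
 *      (void) sysi86(SI86DSCR, &ssd);
 *
 * The checks below restrict such requests to user-DPL LDT selectors within
 * the supported index range.
 */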
int
setdscr(struct ssd *ssd)
{
        ushort_t seli;          /* selector index */
        user_desc_t *ldp;       /* descriptor pointer */
        user_desc_t ndesc;      /* new descriptor */
        proc_t *pp = curproc;
        int rc = 0;

        /*
         * LDT segments: executable and data at DPL 3 only.
         */
        if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel))
                return (EINVAL);

        /*
         * check the selector index.
         */
        seli = SELTOIDX(ssd->sel);
        if (seli >= MAXNLDT || seli < LDT_UDBASE)
                return (EINVAL);

        ndesc = null_udesc;
        mutex_enter(&pp->p_ldtlock);

        /*
         * If this is the first time for this process, then set up a
         * private LDT for it.
         */
        if (pp->p_ldt == NULL) {
                ldt_alloc(pp, seli);

                /*
                 * Now that this process has a private LDT, the use of
                 * the syscall/sysret and sysenter/sysexit instructions
                 * is forbidden for this process because they destroy
                 * the contents of %cs and %ss segment registers.
                 *
                 * Explicitly disable them here and add a context handler
                 * to the process. Note that disabling them here means we
                 * can't use sysret or sysexit on the way out of this
                 * system call - so we force this thread to take the slow
                 * path (which doesn't make use of sysenter or sysexit)
                 * back out.
                 */
                kpreempt_disable();
                ldt_installctx(pp, NULL);
                cpu_fast_syscall_disable();
                ASSERT(curthread->t_post_sys != 0);
                kpreempt_enable();

        } else if (seli > pp->p_ldtlimit) {
                ASSERT(pp->p_pctx != NULL);

                /*
                 * Increase size of ldt to include seli.
                 */
                ldt_grow(pp, seli);
        }

        ASSERT(seli <= pp->p_ldtlimit);
        ldp = &pp->p_ldt[seli];

        /*
         * On the 64-bit kernel, this is where things get more subtle.
         * Recall that in the 64-bit kernel, when we enter the kernel we
         * deliberately -don't- reload the segment selectors we came in on
         * for %ds, %es, %fs or %gs. Messing with selectors is expensive,
         * and the underlying descriptors are essentially ignored by the
         * hardware in long mode - except for the base that we override with
         * the gsbase MSRs.
         *
         * However, there's one unfortunate issue with this rosy picture --
         * a descriptor that's not marked as 'present' will still generate
         * an #np when loading a segment register.
         *
         * Consider this case. An lwp creates a harmless LDT entry, points
         * one of its segment registers at it, then tells the kernel (here)
         * to delete it. In the 32-bit kernel, the #np will happen on the
         * way back to userland where we reload the segment registers, and be
         * handled in kern_gpfault(). In the 64-bit kernel, the same thing
         * will happen in the normal case too. However, if we're trying to
         * use a debugger that wants to save and restore the segment
         * registers, and the debugger thinks that we have valid segment
         * registers, we have the problem that the debugger will try and
         * restore the segment register that points at the now 'not present'
         * descriptor and will take a #np right there.
         *
         * We should obviously fix the debugger to be paranoid about
         * -not- restoring segment registers that point to bad descriptors;
         * however we can prevent the problem here if we check to see if any
         * of the segment registers are still pointing at the thing we're
         * destroying; if they are, return an error instead. (That also seems
         * a lot better failure mode than SIGKILL and a core file
         * from kern_gpfault() too.)
         */
        if (SI86SSD_PRES(ssd) == 0) {
                kthread_t *t;
                int bad = 0;

                /*
                 * Look carefully at the segment registers of every lwp
                 * in the process (they're all stopped by our caller).
                 * If we're about to invalidate a descriptor that's still
                 * being referenced by *any* of them, return an error,
                 * rather than having them #gp on their way out of the
                 * kernel.
                 */
                ASSERT(pp->p_lwprcnt == 1);

                mutex_enter(&pp->p_lock);
                t = pp->p_tlist;
                do {
                        klwp_t *lwp = ttolwp(t);
                        struct regs *rp = lwp->lwp_regs;
                        pcb_t *pcb = &lwp->lwp_pcb;

                        if (ssd->sel == rp->r_cs || ssd->sel == rp->r_ss) {
                                bad = 1;
                                break;
                        }

                        if (PCB_NEED_UPDATE_SEGS(pcb)) {
                                if (ssd->sel == pcb->pcb_ds ||
                                    ssd->sel == pcb->pcb_es ||
                                    ssd->sel == pcb->pcb_fs ||
                                    ssd->sel == pcb->pcb_gs) {
                                        bad = 1;
                                        break;
                                }
                        } else {
                                if (ssd->sel == rp->r_ds ||
                                    ssd->sel == rp->r_es ||
                                    ssd->sel == rp->r_fs ||
                                    ssd->sel == rp->r_gs) {
                                        bad = 1;
                                        break;
                                }
                        }

                } while ((t = t->t_forw) != pp->p_tlist);
                mutex_exit(&pp->p_lock);

                if (bad) {
                        mutex_exit(&pp->p_ldtlock);
                        return (EBUSY);
                }
        }

        /*
         * If acc1 is zero, clear the descriptor (including the 'present'
         * bit). Make sure we update the CPU-private copy of the LDT.
         */
        if (ssd->acc1 == 0) {
                rc = ldt_update_segd(ldp, &null_udesc);
                kpreempt_disable();
                ldt_load();
                kpreempt_enable();
                mutex_exit(&pp->p_ldtlock);
                return (rc);
        }

        /*
         * Check segment type, allow segment not present and
         * only user DPL (3).
         */
        if (SI86SSD_DPL(ssd) != SEL_UPL) {
                mutex_exit(&pp->p_ldtlock);
                return (EINVAL);
        }

        /*
         * Do not allow 32-bit applications to create 64-bit mode code
         * segments.
         */
        if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 &&
            SI86SSD_ISLONG(ssd)) {
                mutex_exit(&pp->p_ldtlock);
                return (EINVAL);
        }

        /*
         * Set up a code or data user segment descriptor, making sure to
         * update the CPU-private copy of the LDT.
         */
        if (SI86SSD_ISUSEG(ssd)) {
                ssd_to_usd(ssd, &ndesc);
                rc = ldt_update_segd(ldp, &ndesc);
                kpreempt_disable();
                ldt_load();
                kpreempt_enable();
                mutex_exit(&pp->p_ldtlock);
                return (rc);
        }

        mutex_exit(&pp->p_ldtlock);
        return (EINVAL);
}

/*
 * Allocate new LDT for process just large enough to contain seli. Note we
 * allocate and grow LDT in PAGESIZE chunks. We do this to simplify the
 * implementation and because on the hypervisor it's required, since the LDT
 * must live on pages that have PROT_WRITE removed and which are given to the
 * hypervisor.
 *
 * Note that we don't actually load the LDT into the current CPU here: it's
 * done later by our caller.
 */
static void
ldt_alloc(proc_t *pp, uint_t seli)
{
        user_desc_t *ldt;
        size_t ldtsz;
        uint_t nsels;

        ASSERT(MUTEX_HELD(&pp->p_ldtlock));
        ASSERT(pp->p_ldt == NULL);
        ASSERT(pp->p_ldtlimit == 0);

        /*
         * Allocate new LDT just large enough to contain seli. The LDT must
         * always be allocated in units of pages for KPTI.
         */
        ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
        nsels = ldtsz / sizeof (user_desc_t);
        ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);

        ldt = kmem_zalloc(ldtsz, KM_SLEEP);
        ASSERT(IS_P2ALIGNED(ldt, PAGESIZE));

#if defined(__xpv)
        if (xen_ldt_setprot(ldt, ldtsz, PROT_READ))
                panic("ldt_alloc:xen_ldt_setprot(PROT_READ) failed");
#endif

        pp->p_ldt = ldt;
        pp->p_ldtlimit = nsels - 1;
}

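/*
 * Free the process's private LDT. Called from ldt_freectx() at exec or exit
 * time; if the process being torn down is the one currently running, the LDT
 * is first unloaded from this CPU.
 */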
static void
ldt_free(proc_t *pp)
{
        user_desc_t *ldt;
        size_t ldtsz;

        ASSERT(pp->p_ldt != NULL);

        mutex_enter(&pp->p_ldtlock);
        ldt = pp->p_ldt;
        ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

        ASSERT(IS_P2ALIGNED(ldtsz, PAGESIZE));

        pp->p_ldt = NULL;
        pp->p_ldtlimit = 0;
        mutex_exit(&pp->p_ldtlock);

        if (pp == curproc) {
                kpreempt_disable();
                ldt_unload();
                kpreempt_enable();
        }

#if defined(__xpv)
        /*
         * We are not allowed to make the ldt writable until after
         * we tell the hypervisor to unload it.
         */
        if (xen_ldt_setprot(ldt, ldtsz, PROT_READ | PROT_WRITE))
                panic("ldt_free:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
#endif

        kmem_free(ldt, ldtsz);
}

/*
 * On fork, copy the new ldt for the child.
 */
static void
ldt_dup(proc_t *pp, proc_t *cp)
{
        size_t ldtsz;

        ASSERT(pp->p_ldt != NULL);
        ASSERT(cp != curproc);

        /*
         * I assume the parent's ldt can't increase since we're in a fork.
         */
        mutex_enter(&pp->p_ldtlock);
        mutex_enter(&cp->p_ldtlock);

        ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

        ldt_alloc(cp, pp->p_ldtlimit);

#if defined(__xpv)
        /*
         * Make the child's ldt writable so it can be copied from the
         * parent's ldt. This works because ldt_alloc() above did not load
         * the ldt, since it is for the child process. If we tried to make
         * an LDT writable while it is loaded in hardware, the setprot
         * operation would fail.
         */
        if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ | PROT_WRITE))
                panic("ldt_dup:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
#endif

        bcopy(pp->p_ldt, cp->p_ldt, ldtsz);

#if defined(__xpv)
        if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ))
                panic("ldt_dup:xen_ldt_setprot(PROT_READ) failed");
#endif
        mutex_exit(&cp->p_ldtlock);
        mutex_exit(&pp->p_ldtlock);
}

/*
 * Grow the process's LDT so that it is large enough to contain seli.
 *
 * Note that we don't actually load the LDT into the current CPU here: it's
 * done later by our caller - unless we take an error. This works out because
 * ldt_load() does a copy of ->p_ldt instead of directly loading it into the
 * GDT (and therefore can't be using the freed old LDT), and by definition if
 * the new entry didn't pass validation, then the proc shouldn't be
 * referencing an entry in the extended region.
 */
static void
ldt_grow(proc_t *pp, uint_t seli)
{
        user_desc_t *oldt, *nldt;
        uint_t nsels;
        size_t oldtsz, nldtsz;

        ASSERT(MUTEX_HELD(&pp->p_ldtlock));
        ASSERT(pp->p_ldt != NULL);
        ASSERT(pp->p_ldtlimit != 0);

        /*
         * Allocate larger LDT just large enough to contain seli. The LDT
         * must always be allocated in units of pages for KPTI.
         */
        nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
        nsels = nldtsz / sizeof (user_desc_t);
        ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
        ASSERT(nsels > pp->p_ldtlimit);

        oldt = pp->p_ldt;
        oldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

        nldt = kmem_zalloc(nldtsz, KM_SLEEP);
        ASSERT(IS_P2ALIGNED(nldt, PAGESIZE));

        bcopy(oldt, nldt, oldtsz);

        /*
         * unload old ldt.
         */
        kpreempt_disable();
        ldt_unload();
        kpreempt_enable();

#if defined(__xpv)

        /*
         * Make old ldt writable and new ldt read only.
         */
        if (xen_ldt_setprot(oldt, oldtsz, PROT_READ | PROT_WRITE))
                panic("ldt_grow:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");

        if (xen_ldt_setprot(nldt, nldtsz, PROT_READ))
                panic("ldt_grow:xen_ldt_setprot(PROT_READ) failed");
#endif

        pp->p_ldt = nldt;
        pp->p_ldtlimit = nsels - 1;

        kmem_free(oldt, oldtsz);
}