/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1990, 1991 UNIX System Laboratories, Inc.	*/
/*	Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T	*/
/*	  All Rights Reserved	*/

/*	Copyright (c) 1987, 1988 Microsoft Corporation	*/
/*	  All Rights Reserved	*/

#include <sys/param.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/errno.h>
#include <sys/fault.h>
#include <sys/syscall.h>
#include <sys/cpuvar.h>
#include <sys/sysi86.h>
#include <sys/psw.h>
#include <sys/cred.h>
#include <sys/policy.h>
#include <sys/thread.h>
#include <sys/debug.h>
#include <sys/ontrap.h>
#include <sys/privregs.h>
#include <sys/x86_archext.h>
#include <sys/vmem.h>
#include <sys/kmem.h>
#include <sys/mman.h>
#include <sys/archsystm.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/faultcode.h>
#include <sys/fp.h>
#include <sys/cmn_err.h>
#include <sys/segments.h>
#include <sys/clock.h>
#if defined(__xpv)
#include <sys/hypervisor.h>
#include <sys/note.h>
#endif

static void ldt_alloc(proc_t *, uint_t);
static void ldt_free(proc_t *);
static void ldt_dup(proc_t *, proc_t *);
static void ldt_grow(proc_t *, uint_t);

/*
 * sysi86 System Call
 */

/* ARGSUSED */
int
sysi86(short cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
{
	struct ssd ssd;
	int error = 0;
	int c;
	proc_t *pp = curproc;

	switch (cmd) {

	/*
	 * The SI86V86 subsystem call of the SYSI86 system call
	 * supports only one subcode -- V86SC_IOPL.
	 */
	case SI86V86:
		if (arg1 == V86SC_IOPL) {
			struct regs *rp = lwptoregs(ttolwp(curthread));
			greg_t oldpl = rp->r_ps & PS_IOPL;
			greg_t newpl = arg2 & PS_IOPL;

			/*
			 * Must be privileged to run this system call
			 * if giving more io privilege.
			 */
			if (newpl > oldpl && (error =
			    secpolicy_sys_config(CRED(), B_FALSE)) != 0)
				return (set_errno(error));
#if defined(__xpv)
			kpreempt_disable();
			installctx(curthread, NULL, xen_disable_user_iopl,
			    xen_enable_user_iopl, NULL, NULL,
			    xen_disable_user_iopl, NULL);
			xen_enable_user_iopl();
			kpreempt_enable();
#else
			rp->r_ps ^= oldpl ^ newpl;
#endif
		} else
			error = EINVAL;
		break;
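
	/*
	 * Userland sketch (illustrative only, not part of this file): a
	 * sufficiently privileged 32-bit process could raise its IOPL
	 * to 3 with the SI86V86/V86SC_IOPL subcode above, e.g.:
	 *
	 *	#include <sys/sysi86.h>
	 *	#include <sys/psw.h>
	 *
	 *	if (sysi86(SI86V86, V86SC_IOPL, PS_IOPL) == -1)
	 *		perror("sysi86(SI86V86)");
	 *
	 * PS_IOPL is the %eflags IOPL mask; arg2 is masked with it above,
	 * so passing the full mask requests IOPL 3.
	 */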

	/*
	 * Set a segment descriptor
	 */
	case SI86DSCR:
		if (get_udatamodel() == DATAMODEL_LP64) {
			error = EINVAL;
			break;
		}

		if (copyin((caddr_t)arg1, &ssd, sizeof (ssd)) < 0) {
			error = EFAULT;
			break;
		}

		/*
		 * There are considerable problems here manipulating
		 * resources shared by many running lwps.  Get everyone
		 * into a safe state before changing the LDT.  Note that
		 * the cheap checks above are done first so that failing
		 * them cannot leave the other lwps held.
		 */
		if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK1)) {
			error = EINTR;
			break;
		}

		error = setdscr(&ssd);

		mutex_enter(&pp->p_lock);
		if (curthread != pp->p_agenttp)
			continuelwps(pp);
		mutex_exit(&pp->p_lock);
		break;

	case SI86FPHW:
		c = fp_kind & 0xff;
		if (suword32((void *)arg1, c) == -1)
			error = EFAULT;
		break;

	case SI86FPSTART:
		/*
		 * arg1 is the address of _fp_hw
		 * arg2 is the desired x87 FCW value
		 * arg3 is the desired SSE MXCSR value
		 * a return value of one means SSE hardware, else none.
		 */
		c = fp_kind & 0xff;
		if (suword32((void *)arg1, c) == -1) {
			error = EFAULT;
			break;
		}
		fpsetcw((uint16_t)arg2, (uint32_t)arg3);
		return ((fp_kind & __FP_SSE) ? 1 : 0);
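
	/*
	 * Userland sketch (illustrative only): SI86FPSTART is how a C
	 * runtime can publish the FP hardware kind into _fp_hw and set
	 * the initial x87/SSE control state in one call.  Assuming the
	 * sysi86() stub from <sys/sysi86.h>, something like:
	 *
	 *	extern int _fp_hw;
	 *	int have_sse = sysi86(SI86FPSTART, &_fp_hw,
	 *	    0x037f, 0x1f80);
	 *
	 * where 0x037f and 0x1f80 are the architectural power-on
	 * defaults for the FCW and MXCSR (the values a real runtime
	 * passes may differ).
	 */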

	/* real time clock management commands */

	case WTODC:
		if ((error = secpolicy_settime(CRED())) == 0) {
			timestruc_t ts;
			mutex_enter(&tod_lock);
			gethrestime(&ts);
			tod_set(ts);
			mutex_exit(&tod_lock);
		}
		break;

/* Give some timezone playing room */
#define	ONEWEEK	(7 * 24 * 60 * 60)

	case SGMTL:
		/*
		 * Called from 32 bit land, negative values
		 * are not sign extended, so we do that here
		 * by casting it to an int and back.  We also
		 * clamp the value to within reason and detect
		 * when a 64 bit call overflows an int.
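		 *
		 * For example, a 32-bit caller passing -3600 arrives
		 * here as arg1 == 0xfffff1f0, and (int)arg1 recovers
		 * -3600.  A 64-bit caller passing 0x100000000 truncates
		 * to 0 in newlag, which the (long)newlag != (long)arg1
		 * check below rejects with EOVERFLOW.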
		 */
		if ((error = secpolicy_settime(CRED())) == 0) {
			int newlag = (int)arg1;

#ifdef _SYSCALL32_IMPL
			if (get_udatamodel() == DATAMODEL_NATIVE &&
			    (long)newlag != (long)arg1) {
				error = EOVERFLOW;
			} else
#endif
			if (newlag >= -ONEWEEK && newlag <= ONEWEEK)
				sgmtl(newlag);
			else
				error = EOVERFLOW;
		}
		break;

	case GGMTL:
		if (get_udatamodel() == DATAMODEL_NATIVE) {
			if (sulword((void *)arg1, ggmtl()) == -1)
				error = EFAULT;
#ifdef _SYSCALL32_IMPL
		} else {
			time_t gmtl;

			if ((gmtl = ggmtl()) > INT32_MAX) {
				/*
				 * Since gmt_lag can at most be
				 * +/- 12 hours, something is
				 * *seriously* messed up here.
				 */
				error = EOVERFLOW;
			} else if (suword32((void *)arg1, (int32_t)gmtl) == -1)
				error = EFAULT;
#endif
		}
		break;

	case RTCSYNC:
		if ((error = secpolicy_settime(CRED())) == 0)
			rtcsync();
		break;

	/* END OF real time clock management commands */

	default:
		error = EINVAL;
		break;
	}
	return (error == 0 ? 0 : set_errno(error));
}

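/*
 * Note on the two ssd access bytes packed and unpacked below (they
 * mirror the hardware descriptor fields):
 *
 *	acc1:	bits 0-4	segment type
 *		bits 5-6	descriptor privilege level (dpl)
 *		bit 7		present (p)
 *
 *	acc2:	bit 0		available to software (avl)
 *		bit 1		64-bit code (long); reserved on 32-bit
 *		bit 2		default operand size (def32)
 *		bit 3		granularity (gran)
 */
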
void
usd_to_ssd(user_desc_t *usd, struct ssd *ssd, selector_t sel)
{
	ssd->bo = USEGD_GETBASE(usd);
	ssd->ls = USEGD_GETLIMIT(usd);
	ssd->sel = sel;

	/*
	 * set type, dpl and present bits.
	 */
	ssd->acc1 = usd->usd_type;
	ssd->acc1 |= usd->usd_dpl << 5;
	ssd->acc1 |= usd->usd_p << (5 + 2);

	/*
	 * set avl, DB and granularity bits.
	 */
	ssd->acc2 = usd->usd_avl;

#if defined(__amd64)
	ssd->acc2 |= usd->usd_long << 1;
#else
	ssd->acc2 |= usd->usd_reserved << 1;
#endif

	ssd->acc2 |= usd->usd_def32 << (1 + 1);
	ssd->acc2 |= usd->usd_gran << (1 + 1 + 1);
}

static void
ssd_to_usd(struct ssd *ssd, user_desc_t *usd)
{

	ASSERT(bcmp(usd, &null_udesc, sizeof (*usd)) == 0);

	USEGD_SETBASE(usd, ssd->bo);
	USEGD_SETLIMIT(usd, ssd->ls);

	/*
	 * set type, dpl and present bits.
	 */
	usd->usd_type = ssd->acc1;
	usd->usd_dpl = ssd->acc1 >> 5;
	usd->usd_p = ssd->acc1 >> (5 + 2);

	ASSERT(usd->usd_type >= SDT_MEMRO);
	ASSERT(usd->usd_dpl == SEL_UPL);

	/*
	 * 64-bit code selectors are never allowed in the LDT.
	 * The reserved bit is always 0 on 32-bit systems.
	 */
#if defined(__amd64)
	usd->usd_long = 0;
#else
	usd->usd_reserved = 0;
#endif

	/*
	 * set avl, DB and granularity bits.
	 */
	usd->usd_avl = ssd->acc2;
	usd->usd_def32 = ssd->acc2 >> (1 + 1);
	usd->usd_gran = ssd->acc2 >> (1 + 1 + 1);
}

#if defined(__i386)

static void
ssd_to_sgd(struct ssd *ssd, gate_desc_t *sgd)
{

	ASSERT(bcmp(sgd, &null_sdesc, sizeof (*sgd)) == 0);

	sgd->sgd_looffset = ssd->bo;
	sgd->sgd_hioffset = ssd->bo >> 16;

	sgd->sgd_selector = ssd->ls;

	/*
	 * set type, dpl and present bits.
	 */
	sgd->sgd_type = ssd->acc1;
	sgd->sgd_dpl = ssd->acc1 >> 5;
	sgd->sgd_p = ssd->acc1 >> 7;
	ASSERT(sgd->sgd_type == SDT_SYSCGT);
	ASSERT(sgd->sgd_dpl == SEL_UPL);
	sgd->sgd_stkcpy = 0;
}

#endif	/* __i386 */

/*
 * Load LDT register with the current process's LDT.
 */
static void
ldt_load(void)
{
#if defined(__xpv)
	xen_set_ldt(get_ssd_base(&curproc->p_ldt_desc),
	    curproc->p_ldtlimit + 1);
#else
	*((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = curproc->p_ldt_desc;
	wr_ldtr(ULDT_SEL);
#endif
}

/*
 * Store a NULL selector in the LDTR. All subsequent illegal references to
 * the LDT will result in a #gp.
 */
void
ldt_unload(void)
{
#if defined(__xpv)
	xen_set_ldt(NULL, 0);
#else
	*((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = null_sdesc;
	wr_ldtr(0);
#endif
}

/*ARGSUSED*/
static void
ldt_savectx(proc_t *p)
{
	ASSERT(p->p_ldt != NULL);
	ASSERT(p == curproc);

#if defined(__amd64)
	/*
	 * The 64-bit kernel must be sure to clear any stale ldt
	 * selectors when context switching away from a process that
	 * has a private ldt. Consider the following example:
	 *
	 *	Wine creates an ldt descriptor and points a segment
	 *	register to it.
	 *
	 *	We then context switch away from the wine lwp to a kernel
	 *	thread and hit a breakpoint in the kernel with kmdb.
	 *
	 *	When we continue and resume from kmdb we will #gp
	 *	fault since kmdb will have saved the stale ldt selector
	 *	from wine and will try to restore it but we are no longer in
	 *	the context of the wine process and do not have our
	 *	ldtr register pointing to the private ldt.
	 */
	reset_sregs();
#endif

	ldt_unload();
	cpu_fast_syscall_enable(NULL);
}

static void
ldt_restorectx(proc_t *p)
{
	ASSERT(p->p_ldt != NULL);
	ASSERT(p == curproc);

	ldt_load();
	cpu_fast_syscall_disable(NULL);
}

/*
 * When a process with a private LDT execs, fast syscalls must be enabled for
 * the new process image.
 */
/* ARGSUSED */
static void
ldt_freectx(proc_t *p, int isexec)
{
	ASSERT(p->p_ldt);

	if (isexec) {
		kpreempt_disable();
		cpu_fast_syscall_enable(NULL);
		kpreempt_enable();
	}

	/*
	 * ldt_free() will free the memory used by the private LDT, reset the
	 * process's descriptor, and re-program the LDTR.
	 */
	ldt_free(p);
}

/*
 * Install ctx op that ensures syscall/sysenter are disabled.
 * See comments below.
 *
 * When a thread with a private LDT forks, the new process
 * must have the LDT context ops installed.
 */
/* ARGSUSED */
static void
ldt_installctx(proc_t *p, proc_t *cp)
{
	proc_t		*targ = p;
	kthread_t	*t;

	/*
	 * If this is a fork, operate on the child process.
	 */
	if (cp != NULL) {
		targ = cp;
		ldt_dup(p, cp);
	}

	/*
	 * The process context ops expect the target process as their argument.
	 */
	ASSERT(removepctx(targ, targ, ldt_savectx, ldt_restorectx,
	    ldt_installctx, ldt_savectx, ldt_freectx) == 0);

	installpctx(targ, targ, ldt_savectx, ldt_restorectx,
	    ldt_installctx, ldt_savectx, ldt_freectx);

	/*
	 * We've just disabled fast system call and return instructions; take
	 * the slow path out to make sure we don't try to use one to return
	 * back to user. We must set t_post_sys for every thread in the
	 * process to make sure none of them escape out via fast return.
	 */

	mutex_enter(&targ->p_lock);
	t = targ->p_tlist;
	do {
		t->t_post_sys = 1;
	} while ((t = t->t_forw) != targ->p_tlist);
	mutex_exit(&targ->p_lock);
}

int
setdscr(struct ssd *ssd)
{
	ushort_t seli;		/* selector index */
	user_desc_t *ldp;	/* descriptor pointer */
	user_desc_t ndesc;	/* new descriptor */
	proc_t	*pp = ttoproc(curthread);
	int	rc = 0;

	/*
	 * LDT segments: executable and data at DPL 3 only.
	 */
	if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel))
		return (EINVAL);

	/*
	 * check the selector index.
	 */
	seli = SELTOIDX(ssd->sel);
	if (seli >= MAXNLDT || seli < LDT_UDBASE)
		return (EINVAL);

	ndesc = null_udesc;
	mutex_enter(&pp->p_ldtlock);

	/*
	 * If this is the first time for this process then setup a
	 * private LDT for it.
	 */
	if (pp->p_ldt == NULL) {
		ldt_alloc(pp, seli);

		/*
		 * Now that this process has a private LDT, the use of
		 * the syscall/sysret and sysenter/sysexit instructions
		 * is forbidden for this process because they destroy
		 * the contents of the %cs and %ss segment registers.
		 *
		 * Explicitly disable them here and add a context handler
		 * to the process. Note that disabling
		 * them here means we can't use sysret or sysexit on
		 * the way out of this system call - so we force this
		 * thread to take the slow path (which doesn't make use
		 * of sysenter or sysexit) back out.
		 */
		kpreempt_disable();
		ldt_installctx(pp, NULL);
		cpu_fast_syscall_disable(NULL);
		ASSERT(curthread->t_post_sys != 0);
		kpreempt_enable();

	} else if (seli > pp->p_ldtlimit) {

		/*
		 * Increase size of ldt to include seli.
		 */
		ldt_grow(pp, seli);
	}

	ASSERT(seli <= pp->p_ldtlimit);
	ldp = &pp->p_ldt[seli];

	/*
	 * On the 64-bit kernel, this is where things get more subtle.
	 * Recall that in the 64-bit kernel, when we enter the kernel we
	 * deliberately -don't- reload the segment selectors we came in on
	 * for %ds, %es, %fs or %gs. Messing with selectors is expensive,
	 * and the underlying descriptors are essentially ignored by the
	 * hardware in long mode - except for the base that we override with
	 * the gsbase MSRs.
	 *
	 * However, there's one unfortunate issue with this rosy picture --
	 * a descriptor that's not marked as 'present' will still generate
	 * an #np when loading a segment register.
	 *
	 * Consider this case.  An lwp creates a harmless LDT entry, points
	 * one of its segment registers at it, then tells the kernel (here)
	 * to delete it.  In the 32-bit kernel, the #np will happen on the
	 * way back to userland where we reload the segment registers, and be
	 * handled in kern_gpfault().  In the 64-bit kernel, the same thing
	 * will happen in the normal case too.  However, if we're trying to
	 * use a debugger that wants to save and restore the segment registers,
	 * and the debugger thinks that we have valid segment registers, we
	 * have the problem that the debugger will try and restore the
	 * segment register that points at the now 'not present' descriptor
	 * and will take a #np right there.
	 *
	 * We should obviously fix the debugger to be paranoid about
	 * -not- restoring segment registers that point to bad descriptors;
	 * however we can prevent the problem here if we check to see if any
	 * of the segment registers are still pointing at the thing we're
	 * destroying; if they are, return an error instead. (That also seems
	 * like a much better failure mode than SIGKILL and a core file
	 * from kern_gpfault() too.)
	 */
	if (SI86SSD_PRES(ssd) == 0) {
		kthread_t *t;
		int bad = 0;

		/*
		 * Look carefully at the segment registers of every lwp
		 * in the process (they're all stopped by our caller).
		 * If we're about to invalidate a descriptor that's still
		 * being referenced by *any* of them, return an error,
		 * rather than having them #gp on their way out of the kernel.
		 */
		ASSERT(pp->p_lwprcnt == 1);

		mutex_enter(&pp->p_lock);
		t = pp->p_tlist;
		do {
			klwp_t *lwp = ttolwp(t);
			struct regs *rp = lwp->lwp_regs;
#if defined(__amd64)
			pcb_t *pcb = &lwp->lwp_pcb;
#endif

			if (ssd->sel == rp->r_cs || ssd->sel == rp->r_ss) {
				bad = 1;
				break;
			}

#if defined(__amd64)
			if (pcb->pcb_rupdate == 1) {
				if (ssd->sel == pcb->pcb_ds ||
				    ssd->sel == pcb->pcb_es ||
				    ssd->sel == pcb->pcb_fs ||
				    ssd->sel == pcb->pcb_gs) {
					bad = 1;
					break;
				}
			} else
#endif
			{
				if (ssd->sel == rp->r_ds ||
				    ssd->sel == rp->r_es ||
				    ssd->sel == rp->r_fs ||
				    ssd->sel == rp->r_gs) {
					bad = 1;
					break;
				}
			}

		} while ((t = t->t_forw) != pp->p_tlist);
		mutex_exit(&pp->p_lock);

		if (bad) {
			mutex_exit(&pp->p_ldtlock);
			return (EBUSY);
		}
	}

	/*
	 * If acc1 is zero, clear the descriptor (including the 'present' bit)
	 */
	if (ssd->acc1 == 0) {
		rc = ldt_update_segd(ldp, &null_udesc);
		mutex_exit(&pp->p_ldtlock);
		return (rc);
	}

	/*
	 * Check the segment type: not-present segments are allowed, but
	 * only at user DPL (3).
	 */
	if (SI86SSD_DPL(ssd) != SEL_UPL) {
		mutex_exit(&pp->p_ldtlock);
		return (EINVAL);
	}

#if defined(__amd64)
	/*
	 * Do not allow 32-bit applications to create 64-bit mode code
	 * segments.
	 */
	if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 &&
	    SI86SSD_ISLONG(ssd)) {
		mutex_exit(&pp->p_ldtlock);
		return (EINVAL);
	}
#endif /* __amd64 */

	/*
	 * Set up a code or data user segment descriptor.
	 */
	if (SI86SSD_ISUSEG(ssd)) {
		ssd_to_usd(ssd, &ndesc);
		rc = ldt_update_segd(ldp, &ndesc);
		mutex_exit(&pp->p_ldtlock);
		return (rc);
	}

#if defined(__i386)
	/*
	 * Allow a call gate only if the destination is in the LDT
	 * and the system is running in 32-bit legacy mode.
	 *
	 * In long mode 32-bit call gates are redefined as 64-bit call
	 * gates, and the hardware enforces that the target code selector
	 * of the call gate must be a 64-bit selector; otherwise a #gp
	 * fault is generated. Since we do not allow 32-bit processes to
	 * switch themselves to 64-bit, we never allow call gates on a
	 * 64-bit system.
	 */
	if (SI86SSD_TYPE(ssd) == SDT_SYSCGT && SELISLDT(ssd->ls)) {
		ssd_to_sgd(ssd, (gate_desc_t *)&ndesc);
		rc = ldt_update_segd(ldp, &ndesc);
		mutex_exit(&pp->p_ldtlock);
		return (rc);
	}
#endif	/* __i386 */

	mutex_exit(&pp->p_ldtlock);
	return (EINVAL);
}
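
/*
 * Userland sketch (illustrative only, not part of this file): a 32-bit
 * process installing a small read/write data segment in its LDT via
 * SI86DSCR, following the acc1/acc2 packing described above:
 *
 *	#include <sys/sysi86.h>
 *	#include <sys/segments.h>
 *
 *	struct ssd ssd;
 *
 *	ssd.sel = SEL_LDT(LDT_UDBASE);	LDT selector at index LDT_UDBASE
 *	ssd.bo = (uint32_t)buf;		segment base
 *	ssd.ls = 0xfff;			limit (byte granular here)
 *	ssd.acc1 = SDT_MEMRWA | (SEL_UPL << 5) | (1 << 7);
 *	ssd.acc2 = 1 << 2;		def32, byte granularity
 *
 *	if (sysi86(SI86DSCR, &ssd) == -1)
 *		perror("sysi86(SI86DSCR)");
 *
 * The SEL_LDT() macro and SDT_* type constants come from
 * <sys/segments.h>; buf is a hypothetical user buffer.
 */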

/*
 * Allocate new LDT for process just large enough to contain seli.
 * Note we allocate and grow LDT in PAGESIZE chunks. We do this
 * to simplify the implementation and because on the hypervisor it's
 * required, since the LDT must live on pages that have PROT_WRITE
 * removed and which are given to the hypervisor.
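 *
 * Sizing example: seli == 300 rounds (300 + 1) * sizeof (user_desc_t)
 * == 2408 bytes up to one 4K page, giving 512 descriptors and a
 * p_ldtlimit of 511.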
 */
static void
ldt_alloc(proc_t *pp, uint_t seli)
{
	user_desc_t	*ldt;
	size_t		ldtsz;
	uint_t		nsels;

	ASSERT(MUTEX_HELD(&pp->p_ldtlock));
	ASSERT(pp->p_ldt == NULL);
	ASSERT(pp->p_ldtlimit == 0);

	/*
	 * Allocate new LDT just large enough to contain seli.
	 */
	ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
	nsels = ldtsz / sizeof (user_desc_t);
	ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);

	ldt = kmem_zalloc(ldtsz, KM_SLEEP);
	ASSERT(IS_P2ALIGNED(ldt, PAGESIZE));

#if defined(__xpv)
	if (xen_ldt_setprot(ldt, ldtsz, PROT_READ))
		panic("ldt_alloc:xen_ldt_setprot(PROT_READ) failed");
#endif

	pp->p_ldt = ldt;
	pp->p_ldtlimit = nsels - 1;
	set_syssegd(&pp->p_ldt_desc, ldt, ldtsz - 1, SDT_SYSLDT, SEL_KPL);

	if (pp == curproc) {
		kpreempt_disable();
		ldt_load();
		kpreempt_enable();
	}
}

static void
ldt_free(proc_t *pp)
{
	user_desc_t	*ldt;
	size_t		ldtsz;

	ASSERT(pp->p_ldt != NULL);

	mutex_enter(&pp->p_ldtlock);
	ldt = pp->p_ldt;
	ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

	ASSERT(IS_P2ALIGNED(ldtsz, PAGESIZE));

	pp->p_ldt = NULL;
	pp->p_ldtlimit = 0;
	pp->p_ldt_desc = null_sdesc;
	mutex_exit(&pp->p_ldtlock);

	if (pp == curproc) {
		kpreempt_disable();
		ldt_unload();
		kpreempt_enable();
	}

#if defined(__xpv)
	/*
	 * We are not allowed to make the ldt writable until after
	 * we tell the hypervisor to unload it.
	 */
	if (xen_ldt_setprot(ldt, ldtsz, PROT_READ | PROT_WRITE))
		panic("ldt_free:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
#endif

	kmem_free(ldt, ldtsz);
}

/*
 * On fork copy new ldt for child.
 */
static void
ldt_dup(proc_t *pp, proc_t *cp)
{
	size_t	ldtsz;

	ASSERT(pp->p_ldt != NULL);
	ASSERT(cp != curproc);

	/*
	 * I assume the parent's ldt can't increase since we're in a fork.
	 */
	mutex_enter(&pp->p_ldtlock);
	mutex_enter(&cp->p_ldtlock);

	ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

	ldt_alloc(cp, pp->p_ldtlimit);

#if defined(__xpv)
	/*
	 * Make the child's ldt writable so it can be copied into from
	 * the parent's ldt. This works because ldt_alloc() above did not
	 * load the ldt, since it is for the child process. If we tried
	 * to make an LDT writable while it is loaded in hardware, the
	 * setprot operation would fail.
	 */
	if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ | PROT_WRITE))
		panic("ldt_dup:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
#endif

	bcopy(pp->p_ldt, cp->p_ldt, ldtsz);

#if defined(__xpv)
	if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ))
		panic("ldt_dup:xen_ldt_setprot(PROT_READ) failed");
#endif
	mutex_exit(&cp->p_ldtlock);
	mutex_exit(&pp->p_ldtlock);
}

static void
ldt_grow(proc_t *pp, uint_t seli)
{
	user_desc_t	*oldt, *nldt;
	uint_t		nsels;
	size_t		oldtsz, nldtsz;

	ASSERT(MUTEX_HELD(&pp->p_ldtlock));
	ASSERT(pp->p_ldt != NULL);
	ASSERT(pp->p_ldtlimit != 0);

	/*
	 * Allocate larger LDT just large enough to contain seli.
	 */
	nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
	nsels = nldtsz / sizeof (user_desc_t);
	ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
	ASSERT(nsels > pp->p_ldtlimit);

	oldt = pp->p_ldt;
	oldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

	nldt = kmem_zalloc(nldtsz, KM_SLEEP);
	ASSERT(IS_P2ALIGNED(nldt, PAGESIZE));

	bcopy(oldt, nldt, oldtsz);

	/*
	 * unload old ldt.
	 */
	kpreempt_disable();
	ldt_unload();
	kpreempt_enable();

#if defined(__xpv)

	/*
	 * Make the old ldt writable and the new ldt read only.
	 */
	if (xen_ldt_setprot(oldt, oldtsz, PROT_READ | PROT_WRITE))
		panic("ldt_grow:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");

	if (xen_ldt_setprot(nldt, nldtsz, PROT_READ))
		panic("ldt_grow:xen_ldt_setprot(PROT_READ) failed");
#endif

	pp->p_ldt = nldt;
	pp->p_ldtlimit = nsels - 1;

	/*
	 * write new ldt segment descriptor.
	 */
	set_syssegd(&pp->p_ldt_desc, nldt, nldtsz - 1, SDT_SYSLDT, SEL_KPL);

	/*
	 * load the new ldt.
	 */
	kpreempt_disable();
	ldt_load();
	kpreempt_enable();

	kmem_free(oldt, oldtsz);
}